/**
 * Convert linear float SoA values to packed sRGB AoS values.
 * This only handles packed formats which are 4x8bit in size
 * (rgba and rgbx plus swizzles), and 16bit 565-style formats
 * with no alpha. (In the latter case the return values won't be
 * fully packed; they will look like r5g6b5x16r5g6b5x16...)
 *
 * @param src   float SoA (vector) values to convert.
 */
LLVMValueRef
lp_build_float_to_srgb_packed(struct gallivm_state *gallivm,
                              const struct util_format_description *dst_fmt,
                              struct lp_type src_type,
                              LLVMValueRef *src)
{
   LLVMBuilderRef builder = gallivm->builder;
   unsigned chan;
   struct lp_build_context f32_bld;
   struct lp_type int32_type = lp_int_type(src_type);
   LLVMValueRef tmpsrgb[4], alpha, dst;

   lp_build_context_init(&f32_bld, gallivm, src_type);

   /* rgb is subject to linear->srgb conversion, alpha is not */
   for (chan = 0; chan < 3; chan++) {
      unsigned chan_bits = dst_fmt->channel[dst_fmt->swizzle[chan]].size;
      tmpsrgb[chan] = lp_build_linear_to_srgb(gallivm, src_type, chan_bits, src[chan]);
   }
   /*
    * can't use lp_build_conv since we want to keep values as 32bit
    * here so we can interleave with rgb to go from SoA->AoS.
    */
   alpha = lp_build_clamp_zero_one_nanzero(&f32_bld, src[3]);
   alpha = lp_build_mul(&f32_bld, alpha,
                        lp_build_const_vec(gallivm, src_type, 255.0f));
   tmpsrgb[3] = lp_build_iround(&f32_bld, alpha);

   dst = lp_build_zero(gallivm, int32_type);
   for (chan = 0; chan < dst_fmt->nr_channels; chan++) {
      if (dst_fmt->swizzle[chan] <= PIPE_SWIZZLE_W) {
         unsigned ls;
         LLVMValueRef shifted, shift_val;
         ls = dst_fmt->channel[dst_fmt->swizzle[chan]].shift;
         shift_val = lp_build_const_int_vec(gallivm, int32_type, ls);
         shifted = LLVMBuildShl(builder, tmpsrgb[chan], shift_val, "");
         dst = LLVMBuildOr(builder, dst, shifted, "");
      }
   }
   return dst;
}
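
/*
 * Hypothetical call site for the function above (a sketch, not part of
 * the original source): pack four 4-wide linear float SoA channels into
 * B8G8R8A8_SRGB AoS dwords. The gallivm context and the src[4] vectors
 * are assumed to come from the surrounding shader generator.
 */
static LLVMValueRef
example_pack_bgra8_srgb(struct gallivm_state *gallivm, LLVMValueRef src[4])
{
   const struct util_format_description *dst_desc =
      util_format_description(PIPE_FORMAT_B8G8R8A8_SRGB);
   struct lp_type f32_type = lp_type_float_vec(32, 128);  /* 4 x f32 */

   return lp_build_float_to_srgb_packed(gallivm, dst_desc, f32_type, src);
}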
void
lp_build_tgsi_aos(struct gallivm_state *gallivm,
                  const struct tgsi_token *tokens,
                  struct lp_type type,
                  const unsigned char swizzles[4],
                  LLVMValueRef consts_ptr,
                  const LLVMValueRef *inputs,
                  LLVMValueRef *outputs,
                  struct lp_build_sampler_aos *sampler,
                  const struct tgsi_shader_info *info)
{
   struct lp_build_tgsi_aos_context bld;
   struct tgsi_parse_context parse;
   uint num_immediates = 0;
   unsigned chan;
   int pc = 0;

   /* Setup build context */
   memset(&bld, 0, sizeof bld);
   lp_build_context_init(&bld.bld_base.base, gallivm, type);
   lp_build_context_init(&bld.bld_base.uint_bld, gallivm, lp_uint_type(type));
   lp_build_context_init(&bld.bld_base.int_bld, gallivm, lp_int_type(type));
   lp_build_context_init(&bld.int_bld, gallivm, lp_int_type(type));

   for (chan = 0; chan < 4; ++chan) {
      bld.swizzles[chan] = swizzles[chan];
      bld.inv_swizzles[swizzles[chan]] = chan;
   }

   bld.inputs = inputs;
   bld.outputs = outputs;
   bld.consts_ptr = consts_ptr;
   bld.sampler = sampler;
   bld.indirect_files = info->indirect_files;
   bld.bld_base.emit_swizzle = swizzle_aos;
   bld.bld_base.info = info;

   bld.bld_base.emit_fetch_funcs[TGSI_FILE_CONSTANT] = emit_fetch_constant;
   bld.bld_base.emit_fetch_funcs[TGSI_FILE_IMMEDIATE] = emit_fetch_immediate;
   bld.bld_base.emit_fetch_funcs[TGSI_FILE_INPUT] = emit_fetch_input;
   bld.bld_base.emit_fetch_funcs[TGSI_FILE_TEMPORARY] = emit_fetch_temporary;

   /* Set opcode actions */
   lp_set_default_actions_cpu(&bld.bld_base);

   if (!lp_bld_tgsi_list_init(&bld.bld_base)) {
      return;
   }

   tgsi_parse_init(&parse, tokens);

   while (!tgsi_parse_end_of_tokens(&parse)) {
      tgsi_parse_token(&parse);

      switch (parse.FullToken.Token.Type) {
      case TGSI_TOKEN_TYPE_DECLARATION:
         /* Inputs already interpolated */
         lp_emit_declaration_aos(&bld, &parse.FullToken.FullDeclaration);
         break;

      case TGSI_TOKEN_TYPE_INSTRUCTION:
         /* save expanded instruction */
         lp_bld_tgsi_add_instruction(&bld.bld_base,
                                     &parse.FullToken.FullInstruction);
         break;

      case TGSI_TOKEN_TYPE_IMMEDIATE:
         /* simply copy the immediate values into the next immediates[] slot */
         {
            const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
            float imm[4];
            assert(size <= 4);
            assert(num_immediates < LP_MAX_TGSI_IMMEDIATES);
            for (chan = 0; chan < 4; ++chan) {
               imm[chan] = 0.0f;
            }
            for (chan = 0; chan < size; ++chan) {
               unsigned swizzle = bld.swizzles[chan];
               imm[swizzle] = parse.FullToken.FullImmediate.u[chan].Float;
            }
            bld.immediates[num_immediates] =
                     lp_build_const_aos(gallivm, type,
                                        imm[0], imm[1], imm[2], imm[3],
                                        NULL);
            num_immediates++;
         }
         break;

      case TGSI_TOKEN_TYPE_PROPERTY:
         break;

      default:
         assert(0);
      }
   }

   while (pc != -1) {
      struct tgsi_full_instruction *instr = bld.bld_base.instructions + pc;
      const struct tgsi_opcode_info *opcode_info =
         tgsi_get_opcode_info(instr->Instruction.Opcode);
      if (!lp_emit_instruction_aos(&bld, instr, opcode_info, &pc))
         _debug_printf("warning: failed to translate tgsi opcode %s to LLVM\n",
                       opcode_info->mnemonic);
   }

   if (0) {
      LLVMBasicBlockRef block = LLVMGetInsertBlock(gallivm->builder);
      LLVMValueRef function = LLVMGetBasicBlockParent(block);
      debug_printf("11111111111111111111111111111 \n");
      tgsi_dump(tokens, 0);
      lp_debug_dump_value(function);
      debug_printf("2222222222222222222222222222 \n");
   }
   tgsi_parse_free(&parse);
   FREE(bld.bld_base.instructions);

   if (0) {
      LLVMModuleRef module = LLVMGetGlobalParent(
         LLVMGetBasicBlockParent(LLVMGetInsertBlock(gallivm->builder)));
      LLVMDumpModule(module);
   }

}
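
/*
 * Illustration of the swizzle tables set up above (values are
 * hypothetical): swizzles[] maps TGSI channel -> storage channel, and
 * the init loop builds inv_swizzles[] as its inverse. E.g. for a BGRA
 * layout:
 *
 *    swizzles     = { 2, 1, 0, 3 }   (x and z swapped, w in place)
 *    inv_swizzles = { 2, 1, 0, 3 }   (this permutation is self-inverse)
 *
 * so inv_swizzles[swizzles[chan]] == chan for every channel.
 */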
/**
 * Generate code for performing depth and/or stencil tests.
 * We operate on a vector of values (typically n 2x2 quads).
 *
 * \param depth  the depth test state
 * \param stencil  the front/back stencil state
 * \param type  the data type of the fragment depth/stencil values
 * \param format_desc  description of the depth/stencil surface
 * \param mask  the alive/dead pixel mask for the quad (vector)
 * \param stencil_refs  the front/back stencil ref values (scalar)
 * \param z_src  the incoming depth/stencil values (n 2x2 quad values, float32)
 * \param zs_dst  the depth/stencil values in framebuffer
 * \param face  contains boolean value indicating front/back facing polygon
 */
void
lp_build_depth_stencil_test(struct gallivm_state *gallivm,
                            const struct pipe_depth_state *depth,
                            const struct pipe_stencil_state stencil[2],
                            struct lp_type z_src_type,
                            const struct util_format_description *format_desc,
                            struct lp_build_mask_context *mask,
                            LLVMValueRef stencil_refs[2],
                            LLVMValueRef z_src,
                            LLVMValueRef z_fb,
                            LLVMValueRef s_fb,
                            LLVMValueRef face,
                            LLVMValueRef *z_value,
                            LLVMValueRef *s_value,
                            boolean do_branch)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type z_type;
   struct lp_build_context z_bld;
   struct lp_build_context s_bld;
   struct lp_type s_type;
   unsigned z_shift = 0, z_width = 0, z_mask = 0;
   LLVMValueRef z_dst = NULL;
   LLVMValueRef stencil_vals = NULL;
   LLVMValueRef z_bitmask = NULL, stencil_shift = NULL;
   LLVMValueRef z_pass = NULL, s_pass_mask = NULL;
   LLVMValueRef orig_mask = lp_build_mask_value(mask);
   LLVMValueRef front_facing = NULL;
   boolean have_z, have_s;

   /*
    * Depths are expected to be between 0 and 1, even if they are stored in
    * floats. Setting these bits here will ensure that the lp_build_conv() call
    * below won't try to unnecessarily clamp the incoming values.
    */
   if (z_src_type.floating) {
      z_src_type.sign = FALSE;
      z_src_type.norm = TRUE;
   }
   else {
      assert(!z_src_type.sign);
      assert(z_src_type.norm);
   }

   /* Pick the type matching the depth-stencil format. */
   z_type = lp_depth_type(format_desc, z_src_type.length);

   /* Pick the intermediate type for depth operations. */
   z_type.width = z_src_type.width;
   assert(z_type.length == z_src_type.length);

   /* FIXME: for non-float depth/stencil might generate better code
    * if we'd always split it up to use 128bit operations.
    * For stencil we'd almost certainly want to pack to 8xi16 values,
    * for z just run twice.
    */

   /* Sanity checking */
   {
      const unsigned z_swizzle = format_desc->swizzle[0];
      const unsigned s_swizzle = format_desc->swizzle[1];

      assert(z_swizzle != UTIL_FORMAT_SWIZZLE_NONE ||
             s_swizzle != UTIL_FORMAT_SWIZZLE_NONE);

      assert(depth->enabled || stencil[0].enabled);

      assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
      assert(format_desc->block.width == 1);
      assert(format_desc->block.height == 1);

      if (stencil[0].enabled) {
         assert(s_swizzle < 4);
         assert(format_desc->channel[s_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED);
         assert(format_desc->channel[s_swizzle].pure_integer);
         assert(!format_desc->channel[s_swizzle].normalized);
         assert(format_desc->channel[s_swizzle].size == 8);
      }

      if (depth->enabled) {
         assert(z_swizzle < 4);
         if (z_type.floating) {
            assert(z_swizzle == 0);
            assert(format_desc->channel[z_swizzle].type ==
                   UTIL_FORMAT_TYPE_FLOAT);
            assert(format_desc->channel[z_swizzle].size == 32);
         }
         else {
            assert(format_desc->channel[z_swizzle].type ==
                   UTIL_FORMAT_TYPE_UNSIGNED);
            assert(format_desc->channel[z_swizzle].normalized);
            assert(!z_type.fixed);
         }
      }
   }


   /* Setup build context for Z vals */
   lp_build_context_init(&z_bld, gallivm, z_type);

   /* Setup build context for stencil vals */
   s_type = lp_int_type(z_type);
   lp_build_context_init(&s_bld, gallivm, s_type);

   /* Compute and apply the Z/stencil bitmasks and shifts.
    */
   {
      unsigned s_shift, s_mask;

      z_dst = z_fb;
      stencil_vals = s_fb;

      have_z = get_z_shift_and_mask(format_desc, &z_shift, &z_width, &z_mask);
      have_s = get_s_shift_and_mask(format_desc, &s_shift, &s_mask);

      if (have_z) {
         if (z_mask != 0xffffffff) {
            z_bitmask = lp_build_const_int_vec(gallivm, z_type, z_mask);
         }

         /*
          * Align the framebuffer Z's LSB to the right.
          */
         if (z_shift) {
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);
            z_dst = LLVMBuildLShr(builder, z_dst, shift, "z_dst");
         } else if (z_bitmask) {
            z_dst = LLVMBuildAnd(builder, z_dst, z_bitmask, "z_dst");
         } else {
            lp_build_name(z_dst, "z_dst");
         }
      }

      if (have_s) {
         if (s_shift) {
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, s_type, s_shift);
            stencil_vals = LLVMBuildLShr(builder, stencil_vals, shift, "");
            stencil_shift = shift;  /* used below */
         }

         if (s_mask != 0xffffffff) {
            LLVMValueRef mask = lp_build_const_int_vec(gallivm, s_type, s_mask);
            stencil_vals = LLVMBuildAnd(builder, stencil_vals, mask, "");
         }

         lp_build_name(stencil_vals, "s_dst");
      }
   }

   if (stencil[0].enabled) {

      if (face) {
         LLVMValueRef zero = lp_build_const_int32(gallivm, 0);

         /* front_facing = face != 0 ? ~0 : 0 */
         front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, "");
         front_facing = LLVMBuildSExt(builder, front_facing,
                                      LLVMIntTypeInContext(gallivm->context,
                                             s_bld.type.length*s_bld.type.width),
                                      "");
         front_facing = LLVMBuildBitCast(builder, front_facing,
                                         s_bld.int_vec_type, "");
      }

      /* convert scalar stencil refs into vectors */
      stencil_refs[0] = lp_build_broadcast_scalar(&s_bld, stencil_refs[0]);
      stencil_refs[1] = lp_build_broadcast_scalar(&s_bld, stencil_refs[1]);

      s_pass_mask = lp_build_stencil_test(&s_bld, stencil,
                                          stencil_refs, stencil_vals,
                                          front_facing);

      /* apply stencil-fail operator */
      {
         LLVMValueRef s_fail_mask = lp_build_andnot(&s_bld, orig_mask, s_pass_mask);
         stencil_vals = lp_build_stencil_op(&s_bld, stencil, S_FAIL_OP,
                                            stencil_refs, stencil_vals,
                                            s_fail_mask, front_facing);
      }
   }

   if (depth->enabled) {
      /*
       * Convert fragment Z to the desired type, aligning the LSB to the right.
       */

      assert(z_type.width == z_src_type.width);
      assert(z_type.length == z_src_type.length);
      assert(lp_check_value(z_src_type, z_src));
      if (z_src_type.floating) {
         /*
          * Convert from floating point values
          */

         if (!z_type.floating) {
            z_src = lp_build_clamped_float_to_unsigned_norm(gallivm,
                                                            z_src_type,
                                                            z_width,
                                                            z_src);
         }
      } else {
         /*
          * Convert from unsigned normalized values.
          */

         assert(!z_src_type.sign);
         assert(!z_src_type.fixed);
         assert(z_src_type.norm);
         assert(!z_type.floating);
         if (z_src_type.width > z_width) {
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_src_type,
                                                        z_src_type.width - z_width);
            z_src = LLVMBuildLShr(builder, z_src, shift, "");
         }
      }
      assert(lp_check_value(z_type, z_src));

      lp_build_name(z_src, "z_src");

      /* compare src Z to dst Z, returning 'pass' mask */
      z_pass = lp_build_cmp(&z_bld, depth->func, z_src, z_dst);

      if (!stencil[0].enabled) {
         /* We can potentially skip all remaining operations here, but only
          * because stencil is disabled; if stencil were enabled we would
          * still need to update the stencil buffer values for failing
          * fragments.  Z buffer values never need updating for failing
          * fragments.
          */
         lp_build_mask_update(mask, z_pass);

         if (do_branch) {
            lp_build_mask_check(mask);
            do_branch = FALSE;
         }
      }

      if (depth->writemask) {
         LLVMValueRef zselectmask;

         /* mask off bits that failed Z test */
         zselectmask = LLVMBuildAnd(builder, orig_mask, z_pass, "");

         /* mask off bits that failed stencil test */
         if (s_pass_mask) {
            zselectmask = LLVMBuildAnd(builder, zselectmask, s_pass_mask, "");
         }

         /* Mix the old and new Z buffer values.
          * z_dst[i] = zselectmask[i] ? z_src[i] : z_dst[i]
          */
         z_dst = lp_build_select(&z_bld, zselectmask, z_src, z_dst);
      }

      if (stencil[0].enabled) {
         /* update stencil buffer values according to z pass/fail result */
         LLVMValueRef z_fail_mask, z_pass_mask;

         /* apply Z-fail operator */
         z_fail_mask = lp_build_andnot(&s_bld, orig_mask, z_pass);
         stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_FAIL_OP,
                                            stencil_refs, stencil_vals,
                                            z_fail_mask, front_facing);

         /* apply Z-pass operator */
         z_pass_mask = LLVMBuildAnd(builder, orig_mask, z_pass, "");
         stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
                                            stencil_refs, stencil_vals,
                                            z_pass_mask, front_facing);
      }
   }
   else {
      /* No depth test: apply Z-pass operator to stencil buffer values which
       * passed the stencil test.
       */
      s_pass_mask = LLVMBuildAnd(builder, orig_mask, s_pass_mask, "");
      stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
                                         stencil_refs, stencil_vals,
                                         s_pass_mask, front_facing);
   }

   /* Put Z and stencil bits in the right place */
   if (have_z && z_shift) {
      LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);
      z_dst = LLVMBuildShl(builder, z_dst, shift, "");
   }
   if (stencil_vals && stencil_shift)
      stencil_vals = LLVMBuildShl(builder, stencil_vals,
                                  stencil_shift, "");

   /* Finally, merge the z/stencil values */
   if (format_desc->block.bits <= 32) {
      if (have_z && have_s)
         *z_value = LLVMBuildOr(builder, z_dst, stencil_vals, "");
      else if (have_z)
         *z_value = z_dst;
      else
         *z_value = stencil_vals;
      *s_value = *z_value;
   }
   else {
      *z_value = z_dst;
      *s_value = stencil_vals;
   }

   if (s_pass_mask)
      lp_build_mask_update(mask, s_pass_mask);

   if (depth->enabled && stencil[0].enabled)
      lp_build_mask_update(mask, z_pass);
}
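
/*
 * Scalar sketch of what the shift/mask stage above computes for a
 * Z24_UNORM_S8_UINT-style dword (Z in bits 0..23, stencil in bits
 * 24..31; layout assumed for illustration). The generated IR performs
 * the same arithmetic on whole vectors.
 */
#include <stdint.h>

static void
zs_extract_ref(uint32_t zs, uint32_t *z_dst, uint32_t *s_dst)
{
   *z_dst = zs & 0xffffff;       /* z_shift == 0,  z_mask == 0xffffff */
   *s_dst = (zs >> 24) & 0xff;   /* s_shift == 24, s_mask == 0xff     */
}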
/**
 * Convert linear float values to srgb int values.
 * Several possibilities how to do this, e.g.
 * - use table (based on exponent/highest order mantissa bits) and do
 *   linear interpolation (https://gist.github.com/rygorous/2203834)
 * - Chebyshev polynomial
 * - Approximation using reciprocals
 * - using int-to-float and float-to-int tricks for pow()
 *   (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
 *
 * @param src   float (vector) value(s) to convert.
 */
static LLVMValueRef
lp_build_linear_to_srgb(struct gallivm_state *gallivm,
                        struct lp_type src_type,
                        LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context f32_bld;
   LLVMValueRef lin_thresh, lin, lin_const, is_linear, tmp, pow_final;

   lp_build_context_init(&f32_bld, gallivm, src_type);

   src = lp_build_clamp(&f32_bld, src, f32_bld.zero, f32_bld.one);

   if (0) {
      /*
       * using int-to-float and float-to-int trick for pow().
       * This is much more accurate than necessary thanks to the correction,
       * but it most certainly makes no sense without rsqrt available.
       * Bonus points if you understand how this works...
       * All in all (including min/max clamp, conversion) 19 instructions.
       */

      float exp_f = 2.0f / 3.0f;
      /* some compilers can't do exp2f, so this is exp2f(127.0f/exp_f - 127.0f) */
      float exp2f_c = 1.30438178253e+19f;
      float coeff_f = 0.62996f;
      LLVMValueRef pow_approx, coeff, x2, exponent, pow_1, pow_2;
      struct lp_type int_type = lp_int_type(src_type);

      /*
       * First calculate approx x^8/12
       */
      exponent = lp_build_const_vec(gallivm, src_type, exp_f);
      coeff = lp_build_const_vec(gallivm, src_type,
                                 exp2f_c * powf(coeff_f, 1.0f / exp_f));

      /* premultiply src */
      tmp = lp_build_mul(&f32_bld, coeff, src);
      /* "log2" */
      tmp = LLVMBuildBitCast(builder, tmp, lp_build_vec_type(gallivm, int_type), "");
      tmp = lp_build_int_to_float(&f32_bld, tmp);
      /* multiply for pow */
      tmp = lp_build_mul(&f32_bld, tmp, exponent);
      /* "exp2" */
      pow_approx = lp_build_itrunc(&f32_bld, tmp);
      pow_approx = LLVMBuildBitCast(builder, pow_approx,
                                    lp_build_vec_type(gallivm, src_type), "");

      /*
       * Since that pow was inaccurate (like 3 bits, though each sqrt step would
       * give another bit), compensate the error (which is why we chose another
       * exponent in the first place).
       */
      /* x * x^(8/12) = x^(20/12) */
      pow_1 = lp_build_mul(&f32_bld, pow_approx, src);

      /* x * x * x^(-4/12) = x^(20/12) */
      /* Should avoid using rsqrt if it's not available, but
       * using x * x^(4/12) * x^(4/12) instead will change error weight */
      tmp = lp_build_fast_rsqrt(&f32_bld, pow_approx);
      x2 = lp_build_mul(&f32_bld, src, src);
      pow_2 = lp_build_mul(&f32_bld, x2, tmp);

      /* average the values so the errors cancel out, compensate bias,
       * we also squeeze the 1.055 mul of the srgb conversion plus the 255.0 mul
       * for conversion to int in here */
      tmp = lp_build_add(&f32_bld, pow_1, pow_2);
      coeff = lp_build_const_vec(gallivm, src_type,
                                 1.0f / (3.0f * coeff_f) * 0.999852f *
                                 powf(1.055f * 255.0f, 4.0f));
      pow_final = lp_build_mul(&f32_bld, tmp, coeff);

      /* x^(5/12) = rsqrt(rsqrt(x^(20/12))) */
      if (lp_build_fast_rsqrt_available(src_type)) {
         pow_final = lp_build_fast_rsqrt(&f32_bld,
                        lp_build_fast_rsqrt(&f32_bld, pow_final));
      }
      else {
         pow_final = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, pow_final));
      }
      pow_final = lp_build_add(&f32_bld, pow_final,
                               lp_build_const_vec(gallivm, src_type, -0.055f * 255.0f));
   }

   else {
      /*
       * using "rational polynomial" approximation here.
       * Essentially y = a*x^0.375 + b*x^0.5 + c, with also
       * factoring in the 255.0 mul and the scaling mul.
       * (a is closer to actual value so has higher weight than b.)
       * Note: the constants are magic values. They were found empirically,
       * possibly could be improved but good enough (be VERY careful with
       * error metric if you'd want to tweak them, they also MUST fit with
       * the crappy polynomial above for srgb->linear since it is required
       * that each srgb value maps back to the same value).
       * This function has an error of max +-0.17 (and we'd only require +-0.6),
       * for the approximated srgb->linear values the error is naturally larger
       * (+-0.42) but still accurate enough (required +-0.5 essentially).
       * All in all (including min/max clamp, conversion) 15 instructions.
       * FMA would help (minus 2 instructions).
       */

      LLVMValueRef x05, x0375, a_const, b_const, c_const, tmp2;

      if (lp_build_fast_rsqrt_available(src_type)) {
         tmp = lp_build_fast_rsqrt(&f32_bld, src);
         x05 = lp_build_mul(&f32_bld, src, tmp);
      }
      else {
         /*
          * I don't really expect this to be practical without rsqrt
          * but there's no reason for triple punishment so at least
          * save the otherwise resulting division and unnecessary mul...
          */
         x05 = lp_build_sqrt(&f32_bld, src);
      }

      tmp = lp_build_mul(&f32_bld, x05, src);
      if (lp_build_fast_rsqrt_available(src_type)) {
         x0375 = lp_build_fast_rsqrt(&f32_bld, lp_build_fast_rsqrt(&f32_bld, tmp));
      }
      else {
         x0375 = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, tmp));
      }

      a_const = lp_build_const_vec(gallivm, src_type, 0.675f * 1.0622 * 255.0f);
      b_const = lp_build_const_vec(gallivm, src_type, 0.325f * 1.0622 * 255.0f);
      c_const = lp_build_const_vec(gallivm, src_type, -0.0620f * 255.0f);

      tmp = lp_build_mul(&f32_bld, a_const, x0375);
      tmp2 = lp_build_mul(&f32_bld, b_const, x05);
      tmp2 = lp_build_add(&f32_bld, tmp2, c_const);
      pow_final = lp_build_add(&f32_bld, tmp, tmp2);
   }

   /* linear part is easy */
   lin_const = lp_build_const_vec(gallivm, src_type, 12.92f * 255.0f);
   lin = lp_build_mul(&f32_bld, src, lin_const);

   lin_thresh = lp_build_const_vec(gallivm, src_type, 0.0031308f);
   is_linear = lp_build_compare(gallivm, src_type, PIPE_FUNC_LEQUAL, src, lin_thresh);
   tmp = lp_build_select(&f32_bld, is_linear, lin, pow_final);

   f32_bld.type.sign = 0;
   return lp_build_iround(&f32_bld, tmp);
}
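
/*
 * For reference, the exact scalar computation both approximations above
 * are chasing: the standard piecewise sRGB encode scaled to 0..255.
 * (A sketch for comparison only, not part of the original code; the
 * vector path above approximates it to within about +-0.17 of a byte
 * value. Needs <math.h>.)
 */
static unsigned char
linear_to_srgb_ref(float x)
{
   x = x < 0.0f ? 0.0f : (x > 1.0f ? 1.0f : x);   /* clamp to [0,1] */
   float s = (x <= 0.0031308f)
      ? 12.92f * x
      : 1.055f * powf(x, 1.0f / 2.4f) - 0.055f;
   return (unsigned char)(s * 255.0f + 0.5f);
}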
/**
 * Fetch texels from a texture, returning them in SoA layout.
 *
 * \param type  the desired return type for 'rgba'.  The vector length
 *              is the number of texels to fetch
 * \param aligned if the offset is guaranteed to be aligned to element width
 *
 * \param base_ptr  points to the base of the texture mip tree.
 * \param offset    offset to start of the texture image block.  For non-
 *                  compressed formats, this simply is an offset to the texel.
 *                  For compressed formats, it is an offset to the start of the
 *                  compressed data block.
 *
 * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
 *              these will always be (0,0).  For compressed formats, i will
 *              be in [0, block_width-1] and j will be in [0, block_height-1].
 * \param cache  optional value pointing to a lp_build_format_cache structure
 */
void
lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
                        const struct util_format_description *format_desc,
                        struct lp_type type,
                        boolean aligned,
                        LLVMValueRef base_ptr,
                        LLVMValueRef offset,
                        LLVMValueRef i,
                        LLVMValueRef j,
                        LLVMValueRef cache,
                        LLVMValueRef rgba_out[4])
{
   LLVMBuilderRef builder = gallivm->builder;
   enum pipe_format format = format_desc->format;
   struct lp_type fetch_type;

   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits <= type.width &&
       (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
        format_desc->channel[0].size == 32 ||
        format_desc->channel[0].size == 16))
   {
      /*
       * The packed pixel fits into an element of the destination format. Put
       * the packed pixels into a vector and extract each component for all
       * vector elements in parallel.
       */

      LLVMValueRef packed;

      /*
       * gather the texels from the texture
       * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
       */
      assert(format_desc->block.bits <= type.width);
      fetch_type = lp_type_uint(type.width);
      packed = lp_build_gather(gallivm,
                               type.length,
                               format_desc->block.bits,
                               fetch_type,
                               aligned,
                               base_ptr, offset, FALSE);

      /*
       * convert texels to float rgba
       */
      lp_build_unpack_rgba_soa(gallivm,
                               format_desc,
                               type,
                               packed, rgba_out);
      return;
   }


   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits > type.width &&
       ((format_desc->block.bits <= type.width * type.length &&
         format_desc->channel[0].size <= type.width) ||
        (format_desc->channel[0].size == 64 &&
         format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
         type.floating)))
   {
      /*
       * Similar to above, but the packed pixel is larger than what fits
       * into an element of the destination format. The packed pixels will be
       * shuffled into SoA vectors appropriately, and then the extraction will
       * be done in parallel as much as possible.
       * Good for 16xn (n > 2) and 32xn (n > 1) formats; care is taken so
       * the gathered vectors can be shuffled easily (even with avx).
       * 64xn float -> 32xn float is handled too but it's a bit special as
       * it does the conversion pre-shuffle.
       */

      LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32];
      struct lp_type fetch_type, gather_type = type;
      unsigned num_gather, fetch_width, i, j;
      struct lp_build_context bld;
      boolean fp64 = format_desc->channel[0].size == 64;

      lp_build_context_init(&bld, gallivm, type);

      assert(type.width == 32);
      assert(format_desc->block.bits > type.width);

      /*
       * First, figure out fetch order.
       */
      fetch_width = util_next_power_of_two(format_desc->block.bits);
      /*
       * fp64 are treated like fp32 except we fetch twice wide values
       * (as we shuffle after trunc). The shuffles for that work out
       * mostly fine (slightly suboptimal for 4-wide, perfect for AVX)
       * albeit we miss the potential opportunity for hw gather (as it
       * only handles native size).
       */
      num_gather = fetch_width / type.width;
      gather_type.width *= num_gather;
      if (fp64) {
         num_gather /= 2;
      }
      gather_type.length /= num_gather;

      for (i = 0; i < num_gather; i++) {
         LLVMValueRef offsetr, shuf_vec;
         if (num_gather == 4) {
            for (j = 0; j < gather_type.length; j++) {
               unsigned idx = i + 4*j;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            shuf_vec = LLVMConstVector(shuffles, gather_type.length);
            offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");

         }
         else if (num_gather == 2) {
            for (j = 0; j < gather_type.length; j++) {
               unsigned idx = i*2 + (j%2) + (j/2)*4;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            shuf_vec = LLVMConstVector(shuffles, gather_type.length);
            offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
         }
         else {
            assert(num_gather == 1);
            offsetr = offset;
         }
         if (gather_type.length == 1) {
            LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
            offsetr = LLVMBuildExtractElement(builder, offsetr, zero, "");
         }

         /*
          * Determine whether to use float or int loads. This is mostly
          * to outsmart the (stupid) llvm int/float shuffle logic, we
          * don't really care much if the data is floats or ints...
          * But llvm will refuse to use a single float shuffle with int data
          * and will use 3 int shuffles instead; the code looks atrocious.
          * (Note bitcasts often won't help, as llvm is too smart to be
          * fooled by that.)
          * Nobody cares about simd float<->int domain transition penalties,
          * which usually don't even exist for shuffles anyway.
          * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is
          * going into transpose, which is unpacks, so doesn't really matter
          * much).
          * With 2x32bit or 4x16bit fetch, we use float vec, since those
          * go into the weird channel separation shuffle. With floats,
          * this is (with 128bit vectors):
          * - 2 movq, 2 movhpd, 2 shufps
          * With ints it would be:
          * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw
          * I've seen texture functions increase in code size by 15% just due
          * to that (there's lots of such fetches in them...)
          * (We could choose a different gather order to improve this somewhat
          * for the int path, but it would basically just drop the blends,
          * so the float path with this order really is optimal.)
          * It is tricky though: sometimes llvm doesn't ignore the float->int
          * casts, so we must avoid them until we're done with the float shuffle...
          * 3x16bit formats (the same is also true for 3x8) are pretty bad but
          * there's nothing we can do about them (we could overallocate by
          * those couple bytes and use unaligned but pot sized load).
          * Note that this is very much x86 specific. I don't know if this
          * affects other archs at all.
          */
         if (num_gather > 1) {
            /*
             * We always want some float type here (with x86)
             * due to shuffles being float ones afterwards (albeit for
             * the num_gather == 4 case int should work fine too
             * (unless there's some problems with avx but not avx2).
             */
            if (format_desc->channel[0].size == 64) {
               fetch_type = lp_type_float_vec(64, gather_type.width);
            } else {
               fetch_type = lp_type_int_vec(32, gather_type.width);
            }
         }
         else {
            /* type doesn't matter much */
            if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
                (format_desc->channel[0].size == 32 ||
                 format_desc->channel[0].size == 64)) {
               fetch_type = lp_type_float(gather_type.width);
            } else {
               fetch_type = lp_type_uint(gather_type.width);
            }
         }

         /* Now finally gather the values */
         packed[i] = lp_build_gather(gallivm, gather_type.length,
                                     format_desc->block.bits,
                                     fetch_type, aligned,
                                     base_ptr, offsetr, FALSE);
         if (fp64) {
            struct lp_type conv_type = type;
            conv_type.width *= 2;
            packed[i] = LLVMBuildBitCast(builder, packed[i],
                                         lp_build_vec_type(gallivm, conv_type), "");
            packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, "");
         }
      }

      /* shuffle the gathered values to SoA */
      if (num_gather == 2) {
         for (i = 0; i < num_gather; i++) {
            for (j = 0; j < type.length; j++) {
               unsigned idx = (j%2)*2 + (j/4)*4 + i;
               if ((j/2)%2)
                  idx += type.length;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1],
                                            LLVMConstVector(shuffles, type.length), "");
         }
      }
      else if (num_gather == 4) {
         lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst);
      }
      else {
         assert(num_gather == 1);
         dst[0] = packed[0];
      }

      /*
       * And finally unpack exactly as above, except that
       * chan shift is adjusted and the right vector selected.
       */
      if (!fp64) {
         for (i = 0; i < num_gather; i++) {
            dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, "");
         }
         for (i = 0; i < format_desc->nr_channels; i++) {
            struct util_format_channel_description chan_desc = format_desc->channel[i];
            unsigned blockbits = type.width;
            unsigned vec_nr = chan_desc.shift / type.width;
            chan_desc.shift %= type.width;

            output[i] = lp_build_extract_soa_chan(&bld,
                                                  blockbits,
                                                  FALSE,
                                                  chan_desc,
                                                  dst[vec_nr]);
         }
      }
      else {
         for (i = 0; i < format_desc->nr_channels; i++)  {
            output[i] = dst[i];
         }
      }

      lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out);
      return;
   }

   if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
       format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
      /*
       * similar conceptually to above but requiring special
       * AoS packed -> SoA float conversion code.
       */
      LLVMValueRef packed;
      struct lp_type fetch_type = lp_type_uint(type.width);

      assert(type.floating);
      assert(type.width == 32);

      packed = lp_build_gather(gallivm, type.length,
                               format_desc->block.bits,
                               fetch_type, aligned,
                               base_ptr, offset, FALSE);
      if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
         lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
      }
      else {
         lp_build_rgb9e5_to_float(gallivm, packed, rgba_out);
      }
      return;
   }

   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
       format_desc->block.bits == 64) {
      /*
       * Special case: the format is 64 bits but we only require
       * 32bit (or 8bit) from each block.
       */
      LLVMValueRef packed;
      struct lp_type fetch_type = lp_type_uint(type.width);

      if (format == PIPE_FORMAT_X32_S8X24_UINT) {
         /*
          * for stencil simply fix up offsets - could in fact change
          * base_ptr instead even outside the shader.
          */
         unsigned mask = (1 << 8) - 1;
         LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
         offset = LLVMBuildAdd(builder, offset, s_offset, "");
         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                  aligned, base_ptr, offset, FALSE);
         packed = LLVMBuildAnd(builder, packed,
                               lp_build_const_int_vec(gallivm, type, mask), "");
      }
      else {
         assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                  aligned, base_ptr, offset, TRUE);
         packed = LLVMBuildBitCast(builder, packed,
                                   lp_build_vec_type(gallivm, type), "");
      }
      /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */
      rgba_out[0] = rgba_out[1] = rgba_out[2] = packed;
      rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f);
      return;
   }

   /*
    * Try calling lp_build_fetch_rgba_aos for all pixels.
    * Should only really hit subsampled, compressed
    * (for s3tc srgb too, for rgtc the unorm ones only) by now.
    * (This is invalid for plain 8unorm formats because we're lazy with
    * the swizzle since some results would arrive swizzled, some not.)
    */

   if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) &&
       (util_format_fits_8unorm(format_desc) ||
        format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) &&
       type.floating && type.width == 32 &&
       (type.length == 1 || (type.length % 4 == 0))) {
      struct lp_type tmp_type;
      struct lp_build_context bld;
      LLVMValueRef packed, rgba[4];
      const struct util_format_description *flinear_desc;
      const struct util_format_description *frgba8_desc;
      unsigned chan;

      lp_build_context_init(&bld, gallivm, type);

      /*
       * Make sure the conversion in aos really only does convert to rgba8
       * and not anything more (so use linear format, adjust type).
       */
      flinear_desc = util_format_description(util_format_linear(format));
      memset(&tmp_type, 0, sizeof tmp_type);
      tmp_type.width = 8;
      tmp_type.length = type.length * 4;
      tmp_type.norm = TRUE;

      packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type,
                                       aligned, base_ptr, offset, i, j, cache);
      packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, "");

      /*
       * The values are now packed so they match ordinary (srgb) RGBA8 format,
       * hence need to use matching format for unpack.
       */
      frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM);
      if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
         assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
         frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
      }
      lp_build_unpack_rgba_soa(gallivm,
                               frgba8_desc,
                               type,
                               packed, rgba);

      /*
       * We converted 4 channels. Make sure llvm can drop unneeded ones
       * (luckily the rgba order is fixed, only LA needs special case).
       */
      for (chan = 0; chan < 4; chan++) {
         enum pipe_swizzle swizzle = format_desc->swizzle[chan];
         if (chan == 3 && util_format_is_luminance_alpha(format)) {
            swizzle = PIPE_SWIZZLE_W;
         }
         rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle);
      }
      return;
   }


   /*
    * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
    *
    * This is not the most efficient way of fetching pixels, as we
    * miss some opportunities to do vectorization, but this is
    * convenient for formats or scenarios for which there was no
    * opportunity or incentive to optimize.
    *
    * We do NOT want to end up here, this typically is quite terrible,
    * in particular if the formats have less than 4 channels.
    *
    * Right now, this should only be hit for:
    * - RGTC snorm formats
    *   (those miss fast fetch functions hence they are terrible anyway)
    */

   {
      unsigned k;
      struct lp_type tmp_type;
      LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];

      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
         debug_printf("%s: AoS fetch fallback for %s\n",
                      __FUNCTION__, format_desc->short_name);
      }

      tmp_type = type;
      tmp_type.length = 4;

      /*
       * Note that vector transpose can be worse compared to insert/extract
       * for aos->soa conversion (for formats with 1 or 2 channels). However,
       * we should try to avoid getting here for just about all formats, so
       * don't bother.
       */

      /* loop over number of pixels */
      for (k = 0; k < type.length; ++k) {
         LLVMValueRef index = lp_build_const_int32(gallivm, k);
         LLVMValueRef offset_elem;
         LLVMValueRef i_elem, j_elem;

         offset_elem = LLVMBuildExtractElement(builder, offset,
                                               index, "");

         i_elem = LLVMBuildExtractElement(builder, i, index, "");
         j_elem = LLVMBuildExtractElement(builder, j, index, "");

         /* Get a single float[4]={R,G,B,A} pixel */
         aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
                                                aligned, base_ptr, offset_elem,
                                                i_elem, j_elem, cache);

      }
      convert_to_soa(gallivm, aos_fetch, rgba_out, type);
   }
}
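
/*
 * Worked numbers for the wide-format gather path above (illustrative,
 * following directly from the code): fetching
 * PIPE_FORMAT_R32G32B32A32_FLOAT into an 8 x float32 SoA type gives
 *
 *    fetch_width = 128            (next pow2 of block.bits)
 *    num_gather  = 128 / 32 = 4
 *    gather_type = 2 x 128bit     (width 32 * 4, length 8 / 4)
 *
 * so each of the four gathers loads two whole pixels, and the 4x4
 * transpose (lp_build_transpose_aos) then yields the r, g, b, a vectors.
 */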
/**
 * Expand the relevant bits of mask_input to a 4-dword mask for the
 * four pixels in a 2x2 quad.  This will set the four elements of the
 * quad mask vector to 0 or ~0.
 *
 * \param quad  which quad of the quad group to test, in [0,3]
 * \param mask_input  bitwise mask for the whole 4x4 stamp
 */
static LLVMValueRef
generate_quad_mask(LLVMBuilderRef builder,
                   struct lp_type fs_type,
                   unsigned quad,
                   LLVMValueRef mask_input) /* int32 */
{
   struct lp_type mask_type;
   LLVMTypeRef i32t = LLVMInt32Type();
   LLVMValueRef bits[4];
   LLVMValueRef mask;
   int shift;

   /*
    * XXX: We'll need a different path for 16 x u8
    */
   assert(fs_type.width == 32);
   assert(fs_type.length == 4);
   mask_type = lp_int_type(fs_type);

   /*
    * Shift the mask down to this quad's bits. The quads of the 4x4
    * stamp are laid out 2x2, so the per-quad shifts are 0, 2, 8, 10
    * rather than quad * 4.
    */

   switch (quad) {
   case 0:
      shift = 0;
      break;
   case 1:
      shift = 2;
      break;
   case 2:
      shift = 8;
      break;
   case 3:
      shift = 10;
      break;
   default:
      assert(0);
      shift = 0;
   }

   mask_input = LLVMBuildLShr(builder,
                              mask_input,
                              LLVMConstInt(i32t, shift, 0),
                              "");

   /*
    * mask = { mask_input & (1 << i), for i in [0,3] }
    */
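   /* This quad's four pixels sit at bits 0 and 1 (top row) and bits 4
    * and 5 (row below) of the shifted mask, since each row of the 4x4
    * stamp spans 4 mask bits; hence the bit constants below. */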

   mask = lp_build_broadcast(builder, lp_build_vec_type(mask_type), mask_input);

   bits[0] = LLVMConstInt(i32t, 1 << 0, 0);
   bits[1] = LLVMConstInt(i32t, 1 << 1, 0);
   bits[2] = LLVMConstInt(i32t, 1 << 4, 0);
   bits[3] = LLVMConstInt(i32t, 1 << 5, 0);
   
   mask = LLVMBuildAnd(builder, mask, LLVMConstVector(bits, 4), "");

   /*
    * mask = mask != 0 ? ~0 : 0
    */

   mask = lp_build_compare(builder,
                           mask_type, PIPE_FUNC_NOTEQUAL,
                           mask,
                           lp_build_const_int_vec(mask_type, 0));

   return mask;
}
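
/*
 * Scalar reference for the expansion above (a sketch of what the emitted
 * IR computes per quad; the helper name is hypothetical).
 */
#include <stdint.h>

static void
quad_mask_ref(unsigned quad, uint32_t mask_input, uint32_t out[4])
{
   static const int shifts[4] = { 0, 2, 8, 10 };  /* 2x2 quads in the 4x4 stamp */
   uint32_t m = mask_input >> shifts[quad];

   out[0] = (m & (1u << 0)) ? ~0u : 0u;   /* top-left     */
   out[1] = (m & (1u << 1)) ? ~0u : 0u;   /* top-right    */
   out[2] = (m & (1u << 4)) ? ~0u : 0u;   /* bottom-left  */
   out[3] = (m & (1u << 5)) ? ~0u : 0u;   /* bottom-right */
}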