/**
 * Fetch pixels and return them in AoS (RGBARGBA...) layout.
 *
 * The result holds type.length / 4 pixels, each taking four consecutive
 * elements.  The first strategy that applies is used:
 *  - a plain gather when the format already matches 'type',
 *  - bit arithmetic for small plain bitmask formats,
 *  - the subsampled (YUV) path,
 *  - a call out to the format's C fetch_rgba_8unorm() helper,
 *  - a call out to the format's C fetch_rgba_float() helper.
 *
 * \param gallivm  gallivm state; code is emitted through gallivm->builder
 * \param format_desc  describes format of the image we're fetching from
 * \param type  the desired type of the result.  type.length must be a
 *              multiple of 4 and no larger than LP_MAX_VECTOR_LENGTH.
 * \param base_ptr  base address that 'offset' is relative to
 * \param offset  offset of the pixel block (or the texel if uncompressed)
 * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
 *              these will always be (0, 0).  They are used as scalars when
 *              a single pixel is fetched and indexed as vectors otherwise.
 * \return  a vector of 'type' with the pixels' RGBA values.
 */
LLVMValueRef
lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
                        const struct util_format_description *format_desc,
                        struct lp_type type,
                        LLVMValueRef base_ptr,
                        LLVMValueRef offset,
                        LLVMValueRef i,
                        LLVMValueRef j)
{
   LLVMBuilderRef builder = gallivm->builder;
   /* Each pixel takes four elements (R, G, B, A) of the result vector. */
   unsigned num_pixels = type.length / 4;
   struct lp_build_context bld;

   assert(type.length <= LP_MAX_VECTOR_LENGTH);
   assert(type.length % 4 == 0);

   lp_build_context_init(&bld, gallivm, type);

   /*
    * Trivial case
    *
    * The format matches the type (apart from a swizzle) so no need for
    * scaling or converting.
    */

   if (format_matches_type(format_desc, type) &&
       format_desc->block.bits <= type.width * 4 &&
       util_is_power_of_two(format_desc->block.bits)) {
      LLVMValueRef packed;

      /*
       * Gather the packed pixels and reinterpret them as a vector of
       * 'type'; only the channel order is left to fix up afterwards.
       */

      packed = lp_build_gather(gallivm, type.length/4,
                               format_desc->block.bits, type.width*4,
                               base_ptr, offset);

      assert(format_desc->block.bits <= type.width * type.length);

      packed = LLVMBuildBitCast(gallivm->builder, packed,
                                lp_build_vec_type(gallivm, type), "");

      return lp_build_format_swizzle_aos(format_desc, &bld, packed);
   }

   /*
    * Bit arithmetic
    *
    * Plain RGB or ZS formats with one pixel per block, a power-of-two size
    * of at most 32 bits, bitmask channels, no mixed channel types, and an
    * unsigned first or second channel.
    */

   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       util_is_power_of_two(format_desc->block.bits) &&
       format_desc->block.bits <= 32 &&
       format_desc->is_bitmask &&
       !format_desc->is_mixed &&
       (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED ||
        format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED)) {

      LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
      LLVMValueRef res;
      unsigned k;

      /*
       * Unpack a pixel at a time into a <4 x float> RGBA vector
       */

      for (k = 0; k < num_pixels; ++k) {
         LLVMValueRef packed;

         packed = lp_build_gather_elem(gallivm, num_pixels,
                                       format_desc->block.bits, 32,
                                       base_ptr, offset, k);

         tmps[k] = lp_build_unpack_arith_rgba_aos(gallivm,
                                                  format_desc,
                                                  packed);
      }

      /*
       * Type conversion.
       *
       * TODO: We could avoid floating conversion for integer to
       * integer conversions.
       */

      /* The conversion below always starts from float, so flag the detour
       * when the destination type is not floating point. */
      if (gallivm_debug & GALLIVM_DEBUG_PERF && !type.floating) {
         debug_printf("%s: unpacking %s with floating point\n",
                      __FUNCTION__, format_desc->short_name);
      }

      /* Convert the num_pixels float32 x 4 vectors into one vector of 'type'. */
      lp_build_conv(gallivm,
                    lp_float32_vec4_type(),
                    type,
                    tmps, num_pixels, &res, 1);

      return lp_build_format_swizzle_aos(format_desc, &bld, res);
   }

   /*
    * YUV / subsampled formats
    */

   if (format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
      struct lp_type tmp_type;
      LLVMValueRef tmp;

      /* Source type for the conversion below: 8-bit normalized values,
       * four per pixel, every other lp_type field left at zero. */
      memset(&tmp_type, 0, sizeof tmp_type);
      tmp_type.width = 8;
      tmp_type.length = num_pixels * 4;
      tmp_type.norm = TRUE;

      tmp = lp_build_fetch_subsampled_rgba_aos(gallivm,
                                               format_desc,
                                               num_pixels,
                                               base_ptr,
                                               offset,
                                               i, j);

      /* Convert in place from the 8-bit type to the requested type. */
      lp_build_conv(gallivm,
                    tmp_type, type,
                    &tmp, 1, &tmp, 1);

      return tmp;
   }

   /*
    * Fallback to util_format_description::fetch_rgba_8unorm().
    *
    * Only taken when the requested type is 8-bit unsigned normalized.
    */

   if (format_desc->fetch_rgba_8unorm &&
       !type.floating && type.width == 8 && !type.sign && type.norm) {
      /*
       * Fallback to calling util_format_description::fetch_rgba_8unorm.
       *
       * This is definitely not the most efficient way of fetching pixels, as
       * we miss the opportunity to do vectorization, but it is convenient
       * for formats or scenarios for which there was no opportunity or
       * incentive to optimize.
       */

      LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(gallivm->builder)));
      char name[256];
      LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
      LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      LLVMValueRef function;
      LLVMValueRef tmp_ptr;
      LLVMValueRef tmp;
      LLVMValueRef res;
      LLVMValueRef callee;
      unsigned k;

      util_snprintf(name, sizeof name, "util_format_%s_fetch_rgba_8unorm",
                    format_desc->short_name);

      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
         debug_printf("%s: falling back to %s\n", __FUNCTION__, name);
      }

      /*
       * Declare and bind format_desc->fetch_rgba_8unorm().
       */

      function = LLVMGetNamedFunction(module, name);
      if (!function) {
         /*
          * Function to call looks like:
          *   fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
          */
         LLVMTypeRef ret_type;
         LLVMTypeRef arg_types[4];
         LLVMTypeRef function_type;

         ret_type = LLVMVoidTypeInContext(gallivm->context);
         arg_types[0] = pi8t;
         arg_types[1] = pi8t;
         arg_types[2] = i32t;
         arg_types[3] = i32t;
         function_type = LLVMFunctionType(ret_type, arg_types,
                                          Elements(arg_types), 0);
         function = LLVMAddFunction(module, name, function_type);

         LLVMSetFunctionCallConv(function, LLVMCCallConv);
         LLVMSetLinkage(function, LLVMExternalLinkage);

         assert(LLVMIsDeclaration(function));
      }

      /* make const pointer for the C fetch_rgba_8unorm function */
      callee = lp_build_const_int_pointer(gallivm,
         func_to_pointer((func_pointer) format_desc->fetch_rgba_8unorm));

      /* cast the callee pointer to the function's type; the declaration
       * above is only used for its type from here on */
      function = LLVMBuildBitCast(builder, callee,
                                  LLVMTypeOf(function), "cast callee");

      /* One 32-bit slot that the C function writes each pixel into. */
      tmp_ptr = lp_build_alloca(gallivm, i32t, "");

      res = LLVMGetUndef(LLVMVectorType(i32t, num_pixels));

      /*
       * Invoke format_desc->fetch_rgba_8unorm() for each pixel and insert
       * the 32-bit result into the AoS result vector.
       */

      for (k = 0; k < num_pixels; ++k) {
         LLVMValueRef index = lp_build_const_int32(gallivm, k);
         LLVMValueRef args[4];

         args[0] = LLVMBuildBitCast(builder, tmp_ptr, pi8t, "");
         args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels,
                                            base_ptr, offset, k);

         /* i and j are scalars for a single pixel, vectors otherwise. */
         if (num_pixels == 1) {
            args[2] = i;
            args[3] = j;
         }
         else {
            args[2] = LLVMBuildExtractElement(builder, i, index, "");
            args[3] = LLVMBuildExtractElement(builder, j, index, "");
         }

         LLVMBuildCall(builder, function, args, Elements(args), "");

         tmp = LLVMBuildLoad(builder, tmp_ptr, "");

         if (num_pixels == 1) {
            res = tmp;
         }
         else {
            res = LLVMBuildInsertElement(builder, res, tmp, index, "");
         }
      }

      /* Bitcast from <n x i32> to <4n x i8> */
      res = LLVMBuildBitCast(builder, res, bld.vec_type, "");

      return res;
   }


   /*
    * Fallback to util_format_description::fetch_rgba_float().
    */

   if (format_desc->fetch_rgba_float) {
      /*
       * Fallback to calling util_format_description::fetch_rgba_float.
       *
       * This is definitely not the most efficient way of fetching pixels, as
       * we miss the opportunity to do vectorization, but it is convenient
       * for formats or scenarios for which there was no opportunity or
       * incentive to optimize.
       */

      LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder)));
      char name[256];
      LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
      LLVMTypeRef f32x4t = LLVMVectorType(f32t, 4);
      LLVMTypeRef pf32t = LLVMPointerType(f32t, 0);
      LLVMTypeRef pi8t = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      LLVMValueRef function;
      LLVMValueRef tmp_ptr;
      LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
      LLVMValueRef res;
      LLVMValueRef callee;
      unsigned k;

      util_snprintf(name, sizeof name, "util_format_%s_fetch_rgba_float",
                    format_desc->short_name);

      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
         debug_printf("%s: falling back to %s\n", __FUNCTION__, name);
      }

      /*
       * Declare and bind format_desc->fetch_rgba_float().
       */

      function = LLVMGetNamedFunction(module, name);
      if (!function) {
         /*
          * Function to call looks like:
          *   fetch(float *dst, const uint8_t *src, unsigned i, unsigned j)
          */
         LLVMTypeRef ret_type;
         LLVMTypeRef arg_types[4];
         LLVMTypeRef function_type;

         ret_type = LLVMVoidTypeInContext(gallivm->context);
         arg_types[0] = pf32t;
         arg_types[1] = pi8t;
         arg_types[2] = i32t;
         arg_types[3] = i32t;
         function_type = LLVMFunctionType(ret_type, arg_types,
                                          Elements(arg_types), 0);
         function = LLVMAddFunction(module, name, function_type);

         LLVMSetFunctionCallConv(function, LLVMCCallConv);
         LLVMSetLinkage(function, LLVMExternalLinkage);

         assert(LLVMIsDeclaration(function));
      }

      /* Note: we're using this casting here instead of LLVMAddGlobalMapping()
       * to work around a bug in LLVM 2.6.
       */

      /* make const pointer for the C fetch_rgba_float function */
      callee = lp_build_const_int_pointer(gallivm,
         func_to_pointer((func_pointer) format_desc->fetch_rgba_float));

      /* cast the callee pointer to the function's type */
      function = LLVMBuildBitCast(builder, callee,
                                  LLVMTypeOf(function), "cast callee");


      /* One <4 x float> slot that the C function writes each pixel into. */
      tmp_ptr = lp_build_alloca(gallivm, f32x4t, "");

      /*
       * Invoke format_desc->fetch_rgba_float() for each pixel and keep each
       * <4 x float> result in tmps[] for the conversion below.
       */

      for (k = 0; k < num_pixels; ++k) {
         LLVMValueRef args[4];

         args[0] = LLVMBuildBitCast(builder, tmp_ptr, pf32t, "");
         args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels,
                                            base_ptr, offset, k);

         /* i and j are scalars for a single pixel, vectors otherwise. */
         if (num_pixels == 1) {
            args[2] = i;
            args[3] = j;
         }
         else {
            LLVMValueRef index = lp_build_const_int32(gallivm, k);
            args[2] = LLVMBuildExtractElement(builder, i, index, "");
            args[3] = LLVMBuildExtractElement(builder, j, index, "");
         }

         LLVMBuildCall(builder, function, args, Elements(args), "");

         tmps[k] = LLVMBuildLoad(builder, tmp_ptr, "");
      }

      /* Convert the num_pixels float32 x 4 vectors into one vector of 'type'. */
      lp_build_conv(gallivm,
                    lp_float32_vec4_type(),
                    type,
                    tmps, num_pixels, &res, 1);

      return res;
   }

   /* No strategy applies to this format/type combination. */
   assert(0);
   return lp_build_undef(gallivm, type);
}
/*
 * Example #2
 * 0
 */
/**
 * Fetch texels from a texture and hand them back as four SoA vectors.
 *
 * The first strategy that applies to the format is used: direct SoA
 * unpacking of small plain formats, dedicated decoders for the two packed
 * float formats, a special case for 64-bit depth/stencil blocks, one
 * 8-bit unorm AoS fetch covering all texels, and finally a texel by texel
 * AoS fetch.
 *
 * \param gallivm  gallivm state; code is emitted through gallivm->builder
 * \param format_desc  describes the format of the texture
 * \param type  the desired type of each 'rgba_out' vector.  Its vector
 *              length is the number of texels to fetch.
 * \param base_ptr  points to the base of the texture mip tree.
 * \param offset    offset to start of the texture image block.  For non-
 *                  compressed formats, this simply is an offset to the texel.
 *                  For compressed formats, it is an offset to the start of the
 *                  compressed data block.
 * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
 *              these will always be (0,0).  For compressed formats, i will
 *              be in [0, block_width-1] and j will be in [0, block_height-1].
 * \param rgba_out  receives the R, G, B and A vectors
 */
void
lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
                        const struct util_format_description *format_desc,
                        struct lp_type type,
                        LLVMValueRef base_ptr,
                        LLVMValueRef offset,
                        LLVMValueRef i,
                        LLVMValueRef j,
                        LLVMValueRef rgba_out[4])
{
   LLVMBuilderRef builder = gallivm->builder;
   const int single_texel_block = (format_desc->block.width == 1 &&
                                   format_desc->block.height == 1);
   const int unpackable_colorspace =
      (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
   const int is_r11g11b10 =
      (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT);
   const int is_rgb9e5 =
      (format_desc->format == PIPE_FORMAT_R9G9B9E5_FLOAT);
   LLVMValueRef texels;

   /*
    * Strategy 1: a whole packed texel fits into one element of the
    * destination type.  Gather the packed texels into a vector
    * (e.g. {XYZW, XYZW, XYZW, XYZW}) and pull each channel out of all
    * elements in parallel.  Float channels are only accepted at 32 bits.
    */
   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       unpackable_colorspace &&
       single_texel_block &&
       format_desc->block.bits <= type.width &&
       (format_desc->channel[0].size == 32 ||
        format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT)) {

      assert(format_desc->block.bits <= type.width);

      texels = lp_build_gather(gallivm, type.length,
                               format_desc->block.bits, type.width,
                               base_ptr, offset, FALSE);

      /* packed texels -> SoA rgba */
      lp_build_unpack_rgba_soa(gallivm, format_desc, type,
                               texels, rgba_out);
      return;
   }

   /*
    * Strategy 2: packed float formats.  Same idea as above, but the
    * AoS packed -> SoA float step needs format specific code.
    */
   if (is_r11g11b10 || is_rgb9e5) {
      assert(type.floating);
      assert(type.width == 32);

      texels = lp_build_gather(gallivm, type.length,
                               format_desc->block.bits, type.width,
                               base_ptr, offset, FALSE);

      if (is_rgb9e5) {
         lp_build_rgb9e5_to_float(gallivm, texels, rgba_out);
      }
      else {
         lp_build_r11g11b10_to_float(gallivm, texels, rgba_out);
      }
      return;
   }

   /*
    * Strategy 3: 64-bit depth/stencil blocks, of which only 32 bits
    * (or 8 bits) per block are actually wanted.
    */
   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
       format_desc->block.bits == 64) {

      if (format_desc->format != PIPE_FORMAT_X32_S8X24_UINT) {
         /* depth: take the 32 bits at the block start as-is */
         assert (format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);

         texels = lp_build_gather(gallivm, type.length,
                                  32, type.width, base_ptr, offset, TRUE);
         texels = LLVMBuildBitCast(builder, texels,
                                   lp_build_vec_type(gallivm, type), "");
      }
      else {
         /*
          * stencil: just move the offsets 4 bytes on and keep the low
          * 8 bits.  (base_ptr could be adjusted instead, even outside
          * the shader.)
          */
         const unsigned stencil_mask_bits = 0xff;
         LLVMValueRef stencil_offset =
            lp_build_const_int_vec(gallivm, type, 4);
         LLVMValueRef stencil_mask;

         offset = LLVMBuildAdd(builder, offset, stencil_offset, "");
         texels = lp_build_gather(gallivm, type.length,
                                  32, type.width, base_ptr, offset, FALSE);
         stencil_mask = lp_build_const_int_vec(gallivm, type,
                                               stencil_mask_bits);
         texels = LLVMBuildAnd(builder, texels, stencil_mask, "");
      }

      /* sss1 or zzz1, consistent with lp_build_unpack_rgba_soa() */
      rgba_out[0] = texels;
      rgba_out[1] = texels;
      rgba_out[2] = texels;
      rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f);
      return;
   }

   /*
    * Strategy 4: one lp_build_fetch_rgba_aos call for all texels, going
    * through 8-bit unorm.
    */
   if (util_format_fits_8unorm(format_desc) &&
       type.floating && type.width == 32 &&
       (type.length == 1 || (type.length % 4 == 0))) {
      struct lp_type rgba8_type;
      LLVMValueRef rgba8;

      memset(&rgba8_type, 0, sizeof rgba8_type);
      rgba8_type.width = 8;
      rgba8_type.length = type.length * 4;
      rgba8_type.norm = TRUE;

      rgba8 = lp_build_fetch_rgba_aos(gallivm, format_desc, rgba8_type,
                                      base_ptr, offset, i, j);

      lp_build_rgba8_to_fi32_soa(gallivm, type, rgba8, rgba_out);
      return;
   }

   /*
    * Last resort: one lp_build_fetch_rgba_aos call per texel.
    *
    * Some vectorization opportunities are lost this way, but it covers
    * formats and scenarios nobody had the opportunity or incentive to
    * optimize.
    */
   {
      struct lp_type pixel_type;
      unsigned texel, chan;

      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
         debug_printf("%s: scalar unpacking of %s\n",
                      __FUNCTION__, format_desc->short_name);
      }

      /* same element type as the result, but just one RGBA pixel long */
      pixel_type = type;
      pixel_type.length = 4;

      for (chan = 0; chan < 4; ++chan) {
         rgba_out[chan] = lp_build_undef(gallivm, type);
      }

      for (texel = 0; texel < type.length; ++texel) {
         LLVMValueRef lane = lp_build_const_int32(gallivm, texel);
         LLVMValueRef lane_offset =
            LLVMBuildExtractElement(builder, offset, lane, "");
         LLVMValueRef lane_i = LLVMBuildExtractElement(builder, i, lane, "");
         LLVMValueRef lane_j = LLVMBuildExtractElement(builder, j, lane, "");

         /* a single {R,G,B,A} pixel */
         LLVMValueRef pixel =
            lp_build_fetch_rgba_aos(gallivm, format_desc, pixel_type,
                                    base_ptr, lane_offset,
                                    lane_i, lane_j);

         /* scatter its channels into slot 'lane' of the SoA vectors */
         for (chan = 0; chan < 4; ++chan) {
            LLVMValueRef chan_index = lp_build_const_int32(gallivm, chan);
            LLVMValueRef chan_value =
               LLVMBuildExtractElement(builder, pixel, chan_index, "");

            rgba_out[chan] = LLVMBuildInsertElement(builder, rgba_out[chan],
                                                    chan_value, lane, "");
         }
      }
   }
}
/**
 * Change the bit width of vector elements, truncating or expanding.
 *
 * Only precision changes: src_type.length * num_srcs must equal
 * dst_type.length * num_dsts.  The vector counts must be 1:N, M:1 or 1:1.
 *
 * NOTE: The sign flags of both types have to be right, because some of
 * the intrinsics employed saturate.
 *
 * \param gallivm  gallivm state; code is emitted through gallivm->builder
 * \param src_type  type of each source vector
 * \param dst_type  type of each destination vector
 * \param src  the source vectors
 * \param num_srcs  number of source vectors
 * \param dst  receives the destination vectors
 * \param num_dsts  number of destination vectors
 */
void
lp_build_resize(struct gallivm_state *gallivm,
                struct lp_type src_type,
                struct lp_type dst_type,
                const LLVMValueRef *src, unsigned num_srcs,
                LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef out[LP_MAX_VECTOR_LENGTH];
   unsigned src_bits, dst_bits;
   unsigned n;

   /*
    * Float <-> int conversion is not handled here; callers do it before or
    * after this function.
    */
   assert(src_type.floating == dst_type.floating);

   /*
    * Double <-> float conversion is not handled yet either, although adding
    * it would take little effort.
    */
   assert((!src_type.floating && !dst_type.floating) ||
          src_type.width == dst_type.width);

   /* The channel count has to stay the same; only precision may change */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   /* No M:N conversion, only 1:N, M:1, or 1:1 */
   assert(num_srcs == 1 || num_dsts == 1);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   /* Total size in bits of one source and one destination vector. */
   src_bits = src_type.width * src_type.length;
   dst_bits = dst_type.width * dst_type.length;

   if (src_type.width == dst_type.width) {
      /*
       * Same width: nothing to do.
       */

      assert(num_srcs == 1);
      assert(num_dsts == 1);

      out[0] = src[0];
   }
   else if (src_type.width > dst_type.width) {
      /*
       * Narrower destination elements: truncate.
       */

      assert(num_dsts == 1);

      if (src_bits == dst_bits) {
         /*
          * Vectors keep their size -- use vector packing intrinsics
          */
         out[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
                                src, num_srcs);
      }
      else if (src_type.width / dst_type.width > num_srcs) {
         /*
          * Split the source vectors (with shuffles) into pieces as large
          * as the destination vector, then pack as usual.
          * Note: cast/extract is avoided because llvm generates atrocious
          * code for it.
          */
         unsigned pieces = src_bits / dst_bits;
         unsigned piece_length = src_type.length / pieces;

         for (n = 0; n < pieces * num_srcs; n++) {
            unsigned first = (n % pieces) * piece_length;

            out[n] = lp_build_extract_range(gallivm, src[n / pieces],
                                            first, piece_length);
         }

         num_srcs *= pieces;
         src_type.length = piece_length;
         out[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
                                out, num_srcs);
      }
      else {
         /*
          * The destination vector is larger than a source vector.  Pack
          * first and concatenate afterwards, simply because this should
          * be more AVX-friendly for the cases we probably hit.
          */
         unsigned parts = dst_bits / src_bits;
         unsigned srcs_per_part = num_srcs / parts;

         dst_type.length = dst_type.length / parts;

         for (n = 0; n < parts; n++) {
            out[n] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
                                   &src[n*srcs_per_part], srcs_per_part);
         }
         out[0] = lp_build_concat(gallivm, out, dst_type, parts);
      }
   }
   else {
      /*
       * Wider destination elements: expand.
       */

      assert(num_srcs == 1);

      if (src_bits == dst_bits) {
         /*
          * Vectors keep their size -- use vector unpack intrinsics
          */
         lp_build_unpack(gallivm, src_type, dst_type, src[0], out, num_dsts);
      }
      else {
         /*
          * Otherwise extend one element at a time.
          */
         assert(src_type.length * num_srcs == dst_type.length * num_dsts);

         for (n = 0; n < num_dsts; n++) {
            out[n] = lp_build_undef(gallivm, dst_type);
         }

         for (n = 0; n < src_type.length; ++n) {
            const unsigned vec = n / dst_type.length;
            LLVMValueRef from = lp_build_const_int32(gallivm, n);
            LLVMValueRef to = lp_build_const_int32(gallivm,
                                                   n % dst_type.length);
            LLVMValueRef elem = LLVMBuildExtractElement(builder, src[0],
                                                        from, "");
            LLVMTypeRef wide_type = lp_build_elem_type(gallivm, dst_type);

            /* sign-extend only when both sides are signed */
            if (!(src_type.sign && dst_type.sign)) {
               elem = LLVMBuildZExt(builder, elem, wide_type, "");
            }
            else {
               elem = LLVMBuildSExt(builder, elem, wide_type, "");
            }

            out[vec] = LLVMBuildInsertElement(builder, out[vec], elem,
                                              to, "");
         }
      }
   }

   /* hand the results back to the caller */
   for (n = 0; n < num_dsts; ++n) {
      dst[n] = out[n];
   }
}
/*
 * Example #4
 * 0
 */
/**
 * Unpack several pixels in SoA.
 *
 * It takes a vector of packed pixels:
 *
 *   packed = {P0, P1, P2, P3, ..., Pn}
 *
 * And will produce four vectors:
 *
 *   red    = {R0, R1, R2, R3, ..., Rn}
 *   green  = {G0, G1, G2, G3, ..., Gn}
 *   blue   = {B0, B1, B2, B3, ..., Bn}
 *   alpha  = {A0, A1, A2, A3, ..., An}
 *
 * It requires that a packed pixel fits into an element of the output
 * channels. The common case is when converting pixel with a depth of 32 bit or
 * less into floats.
 *
 * Each channel of the format is decoded on its own according to its type
 * (void, unsigned, signed, float or fixed), and the decoded channels are
 * then routed to rgba_out through the format's swizzle.
 *
 * \param gallivm  gallivm state; code is emitted through gallivm->builder
 * \param format_desc  the format of the 'packed' incoming pixel vector.
 *                     Must be a plain layout with a 1x1 block.
 * \param type  the desired type for rgba_out (type.length = n, above).
 *              Only 32-bit wide types are supported.
 * \param packed  the incoming vector of packed pixels
 * \param rgba_out  returns the SoA R,G,B,A vectors
 */
void
lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
                         const struct util_format_description *format_desc,
                         struct lp_type type,
                         LLVMValueRef packed,
                         LLVMValueRef rgba_out[4])
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context bld;
   LLVMValueRef inputs[4];
   unsigned chan;

   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
   assert(format_desc->block.width == 1);
   assert(format_desc->block.height == 1);
   assert(format_desc->block.bits <= type.width);
   /* FIXME: Support more output types */
   assert(type.width == 32);

   lp_build_context_init(&bld, gallivm, type);

   /* Decode the input vector components */
   for (chan = 0; chan < format_desc->nr_channels; ++chan) {
      const unsigned width = format_desc->channel[chan].size;
      const unsigned start = format_desc->channel[chan].shift;
      const unsigned stop = start + width;
      LLVMValueRef input;

      input = packed;

      switch(format_desc->channel[chan].type) {
      case UTIL_FORMAT_TYPE_VOID:
         input = lp_build_undef(gallivm, type);
         break;

      case UTIL_FORMAT_TYPE_UNSIGNED:
         /*
          * Align the LSB
          */

         if (start) {
            input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(gallivm, type, start), "");
         }

         /*
          * Zero the MSBs
          */

         if (stop < format_desc->block.bits) {
            unsigned mask = ((unsigned long long)1 << width) - 1;
            input = LLVMBuildAnd(builder, input, lp_build_const_int_vec(gallivm, type, mask), "");
         }

         /*
          * Type conversion
          */

         if (type.floating) {
            if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
               assert(width == 8);
               /* The channel that ends up as alpha stays linear. */
               if (format_desc->swizzle[3] == chan) {
                  input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
               }
               else {
                  struct lp_type conv_type = lp_uint_type(type);
                  input = lp_build_srgb_to_linear(gallivm, conv_type, input);
               }
            }
            else {
               if(format_desc->channel[chan].normalized)
                  input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
               else if (width < type.width)
                  /*
                   * Narrower than the element: keep the signed conversion
                   * the code has always used for these channels.
                   */
                  input = LLVMBuildSIToFP(builder, input,
                                          lp_build_vec_type(gallivm, type), "");
               else
                  /*
                   * The channel fills the whole element, so its top bit is
                   * data.  A signed conversion would turn values with that
                   * bit set into negative numbers.
                   */
                  input = LLVMBuildUIToFP(builder, input,
                                          lp_build_vec_type(gallivm, type), "");
            }
         }
         else if (format_desc->channel[chan].pure_integer) {
            /* Nothing to do */
         } else {
             /* FIXME */
             assert(0);
         }

         break;

      case UTIL_FORMAT_TYPE_SIGNED:
         /*
          * Align the sign bit first.
          */

         if (stop < type.width) {
            unsigned bits = type.width - stop;
            LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
            input = LLVMBuildShl(builder, input, bits_val, "");
         }

         /*
          * Align the LSB (with an arithmetic shift to preserve the sign)
          */

         if (format_desc->channel[chan].size < type.width) {
            unsigned bits = type.width - format_desc->channel[chan].size;
            LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
            input = LLVMBuildAShr(builder, input, bits_val, "");
         }

         /*
          * Type conversion
          */

         if (type.floating) {
            input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), "");
            if (format_desc->channel[chan].normalized) {
               /*
                * Largest positive channel value, 2^(size-1) - 1.  Computed
                * in 64 bits because "1 << 31" overflows int for 32-bit
                * channels.
                */
               const unsigned long long max_val =
                  ((unsigned long long)1 << (format_desc->channel[chan].size - 1)) - 1;
               double scale = 1.0 / (double)max_val;
               LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
               input = LLVMBuildFMul(builder, input, scale_val, "");
               /* the formula above will produce value below -1.0 for most negative
                * value but everything seems happy with that hence disable for now */
               if (0)
                  input = lp_build_max(&bld, input,
                                       lp_build_const_vec(gallivm, type, -1.0f));
            }
         }
         else if (format_desc->channel[chan].pure_integer) {
            /* Nothing to do */
         } else {
             /* FIXME */
             assert(0);
         }

         break;

      case UTIL_FORMAT_TYPE_FLOAT:
         if (type.floating) {
            /* Only a 32-bit float filling the whole element is supported. */
            assert(start == 0);
            assert(stop == 32);
            assert(type.width == 32);
            input = LLVMBuildBitCast(builder, input, lp_build_vec_type(gallivm, type), "");
         }
         else {
            /* FIXME */
            assert(0);
            input = lp_build_undef(gallivm, type);
         }
         break;

      case UTIL_FORMAT_TYPE_FIXED:
         if (type.floating) {
            /*
             * The fraction occupies the low size/2 bits, so 1.0 is stored
             * as 1 << (size/2) and that is the divisor (65536 for 16.16).
             */
            double scale = 1.0 /
               (double)((unsigned long long)1 << (format_desc->channel[chan].size/2));
            LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
            input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), "");
            input = LLVMBuildFMul(builder, input, scale_val, "");
         }
         else {
            /* FIXME */
            assert(0);
            input = lp_build_undef(gallivm, type);
         }
         break;

      default:
         assert(0);
         input = lp_build_undef(gallivm, type);
         break;
      }

      inputs[chan] = input;
   }

   /* Route the decoded channels to R, G, B, A according to the format. */
   lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
}
/**
 * Build code to compare two values 'a' and 'b' of 'type' using the given func.
 *
 * \param gallivm  state providing the LLVM builder and context
 * \param type     type of both 'a' and 'b'
 * \param func     one of PIPE_FUNC_x
 * \param a        left-hand operand
 * \param b        right-hand operand
 * \return  a value of the integer vector type given by
 *          lp_build_int_vec_type() for 'type'.  The result values will be
 *          0 for false or ~0 for true.
 *
 * PIPE_FUNC_NEVER and PIPE_FUNC_ALWAYS are folded into constants without
 * emitting any instruction.  If 'func' is not a valid PIPE_FUNC_x value
 * (only reachable when asserts are disabled) an undefined value of the
 * result type is returned.
 */
LLVMValueRef
lp_build_compare(struct gallivm_state *gallivm,
                 const struct lp_type type,
                 unsigned func,
                 LLVMValueRef a,
                 LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type);
   LLVMValueRef zeros = LLVMConstNull(int_vec_type);
   LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(func >= PIPE_FUNC_NEVER);
   assert(func <= PIPE_FUNC_ALWAYS);
   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* Constant results need no comparison at all. */
   if(func == PIPE_FUNC_NEVER)
      return zeros;
   if(func == PIPE_FUNC_ALWAYS)
      return ones;

#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /*
    * There are no unsigned integer comparison instructions in SSE.
    * Only report it; the comparison itself is still built below.
    */

   if (!type.floating && !type.sign &&
       type.width * type.length == 128 &&
       util_cpu_caps.has_sse2 &&
       (func == PIPE_FUNC_LESS ||
        func == PIPE_FUNC_LEQUAL ||
        func == PIPE_FUNC_GREATER ||
        func == PIPE_FUNC_GEQUAL) &&
       (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient <%u x i%u> unsigned comparison\n",
                      __FUNCTION__, type.length, type.width);
   }
#endif

#if HAVE_LLVM < 0x0207
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /*
    * Older LLVM: use the SSE/SSE2 intrinsics directly for 128 bit vectors.
    */
   if(type.width * type.length == 128) {
      /*
       * llvm.x86.sse.cmp.ps operates on 4 x float only, so other floating
       * point widths must not take this branch.
       */
      if(type.floating && type.width == 32 && util_cpu_caps.has_sse) {
         /* float[4] comparison */
         LLVMTypeRef vec_type = lp_build_vec_type(gallivm, type);
         LLVMValueRef args[3];
         unsigned cc;
         boolean swap;

         /*
          * 'cc' is the predicate immediate passed to the intrinsic
          * (0 = eq, 1 = lt, 2 = le, 4 = neq).  Greater-than variants are
          * expressed as less-than with the operands swapped.
          */
         swap = FALSE;
         switch(func) {
         case PIPE_FUNC_EQUAL:
            cc = 0;
            break;
         case PIPE_FUNC_NOTEQUAL:
            cc = 4;
            break;
         case PIPE_FUNC_LESS:
            cc = 1;
            break;
         case PIPE_FUNC_LEQUAL:
            cc = 2;
            break;
         case PIPE_FUNC_GREATER:
            cc = 1;
            swap = TRUE;
            break;
         case PIPE_FUNC_GEQUAL:
            cc = 2;
            swap = TRUE;
            break;
         default:
            assert(0);
            /* Keep the result type consistent with every other path. */
            return LLVMGetUndef(int_vec_type);
         }

         if(swap) {
            args[0] = b;
            args[1] = a;
         }
         else {
            args[0] = a;
            args[1] = b;
         }

         args[2] = LLVMConstInt(LLVMInt8TypeInContext(gallivm->context), cc, 0);
         res = lp_build_intrinsic(builder,
                                  "llvm.x86.sse.cmp.ps",
                                  vec_type,
                                  args, 3);
         /* The intrinsic yields a float vector; callers expect an int mask. */
         res = LLVMBuildBitCast(builder, res, int_vec_type, "");
         return res;
      }
      else if(!type.floating && util_cpu_caps.has_sse2) {
         /* int[4] comparison */

         /*
          * Per PIPE_FUNC_x recipe, indexed by 'func':
          *   swap - exchange the operands before comparing
          *   eq   - use the pcmpeq intrinsic
          *   gt   - use the pcmpgt intrinsic
          *   not  - invert the intrinsic's result
          */
         static const struct {
            unsigned swap:1;
            unsigned eq:1;
            unsigned gt:1;
            unsigned not:1;
         } table[] = {
            {0, 0, 0, 1}, /* PIPE_FUNC_NEVER */
            {1, 0, 1, 0}, /* PIPE_FUNC_LESS */
            {0, 1, 0, 0}, /* PIPE_FUNC_EQUAL */
            {0, 0, 1, 1}, /* PIPE_FUNC_LEQUAL */
            {0, 0, 1, 0}, /* PIPE_FUNC_GREATER */
            {0, 1, 0, 1}, /* PIPE_FUNC_NOTEQUAL */
            {1, 0, 1, 1}, /* PIPE_FUNC_GEQUAL */
            {0, 0, 0, 0}  /* PIPE_FUNC_ALWAYS */
         };
         const char *pcmpeq;
         const char *pcmpgt;
         LLVMValueRef args[2];
         LLVMTypeRef vec_type = lp_build_vec_type(gallivm, type);

         switch (type.width) {
         case 8:
            pcmpeq = "llvm.x86.sse2.pcmpeq.b";
            pcmpgt = "llvm.x86.sse2.pcmpgt.b";
            break;
         case 16:
            pcmpeq = "llvm.x86.sse2.pcmpeq.w";
            pcmpgt = "llvm.x86.sse2.pcmpgt.w";
            break;
         case 32:
            pcmpeq = "llvm.x86.sse2.pcmpeq.d";
            pcmpgt = "llvm.x86.sse2.pcmpgt.d";
            break;
         default:
            assert(0);
            return LLVMGetUndef(int_vec_type);
         }

         /* There are no unsigned comparison instructions. So flip the sign bit
          * so that the results match.
          */
         if (table[func].gt && !type.sign) {
            LLVMValueRef msb = lp_build_const_int_vec(gallivm, type, (unsigned long long)1 << (type.width - 1));
            a = LLVMBuildXor(builder, a, msb, "");
            b = LLVMBuildXor(builder, b, msb, "");
         }

         if(table[func].swap) {
            args[0] = b;
            args[1] = a;
         }
         else {
            args[0] = a;
            args[1] = b;
         }

         if(table[func].eq)
            res = lp_build_intrinsic(builder, pcmpeq, vec_type, args, 2);
         else if (table[func].gt)
            res = lp_build_intrinsic(builder, pcmpgt, vec_type, args, 2);
         else
            res = LLVMConstNull(vec_type);

         if(table[func].not)
            res = LLVMBuildNot(builder, res, "");

         return res;
      }
   } /* if (type.width * type.length == 128) */
#endif
#endif /* HAVE_LLVM < 0x0207 */

   /*
    * Generic path, built from LLVM's own comparison instructions.
    */

   /* XXX: It is not clear if we should use the ordered or unordered operators */

   if(type.floating) {
      LLVMRealPredicate op;
      switch(func) {
      case PIPE_FUNC_NEVER:
         op = LLVMRealPredicateFalse;
         break;
      case PIPE_FUNC_ALWAYS:
         op = LLVMRealPredicateTrue;
         break;
      case PIPE_FUNC_EQUAL:
         op = LLVMRealUEQ;
         break;
      case PIPE_FUNC_NOTEQUAL:
         op = LLVMRealUNE;
         break;
      case PIPE_FUNC_LESS:
         op = LLVMRealULT;
         break;
      case PIPE_FUNC_LEQUAL:
         op = LLVMRealULE;
         break;
      case PIPE_FUNC_GREATER:
         op = LLVMRealUGT;
         break;
      case PIPE_FUNC_GEQUAL:
         op = LLVMRealUGE;
         break;
      default:
         assert(0);
         return LLVMGetUndef(int_vec_type);
      }

#if HAVE_LLVM >= 0x0207
      /* Sign-extending the i1 result gives 0 / ~0 per element. */
      cond = LLVMBuildFCmp(builder, op, a, b, "");
      res = LLVMBuildSExt(builder, cond, int_vec_type, "");
#else
      if (type.length == 1) {
         cond = LLVMBuildFCmp(builder, op, a, b, "");
         res = LLVMBuildSExt(builder, cond, int_vec_type, "");
      }
      else {
         unsigned i;

         res = LLVMGetUndef(int_vec_type);

         /* Performance note only: gate it like the other slow-path notices. */
         if (gallivm_debug & GALLIVM_DEBUG_PERF) {
            debug_printf("%s: warning: using slow element-wise float"
                         " vector comparison\n", __FUNCTION__);
         }

         /* Compare one element at a time and rebuild the mask vector. */
         for (i = 0; i < type.length; ++i) {
            LLVMValueRef index = lp_build_const_int32(gallivm, i);
            cond = LLVMBuildFCmp(builder, op,
                                 LLVMBuildExtractElement(builder, a, index, ""),
                                 LLVMBuildExtractElement(builder, b, index, ""),
                                 "");
            cond = LLVMBuildSelect(builder, cond,
                                   LLVMConstExtractElement(ones, index),
                                   LLVMConstExtractElement(zeros, index),
                                   "");
            res = LLVMBuildInsertElement(builder, res, cond, index, "");
         }
      }
#endif
   }
   else {
      LLVMIntPredicate op;
      /* Ordering predicates depend on the signedness of 'type'. */
      switch(func) {
      case PIPE_FUNC_EQUAL:
         op = LLVMIntEQ;
         break;
      case PIPE_FUNC_NOTEQUAL:
         op = LLVMIntNE;
         break;
      case PIPE_FUNC_LESS:
         op = type.sign ? LLVMIntSLT : LLVMIntULT;
         break;
      case PIPE_FUNC_LEQUAL:
         op = type.sign ? LLVMIntSLE : LLVMIntULE;
         break;
      case PIPE_FUNC_GREATER:
         op = type.sign ? LLVMIntSGT : LLVMIntUGT;
         break;
      case PIPE_FUNC_GEQUAL:
         op = type.sign ? LLVMIntSGE : LLVMIntUGE;
         break;
      default:
         assert(0);
         return LLVMGetUndef(int_vec_type);
      }

#if HAVE_LLVM >= 0x0207
      /* Sign-extending the i1 result gives 0 / ~0 per element. */
      cond = LLVMBuildICmp(builder, op, a, b, "");
      res = LLVMBuildSExt(builder, cond, int_vec_type, "");
#else
      if (type.length == 1) {
         cond = LLVMBuildICmp(builder, op, a, b, "");
         res = LLVMBuildSExt(builder, cond, int_vec_type, "");
      }
      else {
         unsigned i;

         res = LLVMGetUndef(int_vec_type);

         if (gallivm_debug & GALLIVM_DEBUG_PERF) {
            debug_printf("%s: using slow element-wise int"
                         " vector comparison\n", __FUNCTION__);
         }

         /* Compare one element at a time and rebuild the mask vector. */
         for(i = 0; i < type.length; ++i) {
            LLVMValueRef index = lp_build_const_int32(gallivm, i);
            cond = LLVMBuildICmp(builder, op,
                                 LLVMBuildExtractElement(builder, a, index, ""),
                                 LLVMBuildExtractElement(builder, b, index, ""),
                                 "");
            cond = LLVMBuildSelect(builder, cond,
                                   LLVMConstExtractElement(ones, index),
                                   LLVMConstExtractElement(zeros, index),
                                   "");
            res = LLVMBuildInsertElement(builder, res, cond, index, "");
         }
      }
#endif
   }

   return res;
}