Example #1
/**
 * Interleave vector elements.
 *
 * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions
 * (but not for 256bit AVX vectors).
 */
LLVMValueRef
lp_build_interleave2(struct gallivm_state *gallivm,
                     struct lp_type type,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     unsigned lo_hi)
{
   LLVMValueRef shuffle;

   if (type.length == 2 && type.width == 128 && util_cpu_caps.has_avx) {
      /*
       * XXX: This is a workaround for an llvm code generation deficiency.
       * Strangely enough, while this operation needs vinsertf128/vextractf128
       * instructions (hence a natural match when using 2x128bit vectors), the
       * "normal" unpack shuffle generates code ranging from atrocious
       * (llvm 3.1) to terrible (llvm 3.2, 3.3). So use some different
       * shuffles instead: the exact shuffles don't seem to matter, as long
       * as they avoid 128bit wide elements -- both 8x32 and 4x64 work.
       */
      struct lp_type tmp_type = type;
      LLVMValueRef srchalf[2], tmpdst;
      tmp_type.length = 4;
      tmp_type.width = 64;
      a = LLVMBuildBitCast(gallivm->builder, a, lp_build_vec_type(gallivm, tmp_type), "");
      b = LLVMBuildBitCast(gallivm->builder, b, lp_build_vec_type(gallivm, tmp_type), "");
      srchalf[0] = lp_build_extract_range(gallivm, a, lo_hi * 2, 2);
      srchalf[1] = lp_build_extract_range(gallivm, b, lo_hi * 2, 2);
      tmp_type.length = 2;
      tmpdst = lp_build_concat(gallivm, srchalf, tmp_type, 2);
      return LLVMBuildBitCast(gallivm->builder, tmpdst, lp_build_vec_type(gallivm, type), "");
   }

   shuffle = lp_build_const_unpack_shuffle(gallivm, type.length, lo_hi);

   return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
}
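For reference, a minimal scalar sketch of the element movement this performs
(plain C, independent of gallivm; interleave2_ref and the 4x32 shape are
illustrative assumptions, not part of the source):

#include <stdio.h>

/* Reference model: interleave the low (lo_hi = 0) or high (lo_hi = 1)
 * halves of a and b, mirroring PUNPCKLDQ/PUNPCKHDQ on 4x32bit vectors. */
static void
interleave2_ref(const unsigned a[4], const unsigned b[4],
                unsigned dst[4], unsigned lo_hi)
{
   unsigned base = lo_hi * 2;   /* start of the selected half */
   unsigned i;
   for (i = 0; i < 2; ++i) {
      dst[2 * i + 0] = a[base + i];
      dst[2 * i + 1] = b[base + i];
   }
}

int main(void)
{
   const unsigned a[4] = { 0xa0, 0xa1, 0xa2, 0xa3 };
   const unsigned b[4] = { 0xb0, 0xb1, 0xb2, 0xb3 };
   unsigned dst[4];
   unsigned i;

   interleave2_ref(a, b, dst, 0);   /* low half: a0 b0 a1 b1 */
   for (i = 0; i < 4; ++i)
      printf("%#x ", dst[i]);
   printf("\n");
   return 0;
}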
Example #2
/**
 * Combines vectors to reduce from num_srcs to num_dsts.
 * Returns the number of src vectors concatenated in a single dst.
 *
 * num_srcs must be exactly divisible by num_dsts.
 *
 * e.g. For num_srcs = 4 and src = [x, y, z, w]
 *          num_dsts = 1  dst = [xyzw]    return = 4
 *          num_dsts = 2  dst = [xy, zw]  return = 2
 */
int
lp_build_concat_n(struct gallivm_state *gallivm,
                  struct lp_type src_type,
                  LLVMValueRef *src,
                  unsigned num_srcs,
                  LLVMValueRef *dst,
                  unsigned num_dsts)
{
   int size = num_srcs / num_dsts;
   int i;

   assert(num_srcs >= num_dsts);
   assert((num_srcs % size) == 0);

   if (num_srcs == num_dsts) {
      for (i = 0; i < num_dsts; ++i) {
         dst[i] = src[i];
      }
      return 1;
   }

   for (i = 0; i < num_dsts; ++i) {
      dst[i] = lp_build_concat(gallivm, &src[i * size], src_type, size);
   }

   return size;
}
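To make the grouping concrete, here is a minimal scalar model assuming toy
2-element vectors; concat_n_ref and VEC_LEN are made-up names for this
sketch only:

#include <assert.h>
#include <stdio.h>

#define VEC_LEN 2   /* elements per src vector in this toy example */

/* Mirror lp_build_concat_n's grouping: dst[i] is the concatenation of
 * size = num_srcs / num_dsts consecutive src vectors. */
static int
concat_n_ref(const int src[][VEC_LEN], unsigned num_srcs,
             int dst[][4 * VEC_LEN], unsigned num_dsts)
{
   unsigned size = num_srcs / num_dsts;
   unsigned i, j, k;

   assert(num_srcs % num_dsts == 0);

   for (i = 0; i < num_dsts; ++i)
      for (j = 0; j < size; ++j)
         for (k = 0; k < VEC_LEN; ++k)
            dst[i][j * VEC_LEN + k] = src[i * size + j][k];
   return (int)size;
}

int main(void)
{
   const int src[4][VEC_LEN] = { {1, 2}, {3, 4}, {5, 6}, {7, 8} };
   int dst[2][4 * VEC_LEN];
   int size = concat_n_ref(src, 4, dst, 2);   /* dst = [xy, zw] shape */

   printf("size = %d, dst[0] = %d %d %d %d\n",
          size, dst[0][0], dst[0][1], dst[0][2], dst[0][3]);
   return 0;
}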
Example #3
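/**
 * Convert from AoS to SoA layout: collect channel j from every pixel's
 * interleaved src vectors into one vector per channel, then transpose.
 */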
static void
convert_to_soa(struct gallivm_state *gallivm,
               LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
               LLVMValueRef dst_soa[4],
               const struct lp_type soa_type)
{
   unsigned j, k;
   struct lp_type aos_channel_type = soa_type;

   LLVMValueRef aos_channels[4];
   unsigned pixels_per_channel = soa_type.length / 4;

   debug_assert((soa_type.length % 4) == 0);

   aos_channel_type.length >>= 1;

   for (j = 0; j < 4; ++j) {
      LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };

      assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);

      for (k = 0; k < pixels_per_channel; ++k) {
         channel[k] = src_aos[j + 4 * k];
      }

      aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
   }

   lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
}
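In effect this transposes channels against pixels. A minimal scalar sketch of
the AoS -> SoA movement (plain C; convert_to_soa_ref and NUM_PIXELS are
illustrative assumptions, not from the source):

#include <stdio.h>

#define NUM_PIXELS 4

/* Reference model of AoS -> SoA: gather each channel j from every
 * RGBA pixel into its own contiguous vector. */
static void
convert_to_soa_ref(const int aos[NUM_PIXELS][4], int soa[4][NUM_PIXELS])
{
   unsigned j, k;
   for (j = 0; j < 4; ++j)           /* channel: R, G, B, A */
      for (k = 0; k < NUM_PIXELS; ++k)
         soa[j][k] = aos[k][j];
}

int main(void)
{
   const int aos[NUM_PIXELS][4] = {
      { 10, 20, 30, 40 },   /* pixel 0: r g b a */
      { 11, 21, 31, 41 },
      { 12, 22, 32, 42 },
      { 13, 23, 33, 43 },
   };
   int soa[4][NUM_PIXELS];
   unsigned k;

   convert_to_soa_ref(aos, soa);
   for (k = 0; k < NUM_PIXELS; ++k)
      printf("%d ", soa[0][k]);      /* all red values: 10 11 12 13 */
   printf("\n");
   return 0;
}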
Example #4
/**
 * Truncate or expand the bitwidth.
 *
 * NOTE: Getting the right sign flags is crucial here, as we employ some
 * intrinsics that do saturation.
 */
void
lp_build_resize(struct gallivm_state *gallivm,
                struct lp_type src_type,
                struct lp_type dst_type,
                const LLVMValueRef *src, unsigned num_srcs,
                LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   /*
    * We don't support float <-> int conversion here. That must be done
    * before/after calling this function.
    */
   assert(src_type.floating == dst_type.floating);

   /*
    * We don't support double <-> float conversion yet, although it could be
    * added with little effort.
    */
   assert((!src_type.floating && !dst_type.floating) ||
          src_type.width == dst_type.width);

   /* We must not lose or gain channels, only precision. */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   /* We don't support M:N conversion, only 1:N, M:1, or 1:1 */
   assert(num_srcs == 1 || num_dsts == 1);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   if (src_type.width > dst_type.width) {
      /*
       * Truncate bit width.
       */

      assert(num_dsts == 1);

      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
         /*
          * Register width remains constant -- use vector packing intrinsics
          */
         tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
      }
      else {
         if (src_type.width / dst_type.width > num_srcs) {
            /*
             * First shrink the src vectors (with a shuffle) so they match the
             * destination vector size, then pack normally.
             * Note: cannot use cast/extract because llvm generates atrocious code.
             */
            unsigned size_ratio = (src_type.width * src_type.length) /
                                  (dst_type.length * dst_type.width);
            unsigned new_length = src_type.length / size_ratio;

            for (i = 0; i < size_ratio * num_srcs; i++) {
               unsigned start_index = (i % size_ratio) * new_length;
               tmp[i] = lp_build_extract_range(gallivm, src[i / size_ratio],
                                               start_index, new_length);
            }
            num_srcs *= size_ratio;
            src_type.length = new_length;
            tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, tmp, num_srcs);
         }
         else {
            /*
             * Truncate bit width but expand vector size - first pack
             * then expand simply because this should be more AVX-friendly
             * for the cases we probably hit.
             */
            unsigned size_ratio = (dst_type.width * dst_type.length) /
                                  (src_type.length * src_type.width);
            unsigned num_pack_srcs = num_srcs / size_ratio;
            dst_type.length = dst_type.length / size_ratio;

            for (i = 0; i < size_ratio; i++) {
               tmp[i] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
                                      &src[i*num_pack_srcs], num_pack_srcs);
            }
            tmp[0] = lp_build_concat(gallivm, tmp, dst_type, size_ratio);
         }
      }
   }
   else if (src_type.width < dst_type.width) {
      /*
       * Expand bit width.
       */

      assert(num_srcs == 1);

      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
         /*
          * Register width remains constant -- use vector unpack intrinsics
          */
         lp_build_unpack(gallivm, src_type, dst_type, src[0], tmp, num_dsts);
      }
      else {
         /*
          * Do it element-wise.
          */
         assert(src_type.length * num_srcs == dst_type.length * num_dsts);

         for (i = 0; i < num_dsts; i++) {
            tmp[i] = lp_build_undef(gallivm, dst_type);
         }

         for (i = 0; i < src_type.length; ++i) {
            unsigned j = i / dst_type.length;
            LLVMValueRef srcindex = lp_build_const_int32(gallivm, i);
            LLVMValueRef dstindex = lp_build_const_int32(gallivm, i % dst_type.length);
            LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], srcindex, "");

            if (src_type.sign && dst_type.sign) {
               val = LLVMBuildSExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
            } else {
               val = LLVMBuildZExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
            }
            tmp[j] = LLVMBuildInsertElement(builder, tmp[j], val, dstindex, "");
         }
      }
   }
   else {
      /*
       * No-op
       */

      assert(num_srcs == 1);
      assert(num_dsts == 1);

      tmp[0] = src[0];
   }

   for(i = 0; i < num_dsts; ++i)
      dst[i] = tmp[i];
}
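A minimal scalar model of the expand path's sign handling (sext only when
both types are signed, zext otherwise), assuming a 16bit -> 32bit resize;
resize_expand_ref is a made-up name for this sketch:

#include <stdint.h>
#include <stdio.h>

/* Reference model of the expand path: widen each element with sign or
 * zero extension depending on the signedness of both types. */
static void
resize_expand_ref(const int16_t *src, int32_t *dst, unsigned n,
                  int src_sign, int dst_sign)
{
   unsigned i;
   for (i = 0; i < n; ++i) {
      if (src_sign && dst_sign)
         dst[i] = (int32_t)src[i];                 /* sext */
      else
         dst[i] = (int32_t)(uint16_t)src[i];       /* zext */
   }
}

int main(void)
{
   const int16_t src[4] = { -1, 2, -3, 4 };
   int32_t dst[4];
   unsigned i;

   resize_expand_ref(src, dst, 4, 1, 1);   /* signed: -1 2 -3 4 */
   for (i = 0; i < 4; ++i)
      printf("%d ", (int)dst[i]);
   printf("\n");

   resize_expand_ref(src, dst, 4, 0, 1);   /* unsigned src: 65535 ... */
   printf("%d\n", (int)dst[0]);
   return 0;
}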
Example #5
/**
 * Non-interleaved pack.
 *
 * This will move values as
 *         (LSB)                     (MSB)
 *   lo =   l0 __ l1 __ l2 __..  __ ln __
 *   hi =   h0 __ h1 __ h2 __..  __ hn __
 *   res =  l0 l1 l2 .. ln h0 h1 h2 .. hn
 *
 * This will only change the number of bits used to represent the values,
 * not the values themselves.
 *
 * It is assumed the values are already clamped into the destination type's
 * range. Values outside that range will produce undefined results; use
 * lp_build_packs2 instead in that case.
 */
LLVMValueRef
lp_build_pack2(struct gallivm_state *gallivm,
               struct lp_type src_type,
               struct lp_type dst_type,
               LLVMValueRef lo,
               LLVMValueRef hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMValueRef shuffle;
   LLVMValueRef res = NULL;
   struct lp_type intr_type = dst_type;

#if HAVE_LLVM < 0x0207
   intr_type = src_type;
#endif

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   /* Check for special cases first */
   if(util_cpu_caps.has_sse2 && src_type.width * src_type.length >= 128) {
      const char *intrinsic = NULL;

      switch(src_type.width) {
      case 32:
         if(dst_type.sign) {
            intrinsic = "llvm.x86.sse2.packssdw.128";
         }
         else {
            if (util_cpu_caps.has_sse4_1) {
               intrinsic = "llvm.x86.sse41.packusdw";
#if HAVE_LLVM < 0x0207
               /* llvm < 2.7 has inconsistent signatures except for packusdw */
               intr_type = dst_type;
#endif
            }
         }
         break;
      case 16:
         if (dst_type.sign) {
            intrinsic = "llvm.x86.sse2.packsswb.128";
         }
         else {
            intrinsic = "llvm.x86.sse2.packuswb.128";
         }
         break;
      /* default uses generic shuffle below */
      }
      if (intrinsic) {
         if (src_type.width * src_type.length == 128) {
            LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
            res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi);
            if (dst_vec_type != intr_vec_type) {
               res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
            }
         }
         else {
            int num_split = src_type.width * src_type.length / 128;
            int i;
            int nlen = 128 / src_type.width;
            struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128);
            struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128);
            LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128];
            LLVMValueRef tmplo, tmphi;
            LLVMTypeRef ndst_vec_type = lp_build_vec_type(gallivm, ndst_type);
            LLVMTypeRef nintr_vec_type = lp_build_vec_type(gallivm, nintr_type);

            assert(num_split <= LP_MAX_VECTOR_WIDTH / 128);

            for (i = 0; i < num_split / 2; i++) {
               tmplo = lp_build_extract_range(gallivm,
                                              lo, i*nlen*2, nlen);
               tmphi = lp_build_extract_range(gallivm,
                                              lo, i*nlen*2 + nlen, nlen);
               tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic,
                                                     nintr_vec_type, tmplo, tmphi);
               if (ndst_vec_type != nintr_vec_type) {
                  tmpres[i] = LLVMBuildBitCast(builder, tmpres[i], ndst_vec_type, "");
               }
            }
            for (i = 0; i < num_split / 2; i++) {
               tmplo = lp_build_extract_range(gallivm,
                                              hi, i*nlen*2, nlen);
               tmphi = lp_build_extract_range(gallivm,
                                              hi, i*nlen*2 + nlen, nlen);
               tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic,
                                                                 nintr_vec_type,
                                                                 tmplo, tmphi);
               if (ndst_vec_type != nintr_vec_type) {
                  tmpres[i+num_split/2] = LLVMBuildBitCast(builder, tmpres[i+num_split/2],
                                                           ndst_vec_type, "");
               }
            }
            res = lp_build_concat(gallivm, tmpres, ndst_type, num_split);
         }
         return res;
      }
   }

   /* generic shuffle */
   lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
   hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");

   shuffle = lp_build_const_pack_shuffle(gallivm, dst_type.length);

   res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");

   return res;
}
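A minimal scalar model of the element movement described above, assuming a
4x32 -> 8x16 narrowing with in-range values; pack2_ref is a made-up name for
this sketch:

#include <stdint.h>
#include <stdio.h>

/* Reference model of lp_build_pack2's element movement: low elements
 * first, then high, each truncated to the narrower destination width. */
static void
pack2_ref(const int32_t lo[4], const int32_t hi[4], int16_t res[8])
{
   unsigned i;
   for (i = 0; i < 4; ++i) {
      res[i]     = (int16_t)lo[i];
      res[i + 4] = (int16_t)hi[i];
   }
}

int main(void)
{
   const int32_t lo[4] = { 1, 2, 3, 4 };
   const int32_t hi[4] = { 5, 6, 7, 8 };
   int16_t res[8];
   unsigned i;

   pack2_ref(lo, hi, res);   /* l0..ln h0..hn -> 1 2 3 4 5 6 7 8 */
   for (i = 0; i < 8; ++i)
      printf("%d ", res[i]);
   printf("\n");
   return 0;
}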
Example #6
/**
 * Gather elements from scatter positions in memory into a single vector.
 * Use for fetching texels from a texture.
 * For SSE, typical values are length=4, src_width=32, dst_width=32.
 *
 * When src_width < dst_width, the return value can be justified in
 * one of two ways:
 * "integer justification" is used when the caller treats the destination
 * as a packed integer bitmask, as described by the channels' "shift" and
 * "width" fields;
 * "vector justification" is used when the caller casts the destination
 * to a vector and needs channel X to be in vector element 0.
 *
 * @param length length of the offsets
 * @param src_width src element width in bits
 * @param dst_type result element type (src will be expanded to fit,
 *        but truncation is not allowed)
 *        (this may be a vector; it must be pot (power-of-two) sized)
 * @param aligned whether the data is guaranteed to be aligned (to src_width)
 * @param base_ptr base pointer, needs to be an i8 pointer type.
 * @param offsets vector with offsets
 * @param vector_justify select vector rather than integer justification
 */
LLVMValueRef
lp_build_gather(struct gallivm_state *gallivm,
                unsigned length,
                unsigned src_width,
                struct lp_type dst_type,
                boolean aligned,
                LLVMValueRef base_ptr,
                LLVMValueRef offsets,
                boolean vector_justify)
{
   LLVMValueRef res;
   boolean need_expansion = src_width < dst_type.width * dst_type.length;
   boolean vec_fetch;
   struct lp_type fetch_type, fetch_dst_type;
   LLVMTypeRef src_type;

   assert(src_width <= dst_type.width * dst_type.length);

   /*
    * This is quite a mess...
    * Figure out if the fetch should be done as:
    * a) scalar or vector
    * b) float or int
    *
    * As an example, for a 96bit fetch expanded into 4x32bit, it is better
    * to use a (3x32bit) vector type (then pad the vector). Otherwise, the
    * zext will cause extra instructions.
    * However, the same isn't true for 3x16bit (the codegen for that is
    * completely worthless on x86 simd, and for 3x8bit it is way worse
    * still; don't try that...). To get really good code out of llvm for
    * these cases, the only way is to decompose the fetches manually
    * into 1x32bit/1x16bit, or 1x16bit/1x8bit respectively, although the
    * latter case requires sse41, otherwise a simple scalar zext is way
    * better. But that's probably not important enough, so don't bother.
    * Also, we try to honor the floating bit of the destination (but this
    * isn't possible if the caller asks, for instance, for a 2x32bit
    * dst_type with a 48bit fetch - the idea would be to use a 3x16bit
    * fetch, pad and cast to a 2x32f type, so the fetch is always int and
    * on top of that we avoid the vec pad and use scalar zext due to the
    * above mentioned issue).
    * Note this is optimized for the x86 sse2 and up backend. Could be
    * tweaked for other archs if necessary...
    */
   if (((src_width % 32) == 0) && ((src_width % dst_type.width) == 0) &&
       (dst_type.length > 1)) {
      /* use vector fetch (if dst_type is vector) */
      vec_fetch = TRUE;
      if (dst_type.floating) {
         fetch_type = lp_type_float_vec(dst_type.width, src_width);
      } else {
         fetch_type = lp_type_int_vec(dst_type.width, src_width);
      }
      /* intentionally not using lp_build_vec_type here */
      src_type = LLVMVectorType(lp_build_elem_type(gallivm, fetch_type),
                                fetch_type.length);
      fetch_dst_type = fetch_type;
      fetch_dst_type.length = dst_type.length;
   } else {
      /* use scalar fetch */
      vec_fetch = FALSE;
      if (dst_type.floating && ((src_width == 32) || (src_width == 64))) {
         fetch_type = lp_type_float(src_width);
      } else {
         fetch_type = lp_type_int(src_width);
      }
      src_type = lp_build_vec_type(gallivm, fetch_type);
      fetch_dst_type = fetch_type;
      fetch_dst_type.width = dst_type.width * dst_type.length;
   }

   if (length == 1) {
      /* Scalar */
      res = lp_build_gather_elem_vec(gallivm, length,
                                     src_width, src_type, fetch_dst_type,
                                     aligned, base_ptr, offsets, 0,
                                     vector_justify);
      return LLVMBuildBitCast(gallivm->builder, res,
                              lp_build_vec_type(gallivm, dst_type), "");
   /*
    * Excluding expansion from these paths because if you need it for
    * 32bit/64bit fetches you're doing it wrong (this is gather, not
    * conversion) and it would be awkward for floats.
    */
   } else if (util_cpu_caps.has_avx2 && !need_expansion &&
              src_width == 32 && (length == 4 || length == 8)) {
      return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
                                  base_ptr, offsets);
   /*
    * This looks bad on paper wrt throughput/latency on Haswell.
    * Even on Broadwell it doesn't look stellar.
    * No measurements were done, albeit it was tested to work.
    * Should definitely be enabled on Skylake.
    * (In general, this should be more of a win if the fetch is 256bit wide -
    * this is true for the 32bit case above too.)
    */
   } else if (0 && util_cpu_caps.has_avx2 && !need_expansion &&
              src_width == 64 && (length == 2 || length == 4)) {
      return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
                                  base_ptr, offsets);
   } else {
      /* Vector */

      LLVMValueRef elems[LP_MAX_VECTOR_WIDTH / 8];
      unsigned i;
      boolean vec_zext = FALSE;
      struct lp_type res_type, gather_res_type;
      LLVMTypeRef res_t, gather_res_t;

      res_type = fetch_dst_type;
      res_type.length *= length;
      gather_res_type = res_type;

      if (src_width == 16 && dst_type.width == 32 && dst_type.length == 1) {
         /*
          * Note that llvm is never able to optimize zext/insert combos
          * directly (i.e. zero the simd reg, then place the elements into
          * the appropriate place directly). (I think this has to do with
          * scalar/vector transition.) And scalar 16->32bit zext simd loads
          * aren't possible (instead loading to scalar reg first).
          * No idea about other archs...
          * We could do this manually, but instead we just use a vector
          * zext, which is simple enough (and, in fact, llvm might optimize
          * this away).
          * (We're not trying that with other bit widths as that might not be
          * easier, in particular with 8 bit values at least with only sse2.)
          */
         assert(vec_fetch == FALSE);
         gather_res_type.width /= 2;
         fetch_dst_type = fetch_type;
         src_type = lp_build_vec_type(gallivm, fetch_type);
         vec_zext = TRUE;
      }
      res_t = lp_build_vec_type(gallivm, res_type);
      gather_res_t = lp_build_vec_type(gallivm, gather_res_type);
      res = LLVMGetUndef(gather_res_t);
      for (i = 0; i < length; ++i) {
         LLVMValueRef index = lp_build_const_int32(gallivm, i);
         elems[i] = lp_build_gather_elem_vec(gallivm, length,
                                             src_width, src_type, fetch_dst_type,
                                             aligned, base_ptr, offsets, i,
                                             vector_justify);
         if (!vec_fetch) {
            res = LLVMBuildInsertElement(gallivm->builder, res, elems[i], index, "");
         }
      }
      if (vec_zext) {
         res = LLVMBuildZExt(gallivm->builder, res, res_t, "");
         if (vector_justify) {
#ifdef PIPE_ARCH_BIG_ENDIAN
            unsigned sv = dst_type.width - src_width;
            res = LLVMBuildShl(gallivm->builder, res,
                               lp_build_const_int_vec(gallivm, res_type, sv), "");
#endif
         }
      }
      if (vec_fetch) {
         /*
          * Do bitcast now otherwise llvm might get some funny ideas wrt
          * float/int types...
          */
         for (i = 0; i < length; i++) {
            elems[i] = LLVMBuildBitCast(gallivm->builder, elems[i],
                                        lp_build_vec_type(gallivm, dst_type), "");
         }
         res = lp_build_concat(gallivm, elems, dst_type, length);
      } else {
         struct lp_type really_final_type = dst_type;
         assert(res_type.length * res_type.width ==
                dst_type.length * dst_type.width * length);
         really_final_type.length *= length;
         res = LLVMBuildBitCast(gallivm->builder, res,
                                lp_build_vec_type(gallivm, really_final_type), "");
      }
   }

   return res;
}
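A minimal scalar model of what the gather computes, assuming 32bit fetches
with byte offsets relative to the i8 base pointer; gather_ref and the toy
texture are illustrative only:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Reference model of the gather: fetch one element from
 * base_ptr + offsets[i] for each lane (offsets are in bytes, as the i8
 * base pointer in the real function suggests). */
static void
gather_ref(unsigned length, const uint8_t *base_ptr,
           const uint32_t *offsets, uint32_t *res)
{
   unsigned i;
   for (i = 0; i < length; ++i)
      memcpy(&res[i], base_ptr + offsets[i], sizeof(uint32_t));
}

int main(void)
{
   /* A tiny 4x4 "texture" of 32bit texels. */
   uint32_t texels[16];
   const uint32_t offsets[4] = { 0, 20, 40, 60 };   /* scattered byte offsets */
   uint32_t res[4];
   unsigned i;

   for (i = 0; i < 16; ++i)
      texels[i] = 0x100 + i;

   gather_ref(4, (const uint8_t *)texels, offsets, res);
   for (i = 0; i < 4; ++i)
      printf("%#x ", (unsigned)res[i]);   /* 0x100 0x105 0x10a 0x10f */
   printf("\n");
   return 0;
}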
Example #7
/**
 * Call intrinsic with arguments adapted to intrinsic vector length.
 *
 * Split vectors which are too large for the hw, or expand them if they
 * are too small, so that a caller invoking a function which might use
 * intrinsics doesn't need to do the splitting/expansion on its own.
 * This only supports intrinsics where src and dst types match.
 */
LLVMValueRef
lp_build_intrinsic_binary_anylength(struct gallivm_state *gallivm,
                                    const char *name,
                                    struct lp_type src_type,
                                    unsigned intr_size,
                                    LLVMValueRef a,
                                    LLVMValueRef b)
{
   unsigned i;
   struct lp_type intrin_type = src_type;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
   LLVMValueRef anative, bnative;
   unsigned intrin_length = intr_size / src_type.width;

   intrin_type.length = intrin_length;

   if (intrin_length > src_type.length) {
      LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
      LLVMValueRef constvec, tmp;

      for (i = 0; i < src_type.length; i++) {
         elems[i] = lp_build_const_int32(gallivm, i);
      }
      for (; i < intrin_length; i++) {
         elems[i] = i32undef;
      }
      if (src_type.length == 1) {
         LLVMTypeRef elem_type = lp_build_elem_type(gallivm, intrin_type);
         a = LLVMBuildBitCast(builder, a, LLVMVectorType(elem_type, 1), "");
         b = LLVMBuildBitCast(builder, b, LLVMVectorType(elem_type, 1), "");
      }
      constvec = LLVMConstVector(elems, intrin_length);
      anative = LLVMBuildShuffleVector(builder, a, a, constvec, "");
      bnative = LLVMBuildShuffleVector(builder, b, b, constvec, "");
      tmp = lp_build_intrinsic_binary(builder, name,
                                      lp_build_vec_type(gallivm, intrin_type),
                                      anative, bnative);
      if (src_type.length > 1) {
         constvec = LLVMConstVector(elems, src_type.length);
         return LLVMBuildShuffleVector(builder, tmp, tmp, constvec, "");
      }
      else {
         return LLVMBuildExtractElement(builder, tmp, elems[0], "");
      }
   }
   else if (intrin_length < src_type.length) {
      unsigned num_vec = src_type.length / intrin_length;
      LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];

      /* don't support arbitrary size here as this is so yuck */
      if (src_type.length % intrin_length) {
         /* FIXME: This is something which should be supported
          * but there doesn't seem to be any need for it currently
          * so crash and burn.
          */
         debug_printf("%s: should handle arbitrary vector size\n",
                      __FUNCTION__);
         assert(0);
         return NULL;
      }

      for (i = 0; i < num_vec; i++) {
         anative = lp_build_extract_range(gallivm, a, i*intrin_length,
                                          intrin_length);
         bnative = lp_build_extract_range(gallivm, b, i*intrin_length,
                                          intrin_length);
         tmp[i] = lp_build_intrinsic_binary(builder, name,
                                            lp_build_vec_type(gallivm, intrin_type),
                                            anative, bnative);
      }
      return lp_build_concat(gallivm, tmp, intrin_type, num_vec);
   }
   else {
      return lp_build_intrinsic_binary(builder, name,
                                       lp_build_vec_type(gallivm, src_type),
                                       a, b);
   }
}
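A minimal scalar model of the split path, assuming the native intrinsic width
evenly divides the source length; intrin_add4 stands in for the hw intrinsic
and, like binary_anylength_ref, is a made-up name for this sketch:

#include <stdio.h>

#define INTRIN_LEN 4   /* native width the "intrinsic" handles */

/* Toy stand-in for a fixed-width binary intrinsic. */
static void
intrin_add4(const int *a, const int *b, int *dst)
{
   unsigned i;
   for (i = 0; i < INTRIN_LEN; ++i)
      dst[i] = a[i] + b[i];
}

/* Mirror the split path: chop a long vector into INTRIN_LEN pieces,
 * run the fixed-width op on each, and concatenate the results. */
static void
binary_anylength_ref(const int *a, const int *b, int *dst, unsigned length)
{
   unsigned i;
   for (i = 0; i < length / INTRIN_LEN; ++i)
      intrin_add4(a + i * INTRIN_LEN, b + i * INTRIN_LEN,
                  dst + i * INTRIN_LEN);
}

int main(void)
{
   const int a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
   const int b[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };
   int dst[8];
   unsigned i;

   binary_anylength_ref(a, b, dst, 8);   /* two 4-wide "intrinsic" calls */
   for (i = 0; i < 8; ++i)
      printf("%d ", dst[i]);
   printf("\n");
   return 0;
}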