/**
 * Interleave vector elements.
 *
 * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions
 * (but not for 256bit AVX vectors).
 */
LLVMValueRef
lp_build_interleave2(struct gallivm_state *gallivm,
                     struct lp_type type,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     unsigned lo_hi)
{
   LLVMValueRef shuffle;

   if (type.length == 2 && type.width == 128 && util_cpu_caps.has_avx) {
      /*
       * XXX: This is a workaround for llvm code generation deficiency. Strangely
       * enough, while this needs vinsertf128/vextractf128 instructions (hence
       * a natural match when using 2x128bit vectors) the "normal" unpack shuffle
       * generates code ranging from atrocious (llvm 3.1) to terrible (llvm 3.2, 3.3).
       * So use some different shuffles instead (the exact shuffles don't seem to
       * matter, as long as not using 128bit wide vectors, works with 8x32 or 4x64).
       */
      struct lp_type tmp_type = type;
      LLVMValueRef srchalf[2], tmpdst;
      tmp_type.length = 4;
      tmp_type.width = 64;
      a = LLVMBuildBitCast(gallivm->builder, a, lp_build_vec_type(gallivm, tmp_type), "");
      b = LLVMBuildBitCast(gallivm->builder, b, lp_build_vec_type(gallivm, tmp_type), "");
      srchalf[0] = lp_build_extract_range(gallivm, a, lo_hi * 2, 2);
      srchalf[1] = lp_build_extract_range(gallivm, b, lo_hi * 2, 2);
      tmp_type.length = 2;
      tmpdst = lp_build_concat(gallivm, srchalf, tmp_type, 2);
      return LLVMBuildBitCast(gallivm->builder, tmpdst, lp_build_vec_type(gallivm, type), "");
   }

   shuffle = lp_build_const_unpack_shuffle(gallivm, type.length, lo_hi);

   return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
}
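/*
 * Hypothetical usage sketch (not part of the original code; assumes the
 * two-argument lp_type_float_vec(width, total_width) helper): interleaving
 * two 4 x float32 vectors a = a0 a1 a2 a3 and b = b0 b1 b2 b3.
 * lo_hi = 0 yields a0 b0 a1 b1, lo_hi = 1 yields a2 b2 a3 b3, matching
 * PUNPCKLDQ/PUNPCKHDQ behavior on same-width integer data.
 */
#if 0
   struct lp_type f32x4 = lp_type_float_vec(32, 128);  /* 4 x f32 */
   LLVMValueRef lo = lp_build_interleave2(gallivm, f32x4, a, b, 0);
   LLVMValueRef hi = lp_build_interleave2(gallivm, f32x4, a, b, 1);
#endif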
/**
 * Truncate or expand the bitwidth.
 *
 * NOTE: Getting the right sign flags is crucial here, as we employ some
 * intrinsics that do saturation.
 */
void
lp_build_resize(struct gallivm_state *gallivm,
                struct lp_type src_type,
                struct lp_type dst_type,
                const LLVMValueRef *src, unsigned num_srcs,
                LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   /*
    * We don't support float <-> int conversion here. That must be done
    * before/after calling this function.
    */
   assert(src_type.floating == dst_type.floating);

   /*
    * We don't support double <-> float conversion yet, although it could be
    * added with little effort.
    */
   assert((!src_type.floating && !dst_type.floating) ||
          src_type.width == dst_type.width);

   /* We must not lose or gain channels, only precision. */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   /* We don't support M:N conversion, only 1:N, M:1, or 1:1. */
   assert(num_srcs == 1 || num_dsts == 1);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   if (src_type.width > dst_type.width) {
      /*
       * Truncate bit width.
       */

      assert(num_dsts == 1);

      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
         /*
          * Register width remains constant -- use vector packing intrinsics
          */
         tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
      }
      else {
         if (src_type.width / dst_type.width > num_srcs) {
            /*
             * First change src vectors size (with shuffle) so they have the
             * same size as the destination vector, then pack normally.
             * Note: cannot use cast/extract because llvm generates atrocious code.
             */
            unsigned size_ratio = (src_type.width * src_type.length) /
                                  (dst_type.length * dst_type.width);
            unsigned new_length = src_type.length / size_ratio;

            for (i = 0; i < size_ratio * num_srcs; i++) {
               unsigned start_index = (i % size_ratio) * new_length;
               tmp[i] = lp_build_extract_range(gallivm, src[i / size_ratio],
                                               start_index, new_length);
            }
            num_srcs *= size_ratio;
            src_type.length = new_length;
            tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, tmp, num_srcs);
         }
         else {
            /*
             * Truncate bit width but expand vector size - first pack
             * then expand simply because this should be more AVX-friendly
             * for the cases we probably hit.
             */
            unsigned size_ratio = (dst_type.width * dst_type.length) /
                                  (src_type.length * src_type.width);
            unsigned num_pack_srcs = num_srcs / size_ratio;
            dst_type.length = dst_type.length / size_ratio;

            for (i = 0; i < size_ratio; i++) {
               tmp[i] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
                                      &src[i*num_pack_srcs], num_pack_srcs);
            }
            tmp[0] = lp_build_concat(gallivm, tmp, dst_type, size_ratio);
         }
      }
   }
   else if (src_type.width < dst_type.width) {
      /*
       * Expand bit width.
       */

      assert(num_srcs == 1);

      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
         /*
          * Register width remains constant -- use vector unpack intrinsics
          */
         lp_build_unpack(gallivm, src_type, dst_type, src[0], tmp, num_dsts);
      }
      else {
         /*
          * Do it element-wise.
          */
         assert(src_type.length * num_srcs == dst_type.length * num_dsts);

         for (i = 0; i < num_dsts; i++) {
            tmp[i] = lp_build_undef(gallivm, dst_type);
         }

         for (i = 0; i < src_type.length; ++i) {
            unsigned j = i / dst_type.length;
            LLVMValueRef srcindex = lp_build_const_int32(gallivm, i);
            LLVMValueRef dstindex = lp_build_const_int32(gallivm, i % dst_type.length);
            LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], srcindex, "");

            if (src_type.sign && dst_type.sign) {
               val = LLVMBuildSExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
            } else {
               val = LLVMBuildZExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
            }
            tmp[j] = LLVMBuildInsertElement(builder, tmp[j], val, dstindex, "");
         }
      }
   }
   else {
      /*
       * No-op
       */

      assert(num_srcs == 1);
      assert(num_dsts == 1);

      tmp[0] = src[0];
   }

   for (i = 0; i < num_dsts; ++i)
      dst[i] = tmp[i];
}
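/*
 * Hypothetical usage sketch (not part of the original code; a and b are
 * assumed values of the stated types): truncating two 8 x int16 vectors
 * into one 16 x int8 vector. Since the register width stays at 128 bits,
 * this takes the "use vector packing intrinsics" path above.
 */
#if 0
   struct lp_type i16x8 = lp_type_int_vec(16, 128);   /* 8 x i16 */
   struct lp_type i8x16 = lp_type_int_vec(8, 128);    /* 16 x i8 */
   LLVMValueRef srcs[2] = { a, b };
   LLVMValueRef res;
   lp_build_resize(gallivm, i16x8, i8x16, srcs, 2, &res, 1);
#endif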
/**
 * Non-interleaved pack.
 *
 * This will move values as
 *          (LSB)                    (MSB)
 *   lo =   l0 __ l1 __ l2 __ .. __ ln __
 *   hi =   h0 __ h1 __ h2 __ .. __ hn __
 *   res =  l0 l1 l2 .. ln h0 h1 h2 .. hn
 *
 * This will only change the number of bits the values are represented in, not
 * the values themselves.
 *
 * It is assumed the values are already clamped into the destination type range.
 * Values outside that range will produce undefined results. Use
 * lp_build_packs2 instead.
 */
LLVMValueRef
lp_build_pack2(struct gallivm_state *gallivm,
               struct lp_type src_type,
               struct lp_type dst_type,
               LLVMValueRef lo,
               LLVMValueRef hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMValueRef shuffle;
   LLVMValueRef res = NULL;
   struct lp_type intr_type = dst_type;

#if HAVE_LLVM < 0x0207
   intr_type = src_type;
#endif

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   /* Check for special cases first */
   if (util_cpu_caps.has_sse2 && src_type.width * src_type.length >= 128) {
      const char *intrinsic = NULL;

      switch (src_type.width) {
      case 32:
         if (dst_type.sign) {
            intrinsic = "llvm.x86.sse2.packssdw.128";
         }
         else {
            if (util_cpu_caps.has_sse4_1) {
               intrinsic = "llvm.x86.sse41.packusdw";
#if HAVE_LLVM < 0x0207
               /* llvm < 2.7 has inconsistent signatures except for packusdw */
               intr_type = dst_type;
#endif
            }
         }
         break;
      case 16:
         if (dst_type.sign) {
            intrinsic = "llvm.x86.sse2.packsswb.128";
         }
         else {
            intrinsic = "llvm.x86.sse2.packuswb.128";
         }
         break;
      /* default uses generic shuffle below */
      }

      if (intrinsic) {
         if (src_type.width * src_type.length == 128) {
            LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
            res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi);
            if (dst_vec_type != intr_vec_type) {
               res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
            }
         }
         else {
            int num_split = src_type.width * src_type.length / 128;
            int i;
            int nlen = 128 / src_type.width;
            struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128);
            struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128);
            LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128];
            LLVMValueRef tmplo, tmphi;
            LLVMTypeRef ndst_vec_type = lp_build_vec_type(gallivm, ndst_type);
            LLVMTypeRef nintr_vec_type = lp_build_vec_type(gallivm, nintr_type);

            assert(num_split <= LP_MAX_VECTOR_WIDTH / 128);

            for (i = 0; i < num_split / 2; i++) {
               tmplo = lp_build_extract_range(gallivm, lo, i*nlen*2, nlen);
               tmphi = lp_build_extract_range(gallivm, lo, i*nlen*2 + nlen, nlen);
               tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic,
                                                     nintr_vec_type, tmplo, tmphi);
               if (ndst_vec_type != nintr_vec_type) {
                  tmpres[i] = LLVMBuildBitCast(builder, tmpres[i], ndst_vec_type, "");
               }
            }
            for (i = 0; i < num_split / 2; i++) {
               tmplo = lp_build_extract_range(gallivm, hi, i*nlen*2, nlen);
               tmphi = lp_build_extract_range(gallivm, hi, i*nlen*2 + nlen, nlen);
               tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic,
                                                                 nintr_vec_type,
                                                                 tmplo, tmphi);
               if (ndst_vec_type != nintr_vec_type) {
                  tmpres[i+num_split/2] = LLVMBuildBitCast(builder, tmpres[i+num_split/2],
                                                           ndst_vec_type, "");
               }
            }
            res = lp_build_concat(gallivm, tmpres, ndst_type, num_split);
         }
         return res;
      }
   }

   /* generic shuffle */
   lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
   hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");

   shuffle = lp_build_const_pack_shuffle(gallivm, dst_type.length);

   res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");

   return res;
}
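/*
 * Hypothetical usage sketch (not part of the original code): packing two
 * 4 x int32 vectors, whose values are already clamped to [-32768, 32767],
 * into one 8 x int16 vector. With SSE2 and signed destination this maps to
 * a single PACKSSDW via the intrinsic path above.
 */
#if 0
   struct lp_type i32x4 = lp_type_int_vec(32, 128);   /* 4 x i32 */
   struct lp_type i16x8 = lp_type_int_vec(16, 128);   /* 8 x i16 */
   LLVMValueRef res = lp_build_pack2(gallivm, i32x4, i16x8, lo, hi);
#endif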
/**
 * Generic type conversion.
 *
 * TODO: Take a precision argument, or even better, add a new precision member
 * to the lp_type union.
 */
void
lp_build_conv(struct gallivm_state *gallivm,
              struct lp_type src_type,
              struct lp_type dst_type,
              const LLVMValueRef *src, unsigned num_srcs,
              LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type tmp_type;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned num_tmps;
   unsigned i;

   /* We must not lose or gain channels, only precision. */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   tmp_type = src_type;
   for (i = 0; i < num_srcs; ++i) {
      assert(lp_check_value(src_type, src[i]));
      tmp[i] = src[i];
   }
   num_tmps = num_srcs;

   /* Special case 4x4f --> 1x16ub */
   if (src_type.floating == 1 &&
       src_type.fixed    == 0 &&
       src_type.sign     == 1 &&
       src_type.norm     == 0 &&
       src_type.width    == 32 &&
       src_type.length   == 4 &&

       dst_type.floating == 0 &&
       dst_type.fixed    == 0 &&
       dst_type.sign     == 0 &&
       dst_type.norm     == 1 &&
       dst_type.width    == 8 &&
       dst_type.length   == 16 &&

       4 * num_dsts      == num_srcs &&

       util_cpu_caps.has_sse2)
   {
      struct lp_build_context bld;
      struct lp_type int16_type = dst_type;
      struct lp_type int32_type = dst_type;
      LLVMValueRef const_255f;
      unsigned i, j;

      lp_build_context_init(&bld, gallivm, src_type);

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);

      for (i = 0; i < num_dsts; ++i, src += 4) {
         LLVMValueRef lo, hi;

         for (j = 0; j < 4; ++j) {
            tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, "");
            tmp[j] = lp_build_iround(&bld, tmp[j]);
         }

         /* relying on clamping behavior of sse2 intrinsics here */
         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
         hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
      }

      return;
   }

   /* Special case 2x8f --> 1x16ub */
   else if (src_type.floating == 1 &&
            src_type.fixed    == 0 &&
            src_type.sign     == 1 &&
            src_type.norm     == 0 &&
            src_type.width    == 32 &&
            src_type.length   == 8 &&

            dst_type.floating == 0 &&
            dst_type.fixed    == 0 &&
            dst_type.sign     == 0 &&
            dst_type.norm     == 1 &&
            dst_type.width    == 8 &&
            dst_type.length   == 16 &&

            2 * num_dsts      == num_srcs &&

            util_cpu_caps.has_avx)
   {
      struct lp_build_context bld;
      struct lp_type int16_type = dst_type;
      struct lp_type int32_type = dst_type;
      LLVMValueRef const_255f;
      unsigned i;

      lp_build_context_init(&bld, gallivm, src_type);

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);

      for (i = 0; i < num_dsts; ++i, src += 2) {
         LLVMValueRef lo, hi, a, b;

         a = LLVMBuildFMul(builder, src[0], const_255f, "");
         b = LLVMBuildFMul(builder, src[1], const_255f, "");
         a = lp_build_iround(&bld, a);
         b = lp_build_iround(&bld, b);
         tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
         tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
         tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
         tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);

         /* relying on clamping behavior of sse2 intrinsics here */
         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
         hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
      }
      return;
   }

   /* Pre convert half-floats to floats */
   else if (src_type.floating && src_type.width == 16)
   {
      for (i = 0; i < num_tmps; ++i)
         tmp[i] = lp_build_half_to_float(gallivm, src_type, tmp[i]);

      tmp_type.width = 32;
   }

   /*
    * Clamp if necessary
    */

   if (memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
      struct lp_build_context bld;
      double src_min = lp_const_min(src_type);
      double dst_min = lp_const_min(dst_type);
      double src_max = lp_const_max(src_type);
      double dst_max = lp_const_max(dst_type);
      LLVMValueRef thres;

      lp_build_context_init(&bld, gallivm, tmp_type);

      if (src_min < dst_min) {
         if (dst_min == 0.0)
            thres = bld.zero;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_min);
         for (i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_max(&bld, tmp[i], thres);
      }

      if (src_max > dst_max) {
         if (dst_max == 1.0)
            thres = bld.one;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_max);
         for (i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_min(&bld, tmp[i], thres);
      }
   }

   /*
    * Scale to the narrowest range
    */

   if (dst_type.floating) {
      /* Nothing to do */
   }
   else if (tmp_type.floating) {
      if (!dst_type.fixed && !dst_type.sign && dst_type.norm) {
         for (i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
                                                             tmp_type,
                                                             dst_type.width,
                                                             tmp[i]);
         }
         tmp_type.floating = FALSE;
      }
      else {
         double dst_scale = lp_const_scale(dst_type);
         LLVMTypeRef tmp_vec_type;

         if (dst_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
            for (i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }

         /* Use an equally sized integer for intermediate computations */
         tmp_type.floating = FALSE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for (i = 0; i < num_tmps; ++i) {
#if 0
            if (dst_type.sign)
               tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
            tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
#endif
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);

      /* Compensate for different offsets */
      if (dst_offset > src_offset && src_type.width > dst_type.width) {
         for (i = 0; i < num_tmps; ++i) {
            LLVMValueRef shifted;
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, src_shift - 1);
            if (src_type.sign)
               shifted = LLVMBuildAShr(builder, tmp[i], shift, "");
            else
               shifted = LLVMBuildLShr(builder, tmp[i], shift, "");

            tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
         }
      }

      if (src_shift > dst_shift) {
         LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type,
                                                     src_shift - dst_shift);
         for (i = 0; i < num_tmps; ++i)
            if (src_type.sign)
               tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
            else
               tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, "");
      }
   }

   /*
    * Truncate or expand bit width
    *
    * No data conversion should happen here, although the sign bits are
    * crucial to avoid bad clamping.
    */

   {
      struct lp_type new_type;

      new_type = tmp_type;
      new_type.sign   = dst_type.sign;
      new_type.width  = dst_type.width;
      new_type.length = dst_type.length;

      lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);

      tmp_type = new_type;
      num_tmps = num_dsts;
   }

   /*
    * Scale to the widest range
    */

   if (src_type.floating) {
      /* Nothing to do */
   }
   else if (!src_type.floating && dst_type.floating) {
      if (!src_type.fixed && !src_type.sign && src_type.norm) {
         for (i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_unsigned_norm_to_float(gallivm,
                                                     src_type.width,
                                                     dst_type,
                                                     tmp[i]);
         }
         tmp_type.floating = TRUE;
      }
      else {
         double src_scale = lp_const_scale(src_type);
         LLVMTypeRef tmp_vec_type;

         /* Use an equally sized float for intermediate computations */
         tmp_type.floating = TRUE;
         tmp_type.sign = TRUE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for (i = 0; i < num_tmps; ++i) {
#if 0
            if (dst_type.sign)
               tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
            tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
#endif
         }

         if (src_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale);
            for (i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);

      if (src_shift < dst_shift) {
         LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];
         LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type,
                                                     dst_shift - src_shift);

         for (i = 0; i < num_tmps; ++i) {
            pre_shift[i] = tmp[i];
            tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
         }

         /* Compensate for different offsets */
         if (dst_offset > src_offset) {
            for (i = 0; i < num_tmps; ++i) {
               tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], "");
            }
         }
      }
   }

   for (i = 0; i < num_dsts; ++i) {
      dst[i] = tmp[i];
      assert(lp_check_value(dst_type, dst[i]));
   }
}
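/*
 * Hypothetical usage sketch (not part of the original code; rgba is an
 * assumed array of four <4 x float> color values in [0,1], and the
 * two-argument lp_type_* helpers are assumed): converting 4x4f to 1x16ub,
 * which hits the SSE2 fast path at the top of lp_build_conv.
 */
#if 0
   struct lp_type f32x4 = lp_type_float_vec(32, 128);  /* 4 x f32 */
   struct lp_type u8x16 = lp_type_unorm(8, 128);       /* 16 x unorm8 */
   LLVMValueRef res;
   lp_build_conv(gallivm, f32x4, u8x16, rgba, 4, &res, 1);
#endif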
/**
 * Store depth/stencil values.
 * Incoming values are swizzled (typically n 2x2 quads), stored linearly.
 * If there's a mask it will do select/store, otherwise just store.
 *
 * \param z_src_type  the data type of the fragment depth/stencil values
 * \param format_desc  description of the depth/stencil surface
 * \param mask  the alive/dead pixel mask for the quad (vector)
 * \param z_fb  z values read from fb (with padding)
 * \param s_fb  s values read from fb (with padding)
 * \param loop_counter  the current loop iteration
 * \param depth_ptr  pointer to the depth/stencil values of this 4x4 block
 * \param depth_stride  stride of the depth/stencil buffer
 * \param z_value  the depth values to store (with padding)
 * \param s_value  the stencil values to store (with padding)
 */
void
lp_build_depth_stencil_write_swizzled(struct gallivm_state *gallivm,
                                      struct lp_type z_src_type,
                                      const struct util_format_description *format_desc,
                                      struct lp_build_mask_context *mask,
                                      LLVMValueRef z_fb,
                                      LLVMValueRef s_fb,
                                      LLVMValueRef loop_counter,
                                      LLVMValueRef depth_ptr,
                                      LLVMValueRef depth_stride,
                                      LLVMValueRef z_value,
                                      LLVMValueRef s_value)
{
   struct lp_build_context z_bld;
   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4];
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef mask_value = NULL;
   LLVMValueRef zs_dst1, zs_dst2;
   LLVMValueRef zs_dst_ptr1, zs_dst_ptr2;
   LLVMValueRef depth_offset1, depth_offset2;
   LLVMTypeRef load_ptr_type;
   unsigned depth_bytes = format_desc->block.bits / 8;
   struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length);
   struct lp_type z_type = zs_type;
   struct lp_type zs_load_type = zs_type;

   zs_load_type.length = zs_load_type.length / 2;
   load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0);

   z_type.width = z_src_type.width;

   lp_build_context_init(&z_bld, gallivm, z_type);

   /*
    * This is far from ideal, at least for late depth write we should do this
    * outside the fs loop to avoid all the swizzle stuff.
    */
   if (z_src_type.length == 4) {
      LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter,
                                          lp_build_const_int32(gallivm, 1), "");
      LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter,
                                          lp_build_const_int32(gallivm, 2), "");
      LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb,
                                          depth_stride, "");
      depth_offset1 = LLVMBuildMul(builder, looplsb,
                                   lp_build_const_int32(gallivm, depth_bytes * 2), "");
      depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, "");
   }
   else {
      unsigned i;
      LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter,
                                         lp_build_const_int32(gallivm, 1), "");
      assert(z_src_type.length == 8);
      depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, "");
      /*
       * We load 2x4 values, and need to swizzle them (order
       * 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately.
       */
      for (i = 0; i < 8; i++) {
         shuffles[i] = lp_build_const_int32(gallivm,
                                            (i&1) + (i&2) * 2 + (i&4) / 2);
      }
   }

   depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, "");

   zs_dst_ptr1 = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, "");
   zs_dst_ptr1 = LLVMBuildBitCast(builder, zs_dst_ptr1, load_ptr_type, "");
   zs_dst_ptr2 = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, "");
   zs_dst_ptr2 = LLVMBuildBitCast(builder, zs_dst_ptr2, load_ptr_type, "");

   if (format_desc->block.bits > 32) {
      s_value = LLVMBuildBitCast(builder, s_value, z_bld.vec_type, "");
   }

   if (mask) {
      mask_value = lp_build_mask_value(mask);
      z_value = lp_build_select(&z_bld, mask_value, z_value, z_fb);
      if (format_desc->block.bits > 32) {
         s_fb = LLVMBuildBitCast(builder, s_fb, z_bld.vec_type, "");
         s_value = lp_build_select(&z_bld, mask_value, s_value, s_fb);
      }
   }

   if (zs_type.width < z_src_type.width) {
      /* Truncate ZS values (e.g., when writing to Z16_UNORM) */
      z_value = LLVMBuildTrunc(builder, z_value,
                               lp_build_int_vec_type(gallivm, zs_type), "");
   }

   if (format_desc->block.bits <= 32) {
      if (z_src_type.length == 4) {
         zs_dst1 = lp_build_extract_range(gallivm, z_value, 0, 2);
         zs_dst2 = lp_build_extract_range(gallivm, z_value, 2, 2);
      }
      else {
         assert(z_src_type.length == 8);
         zs_dst1 = LLVMBuildShuffleVector(builder, z_value, z_value,
                                          LLVMConstVector(&shuffles[0],
                                                          zs_load_type.length), "");
         zs_dst2 = LLVMBuildShuffleVector(builder, z_value, z_value,
                                          LLVMConstVector(&shuffles[4],
                                                          zs_load_type.length), "");
      }
   }
   else {
      if (z_src_type.length == 4) {
         zs_dst1 = lp_build_interleave2(gallivm, z_type, z_value, s_value, 0);
         zs_dst2 = lp_build_interleave2(gallivm, z_type, z_value, s_value, 1);
      }
      else {
         unsigned i;
         LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 2];
         assert(z_src_type.length == 8);
         for (i = 0; i < 8; i++) {
            shuffles[i*2] = lp_build_const_int32(gallivm,
                                                 (i&1) + (i&2) * 2 + (i&4) / 2);
            shuffles[i*2+1] = lp_build_const_int32(gallivm,
                                                   (i&1) + (i&2) * 2 + (i&4) / 2 +
                                                   z_src_type.length);
         }
         zs_dst1 = LLVMBuildShuffleVector(builder, z_value, s_value,
                                          LLVMConstVector(&shuffles[0],
                                                          z_src_type.length), "");
         zs_dst2 = LLVMBuildShuffleVector(builder, z_value, s_value,
                                          LLVMConstVector(&shuffles[8],
                                                          z_src_type.length), "");
      }

      zs_dst1 = LLVMBuildBitCast(builder, zs_dst1,
                                 lp_build_vec_type(gallivm, zs_load_type), "");
      zs_dst2 = LLVMBuildBitCast(builder, zs_dst2,
                                 lp_build_vec_type(gallivm, zs_load_type), "");
   }

   LLVMBuildStore(builder, zs_dst1, zs_dst_ptr1);
   LLVMBuildStore(builder, zs_dst2, zs_dst_ptr2);
}
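/*
 * Worked example (added for illustration, not part of the original code):
 * the shuffle index expression (i&1) + (i&2) * 2 + (i&4) / 2 used above maps
 *   i = 0 1 2 3 4 5 6 7  ->  0 1 4 5 2 3 6 7,
 * i.e. it exchanges the second and third 2-element pairs, turning the
 * swizzled 2x2 quad layout of an 8-wide vector back into linear
 * framebuffer order.
 */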
/**
 * Converts float32 to int16 half-float.
 * Note this can be performed in 1 instruction if vcvtps2ph exists (f16c/cvt16)
 * [llvm.x86.vcvtps2ph / _mm_cvtps_ph]
 *
 * @param src  value to convert
 *
 * Convert float32 to half floats, preserving Infs and NaNs,
 * with rounding towards zero (trunc).
 */
LLVMValueRef
lp_build_float_to_half(struct gallivm_state *gallivm,
                       LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef f32_vec_type = LLVMTypeOf(src);
   unsigned length = LLVMGetTypeKind(f32_vec_type) == LLVMVectorTypeKind
                   ? LLVMGetVectorSize(f32_vec_type) : 1;
   struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
   struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
   LLVMValueRef result;

   if (util_cpu_caps.has_f16c && HAVE_LLVM >= 0x0301 &&
       (length == 4 || length == 8)) {
      struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
      unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      const char *intrinsic = NULL;
      if (length == 4) {
         intrinsic = "llvm.x86.vcvtps2ph.128";
      }
      else {
         intrinsic = "llvm.x86.vcvtps2ph.256";
      }
      result = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, i168_type),
                                         src, LLVMConstInt(i32t, mode, 0));
      if (length == 4) {
         result = lp_build_extract_range(gallivm, result, 0, 4);
      }
   }

   else {
      result = lp_build_float_to_smallfloat(gallivm, i32_type, src, 10, 5, 0, true);
      /* Convert int32 vector to int16 vector by trunc (might generate bad code) */
      result = LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i16_type), "");
   }

   /*
    * Debugging code.
    */
   if (0) {
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      LLVMTypeRef i16t = LLVMInt16TypeInContext(gallivm->context);
      LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
      LLVMValueRef ref_result = LLVMGetUndef(LLVMVectorType(i16t, length));
      unsigned i;

      LLVMTypeRef func_type = LLVMFunctionType(i16t, &f32t, 1, 0);
      LLVMValueRef func = lp_build_const_int_pointer(gallivm,
                             func_to_pointer((func_pointer)util_float_to_half));
      func = LLVMBuildBitCast(builder, func, LLVMPointerType(func_type, 0),
                              "util_float_to_half");

      for (i = 0; i < length; ++i) {
         LLVMValueRef index = LLVMConstInt(i32t, i, 0);
         LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, "");
#if 0
         /* XXX: not really supported by backends */
         LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32);
#else
         LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");
#endif
         ref_result = LLVMBuildInsertElement(builder, ref_result, f16, index, "");
      }

      lp_build_print_value(gallivm, "src  = ", src);
      lp_build_print_value(gallivm, "llvm = ", result);
      lp_build_print_value(gallivm, "util = ", ref_result);
      lp_build_printf(gallivm, "\n");
   }

   return result;
}
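/*
 * Hypothetical usage sketch (not part of the original code; f32_vec is an
 * assumed <8 x float> value): converting to 8 half floats stored as int16.
 * On an AVX machine with F16C this becomes a single vcvtps2ph, otherwise the
 * lp_build_float_to_smallfloat fallback path is used.
 */
#if 0
   LLVMValueRef f16s = lp_build_float_to_half(gallivm, f32_vec);
#endif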
/**
 * Call intrinsic with arguments adapted to intrinsic vector length.
 *
 * Split vectors which are too large for the hw, or expand them if they
 * are too small, so a caller calling a function which might use intrinsics
 * doesn't need to do splitting/expansion on its own.
 * This only supports intrinsics where src and dst types match.
 */
LLVMValueRef
lp_build_intrinsic_binary_anylength(struct gallivm_state *gallivm,
                                    const char *name,
                                    struct lp_type src_type,
                                    unsigned intr_size,
                                    LLVMValueRef a,
                                    LLVMValueRef b)
{
   unsigned i;
   struct lp_type intrin_type = src_type;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
   LLVMValueRef anative, bnative;
   unsigned intrin_length = intr_size / src_type.width;

   intrin_type.length = intrin_length;

   if (intrin_length > src_type.length) {
      LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
      LLVMValueRef constvec, tmp;

      for (i = 0; i < src_type.length; i++) {
         elems[i] = lp_build_const_int32(gallivm, i);
      }
      for (; i < intrin_length; i++) {
         elems[i] = i32undef;
      }
      if (src_type.length == 1) {
         LLVMTypeRef elem_type = lp_build_elem_type(gallivm, intrin_type);
         a = LLVMBuildBitCast(builder, a, LLVMVectorType(elem_type, 1), "");
         b = LLVMBuildBitCast(builder, b, LLVMVectorType(elem_type, 1), "");
      }
      constvec = LLVMConstVector(elems, intrin_length);
      anative = LLVMBuildShuffleVector(builder, a, a, constvec, "");
      bnative = LLVMBuildShuffleVector(builder, b, b, constvec, "");

      tmp = lp_build_intrinsic_binary(builder, name,
                                      lp_build_vec_type(gallivm, intrin_type),
                                      anative, bnative);

      if (src_type.length > 1) {
         constvec = LLVMConstVector(elems, src_type.length);
         return LLVMBuildShuffleVector(builder, tmp, tmp, constvec, "");
      }
      else {
         return LLVMBuildExtractElement(builder, tmp, elems[0], "");
      }
   }
   else if (intrin_length < src_type.length) {
      unsigned num_vec = src_type.length / intrin_length;
      LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];

      /* don't support arbitrary size here as this is so yuck */
      if (src_type.length % intrin_length) {
         /* FIXME: This is something which should be supported
          * but there doesn't seem to be any need for it currently
          * so crash and burn.
          */
         debug_printf("%s: should handle arbitrary vector size\n",
                      __FUNCTION__);
         assert(0);
         return NULL;
      }

      for (i = 0; i < num_vec; i++) {
         anative = lp_build_extract_range(gallivm, a, i*intrin_length,
                                          intrin_length);
         bnative = lp_build_extract_range(gallivm, b, i*intrin_length,
                                          intrin_length);
         tmp[i] = lp_build_intrinsic_binary(builder, name,
                                            lp_build_vec_type(gallivm, intrin_type),
                                            anative, bnative);
      }
      return lp_build_concat(gallivm, tmp, intrin_type, num_vec);
   }
   else {
      return lp_build_intrinsic_binary(builder, name,
                                       lp_build_vec_type(gallivm, src_type),
                                       a, b);
   }
}
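/*
 * Hypothetical usage sketch (not part of the original code; the intrinsic
 * name is an assumption based on LLVM versions of this era): applying a
 * 128-bit wide intrinsic such as PMINSW to a 16 x int16 (256-bit) vector.
 * The helper splits the operands in two, calls the intrinsic twice, and
 * concatenates the results.
 */
#if 0
   struct lp_type i16x16 = lp_type_int_vec(16, 256);   /* 16 x i16 */
   LLVMValueRef res = lp_build_intrinsic_binary_anylength(gallivm,
                                                          "llvm.x86.sse2.pmins.w",
                                                          i16x16, 128, a, b);
#endif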