assert(0); return LLVMGetUndef(dst_vec_type); break; } if (res) { res = LLVMBuildBitCast(builder, res, dst_vec_type, ""); return res; } } /* generic shuffle */ lo = LLVMBuildBitCast(builder, lo, dst_vec_type, ""); hi = LLVMBuildBitCast(builder, hi, dst_vec_type, ""); shuffle = lp_build_const_pack_shuffle(gallivm, dst_type.length); res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, ""); return res; } /** * Non-interleaved pack and saturate. * * Same as lp_build_pack2 but will saturate values so that they fit into the * destination type. */ LLVMValueRef
/**
 * Non-interleaved pack.
 *
 * Narrows two vectors of wide integers into one vector that holds twice as
 * many elements of half the width, laid out as
 *
 *          (LSB)                     (MSB)
 *   lo =   l0 __ l1 __ l2 __..  __ ln __
 *   hi =   h0 __ h1 __ h2 __..  __ hn __
 *   res =  l0 l1 l2 .. ln h0 h1 h2 .. hn
 *
 * Only the number of bits used to represent each value changes; the values
 * themselves are kept.
 *
 * The caller must already have clamped the values to the range of the
 * destination type; anything outside that range gives undefined results.
 * Use lp_build_packs2 when the inputs are not known to be in range.
 *
 * @param gallivm   code generation state; its builder receives the emitted
 *                  instructions
 * @param src_type  type of lo and hi (integer, twice the width of dst_type)
 * @param dst_type  type of the result (integer, twice the length of src_type)
 * @param lo        source vector providing the low half of the result
 * @param hi        source vector providing the high half of the result
 * @return          the packed vector, of the vector type built from dst_type
 */
LLVMValueRef
lp_build_pack2(struct gallivm_state *gallivm,
               struct lp_type src_type,
               struct lp_type dst_type,
               LLVMValueRef lo,
               LLVMValueRef hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMValueRef shuffle;
   const char *intrinsic = NULL;
   struct lp_type intr_type;
   int total_bits;

   /* Type the pack intrinsic is declared with; differs for old LLVM. */
#if HAVE_LLVM < 0x0207
   intr_type = src_type;
#else
   intr_type = dst_type;
#endif

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   total_bits = src_type.width * src_type.length;

   /*
    * Special case: pick an x86 pack intrinsic when SSE2 is available and the
    * source vectors are at least 128 bits wide.  Any combination not handled
    * here leaves 'intrinsic' NULL and falls through to the generic shuffle.
    */
   if (util_cpu_caps.has_sse2 && total_bits >= 128) {
      if (src_type.width == 32) {
         if (dst_type.sign) {
            intrinsic = "llvm.x86.sse2.packssdw.128";
         }
         else if (util_cpu_caps.has_sse4_1) {
            intrinsic = "llvm.x86.sse41.packusdw";
#if HAVE_LLVM < 0x0207
            /* llvm < 2.7 has inconsistent signatures except for packusdw */
            intr_type = dst_type;
#endif
         }
      }
      else if (src_type.width == 16) {
         intrinsic = dst_type.sign ? "llvm.x86.sse2.packsswb.128"
                                   : "llvm.x86.sse2.packuswb.128";
      }
   }

   if (intrinsic != NULL) {
      if (total_bits == 128) {
         /* Exactly one 128-bit register per input: a single intrinsic call. */
         LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
         LLVMValueRef packed = lp_build_intrinsic_binary(builder, intrinsic,
                                                         intr_vec_type,
                                                         lo, hi);

         if (intr_vec_type != dst_vec_type) {
            packed = LLVMBuildBitCast(builder, packed, dst_vec_type, "");
         }
         return packed;
      }
      else {
         /*
          * Wider than 128 bits: apply the intrinsic piecewise.  Adjacent
          * ranges of chunk_len elements are extracted from one input and
          * packed together; all results derived from lo are stored before
          * those derived from hi, then everything is concatenated.
          */
         struct lp_type chunk_dst_type = lp_type_unorm(dst_type.width, 128);
         struct lp_type chunk_intr_type = lp_type_unorm(intr_type.width, 128);
         LLVMTypeRef chunk_dst_vec_type =
            lp_build_vec_type(gallivm, chunk_dst_type);
         LLVMTypeRef chunk_intr_vec_type =
            lp_build_vec_type(gallivm, chunk_intr_type);
         LLVMValueRef packed[LP_MAX_VECTOR_WIDTH / 128];
         LLVMValueRef inputs[2];
         const int num_chunks = total_bits / 128;
         const int half = num_chunks / 2;
         const int chunk_len = 128 / src_type.width;
         int s, i;

         assert(num_chunks <= LP_MAX_VECTOR_WIDTH / 128);

         inputs[0] = lo;
         inputs[1] = hi;

         for (s = 0; s < 2; s++) {
            for (i = 0; i < half; i++) {
               const int base = i * chunk_len * 2;
               const int slot = s * half + i;
               LLVMValueRef first, second;

               first = lp_build_extract_range(gallivm, inputs[s],
                                              base, chunk_len);
               second = lp_build_extract_range(gallivm, inputs[s],
                                               base + chunk_len, chunk_len);
               packed[slot] = lp_build_intrinsic_binary(builder, intrinsic,
                                                        chunk_intr_vec_type,
                                                        first, second);
               if (chunk_dst_vec_type != chunk_intr_vec_type) {
                  packed[slot] = LLVMBuildBitCast(builder, packed[slot],
                                                  chunk_dst_vec_type, "");
               }
            }
         }

         return lp_build_concat(gallivm, packed, chunk_dst_type, num_chunks);
      }
   }

   /* generic shuffle */
   lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
   hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");

   shuffle = lp_build_const_pack_shuffle(gallivm, dst_type.length);

   return LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");
}