static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
{
   struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
   struct lp_build_context * base = &bld_base->base;
   unsigned i;

   /* Add the necessary export instructions */
   for (i = 0; i < ctx->output_reg_count; i++) {
      unsigned chan;
      for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
         LLVMValueRef output;
         LLVMValueRef store_output;
         unsigned adjusted_reg_idx = i + ctx->reserved_reg_count;
         LLVMValueRef reg_index = lp_build_const_int32(
               base->gallivm,
               radeon_llvm_reg_index_soa(adjusted_reg_idx, chan));

         output = LLVMBuildLoad(base->gallivm->builder,
               ctx->soa.outputs[i][chan], "");

         store_output = lp_build_intrinsic_binary(
               base->gallivm->builder,
               "llvm.AMDGPU.store.output",
               base->elem_type,
               output, reg_index);

         lp_build_intrinsic_unary(base->gallivm->builder,
               "llvm.AMDGPU.export.reg",
               LLVMVoidTypeInContext(base->gallivm->context),
               store_output);
      }
   }
}
static INLINE LLVMValueRef
lp_build_round_sse41(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_sse41_mode mode)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   const char *intrinsic;

   assert(type.floating);
   assert(type.width*type.length == 128);
   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse4_1);

   switch(type.width) {
   case 32:
      intrinsic = "llvm.x86.sse41.round.ps";
      break;
   case 64:
      intrinsic = "llvm.x86.sse41.round.pd";
      break;
   default:
      assert(0);
      return bld->undef;
   }

   return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
                                    LLVMConstInt(LLVMInt32Type(), mode, 0));
}
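/*
 * Usage sketch (not part of the original source): truncating a 4 x float32
 * vector with the helper above. The mode name assumes the usual SSE4.1
 * ROUNDPS immediate encoding (0 = nearest, 1 = floor, 2 = ceil,
 * 3 = truncate); check enum lp_build_round_sse41_mode in your tree before
 * relying on these values.
 */
static LLVMValueRef
example_trunc_ps(struct lp_build_context *bld, LLVMValueRef a)
{
   /* bld->type must describe a 128-bit float vector and SSE4.1 must be
    * present, or the asserts in lp_build_round_sse41 will fire. */
   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
}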
/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   if(a == bld->zero)
      return b;
   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      if(a == bld->one || b == bld->one)
         return bld->one;

      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      res = LLVMConstAdd(a, b);
   else
      res = LLVMBuildAdd(bld->builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}
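/*
 * Usage sketch (not part of the original source): example_add_unorm8 is a
 * hypothetical wrapper showing the saturating path in lp_build_add. With
 * bld->type set up as a 16 x unorm8 vector and SSE2 available, the
 * "llvm.x86.sse2.paddus.b" branch above is taken, so 200/255 + 100/255
 * clamps to 255/255 instead of wrapping.
 */
static LLVMValueRef
example_add_unorm8(struct lp_build_context *bld,
                   LLVMValueRef a, LLVMValueRef b)
{
   /* Expect a normalized, unsigned, 8-bit integer vector type. */
   assert(bld->type.norm && !bld->type.floating && bld->type.width == 8);
   return lp_build_add(bld, a, b);
}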
/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;
   if(a == b)
      return bld->zero;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      if(b == bld->one)
         return bld->zero;

      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      res = LLVMConstSub(a, b);
   else
      res = LLVMBuildSub(bld->builder, a, b, "");

   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero);

   return res;
}
/**
 * Generate min(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   LLVMValueRef cond;

   /* TODO: optimize the constant case */

   if(type.width * type.length == 128) {
      if(type.floating) {
         if(type.width == 32 && util_cpu_caps.has_sse)
            intrinsic = "llvm.x86.sse.min.ps";
         if(type.width == 64 && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.min.pd";
      }
      else {
         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pminu.b";
         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminsb";
         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminuw";
         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pmins.w";
         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminud";
         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminsd";
      }
   }

   if(intrinsic)
      return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);

   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
   return lp_build_select(bld, cond, a, b);
}
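/*
 * Usage sketch (not part of the original source): clamping a value to
 * [0, 1] with the simple min/max helpers. lp_build_max_simple is the
 * mirror of lp_build_min_simple (lp_build_sub above relies on it); like
 * the min helper, it does no special-casing of constant inputs.
 */
static LLVMValueRef
example_clamp01(struct lp_build_context *bld, LLVMValueRef a)
{
   a = lp_build_max_simple(bld, a, bld->zero);   /* a = max(a, 0) */
   return lp_build_min_simple(bld, a, bld->one); /* min(a, 1) */
}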
/**
 * Non-interleaved pack.
 *
 * This will move values as
 *
 *   lo =   __ l0 __ l1 __ l2 __..  __ ln
 *   hi =   __ h0 __ h1 __ h2 __..  __ hn
 *   res =  l0 l1 l2 .. ln h0 h1 h2 .. hn
 *
 * This will only change the number of bits the values are represented in,
 * not the values themselves.
 *
 * It is assumed the values are already clamped into the destination type
 * range. Values outside that range will produce undefined results. Use
 * lp_build_packs2 instead.
 */
LLVMValueRef
lp_build_pack2(struct gallivm_state *gallivm,
               struct lp_type src_type,
               struct lp_type dst_type,
               LLVMValueRef lo,
               LLVMValueRef hi)
{
   LLVMBuilderRef builder = gallivm->builder;
#if HAVE_LLVM < 0x0207
   LLVMTypeRef src_vec_type = lp_build_vec_type(gallivm, src_type);
#endif
   LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMValueRef shuffle;
   LLVMValueRef res = NULL;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   /* Check for special cases first */
   if(util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128) {
      switch(src_type.width) {
      case 32:
         if(dst_type.sign) {
#if HAVE_LLVM >= 0x0207
            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", dst_vec_type, lo, hi);
#else
            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi);
#endif
         }
         else {
            if (util_cpu_caps.has_sse4_1) {
               return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi);
            }
            else {
               /* use generic shuffle below */
               res = NULL;
            }
         }
         break;
      case 16:
         if(dst_type.sign)
#if HAVE_LLVM >= 0x0207
            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", dst_vec_type, lo, hi);
#else
            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", src_vec_type, lo, hi);
#endif
         else
#if HAVE_LLVM >= 0x0207
            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", dst_vec_type, lo, hi);
#else
            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", src_vec_type, lo, hi);
#endif
         break;
      default:
         assert(0);
         return LLVMGetUndef(dst_vec_type);
         break;
      }

      if (res) {
         res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
         return res;
      }
   }

   /* generic shuffle (the snippet was truncated here; this tail follows the
    * fallback path of the revised version of this function below) */
   lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
   hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");

   shuffle = lp_build_const_pack_shuffle(gallivm, dst_type.length);

   res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");

   return res;
}
/**
 * Non-interleaved pack.
 *
 * This will move values as
 *         (LSB)                     (MSB)
 *   lo =   l0 __ l1 __ l2 __..  __ ln __
 *   hi =   h0 __ h1 __ h2 __..  __ hn __
 *   res =  l0 l1 l2 .. ln h0 h1 h2 .. hn
 *
 * This will only change the number of bits the values are represented in,
 * not the values themselves.
 *
 * It is assumed the values are already clamped into the destination type
 * range. Values outside that range will produce undefined results. Use
 * lp_build_packs2 instead.
 */
LLVMValueRef
lp_build_pack2(struct gallivm_state *gallivm,
               struct lp_type src_type,
               struct lp_type dst_type,
               LLVMValueRef lo,
               LLVMValueRef hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMValueRef shuffle;
   LLVMValueRef res = NULL;
   struct lp_type intr_type = dst_type;

#if HAVE_LLVM < 0x0207
   intr_type = src_type;
#endif

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   /* Check for special cases first */
   if(util_cpu_caps.has_sse2 && src_type.width * src_type.length >= 128) {
      const char *intrinsic = NULL;

      switch(src_type.width) {
      case 32:
         if(dst_type.sign) {
            intrinsic = "llvm.x86.sse2.packssdw.128";
         }
         else {
            if (util_cpu_caps.has_sse4_1) {
               intrinsic = "llvm.x86.sse41.packusdw";
#if HAVE_LLVM < 0x0207
               /* llvm < 2.7 has inconsistent signatures except for packusdw */
               intr_type = dst_type;
#endif
            }
         }
         break;
      case 16:
         if (dst_type.sign) {
            intrinsic = "llvm.x86.sse2.packsswb.128";
         } else {
            intrinsic = "llvm.x86.sse2.packuswb.128";
         }
         break;
      /* default uses generic shuffle below */
      }
      if (intrinsic) {
         if (src_type.width * src_type.length == 128) {
            LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
            res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi);
            if (dst_vec_type != intr_vec_type) {
               res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
            }
         }
         else {
            int num_split = src_type.width * src_type.length / 128;
            int i;
            int nlen = 128 / src_type.width;
            struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128);
            struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128);
            LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128];
            LLVMValueRef tmplo, tmphi;
            LLVMTypeRef ndst_vec_type = lp_build_vec_type(gallivm, ndst_type);
            LLVMTypeRef nintr_vec_type = lp_build_vec_type(gallivm, nintr_type);

            assert(num_split <= LP_MAX_VECTOR_WIDTH / 128);

            for (i = 0; i < num_split / 2; i++) {
               tmplo = lp_build_extract_range(gallivm,
                                              lo, i*nlen*2, nlen);
               tmphi = lp_build_extract_range(gallivm,
                                              lo, i*nlen*2 + nlen, nlen);
               tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic,
                                                     nintr_vec_type, tmplo, tmphi);
               if (ndst_vec_type != nintr_vec_type) {
                  tmpres[i] = LLVMBuildBitCast(builder, tmpres[i],
                                               ndst_vec_type, "");
               }
            }
            for (i = 0; i < num_split / 2; i++) {
               tmplo = lp_build_extract_range(gallivm,
                                              hi, i*nlen*2, nlen);
               tmphi = lp_build_extract_range(gallivm,
                                              hi, i*nlen*2 + nlen, nlen);
               tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic,
                                                                 nintr_vec_type,
                                                                 tmplo, tmphi);
               if (ndst_vec_type != nintr_vec_type) {
                  tmpres[i+num_split/2] = LLVMBuildBitCast(builder,
                                                           tmpres[i+num_split/2],
                                                           ndst_vec_type, "");
               }
            }
            res = lp_build_concat(gallivm, tmpres, ndst_type, num_split);
         }
         return res;
      }
   }

   /* generic shuffle */
   lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
   hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");

   shuffle = lp_build_const_pack_shuffle(gallivm, dst_type.length);

   res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");

   return res;
}
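/*
 * Usage sketch (not part of the original source): packing two 4 x uint32
 * vectors, whose lanes are already clamped to [0, 65535], into a single
 * 8 x uint16 vector. It assumes the two-argument lp_type_uint_vec
 * constructor (element width, total bits), matching the lp_type_int_vec
 * calls in lp_build_float_to_half below.
 */
static LLVMValueRef
example_pack_u32_to_u16(struct gallivm_state *gallivm,
                        LLVMValueRef lo, LLVMValueRef hi)
{
   struct lp_type src_type = lp_type_uint_vec(32, 128); /* 4 x uint32 */
   struct lp_type dst_type = lp_type_uint_vec(16, 128); /* 8 x uint16 */
   /* With SSE4.1 this maps to llvm.x86.sse41.packusdw; otherwise the
    * generic shuffle path at the end of lp_build_pack2 is used. */
   return lp_build_pack2(gallivm, src_type, dst_type, lo, hi);
}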
/**
 * Converts float32 to int16 half-float
 * Note this can be performed in 1 instruction if vcvtps2ph exists (f16c/cvt16)
 * [llvm.x86.vcvtps2ph / _mm_cvtps_ph]
 *
 * @param src           value to convert
 *
 * Convert float32 to half floats, preserving Infs and NaNs,
 * with rounding towards zero (trunc).
 */
LLVMValueRef
lp_build_float_to_half(struct gallivm_state *gallivm,
                       LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef f32_vec_type = LLVMTypeOf(src);
   unsigned length = LLVMGetTypeKind(f32_vec_type) == LLVMVectorTypeKind
                   ? LLVMGetVectorSize(f32_vec_type) : 1;
   struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
   struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
   LLVMValueRef result;

   if (util_cpu_caps.has_f16c && HAVE_LLVM >= 0x0301 &&
       (length == 4 || length == 8)) {
      struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
      unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      const char *intrinsic = NULL;
      if (length == 4) {
         intrinsic = "llvm.x86.vcvtps2ph.128";
      }
      else {
         intrinsic = "llvm.x86.vcvtps2ph.256";
      }
      result = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, i168_type),
                                         src, LLVMConstInt(i32t, mode, 0));
      if (length == 4) {
         result = lp_build_extract_range(gallivm, result, 0, 4);
      }
   }

   else {
      result = lp_build_float_to_smallfloat(gallivm, i32_type, src, 10, 5, 0, true);
      /* Convert int32 vector to int16 vector by trunc (might generate bad code) */
      result = LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i16_type), "");
   }

   /*
    * Debugging code.
    */
   if (0) {
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      LLVMTypeRef i16t = LLVMInt16TypeInContext(gallivm->context);
      LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
      LLVMValueRef ref_result = LLVMGetUndef(LLVMVectorType(i16t, length));
      unsigned i;

      LLVMTypeRef func_type = LLVMFunctionType(i16t, &f32t, 1, 0);
      LLVMValueRef func = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer)util_float_to_half));
      func = LLVMBuildBitCast(builder, func, LLVMPointerType(func_type, 0), "util_float_to_half");

      for (i = 0; i < length; ++i) {
         LLVMValueRef index = LLVMConstInt(i32t, i, 0);
         LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, "");
#if 0
         /* XXX: not really supported by backends */
         LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32);
#else
         LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");
#endif
         ref_result = LLVMBuildInsertElement(builder, ref_result, f16, index, "");
      }

      lp_build_print_value(gallivm, "src = ", src);
      lp_build_print_value(gallivm, "llvm = ", result);
      lp_build_print_value(gallivm, "util = ", ref_result);
      lp_build_printf(gallivm, "\n");
   }

   return result;
}
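/*
 * Usage sketch (not part of the original source): converting a 4 x float32
 * vector to its half-float bit patterns. The result is a 4 x int16 vector
 * of raw IEEE half encodings; e.g. a lane holding 1.0f comes back as
 * 0x3C00.
 */
static LLVMValueRef
example_f32x4_to_f16x4(struct gallivm_state *gallivm, LLVMValueRef src)
{
   /* src must be a 4 x float32 value built with this gallivm context. */
   return lp_build_float_to_half(gallivm, src);
}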
/**
 * Call intrinsic with arguments adapted to intrinsic vector length.
 *
 * Split vectors which are too large for the hw, or expand them if they
 * are too small, so a caller calling a function which might use intrinsics
 * doesn't need to do splitting/expansion on its own.
 * This only supports intrinsics where src and dst types match.
 */
LLVMValueRef
lp_build_intrinsic_binary_anylength(struct gallivm_state *gallivm,
                                    const char *name,
                                    struct lp_type src_type,
                                    unsigned intr_size,
                                    LLVMValueRef a,
                                    LLVMValueRef b)
{
   unsigned i;
   struct lp_type intrin_type = src_type;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
   LLVMValueRef anative, bnative;
   unsigned intrin_length = intr_size / src_type.width;

   intrin_type.length = intrin_length;

   if (intrin_length > src_type.length) {
      LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
      LLVMValueRef constvec, tmp;

      for (i = 0; i < src_type.length; i++) {
         elems[i] = lp_build_const_int32(gallivm, i);
      }
      for (; i < intrin_length; i++) {
         elems[i] = i32undef;
      }
      if (src_type.length == 1) {
         LLVMTypeRef elem_type = lp_build_elem_type(gallivm, intrin_type);
         a = LLVMBuildBitCast(builder, a, LLVMVectorType(elem_type, 1), "");
         b = LLVMBuildBitCast(builder, b, LLVMVectorType(elem_type, 1), "");
      }
      constvec = LLVMConstVector(elems, intrin_length);
      anative = LLVMBuildShuffleVector(builder, a, a, constvec, "");
      bnative = LLVMBuildShuffleVector(builder, b, b, constvec, "");
      tmp = lp_build_intrinsic_binary(builder, name,
                                      lp_build_vec_type(gallivm, intrin_type),
                                      anative, bnative);
      if (src_type.length > 1) {
         constvec = LLVMConstVector(elems, src_type.length);
         return LLVMBuildShuffleVector(builder, tmp, tmp, constvec, "");
      }
      else {
         return LLVMBuildExtractElement(builder, tmp, elems[0], "");
      }
   }
   else if (intrin_length < src_type.length) {
      unsigned num_vec = src_type.length / intrin_length;
      LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];

      /* don't support arbitrary size here as this is so yuck */
      if (src_type.length % intrin_length) {
         /* FIXME: This is something which should be supported
          * but there doesn't seem to be any need for it currently
          * so crash and burn.
          */
         debug_printf("%s: should handle arbitrary vector size\n",
                      __FUNCTION__);
         assert(0);
         return NULL;
      }

      for (i = 0; i < num_vec; i++) {
         anative = lp_build_extract_range(gallivm, a, i*intrin_length,
                                          intrin_length);
         bnative = lp_build_extract_range(gallivm, b, i*intrin_length,
                                          intrin_length);
         tmp[i] = lp_build_intrinsic_binary(builder, name,
                                            lp_build_vec_type(gallivm, intrin_type),
                                            anative, bnative);
      }
      return lp_build_concat(gallivm, tmp, intrin_type, num_vec);
   }
   else {
      return lp_build_intrinsic_binary(builder, name,
                                       lp_build_vec_type(gallivm, src_type),
                                       a, b);
   }
}
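/*
 * Usage sketch (not part of the original source): element-wise signed
 * 16-bit min over a vector of arbitrary length, built on the 128-bit
 * "llvm.x86.sse2.pmins.w" intrinsic also used by lp_build_min_simple
 * above. The helper pads shorter vectors with undefs and splits wider
 * ones into 8 x i16 halves, as described in the comment above.
 */
static LLVMValueRef
example_min_i16_anylength(struct gallivm_state *gallivm,
                          struct lp_type type, /* signed, 16-bit elements */
                          LLVMValueRef a, LLVMValueRef b)
{
   assert(type.width == 16 && type.sign && !type.floating);
   return lp_build_intrinsic_binary_anylength(gallivm,
                                              "llvm.x86.sse2.pmins.w",
                                              type, 128, a, b);
}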
static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
{
   struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
   struct lp_build_context * base = &bld_base->base;
   unsigned i;
   unsigned color_count = 0;
   boolean has_color = false;

   /* Add the necessary export instructions */
   for (i = 0; i < ctx->output_reg_count; i++) {
      unsigned chan;
      for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
         LLVMValueRef output;
         unsigned adjusted_reg_idx = i + ctx->reserved_reg_count;

         output = LLVMBuildLoad(base->gallivm->builder,
               ctx->soa.outputs[i][chan], "");

         if (ctx->type == TGSI_PROCESSOR_VERTEX) {
            LLVMValueRef reg_index = lp_build_const_int32(
                  base->gallivm,
                  radeon_llvm_reg_index_soa(adjusted_reg_idx, chan));
            lp_build_intrinsic_binary(
                  base->gallivm->builder,
                  "llvm.AMDGPU.store.output",
                  LLVMVoidTypeInContext(base->gallivm->context),
                  output, reg_index);
         } else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
            switch (ctx->r600_outputs[i].name) {
            case TGSI_SEMANTIC_COLOR:
               has_color = true;
               if (color_count / 4 < ctx->color_buffer_count) {
                  if (ctx->fs_color_all) {
                     for (unsigned j = 0; j < ctx->color_buffer_count; j++) {
                        LLVMValueRef reg_index = lp_build_const_int32(
                              base->gallivm,
                              (j * 4) + chan);
                        lp_build_intrinsic_binary(
                              base->gallivm->builder,
                              "llvm.R600.store.pixel.color",
                              LLVMVoidTypeInContext(base->gallivm->context),
                              output, reg_index);
                     }
                  } else {
                     LLVMValueRef reg_index = lp_build_const_int32(
                           base->gallivm,
                           (color_count++ / 4) * 4 + chan);
                     lp_build_intrinsic_binary(
                           base->gallivm->builder,
                           "llvm.R600.store.pixel.color",
                           LLVMVoidTypeInContext(base->gallivm->context),
                           output, reg_index);
                  }
               }
               break;
            case TGSI_SEMANTIC_POSITION:
               if (chan != 2)
                  continue;
               lp_build_intrinsic_unary(
                     base->gallivm->builder,
                     "llvm.R600.store.pixel.depth",
                     LLVMVoidTypeInContext(base->gallivm->context),
                     output);
               break;
            case TGSI_SEMANTIC_STENCIL:
               if (chan != 1)
                  continue;
               lp_build_intrinsic_unary(
                     base->gallivm->builder,
                     "llvm.R600.store.pixel.stencil",
                     LLVMVoidTypeInContext(base->gallivm->context),
                     output);
               break;
            }
         }
      }
   }

   if (!has_color && ctx->type == TGSI_PROCESSOR_FRAGMENT)
      lp_build_intrinsic(base->gallivm->builder,
            "llvm.R600.store.pixel.dummy",
            LLVMVoidTypeInContext(base->gallivm->context),
            0, 0);
}