/** * Fetch a pixel into a 4 float AoS. * * \param format_desc describes format of the image we're fetching from * \param ptr address of the pixel block (or the texel if uncompressed) * \param i, j the sub-block pixel coordinates. For non-compressed formats * these will always be (0, 0). * \return a 4 element vector with the pixel's RGBA values. */ LLVMValueRef lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, const struct util_format_description *format_desc, struct lp_type type, LLVMValueRef base_ptr, LLVMValueRef offset, LLVMValueRef i, LLVMValueRef j) { LLVMBuilderRef builder = gallivm->builder; unsigned num_pixels = type.length / 4; struct lp_build_context bld; assert(type.length <= LP_MAX_VECTOR_LENGTH); assert(type.length % 4 == 0); lp_build_context_init(&bld, gallivm, type); /* * Trivial case * * The format matches the type (apart of a swizzle) so no need for * scaling or converting. */ if (format_matches_type(format_desc, type) && format_desc->block.bits <= type.width * 4 && util_is_power_of_two(format_desc->block.bits)) { LLVMValueRef packed; /* * The format matches the type (apart of a swizzle) so no need for * scaling or converting. 
*/ packed = lp_build_gather(gallivm, type.length/4, format_desc->block.bits, type.width*4, base_ptr, offset); assert(format_desc->block.bits <= type.width * type.length); packed = LLVMBuildBitCast(gallivm->builder, packed, lp_build_vec_type(gallivm, type), ""); return lp_build_format_swizzle_aos(format_desc, &bld, packed); } /* * Bit arithmetic */ if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB || format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) && format_desc->block.width == 1 && format_desc->block.height == 1 && util_is_power_of_two(format_desc->block.bits) && format_desc->block.bits <= 32 && format_desc->is_bitmask && !format_desc->is_mixed && (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED || format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED)) { LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4]; LLVMValueRef res; unsigned k; /* * Unpack a pixel at a time into a <4 x float> RGBA vector */ for (k = 0; k < num_pixels; ++k) { LLVMValueRef packed; packed = lp_build_gather_elem(gallivm, num_pixels, format_desc->block.bits, 32, base_ptr, offset, k); tmps[k] = lp_build_unpack_arith_rgba_aos(gallivm, format_desc, packed); } /* * Type conversion. * * TODO: We could avoid floating conversion for integer to * integer conversions. 
*/ if (gallivm_debug & GALLIVM_DEBUG_PERF && !type.floating) { debug_printf("%s: unpacking %s with floating point\n", __FUNCTION__, format_desc->short_name); } lp_build_conv(gallivm, lp_float32_vec4_type(), type, tmps, num_pixels, &res, 1); return lp_build_format_swizzle_aos(format_desc, &bld, res); } /* * YUV / subsampled formats */ if (format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) { struct lp_type tmp_type; LLVMValueRef tmp; memset(&tmp_type, 0, sizeof tmp_type); tmp_type.width = 8; tmp_type.length = num_pixels * 4; tmp_type.norm = TRUE; tmp = lp_build_fetch_subsampled_rgba_aos(gallivm, format_desc, num_pixels, base_ptr, offset, i, j); lp_build_conv(gallivm, tmp_type, type, &tmp, 1, &tmp, 1); return tmp; } /* * Fallback to util_format_description::fetch_rgba_8unorm(). */ if (format_desc->fetch_rgba_8unorm && !type.floating && type.width == 8 && !type.sign && type.norm) { /* * Fallback to calling util_format_description::fetch_rgba_8unorm. * * This is definitely not the most efficient way of fetching pixels, as * we miss the opportunity to do vectorization, but this it is a * convenient for formats or scenarios for which there was no opportunity * or incentive to optimize. */ LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(gallivm->builder))); char name[256]; LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context); LLVMTypeRef pi8t = LLVMPointerType(i8t, 0); LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); LLVMValueRef function; LLVMValueRef tmp_ptr; LLVMValueRef tmp; LLVMValueRef res; LLVMValueRef callee; unsigned k; util_snprintf(name, sizeof name, "util_format_%s_fetch_rgba_8unorm", format_desc->short_name); if (gallivm_debug & GALLIVM_DEBUG_PERF) { debug_printf("%s: falling back to %s\n", __FUNCTION__, name); } /* * Declare and bind format_desc->fetch_rgba_8unorm(). 
*/ function = LLVMGetNamedFunction(module, name); if (!function) { /* * Function to call looks like: * fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j) */ LLVMTypeRef ret_type; LLVMTypeRef arg_types[4]; LLVMTypeRef function_type; ret_type = LLVMVoidTypeInContext(gallivm->context); arg_types[0] = pi8t; arg_types[1] = pi8t; arg_types[2] = i32t; arg_types[3] = i32t; function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0); function = LLVMAddFunction(module, name, function_type); LLVMSetFunctionCallConv(function, LLVMCCallConv); LLVMSetLinkage(function, LLVMExternalLinkage); assert(LLVMIsDeclaration(function)); } /* make const pointer for the C fetch_rgba_float function */ callee = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer) format_desc->fetch_rgba_8unorm)); /* cast the callee pointer to the function's type */ function = LLVMBuildBitCast(builder, callee, LLVMTypeOf(function), "cast callee"); tmp_ptr = lp_build_alloca(gallivm, i32t, ""); res = LLVMGetUndef(LLVMVectorType(i32t, num_pixels)); /* * Invoke format_desc->fetch_rgba_8unorm() for each pixel and insert the result * in the SoA vectors. */ for (k = 0; k < num_pixels; ++k) { LLVMValueRef index = lp_build_const_int32(gallivm, k); LLVMValueRef args[4]; args[0] = LLVMBuildBitCast(builder, tmp_ptr, pi8t, ""); args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels, base_ptr, offset, k); if (num_pixels == 1) { args[2] = i; args[3] = j; } else { args[2] = LLVMBuildExtractElement(builder, i, index, ""); args[3] = LLVMBuildExtractElement(builder, j, index, ""); } LLVMBuildCall(builder, function, args, Elements(args), ""); tmp = LLVMBuildLoad(builder, tmp_ptr, ""); if (num_pixels == 1) { res = tmp; } else { res = LLVMBuildInsertElement(builder, res, tmp, index, ""); } } /* Bitcast from <n x i32> to <4n x i8> */ res = LLVMBuildBitCast(builder, res, bld.vec_type, ""); return res; } /* * Fallback to util_format_description::fetch_rgba_float(). 
*/ if (format_desc->fetch_rgba_float) { /* * Fallback to calling util_format_description::fetch_rgba_float. * * This is definitely not the most efficient way of fetching pixels, as * we miss the opportunity to do vectorization, but this it is a * convenient for formats or scenarios for which there was no opportunity * or incentive to optimize. */ LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder))); char name[256]; LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context); LLVMTypeRef f32x4t = LLVMVectorType(f32t, 4); LLVMTypeRef pf32t = LLVMPointerType(f32t, 0); LLVMTypeRef pi8t = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0); LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); LLVMValueRef function; LLVMValueRef tmp_ptr; LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4]; LLVMValueRef res; LLVMValueRef callee; unsigned k; util_snprintf(name, sizeof name, "util_format_%s_fetch_rgba_float", format_desc->short_name); if (gallivm_debug & GALLIVM_DEBUG_PERF) { debug_printf("%s: falling back to %s\n", __FUNCTION__, name); } /* * Declare and bind format_desc->fetch_rgba_float(). */ function = LLVMGetNamedFunction(module, name); if (!function) { /* * Function to call looks like: * fetch(float *dst, const uint8_t *src, unsigned i, unsigned j) */ LLVMTypeRef ret_type; LLVMTypeRef arg_types[4]; LLVMTypeRef function_type; ret_type = LLVMVoidTypeInContext(gallivm->context); arg_types[0] = pf32t; arg_types[1] = pi8t; arg_types[2] = i32t; arg_types[3] = i32t; function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0); function = LLVMAddFunction(module, name, function_type); LLVMSetFunctionCallConv(function, LLVMCCallConv); LLVMSetLinkage(function, LLVMExternalLinkage); assert(LLVMIsDeclaration(function)); } /* Note: we're using this casting here instead of LLVMAddGlobalMapping() * to work around a bug in LLVM 2.6. 
*/ /* make const pointer for the C fetch_rgba_float function */ callee = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer) format_desc->fetch_rgba_float)); /* cast the callee pointer to the function's type */ function = LLVMBuildBitCast(builder, callee, LLVMTypeOf(function), "cast callee"); tmp_ptr = lp_build_alloca(gallivm, f32x4t, ""); /* * Invoke format_desc->fetch_rgba_float() for each pixel and insert the result * in the SoA vectors. */ for (k = 0; k < num_pixels; ++k) { LLVMValueRef args[4]; args[0] = LLVMBuildBitCast(builder, tmp_ptr, pf32t, ""); args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels, base_ptr, offset, k); if (num_pixels == 1) { args[2] = i; args[3] = j; } else { LLVMValueRef index = lp_build_const_int32(gallivm, k); args[2] = LLVMBuildExtractElement(builder, i, index, ""); args[3] = LLVMBuildExtractElement(builder, j, index, ""); } LLVMBuildCall(builder, function, args, Elements(args), ""); tmps[k] = LLVMBuildLoad(builder, tmp_ptr, ""); } lp_build_conv(gallivm, lp_float32_vec4_type(), type, tmps, num_pixels, &res, 1); return res; } assert(0); return lp_build_undef(gallivm, type); }
/**
 * Fetch texels from a texture, returning them in SoA layout.
 *
 * \param gallivm      code generation state
 * \param format_desc  describes the texture format
 * \param type         the desired return type for 'rgba_out'.  The vector
 *                     length is the number of texels to fetch
 * \param base_ptr     points to the base of the texture mip tree.
 * \param offset       offset to start of the texture image block.  For non-
 *                     compressed formats, this simply is an offset to the
 *                     texel.  For compressed formats, it is an offset to the
 *                     start of the compressed data block.
 * \param i, j         the sub-block pixel coordinates.  For non-compressed
 *                     formats these will always be (0,0).  For compressed
 *                     formats, i will be in [0, block_width-1] and j will be
 *                     in [0, block_height-1].
 * \param rgba_out     receives the four per-channel (R, G, B, A) vectors
 */
void
lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
                        const struct util_format_description *format_desc,
                        struct lp_type type,
                        LLVMValueRef base_ptr,
                        LLVMValueRef offset,
                        LLVMValueRef i,
                        LLVMValueRef j,
                        LLVMValueRef rgba_out[4])
{
   LLVMBuilderRef builder = gallivm->builder;
   const unsigned block_bits = format_desc->block.bits;
   const int single_texel_block = format_desc->block.width == 1 &&
                                  format_desc->block.height == 1;
   const int is_r11g11b10 =
      format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT;
   const int is_rgb9e5 =
      format_desc->format == PIPE_FORMAT_R9G9B9E5_FLOAT;

   /*
    * Case 1: plain formats whose packed texel fits into one element of the
    * destination type.  Gather the packed texels into a vector and split
    * out every channel for all elements at once.
    */
   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       single_texel_block &&
       block_bits <= type.width) {
      const int colorspace_ok =
         format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
         format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
         format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS;
      const int chan0_ok =
         format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
         format_desc->channel[0].size == 32;

      if (colorspace_ok && chan0_ok) {
         LLVMValueRef texels;

         assert(block_bits <= type.width);

         /* texels = {XYZW, XYZW, XYZW, XYZW} */
         texels = lp_build_gather(gallivm,
                                  type.length,
                                  block_bits,
                                  type.width,
                                  base_ptr, offset, FALSE);

         /* split the packed texels into per-channel vectors */
         lp_build_unpack_rgba_soa(gallivm,
                                  format_desc,
                                  type,
                                  texels, rgba_out);
         return;
      }
   }

   /*
    * Case 2: packed small-float formats.  Same idea as case 1, but the
    * AoS packed -> SoA float step needs dedicated conversion code.
    */
   if (is_r11g11b10 || is_rgb9e5) {
      LLVMValueRef texels;

      assert(type.floating);
      assert(type.width == 32);

      texels = lp_build_gather(gallivm, type.length,
                               block_bits,
                               type.width, base_ptr, offset,
                               FALSE);

      if (is_r11g11b10) {
         lp_build_r11g11b10_to_float(gallivm, texels, rgba_out);
      }
      else {
         lp_build_rgb9e5_to_float(gallivm, texels, rgba_out);
      }
      return;
   }

   /*
    * Case 3: 64-bit depth/stencil blocks, of which only 32 bits (or 8 bits)
    * are needed per texel.
    */
   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
       block_bits == 64) {
      LLVMValueRef zs;

      if (format_desc->format != PIPE_FORMAT_X32_S8X24_UINT) {
         /* depth: the 32 bits at 'offset', reinterpreted as 'type' */
         assert (format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);

         zs = lp_build_gather(gallivm, type.length, 32, type.width,
                              base_ptr, offset, TRUE);
         zs = LLVMBuildBitCast(builder, zs,
                               lp_build_vec_type(gallivm, type), "");
      }
      else {
         /*
          * stencil: just step over the first four bytes of the block and
          * keep the low 8 bits - the base_ptr could in fact be adjusted
          * instead, even outside the shader.
          */
         const unsigned stencil_mask = 0xff;
         LLVMValueRef four = lp_build_const_int_vec(gallivm, type, 4);
         LLVMValueRef stencil_offset = LLVMBuildAdd(builder, offset, four, "");

         zs = lp_build_gather(gallivm, type.length, 32, type.width,
                              base_ptr, stencil_offset, FALSE);
         zs = LLVMBuildAnd(builder, zs,
                           lp_build_const_int_vec(gallivm, type, stencil_mask),
                           "");
      }

      /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */
      rgba_out[0] = zs;
      rgba_out[1] = zs;
      rgba_out[2] = zs;
      rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f);
      return;
   }

   /*
    * Case 4: formats that fit in 8-bit unorm - fetch every texel with a
    * single lp_build_fetch_rgba_aos call and transpose to float SoA.
    */
   if (util_format_fits_8unorm(format_desc) &&
       type.floating && type.width == 32 &&
       (type.length == 1 || (type.length % 4 == 0))) {
      struct lp_type rgba8_type;
      LLVMValueRef rgba8;

      memset(&rgba8_type, 0, sizeof rgba8_type);
      rgba8_type.width = 8;
      rgba8_type.length = type.length * 4;
      rgba8_type.norm = TRUE;

      rgba8 = lp_build_fetch_rgba_aos(gallivm, format_desc, rgba8_type,
                                      base_ptr, offset, i, j);

      lp_build_rgba8_to_fi32_soa(gallivm,
                                 type,
                                 rgba8,
                                 rgba_out);
      return;
   }

   /*
    * Case 5: last resort - one lp_build_fetch_rgba_aos call per texel.
    *
    * Some vectorization opportunities are lost this way, but it covers
    * formats or scenarios for which there was no opportunity or incentive
    * to optimize.
    */
   {
      struct lp_type pixel_type = type;
      unsigned texel, c;

      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
         debug_printf("%s: scalar unpacking of %s\n",
                      __FUNCTION__, format_desc->short_name);
      }

      /* a single RGBA pixel of the destination element type */
      pixel_type.length = 4;

      for (c = 0; c < 4; ++c) {
         rgba_out[c] = lp_build_undef(gallivm, type);
      }

      for (texel = 0; texel < type.length; ++texel) {
         LLVMValueRef lane = lp_build_const_int32(gallivm, texel);
         LLVMValueRef lane_offset;
         LLVMValueRef lane_i, lane_j;
         LLVMValueRef pixel;

         lane_offset = LLVMBuildExtractElement(builder, offset, lane, "");
         lane_i = LLVMBuildExtractElement(builder, i, lane, "");
         lane_j = LLVMBuildExtractElement(builder, j, lane, "");

         /* pixel = {R, G, B, A} for this texel */
         pixel = lp_build_fetch_rgba_aos(gallivm, format_desc, pixel_type,
                                         base_ptr, lane_offset,
                                         lane_i, lane_j);

         /* scatter the AoS channels into slot 'lane' of each SoA vector */
         for (c = 0; c < 4; ++c) {
            LLVMValueRef chan_index = lp_build_const_int32(gallivm, c);
            LLVMValueRef chan_value =
               LLVMBuildExtractElement(builder, pixel, chan_index, "");

            rgba_out[c] = LLVMBuildInsertElement(builder, rgba_out[c],
                                                 chan_value, lane, "");
         }
      }
   }
}
/**
 * Narrow or widen the per-element bit width of one or more vectors.
 *
 * The total number of channels is preserved: src_type.length * num_srcs
 * must equal dst_type.length * num_dsts.  Only 1:N, M:1 and 1:1 vector
 * counts are handled, and float <-> int conversion is not done here.
 *
 * NOTE: Getting the right sign flags is crucial here, as we employ some
 * intrinsics that do saturation.
 *
 * \param gallivm   code generation state
 * \param src_type  type of the source vectors
 * \param dst_type  type of the destination vectors
 * \param src       source vectors
 * \param num_srcs  number of entries in \p src
 * \param dst       receives the resized vectors
 * \param num_dsts  number of entries to write to \p dst
 */
void
lp_build_resize(struct gallivm_state *gallivm,
                struct lp_type src_type,
                struct lp_type dst_type,
                const LLVMValueRef *src, unsigned num_srcs,
                LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef out[LP_MAX_VECTOR_LENGTH];
   /* Register sizes in bits, taken before any type adjustment below. */
   const unsigned src_bits = src_type.width * src_type.length;
   const unsigned dst_bits = dst_type.width * dst_type.length;
   unsigned n;

   /*
    * We don't support float <-> int conversion here. That must be done
    * before/after calling this function.
    */
   assert(src_type.floating == dst_type.floating);

   /*
    * We don't support double <-> float conversion yet, although it could be
    * added with little effort.
    */
   assert((!src_type.floating && !dst_type.floating) ||
          src_type.width == dst_type.width);

   /* We must not loose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   /* We don't support M:N conversion, only 1:N, M:1, or 1:1 */
   assert(num_srcs == 1 || num_dsts == 1);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   if (src_type.width == dst_type.width) {
      /*
       * Same element width: nothing to do.
       */
      assert(num_srcs == 1);
      assert(num_dsts == 1);

      out[0] = src[0];
   }
   else if (src_type.width > dst_type.width) {
      /*
       * Narrowing.
       */
      assert(num_dsts == 1);

      if (src_bits == dst_bits) {
         /*
          * Register width remains constant -- use vector packing intrinsics
          */
         out[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
                                src, num_srcs);
      }
      else if (src_type.width / dst_type.width > num_srcs) {
         /*
          * Source registers are wider than the destination: first split
          * each source (with a shuffle) into destination-sized pieces,
          * then pack normally.
          * Note: cannot use cast/extract because llvm generates atrocious
          * code.
          */
         const unsigned ratio = src_bits / dst_bits;
         const unsigned piece_length = src_type.length / ratio;
         const unsigned num_pieces = ratio * num_srcs;

         for (n = 0; n < num_pieces; n++) {
            unsigned first = (n % ratio) * piece_length;

            out[n] = lp_build_extract_range(gallivm, src[n / ratio],
                                            first, piece_length);
         }

         num_srcs = num_pieces;
         src_type.length = piece_length;
         out[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
                                out, num_srcs);
      }
      else {
         /*
          * Narrower elements but a wider register - pack first, then
          * concatenate, simply because this should be more AVX-friendly
          * for the cases we probably hit.
          */
         const unsigned ratio = dst_bits / src_bits;
         const unsigned srcs_per_pack = num_srcs / ratio;

         dst_type.length = dst_type.length / ratio;

         for (n = 0; n < ratio; n++) {
            out[n] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
                                   &src[n * srcs_per_pack], srcs_per_pack);
         }

         out[0] = lp_build_concat(gallivm, out, dst_type, ratio);
      }
   }
   else {
      /*
       * Widening.
       */
      assert(num_srcs == 1);

      if (src_bits == dst_bits) {
         /*
          * Register width remains constant -- use vector unpack intrinsics
          */
         lp_build_unpack(gallivm, src_type, dst_type, src[0], out, num_dsts);
      }
      else {
         /*
          * Register width changes too: extend one element at a time.
          */
         const int sign_extend = src_type.sign && dst_type.sign;

         assert(src_type.length * num_srcs == dst_type.length * num_dsts);

         for (n = 0; n < num_dsts; n++) {
            out[n] = lp_build_undef(gallivm, dst_type);
         }

         for (n = 0; n < src_type.length; ++n) {
            /* destination vector and slot that receive source element n */
            unsigned which_dst = n / dst_type.length;
            LLVMValueRef from = lp_build_const_int32(gallivm, n);
            LLVMValueRef to =
               lp_build_const_int32(gallivm, n % dst_type.length);
            LLVMValueRef elem =
               LLVMBuildExtractElement(builder, src[0], from, "");

            if (sign_extend) {
               elem = LLVMBuildSExt(builder, elem,
                                    lp_build_elem_type(gallivm, dst_type),
                                    "");
            }
            else {
               elem = LLVMBuildZExt(builder, elem,
                                    lp_build_elem_type(gallivm, dst_type),
                                    "");
            }

            out[which_dst] = LLVMBuildInsertElement(builder, out[which_dst],
                                                    elem, to, "");
         }
      }
   }

   /* Hand the results back to the caller. */
   for (n = 0; n < num_dsts; ++n) {
      dst[n] = out[n];
   }
}
/**
 * Unpack several pixels in SoA.
 *
 * It takes a vector of packed pixels:
 *
 *   packed = {P0, P1, P2, P3, ..., Pn}
 *
 * And will produce four vectors:
 *
 *   red    = {R0, R1, R2, R3, ..., Rn}
 *   green  = {G0, G1, G2, G3, ..., Gn}
 *   blue   = {B0, B1, B2, B3, ..., Bn}
 *   alpha  = {A0, A1, A2, A3, ..., An}
 *
 * It requires that a packed pixel fits into an element of the output
 * channels. The common case is when converting pixel with a depth of 32 bit or
 * less into floats.
 *
 * Only plain 1x1-block formats are accepted and only 32-bit wide output
 * types are currently handled (both are asserted).
 *
 * \param gallivm      code generation state
 * \param format_desc  the format of the 'packed' incoming pixel vector
 * \param type  the desired type for rgba_out (type.length = n, above)
 * \param packed  the incoming vector of packed pixels
 * \param rgba_out  returns the SoA R,G,B,A vectors
 */
void
lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
                         const struct util_format_description *format_desc,
                         struct lp_type type,
                         LLVMValueRef packed,
                         LLVMValueRef rgba_out[4])
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context bld;
   LLVMValueRef inputs[4];
   unsigned chan;

   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
   assert(format_desc->block.width == 1);
   assert(format_desc->block.height == 1);
   assert(format_desc->block.bits <= type.width);
   /* FIXME: Support more output types */
   assert(type.width == 32);

   lp_build_context_init(&bld, gallivm, type);

   /* Decode the input vector components */
   for (chan = 0; chan < format_desc->nr_channels; ++chan) {
      /* Bit range [start, stop) of this channel within the packed pixel. */
      const unsigned width = format_desc->channel[chan].size;
      const unsigned start = format_desc->channel[chan].shift;
      const unsigned stop = start + width;
      LLVMValueRef input;

      input = packed;

      switch(format_desc->channel[chan].type) {
      case UTIL_FORMAT_TYPE_VOID:
         input = lp_build_undef(gallivm, type);
         break;

      case UTIL_FORMAT_TYPE_UNSIGNED:
         /*
          * Align the LSB
          */

         if (start) {
            input = LLVMBuildLShr(builder, input,
                                  lp_build_const_int_vec(gallivm, type, start),
                                  "");
         }

         /*
          * Zero the MSBs
          */

         if (stop < format_desc->block.bits) {
            unsigned mask = ((unsigned long long)1 << width) - 1;
            input = LLVMBuildAnd(builder, input,
                                 lp_build_const_int_vec(gallivm, type, mask),
                                 "");
         }

         /*
          * Type conversion
          */

         if (type.floating) {
            if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
               assert(width == 8);
               if (format_desc->swizzle[3] == chan) {
                  /* the channel that feeds alpha is not sRGB-decoded */
                  input = lp_build_unsigned_norm_to_float(gallivm, width,
                                                          type, input);
               }
               else {
                  struct lp_type conv_type = lp_uint_type(type);
                  input = lp_build_srgb_to_linear(gallivm, conv_type, input);
               }
            }
            else {
               if(format_desc->channel[chan].normalized)
                  input = lp_build_unsigned_norm_to_float(gallivm, width,
                                                          type, input);
               else
                  /*
                   * NOTE(review): a signed conversion is used for an
                   * unsigned channel; for a full 32-bit channel values with
                   * the top bit set would presumably come out negative -
                   * confirm whether that is intended.
                   */
                  input = LLVMBuildSIToFP(builder, input,
                                          lp_build_vec_type(gallivm, type),
                                          "");
            }
         }
         else if (format_desc->channel[chan].pure_integer) {
            /* Nothing to do */
         } else {
             /* FIXME */
             assert(0);
         }

         break;

      case UTIL_FORMAT_TYPE_SIGNED:
         /*
          * Align the sign bit first.
          */

         if (stop < type.width) {
            unsigned bits = type.width - stop;
            LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
            input = LLVMBuildShl(builder, input, bits_val, "");
         }

         /*
          * Align the LSB (with an arithmetic shift to preserve the sign)
          */

         if (format_desc->channel[chan].size < type.width) {
            unsigned bits = type.width - format_desc->channel[chan].size;
            LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
            input = LLVMBuildAShr(builder, input, bits_val, "");
         }

         /*
          * Type conversion
          */

         if (type.floating) {
            input = LLVMBuildSIToFP(builder, input,
                                    lp_build_vec_type(gallivm, type), "");
            if (format_desc->channel[chan].normalized) {
               /*
                * Largest positive value of the channel, 2^(width-1) - 1.
                * Computed in unsigned long long: with a plain int the
                * 32-bit channel case would evaluate 1 << 31, which
                * overflows a signed int (undefined behaviour).
                */
               double scale =
                  1.0 / (double)(((unsigned long long)1 << (width - 1)) - 1);
               LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
               input = LLVMBuildFMul(builder, input, scale_val, "");
               /* the formula above will produce value below -1.0 for most negative
                * value but everything seems happy with that hence disable for now */
               if (0)
                  input = lp_build_max(&bld, input,
                                       lp_build_const_vec(gallivm, type, -1.0f));
            }
         }
         else if (format_desc->channel[chan].pure_integer) {
            /* Nothing to do */
         } else {
             /* FIXME */
             assert(0);
         }

         break;

      case UTIL_FORMAT_TYPE_FLOAT:
         if (type.floating) {
            /* Only a whole-pixel 32-bit float channel is handled. */
            assert(start == 0);
            assert(stop == 32);
            assert(type.width == 32);
            input = LLVMBuildBitCast(builder, input,
                                     lp_build_vec_type(gallivm, type), "");
         }
         else {
            /* FIXME */
            assert(0);
            input = lp_build_undef(gallivm, type);
         }
         break;

      case UTIL_FORMAT_TYPE_FIXED:
         if (type.floating) {
            /*
             * NOTE(review): the divisor is 2^(size/2) - 1; for a 16.16
             * fixed-point value one would expect 2^(size/2) - confirm
             * against the util_format fixed-point unpack code.
             */
            double scale = 1.0 / ((1 << (format_desc->channel[chan].size/2)) - 1);
            LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
            input = LLVMBuildSIToFP(builder, input,
                                    lp_build_vec_type(gallivm, type), "");
            input = LLVMBuildFMul(builder, input, scale_val, "");
         }
         else {
            /* FIXME */
            assert(0);
            input = lp_build_undef(gallivm, type);
         }
         break;

      default:
         assert(0);
         input = lp_build_undef(gallivm, type);
         break;
      }

      inputs[chan] = input;
   }

   /* Route the decoded channels to R,G,B,A according to the format swizzle. */
   lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
}
/**
 * Build code to compare two values 'a' and 'b' of 'type' using the given func.
 *
 * \param gallivm  code generation state
 * \param type     type of both 'a' and 'b' (checked with lp_check_value)
 * \param func     one of PIPE_FUNC_x
 * \param a        left-hand operand
 * \param b        right-hand operand
 * \return a value of the integer vector type matching 'type'.
 *         The result values will be 0 for false or ~0 for true.
 *
 * PIPE_FUNC_NEVER and PIPE_FUNC_ALWAYS return constants without emitting
 * any comparison.  When HAVE_LLVM < 0x0207, 128-bit vectors on x86 are
 * compared with SSE/SSE2 intrinsics and other vectors fall back to
 * element-wise comparison; otherwise a single vector fcmp/icmp followed
 * by a sign extension is emitted.
 */
LLVMValueRef
lp_build_compare(struct gallivm_state *gallivm,
                 const struct lp_type type,
                 unsigned func,
                 LLVMValueRef a,
                 LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type);
   LLVMValueRef zeros = LLVMConstNull(int_vec_type);
   LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(func >= PIPE_FUNC_NEVER);
   assert(func <= PIPE_FUNC_ALWAYS);
   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* Constant results: no comparison needs to be emitted. */
   if(func == PIPE_FUNC_NEVER)
      return zeros;
   if(func == PIPE_FUNC_ALWAYS)
      return ones;

#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /*
    * There are no unsigned integer comparison instructions in SSE.
    *
    * This block only emits a diagnostic (when GALLIVM_DEBUG_PERF is set);
    * it does not change the generated code.
    */

   if (!type.floating && !type.sign &&
       type.width * type.length == 128 &&
       util_cpu_caps.has_sse2 &&
       (func == PIPE_FUNC_LESS ||
        func == PIPE_FUNC_LEQUAL ||
        func == PIPE_FUNC_GREATER ||
        func == PIPE_FUNC_GEQUAL) &&
       (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient <%u x i%u> unsigned comparison\n",
                      __FUNCTION__, type.length, type.width);
   }
#endif

#if HAVE_LLVM < 0x0207
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /*
    * Legacy path: compare 128-bit vectors directly with SSE/SSE2
    * intrinsics.
    */
   if(type.width * type.length == 128) {
      if(type.floating && util_cpu_caps.has_sse) {
         /* float[4] comparison */
         LLVMTypeRef vec_type = lp_build_vec_type(gallivm, type);
         LLVMValueRef args[3];
         unsigned cc;
         boolean swap;

         swap = FALSE;
         /*
          * Pick the predicate immediate passed to llvm.x86.sse.cmp.ps.
          * GREATER/GEQUAL reuse the LESS/LEQUAL immediates with the
          * operands swapped.
          */
         switch(func) {
         case PIPE_FUNC_EQUAL:
            cc = 0;
            break;
         case PIPE_FUNC_NOTEQUAL:
            cc = 4;
            break;
         case PIPE_FUNC_LESS:
            cc = 1;
            break;
         case PIPE_FUNC_LEQUAL:
            cc = 2;
            break;
         case PIPE_FUNC_GREATER:
            cc = 1;
            swap = TRUE;
            break;
         case PIPE_FUNC_GEQUAL:
            cc = 2;
            swap = TRUE;
            break;
         default:
            assert(0);
            /*
             * NOTE(review): this undef has the type of 'type' rather than
             * int_vec_type, unlike every other return of this function.
             */
            return lp_build_undef(gallivm, type);
         }

         if(swap) {
            args[0] = b;
            args[1] = a;
         }
         else {
            args[0] = a;
            args[1] = b;
         }

         args[2] = LLVMConstInt(LLVMInt8TypeInContext(gallivm->context), cc, 0);
         res = lp_build_intrinsic(builder,
                                  "llvm.x86.sse.cmp.ps",
                                  vec_type,
                                  args, 3);
         /* The intrinsic result is cast to the integer mask type. */
         res = LLVMBuildBitCast(builder, res, int_vec_type, "");
         return res;
      }
      else if(util_cpu_caps.has_sse2) {
         /* int[4] comparison */
         /*
          * Recipe for each comparison, indexed by 'func': whether to swap
          * the operands, whether to use the pcmpeq or the pcmpgt
          * intrinsic, and whether to invert the result.
          */
         static const struct {
            unsigned swap:1;
            unsigned eq:1;
            unsigned gt:1;
            unsigned not:1;
         } table[] = {
            {0, 0, 0, 1}, /* PIPE_FUNC_NEVER */
            {1, 0, 1, 0}, /* PIPE_FUNC_LESS */
            {0, 1, 0, 0}, /* PIPE_FUNC_EQUAL */
            {0, 0, 1, 1}, /* PIPE_FUNC_LEQUAL */
            {0, 0, 1, 0}, /* PIPE_FUNC_GREATER */
            {0, 1, 0, 1}, /* PIPE_FUNC_NOTEQUAL */
            {1, 0, 1, 1}, /* PIPE_FUNC_GEQUAL */
            {0, 0, 0, 0}  /* PIPE_FUNC_ALWAYS */
         };
         const char *pcmpeq;
         const char *pcmpgt;
         LLVMValueRef args[2];
         /* NOTE(review): shadows the function-level 'res'. */
         LLVMValueRef res;
         LLVMTypeRef vec_type = lp_build_vec_type(gallivm, type);

         /* Select the intrinsic names matching the element width. */
         switch (type.width) {
         case 8:
            pcmpeq = "llvm.x86.sse2.pcmpeq.b";
            pcmpgt = "llvm.x86.sse2.pcmpgt.b";
            break;
         case 16:
            pcmpeq = "llvm.x86.sse2.pcmpeq.w";
            pcmpgt = "llvm.x86.sse2.pcmpgt.w";
            break;
         case 32:
            pcmpeq = "llvm.x86.sse2.pcmpeq.d";
            pcmpgt = "llvm.x86.sse2.pcmpgt.d";
            break;
         default:
            assert(0);
            return lp_build_undef(gallivm, type);
         }

         /* There are no unsigned comparison instructions. So flip the sign bit
          * so that the results match.
          */
         if (table[func].gt && !type.sign) {
            LLVMValueRef msb = lp_build_const_int_vec(gallivm, type, (unsigned long long)1 << (type.width - 1));
            a = LLVMBuildXor(builder, a, msb, "");
            b = LLVMBuildXor(builder, b, msb, "");
         }

         if(table[func].swap) {
            args[0] = b;
            args[1] = a;
         }
         else {
            args[0] = a;
            args[1] = b;
         }

         if(table[func].eq)
            res = lp_build_intrinsic(builder, pcmpeq, vec_type, args, 2);
         else if (table[func].gt)
            res = lp_build_intrinsic(builder, pcmpgt, vec_type, args, 2);
         else
            res = LLVMConstNull(vec_type);

         if(table[func].not)
            res = LLVMBuildNot(builder, res, "");

         return res;
      }
   } /* if (type.width * type.length == 128) */
#endif
#endif /* HAVE_LLVM < 0x0207 */

   /* XXX: It is not clear if we should use the ordered or unordered operators */

   if(type.floating) {
      LLVMRealPredicate op;
      /*
       * Map the PIPE_FUNC to an LLVM float predicate; the unordered
       * variants (LLVMRealU*) are used.
       */
      switch(func) {
      case PIPE_FUNC_NEVER:
         op = LLVMRealPredicateFalse;
         break;
      case PIPE_FUNC_ALWAYS:
         op = LLVMRealPredicateTrue;
         break;
      case PIPE_FUNC_EQUAL:
         op = LLVMRealUEQ;
         break;
      case PIPE_FUNC_NOTEQUAL:
         op = LLVMRealUNE;
         break;
      case PIPE_FUNC_LESS:
         op = LLVMRealULT;
         break;
      case PIPE_FUNC_LEQUAL:
         op = LLVMRealULE;
         break;
      case PIPE_FUNC_GREATER:
         op = LLVMRealUGT;
         break;
      case PIPE_FUNC_GEQUAL:
         op = LLVMRealUGE;
         break;
      default:
         assert(0);
         return lp_build_undef(gallivm, type);
      }

#if HAVE_LLVM >= 0x0207
      /* Compare, then sign-extend the result to 0 / ~0 masks. */
      cond = LLVMBuildFCmp(builder, op, a, b, "");
      res = LLVMBuildSExt(builder, cond, int_vec_type, "");
#else
      if (type.length == 1) {
         cond = LLVMBuildFCmp(builder, op, a, b, "");
         res = LLVMBuildSExt(builder, cond, int_vec_type, "");
      }
      else {
         unsigned i;

         res = LLVMGetUndef(int_vec_type);

         /*
          * NOTE(review): this warning is printed unconditionally, whereas
          * the equivalent integer message below is only printed when
          * GALLIVM_DEBUG_PERF is set.
          */
         debug_printf("%s: warning: using slow element-wise float"
                      " vector comparison\n", __FUNCTION__);
         /*
          * Compare each element separately and select the matching
          * element of 'ones' or 'zeros' for the result.
          */
         for (i = 0; i < type.length; ++i) {
            LLVMValueRef index = lp_build_const_int32(gallivm, i);
            cond = LLVMBuildFCmp(builder, op,
                                 LLVMBuildExtractElement(builder, a, index, ""),
                                 LLVMBuildExtractElement(builder, b, index, ""),
                                 "");
            cond = LLVMBuildSelect(builder, cond,
                                   LLVMConstExtractElement(ones, index),
                                   LLVMConstExtractElement(zeros, index),
                                   "");
            res = LLVMBuildInsertElement(builder, res, cond, index, "");
         }
      }
#endif
   }
   else {
      LLVMIntPredicate op;
      /*
       * Map the PIPE_FUNC to an LLVM integer predicate; the ordering
       * predicates are signed or unsigned according to type.sign.
       */
      switch(func) {
      case PIPE_FUNC_EQUAL:
         op = LLVMIntEQ;
         break;
      case PIPE_FUNC_NOTEQUAL:
         op = LLVMIntNE;
         break;
      case PIPE_FUNC_LESS:
         op = type.sign ? LLVMIntSLT : LLVMIntULT;
         break;
      case PIPE_FUNC_LEQUAL:
         op = type.sign ? LLVMIntSLE : LLVMIntULE;
         break;
      case PIPE_FUNC_GREATER:
         op = type.sign ? LLVMIntSGT : LLVMIntUGT;
         break;
      case PIPE_FUNC_GEQUAL:
         op = type.sign ? LLVMIntSGE : LLVMIntUGE;
         break;
      default:
         assert(0);
         return lp_build_undef(gallivm, type);
      }

#if HAVE_LLVM >= 0x0207
      /* Compare, then sign-extend the result to 0 / ~0 masks. */
      cond = LLVMBuildICmp(builder, op, a, b, "");
      res = LLVMBuildSExt(builder, cond, int_vec_type, "");
#else
      if (type.length == 1) {
         cond = LLVMBuildICmp(builder, op, a, b, "");
         res = LLVMBuildSExt(builder, cond, int_vec_type, "");
      }
      else {
         unsigned i;

         res = LLVMGetUndef(int_vec_type);

         if (gallivm_debug & GALLIVM_DEBUG_PERF) {
            debug_printf("%s: using slow element-wise int"
                         " vector comparison\n", __FUNCTION__);
         }

         /* Element-wise compare + select, as in the float case above. */
         for(i = 0; i < type.length; ++i) {
            LLVMValueRef index = lp_build_const_int32(gallivm, i);
            cond = LLVMBuildICmp(builder, op,
                                 LLVMBuildExtractElement(builder, a, index, ""),
                                 LLVMBuildExtractElement(builder, b, index, ""),
                                 "");
            cond = LLVMBuildSelect(builder, cond,
                                   LLVMConstExtractElement(ones, index),
                                   LLVMConstExtractElement(zeros, index),
                                   "");
            res = LLVMBuildInsertElement(builder, res, cond, index, "");
         }
      }
#endif
   }

   return res;
}