/** * Gather one element from scatter positions in memory. * * @sa lp_build_gather() */ LLVMValueRef lp_build_gather_elem(struct gallivm_state *gallivm, unsigned length, unsigned src_width, unsigned dst_width, LLVMValueRef base_ptr, LLVMValueRef offsets, unsigned i) { LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width); LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0); LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width); LLVMValueRef ptr; LLVMValueRef res; assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0)); ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i); ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, ""); res = LLVMBuildLoad(gallivm->builder, ptr, ""); assert(src_width <= dst_width); if (src_width > dst_width) res = LLVMBuildTrunc(gallivm->builder, res, dst_elem_type, ""); if (src_width < dst_width) res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, ""); return res; }
/** * Gather one element from scatter positions in memory. * * @sa lp_build_gather() */ LLVMValueRef lp_build_gather_elem(struct gallivm_state *gallivm, unsigned length, unsigned src_width, unsigned dst_width, boolean aligned, LLVMValueRef base_ptr, LLVMValueRef offsets, unsigned i, boolean vector_justify) { LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width); LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0); LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width); LLVMValueRef ptr; LLVMValueRef res; assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0)); ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i); ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, ""); res = LLVMBuildLoad(gallivm->builder, ptr, ""); /* XXX * On some archs we probably really want to avoid having to deal * with alignments lower than 4 bytes (if fetch size is a power of * two >= 32). On x86 it doesn't matter, however. * We should be able to guarantee full alignment for any kind of texture * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends * but I don't think that's quite what we wanted). * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT * looks like a good fit, but it seems this cap bit (and OpenGL) aren't * enforcing what we want (which is what d3d10 does, the offset needs to * be aligned to element size, but GL has bytes regardless of element * size which would only leave us with minimum alignment restriction of 16 * which doesn't make much sense if the type isn't 4x32bit). Due to * translation of offsets to first_elem in sampler_views it actually seems * gallium could not do anything else except 16 no matter what... */ if (!aligned) { LLVMSetAlignment(res, 1); } assert(src_width <= dst_width); if (src_width > dst_width) { res = LLVMBuildTrunc(gallivm->builder, res, dst_elem_type, ""); } else if (src_width < dst_width) { res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, ""); if (vector_justify) { #ifdef PIPE_ARCH_BIG_ENDIAN res = LLVMBuildShl(gallivm->builder, res, LLVMConstInt(dst_elem_type, dst_width - src_width, 0), ""); #endif } } return res; }
/** * Fetch a pixel into a 4 float AoS. * * \param format_desc describes format of the image we're fetching from * \param ptr address of the pixel block (or the texel if uncompressed) * \param i, j the sub-block pixel coordinates. For non-compressed formats * these will always be (0, 0). * \return a 4 element vector with the pixel's RGBA values. */ LLVMValueRef lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, const struct util_format_description *format_desc, struct lp_type type, LLVMValueRef base_ptr, LLVMValueRef offset, LLVMValueRef i, LLVMValueRef j) { LLVMBuilderRef builder = gallivm->builder; unsigned num_pixels = type.length / 4; struct lp_build_context bld; assert(type.length <= LP_MAX_VECTOR_LENGTH); assert(type.length % 4 == 0); lp_build_context_init(&bld, gallivm, type); /* * Trivial case * * The format matches the type (apart of a swizzle) so no need for * scaling or converting. */ if (format_matches_type(format_desc, type) && format_desc->block.bits <= type.width * 4 && util_is_power_of_two(format_desc->block.bits)) { LLVMValueRef packed; /* * The format matches the type (apart of a swizzle) so no need for * scaling or converting. */ packed = lp_build_gather(gallivm, type.length/4, format_desc->block.bits, type.width*4, base_ptr, offset); assert(format_desc->block.bits <= type.width * type.length); packed = LLVMBuildBitCast(gallivm->builder, packed, lp_build_vec_type(gallivm, type), ""); return lp_build_format_swizzle_aos(format_desc, &bld, packed); } /* * Bit arithmetic */ if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB || format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) && format_desc->block.width == 1 && format_desc->block.height == 1 && util_is_power_of_two(format_desc->block.bits) && format_desc->block.bits <= 32 && format_desc->is_bitmask && !format_desc->is_mixed && (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED || format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED)) { LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4]; LLVMValueRef res; unsigned k; /* * Unpack a pixel at a time into a <4 x float> RGBA vector */ for (k = 0; k < num_pixels; ++k) { LLVMValueRef packed; packed = lp_build_gather_elem(gallivm, num_pixels, format_desc->block.bits, 32, base_ptr, offset, k); tmps[k] = lp_build_unpack_arith_rgba_aos(gallivm, format_desc, packed); } /* * Type conversion. * * TODO: We could avoid floating conversion for integer to * integer conversions. */ if (gallivm_debug & GALLIVM_DEBUG_PERF && !type.floating) { debug_printf("%s: unpacking %s with floating point\n", __FUNCTION__, format_desc->short_name); } lp_build_conv(gallivm, lp_float32_vec4_type(), type, tmps, num_pixels, &res, 1); return lp_build_format_swizzle_aos(format_desc, &bld, res); } /* * YUV / subsampled formats */ if (format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) { struct lp_type tmp_type; LLVMValueRef tmp; memset(&tmp_type, 0, sizeof tmp_type); tmp_type.width = 8; tmp_type.length = num_pixels * 4; tmp_type.norm = TRUE; tmp = lp_build_fetch_subsampled_rgba_aos(gallivm, format_desc, num_pixels, base_ptr, offset, i, j); lp_build_conv(gallivm, tmp_type, type, &tmp, 1, &tmp, 1); return tmp; } /* * Fallback to util_format_description::fetch_rgba_8unorm(). */ if (format_desc->fetch_rgba_8unorm && !type.floating && type.width == 8 && !type.sign && type.norm) { /* * Fallback to calling util_format_description::fetch_rgba_8unorm. * * This is definitely not the most efficient way of fetching pixels, as * we miss the opportunity to do vectorization, but this it is a * convenient for formats or scenarios for which there was no opportunity * or incentive to optimize. */ LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(gallivm->builder))); char name[256]; LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context); LLVMTypeRef pi8t = LLVMPointerType(i8t, 0); LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); LLVMValueRef function; LLVMValueRef tmp_ptr; LLVMValueRef tmp; LLVMValueRef res; LLVMValueRef callee; unsigned k; util_snprintf(name, sizeof name, "util_format_%s_fetch_rgba_8unorm", format_desc->short_name); if (gallivm_debug & GALLIVM_DEBUG_PERF) { debug_printf("%s: falling back to %s\n", __FUNCTION__, name); } /* * Declare and bind format_desc->fetch_rgba_8unorm(). */ function = LLVMGetNamedFunction(module, name); if (!function) { /* * Function to call looks like: * fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j) */ LLVMTypeRef ret_type; LLVMTypeRef arg_types[4]; LLVMTypeRef function_type; ret_type = LLVMVoidTypeInContext(gallivm->context); arg_types[0] = pi8t; arg_types[1] = pi8t; arg_types[2] = i32t; arg_types[3] = i32t; function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0); function = LLVMAddFunction(module, name, function_type); LLVMSetFunctionCallConv(function, LLVMCCallConv); LLVMSetLinkage(function, LLVMExternalLinkage); assert(LLVMIsDeclaration(function)); } /* make const pointer for the C fetch_rgba_float function */ callee = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer) format_desc->fetch_rgba_8unorm)); /* cast the callee pointer to the function's type */ function = LLVMBuildBitCast(builder, callee, LLVMTypeOf(function), "cast callee"); tmp_ptr = lp_build_alloca(gallivm, i32t, ""); res = LLVMGetUndef(LLVMVectorType(i32t, num_pixels)); /* * Invoke format_desc->fetch_rgba_8unorm() for each pixel and insert the result * in the SoA vectors. */ for (k = 0; k < num_pixels; ++k) { LLVMValueRef index = lp_build_const_int32(gallivm, k); LLVMValueRef args[4]; args[0] = LLVMBuildBitCast(builder, tmp_ptr, pi8t, ""); args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels, base_ptr, offset, k); if (num_pixels == 1) { args[2] = i; args[3] = j; } else { args[2] = LLVMBuildExtractElement(builder, i, index, ""); args[3] = LLVMBuildExtractElement(builder, j, index, ""); } LLVMBuildCall(builder, function, args, Elements(args), ""); tmp = LLVMBuildLoad(builder, tmp_ptr, ""); if (num_pixels == 1) { res = tmp; } else { res = LLVMBuildInsertElement(builder, res, tmp, index, ""); } } /* Bitcast from <n x i32> to <4n x i8> */ res = LLVMBuildBitCast(builder, res, bld.vec_type, ""); return res; } /* * Fallback to util_format_description::fetch_rgba_float(). */ if (format_desc->fetch_rgba_float) { /* * Fallback to calling util_format_description::fetch_rgba_float. * * This is definitely not the most efficient way of fetching pixels, as * we miss the opportunity to do vectorization, but this it is a * convenient for formats or scenarios for which there was no opportunity * or incentive to optimize. */ LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder))); char name[256]; LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context); LLVMTypeRef f32x4t = LLVMVectorType(f32t, 4); LLVMTypeRef pf32t = LLVMPointerType(f32t, 0); LLVMTypeRef pi8t = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0); LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); LLVMValueRef function; LLVMValueRef tmp_ptr; LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4]; LLVMValueRef res; LLVMValueRef callee; unsigned k; util_snprintf(name, sizeof name, "util_format_%s_fetch_rgba_float", format_desc->short_name); if (gallivm_debug & GALLIVM_DEBUG_PERF) { debug_printf("%s: falling back to %s\n", __FUNCTION__, name); } /* * Declare and bind format_desc->fetch_rgba_float(). */ function = LLVMGetNamedFunction(module, name); if (!function) { /* * Function to call looks like: * fetch(float *dst, const uint8_t *src, unsigned i, unsigned j) */ LLVMTypeRef ret_type; LLVMTypeRef arg_types[4]; LLVMTypeRef function_type; ret_type = LLVMVoidTypeInContext(gallivm->context); arg_types[0] = pf32t; arg_types[1] = pi8t; arg_types[2] = i32t; arg_types[3] = i32t; function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0); function = LLVMAddFunction(module, name, function_type); LLVMSetFunctionCallConv(function, LLVMCCallConv); LLVMSetLinkage(function, LLVMExternalLinkage); assert(LLVMIsDeclaration(function)); } /* Note: we're using this casting here instead of LLVMAddGlobalMapping() * to work around a bug in LLVM 2.6. */ /* make const pointer for the C fetch_rgba_float function */ callee = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer) format_desc->fetch_rgba_float)); /* cast the callee pointer to the function's type */ function = LLVMBuildBitCast(builder, callee, LLVMTypeOf(function), "cast callee"); tmp_ptr = lp_build_alloca(gallivm, f32x4t, ""); /* * Invoke format_desc->fetch_rgba_float() for each pixel and insert the result * in the SoA vectors. */ for (k = 0; k < num_pixels; ++k) { LLVMValueRef args[4]; args[0] = LLVMBuildBitCast(builder, tmp_ptr, pf32t, ""); args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels, base_ptr, offset, k); if (num_pixels == 1) { args[2] = i; args[3] = j; } else { LLVMValueRef index = lp_build_const_int32(gallivm, k); args[2] = LLVMBuildExtractElement(builder, i, index, ""); args[3] = LLVMBuildExtractElement(builder, j, index, ""); } LLVMBuildCall(builder, function, args, Elements(args), ""); tmps[k] = LLVMBuildLoad(builder, tmp_ptr, ""); } lp_build_conv(gallivm, lp_float32_vec4_type(), type, tmps, num_pixels, &res, 1); return res; } assert(0); return lp_build_undef(gallivm, type); }
/** * Gather one element from scatter positions in memory. * Nearly the same as above, however the individual elements * may be vectors themselves, and fetches may be float type. * Can also do pad vector instead of ZExt. * * @sa lp_build_gather() */ static LLVMValueRef lp_build_gather_elem_vec(struct gallivm_state *gallivm, unsigned length, unsigned src_width, LLVMTypeRef src_type, struct lp_type dst_type, boolean aligned, LLVMValueRef base_ptr, LLVMValueRef offsets, unsigned i, boolean vector_justify) { LLVMValueRef ptr, res; LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0); assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0)); ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i); ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, ""); res = LLVMBuildLoad(gallivm->builder, ptr, ""); /* XXX * On some archs we probably really want to avoid having to deal * with alignments lower than 4 bytes (if fetch size is a power of * two >= 32). On x86 it doesn't matter, however. * We should be able to guarantee full alignment for any kind of texture * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends * but I don't think that's quite what we wanted). * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT * looks like a good fit, but it seems this cap bit (and OpenGL) aren't * enforcing what we want (which is what d3d10 does, the offset needs to * be aligned to element size, but GL has bytes regardless of element * size which would only leave us with minimum alignment restriction of 16 * which doesn't make much sense if the type isn't 4x32bit). Due to * translation of offsets to first_elem in sampler_views it actually seems * gallium could not do anything else except 16 no matter what... */ if (!aligned) { LLVMSetAlignment(res, 1); } else if (!util_is_power_of_two(src_width)) { /* * Full alignment is impossible, assume the caller really meant * the individual elements were aligned (e.g. 3x32bit format). * And yes the generated code may otherwise crash, llvm will * really assume 128bit alignment with a 96bit fetch (I suppose * that makes sense as it can just assume the upper 32bit to be * whatever). * Maybe the caller should be able to explicitly set this, but * this should cover all the 3-channel formats. */ if (((src_width / 24) * 24 == src_width) && util_is_power_of_two(src_width / 24)) { LLVMSetAlignment(res, src_width / 24); } else { LLVMSetAlignment(res, 1); } } assert(src_width <= dst_type.width * dst_type.length); if (src_width < dst_type.width * dst_type.length) { if (dst_type.length > 1) { res = lp_build_pad_vector(gallivm, res, dst_type.length); /* * vector_justify hopefully a non-issue since we only deal * with src_width >= 32 here? */ } else { LLVMTypeRef dst_elem_type = lp_build_vec_type(gallivm, dst_type); /* * Only valid if src_ptr_type is int type... */ res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, ""); #ifdef PIPE_ARCH_BIG_ENDIAN if (vector_justify) { res = LLVMBuildShl(gallivm->builder, res, LLVMConstInt(dst_elem_type, dst_type.width - src_width, 0), ""); } if (src_width == 48) { /* Load 3x16 bit vector. * The sequence of loads on big-endian hardware proceeds as follows. * 16-bit fields are denoted by X, Y, Z, and 0. In memory, the sequence * of three fields appears in the order X, Y, Z. * * Load 32-bit word: 0.0.X.Y * Load 16-bit halfword: 0.0.0.Z * Rotate left: 0.X.Y.0 * Bitwise OR: 0.X.Y.Z * * The order in which we need the fields in the result is 0.Z.Y.X, * the same as on little-endian; permute 16-bit fields accordingly * within 64-bit register: */ LLVMValueRef shuffles[4] = { lp_build_const_int32(gallivm, 2), lp_build_const_int32(gallivm, 1), lp_build_const_int32(gallivm, 0), lp_build_const_int32(gallivm, 3), }; res = LLVMBuildBitCast(gallivm->builder, res, lp_build_vec_type(gallivm, lp_type_uint_vec(16, 4*16)), ""); res = LLVMBuildShuffleVector(gallivm->builder, res, res, LLVMConstVector(shuffles, 4), ""); res = LLVMBuildBitCast(gallivm->builder, res, dst_elem_type, ""); } #endif } } return res; }