/**
 * Derive from the quad's upper-left scalar (x, y) coordinates the
 * per-pixel coordinates of all four quad pixels, as float vectors.
 */
static void
generate_pos0(LLVMBuilderRef builder,
              LLVMValueRef x,
              LLVMValueRef y,
              LLVMValueRef *x0,
              LLVMValueRef *y0)
{
   LLVMTypeRef i32_type = LLVMInt32Type();
   LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, QUAD_SIZE);
   LLVMTypeRef f32_type = LLVMFloatType();
   LLVMTypeRef f32_vec_type = LLVMVectorType(f32_type, QUAD_SIZE);
   LLVMValueRef off_x[QUAD_SIZE];
   LLVMValueRef off_y[QUAD_SIZE];
   unsigned q;

   /* Splat the scalar upper-left coordinates across all four lanes. */
   x = lp_build_broadcast(builder, i32_vec_type, x);
   y = lp_build_broadcast(builder, i32_vec_type, y);

   /* Constant per-pixel offsets within the 2x2 quad. */
   for (q = 0; q < QUAD_SIZE; ++q) {
      off_x[q] = LLVMConstInt(i32_type, quad_offset_x[q], 0);
      off_y[q] = LLVMConstInt(i32_type, quad_offset_y[q], 0);
   }

   x = LLVMBuildAdd(builder, x, LLVMConstVector(off_x, QUAD_SIZE), "");
   y = LLVMBuildAdd(builder, y, LLVMConstVector(off_y, QUAD_SIZE), "");

   /* Convert the integer window coordinates to floats. */
   *x0 = LLVMBuildSIToFP(builder, x, f32_vec_type, "");
   *y0 = LLVMBuildSIToFP(builder, y, f32_vec_type, "");
}
/**
 * Extract one element of a vector and broadcast it into a result vector
 * (a mere shuffle in most cases).  The result vector may have a
 * different length than the source.
 */
LLVMValueRef
lp_build_extract_broadcast(struct gallivm_state *gallivm,
                           struct lp_type src_type,
                           struct lp_type dst_type,
                           LLVMValueRef vector,
                           LLVMValueRef index)
{
   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);

   assert(src_type.floating == dst_type.floating);
   assert(src_type.width == dst_type.width);

   assert(lp_check_value(src_type, vector));
   assert(LLVMTypeOf(index) == i32t);

   if (src_type.length == 1 && dst_type.length == 1) {
      /* Trivial scalar -> scalar. */
      return vector;
   }

   if (src_type.length == 1) {
      /* Broadcast scalar -> vector. */
      return lp_build_broadcast(gallivm,
                                lp_build_vec_type(gallivm, dst_type),
                                vector);
   }

   if (dst_type.length == 1) {
      /* Trivial extract scalar from vector. */
      return LLVMBuildExtractElement(gallivm->builder, vector, index, "");
   }

   /* General case: shuffle with a broadcast index mask; the result can
    * be of a different length than the source.
    */
   {
      LLVMValueRef shuffle = lp_build_broadcast(gallivm,
                                                LLVMVectorType(i32t, dst_type.length),
                                                index);
      return LLVMBuildShuffleVector(gallivm->builder, vector,
                                    LLVMGetUndef(lp_build_vec_type(gallivm, src_type)),
                                    shuffle, "");
   }
}
/**
 * Expand src vector from src_type.length elements to dst_length
 * elements, filling the added trailing lanes with undef.
 *
 * Bug fix: the length precondition previously asserted
 * dst_length > src_type.length, which contradicted the explicit
 * equal-length early return below (the legal dst_length ==
 * src_type.length case would trip the assert in debug builds).
 * The assert now allows equality.
 */
LLVMValueRef
lp_build_pad_vector(struct gallivm_state *gallivm,
                    LLVMValueRef src,
                    struct lp_type src_type,
                    unsigned dst_length)
{
   LLVMValueRef undef = LLVMGetUndef(lp_build_vec_type(gallivm, src_type));
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   assert(dst_length <= Elements(elems));
   /* dst_length == src_type.length is legal and handled just below */
   assert(dst_length >= src_type.length);

   if (src_type.length == dst_length)
      return src;

   /* If its a single scalar type, no need to reinvent the wheel */
   if (src_type.length == 1) {
      return lp_build_broadcast(gallivm,
                                LLVMVectorType(lp_build_elem_type(gallivm, src_type),
                                               dst_length),
                                src);
   }

   /* All elements from src vector */
   for (i = 0; i < src_type.length; ++i)
      elems[i] = lp_build_const_int32(gallivm, i);

   /* Undef fill remaining space (indices pointing into the undef vector) */
   for (i = src_type.length; i < dst_length; ++i)
      elems[i] = lp_build_const_int32(gallivm, src_type.length);

   /* Combine the two vectors */
   return LLVMBuildShuffleVector(gallivm->builder, src, undef,
                                 LLVMConstVector(elems, dst_length), "");
}
/**
 * Broadcast a scalar value into this build context's vector type.
 */
LLVMValueRef
lp_build_broadcast_scalar(struct lp_build_context *bld,
                          LLVMValueRef scalar)
{
   /* the scalar must match the element type of this build context */
   assert(lp_check_elem_type(bld->type, LLVMTypeOf(scalar)));

   return lp_build_broadcast(bld->gallivm, bld->vec_type, scalar);
}
/**
 * Build the scissor-test mask for the four pixels of a quad: each lane
 * is ~0 if the pixel is inside the scissor rectangle, 0 otherwise.
 */
static LLVMValueRef
generate_scissor_test(LLVMBuilderRef builder,
                      LLVMValueRef context_ptr,
                      const struct lp_build_interp_soa_context *interp,
                      struct lp_type type)
{
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   LLVMValueRef xpos = interp->pos[0];
   LLVMValueRef ypos = interp->pos[1];
   LLVMValueRef bound[4];   /* xmin, ymin, xmax, ymax */
   LLVMValueRef cmp[4];
   LLVMValueRef result;

   /* xpos, ypos contain the window coords for the four pixels in the quad */
   assert(xpos);
   assert(ypos);

   /* fetch the current scissor bounds and splat each into a vector */
   bound[0] = lp_jit_context_scissor_xmin_value(builder, context_ptr);
   bound[0] = lp_build_broadcast(builder, vec_type, bound[0]);
   bound[1] = lp_jit_context_scissor_ymin_value(builder, context_ptr);
   bound[1] = lp_build_broadcast(builder, vec_type, bound[1]);
   bound[2] = lp_jit_context_scissor_xmax_value(builder, context_ptr);
   bound[2] = lp_build_broadcast(builder, vec_type, bound[2]);
   bound[3] = lp_jit_context_scissor_ymax_value(builder, context_ptr);
   bound[3] = lp_build_broadcast(builder, vec_type, bound[3]);

   /* compare the fragment's position coordinates against the scissor bounds */
   cmp[0] = lp_build_compare(builder, type, PIPE_FUNC_GEQUAL, xpos, bound[0]);
   cmp[1] = lp_build_compare(builder, type, PIPE_FUNC_GEQUAL, ypos, bound[1]);
   cmp[2] = lp_build_compare(builder, type, PIPE_FUNC_LESS, xpos, bound[2]);
   cmp[3] = lp_build_compare(builder, type, PIPE_FUNC_LESS, ypos, bound[3]);

   /* AND all the partial masks together */
   result = LLVMBuildAnd(builder, cmp[0], cmp[1], "");
   result = LLVMBuildAnd(builder, result, cmp[2], "");
   result = LLVMBuildAnd(builder, result, cmp[3], "");

   lp_build_name(result, "scissormask");

   return result;
}
/**
 * Generate the fragment shader, depth/stencil test, and alpha tests.
 *
 * Emits, in order: optional triangle in/out edge test, optional scissor
 * test, optional early depth test, the TGSI shader body, alpha test on
 * the cbuf-0 alpha output, and a late depth test when the early one was
 * not applicable.  Color outputs are written to *color and the final
 * live-pixel mask to *pmask.
 *
 * \param i            which quad in the tile, in range [0,3]
 * \param do_tri_test  if 1, do triangle edge in/out testing
 */
static void
generate_fs(struct llvmpipe_context *lp,
            struct lp_fragment_shader *shader,
            const struct lp_fragment_shader_variant_key *key,
            LLVMBuilderRef builder,
            struct lp_type type,
            LLVMValueRef context_ptr,
            unsigned i,
            const struct lp_build_interp_soa_context *interp,
            struct lp_build_sampler_soa *sampler,
            LLVMValueRef *pmask,
            LLVMValueRef (*color)[4],
            LLVMValueRef depth_ptr,
            unsigned do_tri_test,
            LLVMValueRef c0,
            LLVMValueRef c1,
            LLVMValueRef c2,
            LLVMValueRef step0_ptr,
            LLVMValueRef step1_ptr,
            LLVMValueRef step2_ptr)
{
   const struct tgsi_token *tokens = shader->base.tokens;
   LLVMTypeRef elem_type;
   LLVMTypeRef vec_type;
   LLVMTypeRef int_vec_type;
   LLVMValueRef consts_ptr;
   LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS];
   LLVMValueRef z = interp->pos[2];
   struct lp_build_flow_context *flow;
   struct lp_build_mask_context mask;
   boolean early_depth_test;
   unsigned attrib;
   unsigned chan;
   unsigned cbuf;

   assert(i < 4);

   elem_type = lp_build_elem_type(type);
   vec_type = lp_build_vec_type(type);
   int_vec_type = lp_build_int_vec_type(type);

   consts_ptr = lp_jit_context_constants(builder, context_ptr);

   flow = lp_build_flow_create(builder);

   memset(outputs, 0, sizeof outputs);

   lp_build_flow_scope_begin(flow);

   /* Declare the color and z variables */
   for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
      for(chan = 0; chan < NUM_CHANNELS; ++chan) {
         color[cbuf][chan] = LLVMGetUndef(vec_type);
         lp_build_flow_scope_declare(flow, &color[cbuf][chan]);
      }
   }
   lp_build_flow_scope_declare(flow, &z);

   /* do triangle edge testing */
   if (do_tri_test) {
      generate_tri_edge_mask(builder, i, pmask,
                             c0, c1, c2, step0_ptr, step1_ptr, step2_ptr);
   }
   else {
      /* no edge test: all four pixels start out alive */
      *pmask = build_int32_vec_const(~0);
   }

   /* 'mask' will control execution based on quad's pixel alive/killed state */
   lp_build_mask_begin(&mask, flow, type, *pmask);

   if (key->scissor) {
      /* AND the scissor in/out result into the live mask */
      LLVMValueRef smask =
         generate_scissor_test(builder, context_ptr, interp, type);
      lp_build_mask_update(&mask, smask);
   }

   /* Early depth is only valid when nothing later can change Z or kill
    * pixels (no alpha test, no KIL, shader doesn't write Z). */
   early_depth_test =
      key->depth.enabled &&
      !key->alpha.enabled &&
      !shader->info.uses_kill &&
      !shader->info.writes_z;

   if(early_depth_test)
      generate_depth(builder, key, type,
                     &mask, z, depth_ptr);

   /* emit the TGSI shader body */
   lp_build_tgsi_soa(builder, tokens, type, &mask,
                     consts_ptr, interp->pos, interp->inputs,
                     outputs, sampler);

   /* Route shader outputs to color buffers / depth */
   for (attrib = 0; attrib < shader->info.num_outputs; ++attrib) {
      for(chan = 0; chan < NUM_CHANNELS; ++chan) {
         if(outputs[attrib][chan]) {
            LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], "");
            lp_build_name(out, "output%u.%u.%c", i, attrib, "xyzw"[chan]);

            switch (shader->info.output_semantic_name[attrib]) {
            case TGSI_SEMANTIC_COLOR:
               {
                  unsigned cbuf = shader->info.output_semantic_index[attrib];

                  lp_build_name(out, "color%u.%u.%c", i, attrib, "rgba"[chan]);

                  /* Alpha test */
                  /* XXX: should the alpha reference value be passed separately? */
                  /* XXX: should only test the final assignment to alpha */
                  if(cbuf == 0 && chan == 3) {
                     LLVMValueRef alpha = out;
                     LLVMValueRef alpha_ref_value;
                     alpha_ref_value = lp_jit_context_alpha_ref_value(builder, context_ptr);
                     alpha_ref_value = lp_build_broadcast(builder, vec_type, alpha_ref_value);
                     lp_build_alpha_test(builder, &key->alpha, type,
                                         &mask, alpha, alpha_ref_value);
                  }

                  color[cbuf][chan] = out;
                  break;
               }

            case TGSI_SEMANTIC_POSITION:
               /* shader-written Z replaces the interpolated value */
               if(chan == 2)
                  z = out;
               break;
            }
         }
      }
   }

   if(!early_depth_test)
      generate_depth(builder, key, type,
                     &mask, z, depth_ptr);

   lp_build_mask_end(&mask);

   lp_build_flow_scope_end(flow);

   lp_build_flow_destroy(flow);

   /* return the final live-pixel mask to the caller */
   *pmask = mask.value;
}
/**
 * Generate the code to do inside/outside triangle testing for the
 * four pixels in a 2x2 quad.  This will set the four elements of the
 * quad mask vector to 0 or ~0.
 * \param i which quad of the quad group to test, in [0,3]
 */
static void
generate_tri_edge_mask(LLVMBuilderRef builder,
                       unsigned i,
                       LLVMValueRef *mask,      /* ivec4, out */
                       LLVMValueRef c0,         /* int32 */
                       LLVMValueRef c1,         /* int32 */
                       LLVMValueRef c2,         /* int32 */
                       LLVMValueRef step0_ptr,  /* ivec4 */
                       LLVMValueRef step1_ptr,  /* ivec4 */
                       LLVMValueRef step2_ptr)  /* ivec4 */
{
#define OPTIMIZE_IN_OUT_TEST 0
#if OPTIMIZE_IN_OUT_TEST
   struct lp_build_if_state ifctx;
   LLVMValueRef not_draw_all;
#endif
   struct lp_build_flow_context *flow;
   struct lp_type i32_type;
   LLVMTypeRef i32vec4_type, mask_type;
   LLVMValueRef c0_vec, c1_vec, c2_vec;
   LLVMValueRef in_out_mask;

   assert(i < 4);

   /* int32 vector type */
   memset(&i32_type, 0, sizeof i32_type);
   i32_type.floating = FALSE; /* values are integers */
   i32_type.sign = TRUE;      /* values are signed */
   i32_type.norm = FALSE;     /* values are not normalized */
   i32_type.width = 32;       /* 32-bit int values */
   i32_type.length = 4;       /* 4 elements per vector */

   i32vec4_type = lp_build_int32_vec4_type();

   /* NOTE(review): mask_type appears unused in this function — confirm
    * whether it can be removed. */
   mask_type = LLVMIntType(32 * 4);

   /*
    * Use a conditional here to do detailed pixel in/out testing.
    * We only have to do this if c0 != INT_MIN.
    */
   flow = lp_build_flow_create(builder);
   lp_build_flow_scope_begin(flow);

   {
#if OPTIMIZE_IN_OUT_TEST
      /* not_draw_all = (c0 != INT_MIN) */
      not_draw_all = LLVMBuildICmp(builder,
                                   LLVMIntNE,
                                   c0,
                                   LLVMConstInt(LLVMInt32Type(), INT_MIN, 0),
                                   "");

      /* default: all pixels in (only overridden inside the 'if') */
      in_out_mask = lp_build_int_const_scalar(i32_type, ~0);

      lp_build_flow_scope_declare(flow, &in_out_mask);

      /* if (not_draw_all) {... */
      lp_build_if(&ifctx, flow, builder, not_draw_all);
#endif
      {
         LLVMValueRef step0_vec, step1_vec, step2_vec;
         LLVMValueRef m0_vec, m1_vec, m2_vec;
         LLVMValueRef index, m;

         /* c0_vec = {c0, c0, c0, c0}
          * Note that we emit this code four times but LLVM optimizes away
          * three instances of it.
          */
         c0_vec = lp_build_broadcast(builder, i32vec4_type, c0);
         c1_vec = lp_build_broadcast(builder, i32vec4_type, c1);
         c2_vec = lp_build_broadcast(builder, i32vec4_type, c2);
         lp_build_name(c0_vec, "edgeconst0vec");
         lp_build_name(c1_vec, "edgeconst1vec");
         lp_build_name(c2_vec, "edgeconst2vec");

         /* load step0vec, step1, step2 vec from memory */
         index = LLVMConstInt(LLVMInt32Type(), i, 0);
         step0_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step0_ptr, &index, 1, ""), "");
         step1_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step1_ptr, &index, 1, ""), "");
         step2_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step2_ptr, &index, 1, ""), "");
         lp_build_name(step0_vec, "step0vec");
         lp_build_name(step1_vec, "step1vec");
         lp_build_name(step2_vec, "step2vec");

         /* m0_vec = step0_ptr[i] > c0_vec */
         m0_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step0_vec, c0_vec);
         m1_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step1_vec, c1_vec);
         m2_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step2_vec, c2_vec);

         /* in_out_mask = m0_vec & m1_vec & m2_vec */
         m = LLVMBuildAnd(builder, m0_vec, m1_vec, "");
         in_out_mask = LLVMBuildAnd(builder, m, m2_vec, "");
         lp_build_name(in_out_mask, "inoutmaskvec");
      }
#if OPTIMIZE_IN_OUT_TEST
      lp_build_endif(&ifctx);
#endif

   }
   lp_build_flow_scope_end(flow);
   lp_build_flow_destroy(flow);

   /* This is the initial alive/dead pixel mask for a quad of four pixels.
    * It's an int[4] vector with each word set to 0 or ~0.
    * Words will get cleared when pixels fail the Z test, etc.
    */
   *mask = in_out_mask;
}
/**
 * Gather 'length' elements of 'src_width' bits each from base_ptr +
 * offsets[i] using the x86 AVX2 gather intrinsics, bitcasting the
 * result to the vector type implied by dst_type (with its length
 * scaled by 'length').
 *
 * \param length     number of elements to gather (2/4/8, see asserts)
 * \param src_width  element width in bits (32 or 64)
 * \param base_ptr   i8* base address
 * \param offsets    i32 vector of byte offsets from base_ptr
 */
static LLVMValueRef
lp_build_gather_avx2(struct gallivm_state *gallivm,
                     unsigned length,
                     unsigned src_width,
                     struct lp_type dst_type,
                     LLVMValueRef base_ptr,
                     LLVMValueRef offsets)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef src_type, src_vec_type;
   LLVMValueRef res;
   /* result type covers all gathered elements as one wide vector */
   struct lp_type res_type = dst_type;
   res_type.length *= length;

   /* element type matching the source data being gathered */
   if (dst_type.floating) {
      src_type = src_width == 64 ? LLVMDoubleTypeInContext(gallivm->context) :
                                   LLVMFloatTypeInContext(gallivm->context);
   } else {
      src_type = LLVMIntTypeInContext(gallivm->context, src_width);
   }
   src_vec_type = LLVMVectorType(src_type, length);

   /* XXX should allow hw scaling (can handle i8, i16, i32, i64 for x86) */
   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));

   if (0) {
      /*
       * Dead generic path kept for reference: llvm.masked.gather.
       *
       * XXX: This will cause LLVM pre 3.7 to hang; it works on LLVM 3.8 but
       * will not use the AVX2 gather instrinsics (even with llvm 4.0), at
       * least with Haswell. See
       * http://lists.llvm.org/pipermail/llvm-dev/2016-January/094448.html
       * And the generated code doing the emulation is quite a bit worse
       * than what we get by doing it ourselves too.
       */
      LLVMTypeRef i32_type = LLVMIntTypeInContext(gallivm->context, 32);
      LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length);
      LLVMTypeRef i1_type = LLVMIntTypeInContext(gallivm->context, 1);
      LLVMTypeRef i1_vec_type = LLVMVectorType(i1_type, length);
      LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
      LLVMValueRef src_ptr;

      base_ptr = LLVMBuildBitCast(builder, base_ptr, src_ptr_type, "");

      /* Rescale offsets from bytes to elements */
      LLVMValueRef scale = LLVMConstInt(i32_type, src_width/8, 0);
      scale = lp_build_broadcast(gallivm, i32_vec_type, scale);
      assert(LLVMTypeOf(offsets) == i32_vec_type);
      offsets = LLVMBuildSDiv(builder, offsets, scale, "");

      /* vector-of-pointers GEP for the masked gather */
      src_ptr = LLVMBuildGEP(builder, base_ptr, &offsets, 1, "vector-gep");

      char intrinsic[64];
      util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%u%s%u",
                    length, dst_type.floating ? "f" : "i", src_width);
      LLVMValueRef alignment = LLVMConstInt(i32_type, src_width/8, 0);
      LLVMValueRef mask = LLVMConstAllOnes(i1_vec_type);
      LLVMValueRef passthru = LLVMGetUndef(src_vec_type);

      LLVMValueRef args[] = { src_ptr, alignment, mask, passthru };

      res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 4, 0);
   } else {
      /* Live path: call the x86 AVX2 gather intrinsics directly. */
      LLVMTypeRef i8_type = LLVMIntTypeInContext(gallivm->context, 8);
      const char *intrinsic = NULL;
      unsigned l_idx = 0;

      assert(src_width == 32 || src_width == 64);
      if (src_width == 32) {
         assert(length == 4 || length == 8);
      } else {
         assert(length == 2 || length == 4);
      }

      /* indexed by [floating][64-bit elements][256-bit variant] */
      static const char *intrinsics[2][2][2] = {

         {{"llvm.x86.avx2.gather.d.d", "llvm.x86.avx2.gather.d.d.256"},
          {"llvm.x86.avx2.gather.d.q", "llvm.x86.avx2.gather.d.q.256"}},

         {{"llvm.x86.avx2.gather.d.ps", "llvm.x86.avx2.gather.d.ps.256"},
          {"llvm.x86.avx2.gather.d.pd", "llvm.x86.avx2.gather.d.pd.256"}},
      };

      /* these combinations need the 256-bit ymm variant */
      if ((src_width == 32 && length == 8) ||
          (src_width == 64 && length == 4)) {
         l_idx = 1;
      }
      intrinsic = intrinsics[dst_type.floating][src_width == 64][l_idx];

      LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
      /* all-lanes-enabled mask (sign bit of each element set) */
      LLVMValueRef mask = LLVMConstAllOnes(src_vec_type);
      mask = LLVMConstBitCast(mask, src_vec_type);
      /* scale = 1 since offsets are already in bytes */
      LLVMValueRef scale = LLVMConstInt(i8_type, 1, 0);

      LLVMValueRef args[] = { passthru, base_ptr, offsets, mask, scale };

      res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 5, 0);
   }

   /* reinterpret the gathered data as the caller's result type */
   res = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, res_type), "");

   return res;
}
/**
 * Generate the fragment shader, depth/stencil test, and alpha tests.
 *
 * Emits, in order: optional early depth test, the TGSI shader body,
 * alpha test on the cbuf-0 alpha output, and a late depth test when the
 * early one was not applicable.  cbuf-0 color outputs are written to
 * color[] and the final live-pixel mask to *pmask.
 *
 * \param i  which quad in the tile (used only for value naming)
 */
static void
generate_fs(struct llvmpipe_context *lp,
            struct lp_fragment_shader *shader,
            const struct lp_fragment_shader_variant_key *key,
            LLVMBuilderRef builder,
            struct lp_type type,
            LLVMValueRef context_ptr,
            unsigned i,
            const struct lp_build_interp_soa_context *interp,
            struct lp_build_sampler_soa *sampler,
            LLVMValueRef *pmask,
            LLVMValueRef *color,
            LLVMValueRef depth_ptr)
{
   const struct tgsi_token *tokens = shader->base.tokens;
   LLVMTypeRef elem_type;
   LLVMTypeRef vec_type;
   LLVMTypeRef int_vec_type;
   LLVMValueRef consts_ptr;
   LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS];
   LLVMValueRef z = interp->pos[2];
   struct lp_build_flow_context *flow;
   struct lp_build_mask_context mask;
   boolean early_depth_test;
   unsigned attrib;
   unsigned chan;

   elem_type = lp_build_elem_type(type);
   vec_type = lp_build_vec_type(type);
   int_vec_type = lp_build_int_vec_type(type);

   consts_ptr = lp_jit_context_constants(builder, context_ptr);

   flow = lp_build_flow_create(builder);

   memset(outputs, 0, sizeof outputs);

   lp_build_flow_scope_begin(flow);

   /* Declare the color and z variables */
   for(chan = 0; chan < NUM_CHANNELS; ++chan) {
      color[chan] = LLVMGetUndef(vec_type);
      lp_build_flow_scope_declare(flow, &color[chan]);
   }
   lp_build_flow_scope_declare(flow, &z);

   /* 'mask' controls execution based on the quad's pixel alive state */
   lp_build_mask_begin(&mask, flow, type, *pmask);

   /* Early depth is only valid when nothing later can change Z or kill
    * pixels (no alpha test, no KIL, shader doesn't write Z). */
   early_depth_test =
      key->depth.enabled &&
      !key->alpha.enabled &&
      !shader->info.uses_kill &&
      !shader->info.writes_z;

   if(early_depth_test)
      generate_depth(builder, key, type,
                     &mask, z, depth_ptr);

   /* emit the TGSI shader body */
   lp_build_tgsi_soa(builder, tokens, type, &mask,
                     consts_ptr, interp->pos, interp->inputs,
                     outputs, sampler);

   /* Route shader outputs to color / depth */
   for (attrib = 0; attrib < shader->info.num_outputs; ++attrib) {
      for(chan = 0; chan < NUM_CHANNELS; ++chan) {
         if(outputs[attrib][chan]) {
            lp_build_name(outputs[attrib][chan], "output%u.%u.%c", i, attrib, "xyzw"[chan]);

            switch (shader->info.output_semantic_name[attrib]) {
            case TGSI_SEMANTIC_COLOR:
               {
                  unsigned cbuf = shader->info.output_semantic_index[attrib];

                  lp_build_name(outputs[attrib][chan], "color%u.%u.%c", i, attrib, "rgba"[chan]);

                  /* Alpha test */
                  /* XXX: should the alpha reference value be passed separately? */
                  if(cbuf == 0 && chan == 3) {
                     LLVMValueRef alpha = outputs[attrib][chan];
                     LLVMValueRef alpha_ref_value;
                     alpha_ref_value = lp_jit_context_alpha_ref_value(builder, context_ptr);
                     alpha_ref_value = lp_build_broadcast(builder, vec_type, alpha_ref_value);
                     lp_build_alpha_test(builder, &key->alpha, type,
                                         &mask, alpha, alpha_ref_value);
                  }

                  /* only color buffer 0 is returned to the caller */
                  if(cbuf == 0)
                     color[chan] = outputs[attrib][chan];
                  break;
               }

            case TGSI_SEMANTIC_POSITION:
               /* shader-written Z replaces the interpolated value */
               if(chan == 2)
                  z = outputs[attrib][chan];
               break;
            }
         }
      }
   }

   if(!early_depth_test)
      generate_depth(builder, key, type,
                     &mask, z, depth_ptr);

   lp_build_mask_end(&mask);

   lp_build_flow_scope_end(flow);

   lp_build_flow_destroy(flow);

   /* return the final live-pixel mask to the caller */
   *pmask = mask.value;
}
/**
 * Expand the relevant bits of mask_input to a 4-dword mask for the
 * four pixels in a 2x2 quad.  This will set the four elements of the
 * quad mask vector to 0 or ~0.
 *
 * \param quad        which quad of the quad group to test, in [0,3]
 * \param mask_input  bitwise mask for the whole 4x4 stamp
 */
static LLVMValueRef
generate_quad_mask(LLVMBuilderRef builder,
                   struct lp_type fs_type,
                   unsigned quad,
                   LLVMValueRef mask_input) /* int32 */
{
   /* Right-shift applied to the stamp mask for each quad, and the bit
    * position of each of the quad's four pixels after shifting. */
   static const unsigned quad_shift[4] = { 0, 2, 8, 10 };
   static const unsigned pixel_bit[4] = { 0, 1, 4, 5 };
   struct lp_type mask_type;
   LLVMTypeRef i32t = LLVMInt32Type();
   LLVMValueRef bits[4];
   LLVMValueRef mask;
   unsigned j;
   int shift;

   /*
    * XXX: We'll need a different path for 16 x u8
    */
   assert(fs_type.width == 32);
   assert(fs_type.length == 4);
   mask_type = lp_int_type(fs_type);

   /*
    * mask_input >>= (quad's shift)
    */
   assert(quad < 4);
   shift = quad < 4 ? (int)quad_shift[quad] : 0;

   mask_input = LLVMBuildLShr(builder,
                              mask_input,
                              LLVMConstInt(i32t, shift, 0),
                              "");

   /*
    * mask = { mask_input & (1 << pixel_bit[j]), for j in [0,3] }
    */
   mask = lp_build_broadcast(builder, lp_build_vec_type(mask_type),
                             mask_input);

   for (j = 0; j < 4; ++j)
      bits[j] = LLVMConstInt(i32t, 1 << pixel_bit[j], 0);

   mask = LLVMBuildAnd(builder, mask, LLVMConstVector(bits, 4), "");

   /*
    * mask = mask != 0 ? ~0 : 0
    */
   mask = lp_build_compare(builder,
                           mask_type, PIPE_FUNC_NOTEQUAL,
                           mask,
                           lp_build_const_int_vec(mask_type, 0));

   return mask;
}