Ejemplo n.º 1
0
// Generates the pixel shader for the current TEV/BP/XF register state.
//
// The same routine serves two purposes, depending on what `out` is:
//  - when `out` exposes uid data (GetUidData() returns non-null), every piece of
//    state that influences the generated text is recorded in that uid;
//  - when `out` accepts a buffer (GetBuffer() returns non-null after SetBuffer()),
//    the shader source is written into the file-level `text` buffer.
// Calls that do not apply to a given `out` are expected to be no-ops on that type.
//
// Parameters:
//   out          - generator sink (template parameter T, declared outside this block).
//   dstAlphaMode - destination alpha handling; selects the extra ocol1 output and the
//                  final alpha written to ocol0.
//   ApiType      - API_OPENGL selects the OpenGL flavoured declarations, anything else
//                  takes the D3D path.
//   components   - vertex components; only consumed when per-pixel lighting is enabled.
static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_TYPE ApiType, u32 components)
{
	// Non-uid template parameters will write to the dummy data (=> gets optimized out)
	pixel_shader_uid_data dummy_data;
	pixel_shader_uid_data* uid_data = out.template GetUidData<pixel_shader_uid_data>();
	if (uid_data == nullptr)
		uid_data = &dummy_data;

	out.SetBuffer(text);
	const bool is_writing_shadercode = (out.GetBuffer() != nullptr);

	// The last byte of the buffer is checked again at the end of the function
	// to detect an overrun of the generated source.
	if (is_writing_shadercode)
		text[sizeof(text) - 1] = 0x7C;  // canary

	unsigned int numStages = bpmem.genMode.numtevstages + 1;
	unsigned int numTexgen = bpmem.genMode.numtexgens;

	out.Write("//Pixel Shader for TEV stages\n");
	out.Write("//%i TEV stages, %i texgens, %i IND stages\n",
		numStages, numTexgen, bpmem.genMode.numindstages);

	uid_data->dstAlphaMode = dstAlphaMode;
	uid_data->genMode_numindstages = bpmem.genMode.numindstages;
	uid_data->genMode_numtevstages = bpmem.genMode.numtevstages;
	uid_data->genMode_numtexgens = bpmem.genMode.numtexgens;

	// dot product for integer vectors
	out.Write("int idot(int3 x, int3 y)\n"
	          "{\n"
	          "\tint3 tmp = x * y;\n"
	          "\treturn tmp.x + tmp.y + tmp.z;\n"
	          "}\n");

	out.Write("int idot(int4 x, int4 y)\n"
	          "{\n"
	          "\tint4 tmp = x * y;\n"
	          "\treturn tmp.x + tmp.y + tmp.z + tmp.w;\n"
	          "}\n\n");

	// rounding + casting to integer at once in a single function
	out.Write("int  iround(float  x) { return int (round(x)); }\n"
	          "int2 iround(float2 x) { return int2(round(x)); }\n"
	          "int3 iround(float3 x) { return int3(round(x)); }\n"
	          "int4 iround(float4 x) { return int4(round(x)); }\n\n");

	// truncation + casting to integer at once in a single function
	out.Write("int  itrunc(float  x) { return int (trunc(x)); }\n"
	          "int2 itrunc(float2 x) { return int2(trunc(x)); }\n"
	          "int3 itrunc(float3 x) { return int3(trunc(x)); }\n"
	          "int4 itrunc(float4 x) { return int4(trunc(x)); }\n\n");

	if (DriverDetails::HasBug(DriverDetails::BUG_BROKENIVECSHIFTS))
	{
		// Add functions to do shifts on scalars and ivecs.
		// These functions all have the same name to enable them to be used no matter what code is generated.
		// For example: tev color op code uses .rgb as a swizzle, but alpha code only uses .a.
		out.Write("int ilshift(int a, int b) { return a << b; }\n"
		          "int irshift(int a, int b) { return a >> b; }\n"

		          "int2 ilshift(int2 a, int2 b) { return int2(a.x << b.x, a.y << b.y); }\n"
		          "int2 ilshift(int2 a, int b) { return int2(a.x << b, a.y << b); }\n"
		          "int2 irshift(int2 a, int2 b) { return int2(a.x >> b.x, a.y >> b.y); }\n"
		          "int2 irshift(int2 a, int b) { return int2(a.x >> b, a.y >> b); }\n"

		          "int3 ilshift(int3 a, int3 b) { return int3(a.x << b.x, a.y << b.y, a.z << b.z); }\n"
		          "int3 ilshift(int3 a, int b) { return int3(a.x << b, a.y << b, a.z << b); }\n"
		          "int3 irshift(int3 a, int3 b) { return int3(a.x >> b.x, a.y >> b.y, a.z >> b.z); }\n"
		          "int3 irshift(int3 a, int b) { return int3(a.x >> b, a.y >> b, a.z >> b); }\n"

		          "int4 ilshift(int4 a, int4 b) { return int4(a.x << b.x, a.y << b.y, a.z << b.z, a.w << b.w); }\n"
		          "int4 ilshift(int4 a, int b) { return int4(a.x << b, a.y << b, a.z << b, a.w << b); }\n"
		          "int4 irshift(int4 a, int4 b) { return int4(a.x >> b.x, a.y >> b.y, a.z >> b.z, a.w >> b.w); }\n"
		          "int4 irshift(int4 a, int b) { return int4(a.x >> b, a.y >> b, a.z >> b, a.w >> b); }\n\n");
	}

	if (ApiType == API_OPENGL)
	{
		// Declare samplers
		for (int i = 0; i < 8; ++i)
			out.Write("SAMPLER_BINDING(%d) uniform sampler2DArray samp%d;\n", i, i);
	}
	else // D3D
	{
		// Declare samplers
		for (int i = 0; i < 8; ++i)
			out.Write("sampler samp%d : register(s%d);\n", i, i);

		out.Write("\n");
		for (int i = 0; i < 8; ++i)
			out.Write("Texture2DArray Tex%d : register(t%d);\n", i, i);
	}
	out.Write("\n");

	// Pixel shader constant block
	if (ApiType == API_OPENGL)
	{
		out.Write("layout(std140%s) uniform PSBlock {\n", g_ActiveConfig.backend_info.bSupportsBindingLayout ? ", binding = 1" : "");
	}
	else
	{
		out.Write("cbuffer PSBlock : register(b0) {\n");
	}
	out.Write(
		"\tint4 " I_COLORS"[4];\n"
		"\tint4 " I_KCOLORS"[4];\n"
		"\tint4 " I_ALPHA";\n"
		"\tfloat4 " I_TEXDIMS"[8];\n"
		"\tint4 " I_ZBIAS"[2];\n"
		"\tint4 " I_INDTEXSCALE"[2];\n"
		"\tint4 " I_INDTEXMTX"[6];\n"
		"\tint4 " I_FOGCOLOR";\n"
		"\tint4 " I_FOGI";\n"
		"\tfloat4 " I_FOGF"[2];\n"
		"\tfloat4 " I_ZSLOPE";\n"
		"\tfloat4 " I_EFBSCALE";\n"
		"};\n");

	if (g_ActiveConfig.bEnablePixelLighting)
	{
		out.Write("%s", s_lighting_struct);

		if (ApiType == API_OPENGL)
		{
			out.Write("layout(std140%s) uniform VSBlock {\n", g_ActiveConfig.backend_info.bSupportsBindingLayout ? ", binding = 2" : "");
		}
		else
		{
			out.Write("cbuffer VSBlock : register(b1) {\n");
		}
		// Never hand a non-literal string to Write() as the format itself: a '%' inside
		// the uniform declarations would be interpreted as a conversion specifier.
		out.Write("%s", s_shader_uniforms);
		out.Write("};\n");
	}

	if (g_ActiveConfig.backend_info.bSupportsBBox)
	{
		if (ApiType == API_OPENGL)
		{
			out.Write(
				"layout(std140, binding = 3) buffer BBox {\n"
				"\tint4 bbox_data;\n"
				"};\n"
				);
		}
		else
		{
			out.Write(
				"globallycoherent RWBuffer<int> bbox_data : register(u2);\n"
				);
		}
	}

	out.Write("struct VS_OUTPUT {\n");
	GenerateVSOutputMembers<T>(out, ApiType);
	out.Write("};\n");

	const bool forced_early_z = g_ActiveConfig.backend_info.bSupportsEarlyZ && bpmem.UseEarlyDepthTest()
	                            && (g_ActiveConfig.bFastDepthCalc || bpmem.alpha_test.TestResult() == AlphaTest::UNDETERMINED)
	                            // We can't allow early_ztest for zfreeze because depth is overridden per-pixel.
	                            // This means it's impossible for zcomploc to be emulated on a zfrozen polygon.
	                            && !(bpmem.zmode.testenable && bpmem.genMode.zfreeze);
	const bool per_pixel_depth = (bpmem.ztex2.op != ZTEXTURE_DISABLE && bpmem.UseLateDepthTest())
	                             || (!g_ActiveConfig.bFastDepthCalc && bpmem.zmode.testenable && !forced_early_z)
	                             || (bpmem.zmode.testenable && bpmem.genMode.zfreeze);

	if (forced_early_z)
	{
		// Zcomploc (aka early_ztest) is a way to control whether depth test is done before
		// or after texturing and alpha test. PC graphics APIs used to provide no way to emulate
		// this feature properly until 2012: Depth tests were always done after alpha testing.
		// Most importantly, it was not possible to write to the depth buffer without also writing
		// a color value (unless color writing was disabled altogether).

		// OpenGL has a flag which allows the driver to still update the depth buffer if alpha
		// test fails. The driver isn't required to do this, but I (degasus) assume all of them do
		// because it's the much faster code path for the GPU.

		// D3D11 also has a way to force the driver to enable early-z, so we're fine here.
		if(ApiType == API_OPENGL)
		{
			out.Write("layout(early_fragment_tests) in;\n");
		}
		else
		{
			out.Write("[earlydepthstencil]\n");
		}
	}
	else if (bpmem.UseEarlyDepthTest() && (g_ActiveConfig.bFastDepthCalc || bpmem.alpha_test.TestResult() == AlphaTest::UNDETERMINED) && is_writing_shadercode)
	{
		// Early Z was requested but cannot be forced; report it a single time only.
		static bool warn_once = true;
		if (warn_once)
			WARN_LOG(VIDEO, "Early z test enabled but not possible to emulate with current configuration. Make sure to enable fast depth calculations. If this message still shows up your hardware isn't able to emulate the feature properly (a GPU with D3D 11.0 / OGL 4.2 support is required).");
		warn_once = false;
	}

	if (ApiType == API_OPENGL)
	{
		out.Write("out vec4 ocol0;\n");
		if (dstAlphaMode == DSTALPHA_DUAL_SOURCE_BLEND)
			out.Write("out vec4 ocol1;\n");

		if (per_pixel_depth)
			out.Write("#define depth gl_FragDepth\n");

		// We use the flag "centroid" to fix some MSAA rendering bugs. With MSAA, the
		// pixel shader will be executed for each pixel which has at least one passed sample.
		// So there may be rendered pixels where the center of the pixel isn't in the primitive.
		// As the pixel shader usually renders at the center of the pixel, this position may be
		// outside the primitive. This will lead to sampling outside the texture, sign changes, ...
		// As a workaround, we interpolate at the centroid of the coveraged pixel, which
		// is always inside the primitive.
		// Without MSAA, this flag is defined to have no effect.
		uid_data->stereo = g_ActiveConfig.iStereoMode > 0;
		if (g_ActiveConfig.backend_info.bSupportsGeometryShaders)
		{
			out.Write("in VertexData {\n");
			GenerateVSOutputMembers<T>(out, ApiType, g_ActiveConfig.backend_info.bSupportsBindingLayout ? "centroid" : "centroid in");

			if (g_ActiveConfig.iStereoMode > 0)
				out.Write("\tflat int layer;\n");

			out.Write("};\n");
		}
		else
		{
			out.Write("centroid in float4 colors_0;\n");
			out.Write("centroid in float4 colors_1;\n");
			// compute window position if needed because binding semantic WPOS is not widely supported
			// Let's set up attributes
			for (unsigned int i = 0; i < numTexgen; ++i)
			{
				out.Write("centroid in float3 uv%d;\n", i);
			}
			out.Write("centroid in float4 clipPos;\n");
			if (g_ActiveConfig.bEnablePixelLighting)
			{
				out.Write("centroid in float3 Normal;\n");
				out.Write("centroid in float3 WorldPos;\n");
			}
		}

		out.Write("void main()\n{\n");

		if (g_ActiveConfig.backend_info.bSupportsGeometryShaders)
		{
			// With the interface block the inputs are named tex%d; alias them to the
			// uv%d names the rest of the generated code uses.
			for (unsigned int i = 0; i < numTexgen; ++i)
				out.Write("\tfloat3 uv%d = tex%d;\n", i, i);
		}

		out.Write("\tfloat4 rawpos = gl_FragCoord;\n");
	}
	else // D3D
	{
		out.Write("void main(\n");
		out.Write("  out float4 ocol0 : SV_Target0,%s%s\n  in float4 rawpos : SV_Position,\n",
			dstAlphaMode == DSTALPHA_DUAL_SOURCE_BLEND ? "\n  out float4 ocol1 : SV_Target1," : "",
			per_pixel_depth ? "\n  out float depth : SV_Depth," : "");

		out.Write("  in centroid float4 colors_0 : COLOR0,\n");
		out.Write("  in centroid float4 colors_1 : COLOR1\n");

		// compute window position if needed because binding semantic WPOS is not widely supported
		for (unsigned int i = 0; i < numTexgen; ++i)
			out.Write(",\n  in centroid float3 uv%d : TEXCOORD%d", i, i);
		out.Write(",\n  in centroid float4 clipPos : TEXCOORD%d", numTexgen);
		if (g_ActiveConfig.bEnablePixelLighting)
		{
			out.Write(",\n  in centroid float3 Normal : TEXCOORD%d", numTexgen + 1);
			out.Write(",\n  in centroid float3 WorldPos : TEXCOORD%d", numTexgen + 2);
		}
		uid_data->stereo = g_ActiveConfig.iStereoMode > 0;
		if (g_ActiveConfig.iStereoMode > 0)
			out.Write(",\n  in uint layer : SV_RenderTargetArrayIndex\n");
		out.Write("        ) {\n");
	}

	out.Write("\tint4 c0 = " I_COLORS"[1], c1 = " I_COLORS"[2], c2 = " I_COLORS"[3], prev = " I_COLORS"[0];\n"
	          "\tint4 rastemp = int4(0, 0, 0, 0), textemp = int4(0, 0, 0, 0), konsttemp = int4(0, 0, 0, 0);\n"
	          "\tint3 comp16 = int3(1, 256, 0), comp24 = int3(1, 256, 256*256);\n"
	          "\tint alphabump=0;\n"
	          "\tint3 tevcoord=int3(0, 0, 0);\n"
	          "\tint2 wrappedcoord=int2(0,0), tempcoord=int2(0,0);\n"
	          "\tint4 tevin_a=int4(0,0,0,0),tevin_b=int4(0,0,0,0),tevin_c=int4(0,0,0,0),tevin_d=int4(0,0,0,0);\n\n"); // tev combiner inputs

	// On GLSL, input variables must not be assigned to.
	// This is why we declare these variables locally instead.
	out.Write("\tfloat4 col0 = colors_0;\n");
	out.Write("\tfloat4 col1 = colors_1;\n");

	if (g_ActiveConfig.bEnablePixelLighting)
	{
		out.Write("\tfloat3 _norm0 = normalize(Normal.xyz);\n\n");
		out.Write("\tfloat3 pos = WorldPos;\n");

		out.Write("\tint4 lacc;\n"
				"\tfloat3 ldir, h, cosAttn, distAttn;\n"
				"\tfloat dist, dist2, attn;\n");

		// TODO: Our current constant usage code isn't able to handle more than one buffer.
		//       So we can't mark the VS constant as used here. But keep them here as reference.
		//out.SetConstantsUsed(C_PLIGHT_COLORS, C_PLIGHT_COLORS+7); // TODO: Can be optimized further
		//out.SetConstantsUsed(C_PLIGHTS, C_PLIGHTS+31); // TODO: Can be optimized further
		//out.SetConstantsUsed(C_PMATERIALS, C_PMATERIALS+3);
		uid_data->components = components;
		GenerateLightingShader<T>(out, uid_data->lighting, components, "colors_", "col");
	}

	// HACK to handle cases where the tex gen is not enabled
	if (numTexgen == 0)
	{
		out.Write("\tint2 fixpoint_uv0 = int2(0, 0);\n\n");
	}
	else
	{
		out.SetConstantsUsed(C_TEXDIMS, C_TEXDIMS+numTexgen-1);
		for (unsigned int i = 0; i < numTexgen; ++i)
		{
			out.Write("\tint2 fixpoint_uv%d = itrunc(", i);
			// optional perspective divides
			uid_data->texMtxInfo_n_projection |= xfmem.texMtxInfo[i].projection << i;
			if (xfmem.texMtxInfo[i].projection == XF_TEXPROJ_STQ)
			{
				out.Write("(uv%d.z == 0.0 ? uv%d.xy : uv%d.xy / uv%d.z)", i, i, i, i);
			}
			else
			{
				out.Write("uv%d.xy", i);
			}
			out.Write(" * " I_TEXDIMS"[%d].zw * 128.0);\n", i);
			// TODO: S24 overflows here?
		}
	}

	// indirect texture map lookup
	// Build a bitmask of the indirect stages that are referenced by an active TEV stage.
	int nIndirectStagesUsed = 0;
	if (bpmem.genMode.numindstages > 0)
	{
		for (unsigned int i = 0; i < numStages; ++i)
		{
			if (bpmem.tevind[i].IsActive() && bpmem.tevind[i].bt < bpmem.genMode.numindstages)
				nIndirectStagesUsed |= 1 << bpmem.tevind[i].bt;
		}
	}

	uid_data->nIndirectStagesUsed = nIndirectStagesUsed;
	for (u32 i = 0; i < bpmem.genMode.numindstages; ++i)
	{
		if (nIndirectStagesUsed & (1 << i))
		{
			unsigned int texcoord = bpmem.tevindref.getTexCoord(i);
			unsigned int texmap = bpmem.tevindref.getTexMap(i);

			uid_data->SetTevindrefValues(i, texcoord, texmap);
			if (texcoord < numTexgen)
			{
				out.SetConstantsUsed(C_INDTEXSCALE+i/2,C_INDTEXSCALE+i/2);

				// Two indirect stages share one constant: even stages use .xy, odd stages .zw
				if (DriverDetails::HasBug(DriverDetails::BUG_BROKENIVECSHIFTS))
					out.Write("\ttempcoord = irshift(fixpoint_uv%d, " I_INDTEXSCALE"[%d].%s);\n", texcoord, i / 2, (i & 1) ? "zw" : "xy");
				else
					out.Write("\ttempcoord = fixpoint_uv%d >> " I_INDTEXSCALE"[%d].%s;\n", texcoord, i / 2, (i & 1) ? "zw" : "xy");
			}
			else
				out.Write("\ttempcoord = int2(0, 0);\n");

			out.Write("\tint3 iindtex%d = ", i);
			SampleTexture<T>(out, "(float2(tempcoord)/128.0)", "abg", texmap, ApiType);
		}
	}

	// Uid fields for BuildSwapModeTable are set in WriteStage
	// Each table entry is a four character swizzle string built from two tevksel registers.
	char swapModeTable[4][5];
	const char* swapColors = "rgba";
	for (int i = 0; i < 4; i++)
	{
		swapModeTable[i][0] = swapColors[bpmem.tevksel[i*2].swap1];
		swapModeTable[i][1] = swapColors[bpmem.tevksel[i*2].swap2];
		swapModeTable[i][2] = swapColors[bpmem.tevksel[i*2+1].swap1];
		swapModeTable[i][3] = swapColors[bpmem.tevksel[i*2+1].swap2];
		swapModeTable[i][4] = '\0';
	}

	for (unsigned int i = 0; i < numStages; i++)
		WriteStage<T>(out, uid_data, i, ApiType, swapModeTable); // build the equation for this stage

	// num_values covers the whole uid struct with pixel lighting enabled, otherwise
	// only the bytes up to the end of the stage hashes that are in use.
#define MY_STRUCT_OFFSET(str,elem) ((u32)((u64)&(str).elem-(u64)&(str)))
	bool enable_pl = g_ActiveConfig.bEnablePixelLighting;
	uid_data->num_values = (enable_pl) ? sizeof(*uid_data) : MY_STRUCT_OFFSET(*uid_data,stagehash[numStages]);


	if (numStages)
	{
		// The results of the last texenv stage are put onto the screen,
		// regardless of the used destination register
		if (bpmem.combiners[numStages - 1].colorC.dest != 0)
		{
			out.Write("\tprev.rgb = %s;\n", tevCOutputTable[bpmem.combiners[numStages - 1].colorC.dest]);
		}
		if (bpmem.combiners[numStages - 1].alphaC.dest != 0)
		{
			out.Write("\tprev.a = %s;\n", tevAOutputTable[bpmem.combiners[numStages - 1].alphaC.dest]);
		}
	}
	out.Write("\tprev = prev & 255;\n");

	AlphaTest::TEST_RESULT Pretest = bpmem.alpha_test.TestResult();
	uid_data->Pretest = Pretest;

	// NOTE: Fragment may not be discarded if alpha test always fails and early depth test is enabled
	// (in this case we need to write a depth value if depth test passes regardless of the alpha testing result)
	if (Pretest == AlphaTest::UNDETERMINED || (Pretest == AlphaTest::FAIL && bpmem.UseLateDepthTest()))
		WriteAlphaTest<T>(out, uid_data, ApiType, dstAlphaMode, per_pixel_depth);

	if (bpmem.genMode.zfreeze)
	{
		out.SetConstantsUsed(C_ZSLOPE, C_ZSLOPE);
		out.SetConstantsUsed(C_EFBSCALE, C_EFBSCALE);

		out.Write("\tfloat2 screenpos = rawpos.xy * " I_EFBSCALE".xy;\n");

		// OpenGL has reversed vertical screenspace coordinates
		if (ApiType == API_OPENGL)
			out.Write("\tscreenpos.y = %i - screenpos.y;\n", EFB_HEIGHT);

		out.Write("\tint zCoord = int(" I_ZSLOPE".z + " I_ZSLOPE".x * screenpos.x + " I_ZSLOPE".y * screenpos.y);\n");
	}
	else if (!g_ActiveConfig.bFastDepthCalc)
	{
		// FastDepth means to trust the depth generated in perspective division.
		// It should be correct, but it seems not to be as accurate as required. TODO: Find out why!
		// For disabled FastDepth we just calculate the depth value again.
		// The performance impact of this additional calculation doesn't matter, but it prevents
		// the host GPU driver from performing any early depth test optimizations.
		out.SetConstantsUsed(C_ZBIAS+1, C_ZBIAS+1);
		// the screen space depth value = far z + (clip z / clip w) * z range
		out.Write("\tint zCoord = " I_ZBIAS"[1].x + int((clipPos.z / clipPos.w) * float(" I_ZBIAS"[1].y));\n");
	}
	else
	{
		out.Write("\tint zCoord = int(rawpos.z * 16777216.0);\n");
	}
	// NOTE(review): this clamp reads I_ZBIAS[1] on every path, but C_ZBIAS+1 is only marked
	// as used in the non-fast-depth branch above - confirm whether that is intended.
	out.Write("\tzCoord = clamp(zCoord, " I_ZBIAS"[1].x - " I_ZBIAS"[1].y, " I_ZBIAS"[1].x);\n");

	// depth texture can safely be ignored if the result won't be written to the depth buffer (early_ztest) and isn't used for fog either
	const bool skip_ztexture = !per_pixel_depth && !bpmem.fog.c_proj_fsel.fsel;

	uid_data->ztex_op = bpmem.ztex2.op;
	uid_data->per_pixel_depth = per_pixel_depth;
	uid_data->forced_early_z = forced_early_z;
	uid_data->fast_depth_calc = g_ActiveConfig.bFastDepthCalc;
	uid_data->early_ztest = bpmem.UseEarlyDepthTest();
	uid_data->fog_fsel = bpmem.fog.c_proj_fsel.fsel;
	uid_data->zfreeze = bpmem.genMode.zfreeze;

	// Note: z-textures are not written to depth buffer if early depth test is used
	if (per_pixel_depth && bpmem.UseEarlyDepthTest())
	{
		out.Write("\tdepth = float(zCoord) / 16777216.0;\n");
	}

	// Note: depth texture output is only written to depth buffer if late depth test is used
	// theoretical final depth value is used for fog calculation, though, so we have to emulate ztextures anyway
	if (bpmem.ztex2.op != ZTEXTURE_DISABLE && !skip_ztexture)
	{
		// use the texture input of the last texture stage (textemp), hopefully this has been read and is in correct format...
		out.SetConstantsUsed(C_ZBIAS, C_ZBIAS+1);
		out.Write("\tzCoord = idot(" I_ZBIAS"[0].xyzw, textemp.xyzw) + " I_ZBIAS"[1].w %s;\n",
									(bpmem.ztex2.op == ZTEXTURE_ADD) ? "+ zCoord" : "");
		out.Write("\tzCoord = zCoord & 0xFFFFFF;\n");
	}

	if (per_pixel_depth && bpmem.UseLateDepthTest())
	{
		out.Write("\tdepth = float(zCoord) / 16777216.0;\n");
	}

	if (dstAlphaMode == DSTALPHA_ALPHA_PASS)
	{
		out.SetConstantsUsed(C_ALPHA, C_ALPHA);
		out.Write("\tocol0 = float4(float3(prev.rgb), float(" I_ALPHA".a)) / 255.0;\n");
	}
	else
	{
		WriteFog<T>(out, uid_data);
		out.Write("\tocol0 = float4(prev) / 255.0;\n");
	}

	// Use dual-source color blending to perform dst alpha in a single pass
	if (dstAlphaMode == DSTALPHA_DUAL_SOURCE_BLEND)
	{
		out.SetConstantsUsed(C_ALPHA, C_ALPHA);

		// Colors will be blended against the alpha from ocol1 and
		// the alpha from ocol0 will be written to the framebuffer.
		out.Write("\tocol1 = float4(prev) / 255.0;\n");
		out.Write("\tocol0.a = float(" I_ALPHA".a) / 255.0;\n");
	}

	if (g_ActiveConfig.backend_info.bSupportsBBox && BoundingBox::active)
	{
		uid_data->bounding_box = true;
		// Grow the bounding box with the current pixel position; the atomic
		// function prefix differs between the two shader dialects.
		const char* atomic_op = ApiType == API_OPENGL ? "atomic" : "Interlocked";
		out.Write(
			"\tif(bbox_data[0] > int(rawpos.x)) %sMin(bbox_data[0], int(rawpos.x));\n"
			"\tif(bbox_data[1] < int(rawpos.x)) %sMax(bbox_data[1], int(rawpos.x));\n"
			"\tif(bbox_data[2] > int(rawpos.y)) %sMin(bbox_data[2], int(rawpos.y));\n"
			"\tif(bbox_data[3] < int(rawpos.y)) %sMax(bbox_data[3], int(rawpos.y));\n",
			atomic_op, atomic_op, atomic_op, atomic_op);
	}

	out.Write("}\n");

	if (is_writing_shadercode)
	{
		// The canary written at the start must still be intact, otherwise the
		// generated source ran past the end of the buffer.
		if (text[sizeof(text) - 1] != 0x7C)
			PanicAlert("PixelShader generator - buffer too small, canary has been eaten!");
	}
}
Ejemplo n.º 2
0
// Generates the pixel shader for the current TEV/BP/XF register state.
//
// Depending on what `out` is, the routine records every piece of state that
// influences the shader in the uid data and/or writes the shader source into
// the file-level `text` buffer (when GetBuffer() is non-null after SetBuffer()).
// While source is being written, the thread's numeric locale is switched to "C"
// (except on Android) and restored before returning.
//
// Parameters:
//   out          - generator sink (template parameter T, declared outside this block).
//   dstAlphaMode - destination alpha handling; selects the extra ocol1 output and the
//                  final alpha written to ocol0.
//   ApiType      - API_OPENGL selects the OpenGL flavoured declarations, anything else
//                  takes the D3D path.
//   components   - vertex components; only consumed when per-pixel lighting is enabled.
static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_TYPE ApiType, u32 components)
{
	// Non-uid template parameters will write to the dummy data (=> gets optimized out)
	// NOTE(review): comparing the address of a reference against nullptr only works if
	// GetUidData() can hand out a null reference, which is undefined behaviour. Fixing it
	// requires GetUidData() to return a pointer, which is outside the scope of this function.
	pixel_shader_uid_data dummy_data;
	pixel_shader_uid_data& uid_data = (&out.template GetUidData<pixel_shader_uid_data>() != nullptr)
										? out.template GetUidData<pixel_shader_uid_data>() : dummy_data;

	out.SetBuffer(text);
	const bool is_writing_shadercode = (out.GetBuffer() != nullptr);
#ifndef ANDROID
	// Both handles stay null unless the "C" numeric locale could actually be created,
	// so the restore code at the end never touches an invalid locale.
	locale_t locale = nullptr;
	locale_t old_locale = nullptr;
	if (is_writing_shadercode)
	{
		locale = newlocale(LC_NUMERIC_MASK, "C", nullptr); // New locale for compilation
		if (locale != nullptr)
			old_locale = uselocale(locale); // Apply the locale for this thread
	}
#endif

	// The last byte of the buffer is checked again at the end of the function
	// to detect an overrun of the generated source.
	if (is_writing_shadercode)
		text[sizeof(text) - 1] = 0x7C;  // canary

	unsigned int numStages = bpmem.genMode.numtevstages + 1;
	unsigned int numTexgen = bpmem.genMode.numtexgens;

	out.Write("//Pixel Shader for TEV stages\n");
	out.Write("//%i TEV stages, %i texgens, %i IND stages\n",
		numStages, numTexgen, bpmem.genMode.numindstages);

	uid_data.dstAlphaMode = dstAlphaMode;
	uid_data.genMode_numindstages = bpmem.genMode.numindstages;
	uid_data.genMode_numtevstages = bpmem.genMode.numtevstages;
	uid_data.genMode_numtexgens = bpmem.genMode.numtexgens;

	// dot product for integer vectors
	out.Write("int idot(int3 x, int3 y)\n"
	          "{\n"
	          "\tint3 tmp = x * y;\n"
	          "\treturn tmp.x + tmp.y + tmp.z;\n"
	          "}\n");

	out.Write("int idot(int4 x, int4 y)\n"
	          "{\n"
	          "\tint4 tmp = x * y;\n"
	          "\treturn tmp.x + tmp.y + tmp.z + tmp.w;\n"
	          "}\n\n");

	// rounding + casting to integer at once in a single function
	out.Write("int  iround(float  x) { return int (round(x)); }\n"
	          "int2 iround(float2 x) { return int2(round(x)); }\n"
	          "int3 iround(float3 x) { return int3(round(x)); }\n"
	          "int4 iround(float4 x) { return int4(round(x)); }\n\n");

	if (ApiType == API_OPENGL)
	{
		// Declare samplers
		for (int i = 0; i < 8; ++i)
			out.Write("uniform sampler2D samp%d;\n", i);
	}
	else // D3D
	{
		// Declare samplers
		for (int i = 0; i < 8; ++i)
			out.Write("sampler samp%d : register(s%d);\n", i, i);

		out.Write("\n");
		for (int i = 0; i < 8; ++i)
			out.Write("Texture2D Tex%d : register(t%d);\n", i, i);
	}
	out.Write("\n");

	// Pixel shader constant block
	if (ApiType == API_OPENGL)
		out.Write("layout(std140%s) uniform PSBlock {\n", g_ActiveConfig.backend_info.bSupportsBindingLayout ? ", binding = 1" : "");
	else
		out.Write("cbuffer PSBlock {\n");
	out.Write(
		"\tint4 " I_COLORS"[4];\n"
		"\tint4 " I_KCOLORS"[4];\n"
		"\tint4 " I_ALPHA";\n"
		"\tfloat4 " I_TEXDIMS"[8];\n"
		"\tint4 " I_ZBIAS"[2];\n"
		"\tint4 " I_INDTEXSCALE"[2];\n"
		"\tint4 " I_INDTEXMTX"[6];\n"
		"\tint4 " I_FOGCOLOR";\n"
		"\tint4 " I_FOGI";\n"
		"\tfloat4 " I_FOGF"[2];\n" );
	if (g_ActiveConfig.bEnablePixelLighting)
	{
		out.Write(
		// Pixel lighting constants, only declared when per pixel lighting is enabled
		"\tint4 " I_PLIGHT_COLORS"[8];\n"
		"\tfloat4 " I_PLIGHTS"[32];\n"
		"\tint4 " I_PMATERIALS"[4];\n"
		"};\n");
	} else {
		out.Write( "};\n" );
	}

	const bool forced_early_z = g_ActiveConfig.backend_info.bSupportsEarlyZ && bpmem.UseEarlyDepthTest() && (g_ActiveConfig.bFastDepthCalc || bpmem.alpha_test.TestResult() == AlphaTest::UNDETERMINED);
	const bool per_pixel_depth = (bpmem.ztex2.op != ZTEXTURE_DISABLE && bpmem.UseLateDepthTest()) || (!g_ActiveConfig.bFastDepthCalc && bpmem.zmode.testenable && !forced_early_z);

	if (forced_early_z)
	{
		// Zcomploc (aka early_ztest) is a way to control whether depth test is done before
		// or after texturing and alpha test. PC graphics APIs used to provide no way to emulate
		// this feature properly until 2012: Depth tests were always done after alpha testing.
		// Most importantly, it was not possible to write to the depth buffer without also writing
		// a color value (unless color writing was disabled altogether).

		// OpenGL has a flag which allows the driver to still update the depth buffer if alpha
		// test fails. The driver isn't required to do this, but I (degasus) assume all of them do
		// because it's the much faster code path for the GPU.

		// D3D11 also has a way to force the driver to enable early-z, so we're fine here.
		if(ApiType == API_OPENGL)
		{
			out.Write("layout(early_fragment_tests) in;\n");
		}
		else
		{
			out.Write("[earlydepthstencil]\n");
		}
	}
	else if (bpmem.UseEarlyDepthTest() && (g_ActiveConfig.bFastDepthCalc || bpmem.alpha_test.TestResult() == AlphaTest::UNDETERMINED) && is_writing_shadercode)
	{
		// Early Z was requested but cannot be forced; report it a single time only.
		static bool warn_once = true;
		if (warn_once)
			WARN_LOG(VIDEO, "Early z test enabled but not possible to emulate with current configuration. Make sure to enable fast depth calculations. If this message still shows up your hardware isn't able to emulate the feature properly (a GPU with D3D 11.0 / OGL 4.2 support is required).");
		warn_once = false;
	}

	if (ApiType == API_OPENGL)
	{
		out.Write("out vec4 ocol0;\n");
		if (dstAlphaMode == DSTALPHA_DUAL_SOURCE_BLEND)
			out.Write("out vec4 ocol1;\n");

		if (per_pixel_depth)
			out.Write("#define depth gl_FragDepth\n");

		// We use the flag "centroid" to fix some MSAA rendering bugs. With MSAA, the
		// pixel shader will be executed for each pixel which has at least one passed sample.
		// So there may be rendered pixels where the center of the pixel isn't in the primitive.
		// As the pixel shader usually renders at the center of the pixel, this position may be
		// outside the primitive. This will lead to sampling outside the texture, sign changes, ...
		// As a workaround, we interpolate at the centroid of the coveraged pixel, which
		// is always inside the primitive.
		// Without MSAA, this flag is defined to have no effect.
		out.Write("centroid in float4 colors_02;\n");
		out.Write("centroid in float4 colors_12;\n");

		// compute window position if needed because binding semantic WPOS is not widely supported
		// Let's set up attributes
		// The count must be the one stored in the uid (genMode_numtexgens) and used by the
		// fixpoint_uv code below; otherwise the shader text could depend on state the uid
		// does not capture and reference uv inputs that were never declared.
		for (unsigned int i = 0; i < numTexgen; ++i)
		{
			out.Write("centroid in float3 uv%d;\n", i);
		}
		out.Write("centroid in float4 clipPos;\n");
		if (g_ActiveConfig.bEnablePixelLighting)
		{
			out.Write("centroid in float4 Normal;\n");
		}

		out.Write("void main()\n{\n");
		out.Write("\tfloat4 rawpos = gl_FragCoord;\n");
	}
	else // D3D
	{
		out.Write("void main(\n");
		out.Write("  out float4 ocol0 : SV_Target0,%s%s\n  in float4 rawpos : SV_Position,\n",
			dstAlphaMode == DSTALPHA_DUAL_SOURCE_BLEND ? "\n  out float4 ocol1 : SV_Target1," : "",
			per_pixel_depth ? "\n  out float depth : SV_Depth," : "");

		out.Write("  in centroid float4 colors_0 : COLOR0,\n");
		out.Write("  in centroid float4 colors_1 : COLOR1");

		// compute window position if needed because binding semantic WPOS is not widely supported
		for (unsigned int i = 0; i < numTexgen; ++i)
			out.Write(",\n  in centroid float3 uv%d : TEXCOORD%d", i, i);
		out.Write(",\n  in centroid float4 clipPos : TEXCOORD%d", numTexgen);
		if (g_ActiveConfig.bEnablePixelLighting)
			out.Write(",\n  in centroid float4 Normal : TEXCOORD%d", numTexgen + 1);
		out.Write("        ) {\n");
	}

	out.Write("\tint4 c0 = " I_COLORS"[1], c1 = " I_COLORS"[2], c2 = " I_COLORS"[3], prev = " I_COLORS"[0];\n"
	          "\tint4 rastemp = int4(0, 0, 0, 0), textemp = int4(0, 0, 0, 0), konsttemp = int4(0, 0, 0, 0);\n"
	          "\tint3 comp16 = int3(1, 256, 0), comp24 = int3(1, 256, 256*256);\n"
	          "\tint alphabump=0;\n"
	          "\tint3 tevcoord=int3(0, 0, 0);\n"
	          "\tint2 wrappedcoord=int2(0,0), tempcoord=int2(0,0);\n"
	          "\tint4 tevin_a=int4(0,0,0,0),tevin_b=int4(0,0,0,0),tevin_c=int4(0,0,0,0),tevin_d=int4(0,0,0,0);\n\n"); // tev combiner inputs

	if (ApiType == API_OPENGL)
	{
		// On Mali, global variables must be initialized as constants.
		// This is why we initialize these variables locally instead.
		out.Write("\tfloat4 colors_0 = colors_02;\n");
		out.Write("\tfloat4 colors_1 = colors_12;\n");
	}

	if (g_ActiveConfig.bEnablePixelLighting)
	{
		out.Write("\tfloat3 _norm0 = normalize(Normal.xyz);\n\n");
		out.Write("\tfloat3 pos = float3(clipPos.x,clipPos.y,Normal.w);\n");

		out.Write("\tint4 lacc;\n"
				"\tfloat3 ldir, h;\n"
				"\tfloat dist, dist2, attn;\n");

		out.SetConstantsUsed(C_PLIGHT_COLORS, C_PLIGHT_COLORS+7); // TODO: Can be optimized further
		out.SetConstantsUsed(C_PLIGHTS, C_PLIGHTS+31); // TODO: Can be optimized further
		out.SetConstantsUsed(C_PMATERIALS, C_PMATERIALS+3);
		uid_data.components = components;
		GenerateLightingShader<T>(out, uid_data.lighting, components, I_PMATERIALS, I_PLIGHT_COLORS, I_PLIGHTS, "colors_", "colors_");
	}

	// HACK to handle cases where the tex gen is not enabled
	if (numTexgen == 0)
	{
		out.Write("\tint2 fixpoint_uv0 = int2(0, 0);\n\n");
	}
	else
	{
		out.SetConstantsUsed(C_TEXDIMS, C_TEXDIMS+numTexgen-1);
		for (unsigned int i = 0; i < numTexgen; ++i)
		{
			out.Write("\tint2 fixpoint_uv%d = iround(", i);
			// optional perspective divides
			uid_data.texMtxInfo_n_projection |= xfmem.texMtxInfo[i].projection << i;
			if (xfmem.texMtxInfo[i].projection == XF_TEXPROJ_STQ)
			{
				out.Write("(uv%d.z == 0.0 ? uv%d.xy : uv%d.xy / uv%d.z)", i, i, i, i);
			}
			else
			{
				out.Write("uv%d.xy", i);
			}
			out.Write(" * " I_TEXDIMS"[%d].zw * 128.0);\n\n", i);
			// TODO: S24 overflows here?
		}
	}

	// indirect texture map lookup
	// Build a bitmask of the indirect stages that are referenced by an active TEV stage.
	int nIndirectStagesUsed = 0;
	if (bpmem.genMode.numindstages > 0)
	{
		for (unsigned int i = 0; i < numStages; ++i)
		{
			if (bpmem.tevind[i].IsActive() && bpmem.tevind[i].bt < bpmem.genMode.numindstages)
				nIndirectStagesUsed |= 1 << bpmem.tevind[i].bt;
		}
	}

	uid_data.nIndirectStagesUsed = nIndirectStagesUsed;
	for (u32 i = 0; i < bpmem.genMode.numindstages; ++i)
	{
		if (nIndirectStagesUsed & (1 << i))
		{
			unsigned int texcoord = bpmem.tevindref.getTexCoord(i);
			unsigned int texmap = bpmem.tevindref.getTexMap(i);

			uid_data.SetTevindrefValues(i, texcoord, texmap);
			if (texcoord < numTexgen)
			{
				out.SetConstantsUsed(C_INDTEXSCALE+i/2,C_INDTEXSCALE+i/2);
				// Two indirect stages share one constant: even stages use .xy, odd stages .zw
				out.Write("\ttempcoord = fixpoint_uv%d >> " I_INDTEXSCALE"[%d].%s;\n", texcoord, i / 2, (i & 1) ? "zw" : "xy");
			}
			else
				out.Write("\ttempcoord = int2(0, 0);\n");

			out.Write("\tint3 iindtex%d = ", i);
			SampleTexture<T>(out, "(float2(tempcoord)/128.0)", "abg", texmap, ApiType);
		}
	}

	// Uid fields for BuildSwapModeTable are set in WriteStage
	// Each table entry is a four character swizzle string built from two tevksel registers.
	char swapModeTable[4][5];
	const char* swapColors = "rgba";
	for (int i = 0; i < 4; i++)
	{
		swapModeTable[i][0] = swapColors[bpmem.tevksel[i*2].swap1];
		swapModeTable[i][1] = swapColors[bpmem.tevksel[i*2].swap2];
		swapModeTable[i][2] = swapColors[bpmem.tevksel[i*2+1].swap1];
		swapModeTable[i][3] = swapColors[bpmem.tevksel[i*2+1].swap2];
		swapModeTable[i][4] = '\0';
	}

	for (unsigned int i = 0; i < numStages; i++)
		WriteStage<T>(out, uid_data, i, ApiType, swapModeTable); // build the equation for this stage

	// num_values covers the whole uid struct with pixel lighting enabled, otherwise
	// only the bytes up to the end of the stage hashes that are in use.
#define MY_STRUCT_OFFSET(str,elem) ((u32)((u64)&(str).elem-(u64)&(str)))
	bool enable_pl = g_ActiveConfig.bEnablePixelLighting;
	uid_data.num_values = (enable_pl) ? sizeof(uid_data) : MY_STRUCT_OFFSET(uid_data,stagehash[numStages]);


	if (numStages)
	{
		// The results of the last texenv stage are put onto the screen,
		// regardless of the used destination register
		if (bpmem.combiners[numStages - 1].colorC.dest != 0)
		{
			out.Write("\tprev.rgb = %s;\n", tevCOutputTable[bpmem.combiners[numStages - 1].colorC.dest]);
		}
		if (bpmem.combiners[numStages - 1].alphaC.dest != 0)
		{
			out.Write("\tprev.a = %s;\n", tevAOutputTable[bpmem.combiners[numStages - 1].alphaC.dest]);
		}
	}
	out.Write("\tprev = prev & 255;\n");

	AlphaTest::TEST_RESULT Pretest = bpmem.alpha_test.TestResult();
	uid_data.Pretest = Pretest;

	// NOTE: Fragment may not be discarded if alpha test always fails and early depth test is enabled
	// (in this case we need to write a depth value if depth test passes regardless of the alpha testing result)
	if (Pretest == AlphaTest::UNDETERMINED || (Pretest == AlphaTest::FAIL && bpmem.UseLateDepthTest()))
		WriteAlphaTest<T>(out, uid_data, ApiType, dstAlphaMode, per_pixel_depth);

	// FastDepth means to trust the depth generated in perspective division.
	// It should be correct, but it seems not to be as accurate as required. TODO: Find out why!
	// For disabled FastDepth we just calculate the depth value again.
	// The performance impact of this additional calculation doesn't matter, but it prevents
	// the host GPU driver from performing any early depth test optimizations.
	if (g_ActiveConfig.bFastDepthCalc)
		out.Write("\tint zCoord = iround(rawpos.z * float(0xFFFFFF));\n");
	else
	{
		out.SetConstantsUsed(C_ZBIAS+1, C_ZBIAS+1);
		// the screen space depth value = far z + (clip z / clip w) * z range
		out.Write("\tint zCoord = " I_ZBIAS"[1].x + iround((clipPos.z / clipPos.w) * float(" I_ZBIAS"[1].y));\n");
	}

	// depth texture can safely be ignored if the result won't be written to the depth buffer (early_ztest) and isn't used for fog either
	const bool skip_ztexture = !per_pixel_depth && !bpmem.fog.c_proj_fsel.fsel;

	uid_data.ztex_op = bpmem.ztex2.op;
	uid_data.per_pixel_depth = per_pixel_depth;
	uid_data.forced_early_z = forced_early_z;
	uid_data.fast_depth_calc = g_ActiveConfig.bFastDepthCalc;
	uid_data.early_ztest = bpmem.UseEarlyDepthTest();
	uid_data.fog_fsel = bpmem.fog.c_proj_fsel.fsel;

	// Note: z-textures are not written to depth buffer if early depth test is used
	if (per_pixel_depth && bpmem.UseEarlyDepthTest())
		out.Write("\tdepth = float(zCoord) / float(0xFFFFFF);\n");

	// Note: depth texture output is only written to depth buffer if late depth test is used
	// theoretical final depth value is used for fog calculation, though, so we have to emulate ztextures anyway
	if (bpmem.ztex2.op != ZTEXTURE_DISABLE && !skip_ztexture)
	{
		// use the texture input of the last texture stage (textemp), hopefully this has been read and is in correct format...
		out.SetConstantsUsed(C_ZBIAS, C_ZBIAS+1);
		out.Write("\tzCoord = idot(" I_ZBIAS"[0].xyzw, textemp.xyzw) + " I_ZBIAS"[1].w %s;\n",
									(bpmem.ztex2.op == ZTEXTURE_ADD) ? "+ zCoord" : "");
		out.Write("\tzCoord = zCoord & 0xFFFFFF;\n");
	}

	if (per_pixel_depth && bpmem.UseLateDepthTest())
		out.Write("\tdepth = float(zCoord) / float(0xFFFFFF);\n");

	if (dstAlphaMode == DSTALPHA_ALPHA_PASS)
	{
		out.SetConstantsUsed(C_ALPHA, C_ALPHA);
		out.Write("\tocol0 = float4(float3(prev.rgb), float(" I_ALPHA".a)) / 255.0;\n");
	}
	else
	{
		WriteFog<T>(out, uid_data);
		out.Write("\tocol0 = float4(prev) / 255.0;\n");
	}

	// Use dual-source color blending to perform dst alpha in a single pass
	if (dstAlphaMode == DSTALPHA_DUAL_SOURCE_BLEND)
	{
		out.SetConstantsUsed(C_ALPHA, C_ALPHA);

		// Colors will be blended against the alpha from ocol1 and
		// the alpha from ocol0 will be written to the framebuffer.
		out.Write("\tocol1 = float4(prev) / 255.0;\n");
		out.Write("\tocol0.a = float(" I_ALPHA".a) / 255.0;\n");
	}

	out.Write("}\n");

	if (is_writing_shadercode)
	{
		// The canary written at the start must still be intact, otherwise the
		// generated source ran past the end of the buffer.
		if (text[sizeof(text) - 1] != 0x7C)
			PanicAlert("PixelShader generator - buffer too small, canary has been eaten!");

#ifndef ANDROID
		// Only undo the locale switch if it actually happened.
		if (locale != nullptr)
		{
			uselocale(old_locale); // restore locale
			freelocale(locale);
		}
#endif
	}
}