// PixelShaderGen.cpp
  1. // Copyright 2008 Dolphin Emulator Project
  2. // Licensed under GPLv2+
  3. // Refer to the license.txt file included.
  4. #include <cassert>
  5. #include <cmath>
  6. #include <cstdio>
  7. #include "VideoCommon/BoundingBox.h"
  8. #include "VideoCommon/BPMemory.h"
  9. #include "VideoCommon/ConstantManager.h"
  10. #include "VideoCommon/DriverDetails.h"
  11. #include "VideoCommon/LightingShaderGen.h"
  12. #include "VideoCommon/NativeVertexFormat.h"
  13. #include "VideoCommon/PixelShaderGen.h"
  14. #include "VideoCommon/VertexShaderGen.h"
  15. #include "VideoCommon/VideoConfig.h"
  16. #include "VideoCommon/XFMemory.h" // for texture projection mode
  17. static const char *tevKSelTableC[] =
  18. {
  19. "255,255,255", // 1 = 0x00
  20. "223,223,223", // 7_8 = 0x01
  21. "191,191,191", // 3_4 = 0x02
  22. "159,159,159", // 5_8 = 0x03
  23. "128,128,128", // 1_2 = 0x04
  24. "96,96,96", // 3_8 = 0x05
  25. "64,64,64", // 1_4 = 0x06
  26. "32,32,32", // 1_8 = 0x07
  27. "0,0,0", // INVALID = 0x08
  28. "0,0,0", // INVALID = 0x09
  29. "0,0,0", // INVALID = 0x0a
  30. "0,0,0", // INVALID = 0x0b
  31. I_KCOLORS"[0].rgb", // K0 = 0x0C
  32. I_KCOLORS"[1].rgb", // K1 = 0x0D
  33. I_KCOLORS"[2].rgb", // K2 = 0x0E
  34. I_KCOLORS"[3].rgb", // K3 = 0x0F
  35. I_KCOLORS"[0].rrr", // K0_R = 0x10
  36. I_KCOLORS"[1].rrr", // K1_R = 0x11
  37. I_KCOLORS"[2].rrr", // K2_R = 0x12
  38. I_KCOLORS"[3].rrr", // K3_R = 0x13
  39. I_KCOLORS"[0].ggg", // K0_G = 0x14
  40. I_KCOLORS"[1].ggg", // K1_G = 0x15
  41. I_KCOLORS"[2].ggg", // K2_G = 0x16
  42. I_KCOLORS"[3].ggg", // K3_G = 0x17
  43. I_KCOLORS"[0].bbb", // K0_B = 0x18
  44. I_KCOLORS"[1].bbb", // K1_B = 0x19
  45. I_KCOLORS"[2].bbb", // K2_B = 0x1A
  46. I_KCOLORS"[3].bbb", // K3_B = 0x1B
  47. I_KCOLORS"[0].aaa", // K0_A = 0x1C
  48. I_KCOLORS"[1].aaa", // K1_A = 0x1D
  49. I_KCOLORS"[2].aaa", // K2_A = 0x1E
  50. I_KCOLORS"[3].aaa", // K3_A = 0x1F
  51. };
  52. static const char *tevKSelTableA[] =
  53. {
  54. "255", // 1 = 0x00
  55. "223", // 7_8 = 0x01
  56. "191", // 3_4 = 0x02
  57. "159", // 5_8 = 0x03
  58. "128", // 1_2 = 0x04
  59. "96", // 3_8 = 0x05
  60. "64", // 1_4 = 0x06
  61. "32", // 1_8 = 0x07
  62. "0", // INVALID = 0x08
  63. "0", // INVALID = 0x09
  64. "0", // INVALID = 0x0a
  65. "0", // INVALID = 0x0b
  66. "0", // INVALID = 0x0c
  67. "0", // INVALID = 0x0d
  68. "0", // INVALID = 0x0e
  69. "0", // INVALID = 0x0f
  70. I_KCOLORS"[0].r", // K0_R = 0x10
  71. I_KCOLORS"[1].r", // K1_R = 0x11
  72. I_KCOLORS"[2].r", // K2_R = 0x12
  73. I_KCOLORS"[3].r", // K3_R = 0x13
  74. I_KCOLORS"[0].g", // K0_G = 0x14
  75. I_KCOLORS"[1].g", // K1_G = 0x15
  76. I_KCOLORS"[2].g", // K2_G = 0x16
  77. I_KCOLORS"[3].g", // K3_G = 0x17
  78. I_KCOLORS"[0].b", // K0_B = 0x18
  79. I_KCOLORS"[1].b", // K1_B = 0x19
  80. I_KCOLORS"[2].b", // K2_B = 0x1A
  81. I_KCOLORS"[3].b", // K3_B = 0x1B
  82. I_KCOLORS"[0].a", // K0_A = 0x1C
  83. I_KCOLORS"[1].a", // K1_A = 0x1D
  84. I_KCOLORS"[2].a", // K2_A = 0x1E
  85. I_KCOLORS"[3].a", // K3_A = 0x1F
  86. };
  87. static const char *tevCInputTable[] =
  88. {
  89. "prev.rgb", // CPREV,
  90. "prev.aaa", // APREV,
  91. "c0.rgb", // C0,
  92. "c0.aaa", // A0,
  93. "c1.rgb", // C1,
  94. "c1.aaa", // A1,
  95. "c2.rgb", // C2,
  96. "c2.aaa", // A2,
  97. "textemp.rgb", // TEXC,
  98. "textemp.aaa", // TEXA,
  99. "rastemp.rgb", // RASC,
  100. "rastemp.aaa", // RASA,
  101. "int3(255,255,255)", // ONE
  102. "int3(128,128,128)", // HALF
  103. "konsttemp.rgb", // KONST
  104. "int3(0,0,0)", // ZERO
  105. };
  106. static const char *tevAInputTable[] =
  107. {
  108. "prev.a", // APREV,
  109. "c0.a", // A0,
  110. "c1.a", // A1,
  111. "c2.a", // A2,
  112. "textemp.a", // TEXA,
  113. "rastemp.a", // RASA,
  114. "konsttemp.a", // KONST, (hw1 had quarter)
  115. "0", // ZERO
  116. };
  117. static const char *tevRasTable[] =
  118. {
  119. "iround(col0 * 255.0)",
  120. "iround(col1 * 255.0)",
  121. "ERROR13", //2
  122. "ERROR14", //3
  123. "ERROR15", //4
  124. "(int4(1, 1, 1, 1) * alphabump)", // bump alpha (0..248)
  125. "(int4(1, 1, 1, 1) * (alphabump | (alphabump >> 5)))", // normalized bump alpha (0..255)
  126. "int4(0, 0, 0, 0)", // zero
  127. };
  128. static const char *tevCOutputTable[] = { "prev.rgb", "c0.rgb", "c1.rgb", "c2.rgb" };
  129. static const char *tevAOutputTable[] = { "prev.a", "c0.a", "c1.a", "c2.a" };
  130. static char text[32768];
  131. template<class T> static inline void WriteStage(T& out, pixel_shader_uid_data* uid_data, int n, API_TYPE ApiType, const char swapModeTable[4][5]);
  132. template<class T> static inline void WriteTevRegular(T& out, const char* components, int bias, int op, int clamp, int shift);
  133. template<class T> static inline void SampleTexture(T& out, const char *texcoords, const char *texswap, int texmap, API_TYPE ApiType);
  134. template<class T> static inline void WriteAlphaTest(T& out, pixel_shader_uid_data* uid_data, API_TYPE ApiType,DSTALPHA_MODE dstAlphaMode, bool per_pixel_depth);
  135. template<class T> static inline void WriteFog(T& out, pixel_shader_uid_data* uid_data);
  136. template<class T>
  137. static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_TYPE ApiType, u32 components)
  138. {
  139. // Non-uid template parameters will write to the dummy data (=> gets optimized out)
  140. pixel_shader_uid_data dummy_data;
  141. pixel_shader_uid_data* uid_data = out.template GetUidData<pixel_shader_uid_data>();
  142. if (uid_data == nullptr)
  143. uid_data = &dummy_data;
  144. out.SetBuffer(text);
  145. const bool is_writing_shadercode = (out.GetBuffer() != nullptr);
  146. if (is_writing_shadercode)
  147. text[sizeof(text) - 1] = 0x7C; // canary
  148. unsigned int numStages = bpmem.genMode.numtevstages + 1;
  149. unsigned int numTexgen = bpmem.genMode.numtexgens;
  150. out.Write("//Pixel Shader for TEV stages\n");
  151. out.Write("//%i TEV stages, %i texgens, %i IND stages\n",
  152. numStages, numTexgen, bpmem.genMode.numindstages);
  153. uid_data->dstAlphaMode = dstAlphaMode;
  154. uid_data->genMode_numindstages = bpmem.genMode.numindstages;
  155. uid_data->genMode_numtevstages = bpmem.genMode.numtevstages;
  156. uid_data->genMode_numtexgens = bpmem.genMode.numtexgens;
  157. // dot product for integer vectors
  158. out.Write("int idot(int3 x, int3 y)\n"
  159. "{\n"
  160. "\tint3 tmp = x * y;\n"
  161. "\treturn tmp.x + tmp.y + tmp.z;\n"
  162. "}\n");
  163. out.Write("int idot(int4 x, int4 y)\n"
  164. "{\n"
  165. "\tint4 tmp = x * y;\n"
  166. "\treturn tmp.x + tmp.y + tmp.z + tmp.w;\n"
  167. "}\n\n");
  168. // rounding + casting to integer at once in a single function
  169. out.Write("int iround(float x) { return int (round(x)); }\n"
  170. "int2 iround(float2 x) { return int2(round(x)); }\n"
  171. "int3 iround(float3 x) { return int3(round(x)); }\n"
  172. "int4 iround(float4 x) { return int4(round(x)); }\n\n");
  173. out.Write("int itrunc(float x) { return int (trunc(x)); }\n"
  174. "int2 itrunc(float2 x) { return int2(trunc(x)); }\n"
  175. "int3 itrunc(float3 x) { return int3(trunc(x)); }\n"
  176. "int4 itrunc(float4 x) { return int4(trunc(x)); }\n\n");
  177. if (DriverDetails::HasBug(DriverDetails::BUG_BROKENIVECSHIFTS))
  178. {
  179. // Add functions to do shifts on scalars and ivecs.
  180. // These functions all have the same name to enable them to be used no matter what code is generated.
  181. // For example: tev color op code uses .rgb as a swizzle, but alpha code only uses .a.
  182. out.Write("int ilshift(int a, int b) { return a << b; }\n"
  183. "int irshift(int a, int b) { return a >> b; }\n"
  184. "int2 ilshift(int2 a, int2 b) { return int2(a.x << b.x, a.y << b.y); }\n"
  185. "int2 ilshift(int2 a, int b) { return int2(a.x << b, a.y << b); }\n"
  186. "int2 irshift(int2 a, int2 b) { return int2(a.x >> b.x, a.y >> b.y); }\n"
  187. "int2 irshift(int2 a, int b) { return int2(a.x >> b, a.y >> b); }\n"
  188. "int3 ilshift(int3 a, int3 b) { return int3(a.x << b.x, a.y << b.y, a.z << b.z); }\n"
  189. "int3 ilshift(int3 a, int b) { return int3(a.x << b, a.y << b, a.z << b); }\n"
  190. "int3 irshift(int3 a, int3 b) { return int3(a.x >> b.x, a.y >> b.y, a.z >> b.z); }\n"
  191. "int3 irshift(int3 a, int b) { return int3(a.x >> b, a.y >> b, a.z >> b); }\n"
  192. "int4 ilshift(int4 a, int4 b) { return int4(a.x << b.x, a.y << b.y, a.z << b.z, a.w << b.w); }\n"
  193. "int4 ilshift(int4 a, int b) { return int4(a.x << b, a.y << b, a.z << b, a.w << b); }\n"
  194. "int4 irshift(int4 a, int4 b) { return int4(a.x >> b.x, a.y >> b.y, a.z >> b.z, a.w >> b.w); }\n"
  195. "int4 irshift(int4 a, int b) { return int4(a.x >> b, a.y >> b, a.z >> b, a.w >> b); }\n\n");
  196. }
  197. if (ApiType == API_OPENGL)
  198. {
  199. // Declare samplers
  200. for (int i = 0; i < 8; ++i)
  201. out.Write("SAMPLER_BINDING(%d) uniform sampler2DArray samp%d;\n", i, i);
  202. }
  203. else // D3D
  204. {
  205. // Declare samplers
  206. for (int i = 0; i < 8; ++i)
  207. out.Write("sampler samp%d : register(s%d);\n", i, i);
  208. out.Write("\n");
  209. for (int i = 0; i < 8; ++i)
  210. out.Write("Texture2DArray Tex%d : register(t%d);\n", i, i);
  211. }
  212. out.Write("\n");
  213. if (ApiType == API_OPENGL)
  214. {
  215. out.Write("layout(std140%s) uniform PSBlock {\n", g_ActiveConfig.backend_info.bSupportsBindingLayout ? ", binding = 1" : "");
  216. }
  217. else
  218. {
  219. out.Write("cbuffer PSBlock : register(b0) {\n");
  220. }
  221. out.Write(
  222. "\tint4 " I_COLORS"[4];\n"
  223. "\tint4 " I_KCOLORS"[4];\n"
  224. "\tint4 " I_ALPHA";\n"
  225. "\tfloat4 " I_TEXDIMS"[8];\n"
  226. "\tint4 " I_ZBIAS"[2];\n"
  227. "\tint4 " I_INDTEXSCALE"[2];\n"
  228. "\tint4 " I_INDTEXMTX"[6];\n"
  229. "\tint4 " I_FOGCOLOR";\n"
  230. "\tint4 " I_FOGI";\n"
  231. "\tfloat4 " I_FOGF"[2];\n"
  232. "\tfloat4 " I_ZSLOPE";\n"
  233. "\tfloat4 " I_EFBSCALE";\n"
  234. "};\n");
  235. if (g_ActiveConfig.bEnablePixelLighting)
  236. {
  237. out.Write("%s", s_lighting_struct);
  238. if (ApiType == API_OPENGL)
  239. {
  240. out.Write("layout(std140%s) uniform VSBlock {\n", g_ActiveConfig.backend_info.bSupportsBindingLayout ? ", binding = 2" : "");
  241. }
  242. else
  243. {
  244. out.Write("cbuffer VSBlock : register(b1) {\n");
  245. }
  246. out.Write(s_shader_uniforms);
  247. out.Write("};\n");
  248. }
  249. if (g_ActiveConfig.backend_info.bSupportsBBox && g_ActiveConfig.bBBoxEnable)
  250. {
  251. if (ApiType == API_OPENGL)
  252. {
  253. out.Write(
  254. "layout(std140, binding = 3) buffer BBox {\n"
  255. "\tint4 bbox_data;\n"
  256. "};\n"
  257. );
  258. }
  259. else
  260. {
  261. out.Write(
  262. "globallycoherent RWBuffer<int> bbox_data : register(u2);\n"
  263. );
  264. }
  265. }
  266. out.Write("struct VS_OUTPUT {\n");
  267. GenerateVSOutputMembers<T>(out, ApiType);
  268. out.Write("};\n");
  269. const bool forced_early_z = g_ActiveConfig.backend_info.bSupportsEarlyZ && bpmem.UseEarlyDepthTest()
  270. && (g_ActiveConfig.bFastDepthCalc || bpmem.alpha_test.TestResult() == AlphaTest::UNDETERMINED)
  271. // We can't allow early_ztest for zfreeze because depth is overridden per-pixel.
  272. // This means it's impossible for zcomploc to be emulated on a zfrozen polygon.
  273. && !(bpmem.zmode.testenable && bpmem.genMode.zfreeze);
  274. const bool per_pixel_depth = (bpmem.ztex2.op != ZTEXTURE_DISABLE && bpmem.UseLateDepthTest())
  275. || (!g_ActiveConfig.bFastDepthCalc && bpmem.zmode.testenable && !forced_early_z)
  276. || (bpmem.zmode.testenable && bpmem.genMode.zfreeze);
  277. if (forced_early_z)
  278. {
  279. // Zcomploc (aka early_ztest) is a way to control whether depth test is done before
  280. // or after texturing and alpha test. PC graphics APIs used to provide no way to emulate
  281. // this feature properly until 2012: Depth tests were always done after alpha testing.
  282. // Most importantly, it was not possible to write to the depth buffer without also writing
  283. // a color value (unless color writing was disabled altogether).
  284. // OpenGL has a flag which allows the driver to still update the depth buffer if alpha
  285. // test fails. The driver isn't required to do this, but I (degasus) assume all of them do
  286. // because it's the much faster code path for the GPU.
  287. // D3D11 also has a way to force the driver to enable early-z, so we're fine here.
  288. if(ApiType == API_OPENGL)
  289. {
  290. out.Write("layout(early_fragment_tests) in;\n");
  291. }
  292. else
  293. {
  294. out.Write("[earlydepthstencil]\n");
  295. }
  296. }
  297. else if (bpmem.UseEarlyDepthTest() && (g_ActiveConfig.bFastDepthCalc || bpmem.alpha_test.TestResult() == AlphaTest::UNDETERMINED) && is_writing_shadercode)
  298. {
  299. static bool warn_once = true;
  300. if (warn_once)
  301. WARN_LOG(VIDEO, "Early z test enabled but not possible to emulate with current configuration. Make sure to enable fast depth calculations. If this message still shows up your hardware isn't able to emulate the feature properly (a GPU with D3D 11.0 / OGL 4.2 support is required).");
  302. warn_once = false;
  303. }
  304. if (ApiType == API_OPENGL)
  305. {
  306. out.Write("out vec4 ocol0;\n");
  307. if (dstAlphaMode == DSTALPHA_DUAL_SOURCE_BLEND)
  308. out.Write("out vec4 ocol1;\n");
  309. if (per_pixel_depth)
  310. out.Write("#define depth gl_FragDepth\n");
  311. // We use the flag "centroid" to fix some MSAA rendering bugs. With MSAA, the
  312. // pixel shader will be executed for each pixel which has at least one passed sample.
  313. // So there may be rendered pixels where the center of the pixel isn't in the primitive.
  314. // As the pixel shader usually renders at the center of the pixel, this position may be
  315. // outside the primitive. This will lead to sampling outside the texture, sign changes, ...
  316. // As a workaround, we interpolate at the centroid of the coveraged pixel, which
  317. // is always inside the primitive.
  318. // Without MSAA, this flag is defined to have no effect.
  319. uid_data->stereo = g_ActiveConfig.iStereoMode > 0;
  320. if (g_ActiveConfig.backend_info.bSupportsGeometryShaders)
  321. {
  322. out.Write("in VertexData {\n");
  323. GenerateVSOutputMembers<T>(out, ApiType, g_ActiveConfig.backend_info.bSupportsBindingLayout ? "centroid" : "centroid in");
  324. if (g_ActiveConfig.iStereoMode > 0)
  325. out.Write("\tflat int layer;\n");
  326. out.Write("};\n");
  327. }
  328. else
  329. {
  330. out.Write("centroid in float4 colors_0;\n");
  331. out.Write("centroid in float4 colors_1;\n");
  332. // compute window position if needed because binding semantic WPOS is not widely supported
  333. // Let's set up attributes
  334. for (unsigned int i = 0; i < numTexgen; ++i)
  335. {
  336. out.Write("centroid in float3 uv%d;\n", i);
  337. }
  338. out.Write("centroid in float4 clipPos;\n");
  339. if (g_ActiveConfig.bEnablePixelLighting)
  340. {
  341. out.Write("centroid in float3 Normal;\n");
  342. out.Write("centroid in float3 WorldPos;\n");
  343. }
  344. }
  345. out.Write("void main()\n{\n");
  346. if (g_ActiveConfig.backend_info.bSupportsGeometryShaders)
  347. {
  348. for (unsigned int i = 0; i < numTexgen; ++i)
  349. out.Write("\tfloat3 uv%d = tex%d;\n", i, i);
  350. }
  351. out.Write("\tfloat4 rawpos = gl_FragCoord;\n");
  352. }
  353. else // D3D
  354. {
  355. out.Write("void main(\n");
  356. out.Write(" out float4 ocol0 : SV_Target0,%s%s\n in float4 rawpos : SV_Position,\n",
  357. dstAlphaMode == DSTALPHA_DUAL_SOURCE_BLEND ? "\n out float4 ocol1 : SV_Target1," : "",
  358. per_pixel_depth ? "\n out float depth : SV_Depth," : "");
  359. out.Write(" in centroid float4 colors_0 : COLOR0,\n");
  360. out.Write(" in centroid float4 colors_1 : COLOR1\n");
  361. // compute window position if needed because binding semantic WPOS is not widely supported
  362. for (unsigned int i = 0; i < numTexgen; ++i)
  363. out.Write(",\n in centroid float3 uv%d : TEXCOORD%d", i, i);
  364. out.Write(",\n in centroid float4 clipPos : TEXCOORD%d", numTexgen);
  365. if (g_ActiveConfig.bEnablePixelLighting)
  366. {
  367. out.Write(",\n in centroid float3 Normal : TEXCOORD%d", numTexgen + 1);
  368. out.Write(",\n in centroid float3 WorldPos : TEXCOORD%d", numTexgen + 2);
  369. }
  370. uid_data->stereo = g_ActiveConfig.iStereoMode > 0;
  371. if (g_ActiveConfig.iStereoMode > 0)
  372. out.Write(",\n in uint layer : SV_RenderTargetArrayIndex\n");
  373. out.Write(" ) {\n");
  374. }
  375. out.Write("\tint4 c0 = " I_COLORS"[1], c1 = " I_COLORS"[2], c2 = " I_COLORS"[3], prev = " I_COLORS"[0];\n"
  376. "\tint4 rastemp = int4(0, 0, 0, 0), textemp = int4(0, 0, 0, 0), konsttemp = int4(0, 0, 0, 0);\n"
  377. "\tint3 comp16 = int3(1, 256, 0), comp24 = int3(1, 256, 256*256);\n"
  378. "\tint alphabump=0;\n"
  379. "\tint3 tevcoord=int3(0, 0, 0);\n"
  380. "\tint2 wrappedcoord=int2(0,0), tempcoord=int2(0,0);\n"
  381. "\tint4 tevin_a=int4(0,0,0,0),tevin_b=int4(0,0,0,0),tevin_c=int4(0,0,0,0),tevin_d=int4(0,0,0,0);\n\n"); // tev combiner inputs
  382. // On GLSL, input variables must not be assigned to.
  383. // This is why we declare these variables locally instead.
  384. out.Write("\tfloat4 col0 = colors_0;\n");
  385. out.Write("\tfloat4 col1 = colors_1;\n");
  386. if (g_ActiveConfig.bEnablePixelLighting)
  387. {
  388. out.Write("\tfloat3 _norm0 = normalize(Normal.xyz);\n\n");
  389. out.Write("\tfloat3 pos = WorldPos;\n");
  390. out.Write("\tint4 lacc;\n"
  391. "\tfloat3 ldir, h, cosAttn, distAttn;\n"
  392. "\tfloat dist, dist2, attn;\n");
  393. // TODO: Our current constant usage code isn't able to handle more than one buffer.
  394. // So we can't mark the VS constant as used here. But keep them here as reference.
  395. //out.SetConstantsUsed(C_PLIGHT_COLORS, C_PLIGHT_COLORS+7); // TODO: Can be optimized further
  396. //out.SetConstantsUsed(C_PLIGHTS, C_PLIGHTS+31); // TODO: Can be optimized further
  397. //out.SetConstantsUsed(C_PMATERIALS, C_PMATERIALS+3);
  398. uid_data->components = components;
  399. GenerateLightingShader<T>(out, uid_data->lighting, components, "colors_", "col");
  400. }
  401. // HACK to handle cases where the tex gen is not enabled
  402. if (numTexgen == 0)
  403. {
  404. out.Write("\tint2 fixpoint_uv0 = int2(0, 0);\n\n");
  405. }
  406. else
  407. {
  408. out.SetConstantsUsed(C_TEXDIMS, C_TEXDIMS+numTexgen-1);
  409. for (unsigned int i = 0; i < numTexgen; ++i)
  410. {
  411. out.Write("\tint2 fixpoint_uv%d = itrunc(", i);
  412. // optional perspective divides
  413. uid_data->texMtxInfo_n_projection |= xfmem.texMtxInfo[i].projection << i;
  414. if (xfmem.texMtxInfo[i].projection == XF_TEXPROJ_STQ)
  415. {
  416. out.Write("(uv%d.z == 0.0 ? uv%d.xy : uv%d.xy / uv%d.z)", i, i, i, i);
  417. }
  418. else
  419. {
  420. out.Write("uv%d.xy", i);
  421. }
  422. out.Write(" * " I_TEXDIMS"[%d].zw * 128.0);\n", i);
  423. // TODO: S24 overflows here?
  424. }
  425. }
  426. // indirect texture map lookup
  427. int nIndirectStagesUsed = 0;
  428. if (bpmem.genMode.numindstages > 0)
  429. {
  430. for (unsigned int i = 0; i < numStages; ++i)
  431. {
  432. if (bpmem.tevind[i].IsActive() && bpmem.tevind[i].bt < bpmem.genMode.numindstages)
  433. nIndirectStagesUsed |= 1 << bpmem.tevind[i].bt;
  434. }
  435. }
  436. uid_data->nIndirectStagesUsed = nIndirectStagesUsed;
  437. for (u32 i = 0; i < bpmem.genMode.numindstages; ++i)
  438. {
  439. if (nIndirectStagesUsed & (1 << i))
  440. {
  441. unsigned int texcoord = bpmem.tevindref.getTexCoord(i);
  442. unsigned int texmap = bpmem.tevindref.getTexMap(i);
  443. uid_data->SetTevindrefValues(i, texcoord, texmap);
  444. if (texcoord < numTexgen)
  445. {
  446. out.SetConstantsUsed(C_INDTEXSCALE+i/2,C_INDTEXSCALE+i/2);
  447. if (DriverDetails::HasBug(DriverDetails::BUG_BROKENIVECSHIFTS))
  448. out.Write("\ttempcoord = irshift(fixpoint_uv%d, " I_INDTEXSCALE"[%d].%s);\n", texcoord, i / 2, (i & 1) ? "zw" : "xy");
  449. else
  450. out.Write("\ttempcoord = fixpoint_uv%d >> " I_INDTEXSCALE"[%d].%s;\n", texcoord, i / 2, (i & 1) ? "zw" : "xy");
  451. }
  452. else
  453. out.Write("\ttempcoord = int2(0, 0);\n");
  454. out.Write("\tint3 iindtex%d = ", i);
  455. SampleTexture<T>(out, "(float2(tempcoord)/128.0)", "abg", texmap, ApiType);
  456. }
  457. }
  458. // Uid fields for BuildSwapModeTable are set in WriteStage
  459. char swapModeTable[4][5];
  460. const char* swapColors = "rgba";
  461. for (int i = 0; i < 4; i++)
  462. {
  463. swapModeTable[i][0] = swapColors[bpmem.tevksel[i*2].swap1];
  464. swapModeTable[i][1] = swapColors[bpmem.tevksel[i*2].swap2];
  465. swapModeTable[i][2] = swapColors[bpmem.tevksel[i*2+1].swap1];
  466. swapModeTable[i][3] = swapColors[bpmem.tevksel[i*2+1].swap2];
  467. swapModeTable[i][4] = '\0';
  468. }
  469. for (unsigned int i = 0; i < numStages; i++)
  470. WriteStage<T>(out, uid_data, i, ApiType, swapModeTable); // build the equation for this stage
  471. #define MY_STRUCT_OFFSET(str,elem) ((u32)((u64)&(str).elem-(u64)&(str)))
  472. bool enable_pl = g_ActiveConfig.bEnablePixelLighting;
  473. uid_data->num_values = (enable_pl) ? sizeof(*uid_data) : MY_STRUCT_OFFSET(*uid_data,stagehash[numStages]);
  474. if (numStages)
  475. {
  476. // The results of the last texenv stage are put onto the screen,
  477. // regardless of the used destination register
  478. if (bpmem.combiners[numStages - 1].colorC.dest != 0)
  479. {
  480. out.Write("\tprev.rgb = %s;\n", tevCOutputTable[bpmem.combiners[numStages - 1].colorC.dest]);
  481. }
  482. if (bpmem.combiners[numStages - 1].alphaC.dest != 0)
  483. {
  484. out.Write("\tprev.a = %s;\n", tevAOutputTable[bpmem.combiners[numStages - 1].alphaC.dest]);
  485. }
  486. }
  487. out.Write("\tprev = prev & 255;\n");
  488. AlphaTest::TEST_RESULT Pretest = bpmem.alpha_test.TestResult();
  489. uid_data->Pretest = Pretest;
  490. // NOTE: Fragment may not be discarded if alpha test always fails and early depth test is enabled
  491. // (in this case we need to write a depth value if depth test passes regardless of the alpha testing result)
  492. if (Pretest == AlphaTest::UNDETERMINED || (Pretest == AlphaTest::FAIL && bpmem.UseLateDepthTest()))
  493. WriteAlphaTest<T>(out, uid_data, ApiType, dstAlphaMode, per_pixel_depth);
  494. if (bpmem.genMode.zfreeze)
  495. {
  496. out.SetConstantsUsed(C_ZSLOPE, C_ZSLOPE);
  497. out.SetConstantsUsed(C_EFBSCALE, C_EFBSCALE);
  498. out.Write("\tfloat2 screenpos = rawpos.xy * " I_EFBSCALE".xy;\n");
  499. // Opengl has reversed vertical screenspace coordiantes
  500. if (ApiType == API_OPENGL)
  501. out.Write("\tscreenpos.y = %i - screenpos.y;\n", EFB_HEIGHT);
  502. out.Write("\tint zCoord = int(" I_ZSLOPE".z + " I_ZSLOPE".x * screenpos.x + " I_ZSLOPE".y * screenpos.y);\n");
  503. }
  504. else if (!g_ActiveConfig.bFastDepthCalc)
  505. {
  506. // FastDepth means to trust the depth generated in perspective division.
  507. // It should be correct, but it seems not to be as accurate as required. TODO: Find out why!
  508. // For disabled FastDepth we just calculate the depth value again.
  509. // The performance impact of this additional calculation doesn't matter, but it prevents
  510. // the host GPU driver from performing any early depth test optimizations.
  511. out.SetConstantsUsed(C_ZBIAS+1, C_ZBIAS+1);
  512. // the screen space depth value = far z + (clip z / clip w) * z range
  513. out.Write("\tint zCoord = " I_ZBIAS"[1].x + int((clipPos.z / clipPos.w) * float(" I_ZBIAS"[1].y));\n");
  514. }
  515. else
  516. {
  517. if (ApiType == API_D3D)
  518. out.Write("\tint zCoord = int((1.0 - rawpos.z) * 16777216.0);\n");
  519. else
  520. out.Write("\tint zCoord = int(rawpos.z * 16777216.0);\n");
  521. }
  522. out.Write("\tzCoord = clamp(zCoord, 0, 0xFFFFFF);\n");
  523. // depth texture can safely be ignored if the result won't be written to the depth buffer (early_ztest) and isn't used for fog either
  524. const bool skip_ztexture = !per_pixel_depth && !bpmem.fog.c_proj_fsel.fsel;
  525. uid_data->ztex_op = bpmem.ztex2.op;
  526. uid_data->per_pixel_depth = per_pixel_depth;
  527. uid_data->forced_early_z = forced_early_z;
  528. uid_data->fast_depth_calc = g_ActiveConfig.bFastDepthCalc;
  529. uid_data->early_ztest = bpmem.UseEarlyDepthTest();
  530. uid_data->fog_fsel = bpmem.fog.c_proj_fsel.fsel;
  531. uid_data->zfreeze = bpmem.genMode.zfreeze;
  532. // Note: z-textures are not written to depth buffer if early depth test is used
  533. if (per_pixel_depth && bpmem.UseEarlyDepthTest())
  534. {
  535. if (ApiType == API_D3D)
  536. out.Write("\tdepth = 1.0 - float(zCoord) / 16777216.0;\n");
  537. else
  538. out.Write("\tdepth = float(zCoord) / 16777216.0;\n");
  539. }
  540. // Note: depth texture output is only written to depth buffer if late depth test is used
  541. // theoretical final depth value is used for fog calculation, though, so we have to emulate ztextures anyway
  542. if (bpmem.ztex2.op != ZTEXTURE_DISABLE && !skip_ztexture)
  543. {
  544. // use the texture input of the last texture stage (textemp), hopefully this has been read and is in correct format...
  545. out.SetConstantsUsed(C_ZBIAS, C_ZBIAS+1);
  546. out.Write("\tzCoord = idot(" I_ZBIAS"[0].xyzw, textemp.xyzw) + " I_ZBIAS"[1].w %s;\n",
  547. (bpmem.ztex2.op == ZTEXTURE_ADD) ? "+ zCoord" : "");
  548. out.Write("\tzCoord = zCoord & 0xFFFFFF;\n");
  549. }
  550. if (per_pixel_depth && bpmem.UseLateDepthTest())
  551. {
  552. if (ApiType == API_D3D)
  553. out.Write("\tdepth = 1.0 - float(zCoord) / 16777216.0;\n");
  554. else
  555. out.Write("\tdepth = float(zCoord) / 16777216.0;\n");
  556. }
  557. if (dstAlphaMode == DSTALPHA_ALPHA_PASS)
  558. {
  559. out.SetConstantsUsed(C_ALPHA, C_ALPHA);
  560. out.Write("\tocol0 = float4(float3(prev.rgb), float(" I_ALPHA".a)) / 255.0;\n");
  561. }
  562. else
  563. {
  564. WriteFog<T>(out, uid_data);
  565. out.Write("\tocol0 = float4(prev) / 255.0;\n");
  566. }
  567. // Use dual-source color blending to perform dst alpha in a single pass
  568. if (dstAlphaMode == DSTALPHA_DUAL_SOURCE_BLEND)
  569. {
  570. out.SetConstantsUsed(C_ALPHA, C_ALPHA);
  571. // Colors will be blended against the alpha from ocol1 and
  572. // the alpha from ocol0 will be written to the framebuffer.
  573. out.Write("\tocol1 = float4(prev) / 255.0;\n");
  574. out.Write("\tocol0.a = float(" I_ALPHA".a) / 255.0;\n");
  575. }
  576. if (g_ActiveConfig.backend_info.bSupportsBBox && g_ActiveConfig.bBBoxEnable && BoundingBox::active)
  577. {
  578. uid_data->bounding_box = true;
  579. const char* atomic_op = ApiType == API_OPENGL ? "atomic" : "Interlocked";
  580. out.Write(
  581. "\tif(bbox_data[0] > int(rawpos.x)) %sMin(bbox_data[0], int(rawpos.x));\n"
  582. "\tif(bbox_data[1] < int(rawpos.x)) %sMax(bbox_data[1], int(rawpos.x));\n"
  583. "\tif(bbox_data[2] > int(rawpos.y)) %sMin(bbox_data[2], int(rawpos.y));\n"
  584. "\tif(bbox_data[3] < int(rawpos.y)) %sMax(bbox_data[3], int(rawpos.y));\n",
  585. atomic_op, atomic_op, atomic_op, atomic_op);
  586. }
  587. out.Write("}\n");
  588. if (is_writing_shadercode)
  589. {
  590. if (text[sizeof(text) - 1] != 0x7C)
  591. PanicAlert("PixelShader generator - buffer too small, canary has been eaten!");
  592. }
  593. }
  594. template<class T>
  595. static inline void WriteStage(T& out, pixel_shader_uid_data* uid_data, int n, API_TYPE ApiType, const char swapModeTable[4][5])
  596. {
  597. int texcoord = bpmem.tevorders[n/2].getTexCoord(n&1);
  598. bool bHasTexCoord = (u32)texcoord < bpmem.genMode.numtexgens;
  599. bool bHasIndStage = bpmem.tevind[n].bt < bpmem.genMode.numindstages;
  600. // HACK to handle cases where the tex gen is not enabled
  601. if (!bHasTexCoord)
  602. texcoord = 0;
  603. out.Write("\n\t// TEV stage %d\n", n);
  604. uid_data->stagehash[n].hasindstage = bHasIndStage;
  605. uid_data->stagehash[n].tevorders_texcoord = texcoord;
  606. if (bHasIndStage)
  607. {
  608. uid_data->stagehash[n].tevind = bpmem.tevind[n].hex & 0x7FFFFF;
  609. out.Write("\t// indirect op\n");
  610. // perform the indirect op on the incoming regular coordinates using iindtex%d as the offset coords
  611. if (bpmem.tevind[n].bs != ITBA_OFF)
  612. {
  613. const char *tevIndAlphaSel[] = {"", "x", "y", "z"};
  614. const char *tevIndAlphaMask[] = {"248", "224", "240", "248"}; // 0b11111000, 0b11100000, 0b11110000, 0b11111000
  615. out.Write("alphabump = iindtex%d.%s & %s;\n",
  616. bpmem.tevind[n].bt,
  617. tevIndAlphaSel[bpmem.tevind[n].bs],
  618. tevIndAlphaMask[bpmem.tevind[n].fmt]);
  619. }
  620. else
  621. {
  622. // TODO: Should we reset alphabump to 0 here?
  623. }
  624. if (bpmem.tevind[n].mid != 0)
  625. {
  626. // format
  627. const char *tevIndFmtMask[] = { "255", "31", "15", "7" };
  628. out.Write("\tint3 iindtevcrd%d = iindtex%d & %s;\n", n, bpmem.tevind[n].bt, tevIndFmtMask[bpmem.tevind[n].fmt]);
  629. // bias - TODO: Check if this needs to be this complicated..
  630. const char *tevIndBiasField[] = { "", "x", "y", "xy", "z", "xz", "yz", "xyz" }; // indexed by bias
  631. const char *tevIndBiasAdd[] = { "-128", "1", "1", "1" }; // indexed by fmt
  632. if (bpmem.tevind[n].bias == ITB_S || bpmem.tevind[n].bias == ITB_T || bpmem.tevind[n].bias == ITB_U)
  633. out.Write("\tiindtevcrd%d.%s += int(%s);\n", n, tevIndBiasField[bpmem.tevind[n].bias], tevIndBiasAdd[bpmem.tevind[n].fmt]);
  634. else if (bpmem.tevind[n].bias == ITB_ST || bpmem.tevind[n].bias == ITB_SU || bpmem.tevind[n].bias == ITB_TU)
  635. out.Write("\tiindtevcrd%d.%s += int2(%s, %s);\n", n, tevIndBiasField[bpmem.tevind[n].bias], tevIndBiasAdd[bpmem.tevind[n].fmt], tevIndBiasAdd[bpmem.tevind[n].fmt]);
  636. else if (bpmem.tevind[n].bias == ITB_STU)
  637. out.Write("\tiindtevcrd%d.%s += int3(%s, %s, %s);\n", n, tevIndBiasField[bpmem.tevind[n].bias], tevIndBiasAdd[bpmem.tevind[n].fmt], tevIndBiasAdd[bpmem.tevind[n].fmt], tevIndBiasAdd[bpmem.tevind[n].fmt]);
  638. // multiply by offset matrix and scale - calculations are likely to overflow badly,
  639. // yet it works out since we only care about the lower 23 bits (+1 sign bit) of the result
  640. if (bpmem.tevind[n].mid <= 3)
  641. {
  642. int mtxidx = 2*(bpmem.tevind[n].mid-1);
  643. out.SetConstantsUsed(C_INDTEXMTX+mtxidx, C_INDTEXMTX+mtxidx);
  644. if (DriverDetails::HasBug(DriverDetails::BUG_BROKENIVECSHIFTS))
  645. {
  646. out.Write("\tint2 indtevtrans%d = irshift(int2(idot(" I_INDTEXMTX"[%d].xyz, iindtevcrd%d), idot(" I_INDTEXMTX"[%d].xyz, iindtevcrd%d)), 3);\n", n, mtxidx, n, mtxidx+1, n);
  647. // TODO: should use a shader uid branch for this for better performance
  648. out.Write("\tif (" I_INDTEXMTX"[%d].w >= 0) indtevtrans%d = irshift(indtevtrans%d, " I_INDTEXMTX"[%d].w);\n", mtxidx, n, n, mtxidx);
  649. out.Write("\telse indtevtrans%d = ilshift(indtevtrans%d, -" I_INDTEXMTX"[%d].w);\n", n, n, mtxidx);
  650. }
  651. else
  652. {
  653. out.Write("\tint2 indtevtrans%d = int2(idot(" I_INDTEXMTX"[%d].xyz, iindtevcrd%d), idot(" I_INDTEXMTX"[%d].xyz, iindtevcrd%d)) >> 3;\n", n, mtxidx, n, mtxidx+1, n);
  654. // TODO: should use a shader uid branch for this for better performance
  655. out.Write("\tif (" I_INDTEXMTX"[%d].w >= 0) indtevtrans%d = indtevtrans%d >> " I_INDTEXMTX"[%d].w;\n", mtxidx, n, n, mtxidx);
  656. out.Write("\telse indtevtrans%d = indtevtrans%d << (-" I_INDTEXMTX"[%d].w);\n", n, n, mtxidx);
  657. }
  658. }
  659. else if (bpmem.tevind[n].mid <= 7 && bHasTexCoord)
  660. { // s matrix
  661. _assert_(bpmem.tevind[n].mid >= 5);
  662. int mtxidx = 2*(bpmem.tevind[n].mid-5);
  663. out.SetConstantsUsed(C_INDTEXMTX+mtxidx, C_INDTEXMTX+mtxidx);
  664. if (DriverDetails::HasBug(DriverDetails::BUG_BROKENIVECSHIFTS))
  665. {
  666. out.Write("\tint2 indtevtrans%d = irshift(int2(fixpoint_uv%d * iindtevcrd%d.xx), 8);\n", n, texcoord, n);
  667. out.Write("\tif (" I_INDTEXMTX"[%d].w >= 0) indtevtrans%d = irshift(indtevtrans%d, " I_INDTEXMTX"[%d].w);\n", mtxidx, n, n, mtxidx);
  668. out.Write("\telse indtevtrans%d = ilshift(indtevtrans%d, -" I_INDTEXMTX"[%d].w);\n", n, n, mtxidx);
  669. }
  670. else
  671. {
  672. out.Write("\tint2 indtevtrans%d = int2(fixpoint_uv%d * iindtevcrd%d.xx) >> 8;\n", n, texcoord, n);
  673. out.Write("\tif (" I_INDTEXMTX"[%d].w >= 0) indtevtrans%d = indtevtrans%d >> " I_INDTEXMTX"[%d].w;\n", mtxidx, n, n, mtxidx);
  674. out.Write("\telse indtevtrans%d = indtevtrans%d << (-" I_INDTEXMTX"[%d].w);\n", n, n, mtxidx);
  675. }
  676. }
  677. else if (bpmem.tevind[n].mid <= 11 && bHasTexCoord)
  678. { // t matrix
  679. _assert_(bpmem.tevind[n].mid >= 9);
  680. int mtxidx = 2*(bpmem.tevind[n].mid-9);
  681. out.SetConstantsUsed(C_INDTEXMTX+mtxidx, C_INDTEXMTX+mtxidx);
  682. if (DriverDetails::HasBug(DriverDetails::BUG_BROKENIVECSHIFTS))
  683. {
  684. out.Write("\tint2 indtevtrans%d = irshift(int2(fixpoint_uv%d * iindtevcrd%d.yy), 8);\n", n, texcoord, n);
  685. out.Write("\tif (" I_INDTEXMTX"[%d].w >= 0) indtevtrans%d = irshift(indtevtrans%d, " I_INDTEXMTX"[%d].w);\n", mtxidx, n, n, mtxidx);
  686. out.Write("\telse indtevtrans%d = ilshift(indtevtrans%d, -" I_INDTEXMTX"[%d].w);\n", n, n, mtxidx);
  687. }
  688. else
  689. {
  690. out.Write("\tint2 indtevtrans%d = int2(fixpoint_uv%d * iindtevcrd%d.yy) >> 8;\n", n, texcoord, n);
  691. out.Write("\tif (" I_INDTEXMTX"[%d].w >= 0) indtevtrans%d = indtevtrans%d >> " I_INDTEXMTX"[%d].w;\n", mtxidx, n, n, mtxidx);
  692. out.Write("\telse indtevtrans%d = indtevtrans%d << (-" I_INDTEXMTX"[%d].w);\n", n, n, mtxidx);
  693. }
  694. }
  695. else
  696. {
  697. out.Write("\tint2 indtevtrans%d = int2(0, 0);\n", n);
  698. }
  699. }
  700. else
  701. {
  702. out.Write("\tint2 indtevtrans%d = int2(0, 0);\n", n);
  703. }
  704. // ---------
  705. // Wrapping
  706. // ---------
  707. const char *tevIndWrapStart[] = {"0", "(256<<7)", "(128<<7)", "(64<<7)", "(32<<7)", "(16<<7)", "1" }; // TODO: Should the last one be 1 or (1<<7)?
  708. // wrap S
  709. if (bpmem.tevind[n].sw == ITW_OFF)
  710. out.Write("\twrappedcoord.x = fixpoint_uv%d.x;\n", texcoord);
  711. else if (bpmem.tevind[n].sw == ITW_0)
  712. out.Write("\twrappedcoord.x = 0;\n");
  713. else
  714. out.Write("\twrappedcoord.x = fixpoint_uv%d.x %% %s;\n", texcoord, tevIndWrapStart[bpmem.tevind[n].sw]);
  715. // wrap T
  716. if (bpmem.tevind[n].tw == ITW_OFF)
  717. out.Write("\twrappedcoord.y = fixpoint_uv%d.y;\n", texcoord);
  718. else if (bpmem.tevind[n].tw == ITW_0)
  719. out.Write("\twrappedcoord.y = 0;\n");
  720. else
  721. out.Write("\twrappedcoord.y = fixpoint_uv%d.y %% %s;\n", texcoord, tevIndWrapStart[bpmem.tevind[n].tw]);
  722. if (bpmem.tevind[n].fb_addprev) // add previous tevcoord
  723. out.Write("\ttevcoord.xy += wrappedcoord + indtevtrans%d;\n", n);
  724. else
  725. out.Write("\ttevcoord.xy = wrappedcoord + indtevtrans%d;\n", n);
  726. // Emulate s24 overflows
  727. if (DriverDetails::HasBug(DriverDetails::BUG_BROKENIVECSHIFTS))
  728. out.Write("\ttevcoord.xy = irshift(ilshift(tevcoord.xy, 8), 8);\n");
  729. else
  730. out.Write("\ttevcoord.xy = (tevcoord.xy << 8) >> 8;\n");
  731. }
  732. TevStageCombiner::ColorCombiner &cc = bpmem.combiners[n].colorC;
  733. TevStageCombiner::AlphaCombiner &ac = bpmem.combiners[n].alphaC;
  734. uid_data->stagehash[n].cc = cc.hex & 0xFFFFFF;
  735. uid_data->stagehash[n].ac = ac.hex & 0xFFFFF0; // Storing rswap and tswap later
  736. if (cc.a == TEVCOLORARG_RASA || cc.a == TEVCOLORARG_RASC ||
  737. cc.b == TEVCOLORARG_RASA || cc.b == TEVCOLORARG_RASC ||
  738. cc.c == TEVCOLORARG_RASA || cc.c == TEVCOLORARG_RASC ||
  739. cc.d == TEVCOLORARG_RASA || cc.d == TEVCOLORARG_RASC ||
  740. ac.a == TEVALPHAARG_RASA || ac.b == TEVALPHAARG_RASA ||
  741. ac.c == TEVALPHAARG_RASA || ac.d == TEVALPHAARG_RASA)
  742. {
  743. const int i = bpmem.combiners[n].alphaC.rswap;
  744. uid_data->stagehash[n].ac |= bpmem.combiners[n].alphaC.rswap;
  745. uid_data->stagehash[n].tevksel_swap1a = bpmem.tevksel[i*2].swap1;
  746. uid_data->stagehash[n].tevksel_swap2a = bpmem.tevksel[i*2].swap2;
  747. uid_data->stagehash[n].tevksel_swap1b = bpmem.tevksel[i*2+1].swap1;
  748. uid_data->stagehash[n].tevksel_swap2b = bpmem.tevksel[i*2+1].swap2;
  749. uid_data->stagehash[n].tevorders_colorchan = bpmem.tevorders[n / 2].getColorChan(n & 1);
  750. const char *rasswap = swapModeTable[bpmem.combiners[n].alphaC.rswap];
  751. out.Write("\trastemp = %s.%s;\n", tevRasTable[bpmem.tevorders[n / 2].getColorChan(n & 1)], rasswap);
  752. }
  753. uid_data->stagehash[n].tevorders_enable = bpmem.tevorders[n / 2].getEnable(n & 1);
  754. if (bpmem.tevorders[n/2].getEnable(n&1))
  755. {
  756. int texmap = bpmem.tevorders[n/2].getTexMap(n&1);
  757. if (!bHasIndStage)
  758. {
  759. // calc tevcord
  760. if (bHasTexCoord)
  761. out.Write("\ttevcoord.xy = fixpoint_uv%d;\n", texcoord);
  762. else
  763. out.Write("\ttevcoord.xy = int2(0, 0);\n");
  764. }
  765. const int i = bpmem.combiners[n].alphaC.tswap;
  766. uid_data->stagehash[n].ac |= bpmem.combiners[n].alphaC.tswap << 2;
  767. uid_data->stagehash[n].tevksel_swap1c = bpmem.tevksel[i*2].swap1;
  768. uid_data->stagehash[n].tevksel_swap2c = bpmem.tevksel[i*2].swap2;
  769. uid_data->stagehash[n].tevksel_swap1d = bpmem.tevksel[i*2+1].swap1;
  770. uid_data->stagehash[n].tevksel_swap2d = bpmem.tevksel[i*2+1].swap2;
  771. uid_data->stagehash[n].tevorders_texmap= bpmem.tevorders[n/2].getTexMap(n&1);
  772. const char *texswap = swapModeTable[bpmem.combiners[n].alphaC.tswap];
  773. uid_data->SetTevindrefTexmap(i, texmap);
  774. out.Write("\ttextemp = ");
  775. SampleTexture<T>(out, "(float2(tevcoord.xy)/128.0)", texswap, texmap, ApiType);
  776. }
  777. else
  778. {
  779. out.Write("\ttextemp = int4(255, 255, 255, 255);\n");
  780. }
  781. if (cc.a == TEVCOLORARG_KONST || cc.b == TEVCOLORARG_KONST ||
  782. cc.c == TEVCOLORARG_KONST || cc.d == TEVCOLORARG_KONST ||
  783. ac.a == TEVALPHAARG_KONST || ac.b == TEVALPHAARG_KONST ||
  784. ac.c == TEVALPHAARG_KONST || ac.d == TEVALPHAARG_KONST)
  785. {
  786. int kc = bpmem.tevksel[n / 2].getKC(n & 1);
  787. int ka = bpmem.tevksel[n / 2].getKA(n & 1);
  788. uid_data->stagehash[n].tevksel_kc = kc;
  789. uid_data->stagehash[n].tevksel_ka = ka;
  790. out.Write("\tkonsttemp = int4(%s, %s);\n", tevKSelTableC[kc], tevKSelTableA[ka]);
  791. if (kc > 7)
  792. out.SetConstantsUsed(C_KCOLORS+((kc-0xc)%4),C_KCOLORS+((kc-0xc)%4));
  793. if (ka > 7)
  794. out.SetConstantsUsed(C_KCOLORS+((ka-0xc)%4),C_KCOLORS+((ka-0xc)%4));
  795. }
  796. if (cc.d == TEVCOLORARG_C0 || cc.d == TEVCOLORARG_A0 || ac.d == TEVALPHAARG_A0)
  797. out.SetConstantsUsed(C_COLORS+1,C_COLORS+1);
  798. if (cc.d == TEVCOLORARG_C1 || cc.d == TEVCOLORARG_A1 || ac.d == TEVALPHAARG_A1)
  799. out.SetConstantsUsed(C_COLORS+2,C_COLORS+2);
  800. if (cc.d == TEVCOLORARG_C2 || cc.d == TEVCOLORARG_A2 || ac.d == TEVALPHAARG_A2)
  801. out.SetConstantsUsed(C_COLORS+3,C_COLORS+3);
  802. if (cc.dest >= GX_TEVREG0 && cc.dest <= GX_TEVREG2)
  803. out.SetConstantsUsed(C_COLORS+cc.dest, C_COLORS+cc.dest);
  804. if (ac.dest >= GX_TEVREG0 && ac.dest <= GX_TEVREG2)
  805. out.SetConstantsUsed(C_COLORS+ac.dest, C_COLORS+ac.dest);
  806. out.Write("\ttevin_a = int4(%s, %s)&255;\n", tevCInputTable[cc.a], tevAInputTable[ac.a]);
  807. out.Write("\ttevin_b = int4(%s, %s)&255;\n", tevCInputTable[cc.b], tevAInputTable[ac.b]);
  808. out.Write("\ttevin_c = int4(%s, %s)&255;\n", tevCInputTable[cc.c], tevAInputTable[ac.c]);
  809. out.Write("\ttevin_d = int4(%s, %s);\n", tevCInputTable[cc.d], tevAInputTable[ac.d]);
  810. out.Write("\t// color combine\n");
  811. out.Write("\t%s = clamp(", tevCOutputTable[cc.dest]);
  812. if (cc.bias != TevBias_COMPARE)
  813. {
  814. WriteTevRegular(out, "rgb", cc.bias, cc.op, cc.clamp, cc.shift);
  815. }
  816. else
  817. {
  818. const char *function_table[] =
  819. {
  820. "((tevin_a.r > tevin_b.r) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_R8_GT
  821. "((tevin_a.r == tevin_b.r) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_R8_EQ
  822. "((idot(tevin_a.rgb, comp16) > idot(tevin_b.rgb, comp16)) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_GR16_GT
  823. "((idot(tevin_a.rgb, comp16) == idot(tevin_b.rgb, comp16)) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_GR16_EQ
  824. "((idot(tevin_a.rgb, comp24) > idot(tevin_b.rgb, comp24)) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_BGR24_GT
  825. "((idot(tevin_a.rgb, comp24) == idot(tevin_b.rgb, comp24)) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_BGR24_EQ
  826. "(max(sign(tevin_a.rgb - tevin_b.rgb), int3(0,0,0)) * tevin_c.rgb)", // TEVCMP_RGB8_GT
  827. "((int3(1,1,1) - sign(abs(tevin_a.rgb - tevin_b.rgb))) * tevin_c.rgb)" // TEVCMP_RGB8_EQ
  828. };
  829. int mode = (cc.shift<<1)|cc.op;
  830. out.Write(" tevin_d.rgb + ");
  831. out.Write(function_table[mode]);
  832. }
  833. if (cc.clamp)
  834. out.Write(", int3(0,0,0), int3(255,255,255))");
  835. else
  836. out.Write(", int3(-1024,-1024,-1024), int3(1023,1023,1023))");
  837. out.Write(";\n");
  838. out.Write("\t// alpha combine\n");
  839. out.Write("\t%s = clamp(", tevAOutputTable[ac.dest]);
  840. if (ac.bias != TevBias_COMPARE)
  841. {
  842. WriteTevRegular(out, "a", ac.bias, ac.op, ac.clamp, ac.shift);
  843. }
  844. else
  845. {
  846. const char *function_table[] =
  847. {
  848. "((tevin_a.r > tevin_b.r) ? tevin_c.a : 0)", // TEVCMP_R8_GT
  849. "((tevin_a.r == tevin_b.r) ? tevin_c.a : 0)", // TEVCMP_R8_EQ
  850. "((idot(tevin_a.rgb, comp16) > idot(tevin_b.rgb, comp16)) ? tevin_c.a : 0)", // TEVCMP_GR16_GT
  851. "((idot(tevin_a.rgb, comp16) == idot(tevin_b.rgb, comp16)) ? tevin_c.a : 0)", // TEVCMP_GR16_EQ
  852. "((idot(tevin_a.rgb, comp24) > idot(tevin_b.rgb, comp24)) ? tevin_c.a : 0)", // TEVCMP_BGR24_GT
  853. "((idot(tevin_a.rgb, comp24) == idot(tevin_b.rgb, comp24)) ? tevin_c.a : 0)", // TEVCMP_BGR24_EQ
  854. "((tevin_a.a > tevin_b.a) ? tevin_c.a : 0)", // TEVCMP_A8_GT
  855. "((tevin_a.a == tevin_b.a) ? tevin_c.a : 0)" // TEVCMP_A8_EQ
  856. };
  857. int mode = (ac.shift<<1)|ac.op;
  858. out.Write(" tevin_d.a + ");
  859. out.Write(function_table[mode]);
  860. }
  861. if (ac.clamp)
  862. out.Write(", 0, 255)");
  863. else
  864. out.Write(", -1024, 1023)");
  865. out.Write(";\n");
  866. }
  867. template<class T>
  868. static inline void WriteTevRegular(T& out, const char* components, int bias, int op, int clamp, int shift)
  869. {
  870. const char *tevScaleTableLeft[] =
  871. {
  872. "", // SCALE_1
  873. " << 1", // SCALE_2
  874. " << 2", // SCALE_4
  875. "", // DIVIDE_2
  876. };
  877. const char *tevScaleTableRight[] =
  878. {
  879. "", // SCALE_1
  880. "", // SCALE_2
  881. "", // SCALE_4
  882. " >> 1", // DIVIDE_2
  883. };
  884. const char *tevLerpBias[] = // indexed by 2*op+(shift==3)
  885. {
  886. "",
  887. " + 128",
  888. "",
  889. " + 127",
  890. };
  891. const char *tevBiasTable[] =
  892. {
  893. "", // ZERO,
  894. " + 128", // ADDHALF,
  895. " - 128", // SUBHALF,
  896. "",
  897. };
  898. const char *tevOpTable[] = {
  899. "+", // TEVOP_ADD = 0,
  900. "-", // TEVOP_SUB = 1,
  901. };
  902. // Regular TEV stage: (d + bias + lerp(a,b,c)) * scale
  903. // The GameCube/Wii GPU uses a very sophisticated algorithm for scale-lerping:
  904. // - c is scaled from 0..255 to 0..256, which allows dividing the result by 256 instead of 255
  905. // - if scale is bigger than one, it is moved inside the lerp calculation for increased accuracy
  906. // - a rounding bias is added before dividing by 256
  907. if (DriverDetails::HasBug(DriverDetails::BUG_BROKENIVECSHIFTS))
  908. {
  909. // Haxx - cleaner code by not having irshift and ilshift in the emitted code by omitting them if not used.
  910. const char* leftShift = tevScaleTableLeft[shift];
  911. const char* rightShift = tevScaleTableRight[shift];
  912. if (rightShift[0])
  913. out.Write("irshift(((tevin_d.%s%s)%s)", components, tevBiasTable[bias], tevScaleTableLeft[shift]);
  914. else
  915. out.Write("((tevin_d.%s%s)%s)", components, tevBiasTable[bias], tevScaleTableLeft[shift]);
  916. out.Write(" %s ", tevOpTable[op]);
  917. if (leftShift[0])
  918. out.Write("irshift((ilshift((ilshift(tevin_a.%s, 8) + (tevin_b.%s-tevin_a.%s)*(tevin_c.%s+irshift(tevin_c.%s, 7))), %s)%s), 8)",
  919. components, components, components, components, components,
  920. leftShift+4, tevLerpBias[2*op+(shift!=3)]);
  921. else
  922. out.Write("irshift(((ilshift(tevin_a.%s, 8) + (tevin_b.%s-tevin_a.%s)*(tevin_c.%s+irshift(tevin_c.%s, 7)))%s), 8)",
  923. components, components, components, components, components, tevLerpBias[2*op+(shift!=3)]);
  924. if (rightShift[0])
  925. out.Write(", %s)", rightShift+4);
  926. }
  927. else
  928. {
  929. out.Write("(((tevin_d.%s%s)%s)", components, tevBiasTable[bias], tevScaleTableLeft[shift]);
  930. out.Write(" %s ", tevOpTable[op]);
  931. out.Write("(((((tevin_a.%s<<8) + (tevin_b.%s-tevin_a.%s)*(tevin_c.%s+(tevin_c.%s>>7)))%s)%s)>>8)",
  932. components, components, components, components, components,
  933. tevScaleTableLeft[shift], tevLerpBias[2*op+(shift!=3)]);
  934. out.Write(")%s", tevScaleTableRight[shift]);
  935. }
  936. }
  937. template<class T>
  938. static inline void SampleTexture(T& out, const char *texcoords, const char *texswap, int texmap, API_TYPE ApiType)
  939. {
  940. out.SetConstantsUsed(C_TEXDIMS+texmap,C_TEXDIMS+texmap);
  941. if (ApiType == API_D3D)
  942. out.Write("iround(255.0 * Tex%d.Sample(samp%d, float3(%s.xy * " I_TEXDIMS"[%d].xy, %s))).%s;\n", texmap, texmap, texcoords, texmap, g_ActiveConfig.iStereoMode > 0 ? "layer" : "0.0", texswap);
  943. else
  944. out.Write("iround(255.0 * texture(samp%d, float3(%s.xy * " I_TEXDIMS"[%d].xy, %s))).%s;\n", texmap, texcoords, texmap, g_ActiveConfig.iStereoMode > 0 ? "layer" : "0.0", texswap);
  945. }
  946. static const char *tevAlphaFuncsTable[] =
  947. {
  948. "(false)", // NEVER
  949. "(prev.a < %s)", // LESS
  950. "(prev.a == %s)", // EQUAL
  951. "(prev.a <= %s)", // LEQUAL
  952. "(prev.a > %s)", // GREATER
  953. "(prev.a != %s)", // NEQUAL
  954. "(prev.a >= %s)", // GEQUAL
  955. "(true)" // ALWAYS
  956. };
  957. static const char *tevAlphaFunclogicTable[] =
  958. {
  959. " && ", // and
  960. " || ", // or
  961. " != ", // xor
  962. " == " // xnor
  963. };
  964. template<class T>
  965. static inline void WriteAlphaTest(T& out, pixel_shader_uid_data* uid_data, API_TYPE ApiType, DSTALPHA_MODE dstAlphaMode, bool per_pixel_depth)
  966. {
  967. static const char *alphaRef[2] =
  968. {
  969. I_ALPHA".r",
  970. I_ALPHA".g"
  971. };
  972. out.SetConstantsUsed(C_ALPHA, C_ALPHA);
  973. if (DriverDetails::HasBug(DriverDetails::BUG_BROKENNEGATEDBOOLEAN))
  974. out.Write("\tif(( ");
  975. else
  976. out.Write("\tif(!( ");
  977. uid_data->alpha_test_comp0 = bpmem.alpha_test.comp0;
  978. uid_data->alpha_test_comp1 = bpmem.alpha_test.comp1;
  979. uid_data->alpha_test_logic = bpmem.alpha_test.logic;
  980. // Lookup the first component from the alpha function table
  981. int compindex = bpmem.alpha_test.comp0;
  982. out.Write(tevAlphaFuncsTable[compindex], alphaRef[0]);
  983. out.Write("%s", tevAlphaFunclogicTable[bpmem.alpha_test.logic]); // lookup the logic op
  984. // Lookup the second component from the alpha function table
  985. compindex = bpmem.alpha_test.comp1;
  986. out.Write(tevAlphaFuncsTable[compindex], alphaRef[1]);
  987. if (DriverDetails::HasBug(DriverDetails::BUG_BROKENNEGATEDBOOLEAN))
  988. out.Write(") == false) {\n");
  989. else
  990. out.Write(")) {\n");
  991. out.Write("\t\tocol0 = float4(0.0, 0.0, 0.0, 0.0);\n");
  992. if (dstAlphaMode == DSTALPHA_DUAL_SOURCE_BLEND)
  993. out.Write("\t\tocol1 = float4(0.0, 0.0, 0.0, 0.0);\n");
  994. if (per_pixel_depth)
  995. out.Write("\t\tdepth = %s;\n", (ApiType == API_D3D) ? "0.0" : "1.0");
  996. // ZCOMPLOC HACK:
  997. // The only way to emulate alpha test + early-z is to force early-z in the shader.
  998. // As this isn't available on all drivers and as we can't emulate this feature otherwise,
  999. // we are only able to choose which one we want to respect more.
  1000. // Tests seem to have proven that writing depth even when the alpha test fails is more
  1001. // important that a reliable alpha test, so we just force the alpha test to always succeed.
  1002. // At least this seems to be less buggy.
  1003. uid_data->alpha_test_use_zcomploc_hack = bpmem.UseEarlyDepthTest()
  1004. && bpmem.zmode.updateenable
  1005. && !g_ActiveConfig.backend_info.bSupportsEarlyZ
  1006. && !bpmem.genMode.zfreeze;
  1007. if (!uid_data->alpha_test_use_zcomploc_hack)
  1008. {
  1009. out.Write("\t\tdiscard;\n");
  1010. if (ApiType != API_D3D)
  1011. out.Write("\t\treturn;\n");
  1012. }
  1013. out.Write("\t}\n");
  1014. }
  1015. static const char *tevFogFuncsTable[] =
  1016. {
  1017. "", // No Fog
  1018. "", // ?
  1019. "", // Linear
  1020. "", // ?
  1021. "\tfog = 1.0 - exp2(-8.0 * fog);\n", // exp
  1022. "\tfog = 1.0 - exp2(-8.0 * fog * fog);\n", // exp2
  1023. "\tfog = exp2(-8.0 * (1.0 - fog));\n", // backward exp
  1024. "\tfog = 1.0 - fog;\n fog = exp2(-8.0 * fog * fog);\n" // backward exp2
  1025. };
  1026. template<class T>
  1027. static inline void WriteFog(T& out, pixel_shader_uid_data* uid_data)
  1028. {
  1029. uid_data->fog_fsel = bpmem.fog.c_proj_fsel.fsel;
  1030. if (bpmem.fog.c_proj_fsel.fsel == 0)
  1031. return; // no Fog
  1032. uid_data->fog_proj = bpmem.fog.c_proj_fsel.proj;
  1033. out.SetConstantsUsed(C_FOGCOLOR, C_FOGCOLOR);
  1034. out.SetConstantsUsed(C_FOGI, C_FOGI);
  1035. out.SetConstantsUsed(C_FOGF, C_FOGF+1);
  1036. if (bpmem.fog.c_proj_fsel.proj == 0)
  1037. {
  1038. // perspective
  1039. // ze = A/(B - (Zs >> B_SHF)
  1040. // TODO: Verify that we want to drop lower bits here! (currently taken over from software renderer)
  1041. // Maybe we want to use "ze = (A << B_SHF)/((B << B_SHF) - Zs)" instead?
  1042. // That's equivalent, but keeps the lower bits of Zs.
  1043. out.Write("\tfloat ze = (" I_FOGF"[1].x * 16777216.0) / float(" I_FOGI".y - (zCoord >> " I_FOGI".w));\n");
  1044. }
  1045. else
  1046. {
  1047. // orthographic
  1048. // ze = a*Zs (here, no B_SHF)
  1049. out.Write("\tfloat ze = " I_FOGF"[1].x * float(zCoord) / 16777216.0;\n");
  1050. }
  1051. // x_adjust = sqrt((x-center)^2 + k^2)/k
  1052. // ze *= x_adjust
  1053. // TODO Instead of this theoretical calculation, we should use the
  1054. // coefficient table given in the fog range BP registers!
  1055. uid_data->fog_RangeBaseEnabled = bpmem.fogRange.Base.Enabled;
  1056. if (bpmem.fogRange.Base.Enabled)
  1057. {
  1058. out.SetConstantsUsed(C_FOGF, C_FOGF);
  1059. out.Write("\tfloat x_adjust = (2.0 * (rawpos.x / " I_FOGF"[0].y)) - 1.0 - " I_FOGF"[0].x;\n");
  1060. out.Write("\tx_adjust = sqrt(x_adjust * x_adjust + " I_FOGF"[0].z * " I_FOGF"[0].z) / " I_FOGF"[0].z;\n");
  1061. out.Write("\tze *= x_adjust;\n");
  1062. }
  1063. out.Write("\tfloat fog = clamp(ze - " I_FOGF"[1].z, 0.0, 1.0);\n");
  1064. if (bpmem.fog.c_proj_fsel.fsel > 3)
  1065. {
  1066. out.Write("%s", tevFogFuncsTable[bpmem.fog.c_proj_fsel.fsel]);
  1067. }
  1068. else
  1069. {
  1070. if (bpmem.fog.c_proj_fsel.fsel != 2 && out.GetBuffer() != nullptr)
  1071. WARN_LOG(VIDEO, "Unknown Fog Type! %08x", bpmem.fog.c_proj_fsel.fsel);
  1072. }
  1073. out.Write("\tint ifog = iround(fog * 256.0);\n");
  1074. if (DriverDetails::HasBug(DriverDetails::BUG_BROKENIVECSHIFTS))
  1075. out.Write("\tprev.rgb = irshift((prev.rgb * (256 - ifog) + " I_FOGCOLOR".rgb * ifog), 8);\n");
  1076. else
  1077. out.Write("\tprev.rgb = (prev.rgb * (256 - ifog) + " I_FOGCOLOR".rgb * ifog) >> 8;\n");
  1078. }
  1079. void GetPixelShaderUid(PixelShaderUid& object, DSTALPHA_MODE dstAlphaMode, API_TYPE ApiType, u32 components)
  1080. {
  1081. GeneratePixelShader<PixelShaderUid>(object, dstAlphaMode, ApiType, components);
  1082. }
  1083. void GeneratePixelShaderCode(PixelShaderCode& object, DSTALPHA_MODE dstAlphaMode, API_TYPE ApiType, u32 components)
  1084. {
  1085. GeneratePixelShader<PixelShaderCode>(object, dstAlphaMode, ApiType, components);
  1086. }
  1087. void GetPixelShaderConstantProfile(PixelShaderConstantProfile& object, DSTALPHA_MODE dstAlphaMode, API_TYPE ApiType, u32 components)
  1088. {
  1089. GeneratePixelShader<PixelShaderConstantProfile>(object, dstAlphaMode, ApiType, components);
  1090. }