bloom-approx.fs 731 KB


  1. #version 150
  2. uniform sampler2D source[]; // input textures; source[0] is aliased as input_texture and source[1] as ORIG_LINEARIZED further down
  3. uniform vec4 sourceSize[]; // per-texture sizes; the visible part of this file only reads components of entries [0] and [1]
  4. uniform vec4 targetSize; // .xy is aliased as output_size further down
  5. in Vertex { // interface block of per-fragment inputs; presumably written by the matching vertex stage, which is not visible here
  6. vec2 vTexCoord;
  7. vec2 tex_uv;
  8. vec2 blur_dxdy;
  9. vec2 uv_scanline_step;
  10. float estimated_viewport_size_x;
  11. vec2 texture_size_inv;
  12. vec2 tex_uv_to_pixel_scale;
  13. };
  14. out vec4 FragColor; // the only fragment output declared in the visible part of this file
  15. // USER SETTINGS BLOCK //
  16. #define crt_gamma 2.500000
  17. #define lcd_gamma 2.200000
  18. #define levels_contrast 1.0
  19. #define halation_weight 0.0
  20. #define diffusion_weight 0.075
  21. #define bloom_underestimate_levels 0.8
  22. #define bloom_excess 0.000000
  23. #define beam_min_sigma 0.020000
  24. #define beam_max_sigma 0.300000
  25. #define beam_spot_power 0.330000 // note: beam_spot_power_static further down is 1.0/3.0, which is not exactly 0.33
  26. #define beam_min_shape 2.000000
  27. #define beam_max_shape 4.000000
  28. #define beam_shape_power 0.250000
  29. #define beam_horiz_filter 0.000000
  30. #define beam_horiz_sigma 0.35
  31. #define beam_horiz_linear_rgb_weight 1.000000
  32. #define convergence_offset_x_r -0.000000 // negative zero; compares equal to 0.0
  33. #define convergence_offset_x_g 0.000000
  34. #define convergence_offset_x_b 0.000000
  35. #define convergence_offset_y_r 0.000000
  36. #define convergence_offset_y_g -0.000000 // negative zero; compares equal to 0.0
  37. #define convergence_offset_y_b 0.000000
  38. #define mask_type 1.000000 // presumably uses the encoding documented at mask_type_static (0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask) - confirm
  39. #define mask_sample_mode_desired 0.000000
  40. #define mask_specify_num_triads 0.000000
  41. #define mask_triad_size_desired 3.000000
  42. #define mask_num_triads_desired 480.000000
  43. #define aa_subpixel_r_offset_x_runtime -0.0 // negative zero; compares equal to 0.0
  44. #define aa_subpixel_r_offset_y_runtime 0.000000
  45. #define aa_cubic_c 0.500000
  46. #define aa_gauss_sigma 0.500000
  47. #define geom_mode_runtime 0.000000
  48. #define geom_radius 2.000000
  49. #define geom_view_dist 2.000000
  50. #define geom_tilt_angle_x 0.000000
  51. #define geom_tilt_angle_y 0.000000
  52. #define geom_aspect_ratio_x 432.000000 // 432/329 = 1.3130699..., the same ratio as geom_aspect_ratio_static further down
  53. #define geom_aspect_ratio_y 329.000000
  54. #define geom_overscan_x 1.000000
  55. #define geom_overscan_y 1.000000
  56. #define border_size 0.015
  57. #define border_darkness 2.0
  58. #define border_compress 2.500000
  59. #define interlace_bff 0.000000
  60. #define interlace_1080i 0.000000
  61. // END USER SETTINGS BLOCK //
  62. // compatibility macros for transparently converting HLSLisms into GLSLisms
  63. #define mul(a,b) ((b)*(a)) // operand order is swapped relative to the call; each argument is parenthesized so compound expressions such as mul(m, u + v) keep their grouping
  64. #define lerp(a,b,c) mix(a,b,c)
  65. #define saturate(c) clamp(c, 0.0, 1.0)
  66. #define frac(x) (fract(x))
  67. #define float2 vec2
  68. #define float3 vec3
  69. #define float4 vec4
  70. #define bool2 bvec2
  71. #define bool3 bvec3
  72. #define bool4 bvec4
  73. #define float2x2 mat2x2
  74. #define float3x3 mat3x3
  75. #define float4x4 mat4x4
  76. #define float4x3 mat4x3
  77. #define float2x4 mat2x4
  78. #define IN params // `params` is not declared in the visible part of this file
  79. #define texture_size sourceSize[0].xy
  80. #define video_size sourceSize[0].xy // same expansion as texture_size
  81. #define output_size targetSize.xy
  82. #define frame_count phase // `phase` is not declared in the visible part of this file
  83. #define static // expands to nothing: the keyword is erased wherever it appears after this line
  84. #define inline // expands to nothing
  85. #define const // expands to nothing, so later `const` declarations become ordinary variables
  86. #define fmod(x,y) mod(x,y) // NOTE(review): GLSL mod() is floor-based while HLSL fmod() keeps the dividend's sign, so results can differ for negative operands - confirm callers
  87. #define ddx(c) dFdx(c)
  88. #define ddy(c) dFdy(c)
  89. #define atan2(x,y) atan(y,x) // NOTE(review): passes the arguments in swapped order; HLSL atan2(y, x) and GLSL atan(y, x) appear to share one order - confirm against callers before changing
  90. #define rsqrt(c) inversesqrt(c)
  91. #define input_texture source[0]
  92. #if defined(GL_ES)
  93. #define COMPAT_PRECISION mediump
  94. #else
  95. #define COMPAT_PRECISION
  96. #endif // GL_ES
  97. #if __VERSION__ >= 130
  98. #define COMPAT_TEXTURE texture
  99. #else
  100. #define COMPAT_TEXTURE texture2D
  101. #endif // __VERSION__ >= 130 (this file declares #version 150 at the top)
  102. #define ORIG_LINEARIZEDvideo_size sourceSize[1].xy
  103. #define ORIG_LINEARIZEDtexture_size sourceSize[1].xy
  104. #define ORIG_LINEARIZED source[1]
  105. float bloom_approx_scale_x = targetSize.x / sourceSize[0].y; // NOTE(review): divides the target width (.x) by the source height (.y) - confirm the x/y mix is intended
  106. const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0); // = 1474560.0; `const` is erased by the `#define const` earlier in this file
  107. /////////////////////////////// VERTEX INCLUDES ///////////////////////////////
  108. //#include "../user-settings.h"
  109. ///////////////////////////// BEGIN USER-SETTINGS ////////////////////////////
  110. #ifndef USER_SETTINGS_H
  111. #define USER_SETTINGS_H
  112. ///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
  113. // The Cg compiler uses different "profiles" with different capabilities.
  114. // This shader requires a Cg compilation profile >= arbfp1, but a few options
  115. // require higher profiles like fp30 or fp40. The shader can't detect profile
  116. // or driver capabilities, so instead you must comment or uncomment the lines
  117. // below with "//" before "#define." Disable an option if you get compilation
  118. // errors resembling those listed. Generally speaking, all of these options
  119. // will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
  120. // likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
  121. // Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
  122. // Among other things, derivatives help us fix anisotropic filtering artifacts
  123. // with curved manually tiled phosphor mask coords. Related errors:
  124. // error C3004: function "float2 ddx(float2);" not supported in this profile
  125. // error C3004: function "float2 ddy(float2);" not supported in this profile
  126. //#define DRIVERS_ALLOW_DERIVATIVES
  127. // Fine derivatives: Unsupported on older ATI cards.
  128. // Fine derivatives enable 2x2 fragment block communication, letting us perform
  129. // fast single-pass blur operations. If your card uses coarse derivatives and
  130. // these are enabled, blurs could look broken. Derivatives are a prerequisite.
  131. #ifdef DRIVERS_ALLOW_DERIVATIVES // the #define for this macro just above is commented out, so this branch is skipped unless it is defined elsewhere
  132. #define DRIVERS_ALLOW_FINE_DERIVATIVES
  133. #endif // DRIVERS_ALLOW_DERIVATIVES
  134. // Dynamic looping: Requires an fp30 or newer profile.
  135. // This makes phosphor mask resampling faster in some cases. Related errors:
  136. // error C5013: profile does not support "for" statements and "for" could not
  137. // be unrolled
  138. //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
  139. // Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
  140. // Using one static loop avoids overhead if the user is right, but if the user
  141. // is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
  142. // binary search can potentially save some iterations. However, it may fail:
  143. // error C6001: Temporary register limit of 32 exceeded; 35 registers
  144. // needed to compile program
  145. //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
  146. // tex2Dlod: Requires an fp40 or newer profile. This can be used to disable
  147. // anisotropic filtering, thereby fixing related artifacts. Related errors:
  148. // error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
  149. // this profile
  150. //#define DRIVERS_ALLOW_TEX2DLOD
  151. // tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate
  152. // artifacts from anisotropic filtering and mipmapping. Related errors:
  153. // error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
  154. // in this profile
  155. //#define DRIVERS_ALLOW_TEX2DBIAS
  156. // Integrated graphics compatibility: Integrated graphics like Intel HD 4000
  157. // impose stricter limitations on register counts and instructions. Enable
  158. // INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
  159. // error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
  160. // to compile program.
  161. // Enabling integrated graphics compatibility mode will automatically disable:
  162. // 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
  163. // (This may be reenabled in a later release.)
  164. // 2.) RUNTIME_GEOMETRY_MODE
  165. // 3.) The high-quality 4x4 Gaussian resize for the bloom approximation
  166. //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
  167. //////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
  168. // To disable a #define option, turn its line into a comment with "//."
  169. // RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
  170. // Enable runtime shader parameters in the Retroarch (etc.) GUI? They override
  171. // many of the options in this file and allow real-time tuning, but many of
  172. // them are slower. Disabling them and using this text file will boost FPS.
  173. #define RUNTIME_SHADER_PARAMS_ENABLE
  174. // Specify the phosphor bloom sigma at runtime? This option is 10% slower, but
  175. // it's the only way to do a wide-enough full bloom with a runtime dot pitch.
  176. #define RUNTIME_PHOSPHOR_BLOOM_SIGMA
  177. // Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics)
  178. #define RUNTIME_ANTIALIAS_WEIGHTS
  179. // Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
  180. //#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
  181. // Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
  182. // parameters? This will require more math or dynamic branching.
  183. #define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
  184. // Specify the tilt at runtime? This makes things about 3% slower.
  185. #define RUNTIME_GEOMETRY_TILT
  186. // Specify the geometry mode at runtime? (Listed above as disabled by INTEGRATED_GRAPHICS_COMPATIBILITY_MODE.)
  187. #define RUNTIME_GEOMETRY_MODE
  188. // Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
  189. // mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
  190. // dynamic branches? This is cheap if mask_resize_viewport_scale is small.
  191. #define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  192. // PHOSPHOR MASK:
  193. // Manually resize the phosphor mask for best results (slower)? Disabling this
  194. // removes the option to do so, but it may be faster without dynamic branches.
  195. #define PHOSPHOR_MASK_MANUALLY_RESIZE
  196. // If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
  197. #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
  198. // Larger blurs are expensive, but we need them to blur larger triads. We can
  199. // detect the right blur if the triad size is static or our profile allows
  200. // dynamic branches, but otherwise we use the largest blur the user indicates
  201. // they might need (only the 3-pixel threshold is enabled; 6/9/12 are commented out):
  202. #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
  203. //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
  204. //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
  205. //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
  206. // Here's a helpful chart:
  207. // MaxTriadSize BlurSize MinTriadCountsByResolution
  208. // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  209. // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  210. // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  211. // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  212. // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  213. /////////////////////////////// USER PARAMETERS //////////////////////////////
  214. // Note: Many of these static parameters are overridden by runtime shader
  215. // parameters when those are enabled. However, many others are static codepath
  216. // options that were cleaner or more convenient to code as static constants.
  217. // GAMMA:
  218. static const float crt_gamma_static = 2.5; // range [1, 5]
  219. static const float lcd_gamma_static = 2.2; // range [1, 5]
  220. // LEVELS MANAGEMENT:
  221. // Control the final multiplicative image contrast:
  222. static const float levels_contrast_static = 1.0; // range [0, 4)
  223. // We auto-dim to avoid clipping between passes and restore brightness
  224. // later. Control the dim factor here: Lower values clip less but crush
  225. // blacks more (static only for now).
  226. static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5. NOTE(review): this comment previously said 0.5 "was unnecessarily dark for me, so I set it to 1.0", but the code assigns 0.5 - confirm which is intended
  227. // HALATION/DIFFUSION/BLOOM:
  228. // Halation weight: How much energy should be lost to electrons bouncing
  229. // around under the CRT glass and exciting random phosphors?
  230. static const float halation_weight_static = 0.0; // range [0, 1]
  231. // Refractive diffusion weight: How much light should spread/diffuse from
  232. // refracting through the CRT glass?
  233. static const float diffusion_weight_static = 0.075; // range [0, 1]
  234. // Underestimate brightness: Bright areas bloom more, but we can base the
  235. // bloom brightpass on a lower brightness to sharpen phosphors, or a higher
  236. // brightness to soften them. Low values clip, but >= 0.8 looks okay.
  237. static const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
  238. // Blur all colors more than necessary for a softer phosphor bloom?
  239. static const float bloom_excess_static = 0.0; // range [0, 1]
  240. // The BLOOM_APPROX pass approximates a phosphor blur early on with a small
  241. // blurred resize of the input (convergence offsets are applied as well).
  242. // There are three filter options (static option only for now):
  243. // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize
  244. // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
  245. // and beam_max_sigma is low.
  246. // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
  247. // always uses a static sigma regardless of beam_max_sigma or
  248. // mask_num_triads_desired.
  249. // 2.) True 4x4 Gaussian resize: Slowest, technically correct.
  250. // These options are more pronounced for the fast, unbloomed shader version.
  251. #ifndef RADEON_FIX
  252. static const float bloom_approx_filter_static = 2.0; // option 2 above: true 4x4 Gaussian resize
  253. #else
  254. static const float bloom_approx_filter_static = 1.0; // option 1 above: 3x3 resize blur
  255. #endif // RADEON_FIX
  256. // ELECTRON BEAM SCANLINE DISTRIBUTION:
  257. // How many scanlines should contribute light to each pixel? Using more
  258. // scanlines is slower (especially for a generalized Gaussian) but less
  259. // distorted with larger beam sigmas (especially for a pure Gaussian). The
  260. // max_beam_sigma at which the closest unused weight is guaranteed <
  261. // 1.0/255.0 (for a 3x antialiased pure Gaussian) is:
  262. // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
  263. // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
  264. // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
  265. // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
  266. // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
  267. static const float beam_num_scanlines = 3.0; // range [2, 6]
  268. // A generalized Gaussian beam varies shape with color too, not just width.
  269. // It's slower but more flexible (static option only for now).
  270. static const bool beam_generalized_gaussian = true;
  271. // What kind of scanline antialiasing do you want?
  272. // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
  273. // Integrals are slow (especially for generalized Gaussians) and rarely any
  274. // better than 3x antialiasing (static option only for now).
  275. static const float beam_antialias_level = 1.0; // range [0, 2]
  276. // Min/max standard deviations for scanline beams: Higher values widen and
  277. // soften scanlines. Depending on other options, low min sigmas can alias.
  278. static const float beam_min_sigma_static = 0.02; // range (0, 1]
  279. static const float beam_max_sigma_static = 0.3; // range (0, 1]
  280. // Beam width varies as a function of color: A power function (0) is more
  281. // configurable, but a spherical function (1) gives the widest beam
  282. // variability without aliasing (static option only for now).
  283. static const float beam_spot_shape_function = 0.0; // 0 = power function, 1 = spherical function
  284. // Spot shape power: Powers <= 1 give smoother spot shapes but lower
  285. // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
  286. static const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
  287. // Generalized Gaussian max shape parameters: Higher values give flatter
  288. // scanline plateaus and steeper dropoffs, simultaneously widening and
  289. // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and
  290. // values > ~40.0 cause artifacts with integrals.
  291. static const float beam_min_shape_static = 2.0; // range [2, 32]
  292. static const float beam_max_shape_static = 4.0; // range [2, 32]
  293. // Generalized Gaussian shape power: Affects how quickly the distribution
  294. // changes shape from Gaussian to steep/plateaued as color increases from 0
  295. // to 1.0. Higher powers appear softer for most colors, and lower powers
  296. // appear sharper for most colors.
  297. static const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
  298. // What filter should be used to sample scanlines horizontally?
  299. // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
  300. static const float beam_horiz_filter_static = 0.0; // 0 = Quilez per the list above
  301. // Standard deviation for horizontal Gaussian resampling:
  302. static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
  303. // Do horizontal scanline sampling in linear RGB (correct light mixing),
  304. // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
  305. // limiting circuitry in some CRT's), or a weighted avg.?
  306. static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
  307. // Simulate scanline misconvergence? This needs 3x horizontal texture
  308. // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
  309. // later passes (static option only for now).
  310. static const bool beam_misconvergence = true;
  311. // Convergence offsets in x/y directions for R/G/B scanline beams in units
  312. // of scanlines. Positive offsets go right/down; ranges [-2, 2]
  313. static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
  314. static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
  315. static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
  316. // Detect interlacing (static option only for now)?
  317. static const bool interlace_detect = true;
  318. // Assume 1080-line sources are interlaced?
  319. static const bool interlace_1080i_static = false;
  320. // For interlaced sources, assume TFF (top-field first) or BFF order?
  321. // (Whether this matters depends on the nature of the interlaced input.)
  322. static const bool interlace_bff_static = false; // false selects TFF, true selects BFF
  323. // ANTIALIASING:
  324. // What AA level do you want for curvature/overscan/subpixels? Options:
  325. // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
  326. // (Static option only for now)
  327. static const float aa_level = 12.0; // range [0, 24]
  328. // What antialiasing filter do you want (static option only)? Options:
  329. // 0: Box (separable), 1: Box (cylindrical),
  330. // 2: Tent (separable), 3: Tent (cylindrical),
  331. // 4: Gaussian (separable), 5: Gaussian (cylindrical),
  332. // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
  333. // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
  334. // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
  335. static const float aa_filter = 6.0; // range [0, 9]; 6 = Cubic (separable) per the list above
  336. // Flip the sample grid on odd/even frames (static option only for now)?
  337. static const bool aa_temporal = false;
  338. // Use RGB subpixel offsets for antialiasing? The pixel is at green, and
  339. // the blue offset is the negative r offset; range [0, 0.5] (NOTE(review): the x offset assigned below is negative, so this range presumably refers to magnitude - confirm)
  340. static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
  341. // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
  342. // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
  343. // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
  344. // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
  345. // 4.) C = 0.0 is a soft spline filter.
  346. static const float aa_cubic_c_static = 0.5; // range [0, 4]
  347. // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
  348. static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
  349. // PHOSPHOR MASK:
  350. // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
  351. static const float mask_type_static = 1.0; // range [0, 2]
  352. // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible.
  353. // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
  354. // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
  355. // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This
  356. // is halfway decent with LUT mipmapping but atrocious without it.
  357. // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
  358. // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch.
  359. // This mode reuses the same masks, so triads will be enormous unless
  360. // you change the mask LUT filenames in your .cgp file.
  361. static const float mask_sample_mode_static = 0.0; // range [0, 2]
  362. // Prefer setting the triad size (0.0) or number on the screen (1.0)?
  363. // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
  364. // will always be used to calculate the full bloom sigma statically.
  365. static const float mask_specify_num_triads_static = 0.0; // range [0, 1]
  366. // Specify the phosphor triad size, in pixels. Each tile (usually with 8
  367. // triads) will be rounded to the nearest integer tile size and clamped to
  368. // obey minimum size constraints (imposed to reduce downsize taps) and
  369. // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
  370. // To increase the size limit, double the viewport-relative scales for the
  371. // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
  372. // range [1, mask_texture_small_size/mask_triads_per_tile]
  373. static const float mask_triad_size_desired_static = 24.0 / 8.0; // = 3.0
  374. // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
  375. // final size will be rounded and constrained as above); default 480.0
  376. static const float mask_num_triads_desired_static = 480.0;
  377. // How many lobes should the sinc/Lanczos resizer use? More lobes require
  378. // more samples and avoid moire a bit better, but some is unavoidable
  379. // depending on the destination size (static option for now).
  380. static const float mask_sinc_lobes = 3.0; // range [2, 4]
  381. // The mask is resized using a variable number of taps in each dimension,
  382. // but some Cg profiles always fetch a constant number of taps no matter
  383. // what (no dynamic branching). We can limit the maximum number of taps if
  384. // we statically limit the minimum phosphor triad size. Larger values are
  385. // faster, but the limit IS enforced (static option only, forever);
  386. // range [1, mask_texture_small_size/mask_triads_per_tile]
  387. // TODO: Make this 1.0 and compensate with smarter sampling!
  388. static const float mask_min_allowed_triad_size = 2.0;
  389. // GEOMETRY:
  390. // Geometry mode:
  391. // 0: Off (default), 1: Spherical mapping (like cgwg's),
  392. // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
  393. static const float geom_mode_static = 0.0; // range [0, 3]
  394. // Radius of curvature: Measured in units of your viewport's diagonal size.
  395. static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
  396. // View dist is the distance from the player to their physical screen, in
  397. // units of the viewport's diagonal size. It controls the field of view.
  398. static const float geom_view_dist_static = 2.0; // range [0.5, 1024]
  399. // Tilt angle in radians (clockwise around up and right vectors):
  400. static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi]
  401. // Aspect ratio: When the true viewport size is unknown, this value is used
  402. // to help convert between the phosphor triad size and count, along with
  403. // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set
  404. // this equal to Retroarch's display aspect ratio (DAR) for best results;
  405. // range [1, geom_max_aspect_ratio from user-cgp-constants.h];
  406. // default (256/224)*(54/47) = 1.313069909 (see below)
  407. static const float geom_aspect_ratio_static = 1.313069909; // (256/224)*(54/47) reduces to 432/329
  408. // Before getting into overscan, here's some general aspect ratio info:
  409. // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
  410. // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
  411. // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping
  412. // Geometry processing has to "undo" the screen-space 2D DAR to calculate
  413. // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in
  414. // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either:
  415. // a.) Enable Retroarch's "Crop Overscan"
  416. // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
  417. // Real consoles use horizontal black padding in the signal, but emulators
  418. // often crop this without cropping the vertical padding; a 256x224 [S]NES
  419. // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
  420. // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
  421. // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
  422. // http://forums.nesdev.com/viewtopic.php?p=24815#p24815
  423. // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
  424. // without doing a. or b., but horizontal image borders will be tighter
  425. // than vertical ones, messing up curvature and overscan. Fixing the
  426. // padding first corrects this.
  427. // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly
  428. // or adjust x/y independently to e.g. readd horizontal padding, as noted
  429. // above: Values < 1.0 zoom out; range (0, inf)
  430. static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
  431. // Compute a proper pixel-space to texture-space matrix even without ddx()/
  432. // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering
  433. // with strong curvature (static option only for now).
  434. static const bool geom_force_correct_tangent_matrix = true;
  435. // BORDERS:
  436. // Rounded border size in texture uv coords:
  437. static const float border_size_static = 0.015; // range [0, 0.5]
  438. // Border darkness: Moderate values darken the border smoothly, and high
  439. // values make the image very dark just inside the border:
  440. static const float border_darkness_static = 2.0; // range [0, inf)
  441. // Border compression: High numbers compress border transitions, narrowing
  442. // the dark border area.
  443. static const float border_compress_static = 2.5; // range [1, inf)
  444. #endif // USER_SETTINGS_H
  445. //////////////////////////// END USER-SETTINGS //////////////////////////
  446. //#include "bind-shader-params.h"
  447. ///////////////////////////// BEGIN BIND-SHADER-PARAMS ////////////////////////////
  448. #ifndef BIND_SHADER_PARAMS_H
  449. #define BIND_SHADER_PARAMS_H
  450. ///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
  451. // crt-royale: A full-featured CRT shader, with cheese.
  452. // Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
  453. //
  454. // This program is free software; you can redistribute it and/or modify it
  455. // under the terms of the GNU General Public License as published by the Free
  456. // Software Foundation; either version 2 of the License, or any later version.
  457. //
  458. // This program is distributed in the hope that it will be useful, but WITHOUT
  459. // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  460. // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  461. // more details.
  462. //
  463. // You should have received a copy of the GNU General Public License along with
  464. // this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  465. // Place, Suite 330, Boston, MA 02111-1307 USA
  466. ///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////
  467. /////////////////////////////// BEGIN INCLUDES ///////////////////////////////
  468. //#include "../user-settings.h"
  469. ///////////////////////////// BEGIN USER-SETTINGS ////////////////////////////
  470. #ifndef USER_SETTINGS_H
  471. #define USER_SETTINGS_H
  472. ///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
  473. // The Cg compiler uses different "profiles" with different capabilities.
  474. // This shader requires a Cg compilation profile >= arbfp1, but a few options
  475. // require higher profiles like fp30 or fp40. The shader can't detect profile
  476. // or driver capabilities, so instead you must comment or uncomment the lines
  477. // below with "//" before "#define." Disable an option if you get compilation
  478. // errors resembling those listed. Generally speaking, all of these options
  479. // will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
  480. // likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
  481. // Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
  482. // Among other things, derivatives help us fix anisotropic filtering artifacts
  483. // with curved manually tiled phosphor mask coords. Related errors:
  484. // error C3004: function "float2 ddx(float2);" not supported in this profile
  485. // error C3004: function "float2 ddy(float2);" not supported in this profile
  486. //#define DRIVERS_ALLOW_DERIVATIVES
  487. // Fine derivatives: Unsupported on older ATI cards.
  488. // Fine derivatives enable 2x2 fragment block communication, letting us perform
  489. // fast single-pass blur operations. If your card uses coarse derivatives and
  490. // these are enabled, blurs could look broken. Derivatives are a prerequisite.
  491. #ifdef DRIVERS_ALLOW_DERIVATIVES
  492. #define DRIVERS_ALLOW_FINE_DERIVATIVES
  493. #endif
  494. // Dynamic looping: Requires an fp30 or newer profile.
  495. // This makes phosphor mask resampling faster in some cases. Related errors:
  496. // error C5013: profile does not support "for" statements and "for" could not
  497. // be unrolled
  498. //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
  499. // Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
  500. // Using one static loop avoids overhead if the user is right, but if the user
  501. // is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
  502. // binary search can potentially save some iterations. However, it may fail:
  503. // error C6001: Temporary register limit of 32 exceeded; 35 registers
  504. // needed to compile program
  505. //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
  506. // tex2Dlod: Requires an fp40 or newer profile. This can be used to disable
  507. // anisotropic filtering, thereby fixing related artifacts. Related errors:
  508. // error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
  509. // this profile
  510. //#define DRIVERS_ALLOW_TEX2DLOD
  511. // tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate
  512. // artifacts from anisotropic filtering and mipmapping. Related errors:
  513. // error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
  514. // in this profile
  515. //#define DRIVERS_ALLOW_TEX2DBIAS
  516. // Integrated graphics compatibility: Integrated graphics like Intel HD 4000
  517. // impose stricter limitations on register counts and instructions. Enable
  518. // INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
  519. // error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
  520. // to compile program.
  521. // Enabling integrated graphics compatibility mode will automatically disable:
  522. // 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
  523. // (This may be reenabled in a later release.)
  524. // 2.) RUNTIME_GEOMETRY_MODE
  525. // 3.) The high-quality 4x4 Gaussian resize for the bloom approximation
  526. //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
  527. //////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
  528. // To disable a #define option, turn its line into a comment with "//."
  529. // RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
  530. // Enable runtime shader parameters in the Retroarch (etc.) GUI? They override
  531. // many of the options in this file and allow real-time tuning, but many of
  532. // them are slower. Disabling them and using this text file will boost FPS.
  533. #define RUNTIME_SHADER_PARAMS_ENABLE
  534. // Specify the phosphor bloom sigma at runtime? This option is 10% slower, but
  535. // it's the only way to do a wide-enough full bloom with a runtime dot pitch.
  536. #define RUNTIME_PHOSPHOR_BLOOM_SIGMA
  537. // Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics)
  538. #define RUNTIME_ANTIALIAS_WEIGHTS
  539. // Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
  540. //#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
  541. // Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
  542. // parameters? This will require more math or dynamic branching.
  543. #define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
  544. // Specify the tilt at runtime? This makes things about 3% slower.
  545. #define RUNTIME_GEOMETRY_TILT
  546. // Specify the geometry mode at runtime?
  547. #define RUNTIME_GEOMETRY_MODE
  548. // Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
  549. // mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
  550. // dynamic branches? This is cheap if mask_resize_viewport_scale is small.
  551. #define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  552. // PHOSPHOR MASK:
  553. // Manually resize the phosphor mask for best results (slower)? Disabling this
  554. // removes the option to do so, but it may be faster without dynamic branches.
  555. #define PHOSPHOR_MASK_MANUALLY_RESIZE
  556. // If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
  557. #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
  558. // Larger blurs are expensive, but we need them to blur larger triads. We can
  559. // detect the right blur if the triad size is static or our profile allows
  560. // dynamic branches, but otherwise we use the largest blur the user indicates
  561. // they might need:
  562. #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
  563. //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
  564. //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
  565. //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
  566. // Here's a helpful chart:
  567. // MaxTriadSize BlurSize MinTriadCountsByResolution
  568. // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  569. // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  570. // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  571. // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  572. // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  573. /////////////////////////////// USER PARAMETERS //////////////////////////////
  574. // Note: Many of these static parameters are overridden by runtime shader
  575. // parameters when those are enabled. However, many others are static codepath
  576. // options that were cleaner or more convenient to code as static constants.
  577. // GAMMA:
  578. static const float crt_gamma_static = 2.5; // range [1, 5]
  579. static const float lcd_gamma_static = 2.2; // range [1, 5]
  580. // LEVELS MANAGEMENT:
  581. // Control the final multiplicative image contrast:
  582. static const float levels_contrast_static = 1.0; // range [0, 4)
  583. // We auto-dim to avoid clipping between passes and restore brightness
  584. // later. Control the dim factor here: Lower values clip less but crush
  585. // blacks more (static only for now).
  586. static const float levels_autodim_temp = 0.5; // range (0, 1]; default 0.5, which is the value set here. NOTE(review): an earlier note on this line said it was raised to 1.0 because 0.5 looked unnecessarily dark, but the constant is 0.5 - confirm the intended value.
  587. // HALATION/DIFFUSION/BLOOM:
  588. // Halation weight: How much energy should be lost to electrons bouncing
  589. // around under the CRT glass and exciting random phosphors?
  590. static const float halation_weight_static = 0.0; // range [0, 1]
  591. // Refractive diffusion weight: How much light should spread/diffuse from
  592. // refracting through the CRT glass?
  593. static const float diffusion_weight_static = 0.075; // range [0, 1]
  594. // Underestimate brightness: Bright areas bloom more, but we can base the
  595. // bloom brightpass on a lower brightness to sharpen phosphors, or a higher
  596. // brightness to soften them. Low values clip, but >= 0.8 looks okay.
  597. static const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
  598. // Blur all colors more than necessary for a softer phosphor bloom?
  599. static const float bloom_excess_static = 0.0; // range [0, 1]
  600. // The BLOOM_APPROX pass approximates a phosphor blur early on with a small
  601. // blurred resize of the input (convergence offsets are applied as well).
  602. // There are three filter options (static option only for now):
  603. // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize
  604. // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
  605. // and beam_max_sigma is low.
  606. // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
  607. // always uses a static sigma regardless of beam_max_sigma or
  608. // mask_num_triads_desired.
  609. // 2.) True 4x4 Gaussian resize: Slowest, technically correct.
  610. // These options are more pronounced for the fast, unbloomed shader version.
  611. #ifndef RADEON_FIX
  612. static const float bloom_approx_filter_static = 2.0; // option 2: true 4x4 Gaussian resize
  613. #else
  614. static const float bloom_approx_filter_static = 1.0; // option 1: 3x3 resize blur, used when RADEON_FIX is defined
  615. #endif // RADEON_FIX
  616. // ELECTRON BEAM SCANLINE DISTRIBUTION:
  617. // How many scanlines should contribute light to each pixel? Using more
  618. // scanlines is slower (especially for a generalized Gaussian) but less
  619. // distorted with larger beam sigmas (especially for a pure Gaussian). The
  620. // max_beam_sigma at which the closest unused weight is guaranteed <
  621. // 1.0/255.0 (for a 3x antialiased pure Gaussian) is:
  622. // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
  623. // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
  624. // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
  625. // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
  626. // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
  627. static const float beam_num_scanlines = 3.0; // range [2, 6]
  628. // A generalized Gaussian beam varies shape with color too, not just width.
  629. // It's slower but more flexible (static option only for now).
  630. static const bool beam_generalized_gaussian = true;
  631. // What kind of scanline antialiasing do you want?
  632. // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
  633. // Integrals are slow (especially for generalized Gaussians) and rarely any
  634. // better than 3x antialiasing (static option only for now).
  635. static const float beam_antialias_level = 1.0; // range [0, 2]
  636. // Min/max standard deviations for scanline beams: Higher values widen and
  637. // soften scanlines. Depending on other options, low min sigmas can alias.
  638. static const float beam_min_sigma_static = 0.02; // range (0, 1]
  639. static const float beam_max_sigma_static = 0.3; // range (0, 1]
  640. // Beam width varies as a function of color: A power function (0) is more
  641. // configurable, but a spherical function (1) gives the widest beam
  642. // variability without aliasing (static option only for now).
  643. static const float beam_spot_shape_function = 0.0;
  644. // Spot shape power: Powers <= 1 give smoother spot shapes but lower
  645. // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
  646. static const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
  647. // Generalized Gaussian max shape parameters: Higher values give flatter
  648. // scanline plateaus and steeper dropoffs, simultaneously widening and
  649. // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and
  650. // values > ~40.0 cause artifacts with integrals.
  651. static const float beam_min_shape_static = 2.0; // range [2, 32]
  652. static const float beam_max_shape_static = 4.0; // range [2, 32]
  653. // Generalized Gaussian shape power: Affects how quickly the distribution
  654. // changes shape from Gaussian to steep/plateaued as color increases from 0
  655. // to 1.0. Higher powers appear softer for most colors, and lower powers
  656. // appear sharper for most colors.
  657. static const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
  658. // What filter should be used to sample scanlines horizontally?
  659. // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
  660. static const float beam_horiz_filter_static = 0.0;
  661. // Standard deviation for horizontal Gaussian resampling:
  662. static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
  663. // Do horizontal scanline sampling in linear RGB (correct light mixing),
  664. // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
  665. // limiting circuitry in some CRT's), or a weighted avg.?
  666. static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
  667. // Simulate scanline misconvergence? This needs 3x horizontal texture
  668. // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
  669. // later passes (static option only for now).
  670. static const bool beam_misconvergence = true;
  671. // Convergence offsets in x/y directions for R/G/B scanline beams in units
  672. // of scanlines. Positive offsets go right/down; ranges [-2, 2]
  673. static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
  674. static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
  675. static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
  676. // Detect interlacing (static option only for now)?
  677. static const bool interlace_detect = true;
  678. // Assume 1080-line sources are interlaced?
  679. static const bool interlace_1080i_static = false;
  680. // For interlaced sources, assume TFF (top-field first) or BFF order?
  681. // (Whether this matters depends on the nature of the interlaced input.)
  682. static const bool interlace_bff_static = false;
  683. // ANTIALIASING:
  684. // What AA level do you want for curvature/overscan/subpixels? Options:
  685. // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
  686. // (Static option only for now)
  687. static const float aa_level = 12.0; // range [0, 24]
  688. // What antialiasing filter do you want (static option only)? Options:
  689. // 0: Box (separable), 1: Box (cylindrical),
  690. // 2: Tent (separable), 3: Tent (cylindrical),
  691. // 4: Gaussian (separable), 5: Gaussian (cylindrical),
  692. // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
  693. // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
  694. // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
  695. static const float aa_filter = 6.0; // range [0, 9]
  696. // Flip the sample grid on odd/even frames (static option only for now)?
  697. static const bool aa_temporal = false;
  698. // Use RGB subpixel offsets for antialiasing? The pixel is at green, and
  699. // the blue offset is the negative r offset; range [0, 0.5]
  700. static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); NOTE(review): x = -1/3 is outside the stated [0, 0.5] range; presumably the range refers to the offset magnitude - confirm.
  701. // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
  702. // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
  703. // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
  704. // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
  705. // 4.) C = 0.0 is a soft spline filter.
  706. static const float aa_cubic_c_static = 0.5; // range [0, 4]
  707. // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
  708. static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
  709. // PHOSPHOR MASK:
  710. // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
  711. static const float mask_type_static = 1.0; // range [0, 2]
  712. // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible.
  713. // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
  714. // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
  715. // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This
  716. // is halfway decent with LUT mipmapping but atrocious without it.
  717. // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
  718. // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch.
  719. // This mode reuses the same masks, so triads will be enormous unless
  720. // you change the mask LUT filenames in your .cgp file.
  721. static const float mask_sample_mode_static = 0.0; // range [0, 2]
  722. // Prefer setting the triad size (0.0) or number on the screen (1.0)?
  723. // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
  724. // will always be used to calculate the full bloom sigma statically.
  725. static const float mask_specify_num_triads_static = 0.0; // range [0, 1]
  726. // Specify the phosphor triad size, in pixels. Each tile (usually with 8
  727. // triads) will be rounded to the nearest integer tile size and clamped to
  728. // obey minimum size constraints (imposed to reduce downsize taps) and
  729. // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
  730. // To increase the size limit, double the viewport-relative scales for the
  731. // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
  732. // range [1, mask_texture_small_size/mask_triads_per_tile]
  733. static const float mask_triad_size_desired_static = 24.0 / 8.0;
  734. // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
  735. // final size will be rounded and constrained as above); default 480.0
  736. static const float mask_num_triads_desired_static = 480.0;
  737. // How many lobes should the sinc/Lanczos resizer use? More lobes require
  738. // more samples and avoid moire a bit better, but some is unavoidable
  739. // depending on the destination size (static option for now).
  740. static const float mask_sinc_lobes = 3.0; // range [2, 4]
  741. // The mask is resized using a variable number of taps in each dimension,
  742. // but some Cg profiles always fetch a constant number of taps no matter
  743. // what (no dynamic branching). We can limit the maximum number of taps if
  744. // we statically limit the minimum phosphor triad size. Larger values are
  745. // faster, but the limit IS enforced (static option only, forever);
  746. // range [1, mask_texture_small_size/mask_triads_per_tile]
  747. // TODO: Make this 1.0 and compensate with smarter sampling!
  748. static const float mask_min_allowed_triad_size = 2.0;
  749. // GEOMETRY:
  750. // Geometry mode:
  751. // 0: Off (default), 1: Spherical mapping (like cgwg's),
  752. // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
  753. static const float geom_mode_static = 0.0; // range [0, 3]
  754. // Radius of curvature: Measured in units of your viewport's diagonal size.
  755. static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
  756. // View dist is the distance from the player to their physical screen, in
  757. // units of the viewport's diagonal size. It controls the field of view.
  758. static const float geom_view_dist_static = 2.0; // range [0.5, 1024]
  759. // Tilt angle in radians (clockwise around up and right vectors):
  760. static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi]
  761. // Aspect ratio: When the true viewport size is unknown, this value is used
  762. // to help convert between the phosphor triad size and count, along with
  763. // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set
  764. // this equal to Retroarch's display aspect ratio (DAR) for best results;
  765. // range [1, geom_max_aspect_ratio from user-cgp-constants.h];
  766. // default (256/224)*(54/47) = 1.313069909 (see below)
  767. static const float geom_aspect_ratio_static = 1.313069909;
  768. // Before getting into overscan, here's some general aspect ratio info:
  769. // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
  770. // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
  771. // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping
  772. // Geometry processing has to "undo" the screen-space 2D DAR to calculate
  773. // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in
  774. // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either:
  775. // a.) Enable Retroarch's "Crop Overscan"
  776. // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
  777. // Real consoles use horizontal black padding in the signal, but emulators
  778. // often crop this without cropping the vertical padding; a 256x224 [S]NES
  779. // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
  780. // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
  781. // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
  782. // http://forums.nesdev.com/viewtopic.php?p=24815#p24815
  783. // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
  784. // without doing a. or b., but horizontal image borders will be tighter
  785. // than vertical ones, messing up curvature and overscan. Fixing the
  786. // padding first corrects this.
  787. // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly
  788. // or adjust x/y independently to e.g. readd horizontal padding, as noted
  789. // above: Values < 1.0 zoom out; range (0, inf)
  790. static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
  791. // Compute a proper pixel-space to texture-space matrix even without ddx()/
  792. // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering
  793. // with strong curvature (static option only for now).
  794. static const bool geom_force_correct_tangent_matrix = true;
  795. // BORDERS:
  796. // Rounded border size in texture uv coords:
  797. static const float border_size_static = 0.015; // range [0, 0.5]
  798. // Border darkness: Moderate values darken the border smoothly, and high
  799. // values make the image very dark just inside the border:
  800. static const float border_darkness_static = 2.0; // range [0, inf)
  801. // Border compression: High numbers compress border transitions, narrowing
  802. // the dark border area.
  803. static const float border_compress_static = 2.5; // range [1, inf)
  804. #endif // USER_SETTINGS_H
  805. ///////////////////////////// END USER-SETTINGS ////////////////////////////
  806. //#include "derived-settings-and-constants.h"
  807. ///////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS ////////////////////
  808. #ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
  809. #define DERIVED_SETTINGS_AND_CONSTANTS_H
  810. ///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
  811. // crt-royale: A full-featured CRT shader, with cheese.
  812. // Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
  813. //
  814. // This program is free software; you can redistribute it and/or modify it
  815. // under the terms of the GNU General Public License as published by the Free
  816. // Software Foundation; either version 2 of the License, or any later version.
  817. //
  818. // This program is distributed in the hope that it will be useful, but WITHOUT
  819. // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  820. // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  821. // more details.
  822. //
  823. // You should have received a copy of the GNU General Public License along with
  824. // this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  825. // Place, Suite 330, Boston, MA 02111-1307 USA
  826. ///////////////////////////////// DESCRIPTION ////////////////////////////////
  827. // These macros and constants can be used across the whole codebase.
  828. // Unlike the values in user-settings.h, end users shouldn't modify these.
  829. /////////////////////////////// BEGIN INCLUDES ///////////////////////////////
  830. //#include "../user-settings.h"
  831. ///////////////////////////// BEGIN USER-SETTINGS ////////////////////////////
  832. #ifndef USER_SETTINGS_H
  833. #define USER_SETTINGS_H
  834. ///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
  835. // The Cg compiler uses different "profiles" with different capabilities.
  836. // This shader requires a Cg compilation profile >= arbfp1, but a few options
  837. // require higher profiles like fp30 or fp40. The shader can't detect profile
  838. // or driver capabilities, so instead you must comment or uncomment the lines
  839. // below with "//" before "#define." Disable an option if you get compilation
  840. // errors resembling those listed. Generally speaking, all of these options
  841. // will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
  842. // likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
  843. // Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
  844. // Among other things, derivatives help us fix anisotropic filtering artifacts
  845. // with curved manually tiled phosphor mask coords. Related errors:
  846. // error C3004: function "float2 ddx(float2);" not supported in this profile
  847. // error C3004: function "float2 ddy(float2);" not supported in this profile
  848. //#define DRIVERS_ALLOW_DERIVATIVES
  849. // Fine derivatives: Unsupported on older ATI cards.
  850. // Fine derivatives enable 2x2 fragment block communication, letting us perform
  851. // fast single-pass blur operations. If your card uses coarse derivatives and
  852. // these are enabled, blurs could look broken. Derivatives are a prerequisite.
  853. #ifdef DRIVERS_ALLOW_DERIVATIVES
  854. #define DRIVERS_ALLOW_FINE_DERIVATIVES
  855. #endif
  856. // Dynamic looping: Requires an fp30 or newer profile.
  857. // This makes phosphor mask resampling faster in some cases. Related errors:
  858. // error C5013: profile does not support "for" statements and "for" could not
  859. // be unrolled
  860. //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
  861. // Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
  862. // Using one static loop avoids overhead if the user is right, but if the user
  863. // is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
  864. // binary search can potentially save some iterations. However, it may fail:
  865. // error C6001: Temporary register limit of 32 exceeded; 35 registers
  866. // needed to compile program
  867. //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
  868. // tex2Dlod: Requires an fp40 or newer profile. This can be used to disable
  869. // anisotropic filtering, thereby fixing related artifacts. Related errors:
  870. // error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
  871. // this profile
  872. //#define DRIVERS_ALLOW_TEX2DLOD
  873. // tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate
  874. // artifacts from anisotropic filtering and mipmapping. Related errors:
  875. // error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
  876. // in this profile
  877. //#define DRIVERS_ALLOW_TEX2DBIAS
  878. // Integrated graphics compatibility: Integrated graphics like Intel HD 4000
  879. // impose stricter limitations on register counts and instructions. Enable
  880. // INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
  881. // error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
  882. // to compile program.
  883. // Enabling integrated graphics compatibility mode will automatically disable:
  884. // 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
  885. // (This may be reenabled in a later release.)
  886. // 2.) RUNTIME_GEOMETRY_MODE
  887. // 3.) The high-quality 4x4 Gaussian resize for the bloom approximation
  888. //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
  889. //////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
  890. // To disable a #define option, turn its line into a comment with "//."
  891. // RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
  892. // Enable runtime shader parameters in the Retroarch (etc.) GUI? They override
  893. // many of the options in this file and allow real-time tuning, but many of
  894. // them are slower. Disabling them and using this text file will boost FPS.
  895. #define RUNTIME_SHADER_PARAMS_ENABLE
  896. // Specify the phosphor bloom sigma at runtime? This option is 10% slower, but
  897. // it's the only way to do a wide-enough full bloom with a runtime dot pitch.
  898. #define RUNTIME_PHOSPHOR_BLOOM_SIGMA
  899. // Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics)
  900. #define RUNTIME_ANTIALIAS_WEIGHTS
  901. // Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
  902. //#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
  903. // Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
  904. // parameters? This will require more math or dynamic branching.
  905. #define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
  906. // Specify the tilt at runtime? This makes things about 3% slower.
  907. #define RUNTIME_GEOMETRY_TILT
  908. // Specify the geometry mode at runtime?
  909. #define RUNTIME_GEOMETRY_MODE
  910. // Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
  911. // mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
  912. // dynamic branches? This is cheap if mask_resize_viewport_scale is small.
  913. #define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  914. // PHOSPHOR MASK:
  915. // Manually resize the phosphor mask for best results (slower)? Disabling this
  916. // removes the option to do so, but it may be faster without dynamic branches.
  917. #define PHOSPHOR_MASK_MANUALLY_RESIZE
  918. // If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
  919. #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
  920. // Larger blurs are expensive, but we need them to blur larger triads. We can
  921. // detect the right blur if the triad size is static or our profile allows
  922. // dynamic branches, but otherwise we use the largest blur the user indicates
  923. // they might need:
  924. #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
  925. //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
  926. //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
  927. //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
  928. // Here's a helpful chart:
  929. // MaxTriadSize BlurSize MinTriadCountsByResolution
  930. // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  931. // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  932. // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  933. // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  934. // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  935. /////////////////////////////// USER PARAMETERS //////////////////////////////
  936. // Note: Many of these static parameters are overridden by runtime shader
  937. // parameters when those are enabled. However, many others are static codepath
  938. // options that were cleaner or more convenient to code as static constants.
  939. // GAMMA:
  940. static const float crt_gamma_static = 2.5; // range [1, 5]
  941. static const float lcd_gamma_static = 2.2; // range [1, 5]
  942. // LEVELS MANAGEMENT:
  943. // Control the final multiplicative image contrast:
  944. static const float levels_contrast_static = 1.0; // range [0, 4)
  945. // We auto-dim to avoid clipping between passes and restore brightness
  946. // later. Control the dim factor here: Lower values clip less but crush
  947. // blacks more (static only for now).
  948. static const float levels_autodim_temp = 0.5; // range (0, 1]; default 0.5, which is the value set here. NOTE(review): an earlier note on this line said it was raised to 1.0 because 0.5 looked unnecessarily dark, but the constant is 0.5 - confirm the intended value.
  949. // HALATION/DIFFUSION/BLOOM:
  950. // Halation weight: How much energy should be lost to electrons bouncing
  951. // around under the CRT glass and exciting random phosphors?
  952. static const float halation_weight_static = 0.0; // range [0, 1]
  953. // Refractive diffusion weight: How much light should spread/diffuse from
  954. // refracting through the CRT glass?
  955. static const float diffusion_weight_static = 0.075; // range [0, 1]
  956. // Underestimate brightness: Bright areas bloom more, but we can base the
  957. // bloom brightpass on a lower brightness to sharpen phosphors, or a higher
  958. // brightness to soften them. Low values clip, but >= 0.8 looks okay.
  959. static const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
  960. // Blur all colors more than necessary for a softer phosphor bloom?
  961. static const float bloom_excess_static = 0.0; // range [0, 1]
  962. // The BLOOM_APPROX pass approximates a phosphor blur early on with a small
  963. // blurred resize of the input (convergence offsets are applied as well).
  964. // There are three filter options (static option only for now):
  965. // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize
  966. // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
  967. // and beam_max_sigma is low.
  968. // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
  969. // always uses a static sigma regardless of beam_max_sigma or
  970. // mask_num_triads_desired.
  971. // 2.) True 4x4 Gaussian resize: Slowest, technically correct.
  972. // These options are more pronounced for the fast, unbloomed shader version.
  973. #ifndef RADEON_FIX
  974. static const float bloom_approx_filter_static = 2.0; // option 2: true 4x4 Gaussian resize
  975. #else
  976. static const float bloom_approx_filter_static = 1.0; // option 1: 3x3 resize blur, used when RADEON_FIX is defined
  977. #endif // RADEON_FIX
  978. // ELECTRON BEAM SCANLINE DISTRIBUTION:
  979. // How many scanlines should contribute light to each pixel? Using more
  980. // scanlines is slower (especially for a generalized Gaussian) but less
  981. // distorted with larger beam sigmas (especially for a pure Gaussian). The
  982. // max_beam_sigma at which the closest unused weight is guaranteed <
  983. // 1.0/255.0 (for a 3x antialiased pure Gaussian) is:
  984. // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
  985. // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
  986. // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
  987. // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
  988. // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
  989. static const float beam_num_scanlines = 3.0; // range [2, 6]
  990. // A generalized Gaussian beam varies shape with color too, now just width.
  991. // It's slower but more flexible (static option only for now).
  992. static const bool beam_generalized_gaussian = true;
  993. // What kind of scanline antialiasing do you want?
  994. // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
  995. // Integrals are slow (especially for generalized Gaussians) and rarely any
  996. // better than 3x antialiasing (static option only for now).
  997. static const float beam_antialias_level = 1.0; // range [0, 2]
  998. // Min/max standard deviations for scanline beams: Higher values widen and
  999. // soften scanlines. Depending on other options, low min sigmas can alias.
  1000. static const float beam_min_sigma_static = 0.02; // range (0, 1]
  1001. static const float beam_max_sigma_static = 0.3; // range (0, 1]
  1002. // Beam width varies as a function of color: A power function (0) is more
  1003. // configurable, but a spherical function (1) gives the widest beam
  1004. // variability without aliasing (static option only for now).
  1005. static const float beam_spot_shape_function = 0.0;
  1006. // Spot shape power: Powers <= 1 give smoother spot shapes but lower
  1007. // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
  1008. static const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
  1009. // Generalized Gaussian max shape parameters: Higher values give flatter
  1010. // scanline plateaus and steeper dropoffs, simultaneously widening and
  1011. // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and
  1012. // values > ~40.0 cause artifacts with integrals.
  1013. static const float beam_min_shape_static = 2.0; // range [2, 32]
  1014. static const float beam_max_shape_static = 4.0; // range [2, 32]
  1015. // Generalized Gaussian shape power: Affects how quickly the distribution
  1016. // changes shape from Gaussian to steep/plateaued as color increases from 0
  1017. // to 1.0. Higher powers appear softer for most colors, and lower powers
  1018. // appear sharper for most colors.
  1019. static const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
  1020. // What filter should be used to sample scanlines horizontally?
  1021. // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
  1022. static const float beam_horiz_filter_static = 0.0;
  1023. // Standard deviation for horizontal Gaussian resampling:
  1024. static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
  1025. // Do horizontal scanline sampling in linear RGB (correct light mixing),
  1026. // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
  1027. // limiting circuitry in some CRT's), or a weighted avg.?
  1028. static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
  1029. // Simulate scanline misconvergence? This needs 3x horizontal texture
  1030. // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
  1031. // later passes (static option only for now).
  1032. static const bool beam_misconvergence = true;
  1033. // Convergence offsets in x/y directions for R/G/B scanline beams in units
  1034. // of scanlines. Positive offsets go right/down; ranges [-2, 2]
  1035. static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
  1036. static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
  1037. static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
  1038. // Detect interlacing (static option only for now)?
  1039. static const bool interlace_detect = true;
  1040. // Assume 1080-line sources are interlaced?
  1041. static const bool interlace_1080i_static = false;
  1042. // For interlaced sources, assume TFF (top-field first) or BFF order?
  1043. // (Whether this matters depends on the nature of the interlaced input.)
  1044. static const bool interlace_bff_static = false;
  1045. // ANTIALIASING:
  1046. // What AA level do you want for curvature/overscan/subpixels? Options:
  1047. // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
  1048. // (Static option only for now)
  1049. static const float aa_level = 12.0; // range [0, 24]
  1050. // What antialiasing filter do you want (static option only)? Options:
  1051. // 0: Box (separable), 1: Box (cylindrical),
  1052. // 2: Tent (separable), 3: Tent (cylindrical),
  1053. // 4: Gaussian (separable), 5: Gaussian (cylindrical),
  1054. // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
  1055. // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
  1056. // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
  1057. static const float aa_filter = 6.0; // range [0, 9]
  1058. // Flip the sample grid on odd/even frames (static option only for now)?
  1059. static const bool aa_temporal = false;
  1060. // Use RGB subpixel offsets for antialiasing? The pixel is at green, and
  1061. // the blue offset is the negative r offset; range [0, 0.5]
  1062. static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
  1063. // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
  1064. // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
  1065. // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
  1066. // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
  1067. // 4.) C = 0.0 is a soft spline filter.
  1068. static const float aa_cubic_c_static = 0.5; // range [0, 4]
  1069. // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
  1070. static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
  1071. // PHOSPHOR MASK:
  1072. // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
  1073. static const float mask_type_static = 1.0; // range [0, 2]
  1074. // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible.
  1075. // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
  1076. // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
  1077. // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This
  1078. // is halfway decent with LUT mipmapping but atrocious without it.
  1079. // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
  1080. // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch.
  1081. // This mode reuses the same masks, so triads will be enormous unless
  1082. // you change the mask LUT filenames in your .cgp file.
  1083. static const float mask_sample_mode_static = 0.0; // range [0, 2]
  1084. // Prefer setting the triad size (0.0) or number on the screen (1.0)?
  1085. // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
  1086. // will always be used to calculate the full bloom sigma statically.
  1087. static const float mask_specify_num_triads_static = 0.0; // range [0, 1]
  1088. // Specify the phosphor triad size, in pixels. Each tile (usually with 8
  1089. // triads) will be rounded to the nearest integer tile size and clamped to
  1090. // obey minimum size constraints (imposed to reduce downsize taps) and
  1091. // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
  1092. // To increase the size limit, double the viewport-relative scales for the
  1093. // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
  1094. // range [1, mask_texture_small_size/mask_triads_per_tile]
  1095. static const float mask_triad_size_desired_static = 24.0 / 8.0;
  1096. // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
  1097. // final size will be rounded and constrained as above); default 480.0
  1098. static const float mask_num_triads_desired_static = 480.0;
  1099. // How many lobes should the sinc/Lanczos resizer use? More lobes require
  1100. // more samples and avoid moire a bit better, but some is unavoidable
  1101. // depending on the destination size (static option for now).
  1102. static const float mask_sinc_lobes = 3.0; // range [2, 4]
  1103. // The mask is resized using a variable number of taps in each dimension,
  1104. // but some Cg profiles always fetch a constant number of taps no matter
  1105. // what (no dynamic branching). We can limit the maximum number of taps if
  1106. // we statically limit the minimum phosphor triad size. Larger values are
  1107. // faster, but the limit IS enforced (static option only, forever);
  1108. // range [1, mask_texture_small_size/mask_triads_per_tile]
  1109. // TODO: Make this 1.0 and compensate with smarter sampling!
  1110. static const float mask_min_allowed_triad_size = 2.0;
  1111. // GEOMETRY:
  1112. // Geometry mode:
  1113. // 0: Off (default), 1: Spherical mapping (like cgwg's),
  1114. // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
  1115. static const float geom_mode_static = 0.0; // range [0, 3]
  1116. // Radius of curvature: Measured in units of your viewport's diagonal size.
  1117. static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
  1118. // View dist is the distance from the player to their physical screen, in
  1119. // units of the viewport's diagonal size. It controls the field of view.
  1120. static const float geom_view_dist_static = 2.0; // range [0.5, 1024]
  1121. // Tilt angle in radians (clockwise around up and right vectors):
  1122. static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi]
  1123. // Aspect ratio: When the true viewport size is unknown, this value is used
  1124. // to help convert between the phosphor triad size and count, along with
  1125. // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set
  1126. // this equal to Retroarch's display aspect ratio (DAR) for best results;
  1127. // range [1, geom_max_aspect_ratio from user-cgp-constants.h];
  1128. // default (256/224)*(54/47) = 1.313069909 (see below)
  1129. static const float geom_aspect_ratio_static = 1.313069909;
  1130. // Before getting into overscan, here's some general aspect ratio info:
  1131. // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
  1132. // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
  1133. // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping
  1134. // Geometry processing has to "undo" the screen-space 2D DAR to calculate
  1135. // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in
  1136. // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either:
  1137. // a.) Enable Retroarch's "Crop Overscan"
  1138. // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
  1139. // Real consoles use horizontal black padding in the signal, but emulators
  1140. // often crop this without cropping the vertical padding; a 256x224 [S]NES
  1141. // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
  1142. // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
  1143. // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
  1144. // http://forums.nesdev.com/viewtopic.php?p=24815#p24815
  1145. // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
  1146. // without doing a. or b., but horizontal image borders will be tighter
  1147. // than vertical ones, messing up curvature and overscan. Fixing the
  1148. // padding first corrects this.
  1149. // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly
  1150. // or adjust x/y independently to e.g. readd horizontal padding, as noted
  1151. // above: Values < 1.0 zoom out; range (0, inf)
  1152. static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
  1153. // Compute a proper pixel-space to texture-space matrix even without ddx()/
  1154. // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering
  1155. // with strong curvature (static option only for now).
  1156. static const bool geom_force_correct_tangent_matrix = true;
  1157. // BORDERS:
  1158. // Rounded border size in texture uv coords:
  1159. static const float border_size_static = 0.015; // range [0, 0.5]
  1160. // Border darkness: Moderate values darken the border smoothly, and high
  1161. // values make the image very dark just inside the border:
  1162. static const float border_darkness_static = 2.0; // range [0, inf)
  1163. // Border compression: High numbers compress border transitions, narrowing
  1164. // the dark border area.
  1165. static const float border_compress_static = 2.5; // range [1, inf)
  1166. #endif // USER_SETTINGS_H
  1167. ///////////////////////////// END USER-SETTINGS ////////////////////////////
  1168. //#include "user-cgp-constants.h"
  1169. ///////////////////////// BEGIN USER-CGP-CONSTANTS /////////////////////////
  1170. #ifndef USER_CGP_CONSTANTS_H
  1171. #define USER_CGP_CONSTANTS_H
  1172. // IMPORTANT:
  1173. // These constants MUST be set appropriately for the settings in crt-royale.cgp
  1174. // (or whatever related .cgp file you're using). If they aren't, you're likely
  1175. // to get artifacts, the wrong phosphor mask size, etc. I wish these could be
  1176. // set directly in the .cgp file to make things easier, but...they can't.
  1177. // PASS SCALES AND RELATED CONSTANTS:
  1178. // Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of
  1179. // this shader: One does a viewport-scale bloom, and the other skips it. The
  1180. // latter benefits from a higher bloom_approx_scale_x, so save both separately:
  1181. static const float bloom_approx_size_x = 320.0;
  1182. static const float bloom_approx_size_x_for_fake = 400.0;
  1183. // Copy the viewport-relative scales of the phosphor mask resize passes
  1184. // (MASK_RESIZE and the pass immediately preceding it):
  1185. static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
  1186. // Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
  1187. static const float geom_max_aspect_ratio = 4.0/3.0;
  1188. // PHOSPHOR MASK TEXTURE CONSTANTS:
  1189. // Set the following constants to reflect the properties of the phosphor mask
  1190. // texture named in crt-royale.cgp. The shader optionally resizes a mask tile
  1191. // based on user settings, then repeats a single tile until filling the screen.
  1192. // The shader must know the input texture size (default 64x64), and to manually
  1193. // resize, it must also know the horizontal triads per tile (default 8).
  1194. static const float2 mask_texture_small_size = float2(64.0, 64.0);
  1195. static const float2 mask_texture_large_size = float2(512.0, 512.0);
  1196. static const float mask_triads_per_tile = 8.0;
  1197. // We need the average brightness of the phosphor mask to compensate for the
  1198. // dimming it causes. The following four values are roughly correct for the
  1199. // masks included with the shader. Update the value for any LUT texture you
  1200. // change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
  1201. // the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
  1202. //#define PHOSPHOR_MASK_GRILLE14
  1203. static const float mask_grille14_avg_color = 50.6666666/255.0;
  1204. // TileableLinearApertureGrille14Wide7d33Spacing*.png
  1205. // TileableLinearApertureGrille14Wide10And6Spacing*.png
  1206. static const float mask_grille15_avg_color = 53.0/255.0;
  1207. // TileableLinearApertureGrille15Wide6d33Spacing*.png
  1208. // TileableLinearApertureGrille15Wide8And5d5Spacing*.png
  1209. static const float mask_slot_avg_color = 46.0/255.0;
  1210. // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
  1211. // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
  1212. static const float mask_shadow_avg_color = 41.0/255.0;
  1213. // TileableLinearShadowMask*.png
  1214. // TileableLinearShadowMaskEDP*.png
  1215. #ifdef PHOSPHOR_MASK_GRILLE14
  1216. static const float mask_grille_avg_color = mask_grille14_avg_color;
  1217. #else
  1218. static const float mask_grille_avg_color = mask_grille15_avg_color;
  1219. #endif
  1220. #endif // USER_CGP_CONSTANTS_H
  1221. ////////////////////////// END USER-CGP-CONSTANTS //////////////////////////
  1222. //////////////////////////////// END INCLUDES ////////////////////////////////
  1223. /////////////////////////////// FIXED SETTINGS ///////////////////////////////
  1224. // Avoid dividing by zero; using a macro overloads for float, float2, etc.:
  1225. #define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16
  1226. // Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
  1227. #ifndef SIMULATE_CRT_ON_LCD
  1228. #define SIMULATE_CRT_ON_LCD
  1229. #endif
  1230. // Manually tiling a manually resized texture creates texture coord derivative
  1231. // discontinuities and confuses anisotropic filtering, causing discolored tile
  1232. // seams in the phosphor mask. Workarounds:
  1233. // a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's
  1234. // downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
  1235. // disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
  1236. // b.) "Tile flat twice" requires drawing two full tiles without border padding
  1237. // to the resized mask FBO, and it's incompatible with same-pass curvature.
  1238. // (Same-pass curvature isn't used but could be in the future...maybe.)
  1239. // c.) "Fix discontinuities" requires derivatives and drawing one tile with
  1240. // border padding to the resized mask FBO, but it works with same-pass
  1241. // curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
  1242. // Precedence: a, then b, then c (if multiple strategies are #defined).
  1243. #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen
  1244. #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen
  1245. #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen
  1246. // Also, manually resampling the phosphor mask is slightly blurrier with
  1247. // anisotropic filtering. (Resampling with mipmapping is even worse: It
  1248. // creates artifacts, but only with the fully bloomed shader.) The difference
  1249. // is subtle with small triads, but you can fix it for a small cost.
  1250. //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  1251. ////////////////////////////// DERIVED SETTINGS //////////////////////////////
  1252. // Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
  1253. // geometry mode at runtime, or a 4x4 true Gaussian resize. Disable
  1254. // incompatible settings ASAP. (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
  1255. // #defined by either user-settings.h or a wrapper .cg that #includes the
  1256. // current .cg pass.)
  1257. #ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE // compatibility mode: strip the settings listed above as unsupported
  1258. #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
  1259. #undef PHOSPHOR_MASK_MANUALLY_RESIZE
  1260. #endif // PHOSPHOR_MASK_MANUALLY_RESIZE
  1261. #ifdef RUNTIME_GEOMETRY_MODE
  1262. #undef RUNTIME_GEOMETRY_MODE
  1263. #endif // RUNTIME_GEOMETRY_MODE
  1264. // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
  1265. // inferior in most cases, so replace 2.0 with 0.0:
  1266. static const float bloom_approx_filter =
  1267. bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; // modes 0.0 and 1.0 pass through unchanged
  1268. #else // !INTEGRATED_GRAPHICS_COMPATIBILITY_MODE: use the static filter choice as-is
  1269. static const float bloom_approx_filter = bloom_approx_filter_static;
  1270. #endif // INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
  1271. // Disable slow runtime paths if static parameters are used. Most of these
  1272. // won't be a problem anyway once the params are disabled, but some will.
  1273. #ifndef RUNTIME_SHADER_PARAMS_ENABLE // static params only: switch off every runtime-path toggle below
  1274. #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
  1275. #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
  1276. #endif
  1277. #ifdef RUNTIME_ANTIALIAS_WEIGHTS
  1278. #undef RUNTIME_ANTIALIAS_WEIGHTS
  1279. #endif
  1280. #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
  1281. #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
  1282. #endif
  1283. #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
  1284. #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
  1285. #endif
  1286. #ifdef RUNTIME_GEOMETRY_TILT
  1287. #undef RUNTIME_GEOMETRY_TILT
  1288. #endif
  1289. #ifdef RUNTIME_GEOMETRY_MODE
  1290. #undef RUNTIME_GEOMETRY_MODE
  1291. #endif
  1292. #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  1293. #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  1294. #endif
  1295. #endif // !RUNTIME_SHADER_PARAMS_ENABLE
  1296. // Make tex2Dbias a backup for tex2Dlod for wider compatibility.
  1297. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  1298. #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  1299. #endif
  1300. #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  1301. #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  1302. #endif
  1303. // Rule out unavailable anisotropic compatibility strategies:
  1304. #ifndef DRIVERS_ALLOW_DERIVATIVES // no derivatives: drop the strategy that depends on them
  1305. #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  1306. #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  1307. #endif
  1308. #endif // !DRIVERS_ALLOW_DERIVATIVES
  1309. #ifndef DRIVERS_ALLOW_TEX2DLOD // no tex2Dlod: drop both tex2Dlod strategies and ANTIALIAS_DISABLE_ANISOTROPIC
  1310. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  1311. #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  1312. #endif
  1313. #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  1314. #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  1315. #endif
  1316. #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
  1317. #undef ANTIALIAS_DISABLE_ANISOTROPIC
  1318. #endif
  1319. #endif // !DRIVERS_ALLOW_TEX2DLOD
  1320. #ifndef DRIVERS_ALLOW_TEX2DBIAS // no tex2Dbias: drop both tex2Dbias fallbacks as well
  1321. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  1322. #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  1323. #endif
  1324. #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  1325. #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  1326. #endif
  1327. #endif // !DRIVERS_ALLOW_TEX2DBIAS
  1328. // Prioritize anisotropic tiling compatibility strategies by performance and
  1329. // disable unused strategies. This concentrates all the nesting in one place.
  1330. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD // tex2Dlod survived the checks above: it overrides every other strategy
  1331. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  1332. #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  1333. #endif
  1334. #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  1335. #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  1336. #endif
  1337. #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  1338. #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  1339. #endif
  1340. #else // tex2Dlod strategy not in effect
  1341. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS // tex2Dbias is next: it overrides the two tile-based strategies
  1342. #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  1343. #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  1344. #endif
  1345. #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  1346. #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  1347. #endif
  1348. #else // neither tex2Dlod nor tex2Dbias is in effect
  1349. // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
  1350. // flat texture coords in the same pass, but that's all we use.
  1351. #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // tile-flat-twice overrides fix-discontinuities
  1352. #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  1353. #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  1354. #endif // ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  1355. #endif // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  1356. #endif // ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  1357. #endif // ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  1358. // The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
  1359. // reduce some #ifdef nesting in the next section by essentially OR'ing them:
  1360. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  1361. #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
  1362. #endif
  1363. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  1364. #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
  1365. #endif
  1366. // Prioritize anisotropic resampling compatibility strategies the same way:
  1367. #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  1368. #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  1369. #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  1370. #endif
  1371. #endif
  1372. /////////////////////// DERIVED PHOSPHOR MASK CONSTANTS //////////////////////
  1373. // If we can use the large mipmapped LUT without mipmapping artifacts, we
  1374. // should: It gives us more options for using fewer samples.
  1375. #ifdef DRIVERS_ALLOW_TEX2DLOD
  1376. #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD // both required before the large LUT is chosen
  1377. // TODO: Take advantage of this!
  1378. #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
  1379. static const float2 mask_resize_src_lut_size = mask_texture_large_size;
  1380. #else // tex2Dlod allowed, but the resampling strategy is not enabled
  1381. static const float2 mask_resize_src_lut_size = mask_texture_small_size;
  1382. #endif // ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  1383. #else // !DRIVERS_ALLOW_TEX2DLOD: always resize from the small LUT
  1384. static const float2 mask_resize_src_lut_size = mask_texture_small_size;
  1385. #endif // DRIVERS_ALLOW_TEX2DLOD
  1386. // tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
  1387. // main_fragment, or a static alias of one of the above. This makes it hard
  1388. // to select the phosphor mask at runtime: We can't even assign to a uniform
  1389. // global in the vertex shader or select a sampler2D in the vertex shader and
  1390. // pass it to the fragment shader (even with explicit TEXUNIT# bindings),
  1391. // because it just gives us the input texture or a black screen. However, we
  1392. // can get around these limitations by calling tex2D three times with different
  1393. // uniform samplers (or resizing the phosphor mask three times altogether).
  1394. // With dynamic branches, we can process only one of these branches on top of
  1395. // quickly discarding fragments we don't need (cgc seems able to overcome
  1396. // limitations around dependent texture fetches inside of branches). Without
  1397. // dynamic branches, we have to process every branch for every fragment...which
  1398. // is slower. Runtime sampling mode selection is slower without dynamic
  1399. // branches as well. Let the user's static #defines decide if it's worth it.
  1400. #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES // dynamic branches available: always enable runtime selection
  1401. #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  1402. #else // no dynamic branches: enable it only when the user explicitly forces it
  1403. #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  1404. #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  1405. #endif // FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  1406. #endif // DRIVERS_ALLOW_DYNAMIC_BRANCHES
  1407. // We need to render some minimum number of tiles in the resize passes.
  1408. // We need at least 1.0 just to repeat a single tile, and we need extra
  1409. // padding beyond that for anisotropic filtering, discontinuity fixing,
  1410. // antialiasing, same-pass curvature (not currently used), etc. First
  1411. // determine how many border texels and tiles we need, based on how the result
  1412. // will be sampled:
  1413. #ifdef GEOMETRY_EARLY
  1414. // Use the offset's magnitude: the static red offset may be negative (blue mirrors it), and a signed value would shrink the border.
  1415. static const float max_subpixel_offset = max(
  1416. abs(aa_subpixel_r_offset_static.x), abs(aa_subpixel_r_offset_static.y));
  1417. static const float max_aa_base_pixel_border = 4.0 + max_subpixel_offset; // 4.0 = base radius (in pixels) of most antialiasing filters
  1418. #else // !GEOMETRY_EARLY: no antialiasing border is added
  1419. static const float max_aa_base_pixel_border = 0.0;
  1420. #endif // GEOMETRY_EARLY
  1421. // Anisotropic filtering adds about 0.5 to the pixel border:
  1422. #ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
  1423. static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
  1424. #else
  1425. static const float max_aniso_pixel_border = max_aa_base_pixel_border;
  1426. #endif
  1427. // Fixing discontinuities adds 1.0 more to the pixel border:
  1428. #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  1429. static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
  1430. #else
  1431. static const float max_tiled_pixel_border = max_aniso_pixel_border;
  1432. #endif
  1433. // Convert the pixel border to an integer texel border. Assume same-pass
  1434. // curvature about triples the texel frequency:
  1435. #ifdef GEOMETRY_EARLY
  1436. static const float max_mask_texel_border =
  1437. ceil(max_tiled_pixel_border * 3.0);
  1438. #else
  1439. static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
  1440. #endif
  1441. // Convert the texel border to a tile border using worst-case assumptions:
  1442. static const float max_mask_tile_border = max_mask_texel_border/
  1443. (mask_min_allowed_triad_size * mask_triads_per_tile);
  1444. // Finally, set the number of resized tiles to render to MASK_RESIZE, and set
  1445. // the starting texel (inside borders) for sampling it.
  1446. #ifndef GEOMETRY_EARLY
  1447. #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  1448. // Special case: Render two tiles without borders. Anisotropic
  1449. // filtering doesn't seem to be a problem here.
  1450. static const float mask_resize_num_tiles = 1.0 + 1.0;
  1451. static const float mask_start_texels = 0.0;
  1452. #else // general case: one tile plus a max_mask_tile_border on each side
  1453. static const float mask_resize_num_tiles = 1.0 +
  1454. 2.0 * max_mask_tile_border;
  1455. static const float mask_start_texels = max_mask_texel_border; // sampling starts just inside the leading border
  1456. #endif // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  1457. #else // GEOMETRY_EARLY: always the bordered layout, regardless of the tile-flat-twice strategy
  1458. static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
  1459. static const float mask_start_texels = max_mask_texel_border;
  1460. #endif // !GEOMETRY_EARLY
  1461. // We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
  1462. // mask_resize_viewport_scale. This limits the maximum final triad size.
  1463. // Estimate the minimum number of triads we can split the screen into in each
  1464. // dimension (we'll be as correct as mask_resize_viewport_scale is):
  1465. static const float mask_resize_num_triads =
  1466. mask_resize_num_tiles * mask_triads_per_tile;
  1467. static const float2 min_allowed_viewport_triads =
  1468. float2(mask_resize_num_triads) / mask_resize_viewport_scale;
  1469. //////////////////////// COMMON MATHEMATICAL CONSTANTS ///////////////////////
  1470. static const float pi = 3.141592653589;
  1471. // We often want to find the location of the previous texel, e.g.:
  1472. // const float2 curr_texel = uv * texture_size;
  1473. // const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
  1474. // const float2 prev_texel_uv = prev_texel / texture_size;
  1475. // However, many GPU drivers round incorrectly around exact texel locations.
  1476. // We need to subtract a little less than 0.5 before flooring, and some GPU's
  1477. // require this value to be farther from 0.5 than others; define it here.
  1478. // const float2 prev_texel =
  1479. // floor(curr_texel - float2(under_half)) + float2(0.5);
  1480. static const float under_half = 0.4995;
  1481. #endif // DERIVED_SETTINGS_AND_CONSTANTS_H
  1482. //////////////////// END DERIVED-SETTINGS-AND-CONSTANTS /////////////////////
  1483. //////////////////////////////// END INCLUDES ////////////////////////////////
  // Overrides consumed by gamma-management.h and tex2Dantialias.h.
  // OVERRIDE_DEVICE_GAMMA makes get_gba_gamma() return the global gba_gamma,
  // so the constant has to exist even though its value is irrelevant here.
  static const float gba_gamma = 3.5;
  #define OVERRIDE_DEVICE_GAMMA
  #define ANTIALIAS_OVERRIDE_PARAMETERS
  #define ANTIALIAS_OVERRIDE_BASICS
  1489. // Provide accessors for vector constants that pack scalar uniforms:
  // Build a unit-length aspect vector from a scalar aspect ratio.
  //   geom_aspect_ratio: requested ratio; capped at geom_max_aspect_ratio.
  //   returns: normalize((capped ratio, 1.0)), so the absolute scale does not
  //            affect the uv-mapping used for curvature.
  inline float2 get_aspect_vector(const float geom_aspect_ratio)
  {
      return normalize(float2(min(geom_aspect_ratio, geom_max_aspect_ratio), 1.0));
  }
  // Pack the scalar overscan settings into one (x, y) vector.
  inline float2 get_geom_overscan_vector()
  {
      float2 overscan;
      overscan.x = geom_overscan_x;
      overscan.y = geom_overscan_y;
      return overscan;
  }
  // Pack the scalar tilt-angle settings into one (x, y) vector.
  inline float2 get_geom_tilt_angle_vector()
  {
      float2 tilt_angle;
      tilt_angle.x = geom_tilt_angle_x;
      tilt_angle.y = geom_tilt_angle_y;
      return tilt_angle;
  }
  // Horizontal convergence offsets packed per channel as (r, g, b).
  inline float3 get_convergence_offsets_x_vector()
  {
      float3 offsets_x = float3(convergence_offset_x_r,
                                convergence_offset_x_g,
                                convergence_offset_x_b);
      return offsets_x;
  }
  // Vertical convergence offsets packed per channel as (r, g, b).
  inline float3 get_convergence_offsets_y_vector()
  {
      float3 offsets_y = float3(convergence_offset_y_r,
                                convergence_offset_y_g,
                                convergence_offset_y_b);
      return offsets_y;
  }
  // Red-channel convergence offset as an (x, y) vector.
  inline float2 get_convergence_offsets_r_vector()
  {
      float2 offset_r;
      offset_r.x = convergence_offset_x_r;
      offset_r.y = convergence_offset_y_r;
      return offset_r;
  }
  // Green-channel convergence offset as an (x, y) vector.
  inline float2 get_convergence_offsets_g_vector()
  {
      float2 offset_g;
      offset_g.x = convergence_offset_x_g;
      offset_g.y = convergence_offset_y_g;
      return offset_g;
  }
  // Blue-channel convergence offset as an (x, y) vector.
  inline float2 get_convergence_offsets_b_vector()
  {
      float2 offset_b;
      offset_b.x = convergence_offset_x_b;
      offset_b.y = convergence_offset_y_b;
      return offset_b;
  }
  // Return the red subpixel offset used for antialiasing.
  // The runtime x/y settings are used only when both RUNTIME_ANTIALIAS_WEIGHTS
  // and RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS are defined; every other
  // configuration returns aa_subpixel_r_offset_static.
  inline float2 get_aa_subpixel_r_offset()
  {
  #if defined(RUNTIME_ANTIALIAS_WEIGHTS) && defined(RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS)
      // WARNING: THIS IS EXTREMELY EXPENSIVE.
      float2 runtime_offset;
      runtime_offset.x = aa_subpixel_r_offset_x_runtime;
      runtime_offset.y = aa_subpixel_r_offset_y_runtime;
      return runtime_offset;
  #else
      return aa_subpixel_r_offset_static;
  #endif
  }
  1544. // Provide accessors for settings which still need "cooking":
  // Return the amplification factor for the selected mask: the reciprocal of
  // that mask's average color. mask_type < 0.5 picks the grille value,
  // mask_type < 1.5 the slot value, and anything else the shadow value.
  inline float get_mask_amplify()
  {
      if(mask_type < 0.5)
      {
          return 1.0/mask_grille_avg_color;
      }
      if(mask_type < 1.5)
      {
          return 1.0/mask_slot_avg_color;
      }
      return 1.0/mask_shadow_avg_color;
  }
  // Return the phosphor mask sample mode.
  // Source: mask_sample_mode_desired when RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  // is defined, otherwise mask_sample_mode_static.
  // Range: returned as-is when PHOSPHOR_MASK_MANUALLY_RESIZE is defined,
  // otherwise clamped to [1.0, 2.0].
  inline float get_mask_sample_mode()
  {
  #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
      float requested_mode = mask_sample_mode_desired;
  #else
      float requested_mode = mask_sample_mode_static;
  #endif
  #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
      return requested_mode;
  #else
      return clamp(requested_mode, 1.0, 2.0);
  #endif
  }
  1570. #endif // BIND_SHADER_PARAMS_H
  1571. //////////////////////////// END BIND-SHADER-PARAMS ///////////////////////////
  1572. //#include "../../../../include/gamma-management.h"
  1573. //////////////////////////// BEGIN GAMMA-MANAGEMENT //////////////////////////
  1574. #ifndef GAMMA_MANAGEMENT_H
  1575. #define GAMMA_MANAGEMENT_H
  1576. ///////////////////////////////// MIT LICENSE ////////////////////////////////
  1577. // Copyright (C) 2014 TroggleMonkey
  1578. //
  1579. // Permission is hereby granted, free of charge, to any person obtaining a copy
  1580. // of this software and associated documentation files (the "Software"), to
  1581. // deal in the Software without restriction, including without limitation the
  1582. // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  1583. // sell copies of the Software, and to permit persons to whom the Software is
  1584. // furnished to do so, subject to the following conditions:
  1585. //
  1586. // The above copyright notice and this permission notice shall be included in
  1587. // all copies or substantial portions of the Software.
  1588. //
  1589. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  1590. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  1591. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  1592. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  1593. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  1594. // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  1595. // IN THE SOFTWARE.
  1596. ///////////////////////////////// DESCRIPTION ////////////////////////////////
  1597. // This file provides gamma-aware tex*D*() and encode_output() functions.
  1598. // Requires: Before #include-ing this file, the including file must #define
  1599. // the following macros when applicable and follow their rules:
  1600. // 1.) #define FIRST_PASS if this is the first pass.
  1601. // 2.) #define LAST_PASS if this is the last pass.
  1602. // 3.) If sRGB is available, set srgb_framebufferN = "true" for
  1603. // every pass except the last in your .cgp preset.
  1604. // 4.) If sRGB isn't available but you want gamma-correctness with
  1605. // no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
  1606. // 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
  1607. // 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
  1608. // 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
  1609. // 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
  1610. // If an option in [5, 8] is #defined in the first or last pass, it
  1611. // should be #defined for both. It shouldn't make a difference
  1612. // whether it's #defined for intermediate passes or not.
  1613. // Optional: The including file (or an earlier included file) may optionally
  1614. // #define a number of macros indicating it will override certain
  1615. // static constants with either static or uniform constants. The
  1616. // macros and associated constants are as follows:
  1617. // 1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
  1618. // static const float ntsc_gamma
  1619. // static const float pal_gamma
  1620. // static const float crt_reference_gamma_high
  1621. // static const float crt_reference_gamma_low
  1622. // static const float lcd_reference_gamma
  1623. // static const float crt_office_gamma
  1624. // static const float lcd_office_gamma
  1625. // 2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
  1626. // static const float crt_gamma
  1627. // static const float gba_gamma
  1628. // static const float lcd_gamma
  1629. // 3.) OVERRIDE_FINAL_GAMMA: The user must first define:
  1630. // static const float input_gamma
  1631. // static const float intermediate_gamma
  1632. // static const float output_gamma
  1633. // (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
  1634. // 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
  1635. // static const bool assume_opaque_alpha
  1636. // The gamma constant overrides must be used in every pass or none,
  1637. // and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
  1638. // OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
  1639. // Usage: After setting macros appropriately, ignore gamma correction and
  1640. // replace all tex*D*() calls with equivalent gamma-aware
  1641. // tex*D*_linearize calls, except:
  1642. // 1.) When you read an LUT, use regular tex*D or a gamma-specified
  1643. // function, depending on its gamma encoding:
  1644. // tex*D*_linearize_gamma (takes a runtime gamma parameter)
  1645. // 2.) If you must read pass0's original input in a later pass, use
  1646. // tex2D_linearize_ntsc_gamma. If you want to read pass0's
  1647. // input with gamma-corrected bilinear filtering, consider
  1648. // creating a first linearizing pass and reading from the input
  1649. // of pass1 later.
  1650. // Then, return encode_output(color) from every fragment shader.
  1651. // Finally, use the global gamma_aware_bilinear boolean if you want
  1652. // to statically branch based on whether bilinear filtering is
  1653. // gamma-correct or not (e.g. for placing Gaussian blur samples).
  1654. //
  1655. // Detailed Policy:
  1656. // tex*D*_linearize() functions enforce a consistent gamma-management policy
  1657. // based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume
  1658. // their input texture has the same encoding characteristics as the input for
  1659. // the current pass (which doesn't apply to the exceptions listed above).
  1660. // Similarly, encode_output() enforces a policy based on the LAST_PASS and
  1661. // GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the
  1662. // following two pipelines.
  1663. // Typical pipeline with intermediate sRGB framebuffers:
  1664. // linear_color = pow(pass0_encoded_color, input_gamma);
  1665. // intermediate_output = linear_color; // Automatic sRGB encoding
  1666. // linear_color = intermediate_output; // Automatic sRGB decoding
  1667. // final_output = pow(linear_color, 1.0/output_gamma);
  1668. // Typical pipeline without intermediate sRGB framebuffers:
  1669. // linear_color = pow(pass0_encoded_color, input_gamma);
  1670. // intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
  1671. // linear_color = pow(intermediate_output, intermediate_gamma);
  1672. // final_output = pow(linear_color, 1.0/output_gamma);
  1673. // Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
  1674. // easily get gamma-correctness without banding on devices where sRGB isn't
  1675. // supported.
  1676. //
  1677. // Use This Header to Maximize Code Reuse:
  1678. // The purpose of this header is to provide a consistent interface for texture
  1679. // reads and output gamma-encoding that localizes and abstracts away all the
  1680. // annoying details. This greatly reduces the amount of code in each shader
  1681. // pass that depends on the pass number in the .cgp preset or whether sRGB
  1682. // FBO's are being used: You can trivially change the gamma behavior of your
  1683. // whole pass by commenting or uncommenting 1-3 #defines. To reuse the same
  1684. // code in your first, Nth, and last passes, you can even put it all in another
  1685. // header file and #include it from skeleton .cg files that #define the
  1686. // appropriate pass-specific settings.
  1687. //
  1688. // Rationale for Using Three Macros:
  1689. // This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
  1690. // SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
  1691. // a lower maintenance burden on each pass. At first glance it seems we could
  1692. // accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
  1693. // This works for simple use cases where input_gamma == output_gamma, but it
  1694. // breaks down for more complex scenarios like CRT simulation, where the pass
  1695. // number determines the gamma encoding of the input and output.
  1696. /////////////////////////////// BASE CONSTANTS ///////////////////////////////
  // Standard gamma constants. #define OVERRIDE_STANDARD_GAMMA (and declare all
  // seven constants yourself) to replace them.
  #ifndef OVERRIDE_STANDARD_GAMMA
      // Standard encoding gammas:
      static const float pal_gamma = 2.8; // Never actually 2.8 in practice
      static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too?
      // Typical device decoding gammas (only use these to emulate devices).
      // The CRT/LCD reference gammas sit above the NTSC and Rec.709 video
      // standard gammas: the standards deliberately undercorrected for an
      // analog CRT's assumed 2.5 reference display gamma, to keep contrast in
      // the assumed [dark] viewing conditions:
      // http://www.poynton.com/PDFs/GammaFAQ.pdf
      // Those unstated assumptions about display gamma and perceptual
      // rendering intent caused a lot of confusion, and more modern CRT's
      // seemed to target NTSC 2.2 gamma with circuitry. LCD displays seem to
      // have followed suit (they struggle near black with 2.5 gamma anyway),
      // especially PC/laptop displays meant for viewing sRGB in bright
      // environments. (Standards are in flux again with BT.1886, but it is
      // underspecified for displays.)
      static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55)
      static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55)
      static const float lcd_reference_gamma = 2.5; // To match CRT
      static const float lcd_office_gamma = 2.2; // Approximates sRGB
      static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC
  #endif // OVERRIDE_STANDARD_GAMMA
  // Treating alpha as 1.0 may help users sidestep some bugs, but only when
  // they know it is happening, so it is off unless overridden.
  #ifndef OVERRIDE_ALPHA_ASSUMPTIONS
      static const bool assume_opaque_alpha = false;
  #endif
  1724. /////////////////////// DERIVED CONSTANTS AS FUNCTIONS ///////////////////////
  1725. // gamma-management.h should be compatible with overriding gamma values with
  1726. // runtime user parameters, but we can only define other global constants in
  1727. // terms of static constants, not uniform user parameters. To get around this
  1728. // limitation, we need to define derived constants using functions.
  1729. // Set device gamma constants, but allow users to override them:
  // Device gamma accessors. With OVERRIDE_DEVICE_GAMMA the user promises to
  // define crt_gamma, gba_gamma and lcd_gamma globally; without it the
  // built-in defaults below are returned.
  inline float get_crt_gamma()
  {
  #ifdef OVERRIDE_DEVICE_GAMMA
      return crt_gamma;
  #else
      return crt_reference_gamma_high;
  #endif
  }
  inline float get_gba_gamma()
  {
  #ifdef OVERRIDE_DEVICE_GAMMA
      return gba_gamma;
  #else
      return 3.5; // Game Boy Advance; in (3.0, 4.0)
  #endif
  }
  inline float get_lcd_gamma()
  {
  #ifdef OVERRIDE_DEVICE_GAMMA
      return lcd_gamma;
  #else
      return lcd_office_gamma;
  #endif
  }
  1740. // Set decoding/encoding gammas for the first/last passes, but allow overrides:
  #ifdef OVERRIDE_FINAL_GAMMA
      // The user promises to define intermediate_gamma, input_gamma and
      // output_gamma globally; no SIMULATE_* macro is consulted on this path.
      inline float get_intermediate_gamma()
      {
          return intermediate_gamma;
      }
      inline float get_input_gamma()
      {
          return input_gamma;
      }
      inline float get_output_gamma()
      {
          return output_gamma;
      }
  #else
      // When every pass is gamma-corrected, ntsc_gamma is always used between
      // passes so middle passes need not care whether anything is simulated.
      inline float get_intermediate_gamma()
      {
          return ntsc_gamma;
      }
      // The first SIMULATE_* macro that is defined wins, in the order below.
      #if defined(SIMULATE_CRT_ON_LCD)
          inline float get_input_gamma()
          {
              return get_crt_gamma();
          }
          inline float get_output_gamma()
          {
              return get_lcd_gamma();
          }
      #elif defined(SIMULATE_GBA_ON_LCD)
          inline float get_input_gamma()
          {
              return get_gba_gamma();
          }
          inline float get_output_gamma()
          {
              return get_lcd_gamma();
          }
      #elif defined(SIMULATE_LCD_ON_CRT)
          inline float get_input_gamma()
          {
              return get_lcd_gamma();
          }
          inline float get_output_gamma()
          {
              return get_crt_gamma();
          }
      #elif defined(SIMULATE_GBA_ON_CRT)
          inline float get_input_gamma()
          {
              return get_gba_gamma();
          }
          inline float get_output_gamma()
          {
              return get_crt_gamma();
          }
      #else
          // Nothing is simulated: ntsc_gamma on both ends.
          inline float get_input_gamma()
          {
              return ntsc_gamma;
          }
          inline float get_output_gamma()
          {
              return ntsc_gamma;
          }
      #endif // SIMULATE_*
  #endif // OVERRIDE_FINAL_GAMMA
  1773. // Set decoding/encoding gammas for the current pass. Use static constants for
  1774. // linearize_input and gamma_encode_output, because they aren't derived, and
  1775. // they let the compiler do dead-code elimination.
  // Input side of the current pass: whether to decode, and with which gamma.
  #if defined(FIRST_PASS)
      // The first pass decodes with the input gamma in every configuration.
      static const bool linearize_input = true;
      inline float get_pass_input_gamma()
      {
          return get_input_gamma();
      }
  #elif defined(GAMMA_ENCODE_EVERY_FBO)
      // Later passes decode the intermediate gamma the previous pass applied.
      static const bool linearize_input = true;
      inline float get_pass_input_gamma()
      {
          return get_intermediate_gamma();
      }
  #else
      // Later passes without GAMMA_ENCODE_EVERY_FBO do no manual decoding.
      static const bool linearize_input = false;
      inline float get_pass_input_gamma()
      {
          return 1.0;
      }
  #endif
  // Output side of the current pass: whether to encode, and with which gamma.
  #if defined(LAST_PASS)
      // The last pass encodes with the output gamma in every configuration.
      static const bool gamma_encode_output = true;
      inline float get_pass_output_gamma()
      {
          return get_output_gamma();
      }
  #elif defined(GAMMA_ENCODE_EVERY_FBO)
      // Earlier passes encode with the intermediate gamma.
      static const bool gamma_encode_output = true;
      inline float get_pass_output_gamma()
      {
          return get_intermediate_gamma();
      }
  #else
      // Earlier passes without GAMMA_ENCODE_EVERY_FBO do no manual encoding.
      static const bool gamma_encode_output = false;
      inline float get_pass_output_gamma()
      {
          return 1.0;
      }
  #endif
  // Exposed so users can branch on whether bilinear filtering is gamma-correct;
  // it is simply the negation of linearize_input.
  static const bool gamma_aware_bilinear = !linearize_input;
  1807. ////////////////////// COLOR ENCODING/DECODING FUNCTIONS /////////////////////
  // Gamma-encode a fragment color for this pass's output.
  //   color: color to encode.
  //   returns: color unchanged when gamma_encode_output is false; otherwise
  //            rgb raised to 1.0/get_pass_output_gamma(), with alpha forced to
  //            1.0 if assume_opaque_alpha is set and passed through if not.
  inline float4 encode_output(const float4 color)
  {
      if(!gamma_encode_output)
      {
          return color;
      }
      float3 encoded_rgb = pow(color.rgb, float3(1.0/get_pass_output_gamma()));
      float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
      return float4(encoded_rgb, out_alpha);
  }
  // Linearize a sampled color according to this pass's input policy.
  //   color: sampled color.
  //   returns: color unchanged when linearize_input is false; otherwise rgb
  //            raised to get_pass_input_gamma(), with alpha forced to 1.0 if
  //            assume_opaque_alpha is set and passed through if not.
  inline float4 decode_input(const float4 color)
  {
      if(!linearize_input)
      {
          return color;
      }
      float3 linear_rgb = pow(color.rgb, float3(get_pass_input_gamma()));
      float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
      return float4(linear_rgb, out_alpha);
  }
  // Linearize a sampled color with a caller-supplied per-channel gamma.
  // Unlike decode_input, this always decodes; linearize_input is not consulted.
  //   color: sampled color.
  //   gamma: exponent applied to each rgb channel.
  //   returns: (pow(rgb, gamma), alpha), where alpha is 1.0 under
  //            assume_opaque_alpha and color.a otherwise.
  inline float4 decode_gamma_input(const float4 color, const float3 gamma)
  {
      float3 linear_rgb = pow(color.rgb, gamma);
      float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
      return float4(linear_rgb, out_alpha);
  }
  1855. //TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
  1856. //#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
  1857. // EDIT: it's the 'const' in front of the coords that's doing it
  1858. /////////////////////////// TEXTURE LOOKUP WRAPPERS //////////////////////////
  1859. // "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
  1860. // Provide a wide array of linearizing texture lookup wrapper functions. The
  1861. // Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
  1862. // lookups are provided for completeness in case that changes someday. Nobody
  1863. // is likely to use the *fetch and *proj functions, but they're included just
  1864. // in case. The only tex*D texture sampling functions omitted are:
  1865. // - tex*Dcmpbias
  1866. // - tex*Dcmplod
  1867. // - tex*DARRAY*
  1868. // - tex*DMS*
  1869. // - Variants returning integers
  1870. // Standard line length restrictions are ignored below for vertical brevity.
  1871. /*
  1872. // tex1D:
  1873. inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
  1874. { return decode_input(tex1D(tex, tex_coords)); }
  1875. inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
  1876. { return decode_input(tex1D(tex, tex_coords)); }
  1877. inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
  1878. { return decode_input(tex1D(tex, tex_coords, texel_off)); }
  1879. inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
  1880. { return decode_input(tex1D(tex, tex_coords, texel_off)); }
  1881. inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
  1882. { return decode_input(tex1D(tex, tex_coords, dx, dy)); }
  1883. inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
  1884. { return decode_input(tex1D(tex, tex_coords, dx, dy)); }
  1885. inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
  1886. { return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); }
  1887. inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
  1888. { return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); }
  1889. // tex1Dbias:
  1890. inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
  1891. { return decode_input(tex1Dbias(tex, tex_coords)); }
  1892. inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
  1893. { return decode_input(tex1Dbias(tex, tex_coords, texel_off)); }
  1894. // tex1Dfetch:
  1895. inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
  1896. { return decode_input(tex1Dfetch(tex, tex_coords)); }
  1897. inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
  1898. { return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); }
  1899. // tex1Dlod:
  1900. inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
  1901. { return decode_input(tex1Dlod(tex, tex_coords)); }
  1902. inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
  1903. { return decode_input(tex1Dlod(tex, tex_coords, texel_off)); }
  1904. // tex1Dproj:
  1905. inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
  1906. { return decode_input(tex1Dproj(tex, tex_coords)); }
  1907. inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
  1908. { return decode_input(tex1Dproj(tex, tex_coords)); }
  1909. inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
  1910. { return decode_input(tex1Dproj(tex, tex_coords, texel_off)); }
  1911. inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
  1912. { return decode_input(tex1Dproj(tex, tex_coords, texel_off)); }
  1913. */
  1914. // tex2D:
  // Sample tex at tex_coords and linearize the result via decode_input.
  inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
  {
      float4 encoded_sample = COMPAT_TEXTURE(tex, tex_coords);
      return decode_input(encoded_sample);
  }
  // Same lookup for three-component coordinates; only .xy is used.
  inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
  {
      float4 encoded_sample = COMPAT_TEXTURE(tex, tex_coords.xy);
      return decode_input(encoded_sample);
  }
  // Sample tex at tex_coords shifted by texel_off texels, then linearize the
  // result via decode_input.
  // The earlier body passed texel_off as the LOD argument of textureLod, so
  // the "offset" chose a mip level instead of moving the sample position.
  // textureOffset cannot be used here because it needs a constant-expression
  // offset, so the shift is applied to the coordinates using the level-0
  // texture size from textureSize.
  // NOTE(review): the scalar offset is applied to both axes -- confirm this
  // matches the Cg tex2D(sampler, coords, texelOff) behavior callers expect.
  inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
  {
      float2 uv_per_texel = float2(1.0) / float2(textureSize(tex, 0));
      float2 shifted_uv = tex_coords + float2(texel_off) * uv_per_texel;
      return decode_input(COMPAT_TEXTURE(tex, shifted_uv));
  }
  // Same lookup for three-component coordinates; only .xy is used.
  inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
  {
      float2 uv_per_texel = float2(1.0) / float2(textureSize(tex, 0));
      float2 shifted_uv = tex_coords.xy + float2(texel_off) * uv_per_texel;
      return decode_input(COMPAT_TEXTURE(tex, shifted_uv));
  }
  1923. //inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
  1924. //{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); }
  1925. //inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
  1926. //{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); }
  1927. //inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
  1928. //{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); }
  1929. //inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
  1930. //{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); }
  1931. // tex2Dbias:
  1932. //inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
  1933. //{ return decode_input(tex2Dbias(tex, tex_coords)); }
  1934. //inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
  1935. //{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); }
  1936. // tex2Dfetch:
  1937. //inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
  1938. //{ return decode_input(tex2Dfetch(tex, tex_coords)); }
  1939. //inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
  1940. //{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); }
  1941. // tex2Dlod:
  // Sample tex at tex_coords.xy with an explicit LOD of 0.0 and linearize the
  // result via decode_input. tex_coords.zw are not read.
  inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
  {
      float4 encoded_sample = textureLod(tex, tex_coords.xy, 0.0);
      return decode_input(encoded_sample);
  }
  // Sample tex at tex_coords.xy shifted by texel_off texels, at an explicit
  // LOD of 0.0 (the same LOD the offset-free overload uses), then linearize
  // the result via decode_input. tex_coords.zw are not read.
  // The earlier body passed texel_off as the LOD argument of textureLod, so
  // the "offset" chose a mip level instead of moving the sample position.
  // textureLodOffset needs a constant-expression offset, so the shift is
  // applied to the coordinates using the level-0 size from textureSize.
  // NOTE(review): the scalar offset is applied to both axes -- confirm this
  // matches the Cg tex2Dlod(sampler, coords, texelOff) behavior callers expect.
  inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
  {
      float2 uv_per_texel = float2(1.0) / float2(textureSize(tex, 0));
      float2 shifted_uv = tex_coords.xy + float2(texel_off) * uv_per_texel;
      return decode_input(textureLod(tex, shifted_uv, 0.0));
  }
  1946. /*
  1947. // tex2Dproj:
  1948. inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
  1949. { return decode_input(tex2Dproj(tex, tex_coords)); }
  1950. inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
  1951. { return decode_input(tex2Dproj(tex, tex_coords)); }
  1952. inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
  1953. { return decode_input(tex2Dproj(tex, tex_coords, texel_off)); }
  1954. inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
  1955. { return decode_input(tex2Dproj(tex, tex_coords, texel_off)); }
  1956. */
  1957. /*
  1958. // tex3D:
  1959. inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
  1960. { return decode_input(tex3D(tex, tex_coords)); }
  1961. inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
  1962. { return decode_input(tex3D(tex, tex_coords, texel_off)); }
  1963. inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
  1964. { return decode_input(tex3D(tex, tex_coords, dx, dy)); }
  1965. inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
  1966. { return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); }
  1967. // tex3Dbias:
  1968. inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
  1969. { return decode_input(tex3Dbias(tex, tex_coords)); }
  1970. inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
  1971. { return decode_input(tex3Dbias(tex, tex_coords, texel_off)); }
  1972. // tex3Dfetch:
  1973. inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
  1974. { return decode_input(tex3Dfetch(tex, tex_coords)); }
  1975. inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
  1976. { return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); }
  1977. // tex3Dlod:
  1978. inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
  1979. { return decode_input(tex3Dlod(tex, tex_coords)); }
  1980. inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
  1981. { return decode_input(tex3Dlod(tex, tex_coords, texel_off)); }
  1982. // tex3Dproj:
  1983. inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
  1984. { return decode_input(tex3Dproj(tex, tex_coords)); }
  1985. inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
  1986. { return decode_input(tex3Dproj(tex, tex_coords, texel_off)); }
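  1987. /////////* NOTE(review): this line holds no block-comment terminator, so the comment opened before the tex3D wrappers stays open past the tex2D_linearize_gamma overloads below, until the terminator that follows the tex2Dfetch gamma wrappers. Confirm that leaving those overloads disabled is intended.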
  1988. // NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
  1989. // This narrow selection of nonstandard tex2D* functions can be useful:
  1990. // tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
  1991. //inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
  1992. //{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); }
  1993. //inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
  1994. //{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); }
  1995. // MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
  1996. // Provide a narrower selection of tex2D* wrapper functions that decode an
  1997. // input sample with a specified gamma value. These are useful for reading
  1998. // LUT's and for reading the input of pass0 in a later pass.
  1999. // tex2D:
  2000. inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
  2001. { return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); }
  2002. inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
  2003. { return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); }
  2004. //inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
  2005. //{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); }
  2006. //inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
  2007. //{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); }
  2008. //inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
  2009. //{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); }
  2010. //inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
  2011. //{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); }
  2012. //inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
  2013. //{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); }
  2014. //inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
  2015. //{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); }
  2016. /*
  2017. // tex2Dbias:
  2018. inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
  2019. { return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); }
  2020. inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
  2021. { return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); }
  2022. // tex2Dfetch:
  2023. inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
  2024. { return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); }
  2025. inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
  2026. { return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); }
  2027. */
  2028. // tex2Dlod:
  2029. inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma) // Samples tex at tex_coords.xy with an explicit LOD of 0.0 (tex_coords.zw are not read) and hands the texel to decode_gamma_input with gamma.
  2030. { float4 encoded_texel = textureLod(tex, tex_coords.xy, 0.0); return decode_gamma_input(encoded_texel, gamma); }
  2031. inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) // Like the overload without texel_off, but shifts the lookup by texel_off texels of mip level 0 before sampling at LOD 0.0 and passing the texel to decode_gamma_input with gamma.
  2032. { float2 uv_shift = float2(texel_off) / float2(textureSize(tex, 0)); return decode_gamma_input(textureLod(tex, tex_coords.xy + uv_shift, 0.0), gamma); } // texel_off is a texel offset, not a mip level, so it must not be passed as the LOD; textureLodOffset is not usable here because its offset must be a constant expression. NOTE(review): the scalar offset is applied to both axes - confirm against the Cg tex2Dlod(..., int texelOff) contract.
  2033. #endif // GAMMA_MANAGEMENT_H
  2034. //////////////////////////// END GAMMA-MANAGEMENT //////////////////////////
  2035. //#include "derived-settings-and-constants.h"
  2036. //////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS ////////////////////
  2037. #ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
  2038. #define DERIVED_SETTINGS_AND_CONSTANTS_H
  2039. ///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
  2040. // crt-royale: A full-featured CRT shader, with cheese.
  2041. // Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
  2042. //
  2043. // This program is free software; you can redistribute it and/or modify it
  2044. // under the terms of the GNU General Public License as published by the Free
  2045. // Software Foundation; either version 2 of the License, or any later version.
  2046. //
  2047. // This program is distributed in the hope that it will be useful, but WITHOUT
  2048. // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  2049. // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  2050. // more details.
  2051. //
  2052. // You should have received a copy of the GNU General Public License along with
  2053. // this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  2054. // Place, Suite 330, Boston, MA 02111-1307 USA
  2055. ///////////////////////////////// DESCRIPTION ////////////////////////////////
  2056. // These macros and constants can be used across the whole codebase.
  2057. // Unlike the values in user-settings.h, end users shouldn't modify these.
  2058. /////////////////////////////// BEGIN INCLUDES ///////////////////////////////
  2059. //#include "../user-settings.h"
  2060. ///////////////////////////// BEGIN USER-SETTINGS ////////////////////////////
  2061. #ifndef USER_SETTINGS_H
  2062. #define USER_SETTINGS_H
  2063. ///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
  2064. // The Cg compiler uses different "profiles" with different capabilities.
  2065. // This shader requires a Cg compilation profile >= arbfp1, but a few options
  2066. // require higher profiles like fp30 or fp40. The shader can't detect profile
  2067. // or driver capabilities, so instead you must comment or uncomment the lines
  2068. // below with "//" before "#define." Disable an option if you get compilation
  2069. // errors resembling those listed. Generally speaking, all of these options
  2070. // will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
  2071. // likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
  2072. // Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
  2073. // Among other things, derivatives help us fix anisotropic filtering artifacts
  2074. // with curved manually tiled phosphor mask coords. Related errors:
  2075. // error C3004: function "float2 ddx(float2);" not supported in this profile
  2076. // error C3004: function "float2 ddy(float2);" not supported in this profile
  2077. //#define DRIVERS_ALLOW_DERIVATIVES
  2078. // Fine derivatives: Unsupported on older ATI cards.
  2079. // Fine derivatives enable 2x2 fragment block communication, letting us perform
  2080. // fast single-pass blur operations. If your card uses coarse derivatives and
  2081. // these are enabled, blurs could look broken. Derivatives are a prerequisite.
  2082. #ifdef DRIVERS_ALLOW_DERIVATIVES
  2083. #define DRIVERS_ALLOW_FINE_DERIVATIVES
  2084. #endif
  2085. // Dynamic looping: Requires an fp30 or newer profile.
  2086. // This makes phosphor mask resampling faster in some cases. Related errors:
  2087. // error C5013: profile does not support "for" statements and "for" could not
  2088. // be unrolled
  2089. //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
  2090. // Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
  2091. // Using one static loop avoids overhead if the user is right, but if the user
  2092. // is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
  2093. // binary search can potentially save some iterations. However, it may fail:
  2094. // error C6001: Temporary register limit of 32 exceeded; 35 registers
  2095. // needed to compile program
  2096. //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
  2097. // tex2Dlod: Requires an fp40 or newer profile. This can be used to disable
  2098. // anisotropic filtering, thereby fixing related artifacts. Related errors:
  2099. // error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
  2100. // this profile
  2101. //#define DRIVERS_ALLOW_TEX2DLOD
  2102. // tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate
  2103. // artifacts from anisotropic filtering and mipmapping. Related errors:
  2104. // error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
  2105. // in this profile
  2106. //#define DRIVERS_ALLOW_TEX2DBIAS
  2107. // Integrated graphics compatibility: Integrated graphics like Intel HD 4000
  2108. // impose stricter limitations on register counts and instructions. Enable
  2109. // INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
  2110. // error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
  2111. // to compile program.
  2112. // Enabling integrated graphics compatibility mode will automatically disable:
  2113. // 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
  2114. // (This may be reenabled in a later release.)
  2115. // 2.) RUNTIME_GEOMETRY_MODE
  2116. // 3.) The high-quality 4x4 Gaussian resize for the bloom approximation
  2117. //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
  2118. //////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
  2119. // To disable a #define option, turn its line into a comment with "//."
  2120. // RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
  2121. // Enable runtime shader parameters in the Retroarch (etc.) GUI? They override
  2122. // many of the options in this file and allow real-time tuning, but many of
  2123. // them are slower. Disabling them and using this text file will boost FPS.
  2124. #define RUNTIME_SHADER_PARAMS_ENABLE
  2125. // Specify the phosphor bloom sigma at runtime? This option is 10% slower, but
  2126. // it's the only way to do a wide-enough full bloom with a runtime dot pitch.
  2127. #define RUNTIME_PHOSPHOR_BLOOM_SIGMA
  2128. // Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics)
  2129. #define RUNTIME_ANTIALIAS_WEIGHTS
  2130. // Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
  2131. //#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
  2132. // Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
  2133. // parameters? This will require more math or dynamic branching.
  2134. #define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
  2135. // Specify the tilt at runtime? This makes things about 3% slower.
  2136. #define RUNTIME_GEOMETRY_TILT
  2137. // Specify the geometry mode at runtime?
  2138. #define RUNTIME_GEOMETRY_MODE
  2139. // Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
  2140. // mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
  2141. // dynamic branches? This is cheap if mask_resize_viewport_scale is small.
  2142. #define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  2143. // PHOSPHOR MASK:
  2144. // Manually resize the phosphor mask for best results (slower)? Disabling this
  2145. // removes the option to do so, but it may be faster without dynamic branches.
  2146. #define PHOSPHOR_MASK_MANUALLY_RESIZE
  2147. // If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
  2148. #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
  2149. // Larger blurs are expensive, but we need them to blur larger triads. We can
  2150. // detect the right blur if the triad size is static or our profile allows
  2151. // dynamic branches, but otherwise we use the largest blur the user indicates
  2152. // they might need:
  2153. #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
  2154. //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
  2155. //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
  2156. //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
  2157. // Here's a helpful chart:
  2158. // MaxTriadSize BlurSize MinTriadCountsByResolution
  2159. // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  2160. // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  2161. // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  2162. // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  2163. // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  2164. /////////////////////////////// USER PARAMETERS //////////////////////////////
  2165. // Note: Many of these static parameters are overridden by runtime shader
  2166. // parameters when those are enabled. However, many others are static codepath
  2167. // options that were cleaner or more convenient to code as static constants.
  2168. // GAMMA:
  2169. static const float crt_gamma_static = 2.5; // range [1, 5]
  2170. static const float lcd_gamma_static = 2.2; // range [1, 5]
  2171. // LEVELS MANAGEMENT:
  2172. // Control the final multiplicative image contrast:
  2173. static const float levels_contrast_static = 1.0; // range [0, 4)
  2174. // We auto-dim to avoid clipping between passes and restore brightness
  2175. // later. Control the dim factor here: Lower values clip less but crush
  2176. // blacks more (static only for now).
  2177. static const float levels_autodim_temp = 0.5; // range (0, 1]; currently 0.5, the stated default. NOTE(review): an earlier note here claimed the value had been raised to 1.0 because 0.5 looked too dark, but the code assigns 0.5 - confirm which is intended.
  2178. // HALATION/DIFFUSION/BLOOM:
  2179. // Halation weight: How much energy should be lost to electrons bouncing
  2180. // around under the CRT glass and exciting random phosphors?
  2181. static const float halation_weight_static = 0.0; // range [0, 1]
  2182. // Refractive diffusion weight: How much light should spread/diffuse from
  2183. // refracting through the CRT glass?
  2184. static const float diffusion_weight_static = 0.075; // range [0, 1]
  2185. // Underestimate brightness: Bright areas bloom more, but we can base the
  2186. // bloom brightpass on a lower brightness to sharpen phosphors, or a higher
  2187. // brightness to soften them. Low values clip, but >= 0.8 looks okay.
  2188. static const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
  2189. // Blur all colors more than necessary for a softer phosphor bloom?
  2190. static const float bloom_excess_static = 0.0; // range [0, 1]
  2191. // The BLOOM_APPROX pass approximates a phosphor blur early on with a small
  2192. // blurred resize of the input (convergence offsets are applied as well).
  2193. // There are three filter options (static option only for now):
  2194. // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize
  2195. // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
  2196. // and beam_max_sigma is low.
  2197. // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
  2198. // always uses a static sigma regardless of beam_max_sigma or
  2199. // mask_num_triads_desired.
  2200. // 2.) True 4x4 Gaussian resize: Slowest, technically correct.
  2201. // These options are more pronounced for the fast, unbloomed shader version.
  2202. #ifndef RADEON_FIX
  2203. static const float bloom_approx_filter_static = 2.0;
  2204. #else
  2205. static const float bloom_approx_filter_static = 1.0;
  2206. #endif
  2207. // ELECTRON BEAM SCANLINE DISTRIBUTION:
  2208. // How many scanlines should contribute light to each pixel? Using more
  2209. // scanlines is slower (especially for a generalized Gaussian) but less
  2210. // distorted with larger beam sigmas (especially for a pure Gaussian). The
  2211. // max_beam_sigma at which the closest unused weight is guaranteed <
  2212. // 1.0/255.0 (for a 3x antialiased pure Gaussian) is:
  2213. // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
  2214. // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
  2215. // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
  2216. // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
  2217. // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
  2218. static const float beam_num_scanlines = 3.0; // range [2, 6]
  2219. // A generalized Gaussian beam varies shape with color too, not just width.
  2220. // It's slower but more flexible (static option only for now).
  2221. static const bool beam_generalized_gaussian = true;
  2222. // What kind of scanline antialiasing do you want?
  2223. // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
  2224. // Integrals are slow (especially for generalized Gaussians) and rarely any
  2225. // better than 3x antialiasing (static option only for now).
  2226. static const float beam_antialias_level = 1.0; // range [0, 2]
  2227. // Min/max standard deviations for scanline beams: Higher values widen and
  2228. // soften scanlines. Depending on other options, low min sigmas can alias.
  2229. static const float beam_min_sigma_static = 0.02; // range (0, 1]
  2230. static const float beam_max_sigma_static = 0.3; // range (0, 1]
  2231. // Beam width varies as a function of color: A power function (0) is more
  2232. // configurable, but a spherical function (1) gives the widest beam
  2233. // variability without aliasing (static option only for now).
  2234. static const float beam_spot_shape_function = 0.0;
  2235. // Spot shape power: Powers <= 1 give smoother spot shapes but lower
  2236. // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
  2237. static const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
  2238. // Generalized Gaussian max shape parameters: Higher values give flatter
  2239. // scanline plateaus and steeper dropoffs, simultaneously widening and
  2240. // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and
  2241. // values > ~40.0 cause artifacts with integrals.
  2242. static const float beam_min_shape_static = 2.0; // range [2, 32]
  2243. static const float beam_max_shape_static = 4.0; // range [2, 32]
  2244. // Generalized Gaussian shape power: Affects how quickly the distribution
  2245. // changes shape from Gaussian to steep/plateaued as color increases from 0
  2246. // to 1.0. Higher powers appear softer for most colors, and lower powers
  2247. // appear sharper for most colors.
  2248. static const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
  2249. // What filter should be used to sample scanlines horizontally?
  2250. // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
  2251. static const float beam_horiz_filter_static = 0.0;
  2252. // Standard deviation for horizontal Gaussian resampling:
  2253. static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
  2254. // Do horizontal scanline sampling in linear RGB (correct light mixing),
  2255. // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
  2256. // limiting circuitry in some CRT's), or a weighted avg.?
  2257. static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
  2258. // Simulate scanline misconvergence? This needs 3x horizontal texture
  2259. // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
  2260. // later passes (static option only for now).
  2261. static const bool beam_misconvergence = true;
  2262. // Convergence offsets in x/y directions for R/G/B scanline beams in units
  2263. // of scanlines. Positive offsets go right/down; ranges [-2, 2]
  2264. static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
  2265. static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
  2266. static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
  2267. // Detect interlacing (static option only for now)?
  2268. static const bool interlace_detect = true;
  2269. // Assume 1080-line sources are interlaced?
  2270. static const bool interlace_1080i_static = false;
  2271. // For interlaced sources, assume TFF (top-field first) or BFF order?
  2272. // (Whether this matters depends on the nature of the interlaced input.)
  2273. static const bool interlace_bff_static = false;
  2274. // ANTIALIASING:
  2275. // What AA level do you want for curvature/overscan/subpixels? Options:
  2276. // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
  2277. // (Static option only for now)
  2278. static const float aa_level = 12.0; // range [0, 24]
  2279. // What antialiasing filter do you want (static option only)? Options:
  2280. // 0: Box (separable), 1: Box (cylindrical),
  2281. // 2: Tent (separable), 3: Tent (cylindrical),
  2282. // 4: Gaussian (separable), 5: Gaussian (cylindrical),
  2283. // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
  2284. // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
  2285. // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
  2286. static const float aa_filter = 6.0; // range [0, 9]
  2287. // Flip the sample grid on odd/even frames (static option only for now)?
  2288. static const bool aa_temporal = false;
  2289. // Use RGB subpixel offsets for antialiasing? The pixel is at green, and
  2290. // the blue offset is the negative r offset; range [0, 0.5]
  2291. static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
  2292. // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
  2293. // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
  2294. // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
  2295. // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
  2296. // 4.) C = 0.0 is a soft spline filter.
  2297. static const float aa_cubic_c_static = 0.5; // range [0, 4]
  2298. // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
  2299. static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
  2300. // PHOSPHOR MASK:
  2301. // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
  2302. static const float mask_type_static = 1.0; // range [0, 2]
  2303. // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible.
  2304. // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
  2305. // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
  2306. // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This
  2307. // is halfway decent with LUT mipmapping but atrocious without it.
  2308. // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
  2309. // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch.
  2310. // This mode reuses the same masks, so triads will be enormous unless
  2311. // you change the mask LUT filenames in your .cgp file.
  2312. static const float mask_sample_mode_static = 0.0; // range [0, 2]
  2313. // Prefer setting the triad size (0.0) or number on the screen (1.0)?
  2314. // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
  2315. // will always be used to calculate the full bloom sigma statically.
  2316. static const float mask_specify_num_triads_static = 0.0; // range [0, 1]
  2317. // Specify the phosphor triad size, in pixels. Each tile (usually with 8
  2318. // triads) will be rounded to the nearest integer tile size and clamped to
  2319. // obey minimum size constraints (imposed to reduce downsize taps) and
  2320. // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
  2321. // To increase the size limit, double the viewport-relative scales for the
  2322. // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
  2323. // range [1, mask_texture_small_size/mask_triads_per_tile]
  2324. static const float mask_triad_size_desired_static = 24.0 / 8.0;
  2325. // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
  2326. // final size will be rounded and constrained as above); default 480.0
  2327. static const float mask_num_triads_desired_static = 480.0;
  2328. // How many lobes should the sinc/Lanczos resizer use? More lobes require
  2329. // more samples and avoid moire a bit better, but some is unavoidable
  2330. // depending on the destination size (static option for now).
  2331. static const float mask_sinc_lobes = 3.0; // range [2, 4]
  2332. // The mask is resized using a variable number of taps in each dimension,
  2333. // but some Cg profiles always fetch a constant number of taps no matter
  2334. // what (no dynamic branching). We can limit the maximum number of taps if
  2335. // we statically limit the minimum phosphor triad size. Larger values are
  2336. // faster, but the limit IS enforced (static option only, forever);
  2337. // range [1, mask_texture_small_size/mask_triads_per_tile]
  2338. // TODO: Make this 1.0 and compensate with smarter sampling!
  2339. static const float mask_min_allowed_triad_size = 2.0;
  2340. // GEOMETRY:
  2341. // Geometry mode:
  2342. // 0: Off (default), 1: Spherical mapping (like cgwg's),
  2343. // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
  2344. static const float geom_mode_static = 0.0; // range [0, 3]
  2345. // Radius of curvature: Measured in units of your viewport's diagonal size.
  2346. static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
  2347. // View dist is the distance from the player to their physical screen, in
  2348. // units of the viewport's diagonal size. It controls the field of view.
  2349. static const float geom_view_dist_static = 2.0; // range [0.5, 1024]
  2350. // Tilt angle in radians (clockwise around up and right vectors):
  2351. static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi]
  2352. // Aspect ratio: When the true viewport size is unknown, this value is used
  2353. // to help convert between the phosphor triad size and count, along with
  2354. // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set
  2355. // this equal to Retroarch's display aspect ratio (DAR) for best results;
  2356. // range [1, geom_max_aspect_ratio from user-cgp-constants.h];
  2357. // default (256/224)*(54/47) = 1.313069909 (see below)
  2358. static const float geom_aspect_ratio_static = 1.313069909;
  2359. // Before getting into overscan, here's some general aspect ratio info:
  2360. // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
  2361. // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
  2362. // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping
  2363. // Geometry processing has to "undo" the screen-space 2D DAR to calculate
  2364. // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in
  2365. // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either:
  2366. // a.) Enable Retroarch's "Crop Overscan"
  2367. // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
  2368. // Real consoles use horizontal black padding in the signal, but emulators
  2369. // often crop this without cropping the vertical padding; a 256x224 [S]NES
  2370. // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
  2371. // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
  2372. // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
  2373. // http://forums.nesdev.com/viewtopic.php?p=24815#p24815
  2374. // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
  2375. // without doing a. or b., but horizontal image borders will be tighter
  2376. // than vertical ones, messing up curvature and overscan. Fixing the
  2377. // padding first corrects this.
  2378. // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly
  2379. // or adjust x/y independently to e.g. readd horizontal padding, as noted
  2380. // above: Values < 1.0 zoom out; range (0, inf)
  2381. static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
  2382. // Compute a proper pixel-space to texture-space matrix even without ddx()/
  2383. // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering
  2384. // with strong curvature (static option only for now).
  2385. static const bool geom_force_correct_tangent_matrix = true;
  2386. // BORDERS:
  2387. // Rounded border size in texture uv coords:
  2388. static const float border_size_static = 0.015; // range [0, 0.5]
  2389. // Border darkness: Moderate values darken the border smoothly, and high
  2390. // values make the image very dark just inside the border:
  2391. static const float border_darkness_static = 2.0; // range [0, inf)
  2392. // Border compression: High numbers compress border transitions, narrowing
  2393. // the dark border area.
  2394. static const float border_compress_static = 2.5; // range [1, inf)
  2395. #endif // USER_SETTINGS_H
  2396. ///////////////////////////// END USER-SETTINGS ////////////////////////////
  2397. //#include "user-cgp-constants.h"
  2398. ///////////////////////// BEGIN USER-CGP-CONSTANTS /////////////////////////
  2399. #ifndef USER_CGP_CONSTANTS_H
  2400. #define USER_CGP_CONSTANTS_H
  2401. // IMPORTANT:
  2402. // These constants MUST be set appropriately for the settings in crt-royale.cgp
  2403. // (or whatever related .cgp file you're using). If they aren't, you're likely
  2404. // to get artifacts, the wrong phosphor mask size, etc. I wish these could be
  2405. // set directly in the .cgp file to make things easier, but...they can't.
  2406. // PASS SCALES AND RELATED CONSTANTS:
  2407. // Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of
  2408. // this shader: One does a viewport-scale bloom, and the other skips it. The
  2409. // latter benefits from a higher bloom_approx_scale_x, so save both separately:
  2410. static const float bloom_approx_size_x = 320.0;
  2411. static const float bloom_approx_size_x_for_fake = 400.0;
  2412. // Copy the viewport-relative scales of the phosphor mask resize passes
  2413. // (MASK_RESIZE and the pass immediately preceding it):
  2414. static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
  2415. // Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
  2416. static const float geom_max_aspect_ratio = 4.0/3.0;
  2417. // PHOSPHOR MASK TEXTURE CONSTANTS:
  2418. // Set the following constants to reflect the properties of the phosphor mask
  2419. // texture named in crt-royale.cgp. The shader optionally resizes a mask tile
  2420. // based on user settings, then repeats a single tile until filling the screen.
  2421. // The shader must know the input texture size (default 64x64), and to manually
  2422. // resize, it must also know the horizontal triads per tile (default 8).
  2423. static const float2 mask_texture_small_size = float2(64.0, 64.0);
  2424. static const float2 mask_texture_large_size = float2(512.0, 512.0);
  2425. static const float mask_triads_per_tile = 8.0;
  2426. // We need the average brightness of the phosphor mask to compensate for the
  2427. // dimming it causes. The following four values are roughly correct for the
  2428. // masks included with the shader. Update the value for any LUT texture you
  2429. // change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
  2430. // the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
  2431. //#define PHOSPHOR_MASK_GRILLE14
  2432. static const float mask_grille14_avg_color = 50.6666666/255.0;
  2433. // TileableLinearApertureGrille14Wide7d33Spacing*.png
  2434. // TileableLinearApertureGrille14Wide10And6Spacing*.png
  2435. static const float mask_grille15_avg_color = 53.0/255.0;
  2436. // TileableLinearApertureGrille15Wide6d33Spacing*.png
  2437. // TileableLinearApertureGrille15Wide8And5d5Spacing*.png
  2438. static const float mask_slot_avg_color = 46.0/255.0;
  2439. // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
  2440. // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
  2441. static const float mask_shadow_avg_color = 41.0/255.0;
  2442. // TileableLinearShadowMask*.png
  2443. // TileableLinearShadowMaskEDP*.png
  2444. #ifdef PHOSPHOR_MASK_GRILLE14
  2445. static const float mask_grille_avg_color = mask_grille14_avg_color;
  2446. #else
  2447. static const float mask_grille_avg_color = mask_grille15_avg_color;
  2448. #endif
  2449. #endif // USER_CGP_CONSTANTS_H
  2450. ////////////////////////// END USER-CGP-CONSTANTS //////////////////////////
  2451. //////////////////////////////// END INCLUDES ////////////////////////////////
  2452. /////////////////////////////// FIXED SETTINGS ///////////////////////////////
  2453. // Avoid dividing by zero; using a macro overloads for float, float2, etc.:
  2454. #define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16
  2455. // Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
  2456. #ifndef SIMULATE_CRT_ON_LCD
  2457. #define SIMULATE_CRT_ON_LCD
  2458. #endif
  2459. // Manually tiling a manually resized texture creates texture coord derivative
  2460. // discontinuities and confuses anisotropic filtering, causing discolored tile
  2461. // seams in the phosphor mask. Workarounds:
  2462. // a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's
  2463. // downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
  2464. // disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
  2465. // b.) "Tile flat twice" requires drawing two full tiles without border padding
  2466. // to the resized mask FBO, and it's incompatible with same-pass curvature.
  2467. // (Same-pass curvature isn't used but could be in the future...maybe.)
  2468. // c.) "Fix discontinuities" requires derivatives and drawing one tile with
  2469. // border padding to the resized mask FBO, but it works with same-pass
  2470. // curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
  2471. // Precedence: a, then b, then c (if multiple strategies are #defined).
  2472. #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen
  2473. #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen
  2474. #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen
  2475. // Also, manually resampling the phosphor mask is slightly blurrier with
  2476. // anisotropic filtering. (Resampling with mipmapping is even worse: It
  2477. // creates artifacts, but only with the fully bloomed shader.) The difference
  2478. // is subtle with small triads, but you can fix it for a small cost.
  2479. //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  2480. ////////////////////////////// DERIVED SETTINGS //////////////////////////////
  2481. // Intel HD 4000 GPUs can't handle manual mask resizing (for now), setting the
  2482. // geometry mode at runtime, or a 4x4 true Gaussian resize. Disable
  2483. // incompatible settings ASAP. (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
  2484. // #defined by either user-settings.h or a wrapper .cg that #includes the
  2485. // current .cg pass.)
  2486. #ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
  2487. #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
  2488. #undef PHOSPHOR_MASK_MANUALLY_RESIZE
  2489. #endif // PHOSPHOR_MASK_MANUALLY_RESIZE
  2490. #ifdef RUNTIME_GEOMETRY_MODE
  2491. #undef RUNTIME_GEOMETRY_MODE
  2492. #endif // RUNTIME_GEOMETRY_MODE
  2493. // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
  2494. // inferior in most cases, so replace 2.0 (any value > 1.5) with 0.0:
  2495. static const float bloom_approx_filter =
  2496. bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
  2497. #else // !INTEGRATED_GRAPHICS_COMPATIBILITY_MODE: use the static setting unchanged
  2498. static const float bloom_approx_filter = bloom_approx_filter_static;
  2499. #endif // INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
  2500. // Disable slow runtime paths if static parameters are used. Most of these
  2501. // won't be a problem anyway once the params are disabled, but some will.
  2502. #ifndef RUNTIME_SHADER_PARAMS_ENABLE
  2503. #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
  2504. #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
  2505. #endif // RUNTIME_PHOSPHOR_BLOOM_SIGMA
  2506. #ifdef RUNTIME_ANTIALIAS_WEIGHTS
  2507. #undef RUNTIME_ANTIALIAS_WEIGHTS
  2508. #endif // RUNTIME_ANTIALIAS_WEIGHTS
  2509. #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
  2510. #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
  2511. #endif // RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
  2512. #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
  2513. #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
  2514. #endif // RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
  2515. #ifdef RUNTIME_GEOMETRY_TILT
  2516. #undef RUNTIME_GEOMETRY_TILT
  2517. #endif // RUNTIME_GEOMETRY_TILT
  2518. #ifdef RUNTIME_GEOMETRY_MODE
  2519. #undef RUNTIME_GEOMETRY_MODE
  2520. #endif // RUNTIME_GEOMETRY_MODE
  2521. #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  2522. #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  2523. #endif // FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  2524. #endif // !RUNTIME_SHADER_PARAMS_ENABLE
  2525. // Make tex2Dbias a backup for tex2Dlod for wider compatibility.
  2526. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  2527. #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  2528. #endif // ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  2529. #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  2530. #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  2531. #endif // ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  2532. // Rule out unavailable anisotropic compatibility strategies:
  2533. #ifndef DRIVERS_ALLOW_DERIVATIVES
  2534. #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  2535. #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  2536. #endif // ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  2537. #endif // !DRIVERS_ALLOW_DERIVATIVES
  2538. #ifndef DRIVERS_ALLOW_TEX2DLOD
  2539. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  2540. #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  2541. #endif // ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  2542. #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  2543. #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  2544. #endif // ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  2545. #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
  2546. #undef ANTIALIAS_DISABLE_ANISOTROPIC
  2547. #endif // ANTIALIAS_DISABLE_ANISOTROPIC
  2548. #endif // !DRIVERS_ALLOW_TEX2DLOD
  2549. #ifndef DRIVERS_ALLOW_TEX2DBIAS
  2550. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  2551. #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  2552. #endif // ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  2553. #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  2554. #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  2555. #endif // ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  2556. #endif // !DRIVERS_ALLOW_TEX2DBIAS
  2557. // Prioritize anisotropic tiling compatibility strategies by performance and
  2558. // disable unused strategies. This concentrates all the nesting in one place.
  2559. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD // tex2Dlod wins: drop every other strategy
  2560. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  2561. #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  2562. #endif // ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  2563. #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  2564. #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  2565. #endif // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  2566. #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  2567. #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  2568. #endif // ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  2569. #else // !ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  2570. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS // tex2Dbias is next: drop the two tiling strategies
  2571. #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  2572. #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  2573. #endif // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  2574. #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  2575. #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  2576. #endif // ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  2577. #else // !ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  2578. // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
  2579. // flat texture coords in the same pass, but that's all we use.
  2580. #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  2581. #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  2582. #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  2583. #endif // ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  2584. #endif // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  2585. #endif // ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  2586. #endif // ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  2587. // The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
  2588. // reduce some #ifdef nesting in the next section by essentially OR'ing them:
  2589. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  2590. #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
  2591. #endif // ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  2592. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  2593. #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
  2594. #endif // ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  2595. // Prioritize anisotropic resampling compatibility strategies the same way:
  2596. #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  2597. #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  2598. #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  2599. #endif // ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  2600. #endif // ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  2601. /////////////////////// DERIVED PHOSPHOR MASK CONSTANTS //////////////////////
  2602. // If we can use the large mipmapped LUT without mipmapping artifacts, we
  2603. // should: It gives us more options for using fewer samples.
  2604. #ifdef DRIVERS_ALLOW_TEX2DLOD
  2605. #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  2606. // TODO: Take advantage of this!
  2607. #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
  2608. static const float2 mask_resize_src_lut_size = mask_texture_large_size;
  2609. #else // !ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  2610. static const float2 mask_resize_src_lut_size = mask_texture_small_size;
  2611. #endif // ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  2612. #else // !DRIVERS_ALLOW_TEX2DLOD
  2613. static const float2 mask_resize_src_lut_size = mask_texture_small_size;
  2614. #endif // DRIVERS_ALLOW_TEX2DLOD
  2615. // tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
  2616. // main_fragment, or a static alias of one of the above. This makes it hard
  2617. // to select the phosphor mask at runtime: We can't even assign to a uniform
  2618. // global in the vertex shader or select a sampler2D in the vertex shader and
  2619. // pass it to the fragment shader (even with explicit TEXUNIT# bindings),
  2620. // because it just gives us the input texture or a black screen. However, we
  2621. // can get around these limitations by calling tex2D three times with different
  2622. // uniform samplers (or resizing the phosphor mask three times altogether).
  2623. // With dynamic branches, we can process only one of these branches on top of
  2624. // quickly discarding fragments we don't need (cgc seems able to overcome
  2625. // limitations around dependent texture fetches inside of branches). Without
  2626. // dynamic branches, we have to process every branch for every fragment...which
  2627. // is slower. Runtime sampling mode selection is slower without dynamic
  2628. // branches as well. Let the user's static #defines decide if it's worth it.
  2629. #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
  2630. #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  2631. #else // !DRIVERS_ALLOW_DYNAMIC_BRANCHES: only enable it if the user forces it
  2632. #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  2633. #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  2634. #endif // FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  2635. #endif // DRIVERS_ALLOW_DYNAMIC_BRANCHES
  2636. // We need to render some minimum number of tiles in the resize passes.
  2637. // We need at least 1.0 just to repeat a single tile, and we need extra
  2638. // padding beyond that for anisotropic filtering, discontinuity fixing,
  2639. // antialiasing, same-pass curvature (not currently used), etc. First
  2640. // determine how many border texels and tiles we need, based on how the result
  2641. // will be sampled:
  2642. #ifdef GEOMETRY_EARLY
  2643. static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; // NOTE(review): the signed .x is used as-is, so a negative offset shrinks the border below - confirm whether abs() was intended
  2644. // Most antialiasing filters have a base radius of 4.0 pixels:
  2645. static const float max_aa_base_pixel_border = 4.0 +
  2646. max_subpixel_offset;
  2647. #else // !GEOMETRY_EARLY
  2648. static const float max_aa_base_pixel_border = 0.0;
  2649. #endif // GEOMETRY_EARLY
  2650. // Anisotropic filtering adds about 0.5 to the pixel border:
  2651. #ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
  2652. static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
  2653. #else // ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY: no extra border is added
  2654. static const float max_aniso_pixel_border = max_aa_base_pixel_border;
  2655. #endif // !ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
  2656. // Fixing discontinuities adds 1.0 more to the pixel border:
  2657. #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  2658. static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
  2659. #else // !ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  2660. static const float max_tiled_pixel_border = max_aniso_pixel_border;
  2661. #endif // ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  2662. // Convert the pixel border to an integer texel border. Assume same-pass
  2663. // curvature about triples the texel frequency:
  2664. #ifdef GEOMETRY_EARLY
  2665. static const float max_mask_texel_border =
  2666. ceil(max_tiled_pixel_border * 3.0);
  2667. #else // !GEOMETRY_EARLY
  2668. static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
  2669. #endif // GEOMETRY_EARLY
  2670. // Convert the texel border to a tile border using worst-case assumptions:
  2671. static const float max_mask_tile_border = max_mask_texel_border/
  2672. (mask_min_allowed_triad_size * mask_triads_per_tile);
  2673. // Finally, set the number of resized tiles to render to MASK_RESIZE, and set
  2674. // the starting texel (inside borders) for sampling it.
  2675. #ifndef GEOMETRY_EARLY
  2676. #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  2677. // Special case: Render two tiles without borders. Anisotropic
  2678. // filtering doesn't seem to be a problem here.
  2679. static const float mask_resize_num_tiles = 1.0 + 1.0;
  2680. static const float mask_start_texels = 0.0;
  2681. #else // !ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE: one tile plus a border on each side
  2682. static const float mask_resize_num_tiles = 1.0 +
  2683. 2.0 * max_mask_tile_border;
  2684. static const float mask_start_texels = max_mask_texel_border;
  2685. #endif // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  2686. #else // GEOMETRY_EARLY: one tile plus a border on each side
  2687. static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
  2688. static const float mask_start_texels = max_mask_texel_border;
  2689. #endif // !GEOMETRY_EARLY
  2690. // We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
  2691. // mask_resize_viewport_scale. This limits the maximum final triad size.
  2692. // Estimate the minimum number of triads we can split the screen into in each
  2693. // dimension (we'll be as correct as mask_resize_viewport_scale is):
  2694. static const float mask_resize_num_triads =
  2695. mask_resize_num_tiles * mask_triads_per_tile;
  2696. static const float2 min_allowed_viewport_triads =
  2697. float2(mask_resize_num_triads) / mask_resize_viewport_scale; // float2: one minimum per dimension
  2698. //////////////////////// COMMON MATHEMATICAL CONSTANTS ///////////////////////
  2699. static const float pi = 3.141592653589;
  2700. // We often want to find the location of the previous texel, e.g.:
  2701. // const float2 curr_texel = uv * texture_size;
  2702. // const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
  2703. // const float2 prev_texel_uv = prev_texel / texture_size;
  2704. // However, many GPU drivers round incorrectly around exact texel locations.
  2705. // We need to subtract a little less than 0.5 before flooring, and some GPUs
  2706. // require this value to be farther from 0.5 than others; define it here.
  2707. // const float2 prev_texel =
  2708. // floor(curr_texel - float2(under_half)) + float2(0.5);
  2709. static const float under_half = 0.4995;
  2710. #endif // DERIVED_SETTINGS_AND_CONSTANTS_H
  2711. ///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS ////////////////////////////
  2712. //#include "scanline-functions.h"
  2713. ///////////////////////////// BEGIN SCANLINE-FUNCTIONS ////////////////////////////
  2714. #ifndef SCANLINE_FUNCTIONS_H
  2715. #define SCANLINE_FUNCTIONS_H
  2716. ///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
  2717. // crt-royale: A full-featured CRT shader, with cheese.
  2718. // Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
  2719. //
  2720. // This program is free software; you can redistribute it and/or modify it
  2721. // under the terms of the GNU General Public License as published by the Free
  2722. // Software Foundation; either version 2 of the License, or any later version.
  2723. //
  2724. // This program is distributed in the hope that it will be useful, but WITHOUT
  2725. // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  2726. // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  2727. // more details.
  2728. //
  2729. // You should have received a copy of the GNU General Public License along with
  2730. // this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  2731. // Place, Suite 330, Boston, MA 02111-1307 USA
  2732. /////////////////////////////// BEGIN INCLUDES ///////////////////////////////
  2733. //#include "../user-settings.h"
  2734. ///////////////////////////// BEGIN USER-SETTINGS ////////////////////////////
  2735. #ifndef USER_SETTINGS_H
  2736. #define USER_SETTINGS_H
  2737. ///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
  2738. // The Cg compiler uses different "profiles" with different capabilities.
  2739. // This shader requires a Cg compilation profile >= arbfp1, but a few options
  2740. // require higher profiles like fp30 or fp40. The shader can't detect profile
  2741. // or driver capabilities, so instead you must comment or uncomment the lines
  2742. // below with "//" before "#define." Disable an option if you get compilation
  2743. // errors resembling those listed. Generally speaking, all of these options
  2744. // will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
  2745. // likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
  2746. // Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
  2747. // Among other things, derivatives help us fix anisotropic filtering artifacts
  2748. // with curved manually tiled phosphor mask coords. Related errors:
  2749. // error C3004: function "float2 ddx(float2);" not supported in this profile
  2750. // error C3004: function "float2 ddy(float2);" not supported in this profile
  2751. //#define DRIVERS_ALLOW_DERIVATIVES
  2752. // Fine derivatives: Unsupported on older ATI cards.
  2753. // Fine derivatives enable 2x2 fragment block communication, letting us perform
  2754. // fast single-pass blur operations. If your card uses coarse derivatives and
  2755. // these are enabled, blurs could look broken. Derivatives are a prerequisite.
  2756. #ifdef DRIVERS_ALLOW_DERIVATIVES
  2757. #define DRIVERS_ALLOW_FINE_DERIVATIVES
  2758. #endif // DRIVERS_ALLOW_DERIVATIVES
  2759. // Dynamic looping: Requires an fp30 or newer profile.
  2760. // This makes phosphor mask resampling faster in some cases. Related errors:
  2761. // error C5013: profile does not support "for" statements and "for" could not
  2762. // be unrolled
  2763. //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
  2764. // Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
  2765. // Using one static loop avoids overhead if the user is right, but if the user
  2766. // is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
  2767. // binary search can potentially save some iterations. However, it may fail:
  2768. // error C6001: Temporary register limit of 32 exceeded; 35 registers
  2769. // needed to compile program
  2770. //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
  2771. // tex2Dlod: Requires an fp40 or newer profile. This can be used to disable
  2772. // anisotropic filtering, thereby fixing related artifacts. Related errors:
  2773. // error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
  2774. // this profile
  2775. //#define DRIVERS_ALLOW_TEX2DLOD
  2776. // tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate
  2777. // artifacts from anisotropic filtering and mipmapping. Related errors:
  2778. // error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
  2779. // in this profile
  2780. //#define DRIVERS_ALLOW_TEX2DBIAS
  2781. // Integrated graphics compatibility: Integrated graphics like Intel HD 4000
  2782. // impose stricter limitations on register counts and instructions. Enable
  2783. // INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
  2784. // error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
  2785. // to compile program.
  2786. // Enabling integrated graphics compatibility mode will automatically disable:
  2787. // 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
  2788. // (This may be reenabled in a later release.)
  2789. // 2.) RUNTIME_GEOMETRY_MODE
  2790. // 3.) The high-quality 4x4 Gaussian resize for the bloom approximation
  2791. //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
  2792. //////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
  2793. // To disable a #define option, turn its line into a comment with "//."
  2794. // RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
  2795. // Enable runtime shader parameters in the Retroarch (etc.) GUI? They override
  2796. // many of the options in this file and allow real-time tuning, but many of
  2797. // them are slower. Disabling them and using this text file will boost FPS.
  2798. #define RUNTIME_SHADER_PARAMS_ENABLE
  2799. // Specify the phosphor bloom sigma at runtime? This option is 10% slower, but
  2800. // it's the only way to do a wide-enough full bloom with a runtime dot pitch.
  2801. #define RUNTIME_PHOSPHOR_BLOOM_SIGMA
  2802. // Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics)
  2803. #define RUNTIME_ANTIALIAS_WEIGHTS
  2804. // Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
  2805. //#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
  2806. // Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
  2807. // parameters? This will require more math or dynamic branching.
  2808. #define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
  2809. // Specify the tilt at runtime? This makes things about 3% slower.
  2810. #define RUNTIME_GEOMETRY_TILT
  2811. // Specify the geometry mode at runtime?
  2812. #define RUNTIME_GEOMETRY_MODE
  2813. // Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
  2814. // mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
  2815. // dynamic branches? This is cheap if mask_resize_viewport_scale is small.
  2816. #define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  2817. // PHOSPHOR MASK:
  2818. // Manually resize the phosphor mask for best results (slower)? Disabling this
  2819. // removes the option to do so, but it may be faster without dynamic branches.
  2820. #define PHOSPHOR_MASK_MANUALLY_RESIZE
  2821. // If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
  2822. #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
  2823. // Larger blurs are expensive, but we need them to blur larger triads. We can
  2824. // detect the right blur if the triad size is static or our profile allows
  2825. // dynamic branches, but otherwise we use the largest blur the user indicates
  2826. // they might need:
  2827. #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
  2828. //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
  2829. //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
  2830. //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
  2831. // Here's a helpful chart:
  2832. // MaxTriadSize BlurSize MinTriadCountsByResolution
  2833. // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  2834. // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  2835. // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  2836. // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  2837. // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  2838. /////////////////////////////// USER PARAMETERS //////////////////////////////
  2839. // Note: Many of these static parameters are overridden by runtime shader
  2840. // parameters when those are enabled. However, many others are static codepath
  2841. // options that were cleaner or more convenient to code as static constants.
  2842. // GAMMA:
  2843. static const float crt_gamma_static = 2.5; // range [1, 5]
  2844. static const float lcd_gamma_static = 2.2; // range [1, 5]
  2845. // LEVELS MANAGEMENT:
  2846. // Control the final multiplicative image contrast:
  2847. static const float levels_contrast_static = 1.0; // range [0, 4)
  2848. // We auto-dim to avoid clipping between passes and restore brightness
  2849. // later. Control the dim factor here: Lower values clip less but crush
  2850. // blacks more (static only for now).
  2851. static const float levels_autodim_temp = 0.5; // range (0, 1]; default 0.5. NOTE(review): an earlier note here said the value had been raised to 1.0 because 0.5 looked too dark, but the code uses 0.5 - confirm the intended value
  2852. // HALATION/DIFFUSION/BLOOM:
  2853. // Halation weight: How much energy should be lost to electrons bouncing
  2854. // around under the CRT glass and exciting random phosphors?
  2855. static const float halation_weight_static = 0.0; // range [0, 1]
  2856. // Refractive diffusion weight: How much light should spread/diffuse from
  2857. // refracting through the CRT glass?
  2858. static const float diffusion_weight_static = 0.075; // range [0, 1]
  2859. // Underestimate brightness: Bright areas bloom more, but we can base the
  2860. // bloom brightpass on a lower brightness to sharpen phosphors, or a higher
  2861. // brightness to soften them. Low values clip, but >= 0.8 looks okay.
  2862. static const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
  2863. // Blur all colors more than necessary for a softer phosphor bloom?
  2864. static const float bloom_excess_static = 0.0; // range [0, 1]
  2865. // The BLOOM_APPROX pass approximates a phosphor blur early on with a small
  2866. // blurred resize of the input (convergence offsets are applied as well).
  2867. // There are three filter options (static option only for now):
  2868. // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize
  2869. // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
  2870. // and beam_max_sigma is low.
  2871. // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
  2872. // always uses a static sigma regardless of beam_max_sigma or
  2873. // mask_num_triads_desired.
  2874. // 2.) True 4x4 Gaussian resize: Slowest, technically correct.
  2875. // These options are more pronounced for the fast, unbloomed shader version.
  2876. #ifndef RADEON_FIX
  2877. static const float bloom_approx_filter_static = 2.0; // option 2: true 4x4 Gaussian resize
  2878. #else // RADEON_FIX
  2879. static const float bloom_approx_filter_static = 1.0; // option 1: 3x3 resize blur
  2880. #endif // !RADEON_FIX
  2881. // ELECTRON BEAM SCANLINE DISTRIBUTION:
  2882. // How many scanlines should contribute light to each pixel? Using more
  2883. // scanlines is slower (especially for a generalized Gaussian) but less
  2884. // distorted with larger beam sigmas (especially for a pure Gaussian). The
  2885. // max_beam_sigma at which the closest unused weight is guaranteed <
  2886. // 1.0/255.0 (for a 3x antialiased pure Gaussian) is:
  2887. // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
  2888. // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
  2889. // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
  2890. // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
  2891. // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
  2892. static const float beam_num_scanlines = 3.0; // range [2, 6]
  2893. // A generalized Gaussian beam varies shape with color too, not just width.
  2894. // It's slower but more flexible (static option only for now).
  2895. static const bool beam_generalized_gaussian = true;
  2896. // What kind of scanline antialiasing do you want?
  2897. // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
  2898. // Integrals are slow (especially for generalized Gaussians) and rarely any
  2899. // better than 3x antialiasing (static option only for now).
  2900. static const float beam_antialias_level = 1.0; // range [0, 2]
  2901. // Min/max standard deviations for scanline beams: Higher values widen and
  2902. // soften scanlines. Depending on other options, low min sigmas can alias.
  2903. static const float beam_min_sigma_static = 0.02; // range (0, 1]
  2904. static const float beam_max_sigma_static = 0.3; // range (0, 1]
  2905. // Beam width varies as a function of color: A power function (0) is more
  2906. // configurable, but a spherical function (1) gives the widest beam
  2907. // variability without aliasing (static option only for now).
  2908. static const float beam_spot_shape_function = 0.0;
  2909. // Spot shape power: Powers <= 1 give smoother spot shapes but lower
  2910. // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
  2911. static const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
  2912. // Generalized Gaussian max shape parameters: Higher values give flatter
  2913. // scanline plateaus and steeper dropoffs, simultaneously widening and
  2914. // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and
  2915. // values > ~40.0 cause artifacts with integrals.
  2916. static const float beam_min_shape_static = 2.0; // range [2, 32]
  2917. static const float beam_max_shape_static = 4.0; // range [2, 32]
  2918. // Generalized Gaussian shape power: Affects how quickly the distribution
  2919. // changes shape from Gaussian to steep/plateaued as color increases from 0
  2920. // to 1.0. Higher powers appear softer for most colors, and lower powers
  2921. // appear sharper for most colors.
  2922. static const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
  2923. // What filter should be used to sample scanlines horizontally?
  2924. // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
  2925. static const float beam_horiz_filter_static = 0.0;
  2926. // Standard deviation for horizontal Gaussian resampling:
  2927. static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
  2928. // Do horizontal scanline sampling in linear RGB (correct light mixing),
  2929. // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
  2930. // limiting circuitry in some CRT's), or a weighted avg.?
  2931. static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
  2932. // Simulate scanline misconvergence? This needs 3x horizontal texture
  2933. // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
  2934. // later passes (static option only for now).
  2935. static const bool beam_misconvergence = true;
  2936. // Convergence offsets in x/y directions for R/G/B scanline beams in units
  2937. // of scanlines. Positive offsets go right/down; ranges [-2, 2]
  2938. static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
  2939. static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
  2940. static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
  2941. // Detect interlacing (static option only for now)?
  2942. static const bool interlace_detect = true;
  2943. // Assume 1080-line sources are interlaced?
  2944. static const bool interlace_1080i_static = false;
  2945. // For interlaced sources, assume TFF (top-field first) or BFF order?
  2946. // (Whether this matters depends on the nature of the interlaced input.)
  2947. static const bool interlace_bff_static = false;
  2948. // ANTIALIASING:
  2949. // What AA level do you want for curvature/overscan/subpixels? Options:
  2950. // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
  2951. // (Static option only for now)
  2952. static const float aa_level = 12.0; // range [0, 24]
  2953. // What antialiasing filter do you want (static option only)? Options:
  2954. // 0: Box (separable), 1: Box (cylindrical),
  2955. // 2: Tent (separable), 3: Tent (cylindrical),
  2956. // 4: Gaussian (separable), 5: Gaussian (cylindrical),
  2957. // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
  2958. // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
  2959. // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
  2960. static const float aa_filter = 6.0; // range [0, 9]
  2961. // Flip the sample grid on odd/even frames (static option only for now)?
  2962. static const bool aa_temporal = false;
  2963. // Use RGB subpixel offsets for antialiasing? The pixel is at green, and
  2964. // the blue offset is the negative r offset; range [0, 0.5]
  2965. static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
  2966. // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
  2967. // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
  2968. // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
  2969. // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
  2970. // 4.) C = 0.0 is a soft spline filter.
  2971. static const float aa_cubic_c_static = 0.5; // range [0, 4]
  2972. // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
  2973. static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
  2974. // PHOSPHOR MASK:
  2975. // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
  2976. static const float mask_type_static = 1.0; // range [0, 2]
  2977. // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible.
  2978. // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
  2979. // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
  2980. // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This
  2981. // is halfway decent with LUT mipmapping but atrocious without it.
  2982. // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
  2983. // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch.
  2984. // This mode reuses the same masks, so triads will be enormous unless
  2985. // you change the mask LUT filenames in your .cgp file.
  2986. static const float mask_sample_mode_static = 0.0; // range [0, 2]
  2987. // Prefer setting the triad size (0.0) or number on the screen (1.0)?
  2988. // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
  2989. // will always be used to calculate the full bloom sigma statically.
  2990. static const float mask_specify_num_triads_static = 0.0; // range [0, 1]
  2991. // Specify the phosphor triad size, in pixels. Each tile (usually with 8
  2992. // triads) will be rounded to the nearest integer tile size and clamped to
  2993. // obey minimum size constraints (imposed to reduce downsize taps) and
  2994. // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
  2995. // To increase the size limit, double the viewport-relative scales for the
  2996. // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
  2997. // range [1, mask_texture_small_size/mask_triads_per_tile]
  2998. static const float mask_triad_size_desired_static = 24.0 / 8.0;
  2999. // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
  3000. // final size will be rounded and constrained as above); default 480.0
  3001. static const float mask_num_triads_desired_static = 480.0;
  3002. // How many lobes should the sinc/Lanczos resizer use? More lobes require
  3003. // more samples and avoid moire a bit better, but some is unavoidable
  3004. // depending on the destination size (static option for now).
  3005. static const float mask_sinc_lobes = 3.0; // range [2, 4]
  3006. // The mask is resized using a variable number of taps in each dimension,
  3007. // but some Cg profiles always fetch a constant number of taps no matter
  3008. // what (no dynamic branching). We can limit the maximum number of taps if
  3009. // we statically limit the minimum phosphor triad size. Larger values are
  3010. // faster, but the limit IS enforced (static option only, forever);
  3011. // range [1, mask_texture_small_size/mask_triads_per_tile]
  3012. // TODO: Make this 1.0 and compensate with smarter sampling!
  3013. static const float mask_min_allowed_triad_size = 2.0;
  3014. // GEOMETRY:
  3015. // Geometry mode:
  3016. // 0: Off (default), 1: Spherical mapping (like cgwg's),
  3017. // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
  3018. static const float geom_mode_static = 0.0; // range [0, 3]
  3019. // Radius of curvature: Measured in units of your viewport's diagonal size.
  3020. static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
  3021. // View dist is the distance from the player to their physical screen, in
  3022. // units of the viewport's diagonal size. It controls the field of view.
  3023. static const float geom_view_dist_static = 2.0; // range [0.5, 1024]
  3024. // Tilt angle in radians (clockwise around up and right vectors):
  3025. static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi]
  3026. // Aspect ratio: When the true viewport size is unknown, this value is used
  3027. // to help convert between the phosphor triad size and count, along with
  3028. // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set
  3029. // this equal to Retroarch's display aspect ratio (DAR) for best results;
  3030. // range [1, geom_max_aspect_ratio from user-cgp-constants.h];
  3031. // default (256/224)*(54/47) = 1.313069909 (see below)
  3032. static const float geom_aspect_ratio_static = 1.313069909;
  3033. // Before getting into overscan, here's some general aspect ratio info:
  3034. // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
  3035. // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
  3036. // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping
  3037. // Geometry processing has to "undo" the screen-space 2D DAR to calculate
  3038. // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in
  3039. // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either:
  3040. // a.) Enable Retroarch's "Crop Overscan"
  3041. // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
  3042. // Real consoles use horizontal black padding in the signal, but emulators
  3043. // often crop this without cropping the vertical padding; a 256x224 [S]NES
  3044. // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
  3045. // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
  3046. // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
  3047. // http://forums.nesdev.com/viewtopic.php?p=24815#p24815
  3048. // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
  3049. // without doing a. or b., but horizontal image borders will be tighter
  3050. // than vertical ones, messing up curvature and overscan. Fixing the
  3051. // padding first corrects this.
  3052. // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly
  3053. // or adjust x/y independently to e.g. readd horizontal padding, as noted
  3054. // above: Values < 1.0 zoom out; range (0, inf)
  3055. static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
  3056. // Compute a proper pixel-space to texture-space matrix even without ddx()/
  3057. // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering
  3058. // with strong curvature (static option only for now).
  3059. static const bool geom_force_correct_tangent_matrix = true;
  3060. // BORDERS:
  3061. // Rounded border size in texture uv coords:
  3062. static const float border_size_static = 0.015; // range [0, 0.5]
  3063. // Border darkness: Moderate values darken the border smoothly, and high
  3064. // values make the image very dark just inside the border:
  3065. static const float border_darkness_static = 2.0; // range [0, inf)
  3066. // Border compression: High numbers compress border transitions, narrowing
  3067. // the dark border area.
  3068. static const float border_compress_static = 2.5; // range [1, inf)
  3069. #endif // USER_SETTINGS_H
  3070. //////////////////////////// END USER-SETTINGS //////////////////////////
  3071. //#include "derived-settings-and-constants.h"
  3072. //////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS ////////////////////
  3073. #ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
  3074. #define DERIVED_SETTINGS_AND_CONSTANTS_H
  3075. ///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
  3076. // crt-royale: A full-featured CRT shader, with cheese.
  3077. // Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
  3078. //
  3079. // This program is free software; you can redistribute it and/or modify it
  3080. // under the terms of the GNU General Public License as published by the Free
  3081. // Software Foundation; either version 2 of the License, or any later version.
  3082. //
  3083. // This program is distributed in the hope that it will be useful, but WITHOUT
  3084. // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  3085. // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  3086. // more details.
  3087. //
  3088. // You should have received a copy of the GNU General Public License along with
  3089. // this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  3090. // Place, Suite 330, Boston, MA 02111-1307 USA
  3091. ///////////////////////////////// DESCRIPTION ////////////////////////////////
  3092. // These macros and constants can be used across the whole codebase.
  3093. // Unlike the values in user-settings.h, end users shouldn't modify these.
  3094. /////////////////////////////// BEGIN INCLUDES ///////////////////////////////
  3095. //#include "../user-settings.h"
  3096. ///////////////////////////// BEGIN USER-SETTINGS ////////////////////////////
  3097. #ifndef USER_SETTINGS_H
  3098. #define USER_SETTINGS_H
  3099. ///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
  3100. // The Cg compiler uses different "profiles" with different capabilities.
  3101. // This shader requires a Cg compilation profile >= arbfp1, but a few options
  3102. // require higher profiles like fp30 or fp40. The shader can't detect profile
  3103. // or driver capabilities, so instead you must comment or uncomment the lines
  3104. // below with "//" before "#define." Disable an option if you get compilation
  3105. // errors resembling those listed. Generally speaking, all of these options
  3106. // will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
  3107. // likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
  3108. // Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
  3109. // Among other things, derivatives help us fix anisotropic filtering artifacts
  3110. // with curved manually tiled phosphor mask coords. Related errors:
  3111. // error C3004: function "float2 ddx(float2);" not supported in this profile
  3112. // error C3004: function "float2 ddy(float2);" not supported in this profile
  3113. //#define DRIVERS_ALLOW_DERIVATIVES
  3114. // Fine derivatives: Unsupported on older ATI cards.
  3115. // Fine derivatives enable 2x2 fragment block communication, letting us perform
  3116. // fast single-pass blur operations. If your card uses coarse derivatives and
  3117. // these are enabled, blurs could look broken. Derivatives are a prerequisite.
  3118. #ifdef DRIVERS_ALLOW_DERIVATIVES
  3119. #define DRIVERS_ALLOW_FINE_DERIVATIVES
  3120. #endif
  3121. // Dynamic looping: Requires an fp30 or newer profile.
  3122. // This makes phosphor mask resampling faster in some cases. Related errors:
  3123. // error C5013: profile does not support "for" statements and "for" could not
  3124. // be unrolled
  3125. //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
  3126. // Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
  3127. // Using one static loop avoids overhead if the user is right, but if the user
  3128. // is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
  3129. // binary search can potentially save some iterations. However, it may fail:
  3130. // error C6001: Temporary register limit of 32 exceeded; 35 registers
  3131. // needed to compile program
  3132. //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
  3133. // tex2Dlod: Requires an fp40 or newer profile. This can be used to disable
  3134. // anisotropic filtering, thereby fixing related artifacts. Related errors:
  3135. // error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
  3136. // this profile
  3137. //#define DRIVERS_ALLOW_TEX2DLOD
  3138. // tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate
  3139. // artifacts from anisotropic filtering and mipmapping. Related errors:
  3140. // error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
  3141. // in this profile
  3142. //#define DRIVERS_ALLOW_TEX2DBIAS
  3143. // Integrated graphics compatibility: Integrated graphics like Intel HD 4000
  3144. // impose stricter limitations on register counts and instructions. Enable
  3145. // INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
  3146. // error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
  3147. // to compile program.
  3148. // Enabling integrated graphics compatibility mode will automatically disable:
  3149. // 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
  3150. // (This may be reenabled in a later release.)
  3151. // 2.) RUNTIME_GEOMETRY_MODE
  3152. // 3.) The high-quality 4x4 Gaussian resize for the bloom approximation
  3153. //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
  3154. //////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
  3155. // To disable a #define option, turn its line into a comment with "//."
  3156. // RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
  3157. // Enable runtime shader parameters in the Retroarch (etc.) GUI? They override
  3158. // many of the options in this file and allow real-time tuning, but many of
  3159. // them are slower. Disabling them and using this text file will boost FPS.
  3160. #define RUNTIME_SHADER_PARAMS_ENABLE
  3161. // Specify the phosphor bloom sigma at runtime? This option is 10% slower, but
  3162. // it's the only way to do a wide-enough full bloom with a runtime dot pitch.
  3163. #define RUNTIME_PHOSPHOR_BLOOM_SIGMA
  3164. // Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics)
  3165. #define RUNTIME_ANTIALIAS_WEIGHTS
  3166. // Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
  3167. //#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
  3168. // Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
  3169. // parameters? This will require more math or dynamic branching.
  3170. #define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
  3171. // Specify the tilt at runtime? This makes things about 3% slower.
  3172. #define RUNTIME_GEOMETRY_TILT
  3173. // Specify the geometry mode at runtime?
  3174. #define RUNTIME_GEOMETRY_MODE
  3175. // Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
  3176. // mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
  3177. // dynamic branches? This is cheap if mask_resize_viewport_scale is small.
  3178. #define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  3179. // PHOSPHOR MASK:
  3180. // Manually resize the phosphor mask for best results (slower)? Disabling this
  3181. // removes the option to do so, but it may be faster without dynamic branches.
  3182. #define PHOSPHOR_MASK_MANUALLY_RESIZE
  3183. // If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
  3184. #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
  3185. // Larger blurs are expensive, but we need them to blur larger triads. We can
  3186. // detect the right blur if the triad size is static or our profile allows
  3187. // dynamic branches, but otherwise we use the largest blur the user indicates
  3188. // they might need:
  3189. #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
  3190. //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
  3191. //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
  3192. //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
  3193. // Here's a helpful chart:
  3194. // MaxTriadSize BlurSize MinTriadCountsByResolution
  3195. // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  3196. // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  3197. // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  3198. // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  3199. // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  3200. /////////////////////////////// USER PARAMETERS //////////////////////////////
  3201. // Note: Many of these static parameters are overridden by runtime shader
  3202. // parameters when those are enabled. However, many others are static codepath
  3203. // options that were cleaner or more convenient to code as static constants.
  3204. // GAMMA:
  3205. static const float crt_gamma_static = 2.5; // range [1, 5]
  3206. static const float lcd_gamma_static = 2.2; // range [1, 5]
  3207. // LEVELS MANAGEMENT:
  3208. // Control the final multiplicative image contrast:
  3209. static const float levels_contrast_static = 1.0; // range [0, 4)
  3210. // We auto-dim to avoid clipping between passes and restore brightness
  3211. // later. Control the dim factor here: Lower values clip less but crush
  3212. // blacks more (static only for now).
  3213. static const float levels_autodim_temp = 0.5; // range (0, 1]; default 0.5. NOTE(review): an older note here said this was raised to 1.0 because 0.5 looked too dark, but the value is 0.5 -- confirm which is intended
  3214. // HALATION/DIFFUSION/BLOOM:
  3215. // Halation weight: How much energy should be lost to electrons bouncing
  3216. // around under the CRT glass and exciting random phosphors?
  3217. static const float halation_weight_static = 0.0; // range [0, 1]
  3218. // Refractive diffusion weight: How much light should spread/diffuse from
  3219. // refracting through the CRT glass?
  3220. static const float diffusion_weight_static = 0.075; // range [0, 1]
  3221. // Underestimate brightness: Bright areas bloom more, but we can base the
  3222. // bloom brightpass on a lower brightness to sharpen phosphors, or a higher
  3223. // brightness to soften them. Low values clip, but >= 0.8 looks okay.
  3224. static const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
  3225. // Blur all colors more than necessary for a softer phosphor bloom?
  3226. static const float bloom_excess_static = 0.0; // range [0, 1]
  3227. // The BLOOM_APPROX pass approximates a phosphor blur early on with a small
  3228. // blurred resize of the input (convergence offsets are applied as well).
  3229. // There are three filter options (static option only for now):
  3230. // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize
  3231. // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
  3232. // and beam_max_sigma is low.
  3233. // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
  3234. // always uses a static sigma regardless of beam_max_sigma or
  3235. // mask_num_triads_desired.
  3236. // 2.) True 4x4 Gaussian resize: Slowest, technically correct.
  3237. // These options are more pronounced for the fast, unbloomed shader version.
  3238. #ifndef RADEON_FIX
  3239. static const float bloom_approx_filter_static = 2.0; // 2 = true 4x4 Gaussian resize
  3240. #else
  3241. static const float bloom_approx_filter_static = 1.0; // 1 = 3x3 resize blur
  3242. #endif // RADEON_FIX
  3243. // ELECTRON BEAM SCANLINE DISTRIBUTION:
  3244. // How many scanlines should contribute light to each pixel? Using more
  3245. // scanlines is slower (especially for a generalized Gaussian) but less
  3246. // distorted with larger beam sigmas (especially for a pure Gaussian). The
  3247. // max_beam_sigma at which the closest unused weight is guaranteed <
  3248. // 1.0/255.0 (for a 3x antialiased pure Gaussian) is:
  3249. // 2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
  3250. // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
  3251. // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
  3252. // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
  3253. // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
  3254. static const float beam_num_scanlines = 3.0; // range [2, 6]
  3255. // A generalized Gaussian beam varies shape with color too, not just width.
  3256. // It's slower but more flexible (static option only for now).
  3257. static const bool beam_generalized_gaussian = true;
  3258. // What kind of scanline antialiasing do you want?
  3259. // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
  3260. // Integrals are slow (especially for generalized Gaussians) and rarely any
  3261. // better than 3x antialiasing (static option only for now).
  3262. static const float beam_antialias_level = 1.0; // range [0, 2]
  3263. // Min/max standard deviations for scanline beams: Higher values widen and
  3264. // soften scanlines. Depending on other options, low min sigmas can alias.
  3265. static const float beam_min_sigma_static = 0.02; // range (0, 1]
  3266. static const float beam_max_sigma_static = 0.3; // range (0, 1]
  3267. // Beam width varies as a function of color: A power function (0) is more
  3268. // configurable, but a spherical function (1) gives the widest beam
  3269. // variability without aliasing (static option only for now).
  3270. static const float beam_spot_shape_function = 0.0;
  3271. // Spot shape power: Powers <= 1 give smoother spot shapes but lower
  3272. // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
  3273. static const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
  3274. // Generalized Gaussian max shape parameters: Higher values give flatter
  3275. // scanline plateaus and steeper dropoffs, simultaneously widening and
  3276. // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and
  3277. // values > ~40.0 cause artifacts with integrals.
  3278. static const float beam_min_shape_static = 2.0; // range [2, 32]
  3279. static const float beam_max_shape_static = 4.0; // range [2, 32]
  3280. // Generalized Gaussian shape power: Affects how quickly the distribution
  3281. // changes shape from Gaussian to steep/plateaued as color increases from 0
  3282. // to 1.0. Higher powers appear softer for most colors, and lower powers
  3283. // appear sharper for most colors.
  3284. static const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
  3285. // What filter should be used to sample scanlines horizontally?
  3286. // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
  3287. static const float beam_horiz_filter_static = 0.0;
  3288. // Standard deviation for horizontal Gaussian resampling:
  3289. static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
  3290. // Do horizontal scanline sampling in linear RGB (correct light mixing),
  3291. // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
  3292. // limiting circuitry in some CRT's), or a weighted avg.?
  3293. static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
  3294. // Simulate scanline misconvergence? This needs 3x horizontal texture
  3295. // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
  3296. // later passes (static option only for now).
  3297. static const bool beam_misconvergence = true;
  3298. // Convergence offsets in x/y directions for R/G/B scanline beams in units
  3299. // of scanlines. Positive offsets go right/down; ranges [-2, 2]
  3300. static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
  3301. static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
  3302. static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
  3303. // Detect interlacing (static option only for now)?
  3304. static const bool interlace_detect = true;
  3305. // Assume 1080-line sources are interlaced?
  3306. static const bool interlace_1080i_static = false;
  3307. // For interlaced sources, assume TFF (top-field first) or BFF order?
  3308. // (Whether this matters depends on the nature of the interlaced input.)
  3309. static const bool interlace_bff_static = false;
  3310. // ANTIALIASING:
  3311. // What AA level do you want for curvature/overscan/subpixels? Options:
  3312. // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
  3313. // (Static option only for now)
  3314. static const float aa_level = 12.0; // range [0, 24]
  3315. // What antialiasing filter do you want (static option only)? Options:
  3316. // 0: Box (separable), 1: Box (cylindrical),
  3317. // 2: Tent (separable), 3: Tent (cylindrical),
  3318. // 4: Gaussian (separable), 5: Gaussian (cylindrical),
  3319. // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
  3320. // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
  3321. // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
  3322. static const float aa_filter = 6.0; // range [0, 9]
  3323. // Flip the sample grid on odd/even frames (static option only for now)?
  3324. static const bool aa_temporal = false;
  3325. // Use RGB subpixel offsets for antialiasing? The pixel is at green, and
  3326. // the blue offset is the negative r offset; range [0, 0.5]
  3327. static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
  3328. // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
  3329. // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
  3330. // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
  3331. // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
  3332. // 4.) C = 0.0 is a soft spline filter.
  3333. static const float aa_cubic_c_static = 0.5; // range [0, 4]
  3334. // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
  3335. static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
  3336. // PHOSPHOR MASK:
  3337. // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
  3338. static const float mask_type_static = 1.0; // range [0, 2]
  3339. // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible.
  3340. // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
  3341. // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
  3342. // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This
  3343. // is halfway decent with LUT mipmapping but atrocious without it.
  3344. // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
  3345. // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch.
  3346. // This mode reuses the same masks, so triads will be enormous unless
  3347. // you change the mask LUT filenames in your .cgp file.
  3348. static const float mask_sample_mode_static = 0.0; // range [0, 2]
  3349. // Prefer setting the triad size (0.0) or number on the screen (1.0)?
  3350. // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
  3351. // will always be used to calculate the full bloom sigma statically.
  3352. static const float mask_specify_num_triads_static = 0.0; // range [0, 1]
  3353. // Specify the phosphor triad size, in pixels. Each tile (usually with 8
  3354. // triads) will be rounded to the nearest integer tile size and clamped to
  3355. // obey minimum size constraints (imposed to reduce downsize taps) and
  3356. // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
  3357. // To increase the size limit, double the viewport-relative scales for the
  3358. // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
  3359. // range [1, mask_texture_small_size/mask_triads_per_tile]
  3360. static const float mask_triad_size_desired_static = 24.0 / 8.0;
  3361. // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
  3362. // final size will be rounded and constrained as above); default 480.0
  3363. static const float mask_num_triads_desired_static = 480.0;
  3364. // How many lobes should the sinc/Lanczos resizer use? More lobes require
  3365. // more samples and avoid moire a bit better, but some is unavoidable
  3366. // depending on the destination size (static option for now).
  3367. static const float mask_sinc_lobes = 3.0; // range [2, 4]
  3368. // The mask is resized using a variable number of taps in each dimension,
  3369. // but some Cg profiles always fetch a constant number of taps no matter
  3370. // what (no dynamic branching). We can limit the maximum number of taps if
  3371. // we statically limit the minimum phosphor triad size. Larger values are
  3372. // faster, but the limit IS enforced (static option only, forever);
  3373. // range [1, mask_texture_small_size/mask_triads_per_tile]
  3374. // TODO: Make this 1.0 and compensate with smarter sampling!
  3375. static const float mask_min_allowed_triad_size = 2.0;
  3376. // GEOMETRY:
  3377. // Geometry mode:
  3378. // 0: Off (default), 1: Spherical mapping (like cgwg's),
  3379. // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
  3380. static const float geom_mode_static = 0.0; // range [0, 3]
  3381. // Radius of curvature: Measured in units of your viewport's diagonal size.
  3382. static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
  3383. // View dist is the distance from the player to their physical screen, in
  3384. // units of the viewport's diagonal size. It controls the field of view.
  3385. static const float geom_view_dist_static = 2.0; // range [0.5, 1024]
  3386. // Tilt angle in radians (clockwise around up and right vectors):
  3387. static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi]
  3388. // Aspect ratio: When the true viewport size is unknown, this value is used
  3389. // to help convert between the phosphor triad size and count, along with
  3390. // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set
  3391. // this equal to Retroarch's display aspect ratio (DAR) for best results;
  3392. // range [1, geom_max_aspect_ratio from user-cgp-constants.h];
  3393. // default (256/224)*(54/47) = 1.313069909 (see below)
  3394. static const float geom_aspect_ratio_static = 1.313069909;
  3395. // Before getting into overscan, here's some general aspect ratio info:
  3396. // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
  3397. // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
  3398. // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping
  3399. // Geometry processing has to "undo" the screen-space 2D DAR to calculate
  3400. // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in
  3401. // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either:
  3402. // a.) Enable Retroarch's "Crop Overscan"
  3403. // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
  3404. // Real consoles use horizontal black padding in the signal, but emulators
  3405. // often crop this without cropping the vertical padding; a 256x224 [S]NES
  3406. // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
  3407. // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
  3408. // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
  3409. // http://forums.nesdev.com/viewtopic.php?p=24815#p24815
  3410. // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
  3411. // without doing a. or b., but horizontal image borders will be tighter
  3412. // than vertical ones, messing up curvature and overscan. Fixing the
  3413. // padding first corrects this.
  3414. // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly
  3415. // or adjust x/y independently to e.g. readd horizontal padding, as noted
  3416. // above: Values < 1.0 zoom out; range (0, inf)
  3417. static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
  3418. // Compute a proper pixel-space to texture-space matrix even without ddx()/
  3419. // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering
  3420. // with strong curvature (static option only for now).
  3421. static const bool geom_force_correct_tangent_matrix = true;
  3422. // BORDERS:
  3423. // Rounded border size in texture uv coords:
  3424. static const float border_size_static = 0.015; // range [0, 0.5]
  3425. // Border darkness: Moderate values darken the border smoothly, and high
  3426. // values make the image very dark just inside the border:
  3427. static const float border_darkness_static = 2.0; // range [0, inf)
  3428. // Border compression: High numbers compress border transitions, narrowing
  3429. // the dark border area.
  3430. static const float border_compress_static = 2.5; // range [1, inf)
  3431. #endif // USER_SETTINGS_H
  3432. ///////////////////////////// END USER-SETTINGS ////////////////////////////
  3433. //#include "user-cgp-constants.h"
  3434. ///////////////////////// BEGIN USER-CGP-CONSTANTS /////////////////////////
  3435. #ifndef USER_CGP_CONSTANTS_H
  3436. #define USER_CGP_CONSTANTS_H
  3437. // IMPORTANT:
  3438. // These constants MUST be set appropriately for the settings in crt-royale.cgp
  3439. // (or whatever related .cgp file you're using). If they aren't, you're likely
  3440. // to get artifacts, the wrong phosphor mask size, etc. I wish these could be
  3441. // set directly in the .cgp file to make things easier, but...they can't.
  3442. // PASS SCALES AND RELATED CONSTANTS:
  3443. // Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of
  3444. // this shader: One does a viewport-scale bloom, and the other skips it. The
  3445. // latter benefits from a higher bloom_approx_scale_x, so save both separately:
  3446. static const float bloom_approx_size_x = 320.0; // presumably for the viewport-scale-bloom version; confirm against the .cgp
  3447. static const float bloom_approx_size_x_for_fake = 400.0; // presumably for the version that skips that bloom (the higher value); confirm
  3448. // Copy the viewport-relative scales of the phosphor mask resize passes
  3449. // (MASK_RESIZE and the pass immediately preceding it):
  3450. static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); // 0.0625 = 1/16 per axis
  3451. // Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
  3452. static const float geom_max_aspect_ratio = 4.0/3.0;
  3453. // PHOSPHOR MASK TEXTURE CONSTANTS:
  3454. // Set the following constants to reflect the properties of the phosphor mask
  3455. // texture named in crt-royale.cgp. The shader optionally resizes a mask tile
  3456. // based on user settings, then repeats a single tile until filling the screen.
  3457. // The shader must know the input texture size (default 64x64), and to manually
  3458. // resize, it must also know the horizontal triads per tile (default 8).
  3459. static const float2 mask_texture_small_size = float2(64.0, 64.0);
  3460. static const float2 mask_texture_large_size = float2(512.0, 512.0);
  3461. static const float mask_triads_per_tile = 8.0;
  3462. // We need the average brightness of the phosphor mask to compensate for the
  3463. // dimming it causes. The following four values are roughly correct for the
  3464. // masks included with the shader. Update the value for any LUT texture you
  3465. // change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
  3466. // the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
  3467. //#define PHOSPHOR_MASK_GRILLE14
  3468. static const float mask_grille14_avg_color = 50.6666666/255.0;
  3469. // TileableLinearApertureGrille14Wide7d33Spacing*.png
  3470. // TileableLinearApertureGrille14Wide10And6Spacing*.png
  3471. static const float mask_grille15_avg_color = 53.0/255.0;
  3472. // TileableLinearApertureGrille15Wide6d33Spacing*.png
  3473. // TileableLinearApertureGrille15Wide8And5d5Spacing*.png
  3474. static const float mask_slot_avg_color = 46.0/255.0;
  3475. // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
  3476. // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
  3477. static const float mask_shadow_avg_color = 41.0/255.0;
  3478. // TileableLinearShadowMask*.png
  3479. // TileableLinearShadowMaskEDP*.png
  3480. #ifdef PHOSPHOR_MASK_GRILLE14
  3481. static const float mask_grille_avg_color = mask_grille14_avg_color;
  3482. #else // PHOSPHOR_MASK_GRILLE14 not defined: use the 15-pixel grille average
  3483. static const float mask_grille_avg_color = mask_grille15_avg_color;
  3484. #endif // PHOSPHOR_MASK_GRILLE14
  3485. #endif // USER_CGP_CONSTANTS_H
  3486. ////////////////////////// END USER-CGP-CONSTANTS //////////////////////////
  3487. //////////////////////////////// END INCLUDES ////////////////////////////////
  3488. /////////////////////////////// FIXED SETTINGS ///////////////////////////////
  3489. // Avoid dividing by zero; using a macro overloads for float, float2, etc. The result is max(abs(c), 2^-16), so it is never negative: the sign of c is discarded.
  3490. #define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16
  3491. // Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
  3492. #ifndef SIMULATE_CRT_ON_LCD
  3493. #define SIMULATE_CRT_ON_LCD
  3494. #endif // SIMULATE_CRT_ON_LCD
  3495. // Manually tiling a manually resized texture creates texture coord derivative
  3496. // discontinuities and confuses anisotropic filtering, causing discolored tile
  3497. // seams in the phosphor mask. Workarounds:
  3498. // a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's
  3499. // downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
  3500. // disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
  3501. // b.) "Tile flat twice" requires drawing two full tiles without border padding
  3502. // to the resized mask FBO, and it's incompatible with same-pass curvature.
  3503. // (Same-pass curvature isn't used but could be in the future...maybe.)
  3504. // c.) "Fix discontinuities" requires derivatives and drawing one tile with
  3505. // border padding to the resized mask FBO, but it works with same-pass
  3506. // curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
  3507. // Precedence: a, then b, then c (if multiple strategies are #defined).
  3508. #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen
  3509. #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen
  3510. #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen
  3511. // Also, manually resampling the phosphor mask is slightly blurrier with
  3512. // anisotropic filtering. (Resampling with mipmapping is even worse: It
  3513. // creates artifacts, but only with the fully bloomed shader.) The difference
  3514. // is subtle with small triads, but you can fix it for a small cost.
  3515. //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  3516. ////////////////////////////// DERIVED SETTINGS //////////////////////////////
  3517. // Intel HD 4000 GPUs can't handle manual mask resizing (for now), setting the
  3518. // geometry mode at runtime, or a 4x4 true Gaussian resize. Disable
  3519. // incompatible settings ASAP. (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
  3520. // #defined by either user-settings.h or a wrapper .cg that #includes the
  3521. // current .cg pass.)
  3522. #ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
  3523. #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
  3524. #undef PHOSPHOR_MASK_MANUALLY_RESIZE
  3525. #endif // PHOSPHOR_MASK_MANUALLY_RESIZE
  3526. #ifdef RUNTIME_GEOMETRY_MODE
  3527. #undef RUNTIME_GEOMETRY_MODE
  3528. #endif // RUNTIME_GEOMETRY_MODE
  3529. // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
  3530. // inferior in most cases, so replace 2.0 with 0.0:
  3531. static const float bloom_approx_filter =
  3532. bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; // any static value above 1.5 becomes 0.0; others pass through
  3533. #else // INTEGRATED_GRAPHICS_COMPATIBILITY_MODE not defined: use the static setting unchanged
  3534. static const float bloom_approx_filter = bloom_approx_filter_static;
  3535. #endif // INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
  3536. // Disable slow runtime paths if static parameters are used. Most of these
  3537. // won't be a problem anyway once the params are disabled, but some will.
  3538. #ifndef RUNTIME_SHADER_PARAMS_ENABLE
  3539. #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
  3540. #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
  3541. #endif // RUNTIME_PHOSPHOR_BLOOM_SIGMA
  3542. #ifdef RUNTIME_ANTIALIAS_WEIGHTS
  3543. #undef RUNTIME_ANTIALIAS_WEIGHTS
  3544. #endif // RUNTIME_ANTIALIAS_WEIGHTS
  3545. #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
  3546. #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
  3547. #endif // RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
  3548. #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
  3549. #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
  3550. #endif // RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
  3551. #ifdef RUNTIME_GEOMETRY_TILT
  3552. #undef RUNTIME_GEOMETRY_TILT
  3553. #endif // RUNTIME_GEOMETRY_TILT
  3554. #ifdef RUNTIME_GEOMETRY_MODE
  3555. #undef RUNTIME_GEOMETRY_MODE
  3556. #endif // RUNTIME_GEOMETRY_MODE
  3557. #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  3558. #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  3559. #endif // FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  3560. #endif // !RUNTIME_SHADER_PARAMS_ENABLE
  3561. // Make tex2Dbias a backup for tex2Dlod for wider compatibility.
  3562. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  3563. #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  3564. #endif // ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  3565. #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  3566. #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  3567. #endif // ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  3568. // Rule out unavailable anisotropic compatibility strategies:
  3569. #ifndef DRIVERS_ALLOW_DERIVATIVES
  3570. #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  3571. #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  3572. #endif // ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  3573. #endif // !DRIVERS_ALLOW_DERIVATIVES
  3574. #ifndef DRIVERS_ALLOW_TEX2DLOD
  3575. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  3576. #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  3577. #endif // ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  3578. #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  3579. #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  3580. #endif // ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  3581. #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
  3582. #undef ANTIALIAS_DISABLE_ANISOTROPIC
  3583. #endif // ANTIALIAS_DISABLE_ANISOTROPIC
  3584. #endif // !DRIVERS_ALLOW_TEX2DLOD
  3585. #ifndef DRIVERS_ALLOW_TEX2DBIAS
  3586. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  3587. #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  3588. #endif // ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  3589. #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  3590. #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  3591. #endif // ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  3592. #endif // !DRIVERS_ALLOW_TEX2DBIAS
  3593. // Prioritize anisotropic tiling compatibility strategies by performance and
  3594. // disable unused strategies. This concentrates all the nesting in one place.
  3595. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  3596. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  3597. #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  3598. #endif // ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  3599. #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  3600. #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  3601. #endif // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  3602. #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  3603. #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  3604. #endif // ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  3605. #else // ANISOTROPIC_TILING_COMPAT_TEX2DLOD not defined
  3606. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  3607. #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  3608. #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  3609. #endif // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  3610. #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  3611. #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  3612. #endif // ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  3613. #else // ANISOTROPIC_TILING_COMPAT_TEX2DBIAS not defined either
  3614. // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
  3615. // flat texture coords in the same pass, but that's all we use.
  3616. #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  3617. #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  3618. #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  3619. #endif // ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  3620. #endif // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  3621. #endif // ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  3622. #endif // ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  3623. // The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
  3624. // reduce some #ifdef nesting in the next section by essentially OR'ing them:
  3625. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  3626. #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
  3627. #endif // ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  3628. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  3629. #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
  3630. #endif // ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  3631. // Prioritize anisotropic resampling compatibility strategies the same way (tex2Dlod wins over tex2Dbias when both are defined):
  3632. #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  3633. #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  3634. #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  3635. #endif // ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  3636. #endif // ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  3637. /////////////////////// DERIVED PHOSPHOR MASK CONSTANTS //////////////////////
  3638. // If we can use the large mipmapped LUT without mipmapping artifacts, we
  3639. // should: It gives us more options for using fewer samples.
  3640. #ifdef DRIVERS_ALLOW_TEX2DLOD
  3641. #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  3642. // TODO: Take advantage of this!
  3643. #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
  3644. static const float2 mask_resize_src_lut_size = mask_texture_large_size;
  3645. #else // ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD not defined: fall back to the small LUT
  3646. static const float2 mask_resize_src_lut_size = mask_texture_small_size;
  3647. #endif // ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  3648. #else // DRIVERS_ALLOW_TEX2DLOD not defined: fall back to the small LUT
  3649. static const float2 mask_resize_src_lut_size = mask_texture_small_size;
  3650. #endif // DRIVERS_ALLOW_TEX2DLOD
  3651. // tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
  3652. // main_fragment, or a static alias of one of the above. This makes it hard
  3653. // to select the phosphor mask at runtime: We can't even assign to a uniform
  3654. // global in the vertex shader or select a sampler2D in the vertex shader and
  3655. // pass it to the fragment shader (even with explicit TEXUNIT# bindings),
  3656. // because it just gives us the input texture or a black screen. However, we
  3657. // can get around these limitations by calling tex2D three times with different
  3658. // uniform samplers (or resizing the phosphor mask three times altogether).
  3659. // With dynamic branches, we can process only one of these branches on top of
  3660. // quickly discarding fragments we don't need (cgc seems able to overcome
  3661. // limitations around dependent texture fetches inside of branches). Without
  3662. // dynamic branches, we have to process every branch for every fragment...which
  3663. // is slower. Runtime sampling mode selection is slower without dynamic
  3664. // branches as well. Let the user's static #defines decide if it's worth it.
  3665. #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
  3666. #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  3667. #else // DRIVERS_ALLOW_DYNAMIC_BRANCHES not defined: enable only when explicitly forced
  3668. #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  3669. #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  3670. #endif // FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  3671. #endif // DRIVERS_ALLOW_DYNAMIC_BRANCHES
  3672. // We need to render some minimum number of tiles in the resize passes.
  3673. // We need at least 1.0 just to repeat a single tile, and we need extra
  3674. // padding beyond that for anisotropic filtering, discontinuity fixing,
  3675. // antialiasing, same-pass curvature (not currently used), etc. First
  3676. // determine how many border texels and tiles we need, based on how the result
  3677. // will be sampled:
  3678. #ifdef GEOMETRY_EARLY
  3679. static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; // only the x component of the static offset is used here
  3680. // Most antialiasing filters have a base radius of 4.0 pixels:
  3681. static const float max_aa_base_pixel_border = 4.0 +
  3682. max_subpixel_offset;
  3683. #else // GEOMETRY_EARLY not defined: no antialiasing border is reserved
  3684. static const float max_aa_base_pixel_border = 0.0;
  3685. #endif // GEOMETRY_EARLY
  3686. // Anisotropic filtering adds about 0.5 to the pixel border:
  3687. #ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
  3688. static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
  3689. #else // tex2Dlod/tex2Dbias strategy active: no extra 0.5 is added
  3690. static const float max_aniso_pixel_border = max_aa_base_pixel_border;
  3691. #endif // !ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
  3692. // Fixing discontinuities adds 1.0 more to the pixel border:
  3693. #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  3694. static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
  3695. #else // ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES not defined
  3696. static const float max_tiled_pixel_border = max_aniso_pixel_border;
  3697. #endif // ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  3698. // Convert the pixel border to an integer texel border. Assume same-pass
  3699. // curvature about triples the texel frequency:
  3700. #ifdef GEOMETRY_EARLY
  3701. static const float max_mask_texel_border =
  3702. ceil(max_tiled_pixel_border * 3.0);
  3703. #else // GEOMETRY_EARLY not defined: no 3x factor
  3704. static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
  3705. #endif // GEOMETRY_EARLY
  3706. // Convert the texel border to a tile border using worst-case assumptions:
  3707. static const float max_mask_tile_border = max_mask_texel_border/
  3708. (mask_min_allowed_triad_size * mask_triads_per_tile); // texels per tile at the minimum allowed triad size
  3709. // Finally, set the number of resized tiles to render to MASK_RESIZE, and set
  3710. // the starting texel (inside borders) for sampling it.
  3711. #ifndef GEOMETRY_EARLY
  3712. #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  3713. // Special case: Render two tiles without borders. Anisotropic
  3714. // filtering doesn't seem to be a problem here.
  3715. static const float mask_resize_num_tiles = 1.0 + 1.0;
  3716. static const float mask_start_texels = 0.0;
  3717. #else // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE not defined: one tile plus a border on each side
  3718. static const float mask_resize_num_tiles = 1.0 +
  3719. 2.0 * max_mask_tile_border;
  3720. static const float mask_start_texels = max_mask_texel_border;
  3721. #endif // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  3722. #else // GEOMETRY_EARLY defined: always one tile plus a border on each side
  3723. static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
  3724. static const float mask_start_texels = max_mask_texel_border;
  3725. #endif // !GEOMETRY_EARLY
  3726. // We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
  3727. // mask_resize_viewport_scale. This limits the maximum final triad size.
  3728. // Estimate the minimum number of triads we can split the screen into in each
  3729. // dimension (we'll be as correct as mask_resize_viewport_scale is):
  3730. static const float mask_resize_num_triads =
  3731. mask_resize_num_tiles * mask_triads_per_tile;
  3732. static const float2 min_allowed_viewport_triads =
  3733. float2(mask_resize_num_triads) / mask_resize_viewport_scale; // same triad count on both axes, divided per axis by the viewport scale
  3734. //////////////////////// COMMON MATHEMATICAL CONSTANTS ///////////////////////
  3735. static const float pi = 3.141592653589; // pi truncated to 12 decimal places
  3736. // We often want to find the location of the previous texel, e.g.:
  3737. // const float2 curr_texel = uv * texture_size;
  3738. // const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
  3739. // const float2 prev_texel_uv = prev_texel / texture_size;
  3740. // However, many GPU drivers round incorrectly around exact texel locations.
  3741. // We need to subtract a little less than 0.5 before flooring, and some GPUs
  3742. // require this value to be farther from 0.5 than others; define it here.
  3743. // const float2 prev_texel =
  3744. // floor(curr_texel - float2(under_half)) + float2(0.5);
  3745. static const float under_half = 0.4995; // the "little less than 0.5" used in the floor() idiom above
  3746. #endif // DERIVED_SETTINGS_AND_CONSTANTS_H
  3747. ///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS ////////////////////////////
  3748. //#include "../../../../include/special-functions.h"
  3749. /////////////////////////// BEGIN SPECIAL-FUNCTIONS //////////////////////////
  3750. #ifndef SPECIAL_FUNCTIONS_H
  3751. #define SPECIAL_FUNCTIONS_H
  3752. ///////////////////////////////// MIT LICENSE ////////////////////////////////
  3753. // Copyright (C) 2014 TroggleMonkey
  3754. //
  3755. // Permission is hereby granted, free of charge, to any person obtaining a copy
  3756. // of this software and associated documentation files (the "Software"), to
  3757. // deal in the Software without restriction, including without limitation the
  3758. // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  3759. // sell copies of the Software, and to permit persons to whom the Software is
  3760. // furnished to do so, subject to the following conditions:
  3761. //
  3762. // The above copyright notice and this permission notice shall be included in
  3763. // all copies or substantial portions of the Software.
  3764. //
  3765. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  3766. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  3767. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  3768. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  3769. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  3770. // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  3771. // IN THE SOFTWARE.
  3772. ///////////////////////////////// DESCRIPTION ////////////////////////////////
  3773. // This file implements the following mathematical special functions:
  3774. // 1.) erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt
  3775. // 2.) gamma(s), a real-numbered extension of the integer factorial function
  3776. // It also implements normalized_ligamma(s, z), a normalized lower incomplete
  3777. // gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can
  3778. // be called with an _impl suffix to use an implementation version with a few
  3779. // extra precomputed parameters (which may be useful for the caller to reuse).
  3780. // See below for details.
  3781. //
  3782. // Design Rationale:
  3783. // Pretty much every line of code in this file is duplicated four times for
  3784. // different input types (float4/float3/float2/float). This is unfortunate,
  3785. // but Cg doesn't allow function templates. Macros would be far less verbose,
  3786. // but they would make the code harder to document and read. I don't expect
  3787. // these functions will require a whole lot of maintenance changes unless
  3788. // someone ever has need for more robust incomplete gamma functions, so code
  3789. // duplication seems to be the lesser evil in this case.
  3790. /////////////////////////// GAUSSIAN ERROR FUNCTION //////////////////////////
  3791. float4 erf6(const float4 x) // x is const, matching the float3/float2/float overloads; it is never modified
  3792. {
  3793. // Requires: x is the standard parameter to erf().
  3794. // Returns: Return an Abramowitz/Stegun approximation of erf(), where:
  3795. // erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-u**2) du
  3796. // This approximation has a max absolute error of 2.5*10**-5
  3797. // with solid numerical robustness and efficiency. See:
  3798. // https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
  3799. static const float4 one = float4(1.0);
  3800. const float4 sign_x = sign(x); // the polynomial is evaluated on |x|; the sign is reapplied on return
  3801. const float4 t = one/(one + 0.47047*abs(x)); // t = 1/(1 + p*|x|) with p = 0.47047
  3802. const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
  3803. exp(-(x*x)); // 1 - (a1*t + a2*t^2 + a3*t^3) * e^(-x^2), in Horner form
  3804. return result * sign_x;
  3805. }
  3806. float3 erf6(const float3 x)
  3807. {
  3808. // Float3 version: per-component erf() approximation using the same formula and constants as the float4 overload.
  3809. static const float3 one = float3(1.0);
  3810. const float3 sign_x = sign(x); // the polynomial is evaluated on |x|; the sign is reapplied on return
  3811. const float3 t = one/(one + 0.47047*abs(x)); // t = 1/(1 + p*|x|) with p = 0.47047
  3812. const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
  3813. exp(-(x*x)); // 1 - (a1*t + a2*t^2 + a3*t^3) * e^(-x^2), in Horner form
  3814. return result * sign_x;
  3815. }
  3816. float2 erf6(const float2 x)
  3817. {
  3818. // Float2 version: per-component erf() approximation using the same formula and constants as the float4 overload.
  3819. static const float2 one = float2(1.0);
  3820. const float2 sign_x = sign(x); // the polynomial is evaluated on |x|; the sign is reapplied on return
  3821. const float2 t = one/(one + 0.47047*abs(x)); // t = 1/(1 + p*|x|) with p = 0.47047
  3822. const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
  3823. exp(-(x*x)); // 1 - (a1*t + a2*t^2 + a3*t^3) * e^(-x^2), in Horner form
  3824. return result * sign_x;
  3825. }
  3826. float erf6(const float x)
  3827. {
  3828. // Float version: scalar erf() approximation using the same formula and constants as the float4 overload.
  3829. const float sign_x = sign(x); // the polynomial is evaluated on |x|; the sign is reapplied on return
  3830. const float t = 1.0/(1.0 + 0.47047*abs(x)); // t = 1/(1 + p*|x|) with p = 0.47047
  3831. const float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
  3832. exp(-(x*x)); // 1 - (a1*t + a2*t^2 + a3*t^3) * e^(-x^2), in Horner form
  3833. return result * sign_x;
  3834. }
  3835. float4 erft(const float4 x)
  3836. {
  3837. // Requires: x is the standard parameter to erf().
  3838. // Returns: Approximate erf() with the hyperbolic tangent. The error is
  3839. // visually noticeable, but it's blazing fast and perceptually
  3840. // close...at least on ATI hardware. See:
  3841. // http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
  3842. // Warning: Only use this if your hardware drivers correctly implement
  3843. // tanh(): My nVidia 8800GTS returns garbage output.
  3844. return tanh(1.202760580 * x); // erf(x) ~= tanh(k*x) with k = 1.202760580, per component
  3845. }
  3846. float3 erft(const float3 x)
  3847. {
  3848. // Float3 version: same tanh-based erf() approximation as the float4 overload (same driver caveat for tanh()).
  3849. return tanh(1.202760580 * x);
  3850. }
  3851. float2 erft(const float2 x)
  3852. {
  3853. // Float2 version: same tanh-based erf() approximation as the float4 overload (same driver caveat for tanh()).
  3854. return tanh(1.202760580 * x);
  3855. }
  3856. float erft(const float x)
  3857. {
  3858. // Float version: same tanh-based erf() approximation as the float4 overload (same driver caveat for tanh()).
  3859. return tanh(1.202760580 * x);
  3860. }
  3861. inline float4 erf(const float4 x)
  3862. {
  3863. // Requires: x is the standard parameter to erf().
  3864. // Returns: Some approximation of erf(x), depending on user settings: erft() when ERF_FAST_APPROXIMATION is #defined, otherwise erf6().
  3865. #ifdef ERF_FAST_APPROXIMATION
  3866. return erft(x);
  3867. #else // ERF_FAST_APPROXIMATION not defined
  3868. return erf6(x);
  3869. #endif // ERF_FAST_APPROXIMATION
  3870. }
  3871. inline float3 erf(const float3 x)
  3872. {
  3873. // Float3 version: dispatches to erft() when ERF_FAST_APPROXIMATION is #defined, otherwise to erf6().
  3874. #ifdef ERF_FAST_APPROXIMATION
  3875. return erft(x);
  3876. #else // ERF_FAST_APPROXIMATION not defined
  3877. return erf6(x);
  3878. #endif // ERF_FAST_APPROXIMATION
  3879. }
  3880. inline float2 erf(const float2 x)
  3881. {
  3882. // Float2 version: dispatches to erft() when ERF_FAST_APPROXIMATION is #defined, otherwise to erf6().
  3883. #ifdef ERF_FAST_APPROXIMATION
  3884. return erft(x);
  3885. #else // ERF_FAST_APPROXIMATION not defined
  3886. return erf6(x);
  3887. #endif // ERF_FAST_APPROXIMATION
  3888. }
  3889. inline float erf(const float x)
  3890. {
  3891. // Float version: dispatches to erft() when ERF_FAST_APPROXIMATION is #defined, otherwise to erf6().
  3892. #ifdef ERF_FAST_APPROXIMATION
  3893. return erft(x);
  3894. #else // ERF_FAST_APPROXIMATION not defined
  3895. return erf6(x);
  3896. #endif // ERF_FAST_APPROXIMATION
  3897. }
  3898. /////////////////////////// COMPLETE GAMMA FUNCTION //////////////////////////
  3899. float4 gamma_impl(const float4 s, const float4 s_inv)
  3900. {
  3901. // Requires: 1.) s is the standard parameter to the gamma function, and
  3902. // it should lie in the [0, 36] range.
  3903. // 2.) s_inv = 1.0/s. This implementation function requires
  3904. // the caller to precompute this value, giving users the
  3905. // opportunity to reuse it.
  3906. // Returns: Return approximate gamma function (real-numbered factorial)
  3907. // output using the Lanczos approximation with two coefficients
  3908. // calculated using Paul Godfrey's method here:
  3909. // http://my.fit.edu/~gabdo/gamma.txt
  3910. // An optimal g value for s in [0, 36] is ~1.12906830989, with
  3911. // a maximum relative error of 0.000463 for 2**16 equally-spaced
  3912. // evals. We could use three coeffs (0.0000346 error) without
  3913. // hurting latency, but this allows more parallelism with
  3914. // outside instructions.
  3915. static const float4 g = float4(1.12906830989);
  3916. static const float4 c0 = float4(0.8109119309638332633713423362694399653724431);
  3917. static const float4 c1 = float4(0.4808354605142681877121661197951496120000040);
  3918. static const float4 e = float4(2.71828182845904523536028747135266249775724709); // Euler's number
  3919. const float4 sph = s + float4(0.5); // "s plus half"
  3920. const float4 lanczos_sum = c0 + c1/(s + float4(1.0));
  3921. const float4 base = (sph + g)/e; // or (s + g + float4(0.5))/e
  3922. // gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s).
  3923. // This has less error for small s's than (s -= 1.0) at the beginning.
  3924. return (pow(base, sph) * lanczos_sum) * s_inv;
  3925. }
  3926. float3 gamma_impl(const float3 s, const float3 s_inv)
  3927. {
  3928. // Float3 version: same two-coefficient Lanczos evaluation as the float4 overload; s_inv must be the caller's precomputed 1.0/s.
  3929. static const float3 g = float3(1.12906830989);
  3930. static const float3 c0 = float3(0.8109119309638332633713423362694399653724431);
  3931. static const float3 c1 = float3(0.4808354605142681877121661197951496120000040);
  3932. static const float3 e = float3(2.71828182845904523536028747135266249775724709); // Euler's number
  3933. const float3 sph = s + float3(0.5); // "s plus half"
  3934. const float3 lanczos_sum = c0 + c1/(s + float3(1.0));
  3935. const float3 base = (sph + g)/e;
  3936. return (pow(base, sph) * lanczos_sum) * s_inv; // pow(base, sph) * lanczos_sum approximates gamma(s + 1); * s_inv gives gamma(s)
  3937. }
  3938. float2 gamma_impl(const float2 s, const float2 s_inv)
  3939. {
  3940. // Float2 version: same two-coefficient Lanczos evaluation as the float4 overload; s_inv must be the caller's precomputed 1.0/s.
  3941. static const float2 g = float2(1.12906830989);
  3942. static const float2 c0 = float2(0.8109119309638332633713423362694399653724431);
  3943. static const float2 c1 = float2(0.4808354605142681877121661197951496120000040);
  3944. static const float2 e = float2(2.71828182845904523536028747135266249775724709); // Euler's number
  3945. const float2 sph = s + float2(0.5); // "s plus half"
  3946. const float2 lanczos_sum = c0 + c1/(s + float2(1.0));
  3947. const float2 base = (sph + g)/e;
  3948. return (pow(base, sph) * lanczos_sum) * s_inv; // pow(base, sph) * lanczos_sum approximates gamma(s + 1); * s_inv gives gamma(s)
  3949. }
  3950. float gamma_impl(const float s, const float s_inv)
  3951. {
  3952. // Float version: same two-coefficient Lanczos evaluation as the float4 overload; s_inv must be the caller's precomputed 1.0/s.
  3953. static const float g = 1.12906830989;
  3954. static const float c0 = 0.8109119309638332633713423362694399653724431;
  3955. static const float c1 = 0.4808354605142681877121661197951496120000040;
  3956. static const float e = 2.71828182845904523536028747135266249775724709; // Euler's number
  3957. const float sph = s + 0.5; // "s plus half"
  3958. const float lanczos_sum = c0 + c1/(s + 1.0);
  3959. const float base = (sph + g)/e;
  3960. return (pow(base, sph) * lanczos_sum) * s_inv; // pow(base, sph) * lanczos_sum approximates gamma(s + 1); * s_inv gives gamma(s)
  3961. }
  3962. float4 gamma(const float4 s)
  3963. {
  3964. // Requires: s is the standard parameter to the gamma function, and it
  3965. // should lie in the [0, 36] range.
  3966. // Returns: Return approximate gamma function output with a maximum
  3967. // relative error of 0.000463. See gamma_impl for details.
  3968. return gamma_impl(s, float4(1.0)/s); // note: 1.0/s is computed directly, so a component with s == 0 divides by zero
  3969. }
  3970. float3 gamma(const float3 s)
  3971. {
  3972. // Float3 version: forwards to gamma_impl with s_inv = 1.0/s (so a component with s == 0 divides by zero).
  3973. return gamma_impl(s, float3(1.0)/s);
  3974. }
  3975. float2 gamma(const float2 s)
  3976. {
  3977. // Float2 version: forwards to gamma_impl with s_inv = 1.0/s (so a component with s == 0 divides by zero).
  3978. return gamma_impl(s, float2(1.0)/s);
  3979. }
  3980. float gamma(const float s)
  3981. {
  3982. // Float version: forwards to gamma_impl with s_inv = 1.0/s (so s == 0 divides by zero).
  3983. return gamma_impl(s, 1.0/s);
  3984. }
  3985. //////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) ///////////////
  3986. // Lower incomplete gamma function for small s and z (implementation):
  3987. float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
  3988. {
  3989. // Requires: 1.) s < ~0.5
  3990. // 2.) z <= ~0.775075
  3991. // 3.) s_inv = 1.0/s (precomputed for outside reuse)
  3992. // Returns: A series representation for the lower incomplete gamma
  3993. // function for small s and small z (4 terms).
  3994. // The actual "rolled up" summation (terms k = 0..3, later scaled by z**s) looks like:
  3995. // last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0;
  3996. // sum = last_sign * last_pow / (s * last_factorial) // k = 0 term, i.e. s_inv
  3997. // for(int k = 1; k < 4; ++k)
  3998. // {
  3999. // last_sign *= -1.0; last_pow *= z; last_factorial *= k;
  4000. // sum += last_sign * last_pow / ((s + k) * last_factorial);
  4001. // }
  4002. // Unrolled, constant-unfolded and arranged for madds and parallelism:
  4003. const float4 scale = pow(z, s);
  4004. float4 sum = s_inv; // Summation iteration 0 result
  4005. // Summation iterations 1, 2, and 3:
  4006. const float4 z_sq = z*z;
  4007. const float4 denom1 = s + float4(1.0); // 1! * (s + 1)
  4008. const float4 denom2 = 2.0*s + float4(4.0); // 2! * (s + 2)
  4009. const float4 denom3 = 6.0*s + float4(18.0); // 3! * (s + 3)
  4010. //float4 denom4 = 24.0*s + float4(96.0);
  4011. sum -= z/denom1;
  4012. sum += z_sq/denom2;
  4013. sum -= z * z_sq/denom3;
  4014. //sum += z_sq * z_sq / denom4;
  4015. // Scale and return:
  4016. return scale * sum;
  4017. }
  4018. float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
  4019. {
  4020. // Float3 version: same 4-term series as the float4 overload (same requirements on s, z and s_inv = 1.0/s).
  4021. const float3 scale = pow(z, s);
  4022. float3 sum = s_inv; // k = 0 term
  4023. const float3 z_sq = z*z;
  4024. const float3 denom1 = s + float3(1.0); // 1! * (s + 1)
  4025. const float3 denom2 = 2.0*s + float3(4.0); // 2! * (s + 2)
  4026. const float3 denom3 = 6.0*s + float3(18.0); // 3! * (s + 3)
  4027. sum -= z/denom1;
  4028. sum += z_sq/denom2;
  4029. sum -= z * z_sq/denom3;
  4030. return scale * sum;
  4031. }
  4032. float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
  4033. {
  4034. // Float2 version: same 4-term series as the float4 overload (same requirements on s, z and s_inv = 1.0/s).
  4035. const float2 scale = pow(z, s);
  4036. float2 sum = s_inv; // k = 0 term
  4037. const float2 z_sq = z*z;
  4038. const float2 denom1 = s + float2(1.0); // 1! * (s + 1)
  4039. const float2 denom2 = 2.0*s + float2(4.0); // 2! * (s + 2)
  4040. const float2 denom3 = 6.0*s + float2(18.0); // 3! * (s + 3)
  4041. sum -= z/denom1;
  4042. sum += z_sq/denom2;
  4043. sum -= z * z_sq/denom3;
  4044. return scale * sum;
  4045. }
  4046. float ligamma_small_z_impl(const float s, const float z, const float s_inv)
  4047. {
  4048. // Float version: same 4-term series as the float4 overload (same requirements on s, z and s_inv = 1.0/s).
  4049. const float scale = pow(z, s);
  4050. float sum = s_inv; // k = 0 term
  4051. const float z_sq = z*z;
  4052. const float denom1 = s + 1.0; // 1! * (s + 1)
  4053. const float denom2 = 2.0*s + 4.0; // 2! * (s + 2)
  4054. const float denom3 = 6.0*s + 18.0; // 3! * (s + 3)
  4055. sum -= z/denom1;
  4056. sum += z_sq/denom2;
  4057. sum -= z * z_sq/denom3;
  4058. return scale * sum;
  4059. }
  4060. // Upper incomplete gamma function for small s and large z (implementation):
  4061. float4 uigamma_large_z_impl(const float4 s, const float4 z)
  4062. {
  4063. // Requires: 1.) s < ~0.5
  4064. // 2.) z > ~0.775075
  4065. // Returns: Gauss's continued fraction representation for the upper
  4066. // incomplete gamma function (4 terms).
  4067. // The "rolled up" continued fraction looks like this. The denominator
  4068. // is truncated, and it's calculated "from the bottom up:"
  4069. // denom = float4('inf');
  4070. // float4 one = float4(1.0);
  4071. // for(int i = 4; i > 0; --i)
  4072. // {
  4073. // denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom;
  4074. // }
  4075. // Unrolled and constant-unfolded for madds and parallelism:
  4076. const float4 numerator = pow(z, s) * exp(-z);
  4077. float4 denom = float4(7.0) + z - s; // i = 4: the (i*(s - i))/inf term vanishes
  4078. denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom; // i = 3: 3*(s - 3)
  4079. denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom; // i = 2: 2*(s - 2)
  4080. denom = float4(1.0) + z - s + (s - float4(1.0))/denom; // i = 1: 1*(s - 1)
  4081. return numerator / denom;
  4082. }
  4083. float3 uigamma_large_z_impl(const float3 s, const float3 z)
  4084. {
  4085. // Float3 version: same 4-level continued fraction as the float4 overload, evaluated from the innermost level outward.
  4086. const float3 numerator = pow(z, s) * exp(-z);
  4087. float3 denom = float3(7.0) + z - s; // innermost (truncated) level
  4088. denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom; // 3*(s - 3)
  4089. denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom; // 2*(s - 2)
  4090. denom = float3(1.0) + z - s + (s - float3(1.0))/denom; // 1*(s - 1)
  4091. return numerator / denom;
  4092. }
  4093. float2 uigamma_large_z_impl(const float2 s, const float2 z)
  4094. {
  4095. // float2 overload: 4-level continued fraction, evaluated innermost-first.
  4096. const float2 level4 = float2(7.0) + z - s;
  4097. const float2 level3 = float2(5.0) + z - s + (3.0*s - float2(9.0))/level4;
  4098. const float2 level2 = float2(3.0) + z - s + (2.0*s - float2(4.0))/level3;
  4099. const float2 level1 = float2(1.0) + z - s + (s - float2(1.0))/level2;
  4100. const float2 prefactor = pow(z, s) * exp(-z);
  4101. return prefactor / level1;
  4102. }
  4103. float uigamma_large_z_impl(const float s, const float z)
  4104. {
  4105. // Scalar overload: 4-level continued fraction, evaluated innermost-first.
  4106. const float level4 = 7.0 + z - s;
  4107. const float level3 = 5.0 + z - s + (3.0*s - 9.0)/level4;
  4108. const float level2 = 3.0 + z - s + (2.0*s - 4.0)/level3;
  4109. const float level1 = 1.0 + z - s + (s - 1.0)/level2;
  4110. const float prefactor = pow(z, s) * exp(-z);
  4111. return prefactor / level1;
  4112. }
  4113. // Normalized lower incomplete gamma function for small s (implementation):
  4114. float4 normalized_ligamma_impl(const float4 s, const float4 z,
  4115. const float4 s_inv, const float4 gamma_s_inv)
  4116. {
  4117. // Requires: 1.) s < ~0.5
  4118. // 2.) s_inv = 1/s (precomputed so callers can reuse it)
  4119. // 3.) gamma_s_inv = 1/gamma(s) (precomputed so callers can reuse it)
  4120. // Returns: Approximation of the normalized lower incomplete gamma
  4121. // function for s < 0.5. Restricting s means only two branches
  4122. // (not four) are needed, selected per component by z. Each
  4123. // branch uses four terms; max relative error is ~0.00182. The
  4124. // threshold and branch details were adapted for fewer terms
  4125. // from Gil/Segura/Temme's paper:
  4126. // http://oai.cwi.nl/oai/asset/20433/20433B.pdf
  4127. // Both branches are evaluated (real branching tested slower), then masked:
  4128. static const float4 thresh = float4(0.775075);
  4129. const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
  4130. const float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
  4131. // Per-component test: true selects the large-z (continued fraction) result.
  4132. const bool4 use_large = bool4(z.x > thresh.x, z.y > thresh.y,
  4133. z.z > thresh.z, z.w > thresh.w);
  4134. const bool4 use_small = not(use_large);
  4135. // Exactly one mask is 1.0 per component, so the sum picks one branch:
  4136. const float4 large_mask = float4(use_large);
  4137. const float4 small_mask = float4(use_small);
  4138. return large_z * large_mask + small_z * small_mask;
  4139. }
  4140. float3 normalized_ligamma_impl(const float3 s, const float3 z,
  4141. const float3 s_inv, const float3 gamma_s_inv)
  4142. {
  4143. // float3 overload: evaluate both branches, then mask-select per component.
  4144. static const float3 thresh = float3(0.775075);
  4145. const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
  4146. const float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
  4147. // true selects the large-z result for that component:
  4148. const bool3 use_large = bool3(z.x > thresh.x, z.y > thresh.y, z.z > thresh.z);
  4149. const bool3 use_small = not(use_large);
  4150. const float3 large_mask = float3(use_large);
  4151. const float3 small_mask = float3(use_small);
  4152. return large_z * large_mask + small_z * small_mask;
  4153. }
  4154. float2 normalized_ligamma_impl(const float2 s, const float2 z,
  4155. const float2 s_inv, const float2 gamma_s_inv)
  4156. {
  4157. // float2 overload: evaluate both branches, then mask-select per component.
  4158. static const float2 thresh = float2(0.775075);
  4159. const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
  4160. const float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
  4161. const bool2 use_large = bool2(z.x > thresh.x, z.y > thresh.y);
  4162. const bool2 use_small = not(use_large);
  4163. const float2 large_mask = float2(use_large);
  4164. const float2 small_mask = float2(use_small);
  4165. return large_z * large_mask + small_z * small_mask;
  4166. }
  4167. float normalized_ligamma_impl(const float s, const float z,
  4168. const float s_inv, const float gamma_s_inv)
  4169. {
  4170. // Scalar overload: both branches are computed and blended by 0/1 weights.
  4171. static const float thresh = 0.775075;
  4172. const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
  4173. const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
  4174. const float large_weight = float(z > thresh);
  4175. return large_z * large_weight + small_z * float(!(z > thresh));
  4176. }
  4177. // Normalized lower incomplete gamma function for small s:
  4178. float4 normalized_ligamma(const float4 s, const float4 z)
  4179. {
  4180. // Requires: s < ~0.5
  4181. // Returns: Normalized lower incomplete gamma approximation for s < 0.5,
  4182. // computed by normalized_ligamma_impl() from 1/s and 1/gamma(s).
  4183. const float4 recip_s = float4(1.0)/s;
  4184. const float4 recip_gamma_s = float4(1.0)/gamma_impl(s, recip_s);
  4185. return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
  4186. }
  4187. float3 normalized_ligamma(const float3 s, const float3 z)
  4188. {
  4189. // float3 overload: precompute the reciprocals, then defer to the impl.
  4190. const float3 recip_s = float3(1.0)/s;
  4191. const float3 recip_gamma_s = float3(1.0)/gamma_impl(s, recip_s);
  4192. return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
  4193. }
  4194. float2 normalized_ligamma(const float2 s, const float2 z)
  4195. {
  4196. // float2 overload: precompute the reciprocals, then defer to the impl.
  4197. const float2 recip_s = float2(1.0)/s;
  4198. const float2 recip_gamma_s = float2(1.0)/gamma_impl(s, recip_s);
  4199. return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
  4200. }
  4201. float normalized_ligamma(const float s, const float z)
  4202. {
  4203. // Scalar overload: precompute the reciprocals, then defer to the impl.
  4204. const float recip_s = 1.0/s;
  4205. const float recip_gamma_s = 1.0/gamma_impl(s, recip_s);
  4206. return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
  4207. }
  4208. #endif // SPECIAL_FUNCTIONS_H
  4209. //////////////////////////// END SPECIAL-FUNCTIONS ///////////////////////////
  4210. //#include "../../../../include/gamma-management.h"
  4211. //////////////////////////// BEGIN GAMMA-MANAGEMENT //////////////////////////
  4212. #ifndef GAMMA_MANAGEMENT_H
  4213. #define GAMMA_MANAGEMENT_H
  4214. ///////////////////////////////// MIT LICENSE ////////////////////////////////
  4215. // Copyright (C) 2014 TroggleMonkey
  4216. //
  4217. // Permission is hereby granted, free of charge, to any person obtaining a copy
  4218. // of this software and associated documentation files (the "Software"), to
  4219. // deal in the Software without restriction, including without limitation the
  4220. // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  4221. // sell copies of the Software, and to permit persons to whom the Software is
  4222. // furnished to do so, subject to the following conditions:
  4223. //
  4224. // The above copyright notice and this permission notice shall be included in
  4225. // all copies or substantial portions of the Software.
  4226. //
  4227. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  4228. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  4229. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  4230. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  4231. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  4232. // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  4233. // IN THE SOFTWARE.
  4234. ///////////////////////////////// DESCRIPTION ////////////////////////////////
  4235. // This file provides gamma-aware tex*D*() and encode_output() functions.
  4236. // Requires: Before #include-ing this file, the including file must #define
  4237. // the following macros when applicable and follow their rules:
  4238. // 1.) #define FIRST_PASS if this is the first pass.
  4239. // 2.) #define LAST_PASS if this is the last pass.
  4240. // 3.) If sRGB is available, set srgb_framebufferN = "true" for
  4241. // every pass except the last in your .cgp preset.
  4242. // 4.) If sRGB isn't available but you want gamma-correctness with
  4243. // no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
  4244. // 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
  4245. // 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
  4246. // 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
  4247. // 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
  4248. // If an option in [5, 8] is #defined in the first or last pass, it
  4249. // should be #defined for both. It shouldn't make a difference
  4250. // whether it's #defined for intermediate passes or not.
  4251. // Optional: The including file (or an earlier included file) may optionally
  4252. // #define a number of macros indicating it will override certain
  4253. // static constants with either static or uniform constants. The
  4254. // macros and associated constants are as follows:
  4255. // 1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
  4256. // static const float ntsc_gamma
  4257. // static const float pal_gamma
  4258. // static const float crt_reference_gamma_high
  4259. // static const float crt_reference_gamma_low
  4260. // static const float lcd_reference_gamma
  4261. // static const float crt_office_gamma
  4262. // static const float lcd_office_gamma
  4263. // 2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
  4264. // static const float crt_gamma
  4265. // static const float gba_gamma
  4266. // static const float lcd_gamma
  4267. // 3.) OVERRIDE_FINAL_GAMMA: The user must first define:
  4268. // static const float input_gamma
  4269. // static const float intermediate_gamma
  4270. // static const float output_gamma
  4271. // (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
  4272. // 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
  4273. // static const bool assume_opaque_alpha
  4274. // The gamma constant overrides must be used in every pass or none,
  4275. // and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
  4276. // OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
  4277. // Usage: After setting macros appropriately, ignore gamma correction and
  4278. // replace all tex*D*() calls with equivalent gamma-aware
  4279. // tex*D*_linearize calls, except:
  4280. // 1.) When you read an LUT, use regular tex*D or a gamma-specified
  4281. // function, depending on its gamma encoding:
  4282. // tex*D*_linearize_gamma (takes a runtime gamma parameter)
  4283. // 2.) If you must read pass0's original input in a later pass, use
  4284. // tex2D_linearize_ntsc_gamma. If you want to read pass0's
  4285. // input with gamma-corrected bilinear filtering, consider
  4286. // creating a first linearizing pass and reading from the input
  4287. // of pass1 later.
  4288. // Then, return encode_output(color) from every fragment shader.
  4289. // Finally, use the global gamma_aware_bilinear boolean if you want
  4290. // to statically branch based on whether bilinear filtering is
  4291. // gamma-correct or not (e.g. for placing Gaussian blur samples).
  4292. //
  4293. // Detailed Policy:
  4294. // tex*D*_linearize() functions enforce a consistent gamma-management policy
  4295. // based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume
  4296. // their input texture has the same encoding characteristics as the input for
  4297. // the current pass (which doesn't apply to the exceptions listed above).
  4298. // Similarly, encode_output() enforces a policy based on the LAST_PASS and
  4299. // GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the
  4300. // following two pipelines.
  4301. // Typical pipeline with intermediate sRGB framebuffers:
  4302. // linear_color = pow(pass0_encoded_color, input_gamma);
  4303. // intermediate_output = linear_color; // Automatic sRGB encoding
  4304. // linear_color = intermediate_output; // Automatic sRGB decoding
  4305. // final_output = pow(linear_color, 1.0/output_gamma);
  4306. // Typical pipeline without intermediate sRGB framebuffers:
  4307. // linear_color = pow(pass0_encoded_color, input_gamma);
  4308. // intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
  4309. // linear_color = pow(intermediate_output, intermediate_gamma);
  4310. // final_output = pow(linear_color, 1.0/output_gamma);
  4311. // Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
  4312. // easily get gamma-correctness without banding on devices where sRGB isn't
  4313. // supported.
  4314. //
  4315. // Use This Header to Maximize Code Reuse:
  4316. // The purpose of this header is to provide a consistent interface for texture
  4317. // reads and output gamma-encoding that localizes and abstracts away all the
  4318. // annoying details. This greatly reduces the amount of code in each shader
  4319. // pass that depends on the pass number in the .cgp preset or whether sRGB
  4320. // FBO's are being used: You can trivially change the gamma behavior of your
  4321. // whole pass by commenting or uncommenting 1-3 #defines. To reuse the same
  4322. // code in your first, Nth, and last passes, you can even put it all in another
  4323. // header file and #include it from skeleton .cg files that #define the
  4324. // appropriate pass-specific settings.
  4325. //
  4326. // Rationale for Using Three Macros:
  4327. // This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
  4328. // SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
  4329. // a lower maintenance burden on each pass. At first glance it seems we could
  4330. // accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
  4331. // This works for simple use cases where input_gamma == output_gamma, but it
  4332. // breaks down for more complex scenarios like CRT simulation, where the pass
  4333. // number determines the gamma encoding of the input and output.
  4334. /////////////////////////////// BASE CONSTANTS ///////////////////////////////
  4335. // Set standard gamma constants, but allow users to override them:
  4336. #ifndef OVERRIDE_STANDARD_GAMMA // Defaults below are skipped when the user supplies their own
  4337. // Standard encoding gammas (ntsc_gamma is also the fallback for the gamma getters below):
  4338. static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too?
  4339. static const float pal_gamma = 2.8; // Never actually 2.8 in practice
  4340. // Typical device decoding gammas (only use for emulating devices):
  4341. // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
  4342. // gammas: The standards purposely undercorrected for an analog CRT's
  4343. // assumed 2.5 reference display gamma to maintain contrast in assumed
  4344. // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
  4345. // These unstated assumptions about display gamma and perceptual rendering
  4346. // intent caused a lot of confusion, and more modern CRT's seemed to target
  4347. // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit
  4348. // (they struggle near black with 2.5 gamma anyway), especially PC/laptop
  4349. // displays designed to view sRGB in bright environments. (Standards are
  4350. // also in flux again with BT.1886, but it's underspecified for displays.)
  4351. static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55); default returned by get_crt_gamma()
  4352. static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55)
  4353. static const float lcd_reference_gamma = 2.5; // Same value as crt_reference_gamma_high, to match CRT
  4354. static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC
  4355. static const float lcd_office_gamma = 2.2; // Approximates sRGB; default returned by get_lcd_gamma()
  4356. #endif // OVERRIDE_STANDARD_GAMMA
  4357. // Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
  4358. // but only if they're aware of it.
  4359. #ifndef OVERRIDE_ALPHA_ASSUMPTIONS // Default applies unless the user defines assume_opaque_alpha
  4360. static const bool assume_opaque_alpha = false; // false: encode/decode functions pass color.a through
  4361. #endif
  4362. /////////////////////// DERIVED CONSTANTS AS FUNCTIONS ///////////////////////
  4363. // gamma-management.h should be compatible with overriding gamma values with
  4364. // runtime user parameters, but we can only define other global constants in
  4365. // terms of static constants, not uniform user parameters. To get around this
  4366. // limitation, we need to define derived constants using functions.
  4367. // Set device gamma constants, but allow users to override them:
  4368. #ifdef OVERRIDE_DEVICE_GAMMA // Device gammas come from user-defined names
  4369. // The user promises to globally define crt_gamma, gba_gamma and lcd_gamma:
  4370. inline float get_crt_gamma() { return crt_gamma; }
  4371. inline float get_gba_gamma() { return gba_gamma; }
  4372. inline float get_lcd_gamma() { return lcd_gamma; }
  4373. #else // No override: fall back to the standard constants / a fixed GBA value
  4374. inline float get_crt_gamma() { return crt_reference_gamma_high; }
  4375. inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0)
  4376. inline float get_lcd_gamma() { return lcd_office_gamma; }
  4377. #endif // OVERRIDE_DEVICE_GAMMA
  4378. // Set decoding/encoding gammas for the first/last passes, but allow overrides:
  4379. #ifdef OVERRIDE_FINAL_GAMMA // Bypasses every SIMULATE_* choice below
  4380. // The user promises to globally define intermediate_gamma, input_gamma and output_gamma:
  4381. inline float get_intermediate_gamma() { return intermediate_gamma; }
  4382. inline float get_input_gamma() { return input_gamma; }
  4383. inline float get_output_gamma() { return output_gamma; }
  4384. #else // No override: pick input/output gamma from the first SIMULATE_* macro defined
  4385. // If we gamma-correct every pass, always use ntsc_gamma between passes to
  4386. // ensure middle passes don't need to care if anything is being simulated:
  4387. inline float get_intermediate_gamma() { return ntsc_gamma; }
  4388. #ifdef SIMULATE_CRT_ON_LCD // Checked first: decode with CRT gamma, encode with LCD gamma
  4389. inline float get_input_gamma() { return get_crt_gamma(); }
  4390. inline float get_output_gamma() { return get_lcd_gamma(); }
  4391. #else
  4392. #ifdef SIMULATE_GBA_ON_LCD // Checked second: decode with GBA gamma, encode with LCD gamma
  4393. inline float get_input_gamma() { return get_gba_gamma(); }
  4394. inline float get_output_gamma() { return get_lcd_gamma(); }
  4395. #else
  4396. #ifdef SIMULATE_LCD_ON_CRT // Checked third: decode with LCD gamma, encode with CRT gamma
  4397. inline float get_input_gamma() { return get_lcd_gamma(); }
  4398. inline float get_output_gamma() { return get_crt_gamma(); }
  4399. #else
  4400. #ifdef SIMULATE_GBA_ON_CRT // Checked last: decode with GBA gamma, encode with CRT gamma
  4401. inline float get_input_gamma() { return get_gba_gamma(); }
  4402. inline float get_output_gamma() { return get_crt_gamma(); }
  4403. #else // Don't simulate anything:
  4404. inline float get_input_gamma() { return ntsc_gamma; }
  4405. inline float get_output_gamma() { return ntsc_gamma; }
  4406. #endif // SIMULATE_GBA_ON_CRT
  4407. #endif // SIMULATE_LCD_ON_CRT
  4408. #endif // SIMULATE_GBA_ON_LCD
  4409. #endif // SIMULATE_CRT_ON_LCD
  4410. #endif // OVERRIDE_FINAL_GAMMA
  4411. // Set decoding/encoding gammas for the current pass. Use static constants for
  4412. // linearize_input and gamma_encode_output, because they aren't derived, and
  4413. // they let the compiler do dead-code elimination.
  4414. #ifndef GAMMA_ENCODE_EVERY_FBO // Only the first pass decodes and only the last pass encodes
  4415. #ifdef FIRST_PASS
  4416. static const bool linearize_input = true; // First pass: decode with the input gamma
  4417. inline float get_pass_input_gamma() { return get_input_gamma(); }
  4418. #else
  4419. static const bool linearize_input = false; // Later passes: decode_input() returns the color unchanged
  4420. inline float get_pass_input_gamma() { return 1.0; }
  4421. #endif
  4422. #ifdef LAST_PASS
  4423. static const bool gamma_encode_output = true; // Last pass: encode with the output gamma
  4424. inline float get_pass_output_gamma() { return get_output_gamma(); }
  4425. #else
  4426. static const bool gamma_encode_output = false; // Earlier passes: encode_output() returns the color unchanged
  4427. inline float get_pass_output_gamma() { return 1.0; }
  4428. #endif
  4429. #else // GAMMA_ENCODE_EVERY_FBO: every pass both decodes its input and encodes its output
  4430. static const bool linearize_input = true;
  4431. static const bool gamma_encode_output = true;
  4432. #ifdef FIRST_PASS
  4433. inline float get_pass_input_gamma() { return get_input_gamma(); }
  4434. #else // Passes after the first read intermediate-gamma data
  4435. inline float get_pass_input_gamma() { return get_intermediate_gamma(); }
  4436. #endif
  4437. #ifdef LAST_PASS
  4438. inline float get_pass_output_gamma() { return get_output_gamma(); }
  4439. #else // Passes before the last write intermediate-gamma data
  4440. inline float get_pass_output_gamma() { return get_intermediate_gamma(); }
  4441. #endif
  4442. #endif
  4443. // Users might want to know if bilinear filtering will be gamma-correct:
  4444. static const bool gamma_aware_bilinear = !linearize_input; // true only when this pass does not decode its input in the shader
  4445. ////////////////////// COLOR ENCODING/DECODING FUNCTIONS /////////////////////
  4446. inline float4 encode_output(const float4 color)
  4447. {
  4448. // Gamma-encode a color for this pass's output; alpha is never encoded.
  4449. // With gamma_encode_output false, the color passes through unchanged.
  4450. if(!gamma_encode_output)
  4451. {
  4452. return color;
  4453. }
  4454. // Raise RGB to the reciprocal of the pass output gamma:
  4455. const float3 encode_exponent = float3(1.0/get_pass_output_gamma());
  4456. const float3 encoded_rgb = pow(color.rgb, encode_exponent);
  4457. // Force alpha to 1.0 when assume_opaque_alpha is set; otherwise keep it:
  4458. if(assume_opaque_alpha)
  4459. {
  4460. return float4(encoded_rgb, 1.0);
  4461. }
  4462. return float4(encoded_rgb, color.a);
  4463. }
  4464. inline float4 decode_input(const float4 color)
  4465. {
  4466. // Linearize a sampled color using this pass's input gamma; alpha is never
  4467. // decoded. With linearize_input false, the color passes through unchanged.
  4468. if(!linearize_input)
  4469. {
  4470. return color;
  4471. }
  4472. // Raise RGB to the pass input gamma:
  4473. const float3 decode_exponent = float3(get_pass_input_gamma());
  4474. const float3 linear_rgb = pow(color.rgb, decode_exponent);
  4475. // Force alpha to 1.0 when assume_opaque_alpha is set; otherwise keep it:
  4476. if(assume_opaque_alpha)
  4477. {
  4478. return float4(linear_rgb, 1.0);
  4479. }
  4480. return float4(linear_rgb, color.a);
  4481. }
  4482. inline float4 decode_gamma_input(const float4 color, const float3 gamma)
  4483. {
  4484. // Decode RGB with a caller-supplied per-channel gamma; alpha is not decoded.
  4485. const float3 linear_rgb = pow(color.rgb, gamma);
  4486. // Force alpha to 1.0 when assume_opaque_alpha is set; otherwise keep it:
  4487. if(assume_opaque_alpha)
  4488. {
  4489. return float4(linear_rgb, 1.0);
  4490. }
  4491. return float4(linear_rgb, color.a);
  4492. }
  4493. //TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
  4494. //#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D)))
  4495. // EDIT: it's the 'const' in front of the coords that's doing it
  4496. /////////////////////////// TEXTURE LOOKUP WRAPPERS //////////////////////////
  4497. // "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
  4498. // Provide a wide array of linearizing texture lookup wrapper functions. The
  4499. // Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
  4500. // lookups are provided for completeness in case that changes someday. Nobody
  4501. // is likely to use the *fetch and *proj functions, but they're included just
  4502. // in case. The only tex*D texture sampling functions omitted are:
  4503. // - tex*Dcmpbias
  4504. // - tex*Dcmplod
  4505. // - tex*DARRAY*
  4506. // - tex*DMS*
  4507. // - Variants returning integers
  4508. // Standard line length restrictions are ignored below for vertical brevity.
  4509. /*
  4510. // tex1D:
  4511. inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
  4512. { return decode_input(tex1D(tex, tex_coords)); }
  4513. inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
  4514. { return decode_input(tex1D(tex, tex_coords)); }
  4515. inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
  4516. { return decode_input(tex1D(tex, tex_coords, texel_off)); }
  4517. inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
  4518. { return decode_input(tex1D(tex, tex_coords, texel_off)); }
  4519. inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
  4520. { return decode_input(tex1D(tex, tex_coords, dx, dy)); }
  4521. inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
  4522. { return decode_input(tex1D(tex, tex_coords, dx, dy)); }
  4523. inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
  4524. { return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); }
  4525. inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
  4526. { return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); }
  4527. // tex1Dbias:
  4528. inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
  4529. { return decode_input(tex1Dbias(tex, tex_coords)); }
  4530. inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
  4531. { return decode_input(tex1Dbias(tex, tex_coords, texel_off)); }
  4532. // tex1Dfetch:
  4533. inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
  4534. { return decode_input(tex1Dfetch(tex, tex_coords)); }
  4535. inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
  4536. { return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); }
  4537. // tex1Dlod:
  4538. inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
  4539. { return decode_input(tex1Dlod(tex, tex_coords)); }
  4540. inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
  4541. { return decode_input(tex1Dlod(tex, tex_coords, texel_off)); }
  4542. // tex1Dproj:
  4543. inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
  4544. { return decode_input(tex1Dproj(tex, tex_coords)); }
  4545. inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
  4546. { return decode_input(tex1Dproj(tex, tex_coords)); }
  4547. inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
  4548. { return decode_input(tex1Dproj(tex, tex_coords, texel_off)); }
  4549. inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
  4550. { return decode_input(tex1Dproj(tex, tex_coords, texel_off)); }
  4551. */
  4552. // tex2D:
  4553. inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords) // Sample tex, then decode_input(); coords are deliberately non-const (see the FIXME/EDIT note above the wrappers)
  4554. { return decode_input(COMPAT_TEXTURE(tex, tex_coords)); }
  4555. inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords) // Only tex_coords.xy is used; .z is ignored
  4556. { return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); }
  4557. inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) // NOTE(review): texel_off is forwarded as textureLod's LOD argument, not as a texel offset - confirm intent
  4558. { return decode_input(textureLod(tex, tex_coords, texel_off)); }
  4559. inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) // Same as above, using only tex_coords.xy
  4560. { return decode_input(textureLod(tex, tex_coords.xy, texel_off)); }
  4561. //inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
  4562. //{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); }
  4563. //inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
  4564. //{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); }
  4565. //inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
  4566. //{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); }
  4567. //inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
  4568. //{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); }
  4569. // tex2Dbias:
  4570. //inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
  4571. //{ return decode_input(tex2Dbias(tex, tex_coords)); }
  4572. //inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
  4573. //{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); }
  4574. // tex2Dfetch:
  4575. //inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
  4576. //{ return decode_input(tex2Dfetch(tex, tex_coords)); }
  4577. //inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
  4578. //{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); }
  4579. // tex2Dlod:
  4580. inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords) // Samples at a fixed LOD of 0.0 using tex_coords.xy; tex_coords.zw are not read
  4581. { return decode_input(textureLod(tex, tex_coords.xy, 0.0)); }
  4582. inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) // NOTE(review): texel_off is forwarded as the LOD argument; tex_coords.zw are not read - confirm intent
  4583. { return decode_input(textureLod(tex, tex_coords.xy, texel_off)); }
  4584. /*
  4585. // tex2Dproj:
  4586. inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
  4587. { return decode_input(tex2Dproj(tex, tex_coords)); }
  4588. inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
  4589. { return decode_input(tex2Dproj(tex, tex_coords)); }
  4590. inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
  4591. { return decode_input(tex2Dproj(tex, tex_coords, texel_off)); }
  4592. inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
  4593. { return decode_input(tex2Dproj(tex, tex_coords, texel_off)); }
  4594. */
  4595. /*
  4596. // tex3D:
  4597. inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
  4598. { return decode_input(tex3D(tex, tex_coords)); }
  4599. inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
  4600. { return decode_input(tex3D(tex, tex_coords, texel_off)); }
  4601. inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
  4602. { return decode_input(tex3D(tex, tex_coords, dx, dy)); }
  4603. inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
  4604. { return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); }
  4605. // tex3Dbias:
  4606. inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
  4607. { return decode_input(tex3Dbias(tex, tex_coords)); }
  4608. inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
  4609. { return decode_input(tex3Dbias(tex, tex_coords, texel_off)); }
  4610. // tex3Dfetch:
  4611. inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
  4612. { return decode_input(tex3Dfetch(tex, tex_coords)); }
  4613. inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
  4614. { return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); }
  4615. // tex3Dlod:
  4616. inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
  4617. { return decode_input(tex3Dlod(tex, tex_coords)); }
  4618. inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
  4619. { return decode_input(tex3Dlod(tex, tex_coords, texel_off)); }
  4620. // tex3Dproj:
  4621. inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
  4622. { return decode_input(tex3Dproj(tex, tex_coords)); }
  4623. inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
  4624. { return decode_input(tex3Dproj(tex, tex_coords, texel_off)); }
  4625. /////////*
  4626. // NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
  4627. // This narrow selection of nonstandard tex2D* functions can be useful:
  4628. // tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
  4629. //inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
  4630. //{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); }
  4631. //inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
  4632. //{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); }
  4633. // MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
  4634. // Provide a narrower selection of tex2D* wrapper functions that decode an
  4635. // input sample with a specified gamma value. These are useful for reading
  4636. // LUT's and for reading the input of pass0 in a later pass.
  4637. // tex2D:
  4638. inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
  4639. { return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); }
  4640. inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
  4641. { return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); }
  4642. //inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
  4643. //{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); }
  4644. //inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
  4645. //{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); }
  4646. //inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
  4647. //{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); }
  4648. //inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
  4649. //{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); }
  4650. //inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
  4651. //{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); }
  4652. //inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
  4653. //{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); }
  4654. /*
  4655. // tex2Dbias:
  4656. inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
  4657. { return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); }
  4658. inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
  4659. { return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); }
  4660. // tex2Dfetch:
  4661. inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
  4662. { return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); }
  4663. inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
  4664. { return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); }
  4665. */
  //  tex2Dlod:
  //  Sample mip level 0 of tex at tex_coords.xy and decode the texel with
  //  decode_gamma_input() using the caller-supplied gamma.  The LOD is pinned
  //  to 0.0 here; tex_coords.zw are not read.
  inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
  {
      return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);
  }
  //  Same lookup, displaced by texel_off whole texels of mip level 0.
  //  texel_off used to be handed to textureLod() as its LOD argument, which
  //  picked a different mip level instead of shifting the sample position.
  //  NOTE(review): the scalar offset is applied to both axes, i.e. the
  //  equivalent of ivec2(texel_off) -- confirm this matches what callers expect.
  inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
  {
      //  textureLodOffset() only accepts a constant-expression offset, so the
      //  shift is converted to uv units by hand using the level-0 texture size.
      float2 uv_shift = float2(float(texel_off)) / float2(textureSize(tex, 0));
      return decode_gamma_input(
          textureLod(tex, tex_coords.xy + uv_shift, 0.0), gamma);
  }
  4671. #endif // GAMMA_MANAGEMENT_H
  4672. //////////////////////////// END GAMMA-MANAGEMENT //////////////////////////
  4673. //////////////////////////////// END INCLUDES ////////////////////////////////
  4674. ///////////////////////////// SCANLINE FUNCTIONS /////////////////////////////
  inline float3 get_gaussian_sigma(const float3 color, const float sigma_range)
  {
      //  Purpose:  Map a scanline's source color to the standard deviation of
      //            its Gaussian beam profile:
      //                sigma = beam_min_sigma + sigma_range * f(color)
      //  Globals:  beam_min_sigma / beam_max_sigma: global floats holding the
      //            desired beam sigma for dim / bright colors.  Needs
      //            beam_max_sigma > 0.0 and beam_min_sigma in
      //            (0.0, beam_max_sigma].
      //            beam_spot_power: global float, exponent of the power curve.
      //            beam_spot_shape_function: selects f():
      //                < 0.5:     f(color) = pow(color, beam_spot_power)
      //                otherwise: f(color) = sqrt(1.0 - (color - 1.0)^2)
      //  Inputs:   color: underlying source color along the scanline.
      //            sigma_range: beam_max_sigma - beam_min_sigma.  It is a
      //            parameter so it is not recomputed on every call when the
      //            sigmas are runtime shader parameters.
      //  Choosing f():
      //      The spot shape loosely resembles an aspect-corrected f() over
      //      [0, 1].  f(color) = color gives diamond-shaped spots; a spherical
      //      curve or a cube root trades variable width against a soft,
      //      realistic shape.  beam_spot_power > 1.0 can give an ugly spot and
      //      more initial clipping, though the horizontal resampling filter and
      //      the phosphor bloom also shape the final result (e.g. resampling in
      //      nonlinear light or with a sharp filter like Lanczos sharpens the
      //      spot, yet a sixth root stays quite soft).  The power curve
      //      (default exponent 1.0/3.0) is the most flexible option; the fixed
      //      spherical curve gives the most variability without a bad shape.
      //  Choosing the sigmas:
      //      beam_min_sigma controls scanline sharpness/aliasing in dim areas,
      //      and its gap to beam_max_sigma controls beam width variability.
      //      For pure Gaussians it only affects clipping if
      //      beam_spot_power > 1.0 (a conservative stand-in for a more complex
      //      constraint).
      //      beam_max_sigma controls clipping and how much scanlines widen and
      //      soften as color rises.  Wider beams need more scanlines evaluated
      //      to avoid distortion.  For a pure Gaussian, the largest
      //      beam_max_sigma for which the first unused scanline always weighs
      //      less than 1.0/255.0 is:
      //          2 scanlines: 0.2089  (distortions begin ~0.34)
      //          3 scanlines: 0.3879  (distortions begin ~0.52)
      //          4 scanlines: 0.5723  (distortions begin ~0.70)
      //          5 scanlines: 0.7591  (distortions begin ~0.89)
      //          6 scanlines: 0.9483  (distortions begin ~1.08)
      //      Generalized Gaussians allow more leeway as steepness increases.
      float3 spot_curve;
      if(beam_spot_shape_function < 0.5)
      {
          //  Power curve:
          spot_curve = pow(color, float3(beam_spot_power));
      }
      else
      {
          //  Spherical curve:
          float3 below_white = color - float3(1.0);
          spot_curve = sqrt(float3(1.0) - below_white*below_white);
      }
      return float3(beam_min_sigma) + sigma_range * spot_curve;
  }
  inline float3 get_generalized_gaussian_beta(const float3 color,
      const float shape_range)
  {
      //  Purpose:  Return the type-I generalized Gaussian "shape" parameter
      //            beta for the given color.
      //  Globals:  beam_min_shape / beam_max_shape: global floats holding the
      //            desired beta for dim / bright colors.  Needs
      //            beam_max_shape >= 2.0 and beam_min_shape in
      //            [2.0, beam_max_shape].
      //            beam_shape_power: global float, exponent applied to color.
      //  Inputs:   color: underlying source color along the scanline.
      //            shape_range: beam_max_shape - beam_min_shape.  It is a
      //            parameter so it is not recomputed on every call when the
      //            shapes are runtime shader parameters.
      //  How beta shapes the scanline distribution:
      //      beta <  2.0: the peak narrows to a spike with a discontinuous slope
      //      beta == 2.0: plain Gaussian
      //      beta >  2.0: the peak flattens and widens, then falls off more
      //                   steeply than a Gaussian.  High sigmas widen and
      //                   soften peaks; high betas widen and sharpen them, at
      //                   the risk of aliasing.
      //  In contrast to beam_spot_power, a high beam_shape_power softens shape
      //  transitions and a low one sharpens them (at the risk of aliasing).
      float3 shape_curve = pow(color, float3(beam_shape_power));
      return beam_min_shape + shape_range * shape_curve;
  }
  float3 scanline_gaussian_integral_contrib(const float3 dist,
      const float3 color, const float pixel_height, const float sigma_range)
  {
      //  Purpose:  Return one scanline's light output over a given pixel by
      //            integrating a pure Gaussian beam across the pixel's height.
      //  Inputs:   dist: distance of the (possibly separate R/G/B) sample
      //            point(s) from the scanline, in scanlines; 1.0 means the
      //            sample straddles the next scanline.
      //            color: underlying source color along the scanline.
      //            pixel_height: output pixel height, in scanlines.
      //            sigma_range: forwarded to get_gaussian_sigma(), whose
      //            requirements must also hold.
      //  Background:
      //      A CRT beam profile is roughly Gaussian and wider for bright
      //      colors than for dark ones.  A Gaussian always integrates to 1.0
      //      over its full range, so the width can vary with the standard
      //      deviation without changing brightness.  With x = distance:
      //          sample   = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
      //          integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2))))
      //      The definite integral over the pixel comes from a numerical
      //      approximation of the error function.  Even with curved coords in
      //      this pass, a flat scalar pixel height works almost as well as one
      //      derived from a full pixel-space to scanline-space matrix.
      const float3 beam_sigma = get_gaussian_sigma(color, sigma_range);
      const float3 half_pixel = float3(pixel_height * 0.5);
      //  Scale factor turning a distance into the erf() argument:
      const float3 erf_scale = 1.0/(beam_sigma*sqrt(2.0));
      const float3 erf_far_edge = erf((dist + half_pixel)*erf_scale);
      const float3 erf_near_edge = erf((dist - half_pixel)*erf_scale);
      return color * 0.5*(erf_far_edge - erf_near_edge)/pixel_height;
  }
  float3 scanline_generalized_gaussian_integral_contrib(float3 dist,
      float3 color, float pixel_height, float sigma_range,
      float shape_range)
  {
      //  Purpose:  Return one scanline's light output over a given pixel by
      //            integrating a generalized Gaussian beam across the pixel.
      //  Requires: The requirements of scanline_gaussian_integral_contrib(),
      //            get_gaussian_sigma() and get_generalized_gaussian_beta().
      //  Background:
      //      A generalized Gaussian lets the shape (beta) vary along with the
      //      width (alpha).  With gamma() the gamma function:
      //          sample = beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
      //      With ligamma(s, z) the lower incomplete gamma function (only two
      //      of its four branches are implemented, since 1/beta <= 0.5 here):
      //          integral = 0.5 + 0.5 * sign(x) *
      //              ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta)
      //      get_generalized_gaussian_beta() discusses beta.  alpha is derived
      //      from the intended Gaussian sigma, but it only strictly models the
      //      standard deviation at beta == 2, because the standard deviation
      //      depends on both alpha and beta (keeping alpha independent is
      //      faster and keeps behavior intuitive across the full range).
      const float3 width_alpha =
          sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
      const float3 shape_beta =
          get_generalized_gaussian_beta(color, shape_range);
      const float3 width_alpha_inv = float3(1.0)/width_alpha;
      const float3 shape_beta_inv = float3(1.0)/shape_beta;
      const float3 half_pixel = float3(pixel_height * 0.5);
      //  gamma_impl() also receives beta (i.e. 1/s) so it can skip divides;
      //  normalized_ligamma_impl() likewise receives beta and 1/gamma(s).
      const float3 gamma_inv =
          float3(1.0)/gamma_impl(shape_beta_inv, shape_beta);
      const float3 far_edge = dist + half_pixel;
      const float3 near_edge = dist - half_pixel;
      const float3 far_integral = sign(far_edge) * normalized_ligamma_impl(
          shape_beta_inv, pow(abs(far_edge)*width_alpha_inv, shape_beta),
          shape_beta, gamma_inv);
      const float3 near_integral = sign(near_edge) * normalized_ligamma_impl(
          shape_beta_inv, pow(abs(near_edge)*width_alpha_inv, shape_beta),
          shape_beta, gamma_inv);
      return color * 0.5*(far_integral - near_integral)/pixel_height;
  }
  float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color,
      const float pixel_height, const float sigma_range)
  {
      //  Purpose:  Return one scanline's light output at a pixel by point-
      //            sampling a pure Gaussian beam: one sample, or the average of
      //            three samples when beam_antialias_level > 0.5.
      //  See scanline_gaussian_integral_contrib() for the parameter contract:
      //      sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
      const float3 beam_sigma = get_gaussian_sigma(color, sigma_range);
      //  Precompute reciprocals so the samples below need no divides:
      const float3 beam_sigma_inv = float3(1.0)/beam_sigma;
      const float3 exponent_scale = 0.5 * beam_sigma_inv * beam_sigma_inv;
      const float3 amplitude = beam_sigma_inv/sqrt(2.0*pi);
      if(beam_antialias_level > 0.5)
      {
          //  Add samples 1/3 pixel to either side of the center sample:
          const float3 third_pixel = float3(pixel_height/3.0);
          const float3 dist_plus = dist + third_pixel;
          const float3 dist_minus = abs(dist - third_pixel);
          //  Average the three pure Gaussian samples:
          const float3 avg_scale = color/3.0 * amplitude;
          const float3 w_center = exp(-(dist*dist)*exponent_scale);
          const float3 w_plus = exp(-(dist_plus*dist_plus)*exponent_scale);
          const float3 w_minus = exp(-(dist_minus*dist_minus)*exponent_scale);
          return avg_scale * (w_center + w_plus + w_minus);
      }
      //  No antialiasing: a single sample at the pixel center.
      return color*exp(-(dist*dist)*exponent_scale)*amplitude;
  }
  float3 scanline_generalized_gaussian_sampled_contrib(float3 dist,
      float3 color, float pixel_height, float sigma_range,
      float shape_range)
  {
      //  Purpose:  Return one scanline's light output at a pixel by point-
      //            sampling a generalized Gaussian beam: one sample, or the
      //            average of three when beam_antialias_level > 0.5.
      //  See scanline_generalized_gaussian_integral_contrib() for background:
      //      sample = beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
      const float3 width_alpha =
          sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
      const float3 shape_beta =
          get_generalized_gaussian_beta(color, shape_range);
      //  Precompute reciprocals so the samples below need no divides:
      const float3 width_alpha_inv = float3(1.0)/width_alpha;
      const float3 shape_beta_inv = float3(1.0)/shape_beta;
      const float3 amplitude = color * shape_beta * 0.5 * width_alpha_inv /
          gamma_impl(shape_beta_inv, shape_beta);
      if(beam_antialias_level > 0.5)
      {
          //  Add samples 1/3 pixel closer to and farther from the scanline:
          const float3 third_pixel = float3(pixel_height/3.0);
          const float3 dist_plus = dist + third_pixel;
          const float3 dist_minus = abs(dist - third_pixel);
          //  Average the three generalized Gaussian samples:
          const float3 w_center =
              exp(-pow(abs(dist*width_alpha_inv), shape_beta));
          const float3 w_plus =
              exp(-pow(abs(dist_plus*width_alpha_inv), shape_beta));
          const float3 w_minus =
              exp(-pow(abs(dist_minus*width_alpha_inv), shape_beta));
          return amplitude/3.0 * (w_center + w_plus + w_minus);
      }
      //  No antialiasing: a single sample at the pixel center.
      return amplitude * exp(-pow(abs(dist*width_alpha_inv), shape_beta));
  }
  inline float3 scanline_contrib(float3 dist, float3 color,
      float pixel_height, const float sigma_range, const float shape_range)
  {
      //  Purpose:  Return one scanline's light output over a given pixel,
      //            dispatching to the beam model chosen by the user codepath:
      //                beam_generalized_gaussian: generalized vs. pure Gaussian
      //                beam_antialias_level > 1.5: integrate vs. point-sample
      //  Requires: The requirements of scanline_gaussian_integral_contrib(),
      //            get_gaussian_sigma() and get_generalized_gaussian_beta().
      bool integrate_over_pixel = (beam_antialias_level > 1.5);
      if(beam_generalized_gaussian)
      {
          if(integrate_over_pixel)
          {
              return scanline_generalized_gaussian_integral_contrib(
                  dist, color, pixel_height, sigma_range, shape_range);
          }
          return scanline_generalized_gaussian_sampled_contrib(
              dist, color, pixel_height, sigma_range, shape_range);
      }
      //  Pure Gaussian beam (shape_range is not needed):
      if(integrate_over_pixel)
      {
          return scanline_gaussian_integral_contrib(
              dist, color, pixel_height, sigma_range);
      }
      return scanline_gaussian_sampled_contrib(
          dist, color, pixel_height, sigma_range);
  }
  inline float3 get_raw_interpolated_color(const float3 color0,
      const float3 color1, const float3 color2, const float3 color3,
      const float4 weights)
  {
      //  Blend the four colors with their matching weights in one
      //  vector-matrix product.
      float3 weighted_sum =
          mul(weights, float4x3(color0, color1, color2, color3));
      //  Clamp at zero: negative colors cause bizarre artifacts downstream.
      return max(weighted_sum, 0.0);
  }
  float3 get_interpolated_linear_color(const float3 color0, const float3 color1,
      const float3 color2, const float3 color3, const float4 weights)
  {
      //  Requires: 1.) Requirements of include/gamma-management.h must be met:
      //                intermediate_gamma must be globally defined, and input
      //                colors are interpreted as linear RGB unless you #define
      //                GAMMA_ENCODE_EVERY_FBO (in which case they are
      //                interpreted as gamma-encoded with intermediate_gamma).
      //            2.) color0-3 are colors sampled from a texture with tex2D().
      //                They are interpreted as defined in requirement 1.
      //            3.) weights contains weights for each color, summing to 1.0.
      //            4.) beam_horiz_linear_rgb_weight must be defined as a global
      //                float in [0.0, 1.0] describing how much blending should
      //                be done in linear RGB (rest is gamma-corrected RGB).
      //            5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined
      //                if beam_horiz_linear_rgb_weight is anything other than a
      //                static constant, or we may try branching at runtime
      //                without dynamic branches allowed (slow).
      //  Returns:  An interpolated color lookup between the four input colors
      //            based on the weights in weights.  The blend is a lerp from
      //            the gamma-space mix to the linear-space mix by
      //            beam_horiz_linear_rgb_weight.
      //  Note:     Every pow() exponent is widened to float3 so that both
      //            arguments share one type; GLSL's pow() has no
      //            (vector, scalar) overload.
      const float intermediate_gamma = get_intermediate_gamma();
      //  Branch if beam_horiz_linear_rgb_weight is static (for free) or if the
      //  profile allows dynamic branches (faster than computing extra pows):
      #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
          #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
      #else
          #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
              #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
          #endif
      #endif
      #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
          //  Branching is allowed, so skip the mix that would get zero weight:
          #ifdef GAMMA_ENCODE_EVERY_FBO
              //  Inputs: color0-3 are colors in gamma-encoded RGB.
              const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
                  color0, color1, color2, color3, weights),
                  float3(intermediate_gamma));
              if(beam_horiz_linear_rgb_weight > 0.0)
              {
                  const float3 linear_mixed_color = get_raw_interpolated_color(
                      pow(color0, float3(intermediate_gamma)),
                      pow(color1, float3(intermediate_gamma)),
                      pow(color2, float3(intermediate_gamma)),
                      pow(color3, float3(intermediate_gamma)),
                      weights);
                  return lerp(gamma_mixed_color, linear_mixed_color,
                      beam_horiz_linear_rgb_weight);
              }
              else
              {
                  return gamma_mixed_color;
              }
          #else
              //  Inputs: color0-3 are colors in linear RGB.
              const float3 linear_mixed_color = get_raw_interpolated_color(
                  color0, color1, color2, color3, weights);
              if(beam_horiz_linear_rgb_weight < 1.0)
              {
                  const float3 gamma_mixed_color = get_raw_interpolated_color(
                      pow(color0, float3(1.0/intermediate_gamma)),
                      pow(color1, float3(1.0/intermediate_gamma)),
                      pow(color2, float3(1.0/intermediate_gamma)),
                      pow(color3, float3(1.0/intermediate_gamma)),
                      weights);
                  return lerp(gamma_mixed_color, linear_mixed_color,
                      beam_horiz_linear_rgb_weight);
              }
              else
              {
                  return linear_mixed_color;
              }
          #endif  //  GAMMA_ENCODE_EVERY_FBO
      #else
          //  No branching: always compute both mixes and lerp between them.
          #ifdef GAMMA_ENCODE_EVERY_FBO
              //  Inputs: color0-3 are colors in gamma-encoded RGB.
              //  The exponent was a bare float here, unlike every other pow()
              //  in this function; it is now a float3 as well.
              const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
                  color0, color1, color2, color3, weights),
                  float3(intermediate_gamma));
              const float3 linear_mixed_color = get_raw_interpolated_color(
                  pow(color0, float3(intermediate_gamma)),
                  pow(color1, float3(intermediate_gamma)),
                  pow(color2, float3(intermediate_gamma)),
                  pow(color3, float3(intermediate_gamma)),
                  weights);
              return lerp(gamma_mixed_color, linear_mixed_color,
                  beam_horiz_linear_rgb_weight);
          #else
              //  Inputs: color0-3 are colors in linear RGB.
              const float3 linear_mixed_color = get_raw_interpolated_color(
                  color0, color1, color2, color3, weights);
              const float3 gamma_mixed_color = get_raw_interpolated_color(
                  pow(color0, float3(1.0/intermediate_gamma)),
                  pow(color1, float3(1.0/intermediate_gamma)),
                  pow(color2, float3(1.0/intermediate_gamma)),
                  pow(color3, float3(1.0/intermediate_gamma)),
                  weights);
              // wtf fixme
              // const float beam_horiz_linear_rgb_weight1 = 1.0;
              return lerp(gamma_mixed_color, linear_mixed_color,
                  beam_horiz_linear_rgb_weight);
          #endif  //  GAMMA_ENCODE_EVERY_FBO
      #endif  //  SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
  }
  float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv,
      const float2 uv_step_x, const float4 weights)
  {
      //  Purpose:  Return a horizontally interpolated texture lookup built from
      //            2-4 neighboring texels, blended per weights under the
      //            conventions of get_interpolated_linear_color().
      //  Inputs:   scanline_uv: snapped vertically to the caller's desired line
      //            or scanline, and horizontally to the texel just left of the
      //            output pixel (the "inner left" texel below).
      //            uv_step_x: horizontal uv distance between texels.
      //            weights: filter weights for the outer-left, inner-left,
      //            inner-right and outer-right texels, in that order.
      //  The outer pair defaults to black and is only fetched for filters that
      //  use four taps; Quilez resampling (beam_horiz_filter <= 0.5) ignores it.
      float3 outer_left = float3(0.0);
      float3 outer_right = float3(0.0);
      if(beam_horiz_filter > 0.5)
      {
          outer_left = COMPAT_TEXTURE(tex, scanline_uv - uv_step_x).rgb;
          outer_right = COMPAT_TEXTURE(tex, scanline_uv + 2.0 * uv_step_x).rgb;
      }
      const float3 inner_left = COMPAT_TEXTURE(tex, scanline_uv).rgb;
      const float3 inner_right =
          COMPAT_TEXTURE(tex, scanline_uv + uv_step_x).rgb;
      //  The samples are passed on untouched, linear or gamma-encoded alike;
      //  get_interpolated_linear_color() deals with the difference.
      return get_interpolated_linear_color(
          outer_left, inner_left, inner_right, outer_right, weights);
  }
  float3 sample_single_scanline_horizontal(const sampler2D tex,
      const float2 tex_uv, const float2 tex_size,
      const float2 texture_size_inv)
  {
      //  Purpose:  Horizontally resample tex at tex_uv with the filter picked
      //            by beam_horiz_filter and return the blended color from
      //            get_scanline_color().
      //  Inputs:   tex_uv * tex_size gives the sample position in texels, and
      //            texel positions * texture_size_inv give uv coords again.
      //            NOTE(review): presumably texture_size_inv == 1.0/tex_size
      //            -- confirm against callers.
      //  TODO: Add formal function requirements.
      //  Locate the texel center just left of the sample.  Subtracting
      //  under_half fixes a rounding bug right around exact texel locations.
      const float2 sample_texel = tex_uv * tex_size;
      const float2 left_texel =
          floor(sample_texel - float2(under_half)) + float2(0.5);
      //  Snap horizontally only; the vertical coordinate passes through.
      const float2 left_texel_same_row = float2(left_texel.x, sample_texel.y);
      const float2 left_texel_uv = left_texel_same_row * texture_size_inv;
      //  Distances from the sample to the four nearest texel centers:
      const float dist_to_left = sample_texel.x - left_texel_same_row.x;
      const float4 tap_dists = float4(1.0 + dist_to_left, dist_to_left,
          1.0 - dist_to_left, 2.0 - dist_to_left);
      //  Quilez, Gaussian, or Lanczos2 resize weights for the 2/4 taps:
      float4 raw_weights;
      if(beam_horiz_filter < 0.5)
      {
          //  Quilez: two taps, blended with a quintic smoothstep curve.
          const float t = tap_dists.y;
          const float right_weight = t*t*t*(t*(t*6.0 - 15.0) + 10.0);
          raw_weights = float4(0.0, 1.0 - right_weight, right_weight, 0.0);
      }
      else if(beam_horiz_filter < 1.5)
      {
          //  Gaussian with standard deviation beam_horiz_sigma:
          float exponent_scale = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma);
          raw_weights = exp(-(tap_dists*tap_dists)*exponent_scale);
      }
      else
      {
          //  Lanczos2 (FIX_ZERO is applied before dividing by the distances):
          const float4 pi_tap_dists = FIX_ZERO(tap_dists * pi);
          raw_weights = 2.0 * sin(pi_tap_dists) * sin(pi_tap_dists * 0.5) /
              (pi_tap_dists * pi_tap_dists);
      }
      //  Normalize so the weights sum to 1.0:
      const float4 unit_weights = raw_weights/dot(raw_weights, float4(1.0));
      //  Fetch and blend the taps along the scanline:
      const float2 texel_step_x = float2(texture_size_inv.x, 0.0);
      return get_scanline_color(
          tex, left_texel_uv, texel_step_x, unit_weights);
  }
  float3 sample_rgb_scanline_horizontal(const sampler2D tex,
      const float2 tex_uv, const float2 tex_size,
      const float2 texture_size_inv)
  {
      //  Purpose:  Sample a scanline horizontally.  With beam_misconvergence
      //            set, each color channel is sampled at its own horizontally
      //            shifted position; otherwise all three share one lookup.
      //  TODO: Add formal function requirements.
      //  sample_single_scanline_horizontal() does the actual filtering.
      if(beam_misconvergence)
      {
          //  Per-channel offsets, scaled to uv units by texture_size_inv.x:
          const float3 channel_offsets =
              get_convergence_offsets_x_vector();
          const float3 channel_offsets_u =
              channel_offsets * texture_size_inv.xxx;
          //  Shift the lookup position left by each channel's offset:
          const float2 uv_red = tex_uv - float2(channel_offsets_u.r, 0.0);
          const float2 uv_green = tex_uv - float2(channel_offsets_u.g, 0.0);
          const float2 uv_blue = tex_uv - float2(channel_offsets_u.b, 0.0);
          const float3 color_at_red = sample_single_scanline_horizontal(
              tex, uv_red, tex_size, texture_size_inv);
          const float3 color_at_green = sample_single_scanline_horizontal(
              tex, uv_green, tex_size, texture_size_inv);
          const float3 color_at_blue = sample_single_scanline_horizontal(
              tex, uv_blue, tex_size, texture_size_inv);
          //  Keep only the matching channel from each shifted lookup:
          return float3(color_at_red.r, color_at_green.g, color_at_blue.b);
      }
      return sample_single_scanline_horizontal(tex, tex_uv, tex_size,
          texture_size_inv);
  }
  float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size,
      const float2 texture_size_inv, const float2 il_step_multiple,
      const float frame_count, out float dist)
  {
      //  Purpose:  Return texture coords for the last/upper scanline relative
      //            to tex_uv, accounting for interlacing, and write the
      //            sample's distance from that scanline to dist (in scanlines).
      //  Interlacing: only even or odd scanlines are considered on any given
      //  frame.  Top-field-first (TFF) order puts even scanlines on even
      //  frames; BFF order puts them on odd frames.  Texels are centered at:
      //      frac(tex_uv * tex_size) == x.5
      //  Caution: If these coordinates ever look wrong, first rule out
      //  anisotropic filtering blurring across field boundaries.
      //  Note: TFF/BFF is irrelevant for sources that double-weave or similar.
      //  Find the texel row at or above the sample.  Subtracting under_half
      //  fixes a rounding bug right around exact texel locations.
      const float2 sample_texel = tex_uv * tex_size;
      const float2 upper_texel_index = floor(sample_texel - float2(under_half));
      //  Row offset of the field shown on this frame:
      // wtf fixme
      // const float interlace_bff1 = 1.0;
      const float field_shift = floor(il_step_multiple.y * 0.75) *
          fmod(frame_count + float(interlace_bff), 2.0);
      //  Rows to step back to land on a scanline of the current field:
      const float rows_into_wrong_field = fmod(
          upper_texel_index.y + field_shift, il_step_multiple.y);
      const float2 scanline_index =
          upper_texel_index - float2(0.0, rows_into_wrong_field);
      //  Snap to the center of that scanline and convert back to uv:
      const float2 scanline_center = scanline_index + float2(0.5);
      const float2 scanline_center_uv = scanline_center * texture_size_inv;
      //  Report the sample's distance from the scanline, in scanlines:
      dist = (sample_texel.y - scanline_center.y)/il_step_multiple.y;
      return scanline_center_uv;
  }
  inline bool is_interlaced(float num_lines)
  {
      //  Purpose:  Guess whether the source is interlaced from its line count.
      //            Always false unless interlace_detect is set.
      //  Reference line counts:
      //      NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field
      //      NTSC emulators: typically 224 or 240 lines
      //      PAL: 625 lines, 312.5/field; 576 active (typical), 288/field
      //      PAL emulators: ?
      //      ATSC: 720p, 1080i, 1080p
      //  Cutoff assumptions:
      //      1.) Only active lines matter.
      //      2.) Anything > 288 and <= 576 lines is probably interlaced.
      //      3.) Anything > 576 lines is probably not interlaced...
      //      4.) ...except 1080 lines, which is a crapshoot, so the user
      //          decides through interlace_1080i.
      //      5.) The main program may use calculated video sizes, so the float
      //          thresholds are nudged by half a line.
      if(interlace_detect)
      {
          bool looks_sd_interlaced =
              ((num_lines > 288.5) && (num_lines < 576.5));
          bool looks_1080i = bool(interlace_1080i) &&
              ((num_lines > 1079.5) && (num_lines < 1080.5));
          return (looks_sd_interlaced || looks_1080i);
      }
      return false;
  }
  5197. #endif // SCANLINE_FUNCTIONS_H
  5198. ///////////////////////////// END SCANLINE-FUNCTIONS ////////////////////////////
  5199. /////////////////////////////// END VERTEX INCLUDES /////////////////////////////
  5200. ////////////////////////////// FRAGMENT INCLUDES //////////////////////////////
  5201. //#include "../../../../include/blur-functions.h"
  5202. //////////////////////////// BEGIN BLUR-FUNCTIONS ///////////////////////////
  5203. #ifndef BLUR_FUNCTIONS_H
  5204. #define BLUR_FUNCTIONS_H
  5205. ///////////////////////////////// MIT LICENSE ////////////////////////////////
  5206. // Copyright (C) 2014 TroggleMonkey
  5207. //
  5208. // Permission is hereby granted, free of charge, to any person obtaining a copy
  5209. // of this software and associated documentation files (the "Software"), to
  5210. // deal in the Software without restriction, including without limitation the
  5211. // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  5212. // sell copies of the Software, and to permit persons to whom the Software is
  5213. // furnished to do so, subject to the following conditions:
  5214. //
  5215. // The above copyright notice and this permission notice shall be included in
  5216. // all copies or substantial portions of the Software.
  5217. //
  5218. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  5219. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  5220. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  5221. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  5222. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  5223. // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  5224. // IN THE SOFTWARE.
  5225. ///////////////////////////////// DESCRIPTION ////////////////////////////////
  5226. // This file provides reusable one-pass and separable (two-pass) blurs.
  5227. // Requires: All blurs share these requirements (dxdy requirement is split):
  5228. // 1.) All requirements of gamma-management.h must be satisfied!
  5229. // 2.) filter_linearN must == "true" in your .cgp preset unless
  5230. // you're using tex2DblurNresize at 1x scale.
  5231. // 3.) mipmap_inputN must == "true" in your .cgp preset if
  5232. // output_size < video_size.
  5233. // 4.) output_size == video_size / pow(2, M), where M is some
  5234. // positive integer. tex2Dblur*resize can resize arbitrarily
  5235. // (and the blur will be done after resizing), but arbitrary
  5236. // resizes "fail" with other blurs due to the way they mix
  5237. // static weights with bilinear sample exploitation.
  5238. // 5.) In general, dxdy should contain the uv pixel spacing:
  5239. // dxdy = (video_size/output_size)/texture_size
  5240. // 6.) For separable blurs (tex2DblurNresize and tex2DblurNfast),
  5241. // zero out the dxdy component in the unblurred dimension:
  5242. // dxdy = float2(dxdy.x, 0.0) or float2(0.0, dxdy.y)
  5243. // Many blurs share these requirements:
  5244. // 1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0,
  5245. // or they will blur more in the lower-scaled dimension.
  5246. // 2.) One-pass shared sample blurs require ddx(), ddy(), and
  5247. // tex2Dlod() to be supported by the current Cg profile, and
  5248. // the drivers must support high-quality derivatives.
  5249. // 3.) One-pass shared sample blurs require:
  5250. // tex_uv.w == log2(video_size/output_size).y;
  5251. // Non-wrapper blurs share this requirement:
  5252. // 1.) sigma is the intended standard deviation of the blur
  5253. // Wrapper blurs share this requirement, which is automatically
  5254. // met (unless OVERRIDE_BLUR_STD_DEVS is #defined; see below):
  5255. // 1.) blurN_std_dev must be global static const float values
  5256. // specifying standard deviations for Nx blurs in units
  5257. // of destination pixels
  5258. // Optional: 1.) The including file (or an earlier included file) may
  5259. // optionally #define USE_BINOMIAL_BLUR_STD_DEVS to replace
  5260. // default standard deviations with those matching a binomial
  5261. // distribution. (See below for details/properties.)
  5262. // 2.) The including file (or an earlier included file) may
  5263. // optionally #define OVERRIDE_BLUR_STD_DEVS and override:
  5264. // static const float blur3_std_dev
  5265. // static const float blur4_std_dev
  5266. // static const float blur5_std_dev
  5267. // static const float blur6_std_dev
  5268. // static const float blur7_std_dev
  5269. // static const float blur8_std_dev
  5270. // static const float blur9_std_dev
  5271. // static const float blur10_std_dev
  5272. // static const float blur11_std_dev
  5273. // static const float blur12_std_dev
  5274. // static const float blur17_std_dev
  5275. // static const float blur25_std_dev
  5276. // static const float blur31_std_dev
  5277. // static const float blur43_std_dev
  5278. // 3.) The including file (or an earlier included file) may
  5279. // optionally #define OVERRIDE_ERROR_BLURRING and override:
  5280. // static const float error_blurring
  5281. // This tuning value helps mitigate weighting errors from one-
  5282. // pass shared-sample blurs sharing bilinear samples between
  5283. // fragments. Values closer to 0.0 have "correct" blurriness
  5284. // but allow more artifacts, and values closer to 1.0 blur away
  5285. // artifacts by sampling closer to halfway between texels.
  5286. // UPDATE 6/21/14: The above static constants may now be overridden
  5287. // by non-static uniform constants. This permits exposing blur
  5288. // standard deviations as runtime GUI shader parameters. However,
  5289. // using them keeps weights from being statically computed, and the
  5290. // speed hit depends on the blur: On my machine, uniforms kill over
  5291. // 53% of the framerate with tex2Dblur12x12shared, but they only
  5292. // drop the framerate by about 18% with tex2Dblur11fast.
  5293. // Quality and Performance Comparisons:
  5294. // For the purposes of the following discussion, "no sRGB" means
  5295. // GAMMA_ENCODE_EVERY_FBO is #defined, and "sRGB" means it isn't.
  5296. // 1.) tex2DblurNfast is always faster than tex2DblurNresize.
  5297. // 2.) tex2DblurNresize functions are the only ones that can arbitrarily resize
  5298. // well, because they're the only ones that don't exploit bilinear samples.
  5299. // This also means they're the only functions which can be truly gamma-
  5300. // correct without linear (or sRGB FBO) input, but only at 1x scale.
  5301. // 3.) One-pass shared sample blurs only have a speed advantage without sRGB.
  5302. // They also have some inaccuracies due to their shared-[bilinear-]sample
  5303. // design, which grow increasingly bothersome for smaller blurs and higher-
  5304. // frequency source images (relative to their resolution). I had high
  5305. // hopes for them, but their most realistic use case is limited to quickly
  5306. // reblurring an already blurred input at full resolution. Otherwise:
  5307. // a.) If you're blurring a low-resolution source, you want a better blur.
  5308. // b.) If you're blurring a lower mipmap, you want a better blur.
  5309. // c.) If you're blurring a high-resolution, high-frequency source, you
  5310. // want a better blur.
  5311. // 4.) The one-pass blurs without shared samples grow slower for larger blurs,
  5312. // but they're competitive with separable blurs at 5x5 and smaller, and
  5313. // even tex2Dblur7x7 isn't bad if you're wanting to conserve passes.
  5314. // Here are some framerates from a GeForce 8800GTS. The first pass resizes to
  5315. // viewport size (4x in this test) and linearizes for sRGB codepaths, and the
  5316. // remaining passes perform 6 full blurs. Mipmapped tests are performed at the
  5317. // same scale, so they just measure the cost of mipmapping each FBO (only every
  5318. // other FBO is mipmapped for separable blurs, to mimic realistic usage).
  5319. // Mipmap Neither sRGB+Mipmap sRGB Function
  5320. // 76.0 92.3 131.3 193.7 tex2Dblur3fast
  5321. // 63.2 74.4 122.4 175.5 tex2Dblur3resize
  5322. // 93.7 121.2 159.3 263.2 tex2Dblur3x3
  5323. // 59.7 68.7 115.4 162.1 tex2Dblur3x3resize
  5324. // 63.2 74.4 122.4 175.5 tex2Dblur5fast
  5325. // 49.3 54.8 100.0 132.7 tex2Dblur5resize
  5326. // 59.7 68.7 115.4 162.1 tex2Dblur5x5
  5327. // 64.9 77.2 99.1 137.2 tex2Dblur6x6shared
  5328. // 55.8 63.7 110.4 151.8 tex2Dblur7fast
  5329. // 39.8 43.9 83.9 105.8 tex2Dblur7resize
  5330. // 40.0 44.2 83.2 104.9 tex2Dblur7x7
  5331. // 56.4 65.5 71.9 87.9 tex2Dblur8x8shared
  5332. // 49.3 55.1 99.9 132.5 tex2Dblur9fast
  5333. // 33.3 36.2 72.4 88.0 tex2Dblur9resize
  5334. // 27.8 29.7 61.3 72.2 tex2Dblur9x9
  5335. // 37.2 41.1 52.6 60.2 tex2Dblur10x10shared
  5336. // 44.4 49.5 91.3 117.8 tex2Dblur11fast
  5337. // 28.8 30.8 63.6 75.4 tex2Dblur11resize
  5338. // 33.6 36.5 40.9 45.5 tex2Dblur12x12shared
  5339. // TODO: Fill in benchmarks for new untested blurs.
  5340. // tex2Dblur17fast
  5341. // tex2Dblur25fast
  5342. // tex2Dblur31fast
  5343. // tex2Dblur43fast
  5344. // tex2Dblur3x3resize
  5345. ///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////
  5346. // Set static standard deviations, but allow users to override them with their
  5347. // own constants (even non-static uniforms if they're okay with the speed hit):
  5348. #ifndef OVERRIDE_BLUR_STD_DEVS // Tables below are skipped when the includer #defines OVERRIDE_BLUR_STD_DEVS
  5349. // blurN_std_dev values are specified in terms of dxdy strides.
  5350. #ifdef USE_BINOMIAL_BLUR_STD_DEVS // Opt-in: binomial-matched table instead of the default table
  5351. // By request, we can define standard deviations corresponding to a
  5352. // binomial distribution with p = 0.5 (related to Pascal's triangle).
  5353. // This distribution works such that blurring multiple times should
  5354. // have the same result as a single larger blur. These values are
  5355. // larger than default for blurs up to 6x and smaller thereafter.
  5356. static const float blur3_std_dev = 0.84931640625;
  5357. static const float blur4_std_dev = 0.84931640625; // Same value as blur3_std_dev in this table
  5358. static const float blur5_std_dev = 1.0595703125;
  5359. static const float blur6_std_dev = 1.06591796875;
  5360. static const float blur7_std_dev = 1.17041015625;
  5361. static const float blur8_std_dev = 1.1720703125;
  5362. static const float blur9_std_dev = 1.2259765625;
  5363. static const float blur10_std_dev = 1.21982421875;
  5364. static const float blur11_std_dev = 1.25361328125;
  5365. static const float blur12_std_dev = 1.2423828125;
  5366. static const float blur17_std_dev = 1.27783203125;
  5367. static const float blur25_std_dev = 1.2810546875;
  5368. static const float blur31_std_dev = 1.28125;
  5369. static const float blur43_std_dev = 1.28125; // Same value as blur31_std_dev in this table
  5370. #else // USE_BINOMIAL_BLUR_STD_DEVS is not defined: use the default table
  5371. // The defaults are the largest values that keep the largest unused
  5372. // blur term on each side <= 1.0/256.0. (We could get away with more
  5373. // or be more conservative, but this compromise is pretty reasonable.)
  5374. static const float blur3_std_dev = 0.62666015625;
  5375. static const float blur4_std_dev = 0.66171875;
  5376. static const float blur5_std_dev = 0.9845703125;
  5377. static const float blur6_std_dev = 1.02626953125;
  5378. static const float blur7_std_dev = 1.36103515625;
  5379. static const float blur8_std_dev = 1.4080078125;
  5380. static const float blur9_std_dev = 1.7533203125;
  5381. static const float blur10_std_dev = 1.80478515625;
  5382. static const float blur11_std_dev = 2.15986328125;
  5383. static const float blur12_std_dev = 2.215234375;
  5384. static const float blur17_std_dev = 3.45535583496;
  5385. static const float blur25_std_dev = 5.3409576416;
  5386. static const float blur31_std_dev = 6.86488037109;
  5387. static const float blur43_std_dev = 10.1852050781;
  5388. #endif // USE_BINOMIAL_BLUR_STD_DEVS
  5389. #endif // OVERRIDE_BLUR_STD_DEVS
  5390. #ifndef OVERRIDE_ERROR_BLURRING // Default below is skipped when the includer #defines OVERRIDE_ERROR_BLURRING
  5391. // error_blurring should be in [0.0, 1.0]. Higher values reduce ringing
  5392. // in shared-sample blurs but increase blurring and feature shifting.
  5393. static const float error_blurring = 0.5; // Default sits at the midpoint of the [0.0, 1.0] range
  5394. #endif // OVERRIDE_ERROR_BLURRING
  5395. ////////////////////////////////// INCLUDES //////////////////////////////////
  5396. // gamma-management.h relies on pass-specific settings to guide its behavior:
  5397. // FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc. See it for details.
  5398. //#include "gamma-management.h"
  5399. //////////////////////////// BEGIN GAMMA-MANAGEMENT //////////////////////////
  5400. #ifndef GAMMA_MANAGEMENT_H
  5401. #define GAMMA_MANAGEMENT_H
  5402. ///////////////////////////////// MIT LICENSE ////////////////////////////////
  5403. // Copyright (C) 2014 TroggleMonkey
  5404. //
  5405. // Permission is hereby granted, free of charge, to any person obtaining a copy
  5406. // of this software and associated documentation files (the "Software"), to
  5407. // deal in the Software without restriction, including without limitation the
  5408. // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  5409. // sell copies of the Software, and to permit persons to whom the Software is
  5410. // furnished to do so, subject to the following conditions:
  5411. //
  5412. // The above copyright notice and this permission notice shall be included in
  5413. // all copies or substantial portions of the Software.
  5414. //
  5415. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  5416. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  5417. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  5418. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  5419. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  5420. // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  5421. // IN THE SOFTWARE.
  5422. ///////////////////////////////// DESCRIPTION ////////////////////////////////
  5423. // This file provides gamma-aware tex*D*() and encode_output() functions.
  5424. // Requires: Before #include-ing this file, the including file must #define
  5425. // the following macros when applicable and follow their rules:
  5426. // 1.) #define FIRST_PASS if this is the first pass.
  5427. // 2.) #define LAST_PASS if this is the last pass.
  5428. // 3.) If sRGB is available, set srgb_framebufferN = "true" for
  5429. // every pass except the last in your .cgp preset.
  5430. // 4.) If sRGB isn't available but you want gamma-correctness with
  5431. // no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
  5432. // 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
  5433. // 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
  5434. // 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
  5435. // 8.) #define SIMULATE_GBA_ON_CRT if desired (lowest precedence)
  5436. // If an option in [5, 8] is #defined in the first or last pass, it
  5437. // should be #defined for both. It shouldn't make a difference
  5438. // whether it's #defined for intermediate passes or not.
  5439. // Optional: The including file (or an earlier included file) may optionally
  5440. // #define a number of macros indicating it will override certain
  5441. // static constants with either static or uniform constants. The
  5442. // macros and associated constants are as follows:
  5443. // 1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
  5444. // static const float ntsc_gamma
  5445. // static const float pal_gamma
  5446. // static const float crt_reference_gamma_high
  5447. // static const float crt_reference_gamma_low
  5448. // static const float lcd_reference_gamma
  5449. // static const float crt_office_gamma
  5450. // static const float lcd_office_gamma
  5451. // 2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
  5452. // static const float crt_gamma
  5453. // static const float gba_gamma
  5454. // static const float lcd_gamma
  5455. // 3.) OVERRIDE_FINAL_GAMMA: The user must first define:
  5456. // static const float input_gamma
  5457. // static const float intermediate_gamma
  5458. // static const float output_gamma
  5459. // (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
  5460. // 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
  5461. // static const bool assume_opaque_alpha
  5462. // The gamma constant overrides must be used in every pass or none,
  5463. // and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
  5464. // OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
  5465. // Usage: After setting macros appropriately, ignore gamma correction and
  5466. // replace all tex*D*() calls with equivalent gamma-aware
  5467. // tex*D*_linearize calls, except:
  5468. // 1.) When you read an LUT, use regular tex*D or a gamma-specified
  5469. // function, depending on its gamma encoding:
  5470. // tex*D*_linearize_gamma (takes a runtime gamma parameter)
  5471. // 2.) If you must read pass0's original input in a later pass, use
  5472. // tex2D_linearize_ntsc_gamma. If you want to read pass0's
  5473. // input with gamma-corrected bilinear filtering, consider
  5474. // creating a first linearizing pass and reading from the input
  5475. // of pass1 later.
  5476. // Then, return encode_output(color) from every fragment shader.
  5477. // Finally, use the global gamma_aware_bilinear boolean if you want
  5478. // to statically branch based on whether bilinear filtering is
  5479. // gamma-correct or not (e.g. for placing Gaussian blur samples).
  5480. //
  5481. // Detailed Policy:
  5482. // tex*D*_linearize() functions enforce a consistent gamma-management policy
  5483. // based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume
  5484. // their input texture has the same encoding characteristics as the input for
  5485. // the current pass (which doesn't apply to the exceptions listed above).
  5486. // Similarly, encode_output() enforces a policy based on the LAST_PASS and
  5487. // GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the
  5488. // following two pipelines.
  5489. // Typical pipeline with intermediate sRGB framebuffers:
  5490. // linear_color = pow(pass0_encoded_color, input_gamma);
  5491. // intermediate_output = linear_color; // Automatic sRGB encoding
  5492. // linear_color = intermediate_output; // Automatic sRGB decoding
  5493. // final_output = pow(intermediate_output, 1.0/output_gamma);
  5494. // Typical pipeline without intermediate sRGB framebuffers:
  5495. // linear_color = pow(pass0_encoded_color, input_gamma);
  5496. // intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
  5497. // linear_color = pow(intermediate_output, intermediate_gamma);
  5498. // final_output = pow(intermediate_output, 1.0/output_gamma);
  5499. // Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
  5500. // easily get gamma-correctness without banding on devices where sRGB isn't
  5501. // supported.
  5502. //
  5503. // Use This Header to Maximize Code Reuse:
  5504. // The purpose of this header is to provide a consistent interface for texture
  5505. // reads and output gamma-encoding that localizes and abstracts away all the
  5506. // annoying details. This greatly reduces the amount of code in each shader
  5507. // pass that depends on the pass number in the .cgp preset or whether sRGB
  5508. // FBO's are being used: You can trivially change the gamma behavior of your
  5509. // whole pass by commenting or uncommenting 1-3 #defines. To reuse the same
  5510. // code in your first, Nth, and last passes, you can even put it all in another
  5511. // header file and #include it from skeleton .cg files that #define the
  5512. // appropriate pass-specific settings.
  5513. //
  5514. // Rationale for Using Three Macros:
  5515. // This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
  5516. // SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
  5517. // a lower maintenance burden on each pass. At first glance it seems we could
  5518. // accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
  5519. // This works for simple use cases where input_gamma == output_gamma, but it
  5520. // breaks down for more complex scenarios like CRT simulation, where the pass
  5521. // number determines the gamma encoding of the input and output.
  5522. /////////////////////////////// BASE CONSTANTS ///////////////////////////////
  5523. // Set standard gamma constants, but allow users to override them:
  5524. #ifndef OVERRIDE_STANDARD_GAMMA // Skipped when the includer #defines OVERRIDE_STANDARD_GAMMA and supplies these constants itself
  5525. // Standard encoding gammas:
  5526. static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too?
  5527. static const float pal_gamma = 2.8; // Never actually 2.8 in practice
  5528. // Typical device decoding gammas (only use for emulating devices):
  5529. // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
  5530. // gammas: The standards purposely undercorrected for an analog CRT's
  5531. // assumed 2.5 reference display gamma to maintain contrast in assumed
  5532. // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
  5533. // These unstated assumptions about display gamma and perceptual rendering
  5534. // intent caused a lot of confusion, and more modern CRT's seemed to target
  5535. // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit
  5536. // (they struggle near black with 2.5 gamma anyway), especially PC/laptop
  5537. // displays designed to view sRGB in bright environments. (Standards are
  5538. // also in flux again with BT.1886, but it's underspecified for displays.)
  5539. static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55)
  5540. static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55)
  5541. static const float lcd_reference_gamma = 2.5; // To match CRT (same value as crt_reference_gamma_high)
  5542. static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC
  5543. static const float lcd_office_gamma = 2.2; // Approximates sRGB
  5544. #endif // OVERRIDE_STANDARD_GAMMA
  5545. // Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
  5546. // but only if they're aware of it.
  5547. #ifndef OVERRIDE_ALPHA_ASSUMPTIONS // Skipped when the includer defines assume_opaque_alpha itself
  5548. static const bool assume_opaque_alpha = false; // false: encode_output/decode_input/decode_gamma_input keep color.a; true would make them return alpha 1.0
  5549. #endif // OVERRIDE_ALPHA_ASSUMPTIONS
  5550. /////////////////////// DERIVED CONSTANTS AS FUNCTIONS ///////////////////////
  5551. // gamma-management.h should be compatible with overriding gamma values with
  5552. // runtime user parameters, but we can only define other global constants in
  5553. // terms of static constants, not uniform user parameters. To get around this
  5554. // limitation, we need to define derived constants using functions.
  5555. // Set device gamma constants, but allow users to override them:
  5556. #ifdef OVERRIDE_DEVICE_GAMMA
  5557. // The includer promises to globally define crt_gamma, gba_gamma, and lcd_gamma; these getters only forward them:
  5558. inline float get_crt_gamma() { return crt_gamma; }
  5559. inline float get_gba_gamma() { return gba_gamma; }
  5560. inline float get_lcd_gamma() { return lcd_gamma; }
  5561. #else // No device override: fall back to the standard constants and a fixed GBA value
  5562. inline float get_crt_gamma() { return crt_reference_gamma_high; }
  5563. inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0)
  5564. inline float get_lcd_gamma() { return lcd_office_gamma; }
  5565. #endif // OVERRIDE_DEVICE_GAMMA
  5566. // Set decoding/encoding gammas for the first/last passes, but allow overrides:
  5567. #ifdef OVERRIDE_FINAL_GAMMA // Taking this branch bypasses every SIMULATE_* macro below
  5568. // The includer promises to globally define intermediate_gamma, input_gamma, and output_gamma:
  5569. inline float get_intermediate_gamma() { return intermediate_gamma; }
  5570. inline float get_input_gamma() { return input_gamma; }
  5571. inline float get_output_gamma() { return output_gamma; }
  5572. #else // No final-gamma override: derive input/output gammas from the SIMULATE_* macros
  5573. // If we gamma-correct every pass, always use ntsc_gamma between passes to
  5574. // ensure middle passes don't need to care if anything is being simulated:
  5575. inline float get_intermediate_gamma() { return ntsc_gamma; }
  5576. #ifdef SIMULATE_CRT_ON_LCD // Tested first, so it wins over the other SIMULATE_* macros
  5577. inline float get_input_gamma() { return get_crt_gamma(); }
  5578. inline float get_output_gamma() { return get_lcd_gamma(); }
  5579. #else // SIMULATE_CRT_ON_LCD is not defined
  5580. #ifdef SIMULATE_GBA_ON_LCD // Tested second
  5581. inline float get_input_gamma() { return get_gba_gamma(); }
  5582. inline float get_output_gamma() { return get_lcd_gamma(); }
  5583. #else // SIMULATE_GBA_ON_LCD is not defined
  5584. #ifdef SIMULATE_LCD_ON_CRT // Tested third
  5585. inline float get_input_gamma() { return get_lcd_gamma(); }
  5586. inline float get_output_gamma() { return get_crt_gamma(); }
  5587. #else // SIMULATE_LCD_ON_CRT is not defined
  5588. #ifdef SIMULATE_GBA_ON_CRT // Tested last
  5589. inline float get_input_gamma() { return get_gba_gamma(); }
  5590. inline float get_output_gamma() { return get_crt_gamma(); }
  5591. #else // Don't simulate anything: decode and encode with ntsc_gamma
  5592. inline float get_input_gamma() { return ntsc_gamma; }
  5593. inline float get_output_gamma() { return ntsc_gamma; }
  5594. #endif // SIMULATE_GBA_ON_CRT
  5595. #endif // SIMULATE_LCD_ON_CRT
  5596. #endif // SIMULATE_GBA_ON_LCD
  5597. #endif // SIMULATE_CRT_ON_LCD
  5598. #endif // OVERRIDE_FINAL_GAMMA
  5599. // Set decoding/encoding gammas for the current pass. Use static constants for
  5600. // linearize_input and gamma_encode_output, because they aren't derived, and
  5601. // they let the compiler do dead-code elimination.
  5602. #ifndef GAMMA_ENCODE_EVERY_FBO // Default pipeline: only the first pass decodes and only the last pass encodes
  5603. #ifdef FIRST_PASS
  5604. static const bool linearize_input = true;
  5605. inline float get_pass_input_gamma() { return get_input_gamma(); }
  5606. #else // Not the first pass: input is used as-is
  5607. static const bool linearize_input = false;
  5608. inline float get_pass_input_gamma() { return 1.0; }
  5609. #endif // FIRST_PASS
  5610. #ifdef LAST_PASS
  5611. static const bool gamma_encode_output = true;
  5612. inline float get_pass_output_gamma() { return get_output_gamma(); }
  5613. #else // Not the last pass: output is written as-is
  5614. static const bool gamma_encode_output = false;
  5615. inline float get_pass_output_gamma() { return 1.0; }
  5616. #endif // LAST_PASS
  5617. #else // GAMMA_ENCODE_EVERY_FBO is defined: every pass both decodes its input and encodes its output
  5618. static const bool linearize_input = true;
  5619. static const bool gamma_encode_output = true;
  5620. #ifdef FIRST_PASS
  5621. inline float get_pass_input_gamma() { return get_input_gamma(); }
  5622. #else // Later passes decode with the intermediate gamma
  5623. inline float get_pass_input_gamma() { return get_intermediate_gamma(); }
  5624. #endif // FIRST_PASS
  5625. #ifdef LAST_PASS
  5626. inline float get_pass_output_gamma() { return get_output_gamma(); }
  5627. #else // Earlier passes encode with the intermediate gamma
  5628. inline float get_pass_output_gamma() { return get_intermediate_gamma(); }
  5629. #endif // LAST_PASS
  5630. #endif // GAMMA_ENCODE_EVERY_FBO
  5631. // Users might want to know if bilinear filtering will be gamma-correct:
  5632. static const bool gamma_aware_bilinear = !linearize_input; // true exactly when this pass does not decode its input in the shader
  5633. ////////////////////// COLOR ENCODING/DECODING FUNCTIONS /////////////////////
  5634. inline float4 encode_output(const float4 color)
  5635. {
  5636. // Gamma-encode a color for this pass's output. When gamma_encode_output
  5637. // is false the color is returned exactly as given. Otherwise RGB is
  5638. // raised component-wise to 1.0/get_pass_output_gamma(). Alpha is never
  5639. // gamma-encoded: it becomes 1.0 when assume_opaque_alpha is set and is
  5640. // copied from the input otherwise.
  5641. if(!gamma_encode_output)
  5642. {
  5643. return color;
  5644. }
  5645. float3 encoded_rgb = pow(color.rgb, float3(1.0/get_pass_output_gamma()));
  5646. if(assume_opaque_alpha)
  5647. {
  5648. return float4(encoded_rgb, 1.0);
  5649. }
  5650. return float4(encoded_rgb, color.a);
  5651. }
  5652. inline float4 decode_input(const float4 color)
  5653. {
  5654. // Linearize a color read from this pass's input. When linearize_input
  5655. // is false the color is handed back unchanged. Otherwise RGB is raised
  5656. // component-wise to get_pass_input_gamma(); alpha is never decoded.
  5657. if(!linearize_input)
  5658. {
  5659. return color;
  5660. }
  5661. float3 pass_gamma = float3(get_pass_input_gamma());
  5662. float3 linear_rgb = pow(color.rgb, pass_gamma);
  5663. // assume_opaque_alpha swaps the incoming alpha for 1.0; without it the
  5664. // incoming alpha is carried through untouched.
  5665. float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
  5666. // Reassemble the decoded RGB with the chosen alpha.
  5667. float4 decoded = float4(linear_rgb, out_alpha);
  5668. return decoded;
  5669. }
  5670. inline float4 decode_gamma_input(const float4 color, const float3 gamma)
  5671. {
  5672. // Linearize color with a caller-supplied per-channel gamma. Unlike
  5673. // decode_input(), this always applies the power curve and does not
  5674. // consult linearize_input. RGB is raised component-wise to gamma;
  5675. // alpha becomes 1.0 when assume_opaque_alpha is set and is passed
  5676. // through otherwise.
  5677. float3 linear_rgb = pow(color.rgb, gamma);
  5678. float out_alpha = assume_opaque_alpha ? 1.0 : color.a;
  5679. return float4(linear_rgb, out_alpha);
  5680. }
  5681. //TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
  5682. //#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
  5683. // EDIT: it's the 'const' in front of the coords that's doing it
  5684. /////////////////////////// TEXTURE LOOKUP WRAPPERS //////////////////////////
  5685. // "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
  5686. // Provide a wide array of linearizing texture lookup wrapper functions. The
  5687. // Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
  5688. // lookups are provided for completeness in case that changes someday. Nobody
  5689. // is likely to use the *fetch and *proj functions, but they're included just
  5690. // in case. The only tex*D texture sampling functions omitted are:
  5691. // - tex*Dcmpbias
  5692. // - tex*Dcmplod
  5693. // - tex*DARRAY*
  5694. // - tex*DMS*
  5695. // - Variants returning integers
  5696. // Standard line length restrictions are ignored below for vertical brevity.
  5697. /*
  5698. // tex1D:
  5699. inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
  5700. { return decode_input(tex1D(tex, tex_coords)); }
  5701. inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
  5702. { return decode_input(tex1D(tex, tex_coords)); }
  5703. inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
  5704. { return decode_input(tex1D(tex, tex_coords, texel_off)); }
  5705. inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
  5706. { return decode_input(tex1D(tex, tex_coords, texel_off)); }
  5707. inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
  5708. { return decode_input(tex1D(tex, tex_coords, dx, dy)); }
  5709. inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
  5710. { return decode_input(tex1D(tex, tex_coords, dx, dy)); }
  5711. inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
  5712. { return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); }
  5713. inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
  5714. { return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); }
  5715. // tex1Dbias:
  5716. inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
  5717. { return decode_input(tex1Dbias(tex, tex_coords)); }
  5718. inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
  5719. { return decode_input(tex1Dbias(tex, tex_coords, texel_off)); }
  5720. // tex1Dfetch:
  5721. inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
  5722. { return decode_input(tex1Dfetch(tex, tex_coords)); }
  5723. inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
  5724. { return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); }
  5725. // tex1Dlod:
  5726. inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
  5727. { return decode_input(tex1Dlod(tex, tex_coords)); }
  5728. inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
  5729. { return decode_input(tex1Dlod(tex, tex_coords, texel_off)); }
  5730. // tex1Dproj:
  5731. inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
  5732. { return decode_input(tex1Dproj(tex, tex_coords)); }
  5733. inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
  5734. { return decode_input(tex1Dproj(tex, tex_coords)); }
  5735. inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
  5736. { return decode_input(tex1Dproj(tex, tex_coords, texel_off)); }
  5737. inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
  5738. { return decode_input(tex1Dproj(tex, tex_coords, texel_off)); }
  5739. */
  5740. // tex2D: sample tex and run the result through decode_input(). The float3 overloads use only tex_coords.xy.
  5741. inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
  5742. { float4 encoded = COMPAT_TEXTURE(tex, tex_coords); return decode_input(encoded); }
  5743. inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
  5744. { float4 encoded = COMPAT_TEXTURE(tex, tex_coords.xy); return decode_input(encoded); }
  5745. inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off)
  5746. { float4 encoded = textureLod(tex, tex_coords, texel_off); return decode_input(encoded); } // NOTE(review): texel_off is passed as textureLod's lod argument, not applied as a texel offset -- confirm intent
  5747. inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off)
  5748. { float4 encoded = textureLod(tex, tex_coords.xy, texel_off); return decode_input(encoded); } // NOTE(review): same as above, texel_off is used as the lod -- confirm intent
  5749. //inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
  5750. //{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); }
  5751. //inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
  5752. //{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); }
  5753. //inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
  5754. //{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); }
  5755. //inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
  5756. //{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); }
  5757. // tex2Dbias:
  5758. //inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
  5759. //{ return decode_input(tex2Dbias(tex, tex_coords)); }
  5760. //inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
  5761. //{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); }
  5762. // tex2Dfetch:
  5763. //inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
  5764. //{ return decode_input(tex2Dfetch(tex, tex_coords)); }
  5765. //inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
  5766. //{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); }
  5767. // tex2Dlod:
  // Sample `tex` at an explicit mip level and return the texel after
  // decode_input().
  //   tex_coords.xy: texture coordinates.
  //   tex_coords.w:  mip LOD to sample, following the Cg tex2Dlod() convention
  //                  this wrapper is named after. (It used to be ignored and
  //                  the LOD was hard-coded to 0.0.) Pass 0.0 for the base
  //                  level.
  //   tex_coords.z:  unused.
  inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
  {
      float4 encoded_texel = textureLod(tex, tex_coords.xy, tex_coords.w);
      return decode_input(encoded_texel);
  }
  // Overload taking an extra integer: samples at tex_coords.xy with
  // textureLod() and returns the texel after decode_input().
  // NOTE(review): texel_off is forwarded as textureLod()'s LOD argument and
  // tex_coords.zw are not read -- confirm this is intended.
  inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off)
  {
      float2 uv = tex_coords.xy;
      float4 encoded_texel = textureLod(tex, uv, texel_off);
      return decode_input(encoded_texel);
  }
  5772. /*
  5773. // tex2Dproj:
  5774. inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
  5775. { return decode_input(tex2Dproj(tex, tex_coords)); }
  5776. inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
  5777. { return decode_input(tex2Dproj(tex, tex_coords)); }
  5778. inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
  5779. { return decode_input(tex2Dproj(tex, tex_coords, texel_off)); }
  5780. inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
  5781. { return decode_input(tex2Dproj(tex, tex_coords, texel_off)); }
  5782. */
  5783. /*
  5784. // tex3D:
  5785. inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
  5786. { return decode_input(tex3D(tex, tex_coords)); }
  5787. inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
  5788. { return decode_input(tex3D(tex, tex_coords, texel_off)); }
  5789. inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
  5790. { return decode_input(tex3D(tex, tex_coords, dx, dy)); }
  5791. inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
  5792. { return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); }
  5793. // tex3Dbias:
  5794. inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
  5795. { return decode_input(tex3Dbias(tex, tex_coords)); }
  5796. inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
  5797. { return decode_input(tex3Dbias(tex, tex_coords, texel_off)); }
  5798. // tex3Dfetch:
  5799. inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
  5800. { return decode_input(tex3Dfetch(tex, tex_coords)); }
  5801. inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
  5802. { return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); }
  5803. // tex3Dlod:
  5804. inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
  5805. { return decode_input(tex3Dlod(tex, tex_coords)); }
  5806. inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
  5807. { return decode_input(tex3Dlod(tex, tex_coords, texel_off)); }
  5808. // tex3Dproj:
  5809. inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
  5810. { return decode_input(tex3Dproj(tex, tex_coords)); }
  5811. inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
  5812. { return decode_input(tex3Dproj(tex, tex_coords, texel_off)); }
  5813. /////////*
  5814. // NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
  5815. // This narrow selection of nonstandard tex2D* functions can be useful:
  5816. // tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
  5817. //inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
  5818. //{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); }
  5819. //inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
  5820. //{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); }
  5821. // MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
  5822. // Provide a narrower selection of tex2D* wrapper functions that decode an
  5823. // input sample with a specified gamma value. These are useful for reading
  5824. // LUT's and for reading the input of pass0 in a later pass.
  5825. // tex2D:
  5826. inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
  5827. { return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); }
  5828. inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
  5829. { return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); }
  5830. //inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
  5831. //{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); }
  5832. //inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
  5833. //{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); }
  5834. //inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
  5835. //{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); }
  5836. //inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
  5837. //{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); }
  5838. //inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
  5839. //{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); }
  5840. //inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
  5841. //{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); }
  5842. /*
  5843. // tex2Dbias:
  5844. inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
  5845. { return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); }
  5846. inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
  5847. { return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); }
  5848. // tex2Dfetch:
  5849. inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
  5850. { return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); }
  5851. inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
  5852. { return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); }
  5853. */
  5854. // tex2Dlod:
  // Sample `tex` at an explicit mip level and return the texel after
  // decode_gamma_input() with the caller-supplied `gamma`.
  //   tex_coords.xy: texture coordinates.
  //   tex_coords.w:  mip LOD to sample, following the Cg tex2Dlod() convention
  //                  this wrapper is named after. (It used to be ignored and
  //                  the LOD was hard-coded to 0.0.) Pass 0.0 for the base
  //                  level.
  //   tex_coords.z:  unused.
  //   gamma:         forwarded unchanged to decode_gamma_input().
  inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
  {
      float4 encoded_texel = textureLod(tex, tex_coords.xy, tex_coords.w);
      return decode_gamma_input(encoded_texel, gamma);
  }
  // Overload taking an extra integer: samples at tex_coords.xy with
  // textureLod() and returns the texel after decode_gamma_input(..., gamma).
  // NOTE(review): texel_off is forwarded as textureLod()'s LOD argument and
  // tex_coords.zw are not read -- confirm this is intended.
  inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma)
  {
      float2 uv = tex_coords.xy;
      float4 encoded_texel = textureLod(tex, uv, texel_off);
      return decode_gamma_input(encoded_texel, gamma);
  }
  5859. #endif // GAMMA_MANAGEMENT_H
  5860. //////////////////////////// END GAMMA-MANAGEMENT //////////////////////////
  5861. //#include "quad-pixel-communication.h"
  5862. /////////////////////// BEGIN QUAD-PIXEL-COMMUNICATION //////////////////////
  5863. #ifndef QUAD_PIXEL_COMMUNICATION_H
  5864. #define QUAD_PIXEL_COMMUNICATION_H
  5865. ///////////////////////////////// MIT LICENSE ////////////////////////////////
  5866. // Copyright (C) 2014 TroggleMonkey*
  5867. //
  5868. // Permission is hereby granted, free of charge, to any person obtaining a copy
  5869. // of this software and associated documentation files (the "Software"), to
  5870. // deal in the Software without restriction, including without limitation the
  5871. // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  5872. // sell copies of the Software, and to permit persons to whom the Software is
  5873. // furnished to do so, subject to the following conditions:
  5874. //
  5875. // The above copyright notice and this permission notice shall be included in
  5876. // all copies or substantial portions of the Software.
  5877. //
  5878. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  5879. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  5880. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  5881. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  5882. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  5883. // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  5884. // IN THE SOFTWARE.
  5885. ///////////////////////////////// DISCLAIMER /////////////////////////////////
  5886. // *This code was inspired by "Shader Amortization using Pixel Quad Message
  5887. // Passing" by Eric Penner, published in GPU Pro 2, Chapter VI.2. My intent
  5888. // is not to plagiarize his fundamentally similar code and assert my own
  5889. // copyright, but the algorithmic helper functions require so little code that
  5890. // implementations can't vary by much except bugfixes and conventions. I just
  5891. // wanted to license my own particular code here to avoid ambiguity and make it
  5892. // clear that as far as I'm concerned, people can do as they please with it.
  5893. ///////////////////////////////// DESCRIPTION ////////////////////////////////
  5894. // Given screen pixel numbers, derive a "quad vector" describing a fragment's
  5895. // position in its 2x2 pixel quad. Given that vector, obtain the values of any
  5896. // variable at neighboring fragments.
  5897. // Requires: Using this file in general requires:
  5898. // 1.) ddx() and ddy() are present in the current Cg profile.
  5899. // 2.) The GPU driver is using fine/high-quality derivatives.
  5900. // Functions will give incorrect results if this is not true,
  5901. // so a test function is included.
  5902. ///////////////////// QUAD-PIXEL COMMUNICATION PRIMITIVES ////////////////////
  float4 get_quad_vector_naive(float4 output_pixel_num_wrt_uvxy)
  {
      //  Input:    Two measures of this fragment's output pixel number, each in
      //            ([0, output_size.x), [0, output_size.y)):
      //              .xy grows along the uv axes,
      //              .zw grows along the screen xy axes.
      //  Output:   The fragment's slot within its 2x2 quad, twice over:
      //              .xy is measured along uv (origin (0, 0) at the top-left):
      //                  top-left    = (-1.0, -1.0)  top-right    = ( 1.0, -1.0)
      //                  bottom-left = (-1.0,  1.0)  bottom-right = ( 1.0,  1.0)
      //                Use it to arrange/weight shared texture samples.
      //              .zw is measured along screen xy (position); its origin
      //                varies. quad_gather() depends on this measure.
      //            Relation: quad_vector.zw = quad_vector.xy * float2(
      //                ddx(output_pixel_num_wrt_uvxy.x),
      //                ddy(output_pixel_num_wrt_uvxy.y));
      //  Caveat:   Assumes the driver always starts 2x2 quads on even pixel
      //            numbers, which can be false (nondeterministically) at odd
      //            output resolutions.
      //  For integral pixel numbers this is 0.0 (even) or 1.0 (odd):
      float4 parity = frac(output_pixel_num_wrt_uvxy * 0.5) * 2.0;
      //  Remap 0.0/1.0 onto -1.0/1.0:
      return parity * 2.0 - float4(1.0);
  }
  float4 get_quad_vector(float4 output_pixel_num_wrt_uvxy)
  {
      //  Input:    Same as get_quad_vector_naive() (read that first).
      //  Output:   Same as get_quad_vector_naive(), but still correct when the
      //            2x2 quad begins on an odd pixel (possible at odd
      //            resolutions).
      float4 naive_quad = get_quad_vector_naive(output_pixel_num_wrt_uvxy);
      //  Half the screen-space derivative of naive_quad.zw is the sign needed
      //  to fix the guess: if .zw fails to increase with screen xy, the quad
      //  starts on an odd pixel and the guess must be mirrored.
      float mirror_x = 0.5 * ddx(naive_quad.z);
      float mirror_y = 0.5 * ddy(naive_quad.w);
      return naive_quad * float4(mirror_x, mirror_y, mirror_x, mirror_y);
  }
  float4 get_quad_vector(float2 output_pixel_num_wrt_uv)
  {
      //  Input:    1.) ddx() and ddy() exist in the current profile.
      //            2.) output_pixel_num_wrt_uv increases with uv coords and is
      //                this fragment's output pixel number in:
      //                ([0, output_size.x), [0, output_size.y))
      //  Output:   Same as get_quad_vector_naive() (read that first), but
      //            still correct when the 2x2 quad begins on an odd pixel
      //            (possible at odd resolutions).
      //  Caveat:   Needs less information than the float4 overload, but may
      //            be slower.
      //  Direction of screen.x/screen.y relative to uv.x/uv.y, in {-1, 1}:
      float2 uv_to_screen_sign = float2(ddx(output_pixel_num_wrt_uv.x),
          ddy(output_pixel_num_wrt_uv.y));
      //  Naive quad placement along uv, then along screen xy:
      float2 parity_uv = frac(output_pixel_num_wrt_uv * 0.5) * 2.0;
      float2 guess_uv = (parity_uv - float2(0.5)) * 2.0;
      float2 guess_screen = guess_uv * uv_to_screen_sign;
      //  If guess_screen fails to increase with screen xy, the quad starts on
      //  an odd pixel and the guess must be mirrored:
      float mirror_x = 0.5 * ddx(guess_screen.x);
      float mirror_y = 0.5 * ddy(guess_screen.y);
      float4 guess = float4(guess_uv, guess_screen);
      return guess * float4(mirror_x, mirror_y, mirror_x, mirror_y);
  }
  void quad_gather(float4 quad_vector, float4 curr,
      out float4 adjx, out float4 adjy, out float4 diag)
  {
      //  Input:    1.) ddx() and ddy() exist in the current profile.
      //            2.) The GPU driver uses fine/high-quality derivatives.
      //            3.) quad_vector locates this fragment in its 2x2 quad using
      //                get_quad_vector()'s conventions (only .zw is read).
      //            4.) curr is the vector whose neighboring values you want.
      //  Output:   Through the out parameters, curr's value at the fragment
      //            adjacent in x (adjx), adjacent in y (adjy), and diagonal
      //            (diag).
      float4 step_x = ddx(curr);
      float4 step_y = ddy(curr);
      adjx = curr - step_x * quad_vector.z;
      adjy = curr - step_y * quad_vector.w;
      //  The diagonal is the y-neighbor of the x-neighbor:
      diag = adjx - ddy(adjx) * quad_vector.w;
  }
  void quad_gather(float4 quad_vector, float3 curr,
      out float3 adjx, out float3 adjy, out float3 diag)
  {
      //  float3 overload of quad_gather(): writes curr's value at the
      //  x-adjacent, y-adjacent and diagonal fragments of the quad.
      float3 step_x = ddx(curr);
      float3 step_y = ddy(curr);
      adjx = curr - step_x * quad_vector.z;
      adjy = curr - step_y * quad_vector.w;
      //  The diagonal is the y-neighbor of the x-neighbor:
      diag = adjx - ddy(adjx) * quad_vector.w;
  }
  void quad_gather(float4 quad_vector, float2 curr,
      out float2 adjx, out float2 adjy, out float2 diag)
  {
      //  float2 overload of quad_gather(): writes curr's value at the
      //  x-adjacent, y-adjacent and diagonal fragments of the quad.
      float2 step_x = ddx(curr);
      float2 step_y = ddy(curr);
      adjx = curr - step_x * quad_vector.z;
      adjy = curr - step_y * quad_vector.w;
      //  The diagonal is the y-neighbor of the x-neighbor:
      diag = adjx - ddy(adjx) * quad_vector.w;
  }
  float4 quad_gather(float4 quad_vector, float curr)
  {
      //  Scalar overload: packs all four quad values into one float4.
      //  Output:   result.x == current fragment's value
      //            result.y == x-adjacent fragment's value
      //            result.z == y-adjacent fragment's value
      //            result.w == diagonal fragment's value
      float4 gathered = float4(curr);
      gathered.y = gathered.x - ddx(gathered.x) * quad_vector.z;
      //  One ddy() over (current, adjacent x) yields (adjacent y, diagonal):
      gathered.zw = gathered.xy - ddy(gathered.xy) * quad_vector.w;
      return gathered;
  }
  float4 quad_gather_sum(float4 quad_vector, float4 curr)
  {
      //  Input:    Same requirements as quad_gather().
      //  Output:   curr summed over all four fragments of the quad.
      float4 across_x, across_y, across_xy;
      quad_gather(quad_vector, curr, across_x, across_y, across_xy);
      float4 total = curr + across_x;
      total += across_y;
      total += across_xy;
      return total;
  }
  float3 quad_gather_sum(float4 quad_vector, float3 curr)
  {
      //  float3 overload: curr summed over all four fragments of the quad.
      float3 across_x, across_y, across_xy;
      quad_gather(quad_vector, curr, across_x, across_y, across_xy);
      float3 total = curr + across_x;
      total += across_y;
      total += across_xy;
      return total;
  }
  float2 quad_gather_sum(float4 quad_vector, float2 curr)
  {
      //  float2 overload: curr summed over all four fragments of the quad.
      float2 across_x, across_y, across_xy;
      quad_gather(quad_vector, curr, across_x, across_y, across_xy);
      float2 total = curr + across_x;
      total += across_y;
      total += across_xy;
      return total;
  }
  float quad_gather_sum(float4 quad_vector, float curr)
  {
      //  Scalar overload: the scalar quad_gather() returns (current,
      //  adjacent x, adjacent y, diagonal); add the four components up.
      float4 quad_values = quad_gather(quad_vector, curr);
      float total = quad_values.x + quad_values.y;
      total += quad_values.z;
      total += quad_values.w;
      return total;
  }
  bool fine_derivatives_working(float4 quad_vector, float4 curr)
  {
      //  Input:    1.) ddx() and ddy() exist in the current profile.
      //            2.) quad_vector locates this fragment in its 2x2 quad using
      //                get_quad_vector()'s conventions.
      //            3.) curr is a test vector with non-constant derivatives
      //                (it should vary nonlinearly across fragments).
      //  Output:   true when fine/hybrid/high-quality derivatives are in use;
      //            false when derivatives are coarse or the test is
      //            inconclusive.
      //  Purpose:  Check whether quad-pixel communication works at all.
      //  Method:   Fine derivatives are proven as soon as either inequality
      //            holds (ever, for any value at any fragment):
      //                (ddy(curr) != ddy(adjx)) or (ddx(curr) != ddx(adjy))
      //            Testing more values (e.g. a float4, both ways) makes a
      //            positive result easier to reach.
      //  TODO:     Check for floating point exact comparison issues!
      float4 ddx_here = ddx(curr);
      float4 ddy_here = ddy(curr);
      //  Reconstruct the neighbors' values, then differentiate those too:
      float4 neighbor_x = curr - ddx_here * quad_vector.z;
      float4 neighbor_y = curr - ddy_here * quad_vector.w;
      float4 ddy_at_neighbor_x = ddy(neighbor_x);
      float4 ddx_at_neighbor_y = ddx(neighbor_y);
      bool4 ddy_mismatch = bool4(
          ddy_here.x != ddy_at_neighbor_x.x,
          ddy_here.y != ddy_at_neighbor_x.y,
          ddy_here.z != ddy_at_neighbor_x.z,
          ddy_here.w != ddy_at_neighbor_x.w);
      bool4 ddx_mismatch = bool4(
          ddx_here.x != ddx_at_neighbor_y.x,
          ddx_here.y != ddx_at_neighbor_y.y,
          ddx_here.z != ddx_at_neighbor_y.z,
          ddx_here.w != ddx_at_neighbor_y.w);
      return any(ddy_mismatch) || any(ddx_mismatch);
  }
  bool fine_derivatives_working_fast(float4 quad_vector, float curr)
  {
      //  Input:    Same as fine_derivatives_working().
      //  Output:   Same as fine_derivatives_working().
      //  Notes:    Cheaper than fine_derivatives_working() but more prone to
      //            false negatives, so it is less useful for offline
      //            testing/debugging. As of May 2014 it is also useless for
      //            dynamic runtime branching, because derivatives (and
      //            quad-pixel communication) are disallowed inside branches.
      //            Future GPUs may allow them if the branch condition is
      //            promised to be uniform across the quad (and/or the driver
      //            lets one fragment decide the branch); should that happen,
      //            this version may become the more economical choice.
      float ddx_here = ddx(curr);
      float ddy_here = ddy(curr);
      //  Value of curr at the x-adjacent fragment:
      float neighbor_x = curr - ddx_here * quad_vector.z;
      float ddy_at_neighbor_x = ddy(neighbor_x);
      return (ddy_here != ddy_at_neighbor_x);
  }
  6082. #endif // QUAD_PIXEL_COMMUNICATION_H
  6083. //////////////////////// END QUAD-PIXEL-COMMUNICATION ///////////////////////
  6084. //#include "special-functions.h"
  6085. /////////////////////////// BEGIN SPECIAL-FUNCTIONS //////////////////////////
  6086. #ifndef SPECIAL_FUNCTIONS_H
  6087. #define SPECIAL_FUNCTIONS_H
  6088. ///////////////////////////////// MIT LICENSE ////////////////////////////////
  6089. // Copyright (C) 2014 TroggleMonkey
  6090. //
  6091. // Permission is hereby granted, free of charge, to any person obtaining a copy
  6092. // of this software and associated documentation files (the "Software"), to
  6093. // deal in the Software without restriction, including without limitation the
  6094. // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  6095. // sell copies of the Software, and to permit persons to whom the Software is
  6096. // furnished to do so, subject to the following conditions:
  6097. //
  6098. // The above copyright notice and this permission notice shall be included in
  6099. // all copies or substantial portions of the Software.
  6100. //
  6101. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  6102. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  6103. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  6104. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  6105. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  6106. // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  6107. // IN THE SOFTWARE.
  6108. ///////////////////////////////// DESCRIPTION ////////////////////////////////
  6109. // This file implements the following mathematical special functions:
  // 1.) erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt
  6111. // 2.) gamma(s), a real-numbered extension of the integer factorial function
  6112. // It also implements normalized_ligamma(s, z), a normalized lower incomplete
  6113. // gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can
  6114. // be called with an _impl suffix to use an implementation version with a few
  6115. // extra precomputed parameters (which may be useful for the caller to reuse).
  6116. // See below for details.
  6117. //
  6118. // Design Rationale:
  6119. // Pretty much every line of code in this file is duplicated four times for
  6120. // different input types (float4/float3/float2/float). This is unfortunate,
  6121. // but Cg doesn't allow function templates. Macros would be far less verbose,
  6122. // but they would make the code harder to document and read. I don't expect
  6123. // these functions will require a whole lot of maintenance changes unless
  6124. // someone ever has need for more robust incomplete gamma functions, so code
  6125. // duplication seems to be the lesser evil in this case.
  6126. /////////////////////////// GAUSSIAN ERROR FUNCTION //////////////////////////
  float4 erf6(float4 x)
  {
      //  Requires: x is the standard parameter to erf().
      //  Returns:  An Abramowitz/Stegun approximation of erf(), where:
      //                erf(x) = 2/sqrt(pi) * integral_0^x(e**(-t**2) dt)
      //            Its max absolute error is 2.5*10**-5, with solid numerical
      //            robustness and efficiency. See:
      //                https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
      //  Note:     Locals computed from x are deliberately not const: GLSL
      //            before 4.20 only accepts constant-expression initializers
      //            for const variables, and this shader is #version 150.
      static const float4 one = float4(1.0);
      //  The polynomial is evaluated on |x|; the sign is restored at the end.
      float4 sign_x = sign(x);
      float4 t = one/(one + 0.47047*abs(x));
      float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
          exp(-(x*x));
      return result * sign_x;
  }
  float3 erf6(const float3 x)
  {
      //  Float3 version of the Abramowitz/Stegun erf() approximation
      //  (max absolute error 2.5*10**-5).
      //  Locals computed from x are deliberately not const: GLSL before 4.20
      //  only accepts constant-expression initializers for const variables,
      //  and this shader is #version 150.
      static const float3 one = float3(1.0);
      //  The polynomial is evaluated on |x|; the sign is restored at the end.
      float3 sign_x = sign(x);
      float3 t = one/(one + 0.47047*abs(x));
      float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
          exp(-(x*x));
      return result * sign_x;
  }
  float2 erf6(const float2 x)
  {
      //  Float2 version of the Abramowitz/Stegun erf() approximation
      //  (max absolute error 2.5*10**-5).
      //  Locals computed from x are deliberately not const: GLSL before 4.20
      //  only accepts constant-expression initializers for const variables,
      //  and this shader is #version 150.
      static const float2 one = float2(1.0);
      //  The polynomial is evaluated on |x|; the sign is restored at the end.
      float2 sign_x = sign(x);
      float2 t = one/(one + 0.47047*abs(x));
      float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
          exp(-(x*x));
      return result * sign_x;
  }
  float erf6(const float x)
  {
      //  Float version of the Abramowitz/Stegun erf() approximation
      //  (max absolute error 2.5*10**-5).
      //  Locals computed from x are deliberately not const: GLSL before 4.20
      //  only accepts constant-expression initializers for const variables,
      //  and this shader is #version 150.
      //  The polynomial is evaluated on |x|; the sign is restored at the end.
      float sign_x = sign(x);
      float t = 1.0/(1.0 + 0.47047*abs(x));
      float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
          exp(-(x*x));
      return result * sign_x;
  }
  float4 erft(const float4 x)
  {
      //  Requires: x is the standard parameter to erf().
      //  Returns:  erf() approximated by a scaled hyperbolic tangent. The
      //            error is visually noticeable, but it is very fast and
      //            perceptually close...at least on ATI hardware. See:
      //                http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
      //  Warning:  Only use this when the hardware drivers implement tanh()
      //            correctly; the original author reports garbage output on
      //            an nVidia 8800GTS.
      float4 scaled_x = 1.202760580 * x;
      return tanh(scaled_x);
  }
  float3 erft(const float3 x)
  {
      //  Float3 version of the tanh-based erf() approximation.
      float3 scaled_x = 1.202760580 * x;
      return tanh(scaled_x);
  }
  float2 erft(const float2 x)
  {
      //  Float2 version of the tanh-based erf() approximation.
      float2 scaled_x = 1.202760580 * x;
      return tanh(scaled_x);
  }
  float erft(const float x)
  {
      //  Float version of the tanh-based erf() approximation.
      float scaled_x = 1.202760580 * x;
      return tanh(scaled_x);
  }
  inline float4 erf(const float4 x)
  {
      //  Requires: x is the standard parameter to erf().
      //  Returns:  An approximation of erf(x) chosen at compile time:
      //            erft() when ERF_FAST_APPROXIMATION is defined, erf6()
      //            otherwise.
  #ifdef ERF_FAST_APPROXIMATION
      float4 approx = erft(x);
  #else
      float4 approx = erf6(x);
  #endif
      return approx;
  }
  inline float3 erf(const float3 x)
  {
      //  Float3 version: erft() when ERF_FAST_APPROXIMATION is defined,
      //  erf6() otherwise.
  #ifdef ERF_FAST_APPROXIMATION
      float3 approx = erft(x);
  #else
      float3 approx = erf6(x);
  #endif
      return approx;
  }
  inline float2 erf(const float2 x)
  {
      //  Float2 version: erft() when ERF_FAST_APPROXIMATION is defined,
      //  erf6() otherwise.
  #ifdef ERF_FAST_APPROXIMATION
      float2 approx = erft(x);
  #else
      float2 approx = erf6(x);
  #endif
      return approx;
  }
  inline float erf(const float x)
  {
      //  Float version: erft() when ERF_FAST_APPROXIMATION is defined,
      //  erf6() otherwise.
  #ifdef ERF_FAST_APPROXIMATION
      float approx = erft(x);
  #else
      float approx = erf6(x);
  #endif
      return approx;
  }
  6234. /////////////////////////// COMPLETE GAMMA FUNCTION //////////////////////////
  float4 gamma_impl(const float4 s, const float4 s_inv)
  {
      //  Requires: 1.) s is the standard parameter to the gamma function, and
      //                it should lie in the [0, 36] range.
      //            2.) s_inv = 1.0/s. The caller precomputes it so the value
      //                can be reused elsewhere.
      //  Returns:  Approximate gamma function (real-numbered factorial)
      //            output using the Lanczos approximation with two
      //            coefficients calculated using Paul Godfrey's method here:
      //                http://my.fit.edu/~gabdo/gamma.txt
      //            An optimal g value for s in [0, 36] is ~1.12906830989, with
      //            a maximum relative error of 0.000463 over 2**16 equally
      //            spaced evaluations. Three coefficients (0.0000346 error)
      //            would not hurt latency, but two allow more parallelism
      //            with outside instructions.
      //  Note:     Locals computed from s are deliberately not const: GLSL
      //            before 4.20 only accepts constant-expression initializers
      //            for const variables, and this shader is #version 150.
      static const float4 g = float4(1.12906830989);
      static const float4 c0 = float4(0.8109119309638332633713423362694399653724431);
      static const float4 c1 = float4(0.4808354605142681877121661197951496120000040);
      static const float4 e = float4(2.71828182845904523536028747135266249775724709);
      float4 sph = s + float4(0.5);
      float4 lanczos_sum = c0 + c1/(s + float4(1.0));
      float4 base = (sph + g)/e;  // or (s + g + float4(0.5))/e
      //  gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s).
      //  This has less error for small s than (s -= 1.0) at the beginning.
      return (pow(base, sph) * lanczos_sum) * s_inv;
  }
  float3 gamma_impl(const float3 s, const float3 s_inv)
  {
      //  Float3 version of the two-coefficient Lanczos gamma approximation.
      //  s_inv must equal 1.0/s (precomputed by the caller).
      //  Locals computed from s are deliberately not const: GLSL before 4.20
      //  only accepts constant-expression initializers for const variables,
      //  and this shader is #version 150.
      static const float3 g = float3(1.12906830989);
      static const float3 c0 = float3(0.8109119309638332633713423362694399653724431);
      static const float3 c1 = float3(0.4808354605142681877121661197951496120000040);
      static const float3 e = float3(2.71828182845904523536028747135266249775724709);
      float3 sph = s + float3(0.5);
      float3 lanczos_sum = c0 + c1/(s + float3(1.0));
      float3 base = (sph + g)/e;
      //  gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s).
      return (pow(base, sph) * lanczos_sum) * s_inv;
  }
  float2 gamma_impl(const float2 s, const float2 s_inv)
  {
      //  Float2 version of the two-coefficient Lanczos gamma approximation.
      //  s_inv must equal 1.0/s (precomputed by the caller).
      //  Locals computed from s are deliberately not const: GLSL before 4.20
      //  only accepts constant-expression initializers for const variables,
      //  and this shader is #version 150.
      static const float2 g = float2(1.12906830989);
      static const float2 c0 = float2(0.8109119309638332633713423362694399653724431);
      static const float2 c1 = float2(0.4808354605142681877121661197951496120000040);
      static const float2 e = float2(2.71828182845904523536028747135266249775724709);
      float2 sph = s + float2(0.5);
      float2 lanczos_sum = c0 + c1/(s + float2(1.0));
      float2 base = (sph + g)/e;
      //  gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s).
      return (pow(base, sph) * lanczos_sum) * s_inv;
  }
  float gamma_impl(const float s, const float s_inv)
  {
      //  Float version of the two-coefficient Lanczos gamma approximation.
      //  s_inv must equal 1.0/s (precomputed by the caller).
      //  Locals computed from s are deliberately not const: GLSL before 4.20
      //  only accepts constant-expression initializers for const variables,
      //  and this shader is #version 150.
      static const float g = 1.12906830989;
      static const float c0 = 0.8109119309638332633713423362694399653724431;
      static const float c1 = 0.4808354605142681877121661197951496120000040;
      static const float e = 2.71828182845904523536028747135266249775724709;
      float sph = s + 0.5;
      float lanczos_sum = c0 + c1/(s + 1.0);
      float base = (sph + g)/e;
      //  gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s).
      return (pow(base, sph) * lanczos_sum) * s_inv;
  }
  float4 gamma(const float4 s)
  {
      //  Requires: s is the standard parameter to the gamma function, and it
      //            should lie in the [0, 36] range.
      //  Returns:  Approximate gamma function output with a maximum relative
      //            error of 0.000463. See gamma_impl() for details.
      float4 s_inv = float4(1.0)/s;
      return gamma_impl(s, s_inv);
  }
  float3 gamma(const float3 s)
  {
      //  Float3 version: computes 1.0/s and defers to gamma_impl().
      float3 s_inv = float3(1.0)/s;
      return gamma_impl(s, s_inv);
  }
  float2 gamma(const float2 s)
  {
      //  Float2 version: computes 1.0/s and defers to gamma_impl().
      float2 s_inv = float2(1.0)/s;
      return gamma_impl(s, s_inv);
  }
  float gamma(const float s)
  {
      //  Float version: computes 1.0/s and defers to gamma_impl().
      float s_inv = 1.0/s;
      return gamma_impl(s, s_inv);
  }
  6321. //////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) ///////////////
  6322. // Lower incomplete gamma function for small s and z (implementation):
  float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
  {
      //  Requires:   1.) s < ~0.5
      //              2.) z <= ~0.775075
      //              3.) s_inv = 1.0/s (precomputed for outside reuse)
      //  Returns:    A four-term (k = 0..3) series for the lower incomplete
      //              gamma function for small s and small z:
      //                  pow(z, s) * sum_k( (-z)^k / (k! * (s + k)) )
      //  The k! * (s + k) denominators are expanded into constants below:
      //      k = 1: s + 1        k = 2: 2*s + 4        k = 3: 6*s + 18
      //  (A fifth term would be +z^4/(24*s + 96); it is intentionally omitted.)
      const float4 z_squared = z*z;
      const float4 z_cubed = z * z_squared;
      float4 series = s_inv;                              //  k = 0
      series -= z/(s + float4(1.0));                      //  k = 1
      series += z_squared/(2.0*s + float4(4.0));          //  k = 2
      series -= z_cubed/(6.0*s + float4(18.0));           //  k = 3
      //  Apply the common z^s factor:
      return pow(z, s) * series;
  }
  float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
  {
      //  Float3 overload.  Four-term series, per component:
      //      pow(z, s) * (1/s - z/(s + 1) + z^2/(2*s + 4) - z^3/(6*s + 18))
      //  Requires:   s_inv = 1.0/s (precomputed for outside reuse)
      const float3 z_squared = z*z;
      const float3 z_cubed = z * z_squared;
      float3 series = s_inv;
      series -= z/(s + float3(1.0));
      series += z_squared/(2.0*s + float3(4.0));
      series -= z_cubed/(6.0*s + float3(18.0));
      return pow(z, s) * series;
  }
  float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
  {
      //  Float2 overload.  Four-term series, per component:
      //      pow(z, s) * (1/s - z/(s + 1) + z^2/(2*s + 4) - z^3/(6*s + 18))
      //  Requires:   s_inv = 1.0/s (precomputed for outside reuse)
      const float2 z_squared = z*z;
      const float2 z_cubed = z * z_squared;
      float2 series = s_inv;
      series -= z/(s + float2(1.0));
      series += z_squared/(2.0*s + float2(4.0));
      series -= z_cubed/(6.0*s + float2(18.0));
      return pow(z, s) * series;
  }
  float ligamma_small_z_impl(const float s, const float z, const float s_inv)
  {
      //  Scalar overload.  Four-term series:
      //      pow(z, s) * (1/s - z/(s + 1) + z^2/(2*s + 4) - z^3/(6*s + 18))
      //  Requires:   s_inv = 1.0/s (precomputed for outside reuse)
      const float z_squared = z*z;
      const float z_cubed = z * z_squared;
      float series = s_inv;
      series -= z/(s + 1.0);
      series += z_squared/(2.0*s + 4.0);
      series -= z_cubed/(6.0*s + 18.0);
      return pow(z, s) * series;
  }
  6396. // Upper incomplete gamma function for small s and large z (implementation):
  float4 uigamma_large_z_impl(const float4 s, const float4 z)
  {
      //  Requires:   1.) s < ~0.5
      //              2.) z > ~0.775075
      //  Returns:    Gauss's continued fraction representation for the upper
      //              incomplete gamma function, truncated to 4 levels:
      //                  pow(z, s) * exp(-z) / D1, where
      //                  Di = (2*i - 1) + z - s + i*(s - i)/D(i+1)
      //              and the level beyond D4 is treated as infinite, so
      //              D4 = 7 + z - s.
      //  Build the denominator from the innermost level outward:
      float4 fraction = float4(7.0) + z - s;                                      //  D4
      fraction = float4(5.0) + z - s + (3.0*s - float4(9.0))/fraction;            //  D3
      fraction = float4(3.0) + z - s + (2.0*s - float4(4.0))/fraction;            //  D2
      fraction = float4(1.0) + z - s + (s - float4(1.0))/fraction;                //  D1
      const float4 prefactor = pow(z, s) * exp(-z);
      return prefactor / fraction;
  }
  float3 uigamma_large_z_impl(const float3 s, const float3 z)
  {
      //  Float3 overload: 4-level continued fraction, evaluated from the
      //  innermost denominator (7 + z - s) outward, then divided into
      //  pow(z, s) * exp(-z).
      float3 fraction = float3(7.0) + z - s;
      fraction = float3(5.0) + z - s + (3.0*s - float3(9.0))/fraction;
      fraction = float3(3.0) + z - s + (2.0*s - float3(4.0))/fraction;
      fraction = float3(1.0) + z - s + (s - float3(1.0))/fraction;
      const float3 prefactor = pow(z, s) * exp(-z);
      return prefactor / fraction;
  }
  float2 uigamma_large_z_impl(const float2 s, const float2 z)
  {
      //  Float2 overload: 4-level continued fraction, evaluated from the
      //  innermost denominator (7 + z - s) outward, then divided into
      //  pow(z, s) * exp(-z).
      float2 fraction = float2(7.0) + z - s;
      fraction = float2(5.0) + z - s + (3.0*s - float2(9.0))/fraction;
      fraction = float2(3.0) + z - s + (2.0*s - float2(4.0))/fraction;
      fraction = float2(1.0) + z - s + (s - float2(1.0))/fraction;
      const float2 prefactor = pow(z, s) * exp(-z);
      return prefactor / fraction;
  }
  float uigamma_large_z_impl(const float s, const float z)
  {
      //  Scalar overload: 4-level continued fraction, evaluated from the
      //  innermost denominator (7 + z - s) outward, then divided into
      //  pow(z, s) * exp(-z).
      float fraction = 7.0 + z - s;
      fraction = 5.0 + z - s + (3.0*s - 9.0)/fraction;
      fraction = 3.0 + z - s + (2.0*s - 4.0)/fraction;
      fraction = 1.0 + z - s + (s - 1.0)/fraction;
      const float prefactor = pow(z, s) * exp(-z);
      return prefactor / fraction;
  }
  6449. // Normalized lower incomplete gamma function for small s (implementation):
  float4 normalized_ligamma_impl(const float4 s, const float4 z,
      const float4 s_inv, const float4 gamma_s_inv)
  {
      //  Requires:   1.) s < ~0.5
      //              2.) s_inv = 1/s (precomputed for outside reuse)
      //              3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
      //  Returns:    Approximate the normalized lower incomplete gamma function
      //              for s < 0.5.  Since we only care about s < 0.5, we only need
      //              to evaluate two branches (not four) based on z.  Each branch
      //              uses four terms, with a max relative error of ~0.00182.  The
      //              branch threshold and specifics were adapted for fewer terms
      //              from Gil/Segura/Temme's paper here:
      //                  http://oai.cwi.nl/oai/asset/20433/20433B.pdf
      //  Both candidate results are always computed (no real branching):
      static const float4 thresh = float4(0.775075);
      const float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
      const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
      //  Pick one candidate per component with a select.  Blending with 0.0/1.0
      //  weights gives the same value for finite candidates, but an Inf or NaN
      //  in the candidate that is NOT wanted (e.g. the z^3 term of the small-z
      //  series overflowing for very large z) would turn the sum into NaN,
      //  because 0.0 * Inf is NaN.
      return float4(
          (z.x > thresh.x) ? large_z.x : small_z.x,
          (z.y > thresh.y) ? large_z.y : small_z.y,
          (z.z > thresh.z) ? large_z.z : small_z.z,
          (z.w > thresh.w) ? large_z.w : small_z.w);
  }
  float3 normalized_ligamma_impl(const float3 s, const float3 z,
      const float3 s_inv, const float3 gamma_s_inv)
  {
      //  Float3 version.
      //  Requires:   s_inv = 1/s and gamma_s_inv = 1/gamma(s), both precomputed.
      //  Returns:    Per component, 1 - uigamma_large_z_impl()/gamma(s) where
      //              z > 0.775075, else ligamma_small_z_impl()/gamma(s).
      static const float3 thresh = float3(0.775075);
      const float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
      const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
      //  Select instead of blending with 0.0/1.0 weights: an Inf/NaN in the
      //  unwanted candidate would otherwise leak into the result as NaN
      //  (0.0 * Inf is NaN).  Finite results are unchanged.
      return float3(
          (z.x > thresh.x) ? large_z.x : small_z.x,
          (z.y > thresh.y) ? large_z.y : small_z.y,
          (z.z > thresh.z) ? large_z.z : small_z.z);
  }
  float2 normalized_ligamma_impl(const float2 s, const float2 z,
      const float2 s_inv, const float2 gamma_s_inv)
  {
      //  Float2 version.
      //  Requires:   s_inv = 1/s and gamma_s_inv = 1/gamma(s), both precomputed.
      //  Returns:    Per component, 1 - uigamma_large_z_impl()/gamma(s) where
      //              z > 0.775075, else ligamma_small_z_impl()/gamma(s).
      static const float2 thresh = float2(0.775075);
      const float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
      const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
      //  Select instead of blending with 0.0/1.0 weights: an Inf/NaN in the
      //  unwanted candidate would otherwise leak into the result as NaN
      //  (0.0 * Inf is NaN).  Finite results are unchanged.
      return float2(
          (z.x > thresh.x) ? large_z.x : small_z.x,
          (z.y > thresh.y) ? large_z.y : small_z.y);
  }
  float normalized_ligamma_impl(const float s, const float z,
      const float s_inv, const float gamma_s_inv)
  {
      //  Float version.
      //  Requires:   s_inv = 1/s and gamma_s_inv = 1/gamma(s), both precomputed.
      //  Returns:    1 - uigamma_large_z_impl()/gamma(s) when z > 0.775075,
      //              else ligamma_small_z_impl()/gamma(s).
      static const float thresh = 0.775075;
      const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
      const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
      //  Select instead of blending with 0.0/1.0 weights: an Inf/NaN in the
      //  unwanted candidate would otherwise leak into the result as NaN
      //  (0.0 * Inf is NaN).  Finite results are unchanged.
      return (z > thresh) ? large_z : small_z;
  }
  6513. // Normalized lower incomplete gamma function for small s:
  float4 normalized_ligamma(const float4 s, const float4 z)
  {
      //  Requires:   s < ~0.5
      //  Returns:    Approximate normalized lower incomplete gamma function for
      //              s < 0.5.  This wrapper only derives the two reciprocals
      //              that normalized_ligamma_impl() expects; see that function
      //              for details.
      const float4 recip_s = float4(1.0)/s;
      const float4 recip_gamma_s = float4(1.0)/gamma_impl(s, recip_s);
      return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
  }
  float3 normalized_ligamma(const float3 s, const float3 z)
  {
      //  Float3 overload: derives 1/s and 1/gamma(s), then defers to
      //  normalized_ligamma_impl().
      const float3 recip_s = float3(1.0)/s;
      const float3 recip_gamma_s = float3(1.0)/gamma_impl(s, recip_s);
      return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
  }
  float2 normalized_ligamma(const float2 s, const float2 z)
  {
      //  Float2 overload: derives 1/s and 1/gamma(s), then defers to
      //  normalized_ligamma_impl().
      const float2 recip_s = float2(1.0)/s;
      const float2 recip_gamma_s = float2(1.0)/gamma_impl(s, recip_s);
      return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
  }
  float normalized_ligamma(const float s, const float z)
  {
      //  Scalar overload: derives 1/s and 1/gamma(s), then defers to
      //  normalized_ligamma_impl().
      const float recip_s = 1.0/s;
      const float recip_gamma_s = 1.0/gamma_impl(s, recip_s);
      return normalized_ligamma_impl(s, z, recip_s, recip_gamma_s);
  }
  6544. #endif // SPECIAL_FUNCTIONS_H
  6545. //////////////////////////// END SPECIAL-FUNCTIONS ///////////////////////////
  6546. //////////////////////////////// END INCLUDES ////////////////////////////////
  6547. /////////////////////////////////// HELPERS //////////////////////////////////
  inline float4 uv2_to_uv4(float2 tex_uv)
  {
      //  Widen a float2 uv offset to four components (z = w = 0.0) so it can be
      //  added to float4 tex2Dlod coords without disturbing their last two
      //  components:
      return float4(tex_uv.x, tex_uv.y, 0.0, 0.0);
  }
  //  Squared vector length as a macro (dot of the argument with itself), so it
  //  can also be used with static constants.  The argument is expanded twice.
  #define LENGTH_SQ(v) (dot((v), (v)))
  inline float get_fast_gaussian_weight_sum_inv(const float sigma)
  {
      //  Returns:    A cheap estimate of 1.0/(sum of unnormalized Gaussian tap
      //              weights) for a wide 1D blur with standard deviation sigma.
      //  Background (from the original notes): the unnormalized center weight
      //  is 1.0, so the normalized center weight equals the weight sum inverse.
      //  For a large enough blur (9+ taps) the Gaussian integral gives the
      //  asymptotic center weight
      //      0.5 * (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0))))
      //          == erf(0.5/(sigma*sqrt(2.0)))
      //  which the previous implementation replaced with
      //      min(exp(exp(0.348348412457428/(sigma - 0.0860587260734721))),
      //          0.399334576340352/sigma)
      //  (measured by the original author at 135.8 relative FPS vs. 134.3
      //  for erf).
      //
      //  Fix: because the center weight is 1.0, the true weight sum is >= 1.0
      //  and its inverse can never exceed 1.0.  The old expression violated
      //  that for sigma below ~0.3993 (e.g. it returned ~1.33 at sigma = 0.3),
      //  over-brightening the blur.  The exp(exp(...)) term is always >= 1.0,
      //  so once the 1.0 bound is enforced it can never be the selected value;
      //  the bounded result is exactly the clamped asymptote below.  For
      //  sigma >= ~0.3993 the returned value is unchanged.
      const float asymptotic_inv = 0.399334576340352/sigma;
      return min(asymptotic_inv, 1.0);
  }
  6575. //////////////////// ARBITRARILY RESIZABLE SEPARABLE BLURS ///////////////////
  float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      //  Requires:   Global requirements must be met (see file description).
      //  Returns:    A 1D 11x Gaussian blurred texture lookup using an 11-tap
      //              blur: one tex2D_linearize() sample per tap, spaced dxdy
      //              apart and centered on tex_uv.
      //  Unnormalized Gaussian weights for tap distances 0-5.  Constant factors
      //  are ignored because the result is normalized by the weight sum.
      const float inv_two_var = 0.5/(sigma*sigma);
      const float wt0 = 1.0;
      const float wt1 = exp(-1.0 * inv_two_var);
      const float wt2 = exp(-4.0 * inv_two_var);
      const float wt3 = exp(-9.0 * inv_two_var);
      const float wt4 = exp(-16.0 * inv_two_var);
      const float wt5 = exp(-25.0 * inv_two_var);
      const float norm = 1.0 / (wt0 + 2.0 * (wt1 + wt2 + wt3 + wt4 + wt5));
      //  Tap offsets, shared by the mirrored taps on both sides:
      const float2 off1 = 1.0 * dxdy;
      const float2 off2 = 2.0 * dxdy;
      const float2 off3 = 3.0 * dxdy;
      const float2 off4 = 4.0 * dxdy;
      const float2 off5 = 5.0 * dxdy;
      //  Accumulate from the far negative tap to the far positive tap:
      float3 acc = float3(0.0,0.0,0.0);
      acc += wt5 * tex2D_linearize(tex, tex_uv - off5).rgb;
      acc += wt4 * tex2D_linearize(tex, tex_uv - off4).rgb;
      acc += wt3 * tex2D_linearize(tex, tex_uv - off3).rgb;
      acc += wt2 * tex2D_linearize(tex, tex_uv - off2).rgb;
      acc += wt1 * tex2D_linearize(tex, tex_uv - off1).rgb;
      acc += wt0 * tex2D_linearize(tex, tex_uv).rgb;
      acc += wt1 * tex2D_linearize(tex, tex_uv + off1).rgb;
      acc += wt2 * tex2D_linearize(tex, tex_uv + off2).rgb;
      acc += wt3 * tex2D_linearize(tex, tex_uv + off3).rgb;
      acc += wt4 * tex2D_linearize(tex, tex_uv + off4).rgb;
      acc += wt5 * tex2D_linearize(tex, tex_uv + off5).rgb;
      return acc * norm;
  }
  float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      //  Requires:   Global requirements must be met (see file description).
      //  Returns:    A 1D 9x Gaussian blurred texture lookup using a 9-tap
      //              blur: one tex2D_linearize() sample per tap, spaced dxdy
      //              apart and centered on tex_uv.
      //  Unnormalized Gaussian weights for tap distances 0-4 and the
      //  normalization factor:
      const float inv_two_var = 0.5/(sigma*sigma);
      const float wt0 = 1.0;
      const float wt1 = exp(-1.0 * inv_two_var);
      const float wt2 = exp(-4.0 * inv_two_var);
      const float wt3 = exp(-9.0 * inv_two_var);
      const float wt4 = exp(-16.0 * inv_two_var);
      const float norm = 1.0 / (wt0 + 2.0 * (wt1 + wt2 + wt3 + wt4));
      //  Tap offsets, shared by the mirrored taps on both sides:
      const float2 off1 = 1.0 * dxdy;
      const float2 off2 = 2.0 * dxdy;
      const float2 off3 = 3.0 * dxdy;
      const float2 off4 = 4.0 * dxdy;
      //  Accumulate from the far negative tap to the far positive tap:
      float3 acc = float3(0.0,0.0,0.0);
      acc += wt4 * tex2D_linearize(tex, tex_uv - off4).rgb;
      acc += wt3 * tex2D_linearize(tex, tex_uv - off3).rgb;
      acc += wt2 * tex2D_linearize(tex, tex_uv - off2).rgb;
      acc += wt1 * tex2D_linearize(tex, tex_uv - off1).rgb;
      acc += wt0 * tex2D_linearize(tex, tex_uv).rgb;
      acc += wt1 * tex2D_linearize(tex, tex_uv + off1).rgb;
      acc += wt2 * tex2D_linearize(tex, tex_uv + off2).rgb;
      acc += wt3 * tex2D_linearize(tex, tex_uv + off3).rgb;
      acc += wt4 * tex2D_linearize(tex, tex_uv + off4).rgb;
      return acc * norm;
  }
  float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      //  Requires:   Global requirements must be met (see file description).
      //  Returns:    A 1D 7x Gaussian blurred texture lookup using a 7-tap
      //              blur: one tex2D_linearize() sample per tap, spaced dxdy
      //              apart and centered on tex_uv.
      //  Unnormalized Gaussian weights for tap distances 0-3 and the
      //  normalization factor:
      const float inv_two_var = 0.5/(sigma*sigma);
      const float wt0 = 1.0;
      const float wt1 = exp(-1.0 * inv_two_var);
      const float wt2 = exp(-4.0 * inv_two_var);
      const float wt3 = exp(-9.0 * inv_two_var);
      const float norm = 1.0 / (wt0 + 2.0 * (wt1 + wt2 + wt3));
      //  Tap offsets, shared by the mirrored taps on both sides:
      const float2 off1 = 1.0 * dxdy;
      const float2 off2 = 2.0 * dxdy;
      const float2 off3 = 3.0 * dxdy;
      //  Accumulate from the far negative tap to the far positive tap:
      float3 acc = float3(0.0,0.0,0.0);
      acc += wt3 * tex2D_linearize(tex, tex_uv - off3).rgb;
      acc += wt2 * tex2D_linearize(tex, tex_uv - off2).rgb;
      acc += wt1 * tex2D_linearize(tex, tex_uv - off1).rgb;
      acc += wt0 * tex2D_linearize(tex, tex_uv).rgb;
      acc += wt1 * tex2D_linearize(tex, tex_uv + off1).rgb;
      acc += wt2 * tex2D_linearize(tex, tex_uv + off2).rgb;
      acc += wt3 * tex2D_linearize(tex, tex_uv + off3).rgb;
      return acc * norm;
  }
  float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      //  Requires:   Global requirements must be met (see file description).
      //  Returns:    A 1D 5x Gaussian blurred texture lookup using a 5-tap
      //              blur: one tex2D_linearize() sample per tap, spaced dxdy
      //              apart and centered on tex_uv.
      //  Unnormalized Gaussian weights for tap distances 0-2 and the
      //  normalization factor:
      const float inv_two_var = 0.5/(sigma*sigma);
      const float wt0 = 1.0;
      const float wt1 = exp(-1.0 * inv_two_var);
      const float wt2 = exp(-4.0 * inv_two_var);
      const float norm = 1.0 / (wt0 + 2.0 * (wt1 + wt2));
      //  Tap offsets, shared by the mirrored taps on both sides:
      const float2 off1 = 1.0 * dxdy;
      const float2 off2 = 2.0 * dxdy;
      //  Accumulate from the far negative tap to the far positive tap:
      float3 acc = float3(0.0,0.0,0.0);
      acc += wt2 * tex2D_linearize(tex, tex_uv - off2).rgb;
      acc += wt1 * tex2D_linearize(tex, tex_uv - off1).rgb;
      acc += wt0 * tex2D_linearize(tex, tex_uv).rgb;
      acc += wt1 * tex2D_linearize(tex, tex_uv + off1).rgb;
      acc += wt2 * tex2D_linearize(tex, tex_uv + off2).rgb;
      return acc * norm;
  }
  float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      //  Requires:   Global requirements must be met (see file description).
      //  Returns:    A 1D 3x Gaussian blurred texture lookup using a 3-tap
      //              blur: one tex2D_linearize() sample per tap, spaced dxdy
      //              apart and centered on tex_uv.
      //  Unnormalized Gaussian weights for tap distances 0-1 and the
      //  normalization factor:
      const float inv_two_var = 0.5/(sigma*sigma);
      const float wt0 = 1.0;
      const float wt1 = exp(-1.0 * inv_two_var);
      const float norm = 1.0 / (wt0 + 2.0 * wt1);
      //  Offset shared by the two outer taps:
      const float2 off1 = 1.0 * dxdy;
      //  Accumulate from the negative tap to the positive tap:
      float3 acc = float3(0.0,0.0,0.0);
      acc += wt1 * tex2D_linearize(tex, tex_uv - off1).rgb;
      acc += wt0 * tex2D_linearize(tex, tex_uv).rgb;
      acc += wt1 * tex2D_linearize(tex, tex_uv + off1).rgb;
      return acc * norm;
  }
  6699. /////////////////////////// FAST SEPARABLE BLURS ///////////////////////////
  float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      //  Requires:   1.) Global requirements must be met (see file description).
      //              2.) filter_linearN must = "true" in your .cgp file.
      //              3.) For gamma-correct bilinear filtering, global
      //                  gamma_aware_bilinear == true (from gamma-management.h)
      //  Returns:    A 1D 11x Gaussian blurred texture lookup using 6 linear
      //              taps.  Each tap sits between two neighboring texels so a
      //              linearly filtered fetch weights both of them at once.
      //  Unnormalized Gaussian weights for texel distances 0-5 and the
      //  normalization factor:
      const float inv_two_var = 0.5/(sigma*sigma);
      const float wt0 = 1.0;
      const float wt1 = exp(-1.0 * inv_two_var);
      const float wt2 = exp(-4.0 * inv_two_var);
      const float wt3 = exp(-9.0 * inv_two_var);
      const float wt4 = exp(-16.0 * inv_two_var);
      const float wt5 = exp(-25.0 * inv_two_var);
      const float norm = 1.0 / (wt0 + 2.0 * (wt1 + wt2 + wt3 + wt4 + wt5));
      //  Combined weight of each texel pair.  The center texel contributes to
      //  the pairs on both sides, so each side takes half of its weight.
      const float pair01 = wt0 * 0.5 + wt1;
      const float pair23 = wt2 + wt3;
      const float pair45 = wt4 + wt5;
      //  Tap position inside each pair: the nearer texel's distance plus the
      //  farther texel's share of the pair weight, in units of dxdy.
      const float2 off01 = (wt1/pair01) * dxdy;
      const float2 off23 = (2.0 + wt3/pair23) * dxdy;
      const float2 off45 = (4.0 + wt5/pair45) * dxdy;
      //  Accumulate from the far negative tap to the far positive tap:
      float3 acc = float3(0.0,0.0,0.0);
      acc += pair45 * tex2D_linearize(tex, tex_uv - off45).rgb;
      acc += pair23 * tex2D_linearize(tex, tex_uv - off23).rgb;
      acc += pair01 * tex2D_linearize(tex, tex_uv - off01).rgb;
      acc += pair01 * tex2D_linearize(tex, tex_uv + off01).rgb;
      acc += pair23 * tex2D_linearize(tex, tex_uv + off23).rgb;
      acc += pair45 * tex2D_linearize(tex, tex_uv + off45).rgb;
      return acc * norm;
  }
  float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      //  Requires:   Same as tex2Dblur11fast()
      //  Returns:    A 1D 9x Gaussian blurred texture lookup using 1 nearest
      //              neighbor (center) tap and 4 linear taps, each of which
      //              sits between two neighboring texels.
      //  Unnormalized Gaussian weights for texel distances 0-4 and the
      //  normalization factor:
      const float inv_two_var = 0.5/(sigma*sigma);
      const float wt0 = 1.0;
      const float wt1 = exp(-1.0 * inv_two_var);
      const float wt2 = exp(-4.0 * inv_two_var);
      const float wt3 = exp(-9.0 * inv_two_var);
      const float wt4 = exp(-16.0 * inv_two_var);
      const float norm = 1.0 / (wt0 + 2.0 * (wt1 + wt2 + wt3 + wt4));
      //  Combined weight of each off-center texel pair:
      const float pair12 = wt1 + wt2;
      const float pair34 = wt3 + wt4;
      //  Tap position inside each pair: the nearer texel's distance plus the
      //  farther texel's share of the pair weight, in units of dxdy.
      const float2 off12 = (1.0 + wt2/pair12) * dxdy;
      const float2 off34 = (3.0 + wt4/pair34) * dxdy;
      //  Accumulate from the far negative tap to the far positive tap:
      float3 acc = float3(0.0,0.0,0.0);
      acc += pair34 * tex2D_linearize(tex, tex_uv - off34).rgb;
      acc += pair12 * tex2D_linearize(tex, tex_uv - off12).rgb;
      acc += wt0 * tex2D_linearize(tex, tex_uv).rgb;
      acc += pair12 * tex2D_linearize(tex, tex_uv + off12).rgb;
      acc += pair34 * tex2D_linearize(tex, tex_uv + off34).rgb;
      return acc * norm;
  }
  float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      //  Requires:   Same as tex2Dblur11fast()
      //  Returns:    A 1D 7x Gaussian blurred texture lookup using 4 linear
      //              taps, each of which sits between two neighboring texels.
      //  Unnormalized Gaussian weights for texel distances 0-3 and the
      //  normalization factor:
      const float inv_two_var = 0.5/(sigma*sigma);
      const float wt0 = 1.0;
      const float wt1 = exp(-1.0 * inv_two_var);
      const float wt2 = exp(-4.0 * inv_two_var);
      const float wt3 = exp(-9.0 * inv_two_var);
      const float norm = 1.0 / (wt0 + 2.0 * (wt1 + wt2 + wt3));
      //  Combined weight of each texel pair.  The center texel contributes to
      //  the pairs on both sides, so each side takes half of its weight.
      const float pair01 = wt0 * 0.5 + wt1;
      const float pair23 = wt2 + wt3;
      //  Tap position inside each pair: the nearer texel's distance plus the
      //  farther texel's share of the pair weight, in units of dxdy.
      const float2 off01 = (wt1/pair01) * dxdy;
      const float2 off23 = (2.0 + wt3/pair23) * dxdy;
      //  Accumulate from the far negative tap to the far positive tap:
      float3 acc = float3(0.0,0.0,0.0);
      acc += pair23 * tex2D_linearize(tex, tex_uv - off23).rgb;
      acc += pair01 * tex2D_linearize(tex, tex_uv - off01).rgb;
      acc += pair01 * tex2D_linearize(tex, tex_uv + off01).rgb;
      acc += pair23 * tex2D_linearize(tex, tex_uv + off23).rgb;
      return acc * norm;
  }
  float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      //  Requires:   Same as tex2Dblur11fast()
      //  Returns:    A 1D 5x Gaussian blurred texture lookup using 1 nearest
      //              neighbor (center) tap and 2 linear taps, each of which
      //              sits between two neighboring texels.
      //  Unnormalized Gaussian weights for texel distances 0-2 and the
      //  normalization factor:
      const float inv_two_var = 0.5/(sigma*sigma);
      const float wt0 = 1.0;
      const float wt1 = exp(-1.0 * inv_two_var);
      const float wt2 = exp(-4.0 * inv_two_var);
      const float norm = 1.0 / (wt0 + 2.0 * (wt1 + wt2));
      //  Combined weight of the off-center texel pair, and the tap position
      //  inside it (nearer texel's distance plus the farther texel's share):
      const float pair12 = wt1 + wt2;
      const float2 off12 = (1.0 + wt2/pair12) * dxdy;
      //  Accumulate from the negative tap to the positive tap:
      float3 acc = float3(0.0,0.0,0.0);
      acc += pair12 * tex2D_linearize(tex, tex_uv - off12).rgb;
      acc += wt0 * tex2D_linearize(tex, tex_uv).rgb;
      acc += pair12 * tex2D_linearize(tex, tex_uv + off12).rgb;
      return acc * norm;
  }
  float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      //  Requires:   Same as tex2Dblur11fast()
      //  Returns:    A 1D 3x Gaussian blurred texture lookup using 2 linear
      //              taps, each of which sits between the center texel and one
      //              of its neighbors.
      //  Unnormalized Gaussian weights for texel distances 0-1:
      const float denom_inv = 0.5/(sigma*sigma);
      const float w0 = 1.0;
      const float w1 = exp(-1.0 * denom_inv);
      //  Combined weight of a (center, neighbor) texel pair.  The center texel
      //  is shared by both pairs, so each pair takes half of its weight.
      const float w01 = w0 * 0.5 + w1;
      //  Tap position inside the pair: the neighbor's share of the pair weight.
      const float w01_ratio = w1/w01;
      //  Both taps carry the same weight w01, and the full weight sum is
      //  w0 + 2.0*w1 == 2.0*w01, so the normalized weight of each tap is exactly
      //  0.5.  No explicit weight-sum inverse is needed (the previously
      //  computed but unused one has been removed).
      return 0.5 * (
          tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb +
          tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb);
  }
  6836. //////////////////////////// HUGE SEPARABLE BLURS ////////////////////////////
  6837. // Huge separable blurs come only in "fast" versions.
  float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      //  Requires:   Same as tex2Dblur11fast()
      //  Returns:    A 1D 43x Gaussian blurred texture lookup using 22 linear
      //              taps, each of which sits between two neighboring texels.
      //  Unnormalized Gaussian weights for texel distances 0-21:
      const float inv_two_var = 0.5/(sigma*sigma);
      const float g0 = 1.0;
      const float g1 = exp(-1.0 * inv_two_var);
      const float g2 = exp(-4.0 * inv_two_var);
      const float g3 = exp(-9.0 * inv_two_var);
      const float g4 = exp(-16.0 * inv_two_var);
      const float g5 = exp(-25.0 * inv_two_var);
      const float g6 = exp(-36.0 * inv_two_var);
      const float g7 = exp(-49.0 * inv_two_var);
      const float g8 = exp(-64.0 * inv_two_var);
      const float g9 = exp(-81.0 * inv_two_var);
      const float g10 = exp(-100.0 * inv_two_var);
      const float g11 = exp(-121.0 * inv_two_var);
      const float g12 = exp(-144.0 * inv_two_var);
      const float g13 = exp(-169.0 * inv_two_var);
      const float g14 = exp(-196.0 * inv_two_var);
      const float g15 = exp(-225.0 * inv_two_var);
      const float g16 = exp(-256.0 * inv_two_var);
      const float g17 = exp(-289.0 * inv_two_var);
      const float g18 = exp(-324.0 * inv_two_var);
      const float g19 = exp(-361.0 * inv_two_var);
      const float g20 = exp(-400.0 * inv_two_var);
      const float g21 = exp(-441.0 * inv_two_var);
      //  The exact normalization would be 1.0/(g0 + 2.0*(g1 + ... + g21)); the
      //  approximation helper is used in its place:
      const float norm = get_fast_gaussian_weight_sum_inv(sigma);
      //  Combined weight of each texel pair.  The center texel contributes to
      //  the pairs on both sides, so each side takes half of its weight.
      const float pair0 = g0 * 0.5 + g1;
      const float pair2 = g2 + g3;
      const float pair4 = g4 + g5;
      const float pair6 = g6 + g7;
      const float pair8 = g8 + g9;
      const float pair10 = g10 + g11;
      const float pair12 = g12 + g13;
      const float pair14 = g14 + g15;
      const float pair16 = g16 + g17;
      const float pair18 = g18 + g19;
      const float pair20 = g20 + g21;
      //  Tap position inside each pair: the nearer texel's distance plus the
      //  farther texel's share of the pair weight, in units of dxdy.
      const float2 off0 = (g1/pair0) * dxdy;
      const float2 off2 = (2.0 + g3/pair2) * dxdy;
      const float2 off4 = (4.0 + g5/pair4) * dxdy;
      const float2 off6 = (6.0 + g7/pair6) * dxdy;
      const float2 off8 = (8.0 + g9/pair8) * dxdy;
      const float2 off10 = (10.0 + g11/pair10) * dxdy;
      const float2 off12 = (12.0 + g13/pair12) * dxdy;
      const float2 off14 = (14.0 + g15/pair14) * dxdy;
      const float2 off16 = (16.0 + g17/pair16) * dxdy;
      const float2 off18 = (18.0 + g19/pair18) * dxdy;
      const float2 off20 = (20.0 + g21/pair20) * dxdy;
      //  Accumulate from the far negative tap to the far positive tap:
      float3 acc = float3(0.0,0.0,0.0);
      acc += pair20 * tex2D_linearize(tex, tex_uv - off20).rgb;
      acc += pair18 * tex2D_linearize(tex, tex_uv - off18).rgb;
      acc += pair16 * tex2D_linearize(tex, tex_uv - off16).rgb;
      acc += pair14 * tex2D_linearize(tex, tex_uv - off14).rgb;
      acc += pair12 * tex2D_linearize(tex, tex_uv - off12).rgb;
      acc += pair10 * tex2D_linearize(tex, tex_uv - off10).rgb;
      acc += pair8 * tex2D_linearize(tex, tex_uv - off8).rgb;
      acc += pair6 * tex2D_linearize(tex, tex_uv - off6).rgb;
      acc += pair4 * tex2D_linearize(tex, tex_uv - off4).rgb;
      acc += pair2 * tex2D_linearize(tex, tex_uv - off2).rgb;
      acc += pair0 * tex2D_linearize(tex, tex_uv - off0).rgb;
      acc += pair0 * tex2D_linearize(tex, tex_uv + off0).rgb;
      acc += pair2 * tex2D_linearize(tex, tex_uv + off2).rgb;
      acc += pair4 * tex2D_linearize(tex, tex_uv + off4).rgb;
      acc += pair6 * tex2D_linearize(tex, tex_uv + off6).rgb;
      acc += pair8 * tex2D_linearize(tex, tex_uv + off8).rgb;
      acc += pair10 * tex2D_linearize(tex, tex_uv + off10).rgb;
      acc += pair12 * tex2D_linearize(tex, tex_uv + off12).rgb;
      acc += pair14 * tex2D_linearize(tex, tex_uv + off14).rgb;
      acc += pair16 * tex2D_linearize(tex, tex_uv + off16).rgb;
      acc += pair18 * tex2D_linearize(tex, tex_uv + off18).rgb;
      acc += pair20 * tex2D_linearize(tex, tex_uv + off20).rgb;
      return acc * norm;
  }
  float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      //  Requires:   Same as tex2Dblur11()
      //  Returns:    A 1D 31-texel Gaussian blurred texture lookup made of
      //              16 linear taps.  It may be mipmapped depending on
      //              settings and dxdy.
      //  Method:     Texels are grouped into adjacent pairs on each side of
      //              the center (0-1, 2-3, ..., 14-15).  Each pair is read
      //              with one tap positioned between its two texels
      //              according to their relative Gaussian weights.
      //  Unnormalized Gaussian weight of the texel k texels away from the
      //  center: exp(-k*k * 0.5/(sigma*sigma)).
      const float inv_two_var = 0.5/(sigma*sigma);
      const float g0 = 1.0;
      const float g1 = exp(-1.0 * inv_two_var);
      const float g2 = exp(-4.0 * inv_two_var);
      const float g3 = exp(-9.0 * inv_two_var);
      const float g4 = exp(-16.0 * inv_two_var);
      const float g5 = exp(-25.0 * inv_two_var);
      const float g6 = exp(-36.0 * inv_two_var);
      const float g7 = exp(-49.0 * inv_two_var);
      const float g8 = exp(-64.0 * inv_two_var);
      const float g9 = exp(-81.0 * inv_two_var);
      const float g10 = exp(-100.0 * inv_two_var);
      const float g11 = exp(-121.0 * inv_two_var);
      const float g12 = exp(-144.0 * inv_two_var);
      const float g13 = exp(-169.0 * inv_two_var);
      const float g14 = exp(-196.0 * inv_two_var);
      const float g15 = exp(-225.0 * inv_two_var);
      //  Normalization factor.  The explicit form would be
      //      1.0 / (g0 + 2.0 * (g1 + g2 + g3 + g4 + g5 + g6 + g7 + g8 +
      //          g9 + g10 + g11 + g12 + g13 + g14 + g15)),
      //  but the value comes from get_fast_gaussian_weight_sum_inv() instead:
      const float norm = get_fast_gaussian_weight_sum_inv(sigma);
      //  Combined weight of each texel pair.  The center texel (weight g0)
      //  is covered by both innermost taps, so each gets half of it.
      const float pw0 = g0 * 0.5 + g1;
      const float pw2 = g2 + g3;
      const float pw4 = g4 + g5;
      const float pw6 = g6 + g7;
      const float pw8 = g8 + g9;
      const float pw10 = g10 + g11;
      const float pw12 = g12 + g13;
      const float pw14 = g14 + g15;
      //  UV displacement of each tap from the center: the index of the
      //  pair's inner texel plus the outer texel's share of the pair weight.
      //  Each displacement is used once with each sign.
      const float2 d0 = (g1/pw0) * dxdy;
      const float2 d2 = (2.0 + g3/pw2) * dxdy;
      const float2 d4 = (4.0 + g5/pw4) * dxdy;
      const float2 d6 = (6.0 + g7/pw6) * dxdy;
      const float2 d8 = (8.0 + g9/pw8) * dxdy;
      const float2 d10 = (10.0 + g11/pw10) * dxdy;
      const float2 d12 = (12.0 + g13/pw12) * dxdy;
      const float2 d14 = (14.0 + g15/pw14) * dxdy;
      //  Accumulate the weighted taps from the most negative displacement to
      //  the most positive, then normalize:
      float3 acc = float3(0.0,0.0,0.0);
      acc += pw14 * tex2D_linearize(tex, tex_uv - d14).rgb;
      acc += pw12 * tex2D_linearize(tex, tex_uv - d12).rgb;
      acc += pw10 * tex2D_linearize(tex, tex_uv - d10).rgb;
      acc += pw8 * tex2D_linearize(tex, tex_uv - d8).rgb;
      acc += pw6 * tex2D_linearize(tex, tex_uv - d6).rgb;
      acc += pw4 * tex2D_linearize(tex, tex_uv - d4).rgb;
      acc += pw2 * tex2D_linearize(tex, tex_uv - d2).rgb;
      acc += pw0 * tex2D_linearize(tex, tex_uv - d0).rgb;
      acc += pw0 * tex2D_linearize(tex, tex_uv + d0).rgb;
      acc += pw2 * tex2D_linearize(tex, tex_uv + d2).rgb;
      acc += pw4 * tex2D_linearize(tex, tex_uv + d4).rgb;
      acc += pw6 * tex2D_linearize(tex, tex_uv + d6).rgb;
      acc += pw8 * tex2D_linearize(tex, tex_uv + d8).rgb;
      acc += pw10 * tex2D_linearize(tex, tex_uv + d10).rgb;
      acc += pw12 * tex2D_linearize(tex, tex_uv + d12).rgb;
      acc += pw14 * tex2D_linearize(tex, tex_uv + d14).rgb;
      return acc * norm;
  }
  float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      //  Requires:   Same as tex2Dblur11()
      //  Returns:    A 1D 25-texel Gaussian blurred texture lookup made of
      //              1 nearest-neighbor tap (the center) and 12 linear taps.
      //              It may be mipmapped depending on settings and dxdy.
      //  Method:     The center texel is read on its own.  The remaining
      //              texels are grouped into adjacent pairs on each side
      //              (1-2, 3-4, ..., 11-12), and each pair is read with one
      //              tap positioned between its two texels according to
      //              their relative Gaussian weights.
      //  Unnormalized Gaussian weight of the texel k texels away from the
      //  center: exp(-k*k * 0.5/(sigma*sigma)).
      const float inv_two_var = 0.5/(sigma*sigma);
      const float g0 = 1.0;
      const float g1 = exp(-1.0 * inv_two_var);
      const float g2 = exp(-4.0 * inv_two_var);
      const float g3 = exp(-9.0 * inv_two_var);
      const float g4 = exp(-16.0 * inv_two_var);
      const float g5 = exp(-25.0 * inv_two_var);
      const float g6 = exp(-36.0 * inv_two_var);
      const float g7 = exp(-49.0 * inv_two_var);
      const float g8 = exp(-64.0 * inv_two_var);
      const float g9 = exp(-81.0 * inv_two_var);
      const float g10 = exp(-100.0 * inv_two_var);
      const float g11 = exp(-121.0 * inv_two_var);
      const float g12 = exp(-144.0 * inv_two_var);
      //  Normalization factor.  The explicit form would be
      //      1.0 / (g0 + 2.0 * (
      //          g1 + g2 + g3 + g4 + g5 + g6 + g7 + g8 + g9 + g10 + g11 + g12)),
      //  but the value comes from get_fast_gaussian_weight_sum_inv() instead:
      const float norm = get_fast_gaussian_weight_sum_inv(sigma);
      //  Combined weight of each texel pair:
      const float pw1 = g1 + g2;
      const float pw3 = g3 + g4;
      const float pw5 = g5 + g6;
      const float pw7 = g7 + g8;
      const float pw9 = g9 + g10;
      const float pw11 = g11 + g12;
      //  UV displacement of each tap from the center: the index of the
      //  pair's inner texel plus the outer texel's share of the pair weight.
      //  Each displacement is used once with each sign.
      const float2 d1 = (1.0 + g2/pw1) * dxdy;
      const float2 d3 = (3.0 + g4/pw3) * dxdy;
      const float2 d5 = (5.0 + g6/pw5) * dxdy;
      const float2 d7 = (7.0 + g8/pw7) * dxdy;
      const float2 d9 = (9.0 + g10/pw9) * dxdy;
      const float2 d11 = (11.0 + g12/pw11) * dxdy;
      //  Accumulate the weighted taps from the most negative displacement to
      //  the most positive, then normalize:
      float3 acc = float3(0.0,0.0,0.0);
      acc += pw11 * tex2D_linearize(tex, tex_uv - d11).rgb;
      acc += pw9 * tex2D_linearize(tex, tex_uv - d9).rgb;
      acc += pw7 * tex2D_linearize(tex, tex_uv - d7).rgb;
      acc += pw5 * tex2D_linearize(tex, tex_uv - d5).rgb;
      acc += pw3 * tex2D_linearize(tex, tex_uv - d3).rgb;
      acc += pw1 * tex2D_linearize(tex, tex_uv - d1).rgb;
      acc += g0 * tex2D_linearize(tex, tex_uv).rgb;
      acc += pw1 * tex2D_linearize(tex, tex_uv + d1).rgb;
      acc += pw3 * tex2D_linearize(tex, tex_uv + d3).rgb;
      acc += pw5 * tex2D_linearize(tex, tex_uv + d5).rgb;
      acc += pw7 * tex2D_linearize(tex, tex_uv + d7).rgb;
      acc += pw9 * tex2D_linearize(tex, tex_uv + d9).rgb;
      acc += pw11 * tex2D_linearize(tex, tex_uv + d11).rgb;
      return acc * norm;
  }
  float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      //  Requires:   Same as tex2Dblur11()
      //  Returns:    A 1D 17-texel Gaussian blurred texture lookup made of
      //              1 nearest-neighbor tap (the center) and 8 linear taps.
      //              It may be mipmapped depending on settings and dxdy.
      //  Method:     The center texel is read on its own.  The remaining
      //              texels are grouped into adjacent pairs on each side
      //              (1-2, 3-4, 5-6, 7-8), and each pair is read with one
      //              tap positioned between its two texels according to
      //              their relative Gaussian weights.
      //  Unnormalized Gaussian weight of the texel k texels away from the
      //  center: exp(-k*k * 0.5/(sigma*sigma)).
      const float inv_two_var = 0.5/(sigma*sigma);
      const float g0 = 1.0;
      const float g1 = exp(-1.0 * inv_two_var);
      const float g2 = exp(-4.0 * inv_two_var);
      const float g3 = exp(-9.0 * inv_two_var);
      const float g4 = exp(-16.0 * inv_two_var);
      const float g5 = exp(-25.0 * inv_two_var);
      const float g6 = exp(-36.0 * inv_two_var);
      const float g7 = exp(-49.0 * inv_two_var);
      const float g8 = exp(-64.0 * inv_two_var);
      //  Normalization factor.  The explicit form would be
      //      1.0 / (g0 + 2.0 * (g1 + g2 + g3 + g4 + g5 + g6 + g7 + g8)),
      //  but the value comes from get_fast_gaussian_weight_sum_inv() instead:
      const float norm = get_fast_gaussian_weight_sum_inv(sigma);
      //  Combined weight of each texel pair:
      const float pw1 = g1 + g2;
      const float pw3 = g3 + g4;
      const float pw5 = g5 + g6;
      const float pw7 = g7 + g8;
      //  UV displacement of each tap from the center: the index of the
      //  pair's inner texel plus the outer texel's share of the pair weight.
      //  Each displacement is used once with each sign.
      const float2 d1 = (1.0 + g2/pw1) * dxdy;
      const float2 d3 = (3.0 + g4/pw3) * dxdy;
      const float2 d5 = (5.0 + g6/pw5) * dxdy;
      const float2 d7 = (7.0 + g8/pw7) * dxdy;
      //  Accumulate the weighted taps from the most negative displacement to
      //  the most positive, then normalize:
      float3 acc = float3(0.0,0.0,0.0);
      acc += pw7 * tex2D_linearize(tex, tex_uv - d7).rgb;
      acc += pw5 * tex2D_linearize(tex, tex_uv - d5).rgb;
      acc += pw3 * tex2D_linearize(tex, tex_uv - d3).rgb;
      acc += pw1 * tex2D_linearize(tex, tex_uv - d1).rgb;
      acc += g0 * tex2D_linearize(tex, tex_uv).rgb;
      acc += pw1 * tex2D_linearize(tex, tex_uv + d1).rgb;
      acc += pw3 * tex2D_linearize(tex, tex_uv + d3).rgb;
      acc += pw5 * tex2D_linearize(tex, tex_uv + d5).rgb;
      acc += pw7 * tex2D_linearize(tex, tex_uv + d7).rgb;
      return acc * norm;
  }
  7086. //////////////////// ARBITRARILY RESIZABLE ONE-PASS BLURS ////////////////////
  float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      //  Requires:   Global requirements must be met (see file description).
      //  Returns:    A 3x3 Gaussian blurred mipmapped texture lookup of the
      //              resized input.
      //  Description:
      //  The only arbitrarily resizable one-pass blur in this family.  (Per
      //  the original author: a tex2Dblur5x5resize would perform like
      //  tex2Dblur9x9, MUCH slower than tex2Dblur5resize.)  All 3x3 samples
      //  are loaded individually; the original notes that quad-pixel
      //  communication won't help, since this should perform like
      //  tex2Dblur5x5 while sharing a 4x4 sample field would perform more
      //  like tex2Dblur8x8shared (worse).
      //  Gaussian weights by distance class: center, the four edge
      //  neighbors (squared distance 1), the four corners (squared
      //  distance 2):
      const float inv_two_var = 0.5/(sigma*sigma);
      const float wt_center = 1.0;
      const float wt_edge = exp(-LENGTH_SQ(float2(1.0, 0.0)) * inv_two_var);
      const float wt_corner = exp(-LENGTH_SQ(float2(1.0, 1.0)) * inv_two_var);
      const float norm = 1.0/(wt_center + 4.0 * (wt_edge + wt_corner));
      //  Per-axis steps and the uv coordinates of the two outer rows:
      const float2 step_x = float2(dxdy.x, 0.0);
      const float2 step_y = float2(0.0, dxdy.y);
      const float2 uv_row_m = tex_uv - step_y;
      const float2 uv_row_p = tex_uv + step_y;
      //  Load the nine taps.  The two suffix letters give the x offset then
      //  the y offset, in units of dxdy: m = -1, z = 0, p = +1.
      const float3 tap_mm = tex2D_linearize(tex, uv_row_m - step_x).rgb;
      const float3 tap_zm = tex2D_linearize(tex, uv_row_m).rgb;
      const float3 tap_pm = tex2D_linearize(tex, uv_row_m + step_x).rgb;
      const float3 tap_mz = tex2D_linearize(tex, tex_uv - step_x).rgb;
      const float3 tap_zz = tex2D_linearize(tex, tex_uv).rgb;
      const float3 tap_pz = tex2D_linearize(tex, tex_uv + step_x).rgb;
      const float3 tap_mp = tex2D_linearize(tex, uv_row_p - step_x).rgb;
      const float3 tap_zp = tex2D_linearize(tex, uv_row_p).rgb;
      const float3 tap_pp = tex2D_linearize(tex, uv_row_p + step_x).rgb;
      //  Weight each distance class, sum, and normalize:
      float3 acc = wt_center * tap_zz;
      acc += wt_edge * (tap_zm + tap_mz + tap_pz + tap_zp);
      acc += wt_corner * (tap_mm + tap_pm + tap_mp + tap_pp);
      return acc * norm;
  }
  7125. //////////////////////////// FASTER ONE-PASS BLURS ///////////////////////////
  float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      //  Purpose:    One-pass 9x9 Gaussian blur built from a 5x5 grid of
      //              bilinear samples.
      //  Requires:   Same as tex2Dblur9()
      //  Returns:    A 9x9 Gaussian blurred mipmapped texture lookup composed
      //              of 5x5 carefully selected bilinear samples.
      //  Description:
      //  Every bilinear sample is shifted inside its texel block so that the
      //  hardware filter reproduces the true Gaussian weights of the texels
      //  underneath it.  Sample layout (samples with the same number share a
      //  weight; a, b, c, d name the quadrants; U, D, L, R, C = up, down,
      //  left, right, center along the lines through the pixel center):
      //      6a 5a 2U 5b 6b
      //      4a 3a 1U 3b 4b
      //      2L 1L 0C 1R 2R
      //      4c 3c 1D 3d 4d
      //      6c 5c 2D 5d 6d
      //  Underlying equally spaced texels, named after the sample that reads
      //  them plus their position inside that sample's 2x2, 2x1, 1x2, or 1x1
      //  texel block:
      //      6a4 6a3 5a4 5a3 2U2 5b3 5b4 6b3 6b4
      //      6a2 6a1 5a2 5a1 2U1 5b1 5b2 6b1 6b2
      //      4a4 4a3 3a4 3a3 1U2 3b3 3b4 4b3 4b4
      //      4a2 4a1 3a2 3a1 1U1 3b1 3b2 4b1 4b2
      //      2L2 2L1 1L2 1L1 0C1 1R1 1R2 2R1 2R2
      //      4c2 4c1 3c2 3c1 1D1 3d1 3d2 4d1 4d2
      //      4c4 4c3 3c4 3c3 1D2 3d3 3d4 4d3 4d4
      //      6c2 6c1 5c2 5c1 2D1 5d1 5d2 6d1 6d2
      //      6c4 6c3 5c4 5c3 2D2 5d3 5d4 6d3 6d4
      //  There is a single C texel and two texels per U, D, L, or R sample:
      //  the center is effectively a nearest-neighbor sample and U/D/L/R use
      //  1D linear filtering.  Everything else is read with a bilinear
      //  sample somewhere inside its 2x2 texel block.

      //  TEXTURE COORDS:
      //  1D Gaussian weights for texels 1..4 texels away from the pixel
      //  center, and the resulting 1D sampling positions inside the texel
      //  pairs [1, 2] and [3, 4].  The same 1D positions are reused
      //  independently for both dimensions.
      const float inv_two_var = 0.5/(sigma*sigma);
      const float g1 = exp(-1.0 * inv_two_var);
      const float g2 = exp(-4.0 * inv_two_var);
      const float g3 = exp(-9.0 * inv_two_var);
      const float g4 = exp(-16.0 * inv_two_var);
      const float pos12 = 1.0 + g2/(g1 + g2);
      const float pos34 = 3.0 + g4/(g3 + g4);
      //  Texel offsets from the fragment center to each sample of the
      //  bottom-right quadrant (including the x-axis-aligned ones):
      const float2 off1R = float2(pos12, 0.0);
      const float2 off2R = float2(pos34, 0.0);
      const float2 off3d = float2(pos12, pos12);
      const float2 off4d = float2(pos34, pos12);
      const float2 off5d = float2(pos12, pos34);
      const float2 off6d = float2(pos34, pos34);

      //  KERNEL WEIGHTS:
      //  2D Gaussian texel weights for the bottom-right quadrant; wt_X_Y is
      //  the weight of the texel X texels right of and Y texels below the
      //  center (symmetric texels share one value).
      const float wt_1_1 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * inv_two_var);
      const float wt_2_1 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * inv_two_var);
      const float wt_2_2 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * inv_two_var);
      const float wt_3_1 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * inv_two_var);
      const float wt_4_1 = exp(-LENGTH_SQ(float2(4.0, 1.0)) * inv_two_var);
      const float wt_3_2 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * inv_two_var);
      const float wt_4_2 = exp(-LENGTH_SQ(float2(4.0, 2.0)) * inv_two_var);
      const float wt_3_3 = exp(-LENGTH_SQ(float2(3.0, 3.0)) * inv_two_var);
      const float wt_4_3 = exp(-LENGTH_SQ(float2(4.0, 3.0)) * inv_two_var);
      const float wt_4_4 = exp(-LENGTH_SQ(float2(4.0, 4.0)) * inv_two_var);
      //  A sample's weight is the sum of the weights of the texels it reads:
      const float sw0 = 1.0;
      const float sw1 = g1 + g2;
      const float sw2 = g3 + g4;
      const float sw3 = wt_1_1 + 2.0 * wt_2_1 + wt_2_2;
      const float sw4 = wt_3_1 + wt_4_1 + wt_3_2 + wt_4_2;
      const float sw5 = sw4;
      const float sw6 = wt_3_3 + 2.0 * wt_4_3 + wt_4_4;
      //  Normalization factor (inverse of the total weight):
      const float norm =
          1.0/(sw0 + 4.0 * (sw1 + sw2 + sw3 + sw4 + sw5 + sw6));

      //  TEXTURE SAMPLES:
      //  Load all 25 samples (1 nearest, 8 linear, 16 bilinear).  Quadrants
      //  other than the bottom-right one reuse its offsets with the uv step
      //  negated in x (c), in y (b), or in both (a).
      const float2 step_c = float2(-dxdy.x, dxdy.y);
      const float2 step_b = float2(dxdy.x, -dxdy.y);
      const float2 step_a = float2(-dxdy.x, -dxdy.y);
      //  The original notes sampling order doesn't seem to affect
      //  performance, so the loads are simply listed per sample number:
      const float3 tap0C = tex2D_linearize(tex, tex_uv).rgb;
      const float3 tap1R = tex2D_linearize(tex, tex_uv + dxdy * off1R).rgb;
      const float3 tap1D = tex2D_linearize(tex, tex_uv + dxdy * off1R.yx).rgb;
      const float3 tap1L = tex2D_linearize(tex, tex_uv - dxdy * off1R).rgb;
      const float3 tap1U = tex2D_linearize(tex, tex_uv - dxdy * off1R.yx).rgb;
      const float3 tap2R = tex2D_linearize(tex, tex_uv + dxdy * off2R).rgb;
      const float3 tap2D = tex2D_linearize(tex, tex_uv + dxdy * off2R.yx).rgb;
      const float3 tap2L = tex2D_linearize(tex, tex_uv - dxdy * off2R).rgb;
      const float3 tap2U = tex2D_linearize(tex, tex_uv - dxdy * off2R.yx).rgb;
      const float3 tap3d = tex2D_linearize(tex, tex_uv + dxdy * off3d).rgb;
      const float3 tap3c = tex2D_linearize(tex, tex_uv + step_c * off3d).rgb;
      const float3 tap3b = tex2D_linearize(tex, tex_uv + step_b * off3d).rgb;
      const float3 tap3a = tex2D_linearize(tex, tex_uv + step_a * off3d).rgb;
      const float3 tap4d = tex2D_linearize(tex, tex_uv + dxdy * off4d).rgb;
      const float3 tap4c = tex2D_linearize(tex, tex_uv + step_c * off4d).rgb;
      const float3 tap4b = tex2D_linearize(tex, tex_uv + step_b * off4d).rgb;
      const float3 tap4a = tex2D_linearize(tex, tex_uv + step_a * off4d).rgb;
      const float3 tap5d = tex2D_linearize(tex, tex_uv + dxdy * off5d).rgb;
      const float3 tap5c = tex2D_linearize(tex, tex_uv + step_c * off5d).rgb;
      const float3 tap5b = tex2D_linearize(tex, tex_uv + step_b * off5d).rgb;
      const float3 tap5a = tex2D_linearize(tex, tex_uv + step_a * off5d).rgb;
      const float3 tap6d = tex2D_linearize(tex, tex_uv + dxdy * off6d).rgb;
      const float3 tap6c = tex2D_linearize(tex, tex_uv + step_c * off6d).rgb;
      const float3 tap6b = tex2D_linearize(tex, tex_uv + step_b * off6d).rgb;
      const float3 tap6a = tex2D_linearize(tex, tex_uv + step_a * off6d).rgb;

      //  WEIGHTED SUM:
      //  Sum each group of equally weighted samples, then normalize so the
      //  total weight is 1.0.
      float3 acc = sw0 * tap0C;
      acc += sw1 * (tap1R + tap1D + tap1L + tap1U);
      acc += sw2 * (tap2R + tap2D + tap2L + tap2U);
      acc += sw3 * (tap3d + tap3c + tap3b + tap3a);
      acc += sw4 * (tap4d + tap4c + tap4b + tap4a);
      acc += sw5 * (tap5d + tap5c + tap5b + tap5a);
      acc += sw6 * (tap6d + tap6c + tap6b + tap6a);
      return acc * norm;
  }
  float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      //  Purpose:    One-pass 7x7 Gaussian blur built from a 4x4 grid of
      //              bilinear samples (16 texture reads).
      //  Requires:   Same as tex2Dblur9()
      //  Returns:    A 7x7 Gaussian blurred mipmapped texture lookup composed
      //              of 4x4 carefully selected bilinear samples.
      //  Description:
      //  See the descriptions of tex2Dblur9x9() and tex2Dblur7() first; this
      //  blur mixes concepts from both.  Sample layout:
      //      4a 3a 3b 4b
      //      2a 1a 1b 2b
      //      2c 1c 1d 2d
      //      4c 3c 3d 4d
      //  Texel layout.  Samples 3a/3b, 1a/1b, 1c/1d, and 3c/3d share a
      //  vertical column of texels; samples 2a/2c, 1a/1c, 1b/1d, and 2b/2d
      //  share a horizontal row of texels; all four sample1's share the
      //  center texel:
      //      4a4  4a3  3a4  3ab3 3b4  4b3  4b4
      //      4a2  4a1  3a2  3ab1 3b2  4b1  4b2
      //      2a4  2a3  1a4  1ab3 1b4  2b3  2b4
      //      2ac2 2ac1 1ac2 1*   1bd2 2bd1 2bd2
      //      2c4  2c3  1c4  1cd3 1d4  2d3  2d4
      //      4c2  4c1  3c2  3cd1 3d2  4d1  4d2
      //      4c4  4c3  3c4  3cd3 3d4  4d3  4d4

      //  TEXTURE COORDS:
      //  1D Gaussian weights for texels 0..3 texels away from the pixel
      //  center, and the resulting 1D sampling positions inside the texel
      //  pairs [0, 1] and [2, 3] (details in tex2Dblur9x9).  The center
      //  texel is shared by the samples on both sides, so only half of its
      //  weight enters the [0, 1] ratio.
      const float inv_two_var = 0.5/(sigma*sigma);
      const float g0 = 1.0;
      const float g1 = exp(-1.0 * inv_two_var);
      const float g2 = exp(-4.0 * inv_two_var);
      const float g3 = exp(-9.0 * inv_two_var);
      const float pos01 = g1/(g0 * 0.5 + g1);
      const float pos23 = 2.0 + g3/(g2 + g3);
      //  Texel offsets from the fragment center to each sample of the
      //  bottom-right quadrant:
      const float2 off1d = float2(pos01, pos01);
      const float2 off2d = float2(pos23, pos01);
      const float2 off3d = float2(pos01, pos23);
      const float2 off4d = float2(pos23, pos23);

      //  KERNEL WEIGHTS:
      //  2D Gaussian texel weights for the bottom-right quadrant; wt_X_Y is
      //  the weight of the texel X texels right of and Y texels below the
      //  center (symmetric texels share one value).
      const float wt_center = 1.0;
      const float wt_1_0 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * inv_two_var);
      const float wt_2_0 = exp(-LENGTH_SQ(float2(2.0, 0.0)) * inv_two_var);
      const float wt_3_0 = exp(-LENGTH_SQ(float2(3.0, 0.0)) * inv_two_var);
      const float wt_1_1 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * inv_two_var);
      const float wt_2_1 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * inv_two_var);
      const float wt_3_1 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * inv_two_var);
      const float wt_2_2 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * inv_two_var);
      const float wt_3_2 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * inv_two_var);
      const float wt_3_3 = exp(-LENGTH_SQ(float2(3.0, 3.0)) * inv_two_var);
      //  A sample's weight is the sum of the weights of the texels it reads,
      //  with the weight of each shared texel split between the samples
      //  sharing it (1/4 of the center; 1/2 of each axis texel, where the
      //  two half-weighted axis texels of sample 1 add up to one wt_1_0):
      const float sw1 = wt_center * 0.25 + wt_1_0 + wt_1_1;
      const float sw2_3 = (wt_2_0 + wt_3_0) * 0.5 + wt_2_1 + wt_3_1;
      const float sw4 = wt_2_2 + 2.0 * wt_3_2 + wt_3_3;
      //  Normalization factor (inverse of the total weight):
      const float norm =
          1.0/(4.0 * (sw1 + 2.0 * sw2_3 + sw4));

      //  TEXTURE SAMPLES:
      //  Load all 16 samples.  Quadrants other than the bottom-right one (d)
      //  reuse its offsets with the uv step negated in x (c), in y (b), or
      //  in both (a).
      const float2 step_c = float2(-dxdy.x, dxdy.y);
      const float2 step_b = float2(dxdy.x, -dxdy.y);
      const float2 step_a = float2(-dxdy.x, -dxdy.y);
      const float3 tap1a = tex2D_linearize(tex, tex_uv + step_a * off1d).rgb;
      const float3 tap1b = tex2D_linearize(tex, tex_uv + step_b * off1d).rgb;
      const float3 tap1c = tex2D_linearize(tex, tex_uv + step_c * off1d).rgb;
      const float3 tap1d = tex2D_linearize(tex, tex_uv + dxdy * off1d).rgb;
      const float3 tap2a = tex2D_linearize(tex, tex_uv + step_a * off2d).rgb;
      const float3 tap2b = tex2D_linearize(tex, tex_uv + step_b * off2d).rgb;
      const float3 tap2c = tex2D_linearize(tex, tex_uv + step_c * off2d).rgb;
      const float3 tap2d = tex2D_linearize(tex, tex_uv + dxdy * off2d).rgb;
      const float3 tap3a = tex2D_linearize(tex, tex_uv + step_a * off3d).rgb;
      const float3 tap3b = tex2D_linearize(tex, tex_uv + step_b * off3d).rgb;
      const float3 tap3c = tex2D_linearize(tex, tex_uv + step_c * off3d).rgb;
      const float3 tap3d = tex2D_linearize(tex, tex_uv + dxdy * off3d).rgb;
      const float3 tap4a = tex2D_linearize(tex, tex_uv + step_a * off4d).rgb;
      const float3 tap4b = tex2D_linearize(tex, tex_uv + step_b * off4d).rgb;
      const float3 tap4c = tex2D_linearize(tex, tex_uv + step_c * off4d).rgb;
      const float3 tap4d = tex2D_linearize(tex, tex_uv + dxdy * off4d).rgb;

      //  WEIGHTED SUM:
      //  Sum each group of equally weighted samples, then normalize so the
      //  total weight is 1.0.
      float3 acc = float3(0.0,0.0,0.0);
      acc += sw1 * (tap1a + tap1b + tap1c + tap1d);
      acc += sw2_3 * (tap2a + tap2b + tap2c + tap2d);
      acc += sw2_3 * (tap3a + tap3b + tap3c + tap3d);
      acc += sw4 * (tap4a + tap4b + tap4c + tap4d);
      return acc * norm;
  }
  float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      //  Purpose:    One-pass 5x5 Gaussian blur built from a 3x3 grid of
      //              bilinear samples.
      //  Requires:   Same as tex2Dblur9()
      //  Returns:    A 5x5 Gaussian blurred mipmapped texture lookup composed
      //              of 3x3 carefully selected bilinear samples.
      //  Description:
      //  See the description of tex2Dblur9x9() first.  This blur uses the
      //  same concept and sample/texel naming on a smaller scale.  Samples:
      //      2a 1U 2b
      //      1L 0C 1R
      //      2c 1D 2d
      //  Texels:
      //      2a4 2a3 1U2 2b3 2b4
      //      2a2 2a1 1U1 2b1 2b2
      //      1L2 1L1 0C1 1R1 1R2
      //      2c2 2c1 1D1 2d1 2d2
      //      2c4 2c3 1D2 2d3 2d4

      //  TEXTURE COORDS:
      //  1D Gaussian weights for texels 1 and 2 texels away from the pixel
      //  center, and the resulting 1D sampling position inside the texel
      //  pair [1, 2] (details in tex2Dblur9x9):
      const float inv_two_var = 0.5/(sigma*sigma);
      const float g1 = exp(-1.0 * inv_two_var);
      const float g2 = exp(-4.0 * inv_two_var);
      const float pos12 = 1.0 + g2/(g1 + g2);
      //  Texel offsets from the fragment center to the x-axis-aligned sample
      //  and to the bilinear sample of the bottom-right quadrant:
      const float2 off1R = float2(pos12, 0.0);
      const float2 off2d = float2(pos12, pos12);

      //  KERNEL WEIGHTS:
      //  2D Gaussian texel weights for the bottom-right quadrant; wt_X_Y is
      //  the weight of the texel X texels right of and Y texels below the
      //  center (symmetric texels share one value).
      const float wt_1_1 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * inv_two_var);
      const float wt_2_1 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * inv_two_var);
      const float wt_2_2 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * inv_two_var);
      //  A sample's weight is the sum of the weights of the texels it reads:
      const float sw0 = 1.0;
      const float sw1 = g1 + g2;
      const float sw2 = wt_1_1 + 2.0 * wt_2_1 + wt_2_2;
      //  Normalization factor (inverse of the total weight):
      const float norm = 1.0/(sw0 + 4.0 * (sw1 + sw2));

      //  TEXTURE SAMPLES:
      //  Load all 9 samples (1 nearest, 4 linear, 4 bilinear).  Quadrants
      //  other than the bottom-right one (d) reuse its offset with the uv
      //  step negated in x (c), in y (b), or in both (a).
      const float2 step_c = float2(-dxdy.x, dxdy.y);
      const float2 step_b = float2(dxdy.x, -dxdy.y);
      const float2 step_a = float2(-dxdy.x, -dxdy.y);
      const float3 tap0C = tex2D_linearize(tex, tex_uv).rgb;
      const float3 tap1R = tex2D_linearize(tex, tex_uv + dxdy * off1R).rgb;
      const float3 tap1D = tex2D_linearize(tex, tex_uv + dxdy * off1R.yx).rgb;
      const float3 tap1L = tex2D_linearize(tex, tex_uv - dxdy * off1R).rgb;
      const float3 tap1U = tex2D_linearize(tex, tex_uv - dxdy * off1R.yx).rgb;
      const float3 tap2a = tex2D_linearize(tex, tex_uv + step_a * off2d).rgb;
      const float3 tap2b = tex2D_linearize(tex, tex_uv + step_b * off2d).rgb;
      const float3 tap2c = tex2D_linearize(tex, tex_uv + step_c * off2d).rgb;
      const float3 tap2d = tex2D_linearize(tex, tex_uv + dxdy * off2d).rgb;

      //  WEIGHTED SUM:
      //  Sum each group of equally weighted samples, then normalize so the
      //  total weight is 1.0.
      float3 acc = sw0 * tap0C;
      acc += sw1 * (tap1R + tap1D + tap1L + tap1U);
      acc += sw2 * (tap2a + tap2b + tap2c + tap2d);
      return acc * norm;
  }
  float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      //  Purpose:    One-pass 3x3 Gaussian blur built from a 2x2 grid of
      //              bilinear samples (4 texture reads).
      //  Requires:   Same as tex2Dblur9()
      //  Returns:    A 3x3 Gaussian blurred mipmapped texture lookup composed
      //              of 2x2 carefully selected bilinear samples.
      //  Description:
      //  See the descriptions of tex2Dblur9x9() and tex2Dblur7() first; this
      //  blur mixes concepts from both.  Sample layout:
      //      0a 0b
      //      0c 0d
      //  Texel layout.  Samples 0a/0b and 0c/0d share a vertical column of
      //  texels, samples 0a/0c and 0b/0d share a horizontal row of texels,
      //  and all four samples share the center texel:
      //      0a3  0ab2 0b3
      //      0ac1 0*0  0bd1
      //      0c3  0cd2 0d3

      //  TEXTURE COORDS:
      //  1D sampling position inside the texel pair [0, 1] (details in
      //  tex2Dblur9x9).  The center texel is shared by the samples on both
      //  sides, so only half of its weight enters the ratio.
      const float inv_two_var = 0.5/(sigma*sigma);
      const float g0 = 1.0;
      const float g1 = exp(-1.0 * inv_two_var);
      const float pos01 = g1/(g0 * 0.5 + g1);
      //  Texel offset from the fragment center to the bottom-right sample:
      const float2 off0d = float2(pos01, pos01);

      //  TEXTURE SAMPLES:
      //  Load all 4 samples.  Quadrants other than the bottom-right one (d)
      //  reuse its offset with the uv step negated in x (c), in y (b), or in
      //  both (a).
      const float2 step_c = float2(-dxdy.x, dxdy.y);
      const float2 step_b = float2(dxdy.x, -dxdy.y);
      const float2 step_a = float2(-dxdy.x, -dxdy.y);
      const float3 tap0a = tex2D_linearize(tex, tex_uv + step_a * off0d).rgb;
      const float3 tap0b = tex2D_linearize(tex, tex_uv + step_b * off0d).rgb;
      const float3 tap0c = tex2D_linearize(tex, tex_uv + step_c * off0d).rgb;
      const float3 tap0d = tex2D_linearize(tex, tex_uv + dxdy * off0d).rgb;

      //  WEIGHTED SUM:
      //  All four samples carry the same weight, so the result is their
      //  plain average.
      return 0.25 * (tap0a + tap0b + tap0c + tap0d);
  }
  7461. ////////////////// LINEAR ONE-PASS BLURS WITH SHARED SAMPLES /////////////////
  7462. float3 tex2Dblur12x12shared(const sampler2D tex,
  7463. const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
  7464. const float sigma)
  7465. {
  7466. // Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
  7467. // Params: tex = source sampler; tex_uv = lookup coords (.xy feeds tex2D_linearize, the whole float4 feeds tex2Dlod_linearize);
  7468. // dxdy = uv step per texel; quad_vector = handed to quad_gather(), its .xy scales dxdy; sigma = Gaussian std. dev. in texels.
  7469. // Requires: 1.) Same as tex2Dblur9()
  7470. // 2.) ddx() and ddy() are present in the current Cg profile.
  7471. // 3.) The GPU driver is using fine/high-quality derivatives.
  7472. // 4.) quad_vector *correctly* describes the current fragment's location in its pixel quad, by the conventions noted in get_quad_vector[_naive].
  7473. // 5.) tex_uv.w = log2(video_size/output_size).y
  7474. // 6.) tex2Dlod() is present in the current Cg profile.
  7475. // Optional: Tune artifacts vs. excessive blurriness with the global
  7476. // float error_blurring.
  7477. // Returns: A blurred texture lookup using a "virtual" 12x12 Gaussian
  7478. // blur (a 6x6 blur of carefully selected bilinear samples)
  7479. // of the given mip level. There will be subtle inaccuracies,
  7480. // especially for small or high-frequency detailed sources.
  7481. // Description:
  7482. // Perform a 1-pass blur with shared texture lookups across a pixel quad.
  7483. // We'll get neighboring samples with high-quality ddx/ddy derivatives, as
  7484. // in GPU Pro 2, Chapter VI.2, "Shader Amortization using Pixel Quad
  7485. // Message Passing" by Eric Penner.
  7486. //
  7487. // Per fragment, our "virtual" 12x12 blur loads (6^2)/4 + 3 = 12
  7488. // bilinear samples, where bilinear sampling positions are computed from
  7489. // the relative Gaussian weights of the 4 surrounding texels. The catch is
  7490. // that the appropriate texel weights and sample coords differ for each
  7491. // fragment, but we're reusing most of the same samples across a quad of
  7492. // destination fragments. (We do use unique coords for the four nearest
  7493. // samples at each fragment.) Mixing bilinear filtering and sample-sharing
  7494. // therefore introduces some error into the weights, and this can get nasty
  7495. // when the source image is small or high-frequency. Computing bilinear
  7496. // ratios based on weights at the sample field center results in sharpening
  7497. // and ringing artifacts, but we can move samples closer to halfway between
  7498. // texels to try blurring away the error (which can move features around by
  7499. // a texel or so). Tune this with the global float "error_blurring".
  7500. //
  7501. // The pixel quad's sample field covers 12x12 texels, accessed through 6x6
  7502. // bilinear (2x2 texel) taps. Each fragment depends on a window of 10x10
  7503. // texels (5x5 bilinear taps) [NOTE(review): the weights below cover all 6x6 taps - confirm], and each fragment is responsible for loading
  7504. // a 6x6 texel quadrant as a 3x3 block of bilinear taps, plus 3 more taps
  7505. // to use unique bilinear coords for sample0* for each fragment. This
  7506. // diagram illustrates the relative locations of bilinear samples 1-9 for
  7507. // each quadrant a, b, c, d (note samples will not be equally spaced):
  7508. // 8a 7a 6a 6b 7b 8b
  7509. // 5a 4a 3a 3b 4b 5b
  7510. // 2a 1a 0a 0b 1b 2b
  7511. // 2c 1c 0c 0d 1d 2d
  7512. // 5c 4c 3c 3d 4d 5d
  7513. // 8c 7c 6c 6d 7d 8d
  7514. // The following diagram illustrates the underlying equally spaced texels,
  7515. // named after the sample that accesses them and subnamed by their location
  7516. // within their 2x2 texel block:
  7517. // 8a3 8a2 7a3 7a2 6a3 6a2 6b2 6b3 7b2 7b3 8b2 8b3
  7518. // 8a1 8a0 7a1 7a0 6a1 6a0 6b0 6b1 7b0 7b1 8b0 8b1
  7519. // 5a3 5a2 4a3 4a2 3a3 3a2 3b2 3b3 4b2 4b3 5b2 5b3
  7520. // 5a1 5a0 4a1 4a0 3a1 3a0 3b0 3b1 4b0 4b1 5b0 5b1
  7521. // 2a3 2a2 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 2b2 2b3
  7522. // 2a1 2a0 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 2b0 2b1
  7523. // 2c1 2c0 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 2d0 2d1
  7524. // 2c3 2c2 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 2d2 2d3
  7525. // 5c1 5c0 4c1 4c0 3c1 3c0 3d0 3d1 4d0 4d1 5d0 5d1
  7526. // 5c3 5c2 4c3 4c2 3c3 3c2 3d2 3d3 4d2 4d3 5d2 5d3
  7527. // 8c1 8c0 7c1 7c0 6c1 6c0 6d0 6d1 7d0 7d1 8d0 8d1
  7528. // 8c3 8c2 7c3 7c2 6c3 6c2 6d2 6d3 7d2 7d3 8d2 8d3
  7529. // With this symmetric arrangement, we don't have to know which absolute
  7530. // quadrant a sample lies in to assign kernel weights; it's enough to know
  7531. // the sample number and the relative quadrant of the sample (relative to
  7532. // the current quadrant):
  7533. // {current, adjacent x, adjacent y, diagonal}
  7534. // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
  7535. // Statically compute sampling offsets within each 2x2 texel block, based
  7536. // on appropriate 1D Gaussian sampling ratio between texels [0, 1], [2, 3],
  7537. // and [4, 5] away from the fragment, and reuse them independently for both
  7538. // dimensions. Use the sample field center as the estimated destination,
  7539. // but nudge the result closer to halfway between texels to blur error.
  7540. const float denom_inv = 0.5/(sigma*sigma);
  7541. const float w0off = 1.0;
  7542. const float w0_5off = exp(-(0.5*0.5) * denom_inv);
  7543. const float w1off = exp(-(1.0*1.0) * denom_inv);
  7544. const float w1_5off = exp(-(1.5*1.5) * denom_inv);
  7545. const float w2off = exp(-(2.0*2.0) * denom_inv);
  7546. const float w2_5off = exp(-(2.5*2.5) * denom_inv);
  7547. const float w3_5off = exp(-(3.5*3.5) * denom_inv);
  7548. const float w4_5off = exp(-(4.5*4.5) * denom_inv);
  7549. const float w5_5off = exp(-(5.5*5.5) * denom_inv);
  7550. const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
  7551. const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
  7552. const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring);
  7553. // We don't share sample0*, so use the nearest destination fragment (whole-texel weights, no error_blurring):
  7554. const float texel0to1ratio_nearest = w1off/(w0off + w1off);
  7555. const float texel1to2ratio_nearest = w2off/(w1off + w2off);
  7556. // Statically compute texel offsets from the bottom-right fragment to each
  7557. // bilinear sample in the bottom-right quadrant (base texel + in-block ratio):
  7558. const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
  7559. const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
  7560. const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
  7561. const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
  7562. const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
  7563. const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio);
  7564. const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
  7565. const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
  7566. const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio);
  7567. const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio);
  7568. const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio);
  7569. const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio);
  7570. // CALCULATE KERNEL WEIGHTS:
  7571. // Statically compute bilinear sample weights at each destination fragment
  7572. // based on the sum of their 4 underlying texel weights. Assume a same-
  7573. // resolution blur, so each symmetrically named sample weight will compute
  7574. // the same at every fragment in the pixel quad: We can therefore compute
  7575. // texel weights based only on the bottom-right quadrant (fragment at 0d0).
  7576. // To avoid too much boilerplate code, use a macro to sum all 4 texel
  7577. // weights for a bilinear sample based on the offset of its top-left texel:
  7578. #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
  7579. (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
  7580. exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
  7581. exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
  7582. exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
  7583. const float w8diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -6.0);
  7584. const float w7diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -6.0);
  7585. const float w6diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -6.0);
  7586. const float w6adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -6.0);
  7587. const float w7adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -6.0);
  7588. const float w8adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -6.0);
  7589. const float w5diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -4.0);
  7590. const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
  7591. const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
  7592. const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
  7593. const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
  7594. const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0);
  7595. const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -2.0);
  7596. const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
  7597. const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
  7598. const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
  7599. const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
  7600. const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0);
  7601. const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 0.0);
  7602. const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
  7603. const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
  7604. const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
  7605. const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
  7606. const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0);
  7607. const float w5adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 2.0);
  7608. const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
  7609. const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
  7610. const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
  7611. const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
  7612. const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0);
  7613. const float w8adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 4.0);
  7614. const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0);
  7615. const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0);
  7616. const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0);
  7617. const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0);
  7618. const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0);
  7619. #undef GET_TEXEL_QUAD_WEIGHTS
  7620. // Statically pack weights for runtime (lanes: current, adjacent x, adjacent y, diagonal):
  7621. const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
  7622. const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
  7623. const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag);
  7624. const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
  7625. const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag);
  7626. const float4 w5 = float4(w5curr, w5adjx, w5adjy, w5diag);
  7627. const float4 w6 = float4(w6curr, w6adjx, w6adjy, w6diag);
  7628. const float4 w7 = float4(w7curr, w7adjx, w7adjy, w7diag);
  7629. const float4 w8 = float4(w8curr, w8adjx, w8adjy, w8diag);
  7630. // Get the weight sum inverse (normalization factor) over all 36 weights:
  7631. const float4 weight_sum4 = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8;
  7632. const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw;
  7633. const float weight_sum = weight_sum2.x + weight_sum2.y;
  7634. const float weight_sum_inv = 1.0/(weight_sum);
  7635. // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
  7636. // Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
  7637. const float2 dxdy_curr = dxdy * quad_vector.xy;
  7638. // Load bilinear samples for the current quadrant (for this fragment); sample0* use tex2D_linearize, samples 1-8 use tex2Dlod_linearize:
  7639. const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
  7640. const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
  7641. const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
  7642. const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
  7643. const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
  7644. const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
  7645. const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
  7646. const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb;
  7647. const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb;
  7648. const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb;
  7649. const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb;
  7650. const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb;
  7651. // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
  7652. // Fetch the samples from other fragments in the 2x2 quad (sample0* are never gathered; each fragment loaded its own):
  7653. float3 sample1adjx, sample1adjy, sample1diag;
  7654. float3 sample2adjx, sample2adjy, sample2diag;
  7655. float3 sample3adjx, sample3adjy, sample3diag;
  7656. float3 sample4adjx, sample4adjy, sample4diag;
  7657. float3 sample5adjx, sample5adjy, sample5diag;
  7658. float3 sample6adjx, sample6adjy, sample6diag;
  7659. float3 sample7adjx, sample7adjy, sample7diag;
  7660. float3 sample8adjx, sample8adjy, sample8diag;
  7661. quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
  7662. quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
  7663. quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
  7664. quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag);
  7665. quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag);
  7666. quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag);
  7667. quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag);
  7668. quad_gather(quad_vector, sample8curr, sample8adjx, sample8adjy, sample8diag);
  7669. // Sum weighted samples, then normalize once at the return (so total = 1.0).
  7670. // Fill each row of a matrix with an rgb sample and pre-multiply by the
  7671. // weights to obtain a weighted result (row order must match the weight lanes):
  7672. float3 sum = float3(0.0,0.0,0.0);
  7673. sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
  7674. sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
  7675. sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag));
  7676. sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
  7677. sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag));
  7678. sum += mul(w5, float4x3(sample5curr, sample5adjx, sample5adjy, sample5diag));
  7679. sum += mul(w6, float4x3(sample6curr, sample6adjx, sample6adjy, sample6diag));
  7680. sum += mul(w7, float4x3(sample7curr, sample7adjx, sample7adjy, sample7diag));
  7681. sum += mul(w8, float4x3(sample8curr, sample8adjx, sample8adjy, sample8diag));
  7682. return sum * weight_sum_inv;
  7683. }
  7684. float3 tex2Dblur10x10shared(const sampler2D tex,
  7685. const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
  7686. const float sigma)
  7687. {
  7688. // Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
  7689. // Params: tex = source sampler; tex_uv = lookup coords (.xy feeds tex2D_linearize, the whole float4 feeds tex2Dlod_linearize); dxdy = uv step per texel; quad_vector = handed to quad_gather(), its .xy scales dxdy; sigma = Gaussian std. dev. in texels.
  7690. // Requires: Same as tex2Dblur12x12shared()
  7691. // Returns: A blurred texture lookup using a "virtual" 10x10 Gaussian blur (a 5x5 blur of carefully selected
  7692. // bilinear samples) of the given mip level. There will be subtle inaccuracies,
  7693. // especially for small or high-frequency detailed sources.
  7694. // Description:
  7695. // First see the description for tex2Dblur12x12shared(). This
  7696. // function shares the same concept and sample placement, but each fragment
  7697. // only uses 25 of the 36 samples taken across the pixel quad (to cover a
  7698. // 5x5 sample area, or 10x10 texel area), and it uses a lower standard
  7699. // deviation to compensate. Thanks to symmetry, the 11 omitted samples
  7700. // are always the "same:"
  7701. // 8adjx, 2adjx, 5adjx,
  7702. // 6adjy, 7adjy, 8adjy,
  7703. // 2diag, 5diag, 6diag, 7diag, 8diag
  7704. // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
  7705. // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
  7706. const float denom_inv = 0.5/(sigma*sigma);
  7707. const float w0off = 1.0;
  7708. const float w0_5off = exp(-(0.5*0.5) * denom_inv);
  7709. const float w1off = exp(-(1.0*1.0) * denom_inv);
  7710. const float w1_5off = exp(-(1.5*1.5) * denom_inv);
  7711. const float w2off = exp(-(2.0*2.0) * denom_inv);
  7712. const float w2_5off = exp(-(2.5*2.5) * denom_inv);
  7713. const float w3_5off = exp(-(3.5*3.5) * denom_inv);
  7714. const float w4_5off = exp(-(4.5*4.5) * denom_inv);
  7715. const float w5_5off = exp(-(5.5*5.5) * denom_inv);
  7716. const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
  7717. const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
  7718. const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring);
  7719. // We don't share sample0*, so use the nearest destination fragment (whole-texel weights, no error_blurring):
  7720. const float texel0to1ratio_nearest = w1off/(w0off + w1off);
  7721. const float texel1to2ratio_nearest = w2off/(w1off + w2off);
  7722. // Statically compute texel offsets from the bottom-right fragment to each
  7723. // bilinear sample in the bottom-right quadrant (base texel + in-block ratio):
  7724. const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
  7725. const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
  7726. const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
  7727. const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
  7728. const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
  7729. const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio);
  7730. const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
  7731. const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
  7732. const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio);
  7733. const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio);
  7734. const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio);
  7735. const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio);
  7736. // CALCULATE KERNEL WEIGHTS:
  7737. // Statically compute bilinear sample weights at each destination fragment
  7738. // from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
  7739. #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
  7740. (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
  7741. exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
  7742. exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
  7743. exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
  7744. // We only need 25 of the 36 sample weights. Skip the following weights
  7745. // (every tap whose top-left texel offset would be -6.0 in x or y): 8adjx, 2adjx, 5adjx,
  7746. // 6adjy, 7adjy, 8adjy,
  7747. // 2diag, 5diag, 6diag, 7diag, 8diag
  7748. const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
  7749. const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
  7750. const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
  7751. const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
  7752. const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0);
  7753. const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
  7754. const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
  7755. const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
  7756. const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
  7757. const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0);
  7758. const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
  7759. const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
  7760. const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
  7761. const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
  7762. const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0);
  7763. const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
  7764. const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
  7765. const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
  7766. const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
  7767. const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0);
  7768. const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0);
  7769. const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0);
  7770. const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0);
  7771. const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0);
  7772. const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0);
  7773. #undef GET_TEXEL_QUAD_WEIGHTS
  7774. // Get the weight sum inverse (normalization factor) over the 25 weights actually used:
  7775. const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr +
  7776. w4curr + w5curr + w6curr + w7curr + w8curr +
  7777. w0adjx + w1adjx + w3adjx + w4adjx + w6adjx + w7adjx +
  7778. w0adjy + w1adjy + w2adjy + w3adjy + w4adjy + w5adjy +
  7779. w0diag + w1diag + w3diag + w4diag);
  7780. // Statically pack most weights for runtime. Note the mixed packing: w2and5 = (2curr, 2adjy, 5curr, 5adjy), w6and7 = (6curr, 6adjx, 7curr, 7adjx); w8curr stays a scalar:
  7781. const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
  7782. const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
  7783. const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
  7784. const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag);
  7785. const float4 w2and5 = float4(w2curr, w2adjy, w5curr, w5adjy);
  7786. const float4 w6and7 = float4(w6curr, w6adjx, w7curr, w7adjx);
  7787. // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
  7788. // Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
  7789. const float2 dxdy_curr = dxdy * quad_vector.xy;
  7790. // Load bilinear samples for the current quadrant (for this fragment); sample0* use tex2D_linearize, samples 1-8 use tex2Dlod_linearize:
  7791. const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
  7792. const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
  7793. const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
  7794. const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
  7795. const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
  7796. const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
  7797. const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
  7798. const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb;
  7799. const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb;
  7800. const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb;
  7801. const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb;
  7802. const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb;
  7803. // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
  7804. // Fetch the samples from other fragments in the 2x2 quad in order of need (sample8 is not gathered: only 8curr is weighted below):
  7805. float3 sample1adjx, sample1adjy, sample1diag;
  7806. float3 sample2adjx, sample2adjy, sample2diag;
  7807. float3 sample3adjx, sample3adjy, sample3diag;
  7808. float3 sample4adjx, sample4adjy, sample4diag;
  7809. float3 sample5adjx, sample5adjy, sample5diag;
  7810. float3 sample6adjx, sample6adjy, sample6diag;
  7811. float3 sample7adjx, sample7adjy, sample7diag;
  7812. quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
  7813. quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
  7814. quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
  7815. quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag);
  7816. quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag);
  7817. quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag);
  7818. quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag);
  7819. // Sum weighted samples; normalization (so total = 1.0) happens at the return.
  7820. // Fill each row of a matrix with an rgb sample and pre-multiply by the
  7821. // weights to obtain a weighted result. First do the simple ones:
  7822. float3 sum = float3(0.0,0.0,0.0);
  7823. sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
  7824. sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
  7825. sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
  7826. sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag));
  7827. // Now do the mixed-sample ones (rows follow the lane order of w2and5 / w6and7), then the lone 8curr tap:
  7828. sum += mul(w2and5, float4x3(sample2curr, sample2adjy, sample5curr, sample5adjy));
  7829. sum += mul(w6and7, float4x3(sample6curr, sample6adjx, sample7curr, sample7adjx));
  7830. sum += w8curr * sample8curr;
  7831. // Normalize the sum (so the weights add to 1.0) and return:
  7832. return sum * weight_sum_inv;
  7833. }
  7834. float3 tex2Dblur8x8shared(const sampler2D tex,
  7835. const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
  7836. const float sigma)
  7837. {
  7838. // Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
  7839. // Requires: Same as tex2Dblur12x12shared()
  7840. // Returns: A blurred texture lookup using a "virtual" 8x8 Gaussian
  7841. // blur (a 4x4 blur of carefully selected bilinear samples)
  7842. // of the given mip level. There will be subtle inaccuracies,
  7843. // especially for small or high-frequency detailed sources.
  7844. // Description:
  7845. // First see the description for tex2Dblur12x12shared(). This function
  7846. // shares the same concept and a similar sample placement, except each
  7847. // quadrant contains 4x4 texels and 2x2 samples instead of 6x6 and 3x3
  7848. // respectively. There could be a total of 16 samples, 4 of which each
  7849. // fragment is responsible for, but each fragment loads 0a/0b/0c/0d with
  7850. // its own offset to reduce shared sample artifacts, bringing the sample
  7851. // count for each fragment to 7. Sample placement:
  7852. // 3a 2a 2b 3b
  7853. // 1a 0a 0b 1b
  7854. // 1c 0c 0d 1d
  7855. // 3c 2c 2d 3d
  7856. // Texel placement:
  7857. // 3a3 3a2 2a3 2a2 2b2 2b3 3b2 3b3
  7858. // 3a1 3a0 2a1 2a0 2b0 2b1 3b0 3b1
  7859. // 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3
  7860. // 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1
  7861. // 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1
  7862. // 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3
  7863. // 3c1 3c0 2c1 2c0 2d0 2d1 3d0 3d1
  7864. // 3c3 3c2 2c3 2c2 2d2 2d3 3d2 3d3
  7865. // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
  7866. // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
  7867. const float denom_inv = 0.5/(sigma*sigma);
  7868. const float w0off = 1.0;
  7869. const float w0_5off = exp(-(0.5*0.5) * denom_inv);
  7870. const float w1off = exp(-(1.0*1.0) * denom_inv);
  7871. const float w1_5off = exp(-(1.5*1.5) * denom_inv);
  7872. const float w2off = exp(-(2.0*2.0) * denom_inv);
  7873. const float w2_5off = exp(-(2.5*2.5) * denom_inv);
  7874. const float w3_5off = exp(-(3.5*3.5) * denom_inv);
  7875. const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
  7876. const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
  7877. // We don't share sample0*, so use the nearest destination fragment:
  7878. const float texel0to1ratio_nearest = w1off/(w0off + w1off);
  7879. const float texel1to2ratio_nearest = w2off/(w1off + w2off);
  7880. // Statically compute texel offsets from the bottom-right fragment to each
  7881. // bilinear sample in the bottom-right quadrant:
  7882. const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
  7883. const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
  7884. const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
  7885. const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
  7886. const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
  7887. const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
  7888. const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
  7889. // CALCULATE KERNEL WEIGHTS:
  7890. // Statically compute bilinear sample weights at each destination fragment
  7891. // from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
  7892. #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
  7893. (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
  7894. exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
  7895. exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
  7896. exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
  7897. const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
  7898. const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
  7899. const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
  7900. const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
  7901. const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
  7902. const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
  7903. const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
  7904. const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
  7905. const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
  7906. const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
  7907. const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
  7908. const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
  7909. const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
  7910. const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
  7911. const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
  7912. const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
  7913. #undef GET_TEXEL_QUAD_WEIGHTS
  7914. // Statically pack weights for runtime:
  7915. const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
  7916. const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
  7917. const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag);
  7918. const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
  7919. // Get the weight sum inverse (normalization factor):
  7920. const float4 weight_sum4 = w0 + w1 + w2 + w3;
  7921. const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw;
  7922. const float weight_sum = weight_sum2.x + weight_sum2.y;
  7923. const float weight_sum_inv = 1.0/(weight_sum);
  7924. // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
  7925. // Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
  7926. const float2 dxdy_curr = dxdy * quad_vector.xy;
  7927. // Load bilinear samples for the current quadrant (for this fragment):
  7928. const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
  7929. const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
  7930. const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
  7931. const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
  7932. const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
  7933. const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
  7934. const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
  7935. // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
  7936. // Fetch the samples from other fragments in the 2x2 quad:
  7937. float3 sample1adjx, sample1adjy, sample1diag;
  7938. float3 sample2adjx, sample2adjy, sample2diag;
  7939. float3 sample3adjx, sample3adjy, sample3diag;
  7940. quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
  7941. quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
  7942. quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
  7943. // Statically normalize weights (so total = 1.0), and sum weighted samples.
  7944. // Fill each row of a matrix with an rgb sample and pre-multiply by the
  7945. // weights to obtain a weighted result:
  7946. float3 sum = float3(0.0,0.0,0.0);
  7947. sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
  7948. sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
  7949. sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag));
  7950. sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
  7951. return sum * weight_sum_inv;
  7952. }
  7953. float3 tex2Dblur6x6shared(const sampler2D tex,
  7954. const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
  7955. const float sigma)
  7956. {
  7957. // Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
  7958. // Requires: Same as tex2Dblur12x12shared()
  7959. // Returns: A blurred texture lookup using a "virtual" 6x6 Gaussian
  7960. // blur (a 3x3 blur of carefully selected bilinear samples)
  7961. // of the given mip level. There will be subtle inaccuracies,
  7962. // especially for small or high-frequency detailed sources.
  7963. // Description:
  7964. // First see the description for tex2Dblur8x8shared(). This
  7965. // function shares the same concept and sample placement, but each fragment
  7966. // only uses 9 of the 16 samples taken across the pixel quad (to cover a
  7967. // 3x3 sample area, or 6x6 texel area), and it uses a lower standard
  7968. // deviation to compensate. Thanks to symmetry, the 7 omitted samples
  7969. // are always the same:
  7970. // 1adjx, 3adjx
  7971. // 2adjy, 3adjy
  7972. // 1diag, 2diag, 3diag
  7973. // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
  7974. // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
  7975. const float denom_inv = 0.5/(sigma*sigma); // 1/(2*sigma^2): scale applied to squared distances inside exp()
  7976. const float w0off = 1.0; // Gaussian weight at offset 0, i.e. exp(0)
  7977. const float w0_5off = exp(-(0.5*0.5) * denom_inv);
  7978. const float w1off = exp(-(1.0*1.0) * denom_inv);
  7979. const float w1_5off = exp(-(1.5*1.5) * denom_inv);
  7980. const float w2off = exp(-(2.0*2.0) * denom_inv);
  7981. const float w2_5off = exp(-(2.5*2.5) * denom_inv);
  7982. const float w3_5off = exp(-(3.5*3.5) * denom_inv);
  7983. const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
  7984. const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
  7985. // We don't share sample0*, so use the nearest destination fragment:
  7986. const float texel0to1ratio_nearest = w1off/(w0off + w1off);
  7987. const float texel1to2ratio_nearest = w2off/(w1off + w2off);
  7988. // Statically compute texel offsets from the bottom-right fragment to each
  7989. // bilinear sample in the bottom-right quadrant:
  7990. const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
  7991. const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
  7992. const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
  7993. const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
  7994. const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
  7995. const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
  7996. const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
  7997. // CALCULATE KERNEL WEIGHTS:
  7998. // Statically compute bilinear sample weights at each destination fragment
  7999. // from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
  8000. #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
  8001. (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
  8002. exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
  8003. exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
  8004. exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
  8005. // We only need 9 of the 16 sample weights. Skip the following weights:
  8006. // 1adjx, 3adjx
  8007. // 2adjy, 3adjy
  8008. // 1diag, 2diag, 3diag
  8009. const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
  8010. const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
  8011. const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
  8012. const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
  8013. const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
  8014. const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
  8015. const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
  8016. const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
  8017. const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
  8018. #undef GET_TEXEL_QUAD_WEIGHTS
  8019. // Get the weight sum inverse (normalization factor) over the 9 used weights:
  8020. const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr +
  8021. w0adjx + w2adjx + w0adjy + w1adjy + w0diag);
  8022. // Statically pack some weights for runtime:
  8023. const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
  8024. // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
  8025. // Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
  8026. const float2 dxdy_curr = dxdy * quad_vector.xy;
  8027. // Load bilinear samples for the current quadrant (for this fragment):
  8028. const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
  8029. const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
  8030. const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
  8031. const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
  8032. const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
  8033. const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
  8034. const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
  8035. // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
  8036. // Fetch the samples from other fragments in the 2x2 quad (of the gathered
  8037. float3 sample1adjx, sample1adjy, sample1diag; // values, only sample1adjy and sample2adjx are used in the sum below):
  8038. float3 sample2adjx, sample2adjy, sample2diag;
  8039. quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
  8040. quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
  8041. // Statically normalize weights (so total = 1.0), and sum weighted samples.
  8042. // Fill each row of a matrix with an rgb sample and pre-multiply by the
  8043. // weights to obtain a weighted result for sample0*, and handle the rest
  8044. // of the weights more directly/verbosely:
  8045. float3 sum = float3(0.0,0.0,0.0);
  8046. sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
  8047. sum += w1curr * sample1curr + w1adjy * sample1adjy + w2curr * sample2curr +
  8048. w2adjx * sample2adjx + w3curr * sample3curr;
  8049. return sum * weight_sum_inv;
  8050. }
  8051. /////////////////////// MAX OPTIMAL SIGMA BLUR WRAPPERS //////////////////////
  8052. // The following blurs are static wrappers around the dynamic blurs above.
  8053. // HOPEFULLY, the compiler will be smart enough to do constant-folding.
  8054. // Resizable separable blurs:
  8055. inline float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv,
  8056. const float2 dxdy)
  8057. { // Convenience overload: forwards to the 4-argument tex2Dblur11resize(), supplying blur11_std_dev.
  8058. return tex2Dblur11resize(tex, tex_uv, dxdy, blur11_std_dev);
  8059. }
  8060. inline float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv,
  8061. const float2 dxdy)
  8062. { // Convenience overload: forwards to the 4-argument tex2Dblur9resize(), supplying blur9_std_dev.
  8063. return tex2Dblur9resize(tex, tex_uv, dxdy, blur9_std_dev);
  8064. }
  8065. inline float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv,
  8066. const float2 dxdy)
  8067. { // Convenience overload: forwards to the 4-argument tex2Dblur7resize(), supplying blur7_std_dev.
  8068. return tex2Dblur7resize(tex, tex_uv, dxdy, blur7_std_dev);
  8069. }
  8070. inline float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv,
  8071. const float2 dxdy)
  8072. { // Convenience overload: forwards to the 4-argument tex2Dblur5resize(), supplying blur5_std_dev.
  8073. return tex2Dblur5resize(tex, tex_uv, dxdy, blur5_std_dev);
  8074. }
  8075. inline float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv,
  8076. const float2 dxdy)
  8077. { // Convenience overload: forwards to the 4-argument tex2Dblur3resize(), supplying blur3_std_dev.
  8078. return tex2Dblur3resize(tex, tex_uv, dxdy, blur3_std_dev);
  8079. }
  8080. // Fast separable blurs:
  8081. inline float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv,
  8082. const float2 dxdy)
  8083. { // Convenience overload: forwards to the 4-argument tex2Dblur11fast(), supplying blur11_std_dev.
  8084. return tex2Dblur11fast(tex, tex_uv, dxdy, blur11_std_dev);
  8085. }
  8086. inline float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv,
  8087. const float2 dxdy)
  8088. { // Convenience overload: forwards to the 4-argument tex2Dblur9fast(), supplying blur9_std_dev.
  8089. return tex2Dblur9fast(tex, tex_uv, dxdy, blur9_std_dev);
  8090. }
  8091. inline float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv,
  8092. const float2 dxdy)
  8093. { // Convenience overload: forwards to the 4-argument tex2Dblur7fast(), supplying blur7_std_dev.
  8094. return tex2Dblur7fast(tex, tex_uv, dxdy, blur7_std_dev);
  8095. }
  8096. inline float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv,
  8097. const float2 dxdy)
  8098. { // Convenience overload: forwards to the 4-argument tex2Dblur5fast(), supplying blur5_std_dev.
  8099. return tex2Dblur5fast(tex, tex_uv, dxdy, blur5_std_dev);
  8100. }
  8101. inline float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv,
  8102. const float2 dxdy)
  8103. { // Convenience overload: forwards to the 4-argument tex2Dblur3fast(), supplying blur3_std_dev.
  8104. return tex2Dblur3fast(tex, tex_uv, dxdy, blur3_std_dev);
  8105. }
  8106. // Huge, "fast" separable blurs:
  8107. inline float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv,
  8108. const float2 dxdy)
  8109. { // Convenience overload: forwards to the 4-argument tex2Dblur43fast(), supplying blur43_std_dev.
  8110. return tex2Dblur43fast(tex, tex_uv, dxdy, blur43_std_dev);
  8111. }
  8112. inline float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv,
  8113. const float2 dxdy)
  8114. { // Convenience overload: forwards to the 4-argument tex2Dblur31fast(), supplying blur31_std_dev.
  8115. return tex2Dblur31fast(tex, tex_uv, dxdy, blur31_std_dev);
  8116. }
  8117. inline float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv,
  8118. const float2 dxdy)
  8119. { // Convenience overload: forwards to the 4-argument tex2Dblur25fast(), supplying blur25_std_dev.
  8120. return tex2Dblur25fast(tex, tex_uv, dxdy, blur25_std_dev);
  8121. }
  8122. inline float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv,
  8123. const float2 dxdy)
  8124. { // Convenience overload: forwards to the 4-argument tex2Dblur17fast(), supplying blur17_std_dev.
  8125. return tex2Dblur17fast(tex, tex_uv, dxdy, blur17_std_dev);
  8126. }
  8127. // Resizable one-pass blurs:
  8128. inline float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv,
  8129. const float2 dxdy)
  8130. { // Convenience overload: forwards to the 4-argument tex2Dblur3x3resize(), supplying blur3_std_dev.
  8131. return tex2Dblur3x3resize(tex, tex_uv, dxdy, blur3_std_dev);
  8132. }
  8133. // "Fast" one-pass blurs:
  8134. inline float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv,
  8135. const float2 dxdy)
  8136. { // Convenience overload: forwards to the 4-argument tex2Dblur9x9(), supplying blur9_std_dev.
  8137. return tex2Dblur9x9(tex, tex_uv, dxdy, blur9_std_dev);
  8138. }
  8139. inline float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv,
  8140. const float2 dxdy)
  8141. { // Convenience overload: forwards to the 4-argument tex2Dblur7x7(), supplying blur7_std_dev.
  8142. return tex2Dblur7x7(tex, tex_uv, dxdy, blur7_std_dev);
  8143. }
  8144. inline float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv,
  8145. const float2 dxdy)
  8146. { // Convenience overload: forwards to the 4-argument tex2Dblur5x5(), supplying blur5_std_dev.
  8147. return tex2Dblur5x5(tex, tex_uv, dxdy, blur5_std_dev);
  8148. }
  8149. inline float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv,
  8150. const float2 dxdy)
  8151. { // Convenience overload: forwards to the 4-argument tex2Dblur3x3(), supplying blur3_std_dev.
  8152. return tex2Dblur3x3(tex, tex_uv, dxdy, blur3_std_dev);
  8153. }
  8154. // "Fast" shared-sample one-pass blurs:
  8155. inline float3 tex2Dblur12x12shared(const sampler2D tex,
  8156. const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
  8157. { // Convenience overload: forwards to the 5-argument tex2Dblur12x12shared(), supplying blur12_std_dev.
  8158. return tex2Dblur12x12shared(tex, tex_uv, dxdy, quad_vector, blur12_std_dev);
  8159. }
  8160. inline float3 tex2Dblur10x10shared(const sampler2D tex,
  8161. const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
  8162. { // Convenience overload: forwards to the 5-argument tex2Dblur10x10shared(), supplying blur10_std_dev.
  8163. return tex2Dblur10x10shared(tex, tex_uv, dxdy, quad_vector, blur10_std_dev);
  8164. }
  8165. inline float3 tex2Dblur8x8shared(const sampler2D tex,
  8166. const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
  8167. { // Convenience overload: forwards to the 5-argument tex2Dblur8x8shared(), supplying blur8_std_dev.
  8168. return tex2Dblur8x8shared(tex, tex_uv, dxdy, quad_vector, blur8_std_dev);
  8169. }
  8170. inline float3 tex2Dblur6x6shared(const sampler2D tex,
  8171. const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
  8172. { // Convenience overload: forwards to the 5-argument (sigma-taking) tex2Dblur6x6shared(), supplying blur6_std_dev.
  8173. return tex2Dblur6x6shared(tex, tex_uv, dxdy, quad_vector, blur6_std_dev);
  8174. }
  8175. #endif // BLUR_FUNCTIONS_H
  8176. //////////////////////////// END BLUR-FUNCTIONS ///////////////////////////
  8177. //#include "bloom-functions.h"
  8178. //////////////////////////// BEGIN BLOOM-FUNCTIONS ///////////////////////////
  8179. #ifndef BLOOM_FUNCTIONS_H
  8180. #define BLOOM_FUNCTIONS_H
  8181. ///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
  8182. // crt-royale: A full-featured CRT shader, with cheese.
  8183. // Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
  8184. //
  8185. // This program is free software; you can redistribute it and/or modify it
  8186. // under the terms of the GNU General Public License as published by the Free
  8187. // Software Foundation; either version 2 of the License, or any later version.
  8188. //
  8189. // This program is distributed in the hope that it will be useful, but WITHOUT
  8190. // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  8191. // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  8192. // more details.
  8193. //
  8194. // You should have received a copy of the GNU General Public License along with
  8195. // this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  8196. // Place, Suite 330, Boston, MA 02111-1307 USA
  8197. ///////////////////////////////// DESCRIPTION ////////////////////////////////
  8198. // These utility functions and constants help several passes determine the
  8199. // size and center texel weight of the phosphor bloom in a uniform manner.
  8200. ////////////////////////////////// INCLUDES //////////////////////////////////
  8201. // We need to calculate the correct blur sigma using some .cgp constants:
  8202. //#include "../user-settings.h"
  8203. ///////////////////////////// BEGIN USER-SETTINGS ////////////////////////////
  8204. #ifndef USER_SETTINGS_H
  8205. #define USER_SETTINGS_H
  8206. ///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
  8207. // The Cg compiler uses different "profiles" with different capabilities.
  8208. // This shader requires a Cg compilation profile >= arbfp1, but a few options
  8209. // require higher profiles like fp30 or fp40. The shader can't detect profile
  8210. // or driver capabilities, so instead you must comment or uncomment the lines
  8211. // below with "//" before "#define." Disable an option if you get compilation
  8212. // errors resembling those listed. Generally speaking, all of these options
  8213. // will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
  8214. // likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
  8215. // Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
  8216. // Among other things, derivatives help us fix anisotropic filtering artifacts
  8217. // with curved manually tiled phosphor mask coords. Related errors:
  8218. // error C3004: function "float2 ddx(float2);" not supported in this profile
  8219. // error C3004: function "float2 ddy(float2);" not supported in this profile
  8220. //#define DRIVERS_ALLOW_DERIVATIVES
  8221. // Fine derivatives: Unsupported on older ATI cards.
  8222. // Fine derivatives enable 2x2 fragment block communication, letting us perform
  8223. // fast single-pass blur operations. If your card uses coarse derivatives and
  8224. // these are enabled, blurs could look broken. Derivatives are a prerequisite.
  8225. #ifdef DRIVERS_ALLOW_DERIVATIVES
  8226. #define DRIVERS_ALLOW_FINE_DERIVATIVES
  8227. #endif
  8228. // Dynamic looping: Requires an fp30 or newer profile.
  8229. // This makes phosphor mask resampling faster in some cases. Related errors:
  8230. // error C5013: profile does not support "for" statements and "for" could not
  8231. // be unrolled
  8232. //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
  8233. // Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
  8234. // Using one static loop avoids overhead if the user is right, but if the user
  8235. // is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
  8236. // binary search can potentially save some iterations. However, it may fail:
  8237. // error C6001: Temporary register limit of 32 exceeded; 35 registers
  8238. // needed to compile program
  8239. //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
  8240. // tex2Dlod: Requires an fp40 or newer profile. This can be used to disable
  8241. // anisotropic filtering, thereby fixing related artifacts. Related errors:
  8242. // error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
  8243. // this profile
  8244. //#define DRIVERS_ALLOW_TEX2DLOD
  8245. // tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate
  8246. // artifacts from anisotropic filtering and mipmapping. Related errors:
  8247. // error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
  8248. // in this profile
  8249. //#define DRIVERS_ALLOW_TEX2DBIAS
  8250. // Integrated graphics compatibility: Integrated graphics like Intel HD 4000
  8251. // impose stricter limitations on register counts and instructions. Enable
  8252. // INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
  8253. // error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
  8254. // to compile program.
  8255. // Enabling integrated graphics compatibility mode will automatically disable:
  8256. // 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
  8257. // (This may be reenabled in a later release.)
  8258. // 2.) RUNTIME_GEOMETRY_MODE
  8259. // 3.) The high-quality 4x4 Gaussian resize for the bloom approximation
  8260. //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
  8261. //////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
  8262. // To disable a #define option, turn its line into a comment with "//."
  8263. // RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
  8264. // Enable runtime shader parameters in the Retroarch (etc.) GUI? They override
  8265. // many of the options in this file and allow real-time tuning, but many of
  8266. // them are slower. Disabling them and using this text file will boost FPS.
  8267. #define RUNTIME_SHADER_PARAMS_ENABLE
  8268. // Specify the phosphor bloom sigma at runtime? This option is 10% slower, but
  8269. // it's the only way to do a wide-enough full bloom with a runtime dot pitch.
  8270. #define RUNTIME_PHOSPHOR_BLOOM_SIGMA
  8271. // Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics)
  8272. #define RUNTIME_ANTIALIAS_WEIGHTS
  8273. // Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
  8274. //#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
  8275. // Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
  8276. // parameters? This will require more math or dynamic branching.
  8277. #define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
  8278. // Specify the tilt at runtime? This makes things about 3% slower.
  8279. #define RUNTIME_GEOMETRY_TILT
  8280. // Specify the geometry mode at runtime?
  8281. #define RUNTIME_GEOMETRY_MODE
  8282. // Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
  8283. // mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
  8284. // dynamic branches? This is cheap if mask_resize_viewport_scale is small.
  8285. #define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  8286. // PHOSPHOR MASK:
  8287. // Manually resize the phosphor mask for best results (slower)? Disabling this
  8288. // removes the option to do so, but it may be faster without dynamic branches.
  8289. #define PHOSPHOR_MASK_MANUALLY_RESIZE
  8290. // If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
  8291. #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
  8292. // Larger blurs are expensive, but we need them to blur larger triads. We can
  8293. // detect the right blur if the triad size is static or our profile allows
  8294. // dynamic branches, but otherwise we use the largest blur the user indicates
  8295. // they might need:
  8296. #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
  8297. //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
  8298. //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
  8299. //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
  8300. // Here's a helpful chart:
  8301. // MaxTriadSize BlurSize MinTriadCountsByResolution
  8302. // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  8303. // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  8304. // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  8305. // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  8306. // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  8307. /////////////////////////////// USER PARAMETERS //////////////////////////////
  8308. // Note: Many of these static parameters are overridden by runtime shader
  8309. // parameters when those are enabled. However, many others are static codepath
  8310. // options that were cleaner or more convenient to code as static constants.
  8311. // GAMMA:
  8312. static const float crt_gamma_static = 2.5; // range [1, 5]
  8313. static const float lcd_gamma_static = 2.2; // range [1, 5]
  8314. // LEVELS MANAGEMENT:
  8315. // Control the final multiplicative image contrast:
  8316. static const float levels_contrast_static = 1.0; // range [0, 4)
  8317. // We auto-dim to avoid clipping between passes and restore brightness
  8318. // later. Control the dim factor here: Lower values clip less but crush
  8319. // blacks more (static only for now).
  8320. static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5. (An earlier note said 0.5 was unnecessarily dark and that 1.0 was used instead, but the value set here is 0.5.)
  8321. // HALATION/DIFFUSION/BLOOM:
  8322. // Halation weight: How much energy should be lost to electrons bouncing
  8323. // around under the CRT glass and exciting random phosphors?
  8324. static const float halation_weight_static = 0.0; // range [0, 1]
  8325. // Refractive diffusion weight: How much light should spread/diffuse from
  8326. // refracting through the CRT glass?
  8327. static const float diffusion_weight_static = 0.075; // range [0, 1]
  8328. // Underestimate brightness: Bright areas bloom more, but we can base the
  8329. // bloom brightpass on a lower brightness to sharpen phosphors, or a higher
  8330. // brightness to soften them. Low values clip, but >= 0.8 looks okay.
  8331. static const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
  8332. // Blur all colors more than necessary for a softer phosphor bloom?
  8333. static const float bloom_excess_static = 0.0; // range [0, 1]
  8334. // The BLOOM_APPROX pass approximates a phosphor blur early on with a small
  8335. // blurred resize of the input (convergence offsets are applied as well).
  8336. // There are three filter options (static option only for now):
  8337. // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize
  8338. // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
  8339. // and beam_max_sigma is low.
  8340. // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
  8341. // always uses a static sigma regardless of beam_max_sigma or
  8342. // mask_num_triads_desired.
  8343. // 2.) True 4x4 Gaussian resize: Slowest, technically correct.
  8344. // These options are more pronounced for the fast, unbloomed shader version.
  8345. #ifndef RADEON_FIX
  8346. static const float bloom_approx_filter_static = 2.0;
  8347. #else
  8348. static const float bloom_approx_filter_static = 1.0;
  8349. #endif
  8350. // ELECTRON BEAM SCANLINE DISTRIBUTION:
  8351. // How many scanlines should contribute light to each pixel? Using more
  8352. // scanlines is slower (especially for a generalized Gaussian) but less
  8353. // distorted with larger beam sigmas (especially for a pure Gaussian). The
  8354. // max_beam_sigma at which the closest unused weight is guaranteed <
  8355. // 1.0/255.0 (for a 3x antialiased pure Gaussian) is:
  8356. // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
  8357. // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
  8358. // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
  8359. // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
  8360. // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
  8361. static const float beam_num_scanlines = 3.0; // range [2, 6]
  8362. // A generalized Gaussian beam varies shape with color too, not just width.
  8363. // It's slower but more flexible (static option only for now).
  8364. static const bool beam_generalized_gaussian = true;
  8365. // What kind of scanline antialiasing do you want?
  8366. // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
  8367. // Integrals are slow (especially for generalized Gaussians) and rarely any
  8368. // better than 3x antialiasing (static option only for now).
  8369. static const float beam_antialias_level = 1.0; // range [0, 2]
  8370. // Min/max standard deviations for scanline beams: Higher values widen and
  8371. // soften scanlines. Depending on other options, low min sigmas can alias.
  8372. static const float beam_min_sigma_static = 0.02; // range (0, 1]
  8373. static const float beam_max_sigma_static = 0.3; // range (0, 1]
  8374. // Beam width varies as a function of color: A power function (0) is more
  8375. // configurable, but a spherical function (1) gives the widest beam
  8376. // variability without aliasing (static option only for now).
  8377. static const float beam_spot_shape_function = 0.0;
  8378. // Spot shape power: Powers <= 1 give smoother spot shapes but lower
  8379. // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
  8380. static const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
  8381. // Generalized Gaussian min/max shape parameters: Higher values give flatter
  8382. // scanline plateaus and steeper dropoffs, simultaneously widening and
  8383. // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and
  8384. // values > ~40.0 cause artifacts with integrals.
  8385. static const float beam_min_shape_static = 2.0; // range [2, 32]
  8386. static const float beam_max_shape_static = 4.0; // range [2, 32]
  8387. // Generalized Gaussian shape power: Affects how quickly the distribution
  8388. // changes shape from Gaussian to steep/plateaued as color increases from 0
  8389. // to 1.0. Higher powers appear softer for most colors, and lower powers
  8390. // appear sharper for most colors.
  8391. static const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
  8392. // What filter should be used to sample scanlines horizontally?
  8393. // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
  8394. static const float beam_horiz_filter_static = 0.0;
  8395. // Standard deviation for horizontal Gaussian resampling:
  8396. static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
  8397. // Do horizontal scanline sampling in linear RGB (correct light mixing),
  8398. // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
  8399. // limiting circuitry in some CRT's), or a weighted avg.?
  8400. static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
  8401. // Simulate scanline misconvergence? This needs 3x horizontal texture
  8402. // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
  8403. // later passes (static option only for now).
  8404. static const bool beam_misconvergence = true;
  8405. // Convergence offsets in x/y directions for R/G/B scanline beams in units
  8406. // of scanlines. Positive offsets go right/down; ranges [-2, 2]
  8407. static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
  8408. static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
  8409. static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
  8410. // Detect interlacing (static option only for now)?
  8411. static const bool interlace_detect = true;
  8412. // Assume 1080-line sources are interlaced?
  8413. static const bool interlace_1080i_static = false;
  8414. // For interlaced sources, assume TFF (top-field first) or BFF order?
  8415. // (Whether this matters depends on the nature of the interlaced input.)
  8416. static const bool interlace_bff_static = false;
  8417. // ANTIALIASING:
  8418. // What AA level do you want for curvature/overscan/subpixels? Options:
  8419. // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
  8420. // (Static option only for now)
  8421. static const float aa_level = 12.0; // range [0, 24]
  8422. // What antialiasing filter do you want (static option only)? Options:
  8423. // 0: Box (separable), 1: Box (cylindrical),
  8424. // 2: Tent (separable), 3: Tent (cylindrical),
  8425. // 4: Gaussian (separable), 5: Gaussian (cylindrical),
  8426. // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
  8427. // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
  8428. // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
  8429. static const float aa_filter = 6.0; // range [0, 9]
  8430. // Flip the sample grid on odd/even frames (static option only for now)?
  8431. static const bool aa_temporal = false;
  8432. // Use RGB subpixel offsets for antialiasing? The pixel is at green, and
  8433. // the blue offset is the negative r offset; range [0, 0.5]
  8434. static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
  8435. // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
  8436. // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
  8437. // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
  8438. // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
  8439. // 4.) C = 0.0 is a soft spline filter.
  8440. static const float aa_cubic_c_static = 0.5; // range [0, 4]
  8441. // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
  8442. static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
  8443. // PHOSPHOR MASK:
  8444. // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
  8445. static const float mask_type_static = 1.0; // range [0, 2]
  8446. // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible.
  8447. // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
  8448. // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
  8449. // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This
  8450. // is halfway decent with LUT mipmapping but atrocious without it.
  8451. // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
  8452. // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch.
  8453. // This mode reuses the same masks, so triads will be enormous unless
  8454. // you change the mask LUT filenames in your .cgp file.
  8455. static const float mask_sample_mode_static = 0.0; // range [0, 2]
  8456. // Prefer setting the triad size (0.0) or number on the screen (1.0)?
  8457. // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
  8458. // will always be used to calculate the full bloom sigma statically.
  8459. static const float mask_specify_num_triads_static = 0.0; // range [0, 1]
  8460. // Specify the phosphor triad size, in pixels. Each tile (usually with 8
  8461. // triads) will be rounded to the nearest integer tile size and clamped to
  8462. // obey minimum size constraints (imposed to reduce downsize taps) and
  8463. // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
  8464. // To increase the size limit, double the viewport-relative scales for the
  8465. // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
  8466. // range [1, mask_texture_small_size/mask_triads_per_tile]
  8467. static const float mask_triad_size_desired_static = 24.0 / 8.0;
  8468. // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
  8469. // final size will be rounded and constrained as above); default 480.0
  8470. static const float mask_num_triads_desired_static = 480.0;
  8471. // How many lobes should the sinc/Lanczos resizer use? More lobes require
  8472. // more samples and avoid moire a bit better, but some is unavoidable
  8473. // depending on the destination size (static option for now).
  8474. static const float mask_sinc_lobes = 3.0; // range [2, 4]
  8475. // The mask is resized using a variable number of taps in each dimension,
  8476. // but some Cg profiles always fetch a constant number of taps no matter
  8477. // what (no dynamic branching). We can limit the maximum number of taps if
  8478. // we statically limit the minimum phosphor triad size. Larger values are
  8479. // faster, but the limit IS enforced (static option only, forever);
  8480. // range [1, mask_texture_small_size/mask_triads_per_tile]
  8481. // TODO: Make this 1.0 and compensate with smarter sampling!
  8482. static const float mask_min_allowed_triad_size = 2.0;
  8483. // GEOMETRY:
  8484. // Geometry mode:
  8485. // 0: Off (default), 1: Spherical mapping (like cgwg's),
  8486. // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
  8487. static const float geom_mode_static = 0.0; // range [0, 3]
  8488. // Radius of curvature: Measured in units of your viewport's diagonal size.
  8489. static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
  8490. // View dist is the distance from the player to their physical screen, in
  8491. // units of the viewport's diagonal size. It controls the field of view.
  8492. static const float geom_view_dist_static = 2.0; // range [0.5, 1024]
  8493. // Tilt angle in radians (clockwise around up and right vectors):
  8494. static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi]
  8495. // Aspect ratio: When the true viewport size is unknown, this value is used
  8496. // to help convert between the phosphor triad size and count, along with
  8497. // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set
  8498. // this equal to Retroarch's display aspect ratio (DAR) for best results;
  8499. // range [1, geom_max_aspect_ratio from user-cgp-constants.h];
  8500. // default (256/224)*(54/47) = 1.313069909 (see below)
  8501. static const float geom_aspect_ratio_static = 1.313069909;
  8502. // Before getting into overscan, here's some general aspect ratio info:
  8503. // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
  8504. // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
  8505. // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping
  8506. // Geometry processing has to "undo" the screen-space 2D DAR to calculate
  8507. // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in
  8508. // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either:
  8509. // a.) Enable Retroarch's "Crop Overscan"
  8510. // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
  8511. // Real consoles use horizontal black padding in the signal, but emulators
  8512. // often crop this without cropping the vertical padding; a 256x224 [S]NES
  8513. // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
  8514. // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
  8515. // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
  8516. // http://forums.nesdev.com/viewtopic.php?p=24815#p24815
  8517. // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
  8518. // without doing a. or b., but horizontal image borders will be tighter
  8519. // than vertical ones, messing up curvature and overscan. Fixing the
  8520. // padding first corrects this.
  8521. // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly
  8522. // or adjust x/y independently to e.g. readd horizontal padding, as noted
  8523. // above: Values < 1.0 zoom out; range (0, inf)
  8524. static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
  8525. // Compute a proper pixel-space to texture-space matrix even without ddx()/
  8526. // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering
  8527. // with strong curvature (static option only for now).
  8528. static const bool geom_force_correct_tangent_matrix = true;
  8529. // BORDERS:
  8530. // Rounded border size in texture uv coords:
  8531. static const float border_size_static = 0.015; // range [0, 0.5]
  8532. // Border darkness: Moderate values darken the border smoothly, and high
  8533. // values make the image very dark just inside the border:
  8534. static const float border_darkness_static = 2.0; // range [0, inf)
  8535. // Border compression: High numbers compress border transitions, narrowing
  8536. // the dark border area.
  8537. static const float border_compress_static = 2.5; // range [1, inf)
  8538. #endif // USER_SETTINGS_H
  8539. //////////////////////////// END USER-SETTINGS //////////////////////////
  8540. //#include "derived-settings-and-constants.h"
  8541. //////////////////// BEGIN DERIVED-SETTINGS-AND-CONSTANTS ////////////////////
  8542. #ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
  8543. #define DERIVED_SETTINGS_AND_CONSTANTS_H
  8544. ///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
  8545. // crt-royale: A full-featured CRT shader, with cheese.
  8546. // Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
  8547. //
  8548. // This program is free software; you can redistribute it and/or modify it
  8549. // under the terms of the GNU General Public License as published by the Free
  8550. // Software Foundation; either version 2 of the License, or any later version.
  8551. //
  8552. // This program is distributed in the hope that it will be useful, but WITHOUT
  8553. // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  8554. // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  8555. // more details.
  8556. //
  8557. // You should have received a copy of the GNU General Public License along with
  8558. // this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  8559. // Place, Suite 330, Boston, MA 02111-1307 USA
  8560. ///////////////////////////////// DESCRIPTION ////////////////////////////////
  8561. // These macros and constants can be used across the whole codebase.
  8562. // Unlike the values in user-settings.h, end users shouldn't modify these.
  8563. /////////////////////////////// BEGIN INCLUDES ///////////////////////////////
  8564. //#include "../user-settings.h"
  8565. ///////////////////////////// BEGIN USER-SETTINGS ////////////////////////////
  8566. #ifndef USER_SETTINGS_H
  8567. #define USER_SETTINGS_H
  8568. ///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
  8569. // The Cg compiler uses different "profiles" with different capabilities.
  8570. // This shader requires a Cg compilation profile >= arbfp1, but a few options
  8571. // require higher profiles like fp30 or fp40. The shader can't detect profile
  8572. // or driver capabilities, so instead you must comment or uncomment the lines
  8573. // below with "//" before "#define." Disable an option if you get compilation
  8574. // errors resembling those listed. Generally speaking, all of these options
  8575. // will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
  8576. // likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
  8577. // Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
  8578. // Among other things, derivatives help us fix anisotropic filtering artifacts
  8579. // with curved manually tiled phosphor mask coords. Related errors:
  8580. // error C3004: function "float2 ddx(float2);" not supported in this profile
  8581. // error C3004: function "float2 ddy(float2);" not supported in this profile
  8582. //#define DRIVERS_ALLOW_DERIVATIVES
  8583. // Fine derivatives: Unsupported on older ATI cards.
  8584. // Fine derivatives enable 2x2 fragment block communication, letting us perform
  8585. // fast single-pass blur operations. If your card uses coarse derivatives and
  8586. // these are enabled, blurs could look broken. Derivatives are a prerequisite.
  8587. #ifdef DRIVERS_ALLOW_DERIVATIVES
  8588. #define DRIVERS_ALLOW_FINE_DERIVATIVES
  8589. #endif
  8590. // Dynamic looping: Requires an fp30 or newer profile.
  8591. // This makes phosphor mask resampling faster in some cases. Related errors:
  8592. // error C5013: profile does not support "for" statements and "for" could not
  8593. // be unrolled
  8594. //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
  8595. // Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
  8596. // Using one static loop avoids overhead if the user is right, but if the user
  8597. // is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
  8598. // binary search can potentially save some iterations. However, it may fail:
  8599. // error C6001: Temporary register limit of 32 exceeded; 35 registers
  8600. // needed to compile program
  8601. //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
  8602. // tex2Dlod: Requires an fp40 or newer profile. This can be used to disable
  8603. // anisotropic filtering, thereby fixing related artifacts. Related errors:
  8604. // error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
  8605. // this profile
  8606. //#define DRIVERS_ALLOW_TEX2DLOD
  8607. // tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate
  8608. // artifacts from anisotropic filtering and mipmapping. Related errors:
  8609. // error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
  8610. // in this profile
  8611. //#define DRIVERS_ALLOW_TEX2DBIAS
  8612. // Integrated graphics compatibility: Integrated graphics like Intel HD 4000
  8613. // impose stricter limitations on register counts and instructions. Enable
  8614. // INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
  8615. // error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
  8616. // to compile program.
  8617. // Enabling integrated graphics compatibility mode will automatically disable:
  8618. // 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
  8619. // (This may be reenabled in a later release.)
  8620. // 2.) RUNTIME_GEOMETRY_MODE
  8621. // 3.) The high-quality 4x4 Gaussian resize for the bloom approximation
  8622. //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
  8623. //////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
  8624. // To disable a #define option, turn its line into a comment with "//."
  8625. // RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
  8626. // Enable runtime shader parameters in the Retroarch (etc.) GUI? They override
  8627. // many of the options in this file and allow real-time tuning, but many of
  8628. // them are slower. Disabling them and using this text file will boost FPS.
  8629. #define RUNTIME_SHADER_PARAMS_ENABLE
  8630. // Specify the phosphor bloom sigma at runtime? This option is 10% slower, but
  8631. // it's the only way to do a wide-enough full bloom with a runtime dot pitch.
  8632. #define RUNTIME_PHOSPHOR_BLOOM_SIGMA
  8633. // Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics)
  8634. #define RUNTIME_ANTIALIAS_WEIGHTS
  8635. // Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
  8636. //#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
  8637. // Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
  8638. // parameters? This will require more math or dynamic branching.
  8639. #define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
  8640. // Specify the tilt at runtime? This makes things about 3% slower.
  8641. #define RUNTIME_GEOMETRY_TILT
  8642. // Specify the geometry mode at runtime?
  8643. #define RUNTIME_GEOMETRY_MODE
  8644. // Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
  8645. // mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
  8646. // dynamic branches? This is cheap if mask_resize_viewport_scale is small.
  8647. #define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  8648. // PHOSPHOR MASK:
  8649. // Manually resize the phosphor mask for best results (slower)? Disabling this
  8650. // removes the option to do so, but it may be faster without dynamic branches.
  8651. #define PHOSPHOR_MASK_MANUALLY_RESIZE
  8652. // If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
  8653. #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
  8654. // Larger blurs are expensive, but we need them to blur larger triads. We can
  8655. // detect the right blur if the triad size is static or our profile allows
  8656. // dynamic branches, but otherwise we use the largest blur the user indicates
  8657. // they might need:
  8658. #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
  8659. //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
  8660. //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
  8661. //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
  8662. // Here's a helpful chart:
  8663. // MaxTriadSize BlurSize MinTriadCountsByResolution
  8664. // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  8665. // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  8666. // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  8667. // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  8668. // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
  8669. /////////////////////////////// USER PARAMETERS //////////////////////////////
  8670. // Note: Many of these static parameters are overridden by runtime shader
  8671. // parameters when those are enabled. However, many others are static codepath
  8672. // options that were cleaner or more convenient to code as static constants.
  8673. // GAMMA:
  8674. static const float crt_gamma_static = 2.5; // range [1, 5]
  8675. static const float lcd_gamma_static = 2.2; // range [1, 5]
  8676. // LEVELS MANAGEMENT:
  8677. // Control the final multiplicative image contrast:
  8678. static const float levels_contrast_static = 1.0; // range [0, 4)
  8679. // We auto-dim to avoid clipping between passes and restore brightness
  8680. // later. Control the dim factor here: Lower values clip less but crush
  8681. // blacks more (static only for now).
  8682. static const float levels_autodim_temp = 0.5; // range (0, 1]; default is 0.5. NOTE(review): an earlier note here said 0.5 was unnecessarily dark and that this was set to 1.0, but the value is 0.5 -- confirm which is intended
  8683. // HALATION/DIFFUSION/BLOOM:
  8684. // Halation weight: How much energy should be lost to electrons bouncing
  8685. // around under the CRT glass and exciting random phosphors?
  8686. static const float halation_weight_static = 0.0; // range [0, 1]
  8687. // Refractive diffusion weight: How much light should spread/diffuse from
  8688. // refracting through the CRT glass?
  8689. static const float diffusion_weight_static = 0.075; // range [0, 1]
  8690. // Underestimate brightness: Bright areas bloom more, but we can base the
  8691. // bloom brightpass on a lower brightness to sharpen phosphors, or a higher
  8692. // brightness to soften them. Low values clip, but >= 0.8 looks okay.
  8693. static const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
  8694. // Blur all colors more than necessary for a softer phosphor bloom?
  8695. static const float bloom_excess_static = 0.0; // range [0, 1]
  8696. // The BLOOM_APPROX pass approximates a phosphor blur early on with a small
  8697. // blurred resize of the input (convergence offsets are applied as well).
  8698. // There are three filter options (static option only for now):
  8699. // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize
  8700. // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
  8701. // and beam_max_sigma is low.
  8702. // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
  8703. // always uses a static sigma regardless of beam_max_sigma or
  8704. // mask_num_triads_desired.
  8705. // 2.) True 4x4 Gaussian resize: Slowest, technically correct.
  8706. // These options are more pronounced for the fast, unbloomed shader version.
  8707. #ifndef RADEON_FIX
  8708. static const float bloom_approx_filter_static = 2.0;
  8709. #else
  8710. static const float bloom_approx_filter_static = 1.0;
  8711. #endif
  8712. // ELECTRON BEAM SCANLINE DISTRIBUTION:
  8713. // How many scanlines should contribute light to each pixel? Using more
  8714. // scanlines is slower (especially for a generalized Gaussian) but less
  8715. // distorted with larger beam sigmas (especially for a pure Gaussian). The
  8716. // max_beam_sigma at which the closest unused weight is guaranteed <
  8717. // 1.0/255.0 (for a 3x antialiased pure Gaussian) is:
  8718. // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
  8719. // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
  8720. // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
  8721. // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
  8722. // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
  8723. static const float beam_num_scanlines = 3.0; // range [2, 6]
  8724. // A generalized Gaussian beam varies shape with color too, not just width.
  8725. // It's slower but more flexible (static option only for now).
  8726. static const bool beam_generalized_gaussian = true;
  8727. // What kind of scanline antialiasing do you want?
  8728. // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
  8729. // Integrals are slow (especially for generalized Gaussians) and rarely any
  8730. // better than 3x antialiasing (static option only for now).
  8731. static const float beam_antialias_level = 1.0; // range [0, 2]
  8732. // Min/max standard deviations for scanline beams: Higher values widen and
  8733. // soften scanlines. Depending on other options, low min sigmas can alias.
  8734. static const float beam_min_sigma_static = 0.02; // range (0, 1]
  8735. static const float beam_max_sigma_static = 0.3; // range (0, 1]
  8736. // Beam width varies as a function of color: A power function (0) is more
  8737. // configurable, but a spherical function (1) gives the widest beam
  8738. // variability without aliasing (static option only for now).
  8739. static const float beam_spot_shape_function = 0.0;
  8740. // Spot shape power: Powers <= 1 give smoother spot shapes but lower
  8741. // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
  8742. static const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
  8743. // Generalized Gaussian max shape parameters: Higher values give flatter
  8744. // scanline plateaus and steeper dropoffs, simultaneously widening and
  8745. // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and
  8746. // values > ~40.0 cause artifacts with integrals.
  8747. static const float beam_min_shape_static = 2.0; // range [2, 32]
  8748. static const float beam_max_shape_static = 4.0; // range [2, 32]
  8749. // Generalized Gaussian shape power: Affects how quickly the distribution
  8750. // changes shape from Gaussian to steep/plateaued as color increases from 0
  8751. // to 1.0. Higher powers appear softer for most colors, and lower powers
  8752. // appear sharper for most colors.
  8753. static const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
  8754. // What filter should be used to sample scanlines horizontally?
  8755. // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
  8756. static const float beam_horiz_filter_static = 0.0;
  8757. // Standard deviation for horizontal Gaussian resampling:
  8758. static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
  8759. // Do horizontal scanline sampling in linear RGB (correct light mixing),
  8760. // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
  8761. // limiting circuitry in some CRT's), or a weighted avg.?
  8762. static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
  8763. // Simulate scanline misconvergence? This needs 3x horizontal texture
  8764. // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
  8765. // later passes (static option only for now).
  8766. static const bool beam_misconvergence = true;
  8767. // Convergence offsets in x/y directions for R/G/B scanline beams in units
  8768. // of scanlines. Positive offsets go right/down; ranges [-2, 2]
  8769. static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
  8770. static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
  8771. static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
  8772. // Detect interlacing (static option only for now)?
  8773. static const bool interlace_detect = true;
  8774. // Assume 1080-line sources are interlaced?
  8775. static const bool interlace_1080i_static = false;
  8776. // For interlaced sources, assume TFF (top-field first) or BFF order?
  8777. // (Whether this matters depends on the nature of the interlaced input.)
  8778. static const bool interlace_bff_static = false;
  8779. // ANTIALIASING:
  8780. // What AA level do you want for curvature/overscan/subpixels? Options:
  8781. // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
  8782. // (Static option only for now)
  8783. static const float aa_level = 12.0; // range [0, 24]
  8784. // What antialiasing filter do you want (static option only)? Options:
  8785. // 0: Box (separable), 1: Box (cylindrical),
  8786. // 2: Tent (separable), 3: Tent (cylindrical),
  8787. // 4: Gaussian (separable), 5: Gaussian (cylindrical),
  8788. // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
  8789. // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
  8790. // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
  8791. static const float aa_filter = 6.0; // range [0, 9]
  8792. // Flip the sample grid on odd/even frames (static option only for now)?
  8793. static const bool aa_temporal = false;
  8794. // Use RGB subpixel offsets for antialiasing? The pixel is at green, and
  8795. // the blue offset is the negative r offset; range [0, 0.5]
  8796. static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
  8797. // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
  8798. // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
  8799. // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
  8800. // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
  8801. // 4.) C = 0.0 is a soft spline filter.
  8802. static const float aa_cubic_c_static = 0.5; // range [0, 4]
  8803. // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
  8804. static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
  8805. // PHOSPHOR MASK:
  8806. // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
  8807. static const float mask_type_static = 1.0; // range [0, 2]
  8808. // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible.
  8809. // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
  8810. // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
  8811. // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This
  8812. // is halfway decent with LUT mipmapping but atrocious without it.
  8813. // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
  8814. // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch.
  8815. // This mode reuses the same masks, so triads will be enormous unless
  8816. // you change the mask LUT filenames in your .cgp file.
  8817. static const float mask_sample_mode_static = 0.0; // range [0, 2]
  8818. // Prefer setting the triad size (0.0) or number on the screen (1.0)?
  8819. // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
  8820. // will always be used to calculate the full bloom sigma statically.
  8821. static const float mask_specify_num_triads_static = 0.0; // range [0, 1]
  8822. // Specify the phosphor triad size, in pixels. Each tile (usually with 8
  8823. // triads) will be rounded to the nearest integer tile size and clamped to
  8824. // obey minimum size constraints (imposed to reduce downsize taps) and
  8825. // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
  8826. // To increase the size limit, double the viewport-relative scales for the
  8827. // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
  8828. // range [1, mask_texture_small_size/mask_triads_per_tile]
  8829. static const float mask_triad_size_desired_static = 24.0 / 8.0;
  8830. // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
  8831. // final size will be rounded and constrained as above); default 480.0
  8832. static const float mask_num_triads_desired_static = 480.0;
  8833. // How many lobes should the sinc/Lanczos resizer use? More lobes require
  8834. // more samples and avoid moire a bit better, but some is unavoidable
  8835. // depending on the destination size (static option for now).
  8836. static const float mask_sinc_lobes = 3.0; // range [2, 4]
  8837. // The mask is resized using a variable number of taps in each dimension,
  8838. // but some Cg profiles always fetch a constant number of taps no matter
  8839. // what (no dynamic branching). We can limit the maximum number of taps if
  8840. // we statically limit the minimum phosphor triad size. Larger values are
  8841. // faster, but the limit IS enforced (static option only, forever);
  8842. // range [1, mask_texture_small_size/mask_triads_per_tile]
  8843. // TODO: Make this 1.0 and compensate with smarter sampling!
  8844. static const float mask_min_allowed_triad_size = 2.0;
  8845. // GEOMETRY:
  8846. // Geometry mode:
  8847. // 0: Off (default), 1: Spherical mapping (like cgwg's),
  8848. // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
  8849. static const float geom_mode_static = 0.0; // range [0, 3]
  8850. // Radius of curvature: Measured in units of your viewport's diagonal size.
  8851. static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
  8852. // View dist is the distance from the player to their physical screen, in
  8853. // units of the viewport's diagonal size. It controls the field of view.
  8854. static const float geom_view_dist_static = 2.0; // range [0.5, 1024]
  8855. // Tilt angle in radians (clockwise around up and right vectors):
  8856. static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi]
  8857. // Aspect ratio: When the true viewport size is unknown, this value is used
  8858. // to help convert between the phosphor triad size and count, along with
  8859. // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set
  8860. // this equal to Retroarch's display aspect ratio (DAR) for best results;
  8861. // range [1, geom_max_aspect_ratio from user-cgp-constants.h];
  8862. // default (256/224)*(54/47) = 1.313069909 (see below)
  8863. static const float geom_aspect_ratio_static = 1.313069909;
  8864. // Before getting into overscan, here's some general aspect ratio info:
  8865. // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
  8866. // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
  8867. // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping
  8868. // Geometry processing has to "undo" the screen-space 2D DAR to calculate
  8869. // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in
  8870. // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either:
  8871. // a.) Enable Retroarch's "Crop Overscan"
  8872. // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
  8873. // Real consoles use horizontal black padding in the signal, but emulators
  8874. // often crop this without cropping the vertical padding; a 256x224 [S]NES
  8875. // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
  8876. // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
  8877. // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
  8878. // http://forums.nesdev.com/viewtopic.php?p=24815#p24815
  8879. // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
  8880. // without doing a. or b., but horizontal image borders will be tighter
  8881. // than vertical ones, messing up curvature and overscan. Fixing the
  8882. // padding first corrects this.
  8883. // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly
  8884. // or adjust x/y independently to e.g. readd horizontal padding, as noted
  8885. // above: Values < 1.0 zoom out; range (0, inf)
  8886. static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
  8887. // Compute a proper pixel-space to texture-space matrix even without ddx()/
  8888. // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering
  8889. // with strong curvature (static option only for now).
  8890. static const bool geom_force_correct_tangent_matrix = true;
  8891. // BORDERS:
  8892. // Rounded border size in texture uv coords:
  8893. static const float border_size_static = 0.015; // range [0, 0.5]
  8894. // Border darkness: Moderate values darken the border smoothly, and high
  8895. // values make the image very dark just inside the border:
  8896. static const float border_darkness_static = 2.0; // range [0, inf)
  8897. // Border compression: High numbers compress border transitions, narrowing
  8898. // the dark border area.
  8899. static const float border_compress_static = 2.5; // range [1, inf)
  8900. #endif // USER_SETTINGS_H
  8901. ///////////////////////////// END USER-SETTINGS ////////////////////////////
  8902. //#include "user-cgp-constants.h"
  8903. ///////////////////////// BEGIN USER-CGP-CONSTANTS /////////////////////////
  8904. #ifndef USER_CGP_CONSTANTS_H
  8905. #define USER_CGP_CONSTANTS_H
  8906. // IMPORTANT:
  8907. // These constants MUST be set appropriately for the settings in crt-royale.cgp
  8908. // (or whatever related .cgp file you're using). If they aren't, you're likely
  8909. // to get artifacts, the wrong phosphor mask size, etc. I wish these could be
  8910. // set directly in the .cgp file to make things easier, but...they can't.
  8911. // PASS SCALES AND RELATED CONSTANTS:
  8912. // Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of
  8913. // this shader: One does a viewport-scale bloom, and the other skips it. The
  8914. // latter benefits from a higher bloom_approx_scale_x, so save both separately:
  8915. static const float bloom_approx_size_x = 320.0;
  8916. static const float bloom_approx_size_x_for_fake = 400.0; // presumably for the version that skips the viewport-scale bloom (the higher value) - confirm
  8917. // Copy the viewport-relative scales of the phosphor mask resize passes
  8918. // (MASK_RESIZE and the pass immediately preceding it):
  8919. static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); // 0.0625 == 1/16 in each dimension
  8920. // Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
  8921. static const float geom_max_aspect_ratio = 4.0/3.0;
  8922. // PHOSPHOR MASK TEXTURE CONSTANTS:
  8923. // Set the following constants to reflect the properties of the phosphor mask
  8924. // texture named in crt-royale.cgp. The shader optionally resizes a mask tile
  8925. // based on user settings, then repeats a single tile until filling the screen.
  8926. // The shader must know the input texture size (default 64x64), and to manually
  8927. // resize, it must also know the horizontal triads per tile (default 8).
  8928. static const float2 mask_texture_small_size = float2(64.0, 64.0);
  8929. static const float2 mask_texture_large_size = float2(512.0, 512.0);
  8930. static const float mask_triads_per_tile = 8.0;
  8931. // We need the average brightness of the phosphor mask to compensate for the
  8932. // dimming it causes. The following four values are roughly correct for the
  8933. // masks included with the shader. Update the value for any LUT texture you
  8934. // change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
  8935. // the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
  8936. //#define PHOSPHOR_MASK_GRILLE14
  8937. static const float mask_grille14_avg_color = 50.6666666/255.0;
  8938. // TileableLinearApertureGrille14Wide7d33Spacing*.png
  8939. // TileableLinearApertureGrille14Wide10And6Spacing*.png
  8940. static const float mask_grille15_avg_color = 53.0/255.0;
  8941. // TileableLinearApertureGrille15Wide6d33Spacing*.png
  8942. // TileableLinearApertureGrille15Wide8And5d5Spacing*.png
  8943. static const float mask_slot_avg_color = 46.0/255.0;
  8944. // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
  8945. // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
  8946. static const float mask_shadow_avg_color = 41.0/255.0;
  8947. // TileableLinearShadowMask*.png
  8948. // TileableLinearShadowMaskEDP*.png
  8949. #ifdef PHOSPHOR_MASK_GRILLE14
  8950. static const float mask_grille_avg_color = mask_grille14_avg_color;
  8951. #else // PHOSPHOR_MASK_GRILLE14 not #defined: use the 15-pixel grille average
  8952. static const float mask_grille_avg_color = mask_grille15_avg_color;
  8953. #endif // PHOSPHOR_MASK_GRILLE14
  8954. #endif // USER_CGP_CONSTANTS_H
  8955. ////////////////////////// END USER-CGP-CONSTANTS //////////////////////////
  8956. //////////////////////////////// END INCLUDES ////////////////////////////////
  8957. /////////////////////////////// FIXED SETTINGS ///////////////////////////////
  8958. // Avoid dividing by zero by clamping abs(c) to at least 2^-16 (note: the sign of c is discarded); using a macro overloads for float, float2, etc.:
  8959. #define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16
  8960. // Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
  8961. #ifndef SIMULATE_CRT_ON_LCD
  8962. #define SIMULATE_CRT_ON_LCD
  8963. #endif // SIMULATE_CRT_ON_LCD
  8964. // Manually tiling a manually resized texture creates texture coord derivative
  8965. // discontinuities and confuses anisotropic filtering, causing discolored tile
  8966. // seams in the phosphor mask. Workarounds:
  8967. // a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's
  8968. // downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
  8969. // disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
  8970. // b.) "Tile flat twice" requires drawing two full tiles without border padding
  8971. // to the resized mask FBO, and it's incompatible with same-pass curvature.
  8972. // (Same-pass curvature isn't used but could be in the future...maybe.)
  8973. // c.) "Fix discontinuities" requires derivatives and drawing one tile with
  8974. // border padding to the resized mask FBO, but it works with same-pass
  8975. // curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
  8976. // Precedence: a, then b, then c (if multiple strategies are #defined).
  8977. #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen
  8978. #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen
  8979. #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen
  8980. // Also, manually resampling the phosphor mask is slightly blurrier with
  8981. // anisotropic filtering. (Resampling with mipmapping is even worse: It
  8982. // creates artifacts, but only with the fully bloomed shader.) The difference
  8983. // is subtle with small triads, but you can fix it for a small cost.
  8984. //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  8985. ////////////////////////////// DERIVED SETTINGS //////////////////////////////
  8986. // Intel HD 4000 GPUs can't handle manual mask resizing (for now), setting the
  8987. // geometry mode at runtime, or a 4x4 true Gaussian resize. Disable
  8988. // incompatible settings ASAP. (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
  8989. // #defined by either user-settings.h or a wrapper .cg that #includes the
  8990. // current .cg pass.)
  8991. #ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
  8992. #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
  8993. #undef PHOSPHOR_MASK_MANUALLY_RESIZE
  8994. #endif // PHOSPHOR_MASK_MANUALLY_RESIZE
  8995. #ifdef RUNTIME_GEOMETRY_MODE
  8996. #undef RUNTIME_GEOMETRY_MODE
  8997. #endif // RUNTIME_GEOMETRY_MODE
  8998. // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
  8999. // inferior in most cases, so replace 2.0 with 0.0:
  9000. static const float bloom_approx_filter =
  9001. bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
  9002. #else // !INTEGRATED_GRAPHICS_COMPATIBILITY_MODE: use the static setting unchanged
  9003. static const float bloom_approx_filter = bloom_approx_filter_static;
  9004. #endif // INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
  9005. // Disable slow runtime paths if static parameters are used. Most of these
  9006. // won't be a problem anyway once the params are disabled, but some will.
  9007. #ifndef RUNTIME_SHADER_PARAMS_ENABLE
  9008. #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
  9009. #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
  9010. #endif
  9011. #ifdef RUNTIME_ANTIALIAS_WEIGHTS
  9012. #undef RUNTIME_ANTIALIAS_WEIGHTS
  9013. #endif
  9014. #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
  9015. #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
  9016. #endif
  9017. #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
  9018. #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
  9019. #endif
  9020. #ifdef RUNTIME_GEOMETRY_TILT
  9021. #undef RUNTIME_GEOMETRY_TILT
  9022. #endif
  9023. #ifdef RUNTIME_GEOMETRY_MODE
  9024. #undef RUNTIME_GEOMETRY_MODE
  9025. #endif
  9026. #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  9027. #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  9028. #endif
  9029. #endif // !RUNTIME_SHADER_PARAMS_ENABLE
  9030. // Make tex2Dbias a backup for tex2Dlod for wider compatibility.
  9031. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  9032. #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  9033. #endif // ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  9034. #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  9035. #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  9036. #endif // ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  9037. // Rule out unavailable anisotropic compatibility strategies:
  9038. #ifndef DRIVERS_ALLOW_DERIVATIVES
  9039. #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  9040. #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  9041. #endif
  9042. #endif // !DRIVERS_ALLOW_DERIVATIVES
  9043. #ifndef DRIVERS_ALLOW_TEX2DLOD
  9044. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  9045. #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  9046. #endif
  9047. #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  9048. #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  9049. #endif
  9050. #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
  9051. #undef ANTIALIAS_DISABLE_ANISOTROPIC
  9052. #endif
  9053. #endif // !DRIVERS_ALLOW_TEX2DLOD
  9054. #ifndef DRIVERS_ALLOW_TEX2DBIAS
  9055. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  9056. #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  9057. #endif
  9058. #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  9059. #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  9060. #endif
  9061. #endif // !DRIVERS_ALLOW_TEX2DBIAS
  9062. // Prioritize anisotropic tiling compatibility strategies by performance and
  9063. // disable unused strategies. This concentrates all the nesting in one place.
  9064. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  9065. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  9066. #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  9067. #endif
  9068. #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  9069. #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  9070. #endif
  9071. #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  9072. #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  9073. #endif
  9074. #else // !ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  9075. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  9076. #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  9077. #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  9078. #endif
  9079. #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  9080. #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  9081. #endif
  9082. #else // !ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  9083. // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
  9084. // flat texture coords in the same pass, but that's all we use.
  9085. #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  9086. #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  9087. #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  9088. #endif // ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  9089. #endif // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  9090. #endif // ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  9091. #endif // ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  9092. // The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
  9093. // reduce some #ifdef nesting in the next section by essentially OR'ing them:
  9094. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  9095. #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
  9096. #endif // ANISOTROPIC_TILING_COMPAT_TEX2DLOD
  9097. #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  9098. #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
  9099. #endif // ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
  9100. // Prioritize anisotropic resampling compatibility strategies the same way:
  9101. #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  9102. #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  9103. #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  9104. #endif // ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
  9105. #endif // ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  9106. /////////////////////// DERIVED PHOSPHOR MASK CONSTANTS //////////////////////
  9107. // If we can use the large mipmapped LUT without mipmapping artifacts, we
  9108. // should: It gives us more options for using fewer samples.
  9109. #ifdef DRIVERS_ALLOW_TEX2DLOD
  9110. #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  9111. // TODO: Take advantage of this!
  9112. #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
  9113. static const float2 mask_resize_src_lut_size = mask_texture_large_size;
  9114. #else // !ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD: fall back to the small LUT
  9115. static const float2 mask_resize_src_lut_size = mask_texture_small_size;
  9116. #endif // ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
  9117. #else // !DRIVERS_ALLOW_TEX2DLOD: fall back to the small LUT
  9118. static const float2 mask_resize_src_lut_size = mask_texture_small_size;
  9119. #endif // DRIVERS_ALLOW_TEX2DLOD
  9120. // tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
  9121. // main_fragment, or a static alias of one of the above. This makes it hard
  9122. // to select the phosphor mask at runtime: We can't even assign to a uniform
  9123. // global in the vertex shader or select a sampler2D in the vertex shader and
  9124. // pass it to the fragment shader (even with explicit TEXUNIT# bindings),
  9125. // because it just gives us the input texture or a black screen. However, we
  9126. // can get around these limitations by calling tex2D three times with different
  9127. // uniform samplers (or resizing the phosphor mask three times altogether).
  9128. // With dynamic branches, we can process only one of these branches on top of
  9129. // quickly discarding fragments we don't need (cgc seems able to overcome
  9130. // limitations around dependent texture fetches inside of branches). Without
  9131. // dynamic branches, we have to process every branch for every fragment...which
  9132. // is slower. Runtime sampling mode selection is slower without dynamic
  9133. // branches as well. Let the user's static #defines decide if it's worth it.
  9134. #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
  9135. #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  9136. #else // !DRIVERS_ALLOW_DYNAMIC_BRANCHES: enable only if the user forces it
  9137. #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  9138. #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  9139. #endif // FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
  9140. #endif // DRIVERS_ALLOW_DYNAMIC_BRANCHES
  9141. // We need to render some minimum number of tiles in the resize passes.
  9142. // We need at least 1.0 just to repeat a single tile, and we need extra
  9143. // padding beyond that for anisotropic filtering, discontinuity fixing,
  9144. // antialiasing, same-pass curvature (not currently used), etc. First
  9145. // determine how many border texels and tiles we need, based on how the result
  9146. // will be sampled:
  9147. #ifdef GEOMETRY_EARLY
  9148. static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; // only the x component is used here
  9149. // Most antialiasing filters have a base radius of 4.0 pixels:
  9150. static const float max_aa_base_pixel_border = 4.0 +
  9151. max_subpixel_offset;
  9152. #else // !GEOMETRY_EARLY: no antialiasing border is reserved
  9153. static const float max_aa_base_pixel_border = 0.0;
  9154. #endif // GEOMETRY_EARLY
  9155. // Anisotropic filtering adds about 0.5 to the pixel border:
  9156. #ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
  9157. static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
  9158. #else // ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY: no extra anisotropic padding
  9159. static const float max_aniso_pixel_border = max_aa_base_pixel_border;
  9160. #endif // !ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
  9161. // Fixing discontinuities adds 1.0 more to the pixel border:
  9162. #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  9163. static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
  9164. #else // !ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  9165. static const float max_tiled_pixel_border = max_aniso_pixel_border;
  9166. #endif // ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
  9167. // Convert the pixel border to an integer texel border. Assume same-pass
  9168. // curvature about triples the texel frequency:
  9169. #ifdef GEOMETRY_EARLY
  9170. static const float max_mask_texel_border =
  9171. ceil(max_tiled_pixel_border * 3.0);
  9172. #else // !GEOMETRY_EARLY
  9173. static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
  9174. #endif // GEOMETRY_EARLY
  9175. // Convert the texel border to a tile border using worst-case assumptions:
  9176. static const float max_mask_tile_border = max_mask_texel_border/
  9177. (mask_min_allowed_triad_size * mask_triads_per_tile); // divisor: tile width at the minimum allowed triad size
  9178. // Finally, set the number of resized tiles to render to MASK_RESIZE, and set
  9179. // the starting texel (inside borders) for sampling it.
  9180. #ifndef GEOMETRY_EARLY
  9181. #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  9182. // Special case: Render two tiles without borders. Anisotropic
  9183. // filtering doesn't seem to be a problem here.
  9184. static const float mask_resize_num_tiles = 1.0 + 1.0;
  9185. static const float mask_start_texels = 0.0;
  9186. #else // !ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE: one tile plus a border on each side
  9187. static const float mask_resize_num_tiles = 1.0 +
  9188. 2.0 * max_mask_tile_border;
  9189. static const float mask_start_texels = max_mask_texel_border;
  9190. #endif // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
  9191. #else // GEOMETRY_EARLY: one tile plus a border on each side
  9192. static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
  9193. static const float mask_start_texels = max_mask_texel_border;
  9194. #endif // !GEOMETRY_EARLY
  9195. // We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
  9196. // mask_resize_viewport_scale. This limits the maximum final triad size.
  9197. // Estimate the minimum number of triads we can split the screen into in each
  9198. // dimension (we'll be as correct as mask_resize_viewport_scale is):
  9199. static const float mask_resize_num_triads =
  9200. mask_resize_num_tiles * mask_triads_per_tile;
  9201. static const float2 min_allowed_viewport_triads =
  9202. float2(mask_resize_num_triads) / mask_resize_viewport_scale; // per dimension: resized triad count divided by the viewport-relative scale
  9203. //////////////////////// COMMON MATHEMATICAL CONSTANTS ///////////////////////
  9204. static const float pi = 3.141592653589;
  9205. // We often want to find the location of the previous texel, e.g.:
  9206. // const float2 curr_texel = uv * texture_size;
  9207. // const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
  9208. // const float2 prev_texel_uv = prev_texel / texture_size;
  9209. // However, many GPU drivers round incorrectly around exact texel locations.
  9210. // We need to subtract a little less than 0.5 before flooring, and some GPUs
  9211. // require this value to be farther from 0.5 than others; define it here.
  9212. // const float2 prev_texel =
  9213. // floor(curr_texel - float2(under_half)) + float2(0.5);
  9214. static const float under_half = 0.4995;
  9215. #endif // DERIVED_SETTINGS_AND_CONSTANTS_H
  9216. ///////////////////////////// END DERIVED-SETTINGS-AND-CONSTANTS ////////////////////////////
  9217. //#include "../../../../include/blur-functions.h"
  9218. //////////////////////////// BEGIN BLUR-FUNCTIONS ///////////////////////////
  9219. #ifndef BLUR_FUNCTIONS_H
  9220. #define BLUR_FUNCTIONS_H
  9221. ///////////////////////////////// MIT LICENSE ////////////////////////////////
  9222. // Copyright (C) 2014 TroggleMonkey
  9223. //
  9224. // Permission is hereby granted, free of charge, to any person obtaining a copy
  9225. // of this software and associated documentation files (the "Software"), to
  9226. // deal in the Software without restriction, including without limitation the
  9227. // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  9228. // sell copies of the Software, and to permit persons to whom the Software is
  9229. // furnished to do so, subject to the following conditions:
  9230. //
  9231. // The above copyright notice and this permission notice shall be included in
  9232. // all copies or substantial portions of the Software.
  9233. //
  9234. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  9235. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  9236. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  9237. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  9238. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  9239. // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  9240. // IN THE SOFTWARE.
  9241. ///////////////////////////////// DESCRIPTION ////////////////////////////////
  9242. // This file provides reusable one-pass and separable (two-pass) blurs.
  9243. // Requires: All blurs share these requirements (dxdy requirement is split):
  9244. // 1.) All requirements of gamma-management.h must be satisfied!
  9245. // 2.) filter_linearN must == "true" in your .cgp preset unless
  9246. // you're using tex2DblurNresize at 1x scale.
  9247. // 3.) mipmap_inputN must == "true" in your .cgp preset if
  9248. // output_size < video_size.
  9249. // 4.) output_size == video_size / pow(2, M), where M is some
  9250. // positive integer. tex2Dblur*resize can resize arbitrarily
  9251. // (and the blur will be done after resizing), but arbitrary
  9252. // resizes "fail" with other blurs due to the way they mix
  9253. // static weights with bilinear sample exploitation.
  9254. // 5.) In general, dxdy should contain the uv pixel spacing:
  9255. // dxdy = (video_size/output_size)/texture_size
  9256. // 6.) For separable blurs (tex2DblurNresize and tex2DblurNfast),
  9257. // zero out the dxdy component in the unblurred dimension:
  9258. // dxdy = float2(dxdy.x, 0.0) or float2(0.0, dxdy.y)
  9259. // Many blurs share these requirements:
  9260. // 1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0,
  9261. // or they will blur more in the lower-scaled dimension.
  9262. // 2.) One-pass shared sample blurs require ddx(), ddy(), and
  9263. // tex2Dlod() to be supported by the current Cg profile, and
  9264. // the drivers must support high-quality derivatives.
  9265. // 3.) One-pass shared sample blurs require:
  9266. // tex_uv.w == log2(video_size/output_size).y;
  9267. // Non-wrapper blurs share this requirement:
  9268. // 1.) sigma is the intended standard deviation of the blur
  9269. // Wrapper blurs share this requirement, which is automatically
  9270. // met (unless OVERRIDE_BLUR_STD_DEVS is #defined; see below):
  9271. // 1.) blurN_std_dev must be global static const float values
  9272. // specifying standard deviations for Nx blurs in units
  9273. // of destination pixels
  9274. // Optional: 1.) The including file (or an earlier included file) may
  9275. // optionally #define USE_BINOMIAL_BLUR_STD_DEVS to replace
  9276. // default standard deviations with those matching a binomial
  9277. // distribution. (See below for details/properties.)
  9278. // 2.) The including file (or an earlier included file) may
  9279. // optionally #define OVERRIDE_BLUR_STD_DEVS and override:
  9280. // static const float blur3_std_dev
  9281. // static const float blur4_std_dev
  9282. // static const float blur5_std_dev
  9283. // static const float blur6_std_dev
  9284. // static const float blur7_std_dev
  9285. // static const float blur8_std_dev
  9286. // static const float blur9_std_dev
  9287. // static const float blur10_std_dev
  9288. // static const float blur11_std_dev
  9289. // static const float blur12_std_dev
  9290. // static const float blur17_std_dev
  9291. // static const float blur25_std_dev
  9292. // static const float blur31_std_dev
  9293. // static const float blur43_std_dev
  9294. // 3.) The including file (or an earlier included file) may
  9295. // optionally #define OVERRIDE_ERROR_BLURRING and override:
  9296. // static const float error_blurring
  9297. // This tuning value helps mitigate weighting errors from one-
  9298. // pass shared-sample blurs sharing bilinear samples between
  9299. // fragments. Values closer to 0.0 have "correct" blurriness
  9300. // but allow more artifacts, and values closer to 1.0 blur away
  9301. // artifacts by sampling closer to halfway between texels.
  9302. // UPDATE 6/21/14: The above static constants may now be overridden
  9303. // by non-static uniform constants. This permits exposing blur
  9304. // standard deviations as runtime GUI shader parameters. However,
  9305. // using them keeps weights from being statically computed, and the
  9306. // speed hit depends on the blur: On my machine, uniforms kill over
  9307. // 53% of the framerate with tex2Dblur12x12shared, but they only
  9308. // drop the framerate by about 18% with tex2Dblur11fast.
  9309. // Quality and Performance Comparisons:
  9310. // For the purposes of the following discussion, "no sRGB" means
  9311. // GAMMA_ENCODE_EVERY_FBO is #defined, and "sRGB" means it isn't.
  9312. // 1.) tex2DblurNfast is always faster than tex2DblurNresize.
  9313. // 2.) tex2DblurNresize functions are the only ones that can arbitrarily resize
  9314. // well, because they're the only ones that don't exploit bilinear samples.
  9315. // This also means they're the only functions which can be truly gamma-
  9316. // correct without linear (or sRGB FBO) input, but only at 1x scale.
  9317. // 3.) One-pass shared sample blurs only have a speed advantage without sRGB.
  9318. // They also have some inaccuracies due to their shared-[bilinear-]sample
  9319. // design, which grow increasingly bothersome for smaller blurs and higher-
  9320. // frequency source images (relative to their resolution). I had high
  9321. // hopes for them, but their most realistic use case is limited to quickly
  9322. // reblurring an already blurred input at full resolution. Otherwise:
  9323. // a.) If you're blurring a low-resolution source, you want a better blur.
  9324. // b.) If you're blurring a lower mipmap, you want a better blur.
  9325. // c.) If you're blurring a high-resolution, high-frequency source, you
  9326. // want a better blur.
  9327. // 4.) The one-pass blurs without shared samples grow slower for larger blurs,
  9328. // but they're competitive with separable blurs at 5x5 and smaller, and
  9329. // even tex2Dblur7x7 isn't bad if you're wanting to conserve passes.
  9330. // Here are some framerates from a GeForce 8800GTS. The first pass resizes to
  9331. // viewport size (4x in this test) and linearizes for sRGB codepaths, and the
  9332. // remaining passes perform 6 full blurs. Mipmapped tests are performed at the
  9333. // same scale, so they just measure the cost of mipmapping each FBO (only every
  9334. // other FBO is mipmapped for separable blurs, to mimic realistic usage).
  9335. // Mipmap Neither sRGB+Mipmap sRGB Function
  9336. // 76.0 92.3 131.3 193.7 tex2Dblur3fast
  9337. // 63.2 74.4 122.4 175.5 tex2Dblur3resize
  9338. // 93.7 121.2 159.3 263.2 tex2Dblur3x3
  9339. // 59.7 68.7 115.4 162.1 tex2Dblur3x3resize
  9340. // 63.2 74.4 122.4 175.5 tex2Dblur5fast
  9341. // 49.3 54.8 100.0 132.7 tex2Dblur5resize
  9342. // 59.7 68.7 115.4 162.1 tex2Dblur5x5
  9343. // 64.9 77.2 99.1 137.2 tex2Dblur6x6shared
  9344. // 55.8 63.7 110.4 151.8 tex2Dblur7fast
  9345. // 39.8 43.9 83.9 105.8 tex2Dblur7resize
  9346. // 40.0 44.2 83.2 104.9 tex2Dblur7x7
  9347. // 56.4 65.5 71.9 87.9 tex2Dblur8x8shared
  9348. // 49.3 55.1 99.9 132.5 tex2Dblur9fast
  9349. // 33.3 36.2 72.4 88.0 tex2Dblur9resize
  9350. // 27.8 29.7 61.3 72.2 tex2Dblur9x9
  9351. // 37.2 41.1 52.6 60.2 tex2Dblur10x10shared
  9352. // 44.4 49.5 91.3 117.8 tex2Dblur11fast
  9353. // 28.8 30.8 63.6 75.4 tex2Dblur11resize
  9354. // 33.6 36.5 40.9 45.5 tex2Dblur12x12shared
  9355. // TODO: Fill in benchmarks for new untested blurs.
  9356. // tex2Dblur17fast
  9357. // tex2Dblur25fast
  9358. // tex2Dblur31fast
  9359. // tex2Dblur43fast
  9360. // tex2Dblur3x3resize
  9361. ///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////
  9362. // Set static standard deviations, but allow users to override them with their
  9363. // own constants (even non-static uniforms if they're okay with the speed hit):
  9364. #ifndef OVERRIDE_BLUR_STD_DEVS
  9365. // blurN_std_dev values are specified in terms of dxdy strides.
  9366. #ifdef USE_BINOMIAL_BLUR_STD_DEVS
  9367. // By request, we can define standard deviations corresponding to a
  9368. // binomial distribution with p = 0.5 (related to Pascal's triangle).
  9369. // This distribution works such that blurring multiple times should
  9370. // have the same result as a single larger blur. These values are
  9371. // larger than default for blurs up to 6x and smaller thereafter.
  9372. static const float blur3_std_dev = 0.84931640625;
  9373. static const float blur4_std_dev = 0.84931640625;
  9374. static const float blur5_std_dev = 1.0595703125;
  9375. static const float blur6_std_dev = 1.06591796875;
  9376. static const float blur7_std_dev = 1.17041015625;
  9377. static const float blur8_std_dev = 1.1720703125;
  9378. static const float blur9_std_dev = 1.2259765625;
  9379. static const float blur10_std_dev = 1.21982421875;
  9380. static const float blur11_std_dev = 1.25361328125;
  9381. static const float blur12_std_dev = 1.2423828125;
  9382. static const float blur17_std_dev = 1.27783203125;
  9383. static const float blur25_std_dev = 1.2810546875;
  9384. static const float blur31_std_dev = 1.28125;
  9385. static const float blur43_std_dev = 1.28125;
  9386. #else // !USE_BINOMIAL_BLUR_STD_DEVS: default standard deviations
  9387. // The defaults are the largest values that keep the largest unused
  9388. // blur term on each side <= 1.0/256.0. (We could get away with more
  9389. // or be more conservative, but this compromise is pretty reasonable.)
  9390. static const float blur3_std_dev = 0.62666015625;
  9391. static const float blur4_std_dev = 0.66171875;
  9392. static const float blur5_std_dev = 0.9845703125;
  9393. static const float blur6_std_dev = 1.02626953125;
  9394. static const float blur7_std_dev = 1.36103515625;
  9395. static const float blur8_std_dev = 1.4080078125;
  9396. static const float blur9_std_dev = 1.7533203125;
  9397. static const float blur10_std_dev = 1.80478515625;
  9398. static const float blur11_std_dev = 2.15986328125;
  9399. static const float blur12_std_dev = 2.215234375;
  9400. static const float blur17_std_dev = 3.45535583496;
  9401. static const float blur25_std_dev = 5.3409576416;
  9402. static const float blur31_std_dev = 6.86488037109;
  9403. static const float blur43_std_dev = 10.1852050781;
  9404. #endif // USE_BINOMIAL_BLUR_STD_DEVS
  9405. #endif // OVERRIDE_BLUR_STD_DEVS
  9406. #ifndef OVERRIDE_ERROR_BLURRING
  9407. // error_blurring should be in [0.0, 1.0]. Higher values reduce ringing
  9408. // in shared-sample blurs but increase blurring and feature shifting.
  9409. static const float error_blurring = 0.5;
  9410. #endif // OVERRIDE_ERROR_BLURRING
  9411. ////////////////////////////////// INCLUDES //////////////////////////////////
  9412. // gamma-management.h relies on pass-specific settings to guide its behavior:
  9413. // FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc. See it for details.
  9414. //#include "gamma-management.h"
  9415. //////////////////////////// BEGIN GAMMA-MANAGEMENT //////////////////////////
  9416. #ifndef GAMMA_MANAGEMENT_H
  9417. #define GAMMA_MANAGEMENT_H
  9418. ///////////////////////////////// MIT LICENSE ////////////////////////////////
  9419. // Copyright (C) 2014 TroggleMonkey
  9420. //
  9421. // Permission is hereby granted, free of charge, to any person obtaining a copy
  9422. // of this software and associated documentation files (the "Software"), to
  9423. // deal in the Software without restriction, including without limitation the
  9424. // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  9425. // sell copies of the Software, and to permit persons to whom the Software is
  9426. // furnished to do so, subject to the following conditions:
  9427. //
  9428. // The above copyright notice and this permission notice shall be included in
  9429. // all copies or substantial portions of the Software.
  9430. //
  9431. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  9432. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  9433. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  9434. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  9435. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  9436. // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  9437. // IN THE SOFTWARE.
  9438. ///////////////////////////////// DESCRIPTION ////////////////////////////////
  9439. // This file provides gamma-aware tex*D*() and encode_output() functions.
  9440. // Requires: Before #include-ing this file, the including file must #define
  9441. // the following macros when applicable and follow their rules:
  9442. // 1.) #define FIRST_PASS if this is the first pass.
  9443. // 2.) #define LAST_PASS if this is the last pass.
  9444. // 3.) If sRGB is available, set srgb_framebufferN = "true" for
  9445. // every pass except the last in your .cgp preset.
  9446. // 4.) If sRGB isn't available but you want gamma-correctness with
  9447. // no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
  9448. // 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
  9449. // 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
  9450. // 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
  9451. // 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
  9452. // If an option in [5, 8] is #defined in the first or last pass, it
  9453. // should be #defined for both. It shouldn't make a difference
  9454. // whether it's #defined for intermediate passes or not.
  9455. // Optional: The including file (or an earlier included file) may optionally
  9456. // #define a number of macros indicating it will override certain
  9457. // static constants with either static or uniform constants. The
  9458. // macros and associated constants are as follows:
  9459. // 1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
  9460. // static const float ntsc_gamma
  9461. // static const float pal_gamma
  9462. // static const float crt_reference_gamma_high
  9463. // static const float crt_reference_gamma_low
  9464. // static const float lcd_reference_gamma
  9465. // static const float crt_office_gamma
  9466. // static const float lcd_office_gamma
  9467. // 2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
  9468. // static const float crt_gamma
  9469. // static const float gba_gamma
  9470. // static const float lcd_gamma
  9471. // 3.) OVERRIDE_FINAL_GAMMA: The user must first define:
  9472. // static const float input_gamma
  9473. // static const float intermediate_gamma
  9474. // static const float output_gamma
  9475. // (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
  9476. // 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
  9477. // static const bool assume_opaque_alpha
  9478. // The gamma constant overrides must be used in every pass or none,
  9479. // and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
  9480. // OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
  9481. // Usage: After setting macros appropriately, ignore gamma correction and
  9482. // replace all tex*D*() calls with equivalent gamma-aware
  9483. // tex*D*_linearize calls, except:
  9484. // 1.) When you read an LUT, use regular tex*D or a gamma-specified
  9485. // function, depending on its gamma encoding:
  9486. // tex*D*_linearize_gamma (takes a runtime gamma parameter)
  9487. // 2.) If you must read pass0's original input in a later pass, use
  9488. // tex2D_linearize_ntsc_gamma. If you want to read pass0's
  9489. // input with gamma-corrected bilinear filtering, consider
  9490. // creating a first linearizing pass and reading from the input
  9491. // of pass1 later.
  9492. // Then, return encode_output(color) from every fragment shader.
  9493. // Finally, use the global gamma_aware_bilinear boolean if you want
  9494. // to statically branch based on whether bilinear filtering is
  9495. // gamma-correct or not (e.g. for placing Gaussian blur samples).
  9496. //
  9497. // Detailed Policy:
  9498. // tex*D*_linearize() functions enforce a consistent gamma-management policy
  9499. // based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume
  9500. // their input texture has the same encoding characteristics as the input for
  9501. // the current pass (which doesn't apply to the exceptions listed above).
  9502. // Similarly, encode_output() enforces a policy based on the LAST_PASS and
  9503. // GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the
  9504. // following two pipelines.
  9505. // Typical pipeline with intermediate sRGB framebuffers:
  9506. // linear_color = pow(pass0_encoded_color, input_gamma);
  9507. // intermediate_output = linear_color; // Automatic sRGB encoding
  9508. // linear_color = intermediate_output; // Automatic sRGB decoding
  9509. // final_output = pow(linear_color, 1.0/output_gamma);
  9510. // Typical pipeline without intermediate sRGB framebuffers:
  9511. // linear_color = pow(pass0_encoded_color, input_gamma);
  9512. // intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
  9513. // linear_color = pow(intermediate_output, intermediate_gamma);
  9514. // final_output = pow(linear_color, 1.0/output_gamma);
  9515. // Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
  9516. // easily get gamma-correctness without banding on devices where sRGB isn't
  9517. // supported.
  9518. //
  9519. // Use This Header to Maximize Code Reuse:
  9520. // The purpose of this header is to provide a consistent interface for texture
  9521. // reads and output gamma-encoding that localizes and abstracts away all the
  9522. // annoying details. This greatly reduces the amount of code in each shader
  9523. // pass that depends on the pass number in the .cgp preset or whether sRGB
  9524. // FBO's are being used: You can trivially change the gamma behavior of your
  9525. // whole pass by commenting or uncommenting 1-3 #defines. To reuse the same
  9526. // code in your first, Nth, and last passes, you can even put it all in another
  9527. // header file and #include it from skeleton .cg files that #define the
  9528. // appropriate pass-specific settings.
  9529. //
  9530. // Rationale for Using Three Macros:
  9531. // This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
  9532. // SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
  9533. // a lower maintenance burden on each pass. At first glance it seems we could
  9534. // accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
  9535. // This works for simple use cases where input_gamma == output_gamma, but it
  9536. // breaks down for more complex scenarios like CRT simulation, where the pass
  9537. // number determines the gamma encoding of the input and output.
  9538. /////////////////////////////// BASE CONSTANTS ///////////////////////////////
  9539. // Set standard gamma constants, but allow users to override them (define OVERRIDE_STANDARD_GAMMA and declare the same seven names beforehand):
  9540. #ifndef OVERRIDE_STANDARD_GAMMA
  9541. // Standard encoding gammas:
  9542. static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too?
  9543. static const float pal_gamma = 2.8; // Never actually 2.8 in practice
  9544. // Typical device decoding gammas (only use for emulating devices):
  9545. // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
  9546. // gammas: The standards purposely undercorrected for an analog CRT's
  9547. // assumed 2.5 reference display gamma to maintain contrast in assumed
  9548. // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
  9549. // These unstated assumptions about display gamma and perceptual rendering
  9550. // intent caused a lot of confusion, and more modern CRT's seemed to target
  9551. // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit
  9552. // (they struggle near black with 2.5 gamma anyway), especially PC/laptop
  9553. // displays designed to view sRGB in bright environments. (Standards are
  9554. // also in flux again with BT.1886, but it's underspecified for displays.)
  9555. static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55); the non-override default returned by get_crt_gamma()
  9556. static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55)
  9557. static const float lcd_reference_gamma = 2.5; // To match CRT
  9558. static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC
  9559. static const float lcd_office_gamma = 2.2; // Approximates sRGB; the non-override default returned by get_lcd_gamma()
  9560. #endif // OVERRIDE_STANDARD_GAMMA
  9561. // Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
  9562. // but only if they're aware of it.
  9563. #ifndef OVERRIDE_ALPHA_ASSUMPTIONS
  9564. static const bool assume_opaque_alpha = false; // When true, encode_output()/decode_input()/decode_gamma_input() write alpha = 1.0 instead of passing color.a through.
  9565. #endif // OVERRIDE_ALPHA_ASSUMPTIONS
  9566. /////////////////////// DERIVED CONSTANTS AS FUNCTIONS ///////////////////////
  9567. // gamma-management.h should be compatible with overriding gamma values with
  9568. // runtime user parameters, but we can only define other global constants in
  9569. // terms of static constants, not uniform user parameters. To get around this
  9570. // limitation, we need to define derived constants using functions.
  9571. // Set device gamma constants, but allow users to override them:
  9572. #ifndef OVERRIDE_DEVICE_GAMMA
  9573. // No override requested: fall back to the built-in defaults.
  9574. inline float get_crt_gamma() { return crt_reference_gamma_high; }
  9575. inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0)
  9576. inline float get_lcd_gamma() { return lcd_office_gamma; }
  9577. #else // The including file promised to define crt_gamma, gba_gamma and lcd_gamma globally:
  9578. inline float get_crt_gamma() { return crt_gamma; }
  9579. inline float get_gba_gamma() { return gba_gamma; }
  9580. inline float get_lcd_gamma() { return lcd_gamma; }
  9581. #endif // OVERRIDE_DEVICE_GAMMA
  9582. // Set decoding/encoding gammas for the first/last passes, but allow overrides:
  9583. #ifdef OVERRIDE_FINAL_GAMMA
  9584. // Overridden: the including file supplies all three constants globally.
  9585. inline float get_input_gamma() { return input_gamma; }
  9586. inline float get_intermediate_gamma() { return intermediate_gamma; }
  9587. inline float get_output_gamma() { return output_gamma; }
  9588. #else
  9589. // Between passes, always use ntsc_gamma so that middle passes never have
  9590. // to know whether a device is being simulated:
  9591. inline float get_intermediate_gamma() { return ntsc_gamma; }
  9592. // Choose the first-pass decode gamma and last-pass encode gamma from the
  9593. // first SIMULATE_* macro that is defined, tested in this fixed order:
  9594. #if defined(SIMULATE_CRT_ON_LCD)
  9595. // Decode with the CRT gamma, encode with the LCD gamma.
  9596. inline float get_input_gamma() { return get_crt_gamma(); }
  9597. inline float get_output_gamma() { return get_lcd_gamma(); }
  9598. #elif defined(SIMULATE_GBA_ON_LCD)
  9599. // Decode with the GBA gamma, encode with the LCD gamma.
  9600. inline float get_input_gamma() { return get_gba_gamma(); }
  9601. inline float get_output_gamma() { return get_lcd_gamma(); }
  9602. #elif defined(SIMULATE_LCD_ON_CRT)
  9603. // Decode with the LCD gamma, encode with the CRT gamma.
  9604. inline float get_input_gamma() { return get_lcd_gamma(); }
  9605. inline float get_output_gamma() { return get_crt_gamma(); }
  9606. #elif defined(SIMULATE_GBA_ON_CRT)
  9607. // Decode with the GBA gamma, encode with the CRT gamma.
  9608. inline float get_input_gamma() { return get_gba_gamma(); }
  9609. inline float get_output_gamma() { return get_crt_gamma(); }
  9610. #else // Don't simulate anything:
  9611. inline float get_input_gamma() { return ntsc_gamma; }
  9612. inline float get_output_gamma() { return ntsc_gamma; }
  9613. #endif // SIMULATE_CRT_ON_LCD / GBA_ON_LCD / LCD_ON_CRT / GBA_ON_CRT
  9614. #endif // OVERRIDE_FINAL_GAMMA
  9615. // Set decoding/encoding gammas for the current pass. Use static constants for
  9616. // linearize_input and gamma_encode_output, because they aren't derived, and
  9617. // they let the compiler do dead-code elimination.
  9618. #ifdef GAMMA_ENCODE_EVERY_FBO
  9619. static const bool linearize_input = true; // Every FBO is gamma-encoded, so each pass decodes its input...
  9620. static const bool gamma_encode_output = true; // ...and re-encodes its output.
  9621. #ifndef FIRST_PASS
  9622. inline float get_pass_input_gamma() { return get_intermediate_gamma(); }
  9623. #else
  9624. inline float get_pass_input_gamma() { return get_input_gamma(); }
  9625. #endif
  9626. #ifndef LAST_PASS
  9627. inline float get_pass_output_gamma() { return get_intermediate_gamma(); }
  9628. #else
  9629. inline float get_pass_output_gamma() { return get_output_gamma(); }
  9630. #endif
  9631. #else // GAMMA_ENCODE_EVERY_FBO is not defined: only FIRST_PASS decodes and only LAST_PASS encodes.
  9632. #ifndef FIRST_PASS
  9633. static const bool linearize_input = false;
  9634. inline float get_pass_input_gamma() { return 1.0; }
  9635. #else
  9636. static const bool linearize_input = true;
  9637. inline float get_pass_input_gamma() { return get_input_gamma(); }
  9638. #endif
  9639. #ifndef LAST_PASS
  9640. static const bool gamma_encode_output = false;
  9641. inline float get_pass_output_gamma() { return 1.0; }
  9642. #else
  9643. static const bool gamma_encode_output = true;
  9644. inline float get_pass_output_gamma() { return get_output_gamma(); }
  9645. #endif
  9646. #endif // GAMMA_ENCODE_EVERY_FBO
  9647. // Users might want to know if bilinear filtering will be gamma-correct. This is flagged true exactly when the current pass does not linearize its input in the shader (linearize_input == false):
  9648. static const bool gamma_aware_bilinear = !linearize_input;
  9649. ////////////////////// COLOR ENCODING/DECODING FUNCTIONS /////////////////////
  9650. inline float4 encode_output(const float4 color)
  9651. {
  9652.     // Gamma-encodes color.rgb with exponent 1.0/get_pass_output_gamma()
  9653.     // when gamma_encode_output is set; otherwise returns color untouched.
  9654.     // Alpha is forced to 1.0 under assume_opaque_alpha, else passed through.
  9655.     if(!gamma_encode_output)
  9656.     {
  9657.         return color;
  9658.     }
  9659.     // The same exponent is applied to all three channels:
  9660.     float3 encode_exponent = float3(1.0/get_pass_output_gamma());
  9661.     float3 encoded_rgb = pow(color.rgb, encode_exponent);
  9662.     if(assume_opaque_alpha)
  9663.     {
  9664.         return float4(encoded_rgb, 1.0);
  9665.     }
  9666.     return float4(encoded_rgb, color.a);
  9667. }
  9668. inline float4 decode_input(const float4 color)
  9669. {
  9670.     // Linearizes color.rgb by raising it to get_pass_input_gamma() when
  9671.     // linearize_input is set; otherwise returns color untouched.
  9672.     // Alpha is forced to 1.0 under assume_opaque_alpha, else passed through.
  9673.     if(!linearize_input)
  9674.     {
  9675.         return color;
  9676.     }
  9677.     // The same exponent is applied to all three channels:
  9678.     float3 decode_exponent = float3(get_pass_input_gamma());
  9679.     float3 linear_rgb = pow(color.rgb, decode_exponent);
  9680.     if(assume_opaque_alpha)
  9681.     {
  9682.         return float4(linear_rgb, 1.0);
  9683.     }
  9684.     return float4(linear_rgb, color.a);
  9685. }
  9686. inline float4 decode_gamma_input(const float4 color, const float3 gamma)
  9687. {
  9688.     // Raises color.rgb to the per-channel exponent gamma unconditionally (linearize_input is not consulted here).
  9689.     // Alpha is forced to 1.0 under assume_opaque_alpha, else passed through.
  9690.     float3 linear_rgb = pow(color.rgb, gamma);
  9691.     if(assume_opaque_alpha)
  9692.     {
  9693.         return float4(linear_rgb, 1.0);
  9694.     }
  9695.     return float4(linear_rgb, color.a);
  9696. }
  9697. //TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯
  9698. //#define tex2D_linearize(C, D) decode_input(vec4(COMPAT_TEXTURE(C, D)))
  9699. // EDIT: it's the 'const' in front of the coords that's doing it
  9700. /////////////////////////// TEXTURE LOOKUP WRAPPERS //////////////////////////
  9701. // "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
  9702. // Provide a wide array of linearizing texture lookup wrapper functions. The
  9703. // Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
  9704. // lookups are provided for completeness in case that changes someday. Nobody
  9705. // is likely to use the *fetch and *proj functions, but they're included just
  9706. // in case. The only tex*D texture sampling functions omitted are:
  9707. // - tex*Dcmpbias
  9708. // - tex*Dcmplod
  9709. // - tex*DARRAY*
  9710. // - tex*DMS*
  9711. // - Variants returning integers
  9712. // Standard line length restrictions are ignored below for vertical brevity.
  9713. /*
  9714. // tex1D:
  9715. inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
  9716. { return decode_input(tex1D(tex, tex_coords)); }
  9717. inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
  9718. { return decode_input(tex1D(tex, tex_coords)); }
  9719. inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
  9720. { return decode_input(tex1D(tex, tex_coords, texel_off)); }
  9721. inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
  9722. { return decode_input(tex1D(tex, tex_coords, texel_off)); }
  9723. inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
  9724. { return decode_input(tex1D(tex, tex_coords, dx, dy)); }
  9725. inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
  9726. { return decode_input(tex1D(tex, tex_coords, dx, dy)); }
  9727. inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
  9728. { return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); }
  9729. inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
  9730. { return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); }
  9731. // tex1Dbias:
  9732. inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
  9733. { return decode_input(tex1Dbias(tex, tex_coords)); }
  9734. inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
  9735. { return decode_input(tex1Dbias(tex, tex_coords, texel_off)); }
  9736. // tex1Dfetch:
  9737. inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
  9738. { return decode_input(tex1Dfetch(tex, tex_coords)); }
  9739. inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
  9740. { return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); }
  9741. // tex1Dlod:
  9742. inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
  9743. { return decode_input(tex1Dlod(tex, tex_coords)); }
  9744. inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
  9745. { return decode_input(tex1Dlod(tex, tex_coords, texel_off)); }
  9746. // tex1Dproj:
  9747. inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
  9748. { return decode_input(tex1Dproj(tex, tex_coords)); }
  9749. inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
  9750. { return decode_input(tex1Dproj(tex, tex_coords)); }
  9751. inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
  9752. { return decode_input(tex1Dproj(tex, tex_coords, texel_off)); }
  9753. inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
  9754. { return decode_input(tex1Dproj(tex, tex_coords, texel_off)); }
  9755. */
  9756. // tex2D: fetch a texel and pass it through decode_input(). The float3 overloads use only tex_coords.xy. COMPAT_TEXTURE is defined outside this section (presumably a plain 2D texture fetch -- confirm).
  9757. inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords)
  9758. { return decode_input(COMPAT_TEXTURE(tex, tex_coords)); }
  9759. inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords)
  9760. { return decode_input(COMPAT_TEXTURE(tex, tex_coords.xy)); }
  9761. inline float4 tex2D_linearize(const sampler2D tex, float2 tex_coords, int texel_off) // NOTE(review): texel_off is passed as textureLod's third argument, i.e. as a LOD rather than a texel offset -- confirm this is intended.
  9762. { return decode_input(textureLod(tex, tex_coords, texel_off)); }
  9763. inline float4 tex2D_linearize(const sampler2D tex, float3 tex_coords, int texel_off) // Same caveat as the float2 + texel_off overload above.
  9764. { return decode_input(textureLod(tex, tex_coords.xy, texel_off)); }
  9765. //inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
  9766. //{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); }
  9767. //inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
  9768. //{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy)); }
  9769. //inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
  9770. //{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); }
  9771. //inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
  9772. //{ return decode_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off)); }
  9773. // tex2Dbias:
  9774. //inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
  9775. //{ return decode_input(tex2Dbias(tex, tex_coords)); }
  9776. //inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
  9777. //{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); }
  9778. // tex2Dfetch:
  9779. //inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
  9780. //{ return decode_input(tex2Dfetch(tex, tex_coords)); }
  9781. //inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
  9782. //{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); }
  9783. // tex2Dlod: sample at an explicit mip level, then decode_input() the texel. Following the Cg tex2Dlod convention these wrappers stand in for, tex_coords.xy is the lookup position and tex_coords.w is the level of detail (tex_coords.z is unused).
  9784. inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords)
  9785. { return decode_input(textureLod(tex, tex_coords.xy, tex_coords.w)); } // Honor the caller's LOD in .w instead of hard-coding level 0; callers passing w == 0.0 see no change.
  9786. inline float4 tex2Dlod_linearize(const sampler2D tex, float4 tex_coords, int texel_off) // NOTE(review): left unchanged -- texel_off is still forwarded as textureLod's LOD argument; a texel offset may have been intended, so confirm against callers before changing.
  9787. { return decode_input(textureLod(tex, tex_coords.xy, texel_off)); }
  9788. /*
  9789. // tex2Dproj:
  9790. inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
  9791. { return decode_input(tex2Dproj(tex, tex_coords)); }
  9792. inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
  9793. { return decode_input(tex2Dproj(tex, tex_coords)); }
  9794. inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
  9795. { return decode_input(tex2Dproj(tex, tex_coords, texel_off)); }
  9796. inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
  9797. { return decode_input(tex2Dproj(tex, tex_coords, texel_off)); }
  9798. */
  9799. /*
  9800. // tex3D:
  9801. inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
  9802. { return decode_input(tex3D(tex, tex_coords)); }
  9803. inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
  9804. { return decode_input(tex3D(tex, tex_coords, texel_off)); }
  9805. inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
  9806. { return decode_input(tex3D(tex, tex_coords, dx, dy)); }
  9807. inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
  9808. { return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); }
  9809. // tex3Dbias:
  9810. inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
  9811. { return decode_input(tex3Dbias(tex, tex_coords)); }
  9812. inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
  9813. { return decode_input(tex3Dbias(tex, tex_coords, texel_off)); }
  9814. // tex3Dfetch:
  9815. inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
  9816. { return decode_input(tex3Dfetch(tex, tex_coords)); }
  9817. inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
  9818. { return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); }
  9819. // tex3Dlod:
  9820. inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
  9821. { return decode_input(tex3Dlod(tex, tex_coords)); }
  9822. inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
  9823. { return decode_input(tex3Dlod(tex, tex_coords, texel_off)); }
  9824. // tex3Dproj:
  9825. inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
  9826. { return decode_input(tex3Dproj(tex, tex_coords)); }
  9827. inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
  9828. { return decode_input(tex3Dproj(tex, tex_coords, texel_off)); }
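  9829. ///////// NOTE: the block comment opened above "tex3D:" is NOT closed on this line; it stays open until the next comment terminator further down, so the two tex2D_linearize_gamma overloads below are disabled too.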
  9830. // NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
  9831. // This narrow selection of nonstandard tex2D* functions can be useful:
  9832. // tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
  9833. //inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
  9834. //{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); }
  9835. //inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
  9836. //{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); }
  9837. // MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
  9838. // Provide a narrower selection of tex2D* wrapper functions that decode an
  9839. // input sample with a specified gamma value. These are useful for reading
  9840. // LUT's and for reading the input of pass0 in a later pass.
  9841. // tex2D:
  9842. inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
  9843. { return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords), gamma); }
  9844. inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
  9845. { return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords.xy), gamma); }
  9846. //inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
  9847. //{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); }
  9848. //inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
  9849. //{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, texel_off), gamma); }
  9850. //inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
  9851. //{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); }
  9852. //inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
  9853. //{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy), gamma); }
  9854. //inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
  9855. //{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); }
  9856. //inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
  9857. //{ return decode_gamma_input(COMPAT_TEXTURE(tex, tex_coords, dx, dy, texel_off), gamma); }
  9858. /*
  9859. // tex2Dbias:
  9860. inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
  9861. { return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); }
  9862. inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
  9863. { return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); }
  9864. // tex2Dfetch:
  9865. inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
  9866. { return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); }
  9867. inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
  9868. { return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); }
  9869. */
  9870. // tex2Dlod: sample at an explicit mip level, then decode the texel with the caller-supplied per-channel gamma via decode_gamma_input(). Following the Cg tex2Dlod convention, tex_coords.xy is the lookup position and tex_coords.w is the level of detail (tex_coords.z is unused).
  9871. inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, float3 gamma)
  9872. { return decode_gamma_input(textureLod(tex, tex_coords.xy, tex_coords.w), gamma); } // Honor the caller's LOD in .w instead of hard-coding level 0; callers passing w == 0.0 see no change.
  9873. inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, float4 tex_coords, int texel_off, float3 gamma) // NOTE(review): left unchanged -- texel_off is still forwarded as textureLod's LOD argument; a texel offset may have been intended, so confirm against callers before changing.
  9874. { return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); }
  9875. #endif // GAMMA_MANAGEMENT_H
  9876. //////////////////////////// END GAMMA-MANAGEMENT //////////////////////////
  9877. //#include "quad-pixel-communication.h"
  9878. /////////////////////// BEGIN QUAD-PIXEL-COMMUNICATION //////////////////////
  9879. #ifndef QUAD_PIXEL_COMMUNICATION_H
  9880. #define QUAD_PIXEL_COMMUNICATION_H
  9881. ///////////////////////////////// MIT LICENSE ////////////////////////////////
  9882. // Copyright (C) 2014 TroggleMonkey*
  9883. //
  9884. // Permission is hereby granted, free of charge, to any person obtaining a copy
  9885. // of this software and associated documentation files (the "Software"), to
  9886. // deal in the Software without restriction, including without limitation the
  9887. // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  9888. // sell copies of the Software, and to permit persons to whom the Software is
  9889. // furnished to do so, subject to the following conditions:
  9890. //
  9891. // The above copyright notice and this permission notice shall be included in
  9892. // all copies or substantial portions of the Software.
  9893. //
  9894. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  9895. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  9896. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  9897. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  9898. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  9899. // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  9900. // IN THE SOFTWARE.
  9901. ///////////////////////////////// DISCLAIMER /////////////////////////////////
  9902. // *This code was inspired by "Shader Amortization using Pixel Quad Message
  9903. // Passing" by Eric Penner, published in GPU Pro 2, Chapter VI.2. My intent
  9904. // is not to plagiarize his fundamentally similar code and assert my own
  9905. // copyright, but the algorithmic helper functions require so little code that
  9906. // implementations can't vary by much except bugfixes and conventions. I just
  9907. // wanted to license my own particular code here to avoid ambiguity and make it
  9908. // clear that as far as I'm concerned, people can do as they please with it.
  9909. ///////////////////////////////// DESCRIPTION ////////////////////////////////
  9910. // Given screen pixel numbers, derive a "quad vector" describing a fragment's
  9911. // position in its 2x2 pixel quad. Given that vector, obtain the values of any
  9912. // variable at neighboring fragments.
  9913. // Requires: Using this file in general requires:
  9914. // 1.) ddx() and ddy() are present in the current Cg profile.
  9915. // 2.) The GPU driver is using fine/high-quality derivatives.
  9916. // Functions will give incorrect results if this is not true,
  9917. // so a test function is included.
  9918. ///////////////////// QUAD-PIXEL COMMUNICATION PRIMITIVES ////////////////////
  float4 get_quad_vector_naive(float4 output_pixel_num_wrt_uvxy)
  {
      // Requires: output_pixel_num_wrt_uvxy carries two measures of this
      //           fragment's output pixel number, each lying in
      //           ([0, output_size.x), [0, output_size.y)):
      //           .xy grow together with uv coords;
      //           .zw grow together with screen xy.
      // Returns:  The fragment's placement inside its 2x2 pixel quad:
      //           .xy is the placement relative to uv direction, with the
      //               (0, 0) origin at the top-left:
      //               top-left    = (-1.0, -1.0)  top-right    = ( 1.0, -1.0)
      //               bottom-left = (-1.0,  1.0)  bottom-right = ( 1.0,  1.0)
      //               Callers use it to arrange/weight shared texture samples.
      //           .zw is the placement relative to screen xy direction
      //               (position); its origin varies. quad_gather() depends
      //               on this measure.
      // Note:     quad_vector.zw = quad_vector.xy * float2(
      //               ddx(output_pixel_num_wrt_uvxy.x),
      //               ddy(output_pixel_num_wrt_uvxy.y));
      // Caveats:  The result assumes the GPU driver always begins 2x2 pixel
      //           quads on even pixel numbers. For odd output resolutions
      //           that assumption can fail (nondeterministically).
      // Even/odd indicator of each pixel number (0 or 1 for whole numbers):
      float4 parity = frac(output_pixel_num_wrt_uvxy * 0.5) * 2.0;
      // Remap the indicator from {0, 1} onto {-1, 1}:
      return parity * 2.0 - float4(1.0);
  }
  float4 get_quad_vector(float4 output_pixel_num_wrt_uvxy)
  {
      // Requires: The same input as get_quad_vector_naive() (read that first).
      // Returns:  The same quad vector as get_quad_vector_naive(), except
      //           that it stays correct when the 2x2 pixel quad begins on an
      //           odd pixel (possible at odd resolutions).
      float4 naive_vector = get_quad_vector_naive(output_pixel_num_wrt_uvxy);
      // A naive .zw that does not increase with screen xy reveals a quad
      // starting on an odd pixel; half of its screen-space derivative is the
      // factor used to mirror the guess:
      float mirror_x = 0.5 * ddx(naive_vector.z);
      float mirror_y = 0.5 * ddy(naive_vector.w);
      float2 mirror = float2(mirror_x, mirror_y);
      return naive_vector * mirror.xyxy;
  }
  float4 get_quad_vector(float2 output_pixel_num_wrt_uv)
  {
      // Requires: 1.) ddx() and ddy() exist in the current profile.
      //           2.) output_pixel_num_wrt_uv increases with uv coords and
      //               measures this fragment's output pixel number within
      //               ([0, output_size.x), [0, output_size.y)).
      // Returns:  The same quad vector as get_quad_vector_naive() (read that
      //           first), but still correct when the 2x2 pixel quad begins
      //           on an odd pixel, which can occur at odd resolutions.
      // Caveats:  Needs less information than the float4 overload, but it is
      //           potentially slower.
      // Direction of screen xy relative to uv (the original notes the value
      // is in {-1, 1} per axis):
      float uv_dir_x = ddx(output_pixel_num_wrt_uv.x);
      float uv_dir_y = ddy(output_pixel_num_wrt_uv.y);
      float2 uv_dir = float2(uv_dir_x, uv_dir_y);
      // Guess the quad placement with respect to uv, then screen xy:
      float2 parity = frac(output_pixel_num_wrt_uv * 0.5) * 2.0;
      float2 uv_guess = (parity - float2(0.5)) * 2.0;
      float2 screen_guess = uv_guess * uv_dir;
      // A screen guess that does not increase with screen xy means the quad
      // starts on an odd pixel; this factor mirrors the guess in that case:
      float2 flip = 0.5 * float2(ddx(screen_guess.x), ddy(screen_guess.y));
      return float4(uv_guess, screen_guess) * flip.xyxy;
  }
  void quad_gather(float4 quad_vector, float4 curr,
      out float4 adjx, out float4 adjy, out float4 diag)
  {
      // Requires: 1.) ddx() and ddy() exist in the current profile.
      //           2.) The GPU driver uses fine/high-quality derivatives.
      //           3.) quad_vector locates this fragment in its 2x2 pixel
      //               quad, following get_quad_vector()'s conventions.
      //           4.) curr is whichever vector the caller wants neighboring
      //               values of.
      // Returns:  Through the out parameters, the value of curr at the
      //           x-adjacent (adjx), y-adjacent (adjy) and diagonal (diag)
      //           fragments of the quad.
      float4 step_x = ddx(curr);
      float4 step_y = ddy(curr);
      adjx = curr - step_x * quad_vector.z;
      adjy = curr - step_y * quad_vector.w;
      // The diagonal neighbor is the y-neighbor of the x-neighbor:
      float4 step_y_of_adjx = ddy(adjx);
      diag = adjx - step_y_of_adjx * quad_vector.w;
  }
  void quad_gather(float4 quad_vector, float3 curr,
      out float3 adjx, out float3 adjy, out float3 diag)
  {
      // Float3 overload: writes the value of curr at the x-adjacent,
      // y-adjacent and diagonal fragments of the 2x2 quad to the out
      // parameters. quad_vector follows get_quad_vector()'s conventions.
      float3 step_x = ddx(curr);
      float3 step_y = ddy(curr);
      adjx = curr - step_x * quad_vector.z;
      adjy = curr - step_y * quad_vector.w;
      // The diagonal neighbor is the y-neighbor of the x-neighbor:
      float3 step_y_of_adjx = ddy(adjx);
      diag = adjx - step_y_of_adjx * quad_vector.w;
  }
  void quad_gather(float4 quad_vector, float2 curr,
      out float2 adjx, out float2 adjy, out float2 diag)
  {
      // Float2 overload: writes the value of curr at the x-adjacent,
      // y-adjacent and diagonal fragments of the 2x2 quad to the out
      // parameters. quad_vector follows get_quad_vector()'s conventions.
      float2 step_x = ddx(curr);
      float2 step_y = ddy(curr);
      adjx = curr - step_x * quad_vector.z;
      adjy = curr - step_y * quad_vector.w;
      // The diagonal neighbor is the y-neighbor of the x-neighbor:
      float2 step_y_of_adjx = ddy(adjx);
      diag = adjx - step_y_of_adjx * quad_vector.w;
  }
  float4 quad_gather(float4 quad_vector, float curr)
  {
      // Scalar overload. quad_vector follows get_quad_vector()'s conventions.
      // Returns:  return.x == current fragment's value
      //           return.y == x-adjacent fragment's value
      //           return.z == y-adjacent fragment's value
      //           return.w == diagonal fragment's value
      float adj_x = curr - ddx(curr) * quad_vector.z;
      // Pair up (current, x-adjacent) and step both across y at once to get
      // (y-adjacent, diagonal):
      float2 this_row = float2(curr, adj_x);
      float2 other_row = this_row - ddy(this_row) * quad_vector.w;
      return float4(this_row, other_row);
  }
  float4 quad_gather_sum(float4 quad_vector, float4 curr)
  {
      // Requires: The same conditions as quad_gather().
      // Returns:  The sum of curr over all four fragments of the quad.
      float4 along_x, along_y, across;
      quad_gather(quad_vector, curr, along_x, along_y, across);
      // Accumulate in the order: current, adjacent x, adjacent y, diagonal.
      float4 total = curr + along_x;
      total += along_y;
      total += across;
      return total;
  }
  float3 quad_gather_sum(float4 quad_vector, float3 curr)
  {
      // Float3 overload: sum of curr over all four fragments of the quad.
      float3 along_x, along_y, across;
      quad_gather(quad_vector, curr, along_x, along_y, across);
      // Accumulate in the order: current, adjacent x, adjacent y, diagonal.
      float3 total = curr + along_x;
      total += along_y;
      total += across;
      return total;
  }
  float2 quad_gather_sum(float4 quad_vector, float2 curr)
  {
      // Float2 overload: sum of curr over all four fragments of the quad.
      float2 along_x, along_y, across;
      quad_gather(quad_vector, curr, along_x, along_y, across);
      // Accumulate in the order: current, adjacent x, adjacent y, diagonal.
      float2 total = curr + along_x;
      total += along_y;
      total += across;
      return total;
  }
  float quad_gather_sum(float4 quad_vector, float curr)
  {
      // Scalar overload: sum of curr over all four fragments of the quad.
      // The scalar quad_gather() packs (current, adjacent x, adjacent y,
      // diagonal) into .xyzw, so add the components in that order.
      float4 gathered = quad_gather(quad_vector, curr);
      float total = gathered.x + gathered.y;
      total += gathered.z;
      total += gathered.w;
      return total;
  }
  bool fine_derivatives_working(float4 quad_vector, float4 curr)
  {
      // Requires: 1.) ddx() and ddy() exist in the current profile.
      //           2.) quad_vector locates this fragment in its 2x2 pixel
      //               quad, following get_quad_vector()'s conventions.
      //           3.) curr is a test vector with non-constant derivatives
      //               (it should vary nonlinearly across fragments).
      // Returns:  true when fine/hybrid/high-quality derivatives are in use;
      //           false when coarse derivatives are used or the test is
      //           inconclusive.
      // Usage:    Check whether quad-pixel communication actually works.
      // Method:   Fine derivatives are confirmed as soon as either of these
      //           holds for any component at any fragment:
      //               (ddy(curr) != ddy(adjx)) or (ddx(curr) != ddx(adjy))
      //           Testing more values makes a positive result easier to get.
      // TODO:     Check for floating point exact comparison issues!
      float4 dx_here = ddx(curr);
      float4 dy_here = ddy(curr);
      // Reconstruct the neighbors' values, then take their derivatives once:
      float4 neighbor_x = curr - dx_here * quad_vector.z;
      float4 neighbor_y = curr - dy_here * quad_vector.w;
      float4 dy_at_neighbor_x = ddy(neighbor_x);
      float4 dx_at_neighbor_y = ddx(neighbor_y);
      bool4 dy_mismatch = bool4(
          dy_here.x != dy_at_neighbor_x.x,
          dy_here.y != dy_at_neighbor_x.y,
          dy_here.z != dy_at_neighbor_x.z,
          dy_here.w != dy_at_neighbor_x.w);
      bool4 dx_mismatch = bool4(
          dx_here.x != dx_at_neighbor_y.x,
          dx_here.y != dx_at_neighbor_y.y,
          dx_here.z != dx_at_neighbor_y.z,
          dx_here.w != dx_at_neighbor_y.w);
      return any(dy_mismatch) || any(dx_mismatch);
  }
  bool fine_derivatives_working_fast(float4 quad_vector, float curr)
  {
      // Requires: The same conditions as fine_derivatives_working().
      // Returns:  The same verdict as fine_derivatives_working().
      // Usage:    Cheaper than fine_derivatives_working(), but false
      //           negatives are more likely, which makes it less useful for
      //           offline testing/debugging. It also cannot drive dynamic
      //           runtime branching as of May 2014, because derivatives (and
      //           quad-pixel communication) are disallowed inside branches.
      //           Future GPUs may permit them in dynamic branches when the
      //           branch condition is promised to be uniform across the quad
      //           (and/or the driver enforces that by letting one fragment
      //           decide). Should that happen, this version may become the
      //           more economical choice.
      float dx_here = ddx(curr);
      float dy_here = ddy(curr);
      // Value at the x-adjacent fragment and its own y derivative:
      float neighbor_x = curr - dx_here * quad_vector.z;
      float dy_at_neighbor_x = ddy(neighbor_x);
      return (dy_here != dy_at_neighbor_x);
  }
  10098. #endif // QUAD_PIXEL_COMMUNICATION_H
  10099. //////////////////////// END QUAD-PIXEL-COMMUNICATION ///////////////////////
  10100. //#include "special-functions.h"
  10101. /////////////////////////// BEGIN SPECIAL-FUNCTIONS //////////////////////////
  10102. #ifndef SPECIAL_FUNCTIONS_H
  10103. #define SPECIAL_FUNCTIONS_H
  10104. ///////////////////////////////// MIT LICENSE ////////////////////////////////
  10105. // Copyright (C) 2014 TroggleMonkey
  10106. //
  10107. // Permission is hereby granted, free of charge, to any person obtaining a copy
  10108. // of this software and associated documentation files (the "Software"), to
  10109. // deal in the Software without restriction, including without limitation the
  10110. // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  10111. // sell copies of the Software, and to permit persons to whom the Software is
  10112. // furnished to do so, subject to the following conditions:
  10113. //
  10114. // The above copyright notice and this permission notice shall be included in
  10115. // all copies or substantial portions of the Software.
  10116. //
  10117. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  10118. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  10119. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  10120. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  10121. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  10122. // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  10123. // IN THE SOFTWARE.
  10124. ///////////////////////////////// DESCRIPTION ////////////////////////////////
  10125. // This file implements the following mathematical special functions:
  10126. // 1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2))
  10127. // 2.) gamma(s), a real-numbered extension of the integer factorial function
  10128. // It also implements normalized_ligamma(s, z), a normalized lower incomplete
  10129. // gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can
  10130. // be called with an _impl suffix to use an implementation version with a few
  10131. // extra precomputed parameters (which may be useful for the caller to reuse).
  10132. // See below for details.
  10133. //
  10134. // Design Rationale:
  10135. // Pretty much every line of code in this file is duplicated four times for
  10136. // different input types (float4/float3/float2/float). This is unfortunate,
  10137. // but Cg doesn't allow function templates. Macros would be far less verbose,
  10138. // but they would make the code harder to document and read. I don't expect
  10139. // these functions will require a whole lot of maintenance changes unless
  10140. // someone ever has need for more robust incomplete gamma functions, so code
  10141. // duplication seems to be the lesser evil in this case.
  10142. /////////////////////////// GAUSSIAN ERROR FUNCTION //////////////////////////
  float4 erf6(float4 x)
  {
      // Requires: x is the standard argument of erf().
      // Returns:  An Abramowitz/Stegun approximation of erf(), where
      //               erf(x) = 2/sqrt(pi) * integral(e**(-x**2))
      //           The original author quotes a max absolute error of
      //           2.5*10**-5 with solid numerical robustness and efficiency:
      //           https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
      static const float4 unit = float4(1.0);
      float4 t = unit/(unit + 0.47047*abs(x));
      // Cubic in t (Horner form) damped by the Gaussian factor:
      float4 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
      float4 magnitude = unit - poly*exp(-(x*x));
      // The fit uses |x|; restore the sign of the input afterwards:
      return magnitude * sign(x);
  }
  float3 erf6(const float3 x)
  {
      // Float3 overload: Abramowitz/Stegun approximation of erf(x).
      static const float3 unit = float3(1.0);
      float3 t = unit/(unit + 0.47047*abs(x));
      // Cubic in t (Horner form) damped by the Gaussian factor:
      float3 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
      float3 magnitude = unit - poly*exp(-(x*x));
      // The fit uses |x|; restore the sign of the input afterwards:
      return magnitude * sign(x);
  }
  float2 erf6(const float2 x)
  {
      // Float2 overload: Abramowitz/Stegun approximation of erf(x).
      static const float2 unit = float2(1.0);
      float2 t = unit/(unit + 0.47047*abs(x));
      // Cubic in t (Horner form) damped by the Gaussian factor:
      float2 poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
      float2 magnitude = unit - poly*exp(-(x*x));
      // The fit uses |x|; restore the sign of the input afterwards:
      return magnitude * sign(x);
  }
  float erf6(const float x)
  {
      // Scalar overload: Abramowitz/Stegun approximation of erf(x).
      float t = 1.0/(1.0 + 0.47047*abs(x));
      // Cubic in t (Horner form) damped by the Gaussian factor:
      float poly = t*(0.3480242 + t*(-0.0958798 + t*0.7478556));
      float magnitude = 1.0 - poly*exp(-(x*x));
      // The fit uses |x|; restore the sign of the input afterwards:
      return magnitude * sign(x);
  }
  float4 erft(const float4 x)
  {
      // Requires: x is the standard argument of erf().
      // Returns:  erf() approximated by a scaled hyperbolic tangent. The
      //           original author notes the error is visually noticeable,
      //           but that it is very fast and perceptually close (at least
      //           on ATI hardware). See:
      //           http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
      // Warning:  Only use this where the hardware drivers implement tanh()
      //           correctly: the author's nVidia 8800GTS returned garbage.
      static const float tanh_scale = 1.202760580;
      float4 scaled_x = tanh_scale * x;
      return tanh(scaled_x);
  }
  float3 erft(const float3 x)
  {
      // Float3 overload: erf(x) approximated by a scaled tanh().
      static const float tanh_scale = 1.202760580;
      float3 scaled_x = tanh_scale * x;
      return tanh(scaled_x);
  }
  float2 erft(const float2 x)
  {
      // Float2 overload: erf(x) approximated by a scaled tanh().
      static const float tanh_scale = 1.202760580;
      float2 scaled_x = tanh_scale * x;
      return tanh(scaled_x);
  }
  float erft(const float x)
  {
      // Scalar overload: erf(x) approximated by a scaled tanh().
      static const float tanh_scale = 1.202760580;
      float scaled_x = tanh_scale * x;
      return tanh(scaled_x);
  }
  inline float4 erf(const float4 x)
  {
      // Requires: x is the standard argument of erf().
      // Returns:  An approximation of erf(x) chosen at compile time:
      //           erft() when ERF_FAST_APPROXIMATION is defined, otherwise
      //           erf6().
      #ifndef ERF_FAST_APPROXIMATION
          return erf6(x);
      #else
          return erft(x);
      #endif
  }
  inline float3 erf(const float3 x)
  {
      // Float3 overload: erft() when ERF_FAST_APPROXIMATION is defined,
      // otherwise erf6().
      #ifndef ERF_FAST_APPROXIMATION
          return erf6(x);
      #else
          return erft(x);
      #endif
  }
  inline float2 erf(const float2 x)
  {
      // Float2 overload: erft() when ERF_FAST_APPROXIMATION is defined,
      // otherwise erf6().
      #ifndef ERF_FAST_APPROXIMATION
          return erf6(x);
      #else
          return erft(x);
      #endif
  }
  inline float erf(const float x)
  {
      // Scalar overload: erft() when ERF_FAST_APPROXIMATION is defined,
      // otherwise erf6().
      #ifndef ERF_FAST_APPROXIMATION
          return erf6(x);
      #else
          return erft(x);
      #endif
  }
  10250. /////////////////////////// COMPLETE GAMMA FUNCTION //////////////////////////
  float4 gamma_impl(const float4 s, const float4 s_inv)
  {
      // Requires: 1.) s is the standard argument of the gamma function and
      //               should lie in the [0, 36] range.
      //           2.) s_inv = 1.0/s, precomputed by the caller so that it can
      //               be reused elsewhere.
      // Returns:  An approximation of the gamma function (real-numbered
      //           factorial) from a two-coefficient Lanczos approximation,
      //           with coefficients derived by Paul Godfrey's method:
      //           http://my.fit.edu/~gabdo/gamma.txt
      //           The author reports ~1.12906830989 as an optimal g for s in
      //           [0, 36], with a maximum relative error of 0.000463 over
      //           2**16 equally spaced evaluations. Three coefficients
      //           (0.0000346 error) would not hurt latency, but two leave
      //           more room for parallelism with outside instructions.
      static const float lanczos_g = 1.12906830989;
      static const float coeff0 = 0.8109119309638332633713423362694399653724431;
      static const float coeff1 = 0.4808354605142681877121661197951496120000040;
      static const float euler_e = 2.71828182845904523536028747135266249775724709;
      float4 s_plus_half = s + float4(0.5);
      float4 series = coeff0 + coeff1/(s + float4(1.0));
      float4 power_base = (s_plus_half + lanczos_g)/euler_e;
      // gamma(s + 1) = power_base**s_plus_half * series, so multiply by 1/s
      // to obtain gamma(s). The author found this less error-prone for small
      // s than subtracting 1.0 from s at the start.
      float4 gamma_s_plus_one = pow(power_base, s_plus_half) * series;
      return gamma_s_plus_one * s_inv;
  }
  float3 gamma_impl(const float3 s, const float3 s_inv)
  {
      // Float3 overload: two-coefficient Lanczos approximation of gamma(s),
      // given the caller-supplied s_inv = 1.0/s.
      static const float lanczos_g = 1.12906830989;
      static const float coeff0 = 0.8109119309638332633713423362694399653724431;
      static const float coeff1 = 0.4808354605142681877121661197951496120000040;
      static const float euler_e = 2.71828182845904523536028747135266249775724709;
      float3 s_plus_half = s + float3(0.5);
      float3 series = coeff0 + coeff1/(s + float3(1.0));
      float3 power_base = (s_plus_half + lanczos_g)/euler_e;
      // This product is gamma(s + 1); multiplying by 1/s yields gamma(s).
      float3 gamma_s_plus_one = pow(power_base, s_plus_half) * series;
      return gamma_s_plus_one * s_inv;
  }
  float2 gamma_impl(const float2 s, const float2 s_inv)
  {
      // Float2 overload: two-coefficient Lanczos approximation of gamma(s),
      // given the caller-supplied s_inv = 1.0/s.
      static const float lanczos_g = 1.12906830989;
      static const float coeff0 = 0.8109119309638332633713423362694399653724431;
      static const float coeff1 = 0.4808354605142681877121661197951496120000040;
      static const float euler_e = 2.71828182845904523536028747135266249775724709;
      float2 s_plus_half = s + float2(0.5);
      float2 series = coeff0 + coeff1/(s + float2(1.0));
      float2 power_base = (s_plus_half + lanczos_g)/euler_e;
      // This product is gamma(s + 1); multiplying by 1/s yields gamma(s).
      float2 gamma_s_plus_one = pow(power_base, s_plus_half) * series;
      return gamma_s_plus_one * s_inv;
  }
  float gamma_impl(const float s, const float s_inv)
  {
      // Scalar overload: two-coefficient Lanczos approximation of gamma(s),
      // given the caller-supplied s_inv = 1.0/s.
      static const float lanczos_g = 1.12906830989;
      static const float coeff0 = 0.8109119309638332633713423362694399653724431;
      static const float coeff1 = 0.4808354605142681877121661197951496120000040;
      static const float euler_e = 2.71828182845904523536028747135266249775724709;
      float s_plus_half = s + 0.5;
      float series = coeff0 + coeff1/(s + 1.0);
      float power_base = (s_plus_half + lanczos_g)/euler_e;
      // This product is gamma(s + 1); multiplying by 1/s yields gamma(s).
      float gamma_s_plus_one = pow(power_base, s_plus_half) * series;
      return gamma_s_plus_one * s_inv;
  }
  float4 gamma(const float4 s)
  {
      // Requires: s is the standard argument of the gamma function and
      //           should lie in the [0, 36] range.
      // Returns:  Approximate gamma function output; the stated maximum
      //           relative error is 0.000463. gamma_impl() has the details.
      float4 reciprocal_s = float4(1.0)/s;
      return gamma_impl(s, reciprocal_s);
  }
  float3 gamma(const float3 s)
  {
      // Float3 overload: approximate gamma(s) via gamma_impl().
      float3 reciprocal_s = float3(1.0)/s;
      return gamma_impl(s, reciprocal_s);
  }
  float2 gamma(const float2 s)
  {
      // Float2 overload: approximate gamma(s) via gamma_impl().
      float2 reciprocal_s = float2(1.0)/s;
      return gamma_impl(s, reciprocal_s);
  }
  float gamma(const float s)
  {
      // Scalar overload: approximate gamma(s) via gamma_impl().
      float reciprocal_s = 1.0/s;
      return gamma_impl(s, reciprocal_s);
  }
  10337. //////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) ///////////////
  10338. // Lower incomplete gamma function for small s and z (implementation):
  float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
  {
      // Requires: 1.) s < ~0.5
      //           2.) z <= ~0.775075
      //           3.) s_inv = 1.0/s (precomputed so callers can reuse it)
      // Returns:  A 4-term series representation of the lower incomplete
      //           gamma function for small s and small z:
      //               z**s * (1/s - z/(s + 1) + z**2/(2*(s + 2))
      //                       - z**3/(6*(s + 3)))
      //           The terms alternate in sign; term k is z**k divided by
      //           (k! * (s + k)). A fifth term, z**4/(24.0*s + 96.0), is
      //           deliberately left out.
      // The denominators are pre-expanded (e.g. 2*(s + 2) = 2.0*s + 4.0) so
      // they map onto multiply-adds:
      float4 z_squared = z*z;
      float4 term1 = z/(s + float4(1.0));
      float4 term2 = z_squared/(2.0*s + float4(4.0));
      float4 term3 = z * z_squared/(6.0*s + float4(18.0));
      // Term 0 is 1/s; accumulate the rest in series order:
      float4 series = ((s_inv - term1) + term2) - term3;
      // Scale by z**s and return:
      return pow(z, s) * series;
  }
  float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
  {
      // Float3 overload: 4-term series for the lower incomplete gamma
      // function, intended for small s and small z; s_inv = 1.0/s.
      float3 z_squared = z*z;
      float3 term1 = z/(s + float3(1.0));
      float3 term2 = z_squared/(2.0*s + float3(4.0));
      float3 term3 = z * z_squared/(6.0*s + float3(18.0));
      // Term 0 is 1/s; accumulate the rest in series order:
      float3 series = ((s_inv - term1) + term2) - term3;
      return pow(z, s) * series;
  }
  float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
  {
      // Float2 overload: 4-term series for the lower incomplete gamma
      // function, intended for small s and small z; s_inv = 1.0/s.
      float2 z_squared = z*z;
      float2 term1 = z/(s + float2(1.0));
      float2 term2 = z_squared/(2.0*s + float2(4.0));
      float2 term3 = z * z_squared/(6.0*s + float2(18.0));
      // Term 0 is 1/s; accumulate the rest in series order:
      float2 series = ((s_inv - term1) + term2) - term3;
      return pow(z, s) * series;
  }
  float ligamma_small_z_impl(const float s, const float z, const float s_inv)
  {
      // Scalar overload: 4-term series for the lower incomplete gamma
      // function, intended for small s and small z; s_inv = 1.0/s.
      float z_squared = z*z;
      float term1 = z/(s + 1.0);
      float term2 = z_squared/(2.0*s + 4.0);
      float term3 = z * z_squared/(6.0*s + 18.0);
      // Term 0 is 1/s; accumulate the rest in series order:
      float series = ((s_inv - term1) + term2) - term3;
      return pow(z, s) * series;
  }
  10412. // Upper incomplete gamma function for small s and large z (implementation):
  float4 uigamma_large_z_impl(const float4 s, const float4 z)
  {
      // Requires: 1.) s < ~0.5
      //           2.) z > ~0.775075
      // Returns:  Gauss's continued fraction representation of the upper
      //           incomplete gamma function, truncated to 4 terms.
      // The truncated denominator is evaluated from the innermost level
      // outward. In rolled-up form (starting from denom = infinity):
      //     for(int i = 4; i > 0; --i)
      //         denom = ((i * 2.0) - 1.0) + z - s + (i * (s - i))/denom;
      // Below, each level is unrolled with its constants folded
      // (e.g. 3*(s - 3) = 3.0*s - 9.0).
      float4 level4 = float4(7.0) + z - s;
      float4 level3 = float4(5.0) + z - s + (3.0*s - float4(9.0))/level4;
      float4 level2 = float4(3.0) + z - s + (2.0*s - float4(4.0))/level3;
      float4 level1 = float4(1.0) + z - s + (s - float4(1.0))/level2;
      // Leading factor z**s * e**(-z) over the continued fraction:
      float4 prefactor = pow(z, s) * exp(-z);
      return prefactor / level1;
  }
  float3 uigamma_large_z_impl(const float3 s, const float3 z)
  {
      // Float3 overload: 4-term continued fraction for the upper incomplete
      // gamma function, evaluated from the innermost level outward.
      float3 level4 = float3(7.0) + z - s;
      float3 level3 = float3(5.0) + z - s + (3.0*s - float3(9.0))/level4;
      float3 level2 = float3(3.0) + z - s + (2.0*s - float3(4.0))/level3;
      float3 level1 = float3(1.0) + z - s + (s - float3(1.0))/level2;
      // Leading factor z**s * e**(-z) over the continued fraction:
      float3 prefactor = pow(z, s) * exp(-z);
      return prefactor / level1;
  }
  float2 uigamma_large_z_impl(const float2 s, const float2 z)
  {
      // Float2 overload: 4-term continued fraction for the upper incomplete
      // gamma function, evaluated from the innermost level outward.
      float2 level4 = float2(7.0) + z - s;
      float2 level3 = float2(5.0) + z - s + (3.0*s - float2(9.0))/level4;
      float2 level2 = float2(3.0) + z - s + (2.0*s - float2(4.0))/level3;
      float2 level1 = float2(1.0) + z - s + (s - float2(1.0))/level2;
      // Leading factor z**s * e**(-z) over the continued fraction:
      float2 prefactor = pow(z, s) * exp(-z);
      return prefactor / level1;
  }
  float uigamma_large_z_impl(const float s, const float z)
  {
      // Scalar overload: 4-term continued fraction for the upper incomplete
      // gamma function, evaluated from the innermost level outward.
      float level4 = 7.0 + z - s;
      float level3 = 5.0 + z - s + (3.0*s - 9.0)/level4;
      float level2 = 3.0 + z - s + (2.0*s - 4.0)/level3;
      float level1 = 1.0 + z - s + (s - 1.0)/level2;
      // Leading factor z**s * e**(-z) over the continued fraction:
      float prefactor = pow(z, s) * exp(-z);
      return prefactor / level1;
  }
  10465. // Normalized lower incomplete gamma function for small s (implementation):
  float4 normalized_ligamma_impl(const float4 s, const float4 z,
      const float4 s_inv, const float4 gamma_s_inv)
  {
      // Requires: 1.) s < ~0.5
      //           2.) s_inv = 1/s (precomputed so callers can reuse it)
      //           3.) gamma_s_inv = 1/gamma(s) (precomputed likewise)
      // Returns:  An approximation of the normalized lower incomplete gamma
      //           function for s < 0.5. With s restricted like this, only
      //           two branches (not four) are needed, chosen by z. Each uses
      //           four terms; the author reports a max relative error of
      //           ~0.00182. The branch threshold and specifics were adapted
      //           for fewer terms from Gil/Segura/Temme's paper:
      //           http://oai.cwi.nl/oai/asset/20433/20433B.pdf
      // Both branches are always evaluated (the author measured real
      // branches as slower even when available).
      static const float thresh = 0.775075;
      float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
      float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
      // Pick per component with a select rather than blending with 0/1
      // weights: a weight of 0.0 does not cancel an Inf/NaN produced by the
      // unused branch (e.g. the small-z series overflowing for huge z), and
      // the blend would then return NaN. With both branches finite, the
      // select returns the same value the blend did.
      return float4(
          (z.x > thresh) ? large_z.x : small_z.x,
          (z.y > thresh) ? large_z.y : small_z.y,
          (z.z > thresh) ? large_z.z : small_z.z,
          (z.w > thresh) ? large_z.w : small_z.w);
  }
  float3 normalized_ligamma_impl(const float3 s, const float3 z,
      const float3 s_inv, const float3 gamma_s_inv)
  {
      // Float3 overload: approximate normalized lower incomplete gamma
      // function for s < ~0.5, given s_inv = 1/s and gamma_s_inv = 1/gamma(s).
      // Both branches are always evaluated; z picks one per component.
      static const float thresh = 0.775075;
      float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
      float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
      // Pick per component with a select rather than blending with 0/1
      // weights: a weight of 0.0 does not cancel an Inf/NaN produced by the
      // unused branch, and the blend would then return NaN. With both
      // branches finite, the select returns the same value the blend did.
      return float3(
          (z.x > thresh) ? large_z.x : small_z.x,
          (z.y > thresh) ? large_z.y : small_z.y,
          (z.z > thresh) ? large_z.z : small_z.z);
  }
  float2 normalized_ligamma_impl(const float2 s, const float2 z,
      const float2 s_inv, const float2 gamma_s_inv)
  {
      // Float2 overload: approximate normalized lower incomplete gamma
      // function for s < ~0.5, given s_inv = 1/s and gamma_s_inv = 1/gamma(s).
      // Both branches are always evaluated; z picks one per component.
      static const float thresh = 0.775075;
      float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
      float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
      // Pick per component with a select rather than blending with 0/1
      // weights: a weight of 0.0 does not cancel an Inf/NaN produced by the
      // unused branch, and the blend would then return NaN. With both
      // branches finite, the select returns the same value the blend did.
      return float2(
          (z.x > thresh) ? large_z.x : small_z.x,
          (z.y > thresh) ? large_z.y : small_z.y);
  }
  float normalized_ligamma_impl(const float s, const float z,
      const float s_inv, const float gamma_s_inv)
  {
      // Scalar overload: approximate normalized lower incomplete gamma
      // function for s < ~0.5, given s_inv = 1/s and gamma_s_inv = 1/gamma(s).
      // Both branches are always evaluated; z picks which one is returned.
      static const float thresh = 0.775075;
      float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
      float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
      // Select rather than blend with 0/1 weights: a weight of 0.0 does not
      // cancel an Inf/NaN produced by the unused branch, and the blend would
      // then return NaN. With both branches finite, the select returns the
      // same value the blend did.
      return (z > thresh) ? large_z : small_z;
  }
  10529. // Normalized lower incomplete gamma function for small s:
  float4 normalized_ligamma(const float4 s, const float4 z)
  {
      // Requires: s < ~0.5
      // Returns:  An approximation of the normalized lower incomplete gamma
      //           function for s < 0.5; normalized_ligamma_impl() has the
      //           details.
      // Precompute the two reciprocals the implementation expects:
      float4 reciprocal_s = float4(1.0)/s;
      float4 reciprocal_gamma_s = float4(1.0)/gamma_impl(s, reciprocal_s);
      return normalized_ligamma_impl(s, z, reciprocal_s, reciprocal_gamma_s);
  }
  float3 normalized_ligamma(const float3 s, const float3 z)
  {
      // Float3 overload: precompute 1/s and 1/gamma(s), then defer to
      // normalized_ligamma_impl().
      float3 reciprocal_s = float3(1.0)/s;
      float3 reciprocal_gamma_s = float3(1.0)/gamma_impl(s, reciprocal_s);
      return normalized_ligamma_impl(s, z, reciprocal_s, reciprocal_gamma_s);
  }
  float2 normalized_ligamma(const float2 s, const float2 z)
  {
      // Float2 overload: precompute 1/s and 1/gamma(s), then defer to
      // normalized_ligamma_impl().
      float2 reciprocal_s = float2(1.0)/s;
      float2 reciprocal_gamma_s = float2(1.0)/gamma_impl(s, reciprocal_s);
      return normalized_ligamma_impl(s, z, reciprocal_s, reciprocal_gamma_s);
  }
  float normalized_ligamma(const float s, const float z)
  {
      // Scalar overload: precompute 1/s and 1/gamma(s), then defer to
      // normalized_ligamma_impl().
      float reciprocal_s = 1.0/s;
      float reciprocal_gamma_s = 1.0/gamma_impl(s, reciprocal_s);
      return normalized_ligamma_impl(s, z, reciprocal_s, reciprocal_gamma_s);
  }
  10560. #endif // SPECIAL_FUNCTIONS_H
  10561. //////////////////////////// END SPECIAL-FUNCTIONS ///////////////////////////
  10562. //////////////////////////////// END INCLUDES ////////////////////////////////
  10563. /////////////////////////////////// HELPERS //////////////////////////////////
  inline float4 uv2_to_uv4(float2 tex_uv)
  {
      // Widen a float2 uv offset to a float4 with zeroed z/w components, so
      // it can be added safely to float4 tex2Dlod coordinates.
      const float4 padded_uv = float4(tex_uv, 0.0, 0.0);
      return padded_uv;
  }
  // Squared-length helper macro (a macro so it can be used with static
  // constants).  Note that the argument expression appears twice in the
  // expansion.
  #define LENGTH_SQ(v) (dot(v, v))
  inline float get_fast_gaussian_weight_sum_inv(const float sigma)
  {
      // Returns: A curve-fitted approximation of the inverse of the summed
      //          (unnormalized) Gaussian tap weights for the given sigma.
      // Background:
      // The Gaussian integral gives the asymptotic weight of the center
      // pixel.  Because the unnormalized center weight is 1.0, its normalized
      // weight equals the inverse of the weight sum.  For a large enough blur
      // (9+), the asymptotic weight sum is close and faster to evaluate:
      //     center_weight = 0.5 *
      //         (erf(0.5/(sigma*sqrt(2.0))) - erf(-0.5/(sigma*sqrt(2.0))))
      // and since erf(-x) == -erf(x), that collapses to a single erf() call:
      //     static const float temp = 0.5/sqrt(2.0);
      //     return erf(temp/sigma);
      // Curve-fitting is faster still, and also closer than the asymptotic
      // result: the fit was constructed from 64 blur sizes from [3, 131) and
      // 255 equally-spaced sigmas from (0, blurN_std_dev), so the results for
      // smaller sigmas are biased toward smaller blurs.  The max error is
      // 0.0031793913.
      // Relative FPS: 134.3 with erf, 135.8 with curve-fitting.
      const float double_exp_fit =
          exp(exp(0.348348412457428/(sigma - 0.0860587260734721)));
      const float reciprocal_fit = 0.399334576340352/sigma;
      // The final estimate is whichever fitted curve is lower:
      return min(double_exp_fit, reciprocal_fit);
  }
  10591. //////////////////// ARBITRARILY RESIZABLE SEPARABLE BLURS ///////////////////
  float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      // Requires: Global requirements must be met (see file description).
      // Returns:  A 1D 11x Gaussian blurred texture lookup using an 11-tap
      //           blur, with taps spaced dxdy apart around tex_uv.  It may be
      //           mipmapped depending on settings and dxdy.
      // Unnormalized Gaussian weights for tap distances 0-5.  Constant
      // factors are ignored because the result is normalized afterwards.
      const float inv_two_var = 0.5/(sigma*sigma);
      const float k0 = 1.0;
      const float k1 = exp(-1.0 * inv_two_var);
      const float k2 = exp(-4.0 * inv_two_var);
      const float k3 = exp(-9.0 * inv_two_var);
      const float k4 = exp(-16.0 * inv_two_var);
      const float k5 = exp(-25.0 * inv_two_var);
      // Every off-center weight is used on both sides of the center tap:
      const float norm = 1.0 /
          (k0 + 2.0 * (k1 + k2 + k3 + k4 + k5));
      // Accumulate the weighted taps from one end of the kernel to the other,
      // then normalize once at the end:
      float3 acc = float3(0.0,0.0,0.0);
      acc += k5 * tex2D_linearize(tex, tex_uv - 5.0 * dxdy).rgb;
      acc += k4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb;
      acc += k3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
      acc += k2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
      acc += k1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
      acc += k0 * tex2D_linearize(tex, tex_uv).rgb;
      acc += k1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
      acc += k2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
      acc += k3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
      acc += k4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb;
      acc += k5 * tex2D_linearize(tex, tex_uv + 5.0 * dxdy).rgb;
      return acc * norm;
  }
  float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      // Requires: Global requirements must be met (see file description).
      // Returns:  A 1D 9x Gaussian blurred texture lookup using a 9-tap
      //           blur, with taps spaced dxdy apart around tex_uv.  It may be
      //           mipmapped depending on settings and dxdy.
      // Unnormalized Gaussian weights for tap distances 0-4:
      const float inv_two_var = 0.5/(sigma*sigma);
      const float k0 = 1.0;
      const float k1 = exp(-1.0 * inv_two_var);
      const float k2 = exp(-4.0 * inv_two_var);
      const float k3 = exp(-9.0 * inv_two_var);
      const float k4 = exp(-16.0 * inv_two_var);
      // Every off-center weight is used on both sides of the center tap:
      const float norm = 1.0 / (k0 + 2.0 * (k1 + k2 + k3 + k4));
      // Accumulate the weighted taps, then normalize once at the end:
      float3 acc = float3(0.0,0.0,0.0);
      acc += k4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb;
      acc += k3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
      acc += k2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
      acc += k1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
      acc += k0 * tex2D_linearize(tex, tex_uv).rgb;
      acc += k1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
      acc += k2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
      acc += k3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
      acc += k4 * tex2D_linearize(tex, tex_uv + 4.0 * dxdy).rgb;
      return acc * norm;
  }
  float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      // Requires: Global requirements must be met (see file description).
      // Returns:  A 1D 7x Gaussian blurred texture lookup using a 7-tap
      //           blur, with taps spaced dxdy apart around tex_uv.  It may be
      //           mipmapped depending on settings and dxdy.
      // Unnormalized Gaussian weights for tap distances 0-3:
      const float inv_two_var = 0.5/(sigma*sigma);
      const float k0 = 1.0;
      const float k1 = exp(-1.0 * inv_two_var);
      const float k2 = exp(-4.0 * inv_two_var);
      const float k3 = exp(-9.0 * inv_two_var);
      // Every off-center weight is used on both sides of the center tap:
      const float norm = 1.0 / (k0 + 2.0 * (k1 + k2 + k3));
      // Accumulate the weighted taps, then normalize once at the end:
      float3 acc = float3(0.0,0.0,0.0);
      acc += k3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
      acc += k2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
      acc += k1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
      acc += k0 * tex2D_linearize(tex, tex_uv).rgb;
      acc += k1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
      acc += k2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
      acc += k3 * tex2D_linearize(tex, tex_uv + 3.0 * dxdy).rgb;
      return acc * norm;
  }
  float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      // Requires: Global requirements must be met (see file description).
      // Returns:  A 1D 5x Gaussian blurred texture lookup using a 5-tap
      //           blur, with taps spaced dxdy apart around tex_uv.  It may be
      //           mipmapped depending on settings and dxdy.
      // Unnormalized Gaussian weights for tap distances 0-2:
      const float inv_two_var = 0.5/(sigma*sigma);
      const float k0 = 1.0;
      const float k1 = exp(-1.0 * inv_two_var);
      const float k2 = exp(-4.0 * inv_two_var);
      // Every off-center weight is used on both sides of the center tap:
      const float norm = 1.0 / (k0 + 2.0 * (k1 + k2));
      // Accumulate the weighted taps, then normalize once at the end:
      float3 acc = float3(0.0,0.0,0.0);
      acc += k2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
      acc += k1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
      acc += k0 * tex2D_linearize(tex, tex_uv).rgb;
      acc += k1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
      acc += k2 * tex2D_linearize(tex, tex_uv + 2.0 * dxdy).rgb;
      return acc * norm;
  }
  float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      // Requires: Global requirements must be met (see file description).
      // Returns:  A 1D 3x Gaussian blurred texture lookup using a 3-tap
      //           blur, with taps spaced dxdy apart around tex_uv.  It may be
      //           mipmapped depending on settings and dxdy.
      // Unnormalized Gaussian weights for tap distances 0-1:
      const float inv_two_var = 0.5/(sigma*sigma);
      const float k0 = 1.0;
      const float k1 = exp(-1.0 * inv_two_var);
      // The off-center weight is used on both sides of the center tap:
      const float norm = 1.0 / (k0 + 2.0 * k1);
      // Accumulate the weighted taps, then normalize once at the end:
      float3 acc = float3(0.0,0.0,0.0);
      acc += k1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
      acc += k0 * tex2D_linearize(tex, tex_uv).rgb;
      acc += k1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
      return acc * norm;
  }
  10715. /////////////////////////// FAST SEPARABLE BLURS ///////////////////////////
  float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      // Requires: 1.) Global requirements must be met (see file description).
      //           2.) filter_linearN must = "true" in your .cgp file.
      //           3.) For gamma-correct bilinear filtering, global
      //               gamma_aware_bilinear == true (from gamma-management.h)
      // Returns:  A 1D 11x Gaussian blurred texture lookup using 6 linear
      //           taps.  It may be mipmapped depending on settings and dxdy.
      // Unnormalized Gaussian weights for texel distances 0-5, and the
      // normalization factor for the full 11-texel kernel:
      const float denom_inv = 0.5/(sigma*sigma);
      const float w0 = 1.0;
      const float w1 = exp(-1.0 * denom_inv);
      const float w2 = exp(-4.0 * denom_inv);
      const float w3 = exp(-9.0 * denom_inv);
      const float w4 = exp(-16.0 * denom_inv);
      const float w5 = exp(-25.0 * denom_inv);
      const float weight_sum_inv = 1.0 /
          (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5));
      // Calculate combined weights and linear sample ratios between texel
      // pairs.  The center texel (with weight w0) is used twice, so halve its
      // weight.
      const float w01 = w0 * 0.5 + w1;
      const float w23 = w2 + w3;
      const float w45 = w4 + w5;
      // For small sigma, both weights of an outer pair can underflow to 0,
      // which would make the ratio 0/0 (NaN) and poison the texture
      // coordinate.  Flooring the divisor yields a ratio of 0 in that case;
      // such a pair contributes nothing measurable to the sum anyway.
      // (w01 >= 0.5, so it needs no guard.)
      const float pair_floor = 1.0e-30;
      const float w01_ratio = w1/w01;
      const float w23_ratio = w3/max(w23, pair_floor);
      const float w45_ratio = w5/max(w45, pair_floor);
      // Sum the weighted bilinear samples, then normalize:
      float3 sum = float3(0.0,0.0,0.0);
      sum += w45 * tex2D_linearize(tex, tex_uv - (4.0 + w45_ratio) * dxdy).rgb;
      sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb;
      sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb;
      sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb;
      sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb;
      sum += w45 * tex2D_linearize(tex, tex_uv + (4.0 + w45_ratio) * dxdy).rgb;
      return sum * weight_sum_inv;
  }
  float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      // Requires: Same as tex2Dblur11fast()
      // Returns:  A 1D 9x Gaussian blurred texture lookup using 1 nearest
      //           neighbor and 4 linear taps.  It may be mipmapped depending
      //           on settings and dxdy.
      // Unnormalized Gaussian weights for texel distances 0-4, and the
      // normalization factor for the full 9-texel kernel:
      const float denom_inv = 0.5/(sigma*sigma);
      const float w0 = 1.0;
      const float w1 = exp(-1.0 * denom_inv);
      const float w2 = exp(-4.0 * denom_inv);
      const float w3 = exp(-9.0 * denom_inv);
      const float w4 = exp(-16.0 * denom_inv);
      const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4));
      // Calculate combined weights and linear sample ratios between texel
      // pairs.
      const float w12 = w1 + w2;
      const float w34 = w3 + w4;
      // For small sigma, both weights of a pair can underflow to 0, which
      // would make the ratio 0/0 (NaN) and poison the texture coordinate.
      // Flooring the divisor yields a ratio of 0 in that case; such a pair
      // contributes nothing measurable to the sum anyway.
      const float pair_floor = 1.0e-30;
      const float w12_ratio = w2/max(w12, pair_floor);
      const float w34_ratio = w4/max(w34, pair_floor);
      // Sum the weighted samples, then normalize:
      float3 sum = float3(0.0,0.0,0.0);
      sum += w34 * tex2D_linearize(tex, tex_uv - (3.0 + w34_ratio) * dxdy).rgb;
      sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb;
      sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
      sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb;
      sum += w34 * tex2D_linearize(tex, tex_uv + (3.0 + w34_ratio) * dxdy).rgb;
      return sum * weight_sum_inv;
  }
  float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      // Requires: Same as tex2Dblur11fast()
      // Returns:  A 1D 7x Gaussian blurred texture lookup using 4 linear
      //           taps.  It may be mipmapped depending on settings and dxdy.
      // Unnormalized Gaussian weights for texel distances 0-3, and the
      // normalization factor for the full 7-texel kernel:
      const float denom_inv = 0.5/(sigma*sigma);
      const float w0 = 1.0;
      const float w1 = exp(-1.0 * denom_inv);
      const float w2 = exp(-4.0 * denom_inv);
      const float w3 = exp(-9.0 * denom_inv);
      const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3));
      // Calculate combined weights and linear sample ratios between texel
      // pairs.  The center texel (with weight w0) is used twice, so halve its
      // weight.
      const float w01 = w0 * 0.5 + w1;
      const float w23 = w2 + w3;
      // For small sigma, w2 and w3 can both underflow to 0, which would make
      // the ratio 0/0 (NaN) and poison the texture coordinate.  Flooring the
      // divisor yields a ratio of 0 in that case; such a pair contributes
      // nothing measurable to the sum anyway.  (w01 >= 0.5 needs no guard.)
      const float pair_floor = 1.0e-30;
      const float w01_ratio = w1/w01;
      const float w23_ratio = w3/max(w23, pair_floor);
      // Sum the weighted bilinear samples, then normalize:
      float3 sum = float3(0.0,0.0,0.0);
      sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb;
      sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb;
      sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb;
      sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb;
      return sum * weight_sum_inv;
  }
  float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      // Requires: Same as tex2Dblur11fast()
      // Returns:  A 1D 5x Gaussian blurred texture lookup using 1 nearest
      //           neighbor and 2 linear taps.  It may be mipmapped depending
      //           on settings and dxdy.
      // Unnormalized Gaussian weights for texel distances 0-2, and the
      // normalization factor for the full 5-texel kernel:
      const float denom_inv = 0.5/(sigma*sigma);
      const float w0 = 1.0;
      const float w1 = exp(-1.0 * denom_inv);
      const float w2 = exp(-4.0 * denom_inv);
      const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2));
      // Calculate the combined weight and linear sample ratio for the texel
      // pair on each side of the center.
      const float w12 = w1 + w2;
      // For very small sigma, w1 and w2 can both underflow to 0, which would
      // make the ratio 0/0 (NaN) and poison the texture coordinate.  Flooring
      // the divisor yields a ratio of 0 in that case; such a pair contributes
      // nothing measurable to the sum anyway.
      const float pair_floor = 1.0e-30;
      const float w12_ratio = w2/max(w12, pair_floor);
      // Sum the weighted samples, then normalize:
      float3 sum = float3(0.0,0.0,0.0);
      sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb;
      sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
      sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb;
      return sum * weight_sum_inv;
  }
  float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      // Requires: Same as tex2Dblur11fast()
      // Returns:  A 1D 3x Gaussian blurred texture lookup using 2 linear
      //           taps.  It may be mipmapped depending on settings and dxdy.
      // Unnormalized Gaussian weights for texel distances 0-1:
      const float denom_inv = 0.5/(sigma*sigma);
      const float w0 = 1.0;
      const float w1 = exp(-1.0 * denom_inv);
      // Calculate the combined weight and linear sample ratio for each
      // (center, neighbor) texel pair.  The center texel (with weight w0) is
      // used twice, so halve its weight.
      const float w01 = w0 * 0.5 + w1;
      const float w01_ratio = w1/w01;
      // Both samples carry the same weight w01, and 2.0 * w01 equals the full
      // weight sum (w0 + 2.0 * w1), so each normalized weight is exactly 0.5.
      // No separate normalization factor is needed: just average the samples.
      return 0.5 * (
          tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb +
          tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb);
  }
  10852. //////////////////////////// HUGE SEPARABLE BLURS ////////////////////////////
  10853. // Huge separable blurs come only in "fast" versions.
  float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      // Requires: Same as tex2Dblur11fast()
      // Returns:  A 1D 43x Gaussian blurred texture lookup using 22 linear
      //           taps.  It may be mipmapped depending on settings and dxdy.
      // Unnormalized Gaussian weights for texel distances 0-21:
      const float denom_inv = 0.5/(sigma*sigma);
      const float w0 = 1.0;
      const float w1 = exp(-1.0 * denom_inv);
      const float w2 = exp(-4.0 * denom_inv);
      const float w3 = exp(-9.0 * denom_inv);
      const float w4 = exp(-16.0 * denom_inv);
      const float w5 = exp(-25.0 * denom_inv);
      const float w6 = exp(-36.0 * denom_inv);
      const float w7 = exp(-49.0 * denom_inv);
      const float w8 = exp(-64.0 * denom_inv);
      const float w9 = exp(-81.0 * denom_inv);
      const float w10 = exp(-100.0 * denom_inv);
      const float w11 = exp(-121.0 * denom_inv);
      const float w12 = exp(-144.0 * denom_inv);
      const float w13 = exp(-169.0 * denom_inv);
      const float w14 = exp(-196.0 * denom_inv);
      const float w15 = exp(-225.0 * denom_inv);
      const float w16 = exp(-256.0 * denom_inv);
      const float w17 = exp(-289.0 * denom_inv);
      const float w18 = exp(-324.0 * denom_inv);
      const float w19 = exp(-361.0 * denom_inv);
      const float w20 = exp(-400.0 * denom_inv);
      const float w21 = exp(-441.0 * denom_inv);
      // The exact normalization factor would be:
      //     1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 +
      //         w10 + w11 + w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 +
      //         w20 + w21))
      // but the curve-fitted approximation is used instead:
      const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);
      // Calculate combined weights and linear sample ratios between texel
      // pairs.  The center texel (with weight w0) is used twice, so halve its
      // weight.
      const float w0_1 = w0 * 0.5 + w1;
      const float w2_3 = w2 + w3;
      const float w4_5 = w4 + w5;
      const float w6_7 = w6 + w7;
      const float w8_9 = w8 + w9;
      const float w10_11 = w10 + w11;
      const float w12_13 = w12 + w13;
      const float w14_15 = w14 + w15;
      const float w16_17 = w16 + w17;
      const float w18_19 = w18 + w19;
      const float w20_21 = w20 + w21;
      // For small sigma, both weights of an outer pair can underflow to 0,
      // which would make the ratio 0/0 (NaN) and poison the texture
      // coordinate.  Flooring the divisor yields a ratio of 0 in that case;
      // such a pair contributes nothing measurable to the sum anyway.
      // (w0_1 >= 0.5, so it needs no guard.)
      const float pair_floor = 1.0e-30;
      const float w0_1_ratio = w1/w0_1;
      const float w2_3_ratio = w3/max(w2_3, pair_floor);
      const float w4_5_ratio = w5/max(w4_5, pair_floor);
      const float w6_7_ratio = w7/max(w6_7, pair_floor);
      const float w8_9_ratio = w9/max(w8_9, pair_floor);
      const float w10_11_ratio = w11/max(w10_11, pair_floor);
      const float w12_13_ratio = w13/max(w12_13, pair_floor);
      const float w14_15_ratio = w15/max(w14_15, pair_floor);
      const float w16_17_ratio = w17/max(w16_17, pair_floor);
      const float w18_19_ratio = w19/max(w18_19, pair_floor);
      const float w20_21_ratio = w21/max(w20_21, pair_floor);
      // Sum the weighted bilinear samples, then normalize:
      float3 sum = float3(0.0,0.0,0.0);
      sum += w20_21 * tex2D_linearize(tex, tex_uv - (20.0 + w20_21_ratio) * dxdy).rgb;
      sum += w18_19 * tex2D_linearize(tex, tex_uv - (18.0 + w18_19_ratio) * dxdy).rgb;
      sum += w16_17 * tex2D_linearize(tex, tex_uv - (16.0 + w16_17_ratio) * dxdy).rgb;
      sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb;
      sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb;
      sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb;
      sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb;
      sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb;
      sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb;
      sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb;
      sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb;
      sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb;
      sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb;
      sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb;
      sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb;
      sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb;
      sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb;
      sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb;
      sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb;
      sum += w16_17 * tex2D_linearize(tex, tex_uv + (16.0 + w16_17_ratio) * dxdy).rgb;
      sum += w18_19 * tex2D_linearize(tex, tex_uv + (18.0 + w18_19_ratio) * dxdy).rgb;
      sum += w20_21 * tex2D_linearize(tex, tex_uv + (20.0 + w20_21_ratio) * dxdy).rgb;
      return sum * weight_sum_inv;
  }
  float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      // Requires: Same as tex2Dblur11fast()
      // Returns:  A 1D 31x Gaussian blurred texture lookup using 16 linear
      //           taps.  It may be mipmapped depending on settings and dxdy.
      // Unnormalized Gaussian weights for texel distances 0-15:
      const float denom_inv = 0.5/(sigma*sigma);
      const float w0 = 1.0;
      const float w1 = exp(-1.0 * denom_inv);
      const float w2 = exp(-4.0 * denom_inv);
      const float w3 = exp(-9.0 * denom_inv);
      const float w4 = exp(-16.0 * denom_inv);
      const float w5 = exp(-25.0 * denom_inv);
      const float w6 = exp(-36.0 * denom_inv);
      const float w7 = exp(-49.0 * denom_inv);
      const float w8 = exp(-64.0 * denom_inv);
      const float w9 = exp(-81.0 * denom_inv);
      const float w10 = exp(-100.0 * denom_inv);
      const float w11 = exp(-121.0 * denom_inv);
      const float w12 = exp(-144.0 * denom_inv);
      const float w13 = exp(-169.0 * denom_inv);
      const float w14 = exp(-196.0 * denom_inv);
      const float w15 = exp(-225.0 * denom_inv);
      // The exact normalization factor would be:
      //     1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 +
      //         w9 + w10 + w11 + w12 + w13 + w14 + w15))
      // but the curve-fitted approximation is used instead:
      const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);
      // Calculate combined weights and linear sample ratios between texel
      // pairs.  The center texel (with weight w0) is used twice, so halve its
      // weight.
      const float w0_1 = w0 * 0.5 + w1;
      const float w2_3 = w2 + w3;
      const float w4_5 = w4 + w5;
      const float w6_7 = w6 + w7;
      const float w8_9 = w8 + w9;
      const float w10_11 = w10 + w11;
      const float w12_13 = w12 + w13;
      const float w14_15 = w14 + w15;
      // For small sigma, both weights of an outer pair can underflow to 0,
      // which would make the ratio 0/0 (NaN) and poison the texture
      // coordinate.  Flooring the divisor yields a ratio of 0 in that case;
      // such a pair contributes nothing measurable to the sum anyway.
      // (w0_1 >= 0.5, so it needs no guard.)
      const float pair_floor = 1.0e-30;
      const float w0_1_ratio = w1/w0_1;
      const float w2_3_ratio = w3/max(w2_3, pair_floor);
      const float w4_5_ratio = w5/max(w4_5, pair_floor);
      const float w6_7_ratio = w7/max(w6_7, pair_floor);
      const float w8_9_ratio = w9/max(w8_9, pair_floor);
      const float w10_11_ratio = w11/max(w10_11, pair_floor);
      const float w12_13_ratio = w13/max(w12_13, pair_floor);
      const float w14_15_ratio = w15/max(w14_15, pair_floor);
      // Sum the weighted bilinear samples, then normalize:
      float3 sum = float3(0.0,0.0,0.0);
      sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb;
      sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb;
      sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb;
      sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb;
      sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb;
      sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb;
      sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb;
      sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb;
      sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb;
      sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb;
      sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb;
      sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb;
      sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb;
      sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb;
      sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb;
      sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb;
      return sum * weight_sum_inv;
  }
  float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      // Requires: Same as tex2Dblur11fast()
      // Returns:  A 1D 25x Gaussian blurred texture lookup using 1 nearest
      //           neighbor and 12 linear taps.  It may be mipmapped depending
      //           on settings and dxdy.
      // Unnormalized Gaussian weights for texel distances 0-12:
      const float denom_inv = 0.5/(sigma*sigma);
      const float w0 = 1.0;
      const float w1 = exp(-1.0 * denom_inv);
      const float w2 = exp(-4.0 * denom_inv);
      const float w3 = exp(-9.0 * denom_inv);
      const float w4 = exp(-16.0 * denom_inv);
      const float w5 = exp(-25.0 * denom_inv);
      const float w6 = exp(-36.0 * denom_inv);
      const float w7 = exp(-49.0 * denom_inv);
      const float w8 = exp(-64.0 * denom_inv);
      const float w9 = exp(-81.0 * denom_inv);
      const float w10 = exp(-100.0 * denom_inv);
      const float w11 = exp(-121.0 * denom_inv);
      const float w12 = exp(-144.0 * denom_inv);
      // The exact normalization factor would be:
      //     1.0 / (w0 + 2.0 * (
      //         w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12))
      // but the curve-fitted approximation is used instead:
      const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);
      // Calculate combined weights and linear sample ratios between texel
      // pairs.
      const float w1_2 = w1 + w2;
      const float w3_4 = w3 + w4;
      const float w5_6 = w5 + w6;
      const float w7_8 = w7 + w8;
      const float w9_10 = w9 + w10;
      const float w11_12 = w11 + w12;
      // For small sigma, both weights of a pair can underflow to 0, which
      // would make the ratio 0/0 (NaN) and poison the texture coordinate.
      // Flooring the divisor yields a ratio of 0 in that case; such a pair
      // contributes nothing measurable to the sum anyway.
      const float pair_floor = 1.0e-30;
      const float w1_2_ratio = w2/max(w1_2, pair_floor);
      const float w3_4_ratio = w4/max(w3_4, pair_floor);
      const float w5_6_ratio = w6/max(w5_6, pair_floor);
      const float w7_8_ratio = w8/max(w7_8, pair_floor);
      const float w9_10_ratio = w10/max(w9_10, pair_floor);
      const float w11_12_ratio = w12/max(w11_12, pair_floor);
      // Sum the weighted samples, then normalize:
      float3 sum = float3(0.0,0.0,0.0);
      sum += w11_12 * tex2D_linearize(tex, tex_uv - (11.0 + w11_12_ratio) * dxdy).rgb;
      sum += w9_10 * tex2D_linearize(tex, tex_uv - (9.0 + w9_10_ratio) * dxdy).rgb;
      sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb;
      sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb;
      sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb;
      sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb;
      sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
      sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb;
      sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb;
      sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb;
      sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb;
      sum += w9_10 * tex2D_linearize(tex, tex_uv + (9.0 + w9_10_ratio) * dxdy).rgb;
      sum += w11_12 * tex2D_linearize(tex, tex_uv + (11.0 + w11_12_ratio) * dxdy).rgb;
      return sum * weight_sum_inv;
  }
  float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      // Requires: Same as tex2Dblur11fast()
      // Returns:  A 1D 17x Gaussian blurred texture lookup using 1 nearest
      //           neighbor and 8 linear taps.  It may be mipmapped depending
      //           on settings and dxdy.
      // Unnormalized Gaussian weights for texel distances 0-8:
      const float denom_inv = 0.5/(sigma*sigma);
      const float w0 = 1.0;
      const float w1 = exp(-1.0 * denom_inv);
      const float w2 = exp(-4.0 * denom_inv);
      const float w3 = exp(-9.0 * denom_inv);
      const float w4 = exp(-16.0 * denom_inv);
      const float w5 = exp(-25.0 * denom_inv);
      const float w6 = exp(-36.0 * denom_inv);
      const float w7 = exp(-49.0 * denom_inv);
      const float w8 = exp(-64.0 * denom_inv);
      // The exact normalization factor would be:
      //     1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8))
      // but the curve-fitted approximation is used instead:
      const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);
      // Calculate combined weights and linear sample ratios between texel
      // pairs.
      const float w1_2 = w1 + w2;
      const float w3_4 = w3 + w4;
      const float w5_6 = w5 + w6;
      const float w7_8 = w7 + w8;
      // For small sigma, both weights of a pair can underflow to 0, which
      // would make the ratio 0/0 (NaN) and poison the texture coordinate.
      // Flooring the divisor yields a ratio of 0 in that case; such a pair
      // contributes nothing measurable to the sum anyway.
      const float pair_floor = 1.0e-30;
      const float w1_2_ratio = w2/max(w1_2, pair_floor);
      const float w3_4_ratio = w4/max(w3_4, pair_floor);
      const float w5_6_ratio = w6/max(w5_6, pair_floor);
      const float w7_8_ratio = w8/max(w7_8, pair_floor);
      // Sum the weighted samples, then normalize:
      float3 sum = float3(0.0,0.0,0.0);
      sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb;
      sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb;
      sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb;
      sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb;
      sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
      sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb;
      sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb;
      sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb;
      sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb;
      return sum * weight_sum_inv;
  }
  11102. //////////////////// ARBITRARILY RESIZABLE ONE-PASS BLURS ////////////////////
  float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      // Requires: Global requirements must be met (see file description).
      // Returns:  A 3x3 Gaussian blurred mipmapped texture lookup of the
      //           resized input.
      // Description:
      // This is the only arbitrarily resizable one-pass blur;
      // tex2Dblur5x5resize would perform like tex2Dblur9x9, MUCH slower than
      // tex2Dblur5resize.
      const float inv_two_var = 0.5/(sigma*sigma);
      // All 3x3 samples are loaded individually.  Quad-pixel communication
      // won't help either: This should perform like tex2Dblur5x5, but sharing
      // a 4x4 sample field would perform more like tex2Dblur8x8shared (worse).
      // Samples are numbered 0-8 row by row; row "lo" is offset by -dxdy.y
      // and row "hi" by +dxdy.y from the center row.
      const float2 step_x = float2(dxdy.x, 0.0);
      const float2 step_y = float2(0.0, dxdy.y);
      const float2 mid_uv = tex_uv;
      const float2 lo_uv = mid_uv - step_y;
      const float2 hi_uv = mid_uv + step_y;
      const float3 s0 = tex2D_linearize(tex, lo_uv - step_x).rgb;
      const float3 s1 = tex2D_linearize(tex, lo_uv).rgb;
      const float3 s2 = tex2D_linearize(tex, lo_uv + step_x).rgb;
      const float3 s3 = tex2D_linearize(tex, mid_uv - step_x).rgb;
      const float3 s4 = tex2D_linearize(tex, mid_uv).rgb;
      const float3 s5 = tex2D_linearize(tex, mid_uv + step_x).rgb;
      const float3 s6 = tex2D_linearize(tex, hi_uv - step_x).rgb;
      const float3 s7 = tex2D_linearize(tex, hi_uv).rgb;
      const float3 s8 = tex2D_linearize(tex, hi_uv + step_x).rgb;
      // Gaussian weights by squared distance from the center sample: the
      // center itself, the 4 edge-adjacent samples, and the 4 corners.
      const float k_center = 1.0;
      const float k_edge = exp(-LENGTH_SQ(float2(1.0, 0.0)) * inv_two_var);
      const float k_corner = exp(-LENGTH_SQ(float2(1.0, 1.0)) * inv_two_var);
      const float norm = 1.0/(k_center + 4.0 * (k_edge + k_corner));
      // Weight and sum the samples, then normalize:
      const float3 weighted = k_center * s4 +
          k_edge * (s1 + s3 + s5 + s7) +
          k_corner * (s0 + s2 + s6 + s8);
      return weighted * norm;
  }
  11141. //////////////////////////// FASTER ONE-PASS BLURS ///////////////////////////
  11142. float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv, // tex: texture to blur; tex_uv: coords of the center texel
  11143. const float2 dxdy, const float sigma) // dxdy: uv size of one texel step; sigma: Gaussian std. dev. in texel steps
  11144. {
  11145. // Perform a 1-pass 9x9 blur with 5x5 bilinear samples.
  11146. // Requires: Same as tex2Dblur9()
  11147. // Returns: A 9x9 Gaussian blurred mipmapped texture lookup composed of
  11148. // 5x5 carefully selected bilinear samples.
  11149. // Description:
  11150. // Perform a 1-pass 9x9 blur with 5x5 bilinear samples. Adjust the
  11151. // bilinear sample location to reflect the true Gaussian weights for each
  11152. // underlying texel. The following diagram illustrates the relative
  11153. // locations of bilinear samples. Each sample with the same number has the
  11154. // same weight (notice the symmetry). The letters a, b, c, d distinguish
  11155. // quadrants, and the letters U, D, L, R, C (up, down, left, right, center)
  11156. // distinguish 1D directions along the line containing the pixel center:
  11157. // 6a 5a 2U 5b 6b
  11158. // 4a 3a 1U 3b 4b
  11159. // 2L 1L 0C 1R 2R
  11160. // 4c 3c 1D 3d 4d
  11161. // 6c 5c 2D 5d 6d
  11162. // The following diagram illustrates the underlying equally spaced texels,
  11163. // named after the sample that accesses them and subnamed by their location
  11164. // within their 2x2, 2x1, 1x2, or 1x1 texel block:
  11165. // 6a4 6a3 5a4 5a3 2U2 5b3 5b4 6b3 6b4
  11166. // 6a2 6a1 5a2 5a1 2U1 5b1 5b2 6b1 6b2
  11167. // 4a4 4a3 3a4 3a3 1U2 3b3 3b4 4b3 4b4
  11168. // 4a2 4a1 3a2 3a1 1U1 3b1 3b2 4b1 4b2
  11169. // 2L2 2L1 1L2 1L1 0C1 1R1 1R2 2R1 2R2
  11170. // 4c2 4c1 3c2 3c1 1D1 3d1 3d2 4d1 4d2
  11171. // 4c4 4c3 3c4 3c3 1D2 3d3 3d4 4d3 4d4
  11172. // 6c2 6c1 5c2 5c1 2D1 5d1 5d2 6d1 6d2
  11173. // 6c4 6c3 5c4 5c3 2D2 5d3 5d4 6d3 6d4
  11174. // Note there is only one C texel and only two texels for each U, D, L, or
  11175. // R sample. The center sample is effectively a nearest neighbor sample,
  11176. // and the U/D/L/R samples use 1D linear filtering. All other texels are
  11177. // read with bilinear samples somewhere within their 2x2 texel blocks.
  11178. // COMPUTE TEXTURE COORDS:
  11179. // Statically compute sampling offsets within each 2x2 texel block, based
  11180. // on 1D sampling ratios between texels [1, 2] and [3, 4] texels away from
  11181. // the center, and reuse them independently for both dimensions. Compute
  11182. // these offsets based on the relative 1D Gaussian weights of the texels
  11183. // in question. (w1off means "Gaussian weight for the texel 1.0 texels
  11184. // away from the pixel center," etc.).
  11185. const float denom_inv = 0.5/(sigma*sigma); // 1/(2*sigma^2): scale of the Gaussian exponent
  11186. const float w1off = exp(-1.0 * denom_inv); // exponent uses the squared distance: 1^2
  11187. const float w2off = exp(-4.0 * denom_inv); // 2^2
  11188. const float w3off = exp(-9.0 * denom_inv); // 3^2
  11189. const float w4off = exp(-16.0 * denom_inv); // 4^2
  11190. const float texel1to2ratio = w2off/(w1off + w2off); // share of the [1, 2] pair's weight held by texel 2; NOTE(review): 0/0 if both exp() terms underflow (very small sigma) - confirm callers' sigma range
  11191. const float texel3to4ratio = w4off/(w3off + w4off); // share of the [3, 4] pair's weight held by texel 4; NOTE(review): same underflow caveat
  11192. // Statically compute texel offsets from the fragment center to each
  11193. // bilinear sample in the bottom-right quadrant, including x-axis-aligned:
  11194. const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0); // between texels 1 and 2 on the x axis
  11195. const float2 sample2R_texel_offset = float2(3.0, 0.0) + float2(texel3to4ratio, 0.0); // between texels 3 and 4 on the x axis
  11196. const float2 sample3d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio);
  11197. const float2 sample4d_texel_offset = float2(3.0, 1.0) + float2(texel3to4ratio, texel1to2ratio);
  11198. const float2 sample5d_texel_offset = float2(1.0, 3.0) + float2(texel1to2ratio, texel3to4ratio); // sample4d's offset with x and y exchanged
  11199. const float2 sample6d_texel_offset = float2(3.0, 3.0) + float2(texel3to4ratio, texel3to4ratio);
  11200. // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
  11201. // Statically compute Gaussian texel weights for the bottom-right quadrant.
  11202. // Read underscores as "and."
  11203. const float w1R1 = w1off;
  11204. const float w1R2 = w2off;
  11205. const float w2R1 = w3off;
  11206. const float w2R2 = w4off;
  11207. const float w3d1 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); // float2 argument = texel offset from the center (LENGTH_SQ is defined elsewhere; presumably squared length)
  11208. const float w3d2_3d3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv);
  11209. const float w3d4 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv);
  11210. const float w4d1_5d1 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv);
  11211. const float w4d2_5d3 = exp(-LENGTH_SQ(float2(4.0, 1.0)) * denom_inv);
  11212. const float w4d3_5d2 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv);
  11213. const float w4d4_5d4 = exp(-LENGTH_SQ(float2(4.0, 2.0)) * denom_inv);
  11214. const float w6d1 = exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv);
  11215. const float w6d2_6d3 = exp(-LENGTH_SQ(float2(4.0, 3.0)) * denom_inv);
  11216. const float w6d4 = exp(-LENGTH_SQ(float2(4.0, 4.0)) * denom_inv);
  11217. // Statically add texel weights in each sample to get sample weights:
  11218. const float w0 = 1.0; // center texel, unnormalized
  11219. const float w1 = w1R1 + w1R2;
  11220. const float w2 = w2R1 + w2R2;
  11221. const float w3 = w3d1 + 2.0 * w3d2_3d3 + w3d4; // the 2.0 counts the two texels (3d2, 3d3) that share one weight
  11222. const float w4 = w4d1_5d1 + w4d2_5d3 + w4d3_5d2 + w4d4_5d4;
  11223. const float w5 = w4; // samples 5* cover the same texel weights as samples 4* (see the shared w4d*_5d* names)
  11224. const float w6 = w6d1 + 2.0 * w6d2_6d3 + w6d4;
  11225. // Get the weight sum inverse (normalization factor):
  11226. const float weight_sum_inv =
  11227. 1.0/(w0 + 4.0 * (w1 + w2 + w3 + w4 + w5 + w6)); // every non-center weight is used by 4 samples
  11228. // LOAD TEXTURE SAMPLES:
  11229. // Load all 25 samples (1 nearest, 8 linear, 16 bilinear) using symmetry:
  11230. const float2 mirror_x = float2(-1.0, 1.0); // negates the x offset only
  11231. const float2 mirror_y = float2(1.0, -1.0); // negates the y offset only
  11232. const float2 mirror_xy = float2(-1.0, -1.0); // negates both offsets
  11233. const float2 dxdy_mirror_x = dxdy * mirror_x;
  11234. const float2 dxdy_mirror_y = dxdy * mirror_y;
  11235. const float2 dxdy_mirror_xy = dxdy * mirror_xy;
  11236. // Sampling order doesn't seem to affect performance, so just be clear:
  11237. const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb;
  11238. const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb;
  11239. const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb; // .yx moves the x-axis offset onto the y axis
  11240. const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb;
  11241. const float3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb;
  11242. const float3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset).rgb;
  11243. const float3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb;
  11244. const float3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb;
  11245. const float3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb;
  11246. const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb; // quadrant d: +x, +y
  11247. const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; // quadrant c: -x, +y
  11248. const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; // quadrant b: +x, -y
  11249. const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; // quadrant a: -x, -y
  11250. const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
  11251. const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
  11252. const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
  11253. const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
  11254. const float3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb;
  11255. const float3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb;
  11256. const float3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb;
  11257. const float3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb;
  11258. const float3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb;
  11259. const float3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb;
  11260. const float3 sample6b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb;
  11261. const float3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb;
  11262. // SUM WEIGHTED SAMPLES:
  11263. // Statically normalize weights (so total = 1.0), and sum weighted samples.
  11264. float3 sum = w0 * sample0C;
  11265. sum += w1 * (sample1R + sample1D + sample1L + sample1U);
  11266. sum += w2 * (sample2R + sample2D + sample2L + sample2U);
  11267. sum += w3 * (sample3d + sample3c + sample3b + sample3a);
  11268. sum += w4 * (sample4d + sample4c + sample4b + sample4a);
  11269. sum += w5 * (sample5d + sample5c + sample5b + sample5a);
  11270. sum += w6 * (sample6d + sample6c + sample6b + sample6a);
  11271. return sum * weight_sum_inv;
  11272. }
  11273. float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv, // tex: texture to blur; tex_uv: coords of the center texel
  11274. const float2 dxdy, const float sigma) // dxdy: uv size of one texel step; sigma: Gaussian std. dev. in texel steps
  11275. {
  11276. // Perform a 1-pass 7x7 blur with 4x4 bilinear samples.
  11277. // Requires: Same as tex2Dblur9()
  11278. // Returns: A 7x7 Gaussian blurred mipmapped texture lookup composed of
  11279. // 4x4 carefully selected bilinear samples.
  11280. // Description:
  11281. // First see the descriptions for tex2Dblur9x9() and tex2Dblur7(). This
  11282. // blur mixes concepts from both. The sample layout is as follows:
  11283. // 4a 3a 3b 4b
  11284. // 2a 1a 1b 2b
  11285. // 2c 1c 1d 2d
  11286. // 4c 3c 3d 4d
  11287. // The texel layout is as follows. Note that samples 3a/3b, 1a/1b, 1c/1d,
  11288. // and 3c/3d share a vertical column of texels, and samples 2a/2c, 1a/1c,
  11289. // 1b/1d, and 2b/2d share a horizontal row of texels (all sample1's share
  11290. // the center texel):
  11291. // 4a4 4a3 3a4 3ab3 3b4 4b3 4b4
  11292. // 4a2 4a1 3a2 3ab1 3b2 4b1 4b2
  11293. // 2a4 2a3 1a4 1ab3 1b4 2b3 2b4
  11294. // 2ac2 2ac1 1ac2 1* 1bd2 2bd1 2bd2
  11295. // 2c4 2c3 1c4 1cd3 1d4 2d3 2d4
  11296. // 4c2 4c1 3c2 3cd1 3d2 4d1 4d2
  11297. // 4c4 4c3 3c4 3cd3 3d4 4d3 4d4
  11298. // COMPUTE TEXTURE COORDS:
  11299. // Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
  11300. const float denom_inv = 0.5/(sigma*sigma); // 1/(2*sigma^2): scale of the Gaussian exponent
  11301. const float w0off = 1.0; // weight of the center texel (offset 0)
  11302. const float w1off = exp(-1.0 * denom_inv); // exponent uses the squared distance: 1^2
  11303. const float w2off = exp(-4.0 * denom_inv); // 2^2
  11304. const float w3off = exp(-9.0 * denom_inv); // 3^2
  11305. const float texel0to1ratio = w1off/(w0off * 0.5 + w1off); // w0off is halved; presumably because the center row/column is split between the samples on either side of it
  11306. const float texel2to3ratio = w3off/(w2off + w3off); // share of the [2, 3] pair's weight held by texel 3; NOTE(review): 0/0 if both exp() terms underflow (very small sigma) - confirm callers' sigma range
  11307. // Statically compute texel offsets from the fragment center to each
  11308. // bilinear sample in the bottom-right quadrant, including axis-aligned:
  11309. const float2 sample1d_texel_offset = float2(texel0to1ratio, texel0to1ratio);
  11310. const float2 sample2d_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
  11311. const float2 sample3d_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); // sample2d's offset with x and y exchanged
  11312. const float2 sample4d_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
  11313. // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
  11314. // Statically compute Gaussian texel weights for the bottom-right quadrant.
  11315. // Read underscores as "and."
  11316. const float w1abcd = 1.0; // center texel, read by all four sample1's
  11317. const float w1bd2_1cd3 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv); // float2 argument = texel offset from the center (LENGTH_SQ is defined elsewhere; presumably squared length)
  11318. const float w2bd1_3cd1 = exp(-LENGTH_SQ(float2(2.0, 0.0)) * denom_inv);
  11319. const float w2bd2_3cd2 = exp(-LENGTH_SQ(float2(3.0, 0.0)) * denom_inv);
  11320. const float w1d4 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
  11321. const float w2d3_3d2 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv);
  11322. const float w2d4_3d4 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv);
  11323. const float w4d1 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv);
  11324. const float w4d2_4d3 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv);
  11325. const float w4d4 = exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv);
  11326. // Statically add texel weights in each sample to get sample weights.
  11327. // Split weights for shared texels between samples sharing them:
  11328. const float w1 = w1abcd * 0.25 + w1bd2_1cd3 + w1d4; // center texel contributes a quarter to each of the four sample1's
  11329. const float w2_3 = (w2bd1_3cd1 + w2bd2_3cd2) * 0.5 + w2d3_3d2 + w2d4_3d4; // texels on the shared row/column contribute half to each of two samples
  11330. const float w4 = w4d1 + 2.0 * w4d2_4d3 + w4d4; // the 2.0 counts the two texels (4d2, 4d3) that share one weight
  11331. // Get the weight sum inverse (normalization factor):
  11332. const float weight_sum_inv =
  11333. 1.0/(4.0 * (w1 + 2.0 * w2_3 + w4)); // 4 quadrants; w2_3 is used by both samples 2* and 3*
  11334. // LOAD TEXTURE SAMPLES:
  11335. // Load all 16 samples using symmetry:
  11336. const float2 mirror_x = float2(-1.0, 1.0); // negates the x offset only
  11337. const float2 mirror_y = float2(1.0, -1.0); // negates the y offset only
  11338. const float2 mirror_xy = float2(-1.0, -1.0); // negates both offsets
  11339. const float2 dxdy_mirror_x = dxdy * mirror_x;
  11340. const float2 dxdy_mirror_y = dxdy * mirror_y;
  11341. const float2 dxdy_mirror_xy = dxdy * mirror_xy;
  11342. const float3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb; // quadrant a: -x, -y
  11343. const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb;
  11344. const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
  11345. const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
  11346. const float3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb; // quadrant b: +x, -y
  11347. const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb;
  11348. const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
  11349. const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
  11350. const float3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb; // quadrant c: -x, +y
  11351. const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb;
  11352. const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
  11353. const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
  11354. const float3 sample1d = tex2D_linearize(tex, tex_uv + dxdy * sample1d_texel_offset).rgb; // quadrant d: +x, +y
  11355. const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb;
  11356. const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
  11357. const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
  11358. // SUM WEIGHTED SAMPLES:
  11359. // Statically normalize weights (so total = 1.0), and sum weighted samples.
  11360. float3 sum = float3(0.0,0.0,0.0);
  11361. sum += w1 * (sample1a + sample1b + sample1c + sample1d);
  11362. sum += w2_3 * (sample2a + sample2b + sample2c + sample2d);
  11363. sum += w2_3 * (sample3a + sample3b + sample3c + sample3d);
  11364. sum += w4 * (sample4a + sample4b + sample4c + sample4d);
  11365. return sum * weight_sum_inv;
  11366. }
  11367. float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv, // tex: texture to blur; tex_uv: coords of the center texel
  11368. const float2 dxdy, const float sigma) // dxdy: uv size of one texel step; sigma: Gaussian std. dev. in texel steps
  11369. {
  11370. // Perform a 1-pass 5x5 blur with 3x3 bilinear samples.
  11371. // Requires: Same as tex2Dblur9()
  11372. // Returns: A 5x5 Gaussian blurred mipmapped texture lookup composed of
  11373. // 3x3 carefully selected bilinear samples.
  11374. // Description:
  11375. // First see the description for tex2Dblur9x9(). This blur uses the same
  11376. // concept and sample/texel locations except on a smaller scale. Samples:
  11377. // 2a 1U 2b
  11378. // 1L 0C 1R
  11379. // 2c 1D 2d
  11380. // Texels:
  11381. // 2a4 2a3 1U2 2b3 2b4
  11382. // 2a2 2a1 1U1 2b1 2b2
  11383. // 1L2 1L1 0C1 1R1 1R2
  11384. // 2c2 2c1 1D1 2d1 2d2
  11385. // 2c4 2c3 1D2 2d3 2d4
  11386. // COMPUTE TEXTURE COORDS:
  11387. // Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
  11388. const float denom_inv = 0.5/(sigma*sigma); // 1/(2*sigma^2): scale of the Gaussian exponent
  11389. const float w1off = exp(-1.0 * denom_inv); // exponent uses the squared distance: 1^2
  11390. const float w2off = exp(-4.0 * denom_inv); // 2^2
  11391. const float texel1to2ratio = w2off/(w1off + w2off); // share of the [1, 2] pair's weight held by texel 2; NOTE(review): 0/0 if both exp() terms underflow (very small sigma) - confirm callers' sigma range
  11392. // Statically compute texel offsets from the fragment center to each
  11393. // bilinear sample in the bottom-right quadrant, including x-axis-aligned:
  11394. const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0); // between texels 1 and 2 on the x axis
  11395. const float2 sample2d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio);
  11396. // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
  11397. // Statically compute Gaussian texel weights for the bottom-right quadrant.
  11398. // Read underscores as "and."
  11399. const float w1R1 = w1off;
  11400. const float w1R2 = w2off;
  11401. const float w2d1 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); // float2 argument = texel offset from the center (LENGTH_SQ is defined elsewhere; presumably squared length)
  11402. const float w2d2_3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv);
  11403. const float w2d4 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv);
  11404. // Statically add texel weights in each sample to get sample weights:
  11405. const float w0 = 1.0; // center texel, unnormalized
  11406. const float w1 = w1R1 + w1R2;
  11407. const float w2 = w2d1 + 2.0 * w2d2_3 + w2d4; // the 2.0 counts the two texels (2d2, 2d3) that share one weight
  11408. // Get the weight sum inverse (normalization factor):
  11409. const float weight_sum_inv = 1.0/(w0 + 4.0 * (w1 + w2)); // every non-center weight is used by 4 samples
  11410. // LOAD TEXTURE SAMPLES:
  11411. // Load all 9 samples (1 nearest, 4 linear, 4 bilinear) using symmetry:
  11412. const float2 mirror_x = float2(-1.0, 1.0); // negates the x offset only
  11413. const float2 mirror_y = float2(1.0, -1.0); // negates the y offset only
  11414. const float2 mirror_xy = float2(-1.0, -1.0); // negates both offsets
  11415. const float2 dxdy_mirror_x = dxdy * mirror_x;
  11416. const float2 dxdy_mirror_y = dxdy * mirror_y;
  11417. const float2 dxdy_mirror_xy = dxdy * mirror_xy;
  11418. const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb;
  11419. const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb;
  11420. const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb; // .yx moves the x-axis offset onto the y axis
  11421. const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb;
  11422. const float3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb;
  11423. const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; // quadrant d: +x, +y
  11424. const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; // quadrant c: -x, +y
  11425. const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; // quadrant b: +x, -y
  11426. const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; // quadrant a: -x, -y
  11427. // SUM WEIGHTED SAMPLES:
  11428. // Statically normalize weights (so total = 1.0), and sum weighted samples.
  11429. float3 sum = w0 * sample0C;
  11430. sum += w1 * (sample1R + sample1D + sample1L + sample1U);
  11431. sum += w2 * (sample2a + sample2b + sample2c + sample2d);
  11432. return sum * weight_sum_inv;
  11433. }
  11434. float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv, // tex: texture to blur; tex_uv: coords of the center texel
  11435. const float2 dxdy, const float sigma) // dxdy: uv size of one texel step; sigma: Gaussian std. dev. in texel steps
  11436. {
  11437. // Perform a 1-pass 3x3 blur with 2x2 bilinear samples.
  11438. // Requires: Same as tex2Dblur9()
  11439. // Returns: A 3x3 Gaussian blurred mipmapped texture lookup composed of
  11440. // 2x2 carefully selected bilinear samples.
  11441. // Description:
  11442. // First see the descriptions for tex2Dblur9x9() and tex2Dblur7(). This
  11443. // blur mixes concepts from both. The sample layout is as follows:
  11444. // 0a 0b
  11445. // 0c 0d
  11446. // The texel layout is as follows. Note that samples 0a/0b and 0c/0d share
  11447. // a vertical column of texels, and samples 0a/0c and 0b/0d share a
  11448. // horizontal row of texels (all samples share the center texel):
  11449. // 0a3 0ab2 0b3
  11450. // 0ac1 0*0 0bd1
  11451. // 0c3 0cd2 0d3
  11452. // COMPUTE TEXTURE COORDS:
  11453. // Statically compute bilinear sampling offsets (details in tex2Dblur9x9).
  11454. const float denom_inv = 0.5/(sigma*sigma); // 1/(2*sigma^2): scale of the Gaussian exponent
  11455. const float w0off = 1.0; // weight of the center texel (offset 0)
  11456. const float w1off = exp(-1.0 * denom_inv); // weight of a texel 1 step from the center
  11457. const float texel0to1ratio = w1off/(w0off * 0.5 + w1off); // w0off is halved; presumably because the center row/column is split between the samples on either side of it
  11458. // Statically compute texel offsets from the fragment center to each
  11459. // bilinear sample in the bottom-right quadrant, including axis-aligned:
  11460. const float2 sample0d_texel_offset = float2(texel0to1ratio, texel0to1ratio); // same fractional offset on both axes
  11461. // LOAD TEXTURE SAMPLES:
  11462. // Load all 4 samples using symmetry:
  11463. const float2 mirror_x = float2(-1.0, 1.0); // negates the x offset only
  11464. const float2 mirror_y = float2(1.0, -1.0); // negates the y offset only
  11465. const float2 mirror_xy = float2(-1.0, -1.0); // negates both offsets
  11466. const float2 dxdy_mirror_x = dxdy * mirror_x;
  11467. const float2 dxdy_mirror_y = dxdy * mirror_y;
  11468. const float2 dxdy_mirror_xy = dxdy * mirror_xy;
  11469. const float3 sample0a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample0d_texel_offset).rgb; // quadrant a: -x, -y
  11470. const float3 sample0b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample0d_texel_offset).rgb; // quadrant b: +x, -y
  11471. const float3 sample0c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample0d_texel_offset).rgb; // quadrant c: -x, +y
  11472. const float3 sample0d = tex2D_linearize(tex, tex_uv + dxdy * sample0d_texel_offset).rgb; // quadrant d: +x, +y
  11473. // SUM WEIGHTED SAMPLES:
  11474. // Weights for all samples are the same, so just average them:
  11475. return 0.25 * (sample0a + sample0b + sample0c + sample0d);
  11476. }
  11477. ////////////////// LINEAR ONE-PASS BLURS WITH SHARED SAMPLES /////////////////
  11478. float3 tex2Dblur12x12shared(const sampler2D tex,
  11479. const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
  11480. const float sigma)
  11481. {
  11482. // Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
  11483. // Requires: 1.) Same as tex2Dblur9()
  11484. // 2.) ddx() and ddy() are present in the current Cg profile.
  11485. // 3.) The GPU driver is using fine/high-quality derivatives.
  11486. // 4.) quad_vector *correctly* describes the current fragment's
  11487. // location in its pixel quad, by the conventions noted in
  11488. // get_quad_vector[_naive].
  11489. // 5.) tex_uv.w = log2(video_size/output_size).y
  11490. // 6.) tex2Dlod() is present in the current Cg profile.
  11491. // Optional: Tune artifacts vs. excessive blurriness with the global
  11492. // float error_blurring.
  11493. // Returns: A blurred texture lookup using a "virtual" 12x12 Gaussian
  11494. // blur (a 6x6 blur of carefully selected bilinear samples)
  11495. // of the given mip level. There will be subtle inaccuracies,
  11496. // especially for small or high-frequency detailed sources.
  11497. // Description:
  11498. // Perform a 1-pass blur with shared texture lookups across a pixel quad.
  11499. // We'll get neighboring samples with high-quality ddx/ddy derivatives, as
  11500. // in GPU Pro 2, Chapter VI.2, "Shader Amortization using Pixel Quad
  11501. // Message Passing" by Eric Penner.
  11502. //
  11503. // Our "virtual" 12x12 blur will be comprised of ((6 - 1)^2)/4 + 3 = 12
  11504. // bilinear samples, where bilinear sampling positions are computed from
  11505. // the relative Gaussian weights of the 4 surrounding texels. The catch is
  11506. // that the appropriate texel weights and sample coords differ for each
  11507. // fragment, but we're reusing most of the same samples across a quad of
  11508. // destination fragments. (We do use unique coords for the four nearest
  11509. // samples at each fragment.) Mixing bilinear filtering and sample-sharing
  11510. // therefore introduces some error into the weights, and this can get nasty
  11511. // when the source image is small or high-frequency. Computing bilinear
  11512. // ratios based on weights at the sample field center results in sharpening
  11513. // and ringing artifacts, but we can move samples closer to halfway between
  11514. // texels to try blurring away the error (which can move features around by
  11515. // a texel or so). Tune this with the global float "error_blurring".
  11516. //
  11517. // The pixel quad's sample field covers 12x12 texels, accessed through 6x6
  11518. // bilinear (2x2 texel) taps. Each fragment depends on a window of 10x10
  11519. // texels (5x5 bilinear taps), and each fragment is responsible for loading
  11520. // a 6x6 texel quadrant as a 3x3 block of bilinear taps, plus 3 more taps
  11521. // to use unique bilinear coords for sample0* for each fragment. This
  11522. // diagram illustrates the relative locations of bilinear samples 1-9 for
  11523. // each quadrant a, b, c, d (note samples will not be equally spaced):
  11524. // 8a 7a 6a 6b 7b 8b
  11525. // 5a 4a 3a 3b 4b 5b
  11526. // 2a 1a 0a 0b 1b 2b
  11527. // 2c 1c 0c 0d 1d 2d
  11528. // 5c 4c 3c 3d 4d 5d
  11529. // 8c 7c 6c 6d 7d 8d
  11530. // The following diagram illustrates the underlying equally spaced texels,
  11531. // named after the sample that accesses them and subnamed by their location
  11532. // within their 2x2 texel block:
  11533. // 8a3 8a2 7a3 7a2 6a3 6a2 6b2 6b3 7b2 7b3 8b2 8b3
  11534. // 8a1 8a0 7a1 7a0 6a1 6a0 6b0 6b1 7b0 7b1 8b0 8b1
  11535. // 5a3 5a2 4a3 4a2 3a3 3a2 3b2 3b3 4b2 4b3 5b2 5b3
  11536. // 5a1 5a0 4a1 4a0 3a1 3a0 3b0 3b1 4b0 4b1 5b0 5b1
  11537. // 2a3 2a2 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 2b2 2b3
  11538. // 2a1 2a0 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 2b0 2b1
  11539. // 2c1 2c0 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 2d0 2d1
  11540. // 2c3 2c2 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 2d2 2d3
  11541. // 5c1 5c0 4c1 4c0 3c1 3c0 3d0 3d1 4d0 4d1 5d0 5d1
  11542. // 5c3 5c2 4c3 4c2 3c3 3c2 3d2 3d3 4d2 4d3 5d2 5d3
  11543. // 8c1 8c0 7c1 7c0 6c1 6c0 6d0 6d1 7d0 7d1 8d0 8d1
  11544. // 8c3 8c2 7c3 7c2 6c3 6c2 6d2 6d3 7d2 7d3 8d2 8d3
  11545. // With this symmetric arrangement, we don't have to know which absolute
  11546. // quadrant a sample lies in to assign kernel weights; it's enough to know
  11547. // the sample number and the relative quadrant of the sample (relative to
  11548. // the current quadrant):
  11549. // {current, adjacent x, adjacent y, diagonal}
  11550. // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
  11551. // Statically compute sampling offsets within each 2x2 texel block, based
  11552. // on appropriate 1D Gaussian sampling ratio between texels [0, 1], [2, 3],
  11553. // and [4, 5] away from the fragment, and reuse them independently for both
  11554. // dimensions. Use the sample field center as the estimated destination,
  11555. // but nudge the result closer to halfway between texels to blur error.
  11556. const float denom_inv = 0.5/(sigma*sigma);
  11557. const float w0off = 1.0;
  11558. const float w0_5off = exp(-(0.5*0.5) * denom_inv);
  11559. const float w1off = exp(-(1.0*1.0) * denom_inv);
  11560. const float w1_5off = exp(-(1.5*1.5) * denom_inv);
  11561. const float w2off = exp(-(2.0*2.0) * denom_inv);
  11562. const float w2_5off = exp(-(2.5*2.5) * denom_inv);
  11563. const float w3_5off = exp(-(3.5*3.5) * denom_inv);
  11564. const float w4_5off = exp(-(4.5*4.5) * denom_inv);
  11565. const float w5_5off = exp(-(5.5*5.5) * denom_inv);
  11566. const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
  11567. const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
  11568. const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring);
  11569. // We don't share sample0*, so use the nearest destination fragment:
  11570. const float texel0to1ratio_nearest = w1off/(w0off + w1off);
  11571. const float texel1to2ratio_nearest = w2off/(w1off + w2off);
  11572. // Statically compute texel offsets from the bottom-right fragment to each
  11573. // bilinear sample in the bottom-right quadrant:
  11574. const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
  11575. const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
  11576. const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
  11577. const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
  11578. const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
  11579. const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio);
  11580. const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
  11581. const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
  11582. const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio);
  11583. const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio);
  11584. const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio);
  11585. const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio);
  11586. // CALCULATE KERNEL WEIGHTS:
  11587. // Statically compute bilinear sample weights at each destination fragment
  11588. // based on the sum of their 4 underlying texel weights. Assume a same-
  11589. // resolution blur, so each symmetrically named sample weight will compute
  11590. // the same at every fragment in the pixel quad: We can therefore compute
  11591. // texel weights based only on the bottom-right quadrant (fragment at 0d0).
  11592. // To avoid too much boilerplate code, use a macro to get all 4 texel
  11593. // weights for a bilinear sample based on the offset of its top-left texel:
  11594. #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
  11595. (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
  11596. exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
  11597. exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
  11598. exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
  11599. const float w8diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -6.0);
  11600. const float w7diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -6.0);
  11601. const float w6diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -6.0);
  11602. const float w6adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -6.0);
  11603. const float w7adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -6.0);
  11604. const float w8adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -6.0);
  11605. const float w5diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -4.0);
  11606. const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
  11607. const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
  11608. const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
  11609. const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
  11610. const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0);
  11611. const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -2.0);
  11612. const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
  11613. const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
  11614. const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
  11615. const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
  11616. const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0);
  11617. const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 0.0);
  11618. const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
  11619. const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
  11620. const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
  11621. const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
  11622. const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0);
  11623. const float w5adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 2.0);
  11624. const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
  11625. const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
  11626. const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
  11627. const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
  11628. const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0);
  11629. const float w8adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 4.0);
  11630. const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0);
  11631. const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0);
  11632. const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0);
  11633. const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0);
  11634. const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0);
  11635. #undef GET_TEXEL_QUAD_WEIGHTS
  11636. // Statically pack weights for runtime:
  11637. const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
  11638. const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
  11639. const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag);
  11640. const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
  11641. const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag);
  11642. const float4 w5 = float4(w5curr, w5adjx, w5adjy, w5diag);
  11643. const float4 w6 = float4(w6curr, w6adjx, w6adjy, w6diag);
  11644. const float4 w7 = float4(w7curr, w7adjx, w7adjy, w7diag);
  11645. const float4 w8 = float4(w8curr, w8adjx, w8adjy, w8diag);
  11646. // Get the weight sum inverse (normalization factor):
  11647. const float4 weight_sum4 = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8;
  11648. const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw;
  11649. const float weight_sum = weight_sum2.x + weight_sum2.y;
  11650. const float weight_sum_inv = 1.0/(weight_sum);
  11651. // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
  11652. // Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
  11653. const float2 dxdy_curr = dxdy * quad_vector.xy;
  11654. // Load bilinear samples for the current quadrant (for this fragment):
  11655. const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
  11656. const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
  11657. const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
  11658. const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
  11659. const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
  11660. const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
  11661. const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
  11662. const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb;
  11663. const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb;
  11664. const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb;
  11665. const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb;
  11666. const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb;
  11667. // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
  11668. // Fetch the samples from other fragments in the 2x2 quad:
  11669. float3 sample1adjx, sample1adjy, sample1diag;
  11670. float3 sample2adjx, sample2adjy, sample2diag;
  11671. float3 sample3adjx, sample3adjy, sample3diag;
  11672. float3 sample4adjx, sample4adjy, sample4diag;
  11673. float3 sample5adjx, sample5adjy, sample5diag;
  11674. float3 sample6adjx, sample6adjy, sample6diag;
  11675. float3 sample7adjx, sample7adjy, sample7diag;
  11676. float3 sample8adjx, sample8adjy, sample8diag;
  11677. quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
  11678. quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
  11679. quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
  11680. quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag);
  11681. quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag);
  11682. quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag);
  11683. quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag);
  11684. quad_gather(quad_vector, sample8curr, sample8adjx, sample8adjy, sample8diag);
  11685. // Statically normalize weights (so total = 1.0), and sum weighted samples.
  11686. // Fill each row of a matrix with an rgb sample and pre-multiply by the
  11687. // weights to obtain a weighted result:
  11688. float3 sum = float3(0.0,0.0,0.0);
  11689. sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
  11690. sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
  11691. sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag));
  11692. sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
  11693. sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag));
  11694. sum += mul(w5, float4x3(sample5curr, sample5adjx, sample5adjy, sample5diag));
  11695. sum += mul(w6, float4x3(sample6curr, sample6adjx, sample6adjy, sample6diag));
  11696. sum += mul(w7, float4x3(sample7curr, sample7adjx, sample7adjy, sample7diag));
  11697. sum += mul(w8, float4x3(sample8curr, sample8adjx, sample8adjy, sample8diag));
  11698. return sum * weight_sum_inv;
  11699. }
  11700. float3 tex2Dblur10x10shared(const sampler2D tex,
  11701. const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
  11702. const float sigma)
  11703. {
  11704. // Purpose: Perform a 1-pass mipmapped blur, sharing samples across a pixel quad.
  11705. // Requires: Same preconditions as tex2Dblur12x12shared().
  11706. // Params: tex, tex_uv = texture and float4 lookup coords; dxdy = uv step per unit of texel offset; quad_vector = passed to quad_gather(), its .xy scales dxdy; sigma = Gaussian std. dev. used for all weights.
  11707. // Returns: A blurred texture lookup of the given mip level using a "virtual"
  11708. // 10x10 Gaussian blur (a 5x5 blur of carefully selected bilinear samples).
  11709. // There will be subtle inaccuracies, especially for small or high-frequency detailed sources.
  11710. // Description:
  11711. // First see the description for tex2Dblur12x12shared(). This
  11712. // function uses the same concept and sample placement, but each fragment
  11713. // only uses 25 of the 36 samples taken across the pixel quad (to cover a
  11714. // 5x5 sample area, or 10x10 texel area), and it is meant to use a lower
  11715. // standard deviation to compensate. Thanks to symmetry, the 11 omitted
  11716. // samples are always the same ones:
  11717. // 8adjx, 2adjx, 5adjx,
  11718. // 6adjy, 7adjy, 8adjy,
  11719. // 2diag, 5diag, 6diag, 7diag, 8diag
  11720. // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
  11721. // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared()).
  11722. const float denom_inv = 0.5/(sigma*sigma); // 1/(2*sigma^2): scale of the Gaussian exponent
  11723. const float w0off = 1.0; // Gaussian weight at distance 0, i.e. exp(0)
  11724. const float w0_5off = exp(-(0.5*0.5) * denom_inv); // wN_Moff = Gaussian weight at a distance of N.M texels
  11725. const float w1off = exp(-(1.0*1.0) * denom_inv);
  11726. const float w1_5off = exp(-(1.5*1.5) * denom_inv);
  11727. const float w2off = exp(-(2.0*2.0) * denom_inv);
  11728. const float w2_5off = exp(-(2.5*2.5) * denom_inv);
  11729. const float w3_5off = exp(-(3.5*3.5) * denom_inv);
  11730. const float w4_5off = exp(-(4.5*4.5) * denom_inv);
  11731. const float w5_5off = exp(-(5.5*5.5) * denom_inv);
  11732. const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); // weight-derived ratio, blended toward 0.5 by error_blurring
  11733. const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
  11734. const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring);
  11735. // The sample0* lookups are not shared, so base their ratios on the nearest destination fragment:
  11736. const float texel0to1ratio_nearest = w1off/(w0off + w1off);
  11737. const float texel1to2ratio_nearest = w2off/(w1off + w2off);
  11738. // Statically compute texel offsets from the bottom-right fragment to each
  11739. // bilinear sample in the bottom-right quadrant (integer texel base + fractional ratio):
  11740. const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
  11741. const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
  11742. const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
  11743. const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
  11744. const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
  11745. const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio);
  11746. const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
  11747. const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
  11748. const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio);
  11749. const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio);
  11750. const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio);
  11751. const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio);
  11752. // CALCULATE KERNEL WEIGHTS:
  11753. // Statically compute the weight of each bilinear sample at each destination fragment
  11754. // as the sum of its 4 texel weights (details in tex2Dblur12x12shared()); (xoff, yoff) is the first of the 4 texels.
  11755. #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
  11756. (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
  11757. exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
  11758. exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
  11759. exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
  11760. // Only 25 of the 36 sample weights are needed. The following weights are skipped:
  11761. // 8adjx, 2adjx, 5adjx,
  11762. // 6adjy, 7adjy, 8adjy,
  11763. // 2diag, 5diag, 6diag, 7diag, 8diag
  11764. const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
  11765. const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
  11766. const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
  11767. const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
  11768. const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0);
  11769. const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
  11770. const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
  11771. const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
  11772. const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
  11773. const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0);
  11774. const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
  11775. const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
  11776. const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
  11777. const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
  11778. const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0);
  11779. const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
  11780. const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
  11781. const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
  11782. const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
  11783. const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0);
  11784. const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0);
  11785. const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0);
  11786. const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0);
  11787. const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0);
  11788. const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0);
  11789. #undef GET_TEXEL_QUAD_WEIGHTS
  11790. // Get the inverse of the sum of all 25 used weights (the normalization factor):
  11791. const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr +
  11792. w4curr + w5curr + w6curr + w7curr + w8curr +
  11793. w0adjx + w1adjx + w3adjx + w4adjx + w6adjx + w7adjx +
  11794. w0adjy + w1adjy + w2adjy + w3adjy + w4adjy + w5adjy +
  11795. w0diag + w1diag + w3diag + w4diag);
  11796. // Statically pack most weights for runtime. Note the mixed packing: w2and5 and w6and7 each hold the two used weights of two different samples.
  11797. const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
  11798. const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
  11799. const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
  11800. const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag);
  11801. const float4 w2and5 = float4(w2curr, w2adjy, w5curr, w5adjy);
  11802. const float4 w6and7 = float4(w6curr, w6adjx, w7curr, w7adjx);
  11803. // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
  11804. // Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
  11805. const float2 dxdy_curr = dxdy * quad_vector.xy;
  11806. // Load bilinear samples for the current quadrant (for this fragment). The four sample0* lookups use tex2D_linearize(); samples 1-8 use tex2Dlod_linearize():
  11807. const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
  11808. const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
  11809. const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
  11810. const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
  11811. const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
  11812. const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
  11813. const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
  11814. const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb;
  11815. const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb;
  11816. const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb;
  11817. const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb;
  11818. const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb;
  11819. // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
  11820. // Fetch the samples from other fragments in the 2x2 quad in order of need (sample8 is only used as sample8curr, so it is not gathered):
  11821. float3 sample1adjx, sample1adjy, sample1diag;
  11822. float3 sample2adjx, sample2adjy, sample2diag;
  11823. float3 sample3adjx, sample3adjy, sample3diag;
  11824. float3 sample4adjx, sample4adjy, sample4diag;
  11825. float3 sample5adjx, sample5adjy, sample5diag;
  11826. float3 sample6adjx, sample6adjy, sample6diag;
  11827. float3 sample7adjx, sample7adjy, sample7diag;
  11828. quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
  11829. quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
  11830. quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
  11831. quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag);
  11832. quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag);
  11833. quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag);
  11834. quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag);
  11835. // Sum the weighted samples; the total is normalized once at the end.
  11836. // Fill each row of a matrix with an rgb sample and pre-multiply by the
  11837. // weights to obtain a weighted result. First do the samples whose 4 weights are all used:
  11838. float3 sum = float3(0.0,0.0,0.0);
  11839. sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
  11840. sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
  11841. sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
  11842. sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag));
  11843. // Now do the mixed-sample ones (matrix rows follow the w2and5/w6and7 packing order):
  11844. sum += mul(w2and5, float4x3(sample2curr, sample2adjy, sample5curr, sample5adjy));
  11845. sum += mul(w6and7, float4x3(sample6curr, sample6adjx, sample7curr, sample7adjx));
  11846. sum += w8curr * sample8curr;
  11847. // Normalize the sum (so the weights add to 1.0) and return:
  11848. return sum * weight_sum_inv;
  11849. }
  11850. float3 tex2Dblur8x8shared(const sampler2D tex,
  11851. const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
  11852. const float sigma)
  11853. {
  11854. // Purpose: Perform a 1-pass mipmapped blur, sharing samples across a pixel quad.
  11855. // Requires: Same preconditions as tex2Dblur12x12shared().
  11856. // Params: tex, tex_uv = texture and float4 lookup coords; dxdy = uv step per unit of texel offset; quad_vector = passed to quad_gather(), its .xy scales dxdy; sigma = Gaussian std. dev. used for all weights.
  11857. // Returns: A blurred texture lookup of the given mip level using a "virtual"
  11858. // 8x8 Gaussian blur (a 4x4 blur of carefully selected bilinear samples).
  11859. // There will be subtle inaccuracies, especially for small or high-frequency detailed sources.
  11860. // Description:
  11861. // First see the description for tex2Dblur12x12shared(). This function
  11862. // shares the same concept and a similar sample placement, except each
  11863. // quadrant contains 4x4 texels and 2x2 samples instead of 6x6 and 3x3
  11864. // respectively. There could be a total of 16 samples, 4 of which each
  11865. // fragment is responsible for, but each fragment loads 0a/0b/0c/0d with
  11866. // its own offset to reduce shared sample artifacts, bringing the sample
  11867. // count for each fragment to 7 (4 unshared sample0* lookups + samples 1-3). Sample placement:
  11868. // 3a 2a 2b 3b
  11869. // 1a 0a 0b 1b
  11870. // 1c 0c 0d 1d
  11871. // 3c 2c 2d 3d
  11872. // Texel placement:
  11873. // 3a3 3a2 2a3 2a2 2b2 2b3 3b2 3b3
  11874. // 3a1 3a0 2a1 2a0 2b0 2b1 3b0 3b1
  11875. // 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3
  11876. // 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1
  11877. // 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1
  11878. // 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3
  11879. // 3c1 3c0 2c1 2c0 2d0 2d1 3d0 3d1
  11880. // 3c3 3c2 2c3 2c2 2d2 2d3 3d2 3d3
  11881. // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
  11882. // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared()).
  11883. const float denom_inv = 0.5/(sigma*sigma); // 1/(2*sigma^2): scale of the Gaussian exponent
  11884. const float w0off = 1.0; // Gaussian weight at distance 0, i.e. exp(0)
  11885. const float w0_5off = exp(-(0.5*0.5) * denom_inv); // wN_Moff = Gaussian weight at a distance of N.M texels
  11886. const float w1off = exp(-(1.0*1.0) * denom_inv);
  11887. const float w1_5off = exp(-(1.5*1.5) * denom_inv);
  11888. const float w2off = exp(-(2.0*2.0) * denom_inv);
  11889. const float w2_5off = exp(-(2.5*2.5) * denom_inv);
  11890. const float w3_5off = exp(-(3.5*3.5) * denom_inv);
  11891. const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); // weight-derived ratio, blended toward 0.5 by error_blurring
  11892. const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
  11893. // The sample0* lookups are not shared, so base their ratios on the nearest destination fragment:
  11894. const float texel0to1ratio_nearest = w1off/(w0off + w1off);
  11895. const float texel1to2ratio_nearest = w2off/(w1off + w2off);
  11896. // Statically compute texel offsets from the bottom-right fragment to each
  11897. // bilinear sample in the bottom-right quadrant (integer texel base + fractional ratio):
  11898. const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
  11899. const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
  11900. const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
  11901. const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
  11902. const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
  11903. const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
  11904. const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
  11905. // CALCULATE KERNEL WEIGHTS:
  11906. // Statically compute the weight of each bilinear sample at each destination fragment
  11907. // as the sum of its 4 texel weights (details in tex2Dblur12x12shared()); (xoff, yoff) is the first of the 4 texels.
  11908. #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
  11909. (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
  11910. exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
  11911. exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
  11912. exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
  11913. const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
  11914. const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
  11915. const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
  11916. const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
  11917. const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
  11918. const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
  11919. const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
  11920. const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
  11921. const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
  11922. const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
  11923. const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
  11924. const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
  11925. const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
  11926. const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
  11927. const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
  11928. const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
  11929. #undef GET_TEXEL_QUAD_WEIGHTS
  11930. // Statically pack weights for runtime, one float4 per sample index in (curr, adjx, adjy, diag) order:
  11931. const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
  11932. const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
  11933. const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag);
  11934. const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
  11935. // Get the inverse of the sum of all 16 weights (the normalization factor):
  11936. const float4 weight_sum4 = w0 + w1 + w2 + w3;
  11937. const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw;
  11938. const float weight_sum = weight_sum2.x + weight_sum2.y;
  11939. const float weight_sum_inv = 1.0/(weight_sum);
  11940. // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
  11941. // Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
  11942. const float2 dxdy_curr = dxdy * quad_vector.xy;
  11943. // Load bilinear samples for the current quadrant (for this fragment). The four sample0* lookups use tex2D_linearize(); samples 1-3 use tex2Dlod_linearize():
  11944. const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
  11945. const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
  11946. const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
  11947. const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
  11948. const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
  11949. const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
  11950. const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
  11951. // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
  11952. // Fetch the samples from other fragments in the 2x2 quad (quad_gather() fills the adjx/adjy/diag outputs):
  11953. float3 sample1adjx, sample1adjy, sample1diag;
  11954. float3 sample2adjx, sample2adjy, sample2diag;
  11955. float3 sample3adjx, sample3adjy, sample3diag;
  11956. quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
  11957. quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
  11958. quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
  11959. // Sum the weighted samples; the total is normalized (so weights add to 1.0) in the return.
  11960. // Fill each row of a matrix with an rgb sample and pre-multiply by the
  11961. // weights to obtain a weighted result:
  11962. float3 sum = float3(0.0,0.0,0.0);
  11963. sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
  11964. sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
  11965. sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag));
  11966. sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
  11967. return sum * weight_sum_inv;
  11968. }
  11969. float3 tex2Dblur6x6shared(const sampler2D tex,
  11970. const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
  11971. const float sigma)
  11972. {
  11973. // Purpose: Perform a 1-pass mipmapped blur, sharing samples across a pixel quad.
  11974. // Requires: Same preconditions as tex2Dblur12x12shared().
  11975. // Params: tex, tex_uv = texture and float4 lookup coords; dxdy = uv step per unit of texel offset; quad_vector = passed to quad_gather(), its .xy scales dxdy; sigma = Gaussian std. dev. used for all weights.
  11976. // Returns: A blurred texture lookup of the given mip level using a "virtual"
  11977. // 6x6 Gaussian blur (a 3x3 blur of carefully selected bilinear samples).
  11978. // There will be subtle inaccuracies, especially for small or high-frequency detailed sources.
  11979. // Description:
  11980. // First see the description for tex2Dblur8x8shared(). This
  11981. // function uses the same concept and sample placement, but each fragment
  11982. // only uses 9 of the 16 samples taken across the pixel quad (to cover a
  11983. // 3x3 sample area, or 6x6 texel area), and it is meant to use a lower
  11984. // standard deviation to compensate. Thanks to symmetry, the 7 omitted
  11985. // samples are always the same ones:
  11986. // 1adjx, 3adjx
  11987. // 2adjy, 3adjy
  11988. // 1diag, 2diag, 3diag
  11989. // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
  11990. // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared()).
  11991. const float denom_inv = 0.5/(sigma*sigma); // 1/(2*sigma^2): scale of the Gaussian exponent
  11992. const float w0off = 1.0; // Gaussian weight at distance 0, i.e. exp(0)
  11993. const float w0_5off = exp(-(0.5*0.5) * denom_inv); // wN_Moff = Gaussian weight at a distance of N.M texels
  11994. const float w1off = exp(-(1.0*1.0) * denom_inv);
  11995. const float w1_5off = exp(-(1.5*1.5) * denom_inv);
  11996. const float w2off = exp(-(2.0*2.0) * denom_inv);
  11997. const float w2_5off = exp(-(2.5*2.5) * denom_inv);
  11998. const float w3_5off = exp(-(3.5*3.5) * denom_inv);
  11999. const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); // weight-derived ratio, blended toward 0.5 by error_blurring
  12000. const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
  12001. // The sample0* lookups are not shared, so base their ratios on the nearest destination fragment:
  12002. const float texel0to1ratio_nearest = w1off/(w0off + w1off);
  12003. const float texel1to2ratio_nearest = w2off/(w1off + w2off);
  12004. // Statically compute texel offsets from the bottom-right fragment to each
  12005. // bilinear sample in the bottom-right quadrant (integer texel base + fractional ratio):
  12006. const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
  12007. const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
  12008. const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
  12009. const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
  12010. const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
  12011. const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
  12012. const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
  12013. // CALCULATE KERNEL WEIGHTS:
  12014. // Statically compute the weight of each bilinear sample at each destination fragment
  12015. // as the sum of its 4 texel weights (details in tex2Dblur12x12shared()); (xoff, yoff) is the first of the 4 texels.
  12016. #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
  12017. (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
  12018. exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
  12019. exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
  12020. exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
  12021. // Only 9 of the 16 sample weights are needed. The following weights are skipped:
  12022. // 1adjx, 3adjx
  12023. // 2adjy, 3adjy
  12024. // 1diag, 2diag, 3diag
  12025. const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
  12026. const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
  12027. const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
  12028. const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
  12029. const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
  12030. const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
  12031. const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
  12032. const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
  12033. const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
  12034. #undef GET_TEXEL_QUAD_WEIGHTS
  12035. // Get the inverse of the sum of all 9 used weights (the normalization factor):
  12036. const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr +
  12037. w0adjx + w2adjx + w0adjy + w1adjy + w0diag);
  12038. // Statically pack the sample0* weights for runtime; the other weights are applied individually below:
  12039. const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
  12040. // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
  12041. // Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
  12042. const float2 dxdy_curr = dxdy * quad_vector.xy;
  12043. // Load bilinear samples for the current quadrant (for this fragment). The four sample0* lookups use tex2D_linearize(); samples 1-3 use tex2Dlod_linearize():
  12044. const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
  12045. const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
  12046. const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
  12047. const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
  12048. const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
  12049. const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
  12050. const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
  12051. // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
  12052. // Fetch the samples from other fragments in the 2x2 quad (only sample1adjy and sample2adjx are consumed below; sample3 is only used as sample3curr):
  12053. float3 sample1adjx, sample1adjy, sample1diag;
  12054. float3 sample2adjx, sample2adjy, sample2diag;
  12055. quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
  12056. quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
  12057. // Sum the weighted samples; the total is normalized (so weights add to 1.0) in the return.
  12058. // Fill each row of a matrix with an rgb sample and pre-multiply by the
  12059. // weights to obtain a weighted result for sample0*, and handle the rest
  12060. // of the weights more directly/verbosely:
  12061. float3 sum = float3(0.0,0.0,0.0);
  12062. sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
  12063. sum += w1curr * sample1curr + w1adjy * sample1adjy + w2curr * sample2curr +
  12064. w2adjx * sample2adjx + w3curr * sample3curr;
  12065. return sum * weight_sum_inv;
  12066. }
  12067. /////////////////////// MAX OPTIMAL SIGMA BLUR WRAPPERS //////////////////////
  12068. // The following blurs are static wrappers around the dynamic blurs above.
  12069. // HOPEFULLY, the compiler will be smart enough to do constant-folding.
  12070. // Resizable separable blurs:
  // Static-sigma convenience overload: forwards to the four-argument
  // tex2Dblur11resize() with sigma fixed at blur11_std_dev.
  inline float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy)
  {
      const float static_sigma = blur11_std_dev;
      return tex2Dblur11resize(tex, tex_uv, dxdy, static_sigma);
  }
  // Static-sigma convenience overload: forwards to the four-argument
  // tex2Dblur9resize() with sigma fixed at blur9_std_dev.
  inline float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy)
  {
      const float static_sigma = blur9_std_dev;
      return tex2Dblur9resize(tex, tex_uv, dxdy, static_sigma);
  }
  // Static-sigma convenience overload: forwards to the four-argument
  // tex2Dblur7resize() with sigma fixed at blur7_std_dev.
  inline float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy)
  {
      const float static_sigma = blur7_std_dev;
      return tex2Dblur7resize(tex, tex_uv, dxdy, static_sigma);
  }
  // Static-sigma convenience overload: forwards to the four-argument
  // tex2Dblur5resize() with sigma fixed at blur5_std_dev.
  inline float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy)
  {
      const float static_sigma = blur5_std_dev;
      return tex2Dblur5resize(tex, tex_uv, dxdy, static_sigma);
  }
  // Static-sigma convenience overload: forwards to the four-argument
  // tex2Dblur3resize() with sigma fixed at blur3_std_dev.
  inline float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy)
  {
      const float static_sigma = blur3_std_dev;
      return tex2Dblur3resize(tex, tex_uv, dxdy, static_sigma);
  }
  12096. // Fast separable blurs:
  // Static-sigma convenience overload: forwards to the four-argument
  // tex2Dblur11fast() with sigma fixed at blur11_std_dev.
  inline float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy)
  {
      const float static_sigma = blur11_std_dev;
      return tex2Dblur11fast(tex, tex_uv, dxdy, static_sigma);
  }
  // Static-sigma convenience overload: forwards to the four-argument
  // tex2Dblur9fast() with sigma fixed at blur9_std_dev.
  inline float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy)
  {
      const float static_sigma = blur9_std_dev;
      return tex2Dblur9fast(tex, tex_uv, dxdy, static_sigma);
  }
  // Static-sigma convenience overload: forwards to the four-argument
  // tex2Dblur7fast() with sigma fixed at blur7_std_dev.
  inline float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy)
  {
      const float static_sigma = blur7_std_dev;
      return tex2Dblur7fast(tex, tex_uv, dxdy, static_sigma);
  }
  // Static-sigma convenience overload: forwards to the four-argument
  // tex2Dblur5fast() with sigma fixed at blur5_std_dev.
  inline float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy)
  {
      const float static_sigma = blur5_std_dev;
      return tex2Dblur5fast(tex, tex_uv, dxdy, static_sigma);
  }
  // Static-sigma convenience overload: forwards to the four-argument
  // tex2Dblur3fast() with sigma fixed at blur3_std_dev.
  inline float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy)
  {
      const float static_sigma = blur3_std_dev;
      return tex2Dblur3fast(tex, tex_uv, dxdy, static_sigma);
  }
  12122. // Huge, "fast" separable blurs:
  // Static-sigma convenience overload: forwards to the four-argument
  // tex2Dblur43fast() with sigma fixed at blur43_std_dev.
  inline float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy)
  {
      const float static_sigma = blur43_std_dev;
      return tex2Dblur43fast(tex, tex_uv, dxdy, static_sigma);
  }
  // Static-sigma convenience overload: forwards to the four-argument
  // tex2Dblur31fast() with sigma fixed at blur31_std_dev.
  inline float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy)
  {
      const float static_sigma = blur31_std_dev;
      return tex2Dblur31fast(tex, tex_uv, dxdy, static_sigma);
  }
  // Static-sigma convenience overload: forwards to the four-argument
  // tex2Dblur25fast() with sigma fixed at blur25_std_dev.
  inline float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy)
  {
      const float static_sigma = blur25_std_dev;
      return tex2Dblur25fast(tex, tex_uv, dxdy, static_sigma);
  }
  // Static-sigma convenience overload: forwards to the four-argument
  // tex2Dblur17fast() with sigma fixed at blur17_std_dev.
  inline float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy)
  {
      const float static_sigma = blur17_std_dev;
      return tex2Dblur17fast(tex, tex_uv, dxdy, static_sigma);
  }
  12143. // Resizable one-pass blurs:
  // Static-sigma convenience overload: forwards to the four-argument
  // tex2Dblur3x3resize() with sigma fixed at blur3_std_dev.
  inline float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy)
  {
      const float static_sigma = blur3_std_dev;
      return tex2Dblur3x3resize(tex, tex_uv, dxdy, static_sigma);
  }
  12149. // "Fast" one-pass blurs:
  // Static-sigma convenience overload: forwards to the four-argument
  // tex2Dblur9x9() with sigma fixed at blur9_std_dev.
  inline float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy)
  {
      const float static_sigma = blur9_std_dev;
      return tex2Dblur9x9(tex, tex_uv, dxdy, static_sigma);
  }
  // Static-sigma convenience overload: forwards to the four-argument
  // tex2Dblur7x7() with sigma fixed at blur7_std_dev.
  inline float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy)
  {
      const float static_sigma = blur7_std_dev;
      return tex2Dblur7x7(tex, tex_uv, dxdy, static_sigma);
  }
  // Static-sigma convenience overload: forwards to the four-argument
  // tex2Dblur5x5() with sigma fixed at blur5_std_dev.
  inline float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy)
  {
      const float static_sigma = blur5_std_dev;
      return tex2Dblur5x5(tex, tex_uv, dxdy, static_sigma);
  }
  // Static-sigma convenience overload: forwards to the four-argument
  // tex2Dblur3x3() with sigma fixed at blur3_std_dev.
  inline float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv,
      const float2 dxdy)
  {
      const float static_sigma = blur3_std_dev;
      return tex2Dblur3x3(tex, tex_uv, dxdy, static_sigma);
  }
  12170. // "Fast" shared-sample one-pass blurs:
  // Static-sigma convenience overload: forwards to the five-argument
  // tex2Dblur12x12shared() with sigma fixed at blur12_std_dev.
  inline float3 tex2Dblur12x12shared(const sampler2D tex,
      const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
  {
      const float static_sigma = blur12_std_dev;
      return tex2Dblur12x12shared(tex, tex_uv, dxdy, quad_vector,
          static_sigma);
  }
  // Static-sigma convenience overload: forwards to the five-argument
  // tex2Dblur10x10shared() with sigma fixed at blur10_std_dev.
  inline float3 tex2Dblur10x10shared(const sampler2D tex,
      const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
  {
      const float static_sigma = blur10_std_dev;
      return tex2Dblur10x10shared(tex, tex_uv, dxdy, quad_vector,
          static_sigma);
  }
  // Static-sigma convenience overload: forwards to the five-argument
  // tex2Dblur8x8shared() with sigma fixed at blur8_std_dev.
  inline float3 tex2Dblur8x8shared(const sampler2D tex,
      const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
  {
      const float static_sigma = blur8_std_dev;
      return tex2Dblur8x8shared(tex, tex_uv, dxdy, quad_vector,
          static_sigma);
  }
  // Static-sigma convenience overload: forwards to the five-argument
  // tex2Dblur6x6shared() with sigma fixed at blur6_std_dev.
  inline float3 tex2Dblur6x6shared(const sampler2D tex,
      const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
  {
      const float static_sigma = blur6_std_dev;
      return tex2Dblur6x6shared(tex, tex_uv, dxdy, quad_vector,
          static_sigma);
  }
  12191. #endif // BLUR_FUNCTIONS_H
  12192. //////////////////////////// END BLUR-FUNCTIONS ///////////////////////////
  12193. /////////////////////////////// BLOOM CONSTANTS //////////////////////////////
  12194. // Compute constants with manual inlines of the functions below:
  // Threshold handed to get_min_sigma_to_blur_triad() as "thresh" by the bloom
  // helpers in this file: the max desired pixel difference left in a blurred
  // phosphor triad.
  static const float bloom_diff_thresh = 1.0 / 256.0;
  12196. /////////////////////////////////// HELPERS //////////////////////////////////
  // Estimate the smallest Gaussian sigma that blurs one phosphor triad to an
  // even color, within the given tolerance.
  //  Requires:   1.) triad_size is the final phosphor triad size in pixels.
  //              2.) thresh is the max desired pixel difference in the
  //                  blurred triad (e.g. 1.0/256.0).
  //  Returns:    The closed-form estimate
  //                  -0.05168 + 0.6113*triad_size
  //                      - 1.122*triad_size*sqrt(0.000416 + thresh)
  //              which the original author obtained by curve-fitting data
  //              (reported max error ~0.086036, mean sq. error ~0.0013387).
  //  Notes:      A simpler fit recorded by the author, with reported max
  //              error ~0.16486 and mean sq. error ~0.0041041, is
  //                  0.5985*triad_size - triad_size*sqrt(thresh)
  //              It is not used here.
  inline float get_min_sigma_to_blur_triad(const float triad_size,
      const float thresh)
  {
      // Threshold-dependent factor shared by the subtracted term:
      const float thresh_root = sqrt(0.000416 + thresh);
      // Term that grows linearly with the triad size:
      const float linear_part = 0.6113*triad_size;
      // Term that is subtracted; it grows as the tolerance grows:
      const float tolerance_part = 1.122*triad_size*thresh_root;
      return -0.05168 + linear_part - tolerance_part;
  }
  // Sigma for blurring the BLOOM_APPROX pass at an absolute (not
  // viewport-relative) scale.
  //  Requires:   1.) min_allowed_viewport_triads.x, bloom_approx_scale_x and
  //                  max_viewport_size_x must be visible globals.
  //              2.) Per the original author's notes, the number of
  //                  horizontal phosphor triads in the final image should be
  //                  >= min_allowed_viewport_triads.x for realistic results,
  //                  and bloom_approx_scale_x/min_allowed_viewport_triads.x
  //                  should be <= 1.1658025090 to keep the result below
  //                  0.62666015625 (the largest sigma ensuring the largest
  //                  unused texel weight stays < 1.0/256.0 for a 3x3 blur).
  //              3.) thresh is the max desired pixel difference in the
  //                  blurred triad (e.g. 1.0/256.0).
  //  Returns:    The sigma needed to blur a triad of size
  //                  max_viewport_size_x/min_allowed_viewport_triads.x,
  //              scaled by bloom_approx_scale_x/max_viewport_size_x.
  //  Description (original author's rationale):
  //  BLOOM_APPROX should look like a downscaled phosphor blur. The viewport
  //  size is unknown in this pass, so instead of scaling the real bloom sigma
  //  we blur as much as it would take to blur away
  //  min_allowed_viewport_triads.x triads. This overblurs when more triads
  //  are actually used, which is tolerable: blurring a constant fraction of
  //  the viewport may resemble a true optical bloom anyway.
  inline float get_absolute_scale_blur_sigma(const float thresh)
  {
      // Triad size when the fewest allowed triads span the assumed viewport:
      const float widest_triad_size =
          max_viewport_size_x/min_allowed_viewport_triads.x;
      // Sigma required at that assumed viewport size:
      const float sigma_at_max_viewport =
          get_min_sigma_to_blur_triad(widest_triad_size, thresh);
      // Rescale from the assumed viewport down to the BLOOM_APPROX scale:
      const float scale_to_bloom_approx =
          bloom_approx_scale_x/max_viewport_size_x;
      return scale_to_bloom_approx * sigma_at_max_viewport;
  }
  // Given a Gaussian blur sigma, get the blur weight for the center texel.
  //  With RUNTIME_PHOSPHOR_BLOOM_SIGMA defined, the result comes straight
  //  from get_fast_gaussian_weight_sum_inv(sigma). Otherwise the unnormalized
  //  Gaussian taps exp(-k*k/(2*sigma*sigma)) are summed over the tap count
  //  selected by the PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_*_PIXELS hints, and the
  //  square of the inverse of that sum is returned.
  inline float get_center_weight(const float sigma)
  {
  #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
      return get_fast_gaussian_weight_sum_inv(sigma);
  #else
      const float exp_scale = 0.5/(sigma*sigma);
      // Unnormalized tap weights at integer distances 0..21 from the center:
      const float tap0 = 1.0;
      const float tap1 = exp(-1.0 * exp_scale);
      const float tap2 = exp(-4.0 * exp_scale);
      const float tap3 = exp(-9.0 * exp_scale);
      const float tap4 = exp(-16.0 * exp_scale);
      const float tap5 = exp(-25.0 * exp_scale);
      const float tap6 = exp(-36.0 * exp_scale);
      const float tap7 = exp(-49.0 * exp_scale);
      const float tap8 = exp(-64.0 * exp_scale);
      const float tap9 = exp(-81.0 * exp_scale);
      const float tap10 = exp(-100.0 * exp_scale);
      const float tap11 = exp(-121.0 * exp_scale);
      const float tap12 = exp(-144.0 * exp_scale);
      const float tap13 = exp(-169.0 * exp_scale);
      const float tap14 = exp(-196.0 * exp_scale);
      const float tap15 = exp(-225.0 * exp_scale);
      const float tap16 = exp(-256.0 * exp_scale);
      const float tap17 = exp(-289.0 * exp_scale);
      const float tap18 = exp(-324.0 * exp_scale);
      const float tap19 = exp(-361.0 * exp_scale);
      const float tap20 = exp(-400.0 * exp_scale);
      const float tap21 = exp(-441.0 * exp_scale);
      // Original author's caveat: if the implementation uses a smaller blur
      // than the max allowed, the center weight is merely overestimated (a
      // bit more energy goes into the brightpass). If it uses a larger blur
      // than "allowed" because of dynamic branching, the center weight could
      // be underestimated, which is more of a problem.
      //
      // Sum the taps on one side of the center for the largest blur the
      // hints permit:
      #if defined(PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS)
          // 43x blur:
          const float one_side_sum =
              tap1 + tap2 + tap3 + tap4 + tap5 + tap6 + tap7 + tap8 + tap9 +
              tap10 + tap11 + tap12 + tap13 + tap14 + tap15 + tap16 + tap17 +
              tap18 + tap19 + tap20 + tap21;
      #elif defined(PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS)
          // 31x blur:
          const float one_side_sum =
              tap1 + tap2 + tap3 + tap4 + tap5 + tap6 + tap7 + tap8 + tap9 +
              tap10 + tap11 + tap12 + tap13 + tap14 + tap15;
      #elif defined(PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS)
          // 25x blur:
          const float one_side_sum =
              tap1 + tap2 + tap3 + tap4 + tap5 + tap6 + tap7 + tap8 + tap9 +
              tap10 + tap11 + tap12;
      #elif defined(PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS)
          // 17x blur:
          const float one_side_sum =
              tap1 + tap2 + tap3 + tap4 + tap5 + tap6 + tap7 + tap8;
      #else
          // 9x blur:
          const float one_side_sum = tap1 + tap2 + tap3 + tap4;
      #endif
      // Normalization factor of the full 1D kernel (center + both sides):
      const float norm_1d = 1.0 / (tap0 + 2.0 * one_side_sum);
      // The returned center weight is the 1D factor squared:
      return norm_1d * norm_1d;
  #endif
  }
  // Dispatch to one of the "fast" separable blurs (9/17/25/31/43 taps) for the
  // given sigma.
  //  texture, tex_uv, dxdy, sigma are forwarded unchanged to the chosen
  //  tex2Dblur*fast() overload.
  //  Side effect: may #define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE for the rest
  //  of the translation unit.
  //  Failed optimization notes (original author): a same-size mipmapped 5-tap
  //  separable blur10 that reached into lower mip levels could handle any
  //  sigma. It was as fast as blur25fast for runtime sigmas and a tad faster
  //  than blur31fast for static sigmas, but mipmapping two viewport-size
  //  passes ate 10% of FPS across all codepaths, so it wasn't worth it.
  inline float3 tex2DblurNfast(const sampler2D texture, const float2 tex_uv,
      const float2 dxdy, const float sigma)
  {
      // Decide whether to branch on sigma. With a static sigma we always
      // branch and ignore the #define hints: the smallest sufficient blur is
      // picked and (per the original author) the branches cost nothing. With
      // a runtime sigma we branch only if the drivers allow dynamic branches;
      // the author notes that beats a hugely excessive blur even though each
      // branch eats ~1% FPS.
  #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
      #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
          #define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
      #endif
  #else
      #define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
  #endif

  #if defined(PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE)
      // Pick the smallest blur whose std_dev limit covers sigma:
      if(sigma <= blur9_std_dev)
      {
          return tex2Dblur9fast(texture, tex_uv, dxdy, sigma);
      }
      if(sigma <= blur17_std_dev)
      {
          return tex2Dblur17fast(texture, tex_uv, dxdy, sigma);
      }
      if(sigma <= blur25_std_dev)
      {
          return tex2Dblur25fast(texture, tex_uv, dxdy, sigma);
      }
      if(sigma <= blur31_std_dev)
      {
          return tex2Dblur31fast(texture, tex_uv, dxdy, sigma);
      }
      return tex2Dblur43fast(texture, tex_uv, dxdy, sigma);
      // Without branching we can only guess at the blur size we need, so the
      // cases below use the largest blur the triad-size hints allow.
  #elif defined(PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS)
      return tex2Dblur43fast(texture, tex_uv, dxdy, sigma);
  #elif defined(PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS)
      return tex2Dblur31fast(texture, tex_uv, dxdy, sigma);
  #elif defined(PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS)
      return tex2Dblur25fast(texture, tex_uv, dxdy, sigma);
  #elif defined(PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS)
      return tex2Dblur17fast(texture, tex_uv, dxdy, sigma);
  #else
      return tex2Dblur9fast(texture, tex_uv, dxdy, sigma);
  #endif
  }
  // Compute the blur sigma used by the BLOOM_APPROX pass.
  //  Requires:   1.) output_size_x_runtime == BLOOM_APPROX.output_size.x.
  //                  It is used only on the gaussian4x4 path, in case the
  //                  static globals below are incorrect.
  //              2.) bloom_approx_size_x_for_fake should equal the same
  //                  value if PHOSPHOR_BLOOM_FAKE is #defined.
  //              3.) bloom_approx_size_x should equal the same otherwise.
  //              4.) estimated_viewport_size_x is divided by
  //                  mask_triad_size_desired to estimate a triad count.
  //  Returns:    For gaussian4x4 (bloom_approx_filter > 1.5), a dynamic sigma
  //              built from the runtime triad count and output size.
  //              Otherwise (blur3x3 or bilinear), a sigma built only from
  //              static values. In both cases the horizontal sigma is
  //              combined with a scanline sigma via length().
  inline float get_bloom_approx_sigma(const float output_size_x_runtime,
      const float estimated_viewport_size_x)
  {
      // Static triad count: a compromise that ensures typical triads are
      // blurred, even if unusually large ones aren't.
      static const float mask_num_triads_static =
          max(min_allowed_viewport_triads.x, mask_num_triads_desired_static);
      // Runtime triad count: blend between the count implied by the viewport
      // size and the explicitly requested count, then clamp from below.
      const float triads_from_viewport =
          estimated_viewport_size_x/mask_triad_size_desired;
      const float triads_selected = lerp(triads_from_viewport,
          mask_num_triads_desired, mask_specify_num_triads);
      const float mask_num_triads_runtime =
          max(min_allowed_viewport_triads.x, triads_selected);
      // Assume an extremely large viewport size for asymptotic results:
      static const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);

      if(bloom_approx_filter > 1.5)   // 4x4 true Gaussian resize
      {
          // Use the runtime num triads and output size:
          const float sigma_at_max_viewport = get_min_sigma_to_blur_triad(
              max_viewport_size_x/mask_num_triads_runtime, bloom_diff_thresh);
          const float horiz_sigma_runtime =
              sigma_at_max_viewport * output_size_x_runtime/max_viewport_size_x;
          // The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire,
          // but account for the Gaussian scanline sigma from the last pass
          // too. The bloom will be too wide horizontally but tall enough
          // vertically.
          return length(float2(horiz_sigma_runtime, beam_max_sigma));
      }

      // 3x3 blur resize (the bilinear resize doesn't need a sigma). The
      // biggest reason to choose blur3x3 is to avoid dynamic weights, so
      // everything from here on is a static calculation.
      #ifdef PHOSPHOR_BLOOM_FAKE
          static const float output_size_x_static =
              bloom_approx_size_x_for_fake;
      #else
          static const float output_size_x_static = bloom_approx_size_x;
      #endif
      static const float triad_size_at_max_viewport =
          max_viewport_size_x/mask_num_triads_static;
      const float sigma_at_max_viewport_static = get_min_sigma_to_blur_triad(
          triad_size_at_max_viewport, bloom_diff_thresh);
      const float horiz_sigma_static =
          sigma_at_max_viewport_static * output_size_x_static/max_viewport_size_x;
      // As above, try accounting for the Gaussian scanline sigma from the
      // last pass; use the static default value here:
      return length(float2(horiz_sigma_static, beam_max_sigma_static));
  }
  // Choose the sigma for the final phosphor bloom.
  //  Requires:   1.) bloom_sigma_runtime is a precalculated sigma that's
  //                  optimal for the [known] triad size.
  //              2.) Call this from a fragment shader (not a vertex shader),
  //                  or blurring with static sigmas won't be constant-folded.
  //  Returns:    bloom_sigma_runtime when RUNTIME_PHOSPHOR_BLOOM_SIGMA is
  //              #defined (the original author quotes this as ~10% slower).
  //              Otherwise the optimistic static sigma from
  //              get_min_sigma_to_blur_triad(mask_triad_size_desired_static,
  //              bloom_diff_thresh).
  inline float get_final_bloom_sigma(const float bloom_sigma_runtime)
  {
      // Sigma for the statically configured triad size:
      const float static_triad_sigma = get_min_sigma_to_blur_triad(
          mask_triad_size_desired_static, bloom_diff_thresh);
  #ifndef RUNTIME_PHOSPHOR_BLOOM_SIGMA
      // Overblurring looks as bad as underblurring, so assume average-size
      // triads, not worst-case huge triads:
      return static_triad_sigma;
  #else
      return bloom_sigma_runtime;
  #endif
  }
  12460. #endif // BLOOM_FUNCTIONS_H
  12461. //////////////////////////// END BLOOM-FUNCTIONS ///////////////////////////
  12462. //#include "../../../../include/gamma-management.h"
  12463. /////////////////////////////////// HELPERS //////////////////////////////////
  // 4x4 Gaussian resize of a texture with weights computed at runtime.
  //  Requires:   1.) All requirements of gamma-management.h must be satisfied!
  //              2.) filter_linearN must == "true" in your .cgp preset.
  //              3.) mipmap_inputN must == "true" in your .cgp preset if
  //                  output_size << SRC.video_size.
  //              4.) dxdy should contain the uv pixel spacing:
  //                      dxdy = max(float2(1.0),
  //                          SRC.video_size/output_size)/SRC.texture_size;
  //              5.) tex_size == SRC.texture_size
  //              6.) texture_size_inv == float2(1.0)/SRC.texture_size
  //              7.) tex_uv_to_pixel_scale == output_size *
  //                      SRC.texture_size / SRC.video_size;
  //              8.) sigma is the desired Gaussian standard deviation, in
  //                  terms of output pixels. It should be < ~0.66171875 to
  //                  ensure the first unused sample (outside the 4x4 box) has
  //                  a weight < 1.0/256.0.
  //  Returns:    A true 4x4 Gaussian resize of the input: the weighted sum of
  //              16 linearized samples divided by the sum of their weights.
  //  Description:
  //  Given correct inputs, this Gaussian resizer samples 4 pixel locations
  //  along each downsized dimension and/or 4 texel locations along each
  //  upsized dimension. It computes dynamic weights based on the pixel-space
  //  distance of each sample from the destination pixel. Per the original
  //  author it is arbitrarily resizable and higher quality than
  //  tex2Dblur3x3_resize, but slower.
  //  Sample layout (row-major, sample5 is second row, second column):
  //       0  1  2  3
  //       4  5  6  7
  //       8  9 10 11
  //      12 13 14 15
  //  TODO: Move this to a more suitable file once there are others like it.
  float3 tex2Dresize_gaussian4x4(sampler2D tex, float2 tex_uv, float2 dxdy, float2 tex_size, float2 texture_size_inv, float2 tex_uv_to_pixel_scale, float sigma)
  {
      const float denom_inv = 0.5/(sigma*sigma);
      // We're taking 4x4 samples, and we're snapping to texels for upsizing.
      // Find texture coords for sample 5 (second row, second column):
      const float2 curr_texel = tex_uv * tex_size;
      const float2 prev_texel =
          floor(curr_texel - float2(under_half)) + float2(0.5);
      const float2 prev_texel_uv = prev_texel * texture_size_inv;
      // Per axis: 1.0 when the sample spacing is at most one texel (snap to
      // the texel center), 0.0 otherwise (use the unsnapped position):
      const float2 snap = float2((dxdy.x <= texture_size_inv.x), (dxdy.y <= texture_size_inv.y));
      const float2 sample5_downsize_uv = tex_uv - 0.5 * dxdy;
      const float2 sample5_uv = lerp(sample5_downsize_uv, prev_texel_uv, snap);
      // Compute texture coords for other samples. Diagonal anchors (0, 10,
      // 15) are offset by dxdy; the rest are horizontal steps from them:
      const float2 dx = float2(dxdy.x, 0.0);
      const float2 sample0_uv = sample5_uv - dxdy;
      const float2 sample10_uv = sample5_uv + dxdy;
      const float2 sample15_uv = sample5_uv + 2.0 * dxdy;
      const float2 sample1_uv = sample0_uv + dx;
      const float2 sample2_uv = sample0_uv + 2.0 * dx;
      const float2 sample3_uv = sample0_uv + 3.0 * dx;
      const float2 sample4_uv = sample5_uv - dx;
      const float2 sample6_uv = sample5_uv + dx;
      const float2 sample7_uv = sample5_uv + 2.0 * dx;
      const float2 sample8_uv = sample10_uv - 2.0 * dx;
      const float2 sample9_uv = sample10_uv - dx;
      const float2 sample11_uv = sample10_uv + dx;
      const float2 sample12_uv = sample15_uv - 3.0 * dx;
      const float2 sample13_uv = sample15_uv - 2.0 * dx;
      const float2 sample14_uv = sample15_uv - dx;
      // Load each sample at its own uv, so every color matches the weight
      // computed for it below:
      float3 sample0 = tex2D_linearize(tex, sample0_uv).rgb;
      float3 sample1 = tex2D_linearize(tex, sample1_uv).rgb;
      float3 sample2 = tex2D_linearize(tex, sample2_uv).rgb;
      float3 sample3 = tex2D_linearize(tex, sample3_uv).rgb;
      float3 sample4 = tex2D_linearize(tex, sample4_uv).rgb;
      float3 sample5 = tex2D_linearize(tex, sample5_uv).rgb;
      float3 sample6 = tex2D_linearize(tex, sample6_uv).rgb;
      float3 sample7 = tex2D_linearize(tex, sample7_uv).rgb;
      float3 sample8 = tex2D_linearize(tex, sample8_uv).rgb;
      float3 sample9 = tex2D_linearize(tex, sample9_uv).rgb;
      float3 sample10 = tex2D_linearize(tex, sample10_uv).rgb;
      float3 sample11 = tex2D_linearize(tex, sample11_uv).rgb;
      float3 sample12 = tex2D_linearize(tex, sample12_uv).rgb;
      float3 sample13 = tex2D_linearize(tex, sample13_uv).rgb;
      float3 sample14 = tex2D_linearize(tex, sample14_uv).rgb;
      float3 sample15 = tex2D_linearize(tex, sample15_uv).rgb;
      // Compute destination pixel offsets for each sample:
      const float2 dest_pixel = tex_uv * tex_uv_to_pixel_scale;
      const float2 sample0_offset = sample0_uv * tex_uv_to_pixel_scale - dest_pixel;
      const float2 sample1_offset = sample1_uv * tex_uv_to_pixel_scale - dest_pixel;
      const float2 sample2_offset = sample2_uv * tex_uv_to_pixel_scale - dest_pixel;
      const float2 sample3_offset = sample3_uv * tex_uv_to_pixel_scale - dest_pixel;
      const float2 sample4_offset = sample4_uv * tex_uv_to_pixel_scale - dest_pixel;
      const float2 sample5_offset = sample5_uv * tex_uv_to_pixel_scale - dest_pixel;
      const float2 sample6_offset = sample6_uv * tex_uv_to_pixel_scale - dest_pixel;
      const float2 sample7_offset = sample7_uv * tex_uv_to_pixel_scale - dest_pixel;
      const float2 sample8_offset = sample8_uv * tex_uv_to_pixel_scale - dest_pixel;
      const float2 sample9_offset = sample9_uv * tex_uv_to_pixel_scale - dest_pixel;
      const float2 sample10_offset = sample10_uv * tex_uv_to_pixel_scale - dest_pixel;
      const float2 sample11_offset = sample11_uv * tex_uv_to_pixel_scale - dest_pixel;
      const float2 sample12_offset = sample12_uv * tex_uv_to_pixel_scale - dest_pixel;
      const float2 sample13_offset = sample13_uv * tex_uv_to_pixel_scale - dest_pixel;
      const float2 sample14_offset = sample14_uv * tex_uv_to_pixel_scale - dest_pixel;
      const float2 sample15_offset = sample15_uv * tex_uv_to_pixel_scale - dest_pixel;
      // Compute Gaussian sample weights, exp(-|offset|^2 / (2*sigma^2)):
      const float w0 = exp(-LENGTH_SQ(sample0_offset) * denom_inv);
      const float w1 = exp(-LENGTH_SQ(sample1_offset) * denom_inv);
      const float w2 = exp(-LENGTH_SQ(sample2_offset) * denom_inv);
      const float w3 = exp(-LENGTH_SQ(sample3_offset) * denom_inv);
      const float w4 = exp(-LENGTH_SQ(sample4_offset) * denom_inv);
      const float w5 = exp(-LENGTH_SQ(sample5_offset) * denom_inv);
      const float w6 = exp(-LENGTH_SQ(sample6_offset) * denom_inv);
      const float w7 = exp(-LENGTH_SQ(sample7_offset) * denom_inv);
      const float w8 = exp(-LENGTH_SQ(sample8_offset) * denom_inv);
      const float w9 = exp(-LENGTH_SQ(sample9_offset) * denom_inv);
      const float w10 = exp(-LENGTH_SQ(sample10_offset) * denom_inv);
      const float w11 = exp(-LENGTH_SQ(sample11_offset) * denom_inv);
      const float w12 = exp(-LENGTH_SQ(sample12_offset) * denom_inv);
      const float w13 = exp(-LENGTH_SQ(sample13_offset) * denom_inv);
      const float w14 = exp(-LENGTH_SQ(sample14_offset) * denom_inv);
      const float w15 = exp(-LENGTH_SQ(sample15_offset) * denom_inv);
      // Normalization factor so the 16 weights effectively sum to 1.0:
      const float weight_sum_inv = 1.0/(
          w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 +
          w8 + w9 + w10 + w11 + w12 + w13 + w14 + w15);
      // Weight and sum the samples:
      const float3 sum = w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
          w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 +
          w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 +
          w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15;
      return sum * weight_sum_inv;
  }
  12578. void main() {
  12579. // Would a viewport-relative size work better for this pass? (No.)
  12580. // PROS:
  12581. // 1.) Instead of writing an absolute size to user-cgp-constants.h, we'd
  12582. // write a viewport scale. That number could be used to directly scale
  12583. // the viewport-resolution bloom sigma and/or triad size to a smaller
  12584. // scale. This way, we could calculate an optimal dynamic sigma no
  12585. // matter how the dot pitch is specified.
  12586. // CONS:
  12587. // 1.) Texel smearing would be much worse at small viewport sizes, but
  12588. // performance would be much worse at large viewport sizes, so there
  12589. // would be no easy way to calculate a decent scale.
  12590. // 2.) Worse, we could no longer get away with using a constant-size blur!
  12591. // Instead, we'd have to face all the same difficulties as the real
  12592. // phosphor bloom, which requires static #ifdefs to decide the blur
  12593. // size based on the expected triad size...a dynamic value.
  12594. // 3.) Like the phosphor bloom, we'd have less control over making the blur
  12595. // size correct for an optical blur. That said, we likely overblur (to
  12596. // maintain brightness) more than the eye would do by itself: 20/20
  12597. // human vision distinguishes ~1 arc minute, or 1/60 of a degree. The
  12598. // highest viewing angle recommendation I know of is THX's 40.04 degree
  12599. // recommendation, at which 20/20 vision can distinguish about 2402.4
  12600. // lines. Assuming the "TV lines" definition, that means 1201.2
  12601. // distinct light lines and 1201.2 distinct dark lines can be told
  12602. // apart, i.e. 1201.2 pairs of lines. This would correspond to 1201.2
  12603. // pairs of alternating lit/unlit phosphors, so 2402.4 phosphors total
  12604. // (if they're alternately lit). That's a max of 800.8 triads. Using
  12605. // a more popular 30 degree viewing angle recommendation, 20/20 vision
  12606. // can distinguish 1800 lines, or 600 triads of alternately lit
  12607. // phosphors. In contrast, we currently blur phosphors all the way
  12608. // down to 341.3 triads to ensure full brightness.
  12609. // 4.) Realistically speaking, we're usually just going to use bilinear
  12610. // filtering in this pass anyway, but it only works well to limit
  12611. // bandwidth if it's done at a small constant scale.
  12612. // Get the constants we need to sample:
  12613. // const sampler2D texture = ORIG_LINEARIZED.texture;
  12614. // const float2 tex_uv = tex_uv;
  12615. // const float2 blur_dxdy = blur_dxdy;
  12616. const float2 texture_size_ = ORIG_LINEARIZEDtexture_size;
  12617. // const float2 texture_size_inv = texture_size_inv;
  12618. // const float2 tex_uv_to_pixel_scale = tex_uv_to_pixel_scale;
  12619. float2 tex_uv_r, tex_uv_g, tex_uv_b;
  12620. if(beam_misconvergence)
  12621. {
  12622. const float2 uv_scanline_step = uv_scanline_step;
  12623. const float2 convergence_offsets_r = get_convergence_offsets_r_vector();
  12624. const float2 convergence_offsets_g = get_convergence_offsets_g_vector();
  12625. const float2 convergence_offsets_b = get_convergence_offsets_b_vector();
  12626. tex_uv_r = tex_uv - convergence_offsets_r * uv_scanline_step;
  12627. tex_uv_g = tex_uv - convergence_offsets_g * uv_scanline_step;
  12628. tex_uv_b = tex_uv - convergence_offsets_b * uv_scanline_step;
  12629. }
  12630. // Get the blur sigma:
  12631. const float bloom_approx_sigma = get_bloom_approx_sigma(output_size.x,
  12632. estimated_viewport_size_x);
  12633. // Sample the resized and blurred texture, and apply convergence offsets if
  12634. // necessary. Applying convergence offsets here triples our samples from
  12635. // 16/9/1 to 48/27/3, but that is faster and easier than sampling BLOOM_APPROX and
  12636. // HALATION_BLUR 3 times at full resolution every time they're used.
  12637. float3 color_r, color_g, color_b, color;
  12638. if(bloom_approx_filter > 1.5)
  12639. {
  12640. // Use a 4x4 Gaussian resize. This is slower but technically correct.
  12641. if(beam_misconvergence)
  12642. {
  12643. color_r = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_r,
  12644. blur_dxdy, texture_size_, texture_size_inv,
  12645. tex_uv_to_pixel_scale, bloom_approx_sigma);
  12646. color_g = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_g,
  12647. blur_dxdy, texture_size_, texture_size_inv,
  12648. tex_uv_to_pixel_scale, bloom_approx_sigma);
  12649. color_b = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_b,
  12650. blur_dxdy, texture_size_, texture_size_inv,
  12651. tex_uv_to_pixel_scale, bloom_approx_sigma);
  12652. }
  12653. else
  12654. {
  12655. color = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv,
  12656. blur_dxdy, texture_size_, texture_size_inv,
  12657. tex_uv_to_pixel_scale, bloom_approx_sigma);
  12658. }
  12659. }
  12660. else if(bloom_approx_filter > 0.5)
  12661. {
  12662. // Use a 3x3 resize blur. This is the softest option, because we're
  12663. // blurring already blurry bilinear samples. It doesn't play quite as
  12664. // nicely with convergence offsets, but it has its charms.
  12665. if(beam_misconvergence)
  12666. {
  12667. color_r = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_r,
  12668. blur_dxdy, bloom_approx_sigma);
  12669. color_g = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_g,
  12670. blur_dxdy, bloom_approx_sigma);
  12671. color_b = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_b,
  12672. blur_dxdy, bloom_approx_sigma);
  12673. }
  12674. else
  12675. {
  12676. color = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv, blur_dxdy);
  12677. }
  12678. }
  12679. else
  12680. {
  12681. // Use bilinear sampling. This approximates a 4x4 Gaussian resize MUCH
  12682. // better than tex2Dblur3x3resize for the very small sigmas we're
  12683. // likely to use at small output resolutions. (This estimate becomes
  12684. // too sharp above ~400x300, but the blurs break down above that
  12685. // resolution too, unless min_allowed_viewport_triads is high enough to
  12686. // keep bloom_approx_scale_x/min_allowed_viewport_triads < ~1.1658025.)
  12687. if(beam_misconvergence)
  12688. {
  12689. color_r = tex2D_linearize(ORIG_LINEARIZED, tex_uv_r).rgb;
  12690. color_g = tex2D_linearize(ORIG_LINEARIZED, tex_uv_g).rgb;
  12691. color_b = tex2D_linearize(ORIG_LINEARIZED, tex_uv_b).rgb;
  12692. }
  12693. else
  12694. {
  12695. color = tex2D_linearize(ORIG_LINEARIZED, tex_uv).rgb;
  12696. }
  12697. }
  12698. // Pack the colors from the red/green/blue beams into a single vector:
  12699. if(beam_misconvergence)
  12700. {
  12701. color = float3(color_r.r, color_g.g, color_b.b);
  12702. }
  12703. // Encode and output the blurred image:
  12704. FragColor = encode_output(float4(tex2D_linearize(ORIG_LINEARIZED, tex_uv)));
  12705. }