util_math_fast.h 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645
  1. /*
  2. * Adapted from OpenImageIO library with this license:
  3. *
  4. * Copyright 2008-2014 Larry Gritz and the other authors and contributors.
  5. * All Rights Reserved.
  6. * Redistribution and use in source and binary forms, with or without
  7. * modification, are permitted provided that the following conditions are
  8. * met:
  9. * * Redistributions of source code must retain the above copyright
  10. * notice, this list of conditions and the following disclaimer.
  11. * * Redistributions in binary form must reproduce the above copyright
  12. * notice, this list of conditions and the following disclaimer in the
  13. * documentation and/or other materials provided with the distribution.
  14. * * Neither the name of the software's owners nor the names of its
  15. * contributors may be used to endorse or promote products derived from
  16. * this software without specific prior written permission.
  17. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20. * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21. * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28. *
  29. * (This is the Modified BSD License)
  30. *
  31. * A few bits here are based upon code from NVIDIA that was also released
  32. * under the same modified BSD license, and marked as:
  33. * Copyright 2004 NVIDIA Corporation. All Rights Reserved.
  34. *
  35. * Some parts of this file were first open-sourced in Open Shading Language,
  36. * then later moved here. The original copyright notice was:
  37. * Copyright (c) 2009-2014 Sony Pictures Imageworks Inc., et al.
  38. *
  39. * Many of the math functions were copied from or inspired by other
  40. * public domain sources or open source packages with compatible licenses.
  41. * The individual functions give references were applicable.
  42. */
  43. #ifndef __UTIL_FAST_MATH__
  44. #define __UTIL_FAST_MATH__
  45. CCL_NAMESPACE_BEGIN
  46. ccl_device_inline float madd(const float a, const float b, const float c)
  47. {
  48. /* NOTE: In the future we may want to explicitly ask for a fused
  49. * multiply-add in a specialized version for float.
  50. *
  51. * NOTE: GCC/ICC will turn this (for float) into a FMA unless
  52. * explicitly asked not to, clang seems to leave the code alone.
  53. */
  54. return a * b + c;
  55. }
  56. ccl_device_inline float4 madd4(const float4 a, const float4 b, const float4 c)
  57. {
  58. return a * b + c;
  59. }
  60. /*
  61. * FAST & APPROXIMATE MATH
  62. *
  63. * The functions named "fast_*" provide a set of replacements to libm that
  64. * are much faster at the expense of some accuracy and robust handling of
  65. * extreme values. One design goal for these approximation was to avoid
  66. * branches as much as possible and operate on single precision values only
  67. * so that SIMD versions should be straightforward ports We also try to
  68. * implement "safe" semantics (ie: clamp to valid range where possible)
  69. * natively since wrapping these inline calls in another layer would be
  70. * wasteful.
  71. *
  72. * Some functions are fast_safe_*, which is both a faster approximation as
  73. * well as clamped input domain to ensure no NaN, Inf, or divide by zero.
  74. */
  75. /* Round to nearest integer, returning as an int. */
  76. ccl_device_inline int fast_rint(float x)
  77. {
  78. /* used by sin/cos/tan range reduction. */
  79. #ifdef __KERNEL_SSE4__
  80. /* Single roundps instruction on SSE4.1+ (for gcc/clang at least). */
  81. return float_to_int(rintf(x));
  82. #else
  83. /* emulate rounding by adding/substracting 0.5. */
  84. return float_to_int(x + copysignf(0.5f, x));
  85. #endif
  86. }
  87. ccl_device float fast_sinf(float x)
  88. {
  89. /* Very accurate argument reduction from SLEEF,
  90. * starts failing around x=262000
  91. *
  92. * Results on: [-2pi,2pi].
  93. *
  94. * Examined 2173837240 values of sin: 0.00662760244 avg ulp diff, 2 max ulp,
  95. * 1.19209e-07 max error
  96. */
  97. int q = fast_rint(x * M_1_PI_F);
  98. float qf = q;
  99. x = madd(qf, -0.78515625f * 4, x);
  100. x = madd(qf, -0.00024187564849853515625f * 4, x);
  101. x = madd(qf, -3.7747668102383613586e-08f * 4, x);
  102. x = madd(qf, -1.2816720341285448015e-12f * 4, x);
  103. x = M_PI_2_F - (M_PI_2_F - x); /* Crush denormals */
  104. float s = x * x;
  105. if ((q & 1) != 0)
  106. x = -x;
  107. /* This polynomial approximation has very low error on [-pi/2,+pi/2]
  108. * 1.19209e-07 max error in total over [-2pi,+2pi]. */
  109. float u = 2.6083159809786593541503e-06f;
  110. u = madd(u, s, -0.0001981069071916863322258f);
  111. u = madd(u, s, +0.00833307858556509017944336f);
  112. u = madd(u, s, -0.166666597127914428710938f);
  113. u = madd(s, u * x, x);
  114. /* For large x, the argument reduction can fail and the polynomial can be
  115. * evaluated with arguments outside the valid internal. Just clamp the bad
  116. * values away (setting to 0.0f means no branches need to be generated). */
  117. if (fabsf(u) > 1.0f) {
  118. u = 0.0f;
  119. }
  120. return u;
  121. }
  122. ccl_device float fast_cosf(float x)
  123. {
  124. /* Same argument reduction as fast_sinf(). */
  125. int q = fast_rint(x * M_1_PI_F);
  126. float qf = q;
  127. x = madd(qf, -0.78515625f * 4, x);
  128. x = madd(qf, -0.00024187564849853515625f * 4, x);
  129. x = madd(qf, -3.7747668102383613586e-08f * 4, x);
  130. x = madd(qf, -1.2816720341285448015e-12f * 4, x);
  131. x = M_PI_2_F - (M_PI_2_F - x); /* Crush denormals. */
  132. float s = x * x;
  133. /* Polynomial from SLEEF's sincosf, max error is
  134. * 4.33127e-07 over [-2pi,2pi] (98% of values are "exact"). */
  135. float u = -2.71811842367242206819355e-07f;
  136. u = madd(u, s, +2.47990446951007470488548e-05f);
  137. u = madd(u, s, -0.00138888787478208541870117f);
  138. u = madd(u, s, +0.0416666641831398010253906f);
  139. u = madd(u, s, -0.5f);
  140. u = madd(u, s, +1.0f);
  141. if ((q & 1) != 0) {
  142. u = -u;
  143. }
  144. if (fabsf(u) > 1.0f) {
  145. u = 0.0f;
  146. }
  147. return u;
  148. }
  149. ccl_device void fast_sincosf(float x, float *sine, float *cosine)
  150. {
  151. /* Same argument reduction as fast_sin. */
  152. int q = fast_rint(x * M_1_PI_F);
  153. float qf = q;
  154. x = madd(qf, -0.78515625f * 4, x);
  155. x = madd(qf, -0.00024187564849853515625f * 4, x);
  156. x = madd(qf, -3.7747668102383613586e-08f * 4, x);
  157. x = madd(qf, -1.2816720341285448015e-12f * 4, x);
  158. x = M_PI_2_F - (M_PI_2_F - x); // crush denormals
  159. float s = x * x;
  160. /* NOTE: same exact polynomials as fast_sinf() and fast_cosf() above. */
  161. if ((q & 1) != 0) {
  162. x = -x;
  163. }
  164. float su = 2.6083159809786593541503e-06f;
  165. su = madd(su, s, -0.0001981069071916863322258f);
  166. su = madd(su, s, +0.00833307858556509017944336f);
  167. su = madd(su, s, -0.166666597127914428710938f);
  168. su = madd(s, su * x, x);
  169. float cu = -2.71811842367242206819355e-07f;
  170. cu = madd(cu, s, +2.47990446951007470488548e-05f);
  171. cu = madd(cu, s, -0.00138888787478208541870117f);
  172. cu = madd(cu, s, +0.0416666641831398010253906f);
  173. cu = madd(cu, s, -0.5f);
  174. cu = madd(cu, s, +1.0f);
  175. if ((q & 1) != 0) {
  176. cu = -cu;
  177. }
  178. if (fabsf(su) > 1.0f) {
  179. su = 0.0f;
  180. }
  181. if (fabsf(cu) > 1.0f) {
  182. cu = 0.0f;
  183. }
  184. *sine = su;
  185. *cosine = cu;
  186. }
  187. /* NOTE: this approximation is only valid on [-8192.0,+8192.0], it starts
  188. * becoming really poor outside of this range because the reciprocal amplifies
  189. * errors.
  190. */
  191. ccl_device float fast_tanf(float x)
  192. {
  193. /* Derived from SLEEF implementation.
  194. *
  195. * Note that we cannot apply the "denormal crush" trick everywhere because
  196. * we sometimes need to take the reciprocal of the polynomial
  197. */
  198. int q = fast_rint(x * 2.0f * M_1_PI_F);
  199. float qf = q;
  200. x = madd(qf, -0.78515625f * 2, x);
  201. x = madd(qf, -0.00024187564849853515625f * 2, x);
  202. x = madd(qf, -3.7747668102383613586e-08f * 2, x);
  203. x = madd(qf, -1.2816720341285448015e-12f * 2, x);
  204. if ((q & 1) == 0) {
  205. /* Crush denormals (only if we aren't inverting the result later). */
  206. x = M_PI_4_F - (M_PI_4_F - x);
  207. }
  208. float s = x * x;
  209. float u = 0.00927245803177356719970703f;
  210. u = madd(u, s, 0.00331984995864331722259521f);
  211. u = madd(u, s, 0.0242998078465461730957031f);
  212. u = madd(u, s, 0.0534495301544666290283203f);
  213. u = madd(u, s, 0.133383005857467651367188f);
  214. u = madd(u, s, 0.333331853151321411132812f);
  215. u = madd(s, u * x, x);
  216. if ((q & 1) != 0) {
  217. u = -1.0f / u;
  218. }
  219. return u;
  220. }
  221. /* Fast, approximate sin(x*M_PI) with maximum absolute error of 0.000918954611.
  222. *
  223. * Adapted from http://devmaster.net/posts/9648/fast-and-accurate-sine-cosine#comment-76773
  224. */
  225. ccl_device float fast_sinpif(float x)
  226. {
  227. /* Fast trick to strip the integral part off, so our domain is [-1, 1]. */
  228. const float z = x - ((x + 25165824.0f) - 25165824.0f);
  229. const float y = z - z * fabsf(z);
  230. const float Q = 3.10396624f;
  231. const float P = 3.584135056f; /* P = 16-4*Q */
  232. return y * (Q + P * fabsf(y));
  233. /* The original article used used inferior constants for Q and P and
  234. * so had max error 1.091e-3.
  235. *
  236. * The optimal value for Q was determined by exhaustive search, minimizing
  237. * the absolute numerical error relative to float(std::sin(double(phi*M_PI)))
  238. * over the interval [0,2] (which is where most of the invocations happen).
  239. *
  240. * The basic idea of this approximation starts with the coarse approximation:
  241. * sin(pi*x) ~= f(x) = 4 * (x - x * abs(x))
  242. *
  243. * This approximation always _over_ estimates the target. On the other hand,
  244. * the curve:
  245. * sin(pi*x) ~= f(x) * abs(f(x)) / 4
  246. *
  247. * always lies _under_ the target. Thus we can simply numerically search for
  248. * the optimal constant to LERP these curves into a more precise
  249. * approximation.
  250. *
  251. * After folding the constants together and simplifying the resulting math,
  252. * we end up with the compact implementation above.
  253. *
  254. * NOTE: this function actually computes sin(x * pi) which avoids one or two
  255. * mults in many cases and guarantees exact values at integer periods.
  256. */
  257. }
  258. /* Fast approximate cos(x*M_PI) with ~0.1% absolute error. */
  259. ccl_device_inline float fast_cospif(float x)
  260. {
  261. return fast_sinpif(x + 0.5f);
  262. }
  263. ccl_device float fast_acosf(float x)
  264. {
  265. const float f = fabsf(x);
  266. /* clamp and crush denormals. */
  267. const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
  268. /* Based on http://www.pouet.net/topic.php?which=9132&page=2
  269. * 85% accurate (ulp 0)
  270. * Examined 2130706434 values of acos:
  271. * 15.2000597 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // without "denormal crush"
  272. * Examined 2130706434 values of acos:
  273. * 15.2007108 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // with "denormal crush"
  274. */
  275. const float a = sqrtf(1.0f - m) *
  276. (1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
  277. return x < 0 ? M_PI_F - a : a;
  278. }
  279. ccl_device float fast_asinf(float x)
  280. {
  281. /* Based on acosf approximation above.
  282. * Max error is 4.51133e-05 (ulps are higher because we are consistently off
  283. * by a little amount).
  284. */
  285. const float f = fabsf(x);
  286. /* Clamp and crush denormals. */
  287. const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
  288. const float a = M_PI_2_F -
  289. sqrtf(1.0f - m) * (1.5707963267f +
  290. m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
  291. return copysignf(a, x);
  292. }
  293. ccl_device float fast_atanf(float x)
  294. {
  295. const float a = fabsf(x);
  296. const float k = a > 1.0f ? 1 / a : a;
  297. const float s = 1.0f - (1.0f - k); /* Crush denormals. */
  298. const float t = s * s;
  299. /* http://mathforum.org/library/drmath/view/62672.html
  300. * Examined 4278190080 values of atan:
  301. * 2.36864877 avg ulp diff, 302 max ulp, 6.55651e-06 max error // (with denormals)
  302. * Examined 4278190080 values of atan:
  303. * 171160502 avg ulp diff, 855638016 max ulp, 6.55651e-06 max error // (crush denormals)
  304. */
  305. float r = s * madd(0.43157974f, t, 1.0f) / madd(madd(0.05831938f, t, 0.76443945f), t, 1.0f);
  306. if (a > 1.0f) {
  307. r = M_PI_2_F - r;
  308. }
  309. return copysignf(r, x);
  310. }
  311. ccl_device float fast_atan2f(float y, float x)
  312. {
  313. /* Based on atan approximation above.
  314. *
  315. * The special cases around 0 and infinity were tested explicitly.
  316. *
  317. * The only case not handled correctly is x=NaN,y=0 which returns 0 instead
  318. * of nan.
  319. */
  320. const float a = fabsf(x);
  321. const float b = fabsf(y);
  322. const float k = (b == 0) ? 0.0f : ((a == b) ? 1.0f : (b > a ? a / b : b / a));
  323. const float s = 1.0f - (1.0f - k); /* Crush denormals */
  324. const float t = s * s;
  325. float r = s * madd(0.43157974f, t, 1.0f) / madd(madd(0.05831938f, t, 0.76443945f), t, 1.0f);
  326. if (b > a) {
  327. /* Account for arg reduction. */
  328. r = M_PI_2_F - r;
  329. }
  330. /* Test sign bit of x. */
  331. if (__float_as_uint(x) & 0x80000000u) {
  332. r = M_PI_F - r;
  333. }
  334. return copysignf(r, y);
  335. }
  336. /* Based on:
  337. *
  338. * https://github.com/LiraNuna/glsl-sse2/blob/master/source/vec4.h
  339. */
  340. ccl_device float fast_log2f(float x)
  341. {
  342. /* NOTE: clamp to avoid special cases and make result "safe" from large
  343. * negative values/nans. */
  344. x = clamp(x, FLT_MIN, FLT_MAX);
  345. unsigned bits = __float_as_uint(x);
  346. int exponent = (int)(bits >> 23) - 127;
  347. float f = __uint_as_float((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
  348. /* Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]:
  349. * 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error.
  350. * ulp histogram:
  351. * 0 = 97.46%
  352. * 1 = 2.29%
  353. * 2 = 0.11%
  354. */
  355. float f2 = f * f;
  356. float f4 = f2 * f2;
  357. float hi = madd(f, -0.00931049621349f, 0.05206469089414f);
  358. float lo = madd(f, 0.47868480909345f, -0.72116591947498f);
  359. hi = madd(f, hi, -0.13753123777116f);
  360. hi = madd(f, hi, 0.24187369696082f);
  361. hi = madd(f, hi, -0.34730547155299f);
  362. lo = madd(f, lo, 1.442689881667200f);
  363. return ((f4 * hi) + (f * lo)) + exponent;
  364. }
  365. ccl_device_inline float fast_logf(float x)
  366. {
  367. /* Examined 2130706432 values of logf on [1.17549435e-38,3.40282347e+38]:
  368. * 0.313865375 avg ulp diff, 5148137 max ulp, 7.62939e-06 max error.
  369. */
  370. return fast_log2f(x) * M_LN2_F;
  371. }
  372. ccl_device_inline float fast_log10(float x)
  373. {
  374. /* Examined 2130706432 values of log10f on [1.17549435e-38,3.40282347e+38]:
  375. * 0.631237033 avg ulp diff, 4471615 max ulp, 3.8147e-06 max error.
  376. */
  377. return fast_log2f(x) * M_LN2_F / M_LN10_F;
  378. }
  379. ccl_device float fast_logb(float x)
  380. {
  381. /* Don't bother with denormals. */
  382. x = fabsf(x);
  383. x = clamp(x, FLT_MIN, FLT_MAX);
  384. unsigned bits = __float_as_uint(x);
  385. return (int)(bits >> 23) - 127;
  386. }
  387. ccl_device float fast_exp2f(float x)
  388. {
  389. /* Clamp to safe range for final addition. */
  390. x = clamp(x, -126.0f, 126.0f);
  391. /* Range reduction. */
  392. int m = (int)x;
  393. x -= m;
  394. x = 1.0f - (1.0f - x); /* Crush denormals (does not affect max ulps!). */
  395. /* 5th degree polynomial generated with sollya
  396. * Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff,
  397. * 232 max ulp.
  398. *
  399. * ulp histogram:
  400. * 0 = 87.81%
  401. * 1 = 4.18%
  402. */
  403. float r = 1.33336498402e-3f;
  404. r = madd(x, r, 9.810352697968e-3f);
  405. r = madd(x, r, 5.551834031939e-2f);
  406. r = madd(x, r, 0.2401793301105f);
  407. r = madd(x, r, 0.693144857883f);
  408. r = madd(x, r, 1.0f);
  409. /* Multiply by 2 ^ m by adding in the exponent. */
  410. /* NOTE: left-shift of negative number is undefined behavior. */
  411. return __uint_as_float(__float_as_uint(r) + ((unsigned)m << 23));
  412. }
  413. ccl_device_inline float fast_expf(float x)
  414. {
  415. /* Examined 2237485550 values of exp on [-87.3300018,87.3300018]:
  416. * 2.6666452 avg ulp diff, 230 max ulp.
  417. */
  418. return fast_exp2f(x / M_LN2_F);
  419. }
  420. #ifndef __KERNEL_GPU__
  421. ccl_device float4 fast_exp2f4(float4 x)
  422. {
  423. const float4 one = make_float4(1.0f);
  424. const float4 limit = make_float4(126.0f);
  425. x = clamp(x, -limit, limit);
  426. int4 m = make_int4(x);
  427. x = one - (one - (x - make_float4(m)));
  428. float4 r = make_float4(1.33336498402e-3f);
  429. r = madd4(x, r, make_float4(9.810352697968e-3f));
  430. r = madd4(x, r, make_float4(5.551834031939e-2f));
  431. r = madd4(x, r, make_float4(0.2401793301105f));
  432. r = madd4(x, r, make_float4(0.693144857883f));
  433. r = madd4(x, r, make_float4(1.0f));
  434. return __int4_as_float4(__float4_as_int4(r) + (m << 23));
  435. }
  436. ccl_device_inline float4 fast_expf4(float4 x)
  437. {
  438. return fast_exp2f4(x / M_LN2_F);
  439. }
  440. #endif
  441. ccl_device_inline float fast_exp10(float x)
  442. {
  443. /* Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]:
  444. * 2.71732409 avg ulp diff, 232 max ulp.
  445. */
  446. return fast_exp2f(x * M_LN10_F / M_LN2_F);
  447. }
  448. ccl_device_inline float fast_expm1f(float x)
  449. {
  450. if (fabsf(x) < 1e-5f) {
  451. x = 1.0f - (1.0f - x); /* Crush denormals. */
  452. return madd(0.5f, x * x, x);
  453. }
  454. else {
  455. return fast_expf(x) - 1.0f;
  456. }
  457. }
  458. ccl_device float fast_sinhf(float x)
  459. {
  460. float a = fabsf(x);
  461. if (a > 1.0f) {
  462. /* Examined 53389559 values of sinh on [1,87.3300018]:
  463. * 33.6886442 avg ulp diff, 178 max ulp. */
  464. float e = fast_expf(a);
  465. return copysignf(0.5f * e - 0.5f / e, x);
  466. }
  467. else {
  468. a = 1.0f - (1.0f - a); /* Crush denorms. */
  469. float a2 = a * a;
  470. /* Degree 7 polynomial generated with sollya. */
  471. /* Examined 2130706434 values of sinh on [-1,1]: 1.19209e-07 max error. */
  472. float r = 2.03945513931e-4f;
  473. r = madd(r, a2, 8.32990277558e-3f);
  474. r = madd(r, a2, 0.1666673421859f);
  475. r = madd(r * a, a2, a);
  476. return copysignf(r, x);
  477. }
  478. }
  479. ccl_device_inline float fast_coshf(float x)
  480. {
  481. /* Examined 2237485550 values of cosh on [-87.3300018,87.3300018]:
  482. * 1.78256726 avg ulp diff, 178 max ulp.
  483. */
  484. float e = fast_expf(fabsf(x));
  485. return 0.5f * e + 0.5f / e;
  486. }
  487. ccl_device_inline float fast_tanhf(float x)
  488. {
  489. /* Examined 4278190080 values of tanh on [-3.40282347e+38,3.40282347e+38]:
  490. * 3.12924e-06 max error.
  491. */
  492. /* NOTE: ulp error is high because of sub-optimal handling around the origin. */
  493. float e = fast_expf(2.0f * fabsf(x));
  494. return copysignf(1.0f - 2.0f / (1.0f + e), x);
  495. }
  496. ccl_device float fast_safe_powf(float x, float y)
  497. {
  498. if (y == 0)
  499. return 1.0f; /* x^1=1 */
  500. if (x == 0)
  501. return 0.0f; /* 0^y=0 */
  502. float sign = 1.0f;
  503. if (x < 0.0f) {
  504. /* if x is negative, only deal with integer powers
  505. * powf returns NaN for non-integers, we will return 0 instead.
  506. */
  507. int ybits = __float_as_int(y) & 0x7fffffff;
  508. if (ybits >= 0x4b800000) {
  509. // always even int, keep positive
  510. }
  511. else if (ybits >= 0x3f800000) {
  512. /* Bigger than 1, check. */
  513. int k = (ybits >> 23) - 127; /* Get exponent. */
  514. int j = ybits >> (23 - k); /* Shift out possible fractional bits. */
  515. if ((j << (23 - k)) == ybits) { /* rebuild number and check for a match. */
  516. /* +1 for even, -1 for odd. */
  517. sign = __int_as_float(0x3f800000 | (j << 31));
  518. }
  519. else {
  520. /* Not an integer. */
  521. return 0.0f;
  522. }
  523. }
  524. else {
  525. /* Not an integer. */
  526. return 0.0f;
  527. }
  528. }
  529. return sign * fast_exp2f(y * fast_log2f(fabsf(x)));
  530. }
  531. /* TODO(sergey): Check speed with our erf functions implementation from
  532. * bsdf_microfacet.h.
  533. */
  534. ccl_device_inline float fast_erff(float x)
  535. {
  536. /* Examined 1082130433 values of erff on [0,4]: 1.93715e-06 max error. */
  537. /* Abramowitz and Stegun, 7.1.28. */
  538. const float a1 = 0.0705230784f;
  539. const float a2 = 0.0422820123f;
  540. const float a3 = 0.0092705272f;
  541. const float a4 = 0.0001520143f;
  542. const float a5 = 0.0002765672f;
  543. const float a6 = 0.0000430638f;
  544. const float a = fabsf(x);
  545. if (a >= 12.3f) {
  546. return copysignf(1.0f, x);
  547. }
  548. const float b = 1.0f - (1.0f - a); /* Crush denormals. */
  549. const float r = madd(
  550. madd(madd(madd(madd(madd(a6, b, a5), b, a4), b, a3), b, a2), b, a1), b, 1.0f);
  551. const float s = r * r; /* ^2 */
  552. const float t = s * s; /* ^4 */
  553. const float u = t * t; /* ^8 */
  554. const float v = u * u; /* ^16 */
  555. return copysignf(1.0f - 1.0f / v, x);
  556. }
  557. ccl_device_inline float fast_erfcf(float x)
  558. {
  559. /* Examined 2164260866 values of erfcf on [-4,4]: 1.90735e-06 max error.
  560. *
  561. * ulp histogram:
  562. *
  563. * 0 = 80.30%
  564. */
  565. return 1.0f - fast_erff(x);
  566. }
  567. ccl_device_inline float fast_ierff(float x)
  568. {
  569. /* From: Approximating the erfinv function by Mike Giles. */
  570. /* To avoid trouble at the limit, clamp input to 1-eps. */
  571. float a = fabsf(x);
  572. if (a > 0.99999994f) {
  573. a = 0.99999994f;
  574. }
  575. float w = -fast_logf((1.0f - a) * (1.0f + a)), p;
  576. if (w < 5.0f) {
  577. w = w - 2.5f;
  578. p = 2.81022636e-08f;
  579. p = madd(p, w, 3.43273939e-07f);
  580. p = madd(p, w, -3.5233877e-06f);
  581. p = madd(p, w, -4.39150654e-06f);
  582. p = madd(p, w, 0.00021858087f);
  583. p = madd(p, w, -0.00125372503f);
  584. p = madd(p, w, -0.00417768164f);
  585. p = madd(p, w, 0.246640727f);
  586. p = madd(p, w, 1.50140941f);
  587. }
  588. else {
  589. w = sqrtf(w) - 3.0f;
  590. p = -0.000200214257f;
  591. p = madd(p, w, 0.000100950558f);
  592. p = madd(p, w, 0.00134934322f);
  593. p = madd(p, w, -0.00367342844f);
  594. p = madd(p, w, 0.00573950773f);
  595. p = madd(p, w, -0.0076224613f);
  596. p = madd(p, w, 0.00943887047f);
  597. p = madd(p, w, 1.00167406f);
  598. p = madd(p, w, 2.83297682f);
  599. }
  600. return p * x;
  601. }
  602. CCL_NAMESPACE_END
  603. #endif /* __UTIL_FAST_MATH__ */