fpu_sqrt.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412
  1. /* $NetBSD: fpu_sqrt.c,v 1.4 2005/12/11 12:18:42 christos Exp $ */
  2. /*-
  3. * SPDX-License-Identifier: BSD-3-Clause
  4. *
  5. * Copyright (c) 1992, 1993
  6. * The Regents of the University of California. All rights reserved.
  7. *
  8. * This software was developed by the Computer Systems Engineering group
  9. * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
  10. * contributed to Berkeley.
  11. *
  12. * All advertising materials mentioning features or use of this software
  13. * must display the following acknowledgement:
  14. * This product includes software developed by the University of
  15. * California, Lawrence Berkeley Laboratory.
  16. *
  17. * Redistribution and use in source and binary forms, with or without
  18. * modification, are permitted provided that the following conditions
  19. * are met:
  20. * 1. Redistributions of source code must retain the above copyright
  21. * notice, this list of conditions and the following disclaimer.
  22. * 2. Redistributions in binary form must reproduce the above copyright
  23. * notice, this list of conditions and the following disclaimer in the
  24. * documentation and/or other materials provided with the distribution.
  25. * 3. Neither the name of the University nor the names of its contributors
  26. * may be used to endorse or promote products derived from this software
  27. * without specific prior written permission.
  28. *
  29. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  30. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  31. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  32. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  33. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  34. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  35. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  36. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  37. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  38. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  39. * SUCH DAMAGE.
  40. */
  41. /*
  42. * Perform an FPU square root (return sqrt(x)).
  43. */
  44. #include <sys/types.h>
  45. #include <sys/systm.h>
  46. #include <machine/fpu.h>
  47. #include <powerpc/fpu/fpu_arith.h>
  48. #include <powerpc/fpu/fpu_emu.h>
  49. /*
  50. * Our task is to calculate the square root of a floating point number x0.
  51. * This number x normally has the form:
  52. *
  53. * exp
  54. * x = mant * 2 (where 1 <= mant < 2 and exp is an integer)
  55. *
  56. * This can be left as it stands, or the mantissa can be doubled and the
  57. * exponent decremented:
  58. *
  59. * exp-1
  60. * x = (2 * mant) * 2 (where 2 <= 2 * mant < 4)
  61. *
  62. * If the exponent `exp' is even, the square root of the number is best
  63. * handled using the first form, and is by definition equal to:
  64. *
  65. * exp/2
  66. * sqrt(x) = sqrt(mant) * 2
  67. *
  68. * If exp is odd, on the other hand, it is convenient to use the second
  69. * form, giving:
  70. *
  71. * (exp-1)/2
  72. * sqrt(x) = sqrt(2 * mant) * 2
  73. *
  74. * In the first case, we have
  75. *
  76. * 1 <= mant < 2
  77. *
  78. * and therefore
  79. *
  80. * sqrt(1) <= sqrt(mant) < sqrt(2)
  81. *
  82. * while in the second case we have
  83. *
  84. * 2 <= 2*mant < 4
  85. *
  86. * and therefore
  87. *
  88. * sqrt(2) <= sqrt(2*mant) < sqrt(4)
  89. *
  90. * so that in any case, we are sure that
  91. *
  92. * sqrt(1) <= sqrt(n * mant) < sqrt(4), n = 1 or 2
  93. *
  94. * or
  95. *
  96. * 1 <= sqrt(n * mant) < 2, n = 1 or 2.
  97. *
  98. * This root is therefore a properly formed mantissa for a floating
  99. * point number. The exponent of sqrt(x) is either exp/2 or (exp-1)/2
  100. * as above. This leaves us with the problem of finding the square root
  101. * of a fixed-point number in the range [1..4).
  102. *
  103. * Though it may not be instantly obvious, the following square root
  104. * algorithm works for any integer x of an even number of bits, provided
  105. * that no overflows occur:
  106. *
  107. * let q = 0
  108. * for k = NBITS-1 to 0 step -1 do -- for each digit in the answer...
  109. * x *= 2 -- multiply by radix, for next digit
  110. * if x >= 2q + 2^k then -- if adding 2^k does not
  111. * x -= 2q + 2^k -- exceed the correct root,
  112. * q += 2^k -- add 2^k and adjust x
  113. * fi
  114. * done
  115. * sqrt = q / 2^(NBITS/2) -- (and any remainder is in x)
  116. *
  117. * If NBITS is odd (so that k is initially even), we can just add another
  118. * zero bit at the top of x. Doing so means that q is not going to acquire
  119. * a 1 bit in the first trip around the loop (since x0 < 2^NBITS). If the
  120. * final value in x is not needed, or can be off by a factor of 2, this is
  121. * equivalant to moving the `x *= 2' step to the bottom of the loop:
  122. *
  123. * for k = NBITS-1 to 0 step -1 do if ... fi; x *= 2; done
  124. *
  125. * and the result q will then be sqrt(x0) * 2^floor(NBITS / 2).
  126. * (Since the algorithm is destructive on x, we will call x's initial
  127. * value, for which q is some power of two times its square root, x0.)
  128. *
  129. * If we insert a loop invariant y = 2q, we can then rewrite this using
  130. * C notation as:
  131. *
  132. * q = y = 0; x = x0;
  133. * for (k = NBITS; --k >= 0;) {
  134. * #if (NBITS is even)
  135. * x *= 2;
  136. * #endif
  137. * t = y + (1 << k);
  138. * if (x >= t) {
  139. * x -= t;
  140. * q += 1 << k;
  141. * y += 1 << (k + 1);
  142. * }
  143. * #if (NBITS is odd)
  144. * x *= 2;
  145. * #endif
  146. * }
  147. *
  148. * If x0 is fixed point, rather than an integer, we can simply alter the
  149. * scale factor between q and sqrt(x0). As it happens, we can easily arrange
  150. * for the scale factor to be 2**0 or 1, so that sqrt(x0) == q.
  151. *
  152. * In our case, however, x0 (and therefore x, y, q, and t) are multiword
  153. * integers, which adds some complication. But note that q is built one
  154. * bit at a time, from the top down, and is not used itself in the loop
  155. * (we use 2q as held in y instead). This means we can build our answer
  156. * in an integer, one word at a time, which saves a bit of work. Also,
  157. * since 1 << k is always a `new' bit in q, 1 << k and 1 << (k+1) are
  158. * `new' bits in y and we can set them with an `or' operation rather than
  159. * a full-blown multiword add.
  160. *
  161. * We are almost done, except for one snag. We must prove that none of our
  162. * intermediate calculations can overflow. We know that x0 is in [1..4)
  163. * and therefore the square root in q will be in [1..2), but what about x,
  164. * y, and t?
  165. *
  166. * We know that y = 2q at the beginning of each loop. (The relation only
  167. * fails temporarily while y and q are being updated.) Since q < 2, y < 4.
  168. * The sum in t can, in our case, be as much as y+(1<<1) = y+2 < 6, and.
  169. * Furthermore, we can prove with a bit of work that x never exceeds y by
  170. * more than 2, so that even after doubling, 0 <= x < 8. (This is left as
  171. * an exercise to the reader, mostly because I have become tired of working
  172. * on this comment.)
  173. *
  174. * If our floating point mantissas (which are of the form 1.frac) occupy
  175. * B+1 bits, our largest intermediary needs at most B+3 bits, or two extra.
  176. * In fact, we want even one more bit (for a carry, to avoid compares), or
  177. * three extra. There is a comment in fpu_emu.h reminding maintainers of
  178. * this, so we have some justification in assuming it.
  179. */
  180. struct fpn *
  181. fpu_sqrt(struct fpemu *fe)
  182. {
  183. struct fpn *x = &fe->fe_f1;
  184. u_int bit, q, tt;
  185. u_int x0, x1, x2, x3;
  186. u_int y0, y1, y2, y3;
  187. u_int d0, d1, d2, d3;
  188. int e;
  189. FPU_DECL_CARRY;
  190. /*
  191. * Take care of special cases first. In order:
  192. *
  193. * sqrt(NaN) = NaN
  194. * sqrt(+0) = +0
  195. * sqrt(-0) = -0
  196. * sqrt(x < 0) = NaN (including sqrt(-Inf))
  197. * sqrt(+Inf) = +Inf
  198. *
  199. * Then all that remains are numbers with mantissas in [1..2).
  200. */
  201. DPRINTF(FPE_REG, ("fpu_sqer:\n"));
  202. DUMPFPN(FPE_REG, x);
  203. DPRINTF(FPE_REG, ("=>\n"));
  204. if (ISNAN(x)) {
  205. fe->fe_cx |= FPSCR_VXSNAN;
  206. DUMPFPN(FPE_REG, x);
  207. return (x);
  208. }
  209. if (ISZERO(x)) {
  210. fe->fe_cx |= FPSCR_ZX;
  211. x->fp_class = FPC_INF;
  212. DUMPFPN(FPE_REG, x);
  213. return (x);
  214. }
  215. if (x->fp_sign) {
  216. fe->fe_cx |= FPSCR_VXSQRT;
  217. return (fpu_newnan(fe));
  218. }
  219. if (ISINF(x)) {
  220. DUMPFPN(FPE_REG, x);
  221. return (x);
  222. }
  223. /*
  224. * Calculate result exponent. As noted above, this may involve
  225. * doubling the mantissa. We will also need to double x each
  226. * time around the loop, so we define a macro for this here, and
  227. * we break out the multiword mantissa.
  228. */
  229. #ifdef FPU_SHL1_BY_ADD
  230. #define DOUBLE_X { \
  231. FPU_ADDS(x3, x3, x3); FPU_ADDCS(x2, x2, x2); \
  232. FPU_ADDCS(x1, x1, x1); FPU_ADDC(x0, x0, x0); \
  233. }
  234. #else
  235. #define DOUBLE_X { \
  236. x0 = (x0 << 1) | (x1 >> 31); x1 = (x1 << 1) | (x2 >> 31); \
  237. x2 = (x2 << 1) | (x3 >> 31); x3 <<= 1; \
  238. }
  239. #endif
  240. #if (FP_NMANT & 1) != 0
  241. # define ODD_DOUBLE DOUBLE_X
  242. # define EVEN_DOUBLE /* nothing */
  243. #else
  244. # define ODD_DOUBLE /* nothing */
  245. # define EVEN_DOUBLE DOUBLE_X
  246. #endif
  247. x0 = x->fp_mant[0];
  248. x1 = x->fp_mant[1];
  249. x2 = x->fp_mant[2];
  250. x3 = x->fp_mant[3];
  251. e = x->fp_exp;
  252. if (e & 1) /* exponent is odd; use sqrt(2mant) */
  253. DOUBLE_X;
  254. /* THE FOLLOWING ASSUMES THAT RIGHT SHIFT DOES SIGN EXTENSION */
  255. x->fp_exp = e >> 1; /* calculates (e&1 ? (e-1)/2 : e/2 */
  256. /*
  257. * Now calculate the mantissa root. Since x is now in [1..4),
  258. * we know that the first trip around the loop will definitely
  259. * set the top bit in q, so we can do that manually and start
  260. * the loop at the next bit down instead. We must be sure to
  261. * double x correctly while doing the `known q=1.0'.
  262. *
  263. * We do this one mantissa-word at a time, as noted above, to
  264. * save work. To avoid `(1U << 31) << 1', we also do the top bit
  265. * outside of each per-word loop.
  266. *
  267. * The calculation `t = y + bit' breaks down into `t0 = y0, ...,
  268. * t3 = y3, t? |= bit' for the appropriate word. Since the bit
  269. * is always a `new' one, this means that three of the `t?'s are
  270. * just the corresponding `y?'; we use `#define's here for this.
  271. * The variable `tt' holds the actual `t?' variable.
  272. */
  273. /* calculate q0 */
  274. #define t0 tt
  275. bit = FP_1;
  276. EVEN_DOUBLE;
  277. /* if (x >= (t0 = y0 | bit)) { */ /* always true */
  278. q = bit;
  279. x0 -= bit;
  280. y0 = bit << 1;
  281. /* } */
  282. ODD_DOUBLE;
  283. while ((bit >>= 1) != 0) { /* for remaining bits in q0 */
  284. EVEN_DOUBLE;
  285. t0 = y0 | bit; /* t = y + bit */
  286. if (x0 >= t0) { /* if x >= t then */
  287. x0 -= t0; /* x -= t */
  288. q |= bit; /* q += bit */
  289. y0 |= bit << 1; /* y += bit << 1 */
  290. }
  291. ODD_DOUBLE;
  292. }
  293. x->fp_mant[0] = q;
  294. #undef t0
  295. /* calculate q1. note (y0&1)==0. */
  296. #define t0 y0
  297. #define t1 tt
  298. q = 0;
  299. y1 = 0;
  300. bit = 1 << 31;
  301. EVEN_DOUBLE;
  302. t1 = bit;
  303. FPU_SUBS(d1, x1, t1);
  304. FPU_SUBC(d0, x0, t0); /* d = x - t */
  305. if ((int)d0 >= 0) { /* if d >= 0 (i.e., x >= t) then */
  306. x0 = d0, x1 = d1; /* x -= t */
  307. q = bit; /* q += bit */
  308. y0 |= 1; /* y += bit << 1 */
  309. }
  310. ODD_DOUBLE;
  311. while ((bit >>= 1) != 0) { /* for remaining bits in q1 */
  312. EVEN_DOUBLE; /* as before */
  313. t1 = y1 | bit;
  314. FPU_SUBS(d1, x1, t1);
  315. FPU_SUBC(d0, x0, t0);
  316. if ((int)d0 >= 0) {
  317. x0 = d0, x1 = d1;
  318. q |= bit;
  319. y1 |= bit << 1;
  320. }
  321. ODD_DOUBLE;
  322. }
  323. x->fp_mant[1] = q;
  324. #undef t1
  325. /* calculate q2. note (y1&1)==0; y0 (aka t0) is fixed. */
  326. #define t1 y1
  327. #define t2 tt
  328. q = 0;
  329. y2 = 0;
  330. bit = 1 << 31;
  331. EVEN_DOUBLE;
  332. t2 = bit;
  333. FPU_SUBS(d2, x2, t2);
  334. FPU_SUBCS(d1, x1, t1);
  335. FPU_SUBC(d0, x0, t0);
  336. if ((int)d0 >= 0) {
  337. x0 = d0, x1 = d1, x2 = d2;
  338. q = bit;
  339. y1 |= 1; /* now t1, y1 are set in concrete */
  340. }
  341. ODD_DOUBLE;
  342. while ((bit >>= 1) != 0) {
  343. EVEN_DOUBLE;
  344. t2 = y2 | bit;
  345. FPU_SUBS(d2, x2, t2);
  346. FPU_SUBCS(d1, x1, t1);
  347. FPU_SUBC(d0, x0, t0);
  348. if ((int)d0 >= 0) {
  349. x0 = d0, x1 = d1, x2 = d2;
  350. q |= bit;
  351. y2 |= bit << 1;
  352. }
  353. ODD_DOUBLE;
  354. }
  355. x->fp_mant[2] = q;
  356. #undef t2
  357. /* calculate q3. y0, t0, y1, t1 all fixed; y2, t2, almost done. */
  358. #define t2 y2
  359. #define t3 tt
  360. q = 0;
  361. y3 = 0;
  362. bit = 1 << 31;
  363. EVEN_DOUBLE;
  364. t3 = bit;
  365. FPU_SUBS(d3, x3, t3);
  366. FPU_SUBCS(d2, x2, t2);
  367. FPU_SUBCS(d1, x1, t1);
  368. FPU_SUBC(d0, x0, t0);
  369. if ((int)d0 >= 0) {
  370. x0 = d0, x1 = d1, x2 = d2; x3 = d3;
  371. q = bit;
  372. y2 |= 1;
  373. }
  374. ODD_DOUBLE;
  375. while ((bit >>= 1) != 0) {
  376. EVEN_DOUBLE;
  377. t3 = y3 | bit;
  378. FPU_SUBS(d3, x3, t3);
  379. FPU_SUBCS(d2, x2, t2);
  380. FPU_SUBCS(d1, x1, t1);
  381. FPU_SUBC(d0, x0, t0);
  382. if ((int)d0 >= 0) {
  383. x0 = d0, x1 = d1, x2 = d2; x3 = d3;
  384. q |= bit;
  385. y3 |= bit << 1;
  386. }
  387. ODD_DOUBLE;
  388. }
  389. x->fp_mant[3] = q;
  390. /*
  391. * The result, which includes guard and round bits, is exact iff
  392. * x is now zero; any nonzero bits in x represent sticky bits.
  393. */
  394. x->fp_sticky = x0 | x1 | x2 | x3;
  395. DUMPFPN(FPE_REG, x);
  396. return (x);
  397. }