mathops.c 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297
  1. #include "mathops.h"
  2. #include <limits.h>
  3. /*The fastest fallback strategy for platforms with fast multiplication appears
  4. to be based on de Bruijn sequences~\cite{LP98}.
  5. Tests confirmed this to be true even on an ARM11, where it is actually faster
  6. than using the native clz instruction.
  7. Define OC_ILOG_NODEBRUIJN to use a simpler fallback on platforms where
  8. multiplication or table lookups are too expensive.
  9. @UNPUBLISHED{LP98,
  10. author="Charles E. Leiserson and Harald Prokop",
  11. title="Using de {Bruijn} Sequences to Index a 1 in a Computer Word",
  12. month=Jun,
  13. year=1998,
  14. note="\url{http://supertech.csail.mit.edu/papers/debruijn.pdf}"
  15. }*/
  16. #if !defined(OC_ILOG_NODEBRUIJN)&& \
  17. !defined(OC_CLZ32)||!defined(OC_CLZ64)&&LONG_MAX<9223372036854775807LL
  18. static const unsigned char OC_DEBRUIJN_IDX32[32]={
  19. 0, 1,28, 2,29,14,24, 3,30,22,20,15,25,17, 4, 8,
  20. 31,27,13,23,21,19,16, 7,26,12,18, 6,11, 5,10, 9
  21. };
  22. #endif
  23. int oc_ilog32(ogg_uint32_t _v){
  24. #if defined(OC_CLZ32)
  25. return (OC_CLZ32_OFFS-OC_CLZ32(_v))&-!!_v;
  26. #else
  27. /*On a Pentium M, this branchless version tested as the fastest version without
  28. multiplications on 1,000,000,000 random 32-bit integers, edging out a
  29. similar version with branches, and a 256-entry LUT version.*/
  30. # if defined(OC_ILOG_NODEBRUIJN)
  31. int ret;
  32. int m;
  33. ret=_v>0;
  34. m=(_v>0xFFFFU)<<4;
  35. _v>>=m;
  36. ret|=m;
  37. m=(_v>0xFFU)<<3;
  38. _v>>=m;
  39. ret|=m;
  40. m=(_v>0xFU)<<2;
  41. _v>>=m;
  42. ret|=m;
  43. m=(_v>3)<<1;
  44. _v>>=m;
  45. ret|=m;
  46. ret+=_v>1;
  47. return ret;
  48. /*This de Bruijn sequence version is faster if you have a fast multiplier.*/
  49. # else
  50. int ret;
  51. ret=_v>0;
  52. _v|=_v>>1;
  53. _v|=_v>>2;
  54. _v|=_v>>4;
  55. _v|=_v>>8;
  56. _v|=_v>>16;
  57. _v=(_v>>1)+1;
  58. ret+=OC_DEBRUIJN_IDX32[_v*0x77CB531U>>27&0x1F];
  59. return ret;
  60. # endif
  61. #endif
  62. }
  63. int oc_ilog64(ogg_int64_t _v){
  64. #if defined(OC_CLZ64)
  65. return (OC_CLZ64_OFFS-OC_CLZ64(_v))&-!!_v;
  66. #else
  67. # if defined(OC_ILOG_NODEBRUIJN)
  68. ogg_uint32_t v;
  69. int ret;
  70. int m;
  71. ret=_v>0;
  72. m=(_v>0xFFFFFFFFU)<<5;
  73. v=(ogg_uint32_t)(_v>>m);
  74. ret|=m;
  75. m=(v>0xFFFFU)<<4;
  76. v>>=m;
  77. ret|=m;
  78. m=(v>0xFFU)<<3;
  79. v>>=m;
  80. ret|=m;
  81. m=(v>0xFU)<<2;
  82. v>>=m;
  83. ret|=m;
  84. m=(v>3)<<1;
  85. v>>=m;
  86. ret|=m;
  87. ret+=v>1;
  88. return ret;
  89. # else
  90. /*If we don't have a 64-bit word, split it into two 32-bit halves.*/
  91. # if LONG_MAX<9223372036854775807LL
  92. ogg_uint32_t v;
  93. int ret;
  94. int m;
  95. ret=_v>0;
  96. m=(_v>0xFFFFFFFFU)<<5;
  97. v=(ogg_uint32_t)(_v>>m);
  98. ret|=m;
  99. v|=v>>1;
  100. v|=v>>2;
  101. v|=v>>4;
  102. v|=v>>8;
  103. v|=v>>16;
  104. v=(v>>1)+1;
  105. ret+=OC_DEBRUIJN_IDX32[v*0x77CB531U>>27&0x1F];
  106. return ret;
  107. /*Otherwise do it in one 64-bit operation.*/
  108. # else
  109. static const unsigned char OC_DEBRUIJN_IDX64[64]={
  110. 0, 1, 2, 7, 3,13, 8,19, 4,25,14,28, 9,34,20,40,
  111. 5,17,26,38,15,46,29,48,10,31,35,54,21,50,41,57,
  112. 63, 6,12,18,24,27,33,39,16,37,45,47,30,53,49,56,
  113. 62,11,23,32,36,44,52,55,61,22,43,51,60,42,59,58
  114. };
  115. int ret;
  116. ret=_v>0;
  117. _v|=_v>>1;
  118. _v|=_v>>2;
  119. _v|=_v>>4;
  120. _v|=_v>>8;
  121. _v|=_v>>16;
  122. _v|=_v>>32;
  123. _v=(_v>>1)+1;
  124. ret+=OC_DEBRUIJN_IDX64[_v*0x218A392CD3D5DBF>>58&0x3F];
  125. return ret;
  126. # endif
  127. # endif
  128. #endif
  129. }
  130. /*round(2**(62+i)*atanh(2**(-(i+1)))/log(2))*/
  131. static const ogg_int64_t OC_ATANH_LOG2[32]={
  132. 0x32B803473F7AD0F4LL,0x2F2A71BD4E25E916LL,0x2E68B244BB93BA06LL,
  133. 0x2E39FB9198CE62E4LL,0x2E2E683F68565C8FLL,0x2E2B850BE2077FC1LL,
  134. 0x2E2ACC58FE7B78DBLL,0x2E2A9E2DE52FD5F2LL,0x2E2A92A338D53EECLL,
  135. 0x2E2A8FC08F5E19B6LL,0x2E2A8F07E51A485ELL,0x2E2A8ED9BA8AF388LL,
  136. 0x2E2A8ECE2FE7384ALL,0x2E2A8ECB4D3E4B1ALL,0x2E2A8ECA94940FE8LL,
  137. 0x2E2A8ECA6669811DLL,0x2E2A8ECA5ADEDD6ALL,0x2E2A8ECA57FC347ELL,
  138. 0x2E2A8ECA57438A43LL,0x2E2A8ECA57155FB4LL,0x2E2A8ECA5709D510LL,
  139. 0x2E2A8ECA5706F267LL,0x2E2A8ECA570639BDLL,0x2E2A8ECA57060B92LL,
  140. 0x2E2A8ECA57060008LL,0x2E2A8ECA5705FD25LL,0x2E2A8ECA5705FC6CLL,
  141. 0x2E2A8ECA5705FC3ELL,0x2E2A8ECA5705FC33LL,0x2E2A8ECA5705FC30LL,
  142. 0x2E2A8ECA5705FC2FLL,0x2E2A8ECA5705FC2FLL
  143. };
  144. /*Computes the binary exponential of _z, a log base 2 in Q57 format.*/
  145. ogg_int64_t oc_bexp64(ogg_int64_t _z){
  146. ogg_int64_t w;
  147. ogg_int64_t z;
  148. int ipart;
  149. ipart=(int)(_z>>57);
  150. if(ipart<0)return 0;
  151. if(ipart>=63)return 0x7FFFFFFFFFFFFFFFLL;
  152. z=_z-OC_Q57(ipart);
  153. if(z){
  154. ogg_int64_t mask;
  155. long wlo;
  156. int i;
  157. /*C doesn't give us 64x64->128 muls, so we use CORDIC.
  158. This is not particularly fast, but it's not being used in time-critical
  159. code; it is very accurate.*/
  160. /*z is the fractional part of the log in Q62 format.
  161. We need 1 bit of headroom since the magnitude can get larger than 1
  162. during the iteration, and a sign bit.*/
  163. z<<=5;
  164. /*w is the exponential in Q61 format (since it also needs headroom and can
  165. get as large as 2.0); we could get another bit if we dropped the sign,
  166. but we'll recover that bit later anyway.
  167. Ideally this should start out as
  168. \lim_{n->\infty} 2^{61}/\product_{i=1}^n \sqrt{1-2^{-2i}}
  169. but in order to guarantee convergence we have to repeat iterations 4,
  170. 13 (=3*4+1), and 40 (=3*13+1, etc.), so it winds up somewhat larger.*/
  171. w=0x26A3D0E401DD846DLL;
  172. for(i=0;;i++){
  173. mask=-(z<0);
  174. w+=(w>>i+1)+mask^mask;
  175. z-=OC_ATANH_LOG2[i]+mask^mask;
  176. /*Repeat iteration 4.*/
  177. if(i>=3)break;
  178. z<<=1;
  179. }
  180. for(;;i++){
  181. mask=-(z<0);
  182. w+=(w>>i+1)+mask^mask;
  183. z-=OC_ATANH_LOG2[i]+mask^mask;
  184. /*Repeat iteration 13.*/
  185. if(i>=12)break;
  186. z<<=1;
  187. }
  188. for(;i<32;i++){
  189. mask=-(z<0);
  190. w+=(w>>i+1)+mask^mask;
  191. z=z-(OC_ATANH_LOG2[i]+mask^mask)<<1;
  192. }
  193. wlo=0;
  194. /*Skip the remaining iterations unless we really require that much
  195. precision.
  196. We could have bailed out earlier for smaller iparts, but that would
  197. require initializing w from a table, as the limit doesn't converge to
  198. 61-bit precision until n=30.*/
  199. if(ipart>30){
  200. /*For these iterations, we just update the low bits, as the high bits
  201. can't possibly be affected.
  202. OC_ATANH_LOG2 has also converged (it actually did so one iteration
  203. earlier, but that's no reason for an extra special case).*/
  204. for(;;i++){
  205. mask=-(z<0);
  206. wlo+=(w>>i)+mask^mask;
  207. z-=OC_ATANH_LOG2[31]+mask^mask;
  208. /*Repeat iteration 40.*/
  209. if(i>=39)break;
  210. z<<=1;
  211. }
  212. for(;i<61;i++){
  213. mask=-(z<0);
  214. wlo+=(w>>i)+mask^mask;
  215. z=z-(OC_ATANH_LOG2[31]+mask^mask)<<1;
  216. }
  217. }
  218. w=(w<<1)+wlo;
  219. }
  220. else w=(ogg_int64_t)1<<62;
  221. if(ipart<62)w=(w>>61-ipart)+1>>1;
  222. return w;
  223. }
  224. /*Computes the binary logarithm of _w, returned in Q57 format.*/
  225. ogg_int64_t oc_blog64(ogg_int64_t _w){
  226. ogg_int64_t z;
  227. int ipart;
  228. if(_w<=0)return -1;
  229. ipart=OC_ILOGNZ_64(_w)-1;
  230. if(ipart>61)_w>>=ipart-61;
  231. else _w<<=61-ipart;
  232. z=0;
  233. if(_w&_w-1){
  234. ogg_int64_t x;
  235. ogg_int64_t y;
  236. ogg_int64_t u;
  237. ogg_int64_t mask;
  238. int i;
  239. /*C doesn't give us 64x64->128 muls, so we use CORDIC.
  240. This is not particularly fast, but it's not being used in time-critical
  241. code; it is very accurate.*/
  242. /*z is the fractional part of the log in Q61 format.*/
  243. /*x and y are the cosh() and sinh(), respectively, in Q61 format.
  244. We are computing z=2*atanh(y/x)=2*atanh((_w-1)/(_w+1)).*/
  245. x=_w+((ogg_int64_t)1<<61);
  246. y=_w-((ogg_int64_t)1<<61);
  247. for(i=0;i<4;i++){
  248. mask=-(y<0);
  249. z+=(OC_ATANH_LOG2[i]>>i)+mask^mask;
  250. u=x>>i+1;
  251. x-=(y>>i+1)+mask^mask;
  252. y-=u+mask^mask;
  253. }
  254. /*Repeat iteration 4.*/
  255. for(i--;i<13;i++){
  256. mask=-(y<0);
  257. z+=(OC_ATANH_LOG2[i]>>i)+mask^mask;
  258. u=x>>i+1;
  259. x-=(y>>i+1)+mask^mask;
  260. y-=u+mask^mask;
  261. }
  262. /*Repeat iteration 13.*/
  263. for(i--;i<32;i++){
  264. mask=-(y<0);
  265. z+=(OC_ATANH_LOG2[i]>>i)+mask^mask;
  266. u=x>>i+1;
  267. x-=(y>>i+1)+mask^mask;
  268. y-=u+mask^mask;
  269. }
  270. /*OC_ATANH_LOG2 has converged.*/
  271. for(;i<40;i++){
  272. mask=-(y<0);
  273. z+=(OC_ATANH_LOG2[31]>>i)+mask^mask;
  274. u=x>>i+1;
  275. x-=(y>>i+1)+mask^mask;
  276. y-=u+mask^mask;
  277. }
  278. /*Repeat iteration 40.*/
  279. for(i--;i<62;i++){
  280. mask=-(y<0);
  281. z+=(OC_ATANH_LOG2[31]>>i)+mask^mask;
  282. u=x>>i+1;
  283. x-=(y>>i+1)+mask^mask;
  284. y-=u+mask^mask;
  285. }
  286. z=z+8>>4;
  287. }
  288. return OC_Q57(ipart)+z;
  289. }