/*
===========================================================================

Doom 3 GPL Source Code
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.

This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").

Doom 3 Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Doom 3 Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.

In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.

If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.

===========================================================================
*/
#include "../precompiled.h"
#pragma hdrstop

#include "Simd_Generic.h"
#include "Simd_MMX.h"
#include "Simd_3DNow.h"

//===============================================================
//
//	3DNow! implementation of idSIMDProcessor
//
//===============================================================

#ifdef _WIN32

/*
============
idSIMD_3DNow::GetName
============
*/
const char * idSIMD_3DNow::GetName( void ) const {
	return "MMX & 3DNow!";
}
// Very optimized memcpy() routine for the AMD Athlon and Duron family.
// This code uses any of FOUR different basic copy methods, depending
// on the transfer size.
// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetchnta instructions,
// be sure you're running on an Athlon/Duron or other recent CPU before calling!

#define TINY_BLOCK_COPY			64			// upper limit for movsd type copy
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".

#define IN_CACHE_COPY			64 * 1024	// upper limit for movq/movq copy w/SW prefetch
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization. This code uses
// the software prefetch instruction to get the data into the cache.

#define UNCACHED_COPY			197 * 1024	// upper limit for movq/movntq w/SW prefetch
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"

#define BLOCK_PREFETCH_COPY		infinity	// no limit for movq/movntq w/block prefetch
#define CACHEBLOCK				80h			// number of 64-byte blocks (cache lines) for block prefetch
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
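
/*
Illustrative sketch (an assumption added for clarity, not part of the original
file): how the thresholds above select one of the four copy strategies. The
function name is hypothetical and the byte loop is only a portable stand-in
for the assembly paths implemented in Memcpy() below.
*/
static inline void Memcpy_ThresholdSketch( void *dest, const void *src, const int n ) {
	if ( n < TINY_BLOCK_COPY ) {
		// tiny copy: plain movsd/movsb tail
	} else if ( n < IN_CACHE_COPY ) {
		// in-cache copy: movq loads and stores with prefetchnta read-ahead
	} else if ( n < UNCACHED_COPY ) {
		// uncached copy: movq loads, movntq streaming stores
	} else {
		// huge copy: block prefetch, then movq loads and movntq stores
	}
	unsigned char *d = (unsigned char *)dest;
	const unsigned char *s = (const unsigned char *)src;
	for ( int i = 0; i < n; i++ ) {
		d[i] = s[i];		// simple byte copy standing in for all four assembly paths
	}
}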

/*
================
idSIMD_3DNow::Memcpy

  optimized memory copy routine that handles all alignment cases and block sizes efficiently
================
*/
void VPCALL idSIMD_3DNow::Memcpy( void *dest, const void *src, const int n ) {
	__asm {
		mov		ecx, [n]				// number of bytes to copy
		mov		edi, [dest]				// destination
		mov		esi, [src]				// source
		mov		ebx, ecx				// keep a copy of count

		cld
		cmp		ecx, TINY_BLOCK_COPY
		jb		$memcpy_ic_3			// tiny? skip mmx copy

		cmp		ecx, 32*1024			// don't align between 32k-64k because
		jbe		$memcpy_do_align		// it appears to be slower
		cmp		ecx, 64*1024
		jbe		$memcpy_align_done

	$memcpy_do_align:
		mov		ecx, 8					// a trick that's faster than rep movsb...
		sub		ecx, edi				// align destination to qword
		and		ecx, 111b				// get the low bits
		sub		ebx, ecx				// update copy count
		neg		ecx						// set up to jump into the array
		add		ecx, offset $memcpy_align_done
		jmp		ecx						// jump to array of movsb's

	align 4
		movsb
		movsb
		movsb
		movsb
		movsb
		movsb
		movsb
		movsb

	$memcpy_align_done:					// destination is dword aligned
		mov		ecx, ebx				// number of bytes left to copy
		shr		ecx, 6					// get 64-byte block count
		jz		$memcpy_ic_2			// finish the last few bytes

		cmp		ecx, IN_CACHE_COPY/64	// too big 4 cache? use uncached copy
		jae		$memcpy_uc_test
	// This is a small block copy that uses the MMX registers to copy 8 bytes
	// at a time. It uses the "unrolled loop" optimization, and also uses
	// the software prefetch instruction to get the data into the cache.
	align 16
	$memcpy_ic_1:						// 64-byte block copies, in-cache copy
		prefetchnta [esi + (200*64/34+192)]	// start reading ahead

		movq	mm0, [esi+0]			// read 64 bits
		movq	mm1, [esi+8]
		movq	[edi+0], mm0			// write 64 bits
		movq	[edi+8], mm1			// note: the normal movq writes the
		movq	mm2, [esi+16]			// data to cache; a cache line will be
		movq	mm3, [esi+24]			// allocated as needed, to store the data
		movq	[edi+16], mm2
		movq	[edi+24], mm3
		movq	mm0, [esi+32]
		movq	mm1, [esi+40]
		movq	[edi+32], mm0
		movq	[edi+40], mm1
		movq	mm2, [esi+48]
		movq	mm3, [esi+56]
		movq	[edi+48], mm2
		movq	[edi+56], mm3

		add		esi, 64					// update source pointer
		add		edi, 64					// update destination pointer
		dec		ecx						// count down
		jnz		$memcpy_ic_1			// last 64-byte block?

	$memcpy_ic_2:
		mov		ecx, ebx				// has valid low 6 bits of the byte count
	$memcpy_ic_3:
		shr		ecx, 2					// dword count
		and		ecx, 1111b				// only look at the "remainder" bits
		neg		ecx						// set up to jump into the array
		add		ecx, offset $memcpy_last_few
		jmp		ecx						// jump to array of movsd's
	$memcpy_uc_test:
		cmp		ecx, UNCACHED_COPY/64	// big enough? use block prefetch copy
		jae		$memcpy_bp_1

	$memcpy_64_test:
		or		ecx, ecx				// tail end of block prefetch will jump here
		jz		$memcpy_ic_2			// no more 64-byte blocks left

	// For larger blocks, which will spill beyond the cache, it's faster to
	// use the Streaming Store instruction MOVNTQ. This write instruction
	// bypasses the cache and writes straight to main memory. This code also
	// uses the software prefetch instruction to pre-read the data.
	align 16
	$memcpy_uc_1:						// 64-byte blocks, uncached copy
		prefetchnta [esi + (200*64/34+192)]	// start reading ahead

		movq	mm0, [esi+0]			// read 64 bits
		add		edi, 64					// update destination pointer
		movq	mm1, [esi+8]
		add		esi, 64					// update source pointer
		movq	mm2, [esi-48]
		movntq	[edi-64], mm0			// write 64 bits, bypassing the cache
		movq	mm0, [esi-40]			// note: movntq also prevents the CPU
		movntq	[edi-56], mm1			// from READING the destination address
		movq	mm1, [esi-32]			// into the cache, only to be over-written
		movntq	[edi-48], mm2			// so that also helps performance
		movq	mm2, [esi-24]
		movntq	[edi-40], mm0
		movq	mm0, [esi-16]
		movntq	[edi-32], mm1
		movq	mm1, [esi-8]
		movntq	[edi-24], mm2
		movntq	[edi-16], mm0
		dec		ecx
		movntq	[edi-8], mm1
		jnz		$memcpy_uc_1			// last 64-byte block?

		jmp		$memcpy_ic_2			// almost done
	// For the largest size blocks, a special technique called Block Prefetch
	// can be used to accelerate the read operations. Block Prefetch reads
	// one address per cache line, for a series of cache lines, in a short loop.
	// This is faster than using software prefetch, in this case.
	// The technique is great for getting maximum read bandwidth,
	// especially in DDR memory systems.
	$memcpy_bp_1:						// large blocks, block prefetch copy
		cmp		ecx, CACHEBLOCK			// big enough to run another prefetch loop?
		jl		$memcpy_64_test			// no, back to regular uncached copy

		mov		eax, CACHEBLOCK / 2		// block prefetch loop, unrolled 2X
		add		esi, CACHEBLOCK * 64	// move to the top of the block
	align 16
	$memcpy_bp_2:
		mov		edx, [esi-64]			// grab one address per cache line
		mov		edx, [esi-128]			// grab one address per cache line
		sub		esi, 128				// go reverse order
		dec		eax						// count down the cache lines
		jnz		$memcpy_bp_2			// keep grabbing more lines into cache

		mov		eax, CACHEBLOCK			// now that it's in cache, do the copy
	align 16
	$memcpy_bp_3:
		movq	mm0, [esi   ]			// read 64 bits
		movq	mm1, [esi+ 8]
		movq	mm2, [esi+16]
		movq	mm3, [esi+24]
		movq	mm4, [esi+32]
		movq	mm5, [esi+40]
		movq	mm6, [esi+48]
		movq	mm7, [esi+56]
		add		esi, 64					// update source pointer
		movntq	[edi   ], mm0			// write 64 bits, bypassing cache
		movntq	[edi+ 8], mm1			// note: movntq also prevents the CPU
		movntq	[edi+16], mm2			// from READING the destination address
		movntq	[edi+24], mm3			// into the cache, only to be over-written,
		movntq	[edi+32], mm4			// so that also helps performance
		movntq	[edi+40], mm5
		movntq	[edi+48], mm6
		movntq	[edi+56], mm7
		add		edi, 64					// update dest pointer
		dec		eax						// count down
		jnz		$memcpy_bp_3			// keep copying
		sub		ecx, CACHEBLOCK			// update the 64-byte block count
		jmp		$memcpy_bp_1			// keep processing chunks
	// The smallest copy uses the X86 "movsd" instruction, in an optimized
	// form which is an "unrolled loop". Then it handles the last few bytes.
	align 4
		movsd
		movsd							// perform last 1-15 dword copies
		movsd
		movsd
		movsd
		movsd
		movsd
		movsd
		movsd
		movsd							// perform last 1-7 dword copies
		movsd
		movsd
		movsd
		movsd
		movsd
		movsd

	$memcpy_last_few:					// dword aligned from before movsd's
		mov		ecx, ebx				// has valid low 2 bits of the byte count
		and		ecx, 11b				// the last few cows must come home
		jz		$memcpy_final			// no more, let's leave
		rep		movsb					// the last 1, 2, or 3 bytes

	$memcpy_final:
		emms							// clean up the MMX state
		sfence							// flush the write buffer
		mov		eax, [dest]				// ret value = destination pointer
	}
}
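
/*
Illustrative modern analogue (an assumption, not used by the engine): the same
prefetch + streaming-store idea as the movntq paths above, expressed with SSE
intrinsics instead of MMX inline assembly. Assumes 16-byte aligned pointers and
a byte count that is a multiple of 64; the function name and the 512-byte
prefetch distance are arbitrary choices made for this sketch.
*/
#include <xmmintrin.h>

static inline void StreamingCopySketch( float *dest, const float *src, const int numBytes ) {
	const int numFloats = numBytes / 4;
	for ( int i = 0; i < numFloats; i += 16 ) {							// 64 bytes per iteration
		_mm_prefetch( (const char *)( src + i ) + 512, _MM_HINT_NTA );	// start reading ahead (prefetch is only a hint)
		__m128 a = _mm_load_ps( src + i +  0 );
		__m128 b = _mm_load_ps( src + i +  4 );
		__m128 c = _mm_load_ps( src + i +  8 );
		__m128 d = _mm_load_ps( src + i + 12 );
		_mm_stream_ps( dest + i +  0, a );								// non-temporal stores bypass the cache,
		_mm_stream_ps( dest + i +  4, b );								// just like movntq above
		_mm_stream_ps( dest + i +  8, c );
		_mm_stream_ps( dest + i + 12, d );
	}
	_mm_sfence();														// flush the write-combining buffers, like the sfence above
}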

#endif /* _WIN32 */
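
/*
Usage sketch (assumption: the engine normally selects the active idSIMDProcessor
at startup based on CPU feature detection; the lines below are only meant to show
how the Win32 assembly routine above would be reached):

	idSIMDProcessor *processor = new idSIMD_3DNow;
	processor->Memcpy( dst, src, numBytes );
*/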