mmx_32.c

// SPDX-License-Identifier: GPL-2.0
/*
 *	MMX 3DNow! library helper functions
 *
 *	To do:
 *	We can use MMX just for prefetch in IRQ's. This may be a win.
 *		(reported so on K6-III)
 *	We should use a better code neutral filler for the short jump
 *		leal ebx,[ebx] is apparently best for K6-2, but Cyrix ??
 *	We also want to clobber the filler register so we don't get any
 *		register forwarding stalls on the filler.
 *
 *	Add *user handling. Checksums are not a win with MMX on any CPU
 *	tested so far for any MMX solution figured.
 *
 *	22/09/2000 - Arjan van de Ven
 *		Improved for non-engineering-sample Athlons
 *
 */
#include <linux/hardirq.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/types.h>

#include <asm/fpu/api.h>
#include <asm/asm.h>
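
/*
 * _mmx_memcpy(): copy 'len' bytes from 'from' to 'to', moving the bulk of
 * the data in 64-byte chunks through the %mm0-%mm3 registers with prefetch
 * running ahead of the loads.  From interrupt context the FPU/MMX state is
 * not touched at all and we simply fall back to plain __memcpy().
 */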
void *_mmx_memcpy(void *to, const void *from, size_t len)
{
	void *p;
	int i;

	if (unlikely(in_interrupt()))
		return __memcpy(to, from, len);

	p = to;
	i = len >> 6; /* len/64 */

	kernel_fpu_begin();
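
	/*
	 * Warm the cache for the first 320 bytes of the source.  prefetch is
	 * a 3DNow! instruction and can fault on some CPUs, so the exception
	 * fixup at label 3 patches the first prefetch with a two-byte short
	 * jump (0xEB 0x1A, i.e. "jmp +26") that skips the whole 28-byte
	 * prefetch set on later passes; the in-loop fixups below do the same
	 * with "jmp +5" over a single prefetch.
	 */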
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"		/* This set is 28 bytes */
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2: \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from));
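
	/*
	 * Main loop: move 64 bytes per iteration through %mm0-%mm3 while
	 * prefetching 320 bytes (five blocks) ahead.  The last five 64-byte
	 * blocks are left to the second loop below, which runs without
	 * prefetch, so we never prefetch beyond the last full block of the
	 * source.
	 */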
	for ( ; i > 5; i--) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	for ( ; i > 0; i--) {
		__asm__ __volatile__ (
		"   movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	/*
	 * Now do the tail of the block:
	 */
	__memcpy(to, from, len & 63);
	kernel_fpu_end();

	return p;
}
EXPORT_SYMBOL(_mmx_memcpy);
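
/*
 * Page clearing and copying comes in two flavours, selected at build time:
 * the K7 version below uses non-temporal movntq stores that bypass the
 * cache, while the generic MMX version further down uses ordinary movq
 * stores.
 */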
#ifdef CONFIG_MK7

/*
 *	The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
 *	other MMX using processors do not.
 */
static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);
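
	/*
	 * %mm0 is now zero; clear the page 64 bytes per iteration with
	 * non-temporal movntq stores.
	 */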
	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"  movntq %%mm0, (%0)\n"
		"  movntq %%mm0, 8(%0)\n"
		"  movntq %%mm0, 16(%0)\n"
		"  movntq %%mm0, 24(%0)\n"
		"  movntq %%mm0, 32(%0)\n"
		"  movntq %%mm0, 40(%0)\n"
		"  movntq %%mm0, 48(%0)\n"
		"  movntq %%mm0, 56(%0)\n"
		: : "r" (page) : "memory");
		page += 64;
	}

	/*
	 * Since movntq is weakly-ordered, a "sfence" is needed to become
	 * ordered again:
	 */
	__asm__ __volatile__("sfence\n"::);

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	/*
	 * maybe the prefetch stuff can go before the expensive fnsave...
	 * but that is for later. -AV
	 */
	__asm__ __volatile__(
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2: \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from));
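
	/*
	 * Copy the first 4096-320 bytes while prefetching 320 bytes ahead;
	 * the last 320 bytes of the page are handled by the second loop
	 * below so the prefetch never runs past the end of the source page.
	 */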
	for (i = 0; i < (4096-320)/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	for (i = (4096-320)/64; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	/*
	 * Since movntq is weakly-ordered, a "sfence" is needed to become
	 * ordered again:
	 */
	__asm__ __volatile__("sfence\n"::);

	kernel_fpu_end();
}

#else /* CONFIG_MK7 */

/*
 *	Generic MMX implementation without K7 specific streaming
 */
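/*
 * Without movntq the page is cleared through the cache with plain movq
 * stores, 128 bytes per loop iteration, and copied 64 bytes at a time
 * much like the main loop of _mmx_memcpy() above.
 */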
static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

	for (i = 0; i < 4096/128; i++) {
		__asm__ __volatile__ (
		"  movq %%mm0, (%0)\n"
		"  movq %%mm0, 8(%0)\n"
		"  movq %%mm0, 16(%0)\n"
		"  movq %%mm0, 24(%0)\n"
		"  movq %%mm0, 32(%0)\n"
		"  movq %%mm0, 40(%0)\n"
		"  movq %%mm0, 48(%0)\n"
		"  movq %%mm0, 56(%0)\n"
		"  movq %%mm0, 64(%0)\n"
		"  movq %%mm0, 72(%0)\n"
		"  movq %%mm0, 80(%0)\n"
		"  movq %%mm0, 88(%0)\n"
		"  movq %%mm0, 96(%0)\n"
		"  movq %%mm0, 104(%0)\n"
		"  movq %%mm0, 112(%0)\n"
		"  movq %%mm0, 120(%0)\n"
			: : "r" (page) : "memory");
		page += 128;
	}

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2: \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from));

	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	kernel_fpu_end();
}

#endif /* !CONFIG_MK7 */

/*
 *	Favour MMX for page clear and copy:
 */
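/*
 * Non-MMX clear fallback used from interrupt context: rep stosl writes the
 * page as 1024 zero dwords (4096 bytes).
 */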
static void slow_zero_page(void *page)
{
	int d0, d1;

	__asm__ __volatile__(
		"cld\n\t"
		"rep ; stosl"
			: "=&c" (d0), "=&D" (d1)
			:"a" (0), "1" (page), "0" (1024)
			:"memory");
}

void mmx_clear_page(void *page)
{
	if (unlikely(in_interrupt()))
		slow_zero_page(page);
	else
		fast_clear_page(page);
}
EXPORT_SYMBOL(mmx_clear_page);
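
/*
 * Non-MMX copy fallback for interrupt context: rep movsl moves the page as
 * 1024 dwords.
 */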
static void slow_copy_page(void *to, void *from)
{
	int d0, d1, d2;

	__asm__ __volatile__(
		"cld\n\t"
		"rep ; movsl"
		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
		: "0" (1024), "1" ((long) to), "2" ((long) from)
		: "memory");
}

void mmx_copy_page(void *to, void *from)
{
	if (unlikely(in_interrupt()))
		slow_copy_page(to, from);
	else
		fast_copy_page(to, from);
}
EXPORT_SYMBOL(mmx_copy_page);