/* crc32-pclmul_asm.S */
/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ is a new instruction in Intel SSE4.2; the reference can be found
 * at:
 * http://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *            Alexander Boyko <Alexander_Boyko@xyratex.com>
 */
#include <linux/linkage.h>
#include <asm/inst.h>
/*
 * Pre-computed folding/reduction constants for CRC32 (little-endian,
 * polynomial 0xEDB88320).  Each .octa holds two 64-bit constants; the
 * low quadword is selected by PCLMULQDQ with imm8 0x00 and the high
 * quadword with imm8 0x11 in the code below.
 */
.align 16
/*
 * [(x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
 * #define CONSTANT_R1  0x154442bd4LL
 *
 * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
 * #define CONSTANT_R2  0x1c6e41596LL
 */
.Lconstant_R2R1:
	.octa 0x00000001c6e415960000000154442bd4
/*
 * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0
 * #define CONSTANT_R3  0x1751997d0LL
 *
 * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e
 * #define CONSTANT_R4  0x0ccaa009eLL
 */
.Lconstant_R4R3:
	.octa 0x00000000ccaa009e00000001751997d0
/*
 * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124
 * #define CONSTANT_R5  0x163cd6124LL
 */
.Lconstant_R5:
	.octa 0x00000000000000000000000163cd6124
/* Low 32-bit mask used to isolate dwords during the final reductions. */
.Lconstant_mask32:
	.octa 0x000000000000000000000000FFFFFFFF
/*
 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
 *
 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL
 * #define CONSTANT_RU  0x1F7011641LL
 */
.Lconstant_RUpoly:
	.octa 0x00000001F701164100000001DB710641
/* %xmm0 always holds the current fold/reduction constant pair. */
#define CONSTANT %xmm0

/*
 * Argument register aliases.
 * 64-bit: SysV AMD64 — arg0 %rdi, arg1 %rsi, arg2 %edx.
 * 32-bit: %eax/%edx/%ecx — NOTE(review): looks like the i386 kernel
 * regparm(3) convention; confirm against the C prototype's caller.
 */
#ifdef __x86_64__
#define BUF     %rdi
#define LEN     %rsi
#define CRC     %edx
#else
#define BUF     %eax
#define LEN     %edx
#define CRC     %ecx
#endif

.text
/**
 * Calculate crc32
 * BUF - buffer (16 bytes aligned)
 * LEN - sizeof buffer (16 bytes aligned), LEN should be greater than 63
 * CRC - initial crc32
 * return %eax crc32
 * uint crc32_pclmul_le_16(unsigned char const *buffer,
 *                         size_t len, uint crc32)
 *
 * Strategy: carry-less-multiply folding.  The first 64 bytes are loaded
 * into %xmm1-%xmm4 and the initial CRC is xor-ed into the low dword of
 * the stream; the main loop folds 64 bytes per iteration, then the four
 * 128-bit lanes are folded into one, remaining 16-byte chunks are folded
 * in, and a final 64->32 fold plus Barrett reduction yields the CRC.
 */
ENTRY(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */
	movdqa  (BUF), %xmm1
	movdqa  0x10(BUF), %xmm2
	movdqa  0x20(BUF), %xmm3
	movdqa  0x30(BUF), %xmm4
	movd    CRC, CONSTANT		/* xor initial CRC into the stream head */
	pxor    CONSTANT, %xmm1
	sub     $0x40, LEN
	add     $0x40, BUF
#ifndef __x86_64__
	/* This is for position independent code(-fPIC) support for 32bit */
	/* CRC (%ecx) was already consumed by the movd above, so %ecx is
	 * free to hold the PIC base from here on. */
	call delta
delta:
	pop %ecx
#endif
	cmp     $0x40, LEN
	jb      less_64

	/* Load R2:R1, the constants for folding across 4*128+... bits. */
#ifdef __x86_64__
	movdqa .Lconstant_R2R1(%rip), CONSTANT
#else
	movdqa .Lconstant_R2R1 - delta(%ecx), CONSTANT
#endif

loop_64:/*  64 bytes Full cache line folding */
	prefetchnta    0x40(BUF)
	movdqa  %xmm1, %xmm5
	movdqa  %xmm2, %xmm6
	movdqa  %xmm3, %xmm7
#ifdef __x86_64__
	movdqa  %xmm4, %xmm8
#endif
	/* low qwords * R1 (imm 0x00), high qwords * R2 (imm 0x11),
	 * then xor both products with the next 64 bytes of input */
	PCLMULQDQ 00, CONSTANT, %xmm1
	PCLMULQDQ 00, CONSTANT, %xmm2
	PCLMULQDQ 00, CONSTANT, %xmm3
#ifdef __x86_64__
	PCLMULQDQ 00, CONSTANT, %xmm4
#endif
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	PCLMULQDQ 0x11, CONSTANT, %xmm6
	PCLMULQDQ 0x11, CONSTANT, %xmm7
#ifdef __x86_64__
	PCLMULQDQ 0x11, CONSTANT, %xmm8
#endif
	pxor    %xmm5, %xmm1
	pxor    %xmm6, %xmm2
	pxor    %xmm7, %xmm3
#ifdef __x86_64__
	pxor    %xmm8, %xmm4
#else
	/* xmm8 unsupported for x32 */
	movdqa  %xmm4, %xmm5
	PCLMULQDQ 00, CONSTANT, %xmm4
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm4
#endif
	pxor    (BUF), %xmm1
	pxor    0x10(BUF), %xmm2
	pxor    0x20(BUF), %xmm3
	pxor    0x30(BUF), %xmm4

	sub     $0x40, LEN
	add     $0x40, BUF
	cmp     $0x40, LEN
	jge     loop_64
less_64:/*  Folding cache line into 128bit */
	/* Switch to R4:R3 and fold the four lanes into %xmm1, one pair of
	 * carry-less multiplies per lane. */
#ifdef __x86_64__
	movdqa  .Lconstant_R4R3(%rip), CONSTANT
#else
	movdqa  .Lconstant_R4R3 - delta(%ecx), CONSTANT
#endif
	prefetchnta     (BUF)

	movdqa  %xmm1, %xmm5
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm1
	pxor    %xmm2, %xmm1

	movdqa  %xmm1, %xmm5
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm1
	pxor    %xmm3, %xmm1

	movdqa  %xmm1, %xmm5
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm1
	pxor    %xmm4, %xmm1

	cmp     $0x10, LEN
	jb      fold_64
loop_16:/* Folding rest buffer into 128bit */
	/* Fold one remaining 16-byte chunk into the accumulator. */
	movdqa  %xmm1, %xmm5
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm1
	pxor    (BUF), %xmm1
	sub     $0x10, LEN
	add     $0x10, BUF
	cmp     $0x10, LEN
	jge     loop_16

fold_64:
	/* perform the last 64 bit fold, also adds 32 zeroes
	 * to the input stream */
	PCLMULQDQ 0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
	psrldq  $0x08, %xmm1
	pxor    CONSTANT, %xmm1

	/* final 32-bit fold */
	movdqa  %xmm1, %xmm2
#ifdef __x86_64__
	movdqa  .Lconstant_R5(%rip), CONSTANT
	movdqa  .Lconstant_mask32(%rip), %xmm3
#else
	movdqa  .Lconstant_R5 - delta(%ecx), CONSTANT
	movdqa  .Lconstant_mask32 - delta(%ecx), %xmm3
#endif
	psrldq  $0x04, %xmm2
	pand    %xmm3, %xmm1		/* keep low 32 bits, multiply by R5 */
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	pxor    %xmm2, %xmm1

	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
#ifdef __x86_64__
	movdqa  .Lconstant_RUpoly(%rip), CONSTANT
#else
	movdqa  .Lconstant_RUpoly - delta(%ecx), CONSTANT
#endif
	movdqa  %xmm1, %xmm2
	pand    %xmm3, %xmm1
	PCLMULQDQ 0x10, CONSTANT, %xmm1	/* low dword * RU (imm 0x10) */
	pand    %xmm3, %xmm1
	PCLMULQDQ 0x00, CONSTANT, %xmm1	/* low dword * P(x) (imm 0x00) */
	pxor    %xmm2, %xmm1
	PEXTRD  0x01, %xmm1, %eax	/* CRC ends up in dword 1 of %xmm1 */

	ret
ENDPROC(crc32_pclmul_le_16)