/* crc32-pclmul_asm.S - CRC32 folding with the x86 PCLMULQDQ instruction */
/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
 * at:
 * http://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *            Alexander Boyko <Alexander_Boyko@xyratex.com>
 */
#include <linux/linkage.h>
#include <asm/inst.h>


.section .rodata
.align 16
/*
 * Precomputed fold constants for the little-endian CRC32 polynomial
 * 0xEDB88320 (bit-reflected 0x04c11db7).  Each .octa packs two 64-bit
 * constants: low quadword first, high quadword second.
 *
 * [(x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
 * #define CONSTANT_R1  0x154442bd4LL
 *
 * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
 * #define CONSTANT_R2  0x1c6e41596LL
 */
.Lconstant_R2R1:
	.octa 0x00000001c6e415960000000154442bd4
/*
 * [(x128+32 mod P(x) << 32)]'   << 1 = 0x1751997d0
 * #define CONSTANT_R3  0x1751997d0LL
 *
 * [(x128-32 mod P(x) << 32)]'   << 1 = 0x0ccaa009e
 * #define CONSTANT_R4  0x0ccaa009eLL
 */
.Lconstant_R4R3:
	.octa 0x00000000ccaa009e00000001751997d0
/*
 * [(x64 mod P(x) << 32)]'       << 1 = 0x163cd6124
 * #define CONSTANT_R5  0x163cd6124LL
 */
.Lconstant_R5:
	.octa 0x00000000000000000000000163cd6124
/* Mask selecting the low 32 bits of an xmm register. */
.Lconstant_mask32:
	.octa 0x000000000000000000000000FFFFFFFF
/*
 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
 *
 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL
 * #define CONSTANT_RU  0x1F7011641LL
 */
.Lconstant_RUpoly:
	.octa 0x00000001F701164100000001DB710641

/* xmm0 permanently holds the current fold constant pair. */
#define CONSTANT %xmm0

/*
 * Argument registers differ between the 64-bit SysV calling convention
 * and the 32-bit regparm(3) kernel convention.
 */
#ifdef __x86_64__
#define BUF     %rdi
#define LEN     %rsi
#define CRC     %edx
#else
#define BUF     %eax
#define LEN     %edx
#define CRC     %ecx
#endif
  86. .text
  87. /**
  88. * Calculate crc32
  89. * BUF - buffer (16 bytes aligned)
  90. * LEN - sizeof buffer (16 bytes aligned), LEN should be grater than 63
  91. * CRC - initial crc32
  92. * return %eax crc32
  93. * uint crc32_pclmul_le_16(unsigned char const *buffer,
  94. * size_t len, uint crc32)
  95. */
  96. ENTRY(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */
  97. movdqa (BUF), %xmm1
  98. movdqa 0x10(BUF), %xmm2
  99. movdqa 0x20(BUF), %xmm3
  100. movdqa 0x30(BUF), %xmm4
  101. movd CRC, CONSTANT
  102. pxor CONSTANT, %xmm1
  103. sub $0x40, LEN
  104. add $0x40, BUF
  105. cmp $0x40, LEN
  106. jb less_64
  107. #ifdef __x86_64__
  108. movdqa .Lconstant_R2R1(%rip), CONSTANT
  109. #else
  110. movdqa .Lconstant_R2R1, CONSTANT
  111. #endif
  112. loop_64:/* 64 bytes Full cache line folding */
  113. prefetchnta 0x40(BUF)
  114. movdqa %xmm1, %xmm5
  115. movdqa %xmm2, %xmm6
  116. movdqa %xmm3, %xmm7
  117. #ifdef __x86_64__
  118. movdqa %xmm4, %xmm8
  119. #endif
  120. PCLMULQDQ 00, CONSTANT, %xmm1
  121. PCLMULQDQ 00, CONSTANT, %xmm2
  122. PCLMULQDQ 00, CONSTANT, %xmm3
  123. #ifdef __x86_64__
  124. PCLMULQDQ 00, CONSTANT, %xmm4
  125. #endif
  126. PCLMULQDQ 0x11, CONSTANT, %xmm5
  127. PCLMULQDQ 0x11, CONSTANT, %xmm6
  128. PCLMULQDQ 0x11, CONSTANT, %xmm7
  129. #ifdef __x86_64__
  130. PCLMULQDQ 0x11, CONSTANT, %xmm8
  131. #endif
  132. pxor %xmm5, %xmm1
  133. pxor %xmm6, %xmm2
  134. pxor %xmm7, %xmm3
  135. #ifdef __x86_64__
  136. pxor %xmm8, %xmm4
  137. #else
  138. /* xmm8 unsupported for x32 */
  139. movdqa %xmm4, %xmm5
  140. PCLMULQDQ 00, CONSTANT, %xmm4
  141. PCLMULQDQ 0x11, CONSTANT, %xmm5
  142. pxor %xmm5, %xmm4
  143. #endif
  144. pxor (BUF), %xmm1
  145. pxor 0x10(BUF), %xmm2
  146. pxor 0x20(BUF), %xmm3
  147. pxor 0x30(BUF), %xmm4
  148. sub $0x40, LEN
  149. add $0x40, BUF
  150. cmp $0x40, LEN
  151. jge loop_64
  152. less_64:/* Folding cache line into 128bit */
  153. #ifdef __x86_64__
  154. movdqa .Lconstant_R4R3(%rip), CONSTANT
  155. #else
  156. movdqa .Lconstant_R4R3, CONSTANT
  157. #endif
  158. prefetchnta (BUF)
  159. movdqa %xmm1, %xmm5
  160. PCLMULQDQ 0x00, CONSTANT, %xmm1
  161. PCLMULQDQ 0x11, CONSTANT, %xmm5
  162. pxor %xmm5, %xmm1
  163. pxor %xmm2, %xmm1
  164. movdqa %xmm1, %xmm5
  165. PCLMULQDQ 0x00, CONSTANT, %xmm1
  166. PCLMULQDQ 0x11, CONSTANT, %xmm5
  167. pxor %xmm5, %xmm1
  168. pxor %xmm3, %xmm1
  169. movdqa %xmm1, %xmm5
  170. PCLMULQDQ 0x00, CONSTANT, %xmm1
  171. PCLMULQDQ 0x11, CONSTANT, %xmm5
  172. pxor %xmm5, %xmm1
  173. pxor %xmm4, %xmm1
  174. cmp $0x10, LEN
  175. jb fold_64
  176. loop_16:/* Folding rest buffer into 128bit */
  177. movdqa %xmm1, %xmm5
  178. PCLMULQDQ 0x00, CONSTANT, %xmm1
  179. PCLMULQDQ 0x11, CONSTANT, %xmm5
  180. pxor %xmm5, %xmm1
  181. pxor (BUF), %xmm1
  182. sub $0x10, LEN
  183. add $0x10, BUF
  184. cmp $0x10, LEN
  185. jge loop_16
  186. fold_64:
  187. /* perform the last 64 bit fold, also adds 32 zeroes
  188. * to the input stream */
  189. PCLMULQDQ 0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
  190. psrldq $0x08, %xmm1
  191. pxor CONSTANT, %xmm1
  192. /* final 32-bit fold */
  193. movdqa %xmm1, %xmm2
  194. #ifdef __x86_64__
  195. movdqa .Lconstant_R5(%rip), CONSTANT
  196. movdqa .Lconstant_mask32(%rip), %xmm3
  197. #else
  198. movdqa .Lconstant_R5, CONSTANT
  199. movdqa .Lconstant_mask32, %xmm3
  200. #endif
  201. psrldq $0x04, %xmm2
  202. pand %xmm3, %xmm1
  203. PCLMULQDQ 0x00, CONSTANT, %xmm1
  204. pxor %xmm2, %xmm1
  205. /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
  206. #ifdef __x86_64__
  207. movdqa .Lconstant_RUpoly(%rip), CONSTANT
  208. #else
  209. movdqa .Lconstant_RUpoly, CONSTANT
  210. #endif
  211. movdqa %xmm1, %xmm2
  212. pand %xmm3, %xmm1
  213. PCLMULQDQ 0x10, CONSTANT, %xmm1
  214. pand %xmm3, %xmm1
  215. PCLMULQDQ 0x00, CONSTANT, %xmm1
  216. pxor %xmm2, %xmm1
  217. PEXTRD 0x01, %xmm1, %eax
  218. ret
  219. ENDPROC(crc32_pclmul_le_16)