// crct10dif-ce-core.S
//
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses.  You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Function API:
//     UINT16 crc_t10dif_pcl(
//         UINT16 init_crc,          // initial CRC value, 16 bits
//         const unsigned char *buf, // buffer pointer to calculate CRC on
//         UINT64 len                // buffer length in bytes (64-bit data)
//     );
//
// Reference paper titled "Fast CRC Computation for Generic
// Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents
//   /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
//
  69. #include <linux/linkage.h>
  70. #include <asm/assembler.h>
  71. .text
  72. .cpu generic+crypto
  73. arg1_low32 .req w19
  74. arg2 .req x20
  75. arg3 .req x21
  76. vzr .req v13
  77. ENTRY(crc_t10dif_pmull)
  78. frame_push 3, 128
  79. mov arg1_low32, w0
  80. mov arg2, x1
  81. mov arg3, x2
  82. movi vzr.16b, #0 // init zero register
  83. // adjust the 16-bit initial_crc value, scale it to 32 bits
  84. lsl arg1_low32, arg1_low32, #16
  85. // check if smaller than 256
  86. cmp arg3, #256
  87. // for sizes less than 128, we can't fold 64B at a time...
  88. b.lt _less_than_128
  89. // load the initial crc value
  90. // crc value does not need to be byte-reflected, but it needs
  91. // to be moved to the high part of the register.
  92. // because data will be byte-reflected and will align with
  93. // initial crc at correct place.
  94. movi v10.16b, #0
  95. mov v10.s[3], arg1_low32 // initial crc
  96. // receive the initial 64B data, xor the initial crc value
  97. ldp q0, q1, [arg2]
  98. ldp q2, q3, [arg2, #0x20]
  99. ldp q4, q5, [arg2, #0x40]
  100. ldp q6, q7, [arg2, #0x60]
  101. add arg2, arg2, #0x80
  102. CPU_LE( rev64 v0.16b, v0.16b )
  103. CPU_LE( rev64 v1.16b, v1.16b )
  104. CPU_LE( rev64 v2.16b, v2.16b )
  105. CPU_LE( rev64 v3.16b, v3.16b )
  106. CPU_LE( rev64 v4.16b, v4.16b )
  107. CPU_LE( rev64 v5.16b, v5.16b )
  108. CPU_LE( rev64 v6.16b, v6.16b )
  109. CPU_LE( rev64 v7.16b, v7.16b )
  110. CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
  111. CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
  112. CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 )
  113. CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 )
  114. CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 )
  115. CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 )
  116. CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 )
  117. CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
  118. // XOR the initial_crc value
  119. eor v0.16b, v0.16b, v10.16b
  120. ldr_l q10, rk3, x8 // xmm10 has rk3 and rk4
  121. // type of pmull instruction
  122. // will determine which constant to use
  123. //
  124. // we subtract 256 instead of 128 to save one instruction from the loop
  125. //
  126. sub arg3, arg3, #256
  127. // at this section of the code, there is 64*x+y (0<=y<64) bytes of
  128. // buffer. The _fold_64_B_loop will fold 64B at a time
  129. // until we have 64+y Bytes of buffer
  130. // fold 64B at a time. This section of the code folds 4 vector
  131. // registers in parallel
  132. _fold_64_B_loop:
  133. .macro fold64, reg1, reg2
  134. ldp q11, q12, [arg2], #0x20
  135. pmull2 v8.1q, \reg1\().2d, v10.2d
  136. pmull \reg1\().1q, \reg1\().1d, v10.1d
  137. CPU_LE( rev64 v11.16b, v11.16b )
  138. CPU_LE( rev64 v12.16b, v12.16b )
  139. pmull2 v9.1q, \reg2\().2d, v10.2d
  140. pmull \reg2\().1q, \reg2\().1d, v10.1d
  141. CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
  142. CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
  143. eor \reg1\().16b, \reg1\().16b, v8.16b
  144. eor \reg2\().16b, \reg2\().16b, v9.16b
  145. eor \reg1\().16b, \reg1\().16b, v11.16b
  146. eor \reg2\().16b, \reg2\().16b, v12.16b
  147. .endm
  148. fold64 v0, v1
  149. fold64 v2, v3
  150. fold64 v4, v5
  151. fold64 v6, v7
  152. subs arg3, arg3, #128
  153. // check if there is another 64B in the buffer to be able to fold
  154. b.lt _fold_64_B_end
  155. if_will_cond_yield_neon
  156. stp q0, q1, [sp, #.Lframe_local_offset]
  157. stp q2, q3, [sp, #.Lframe_local_offset + 32]
  158. stp q4, q5, [sp, #.Lframe_local_offset + 64]
  159. stp q6, q7, [sp, #.Lframe_local_offset + 96]
  160. do_cond_yield_neon
  161. ldp q0, q1, [sp, #.Lframe_local_offset]
  162. ldp q2, q3, [sp, #.Lframe_local_offset + 32]
  163. ldp q4, q5, [sp, #.Lframe_local_offset + 64]
  164. ldp q6, q7, [sp, #.Lframe_local_offset + 96]
  165. ldr_l q10, rk3, x8
  166. movi vzr.16b, #0 // init zero register
  167. endif_yield_neon
  168. b _fold_64_B_loop
  169. _fold_64_B_end:
  170. // at this point, the buffer pointer is pointing at the last y Bytes
  171. // of the buffer the 64B of folded data is in 4 of the vector
  172. // registers: v0, v1, v2, v3
  173. // fold the 8 vector registers to 1 vector register with different
  174. // constants
  175. ldr_l q10, rk9, x8
  176. .macro fold16, reg, rk
  177. pmull v8.1q, \reg\().1d, v10.1d
  178. pmull2 \reg\().1q, \reg\().2d, v10.2d
  179. .ifnb \rk
  180. ldr_l q10, \rk, x8
  181. .endif
  182. eor v7.16b, v7.16b, v8.16b
  183. eor v7.16b, v7.16b, \reg\().16b
  184. .endm
  185. fold16 v0, rk11
  186. fold16 v1, rk13
  187. fold16 v2, rk15
  188. fold16 v3, rk17
  189. fold16 v4, rk19
  190. fold16 v5, rk1
  191. fold16 v6
  192. // instead of 64, we add 48 to the loop counter to save 1 instruction
  193. // from the loop instead of a cmp instruction, we use the negative
  194. // flag with the jl instruction
  195. adds arg3, arg3, #(128-16)
  196. b.lt _final_reduction_for_128
  197. // now we have 16+y bytes left to reduce. 16 Bytes is in register v7
  198. // and the rest is in memory. We can fold 16 bytes at a time if y>=16
  199. // continue folding 16B at a time
  200. _16B_reduction_loop:
  201. pmull v8.1q, v7.1d, v10.1d
  202. pmull2 v7.1q, v7.2d, v10.2d
  203. eor v7.16b, v7.16b, v8.16b
  204. ldr q0, [arg2], #16
  205. CPU_LE( rev64 v0.16b, v0.16b )
  206. CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
  207. eor v7.16b, v7.16b, v0.16b
  208. subs arg3, arg3, #16
  209. // instead of a cmp instruction, we utilize the flags with the
  210. // jge instruction equivalent of: cmp arg3, 16-16
  211. // check if there is any more 16B in the buffer to be able to fold
  212. b.ge _16B_reduction_loop
  213. // now we have 16+z bytes left to reduce, where 0<= z < 16.
  214. // first, we reduce the data in the xmm7 register
  215. _final_reduction_for_128:
  216. // check if any more data to fold. If not, compute the CRC of
  217. // the final 128 bits
  218. adds arg3, arg3, #16
  219. b.eq _128_done
  220. // here we are getting data that is less than 16 bytes.
  221. // since we know that there was data before the pointer, we can
  222. // offset the input pointer before the actual point, to receive
  223. // exactly 16 bytes. after that the registers need to be adjusted.
  224. _get_last_two_regs:
  225. add arg2, arg2, arg3
  226. ldr q1, [arg2, #-16]
  227. CPU_LE( rev64 v1.16b, v1.16b )
  228. CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
  229. // get rid of the extra data that was loaded before
  230. // load the shift constant
  231. adr_l x4, tbl_shf_table + 16
  232. sub x4, x4, arg3
  233. ld1 {v0.16b}, [x4]
  234. // shift v2 to the left by arg3 bytes
  235. tbl v2.16b, {v7.16b}, v0.16b
  236. // shift v7 to the right by 16-arg3 bytes
  237. movi v9.16b, #0x80
  238. eor v0.16b, v0.16b, v9.16b
  239. tbl v7.16b, {v7.16b}, v0.16b
  240. // blend
  241. sshr v0.16b, v0.16b, #7 // convert to 8-bit mask
  242. bsl v0.16b, v2.16b, v1.16b
  243. // fold 16 Bytes
  244. pmull v8.1q, v7.1d, v10.1d
  245. pmull2 v7.1q, v7.2d, v10.2d
  246. eor v7.16b, v7.16b, v8.16b
  247. eor v7.16b, v7.16b, v0.16b
  248. _128_done:
  249. // compute crc of a 128-bit value
  250. ldr_l q10, rk5, x8 // rk5 and rk6 in xmm10
  251. // 64b fold
  252. ext v0.16b, vzr.16b, v7.16b, #8
  253. mov v7.d[0], v7.d[1]
  254. pmull v7.1q, v7.1d, v10.1d
  255. eor v7.16b, v7.16b, v0.16b
  256. // 32b fold
  257. ext v0.16b, v7.16b, vzr.16b, #4
  258. mov v7.s[3], vzr.s[0]
  259. pmull2 v0.1q, v0.2d, v10.2d
  260. eor v7.16b, v7.16b, v0.16b
  261. // barrett reduction
  262. _barrett:
  263. ldr_l q10, rk7, x8
  264. mov v0.d[0], v7.d[1]
  265. pmull v0.1q, v0.1d, v10.1d
  266. ext v0.16b, vzr.16b, v0.16b, #12
  267. pmull2 v0.1q, v0.2d, v10.2d
  268. ext v0.16b, vzr.16b, v0.16b, #12
  269. eor v7.16b, v7.16b, v0.16b
  270. mov w0, v7.s[1]
  271. _cleanup:
  272. // scale the result back to 16 bits
  273. lsr x0, x0, #16
  274. frame_pop
  275. ret
  276. _less_than_128:
  277. cbz arg3, _cleanup
  278. movi v0.16b, #0
  279. mov v0.s[3], arg1_low32 // get the initial crc value
  280. ldr q7, [arg2], #0x10
  281. CPU_LE( rev64 v7.16b, v7.16b )
  282. CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
  283. eor v7.16b, v7.16b, v0.16b // xor the initial crc value
  284. cmp arg3, #16
  285. b.eq _128_done // exactly 16 left
  286. b.lt _less_than_16_left
  287. ldr_l q10, rk1, x8 // rk1 and rk2 in xmm10
  288. // update the counter. subtract 32 instead of 16 to save one
  289. // instruction from the loop
  290. subs arg3, arg3, #32
  291. b.ge _16B_reduction_loop
  292. add arg3, arg3, #16
  293. b _get_last_two_regs
  294. _less_than_16_left:
  295. // shl r9, 4
  296. adr_l x0, tbl_shf_table + 16
  297. sub x0, x0, arg3
  298. ld1 {v0.16b}, [x0]
  299. movi v9.16b, #0x80
  300. eor v0.16b, v0.16b, v9.16b
  301. tbl v7.16b, {v7.16b}, v0.16b
  302. b _128_done
  303. ENDPROC(crc_t10dif_pmull)
  304. // precomputed constants
  305. // these constants are precomputed from the poly:
  306. // 0x8bb70000 (0x8bb7 scaled to 32 bits)
  307. .section ".rodata", "a"
  308. .align 4
  309. // Q = 0x18BB70000
  310. // rk1 = 2^(32*3) mod Q << 32
  311. // rk2 = 2^(32*5) mod Q << 32
  312. // rk3 = 2^(32*15) mod Q << 32
  313. // rk4 = 2^(32*17) mod Q << 32
  314. // rk5 = 2^(32*3) mod Q << 32
  315. // rk6 = 2^(32*2) mod Q << 32
  316. // rk7 = floor(2^64/Q)
  317. // rk8 = Q
  318. rk1: .octa 0x06df0000000000002d56000000000000
  319. rk3: .octa 0x7cf50000000000009d9d000000000000
  320. rk5: .octa 0x13680000000000002d56000000000000
  321. rk7: .octa 0x000000018bb7000000000001f65a57f8
  322. rk9: .octa 0xbfd6000000000000ceae000000000000
  323. rk11: .octa 0x713c0000000000001e16000000000000
  324. rk13: .octa 0x80a6000000000000f7f9000000000000
  325. rk15: .octa 0xe658000000000000044c000000000000
  326. rk17: .octa 0xa497000000000000ad18000000000000
  327. rk19: .octa 0xe7b50000000000006ee3000000000000
  328. tbl_shf_table:
  329. // use these values for shift constants for the tbl/tbx instruction
  330. // different alignments result in values as shown:
  331. // DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
  332. // DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
  333. // DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
  334. // DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
  335. // DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
  336. // DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
  337. // DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
  338. // DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
  339. // DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
  340. // DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
  341. // DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
  342. // DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
  343. // DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
  344. // DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
  345. // DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
  346. .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
  347. .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
  348. .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
  349. .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0