chacha20-neon-core.S

/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

        .text
        .align  6

ENTRY(chacha20_block_xor_neon)
        // x0: Input state matrix, s
        // x1: 1 data block output, o
        // x2: 1 data block input, i
        //
        // This function encrypts one ChaCha20 block by loading the state
        // matrix into four NEON registers. It performs matrix operations on
        // four words in parallel, but requires shuffling to rearrange the
        // words after each round.
        //
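        // For reference, each group of instructions below implements one
        // step of the ChaCha20 quarter-round, which in C-style pseudocode
        // (a sketch for orientation, not part of this file) reads:
        //
        //      a += b; d ^= a; d = rol32(d, 16);
        //      c += d; b ^= c; b = rol32(b, 12);
        //      a += b; d ^= a; d = rol32(d,  8);
        //      c += d; b ^= c; b = rol32(b,  7);
        //
        // The rotations are realized with rev32 (rotate by 16), shl+sri
        // (rotates by 12 and 7), and a tbl byte permutation through the
        // ROT8 table (rotate by 8).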
        // x0..3 = s0..3
        adr     x3, ROT8
        ld1     {v0.4s-v3.4s}, [x0]
        ld1     {v8.4s-v11.4s}, [x0]
        ld1     {v12.4s}, [x3]

        mov     x3, #10
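        // Ten iterations of the double round below yield the 20 rounds of
        // ChaCha20.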

.Ldoubleround:
        // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        add     v0.4s, v0.4s, v1.4s
        eor     v3.16b, v3.16b, v0.16b
        rev32   v3.8h, v3.8h

        // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        add     v2.4s, v2.4s, v3.4s
        eor     v4.16b, v1.16b, v2.16b
        shl     v1.4s, v4.4s, #12
        sri     v1.4s, v4.4s, #20

        // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        add     v0.4s, v0.4s, v1.4s
        eor     v3.16b, v3.16b, v0.16b
        tbl     v3.16b, {v3.16b}, v12.16b

        // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        add     v2.4s, v2.4s, v3.4s
        eor     v4.16b, v1.16b, v2.16b
        shl     v1.4s, v4.4s, #7
        sri     v1.4s, v4.4s, #25

        // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        ext     v1.16b, v1.16b, v1.16b, #4
        // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        ext     v2.16b, v2.16b, v2.16b, #8
        // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        ext     v3.16b, v3.16b, v3.16b, #12
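        // The three ext rotations above realign the rows so that the second
        // half of the double round operates on the diagonals of the state
        // matrix; the mirror-image ext sequence at the end of the loop
        // restores the column layout.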

        // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        add     v0.4s, v0.4s, v1.4s
        eor     v3.16b, v3.16b, v0.16b
        rev32   v3.8h, v3.8h

        // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        add     v2.4s, v2.4s, v3.4s
        eor     v4.16b, v1.16b, v2.16b
        shl     v1.4s, v4.4s, #12
        sri     v1.4s, v4.4s, #20

        // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        add     v0.4s, v0.4s, v1.4s
        eor     v3.16b, v3.16b, v0.16b
        tbl     v3.16b, {v3.16b}, v12.16b

        // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        add     v2.4s, v2.4s, v3.4s
        eor     v4.16b, v1.16b, v2.16b
        shl     v1.4s, v4.4s, #7
        sri     v1.4s, v4.4s, #25

        // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        ext     v1.16b, v1.16b, v1.16b, #12
        // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        ext     v2.16b, v2.16b, v2.16b, #8
        // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        ext     v3.16b, v3.16b, v3.16b, #4

        subs    x3, x3, #1
        b.ne    .Ldoubleround

        ld1     {v4.16b-v7.16b}, [x2]

        // o0 = i0 ^ (x0 + s0)
        add     v0.4s, v0.4s, v8.4s
        eor     v0.16b, v0.16b, v4.16b

        // o1 = i1 ^ (x1 + s1)
        add     v1.4s, v1.4s, v9.4s
        eor     v1.16b, v1.16b, v5.16b

        // o2 = i2 ^ (x2 + s2)
        add     v2.4s, v2.4s, v10.4s
        eor     v2.16b, v2.16b, v6.16b

        // o3 = i3 ^ (x3 + s3)
        add     v3.4s, v3.4s, v11.4s
        eor     v3.16b, v3.16b, v7.16b

        st1     {v0.16b-v3.16b}, [x1]
        ret
ENDPROC(chacha20_block_xor_neon)
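// From C, this routine is typically declared along the lines of (a sketch;
// the authoritative prototype lives in the accompanying glue code, not
// in this file):
//
//      asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst,
//                                              const u8 *src);
//
// i.e. x0 points to the 16-word state, x1 to 64 bytes of output and x2 to
// 64 bytes of input.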

        .align  6
ENTRY(chacha20_4block_xor_neon)
        // x0: Input state matrix, s
        // x1: 4 data blocks output, o
        // x2: 4 data blocks input, i
        //
        // This function encrypts four consecutive ChaCha20 blocks by loading
        // the state matrix into NEON registers four times. The algorithm
        // performs each operation on the corresponding word of each state
        // matrix, and hence requires no word shuffling. For the final XOR
        // step we transpose the matrix by interleaving 32-bit and then 64-bit
        // words, which allows us to do the XOR in NEON registers.
        //
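        // The ld4r instructions below broadcast each 32-bit state word to
        // all four lanes of one register, so after the loads register v<n>
        // holds state word n and lane j of every register belongs to
        // block j (once the per-lane counters have been added to v12).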
        adr     x3, CTRINC              // ... and ROT8
        ld1     {v30.4s-v31.4s}, [x3]

        // x0..15[0-3] = s0..3[0..3]
        mov     x4, x0
        ld4r    { v0.4s- v3.4s}, [x4], #16
        ld4r    { v4.4s- v7.4s}, [x4], #16
        ld4r    { v8.4s-v11.4s}, [x4], #16
        ld4r    {v12.4s-v15.4s}, [x4]

        // x12 += counter values 0-3
        add     v12.4s, v12.4s, v30.4s
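        // Each lane of v12 now carries a distinct block counter (counter+0
        // through counter+3), so the four lanes produce keystream for four
        // consecutive blocks.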

        mov     x3, #10

.Ldoubleround4:
        // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        add     v0.4s, v0.4s, v4.4s
        add     v1.4s, v1.4s, v5.4s
        add     v2.4s, v2.4s, v6.4s
        add     v3.4s, v3.4s, v7.4s

        eor     v12.16b, v12.16b, v0.16b
        eor     v13.16b, v13.16b, v1.16b
        eor     v14.16b, v14.16b, v2.16b
        eor     v15.16b, v15.16b, v3.16b

        rev32   v12.8h, v12.8h
        rev32   v13.8h, v13.8h
        rev32   v14.8h, v14.8h
        rev32   v15.8h, v15.8h

        // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        add     v8.4s, v8.4s, v12.4s
        add     v9.4s, v9.4s, v13.4s
        add     v10.4s, v10.4s, v14.4s
        add     v11.4s, v11.4s, v15.4s

        eor     v16.16b, v4.16b, v8.16b
        eor     v17.16b, v5.16b, v9.16b
        eor     v18.16b, v6.16b, v10.16b
        eor     v19.16b, v7.16b, v11.16b

        shl     v4.4s, v16.4s, #12
        shl     v5.4s, v17.4s, #12
        shl     v6.4s, v18.4s, #12
        shl     v7.4s, v19.4s, #12

        sri     v4.4s, v16.4s, #20
        sri     v5.4s, v17.4s, #20
        sri     v6.4s, v18.4s, #20
        sri     v7.4s, v19.4s, #20

        // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        add     v0.4s, v0.4s, v4.4s
        add     v1.4s, v1.4s, v5.4s
        add     v2.4s, v2.4s, v6.4s
        add     v3.4s, v3.4s, v7.4s

        eor     v12.16b, v12.16b, v0.16b
        eor     v13.16b, v13.16b, v1.16b
        eor     v14.16b, v14.16b, v2.16b
        eor     v15.16b, v15.16b, v3.16b

        tbl     v12.16b, {v12.16b}, v31.16b
        tbl     v13.16b, {v13.16b}, v31.16b
        tbl     v14.16b, {v14.16b}, v31.16b
        tbl     v15.16b, {v15.16b}, v31.16b

        // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        add     v8.4s, v8.4s, v12.4s
        add     v9.4s, v9.4s, v13.4s
        add     v10.4s, v10.4s, v14.4s
        add     v11.4s, v11.4s, v15.4s

        eor     v16.16b, v4.16b, v8.16b
        eor     v17.16b, v5.16b, v9.16b
        eor     v18.16b, v6.16b, v10.16b
        eor     v19.16b, v7.16b, v11.16b

        shl     v4.4s, v16.4s, #7
        shl     v5.4s, v17.4s, #7
        shl     v6.4s, v18.4s, #7
        shl     v7.4s, v19.4s, #7

        sri     v4.4s, v16.4s, #25
        sri     v5.4s, v17.4s, #25
        sri     v6.4s, v18.4s, #25
        sri     v7.4s, v19.4s, #25

        // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        add     v0.4s, v0.4s, v5.4s
        add     v1.4s, v1.4s, v6.4s
        add     v2.4s, v2.4s, v7.4s
        add     v3.4s, v3.4s, v4.4s

        eor     v15.16b, v15.16b, v0.16b
        eor     v12.16b, v12.16b, v1.16b
        eor     v13.16b, v13.16b, v2.16b
        eor     v14.16b, v14.16b, v3.16b

        rev32   v15.8h, v15.8h
        rev32   v12.8h, v12.8h
        rev32   v13.8h, v13.8h
        rev32   v14.8h, v14.8h

        // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        add     v10.4s, v10.4s, v15.4s
        add     v11.4s, v11.4s, v12.4s
        add     v8.4s, v8.4s, v13.4s
        add     v9.4s, v9.4s, v14.4s

        eor     v16.16b, v5.16b, v10.16b
        eor     v17.16b, v6.16b, v11.16b
        eor     v18.16b, v7.16b, v8.16b
        eor     v19.16b, v4.16b, v9.16b

        shl     v5.4s, v16.4s, #12
        shl     v6.4s, v17.4s, #12
        shl     v7.4s, v18.4s, #12
        shl     v4.4s, v19.4s, #12

        sri     v5.4s, v16.4s, #20
        sri     v6.4s, v17.4s, #20
        sri     v7.4s, v18.4s, #20
        sri     v4.4s, v19.4s, #20

        // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        add     v0.4s, v0.4s, v5.4s
        add     v1.4s, v1.4s, v6.4s
        add     v2.4s, v2.4s, v7.4s
        add     v3.4s, v3.4s, v4.4s

        eor     v15.16b, v15.16b, v0.16b
        eor     v12.16b, v12.16b, v1.16b
        eor     v13.16b, v13.16b, v2.16b
        eor     v14.16b, v14.16b, v3.16b

        tbl     v15.16b, {v15.16b}, v31.16b
        tbl     v12.16b, {v12.16b}, v31.16b
        tbl     v13.16b, {v13.16b}, v31.16b
        tbl     v14.16b, {v14.16b}, v31.16b

        // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        add     v10.4s, v10.4s, v15.4s
        add     v11.4s, v11.4s, v12.4s
        add     v8.4s, v8.4s, v13.4s
        add     v9.4s, v9.4s, v14.4s

        eor     v16.16b, v5.16b, v10.16b
        eor     v17.16b, v6.16b, v11.16b
        eor     v18.16b, v7.16b, v8.16b
        eor     v19.16b, v4.16b, v9.16b

        shl     v5.4s, v16.4s, #7
        shl     v6.4s, v17.4s, #7
        shl     v7.4s, v18.4s, #7
        shl     v4.4s, v19.4s, #7

        sri     v5.4s, v16.4s, #25
        sri     v6.4s, v17.4s, #25
        sri     v7.4s, v18.4s, #25
        sri     v4.4s, v19.4s, #25

        subs    x3, x3, #1
        b.ne    .Ldoubleround4

        ld4r    {v16.4s-v19.4s}, [x0], #16
        ld4r    {v20.4s-v23.4s}, [x0], #16

        // x12 += counter values 0-3
        add     v12.4s, v12.4s, v30.4s

        // x0[0-3] += s0[0]
        // x1[0-3] += s0[1]
        // x2[0-3] += s0[2]
        // x3[0-3] += s0[3]
        add     v0.4s, v0.4s, v16.4s
        add     v1.4s, v1.4s, v17.4s
        add     v2.4s, v2.4s, v18.4s
        add     v3.4s, v3.4s, v19.4s

        ld4r    {v24.4s-v27.4s}, [x0], #16
        ld4r    {v28.4s-v31.4s}, [x0]

        // x4[0-3] += s1[0]
        // x5[0-3] += s1[1]
        // x6[0-3] += s1[2]
        // x7[0-3] += s1[3]
        add     v4.4s, v4.4s, v20.4s
        add     v5.4s, v5.4s, v21.4s
        add     v6.4s, v6.4s, v22.4s
        add     v7.4s, v7.4s, v23.4s

        // x8[0-3] += s2[0]
        // x9[0-3] += s2[1]
        // x10[0-3] += s2[2]
        // x11[0-3] += s2[3]
        add     v8.4s, v8.4s, v24.4s
        add     v9.4s, v9.4s, v25.4s
        add     v10.4s, v10.4s, v26.4s
        add     v11.4s, v11.4s, v27.4s

        // x12[0-3] += s3[0]
        // x13[0-3] += s3[1]
        // x14[0-3] += s3[2]
        // x15[0-3] += s3[3]
        add     v12.4s, v12.4s, v28.4s
        add     v13.4s, v13.4s, v29.4s
        add     v14.4s, v14.4s, v30.4s
        add     v15.4s, v15.4s, v31.4s
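        // At this point lane j of register v<n> holds word n of block j. The
        // zip1/zip2 sequences below transpose that layout so that v0..v3 end
        // up holding the 64 bytes of block 0, v4..v7 block 1, v8..v11
        // block 2 and v12..v15 block 3, matching the order in which the
        // input is loaded and the output is stored.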

        // interleave 32-bit words in state n, n+1
        zip1    v16.4s, v0.4s, v1.4s
        zip2    v17.4s, v0.4s, v1.4s
        zip1    v18.4s, v2.4s, v3.4s
        zip2    v19.4s, v2.4s, v3.4s
        zip1    v20.4s, v4.4s, v5.4s
        zip2    v21.4s, v4.4s, v5.4s
        zip1    v22.4s, v6.4s, v7.4s
        zip2    v23.4s, v6.4s, v7.4s
        zip1    v24.4s, v8.4s, v9.4s
        zip2    v25.4s, v8.4s, v9.4s
        zip1    v26.4s, v10.4s, v11.4s
        zip2    v27.4s, v10.4s, v11.4s
        zip1    v28.4s, v12.4s, v13.4s
        zip2    v29.4s, v12.4s, v13.4s
        zip1    v30.4s, v14.4s, v15.4s
        zip2    v31.4s, v14.4s, v15.4s

        // interleave 64-bit words in state n, n+2
        zip1    v0.2d, v16.2d, v18.2d
        zip2    v4.2d, v16.2d, v18.2d
        zip1    v8.2d, v17.2d, v19.2d
        zip2    v12.2d, v17.2d, v19.2d
        ld1     {v16.16b-v19.16b}, [x2], #64

        zip1    v1.2d, v20.2d, v22.2d
        zip2    v5.2d, v20.2d, v22.2d
        zip1    v9.2d, v21.2d, v23.2d
        zip2    v13.2d, v21.2d, v23.2d
        ld1     {v20.16b-v23.16b}, [x2], #64

        zip1    v2.2d, v24.2d, v26.2d
        zip2    v6.2d, v24.2d, v26.2d
        zip1    v10.2d, v25.2d, v27.2d
        zip2    v14.2d, v25.2d, v27.2d
        ld1     {v24.16b-v27.16b}, [x2], #64

        zip1    v3.2d, v28.2d, v30.2d
        zip2    v7.2d, v28.2d, v30.2d
        zip1    v11.2d, v29.2d, v31.2d
        zip2    v15.2d, v29.2d, v31.2d
        ld1     {v28.16b-v31.16b}, [x2]

        // xor with corresponding input, write to output
        eor     v16.16b, v16.16b, v0.16b
        eor     v17.16b, v17.16b, v1.16b
        eor     v18.16b, v18.16b, v2.16b
        eor     v19.16b, v19.16b, v3.16b
        eor     v20.16b, v20.16b, v4.16b
        eor     v21.16b, v21.16b, v5.16b
        st1     {v16.16b-v19.16b}, [x1], #64
        eor     v22.16b, v22.16b, v6.16b
        eor     v23.16b, v23.16b, v7.16b
        eor     v24.16b, v24.16b, v8.16b
        eor     v25.16b, v25.16b, v9.16b
        st1     {v20.16b-v23.16b}, [x1], #64
        eor     v26.16b, v26.16b, v10.16b
        eor     v27.16b, v27.16b, v11.16b
        eor     v28.16b, v28.16b, v12.16b
        st1     {v24.16b-v27.16b}, [x1], #64
        eor     v29.16b, v29.16b, v13.16b
        eor     v30.16b, v30.16b, v14.16b
        eor     v31.16b, v31.16b, v15.16b
        st1     {v28.16b-v31.16b}, [x1]

        ret
ENDPROC(chacha20_4block_xor_neon)
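// CTRINC supplies the per-lane counter increments added to state word 12;
// ROT8 is a tbl byte-permutation table that rotates each 32-bit lane left
// by eight bits.
//
// From C, this routine is typically declared along the lines of (a sketch;
// the authoritative prototype lives in the accompanying glue code):
//
//      asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst,
//                                               const u8 *src);
//
// with x1 and x2 each referring to 4 x 64 = 256 bytes of data.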

CTRINC: .word   0, 1, 2, 3
ROT8:   .word   0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f