/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
/* included by aes-ce.S and aes-neon.S */

	.text
	.align		4
/*
 * There are several ways to instantiate this code:
 * - no interleave, all inline
 * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
 * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
 * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
 * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
 *
 * Macros imported by this code:
 * - enc_prepare - setup NEON registers for encryption
 * - dec_prepare - setup NEON registers for decryption
 * - enc_switch_key - change to new key after having prepared for encryption
 * - encrypt_block - encrypt a single block
 * - decrypt_block - decrypt a single block
 * - encrypt_block2x - encrypt 2 blocks in parallel (if INTERLEAVE == 2)
 * - decrypt_block2x - decrypt 2 blocks in parallel (if INTERLEAVE == 2)
 * - encrypt_block4x - encrypt 4 blocks in parallel (if INTERLEAVE == 4)
 * - decrypt_block4x - decrypt 4 blocks in parallel (if INTERLEAVE == 4)
 */
#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
/*
 * Out-of-line interleaved block helpers: the 2x/4x primitives are emitted
 * once and reached via 'bl', so the mode loops below must set up a frame
 * record -- 'bl' clobbers x30 (LR).  Note that in GNU AArch64 assembly,
 * ';' is a statement separator, so each #define below expands to two
 * instructions.
 */
#define FRAME_PUSH	stp x29, x30, [sp,#-16]! ; mov x29, sp
#define FRAME_POP	ldp x29, x30, [sp],#16

#if INTERLEAVE == 2

aes_encrypt_block2x:
	encrypt_block2x	v0, v1, w3, x2, x6, w7
	ret
ENDPROC(aes_encrypt_block2x)

aes_decrypt_block2x:
	decrypt_block2x	v0, v1, w3, x2, x6, w7
	ret
ENDPROC(aes_decrypt_block2x)

#elif INTERLEAVE == 4

aes_encrypt_block4x:
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	ret
ENDPROC(aes_encrypt_block4x)

aes_decrypt_block4x:
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	ret
ENDPROC(aes_decrypt_block4x)

#else
#error INTERLEAVE should equal 2 or 4
#endif

	/* call the shared out-of-line helpers */
	.macro		do_encrypt_block2x
	bl		aes_encrypt_block2x
	.endm

	.macro		do_decrypt_block2x
	bl		aes_decrypt_block2x
	.endm

	.macro		do_encrypt_block4x
	bl		aes_encrypt_block4x
	.endm

	.macro		do_decrypt_block4x
	bl		aes_decrypt_block4x
	.endm

#else
/* all-inline variants: no calls are made, so no frame record is needed */
#define FRAME_PUSH
#define FRAME_POP

	.macro		do_encrypt_block2x
	encrypt_block2x	v0, v1, w3, x2, x6, w7
	.endm

	.macro		do_decrypt_block2x
	decrypt_block2x	v0, v1, w3, x2, x6, w7
	.endm

	.macro		do_encrypt_block4x
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	.endm

	.macro		do_decrypt_block4x
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	.endm
#endif
/*
 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks, int first)
 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks, int first)
 */
AES_ENTRY(aes_ecb_encrypt)
	/* x0=out, x1=in, x2=round keys, w3=rounds, w4=blocks, w5=first */
	FRAME_PUSH
	cbz		w5, .LecbencloopNx	/* keys already loaded? */

	enc_prepare	w3, x2, x5		/* load round keys on first call */
.LecbencloopNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE	/* >= INTERLEAVE blocks left? */
	bmi		.Lecbenc1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */
	do_encrypt_block2x
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	do_encrypt_block4x
	st1		{v0.16b-v3.16b}, [x0], #64
#endif
	b		.LecbencloopNx
.Lecbenc1x:
	adds		w4, w4, #INTERLEAVE	/* restore residual count */
	beq		.Lecbencout
#endif
	/* one-block-at-a-time tail (or whole input when not interleaved) */
.Lecbencloop:
	ld1		{v0.16b}, [x1], #16	/* get next pt block */
	encrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbencloop
.Lecbencout:
	FRAME_POP
	ret
AES_ENDPROC(aes_ecb_encrypt)
AES_ENTRY(aes_ecb_decrypt)
	/* x0=out, x1=in, x2=round keys, w3=rounds, w4=blocks, w5=first */
	FRAME_PUSH
	cbz		w5, .LecbdecloopNx	/* keys already loaded? */

	dec_prepare	w3, x2, x5		/* load round keys on first call */
.LecbdecloopNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE	/* >= INTERLEAVE blocks left? */
	bmi		.Lecbdec1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	do_decrypt_block2x
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	do_decrypt_block4x
	st1		{v0.16b-v3.16b}, [x0], #64
#endif
	b		.LecbdecloopNx
.Lecbdec1x:
	adds		w4, w4, #INTERLEAVE	/* restore residual count */
	beq		.Lecbdecout
#endif
	/* one-block-at-a-time tail (or whole input when not interleaved) */
.Lecbdecloop:
	ld1		{v0.16b}, [x1], #16	/* get next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	FRAME_POP
	ret
AES_ENDPROC(aes_ecb_decrypt)
/*
 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks, u8 iv[], int first)
 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks, u8 iv[], int first)
 */
AES_ENTRY(aes_cbc_encrypt)
	/*
	 * x0=out, x1=in, x2=round keys, w3=rounds, w4=blocks, x5=iv,
	 * w6=first.  CBC encryption is inherently sequential (each block
	 * chains into the next), so no interleaved path exists here.
	 */
	cbz		w6, .Lcbcencloop	/* keys/iv already loaded? */

	ld1		{v0.16b}, [x5]		/* get iv */
	enc_prepare	w3, x2, x6
.Lcbcencloop:
	ld1		{v1.16b}, [x1], #16	/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b	/* ..and xor with iv */
	encrypt_block	v0, w3, x2, x6, w7	/* v0 doubles as next iv */
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcencloop
	st1		{v0.16b}, [x5]		/* return iv */
	ret
AES_ENDPROC(aes_cbc_encrypt)
AES_ENTRY(aes_cbc_decrypt)
	/*
	 * x0=out, x1=in, x2=round keys, w3=rounds, w4=blocks, x5=iv,
	 * w6=first.  v7 holds the running iv (= previous ct block)
	 * across loop iterations.
	 */
	FRAME_PUSH
	cbz		w6, .LcbcdecloopNx	/* keys/iv already loaded? */

	ld1		{v7.16b}, [x5]		/* get iv */
	dec_prepare	w3, x2, x6
.LcbcdecloopNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE
	bmi		.Lcbcdec1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	/* keep ct copies: they become the chaining values after decrypt */
	mov		v2.16b, v0.16b
	mov		v3.16b, v1.16b
	do_decrypt_block2x
	eor		v0.16b, v0.16b, v7.16b	/* xor with prev iv */
	eor		v1.16b, v1.16b, v2.16b	/* xor with ct of block 0 */
	mov		v7.16b, v3.16b		/* last ct is next iv */
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	/* only 3 spare regs: reload the 4th ct block from memory below */
	mov		v4.16b, v0.16b
	mov		v5.16b, v1.16b
	mov		v6.16b, v2.16b
	do_decrypt_block4x
	sub		x1, x1, #16
	eor		v0.16b, v0.16b, v7.16b	/* xor with prev iv */
	eor		v1.16b, v1.16b, v4.16b
	ld1		{v7.16b}, [x1], #16	/* reload 1 ct block (next iv) */
	eor		v2.16b, v2.16b, v5.16b
	eor		v3.16b, v3.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
#endif
	b		.LcbcdecloopNx
.Lcbcdec1x:
	adds		w4, w4, #INTERLEAVE	/* restore residual count */
	beq		.Lcbcdecout
#endif
.Lcbcdecloop:
	ld1		{v1.16b}, [x1], #16	/* get next ct block */
	mov		v0.16b, v1.16b		/* ...and copy to v0 */
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v7.16b	/* xor with iv => pt */
	mov		v7.16b, v1.16b		/* ct is next iv */
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	FRAME_POP
	st1		{v7.16b}, [x5]		/* return iv */
	ret
AES_ENDPROC(aes_cbc_decrypt)
/*
 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks, u8 ctr[], int first)
 */
AES_ENTRY(aes_ctr_encrypt)
	/*
	 * x0=out, x1=in, x2=round keys, w3=rounds, w4=blocks, x5=ctr,
	 * w6=first.  v4 holds the big-endian counter block; x8 mirrors
	 * its low 64 bits, byte-swabbed to native order so it can be
	 * incremented with plain adds.
	 */
	FRAME_PUSH
	cbz		w6, .Lctrnotfirst	/* 1st time around? */
	enc_prepare	w3, x2, x6
	ld1		{v4.16b}, [x5]
.Lctrnotfirst:
	umov		x8, v4.d[1]		/* keep swabbed ctr in reg */
	rev		x8, x8
#if INTERLEAVE >= 2
	/*
	 * The Nx path only increments the low 32 bits of the counter;
	 * fall back to the scalar loop (which handles 64-bit carry) if
	 * this request could overflow them.
	 */
	cmn		w8, w4			/* 32 bit overflow? */
	bcs		.Lctrloop
.LctrloopNx:
	subs		w4, w4, #INTERLEAVE
	bmi		.Lctr1x
#if INTERLEAVE == 2
	/* build 2 counter blocks: ctr, ctr+1 */
	mov		v0.8b, v4.8b
	mov		v1.8b, v4.8b
	rev		x7, x8
	add		x8, x8, #1
	ins		v0.d[1], x7
	rev		x7, x8
	add		x8, x8, #1
	ins		v1.d[1], x7
	ld1		{v2.16b-v3.16b}, [x1], #32	/* get 2 input blocks */
	do_encrypt_block2x
	eor		v0.16b, v0.16b, v2.16b	/* keystream xor pt */
	eor		v1.16b, v1.16b, v3.16b
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	/* build 4 counter blocks: ctr, ctr+1, ctr+2, ctr+3 (vectorized) */
	ldr		q8, =0x30000000200000001	/* addends 1,2,3[,0] */
	dup		v7.4s, w8
	mov		v0.16b, v4.16b
	add		v7.4s, v7.4s, v8.4s
	mov		v1.16b, v4.16b
	rev32		v8.16b, v7.16b		/* back to big-endian lanes */
	mov		v2.16b, v4.16b
	mov		v3.16b, v4.16b
	mov		v1.s[3], v8.s[0]
	mov		v2.s[3], v8.s[1]
	mov		v3.s[3], v8.s[2]
	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
	do_encrypt_block4x
	eor		v0.16b, v5.16b, v0.16b	/* keystream xor pt */
	ld1		{v5.16b}, [x1], #16	/* get 1 input block */
	eor		v1.16b, v6.16b, v1.16b
	eor		v2.16b, v7.16b, v2.16b
	eor		v3.16b, v5.16b, v3.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	add		x8, x8, #INTERLEAVE
#endif
	rev		x7, x8			/* write ctr back into v4 */
	ins		v4.d[1], x7
	cbz		w4, .Lctrout
	b		.LctrloopNx
.Lctr1x:
	adds		w4, w4, #INTERLEAVE	/* restore residual count */
	beq		.Lctrout
#endif
.Lctrloop:
	mov		v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x6, w7	/* v0 = keystream block */

	adds		x8, x8, #1		/* increment BE ctr */
	rev		x7, x8
	ins		v4.d[1], x7		/* rev/ins leave flags intact */
	bcs		.Lctrcarry		/* overflow? */

.Lctrcarrydone:
	subs		w4, w4, #1
	bmi		.Lctrtailblock		/* blocks <0 means tail block */
	ld1		{v3.16b}, [x1], #16
	eor		v3.16b, v0.16b, v3.16b
	st1		{v3.16b}, [x0], #16
	bne		.Lctrloop
.Lctrout:
	st1		{v4.16b}, [x5]		/* return next CTR value */
	FRAME_POP
	ret

.Lctrtailblock:
	st1		{v0.16b}, [x0]		/* emit raw keystream block */
	FRAME_POP
	ret

.Lctrcarry:
	/* low 64 bits wrapped: propagate carry into the upper half */
	umov		x7, v4.d[0]		/* load upper word of ctr */
	rev		x7, x7			/* ... to handle the carry */
	add		x7, x7, #1
	rev		x7, x7
	ins		v4.d[0], x7
	b		.Lctrcarrydone
AES_ENDPROC(aes_ctr_encrypt)
	.ltorg					/* literal pool for 'ldr q8, =' */
/*
 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
 *		   int blocks, u8 const rk2[], u8 iv[], int first)
 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
 *		   int blocks, u8 const rk2[], u8 iv[], int first)
 */
	/*
	 * Compute the next XTS tweak: multiply \in by x in GF(2^128),
	 * i.e. shift left by one and conditionally xor in the reduction
	 * polynomial 0x87 when the top bit was set.  \const must hold
	 * .Lxts_mul_x; \tmp is clobbered.
	 */
	.macro		next_tweak, out, in, const, tmp
	sshr		\tmp\().2d, \in\().2d, #63	/* replicate top bits */
	and		\tmp\().16b, \tmp\().16b, \const\().16b
	add		\out\().2d, \in\().2d, \in\().2d	/* shift left */
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm

	/* { 1, 0x87 }: carry-in mask / GF(2^128) reduction polynomial */
.Lxts_mul_x:
CPU_LE(	.quad		1, 0x87		)
CPU_BE(	.quad		0x87, 1		)
AES_ENTRY(aes_xts_encrypt)
	/*
	 * x0=out, x1=in, x2=rk1 (data key), w3=rounds, w4=blocks,
	 * x5=rk2 (tweak key), x6=iv, w7=first.  v4 carries the current
	 * tweak; v7 holds the .Lxts_mul_x constant while in the Nx path.
	 */
	FRAME_PUSH
	cbz		w7, .LxtsencloopNx	/* tweak already derived? */

	ld1		{v4.16b}, [x6]
	enc_prepare	w3, x5, x6
	encrypt_block	v4, w3, x5, x6, w7	/* first tweak */
	enc_switch_key	w3, x2, x6		/* switch to the data key */
	ldr		q7, .Lxts_mul_x
	b		.LxtsencNx

.LxtsencloopNx:
	ldr		q7, .Lxts_mul_x		/* v7 was clobbered below */
	next_tweak	v4, v4, v7, v8
.LxtsencNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE
	bmi		.Lxtsenc1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b	/* pre-whiten with tweaks */
	eor		v1.16b, v1.16b, v5.16b
	do_encrypt_block2x
	eor		v0.16b, v0.16b, v4.16b	/* post-whiten */
	eor		v1.16b, v1.16b, v5.16b
	st1		{v0.16b-v1.16b}, [x0], #32
	cbz		w4, .LxtsencoutNx
	next_tweak	v4, v5, v7, v8
	b		.LxtsencNx
.LxtsencoutNx:
	mov		v4.16b, v5.16b
	b		.Lxtsencout
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b	/* pre-whiten with tweaks */
	next_tweak	v6, v5, v7, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v7, v8		/* overwrites the constant */
	eor		v3.16b, v3.16b, v7.16b
	do_encrypt_block4x
	eor		v3.16b, v3.16b, v7.16b	/* post-whiten */
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b		/* carry last tweak forward */
	cbz		w4, .Lxtsencout
	b		.LxtsencloopNx
#endif
.Lxtsenc1x:
	adds		w4, w4, #INTERLEAVE	/* restore residual count */
	beq		.Lxtsencout
#endif
.Lxtsencloop:
	ld1		{v1.16b}, [x1], #16
	eor		v0.16b, v1.16b, v4.16b	/* pre-whiten */
	encrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v4.16b	/* post-whiten */
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	beq		.Lxtsencout
	next_tweak	v4, v4, v7, v8
	b		.Lxtsencloop
.Lxtsencout:
	FRAME_POP
	ret
AES_ENDPROC(aes_xts_encrypt)
AES_ENTRY(aes_xts_decrypt)
	/*
	 * x0=out, x1=in, x2=rk1 (data key), w3=rounds, w4=blocks,
	 * x5=rk2 (tweak key), x6=iv, w7=first.  The tweak is always
	 * derived with *encryption* (per XTS); only the data blocks
	 * are decrypted.  v4 carries the current tweak.
	 */
	FRAME_PUSH
	cbz		w7, .LxtsdecloopNx	/* tweak already derived? */

	ld1		{v4.16b}, [x6]
	enc_prepare	w3, x5, x6
	encrypt_block	v4, w3, x5, x6, w7	/* first tweak */
	dec_prepare	w3, x2, x6		/* switch to the data key */
	ldr		q7, .Lxts_mul_x
	b		.LxtsdecNx

.LxtsdecloopNx:
	ldr		q7, .Lxts_mul_x		/* v7 was clobbered below */
	next_tweak	v4, v4, v7, v8
.LxtsdecNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE
	bmi		.Lxtsdec1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b	/* pre-whiten with tweaks */
	eor		v1.16b, v1.16b, v5.16b
	do_decrypt_block2x
	eor		v0.16b, v0.16b, v4.16b	/* post-whiten */
	eor		v1.16b, v1.16b, v5.16b
	st1		{v0.16b-v1.16b}, [x0], #32
	cbz		w4, .LxtsdecoutNx
	next_tweak	v4, v5, v7, v8
	b		.LxtsdecNx
.LxtsdecoutNx:
	mov		v4.16b, v5.16b
	b		.Lxtsdecout
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b	/* pre-whiten with tweaks */
	next_tweak	v6, v5, v7, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v7, v8		/* overwrites the constant */
	eor		v3.16b, v3.16b, v7.16b
	do_decrypt_block4x
	eor		v3.16b, v3.16b, v7.16b	/* post-whiten */
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b		/* carry last tweak forward */
	cbz		w4, .Lxtsdecout
	b		.LxtsdecloopNx
#endif
.Lxtsdec1x:
	adds		w4, w4, #INTERLEAVE	/* restore residual count */
	beq		.Lxtsdecout
#endif
.Lxtsdecloop:
	ld1		{v1.16b}, [x1], #16
	eor		v0.16b, v1.16b, v4.16b	/* pre-whiten */
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v4.16b	/* post-whiten */
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	beq		.Lxtsdecout
	next_tweak	v4, v4, v7, v8
	b		.Lxtsdecloop
.Lxtsdecout:
	FRAME_POP
	ret
AES_ENDPROC(aes_xts_decrypt)
/*
 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
 *		  int blocks, u8 dg[], int enc_before, int enc_after)
 */
AES_ENTRY(aes_mac_update)
	/*
	 * x0=in, x1=round keys, w2=rounds, w3=blocks, x4=dg (digest),
	 * w5=enc_before, w6=enc_after.  CBC-MAC style update: xor each
	 * input block into the digest, encrypting between blocks.
	 */
	ld1		{v0.16b}, [x4]		/* get dg */
	enc_prepare	w2, x1, x7
	cbnz		w5, .Lmacenc		/* encrypt dg before 1st xor? */

.Lmacloop:
	cbz		w3, .Lmacout
	ld1		{v1.16b}, [x0], #16	/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b	/* ..and xor with dg */

	subs		w3, w3, #1
	/* on the last block, encrypt only if enc_after != 0 */
	csinv		x5, x6, xzr, eq		/* x5 = (last) ? enc_after : ~0 */
	cbz		w5, .Lmacout

.Lmacenc:
	encrypt_block	v0, w2, x1, x7, w8
	b		.Lmacloop

.Lmacout:
	st1		{v0.16b}, [x4]		/* return dg */
	ret
AES_ENDPROC(aes_mac_update)