/* aes-modes.S */
/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
  10. /* included by aes-ce.S and aes-neon.S */
  11. .text
  12. .align 4
/*
 * There are several ways to instantiate this code:
 * - no interleave, all inline
 * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
 * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
 * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
 * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
 *
 * Macros imported by this code:
 * - enc_prepare - setup NEON registers for encryption
 * - dec_prepare - setup NEON registers for decryption
 * - enc_switch_key - change to new key after having prepared for encryption
 * - encrypt_block - encrypt a single block
 * - decrypt_block - decrypt a single block
 * - encrypt_block2x - encrypt 2 blocks in parallel (if INTERLEAVE == 2)
 * - decrypt_block2x - decrypt 2 blocks in parallel (if INTERLEAVE == 2)
 * - encrypt_block4x - encrypt 4 blocks in parallel (if INTERLEAVE == 4)
 * - decrypt_block4x - decrypt 4 blocks in parallel (if INTERLEAVE == 4)
 */
  32. #if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
  33. #define FRAME_PUSH stp x29, x30, [sp,#-16]! ; mov x29, sp
  34. #define FRAME_POP ldp x29, x30, [sp],#16
  35. #if INTERLEAVE == 2
  36. aes_encrypt_block2x:
  37. encrypt_block2x v0, v1, w3, x2, x6, w7
  38. ret
  39. ENDPROC(aes_encrypt_block2x)
  40. aes_decrypt_block2x:
  41. decrypt_block2x v0, v1, w3, x2, x6, w7
  42. ret
  43. ENDPROC(aes_decrypt_block2x)
  44. #elif INTERLEAVE == 4
  45. aes_encrypt_block4x:
  46. encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
  47. ret
  48. ENDPROC(aes_encrypt_block4x)
  49. aes_decrypt_block4x:
  50. decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
  51. ret
  52. ENDPROC(aes_decrypt_block4x)
  53. #else
  54. #error INTERLEAVE should equal 2 or 4
  55. #endif
  56. .macro do_encrypt_block2x
  57. bl aes_encrypt_block2x
  58. .endm
  59. .macro do_decrypt_block2x
  60. bl aes_decrypt_block2x
  61. .endm
  62. .macro do_encrypt_block4x
  63. bl aes_encrypt_block4x
  64. .endm
  65. .macro do_decrypt_block4x
  66. bl aes_decrypt_block4x
  67. .endm
  68. #else
  69. #define FRAME_PUSH
  70. #define FRAME_POP
  71. .macro do_encrypt_block2x
  72. encrypt_block2x v0, v1, w3, x2, x6, w7
  73. .endm
  74. .macro do_decrypt_block2x
  75. decrypt_block2x v0, v1, w3, x2, x6, w7
  76. .endm
  77. .macro do_encrypt_block4x
  78. encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
  79. .endm
  80. .macro do_decrypt_block4x
  81. decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
  82. .endm
  83. #endif
  84. /*
  85. * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
  86. * int blocks, int first)
  87. * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
  88. * int blocks, int first)
  89. */
  90. AES_ENTRY(aes_ecb_encrypt)
  91. FRAME_PUSH
  92. cbz w5, .LecbencloopNx
  93. enc_prepare w3, x2, x5
  94. .LecbencloopNx:
  95. #if INTERLEAVE >= 2
  96. subs w4, w4, #INTERLEAVE
  97. bmi .Lecbenc1x
  98. #if INTERLEAVE == 2
  99. ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */
  100. do_encrypt_block2x
  101. st1 {v0.16b-v1.16b}, [x0], #32
  102. #else
  103. ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
  104. do_encrypt_block4x
  105. st1 {v0.16b-v3.16b}, [x0], #64
  106. #endif
  107. b .LecbencloopNx
  108. .Lecbenc1x:
  109. adds w4, w4, #INTERLEAVE
  110. beq .Lecbencout
  111. #endif
  112. .Lecbencloop:
  113. ld1 {v0.16b}, [x1], #16 /* get next pt block */
  114. encrypt_block v0, w3, x2, x5, w6
  115. st1 {v0.16b}, [x0], #16
  116. subs w4, w4, #1
  117. bne .Lecbencloop
  118. .Lecbencout:
  119. FRAME_POP
  120. ret
  121. AES_ENDPROC(aes_ecb_encrypt)
  122. AES_ENTRY(aes_ecb_decrypt)
  123. FRAME_PUSH
  124. cbz w5, .LecbdecloopNx
  125. dec_prepare w3, x2, x5
  126. .LecbdecloopNx:
  127. #if INTERLEAVE >= 2
  128. subs w4, w4, #INTERLEAVE
  129. bmi .Lecbdec1x
  130. #if INTERLEAVE == 2
  131. ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
  132. do_decrypt_block2x
  133. st1 {v0.16b-v1.16b}, [x0], #32
  134. #else
  135. ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
  136. do_decrypt_block4x
  137. st1 {v0.16b-v3.16b}, [x0], #64
  138. #endif
  139. b .LecbdecloopNx
  140. .Lecbdec1x:
  141. adds w4, w4, #INTERLEAVE
  142. beq .Lecbdecout
  143. #endif
  144. .Lecbdecloop:
  145. ld1 {v0.16b}, [x1], #16 /* get next ct block */
  146. decrypt_block v0, w3, x2, x5, w6
  147. st1 {v0.16b}, [x0], #16
  148. subs w4, w4, #1
  149. bne .Lecbdecloop
  150. .Lecbdecout:
  151. FRAME_POP
  152. ret
  153. AES_ENDPROC(aes_ecb_decrypt)
  154. /*
  155. * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
  156. * int blocks, u8 iv[], int first)
  157. * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
  158. * int blocks, u8 iv[], int first)
  159. */
  160. AES_ENTRY(aes_cbc_encrypt)
  161. cbz w6, .Lcbcencloop
  162. ld1 {v0.16b}, [x5] /* get iv */
  163. enc_prepare w3, x2, x5
  164. .Lcbcencloop:
  165. ld1 {v1.16b}, [x1], #16 /* get next pt block */
  166. eor v0.16b, v0.16b, v1.16b /* ..and xor with iv */
  167. encrypt_block v0, w3, x2, x5, w6
  168. st1 {v0.16b}, [x0], #16
  169. subs w4, w4, #1
  170. bne .Lcbcencloop
  171. ret
  172. AES_ENDPROC(aes_cbc_encrypt)
  173. AES_ENTRY(aes_cbc_decrypt)
  174. FRAME_PUSH
  175. cbz w6, .LcbcdecloopNx
  176. ld1 {v7.16b}, [x5] /* get iv */
  177. dec_prepare w3, x2, x5
  178. .LcbcdecloopNx:
  179. #if INTERLEAVE >= 2
  180. subs w4, w4, #INTERLEAVE
  181. bmi .Lcbcdec1x
  182. #if INTERLEAVE == 2
  183. ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
  184. mov v2.16b, v0.16b
  185. mov v3.16b, v1.16b
  186. do_decrypt_block2x
  187. eor v0.16b, v0.16b, v7.16b
  188. eor v1.16b, v1.16b, v2.16b
  189. mov v7.16b, v3.16b
  190. st1 {v0.16b-v1.16b}, [x0], #32
  191. #else
  192. ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
  193. mov v4.16b, v0.16b
  194. mov v5.16b, v1.16b
  195. mov v6.16b, v2.16b
  196. do_decrypt_block4x
  197. sub x1, x1, #16
  198. eor v0.16b, v0.16b, v7.16b
  199. eor v1.16b, v1.16b, v4.16b
  200. ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */
  201. eor v2.16b, v2.16b, v5.16b
  202. eor v3.16b, v3.16b, v6.16b
  203. st1 {v0.16b-v3.16b}, [x0], #64
  204. #endif
  205. b .LcbcdecloopNx
  206. .Lcbcdec1x:
  207. adds w4, w4, #INTERLEAVE
  208. beq .Lcbcdecout
  209. #endif
  210. .Lcbcdecloop:
  211. ld1 {v1.16b}, [x1], #16 /* get next ct block */
  212. mov v0.16b, v1.16b /* ...and copy to v0 */
  213. decrypt_block v0, w3, x2, x5, w6
  214. eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */
  215. mov v7.16b, v1.16b /* ct is next iv */
  216. st1 {v0.16b}, [x0], #16
  217. subs w4, w4, #1
  218. bne .Lcbcdecloop
  219. .Lcbcdecout:
  220. FRAME_POP
  221. ret
  222. AES_ENDPROC(aes_cbc_decrypt)
  223. /*
  224. * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
  225. * int blocks, u8 ctr[], int first)
  226. */
  227. AES_ENTRY(aes_ctr_encrypt)
  228. FRAME_PUSH
  229. cbnz w6, .Lctrfirst /* 1st time around? */
  230. umov x5, v4.d[1] /* keep swabbed ctr in reg */
  231. rev x5, x5
  232. #if INTERLEAVE >= 2
  233. cmn w5, w4 /* 32 bit overflow? */
  234. bcs .Lctrinc
  235. add x5, x5, #1 /* increment BE ctr */
  236. b .LctrincNx
  237. #else
  238. b .Lctrinc
  239. #endif
  240. .Lctrfirst:
  241. enc_prepare w3, x2, x6
  242. ld1 {v4.16b}, [x5]
  243. umov x5, v4.d[1] /* keep swabbed ctr in reg */
  244. rev x5, x5
  245. #if INTERLEAVE >= 2
  246. cmn w5, w4 /* 32 bit overflow? */
  247. bcs .Lctrloop
  248. .LctrloopNx:
  249. subs w4, w4, #INTERLEAVE
  250. bmi .Lctr1x
  251. #if INTERLEAVE == 2
  252. mov v0.8b, v4.8b
  253. mov v1.8b, v4.8b
  254. rev x7, x5
  255. add x5, x5, #1
  256. ins v0.d[1], x7
  257. rev x7, x5
  258. add x5, x5, #1
  259. ins v1.d[1], x7
  260. ld1 {v2.16b-v3.16b}, [x1], #32 /* get 2 input blocks */
  261. do_encrypt_block2x
  262. eor v0.16b, v0.16b, v2.16b
  263. eor v1.16b, v1.16b, v3.16b
  264. st1 {v0.16b-v1.16b}, [x0], #32
  265. #else
  266. ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */
  267. dup v7.4s, w5
  268. mov v0.16b, v4.16b
  269. add v7.4s, v7.4s, v8.4s
  270. mov v1.16b, v4.16b
  271. rev32 v8.16b, v7.16b
  272. mov v2.16b, v4.16b
  273. mov v3.16b, v4.16b
  274. mov v1.s[3], v8.s[0]
  275. mov v2.s[3], v8.s[1]
  276. mov v3.s[3], v8.s[2]
  277. ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */
  278. do_encrypt_block4x
  279. eor v0.16b, v5.16b, v0.16b
  280. ld1 {v5.16b}, [x1], #16 /* get 1 input block */
  281. eor v1.16b, v6.16b, v1.16b
  282. eor v2.16b, v7.16b, v2.16b
  283. eor v3.16b, v5.16b, v3.16b
  284. st1 {v0.16b-v3.16b}, [x0], #64
  285. add x5, x5, #INTERLEAVE
  286. #endif
  287. cbz w4, .LctroutNx
  288. .LctrincNx:
  289. rev x7, x5
  290. ins v4.d[1], x7
  291. b .LctrloopNx
  292. .LctroutNx:
  293. sub x5, x5, #1
  294. rev x7, x5
  295. ins v4.d[1], x7
  296. b .Lctrout
  297. .Lctr1x:
  298. adds w4, w4, #INTERLEAVE
  299. beq .Lctrout
  300. #endif
  301. .Lctrloop:
  302. mov v0.16b, v4.16b
  303. encrypt_block v0, w3, x2, x6, w7
  304. subs w4, w4, #1
  305. bmi .Lctrhalfblock /* blocks < 0 means 1/2 block */
  306. ld1 {v3.16b}, [x1], #16
  307. eor v3.16b, v0.16b, v3.16b
  308. st1 {v3.16b}, [x0], #16
  309. beq .Lctrout
  310. .Lctrinc:
  311. adds x5, x5, #1 /* increment BE ctr */
  312. rev x7, x5
  313. ins v4.d[1], x7
  314. bcc .Lctrloop /* no overflow? */
  315. umov x7, v4.d[0] /* load upper word of ctr */
  316. rev x7, x7 /* ... to handle the carry */
  317. add x7, x7, #1
  318. rev x7, x7
  319. ins v4.d[0], x7
  320. b .Lctrloop
  321. .Lctrhalfblock:
  322. ld1 {v3.8b}, [x1]
  323. eor v3.8b, v0.8b, v3.8b
  324. st1 {v3.8b}, [x0]
  325. .Lctrout:
  326. FRAME_POP
  327. ret
  328. AES_ENDPROC(aes_ctr_encrypt)
  329. .ltorg
  330. /*
  331. * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
  332. * int blocks, u8 const rk2[], u8 iv[], int first)
  333. * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
  334. * int blocks, u8 const rk2[], u8 iv[], int first)
  335. */
  336. .macro next_tweak, out, in, const, tmp
  337. sshr \tmp\().2d, \in\().2d, #63
  338. and \tmp\().16b, \tmp\().16b, \const\().16b
  339. add \out\().2d, \in\().2d, \in\().2d
  340. ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
  341. eor \out\().16b, \out\().16b, \tmp\().16b
  342. .endm
  343. .Lxts_mul_x:
  344. .word 1, 0, 0x87, 0
  345. AES_ENTRY(aes_xts_encrypt)
  346. FRAME_PUSH
  347. cbz w7, .LxtsencloopNx
  348. ld1 {v4.16b}, [x6]
  349. enc_prepare w3, x5, x6
  350. encrypt_block v4, w3, x5, x6, w7 /* first tweak */
  351. enc_switch_key w3, x2, x6
  352. ldr q7, .Lxts_mul_x
  353. b .LxtsencNx
  354. .LxtsencloopNx:
  355. ldr q7, .Lxts_mul_x
  356. next_tweak v4, v4, v7, v8
  357. .LxtsencNx:
  358. #if INTERLEAVE >= 2
  359. subs w4, w4, #INTERLEAVE
  360. bmi .Lxtsenc1x
  361. #if INTERLEAVE == 2
  362. ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */
  363. next_tweak v5, v4, v7, v8
  364. eor v0.16b, v0.16b, v4.16b
  365. eor v1.16b, v1.16b, v5.16b
  366. do_encrypt_block2x
  367. eor v0.16b, v0.16b, v4.16b
  368. eor v1.16b, v1.16b, v5.16b
  369. st1 {v0.16b-v1.16b}, [x0], #32
  370. cbz w4, .LxtsencoutNx
  371. next_tweak v4, v5, v7, v8
  372. b .LxtsencNx
  373. .LxtsencoutNx:
  374. mov v4.16b, v5.16b
  375. b .Lxtsencout
  376. #else
  377. ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
  378. next_tweak v5, v4, v7, v8
  379. eor v0.16b, v0.16b, v4.16b
  380. next_tweak v6, v5, v7, v8
  381. eor v1.16b, v1.16b, v5.16b
  382. eor v2.16b, v2.16b, v6.16b
  383. next_tweak v7, v6, v7, v8
  384. eor v3.16b, v3.16b, v7.16b
  385. do_encrypt_block4x
  386. eor v3.16b, v3.16b, v7.16b
  387. eor v0.16b, v0.16b, v4.16b
  388. eor v1.16b, v1.16b, v5.16b
  389. eor v2.16b, v2.16b, v6.16b
  390. st1 {v0.16b-v3.16b}, [x0], #64
  391. mov v4.16b, v7.16b
  392. cbz w4, .Lxtsencout
  393. b .LxtsencloopNx
  394. #endif
  395. .Lxtsenc1x:
  396. adds w4, w4, #INTERLEAVE
  397. beq .Lxtsencout
  398. #endif
  399. .Lxtsencloop:
  400. ld1 {v1.16b}, [x1], #16
  401. eor v0.16b, v1.16b, v4.16b
  402. encrypt_block v0, w3, x2, x6, w7
  403. eor v0.16b, v0.16b, v4.16b
  404. st1 {v0.16b}, [x0], #16
  405. subs w4, w4, #1
  406. beq .Lxtsencout
  407. next_tweak v4, v4, v7, v8
  408. b .Lxtsencloop
  409. .Lxtsencout:
  410. FRAME_POP
  411. ret
  412. AES_ENDPROC(aes_xts_encrypt)
  413. AES_ENTRY(aes_xts_decrypt)
  414. FRAME_PUSH
  415. cbz w7, .LxtsdecloopNx
  416. ld1 {v4.16b}, [x6]
  417. enc_prepare w3, x5, x6
  418. encrypt_block v4, w3, x5, x6, w7 /* first tweak */
  419. dec_prepare w3, x2, x6
  420. ldr q7, .Lxts_mul_x
  421. b .LxtsdecNx
  422. .LxtsdecloopNx:
  423. ldr q7, .Lxts_mul_x
  424. next_tweak v4, v4, v7, v8
  425. .LxtsdecNx:
  426. #if INTERLEAVE >= 2
  427. subs w4, w4, #INTERLEAVE
  428. bmi .Lxtsdec1x
  429. #if INTERLEAVE == 2
  430. ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
  431. next_tweak v5, v4, v7, v8
  432. eor v0.16b, v0.16b, v4.16b
  433. eor v1.16b, v1.16b, v5.16b
  434. do_decrypt_block2x
  435. eor v0.16b, v0.16b, v4.16b
  436. eor v1.16b, v1.16b, v5.16b
  437. st1 {v0.16b-v1.16b}, [x0], #32
  438. cbz w4, .LxtsdecoutNx
  439. next_tweak v4, v5, v7, v8
  440. b .LxtsdecNx
  441. .LxtsdecoutNx:
  442. mov v4.16b, v5.16b
  443. b .Lxtsdecout
  444. #else
  445. ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
  446. next_tweak v5, v4, v7, v8
  447. eor v0.16b, v0.16b, v4.16b
  448. next_tweak v6, v5, v7, v8
  449. eor v1.16b, v1.16b, v5.16b
  450. eor v2.16b, v2.16b, v6.16b
  451. next_tweak v7, v6, v7, v8
  452. eor v3.16b, v3.16b, v7.16b
  453. do_decrypt_block4x
  454. eor v3.16b, v3.16b, v7.16b
  455. eor v0.16b, v0.16b, v4.16b
  456. eor v1.16b, v1.16b, v5.16b
  457. eor v2.16b, v2.16b, v6.16b
  458. st1 {v0.16b-v3.16b}, [x0], #64
  459. mov v4.16b, v7.16b
  460. cbz w4, .Lxtsdecout
  461. b .LxtsdecloopNx
  462. #endif
  463. .Lxtsdec1x:
  464. adds w4, w4, #INTERLEAVE
  465. beq .Lxtsdecout
  466. #endif
  467. .Lxtsdecloop:
  468. ld1 {v1.16b}, [x1], #16
  469. eor v0.16b, v1.16b, v4.16b
  470. decrypt_block v0, w3, x2, x6, w7
  471. eor v0.16b, v0.16b, v4.16b
  472. st1 {v0.16b}, [x0], #16
  473. subs w4, w4, #1
  474. beq .Lxtsdecout
  475. next_tweak v4, v4, v7, v8
  476. b .Lxtsdecloop
  477. .Lxtsdecout:
  478. FRAME_POP
  479. ret
  480. AES_ENDPROC(aes_xts_decrypt)