aegis256-aesni-asm.S 10 KB


  1. /*
  2. * AES-NI + SSE2 implementation of AEGIS-256
  3. *
  4. * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
  5. * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
  6. *
  7. * This program is free software; you can redistribute it and/or modify it
  8. * under the terms of the GNU General Public License version 2 as published
  9. * by the Free Software Foundation.
  10. */
  11. #include <linux/linkage.h>
  12. #include <asm/frame.h>
  /* The six 128-bit state blocks; state_load fills them in this order. */
  #define STATE0  %xmm0
  #define STATE1  %xmm1
  #define STATE2  %xmm2
  #define STATE3  %xmm3
  #define STATE4  %xmm4
  #define STATE5  %xmm5

  /* Current message block and scratch vector registers. */
  #define MSG     %xmm6
  #define T0      %xmm7
  #define T1      %xmm8
  #define T2      %xmm9
  #define T3      %xmm10

  /*
   * Integer argument registers as named by the ad/enc/dec routines.
   * The init and final routines receive different arguments in
   * %rsi/%rdx/%rcx and refer to those registers directly.
   */
  #define STATEP  %rdi
  #define LEN     %rsi
  #define SRC     %rdx
  #define DST     %rcx
  /*
   * Initialisation constants: 32 consecutive bytes holding the Fibonacci
   * sequence reduced modulo 256 (0, 1, 1, 2, 3, 5, 8, 13, ...), exposed
   * as two 16-byte halves. The section entry size is 32, so both halves
   * form a single mergeable entity.
   */
  .section .rodata.cst16.aegis256_const, "aM", @progbits, 32
  .align 16
  .Laegis256_const_0:
          .byte   0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
          .byte   0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
  .Laegis256_const_1:
          .byte   0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
          .byte   0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd

  /*
   * Byte i holds the value i. The decrypt tail compares a broadcast byte
   * count against this vector to build a per-byte mask.
   */
  .section .rodata.cst16.aegis256_counter, "aM", @progbits, 16
  .align 16
  .Laegis256_counter:
          .byte   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
          .byte   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f

  .text
  /*
   * __load_partial: internal ABI
   *
   * Gathers a partial block from SRC into MSG. Bits 0-3 of LEN select a
   * 1-, 2-, 4- and 8-byte piece; each piece is read from the offset
   * formed by the higher LEN bits, so the pieces are contiguous from
   * SRC. Bytes of MSG that are not loaded are zero.
   *
   * input:
   *   LEN - bytes (only bits 0-4 are examined)
   *   SRC - src
   * output:
   *   MSG - message block
   * changed:
   *   T0
   *   %r8
   *   %r9
   */
  __load_partial:
          xorl    %r9d, %r9d              /* %r9 collects the low pieces */
          pxor    MSG, MSG

          /* 1-byte piece, located at offset (LEN & 0x1e) */
          test    $0x1, LEN
          jz      .Lld_partial_1
          mov     LEN, %r8
          and     $0x1E, %r8
          movb    (SRC, %r8), %r9b

  .Lld_partial_1:
          /* 2-byte piece, located at offset (LEN & 0x1c) */
          test    $0x2, LEN
          jz      .Lld_partial_2
          mov     LEN, %r8
          and     $0x1C, %r8
          shl     $16, %r9                /* make room below the byte */
          movw    (SRC, %r8), %r9w

  .Lld_partial_2:
          /* 4-byte piece, located at offset (LEN & 0x18) */
          test    $0x4, LEN
          jz      .Lld_partial_4
          mov     LEN, %r8
          and     $0x18, %r8
          shl     $32, %r9
          movl    (SRC, %r8), %r8d        /* 32-bit load zero-extends */
          xor     %r8, %r9

  .Lld_partial_4:
          movq    %r9, MSG                /* low qword = collected pieces */

          /* 8-byte piece, located at offset (LEN & 0x10) */
          test    $0x8, LEN
          jz      .Lld_partial_8
          mov     LEN, %r8
          and     $0x10, %r8
          pslldq  $8, MSG                 /* collected pieces -> high qword */
          movq    (SRC, %r8), T0
          pxor    T0, MSG

  .Lld_partial_8:
          ret
  ENDPROC(__load_partial)
  /*
   * __store_partial: internal ABI
   *
   * Writes the leading bytes of T0 to DST as an 8-, 4-, 2- and 1-byte
   * store, each performed only while enough bytes remain. At most 15
   * bytes are written.
   *
   * input:
   *   LEN - bytes (only the low 32 bits are used)
   *   DST - dst
   *   T0  - message block
   * changed:
   *   T0 (shifted right by 8 bytes when LEN >= 8)
   *   %r8
   *   %r9
   *   %r10
   */
  __store_partial:
          mov     LEN, %r8
          mov     DST, %r9

          movq    T0, %r10                /* low 8 bytes of the block */

          /*
           * The byte count originates from an unsigned int argument, so
           * it is compared as an unsigned 32-bit value: the upper half
           * of the 64-bit register is not guaranteed to be zero.
           */
          cmp     $8, %r8d
          jb      .Lst_partial_8

          mov     %r10, (%r9)
          psrldq  $8, T0                  /* expose the high 8 bytes */
          movq    T0, %r10

          sub     $8, %r8d
          add     $8, %r9

  .Lst_partial_8:
          cmp     $4, %r8d
          jb      .Lst_partial_4

          mov     %r10d, (%r9)
          shr     $32, %r10

          sub     $4, %r8d
          add     $4, %r9

  .Lst_partial_4:
          cmp     $2, %r8d
          jb      .Lst_partial_2

          mov     %r10w, (%r9)
          shr     $0x10, %r10

          sub     $2, %r8d
          add     $2, %r9

  .Lst_partial_2:
          cmp     $1, %r8d
          jb      .Lst_partial_1

          mov     %r10b, (%r9)

  .Lst_partial_1:
          ret
  ENDPROC(__store_partial)
  /*
   * update: apply one AES round to every state register, keyed by its
   * neighbour. The instruction order matters: each aesenc consumes the
   * not yet updated value of its key register, and T0 preserves the old
   * STATE5 for the last step.
   *
   * The state blocks are never moved between registers. Instead the
   * update<i>, crypt<i> and state_store<i> variants name the registers in
   * the order that is current after the corresponding number of updates.
   *
   * changed: STATE0-STATE5, T0
   */
  .macro update
          movdqa  STATE5, T0              /* keep old STATE5 for STATE4 */
          aesenc  STATE0, STATE5
          aesenc  STATE1, STATE0
          aesenc  STATE2, STATE1
          aesenc  STATE3, STATE2
          aesenc  STATE4, STATE3
          aesenc  T0,     STATE4
  .endm
  /*
   * update0 .. update5: run one update and XOR the block m into a state
   * register. The target register steps down by one per variant
   * (STATE5 for update0, STATE0 for update5), which is the register that
   * the matching state_store<i> writes to offset 0x00.
   *
   * m must not be T0, because update overwrites T0.
   */
  .macro update0 m
          update
          pxor    \m, STATE5
  .endm

  .macro update1 m
          update
          pxor    \m, STATE4
  .endm

  .macro update2 m
          update
          pxor    \m, STATE3
  .endm

  .macro update3 m
          update
          pxor    \m, STATE2
  .endm

  .macro update4 m
          update
          pxor    \m, STATE1
  .endm

  .macro update5 m
          update
          pxor    \m, STATE0
  .endm
  /*
   * state_load: read the six 16-byte state blocks at STATEP into
   * STATE0-STATE5. Unaligned loads are used, so STATEP needs no
   * particular alignment.
   */
  .macro state_load
          movdqu  0x00(STATEP), STATE0
          movdqu  0x10(STATEP), STATE1
          movdqu  0x20(STATEP), STATE2
          movdqu  0x30(STATEP), STATE3
          movdqu  0x40(STATEP), STATE4
          movdqu  0x50(STATEP), STATE5
  .endm
  /*
   * state_store: write six registers back to the 96 bytes at STATEP.
   * The last argument (s5) goes to offset 0x00 and s0-s4 follow at
   * 0x10-0x50.
   */
  .macro state_store s0 s1 s2 s3 s4 s5
          movdqu  \s5, 0x00(STATEP)
          movdqu  \s0, 0x10(STATEP)
          movdqu  \s1, 0x20(STATEP)
          movdqu  \s2, 0x30(STATEP)
          movdqu  \s3, 0x40(STATEP)
          movdqu  \s4, 0x50(STATEP)
  .endm
  /*
   * state_store0 .. state_store5: store the state after a sequence of
   * updates that ended with update<i>. Each variant rotates the register
   * list by one more position, so the register that update<i> XORed the
   * message into lands at offset 0x00.
   */
  .macro state_store0
          state_store STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
  .endm

  .macro state_store1
          state_store STATE5 STATE0 STATE1 STATE2 STATE3 STATE4
  .endm

  .macro state_store2
          state_store STATE4 STATE5 STATE0 STATE1 STATE2 STATE3
  .endm

  .macro state_store3
          state_store STATE3 STATE4 STATE5 STATE0 STATE1 STATE2
  .endm

  .macro state_store4
          state_store STATE2 STATE3 STATE4 STATE5 STATE0 STATE1
  .endm

  .macro state_store5
          state_store STATE1 STATE2 STATE3 STATE4 STATE5 STATE0
  .endm
  /*
   * void crypto_aegis256_aesni_init(void *state, const void *key, const void *iv);
   *
   * Builds the initial state from a 32-byte key and a 32-byte IV and
   * writes it to state.
   *
   *   %rdi - state (96 bytes written)
   *   %rsi - key, read with aligned loads: must be 16-byte aligned
   *   %rdx - iv, read with unaligned loads
   */
  ENTRY(crypto_aegis256_aesni_init)
          FRAME_BEGIN

          /* load key: */
          movdqa  0x00(%rsi), MSG
          movdqa  0x10(%rsi), T1
          movdqa  MSG, STATE4
          movdqa  T1, STATE5

          /* load IV and mix in the key: */
          movdqu  0x00(%rdx), T2
          movdqu  0x10(%rdx), T3
          pxor    MSG, T2
          pxor    T1, T3
          movdqa  T2, STATE0
          movdqa  T3, STATE1

          /* load the constants (RIP-relative, no absolute relocation): */
          movdqa  .Laegis256_const_0(%rip), STATE3
          movdqa  .Laegis256_const_1(%rip), STATE2
          pxor    STATE3, STATE4
          pxor    STATE2, STATE5

          /*
           * update 16 times, cycling through both key halves (MSG, T1)
           * and both key ^ IV halves (T2, T3):
           */
          update0 MSG
          update1 T1
          update2 T2
          update3 T3
          update4 MSG
          update5 T1
          update0 T2
          update1 T3
          update2 MSG
          update3 T1
          update4 T2
          update5 T3
          update0 MSG
          update1 T1
          update2 T2
          update3 T3

          /* the last update was update3, so store with the same rotation */
          state_store3

          FRAME_END
          ret
  ENDPROC(crypto_aegis256_aesni_init)
  /*
   * ad_block: absorb block number i of the current 6-block group.
   *   a - "a" or "u", selecting movdqa or movdqu for the load
   *   i - block index 0-5; selects the SRC offset, the update variant
   *       and the exit label
   * Leaves through .Lad_out_<i> when fewer than 16 bytes remain.
   */
  .macro ad_block a i
          movdq\a (\i * 0x10)(SRC), MSG
          update\i MSG
          sub     $0x10, LEN
          cmp     $0x10, LEN
          jl      .Lad_out_\i
  .endm
  /*
   * void crypto_aegis256_aesni_ad(void *state, unsigned int length,
   *                               const void *data);
   *
   * Absorbs the whole 16-byte blocks of data into the state. A trailing
   * remainder shorter than 16 bytes is not read. When length is below 16
   * the state is neither loaded nor stored.
   *
   * The aligned loop is taken when data is 16-byte aligned, otherwise
   * the unaligned one. Each loop handles six blocks per iteration so
   * that the register rotation is back at its starting point.
   */
  ENTRY(crypto_aegis256_aesni_ad)
          FRAME_BEGIN

          /*
           * length is an unsigned int: the upper 32 bits of %rsi (LEN)
           * are not guaranteed to be zero, so zero-extend before the
           * 64-bit compares below.
           */
          mov     %esi, %esi

          cmp     $0x10, LEN
          jb      .Lad_out

          state_load

          mov     SRC, %r8
          and     $0xf, %r8
          jnz     .Lad_u_loop

  .align 8
  .Lad_a_loop:
          ad_block a 0
          ad_block a 1
          ad_block a 2
          ad_block a 3
          ad_block a 4
          ad_block a 5

          add     $0x60, SRC
          jmp     .Lad_a_loop

  .align 8
  .Lad_u_loop:
          ad_block u 0
          ad_block u 1
          ad_block u 2
          ad_block u 3
          ad_block u 4
          ad_block u 5

          add     $0x60, SRC
          jmp     .Lad_u_loop

          /* one exit per block index: store with the matching rotation */
  .Lad_out_0:
          state_store0
          FRAME_END
          ret

  .Lad_out_1:
          state_store1
          FRAME_END
          ret

  .Lad_out_2:
          state_store2
          FRAME_END
          ret

  .Lad_out_3:
          state_store3
          FRAME_END
          ret

  .Lad_out_4:
          state_store4
          FRAME_END
          ret

  .Lad_out_5:
          state_store5
          FRAME_END
          ret

  .Lad_out:
          FRAME_END
          ret
  ENDPROC(crypto_aegis256_aesni_ad)
  /*
   * crypt: XOR the keystream into m:
   *   m ^= s1 ^ s4 ^ s5 ^ (s2 & s3)
   * s0 is accepted for symmetry with the register lists but not used.
   * m must not be T3.
   *
   * changed: T3
   */
  .macro crypt m s0 s1 s2 s3 s4 s5
          movdqa  \s2, T3
          pand    \s3, T3                 /* T3 = s2 & s3 */
          pxor    \s1, \m
          pxor    \s4, \m
          pxor    \s5, \m
          pxor    T3, \m
  .endm
  /*
   * crypt0 .. crypt5: apply crypt to m with the state registers listed
   * in the rotation that is current after i updates, the same lists
   * that state_store<i> uses.
   */
  .macro crypt0 m
          crypt \m STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
  .endm

  .macro crypt1 m
          crypt \m STATE5 STATE0 STATE1 STATE2 STATE3 STATE4
  .endm

  .macro crypt2 m
          crypt \m STATE4 STATE5 STATE0 STATE1 STATE2 STATE3
  .endm

  .macro crypt3 m
          crypt \m STATE3 STATE4 STATE5 STATE0 STATE1 STATE2
  .endm

  .macro crypt4 m
          crypt \m STATE2 STATE3 STATE4 STATE5 STATE0 STATE1
  .endm

  .macro crypt5 m
          crypt \m STATE1 STATE2 STATE3 STATE4 STATE5 STATE0
  .endm
  /*
   * encrypt_block: encrypt block number i of the current 6-block group.
   *   a - "a" or "u", selecting movdqa or movdqu for the load and store
   *   i - block index 0-5
   * The input block stays in MSG and is absorbed into the state after
   * the encrypted copy in T0 has been stored.
   * Leaves through .Lenc_out_<i> when fewer than 16 bytes remain.
   */
  .macro encrypt_block a i
          movdq\a (\i * 0x10)(SRC), MSG
          movdqa  MSG, T0
          crypt\i T0
          movdq\a T0, (\i * 0x10)(DST)

          update\i MSG

          sub     $0x10, LEN
          cmp     $0x10, LEN
          jl      .Lenc_out_\i
  .endm
  /*
   * decrypt_block: decrypt block number i of the current 6-block group.
   *   a - "a" or "u", selecting movdqa or movdqu for the load and store
   *   i - block index 0-5
   * The block is decrypted in MSG, stored, and the decrypted value is
   * then absorbed into the state.
   * Leaves through .Ldec_out_<i> when fewer than 16 bytes remain.
   */
  .macro decrypt_block a i
          movdq\a (\i * 0x10)(SRC), MSG
          crypt\i MSG
          movdq\a MSG, (\i * 0x10)(DST)

          update\i MSG

          sub     $0x10, LEN
          cmp     $0x10, LEN
          jl      .Ldec_out_\i
  .endm
  /*
   * void crypto_aegis256_aesni_enc(void *state, unsigned int length,
   *                                const void *src, void *dst);
   *
   * Encrypts the whole 16-byte blocks of src into dst and absorbs them
   * into the state. A trailing remainder shorter than 16 bytes is left
   * untouched. When length is below 16 the state is neither loaded nor
   * stored.
   *
   * The aligned loop is taken only when both src and dst are 16-byte
   * aligned.
   */
  ENTRY(crypto_aegis256_aesni_enc)
          FRAME_BEGIN

          /*
           * length is an unsigned int: the upper 32 bits of %rsi (LEN)
           * are not guaranteed to be zero, so zero-extend before the
           * 64-bit compares below.
           */
          mov     %esi, %esi

          cmp     $0x10, LEN
          jb      .Lenc_out

          state_load

          mov     SRC, %r8
          or      DST, %r8
          and     $0xf, %r8
          jnz     .Lenc_u_loop

  .align 8
  .Lenc_a_loop:
          encrypt_block a 0
          encrypt_block a 1
          encrypt_block a 2
          encrypt_block a 3
          encrypt_block a 4
          encrypt_block a 5

          add     $0x60, SRC
          add     $0x60, DST
          jmp     .Lenc_a_loop

  .align 8
  .Lenc_u_loop:
          encrypt_block u 0
          encrypt_block u 1
          encrypt_block u 2
          encrypt_block u 3
          encrypt_block u 4
          encrypt_block u 5

          add     $0x60, SRC
          add     $0x60, DST
          jmp     .Lenc_u_loop

          /* one exit per block index: store with the matching rotation */
  .Lenc_out_0:
          state_store0
          FRAME_END
          ret

  .Lenc_out_1:
          state_store1
          FRAME_END
          ret

  .Lenc_out_2:
          state_store2
          FRAME_END
          ret

  .Lenc_out_3:
          state_store3
          FRAME_END
          ret

  .Lenc_out_4:
          state_store4
          FRAME_END
          ret

  .Lenc_out_5:
          state_store5
          FRAME_END
          ret

  .Lenc_out:
          FRAME_END
          ret
  ENDPROC(crypto_aegis256_aesni_enc)
  /*
   * void crypto_aegis256_aesni_enc_tail(void *state, unsigned int length,
   *                                     const void *src, void *dst);
   *
   * Encrypts one partial block: the input bytes are gathered into a
   * zero-padded block, the encrypted bytes are written to dst, and the
   * zero-padded input block is absorbed into the state.
   *
   * NOTE(review): the helpers write at most 15 bytes, so length is
   * presumably expected to be below 16 - confirm against the caller.
   */
  ENTRY(crypto_aegis256_aesni_enc_tail)
          FRAME_BEGIN

          /*
           * length is an unsigned int: the upper 32 bits of %rsi (LEN)
           * are not guaranteed to be zero, and __store_partial compares
           * the count, so zero-extend it first.
           */
          mov     %esi, %esi

          state_load

          /* encrypt message: */
          call    __load_partial

          movdqa  MSG, T0                 /* keep the input block in MSG */
          crypt0  T0

          call    __store_partial

          update0 MSG

          state_store0

          FRAME_END
          ret
  ENDPROC(crypto_aegis256_aesni_enc_tail)
  /*
   * void crypto_aegis256_aesni_dec(void *state, unsigned int length,
   *                                const void *src, void *dst);
   *
   * Decrypts the whole 16-byte blocks of src into dst and absorbs the
   * decrypted blocks into the state. A trailing remainder shorter than
   * 16 bytes is left untouched. When length is below 16 the state is
   * neither loaded nor stored.
   *
   * The aligned loop is taken only when both src and dst are 16-byte
   * aligned.
   */
  ENTRY(crypto_aegis256_aesni_dec)
          FRAME_BEGIN

          /*
           * length is an unsigned int: the upper 32 bits of %rsi (LEN)
           * are not guaranteed to be zero, so zero-extend before the
           * 64-bit compares below.
           */
          mov     %esi, %esi

          cmp     $0x10, LEN
          jb      .Ldec_out

          state_load

          mov     SRC, %r8
          or      DST, %r8
          and     $0xF, %r8
          jnz     .Ldec_u_loop

  .align 8
  .Ldec_a_loop:
          decrypt_block a 0
          decrypt_block a 1
          decrypt_block a 2
          decrypt_block a 3
          decrypt_block a 4
          decrypt_block a 5

          add     $0x60, SRC
          add     $0x60, DST
          jmp     .Ldec_a_loop

  .align 8
  .Ldec_u_loop:
          decrypt_block u 0
          decrypt_block u 1
          decrypt_block u 2
          decrypt_block u 3
          decrypt_block u 4
          decrypt_block u 5

          add     $0x60, SRC
          add     $0x60, DST
          jmp     .Ldec_u_loop

          /* one exit per block index: store with the matching rotation */
  .Ldec_out_0:
          state_store0
          FRAME_END
          ret

  .Ldec_out_1:
          state_store1
          FRAME_END
          ret

  .Ldec_out_2:
          state_store2
          FRAME_END
          ret

  .Ldec_out_3:
          state_store3
          FRAME_END
          ret

  .Ldec_out_4:
          state_store4
          FRAME_END
          ret

  .Ldec_out_5:
          state_store5
          FRAME_END
          ret

  .Ldec_out:
          FRAME_END
          ret
  ENDPROC(crypto_aegis256_aesni_dec)
  /*
   * void crypto_aegis256_aesni_dec_tail(void *state, unsigned int length,
   *                                     const void *src, void *dst);
   *
   * Decrypts one partial block: the input bytes are gathered into a
   * zero-padded block and decrypted, and the leading bytes are written
   * to dst. The decrypted block is then cleared from byte index length
   * onwards and absorbed into the state.
   *
   * NOTE(review): the helpers write at most 15 bytes, so length is
   * presumably expected to be below 16 - confirm against the caller.
   */
  ENTRY(crypto_aegis256_aesni_dec_tail)
          FRAME_BEGIN

          /*
           * length is an unsigned int: the upper 32 bits of %rsi (LEN)
           * are not guaranteed to be zero, and __store_partial compares
           * the count, so zero-extend it first.
           */
          mov     %esi, %esi

          state_load

          /* decrypt message: */
          call    __load_partial

          crypt0  MSG

          movdqa  MSG, T0                 /* __store_partial consumes T0 */
          call    __store_partial

          /*
           * mask with byte count: the four unpacks broadcast the low
           * byte of LEN to all 16 lanes; the signed byte compare against
           * 0, 1, ..., 15 then yields 0xff in every lane whose index is
           * below the count.
           */
          movq    LEN, T0
          punpcklbw T0, T0
          punpcklbw T0, T0
          punpcklbw T0, T0
          punpcklbw T0, T0
          movdqa  .Laegis256_counter(%rip), T1
          pcmpgtb T1, T0
          pand    T0, MSG                 /* drop bytes past the count */

          update0 MSG

          state_store0

          FRAME_END
          ret
  ENDPROC(crypto_aegis256_aesni_dec_tail)
  /*
   * void crypto_aegis256_aesni_final(void *state, void *tag_xor,
   *                                  u64 assoclen, u64 cryptlen);
   *
   * Runs the finalisation rounds and XORs the resulting tag into the
   * 16 bytes at tag_xor. The state in memory is only read, it is not
   * written back.
   *
   *   %rdi - state
   *   %rsi - tag_xor (read and written, unaligned access)
   *   %rdx - assoclen in bytes
   *   %rcx - cryptlen in bytes
   */
  ENTRY(crypto_aegis256_aesni_final)
          FRAME_BEGIN

          state_load

          /* length block: low qword = assoclen, high qword = cryptlen */
          movq    %rdx, MSG
          movq    %rcx, T0
          punpcklqdq T0, MSG
          psllq   $3, MSG                 /* bytes -> bits, per qword */

          pxor    STATE3, MSG

          /* seven updates with the same block: */
          update0 MSG
          update1 MSG
          update2 MSG
          update3 MSG
          update4 MSG
          update5 MSG
          update0 MSG

          /* tag_xor ^= XOR of all six state blocks: */
          movdqu  (%rsi), MSG

          pxor    STATE0, MSG
          pxor    STATE1, MSG
          pxor    STATE2, MSG
          pxor    STATE3, MSG
          pxor    STATE4, MSG
          pxor    STATE5, MSG

          movdqu  MSG, (%rsi)

          FRAME_END
          ret
  ENDPROC(crypto_aegis256_aesni_final)