morus1280-sse2-asm.S 18 KB


  1. /*
  2. * SSE2 implementation of MORUS-1280
  3. *
  4. * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
  5. * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
  6. *
  7. * This program is free software; you can redistribute it and/or modify it
  8. * under the terms of the GNU General Public License version 2 as published
  9. * by the Free Software Foundation.
  10. */
  11. #include <linux/linkage.h>
  12. #include <asm/frame.h>
  /* Pack four 2-bit dword selectors into the 8-bit immediate taken by pshufd. */
#define SHUFFLE_MASK(i0, i1, i2, i3) \
	((i0) | ((i1) << 2) | ((i2) << 4) | ((i3) << 6))

/* Selectors (2, 3, 0, 1): swaps the two 64-bit halves of an XMM register. */
#define MASK2		SHUFFLE_MASK(2, 3, 0, 1)

/*
 * Register allocation.  The state consists of five rows, each one spread
 * over a LO/HI pair of XMM registers.
 */
#define STATE0_LO	%xmm0
#define STATE0_HI	%xmm1
#define STATE1_LO	%xmm2
#define STATE1_HI	%xmm3
#define STATE2_LO	%xmm4
#define STATE2_HI	%xmm5
#define STATE3_LO	%xmm6
#define STATE3_HI	%xmm7
#define STATE4_LO	%xmm8
#define STATE4_HI	%xmm9

/* KEY_* and MSG_* are two names for the same pair (%xmm10/%xmm11). */
#define KEY_LO		%xmm10
#define KEY_HI		%xmm11
#define MSG_LO		%xmm10
#define MSG_HI		%xmm11

/* Temporaries.  T0_LO is also the scratch register of rol1/rol2/rol3. */
#define T0_LO		%xmm12
#define T0_HI		%xmm13
#define T1_LO		%xmm14
#define T1_HI		%xmm15
  /*
   * Initial value of state row 4 (see crypto_morus1280_sse2_init).  The 32
   * bytes are the Fibonacci sequence reduced modulo 256.
   * NOTE(review): the "morus640" prefix in these label and section names
   * looks carried over from another variant; the names are kept as they are
   * because other code in this file refers to them.
   */
.section	.rodata.cst16.morus640_const, "aM", @progbits, 16
	.p2align 4
.Lmorus640_const_0:
	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
.Lmorus640_const_1:
	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd

/*
 * Byte indices 0..31.  crypto_morus1280_sse2_dec_tail compares them with the
 * byte count to build a mask that keeps only the valid bytes of a block.
 */
.section	.rodata.cst16.morus640_counter, "aM", @progbits, 16
	.p2align 4
.Lmorus640_counter_0:
	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
.Lmorus640_counter_1:
	.byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
	.byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f

.text
  /*
   * rol1 hi, lo - rotate the 256-bit row held in (hi:lo) left by one
   * 64-bit word:
   *
   *   HI_1 | HI_0 || LO_1 | LO_0
   *          ==>
   *   HI_0 | HI_1 || LO_1 | LO_0	(halves of hi swapped)
   *          ==>
   *   HI_0 | LO_1 || LO_0 | HI_1
   *
   * Clobbers T0_LO, so neither operand may be T0_LO.
   */
.macro rol1 hi, lo
	pshufd		$MASK2, \hi, \hi	/* hi = HI_0 | HI_1 */
	movdqa		\hi, T0_LO
	punpcklqdq	\lo, T0_LO		/* T0 = LO_0 | HI_1 */
	punpckhqdq	\hi, \lo		/* lo = HI_0 | LO_1 */
	movdqa		\lo, \hi		/* new hi */
	movdqa		T0_LO, \lo		/* new lo */
.endm
  /*
   * rol2 hi, lo - rotate the 256-bit row held in (hi:lo) by two 64-bit
   * words, which is simply an exchange of the two registers.
   *
   * Clobbers T0_LO, so neither operand may be T0_LO.
   */
.macro rol2 hi, lo
	movdqa		\lo, T0_LO		/* save old lo */
	movdqa		\hi, \lo
	movdqa		T0_LO, \hi
.endm
  /*
   * rol3 hi, lo - rotate the 256-bit row held in (hi:lo) left by three
   * 64-bit words:
   *
   *   HI_1 | HI_0 || LO_1 | LO_0
   *          ==>
   *   HI_0 | HI_1 || LO_1 | LO_0	(halves of hi swapped)
   *          ==>
   *   LO_0 | HI_1 || HI_0 | LO_1
   *
   * Clobbers T0_LO.  Neither operand may be T0_LO: the old value of lo is
   * read again after T0_LO has been written.
   */
.macro rol3 hi, lo
	pshufd		$MASK2, \hi, \hi	/* hi = HI_0 | HI_1 */
	movdqa		\lo, T0_LO
	punpckhqdq	\hi, T0_LO		/* T0 = HI_0 | LO_1 */
	punpcklqdq	\lo, \hi		/* hi = LO_0 | HI_1 */
	movdqa		T0_LO, \lo		/* new lo */
.endm
  /*
   * morus1280_round - one round of the state update.
   *
   *   s0 ^= (s1 & s2) ^ s3
   *   s0  = each 64-bit lane of s0 rotated left by b bits
   *   s3  = s3 rotated by whole words through the macro named by w
   *
   * The s4_l/s4_h arguments are accepted but not used.
   * Clobbers T0_LO (directly, and again through w).
   */
.macro morus1280_round s0_l, s0_h, s1_l, s1_h, s2_l, s2_h, s3_l, s3_h, s4_l, s4_h, b, w
	/* low half */
	movdqa		\s1_l, T0_LO
	pand		\s2_l, T0_LO
	pxor		T0_LO, \s0_l
	pxor		\s3_l, \s0_l
	movdqa		\s0_l, T0_LO
	psllq		$\b, T0_LO
	psrlq		$(64 - \b), \s0_l
	pxor		T0_LO, \s0_l		/* lanes rotated left by b */

	/* high half */
	movdqa		\s1_h, T0_LO
	pand		\s2_h, T0_LO
	pxor		T0_LO, \s0_h
	pxor		\s3_h, \s0_h
	movdqa		\s0_h, T0_LO
	psllq		$\b, T0_LO
	psrlq		$(64 - \b), \s0_h
	pxor		T0_LO, \s0_h		/* lanes rotated left by b */

	/* word rotation of s3 */
	\w		\s3_h, \s3_l
.endm
  /*
   * __morus1280_update: internal ABI
   *
   * Runs the five rounds of one state update.  After each of the first four
   * rounds the message block is xor-ed into the next state row (1 to 4).
   *
   * input:
   *   STATE[0-4] - input state
   *   MSG        - message block (left unmodified)
   * output:
   *   STATE[0-4] - output state
   * changed:
   *   T0_LO
   */
__morus1280_update:
	/* round 1: row 0, lane rotation 13, row 3 rotated by one word */
	morus1280_round \
		STATE0_LO, STATE0_HI, \
		STATE1_LO, STATE1_HI, \
		STATE2_LO, STATE2_HI, \
		STATE3_LO, STATE3_HI, \
		STATE4_LO, STATE4_HI, \
		13, rol1
	pxor		MSG_LO, STATE1_LO
	pxor		MSG_HI, STATE1_HI

	/* round 2: row 1, lane rotation 46, row 4 rotated by two words */
	morus1280_round \
		STATE1_LO, STATE1_HI, \
		STATE2_LO, STATE2_HI, \
		STATE3_LO, STATE3_HI, \
		STATE4_LO, STATE4_HI, \
		STATE0_LO, STATE0_HI, \
		46, rol2
	pxor		MSG_LO, STATE2_LO
	pxor		MSG_HI, STATE2_HI

	/* round 3: row 2, lane rotation 38, row 0 rotated by three words */
	morus1280_round \
		STATE2_LO, STATE2_HI, \
		STATE3_LO, STATE3_HI, \
		STATE4_LO, STATE4_HI, \
		STATE0_LO, STATE0_HI, \
		STATE1_LO, STATE1_HI, \
		38, rol3
	pxor		MSG_LO, STATE3_LO
	pxor		MSG_HI, STATE3_HI

	/* round 4: row 3, lane rotation 7, row 1 rotated by two words */
	morus1280_round \
		STATE3_LO, STATE3_HI, \
		STATE4_LO, STATE4_HI, \
		STATE0_LO, STATE0_HI, \
		STATE1_LO, STATE1_HI, \
		STATE2_LO, STATE2_HI, \
		7, rol2
	pxor		MSG_LO, STATE4_LO
	pxor		MSG_HI, STATE4_HI

	/* round 5: row 4, lane rotation 4, row 2 rotated by one word */
	morus1280_round \
		STATE4_LO, STATE4_HI, \
		STATE0_LO, STATE0_HI, \
		STATE1_LO, STATE1_HI, \
		STATE2_LO, STATE2_HI, \
		STATE3_LO, STATE3_HI, \
		4, rol1
	ret
ENDPROC(__morus1280_update)
  /*
   * __morus1280_update_zero: internal ABI
   *
   * Same five rounds as __morus1280_update, but with no message block
   * xor-ed into the state between rounds.
   *
   * input:
   *   STATE[0-4] - input state
   * output:
   *   STATE[0-4] - output state
   * changed:
   *   T0_LO
   */
__morus1280_update_zero:
	/* round 1: row 0, lane rotation 13, row 3 rotated by one word */
	morus1280_round \
		STATE0_LO, STATE0_HI, \
		STATE1_LO, STATE1_HI, \
		STATE2_LO, STATE2_HI, \
		STATE3_LO, STATE3_HI, \
		STATE4_LO, STATE4_HI, \
		13, rol1

	/* round 2: row 1, lane rotation 46, row 4 rotated by two words */
	morus1280_round \
		STATE1_LO, STATE1_HI, \
		STATE2_LO, STATE2_HI, \
		STATE3_LO, STATE3_HI, \
		STATE4_LO, STATE4_HI, \
		STATE0_LO, STATE0_HI, \
		46, rol2

	/* round 3: row 2, lane rotation 38, row 0 rotated by three words */
	morus1280_round \
		STATE2_LO, STATE2_HI, \
		STATE3_LO, STATE3_HI, \
		STATE4_LO, STATE4_HI, \
		STATE0_LO, STATE0_HI, \
		STATE1_LO, STATE1_HI, \
		38, rol3

	/* round 4: row 3, lane rotation 7, row 1 rotated by two words */
	morus1280_round \
		STATE3_LO, STATE3_HI, \
		STATE4_LO, STATE4_HI, \
		STATE0_LO, STATE0_HI, \
		STATE1_LO, STATE1_HI, \
		STATE2_LO, STATE2_HI, \
		7, rol2

	/* round 5: row 4, lane rotation 4, row 2 rotated by one word */
	morus1280_round \
		STATE4_LO, STATE4_HI, \
		STATE0_LO, STATE0_HI, \
		STATE1_LO, STATE1_HI, \
		STATE2_LO, STATE2_HI, \
		STATE3_LO, STATE3_HI, \
		4, rol1
	ret
ENDPROC(__morus1280_update_zero)
  /*
   * __load_partial: internal ABI
   *
   * Builds a zero-padded message block from the leading bytes at %rsi.  The
   * pieces are collected from the end of the data towards its start: one
   * piece of 1, 2, 4, 8 and 16 bytes for every set bit of the byte count.
   *
   * input:
   *   %rsi - src
   *   %rcx - bytes (only bits 0-4 are examined)
   * output:
   *   MSG  - message block
   * changed:
   *   %r8
   *   %r9
   *   T0_LO (only when bit 3 of the byte count is set)
   */
__load_partial:
	xor		%r9d, %r9d
	pxor		MSG_LO, MSG_LO
	pxor		MSG_HI, MSG_HI

	/* bit 0: a single trailing byte at offset (bytes & 0x1E) */
	mov		%rcx, %r8
	and		$0x1, %r8
	jz		.Lld_partial_1

	mov		%rcx, %r8
	and		$0x1E, %r8
	add		%rsi, %r8
	mov		(%r8), %r9b

.Lld_partial_1:
	/* bit 1: two bytes at offset (bytes & 0x1C) go below what was loaded */
	mov		%rcx, %r8
	and		$0x2, %r8
	jz		.Lld_partial_2

	mov		%rcx, %r8
	and		$0x1C, %r8
	add		%rsi, %r8
	shl		$16, %r9
	mov		(%r8), %r9w

.Lld_partial_2:
	/* bit 2: four bytes at offset (bytes & 0x18) */
	mov		%rcx, %r8
	and		$0x4, %r8
	jz		.Lld_partial_4

	mov		%rcx, %r8
	and		$0x18, %r8
	add		%rsi, %r8
	shl		$32, %r9
	mov		(%r8), %r8d		/* 32-bit load clears the upper half */
	xor		%r8, %r9

.Lld_partial_4:
	/* up to seven collected bytes become the low quadword of MSG_LO */
	movq		%r9, MSG_LO

	/* bit 3: eight bytes at offset (bytes & 0x10) */
	mov		%rcx, %r8
	and		$0x8, %r8
	jz		.Lld_partial_8

	mov		%rcx, %r8
	and		$0x10, %r8
	add		%rsi, %r8
	pslldq		$8, MSG_LO
	movq		(%r8), T0_LO
	pxor		T0_LO, MSG_LO

.Lld_partial_8:
	/* bit 4: everything so far is the high half; first 16 bytes are the low */
	mov		%rcx, %r8
	and		$0x10, %r8
	jz		.Lld_partial_16

	movdqa		MSG_LO, MSG_HI
	movdqu		(%rsi), MSG_LO

.Lld_partial_16:
	ret
ENDPROC(__load_partial)
  /*
   * __store_partial: internal ABI
   *
   * Writes the leading bytes of the block held in T0 to memory, in pieces of
   * 16, 8, 4, 2 and 1 bytes.
   *
   * input:
   *   %rdx - dst
   *   %rcx - bytes; only the low 32 bits are used, since the public entry
   *          points that call this routine declare the count as
   *          unsigned int
   *   T0   - message block (T0_LO = bytes 0-15, T0_HI = bytes 16-31)
   * changed:
   *   %r8
   *   %r9
   *   %r10
   *   T0_LO
   */
__store_partial:
	mov		%ecx, %r8d		/* zero-extend the 32-bit count */
	mov		%rdx, %r9

	cmp		$16, %r8
	jb		.Lst_partial_16

	movdqu		T0_LO, (%r9)
	movdqa		T0_HI, T0_LO		/* continue with the upper half */

	sub		$16, %r8
	add		$16, %r9

.Lst_partial_16:
	movq		T0_LO, %r10

	cmp		$8, %r8
	jb		.Lst_partial_8

	mov		%r10, (%r9)
	psrldq		$8, T0_LO
	movq		T0_LO, %r10

	sub		$8, %r8
	add		$8, %r9

.Lst_partial_8:
	cmp		$4, %r8
	jb		.Lst_partial_4

	mov		%r10d, (%r9)
	shr		$32, %r10

	sub		$4, %r8
	add		$4, %r9

.Lst_partial_4:
	cmp		$2, %r8
	jb		.Lst_partial_2

	mov		%r10w, (%r9)
	shr		$16, %r10

	sub		$2, %r8
	add		$2, %r9

.Lst_partial_2:
	cmp		$1, %r8
	jb		.Lst_partial_1

	mov		%r10b, (%r9)

.Lst_partial_1:
	ret
ENDPROC(__store_partial)
  /*
   * void crypto_morus1280_sse2_init(void *state, const void *key,
   *                                 const void *iv);
   *
   * Builds the initial state from a 32-byte key and a 16-byte IV, runs
   * sixteen message-less updates, xors the key into row 1 once more and
   * writes the 160-byte state to *state.
   *
   *   %rdi - state (output)
   *   %rsi - key
   *   %rdx - iv
   */
ENTRY(crypto_morus1280_sse2_init)
	FRAME_BEGIN

	/* row 0: IV in the low half, zeros in the high half */
	pxor		STATE0_HI, STATE0_HI
	movdqu		(%rdx), STATE0_LO

	/* row 1: the key (a copy stays in KEY_LO/KEY_HI for later) */
	movdqu		0(%rsi), KEY_LO
	movdqu		16(%rsi), KEY_HI
	movdqa		KEY_LO, STATE1_LO
	movdqa		KEY_HI, STATE1_HI

	/* row 2: all ones */
	pcmpeqd		STATE2_LO, STATE2_LO
	pcmpeqd		STATE2_HI, STATE2_HI

	/* row 3: all zeros */
	pxor		STATE3_LO, STATE3_LO
	pxor		STATE3_HI, STATE3_HI

	/* row 4: the constant, addressed RIP-relatively */
	movdqa		.Lmorus640_const_0(%rip), STATE4_LO
	movdqa		.Lmorus640_const_1(%rip), STATE4_HI

	/* update 16 times with zero: */
	.rept 16
	call		__morus1280_update_zero
	.endr

	/* xor-in the key again after updates: */
	pxor		KEY_LO, STATE1_LO
	pxor		KEY_HI, STATE1_HI

	/* store the state: */
	movdqu		STATE0_LO, (0 * 16)(%rdi)
	movdqu		STATE0_HI, (1 * 16)(%rdi)
	movdqu		STATE1_LO, (2 * 16)(%rdi)
	movdqu		STATE1_HI, (3 * 16)(%rdi)
	movdqu		STATE2_LO, (4 * 16)(%rdi)
	movdqu		STATE2_HI, (5 * 16)(%rdi)
	movdqu		STATE3_LO, (6 * 16)(%rdi)
	movdqu		STATE3_HI, (7 * 16)(%rdi)
	movdqu		STATE4_LO, (8 * 16)(%rdi)
	movdqu		STATE4_HI, (9 * 16)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_init)
  /*
   * void crypto_morus1280_sse2_ad(void *state, const void *data,
   *                               unsigned int length);
   *
   * Absorbs associated data into the state, one 32-byte block per update.
   * Only whole blocks are consumed: any remainder (length % 32) is ignored
   * here, and a length below 32 leaves the state untouched.
   *
   *   %rdi - state (in/out)
   *   %rsi - data
   *   %edx - length
   */
ENTRY(crypto_morus1280_sse2_ad)
	FRAME_BEGIN

	/*
	 * length is a 32-bit argument, so the upper half of %rdx is not
	 * guaranteed to be zero on entry; clear it before 64-bit arithmetic.
	 */
	mov		%edx, %edx

	cmp		$32, %rdx
	jb		.Lad_out

	/* load the state: */
	movdqu		(0 * 16)(%rdi), STATE0_LO
	movdqu		(1 * 16)(%rdi), STATE0_HI
	movdqu		(2 * 16)(%rdi), STATE1_LO
	movdqu		(3 * 16)(%rdi), STATE1_HI
	movdqu		(4 * 16)(%rdi), STATE2_LO
	movdqu		(5 * 16)(%rdi), STATE2_HI
	movdqu		(6 * 16)(%rdi), STATE3_LO
	movdqu		(7 * 16)(%rdi), STATE3_HI
	movdqu		(8 * 16)(%rdi), STATE4_LO
	movdqu		(9 * 16)(%rdi), STATE4_HI

	/* pick the aligned-load loop only for 16-byte aligned data */
	mov		%rsi, %r8
	and		$0xF, %r8
	jnz		.Lad_u_loop

.align 4
.Lad_a_loop:
	movdqa		0(%rsi), MSG_LO
	movdqa		16(%rsi), MSG_HI
	call		__morus1280_update
	sub		$32, %rdx
	add		$32, %rsi
	cmp		$32, %rdx
	jae		.Lad_a_loop		/* unsigned: length is unsigned */

	jmp		.Lad_cont
.align 4
.Lad_u_loop:
	movdqu		0(%rsi), MSG_LO
	movdqu		16(%rsi), MSG_HI
	call		__morus1280_update
	sub		$32, %rdx
	add		$32, %rsi
	cmp		$32, %rdx
	jae		.Lad_u_loop

.Lad_cont:
	/* store the state: */
	movdqu		STATE0_LO, (0 * 16)(%rdi)
	movdqu		STATE0_HI, (1 * 16)(%rdi)
	movdqu		STATE1_LO, (2 * 16)(%rdi)
	movdqu		STATE1_HI, (3 * 16)(%rdi)
	movdqu		STATE2_LO, (4 * 16)(%rdi)
	movdqu		STATE2_HI, (5 * 16)(%rdi)
	movdqu		STATE3_LO, (6 * 16)(%rdi)
	movdqu		STATE3_HI, (7 * 16)(%rdi)
	movdqu		STATE4_LO, (8 * 16)(%rdi)
	movdqu		STATE4_HI, (9 * 16)(%rdi)

.Lad_out:
	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_ad)
  /*
   * void crypto_morus1280_sse2_enc(void *state, const void *src, void *dst,
   *                                unsigned int length);
   *
   * Encrypts whole 32-byte blocks.  For every block:
   *
   *   dst = src ^ state[0] ^ rol3(state[1]) ^ (state[2] & state[3])
   *
   * followed by a state update with the plaintext block.  Any remainder
   * (length % 32) is not processed here, and a length below 32 leaves the
   * state untouched.
   *
   *   %rdi - state (in/out)
   *   %rsi - src
   *   %rdx - dst
   *   %ecx - length
   */
ENTRY(crypto_morus1280_sse2_enc)
	FRAME_BEGIN

	/*
	 * length is a 32-bit argument, so the upper half of %rcx is not
	 * guaranteed to be zero on entry; clear it before 64-bit arithmetic.
	 */
	mov		%ecx, %ecx

	cmp		$32, %rcx
	jb		.Lenc_out

	/* load the state: */
	movdqu		(0 * 16)(%rdi), STATE0_LO
	movdqu		(1 * 16)(%rdi), STATE0_HI
	movdqu		(2 * 16)(%rdi), STATE1_LO
	movdqu		(3 * 16)(%rdi), STATE1_HI
	movdqu		(4 * 16)(%rdi), STATE2_LO
	movdqu		(5 * 16)(%rdi), STATE2_HI
	movdqu		(6 * 16)(%rdi), STATE3_LO
	movdqu		(7 * 16)(%rdi), STATE3_HI
	movdqu		(8 * 16)(%rdi), STATE4_LO
	movdqu		(9 * 16)(%rdi), STATE4_HI

	/* aligned loop only when both src and dst are 16-byte aligned */
	mov		%rsi, %r8
	or		%rdx, %r8
	and		$0xF, %r8
	jnz		.Lenc_u_loop

.align 4
.Lenc_a_loop:
	movdqa		0(%rsi), MSG_LO
	movdqa		16(%rsi), MSG_HI

	/* rotate a copy of row 1; rol3 uses T0_LO, so T0 is filled afterwards */
	movdqa		STATE1_LO, T1_LO
	movdqa		STATE1_HI, T1_HI
	rol3		T1_HI, T1_LO

	movdqa		MSG_LO, T0_LO
	movdqa		MSG_HI, T0_HI
	pxor		T1_LO, T0_LO
	pxor		T1_HI, T0_HI
	pxor		STATE0_LO, T0_LO
	pxor		STATE0_HI, T0_HI

	movdqa		STATE2_LO, T1_LO
	movdqa		STATE2_HI, T1_HI
	pand		STATE3_LO, T1_LO
	pand		STATE3_HI, T1_HI
	pxor		T1_LO, T0_LO
	pxor		T1_HI, T0_HI

	movdqa		T0_LO, 0(%rdx)
	movdqa		T0_HI, 16(%rdx)

	call		__morus1280_update
	sub		$32, %rcx
	add		$32, %rsi
	add		$32, %rdx
	cmp		$32, %rcx
	jae		.Lenc_a_loop		/* unsigned: length is unsigned */

	jmp		.Lenc_cont
.align 4
.Lenc_u_loop:
	movdqu		0(%rsi), MSG_LO
	movdqu		16(%rsi), MSG_HI

	movdqa		STATE1_LO, T1_LO
	movdqa		STATE1_HI, T1_HI
	rol3		T1_HI, T1_LO

	movdqa		MSG_LO, T0_LO
	movdqa		MSG_HI, T0_HI
	pxor		T1_LO, T0_LO
	pxor		T1_HI, T0_HI
	pxor		STATE0_LO, T0_LO
	pxor		STATE0_HI, T0_HI

	movdqa		STATE2_LO, T1_LO
	movdqa		STATE2_HI, T1_HI
	pand		STATE3_LO, T1_LO
	pand		STATE3_HI, T1_HI
	pxor		T1_LO, T0_LO
	pxor		T1_HI, T0_HI

	movdqu		T0_LO, 0(%rdx)
	movdqu		T0_HI, 16(%rdx)

	call		__morus1280_update
	sub		$32, %rcx
	add		$32, %rsi
	add		$32, %rdx
	cmp		$32, %rcx
	jae		.Lenc_u_loop

.Lenc_cont:
	/* store the state: */
	movdqu		STATE0_LO, (0 * 16)(%rdi)
	movdqu		STATE0_HI, (1 * 16)(%rdi)
	movdqu		STATE1_LO, (2 * 16)(%rdi)
	movdqu		STATE1_HI, (3 * 16)(%rdi)
	movdqu		STATE2_LO, (4 * 16)(%rdi)
	movdqu		STATE2_HI, (5 * 16)(%rdi)
	movdqu		STATE3_LO, (6 * 16)(%rdi)
	movdqu		STATE3_HI, (7 * 16)(%rdi)
	movdqu		STATE4_LO, (8 * 16)(%rdi)
	movdqu		STATE4_HI, (9 * 16)(%rdi)

.Lenc_out:
	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_enc)
  /*
   * void crypto_morus1280_sse2_enc_tail(void *state, const void *src, void *dst,
   *                                     unsigned int length);
   *
   * Encrypts a final partial block.  __load_partial builds a zero-padded
   * block from src, the ciphertext is computed as
   *
   *   src ^ state[0] ^ rol3(state[1]) ^ (state[2] & state[3])
   *
   * and __store_partial writes its leading bytes to dst.  The state is then
   * updated with the zero-padded plaintext block.
   *
   *   %rdi - state (in/out)
   *   %rsi - src
   *   %rdx - dst
   *   %rcx - length
   */
ENTRY(crypto_morus1280_sse2_enc_tail)
	FRAME_BEGIN

	/* bring the five state rows into registers */
	movdqu		(0 * 16)(%rdi), STATE0_LO
	movdqu		(1 * 16)(%rdi), STATE0_HI
	movdqu		(2 * 16)(%rdi), STATE1_LO
	movdqu		(3 * 16)(%rdi), STATE1_HI
	movdqu		(4 * 16)(%rdi), STATE2_LO
	movdqu		(5 * 16)(%rdi), STATE2_HI
	movdqu		(6 * 16)(%rdi), STATE3_LO
	movdqu		(7 * 16)(%rdi), STATE3_HI
	movdqu		(8 * 16)(%rdi), STATE4_LO
	movdqu		(9 * 16)(%rdi), STATE4_HI

	/* MSG = zero-padded plaintext */
	call		__load_partial

	/* T1 = rol3(state[1]); rol3 uses T0_LO, so T0 is filled afterwards */
	movdqa		STATE1_LO, T1_LO
	movdqa		STATE1_HI, T1_HI
	rol3		T1_HI, T1_LO

	/* T0 = MSG ^ state[0] ^ T1 */
	movdqa		MSG_LO, T0_LO
	movdqa		MSG_HI, T0_HI
	pxor		STATE0_LO, T0_LO
	pxor		STATE0_HI, T0_HI
	pxor		T1_LO, T0_LO
	pxor		T1_HI, T0_HI

	/* T0 ^= state[2] & state[3] */
	movdqa		STATE2_LO, T1_LO
	movdqa		STATE2_HI, T1_HI
	pand		STATE3_LO, T1_LO
	pand		STATE3_HI, T1_HI
	pxor		T1_LO, T0_LO
	pxor		T1_HI, T0_HI

	/* write the ciphertext bytes, then absorb the plaintext block */
	call		__store_partial

	call		__morus1280_update

	/* write the state back */
	movdqu		STATE0_LO, (0 * 16)(%rdi)
	movdqu		STATE0_HI, (1 * 16)(%rdi)
	movdqu		STATE1_LO, (2 * 16)(%rdi)
	movdqu		STATE1_HI, (3 * 16)(%rdi)
	movdqu		STATE2_LO, (4 * 16)(%rdi)
	movdqu		STATE2_HI, (5 * 16)(%rdi)
	movdqu		STATE3_LO, (6 * 16)(%rdi)
	movdqu		STATE3_HI, (7 * 16)(%rdi)
	movdqu		STATE4_LO, (8 * 16)(%rdi)
	movdqu		STATE4_HI, (9 * 16)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_enc_tail)
  /*
   * void crypto_morus1280_sse2_dec(void *state, const void *src, void *dst,
   *                                unsigned int length);
   *
   * Decrypts whole 32-byte blocks.  For every block:
   *
   *   dst = src ^ state[0] ^ rol3(state[1]) ^ (state[2] & state[3])
   *
   * followed by a state update with the recovered plaintext block.  Any
   * remainder (length % 32) is not processed here, and a length below 32
   * leaves the state untouched.
   *
   *   %rdi - state (in/out)
   *   %rsi - src
   *   %rdx - dst
   *   %ecx - length
   */
ENTRY(crypto_morus1280_sse2_dec)
	FRAME_BEGIN

	/*
	 * length is a 32-bit argument, so the upper half of %rcx is not
	 * guaranteed to be zero on entry; clear it before 64-bit arithmetic.
	 */
	mov		%ecx, %ecx

	cmp		$32, %rcx
	jb		.Ldec_out

	/* load the state: */
	movdqu		(0 * 16)(%rdi), STATE0_LO
	movdqu		(1 * 16)(%rdi), STATE0_HI
	movdqu		(2 * 16)(%rdi), STATE1_LO
	movdqu		(3 * 16)(%rdi), STATE1_HI
	movdqu		(4 * 16)(%rdi), STATE2_LO
	movdqu		(5 * 16)(%rdi), STATE2_HI
	movdqu		(6 * 16)(%rdi), STATE3_LO
	movdqu		(7 * 16)(%rdi), STATE3_HI
	movdqu		(8 * 16)(%rdi), STATE4_LO
	movdqu		(9 * 16)(%rdi), STATE4_HI

	/* aligned loop only when both src and dst are 16-byte aligned */
	mov		%rsi, %r8
	or		%rdx, %r8
	and		$0xF, %r8
	jnz		.Ldec_u_loop

.align 4
.Ldec_a_loop:
	movdqa		0(%rsi), MSG_LO
	movdqa		16(%rsi), MSG_HI

	/* MSG is turned into the plaintext in place */
	pxor		STATE0_LO, MSG_LO
	pxor		STATE0_HI, MSG_HI

	movdqa		STATE1_LO, T1_LO
	movdqa		STATE1_HI, T1_HI
	rol3		T1_HI, T1_LO
	pxor		T1_LO, MSG_LO
	pxor		T1_HI, MSG_HI

	movdqa		STATE2_LO, T1_LO
	movdqa		STATE2_HI, T1_HI
	pand		STATE3_LO, T1_LO
	pand		STATE3_HI, T1_HI
	pxor		T1_LO, MSG_LO
	pxor		T1_HI, MSG_HI

	movdqa		MSG_LO, 0(%rdx)
	movdqa		MSG_HI, 16(%rdx)

	call		__morus1280_update
	sub		$32, %rcx
	add		$32, %rsi
	add		$32, %rdx
	cmp		$32, %rcx
	jae		.Ldec_a_loop		/* unsigned: length is unsigned */

	jmp		.Ldec_cont
.align 4
.Ldec_u_loop:
	movdqu		0(%rsi), MSG_LO
	movdqu		16(%rsi), MSG_HI

	pxor		STATE0_LO, MSG_LO
	pxor		STATE0_HI, MSG_HI

	movdqa		STATE1_LO, T1_LO
	movdqa		STATE1_HI, T1_HI
	rol3		T1_HI, T1_LO
	pxor		T1_LO, MSG_LO
	pxor		T1_HI, MSG_HI

	movdqa		STATE2_LO, T1_LO
	movdqa		STATE2_HI, T1_HI
	pand		STATE3_LO, T1_LO
	pand		STATE3_HI, T1_HI
	pxor		T1_LO, MSG_LO
	pxor		T1_HI, MSG_HI

	movdqu		MSG_LO, 0(%rdx)
	movdqu		MSG_HI, 16(%rdx)

	call		__morus1280_update
	sub		$32, %rcx
	add		$32, %rsi
	add		$32, %rdx
	cmp		$32, %rcx
	jae		.Ldec_u_loop

.Ldec_cont:
	/* store the state: */
	movdqu		STATE0_LO, (0 * 16)(%rdi)
	movdqu		STATE0_HI, (1 * 16)(%rdi)
	movdqu		STATE1_LO, (2 * 16)(%rdi)
	movdqu		STATE1_HI, (3 * 16)(%rdi)
	movdqu		STATE2_LO, (4 * 16)(%rdi)
	movdqu		STATE2_HI, (5 * 16)(%rdi)
	movdqu		STATE3_LO, (6 * 16)(%rdi)
	movdqu		STATE3_HI, (7 * 16)(%rdi)
	movdqu		STATE4_LO, (8 * 16)(%rdi)
	movdqu		STATE4_HI, (9 * 16)(%rdi)

.Ldec_out:
	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_dec)
  /*
   * void crypto_morus1280_sse2_dec_tail(void *state, const void *src, void *dst,
   *                                     unsigned int length);
   *
   * Decrypts a final partial block.  __load_partial builds a zero-padded
   * block from src, it is decrypted as
   *
   *   src ^ state[0] ^ rol3(state[1]) ^ (state[2] & state[3])
   *
   * and __store_partial writes the leading bytes to dst.  Before the state
   * update, every byte of the block at index >= length is cleared again,
   * so the state absorbs a zero-padded plaintext block.
   *
   *   %rdi - state (in/out)
   *   %rsi - src
   *   %rdx - dst
   *   %rcx - length
   */
ENTRY(crypto_morus1280_sse2_dec_tail)
	FRAME_BEGIN

	/* load the state: */
	movdqu		(0 * 16)(%rdi), STATE0_LO
	movdqu		(1 * 16)(%rdi), STATE0_HI
	movdqu		(2 * 16)(%rdi), STATE1_LO
	movdqu		(3 * 16)(%rdi), STATE1_HI
	movdqu		(4 * 16)(%rdi), STATE2_LO
	movdqu		(5 * 16)(%rdi), STATE2_HI
	movdqu		(6 * 16)(%rdi), STATE3_LO
	movdqu		(7 * 16)(%rdi), STATE3_HI
	movdqu		(8 * 16)(%rdi), STATE4_LO
	movdqu		(9 * 16)(%rdi), STATE4_HI

	/* decrypt message: */
	call		__load_partial

	pxor		STATE0_LO, MSG_LO
	pxor		STATE0_HI, MSG_HI

	movdqa		STATE1_LO, T1_LO
	movdqa		STATE1_HI, T1_HI
	rol3		T1_HI, T1_LO
	pxor		T1_LO, MSG_LO
	pxor		T1_HI, MSG_HI

	movdqa		STATE2_LO, T1_LO
	movdqa		STATE2_HI, T1_HI
	pand		STATE3_LO, T1_LO
	pand		STATE3_HI, T1_HI
	pxor		T1_LO, MSG_LO
	pxor		T1_HI, MSG_HI

	/* __store_partial takes the block in T0 */
	movdqa		MSG_LO, T0_LO
	movdqa		MSG_HI, T0_HI

	call		__store_partial

	/* mask with byte count: broadcast the low byte of length to all lanes */
	movq		%rcx, T0_LO
	punpcklbw	T0_LO, T0_LO
	punpcklbw	T0_LO, T0_LO
	punpcklbw	T0_LO, T0_LO
	punpcklbw	T0_LO, T0_LO
	movdqa		T0_LO, T0_HI

	/* lane i becomes 0xff where length > i (counter table, RIP-relative) */
	movdqa		.Lmorus640_counter_0(%rip), T1_LO
	movdqa		.Lmorus640_counter_1(%rip), T1_HI
	pcmpgtb		T1_LO, T0_LO
	pcmpgtb		T1_HI, T0_HI
	pand		T0_LO, MSG_LO
	pand		T0_HI, MSG_HI

	call		__morus1280_update

	/* store the state: */
	movdqu		STATE0_LO, (0 * 16)(%rdi)
	movdqu		STATE0_HI, (1 * 16)(%rdi)
	movdqu		STATE1_LO, (2 * 16)(%rdi)
	movdqu		STATE1_HI, (3 * 16)(%rdi)
	movdqu		STATE2_LO, (4 * 16)(%rdi)
	movdqu		STATE2_HI, (5 * 16)(%rdi)
	movdqu		STATE3_LO, (6 * 16)(%rdi)
	movdqu		STATE3_HI, (7 * 16)(%rdi)
	movdqu		STATE4_LO, (8 * 16)(%rdi)
	movdqu		STATE4_HI, (9 * 16)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_dec_tail)
  /*
   * void crypto_morus1280_sse2_final(void *state, void *tag_xor,
   *                                  u64 assoclen, u64 cryptlen);
   *
   * Finalisation: xors state[0] into state[4], runs ten updates with the
   * length block (assoclen * 8, cryptlen * 8, 0, 0) and then xors
   *
   *   state[0] ^ rol3(state[1]) ^ (state[2] & state[3])
   *
   * into the 32 bytes at tag_xor.  The state in memory is not written back.
   *
   *   %rdi - state (read only)
   *   %rsi - tag_xor (in/out, 32 bytes)
   *   %rdx - assoclen
   *   %rcx - cryptlen
   */
ENTRY(crypto_morus1280_sse2_final)
	FRAME_BEGIN

	/* load the state: */
	movdqu		(0 * 16)(%rdi), STATE0_LO
	movdqu		(1 * 16)(%rdi), STATE0_HI
	movdqu		(2 * 16)(%rdi), STATE1_LO
	movdqu		(3 * 16)(%rdi), STATE1_HI
	movdqu		(4 * 16)(%rdi), STATE2_LO
	movdqu		(5 * 16)(%rdi), STATE2_HI
	movdqu		(6 * 16)(%rdi), STATE3_LO
	movdqu		(7 * 16)(%rdi), STATE3_HI
	movdqu		(8 * 16)(%rdi), STATE4_LO
	movdqu		(9 * 16)(%rdi), STATE4_HI

	/* xor state[0] into state[4]: */
	pxor		STATE0_LO, STATE4_LO
	pxor		STATE0_HI, STATE4_HI

	/* prepare length block: */
	movq		%rdx, MSG_LO
	movq		%rcx, T0_LO
	pslldq		$8, T0_LO
	pxor		T0_LO, MSG_LO
	psllq		$3, MSG_LO /* multiply by 8 (to get bit count) */
	pxor		MSG_HI, MSG_HI

	/* update state: ten times with the same length block */
	.rept 10
	call		__morus1280_update
	.endr

	/* xor tag: */
	movdqu		0(%rsi), MSG_LO
	movdqu		16(%rsi), MSG_HI

	pxor		STATE0_LO, MSG_LO
	pxor		STATE0_HI, MSG_HI

	/*
	 * The rotated copy of state[1] must live in T1: rol3 uses T0_LO as
	 * its scratch register, so passing T0_LO as an operand would make the
	 * macro read an already overwritten low half and produce a wrong
	 * upper quadword in the high half of the result.
	 */
	movdqa		STATE1_LO, T1_LO
	movdqa		STATE1_HI, T1_HI
	rol3		T1_HI, T1_LO
	pxor		T1_LO, MSG_LO
	pxor		T1_HI, MSG_HI

	movdqa		STATE2_LO, T0_LO
	movdqa		STATE2_HI, T0_HI
	pand		STATE3_LO, T0_LO
	pand		STATE3_HI, T0_HI
	pxor		T0_LO, MSG_LO
	pxor		T0_HI, MSG_HI

	movdqu		MSG_LO, 0(%rsi)
	movdqu		MSG_HI, 16(%rsi)

	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_final)