/* sm4-armv8-aarch64-ce.S - ARMv8/AArch64/CE accelerated SM4 cipher
 *
 * Copyright (C) 2022 Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
#include "asm-common-aarch64.h"

#if defined(__AARCH64EL__) && \
    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \
    defined(USE_SM4)

.cpu generic+simd+crypto
#define vecnum_v0  0
#define vecnum_v1  1
#define vecnum_v2  2
#define vecnum_v3  3
#define vecnum_v4  4
#define vecnum_v5  5
#define vecnum_v6  6
#define vecnum_v7  7
#define vecnum_v16 16
#define vecnum_v24 24
#define vecnum_v25 25
#define vecnum_v26 26
#define vecnum_v27 27
#define vecnum_v28 28
#define vecnum_v29 29
#define vecnum_v30 30
#define vecnum_v31 31
#define sm4e(vd, vn) \
    .inst (0xcec08400 | (vecnum_##vn << 5) | vecnum_##vd)

#define sm4ekey(vd, vn, vm) \
    .inst (0xce60c800 | (vecnum_##vm << 16) | (vecnum_##vn << 5) | vecnum_##vd)
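
/* The two macros above emit the Armv8.2 SM4 Crypto Extension instructions
 * directly as .inst words, so the file assembles even when the assembler
 * does not know the SM4E/SM4EKEY mnemonics:
 *
 *   SM4E    <Vd>.4S, <Vn>.4S         - four SM4 rounds on the state in Vd,
 *                                      using the four round keys in Vn.
 *   SM4EKEY <Vd>.4S, <Vn>.4S, <Vm>.4S - derives the next four round keys
 *                                      into Vd from the previous four in Vn
 *                                      and the four CK constants in Vm.
 *
 * The vecnum_* defines map register names to the encoding fields
 * (Vd at bits [4:0], Vn at bits [9:5], Vm at bits [20:16]).
 */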

.text

/* Register macros */

#define RTMP0 v16
#define RTMP1 v17
#define RTMP2 v18
#define RTMP3 v19

#define RIV   v20
#define RMASK v21

/* Helper macros. */

#define load_rkey(ptr) \
    ld1 {v24.16b-v27.16b}, [ptr], #64; \
    ld1 {v28.16b-v31.16b}, [ptr];
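
/* load_rkey() pulls all 32 32-bit round keys (eight 128-bit vectors) from
 * `ptr` into v24-v31, where the block-crypt macros below consume them. */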

#define SM4_CRYPT_BLK(b0) \
    rev32 b0.16b, b0.16b; \
    sm4e(b0, v24); \
    sm4e(b0, v25); \
    sm4e(b0, v26); \
    sm4e(b0, v27); \
    sm4e(b0, v28); \
    sm4e(b0, v29); \
    sm4e(b0, v30); \
    sm4e(b0, v31); \
    rev64 b0.4s, b0.4s; \
    ext b0.16b, b0.16b, b0.16b, #8; \
    rev32 b0.16b, b0.16b;
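
/* SM4_CRYPT_BLK(): one 16-byte block through all 32 rounds.  The leading
 * rev32 byte-swaps each big-endian 32-bit word of the input into the
 * native little-endian lanes SM4E operates on; the eight SM4E steps apply
 * four rounds each; the trailing rev64 + ext #8 pair reverses the order of
 * the four state words (SM4's final reverse transform) and the last rev32
 * converts back to big-endian byte order. */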

#define crypt_blk4(b0, b1, b2, b3) \
    rev32 b0.16b, b0.16b; \
    rev32 b1.16b, b1.16b; \
    rev32 b2.16b, b2.16b; \
    rev32 b3.16b, b3.16b; \
    sm4e(b0, v24); \
    sm4e(b1, v24); \
    sm4e(b2, v24); \
    sm4e(b3, v24); \
    sm4e(b0, v25); \
    sm4e(b1, v25); \
    sm4e(b2, v25); \
    sm4e(b3, v25); \
    sm4e(b0, v26); \
    sm4e(b1, v26); \
    sm4e(b2, v26); \
    sm4e(b3, v26); \
    sm4e(b0, v27); \
    sm4e(b1, v27); \
    sm4e(b2, v27); \
    sm4e(b3, v27); \
    sm4e(b0, v28); \
    sm4e(b1, v28); \
    sm4e(b2, v28); \
    sm4e(b3, v28); \
    sm4e(b0, v29); \
    sm4e(b1, v29); \
    sm4e(b2, v29); \
    sm4e(b3, v29); \
    sm4e(b0, v30); \
    sm4e(b1, v30); \
    sm4e(b2, v30); \
    sm4e(b3, v30); \
    sm4e(b0, v31); \
    sm4e(b1, v31); \
    sm4e(b2, v31); \
    sm4e(b3, v31); \
    rev64 b0.4s, b0.4s; \
    rev64 b1.4s, b1.4s; \
    rev64 b2.4s, b2.4s; \
    rev64 b3.4s, b3.4s; \
    ext b0.16b, b0.16b, b0.16b, #8; \
    ext b1.16b, b1.16b, b1.16b, #8; \
    ext b2.16b, b2.16b, b2.16b, #8; \
    ext b3.16b, b3.16b, b3.16b, #8; \
    rev32 b0.16b, b0.16b; \
    rev32 b1.16b, b1.16b; \
    rev32 b2.16b, b2.16b; \
    rev32 b3.16b, b3.16b;

#define crypt_blk8(b0, b1, b2, b3, b4, b5, b6, b7) \
    rev32 b0.16b, b0.16b; \
    rev32 b1.16b, b1.16b; \
    rev32 b2.16b, b2.16b; \
    rev32 b3.16b, b3.16b; \
    rev32 b4.16b, b4.16b; \
    rev32 b5.16b, b5.16b; \
    rev32 b6.16b, b6.16b; \
    rev32 b7.16b, b7.16b; \
    sm4e(b0, v24); \
    sm4e(b1, v24); \
    sm4e(b2, v24); \
    sm4e(b3, v24); \
    sm4e(b4, v24); \
    sm4e(b5, v24); \
    sm4e(b6, v24); \
    sm4e(b7, v24); \
    sm4e(b0, v25); \
    sm4e(b1, v25); \
    sm4e(b2, v25); \
    sm4e(b3, v25); \
    sm4e(b4, v25); \
    sm4e(b5, v25); \
    sm4e(b6, v25); \
    sm4e(b7, v25); \
    sm4e(b0, v26); \
    sm4e(b1, v26); \
    sm4e(b2, v26); \
    sm4e(b3, v26); \
    sm4e(b4, v26); \
    sm4e(b5, v26); \
    sm4e(b6, v26); \
    sm4e(b7, v26); \
    sm4e(b0, v27); \
    sm4e(b1, v27); \
    sm4e(b2, v27); \
    sm4e(b3, v27); \
    sm4e(b4, v27); \
    sm4e(b5, v27); \
    sm4e(b6, v27); \
    sm4e(b7, v27); \
    sm4e(b0, v28); \
    sm4e(b1, v28); \
    sm4e(b2, v28); \
    sm4e(b3, v28); \
    sm4e(b4, v28); \
    sm4e(b5, v28); \
    sm4e(b6, v28); \
    sm4e(b7, v28); \
    sm4e(b0, v29); \
    sm4e(b1, v29); \
    sm4e(b2, v29); \
    sm4e(b3, v29); \
    sm4e(b4, v29); \
    sm4e(b5, v29); \
    sm4e(b6, v29); \
    sm4e(b7, v29); \
    sm4e(b0, v30); \
    sm4e(b1, v30); \
    sm4e(b2, v30); \
    sm4e(b3, v30); \
    sm4e(b4, v30); \
    sm4e(b5, v30); \
    sm4e(b6, v30); \
    sm4e(b7, v30); \
    sm4e(b0, v31); \
    sm4e(b1, v31); \
    sm4e(b2, v31); \
    sm4e(b3, v31); \
    sm4e(b4, v31); \
    sm4e(b5, v31); \
    sm4e(b6, v31); \
    sm4e(b7, v31); \
    rev64 b0.4s, b0.4s; \
    rev64 b1.4s, b1.4s; \
    rev64 b2.4s, b2.4s; \
    rev64 b3.4s, b3.4s; \
    rev64 b4.4s, b4.4s; \
    rev64 b5.4s, b5.4s; \
    rev64 b6.4s, b6.4s; \
    rev64 b7.4s, b7.4s; \
    ext b0.16b, b0.16b, b0.16b, #8; \
    ext b1.16b, b1.16b, b1.16b, #8; \
    ext b2.16b, b2.16b, b2.16b, #8; \
    ext b3.16b, b3.16b, b3.16b, #8; \
    ext b4.16b, b4.16b, b4.16b, #8; \
    ext b5.16b, b5.16b, b5.16b, #8; \
    ext b6.16b, b6.16b, b6.16b, #8; \
    ext b7.16b, b7.16b, b7.16b, #8; \
    rev32 b0.16b, b0.16b; \
    rev32 b1.16b, b1.16b; \
    rev32 b2.16b, b2.16b; \
    rev32 b3.16b, b3.16b; \
    rev32 b4.16b, b4.16b; \
    rev32 b5.16b, b5.16b; \
    rev32 b6.16b, b6.16b; \
    rev32 b7.16b, b7.16b;
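
/* crypt_blk4()/crypt_blk8() are the 4- and 8-block variants of
 * SM4_CRYPT_BLK().  The SM4E steps of the different blocks are interleaved
 * so that adjacent instructions are independent, hiding the SM4E result
 * latency instead of serializing one block at a time. */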

.align 4
.global _gcry_sm4_armv8_ce_expand_key
ELF(.type _gcry_sm4_armv8_ce_expand_key,%function;)
_gcry_sm4_armv8_ce_expand_key:
    /* input:
     *   x0: 128-bit key
     *   x1: rkey_enc
     *   x2: rkey_dec
     *   x3: fk array
     *   x4: ck array
     */
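    /* The C caller presumably declares this roughly as follows (a sketch;
     * parameter types assumed, not taken from this file):
     *
     *   void _gcry_sm4_armv8_ce_expand_key(const byte *key,
     *                                      u32 *rkey_enc, u32 *rkey_dec,
     *                                      const u32 *fk, const u32 *ck);
     *
     * rkey_enc receives rk0..rk31; rkey_dec receives the same keys in
     * reverse order (rk31..rk0), which is all SM4 needs for decryption.
     */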
    CFI_STARTPROC();

    ld1 {v0.16b}, [x0];
    rev32 v0.16b, v0.16b;
    ld1 {v1.16b}, [x3];
    load_rkey(x4);

    /* input ^ fk */
    eor v0.16b, v0.16b, v1.16b;

    sm4ekey(v0, v0, v24);
    sm4ekey(v1, v0, v25);
    sm4ekey(v2, v1, v26);
    sm4ekey(v3, v2, v27);
    sm4ekey(v4, v3, v28);
    sm4ekey(v5, v4, v29);
    sm4ekey(v6, v5, v30);
    sm4ekey(v7, v6, v31);

    st1 {v0.16b-v3.16b}, [x1], #64;
    st1 {v4.16b-v7.16b}, [x1];

    rev64 v7.4s, v7.4s;
    rev64 v6.4s, v6.4s;
    rev64 v5.4s, v5.4s;
    rev64 v4.4s, v4.4s;
    rev64 v3.4s, v3.4s;
    rev64 v2.4s, v2.4s;
    rev64 v1.4s, v1.4s;
    rev64 v0.4s, v0.4s;
    ext v7.16b, v7.16b, v7.16b, #8;
    ext v6.16b, v6.16b, v6.16b, #8;
    ext v5.16b, v5.16b, v5.16b, #8;
    ext v4.16b, v4.16b, v4.16b, #8;
    ext v3.16b, v3.16b, v3.16b, #8;
    ext v2.16b, v2.16b, v2.16b, #8;
    ext v1.16b, v1.16b, v1.16b, #8;
    ext v0.16b, v0.16b, v0.16b, #8;
    st1 {v7.16b}, [x2], #16;
    st1 {v6.16b}, [x2], #16;
    st1 {v5.16b}, [x2], #16;
    st1 {v4.16b}, [x2], #16;
    st1 {v3.16b}, [x2], #16;
    st1 {v2.16b}, [x2], #16;
    st1 {v1.16b}, [x2], #16;
    st1 {v0.16b}, [x2];

    ret_spec_stop;
    CFI_ENDPROC();
ELF(.size _gcry_sm4_armv8_ce_expand_key,.-_gcry_sm4_armv8_ce_expand_key;)

.align 4
ELF(.type sm4_armv8_ce_crypt_blk1_4,%function;)
sm4_armv8_ce_crypt_blk1_4:
    /* input:
     *   x0: round key array, CTX
     *   x1: dst
     *   x2: src
     *   x3: num blocks (1..4)
     */
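    /* Local helper (not exported): the first block is replicated into the
     * unused registers so crypt_blk4() always runs on four vectors; only
     * the x3 requested blocks are loaded from src and stored to dst. */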
    CFI_STARTPROC();

    load_rkey(x0);

    ld1 {v0.16b}, [x2], #16;
    mov v1.16b, v0.16b;
    mov v2.16b, v0.16b;
    mov v3.16b, v0.16b;
    cmp x3, #2;
    blt .Lblk4_load_input_done;
    ld1 {v1.16b}, [x2], #16;
    beq .Lblk4_load_input_done;
    ld1 {v2.16b}, [x2], #16;
    cmp x3, #3;
    beq .Lblk4_load_input_done;
    ld1 {v3.16b}, [x2];

.Lblk4_load_input_done:
    crypt_blk4(v0, v1, v2, v3);

    st1 {v0.16b}, [x1], #16;
    cmp x3, #2;
    blt .Lblk4_store_output_done;
    st1 {v1.16b}, [x1], #16;
    beq .Lblk4_store_output_done;
    st1 {v2.16b}, [x1], #16;
    cmp x3, #3;
    beq .Lblk4_store_output_done;
    st1 {v3.16b}, [x1];

.Lblk4_store_output_done:
    ret_spec_stop;
    CFI_ENDPROC();
ELF(.size sm4_armv8_ce_crypt_blk1_4,.-sm4_armv8_ce_crypt_blk1_4;)

.align 4
.global _gcry_sm4_armv8_ce_crypt_blk1_8
ELF(.type _gcry_sm4_armv8_ce_crypt_blk1_8,%function;)
_gcry_sm4_armv8_ce_crypt_blk1_8:
    /* input:
     *   x0: round key array, CTX
     *   x1: dst
     *   x2: src
     *   x3: num blocks (1..8)
     */
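    /* Counts below five branch straight to sm4_armv8_ce_crypt_blk1_4;
     * otherwise the fifth block is replicated into the unused registers so
     * crypt_blk8() always runs on eight vectors.  Expected C prototype, as
     * a sketch (types assumed):
     *
     *   void _gcry_sm4_armv8_ce_crypt_blk1_8(const u32 *rk, byte *out,
     *                                        const byte *in,
     *                                        size_t num_blks);
     */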
    CFI_STARTPROC();

    cmp x3, #5;
    blt sm4_armv8_ce_crypt_blk1_4;

    load_rkey(x0);

    ld1 {v0.16b-v3.16b}, [x2], #64;
    ld1 {v4.16b}, [x2], #16;
    mov v5.16b, v4.16b;
    mov v6.16b, v4.16b;
    mov v7.16b, v4.16b;
    beq .Lblk8_load_input_done;
    ld1 {v5.16b}, [x2], #16;
    cmp x3, #7;
    blt .Lblk8_load_input_done;
    ld1 {v6.16b}, [x2], #16;
    beq .Lblk8_load_input_done;
    ld1 {v7.16b}, [x2];

.Lblk8_load_input_done:
    crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);

    cmp x3, #6;
    st1 {v0.16b-v3.16b}, [x1], #64;
    st1 {v4.16b}, [x1], #16;
    blt .Lblk8_store_output_done;
    st1 {v5.16b}, [x1], #16;
    beq .Lblk8_store_output_done;
    st1 {v6.16b}, [x1], #16;
    cmp x3, #7;
    beq .Lblk8_store_output_done;
    st1 {v7.16b}, [x1];

.Lblk8_store_output_done:
    ret_spec_stop;
    CFI_ENDPROC();
ELF(.size _gcry_sm4_armv8_ce_crypt_blk1_8,.-_gcry_sm4_armv8_ce_crypt_blk1_8;)

.align 4
.global _gcry_sm4_armv8_ce_crypt
ELF(.type _gcry_sm4_armv8_ce_crypt,%function;)
_gcry_sm4_armv8_ce_crypt:
    /* input:
     *   x0: round key array, CTX
     *   x1: dst
     *   x2: src
     *   x3: nblocks (multiples of 8)
     */
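    /* Plain ECB-style bulk processing: eight blocks per loop iteration,
     * each block run independently through SM4 with the round keys in x0
     * (encryption or decryption is decided solely by which key schedule
     * the caller passes). */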
    CFI_STARTPROC();

    load_rkey(x0);

.Lcrypt_loop_blk:
    subs x3, x3, #8;
    bmi .Lcrypt_end;

    ld1 {v0.16b-v3.16b}, [x2], #64;
    ld1 {v4.16b-v7.16b}, [x2], #64;

    crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);

    st1 {v0.16b-v3.16b}, [x1], #64;
    st1 {v4.16b-v7.16b}, [x1], #64;

    b .Lcrypt_loop_blk;

.Lcrypt_end:
    ret_spec_stop;
    CFI_ENDPROC();
ELF(.size _gcry_sm4_armv8_ce_crypt,.-_gcry_sm4_armv8_ce_crypt;)

.align 4
.global _gcry_sm4_armv8_ce_cbc_dec
ELF(.type _gcry_sm4_armv8_ce_cbc_dec,%function;)
_gcry_sm4_armv8_ce_cbc_dec:
    /* input:
     *   x0: round key array, CTX
     *   x1: dst
     *   x2: src
     *   x3: iv (big endian, 128 bit)
     *   x4: nblocks (multiples of 8)
     */
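    /* CBC decryption: P[i] = D(C[i]) ^ C[i-1], with C[-1] = IV.  All eight
     * ciphertext blocks are decrypted first, then XORed with the previous
     * ciphertext blocks reloaded from src; the IV is updated to the last
     * ciphertext block processed. */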
    CFI_STARTPROC();

    load_rkey(x0);
    ld1 {RIV.16b}, [x3];

.Lcbc_loop_blk:
    subs x4, x4, #8;
    bmi .Lcbc_end;

    ld1 {v0.16b-v3.16b}, [x2], #64;
    ld1 {v4.16b-v7.16b}, [x2];

    crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);

    sub x2, x2, #64;
    eor v0.16b, v0.16b, RIV.16b;
    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
    eor v1.16b, v1.16b, RTMP0.16b;
    eor v2.16b, v2.16b, RTMP1.16b;
    eor v3.16b, v3.16b, RTMP2.16b;
    st1 {v0.16b-v3.16b}, [x1], #64;

    eor v4.16b, v4.16b, RTMP3.16b;
    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
    eor v5.16b, v5.16b, RTMP0.16b;
    eor v6.16b, v6.16b, RTMP1.16b;
    eor v7.16b, v7.16b, RTMP2.16b;

    mov RIV.16b, RTMP3.16b;
    st1 {v4.16b-v7.16b}, [x1], #64;

    b .Lcbc_loop_blk;

.Lcbc_end:
    /* store new IV */
    st1 {RIV.16b}, [x3];

    ret_spec_stop;
    CFI_ENDPROC();
ELF(.size _gcry_sm4_armv8_ce_cbc_dec,.-_gcry_sm4_armv8_ce_cbc_dec;)

.align 4
.global _gcry_sm4_armv8_ce_cfb_dec
ELF(.type _gcry_sm4_armv8_ce_cfb_dec,%function;)
_gcry_sm4_armv8_ce_cfb_dec:
    /* input:
     *   x0: round key array, CTX
     *   x1: dst
     *   x2: src
     *   x3: iv (big endian, 128 bit)
     *   x4: nblocks (multiples of 8)
     */
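    /* CFB decryption: P[i] = E(C[i-1]) ^ C[i], with C[-1] = IV.  v0 holds
     * the IV and v1-v7 the first seven ciphertext blocks; after encryption
     * the results are XORed with the eight ciphertext blocks reloaded from
     * src, and the last ciphertext block becomes the new IV. */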
    CFI_STARTPROC();

    load_rkey(x0);
    ld1 {v0.16b}, [x3];

.Lcfb_loop_blk:
    subs x4, x4, #8;
    bmi .Lcfb_end;

    ld1 {v1.16b, v2.16b, v3.16b}, [x2], #48;
    ld1 {v4.16b-v7.16b}, [x2];

    crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);

    sub x2, x2, #48;
    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
    eor v0.16b, v0.16b, RTMP0.16b;
    eor v1.16b, v1.16b, RTMP1.16b;
    eor v2.16b, v2.16b, RTMP2.16b;
    eor v3.16b, v3.16b, RTMP3.16b;
    st1 {v0.16b-v3.16b}, [x1], #64;

    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
    eor v4.16b, v4.16b, RTMP0.16b;
    eor v5.16b, v5.16b, RTMP1.16b;
    eor v6.16b, v6.16b, RTMP2.16b;
    eor v7.16b, v7.16b, RTMP3.16b;
    st1 {v4.16b-v7.16b}, [x1], #64;

    mov v0.16b, RTMP3.16b;

    b .Lcfb_loop_blk;

.Lcfb_end:
    /* store new IV */
    st1 {v0.16b}, [x3];

    ret_spec_stop;
    CFI_ENDPROC();
ELF(.size _gcry_sm4_armv8_ce_cfb_dec,.-_gcry_sm4_armv8_ce_cfb_dec;)

.align 4
.global _gcry_sm4_armv8_ce_ctr_enc
ELF(.type _gcry_sm4_armv8_ce_ctr_enc,%function;)
_gcry_sm4_armv8_ce_ctr_enc:
    /* input:
     *   x0: round key array, CTX
     *   x1: dst
     *   x2: src
     *   x3: ctr (big endian, 128 bit)
     *   x4: nblocks (multiples of 8)
     */
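    /* CTR mode: C[i] = P[i] ^ E(ctr + i).  The 128-bit big-endian counter
     * is kept as two native integers in x7 (high half) and x8 (low half);
     * the inc_le128 macro below materializes the current counter as a
     * big-endian vector and post-increments it with carry. */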
    CFI_STARTPROC();

    load_rkey(x0);

    ldp x7, x8, [x3];
    rev x7, x7;
    rev x8, x8;

.Lctr_loop_blk:
    subs x4, x4, #8;
    bmi .Lctr_end;

#define inc_le128(vctr) \
    mov vctr.d[1], x8; \
    mov vctr.d[0], x7; \
    adds x8, x8, #1; \
    adc x7, x7, xzr; \
    rev64 vctr.16b, vctr.16b;

    /* construct CTRs */
    inc_le128(v0); /* +0 */
    inc_le128(v1); /* +1 */
    inc_le128(v2); /* +2 */
    inc_le128(v3); /* +3 */
    inc_le128(v4); /* +4 */
    inc_le128(v5); /* +5 */
    inc_le128(v6); /* +6 */
    inc_le128(v7); /* +7 */

    crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);

    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
    eor v0.16b, v0.16b, RTMP0.16b;
    eor v1.16b, v1.16b, RTMP1.16b;
    eor v2.16b, v2.16b, RTMP2.16b;
    eor v3.16b, v3.16b, RTMP3.16b;
    st1 {v0.16b-v3.16b}, [x1], #64;

    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
    eor v4.16b, v4.16b, RTMP0.16b;
    eor v5.16b, v5.16b, RTMP1.16b;
    eor v6.16b, v6.16b, RTMP2.16b;
    eor v7.16b, v7.16b, RTMP3.16b;
    st1 {v4.16b-v7.16b}, [x1], #64;

    b .Lctr_loop_blk;

.Lctr_end:
    /* store new CTR */
    rev x7, x7;
    rev x8, x8;
    stp x7, x8, [x3];

    ret_spec_stop;
    CFI_ENDPROC();
ELF(.size _gcry_sm4_armv8_ce_ctr_enc,.-_gcry_sm4_armv8_ce_ctr_enc;)

.align 4
.global _gcry_sm4_armv8_ce_xts_crypt
ELF(.type _gcry_sm4_armv8_ce_xts_crypt,%function;)
_gcry_sm4_armv8_ce_xts_crypt:
    /* input:
     *   x0: round key array, CTX
     *   x1: dst
     *   x2: src
     *   x3: tweak (big endian, 128 bit)
     *   x4: nblocks
     */
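    /* XTS: C[i] = E(P[i] ^ T[i]) ^ T[i], where T[0] is the tweak passed in
     * x3 and T[i+1] = T[i] * x in GF(2^128) reduced by
     * x^128 + x^7 + x^2 + x + 1 (hence the 0x87 constant in RMASK).
     * v8 always holds the current tweak and v9-v15 the next seven;
     * VPUSH_ABI/VPOP_ABI preserve the callee-saved d8-d15 registers
     * clobbered here. */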
    CFI_STARTPROC()
    VPUSH_ABI

    load_rkey(x0)

    mov x7, #0x87
    mov x8, #0x1
    mov RMASK.d[0], x7
    mov RMASK.d[1], x8

    ld1 {RIV.16b}, [x3]
    mov v8.16b, RIV.16b
    ext RIV.16b, RIV.16b, RIV.16b, #8

.Lxts_loop_blk:
    sub x4, x4, #8
    tbnz x4, #63, .Lxts_tail8

#define tweak_next(vt, vin, RTMP) \
    sshr RTMP.2d, RIV.2d, #63; \
    add vt.2d, vin.2d, vin.2d; \
    and RTMP.16b, RTMP.16b, RMASK.16b; \
    add RIV.2d, RIV.2d, RIV.2d; \
    eor vt.16b, vt.16b, RTMP.16b;
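
    /* tweak_next() doubles the tweak in GF(2^128): each 64-bit half of vin
     * is shifted left by one, 0x87 is folded into the low half when the
     * high half's top bit was set (the reduction) and 1 into the high half
     * when the low half's top bit was set (the cross-half carry).  RIV is
     * the half-swapped shadow of the tweak set up above, so sshr #63 picks
     * up those two top bits in the lanes matching RMASK = {0x87, 1}. */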
    tweak_next( v9, v8, RTMP0)
    tweak_next(v10, v9, RTMP1)
    tweak_next(v11, v10, RTMP2)
    tweak_next(v12, v11, RTMP3)
    tweak_next(v13, v12, RTMP0)
    tweak_next(v14, v13, RTMP1)
    tweak_next(v15, v14, RTMP2)

    ld1 {v0.16b-v3.16b}, [x2], #64
    eor v0.16b, v0.16b, v8.16b
    eor v1.16b, v1.16b, v9.16b
    eor v2.16b, v2.16b, v10.16b
    eor v3.16b, v3.16b, v11.16b
    ld1 {v4.16b-v7.16b}, [x2], #64
    eor v4.16b, v4.16b, v12.16b
    eor v5.16b, v5.16b, v13.16b
    eor v6.16b, v6.16b, v14.16b
    eor v7.16b, v7.16b, v15.16b

    crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7)

    eor v0.16b, v0.16b, v8.16b
    eor v1.16b, v1.16b, v9.16b
    eor v2.16b, v2.16b, v10.16b
    eor v3.16b, v3.16b, v11.16b
    st1 {v0.16b-v3.16b}, [x1], #64
    eor v4.16b, v4.16b, v12.16b
    eor v5.16b, v5.16b, v13.16b
    eor v6.16b, v6.16b, v14.16b
    eor v7.16b, v7.16b, v15.16b
    st1 {v4.16b-v7.16b}, [x1], #64

    tweak_next(v8, v15, RTMP3)

    cbz x4, .Lxts_end
    b .Lxts_loop_blk

.Lxts_tail8:
    add x4, x4, #8
    cmp x4, #4
    blt .Lxts_tail4

    sub x4, x4, #4

    tweak_next( v9, v8, RTMP0)
    tweak_next(v10, v9, RTMP1)
    tweak_next(v11, v10, RTMP2)

    ld1 {v0.16b-v3.16b}, [x2], #64
    eor v0.16b, v0.16b, v8.16b
    eor v1.16b, v1.16b, v9.16b
    eor v2.16b, v2.16b, v10.16b
    eor v3.16b, v3.16b, v11.16b

    crypt_blk4(v0, v1, v2, v3);

    eor v0.16b, v0.16b, v8.16b
    eor v1.16b, v1.16b, v9.16b
    eor v2.16b, v2.16b, v10.16b
    eor v3.16b, v3.16b, v11.16b
    st1 {v0.16b-v3.16b}, [x1], #64

    tweak_next(v8, v11, RTMP3)

    cbz x4, .Lxts_end

.Lxts_tail4:
    sub x4, x4, #1

    ld1 {v0.16b}, [x2], #16
    eor v0.16b, v0.16b, v8.16b

    SM4_CRYPT_BLK(v0)

    eor v0.16b, v0.16b, v8.16b
    st1 {v0.16b}, [x1], #16

    tweak_next(v8, v8, RTMP0)

    cbnz x4, .Lxts_tail4

.Lxts_end:
    /* store new tweak */
    st1 {v8.16b}, [x3]

    CLEAR_REG(v8)
    CLEAR_REG(v9)
    CLEAR_REG(v10)
    CLEAR_REG(v11)
    CLEAR_REG(v12)
    CLEAR_REG(v13)
    CLEAR_REG(v14)
    CLEAR_REG(v15)
    CLEAR_REG(RIV)

    VPOP_ABI

    ret_spec_stop
    CFI_ENDPROC()
ELF(.size _gcry_sm4_armv8_ce_xts_crypt,.-_gcry_sm4_armv8_ce_xts_crypt;)

#endif