/* sm4-aarch64.S - ARMv8/AArch64 accelerated SM4 cipher
 *
 * Copyright (C) 2022 Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include "asm-common-aarch64.h"

#if defined(__AARCH64EL__) && \
    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON) && \
    defined(USE_SM4)

.cpu generic+simd

/* Constants */

SECTION_RODATA
.align 4
ELF(.type _gcry_sm4_aarch64_consts,@object)
_gcry_sm4_aarch64_consts:
.Lsm4_sbox:
    .byte 0xd6, 0x90, 0xe9, 0xfe, 0xcc, 0xe1, 0x3d, 0xb7
    .byte 0x16, 0xb6, 0x14, 0xc2, 0x28, 0xfb, 0x2c, 0x05
    .byte 0x2b, 0x67, 0x9a, 0x76, 0x2a, 0xbe, 0x04, 0xc3
    .byte 0xaa, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99
    .byte 0x9c, 0x42, 0x50, 0xf4, 0x91, 0xef, 0x98, 0x7a
    .byte 0x33, 0x54, 0x0b, 0x43, 0xed, 0xcf, 0xac, 0x62
    .byte 0xe4, 0xb3, 0x1c, 0xa9, 0xc9, 0x08, 0xe8, 0x95
    .byte 0x80, 0xdf, 0x94, 0xfa, 0x75, 0x8f, 0x3f, 0xa6
    .byte 0x47, 0x07, 0xa7, 0xfc, 0xf3, 0x73, 0x17, 0xba
    .byte 0x83, 0x59, 0x3c, 0x19, 0xe6, 0x85, 0x4f, 0xa8
    .byte 0x68, 0x6b, 0x81, 0xb2, 0x71, 0x64, 0xda, 0x8b
    .byte 0xf8, 0xeb, 0x0f, 0x4b, 0x70, 0x56, 0x9d, 0x35
    .byte 0x1e, 0x24, 0x0e, 0x5e, 0x63, 0x58, 0xd1, 0xa2
    .byte 0x25, 0x22, 0x7c, 0x3b, 0x01, 0x21, 0x78, 0x87
    .byte 0xd4, 0x00, 0x46, 0x57, 0x9f, 0xd3, 0x27, 0x52
    .byte 0x4c, 0x36, 0x02, 0xe7, 0xa0, 0xc4, 0xc8, 0x9e
    .byte 0xea, 0xbf, 0x8a, 0xd2, 0x40, 0xc7, 0x38, 0xb5
    .byte 0xa3, 0xf7, 0xf2, 0xce, 0xf9, 0x61, 0x15, 0xa1
    .byte 0xe0, 0xae, 0x5d, 0xa4, 0x9b, 0x34, 0x1a, 0x55
    .byte 0xad, 0x93, 0x32, 0x30, 0xf5, 0x8c, 0xb1, 0xe3
    .byte 0x1d, 0xf6, 0xe2, 0x2e, 0x82, 0x66, 0xca, 0x60
    .byte 0xc0, 0x29, 0x23, 0xab, 0x0d, 0x53, 0x4e, 0x6f
    .byte 0xd5, 0xdb, 0x37, 0x45, 0xde, 0xfd, 0x8e, 0x2f
    .byte 0x03, 0xff, 0x6a, 0x72, 0x6d, 0x6c, 0x5b, 0x51
    .byte 0x8d, 0x1b, 0xaf, 0x92, 0xbb, 0xdd, 0xbc, 0x7f
    .byte 0x11, 0xd9, 0x5c, 0x41, 0x1f, 0x10, 0x5a, 0xd8
    .byte 0x0a, 0xc1, 0x31, 0x88, 0xa5, 0xcd, 0x7b, 0xbd
    .byte 0x2d, 0x74, 0xd0, 0x12, 0xb8, 0xe5, 0xb4, 0xb0
    .byte 0x89, 0x69, 0x97, 0x4a, 0x0c, 0x96, 0x77, 0x7e
    .byte 0x65, 0xb9, 0xf1, 0x09, 0xc5, 0x6e, 0xc6, 0x84
    .byte 0x18, 0xf0, 0x7d, 0xec, 0x3a, 0xdc, 0x4d, 0x20
    .byte 0x79, 0xee, 0x5f, 0x3e, 0xd7, 0xcb, 0x39, 0x48
ELF(.size _gcry_sm4_aarch64_consts,.-_gcry_sm4_aarch64_consts)

/* Register macros */

#define RTMP0  v8
#define RTMP1  v9
#define RTMP2  v10
#define RTMP3  v11

#define RX0    v12
#define RX1    v13
#define RKEY   v14
#define RIV    v15

/* Helper macros. */

#define preload_sbox(ptr) \
        GET_DATA_POINTER(ptr, .Lsm4_sbox); \
        ld1 {v16.16b-v19.16b}, [ptr], #64; \
        ld1 {v20.16b-v23.16b}, [ptr], #64; \
        ld1 {v24.16b-v27.16b}, [ptr], #64; \
        ld1 {v28.16b-v31.16b}, [ptr];
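
/* The 256-byte SM4 S-box stays resident in v16-v31 (four 64-byte
 * groups).  A full 8-bit lookup is then one TBL plus three TBX per
 * input vector: TBL handles indices 0..63 from {v16-v19} and zeroes
 * out-of-range lanes, after which the index is reduced by 64 and each
 * TBX fills in only the lanes that fall into the next four table
 * registers, leaving the rest untouched.
 */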

#define transpose_4x4(s0, s1, s2, s3) \
        zip1 RTMP0.4s, s0.4s, s1.4s; \
        zip1 RTMP1.4s, s2.4s, s3.4s; \
        zip2 RTMP2.4s, s0.4s, s1.4s; \
        zip2 RTMP3.4s, s2.4s, s3.4s; \
        zip1 s0.2d, RTMP0.2d, RTMP1.2d; \
        zip2 s1.2d, RTMP0.2d, RTMP1.2d; \
        zip1 s2.2d, RTMP2.2d, RTMP3.2d; \
        zip2 s3.2d, RTMP2.2d, RTMP3.2d;

#define rotate_clockwise_90(s0, s1, s2, s3) \
        zip1 RTMP0.4s, s1.4s, s0.4s; \
        zip2 RTMP1.4s, s1.4s, s0.4s; \
        zip1 RTMP2.4s, s3.4s, s2.4s; \
        zip2 RTMP3.4s, s3.4s, s2.4s; \
        zip1 s0.2d, RTMP2.2d, RTMP0.2d; \
        zip2 s1.2d, RTMP2.2d, RTMP0.2d; \
        zip1 s2.2d, RTMP3.2d, RTMP1.2d; \
        zip2 s3.2d, RTMP3.2d, RTMP1.2d;
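
/* transpose_4x4 converts four loaded blocks into word-sliced form:
 * afterwards s0 holds word 0 of all four blocks, s1 word 1, and so
 * on, so one round instruction works on four blocks in parallel.
 * rotate_clockwise_90 is used after the last round; it effectively
 * combines the inverse transpose with the reversed word order that
 * the SM4 output requires.
 */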

.text

.align 4
ELF(.type sm4_aarch64_crypt_blk1_4,%function;)
sm4_aarch64_crypt_blk1_4:
    /* input:
     *   x0: round key array, CTX
     *   x1: dst
     *   x2: src
     *   x3: num blocks (1..4)
     */
    CFI_STARTPROC();
    VPUSH_ABI;

    preload_sbox(x5);

    ld1 {v0.16b}, [x2], #16;
    mov v1.16b, v0.16b;
    mov v2.16b, v0.16b;
    mov v3.16b, v0.16b;
    cmp x3, #2;
    blt .Lblk4_load_input_done;
    ld1 {v1.16b}, [x2], #16;
    beq .Lblk4_load_input_done;
    ld1 {v2.16b}, [x2], #16;
    cmp x3, #3;
    beq .Lblk4_load_input_done;
    ld1 {v3.16b}, [x2];
.Lblk4_load_input_done:

    rev32 v0.16b, v0.16b;
    rev32 v1.16b, v1.16b;
    rev32 v2.16b, v2.16b;
    rev32 v3.16b, v3.16b;

    transpose_4x4(v0, v1, v2, v3);

#define ROUND(round, s0, s1, s2, s3) \
        dup RX0.4s, RKEY.s[round]; \
        /* rk ^ s1 ^ s2 ^ s3 */ \
        eor RTMP1.16b, s2.16b, s3.16b; \
        eor RX0.16b, RX0.16b, s1.16b; \
        eor RX0.16b, RX0.16b, RTMP1.16b; \
        \
        /* sbox, non-linear part */ \
        movi RTMP3.16b, #64; /* sizeof(sbox) / 4 */ \
        tbl RTMP0.16b, {v16.16b-v19.16b}, RX0.16b; \
        sub RX0.16b, RX0.16b, RTMP3.16b; \
        tbx RTMP0.16b, {v20.16b-v23.16b}, RX0.16b; \
        sub RX0.16b, RX0.16b, RTMP3.16b; \
        tbx RTMP0.16b, {v24.16b-v27.16b}, RX0.16b; \
        sub RX0.16b, RX0.16b, RTMP3.16b; \
        tbx RTMP0.16b, {v28.16b-v31.16b}, RX0.16b; \
        \
        /* linear part */ \
        shl RTMP1.4s, RTMP0.4s, #8; \
        shl RTMP2.4s, RTMP0.4s, #16; \
        shl RTMP3.4s, RTMP0.4s, #24; \
        sri RTMP1.4s, RTMP0.4s, #(32-8); \
        sri RTMP2.4s, RTMP0.4s, #(32-16); \
        sri RTMP3.4s, RTMP0.4s, #(32-24); \
        /* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */ \
        eor RTMP1.16b, RTMP1.16b, RTMP0.16b; \
        eor RTMP1.16b, RTMP1.16b, RTMP2.16b; \
        /* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */ \
        eor RTMP3.16b, RTMP3.16b, RTMP0.16b; \
        shl RTMP2.4s, RTMP1.4s, #2; \
        sri RTMP2.4s, RTMP1.4s, #(32-2); \
        eor RTMP3.16b, RTMP3.16b, RTMP2.16b; \
        /* s0 ^= RTMP3 */ \
        eor s0.16b, s0.16b, RTMP3.16b;
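
    /* One ROUND() application is the SM4 round function on four blocks
     * at once.  As a rough C-style sketch (illustrative only; sbox()
     * is the byte-wise S-box lookup, rol32() a 32-bit left rotate):
     *
     *   t   = sbox(rk ^ s1 ^ s2 ^ s3);
     *   s0 ^= t ^ rol32(t, 2) ^ rol32(t, 10) ^ rol32(t, 18) ^ rol32(t, 24);
     *
     * The rotations by 2, 10 and 18 are produced together by rotating
     * (t ^ rol32(t, 8) ^ rol32(t, 16)) left by 2.  The loop below runs
     * the macro four times per iteration for eight iterations, i.e. the
     * full 32 SM4 rounds, loading one 16-byte round-key vector per
     * iteration.
     */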

    mov x6, #8;
.Lroundloop4:
    ld1 {RKEY.4s}, [x0], #16;
    subs x6, x6, #1;

    ROUND(0, v0, v1, v2, v3);
    ROUND(1, v1, v2, v3, v0);
    ROUND(2, v2, v3, v0, v1);
    ROUND(3, v3, v0, v1, v2);

    bne .Lroundloop4;

#undef ROUND

    rotate_clockwise_90(v0, v1, v2, v3);
    rev32 v0.16b, v0.16b;
    rev32 v1.16b, v1.16b;
    rev32 v2.16b, v2.16b;
    rev32 v3.16b, v3.16b;

    st1 {v0.16b}, [x1], #16;
    cmp x3, #2;
    blt .Lblk4_store_output_done;
    st1 {v1.16b}, [x1], #16;
    beq .Lblk4_store_output_done;
    st1 {v2.16b}, [x1], #16;
    cmp x3, #3;
    beq .Lblk4_store_output_done;
    st1 {v3.16b}, [x1];
.Lblk4_store_output_done:
    VPOP_ABI;
    ret_spec_stop;
    CFI_ENDPROC();
ELF(.size sm4_aarch64_crypt_blk1_4,.-sm4_aarch64_crypt_blk1_4;)

.align 4
ELF(.type __sm4_crypt_blk8,%function;)
__sm4_crypt_blk8:
    /* input:
     *   x0: round key array, CTX
     *   v16-v31: preloaded with the S-box (see preload_sbox)
     *   v0, v1, v2, v3, v4, v5, v6, v7: eight parallel plaintext blocks
     * output:
     *   v0, v1, v2, v3, v4, v5, v6, v7: eight parallel ciphertext blocks
     */
    CFI_STARTPROC();

    rev32 v0.16b, v0.16b;
    rev32 v1.16b, v1.16b;
    rev32 v2.16b, v2.16b;
    rev32 v3.16b, v3.16b;
    rev32 v4.16b, v4.16b;
    rev32 v5.16b, v5.16b;
    rev32 v6.16b, v6.16b;
    rev32 v7.16b, v7.16b;

    transpose_4x4(v0, v1, v2, v3);
    transpose_4x4(v4, v5, v6, v7);

#define ROUND(round, s0, s1, s2, s3, t0, t1, t2, t3) \
        /* rk ^ s1 ^ s2 ^ s3 */ \
        dup RX0.4s, RKEY.s[round]; \
        eor RTMP0.16b, s2.16b, s3.16b; \
        mov RX1.16b, RX0.16b; \
        eor RTMP1.16b, t2.16b, t3.16b; \
        eor RX0.16b, RX0.16b, s1.16b; \
        eor RX1.16b, RX1.16b, t1.16b; \
        eor RX0.16b, RX0.16b, RTMP0.16b; \
        eor RX1.16b, RX1.16b, RTMP1.16b; \
        \
        /* sbox, non-linear part */ \
        movi RTMP3.16b, #64; /* sizeof(sbox) / 4 */ \
        tbl RTMP0.16b, {v16.16b-v19.16b}, RX0.16b; \
        tbl RTMP1.16b, {v16.16b-v19.16b}, RX1.16b; \
        sub RX0.16b, RX0.16b, RTMP3.16b; \
        sub RX1.16b, RX1.16b, RTMP3.16b; \
        tbx RTMP0.16b, {v20.16b-v23.16b}, RX0.16b; \
        tbx RTMP1.16b, {v20.16b-v23.16b}, RX1.16b; \
        sub RX0.16b, RX0.16b, RTMP3.16b; \
        sub RX1.16b, RX1.16b, RTMP3.16b; \
        tbx RTMP0.16b, {v24.16b-v27.16b}, RX0.16b; \
        tbx RTMP1.16b, {v24.16b-v27.16b}, RX1.16b; \
        sub RX0.16b, RX0.16b, RTMP3.16b; \
        sub RX1.16b, RX1.16b, RTMP3.16b; \
        tbx RTMP0.16b, {v28.16b-v31.16b}, RX0.16b; \
        tbx RTMP1.16b, {v28.16b-v31.16b}, RX1.16b; \
        \
        /* linear part */ \
        shl RX0.4s, RTMP0.4s, #8; \
        shl RX1.4s, RTMP1.4s, #8; \
        shl RTMP2.4s, RTMP0.4s, #16; \
        shl RTMP3.4s, RTMP1.4s, #16; \
        sri RX0.4s, RTMP0.4s, #(32 - 8); \
        sri RX1.4s, RTMP1.4s, #(32 - 8); \
        sri RTMP2.4s, RTMP0.4s, #(32 - 16); \
        sri RTMP3.4s, RTMP1.4s, #(32 - 16); \
        /* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */ \
        eor RX0.16b, RX0.16b, RTMP0.16b; \
        eor RX1.16b, RX1.16b, RTMP1.16b; \
        eor RX0.16b, RX0.16b, RTMP2.16b; \
        eor RX1.16b, RX1.16b, RTMP3.16b; \
        /* RTMP0/1 ^= x ^ rol32(x, 24) ^ rol32(RX, 2) */ \
        shl RTMP2.4s, RTMP0.4s, #24; \
        shl RTMP3.4s, RTMP1.4s, #24; \
        sri RTMP2.4s, RTMP0.4s, #(32 - 24); \
        sri RTMP3.4s, RTMP1.4s, #(32 - 24); \
        eor RTMP0.16b, RTMP0.16b, RTMP2.16b; \
        eor RTMP1.16b, RTMP1.16b, RTMP3.16b; \
        shl RTMP2.4s, RX0.4s, #2; \
        shl RTMP3.4s, RX1.4s, #2; \
        sri RTMP2.4s, RX0.4s, #(32 - 2); \
        sri RTMP3.4s, RX1.4s, #(32 - 2); \
        eor RTMP0.16b, RTMP0.16b, RTMP2.16b; \
        eor RTMP1.16b, RTMP1.16b, RTMP3.16b; \
        /* s0/t0 ^= RTMP0/1 */ \
        eor s0.16b, s0.16b, RTMP0.16b; \
        eor t0.16b, t0.16b, RTMP1.16b;
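
    /* Two-lane variant of the round used in sm4_aarch64_crypt_blk1_4:
     * RX0/RTMP0 carry blocks 0-3 and RX1/RTMP1 carry blocks 4-7, with
     * the S-box lookups and linear layer interleaved, which should help
     * hide tbl/tbx latency on typical cores.
     */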

    mov x6, #8;
.Lroundloop8:
    ld1 {RKEY.4s}, [x0], #16;
    subs x6, x6, #1;

    ROUND(0, v0, v1, v2, v3, v4, v5, v6, v7);
    ROUND(1, v1, v2, v3, v0, v5, v6, v7, v4);
    ROUND(2, v2, v3, v0, v1, v6, v7, v4, v5);
    ROUND(3, v3, v0, v1, v2, v7, v4, v5, v6);

    bne .Lroundloop8;

#undef ROUND

    rotate_clockwise_90(v0, v1, v2, v3);
    rotate_clockwise_90(v4, v5, v6, v7);
    rev32 v0.16b, v0.16b;
    rev32 v1.16b, v1.16b;
    rev32 v2.16b, v2.16b;
    rev32 v3.16b, v3.16b;
    rev32 v4.16b, v4.16b;
    rev32 v5.16b, v5.16b;
    rev32 v6.16b, v6.16b;
    rev32 v7.16b, v7.16b;

    sub x0, x0, #128; /* rewind x0 to the start of the round key array (32 * 4 bytes) */

    ret;
    CFI_ENDPROC();
ELF(.size __sm4_crypt_blk8,.-__sm4_crypt_blk8;)

.align 4
.global _gcry_sm4_aarch64_crypt_blk1_8
ELF(.type _gcry_sm4_aarch64_crypt_blk1_8,%function;)
_gcry_sm4_aarch64_crypt_blk1_8:
    /* input:
     *   x0: round key array, CTX
     *   x1: dst
     *   x2: src
     *   x3: num blocks (1..8)
     */
    CFI_STARTPROC();

    cmp x3, #5;
    blt sm4_aarch64_crypt_blk1_4;
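
    /* Five or more blocks from here on; the 1..4-block case above
     * branches (rather than calls) into sm4_aarch64_crypt_blk1_4 before
     * x30 is saved, so that routine returns directly to our caller.  */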

    stp x29, x30, [sp, #-16]!;
    CFI_ADJUST_CFA_OFFSET(16);
    CFI_REG_ON_STACK(29, 0);
    CFI_REG_ON_STACK(30, 8);
    VPUSH_ABI;

    preload_sbox(x5);

    ld1 {v0.16b-v3.16b}, [x2], #64;
    ld1 {v4.16b}, [x2], #16;
    mov v5.16b, v4.16b;
    mov v6.16b, v4.16b;
    mov v7.16b, v4.16b;
    beq .Lblk8_load_input_done;
    ld1 {v5.16b}, [x2], #16;
    cmp x3, #7;
    blt .Lblk8_load_input_done;
    ld1 {v6.16b}, [x2], #16;
    beq .Lblk8_load_input_done;
    ld1 {v7.16b}, [x2];
.Lblk8_load_input_done:

    bl __sm4_crypt_blk8;

    cmp x3, #6;
    st1 {v0.16b-v3.16b}, [x1], #64;
    st1 {v4.16b}, [x1], #16;
    blt .Lblk8_store_output_done;
    st1 {v5.16b}, [x1], #16;
    beq .Lblk8_store_output_done;
    st1 {v6.16b}, [x1], #16;
    cmp x3, #7;
    beq .Lblk8_store_output_done;
    st1 {v7.16b}, [x1];
.Lblk8_store_output_done:
    VPOP_ABI;
    ldp x29, x30, [sp], #16;
    CFI_ADJUST_CFA_OFFSET(-16);
    CFI_RESTORE(x29);
    CFI_RESTORE(x30);
    ret_spec_stop;
    CFI_ENDPROC();
ELF(.size _gcry_sm4_aarch64_crypt_blk1_8,.-_gcry_sm4_aarch64_crypt_blk1_8;)

.align 4
.global _gcry_sm4_aarch64_crypt
ELF(.type _gcry_sm4_aarch64_crypt,%function;)
_gcry_sm4_aarch64_crypt:
    /* input:
     *   x0: round key array, CTX
     *   x1: dst
     *   x2: src
     *   x3: nblocks (multiples of 8)
     */
    CFI_STARTPROC();

    stp x29, x30, [sp, #-16]!;
    CFI_ADJUST_CFA_OFFSET(16);
    CFI_REG_ON_STACK(29, 0);
    CFI_REG_ON_STACK(30, 8);
    VPUSH_ABI;

    preload_sbox(x5);

.Lcrypt_loop_blk:
    subs x3, x3, #8;
    bmi .Lcrypt_end;

    ld1 {v0.16b-v3.16b}, [x2], #64;
    ld1 {v4.16b-v7.16b}, [x2], #64;
    bl __sm4_crypt_blk8;
    st1 {v0.16b-v3.16b}, [x1], #64;
    st1 {v4.16b-v7.16b}, [x1], #64;
    b .Lcrypt_loop_blk;

.Lcrypt_end:
    VPOP_ABI;
    ldp x29, x30, [sp], #16;
    CFI_ADJUST_CFA_OFFSET(-16);
    CFI_RESTORE(x29);
    CFI_RESTORE(x30);
    ret_spec_stop;
    CFI_ENDPROC();
ELF(.size _gcry_sm4_aarch64_crypt,.-_gcry_sm4_aarch64_crypt;)

.align 4
.global _gcry_sm4_aarch64_cbc_dec
ELF(.type _gcry_sm4_aarch64_cbc_dec,%function;)
_gcry_sm4_aarch64_cbc_dec:
    /* input:
     *   x0: round key array, CTX
     *   x1: dst
     *   x2: src
     *   x3: iv (big endian, 128 bit)
     *   x4: nblocks (multiples of 8)
     */
    CFI_STARTPROC();

    stp x29, x30, [sp, #-16]!;
    CFI_ADJUST_CFA_OFFSET(16);
    CFI_REG_ON_STACK(29, 0);
    CFI_REG_ON_STACK(30, 8);
    VPUSH_ABI;

    preload_sbox(x5);
    ld1 {RIV.16b}, [x3];
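
    /* Each iteration below decrypts eight ciphertext blocks, then XORs
     * block 0 with the IV and blocks 1..7 with ciphertext blocks 0..6,
     * which are re-read from x2 (the second input load does not advance
     * x2, and x2 is rewound by 64 after the call for this purpose).
     * Ciphertext block 7 becomes the IV for the next iteration.
     */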

.Lcbc_loop_blk:
    subs x4, x4, #8;
    bmi .Lcbc_end;

    ld1 {v0.16b-v3.16b}, [x2], #64;
    ld1 {v4.16b-v7.16b}, [x2];

    bl __sm4_crypt_blk8;

    sub x2, x2, #64;
    eor v0.16b, v0.16b, RIV.16b;
    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
    eor v1.16b, v1.16b, RTMP0.16b;
    eor v2.16b, v2.16b, RTMP1.16b;
    eor v3.16b, v3.16b, RTMP2.16b;
    st1 {v0.16b-v3.16b}, [x1], #64;

    eor v4.16b, v4.16b, RTMP3.16b;
    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
    eor v5.16b, v5.16b, RTMP0.16b;
    eor v6.16b, v6.16b, RTMP1.16b;
    eor v7.16b, v7.16b, RTMP2.16b;
    mov RIV.16b, RTMP3.16b;
    st1 {v4.16b-v7.16b}, [x1], #64;

    b .Lcbc_loop_blk;

.Lcbc_end:
    /* store new IV */
    st1 {RIV.16b}, [x3];

    VPOP_ABI;
    ldp x29, x30, [sp], #16;
    CFI_ADJUST_CFA_OFFSET(-16);
    CFI_RESTORE(x29);
    CFI_RESTORE(x30);
    ret_spec_stop;
    CFI_ENDPROC();
ELF(.size _gcry_sm4_aarch64_cbc_dec,.-_gcry_sm4_aarch64_cbc_dec;)

.align 4
.global _gcry_sm4_aarch64_cfb_dec
ELF(.type _gcry_sm4_aarch64_cfb_dec,%function;)
_gcry_sm4_aarch64_cfb_dec:
    /* input:
     *   x0: round key array, CTX
     *   x1: dst
     *   x2: src
     *   x3: iv (big endian, 128 bit)
     *   x4: nblocks (multiples of 8)
     */
    CFI_STARTPROC();

    stp x29, x30, [sp, #-16]!;
    CFI_ADJUST_CFA_OFFSET(16);
    CFI_REG_ON_STACK(29, 0);
    CFI_REG_ON_STACK(30, 8);
    VPUSH_ABI;

    preload_sbox(x5);
    ld1 {v0.16b}, [x3];
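
    /* CFB decryption: v0 carries the IV.  Each iteration encrypts
     * { IV, c0, ..., c6 }, XORs that keystream with { c0, ..., c7 }
     * re-read from x2, and keeps c7 in v0 as the IV for the next
     * iteration.
     */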

.Lcfb_loop_blk:
    subs x4, x4, #8;
    bmi .Lcfb_end;

    ld1 {v1.16b, v2.16b, v3.16b}, [x2], #48;
    ld1 {v4.16b-v7.16b}, [x2];

    bl __sm4_crypt_blk8;

    sub x2, x2, #48;
    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
    eor v0.16b, v0.16b, RTMP0.16b;
    eor v1.16b, v1.16b, RTMP1.16b;
    eor v2.16b, v2.16b, RTMP2.16b;
    eor v3.16b, v3.16b, RTMP3.16b;
    st1 {v0.16b-v3.16b}, [x1], #64;

    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
    eor v4.16b, v4.16b, RTMP0.16b;
    eor v5.16b, v5.16b, RTMP1.16b;
    eor v6.16b, v6.16b, RTMP2.16b;
    eor v7.16b, v7.16b, RTMP3.16b;
    st1 {v4.16b-v7.16b}, [x1], #64;

    mov v0.16b, RTMP3.16b;

    b .Lcfb_loop_blk;

.Lcfb_end:
    /* store new IV */
    st1 {v0.16b}, [x3];

    VPOP_ABI;
    ldp x29, x30, [sp], #16;
    CFI_ADJUST_CFA_OFFSET(-16);
    CFI_RESTORE(x29);
    CFI_RESTORE(x30);
    ret_spec_stop;
    CFI_ENDPROC();
ELF(.size _gcry_sm4_aarch64_cfb_dec,.-_gcry_sm4_aarch64_cfb_dec;)

.align 4
.global _gcry_sm4_aarch64_ctr_enc
ELF(.type _gcry_sm4_aarch64_ctr_enc,%function;)
_gcry_sm4_aarch64_ctr_enc:
    /* input:
     *   x0: round key array, CTX
     *   x1: dst
     *   x2: src
     *   x3: ctr (big endian, 128 bit)
     *   x4: nblocks (multiples of 8)
     */
    CFI_STARTPROC();

    stp x29, x30, [sp, #-16]!;
    CFI_ADJUST_CFA_OFFSET(16);
    CFI_REG_ON_STACK(29, 0);
    CFI_REG_ON_STACK(30, 8);
    VPUSH_ABI;

    preload_sbox(x5);

    ldp x7, x8, [x3];
    rev x7, x7;
    rev x8, x8;

.Lctr_loop_blk:
    subs x4, x4, #8;
    bmi .Lctr_end;

#define inc_le128(vctr) \
        mov vctr.d[1], x8; \
        mov vctr.d[0], x7; \
        adds x8, x8, #1; \
        adc x7, x7, xzr; \
        rev64 vctr.16b, vctr.16b;
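
    /* The 128-bit big-endian counter lives in x7 (high half) and x8
     * (low half) as native integers: ldp/rev converted it on entry,
     * adds/adc performs the 128-bit increment, and rev64 switches each
     * 64-bit lane back to big-endian byte order once the value has
     * been placed into the block register.
     */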

    /* construct CTRs */
    inc_le128(v0); /* +0 */
    inc_le128(v1); /* +1 */
    inc_le128(v2); /* +2 */
    inc_le128(v3); /* +3 */
    inc_le128(v4); /* +4 */
    inc_le128(v5); /* +5 */
    inc_le128(v6); /* +6 */
    inc_le128(v7); /* +7 */

    bl __sm4_crypt_blk8;

    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
    eor v0.16b, v0.16b, RTMP0.16b;
    eor v1.16b, v1.16b, RTMP1.16b;
    eor v2.16b, v2.16b, RTMP2.16b;
    eor v3.16b, v3.16b, RTMP3.16b;
    st1 {v0.16b-v3.16b}, [x1], #64;

    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
    eor v4.16b, v4.16b, RTMP0.16b;
    eor v5.16b, v5.16b, RTMP1.16b;
    eor v6.16b, v6.16b, RTMP2.16b;
    eor v7.16b, v7.16b, RTMP3.16b;
    st1 {v4.16b-v7.16b}, [x1], #64;

    b .Lctr_loop_blk;

.Lctr_end:
    /* store new CTR */
    rev x7, x7;
    rev x8, x8;
    stp x7, x8, [x3];

    VPOP_ABI;
    ldp x29, x30, [sp], #16;
    CFI_ADJUST_CFA_OFFSET(-16);
    CFI_RESTORE(x29);
    CFI_RESTORE(x30);
    ret_spec_stop;
    CFI_ENDPROC();
ELF(.size _gcry_sm4_aarch64_ctr_enc,.-_gcry_sm4_aarch64_ctr_enc;)

#endif