- /* sm4-armv8-aarch64-ce.S - ARMv8/AArch64/CE accelerated SM4 cipher
- *
- * Copyright (C) 2022 Alibaba Group.
- * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
- *
- * This file is part of Libgcrypt.
- *
- * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as
- * published by the Free Software Foundation; either version 2.1 of
- * the License, or (at your option) any later version.
- *
- * Libgcrypt is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
- #include "asm-common-aarch64.h"
- #if defined(__AARCH64EL__) && \
- defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
- defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \
- defined(USE_SM4)
- .cpu generic+simd+crypto
- #define vecnum_v0 0
- #define vecnum_v1 1
- #define vecnum_v2 2
- #define vecnum_v3 3
- #define vecnum_v4 4
- #define vecnum_v5 5
- #define vecnum_v6 6
- #define vecnum_v7 7
- #define vecnum_v16 16
- #define vecnum_v24 24
- #define vecnum_v25 25
- #define vecnum_v26 26
- #define vecnum_v27 27
- #define vecnum_v28 28
- #define vecnum_v29 29
- #define vecnum_v30 30
- #define vecnum_v31 31
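- /* Helper macros that emit the SM4E and SM4EKEY Crypto Extension instructions
-  * as raw '.inst' encodings, so the file also builds with assemblers that do
-  * not know the SM4 mnemonics. */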
- #define sm4e(vd, vn) \
- .inst (0xcec08400 | (vecnum_##vn << 5) | vecnum_##vd)
- #define sm4ekey(vd, vn, vm) \
- .inst (0xce60c800 | (vecnum_##vm << 16) | (vecnum_##vn << 5) | vecnum_##vd)
- .text
- /* Register macros */
- #define RTMP0 v16
- #define RTMP1 v17
- #define RTMP2 v18
- #define RTMP3 v19
- #define RIV v20
- #define RMASK v21
- /* Helper macros. */
- #define load_rkey(ptr) \
- ld1 {v24.16b-v27.16b}, [ptr], #64; \
- ld1 {v28.16b-v31.16b}, [ptr];
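- /* The crypt macros below first byte-swap the little-endian loaded input into
-  * the big-endian 32-bit words SM4 operates on (rev32), run the 32 rounds as
-  * eight SM4E steps (four rounds each), then reverse the word order of the
-  * state (rev64 + ext, SM4's output swap) and convert back to big-endian
-  * bytes (rev32). */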
- #define SM4_CRYPT_BLK(b0) \
- rev32 b0.16b, b0.16b; \
- sm4e(b0, v24); \
- sm4e(b0, v25); \
- sm4e(b0, v26); \
- sm4e(b0, v27); \
- sm4e(b0, v28); \
- sm4e(b0, v29); \
- sm4e(b0, v30); \
- sm4e(b0, v31); \
- rev64 b0.4s, b0.4s; \
- ext b0.16b, b0.16b, b0.16b, #8; \
- rev32 b0.16b, b0.16b;
- #define crypt_blk4(b0, b1, b2, b3) \
- rev32 b0.16b, b0.16b; \
- rev32 b1.16b, b1.16b; \
- rev32 b2.16b, b2.16b; \
- rev32 b3.16b, b3.16b; \
- sm4e(b0, v24); \
- sm4e(b1, v24); \
- sm4e(b2, v24); \
- sm4e(b3, v24); \
- sm4e(b0, v25); \
- sm4e(b1, v25); \
- sm4e(b2, v25); \
- sm4e(b3, v25); \
- sm4e(b0, v26); \
- sm4e(b1, v26); \
- sm4e(b2, v26); \
- sm4e(b3, v26); \
- sm4e(b0, v27); \
- sm4e(b1, v27); \
- sm4e(b2, v27); \
- sm4e(b3, v27); \
- sm4e(b0, v28); \
- sm4e(b1, v28); \
- sm4e(b2, v28); \
- sm4e(b3, v28); \
- sm4e(b0, v29); \
- sm4e(b1, v29); \
- sm4e(b2, v29); \
- sm4e(b3, v29); \
- sm4e(b0, v30); \
- sm4e(b1, v30); \
- sm4e(b2, v30); \
- sm4e(b3, v30); \
- sm4e(b0, v31); \
- sm4e(b1, v31); \
- sm4e(b2, v31); \
- sm4e(b3, v31); \
- rev64 b0.4s, b0.4s; \
- rev64 b1.4s, b1.4s; \
- rev64 b2.4s, b2.4s; \
- rev64 b3.4s, b3.4s; \
- ext b0.16b, b0.16b, b0.16b, #8; \
- ext b1.16b, b1.16b, b1.16b, #8; \
- ext b2.16b, b2.16b, b2.16b, #8; \
- ext b3.16b, b3.16b, b3.16b, #8; \
- rev32 b0.16b, b0.16b; \
- rev32 b1.16b, b1.16b; \
- rev32 b2.16b, b2.16b; \
- rev32 b3.16b, b3.16b;
- #define crypt_blk8(b0, b1, b2, b3, b4, b5, b6, b7) \
- rev32 b0.16b, b0.16b; \
- rev32 b1.16b, b1.16b; \
- rev32 b2.16b, b2.16b; \
- rev32 b3.16b, b3.16b; \
- rev32 b4.16b, b4.16b; \
- rev32 b5.16b, b5.16b; \
- rev32 b6.16b, b6.16b; \
- rev32 b7.16b, b7.16b; \
- sm4e(b0, v24); \
- sm4e(b1, v24); \
- sm4e(b2, v24); \
- sm4e(b3, v24); \
- sm4e(b4, v24); \
- sm4e(b5, v24); \
- sm4e(b6, v24); \
- sm4e(b7, v24); \
- sm4e(b0, v25); \
- sm4e(b1, v25); \
- sm4e(b2, v25); \
- sm4e(b3, v25); \
- sm4e(b4, v25); \
- sm4e(b5, v25); \
- sm4e(b6, v25); \
- sm4e(b7, v25); \
- sm4e(b0, v26); \
- sm4e(b1, v26); \
- sm4e(b2, v26); \
- sm4e(b3, v26); \
- sm4e(b4, v26); \
- sm4e(b5, v26); \
- sm4e(b6, v26); \
- sm4e(b7, v26); \
- sm4e(b0, v27); \
- sm4e(b1, v27); \
- sm4e(b2, v27); \
- sm4e(b3, v27); \
- sm4e(b4, v27); \
- sm4e(b5, v27); \
- sm4e(b6, v27); \
- sm4e(b7, v27); \
- sm4e(b0, v28); \
- sm4e(b1, v28); \
- sm4e(b2, v28); \
- sm4e(b3, v28); \
- sm4e(b4, v28); \
- sm4e(b5, v28); \
- sm4e(b6, v28); \
- sm4e(b7, v28); \
- sm4e(b0, v29); \
- sm4e(b1, v29); \
- sm4e(b2, v29); \
- sm4e(b3, v29); \
- sm4e(b4, v29); \
- sm4e(b5, v29); \
- sm4e(b6, v29); \
- sm4e(b7, v29); \
- sm4e(b0, v30); \
- sm4e(b1, v30); \
- sm4e(b2, v30); \
- sm4e(b3, v30); \
- sm4e(b4, v30); \
- sm4e(b5, v30); \
- sm4e(b6, v30); \
- sm4e(b7, v30); \
- sm4e(b0, v31); \
- sm4e(b1, v31); \
- sm4e(b2, v31); \
- sm4e(b3, v31); \
- sm4e(b4, v31); \
- sm4e(b5, v31); \
- sm4e(b6, v31); \
- sm4e(b7, v31); \
- rev64 b0.4s, b0.4s; \
- rev64 b1.4s, b1.4s; \
- rev64 b2.4s, b2.4s; \
- rev64 b3.4s, b3.4s; \
- rev64 b4.4s, b4.4s; \
- rev64 b5.4s, b5.4s; \
- rev64 b6.4s, b6.4s; \
- rev64 b7.4s, b7.4s; \
- ext b0.16b, b0.16b, b0.16b, #8; \
- ext b1.16b, b1.16b, b1.16b, #8; \
- ext b2.16b, b2.16b, b2.16b, #8; \
- ext b3.16b, b3.16b, b3.16b, #8; \
- ext b4.16b, b4.16b, b4.16b, #8; \
- ext b5.16b, b5.16b, b5.16b, #8; \
- ext b6.16b, b6.16b, b6.16b, #8; \
- ext b7.16b, b7.16b, b7.16b, #8; \
- rev32 b0.16b, b0.16b; \
- rev32 b1.16b, b1.16b; \
- rev32 b2.16b, b2.16b; \
- rev32 b3.16b, b3.16b; \
- rev32 b4.16b, b4.16b; \
- rev32 b5.16b, b5.16b; \
- rev32 b6.16b, b6.16b; \
- rev32 b7.16b, b7.16b;
- .align 4
- .global _gcry_sm4_armv8_ce_expand_key
- ELF(.type _gcry_sm4_armv8_ce_expand_key,%function;)
- _gcry_sm4_armv8_ce_expand_key:
- /* input:
- * x0: 128-bit key
- * x1: rkey_enc
- * x2: rkey_dec
- * x3: fk array
- * x4: ck array
- */
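- /* For reference, the matching C-side declaration (assumed, as used from
-  * cipher/sm4.c) would look like:
-  *   void _gcry_sm4_armv8_ce_expand_key(const byte *key, u32 *rkey_enc,
-  *                                      u32 *rkey_dec, const u32 *fk,
-  *                                      const u32 *ck); */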
- CFI_STARTPROC();
- ld1 {v0.16b}, [x0];
- rev32 v0.16b, v0.16b;
- ld1 {v1.16b}, [x3];
- load_rkey(x4);
- /* input ^ fk */
- eor v0.16b, v0.16b, v1.16b;
- sm4ekey(v0, v0, v24);
- sm4ekey(v1, v0, v25);
- sm4ekey(v2, v1, v26);
- sm4ekey(v3, v2, v27);
- sm4ekey(v4, v3, v28);
- sm4ekey(v5, v4, v29);
- sm4ekey(v6, v5, v30);
- sm4ekey(v7, v6, v31);
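- /* Each SM4EKEY step derives the next four round keys from the previous four
-  * and one vector of CK constants, so v0..v7 now hold rk0..rk31. */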
- st1 {v0.16b-v3.16b}, [x1], #64;
- st1 {v4.16b-v7.16b}, [x1];
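- /* The decryption key schedule is the encryption schedule in reverse order:
-  * reverse the word order within each vector (rev64 + ext) and store the
-  * vectors last to first. */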
- rev64 v7.4s, v7.4s;
- rev64 v6.4s, v6.4s;
- rev64 v5.4s, v5.4s;
- rev64 v4.4s, v4.4s;
- rev64 v3.4s, v3.4s;
- rev64 v2.4s, v2.4s;
- rev64 v1.4s, v1.4s;
- rev64 v0.4s, v0.4s;
- ext v7.16b, v7.16b, v7.16b, #8;
- ext v6.16b, v6.16b, v6.16b, #8;
- ext v5.16b, v5.16b, v5.16b, #8;
- ext v4.16b, v4.16b, v4.16b, #8;
- ext v3.16b, v3.16b, v3.16b, #8;
- ext v2.16b, v2.16b, v2.16b, #8;
- ext v1.16b, v1.16b, v1.16b, #8;
- ext v0.16b, v0.16b, v0.16b, #8;
- st1 {v7.16b}, [x2], #16;
- st1 {v6.16b}, [x2], #16;
- st1 {v5.16b}, [x2], #16;
- st1 {v4.16b}, [x2], #16;
- st1 {v3.16b}, [x2], #16;
- st1 {v2.16b}, [x2], #16;
- st1 {v1.16b}, [x2], #16;
- st1 {v0.16b}, [x2];
- ret_spec_stop;
- CFI_ENDPROC();
- ELF(.size _gcry_sm4_armv8_ce_expand_key,.-_gcry_sm4_armv8_ce_expand_key;)
- .align 4
- ELF(.type sm4_armv8_ce_crypt_blk1_4,%function;)
- sm4_armv8_ce_crypt_blk1_4:
- /* input:
- * x0: round key array, CTX
- * x1: dst
- * x2: src
- * x3: num blocks (1..4)
- */
- CFI_STARTPROC();
- load_rkey(x0);
- ld1 {v0.16b}, [x2], #16;
- mov v1.16b, v0.16b;
- mov v2.16b, v0.16b;
- mov v3.16b, v0.16b;
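- /* Unused slots keep copies of block 0 so crypt_blk4 always operates on
-  * initialized data; the loads below overwrite them as needed. */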
- cmp x3, #2;
- blt .Lblk4_load_input_done;
- ld1 {v1.16b}, [x2], #16;
- beq .Lblk4_load_input_done;
- ld1 {v2.16b}, [x2], #16;
- cmp x3, #3;
- beq .Lblk4_load_input_done;
- ld1 {v3.16b}, [x2];
- .Lblk4_load_input_done:
- crypt_blk4(v0, v1, v2, v3);
- st1 {v0.16b}, [x1], #16;
- cmp x3, #2;
- blt .Lblk4_store_output_done;
- st1 {v1.16b}, [x1], #16;
- beq .Lblk4_store_output_done;
- st1 {v2.16b}, [x1], #16;
- cmp x3, #3;
- beq .Lblk4_store_output_done;
- st1 {v3.16b}, [x1];
- .Lblk4_store_output_done:
- ret_spec_stop;
- CFI_ENDPROC();
- ELF(.size sm4_armv8_ce_crypt_blk1_4,.-sm4_armv8_ce_crypt_blk1_4;)
- .align 4
- .global _gcry_sm4_armv8_ce_crypt_blk1_8
- ELF(.type _gcry_sm4_armv8_ce_crypt_blk1_8,%function;)
- _gcry_sm4_armv8_ce_crypt_blk1_8:
- /* input:
- * x0: round key array, CTX
- * x1: dst
- * x2: src
- * x3: num blocks (1..8)
- */
- CFI_STARTPROC();
- cmp x3, #5;
- blt sm4_armv8_ce_crypt_blk1_4;
- load_rkey(x0);
- ld1 {v0.16b-v3.16b}, [x2], #64;
- ld1 {v4.16b}, [x2], #16;
- mov v5.16b, v4.16b;
- mov v6.16b, v4.16b;
- mov v7.16b, v4.16b;
- beq .Lblk8_load_input_done;
- ld1 {v5.16b}, [x2], #16;
- cmp x3, #7;
- blt .Lblk8_load_input_done;
- ld1 {v6.16b}, [x2], #16;
- beq .Lblk8_load_input_done;
- ld1 {v7.16b}, [x2];
- .Lblk8_load_input_done:
- crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);
- cmp x3, #6;
- st1 {v0.16b-v3.16b}, [x1], #64;
- st1 {v4.16b}, [x1], #16;
- blt .Lblk8_store_output_done;
- st1 {v5.16b}, [x1], #16;
- beq .Lblk8_store_output_done;
- st1 {v6.16b}, [x1], #16;
- cmp x3, #7;
- beq .Lblk8_store_output_done;
- st1 {v7.16b}, [x1];
- .Lblk8_store_output_done:
- ret_spec_stop;
- CFI_ENDPROC();
- ELF(.size _gcry_sm4_armv8_ce_crypt_blk1_8,.-_gcry_sm4_armv8_ce_crypt_blk1_8;)
- .align 4
- .global _gcry_sm4_armv8_ce_crypt
- ELF(.type _gcry_sm4_armv8_ce_crypt,%function;)
- _gcry_sm4_armv8_ce_crypt:
- /* input:
- * x0: round key array, CTX
- * x1: dst
- * x2: src
- * x3: nblocks (multiples of 8)
- */
- CFI_STARTPROC();
- load_rkey(x0);
- .Lcrypt_loop_blk:
- subs x3, x3, #8;
- bmi .Lcrypt_end;
- ld1 {v0.16b-v3.16b}, [x2], #64;
- ld1 {v4.16b-v7.16b}, [x2], #64;
- crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);
- st1 {v0.16b-v3.16b}, [x1], #64;
- st1 {v4.16b-v7.16b}, [x1], #64;
- b .Lcrypt_loop_blk;
- .Lcrypt_end:
- ret_spec_stop;
- CFI_ENDPROC();
- ELF(.size _gcry_sm4_armv8_ce_crypt,.-_gcry_sm4_armv8_ce_crypt;)
- .align 4
- .global _gcry_sm4_armv8_ce_cbc_dec
- ELF(.type _gcry_sm4_armv8_ce_cbc_dec,%function;)
- _gcry_sm4_armv8_ce_cbc_dec:
- /* input:
- * x0: round key array, CTX
- * x1: dst
- * x2: src
- * x3: iv (big endian, 128 bit)
- * x4: nblocks (multiples of 8)
- */
- CFI_STARTPROC();
- load_rkey(x0);
- ld1 {RIV.16b}, [x3];
- .Lcbc_loop_blk:
- subs x4, x4, #8;
- bmi .Lcbc_end;
- ld1 {v0.16b-v3.16b}, [x2], #64;
- ld1 {v4.16b-v7.16b}, [x2];
- crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);
- sub x2, x2, #64;
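- /* Rewind x2 and re-read the eight ciphertext blocks for the CBC XOR; RIV
-  * holds the previous chunk's last ciphertext block (or the initial IV). */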
- eor v0.16b, v0.16b, RIV.16b;
- ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
- eor v1.16b, v1.16b, RTMP0.16b;
- eor v2.16b, v2.16b, RTMP1.16b;
- eor v3.16b, v3.16b, RTMP2.16b;
- st1 {v0.16b-v3.16b}, [x1], #64;
- eor v4.16b, v4.16b, RTMP3.16b;
- ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
- eor v5.16b, v5.16b, RTMP0.16b;
- eor v6.16b, v6.16b, RTMP1.16b;
- eor v7.16b, v7.16b, RTMP2.16b;
- mov RIV.16b, RTMP3.16b;
- st1 {v4.16b-v7.16b}, [x1], #64;
- b .Lcbc_loop_blk;
- .Lcbc_end:
- /* store new IV */
- st1 {RIV.16b}, [x3];
- ret_spec_stop;
- CFI_ENDPROC();
- ELF(.size _gcry_sm4_armv8_ce_cbc_dec,.-_gcry_sm4_armv8_ce_cbc_dec;)
- .align 4
- .global _gcry_sm4_armv8_ce_cfb_dec
- ELF(.type _gcry_sm4_armv8_ce_cfb_dec,%function;)
- _gcry_sm4_armv8_ce_cfb_dec:
- /* input:
- * x0: round key array, CTX
- * x1: dst
- * x2: src
- * x3: iv (big endian, 128 bit)
- * x4: nblocks (multiples of 8)
- */
- CFI_STARTPROC();
- load_rkey(x0);
- ld1 {v0.16b}, [x3];
- .Lcfb_loop_blk:
- subs x4, x4, #8;
- bmi .Lcfb_end;
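- /* CFB decryption: encrypt the IV and the first seven ciphertext blocks, XOR
-  * the results with the eight ciphertext blocks, and keep the last ciphertext
-  * block as the next IV. */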
- ld1 {v1.16b, v2.16b, v3.16b}, [x2], #48;
- ld1 {v4.16b-v7.16b}, [x2];
- crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);
- sub x2, x2, #48;
- ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
- eor v0.16b, v0.16b, RTMP0.16b;
- eor v1.16b, v1.16b, RTMP1.16b;
- eor v2.16b, v2.16b, RTMP2.16b;
- eor v3.16b, v3.16b, RTMP3.16b;
- st1 {v0.16b-v3.16b}, [x1], #64;
- ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
- eor v4.16b, v4.16b, RTMP0.16b;
- eor v5.16b, v5.16b, RTMP1.16b;
- eor v6.16b, v6.16b, RTMP2.16b;
- eor v7.16b, v7.16b, RTMP3.16b;
- st1 {v4.16b-v7.16b}, [x1], #64;
- mov v0.16b, RTMP3.16b;
- b .Lcfb_loop_blk;
- .Lcfb_end:
- /* store new IV */
- st1 {v0.16b}, [x3];
- ret_spec_stop;
- CFI_ENDPROC();
- ELF(.size _gcry_sm4_armv8_ce_cfb_dec,.-_gcry_sm4_armv8_ce_cfb_dec;)
- .align 4
- .global _gcry_sm4_armv8_ce_ctr_enc
- ELF(.type _gcry_sm4_armv8_ce_ctr_enc,%function;)
- _gcry_sm4_armv8_ce_ctr_enc:
- /* input:
- * x0: round key array, CTX
- * x1: dst
- * x2: src
- * x3: ctr (big endian, 128 bit)
- * x4: nblocks (multiples of 8)
- */
- CFI_STARTPROC();
- load_rkey(x0);
- ldp x7, x8, [x3];
- rev x7, x7;
- rev x8, x8;
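- /* x7:x8 now hold the counter as native 64-bit words (x7 = high half). */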
- .Lctr_loop_blk:
- subs x4, x4, #8;
- bmi .Lctr_end;
- #define inc_le128(vctr) \
- mov vctr.d[1], x8; \
- mov vctr.d[0], x7; \
- adds x8, x8, #1; \
- adc x7, x7, xzr; \
- rev64 vctr.16b, vctr.16b;
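- /* inc_le128 stores the current counter into vctr as a 128-bit big-endian
-  * value and increments the x7:x8 pair for the next block. */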
- /* construct CTRs */
- inc_le128(v0); /* +0 */
- inc_le128(v1); /* +1 */
- inc_le128(v2); /* +2 */
- inc_le128(v3); /* +3 */
- inc_le128(v4); /* +4 */
- inc_le128(v5); /* +5 */
- inc_le128(v6); /* +6 */
- inc_le128(v7); /* +7 */
- crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);
- ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
- eor v0.16b, v0.16b, RTMP0.16b;
- eor v1.16b, v1.16b, RTMP1.16b;
- eor v2.16b, v2.16b, RTMP2.16b;
- eor v3.16b, v3.16b, RTMP3.16b;
- st1 {v0.16b-v3.16b}, [x1], #64;
- ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
- eor v4.16b, v4.16b, RTMP0.16b;
- eor v5.16b, v5.16b, RTMP1.16b;
- eor v6.16b, v6.16b, RTMP2.16b;
- eor v7.16b, v7.16b, RTMP3.16b;
- st1 {v4.16b-v7.16b}, [x1], #64;
- b .Lctr_loop_blk;
- .Lctr_end:
- /* store new CTR */
- rev x7, x7;
- rev x8, x8;
- stp x7, x8, [x3];
- ret_spec_stop;
- CFI_ENDPROC();
- ELF(.size _gcry_sm4_armv8_ce_ctr_enc,.-_gcry_sm4_armv8_ce_ctr_enc;)
- .align 4
- .global _gcry_sm4_armv8_ce_xts_crypt
- ELF(.type _gcry_sm4_armv8_ce_xts_crypt,%function;)
- _gcry_sm4_armv8_ce_xts_crypt:
- /* input:
- * x0: round key array, CTX
- * x1: dst
- * x2: src
- * x3: tweak (big endian, 128 bit)
- * x4: nblocks
- */
- CFI_STARTPROC()
- VPUSH_ABI
- load_rkey(x0)
- mov x7, #0x87
- mov x8, #0x1
- mov RMASK.d[0], x7
- mov RMASK.d[1], x8
- ld1 {RIV.16b}, [x3]
- mov v8.16b, RIV.16b
- ext RIV.16b, RIV.16b, RIV.16b, #8
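- /* RMASK holds the XTS reduction constant 0x87 and the cross-half carry bit
-  * 0x1.  RIV starts as a half-swapped copy of the tweak and is doubled by
-  * each tweak_next, so its lane sign bits select those fixups. */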
- .Lxts_loop_blk:
- sub x4, x4, #8
- tbnz x4, #63, .Lxts_tail8
- #define tweak_next(vt, vin, RTMP) \
- sshr RTMP.2d, RIV.2d, #63; \
- add vt.2d, vin.2d, vin.2d; \
- and RTMP.16b, RTMP.16b, RMASK.16b; \
- add RIV.2d, RIV.2d, RIV.2d; \
- eor vt.16b, vt.16b, RTMP.16b;
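- /* tweak_next performs the XTS tweak doubling (multiplication by x in
-  * GF(2^128)): both 64-bit halves are doubled, then the cross-half carry and
-  * the 0x87 reduction are XORed in when the corresponding top bits, taken
-  * from the half-swapped companion RIV, are set. */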
- tweak_next( v9, v8, RTMP0)
- tweak_next(v10, v9, RTMP1)
- tweak_next(v11, v10, RTMP2)
- tweak_next(v12, v11, RTMP3)
- tweak_next(v13, v12, RTMP0)
- tweak_next(v14, v13, RTMP1)
- tweak_next(v15, v14, RTMP2)
- ld1 {v0.16b-v3.16b}, [x2], #64
- eor v0.16b, v0.16b, v8.16b
- eor v1.16b, v1.16b, v9.16b
- eor v2.16b, v2.16b, v10.16b
- eor v3.16b, v3.16b, v11.16b
- ld1 {v4.16b-v7.16b}, [x2], #64
- eor v4.16b, v4.16b, v12.16b
- eor v5.16b, v5.16b, v13.16b
- eor v6.16b, v6.16b, v14.16b
- eor v7.16b, v7.16b, v15.16b
- crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7)
- eor v0.16b, v0.16b, v8.16b
- eor v1.16b, v1.16b, v9.16b
- eor v2.16b, v2.16b, v10.16b
- eor v3.16b, v3.16b, v11.16b
- st1 {v0.16b-v3.16b}, [x1], #64
- eor v4.16b, v4.16b, v12.16b
- eor v5.16b, v5.16b, v13.16b
- eor v6.16b, v6.16b, v14.16b
- eor v7.16b, v7.16b, v15.16b
- st1 {v4.16b-v7.16b}, [x1], #64
- tweak_next(v8, v15, RTMP3)
- cbz x4, .Lxts_end
- b .Lxts_loop_blk
- .Lxts_tail8:
- add x4, x4, #8
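- /* The subtraction above went negative; x4 is the remaining block count
-  * again.  Handle four blocks at once if possible, then single blocks. */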
- cmp x4, #4
- blt .Lxts_tail4
- sub x4, x4, #4
- tweak_next( v9, v8, RTMP0)
- tweak_next(v10, v9, RTMP1)
- tweak_next(v11, v10, RTMP2)
- ld1 {v0.16b-v3.16b}, [x2], #64
- eor v0.16b, v0.16b, v8.16b
- eor v1.16b, v1.16b, v9.16b
- eor v2.16b, v2.16b, v10.16b
- eor v3.16b, v3.16b, v11.16b
- crypt_blk4(v0, v1, v2, v3);
- eor v0.16b, v0.16b, v8.16b
- eor v1.16b, v1.16b, v9.16b
- eor v2.16b, v2.16b, v10.16b
- eor v3.16b, v3.16b, v11.16b
- st1 {v0.16b-v3.16b}, [x1], #64
- tweak_next(v8, v11, RTMP3)
- cbz x4, .Lxts_end
- .Lxts_tail4:
- sub x4, x4, #1
- ld1 {v0.16b}, [x2], #16
- eor v0.16b, v0.16b, v8.16b
- SM4_CRYPT_BLK(v0)
- eor v0.16b, v0.16b, v8.16b
- st1 {v0.16b}, [x1], #16
- tweak_next(v8, v8, RTMP0)
- cbnz x4, .Lxts_tail4
- .Lxts_end:
- /* store new tweak */
- st1 {v8.16b}, [x3]
- CLEAR_REG(v8)
- CLEAR_REG(v9)
- CLEAR_REG(v10)
- CLEAR_REG(v11)
- CLEAR_REG(v12)
- CLEAR_REG(v13)
- CLEAR_REG(v14)
- CLEAR_REG(v15)
- CLEAR_REG(RIV)
- VPOP_ABI
- ret_spec_stop
- CFI_ENDPROC()
- ELF(.size _gcry_sm4_armv8_ce_xts_crypt,.-_gcry_sm4_armv8_ce_xts_crypt;)
- #endif