- /*
- * Accelerated GHASH implementation with ARMv8 PMULL instructions.
- *
- * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- */
- #include <linux/linkage.h>
- #include <asm/assembler.h>
- SHASH .req v0
- SHASH2 .req v1
- T1 .req v2
- T2 .req v3
- MASK .req v4
- XL .req v5
- XM .req v6
- XH .req v7
- IN1 .req v7
- k00_16 .req v8
- k32_48 .req v9
- t3 .req v10
- t4 .req v11
- t5 .req v12
- t6 .req v13
- t7 .req v14
- t8 .req v15
- t9 .req v16
- perm1 .req v17
- perm2 .req v18
- perm3 .req v19
- sh1 .req v20
- sh2 .req v21
- sh3 .req v22
- sh4 .req v23
- ss1 .req v24
- ss2 .req v25
- ss3 .req v26
- ss4 .req v27
- XL2 .req v8
- XM2 .req v9
- XH2 .req v10
- XL3 .req v11
- XM3 .req v12
- XH3 .req v13
- TT3 .req v14
- TT4 .req v15
- HH .req v16
- HH3 .req v17
- HH4 .req v18
- HH34 .req v19
- .text
- .arch armv8-a+crypto
- .macro __pmull_p64, rd, rn, rm
- pmull \rd\().1q, \rn\().1d, \rm\().1d
- .endm
- .macro __pmull2_p64, rd, rn, rm
- pmull2 \rd\().1q, \rn\().2d, \rm\().2d
- .endm
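
The p64 helpers map one-to-one onto the PMULL/PMULL2 instructions: `pmull` multiplies the low 64-bit lanes of its two operands, `pmull2` the high lanes, and either way the result is a 128-bit carry-less (polynomial) product. As a reference for what that product is, here is a minimal C sketch (helper name is mine, illustration only, not how the kernel computes it):

#include <stdint.h>

/*
 * Carry-less (polynomial) multiplication of two 64-bit values into a
 * 128-bit product -- the operation a single PMULL performs.
 */
static void clmul64(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
{
        uint64_t l = 0, h = 0;
        int i;

        for (i = 0; i < 64; i++) {
                if ((b >> i) & 1) {
                        l ^= a << i;
                        h ^= i ? a >> (64 - i) : 0;
                }
        }
        *lo = l;
        *hi = h;
}
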
- .macro __pmull_p8, rq, ad, bd
- ext t3.8b, \ad\().8b, \ad\().8b, #1 // A1
- ext t5.8b, \ad\().8b, \ad\().8b, #2 // A2
- ext t7.8b, \ad\().8b, \ad\().8b, #3 // A3
- __pmull_p8_\bd \rq, \ad
- .endm
- .macro __pmull2_p8, rq, ad, bd
- tbl t3.16b, {\ad\().16b}, perm1.16b // A1
- tbl t5.16b, {\ad\().16b}, perm2.16b // A2
- tbl t7.16b, {\ad\().16b}, perm3.16b // A3
- __pmull2_p8_\bd \rq, \ad
- .endm
- .macro __pmull_p8_SHASH, rq, ad
- __pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
- .endm
- .macro __pmull_p8_SHASH2, rq, ad
- __pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
- .endm
- .macro __pmull2_p8_SHASH, rq, ad
- __pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
- .endm
- .macro __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
- pmull\t t3.8h, t3.\nb, \bd // F = A1*B
- pmull\t t4.8h, \ad, \b1\().\nb // E = A*B1
- pmull\t t5.8h, t5.\nb, \bd // H = A2*B
- pmull\t t6.8h, \ad, \b2\().\nb // G = A*B2
- pmull\t t7.8h, t7.\nb, \bd // J = A3*B
- pmull\t t8.8h, \ad, \b3\().\nb // I = A*B3
- pmull\t t9.8h, \ad, \b4\().\nb // K = A*B4
- pmull\t \rq\().8h, \ad, \bd // D = A*B
- eor t3.16b, t3.16b, t4.16b // L = E + F
- eor t5.16b, t5.16b, t6.16b // M = G + H
- eor t7.16b, t7.16b, t8.16b // N = I + J
- uzp1 t4.2d, t3.2d, t5.2d
- uzp2 t3.2d, t3.2d, t5.2d
- uzp1 t6.2d, t7.2d, t9.2d
- uzp2 t7.2d, t7.2d, t9.2d
- // t3 = (L) (P0 + P1) << 8
- // t5 = (M) (P2 + P3) << 16
- eor t4.16b, t4.16b, t3.16b
- and t3.16b, t3.16b, k32_48.16b
- // t7 = (N) (P4 + P5) << 24
- // t9 = (K) (P6 + P7) << 32
- eor t6.16b, t6.16b, t7.16b
- and t7.16b, t7.16b, k00_16.16b
- eor t4.16b, t4.16b, t3.16b
- eor t6.16b, t6.16b, t7.16b
- zip2 t5.2d, t4.2d, t3.2d
- zip1 t3.2d, t4.2d, t3.2d
- zip2 t9.2d, t6.2d, t7.2d
- zip1 t7.2d, t6.2d, t7.2d
- ext t3.16b, t3.16b, t3.16b, #15
- ext t5.16b, t5.16b, t5.16b, #14
- ext t7.16b, t7.16b, t7.16b, #13
- ext t9.16b, t9.16b, t9.16b, #12
- eor t3.16b, t3.16b, t5.16b
- eor t7.16b, t7.16b, t9.16b
- eor \rq\().16b, \rq\().16b, t3.16b
- eor \rq\().16b, \rq\().16b, t7.16b
- .endm
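
The p8 path has to emulate the same 64x64 carry-less multiply using only the 8x8->16-bit form of `pmull`. The idea, visible in the A1/A2/A3 and B1..B4 comments above, is that the product can be assembled from byte-by-byte partial products placed at the right byte offsets; the vector code gets away with eight `pmull` invocations per 64-bit multiply because each one computes eight such byte products in parallel, and the masks and uzp/zip/ext shuffling afterwards combine the partial sums at the proper offsets. A scalar sketch of the underlying identity, with my own helper names and the GCC/Clang __int128 extension used purely for brevity:

#include <stdint.h>

/* 8x8 -> 16-bit carry-less multiply: the only PMULL form the p8 path uses. */
static uint16_t clmul8(uint8_t a, uint8_t b)
{
        uint16_t r = 0;
        int i;

        for (i = 0; i < 8; i++)
                if ((b >> i) & 1)
                        r ^= (uint16_t)(a << i);
        return r;
}

/*
 * A 64x64 carry-less multiply assembled from byte-by-byte partial
 * products -- the identity the vector code above exploits.
 */
static void clmul64_via_8x8(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
{
        unsigned __int128 acc = 0;
        int i, j;

        for (i = 0; i < 8; i++)
                for (j = 0; j < 8; j++)
                        acc ^= (unsigned __int128)clmul8((uint8_t)(a >> (8 * i)),
                                                         (uint8_t)(b >> (8 * j)))
                               << (8 * (i + j));

        *lo = (uint64_t)acc;
        *hi = (uint64_t)(acc >> 64);
}
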
- .macro __pmull_pre_p64
- add x8, x3, #16
- ld1 {HH.2d-HH4.2d}, [x8]
- trn1 SHASH2.2d, SHASH.2d, HH.2d
- trn2 T1.2d, SHASH.2d, HH.2d
- eor SHASH2.16b, SHASH2.16b, T1.16b
- trn1 HH34.2d, HH3.2d, HH4.2d
- trn2 T1.2d, HH3.2d, HH4.2d
- eor HH34.16b, HH34.16b, T1.16b
- movi MASK.16b, #0xe1
- shl MASK.2d, MASK.2d, #57
- .endm
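
__pmull_pre_p64 loads H^2, H^3 and H^4 (stored in the key structure right after H itself, hence the #16 offset) and pre-folds each power's two 64-bit halves: after the trn1/trn2/eor pairs, SHASH2 holds {H.lo^H.hi, H^2.lo^H^2.hi} and HH34 the same for H^3/H^4. Those folded values are the b-side of the Karatsuba middle product used in every multiply below, so they are computed once per call rather than once per block; MASK works out to 0xc200000000000000 in each 64-bit lane, the reduction constant consumed by __pmull_reduce_p64. For reference, the Karatsuba combination itself, reusing clmul64() from the sketch above (names mine):

#include <stdint.h>

/*
 * Karatsuba: one 128x128 carry-less multiply from three 64x64 ones.
 * r[0] is the least significant 64-bit word of the 256-bit product.
 */
static void clmul128(const uint64_t a[2], const uint64_t b[2], uint64_t r[4])
{
        uint64_t lo[2], hi[2], mid[2];

        clmul64(a[0], b[0], &lo[0], &lo[1]);                    /* a0 * b0 */
        clmul64(a[1], b[1], &hi[0], &hi[1]);                    /* a1 * b1 */
        clmul64(a[0] ^ a[1], b[0] ^ b[1], &mid[0], &mid[1]);    /* (a1+a0)(b1+b0) */

        mid[0] ^= lo[0] ^ hi[0];        /* XOR out the outer products */
        mid[1] ^= lo[1] ^ hi[1];

        r[0] = lo[0];
        r[1] = lo[1] ^ mid[0];
        r[2] = hi[0] ^ mid[1];
        r[3] = hi[1];
}
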
- .macro __pmull_pre_p8
- ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
- eor SHASH2.16b, SHASH2.16b, SHASH.16b
- // k00_16 := 0x0000000000000000_000000000000ffff
- // k32_48 := 0x00000000ffffffff_0000ffffffffffff
- movi k32_48.2d, #0xffffffff
- mov k32_48.h[2], k32_48.h[0]
- ushr k00_16.2d, k32_48.2d, #32
- // prepare the permutation vectors
- mov_q x5, 0x080f0e0d0c0b0a09
- movi T1.8b, #8
- dup perm1.2d, x5
- eor perm1.16b, perm1.16b, T1.16b
- ushr perm2.2d, perm1.2d, #8
- ushr perm3.2d, perm1.2d, #16
- ushr T1.2d, perm1.2d, #24
- sli perm2.2d, perm1.2d, #56
- sli perm3.2d, perm1.2d, #48
- sli T1.2d, perm1.2d, #40
- // precompute loop invariants
- tbl sh1.16b, {SHASH.16b}, perm1.16b
- tbl sh2.16b, {SHASH.16b}, perm2.16b
- tbl sh3.16b, {SHASH.16b}, perm3.16b
- tbl sh4.16b, {SHASH.16b}, T1.16b
- ext ss1.8b, SHASH2.8b, SHASH2.8b, #1
- ext ss2.8b, SHASH2.8b, SHASH2.8b, #2
- ext ss3.8b, SHASH2.8b, SHASH2.8b, #3
- ext ss4.8b, SHASH2.8b, SHASH2.8b, #4
- .endm
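
For the p8 fallback the per-call precomputation is heavier: sh1..sh4 and ss1..ss4 are byte-rotated copies of SHASH and SHASH2 (the B1..B4 operands of the tail macro), built once here via tbl/ext instead of inside the block loop, and the two masks end up with the values spelled out in the comment above. Written out as C constants for reference (index 0 is the low 64-bit lane):

#include <stdint.h>

/* The two masks from the comment above, per 64-bit lane. */
static const uint64_t k00_16[2] = { 0x000000000000ffffULL, 0x0000000000000000ULL };
static const uint64_t k32_48[2] = { 0x0000ffffffffffffULL, 0x00000000ffffffffULL };
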
- //
- // PMULL (64x64->128) based reduction for CPUs that can do
- // it in a single instruction.
- //
- .macro __pmull_reduce_p64
- pmull T2.1q, XL.1d, MASK.1d
- eor XM.16b, XM.16b, T1.16b
- mov XH.d[0], XM.d[1]
- mov XM.d[1], XL.d[0]
- eor XL.16b, XM.16b, T2.16b
- ext T2.16b, XL.16b, XL.16b, #8
- pmull XL.1q, XL.1d, MASK.1d
- .endm
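
Both reduction flavours implement the same folding idea: the 256-bit Karatsuba product has to be reduced modulo g(x) = x^128 + x^7 + x^2 + x + 1, and since x^128 is congruent to x^7 + x^2 + x + 1 (mod g), the upper 128 bits can be multiplied by that small polynomial and XORed back into the lower half; doing this twice absorbs the few bits the first fold pushes back past bit 127. The sketch below shows the fold in the plain, non-reflected polynomial convention (helper names mine); the assembly operates on GHASH's bit-reflected representation, which is why its constant shows up as 0xe1/0xc2-style values rather than 0x87.

#include <stdint.h>

/* XOR a 128-bit value (h1:h0), shifted left by k < 64, into a 192-bit accumulator. */
static void xor_shl128(uint64_t f[3], uint64_t h0, uint64_t h1, int k)
{
        if (k == 0) {
                f[0] ^= h0;
                f[1] ^= h1;
        } else {
                f[0] ^= h0 << k;
                f[1] ^= (h1 << k) | (h0 >> (64 - k));
                f[2] ^= h1 >> (64 - k);
        }
}

/*
 * Reduce a 256-bit carry-less product (r[0] = least significant word)
 * modulo x^128 + x^7 + x^2 + x + 1 by folding the top half down twice.
 * Plain polynomial convention, illustration only.
 */
static void gf128_fold_reduce(uint64_t r[4], uint64_t out[2])
{
        int pass;

        for (pass = 0; pass < 2; pass++) {
                uint64_t f[3] = { 0, 0, 0 };

                xor_shl128(f, r[2], r[3], 0);   /* * 1   */
                xor_shl128(f, r[2], r[3], 1);   /* * x   */
                xor_shl128(f, r[2], r[3], 2);   /* * x^2 */
                xor_shl128(f, r[2], r[3], 7);   /* * x^7 */

                r[0] ^= f[0];
                r[1] ^= f[1];
                r[2] = f[2];                    /* at most 7 bits after pass 0 */
                r[3] = 0;
        }
        out[0] = r[0];
        out[1] = r[1];
}
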
- //
- // Alternative reduction for CPUs that lack support for the
- // 64x64->128 PMULL instruction
- //
- .macro __pmull_reduce_p8
- eor XM.16b, XM.16b, T1.16b
- mov XL.d[1], XM.d[0]
- mov XH.d[0], XM.d[1]
- shl T1.2d, XL.2d, #57
- shl T2.2d, XL.2d, #62
- eor T2.16b, T2.16b, T1.16b
- shl T1.2d, XL.2d, #63
- eor T2.16b, T2.16b, T1.16b
- ext T1.16b, XL.16b, XH.16b, #8
- eor T2.16b, T2.16b, T1.16b
- mov XL.d[1], T2.d[0]
- mov XH.d[0], T2.d[1]
- ushr T2.2d, XL.2d, #1
- eor XH.16b, XH.16b, XL.16b
- eor XL.16b, XL.16b, T2.16b
- ushr T2.2d, T2.2d, #6
- ushr XL.2d, XL.2d, #1
- .endm
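
Without a 64-bit PMULL the same two folds are done with shifts and XORs: the shl amounts 63, 62 and 57 and the later ushr amounts 1, 2 and 7 (the #1 followed by the extra #6) correspond to the x, x^2 and x^7 terms of the reduction polynomial in the bit-reflected representation, applied to the low and high halves of the product in turn.
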
- .macro __pmull_ghash, pn
- ld1 {SHASH.2d}, [x3]
- ld1 {XL.2d}, [x1]
- __pmull_pre_\pn
- /* do the head block first, if supplied */
- cbz x4, 0f
- ld1 {T1.2d}, [x4]
- mov x4, xzr
- b 3f
- 0: .ifc \pn, p64
- tbnz w0, #0, 2f // skip until #blocks is a
- tbnz w0, #1, 2f // round multiple of 4
- 1: ld1 {XM3.16b-TT4.16b}, [x2], #64
- sub w0, w0, #4
- rev64 T1.16b, XM3.16b
- rev64 T2.16b, XH3.16b
- rev64 TT4.16b, TT4.16b
- rev64 TT3.16b, TT3.16b
- ext IN1.16b, TT4.16b, TT4.16b, #8
- ext XL3.16b, TT3.16b, TT3.16b, #8
- eor TT4.16b, TT4.16b, IN1.16b
- pmull2 XH2.1q, SHASH.2d, IN1.2d // a1 * b1
- pmull XL2.1q, SHASH.1d, IN1.1d // a0 * b0
- pmull XM2.1q, SHASH2.1d, TT4.1d // (a1 + a0)(b1 + b0)
- eor TT3.16b, TT3.16b, XL3.16b
- pmull2 XH3.1q, HH.2d, XL3.2d // a1 * b1
- pmull XL3.1q, HH.1d, XL3.1d // a0 * b0
- pmull2 XM3.1q, SHASH2.2d, TT3.2d // (a1 + a0)(b1 + b0)
- ext IN1.16b, T2.16b, T2.16b, #8
- eor XL2.16b, XL2.16b, XL3.16b
- eor XH2.16b, XH2.16b, XH3.16b
- eor XM2.16b, XM2.16b, XM3.16b
- eor T2.16b, T2.16b, IN1.16b
- pmull2 XH3.1q, HH3.2d, IN1.2d // a1 * b1
- pmull XL3.1q, HH3.1d, IN1.1d // a0 * b0
- pmull XM3.1q, HH34.1d, T2.1d // (a1 + a0)(b1 + b0)
- eor XL2.16b, XL2.16b, XL3.16b
- eor XH2.16b, XH2.16b, XH3.16b
- eor XM2.16b, XM2.16b, XM3.16b
- ext IN1.16b, T1.16b, T1.16b, #8
- ext TT3.16b, XL.16b, XL.16b, #8
- eor XL.16b, XL.16b, IN1.16b
- eor T1.16b, T1.16b, TT3.16b
- pmull2 XH.1q, HH4.2d, XL.2d // a1 * b1
- eor T1.16b, T1.16b, XL.16b
- pmull XL.1q, HH4.1d, XL.1d // a0 * b0
- pmull2 XM.1q, HH34.2d, T1.2d // (a1 + a0)(b1 + b0)
- eor XL.16b, XL.16b, XL2.16b
- eor XH.16b, XH.16b, XH2.16b
- eor XM.16b, XM.16b, XM2.16b
- eor T2.16b, XL.16b, XH.16b
- ext T1.16b, XL.16b, XH.16b, #8
- eor XM.16b, XM.16b, T2.16b
- __pmull_reduce_p64
- eor T2.16b, T2.16b, XH.16b
- eor XL.16b, XL.16b, T2.16b
- cbz w0, 5f
- b 1b
- .endif
- 2: ld1 {T1.2d}, [x2], #16
- sub w0, w0, #1
- 3: /* multiply XL by SHASH in GF(2^128) */
- CPU_LE( rev64 T1.16b, T1.16b )
- ext T2.16b, XL.16b, XL.16b, #8
- ext IN1.16b, T1.16b, T1.16b, #8
- eor T1.16b, T1.16b, T2.16b
- eor XL.16b, XL.16b, IN1.16b
- __pmull2_\pn XH, XL, SHASH // a1 * b1
- eor T1.16b, T1.16b, XL.16b
- __pmull_\pn XL, XL, SHASH // a0 * b0
- __pmull_\pn XM, T1, SHASH2 // (a1 + a0)(b1 + b0)
- 4: eor T2.16b, XL.16b, XH.16b
- ext T1.16b, XL.16b, XH.16b, #8
- eor XM.16b, XM.16b, T2.16b
- __pmull_reduce_\pn
- eor T2.16b, T2.16b, XH.16b
- eor XL.16b, XL.16b, T2.16b
- cbnz w0, 0b
- 5: st1 {XL.2d}, [x1]
- ret
- .endm
- /*
- * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
- * struct ghash_key const *k, const char *head)
- */
- ENTRY(pmull_ghash_update_p64)
- __pmull_ghash p64
- ENDPROC(pmull_ghash_update_p64)
- ENTRY(pmull_ghash_update_p8)
- __pmull_ghash p8
- ENDPROC(pmull_ghash_update_p8)
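
Per AAPCS64 the arguments arrive as w0 = blocks, x1 = dg (the running GHASH state, loaded into XL and stored back), x2 = src, x3 = key and x4 = the optional head block, which is hashed first without being counted against blocks. As a rough reference model of what one call computes: the field multiply below is the schoolbook bit-by-bit version from the GCM specification and the update loop mirrors the flow of the macro; byte-order details (the rev64/ext shuffling) are deliberately glossed over and the helper names are mine, not the kernel's.

#include <stdint.h>
#include <string.h>

/*
 * Bit-by-bit GF(2^128) multiply as specified for GHASH: bit 0 is the MSB
 * of byte 0 and the field polynomial appears as the 0xe1 constant.
 * Reference only; the assembly reaches the same result via PMULL,
 * Karatsuba and the folding reduction.
 */
static void gf128_mul_ref(uint8_t z[16], const uint8_t x[16], const uint8_t y[16])
{
        uint8_t v[16], acc[16] = { 0 };
        int i, k;

        memcpy(v, y, 16);

        for (i = 0; i < 128; i++) {
                int lsb;

                if (x[i / 8] & (0x80 >> (i % 8)))       /* bit i of x set? */
                        for (k = 0; k < 16; k++)
                                acc[k] ^= v[k];

                lsb = v[15] & 1;                        /* v = v * x, reduced */
                for (k = 15; k > 0; k--)
                        v[k] = (uint8_t)((v[k] >> 1) | (v[k - 1] << 7));
                v[0] >>= 1;
                if (lsb)
                        v[0] ^= 0xe1;
        }
        memcpy(z, acc, 16);
}

/* Reference model of one update call (byte-order details omitted). */
static void ghash_update_ref(uint8_t dg[16], const uint8_t *src, int blocks,
                             const uint8_t h[16], const uint8_t *head)
{
        uint8_t tmp[16];
        int i;

        if (head) {                     /* head block: hashed, not counted */
                for (i = 0; i < 16; i++)
                        dg[i] ^= head[i];
                gf128_mul_ref(tmp, dg, h);
                memcpy(dg, tmp, 16);
        }

        while (blocks--) {
                for (i = 0; i < 16; i++)
                        dg[i] ^= src[i];
                gf128_mul_ref(tmp, dg, h);
                memcpy(dg, tmp, 16);
                src += 16;
        }
}
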
- KS0 .req v12
- KS1 .req v13
- INP0 .req v14
- INP1 .req v15
- .macro load_round_keys, rounds, rk
- cmp \rounds, #12
- blo 2222f /* 128 bits */
- beq 1111f /* 192 bits */
- ld1 {v17.4s-v18.4s}, [\rk], #32
- 1111: ld1 {v19.4s-v20.4s}, [\rk], #32
- 2222: ld1 {v21.4s-v24.4s}, [\rk], #64
- ld1 {v25.4s-v28.4s}, [\rk], #64
- ld1 {v29.4s-v31.4s}, [\rk]
- .endm
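
AES-128/192/256 use 10/12/14 rounds and hence 11/13/15 round keys. The macro always parks the final 15 round keys in v17..v31 and merely skips the leading loads for shorter keys, so the last round key is always in v31 and enc_block below can branch past the unused rounds the same way. A sketch of that register assignment (illustrative helper, not kernel code):

/*
 * Which vector register receives the first round key for a given number
 * of AES rounds, per the layout load_round_keys sets up.
 */
static int first_round_key_reg(int rounds)
{
        return 17 + (14 - rounds);      /* 14 -> v17, 12 -> v19, 10 -> v21 */
}
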
- .macro enc_round, state, key
- aese \state\().16b, \key\().16b
- aesmc \state\().16b, \state\().16b
- .endm
- .macro enc_block, state, rounds
- cmp \rounds, #12
- b.lo 2222f /* 128 bits */
- b.eq 1111f /* 192 bits */
- enc_round \state, v17
- enc_round \state, v18
- 1111: enc_round \state, v19
- enc_round \state, v20
- 2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29
- enc_round \state, \key
- .endr
- aese \state\().16b, v30.16b
- eor \state\().16b, \state\().16b, v31.16b
- .endm
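
enc_round is the usual ARMv8 Crypto Extensions round pair: AESE XORs in the round key and then applies SubBytes and ShiftRows, AESMC applies MixColumns. That is why the final round is a bare AESE against v30 (no MixColumns) followed by the eor with v31, which supplies the last AddRoundKey.
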
- .macro pmull_gcm_do_crypt, enc
- ld1 {SHASH.2d}, [x4], #16
- ld1 {HH.2d}, [x4]
- ld1 {XL.2d}, [x1]
- ldr x8, [x5, #8] // load lower counter
- movi MASK.16b, #0xe1
- trn1 SHASH2.2d, SHASH.2d, HH.2d
- trn2 T1.2d, SHASH.2d, HH.2d
- CPU_LE( rev x8, x8 )
- shl MASK.2d, MASK.2d, #57
- eor SHASH2.16b, SHASH2.16b, T1.16b
- .if \enc == 1
- ldr x10, [sp]
- ld1 {KS0.16b-KS1.16b}, [x10]
- .endif
- cbnz x6, 4f
- 0: ld1 {INP0.16b-INP1.16b}, [x3], #32
- rev x9, x8
- add x11, x8, #1
- add x8, x8, #2
- .if \enc == 1
- eor INP0.16b, INP0.16b, KS0.16b // encrypt input
- eor INP1.16b, INP1.16b, KS1.16b
- .endif
- ld1 {KS0.8b}, [x5] // load upper counter
- rev x11, x11
- sub w0, w0, #2
- mov KS1.8b, KS0.8b
- ins KS0.d[1], x9 // set lower counter
- ins KS1.d[1], x11
- rev64 T1.16b, INP1.16b
- cmp w7, #12
- b.ge 2f // AES-192/256?
- 1: enc_round KS0, v21
- ext IN1.16b, T1.16b, T1.16b, #8
- enc_round KS1, v21
- pmull2 XH2.1q, SHASH.2d, IN1.2d // a1 * b1
- enc_round KS0, v22
- eor T1.16b, T1.16b, IN1.16b
- enc_round KS1, v22
- pmull XL2.1q, SHASH.1d, IN1.1d // a0 * b0
- enc_round KS0, v23
- pmull XM2.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
- enc_round KS1, v23
- rev64 T1.16b, INP0.16b
- ext T2.16b, XL.16b, XL.16b, #8
- enc_round KS0, v24
- ext IN1.16b, T1.16b, T1.16b, #8
- eor T1.16b, T1.16b, T2.16b
- enc_round KS1, v24
- eor XL.16b, XL.16b, IN1.16b
- enc_round KS0, v25
- eor T1.16b, T1.16b, XL.16b
- enc_round KS1, v25
- pmull2 XH.1q, HH.2d, XL.2d // a1 * b1
- enc_round KS0, v26
- pmull XL.1q, HH.1d, XL.1d // a0 * b0
- enc_round KS1, v26
- pmull2 XM.1q, SHASH2.2d, T1.2d // (a1 + a0)(b1 + b0)
- enc_round KS0, v27
- eor XL.16b, XL.16b, XL2.16b
- eor XH.16b, XH.16b, XH2.16b
- enc_round KS1, v27
- eor XM.16b, XM.16b, XM2.16b
- ext T1.16b, XL.16b, XH.16b, #8
- enc_round KS0, v28
- eor T2.16b, XL.16b, XH.16b
- eor XM.16b, XM.16b, T1.16b
- enc_round KS1, v28
- eor XM.16b, XM.16b, T2.16b
- enc_round KS0, v29
- pmull T2.1q, XL.1d, MASK.1d
- enc_round KS1, v29
- mov XH.d[0], XM.d[1]
- mov XM.d[1], XL.d[0]
- aese KS0.16b, v30.16b
- eor XL.16b, XM.16b, T2.16b
- aese KS1.16b, v30.16b
- ext T2.16b, XL.16b, XL.16b, #8
- eor KS0.16b, KS0.16b, v31.16b
- pmull XL.1q, XL.1d, MASK.1d
- eor T2.16b, T2.16b, XH.16b
- eor KS1.16b, KS1.16b, v31.16b
- eor XL.16b, XL.16b, T2.16b
- .if \enc == 0
- eor INP0.16b, INP0.16b, KS0.16b
- eor INP1.16b, INP1.16b, KS1.16b
- .endif
- st1 {INP0.16b-INP1.16b}, [x2], #32
- cbnz w0, 0b
- CPU_LE( rev x8, x8 )
- st1 {XL.2d}, [x1]
- str x8, [x5, #8] // store lower counter
- .if \enc == 1
- st1 {KS0.16b-KS1.16b}, [x10]
- .endif
- ret
- 2: b.eq 3f // AES-192?
- enc_round KS0, v17
- enc_round KS1, v17
- enc_round KS0, v18
- enc_round KS1, v18
- 3: enc_round KS0, v19
- enc_round KS1, v19
- enc_round KS0, v20
- enc_round KS1, v20
- b 1b
- 4: load_round_keys w7, x6
- b 0b
- .endm
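
A note on the counter handling in the macro above: ctr[] is the big-endian 128-bit counter block, its upper eight bytes stay fixed for the whole call, and only the lower half lives in x8 as a host-order integer (hence the rev on load and store on little-endian kernels), getting byte-swapped back each time a counter block is materialised for encryption. A small sketch of that construction (helper name is mine; __builtin_bswap64 is a GCC/Clang builtin):

#include <stdint.h>
#include <string.h>

/*
 * Build one 16-byte counter block the way the loop does: fixed upper
 * half straight from ctr[], lower half converted back to big endian.
 */
static void make_ctr_block(uint8_t blk[16], const uint8_t ctr_hi[8], uint64_t ctr_lo)
{
        uint64_t be = __builtin_bswap64(ctr_lo);

        memcpy(blk, ctr_hi, 8);
        memcpy(blk + 8, &be, 8);
}
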
- /*
- * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
- * struct ghash_key const *k, u8 ctr[], u32 const rk[],
- * int rounds, u8 ks[])
- */
- ENTRY(pmull_gcm_encrypt)
- pmull_gcm_do_crypt 1
- ENDPROC(pmull_gcm_encrypt)
- /*
- * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
- * struct ghash_key const *k, u8 ctr[], u32 const rk[],
- * int rounds)
- */
- ENTRY(pmull_gcm_decrypt)
- pmull_gcm_do_crypt 0
- ENDPROC(pmull_gcm_decrypt)
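
The two entry points run CTR and GHASH over the same data in one pass, interleaving the AES rounds for the next pair of counter blocks with the PMULLs for the current pair of ciphertext blocks so neither unit stalls on the other. In the encrypt direction, ks[] carries two keystream blocks across calls: the pair loaded at entry encrypts the first two input blocks, and the pair produced by the last iteration is stored back for the caller. Ignoring byte order and the two-way aggregation (multiplying one block by H^2 and the next by H gives the same result as two sequential multiplies by H), one encrypt iteration amounts to roughly the following sketch, reusing gf128_mul_ref() and make_ctr_block() from the earlier sketches plus a hypothetical aes_ecb_encrypt():

#include <stdint.h>
#include <string.h>

struct aes_ctx;                                         /* hypothetical */
void aes_ecb_encrypt(uint8_t out[16], const uint8_t in[16],
                     const struct aes_ctx *key);        /* hypothetical */

/*
 * Model of one two-block iteration of the encrypt loop.  ks[] holds the
 * keystream for the current pair on entry and for the next pair on exit,
 * mirroring the ks[] buffer the real code carries between calls.
 */
static void gcm_enc_pair_model(uint8_t dst[32], const uint8_t src[32],
                               uint8_t dg[16], const uint8_t h[16],
                               uint8_t ks[32], const uint8_t ctr_hi[8],
                               uint64_t *ctr_lo, const struct aes_ctx *key)
{
        uint8_t blk[16], tmp[16];
        int b, i;

        /* encrypt with the keystream prepared by the previous iteration */
        for (i = 0; i < 32; i++)
                dst[i] = src[i] ^ ks[i];

        /* generate the keystream for the next two blocks */
        for (b = 0; b < 2; b++) {
                make_ctr_block(blk, ctr_hi, (*ctr_lo)++);
                aes_ecb_encrypt(ks + 16 * b, blk, key);
        }

        /* absorb the two ciphertext blocks into the GHASH state */
        for (b = 0; b < 2; b++) {
                for (i = 0; i < 16; i++)
                        dg[i] ^= dst[16 * b + i];
                gf128_mul_ref(tmp, dg, h);
                memcpy(dg, tmp, 16);
        }
}
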
- /*
- * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
- */
- ENTRY(pmull_gcm_encrypt_block)
- cbz x2, 0f
- load_round_keys w3, x2
- 0: ld1 {v0.16b}, [x1]
- enc_block v0, w3
- st1 {v0.16b}, [x0]
- ret
- ENDPROC(pmull_gcm_encrypt_block)
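
pmull_gcm_encrypt_block is a bare single-block AES encryption: if a NULL key pointer is passed in x2 it reuses whatever round keys an earlier call already left in v17..v31, which is presumably how the C glue avoids reloading the schedule when it needs an odd keystream or tag block.
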