123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352 |
- /*
- * Fast AES implementation for SPE instruction set (PPC)
- *
- * This code makes use of the SPE SIMD instruction set as defined in
- * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
- * Implementation is based on optimization guide notes from
- * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
- *
- * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- */
- #include <asm/ppc_asm.h>
- #include "aes-spe-regs.h"
- #define EAD(in, bpos) \
- rlwimi rT0,in,28-((bpos+3)%4)*8,20,27; /* rT0 bits 20-27 = byte bpos of in (0 = least significant), i.e. byte * 16; other rT0 bits kept */
- #define DAD(in, bpos) \
- rlwimi rT1,in,24-((bpos+3)%4)*8,24,31; /* rT1 bits 24-31 = byte bpos of in (0 = least significant); other rT1 bits kept */
- #define LWH(out, off) \
- evlwwsplat out,off(rT0); /* load word high (splat: the word lands in both halves of out) */
- #define LWL(out, off) \
- lwz out,off(rT0); /* load word low */
- #define LBZ(out, tab, off) \
- lbz out,off(tab); /* load byte */
- #define LAH(out, in, bpos, off) \
- EAD(in, bpos) /* calc addr + load word high */ \
- LWH(out, off)
- #define LAL(out, in, bpos, off) \
- EAD(in, bpos) /* calc addr + load word low */ \
- LWL(out, off)
- #define LAE(out, in, bpos) \
- EAD(in, bpos) /* calc addr + load enc byte */ \
- LBZ(out, rT0, 8) /* enc byte sits at offset 8 of the entry addressed by rT0 */
- #define LBE(out) \
- LBZ(out, rT0, 8) /* load enc byte */
- #define LAD(out, in, bpos) \
- DAD(in, bpos) /* calc addr + load dec byte */ \
- LBZ(out, rT1, 0) /* dec byte sits at offset 0 of the address held in rT1 */
- #define LBD(out) \
- LBZ(out, rT1, 0) /* load dec byte (address must already be in rT1) */
- /*
- * ppc_encrypt_block: The central encryption function for a single 16 bytes
- * block. It does no stack handling or register saving to support fast calls
- * via bl/blr. It expects that caller has pre-xored input data with first
- * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
- * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
- * and rW0-rW3 and caller must execute a final xor on the output registers.
- * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
- *
- * Register usage as visible in the code below:
- * rT0 - lookup table base. EAD() overwrites bits 20-27 with byte * 16, so
- *       entries are addressed in 16 byte steps; rounds load words at
- *       offsets 0/4/8/12, the final round loads bytes at offset 8.
- *       rT0 is left pointing at the last entry accessed.
- * rKP - key pointer. Round keys are read starting at 16(rKP); the 16 bytes
- *       at 0(rKP) are not read here. rKP is advanced by 32 per loop
- *       iteration and is not restored.
- * CTR - loop iteration count. Each iteration performs two rounds and the
- *       loop is followed by one more table round plus the byte based
- *       final round, i.e. 2 * CTR + 2 rounds in total.
- * On return rW0-rW3 hold the four words assembled from the final round
- * byte lookups and rD0-rD3 hold the four key words read from 32..44(rKP).
- *
- */
- _GLOBAL(ppc_encrypt_block)
- LAH(rW4, rD1, 2, 4) /* preload the three lookups that each round's tail issues for the next round */
- LAH(rW6, rD0, 3, 0)
- LAH(rW3, rD0, 1, 8)
- ppc_encrypt_block_loop: /* two rounds per iteration */
- LAH(rW0, rD3, 0, 12) /* first round of the iteration: table word lookups */
- LAL(rW0, rD0, 0, 12)
- LAH(rW1, rD1, 0, 12)
- LAH(rW2, rD2, 1, 8)
- LAL(rW2, rD3, 1, 8)
- LAL(rW3, rD1, 1, 8)
- LAL(rW4, rD2, 2, 4)
- LAL(rW6, rD1, 3, 0)
- LAH(rW5, rD3, 2, 4)
- LAL(rW5, rD0, 2, 4)
- LAH(rW7, rD2, 3, 0)
- evldw rD1,16(rKP) /* 8 bytes of round key */
- EAD(rD3, 3) /* last two low word lookups are interleaved with the xors */
- evxor rW2,rW2,rW4
- LWL(rW7, 0)
- evxor rW2,rW2,rW6
- EAD(rD2, 0)
- evxor rD1,rD1,rW2
- LWL(rW1, 12)
- evxor rD1,rD1,rW0
- evldw rD3,24(rKP) /* next 8 bytes of the same round key */
- evmergehi rD0,rD0,rD1 /* rD0 low word = rD1 high word */
- EAD(rD1, 2) /* tail: start the three lookups for the next round (same as the preload) */
- evxor rW3,rW3,rW5
- LWH(rW4, 4)
- evxor rW3,rW3,rW7
- EAD(rD0, 3)
- evxor rD3,rD3,rW3
- LWH(rW6, 0)
- evxor rD3,rD3,rW1
- EAD(rD0, 1)
- evmergehi rD2,rD2,rD3 /* rD2 low word = rD3 high word */
- LWH(rW3, 8)
- LAH(rW0, rD3, 0, 12) /* second round of the iteration, same schedule */
- LAL(rW0, rD0, 0, 12)
- LAH(rW1, rD1, 0, 12)
- LAH(rW2, rD2, 1, 8)
- LAL(rW2, rD3, 1, 8)
- LAL(rW3, rD1, 1, 8)
- LAL(rW4, rD2, 2, 4)
- LAL(rW6, rD1, 3, 0)
- LAH(rW5, rD3, 2, 4)
- LAL(rW5, rD0, 2, 4)
- LAH(rW7, rD2, 3, 0)
- evldw rD1,32(rKP) /* round key of the second round */
- EAD(rD3, 3)
- evxor rW2,rW2,rW4
- LWL(rW7, 0)
- evxor rW2,rW2,rW6
- EAD(rD2, 0)
- evxor rD1,rD1,rW2
- LWL(rW1, 12)
- evxor rD1,rD1,rW0
- evldw rD3,40(rKP)
- evmergehi rD0,rD0,rD1
- EAD(rD1, 2)
- evxor rW3,rW3,rW5
- LWH(rW4, 4)
- evxor rW3,rW3,rW7
- EAD(rD0, 3)
- evxor rD3,rD3,rW3
- LWH(rW6, 0)
- evxor rD3,rD3,rW1
- EAD(rD0, 1)
- evmergehi rD2,rD2,rD3
- LWH(rW3, 8)
- addi rKP,rKP,32 /* step over the two round keys just used */
- bdnz ppc_encrypt_block_loop /* decrement CTR, loop while it is non-zero */
- LAH(rW0, rD3, 0, 12) /* last table driven round (outside the loop) */
- LAL(rW0, rD0, 0, 12)
- LAH(rW1, rD1, 0, 12)
- LAH(rW2, rD2, 1, 8)
- LAL(rW2, rD3, 1, 8)
- LAL(rW3, rD1, 1, 8)
- LAL(rW4, rD2, 2, 4)
- LAH(rW5, rD3, 2, 4)
- LAL(rW6, rD1, 3, 0)
- LAL(rW5, rD0, 2, 4)
- LAH(rW7, rD2, 3, 0)
- evldw rD1,16(rKP)
- EAD(rD3, 3)
- evxor rW2,rW2,rW4
- LWL(rW7, 0)
- evxor rW2,rW2,rW6
- EAD(rD2, 0)
- evxor rD1,rD1,rW2
- LWL(rW1, 12)
- evxor rD1,rD1,rW0
- evldw rD3,24(rKP)
- evmergehi rD0,rD0,rD1
- EAD(rD1, 0) /* final round starts here: byte loads (LBE/LAE) instead of word loads */
- evxor rW3,rW3,rW5
- LBE(rW2)
- evxor rW3,rW3,rW7
- EAD(rD0, 1)
- evxor rD3,rD3,rW3
- LBE(rW6)
- evxor rD3,rD3,rW1
- EAD(rD0, 0)
- evmergehi rD2,rD2,rD3
- LBE(rW1)
- LAE(rW0, rD3, 0)
- LAE(rW1, rD0, 0) /* NOTE(review): same rD0 / bpos 0 lookup as the EAD + LBE(rW1) just above; looks redundant */
- LAE(rW4, rD2, 1)
- LAE(rW5, rD3, 1)
- LAE(rW3, rD2, 0)
- LAE(rW7, rD1, 1)
- rlwimi rW0,rW4,8,16,23 /* merge looked up byte into bits 16-23 */
- rlwimi rW1,rW5,8,16,23
- LAE(rW4, rD1, 2)
- LAE(rW5, rD2, 2)
- rlwimi rW2,rW6,8,16,23
- rlwimi rW3,rW7,8,16,23
- LAE(rW6, rD3, 2)
- LAE(rW7, rD0, 2)
- rlwimi rW0,rW4,16,8,15 /* merge looked up byte into bits 8-15 */
- rlwimi rW1,rW5,16,8,15
- LAE(rW4, rD0, 3) /* last read of rD0 as state, it is reloaded with key below */
- LAE(rW5, rD1, 3)
- rlwimi rW2,rW6,16,8,15
- lwz rD0,32(rKP) /* rD0-rD3 = last four key words for the caller's final xor */
- rlwimi rW3,rW7,16,8,15
- lwz rD1,36(rKP)
- LAE(rW6, rD2, 3)
- LAE(rW7, rD3, 3)
- rlwimi rW0,rW4,24,0,7 /* merge looked up byte into bits 0-7 */
- lwz rD2,40(rKP)
- rlwimi rW1,rW5,24,0,7
- lwz rD3,44(rKP)
- rlwimi rW2,rW6,24,0,7
- rlwimi rW3,rW7,24,0,7
- blr /* return: rW0-rW3 = substituted words, rD0-rD3 = key words */
- /*
- * ppc_decrypt_block: The central decryption function for a single 16 bytes
- * block. It does no stack handling or register saving to support fast calls
- * via bl/blr. It expects that caller has pre-xored input data with first
- * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
- * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
- * and rW0-rW3 and caller must execute a final xor on the output registers.
- * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
- *
- * Register usage as visible in the code below:
- * rT0 - lookup table base for the word lookups of all table rounds. EAD()
- *       overwrites bits 20-27 with byte * 16; words are loaded at offsets
- *       0/4/8/12 of the addressed entry.
- * rT1 - must be set up too, although not listed above: the final round
- *       loads its bytes through rT1 (DAD/LAD/LBD). DAD() overwrites bits
- *       24-31 with the byte value, the byte is read at offset 0.
- * rKP - key pointer. Round keys are read starting at 16(rKP); the 16 bytes
- *       at 0(rKP) are not read here. rKP is advanced by 32 per loop
- *       iteration and is not restored.
- * CTR - loop iteration count. Each iteration performs two rounds and the
- *       loop is followed by one more table round plus the byte based
- *       final round, i.e. 2 * CTR + 2 rounds in total.
- * On return rW0-rW3 hold the four words assembled from the final round
- * byte lookups and rD0-rD3 hold the four key words read from 32..44(rKP).
- * NOTE(review): "encryption key" above presumably means the key schedule
- * the caller passes in rKP for decryption - confirm against the caller.
- *
- */
- _GLOBAL(ppc_decrypt_block)
- LAH(rW0, rD1, 0, 12) /* preload the three lookups that each round's tail issues for the next round */
- LAH(rW6, rD0, 3, 0)
- LAH(rW3, rD0, 1, 8)
- ppc_decrypt_block_loop: /* two rounds per iteration */
- LAH(rW1, rD3, 0, 12) /* first round of the iteration: table word lookups */
- LAL(rW0, rD2, 0, 12)
- LAH(rW2, rD2, 1, 8)
- LAL(rW2, rD3, 1, 8)
- LAH(rW4, rD3, 2, 4)
- LAL(rW4, rD0, 2, 4)
- LAL(rW6, rD1, 3, 0)
- LAH(rW5, rD1, 2, 4)
- LAH(rW7, rD2, 3, 0)
- LAL(rW7, rD3, 3, 0)
- LAL(rW3, rD1, 1, 8)
- evldw rD1,16(rKP) /* 8 bytes of round key */
- EAD(rD0, 0) /* last two low word lookups are interleaved with the xors */
- evxor rW4,rW4,rW6
- LWL(rW1, 12)
- evxor rW0,rW0,rW4
- EAD(rD2, 2)
- evxor rW0,rW0,rW2
- LWL(rW5, 4)
- evxor rD1,rD1,rW0
- evldw rD3,24(rKP) /* next 8 bytes of the same round key */
- evmergehi rD0,rD0,rD1 /* rD0 low word = rD1 high word */
- EAD(rD1, 0) /* tail: start the three lookups for the next round (same as the preload) */
- evxor rW3,rW3,rW7
- LWH(rW0, 12)
- evxor rW3,rW3,rW1
- EAD(rD0, 3)
- evxor rD3,rD3,rW3
- LWH(rW6, 0)
- evxor rD3,rD3,rW5
- EAD(rD0, 1)
- evmergehi rD2,rD2,rD3 /* rD2 low word = rD3 high word */
- LWH(rW3, 8)
- LAH(rW1, rD3, 0, 12) /* second round of the iteration, same schedule */
- LAL(rW0, rD2, 0, 12)
- LAH(rW2, rD2, 1, 8)
- LAL(rW2, rD3, 1, 8)
- LAH(rW4, rD3, 2, 4)
- LAL(rW4, rD0, 2, 4)
- LAL(rW6, rD1, 3, 0)
- LAH(rW5, rD1, 2, 4)
- LAH(rW7, rD2, 3, 0)
- LAL(rW7, rD3, 3, 0)
- LAL(rW3, rD1, 1, 8)
- evldw rD1,32(rKP) /* round key of the second round */
- EAD(rD0, 0)
- evxor rW4,rW4,rW6
- LWL(rW1, 12)
- evxor rW0,rW0,rW4
- EAD(rD2, 2)
- evxor rW0,rW0,rW2
- LWL(rW5, 4)
- evxor rD1,rD1,rW0
- evldw rD3,40(rKP)
- evmergehi rD0,rD0,rD1
- EAD(rD1, 0)
- evxor rW3,rW3,rW7
- LWH(rW0, 12)
- evxor rW3,rW3,rW1
- EAD(rD0, 3)
- evxor rD3,rD3,rW3
- LWH(rW6, 0)
- evxor rD3,rD3,rW5
- EAD(rD0, 1)
- evmergehi rD2,rD2,rD3
- LWH(rW3, 8)
- addi rKP,rKP,32 /* step over the two round keys just used */
- bdnz ppc_decrypt_block_loop /* decrement CTR, loop while it is non-zero */
- LAH(rW1, rD3, 0, 12) /* last table driven round (outside the loop) */
- LAL(rW0, rD2, 0, 12)
- LAH(rW2, rD2, 1, 8)
- LAL(rW2, rD3, 1, 8)
- LAH(rW4, rD3, 2, 4)
- LAL(rW4, rD0, 2, 4)
- LAL(rW6, rD1, 3, 0)
- LAH(rW5, rD1, 2, 4)
- LAH(rW7, rD2, 3, 0)
- LAL(rW7, rD3, 3, 0)
- LAL(rW3, rD1, 1, 8)
- evldw rD1,16(rKP)
- EAD(rD0, 0)
- evxor rW4,rW4,rW6
- LWL(rW1, 12)
- evxor rW0,rW0,rW4
- EAD(rD2, 2)
- evxor rW0,rW0,rW2
- LWL(rW5, 4)
- evxor rD1,rD1,rW0
- evldw rD3,24(rKP)
- evmergehi rD0,rD0,rD1
- DAD(rD1, 0) /* final round starts here: byte loads through rT1 (LBD/LAD) */
- evxor rW3,rW3,rW7
- LBD(rW0)
- evxor rW3,rW3,rW1
- DAD(rD0, 1)
- evxor rD3,rD3,rW3
- LBD(rW6)
- evxor rD3,rD3,rW5
- DAD(rD0, 0)
- evmergehi rD2,rD2,rD3
- LBD(rW3)
- LAD(rW2, rD3, 0)
- LAD(rW1, rD2, 0)
- LAD(rW4, rD2, 1)
- LAD(rW5, rD3, 1)
- LAD(rW7, rD1, 1)
- rlwimi rW0,rW4,8,16,23 /* merge looked up byte into bits 16-23 */
- rlwimi rW1,rW5,8,16,23
- LAD(rW4, rD3, 2)
- LAD(rW5, rD0, 2)
- rlwimi rW2,rW6,8,16,23
- rlwimi rW3,rW7,8,16,23
- LAD(rW6, rD1, 2)
- LAD(rW7, rD2, 2)
- rlwimi rW0,rW4,16,8,15 /* merge looked up byte into bits 8-15 */
- rlwimi rW1,rW5,16,8,15
- LAD(rW4, rD0, 3) /* last read of rD0 as state, it is reloaded with key below */
- LAD(rW5, rD1, 3)
- rlwimi rW2,rW6,16,8,15
- lwz rD0,32(rKP) /* rD0-rD3 = last four key words for the caller's final xor */
- rlwimi rW3,rW7,16,8,15
- lwz rD1,36(rKP)
- LAD(rW6, rD2, 3)
- LAD(rW7, rD3, 3)
- rlwimi rW0,rW4,24,0,7 /* merge looked up byte into bits 0-7 */
- lwz rD2,40(rKP)
- rlwimi rW1,rW5,24,0,7
- lwz rD3,44(rKP)
- rlwimi rW2,rW6,24,0,7
- rlwimi rW3,rW7,24,0,7
- blr /* return: rW0-rW3 = substituted words, rD0-rD3 = key words */
|