123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300 |
- /*
- * Fast SHA-1 implementation for SPE instruction set (PPC)
- *
- * This code makes use of the SPE SIMD instruction set as defined in
- * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
- * Implementation is based on optimization guide notes from
- * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
- *
- * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- */
- #include <asm/ppc_asm.h>
- #include <asm/asm-offsets.h>
- #define rHP r3 /* pointer to hash value */
- #define rWP r4 /* pointer to input */
- #define rKP r5 /* pointer to constants */
- #define rW0 r14 /* 64 bit round words */
- #define rW1 r15
- #define rW2 r16
- #define rW3 r17
- #define rW4 r18
- #define rW5 r19
- #define rW6 r20
- #define rW7 r21
- #define rH0 r6 /* 32 bit hash values */
- #define rH1 r7
- #define rH2 r8
- #define rH3 r9
- #define rH4 r10
- #define rT0 r22 /* 64 bit temporary */
- #define rT1 r0 /* 32 bit temporaries */
- #define rT2 r11
- #define rT3 r12
- #define rK r23 /* 64 bit constant in volatile register */
- #define LOAD_K01
- #define LOAD_K11 \
- evlwwsplat rK,0(rKP);
- #define LOAD_K21 \
- evlwwsplat rK,4(rKP);
- #define LOAD_K31 \
- evlwwsplat rK,8(rKP);
- #define LOAD_K41 \
- evlwwsplat rK,12(rKP);
- #define INITIALIZE \
- stwu r1,-128(r1); /* create stack frame */ \
- evstdw r14,8(r1); /* We must save non volatile */ \
- evstdw r15,16(r1); /* registers. Take the chance */ \
- evstdw r16,24(r1); /* and save the SPE part too */ \
- evstdw r17,32(r1); \
- evstdw r18,40(r1); \
- evstdw r19,48(r1); \
- evstdw r20,56(r1); \
- evstdw r21,64(r1); \
- evstdw r22,72(r1); \
- evstdw r23,80(r1);
- #define FINALIZE \
- evldw r14,8(r1); /* restore SPE registers */ \
- evldw r15,16(r1); \
- evldw r16,24(r1); \
- evldw r17,32(r1); \
- evldw r18,40(r1); \
- evldw r19,48(r1); \
- evldw r20,56(r1); \
- evldw r21,64(r1); \
- evldw r22,72(r1); \
- evldw r23,80(r1); \
- xor r0,r0,r0; \
- stw r0,8(r1); /* Delete sensitive data */ \
- stw r0,16(r1); /* that we might have pushed */ \
- stw r0,24(r1); /* from other context that runs */ \
- stw r0,32(r1); /* the same code. Assume that */ \
- stw r0,40(r1); /* the lower part of the GPRs */ \
- stw r0,48(r1); /* were already overwritten on */ \
- stw r0,56(r1); /* the way down to here */ \
- stw r0,64(r1); \
- stw r0,72(r1); \
- stw r0,80(r1); \
- addi r1,r1,128; /* cleanup stack frame */
- #ifdef __BIG_ENDIAN__
- #define LOAD_DATA(reg, off) \
- lwz reg,off(rWP); /* load data */
- #define NEXT_BLOCK \
- addi rWP,rWP,64; /* increment per block */
- #else
- #define LOAD_DATA(reg, off) \
- lwbrx reg,0,rWP; /* load data */ \
- addi rWP,rWP,4; /* increment per word */
- #define NEXT_BLOCK /* nothing to do */
- #endif
- #define R_00_15(a, b, c, d, e, w0, w1, k, off) \
- LOAD_DATA(w0, off) /* 1: W */ \
- and rT2,b,c; /* 1: F' = B and C */ \
- LOAD_K##k##1 \
- andc rT1,d,b; /* 1: F" = ~B and D */ \
- rotrwi rT0,a,27; /* 1: A' = A rotl 5 */ \
- or rT2,rT2,rT1; /* 1: F = F' or F" */ \
- add e,e,rT0; /* 1: E = E + A' */ \
- rotrwi b,b,2; /* 1: B = B rotl 30 */ \
- add e,e,w0; /* 1: E = E + W */ \
- LOAD_DATA(w1, off+4) /* 2: W */ \
- add e,e,rT2; /* 1: E = E + F */ \
- and rT1,a,b; /* 2: F' = B and C */ \
- add e,e,rK; /* 1: E = E + K */ \
- andc rT2,c,a; /* 2: F" = ~B and D */ \
- add d,d,rK; /* 2: E = E + K */ \
- or rT2,rT2,rT1; /* 2: F = F' or F" */ \
- rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
- add d,d,w1; /* 2: E = E + W */ \
- rotrwi a,a,2; /* 2: B = B rotl 30 */ \
- add d,d,rT0; /* 2: E = E + A' */ \
- evmergelo w1,w1,w0; /* mix W[0]/W[1] */ \
- add d,d,rT2 /* 2: E = E + F */
- #define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
- and rT2,b,c; /* 1: F' = B and C */ \
- evmergelohi rT0,w7,w6; /* W[-3] */ \
- andc rT1,d,b; /* 1: F" = ~B and D */ \
- evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \
- or rT1,rT1,rT2; /* 1: F = F' or F" */ \
- evxor w0,w0,w4; /* W = W xor W[-8] */ \
- add e,e,rT1; /* 1: E = E + F */ \
- evxor w0,w0,w1; /* W = W xor W[-14] */ \
- rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \
- evrlwi w0,w0,1; /* W = W rotl 1 */ \
- add e,e,rT2; /* 1: E = E + A' */ \
- evaddw rT0,w0,rK; /* WK = W + K */ \
- rotrwi b,b,2; /* 1: B = B rotl 30 */ \
- LOAD_K##k##1 \
- evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \
- add e,e,rT0; /* 1: E = E + WK */ \
- add d,d,rT1; /* 2: E = E + WK */ \
- and rT2,a,b; /* 2: F' = B and C */ \
- andc rT1,c,a; /* 2: F" = ~B and D */ \
- rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
- or rT1,rT1,rT2; /* 2: F = F' or F" */ \
- add d,d,rT0; /* 2: E = E + A' */ \
- rotrwi a,a,2; /* 2: B = B rotl 30 */ \
- add d,d,rT1 /* 2: E = E + F */
- #define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
- evmergelohi rT0,w7,w6; /* W[-3] */ \
- xor rT2,b,c; /* 1: F' = B xor C */ \
- evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \
- xor rT2,rT2,d; /* 1: F = F' xor D */ \
- evxor w0,w0,w4; /* W = W xor W[-8] */ \
- add e,e,rT2; /* 1: E = E + F */ \
- evxor w0,w0,w1; /* W = W xor W[-14] */ \
- rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \
- evrlwi w0,w0,1; /* W = W rotl 1 */ \
- add e,e,rT2; /* 1: E = E + A' */ \
- evaddw rT0,w0,rK; /* WK = W + K */ \
- rotrwi b,b,2; /* 1: B = B rotl 30 */ \
- LOAD_K##k##1 \
- evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \
- add e,e,rT0; /* 1: E = E + WK */ \
- xor rT2,a,b; /* 2: F' = B xor C */ \
- add d,d,rT1; /* 2: E = E + WK */ \
- xor rT2,rT2,c; /* 2: F = F' xor D */ \
- rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
- add d,d,rT2; /* 2: E = E + F */ \
- rotrwi a,a,2; /* 2: B = B rotl 30 */ \
- add d,d,rT0 /* 2: E = E + A' */
- #define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
- and rT2,b,c; /* 1: F' = B and C */ \
- evmergelohi rT0,w7,w6; /* W[-3] */ \
- or rT1,b,c; /* 1: F" = B or C */ \
- evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \
- and rT1,d,rT1; /* 1: F" = F" and D */ \
- evxor w0,w0,w4; /* W = W xor W[-8] */ \
- or rT2,rT2,rT1; /* 1: F = F' or F" */ \
- evxor w0,w0,w1; /* W = W xor W[-14] */ \
- add e,e,rT2; /* 1: E = E + F */ \
- evrlwi w0,w0,1; /* W = W rotl 1 */ \
- rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \
- evaddw rT0,w0,rK; /* WK = W + K */ \
- add e,e,rT2; /* 1: E = E + A' */ \
- LOAD_K##k##1 \
- evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \
- rotrwi b,b,2; /* 1: B = B rotl 30 */ \
- add e,e,rT0; /* 1: E = E + WK */ \
- and rT2,a,b; /* 2: F' = B and C */ \
- or rT0,a,b; /* 2: F" = B or C */ \
- add d,d,rT1; /* 2: E = E + WK */ \
- and rT0,c,rT0; /* 2: F" = F" and D */ \
- rotrwi a,a,2; /* 2: B = B rotl 30 */ \
- or rT2,rT2,rT0; /* 2: F = F' or F" */ \
- rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
- add d,d,rT2; /* 2: E = E + F */ \
- add d,d,rT0 /* 2: E = E + A' */
- #define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
- R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k)
- _GLOBAL(ppc_spe_sha1_transform)
- INITIALIZE
- lwz rH0,0(rHP)
- lwz rH1,4(rHP)
- mtctr r5
- lwz rH2,8(rHP)
- lis rKP,PPC_SPE_SHA1_K@h
- lwz rH3,12(rHP)
- ori rKP,rKP,PPC_SPE_SHA1_K@l
- lwz rH4,16(rHP)
- ppc_spe_sha1_main:
- R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0)
- R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8)
- R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16)
- R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24)
- R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32)
- R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40)
- R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48)
- R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56)
- R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0)
- R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2)
- R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0)
- R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0)
- R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0)
- R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0)
- R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0)
- R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0)
- R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0)
- R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0)
- R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0)
- R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3)
- R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0)
- R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0)
- R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0)
- R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0)
- R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0)
- R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0)
- R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0)
- R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0)
- R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0)
- R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4)
- R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0)
- R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0)
- R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0)
- R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0)
- R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0)
- R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0)
- R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0)
- lwz rT3,0(rHP)
- R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0)
- lwz rW1,4(rHP)
- R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0)
- lwz rW2,8(rHP)
- R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0)
- lwz rW3,12(rHP)
- NEXT_BLOCK
- lwz rW4,16(rHP)
- add rH0,rH0,rT3
- stw rH0,0(rHP)
- add rH1,rH1,rW1
- stw rH1,4(rHP)
- add rH2,rH2,rW2
- stw rH2,8(rHP)
- add rH3,rH3,rW3
- stw rH3,12(rHP)
- add rH4,rH4,rW4
- stw rH4,16(rHP)
- bdnz ppc_spe_sha1_main
- FINALIZE
- blr
- .data
- .align 4
- PPC_SPE_SHA1_K:
- .long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6
|