# Copyright 2021- IBM Inc. All rights reserved
#
# This file is part of Libgcrypt.
#
# Libgcrypt is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation; either version 2.1 of
# the License, or (at your option) any later version.
#
# Libgcrypt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program; if not, see <http://www.gnu.org/licenses/>.
#
#===================================================================================
# Written by Danny Tsen <dtsen@us.ibm.com>
#
# Poly1305 - this version mainly uses vector/VSX/scalar instructions
#  - 26-bit limbs
#  - handles multiple 64-byte blocks, but needs at least two 64-byte blocks
#
# Performance is improved by breaking the polynomial down into a sum of
# products:
#     h4 = m1 * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
#
# 07/22/21 - this revision is based on the above sum of products.  Set up
#            r^4, r^3, r^2, r and s3, s2, s1, s0 in 9 vectors for the
#            multiplications.
#
# set up the r^4, r^3, r^2, r vectors
#   vs  [r^1, r^3, r^2, r^4]
#   vs0 = [r0,.....]
#   vs1 = [r1,.....]
#   vs2 = [r2,.....]
#   vs3 = [r3,.....]
#   vs4 = [r4,.....]
#   vs5 = [r1*5,...]
#   vs6 = [r2*5,...]
#   vs7 = [r3*5,...]
#   vs8 = [r4*5,...]
#
# Each word in a vector holds one "r/s" member of [a * r/s].
#
# r0, r4*5, r3*5, r2*5, r1*5;
# r1, r0,   r4*5, r3*5, r2*5;
# r2, r1,   r0,   r4*5, r3*5;
# r3, r2,   r1,   r0,   r4*5;
# r4, r3,   r2,   r1,   r0  ;
#
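# As a rough scalar picture of the batching (a hedged sketch, not part of
# this file; addmod()/mulmod() are hypothetical helpers for arithmetic
# mod 2^130 - 5):
#
#   /* h' = (h + m1) r^4 + m2 r^3 + m3 r^2 + m4 r */
#   h = addmod(addmod(addmod(
#           mulmod(addmod(h, m1), rp4),   /* rp4 = r^4 */
#           mulmod(m2, rp3)),             /* rp3 = r^3 */
#           mulmod(m3, rp2)),             /* rp2 = r^2 */
#           mulmod(m4, rp1));             /* rp1 = r   */
#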
# gcry_poly1305_p10le_4blocks( uint8_t *k, uint8_t *m, size_t mlen )
#   k = 32-byte key
#   r3 = k (r, s)
#   r4 = m
#   r5 = mlen (at least 128 bytes)
#
.text
# Block size 16 bytes
# key = (r, s)
# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF
# p = 2^130 - 5
# a += m
# a = (a * r) % p
# a += s
# 16 bytes (a)
#
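# For reference, a hedged C-style sketch of the scalar algorithm above
# (uint130 and load_le128() are hypothetical stand-ins for wide-integer
# arithmetic and a little-endian 128-bit load):
#
#   r = load_le128(k) & clamp;                        /* clamp as above */
#   for (each 16-byte block m) {
#       a = a + load_le128(m) + ((uint130)1 << 128);  /* add pad bit    */
#       a = (a * r) % (((uint130)1 << 130) - 5);      /* mul + reduce   */
#   }
#   a = a + load_le128(k + 16);                       /* a += s         */
#   /* the tag is the low 16 bytes of a */
#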
# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5;
# p[1] = a0*r1 + a1*r0   + a2*r4*5 + a3*r3*5 + a4*r2*5;
# p[2] = a0*r2 + a1*r1   + a2*r0   + a3*r4*5 + a4*r3*5;
# p[3] = a0*r3 + a1*r2   + a2*r1   + a3*r0   + a4*r4*5;
# p[4] = a0*r4 + a1*r3   + a2*r2   + a3*r1   + a4*r0  ;
#
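# The r*5 terms fold the limb overflow past 2^130 back in, since
# 2^130 == 5 (mod 2^130 - 5).  A hedged C sketch of the same five dot
# products (names illustrative; s[i] = r[i+1] * 5 precomputed):
#
#   static void mul_limbs(uint64_t p[5], const uint64_t a[5],
#                         const uint64_t r[5], const uint64_t s[4])
#   {
#       /* 26-bit limbs: the five 64-bit dot products cannot overflow */
#       p[0] = a[0]*r[0] + a[1]*s[3] + a[2]*s[2] + a[3]*s[1] + a[4]*s[0];
#       p[1] = a[0]*r[1] + a[1]*r[0] + a[2]*s[3] + a[3]*s[2] + a[4]*s[1];
#       p[2] = a[0]*r[2] + a[1]*r[1] + a[2]*r[0] + a[3]*s[3] + a[4]*s[2];
#       p[3] = a[0]*r[3] + a[1]*r[2] + a[2]*r[1] + a[3]*r[0] + a[4]*s[3];
#       p[4] = a[0]*r[4] + a[1]*r[3] + a[2]*r[2] + a[3]*r[1] + a[4]*r[0];
#   }
#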
# [r^2, r^3, r^1, r^4]
# [m3,  m2,  m4,  m1]
#
# multiply odd and even words
.macro mul_odd
vmulouw 14, 4, 26
vmulouw 10, 5, 3
vmulouw 11, 6, 2
vmulouw 12, 7, 1
vmulouw 13, 8, 0
vmulouw 15, 4, 27
vaddudm 14, 14, 10
vaddudm 14, 14, 11
vmulouw 10, 5, 26
vmulouw 11, 6, 3
vaddudm 14, 14, 12
vaddudm 14, 14, 13 # x0
vaddudm 15, 15, 10
vaddudm 15, 15, 11
vmulouw 12, 7, 2
vmulouw 13, 8, 1
vaddudm 15, 15, 12
vaddudm 15, 15, 13 # x1
vmulouw 16, 4, 28
vmulouw 10, 5, 27
vmulouw 11, 6, 26
vaddudm 16, 16, 10
vaddudm 16, 16, 11
vmulouw 12, 7, 3
vmulouw 13, 8, 2
vaddudm 16, 16, 12
vaddudm 16, 16, 13 # x2
vmulouw 17, 4, 29
vmulouw 10, 5, 28
vmulouw 11, 6, 27
vaddudm 17, 17, 10
vaddudm 17, 17, 11
vmulouw 12, 7, 26
vmulouw 13, 8, 3
vaddudm 17, 17, 12
vaddudm 17, 17, 13 # x3
vmulouw 18, 4, 30
vmulouw 10, 5, 29
vmulouw 11, 6, 28
vaddudm 18, 18, 10
vaddudm 18, 18, 11
vmulouw 12, 7, 27
vmulouw 13, 8, 26
vaddudm 18, 18, 12
vaddudm 18, 18, 13 # x4
.endm
.macro mul_even
vmuleuw 9, 4, 26
vmuleuw 10, 5, 3
vmuleuw 11, 6, 2
vmuleuw 12, 7, 1
vmuleuw 13, 8, 0
vaddudm 14, 14, 9
vaddudm 14, 14, 10
vaddudm 14, 14, 11
vaddudm 14, 14, 12
vaddudm 14, 14, 13 # x0
vmuleuw 9, 4, 27
vmuleuw 10, 5, 26
vmuleuw 11, 6, 3
vmuleuw 12, 7, 2
vmuleuw 13, 8, 1
vaddudm 15, 15, 9
vaddudm 15, 15, 10
vaddudm 15, 15, 11
vaddudm 15, 15, 12
vaddudm 15, 15, 13 # x1
vmuleuw 9, 4, 28
vmuleuw 10, 5, 27
vmuleuw 11, 6, 26
vmuleuw 12, 7, 3
vmuleuw 13, 8, 2
vaddudm 16, 16, 9
vaddudm 16, 16, 10
vaddudm 16, 16, 11
vaddudm 16, 16, 12
vaddudm 16, 16, 13 # x2
vmuleuw 9, 4, 29
vmuleuw 10, 5, 28
vmuleuw 11, 6, 27
vmuleuw 12, 7, 26
vmuleuw 13, 8, 3
vaddudm 17, 17, 9
vaddudm 17, 17, 10
vaddudm 17, 17, 11
vaddudm 17, 17, 12
vaddudm 17, 17, 13 # x3
vmuleuw 9, 4, 30
vmuleuw 10, 5, 29
vmuleuw 11, 6, 28
vmuleuw 12, 7, 27
vmuleuw 13, 8, 26
vaddudm 18, 18, 9
vaddudm 18, 18, 10
vaddudm 18, 18, 11
vaddudm 18, 18, 12
vaddudm 18, 18, 13 # x4
.endm
# set up the r^4, r^3, r^2, r vectors
#   [r, r^3, r^2, r^4]
#   vs0 = [r0,...]
#   vs1 = [r1,...]
#   vs2 = [r2,...]
#   vs3 = [r3,...]
#   vs4 = [r4,...]
#   vs5 = [r1*5,...]
#   vs6 = [r2*5,...]
#   vs7 = [r3*5,...]
#   vs8 = [r4*5,...]
#
# r0, r4*5, r3*5, r2*5, r1*5;
# r1, r0,   r4*5, r3*5, r2*5;
# r2, r1,   r0,   r4*5, r3*5;
# r3, r2,   r1,   r0,   r4*5;
# r4, r3,   r2,   r1,   r0  ;
#
.macro poly1305_setup_r
# save r
xxlor 26, 58, 58
xxlor 27, 59, 59
xxlor 28, 60, 60
xxlor 29, 61, 61
xxlor 30, 62, 62
xxlxor 31, 31, 31
# [r, r^3, r^2, r^4]
# compute r^2
vmr 4, 26
vmr 5, 27
vmr 6, 28
vmr 7, 29
vmr 8, 30
bl do_mul # r^2 r^1
xxpermdi 58, 58, 36, 0x3 # r0
xxpermdi 59, 59, 37, 0x3 # r1
xxpermdi 60, 60, 38, 0x3 # r2
xxpermdi 61, 61, 39, 0x3 # r3
xxpermdi 62, 62, 40, 0x3 # r4
xxpermdi 36, 36, 36, 0x3
xxpermdi 37, 37, 37, 0x3
xxpermdi 38, 38, 38, 0x3
xxpermdi 39, 39, 39, 0x3
xxpermdi 40, 40, 40, 0x3
vspltisb 13, 2
vsld 9, 27, 13
vsld 10, 28, 13
vsld 11, 29, 13
vsld 12, 30, 13
vaddudm 0, 9, 27
vaddudm 1, 10, 28
vaddudm 2, 11, 29
vaddudm 3, 12, 30
bl do_mul # r^4 r^3
vmrgow 26, 26, 4
vmrgow 27, 27, 5
vmrgow 28, 28, 6
vmrgow 29, 29, 7
vmrgow 30, 30, 8
vspltisb 13, 2
vsld 9, 27, 13
vsld 10, 28, 13
vsld 11, 29, 13
vsld 12, 30, 13
vaddudm 0, 9, 27
vaddudm 1, 10, 28
vaddudm 2, 11, 29
vaddudm 3, 12, 30
# r^2 r^4
xxlor 0, 58, 58
xxlor 1, 59, 59
xxlor 2, 60, 60
xxlor 3, 61, 61
xxlor 4, 62, 62
xxlor 5, 32, 32
xxlor 6, 33, 33
xxlor 7, 34, 34
xxlor 8, 35, 35
vspltw 9, 26, 3
vspltw 10, 26, 2
vmrgow 26, 10, 9
vspltw 9, 27, 3
vspltw 10, 27, 2
vmrgow 27, 10, 9
vspltw 9, 28, 3
vspltw 10, 28, 2
vmrgow 28, 10, 9
vspltw 9, 29, 3
vspltw 10, 29, 2
vmrgow 29, 10, 9
vspltw 9, 30, 3
vspltw 10, 30, 2
vmrgow 30, 10, 9
vsld 9, 27, 13
vsld 10, 28, 13
vsld 11, 29, 13
vsld 12, 30, 13
vaddudm 0, 9, 27
vaddudm 1, 10, 28
vaddudm 2, 11, 29
vaddudm 3, 12, 30
.endm
do_mul:
mul_odd
# do reduction ( h %= p )
# carry reduction
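# A hedged C sketch of the interleaved carry chain below (mask = 2^26-1;
# x0..x4 are the 64-bit products, h0..h4 the reduced 26-bit limbs):
#
#   c0 = x0 >> 26;  h0 = x0 & mask;   c3 = x3 >> 26;  h3 = x3 & mask;
#   x4 += c3;       c4 = x4 >> 26;    h4 = x4 & mask;
#   x1 += c0;       c1 = x1 >> 26;    h1 = x1 & mask;
#   h0 += c4 * 5;                     /* 2^130 == 5 (mod 2^130 - 5) */
#   x2 += c1;       c2 = x2 >> 26;    h2 = x2 & mask;
#   c0 = h0 >> 26;  h0 &= mask;       h1 += c0;
#   h3 += c2;       c3 = h3 >> 26;    h3 &= mask;     h4 += c3;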
vspltisb 9, 2
vsrd 10, 14, 31
vsrd 11, 17, 31
vand 7, 17, 25
vand 4, 14, 25
vaddudm 18, 18, 11
vsrd 12, 18, 31
vaddudm 15, 15, 10
vsrd 11, 15, 31
vand 8, 18, 25
vand 5, 15, 25
vaddudm 4, 4, 12
vsld 10, 12, 9
vaddudm 6, 16, 11
vsrd 13, 6, 31
vand 6, 6, 25
vaddudm 4, 4, 10
vsrd 10, 4, 31
vaddudm 7, 7, 13
vsrd 11, 7, 31
vand 7, 7, 25
vand 4, 4, 25
vaddudm 5, 5, 10
vaddudm 8, 8, 11
blr
#
# init key
#
do_poly1305_init:
ld 10, rmask@got(2)
ld 11, 0(10)
ld 12, 8(10)
li 14, 16
li 15, 32
ld 10, cnum@got(2)
lvx 25, 0, 10 # v25 - mask
lvx 31, 14, 10 # v31 = 1a
lvx 19, 15, 10 # v19 = 1 << 24
lxv 24, 48(10) # vs24
lxv 25, 64(10) # vs25
# initialize
# load key from r3 to vectors
ld 9, 16(3)
ld 10, 24(3)
ld 11, 0(3)
ld 12, 8(3)
# break 26 bits
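# A hedged C sketch of the extrdi/insrdi split below: lo/hi are the two
# little-endian 64-bit halves of the 128-bit value being broken into
# five 26-bit limbs:
#
#   r0 = lo & 0x3ffffff;                          /* bits   0..25  */
#   r1 = (lo >> 26) & 0x3ffffff;                  /* bits  26..51  */
#   r2 = ((lo >> 52) | (hi << 12)) & 0x3ffffff;   /* bits  52..77  */
#   r3 = (hi >> 14) & 0x3ffffff;                  /* bits  78..103 */
#   r4 = hi >> 40;                                /* bits 104..127 */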
extrdi 14, 9, 26, 38
extrdi 15, 9, 26, 12
extrdi 16, 9, 12, 0
mtvsrdd 58, 0, 14
insrdi 16, 10, 14, 38
mtvsrdd 59, 0, 15
extrdi 17, 10, 26, 24
mtvsrdd 60, 0, 16
extrdi 18, 10, 24, 0
mtvsrdd 61, 0, 17
mtvsrdd 62, 0, 18
# r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5
li 9, 5
mtvsrdd 36, 0, 9
vmulouw 0, 27, 4 # v0 = rr0
vmulouw 1, 28, 4 # v1 = rr1
vmulouw 2, 29, 4 # v2 = rr2
vmulouw 3, 30, 4 # v3 = rr3
blr
#
# gcry_poly1305_p10le_4blocks( uint8_t *k, uint8_t *m, size_t mlen )
#   k = 32-byte key
#   r3 = k (r, s)
#   r4 = m
#   r5 = mlen (at least 128 bytes)
#
.global gcry_poly1305_p10le_4blocks
.align 5
gcry_poly1305_p10le_4blocks:
_gcry_poly1305_p10le_4blocks:
cmpdi 5, 128
blt Out_no_poly1305
stdu 1,-1024(1)
mflr 0
std 14,112(1)
std 15,120(1)
std 16,128(1)
std 17,136(1)
std 18,144(1)
std 19,152(1)
std 20,160(1)
std 21,168(1)
std 31,248(1)
li 14, 256
stvx 20, 14, 1
addi 14, 14, 16
stvx 21, 14, 1
addi 14, 14, 16
stvx 22, 14, 1
addi 14, 14, 16
stvx 23, 14, 1
addi 14, 14, 16
stvx 24, 14, 1
addi 14, 14, 16
stvx 25, 14, 1
addi 14, 14, 16
stvx 26, 14, 1
addi 14, 14, 16
stvx 27, 14, 1
addi 14, 14, 16
stvx 28, 14, 1
addi 14, 14, 16
stvx 29, 14, 1
addi 14, 14, 16
stvx 30, 14, 1
addi 14, 14, 16
stvx 31, 14, 1
addi 14, 14, 16
stxvx 14, 14, 1
addi 14, 14, 16
stxvx 15, 14, 1
addi 14, 14, 16
stxvx 16, 14, 1
addi 14, 14, 16
stxvx 17, 14, 1
addi 14, 14, 16
stxvx 18, 14, 1
addi 14, 14, 16
stxvx 19, 14, 1
addi 14, 14, 16
stxvx 20, 14, 1
addi 14, 14, 16
stxvx 21, 14, 1
addi 14, 14, 16
stxvx 22, 14, 1
addi 14, 14, 16
stxvx 23, 14, 1
addi 14, 14, 16
stxvx 24, 14, 1
addi 14, 14, 16
stxvx 25, 14, 1
addi 14, 14, 16
stxvx 26, 14, 1
addi 14, 14, 16
stxvx 27, 14, 1
addi 14, 14, 16
stxvx 28, 14, 1
addi 14, 14, 16
stxvx 29, 14, 1
addi 14, 14, 16
stxvx 30, 14, 1
addi 14, 14, 16
stxvx 31, 14, 1
std 0, 1040(1)
bl do_poly1305_init
li 21, 0 # counter to message
poly1305_setup_r
# load previous state
# break/convert r6 to 26 bits
ld 9, 32(3)
ld 10, 40(3)
lwz 19, 48(3)
sldi 19, 19, 24
mtvsrdd 41, 0, 19
extrdi 14, 9, 26, 38
extrdi 15, 9, 26, 12
extrdi 16, 9, 12, 0
mtvsrdd 36, 0, 14
insrdi 16, 10, 14, 38
mtvsrdd 37, 0, 15
extrdi 17, 10, 26, 24
mtvsrdd 38, 0, 16
extrdi 18, 10, 24, 0
mtvsrdd 39, 0, 17
mtvsrdd 40, 0, 18
vor 8, 8, 9
# input m1 m2
add 20, 4, 21
xxlor 49, 24, 24
xxlor 50, 25, 25
lxvw4x 43, 0, 20
addi 17, 20, 16
lxvw4x 44, 0, 17
vperm 14, 11, 12, 17
vperm 15, 11, 12, 18
vand 9, 14, 25 # a0
vsrd 10, 14, 31 # >> 26
vsrd 11, 10, 31 # 12 bits left
vand 10, 10, 25 # a1
vspltisb 13, 12
vand 16, 15, 25
vsld 12, 16, 13
vor 11, 11, 12
vand 11, 11, 25 # a2
vspltisb 13, 14
vsrd 12, 15, 13 # >> 14
vsrd 13, 12, 31 # >> 26, a4
vand 12, 12, 25 # a3
vaddudm 20, 4, 9
vaddudm 21, 5, 10
vaddudm 22, 6, 11
vaddudm 23, 7, 12
vaddudm 24, 8, 13
# m3 m4
addi 17, 17, 16
lxvw4x 43, 0, 17
addi 17, 17, 16
lxvw4x 44, 0, 17
vperm 14, 11, 12, 17
vperm 15, 11, 12, 18
vand 9, 14, 25 # a0
vsrd 10, 14, 31 # >> 26
vsrd 11, 10, 31 # 12 bits left
vand 10, 10, 25 # a1
vspltisb 13, 12
vand 16, 15, 25
vsld 12, 16, 13
vspltisb 13, 14
vor 11, 11, 12
vand 11, 11, 25 # a2
vsrd 12, 15, 13 # >> 14
vsrd 13, 12, 31 # >> 26, a4
vand 12, 12, 25 # a3
# Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
vmrgow 4, 9, 20
vmrgow 5, 10, 21
vmrgow 6, 11, 22
vmrgow 7, 12, 23
vmrgow 8, 13, 24
vaddudm 8, 8, 19
addi 5, 5, -64
addi 21, 21, 64
li 9, 64
divdu 31, 5, 9
mtctr 31
# h4 = m1 * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
# Rewrite the polynomial sum of products as follows:
# h1 = (h0 + m1) * r^2,  h2 = (h0 + m2) * r^2
# h3 = (h1 + m3) * r^2,  h4 = (h2 + m4) * r^2  --> (h0 + m1) r^4 + (h1 + m3) r^2,  (h0 + m2) r^4 + (h2 + m4) r^2
#  .... Repeat
# h5 = (h3 + m5) * r^2,  h6 = (h4 + m6) * r^2  -->
# h7 = (h5 + m7) * r^2,  h8 = (h6 + m8) * r^1  --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r
#
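# A hedged scalar sketch of this interleave (addmod()/mulmod() are
# hypothetical helpers for arithmetic mod 2^130 - 5; r2 = r^2).  Two
# accumulator chains each advance by r^2 per step and are merged by a
# final multiply with [r^2, r], which is why at least two 64-byte
# chunks are required:
#
#   hA = addmod(h, m[0]);  hB = m[1];
#   for (i = 2; i + 1 < nblocks; i += 2) {
#       hA = addmod(mulmod(hA, r2), m[i]);
#       hB = addmod(mulmod(hB, r2), m[i + 1]);
#   }
#   h = addmod(mulmod(hA, r2), mulmod(hB, r));
#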
loop_4blocks:
# Multiply odd words and even words
mul_odd
mul_even
# carry reduction
vspltisb 9, 2
vsrd 10, 14, 31
vsrd 11, 17, 31
vand 7, 17, 25
vand 4, 14, 25
vaddudm 18, 18, 11
vsrd 12, 18, 31
vaddudm 15, 15, 10
vsrd 11, 15, 31
vand 8, 18, 25
vand 5, 15, 25
vaddudm 4, 4, 12
vsld 10, 12, 9
vaddudm 6, 16, 11
vsrd 13, 6, 31
vand 6, 6, 25
vaddudm 4, 4, 10
vsrd 10, 4, 31
vaddudm 7, 7, 13
vsrd 11, 7, 31
vand 7, 7, 25
vand 4, 4, 25
vaddudm 5, 5, 10
vaddudm 8, 8, 11
# input m1 m2 m3 m4
add 20, 4, 21
xxlor 49, 24, 24
xxlor 50, 25, 25
lxvw4x 43, 0, 20
addi 17, 20, 16
lxvw4x 44, 0, 17
vperm 14, 11, 12, 17
vperm 15, 11, 12, 18
addi 17, 17, 16
lxvw4x 43, 0, 17
addi 17, 17, 16
lxvw4x 44, 0, 17
vperm 17, 11, 12, 17
vperm 18, 11, 12, 18
vand 20, 14, 25 # a0
vand 9, 17, 25 # a0
vsrd 21, 14, 31 # >> 26
vsrd 22, 21, 31 # 12 bits left
vsrd 10, 17, 31 # >> 26
vsrd 11, 10, 31 # 12 bits left
vand 21, 21, 25 # a1
vand 10, 10, 25 # a1
vspltisb 13, 12
vand 16, 15, 25
vsld 23, 16, 13
vor 22, 22, 23
vand 22, 22, 25 # a2
vand 16, 18, 25
vsld 12, 16, 13
vor 11, 11, 12
vand 11, 11, 25 # a2
vspltisb 13, 14
vsrd 23, 15, 13 # >> 14
vsrd 24, 23, 31 # >> 26, a4
vand 23, 23, 25 # a3
vsrd 12, 18, 13 # >> 14
vsrd 13, 12, 31 # >> 26, a4
vand 12, 12, 25 # a3
vaddudm 4, 4, 20
vaddudm 5, 5, 21
vaddudm 6, 6, 22
vaddudm 7, 7, 23
vaddudm 8, 8, 24
# Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
vmrgow 4, 9, 4
vmrgow 5, 10, 5
vmrgow 6, 11, 6
vmrgow 7, 12, 7
vmrgow 8, 13, 8
vaddudm 8, 8, 19
addi 5, 5, -64
addi 21, 21, 64
bdnz loop_4blocks
xxlor 58, 0, 0
xxlor 59, 1, 1
xxlor 60, 2, 2
xxlor 61, 3, 3
xxlor 62, 4, 4
xxlor 32, 5, 5
xxlor 33, 6, 6
xxlor 34, 7, 7
xxlor 35, 8, 8
# Multiply odd words and even words
mul_odd
mul_even
# Sum the products.
xxpermdi 41, 31, 46, 0
xxpermdi 42, 31, 47, 0
vaddudm 4, 14, 9
xxpermdi 36, 31, 36, 3
vaddudm 5, 15, 10
xxpermdi 37, 31, 37, 3
xxpermdi 43, 31, 48, 0
vaddudm 6, 16, 11
xxpermdi 38, 31, 38, 3
xxpermdi 44, 31, 49, 0
vaddudm 7, 17, 12
xxpermdi 39, 31, 39, 3
xxpermdi 45, 31, 50, 0
vaddudm 8, 18, 13
xxpermdi 40, 31, 40, 3
# carry reduction
vspltisb 9, 2
vsrd 10, 4, 31
vsrd 11, 7, 31
vand 7, 7, 25
vand 4, 4, 25
vaddudm 8, 8, 11
vsrd 12, 8, 31
vaddudm 5, 5, 10
vsrd 11, 5, 31
vand 8, 8, 25
vand 5, 5, 25
vaddudm 4, 4, 12
vsld 10, 12, 9
vaddudm 6, 6, 11
vsrd 13, 6, 31
vand 6, 6, 25
vaddudm 4, 4, 10
vsrd 10, 4, 31
vaddudm 7, 7, 13
vsrd 11, 7, 31
vand 7, 7, 25
vand 4, 4, 25
vaddudm 5, 5, 10
vaddudm 8, 8, 11
b do_final_update
do_final_update:
# v4, v5, v6, v7 and v8 are the 26-bit limb vectors of the state
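# A hedged C sketch of the repacking below (h0..h4 are the 26-bit limbs,
# d0/d1 the two 64-bit state words, 'top' the remaining high bits):
#
#   d0  = h0 | (h1 << 26) | (h2 << 52);           /* bits   0..63  */
#   d1  = (h2 >> 12) | (h3 << 14) | (h4 << 40);   /* bits  64..127 */
#   top = h4 >> 24;                               /* bits 128..129 */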
vsld 5, 5, 31
vor 20, 4, 5
vspltisb 11, 12
vsrd 12, 6, 11
vsld 6, 6, 31
vsld 6, 6, 31
vor 20, 20, 6
vspltisb 11, 14
vsld 7, 7, 11
vor 21, 7, 12
mfvsrld 16, 40 # save last 2 bytes
vsld 8, 8, 11
vsld 8, 8, 31
vor 21, 21, 8
mfvsrld 17, 52
mfvsrld 19, 53
srdi 16, 16, 24
std 17, 32(3)
std 19, 40(3)
stw 16, 48(3)
Out_loop:
li 3, 0
li 14, 256
lvx 20, 14, 1
addi 14, 14, 16
lvx 21, 14, 1
addi 14, 14, 16
lvx 22, 14, 1
addi 14, 14, 16
lvx 23, 14, 1
addi 14, 14, 16
lvx 24, 14, 1
addi 14, 14, 16
lvx 25, 14, 1
addi 14, 14, 16
lvx 26, 14, 1
addi 14, 14, 16
lvx 27, 14, 1
addi 14, 14, 16
lvx 28, 14, 1
addi 14, 14, 16
lvx 29, 14, 1
addi 14, 14, 16
lvx 30, 14, 1
addi 14, 14, 16
lvx 31, 14, 1
addi 14, 14, 16
lxvx 14, 14, 1
addi 14, 14, 16
lxvx 15, 14, 1
addi 14, 14, 16
lxvx 16, 14, 1
addi 14, 14, 16
lxvx 17, 14, 1
addi 14, 14, 16
lxvx 18, 14, 1
addi 14, 14, 16
lxvx 19, 14, 1
addi 14, 14, 16
lxvx 20, 14, 1
addi 14, 14, 16
lxvx 21, 14, 1
addi 14, 14, 16
lxvx 22, 14, 1
addi 14, 14, 16
lxvx 23, 14, 1
addi 14, 14, 16
lxvx 24, 14, 1
addi 14, 14, 16
lxvx 25, 14, 1
addi 14, 14, 16
lxvx 26, 14, 1
addi 14, 14, 16
lxvx 27, 14, 1
addi 14, 14, 16
lxvx 28, 14, 1
addi 14, 14, 16
lxvx 29, 14, 1
addi 14, 14, 16
lxvx 30, 14, 1
addi 14, 14, 16
lxvx 31, 14, 1
ld 0, 1040(1)
ld 14,112(1)
ld 15,120(1)
ld 16,128(1)
ld 17,136(1)
ld 18,144(1)
ld 19,152(1)
ld 20,160(1)
ld 21,168(1)
ld 31,248(1)
mtlr 0
addi 1, 1, 1024
blr
Out_no_poly1305:
li 3, 0
blr
.data
.align 5
rmask:
.byte 0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f
cnum:
.long 0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000
.long 0x1a, 0x00, 0x1a, 0x00
.long 0x01000000, 0x01000000, 0x01000000, 0x01000000
.long 0x00010203, 0x04050607, 0x10111213, 0x14151617
.long 0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f
.long 0x05, 0x00, 0x00, 0x00
.long 0x02020202, 0x02020202, 0x02020202, 0x02020202
.long 0xffffffff, 0xffffffff, 0x00000000, 0x00000000