ghashp8-ppc.pl 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244
  1. #!/usr/bin/env perl
  2. # SPDX-License-Identifier: GPL-2.0
  3. # This code is taken from the OpenSSL project but the author (Andy Polyakov)
  4. # has relicensed it under the GPLv2. Therefore this program is free software;
  5. # you can redistribute it and/or modify it under the terms of the GNU General
  6. # Public License version 2 as published by the Free Software Foundation.
  7. #
  8. # The original headers, including the original license headers, are
  9. # included below for completeness.
  10. # ====================================================================
  11. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12. # project. The module is, however, dual licensed under OpenSSL and
  13. # CRYPTOGAMS licenses depending on where you obtain it. For further
  14. # details see http://www.openssl.org/~appro/cryptogams/.
  15. # ====================================================================
  16. #
  17. # GHASH for PowerISA v2.07.
  18. #
  19. # July 2014
  20. #
  21. # Accurate performance measurements are problematic, because it's
  22. # always virtualized setup with possibly throttled processor.
  23. # Relative comparison is therefore more informative. This initial
  24. # version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
  25. # faster than "4-bit" integer-only compiler-generated 64-bit code.
  26. # "Initial version" means that there is room for further improvement.
  # Usage: ghashp8-ppc.pl <flavour> [<output>]
#
# Generates PowerPC assembly for GHASH and pipes it through ppc-xlate.pl.
# Both arguments are forwarded verbatim to ppc-xlate.pl; <flavour> is also
# inspected here for word size ("64"/"32") and endianness (trailing "le").
$flavour = shift;
$output  = shift;

# Word size and the matching load/store mnemonics.  "64" is tested before
# "32"; a flavour containing neither is rejected.  These variables are not
# referenced by the assembly template below.
if ($flavour =~ /64/) {
	$SIZE_T	= 8;
	$LRSAVE	= 2*$SIZE_T;
	$STU	= "stdu";
	$POP	= "ld";
	$PUSH	= "std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	= 4;
	$LRSAVE	= $SIZE_T;
	$STU	= "stwu";
	$POP	= "lwz";
	$PUSH	= "stw";
} else {
	die "nonsense $flavour";
}

# Locate ppc-xlate.pl next to this script, or under ../../perlasm/ relative
# to it.  When $0 has no directory component the prefix is the empty string
# (set explicitly rather than relying on a stale $1 from an earlier match).
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate ) or
die "can't locate ppc-xlate.pl";

# Everything printed from here on goes to the translator.  The low-precedence
# "or" is required: with "||" the die would attach to the (always true)
# command string and a failed open would pass silently.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

# Symbolic register names interpolated into the template.
my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block, r3..r6
my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));		# v0..v3
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));	# v4..v12
my $vrsave="r12";	# holds the caller's SPR 256 value while a routine runs

# Assembly template.  Lines prefixed "le?" (or "be?") are endian-specific and
# are enabled or commented out by the filter loop at the end of this script.
# Each routine saves SPR 256 in $vrsave, installs its own mask, and restores
# the saved value before returning.
#
#   .gcm_init_p8   r3 = table to fill, r4 = address of H.  Loads H, derives
#                  the "twisted" H and stores four vectors at r3+0x00 (the
#                  0xc2 reduction constant), +0x10 (Hl), +0x20 (H) and
#                  +0x30 (Hh).
#   .gcm_gmult_p8  $Xip = Xi, $Htbl = table written by .gcm_init_p8.
#                  Multiplies Xi by H with vpmsumd, reduces in two phases
#                  and writes the result back to Xi.
#   .gcm_ghash_p8  As above, plus $inp/$len: folds the input into Xi,
#                  16 bytes per Loop iteration.
#                  NOTE(review): the first block is loaded before any length
#                  check, so $len is presumably a non-zero multiple of 16 --
#                  confirm against the callers.
$code=<<___;
.machine "any"
.text
.globl .gcm_init_p8
	lis r0,0xfff0
	li r8,0x10
	mfspr $vrsave,256
	li r9,0x20
	mtspr 256,r0
	li r10,0x30
	lvx_u $H,0,r4 # load H
	le?xor r7,r7,r7
	le?addi r7,r7,0x8 # need a vperm start with 08
	le?lvsr 5,0,r7
	le?vspltisb 6,0x0f
	le?vxor 5,5,6 # set a b-endian mask
	le?vperm $H,$H,$H,5
	vspltisb $xC2,-16 # 0xf0
	vspltisb $t0,1 # one
	vaddubm $xC2,$xC2,$xC2 # 0xe0
	vxor $zero,$zero,$zero
	vor $xC2,$xC2,$t0 # 0xe1
	vsldoi $xC2,$xC2,$zero,15 # 0xe1...
	vsldoi $t1,$zero,$t0,1 # ...1
	vaddubm $xC2,$xC2,$xC2 # 0xc2...
	vspltisb $t2,7
	vor $xC2,$xC2,$t1 # 0xc2....01
	vspltb $t1,$H,0 # most significant byte
	vsl $H,$H,$t0 # H<<=1
	vsrab $t1,$t1,$t2 # broadcast carry bit
	vand $t1,$t1,$xC2
	vxor $H,$H,$t1 # twisted H
	vsldoi $H,$H,$H,8 # twist even more ...
	vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
	vsldoi $Hl,$zero,$H,8 # ... and split
	vsldoi $Hh,$H,$zero,8
	stvx_u $xC2,0,r3 # save pre-computed table
	stvx_u $Hl,r8,r3
	stvx_u $H, r9,r3
	stvx_u $Hh,r10,r3
	mtspr 256,$vrsave
	blr
	.long 0
	.byte 0,12,0x14,0,0,0,2,0
	.long 0
.size .gcm_init_p8,.-.gcm_init_p8
.globl .gcm_gmult_p8
	lis r0,0xfff8
	li r8,0x10
	mfspr $vrsave,256
	li r9,0x20
	mtspr 256,r0
	li r10,0x30
	lvx_u $IN,0,$Xip # load Xi
	lvx_u $Hl,r8,$Htbl # load pre-computed table
	le?lvsl $lemask,r0,r0
	lvx_u $H, r9,$Htbl
	le?vspltisb $t0,0x07
	lvx_u $Hh,r10,$Htbl
	le?vxor $lemask,$lemask,$t0
	lvx_u $xC2,0,$Htbl
	le?vperm $IN,$IN,$IN,$lemask
	vxor $zero,$zero,$zero
	vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
	vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
	vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
	vpmsumd $t2,$Xl,$xC2 # 1st phase
	vsldoi $t0,$Xm,$zero,8
	vsldoi $t1,$zero,$Xm,8
	vxor $Xl,$Xl,$t0
	vxor $Xh,$Xh,$t1
	vsldoi $Xl,$Xl,$Xl,8
	vxor $Xl,$Xl,$t2
	vsldoi $t1,$Xl,$Xl,8 # 2nd phase
	vpmsumd $Xl,$Xl,$xC2
	vxor $t1,$t1,$Xh
	vxor $Xl,$Xl,$t1
	le?vperm $Xl,$Xl,$Xl,$lemask
	stvx_u $Xl,0,$Xip # write out Xi
	mtspr 256,$vrsave
	blr
	.long 0
	.byte 0,12,0x14,0,0,0,2,0
	.long 0
.size .gcm_gmult_p8,.-.gcm_gmult_p8
.globl .gcm_ghash_p8
	lis r0,0xfff8
	li r8,0x10
	mfspr $vrsave,256
	li r9,0x20
	mtspr 256,r0
	li r10,0x30
	lvx_u $Xl,0,$Xip # load Xi
	lvx_u $Hl,r8,$Htbl # load pre-computed table
	le?lvsl $lemask,r0,r0
	lvx_u $H, r9,$Htbl
	le?vspltisb $t0,0x07
	lvx_u $Hh,r10,$Htbl
	le?vxor $lemask,$lemask,$t0
	lvx_u $xC2,0,$Htbl
	le?vperm $Xl,$Xl,$Xl,$lemask
	vxor $zero,$zero,$zero
	lvx_u $IN,0,$inp
	addi $inp,$inp,16
	subi $len,$len,16
	le?vperm $IN,$IN,$IN,$lemask
	vxor $IN,$IN,$Xl
	b Loop
.align 5
Loop:
	subic $len,$len,16
	vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
	subfe. r0,r0,r0 # borrow?-1:0
	vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
	and r0,r0,$len
	vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
	add $inp,$inp,r0
	vpmsumd $t2,$Xl,$xC2 # 1st phase
	vsldoi $t0,$Xm,$zero,8
	vsldoi $t1,$zero,$Xm,8
	vxor $Xl,$Xl,$t0
	vxor $Xh,$Xh,$t1
	vsldoi $Xl,$Xl,$Xl,8
	vxor $Xl,$Xl,$t2
	lvx_u $IN,0,$inp
	addi $inp,$inp,16
	vsldoi $t1,$Xl,$Xl,8 # 2nd phase
	vpmsumd $Xl,$Xl,$xC2
	le?vperm $IN,$IN,$IN,$lemask
	vxor $t1,$t1,$Xh
	vxor $IN,$IN,$t1
	vxor $IN,$IN,$Xl
	beq Loop # did $len-=16 borrow?
	vxor $Xl,$Xl,$t1
	le?vperm $Xl,$Xl,$Xl,$lemask
	stvx_u $Xl,0,$Xip # write out Xi
	mtspr 256,$vrsave
	blr
	.long 0
	.byte 0,12,0x14,0,0,0,4,0
	.long 0
.size .gcm_ghash_p8,.-.gcm_ghash_p8
.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___

# Resolve the endian markers and emit the result line by line.  A flavour
# ending in "le" selects little-endian: "le?" is stripped so the instruction
# is kept, and "be?" becomes "#be#".  Otherwise the roles are swapped.  Only
# the first marker on a line is rewritten, and "be?" is considered only when
# the line carries no "le?".
my $little_endian = ($flavour =~ /le$/);
foreach my $line (split("\n",$code)) {
	if ($little_endian) {
		$line =~ s/le\?// or
		$line =~ s/be\?/#be#/;
	} else {
		$line =~ s/le\?/#le#/ or
		$line =~ s/be\?//;
	}
	print $line,"\n";
}

# Closing the pipe flushes the output and waits for ppc-xlate.pl.  close()
# returns false on a write error or a non-zero exit of the child; in the
# latter case $! is 0 and the status is in $?.
close STDOUT
	or die "error closing pipe to $xlate: " . ($! ? $! : "exit status $?");