# Copyright 2021- IBM Inc. All rights reserved
#
# This file is part of Libgcrypt.
#
# Libgcrypt is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation; either version 2.1 of
# the License, or (at your option) any later version.
#
# Libgcrypt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program; if not, see <http://www.gnu.org/licenses/>.
#
#===================================================================================
# Written by Danny Tsen <dtsen@us.ibm.com>
#
# Poly1305 - this version mainly uses vector/VSX and scalar instructions
# - 26-bit limbs
# - handles multiple 64-byte blocks, but requires at least two 64-byte blocks
#
# Improve performance by breaking the polynomial down into a sum of products:
# h4 = m1 * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
#
# 07/22/21 - this revision is based on the above sum of products. Set up r^4, r^3, r^2, r
# and s3, s2, s1, s0 in 9 vectors for the multiplications.
#
# setup r^4, r^3, r^2, r vectors
# vs [r^1, r^3, r^2, r^4]
# vs0 = [r0,.....]
# vs1 = [r1,.....]
# vs2 = [r2,.....]
# vs3 = [r3,.....]
# vs4 = [r4,.....]
# vs5 = [r1*5,...]
# vs6 = [r2*5,...]
# vs7 = [r3*5,...]
# vs8 = [r4*5,...]
#
# Each word in a vector holds one "r/s" member of an [a * r/s] product.
#
# r0, r4*5, r3*5, r2*5, r1*5;
# r1, r0, r4*5, r3*5, r2*5;
# r2, r1, r0, r4*5, r3*5;
# r3, r2, r1, r0, r4*5;
# r4, r3, r2, r1, r0 ;
#
#
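# Illustration (comment only, not assembled): a minimal C sketch of the
# regrouping above, checking that the serial recurrence h = (h + m) * r (mod p)
# over four blocks equals the sum-of-products form. A small toy modulus P
# stands in for 2^130 - 5 so plain 64-bit arithmetic suffices; all names
# below are hypothetical.
#
#   #include <assert.h>
#   #include <stdint.h>
#
#   #define P 65521u /* toy stand-in for 2^130 - 5 */
#
#   static uint64_t serial(uint64_t h, const uint64_t m[4], uint64_t r)
#   {
#     for (int i = 0; i < 4; i++)
#       h = (h + m[i]) % P * r % P; /* h = (h + m) * r mod p */
#     return h;
#   }
#
#   static uint64_t sum_of_products(uint64_t h0, const uint64_t m[4], uint64_t r)
#   {
#     uint64_t r2 = r * r % P, r3 = r2 * r % P, r4 = r3 * r % P;
#     /* h4 = (h0 + m1)*r^4 + m2*r^3 + m3*r^2 + m4*r */
#     return ((h0 + m[0]) % P * r4 + m[1] * r3 + m[2] * r2 + m[3] * r) % P;
#   }
#
#   int main(void)
#   {
#     uint64_t m[4] = { 123, 456, 789, 1011 };
#     assert(serial(7, m, 3456) == sum_of_products(7, m, 3456));
#     return 0;
#   }
#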
# gcry_poly1305_p10le_4blocks( uint8_t *k, uint8_t *m, uint32_t mlen)
# k = 32 bytes key
# r3 = k (r, s)
# r4 = m
# r5 = mlen
#
.text
# Block size 16 bytes
# key = (r, s)
# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF
# p = 2^130 - 5
# a += m
# a = (a * r) % p
# a += s
# 16 bytes (a)
#
# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5;
# p[1] = a0*r1 + a1*r0 + a2*r4*5 + a3*r3*5 + a4*r2*5;
# p[2] = a0*r2 + a1*r1 + a2*r0 + a3*r4*5 + a4*r3*5;
# p[3] = a0*r3 + a1*r2 + a2*r1 + a3*r0 + a4*r4*5;
# p[4] = a0*r4 + a1*r3 + a2*r2 + a3*r1 + a4*r0 ;
#
# [r^2, r^3, r^1, r^4]
# [m3, m2, m4, m1]
#
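# Illustration (comment only): the p[0..4] rows above are a schoolbook
# 5x26-bit multiply with the columns at 2^130 and above folded back in via
# 2^130 == 5 (mod 2^130 - 5). A C sketch, with hypothetical names:
#
#   #include <stdint.h>
#
#   static void mul_5x26(uint64_t p[5], const uint32_t a[5], const uint32_t r[5])
#   {
#     uint64_t s[5]; /* s[j] = r[j] * 5: multiplier for the folded columns */
#     for (int j = 0; j < 5; j++)
#       s[j] = (uint64_t)r[j] * 5;
#     /* each term is at most 2^26 * (5 * 2^26), so five terms fit in 64 bits */
#     p[0] = (uint64_t)a[0]*r[0] + a[1]*s[4] + a[2]*s[3] + a[3]*s[2] + a[4]*s[1];
#     p[1] = (uint64_t)a[0]*r[1] + (uint64_t)a[1]*r[0] + a[2]*s[4] + a[3]*s[3] + a[4]*s[2];
#     p[2] = (uint64_t)a[0]*r[2] + (uint64_t)a[1]*r[1] + (uint64_t)a[2]*r[0] + a[3]*s[4] + a[4]*s[3];
#     p[3] = (uint64_t)a[0]*r[3] + (uint64_t)a[1]*r[2] + (uint64_t)a[2]*r[1] + (uint64_t)a[3]*r[0] + a[4]*s[4];
#     p[4] = (uint64_t)a[0]*r[4] + (uint64_t)a[1]*r[3] + (uint64_t)a[2]*r[2] + (uint64_t)a[3]*r[1] + (uint64_t)a[4]*r[0];
#   }
#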
# multiply odd and even words
# in:  v4..v8 = accumulator limbs a0..a4 (h + m)
#      v26..v30 = r0..r4, v0..v3 = r1*5..r4*5
# out: v14..v18 = product limbs x0..x4 (mul_even accumulates into mul_odd's results)
.macro mul_odd
vmulouw 14, 4, 26
vmulouw 10, 5, 3
vmulouw 11, 6, 2
vmulouw 12, 7, 1
vmulouw 13, 8, 0
vmulouw 15, 4, 27
vaddudm 14, 14, 10
vaddudm 14, 14, 11
vmulouw 10, 5, 26
vmulouw 11, 6, 3
vaddudm 14, 14, 12
vaddudm 14, 14, 13 # x0
vaddudm 15, 15, 10
vaddudm 15, 15, 11
vmulouw 12, 7, 2
vmulouw 13, 8, 1
vaddudm 15, 15, 12
vaddudm 15, 15, 13 # x1
vmulouw 16, 4, 28
vmulouw 10, 5, 27
vmulouw 11, 6, 26
vaddudm 16, 16, 10
vaddudm 16, 16, 11
vmulouw 12, 7, 3
vmulouw 13, 8, 2
vaddudm 16, 16, 12
vaddudm 16, 16, 13 # x2
vmulouw 17, 4, 29
vmulouw 10, 5, 28
vmulouw 11, 6, 27
vaddudm 17, 17, 10
vaddudm 17, 17, 11
vmulouw 12, 7, 26
vmulouw 13, 8, 3
vaddudm 17, 17, 12
vaddudm 17, 17, 13 # x3
vmulouw 18, 4, 30
vmulouw 10, 5, 29
vmulouw 11, 6, 28
vaddudm 18, 18, 10
vaddudm 18, 18, 11
vmulouw 12, 7, 27
vmulouw 13, 8, 26
vaddudm 18, 18, 12
vaddudm 18, 18, 13 # x4
.endm
.macro mul_even
vmuleuw 9, 4, 26
vmuleuw 10, 5, 3
vmuleuw 11, 6, 2
vmuleuw 12, 7, 1
vmuleuw 13, 8, 0
vaddudm 14, 14, 9
vaddudm 14, 14, 10
vaddudm 14, 14, 11
vaddudm 14, 14, 12
vaddudm 14, 14, 13 # x0
vmuleuw 9, 4, 27
vmuleuw 10, 5, 26
vmuleuw 11, 6, 3
vmuleuw 12, 7, 2
vmuleuw 13, 8, 1
vaddudm 15, 15, 9
vaddudm 15, 15, 10
vaddudm 15, 15, 11
vaddudm 15, 15, 12
vaddudm 15, 15, 13 # x1
vmuleuw 9, 4, 28
vmuleuw 10, 5, 27
vmuleuw 11, 6, 26
vmuleuw 12, 7, 3
vmuleuw 13, 8, 2
vaddudm 16, 16, 9
vaddudm 16, 16, 10
vaddudm 16, 16, 11
vaddudm 16, 16, 12
vaddudm 16, 16, 13 # x2
vmuleuw 9, 4, 29
vmuleuw 10, 5, 28
vmuleuw 11, 6, 27
vmuleuw 12, 7, 26
vmuleuw 13, 8, 3
vaddudm 17, 17, 9
vaddudm 17, 17, 10
vaddudm 17, 17, 11
vaddudm 17, 17, 12
vaddudm 17, 17, 13 # x3
vmuleuw 9, 4, 30
vmuleuw 10, 5, 29
vmuleuw 11, 6, 28
vmuleuw 12, 7, 27
vmuleuw 13, 8, 26
vaddudm 18, 18, 9
vaddudm 18, 18, 10
vaddudm 18, 18, 11
vaddudm 18, 18, 12
vaddudm 18, 18, 13 # x4
.endm
# setup r^4, r^3, r^2, r vectors
# [r, r^3, r^2, r^4]
# vs0 = [r0,...]
# vs1 = [r1,...]
# vs2 = [r2,...]
# vs3 = [r3,...]
# vs4 = [r4,...]
# vs5 = [r1*5,...]
# vs6 = [r2*5,...]
# vs7 = [r3*5,...]
# vs8 = [r4*5,...]
#
# r0, r4*5, r3*5, r2*5, r1*5;
# r1, r0, r4*5, r3*5, r2*5;
# r2, r1, r0, r4*5, r3*5;
# r3, r2, r1, r0, r4*5;
# r4, r3, r2, r1, r0 ;
#
.macro poly1305_setup_r
# save r
xxlor 26, 58, 58
xxlor 27, 59, 59
xxlor 28, 60, 60
xxlor 29, 61, 61
xxlor 30, 62, 62
xxlxor 31, 31, 31
# [r, r^3, r^2, r^4]
# compute r^2
vmr 4, 26
vmr 5, 27
vmr 6, 28
vmr 7, 29
vmr 8, 30
bl do_mul # r^2 r^1
xxpermdi 58, 58, 36, 0x3 # r0
xxpermdi 59, 59, 37, 0x3 # r1
xxpermdi 60, 60, 38, 0x3 # r2
xxpermdi 61, 61, 39, 0x3 # r3
xxpermdi 62, 62, 40, 0x3 # r4
xxpermdi 36, 36, 36, 0x3
xxpermdi 37, 37, 37, 0x3
xxpermdi 38, 38, 38, 0x3
xxpermdi 39, 39, 39, 0x3
xxpermdi 40, 40, 40, 0x3
vspltisb 13, 2
vsld 9, 27, 13
vsld 10, 28, 13
vsld 11, 29, 13
vsld 12, 30, 13
vaddudm 0, 9, 27
vaddudm 1, 10, 28
vaddudm 2, 11, 29
vaddudm 3, 12, 30
bl do_mul # r^4 r^3
vmrgow 26, 26, 4
vmrgow 27, 27, 5
vmrgow 28, 28, 6
vmrgow 29, 29, 7
vmrgow 30, 30, 8
vspltisb 13, 2
vsld 9, 27, 13
vsld 10, 28, 13
vsld 11, 29, 13
vsld 12, 30, 13
vaddudm 0, 9, 27
vaddudm 1, 10, 28
vaddudm 2, 11, 29
vaddudm 3, 12, 30
# r^2 r^4
xxlor 0, 58, 58
xxlor 1, 59, 59
xxlor 2, 60, 60
xxlor 3, 61, 61
xxlor 4, 62, 62
xxlor 5, 32, 32
xxlor 6, 33, 33
xxlor 7, 34, 34
xxlor 8, 35, 35
vspltw 9, 26, 3
vspltw 10, 26, 2
vmrgow 26, 10, 9
vspltw 9, 27, 3
vspltw 10, 27, 2
vmrgow 27, 10, 9
vspltw 9, 28, 3
vspltw 10, 28, 2
vmrgow 28, 10, 9
vspltw 9, 29, 3
vspltw 10, 29, 2
vmrgow 29, 10, 9
vspltw 9, 30, 3
vspltw 10, 30, 2
vmrgow 30, 10, 9
vsld 9, 27, 13
vsld 10, 28, 13
vsld 11, 29, 13
vsld 12, 30, 13
vaddudm 0, 9, 27
vaddudm 1, 10, 28
vaddudm 2, 11, 29
vaddudm 3, 12, 30
.endm
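# Illustration (comment only): what the macro above computes, in C on the toy
# modulus P from the earlier sketch (names hypothetical). Two passes through
# do_mul produce the lane schedule [r, r^3, r^2, r^4] used to multiply four
# message blocks at once:
#
#   uint64_t r2 = r * r % P;    /* first do_mul: r * r             */
#   uint64_t r3 = r2 * r % P;   /* second do_mul: [r, r^2] * r^2 = */
#   uint64_t r4 = r2 * r2 % P;  /*   [r^3, r^4] in one pass        */
#   uint64_t lanes[4] = { r, r3, r2, r4 };
#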
do_mul:
mul_odd
# do reduction ( h %= p )
# carry reduction
vspltisb 9, 2
vsrd 10, 14, 31
vsrd 11, 17, 31
vand 7, 17, 25
vand 4, 14, 25
vaddudm 18, 18, 11
vsrd 12, 18, 31
vaddudm 15, 15, 10
vsrd 11, 15, 31
vand 8, 18, 25
vand 5, 15, 25
vaddudm 4, 4, 12
vsld 10, 12, 9
vaddudm 6, 16, 11
vsrd 13, 6, 31
vand 6, 6, 25
vaddudm 4, 4, 10
vsrd 10, 4, 31
vaddudm 7, 7, 13
vsrd 11, 7, 31
vand 7, 7, 25
vand 4, 4, 25
vaddudm 5, 5, 10
vaddudm 8, 8, 11
blr
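# Illustration (comment only): a sequential C model of the carry reduction
# above. Carries (bits 26 and up of each limb) move into the next limb; the
# carry out of limb 4 re-enters limb 0 multiplied by 5, since 2^130 == 5
# (mod 2^130 - 5). The vector code interleaves two carry chains; the
# function name is hypothetical:
#
#   static void carry_reduce(uint64_t h[5])
#   {
#     const uint64_t mask = (1ULL << 26) - 1;
#     uint64_t c;
#     c = h[0] >> 26; h[0] &= mask; h[1] += c;
#     c = h[1] >> 26; h[1] &= mask; h[2] += c;
#     c = h[2] >> 26; h[2] &= mask; h[3] += c;
#     c = h[3] >> 26; h[3] &= mask; h[4] += c;
#     c = h[4] >> 26; h[4] &= mask; h[0] += c * 5; /* c + (c << 2) */
#     c = h[0] >> 26; h[0] &= mask; h[1] += c;
#   }
#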
#
# init key
#
do_poly1305_init:
ld 10, rmask@got(2)
ld 11, 0(10)
ld 12, 8(10)
li 14, 16
li 15, 32
ld 10, cnum@got(2)
lvx 25, 0, 10 # v25 = 26-bit limb mask
lvx 31, 14, 10 # v31 = 26 (0x1a), limb shift count
lvx 19, 15, 10 # v19 = 1 << 24
lxv 24, 48(10) # vs24
lxv 25, 64(10) # vs25
# initialize
# load key from r3 to vectors
ld 9, 16(3)
ld 10, 24(3)
ld 11, 0(3)
ld 12, 8(3)
# break r into 26-bit limbs
extrdi 14, 9, 26, 38
extrdi 15, 9, 26, 12
extrdi 16, 9, 12, 0
mtvsrdd 58, 0, 14
insrdi 16, 10, 14, 38
mtvsrdd 59, 0, 15
extrdi 17, 10, 26, 24
mtvsrdd 60, 0, 16
extrdi 18, 10, 24, 0
mtvsrdd 61, 0, 17
mtvsrdd 62, 0, 18
# r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5
li 9, 5
mtvsrdd 36, 0, 9
vmulouw 0, 27, 4 # v0 = rr0 = r1 * 5
vmulouw 1, 28, 4 # v1 = rr1 = r2 * 5
vmulouw 2, 29, 4 # v2 = rr2 = r3 * 5
vmulouw 3, 30, 4 # v3 = rr3 = r4 * 5
blr
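# Illustration (comment only): the extrdi/insrdi sequence above, as C. The
# two 64-bit halves of r are split into five 26-bit limbs (names hypothetical):
#
#   static void split_26(uint32_t r[5], uint64_t lo, uint64_t hi)
#   {
#     r[0] = lo & 0x3ffffff;                        /* bits   0..25  */
#     r[1] = (lo >> 26) & 0x3ffffff;                /* bits  26..51  */
#     r[2] = ((lo >> 52) | (hi << 12)) & 0x3ffffff; /* bits  52..77  */
#     r[3] = (hi >> 14) & 0x3ffffff;                /* bits  78..103 */
#     r[4] = (uint32_t)(hi >> 40);                  /* bits 104..127 */
#   }
#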
#
# gcry_poly1305_p10le_4blocks( uint8_t *k, uint8_t *m, uint32_t mlen)
# k = 32 bytes key
# r3 = k (r, s)
# r4 = m
# r5 = mlen
#
.global gcry_poly1305_p10le_4blocks
.align 5
gcry_poly1305_p10le_4blocks:
_gcry_poly1305_p10le_4blocks:
cmpdi 5, 128
blt Out_no_poly1305
stdu 1,-1024(1)
mflr 0
std 14,112(1)
std 15,120(1)
std 16,128(1)
std 17,136(1)
std 18,144(1)
std 19,152(1)
std 20,160(1)
std 21,168(1)
std 31,248(1)
li 14, 256
stvx 20, 14, 1
addi 14, 14, 16
stvx 21, 14, 1
addi 14, 14, 16
stvx 22, 14, 1
addi 14, 14, 16
stvx 23, 14, 1
addi 14, 14, 16
stvx 24, 14, 1
addi 14, 14, 16
stvx 25, 14, 1
addi 14, 14, 16
stvx 26, 14, 1
addi 14, 14, 16
stvx 27, 14, 1
addi 14, 14, 16
stvx 28, 14, 1
addi 14, 14, 16
stvx 29, 14, 1
addi 14, 14, 16
stvx 30, 14, 1
addi 14, 14, 16
stvx 31, 14, 1
addi 14, 14, 16
stxvx 14, 14, 1
addi 14, 14, 16
stxvx 15, 14, 1
addi 14, 14, 16
stxvx 16, 14, 1
addi 14, 14, 16
stxvx 17, 14, 1
addi 14, 14, 16
stxvx 18, 14, 1
addi 14, 14, 16
stxvx 19, 14, 1
addi 14, 14, 16
stxvx 20, 14, 1
addi 14, 14, 16
stxvx 21, 14, 1
addi 14, 14, 16
stxvx 22, 14, 1
addi 14, 14, 16
stxvx 23, 14, 1
addi 14, 14, 16
stxvx 24, 14, 1
addi 14, 14, 16
stxvx 25, 14, 1
addi 14, 14, 16
stxvx 26, 14, 1
addi 14, 14, 16
stxvx 27, 14, 1
addi 14, 14, 16
stxvx 28, 14, 1
addi 14, 14, 16
stxvx 29, 14, 1
addi 14, 14, 16
stxvx 30, 14, 1
addi 14, 14, 16
stxvx 31, 14, 1
std 0, 1040(1)
bl do_poly1305_init
li 21, 0 # counter to message
poly1305_setup_r
# load previous state
# break/convert the state h into 26-bit limbs
ld 9, 32(3)
ld 10, 40(3)
lwz 19, 48(3)
sldi 19, 19, 24
mtvsrdd 41, 0, 19
extrdi 14, 9, 26, 38
extrdi 15, 9, 26, 12
extrdi 16, 9, 12, 0
mtvsrdd 36, 0, 14
insrdi 16, 10, 14, 38
mtvsrdd 37, 0, 15
extrdi 17, 10, 26, 24
mtvsrdd 38, 0, 16
extrdi 18, 10, 24, 0
mtvsrdd 39, 0, 17
mtvsrdd 40, 0, 18
vor 8, 8, 9
# input m1 m2
add 20, 4, 21
xxlor 49, 24, 24
xxlor 50, 25, 25
lxvw4x 43, 0, 20
addi 17, 20, 16
lxvw4x 44, 0, 17
vperm 14, 11, 12, 17
vperm 15, 11, 12, 18
vand 9, 14, 25 # a0
vsrd 10, 14, 31 # >> 26
vsrd 11, 10, 31 # 12 bits left
vand 10, 10, 25 # a1
vspltisb 13, 12
vand 16, 15, 25
vsld 12, 16, 13
vor 11, 11, 12
vand 11, 11, 25 # a2
vspltisb 13, 14
vsrd 12, 15, 13 # >> 14
vsrd 13, 12, 31 # >> 26, a4
vand 12, 12, 25 # a3
vaddudm 20, 4, 9
vaddudm 21, 5, 10
vaddudm 22, 6, 11
vaddudm 23, 7, 12
vaddudm 24, 8, 13
# m3 m4
addi 17, 17, 16
lxvw4x 43, 0, 17
addi 17, 17, 16
lxvw4x 44, 0, 17
vperm 14, 11, 12, 17
vperm 15, 11, 12, 18
vand 9, 14, 25 # a0
vsrd 10, 14, 31 # >> 26
vsrd 11, 10, 31 # 12 bits left
vand 10, 10, 25 # a1
vspltisb 13, 12
vand 16, 15, 25
vsld 12, 16, 13
vspltisb 13, 14
vor 11, 11, 12
vand 11, 11, 25 # a2
vsrd 12, 15, 13 # >> 14
vsrd 13, 12, 31 # >> 26, a4
vand 12, 12, 25 # a3
# Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
vmrgow 4, 9, 20
vmrgow 5, 10, 21
vmrgow 6, 11, 22
vmrgow 7, 12, 23
vmrgow 8, 13, 24
vaddudm 8, 8, 19
addi 5, 5, -64
addi 21, 21, 64
li 9, 64
divdu 31, 5, 9
mtctr 31
# h4 = m1 * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
# Rewrite the polynomial sum of products as follows:
# h1 = (h0 + m1) * r^2, h2 = (h0 + m2) * r^2
# h3 = (h1 + m3) * r^2, h4 = (h2 + m4) * r^2 --> h3 = (h0 + m1) r^4 + m3 * r^2, h4 = (h0 + m2) r^4 + m4 * r^2
# .... Repeat
# h5 = (h3 + m5) * r^2, h6 = (h4 + m6) * r^2 -->
# h7 = (h5 + m7) * r^2, h8 = (h6 + m8) * r^1 --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r
#
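# Illustration (comment only): a C model of this schedule on the toy modulus
# P, abstracting away the exact lane order (names hypothetical; assumes n is
# a multiple of 4 and block values are already < P). Lane j carries blocks
# j, j+4, j+8, ... and advances by r^4 per 64-byte iteration; the final
# multiply applies r^4, r^3, r^2, r per lane and sums:
#
#   static uint64_t poly_4way(uint64_t h0, const uint64_t *m, size_t n, uint64_t r)
#   {
#     uint64_t r2 = r * r % P, r3 = r2 * r % P, r4 = r2 * r2 % P;
#     uint64_t acc[4] = { (h0 + m[0]) % P, m[1], m[2], m[3] };
#     for (size_t i = 4; i + 4 <= n; i += 4)        /* one loop_4blocks pass */
#       for (int j = 0; j < 4; j++)
#         acc[j] = (acc[j] * r4 + m[i + j]) % P;
#     return (acc[0] * r4 + acc[1] * r3 + acc[2] * r2 + acc[3] * r) % P;
#   }
#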
loop_4blocks:
# Multiply odd words and even words
mul_odd
mul_even
# carry reduction
vspltisb 9, 2
vsrd 10, 14, 31
vsrd 11, 17, 31
vand 7, 17, 25
vand 4, 14, 25
vaddudm 18, 18, 11
vsrd 12, 18, 31
vaddudm 15, 15, 10
vsrd 11, 15, 31
vand 8, 18, 25
vand 5, 15, 25
vaddudm 4, 4, 12
vsld 10, 12, 9
vaddudm 6, 16, 11
vsrd 13, 6, 31
vand 6, 6, 25
vaddudm 4, 4, 10
vsrd 10, 4, 31
vaddudm 7, 7, 13
vsrd 11, 7, 31
vand 7, 7, 25
vand 4, 4, 25
vaddudm 5, 5, 10
vaddudm 8, 8, 11
# input m1 m2 m3 m4
add 20, 4, 21
xxlor 49, 24, 24
xxlor 50, 25, 25
lxvw4x 43, 0, 20
addi 17, 20, 16
lxvw4x 44, 0, 17
vperm 14, 11, 12, 17
vperm 15, 11, 12, 18
addi 17, 17, 16
lxvw4x 43, 0, 17
addi 17, 17, 16
lxvw4x 44, 0, 17
vperm 17, 11, 12, 17
vperm 18, 11, 12, 18
vand 20, 14, 25 # a0
vand 9, 17, 25 # a0
vsrd 21, 14, 31 # >> 26
vsrd 22, 21, 31 # 12 bits left
vsrd 10, 17, 31 # >> 26
vsrd 11, 10, 31 # 12 bits left
vand 21, 21, 25 # a1
vand 10, 10, 25 # a1
vspltisb 13, 12
vand 16, 15, 25
vsld 23, 16, 13
vor 22, 22, 23
vand 22, 22, 25 # a2
vand 16, 18, 25
vsld 12, 16, 13
vor 11, 11, 12
vand 11, 11, 25 # a2
vspltisb 13, 14
vsrd 23, 15, 13 # >> 14
vsrd 24, 23, 31 # >> 26, a4
vand 23, 23, 25 # a3
vsrd 12, 18, 13 # >> 14
vsrd 13, 12, 31 # >> 26, a4
vand 12, 12, 25 # a3
vaddudm 4, 4, 20
vaddudm 5, 5, 21
vaddudm 6, 6, 22
vaddudm 7, 7, 23
vaddudm 8, 8, 24
# Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
vmrgow 4, 9, 4
vmrgow 5, 10, 5
vmrgow 6, 11, 6
vmrgow 7, 12, 7
vmrgow 8, 13, 8
vaddudm 8, 8, 19
addi 5, 5, -64
addi 21, 21, 64
bdnz loop_4blocks
xxlor 58, 0, 0
xxlor 59, 1, 1
xxlor 60, 2, 2
xxlor 61, 3, 3
xxlor 62, 4, 4
xxlor 32, 5, 5
xxlor 33, 6, 6
xxlor 34, 7, 7
xxlor 35, 8, 8
# Multiply odd words and even words
mul_odd
mul_even
# Sum the products.
xxpermdi 41, 31, 46, 0
xxpermdi 42, 31, 47, 0
vaddudm 4, 14, 9
xxpermdi 36, 31, 36, 3
vaddudm 5, 15, 10
xxpermdi 37, 31, 37, 3
xxpermdi 43, 31, 48, 0
vaddudm 6, 16, 11
xxpermdi 38, 31, 38, 3
xxpermdi 44, 31, 49, 0
vaddudm 7, 17, 12
xxpermdi 39, 31, 39, 3
xxpermdi 45, 31, 50, 0
vaddudm 8, 18, 13
xxpermdi 40, 31, 40, 3
# carry reduction
vspltisb 9, 2
vsrd 10, 4, 31
vsrd 11, 7, 31
vand 7, 7, 25
vand 4, 4, 25
vaddudm 8, 8, 11
vsrd 12, 8, 31
vaddudm 5, 5, 10
vsrd 11, 5, 31
vand 8, 8, 25
vand 5, 5, 25
vaddudm 4, 4, 12
vsld 10, 12, 9
vaddudm 6, 6, 11
vsrd 13, 6, 31
vand 6, 6, 25
vaddudm 4, 4, 10
vsrd 10, 4, 31
vaddudm 7, 7, 13
vsrd 11, 7, 31
vand 7, 7, 25
vand 4, 4, 25
vaddudm 5, 5, 10
vaddudm 8, 8, 11
b do_final_update
do_final_update:
# v4, v5, v6, v7 and v8 are 26-bit limb vectors
vsld 5, 5, 31
vor 20, 4, 5
vspltisb 11, 12
vsrd 12, 6, 11
vsld 6, 6, 31
vsld 6, 6, 31
vor 20, 20, 6
vspltisb 11, 14
vsld 7, 7, 11
vor 21, 7, 12
mfvsrld 16, 40 # save last 2 bytes
vsld 8, 8, 11
vsld 8, 8, 31
vor 21, 21, 8
mfvsrld 17, 52
mfvsrld 19, 53
srdi 16, 16, 24
std 17, 32(3)
std 19, 40(3)
stw 16, 48(3)
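# Illustration (comment only): the packing above, as C. The five 26-bit limbs
# h[0..4] (uint64_t) are recombined into two 64-bit words plus the remaining
# high bits, matching the state layout at 32(r3)..48(r3); names hypothetical:
#
#   uint64_t d0 = h[0] | (h[1] << 26) | (h[2] << 52);         /* -> 32(r3) */
#   uint64_t d1 = (h[2] >> 12) | (h[3] << 14) | (h[4] << 40); /* -> 40(r3) */
#   uint32_t d2 = (uint32_t)(h[4] >> 24);                     /* -> 48(r3) */
#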
Out_loop:
li 3, 0
li 14, 256
lvx 20, 14, 1
addi 14, 14, 16
lvx 21, 14, 1
addi 14, 14, 16
lvx 22, 14, 1
addi 14, 14, 16
lvx 23, 14, 1
addi 14, 14, 16
lvx 24, 14, 1
addi 14, 14, 16
lvx 25, 14, 1
addi 14, 14, 16
lvx 26, 14, 1
addi 14, 14, 16
lvx 27, 14, 1
addi 14, 14, 16
lvx 28, 14, 1
addi 14, 14, 16
lvx 29, 14, 1
addi 14, 14, 16
lvx 30, 14, 1
addi 14, 14, 16
lvx 31, 14, 1
addi 14, 14, 16
lxvx 14, 14, 1
addi 14, 14, 16
lxvx 15, 14, 1
addi 14, 14, 16
lxvx 16, 14, 1
addi 14, 14, 16
lxvx 17, 14, 1
addi 14, 14, 16
lxvx 18, 14, 1
addi 14, 14, 16
lxvx 19, 14, 1
addi 14, 14, 16
lxvx 20, 14, 1
addi 14, 14, 16
lxvx 21, 14, 1
addi 14, 14, 16
lxvx 22, 14, 1
addi 14, 14, 16
lxvx 23, 14, 1
addi 14, 14, 16
lxvx 24, 14, 1
addi 14, 14, 16
lxvx 25, 14, 1
addi 14, 14, 16
lxvx 26, 14, 1
addi 14, 14, 16
lxvx 27, 14, 1
addi 14, 14, 16
lxvx 28, 14, 1
addi 14, 14, 16
lxvx 29, 14, 1
addi 14, 14, 16
lxvx 30, 14, 1
addi 14, 14, 16
lxvx 31, 14, 1
ld 0, 1040(1)
ld 14,112(1)
ld 15,120(1)
ld 16,128(1)
ld 17,136(1)
ld 18,144(1)
ld 19,152(1)
ld 20,160(1)
ld 21,168(1)
ld 31,248(1)
mtlr 0
addi 1, 1, 1024
blr
Out_no_poly1305:
li 3, 0
blr

.data
.align 5
# mask to clamp r
rmask:
.byte 0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f
cnum:
.long 0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000 # 26-bit limb mask (v25)
.long 0x1a, 0x00, 0x1a, 0x00 # 26, limb shift count (v31)
.long 0x01000000, 0x01000000, 0x01000000, 0x01000000 # 1 << 24 (v19)
.long 0x00010203, 0x04050607, 0x10111213, 0x14151617 # vperm control (vs24)
.long 0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f # vperm control (vs25)
.long 0x05, 0x00, 0x00, 0x00
.long 0x02020202, 0x02020202, 0x02020202, 0x02020202
.long 0xffffffff, 0xffffffff, 0x00000000, 0x00000000