/* chacha20-s390x.S - zSeries implementation of ChaCha20 cipher
 *
 * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
#if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9
#include <config.h>
#if defined(HAVE_GCC_INLINE_ASM_S390X_VX)
#include "asm-common-s390x.h"
#include "asm-poly1305-s390x.h"
.machine "z13+vx"
.section .rodata
ELF(.type _gcry_chacha20_s390x_vx_constants,@function;)
.balign 16
_gcry_chacha20_s390x_vx_constants:
.Lconsts:
.Lwordswap:
.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lbswap128:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lbswap32:
.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lone:
.long 0, 0, 0, 1
.Ladd_counter_0123:
.long 0, 1, 2, 3
.Ladd_counter_4567:
.long 4, 5, 6, 7
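/* Note: the tables above are vperm masks and counter increments.
 * .Lwordswap reverses the order of the four 32-bit words in a vector
 * register, which places the 64-bit block counter (state words 12-13)
 * into a single doubleword element so it can be stepped with vag and
 * .Lone.  .Lbswap128 reverses all 16 bytes and .Lbswap32 byte-swaps each
 * 32-bit word; both are used to convert the keystream back to ChaCha20's
 * little-endian byte order before it is XORed with the input.
 * .Ladd_counter_0123/.Ladd_counter_4567 hold the per-lane block offsets
 * for the 8-way ("vertical") code path.
 */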
/* register macros */
#define INPUT %r2
#define DST %r3
#define SRC %r4
#define NBLKS %r0
#define ROUND %r1
/* stack structure */
#define STACK_FRAME_STD (8 * 16 + 8 * 4)
#define STACK_FRAME_F8_F15 (8 * 8)
#define STACK_FRAME_Y0_Y15 (16 * 16)
#define STACK_FRAME_CTR (4 * 16)
#define STACK_FRAME_PARAMS (6 * 8)
#define STACK_MAX (STACK_FRAME_STD + STACK_FRAME_F8_F15 + \
STACK_FRAME_Y0_Y15 + STACK_FRAME_CTR + \
STACK_FRAME_PARAMS)
#define STACK_F8 (STACK_MAX - STACK_FRAME_F8_F15)
#define STACK_F9 (STACK_F8 + 8)
#define STACK_F10 (STACK_F9 + 8)
#define STACK_F11 (STACK_F10 + 8)
#define STACK_F12 (STACK_F11 + 8)
#define STACK_F13 (STACK_F12 + 8)
#define STACK_F14 (STACK_F13 + 8)
#define STACK_F15 (STACK_F14 + 8)
#define STACK_Y0_Y15 (STACK_F8 - STACK_FRAME_Y0_Y15)
#define STACK_CTR (STACK_Y0_Y15 - STACK_FRAME_CTR)
#define STACK_INPUT (STACK_CTR - STACK_FRAME_PARAMS)
#define STACK_DST (STACK_INPUT + 8)
#define STACK_SRC (STACK_DST + 8)
#define STACK_NBLKS (STACK_SRC + 8)
#define STACK_POCTX (STACK_NBLKS + 8)
#define STACK_POSRC (STACK_POCTX + 8)
#define STACK_G0_H3 STACK_Y0_Y15
/* vector registers */
#define A0 %v0
#define A1 %v1
#define A2 %v2
#define A3 %v3
#define B0 %v4
#define B1 %v5
#define B2 %v6
#define B3 %v7
#define C0 %v8
#define C1 %v9
#define C2 %v10
#define C3 %v11
#define D0 %v12
#define D1 %v13
#define D2 %v14
#define D3 %v15
#define E0 %v16
#define E1 %v17
#define E2 %v18
#define E3 %v19
#define F0 %v20
#define F1 %v21
#define F2 %v22
#define F3 %v23
#define G0 %v24
#define G1 %v25
#define G2 %v26
#define G3 %v27
#define H0 %v28
#define H1 %v29
#define H2 %v30
#define H3 %v31
#define IO0 E0
#define IO1 E1
#define IO2 E2
#define IO3 E3
#define IO4 F0
#define IO5 F1
#define IO6 F2
#define IO7 F3
#define S0 G0
#define S1 G1
#define S2 G2
#define S3 G3
#define TMP0 H0
#define TMP1 H1
#define TMP2 H2
#define TMP3 H3
#define X0 A0
#define X1 A1
#define X2 A2
#define X3 A3
#define X4 B0
#define X5 B1
#define X6 B2
#define X7 B3
#define X8 C0
#define X9 C1
#define X10 C2
#define X11 C3
#define X12 D0
#define X13 D1
#define X14 D2
#define X15 D3
#define Y0 E0
#define Y1 E1
#define Y2 E2
#define Y3 E3
#define Y4 F0
#define Y5 F1
#define Y6 F2
#define Y7 F3
#define Y8 G0
#define Y9 G1
#define Y10 G2
#define Y11 G3
#define Y12 H0
#define Y13 H1
#define Y14 H2
#define Y15 H3
/**********************************************************************
helper macros
**********************************************************************/
#define _ /*_*/
#define CLEAR(x,...) vzero x;
#define START_STACK(last_r) \
lgr %r0, %r15; \
lghi %r1, ~15; \
stmg %r6, last_r, 6 * 8(%r15); \
aghi %r0, -STACK_MAX; \
ngr %r0, %r1; \
lgr %r1, %r15; \
CFI_DEF_CFA_REGISTER(1); \
lgr %r15, %r0; \
stg %r1, 0(%r15); \
CFI_CFA_ON_STACK(0, 0); \
std %f8, STACK_F8(%r15); \
std %f9, STACK_F9(%r15); \
std %f10, STACK_F10(%r15); \
std %f11, STACK_F11(%r15); \
std %f12, STACK_F12(%r15); \
std %f13, STACK_F13(%r15); \
std %f14, STACK_F14(%r15); \
std %f15, STACK_F15(%r15);
#define END_STACK(last_r) \
lg %r1, 0(%r15); \
ld %f8, STACK_F8(%r15); \
ld %f9, STACK_F9(%r15); \
ld %f10, STACK_F10(%r15); \
ld %f11, STACK_F11(%r15); \
ld %f12, STACK_F12(%r15); \
ld %f13, STACK_F13(%r15); \
ld %f14, STACK_F14(%r15); \
ld %f15, STACK_F15(%r15); \
lmg %r6, last_r, 6 * 8(%r1); \
lgr %r15, %r1; \
CFI_DEF_CFA_REGISTER(DW_REGNO_SP);
#define PLUS(dst,src) \
vaf dst, dst, src;
#define XOR(dst,src) \
vx dst, dst, src;
#define ROTATE(v1,c) \
verllf v1, v1, (c)(0);
#define WORD_ROTATE(v1,s) \
vsldb v1, v1, v1, ((s) * 4);
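/* Note: PLUS, XOR and ROTATE map to the 32-bit element-wise vector
 * instructions vaf (add word), vx (exclusive or) and verllf (rotate left
 * logical word), so each macro invocation operates on four state words
 * per vector register at once.  WORD_ROTATE uses vsldb (shift left double
 * by byte) with the same register as both operands, i.e. it rotates the
 * vector left by 's' whole words; the round macros use it to move the
 * state between column and diagonal form.
 */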
#define DST_1(OPER, I, J) \
OPER(A##I, J);
#define DST_2(OPER, I, J) \
OPER(A##I, J); OPER(B##I, J);
#define DST_4(OPER, I, J) \
OPER(A##I, J); OPER(B##I, J); OPER(C##I, J); OPER(D##I, J);
#define DST_8(OPER, I, J) \
OPER(A##I, J); OPER(B##I, J); OPER(C##I, J); OPER(D##I, J); \
OPER(E##I, J); OPER(F##I, J); OPER(G##I, J); OPER(H##I, J);
#define DST_SRC_1(OPER, I, J) \
OPER(A##I, A##J);
#define DST_SRC_2(OPER, I, J) \
OPER(A##I, A##J); OPER(B##I, B##J);
#define DST_SRC_4(OPER, I, J) \
OPER(A##I, A##J); OPER(B##I, B##J); OPER(C##I, C##J); \
OPER(D##I, D##J);
#define DST_SRC_8(OPER, I, J) \
OPER(A##I, A##J); OPER(B##I, B##J); OPER(C##I, C##J); \
OPER(D##I, D##J); OPER(E##I, E##J); OPER(F##I, F##J); \
OPER(G##I, G##J); OPER(H##I, H##J);
/**********************************************************************
round macros
**********************************************************************/
#define QUARTERROUND4_POLY(wrot_1,wrot_2,wrot_3,op1,op2) \
op1; DST_SRC_1(PLUS, 0, 1); DST_SRC_1(XOR, 3, 0); DST_1(ROTATE, 3, 16); \
DST_SRC_1(PLUS, 2, 3); DST_SRC_1(XOR, 1, 2); DST_1(ROTATE, 1, 12); \
DST_SRC_1(PLUS, 0, 1); DST_SRC_1(XOR, 3, 0); DST_1(ROTATE, 3, 8); \
op2; DST_SRC_1(PLUS, 2, 3); DST_SRC_1(XOR, 1, 2); DST_1(ROTATE, 1, 7); \
DST_1(WORD_ROTATE, 3, wrot_3); \
DST_1(WORD_ROTATE, 2, wrot_2); \
DST_1(WORD_ROTATE, 1, wrot_1);
#define QUARTERROUND4(wrot_1,wrot_2,wrot_3) \
QUARTERROUND4_POLY(wrot_1,wrot_2,wrot_3,,)
#define QUARTERROUND4_2_POLY(wrot_1,wrot_2,wrot_3,op1,op2,op3,op4) \
op1; DST_SRC_2(PLUS, 0, 1); DST_SRC_2(XOR, 3, 0); DST_2(ROTATE, 3, 16); \
DST_SRC_2(PLUS, 2, 3); op2; DST_SRC_2(XOR, 1, 2); DST_2(ROTATE, 1, 12); \
DST_SRC_2(PLUS, 0, 1); DST_SRC_2(XOR, 3, 0); op3; DST_2(ROTATE, 3, 8); \
DST_SRC_2(PLUS, 2, 3); DST_SRC_2(XOR, 1, 2); DST_2(ROTATE, 1, 7); op4; \
DST_2(WORD_ROTATE, 3, wrot_3); \
DST_2(WORD_ROTATE, 2, wrot_2); \
DST_2(WORD_ROTATE, 1, wrot_1);
#define QUARTERROUND4_2(wrot_1,wrot_2,wrot_3) \
QUARTERROUND4_2_POLY(wrot_1,wrot_2,wrot_3,,,,)
#define QUARTERROUND4_4_POLY(wrot_1,wrot_2,wrot_3,op1,op2,op3,op4,op5,op6) \
DST_SRC_4(PLUS, 0, 1); DST_SRC_4(XOR, 3, 0); op1; DST_4(ROTATE, 3, 16); \
DST_SRC_4(PLUS, 2, 3); op2; DST_SRC_4(XOR, 1, 2); DST_4(ROTATE, 1, 12); \
op3; DST_SRC_4(PLUS, 0, 1); DST_SRC_4(XOR, 3, 0); op4; DST_4(ROTATE, 3, 8); \
DST_SRC_4(PLUS, 2, 3); op5; DST_SRC_4(XOR, 1, 2); DST_4(ROTATE, 1, 7); \
op6; \
DST_4(WORD_ROTATE, 3, wrot_3); \
DST_4(WORD_ROTATE, 2, wrot_2); \
DST_4(WORD_ROTATE, 1, wrot_1);
#define QUARTERROUND4_4(wrot_1,wrot_2,wrot_3) \
QUARTERROUND4_4_POLY(wrot_1,wrot_2,wrot_3,,,,,,)
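/* For reference, each of the macros above applies the standard ChaCha20
 * quarter-round lane-wise to the four rows of one or more blocks; in C
 * terms (a sketch, with rol32 denoting a 32-bit left rotate):
 *
 *   a += b; d ^= a; d = rol32(d, 16);
 *   c += d; b ^= c; b = rol32(b, 12);
 *   a += b; d ^= a; d = rol32(d,  8);
 *   c += d; b ^= c; b = rol32(b,  7);
 *
 * Here a..d are rows 0..3 held in the A/B/C/D register sets, and the
 * trailing WORD_ROTATEs with arguments (3, 2, 1) and (1, 2, 3) shuffle
 * rows 1-3 so that the same lane-wise quarter-round implements first the
 * column round and then the diagonal round of a double round.
 */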
/**********************************************************************
4-way && 2-way && 1-way chacha20 ("horizontal")
**********************************************************************/
.text
.balign 16
.globl _gcry_chacha20_s390x_vx_blocks4_2_1
ELF(.type _gcry_chacha20_s390x_vx_blocks4_2_1,@function;)
_gcry_chacha20_s390x_vx_blocks4_2_1:
/* input:
 * %r2: input
 * %r3: dst
 * %r4: src
 * %r5: nblks
 */
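/* A sketch of the assumed C-level signature (not part of this file; the
 * real declaration lives in the C caller).  Per the s390x ELF ABI the
 * four arguments arrive in %r2-%r5 and the function returns 0 in %r2:
 *
 *   unsigned int _gcry_chacha20_s390x_vx_blocks4_2_1(u32 *input,
 *                                                    byte *dst,
 *                                                    const byte *src,
 *                                                    size_t nblks);
 */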
CFI_STARTPROC();
START_STACK(%r7);
lgr NBLKS, %r5;
/* Load constants. */
larl %r7, .Lconsts;
vl TMP0, (.Lwordswap - .Lconsts)(%r7);
vl TMP1, (.Lone - .Lconsts)(%r7);
vl TMP2, (.Lbswap128 - .Lconsts)(%r7);
/* Load state. */
vlm S0, S3, 0(INPUT);
vperm S0, S0, S0, TMP0;
vperm S1, S1, S1, TMP0;
vperm S2, S2, S2, TMP0;
vperm S3, S3, S3, TMP0;
clgijl NBLKS, 4, .Lloop2;
.balign 4
.Lloop4:
/* Process four chacha20 blocks. */
vlr TMP3, S3;
lghi ROUND, (20 / 2);
vlr A0, S0;
vlr A1, S1;
vlr A2, S2;
vlr A3, TMP3;
vag TMP3, TMP3, TMP1;
vlr B0, S0;
vlr B1, S1;
vlr B2, S2;
vlr B3, TMP3;
vag TMP3, TMP3, TMP1;
vlr C0, S0;
vlr C1, S1;
vlr C2, S2;
vlr C3, TMP3;
vlr D0, S0;
vlr D1, S1;
vlr D2, S2;
vag D3, TMP3, TMP1;
slgfi NBLKS, 4;
.balign 4
.Lround2_4:
QUARTERROUND4_4(3, 2, 1);
QUARTERROUND4_4(1, 2, 3);
brctg ROUND, .Lround2_4;
vlm IO0, IO7, 0(SRC);
PLUS(A0, S0);
PLUS(A1, S1);
PLUS(A2, S2);
PLUS(A3, S3);
vag S3, S3, TMP1; /* Update counter. */
PLUS(B0, S0);
PLUS(B1, S1);
PLUS(B2, S2);
PLUS(B3, S3);
vag S3, S3, TMP1; /* Update counter. */
vperm A0, A0, A0, TMP2;
vperm A1, A1, A1, TMP2;
vperm A2, A2, A2, TMP2;
vperm A3, A3, A3, TMP2;
vperm B0, B0, B0, TMP2;
vperm B1, B1, B1, TMP2;
vperm B2, B2, B2, TMP2;
vperm B3, B3, B3, TMP2;
PLUS(C0, S0);
PLUS(C1, S1);
PLUS(C2, S2);
PLUS(C3, S3);
vag S3, S3, TMP1; /* Update counter. */
PLUS(D0, S0);
PLUS(D1, S1);
PLUS(D2, S2);
PLUS(D3, S3);
vag S3, S3, TMP1; /* Update counter. */
vperm C0, C0, C0, TMP2;
vperm C1, C1, C1, TMP2;
vperm C2, C2, C2, TMP2;
vperm C3, C3, C3, TMP2;
vperm D0, D0, D0, TMP2;
vperm D1, D1, D1, TMP2;
vperm D2, D2, D2, TMP2;
vperm D3, D3, D3, TMP2;
XOR(IO0, A0);
XOR(IO1, A1);
XOR(IO2, A2);
XOR(IO3, A3);
XOR(IO4, B0);
XOR(IO5, B1);
XOR(IO6, B2);
XOR(IO7, B3);
vlm A0, B3, 128(SRC);
vstm IO0, IO7, 0(DST);
XOR(A0, C0);
XOR(A1, C1);
XOR(A2, C2);
XOR(A3, C3);
XOR(B0, D0);
XOR(B1, D1);
XOR(B2, D2);
XOR(B3, D3);
vstm A0, B3, 128(DST);
aghi SRC, 256;
aghi DST, 256;
clgijhe NBLKS, 4, .Lloop4;
CLEAR(C0);
CLEAR(C1);
CLEAR(C2);
CLEAR(C3);
CLEAR(D0);
CLEAR(D1);
CLEAR(D2);
CLEAR(D3);
.balign 4
.Lloop2:
clgijl NBLKS, 2, .Lloop1;
/* Process two chacha20 blocks. */
lghi ROUND, (20 / 2);
vlr A0, S0;
vlr A1, S1;
vlr A2, S2;
vlr A3, S3;
vlr B0, S0;
vlr B1, S1;
vlr B2, S2;
vag B3, S3, TMP1;
slgfi NBLKS, 2;
.balign 4
.Lround2_2:
QUARTERROUND4_2(3, 2, 1);
QUARTERROUND4_2(1, 2, 3);
brctg ROUND, .Lround2_2;
vlm IO0, IO7, 0(SRC);
PLUS(A0, S0);
PLUS(A1, S1);
PLUS(A2, S2);
PLUS(A3, S3);
vag S3, S3, TMP1; /* Update counter. */
PLUS(B0, S0);
PLUS(B1, S1);
PLUS(B2, S2);
PLUS(B3, S3);
vag S3, S3, TMP1; /* Update counter. */
vperm A0, A0, A0, TMP2;
vperm A1, A1, A1, TMP2;
vperm A2, A2, A2, TMP2;
vperm A3, A3, A3, TMP2;
vperm B0, B0, B0, TMP2;
vperm B1, B1, B1, TMP2;
vperm B2, B2, B2, TMP2;
vperm B3, B3, B3, TMP2;
XOR(IO0, A0);
XOR(IO1, A1);
XOR(IO2, A2);
XOR(IO3, A3);
XOR(IO4, B0);
XOR(IO5, B1);
XOR(IO6, B2);
XOR(IO7, B3);
vstm IO0, IO7, 0(DST);
aghi SRC, 128;
aghi DST, 128;
clgijhe NBLKS, 2, .Lloop2;
CLEAR(B0);
CLEAR(B1);
CLEAR(B2);
CLEAR(B3);
.balign 4
.Lloop1:
clgijl NBLKS, 1, .Ldone;
/* Process one chacha20 block.*/
lghi ROUND, (20 / 2);
vlr A0, S0;
vlr A1, S1;
vlr A2, S2;
vlr A3, S3;
slgfi NBLKS, 1;
.balign 4
.Lround2_1:
QUARTERROUND4(3, 2, 1);
QUARTERROUND4(1, 2, 3);
brct ROUND, .Lround2_1;
vlm IO0, IO3, 0(SRC);
PLUS(A0, S0);
PLUS(A1, S1);
PLUS(A2, S2);
PLUS(A3, S3);
vag S3, S3, TMP1; /* Update counter. */
vperm A0, A0, A0, TMP2;
vperm A1, A1, A1, TMP2;
vperm A2, A2, A2, TMP2;
vperm A3, A3, A3, TMP2;
XOR(IO0, A0);
XOR(IO1, A1);
XOR(IO2, A2);
XOR(IO3, A3);
vstm IO0, IO3, 0(DST);
aghi SRC, 64;
aghi DST, 64;
clgijhe NBLKS, 1, .Lloop1;
.balign 4
.Ldone:
/* Store counter. */
vperm S3, S3, S3, TMP0;
vst S3, (48)(INPUT);
/* Clear the used vector registers. */
CLEAR(A0);
CLEAR(A1);
CLEAR(A2);
CLEAR(A3);
CLEAR(IO0);
CLEAR(IO1);
CLEAR(IO2);
CLEAR(IO3);
CLEAR(IO4);
CLEAR(IO5);
CLEAR(IO6);
CLEAR(IO7);
CLEAR(TMP0);
CLEAR(TMP1);
CLEAR(TMP2);
END_STACK(%r7);
xgr %r2, %r2;
br %r14;
CFI_ENDPROC();
ELF(.size _gcry_chacha20_s390x_vx_blocks4_2_1,
.-_gcry_chacha20_s390x_vx_blocks4_2_1;)
/**********************************************************************
4-way && 2-way && 1-way stitched chacha20-poly1305 ("horizontal")
**********************************************************************/
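/* Note on the stitching: the POLY1305_BLOCK_PART1..PART8 macros from
 * asm-poly1305-s390x.h are passed into the quarter-round macros below as
 * their opN arguments, so the scalar poly1305 multiply/reduce steps are
 * interleaved with the vector chacha20 rounds instead of running as a
 * separate pass over the data.
 */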
.balign 16
.globl _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1
ELF(.type _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1,@function;)
_gcry_chacha20_poly1305_s390x_vx_blocks4_2_1:
/* input:
 * %r2: input
 * %r3: dst
 * %r4: src
 * %r5: nblks
 * %r6: poly1305 state
 * 160(%r15): poly1305 src
 */
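/* A sketch of the assumed C-level signature (per the s390x ELF ABI the
 * fifth argument arrives in %r6 and the sixth in the caller's frame at
 * 160(%r15); the exact poly1305 state type is whatever
 * asm-poly1305-s390x.h expects, shown here as void *):
 *
 *   unsigned int _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1(
 *                    u32 *input, byte *dst, const byte *src, size_t nblks,
 *                    void *poly1305_state, const byte *poly1305_src);
 */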
CFI_STARTPROC();
START_STACK(%r14);
lgr NBLKS, %r5;
/* Load constants. */
larl %r8, .Lconsts;
vl TMP0, (.Lwordswap - .Lconsts)(%r8);
vl TMP1, (.Lone - .Lconsts)(%r8);
vl TMP2, (.Lbswap128 - .Lconsts)(%r8);
/* Load state. */
vlm S0, S3, 0(INPUT);
vperm S0, S0, S0, TMP0;
vperm S1, S1, S1, TMP0;
vperm S2, S2, S2, TMP0;
vperm S3, S3, S3, TMP0;
/* Store parameters to stack. */
stmg %r2, %r6, STACK_INPUT(%r15);
lgr POLY_RSTATE, %r6;
lgr NBLKS, %r5;
lg POLY_RSRC, 0(%r15);
lg POLY_RSRC, 160(POLY_RSRC);
stg POLY_RSRC, STACK_POSRC(%r15);
/* Load poly1305 state */
POLY1305_LOAD_STATE();
clgijl NBLKS, 4, .Lloop2_poly;
.balign 4
.Lloop4_poly:
/* Process four chacha20 blocks and 16 poly1305 blocks. */
vlr TMP3, S3;
lghi ROUND, (20 / 4);
vlr A0, S0;
vlr A1, S1;
vlr A2, S2;
vlr A3, TMP3;
vag TMP3, TMP3, TMP1;
vlr B0, S0;
vlr B1, S1;
vlr B2, S2;
vlr B3, TMP3;
vag TMP3, TMP3, TMP1;
vlr C0, S0;
vlr C1, S1;
vlr C2, S2;
vlr C3, TMP3;
vlr D0, S0;
vlr D1, S1;
vlr D2, S2;
vag D3, TMP3, TMP1;
slgfi NBLKS, 4;
.balign 4
.Lround4_4_poly:
/* Total 15 poly1305 blocks processed by this loop. */
QUARTERROUND4_4_POLY(3, 2, 1,
POLY1305_BLOCK_PART1(0 * 16),
POLY1305_BLOCK_PART2(),
POLY1305_BLOCK_PART3(),
POLY1305_BLOCK_PART4(),
POLY1305_BLOCK_PART5(),
POLY1305_BLOCK_PART6());
QUARTERROUND4_4_POLY(1, 2, 3,
POLY1305_BLOCK_PART7(),
POLY1305_BLOCK_PART8(),
POLY1305_BLOCK_PART1(1 * 16),
POLY1305_BLOCK_PART2(),
POLY1305_BLOCK_PART3(),
POLY1305_BLOCK_PART4());
QUARTERROUND4_4_POLY(3, 2, 1,
POLY1305_BLOCK_PART5(),
POLY1305_BLOCK_PART6(),
POLY1305_BLOCK_PART7(),
POLY1305_BLOCK_PART8(),
POLY1305_BLOCK_PART1(2 * 16);
INC_POLY1305_SRC(3 * 16),
POLY1305_BLOCK_PART2());
QUARTERROUND4_4_POLY(1, 2, 3,
POLY1305_BLOCK_PART3(),
POLY1305_BLOCK_PART4(),
POLY1305_BLOCK_PART5(),
POLY1305_BLOCK_PART6(),
POLY1305_BLOCK_PART7(),
POLY1305_BLOCK_PART8());
brctg ROUND, .Lround4_4_poly;
POLY1305_BLOCK_PART1(0 * 16);
INC_POLY1305_SRC(1 * 16);
stg POLY_RSRC, STACK_POSRC(%r15);
lg %r14, STACK_SRC(%r15);
vlm IO0, IO7, 0(%r14);
PLUS(A0, S0);
PLUS(A1, S1);
PLUS(A2, S2);
PLUS(A3, S3);
vag S3, S3, TMP1; /* Update counter. */
POLY1305_BLOCK_PART2();
PLUS(B0, S0);
PLUS(B1, S1);
PLUS(B2, S2);
PLUS(B3, S3);
vag S3, S3, TMP1; /* Update counter. */
POLY1305_BLOCK_PART3();
vperm A0, A0, A0, TMP2;
vperm A1, A1, A1, TMP2;
vperm A2, A2, A2, TMP2;
vperm A3, A3, A3, TMP2;
vperm B0, B0, B0, TMP2;
vperm B1, B1, B1, TMP2;
vperm B2, B2, B2, TMP2;
vperm B3, B3, B3, TMP2;
POLY1305_BLOCK_PART4();
PLUS(C0, S0);
PLUS(C1, S1);
PLUS(C2, S2);
PLUS(C3, S3);
vag S3, S3, TMP1; /* Update counter. */
PLUS(D0, S0);
PLUS(D1, S1);
PLUS(D2, S2);
PLUS(D3, S3);
vag S3, S3, TMP1; /* Update counter. */
POLY1305_BLOCK_PART5();
vperm C0, C0, C0, TMP2;
vperm C1, C1, C1, TMP2;
vperm C2, C2, C2, TMP2;
vperm C3, C3, C3, TMP2;
vperm D0, D0, D0, TMP2;
vperm D1, D1, D1, TMP2;
vperm D2, D2, D2, TMP2;
vperm D3, D3, D3, TMP2;
POLY1305_BLOCK_PART6();
XOR(IO0, A0);
XOR(IO1, A1);
XOR(IO2, A2);
XOR(IO3, A3);
XOR(IO4, B0);
XOR(IO5, B1);
XOR(IO6, B2);
XOR(IO7, B3);
vlm A0, B3, 128(%r14);
aghi %r14, 256;
stg %r14, STACK_SRC(%r15);
lg %r14, STACK_DST(%r15);
POLY1305_BLOCK_PART7();
vstm IO0, IO7, 0(%r14);
XOR(A0, C0);
XOR(A1, C1);
XOR(A2, C2);
XOR(A3, C3);
XOR(B0, D0);
XOR(B1, D1);
XOR(B2, D2);
XOR(B3, D3);
POLY1305_BLOCK_PART8();
vstm A0, B3, 128(%r14);
aghi %r14, 256;
stg %r14, STACK_DST(%r15);
lg POLY_RSRC, STACK_POSRC(%r15);
clgijhe NBLKS, 4, .Lloop4_poly;
CLEAR(C0);
CLEAR(C1);
CLEAR(C2);
CLEAR(C3);
CLEAR(D0);
CLEAR(D1);
CLEAR(D2);
CLEAR(D3);
.balign 4
.Lloop2_poly:
clgijl NBLKS, 2, .Lloop1_poly;
/* Process two chacha20 and eight poly1305 blocks. */
lghi ROUND, ((20 - 4) / 2);
vlr A0, S0;
vlr A1, S1;
vlr A2, S2;
vlr A3, S3;
vlr B0, S0;
vlr B1, S1;
vlr B2, S2;
vag B3, S3, TMP1;
slgfi NBLKS, 2;
.balign 4
.Lround4_2_poly:
/* Total eight poly1305 blocks processed by this loop. */
QUARTERROUND4_2_POLY(3, 2, 1,
POLY1305_BLOCK_PART1(0 * 16),
POLY1305_BLOCK_PART2(),
POLY1305_BLOCK_PART3(),
POLY1305_BLOCK_PART4());
INC_POLY1305_SRC(1 * 16);
QUARTERROUND4_2_POLY(1, 2, 3,
POLY1305_BLOCK_PART5(),
POLY1305_BLOCK_PART6(),
POLY1305_BLOCK_PART7(),
POLY1305_BLOCK_PART8());
brctg ROUND, .Lround4_2_poly;
stg POLY_RSRC, STACK_POSRC(%r15);
lg %r14, STACK_SRC(%r15);
QUARTERROUND4_2(3, 2, 1);
QUARTERROUND4_2(1, 2, 3);
QUARTERROUND4_2(3, 2, 1);
QUARTERROUND4_2(1, 2, 3);
vlm IO0, IO7, 0(%r14);
aghi %r14, 128;
stg %r14, STACK_SRC(%r15);
PLUS(A0, S0);
PLUS(A1, S1);
PLUS(A2, S2);
PLUS(A3, S3);
vag S3, S3, TMP1; /* Update counter. */
PLUS(B0, S0);
PLUS(B1, S1);
PLUS(B2, S2);
PLUS(B3, S3);
vag S3, S3, TMP1; /* Update counter. */
vperm A0, A0, A0, TMP2;
vperm A1, A1, A1, TMP2;
vperm A2, A2, A2, TMP2;
vperm A3, A3, A3, TMP2;
vperm B0, B0, B0, TMP2;
vperm B1, B1, B1, TMP2;
vperm B2, B2, B2, TMP2;
vperm B3, B3, B3, TMP2;
lg %r14, STACK_DST(%r15);
XOR(IO0, A0);
XOR(IO1, A1);
XOR(IO2, A2);
XOR(IO3, A3);
XOR(IO4, B0);
XOR(IO5, B1);
XOR(IO6, B2);
XOR(IO7, B3);
vstm IO0, IO7, 0(%r14);
aghi %r14, 128;
stg %r14, STACK_DST(%r15);
lg POLY_RSRC, STACK_POSRC(%r15);
clgijhe NBLKS, 2, .Lloop2_poly;
CLEAR(B0);
CLEAR(B1);
CLEAR(B2);
CLEAR(B3);
.balign 4
.Lloop1_poly:
clgijl NBLKS, 1, .Ldone_poly;
/* Process one chacha20 block and four poly1305 blocks.*/
lghi ROUND, ((20 - 4) / 4);
vlr A0, S0;
vlr A1, S1;
vlr A2, S2;
vlr A3, S3;
slgfi NBLKS, 1;
.balign 4
.Lround4_1_poly:
/* Total four poly1305 blocks processed by this loop. */
QUARTERROUND4_POLY(3, 2, 1,
POLY1305_BLOCK_PART1(0 * 16),
POLY1305_BLOCK_PART2());
INC_POLY1305_SRC(1 * 16);
QUARTERROUND4_POLY(1, 2, 3,
POLY1305_BLOCK_PART3(),
POLY1305_BLOCK_PART4());
QUARTERROUND4_POLY(3, 2, 1,
POLY1305_BLOCK_PART5(),
POLY1305_BLOCK_PART6());
QUARTERROUND4_POLY(1, 2, 3,
POLY1305_BLOCK_PART7(),
POLY1305_BLOCK_PART8());
brct ROUND, .Lround4_1_poly;
stg POLY_RSRC, STACK_POSRC(%r15);
lg %r14, STACK_SRC(%r15);
QUARTERROUND4(3, 2, 1);
QUARTERROUND4(1, 2, 3);
QUARTERROUND4(3, 2, 1);
QUARTERROUND4(1, 2, 3);
vlm IO0, IO3, 0(%r14);
aghi %r14, 64;
stg %r14, STACK_SRC(%r15);
PLUS(A0, S0);
PLUS(A1, S1);
PLUS(A2, S2);
PLUS(A3, S3);
vag S3, S3, TMP1; /* Update counter. */
lg %r14, STACK_DST(%r15);
vperm A0, A0, A0, TMP2;
vperm A1, A1, A1, TMP2;
vperm A2, A2, A2, TMP2;
vperm A3, A3, A3, TMP2;
XOR(IO0, A0);
XOR(IO1, A1);
XOR(IO2, A2);
XOR(IO3, A3);
vstm IO0, IO3, 0(%r14);
aghi %r14, 64;
stg %r14, STACK_DST(%r15);
lg POLY_RSRC, STACK_POSRC(%r15);
clgijhe NBLKS, 1, .Lloop1_poly;
.balign 4
.Ldone_poly:
/* Store poly1305 state */
lg POLY_RSTATE, STACK_POCTX(%r15);
POLY1305_STORE_STATE();
/* Store counter. */
lg INPUT, STACK_INPUT(%r15);
vperm S3, S3, S3, TMP0;
vst S3, (48)(INPUT);
/* Clear the used vector registers. */
CLEAR(A0);
CLEAR(A1);
CLEAR(A2);
CLEAR(A3);
CLEAR(IO0);
CLEAR(IO1);
CLEAR(IO2);
CLEAR(IO3);
CLEAR(IO4);
CLEAR(IO5);
CLEAR(IO6);
CLEAR(IO7);
CLEAR(TMP0);
CLEAR(TMP1);
CLEAR(TMP2);
END_STACK(%r14);
xgr %r2, %r2;
br %r14;
CFI_ENDPROC();
ELF(.size _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1,
.-_gcry_chacha20_poly1305_s390x_vx_blocks4_2_1;)
/**********************************************************************
8-way chacha20 ("vertical")
**********************************************************************/
#define QUARTERROUND4_V8_POLY(x0,x1,x2,x3,x4,x5,x6,x7,\
x8,x9,x10,x11,x12,x13,x14,x15,\
y0,y1,y2,y3,y4,y5,y6,y7,\
y8,y9,y10,y11,y12,y13,y14,y15,\
op1,op2,op3,op4,op5,op6,op7,op8,\
op9,op10,op11,op12) \
op1; \
PLUS(x0, x1); PLUS(x4, x5); \
PLUS(x8, x9); PLUS(x12, x13); \
PLUS(y0, y1); PLUS(y4, y5); \
PLUS(y8, y9); PLUS(y12, y13); \
op2; \
XOR(x3, x0); XOR(x7, x4); \
XOR(x11, x8); XOR(x15, x12); \
XOR(y3, y0); XOR(y7, y4); \
XOR(y11, y8); XOR(y15, y12); \
op3; \
ROTATE(x3, 16); ROTATE(x7, 16); \
ROTATE(x11, 16); ROTATE(x15, 16); \
ROTATE(y3, 16); ROTATE(y7, 16); \
ROTATE(y11, 16); ROTATE(y15, 16); \
op4; \
PLUS(x2, x3); PLUS(x6, x7); \
PLUS(x10, x11); PLUS(x14, x15); \
PLUS(y2, y3); PLUS(y6, y7); \
PLUS(y10, y11); PLUS(y14, y15); \
op5; \
XOR(x1, x2); XOR(x5, x6); \
XOR(x9, x10); XOR(x13, x14); \
XOR(y1, y2); XOR(y5, y6); \
XOR(y9, y10); XOR(y13, y14); \
op6; \
ROTATE(x1,12); ROTATE(x5,12); \
ROTATE(x9,12); ROTATE(x13,12); \
ROTATE(y1,12); ROTATE(y5,12); \
ROTATE(y9,12); ROTATE(y13,12); \
op7; \
PLUS(x0, x1); PLUS(x4, x5); \
PLUS(x8, x9); PLUS(x12, x13); \
PLUS(y0, y1); PLUS(y4, y5); \
PLUS(y8, y9); PLUS(y12, y13); \
op8; \
XOR(x3, x0); XOR(x7, x4); \
XOR(x11, x8); XOR(x15, x12); \
XOR(y3, y0); XOR(y7, y4); \
XOR(y11, y8); XOR(y15, y12); \
op9; \
ROTATE(x3,8); ROTATE(x7,8); \
ROTATE(x11,8); ROTATE(x15,8); \
ROTATE(y3,8); ROTATE(y7,8); \
ROTATE(y11,8); ROTATE(y15,8); \
op10; \
PLUS(x2, x3); PLUS(x6, x7); \
PLUS(x10, x11); PLUS(x14, x15); \
PLUS(y2, y3); PLUS(y6, y7); \
PLUS(y10, y11); PLUS(y14, y15); \
op11; \
XOR(x1, x2); XOR(x5, x6); \
XOR(x9, x10); XOR(x13, x14); \
XOR(y1, y2); XOR(y5, y6); \
XOR(y9, y10); XOR(y13, y14); \
op12; \
ROTATE(x1,7); ROTATE(x5,7); \
ROTATE(x9,7); ROTATE(x13,7); \
ROTATE(y1,7); ROTATE(y5,7); \
ROTATE(y9,7); ROTATE(y13,7);
#define QUARTERROUND4_V8(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,\
y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15) \
QUARTERROUND4_V8_POLY(x0,x1,x2,x3,x4,x5,x6,x7,\
x8,x9,x10,x11,x12,x13,x14,x15,\
y0,y1,y2,y3,y4,y5,y6,y7,\
y8,y9,y10,y11,y12,y13,y14,y15,\
,,,,,,,,,,,)
#define TRANSPOSE_4X4_2(v0,v1,v2,v3,va,vb,vc,vd,tmp0,tmp1,tmp2,tmpa,tmpb,tmpc) \
vmrhf tmp0, v0, v1; \
vmrhf tmp1, v2, v3; \
vmrlf tmp2, v0, v1; \
vmrlf v3, v2, v3; \
vmrhf tmpa, va, vb; \
vmrhf tmpb, vc, vd; \
vmrlf tmpc, va, vb; \
vmrlf vd, vc, vd; \
vpdi v0, tmp0, tmp1, 0; \
vpdi v1, tmp0, tmp1, 5; \
vpdi v2, tmp2, v3, 0; \
vpdi v3, tmp2, v3, 5; \
vpdi va, tmpa, tmpb, 0; \
vpdi vb, tmpa, tmpb, 5; \
vpdi vc, tmpc, vd, 0; \
vpdi vd, tmpc, vd, 5;
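/* TRANSPOSE_4X4_2 transposes two 4x4 matrices of 32-bit words at once:
 * vmrhf/vmrlf interleave the high and low word pairs of adjacent rows and
 * the vpdi instructions (permute doubleword immediate, selectors 0 and 5)
 * recombine the doublewords into columns.  In C terms, for each matrix:
 *
 *   out[i][j] = in[j][i];   for i, j in 0..3
 *
 * It is used below to turn the per-word block vectors of the vertical
 * path back into contiguous 64-byte output blocks.
 */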
.balign 16
.globl _gcry_chacha20_s390x_vx_blocks8
ELF(.type _gcry_chacha20_s390x_vx_blocks8,@function;)
_gcry_chacha20_s390x_vx_blocks8:
/* input:
 * %r2: input
 * %r3: dst
 * %r4: src
 * %r5: nblks (multiple of 8)
 */
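/* A sketch of the assumed C-level signature (same argument layout as the
 * 4/2/1-way entry point above; nblks must be a multiple of 8):
 *
 *   unsigned int _gcry_chacha20_s390x_vx_blocks8(u32 *input, byte *dst,
 *                                                const byte *src,
 *                                                size_t nblks);
 */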
CFI_STARTPROC();
START_STACK(%r8);
lgr NBLKS, %r5;
larl %r7, .Lconsts;
/* Load counter. */
lg %r8, (12 * 4)(INPUT);
rllg %r8, %r8, 32;
.balign 4
/* Process eight chacha20 blocks per loop. */
.Lloop8:
vlm Y0, Y3, 0(INPUT);
slgfi NBLKS, 8;
lghi ROUND, (20 / 2);
/* Construct counter vectors X12/X13 & Y12/Y13. */
vl X4, (.Ladd_counter_0123 - .Lconsts)(%r7);
vl Y4, (.Ladd_counter_4567 - .Lconsts)(%r7);
vrepf Y12, Y3, 0;
vrepf Y13, Y3, 1;
vaccf X5, Y12, X4;
vaccf Y5, Y12, Y4;
vaf X12, Y12, X4;
vaf Y12, Y12, Y4;
vaf X13, Y13, X5;
vaf Y13, Y13, Y5;
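/* Y12/Y13 hold the broadcast low and high words of the 64-bit block
 * counter.  Adding the lane offsets {0,1,2,3}/{4,5,6,7} to the low word
 * may overflow, so vaccf first computes the 32-bit carries and the vaf
 * pair then produces per-lane counters: X12/X13 for blocks 0-3 and
 * Y12/Y13 for blocks 4-7 (low word plus offset, high word plus carry).
 */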
vrepf X0, Y0, 0;
vrepf X1, Y0, 1;
vrepf X2, Y0, 2;
vrepf X3, Y0, 3;
vrepf X4, Y1, 0;
vrepf X5, Y1, 1;
vrepf X6, Y1, 2;
vrepf X7, Y1, 3;
vrepf X8, Y2, 0;
vrepf X9, Y2, 1;
vrepf X10, Y2, 2;
vrepf X11, Y2, 3;
vrepf X14, Y3, 2;
vrepf X15, Y3, 3;
/* Store counters for blocks 0-7. */
vstm X12, X13, (STACK_CTR + 0 * 16)(%r15);
vstm Y12, Y13, (STACK_CTR + 2 * 16)(%r15);
vlr Y0, X0;
vlr Y1, X1;
vlr Y2, X2;
vlr Y3, X3;
vlr Y4, X4;
vlr Y5, X5;
vlr Y6, X6;
vlr Y7, X7;
vlr Y8, X8;
vlr Y9, X9;
vlr Y10, X10;
vlr Y11, X11;
vlr Y14, X14;
vlr Y15, X15;
/* Update and store counter. */
agfi %r8, 8;
rllg %r5, %r8, 32;
stg %r5, (12 * 4)(INPUT);
.balign 4
.Lround2_8:
QUARTERROUND4_V8(X0, X4, X8, X12, X1, X5, X9, X13,
X2, X6, X10, X14, X3, X7, X11, X15,
Y0, Y4, Y8, Y12, Y1, Y5, Y9, Y13,
Y2, Y6, Y10, Y14, Y3, Y7, Y11, Y15);
QUARTERROUND4_V8(X0, X5, X10, X15, X1, X6, X11, X12,
X2, X7, X8, X13, X3, X4, X9, X14,
Y0, Y5, Y10, Y15, Y1, Y6, Y11, Y12,
Y2, Y7, Y8, Y13, Y3, Y4, Y9, Y14);
brctg ROUND, .Lround2_8;
/* Store blocks 4-7. */
vstm Y0, Y15, STACK_Y0_Y15(%r15);
/* Load counters for blocks 0-3. */
vlm Y0, Y1, (STACK_CTR + 0 * 16)(%r15);
lghi ROUND, 1;
j .Lfirst_output_4blks_8;
.balign 4
.Lsecond_output_4blks_8:
/* Load blocks 4-7. */
vlm X0, X15, STACK_Y0_Y15(%r15);
/* Load counters for blocks 4-7. */
vlm Y0, Y1, (STACK_CTR + 2 * 16)(%r15);
lghi ROUND, 0;
.balign 4
/* Output four chacha20 blocks per loop. */
.Lfirst_output_4blks_8:
vlm Y12, Y15, 0(INPUT);
PLUS(X12, Y0);
PLUS(X13, Y1);
vrepf Y0, Y12, 0;
vrepf Y1, Y12, 1;
vrepf Y2, Y12, 2;
vrepf Y3, Y12, 3;
vrepf Y4, Y13, 0;
vrepf Y5, Y13, 1;
vrepf Y6, Y13, 2;
vrepf Y7, Y13, 3;
vrepf Y8, Y14, 0;
vrepf Y9, Y14, 1;
vrepf Y10, Y14, 2;
vrepf Y11, Y14, 3;
vrepf Y14, Y15, 2;
vrepf Y15, Y15, 3;
PLUS(X0, Y0);
PLUS(X1, Y1);
PLUS(X2, Y2);
PLUS(X3, Y3);
PLUS(X4, Y4);
PLUS(X5, Y5);
PLUS(X6, Y6);
PLUS(X7, Y7);
PLUS(X8, Y8);
PLUS(X9, Y9);
PLUS(X10, Y10);
PLUS(X11, Y11);
PLUS(X14, Y14);
PLUS(X15, Y15);
vl Y15, (.Lbswap32 - .Lconsts)(%r7);
TRANSPOSE_4X4_2(X0, X1, X2, X3, X4, X5, X6, X7,
Y9, Y10, Y11, Y12, Y13, Y14);
TRANSPOSE_4X4_2(X8, X9, X10, X11, X12, X13, X14, X15,
Y9, Y10, Y11, Y12, Y13, Y14);
vlm Y0, Y14, 0(SRC);
vperm X0, X0, X0, Y15;
vperm X1, X1, X1, Y15;
vperm X2, X2, X2, Y15;
vperm X3, X3, X3, Y15;
vperm X4, X4, X4, Y15;
vperm X5, X5, X5, Y15;
vperm X6, X6, X6, Y15;
vperm X7, X7, X7, Y15;
vperm X8, X8, X8, Y15;
vperm X9, X9, X9, Y15;
vperm X10, X10, X10, Y15;
vperm X11, X11, X11, Y15;
vperm X12, X12, X12, Y15;
vperm X13, X13, X13, Y15;
vperm X14, X14, X14, Y15;
vperm X15, X15, X15, Y15;
vl Y15, (15 * 16)(SRC);
XOR(Y0, X0);
XOR(Y1, X4);
XOR(Y2, X8);
XOR(Y3, X12);
XOR(Y4, X1);
XOR(Y5, X5);
XOR(Y6, X9);
XOR(Y7, X13);
XOR(Y8, X2);
XOR(Y9, X6);
XOR(Y10, X10);
XOR(Y11, X14);
XOR(Y12, X3);
XOR(Y13, X7);
XOR(Y14, X11);
XOR(Y15, X15);
vstm Y0, Y15, 0(DST);
aghi SRC, 256;
aghi DST, 256;
clgije ROUND, 1, .Lsecond_output_4blks_8;
clgijhe NBLKS, 8, .Lloop8;
/* Clear the used vector registers. */
DST_8(CLEAR, 0, _);
DST_8(CLEAR, 1, _);
DST_8(CLEAR, 2, _);
DST_8(CLEAR, 3, _);
/* Clear sensitive data in stack. */
vlm Y0, Y15, STACK_Y0_Y15(%r15);
vlm Y0, Y3, STACK_CTR(%r15);
END_STACK(%r8);
xgr %r2, %r2;
br %r14;
CFI_ENDPROC();
ELF(.size _gcry_chacha20_s390x_vx_blocks8,
.-_gcry_chacha20_s390x_vx_blocks8;)
/**********************************************************************
8-way stitched chacha20-poly1305 ("vertical")
**********************************************************************/
.balign 16
.globl _gcry_chacha20_poly1305_s390x_vx_blocks8
ELF(.type _gcry_chacha20_poly1305_s390x_vx_blocks8,@function;)
_gcry_chacha20_poly1305_s390x_vx_blocks8:
/* input:
 * %r2: input
 * %r3: dst
 * %r4: src
 * %r5: nblks (multiple of 8)
 * %r6: poly1305 state
 * 160(%r15): poly1305 src
 */
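/* A sketch of the assumed C-level signature (same argument layout as the
 * stitched 4/2/1-way entry point above; nblks must be a multiple of 8):
 *
 *   unsigned int _gcry_chacha20_poly1305_s390x_vx_blocks8(
 *                    u32 *input, byte *dst, const byte *src, size_t nblks,
 *                    void *poly1305_state, const byte *poly1305_src);
 */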
CFI_STARTPROC();
START_STACK(%r14);
/* Store parameters to stack. */
stmg %r2, %r6, STACK_INPUT(%r15);
lgr POLY_RSTATE, %r6;
lgr NBLKS, %r5;
lg POLY_RSRC, 0(%r15);
lg POLY_RSRC, 160(POLY_RSRC);
stg POLY_RSRC, STACK_POSRC(%r15);
/* Load poly1305 state */
POLY1305_LOAD_STATE();
.balign 4
/* Process eight chacha20 blocks and 32 poly1305 blocks per loop. */
.Lloop8_poly:
lg INPUT, STACK_INPUT(%r15);
larl %r8, .Lconsts;
vlm Y0, Y3, 0(INPUT);
slgfi NBLKS, 8;
lghi ROUND, (20 / 2);
/* Construct counter vectors X12/X13 & Y12/Y13. */
vl X4, (.Ladd_counter_0123 - .Lconsts)(%r8);
vl Y4, (.Ladd_counter_4567 - .Lconsts)(%r8);
lg %r8, (12 * 4)(INPUT); /* Update counter. */
vrepf Y12, Y3, 0;
vrepf Y13, Y3, 1;
vaccf X5, Y12, X4;
vaccf Y5, Y12, Y4;
vaf X12, Y12, X4;
vaf Y12, Y12, Y4;
vaf X13, Y13, X5;
vaf Y13, Y13, Y5;
rllg %r8, %r8, 32;
vrepf X0, Y0, 0;
vrepf X1, Y0, 1;
vrepf X2, Y0, 2;
vrepf X3, Y0, 3;
vrepf X4, Y1, 0;
vrepf X5, Y1, 1;
vrepf X6, Y1, 2;
vrepf X7, Y1, 3;
vrepf X8, Y2, 0;
vrepf X9, Y2, 1;
vrepf X10, Y2, 2;
vrepf X11, Y2, 3;
vrepf X14, Y3, 2;
vrepf X15, Y3, 3;
agfi %r8, 8;
/* Store counters for blocks 0-7. */
vstm X12, X13, (STACK_CTR + 0 * 16)(%r15);
vstm Y12, Y13, (STACK_CTR + 2 * 16)(%r15);
rllg %r8, %r8, 32;
vlr Y0, X0;
vlr Y1, X1;
vlr Y2, X2;
vlr Y3, X3;
vlr Y4, X4;
vlr Y5, X5;
vlr Y6, X6;
vlr Y7, X7;
vlr Y8, X8;
vlr Y9, X9;
vlr Y10, X10;
vlr Y11, X11;
vlr Y14, X14;
vlr Y15, X15;
stg %r8, (12 * 4)(INPUT);
.balign 4
.Lround2_8_poly:
/* Total 30 poly1305 blocks processed by this loop. */
QUARTERROUND4_V8_POLY(X0, X4, X8, X12, X1, X5, X9, X13,
X2, X6, X10, X14, X3, X7, X11, X15,
Y0, Y4, Y8, Y12, Y1, Y5, Y9, Y13,
Y2, Y6, Y10, Y14, Y3, Y7, Y11, Y15,
POLY1305_BLOCK_PART1(0 * 16),
POLY1305_BLOCK_PART2(),
POLY1305_BLOCK_PART3(),
POLY1305_BLOCK_PART4(),
POLY1305_BLOCK_PART5(),
POLY1305_BLOCK_PART6(),
POLY1305_BLOCK_PART7(),
POLY1305_BLOCK_PART8(),
POLY1305_BLOCK_PART1(1 * 16),
POLY1305_BLOCK_PART2(),
POLY1305_BLOCK_PART3(),
POLY1305_BLOCK_PART4());
QUARTERROUND4_V8_POLY(X0, X5, X10, X15, X1, X6, X11, X12,
X2, X7, X8, X13, X3, X4, X9, X14,
Y0, Y5, Y10, Y15, Y1, Y6, Y11, Y12,
Y2, Y7, Y8, Y13, Y3, Y4, Y9, Y14,
POLY1305_BLOCK_PART5(),
POLY1305_BLOCK_PART6(),
POLY1305_BLOCK_PART7(),
POLY1305_BLOCK_PART8(),
POLY1305_BLOCK_PART1(2 * 16);
INC_POLY1305_SRC(3 * 16),
POLY1305_BLOCK_PART2(),
POLY1305_BLOCK_PART3(),
POLY1305_BLOCK_PART4(),
POLY1305_BLOCK_PART5(),
POLY1305_BLOCK_PART6(),
POLY1305_BLOCK_PART7(),
POLY1305_BLOCK_PART8());
brctg ROUND, .Lround2_8_poly;
POLY1305_BLOCK_PART1(0 * 16);
/* Store blocks 4-7. */
vstm Y0, Y15, STACK_Y0_Y15(%r15);
/* Load counters for blocks 0-3. */
vlm Y0, Y1, (STACK_CTR + 0 * 16)(%r15);
stg POLY_RSRC, STACK_POSRC(%r15); /* %r14 used for INPUT/SRC/DST pointer. */
lghi ROUND, 1;
j .Lfirst_output_4blks_8_poly;
.balign 4
.Lsecond_output_4blks_8_poly:
POLY1305_BLOCK_PART1(1 * 16);
/* Load blocks 4-7. */
vlm X0, X15, STACK_Y0_Y15(%r15);
/* Load counters for blocks 4-7. */
vlm Y0, Y1, (STACK_CTR + 2 * 16)(%r15);
INC_POLY1305_SRC(2 * 16);
stg POLY_RSRC, STACK_POSRC(%r15); /* %r14 used for INPUT/SRC/DST pointer. */
lghi ROUND, 0;
.balign 4
/* Output four chacha20 blocks and one poly1305 block per loop. */
.Lfirst_output_4blks_8_poly:
lg %r14, STACK_INPUT(%r15);
vlm Y12, Y15, 0(%r14);
POLY1305_BLOCK_PART2();
PLUS(X12, Y0);
PLUS(X13, Y1);
vrepf Y0, Y12, 0;
vrepf Y1, Y12, 1;
vrepf Y2, Y12, 2;
vrepf Y3, Y12, 3;
vrepf Y4, Y13, 0;
vrepf Y5, Y13, 1;
vrepf Y6, Y13, 2;
vrepf Y7, Y13, 3;
vrepf Y8, Y14, 0;
vrepf Y9, Y14, 1;
vrepf Y10, Y14, 2;
vrepf Y11, Y14, 3;
vrepf Y14, Y15, 2;
vrepf Y15, Y15, 3;
POLY1305_BLOCK_PART3();
PLUS(X0, Y0);
PLUS(X1, Y1);
PLUS(X2, Y2);
PLUS(X3, Y3);
PLUS(X4, Y4);
PLUS(X5, Y5);
PLUS(X6, Y6);
PLUS(X7, Y7);
PLUS(X8, Y8);
PLUS(X9, Y9);
PLUS(X10, Y10);
PLUS(X11, Y11);
PLUS(X14, Y14);
PLUS(X15, Y15);
POLY1305_BLOCK_PART4();
larl %r14, .Lconsts;
vl Y15, (.Lbswap32 - .Lconsts)(%r14);
TRANSPOSE_4X4_2(X0, X1, X2, X3, X4, X5, X6, X7,
Y9, Y10, Y11, Y12, Y13, Y14);
lg %r14, STACK_SRC(%r15);
POLY1305_BLOCK_PART5();
TRANSPOSE_4X4_2(X8, X9, X10, X11, X12, X13, X14, X15,
Y9, Y10, Y11, Y12, Y13, Y14);
vlm Y0, Y14, 0(%r14);
POLY1305_BLOCK_PART6();
vperm X0, X0, X0, Y15;
vperm X1, X1, X1, Y15;
vperm X2, X2, X2, Y15;
vperm X3, X3, X3, Y15;
vperm X4, X4, X4, Y15;
vperm X5, X5, X5, Y15;
vperm X6, X6, X6, Y15;
vperm X7, X7, X7, Y15;
vperm X8, X8, X8, Y15;
vperm X9, X9, X9, Y15;
vperm X10, X10, X10, Y15;
vperm X11, X11, X11, Y15;
vperm X12, X12, X12, Y15;
vperm X13, X13, X13, Y15;
vperm X14, X14, X14, Y15;
vperm X15, X15, X15, Y15;
vl Y15, (15 * 16)(%r14);
POLY1305_BLOCK_PART7();
aghi %r14, 256;
stg %r14, STACK_SRC(%r15);
lg %r14, STACK_DST(%r15);
XOR(Y0, X0);
XOR(Y1, X4);
XOR(Y2, X8);
XOR(Y3, X12);
XOR(Y4, X1);
XOR(Y5, X5);
XOR(Y6, X9);
XOR(Y7, X13);
XOR(Y8, X2);
XOR(Y9, X6);
XOR(Y10, X10);
XOR(Y11, X14);
XOR(Y12, X3);
XOR(Y13, X7);
XOR(Y14, X11);
XOR(Y15, X15);
POLY1305_BLOCK_PART8();
vstm Y0, Y15, 0(%r14);
aghi %r14, 256;
stg %r14, STACK_DST(%r15);
lg POLY_RSRC, STACK_POSRC(%r15);
clgije ROUND, 1, .Lsecond_output_4blks_8_poly;
clgijhe NBLKS, 8, .Lloop8_poly;
/* Store poly1305 state */
lg POLY_RSTATE, STACK_POCTX(%r15);
POLY1305_STORE_STATE();
/* Clear the used vector registers */
DST_8(CLEAR, 0, _);
DST_8(CLEAR, 1, _);
DST_8(CLEAR, 2, _);
DST_8(CLEAR, 3, _);
/* Clear sensitive data in stack. */
vlm Y0, Y15, STACK_Y0_Y15(%r15);
vlm Y0, Y3, STACK_CTR(%r15);
END_STACK(%r14);
xgr %r2, %r2;
br %r14;
CFI_ENDPROC();
ELF(.size _gcry_chacha20_poly1305_s390x_vx_blocks8,
.-_gcry_chacha20_poly1305_s390x_vx_blocks8;)
#endif /*HAVE_GCC_INLINE_ASM_S390X_VX*/
#endif /*__s390x__*/