twofish-amd64.S (28 KB) — scraped source listing; the "N." prefixes on each
line below are line numbers embedded by the extraction tool, not part of the
original assembly source.
  1. /* twofish-amd64.S - AMD64 assembly implementation of Twofish cipher
  2. *
  3. * Copyright (C) 2013-2015 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  4. *
  5. * This file is part of Libgcrypt.
  6. *
  7. * Libgcrypt is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU Lesser General Public License as
  9. * published by the Free Software Foundation; either version 2.1 of
  10. * the License, or (at your option) any later version.
  11. *
  12. * Libgcrypt is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. * GNU Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with this program; if not, see <http://www.gnu.org/licenses/>.
  19. */
  20. #ifdef __x86_64
  21. #include <config.h>
  22. #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
  23. defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_TWOFISH)
  24. #include "asm-common-amd64.h"
  25. .text
  26. /* structure of TWOFISH_context: */
  /* Byte offsets into the context struct: four 1 KiB key-dependent s-box
   * tables (s0..s3, 256 x 4 bytes each), then the whitening subkeys (w,
   * 8 x 4 bytes) followed by the round subkeys (k). */
  27. #define s0 0
  28. #define s1 ((s0) + 4 * 256)
  29. #define s2 ((s1) + 4 * 256)
  30. #define s3 ((s2) + 4 * 256)
  31. #define w ((s3) + 4 * 256)
  32. #define k ((w) + 4 * 8)
  33. /* register macros */
  /* NOTE(review): RB (%rbx) and RT1 (%rbp) are callee-saved under the
   * SysV AMD64 ABI; the one-block functions below spill them to the
   * stack in their prologues before use. */
  34. #define CTX %rdi
  35. #define RA %rax
  36. #define RB %rbx
  37. #define RC %rcx
  38. #define RD %rdx
  /* 32-bit views of the state registers */
  39. #define RAd %eax
  40. #define RBd %ebx
  41. #define RCd %ecx
  42. #define RDd %edx
  /* low/high byte views, used as s-box indices (hence RA..RD are limited
   * to rax/rbx/rcx/rdx, the only registers with an addressable high byte) */
  43. #define RAbl %al
  44. #define RBbl %bl
  45. #define RCbl %cl
  46. #define RDbl %dl
  47. #define RAbh %ah
  48. #define RBbh %bh
  49. #define RCbh %ch
  50. #define RDbh %dh
  /* RX/RY accumulate the g1()/g2() outputs; RT0-RT3 are scratch */
  51. #define RX %r8
  52. #define RY %r9
  53. #define RXd %r8d
  54. #define RYd %r9d
  55. #define RT0 %rsi
  56. #define RT1 %rbp
  57. #define RT2 %r10
  58. #define RT3 %r11
  59. #define RT0d %esi
  60. #define RT1d %ebp
  61. #define RT2d %r10d
  62. #define RT3d %r11d
  63. /***********************************************************************
  64. * AMD64 assembly implementation of the Twofish cipher
  65. ***********************************************************************/
  /* enc_g1_2(a, b, x, y): compute RX = g1(a) and RY = g2(b) in parallel.
   * Each 32-bit input is consumed one byte at a time via its bl/bh views
   * plus two 16-bit rotates (which restore the register afterwards), and
   * the bytes index the key-dependent s-box tables.  Loads are interleaved
   * between the two computations to hide latency.  Clobbers RT0-RT3. */
  66. #define enc_g1_2(a, b, x, y) \
  67. movzbl b ## bl, RT3d; \
  68. movzbl b ## bh, RT1d; \
  69. movzbl a ## bl, RT2d; \
  70. movzbl a ## bh, RT0d; \
  71. rorl $16, b ## d; \
  72. rorl $16, a ## d; \
  73. movl s1(CTX, RT3, 4), RYd; \
  74. movzbl b ## bl, RT3d; \
  75. movl s0(CTX, RT2, 4), RXd; \
  76. movzbl a ## bl, RT2d; \
  77. xorl s2(CTX, RT1, 4), RYd; \
  78. movzbl b ## bh, RT1d; \
  79. xorl s1(CTX, RT0, 4), RXd; \
  80. movzbl a ## bh, RT0d; \
  81. rorl $16, b ## d; \
  82. rorl $16, a ## d; \
  83. xorl s3(CTX, RT3, 4), RYd; \
  84. xorl s2(CTX, RT2, 4), RXd; \
  85. xorl s0(CTX, RT1, 4), RYd; \
  86. xorl s3(CTX, RT0, 4), RXd;
  /* dec_g1_2: same dual g() computation with the byte/table pairing
   * reordered for the decryption direction. */
  87. #define dec_g1_2(a, b, x, y) \
  88. movzbl a ## bl, RT2d; \
  89. movzbl a ## bh, RT0d; \
  90. movzbl b ## bl, RT3d; \
  91. movzbl b ## bh, RT1d; \
  92. rorl $16, a ## d; \
  93. rorl $16, b ## d; \
  94. movl s0(CTX, RT2, 4), RXd; \
  95. movzbl a ## bl, RT2d; \
  96. movl s1(CTX, RT3, 4), RYd; \
  97. movzbl b ## bl, RT3d; \
  98. xorl s1(CTX, RT0, 4), RXd; \
  99. movzbl a ## bh, RT0d; \
  100. xorl s2(CTX, RT1, 4), RYd; \
  101. movzbl b ## bh, RT1d; \
  102. rorl $16, a ## d; \
  103. rorl $16, b ## d; \
  104. xorl s2(CTX, RT2, 4), RXd; \
  105. xorl s3(CTX, RT3, 4), RYd; \
  106. xorl s3(CTX, RT0, 4), RXd; \
  107. xorl s0(CTX, RT1, 4), RYd;
  /* One Twofish round: PHT via lea/add (RT0 = RX + 2*RY, RX += RY),
   * add round subkeys k[2n]/k[2n+1], then mix into rc/rd with the
   * 1-bit rotates of the Feistel structure. */
  108. #define encrypt_round(ra, rb, rc, rd, n) \
  109. enc_g1_2(##ra, ##rb, RX, RY); \
  110. \
  111. leal (RXd, RYd, 2), RT0d; \
  112. addl RYd, RXd; \
  113. addl (k + 8 * (n) + 4)(CTX), RT0d; \
  114. roll $1, rd ## d; \
  115. addl (k + 8 * (n))(CTX), RXd; \
  116. xorl RT0d, rd ## d; \
  117. xorl RXd, rc ## d; \
  118. rorl $1, rc ## d;
  /* Inverse round: identical PHT/subkey arithmetic, with the rotate and
   * xor order on rc/rd reversed relative to encrypt_round. */
  119. #define decrypt_round(ra, rb, rc, rd, n) \
  120. dec_g1_2(##ra, ##rb, RX, RY); \
  121. \
  122. leal (RXd, RYd, 2), RT0d; \
  123. addl RYd, RXd; \
  124. addl (k + 8 * (n) + 4)(CTX), RT0d; \
  125. roll $1, rc ## d; \
  126. addl (k + 8 * (n))(CTX), RXd; \
  127. xorl RXd, rc ## d; \
  128. xorl RT0d, rd ## d; \
  129. rorl $1, rd ## d;
  /* A cycle is two rounds; the operand swap (a,b,c,d)->(c,d,a,b) in the
   * second round realizes the Feistel half-swap without moves. */
  130. #define encrypt_cycle(a, b, c, d, nc) \
  131. encrypt_round(##a, ##b, ##c, ##d, (nc) * 2); \
  132. encrypt_round(##c, ##d, ##a, ##b, (nc) * 2 + 1);
  133. #define decrypt_cycle(a, b, c, d, nc) \
  134. decrypt_round(##c, ##d, ##a, ##b, (nc) * 2 + 1); \
  135. decrypt_round(##a, ##b, ##c, ##d, (nc) * 2);
  /* Load word n of the block and apply input whitening subkey w[m] */
  136. #define inpack(in, n, x, m) \
  137. movl (4 * (n))(in), x; \
  138. xorl (w + 4 * (m))(CTX), x;
  /* Apply output whitening subkey w[m] and store word n of the block */
  139. #define outunpack(out, n, x, m) \
  140. xorl (w + 4 * (m))(CTX), x; \
  141. movl x, (4 * (n))(out);
  142. .align 16
  143. .globl _gcry_twofish_amd64_encrypt_block
  144. ELF(.type _gcry_twofish_amd64_encrypt_block,@function;)
  145. _gcry_twofish_amd64_encrypt_block:
  146. /* input:
  147. * %rdi: context, CTX
  148. * %rsi: dst
  149. * %rdx: src
  150. */
  151. CFI_STARTPROC();
  152. ENTER_SYSV_FUNC_PARAMS_0_4
  /* 3-slot frame: [0] = dst pointer (freed %rsi = RT0 for scratch),
   * [1] = saved %rbp, [2] = saved %rbx (callee-saved, used as RT1/RB) */
  153. subq $(3 * 8), %rsp;
  154. CFI_ADJUST_CFA_OFFSET(3 * 8);
  155. movq %rsi, (0 * 8)(%rsp);
  156. movq %rbp, (1 * 8)(%rsp);
  157. movq %rbx, (2 * 8)(%rsp);
  158. CFI_REL_OFFSET(%rbp, 1 * 8);
  159. CFI_REL_OFFSET(%rbx, 2 * 8);
  /* RX temporarily holds the src pointer during inpack */
  160. movq %rdx, RX;
  161. inpack(RX, 0, RAd, 0);
  162. inpack(RX, 1, RBd, 1);
  163. inpack(RX, 2, RCd, 2);
  164. inpack(RX, 3, RDd, 3);
  /* 8 cycles = 16 rounds */
  165. encrypt_cycle(RA, RB, RC, RD, 0);
  166. encrypt_cycle(RA, RB, RC, RD, 1);
  167. encrypt_cycle(RA, RB, RC, RD, 2);
  168. encrypt_cycle(RA, RB, RC, RD, 3);
  169. encrypt_cycle(RA, RB, RC, RD, 4);
  170. encrypt_cycle(RA, RB, RC, RD, 5);
  171. encrypt_cycle(RA, RB, RC, RD, 6);
  172. encrypt_cycle(RA, RB, RC, RD, 7);
  173. movq (0 * 8)(%rsp), RX; /*dst*/
  /* store order (C,D,A,B) performs the final half-swap */
  174. outunpack(RX, 0, RCd, 4);
  175. outunpack(RX, 1, RDd, 5);
  176. outunpack(RX, 2, RAd, 6);
  177. outunpack(RX, 3, RBd, 7);
  178. movq (2 * 8)(%rsp), %rbx;
  179. movq (1 * 8)(%rsp), %rbp;
  180. CFI_RESTORE(%rbx);
  181. CFI_RESTORE(%rbp);
  182. addq $(3 * 8), %rsp;
  183. CFI_ADJUST_CFA_OFFSET(-3 * 8);
  184. EXIT_SYSV_FUNC
  185. ret_spec_stop;
  186. CFI_ENDPROC();
  187. ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;)
  188. .align 16
  189. .globl _gcry_twofish_amd64_decrypt_block
  190. ELF(.type _gcry_twofish_amd64_decrypt_block,@function;)
  191. _gcry_twofish_amd64_decrypt_block:
  192. /* input:
  193. * %rdi: context, CTX
  194. * %rsi: dst
  195. * %rdx: src
  196. */
  197. CFI_STARTPROC();
  198. ENTER_SYSV_FUNC_PARAMS_0_4
  /* 3-slot frame: [0] = dst pointer, [1] = saved %rbp, [2] = saved %rbx
   * (callee-saved, used as RT1/RB) */
  199. subq $(3 * 8), %rsp;
  200. CFI_ADJUST_CFA_OFFSET(3 * 8);
  201. movq %rsi, (0 * 8)(%rsp);
  202. movq %rbp, (1 * 8)(%rsp);
  203. movq %rbx, (2 * 8)(%rsp);
  204. CFI_REL_OFFSET(%rbp, 1 * 8);
  205. CFI_REL_OFFSET(%rbx, 2 * 8);
  206. movq %rdx, RX;
  /* load with output-whitening keys (w[4..7]) and the swapped word
   * order, mirroring the tail of the encrypt path */
  207. inpack(RX, 0, RCd, 4);
  208. inpack(RX, 1, RDd, 5);
  209. inpack(RX, 2, RAd, 6);
  210. inpack(RX, 3, RBd, 7);
  /* 8 cycles = 16 rounds, subkey indices descending */
  211. decrypt_cycle(RA, RB, RC, RD, 7);
  212. decrypt_cycle(RA, RB, RC, RD, 6);
  213. decrypt_cycle(RA, RB, RC, RD, 5);
  214. decrypt_cycle(RA, RB, RC, RD, 4);
  215. decrypt_cycle(RA, RB, RC, RD, 3);
  216. decrypt_cycle(RA, RB, RC, RD, 2);
  217. decrypt_cycle(RA, RB, RC, RD, 1);
  218. decrypt_cycle(RA, RB, RC, RD, 0);
  219. movq (0 * 8)(%rsp), RX; /*dst*/
  220. outunpack(RX, 0, RAd, 0);
  221. outunpack(RX, 1, RBd, 1);
  222. outunpack(RX, 2, RCd, 2);
  223. outunpack(RX, 3, RDd, 3);
  224. movq (2 * 8)(%rsp), %rbx;
  225. movq (1 * 8)(%rsp), %rbp;
  226. CFI_RESTORE(%rbx);
  227. CFI_RESTORE(%rbp);
  228. addq $(3 * 8), %rsp;
  229. CFI_ADJUST_CFA_OFFSET(-3 * 8);
  230. EXIT_SYSV_FUNC
  231. ret_spec_stop;
  232. CFI_ENDPROC();
  /* FIX: .size previously named _gcry_twofish_amd64_encrypt_block here
   * (copy-paste error), redefining the encrypt symbol's size and leaving
   * the decrypt symbol without correct ELF size metadata. */
  233. ELF(.size _gcry_twofish_amd64_decrypt_block,.-_gcry_twofish_amd64_decrypt_block;)
  /* Retire the one-block register aliases before redefining them for the
   * 3-way parallel implementation below. */
  234. #undef CTX
  235. #undef RA
  236. #undef RB
  237. #undef RC
  238. #undef RD
  239. #undef RAd
  240. #undef RBd
  241. #undef RCd
  242. #undef RDd
  243. #undef RAbl
  244. #undef RBbl
  245. #undef RCbl
  246. #undef RDbl
  247. #undef RAbh
  248. #undef RBbh
  249. #undef RCbh
  250. #undef RDbh
  251. #undef RX
  252. #undef RY
  253. #undef RXd
  254. #undef RYd
  255. #undef RT0
  256. #undef RT1
  257. #undef RT2
  258. #undef RT3
  259. #undef RT0d
  260. #undef RT1d
  261. #undef RT2d
  262. #undef RT3d
  263. /***********************************************************************
  264. * AMD64 assembly implementation of the Twofish cipher, 3-way parallel
  265. ***********************************************************************/
  /* 3-way state: each block's (a,b) halves packed in one 64-bit RABn and
   * (c,d) in RCDn.  RABn must be rax/rbx/rcx so the bl/bh byte views are
   * encodable for s-box indexing. */
  266. #define CTX %rdi
  267. #define RIO %rdx
  268. #define RAB0 %rax
  269. #define RAB1 %rbx
  270. #define RAB2 %rcx
  271. #define RAB0d %eax
  272. #define RAB1d %ebx
  273. #define RAB2d %ecx
  274. #define RAB0bh %ah
  275. #define RAB1bh %bh
  276. #define RAB2bh %ch
  277. #define RAB0bl %al
  278. #define RAB1bl %bl
  279. #define RAB2bl %cl
  280. #define RCD0 %r8
  281. #define RCD1 %r9
  282. #define RCD2 %r10
  283. #define RCD0d %r8d
  284. #define RCD1d %r9d
  285. #define RCD2d %r10d
  /* NOTE(review): RX0 (%rbp), RX2 (%r12), RY0-RY2 (%r13-%r15) and RAB1
   * (%rbx) are callee-saved; every 3-way entry point spills them. */
  286. #define RX0 %rbp
  287. #define RX1 %r11
  288. #define RX2 %r12
  289. #define RX0d %ebp
  290. #define RX1d %r11d
  291. #define RX2d %r12d
  292. #define RY0 %r13
  293. #define RY1 %r14
  294. #define RY2 %r15
  295. #define RY0d %r13d
  296. #define RY1d %r14d
  297. #define RY2d %r15d
  298. #define RT0 %rdx
  299. #define RT1 %rsi
  300. #define RT0d %edx
  301. #define RT1d %esi
  /* Look up two s-box entries for the low two bytes of ab, fold them into
   * dst with op1/op2 (mov or xor), then rotate ab right by 'rot' to expose
   * the next byte pair.  Clobbers tmp1/tmp2. */
  302. #define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \
  303. movzbl ab ## bl, tmp2 ## d; \
  304. movzbl ab ## bh, tmp1 ## d; \
  305. rorq $(rot), ab; \
  306. op1##l T0(CTX, tmp2, 4), dst ## d; \
  307. op2##l T1(CTX, tmp1, 4), dst ## d;
  308. /*
  309. * Combined G1 & G2 function. Reordered with help of rotates to have moves
  310. * at beginning.
  311. */
  /* Computes x##n = g1(a_n) and y##n = g2(b_n) for all three blocks from
   * the packed ab##n registers, then swaps ab##n <-> cd##n (via RT0) so
   * the next round operates on the other half.  Tx*/Ty* select the s-box
   * permutation, which differs between encrypt and decrypt. */
  312. #define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \
  313. /* G1,1 && G2,1 */ \
  314. do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \
  315. do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \
  316. \
  317. do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \
  318. do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \
  319. \
  320. do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \
  321. do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \
  322. \
  323. /* G1,2 && G2,2 */ \
  324. do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \
  325. do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \
  326. movq ab ## 0, RT0; \
  327. movq cd ## 0, ab ## 0; \
  328. movq RT0, cd ## 0; \
  329. \
  330. do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \
  331. do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \
  332. movq ab ## 1, RT0; \
  333. movq cd ## 1, ab ## 1; \
  334. movq RT0, cd ## 1; \
  335. \
  336. do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \
  337. do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \
  338. movq ab ## 2, RT0; \
  339. movq cd ## 2, ab ## 2; \
  340. movq RT0, cd ## 2;
  /* Finish one encrypt round for a packed (a,b) word: PHT, add subkeys
   * k[2n]/k[2n+1], then mix x/y into the two 32-bit halves of ab using
   * shrq/shlq to address the high half, and recombine with orq. */
  341. #define enc_round_end(ab, x, y, n) \
  342. addl y ## d, x ## d; \
  343. addl x ## d, y ## d; \
  344. addl k+4*(2*(n))(CTX), x ## d; \
  345. xorl ab ## d, x ## d; \
  346. addl k+4*(2*(n)+1)(CTX), y ## d; \
  347. shrq $32, ab; \
  348. roll $1, ab ## d; \
  349. xorl y ## d, ab ## d; \
  350. shlq $32, ab; \
  351. rorl $1, x ## d; \
  352. orq x, ab;
  /* Inverse of enc_round_end, with the roles of x/y and the rotate
   * direction mirrored for decryption. */
  353. #define dec_round_end(ba, x, y, n) \
  354. addl y ## d, x ## d; \
  355. addl x ## d, y ## d; \
  356. addl k+4*(2*(n))(CTX), x ## d; \
  357. addl k+4*(2*(n)+1)(CTX), y ## d; \
  358. xorl ba ## d, y ## d; \
  359. shrq $32, ba; \
  360. roll $1, ba ## d; \
  361. xorl x ## d, ba ## d; \
  362. shlq $32, ba; \
  363. rorl $1, y ## d; \
  364. orq y, ba;
  /* One round over all three blocks (encrypt direction) */
  365. #define encrypt_round3(ab, cd, n) \
  366. g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
  367. \
  368. enc_round_end(ab ## 0, RX0, RY0, n); \
  369. enc_round_end(ab ## 1, RX1, RY1, n); \
  370. enc_round_end(ab ## 2, RX2, RY2, n);
  /* One round over all three blocks (decrypt direction; rotated s-box
   * order and swapped RX/RY accumulators) */
  371. #define decrypt_round3(ba, dc, n) \
  372. g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
  373. \
  374. dec_round_end(ba ## 0, RX0, RY0, n); \
  375. dec_round_end(ba ## 1, RX1, RY1, n); \
  376. dec_round_end(ba ## 2, RX2, RY2, n);
  377. #define encrypt_cycle3(ab, cd, n) \
  378. encrypt_round3(ab, cd, n*2); \
  379. encrypt_round3(ab, cd, (n*2)+1);
  380. #define decrypt_cycle3(ba, dc, n) \
  381. decrypt_round3(ba, dc, (n*2)+1); \
  382. decrypt_round3(ba, dc, (n*2));
  /* 64-bit xor applies two consecutive 32-bit whitening subkeys
   * (w[m], w[m+1]) to the packed word of each of the three blocks. */
  383. #define inpack3(xy, m) \
  384. xorq w+4*m(CTX), xy ## 0; \
  385. xorq w+4*m(CTX), xy ## 1; \
  386. xorq w+4*m(CTX), xy ## 2;
  387. #define outunpack3(xy, m) \
  388. xorq w+4*m(CTX), xy ## 0; \
  389. xorq w+4*m(CTX), xy ## 1; \
  390. xorq w+4*m(CTX), xy ## 2;
  391. #define inpack_enc3() \
  392. inpack3(RAB, 0); \
  393. inpack3(RCD, 2);
  394. #define outunpack_enc3() \
  395. outunpack3(RAB, 6); \
  396. outunpack3(RCD, 4);
  /* Decrypt direction additionally swaps the 32-bit halves of each packed
   * register (rorq $32) so word order matches the inverse cipher. */
  397. #define inpack_dec3() \
  398. inpack3(RAB, 4); \
  399. rorq $32, RAB0; \
  400. rorq $32, RAB1; \
  401. rorq $32, RAB2; \
  402. inpack3(RCD, 6); \
  403. rorq $32, RCD0; \
  404. rorq $32, RCD1; \
  405. rorq $32, RCD2;
  406. #define outunpack_dec3() \
  407. rorq $32, RCD0; \
  408. rorq $32, RCD1; \
  409. rorq $32, RCD2; \
  410. outunpack3(RCD, 0); \
  411. rorq $32, RAB0; \
  412. rorq $32, RAB1; \
  413. rorq $32, RAB2; \
  414. outunpack3(RAB, 2);
  415. .align 16
  416. ELF(.type __twofish_enc_blk3,@function;)
  417. __twofish_enc_blk3:
  418. /* input:
  419. * %rdi: ctx, CTX
  420. * RAB0,RCD0,RAB1,RCD1,RAB2,RCD2: three plaintext blocks
  421. * output:
  422. * RCD0,RAB0,RCD1,RAB1,RCD2,RAB2: three ciphertext blocks
  423. */
  /* Internal helper: all state in registers, no stack frame; callers are
   * responsible for saving the callee-saved registers it clobbers. */
  424. CFI_STARTPROC();
  425. inpack_enc3();
  /* 8 cycles = 16 rounds, three blocks interleaved */
  426. encrypt_cycle3(RAB, RCD, 0);
  427. encrypt_cycle3(RAB, RCD, 1);
  428. encrypt_cycle3(RAB, RCD, 2);
  429. encrypt_cycle3(RAB, RCD, 3);
  430. encrypt_cycle3(RAB, RCD, 4);
  431. encrypt_cycle3(RAB, RCD, 5);
  432. encrypt_cycle3(RAB, RCD, 6);
  433. encrypt_cycle3(RAB, RCD, 7);
  434. outunpack_enc3();
  435. ret_spec_stop;
  436. CFI_ENDPROC();
  437. ELF(.size __twofish_enc_blk3,.-__twofish_enc_blk3;)
  438. .align 16
  439. ELF(.type __twofish_dec_blk3,@function;)
  440. __twofish_dec_blk3:
  441. /* input:
  442. * %rdi: ctx, CTX
  443. * RAB0,RCD0,RAB1,RCD1,RAB2,RCD2: three ciphertext blocks
  444. * output:
  445. * RCD0,RAB0,RCD1,RAB1,RCD2,RAB2: three plaintext blocks
  446. */
  /* Internal helper: inverse of __twofish_enc_blk3; same register
   * contract, cycles run with descending subkey indices. */
  447. CFI_STARTPROC();
  448. inpack_dec3();
  449. decrypt_cycle3(RAB, RCD, 7);
  450. decrypt_cycle3(RAB, RCD, 6);
  451. decrypt_cycle3(RAB, RCD, 5);
  452. decrypt_cycle3(RAB, RCD, 4);
  453. decrypt_cycle3(RAB, RCD, 3);
  454. decrypt_cycle3(RAB, RCD, 2);
  455. decrypt_cycle3(RAB, RCD, 1);
  456. decrypt_cycle3(RAB, RCD, 0);
  457. outunpack_dec3();
  458. ret_spec_stop;
  459. CFI_ENDPROC();
  460. ELF(.size __twofish_dec_blk3,.-__twofish_dec_blk3;)
  461. .align 16
  462. .globl _gcry_twofish_amd64_blk3
  463. ELF(.type _gcry_twofish_amd64_blk3,@function;)
  464. _gcry_twofish_amd64_blk3:
  465. /* input:
  466. * %rdi: ctx, CTX
  467. * %rsi: dst (3 blocks)
  468. * %rdx: src (3 blocks)
  469. * %ecx: encrypt (0 or 1)
  470. */
  471. CFI_STARTPROC();
  472. ENTER_SYSV_FUNC_PARAMS_0_4
  /* Frame: [0..5] saved callee-saved regs, [6] = dst pointer */
  473. subq $(8 * 8), %rsp;
  474. CFI_ADJUST_CFA_OFFSET(8 * 8);
  475. movq %rbp, (0 * 8)(%rsp);
  476. movq %rbx, (1 * 8)(%rsp);
  477. movq %r12, (2 * 8)(%rsp);
  478. movq %r13, (3 * 8)(%rsp);
  479. movq %r14, (4 * 8)(%rsp);
  480. movq %r15, (5 * 8)(%rsp);
  481. CFI_REL_OFFSET(%rbp, 0 * 8);
  482. CFI_REL_OFFSET(%rbx, 1 * 8);
  483. CFI_REL_OFFSET(%r12, 2 * 8);
  484. CFI_REL_OFFSET(%r13, 3 * 8);
  485. CFI_REL_OFFSET(%r14, 4 * 8);
  486. CFI_REL_OFFSET(%r15, 5 * 8);
  /* testl sets ZF here; the movq loads below do not modify flags, so the
   * jz after them still branches on the encrypt/decrypt selector. */
  487. testl %ecx, %ecx;
  488. movq %rdx, RX0;
  489. movq %rsi, (6 * 8)(%rsp);
  490. movq (0 * 8)(RX0), RAB0;
  491. movq (1 * 8)(RX0), RCD0;
  492. movq (2 * 8)(RX0), RAB1;
  493. movq (3 * 8)(RX0), RCD1;
  494. movq (4 * 8)(RX0), RAB2;
  495. movq (5 * 8)(RX0), RCD2;
  496. jz .Lblk1_3_dec;
  497. call __twofish_enc_blk3;
  498. jmp .Lblk1_3_end;
  499. .Lblk1_3_dec:
  500. call __twofish_dec_blk3;
  501. .Lblk1_3_end:
  502. movq (6 * 8)(%rsp), RX0;
  /* helpers return each block with halves swapped (CD first) */
  503. movq RCD0, (0 * 8)(RX0);
  504. movq RAB0, (1 * 8)(RX0);
  505. movq RCD1, (2 * 8)(RX0);
  506. movq RAB1, (3 * 8)(RX0);
  507. movq RCD2, (4 * 8)(RX0);
  508. movq RAB2, (5 * 8)(RX0);
  509. movq (0 * 8)(%rsp), %rbp;
  510. movq (1 * 8)(%rsp), %rbx;
  511. movq (2 * 8)(%rsp), %r12;
  512. movq (3 * 8)(%rsp), %r13;
  513. movq (4 * 8)(%rsp), %r14;
  514. movq (5 * 8)(%rsp), %r15;
  515. CFI_RESTORE(%rbp);
  516. CFI_RESTORE(%rbx);
  517. CFI_RESTORE(%r12);
  518. CFI_RESTORE(%r13);
  519. CFI_RESTORE(%r14);
  520. CFI_RESTORE(%r15);
  521. addq $(8 * 8), %rsp;
  522. CFI_ADJUST_CFA_OFFSET(-8 * 8);
  523. EXIT_SYSV_FUNC
  524. ret_spec_stop;
  525. CFI_ENDPROC();
  526. ELF(.size _gcry_twofish_amd64_blk3,.-_gcry_twofish_amd64_blk3;)
  527. .align 16
  528. .globl _gcry_twofish_amd64_ctr_enc
  529. ELF(.type _gcry_twofish_amd64_ctr_enc,@function;)
  530. _gcry_twofish_amd64_ctr_enc:
  531. /* input:
  532. * %rdi: ctx, CTX
  533. * %rsi: dst (3 blocks)
  534. * %rdx: src (3 blocks)
  535. * %rcx: iv (big endian, 128bit)
  536. */
  537. CFI_STARTPROC();
  538. ENTER_SYSV_FUNC_PARAMS_0_4
  /* Frame: [0..5] saved callee-saved regs, [6] = dst, [7] = src */
  539. subq $(8 * 8), %rsp;
  540. CFI_ADJUST_CFA_OFFSET(8 * 8);
  541. movq %rbp, (0 * 8)(%rsp);
  542. movq %rbx, (1 * 8)(%rsp);
  543. movq %r12, (2 * 8)(%rsp);
  544. movq %r13, (3 * 8)(%rsp);
  545. movq %r14, (4 * 8)(%rsp);
  546. movq %r15, (5 * 8)(%rsp);
  547. CFI_REL_OFFSET(%rbp, 0 * 8);
  548. CFI_REL_OFFSET(%rbx, 1 * 8);
  549. CFI_REL_OFFSET(%r12, 2 * 8);
  550. CFI_REL_OFFSET(%r13, 3 * 8);
  551. CFI_REL_OFFSET(%r14, 4 * 8);
  552. CFI_REL_OFFSET(%r15, 5 * 8);
  553. movq %rsi, (6 * 8)(%rsp);
  554. movq %rdx, (7 * 8)(%rsp);
  555. movq %rcx, RX0;
  556. /* load IV and byteswap */
  /* counter 0 is used big-endian as-is; RT0/RT1 hold the byteswapped
   * (native little-endian) 128-bit value for incrementing */
  557. movq 8(RX0), RT0;
  558. movq 0(RX0), RT1;
  559. movq RT0, RCD0;
  560. movq RT1, RAB0;
  561. bswapq RT0;
  562. bswapq RT1;
  563. /* construct IVs */
  /* counters +1 and +2 with 128-bit carry (addq/adcq), swapped back to
   * big endian for encryption */
  564. movq RT0, RCD1;
  565. movq RT1, RAB1;
  566. movq RT0, RCD2;
  567. movq RT1, RAB2;
  568. addq $1, RCD1;
  569. adcq $0, RAB1;
  570. bswapq RCD1;
  571. bswapq RAB1;
  572. addq $2, RCD2;
  573. adcq $0, RAB2;
  574. bswapq RCD2;
  575. bswapq RAB2;
  576. addq $3, RT0;
  577. adcq $0, RT1;
  578. bswapq RT0;
  579. bswapq RT1;
  580. /* store new IV */
  581. movq RT0, 8(RX0);
  582. movq RT1, 0(RX0);
  583. call __twofish_enc_blk3;
  584. movq (7 * 8)(%rsp), RX0; /*src*/
  585. movq (6 * 8)(%rsp), RX1; /*dst*/
  586. /* XOR key-stream with plaintext */
  587. xorq (0 * 8)(RX0), RCD0;
  588. xorq (1 * 8)(RX0), RAB0;
  589. xorq (2 * 8)(RX0), RCD1;
  590. xorq (3 * 8)(RX0), RAB1;
  591. xorq (4 * 8)(RX0), RCD2;
  592. xorq (5 * 8)(RX0), RAB2;
  593. movq RCD0, (0 * 8)(RX1);
  594. movq RAB0, (1 * 8)(RX1);
  595. movq RCD1, (2 * 8)(RX1);
  596. movq RAB1, (3 * 8)(RX1);
  597. movq RCD2, (4 * 8)(RX1);
  598. movq RAB2, (5 * 8)(RX1);
  599. movq (0 * 8)(%rsp), %rbp;
  600. movq (1 * 8)(%rsp), %rbx;
  601. movq (2 * 8)(%rsp), %r12;
  602. movq (3 * 8)(%rsp), %r13;
  603. movq (4 * 8)(%rsp), %r14;
  604. movq (5 * 8)(%rsp), %r15;
  605. CFI_RESTORE(%rbp);
  606. CFI_RESTORE(%rbx);
  607. CFI_RESTORE(%r12);
  608. CFI_RESTORE(%r13);
  609. CFI_RESTORE(%r14);
  610. CFI_RESTORE(%r15);
  611. addq $(8 * 8), %rsp;
  612. CFI_ADJUST_CFA_OFFSET(-8 * 8);
  613. EXIT_SYSV_FUNC
  614. ret_spec_stop;
  615. CFI_ENDPROC();
  616. ELF(.size _gcry_twofish_amd64_ctr_enc,.-_gcry_twofish_amd64_ctr_enc;)
  617. .align 16
  618. .globl _gcry_twofish_amd64_cbc_dec
  619. ELF(.type _gcry_twofish_amd64_cbc_dec,@function;)
  620. _gcry_twofish_amd64_cbc_dec:
  621. /* input:
  622. * %rdi: ctx, CTX
  623. * %rsi: dst (3 blocks)
  624. * %rdx: src (3 blocks)
  625. * %rcx: iv (128bit)
  626. */
  627. CFI_STARTPROC();
  628. ENTER_SYSV_FUNC_PARAMS_0_4
  /* Frame: [0..5] saved callee-saved regs, [6] = dst, [7] = src,
   * [8] = iv pointer */
  629. subq $(9 * 8), %rsp;
  630. CFI_ADJUST_CFA_OFFSET(9 * 8);
  631. movq %rbp, (0 * 8)(%rsp);
  632. movq %rbx, (1 * 8)(%rsp);
  633. movq %r12, (2 * 8)(%rsp);
  634. movq %r13, (3 * 8)(%rsp);
  635. movq %r14, (4 * 8)(%rsp);
  636. movq %r15, (5 * 8)(%rsp);
  637. CFI_REL_OFFSET(%rbp, 0 * 8);
  638. CFI_REL_OFFSET(%rbx, 1 * 8);
  639. CFI_REL_OFFSET(%r12, 2 * 8);
  640. CFI_REL_OFFSET(%r13, 3 * 8);
  641. CFI_REL_OFFSET(%r14, 4 * 8);
  642. CFI_REL_OFFSET(%r15, 5 * 8);
  643. movq %rsi, (6 * 8)(%rsp);
  644. movq %rdx, (7 * 8)(%rsp);
  645. movq %rcx, (8 * 8)(%rsp);
  646. movq %rdx, RX0;
  647. /* load input */
  648. movq (0 * 8)(RX0), RAB0;
  649. movq (1 * 8)(RX0), RCD0;
  650. movq (2 * 8)(RX0), RAB1;
  651. movq (3 * 8)(RX0), RCD1;
  652. movq (4 * 8)(RX0), RAB2;
  653. movq (5 * 8)(RX0), RCD2;
  654. call __twofish_dec_blk3;
  655. movq (8 * 8)(%rsp), RT0; /*iv*/
  656. movq (7 * 8)(%rsp), RX0; /*src*/
  657. movq (6 * 8)(%rsp), RX1; /*dst*/
  /* last ciphertext block (next IV) is read from src before dst is
   * written, then chaining-xor: block0 with old IV, blocks 1-2 with the
   * preceding ciphertext blocks */
  658. movq (4 * 8)(RX0), RY0;
  659. movq (5 * 8)(RX0), RY1;
  660. xorq (0 * 8)(RT0), RCD0;
  661. xorq (1 * 8)(RT0), RAB0;
  662. xorq (0 * 8)(RX0), RCD1;
  663. xorq (1 * 8)(RX0), RAB1;
  664. xorq (2 * 8)(RX0), RCD2;
  665. xorq (3 * 8)(RX0), RAB2;
  /* store new IV */
  666. movq RY0, (0 * 8)(RT0);
  667. movq RY1, (1 * 8)(RT0);
  668. movq RCD0, (0 * 8)(RX1);
  669. movq RAB0, (1 * 8)(RX1);
  670. movq RCD1, (2 * 8)(RX1);
  671. movq RAB1, (3 * 8)(RX1);
  672. movq RCD2, (4 * 8)(RX1);
  673. movq RAB2, (5 * 8)(RX1);
  674. movq (0 * 8)(%rsp), %rbp;
  675. movq (1 * 8)(%rsp), %rbx;
  676. movq (2 * 8)(%rsp), %r12;
  677. movq (3 * 8)(%rsp), %r13;
  678. movq (4 * 8)(%rsp), %r14;
  679. movq (5 * 8)(%rsp), %r15;
  680. CFI_RESTORE(%rbp);
  681. CFI_RESTORE(%rbx);
  682. CFI_RESTORE(%r12);
  683. CFI_RESTORE(%r13);
  684. CFI_RESTORE(%r14);
  685. CFI_RESTORE(%r15);
  686. addq $(9 * 8), %rsp;
  687. CFI_ADJUST_CFA_OFFSET(-9 * 8);
  688. EXIT_SYSV_FUNC
  689. ret_spec_stop;
  690. CFI_ENDPROC();
  691. ELF(.size _gcry_twofish_amd64_cbc_dec,.-_gcry_twofish_amd64_cbc_dec;)
  692. .align 16
  693. .globl _gcry_twofish_amd64_cfb_dec
  694. ELF(.type _gcry_twofish_amd64_cfb_dec,@function;)
  695. _gcry_twofish_amd64_cfb_dec:
  696. /* input:
  697. * %rdi: ctx, CTX
  698. * %rsi: dst (3 blocks)
  699. * %rdx: src (3 blocks)
  700. * %rcx: iv (128bit)
  701. */
  702. CFI_STARTPROC();
  703. ENTER_SYSV_FUNC_PARAMS_0_4
  /* Frame: [0..5] saved callee-saved regs, [6] = dst, [7] = src */
  704. subq $(8 * 8), %rsp;
  705. CFI_ADJUST_CFA_OFFSET(8 * 8);
  706. movq %rbp, (0 * 8)(%rsp);
  707. movq %rbx, (1 * 8)(%rsp);
  708. movq %r12, (2 * 8)(%rsp);
  709. movq %r13, (3 * 8)(%rsp);
  710. movq %r14, (4 * 8)(%rsp);
  711. movq %r15, (5 * 8)(%rsp);
  712. CFI_REL_OFFSET(%rbp, 0 * 8);
  713. CFI_REL_OFFSET(%rbx, 1 * 8);
  714. CFI_REL_OFFSET(%r12, 2 * 8);
  715. CFI_REL_OFFSET(%r13, 3 * 8);
  716. CFI_REL_OFFSET(%r14, 4 * 8);
  717. CFI_REL_OFFSET(%r15, 5 * 8);
  718. movq %rsi, (6 * 8)(%rsp);
  719. movq %rdx, (7 * 8)(%rsp);
  720. movq %rdx, RX0;
  721. movq %rcx, RX1;
  722. /* load input */
  /* CFB decrypt encrypts [IV, C0, C1] and xors with [C0, C1, C2]; the
   * block inputs are the IV followed by the first two ciphertexts */
  723. movq (0 * 8)(RX1), RAB0;
  724. movq (1 * 8)(RX1), RCD0;
  725. movq (0 * 8)(RX0), RAB1;
  726. movq (1 * 8)(RX0), RCD1;
  727. movq (2 * 8)(RX0), RAB2;
  728. movq (3 * 8)(RX0), RCD2;
  729. /* Update IV */
  /* new IV = last ciphertext block; written before the call while the
   * iv pointer (RX1 = %r11, caller-volatile) is still live */
  730. movq (4 * 8)(RX0), RY0;
  731. movq (5 * 8)(RX0), RY1;
  732. movq RY0, (0 * 8)(RX1);
  733. movq RY1, (1 * 8)(RX1);
  734. call __twofish_enc_blk3;
  735. movq (7 * 8)(%rsp), RX0; /*src*/
  736. movq (6 * 8)(%rsp), RX1; /*dst*/
  737. xorq (0 * 8)(RX0), RCD0;
  738. xorq (1 * 8)(RX0), RAB0;
  739. xorq (2 * 8)(RX0), RCD1;
  740. xorq (3 * 8)(RX0), RAB1;
  741. xorq (4 * 8)(RX0), RCD2;
  742. xorq (5 * 8)(RX0), RAB2;
  743. movq RCD0, (0 * 8)(RX1);
  744. movq RAB0, (1 * 8)(RX1);
  745. movq RCD1, (2 * 8)(RX1);
  746. movq RAB1, (3 * 8)(RX1);
  747. movq RCD2, (4 * 8)(RX1);
  748. movq RAB2, (5 * 8)(RX1);
  749. movq (0 * 8)(%rsp), %rbp;
  750. movq (1 * 8)(%rsp), %rbx;
  751. movq (2 * 8)(%rsp), %r12;
  752. movq (3 * 8)(%rsp), %r13;
  753. movq (4 * 8)(%rsp), %r14;
  754. movq (5 * 8)(%rsp), %r15;
  755. CFI_RESTORE(%rbp);
  756. CFI_RESTORE(%rbx);
  757. CFI_RESTORE(%r12);
  758. CFI_RESTORE(%r13);
  759. CFI_RESTORE(%r14);
  760. CFI_RESTORE(%r15);
  761. addq $(8 * 8), %rsp;
  762. CFI_ADJUST_CFA_OFFSET(-8 * 8);
  763. EXIT_SYSV_FUNC
  764. ret_spec_stop;
  765. CFI_ENDPROC();
  766. ELF(.size _gcry_twofish_amd64_cfb_dec,.-_gcry_twofish_amd64_cfb_dec;)
  767. .align 16
  768. .globl _gcry_twofish_amd64_ocb_enc
  769. ELF(.type _gcry_twofish_amd64_ocb_enc,@function;)
  770. _gcry_twofish_amd64_ocb_enc:
  771. /* input:
  772. * %rdi: ctx, CTX
  773. * %rsi: dst (3 blocks)
  774. * %rdx: src (3 blocks)
  775. * %rcx: offset
  776. * %r8 : checksum
  777. * %r9 : L pointers (void *L[3])
  778. */
  779. CFI_STARTPROC();
  780. ENTER_SYSV_FUNC_PARAMS_6
  /* Frame: [0..5] saved callee-saved regs, [6] = dst pointer.
   * Register roles: RX0 = src, RX1 = offset, RX2 = checksum,
   * RY0 = L pointer array, RY1 = dst (pre-call), RT0:RT1 = Offset_i. */
  781. subq $(8 * 8), %rsp;
  782. CFI_ADJUST_CFA_OFFSET(8 * 8);
  783. movq %rbp, (0 * 8)(%rsp);
  784. movq %rbx, (1 * 8)(%rsp);
  785. movq %r12, (2 * 8)(%rsp);
  786. movq %r13, (3 * 8)(%rsp);
  787. movq %r14, (4 * 8)(%rsp);
  788. movq %r15, (5 * 8)(%rsp);
  789. CFI_REL_OFFSET(%rbp, 0 * 8);
  790. CFI_REL_OFFSET(%rbx, 1 * 8);
  791. CFI_REL_OFFSET(%r12, 2 * 8);
  792. CFI_REL_OFFSET(%r13, 3 * 8);
  793. CFI_REL_OFFSET(%r14, 4 * 8);
  794. CFI_REL_OFFSET(%r15, 5 * 8);
  795. movq %rsi, (6 * 8)(%rsp);
  796. movq %rdx, RX0;
  797. movq %rcx, RX1;
  798. movq %r8, RX2;
  799. movq %r9, RY0;
  800. movq %rsi, RY1;
  801. /* Load offset */
  802. movq (0 * 8)(RX1), RT0;
  803. movq (1 * 8)(RX1), RT1;
  804. /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
  805. movq (RY0), RY2;
  806. xorq (0 * 8)(RY2), RT0;
  807. xorq (1 * 8)(RY2), RT1;
  808. movq (0 * 8)(RX0), RAB0;
  809. movq (1 * 8)(RX0), RCD0;
  810. /* Store Offset_i */
  /* Offset_i values are stashed in dst; after encryption the ciphertext
   * is xored over them in place (see the tail of this function) */
  811. movq RT0, (0 * 8)(RY1);
  812. movq RT1, (1 * 8)(RY1);
  813. /* Checksum_i = Checksum_{i-1} xor P_i */
  /* NOTE(review): suffix-less `xor reg, mem` — operand size (64-bit) is
   * inferred from the register operand, equivalent to xorq */
  814. xor RAB0, (0 * 8)(RX2);
  815. xor RCD0, (1 * 8)(RX2);
  816. /* PX_i = P_i xor Offset_i */
  817. xorq RT0, RAB0;
  818. xorq RT1, RCD0;
  819. /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
  820. movq 8(RY0), RY2;
  821. xorq (0 * 8)(RY2), RT0;
  822. xorq (1 * 8)(RY2), RT1;
  823. movq (2 * 8)(RX0), RAB1;
  824. movq (3 * 8)(RX0), RCD1;
  825. /* Store Offset_i */
  826. movq RT0, (2 * 8)(RY1);
  827. movq RT1, (3 * 8)(RY1);
  828. /* Checksum_i = Checksum_{i-1} xor P_i */
  829. xor RAB1, (0 * 8)(RX2);
  830. xor RCD1, (1 * 8)(RX2);
  831. /* PX_i = P_i xor Offset_i */
  832. xorq RT0, RAB1;
  833. xorq RT1, RCD1;
  834. /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
  835. movq 16(RY0), RY2;
  836. xorq (0 * 8)(RY2), RT0;
  837. xorq (1 * 8)(RY2), RT1;
  838. movq (4 * 8)(RX0), RAB2;
  839. movq (5 * 8)(RX0), RCD2;
  840. /* Store Offset_i */
  841. movq RT0, (4 * 8)(RY1);
  842. movq RT1, (5 * 8)(RY1);
  843. /* Checksum_i = Checksum_{i-1} xor P_i */
  844. xor RAB2, (0 * 8)(RX2);
  845. xor RCD2, (1 * 8)(RX2);
  846. /* PX_i = P_i xor Offset_i */
  847. xorq RT0, RAB2;
  848. xorq RT1, RCD2;
  849. /* Store offset */
  850. movq RT0, (0 * 8)(RX1);
  851. movq RT1, (1 * 8)(RX1);
  852. /* CX_i = ENCIPHER(K, PX_i) */
  853. call __twofish_enc_blk3;
  854. movq (6 * 8)(%rsp), RX1; /*dst*/
  855. /* C_i = CX_i xor Offset_i */
  /* dst currently holds Offset_i, so xor-to-memory completes C_i */
  856. xorq RCD0, (0 * 8)(RX1);
  857. xorq RAB0, (1 * 8)(RX1);
  858. xorq RCD1, (2 * 8)(RX1);
  859. xorq RAB1, (3 * 8)(RX1);
  860. xorq RCD2, (4 * 8)(RX1);
  861. xorq RAB2, (5 * 8)(RX1);
  862. movq (0 * 8)(%rsp), %rbp;
  863. movq (1 * 8)(%rsp), %rbx;
  864. movq (2 * 8)(%rsp), %r12;
  865. movq (3 * 8)(%rsp), %r13;
  866. movq (4 * 8)(%rsp), %r14;
  867. movq (5 * 8)(%rsp), %r15;
  868. CFI_RESTORE(%rbp);
  869. CFI_RESTORE(%rbx);
  870. CFI_RESTORE(%r12);
  871. CFI_RESTORE(%r13);
  872. CFI_RESTORE(%r14);
  873. CFI_RESTORE(%r15);
  874. addq $(8 * 8), %rsp;
  875. CFI_ADJUST_CFA_OFFSET(-8 * 8);
  876. EXIT_SYSV_FUNC
  877. ret_spec_stop;
  878. CFI_ENDPROC();
  879. ELF(.size _gcry_twofish_amd64_ocb_enc,.-_gcry_twofish_amd64_ocb_enc;)
/*
 * _gcry_twofish_amd64_ocb_dec — OCB bulk decryption of three 16-byte blocks.
 *
 * For i = 1..3 (L[0..2] pre-resolved by the caller as L_{ntz(i)}):
 *   Offset_i = Offset_{i-1} xor L[i-1]
 *   P_i      = Offset_i xor DECIPHER(K, C_i xor Offset_i)
 *   Checksum = Checksum xor P_i
 * The final Offset_3 is written back through the offset pointer and the
 * updated checksum through the checksum pointer.  The dst buffer doubles
 * as scratch: the three per-block offsets are staged there, then
 * overwritten in place with the recovered plaintext.
 * Clobbers callee-saved %rbp, %rbx, %r12-%r15 (saved/restored on stack).
 */
880. .align 16
881. .globl _gcry_twofish_amd64_ocb_dec
882. ELF(.type _gcry_twofish_amd64_ocb_dec,@function;)
883. _gcry_twofish_amd64_ocb_dec:
884. /* input:
885. * %rdi: ctx, CTX
886. * %rsi: dst (3 blocks)
887. * %rdx: src (3 blocks)
888. * %rcx: offset
889. * %r8 : checksum
890. * %r9 : L pointers (void *L[3])
891. */
892. CFI_STARTPROC();
893. ENTER_SYSV_FUNC_PARAMS_6
/* Reserve 8 qwords: 6 for callee-saved registers, 2 for spilled args. */
894. subq $(8 * 8), %rsp;
895. CFI_ADJUST_CFA_OFFSET(8 * 8);
896. movq %rbp, (0 * 8)(%rsp);
897. movq %rbx, (1 * 8)(%rsp);
898. movq %r12, (2 * 8)(%rsp);
899. movq %r13, (3 * 8)(%rsp);
900. movq %r14, (4 * 8)(%rsp);
901. movq %r15, (5 * 8)(%rsp);
902. CFI_REL_OFFSET(%rbp, 0 * 8);
903. CFI_REL_OFFSET(%rbx, 1 * 8);
904. CFI_REL_OFFSET(%r12, 2 * 8);
905. CFI_REL_OFFSET(%r13, 3 * 8);
906. CFI_REL_OFFSET(%r14, 4 * 8);
907. CFI_REL_OFFSET(%r15, 5 * 8);
/* Spill dst and checksum pointers: the cipher call below clobbers GPRs. */
908. movq %rsi, (6 * 8)(%rsp);
909. movq %r8, (7 * 8)(%rsp);
910. movq %rdx, RX0;
911. movq %rcx, RX1;
912. movq %r9, RY0;
913. movq %rsi, RY1;
/* ---- block 1 ---- */
914. /* Load offset */
915. movq (0 * 8)(RX1), RT0;
916. movq (1 * 8)(RX1), RT1;
917. /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
918. movq (RY0), RY2;
919. xorq (0 * 8)(RY2), RT0;
920. xorq (1 * 8)(RY2), RT1;
921. movq (0 * 8)(RX0), RAB0;
922. movq (1 * 8)(RX0), RCD0;
923. /* Store Offset_i (staged in dst; consumed again after the call) */
924. movq RT0, (0 * 8)(RY1);
925. movq RT1, (1 * 8)(RY1);
926. /* CX_i = C_i xor Offset_i */
927. xorq RT0, RAB0;
928. xorq RT1, RCD0;
/* ---- block 2 ---- */
929. /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
930. movq 8(RY0), RY2;
931. xorq (0 * 8)(RY2), RT0;
932. xorq (1 * 8)(RY2), RT1;
933. movq (2 * 8)(RX0), RAB1;
934. movq (3 * 8)(RX0), RCD1;
935. /* Store Offset_i */
936. movq RT0, (2 * 8)(RY1);
937. movq RT1, (3 * 8)(RY1);
938. /* CX_i = C_i xor Offset_i */
939. xorq RT0, RAB1;
940. xorq RT1, RCD1;
/* ---- block 3 ---- */
941. /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
942. movq 16(RY0), RY2;
943. xorq (0 * 8)(RY2), RT0;
944. xorq (1 * 8)(RY2), RT1;
945. movq (4 * 8)(RX0), RAB2;
946. movq (5 * 8)(RX0), RCD2;
947. /* Store Offset_i */
948. movq RT0, (4 * 8)(RY1);
949. movq RT1, (5 * 8)(RY1);
950. /* CX_i = C_i xor Offset_i */
951. xorq RT0, RAB2;
952. xorq RT1, RCD2;
953. /* Store offset (write final Offset_3 back for the next bulk call) */
954. movq RT0, (0 * 8)(RX1);
955. movq RT1, (1 * 8)(RX1);
956. /* PX_i = DECIPHER(K, CX_i) */
957. call __twofish_dec_blk3;
958. movq (7 * 8)(%rsp), RX2; /*checksum*/
959. movq (6 * 8)(%rsp), RX1; /*dst*/
960. /* Load checksum */
961. movq (0 * 8)(RX2), RT0;
962. movq (1 * 8)(RX2), RT1;
963. /* P_i = PX_i xor Offset_i */
/* NOTE(review): the deciphered result is xored with RCDn into the low
 * qword and RABn into the high qword — presumably __twofish_dec_blk3
 * returns the block halves swapped; confirm against that routine. */
964. xorq RCD0, (0 * 8)(RX1);
965. xorq RAB0, (1 * 8)(RX1);
966. xorq RCD1, (2 * 8)(RX1);
967. xorq RAB1, (3 * 8)(RX1);
968. xorq RCD2, (4 * 8)(RX1);
969. xorq RAB2, (5 * 8)(RX1);
970. /* Checksum_i = Checksum_{i-1} xor P_i (accumulate all three plaintext blocks) */
971. xorq (0 * 8)(RX1), RT0;
972. xorq (1 * 8)(RX1), RT1;
973. xorq (2 * 8)(RX1), RT0;
974. xorq (3 * 8)(RX1), RT1;
975. xorq (4 * 8)(RX1), RT0;
976. xorq (5 * 8)(RX1), RT1;
977. /* Store checksum */
978. movq RT0, (0 * 8)(RX2);
979. movq RT1, (1 * 8)(RX2);
/* Restore callee-saved registers and tear down the frame. */
980. movq (0 * 8)(%rsp), %rbp;
981. movq (1 * 8)(%rsp), %rbx;
982. movq (2 * 8)(%rsp), %r12;
983. movq (3 * 8)(%rsp), %r13;
984. movq (4 * 8)(%rsp), %r14;
985. movq (5 * 8)(%rsp), %r15;
986. CFI_RESTORE(%rbp);
987. CFI_RESTORE(%rbx);
988. CFI_RESTORE(%r12);
989. CFI_RESTORE(%r13);
990. CFI_RESTORE(%r14);
991. CFI_RESTORE(%r15);
992. addq $(8 * 8), %rsp;
993. CFI_ADJUST_CFA_OFFSET(-8 * 8);
994. EXIT_SYSV_FUNC
995. ret_spec_stop;
996. CFI_ENDPROC();
997. ELF(.size _gcry_twofish_amd64_ocb_dec,.-_gcry_twofish_amd64_ocb_dec;)
/*
 * _gcry_twofish_amd64_ocb_auth — OCB authentication (AAD) pass over
 * three 16-byte blocks.
 *
 * For i = 1..3 (L[0..2] pre-resolved by the caller as L_{ntz(i)}):
 *   Offset_i = Offset_{i-1} xor L[i-1]
 *   C_i      = ENCIPHER(K, A_i xor Offset_i)
 * Then Checksum ^= C_1 xor C_2 xor C_3.  The final Offset_3 is written
 * back through the offset pointer.  No dst buffer: only the offset and
 * checksum are updated in memory.
 * Clobbers callee-saved %rbp, %rbx, %r12-%r15 (saved/restored on stack).
 */
998. .align 16
999. .globl _gcry_twofish_amd64_ocb_auth
1000. ELF(.type _gcry_twofish_amd64_ocb_auth,@function;)
1001. _gcry_twofish_amd64_ocb_auth:
1002. /* input:
1003. * %rdi: ctx, CTX
1004. * %rsi: abuf (3 blocks)
1005. * %rdx: offset
1006. * %rcx: checksum
1007. * %r8 : L pointers (void *L[3])
1008. */
1009. CFI_STARTPROC();
1010. ENTER_SYSV_FUNC_PARAMS_5
/* Reserve 8 qwords: 6 for callee-saved registers, 1 for the spilled
 * checksum pointer (top slot unused). */
1011. subq $(8 * 8), %rsp;
1012. CFI_ADJUST_CFA_OFFSET(8 * 8);
1013. movq %rbp, (0 * 8)(%rsp);
1014. movq %rbx, (1 * 8)(%rsp);
1015. movq %r12, (2 * 8)(%rsp);
1016. movq %r13, (3 * 8)(%rsp);
1017. movq %r14, (4 * 8)(%rsp);
1018. movq %r15, (5 * 8)(%rsp);
1019. CFI_REL_OFFSET(%rbp, 0 * 8);
1020. CFI_REL_OFFSET(%rbx, 1 * 8);
1021. CFI_REL_OFFSET(%r12, 2 * 8);
1022. CFI_REL_OFFSET(%r13, 3 * 8);
1023. CFI_REL_OFFSET(%r14, 4 * 8);
1024. CFI_REL_OFFSET(%r15, 5 * 8);
/* Spill the checksum pointer: the cipher call below clobbers GPRs. */
1025. movq %rcx, (6 * 8)(%rsp);
1026. movq %rsi, RX0;
1027. movq %rdx, RX1;
1028. movq %r8, RY0;
/* ---- block 1 ---- */
1029. /* Load offset */
1030. movq (0 * 8)(RX1), RT0;
1031. movq (1 * 8)(RX1), RT1;
1032. /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
1033. movq (RY0), RY2;
1034. xorq (0 * 8)(RY2), RT0;
1035. xorq (1 * 8)(RY2), RT1;
1036. movq (0 * 8)(RX0), RAB0;
1037. movq (1 * 8)(RX0), RCD0;
1038. /* PX_i = P_i xor Offset_i */
1039. xorq RT0, RAB0;
1040. xorq RT1, RCD0;
/* ---- block 2 ---- */
1041. /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
1042. movq 8(RY0), RY2;
1043. xorq (0 * 8)(RY2), RT0;
1044. xorq (1 * 8)(RY2), RT1;
1045. movq (2 * 8)(RX0), RAB1;
1046. movq (3 * 8)(RX0), RCD1;
1047. /* PX_i = P_i xor Offset_i */
1048. xorq RT0, RAB1;
1049. xorq RT1, RCD1;
/* ---- block 3 ---- */
1050. /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
1051. movq 16(RY0), RY2;
1052. xorq (0 * 8)(RY2), RT0;
1053. xorq (1 * 8)(RY2), RT1;
1054. movq (4 * 8)(RX0), RAB2;
1055. movq (5 * 8)(RX0), RCD2;
1056. /* PX_i = P_i xor Offset_i */
1057. xorq RT0, RAB2;
1058. xorq RT1, RCD2;
1059. /* Store offset (write final Offset_3 back for the next bulk call) */
1060. movq RT0, (0 * 8)(RX1);
1061. movq RT1, (1 * 8)(RX1);
1062. /* C_i = ENCIPHER(K, PX_i) */
1063. call __twofish_enc_blk3;
1064. movq (6 * 8)(%rsp), RX1; /*checksum*/
1065. /* Checksum ^= C_1 xor C_2 xor C_3: fold the three ciphertext blocks pairwise into block 3, then xor into the stored checksum */
1066. xorq RCD0, RCD1;
1067. xorq RAB0, RAB1;
1068. xorq RCD1, RCD2;
1069. xorq RAB1, RAB2;
1070. xorq RCD2, (0 * 8)(RX1);
1071. xorq RAB2, (1 * 8)(RX1);
/* Restore callee-saved registers and tear down the frame. */
1072. movq (0 * 8)(%rsp), %rbp;
1073. movq (1 * 8)(%rsp), %rbx;
1074. movq (2 * 8)(%rsp), %r12;
1075. movq (3 * 8)(%rsp), %r13;
1076. movq (4 * 8)(%rsp), %r14;
1077. movq (5 * 8)(%rsp), %r15;
1078. CFI_RESTORE(%rbp);
1079. CFI_RESTORE(%rbx);
1080. CFI_RESTORE(%r12);
1081. CFI_RESTORE(%r13);
1082. CFI_RESTORE(%r14);
1083. CFI_RESTORE(%r15);
1084. addq $(8 * 8), %rsp;
1085. CFI_ADJUST_CFA_OFFSET(-8 * 8);
1086. EXIT_SYSV_FUNC
1087. ret_spec_stop;
1088. CFI_ENDPROC();
1089. ELF(.size _gcry_twofish_amd64_ocb_auth,.-_gcry_twofish_amd64_ocb_auth;)
  1090. #endif /*USE_TWOFISH*/
  1091. #endif /*__x86_64*/