/*
 * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on crypto/serpent.c by
 *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
 *                2003 Herbert Valerio Riedel <hvr@gnu.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 *
 */
#include <linux/linkage.h>

.file "serpent-sse2-i586-asm_32.S"
.text
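
/*
 * Byte offsets of the C call arguments relative to %esp on entry
 * (the return address sits at 0(%esp) in the i386 calling convention).
 */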
#define arg_ctx 4
#define arg_dst 8
#define arg_src 12
#define arg_xor 16

/**********************************************************************
  4-way SSE2 serpent
 **********************************************************************/
#define CTX %edx

#define RA %xmm0
#define RB %xmm1
#define RC %xmm2
#define RD %xmm3
#define RE %xmm4

#define RT0 %xmm5
#define RT1 %xmm6

#define RNOT %xmm7
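
/*
 * get_key(i, j, t): load word j of round subkey i from the expanded key in
 * the context (CTX) and broadcast it into all four 32-bit lanes of t.
 */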
#define get_key(i, j, t) \
	movd (4*(i)+(j))*4(CTX), t; \
	pshufd $0, t, t;
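
/*
 * K(x0..x3, x4, i): key mixing - XOR each of the four parallel state words
 * with the corresponding word of round subkey i (x4 is used as scratch).
 */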
#define K(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, x4); \
	get_key(i, 1, RT0); \
	get_key(i, 2, RT1); \
	pxor x4, x0; \
	pxor RT0, x1; \
	pxor RT1, x2; \
	get_key(i, 3, x4); \
	pxor x4, x3;
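
/*
 * LK(x0..x3, x4, i): Serpent linear transformation (rotates, shifts and
 * XORs), interleaved with mixing of round subkey i; used between the S-box
 * applications on the encryption path.
 */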
#define LK(x0, x1, x2, x3, x4, i) \
	movdqa x0, x4; \
	pslld $13, x0; \
	psrld $(32 - 13), x4; \
	por x4, x0; \
	pxor x0, x1; \
	movdqa x2, x4; \
	pslld $3, x2; \
	psrld $(32 - 3), x4; \
	por x4, x2; \
	pxor x2, x1; \
	movdqa x1, x4; \
	pslld $1, x1; \
	psrld $(32 - 1), x4; \
	por x4, x1; \
	movdqa x0, x4; \
	pslld $3, x4; \
	pxor x2, x3; \
	pxor x4, x3; \
	movdqa x3, x4; \
	pslld $7, x3; \
	psrld $(32 - 7), x4; \
	por x4, x3; \
	movdqa x1, x4; \
	pslld $7, x4; \
	pxor x1, x0; \
	pxor x3, x0; \
	pxor x3, x2; \
	pxor x4, x2; \
	movdqa x0, x4; \
	get_key(i, 1, RT0); \
	pxor RT0, x1; \
	get_key(i, 3, RT0); \
	pxor RT0, x3; \
	pslld $5, x0; \
	psrld $(32 - 5), x4; \
	por x4, x0; \
	movdqa x2, x4; \
	pslld $22, x2; \
	psrld $(32 - 22), x4; \
	por x4, x2; \
	get_key(i, 0, RT0); \
	pxor RT0, x0; \
	get_key(i, 2, RT0); \
	pxor RT0, x2;
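
/*
 * KL(x0..x3, x4, i): mixing of round subkey i followed by the inverse of
 * the linear transformation; used between the inverse S-box applications
 * on the decryption path.
 */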
#define KL(x0, x1, x2, x3, x4, i) \
	K(x0, x1, x2, x3, x4, i); \
	movdqa x0, x4; \
	psrld $5, x0; \
	pslld $(32 - 5), x4; \
	por x4, x0; \
	movdqa x2, x4; \
	psrld $22, x2; \
	pslld $(32 - 22), x4; \
	por x4, x2; \
	pxor x3, x2; \
	pxor x3, x0; \
	movdqa x1, x4; \
	pslld $7, x4; \
	pxor x1, x0; \
	pxor x4, x2; \
	movdqa x1, x4; \
	psrld $1, x1; \
	pslld $(32 - 1), x4; \
	por x4, x1; \
	movdqa x3, x4; \
	psrld $7, x3; \
	pslld $(32 - 7), x4; \
	por x4, x3; \
	pxor x0, x1; \
	movdqa x0, x4; \
	pslld $3, x4; \
	pxor x4, x3; \
	movdqa x0, x4; \
	psrld $13, x0; \
	pslld $(32 - 13), x4; \
	por x4, x0; \
	pxor x2, x1; \
	pxor x2, x3; \
	movdqa x2, x4; \
	psrld $3, x2; \
	pslld $(32 - 3), x4; \
	por x4, x2;
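
/*
 * S0..S7: the eight Serpent S-boxes, computed bitsliced with logical
 * operations on the four parallel state registers; x4 is a temporary and
 * RNOT (all ones) implements bitwise NOT via pxor.
 */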
#define S0(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	por x0, x3; \
	pxor x4, x0; \
	pxor x2, x4; \
	pxor RNOT, x4; \
	pxor x1, x3; \
	pand x0, x1; \
	pxor x4, x1; \
	pxor x0, x2; \
	pxor x3, x0; \
	por x0, x4; \
	pxor x2, x0; \
	pand x1, x2; \
	pxor x2, x3; \
	pxor RNOT, x1; \
	pxor x4, x2; \
	pxor x2, x1;

#define S1(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x0, x1; \
	pxor x3, x0; \
	pxor RNOT, x3; \
	pand x1, x4; \
	por x1, x0; \
	pxor x2, x3; \
	pxor x3, x0; \
	pxor x3, x1; \
	pxor x4, x3; \
	por x4, x1; \
	pxor x2, x4; \
	pand x0, x2; \
	pxor x1, x2; \
	por x0, x1; \
	pxor RNOT, x0; \
	pxor x2, x0; \
	pxor x1, x4;

#define S2(x0, x1, x2, x3, x4) \
	pxor RNOT, x3; \
	pxor x0, x1; \
	movdqa x0, x4; \
	pand x2, x0; \
	pxor x3, x0; \
	por x4, x3; \
	pxor x1, x2; \
	pxor x1, x3; \
	pand x0, x1; \
	pxor x2, x0; \
	pand x3, x2; \
	por x1, x3; \
	pxor RNOT, x0; \
	pxor x0, x3; \
	pxor x0, x4; \
	pxor x2, x0; \
	por x2, x1;

#define S3(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x3, x1; \
	por x0, x3; \
	pand x0, x4; \
	pxor x2, x0; \
	pxor x1, x2; \
	pand x3, x1; \
	pxor x3, x2; \
	por x4, x0; \
	pxor x3, x4; \
	pxor x0, x1; \
	pand x3, x0; \
	pand x4, x3; \
	pxor x2, x3; \
	por x1, x4; \
	pand x1, x2; \
	pxor x3, x4; \
	pxor x3, x0; \
	pxor x2, x3;

#define S4(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pand x0, x3; \
	pxor x4, x0; \
	pxor x2, x3; \
	por x4, x2; \
	pxor x1, x0; \
	pxor x3, x4; \
	por x0, x2; \
	pxor x1, x2; \
	pand x0, x1; \
	pxor x4, x1; \
	pand x2, x4; \
	pxor x3, x2; \
	pxor x0, x4; \
	por x1, x3; \
	pxor RNOT, x1; \
	pxor x0, x3;

#define S5(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	por x0, x1; \
	pxor x1, x2; \
	pxor RNOT, x3; \
	pxor x0, x4; \
	pxor x2, x0; \
	pand x4, x1; \
	por x3, x4; \
	pxor x0, x4; \
	pand x3, x0; \
	pxor x3, x1; \
	pxor x2, x3; \
	pxor x1, x0; \
	pand x4, x2; \
	pxor x2, x1; \
	pand x0, x2; \
	pxor x2, x3;

#define S6(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x0, x3; \
	pxor x2, x1; \
	pxor x0, x2; \
	pand x3, x0; \
	por x3, x1; \
	pxor RNOT, x4; \
	pxor x1, x0; \
	pxor x2, x1; \
	pxor x4, x3; \
	pxor x0, x4; \
	pand x0, x2; \
	pxor x1, x4; \
	pxor x3, x2; \
	pand x1, x3; \
	pxor x0, x3; \
	pxor x2, x1;

#define S7(x0, x1, x2, x3, x4) \
	pxor RNOT, x1; \
	movdqa x1, x4; \
	pxor RNOT, x0; \
	pand x2, x1; \
	pxor x3, x1; \
	por x4, x3; \
	pxor x2, x4; \
	pxor x3, x2; \
	pxor x0, x3; \
	por x1, x0; \
	pand x0, x2; \
	pxor x4, x0; \
	pxor x3, x4; \
	pand x0, x3; \
	pxor x1, x4; \
	pxor x4, x2; \
	pxor x1, x3; \
	por x0, x4; \
	pxor x1, x4;
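
/*
 * SI0..SI7: the corresponding inverse S-boxes, used on the decryption path.
 */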
#define SI0(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pxor x0, x1; \
	por x1, x3; \
	pxor x1, x4; \
	pxor RNOT, x0; \
	pxor x3, x2; \
	pxor x0, x3; \
	pand x1, x0; \
	pxor x2, x0; \
	pand x3, x2; \
	pxor x4, x3; \
	pxor x3, x2; \
	pxor x3, x1; \
	pand x0, x3; \
	pxor x0, x1; \
	pxor x2, x0; \
	pxor x3, x4;

#define SI1(x0, x1, x2, x3, x4) \
	pxor x3, x1; \
	movdqa x0, x4; \
	pxor x2, x0; \
	pxor RNOT, x2; \
	por x1, x4; \
	pxor x3, x4; \
	pand x1, x3; \
	pxor x2, x1; \
	pand x4, x2; \
	pxor x1, x4; \
	por x3, x1; \
	pxor x0, x3; \
	pxor x0, x2; \
	por x4, x0; \
	pxor x4, x2; \
	pxor x0, x1; \
	pxor x1, x4;

#define SI2(x0, x1, x2, x3, x4) \
	pxor x1, x2; \
	movdqa x3, x4; \
	pxor RNOT, x3; \
	por x2, x3; \
	pxor x4, x2; \
	pxor x0, x4; \
	pxor x1, x3; \
	por x2, x1; \
	pxor x0, x2; \
	pxor x4, x1; \
	por x3, x4; \
	pxor x3, x2; \
	pxor x2, x4; \
	pand x1, x2; \
	pxor x3, x2; \
	pxor x4, x3; \
	pxor x0, x4;

#define SI3(x0, x1, x2, x3, x4) \
	pxor x1, x2; \
	movdqa x1, x4; \
	pand x2, x1; \
	pxor x0, x1; \
	por x4, x0; \
	pxor x3, x4; \
	pxor x3, x0; \
	por x1, x3; \
	pxor x2, x1; \
	pxor x3, x1; \
	pxor x2, x0; \
	pxor x3, x2; \
	pand x1, x3; \
	pxor x0, x1; \
	pand x2, x0; \
	pxor x3, x4; \
	pxor x0, x3; \
	pxor x1, x0;

#define SI4(x0, x1, x2, x3, x4) \
	pxor x3, x2; \
	movdqa x0, x4; \
	pand x1, x0; \
	pxor x2, x0; \
	por x3, x2; \
	pxor RNOT, x4; \
	pxor x0, x1; \
	pxor x2, x0; \
	pand x4, x2; \
	pxor x0, x2; \
	por x4, x0; \
	pxor x3, x0; \
	pand x2, x3; \
	pxor x3, x4; \
	pxor x1, x3; \
	pand x0, x1; \
	pxor x1, x4; \
	pxor x3, x0;

#define SI5(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	por x2, x1; \
	pxor x4, x2; \
	pxor x3, x1; \
	pand x4, x3; \
	pxor x3, x2; \
	por x0, x3; \
	pxor RNOT, x0; \
	pxor x2, x3; \
	por x0, x2; \
	pxor x1, x4; \
	pxor x4, x2; \
	pand x0, x4; \
	pxor x1, x0; \
	pxor x3, x1; \
	pand x2, x0; \
	pxor x3, x2; \
	pxor x2, x0; \
	pxor x4, x2; \
	pxor x3, x4;

#define SI6(x0, x1, x2, x3, x4) \
	pxor x2, x0; \
	movdqa x0, x4; \
	pand x3, x0; \
	pxor x3, x2; \
	pxor x2, x0; \
	pxor x1, x3; \
	por x4, x2; \
	pxor x3, x2; \
	pand x0, x3; \
	pxor RNOT, x0; \
	pxor x1, x3; \
	pand x2, x1; \
	pxor x0, x4; \
	pxor x4, x3; \
	pxor x2, x4; \
	pxor x1, x0; \
	pxor x0, x2;

#define SI7(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pand x0, x3; \
	pxor x2, x0; \
	por x4, x2; \
	pxor x1, x4; \
	pxor RNOT, x0; \
	por x3, x1; \
	pxor x0, x4; \
	pand x2, x0; \
	pxor x1, x0; \
	pand x2, x1; \
	pxor x2, x3; \
	pxor x3, x4; \
	pand x3, x2; \
	por x0, x3; \
	pxor x4, x1; \
	pxor x4, x3; \
	pand x0, x4; \
	pxor x2, x4;
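
/*
 * transpose_4x4: transpose a 4x4 matrix of 32-bit words held in x0..x3
 * using punpck{l,h}dq / punpck{l,h}qdq; t1 and t2 are clobbered as
 * temporaries (t0 is unused here).
 */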
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	movdqa x0, t2; \
	punpckldq x1, x0; \
	punpckhdq x1, t2; \
	movdqa x2, t1; \
	punpckhdq x3, x2; \
	punpckldq x3, t1; \
	movdqa x0, x1; \
	punpcklqdq t1, x0; \
	punpckhqdq t1, x1; \
	movdqa t2, x3; \
	punpcklqdq x2, t2; \
	punpckhqdq x2, x3; \
	movdqa t2, x2;
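
/*
 * read_blocks/write_blocks: load or store four 16-byte blocks with
 * unaligned moves and transpose so that each xmm register holds the same
 * 32-bit word of all four blocks. xor_blocks transposes back and XORs the
 * result into the 64 bytes already present at the destination.
 */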
#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
	movdqu (0*4*4)(in), x0; \
	movdqu (1*4*4)(in), x1; \
	movdqu (2*4*4)(in), x2; \
	movdqu (3*4*4)(in), x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu x0, (0*4*4)(out); \
	movdqu x1, (1*4*4)(out); \
	movdqu x2, (2*4*4)(out); \
	movdqu x3, (3*4*4)(out);

#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu (0*4*4)(out), t0; \
	pxor t0, x0; \
	movdqu x0, (0*4*4)(out); \
	movdqu (1*4*4)(out), t0; \
	pxor t0, x1; \
	movdqu x1, (1*4*4)(out); \
	movdqu (2*4*4)(out), t0; \
	pxor t0, x2; \
	movdqu x2, (2*4*4)(out); \
	movdqu (3*4*4)(out), t0; \
	pxor t0, x3; \
	movdqu x3, (3*4*4)(out);

ENTRY(__serpent_enc_blk_4way)
	/* input:
	 *	arg_ctx(%esp): ctx, CTX
	 *	arg_dst(%esp): dst
	 *	arg_src(%esp): src
	 *	arg_xor(%esp): bool, if true: xor output
	 */
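
	/*
	 * Presumed matching C declaration in the SSE2 glue code (shown for
	 * reference only; the exact prototype lives in the accompanying
	 * header):
	 *
	 *	asmlinkage void __serpent_enc_blk_4way(struct serpent_ctx *ctx,
	 *					       u8 *dst, const u8 *src,
	 *					       bool xor);
	 */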

	pcmpeqd RNOT, RNOT;

	movl arg_ctx(%esp), CTX;

	movl arg_src(%esp), %eax;
	read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	K(RA, RB, RC, RD, RE, 0);
	S0(RA, RB, RC, RD, RE); LK(RC, RB, RD, RA, RE, 1);
	S1(RC, RB, RD, RA, RE); LK(RE, RD, RA, RC, RB, 2);
	S2(RE, RD, RA, RC, RB); LK(RB, RD, RE, RC, RA, 3);
	S3(RB, RD, RE, RC, RA); LK(RC, RA, RD, RB, RE, 4);
	S4(RC, RA, RD, RB, RE); LK(RA, RD, RB, RE, RC, 5);
	S5(RA, RD, RB, RE, RC); LK(RC, RA, RD, RE, RB, 6);
	S6(RC, RA, RD, RE, RB); LK(RD, RB, RA, RE, RC, 7);
	S7(RD, RB, RA, RE, RC); LK(RC, RA, RE, RD, RB, 8);
	S0(RC, RA, RE, RD, RB); LK(RE, RA, RD, RC, RB, 9);
	S1(RE, RA, RD, RC, RB); LK(RB, RD, RC, RE, RA, 10);
	S2(RB, RD, RC, RE, RA); LK(RA, RD, RB, RE, RC, 11);
	S3(RA, RD, RB, RE, RC); LK(RE, RC, RD, RA, RB, 12);
	S4(RE, RC, RD, RA, RB); LK(RC, RD, RA, RB, RE, 13);
	S5(RC, RD, RA, RB, RE); LK(RE, RC, RD, RB, RA, 14);
	S6(RE, RC, RD, RB, RA); LK(RD, RA, RC, RB, RE, 15);
	S7(RD, RA, RC, RB, RE); LK(RE, RC, RB, RD, RA, 16);
	S0(RE, RC, RB, RD, RA); LK(RB, RC, RD, RE, RA, 17);
	S1(RB, RC, RD, RE, RA); LK(RA, RD, RE, RB, RC, 18);
	S2(RA, RD, RE, RB, RC); LK(RC, RD, RA, RB, RE, 19);
	S3(RC, RD, RA, RB, RE); LK(RB, RE, RD, RC, RA, 20);
	S4(RB, RE, RD, RC, RA); LK(RE, RD, RC, RA, RB, 21);
	S5(RE, RD, RC, RA, RB); LK(RB, RE, RD, RA, RC, 22);
	S6(RB, RE, RD, RA, RC); LK(RD, RC, RE, RA, RB, 23);
	S7(RD, RC, RE, RA, RB); LK(RB, RE, RA, RD, RC, 24);
	S0(RB, RE, RA, RD, RC); LK(RA, RE, RD, RB, RC, 25);
	S1(RA, RE, RD, RB, RC); LK(RC, RD, RB, RA, RE, 26);
	S2(RC, RD, RB, RA, RE); LK(RE, RD, RC, RA, RB, 27);
	S3(RE, RD, RC, RA, RB); LK(RA, RB, RD, RE, RC, 28);
	S4(RA, RB, RD, RE, RC); LK(RB, RD, RE, RC, RA, 29);
	S5(RB, RD, RE, RC, RA); LK(RA, RB, RD, RC, RE, 30);
	S6(RA, RB, RD, RC, RE); LK(RD, RE, RB, RC, RA, 31);
	S7(RD, RE, RB, RC, RA); K(RA, RB, RC, RD, RE, 32);

	movl arg_dst(%esp), %eax;

	cmpb $0, arg_xor(%esp);
	jnz .L__enc_xor4;

	write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	ret;

.L__enc_xor4:
	xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	ret;
ENDPROC(__serpent_enc_blk_4way)

ENTRY(serpent_dec_blk_4way)
	/* input:
	 *	arg_ctx(%esp): ctx, CTX
	 *	arg_dst(%esp): dst
	 *	arg_src(%esp): src
	 */
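
	/*
	 * Presumed matching C declaration in the SSE2 glue code (shown for
	 * reference only; the exact prototype lives in the accompanying
	 * header):
	 *
	 *	asmlinkage void serpent_dec_blk_4way(struct serpent_ctx *ctx,
	 *					     u8 *dst, const u8 *src);
	 */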

	pcmpeqd RNOT, RNOT;

	movl arg_ctx(%esp), CTX;

	movl arg_src(%esp), %eax;
	read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	K(RA, RB, RC, RD, RE, 32);
	SI7(RA, RB, RC, RD, RE); KL(RB, RD, RA, RE, RC, 31);
	SI6(RB, RD, RA, RE, RC); KL(RA, RC, RE, RB, RD, 30);
	SI5(RA, RC, RE, RB, RD); KL(RC, RD, RA, RE, RB, 29);
	SI4(RC, RD, RA, RE, RB); KL(RC, RA, RB, RE, RD, 28);
	SI3(RC, RA, RB, RE, RD); KL(RB, RC, RD, RE, RA, 27);
	SI2(RB, RC, RD, RE, RA); KL(RC, RA, RE, RD, RB, 26);
	SI1(RC, RA, RE, RD, RB); KL(RB, RA, RE, RD, RC, 25);
	SI0(RB, RA, RE, RD, RC); KL(RE, RC, RA, RB, RD, 24);
	SI7(RE, RC, RA, RB, RD); KL(RC, RB, RE, RD, RA, 23);
	SI6(RC, RB, RE, RD, RA); KL(RE, RA, RD, RC, RB, 22);
	SI5(RE, RA, RD, RC, RB); KL(RA, RB, RE, RD, RC, 21);
	SI4(RA, RB, RE, RD, RC); KL(RA, RE, RC, RD, RB, 20);
	SI3(RA, RE, RC, RD, RB); KL(RC, RA, RB, RD, RE, 19);
	SI2(RC, RA, RB, RD, RE); KL(RA, RE, RD, RB, RC, 18);
	SI1(RA, RE, RD, RB, RC); KL(RC, RE, RD, RB, RA, 17);
	SI0(RC, RE, RD, RB, RA); KL(RD, RA, RE, RC, RB, 16);
	SI7(RD, RA, RE, RC, RB); KL(RA, RC, RD, RB, RE, 15);
	SI6(RA, RC, RD, RB, RE); KL(RD, RE, RB, RA, RC, 14);
	SI5(RD, RE, RB, RA, RC); KL(RE, RC, RD, RB, RA, 13);
	SI4(RE, RC, RD, RB, RA); KL(RE, RD, RA, RB, RC, 12);
	SI3(RE, RD, RA, RB, RC); KL(RA, RE, RC, RB, RD, 11);
	SI2(RA, RE, RC, RB, RD); KL(RE, RD, RB, RC, RA, 10);
	SI1(RE, RD, RB, RC, RA); KL(RA, RD, RB, RC, RE, 9);
	SI0(RA, RD, RB, RC, RE); KL(RB, RE, RD, RA, RC, 8);
	SI7(RB, RE, RD, RA, RC); KL(RE, RA, RB, RC, RD, 7);
	SI6(RE, RA, RB, RC, RD); KL(RB, RD, RC, RE, RA, 6);
	SI5(RB, RD, RC, RE, RA); KL(RD, RA, RB, RC, RE, 5);
	SI4(RD, RA, RB, RC, RE); KL(RD, RB, RE, RC, RA, 4);
	SI3(RD, RB, RE, RC, RA); KL(RE, RD, RA, RC, RB, 3);
	SI2(RE, RD, RA, RC, RB); KL(RD, RB, RC, RA, RE, 2);
	SI1(RD, RB, RC, RA, RE); KL(RE, RB, RC, RA, RD, 1);
	SI0(RE, RB, RC, RA, RD); K(RC, RD, RB, RE, RA, 0);

	movl arg_dst(%esp), %eax;
	write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);

	ret;
ENDPROC(serpent_dec_blk_4way)