blowfish-amd64.S 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602
  1. /* blowfish-amd64.S - AMD64 assembly implementation of Blowfish cipher
  2. *
  3. * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  4. *
  5. * This file is part of Libgcrypt.
  6. *
  7. * Libgcrypt is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU Lesser General Public License as
  9. * published by the Free Software Foundation; either version 2.1 of
  10. * the License, or (at your option) any later version.
  11. *
  12. * Libgcrypt is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. * GNU Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with this program; if not, see <http://www.gnu.org/licenses/>.
  19. */
  20. #ifdef __x86_64
  21. #include <config.h>
  22. #if defined(USE_BLOWFISH) && \
  23. (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
  24. defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
  25. #include "asm-common-amd64.h"
  26. .text
  /*
   * Byte offsets into BLOWFISH_context, relative to CTX: four lookup
   * tables s0..s3 of 256 four-byte entries each, followed by the
   * round-key array p.
   */
  #define s0      0
  #define s1      ((s0) + 4 * 256)
  #define s2      ((s1) + 4 * 256)
  #define s3      ((s2) + 4 * 256)
  #define p       ((s3) + 4 * 256)
  /*
   * Register roles.
   *
   * Every RXn data register is listed together with its 32-bit (d),
   * lowest-byte (bl) and second-byte (bh) views.  RIO and RT1 are the
   * same physical register (%rsi), so RIO does not survive F()/F4().
   */

  /* pointers and preloaded key */
  #define CTX     %rdi
  #define RIO     %rsi
  #define RKEY    %r10

  /* data register 0 */
  #define RX0     %rax
  #define RX0d    %eax
  #define RX0bl   %al
  #define RX0bh   %ah

  /* data register 1 */
  #define RX1     %rbx
  #define RX1d    %ebx
  #define RX1bl   %bl
  #define RX1bh   %bh

  /* data register 2 */
  #define RX2     %rcx
  #define RX2d    %ecx
  #define RX2bl   %cl
  #define RX2bh   %ch

  /* data register 3 */
  #define RX3     %rdx
  #define RX3d    %edx
  #define RX3bl   %dl
  #define RX3bh   %dh

  /* temporaries, 64-bit and 32-bit views */
  #define RT0     %rbp
  #define RT0d    %ebp
  #define RT1     %rsi
  #define RT1d    %esi
  #define RT2     %r8
  #define RT2d    %r8d
  #define RT3     %r9
  #define RT3d    %r9d
  61. /***********************************************************************
  62. * 1-way blowfish
  63. ***********************************************************************/
  /*
   * F(): one Feistel step on the 64-bit state held in RX0.
   *
   * The four bytes of the low 32 bits of RX0 are used as table indices:
   *   bits 31..24 -> s0,  bits 23..16 -> s1,
   *   bits 15..8  -> s2,  bits  7..0  -> s3
   * and the entries are combined as ((s0 + s1) ^ s2) + s3 in RT0d.
   *
   * The two "rorq $16" rotate RX0 by 32 bits in total, so on exit the
   * halves of RX0 are exchanged and the 32-bit result has been XORed
   * into what was the upper half on entry (RT0d is zero-extended, so
   * the xorq only changes the low 32 bits).
   *
   * The bh views are only ever moved into RT0d/RT1d (%ebp/%esi) because
   * %ah..%dh cannot be encoded in an instruction that also names
   * %r8-%r15.
   *
   * Clobbers RT0, RT1, RT2, RT3 and flags.
   */
  #define F() \
          movzbl RX0bh,                   RT1d; \
          movzbl RX0bl,                   RT3d; \
          rorq   $16,                     RX0;  \
          movzbl RX0bh,                   RT0d; \
          movzbl RX0bl,                   RT2d; \
          rorq   $16,                     RX0;  \
          movl   s0(CTX,RT0,4),           RT0d; \
          addl   s1(CTX,RT2,4),           RT0d; \
          xorl   s2(CTX,RT1,4),           RT0d; \
          addl   s3(CTX,RT3,4),           RT0d; \
          xorq   RT0,                     RX0;
  /*
   * load_roundkey_enc(n): RX3 = the 8 bytes at p + 4*n, i.e. 32-bit
   * subkey n in the low half and subkey n+1 in the high half.
   */
  #define load_roundkey_enc(n) \
          movq   p+4*(n)(CTX),            RX3;

  /* add_roundkey_enc(): XOR the key pair currently in RX3 into RX0. */
  #define add_roundkey_enc() \
          xorq   RX3,                     RX0;

  /*
   * round_enc(n): apply the key pair loaded earlier, fetch the pair
   * starting at subkey n for the following invocation, then run two
   * F() steps.  RX3 must already hold a key pair on entry.
   */
  #define round_enc(n) \
          add_roundkey_enc(); \
          load_roundkey_enc(n); \
          \
          F(); \
          F();
  /*
   * load_roundkey_dec(n): RX3 = the 8 bytes at p + 4*(n-1), rotated by
   * 32 bits, so 32-bit subkey n ends up in the low half and subkey n-1
   * in the high half (the reverse of load_roundkey_enc).
   *
   * The argument is parenthesised so that an expression argument cannot
   * change the computed offset; this matches preload_roundkey_dec.
   */
  #define load_roundkey_dec(n) \
          movq   p+4*((n)-1)(CTX),        RX3; \
          rorq   $32,                     RX3;

  /* add_roundkey_dec(): XOR the key pair currently in RX3 into RX0. */
  #define add_roundkey_dec() \
          xorq   RX3,                     RX0;

  /*
   * round_dec(n): apply the key pair loaded earlier, fetch the pair
   * ending at subkey n for the following invocation, then run two F()
   * steps.  RX3 must already hold a key pair on entry.
   */
  #define round_dec(n) \
          add_roundkey_dec(); \
          load_roundkey_dec(n); \
          \
          F(); \
          F();
  /*
   * read_block(): load the 8-byte block at (RIO) into RX0.  Rotating by
   * 32 and then reversing all eight bytes byte-swaps each 32-bit word
   * while keeping the word at the lower address in the low half of RX0.
   */
  #define read_block() \
          movq   (RIO),                   RX0; \
          rorq   $32,                     RX0; \
          bswapq                          RX0;

  /*
   * write_block(): store RX0 to the 8 bytes at (RIO) after reversing all
   * eight bytes.  Without a preceding rotate this byte-swaps each 32-bit
   * word and also exchanges the two words.
   */
  #define write_block() \
          bswapq                          RX0; \
          movq   RX0,                     (RIO);
  .align 16
  ELF(.type __blowfish_enc_blk1,@function;)

  __blowfish_enc_blk1:
          /* Internal helper: encrypt the single block held in RX0.
           *
           * input:
           *      %rdi: ctx, CTX
           *      RX0: input plaintext block
           * output:
           *      RX0: output ciphertext block
           * clobbers:
           *      RX3, RT1, RT2, RT3, %r11 and flags.  RT0 (%rbp) is used
           *      as scratch by F() but is parked in %r11 and restored
           *      before returning.
           */
          CFI_STARTPROC();

          /* %rbp is callee-saved; keep its value in %r11 instead of on
           * the stack. */
          movq %rbp, %r11;
          CFI_REGISTER(%rbp, %r11);

          /* Subkeys 0/1 are applied by the first round_enc; each
           * round_enc(n) loads the pair starting at subkey n. */
          load_roundkey_enc(0);
          round_enc(2);
          round_enc(4);
          round_enc(6);
          round_enc(8);
          round_enc(10);
          round_enc(12);
          round_enc(14);
          round_enc(16);
          /* Final key addition with the pair loaded by round_enc(16). */
          add_roundkey_enc();

          movq %r11, %rbp;
          CFI_RESTORE(%rbp);

          ret_spec_stop;
          CFI_ENDPROC();
  ELF(.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;)
  .align 16
  .globl _gcry_blowfish_amd64_do_encrypt
  ELF(.type _gcry_blowfish_amd64_do_encrypt,@function;)

  _gcry_blowfish_amd64_do_encrypt:
          /* Encrypt the two 32-bit words *ret_xl / *ret_xr in place.
           *
           * input:
           *      %rdi: ctx, CTX
           *      %rsi: u32 *ret_xl
           *      %rdx: u32 *ret_xr
           *
           * ENTER_SYSV_FUNC_PARAMS_0_4 / EXIT_SYSV_FUNC come from
           * asm-common-amd64.h (presumably the Win64 <-> SysV argument
           * shim; see that header).
           */
          CFI_STARTPROC();
          ENTER_SYSV_FUNC_PARAMS_0_4

          /* Keep both pointers in registers that __blowfish_enc_blk1
           * leaves alone; %rsi and %rdx are overwritten by it. */
          movq %rdx, %r10;                /* %r10 = ret_xr */
          movq %rsi, RX2;                 /* RX2  = ret_xl */

          /* RX0 = (*ret_xr << 32) | *ret_xl */
          movl (%rdx), RX0d;
          shlq $32, RX0;
          movl (%rsi), RT3d;
          orq RT3, RX0;

          call __blowfish_enc_blk1;

          /* Low half of the result goes to *ret_xr, high half to
           * *ret_xl. */
          movl RX0d, (%r10);
          shrq $32, RX0;
          movl RX0d, (RX2);

          EXIT_SYSV_FUNC
          ret_spec_stop;
          CFI_ENDPROC();
  ELF(.size _gcry_blowfish_amd64_do_encrypt,.-_gcry_blowfish_amd64_do_encrypt;)
  .align 16
  .globl _gcry_blowfish_amd64_encrypt_block
  ELF(.type _gcry_blowfish_amd64_encrypt_block,@function;)

  _gcry_blowfish_amd64_encrypt_block:
          /* Encrypt one 8-byte block from src into dst.
           *
           * input:
           *      %rdi: ctx, CTX
           *      %rsi: dst
           *      %rdx: src
           */
          CFI_STARTPROC();
          ENTER_SYSV_FUNC_PARAMS_0_4

          /* dst must leave %rsi before RIO (also %rsi) is pointed at
           * src; %r10 is not touched by __blowfish_enc_blk1. */
          movq %rsi, %r10;
          movq %rdx, RIO;

          read_block();

          call __blowfish_enc_blk1;

          /* RIO was clobbered as RT1 during the rounds; reload it with
           * dst before storing. */
          movq %r10, RIO;
          write_block();

          EXIT_SYSV_FUNC
          ret_spec_stop;
          CFI_ENDPROC();
  ELF(.size _gcry_blowfish_amd64_encrypt_block,.-_gcry_blowfish_amd64_encrypt_block;)
  .align 16
  .globl _gcry_blowfish_amd64_decrypt_block
  ELF(.type _gcry_blowfish_amd64_decrypt_block,@function;)

  _gcry_blowfish_amd64_decrypt_block:
          /* Decrypt one 8-byte block from src into dst.  The rounds are
           * expanded inline; no helper is called.
           *
           * input:
           *      %rdi: ctx, CTX
           *      %rsi: dst
           *      %rdx: src
           */
          CFI_STARTPROC();
          ENTER_SYSV_FUNC_PARAMS_0_4

          /* dst must leave %rsi before RIO (also %rsi) is pointed at
           * src. */
          movq %rsi, %r10;
          movq %rdx, RIO;

          /* %rbp is callee-saved and is used as RT0 by F(); park it in
           * %r11 for the duration of the rounds. */
          movq %rbp, %r11;
          CFI_REGISTER(%rbp, %r11);

          read_block();

          /* Subkeys are consumed from the top of the array downwards. */
          load_roundkey_dec(17);
          round_dec(15);
          round_dec(13);
          round_dec(11);
          round_dec(9);
          round_dec(7);
          round_dec(5);
          round_dec(3);
          round_dec(1);
          /* Final key addition with the pair loaded by round_dec(1). */
          add_roundkey_dec();

          /* RIO was clobbered as RT1 during the rounds. */
          movq %r10, RIO;
          write_block();

          movq %r11, %rbp;
          CFI_RESTORE(%rbp);

          EXIT_SYSV_FUNC
          ret_spec_stop;
          CFI_ENDPROC();
  ELF(.size _gcry_blowfish_amd64_decrypt_block,.-_gcry_blowfish_amd64_decrypt_block;)
  211. /**********************************************************************
  212. 4-way blowfish, four blocks parallel
  213. **********************************************************************/
  /*
   * F4(x): the same Feistel step as F(), applied to the data register
   * named by x.
   *
   * x must be one of the macro names RX0..RX3 (not a raw register): the
   * byte views are formed by pasting "bh"/"bl" onto the name, and the
   * pasted name is then expanded through the register macros.
   *
   * Table index assignment on the low 32 bits of x:
   *   bits 31..24 -> s0,  bits 23..16 -> s1,
   *   bits 15..8  -> s2,  bits  7..0  -> s3
   * combined as ((s0 + s1) ^ s2) + s3.  x is left rotated by 32 bits
   * with the result XORed into its new low half.
   *
   * Clobbers RT0, RT1, RT2, RT3 and flags.
   */
  #define F4(x) \
          movzbl x ## bh,                 RT1d; \
          movzbl x ## bl,                 RT3d; \
          rorq   $16,                     x;    \
          movzbl x ## bh,                 RT0d; \
          movzbl x ## bl,                 RT2d; \
          rorq   $16,                     x;    \
          movl   s0(CTX,RT0,4),           RT0d; \
          addl   s1(CTX,RT2,4),           RT0d; \
          xorl   s2(CTX,RT1,4),           RT0d; \
          addl   s3(CTX,RT3,4),           RT0d; \
          xorq   RT0,                     x;
  /* add_preloaded_roundkey4(): XOR the key pair in RKEY into all four
   * data registers. */
  #define add_preloaded_roundkey4() \
          xorq   RKEY,                    RX0; \
          xorq   RKEY,                    RX1; \
          xorq   RKEY,                    RX2; \
          xorq   RKEY,                    RX3;

  /*
   * preload_roundkey_enc(n): RKEY = the 8 bytes at p + 4*n, i.e. 32-bit
   * subkey n in the low half and subkey n+1 in the high half.
   */
  #define preload_roundkey_enc(n) \
          movq   p+4*(n)(CTX),            RKEY;

  /*
   * add_roundkey_enc4(n): apply the key pair already in RKEY, then
   * preload the pair starting at subkey n+2 for the next round.  The
   * argument is parenthesised so that an expression argument cannot
   * change the computed index.
   */
  #define add_roundkey_enc4(n) \
          add_preloaded_roundkey4(); \
          preload_roundkey_enc((n) + 2);

  /*
   * round_enc4(n): key addition for subkeys n/n+1 followed by two
   * Feistel steps on each of the four blocks.  The four F4() calls of
   * one step are independent of each other.
   */
  #define round_enc4(n) \
          add_roundkey_enc4(n); \
          \
          F4(RX0); \
          F4(RX1); \
          F4(RX2); \
          F4(RX3); \
          \
          F4(RX0); \
          F4(RX1); \
          F4(RX2); \
          F4(RX3);
  /*
   * preload_roundkey_dec(n): RKEY = the 8 bytes at p + 4*(n-1), rotated
   * by 32 bits, so 32-bit subkey n is in the low half and subkey n-1 in
   * the high half.
   */
  #define preload_roundkey_dec(n) \
          movq   p+4*((n)-1)(CTX),        RKEY; \
          rorq   $32,                     RKEY;

  /*
   * add_roundkey_dec4(n): apply the key pair already in RKEY, then
   * preload the pair ending at subkey n-2 for the next round.  The
   * argument is parenthesised so that an expression argument cannot
   * change the computed index.
   */
  #define add_roundkey_dec4(n) \
          add_preloaded_roundkey4(); \
          preload_roundkey_dec((n) - 2);

  /*
   * round_dec4(n): key addition for subkeys n/n-1 followed by two
   * Feistel steps on each of the four blocks.
   */
  #define round_dec4(n) \
          add_roundkey_dec4(n); \
          \
          F4(RX0); \
          F4(RX1); \
          F4(RX2); \
          F4(RX3); \
          \
          F4(RX0); \
          F4(RX1); \
          F4(RX2); \
          F4(RX3);
  /*
   * inbswap_block4(): convert four blocks, loaded from memory with
   * movq, into the layout the rounds work on.  Rotate-by-32 followed by
   * a full byte reversal byte-swaps each 32-bit word and keeps the word
   * order.
   */
  #define inbswap_block4() \
          rorq   $32,                     RX0; \
          bswapq                          RX0; \
          rorq   $32,                     RX1; \
          bswapq                          RX1; \
          rorq   $32,                     RX2; \
          bswapq                          RX2; \
          rorq   $32,                     RX3; \
          bswapq                          RX3;

  /*
   * inctrswap_block4(): input conversion for values that have already
   * been byte-reversed as a whole (the CTR counters); only the exchange
   * of the two 32-bit halves remains to be done.
   */
  #define inctrswap_block4() \
          rorq   $32,                     RX0; \
          rorq   $32,                     RX1; \
          rorq   $32,                     RX2; \
          rorq   $32,                     RX3;

  /* outbswap_block4(): reverse all eight bytes of each of the four
   * blocks before they are used as output. */
  #define outbswap_block4() \
          bswapq                          RX0; \
          bswapq                          RX1; \
          bswapq                          RX2; \
          bswapq                          RX3;
  .align 16
  ELF(.type __blowfish_enc_blk4,@function;)

  __blowfish_enc_blk4:
          /* Internal helper: encrypt four blocks in parallel.
           *
           * input:
           *      %rdi: ctx, CTX
           *      RX0,RX1,RX2,RX3: four input inbswapped plaintext blocks
           * output:
           *      RX0,RX1,RX2,RX3: four output ciphertext blocks
           * clobbers:
           *      RKEY, RT0, RT1, RT2, RT3 and flags.  Nothing is saved
           *      here, so the callee-saved %rbx (RX1) and %rbp (RT0)
           *      have to be preserved by the caller.
           */
          CFI_STARTPROC();

          /* RKEY always holds the key pair for the upcoming round. */
          preload_roundkey_enc(0);

          round_enc4(0);
          round_enc4(2);
          round_enc4(4);
          round_enc4(6);
          round_enc4(8);
          round_enc4(10);
          round_enc4(12);
          round_enc4(14);
          /* Final key addition with the pair preloaded by
           * round_enc4(14). */
          add_preloaded_roundkey4();

          outbswap_block4();

          ret_spec_stop;
          CFI_ENDPROC();
  ELF(.size __blowfish_enc_blk4,.-__blowfish_enc_blk4;)
  .align 16
  ELF(.type __blowfish_dec_blk4,@function;)

  __blowfish_dec_blk4:
          /* Internal helper: decrypt four blocks in parallel.  Unlike
           * __blowfish_enc_blk4 the input byte swap is done here.
           *
           * input:
           *      %rdi: ctx, CTX
           *      RX0,RX1,RX2,RX3: four input ciphertext blocks
           * output:
           *      RX0,RX1,RX2,RX3: four output plaintext blocks
           * clobbers:
           *      RKEY, RT0, RT1, RT2, RT3 and flags.  Nothing is saved
           *      here, so the callee-saved %rbx (RX1) and %rbp (RT0)
           *      have to be preserved by the caller.
           */
          CFI_STARTPROC();

          inbswap_block4();
          /* RKEY always holds the key pair for the upcoming round. */
          preload_roundkey_dec(17);

          round_dec4(17);
          round_dec4(15);
          round_dec4(13);
          round_dec4(11);
          round_dec4(9);
          round_dec4(7);
          round_dec4(5);
          round_dec4(3);
          /* Final key addition with the pair preloaded by
           * round_dec4(3). */
          add_preloaded_roundkey4();

          outbswap_block4();

          ret_spec_stop;
          CFI_ENDPROC();
  ELF(.size __blowfish_dec_blk4,.-__blowfish_dec_blk4;)
  .align 16
  .globl _gcry_blowfish_amd64_ctr_enc
  ELF(.type _gcry_blowfish_amd64_ctr_enc,@function;)

  _gcry_blowfish_amd64_ctr_enc:
          /* CTR mode over four consecutive blocks: encrypt counter,
           * counter+1, counter+2, counter+3, XOR the results with src,
           * store to dst and write counter+4 back to iv.
           *
           * input:
           *      %rdi: ctx, CTX
           *      %rsi: dst (4 blocks)
           *      %rdx: src (4 blocks)
           *      %rcx: iv (big endian, 64bit)
           */
          CFI_STARTPROC();
          ENTER_SYSV_FUNC_PARAMS_0_4

          /* %rbp (RT0) and %rbx (RX1) are modified by the 4-way code;
           * %r12/%r13 hold src/iv across the call. */
          pushq %rbp;
          CFI_PUSH(%rbp);
          pushq %rbx;
          CFI_PUSH(%rbx);
          pushq %r12;
          CFI_PUSH(%r12);
          pushq %r13;
          CFI_PUSH(%r13);

          /* %r11-%r13 are not used by __blowfish_enc_blk4.  The argument
           * registers %rcx/%rdx are about to be reused as RX2/RX3. */
          movq %rsi, %r11;                /* dst */
          movq %rdx, %r12;                /* src */
          movq %rcx, %r13;                /* iv */

          /* Load the counter and convert it to a native integer. */
          movq (%r13), RT0;
          bswapq RT0;
          movq RT0, RX0;

          /* Counter values for blocks 1..3, and the value to store
           * back (counter + 4, converted to big endian again). */
          leaq 1(RT0), RX1;
          leaq 2(RT0), RX2;
          leaq 3(RT0), RX3;
          leaq 4(RT0), RT0;
          bswapq RT0;

          inctrswap_block4();

          /* Store the new counter; RT0 does not survive the call. */
          movq RT0, (%r13);

          call __blowfish_enc_blk4;

          /* XOR the key stream with the input.  All four reads of src
           * are done before the first write to dst. */
          xorq 0 * 8(%r12), RX0;
          xorq 1 * 8(%r12), RX1;
          xorq 2 * 8(%r12), RX2;
          xorq 3 * 8(%r12), RX3;

          movq RX0, 0 * 8(%r11);
          movq RX1, 1 * 8(%r11);
          movq RX2, 2 * 8(%r11);
          movq RX3, 3 * 8(%r11);

          popq %r13;
          CFI_POP(%r13);
          popq %r12;
          CFI_POP(%r12);
          popq %rbx;
          CFI_POP(%rbx);
          popq %rbp;
          CFI_POP(%rbp);

          EXIT_SYSV_FUNC
          ret_spec_stop;
          CFI_ENDPROC();
  ELF(.size _gcry_blowfish_amd64_ctr_enc,.-_gcry_blowfish_amd64_ctr_enc;)
  .align 16
  .globl _gcry_blowfish_amd64_cbc_dec
  ELF(.type _gcry_blowfish_amd64_cbc_dec,@function;)

  _gcry_blowfish_amd64_cbc_dec:
          /* CBC decryption of four consecutive blocks: decrypt src[0..3],
           * XOR them with iv, src[0], src[1], src[2] respectively, store
           * to dst and write src[3] back to iv.
           *
           * input:
           *      %rdi: ctx, CTX
           *      %rsi: dst (4 blocks)
           *      %rdx: src (4 blocks)
           *      %rcx: iv (64bit)
           */
          CFI_STARTPROC();
          ENTER_SYSV_FUNC_PARAMS_0_4

          /* %rbp (RT0) and %rbx (RX1) are modified by the 4-way code;
           * %r12/%r13 hold src/iv across the call. */
          pushq %rbp;
          CFI_PUSH(%rbp);
          pushq %rbx;
          CFI_PUSH(%rbx);
          pushq %r12;
          CFI_PUSH(%r12);
          pushq %r13;
          CFI_PUSH(%r13);

          /* %r11-%r13 are not used by __blowfish_dec_blk4.  The argument
           * registers %rcx/%rdx are about to be reused as RX2/RX3. */
          movq %rcx, %r13;                /* iv */
          movq %rdx, %r12;                /* src */
          movq %rsi, %r11;                /* dst */

          /* Load the four ciphertext blocks. */
          movq 0 * 8(%r12), RX0;
          movq 1 * 8(%r12), RX1;
          movq 2 * 8(%r12), RX2;
          movq 3 * 8(%r12), RX3;

          call __blowfish_dec_blk4;

          /* Fetch the last ciphertext block (the next IV) and apply the
           * chaining XORs.  Every read of src and iv is done before the
           * first store below. */
          movq 3 * 8(%r12), RT0;
          xorq (%r13), RX0;
          xorq 0 * 8(%r12), RX1;
          xorq 1 * 8(%r12), RX2;
          xorq 2 * 8(%r12), RX3;

          movq RT0, (%r13);               /* store new IV */

          movq RX0, 0 * 8(%r11);
          movq RX1, 1 * 8(%r11);
          movq RX2, 2 * 8(%r11);
          movq RX3, 3 * 8(%r11);

          popq %r13;
          CFI_POP(%r13);
          popq %r12;
          CFI_POP(%r12);
          popq %rbx;
          CFI_POP(%rbx);
          popq %rbp;
          CFI_POP(%rbp);

          EXIT_SYSV_FUNC
          ret_spec_stop;
          CFI_ENDPROC();
  ELF(.size _gcry_blowfish_amd64_cbc_dec,.-_gcry_blowfish_amd64_cbc_dec;)
  .align 16
  .globl _gcry_blowfish_amd64_cfb_dec
  ELF(.type _gcry_blowfish_amd64_cfb_dec,@function;)

  _gcry_blowfish_amd64_cfb_dec:
          /* CFB decryption of four consecutive blocks: encrypt iv,
           * src[0], src[1], src[2], XOR the results with src[0..3],
           * store to dst and write src[3] back to iv.
           *
           * input:
           *      %rdi: ctx, CTX
           *      %rsi: dst (4 blocks)
           *      %rdx: src (4 blocks)
           *      %rcx: iv (64bit)
           */
          CFI_STARTPROC();
          ENTER_SYSV_FUNC_PARAMS_0_4

          /* %rbp (RT0) and %rbx (RX1) are modified by the 4-way code;
           * %r12/%r13 hold src/iv across the call. */
          pushq %rbp;
          CFI_PUSH(%rbp);
          pushq %rbx;
          CFI_PUSH(%rbx);
          pushq %r12;
          CFI_PUSH(%r12);
          pushq %r13;
          CFI_PUSH(%r13);

          /* %r11-%r13 are not used by __blowfish_enc_blk4.  The argument
           * registers %rcx/%rdx are about to be reused as RX2/RX3. */
          movq %rsi, %r11;                /* dst */
          movq %rdx, %r12;                /* src */
          movq %rcx, %r13;                /* iv */

          /* Cipher inputs: the IV followed by the first three ciphertext
           * blocks. */
          movq (%r13), RX0;
          movq 0 * 8(%r12), RX1;
          movq 1 * 8(%r12), RX2;
          movq 2 * 8(%r12), RX3;

          inbswap_block4();

          /* Update IV: the last ciphertext block becomes the next IV.
           * This is done before the call because RT0 does not survive
           * it. */
          movq 3 * 8(%r12), RT0;
          movq RT0, (%r13);

          call __blowfish_enc_blk4;

          /* XOR with the ciphertext.  All four reads of src are done
           * before the first write to dst. */
          xorq 0 * 8(%r12), RX0;
          xorq 1 * 8(%r12), RX1;
          xorq 2 * 8(%r12), RX2;
          xorq 3 * 8(%r12), RX3;

          movq RX0, 0 * 8(%r11);
          movq RX1, 1 * 8(%r11);
          movq RX2, 2 * 8(%r11);
          movq RX3, 3 * 8(%r11);

          popq %r13;
          CFI_POP(%r13);
          popq %r12;
          CFI_POP(%r12);
          popq %rbx;
          CFI_POP(%rbx);
          popq %rbp;
          CFI_POP(%rbp);

          EXIT_SYSV_FUNC
          ret_spec_stop;
          CFI_ENDPROC();
  ELF(.size _gcry_blowfish_amd64_cfb_dec,.-_gcry_blowfish_amd64_cfb_dec;)
  499. #endif /*defined(USE_BLOWFISH)*/
  500. #endif /*__x86_64*/