twofish-x86_64-asm_64-3way.S 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321
  1. /*
  2. * Twofish Cipher 3-way parallel algorithm (x86_64)
  3. *
  4. * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
  5. *
  6. * This program is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; either version 2 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with this program; if not, write to the Free Software
  18. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  19. * USA
  20. *
  21. */
  22. #include <linux/linkage.h>
  23. .file "twofish-x86_64-asm-3way.S"
  24. .text
  /*
   * Structure of the crypto context addressed through CTX (byte offsets).
   *
   * s0..s3 are used as displacements for lookups indexed by a zero-extended
   * byte scaled by 4, and they sit 1024 bytes apart.  w is the base of the
   * words XORed into the blocks before and after the rounds; k is the base
   * of the 32-bit words added inside each round.
   */
  #define s0      0
  #define s1      1024
  #define s2      2048
  #define s3      3072
  #define w       4096
  #define k       4128
  /**********************************************************************
    3-way twofish
   **********************************************************************/

  /* crypto context pointer (%rdi on entry to both functions in this file) */
  #define CTX     %rdi
  /* I/O pointer: src while blocks are loaded, dst while they are stored */
  #define RIO     %rdx

  /*
   * Working 64-bit half of each of the three blocks.  %rax/%rbx/%rcx are
   * used because the round code reads both the low byte (%al/%bl/%cl) and
   * the second byte (%ah/%bh/%ch) of the register.
   */
  #define RAB0    %rax
  #define RAB1    %rbx
  #define RAB2    %rcx

  #define RAB0d   %eax
  #define RAB1d   %ebx
  #define RAB2d   %ecx

  #define RAB0bh  %ah
  #define RAB1bh  %bh
  #define RAB2bh  %ch

  #define RAB0bl  %al
  #define RAB1bl  %bl
  #define RAB2bl  %cl

  /*
   * The other half of each block is kept in these stack slots during the
   * rounds.  The offsets are relative to %rsp, so they are only meaningful
   * while push_cd() is the most recent change to the stack pointer.
   */
  #define CD0     0x0(%rsp)
  #define CD1     0x8(%rsp)
  #define CD2     0x10(%rsp)

  /* used only before/after all rounds */
  #define RCD0    %r8
  #define RCD1    %r9
  #define RCD2    %r10

  /* used only during rounds (same registers as RCD0..RCD2) */
  #define RX0     %r8
  #define RX1     %r9
  #define RX2     %r10

  #define RX0d    %r8d
  #define RX1d    %r9d
  #define RX2d    %r10d

  #define RY0     %r11
  #define RY1     %r12
  #define RY2     %r13

  #define RY0d    %r11d
  #define RY1d    %r12d
  #define RY2d    %r13d

  /*
   * Scratch registers.  RT0 is the same register as RIO and RT1 is %rsi,
   * so neither pointer argument survives the rounds; the entry points keep
   * dst on the stack for that reason.
   */
  #define RT0     %rdx
  #define RT1     %rsi

  #define RT0d    %edx
  #define RT1d    %esi

  #define RT1bl   %sil
  /*
   * do16bit_ror - use the low 16 bits of `src` as two table indices.
   *
   *   idx_lo = byte 0 of src, idx_hi = byte 1 of src (both zero-extended)
   *   src is rotated right by `rot` bits
   *   out##d = op_lo(out##d, 32-bit word at tab_lo + 4*idx_lo in the context)
   *   out##d = op_hi(out##d, 32-bit word at tab_hi + 4*idx_hi in the context)
   *
   * op_lo/op_hi are mnemonic stems (the callers pass mov or xor); the "l"
   * suffix is appended here.  `src` must be a name that has ...bl and ...bh
   * aliases (RAB0..RAB2); idx_hi, idx_lo and out must have ...d aliases.
   * idx_lo may be the same register as out, since the index is consumed by
   * the first lookup.
   */
  #define do16bit_ror(rot, op_lo, op_hi, tab_lo, tab_hi, idx_hi, idx_lo, src, out) \
          movzbl src ## bl,                       idx_lo ## d; \
          movzbl src ## bh,                       idx_hi ## d; \
          rorq $(rot),                            src; \
          op_lo ## l tab_lo(CTX, idx_lo, 4),      out ## d; \
          op_hi ## l tab_hi(CTX, idx_hi, 4),      out ## d;
  /*
   * swap_ab_with_cd - exchange the contents of `reg` and `slot` through
   * `scratch`.  In this file `slot` is one of the CD0..CD2 stack slots, so
   * `reg` and `scratch` must be registers.
   */
  #define swap_ab_with_cd(reg, slot, scratch) \
          movq slot,      scratch; \
          movq reg,       slot; \
          movq scratch,   reg;
  /*
   * Combined G1 & G2 function for three blocks.  Reordered with the help of
   * rotates so that the moves come at the beginning.
   *
   *   cur     register prefix; cur##0..cur##2 are the 64-bit inputs
   *   alt     prefix of the slots exchanged with cur##N at the end
   *   X0..X3  table offsets applied to input bytes 0, 1, 2, 3 -> fx##N
   *   Y0..Y3  table offsets; Y1, Y2, Y3, Y0 are applied to input bytes
   *           4, 5, 6, 7 respectively -> fy##N
   *   fx, fy  register prefixes receiving the two 32-bit results
   *
   * Each result is the XOR of its four looked-up words.  The four rotates
   * applied to cur##N add up to 64 bits, so its value is unchanged when it
   * is exchanged with alt##N.  Pass 1 uses RT0 and the result registers
   * themselves as index temporaries; pass 2 uses RT0 and RT1.
   */
  #define g1g2_3(cur, alt, X0, X1, X2, X3, Y0, Y1, Y2, Y3, fx, fy) \
          /* G1,1 && G2,1: start fx from bytes 0-1, fy from bytes 4-5 */ \
          do16bit_ror(32, mov, xor, X0, X1, RT0, fx ## 0, cur ## 0, fx ## 0); \
          do16bit_ror(48, mov, xor, Y1, Y2, RT0, fy ## 0, cur ## 0, fy ## 0); \
          \
          do16bit_ror(32, mov, xor, X0, X1, RT0, fx ## 1, cur ## 1, fx ## 1); \
          do16bit_ror(48, mov, xor, Y1, Y2, RT0, fy ## 1, cur ## 1, fy ## 1); \
          \
          do16bit_ror(32, mov, xor, X0, X1, RT0, fx ## 2, cur ## 2, fx ## 2); \
          do16bit_ror(48, mov, xor, Y1, Y2, RT0, fy ## 2, cur ## 2, fy ## 2); \
          \
          /* G1,2 && G2,2: fold in bytes 2-3 (fx) and 6-7 (fy), then swap */ \
          do16bit_ror(32, xor, xor, X2, X3, RT0, RT1, cur ## 0, fx ## 0); \
          do16bit_ror(16, xor, xor, Y3, Y0, RT0, RT1, cur ## 0, fy ## 0); \
          swap_ab_with_cd(cur ## 0, alt ## 0, RT0); \
          \
          do16bit_ror(32, xor, xor, X2, X3, RT0, RT1, cur ## 1, fx ## 1); \
          do16bit_ror(16, xor, xor, Y3, Y0, RT0, RT1, cur ## 1, fy ## 1); \
          swap_ab_with_cd(cur ## 1, alt ## 1, RT0); \
          \
          do16bit_ror(32, xor, xor, X2, X3, RT0, RT1, cur ## 2, fx ## 2); \
          do16bit_ror(16, xor, xor, Y3, Y0, RT0, RT1, cur ## 2, fy ## 2); \
          swap_ab_with_cd(cur ## 2, alt ## 2, RT0);
  /*
   * enc_round_end - mix the two 32-bit results fx/fy into the 64-bit `half`.
   *
   *   fx += fy;  fy += fx;
   *   fx += (32-bit word at k + 4*(2*rnd));
   *   fy += (32-bit word at k + 4*(2*rnd + 1));
   *   low  32 bits of half = ror32(low32(half) ^ fx, 1)
   *   high 32 bits of half = rol32(high32(half), 1) ^ fy
   *
   * The closing orq relies on the upper 32 bits of fx being zero, which
   * holds because the last writes to it are 32-bit operations.
   */
  #define enc_round_end(half, fx, fy, rnd) \
          addl fy ## d,                   fx ## d; \
          addl fx ## d,                   fy ## d; \
          addl k+4*(2*(rnd))(CTX),        fx ## d; \
          xorl half ## d,                 fx ## d; \
          addl k+4*(2*(rnd)+1)(CTX),      fy ## d; \
          shrq $32,                       half; \
          roll $1,                        half ## d; \
          xorl fy ## d,                   half ## d; \
          shlq $32,                       half; \
          rorl $1,                        fx ## d; \
          orq fx,                         half;
  /*
   * dec_round_end - decryption counterpart of enc_round_end; fx and fy
   * swap roles when they are merged into the 64-bit `half`.
   *
   *   fx += fy;  fy += fx;
   *   fx += (32-bit word at k + 4*(2*rnd));
   *   fy += (32-bit word at k + 4*(2*rnd + 1));
   *   low  32 bits of half = ror32(low32(half) ^ fy, 1)
   *   high 32 bits of half = rol32(high32(half), 1) ^ fx
   *
   * The closing orq relies on the upper 32 bits of fy being zero, which
   * holds because the last writes to it are 32-bit operations.
   */
  #define dec_round_end(half, fx, fy, rnd) \
          addl fy ## d,                   fx ## d; \
          addl fx ## d,                   fy ## d; \
          addl k+4*(2*(rnd))(CTX),        fx ## d; \
          addl k+4*(2*(rnd)+1)(CTX),      fy ## d; \
          xorl half ## d,                 fy ## d; \
          shrq $32,                       half; \
          roll $1,                        half ## d; \
          xorl fx ## d,                   half ## d; \
          shlq $32,                       half; \
          rorl $1,                        fy ## d; \
          orq fy,                         half;
  /*
   * encrypt_round3 - one encryption round on all three blocks.
   *
   * g1g2_3 leaves the lookup results in RX0..RX2 / RY0..RY2 and exchanges
   * cur##N with alt##N, so enc_round_end then updates the half that was in
   * alt##N before the round.  `rnd` selects the pair of k words used.
   */
  #define encrypt_round3(cur, alt, rnd) \
          g1g2_3(cur, alt, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
          \
          enc_round_end(cur ## 0, RX0, RY0, rnd); \
          enc_round_end(cur ## 1, RX1, RY1, rnd); \
          enc_round_end(cur ## 2, RX2, RY2, rnd);
  /*
   * decrypt_round3 - one decryption round on all three blocks.
   *
   * Same shape as encrypt_round3, but g1g2_3 is called with the table
   * offsets in a different order and with RY/RX swapped as result prefixes,
   * and the results are merged by dec_round_end.  `rnd` selects the pair of
   * k words used.
   */
  #define decrypt_round3(cur, alt, rnd) \
          g1g2_3(cur, alt, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
          \
          dec_round_end(cur ## 0, RX0, RY0, rnd); \
          dec_round_end(cur ## 1, RX1, RY1, rnd); \
          dec_round_end(cur ## 2, RX2, RY2, rnd);
  /*
   * encrypt_cycle3 - two consecutive encryption rounds: 2*n, then 2*n + 1.
   *
   * `n` is parenthesized so that an expression argument (for example 1+1)
   * still yields the intended round numbers.
   */
  #define encrypt_cycle3(ab, cd, n) \
          encrypt_round3(ab, cd, (n)*2); \
          encrypt_round3(ab, cd, ((n)*2)+1);
  /*
   * decrypt_cycle3 - two consecutive decryption rounds in reverse order:
   * 2*n + 1, then 2*n.
   *
   * `n` is parenthesized so that an expression argument (for example 1+1)
   * still yields the intended round numbers.
   */
  #define decrypt_cycle3(ba, dc, n) \
          decrypt_round3(ba, dc, ((n)*2)+1); \
          decrypt_round3(ba, dc, ((n)*2));
  /*
   * push_cd - spill RCD0..RCD2 to the stack.  RCD2 is pushed first, so
   * afterwards RCD0 is at 0(%rsp), RCD1 at 8(%rsp) and RCD2 at 16(%rsp),
   * which are the CD0/CD1/CD2 slots.
   */
  #define push_cd() \
          pushq RCD2; \
          pushq RCD1; \
          pushq RCD0;
  /*
   * pop_cd - reload RCD0..RCD2 from the stack, in the reverse of the order
   * used by push_cd.
   */
  #define pop_cd() \
          popq RCD0; \
          popq RCD1; \
          popq RCD2;
  /*
   * inpack3 - load one 64-bit half of each of three blocks and XOR it with
   * a 64-bit value from the context.
   *
   *   in  pointer register; the blocks are read at 16-byte spacing
   *   n   32-bit word index of the half inside a block
   *   xy  register prefix; xy##0..xy##2 receive the three values
   *   m   32-bit word index, relative to w, of the value XORed in
   *
   * Both index arguments are parenthesized so that expression arguments
   * expand with the intended precedence.
   */
  #define inpack3(in, n, xy, m) \
          movq 4*(n)(in),                 xy ## 0; \
          xorq w+4*(m)(CTX),              xy ## 0; \
          \
          movq 4*(4+(n))(in),             xy ## 1; \
          xorq w+4*(m)(CTX),              xy ## 1; \
          \
          movq 4*(8+(n))(in),             xy ## 2; \
          xorq w+4*(m)(CTX),              xy ## 2;
  /*
   * outunpack3 - XOR one 64-bit half of each of three blocks with a 64-bit
   * value from the context and write it out.
   *
   *   op   mnemonic stem for the final write; the "q" suffix is appended
   *        here.  The callers pass mov (store) or xor (XOR into memory).
   *   out  pointer register; the blocks are written at 16-byte spacing
   *   n    32-bit word index of the half inside a block
   *   xy   register prefix; xy##0..xy##2 hold the values (modified here)
   *   m    32-bit word index, relative to w, of the value XORed in
   *
   * Both index arguments are parenthesized so that expression arguments
   * expand with the intended precedence.
   */
  #define outunpack3(op, out, n, xy, m) \
          xorq w+4*(m)(CTX),              xy ## 0; \
          op ## q xy ## 0,                4*(n)(out); \
          \
          xorq w+4*(m)(CTX),              xy ## 1; \
          op ## q xy ## 1,                4*(4+(n))(out); \
          \
          xorq w+4*(m)(CTX),              xy ## 2; \
          op ## q xy ## 2,                4*(8+(n))(out);
  /*
   * inpack_enc3 - load three blocks from RIO for encryption.
   * Words 0-1 of each block go to RAB0..RAB2, XORed with the value at w word 0;
   * words 2-3 go to RCD0..RCD2, XORed with the value at w word 2.
   */
  #define inpack_enc3() \
          inpack3(RIO, 0, RAB, 0); \
          inpack3(RIO, 2, RCD, 2);
  /*
   * outunpack_enc3 - write three encrypted blocks to RIO.
   * RAB0..RAB2 are XORed with the value at w word 6 and written to words 2-3
   * of each block; RCD0..RCD2 are XORed with the value at w word 4 and
   * written to words 0-1.  `store` is the mnemonic stem handed to
   * outunpack3 (mov or xor).
   */
  #define outunpack_enc3(store) \
          outunpack3(store, RIO, 2, RAB, 6); \
          outunpack3(store, RIO, 0, RCD, 4);
  /*
   * inpack_dec3 - load three blocks from RIO for decryption.
   * Words 0-1 of each block go to RAB0..RAB2 (XORed with the value at w word 4)
   * and words 2-3 go to RCD0..RCD2 (XORed with the value at w word 6).  Each
   * register is then rotated by 32 bits, which swaps its two 32-bit halves.
   */
  #define inpack_dec3() \
          inpack3(RIO, 0, RAB, 4); \
          rorq $32,       RAB0; \
          rorq $32,       RAB1; \
          rorq $32,       RAB2; \
          inpack3(RIO, 2, RCD, 6); \
          rorq $32,       RCD0; \
          rorq $32,       RCD1; \
          rorq $32,       RCD2;
  /*
   * outunpack_dec3 - write three decrypted blocks to RIO.
   * Each register first has its two 32-bit halves swapped back (rotate by
   * 32).  RCD0..RCD2 are then XORed with the value at w word 0 and stored to
   * words 0-1 of each block; RAB0..RAB2 are XORed with the value at w word 2
   * and stored to words 2-3.  The output is always stored with mov.
   */
  #define outunpack_dec3() \
          rorq $32,       RCD0; \
          rorq $32,       RCD1; \
          rorq $32,       RCD2; \
          outunpack3(mov, RIO, 0, RCD, 0); \
          rorq $32,       RAB0; \
          rorq $32,       RAB1; \
          rorq $32,       RAB2; \
          outunpack3(mov, RIO, 2, RAB, 2);
  /*
   * __twofish_enc_blk_3way - encrypt three 16-byte blocks in parallel.
   *
   * input:
   *      %rdi: ctx, CTX
   *      %rsi: dst
   *      %rdx: src, RIO
   *      %rcx: bool, if true: xor output
   *
   * Reads 48 bytes from src.  If the bool is zero the result is stored to
   * dst; otherwise it is XORed into the 48 bytes already at dst.  Only the
   * low byte of the bool is tested.
   */
  ENTRY(__twofish_enc_blk_3way)
          /* %r13, %r12 and %rbx serve as RY2, RY1 and RAB1 below */
          pushq %r13;
          pushq %r12;
          pushq %rbx;

          /* %rcx is RAB2 and %rsi is RT1, so both arguments are parked */
          pushq %rcx; /* bool xor */
          pushq %rsi; /* dst */

          inpack_enc3();

          /* the CD halves live in CD0..CD2 on the stack for all 16 rounds */
          push_cd();
          encrypt_cycle3(RAB, CD, 0);
          encrypt_cycle3(RAB, CD, 1);
          encrypt_cycle3(RAB, CD, 2);
          encrypt_cycle3(RAB, CD, 3);
          encrypt_cycle3(RAB, CD, 4);
          encrypt_cycle3(RAB, CD, 5);
          encrypt_cycle3(RAB, CD, 6);
          encrypt_cycle3(RAB, CD, 7);
          pop_cd();

          popq RIO; /* dst */
          popq RT1; /* bool xor */

          testb RT1bl, RT1bl;
          jnz .L__enc_xor3;

          /* bool == 0: plain store */
          outunpack_enc3(mov);

          popq %rbx;
          popq %r12;
          popq %r13;
          ret;

  .L__enc_xor3:
          /* bool != 0: XOR the result into dst */
          outunpack_enc3(xor);

          popq %rbx;
          popq %r12;
          popq %r13;
          ret;
  ENDPROC(__twofish_enc_blk_3way)
  /*
   * twofish_dec_blk_3way - decrypt three 16-byte blocks in parallel.
   *
   * input:
   *      %rdi: ctx, CTX
   *      %rsi: dst
   *      %rdx: src, RIO
   *
   * Reads 48 bytes from src and stores 48 bytes to dst.  The cycles run
   * from 7 down to 0.
   */
  ENTRY(twofish_dec_blk_3way)
          /* %r13, %r12 and %rbx serve as RY2, RY1 and RAB1 below */
          pushq %r13;
          pushq %r12;
          pushq %rbx;

          /* %rsi is RT1 during the rounds, so dst is parked on the stack */
          pushq %rsi; /* dst */

          inpack_dec3();

          /* the CD halves live in CD0..CD2 on the stack for all 16 rounds */
          push_cd();
          decrypt_cycle3(RAB, CD, 7);
          decrypt_cycle3(RAB, CD, 6);
          decrypt_cycle3(RAB, CD, 5);
          decrypt_cycle3(RAB, CD, 4);
          decrypt_cycle3(RAB, CD, 3);
          decrypt_cycle3(RAB, CD, 2);
          decrypt_cycle3(RAB, CD, 1);
          decrypt_cycle3(RAB, CD, 0);
          pop_cd();

          popq RIO; /* dst */

          outunpack_dec3();

          popq %rbx;
          popq %r12;
          popq %r13;
          ret;
  ENDPROC(twofish_dec_blk_3way)