chacha20-ssse3-x86_64.S

/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

.section .rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003

.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302

.section .rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC: .octa 0x00000003000000020000000100000000
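
# ROT8 and ROT16 are pshufb masks that rotate each 32-bit lane left by
# 8 and 16 bits; CTRINC holds the increments 0..3 added to the block
# counter lanes in the four-block routine.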
.text

ENTRY(chacha20_block_xor_ssse3)
# %rdi: Input state matrix, s
# %rsi: 1 data block output, o
# %rdx: 1 data block input, i

# This function encrypts one ChaCha20 block by loading the state matrix
# into four SSE registers. It performs matrix operations on four words
# in parallel, but requires shuffling to rearrange the words after each
# round. 8/16-bit word rotation is done with the slightly better
# performing SSSE3 byte shuffling, 7/12-bit word rotation uses
# traditional shift+OR.
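
# For reference, each add/xor/rotate group below is one step of the
# ChaCha20 quarter round (C-like sketch, not assembled):
#   a += b; d ^= a; d = rotl32(d, 16);
#   c += d; b ^= c; b = rotl32(b, 12);
#   a += b; d ^= a; d = rotl32(d, 8);
#   c += d; b ^= c; b = rotl32(b, 7);
# Here a, b, c, d are the rows x0..x3, processed four columns at a time.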
# x0..3 = s0..3
movdqa 0x00(%rdi),%xmm0
movdqa 0x10(%rdi),%xmm1
movdqa 0x20(%rdi),%xmm2
movdqa 0x30(%rdi),%xmm3
movdqa %xmm0,%xmm8
movdqa %xmm1,%xmm9
movdqa %xmm2,%xmm10
movdqa %xmm3,%xmm11

movdqa ROT8(%rip),%xmm4
movdqa ROT16(%rip),%xmm5

mov $10,%ecx
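# ten double rounds (one column round plus one diagonal round per
# iteration) give the full 20 ChaCha20 rounds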
.Ldoubleround:

# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm5,%xmm3

# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm6
pslld $12,%xmm6
psrld $20,%xmm1
por %xmm6,%xmm1

# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm4,%xmm3

# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm7
pslld $7,%xmm7
psrld $25,%xmm1
por %xmm7,%xmm1
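
# rotate rows x1..x3 left by 1, 2 and 3 positions so the next quarter
# round operates on the diagonals of the state matrix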
# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
pshufd $0x39,%xmm1,%xmm1
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
pshufd $0x4e,%xmm2,%xmm2
# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
pshufd $0x93,%xmm3,%xmm3

# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm5,%xmm3

# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm6
pslld $12,%xmm6
psrld $20,%xmm1
por %xmm6,%xmm1

# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm4,%xmm3

# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm7
pslld $7,%xmm7
psrld $25,%xmm1
por %xmm7,%xmm1
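
# reverse the row rotations so the state is back in column order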
# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
pshufd $0x93,%xmm1,%xmm1
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
pshufd $0x4e,%xmm2,%xmm2
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
pshufd $0x39,%xmm3,%xmm3

dec %ecx
jnz .Ldoubleround
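
# feed forward: add the saved input state to the working state, then
# XOR with the input block to produce the output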
# o0 = i0 ^ (x0 + s0)
movdqu 0x00(%rdx),%xmm4
paddd %xmm8,%xmm0
pxor %xmm4,%xmm0
movdqu %xmm0,0x00(%rsi)

# o1 = i1 ^ (x1 + s1)
movdqu 0x10(%rdx),%xmm5
paddd %xmm9,%xmm1
pxor %xmm5,%xmm1
movdqu %xmm1,0x10(%rsi)

# o2 = i2 ^ (x2 + s2)
movdqu 0x20(%rdx),%xmm6
paddd %xmm10,%xmm2
pxor %xmm6,%xmm2
movdqu %xmm2,0x20(%rsi)

# o3 = i3 ^ (x3 + s3)
movdqu 0x30(%rdx),%xmm7
paddd %xmm11,%xmm3
pxor %xmm7,%xmm3
movdqu %xmm3,0x30(%rsi)

ret
ENDPROC(chacha20_block_xor_ssse3)

ENTRY(chacha20_4block_xor_ssse3)
# %rdi: Input state matrix, s
# %rsi: 4 data blocks output, o
# %rdx: 4 data blocks input, i

# This function encrypts four consecutive ChaCha20 blocks by loading the
# state matrix into SSE registers four times. As we need some scratch
# registers, we save the first four registers on the stack. The
# algorithm performs each operation on the corresponding word of each
# state matrix, hence requires no word shuffling. For the final XORing
# step we transpose the matrix by interleaving 32-bit and then 64-bit
# words, which allows us to do XOR in SSE registers. 8/16-bit word
# rotation is done with the slightly better performing SSSE3 byte
# shuffling, 7/12-bit word rotation uses traditional shift+OR.

lea 8(%rsp),%r10
sub $0x80,%rsp
and $~63,%rsp
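
# %r10 remembers the incoming %rsp (offset by 8) so the final
# lea -8(%r10),%rsp can restore it; the frame is 0x80 bytes of scratch,
# aligned down to a 64-byte boundary, used to spill rows x0..x3 so that
# xmm0..xmm3 stay free for temporaries and the ROT8/ROT16/CTRINC
# constants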
# x0..15[0-3] = s0..3[0..3]
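# each movq loads two consecutive 32-bit state words; pshufd $0x00 and
# pshufd $0x55 broadcast the low and high word across all four lanes,
# one lane per block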
movq 0x00(%rdi),%xmm1
pshufd $0x00,%xmm1,%xmm0
pshufd $0x55,%xmm1,%xmm1
movq 0x08(%rdi),%xmm3
pshufd $0x00,%xmm3,%xmm2
pshufd $0x55,%xmm3,%xmm3
movq 0x10(%rdi),%xmm5
pshufd $0x00,%xmm5,%xmm4
pshufd $0x55,%xmm5,%xmm5
movq 0x18(%rdi),%xmm7
pshufd $0x00,%xmm7,%xmm6
pshufd $0x55,%xmm7,%xmm7
movq 0x20(%rdi),%xmm9
pshufd $0x00,%xmm9,%xmm8
pshufd $0x55,%xmm9,%xmm9
movq 0x28(%rdi),%xmm11
pshufd $0x00,%xmm11,%xmm10
pshufd $0x55,%xmm11,%xmm11
movq 0x30(%rdi),%xmm13
pshufd $0x00,%xmm13,%xmm12
pshufd $0x55,%xmm13,%xmm13
movq 0x38(%rdi),%xmm15
pshufd $0x00,%xmm15,%xmm14
pshufd $0x55,%xmm15,%xmm15

# x0..3 on stack
movdqa %xmm0,0x00(%rsp)
movdqa %xmm1,0x10(%rsp)
movdqa %xmm2,0x20(%rsp)
movdqa %xmm3,0x30(%rsp)

movdqa CTRINC(%rip),%xmm1
movdqa ROT8(%rip),%xmm2
movdqa ROT16(%rip),%xmm3

# x12 += counter values 0-3
paddd %xmm1,%xmm12

mov $10,%ecx
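# as in the single-block function, ten double rounds = 20 rounds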
.Ldoubleround4:

# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
movdqa 0x00(%rsp),%xmm0
paddd %xmm4,%xmm0
movdqa %xmm0,0x00(%rsp)
pxor %xmm0,%xmm12
pshufb %xmm3,%xmm12

# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
movdqa 0x10(%rsp),%xmm0
paddd %xmm5,%xmm0
movdqa %xmm0,0x10(%rsp)
pxor %xmm0,%xmm13
pshufb %xmm3,%xmm13

# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
movdqa 0x20(%rsp),%xmm0
paddd %xmm6,%xmm0
movdqa %xmm0,0x20(%rsp)
pxor %xmm0,%xmm14
pshufb %xmm3,%xmm14

# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
movdqa 0x30(%rsp),%xmm0
paddd %xmm7,%xmm0
movdqa %xmm0,0x30(%rsp)
pxor %xmm0,%xmm15
pshufb %xmm3,%xmm15

# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
paddd %xmm12,%xmm8
pxor %xmm8,%xmm4
movdqa %xmm4,%xmm0
pslld $12,%xmm0
psrld $20,%xmm4
por %xmm0,%xmm4

# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
paddd %xmm13,%xmm9
pxor %xmm9,%xmm5
movdqa %xmm5,%xmm0
pslld $12,%xmm0
psrld $20,%xmm5
por %xmm0,%xmm5

# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
paddd %xmm14,%xmm10
pxor %xmm10,%xmm6
movdqa %xmm6,%xmm0
pslld $12,%xmm0
psrld $20,%xmm6
por %xmm0,%xmm6

# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
paddd %xmm15,%xmm11
pxor %xmm11,%xmm7
movdqa %xmm7,%xmm0
pslld $12,%xmm0
psrld $20,%xmm7
por %xmm0,%xmm7

# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
movdqa 0x00(%rsp),%xmm0
paddd %xmm4,%xmm0
movdqa %xmm0,0x00(%rsp)
pxor %xmm0,%xmm12
pshufb %xmm2,%xmm12

# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
movdqa 0x10(%rsp),%xmm0
paddd %xmm5,%xmm0
movdqa %xmm0,0x10(%rsp)
pxor %xmm0,%xmm13
pshufb %xmm2,%xmm13

# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
movdqa 0x20(%rsp),%xmm0
paddd %xmm6,%xmm0
movdqa %xmm0,0x20(%rsp)
pxor %xmm0,%xmm14
pshufb %xmm2,%xmm14

# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
movdqa 0x30(%rsp),%xmm0
paddd %xmm7,%xmm0
movdqa %xmm0,0x30(%rsp)
pxor %xmm0,%xmm15
pshufb %xmm2,%xmm15

# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
paddd %xmm12,%xmm8
pxor %xmm8,%xmm4
movdqa %xmm4,%xmm0
pslld $7,%xmm0
psrld $25,%xmm4
por %xmm0,%xmm4

# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
paddd %xmm13,%xmm9
pxor %xmm9,%xmm5
movdqa %xmm5,%xmm0
pslld $7,%xmm0
psrld $25,%xmm5
por %xmm0,%xmm5

# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
paddd %xmm14,%xmm10
pxor %xmm10,%xmm6
movdqa %xmm6,%xmm0
pslld $7,%xmm0
psrld $25,%xmm6
por %xmm0,%xmm6

# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
paddd %xmm15,%xmm11
pxor %xmm11,%xmm7
movdqa %xmm7,%xmm0
pslld $7,%xmm0
psrld $25,%xmm7
por %xmm0,%xmm7
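
# diagonal round: the same quarter-round pattern applied to the
# diagonals; with one register per state word this needs no shuffling,
# only different register pairings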
# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
movdqa 0x00(%rsp),%xmm0
paddd %xmm5,%xmm0
movdqa %xmm0,0x00(%rsp)
pxor %xmm0,%xmm15
pshufb %xmm3,%xmm15

# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
movdqa 0x10(%rsp),%xmm0
paddd %xmm6,%xmm0
movdqa %xmm0,0x10(%rsp)
pxor %xmm0,%xmm12
pshufb %xmm3,%xmm12

# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
movdqa 0x20(%rsp),%xmm0
paddd %xmm7,%xmm0
movdqa %xmm0,0x20(%rsp)
pxor %xmm0,%xmm13
pshufb %xmm3,%xmm13

# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
movdqa 0x30(%rsp),%xmm0
paddd %xmm4,%xmm0
movdqa %xmm0,0x30(%rsp)
pxor %xmm0,%xmm14
pshufb %xmm3,%xmm14

# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
paddd %xmm15,%xmm10
pxor %xmm10,%xmm5
movdqa %xmm5,%xmm0
pslld $12,%xmm0
psrld $20,%xmm5
por %xmm0,%xmm5

# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
paddd %xmm12,%xmm11
pxor %xmm11,%xmm6
movdqa %xmm6,%xmm0
pslld $12,%xmm0
psrld $20,%xmm6
por %xmm0,%xmm6

# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
paddd %xmm13,%xmm8
pxor %xmm8,%xmm7
movdqa %xmm7,%xmm0
pslld $12,%xmm0
psrld $20,%xmm7
por %xmm0,%xmm7

# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
paddd %xmm14,%xmm9
pxor %xmm9,%xmm4
movdqa %xmm4,%xmm0
pslld $12,%xmm0
psrld $20,%xmm4
por %xmm0,%xmm4

# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
movdqa 0x00(%rsp),%xmm0
paddd %xmm5,%xmm0
movdqa %xmm0,0x00(%rsp)
pxor %xmm0,%xmm15
pshufb %xmm2,%xmm15

# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
movdqa 0x10(%rsp),%xmm0
paddd %xmm6,%xmm0
movdqa %xmm0,0x10(%rsp)
pxor %xmm0,%xmm12
pshufb %xmm2,%xmm12

# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
movdqa 0x20(%rsp),%xmm0
paddd %xmm7,%xmm0
movdqa %xmm0,0x20(%rsp)
pxor %xmm0,%xmm13
pshufb %xmm2,%xmm13

# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
movdqa 0x30(%rsp),%xmm0
paddd %xmm4,%xmm0
movdqa %xmm0,0x30(%rsp)
pxor %xmm0,%xmm14
pshufb %xmm2,%xmm14

# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
paddd %xmm15,%xmm10
pxor %xmm10,%xmm5
movdqa %xmm5,%xmm0
pslld $7,%xmm0
psrld $25,%xmm5
por %xmm0,%xmm5

# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
paddd %xmm12,%xmm11
pxor %xmm11,%xmm6
movdqa %xmm6,%xmm0
pslld $7,%xmm0
psrld $25,%xmm6
por %xmm0,%xmm6

# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
paddd %xmm13,%xmm8
pxor %xmm8,%xmm7
movdqa %xmm7,%xmm0
pslld $7,%xmm0
psrld $25,%xmm7
por %xmm0,%xmm7

# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
paddd %xmm14,%xmm9
pxor %xmm9,%xmm4
movdqa %xmm4,%xmm0
pslld $7,%xmm0
psrld $25,%xmm4
por %xmm0,%xmm4

dec %ecx
jnz .Ldoubleround4
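
# feed forward: the input state is re-broadcast from %rdi (no free
# registers were available to keep a copy) and added to x0..x15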
# x0[0-3] += s0[0]
# x1[0-3] += s0[1]
movq 0x00(%rdi),%xmm3
pshufd $0x00,%xmm3,%xmm2
pshufd $0x55,%xmm3,%xmm3
paddd 0x00(%rsp),%xmm2
movdqa %xmm2,0x00(%rsp)
paddd 0x10(%rsp),%xmm3
movdqa %xmm3,0x10(%rsp)

# x2[0-3] += s0[2]
# x3[0-3] += s0[3]
movq 0x08(%rdi),%xmm3
pshufd $0x00,%xmm3,%xmm2
pshufd $0x55,%xmm3,%xmm3
paddd 0x20(%rsp),%xmm2
movdqa %xmm2,0x20(%rsp)
paddd 0x30(%rsp),%xmm3
movdqa %xmm3,0x30(%rsp)

# x4[0-3] += s1[0]
# x5[0-3] += s1[1]
movq 0x10(%rdi),%xmm3
pshufd $0x00,%xmm3,%xmm2
pshufd $0x55,%xmm3,%xmm3
paddd %xmm2,%xmm4
paddd %xmm3,%xmm5

# x6[0-3] += s1[2]
# x7[0-3] += s1[3]
movq 0x18(%rdi),%xmm3
pshufd $0x00,%xmm3,%xmm2
pshufd $0x55,%xmm3,%xmm3
paddd %xmm2,%xmm6
paddd %xmm3,%xmm7

# x8[0-3] += s2[0]
# x9[0-3] += s2[1]
movq 0x20(%rdi),%xmm3
pshufd $0x00,%xmm3,%xmm2
pshufd $0x55,%xmm3,%xmm3
paddd %xmm2,%xmm8
paddd %xmm3,%xmm9

# x10[0-3] += s2[2]
# x11[0-3] += s2[3]
movq 0x28(%rdi),%xmm3
pshufd $0x00,%xmm3,%xmm2
pshufd $0x55,%xmm3,%xmm3
paddd %xmm2,%xmm10
paddd %xmm3,%xmm11

# x12[0-3] += s3[0]
# x13[0-3] += s3[1]
movq 0x30(%rdi),%xmm3
pshufd $0x00,%xmm3,%xmm2
pshufd $0x55,%xmm3,%xmm3
paddd %xmm2,%xmm12
paddd %xmm3,%xmm13

# x14[0-3] += s3[2]
# x15[0-3] += s3[3]
movq 0x38(%rdi),%xmm3
pshufd $0x00,%xmm3,%xmm2
pshufd $0x55,%xmm3,%xmm3
paddd %xmm2,%xmm14
paddd %xmm3,%xmm15

# x12 += counter values 0-3
paddd %xmm1,%xmm12
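
# transpose the 16 four-lane vectors back into four consecutive
# 64-byte blocks: interleave 32-bit words, then 64-bit words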
# interleave 32-bit words in state n, n+1
movdqa 0x00(%rsp),%xmm0
movdqa 0x10(%rsp),%xmm1
movdqa %xmm0,%xmm2
punpckldq %xmm1,%xmm2
punpckhdq %xmm1,%xmm0
movdqa %xmm2,0x00(%rsp)
movdqa %xmm0,0x10(%rsp)
movdqa 0x20(%rsp),%xmm0
movdqa 0x30(%rsp),%xmm1
movdqa %xmm0,%xmm2
punpckldq %xmm1,%xmm2
punpckhdq %xmm1,%xmm0
movdqa %xmm2,0x20(%rsp)
movdqa %xmm0,0x30(%rsp)
movdqa %xmm4,%xmm0
punpckldq %xmm5,%xmm4
punpckhdq %xmm5,%xmm0
movdqa %xmm0,%xmm5
movdqa %xmm6,%xmm0
punpckldq %xmm7,%xmm6
punpckhdq %xmm7,%xmm0
movdqa %xmm0,%xmm7
movdqa %xmm8,%xmm0
punpckldq %xmm9,%xmm8
punpckhdq %xmm9,%xmm0
movdqa %xmm0,%xmm9
movdqa %xmm10,%xmm0
punpckldq %xmm11,%xmm10
punpckhdq %xmm11,%xmm0
movdqa %xmm0,%xmm11
movdqa %xmm12,%xmm0
punpckldq %xmm13,%xmm12
punpckhdq %xmm13,%xmm0
movdqa %xmm0,%xmm13
movdqa %xmm14,%xmm0
punpckldq %xmm15,%xmm14
punpckhdq %xmm15,%xmm0
movdqa %xmm0,%xmm15

# interleave 64-bit words in state n, n+2
movdqa 0x00(%rsp),%xmm0
movdqa 0x20(%rsp),%xmm1
movdqa %xmm0,%xmm2
punpcklqdq %xmm1,%xmm2
punpckhqdq %xmm1,%xmm0
movdqa %xmm2,0x00(%rsp)
movdqa %xmm0,0x20(%rsp)
movdqa 0x10(%rsp),%xmm0
movdqa 0x30(%rsp),%xmm1
movdqa %xmm0,%xmm2
punpcklqdq %xmm1,%xmm2
punpckhqdq %xmm1,%xmm0
movdqa %xmm2,0x10(%rsp)
movdqa %xmm0,0x30(%rsp)
movdqa %xmm4,%xmm0
punpcklqdq %xmm6,%xmm4
punpckhqdq %xmm6,%xmm0
movdqa %xmm0,%xmm6
movdqa %xmm5,%xmm0
punpcklqdq %xmm7,%xmm5
punpckhqdq %xmm7,%xmm0
movdqa %xmm0,%xmm7
movdqa %xmm8,%xmm0
punpcklqdq %xmm10,%xmm8
punpckhqdq %xmm10,%xmm0
movdqa %xmm0,%xmm10
movdqa %xmm9,%xmm0
punpcklqdq %xmm11,%xmm9
punpckhqdq %xmm11,%xmm0
movdqa %xmm0,%xmm11
movdqa %xmm12,%xmm0
punpcklqdq %xmm14,%xmm12
punpckhqdq %xmm14,%xmm0
movdqa %xmm0,%xmm14
movdqa %xmm13,%xmm0
punpcklqdq %xmm15,%xmm13
punpckhqdq %xmm15,%xmm0
movdqa %xmm0,%xmm15
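
# each register (or stack slot) now holds one 16-byte row of one
# output block; the interleave passes leave the rows in block order
# 0, 2, 1, 3, hence the scattered offsets below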
# xor with corresponding input, write to output
movdqa 0x00(%rsp),%xmm0
movdqu 0x00(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x00(%rsi)
movdqa 0x10(%rsp),%xmm0
movdqu 0x80(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x80(%rsi)
movdqa 0x20(%rsp),%xmm0
movdqu 0x40(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x40(%rsi)
movdqa 0x30(%rsp),%xmm0
movdqu 0xc0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xc0(%rsi)
movdqu 0x10(%rdx),%xmm1
pxor %xmm1,%xmm4
movdqu %xmm4,0x10(%rsi)
movdqu 0x90(%rdx),%xmm1
pxor %xmm1,%xmm5
movdqu %xmm5,0x90(%rsi)
movdqu 0x50(%rdx),%xmm1
pxor %xmm1,%xmm6
movdqu %xmm6,0x50(%rsi)
movdqu 0xd0(%rdx),%xmm1
pxor %xmm1,%xmm7
movdqu %xmm7,0xd0(%rsi)
movdqu 0x20(%rdx),%xmm1
pxor %xmm1,%xmm8
movdqu %xmm8,0x20(%rsi)
movdqu 0xa0(%rdx),%xmm1
pxor %xmm1,%xmm9
movdqu %xmm9,0xa0(%rsi)
movdqu 0x60(%rdx),%xmm1
pxor %xmm1,%xmm10
movdqu %xmm10,0x60(%rsi)
movdqu 0xe0(%rdx),%xmm1
pxor %xmm1,%xmm11
movdqu %xmm11,0xe0(%rsi)
movdqu 0x30(%rdx),%xmm1
pxor %xmm1,%xmm12
movdqu %xmm12,0x30(%rsi)
movdqu 0xb0(%rdx),%xmm1
pxor %xmm1,%xmm13
movdqu %xmm13,0xb0(%rsi)
movdqu 0x70(%rdx),%xmm1
pxor %xmm1,%xmm14
movdqu %xmm14,0x70(%rsi)
movdqu 0xf0(%rdx),%xmm1
pxor %xmm1,%xmm15
movdqu %xmm15,0xf0(%rsi)

lea -8(%r10),%rsp
ret
ENDPROC(chacha20_4block_xor_ssse3)
  583. ENDPROC(chacha20_4block_xor_ssse3)