/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

        .text

        rounds  .req    x11
        bskey   .req    x12
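
        /*
         * The bit-sliced state is held in eight vector registers: vector n
         * carries bit n of every byte of the (up to) eight AES blocks being
         * processed. in_bs_ch/out_bs_ch and their inverses are the input and
         * output basis changes that bracket the shared GF(2^8) inversion of
         * the S-box (see the paper referenced above).
         */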
        .macro  in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
        eor     \b2, \b2, \b1
        eor     \b5, \b5, \b6
        eor     \b3, \b3, \b0
        eor     \b6, \b6, \b2
        eor     \b5, \b5, \b0
        eor     \b6, \b6, \b3
        eor     \b3, \b3, \b7
        eor     \b7, \b7, \b5
        eor     \b3, \b3, \b4
        eor     \b4, \b4, \b5
        eor     \b2, \b2, \b7
        eor     \b3, \b3, \b1
        eor     \b1, \b1, \b5
        .endm

        .macro  out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
        eor     \b0, \b0, \b6
        eor     \b1, \b1, \b4
        eor     \b4, \b4, \b6
        eor     \b2, \b2, \b0
        eor     \b6, \b6, \b1
        eor     \b1, \b1, \b5
        eor     \b5, \b5, \b3
        eor     \b3, \b3, \b7
        eor     \b7, \b7, \b5
        eor     \b2, \b2, \b5
        eor     \b4, \b4, \b7
        .endm

        .macro  inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
        eor     \b1, \b1, \b7
        eor     \b4, \b4, \b7
        eor     \b7, \b7, \b5
        eor     \b1, \b1, \b3
        eor     \b2, \b2, \b5
        eor     \b3, \b3, \b7
        eor     \b6, \b6, \b1
        eor     \b2, \b2, \b0
        eor     \b5, \b5, \b3
        eor     \b4, \b4, \b6
        eor     \b0, \b0, \b6
        eor     \b1, \b1, \b4
        .endm

        .macro  inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
        eor     \b1, \b1, \b5
        eor     \b2, \b2, \b7
        eor     \b3, \b3, \b1
        eor     \b4, \b4, \b5
        eor     \b7, \b7, \b5
        eor     \b3, \b3, \b4
        eor     \b5, \b5, \b0
        eor     \b3, \b3, \b7
        eor     \b6, \b6, \b2
        eor     \b2, \b2, \b1
        eor     \b6, \b6, \b3
        eor     \b3, \b3, \b0
        eor     \b5, \b5, \b6
        .endm
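
        /*
         * Subfield multiplications used by the GF(2^8) inversion below:
         * mul_gf4 multiplies two GF(2^2) elements, mul_gf4_n_gf4 performs
         * two such multiplications in parallel, and mul_gf16_2 combines them
         * into two GF(2^4) multiplications sharing the same second operand.
         */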
        .macro  mul_gf4, x0, x1, y0, y1, t0, t1
        eor     \t0, \y0, \y1
        and     \t0, \t0, \x0
        eor     \x0, \x0, \x1
        and     \t1, \x1, \y0
        and     \x0, \x0, \y1
        eor     \x1, \t1, \t0
        eor     \x0, \x0, \t1
        .endm

        .macro  mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
        eor     \t0, \y0, \y1
        eor     \t1, \y2, \y3
        and     \t0, \t0, \x0
        and     \t1, \t1, \x2
        eor     \x0, \x0, \x1
        eor     \x2, \x2, \x3
        and     \x1, \x1, \y0
        and     \x3, \x3, \y2
        and     \x0, \x0, \y1
        and     \x2, \x2, \y3
        eor     \x1, \x1, \x0
        eor     \x2, \x2, \x3
        eor     \x0, \x0, \t0
        eor     \x3, \x3, \t1
        .endm

        .macro  mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
                            y0, y1, y2, y3, t0, t1, t2, t3
        eor     \t0, \x0, \x2
        eor     \t1, \x1, \x3
        mul_gf4 \x0, \x1, \y0, \y1, \t2, \t3
        eor     \y0, \y0, \y2
        eor     \y1, \y1, \y3
        mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
        eor     \x0, \x0, \t0
        eor     \x2, \x2, \t0
        eor     \x1, \x1, \t1
        eor     \x3, \x3, \t1
        eor     \t0, \x4, \x6
        eor     \t1, \x5, \x7
        mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
        eor     \y0, \y0, \y2
        eor     \y1, \y1, \y3
        mul_gf4 \x4, \x5, \y0, \y1, \t2, \t3
        eor     \x4, \x4, \t0
        eor     \x6, \x6, \t0
        eor     \x5, \x5, \t1
        eor     \x7, \x7, \t1
        .endm
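
        /*
         * inv_gf256 computes the multiplicative inverse of each bit-sliced
         * byte in GF(2^8), decomposed into GF(2^4)/GF(2^2) arithmetic as
         * described in the paper referenced at the top of this file.
         */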
        .macro  inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
                           t0, t1, t2, t3, s0, s1, s2, s3
        eor     \t3, \x4, \x6
        eor     \t0, \x5, \x7
        eor     \t1, \x1, \x3
        eor     \s1, \x7, \x6
        eor     \s0, \x0, \x2
        eor     \s3, \t3, \t0
        orr     \t2, \t0, \t1
        and     \s2, \t3, \s0
        orr     \t3, \t3, \s0
        eor     \s0, \s0, \t1
        and     \t0, \t0, \t1
        eor     \t1, \x3, \x2
        and     \s3, \s3, \s0
        and     \s1, \s1, \t1
        eor     \t1, \x4, \x5
        eor     \s0, \x1, \x0
        eor     \t3, \t3, \s1
        eor     \t2, \t2, \s1
        and     \s1, \t1, \s0
        orr     \t1, \t1, \s0
        eor     \t3, \t3, \s3
        eor     \t0, \t0, \s1
        eor     \t2, \t2, \s2
        eor     \t1, \t1, \s3
        eor     \t0, \t0, \s2
        and     \s0, \x7, \x3
        eor     \t1, \t1, \s2
        and     \s1, \x6, \x2
        and     \s2, \x5, \x1
        orr     \s3, \x4, \x0
        eor     \t3, \t3, \s0
        eor     \t1, \t1, \s2
        eor     \s0, \t0, \s3
        eor     \t2, \t2, \s1
        and     \s2, \t3, \t1
        eor     \s1, \t2, \s2
        eor     \s3, \s0, \s2
        bsl     \s1, \t1, \s0
        not     \t0, \s0
        bsl     \s0, \s1, \s3
        bsl     \t0, \s1, \s3
        bsl     \s3, \t3, \t2
        eor     \t3, \t3, \t2
        and     \s2, \s0, \s3
        eor     \t1, \t1, \t0
        eor     \s2, \s2, \t3
        mul_gf16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
                   \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
        .endm
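
        /*
         * The forward and inverse S-boxes: an input basis change, the shared
         * GF(2^8) inversion and an output basis change, with the register
         * arguments permuted between the three steps.
         */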
        .macro  sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
                      t0, t1, t2, t3, s0, s1, s2, s3
        in_bs_ch        \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
                        \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
        inv_gf256       \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
                        \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
                        \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
                        \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
        out_bs_ch       \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
                        \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
        .endm

        .macro  inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
                          t0, t1, t2, t3, s0, s1, s2, s3
        inv_in_bs_ch    \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
                        \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
        inv_gf256       \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
                        \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
                        \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
                        \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
        inv_out_bs_ch   \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
                        \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
        .endm
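
        /*
         * Each round key of the bit-sliced key schedule occupies 128 bytes
         * (eight 16-byte vectors, one per bit position). enc_next_rk walks
         * the schedule forwards into v16-v23, dec_next_rk walks it backwards.
         */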
        .macro  enc_next_rk
        ldp     q16, q17, [bskey], #128
        ldp     q18, q19, [bskey, #-96]
        ldp     q20, q21, [bskey, #-64]
        ldp     q22, q23, [bskey, #-32]
        .endm

        .macro  dec_next_rk
        ldp     q16, q17, [bskey, #-128]!
        ldp     q18, q19, [bskey, #32]
        ldp     q20, q21, [bskey, #64]
        ldp     q22, q23, [bskey, #96]
        .endm

        .macro  add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
        eor     \x0\().16b, \x0\().16b, v16.16b
        eor     \x1\().16b, \x1\().16b, v17.16b
        eor     \x2\().16b, \x2\().16b, v18.16b
        eor     \x3\().16b, \x3\().16b, v19.16b
        eor     \x4\().16b, \x4\().16b, v20.16b
        eor     \x5\().16b, \x5\().16b, v21.16b
        eor     \x6\().16b, \x6\().16b, v22.16b
        eor     \x7\().16b, \x7\().16b, v23.16b
        .endm
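
        /*
         * ShiftRows (and its inverse) reduces to a fixed byte permutation of
         * each bit-sliced vector, applied with tbl using one of the masks
         * defined further down.
         */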
        .macro  shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
        tbl     \x0\().16b, {\x0\().16b}, \mask\().16b
        tbl     \x1\().16b, {\x1\().16b}, \mask\().16b
        tbl     \x2\().16b, {\x2\().16b}, \mask\().16b
        tbl     \x3\().16b, {\x3\().16b}, \mask\().16b
        tbl     \x4\().16b, {\x4\().16b}, \mask\().16b
        tbl     \x5\().16b, {\x5\().16b}, \mask\().16b
        tbl     \x6\().16b, {\x6\().16b}, \mask\().16b
        tbl     \x7\().16b, {\x7\().16b}, \mask\().16b
        .endm
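
        /*
         * MixColumns on the bit-sliced state: rotations of the state by one
         * and two columns are built from ext and combined with eor. The \inv
         * argument merely selects the alternative output register wiring
         * used when this macro is invoked from inv_mix_cols below.
         */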
        .macro  mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
                          t0, t1, t2, t3, t4, t5, t6, t7, inv
        ext     \t0\().16b, \x0\().16b, \x0\().16b, #12
        ext     \t1\().16b, \x1\().16b, \x1\().16b, #12
        eor     \x0\().16b, \x0\().16b, \t0\().16b
        ext     \t2\().16b, \x2\().16b, \x2\().16b, #12
        eor     \x1\().16b, \x1\().16b, \t1\().16b
        ext     \t3\().16b, \x3\().16b, \x3\().16b, #12
        eor     \x2\().16b, \x2\().16b, \t2\().16b
        ext     \t4\().16b, \x4\().16b, \x4\().16b, #12
        eor     \x3\().16b, \x3\().16b, \t3\().16b
        ext     \t5\().16b, \x5\().16b, \x5\().16b, #12
        eor     \x4\().16b, \x4\().16b, \t4\().16b
        ext     \t6\().16b, \x6\().16b, \x6\().16b, #12
        eor     \x5\().16b, \x5\().16b, \t5\().16b
        ext     \t7\().16b, \x7\().16b, \x7\().16b, #12
        eor     \x6\().16b, \x6\().16b, \t6\().16b
        eor     \t1\().16b, \t1\().16b, \x0\().16b
        eor     \x7\().16b, \x7\().16b, \t7\().16b
        ext     \x0\().16b, \x0\().16b, \x0\().16b, #8
        eor     \t2\().16b, \t2\().16b, \x1\().16b
        eor     \t0\().16b, \t0\().16b, \x7\().16b
        eor     \t1\().16b, \t1\().16b, \x7\().16b
        ext     \x1\().16b, \x1\().16b, \x1\().16b, #8
        eor     \t5\().16b, \t5\().16b, \x4\().16b
        eor     \x0\().16b, \x0\().16b, \t0\().16b
        eor     \t6\().16b, \t6\().16b, \x5\().16b
        eor     \x1\().16b, \x1\().16b, \t1\().16b
        ext     \t0\().16b, \x4\().16b, \x4\().16b, #8
        eor     \t4\().16b, \t4\().16b, \x3\().16b
        ext     \t1\().16b, \x5\().16b, \x5\().16b, #8
        eor     \t7\().16b, \t7\().16b, \x6\().16b
        ext     \x4\().16b, \x3\().16b, \x3\().16b, #8
        eor     \t3\().16b, \t3\().16b, \x2\().16b
        ext     \x5\().16b, \x7\().16b, \x7\().16b, #8
        eor     \t4\().16b, \t4\().16b, \x7\().16b
        ext     \x3\().16b, \x6\().16b, \x6\().16b, #8
        eor     \t3\().16b, \t3\().16b, \x7\().16b
        ext     \x6\().16b, \x2\().16b, \x2\().16b, #8
        eor     \x7\().16b, \t1\().16b, \t5\().16b
        .ifb    \inv
        eor     \x2\().16b, \t0\().16b, \t4\().16b
        eor     \x4\().16b, \x4\().16b, \t3\().16b
        eor     \x5\().16b, \x5\().16b, \t7\().16b
        eor     \x3\().16b, \x3\().16b, \t6\().16b
        eor     \x6\().16b, \x6\().16b, \t2\().16b
        .else
        eor     \t3\().16b, \t3\().16b, \x4\().16b
        eor     \x5\().16b, \x5\().16b, \t7\().16b
        eor     \x2\().16b, \x3\().16b, \t6\().16b
        eor     \x3\().16b, \t0\().16b, \t4\().16b
        eor     \x4\().16b, \x6\().16b, \t2\().16b
        mov     \x6\().16b, \t3\().16b
        .endif
        .endm
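
        /*
         * Inverse MixColumns: a preprocessing pass over the bit-sliced state
         * followed by the forward mix_cols transform, using its alternative
         * output mapping.
         */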
        .macro  inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
                              t0, t1, t2, t3, t4, t5, t6, t7
        ext     \t0\().16b, \x0\().16b, \x0\().16b, #8
        ext     \t6\().16b, \x6\().16b, \x6\().16b, #8
        ext     \t7\().16b, \x7\().16b, \x7\().16b, #8
        eor     \t0\().16b, \t0\().16b, \x0\().16b
        ext     \t1\().16b, \x1\().16b, \x1\().16b, #8
        eor     \t6\().16b, \t6\().16b, \x6\().16b
        ext     \t2\().16b, \x2\().16b, \x2\().16b, #8
        eor     \t7\().16b, \t7\().16b, \x7\().16b
        ext     \t3\().16b, \x3\().16b, \x3\().16b, #8
        eor     \t1\().16b, \t1\().16b, \x1\().16b
        ext     \t4\().16b, \x4\().16b, \x4\().16b, #8
        eor     \t2\().16b, \t2\().16b, \x2\().16b
        ext     \t5\().16b, \x5\().16b, \x5\().16b, #8
        eor     \t3\().16b, \t3\().16b, \x3\().16b
        eor     \t4\().16b, \t4\().16b, \x4\().16b
        eor     \t5\().16b, \t5\().16b, \x5\().16b
        eor     \x0\().16b, \x0\().16b, \t6\().16b
        eor     \x1\().16b, \x1\().16b, \t6\().16b
        eor     \x2\().16b, \x2\().16b, \t0\().16b
        eor     \x4\().16b, \x4\().16b, \t2\().16b
        eor     \x3\().16b, \x3\().16b, \t1\().16b
        eor     \x1\().16b, \x1\().16b, \t7\().16b
        eor     \x2\().16b, \x2\().16b, \t7\().16b
        eor     \x4\().16b, \x4\().16b, \t6\().16b
        eor     \x5\().16b, \x5\().16b, \t3\().16b
        eor     \x3\().16b, \x3\().16b, \t6\().16b
        eor     \x6\().16b, \x6\().16b, \t4\().16b
        eor     \x4\().16b, \x4\().16b, \t7\().16b
        eor     \x5\().16b, \x5\().16b, \t7\().16b
        eor     \x7\().16b, \x7\().16b, \t5\().16b
        mix_cols \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
                 \t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
        .endm
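
        /*
         * swapmove_2x exchanges bit groups (of the width given by its shift
         * argument) between two pairs of registers; applied with shifts of
         * 1, 2 and 4 it converts the data into the bit-sliced representation
         * and, being self-inverse, back out of it, which is why bitslice is
         * used in both directions.
         */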
        .macro  swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
        ushr    \t0\().2d, \b0\().2d, #\n
        ushr    \t1\().2d, \b1\().2d, #\n
        eor     \t0\().16b, \t0\().16b, \a0\().16b
        eor     \t1\().16b, \t1\().16b, \a1\().16b
        and     \t0\().16b, \t0\().16b, \mask\().16b
        and     \t1\().16b, \t1\().16b, \mask\().16b
        eor     \a0\().16b, \a0\().16b, \t0\().16b
        shl     \t0\().2d, \t0\().2d, #\n
        eor     \a1\().16b, \a1\().16b, \t1\().16b
        shl     \t1\().2d, \t1\().2d, #\n
        eor     \b0\().16b, \b0\().16b, \t0\().16b
        eor     \b1\().16b, \b1\().16b, \t1\().16b
        .endm

        .macro  bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
        movi    \t0\().16b, #0x55
        movi    \t1\().16b, #0x33
        swapmove_2x \x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
        swapmove_2x \x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
        movi    \t0\().16b, #0x0f
        swapmove_2x \x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
        swapmove_2x \x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
        swapmove_2x \x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
        swapmove_2x \x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
        .endm
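
        /*
         * Permutation vectors for use with tbl: M0 is the byte order of the
         * bit-sliced representation, SR/ISR implement ShiftRows and its
         * inverse, and the remaining entries combine the two permutations
         * for the first and last rounds.
         */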
        .align  6
M0:     .octa   0x0004080c0105090d02060a0e03070b0f

M0SR:   .octa   0x0004080c05090d010a0e02060f03070b
SR:     .octa   0x0f0e0d0c0a09080b0504070600030201
SRM0:   .octa   0x01060b0c0207080d0304090e00050a0f

M0ISR:  .octa   0x0004080c0d0105090a0e0206070b0f03
ISR:    .octa   0x0f0e0d0c080b0a090504070602010003
ISRM0:  .octa   0x0306090c00070a0d01040b0e0205080f

/*
 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
 */
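/*
 * Convert an expanded AES key schedule into the bit-sliced format: round 0
 * is stored as is, and every other round key is byte-permuted and split
 * into eight mask vectors with cmtst (one vector per bit position). Bits 0,
 * 1, 5 and 6 are inverted, and the last round key is XORed with 0x63, which
 * folds the S-box affine constant into the key schedule.
 */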
ENTRY(aesbs_convert_key)
        ld1     {v7.4s}, [x1], #16              // load round 0 key
        ld1     {v17.4s}, [x1], #16             // load round 1 key

        movi    v8.16b, #0x01                   // bit masks
        movi    v9.16b, #0x02
        movi    v10.16b, #0x04
        movi    v11.16b, #0x08
        movi    v12.16b, #0x10
        movi    v13.16b, #0x20
        movi    v14.16b, #0x40
        movi    v15.16b, #0x80
        ldr     q16, M0

        sub     x2, x2, #1
        str     q7, [x0], #16                   // save round 0 key

.Lkey_loop:
        tbl     v7.16b, {v17.16b}, v16.16b
        ld1     {v17.4s}, [x1], #16             // load next round key

        cmtst   v0.16b, v7.16b, v8.16b
        cmtst   v1.16b, v7.16b, v9.16b
        cmtst   v2.16b, v7.16b, v10.16b
        cmtst   v3.16b, v7.16b, v11.16b
        cmtst   v4.16b, v7.16b, v12.16b
        cmtst   v5.16b, v7.16b, v13.16b
        cmtst   v6.16b, v7.16b, v14.16b
        cmtst   v7.16b, v7.16b, v15.16b
        not     v0.16b, v0.16b
        not     v1.16b, v1.16b
        not     v5.16b, v5.16b
        not     v6.16b, v6.16b

        subs    x2, x2, #1
        stp     q0, q1, [x0], #128
        stp     q2, q3, [x0, #-96]
        stp     q4, q5, [x0, #-64]
        stp     q6, q7, [x0, #-32]
        b.ne    .Lkey_loop

        movi    v7.16b, #0x63                   // compose .L63
        eor     v17.16b, v17.16b, v7.16b
        str     q17, [x0]
        ret
ENDPROC(aesbs_convert_key)
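
/*
 * aesbs_encrypt8 - encrypt up to eight blocks, held in v0-v7, using the
 * bit-sliced key schedule pointed to by bskey for 'rounds' rounds. The
 * resulting blocks come back in the order v0, v1, v4, v6, v3, v7, v2, v5,
 * which is why the callers pass that register list for the output.
 */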
        .align  4
aesbs_encrypt8:
        ldr     q9, [bskey], #16                // round 0 key
        ldr     q8, M0SR
        ldr     q24, SR

        eor     v10.16b, v0.16b, v9.16b         // xor with round0 key
        eor     v11.16b, v1.16b, v9.16b
        tbl     v0.16b, {v10.16b}, v8.16b
        eor     v12.16b, v2.16b, v9.16b
        tbl     v1.16b, {v11.16b}, v8.16b
        eor     v13.16b, v3.16b, v9.16b
        tbl     v2.16b, {v12.16b}, v8.16b
        eor     v14.16b, v4.16b, v9.16b
        tbl     v3.16b, {v13.16b}, v8.16b
        eor     v15.16b, v5.16b, v9.16b
        tbl     v4.16b, {v14.16b}, v8.16b
        eor     v10.16b, v6.16b, v9.16b
        tbl     v5.16b, {v15.16b}, v8.16b
        eor     v11.16b, v7.16b, v9.16b
        tbl     v6.16b, {v10.16b}, v8.16b
        tbl     v7.16b, {v11.16b}, v8.16b

        bitslice v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

        sub     rounds, rounds, #1
        b       .Lenc_sbox

.Lenc_loop:
        shift_rows v0, v1, v2, v3, v4, v5, v6, v7, v24
.Lenc_sbox:
        sbox    v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
                v13, v14, v15
        subs    rounds, rounds, #1
        b.cc    .Lenc_done

        enc_next_rk

        mix_cols v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
                 v13, v14, v15

        add_round_key v0, v1, v2, v3, v4, v5, v6, v7

        b.ne    .Lenc_loop
        ldr     q24, SRM0
        b       .Lenc_loop

.Lenc_done:
        ldr     q12, [bskey]                    // last round key

        bitslice v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11

        eor     v0.16b, v0.16b, v12.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v4.16b, v4.16b, v12.16b
        eor     v6.16b, v6.16b, v12.16b
        eor     v3.16b, v3.16b, v12.16b
        eor     v7.16b, v7.16b, v12.16b
        eor     v2.16b, v2.16b, v12.16b
        eor     v5.16b, v5.16b, v12.16b
        ret
ENDPROC(aesbs_encrypt8)
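
/*
 * aesbs_decrypt8 - decrypt up to eight blocks, held in v0-v7; the bit-sliced
 * key schedule is traversed backwards, and the resulting blocks come back
 * in the order v0, v1, v6, v4, v2, v7, v3, v5.
 */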
        .align  4
aesbs_decrypt8:
        lsl     x9, rounds, #7
        add     bskey, bskey, x9
        ldr     q9, [bskey, #-112]!             // round 0 key
        ldr     q8, M0ISR
        ldr     q24, ISR

        eor     v10.16b, v0.16b, v9.16b         // xor with round0 key
        eor     v11.16b, v1.16b, v9.16b
        tbl     v0.16b, {v10.16b}, v8.16b
        eor     v12.16b, v2.16b, v9.16b
        tbl     v1.16b, {v11.16b}, v8.16b
        eor     v13.16b, v3.16b, v9.16b
        tbl     v2.16b, {v12.16b}, v8.16b
        eor     v14.16b, v4.16b, v9.16b
        tbl     v3.16b, {v13.16b}, v8.16b
        eor     v15.16b, v5.16b, v9.16b
        tbl     v4.16b, {v14.16b}, v8.16b
        eor     v10.16b, v6.16b, v9.16b
        tbl     v5.16b, {v15.16b}, v8.16b
        eor     v11.16b, v7.16b, v9.16b
        tbl     v6.16b, {v10.16b}, v8.16b
        tbl     v7.16b, {v11.16b}, v8.16b

        bitslice v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

        sub     rounds, rounds, #1
        b       .Ldec_sbox

.Ldec_loop:
        shift_rows v0, v1, v2, v3, v4, v5, v6, v7, v24
.Ldec_sbox:
        inv_sbox v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
                 v13, v14, v15
        subs    rounds, rounds, #1
        b.cc    .Ldec_done

        dec_next_rk

        add_round_key v0, v1, v6, v4, v2, v7, v3, v5

        inv_mix_cols v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
                     v13, v14, v15

        b.ne    .Ldec_loop
        ldr     q24, ISRM0
        b       .Ldec_loop

.Ldec_done:
        ldr     q12, [bskey, #-16]              // last round key

        bitslice v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11

        eor     v0.16b, v0.16b, v12.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v6.16b, v6.16b, v12.16b
        eor     v4.16b, v4.16b, v12.16b
        eor     v2.16b, v2.16b, v12.16b
        eor     v7.16b, v7.16b, v12.16b
        eor     v3.16b, v3.16b, v12.16b
        eor     v5.16b, v5.16b, v12.16b
        ret
ENDPROC(aesbs_decrypt8)

/*
 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *                   int blocks)
 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *                   int blocks)
 */
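
        /*
         * Below, x5 is set to 1 << blocks when fewer than eight blocks
         * remain (and to 0 otherwise), so the tbnz on bit N cuts the load
         * and store sequences short after exactly N blocks. The other modes
         * use the same construct.
         */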
        .macro  __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
        frame_push 5

        mov     x19, x0
        mov     x20, x1
        mov     x21, x2
        mov     x22, x3
        mov     x23, x4

99:     mov     x5, #1
        lsl     x5, x5, x23
        subs    w23, w23, #8
        csel    x23, x23, xzr, pl
        csel    x5, x5, xzr, mi

        ld1     {v0.16b}, [x20], #16
        tbnz    x5, #1, 0f
        ld1     {v1.16b}, [x20], #16
        tbnz    x5, #2, 0f
        ld1     {v2.16b}, [x20], #16
        tbnz    x5, #3, 0f
        ld1     {v3.16b}, [x20], #16
        tbnz    x5, #4, 0f
        ld1     {v4.16b}, [x20], #16
        tbnz    x5, #5, 0f
        ld1     {v5.16b}, [x20], #16
        tbnz    x5, #6, 0f
        ld1     {v6.16b}, [x20], #16
        tbnz    x5, #7, 0f
        ld1     {v7.16b}, [x20], #16

0:      mov     bskey, x21
        mov     rounds, x22
        bl      \do8

        st1     {\o0\().16b}, [x19], #16
        tbnz    x5, #1, 1f
        st1     {\o1\().16b}, [x19], #16
        tbnz    x5, #2, 1f
        st1     {\o2\().16b}, [x19], #16
        tbnz    x5, #3, 1f
        st1     {\o3\().16b}, [x19], #16
        tbnz    x5, #4, 1f
        st1     {\o4\().16b}, [x19], #16
        tbnz    x5, #5, 1f
        st1     {\o5\().16b}, [x19], #16
        tbnz    x5, #6, 1f
        st1     {\o6\().16b}, [x19], #16
        tbnz    x5, #7, 1f
        st1     {\o7\().16b}, [x19], #16

        cbz     x23, 1f
        cond_yield_neon
        b       99b

1:      frame_pop
        ret
        .endm

        .align  4
ENTRY(aesbs_ecb_encrypt)
        __ecb_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
ENDPROC(aesbs_ecb_encrypt)

        .align  4
ENTRY(aesbs_ecb_decrypt)
        __ecb_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
ENDPROC(aesbs_ecb_decrypt)

/*
 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *                   int blocks, u8 iv[])
 */
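/*
 * CBC decryption: up to eight ciphertext blocks are copied to v25-v31
 * before being decrypted, so that each plaintext block can be XORed with
 * the preceding ciphertext block (or with the IV for the first block).
 * The last ciphertext block consumed is written back to iv[] for the
 * next call.
 */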
        .align  4
ENTRY(aesbs_cbc_decrypt)
        frame_push 6

        mov     x19, x0
        mov     x20, x1
        mov     x21, x2
        mov     x22, x3
        mov     x23, x4
        mov     x24, x5

99:     mov     x6, #1
        lsl     x6, x6, x23
        subs    w23, w23, #8
        csel    x23, x23, xzr, pl
        csel    x6, x6, xzr, mi

        ld1     {v0.16b}, [x20], #16
        mov     v25.16b, v0.16b
        tbnz    x6, #1, 0f
        ld1     {v1.16b}, [x20], #16
        mov     v26.16b, v1.16b
        tbnz    x6, #2, 0f
        ld1     {v2.16b}, [x20], #16
        mov     v27.16b, v2.16b
        tbnz    x6, #3, 0f
        ld1     {v3.16b}, [x20], #16
        mov     v28.16b, v3.16b
        tbnz    x6, #4, 0f
        ld1     {v4.16b}, [x20], #16
        mov     v29.16b, v4.16b
        tbnz    x6, #5, 0f
        ld1     {v5.16b}, [x20], #16
        mov     v30.16b, v5.16b
        tbnz    x6, #6, 0f
        ld1     {v6.16b}, [x20], #16
        mov     v31.16b, v6.16b
        tbnz    x6, #7, 0f
        ld1     {v7.16b}, [x20]

0:      mov     bskey, x21
        mov     rounds, x22
        bl      aesbs_decrypt8

        ld1     {v24.16b}, [x24]                // load IV

        eor     v1.16b, v1.16b, v25.16b
        eor     v6.16b, v6.16b, v26.16b
        eor     v4.16b, v4.16b, v27.16b
        eor     v2.16b, v2.16b, v28.16b
        eor     v7.16b, v7.16b, v29.16b
        eor     v0.16b, v0.16b, v24.16b
        eor     v3.16b, v3.16b, v30.16b
        eor     v5.16b, v5.16b, v31.16b

        st1     {v0.16b}, [x19], #16
        mov     v24.16b, v25.16b
        tbnz    x6, #1, 1f
        st1     {v1.16b}, [x19], #16
        mov     v24.16b, v26.16b
        tbnz    x6, #2, 1f
        st1     {v6.16b}, [x19], #16
        mov     v24.16b, v27.16b
        tbnz    x6, #3, 1f
        st1     {v4.16b}, [x19], #16
        mov     v24.16b, v28.16b
        tbnz    x6, #4, 1f
        st1     {v2.16b}, [x19], #16
        mov     v24.16b, v29.16b
        tbnz    x6, #5, 1f
        st1     {v7.16b}, [x19], #16
        mov     v24.16b, v30.16b
        tbnz    x6, #6, 1f
        st1     {v3.16b}, [x19], #16
        mov     v24.16b, v31.16b
        tbnz    x6, #7, 1f
        ld1     {v24.16b}, [x20], #16
        st1     {v5.16b}, [x19], #16
1:      st1     {v24.16b}, [x24]                // store IV

        cbz     x23, 2f
        cond_yield_neon
        b       99b

2:      frame_pop
        ret
ENDPROC(aesbs_cbc_decrypt)
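
/*
 * next_tweak multiplies the current XTS tweak by x in GF(2^128): both 64-bit
 * lanes are doubled, the carry out of the low lane is propagated into the
 * high lane, and a carry out of bit 127 is reduced with the 0x87 polynomial
 * constant (.Lxts_mul_x below).
 */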
        .macro  next_tweak, out, in, const, tmp
        sshr    \tmp\().2d, \in\().2d, #63
        and     \tmp\().16b, \tmp\().16b, \const\().16b
        add     \out\().2d, \in\().2d, \in\().2d
        ext     \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
        eor     \out\().16b, \out\().16b, \tmp\().16b
        .endm

        .align  4
.Lxts_mul_x:
CPU_LE( .quad   1, 0x87 )
CPU_BE( .quad   0x87, 1 )

/*
 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *                   int blocks, u8 iv[])
 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *                   int blocks, u8 iv[])
 */
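/*
 * __xts_crypt8 loads up to eight blocks and XORs each with its tweak before
 * tail-calling into the crypt routine passed in x7. The first four tweaks
 * remain in v25-v28; the tweaks for blocks 4 through 7 are saved to the
 * stack frame and reloaded by the caller once the crypt routine returns.
 */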
__xts_crypt8:
        mov     x6, #1
        lsl     x6, x6, x23
        subs    w23, w23, #8
        csel    x23, x23, xzr, pl
        csel    x6, x6, xzr, mi

        ld1     {v0.16b}, [x20], #16
        next_tweak v26, v25, v30, v31
        eor     v0.16b, v0.16b, v25.16b
        tbnz    x6, #1, 0f

        ld1     {v1.16b}, [x20], #16
        next_tweak v27, v26, v30, v31
        eor     v1.16b, v1.16b, v26.16b
        tbnz    x6, #2, 0f

        ld1     {v2.16b}, [x20], #16
        next_tweak v28, v27, v30, v31
        eor     v2.16b, v2.16b, v27.16b
        tbnz    x6, #3, 0f

        ld1     {v3.16b}, [x20], #16
        next_tweak v29, v28, v30, v31
        eor     v3.16b, v3.16b, v28.16b
        tbnz    x6, #4, 0f

        ld1     {v4.16b}, [x20], #16
        str     q29, [sp, #.Lframe_local_offset]
        eor     v4.16b, v4.16b, v29.16b
        next_tweak v29, v29, v30, v31
        tbnz    x6, #5, 0f

        ld1     {v5.16b}, [x20], #16
        str     q29, [sp, #.Lframe_local_offset + 16]
        eor     v5.16b, v5.16b, v29.16b
        next_tweak v29, v29, v30, v31
        tbnz    x6, #6, 0f

        ld1     {v6.16b}, [x20], #16
        str     q29, [sp, #.Lframe_local_offset + 32]
        eor     v6.16b, v6.16b, v29.16b
        next_tweak v29, v29, v30, v31
        tbnz    x6, #7, 0f

        ld1     {v7.16b}, [x20], #16
        str     q29, [sp, #.Lframe_local_offset + 48]
        eor     v7.16b, v7.16b, v29.16b
        next_tweak v29, v29, v30, v31

0:      mov     bskey, x21
        mov     rounds, x22
        br      x7
ENDPROC(__xts_crypt8)

        .macro  __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
        frame_push 6, 64

        mov     x19, x0
        mov     x20, x1
        mov     x21, x2
        mov     x22, x3
        mov     x23, x4
        mov     x24, x5

0:      ldr     q30, .Lxts_mul_x
        ld1     {v25.16b}, [x24]

99:     adr     x7, \do8
        bl      __xts_crypt8

        ldp     q16, q17, [sp, #.Lframe_local_offset]
        ldp     q18, q19, [sp, #.Lframe_local_offset + 32]

        eor     \o0\().16b, \o0\().16b, v25.16b
        eor     \o1\().16b, \o1\().16b, v26.16b
        eor     \o2\().16b, \o2\().16b, v27.16b
        eor     \o3\().16b, \o3\().16b, v28.16b

        st1     {\o0\().16b}, [x19], #16
        mov     v25.16b, v26.16b
        tbnz    x6, #1, 1f
        st1     {\o1\().16b}, [x19], #16
        mov     v25.16b, v27.16b
        tbnz    x6, #2, 1f
        st1     {\o2\().16b}, [x19], #16
        mov     v25.16b, v28.16b
        tbnz    x6, #3, 1f
        st1     {\o3\().16b}, [x19], #16
        mov     v25.16b, v29.16b
        tbnz    x6, #4, 1f

        eor     \o4\().16b, \o4\().16b, v16.16b
        eor     \o5\().16b, \o5\().16b, v17.16b
        eor     \o6\().16b, \o6\().16b, v18.16b
        eor     \o7\().16b, \o7\().16b, v19.16b

        st1     {\o4\().16b}, [x19], #16
        tbnz    x6, #5, 1f
        st1     {\o5\().16b}, [x19], #16
        tbnz    x6, #6, 1f
        st1     {\o6\().16b}, [x19], #16
        tbnz    x6, #7, 1f
        st1     {\o7\().16b}, [x19], #16

        cbz     x23, 1f
        st1     {v25.16b}, [x24]

        cond_yield_neon 0b
        b       99b

1:      st1     {v25.16b}, [x24]
        frame_pop
        ret
        .endm

ENTRY(aesbs_xts_encrypt)
        __xts_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
ENDPROC(aesbs_xts_encrypt)

ENTRY(aesbs_xts_decrypt)
        __xts_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
ENDPROC(aesbs_xts_decrypt)
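
/*
 * next_ctr writes the 128-bit counter held in x7 (high word) and x8 (low
 * word) into the given vector in big-endian byte order, and post-increments
 * the counter.
 */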
        .macro  next_ctr, v
        mov     \v\().d[1], x8
        adds    x8, x8, #1
        mov     \v\().d[0], x7
        adc     x7, x7, xzr
        rev64   \v\().16b, \v\().16b
        .endm

/*
 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
 *                   int rounds, int blocks, u8 iv[], u8 final[])
 */
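/*
 * When final[] is non-NULL, one extra counter block is processed, and its
 * keystream is stored to final[] instead of being XORed into the output,
 * so the caller can deal with a trailing partial block (see the comment
 * near the end of this routine).
 */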
ENTRY(aesbs_ctr_encrypt)
        frame_push 8

        mov     x19, x0
        mov     x20, x1
        mov     x21, x2
        mov     x22, x3
        mov     x23, x4
        mov     x24, x5
        mov     x25, x6

        cmp     x25, #0
        cset    x26, ne
        add     x23, x23, x26           // do one extra block if final

98:     ldp     x7, x8, [x24]
        ld1     {v0.16b}, [x24]
CPU_LE( rev     x7, x7 )
CPU_LE( rev     x8, x8 )
        adds    x8, x8, #1
        adc     x7, x7, xzr

99:     mov     x9, #1
        lsl     x9, x9, x23
        subs    w23, w23, #8
        csel    x23, x23, xzr, pl
        csel    x9, x9, xzr, le

        tbnz    x9, #1, 0f
        next_ctr v1
        tbnz    x9, #2, 0f
        next_ctr v2
        tbnz    x9, #3, 0f
        next_ctr v3
        tbnz    x9, #4, 0f
        next_ctr v4
        tbnz    x9, #5, 0f
        next_ctr v5
        tbnz    x9, #6, 0f
        next_ctr v6
        tbnz    x9, #7, 0f
        next_ctr v7

0:      mov     bskey, x21
        mov     rounds, x22
        bl      aesbs_encrypt8

        lsr     x9, x9, x26             // disregard the extra block
        tbnz    x9, #0, 0f

        ld1     {v8.16b}, [x20], #16
        eor     v0.16b, v0.16b, v8.16b
        st1     {v0.16b}, [x19], #16
        tbnz    x9, #1, 1f

        ld1     {v9.16b}, [x20], #16
        eor     v1.16b, v1.16b, v9.16b
        st1     {v1.16b}, [x19], #16
        tbnz    x9, #2, 2f

        ld1     {v10.16b}, [x20], #16
        eor     v4.16b, v4.16b, v10.16b
        st1     {v4.16b}, [x19], #16
        tbnz    x9, #3, 3f

        ld1     {v11.16b}, [x20], #16
        eor     v6.16b, v6.16b, v11.16b
        st1     {v6.16b}, [x19], #16
        tbnz    x9, #4, 4f

        ld1     {v12.16b}, [x20], #16
        eor     v3.16b, v3.16b, v12.16b
        st1     {v3.16b}, [x19], #16
        tbnz    x9, #5, 5f

        ld1     {v13.16b}, [x20], #16
        eor     v7.16b, v7.16b, v13.16b
        st1     {v7.16b}, [x19], #16
        tbnz    x9, #6, 6f

        ld1     {v14.16b}, [x20], #16
        eor     v2.16b, v2.16b, v14.16b
        st1     {v2.16b}, [x19], #16
        tbnz    x9, #7, 7f

        ld1     {v15.16b}, [x20], #16
        eor     v5.16b, v5.16b, v15.16b
        st1     {v5.16b}, [x19], #16

8:      next_ctr v0
        st1     {v0.16b}, [x24]
        cbz     x23, .Lctr_done

        cond_yield_neon 98b
        b       99b

.Lctr_done:
        frame_pop
        ret

        /*
         * If we are handling the tail of the input (x6 != NULL), return the
         * final keystream block back to the caller.
         */
0:      cbz     x25, 8b
        st1     {v0.16b}, [x25]
        b       8b
1:      cbz     x25, 8b
        st1     {v1.16b}, [x25]
        b       8b
2:      cbz     x25, 8b
        st1     {v4.16b}, [x25]
        b       8b
3:      cbz     x25, 8b
        st1     {v6.16b}, [x25]
        b       8b
4:      cbz     x25, 8b
        st1     {v3.16b}, [x25]
        b       8b
5:      cbz     x25, 8b
        st1     {v7.16b}, [x25]
        b       8b
6:      cbz     x25, 8b
        st1     {v2.16b}, [x25]
        b       8b
7:      cbz     x25, 8b
        st1     {v5.16b}, [x25]
        b       8b
ENDPROC(aesbs_ctr_encrypt)