rijndael-vaes-avx2-i386.S

  1. /* VAES/AVX2 i386 accelerated AES for Libgcrypt
  2. * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  3. *
  4. * This file is part of Libgcrypt.
  5. *
  6. * Libgcrypt is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as
  8. * published by the Free Software Foundation; either version 2.1 of
  9. * the License, or (at your option) any later version.
  10. *
  11. * Libgcrypt is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with this program; if not, see <http://www.gnu.org/licenses/>.
  18. */
  19. #if defined(__i386__)
  20. #include <config.h>
  21. #if (defined(HAVE_COMPATIBLE_GCC_I386_PLATFORM_AS) || \
  22. defined(HAVE_COMPATIBLE_GCC_WIN32_PLATFORM_AS)) && \
  23. defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) && \
  24. defined(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL)
  25. #include "asm-common-i386.h"
  26. .text
  27. DECL_GET_PC_THUNK(eax);
  28. /**********************************************************************
  29. helper macros
  30. **********************************************************************/
  31. #define AES_OP4(op, key, b0, b1, b2, b3) \
  32. op key, b0, b0; \
  33. op key, b1, b1; \
  34. op key, b2, b2; \
  35. op key, b3, b3;
  36. #define VAESENC4(key, b0, b1, b2, b3) \
  37. AES_OP4(vaesenc, key, b0, b1, b2, b3)
  38. #define VAESDEC4(key, b0, b1, b2, b3) \
  39. AES_OP4(vaesdec, key, b0, b1, b2, b3)
  40. #define XOR4(key, b0, b1, b2, b3) \
  41. AES_OP4(vpxor, key, b0, b1, b2, b3)
  42. #define AES_OP2(op, key, b0, b1) \
  43. op key, b0, b0; \
  44. op key, b1, b1;
  45. #define VAESENC2(key, b0, b1) \
  46. AES_OP2(vaesenc, key, b0, b1)
  47. #define VAESDEC2(key, b0, b1) \
  48. AES_OP2(vaesdec, key, b0, b1)
  49. #define XOR2(key, b0, b1) \
  50. AES_OP2(vpxor, key, b0, b1)
  51. #define VAESENC6(key, b0, b1, b2, b3, b4, b5) \
  52. AES_OP4(vaesenc, key, b0, b1, b2, b3); \
  53. AES_OP2(vaesenc, key, b4, b5)
  54. #define VAESDEC6(key, b0, b1, b2, b3, b4, b5) \
  55. AES_OP4(vaesdec, key, b0, b1, b2, b3); \
  56. AES_OP2(vaesdec, key, b4, b5)
  57. #define XOR6(key, b0, b1, b2, b3, b4, b5) \
  58. AES_OP4(vpxor, key, b0, b1, b2, b3); \
  59. AES_OP2(vpxor, key, b4, b5)
  60. #define CADDR(name, reg) \
  61. (name - SYM_NAME(_gcry_vaes_consts))(reg)
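/* DECL_GET_PC_THUNK/GET_DATA_POINTER (see "asm-common-i386.h") fetch the
 * runtime address of the constant pool _gcry_vaes_consts into a register, and
 * CADDR() turns a constant's label into an offset from that base, keeping the
 * code position independent.  The AES_OP2/AES_OP4 helpers simply apply one
 * instruction to two, four or six ymm block registers. */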
  62. /**********************************************************************
  63. CBC-mode decryption
  64. **********************************************************************/
  65. ELF(.type SYM_NAME(_gcry_vaes_avx2_cbc_dec_i386),@function)
  66. .globl SYM_NAME(_gcry_vaes_avx2_cbc_dec_i386)
  67. .align 16
  68. SYM_NAME(_gcry_vaes_avx2_cbc_dec_i386):
  69. /* input:
  70. * (esp + 4): round keys
  71. * (esp + 8): iv
  72. * (esp + 12): dst
  73. * (esp + 16): src
  74. * (esp + 20): nblocks
  75. * (esp + 24): nrounds
  76. */
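/* Roughly equivalent C prototype (argument types assumed from how the stack
 * slots are used below; all arguments are passed on the stack):
 *   void _gcry_vaes_avx2_cbc_dec_i386(const void *round_keys, unsigned char *iv,
 *                                     void *dst, const void *src,
 *                                     size_t nblocks, unsigned int nrounds); */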
  77. CFI_STARTPROC();
  78. pushl %edi;
  79. CFI_PUSH(%edi);
  80. pushl %esi;
  81. CFI_PUSH(%esi);
  82. movl 8+4(%esp), %edi;
  83. movl 8+8(%esp), %esi;
  84. movl 8+12(%esp), %edx;
  85. movl 8+16(%esp), %ecx;
  86. movl 8+20(%esp), %eax;
  87. /* Process 8 blocks per loop. */
  88. .align 8
  89. .Lcbc_dec_blk8:
  90. cmpl $8, %eax;
  91. jb .Lcbc_dec_blk4;
  92. leal -8(%eax), %eax;
  93. /* Load input and xor first key. Update IV. */
  94. vbroadcasti128 (0 * 16)(%edi), %ymm4;
  95. vmovdqu (0 * 16)(%ecx), %ymm0;
  96. vmovdqu (2 * 16)(%ecx), %ymm1;
  97. vmovdqu (4 * 16)(%ecx), %ymm2;
  98. vmovdqu (6 * 16)(%ecx), %ymm3;
  99. vmovdqu (%esi), %xmm6; /* Load IV. */
  100. vinserti128 $1, %xmm0, %ymm6, %ymm5;
  101. vextracti128 $1, %ymm3, (%esi); /* Store IV. */
  102. vpxor %ymm4, %ymm0, %ymm0;
  103. vpxor %ymm4, %ymm1, %ymm1;
  104. vpxor %ymm4, %ymm2, %ymm2;
  105. vpxor %ymm4, %ymm3, %ymm3;
  106. vmovdqu (1 * 16)(%ecx), %ymm6;
  107. vmovdqu (3 * 16)(%ecx), %ymm7;
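/* ymm5 now holds IV:C0, and ymm6/ymm7 (plus (5 * 16)(%ecx), folded in below)
 * hold the remaining preceding ciphertext blocks.  XORing them into the last
 * round key lets vaesdeclast produce P_i = AES-decrypt(C_i) ^ C_{i-1} directly. */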
  108. /* AES rounds */
  109. vbroadcasti128 (1 * 16)(%edi), %ymm4;
  110. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  111. vbroadcasti128 (2 * 16)(%edi), %ymm4;
  112. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  113. vbroadcasti128 (3 * 16)(%edi), %ymm4;
  114. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  115. vbroadcasti128 (4 * 16)(%edi), %ymm4;
  116. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  117. vbroadcasti128 (5 * 16)(%edi), %ymm4;
  118. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  119. vbroadcasti128 (6 * 16)(%edi), %ymm4;
  120. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  121. vbroadcasti128 (7 * 16)(%edi), %ymm4;
  122. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  123. vbroadcasti128 (8 * 16)(%edi), %ymm4;
  124. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  125. vbroadcasti128 (9 * 16)(%edi), %ymm4;
  126. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  127. vbroadcasti128 (10 * 16)(%edi), %ymm4;
  128. cmpl $12, 8+24(%esp);
  129. jb .Lcbc_dec_blk8_last;
  130. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  131. vbroadcasti128 (11 * 16)(%edi), %ymm4;
  132. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  133. vbroadcasti128 (12 * 16)(%edi), %ymm4;
  134. jz .Lcbc_dec_blk8_last;
  135. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  136. vbroadcasti128 (13 * 16)(%edi), %ymm4;
  137. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  138. vbroadcasti128 (14 * 16)(%edi), %ymm4;
  139. /* Last round and output handling. */
  140. .Lcbc_dec_blk8_last:
  141. vpxor %ymm4, %ymm5, %ymm5;
  142. vpxor %ymm4, %ymm6, %ymm6;
  143. vpxor %ymm4, %ymm7, %ymm7;
  144. vpxor (5 * 16)(%ecx), %ymm4, %ymm4;
  145. leal (8 * 16)(%ecx), %ecx;
  146. vaesdeclast %ymm5, %ymm0, %ymm0;
  147. vaesdeclast %ymm6, %ymm1, %ymm1;
  148. vaesdeclast %ymm7, %ymm2, %ymm2;
  149. vaesdeclast %ymm4, %ymm3, %ymm3;
  150. vmovdqu %ymm0, (0 * 16)(%edx);
  151. vmovdqu %ymm1, (2 * 16)(%edx);
  152. vmovdqu %ymm2, (4 * 16)(%edx);
  153. vmovdqu %ymm3, (6 * 16)(%edx);
  154. leal (8 * 16)(%edx), %edx;
  155. jmp .Lcbc_dec_blk8;
  156. /* Handle trailing four blocks. */
  157. .align 8
  158. .Lcbc_dec_blk4:
  159. cmpl $4, %eax;
  160. jb .Lcbc_dec_blk1;
  161. leal -4(%eax), %eax;
  162. /* Load input and xor first key. Update IV. */
  163. vbroadcasti128 (0 * 16)(%edi), %ymm4;
  164. vmovdqu (0 * 16)(%ecx), %ymm0;
  165. vmovdqu (2 * 16)(%ecx), %ymm1;
  166. vmovdqu (%esi), %xmm6; /* Load IV. */
  167. vinserti128 $1, %xmm0, %ymm6, %ymm5;
  168. vextracti128 $1, %ymm1, (%esi); /* Store IV. */
  169. vpxor %ymm4, %ymm0, %ymm0;
  170. vpxor %ymm4, %ymm1, %ymm1;
  171. vmovdqu (1 * 16)(%ecx), %ymm6;
  172. leal (4 * 16)(%ecx), %ecx;
  173. /* AES rounds */
  174. vbroadcasti128 (1 * 16)(%edi), %ymm4;
  175. VAESDEC2(%ymm4, %ymm0, %ymm1);
  176. vbroadcasti128 (2 * 16)(%edi), %ymm4;
  177. VAESDEC2(%ymm4, %ymm0, %ymm1);
  178. vbroadcasti128 (3 * 16)(%edi), %ymm4;
  179. VAESDEC2(%ymm4, %ymm0, %ymm1);
  180. vbroadcasti128 (4 * 16)(%edi), %ymm4;
  181. VAESDEC2(%ymm4, %ymm0, %ymm1);
  182. vbroadcasti128 (5 * 16)(%edi), %ymm4;
  183. VAESDEC2(%ymm4, %ymm0, %ymm1);
  184. vbroadcasti128 (6 * 16)(%edi), %ymm4;
  185. VAESDEC2(%ymm4, %ymm0, %ymm1);
  186. vbroadcasti128 (7 * 16)(%edi), %ymm4;
  187. VAESDEC2(%ymm4, %ymm0, %ymm1);
  188. vbroadcasti128 (8 * 16)(%edi), %ymm4;
  189. VAESDEC2(%ymm4, %ymm0, %ymm1);
  190. vbroadcasti128 (9 * 16)(%edi), %ymm4;
  191. VAESDEC2(%ymm4, %ymm0, %ymm1);
  192. vbroadcasti128 (10 * 16)(%edi), %ymm4;
  193. cmpl $12, 8+24(%esp);
  194. jb .Lcbc_dec_blk4_last;
  195. VAESDEC2(%ymm4, %ymm0, %ymm1);
  196. vbroadcasti128 (11 * 16)(%edi), %ymm4;
  197. VAESDEC2(%ymm4, %ymm0, %ymm1);
  198. vbroadcasti128 (12 * 16)(%edi), %ymm4;
  199. jz .Lcbc_dec_blk4_last;
  200. VAESDEC2(%ymm4, %ymm0, %ymm1);
  201. vbroadcasti128 (13 * 16)(%edi), %ymm4;
  202. VAESDEC2(%ymm4, %ymm0, %ymm1);
  203. vbroadcasti128 (14 * 16)(%edi), %ymm4;
  204. /* Last round and output handling. */
  205. .Lcbc_dec_blk4_last:
  206. vpxor %ymm4, %ymm5, %ymm5;
  207. vpxor %ymm4, %ymm6, %ymm6;
  208. vaesdeclast %ymm5, %ymm0, %ymm0;
  209. vaesdeclast %ymm6, %ymm1, %ymm1;
  210. vmovdqu %ymm0, (0 * 16)(%edx);
  211. vmovdqu %ymm1, (2 * 16)(%edx);
  212. leal (4 * 16)(%edx), %edx;
  213. /* Process trailing one to three blocks, one per loop. */
  214. .align 8
  215. .Lcbc_dec_blk1:
  216. cmpl $1, %eax;
  217. jb .Ldone_cbc_dec;
  218. leal -1(%eax), %eax;
  219. /* Load input. */
  220. vmovdqu (%ecx), %xmm2;
  221. leal 16(%ecx), %ecx;
  222. /* Xor first key. */
  223. vpxor (0 * 16)(%edi), %xmm2, %xmm0;
  224. /* AES rounds. */
  225. vaesdec (1 * 16)(%edi), %xmm0, %xmm0;
  226. vaesdec (2 * 16)(%edi), %xmm0, %xmm0;
  227. vaesdec (3 * 16)(%edi), %xmm0, %xmm0;
  228. vaesdec (4 * 16)(%edi), %xmm0, %xmm0;
  229. vaesdec (5 * 16)(%edi), %xmm0, %xmm0;
  230. vaesdec (6 * 16)(%edi), %xmm0, %xmm0;
  231. vaesdec (7 * 16)(%edi), %xmm0, %xmm0;
  232. vaesdec (8 * 16)(%edi), %xmm0, %xmm0;
  233. vaesdec (9 * 16)(%edi), %xmm0, %xmm0;
  234. vmovdqa (10 * 16)(%edi), %xmm1;
  235. cmpl $12, 8+24(%esp);
  236. jb .Lcbc_dec_blk1_last;
  237. vaesdec %xmm1, %xmm0, %xmm0;
  238. vaesdec (11 * 16)(%edi), %xmm0, %xmm0;
  239. vmovdqa (12 * 16)(%edi), %xmm1;
  240. jz .Lcbc_dec_blk1_last;
  241. vaesdec %xmm1, %xmm0, %xmm0;
  242. vaesdec (13 * 16)(%edi), %xmm0, %xmm0;
  243. vmovdqa (14 * 16)(%edi), %xmm1;
  244. /* Last round and output handling. */
  245. .Lcbc_dec_blk1_last:
  246. vpxor (%esi), %xmm1, %xmm1;
  247. vaesdeclast %xmm1, %xmm0, %xmm0;
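/* Keep this block's ciphertext as the IV for the next block. */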
  248. vmovdqu %xmm2, (%esi);
  249. vmovdqu %xmm0, (%edx);
  250. leal 16(%edx), %edx;
  251. jmp .Lcbc_dec_blk1;
  252. .align 8
  253. .Ldone_cbc_dec:
  254. popl %esi;
  255. CFI_POP(%esi);
  256. popl %edi;
  257. CFI_POP(%edi);
  258. vzeroall;
  259. ret_spec_stop
  260. CFI_ENDPROC();
  261. ELF(.size SYM_NAME(_gcry_vaes_avx2_cbc_dec_i386),
  262. .-SYM_NAME(_gcry_vaes_avx2_cbc_dec_i386))
  263. /**********************************************************************
  264. CFB-mode decryption
  265. **********************************************************************/
  266. ELF(.type SYM_NAME(_gcry_vaes_avx2_cfb_dec_i386),@function)
  267. .globl SYM_NAME(_gcry_vaes_avx2_cfb_dec_i386)
  268. .align 16
  269. SYM_NAME(_gcry_vaes_avx2_cfb_dec_i386):
  270. /* input:
  271. * (esp + 4): round keys
  272. * (esp + 8): iv
  273. * (esp + 12): dst
  274. * (esp + 16): src
  275. * (esp + 20): nblocks
  276. * (esp + 24): nrounds
  277. */
  278. CFI_STARTPROC();
  279. pushl %edi;
  280. CFI_PUSH(%edi);
  281. pushl %esi;
  282. CFI_PUSH(%esi);
  283. movl 8+4(%esp), %edi;
  284. movl 8+8(%esp), %esi;
  285. movl 8+12(%esp), %edx;
  286. movl 8+16(%esp), %ecx;
  287. movl 8+20(%esp), %eax;
  288. /* Process 8 blocks per loop. */
  289. .align 8
  290. .Lcfb_dec_blk8:
  291. cmpl $8, %eax;
  292. jb .Lcfb_dec_blk4;
  293. leal -8(%eax), %eax;
  294. /* Load IV. */
  295. vmovdqu (%esi), %xmm0;
  296. /* Load input and xor first key. Update IV. */
  297. vbroadcasti128 (0 * 16)(%edi), %ymm4;
  298. vmovdqu (0 * 16)(%ecx), %ymm5;
  299. vinserti128 $1, %xmm5, %ymm0, %ymm0;
  300. vmovdqu (1 * 16)(%ecx), %ymm1;
  301. vmovdqu (3 * 16)(%ecx), %ymm2;
  302. vmovdqu (5 * 16)(%ecx), %ymm3;
  303. vmovdqu (7 * 16)(%ecx), %xmm6;
  304. vpxor %ymm4, %ymm0, %ymm0;
  305. vpxor %ymm4, %ymm1, %ymm1;
  306. vpxor %ymm4, %ymm2, %ymm2;
  307. vpxor %ymm4, %ymm3, %ymm3;
  308. vbroadcasti128 (1 * 16)(%edi), %ymm4;
  309. vmovdqu %xmm6, (%esi); /* Store IV. */
  310. vmovdqu (2 * 16)(%ecx), %ymm6;
  311. vmovdqu (4 * 16)(%ecx), %ymm7;
  312. /* AES rounds */
  313. vbroadcasti128 (1 * 16)(%edi), %ymm4;
  314. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  315. vbroadcasti128 (2 * 16)(%edi), %ymm4;
  316. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  317. vbroadcasti128 (3 * 16)(%edi), %ymm4;
  318. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  319. vbroadcasti128 (4 * 16)(%edi), %ymm4;
  320. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  321. vbroadcasti128 (5 * 16)(%edi), %ymm4;
  322. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  323. vbroadcasti128 (6 * 16)(%edi), %ymm4;
  324. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  325. vbroadcasti128 (7 * 16)(%edi), %ymm4;
  326. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  327. vbroadcasti128 (8 * 16)(%edi), %ymm4;
  328. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  329. vbroadcasti128 (9 * 16)(%edi), %ymm4;
  330. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  331. vbroadcasti128 (10 * 16)(%edi), %ymm4;
  332. cmpl $12, 8+24(%esp);
  333. jb .Lcfb_dec_blk8_last;
  334. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  335. vbroadcasti128 (11 * 16)(%edi), %ymm4;
  336. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  337. vbroadcasti128 (12 * 16)(%edi), %ymm4;
  338. jz .Lcfb_dec_blk8_last;
  339. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  340. vbroadcasti128 (13 * 16)(%edi), %ymm4;
  341. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  342. vbroadcasti128 (14 * 16)(%edi), %ymm4;
  343. /* Last round and output handling. */
  344. .Lcfb_dec_blk8_last:
  345. vpxor %ymm4, %ymm5, %ymm5;
  346. vpxor %ymm4, %ymm6, %ymm6;
  347. vpxor %ymm4, %ymm7, %ymm7;
  348. vpxor (6 * 16)(%ecx), %ymm4, %ymm4;
  349. leal (8 * 16)(%ecx), %ecx;
  350. vaesenclast %ymm5, %ymm0, %ymm0;
  351. vaesenclast %ymm6, %ymm1, %ymm1;
  352. vaesenclast %ymm7, %ymm2, %ymm2;
  353. vaesenclast %ymm4, %ymm3, %ymm3;
  354. vmovdqu %ymm0, (0 * 16)(%edx);
  355. vmovdqu %ymm1, (2 * 16)(%edx);
  356. vmovdqu %ymm2, (4 * 16)(%edx);
  357. vmovdqu %ymm3, (6 * 16)(%edx);
  358. leal (8 * 16)(%edx), %edx;
  359. jmp .Lcfb_dec_blk8;
  360. /* Handle trailing four blocks. */
  361. .align 8
  362. .Lcfb_dec_blk4:
  363. cmpl $4, %eax;
  364. jb .Lcfb_dec_blk1;
  365. leal -4(%eax), %eax;
  366. /* Load IV. */
  367. vmovdqu (%esi), %xmm0;
  368. /* Load input and xor first key. Update IV. */
  369. vbroadcasti128 (0 * 16)(%edi), %ymm4;
  370. vmovdqu (0 * 16)(%ecx), %ymm5;
  371. vinserti128 $1, %xmm5, %ymm0, %ymm0;
  372. vmovdqu (1 * 16)(%ecx), %ymm1;
  373. vmovdqu (3 * 16)(%ecx), %xmm6;
  374. vpxor %ymm4, %ymm0, %ymm0;
  375. vpxor %ymm4, %ymm1, %ymm1;
  376. vbroadcasti128 (1 * 16)(%edi), %ymm4;
  377. vmovdqu %xmm6, (%esi); /* Store IV. */
  378. vmovdqu (2 * 16)(%ecx), %ymm6;
  379. leal (4 * 16)(%ecx), %ecx;
  380. /* AES rounds */
  381. vbroadcasti128 (1 * 16)(%edi), %ymm4;
  382. VAESENC2(%ymm4, %ymm0, %ymm1);
  383. vbroadcasti128 (2 * 16)(%edi), %ymm4;
  384. VAESENC2(%ymm4, %ymm0, %ymm1);
  385. vbroadcasti128 (3 * 16)(%edi), %ymm4;
  386. VAESENC2(%ymm4, %ymm0, %ymm1);
  387. vbroadcasti128 (4 * 16)(%edi), %ymm4;
  388. VAESENC2(%ymm4, %ymm0, %ymm1);
  389. vbroadcasti128 (5 * 16)(%edi), %ymm4;
  390. VAESENC2(%ymm4, %ymm0, %ymm1);
  391. vbroadcasti128 (6 * 16)(%edi), %ymm4;
  392. VAESENC2(%ymm4, %ymm0, %ymm1);
  393. vbroadcasti128 (7 * 16)(%edi), %ymm4;
  394. VAESENC2(%ymm4, %ymm0, %ymm1);
  395. vbroadcasti128 (8 * 16)(%edi), %ymm4;
  396. VAESENC2(%ymm4, %ymm0, %ymm1);
  397. vbroadcasti128 (9 * 16)(%edi), %ymm4;
  398. VAESENC2(%ymm4, %ymm0, %ymm1);
  399. vbroadcasti128 (10 * 16)(%edi), %ymm4;
  400. cmpl $12, 8+24(%esp);
  401. jb .Lcfb_dec_blk4_last;
  402. VAESENC2(%ymm4, %ymm0, %ymm1);
  403. vbroadcasti128 (11 * 16)(%edi), %ymm4;
  404. VAESENC2(%ymm4, %ymm0, %ymm1);
  405. vbroadcasti128 (12 * 16)(%edi), %ymm4;
  406. jz .Lcfb_dec_blk4_last;
  407. VAESENC2(%ymm4, %ymm0, %ymm1);
  408. vbroadcasti128 (13 * 16)(%edi), %ymm4;
  409. VAESENC2(%ymm4, %ymm0, %ymm1);
  410. vbroadcasti128 (14 * 16)(%edi), %ymm4;
  411. /* Last round and output handling. */
  412. .Lcfb_dec_blk4_last:
  413. vpxor %ymm4, %ymm5, %ymm5;
  414. vpxor %ymm4, %ymm6, %ymm6;
  415. vaesenclast %ymm5, %ymm0, %ymm0;
  416. vaesenclast %ymm6, %ymm1, %ymm1;
  417. vmovdqu %ymm0, (0 * 16)(%edx);
  418. vmovdqu %ymm1, (2 * 16)(%edx);
  419. leal (4 * 16)(%edx), %edx;
  420. /* Process trailing one to three blocks, one per loop. */
  421. .align 8
  422. .Lcfb_dec_blk1:
  423. cmpl $1, %eax;
  424. jb .Ldone_cfb_dec;
  425. leal -1(%eax), %eax;
  426. /* Load IV. */
  427. vmovdqu (%esi), %xmm0;
  428. /* Xor first key. */
  429. vpxor (0 * 16)(%edi), %xmm0, %xmm0;
  430. /* Load input as next IV. */
  431. vmovdqu (%ecx), %xmm2;
  432. leal 16(%ecx), %ecx;
  433. /* AES rounds. */
  434. vaesenc (1 * 16)(%edi), %xmm0, %xmm0;
  435. vaesenc (2 * 16)(%edi), %xmm0, %xmm0;
  436. vaesenc (3 * 16)(%edi), %xmm0, %xmm0;
  437. vaesenc (4 * 16)(%edi), %xmm0, %xmm0;
  438. vaesenc (5 * 16)(%edi), %xmm0, %xmm0;
  439. vaesenc (6 * 16)(%edi), %xmm0, %xmm0;
  440. vaesenc (7 * 16)(%edi), %xmm0, %xmm0;
  441. vaesenc (8 * 16)(%edi), %xmm0, %xmm0;
  442. vaesenc (9 * 16)(%edi), %xmm0, %xmm0;
  443. vmovdqa (10 * 16)(%edi), %xmm1;
  444. vmovdqu %xmm2, (%esi); /* Store IV. */
  445. cmpl $12, 8+24(%esp);
  446. jb .Lcfb_dec_blk1_last;
  447. vaesenc %xmm1, %xmm0, %xmm0;
  448. vaesenc (11 * 16)(%edi), %xmm0, %xmm0;
  449. vmovdqa (12 * 16)(%edi), %xmm1;
  450. jz .Lcfb_dec_blk1_last;
  451. vaesenc %xmm1, %xmm0, %xmm0;
  452. vaesenc (13 * 16)(%edi), %xmm0, %xmm0;
  453. vmovdqa (14 * 16)(%edi), %xmm1;
  454. /* Last round and output handling. */
  455. .Lcfb_dec_blk1_last:
  456. vpxor %xmm2, %xmm1, %xmm1;
  457. vaesenclast %xmm1, %xmm0, %xmm0;
  458. vmovdqu %xmm0, (%edx);
  459. leal 16(%edx), %edx;
  460. jmp .Lcfb_dec_blk1;
  461. .align 8
  462. .Ldone_cfb_dec:
  463. popl %esi;
  464. CFI_POP(%esi);
  465. popl %edi;
  466. CFI_POP(%edi);
  467. vzeroall;
  468. ret_spec_stop
  469. CFI_ENDPROC();
  470. ELF(.size SYM_NAME(_gcry_vaes_avx2_cfb_dec_i386),
  471. .-SYM_NAME(_gcry_vaes_avx2_cfb_dec_i386))
  472. /**********************************************************************
  473. CTR-mode encryption
  474. **********************************************************************/
  475. ELF(.type SYM_NAME(_gcry_vaes_avx2_ctr_enc_i386),@function)
  476. .globl SYM_NAME(_gcry_vaes_avx2_ctr_enc_i386)
  477. .align 16
  478. SYM_NAME(_gcry_vaes_avx2_ctr_enc_i386):
  479. /* input:
  480. * (esp + 4): round keys
  481. * (esp + 8): iv
  482. * (esp + 12): dst
  483. * (esp + 16): src
  484. * (esp + 20): nblocks
  485. * (esp + 24): nrounds
  486. */
  487. CFI_STARTPROC();
  488. GET_DATA_POINTER(SYM_NAME(_gcry_vaes_consts), eax);
  489. pushl %ebp;
  490. CFI_PUSH(%ebp);
  491. movl %esp, %ebp;
  492. CFI_DEF_CFA_REGISTER(%ebp);
  493. subl $(3 * 32 + 3 * 4), %esp;
  494. andl $-32, %esp;
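/* Align the stack so the 32-byte counter spill slots below can be accessed
 * with vmovdqa. */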
  495. movl %edi, (3 * 32 + 0 * 4)(%esp);
  496. CFI_REG_ON_STACK(edi, 3 * 32 + 0 * 4);
  497. movl %esi, (3 * 32 + 1 * 4)(%esp);
  498. CFI_REG_ON_STACK(esi, 3 * 32 + 1 * 4);
  499. movl %ebx, (3 * 32 + 2 * 4)(%esp);
  500. CFI_REG_ON_STACK(ebx, 3 * 32 + 2 * 4);
  501. movl %eax, %ebx;
  502. movl 4+4(%ebp), %edi;
  503. movl 4+8(%ebp), %esi;
  504. movl 4+12(%ebp), %edx;
  505. movl 4+16(%ebp), %ecx;
  506. #define prepare_ctr_const(minus_one, minus_two) \
  507. vpcmpeqd minus_one, minus_one, minus_one; \
  508. vpsrldq $8, minus_one, minus_one; /* 0:-1 */ \
  509. vpaddq minus_one, minus_one, minus_two; /* 0:-2 */
  510. #define inc_le128(x, minus_one, tmp) \
  511. vpcmpeqq minus_one, x, tmp; \
  512. vpsubq minus_one, x, x; \
  513. vpslldq $8, tmp, tmp; \
  514. vpsubq tmp, x, x;
  515. #define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
  516. vpcmpeqq minus_one, x, tmp1; \
  517. vpcmpeqq minus_two, x, tmp2; \
  518. vpor tmp1, tmp2, tmp2; \
  519. vpsubq minus_two, x, x; \
  520. vpslldq $8, tmp2, tmp2; \
  521. vpsubq tmp2, x, x;
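/* inc_le128/add2_le128 add 1 or 2 to a 128-bit little-endian counter using
 * 64-bit lanes: subtracting minus_one/minus_two adds to the low qword, and the
 * vpcmpeqq mask (all-ones when the low qword is about to wrap), shifted up into
 * the high qword, propagates the carry. */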
  522. #define handle_ctr_128bit_add(nblks) \
  523. movl 12(%esi), %eax; \
  524. bswapl %eax; \
  525. addl $nblks, %eax; \
  526. bswapl %eax; \
  527. movl %eax, 12(%esi); \
  528. jnc 1f; \
  529. \
  530. movl 8(%esi), %eax; \
  531. bswapl %eax; \
  532. adcl $0, %eax; \
  533. bswapl %eax; \
  534. movl %eax, 8(%esi); \
  535. \
  536. movl 4(%esi), %eax; \
  537. bswapl %eax; \
  538. adcl $0, %eax; \
  539. bswapl %eax; \
  540. movl %eax, 4(%esi); \
  541. \
  542. movl 0(%esi), %eax; \
  543. bswapl %eax; \
  544. adcl $0, %eax; \
  545. bswapl %eax; \
  546. movl %eax, 0(%esi); \
  547. .align 8; \
  548. 1:;
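/* handle_ctr_128bit_add adds nblks to the big-endian 128-bit counter at
 * (%esi) one 32-bit word at a time: each word is byte-swapped, added with
 * carry (adcl) and swapped back; the chain is skipped (jnc) when the first
 * add does not carry out. */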
  549. cmpl $12, 4+20(%ebp);
  550. jae .Lctr_enc_blk12_loop;
  551. jmp .Lctr_enc_blk4;
  552. /* Process 12 blocks per loop. */
  553. .align 16
  554. .Lctr_enc_blk12_loop:
  555. subl $12, 4+20(%ebp);
  556. vbroadcasti128 (%esi), %ymm6;
  557. /* detect if carry handling is needed */
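/* The counter is big-endian, so its least-significant byte lands in the top
 * byte of %eax when the last word is loaded little-endian.  Adding 12 << 24
 * bumps that byte by 12; a carry out of the 32-bit add means the byte-wise
 * increments below would overflow into the next byte. */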
  558. movl 12(%esi), %eax;
  559. addl $(12 << 24), %eax;
  560. jc .Lctr_enc_blk12_handle_carry;
  561. movl %eax, 12(%esi);
  562. .Lctr_enc_blk12_byte_bige_add:
  563. /* Increment counters. */
  564. vpaddb CADDR(.Lbige_addb_0, %ebx), %ymm6, %ymm0;
  565. vpaddb CADDR(.Lbige_addb_2, %ebx), %ymm6, %ymm1;
  566. vpaddb CADDR(.Lbige_addb_4, %ebx), %ymm6, %ymm2;
  567. vpaddb CADDR(.Lbige_addb_6, %ebx), %ymm6, %ymm3;
  568. vpaddb CADDR(.Lbige_addb_8, %ebx), %ymm6, %ymm5;
  569. vpaddb CADDR(.Lbige_addb_10, %ebx), %ymm6, %ymm6;
  570. .Lctr_enc_blk12_rounds:
  571. /* AES rounds */
  572. vbroadcasti128 (0 * 16)(%edi), %ymm4;
  573. XOR6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  574. vbroadcasti128 (1 * 16)(%edi), %ymm4;
  575. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  576. vbroadcasti128 (2 * 16)(%edi), %ymm4;
  577. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  578. vbroadcasti128 (3 * 16)(%edi), %ymm4;
  579. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  580. vbroadcasti128 (4 * 16)(%edi), %ymm4;
  581. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  582. vbroadcasti128 (5 * 16)(%edi), %ymm4;
  583. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  584. vbroadcasti128 (6 * 16)(%edi), %ymm4;
  585. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  586. vbroadcasti128 (7 * 16)(%edi), %ymm4;
  587. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  588. vbroadcasti128 (8 * 16)(%edi), %ymm4;
  589. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  590. vbroadcasti128 (9 * 16)(%edi), %ymm4;
  591. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  592. vbroadcasti128 (10 * 16)(%edi), %ymm4;
  593. cmpl $12, 4+24(%ebp);
  594. jb .Lctr_enc_blk12_last;
  595. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  596. vbroadcasti128 (11 * 16)(%edi), %ymm4;
  597. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  598. vbroadcasti128 (12 * 16)(%edi), %ymm4;
  599. jz .Lctr_enc_blk12_last;
  600. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  601. vbroadcasti128 (13 * 16)(%edi), %ymm4;
  602. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  603. vbroadcasti128 (14 * 16)(%edi), %ymm4;
  604. /* Last round and output handling. */
  605. .Lctr_enc_blk12_last:
  606. vpxor (0 * 16)(%ecx), %ymm4, %ymm7; /* Xor src to last round key. */
  607. vaesenclast %ymm7, %ymm0, %ymm0;
  608. vmovdqu %ymm0, (0 * 16)(%edx);
  609. vpxor (2 * 16)(%ecx), %ymm4, %ymm7;
  610. vpxor (4 * 16)(%ecx), %ymm4, %ymm0;
  611. vaesenclast %ymm7, %ymm1, %ymm1;
  612. vaesenclast %ymm0, %ymm2, %ymm2;
  613. vpxor (6 * 16)(%ecx), %ymm4, %ymm7;
  614. vpxor (8 * 16)(%ecx), %ymm4, %ymm0;
  615. vpxor (10 * 16)(%ecx), %ymm4, %ymm4;
  616. leal (12 * 16)(%ecx), %ecx;
  617. vaesenclast %ymm7, %ymm3, %ymm3;
  618. vaesenclast %ymm0, %ymm5, %ymm5;
  619. vaesenclast %ymm4, %ymm6, %ymm6;
  620. vmovdqu %ymm1, (2 * 16)(%edx);
  621. vmovdqu %ymm2, (4 * 16)(%edx);
  622. vmovdqu %ymm3, (6 * 16)(%edx);
  623. vmovdqu %ymm5, (8 * 16)(%edx);
  624. vmovdqu %ymm6, (10 * 16)(%edx);
  625. leal (12 * 16)(%edx), %edx;
  626. cmpl $12, 4+20(%ebp);
  627. jae .Lctr_enc_blk12_loop;
  628. jmp .Lctr_enc_blk4;
  629. .align 8
  630. .Lctr_enc_blk12_handle_only_ctr_carry:
  631. handle_ctr_128bit_add(12);
  632. jmp .Lctr_enc_blk12_byte_bige_add;
  633. .align 8
  634. .Lctr_enc_blk12_handle_carry:
  635. jz .Lctr_enc_blk12_handle_only_ctr_carry;
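/* Flags are still those of the addl above.  ZF means the last counter word
 * became zero, i.e. the low byte wraps exactly at +12; counters +0..+11 then
 * still differ only in the last byte, so the byte-wise .Lbige_addb path can be
 * reused and only the counter stored in memory needs a full 128-bit add.
 * Otherwise the wrap happens mid-batch and the counters are rebuilt with
 * 128-bit arithmetic below. */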
  636. /* Increment counters (handle carry). */
  637. prepare_ctr_const(%ymm4, %ymm7);
  638. vmovdqa CADDR(.Lbswap128_mask, %ebx), %ymm2;
  639. vpshufb %xmm2, %xmm6, %xmm1; /* be => le */
  640. vmovdqa %xmm1, %xmm0;
  641. inc_le128(%xmm1, %xmm4, %xmm5);
  642. vinserti128 $1, %xmm1, %ymm0, %ymm6; /* ctr: +1:+0 */
  643. handle_ctr_128bit_add(12);
  644. vpshufb %ymm2, %ymm6, %ymm0;
  645. vmovdqa %ymm0, (0 * 32)(%esp);
  646. add2_le128(%ymm6, %ymm4, %ymm7, %ymm5, %ymm1); /* ctr: +3:+2 */
  647. vpshufb %ymm2, %ymm6, %ymm0;
  648. vmovdqa %ymm0, (1 * 32)(%esp);
  649. add2_le128(%ymm6, %ymm4, %ymm7, %ymm5, %ymm1); /* ctr: +5:+4 */
  650. vpshufb %ymm2, %ymm6, %ymm0;
  651. vmovdqa %ymm0, (2 * 32)(%esp);
  652. add2_le128(%ymm6, %ymm4, %ymm7, %ymm5, %ymm1); /* ctr: +7:+6 */
  653. vpshufb %ymm2, %ymm6, %ymm3;
  654. add2_le128(%ymm6, %ymm4, %ymm7, %ymm5, %ymm1); /* ctr: +9:+8 */
  655. vpshufb %ymm2, %ymm6, %ymm5;
  656. add2_le128(%ymm6, %ymm4, %ymm7, %ymm2, %ymm1); /* ctr: +11:+10 */
  657. vmovdqa (0 * 32)(%esp), %ymm0;
  658. vmovdqa (1 * 32)(%esp), %ymm1;
  659. vmovdqa (2 * 32)(%esp), %ymm2;
  660. vpshufb CADDR(.Lbswap128_mask, %ebx), %ymm6, %ymm6;
  661. jmp .Lctr_enc_blk12_rounds;
  662. /* Handle trailing four blocks. */
  663. .align 8
  664. .Lctr_enc_blk4:
  665. cmpl $4, 4+20(%ebp);
  666. jb .Lctr_enc_blk1;
  667. subl $4, 4+20(%ebp);
  668. vbroadcasti128 (%esi), %ymm3;
  669. /* detect if carry handling is needed */
  670. movl 12(%esi), %eax;
  671. addl $(4 << 24), %eax;
  672. jc .Lctr_enc_blk4_handle_carry;
  673. movl %eax, 12(%esi);
  674. .Lctr_enc_blk4_byte_bige_add:
  675. /* Increment counters. */
  676. vpaddb CADDR(.Lbige_addb_0, %ebx), %ymm3, %ymm0;
  677. vpaddb CADDR(.Lbige_addb_2, %ebx), %ymm3, %ymm1;
  678. .Lctr_enc_blk4_rounds:
  679. /* AES rounds */
  680. vbroadcasti128 (0 * 16)(%edi), %ymm4;
  681. XOR2(%ymm4, %ymm0, %ymm1);
  682. vbroadcasti128 (1 * 16)(%edi), %ymm4;
  683. VAESENC2(%ymm4, %ymm0, %ymm1);
  684. vbroadcasti128 (2 * 16)(%edi), %ymm4;
  685. VAESENC2(%ymm4, %ymm0, %ymm1);
  686. vbroadcasti128 (3 * 16)(%edi), %ymm4;
  687. VAESENC2(%ymm4, %ymm0, %ymm1);
  688. vbroadcasti128 (4 * 16)(%edi), %ymm4;
  689. VAESENC2(%ymm4, %ymm0, %ymm1);
  690. vbroadcasti128 (5 * 16)(%edi), %ymm4;
  691. VAESENC2(%ymm4, %ymm0, %ymm1);
  692. vbroadcasti128 (6 * 16)(%edi), %ymm4;
  693. VAESENC2(%ymm4, %ymm0, %ymm1);
  694. vbroadcasti128 (7 * 16)(%edi), %ymm4;
  695. VAESENC2(%ymm4, %ymm0, %ymm1);
  696. vbroadcasti128 (8 * 16)(%edi), %ymm4;
  697. VAESENC2(%ymm4, %ymm0, %ymm1);
  698. vbroadcasti128 (9 * 16)(%edi), %ymm4;
  699. VAESENC2(%ymm4, %ymm0, %ymm1);
  700. vbroadcasti128 (10 * 16)(%edi), %ymm4;
  701. cmpl $12, 4+24(%ebp);
  702. jb .Lctr_enc_blk4_last;
  703. VAESENC2(%ymm4, %ymm0, %ymm1);
  704. vbroadcasti128 (11 * 16)(%edi), %ymm4;
  705. VAESENC2(%ymm4, %ymm0, %ymm1);
  706. vbroadcasti128 (12 * 16)(%edi), %ymm4;
  707. jz .Lctr_enc_blk4_last;
  708. VAESENC2(%ymm4, %ymm0, %ymm1);
  709. vbroadcasti128 (13 * 16)(%edi), %ymm4;
  710. VAESENC2(%ymm4, %ymm0, %ymm1);
  711. vbroadcasti128 (14 * 16)(%edi), %ymm4;
  712. /* Last round and output handling. */
  713. .Lctr_enc_blk4_last:
  714. vpxor (0 * 16)(%ecx), %ymm4, %ymm5; /* Xor src to last round key. */
  715. vpxor (2 * 16)(%ecx), %ymm4, %ymm6;
  716. leal (4 * 16)(%ecx), %ecx;
  717. vaesenclast %ymm5, %ymm0, %ymm0;
  718. vaesenclast %ymm6, %ymm1, %ymm1;
  719. vmovdqu %ymm0, (0 * 16)(%edx);
  720. vmovdqu %ymm1, (2 * 16)(%edx);
  721. leal (4 * 16)(%edx), %edx;
  722. jmp .Lctr_enc_blk1;
  723. .align 8
  724. .Lctr_enc_blk4_handle_only_ctr_carry:
  725. handle_ctr_128bit_add(4);
  726. jmp .Lctr_enc_blk4_byte_bige_add;
  727. .align 8
  728. .Lctr_enc_blk4_handle_carry:
  729. jz .Lctr_enc_blk4_handle_only_ctr_carry;
  730. /* Increment counters (handle carry). */
  731. prepare_ctr_const(%ymm4, %ymm7);
  732. vpshufb CADDR(.Lbswap128_mask, %ebx), %xmm3, %xmm1; /* be => le */
  733. vmovdqa %xmm1, %xmm0;
  734. inc_le128(%xmm1, %xmm4, %xmm5);
  735. vinserti128 $1, %xmm1, %ymm0, %ymm3; /* ctr: +1:+0 */
  736. vpshufb CADDR(.Lbswap128_mask, %ebx), %ymm3, %ymm0;
  737. handle_ctr_128bit_add(4);
  738. add2_le128(%ymm3, %ymm4, %ymm7, %ymm5, %ymm6); /* ctr: +3:+2 */
  739. vpshufb CADDR(.Lbswap128_mask, %ebx), %ymm3, %ymm1;
  740. jmp .Lctr_enc_blk4_rounds;
  741. /* Process trailing one to three blocks, one per loop. */
  742. .align 8
  743. .Lctr_enc_blk1:
  744. cmpl $1, 4+20(%ebp);
  745. jb .Ldone_ctr_enc;
  746. subl $1, 4+20(%ebp);
747. /* Load and increment counter. */
  748. vmovdqu (%esi), %xmm0;
  749. handle_ctr_128bit_add(1);
  750. /* AES rounds. */
  751. vpxor (0 * 16)(%edi), %xmm0, %xmm0;
  752. vaesenc (1 * 16)(%edi), %xmm0, %xmm0;
  753. vaesenc (2 * 16)(%edi), %xmm0, %xmm0;
  754. vaesenc (3 * 16)(%edi), %xmm0, %xmm0;
  755. vaesenc (4 * 16)(%edi), %xmm0, %xmm0;
  756. vaesenc (5 * 16)(%edi), %xmm0, %xmm0;
  757. vaesenc (6 * 16)(%edi), %xmm0, %xmm0;
  758. vaesenc (7 * 16)(%edi), %xmm0, %xmm0;
  759. vaesenc (8 * 16)(%edi), %xmm0, %xmm0;
  760. vaesenc (9 * 16)(%edi), %xmm0, %xmm0;
  761. vmovdqa (10 * 16)(%edi), %xmm1;
  762. cmpl $12, 4+24(%ebp);
  763. jb .Lctr_enc_blk1_last;
  764. vaesenc %xmm1, %xmm0, %xmm0;
  765. vaesenc (11 * 16)(%edi), %xmm0, %xmm0;
  766. vmovdqa (12 * 16)(%edi), %xmm1;
  767. jz .Lctr_enc_blk1_last;
  768. vaesenc %xmm1, %xmm0, %xmm0;
  769. vaesenc (13 * 16)(%edi), %xmm0, %xmm0;
  770. vmovdqa (14 * 16)(%edi), %xmm1;
  771. /* Last round and output handling. */
  772. .Lctr_enc_blk1_last:
  773. vpxor (%ecx), %xmm1, %xmm1; /* Xor src to last round key. */
  774. leal 16(%ecx), %ecx;
  775. vaesenclast %xmm1, %xmm0, %xmm0; /* Last round and xor with xmm1. */
  776. vmovdqu %xmm0, (%edx);
  777. leal 16(%edx), %edx;
  778. jmp .Lctr_enc_blk1;
  779. .align 8
  780. .Ldone_ctr_enc:
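/* Wipe the counter blocks spilled to the stack and clear all vector registers
 * before returning. */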
  781. vpxor %ymm0, %ymm0, %ymm0;
  782. movl (3 * 32 + 0 * 4)(%esp), %edi;
  783. CFI_RESTORE(edi);
  784. movl (3 * 32 + 1 * 4)(%esp), %esi;
  785. CFI_RESTORE(esi);
  786. movl (3 * 32 + 2 * 4)(%esp), %ebx;
  787. CFI_RESTORE(ebx);
  788. vmovdqa %ymm0, (0 * 32)(%esp);
  789. vmovdqa %ymm0, (1 * 32)(%esp);
  790. vmovdqa %ymm0, (2 * 32)(%esp);
  791. leave;
  792. CFI_LEAVE();
  793. vzeroall;
  794. ret_spec_stop
  795. CFI_ENDPROC();
  796. ELF(.size SYM_NAME(_gcry_vaes_avx2_ctr_enc_i386),
  797. .-SYM_NAME(_gcry_vaes_avx2_ctr_enc_i386))
  798. /**********************************************************************
  799. Little-endian 32-bit CTR-mode encryption (GCM-SIV)
  800. **********************************************************************/
  801. ELF(.type SYM_NAME(_gcry_vaes_avx2_ctr32le_enc_i386),@function)
  802. .globl SYM_NAME(_gcry_vaes_avx2_ctr32le_enc_i386)
  803. .align 16
  804. SYM_NAME(_gcry_vaes_avx2_ctr32le_enc_i386):
  805. /* input:
  806. * (esp + 4): round keys
  807. * (esp + 8): counter
  808. * (esp + 12): dst
  809. * (esp + 16): src
  810. * (esp + 20): nblocks
  811. * (esp + 24): nrounds
  812. */
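/* GCM-SIV counter mode: only the first 32-bit word of the counter block is
 * incremented, as a little-endian integer and without carry into the other
 * words (see the vpaddd increments below). */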
  813. CFI_STARTPROC();
  814. GET_DATA_POINTER(SYM_NAME(_gcry_vaes_consts), eax);
  815. pushl %ebp;
  816. CFI_PUSH(%ebp);
  817. movl %esp, %ebp;
  818. CFI_DEF_CFA_REGISTER(%ebp);
  819. subl $(3 * 4), %esp;
  820. movl %edi, (0 * 4)(%esp);
  821. CFI_REG_ON_STACK(edi, 0 * 4);
  822. movl %esi, (1 * 4)(%esp);
  823. CFI_REG_ON_STACK(esi, 1 * 4);
  824. movl %ebx, (2 * 4)(%esp);
  825. CFI_REG_ON_STACK(ebx, 2 * 4);
  826. movl %eax, %ebx;
  827. movl 4+4(%ebp), %edi;
  828. movl 4+8(%ebp), %esi;
  829. movl 4+12(%ebp), %edx;
  830. movl 4+16(%ebp), %ecx;
  831. movl 4+20(%ebp), %eax;
  832. vbroadcasti128 (%esi), %ymm7; /* Load CTR. */
  833. /* Process 12 blocks per loop. */
  834. .align 8
  835. .Lctr32le_enc_blk12:
  836. cmpl $12, %eax;
  837. jb .Lctr32le_enc_blk4;
  838. leal -12(%eax), %eax;
  839. vbroadcasti128 (0 * 16)(%edi), %ymm4;
  840. /* Increment counters. */
  841. vpaddd CADDR(.Lle_addd_0, %ebx), %ymm7, %ymm0;
  842. vpaddd CADDR(.Lle_addd_2, %ebx), %ymm7, %ymm1;
  843. vpaddd CADDR(.Lle_addd_4, %ebx), %ymm7, %ymm2;
  844. vpaddd CADDR(.Lle_addd_6, %ebx), %ymm7, %ymm3;
  845. vpaddd CADDR(.Lle_addd_8, %ebx), %ymm7, %ymm5;
  846. vpaddd CADDR(.Lle_addd_10, %ebx), %ymm7, %ymm6;
  847. vpaddd CADDR(.Lle_addd_12_2, %ebx), %ymm7, %ymm7;
  848. vmovdqu %xmm7, (%esi); /* Store CTR. */
  849. /* AES rounds */
  850. XOR6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  851. vbroadcasti128 (1 * 16)(%edi), %ymm4;
  852. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  853. vbroadcasti128 (2 * 16)(%edi), %ymm4;
  854. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  855. vbroadcasti128 (3 * 16)(%edi), %ymm4;
  856. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  857. vbroadcasti128 (4 * 16)(%edi), %ymm4;
  858. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  859. vbroadcasti128 (5 * 16)(%edi), %ymm4;
  860. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  861. vbroadcasti128 (6 * 16)(%edi), %ymm4;
  862. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  863. vbroadcasti128 (7 * 16)(%edi), %ymm4;
  864. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  865. vbroadcasti128 (8 * 16)(%edi), %ymm4;
  866. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  867. vbroadcasti128 (9 * 16)(%edi), %ymm4;
  868. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  869. vbroadcasti128 (10 * 16)(%edi), %ymm4;
  870. cmpl $12, 4+24(%ebp);
  871. jb .Lctr32le_enc_blk8_last;
  872. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  873. vbroadcasti128 (11 * 16)(%edi), %ymm4;
  874. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  875. vbroadcasti128 (12 * 16)(%edi), %ymm4;
  876. jz .Lctr32le_enc_blk8_last;
  877. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  878. vbroadcasti128 (13 * 16)(%edi), %ymm4;
  879. VAESENC6(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3, %ymm5, %ymm6);
  880. vbroadcasti128 (14 * 16)(%edi), %ymm4;
  881. /* Last round and output handling. */
  882. .Lctr32le_enc_blk8_last:
  883. vpxor (0 * 16)(%ecx), %ymm4, %ymm7; /* Xor src to last round key. */
  884. vaesenclast %ymm7, %ymm0, %ymm0;
  885. vpxor (2 * 16)(%ecx), %ymm4, %ymm7;
  886. vaesenclast %ymm7, %ymm1, %ymm1;
  887. vpxor (4 * 16)(%ecx), %ymm4, %ymm7;
  888. vaesenclast %ymm7, %ymm2, %ymm2;
  889. vpxor (6 * 16)(%ecx), %ymm4, %ymm7;
  890. vaesenclast %ymm7, %ymm3, %ymm3;
  891. vpxor (8 * 16)(%ecx), %ymm4, %ymm7;
  892. vpxor (10 * 16)(%ecx), %ymm4, %ymm4;
  893. vaesenclast %ymm7, %ymm5, %ymm5;
  894. vbroadcasti128 (%esi), %ymm7; /* Reload CTR. */
  895. vaesenclast %ymm4, %ymm6, %ymm6;
  896. leal (12 * 16)(%ecx), %ecx;
  897. vmovdqu %ymm0, (0 * 16)(%edx);
  898. vmovdqu %ymm1, (2 * 16)(%edx);
  899. vmovdqu %ymm2, (4 * 16)(%edx);
  900. vmovdqu %ymm3, (6 * 16)(%edx);
  901. vmovdqu %ymm5, (8 * 16)(%edx);
  902. vmovdqu %ymm6, (10 * 16)(%edx);
  903. leal (12 * 16)(%edx), %edx;
  904. jmp .Lctr32le_enc_blk12;
  905. /* Handle trailing four blocks. */
  906. .align 8
  907. .Lctr32le_enc_blk4:
  908. cmpl $4, %eax;
  909. jb .Lctr32le_enc_blk1;
  910. leal -4(%eax), %eax;
  911. vbroadcasti128 (0 * 16)(%edi), %ymm4;
  912. /* Increment counters. */
  913. vpaddd CADDR(.Lle_addd_0, %ebx), %ymm7, %ymm0;
  914. vpaddd CADDR(.Lle_addd_2, %ebx), %ymm7, %ymm1;
  915. vpaddd CADDR(.Lle_addd_4_2, %ebx), %ymm7, %ymm7;
  916. /* AES rounds */
  917. XOR2(%ymm4, %ymm0, %ymm1);
  918. vbroadcasti128 (1 * 16)(%edi), %ymm4;
  919. VAESENC2(%ymm4, %ymm0, %ymm1);
  920. vbroadcasti128 (2 * 16)(%edi), %ymm4;
  921. VAESENC2(%ymm4, %ymm0, %ymm1);
  922. vbroadcasti128 (3 * 16)(%edi), %ymm4;
  923. VAESENC2(%ymm4, %ymm0, %ymm1);
  924. vbroadcasti128 (4 * 16)(%edi), %ymm4;
  925. VAESENC2(%ymm4, %ymm0, %ymm1);
  926. vbroadcasti128 (5 * 16)(%edi), %ymm4;
  927. VAESENC2(%ymm4, %ymm0, %ymm1);
  928. vbroadcasti128 (6 * 16)(%edi), %ymm4;
  929. VAESENC2(%ymm4, %ymm0, %ymm1);
  930. vbroadcasti128 (7 * 16)(%edi), %ymm4;
  931. VAESENC2(%ymm4, %ymm0, %ymm1);
  932. vbroadcasti128 (8 * 16)(%edi), %ymm4;
  933. VAESENC2(%ymm4, %ymm0, %ymm1);
  934. vbroadcasti128 (9 * 16)(%edi), %ymm4;
  935. VAESENC2(%ymm4, %ymm0, %ymm1);
  936. vbroadcasti128 (10 * 16)(%edi), %ymm4;
  937. cmpl $12, 4+24(%ebp);
  938. jb .Lctr32le_enc_blk4_last;
  939. VAESENC2(%ymm4, %ymm0, %ymm1);
  940. vbroadcasti128 (11 * 16)(%edi), %ymm4;
  941. VAESENC2(%ymm4, %ymm0, %ymm1);
  942. vbroadcasti128 (12 * 16)(%edi), %ymm4;
  943. jz .Lctr32le_enc_blk4_last;
  944. VAESENC2(%ymm4, %ymm0, %ymm1);
  945. vbroadcasti128 (13 * 16)(%edi), %ymm4;
  946. VAESENC2(%ymm4, %ymm0, %ymm1);
  947. vbroadcasti128 (14 * 16)(%edi), %ymm4;
  948. /* Last round and output handling. */
  949. .Lctr32le_enc_blk4_last:
  950. vpxor (0 * 16)(%ecx), %ymm4, %ymm5; /* Xor src to last round key. */
  951. vpxor (2 * 16)(%ecx), %ymm4, %ymm6;
  952. leal (4 * 16)(%ecx), %ecx;
  953. vaesenclast %ymm5, %ymm0, %ymm0;
  954. vaesenclast %ymm6, %ymm1, %ymm1;
  955. vmovdqu %ymm0, (0 * 16)(%edx);
  956. vmovdqu %ymm1, (2 * 16)(%edx);
  957. leal (4 * 16)(%edx), %edx;
  958. /* Process trailing one to three blocks, one per loop. */
  959. .align 8
  960. .Lctr32le_enc_blk1:
  961. cmpl $1, %eax;
  962. jb .Ldone_ctr32le_enc;
  963. leal -1(%eax), %eax;
964. /* Load and increment counter. */
  965. vmovdqu %xmm7, %xmm0;
  966. vpaddd CADDR(.Lle_addd_1, %ebx), %xmm7, %xmm7;
  967. /* AES rounds. */
  968. vpxor (0 * 16)(%edi), %xmm0, %xmm0;
  969. vaesenc (1 * 16)(%edi), %xmm0, %xmm0;
  970. vaesenc (2 * 16)(%edi), %xmm0, %xmm0;
  971. vaesenc (3 * 16)(%edi), %xmm0, %xmm0;
  972. vaesenc (4 * 16)(%edi), %xmm0, %xmm0;
  973. vaesenc (5 * 16)(%edi), %xmm0, %xmm0;
  974. vaesenc (6 * 16)(%edi), %xmm0, %xmm0;
  975. vaesenc (7 * 16)(%edi), %xmm0, %xmm0;
  976. vaesenc (8 * 16)(%edi), %xmm0, %xmm0;
  977. vaesenc (9 * 16)(%edi), %xmm0, %xmm0;
  978. vmovdqa (10 * 16)(%edi), %xmm1;
  979. cmpl $12, 4+24(%ebp);
  980. jb .Lctr32le_enc_blk1_last;
  981. vaesenc %xmm1, %xmm0, %xmm0;
  982. vaesenc (11 * 16)(%edi), %xmm0, %xmm0;
  983. vmovdqa (12 * 16)(%edi), %xmm1;
  984. jz .Lctr32le_enc_blk1_last;
  985. vaesenc %xmm1, %xmm0, %xmm0;
  986. vaesenc (13 * 16)(%edi), %xmm0, %xmm0;
  987. vmovdqa (14 * 16)(%edi), %xmm1;
  988. /* Last round and output handling. */
  989. .Lctr32le_enc_blk1_last:
  990. vpxor (%ecx), %xmm1, %xmm1; /* Xor src to last round key. */
  991. leal 16(%ecx), %ecx;
  992. vaesenclast %xmm1, %xmm0, %xmm0; /* Last round and xor with xmm1. */
  993. vmovdqu %xmm0, (%edx);
  994. leal 16(%edx), %edx;
  995. jmp .Lctr32le_enc_blk1;
  996. .align 8
  997. .Ldone_ctr32le_enc:
  998. vmovdqu %xmm7, (%esi); /* Store CTR. */
  999. movl (0 * 4)(%esp), %edi;
  1000. CFI_RESTORE(edi);
  1001. movl (1 * 4)(%esp), %esi;
  1002. CFI_RESTORE(esi);
  1003. movl (2 * 4)(%esp), %ebx;
  1004. CFI_RESTORE(ebx);
  1005. leave;
  1006. CFI_LEAVE();
  1007. vzeroall;
  1008. ret_spec_stop
  1009. CFI_ENDPROC();
  1010. ELF(.size SYM_NAME(_gcry_vaes_avx2_ctr32le_enc_i386),
  1011. .-SYM_NAME(_gcry_vaes_avx2_ctr32le_enc_i386))
  1012. /**********************************************************************
  1013. OCB-mode encryption/decryption/authentication
  1014. **********************************************************************/
  1015. ELF(.type SYM_NAME(_gcry_vaes_avx2_ocb_crypt_i386),@function)
  1016. .globl SYM_NAME(_gcry_vaes_avx2_ocb_crypt_i386)
  1017. .align 16
  1018. SYM_NAME(_gcry_vaes_avx2_ocb_crypt_i386):
  1019. /* input:
  1020. * (esp + 4): round keys
  1021. * (esp + 8): dst
  1022. * (esp + 12): src
  1023. * (esp + 16): nblocks
  1024. * (esp + 20): nrounds
  1025. * (esp + 24): offset
  1026. * (esp + 28): checksum
  1027. * (esp + 32): blkn
  1028. * (esp + 36): L table
1029. * (esp + 40): encrypt/decrypt/auth mode
  1030. */
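/* The mode word selects the operation: 0 = decrypt, 1 = encrypt, and any
 * larger value = authentication only (see the "cmpl $1" dispatch below). */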
  1031. CFI_STARTPROC();
  1032. pushl %ebp;
  1033. CFI_PUSH(%ebp);
  1034. movl %esp, %ebp;
  1035. CFI_DEF_CFA_REGISTER(%ebp);
  1036. #define STACK_VEC_POS 0
  1037. #define STACK_TMP_Y0 (STACK_VEC_POS + 0 * 32)
  1038. #define STACK_TMP_Y1 (STACK_VEC_POS + 1 * 32)
  1039. #define STACK_TMP_Y2 (STACK_VEC_POS + 2 * 32)
  1040. #define STACK_TMP_Y3 (STACK_VEC_POS + 3 * 32)
  1041. #define STACK_TMP_Y4 (STACK_VEC_POS + 4 * 32)
  1042. #define STACK_TMP_Y5 (STACK_VEC_POS + 5 * 32)
  1043. #define STACK_FXL_KEY (STACK_VEC_POS + 6 * 32)
  1044. #define STACK_OFFSET_AND_F_KEY (STACK_VEC_POS + 7 * 32)
  1045. #define STACK_CHECKSUM (STACK_VEC_POS + 8 * 32)
  1046. #define STACK_GPR_POS (9 * 32)
  1047. #define STACK_END_POS (STACK_GPR_POS + 3 * 4)
  1048. subl $STACK_END_POS, %esp;
  1049. andl $-32, %esp;
  1050. movl %edi, (STACK_GPR_POS + 0 * 4)(%esp);
  1051. CFI_REG_ON_STACK(edi, STACK_GPR_POS + 0 * 4);
  1052. movl %esi, (STACK_GPR_POS + 1 * 4)(%esp);
  1053. CFI_REG_ON_STACK(esi, STACK_GPR_POS + 1 * 4);
  1054. movl %ebx, (STACK_GPR_POS + 2 * 4)(%esp);
  1055. CFI_REG_ON_STACK(ebx, STACK_GPR_POS + 2 * 4);
  1056. movl 4+4(%ebp), %edi;
  1057. movl 4+8(%ebp), %esi;
  1058. movl 4+12(%ebp), %edx;
  1059. movl 4+32(%ebp), %ebx;
  1060. movl 4+24(%ebp), %eax;
  1061. movl 4+20(%ebp), %ecx;
  1062. leal (, %ecx, 4), %ecx;
  1063. vmovdqu (%eax), %xmm1; /* offset */
  1064. vmovdqa (%edi), %xmm0; /* first key */
  1065. vpxor %xmm0, %xmm1, %xmm1; /* offset ^ first key */
  1066. vpxor (%edi, %ecx, 4), %xmm0, %xmm0; /* first key ^ last key */
  1067. vinserti128 $1, %xmm0, %ymm0, %ymm0;
  1068. vpxor %ymm2, %ymm2, %ymm2;
  1069. vmovdqa %xmm1, (STACK_OFFSET_AND_F_KEY)(%esp);
  1070. vmovdqa %ymm2, (STACK_CHECKSUM)(%esp);
  1071. vmovdqa %ymm0, (STACK_FXL_KEY)(%esp);
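/* The offset is kept pre-XORed with the first round key, and STACK_FXL_KEY
 * holds (first key ^ last key) in both 128-bit lanes: XORing a block with the
 * stored offset applies both the OCB whitening and round key 0, and XORing the
 * stored offset with STACK_FXL_KEY yields (Offset_i ^ last key) for
 * vaesenclast, which cancels the first key again. */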
  1072. cmpl $12, 4+16(%ebp);
  1073. jae .Locb_crypt_blk12_loop;
  1074. jmp .Locb_crypt_blk4;
  1075. /* Process 12 blocks per loop. */
  1076. .align 16
  1077. .Locb_crypt_blk12_loop:
  1078. subl $12, 4+16(%ebp);
  1079. movl 4+36(%ebp), %ecx;
  1080. vmovdqa (%ecx), %xmm7; /* Preload L[0] */
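/* Offsets are computed two blocks at a time: each ymm register below packs the
 * offsets of a pair of consecutive blocks (first of the pair in the low lane,
 * second in the high lane).  Every odd block index has ntz = 0, so its offset
 * is simply the previous offset XOR L[0], kept preloaded in %xmm7. */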
  1081. testl $1, %ebx;
  1082. jz .Locb_crypt_blk12_nblk_even;
  1083. /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
  1084. leal 1(%ebx), %eax;
  1085. tzcntl %eax, %eax; // ntz(blkn+1)
  1086. shll $4, %eax;
  1087. vmovdqa (STACK_OFFSET_AND_F_KEY)(%esp), %xmm1;
  1088. vpxor (%ecx, %eax), %xmm1, %xmm1;
  1089. vpxor %xmm7, %xmm1, %xmm0;
  1090. vinserti128 $1, %xmm0, %ymm1, %ymm1;
  1091. vmovdqa %ymm1, (STACK_TMP_Y0)(%esp);
  1092. leal 3(%ebx), %eax;
  1093. tzcntl %eax, %eax; // ntz(blkn+3)
  1094. shll $4, %eax;
  1095. vpxor (%ecx, %eax), %xmm0, %xmm1;
  1096. vpxor %xmm7, %xmm1, %xmm0;
  1097. vinserti128 $1, %xmm0, %ymm1, %ymm2;
  1098. leal 5(%ebx), %eax;
  1099. tzcntl %eax, %eax; // ntz(blkn+5)
  1100. shll $4, %eax;
  1101. vpxor (%ecx, %eax), %xmm0, %xmm1;
  1102. vpxor %xmm7, %xmm1, %xmm0;
  1103. vinserti128 $1, %xmm0, %ymm1, %ymm3;
  1104. leal 7(%ebx), %eax;
  1105. tzcntl %eax, %eax; // ntz(blkn+7)
  1106. shll $4, %eax;
  1107. vpxor (%ecx, %eax), %xmm0, %xmm1;
  1108. vpxor %xmm7, %xmm1, %xmm0;
  1109. vinserti128 $1, %xmm0, %ymm1, %ymm4;
  1110. leal 9(%ebx), %eax;
  1111. tzcntl %eax, %eax; // ntz(blkn+9)
  1112. shll $4, %eax;
  1113. vpxor (%ecx, %eax), %xmm0, %xmm1;
  1114. vpxor %xmm7, %xmm1, %xmm0;
  1115. vinserti128 $1, %xmm0, %ymm1, %ymm5;
  1116. leal 11(%ebx), %eax;
  1117. tzcntl %eax, %eax; // ntz(blkn+11)
  1118. shll $4, %eax;
  1119. vpxor (%ecx, %eax), %xmm0, %xmm1;
  1120. leal 12(%ebx), %ebx;
  1121. vpxor %xmm7, %xmm1, %xmm0;
  1122. vinserti128 $1, %xmm0, %ymm1, %ymm6;
  1123. cmpl $1, 4+40(%ebp);
  1124. jb .Locb_dec_blk12;
  1125. ja .Locb_auth_blk12;
  1126. jmp .Locb_enc_blk12;
  1127. .align 8
  1128. .Locb_crypt_blk12_nblk_even:
  1129. /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
  1130. vpxor (STACK_OFFSET_AND_F_KEY)(%esp), %xmm7, %xmm1;
  1131. leal 2(%ebx), %eax;
  1132. tzcntl %eax, %eax; // ntz(blkn+2)
  1133. shll $4, %eax;
  1134. vpxor (%ecx, %eax), %xmm1, %xmm0;
  1135. vinserti128 $1, %xmm0, %ymm1, %ymm1;
  1136. vmovdqa %ymm1, (STACK_TMP_Y0)(%esp);
  1137. vpxor %xmm7, %xmm0, %xmm1;
  1138. leal 4(%ebx), %eax;
  1139. tzcntl %eax, %eax; // ntz(blkn+4)
  1140. shll $4, %eax;
  1141. vpxor (%ecx, %eax), %xmm1, %xmm0;
  1142. vinserti128 $1, %xmm0, %ymm1, %ymm2;
  1143. vpxor %xmm7, %xmm0, %xmm1;
  1144. leal 6(%ebx), %eax;
  1145. tzcntl %eax, %eax; // ntz(blkn+6)
  1146. shll $4, %eax;
  1147. vpxor (%ecx, %eax), %xmm1, %xmm0;
  1148. vinserti128 $1, %xmm0, %ymm1, %ymm3;
  1149. vpxor %xmm7, %xmm0, %xmm1;
  1150. leal 8(%ebx), %eax;
  1151. tzcntl %eax, %eax; // ntz(blkn+8)
  1152. shll $4, %eax;
  1153. vpxor (%ecx, %eax), %xmm1, %xmm0;
  1154. vinserti128 $1, %xmm0, %ymm1, %ymm4;
  1155. vpxor %xmm7, %xmm0, %xmm1;
  1156. leal 10(%ebx), %eax;
  1157. tzcntl %eax, %eax; // ntz(blkn+10)
  1158. shll $4, %eax;
  1159. vpxor (%ecx, %eax), %xmm1, %xmm0;
  1160. vinserti128 $1, %xmm0, %ymm1, %ymm5;
  1161. vpxor %xmm7, %xmm0, %xmm1;
  1162. leal 12(%ebx), %ebx;
  1163. tzcntl %ebx, %eax; // ntz(blkn+12)
  1164. shll $4, %eax;
  1165. vpxor (%ecx, %eax), %xmm1, %xmm0;
  1166. vinserti128 $1, %xmm0, %ymm1, %ymm6;
  1167. cmpl $1, 4+40(%ebp);
  1168. jb .Locb_dec_blk12;
  1169. ja .Locb_auth_blk12;
  1170. .align 8
  1171. .Locb_enc_blk12:
  1172. vmovdqa %ymm2, (STACK_TMP_Y1)(%esp);
  1173. vmovdqa %ymm3, (STACK_TMP_Y2)(%esp);
  1174. vmovdqa %ymm4, (STACK_TMP_Y3)(%esp);
  1175. vmovdqa %ymm5, (STACK_TMP_Y4)(%esp);
  1176. vmovdqa %ymm6, (STACK_TMP_Y5)(%esp);
  1177. vmovdqa %xmm0, (STACK_OFFSET_AND_F_KEY)(%esp);
  1178. vmovdqu 0*16(%edx), %ymm1;
  1179. vmovdqu 2*16(%edx), %ymm2;
  1180. vmovdqu 4*16(%edx), %ymm3;
  1181. vmovdqu 6*16(%edx), %ymm4;
  1182. vmovdqu 8*16(%edx), %ymm5;
  1183. vmovdqu 10*16(%edx), %ymm6;
  1184. leal 12*16(%edx), %edx;
  1185. /* Checksum_i = Checksum_{i-1} xor P_i */
  1186. vpxor %ymm1, %ymm2, %ymm0;
  1187. vpxor %ymm3, %ymm4, %ymm7;
  1188. vpxor %ymm5, %ymm0, %ymm0;
  1189. vpxor %ymm6, %ymm7, %ymm7;
  1190. vpxor %ymm0, %ymm7, %ymm7;
  1191. vbroadcasti128 (1 * 16)(%edi), %ymm0;
  1192. vpxor (STACK_CHECKSUM)(%esp), %ymm7, %ymm7;
  1193. /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
  1194. vpxor (STACK_TMP_Y0)(%esp), %ymm1, %ymm1;
  1195. vpxor (STACK_TMP_Y1)(%esp), %ymm2, %ymm2;
  1196. vpxor (STACK_TMP_Y2)(%esp), %ymm3, %ymm3;
  1197. vpxor (STACK_TMP_Y3)(%esp), %ymm4, %ymm4;
  1198. vpxor (STACK_TMP_Y4)(%esp), %ymm5, %ymm5;
  1199. vpxor (STACK_TMP_Y5)(%esp), %ymm6, %ymm6;
  1200. vmovdqa %ymm7, (STACK_CHECKSUM)(%esp);
  1201. /* AES rounds */
  1202. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1203. vbroadcasti128 (2 * 16)(%edi), %ymm0;
  1204. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1205. vbroadcasti128 (3 * 16)(%edi), %ymm0;
  1206. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1207. vbroadcasti128 (4 * 16)(%edi), %ymm0;
  1208. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1209. vbroadcasti128 (5 * 16)(%edi), %ymm0;
  1210. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1211. vbroadcasti128 (6 * 16)(%edi), %ymm0;
  1212. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1213. vbroadcasti128 (7 * 16)(%edi), %ymm0;
  1214. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1215. vbroadcasti128 (8 * 16)(%edi), %ymm0;
  1216. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1217. vbroadcasti128 (9 * 16)(%edi), %ymm0;
  1218. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
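	/* Key-length dispatch: the nrounds argument at 4+20(%ebp) is 10,
	 * 12 or 14, so "below 12" (AES-128) skips ahead to the last round,
	 * "equal" (AES-192) stops after round 11, and anything larger
	 * (AES-256) also runs rounds 12 and 13. */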
  1219. cmpl $12, 4+20(%ebp);
  1220. jb .Locb_enc_blk12_last;
  1221. vbroadcasti128 (10 * 16)(%edi), %ymm0;
  1222. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1223. vbroadcasti128 (11 * 16)(%edi), %ymm0;
  1224. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1225. jz .Locb_enc_blk12_last;
  1226. vbroadcasti128 (12 * 16)(%edi), %ymm0;
  1227. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1228. vbroadcasti128 (13 * 16)(%edi), %ymm0;
  1229. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1230. /* Last round and output handling. */
  1231. .Locb_enc_blk12_last:
  1232. vmovdqa (STACK_FXL_KEY)(%esp), %ymm0;
  1233. vpxor (STACK_TMP_Y0)(%esp), %ymm0, %ymm7;
  1234. vaesenclast %ymm7, %ymm1, %ymm1;
  1235. vpxor (STACK_TMP_Y1)(%esp), %ymm0, %ymm7;
  1236. vmovdqu %ymm1, 0*16(%esi);
  1237. vpxor (STACK_TMP_Y2)(%esp), %ymm0, %ymm1;
  1238. vaesenclast %ymm7, %ymm2, %ymm2;
  1239. vpxor (STACK_TMP_Y3)(%esp), %ymm0, %ymm7;
  1240. vaesenclast %ymm1, %ymm3, %ymm3;
  1241. vpxor (STACK_TMP_Y4)(%esp), %ymm0, %ymm1;
  1242. vaesenclast %ymm7, %ymm4, %ymm4;
  1243. vpxor (STACK_TMP_Y5)(%esp), %ymm0, %ymm7;
  1244. vaesenclast %ymm1, %ymm5, %ymm5;
  1245. vaesenclast %ymm7, %ymm6, %ymm6;
  1246. vmovdqu %ymm2, 2*16(%esi);
  1247. vmovdqu %ymm3, 4*16(%esi);
  1248. vmovdqu %ymm4, 6*16(%esi);
  1249. vmovdqu %ymm5, 8*16(%esi);
  1250. vmovdqu %ymm6, 10*16(%esi);
  1251. leal 12*16(%esi), %esi;
  1252. cmpl $12, 4+16(%ebp);
  1253. jae .Locb_crypt_blk12_loop;
  1254. jmp .Locb_crypt_blk12_cleanup;
  1255. .align 8
  1256. .Locb_auth_blk12:
  1257. vmovdqa %xmm0, (STACK_OFFSET_AND_F_KEY)(%esp);
  1258. vbroadcasti128 (1 * 16)(%edi), %ymm0;
  1259. /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
  1260. vmovdqa (STACK_TMP_Y0)(%esp), %ymm1;
  1261. vpxor 0*16(%edx), %ymm1, %ymm1;
  1262. vpxor 2*16(%edx), %ymm2, %ymm2;
  1263. vpxor 4*16(%edx), %ymm3, %ymm3;
  1264. vpxor 6*16(%edx), %ymm4, %ymm4;
  1265. vpxor 8*16(%edx), %ymm5, %ymm5;
  1266. vpxor 10*16(%edx), %ymm6, %ymm6;
  1267. leal 12*16(%edx), %edx;
  1268. /* AES rounds */
  1269. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1270. vbroadcasti128 (2 * 16)(%edi), %ymm0;
  1271. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1272. vbroadcasti128 (3 * 16)(%edi), %ymm0;
  1273. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1274. vbroadcasti128 (4 * 16)(%edi), %ymm0;
  1275. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1276. vbroadcasti128 (5 * 16)(%edi), %ymm0;
  1277. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1278. vbroadcasti128 (6 * 16)(%edi), %ymm0;
  1279. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1280. vbroadcasti128 (7 * 16)(%edi), %ymm0;
  1281. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1282. vbroadcasti128 (8 * 16)(%edi), %ymm0;
  1283. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1284. vbroadcasti128 (9 * 16)(%edi), %ymm0;
  1285. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1286. vbroadcasti128 (10 * 16)(%edi), %ymm0;
  1287. cmpl $12, 4+20(%ebp);
  1288. jb .Locb_auth_blk12_last;
  1289. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1290. vbroadcasti128 (11 * 16)(%edi), %ymm0;
  1291. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1292. vbroadcasti128 (12 * 16)(%edi), %ymm0;
  1293. jz .Locb_auth_blk12_last;
  1294. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1295. vbroadcasti128 (13 * 16)(%edi), %ymm0;
  1296. VAESENC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1297. vbroadcasti128 (14 * 16)(%edi), %ymm0;
  1298. /* Last round and output handling. */
  1299. .Locb_auth_blk12_last:
  1300. vaesenclast %ymm0, %ymm1, %ymm1;
  1301. vaesenclast %ymm0, %ymm2, %ymm2;
  1302. vaesenclast %ymm0, %ymm3, %ymm3;
  1303. vaesenclast %ymm0, %ymm4, %ymm4;
  1304. vaesenclast %ymm0, %ymm5, %ymm5;
  1305. vaesenclast %ymm0, %ymm6, %ymm6;
  1306. vpxor %ymm1, %ymm2, %ymm0;
  1307. vpxor %ymm3, %ymm4, %ymm4;
  1308. vpxor %ymm5, %ymm0, %ymm0;
  1309. vpxor %ymm6, %ymm4, %ymm4;
  1310. vpxor %ymm0, %ymm4, %ymm4;
  1311. vpxor (STACK_CHECKSUM)(%esp), %ymm4, %ymm4;
  1312. vmovdqa %ymm4, (STACK_CHECKSUM)(%esp);
  1313. cmpl $12, 4+16(%ebp);
  1314. jae .Locb_crypt_blk12_loop;
  1315. jmp .Locb_crypt_blk12_cleanup;
  1316. .align 8
  1317. .Locb_dec_blk12:
  1318. vmovdqa %xmm0, (STACK_OFFSET_AND_F_KEY)(%esp);
  1319. /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
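	/* The offset pairs in %ymm2..%ymm6 are spilled to STACK_TMP_Y1..Y5
	 * (the first pair is already at STACK_TMP_Y0) so the registers can
	 * carry the cipher state; the saved offsets are folded back in
	 * together with the final round key at .Locb_dec_blk12_last. */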
  1320. vmovdqa (STACK_TMP_Y0)(%esp), %ymm1;
  1321. vmovdqu 0*16(%edx), %ymm0;
  1322. vmovdqu 2*16(%edx), %ymm7;
  1323. vpxor %ymm0, %ymm1, %ymm1;
  1324. vmovdqa %ymm2, (STACK_TMP_Y1)(%esp);
  1325. vpxor %ymm7, %ymm2, %ymm2;
  1326. vmovdqu 4*16(%edx), %ymm0;
  1327. vmovdqu 6*16(%edx), %ymm7;
  1328. vmovdqa %ymm3, (STACK_TMP_Y2)(%esp);
  1329. vmovdqa %ymm4, (STACK_TMP_Y3)(%esp);
  1330. vpxor %ymm0, %ymm3, %ymm3;
  1331. vpxor %ymm7, %ymm4, %ymm4;
  1332. vmovdqu 8*16(%edx), %ymm0;
  1333. vmovdqu 10*16(%edx), %ymm7;
  1334. leal 12*16(%edx), %edx;
  1335. vmovdqa %ymm5, (STACK_TMP_Y4)(%esp);
  1336. vmovdqa %ymm6, (STACK_TMP_Y5)(%esp);
  1337. vpxor %ymm0, %ymm5, %ymm5;
  1338. vbroadcasti128 (1 * 16)(%edi), %ymm0;
  1339. vpxor %ymm7, %ymm6, %ymm6;
  1340. /* AES rounds */
  1341. VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1342. vbroadcasti128 (2 * 16)(%edi), %ymm0;
  1343. VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1344. vbroadcasti128 (3 * 16)(%edi), %ymm0;
  1345. VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1346. vbroadcasti128 (4 * 16)(%edi), %ymm0;
  1347. VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1348. vbroadcasti128 (5 * 16)(%edi), %ymm0;
  1349. VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1350. vbroadcasti128 (6 * 16)(%edi), %ymm0;
  1351. VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1352. vbroadcasti128 (7 * 16)(%edi), %ymm0;
  1353. VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1354. vbroadcasti128 (8 * 16)(%edi), %ymm0;
  1355. VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1356. vbroadcasti128 (9 * 16)(%edi), %ymm0;
  1357. VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1358. cmpl $12, 4+20(%ebp);
  1359. jb .Locb_dec_blk12_last;
  1360. vbroadcasti128 (10 * 16)(%edi), %ymm0;
  1361. VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1362. vbroadcasti128 (11 * 16)(%edi), %ymm0;
  1363. VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1364. jz .Locb_dec_blk12_last;
  1365. vbroadcasti128 (12 * 16)(%edi), %ymm0;
  1366. VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1367. vbroadcasti128 (13 * 16)(%edi), %ymm0;
  1368. VAESDEC6(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6);
  1369. /* Last round and output handling. */
  1370. .Locb_dec_blk12_last:
  1371. vmovdqa (STACK_FXL_KEY)(%esp), %ymm0;
  1372. vpxor (STACK_TMP_Y0)(%esp), %ymm0, %ymm7;
  1373. vaesdeclast %ymm7, %ymm1, %ymm1;
  1374. vmovdqu %ymm1, 0*16(%esi);
  1375. vpxor (STACK_TMP_Y1)(%esp), %ymm0, %ymm1;
  1376. vpxor (STACK_TMP_Y2)(%esp), %ymm0, %ymm7;
  1377. vaesdeclast %ymm1, %ymm2, %ymm2;
  1378. vpxor (STACK_TMP_Y3)(%esp), %ymm0, %ymm1;
  1379. vaesdeclast %ymm7, %ymm3, %ymm3;
  1380. vpxor (STACK_TMP_Y4)(%esp), %ymm0, %ymm7;
  1381. vaesdeclast %ymm1, %ymm4, %ymm4;
  1382. vpxor (STACK_TMP_Y5)(%esp), %ymm0, %ymm0;
  1383. vaesdeclast %ymm7, %ymm5, %ymm5;
  1384. vaesdeclast %ymm0, %ymm6, %ymm6;
  1385. /* Checksum_i = Checksum_{i-1} xor P_i */
  1386. vpxor %ymm2, %ymm3, %ymm0;
  1387. vpxor %ymm4, %ymm5, %ymm7;
  1388. vpxor %ymm6, %ymm0, %ymm0;
  1389. vpxor 0*16(%esi), %ymm7, %ymm7;
  1390. vpxor %ymm0, %ymm7, %ymm7;
  1391. vpxor (STACK_CHECKSUM)(%esp), %ymm7, %ymm7;
  1392. vmovdqu %ymm2, 2*16(%esi);
  1393. vmovdqu %ymm3, 4*16(%esi);
  1394. vmovdqu %ymm4, 6*16(%esi);
  1395. vmovdqu %ymm5, 8*16(%esi);
  1396. vmovdqu %ymm6, 10*16(%esi);
  1397. leal 12*16(%esi), %esi;
  1398. vmovdqa %ymm7, (STACK_CHECKSUM)(%esp);
  1399. cmpl $12, 4+16(%ebp);
  1400. jae .Locb_crypt_blk12_loop;
  1401. .align 8
  1402. .Locb_crypt_blk12_cleanup:
  1403. vpxor %ymm0, %ymm0, %ymm0;
  1404. vmovdqa %ymm0, (STACK_TMP_Y0)(%esp);
  1405. vmovdqa %ymm0, (STACK_TMP_Y1)(%esp);
  1406. vmovdqa %ymm0, (STACK_TMP_Y2)(%esp);
  1407. vmovdqa %ymm0, (STACK_TMP_Y3)(%esp);
  1408. vmovdqa %ymm0, (STACK_TMP_Y4)(%esp);
  1409. vmovdqa %ymm0, (STACK_TMP_Y5)(%esp);
  1410. /* Process trailing four blocks. */
  1411. .align 8
  1412. .Locb_crypt_blk4:
  1413. cmpl $4, 4+16(%ebp);
  1414. jb .Locb_crypt_blk1;
  1415. subl $4, 4+16(%ebp);
  1416. movl 4+36(%ebp), %ecx;
  1417. vmovdqa (%ecx), %xmm7; /* Preload L[0] */
  1418. testl $1, %ebx;
  1419. jz .Locb_crypt_blk4_nblk_even;
  1420. /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
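	/* Offsets are generated two per YMM register: the low lane holds
	 * the offset of the first block of a pair and the high lane the
	 * next one.  Since every second block index is odd (ntz == 0), one
	 * lane is always derived from the other by XORing in L[0], which
	 * was preloaded into %xmm7 above. */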
  1421. leal 1(%ebx), %eax;
  1422. tzcntl %eax, %eax; // ntz(blkn+1)
  1423. shll $4, %eax;
  1424. vmovdqa (STACK_OFFSET_AND_F_KEY)(%esp), %xmm1;
  1425. vpxor (%ecx, %eax), %xmm1, %xmm1;
  1426. vpxor %xmm7, %xmm1, %xmm2;
  1427. vinserti128 $1, %xmm2, %ymm1, %ymm6;
  1428. leal 3(%ebx), %eax;
  1429. tzcntl %eax, %eax; // ntz(blkn+3)
  1430. shll $4, %eax;
  1431. vpxor (%ecx, %eax), %xmm2, %xmm3;
  1432. leal 4(%ebx), %ebx;
  1433. vpxor %xmm7, %xmm3, %xmm4;
  1434. vinserti128 $1, %xmm4, %ymm3, %ymm7;
  1435. vmovdqa %xmm4, (STACK_OFFSET_AND_F_KEY)(%esp);
  1436. cmpl $1, 4+40(%ebp);
  1437. jb .Locb_dec_blk4;
  1438. ja .Locb_auth_blk4;
  1439. jmp .Locb_enc_blk4;
  1440. .align 8
  1441. .Locb_crypt_blk4_nblk_even:
  1442. /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
  1443. vmovdqa (STACK_OFFSET_AND_F_KEY)(%esp), %xmm1;
  1444. vpxor %xmm7, %xmm1, %xmm1;
  1445. leal 2(%ebx), %eax;
  1446. tzcntl %eax, %eax; // ntz(blkn+2)
  1447. shll $4, %eax;
  1448. vpxor (%ecx, %eax), %xmm1, %xmm2;
  1449. vinserti128 $1, %xmm2, %ymm1, %ymm6;
  1450. vpxor %xmm7, %xmm2, %xmm3;
  1451. leal 4(%ebx), %ebx;
  1452. tzcntl %ebx, %eax; // ntz(blkn+4)
  1453. shll $4, %eax;
  1454. vpxor (%ecx, %eax), %xmm3, %xmm4;
  1455. vinserti128 $1, %xmm4, %ymm3, %ymm7;
  1456. vmovdqa %xmm4, (STACK_OFFSET_AND_F_KEY)(%esp);
  1457. cmpl $1, 4+40(%ebp);
  1458. jb .Locb_dec_blk4;
  1459. ja .Locb_auth_blk4;
  1460. .align 8
  1461. .Locb_enc_blk4:
  1462. vmovdqu 0*16(%edx), %ymm1;
  1463. vmovdqu 2*16(%edx), %ymm2;
  1464. leal 4*16(%edx), %edx;
  1465. /* Checksum_i = Checksum_{i-1} xor P_i */
  1466. vpxor %ymm1, %ymm2, %ymm5;
  1467. vpxor (STACK_CHECKSUM)(%esp), %ymm5, %ymm5;
  1468. vmovdqa %ymm5, (STACK_CHECKSUM)(%esp);
  1469. /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
  1470. vpxor %ymm6, %ymm1, %ymm1;
  1471. vpxor %ymm7, %ymm2, %ymm2;
  1472. /* AES rounds */
  1473. vbroadcasti128 (1 * 16)(%edi), %ymm0;
  1474. VAESENC2(%ymm0, %ymm1, %ymm2);
  1475. vbroadcasti128 (2 * 16)(%edi), %ymm0;
  1476. VAESENC2(%ymm0, %ymm1, %ymm2);
  1477. vbroadcasti128 (3 * 16)(%edi), %ymm0;
  1478. VAESENC2(%ymm0, %ymm1, %ymm2);
  1479. vbroadcasti128 (4 * 16)(%edi), %ymm0;
  1480. VAESENC2(%ymm0, %ymm1, %ymm2);
  1481. vbroadcasti128 (5 * 16)(%edi), %ymm0;
  1482. VAESENC2(%ymm0, %ymm1, %ymm2);
  1483. vbroadcasti128 (6 * 16)(%edi), %ymm0;
  1484. VAESENC2(%ymm0, %ymm1, %ymm2);
  1485. vbroadcasti128 (7 * 16)(%edi), %ymm0;
  1486. VAESENC2(%ymm0, %ymm1, %ymm2);
  1487. vbroadcasti128 (8 * 16)(%edi), %ymm0;
  1488. VAESENC2(%ymm0, %ymm1, %ymm2);
  1489. vbroadcasti128 (9 * 16)(%edi), %ymm0;
  1490. VAESENC2(%ymm0, %ymm1, %ymm2);
  1491. cmpl $12, 4+20(%ebp);
  1492. jb .Locb_enc_blk4_last;
  1493. vbroadcasti128 (10 * 16)(%edi), %ymm0;
  1494. VAESENC2(%ymm0, %ymm1, %ymm2);
  1495. vbroadcasti128 (11 * 16)(%edi), %ymm0;
  1496. VAESENC2(%ymm0, %ymm1, %ymm2);
  1497. jz .Locb_enc_blk4_last;
  1498. vbroadcasti128 (12 * 16)(%edi), %ymm0;
  1499. VAESENC2(%ymm0, %ymm1, %ymm2);
  1500. vbroadcasti128 (13 * 16)(%edi), %ymm0;
  1501. VAESENC2(%ymm0, %ymm1, %ymm2);
  1502. /* Last round and output handling. */
  1503. .Locb_enc_blk4_last:
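	/* The offsets in %ymm6/%ymm7 still carry round key 0, so XORing
	 * them with STACK_FXL_KEY (first round key xor last round key)
	 * yields "last round key xor Offset_i"; vaesenclast thus performs
	 * the final AddRoundKey and the C_i = Offset_i xor ENCIPHER(...)
	 * whitening in a single step. */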
  1504. vmovdqa (STACK_FXL_KEY)(%esp), %ymm0;
  1505. vpxor %ymm0, %ymm6, %ymm6; /* Xor offset to last round key. */
  1506. vpxor %ymm0, %ymm7, %ymm7;
  1507. vaesenclast %ymm6, %ymm1, %ymm1;
  1508. vaesenclast %ymm7, %ymm2, %ymm2;
  1509. vmovdqu %ymm1, 0*16(%esi);
  1510. vmovdqu %ymm2, 2*16(%esi);
  1511. leal 4*16(%esi), %esi;
  1512. jmp .Locb_crypt_blk1;
  1513. .align 8
  1514. .Locb_auth_blk4:
  1515. /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
  1516. vpxor 0*16(%edx), %ymm6, %ymm1;
  1517. vpxor 2*16(%edx), %ymm7, %ymm2;
  1518. leal 4*16(%edx), %edx;
  1519. /* AES rounds */
  1520. vbroadcasti128 (1 * 16)(%edi), %ymm0;
  1521. VAESENC2(%ymm0, %ymm1, %ymm2);
  1522. vbroadcasti128 (2 * 16)(%edi), %ymm0;
  1523. VAESENC2(%ymm0, %ymm1, %ymm2);
  1524. vbroadcasti128 (3 * 16)(%edi), %ymm0;
  1525. VAESENC2(%ymm0, %ymm1, %ymm2);
  1526. vbroadcasti128 (4 * 16)(%edi), %ymm0;
  1527. VAESENC2(%ymm0, %ymm1, %ymm2);
  1528. vbroadcasti128 (5 * 16)(%edi), %ymm0;
  1529. VAESENC2(%ymm0, %ymm1, %ymm2);
  1530. vbroadcasti128 (6 * 16)(%edi), %ymm0;
  1531. VAESENC2(%ymm0, %ymm1, %ymm2);
  1532. vbroadcasti128 (7 * 16)(%edi), %ymm0;
  1533. VAESENC2(%ymm0, %ymm1, %ymm2);
  1534. vbroadcasti128 (8 * 16)(%edi), %ymm0;
  1535. VAESENC2(%ymm0, %ymm1, %ymm2);
  1536. vbroadcasti128 (9 * 16)(%edi), %ymm0;
  1537. VAESENC2(%ymm0, %ymm1, %ymm2);
  1538. vbroadcasti128 (10 * 16)(%edi), %ymm0;
  1539. cmpl $12, 4+20(%ebp);
  1540. jb .Locb_auth_blk4_last;
  1541. VAESENC2(%ymm0, %ymm1, %ymm2);
  1542. vbroadcasti128 (11 * 16)(%edi), %ymm0;
  1543. VAESENC2(%ymm0, %ymm1, %ymm2);
  1544. vbroadcasti128 (12 * 16)(%edi), %ymm0;
  1545. jz .Locb_auth_blk4_last;
  1546. VAESENC2(%ymm0, %ymm1, %ymm2);
  1547. vbroadcasti128 (13 * 16)(%edi), %ymm0;
  1548. VAESENC2(%ymm0, %ymm1, %ymm2);
  1549. vbroadcasti128 (14 * 16)(%edi), %ymm0;
  1550. /* Last round and output handling. */
  1551. .Locb_auth_blk4_last:
  1552. vaesenclast %ymm0, %ymm1, %ymm1;
  1553. vaesenclast %ymm0, %ymm2, %ymm2;
  1554. /* Checksum_i = Checksum_{i-1} xor P_i */
  1555. vpxor %ymm1, %ymm2, %ymm5;
  1556. vpxor (STACK_CHECKSUM)(%esp), %ymm5, %ymm5;
  1557. vmovdqa %ymm5, (STACK_CHECKSUM)(%esp);
  1558. jmp .Locb_crypt_blk1;
  1559. .align 8
  1560. .Locb_dec_blk4:
  1561. /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
  1562. vpxor 0*16(%edx), %ymm6, %ymm1;
  1563. vpxor 2*16(%edx), %ymm7, %ymm2;
  1564. leal 4*16(%edx), %edx;
  1565. /* AES rounds */
  1566. vbroadcasti128 (1 * 16)(%edi), %ymm0;
  1567. VAESDEC2(%ymm0, %ymm1, %ymm2);
  1568. vbroadcasti128 (2 * 16)(%edi), %ymm0;
  1569. VAESDEC2(%ymm0, %ymm1, %ymm2);
  1570. vbroadcasti128 (3 * 16)(%edi), %ymm0;
  1571. VAESDEC2(%ymm0, %ymm1, %ymm2);
  1572. vbroadcasti128 (4 * 16)(%edi), %ymm0;
  1573. VAESDEC2(%ymm0, %ymm1, %ymm2);
  1574. vbroadcasti128 (5 * 16)(%edi), %ymm0;
  1575. VAESDEC2(%ymm0, %ymm1, %ymm2);
  1576. vbroadcasti128 (6 * 16)(%edi), %ymm0;
  1577. VAESDEC2(%ymm0, %ymm1, %ymm2);
  1578. vbroadcasti128 (7 * 16)(%edi), %ymm0;
  1579. VAESDEC2(%ymm0, %ymm1, %ymm2);
  1580. vbroadcasti128 (8 * 16)(%edi), %ymm0;
  1581. VAESDEC2(%ymm0, %ymm1, %ymm2);
  1582. vbroadcasti128 (9 * 16)(%edi), %ymm0;
  1583. VAESDEC2(%ymm0, %ymm1, %ymm2);
  1584. cmpl $12, 4+20(%ebp);
  1585. jb .Locb_dec_blk4_last;
  1586. vbroadcasti128 (10 * 16)(%edi), %ymm0;
  1587. VAESDEC2(%ymm0, %ymm1, %ymm2);
  1588. vbroadcasti128 (11 * 16)(%edi), %ymm0;
  1589. VAESDEC2(%ymm0, %ymm1, %ymm2);
  1590. jz .Locb_dec_blk4_last;
  1591. vbroadcasti128 (12 * 16)(%edi), %ymm0;
  1592. VAESDEC2(%ymm0, %ymm1, %ymm2);
  1593. vbroadcasti128 (13 * 16)(%edi), %ymm0;
  1594. VAESDEC2(%ymm0, %ymm1, %ymm2);
  1595. /* Last round and output handling. */
  1596. .Locb_dec_blk4_last:
  1597. vmovdqa (STACK_FXL_KEY)(%esp), %ymm0;
  1598. vpxor %ymm0, %ymm6, %ymm6; /* Xor offset to last round key. */
  1599. vpxor %ymm0, %ymm7, %ymm7;
  1600. vaesdeclast %ymm6, %ymm1, %ymm1;
  1601. vaesdeclast %ymm7, %ymm2, %ymm2;
  1602. /* Checksum_i = Checksum_{i-1} xor P_i */
  1603. vpxor %ymm1, %ymm2, %ymm5;
  1604. vpxor (STACK_CHECKSUM)(%esp), %ymm5, %ymm5;
  1605. vmovdqu %ymm1, 0*16(%esi);
  1606. vmovdqu %ymm2, 2*16(%esi);
  1607. leal 4*16(%esi), %esi;
  1608. vmovdqa %ymm5, (STACK_CHECKSUM)(%esp);
  1609. /* Process trailing one to three blocks, one per loop. */
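	/* Single-block tail: only 128-bit registers are used from here on,
	 * the offset is updated in place on the stack and each block is
	 * dispatched to the encrypt, authenticate or decrypt path. */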
  1610. .align 8
  1611. .Locb_crypt_blk1:
  1612. cmpl $1, 4+16(%ebp);
  1613. jb .Locb_crypt_done;
  1614. subl $1, 4+16(%ebp);
  1615. movl 4+36(%ebp), %ecx;
  1616. leal 1(%ebx), %ebx;
  1617. tzcntl %ebx, %eax; // ntz(blkn+1)
  1618. shll $4, %eax;
  1619. vmovdqa (STACK_OFFSET_AND_F_KEY)(%esp), %xmm7;
  1620. vpxor (%ecx, %eax), %xmm7, %xmm7;
  1621. /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
  1622. vmovdqa %xmm7, (STACK_OFFSET_AND_F_KEY)(%esp);
  1623. cmpl $1, 4+40(%ebp);
  1624. jb .Locb_dec_blk1;
  1625. ja .Locb_auth_blk1;
  1626. vmovdqu (%edx), %xmm0;
  1627. leal 16(%edx), %edx;
  1628. /* Checksum_i = Checksum_{i-1} xor P_i */
  1629. vpxor (STACK_CHECKSUM)(%esp), %xmm0, %xmm1;
  1630. vmovdqa %xmm1, (STACK_CHECKSUM)(%esp);
  1631. /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
  1632. vpxor %xmm7, %xmm0, %xmm0;
  1633. /* AES rounds. */
  1634. vaesenc (1 * 16)(%edi), %xmm0, %xmm0;
  1635. vaesenc (2 * 16)(%edi), %xmm0, %xmm0;
  1636. vaesenc (3 * 16)(%edi), %xmm0, %xmm0;
  1637. vaesenc (4 * 16)(%edi), %xmm0, %xmm0;
  1638. vaesenc (5 * 16)(%edi), %xmm0, %xmm0;
  1639. vaesenc (6 * 16)(%edi), %xmm0, %xmm0;
  1640. vaesenc (7 * 16)(%edi), %xmm0, %xmm0;
  1641. vaesenc (8 * 16)(%edi), %xmm0, %xmm0;
  1642. vaesenc (9 * 16)(%edi), %xmm0, %xmm0;
  1643. cmpl $12, 4+20(%ebp);
  1644. jb .Locb_enc_blk1_last;
  1645. vaesenc (10 * 16)(%edi), %xmm0, %xmm0;
  1646. vaesenc (11 * 16)(%edi), %xmm0, %xmm0;
  1647. jz .Locb_enc_blk1_last;
  1648. vaesenc (12 * 16)(%edi), %xmm0, %xmm0;
  1649. vaesenc (13 * 16)(%edi), %xmm0, %xmm0;
  1650. /* Last round and output handling. */
  1651. .Locb_enc_blk1_last:
  1652. vpxor (STACK_FXL_KEY)(%esp), %xmm7, %xmm1;
  1653. vaesenclast %xmm1, %xmm0, %xmm0;
  1654. vmovdqu %xmm0, (%esi);
  1655. leal 16(%esi), %esi;
  1656. jmp .Locb_crypt_blk1;
  1657. .align 8
  1658. .Locb_auth_blk1:
  1659. /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
  1660. vpxor (%edx), %xmm7, %xmm0;
  1661. leal 16(%edx), %edx;
  1662. /* AES rounds. */
  1663. vaesenc (1 * 16)(%edi), %xmm0, %xmm0;
  1664. vaesenc (2 * 16)(%edi), %xmm0, %xmm0;
  1665. vaesenc (3 * 16)(%edi), %xmm0, %xmm0;
  1666. vaesenc (4 * 16)(%edi), %xmm0, %xmm0;
  1667. vaesenc (5 * 16)(%edi), %xmm0, %xmm0;
  1668. vaesenc (6 * 16)(%edi), %xmm0, %xmm0;
  1669. vaesenc (7 * 16)(%edi), %xmm0, %xmm0;
  1670. vaesenc (8 * 16)(%edi), %xmm0, %xmm0;
  1671. vaesenc (9 * 16)(%edi), %xmm0, %xmm0;
  1672. vmovdqa (10 * 16)(%edi), %xmm1;
  1673. cmpl $12, 4+20(%ebp);
  1674. jb .Locb_auth_blk1_last;
  1675. vaesenc %xmm1, %xmm0, %xmm0;
  1676. vaesenc (11 * 16)(%edi), %xmm0, %xmm0;
  1677. vmovdqa (12 * 16)(%edi), %xmm1;
  1678. jz .Locb_auth_blk1_last;
  1679. vaesenc %xmm1, %xmm0, %xmm0;
  1680. vaesenc (13 * 16)(%edi), %xmm0, %xmm0;
  1681. vmovdqa (14 * 16)(%edi), %xmm1;
  1682. /* Last round and output handling. */
  1683. .Locb_auth_blk1_last:
  1684. vpxor (STACK_CHECKSUM)(%esp), %xmm1, %xmm1;
  1685. vaesenclast %xmm1, %xmm0, %xmm0;
  1686. vmovdqa %xmm0, (STACK_CHECKSUM)(%esp);
  1687. jmp .Locb_crypt_blk1;
  1688. .align 8
  1689. .Locb_dec_blk1:
  1690. /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
  1691. vpxor (%edx), %xmm7, %xmm0;
  1692. leal 16(%edx), %edx;
  1693. /* AES rounds. */
  1694. vaesdec (1 * 16)(%edi), %xmm0, %xmm0;
  1695. vaesdec (2 * 16)(%edi), %xmm0, %xmm0;
  1696. vaesdec (3 * 16)(%edi), %xmm0, %xmm0;
  1697. vaesdec (4 * 16)(%edi), %xmm0, %xmm0;
  1698. vaesdec (5 * 16)(%edi), %xmm0, %xmm0;
  1699. vaesdec (6 * 16)(%edi), %xmm0, %xmm0;
  1700. vaesdec (7 * 16)(%edi), %xmm0, %xmm0;
  1701. vaesdec (8 * 16)(%edi), %xmm0, %xmm0;
  1702. vaesdec (9 * 16)(%edi), %xmm0, %xmm0;
  1703. cmpl $12, 4+20(%ebp);
  1704. jb .Locb_dec_blk1_last;
  1705. vaesdec (10 * 16)(%edi), %xmm0, %xmm0;
  1706. vaesdec (11 * 16)(%edi), %xmm0, %xmm0;
  1707. jz .Locb_dec_blk1_last;
  1708. vaesdec (12 * 16)(%edi), %xmm0, %xmm0;
  1709. vaesdec (13 * 16)(%edi), %xmm0, %xmm0;
  1710. /* Last round and output handling. */
  1711. .Locb_dec_blk1_last:
  1712. vpxor (STACK_FXL_KEY)(%esp), %xmm7, %xmm1;
  1713. vaesdeclast %xmm1, %xmm0, %xmm0;
  1714. /* Checksum_i = Checksum_{i-1} xor P_i */
  1715. vpxor (STACK_CHECKSUM)(%esp), %xmm0, %xmm1;
  1716. vmovdqu %xmm0, (%esi);
  1717. leal 16(%esi), %esi;
  1718. vmovdqa %xmm1, (STACK_CHECKSUM)(%esp);
  1719. jmp .Locb_crypt_blk1;
	.align 8
.Locb_crypt_done:
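	/* Write back the state: the current offset (stored with round
	 * key 0 folded in, hence the XOR with (%edi)) goes to the buffer
	 * at 4+24(%ebp), and the two 128-bit halves of the checksum are
	 * combined and XORed into the buffer at 4+28(%ebp) before the
	 * stack copies are wiped. */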
	movl 4+24(%ebp), %ecx;
	vmovdqa (STACK_OFFSET_AND_F_KEY)(%esp), %xmm1;
	vpxor (%edi), %xmm1, %xmm1;
	vmovdqu %xmm1, (%ecx);

	movl 4+28(%ebp), %eax;
	vmovdqa (STACK_CHECKSUM)(%esp), %xmm2;
	vpxor (STACK_CHECKSUM + 16)(%esp), %xmm2, %xmm2;
	vpxor (%eax), %xmm2, %xmm2;
	vmovdqu %xmm2, (%eax);

	movl (STACK_GPR_POS + 0 * 4)(%esp), %edi;
	CFI_RESTORE(edi);
	movl (STACK_GPR_POS + 1 * 4)(%esp), %esi;
	CFI_RESTORE(esi);
	movl (STACK_GPR_POS + 2 * 4)(%esp), %ebx;
	CFI_RESTORE(ebx);

	vpxor %ymm0, %ymm0, %ymm0;
	vmovdqa %ymm0, (STACK_OFFSET_AND_F_KEY)(%esp);
	vmovdqa %ymm0, (STACK_CHECKSUM)(%esp);

	xorl %eax, %eax;
	leave;
	CFI_LEAVE();
	vzeroall;
	ret_spec_stop
	CFI_ENDPROC();
ELF(.size SYM_NAME(_gcry_vaes_avx2_ocb_crypt_i386),
	  .-SYM_NAME(_gcry_vaes_avx2_ocb_crypt_i386))

/**********************************************************************
  XTS-mode encryption
 **********************************************************************/
ELF(.type SYM_NAME(_gcry_vaes_avx2_xts_crypt_i386),@function)
.globl SYM_NAME(_gcry_vaes_avx2_xts_crypt_i386)
.align 16
SYM_NAME(_gcry_vaes_avx2_xts_crypt_i386):
	/* input:
	 *	(esp + 4): round keys
	 *	(esp + 8): tweak
	 *	(esp + 12): dst
	 *	(esp + 16): src
	 *	(esp + 20): nblocks
	 *	(esp + 24): nrounds
	 *	(esp + 28): encrypt
	 */
	CFI_STARTPROC();

	GET_DATA_POINTER(SYM_NAME(_gcry_vaes_consts), eax);

	pushl %ebp;
	CFI_PUSH(%ebp);
	movl %esp, %ebp;
	CFI_DEF_CFA_REGISTER(%ebp);

	subl $(4 * 32 + 3 * 4), %esp;
	andl $-32, %esp;

	movl %edi, (4 * 32 + 0 * 4)(%esp);
	CFI_REG_ON_STACK(edi, 4 * 32 + 0 * 4);
	movl %esi, (4 * 32 + 1 * 4)(%esp);
	CFI_REG_ON_STACK(esi, 4 * 32 + 1 * 4);
	movl %ebx, (4 * 32 + 2 * 4)(%esp);
	CFI_REG_ON_STACK(ebx, 4 * 32 + 2 * 4);

	movl %eax, %ebx;
	movl 4+4(%ebp), %edi;
	movl 4+8(%ebp), %esi;
	movl 4+12(%ebp), %edx;
	movl 4+16(%ebp), %ecx;
	movl 4+20(%ebp), %eax;

#define tweak_clmul(shift, out, tweak, hi_tweak, tmp1, tmp2) \
	vpsrld $(32-(shift)), hi_tweak, tmp2; \
	vpsllq $(shift), tweak, out; \
	vpclmulqdq $0, CADDR(.Lxts_gfmul_clmul, %ebx), tmp2, tmp1; \
	vpunpckhqdq tmp2, tmp1, tmp1; \
	vpxor tmp1, out, out;
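/* tweak_clmul(shift, ...) multiplies the XTS tweak by x^shift in
 * GF(2^128) modulo x^128 + x^7 + x^2 + x + 1: vpsllq shifts both
 * 64-bit halves, the bits shifted out of each half (pre-arranged in
 * hi_tweak by the .Lxts_high_bit_shuf permutation) are extracted with
 * vpsrld, the bits leaving bit 127 are multiplied by the 0x87 constant
 * at .Lxts_gfmul_clmul via vpclmulqdq, and vpunpckhqdq/vpxor fold both
 * carries back into the shifted value. */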
	/* Prepare tweak. */
	vmovdqu (%esi), %xmm7;
	vpshufb CADDR(.Lxts_high_bit_shuf, %ebx), %xmm7, %xmm6;
	tweak_clmul(1, %xmm5, %xmm7, %xmm6, %xmm0, %xmm1);
	vinserti128 $1, %xmm5, %ymm7, %ymm7; /* tweak:tweak1 */
	vpshufb CADDR(.Lxts_high_bit_shuf, %ebx), %ymm7, %ymm6;
	/* Process eight blocks per loop. */
	.align 8
.Lxts_crypt_blk8:
	cmpl $8, %eax;
	jb .Lxts_crypt_blk4;

	leal -8(%eax), %eax;

	vmovdqa %ymm7, (0 * 32)(%esp);
	tweak_clmul(2, %ymm2, %ymm7, %ymm6, %ymm0, %ymm1);
	vmovdqa %ymm2, (1 * 32)(%esp);
	tweak_clmul(4, %ymm2, %ymm7, %ymm6, %ymm0, %ymm1);
	vmovdqa %ymm2, (2 * 32)(%esp);
	tweak_clmul(6, %ymm2, %ymm7, %ymm6, %ymm0, %ymm1);
	vmovdqa %ymm2, (3 * 32)(%esp);
	tweak_clmul(8, %ymm7, %ymm7, %ymm6, %ymm0, %ymm1);
	vpshufb CADDR(.Lxts_high_bit_shuf, %ebx), %ymm7, %ymm6;

	vbroadcasti128 (0 * 16)(%edi), %ymm4;
	vmovdqa (0 * 32)(%esp), %ymm0;
	vmovdqa (1 * 32)(%esp), %ymm1;
	vmovdqa (2 * 32)(%esp), %ymm2;
	vmovdqa (3 * 32)(%esp), %ymm3;
	vpxor (0 * 16)(%ecx), %ymm0, %ymm0;
	vpxor (2 * 16)(%ecx), %ymm1, %ymm1;
	vpxor (4 * 16)(%ecx), %ymm2, %ymm2;
	vpxor (6 * 16)(%ecx), %ymm3, %ymm3;

	leal (8 * 16)(%ecx), %ecx;

	cmpl $1, 4+28(%ebp);
	jne .Lxts_dec_blk8;
  1823. /* AES rounds */
  1824. XOR4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1825. vbroadcasti128 (1 * 16)(%edi), %ymm4;
  1826. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1827. vbroadcasti128 (2 * 16)(%edi), %ymm4;
  1828. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1829. vbroadcasti128 (3 * 16)(%edi), %ymm4;
  1830. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1831. vbroadcasti128 (4 * 16)(%edi), %ymm4;
  1832. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1833. vbroadcasti128 (5 * 16)(%edi), %ymm4;
  1834. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1835. vbroadcasti128 (6 * 16)(%edi), %ymm4;
  1836. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1837. vbroadcasti128 (7 * 16)(%edi), %ymm4;
  1838. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1839. vbroadcasti128 (8 * 16)(%edi), %ymm4;
  1840. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1841. vbroadcasti128 (9 * 16)(%edi), %ymm4;
  1842. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1843. vbroadcasti128 (10 * 16)(%edi), %ymm4;
  1844. cmpl $12, 4+24(%ebp);
  1845. jb .Lxts_enc_blk8_last;
  1846. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1847. vbroadcasti128 (11 * 16)(%edi), %ymm4;
  1848. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1849. vbroadcasti128 (12 * 16)(%edi), %ymm4;
  1850. jz .Lxts_enc_blk8_last;
  1851. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1852. vbroadcasti128 (13 * 16)(%edi), %ymm4;
  1853. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1854. vbroadcasti128 (14 * 16)(%edi), %ymm4;
  1855. /* Last round and output handling. */
  1856. .Lxts_enc_blk8_last:
  1857. vpxor (0 * 32)(%esp), %ymm4, %ymm5; /* Xor tweak to last round key. */
  1858. vaesenclast %ymm5, %ymm0, %ymm0;
  1859. vpxor (1 * 32)(%esp), %ymm4, %ymm5;
  1860. vaesenclast %ymm5, %ymm1, %ymm1;
  1861. vpxor (2 * 32)(%esp), %ymm4, %ymm5;
  1862. vpxor (3 * 32)(%esp), %ymm4, %ymm4;
  1863. vaesenclast %ymm5, %ymm2, %ymm2;
  1864. vaesenclast %ymm4, %ymm3, %ymm3;
  1865. vmovdqu %ymm0, (0 * 16)(%edx);
  1866. vmovdqu %ymm1, (2 * 16)(%edx);
  1867. vmovdqu %ymm2, (4 * 16)(%edx);
  1868. vmovdqu %ymm3, (6 * 16)(%edx);
  1869. leal (8 * 16)(%edx), %edx;
  1870. jmp .Lxts_crypt_blk8;
  1871. .align 8
  1872. .Lxts_dec_blk8:
  1873. /* AES rounds */
  1874. XOR4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1875. vbroadcasti128 (1 * 16)(%edi), %ymm4;
  1876. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1877. vbroadcasti128 (2 * 16)(%edi), %ymm4;
  1878. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1879. vbroadcasti128 (3 * 16)(%edi), %ymm4;
  1880. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1881. vbroadcasti128 (4 * 16)(%edi), %ymm4;
  1882. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1883. vbroadcasti128 (5 * 16)(%edi), %ymm4;
  1884. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1885. vbroadcasti128 (6 * 16)(%edi), %ymm4;
  1886. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1887. vbroadcasti128 (7 * 16)(%edi), %ymm4;
  1888. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1889. vbroadcasti128 (8 * 16)(%edi), %ymm4;
  1890. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1891. vbroadcasti128 (9 * 16)(%edi), %ymm4;
  1892. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1893. vbroadcasti128 (10 * 16)(%edi), %ymm4;
  1894. cmpl $12, 4+24(%ebp);
  1895. jb .Lxts_dec_blk8_last;
  1896. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1897. vbroadcasti128 (11 * 16)(%edi), %ymm4;
  1898. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1899. vbroadcasti128 (12 * 16)(%edi), %ymm4;
  1900. jz .Lxts_dec_blk8_last;
  1901. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1902. vbroadcasti128 (13 * 16)(%edi), %ymm4;
  1903. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  1904. vbroadcasti128 (14 * 16)(%edi), %ymm4;
  1905. /* Last round and output handling. */
  1906. .Lxts_dec_blk8_last:
  1907. vpxor (0 * 32)(%esp), %ymm4, %ymm5; /* Xor tweak to last round key. */
  1908. vaesdeclast %ymm5, %ymm0, %ymm0;
  1909. vpxor (1 * 32)(%esp), %ymm4, %ymm5;
  1910. vaesdeclast %ymm5, %ymm1, %ymm1;
  1911. vpxor (2 * 32)(%esp), %ymm4, %ymm5;
  1912. vpxor (3 * 32)(%esp), %ymm4, %ymm4;
  1913. vaesdeclast %ymm5, %ymm2, %ymm2;
  1914. vaesdeclast %ymm4, %ymm3, %ymm3;
  1915. vmovdqu %ymm0, (0 * 16)(%edx);
  1916. vmovdqu %ymm1, (2 * 16)(%edx);
  1917. vmovdqu %ymm2, (4 * 16)(%edx);
  1918. vmovdqu %ymm3, (6 * 16)(%edx);
  1919. leal (8 * 16)(%edx), %edx;
  1920. jmp .Lxts_crypt_blk8;
	/* Handle trailing four blocks. */
	.align 8
.Lxts_crypt_blk4:
	/* Try to exit early, as the input length is typically a large power of two. */
	cmpl $1, %eax;
	jb .Ldone_xts_crypt;
	cmpl $4, %eax;
	jb .Lxts_crypt_blk1;

	leal -4(%eax), %eax;

	vmovdqa %ymm7, %ymm2;
	tweak_clmul(2, %ymm3, %ymm7, %ymm6, %ymm0, %ymm1);
	tweak_clmul(4, %ymm7, %ymm7, %ymm6, %ymm0, %ymm1);
	vpshufb CADDR(.Lxts_high_bit_shuf, %ebx), %ymm7, %ymm6;

	vbroadcasti128 (0 * 16)(%edi), %ymm4;
	vpxor (0 * 16)(%ecx), %ymm2, %ymm0;
	vpxor (2 * 16)(%ecx), %ymm3, %ymm1;

	leal (4 * 16)(%ecx), %ecx;

	cmpl $1, 4+28(%ebp);
	jne .Lxts_dec_blk4;
  1940. /* AES rounds */
  1941. XOR2(%ymm4, %ymm0, %ymm1);
  1942. vbroadcasti128 (1 * 16)(%edi), %ymm4;
  1943. VAESENC2(%ymm4, %ymm0, %ymm1);
  1944. vbroadcasti128 (2 * 16)(%edi), %ymm4;
  1945. VAESENC2(%ymm4, %ymm0, %ymm1);
  1946. vbroadcasti128 (3 * 16)(%edi), %ymm4;
  1947. VAESENC2(%ymm4, %ymm0, %ymm1);
  1948. vbroadcasti128 (4 * 16)(%edi), %ymm4;
  1949. VAESENC2(%ymm4, %ymm0, %ymm1);
  1950. vbroadcasti128 (5 * 16)(%edi), %ymm4;
  1951. VAESENC2(%ymm4, %ymm0, %ymm1);
  1952. vbroadcasti128 (6 * 16)(%edi), %ymm4;
  1953. VAESENC2(%ymm4, %ymm0, %ymm1);
  1954. vbroadcasti128 (7 * 16)(%edi), %ymm4;
  1955. VAESENC2(%ymm4, %ymm0, %ymm1);
  1956. vbroadcasti128 (8 * 16)(%edi), %ymm4;
  1957. VAESENC2(%ymm4, %ymm0, %ymm1);
  1958. vbroadcasti128 (9 * 16)(%edi), %ymm4;
  1959. VAESENC2(%ymm4, %ymm0, %ymm1);
  1960. vbroadcasti128 (10 * 16)(%edi), %ymm4;
  1961. cmpl $12, 4+24(%ebp);
  1962. jb .Lxts_enc_blk4_last;
  1963. VAESENC2(%ymm4, %ymm0, %ymm1);
  1964. vbroadcasti128 (11 * 16)(%edi), %ymm4;
  1965. VAESENC2(%ymm4, %ymm0, %ymm1);
  1966. vbroadcasti128 (12 * 16)(%edi), %ymm4;
  1967. jz .Lxts_enc_blk4_last;
  1968. VAESENC2(%ymm4, %ymm0, %ymm1);
  1969. vbroadcasti128 (13 * 16)(%edi), %ymm4;
  1970. VAESENC2(%ymm4, %ymm0, %ymm1);
  1971. vbroadcasti128 (14 * 16)(%edi), %ymm4;
  1972. /* Last round and output handling. */
  1973. .Lxts_enc_blk4_last:
  1974. vpxor %ymm4, %ymm2, %ymm2; /* Xor tweak to last round key. */
  1975. vpxor %ymm4, %ymm3, %ymm3;
  1976. vaesenclast %ymm2, %ymm0, %ymm0;
  1977. vaesenclast %ymm3, %ymm1, %ymm1;
  1978. vmovdqu %ymm0, (0 * 16)(%edx);
  1979. vmovdqu %ymm1, (2 * 16)(%edx);
  1980. leal (4 * 16)(%edx), %edx;
  1981. jmp .Lxts_crypt_blk1;
  1982. .align 8
  1983. .Lxts_dec_blk4:
  1984. /* AES rounds */
  1985. XOR2(%ymm4, %ymm0, %ymm1);
  1986. vbroadcasti128 (1 * 16)(%edi), %ymm4;
  1987. VAESDEC2(%ymm4, %ymm0, %ymm1);
  1988. vbroadcasti128 (2 * 16)(%edi), %ymm4;
  1989. VAESDEC2(%ymm4, %ymm0, %ymm1);
  1990. vbroadcasti128 (3 * 16)(%edi), %ymm4;
  1991. VAESDEC2(%ymm4, %ymm0, %ymm1);
  1992. vbroadcasti128 (4 * 16)(%edi), %ymm4;
  1993. VAESDEC2(%ymm4, %ymm0, %ymm1);
  1994. vbroadcasti128 (5 * 16)(%edi), %ymm4;
  1995. VAESDEC2(%ymm4, %ymm0, %ymm1);
  1996. vbroadcasti128 (6 * 16)(%edi), %ymm4;
  1997. VAESDEC2(%ymm4, %ymm0, %ymm1);
  1998. vbroadcasti128 (7 * 16)(%edi), %ymm4;
  1999. VAESDEC2(%ymm4, %ymm0, %ymm1);
  2000. vbroadcasti128 (8 * 16)(%edi), %ymm4;
  2001. VAESDEC2(%ymm4, %ymm0, %ymm1);
  2002. vbroadcasti128 (9 * 16)(%edi), %ymm4;
  2003. VAESDEC2(%ymm4, %ymm0, %ymm1);
  2004. vbroadcasti128 (10 * 16)(%edi), %ymm4;
  2005. cmpl $12, 4+24(%ebp);
  2006. jb .Lxts_dec_blk4_last;
  2007. VAESDEC2(%ymm4, %ymm0, %ymm1);
  2008. vbroadcasti128 (11 * 16)(%edi), %ymm4;
  2009. VAESDEC2(%ymm4, %ymm0, %ymm1);
  2010. vbroadcasti128 (12 * 16)(%edi), %ymm4;
  2011. jz .Lxts_dec_blk4_last;
  2012. VAESDEC2(%ymm4, %ymm0, %ymm1);
  2013. vbroadcasti128 (13 * 16)(%edi), %ymm4;
  2014. VAESDEC2(%ymm4, %ymm0, %ymm1);
  2015. vbroadcasti128 (14 * 16)(%edi), %ymm4;
  2016. /* Last round and output handling. */
  2017. .Lxts_dec_blk4_last:
  2018. vpxor %ymm4, %ymm2, %ymm2; /* Xor tweak to last round key. */
  2019. vpxor %ymm4, %ymm3, %ymm3;
  2020. vaesdeclast %ymm2, %ymm0, %ymm0;
  2021. vaesdeclast %ymm3, %ymm1, %ymm1;
  2022. vmovdqu %ymm0, (0 * 16)(%edx);
  2023. vmovdqu %ymm1, (2 * 16)(%edx);
  2024. leal (4 * 16)(%edx), %edx;
  2025. /* Process trailing one to three blocks, one per loop. */
  2026. .align 8
  2027. .Lxts_crypt_blk1:
  2028. cmpl $1, %eax;
  2029. jb .Ldone_xts_crypt;
  2030. leal -1(%eax), %eax;
  2031. vpxor (%ecx), %xmm7, %xmm0;
  2032. vmovdqa %xmm7, %xmm5;
  2033. tweak_clmul(1, %xmm7, %xmm7, %xmm6, %xmm2, %xmm3);
  2034. vpshufb CADDR(.Lxts_high_bit_shuf, %ebx), %xmm7, %xmm6;
  2035. leal 16(%ecx), %ecx;
  2036. cmpl $1, 4+28(%ebp);
  2037. jne .Lxts_dec_blk1;
  2038. /* AES rounds. */
  2039. vpxor (0 * 16)(%edi), %xmm0, %xmm0;
  2040. vaesenc (1 * 16)(%edi), %xmm0, %xmm0;
  2041. vaesenc (2 * 16)(%edi), %xmm0, %xmm0;
  2042. vaesenc (3 * 16)(%edi), %xmm0, %xmm0;
  2043. vaesenc (4 * 16)(%edi), %xmm0, %xmm0;
  2044. vaesenc (5 * 16)(%edi), %xmm0, %xmm0;
  2045. vaesenc (6 * 16)(%edi), %xmm0, %xmm0;
  2046. vaesenc (7 * 16)(%edi), %xmm0, %xmm0;
  2047. vaesenc (8 * 16)(%edi), %xmm0, %xmm0;
  2048. vaesenc (9 * 16)(%edi), %xmm0, %xmm0;
  2049. vmovdqa (10 * 16)(%edi), %xmm1;
  2050. cmpl $12, 4+24(%ebp);
  2051. jb .Lxts_enc_blk1_last;
  2052. vaesenc %xmm1, %xmm0, %xmm0;
  2053. vaesenc (11 * 16)(%edi), %xmm0, %xmm0;
  2054. vmovdqa (12 * 16)(%edi), %xmm1;
  2055. jz .Lxts_enc_blk1_last;
  2056. vaesenc %xmm1, %xmm0, %xmm0;
  2057. vaesenc (13 * 16)(%edi), %xmm0, %xmm0;
  2058. vmovdqa (14 * 16)(%edi), %xmm1;
  2059. /* Last round and output handling. */
  2060. .Lxts_enc_blk1_last:
  2061. vpxor %xmm1, %xmm5, %xmm5; /* Xor tweak to last round key. */
  2062. vaesenclast %xmm5, %xmm0, %xmm0;
  2063. vmovdqu %xmm0, (%edx);
  2064. leal 16(%edx), %edx;
  2065. jmp .Lxts_crypt_blk1;
  2066. .align 8
  2067. .Lxts_dec_blk1:
  2068. /* AES rounds. */
  2069. vpxor (0 * 16)(%edi), %xmm0, %xmm0;
  2070. vaesdec (1 * 16)(%edi), %xmm0, %xmm0;
  2071. vaesdec (2 * 16)(%edi), %xmm0, %xmm0;
  2072. vaesdec (3 * 16)(%edi), %xmm0, %xmm0;
  2073. vaesdec (4 * 16)(%edi), %xmm0, %xmm0;
  2074. vaesdec (5 * 16)(%edi), %xmm0, %xmm0;
  2075. vaesdec (6 * 16)(%edi), %xmm0, %xmm0;
  2076. vaesdec (7 * 16)(%edi), %xmm0, %xmm0;
  2077. vaesdec (8 * 16)(%edi), %xmm0, %xmm0;
  2078. vaesdec (9 * 16)(%edi), %xmm0, %xmm0;
  2079. vmovdqa (10 * 16)(%edi), %xmm1;
  2080. cmpl $12, 4+24(%ebp);
  2081. jb .Lxts_dec_blk1_last;
  2082. vaesdec %xmm1, %xmm0, %xmm0;
  2083. vaesdec (11 * 16)(%edi), %xmm0, %xmm0;
  2084. vmovdqa (12 * 16)(%edi), %xmm1;
  2085. jz .Lxts_dec_blk1_last;
  2086. vaesdec %xmm1, %xmm0, %xmm0;
  2087. vaesdec (13 * 16)(%edi), %xmm0, %xmm0;
  2088. vmovdqa (14 * 16)(%edi), %xmm1;
  2089. /* Last round and output handling. */
  2090. .Lxts_dec_blk1_last:
  2091. vpxor %xmm1, %xmm5, %xmm5; /* Xor tweak to last round key. */
  2092. vaesdeclast %xmm5, %xmm0, %xmm0;
  2093. vmovdqu %xmm0, (%edx);
  2094. leal 16(%edx), %edx;
  2095. jmp .Lxts_crypt_blk1;
	.align 8
.Ldone_xts_crypt:
	/* Store IV. */
	vmovdqu %xmm7, (%esi);

	vpxor %ymm0, %ymm0, %ymm0;
	movl (4 * 32 + 0 * 4)(%esp), %edi;
	CFI_RESTORE(edi);
	movl (4 * 32 + 1 * 4)(%esp), %esi;
	CFI_RESTORE(esi);
	movl (4 * 32 + 2 * 4)(%esp), %ebx;
	CFI_RESTORE(ebx);
	vmovdqa %ymm0, (0 * 32)(%esp);
	vmovdqa %ymm0, (1 * 32)(%esp);
	vmovdqa %ymm0, (2 * 32)(%esp);
	vmovdqa %ymm0, (3 * 32)(%esp);

	leave;
	CFI_LEAVE();
	vzeroall;
	xorl %eax, %eax;
	ret_spec_stop
	CFI_ENDPROC();
ELF(.size SYM_NAME(_gcry_vaes_avx2_xts_crypt_i386),
	  .-SYM_NAME(_gcry_vaes_avx2_xts_crypt_i386))

/**********************************************************************
  ECB-mode encryption
 **********************************************************************/
ELF(.type SYM_NAME(_gcry_vaes_avx2_ecb_crypt_i386),@function)
.globl SYM_NAME(_gcry_vaes_avx2_ecb_crypt_i386)
.align 16
SYM_NAME(_gcry_vaes_avx2_ecb_crypt_i386):
	/* input:
	 *	(esp + 4): round keys
	 *	(esp + 8): encrypt
	 *	(esp + 12): dst
	 *	(esp + 16): src
	 *	(esp + 20): nblocks
	 *	(esp + 24): nrounds
	 */
	CFI_STARTPROC();

	pushl %edi;
	CFI_PUSH(%edi);
	pushl %esi;
	CFI_PUSH(%esi);

	movl 8+4(%esp), %edi;
	movl 8+8(%esp), %esi;
	movl 8+12(%esp), %edx;
	movl 8+16(%esp), %ecx;
	movl 8+20(%esp), %eax;
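	/* Blocks are handled two per YMM register; each round key is
	 * broadcast to both 128-bit lanes with vbroadcasti128 so one
	 * VAESENC4/VAESDEC4 step advances eight blocks at a time. */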
  2144. /* Process 8 blocks per loop. */
  2145. .align 8
  2146. .Lecb_blk8:
  2147. cmpl $8, %eax;
  2148. jb .Lecb_blk4;
  2149. leal -8(%eax), %eax;
  2150. /* Load input and xor first key. */
  2151. vbroadcasti128 (0 * 16)(%edi), %ymm4;
  2152. vmovdqu (0 * 16)(%ecx), %ymm0;
  2153. vmovdqu (2 * 16)(%ecx), %ymm1;
  2154. vmovdqu (4 * 16)(%ecx), %ymm2;
  2155. vmovdqu (6 * 16)(%ecx), %ymm3;
  2156. vpxor %ymm4, %ymm0, %ymm0;
  2157. vpxor %ymm4, %ymm1, %ymm1;
  2158. vpxor %ymm4, %ymm2, %ymm2;
  2159. vpxor %ymm4, %ymm3, %ymm3;
  2160. vbroadcasti128 (1 * 16)(%edi), %ymm4;
  2161. leal (8 * 16)(%ecx), %ecx;
  2162. testl %esi, %esi;
  2163. jz .Lecb_dec_blk8;
  2164. /* AES rounds */
  2165. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2166. vbroadcasti128 (2 * 16)(%edi), %ymm4;
  2167. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2168. vbroadcasti128 (3 * 16)(%edi), %ymm4;
  2169. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2170. vbroadcasti128 (4 * 16)(%edi), %ymm4;
  2171. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2172. vbroadcasti128 (5 * 16)(%edi), %ymm4;
  2173. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2174. vbroadcasti128 (6 * 16)(%edi), %ymm4;
  2175. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2176. vbroadcasti128 (7 * 16)(%edi), %ymm4;
  2177. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2178. vbroadcasti128 (8 * 16)(%edi), %ymm4;
  2179. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2180. vbroadcasti128 (9 * 16)(%edi), %ymm4;
  2181. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2182. vbroadcasti128 (10 * 16)(%edi), %ymm4;
  2183. cmpl $12, 8+24(%esp);
  2184. jb .Lecb_enc_blk8_last;
  2185. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2186. vbroadcasti128 (11 * 16)(%edi), %ymm4;
  2187. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2188. vbroadcasti128 (12 * 16)(%edi), %ymm4;
  2189. jz .Lecb_enc_blk8_last;
  2190. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2191. vbroadcasti128 (13 * 16)(%edi), %ymm4;
  2192. VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2193. vbroadcasti128 (14 * 16)(%edi), %ymm4;
  2194. .Lecb_enc_blk8_last:
  2195. vaesenclast %ymm4, %ymm0, %ymm0;
  2196. vaesenclast %ymm4, %ymm1, %ymm1;
  2197. vaesenclast %ymm4, %ymm2, %ymm2;
  2198. vaesenclast %ymm4, %ymm3, %ymm3;
  2199. vmovdqu %ymm0, (0 * 16)(%edx);
  2200. vmovdqu %ymm1, (2 * 16)(%edx);
  2201. vmovdqu %ymm2, (4 * 16)(%edx);
  2202. vmovdqu %ymm3, (6 * 16)(%edx);
  2203. leal (8 * 16)(%edx), %edx;
  2204. jmp .Lecb_blk8;
  2205. .align 8
  2206. .Lecb_dec_blk8:
  2207. /* AES rounds */
  2208. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2209. vbroadcasti128 (2 * 16)(%edi), %ymm4;
  2210. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2211. vbroadcasti128 (3 * 16)(%edi), %ymm4;
  2212. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2213. vbroadcasti128 (4 * 16)(%edi), %ymm4;
  2214. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2215. vbroadcasti128 (5 * 16)(%edi), %ymm4;
  2216. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2217. vbroadcasti128 (6 * 16)(%edi), %ymm4;
  2218. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2219. vbroadcasti128 (7 * 16)(%edi), %ymm4;
  2220. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2221. vbroadcasti128 (8 * 16)(%edi), %ymm4;
  2222. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2223. vbroadcasti128 (9 * 16)(%edi), %ymm4;
  2224. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2225. vbroadcasti128 (10 * 16)(%edi), %ymm4;
  2226. cmpl $12, 8+24(%esp);
  2227. jb .Lecb_dec_blk8_last;
  2228. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2229. vbroadcasti128 (11 * 16)(%edi), %ymm4;
  2230. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2231. vbroadcasti128 (12 * 16)(%edi), %ymm4;
  2232. jz .Lecb_dec_blk8_last;
  2233. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2234. vbroadcasti128 (13 * 16)(%edi), %ymm4;
  2235. VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
  2236. vbroadcasti128 (14 * 16)(%edi), %ymm4;
  2237. .Lecb_dec_blk8_last:
  2238. vaesdeclast %ymm4, %ymm0, %ymm0;
  2239. vaesdeclast %ymm4, %ymm1, %ymm1;
  2240. vaesdeclast %ymm4, %ymm2, %ymm2;
  2241. vaesdeclast %ymm4, %ymm3, %ymm3;
  2242. vmovdqu %ymm0, (0 * 16)(%edx);
  2243. vmovdqu %ymm1, (2 * 16)(%edx);
  2244. vmovdqu %ymm2, (4 * 16)(%edx);
  2245. vmovdqu %ymm3, (6 * 16)(%edx);
  2246. leal (8 * 16)(%edx), %edx;
  2247. jmp .Lecb_blk8;
  2248. /* Handle trailing four blocks. */
  2249. .align 8
  2250. .Lecb_blk4:
  2251. cmpl $4, %eax;
  2252. jb .Lecb_blk1;
  2253. leal -4(%eax), %eax;
  2254. /* Load input and xor first key. */
  2255. vbroadcasti128 (0 * 16)(%edi), %ymm4;
  2256. vmovdqu (0 * 16)(%ecx), %ymm0;
  2257. vmovdqu (2 * 16)(%ecx), %ymm1;
  2258. vpxor %ymm4, %ymm0, %ymm0;
  2259. vpxor %ymm4, %ymm1, %ymm1;
  2260. vbroadcasti128 (1 * 16)(%edi), %ymm4;
  2261. leal (4 * 16)(%ecx), %ecx;
  2262. testl %esi, %esi;
  2263. jz .Lecb_dec_blk4;
  2264. /* AES rounds */
  2265. VAESENC2(%ymm4, %ymm0, %ymm1);
  2266. vbroadcasti128 (2 * 16)(%edi), %ymm4;
  2267. VAESENC2(%ymm4, %ymm0, %ymm1);
  2268. vbroadcasti128 (3 * 16)(%edi), %ymm4;
  2269. VAESENC2(%ymm4, %ymm0, %ymm1);
  2270. vbroadcasti128 (4 * 16)(%edi), %ymm4;
  2271. VAESENC2(%ymm4, %ymm0, %ymm1);
  2272. vbroadcasti128 (5 * 16)(%edi), %ymm4;
  2273. VAESENC2(%ymm4, %ymm0, %ymm1);
  2274. vbroadcasti128 (6 * 16)(%edi), %ymm4;
  2275. VAESENC2(%ymm4, %ymm0, %ymm1);
  2276. vbroadcasti128 (7 * 16)(%edi), %ymm4;
  2277. VAESENC2(%ymm4, %ymm0, %ymm1);
  2278. vbroadcasti128 (8 * 16)(%edi), %ymm4;
  2279. VAESENC2(%ymm4, %ymm0, %ymm1);
  2280. vbroadcasti128 (9 * 16)(%edi), %ymm4;
  2281. VAESENC2(%ymm4, %ymm0, %ymm1);
  2282. vbroadcasti128 (10 * 16)(%edi), %ymm4;
  2283. cmpl $12, 8+24(%esp);
  2284. jb .Lecb_enc_blk4_last;
  2285. VAESENC2(%ymm4, %ymm0, %ymm1);
  2286. vbroadcasti128 (11 * 16)(%edi), %ymm4;
  2287. VAESENC2(%ymm4, %ymm0, %ymm1);
  2288. vbroadcasti128 (12 * 16)(%edi), %ymm4;
  2289. jz .Lecb_enc_blk4_last;
  2290. VAESENC2(%ymm4, %ymm0, %ymm1);
  2291. vbroadcasti128 (13 * 16)(%edi), %ymm4;
  2292. VAESENC2(%ymm4, %ymm0, %ymm1);
  2293. vbroadcasti128 (14 * 16)(%edi), %ymm4;
  2294. .Lecb_enc_blk4_last:
  2295. vaesenclast %ymm4, %ymm0, %ymm0;
  2296. vaesenclast %ymm4, %ymm1, %ymm1;
  2297. vmovdqu %ymm0, (0 * 16)(%edx);
  2298. vmovdqu %ymm1, (2 * 16)(%edx);
  2299. leal (4 * 16)(%edx), %edx;
  2300. jmp .Lecb_blk1;
  2301. .align 8
  2302. .Lecb_dec_blk4:
  2303. /* AES rounds */
  2304. VAESDEC2(%ymm4, %ymm0, %ymm1);
  2305. vbroadcasti128 (2 * 16)(%edi), %ymm4;
  2306. VAESDEC2(%ymm4, %ymm0, %ymm1);
  2307. vbroadcasti128 (3 * 16)(%edi), %ymm4;
  2308. VAESDEC2(%ymm4, %ymm0, %ymm1);
  2309. vbroadcasti128 (4 * 16)(%edi), %ymm4;
  2310. VAESDEC2(%ymm4, %ymm0, %ymm1);
  2311. vbroadcasti128 (5 * 16)(%edi), %ymm4;
  2312. VAESDEC2(%ymm4, %ymm0, %ymm1);
  2313. vbroadcasti128 (6 * 16)(%edi), %ymm4;
  2314. VAESDEC2(%ymm4, %ymm0, %ymm1);
  2315. vbroadcasti128 (7 * 16)(%edi), %ymm4;
  2316. VAESDEC2(%ymm4, %ymm0, %ymm1);
  2317. vbroadcasti128 (8 * 16)(%edi), %ymm4;
  2318. VAESDEC2(%ymm4, %ymm0, %ymm1);
  2319. vbroadcasti128 (9 * 16)(%edi), %ymm4;
  2320. VAESDEC2(%ymm4, %ymm0, %ymm1);
  2321. vbroadcasti128 (10 * 16)(%edi), %ymm4;
  2322. cmpl $12, 8+24(%esp);
  2323. jb .Lecb_dec_blk4_last;
  2324. VAESDEC2(%ymm4, %ymm0, %ymm1);
  2325. vbroadcasti128 (11 * 16)(%edi), %ymm4;
  2326. VAESDEC2(%ymm4, %ymm0, %ymm1);
  2327. vbroadcasti128 (12 * 16)(%edi), %ymm4;
  2328. jz .Lecb_dec_blk4_last;
  2329. VAESDEC2(%ymm4, %ymm0, %ymm1);
  2330. vbroadcasti128 (13 * 16)(%edi), %ymm4;
  2331. VAESDEC2(%ymm4, %ymm0, %ymm1);
  2332. vbroadcasti128 (14 * 16)(%edi), %ymm4;
  2333. .Lecb_dec_blk4_last:
  2334. vaesdeclast %ymm4, %ymm0, %ymm0;
  2335. vaesdeclast %ymm4, %ymm1, %ymm1;
  2336. vmovdqu %ymm0, (0 * 16)(%edx);
  2337. vmovdqu %ymm1, (2 * 16)(%edx);
  2338. leal (4 * 16)(%edx), %edx;
  2339. /* Process trailing one to three blocks, one per loop. */
  2340. .align 8
  2341. .Lecb_blk1:
  2342. cmpl $1, %eax;
  2343. jb .Ldone_ecb;
  2344. leal -1(%eax), %eax;
  2345. /* Load input. */
  2346. vmovdqu (%ecx), %xmm2;
  2347. leal 16(%ecx), %ecx;
  2348. /* Xor first key. */
  2349. vpxor (0 * 16)(%edi), %xmm2, %xmm0;
  2350. testl %esi, %esi;
  2351. jz .Lecb_dec_blk1;
  2352. /* AES rounds. */
  2353. vaesenc (1 * 16)(%edi), %xmm0, %xmm0;
  2354. vaesenc (2 * 16)(%edi), %xmm0, %xmm0;
  2355. vaesenc (3 * 16)(%edi), %xmm0, %xmm0;
  2356. vaesenc (4 * 16)(%edi), %xmm0, %xmm0;
  2357. vaesenc (5 * 16)(%edi), %xmm0, %xmm0;
  2358. vaesenc (6 * 16)(%edi), %xmm0, %xmm0;
  2359. vaesenc (7 * 16)(%edi), %xmm0, %xmm0;
  2360. vaesenc (8 * 16)(%edi), %xmm0, %xmm0;
  2361. vaesenc (9 * 16)(%edi), %xmm0, %xmm0;
  2362. vmovdqa (10 * 16)(%edi), %xmm1;
  2363. cmpl $12, 8+24(%esp);
  2364. jb .Lecb_enc_blk1_last;
  2365. vaesenc %xmm1, %xmm0, %xmm0;
  2366. vaesenc (11 * 16)(%edi), %xmm0, %xmm0;
  2367. vmovdqa (12 * 16)(%edi), %xmm1;
  2368. jz .Lecb_enc_blk1_last;
  2369. vaesenc %xmm1, %xmm0, %xmm0;
  2370. vaesenc (13 * 16)(%edi), %xmm0, %xmm0;
  2371. vmovdqa (14 * 16)(%edi), %xmm1;
  2372. .Lecb_enc_blk1_last:
  2373. vaesenclast %xmm1, %xmm0, %xmm0;
  2374. jmp .Lecb_blk1_end;
  2375. .align 8
  2376. .Lecb_dec_blk1:
  2377. /* AES rounds. */
  2378. vaesdec (1 * 16)(%edi), %xmm0, %xmm0;
  2379. vaesdec (2 * 16)(%edi), %xmm0, %xmm0;
  2380. vaesdec (3 * 16)(%edi), %xmm0, %xmm0;
  2381. vaesdec (4 * 16)(%edi), %xmm0, %xmm0;
  2382. vaesdec (5 * 16)(%edi), %xmm0, %xmm0;
  2383. vaesdec (6 * 16)(%edi), %xmm0, %xmm0;
  2384. vaesdec (7 * 16)(%edi), %xmm0, %xmm0;
  2385. vaesdec (8 * 16)(%edi), %xmm0, %xmm0;
  2386. vaesdec (9 * 16)(%edi), %xmm0, %xmm0;
  2387. vmovdqa (10 * 16)(%edi), %xmm1;
  2388. cmpl $12, 8+24(%esp);
  2389. jb .Lecb_dec_blk1_last;
  2390. vaesdec %xmm1, %xmm0, %xmm0;
  2391. vaesdec (11 * 16)(%edi), %xmm0, %xmm0;
  2392. vmovdqa (12 * 16)(%edi), %xmm1;
  2393. jz .Lecb_dec_blk1_last;
  2394. vaesdec %xmm1, %xmm0, %xmm0;
  2395. vaesdec (13 * 16)(%edi), %xmm0, %xmm0;
  2396. vmovdqa (14 * 16)(%edi), %xmm1;
  2397. .Lecb_dec_blk1_last:
  2398. vaesdeclast %xmm1, %xmm0, %xmm0;
  2399. jmp .Lecb_blk1_end;
	.align 8
.Lecb_blk1_end:
	vmovdqu %xmm0, (%edx);
	leal 16(%edx), %edx;
	jmp .Lecb_blk1;

	.align 8
.Ldone_ecb:
	popl %esi;
	CFI_POP(%esi);
	popl %edi;
	CFI_POP(%edi);
	vzeroall;
	ret_spec_stop
	CFI_ENDPROC();
ELF(.size SYM_NAME(_gcry_vaes_avx2_ecb_crypt_i386),
	  .-SYM_NAME(_gcry_vaes_avx2_ecb_crypt_i386))

/**********************************************************************
  constants
 **********************************************************************/
SECTION_RODATA

ELF(.type SYM_NAME(_gcry_vaes_consts),@object)
.align 32
SYM_NAME(_gcry_vaes_consts):
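/* Addend tables, one 16-byte vector per value: .Lbige_addb_N has N in
 * the last byte (big-endian byte-wise additions) and .Lle_addd_N has N
 * in the first byte (little-endian dword additions). */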
.Lbige_addb_0:
	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lbige_addb_1:
	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
.Lbige_addb_2:
	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
.Lbige_addb_3:
	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
.Lbige_addb_4:
	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
.Lbige_addb_5:
	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
.Lbige_addb_6:
	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
.Lbige_addb_7:
	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
.Lbige_addb_8:
	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
.Lbige_addb_9:
	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
.Lbige_addb_10:
	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
.Lbige_addb_11:
	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
.Lle_addd_0:
	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_1:
	.byte 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_2:
	.byte 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_3:
	.byte 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_4:
	.byte 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_5:
	.byte 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_6:
	.byte 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_7:
	.byte 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_8:
	.byte 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_9:
	.byte 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_10:
	.byte 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_11:
	.byte 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_4_2:
	.byte 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	.byte 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.Lle_addd_12_2:
	.byte 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	.byte 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
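/* 0x87 is the low part of the GF(2^128) reduction polynomial
 * x^128 + x^7 + x^2 + x + 1 used by XTS; tweak_clmul() multiplies the
 * carried-out tweak bits by it with vpclmulqdq.  .Lxts_high_bit_shuf
 * pre-arranges the top bits of each 64-bit tweak half so those carries
 * can be extracted with a single shift. */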
.Lxts_gfmul_clmul:
	.long 0x00, 0x87, 0x00, 0x00
	.long 0x00, 0x87, 0x00, 0x00
.Lxts_high_bit_shuf:
	.byte -1, -1, -1, -1, 12, 13, 14, 15
	.byte 4, 5, 6, 7, -1, -1, -1, -1
	.byte -1, -1, -1, -1, 12, 13, 14, 15
	.byte 4, 5, 6, 7, -1, -1, -1, -1
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
ELF(.size SYM_NAME(_gcry_vaes_consts),.-SYM_NAME(_gcry_vaes_consts))

#endif /* HAVE_GCC_INLINE_ASM_VAES */
#endif /* __i386__ */