aesni-intel_avx-x86_64.S

  1. ########################################################################
  2. # Copyright (c) 2013, Intel Corporation
  3. #
  4. # This software is available to you under a choice of one of two
  5. # licenses. You may choose to be licensed under the terms of the GNU
  6. # General Public License (GPL) Version 2, available from the file
  7. # COPYING in the main directory of this source tree, or the
  8. # OpenIB.org BSD license below:
  9. #
  10. # Redistribution and use in source and binary forms, with or without
  11. # modification, are permitted provided that the following conditions are
  12. # met:
  13. #
  14. # * Redistributions of source code must retain the above copyright
  15. # notice, this list of conditions and the following disclaimer.
  16. #
  17. # * Redistributions in binary form must reproduce the above copyright
  18. # notice, this list of conditions and the following disclaimer in the
  19. # documentation and/or other materials provided with the
  20. # distribution.
  21. #
  22. # * Neither the name of the Intel Corporation nor the names of its
  23. # contributors may be used to endorse or promote products derived from
  24. # this software without specific prior written permission.
  25. #
  26. #
  27. # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
  28. # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  29. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  30. # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
  31. # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  32. # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  33. # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR
  34. # PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  35. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  36. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  37. # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  38. ########################################################################
  39. ##
  40. ## Authors:
  41. ## Erdinc Ozturk <erdinc.ozturk@intel.com>
  42. ## Vinodh Gopal <vinodh.gopal@intel.com>
  43. ## James Guilford <james.guilford@intel.com>
  44. ## Tim Chen <tim.c.chen@linux.intel.com>
  45. ##
  46. ## References:
  47. ## This code was derived and highly optimized from the code described in the paper:
  48. ## Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
  49. ## on Intel Architecture Processors. August, 2010
  50. ## The details of the implementation are explained in:
  51. ## Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
  52. ## on Intel Architecture Processors. October, 2012.
  53. ##
  54. ## Assumptions:
  55. ##
  56. ##
  57. ##
  58. ## iv:
  59. ## 0 1 2 3
  60. ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  61. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  62. ## | Salt (From the SA) |
  63. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  64. ## | Initialization Vector |
  65. ## | (This is the sequence number from IPSec header) |
  66. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  67. ## | 0x1 |
  68. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  69. ##
  70. ##
  71. ##
  72. ## AAD:
  73. ## AAD padded to 128 bits with 0
  74. ## for example, assume AAD is a u32 vector
  75. ##
  76. ## if AAD is 8 bytes:
  77. ## AAD[3] = {A0, A1}#
  78. ## padded AAD in xmm register = {A1 A0 0 0}
  79. ##
  80. ## 0 1 2 3
  81. ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  82. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  83. ## | SPI (A1) |
  84. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  85. ## | 32-bit Sequence Number (A0) |
  86. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  87. ## | 0x0 |
  88. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  89. ##
  90. ## AAD Format with 32-bit Sequence Number
  91. ##
  92. ## if AAD is 12 bytes:
  93. ## AAD[3] = {A0, A1, A2}#
  94. ## padded AAD in xmm register = {A2 A1 A0 0}
  95. ##
  96. ## 0 1 2 3
  97. ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  98. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  99. ## | SPI (A2) |
  100. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  101. ## | 64-bit Extended Sequence Number {A1,A0} |
  102. ## | |
  103. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  104. ## | 0x0 |
  105. ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  106. ##
  107. ## AAD Format with 64-bit Extended Sequence Number
  108. ##
  109. ##
  110. ## aadLen:
  111. ## as defined in the spec, aadLen can only be 8 or 12 bytes.
  112. ## The code additionally supports an aadLen of 16 bytes.
  113. ##
  114. ## TLen:
  115. ## as defined in the spec, TLen can only be 8, 12 or 16 bytes.
  116. ##
  117. ## poly = x^128 + x^127 + x^126 + x^121 + 1
  118. ## throughout the code, one-tab and two-tab indentations are used. one tab
  119. ## is for the GHASH part, two tabs are for the AES part.
  120. ##
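##
## GHASH recurrence computed by this code (standard GCM, for reference):
##     X_0 = 0
##     X_i = (X_(i-1) xor B_i) * H        (carry-less multiply, reduced mod poly)
## where the B_i are the 128-bit AAD blocks, then the ciphertext blocks, and
## finally the block len(A)||len(C); H is the GCM hash key (the context below
## stores H<<1 mod poly, see HashKey).
##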
  121. #include <linux/linkage.h>
  122. #include <asm/inst.h>
  123. # constants in mergeable sections, linker can reorder and merge
  124. .section .rodata.cst16.POLY, "aM", @progbits, 16
  125. .align 16
  126. POLY: .octa 0xC2000000000000000000000000000001
  127. .section .rodata.cst16.POLY2, "aM", @progbits, 16
  128. .align 16
  129. POLY2: .octa 0xC20000000000000000000001C2000000
  130. .section .rodata.cst16.TWOONE, "aM", @progbits, 16
  131. .align 16
  132. TWOONE: .octa 0x00000001000000000000000000000001
  133. .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
  134. .align 16
  135. SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
  136. .section .rodata.cst16.ONE, "aM", @progbits, 16
  137. .align 16
  138. ONE: .octa 0x00000000000000000000000000000001
  139. .section .rodata.cst16.ONEf, "aM", @progbits, 16
  140. .align 16
  141. ONEf: .octa 0x01000000000000000000000000000000
  142. # order of these constants should not change.
  143. # more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
  144. .section .rodata, "a", @progbits
  145. .align 16
  146. SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  147. ALL_F: .octa 0xffffffffffffffffffffffffffffffff
  148. .octa 0x00000000000000000000000000000000
  149. .section .rodata
  150. .align 16
  151. .type aad_shift_arr, @object
  152. .size aad_shift_arr, 272
  153. aad_shift_arr:
  154. .octa 0xffffffffffffffffffffffffffffffff
  155. .octa 0xffffffffffffffffffffffffffffff0C
  156. .octa 0xffffffffffffffffffffffffffff0D0C
  157. .octa 0xffffffffffffffffffffffffff0E0D0C
  158. .octa 0xffffffffffffffffffffffff0F0E0D0C
  159. .octa 0xffffffffffffffffffffff0C0B0A0908
  160. .octa 0xffffffffffffffffffff0D0C0B0A0908
  161. .octa 0xffffffffffffffffff0E0D0C0B0A0908
  162. .octa 0xffffffffffffffff0F0E0D0C0B0A0908
  163. .octa 0xffffffffffffff0C0B0A090807060504
  164. .octa 0xffffffffffff0D0C0B0A090807060504
  165. .octa 0xffffffffff0E0D0C0B0A090807060504
  166. .octa 0xffffffff0F0E0D0C0B0A090807060504
  167. .octa 0xffffff0C0B0A09080706050403020100
  168. .octa 0xffff0D0C0B0A09080706050403020100
  169. .octa 0xff0E0D0C0B0A09080706050403020100
  170. .octa 0x0F0E0D0C0B0A09080706050403020100
  171. .text
  172. ## define the fields of the gcm aes context
  173. #{
  174. # u8 expanded_keys[16*11] store expanded keys
  175. # u8 shifted_hkey_1[16] store HashKey <<1 mod poly here
  176. # u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here
  177. # u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here
  178. # u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here
  179. # u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here
  180. # u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here
  181. # u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here
  182. # u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here
  183. # u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
  184. # u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
  185. # u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
  186. # u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
  187. # u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
  188. # u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
  189. # u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
  190. # u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
  191. #} gcm_ctx#
  192. HashKey = 16*11 # store HashKey <<1 mod poly here
  193. HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here
  194. HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here
  195. HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here
  196. HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here
  197. HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here
  198. HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here
  199. HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here
  200. HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
  201. HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
  202. HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
  203. HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
  204. HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
  205. HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
  206. HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
  207. HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
  208. #define arg1 %rdi
  209. #define arg2 %rsi
  210. #define arg3 %rdx
  211. #define arg4 %rcx
  212. #define arg5 %r8
  213. #define arg6 %r9
  214. #define arg7 STACK_OFFSET+8*1(%r14)
  215. #define arg8 STACK_OFFSET+8*2(%r14)
  216. #define arg9 STACK_OFFSET+8*3(%r14)
  217. i = 0
  218. j = 0
  219. out_order = 0
  220. in_order = 1
  221. DEC = 0
  222. ENC = 1
  223. .macro define_reg r n
  224. reg_\r = %xmm\n
  225. .endm
  226. .macro setreg
  227. .altmacro
  228. define_reg i %i
  229. define_reg j %j
  230. .noaltmacro
  231. .endm
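# define_reg/setreg rely on .altmacro so that "%i" and "%j" expand to the
# current numeric values of the assembler symbols i and j; setreg must be
# re-invoked after every assignment to i or j so that reg_i/reg_j name the
# intended %xmm<i>/%xmm<j> registers.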
  232. # 4 registers are pushed onto the stack, so the stack arguments (arg7..arg9) sit at STACK_OFFSET above %r14
  233. STACK_OFFSET = 8*4
  234. TMP1 = 16*0 # Temporary storage for AAD
  235. TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
  236. TMP3 = 16*2 # Temporary storage for AES State 3
  237. TMP4 = 16*3 # Temporary storage for AES State 4
  238. TMP5 = 16*4 # Temporary storage for AES State 5
  239. TMP6 = 16*5 # Temporary storage for AES State 6
  240. TMP7 = 16*6 # Temporary storage for AES State 7
  241. TMP8 = 16*7 # Temporary storage for AES State 8
  242. VARIABLE_OFFSET = 16*8
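# Stack scratch layout: TMP1..TMP8 are eight 16-byte slots at 16*0..16*7 from
# the 64-byte-aligned %rsp; VARIABLE_OFFSET = 16*8 is the total scratch area
# reserved below the saved registers.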
  243. ################################
  244. # Utility Macros
  245. ################################
  246. # Encryption of a single block
  247. .macro ENCRYPT_SINGLE_BLOCK XMM0
  248. vpxor (arg1), \XMM0, \XMM0
  249. i = 1
  250. setreg
  251. .rep 9
  252. vaesenc 16*i(arg1), \XMM0, \XMM0
  253. i = (i+1)
  254. setreg
  255. .endr
  256. vaesenclast 16*10(arg1), \XMM0, \XMM0
  257. .endm
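# Note: ENCRYPT_SINGLE_BLOCK assumes an AES-128 key schedule, i.e. 11 round
# keys stored at arg1 + 16*0 .. arg1 + 16*10 (initial AddRoundKey, 9 full
# rounds, 1 final round).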
  258. #ifdef CONFIG_AS_AVX
  259. ###############################################################################
  260. # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
  261. # Input: A and B (128-bits each, bit-reflected)
  262. # Output: C = A*B*x mod poly, (i.e. >>1 )
  263. # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
  264. # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
  265. ###############################################################################
  266. .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
  267. vpshufd $0b01001110, \GH, \T2
  268. vpshufd $0b01001110, \HK, \T3
  269. vpxor \GH , \T2, \T2 # T2 = (a1+a0)
  270. vpxor \HK , \T3, \T3 # T3 = (b1+b0)
  271. vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
  272. vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
  273. vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
  274. vpxor \GH, \T2,\T2
  275. vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
  276. vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
  277. vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
  278. vpxor \T3, \GH, \GH
  279. vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
  280. #first phase of the reduction
  281. vpslld $31, \GH, \T2 # packed left shift << 31
  282. vpslld $30, \GH, \T3 # packed left shift << 30
  283. vpslld $25, \GH, \T4 # packed left shift << 25
  284. vpxor \T3, \T2, \T2 # xor the shifted versions
  285. vpxor \T4, \T2, \T2
  286. vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
  287. vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
  288. vpxor \T2, \GH, \GH # first phase of the reduction complete
  289. #second phase of the reduction
  290. vpsrld $1,\GH, \T2 # packed right shift >> 1
  291. vpsrld $2,\GH, \T3 # packed right shift >> 2
  292. vpsrld $7,\GH, \T4 # packed right shift >> 7
  293. vpxor \T3, \T2, \T2 # xor the shifted versions
  294. vpxor \T4, \T2, \T2
  295. vpxor \T5, \T2, \T2
  296. vpxor \T2, \GH, \GH
  297. vpxor \T1, \GH, \GH # the result is in GH
  298. .endm
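# For reference, GHASH_MUL_AVX uses one level of Karatsuba: with
#     A = a1*x^64 + a0 and B = b1*x^64 + b0 (carry-less),
#     A*B = a1*b1*x^128 + [(a1+a0)*(b1+b0) + a1*b1 + a0*b0]*x^64 + a0*b0
# so only three vpclmulqdq are needed instead of four ('+' is XOR).
# The shift amounts in the two reduction phases follow from the polynomial:
# x^127, x^126 and x^121 lie 1, 2 and 7 bits below x^128, giving the
# >>1 / >>2 / >>7 folds and their 32-bit complements <<31 / <<30 / <<25.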
  299. .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
  300. # HashKey_i_k holds the XORed low and high parts of HashKey_i (for Karatsuba)
  301. vmovdqa \HK, \T5
  302. vpshufd $0b01001110, \T5, \T1
  303. vpxor \T5, \T1, \T1
  304. vmovdqa \T1, HashKey_k(arg1)
  305. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
  306. vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
  307. vpshufd $0b01001110, \T5, \T1
  308. vpxor \T5, \T1, \T1
  309. vmovdqa \T1, HashKey_2_k(arg1)
  310. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
  311. vmovdqa \T5, HashKey_3(arg1)
  312. vpshufd $0b01001110, \T5, \T1
  313. vpxor \T5, \T1, \T1
  314. vmovdqa \T1, HashKey_3_k(arg1)
  315. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
  316. vmovdqa \T5, HashKey_4(arg1)
  317. vpshufd $0b01001110, \T5, \T1
  318. vpxor \T5, \T1, \T1
  319. vmovdqa \T1, HashKey_4_k(arg1)
  320. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
  321. vmovdqa \T5, HashKey_5(arg1)
  322. vpshufd $0b01001110, \T5, \T1
  323. vpxor \T5, \T1, \T1
  324. vmovdqa \T1, HashKey_5_k(arg1)
  325. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
  326. vmovdqa \T5, HashKey_6(arg1)
  327. vpshufd $0b01001110, \T5, \T1
  328. vpxor \T5, \T1, \T1
  329. vmovdqa \T1, HashKey_6_k(arg1)
  330. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
  331. vmovdqa \T5, HashKey_7(arg1)
  332. vpshufd $0b01001110, \T5, \T1
  333. vpxor \T5, \T1, \T1
  334. vmovdqa \T1, HashKey_7_k(arg1)
  335. GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
  336. vmovdqa \T5, HashKey_8(arg1)
  337. vpshufd $0b01001110, \T5, \T1
  338. vpxor \T5, \T1, \T1
  339. vmovdqa \T1, HashKey_8_k(arg1)
  340. .endm
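# Precomputing HashKey^1..HashKey^8 (each <<1 mod poly) plus their Karatsuba
# helpers lets the main loop fold eight blocks with independent multiplies:
#     X_new = (X_old + C1)*H^8 + C2*H^7 + ... + C8*H    ('+' is XOR)
# followed by a single reduction.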
  341. ## if a = number of total plaintext bytes
  342. ## b = floor(a/16)
  343. ## num_initial_blocks = b mod 8
  344. ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
  345. ## r10, r11, r12, rax are clobbered
  346. ## arg1, arg2, arg3, r14 are used as a pointer only, not modified
  347. .macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
  348. i = (8-\num_initial_blocks)
  349. j = 0
  350. setreg
  351. mov arg6, %r10 # r10 = AAD
  352. mov arg7, %r12 # r12 = aadLen
  353. mov %r12, %r11
  354. vpxor reg_j, reg_j, reg_j
  355. vpxor reg_i, reg_i, reg_i
  356. cmp $16, %r11
  357. jl _get_AAD_rest8\@
  358. _get_AAD_blocks\@:
  359. vmovdqu (%r10), reg_i
  360. vpshufb SHUF_MASK(%rip), reg_i, reg_i
  361. vpxor reg_i, reg_j, reg_j
  362. GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6
  363. add $16, %r10
  364. sub $16, %r12
  365. sub $16, %r11
  366. cmp $16, %r11
  367. jge _get_AAD_blocks\@
  368. vmovdqu reg_j, reg_i
  369. cmp $0, %r11
  370. je _get_AAD_done\@
  371. vpxor reg_i, reg_i, reg_i
  372. /* read the last <16B of AAD. since we have at least 4B of
  373. data right after the AAD (the ICV, and maybe some CT), we can
  374. read 4B/8B blocks safely, and then get rid of the extra stuff */
  375. _get_AAD_rest8\@:
  376. cmp $4, %r11
  377. jle _get_AAD_rest4\@
  378. movq (%r10), \T1
  379. add $8, %r10
  380. sub $8, %r11
  381. vpslldq $8, \T1, \T1
  382. vpsrldq $8, reg_i, reg_i
  383. vpxor \T1, reg_i, reg_i
  384. jmp _get_AAD_rest8\@
  385. _get_AAD_rest4\@:
  386. cmp $0, %r11
  387. jle _get_AAD_rest0\@
  388. mov (%r10), %eax
  389. movq %rax, \T1
  390. add $4, %r10
  391. sub $4, %r11
  392. vpslldq $12, \T1, \T1
  393. vpsrldq $4, reg_i, reg_i
  394. vpxor \T1, reg_i, reg_i
  395. _get_AAD_rest0\@:
  396. /* finalize: shift out the extra bytes we read, and align
  397. left. since pslldq can only shift by an immediate, we use
  398. vpshufb and an array of shuffle masks */
  399. movq %r12, %r11
  400. salq $4, %r11
  401. movdqu aad_shift_arr(%r11), \T1
  402. vpshufb \T1, reg_i, reg_i
  403. _get_AAD_rest_final\@:
  404. vpshufb SHUF_MASK(%rip), reg_i, reg_i
  405. vpxor reg_j, reg_i, reg_i
  406. GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6
  407. _get_AAD_done\@:
  408. # initialize the data pointer offset as zero
  409. xor %r11d, %r11d
  410. # start AES for num_initial_blocks blocks
  411. mov arg5, %rax # rax = *Y0
  412. vmovdqu (%rax), \CTR # CTR = Y0
  413. vpshufb SHUF_MASK(%rip), \CTR, \CTR
  414. i = (9-\num_initial_blocks)
  415. setreg
  416. .rep \num_initial_blocks
  417. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  418. vmovdqa \CTR, reg_i
  419. vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
  420. i = (i+1)
  421. setreg
  422. .endr
  423. vmovdqa (arg1), \T_key
  424. i = (9-\num_initial_blocks)
  425. setreg
  426. .rep \num_initial_blocks
  427. vpxor \T_key, reg_i, reg_i
  428. i = (i+1)
  429. setreg
  430. .endr
  431. j = 1
  432. setreg
  433. .rep 9
  434. vmovdqa 16*j(arg1), \T_key
  435. i = (9-\num_initial_blocks)
  436. setreg
  437. .rep \num_initial_blocks
  438. vaesenc \T_key, reg_i, reg_i
  439. i = (i+1)
  440. setreg
  441. .endr
  442. j = (j+1)
  443. setreg
  444. .endr
  445. vmovdqa 16*10(arg1), \T_key
  446. i = (9-\num_initial_blocks)
  447. setreg
  448. .rep \num_initial_blocks
  449. vaesenclast \T_key, reg_i, reg_i
  450. i = (i+1)
  451. setreg
  452. .endr
  453. i = (9-\num_initial_blocks)
  454. setreg
  455. .rep \num_initial_blocks
  456. vmovdqu (arg3, %r11), \T1
  457. vpxor \T1, reg_i, reg_i
  458. vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks
  459. add $16, %r11
  460. .if \ENC_DEC == DEC
  461. vmovdqa \T1, reg_i
  462. .endif
  463. vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
  464. i = (i+1)
  465. setreg
  466. .endr
  467. i = (8-\num_initial_blocks)
  468. j = (9-\num_initial_blocks)
  469. setreg
  470. .rep \num_initial_blocks
  471. vpxor reg_i, reg_j, reg_j
  472. GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
  473. i = (i+1)
  474. j = (j+1)
  475. setreg
  476. .endr
  477. # XMM8 has the combined result here
  478. vmovdqa \XMM8, TMP1(%rsp)
  479. vmovdqa \XMM8, \T3
  480. cmp $128, %r13
  481. jl _initial_blocks_done\@ # no need for precomputed constants
  482. ###############################################################################
  483. # prepare and AES-encrypt eight more counter blocks for the main loop
  484. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  485. vmovdqa \CTR, \XMM1
  486. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  487. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  488. vmovdqa \CTR, \XMM2
  489. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  490. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  491. vmovdqa \CTR, \XMM3
  492. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  493. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  494. vmovdqa \CTR, \XMM4
  495. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  496. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  497. vmovdqa \CTR, \XMM5
  498. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  499. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  500. vmovdqa \CTR, \XMM6
  501. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  502. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  503. vmovdqa \CTR, \XMM7
  504. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  505. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  506. vmovdqa \CTR, \XMM8
  507. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  508. vmovdqa (arg1), \T_key
  509. vpxor \T_key, \XMM1, \XMM1
  510. vpxor \T_key, \XMM2, \XMM2
  511. vpxor \T_key, \XMM3, \XMM3
  512. vpxor \T_key, \XMM4, \XMM4
  513. vpxor \T_key, \XMM5, \XMM5
  514. vpxor \T_key, \XMM6, \XMM6
  515. vpxor \T_key, \XMM7, \XMM7
  516. vpxor \T_key, \XMM8, \XMM8
  517. i = 1
  518. setreg
  519. .rep 9 # do 9 rounds
  520. vmovdqa 16*i(arg1), \T_key
  521. vaesenc \T_key, \XMM1, \XMM1
  522. vaesenc \T_key, \XMM2, \XMM2
  523. vaesenc \T_key, \XMM3, \XMM3
  524. vaesenc \T_key, \XMM4, \XMM4
  525. vaesenc \T_key, \XMM5, \XMM5
  526. vaesenc \T_key, \XMM6, \XMM6
  527. vaesenc \T_key, \XMM7, \XMM7
  528. vaesenc \T_key, \XMM8, \XMM8
  529. i = (i+1)
  530. setreg
  531. .endr
  532. vmovdqa 16*i(arg1), \T_key
  533. vaesenclast \T_key, \XMM1, \XMM1
  534. vaesenclast \T_key, \XMM2, \XMM2
  535. vaesenclast \T_key, \XMM3, \XMM3
  536. vaesenclast \T_key, \XMM4, \XMM4
  537. vaesenclast \T_key, \XMM5, \XMM5
  538. vaesenclast \T_key, \XMM6, \XMM6
  539. vaesenclast \T_key, \XMM7, \XMM7
  540. vaesenclast \T_key, \XMM8, \XMM8
  541. vmovdqu (arg3, %r11), \T1
  542. vpxor \T1, \XMM1, \XMM1
  543. vmovdqu \XMM1, (arg2 , %r11)
  544. .if \ENC_DEC == DEC
  545. vmovdqa \T1, \XMM1
  546. .endif
  547. vmovdqu 16*1(arg3, %r11), \T1
  548. vpxor \T1, \XMM2, \XMM2
  549. vmovdqu \XMM2, 16*1(arg2 , %r11)
  550. .if \ENC_DEC == DEC
  551. vmovdqa \T1, \XMM2
  552. .endif
  553. vmovdqu 16*2(arg3, %r11), \T1
  554. vpxor \T1, \XMM3, \XMM3
  555. vmovdqu \XMM3, 16*2(arg2 , %r11)
  556. .if \ENC_DEC == DEC
  557. vmovdqa \T1, \XMM3
  558. .endif
  559. vmovdqu 16*3(arg3, %r11), \T1
  560. vpxor \T1, \XMM4, \XMM4
  561. vmovdqu \XMM4, 16*3(arg2 , %r11)
  562. .if \ENC_DEC == DEC
  563. vmovdqa \T1, \XMM4
  564. .endif
  565. vmovdqu 16*4(arg3, %r11), \T1
  566. vpxor \T1, \XMM5, \XMM5
  567. vmovdqu \XMM5, 16*4(arg2 , %r11)
  568. .if \ENC_DEC == DEC
  569. vmovdqa \T1, \XMM5
  570. .endif
  571. vmovdqu 16*5(arg3, %r11), \T1
  572. vpxor \T1, \XMM6, \XMM6
  573. vmovdqu \XMM6, 16*5(arg2 , %r11)
  574. .if \ENC_DEC == DEC
  575. vmovdqa \T1, \XMM6
  576. .endif
  577. vmovdqu 16*6(arg3, %r11), \T1
  578. vpxor \T1, \XMM7, \XMM7
  579. vmovdqu \XMM7, 16*6(arg2 , %r11)
  580. .if \ENC_DEC == DEC
  581. vmovdqa \T1, \XMM7
  582. .endif
  583. vmovdqu 16*7(arg3, %r11), \T1
  584. vpxor \T1, \XMM8, \XMM8
  585. vmovdqu \XMM8, 16*7(arg2 , %r11)
  586. .if \ENC_DEC == DEC
  587. vmovdqa \T1, \XMM8
  588. .endif
  589. add $128, %r11
  590. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  591. vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
  592. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  593. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  594. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  595. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  596. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  597. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  598. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  599. ###############################################################################
  600. _initial_blocks_done\@:
  601. .endm
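# On exit, r11 holds the number of data bytes consumed so far; when at least
# 128 bytes remain, XMM1..XMM8 additionally hold the byte-swapped ciphertext
# of the next eight blocks (with the running GHASH already folded into XMM1),
# ready for GHASH_8_ENCRYPT_8_PARALLEL_AVX.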
  602. # encrypt 8 blocks at a time
  603. # ghash the 8 previously encrypted ciphertext blocks
  604. # arg1, arg2, arg3 are used as pointers only, not modified
  605. # r11 is the data offset value
  606. .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
  607. vmovdqa \XMM1, \T2
  608. vmovdqa \XMM2, TMP2(%rsp)
  609. vmovdqa \XMM3, TMP3(%rsp)
  610. vmovdqa \XMM4, TMP4(%rsp)
  611. vmovdqa \XMM5, TMP5(%rsp)
  612. vmovdqa \XMM6, TMP6(%rsp)
  613. vmovdqa \XMM7, TMP7(%rsp)
  614. vmovdqa \XMM8, TMP8(%rsp)
  615. .if \loop_idx == in_order
  616. vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
  617. vpaddd ONE(%rip), \XMM1, \XMM2
  618. vpaddd ONE(%rip), \XMM2, \XMM3
  619. vpaddd ONE(%rip), \XMM3, \XMM4
  620. vpaddd ONE(%rip), \XMM4, \XMM5
  621. vpaddd ONE(%rip), \XMM5, \XMM6
  622. vpaddd ONE(%rip), \XMM6, \XMM7
  623. vpaddd ONE(%rip), \XMM7, \XMM8
  624. vmovdqa \XMM8, \CTR
  625. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  626. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  627. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  628. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  629. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  630. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  631. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  632. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  633. .else
  634. vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
  635. vpaddd ONEf(%rip), \XMM1, \XMM2
  636. vpaddd ONEf(%rip), \XMM2, \XMM3
  637. vpaddd ONEf(%rip), \XMM3, \XMM4
  638. vpaddd ONEf(%rip), \XMM4, \XMM5
  639. vpaddd ONEf(%rip), \XMM5, \XMM6
  640. vpaddd ONEf(%rip), \XMM6, \XMM7
  641. vpaddd ONEf(%rip), \XMM7, \XMM8
  642. vmovdqa \XMM8, \CTR
  643. .endif
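# in_order: the counter block arrives byte-swapped (LE view), is incremented
# with ONE (full 32-bit carry) and swapped back to wire order via SHUF_MASK.
# out_order: the caller has verified that the low counter byte cannot wrap
# within the next 8 increments (the 255-8 test in GCM_ENC_DEC_AVX), so ONEf
# increments the block directly in wire byte order and the swaps are skipped.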
  644. #######################################################################
  645. vmovdqu (arg1), \T1
  646. vpxor \T1, \XMM1, \XMM1
  647. vpxor \T1, \XMM2, \XMM2
  648. vpxor \T1, \XMM3, \XMM3
  649. vpxor \T1, \XMM4, \XMM4
  650. vpxor \T1, \XMM5, \XMM5
  651. vpxor \T1, \XMM6, \XMM6
  652. vpxor \T1, \XMM7, \XMM7
  653. vpxor \T1, \XMM8, \XMM8
  654. #######################################################################
  655. vmovdqu 16*1(arg1), \T1
  656. vaesenc \T1, \XMM1, \XMM1
  657. vaesenc \T1, \XMM2, \XMM2
  658. vaesenc \T1, \XMM3, \XMM3
  659. vaesenc \T1, \XMM4, \XMM4
  660. vaesenc \T1, \XMM5, \XMM5
  661. vaesenc \T1, \XMM6, \XMM6
  662. vaesenc \T1, \XMM7, \XMM7
  663. vaesenc \T1, \XMM8, \XMM8
  664. vmovdqu 16*2(arg1), \T1
  665. vaesenc \T1, \XMM1, \XMM1
  666. vaesenc \T1, \XMM2, \XMM2
  667. vaesenc \T1, \XMM3, \XMM3
  668. vaesenc \T1, \XMM4, \XMM4
  669. vaesenc \T1, \XMM5, \XMM5
  670. vaesenc \T1, \XMM6, \XMM6
  671. vaesenc \T1, \XMM7, \XMM7
  672. vaesenc \T1, \XMM8, \XMM8
  673. #######################################################################
  674. vmovdqa HashKey_8(arg1), \T5
  675. vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
  676. vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
  677. vpshufd $0b01001110, \T2, \T6
  678. vpxor \T2, \T6, \T6
  679. vmovdqa HashKey_8_k(arg1), \T5
  680. vpclmulqdq $0x00, \T5, \T6, \T6
  681. vmovdqu 16*3(arg1), \T1
  682. vaesenc \T1, \XMM1, \XMM1
  683. vaesenc \T1, \XMM2, \XMM2
  684. vaesenc \T1, \XMM3, \XMM3
  685. vaesenc \T1, \XMM4, \XMM4
  686. vaesenc \T1, \XMM5, \XMM5
  687. vaesenc \T1, \XMM6, \XMM6
  688. vaesenc \T1, \XMM7, \XMM7
  689. vaesenc \T1, \XMM8, \XMM8
  690. vmovdqa TMP2(%rsp), \T1
  691. vmovdqa HashKey_7(arg1), \T5
  692. vpclmulqdq $0x11, \T5, \T1, \T3
  693. vpxor \T3, \T4, \T4
  694. vpclmulqdq $0x00, \T5, \T1, \T3
  695. vpxor \T3, \T7, \T7
  696. vpshufd $0b01001110, \T1, \T3
  697. vpxor \T1, \T3, \T3
  698. vmovdqa HashKey_7_k(arg1), \T5
  699. vpclmulqdq $0x10, \T5, \T3, \T3
  700. vpxor \T3, \T6, \T6
  701. vmovdqu 16*4(arg1), \T1
  702. vaesenc \T1, \XMM1, \XMM1
  703. vaesenc \T1, \XMM2, \XMM2
  704. vaesenc \T1, \XMM3, \XMM3
  705. vaesenc \T1, \XMM4, \XMM4
  706. vaesenc \T1, \XMM5, \XMM5
  707. vaesenc \T1, \XMM6, \XMM6
  708. vaesenc \T1, \XMM7, \XMM7
  709. vaesenc \T1, \XMM8, \XMM8
  710. #######################################################################
  711. vmovdqa TMP3(%rsp), \T1
  712. vmovdqa HashKey_6(arg1), \T5
  713. vpclmulqdq $0x11, \T5, \T1, \T3
  714. vpxor \T3, \T4, \T4
  715. vpclmulqdq $0x00, \T5, \T1, \T3
  716. vpxor \T3, \T7, \T7
  717. vpshufd $0b01001110, \T1, \T3
  718. vpxor \T1, \T3, \T3
  719. vmovdqa HashKey_6_k(arg1), \T5
  720. vpclmulqdq $0x10, \T5, \T3, \T3
  721. vpxor \T3, \T6, \T6
  722. vmovdqu 16*5(arg1), \T1
  723. vaesenc \T1, \XMM1, \XMM1
  724. vaesenc \T1, \XMM2, \XMM2
  725. vaesenc \T1, \XMM3, \XMM3
  726. vaesenc \T1, \XMM4, \XMM4
  727. vaesenc \T1, \XMM5, \XMM5
  728. vaesenc \T1, \XMM6, \XMM6
  729. vaesenc \T1, \XMM7, \XMM7
  730. vaesenc \T1, \XMM8, \XMM8
  731. vmovdqa TMP4(%rsp), \T1
  732. vmovdqa HashKey_5(arg1), \T5
  733. vpclmulqdq $0x11, \T5, \T1, \T3
  734. vpxor \T3, \T4, \T4
  735. vpclmulqdq $0x00, \T5, \T1, \T3
  736. vpxor \T3, \T7, \T7
  737. vpshufd $0b01001110, \T1, \T3
  738. vpxor \T1, \T3, \T3
  739. vmovdqa HashKey_5_k(arg1), \T5
  740. vpclmulqdq $0x10, \T5, \T3, \T3
  741. vpxor \T3, \T6, \T6
  742. vmovdqu 16*6(arg1), \T1
  743. vaesenc \T1, \XMM1, \XMM1
  744. vaesenc \T1, \XMM2, \XMM2
  745. vaesenc \T1, \XMM3, \XMM3
  746. vaesenc \T1, \XMM4, \XMM4
  747. vaesenc \T1, \XMM5, \XMM5
  748. vaesenc \T1, \XMM6, \XMM6
  749. vaesenc \T1, \XMM7, \XMM7
  750. vaesenc \T1, \XMM8, \XMM8
  751. vmovdqa TMP5(%rsp), \T1
  752. vmovdqa HashKey_4(arg1), \T5
  753. vpclmulqdq $0x11, \T5, \T1, \T3
  754. vpxor \T3, \T4, \T4
  755. vpclmulqdq $0x00, \T5, \T1, \T3
  756. vpxor \T3, \T7, \T7
  757. vpshufd $0b01001110, \T1, \T3
  758. vpxor \T1, \T3, \T3
  759. vmovdqa HashKey_4_k(arg1), \T5
  760. vpclmulqdq $0x10, \T5, \T3, \T3
  761. vpxor \T3, \T6, \T6
  762. vmovdqu 16*7(arg1), \T1
  763. vaesenc \T1, \XMM1, \XMM1
  764. vaesenc \T1, \XMM2, \XMM2
  765. vaesenc \T1, \XMM3, \XMM3
  766. vaesenc \T1, \XMM4, \XMM4
  767. vaesenc \T1, \XMM5, \XMM5
  768. vaesenc \T1, \XMM6, \XMM6
  769. vaesenc \T1, \XMM7, \XMM7
  770. vaesenc \T1, \XMM8, \XMM8
  771. vmovdqa TMP6(%rsp), \T1
  772. vmovdqa HashKey_3(arg1), \T5
  773. vpclmulqdq $0x11, \T5, \T1, \T3
  774. vpxor \T3, \T4, \T4
  775. vpclmulqdq $0x00, \T5, \T1, \T3
  776. vpxor \T3, \T7, \T7
  777. vpshufd $0b01001110, \T1, \T3
  778. vpxor \T1, \T3, \T3
  779. vmovdqa HashKey_3_k(arg1), \T5
  780. vpclmulqdq $0x10, \T5, \T3, \T3
  781. vpxor \T3, \T6, \T6
  782. vmovdqu 16*8(arg1), \T1
  783. vaesenc \T1, \XMM1, \XMM1
  784. vaesenc \T1, \XMM2, \XMM2
  785. vaesenc \T1, \XMM3, \XMM3
  786. vaesenc \T1, \XMM4, \XMM4
  787. vaesenc \T1, \XMM5, \XMM5
  788. vaesenc \T1, \XMM6, \XMM6
  789. vaesenc \T1, \XMM7, \XMM7
  790. vaesenc \T1, \XMM8, \XMM8
  791. vmovdqa TMP7(%rsp), \T1
  792. vmovdqa HashKey_2(arg1), \T5
  793. vpclmulqdq $0x11, \T5, \T1, \T3
  794. vpxor \T3, \T4, \T4
  795. vpclmulqdq $0x00, \T5, \T1, \T3
  796. vpxor \T3, \T7, \T7
  797. vpshufd $0b01001110, \T1, \T3
  798. vpxor \T1, \T3, \T3
  799. vmovdqa HashKey_2_k(arg1), \T5
  800. vpclmulqdq $0x10, \T5, \T3, \T3
  801. vpxor \T3, \T6, \T6
  802. #######################################################################
  803. vmovdqu 16*9(arg1), \T5
  804. vaesenc \T5, \XMM1, \XMM1
  805. vaesenc \T5, \XMM2, \XMM2
  806. vaesenc \T5, \XMM3, \XMM3
  807. vaesenc \T5, \XMM4, \XMM4
  808. vaesenc \T5, \XMM5, \XMM5
  809. vaesenc \T5, \XMM6, \XMM6
  810. vaesenc \T5, \XMM7, \XMM7
  811. vaesenc \T5, \XMM8, \XMM8
  812. vmovdqa TMP8(%rsp), \T1
  813. vmovdqa HashKey(arg1), \T5
  814. vpclmulqdq $0x11, \T5, \T1, \T3
  815. vpxor \T3, \T4, \T4
  816. vpclmulqdq $0x00, \T5, \T1, \T3
  817. vpxor \T3, \T7, \T7
  818. vpshufd $0b01001110, \T1, \T3
  819. vpxor \T1, \T3, \T3
  820. vmovdqa HashKey_k(arg1), \T5
  821. vpclmulqdq $0x10, \T5, \T3, \T3
  822. vpxor \T3, \T6, \T6
  823. vpxor \T4, \T6, \T6
  824. vpxor \T7, \T6, \T6
  825. vmovdqu 16*10(arg1), \T5
  826. i = 0
  827. j = 1
  828. setreg
  829. .rep 8
  830. vpxor 16*i(arg3, %r11), \T5, \T2
  831. .if \ENC_DEC == ENC
  832. vaesenclast \T2, reg_j, reg_j
  833. .else
  834. vaesenclast \T2, reg_j, \T3
  835. vmovdqu 16*i(arg3, %r11), reg_j
  836. vmovdqu \T3, 16*i(arg2, %r11)
  837. .endif
  838. i = (i+1)
  839. j = (j+1)
  840. setreg
  841. .endr
  842. #######################################################################
  843. vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
  844. vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
  845. vpxor \T3, \T7, \T7
  846. vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
  847. #######################################################################
  848. #first phase of the reduction
  849. #######################################################################
  850. vpslld $31, \T7, \T2 # packed left shift << 31
  851. vpslld $30, \T7, \T3 # packed left shift << 30
  852. vpslld $25, \T7, \T4 # packed left shift << 25
  853. vpxor \T3, \T2, \T2 # xor the shifted versions
  854. vpxor \T4, \T2, \T2
  855. vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
  856. vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
  857. vpxor \T2, \T7, \T7 # first phase of the reduction complete
  858. #######################################################################
  859. .if \ENC_DEC == ENC
  860. vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
  861. vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
  862. vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
  863. vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
  864. vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
  865. vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
  866. vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
  867. vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
  868. .endif
  869. #######################################################################
  870. #second phase of the reduction
  871. vpsrld $1, \T7, \T2 # packed right shift >> 1
  872. vpsrld $2, \T7, \T3 # packed right shift >> 2
  873. vpsrld $7, \T7, \T4 # packed right shift >> 7
  874. vpxor \T3, \T2, \T2 # xor the shifted versions
  875. vpxor \T4, \T2, \T2
  876. vpxor \T1, \T2, \T2
  877. vpxor \T2, \T7, \T7
  878. vpxor \T7, \T6, \T6 # the result is in T6
  879. #######################################################################
  880. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  881. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  882. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  883. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  884. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  885. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  886. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  887. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  888. vpxor \T6, \XMM1, \XMM1
  889. .endm
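# Design note: the macro above interleaves the AES rounds of eight new counter
# blocks with the GHASH multiplies of the eight previous ciphertext blocks
# (one HashKey power per block, one deferred reduction), the aggregated
# approach described in the papers cited in the file header.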
  890. # GHASH the last 8 ciphertext blocks.
  891. .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
  892. ## Karatsuba Method
  893. vpshufd $0b01001110, \XMM1, \T2
  894. vpxor \XMM1, \T2, \T2
  895. vmovdqa HashKey_8(arg1), \T5
  896. vpclmulqdq $0x11, \T5, \XMM1, \T6
  897. vpclmulqdq $0x00, \T5, \XMM1, \T7
  898. vmovdqa HashKey_8_k(arg1), \T3
  899. vpclmulqdq $0x00, \T3, \T2, \XMM1
  900. ######################
  901. vpshufd $0b01001110, \XMM2, \T2
  902. vpxor \XMM2, \T2, \T2
  903. vmovdqa HashKey_7(arg1), \T5
  904. vpclmulqdq $0x11, \T5, \XMM2, \T4
  905. vpxor \T4, \T6, \T6
  906. vpclmulqdq $0x00, \T5, \XMM2, \T4
  907. vpxor \T4, \T7, \T7
  908. vmovdqa HashKey_7_k(arg1), \T3
  909. vpclmulqdq $0x00, \T3, \T2, \T2
  910. vpxor \T2, \XMM1, \XMM1
  911. ######################
  912. vpshufd $0b01001110, \XMM3, \T2
  913. vpxor \XMM3, \T2, \T2
  914. vmovdqa HashKey_6(arg1), \T5
  915. vpclmulqdq $0x11, \T5, \XMM3, \T4
  916. vpxor \T4, \T6, \T6
  917. vpclmulqdq $0x00, \T5, \XMM3, \T4
  918. vpxor \T4, \T7, \T7
  919. vmovdqa HashKey_6_k(arg1), \T3
  920. vpclmulqdq $0x00, \T3, \T2, \T2
  921. vpxor \T2, \XMM1, \XMM1
  922. ######################
  923. vpshufd $0b01001110, \XMM4, \T2
  924. vpxor \XMM4, \T2, \T2
  925. vmovdqa HashKey_5(arg1), \T5
  926. vpclmulqdq $0x11, \T5, \XMM4, \T4
  927. vpxor \T4, \T6, \T6
  928. vpclmulqdq $0x00, \T5, \XMM4, \T4
  929. vpxor \T4, \T7, \T7
  930. vmovdqa HashKey_5_k(arg1), \T3
  931. vpclmulqdq $0x00, \T3, \T2, \T2
  932. vpxor \T2, \XMM1, \XMM1
  933. ######################
  934. vpshufd $0b01001110, \XMM5, \T2
  935. vpxor \XMM5, \T2, \T2
  936. vmovdqa HashKey_4(arg1), \T5
  937. vpclmulqdq $0x11, \T5, \XMM5, \T4
  938. vpxor \T4, \T6, \T6
  939. vpclmulqdq $0x00, \T5, \XMM5, \T4
  940. vpxor \T4, \T7, \T7
  941. vmovdqa HashKey_4_k(arg1), \T3
  942. vpclmulqdq $0x00, \T3, \T2, \T2
  943. vpxor \T2, \XMM1, \XMM1
  944. ######################
  945. vpshufd $0b01001110, \XMM6, \T2
  946. vpxor \XMM6, \T2, \T2
  947. vmovdqa HashKey_3(arg1), \T5
  948. vpclmulqdq $0x11, \T5, \XMM6, \T4
  949. vpxor \T4, \T6, \T6
  950. vpclmulqdq $0x00, \T5, \XMM6, \T4
  951. vpxor \T4, \T7, \T7
  952. vmovdqa HashKey_3_k(arg1), \T3
  953. vpclmulqdq $0x00, \T3, \T2, \T2
  954. vpxor \T2, \XMM1, \XMM1
  955. ######################
  956. vpshufd $0b01001110, \XMM7, \T2
  957. vpxor \XMM7, \T2, \T2
  958. vmovdqa HashKey_2(arg1), \T5
  959. vpclmulqdq $0x11, \T5, \XMM7, \T4
  960. vpxor \T4, \T6, \T6
  961. vpclmulqdq $0x00, \T5, \XMM7, \T4
  962. vpxor \T4, \T7, \T7
  963. vmovdqa HashKey_2_k(arg1), \T3
  964. vpclmulqdq $0x00, \T3, \T2, \T2
  965. vpxor \T2, \XMM1, \XMM1
  966. ######################
  967. vpshufd $0b01001110, \XMM8, \T2
  968. vpxor \XMM8, \T2, \T2
  969. vmovdqa HashKey(arg1), \T5
  970. vpclmulqdq $0x11, \T5, \XMM8, \T4
  971. vpxor \T4, \T6, \T6
  972. vpclmulqdq $0x00, \T5, \XMM8, \T4
  973. vpxor \T4, \T7, \T7
  974. vmovdqa HashKey_k(arg1), \T3
  975. vpclmulqdq $0x00, \T3, \T2, \T2
  976. vpxor \T2, \XMM1, \XMM1
  977. vpxor \T6, \XMM1, \XMM1
  978. vpxor \T7, \XMM1, \T2
  979. vpslldq $8, \T2, \T4
  980. vpsrldq $8, \T2, \T2
  981. vpxor \T4, \T7, \T7
  982. vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
  983. # the accumulated carry-less multiplications
  984. #######################################################################
  985. #first phase of the reduction
  986. vpslld $31, \T7, \T2 # packed left shift << 31
  987. vpslld $30, \T7, \T3 # packed left shift << 30
  988. vpslld $25, \T7, \T4 # packed left shift << 25
  989. vpxor \T3, \T2, \T2 # xor the shifted versions
  990. vpxor \T4, \T2, \T2
  991. vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
  992. vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
  993. vpxor \T2, \T7, \T7 # first phase of the reduction complete
  994. #######################################################################
  995. #second phase of the reduction
  996. vpsrld $1, \T7, \T2 # packed right shift >> 1
  997. vpsrld $2, \T7, \T3 # packed right shift >> 2
  998. vpsrld $7, \T7, \T4 # packed right shift >> 7
  999. vpxor \T3, \T2, \T2 # xor the shifted versions
  1000. vpxor \T4, \T2, \T2
  1001. vpxor \T1, \T2, \T2
  1002. vpxor \T2, \T7, \T7
  1003. vpxor \T7, \T6, \T6 # the result is in T6
  1004. .endm
  1005. # combined for GCM encrypt and decrypt functions
  1006. # clobbering all xmm registers
  1007. # clobbering r10, r11, r12, r13, r14, r15
  1008. .macro GCM_ENC_DEC_AVX ENC_DEC
  1009. # the number of pushes must match STACK_OFFSET (8 bytes per push)
  1010. push %r12
  1011. push %r13
  1012. push %r14
  1013. push %r15
  1014. mov %rsp, %r14
  1015. sub $VARIABLE_OFFSET, %rsp
  1016. and $~63, %rsp # align rsp to 64 bytes
  1017. vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
  1018. mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
  1019. and $-16, %r13 # r13 = r13 - (r13 mod 16)
  1020. mov %r13, %r12
  1021. shr $4, %r12
  1022. and $7, %r12
  1023. jz _initial_num_blocks_is_0\@
  1024. cmp $7, %r12
  1025. je _initial_num_blocks_is_7\@
  1026. cmp $6, %r12
  1027. je _initial_num_blocks_is_6\@
  1028. cmp $5, %r12
  1029. je _initial_num_blocks_is_5\@
  1030. cmp $4, %r12
  1031. je _initial_num_blocks_is_4\@
  1032. cmp $3, %r12
  1033. je _initial_num_blocks_is_3\@
  1034. cmp $2, %r12
  1035. je _initial_num_blocks_is_2\@
  1036. jmp _initial_num_blocks_is_1\@
  1037. _initial_num_blocks_is_7\@:
  1038. INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1039. sub $16*7, %r13
  1040. jmp _initial_blocks_encrypted\@
  1041. _initial_num_blocks_is_6\@:
  1042. INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1043. sub $16*6, %r13
  1044. jmp _initial_blocks_encrypted\@
  1045. _initial_num_blocks_is_5\@:
  1046. INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1047. sub $16*5, %r13
  1048. jmp _initial_blocks_encrypted\@
  1049. _initial_num_blocks_is_4\@:
  1050. INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1051. sub $16*4, %r13
  1052. jmp _initial_blocks_encrypted\@
  1053. _initial_num_blocks_is_3\@:
  1054. INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1055. sub $16*3, %r13
  1056. jmp _initial_blocks_encrypted\@
  1057. _initial_num_blocks_is_2\@:
  1058. INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1059. sub $16*2, %r13
  1060. jmp _initial_blocks_encrypted\@
  1061. _initial_num_blocks_is_1\@:
  1062. INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1063. sub $16*1, %r13
  1064. jmp _initial_blocks_encrypted\@
  1065. _initial_num_blocks_is_0\@:
  1066. INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  1067. _initial_blocks_encrypted\@:
  1068. cmp $0, %r13
  1069. je _zero_cipher_left\@
  1070. sub $128, %r13
  1071. je _eight_cipher_left\@
  1072. vmovd %xmm9, %r15d
  1073. and $255, %r15d
  1074. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
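# If the low byte of the counter can be incremented 8 times without wrapping,
# fall through and use the out_order path (no byte swaps inside the macro);
# otherwise swap the counter, use the in_order path, and swap back.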
  1075. _encrypt_by_8_new\@:
  1076. cmp $(255-8), %r15d
  1077. jg _encrypt_by_8\@
  1078. add $8, %r15b
  1079. GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
  1080. add $128, %r11
  1081. sub $128, %r13
  1082. jne _encrypt_by_8_new\@
  1083. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  1084. jmp _eight_cipher_left\@
  1085. _encrypt_by_8\@:
  1086. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  1087. add $8, %r15b
  1088. GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
  1089. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  1090. add $128, %r11
  1091. sub $128, %r13
  1092. jne _encrypt_by_8_new\@
  1093. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  1094. _eight_cipher_left\@:
  1095. GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
  1096. _zero_cipher_left\@:
  1097. cmp $16, arg4
  1098. jl _only_less_than_16\@
  1099. mov arg4, %r13
  1100. and $15, %r13 # r13 = (arg4 mod 16)
  1101. je _multiple_of_16_bytes\@
  1102. # handle the last <16 Byte block separately
  1103. vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
  1104. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  1105. ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
  1106. sub $16, %r11
  1107. add %r13, %r11
  1108. vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
  1109. lea SHIFT_MASK+16(%rip), %r12
  1110. sub %r13, %r12 # adjust the shuffle mask pointer to be
  1111. # able to shift 16-r13 bytes (r13 is the
  1112. # number of bytes in plaintext mod 16)
  1113. vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
  1114. vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
  1115. jmp _final_ghash_mul\@
  1116. _only_less_than_16\@:
  1117. # check for 0 length
  1118. mov arg4, %r13
  1119. and $15, %r13 # r13 = (arg4 mod 16)
  1120. je _multiple_of_16_bytes\@
  1121. # handle the last <16 Byte block separately
  1122. vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
  1123. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  1124. ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
  1125. lea SHIFT_MASK+16(%rip), %r12
  1126. sub %r13, %r12 # adjust the shuffle mask pointer to be
  1127. # able to shift 16-r13 bytes (r13 is the
  1128. # number of bytes in plaintext mod 16)
  1129. _get_last_16_byte_loop\@:
  1130. movb (arg3, %r11), %al
  1131. movb %al, TMP1 (%rsp , %r11)
  1132. add $1, %r11
  1133. cmp %r13, %r11
  1134. jne _get_last_16_byte_loop\@
  1135. vmovdqu TMP1(%rsp), %xmm1
  1136. sub $16, %r11
  1137. _final_ghash_mul\@:
  1138. .if \ENC_DEC == DEC
  1139. vmovdqa %xmm1, %xmm2
  1140. vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
  1141. vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
  1142. # mask out top 16-r13 bytes of xmm9
  1143. vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
  1144. vpand %xmm1, %xmm2, %xmm2
  1145. vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
  1146. vpxor %xmm2, %xmm14, %xmm14
  1147. #GHASH computation for the last <16 Byte block
  1148. GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
  1149. sub %r13, %r11
  1150. add $16, %r11
  1151. .else
  1152. vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
  1153. vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
  1154. # mask out top 16-r13 bytes of xmm9
  1155. vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
  1156. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  1157. vpxor %xmm9, %xmm14, %xmm14
  1158. #GHASH computation for the last <16 Byte block
  1159. GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
  1160. sub %r13, %r11
  1161. add $16, %r11
  1162. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
  1163. .endif
  1164. #############################
  1165. # output r13 Bytes
  1166. vmovq %xmm9, %rax
  1167. cmp $8, %r13
  1168. jle _less_than_8_bytes_left\@
  1169. mov %rax, (arg2 , %r11)
  1170. add $8, %r11
  1171. vpsrldq $8, %xmm9, %xmm9
  1172. vmovq %xmm9, %rax
  1173. sub $8, %r13
  1174. _less_than_8_bytes_left\@:
  1175. movb %al, (arg2 , %r11)
  1176. add $1, %r11
  1177. shr $8, %rax
  1178. sub $1, %r13
  1179. jne _less_than_8_bytes_left\@
  1180. #############################
  1181. _multiple_of_16_bytes\@:
  1182. mov arg7, %r12 # r12 = aadLen (number of bytes)
  1183. shl $3, %r12 # convert into number of bits
  1184. vmovd %r12d, %xmm15 # len(A) in xmm15
1185. shl $3, arg4 # len(C) in bits (*8)
  1186. vmovq arg4, %xmm1
  1187. vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
  1188. vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
  1189. vpxor %xmm15, %xmm14, %xmm14
  1190. GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
  1191. vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
  1192. mov arg5, %rax # rax = *Y0
  1193. vmovdqu (%rax), %xmm9 # xmm9 = Y0
  1194. ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
  1195. vpxor %xmm14, %xmm9, %xmm9
  1196. _return_T\@:
  1197. mov arg8, %r10 # r10 = authTag
  1198. mov arg9, %r11 # r11 = auth_tag_len
  1199. cmp $16, %r11
  1200. je _T_16\@
  1201. cmp $8, %r11
  1202. jl _T_4\@
  1203. _T_8\@:
  1204. vmovq %xmm9, %rax
  1205. mov %rax, (%r10)
  1206. add $8, %r10
  1207. sub $8, %r11
  1208. vpsrldq $8, %xmm9, %xmm9
  1209. cmp $0, %r11
  1210. je _return_T_done\@
  1211. _T_4\@:
  1212. vmovd %xmm9, %eax
  1213. mov %eax, (%r10)
  1214. add $4, %r10
  1215. sub $4, %r11
  1216. vpsrldq $4, %xmm9, %xmm9
  1217. cmp $0, %r11
  1218. je _return_T_done\@
  1219. _T_123\@:
  1220. vmovd %xmm9, %eax
  1221. cmp $2, %r11
  1222. jl _T_1\@
  1223. mov %ax, (%r10)
  1224. cmp $2, %r11
  1225. je _return_T_done\@
  1226. add $2, %r10
  1227. sar $16, %eax
  1228. _T_1\@:
  1229. mov %al, (%r10)
  1230. jmp _return_T_done\@
  1231. _T_16\@:
  1232. vmovdqu %xmm9, (%r10)
  1233. _return_T_done\@:
  1234. mov %r14, %rsp
  1235. pop %r15
  1236. pop %r14
  1237. pop %r13
  1238. pop %r12
  1239. .endm
  1240. #############################################################
  1241. #void aesni_gcm_precomp_avx_gen2
  1242. # (gcm_data *my_ctx_data,
  1243. # u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
  1244. #############################################################
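/* Illustrative only (not part of this file's build): the calling sequence
suggested by the prototypes in this file is to run the precompute once per
key and the enc/dec entry points once per message.  gcm_data is treated as
an opaque 16-byte-aligned context, u8/u64 are the usual <linux/types.h>
types, and gcm_encrypt_once is a hypothetical helper.

extern void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, u8 *hash_subkey);
extern void aesni_gcm_enc_avx_gen2(void *my_ctx_data, u8 *out, const u8 *in,
                                   u64 plaintext_len, u8 *iv, const u8 *aad,
                                   u64 aad_len, u8 *auth_tag, u64 auth_tag_len);

static void gcm_encrypt_once(void *ctx, u8 *hash_subkey, u8 *iv,
                             const u8 *aad, u64 aad_len,
                             const u8 *in, u8 *out, u64 len, u8 tag[16])
{
        aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);          // once per key
        aesni_gcm_enc_avx_gen2(ctx, out, in, len, iv,
                               aad, aad_len, tag, 16);         // per message
}
*/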
  1245. ENTRY(aesni_gcm_precomp_avx_gen2)
  1246. #the number of pushes must equal STACK_OFFSET
  1247. push %r12
  1248. push %r13
  1249. push %r14
  1250. push %r15
  1251. mov %rsp, %r14
  1252. sub $VARIABLE_OFFSET, %rsp
  1253. and $~63, %rsp # align rsp to 64 bytes
  1254. vmovdqu (arg2), %xmm6 # xmm6 = HashKey
  1255. vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
  1256. ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
  1257. vmovdqa %xmm6, %xmm2
  1258. vpsllq $1, %xmm6, %xmm6
  1259. vpsrlq $63, %xmm2, %xmm2
  1260. vmovdqa %xmm2, %xmm1
  1261. vpslldq $8, %xmm2, %xmm2
  1262. vpsrldq $8, %xmm1, %xmm1
  1263. vpor %xmm2, %xmm6, %xmm6
  1264. #reduction
  1265. vpshufd $0b00100100, %xmm1, %xmm2
  1266. vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
  1267. vpand POLY(%rip), %xmm2, %xmm2
  1268. vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
  1269. #######################################################################
  1270. vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
  1271. PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
  1272. mov %r14, %rsp
  1273. pop %r15
  1274. pop %r14
  1275. pop %r13
  1276. pop %r12
  1277. ret
  1278. ENDPROC(aesni_gcm_precomp_avx_gen2)
  1279. ###############################################################################
  1280. #void aesni_gcm_enc_avx_gen2(
  1281. # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
  1282. # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
  1283. # const u8 *in, /* Plaintext input */
  1284. # u64 plaintext_len, /* Length of data in Bytes for encryption. */
  1285. # u8 *iv, /* Pre-counter block j0: 4 byte salt
  1286. # (from Security Association) concatenated with 8 byte
  1287. # Initialisation Vector (from IPSec ESP Payload)
  1288. # concatenated with 0x00000001. 16-byte aligned pointer. */
  1289. # const u8 *aad, /* Additional Authentication Data (AAD)*/
  1290. # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
  1291. # u8 *auth_tag, /* Authenticated Tag output. */
  1292. # u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
  1293. # Valid values are 16 (most likely), 12 or 8. */
  1294. ###############################################################################
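/* Sketch of how a caller could assemble the 16-byte pre-counter block j0
described above (4-byte salt || 8-byte IV || 0x00000001).  Illustrative
only: buffer and function names are hypothetical, u8 is the usual kernel
type, and memcpy is assumed to be available.

static void build_j0(u8 j0[16], const u8 salt[4], const u8 iv[8])
{
        memcpy(j0, salt, 4);        // bytes  0..3  : salt from the SA
        memcpy(j0 + 4, iv, 8);      // bytes  4..11 : per-packet IV
        j0[12] = 0x00;              // bytes 12..15 : 32-bit block counter,
        j0[13] = 0x00;              //                big-endian value 1
        j0[14] = 0x00;
        j0[15] = 0x01;
}
*/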
  1295. ENTRY(aesni_gcm_enc_avx_gen2)
  1296. GCM_ENC_DEC_AVX ENC
  1297. ret
  1298. ENDPROC(aesni_gcm_enc_avx_gen2)
  1299. ###############################################################################
  1300. #void aesni_gcm_dec_avx_gen2(
  1301. # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
  1302. # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
  1303. # const u8 *in, /* Ciphertext input */
1304. # u64 plaintext_len, /* Length of data in Bytes for decryption. */
  1305. # u8 *iv, /* Pre-counter block j0: 4 byte salt
  1306. # (from Security Association) concatenated with 8 byte
  1307. # Initialisation Vector (from IPSec ESP Payload)
  1308. # concatenated with 0x00000001. 16-byte aligned pointer. */
  1309. # const u8 *aad, /* Additional Authentication Data (AAD)*/
  1310. # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
  1311. # u8 *auth_tag, /* Authenticated Tag output. */
  1312. # u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
  1313. # Valid values are 16 (most likely), 12 or 8. */
  1314. ###############################################################################
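/* The decrypt entry point writes the tag it computed into auth_tag; checking
it against the ICV received with the packet is presumably left to the caller
(this file never compares tags).  A hedged sketch of such a check, written
without an early exit:

static int gcm_tag_equal(const u8 *computed, const u8 *received, u64 len)
{
        u8 diff = 0;
        u64 i;

        for (i = 0; i < len; i++)
                diff |= computed[i] ^ received[i];
        return diff == 0;           // 1 if the tags match
}
*/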
  1315. ENTRY(aesni_gcm_dec_avx_gen2)
  1316. GCM_ENC_DEC_AVX DEC
  1317. ret
  1318. ENDPROC(aesni_gcm_dec_avx_gen2)
  1319. #endif /* CONFIG_AS_AVX */
  1320. #ifdef CONFIG_AS_AVX2
  1321. ###############################################################################
  1322. # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
  1323. # Input: A and B (128-bits each, bit-reflected)
  1324. # Output: C = A*B*x mod poly, (i.e. >>1 )
  1325. # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
  1326. # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
  1327. ###############################################################################
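/* Reference model of the multiplication described above (illustrative only,
not the code path below): the textbook GHASH multiply from the GCM spec, on
128-bit values held as two 64-bit halves (hi = spec bits 0..63, MSB first).
The macro below computes the same product on byte-reflected data, which is
why the caller passes HashKey<<1 mod poly and the raw result carries an
extra factor of x, as noted above.  be128_t is a local name; u64 is the
usual <linux/types.h> type.

typedef struct { u64 hi, lo; } be128_t;         // hi = spec bits 0..63

static be128_t gf128_mul(be128_t x, be128_t y)
{
        be128_t z = { 0, 0 };
        int i;

        for (i = 0; i < 128; i++) {
                // bit i of x, MSB-first as in the GCM spec
                u64 xi = (i < 64) ? (x.hi >> (63 - i)) & 1
                                  : (x.lo >> (127 - i)) & 1;
                u64 lsb = y.lo & 1;

                if (xi) {
                        z.hi ^= y.hi;           // Z ^= V
                        z.lo ^= y.lo;
                }
                // V = V * x: shift right one bit in this bit order and
                // reduce by x^128 + x^7 + x^2 + x + 1 (0xE1 || 0^120)
                y.lo = (y.lo >> 1) | (y.hi << 63);
                y.hi >>= 1;
                if (lsb)
                        y.hi ^= 0xE100000000000000ULL;
        }
        return z;
}
*/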
  1328. .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
  1329. vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
  1330. vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
  1331. vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
  1332. vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
  1333. vpxor \T3, \GH, \GH
  1334. vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
  1335. vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
  1336. vpxor \T3, \T1, \T1
  1337. vpxor \T2, \GH, \GH
  1338. #######################################################################
  1339. #first phase of the reduction
  1340. vmovdqa POLY2(%rip), \T3
  1341. vpclmulqdq $0x01, \GH, \T3, \T2
  1342. vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
  1343. vpxor \T2, \GH, \GH # first phase of the reduction complete
  1344. #######################################################################
  1345. #second phase of the reduction
  1346. vpclmulqdq $0x00, \GH, \T3, \T2
  1347. vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
  1348. vpclmulqdq $0x10, \GH, \T3, \GH
  1349. vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
  1350. vpxor \T2, \GH, \GH # second phase of the reduction complete
  1351. #######################################################################
  1352. vpxor \T1, \GH, \GH # the result is in GH
  1353. .endm
  1354. .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1355. # Precompute HashKey^2 through HashKey^8 (all <<1 mod poly) and store them for the 8-block parallel GHASH
  1356. vmovdqa \HK, \T5
  1357. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
  1358. vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
  1359. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
  1360. vmovdqa \T5, HashKey_3(arg1)
  1361. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
  1362. vmovdqa \T5, HashKey_4(arg1)
  1363. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
  1364. vmovdqa \T5, HashKey_5(arg1)
  1365. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
  1366. vmovdqa \T5, HashKey_6(arg1)
  1367. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
  1368. vmovdqa \T5, HashKey_7(arg1)
  1369. GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
  1370. vmovdqa \T5, HashKey_8(arg1)
  1371. .endm
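/* Reference-model counterpart of PRECOMPUTE_AVX2 above (illustrative; uses
the be128_t/gf128_mul sketch above GHASH_MUL_AVX2 and plain powers of H,
whereas the assembly keeps every power <<1 mod poly):

static void precompute_hash_powers(be128_t h, be128_t hpow[8])
{
        int i;

        hpow[0] = h;                                    // H^1
        for (i = 1; i < 8; i++)
                hpow[i] = gf128_mul(hpow[i - 1], h);    // H^(i+1)
}
*/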
  1372. ## if a = number of total plaintext bytes
  1373. ## b = floor(a/16)
1374. ## num_initial_blocks = b mod 8 (see the C sketch after this comment block)
  1375. ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
  1376. ## r10, r11, r12, rax are clobbered
  1377. ## arg1, arg2, arg3, r14 are used as a pointer only, not modified
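/* The block-count bookkeeping described above, as a C sketch (mirrors the
shr $4 / and $7 sequence in GCM_ENC_DEC_AVX2 below; the function name is
illustrative):

static unsigned int initial_block_count(u64 plaintext_len)
{
        u64 b = plaintext_len / 16;     // b = floor(a/16), full blocks

        return (unsigned int)(b & 7);   // peel b mod 8 blocks up front so
                                        // the main loop runs 8 at a time
}
*/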
  1378. .macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
  1379. i = (8-\num_initial_blocks)
  1380. j = 0
  1381. setreg
  1382. mov arg6, %r10 # r10 = AAD
  1383. mov arg7, %r12 # r12 = aadLen
  1384. mov %r12, %r11
  1385. vpxor reg_j, reg_j, reg_j
  1386. vpxor reg_i, reg_i, reg_i
  1387. cmp $16, %r11
  1388. jl _get_AAD_rest8\@
  1389. _get_AAD_blocks\@:
  1390. vmovdqu (%r10), reg_i
  1391. vpshufb SHUF_MASK(%rip), reg_i, reg_i
  1392. vpxor reg_i, reg_j, reg_j
  1393. GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6
  1394. add $16, %r10
  1395. sub $16, %r12
  1396. sub $16, %r11
  1397. cmp $16, %r11
  1398. jge _get_AAD_blocks\@
  1399. vmovdqu reg_j, reg_i
  1400. cmp $0, %r11
  1401. je _get_AAD_done\@
  1402. vpxor reg_i, reg_i, reg_i
  1403. /* read the last <16B of AAD. since we have at least 4B of
  1404. data right after the AAD (the ICV, and maybe some CT), we can
  1405. read 4B/8B blocks safely, and then get rid of the extra stuff */
  1406. _get_AAD_rest8\@:
  1407. cmp $4, %r11
  1408. jle _get_AAD_rest4\@
  1409. movq (%r10), \T1
  1410. add $8, %r10
  1411. sub $8, %r11
  1412. vpslldq $8, \T1, \T1
  1413. vpsrldq $8, reg_i, reg_i
  1414. vpxor \T1, reg_i, reg_i
  1415. jmp _get_AAD_rest8\@
  1416. _get_AAD_rest4\@:
  1417. cmp $0, %r11
  1418. jle _get_AAD_rest0\@
  1419. mov (%r10), %eax
  1420. movq %rax, \T1
  1421. add $4, %r10
  1422. sub $4, %r11
  1423. vpslldq $12, \T1, \T1
  1424. vpsrldq $4, reg_i, reg_i
  1425. vpxor \T1, reg_i, reg_i
  1426. _get_AAD_rest0\@:
  1427. /* finalize: shift out the extra bytes we read, and align
  1428. left. since pslldq can only shift by an immediate, we use
  1429. vpshufb and an array of shuffle masks */
  1430. movq %r12, %r11
  1431. salq $4, %r11
  1432. movdqu aad_shift_arr(%r11), \T1
  1433. vpshufb \T1, reg_i, reg_i
  1434. _get_AAD_rest_final\@:
  1435. vpshufb SHUF_MASK(%rip), reg_i, reg_i
  1436. vpxor reg_j, reg_i, reg_i
  1437. GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6
  1438. _get_AAD_done\@:
  1439. # initialize the data pointer offset as zero
  1440. xor %r11d, %r11d
  1441. # start AES for num_initial_blocks blocks
  1442. mov arg5, %rax # rax = *Y0
  1443. vmovdqu (%rax), \CTR # CTR = Y0
  1444. vpshufb SHUF_MASK(%rip), \CTR, \CTR
  1445. i = (9-\num_initial_blocks)
  1446. setreg
  1447. .rep \num_initial_blocks
  1448. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1449. vmovdqa \CTR, reg_i
  1450. vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
  1451. i = (i+1)
  1452. setreg
  1453. .endr
  1454. vmovdqa (arg1), \T_key
  1455. i = (9-\num_initial_blocks)
  1456. setreg
  1457. .rep \num_initial_blocks
  1458. vpxor \T_key, reg_i, reg_i
  1459. i = (i+1)
  1460. setreg
  1461. .endr
  1462. j = 1
  1463. setreg
  1464. .rep 9
  1465. vmovdqa 16*j(arg1), \T_key
  1466. i = (9-\num_initial_blocks)
  1467. setreg
  1468. .rep \num_initial_blocks
  1469. vaesenc \T_key, reg_i, reg_i
  1470. i = (i+1)
  1471. setreg
  1472. .endr
  1473. j = (j+1)
  1474. setreg
  1475. .endr
  1476. vmovdqa 16*10(arg1), \T_key
  1477. i = (9-\num_initial_blocks)
  1478. setreg
  1479. .rep \num_initial_blocks
  1480. vaesenclast \T_key, reg_i, reg_i
  1481. i = (i+1)
  1482. setreg
  1483. .endr
  1484. i = (9-\num_initial_blocks)
  1485. setreg
  1486. .rep \num_initial_blocks
  1487. vmovdqu (arg3, %r11), \T1
  1488. vpxor \T1, reg_i, reg_i
  1489. vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for
  1490. # num_initial_blocks blocks
  1491. add $16, %r11
  1492. .if \ENC_DEC == DEC
  1493. vmovdqa \T1, reg_i
  1494. .endif
  1495. vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
  1496. i = (i+1)
  1497. setreg
  1498. .endr
  1499. i = (8-\num_initial_blocks)
  1500. j = (9-\num_initial_blocks)
  1501. setreg
  1502. .rep \num_initial_blocks
  1503. vpxor reg_i, reg_j, reg_j
  1504. GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
  1505. i = (i+1)
  1506. j = (j+1)
  1507. setreg
  1508. .endr
  1509. # XMM8 has the combined result here
  1510. vmovdqa \XMM8, TMP1(%rsp)
  1511. vmovdqa \XMM8, \T3
  1512. cmp $128, %r13
  1513. jl _initial_blocks_done\@ # no need for precomputed constants
  1514. ###############################################################################
1515. # prepare and encrypt the next 8 counter blocks for the first full 8-block round
  1516. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1517. vmovdqa \CTR, \XMM1
  1518. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  1519. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1520. vmovdqa \CTR, \XMM2
  1521. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  1522. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1523. vmovdqa \CTR, \XMM3
  1524. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  1525. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1526. vmovdqa \CTR, \XMM4
  1527. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  1528. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1529. vmovdqa \CTR, \XMM5
  1530. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  1531. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1532. vmovdqa \CTR, \XMM6
  1533. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  1534. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1535. vmovdqa \CTR, \XMM7
  1536. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  1537. vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
  1538. vmovdqa \CTR, \XMM8
  1539. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  1540. vmovdqa (arg1), \T_key
  1541. vpxor \T_key, \XMM1, \XMM1
  1542. vpxor \T_key, \XMM2, \XMM2
  1543. vpxor \T_key, \XMM3, \XMM3
  1544. vpxor \T_key, \XMM4, \XMM4
  1545. vpxor \T_key, \XMM5, \XMM5
  1546. vpxor \T_key, \XMM6, \XMM6
  1547. vpxor \T_key, \XMM7, \XMM7
  1548. vpxor \T_key, \XMM8, \XMM8
  1549. i = 1
  1550. setreg
  1551. .rep 9 # do 9 rounds
  1552. vmovdqa 16*i(arg1), \T_key
  1553. vaesenc \T_key, \XMM1, \XMM1
  1554. vaesenc \T_key, \XMM2, \XMM2
  1555. vaesenc \T_key, \XMM3, \XMM3
  1556. vaesenc \T_key, \XMM4, \XMM4
  1557. vaesenc \T_key, \XMM5, \XMM5
  1558. vaesenc \T_key, \XMM6, \XMM6
  1559. vaesenc \T_key, \XMM7, \XMM7
  1560. vaesenc \T_key, \XMM8, \XMM8
  1561. i = (i+1)
  1562. setreg
  1563. .endr
  1564. vmovdqa 16*i(arg1), \T_key
  1565. vaesenclast \T_key, \XMM1, \XMM1
  1566. vaesenclast \T_key, \XMM2, \XMM2
  1567. vaesenclast \T_key, \XMM3, \XMM3
  1568. vaesenclast \T_key, \XMM4, \XMM4
  1569. vaesenclast \T_key, \XMM5, \XMM5
  1570. vaesenclast \T_key, \XMM6, \XMM6
  1571. vaesenclast \T_key, \XMM7, \XMM7
  1572. vaesenclast \T_key, \XMM8, \XMM8
  1573. vmovdqu (arg3, %r11), \T1
  1574. vpxor \T1, \XMM1, \XMM1
  1575. vmovdqu \XMM1, (arg2 , %r11)
  1576. .if \ENC_DEC == DEC
  1577. vmovdqa \T1, \XMM1
  1578. .endif
  1579. vmovdqu 16*1(arg3, %r11), \T1
  1580. vpxor \T1, \XMM2, \XMM2
  1581. vmovdqu \XMM2, 16*1(arg2 , %r11)
  1582. .if \ENC_DEC == DEC
  1583. vmovdqa \T1, \XMM2
  1584. .endif
  1585. vmovdqu 16*2(arg3, %r11), \T1
  1586. vpxor \T1, \XMM3, \XMM3
  1587. vmovdqu \XMM3, 16*2(arg2 , %r11)
  1588. .if \ENC_DEC == DEC
  1589. vmovdqa \T1, \XMM3
  1590. .endif
  1591. vmovdqu 16*3(arg3, %r11), \T1
  1592. vpxor \T1, \XMM4, \XMM4
  1593. vmovdqu \XMM4, 16*3(arg2 , %r11)
  1594. .if \ENC_DEC == DEC
  1595. vmovdqa \T1, \XMM4
  1596. .endif
  1597. vmovdqu 16*4(arg3, %r11), \T1
  1598. vpxor \T1, \XMM5, \XMM5
  1599. vmovdqu \XMM5, 16*4(arg2 , %r11)
  1600. .if \ENC_DEC == DEC
  1601. vmovdqa \T1, \XMM5
  1602. .endif
  1603. vmovdqu 16*5(arg3, %r11), \T1
  1604. vpxor \T1, \XMM6, \XMM6
  1605. vmovdqu \XMM6, 16*5(arg2 , %r11)
  1606. .if \ENC_DEC == DEC
  1607. vmovdqa \T1, \XMM6
  1608. .endif
  1609. vmovdqu 16*6(arg3, %r11), \T1
  1610. vpxor \T1, \XMM7, \XMM7
  1611. vmovdqu \XMM7, 16*6(arg2 , %r11)
  1612. .if \ENC_DEC == DEC
  1613. vmovdqa \T1, \XMM7
  1614. .endif
  1615. vmovdqu 16*7(arg3, %r11), \T1
  1616. vpxor \T1, \XMM8, \XMM8
  1617. vmovdqu \XMM8, 16*7(arg2 , %r11)
  1618. .if \ENC_DEC == DEC
  1619. vmovdqa \T1, \XMM8
  1620. .endif
  1621. add $128, %r11
  1622. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  1623. vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
  1624. # the corresponding ciphertext
  1625. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  1626. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  1627. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  1628. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  1629. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  1630. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  1631. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  1632. ###############################################################################
  1633. _initial_blocks_done\@:
  1634. .endm
  1635. # encrypt 8 blocks at a time
  1636. # ghash the 8 previously encrypted ciphertext blocks
  1637. # arg1, arg2, arg3 are used as pointers only, not modified
  1638. # r11 is the data offset value
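/* What one iteration of this macro contributes on the authentication side,
in reference-model form (illustrative; relies on the be128_t/gf128_mul
sketch above GHASH_MUL_AVX2 and the hpow[] table sketched after
PRECOMPUTE_AVX2; c[0..7] are the eight ciphertext blocks produced by the
previous iteration, with the running GHASH value already folded into c[0],
just as the code below folds it into XMM1):

static be128_t xor128(be128_t a, be128_t b)
{
        a.hi ^= b.hi;
        a.lo ^= b.lo;
        return a;
}

// Multiply block j by H^(8-j) and accumulate, so one reduction at the end
// covers all eight products; equivalent to folding the blocks into the
// GHASH state one at a time.
static be128_t ghash_8_blocks(const be128_t c[8], const be128_t hpow[8])
{
        be128_t acc = { 0, 0 };
        int j;

        for (j = 0; j < 8; j++)
                acc = xor128(acc, gf128_mul(c[j], hpow[7 - j]));
        return acc;
}
*/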
  1639. .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
  1640. vmovdqa \XMM1, \T2
  1641. vmovdqa \XMM2, TMP2(%rsp)
  1642. vmovdqa \XMM3, TMP3(%rsp)
  1643. vmovdqa \XMM4, TMP4(%rsp)
  1644. vmovdqa \XMM5, TMP5(%rsp)
  1645. vmovdqa \XMM6, TMP6(%rsp)
  1646. vmovdqa \XMM7, TMP7(%rsp)
  1647. vmovdqa \XMM8, TMP8(%rsp)
  1648. .if \loop_idx == in_order
  1649. vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
  1650. vpaddd ONE(%rip), \XMM1, \XMM2
  1651. vpaddd ONE(%rip), \XMM2, \XMM3
  1652. vpaddd ONE(%rip), \XMM3, \XMM4
  1653. vpaddd ONE(%rip), \XMM4, \XMM5
  1654. vpaddd ONE(%rip), \XMM5, \XMM6
  1655. vpaddd ONE(%rip), \XMM6, \XMM7
  1656. vpaddd ONE(%rip), \XMM7, \XMM8
  1657. vmovdqa \XMM8, \CTR
  1658. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  1659. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  1660. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  1661. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  1662. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  1663. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  1664. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  1665. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  1666. .else
  1667. vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
  1668. vpaddd ONEf(%rip), \XMM1, \XMM2
  1669. vpaddd ONEf(%rip), \XMM2, \XMM3
  1670. vpaddd ONEf(%rip), \XMM3, \XMM4
  1671. vpaddd ONEf(%rip), \XMM4, \XMM5
  1672. vpaddd ONEf(%rip), \XMM5, \XMM6
  1673. vpaddd ONEf(%rip), \XMM6, \XMM7
  1674. vpaddd ONEf(%rip), \XMM7, \XMM8
  1675. vmovdqa \XMM8, \CTR
  1676. .endif
  1677. #######################################################################
  1678. vmovdqu (arg1), \T1
  1679. vpxor \T1, \XMM1, \XMM1
  1680. vpxor \T1, \XMM2, \XMM2
  1681. vpxor \T1, \XMM3, \XMM3
  1682. vpxor \T1, \XMM4, \XMM4
  1683. vpxor \T1, \XMM5, \XMM5
  1684. vpxor \T1, \XMM6, \XMM6
  1685. vpxor \T1, \XMM7, \XMM7
  1686. vpxor \T1, \XMM8, \XMM8
  1687. #######################################################################
  1688. vmovdqu 16*1(arg1), \T1
  1689. vaesenc \T1, \XMM1, \XMM1
  1690. vaesenc \T1, \XMM2, \XMM2
  1691. vaesenc \T1, \XMM3, \XMM3
  1692. vaesenc \T1, \XMM4, \XMM4
  1693. vaesenc \T1, \XMM5, \XMM5
  1694. vaesenc \T1, \XMM6, \XMM6
  1695. vaesenc \T1, \XMM7, \XMM7
  1696. vaesenc \T1, \XMM8, \XMM8
  1697. vmovdqu 16*2(arg1), \T1
  1698. vaesenc \T1, \XMM1, \XMM1
  1699. vaesenc \T1, \XMM2, \XMM2
  1700. vaesenc \T1, \XMM3, \XMM3
  1701. vaesenc \T1, \XMM4, \XMM4
  1702. vaesenc \T1, \XMM5, \XMM5
  1703. vaesenc \T1, \XMM6, \XMM6
  1704. vaesenc \T1, \XMM7, \XMM7
  1705. vaesenc \T1, \XMM8, \XMM8
  1706. #######################################################################
  1707. vmovdqa HashKey_8(arg1), \T5
  1708. vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
  1709. vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
  1710. vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
  1711. vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
  1712. vpxor \T5, \T6, \T6
  1713. vmovdqu 16*3(arg1), \T1
  1714. vaesenc \T1, \XMM1, \XMM1
  1715. vaesenc \T1, \XMM2, \XMM2
  1716. vaesenc \T1, \XMM3, \XMM3
  1717. vaesenc \T1, \XMM4, \XMM4
  1718. vaesenc \T1, \XMM5, \XMM5
  1719. vaesenc \T1, \XMM6, \XMM6
  1720. vaesenc \T1, \XMM7, \XMM7
  1721. vaesenc \T1, \XMM8, \XMM8
  1722. vmovdqa TMP2(%rsp), \T1
  1723. vmovdqa HashKey_7(arg1), \T5
  1724. vpclmulqdq $0x11, \T5, \T1, \T3
  1725. vpxor \T3, \T4, \T4
  1726. vpclmulqdq $0x00, \T5, \T1, \T3
  1727. vpxor \T3, \T7, \T7
  1728. vpclmulqdq $0x01, \T5, \T1, \T3
  1729. vpxor \T3, \T6, \T6
  1730. vpclmulqdq $0x10, \T5, \T1, \T3
  1731. vpxor \T3, \T6, \T6
  1732. vmovdqu 16*4(arg1), \T1
  1733. vaesenc \T1, \XMM1, \XMM1
  1734. vaesenc \T1, \XMM2, \XMM2
  1735. vaesenc \T1, \XMM3, \XMM3
  1736. vaesenc \T1, \XMM4, \XMM4
  1737. vaesenc \T1, \XMM5, \XMM5
  1738. vaesenc \T1, \XMM6, \XMM6
  1739. vaesenc \T1, \XMM7, \XMM7
  1740. vaesenc \T1, \XMM8, \XMM8
  1741. #######################################################################
  1742. vmovdqa TMP3(%rsp), \T1
  1743. vmovdqa HashKey_6(arg1), \T5
  1744. vpclmulqdq $0x11, \T5, \T1, \T3
  1745. vpxor \T3, \T4, \T4
  1746. vpclmulqdq $0x00, \T5, \T1, \T3
  1747. vpxor \T3, \T7, \T7
  1748. vpclmulqdq $0x01, \T5, \T1, \T3
  1749. vpxor \T3, \T6, \T6
  1750. vpclmulqdq $0x10, \T5, \T1, \T3
  1751. vpxor \T3, \T6, \T6
  1752. vmovdqu 16*5(arg1), \T1
  1753. vaesenc \T1, \XMM1, \XMM1
  1754. vaesenc \T1, \XMM2, \XMM2
  1755. vaesenc \T1, \XMM3, \XMM3
  1756. vaesenc \T1, \XMM4, \XMM4
  1757. vaesenc \T1, \XMM5, \XMM5
  1758. vaesenc \T1, \XMM6, \XMM6
  1759. vaesenc \T1, \XMM7, \XMM7
  1760. vaesenc \T1, \XMM8, \XMM8
  1761. vmovdqa TMP4(%rsp), \T1
  1762. vmovdqa HashKey_5(arg1), \T5
  1763. vpclmulqdq $0x11, \T5, \T1, \T3
  1764. vpxor \T3, \T4, \T4
  1765. vpclmulqdq $0x00, \T5, \T1, \T3
  1766. vpxor \T3, \T7, \T7
  1767. vpclmulqdq $0x01, \T5, \T1, \T3
  1768. vpxor \T3, \T6, \T6
  1769. vpclmulqdq $0x10, \T5, \T1, \T3
  1770. vpxor \T3, \T6, \T6
  1771. vmovdqu 16*6(arg1), \T1
  1772. vaesenc \T1, \XMM1, \XMM1
  1773. vaesenc \T1, \XMM2, \XMM2
  1774. vaesenc \T1, \XMM3, \XMM3
  1775. vaesenc \T1, \XMM4, \XMM4
  1776. vaesenc \T1, \XMM5, \XMM5
  1777. vaesenc \T1, \XMM6, \XMM6
  1778. vaesenc \T1, \XMM7, \XMM7
  1779. vaesenc \T1, \XMM8, \XMM8
  1780. vmovdqa TMP5(%rsp), \T1
  1781. vmovdqa HashKey_4(arg1), \T5
  1782. vpclmulqdq $0x11, \T5, \T1, \T3
  1783. vpxor \T3, \T4, \T4
  1784. vpclmulqdq $0x00, \T5, \T1, \T3
  1785. vpxor \T3, \T7, \T7
  1786. vpclmulqdq $0x01, \T5, \T1, \T3
  1787. vpxor \T3, \T6, \T6
  1788. vpclmulqdq $0x10, \T5, \T1, \T3
  1789. vpxor \T3, \T6, \T6
  1790. vmovdqu 16*7(arg1), \T1
  1791. vaesenc \T1, \XMM1, \XMM1
  1792. vaesenc \T1, \XMM2, \XMM2
  1793. vaesenc \T1, \XMM3, \XMM3
  1794. vaesenc \T1, \XMM4, \XMM4
  1795. vaesenc \T1, \XMM5, \XMM5
  1796. vaesenc \T1, \XMM6, \XMM6
  1797. vaesenc \T1, \XMM7, \XMM7
  1798. vaesenc \T1, \XMM8, \XMM8
  1799. vmovdqa TMP6(%rsp), \T1
  1800. vmovdqa HashKey_3(arg1), \T5
  1801. vpclmulqdq $0x11, \T5, \T1, \T3
  1802. vpxor \T3, \T4, \T4
  1803. vpclmulqdq $0x00, \T5, \T1, \T3
  1804. vpxor \T3, \T7, \T7
  1805. vpclmulqdq $0x01, \T5, \T1, \T3
  1806. vpxor \T3, \T6, \T6
  1807. vpclmulqdq $0x10, \T5, \T1, \T3
  1808. vpxor \T3, \T6, \T6
  1809. vmovdqu 16*8(arg1), \T1
  1810. vaesenc \T1, \XMM1, \XMM1
  1811. vaesenc \T1, \XMM2, \XMM2
  1812. vaesenc \T1, \XMM3, \XMM3
  1813. vaesenc \T1, \XMM4, \XMM4
  1814. vaesenc \T1, \XMM5, \XMM5
  1815. vaesenc \T1, \XMM6, \XMM6
  1816. vaesenc \T1, \XMM7, \XMM7
  1817. vaesenc \T1, \XMM8, \XMM8
  1818. vmovdqa TMP7(%rsp), \T1
  1819. vmovdqa HashKey_2(arg1), \T5
  1820. vpclmulqdq $0x11, \T5, \T1, \T3
  1821. vpxor \T3, \T4, \T4
  1822. vpclmulqdq $0x00, \T5, \T1, \T3
  1823. vpxor \T3, \T7, \T7
  1824. vpclmulqdq $0x01, \T5, \T1, \T3
  1825. vpxor \T3, \T6, \T6
  1826. vpclmulqdq $0x10, \T5, \T1, \T3
  1827. vpxor \T3, \T6, \T6
  1828. #######################################################################
  1829. vmovdqu 16*9(arg1), \T5
  1830. vaesenc \T5, \XMM1, \XMM1
  1831. vaesenc \T5, \XMM2, \XMM2
  1832. vaesenc \T5, \XMM3, \XMM3
  1833. vaesenc \T5, \XMM4, \XMM4
  1834. vaesenc \T5, \XMM5, \XMM5
  1835. vaesenc \T5, \XMM6, \XMM6
  1836. vaesenc \T5, \XMM7, \XMM7
  1837. vaesenc \T5, \XMM8, \XMM8
  1838. vmovdqa TMP8(%rsp), \T1
  1839. vmovdqa HashKey(arg1), \T5
  1840. vpclmulqdq $0x00, \T5, \T1, \T3
  1841. vpxor \T3, \T7, \T7
  1842. vpclmulqdq $0x01, \T5, \T1, \T3
  1843. vpxor \T3, \T6, \T6
  1844. vpclmulqdq $0x10, \T5, \T1, \T3
  1845. vpxor \T3, \T6, \T6
  1846. vpclmulqdq $0x11, \T5, \T1, \T3
  1847. vpxor \T3, \T4, \T1
  1848. vmovdqu 16*10(arg1), \T5
  1849. i = 0
  1850. j = 1
  1851. setreg
  1852. .rep 8
  1853. vpxor 16*i(arg3, %r11), \T5, \T2
  1854. .if \ENC_DEC == ENC
  1855. vaesenclast \T2, reg_j, reg_j
  1856. .else
  1857. vaesenclast \T2, reg_j, \T3
  1858. vmovdqu 16*i(arg3, %r11), reg_j
  1859. vmovdqu \T3, 16*i(arg2, %r11)
  1860. .endif
  1861. i = (i+1)
  1862. j = (j+1)
  1863. setreg
  1864. .endr
  1865. #######################################################################
1866. vpslldq $8, \T6, \T3 # shift-L T6 2 DWs into T3
1867. vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
  1868. vpxor \T3, \T7, \T7
  1869. vpxor \T6, \T1, \T1 # accumulate the results in T1:T7
  1870. #######################################################################
  1871. #first phase of the reduction
  1872. vmovdqa POLY2(%rip), \T3
  1873. vpclmulqdq $0x01, \T7, \T3, \T2
1874. vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
  1875. vpxor \T2, \T7, \T7 # first phase of the reduction complete
  1876. #######################################################################
  1877. .if \ENC_DEC == ENC
  1878. vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
  1879. vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
  1880. vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
  1881. vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
  1882. vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
  1883. vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
  1884. vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
  1885. vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
  1886. .endif
  1887. #######################################################################
  1888. #second phase of the reduction
  1889. vpclmulqdq $0x00, \T7, \T3, \T2
1890. vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
  1891. vpclmulqdq $0x10, \T7, \T3, \T4
1892. vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
  1893. vpxor \T2, \T4, \T4 # second phase of the reduction complete
  1894. #######################################################################
  1895. vpxor \T4, \T1, \T1 # the result is in T1
  1896. vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
  1897. vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
  1898. vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
  1899. vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
  1900. vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
  1901. vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
  1902. vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
  1903. vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
  1904. vpxor \T1, \XMM1, \XMM1
  1905. .endm
1906. # GHASH the last 8 ciphertext blocks.
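/* The Karatsuba split used below, written with intrinsics as a standalone
sketch (illustrative; assumes a compiler with PCLMULQDQ support, e.g.
-mpclmul): three carry-less multiplies produce the 256-bit product a*b,
with the middle term built from (a1^a0)*(b1^b0), which is what the
vpshufd $0b01001110 / vpxor pairs below prepare (0x4E == 0b01001110).

#include <emmintrin.h>                  // SSE2
#include <wmmintrin.h>                  // PCLMULQDQ

// 128x128 -> 256-bit carry-less multiply via Karatsuba (hi:lo result).
static void clmul_karatsuba(__m128i a, __m128i b, __m128i *hi, __m128i *lo)
{
        __m128i a_sw = _mm_shuffle_epi32(a, 0x4E);              // swap 64-bit halves
        __m128i b_sw = _mm_shuffle_epi32(b, 0x4E);
        __m128i hh   = _mm_clmulepi64_si128(a, b, 0x11);        // a1*b1
        __m128i ll   = _mm_clmulepi64_si128(a, b, 0x00);        // a0*b0
        __m128i mid  = _mm_clmulepi64_si128(_mm_xor_si128(a, a_sw),
                                            _mm_xor_si128(b, b_sw), 0x00);

        mid = _mm_xor_si128(mid, _mm_xor_si128(hh, ll));  // (a1^a0)(b1^b0)^hh^ll
        *hi = _mm_xor_si128(hh, _mm_srli_si128(mid, 8));
        *lo = _mm_xor_si128(ll, _mm_slli_si128(mid, 8));
}
*/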
  1907. .macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
  1908. ## Karatsuba Method
  1909. vmovdqa HashKey_8(arg1), \T5
  1910. vpshufd $0b01001110, \XMM1, \T2
  1911. vpshufd $0b01001110, \T5, \T3
  1912. vpxor \XMM1, \T2, \T2
  1913. vpxor \T5, \T3, \T3
  1914. vpclmulqdq $0x11, \T5, \XMM1, \T6
  1915. vpclmulqdq $0x00, \T5, \XMM1, \T7
  1916. vpclmulqdq $0x00, \T3, \T2, \XMM1
  1917. ######################
  1918. vmovdqa HashKey_7(arg1), \T5
  1919. vpshufd $0b01001110, \XMM2, \T2
  1920. vpshufd $0b01001110, \T5, \T3
  1921. vpxor \XMM2, \T2, \T2
  1922. vpxor \T5, \T3, \T3
  1923. vpclmulqdq $0x11, \T5, \XMM2, \T4
  1924. vpxor \T4, \T6, \T6
  1925. vpclmulqdq $0x00, \T5, \XMM2, \T4
  1926. vpxor \T4, \T7, \T7
  1927. vpclmulqdq $0x00, \T3, \T2, \T2
  1928. vpxor \T2, \XMM1, \XMM1
  1929. ######################
  1930. vmovdqa HashKey_6(arg1), \T5
  1931. vpshufd $0b01001110, \XMM3, \T2
  1932. vpshufd $0b01001110, \T5, \T3
  1933. vpxor \XMM3, \T2, \T2
  1934. vpxor \T5, \T3, \T3
  1935. vpclmulqdq $0x11, \T5, \XMM3, \T4
  1936. vpxor \T4, \T6, \T6
  1937. vpclmulqdq $0x00, \T5, \XMM3, \T4
  1938. vpxor \T4, \T7, \T7
  1939. vpclmulqdq $0x00, \T3, \T2, \T2
  1940. vpxor \T2, \XMM1, \XMM1
  1941. ######################
  1942. vmovdqa HashKey_5(arg1), \T5
  1943. vpshufd $0b01001110, \XMM4, \T2
  1944. vpshufd $0b01001110, \T5, \T3
  1945. vpxor \XMM4, \T2, \T2
  1946. vpxor \T5, \T3, \T3
  1947. vpclmulqdq $0x11, \T5, \XMM4, \T4
  1948. vpxor \T4, \T6, \T6
  1949. vpclmulqdq $0x00, \T5, \XMM4, \T4
  1950. vpxor \T4, \T7, \T7
  1951. vpclmulqdq $0x00, \T3, \T2, \T2
  1952. vpxor \T2, \XMM1, \XMM1
  1953. ######################
  1954. vmovdqa HashKey_4(arg1), \T5
  1955. vpshufd $0b01001110, \XMM5, \T2
  1956. vpshufd $0b01001110, \T5, \T3
  1957. vpxor \XMM5, \T2, \T2
  1958. vpxor \T5, \T3, \T3
  1959. vpclmulqdq $0x11, \T5, \XMM5, \T4
  1960. vpxor \T4, \T6, \T6
  1961. vpclmulqdq $0x00, \T5, \XMM5, \T4
  1962. vpxor \T4, \T7, \T7
  1963. vpclmulqdq $0x00, \T3, \T2, \T2
  1964. vpxor \T2, \XMM1, \XMM1
  1965. ######################
  1966. vmovdqa HashKey_3(arg1), \T5
  1967. vpshufd $0b01001110, \XMM6, \T2
  1968. vpshufd $0b01001110, \T5, \T3
  1969. vpxor \XMM6, \T2, \T2
  1970. vpxor \T5, \T3, \T3
  1971. vpclmulqdq $0x11, \T5, \XMM6, \T4
  1972. vpxor \T4, \T6, \T6
  1973. vpclmulqdq $0x00, \T5, \XMM6, \T4
  1974. vpxor \T4, \T7, \T7
  1975. vpclmulqdq $0x00, \T3, \T2, \T2
  1976. vpxor \T2, \XMM1, \XMM1
  1977. ######################
  1978. vmovdqa HashKey_2(arg1), \T5
  1979. vpshufd $0b01001110, \XMM7, \T2
  1980. vpshufd $0b01001110, \T5, \T3
  1981. vpxor \XMM7, \T2, \T2
  1982. vpxor \T5, \T3, \T3
  1983. vpclmulqdq $0x11, \T5, \XMM7, \T4
  1984. vpxor \T4, \T6, \T6
  1985. vpclmulqdq $0x00, \T5, \XMM7, \T4
  1986. vpxor \T4, \T7, \T7
  1987. vpclmulqdq $0x00, \T3, \T2, \T2
  1988. vpxor \T2, \XMM1, \XMM1
  1989. ######################
  1990. vmovdqa HashKey(arg1), \T5
  1991. vpshufd $0b01001110, \XMM8, \T2
  1992. vpshufd $0b01001110, \T5, \T3
  1993. vpxor \XMM8, \T2, \T2
  1994. vpxor \T5, \T3, \T3
  1995. vpclmulqdq $0x11, \T5, \XMM8, \T4
  1996. vpxor \T4, \T6, \T6
  1997. vpclmulqdq $0x00, \T5, \XMM8, \T4
  1998. vpxor \T4, \T7, \T7
  1999. vpclmulqdq $0x00, \T3, \T2, \T2
  2000. vpxor \T2, \XMM1, \XMM1
  2001. vpxor \T6, \XMM1, \XMM1
  2002. vpxor \T7, \XMM1, \T2
  2003. vpslldq $8, \T2, \T4
  2004. vpsrldq $8, \T2, \T2
  2005. vpxor \T4, \T7, \T7
  2006. vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the
  2007. # accumulated carry-less multiplications
  2008. #######################################################################
  2009. #first phase of the reduction
  2010. vmovdqa POLY2(%rip), \T3
  2011. vpclmulqdq $0x01, \T7, \T3, \T2
2012. vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
  2013. vpxor \T2, \T7, \T7 # first phase of the reduction complete
  2014. #######################################################################
  2015. #second phase of the reduction
  2016. vpclmulqdq $0x00, \T7, \T3, \T2
  2017. vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
  2018. vpclmulqdq $0x10, \T7, \T3, \T4
  2019. vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
  2020. vpxor \T2, \T4, \T4 # second phase of the reduction complete
  2021. #######################################################################
  2022. vpxor \T4, \T6, \T6 # the result is in T6
  2023. .endm
  2024. # combined for GCM encrypt and decrypt functions
  2025. # clobbering all xmm registers
  2026. # clobbering r10, r11, r12, r13, r14, r15
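/* The tag finalisation this macro performs at _multiple_of_16_bytes, in
reference-model form (illustrative; be128_t, gf128_mul and xor128 are the
sketches above, and the byte-order shuffles are omitted):

static be128_t gcm_lengths_block(u64 aad_len, u64 text_len)
{
        be128_t lb;

        lb.hi = aad_len * 8;            // len(A) in bits -> upper 64 bits
        lb.lo = text_len * 8;           // len(C) in bits -> lower 64 bits
        return lb;
}

// final GHASH: Y = gf128_mul(xor128(Y, gcm_lengths_block(aad_len, len)), H)
// tag:         T = AES_K(Y0) XOR Y, truncated to auth_tag_len bytes
*/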
  2027. .macro GCM_ENC_DEC_AVX2 ENC_DEC
  2028. #the number of pushes must equal STACK_OFFSET
  2029. push %r12
  2030. push %r13
  2031. push %r14
  2032. push %r15
  2033. mov %rsp, %r14
  2034. sub $VARIABLE_OFFSET, %rsp
  2035. and $~63, %rsp # align rsp to 64 bytes
  2036. vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
  2037. mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
  2038. and $-16, %r13 # r13 = r13 - (r13 mod 16)
  2039. mov %r13, %r12
  2040. shr $4, %r12
  2041. and $7, %r12
  2042. jz _initial_num_blocks_is_0\@
  2043. cmp $7, %r12
  2044. je _initial_num_blocks_is_7\@
  2045. cmp $6, %r12
  2046. je _initial_num_blocks_is_6\@
  2047. cmp $5, %r12
  2048. je _initial_num_blocks_is_5\@
  2049. cmp $4, %r12
  2050. je _initial_num_blocks_is_4\@
  2051. cmp $3, %r12
  2052. je _initial_num_blocks_is_3\@
  2053. cmp $2, %r12
  2054. je _initial_num_blocks_is_2\@
  2055. jmp _initial_num_blocks_is_1\@
  2056. _initial_num_blocks_is_7\@:
  2057. INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  2058. sub $16*7, %r13
  2059. jmp _initial_blocks_encrypted\@
  2060. _initial_num_blocks_is_6\@:
  2061. INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  2062. sub $16*6, %r13
  2063. jmp _initial_blocks_encrypted\@
  2064. _initial_num_blocks_is_5\@:
  2065. INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  2066. sub $16*5, %r13
  2067. jmp _initial_blocks_encrypted\@
  2068. _initial_num_blocks_is_4\@:
  2069. INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  2070. sub $16*4, %r13
  2071. jmp _initial_blocks_encrypted\@
  2072. _initial_num_blocks_is_3\@:
  2073. INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  2074. sub $16*3, %r13
  2075. jmp _initial_blocks_encrypted\@
  2076. _initial_num_blocks_is_2\@:
  2077. INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  2078. sub $16*2, %r13
  2079. jmp _initial_blocks_encrypted\@
  2080. _initial_num_blocks_is_1\@:
  2081. INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  2082. sub $16*1, %r13
  2083. jmp _initial_blocks_encrypted\@
  2084. _initial_num_blocks_is_0\@:
  2085. INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
  2086. _initial_blocks_encrypted\@:
  2087. cmp $0, %r13
  2088. je _zero_cipher_left\@
  2089. sub $128, %r13
  2090. je _eight_cipher_left\@
  2091. vmovd %xmm9, %r15d
  2092. and $255, %r15d
  2093. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  2094. _encrypt_by_8_new\@:
  2095. cmp $(255-8), %r15d
  2096. jg _encrypt_by_8\@
  2097. add $8, %r15b
  2098. GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
  2099. add $128, %r11
  2100. sub $128, %r13
  2101. jne _encrypt_by_8_new\@
  2102. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  2103. jmp _eight_cipher_left\@
  2104. _encrypt_by_8\@:
  2105. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  2106. add $8, %r15b
  2107. GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
  2108. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  2109. add $128, %r11
  2110. sub $128, %r13
  2111. jne _encrypt_by_8_new\@
  2112. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  2113. _eight_cipher_left\@:
  2114. GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
  2115. _zero_cipher_left\@:
  2116. cmp $16, arg4
  2117. jl _only_less_than_16\@
  2118. mov arg4, %r13
  2119. and $15, %r13 # r13 = (arg4 mod 16)
  2120. je _multiple_of_16_bytes\@
2121. # handle the last <16 Byte block separately
  2122. vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
  2123. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  2124. ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
  2125. sub $16, %r11
  2126. add %r13, %r11
  2127. vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
  2128. lea SHIFT_MASK+16(%rip), %r12
  2129. sub %r13, %r12 # adjust the shuffle mask pointer
  2130. # to be able to shift 16-r13 bytes
  2131. # (r13 is the number of bytes in plaintext mod 16)
  2132. vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
  2133. vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
  2134. jmp _final_ghash_mul\@
  2135. _only_less_than_16\@:
  2136. # check for 0 length
  2137. mov arg4, %r13
  2138. and $15, %r13 # r13 = (arg4 mod 16)
  2139. je _multiple_of_16_bytes\@
2140. # handle the last <16 Byte block separately
  2141. vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
  2142. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  2143. ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
  2144. lea SHIFT_MASK+16(%rip), %r12
  2145. sub %r13, %r12 # adjust the shuffle mask pointer to be
  2146. # able to shift 16-r13 bytes (r13 is the
  2147. # number of bytes in plaintext mod 16)
  2148. _get_last_16_byte_loop\@:
  2149. movb (arg3, %r11), %al
  2150. movb %al, TMP1 (%rsp , %r11)
  2151. add $1, %r11
  2152. cmp %r13, %r11
  2153. jne _get_last_16_byte_loop\@
  2154. vmovdqu TMP1(%rsp), %xmm1
  2155. sub $16, %r11
  2156. _final_ghash_mul\@:
  2157. .if \ENC_DEC == DEC
  2158. vmovdqa %xmm1, %xmm2
  2159. vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
  2160. vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
  2161. vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
  2162. vpand %xmm1, %xmm2, %xmm2
  2163. vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
  2164. vpxor %xmm2, %xmm14, %xmm14
  2165. #GHASH computation for the last <16 Byte block
  2166. GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
  2167. sub %r13, %r11
  2168. add $16, %r11
  2169. .else
  2170. vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
  2171. vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
  2172. vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
  2173. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
  2174. vpxor %xmm9, %xmm14, %xmm14
  2175. #GHASH computation for the last <16 Byte block
  2176. GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
  2177. sub %r13, %r11
  2178. add $16, %r11
  2179. vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
  2180. .endif
  2181. #############################
  2182. # output r13 Bytes
  2183. vmovq %xmm9, %rax
  2184. cmp $8, %r13
  2185. jle _less_than_8_bytes_left\@
  2186. mov %rax, (arg2 , %r11)
  2187. add $8, %r11
  2188. vpsrldq $8, %xmm9, %xmm9
  2189. vmovq %xmm9, %rax
  2190. sub $8, %r13
  2191. _less_than_8_bytes_left\@:
  2192. movb %al, (arg2 , %r11)
  2193. add $1, %r11
  2194. shr $8, %rax
  2195. sub $1, %r13
  2196. jne _less_than_8_bytes_left\@
  2197. #############################
  2198. _multiple_of_16_bytes\@:
  2199. mov arg7, %r12 # r12 = aadLen (number of bytes)
  2200. shl $3, %r12 # convert into number of bits
  2201. vmovd %r12d, %xmm15 # len(A) in xmm15
2202. shl $3, arg4 # len(C) in bits (*8)
  2203. vmovq arg4, %xmm1
  2204. vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
  2205. vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
  2206. vpxor %xmm15, %xmm14, %xmm14
  2207. GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
  2208. vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
  2209. mov arg5, %rax # rax = *Y0
  2210. vmovdqu (%rax), %xmm9 # xmm9 = Y0
  2211. ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
  2212. vpxor %xmm14, %xmm9, %xmm9
  2213. _return_T\@:
  2214. mov arg8, %r10 # r10 = authTag
  2215. mov arg9, %r11 # r11 = auth_tag_len
  2216. cmp $16, %r11
  2217. je _T_16\@
  2218. cmp $8, %r11
  2219. jl _T_4\@
  2220. _T_8\@:
  2221. vmovq %xmm9, %rax
  2222. mov %rax, (%r10)
  2223. add $8, %r10
  2224. sub $8, %r11
  2225. vpsrldq $8, %xmm9, %xmm9
  2226. cmp $0, %r11
  2227. je _return_T_done\@
  2228. _T_4\@:
  2229. vmovd %xmm9, %eax
  2230. mov %eax, (%r10)
  2231. add $4, %r10
  2232. sub $4, %r11
  2233. vpsrldq $4, %xmm9, %xmm9
  2234. cmp $0, %r11
  2235. je _return_T_done\@
  2236. _T_123\@:
  2237. vmovd %xmm9, %eax
  2238. cmp $2, %r11
  2239. jl _T_1\@
  2240. mov %ax, (%r10)
  2241. cmp $2, %r11
  2242. je _return_T_done\@
  2243. add $2, %r10
  2244. sar $16, %eax
  2245. _T_1\@:
  2246. mov %al, (%r10)
  2247. jmp _return_T_done\@
  2248. _T_16\@:
  2249. vmovdqu %xmm9, (%r10)
  2250. _return_T_done\@:
  2251. mov %r14, %rsp
  2252. pop %r15
  2253. pop %r14
  2254. pop %r13
  2255. pop %r12
  2256. .endm
  2257. #############################################################
  2258. #void aesni_gcm_precomp_avx_gen4
  2259. # (gcm_data *my_ctx_data,
  2260. # u8 *hash_subkey)# /* H, the Hash sub key input.
  2261. # Data starts on a 16-byte boundary. */
  2262. #############################################################
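/* Net effect of the "PRECOMPUTATION of HashKey<<1 mod poly" sequence just
below, as a C sketch (illustrative; be128_t is reused here simply as a
128-bit container whose hi/lo are the upper and lower quadwords of the
byte-reflected HashKey, and the fold constant encodes the reflected
polynomial x^128 + x^127 + x^126 + x^121 + 1 named in the GHASH_MUL
comments):

static be128_t hashkey_shl1_mod_poly(be128_t h)  // h is byte-reflected
{
        u64 carry = h.hi >> 63;                 // bit about to fall off the top

        h.hi = (h.hi << 1) | (h.lo >> 63);      // 128-bit shift left by one
        h.lo <<= 1;
        if (carry) {                            // x^128 folds back in as
                h.hi ^= 0xC200000000000000ULL;  // x^127 + x^126 + x^121 + 1
                h.lo ^= 1;
        }
        return h;
}
*/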
  2263. ENTRY(aesni_gcm_precomp_avx_gen4)
  2264. #the number of pushes must equal STACK_OFFSET
  2265. push %r12
  2266. push %r13
  2267. push %r14
  2268. push %r15
  2269. mov %rsp, %r14
  2270. sub $VARIABLE_OFFSET, %rsp
  2271. and $~63, %rsp # align rsp to 64 bytes
  2272. vmovdqu (arg2), %xmm6 # xmm6 = HashKey
  2273. vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
  2274. ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
  2275. vmovdqa %xmm6, %xmm2
  2276. vpsllq $1, %xmm6, %xmm6
  2277. vpsrlq $63, %xmm2, %xmm2
  2278. vmovdqa %xmm2, %xmm1
  2279. vpslldq $8, %xmm2, %xmm2
  2280. vpsrldq $8, %xmm1, %xmm1
  2281. vpor %xmm2, %xmm6, %xmm6
  2282. #reduction
  2283. vpshufd $0b00100100, %xmm1, %xmm2
  2284. vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
  2285. vpand POLY(%rip), %xmm2, %xmm2
  2286. vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
  2287. #######################################################################
  2288. vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
  2289. PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
  2290. mov %r14, %rsp
  2291. pop %r15
  2292. pop %r14
  2293. pop %r13
  2294. pop %r12
  2295. ret
  2296. ENDPROC(aesni_gcm_precomp_avx_gen4)
  2297. ###############################################################################
  2298. #void aesni_gcm_enc_avx_gen4(
  2299. # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
  2300. # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
  2301. # const u8 *in, /* Plaintext input */
  2302. # u64 plaintext_len, /* Length of data in Bytes for encryption. */
  2303. # u8 *iv, /* Pre-counter block j0: 4 byte salt
  2304. # (from Security Association) concatenated with 8 byte
  2305. # Initialisation Vector (from IPSec ESP Payload)
  2306. # concatenated with 0x00000001. 16-byte aligned pointer. */
  2307. # const u8 *aad, /* Additional Authentication Data (AAD)*/
  2308. # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
  2309. # u8 *auth_tag, /* Authenticated Tag output. */
  2310. # u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
  2311. # Valid values are 16 (most likely), 12 or 8. */
  2312. ###############################################################################
  2313. ENTRY(aesni_gcm_enc_avx_gen4)
  2314. GCM_ENC_DEC_AVX2 ENC
  2315. ret
  2316. ENDPROC(aesni_gcm_enc_avx_gen4)
  2317. ###############################################################################
  2318. #void aesni_gcm_dec_avx_gen4(
  2319. # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
  2320. # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
  2321. # const u8 *in, /* Ciphertext input */
2322. # u64 plaintext_len, /* Length of data in Bytes for decryption. */
  2323. # u8 *iv, /* Pre-counter block j0: 4 byte salt
  2324. # (from Security Association) concatenated with 8 byte
  2325. # Initialisation Vector (from IPSec ESP Payload)
  2326. # concatenated with 0x00000001. 16-byte aligned pointer. */
  2327. # const u8 *aad, /* Additional Authentication Data (AAD)*/
  2328. # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
  2329. # u8 *auth_tag, /* Authenticated Tag output. */
  2330. # u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
  2331. # Valid values are 16 (most likely), 12 or 8. */
  2332. ###############################################################################
  2333. ENTRY(aesni_gcm_dec_avx_gen4)
  2334. GCM_ENC_DEC_AVX2 DEC
  2335. ret
  2336. ENDPROC(aesni_gcm_dec_avx_gen4)
  2337. #endif /* CONFIG_AS_AVX2 */