aesni-intel_asm.S 77 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843
  1. /*
  2. * Implement AES algorithm in Intel AES-NI instructions.
  3. *
  4. * The white paper of AES-NI instructions can be downloaded from:
  5. * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
  6. *
  7. * Copyright (C) 2008, Intel Corp.
  8. * Author: Huang Ying <ying.huang@intel.com>
  9. * Vinodh Gopal <vinodh.gopal@intel.com>
  10. * Kahraman Akdemir
  11. *
  12. * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
  13. * interface for 64-bit kernels.
  14. * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
  15. * Aidan O'Mahony (aidan.o.mahony@intel.com)
  16. * Adrian Hoban <adrian.hoban@intel.com>
  17. * James Guilford (james.guilford@intel.com)
  18. * Gabriele Paoloni <gabriele.paoloni@intel.com>
  19. * Tadeusz Struk (tadeusz.struk@intel.com)
  20. * Wajdi Feghali (wajdi.k.feghali@intel.com)
  21. * Copyright (c) 2010, Intel Corporation.
  22. *
  23. * Ported x86_64 version to x86:
  24. * Author: Mathias Krause <minipli@googlemail.com>
  25. *
  26. * This program is free software; you can redistribute it and/or modify
  27. * it under the terms of the GNU General Public License as published by
  28. * the Free Software Foundation; either version 2 of the License, or
  29. * (at your option) any later version.
  30. */
  31. #include <linux/linkage.h>
  32. #include <asm/inst.h>
  33. #include <asm/frame.h>
  34. #include <asm/nospec-branch.h>
  35. /*
  36. * The following macros are used to move an (un)aligned 16 byte value to/from
  37. * an XMM register. This can done for either FP or integer values, for FP use
  38. * movaps (move aligned packed single) or integer use movdqa (move double quad
  39. * aligned). It doesn't make a performance difference which instruction is used
  40. * since Nehalem (original Core i7) was released. However, the movaps is a byte
  41. * shorter, so that is the one we'll use for now. (same for unaligned).
  42. */
  43. #define MOVADQ movaps
  44. #define MOVUDQ movups
  45. #ifdef __x86_64__
  46. # constants in mergeable sections, linker can reorder and merge
  47. .section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
  48. .align 16
  49. .Lgf128mul_x_ble_mask:
  50. .octa 0x00000000000000010000000000000087
  51. .section .rodata.cst16.POLY, "aM", @progbits, 16
  52. .align 16
  53. POLY: .octa 0xC2000000000000000000000000000001
  54. .section .rodata.cst16.TWOONE, "aM", @progbits, 16
  55. .align 16
  56. TWOONE: .octa 0x00000001000000000000000000000001
  57. .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
  58. .align 16
  59. SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
  60. .section .rodata.cst16.MASK1, "aM", @progbits, 16
  61. .align 16
  62. MASK1: .octa 0x0000000000000000ffffffffffffffff
  63. .section .rodata.cst16.MASK2, "aM", @progbits, 16
  64. .align 16
  65. MASK2: .octa 0xffffffffffffffff0000000000000000
  66. .section .rodata.cst16.ONE, "aM", @progbits, 16
  67. .align 16
  68. ONE: .octa 0x00000000000000000000000000000001
  69. .section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
  70. .align 16
  71. F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
  72. .section .rodata.cst16.dec, "aM", @progbits, 16
  73. .align 16
  74. dec: .octa 0x1
  75. .section .rodata.cst16.enc, "aM", @progbits, 16
  76. .align 16
  77. enc: .octa 0x2
  78. # order of these constants should not change.
  79. # more specifically, ALL_F should follow SHIFT_MASK,
  80. # and zero should follow ALL_F
  81. .section .rodata, "a", @progbits
  82. .align 16
  83. SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  84. ALL_F: .octa 0xffffffffffffffffffffffffffffffff
  85. .octa 0x00000000000000000000000000000000
  86. .text
  87. #define STACK_OFFSET 8*3
  88. #define AadHash 16*0
  89. #define AadLen 16*1
  90. #define InLen (16*1)+8
  91. #define PBlockEncKey 16*2
  92. #define OrigIV 16*3
  93. #define CurCount 16*4
  94. #define PBlockLen 16*5
  95. #define HashKey 16*6 // store HashKey <<1 mod poly here
  96. #define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here
  97. #define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here
  98. #define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here
  99. #define HashKey_k 16*10 // store XOR of High 64 bits and Low 64
  100. // bits of HashKey <<1 mod poly here
  101. //(for Karatsuba purposes)
  102. #define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64
  103. // bits of HashKey^2 <<1 mod poly here
  104. // (for Karatsuba purposes)
  105. #define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64
  106. // bits of HashKey^3 <<1 mod poly here
  107. // (for Karatsuba purposes)
  108. #define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64
  109. // bits of HashKey^4 <<1 mod poly here
  110. // (for Karatsuba purposes)
  111. #define arg1 rdi
  112. #define arg2 rsi
  113. #define arg3 rdx
  114. #define arg4 rcx
  115. #define arg5 r8
  116. #define arg6 r9
  117. #define arg7 STACK_OFFSET+8(%rsp)
  118. #define arg8 STACK_OFFSET+16(%rsp)
  119. #define arg9 STACK_OFFSET+24(%rsp)
  120. #define arg10 STACK_OFFSET+32(%rsp)
  121. #define arg11 STACK_OFFSET+40(%rsp)
  122. #define keysize 2*15*16(%arg1)
  123. #endif
  124. #define STATE1 %xmm0
  125. #define STATE2 %xmm4
  126. #define STATE3 %xmm5
  127. #define STATE4 %xmm6
  128. #define STATE STATE1
  129. #define IN1 %xmm1
  130. #define IN2 %xmm7
  131. #define IN3 %xmm8
  132. #define IN4 %xmm9
  133. #define IN IN1
  134. #define KEY %xmm2
  135. #define IV %xmm3
  136. #define BSWAP_MASK %xmm10
  137. #define CTR %xmm11
  138. #define INC %xmm12
  139. #define GF128MUL_MASK %xmm10
  140. #ifdef __x86_64__
  141. #define AREG %rax
  142. #define KEYP %rdi
  143. #define OUTP %rsi
  144. #define UKEYP OUTP
  145. #define INP %rdx
  146. #define LEN %rcx
  147. #define IVP %r8
  148. #define KLEN %r9d
  149. #define T1 %r10
  150. #define TKEYP T1
  151. #define T2 %r11
  152. #define TCTR_LOW T2
  153. #else
  154. #define AREG %eax
  155. #define KEYP %edi
  156. #define OUTP AREG
  157. #define UKEYP OUTP
  158. #define INP %edx
  159. #define LEN %esi
  160. #define IVP %ebp
  161. #define KLEN %ebx
  162. #define T1 %ecx
  163. #define TKEYP T1
  164. #endif
  165. .macro FUNC_SAVE
  166. push %r12
  167. push %r13
  168. push %r14
  169. #
  170. # states of %xmm registers %xmm6:%xmm15 not saved
  171. # all %xmm registers are clobbered
  172. #
  173. .endm
  174. .macro FUNC_RESTORE
  175. pop %r14
  176. pop %r13
  177. pop %r12
  178. .endm
  179. # Precompute hashkeys.
  180. # Input: Hash subkey.
  181. # Output: HashKeys stored in gcm_context_data. Only needs to be called
  182. # once per key.
  183. # clobbers r12, and tmp xmm registers.
  184. .macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
  185. mov \SUBKEY, %r12
  186. movdqu (%r12), \TMP3
  187. movdqa SHUF_MASK(%rip), \TMP2
  188. PSHUFB_XMM \TMP2, \TMP3
  189. # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
  190. movdqa \TMP3, \TMP2
  191. psllq $1, \TMP3
  192. psrlq $63, \TMP2
  193. movdqa \TMP2, \TMP1
  194. pslldq $8, \TMP2
  195. psrldq $8, \TMP1
  196. por \TMP2, \TMP3
  197. # reduce HashKey<<1
  198. pshufd $0x24, \TMP1, \TMP2
  199. pcmpeqd TWOONE(%rip), \TMP2
  200. pand POLY(%rip), \TMP2
  201. pxor \TMP2, \TMP3
  202. movdqu \TMP3, HashKey(%arg2)
  203. movdqa \TMP3, \TMP5
  204. pshufd $78, \TMP3, \TMP1
  205. pxor \TMP3, \TMP1
  206. movdqu \TMP1, HashKey_k(%arg2)
  207. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  208. # TMP5 = HashKey^2<<1 (mod poly)
  209. movdqu \TMP5, HashKey_2(%arg2)
  210. # HashKey_2 = HashKey^2<<1 (mod poly)
  211. pshufd $78, \TMP5, \TMP1
  212. pxor \TMP5, \TMP1
  213. movdqu \TMP1, HashKey_2_k(%arg2)
  214. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  215. # TMP5 = HashKey^3<<1 (mod poly)
  216. movdqu \TMP5, HashKey_3(%arg2)
  217. pshufd $78, \TMP5, \TMP1
  218. pxor \TMP5, \TMP1
  219. movdqu \TMP1, HashKey_3_k(%arg2)
  220. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  221. # TMP5 = HashKey^3<<1 (mod poly)
  222. movdqu \TMP5, HashKey_4(%arg2)
  223. pshufd $78, \TMP5, \TMP1
  224. pxor \TMP5, \TMP1
  225. movdqu \TMP1, HashKey_4_k(%arg2)
  226. .endm
  227. # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
  228. # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
  229. .macro GCM_INIT Iv SUBKEY AAD AADLEN
  230. mov \AADLEN, %r11
  231. mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
  232. xor %r11d, %r11d
  233. mov %r11, InLen(%arg2) # ctx_data.in_length = 0
  234. mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
  235. mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
  236. mov \Iv, %rax
  237. movdqu (%rax), %xmm0
  238. movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
  239. movdqa SHUF_MASK(%rip), %xmm2
  240. PSHUFB_XMM %xmm2, %xmm0
  241. movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
  242. PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
  243. movdqu HashKey(%arg2), %xmm13
  244. CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
  245. %xmm4, %xmm5, %xmm6
  246. .endm
  247. # GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
  248. # struct has been initialized by GCM_INIT.
  249. # Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
  250. # Clobbers rax, r10-r13, and xmm0-xmm15
  251. .macro GCM_ENC_DEC operation
  252. movdqu AadHash(%arg2), %xmm8
  253. movdqu HashKey(%arg2), %xmm13
  254. add %arg5, InLen(%arg2)
  255. xor %r11d, %r11d # initialise the data pointer offset as zero
  256. PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
  257. sub %r11, %arg5 # sub partial block data used
  258. mov %arg5, %r13 # save the number of bytes
  259. and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
  260. mov %r13, %r12
  261. # Encrypt/Decrypt first few blocks
  262. and $(3<<4), %r12
  263. jz _initial_num_blocks_is_0_\@
  264. cmp $(2<<4), %r12
  265. jb _initial_num_blocks_is_1_\@
  266. je _initial_num_blocks_is_2_\@
  267. _initial_num_blocks_is_3_\@:
  268. INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  269. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
  270. sub $48, %r13
  271. jmp _initial_blocks_\@
  272. _initial_num_blocks_is_2_\@:
  273. INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  274. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
  275. sub $32, %r13
  276. jmp _initial_blocks_\@
  277. _initial_num_blocks_is_1_\@:
  278. INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  279. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
  280. sub $16, %r13
  281. jmp _initial_blocks_\@
  282. _initial_num_blocks_is_0_\@:
  283. INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  284. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
  285. _initial_blocks_\@:
  286. # Main loop - Encrypt/Decrypt remaining blocks
  287. cmp $0, %r13
  288. je _zero_cipher_left_\@
  289. sub $64, %r13
  290. je _four_cipher_left_\@
  291. _crypt_by_4_\@:
  292. GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \
  293. %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
  294. %xmm7, %xmm8, enc
  295. add $64, %r11
  296. sub $64, %r13
  297. jne _crypt_by_4_\@
  298. _four_cipher_left_\@:
  299. GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
  300. %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
  301. _zero_cipher_left_\@:
  302. movdqu %xmm8, AadHash(%arg2)
  303. movdqu %xmm0, CurCount(%arg2)
  304. mov %arg5, %r13
  305. and $15, %r13 # %r13 = arg5 (mod 16)
  306. je _multiple_of_16_bytes_\@
  307. mov %r13, PBlockLen(%arg2)
  308. # Handle the last <16 Byte block separately
  309. paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
  310. movdqu %xmm0, CurCount(%arg2)
  311. movdqa SHUF_MASK(%rip), %xmm10
  312. PSHUFB_XMM %xmm10, %xmm0
  313. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
  314. movdqu %xmm0, PBlockEncKey(%arg2)
  315. cmp $16, %arg5
  316. jge _large_enough_update_\@
  317. lea (%arg4,%r11,1), %r10
  318. mov %r13, %r12
  319. READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
  320. jmp _data_read_\@
  321. _large_enough_update_\@:
  322. sub $16, %r11
  323. add %r13, %r11
  324. # receive the last <16 Byte block
  325. movdqu (%arg4, %r11, 1), %xmm1
  326. sub %r13, %r11
  327. add $16, %r11
  328. lea SHIFT_MASK+16(%rip), %r12
  329. # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
  330. # (r13 is the number of bytes in plaintext mod 16)
  331. sub %r13, %r12
  332. # get the appropriate shuffle mask
  333. movdqu (%r12), %xmm2
  334. # shift right 16-r13 bytes
  335. PSHUFB_XMM %xmm2, %xmm1
  336. _data_read_\@:
  337. lea ALL_F+16(%rip), %r12
  338. sub %r13, %r12
  339. .ifc \operation, dec
  340. movdqa %xmm1, %xmm2
  341. .endif
  342. pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn)
  343. movdqu (%r12), %xmm1
  344. # get the appropriate mask to mask out top 16-r13 bytes of xmm0
  345. pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
  346. .ifc \operation, dec
  347. pand %xmm1, %xmm2
  348. movdqa SHUF_MASK(%rip), %xmm10
  349. PSHUFB_XMM %xmm10 ,%xmm2
  350. pxor %xmm2, %xmm8
  351. .else
  352. movdqa SHUF_MASK(%rip), %xmm10
  353. PSHUFB_XMM %xmm10,%xmm0
  354. pxor %xmm0, %xmm8
  355. .endif
  356. movdqu %xmm8, AadHash(%arg2)
  357. .ifc \operation, enc
  358. # GHASH computation for the last <16 byte block
  359. movdqa SHUF_MASK(%rip), %xmm10
  360. # shuffle xmm0 back to output as ciphertext
  361. PSHUFB_XMM %xmm10, %xmm0
  362. .endif
  363. # Output %r13 bytes
  364. MOVQ_R64_XMM %xmm0, %rax
  365. cmp $8, %r13
  366. jle _less_than_8_bytes_left_\@
  367. mov %rax, (%arg3 , %r11, 1)
  368. add $8, %r11
  369. psrldq $8, %xmm0
  370. MOVQ_R64_XMM %xmm0, %rax
  371. sub $8, %r13
  372. _less_than_8_bytes_left_\@:
  373. mov %al, (%arg3, %r11, 1)
  374. add $1, %r11
  375. shr $8, %rax
  376. sub $1, %r13
  377. jne _less_than_8_bytes_left_\@
  378. _multiple_of_16_bytes_\@:
  379. .endm
  380. # GCM_COMPLETE Finishes update of tag of last partial block
  381. # Output: Authorization Tag (AUTH_TAG)
  382. # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
  383. .macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
  384. movdqu AadHash(%arg2), %xmm8
  385. movdqu HashKey(%arg2), %xmm13
  386. mov PBlockLen(%arg2), %r12
  387. cmp $0, %r12
  388. je _partial_done\@
  389. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  390. _partial_done\@:
  391. mov AadLen(%arg2), %r12 # %r13 = aadLen (number of bytes)
  392. shl $3, %r12 # convert into number of bits
  393. movd %r12d, %xmm15 # len(A) in %xmm15
  394. mov InLen(%arg2), %r12
  395. shl $3, %r12 # len(C) in bits (*128)
  396. MOVQ_R64_XMM %r12, %xmm1
  397. pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
  398. pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
  399. pxor %xmm15, %xmm8
  400. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  401. # final GHASH computation
  402. movdqa SHUF_MASK(%rip), %xmm10
  403. PSHUFB_XMM %xmm10, %xmm8
  404. movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0
  405. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
  406. pxor %xmm8, %xmm0
  407. _return_T_\@:
  408. mov \AUTHTAG, %r10 # %r10 = authTag
  409. mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len
  410. cmp $16, %r11
  411. je _T_16_\@
  412. cmp $8, %r11
  413. jl _T_4_\@
  414. _T_8_\@:
  415. MOVQ_R64_XMM %xmm0, %rax
  416. mov %rax, (%r10)
  417. add $8, %r10
  418. sub $8, %r11
  419. psrldq $8, %xmm0
  420. cmp $0, %r11
  421. je _return_T_done_\@
  422. _T_4_\@:
  423. movd %xmm0, %eax
  424. mov %eax, (%r10)
  425. add $4, %r10
  426. sub $4, %r11
  427. psrldq $4, %xmm0
  428. cmp $0, %r11
  429. je _return_T_done_\@
  430. _T_123_\@:
  431. movd %xmm0, %eax
  432. cmp $2, %r11
  433. jl _T_1_\@
  434. mov %ax, (%r10)
  435. cmp $2, %r11
  436. je _return_T_done_\@
  437. add $2, %r10
  438. sar $16, %eax
  439. _T_1_\@:
  440. mov %al, (%r10)
  441. jmp _return_T_done_\@
  442. _T_16_\@:
  443. movdqu %xmm0, (%r10)
  444. _return_T_done_\@:
  445. .endm
  446. #ifdef __x86_64__
  447. /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
  448. *
  449. *
  450. * Input: A and B (128-bits each, bit-reflected)
  451. * Output: C = A*B*x mod poly, (i.e. >>1 )
  452. * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
  453. * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
  454. *
  455. */
  456. .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
  457. movdqa \GH, \TMP1
  458. pshufd $78, \GH, \TMP2
  459. pshufd $78, \HK, \TMP3
  460. pxor \GH, \TMP2 # TMP2 = a1+a0
  461. pxor \HK, \TMP3 # TMP3 = b1+b0
  462. PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
  463. PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
  464. PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
  465. pxor \GH, \TMP2
  466. pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
  467. movdqa \TMP2, \TMP3
  468. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  469. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  470. pxor \TMP3, \GH
  471. pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK
  472. # first phase of the reduction
  473. movdqa \GH, \TMP2
  474. movdqa \GH, \TMP3
  475. movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
  476. # in in order to perform
  477. # independent shifts
  478. pslld $31, \TMP2 # packed right shift <<31
  479. pslld $30, \TMP3 # packed right shift <<30
  480. pslld $25, \TMP4 # packed right shift <<25
  481. pxor \TMP3, \TMP2 # xor the shifted versions
  482. pxor \TMP4, \TMP2
  483. movdqa \TMP2, \TMP5
  484. psrldq $4, \TMP5 # right shift TMP5 1 DW
  485. pslldq $12, \TMP2 # left shift TMP2 3 DWs
  486. pxor \TMP2, \GH
  487. # second phase of the reduction
  488. movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
  489. # in in order to perform
  490. # independent shifts
  491. movdqa \GH,\TMP3
  492. movdqa \GH,\TMP4
  493. psrld $1,\TMP2 # packed left shift >>1
  494. psrld $2,\TMP3 # packed left shift >>2
  495. psrld $7,\TMP4 # packed left shift >>7
  496. pxor \TMP3,\TMP2 # xor the shifted versions
  497. pxor \TMP4,\TMP2
  498. pxor \TMP5, \TMP2
  499. pxor \TMP2, \GH
  500. pxor \TMP1, \GH # result is in TMP1
  501. .endm
  502. # Reads DLEN bytes starting at DPTR and stores in XMMDst
  503. # where 0 < DLEN < 16
  504. # Clobbers %rax, DLEN and XMM1
  505. .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
  506. cmp $8, \DLEN
  507. jl _read_lt8_\@
  508. mov (\DPTR), %rax
  509. MOVQ_R64_XMM %rax, \XMMDst
  510. sub $8, \DLEN
  511. jz _done_read_partial_block_\@
  512. xor %eax, %eax
  513. _read_next_byte_\@:
  514. shl $8, %rax
  515. mov 7(\DPTR, \DLEN, 1), %al
  516. dec \DLEN
  517. jnz _read_next_byte_\@
  518. MOVQ_R64_XMM %rax, \XMM1
  519. pslldq $8, \XMM1
  520. por \XMM1, \XMMDst
  521. jmp _done_read_partial_block_\@
  522. _read_lt8_\@:
  523. xor %eax, %eax
  524. _read_next_byte_lt8_\@:
  525. shl $8, %rax
  526. mov -1(\DPTR, \DLEN, 1), %al
  527. dec \DLEN
  528. jnz _read_next_byte_lt8_\@
  529. MOVQ_R64_XMM %rax, \XMMDst
  530. _done_read_partial_block_\@:
  531. .endm
  532. # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
  533. # clobbers r10-11, xmm14
  534. .macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
  535. TMP6 TMP7
  536. MOVADQ SHUF_MASK(%rip), %xmm14
  537. mov \AAD, %r10 # %r10 = AAD
  538. mov \AADLEN, %r11 # %r11 = aadLen
  539. pxor \TMP7, \TMP7
  540. pxor \TMP6, \TMP6
  541. cmp $16, %r11
  542. jl _get_AAD_rest\@
  543. _get_AAD_blocks\@:
  544. movdqu (%r10), \TMP7
  545. PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data
  546. pxor \TMP7, \TMP6
  547. GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
  548. add $16, %r10
  549. sub $16, %r11
  550. cmp $16, %r11
  551. jge _get_AAD_blocks\@
  552. movdqu \TMP6, \TMP7
  553. /* read the last <16B of AAD */
  554. _get_AAD_rest\@:
  555. cmp $0, %r11
  556. je _get_AAD_done\@
  557. READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
  558. PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data
  559. pxor \TMP6, \TMP7
  560. GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
  561. movdqu \TMP7, \TMP6
  562. _get_AAD_done\@:
  563. movdqu \TMP6, AadHash(%arg2)
  564. .endm
  565. # PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
  566. # between update calls.
  567. # Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
  568. # Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
  569. # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
  570. .macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
  571. AAD_HASH operation
  572. mov PBlockLen(%arg2), %r13
  573. cmp $0, %r13
  574. je _partial_block_done_\@ # Leave Macro if no partial blocks
  575. # Read in input data without over reading
  576. cmp $16, \PLAIN_CYPH_LEN
  577. jl _fewer_than_16_bytes_\@
  578. movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
  579. jmp _data_read_\@
  580. _fewer_than_16_bytes_\@:
  581. lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
  582. mov \PLAIN_CYPH_LEN, %r12
  583. READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
  584. mov PBlockLen(%arg2), %r13
  585. _data_read_\@: # Finished reading in data
  586. movdqu PBlockEncKey(%arg2), %xmm9
  587. movdqu HashKey(%arg2), %xmm13
  588. lea SHIFT_MASK(%rip), %r12
  589. # adjust the shuffle mask pointer to be able to shift r13 bytes
  590. # r16-r13 is the number of bytes in plaintext mod 16)
  591. add %r13, %r12
  592. movdqu (%r12), %xmm2 # get the appropriate shuffle mask
  593. PSHUFB_XMM %xmm2, %xmm9 # shift right r13 bytes
  594. .ifc \operation, dec
  595. movdqa %xmm1, %xmm3
  596. pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
  597. mov \PLAIN_CYPH_LEN, %r10
  598. add %r13, %r10
  599. # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
  600. sub $16, %r10
  601. # Determine if if partial block is not being filled and
  602. # shift mask accordingly
  603. jge _no_extra_mask_1_\@
  604. sub %r10, %r12
  605. _no_extra_mask_1_\@:
  606. movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
  607. # get the appropriate mask to mask out bottom r13 bytes of xmm9
  608. pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9
  609. pand %xmm1, %xmm3
  610. movdqa SHUF_MASK(%rip), %xmm10
  611. PSHUFB_XMM %xmm10, %xmm3
  612. PSHUFB_XMM %xmm2, %xmm3
  613. pxor %xmm3, \AAD_HASH
  614. cmp $0, %r10
  615. jl _partial_incomplete_1_\@
  616. # GHASH computation for the last <16 Byte block
  617. GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
  618. xor %eax, %eax
  619. mov %rax, PBlockLen(%arg2)
  620. jmp _dec_done_\@
  621. _partial_incomplete_1_\@:
  622. add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
  623. _dec_done_\@:
  624. movdqu \AAD_HASH, AadHash(%arg2)
  625. .else
  626. pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn)
  627. mov \PLAIN_CYPH_LEN, %r10
  628. add %r13, %r10
  629. # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
  630. sub $16, %r10
  631. # Determine if if partial block is not being filled and
  632. # shift mask accordingly
  633. jge _no_extra_mask_2_\@
  634. sub %r10, %r12
  635. _no_extra_mask_2_\@:
  636. movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
  637. # get the appropriate mask to mask out bottom r13 bytes of xmm9
  638. pand %xmm1, %xmm9
  639. movdqa SHUF_MASK(%rip), %xmm1
  640. PSHUFB_XMM %xmm1, %xmm9
  641. PSHUFB_XMM %xmm2, %xmm9
  642. pxor %xmm9, \AAD_HASH
  643. cmp $0, %r10
  644. jl _partial_incomplete_2_\@
  645. # GHASH computation for the last <16 Byte block
  646. GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
  647. xor %eax, %eax
  648. mov %rax, PBlockLen(%arg2)
  649. jmp _encode_done_\@
  650. _partial_incomplete_2_\@:
  651. add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
  652. _encode_done_\@:
  653. movdqu \AAD_HASH, AadHash(%arg2)
  654. movdqa SHUF_MASK(%rip), %xmm10
  655. # shuffle xmm9 back to output as ciphertext
  656. PSHUFB_XMM %xmm10, %xmm9
  657. PSHUFB_XMM %xmm2, %xmm9
  658. .endif
  659. # output encrypted Bytes
  660. cmp $0, %r10
  661. jl _partial_fill_\@
  662. mov %r13, %r12
  663. mov $16, %r13
  664. # Set r13 to be the number of bytes to write out
  665. sub %r12, %r13
  666. jmp _count_set_\@
  667. _partial_fill_\@:
  668. mov \PLAIN_CYPH_LEN, %r13
  669. _count_set_\@:
  670. movdqa %xmm9, %xmm0
  671. MOVQ_R64_XMM %xmm0, %rax
  672. cmp $8, %r13
  673. jle _less_than_8_bytes_left_\@
  674. mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
  675. add $8, \DATA_OFFSET
  676. psrldq $8, %xmm0
  677. MOVQ_R64_XMM %xmm0, %rax
  678. sub $8, %r13
  679. _less_than_8_bytes_left_\@:
  680. movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
  681. add $1, \DATA_OFFSET
  682. shr $8, %rax
  683. sub $1, %r13
  684. jne _less_than_8_bytes_left_\@
  685. _partial_block_done_\@:
  686. .endm # PARTIAL_BLOCK
  687. /*
  688. * if a = number of total plaintext bytes
  689. * b = floor(a/16)
  690. * num_initial_blocks = b mod 4
  691. * encrypt the initial num_initial_blocks blocks and apply ghash on
  692. * the ciphertext
  693. * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
  694. * are clobbered
  695. * arg1, %arg2, %arg3 are used as a pointer only, not modified
  696. */
  697. .macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
  698. XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
  699. MOVADQ SHUF_MASK(%rip), %xmm14
  700. movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0
  701. # start AES for num_initial_blocks blocks
  702. movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0
  703. .if (\i == 5) || (\i == 6) || (\i == 7)
  704. MOVADQ ONE(%RIP),\TMP1
  705. MOVADQ 0(%arg1),\TMP2
  706. .irpc index, \i_seq
  707. paddd \TMP1, \XMM0 # INCR Y0
  708. .ifc \operation, dec
  709. movdqa \XMM0, %xmm\index
  710. .else
  711. MOVADQ \XMM0, %xmm\index
  712. .endif
  713. PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
  714. pxor \TMP2, %xmm\index
  715. .endr
  716. lea 0x10(%arg1),%r10
  717. mov keysize,%eax
  718. shr $2,%eax # 128->4, 192->6, 256->8
  719. add $5,%eax # 128->9, 192->11, 256->13
  720. aes_loop_initial_\@:
  721. MOVADQ (%r10),\TMP1
  722. .irpc index, \i_seq
  723. AESENC \TMP1, %xmm\index
  724. .endr
  725. add $16,%r10
  726. sub $1,%eax
  727. jnz aes_loop_initial_\@
  728. MOVADQ (%r10), \TMP1
  729. .irpc index, \i_seq
  730. AESENCLAST \TMP1, %xmm\index # Last Round
  731. .endr
  732. .irpc index, \i_seq
  733. movdqu (%arg4 , %r11, 1), \TMP1
  734. pxor \TMP1, %xmm\index
  735. movdqu %xmm\index, (%arg3 , %r11, 1)
  736. # write back plaintext/ciphertext for num_initial_blocks
  737. add $16, %r11
  738. .ifc \operation, dec
  739. movdqa \TMP1, %xmm\index
  740. .endif
  741. PSHUFB_XMM %xmm14, %xmm\index
  742. # prepare plaintext/ciphertext for GHASH computation
  743. .endr
  744. .endif
  745. # apply GHASH on num_initial_blocks blocks
  746. .if \i == 5
  747. pxor %xmm5, %xmm6
  748. GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  749. pxor %xmm6, %xmm7
  750. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  751. pxor %xmm7, %xmm8
  752. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  753. .elseif \i == 6
  754. pxor %xmm6, %xmm7
  755. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  756. pxor %xmm7, %xmm8
  757. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  758. .elseif \i == 7
  759. pxor %xmm7, %xmm8
  760. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  761. .endif
  762. cmp $64, %r13
  763. jl _initial_blocks_done\@
  764. # no need for precomputed values
  765. /*
  766. *
  767. * Precomputations for HashKey parallel with encryption of first 4 blocks.
  768. * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
  769. */
  770. MOVADQ ONE(%RIP),\TMP1
  771. paddd \TMP1, \XMM0 # INCR Y0
  772. MOVADQ \XMM0, \XMM1
  773. PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
  774. paddd \TMP1, \XMM0 # INCR Y0
  775. MOVADQ \XMM0, \XMM2
  776. PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
  777. paddd \TMP1, \XMM0 # INCR Y0
  778. MOVADQ \XMM0, \XMM3
  779. PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
  780. paddd \TMP1, \XMM0 # INCR Y0
  781. MOVADQ \XMM0, \XMM4
  782. PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
  783. MOVADQ 0(%arg1),\TMP1
  784. pxor \TMP1, \XMM1
  785. pxor \TMP1, \XMM2
  786. pxor \TMP1, \XMM3
  787. pxor \TMP1, \XMM4
  788. .irpc index, 1234 # do 4 rounds
  789. movaps 0x10*\index(%arg1), \TMP1
  790. AESENC \TMP1, \XMM1
  791. AESENC \TMP1, \XMM2
  792. AESENC \TMP1, \XMM3
  793. AESENC \TMP1, \XMM4
  794. .endr
  795. .irpc index, 56789 # do next 5 rounds
  796. movaps 0x10*\index(%arg1), \TMP1
  797. AESENC \TMP1, \XMM1
  798. AESENC \TMP1, \XMM2
  799. AESENC \TMP1, \XMM3
  800. AESENC \TMP1, \XMM4
  801. .endr
  802. lea 0xa0(%arg1),%r10
  803. mov keysize,%eax
  804. shr $2,%eax # 128->4, 192->6, 256->8
  805. sub $4,%eax # 128->0, 192->2, 256->4
  806. jz aes_loop_pre_done\@
  807. aes_loop_pre_\@:
  808. MOVADQ (%r10),\TMP2
  809. .irpc index, 1234
  810. AESENC \TMP2, %xmm\index
  811. .endr
  812. add $16,%r10
  813. sub $1,%eax
  814. jnz aes_loop_pre_\@
  815. aes_loop_pre_done\@:
  816. MOVADQ (%r10), \TMP2
  817. AESENCLAST \TMP2, \XMM1
  818. AESENCLAST \TMP2, \XMM2
  819. AESENCLAST \TMP2, \XMM3
  820. AESENCLAST \TMP2, \XMM4
  821. movdqu 16*0(%arg4 , %r11 , 1), \TMP1
  822. pxor \TMP1, \XMM1
  823. .ifc \operation, dec
  824. movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
  825. movdqa \TMP1, \XMM1
  826. .endif
  827. movdqu 16*1(%arg4 , %r11 , 1), \TMP1
  828. pxor \TMP1, \XMM2
  829. .ifc \operation, dec
  830. movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
  831. movdqa \TMP1, \XMM2
  832. .endif
  833. movdqu 16*2(%arg4 , %r11 , 1), \TMP1
  834. pxor \TMP1, \XMM3
  835. .ifc \operation, dec
  836. movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
  837. movdqa \TMP1, \XMM3
  838. .endif
  839. movdqu 16*3(%arg4 , %r11 , 1), \TMP1
  840. pxor \TMP1, \XMM4
  841. .ifc \operation, dec
  842. movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
  843. movdqa \TMP1, \XMM4
  844. .else
  845. movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
  846. movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
  847. movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
  848. movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
  849. .endif
  850. add $64, %r11
  851. PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
  852. pxor \XMMDst, \XMM1
  853. # combine GHASHed value with the corresponding ciphertext
  854. PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
  855. PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
  856. PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
  857. _initial_blocks_done\@:
  858. .endm
  859. /*
  860. * encrypt 4 blocks at a time
  861. * ghash the 4 previously encrypted ciphertext blocks
  862. * arg1, %arg3, %arg4 are used as pointers only, not modified
  863. * %r11 is the data offset value
  864. */
  865. .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
  866. TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
  867. movdqa \XMM1, \XMM5
  868. movdqa \XMM2, \XMM6
  869. movdqa \XMM3, \XMM7
  870. movdqa \XMM4, \XMM8
  871. movdqa SHUF_MASK(%rip), %xmm15
  872. # multiply TMP5 * HashKey using karatsuba
  873. movdqa \XMM5, \TMP4
  874. pshufd $78, \XMM5, \TMP6
  875. pxor \XMM5, \TMP6
  876. paddd ONE(%rip), \XMM0 # INCR CNT
  877. movdqu HashKey_4(%arg2), \TMP5
  878. PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
  879. movdqa \XMM0, \XMM1
  880. paddd ONE(%rip), \XMM0 # INCR CNT
  881. movdqa \XMM0, \XMM2
  882. paddd ONE(%rip), \XMM0 # INCR CNT
  883. movdqa \XMM0, \XMM3
  884. paddd ONE(%rip), \XMM0 # INCR CNT
  885. movdqa \XMM0, \XMM4
  886. PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
  887. PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
  888. PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
  889. PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
  890. PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
  891. pxor (%arg1), \XMM1
  892. pxor (%arg1), \XMM2
  893. pxor (%arg1), \XMM3
  894. pxor (%arg1), \XMM4
  895. movdqu HashKey_4_k(%arg2), \TMP5
  896. PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
  897. movaps 0x10(%arg1), \TMP1
  898. AESENC \TMP1, \XMM1 # Round 1
  899. AESENC \TMP1, \XMM2
  900. AESENC \TMP1, \XMM3
  901. AESENC \TMP1, \XMM4
  902. movaps 0x20(%arg1), \TMP1
  903. AESENC \TMP1, \XMM1 # Round 2
  904. AESENC \TMP1, \XMM2
  905. AESENC \TMP1, \XMM3
  906. AESENC \TMP1, \XMM4
  907. movdqa \XMM6, \TMP1
  908. pshufd $78, \XMM6, \TMP2
  909. pxor \XMM6, \TMP2
  910. movdqu HashKey_3(%arg2), \TMP5
  911. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
  912. movaps 0x30(%arg1), \TMP3
  913. AESENC \TMP3, \XMM1 # Round 3
  914. AESENC \TMP3, \XMM2
  915. AESENC \TMP3, \XMM3
  916. AESENC \TMP3, \XMM4
  917. PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
  918. movaps 0x40(%arg1), \TMP3
  919. AESENC \TMP3, \XMM1 # Round 4
  920. AESENC \TMP3, \XMM2
  921. AESENC \TMP3, \XMM3
  922. AESENC \TMP3, \XMM4
  923. movdqu HashKey_3_k(%arg2), \TMP5
  924. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  925. movaps 0x50(%arg1), \TMP3
  926. AESENC \TMP3, \XMM1 # Round 5
  927. AESENC \TMP3, \XMM2
  928. AESENC \TMP3, \XMM3
  929. AESENC \TMP3, \XMM4
  930. pxor \TMP1, \TMP4
  931. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  932. pxor \XMM6, \XMM5
  933. pxor \TMP2, \TMP6
  934. movdqa \XMM7, \TMP1
  935. pshufd $78, \XMM7, \TMP2
  936. pxor \XMM7, \TMP2
  937. movdqu HashKey_2(%arg2), \TMP5
  938. # Multiply TMP5 * HashKey using karatsuba
  939. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  940. movaps 0x60(%arg1), \TMP3
  941. AESENC \TMP3, \XMM1 # Round 6
  942. AESENC \TMP3, \XMM2
  943. AESENC \TMP3, \XMM3
  944. AESENC \TMP3, \XMM4
  945. PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
  946. movaps 0x70(%arg1), \TMP3
  947. AESENC \TMP3, \XMM1 # Round 7
  948. AESENC \TMP3, \XMM2
  949. AESENC \TMP3, \XMM3
  950. AESENC \TMP3, \XMM4
  951. movdqu HashKey_2_k(%arg2), \TMP5
  952. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  953. movaps 0x80(%arg1), \TMP3
  954. AESENC \TMP3, \XMM1 # Round 8
  955. AESENC \TMP3, \XMM2
  956. AESENC \TMP3, \XMM3
  957. AESENC \TMP3, \XMM4
  958. pxor \TMP1, \TMP4
  959. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  960. pxor \XMM7, \XMM5
  961. pxor \TMP2, \TMP6
  962. # Multiply XMM8 * HashKey
  963. # XMM8 and TMP5 hold the values for the two operands
  964. movdqa \XMM8, \TMP1
  965. pshufd $78, \XMM8, \TMP2
  966. pxor \XMM8, \TMP2
  967. movdqu HashKey(%arg2), \TMP5
  968. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  969. movaps 0x90(%arg1), \TMP3
  970. AESENC \TMP3, \XMM1 # Round 9
  971. AESENC \TMP3, \XMM2
  972. AESENC \TMP3, \XMM3
  973. AESENC \TMP3, \XMM4
  974. PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
  975. lea 0xa0(%arg1),%r10
  976. mov keysize,%eax
  977. shr $2,%eax # 128->4, 192->6, 256->8
  978. sub $4,%eax # 128->0, 192->2, 256->4
  979. jz aes_loop_par_enc_done\@
  980. aes_loop_par_enc\@:
  981. MOVADQ (%r10),\TMP3
  982. .irpc index, 1234
  983. AESENC \TMP3, %xmm\index
  984. .endr
  985. add $16,%r10
  986. sub $1,%eax
  987. jnz aes_loop_par_enc\@
  988. aes_loop_par_enc_done\@:
  989. MOVADQ (%r10), \TMP3
  990. AESENCLAST \TMP3, \XMM1 # Round 10
  991. AESENCLAST \TMP3, \XMM2
  992. AESENCLAST \TMP3, \XMM3
  993. AESENCLAST \TMP3, \XMM4
  994. movdqu HashKey_k(%arg2), \TMP5
  995. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  996. movdqu (%arg4,%r11,1), \TMP3
  997. pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
  998. movdqu 16(%arg4,%r11,1), \TMP3
  999. pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
  1000. movdqu 32(%arg4,%r11,1), \TMP3
  1001. pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
  1002. movdqu 48(%arg4,%r11,1), \TMP3
  1003. pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
  1004. movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer
  1005. movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer
  1006. movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer
  1007. movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer
  1008. PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
  1009. PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
  1010. PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
  1011. PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
  1012. pxor \TMP4, \TMP1
  1013. pxor \XMM8, \XMM5
  1014. pxor \TMP6, \TMP2
  1015. pxor \TMP1, \TMP2
  1016. pxor \XMM5, \TMP2
  1017. movdqa \TMP2, \TMP3
  1018. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  1019. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  1020. pxor \TMP3, \XMM5
  1021. pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
  1022. # first phase of reduction
  1023. movdqa \XMM5, \TMP2
  1024. movdqa \XMM5, \TMP3
  1025. movdqa \XMM5, \TMP4
  1026. # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
  1027. pslld $31, \TMP2 # packed right shift << 31
  1028. pslld $30, \TMP3 # packed right shift << 30
  1029. pslld $25, \TMP4 # packed right shift << 25
  1030. pxor \TMP3, \TMP2 # xor the shifted versions
  1031. pxor \TMP4, \TMP2
  1032. movdqa \TMP2, \TMP5
  1033. psrldq $4, \TMP5 # right shift T5 1 DW
  1034. pslldq $12, \TMP2 # left shift T2 3 DWs
  1035. pxor \TMP2, \XMM5
  1036. # second phase of reduction
  1037. movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
  1038. movdqa \XMM5,\TMP3
  1039. movdqa \XMM5,\TMP4
  1040. psrld $1, \TMP2 # packed left shift >>1
  1041. psrld $2, \TMP3 # packed left shift >>2
  1042. psrld $7, \TMP4 # packed left shift >>7
  1043. pxor \TMP3,\TMP2 # xor the shifted versions
  1044. pxor \TMP4,\TMP2
  1045. pxor \TMP5, \TMP2
  1046. pxor \TMP2, \XMM5
  1047. pxor \TMP1, \XMM5 # result is in TMP1
  1048. pxor \XMM5, \XMM1
  1049. .endm
  1050. /*
  1051. * decrypt 4 blocks at a time
  1052. * ghash the 4 previously decrypted ciphertext blocks
  1053. * arg1, %arg3, %arg4 are used as pointers only, not modified
  1054. * %r11 is the data offset value
  1055. */
  1056. .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
  1057. TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
  1058. movdqa \XMM1, \XMM5
  1059. movdqa \XMM2, \XMM6
  1060. movdqa \XMM3, \XMM7
  1061. movdqa \XMM4, \XMM8
  1062. movdqa SHUF_MASK(%rip), %xmm15
  1063. # multiply TMP5 * HashKey using karatsuba
  1064. movdqa \XMM5, \TMP4
  1065. pshufd $78, \XMM5, \TMP6
  1066. pxor \XMM5, \TMP6
  1067. paddd ONE(%rip), \XMM0 # INCR CNT
  1068. movdqu HashKey_4(%arg2), \TMP5
  1069. PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
  1070. movdqa \XMM0, \XMM1
  1071. paddd ONE(%rip), \XMM0 # INCR CNT
  1072. movdqa \XMM0, \XMM2
  1073. paddd ONE(%rip), \XMM0 # INCR CNT
  1074. movdqa \XMM0, \XMM3
  1075. paddd ONE(%rip), \XMM0 # INCR CNT
  1076. movdqa \XMM0, \XMM4
  1077. PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
  1078. PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
  1079. PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
  1080. PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
  1081. PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
  1082. pxor (%arg1), \XMM1
  1083. pxor (%arg1), \XMM2
  1084. pxor (%arg1), \XMM3
  1085. pxor (%arg1), \XMM4
  1086. movdqu HashKey_4_k(%arg2), \TMP5
  1087. PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
  1088. movaps 0x10(%arg1), \TMP1
  1089. AESENC \TMP1, \XMM1 # Round 1
  1090. AESENC \TMP1, \XMM2
  1091. AESENC \TMP1, \XMM3
  1092. AESENC \TMP1, \XMM4
  1093. movaps 0x20(%arg1), \TMP1
  1094. AESENC \TMP1, \XMM1 # Round 2
  1095. AESENC \TMP1, \XMM2
  1096. AESENC \TMP1, \XMM3
  1097. AESENC \TMP1, \XMM4
  1098. movdqa \XMM6, \TMP1
  1099. pshufd $78, \XMM6, \TMP2
  1100. pxor \XMM6, \TMP2
  1101. movdqu HashKey_3(%arg2), \TMP5
  1102. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
  1103. movaps 0x30(%arg1), \TMP3
  1104. AESENC \TMP3, \XMM1 # Round 3
  1105. AESENC \TMP3, \XMM2
  1106. AESENC \TMP3, \XMM3
  1107. AESENC \TMP3, \XMM4
  1108. PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
  1109. movaps 0x40(%arg1), \TMP3
  1110. AESENC \TMP3, \XMM1 # Round 4
  1111. AESENC \TMP3, \XMM2
  1112. AESENC \TMP3, \XMM3
  1113. AESENC \TMP3, \XMM4
  1114. movdqu HashKey_3_k(%arg2), \TMP5
  1115. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  1116. movaps 0x50(%arg1), \TMP3
  1117. AESENC \TMP3, \XMM1 # Round 5
  1118. AESENC \TMP3, \XMM2
  1119. AESENC \TMP3, \XMM3
  1120. AESENC \TMP3, \XMM4
  1121. pxor \TMP1, \TMP4
  1122. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  1123. pxor \XMM6, \XMM5
  1124. pxor \TMP2, \TMP6
  1125. movdqa \XMM7, \TMP1
  1126. pshufd $78, \XMM7, \TMP2
  1127. pxor \XMM7, \TMP2
  1128. movdqu HashKey_2(%arg2), \TMP5
  1129. # Multiply TMP5 * HashKey using karatsuba
  1130. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  1131. movaps 0x60(%arg1), \TMP3
  1132. AESENC \TMP3, \XMM1 # Round 6
  1133. AESENC \TMP3, \XMM2
  1134. AESENC \TMP3, \XMM3
  1135. AESENC \TMP3, \XMM4
  1136. PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
  1137. movaps 0x70(%arg1), \TMP3
  1138. AESENC \TMP3, \XMM1 # Round 7
  1139. AESENC \TMP3, \XMM2
  1140. AESENC \TMP3, \XMM3
  1141. AESENC \TMP3, \XMM4
  1142. movdqu HashKey_2_k(%arg2), \TMP5
  1143. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  1144. movaps 0x80(%arg1), \TMP3
  1145. AESENC \TMP3, \XMM1 # Round 8
  1146. AESENC \TMP3, \XMM2
  1147. AESENC \TMP3, \XMM3
  1148. AESENC \TMP3, \XMM4
  1149. pxor \TMP1, \TMP4
  1150. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  1151. pxor \XMM7, \XMM5
  1152. pxor \TMP2, \TMP6
  1153. # Multiply XMM8 * HashKey
  1154. # XMM8 and TMP5 hold the values for the two operands
  1155. movdqa \XMM8, \TMP1
  1156. pshufd $78, \XMM8, \TMP2
  1157. pxor \XMM8, \TMP2
  1158. movdqu HashKey(%arg2), \TMP5
  1159. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  1160. movaps 0x90(%arg1), \TMP3
  1161. AESENC \TMP3, \XMM1 # Round 9
  1162. AESENC \TMP3, \XMM2
  1163. AESENC \TMP3, \XMM3
  1164. AESENC \TMP3, \XMM4
  1165. PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
  1166. lea 0xa0(%arg1),%r10
  1167. mov keysize,%eax
  1168. shr $2,%eax # 128->4, 192->6, 256->8
  1169. sub $4,%eax # 128->0, 192->2, 256->4
  1170. jz aes_loop_par_dec_done\@
  1171. aes_loop_par_dec\@:
  1172. MOVADQ (%r10),\TMP3
  1173. .irpc index, 1234
  1174. AESENC \TMP3, %xmm\index
  1175. .endr
  1176. add $16,%r10
  1177. sub $1,%eax
  1178. jnz aes_loop_par_dec\@
  1179. aes_loop_par_dec_done\@:
  1180. MOVADQ (%r10), \TMP3
  1181. AESENCLAST \TMP3, \XMM1 # last round
  1182. AESENCLAST \TMP3, \XMM2
  1183. AESENCLAST \TMP3, \XMM3
  1184. AESENCLAST \TMP3, \XMM4
  1185. movdqu HashKey_k(%arg2), \TMP5
  1186. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  1187. movdqu (%arg4,%r11,1), \TMP3
  1188. pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
  1189. movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer
  1190. movdqa \TMP3, \XMM1
  1191. movdqu 16(%arg4,%r11,1), \TMP3
  1192. pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
  1193. movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer
  1194. movdqa \TMP3, \XMM2
  1195. movdqu 32(%arg4,%r11,1), \TMP3
  1196. pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
  1197. movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer
  1198. movdqa \TMP3, \XMM3
  1199. movdqu 48(%arg4,%r11,1), \TMP3
  1200. pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
  1201. movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer
  1202. movdqa \TMP3, \XMM4
  1203. PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
  1204. PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
  1205. PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
  1206. PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
  1207. pxor \TMP4, \TMP1
  1208. pxor \XMM8, \XMM5
  1209. pxor \TMP6, \TMP2
  1210. pxor \TMP1, \TMP2
  1211. pxor \XMM5, \TMP2
  1212. movdqa \TMP2, \TMP3
  1213. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  1214. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  1215. pxor \TMP3, \XMM5
  1216. pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
  1217. # first phase of reduction
  1218. movdqa \XMM5, \TMP2
  1219. movdqa \XMM5, \TMP3
  1220. movdqa \XMM5, \TMP4
  1221. # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
  1222. pslld $31, \TMP2 # packed right shift << 31
  1223. pslld $30, \TMP3 # packed right shift << 30
  1224. pslld $25, \TMP4 # packed right shift << 25
  1225. pxor \TMP3, \TMP2 # xor the shifted versions
  1226. pxor \TMP4, \TMP2
  1227. movdqa \TMP2, \TMP5
  1228. psrldq $4, \TMP5 # right shift T5 1 DW
  1229. pslldq $12, \TMP2 # left shift T2 3 DWs
  1230. pxor \TMP2, \XMM5
  1231. # second phase of reduction
  1232. movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
  1233. movdqa \XMM5,\TMP3
  1234. movdqa \XMM5,\TMP4
  1235. psrld $1, \TMP2 # packed left shift >>1
  1236. psrld $2, \TMP3 # packed left shift >>2
  1237. psrld $7, \TMP4 # packed left shift >>7
  1238. pxor \TMP3,\TMP2 # xor the shifted versions
  1239. pxor \TMP4,\TMP2
  1240. pxor \TMP5, \TMP2
  1241. pxor \TMP2, \XMM5
  1242. pxor \TMP1, \XMM5 # result is in TMP1
  1243. pxor \XMM5, \XMM1
  1244. .endm
  1245. /* GHASH the last 4 ciphertext blocks. */
  1246. .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
  1247. TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
  1248. # Multiply TMP6 * HashKey (using Karatsuba)
  1249. movdqa \XMM1, \TMP6
  1250. pshufd $78, \XMM1, \TMP2
  1251. pxor \XMM1, \TMP2
  1252. movdqu HashKey_4(%arg2), \TMP5
  1253. PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
  1254. PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
  1255. movdqu HashKey_4_k(%arg2), \TMP4
  1256. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  1257. movdqa \XMM1, \XMMDst
  1258. movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
  1259. # Multiply TMP1 * HashKey (using Karatsuba)
  1260. movdqa \XMM2, \TMP1
  1261. pshufd $78, \XMM2, \TMP2
  1262. pxor \XMM2, \TMP2
  1263. movdqu HashKey_3(%arg2), \TMP5
  1264. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  1265. PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
  1266. movdqu HashKey_3_k(%arg2), \TMP4
  1267. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  1268. pxor \TMP1, \TMP6
  1269. pxor \XMM2, \XMMDst
  1270. pxor \TMP2, \XMM1
  1271. # results accumulated in TMP6, XMMDst, XMM1
  1272. # Multiply TMP1 * HashKey (using Karatsuba)
  1273. movdqa \XMM3, \TMP1
  1274. pshufd $78, \XMM3, \TMP2
  1275. pxor \XMM3, \TMP2
  1276. movdqu HashKey_2(%arg2), \TMP5
  1277. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  1278. PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
  1279. movdqu HashKey_2_k(%arg2), \TMP4
  1280. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  1281. pxor \TMP1, \TMP6
  1282. pxor \XMM3, \XMMDst
  1283. pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
  1284. # Multiply TMP1 * HashKey (using Karatsuba)
  1285. movdqa \XMM4, \TMP1
  1286. pshufd $78, \XMM4, \TMP2
  1287. pxor \XMM4, \TMP2
  1288. movdqu HashKey(%arg2), \TMP5
  1289. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  1290. PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
  1291. movdqu HashKey_k(%arg2), \TMP4
  1292. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  1293. pxor \TMP1, \TMP6
  1294. pxor \XMM4, \XMMDst
  1295. pxor \XMM1, \TMP2
  1296. pxor \TMP6, \TMP2
  1297. pxor \XMMDst, \TMP2
  1298. # middle section of the temp results combined as in karatsuba algorithm
  1299. movdqa \TMP2, \TMP4
  1300. pslldq $8, \TMP4 # left shift TMP4 2 DWs
  1301. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  1302. pxor \TMP4, \XMMDst
  1303. pxor \TMP2, \TMP6
  1304. # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
  1305. # first phase of the reduction
  1306. movdqa \XMMDst, \TMP2
  1307. movdqa \XMMDst, \TMP3
  1308. movdqa \XMMDst, \TMP4
  1309. # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
  1310. pslld $31, \TMP2 # packed right shifting << 31
  1311. pslld $30, \TMP3 # packed right shifting << 30
  1312. pslld $25, \TMP4 # packed right shifting << 25
  1313. pxor \TMP3, \TMP2 # xor the shifted versions
  1314. pxor \TMP4, \TMP2
  1315. movdqa \TMP2, \TMP7
  1316. psrldq $4, \TMP7 # right shift TMP7 1 DW
  1317. pslldq $12, \TMP2 # left shift TMP2 3 DWs
  1318. pxor \TMP2, \XMMDst
  1319. # second phase of the reduction
  1320. movdqa \XMMDst, \TMP2
  1321. # make 3 copies of XMMDst for doing 3 shift operations
  1322. movdqa \XMMDst, \TMP3
  1323. movdqa \XMMDst, \TMP4
  1324. psrld $1, \TMP2 # packed left shift >> 1
  1325. psrld $2, \TMP3 # packed left shift >> 2
  1326. psrld $7, \TMP4 # packed left shift >> 7
  1327. pxor \TMP3, \TMP2 # xor the shifted versions
  1328. pxor \TMP4, \TMP2
  1329. pxor \TMP7, \TMP2
  1330. pxor \TMP2, \XMMDst
  1331. pxor \TMP6, \XMMDst # reduced result is in XMMDst
  1332. .endm
  1333. /* Encryption of a single block
  1334. * uses eax & r10
  1335. */
  1336. .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
  1337. pxor (%arg1), \XMM0
  1338. mov keysize,%eax
  1339. shr $2,%eax # 128->4, 192->6, 256->8
  1340. add $5,%eax # 128->9, 192->11, 256->13
  1341. lea 16(%arg1), %r10 # get first expanded key address
  1342. _esb_loop_\@:
  1343. MOVADQ (%r10),\TMP1
  1344. AESENC \TMP1,\XMM0
  1345. add $16,%r10
  1346. sub $1,%eax
  1347. jnz _esb_loop_\@
  1348. MOVADQ (%r10),\TMP1
  1349. AESENCLAST \TMP1,\XMM0
  1350. .endm
  1351. /*****************************************************************************
  1352. * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  1353. * struct gcm_context_data *data
  1354. * // Context data
  1355. * u8 *out, // Plaintext output. Encrypt in-place is allowed.
  1356. * const u8 *in, // Ciphertext input
  1357. * u64 plaintext_len, // Length of data in bytes for decryption.
  1358. * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
  1359. * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
  1360. * // concatenated with 0x00000001. 16-byte aligned pointer.
  1361. * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
  1362. * const u8 *aad, // Additional Authentication Data (AAD)
  1363. * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
  1364. * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
  1365. * // given authentication tag and only return the plaintext if they match.
  1366. * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
  1367. * // (most likely), 12 or 8.
  1368. *
  1369. * Assumptions:
  1370. *
  1371. * keys:
  1372. * keys are pre-expanded and aligned to 16 bytes. we are using the first
  1373. * set of 11 keys in the data structure void *aes_ctx
  1374. *
  1375. * iv:
  1376. * 0 1 2 3
  1377. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1378. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1379. * | Salt (From the SA) |
  1380. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1381. * | Initialization Vector |
  1382. * | (This is the sequence number from IPSec header) |
  1383. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1384. * | 0x1 |
  1385. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1386. *
  1387. *
  1388. *
  1389. * AAD:
  1390. * AAD padded to 128 bits with 0
  1391. * for example, assume AAD is a u32 vector
  1392. *
  1393. * if AAD is 8 bytes:
  1394. * AAD[3] = {A0, A1};
  1395. * padded AAD in xmm register = {A1 A0 0 0}
  1396. *
  1397. * 0 1 2 3
  1398. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1399. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1400. * | SPI (A1) |
  1401. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1402. * | 32-bit Sequence Number (A0) |
  1403. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1404. * | 0x0 |
  1405. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1406. *
  1407. * AAD Format with 32-bit Sequence Number
  1408. *
  1409. * if AAD is 12 bytes:
  1410. * AAD[3] = {A0, A1, A2};
  1411. * padded AAD in xmm register = {A2 A1 A0 0}
  1412. *
  1413. * 0 1 2 3
  1414. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1415. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1416. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1417. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1418. * | SPI (A2) |
  1419. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1420. * | 64-bit Extended Sequence Number {A1,A0} |
  1421. * | |
  1422. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1423. * | 0x0 |
  1424. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1425. *
  1426. * AAD Format with 64-bit Extended Sequence Number
  1427. *
  1428. * poly = x^128 + x^127 + x^126 + x^121 + 1
  1429. *
  1430. *****************************************************************************/
  1431. ENTRY(aesni_gcm_dec)
  1432. FUNC_SAVE
  1433. GCM_INIT %arg6, arg7, arg8, arg9
  1434. GCM_ENC_DEC dec
  1435. GCM_COMPLETE arg10, arg11
  1436. FUNC_RESTORE
  1437. ret
  1438. ENDPROC(aesni_gcm_dec)
  1439. /*****************************************************************************
  1440. * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  1441. * struct gcm_context_data *data
  1442. * // Context data
  1443. * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
  1444. * const u8 *in, // Plaintext input
  1445. * u64 plaintext_len, // Length of data in bytes for encryption.
  1446. * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
  1447. * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
  1448. * // concatenated with 0x00000001. 16-byte aligned pointer.
  1449. * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
  1450. * const u8 *aad, // Additional Authentication Data (AAD)
  1451. * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
  1452. * u8 *auth_tag, // Authenticated Tag output.
  1453. * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
  1454. * // 12 or 8.
  1455. *
  1456. * Assumptions:
  1457. *
  1458. * keys:
  1459. * keys are pre-expanded and aligned to 16 bytes. we are using the
  1460. * first set of 11 keys in the data structure void *aes_ctx
  1461. *
  1462. *
  1463. * iv:
  1464. * 0 1 2 3
  1465. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1466. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1467. * | Salt (From the SA) |
  1468. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1469. * | Initialization Vector |
  1470. * | (This is the sequence number from IPSec header) |
  1471. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1472. * | 0x1 |
  1473. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1474. *
  1475. *
  1476. *
  1477. * AAD:
  1478. * AAD padded to 128 bits with 0
  1479. * for example, assume AAD is a u32 vector
  1480. *
  1481. * if AAD is 8 bytes:
  1482. * AAD[3] = {A0, A1};
  1483. * padded AAD in xmm register = {A1 A0 0 0}
  1484. *
  1485. * 0 1 2 3
  1486. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1487. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1488. * | SPI (A1) |
  1489. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1490. * | 32-bit Sequence Number (A0) |
  1491. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1492. * | 0x0 |
  1493. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1494. *
  1495. * AAD Format with 32-bit Sequence Number
  1496. *
  1497. * if AAD is 12 bytes:
  1498. * AAD[3] = {A0, A1, A2};
  1499. * padded AAD in xmm register = {A2 A1 A0 0}
  1500. *
  1501. * 0 1 2 3
  1502. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1503. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1504. * | SPI (A2) |
  1505. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1506. * | 64-bit Extended Sequence Number {A1,A0} |
  1507. * | |
  1508. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1509. * | 0x0 |
  1510. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1511. *
  1512. * AAD Format with 64-bit Extended Sequence Number
  1513. *
  1514. * poly = x^128 + x^127 + x^126 + x^121 + 1
  1515. ***************************************************************************/
  1516. ENTRY(aesni_gcm_enc)
  1517. FUNC_SAVE
  1518. GCM_INIT %arg6, arg7, arg8, arg9
  1519. GCM_ENC_DEC enc
  1520. GCM_COMPLETE arg10, arg11
  1521. FUNC_RESTORE
  1522. ret
  1523. ENDPROC(aesni_gcm_enc)
  1524. /*****************************************************************************
  1525. * void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  1526. * struct gcm_context_data *data,
  1527. * // context data
  1528. * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
  1529. * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
  1530. * // concatenated with 0x00000001. 16-byte aligned pointer.
  1531. * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
  1532. * const u8 *aad, // Additional Authentication Data (AAD)
  1533. * u64 aad_len) // Length of AAD in bytes.
  1534. */
  1535. ENTRY(aesni_gcm_init)
  1536. FUNC_SAVE
  1537. GCM_INIT %arg3, %arg4,%arg5, %arg6
  1538. FUNC_RESTORE
  1539. ret
  1540. ENDPROC(aesni_gcm_init)
  1541. /*****************************************************************************
  1542. * void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  1543. * struct gcm_context_data *data,
  1544. * // context data
  1545. * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
  1546. * const u8 *in, // Plaintext input
  1547. * u64 plaintext_len, // Length of data in bytes for encryption.
  1548. */
  1549. ENTRY(aesni_gcm_enc_update)
  1550. FUNC_SAVE
  1551. GCM_ENC_DEC enc
  1552. FUNC_RESTORE
  1553. ret
  1554. ENDPROC(aesni_gcm_enc_update)
  1555. /*****************************************************************************
  1556. * void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  1557. * struct gcm_context_data *data,
  1558. * // context data
  1559. * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
  1560. * const u8 *in, // Plaintext input
  1561. * u64 plaintext_len, // Length of data in bytes for encryption.
  1562. */
  1563. ENTRY(aesni_gcm_dec_update)
  1564. FUNC_SAVE
  1565. GCM_ENC_DEC dec
  1566. FUNC_RESTORE
  1567. ret
  1568. ENDPROC(aesni_gcm_dec_update)
  1569. /*****************************************************************************
  1570. * void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  1571. * struct gcm_context_data *data,
  1572. * // context data
  1573. * u8 *auth_tag, // Authenticated Tag output.
  1574. * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
  1575. * // 12 or 8.
  1576. */
  1577. ENTRY(aesni_gcm_finalize)
  1578. FUNC_SAVE
  1579. GCM_COMPLETE %arg3 %arg4
  1580. FUNC_RESTORE
  1581. ret
  1582. ENDPROC(aesni_gcm_finalize)
  1583. #endif
  1584. .align 4
  1585. _key_expansion_128:
  1586. _key_expansion_256a:
  1587. pshufd $0b11111111, %xmm1, %xmm1
  1588. shufps $0b00010000, %xmm0, %xmm4
  1589. pxor %xmm4, %xmm0
  1590. shufps $0b10001100, %xmm0, %xmm4
  1591. pxor %xmm4, %xmm0
  1592. pxor %xmm1, %xmm0
  1593. movaps %xmm0, (TKEYP)
  1594. add $0x10, TKEYP
  1595. ret
  1596. ENDPROC(_key_expansion_128)
  1597. ENDPROC(_key_expansion_256a)
  1598. .align 4
  1599. _key_expansion_192a:
  1600. pshufd $0b01010101, %xmm1, %xmm1
  1601. shufps $0b00010000, %xmm0, %xmm4
  1602. pxor %xmm4, %xmm0
  1603. shufps $0b10001100, %xmm0, %xmm4
  1604. pxor %xmm4, %xmm0
  1605. pxor %xmm1, %xmm0
  1606. movaps %xmm2, %xmm5
  1607. movaps %xmm2, %xmm6
  1608. pslldq $4, %xmm5
  1609. pshufd $0b11111111, %xmm0, %xmm3
  1610. pxor %xmm3, %xmm2
  1611. pxor %xmm5, %xmm2
  1612. movaps %xmm0, %xmm1
  1613. shufps $0b01000100, %xmm0, %xmm6
  1614. movaps %xmm6, (TKEYP)
  1615. shufps $0b01001110, %xmm2, %xmm1
  1616. movaps %xmm1, 0x10(TKEYP)
  1617. add $0x20, TKEYP
  1618. ret
  1619. ENDPROC(_key_expansion_192a)
  1620. .align 4
  1621. _key_expansion_192b:
  1622. pshufd $0b01010101, %xmm1, %xmm1
  1623. shufps $0b00010000, %xmm0, %xmm4
  1624. pxor %xmm4, %xmm0
  1625. shufps $0b10001100, %xmm0, %xmm4
  1626. pxor %xmm4, %xmm0
  1627. pxor %xmm1, %xmm0
  1628. movaps %xmm2, %xmm5
  1629. pslldq $4, %xmm5
  1630. pshufd $0b11111111, %xmm0, %xmm3
  1631. pxor %xmm3, %xmm2
  1632. pxor %xmm5, %xmm2
  1633. movaps %xmm0, (TKEYP)
  1634. add $0x10, TKEYP
  1635. ret
  1636. ENDPROC(_key_expansion_192b)
  1637. .align 4
  1638. _key_expansion_256b:
  1639. pshufd $0b10101010, %xmm1, %xmm1
  1640. shufps $0b00010000, %xmm2, %xmm4
  1641. pxor %xmm4, %xmm2
  1642. shufps $0b10001100, %xmm2, %xmm4
  1643. pxor %xmm4, %xmm2
  1644. pxor %xmm1, %xmm2
  1645. movaps %xmm2, (TKEYP)
  1646. add $0x10, TKEYP
  1647. ret
  1648. ENDPROC(_key_expansion_256b)
  1649. /*
  1650. * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
  1651. * unsigned int key_len)
  1652. */
  1653. ENTRY(aesni_set_key)
  1654. FRAME_BEGIN
  1655. #ifndef __x86_64__
  1656. pushl KEYP
  1657. movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
  1658. movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
  1659. movl (FRAME_OFFSET+16)(%esp), %edx # key_len
  1660. #endif
  1661. movups (UKEYP), %xmm0 # user key (first 16 bytes)
  1662. movaps %xmm0, (KEYP)
  1663. lea 0x10(KEYP), TKEYP # key addr
  1664. movl %edx, 480(KEYP)
  1665. pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
  1666. cmp $24, %dl
  1667. jb .Lenc_key128
  1668. je .Lenc_key192
  1669. movups 0x10(UKEYP), %xmm2 # other user key
  1670. movaps %xmm2, (TKEYP)
  1671. add $0x10, TKEYP
  1672. AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
  1673. call _key_expansion_256a
  1674. AESKEYGENASSIST 0x1 %xmm0 %xmm1
  1675. call _key_expansion_256b
  1676. AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
  1677. call _key_expansion_256a
  1678. AESKEYGENASSIST 0x2 %xmm0 %xmm1
  1679. call _key_expansion_256b
  1680. AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
  1681. call _key_expansion_256a
  1682. AESKEYGENASSIST 0x4 %xmm0 %xmm1
  1683. call _key_expansion_256b
  1684. AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
  1685. call _key_expansion_256a
  1686. AESKEYGENASSIST 0x8 %xmm0 %xmm1
  1687. call _key_expansion_256b
  1688. AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
  1689. call _key_expansion_256a
  1690. AESKEYGENASSIST 0x10 %xmm0 %xmm1
  1691. call _key_expansion_256b
  1692. AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
  1693. call _key_expansion_256a
  1694. AESKEYGENASSIST 0x20 %xmm0 %xmm1
  1695. call _key_expansion_256b
  1696. AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
  1697. call _key_expansion_256a
  1698. jmp .Ldec_key
  1699. .Lenc_key192:
  1700. movq 0x10(UKEYP), %xmm2 # other user key
  1701. AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
  1702. call _key_expansion_192a
  1703. AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
  1704. call _key_expansion_192b
  1705. AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
  1706. call _key_expansion_192a
  1707. AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
  1708. call _key_expansion_192b
  1709. AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
  1710. call _key_expansion_192a
  1711. AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
  1712. call _key_expansion_192b
  1713. AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
  1714. call _key_expansion_192a
  1715. AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
  1716. call _key_expansion_192b
  1717. jmp .Ldec_key
  1718. .Lenc_key128:
  1719. AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
  1720. call _key_expansion_128
  1721. AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
  1722. call _key_expansion_128
  1723. AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
  1724. call _key_expansion_128
  1725. AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
  1726. call _key_expansion_128
  1727. AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
  1728. call _key_expansion_128
  1729. AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
  1730. call _key_expansion_128
  1731. AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
  1732. call _key_expansion_128
  1733. AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
  1734. call _key_expansion_128
  1735. AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
  1736. call _key_expansion_128
  1737. AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
  1738. call _key_expansion_128
  1739. .Ldec_key:
  1740. sub $0x10, TKEYP
  1741. movaps (KEYP), %xmm0
  1742. movaps (TKEYP), %xmm1
  1743. movaps %xmm0, 240(TKEYP)
  1744. movaps %xmm1, 240(KEYP)
  1745. add $0x10, KEYP
  1746. lea 240-16(TKEYP), UKEYP
  1747. .align 4
  1748. .Ldec_key_loop:
  1749. movaps (KEYP), %xmm0
  1750. AESIMC %xmm0 %xmm1
  1751. movaps %xmm1, (UKEYP)
  1752. add $0x10, KEYP
  1753. sub $0x10, UKEYP
  1754. cmp TKEYP, KEYP
  1755. jb .Ldec_key_loop
  1756. xor AREG, AREG
  1757. #ifndef __x86_64__
  1758. popl KEYP
  1759. #endif
  1760. FRAME_END
  1761. ret
  1762. ENDPROC(aesni_set_key)
  1763. /*
  1764. * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
  1765. */
  1766. ENTRY(aesni_enc)
  1767. FRAME_BEGIN
  1768. #ifndef __x86_64__
  1769. pushl KEYP
  1770. pushl KLEN
  1771. movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
  1772. movl (FRAME_OFFSET+16)(%esp), OUTP # dst
  1773. movl (FRAME_OFFSET+20)(%esp), INP # src
  1774. #endif
  1775. movl 480(KEYP), KLEN # key length
  1776. movups (INP), STATE # input
  1777. call _aesni_enc1
  1778. movups STATE, (OUTP) # output
  1779. #ifndef __x86_64__
  1780. popl KLEN
  1781. popl KEYP
  1782. #endif
  1783. FRAME_END
  1784. ret
  1785. ENDPROC(aesni_enc)
  1786. /*
  1787. * _aesni_enc1: internal ABI
  1788. * input:
  1789. * KEYP: key struct pointer
  1790. * KLEN: round count
  1791. * STATE: initial state (input)
  1792. * output:
  1793. * STATE: finial state (output)
  1794. * changed:
  1795. * KEY
  1796. * TKEYP (T1)
  1797. */
  1798. .align 4
  1799. _aesni_enc1:
  1800. movaps (KEYP), KEY # key
  1801. mov KEYP, TKEYP
  1802. pxor KEY, STATE # round 0
  1803. add $0x30, TKEYP
  1804. cmp $24, KLEN
  1805. jb .Lenc128
  1806. lea 0x20(TKEYP), TKEYP
  1807. je .Lenc192
  1808. add $0x20, TKEYP
  1809. movaps -0x60(TKEYP), KEY
  1810. AESENC KEY STATE
  1811. movaps -0x50(TKEYP), KEY
  1812. AESENC KEY STATE
  1813. .align 4
  1814. .Lenc192:
  1815. movaps -0x40(TKEYP), KEY
  1816. AESENC KEY STATE
  1817. movaps -0x30(TKEYP), KEY
  1818. AESENC KEY STATE
  1819. .align 4
  1820. .Lenc128:
  1821. movaps -0x20(TKEYP), KEY
  1822. AESENC KEY STATE
  1823. movaps -0x10(TKEYP), KEY
  1824. AESENC KEY STATE
  1825. movaps (TKEYP), KEY
  1826. AESENC KEY STATE
  1827. movaps 0x10(TKEYP), KEY
  1828. AESENC KEY STATE
  1829. movaps 0x20(TKEYP), KEY
  1830. AESENC KEY STATE
  1831. movaps 0x30(TKEYP), KEY
  1832. AESENC KEY STATE
  1833. movaps 0x40(TKEYP), KEY
  1834. AESENC KEY STATE
  1835. movaps 0x50(TKEYP), KEY
  1836. AESENC KEY STATE
  1837. movaps 0x60(TKEYP), KEY
  1838. AESENC KEY STATE
  1839. movaps 0x70(TKEYP), KEY
  1840. AESENCLAST KEY STATE
  1841. ret
  1842. ENDPROC(_aesni_enc1)
  1843. /*
  1844. * _aesni_enc4: internal ABI
  1845. * input:
  1846. * KEYP: key struct pointer
  1847. * KLEN: round count
  1848. * STATE1: initial state (input)
  1849. * STATE2
  1850. * STATE3
  1851. * STATE4
  1852. * output:
  1853. * STATE1: finial state (output)
  1854. * STATE2
  1855. * STATE3
  1856. * STATE4
  1857. * changed:
  1858. * KEY
  1859. * TKEYP (T1)
  1860. */
  1861. .align 4
  1862. _aesni_enc4:
  1863. movaps (KEYP), KEY # key
  1864. mov KEYP, TKEYP
  1865. pxor KEY, STATE1 # round 0
  1866. pxor KEY, STATE2
  1867. pxor KEY, STATE3
  1868. pxor KEY, STATE4
  1869. add $0x30, TKEYP
  1870. cmp $24, KLEN
  1871. jb .L4enc128
  1872. lea 0x20(TKEYP), TKEYP
  1873. je .L4enc192
  1874. add $0x20, TKEYP
  1875. movaps -0x60(TKEYP), KEY
  1876. AESENC KEY STATE1
  1877. AESENC KEY STATE2
  1878. AESENC KEY STATE3
  1879. AESENC KEY STATE4
  1880. movaps -0x50(TKEYP), KEY
  1881. AESENC KEY STATE1
  1882. AESENC KEY STATE2
  1883. AESENC KEY STATE3
  1884. AESENC KEY STATE4
  1885. #.align 4
  1886. .L4enc192:
  1887. movaps -0x40(TKEYP), KEY
  1888. AESENC KEY STATE1
  1889. AESENC KEY STATE2
  1890. AESENC KEY STATE3
  1891. AESENC KEY STATE4
  1892. movaps -0x30(TKEYP), KEY
  1893. AESENC KEY STATE1
  1894. AESENC KEY STATE2
  1895. AESENC KEY STATE3
  1896. AESENC KEY STATE4
  1897. #.align 4
  1898. .L4enc128:
  1899. movaps -0x20(TKEYP), KEY
  1900. AESENC KEY STATE1
  1901. AESENC KEY STATE2
  1902. AESENC KEY STATE3
  1903. AESENC KEY STATE4
  1904. movaps -0x10(TKEYP), KEY
  1905. AESENC KEY STATE1
  1906. AESENC KEY STATE2
  1907. AESENC KEY STATE3
  1908. AESENC KEY STATE4
  1909. movaps (TKEYP), KEY
  1910. AESENC KEY STATE1
  1911. AESENC KEY STATE2
  1912. AESENC KEY STATE3
  1913. AESENC KEY STATE4
  1914. movaps 0x10(TKEYP), KEY
  1915. AESENC KEY STATE1
  1916. AESENC KEY STATE2
  1917. AESENC KEY STATE3
  1918. AESENC KEY STATE4
  1919. movaps 0x20(TKEYP), KEY
  1920. AESENC KEY STATE1
  1921. AESENC KEY STATE2
  1922. AESENC KEY STATE3
  1923. AESENC KEY STATE4
  1924. movaps 0x30(TKEYP), KEY
  1925. AESENC KEY STATE1
  1926. AESENC KEY STATE2
  1927. AESENC KEY STATE3
  1928. AESENC KEY STATE4
  1929. movaps 0x40(TKEYP), KEY
  1930. AESENC KEY STATE1
  1931. AESENC KEY STATE2
  1932. AESENC KEY STATE3
  1933. AESENC KEY STATE4
  1934. movaps 0x50(TKEYP), KEY
  1935. AESENC KEY STATE1
  1936. AESENC KEY STATE2
  1937. AESENC KEY STATE3
  1938. AESENC KEY STATE4
  1939. movaps 0x60(TKEYP), KEY
  1940. AESENC KEY STATE1
  1941. AESENC KEY STATE2
  1942. AESENC KEY STATE3
  1943. AESENC KEY STATE4
  1944. movaps 0x70(TKEYP), KEY
  1945. AESENCLAST KEY STATE1 # last round
  1946. AESENCLAST KEY STATE2
  1947. AESENCLAST KEY STATE3
  1948. AESENCLAST KEY STATE4
  1949. ret
  1950. ENDPROC(_aesni_enc4)
  1951. /*
  1952. * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
  1953. */
  1954. ENTRY(aesni_dec)
  1955. FRAME_BEGIN
  1956. #ifndef __x86_64__
  1957. pushl KEYP
  1958. pushl KLEN
  1959. movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
  1960. movl (FRAME_OFFSET+16)(%esp), OUTP # dst
  1961. movl (FRAME_OFFSET+20)(%esp), INP # src
  1962. #endif
  1963. mov 480(KEYP), KLEN # key length
  1964. add $240, KEYP
  1965. movups (INP), STATE # input
  1966. call _aesni_dec1
  1967. movups STATE, (OUTP) #output
  1968. #ifndef __x86_64__
  1969. popl KLEN
  1970. popl KEYP
  1971. #endif
  1972. FRAME_END
  1973. ret
  1974. ENDPROC(aesni_dec)
  1975. /*
  1976. * _aesni_dec1: internal ABI
  1977. * input:
  1978. * KEYP: key struct pointer
  1979. * KLEN: key length
  1980. * STATE: initial state (input)
  1981. * output:
  1982. * STATE: finial state (output)
  1983. * changed:
  1984. * KEY
  1985. * TKEYP (T1)
  1986. */
  1987. .align 4
  1988. _aesni_dec1:
  1989. movaps (KEYP), KEY # key
  1990. mov KEYP, TKEYP
  1991. pxor KEY, STATE # round 0
  1992. add $0x30, TKEYP
  1993. cmp $24, KLEN
  1994. jb .Ldec128
  1995. lea 0x20(TKEYP), TKEYP
  1996. je .Ldec192
  1997. add $0x20, TKEYP
  1998. movaps -0x60(TKEYP), KEY
  1999. AESDEC KEY STATE
  2000. movaps -0x50(TKEYP), KEY
  2001. AESDEC KEY STATE
  2002. .align 4
  2003. .Ldec192:
  2004. movaps -0x40(TKEYP), KEY
  2005. AESDEC KEY STATE
  2006. movaps -0x30(TKEYP), KEY
  2007. AESDEC KEY STATE
  2008. .align 4
  2009. .Ldec128:
  2010. movaps -0x20(TKEYP), KEY
  2011. AESDEC KEY STATE
  2012. movaps -0x10(TKEYP), KEY
  2013. AESDEC KEY STATE
  2014. movaps (TKEYP), KEY
  2015. AESDEC KEY STATE
  2016. movaps 0x10(TKEYP), KEY
  2017. AESDEC KEY STATE
  2018. movaps 0x20(TKEYP), KEY
  2019. AESDEC KEY STATE
  2020. movaps 0x30(TKEYP), KEY
  2021. AESDEC KEY STATE
  2022. movaps 0x40(TKEYP), KEY
  2023. AESDEC KEY STATE
  2024. movaps 0x50(TKEYP), KEY
  2025. AESDEC KEY STATE
  2026. movaps 0x60(TKEYP), KEY
  2027. AESDEC KEY STATE
  2028. movaps 0x70(TKEYP), KEY
  2029. AESDECLAST KEY STATE
  2030. ret
  2031. ENDPROC(_aesni_dec1)
  2032. /*
  2033. * _aesni_dec4: internal ABI
  2034. * input:
  2035. * KEYP: key struct pointer
  2036. * KLEN: key length
  2037. * STATE1: initial state (input)
  2038. * STATE2
  2039. * STATE3
  2040. * STATE4
  2041. * output:
  2042. * STATE1: finial state (output)
  2043. * STATE2
  2044. * STATE3
  2045. * STATE4
  2046. * changed:
  2047. * KEY
  2048. * TKEYP (T1)
  2049. */
  2050. .align 4
  2051. _aesni_dec4:
  2052. movaps (KEYP), KEY # key
  2053. mov KEYP, TKEYP
  2054. pxor KEY, STATE1 # round 0
  2055. pxor KEY, STATE2
  2056. pxor KEY, STATE3
  2057. pxor KEY, STATE4
  2058. add $0x30, TKEYP
  2059. cmp $24, KLEN
  2060. jb .L4dec128
  2061. lea 0x20(TKEYP), TKEYP
  2062. je .L4dec192
  2063. add $0x20, TKEYP
  2064. movaps -0x60(TKEYP), KEY
  2065. AESDEC KEY STATE1
  2066. AESDEC KEY STATE2
  2067. AESDEC KEY STATE3
  2068. AESDEC KEY STATE4
  2069. movaps -0x50(TKEYP), KEY
  2070. AESDEC KEY STATE1
  2071. AESDEC KEY STATE2
  2072. AESDEC KEY STATE3
  2073. AESDEC KEY STATE4
  2074. .align 4
  2075. .L4dec192:
  2076. movaps -0x40(TKEYP), KEY
  2077. AESDEC KEY STATE1
  2078. AESDEC KEY STATE2
  2079. AESDEC KEY STATE3
  2080. AESDEC KEY STATE4
  2081. movaps -0x30(TKEYP), KEY
  2082. AESDEC KEY STATE1
  2083. AESDEC KEY STATE2
  2084. AESDEC KEY STATE3
  2085. AESDEC KEY STATE4
  2086. .align 4
  2087. .L4dec128:
  2088. movaps -0x20(TKEYP), KEY
  2089. AESDEC KEY STATE1
  2090. AESDEC KEY STATE2
  2091. AESDEC KEY STATE3
  2092. AESDEC KEY STATE4
  2093. movaps -0x10(TKEYP), KEY
  2094. AESDEC KEY STATE1
  2095. AESDEC KEY STATE2
  2096. AESDEC KEY STATE3
  2097. AESDEC KEY STATE4
  2098. movaps (TKEYP), KEY
  2099. AESDEC KEY STATE1
  2100. AESDEC KEY STATE2
  2101. AESDEC KEY STATE3
  2102. AESDEC KEY STATE4
  2103. movaps 0x10(TKEYP), KEY
  2104. AESDEC KEY STATE1
  2105. AESDEC KEY STATE2
  2106. AESDEC KEY STATE3
  2107. AESDEC KEY STATE4
  2108. movaps 0x20(TKEYP), KEY
  2109. AESDEC KEY STATE1
  2110. AESDEC KEY STATE2
  2111. AESDEC KEY STATE3
  2112. AESDEC KEY STATE4
  2113. movaps 0x30(TKEYP), KEY
  2114. AESDEC KEY STATE1
  2115. AESDEC KEY STATE2
  2116. AESDEC KEY STATE3
  2117. AESDEC KEY STATE4
  2118. movaps 0x40(TKEYP), KEY
  2119. AESDEC KEY STATE1
  2120. AESDEC KEY STATE2
  2121. AESDEC KEY STATE3
  2122. AESDEC KEY STATE4
  2123. movaps 0x50(TKEYP), KEY
  2124. AESDEC KEY STATE1
  2125. AESDEC KEY STATE2
  2126. AESDEC KEY STATE3
  2127. AESDEC KEY STATE4
  2128. movaps 0x60(TKEYP), KEY
  2129. AESDEC KEY STATE1
  2130. AESDEC KEY STATE2
  2131. AESDEC KEY STATE3
  2132. AESDEC KEY STATE4
  2133. movaps 0x70(TKEYP), KEY
  2134. AESDECLAST KEY STATE1 # last round
  2135. AESDECLAST KEY STATE2
  2136. AESDECLAST KEY STATE3
  2137. AESDECLAST KEY STATE4
  2138. ret
  2139. ENDPROC(_aesni_dec4)
  2140. /*
  2141. * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
  2142. * size_t len)
  2143. */
  2144. ENTRY(aesni_ecb_enc)
  2145. FRAME_BEGIN
  2146. #ifndef __x86_64__
  2147. pushl LEN
  2148. pushl KEYP
  2149. pushl KLEN
  2150. movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
  2151. movl (FRAME_OFFSET+20)(%esp), OUTP # dst
  2152. movl (FRAME_OFFSET+24)(%esp), INP # src
  2153. movl (FRAME_OFFSET+28)(%esp), LEN # len
  2154. #endif
  2155. test LEN, LEN # check length
  2156. jz .Lecb_enc_ret
  2157. mov 480(KEYP), KLEN
  2158. cmp $16, LEN
  2159. jb .Lecb_enc_ret
  2160. cmp $64, LEN
  2161. jb .Lecb_enc_loop1
  2162. .align 4
  2163. .Lecb_enc_loop4:
  2164. movups (INP), STATE1
  2165. movups 0x10(INP), STATE2
  2166. movups 0x20(INP), STATE3
  2167. movups 0x30(INP), STATE4
  2168. call _aesni_enc4
  2169. movups STATE1, (OUTP)
  2170. movups STATE2, 0x10(OUTP)
  2171. movups STATE3, 0x20(OUTP)
  2172. movups STATE4, 0x30(OUTP)
  2173. sub $64, LEN
  2174. add $64, INP
  2175. add $64, OUTP
  2176. cmp $64, LEN
  2177. jge .Lecb_enc_loop4
  2178. cmp $16, LEN
  2179. jb .Lecb_enc_ret
  2180. .align 4
  2181. .Lecb_enc_loop1:
  2182. movups (INP), STATE1
  2183. call _aesni_enc1
  2184. movups STATE1, (OUTP)
  2185. sub $16, LEN
  2186. add $16, INP
  2187. add $16, OUTP
  2188. cmp $16, LEN
  2189. jge .Lecb_enc_loop1
  2190. .Lecb_enc_ret:
  2191. #ifndef __x86_64__
  2192. popl KLEN
  2193. popl KEYP
  2194. popl LEN
  2195. #endif
  2196. FRAME_END
  2197. ret
  2198. ENDPROC(aesni_ecb_enc)
  2199. /*
  2200. * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
  2201. * size_t len);
  2202. */
  2203. ENTRY(aesni_ecb_dec)
  2204. FRAME_BEGIN
  2205. #ifndef __x86_64__
  2206. pushl LEN
  2207. pushl KEYP
  2208. pushl KLEN
  2209. movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
  2210. movl (FRAME_OFFSET+20)(%esp), OUTP # dst
  2211. movl (FRAME_OFFSET+24)(%esp), INP # src
  2212. movl (FRAME_OFFSET+28)(%esp), LEN # len
  2213. #endif
  2214. test LEN, LEN
  2215. jz .Lecb_dec_ret
  2216. mov 480(KEYP), KLEN
  2217. add $240, KEYP
  2218. cmp $16, LEN
  2219. jb .Lecb_dec_ret
  2220. cmp $64, LEN
  2221. jb .Lecb_dec_loop1
  2222. .align 4
  2223. .Lecb_dec_loop4:
  2224. movups (INP), STATE1
  2225. movups 0x10(INP), STATE2
  2226. movups 0x20(INP), STATE3
  2227. movups 0x30(INP), STATE4
  2228. call _aesni_dec4
  2229. movups STATE1, (OUTP)
  2230. movups STATE2, 0x10(OUTP)
  2231. movups STATE3, 0x20(OUTP)
  2232. movups STATE4, 0x30(OUTP)
  2233. sub $64, LEN
  2234. add $64, INP
  2235. add $64, OUTP
  2236. cmp $64, LEN
  2237. jge .Lecb_dec_loop4
  2238. cmp $16, LEN
  2239. jb .Lecb_dec_ret
  2240. .align 4
  2241. .Lecb_dec_loop1:
  2242. movups (INP), STATE1
  2243. call _aesni_dec1
  2244. movups STATE1, (OUTP)
  2245. sub $16, LEN
  2246. add $16, INP
  2247. add $16, OUTP
  2248. cmp $16, LEN
  2249. jge .Lecb_dec_loop1
  2250. .Lecb_dec_ret:
  2251. #ifndef __x86_64__
  2252. popl KLEN
  2253. popl KEYP
  2254. popl LEN
  2255. #endif
  2256. FRAME_END
  2257. ret
  2258. ENDPROC(aesni_ecb_dec)
  2259. /*
  2260. * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
  2261. * size_t len, u8 *iv)
  2262. */
  2263. ENTRY(aesni_cbc_enc)
  2264. FRAME_BEGIN
  2265. #ifndef __x86_64__
  2266. pushl IVP
  2267. pushl LEN
  2268. pushl KEYP
  2269. pushl KLEN
  2270. movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
  2271. movl (FRAME_OFFSET+24)(%esp), OUTP # dst
  2272. movl (FRAME_OFFSET+28)(%esp), INP # src
  2273. movl (FRAME_OFFSET+32)(%esp), LEN # len
  2274. movl (FRAME_OFFSET+36)(%esp), IVP # iv
  2275. #endif
  2276. cmp $16, LEN
  2277. jb .Lcbc_enc_ret
  2278. mov 480(KEYP), KLEN
  2279. movups (IVP), STATE # load iv as initial state
  2280. .align 4
  2281. .Lcbc_enc_loop:
  2282. movups (INP), IN # load input
  2283. pxor IN, STATE
  2284. call _aesni_enc1
  2285. movups STATE, (OUTP) # store output
  2286. sub $16, LEN
  2287. add $16, INP
  2288. add $16, OUTP
  2289. cmp $16, LEN
  2290. jge .Lcbc_enc_loop
  2291. movups STATE, (IVP)
  2292. .Lcbc_enc_ret:
  2293. #ifndef __x86_64__
  2294. popl KLEN
  2295. popl KEYP
  2296. popl LEN
  2297. popl IVP
  2298. #endif
  2299. FRAME_END
  2300. ret
  2301. ENDPROC(aesni_cbc_enc)
  2302. /*
  2303. * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
  2304. * size_t len, u8 *iv)
  2305. */
  2306. ENTRY(aesni_cbc_dec)
  2307. FRAME_BEGIN
  2308. #ifndef __x86_64__
  2309. pushl IVP
  2310. pushl LEN
  2311. pushl KEYP
  2312. pushl KLEN
  2313. movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
  2314. movl (FRAME_OFFSET+24)(%esp), OUTP # dst
  2315. movl (FRAME_OFFSET+28)(%esp), INP # src
  2316. movl (FRAME_OFFSET+32)(%esp), LEN # len
  2317. movl (FRAME_OFFSET+36)(%esp), IVP # iv
  2318. #endif
  2319. cmp $16, LEN
  2320. jb .Lcbc_dec_just_ret
  2321. mov 480(KEYP), KLEN
  2322. add $240, KEYP
  2323. movups (IVP), IV
  2324. cmp $64, LEN
  2325. jb .Lcbc_dec_loop1
  2326. .align 4
  2327. .Lcbc_dec_loop4:
  2328. movups (INP), IN1
  2329. movaps IN1, STATE1
  2330. movups 0x10(INP), IN2
  2331. movaps IN2, STATE2
  2332. #ifdef __x86_64__
  2333. movups 0x20(INP), IN3
  2334. movaps IN3, STATE3
  2335. movups 0x30(INP), IN4
  2336. movaps IN4, STATE4
  2337. #else
  2338. movups 0x20(INP), IN1
  2339. movaps IN1, STATE3
  2340. movups 0x30(INP), IN2
  2341. movaps IN2, STATE4
  2342. #endif
  2343. call _aesni_dec4
  2344. pxor IV, STATE1
  2345. #ifdef __x86_64__
  2346. pxor IN1, STATE2
  2347. pxor IN2, STATE3
  2348. pxor IN3, STATE4
  2349. movaps IN4, IV
  2350. #else
  2351. pxor IN1, STATE4
  2352. movaps IN2, IV
  2353. movups (INP), IN1
  2354. pxor IN1, STATE2
  2355. movups 0x10(INP), IN2
  2356. pxor IN2, STATE3
  2357. #endif
  2358. movups STATE1, (OUTP)
  2359. movups STATE2, 0x10(OUTP)
  2360. movups STATE3, 0x20(OUTP)
  2361. movups STATE4, 0x30(OUTP)
  2362. sub $64, LEN
  2363. add $64, INP
  2364. add $64, OUTP
  2365. cmp $64, LEN
  2366. jge .Lcbc_dec_loop4
  2367. cmp $16, LEN
  2368. jb .Lcbc_dec_ret
  2369. .align 4
  2370. .Lcbc_dec_loop1:
  2371. movups (INP), IN
  2372. movaps IN, STATE
  2373. call _aesni_dec1
  2374. pxor IV, STATE
  2375. movups STATE, (OUTP)
  2376. movaps IN, IV
  2377. sub $16, LEN
  2378. add $16, INP
  2379. add $16, OUTP
  2380. cmp $16, LEN
  2381. jge .Lcbc_dec_loop1
  2382. .Lcbc_dec_ret:
  2383. movups IV, (IVP)
  2384. .Lcbc_dec_just_ret:
  2385. #ifndef __x86_64__
  2386. popl KLEN
  2387. popl KEYP
  2388. popl LEN
  2389. popl IVP
  2390. #endif
  2391. FRAME_END
  2392. ret
  2393. ENDPROC(aesni_cbc_dec)
  2394. #ifdef __x86_64__
  2395. .pushsection .rodata
  2396. .align 16
  2397. .Lbswap_mask:
  2398. .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
  2399. .popsection
  2400. /*
  2401. * _aesni_inc_init: internal ABI
  2402. * setup registers used by _aesni_inc
  2403. * input:
  2404. * IV
  2405. * output:
  2406. * CTR: == IV, in little endian
  2407. * TCTR_LOW: == lower qword of CTR
  2408. * INC: == 1, in little endian
  2409. * BSWAP_MASK == endian swapping mask
  2410. */
  2411. .align 4
  2412. _aesni_inc_init:
  2413. movaps .Lbswap_mask, BSWAP_MASK
  2414. movaps IV, CTR
  2415. PSHUFB_XMM BSWAP_MASK CTR
  2416. mov $1, TCTR_LOW
  2417. MOVQ_R64_XMM TCTR_LOW INC
  2418. MOVQ_R64_XMM CTR TCTR_LOW
  2419. ret
  2420. ENDPROC(_aesni_inc_init)
  2421. /*
  2422. * _aesni_inc: internal ABI
  2423. * Increase IV by 1, IV is in big endian
  2424. * input:
  2425. * IV
  2426. * CTR: == IV, in little endian
  2427. * TCTR_LOW: == lower qword of CTR
  2428. * INC: == 1, in little endian
  2429. * BSWAP_MASK == endian swapping mask
  2430. * output:
  2431. * IV: Increase by 1
  2432. * changed:
  2433. * CTR: == output IV, in little endian
  2434. * TCTR_LOW: == lower qword of CTR
  2435. */
  2436. .align 4
  2437. _aesni_inc:
  2438. paddq INC, CTR
  2439. add $1, TCTR_LOW
  2440. jnc .Linc_low
  2441. pslldq $8, INC
  2442. paddq INC, CTR
  2443. psrldq $8, INC
  2444. .Linc_low:
  2445. movaps CTR, IV
  2446. PSHUFB_XMM BSWAP_MASK IV
  2447. ret
  2448. ENDPROC(_aesni_inc)
  2449. /*
  2450. * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
  2451. * size_t len, u8 *iv)
  2452. */
  2453. ENTRY(aesni_ctr_enc)
  2454. FRAME_BEGIN
  2455. cmp $16, LEN
  2456. jb .Lctr_enc_just_ret
  2457. mov 480(KEYP), KLEN
  2458. movups (IVP), IV
  2459. call _aesni_inc_init
  2460. cmp $64, LEN
  2461. jb .Lctr_enc_loop1
  2462. .align 4
  2463. .Lctr_enc_loop4:
  2464. movaps IV, STATE1
  2465. call _aesni_inc
  2466. movups (INP), IN1
  2467. movaps IV, STATE2
  2468. call _aesni_inc
  2469. movups 0x10(INP), IN2
  2470. movaps IV, STATE3
  2471. call _aesni_inc
  2472. movups 0x20(INP), IN3
  2473. movaps IV, STATE4
  2474. call _aesni_inc
  2475. movups 0x30(INP), IN4
  2476. call _aesni_enc4
  2477. pxor IN1, STATE1
  2478. movups STATE1, (OUTP)
  2479. pxor IN2, STATE2
  2480. movups STATE2, 0x10(OUTP)
  2481. pxor IN3, STATE3
  2482. movups STATE3, 0x20(OUTP)
  2483. pxor IN4, STATE4
  2484. movups STATE4, 0x30(OUTP)
  2485. sub $64, LEN
  2486. add $64, INP
  2487. add $64, OUTP
  2488. cmp $64, LEN
  2489. jge .Lctr_enc_loop4
  2490. cmp $16, LEN
  2491. jb .Lctr_enc_ret
  2492. .align 4
  2493. .Lctr_enc_loop1:
  2494. movaps IV, STATE
  2495. call _aesni_inc
  2496. movups (INP), IN
  2497. call _aesni_enc1
  2498. pxor IN, STATE
  2499. movups STATE, (OUTP)
  2500. sub $16, LEN
  2501. add $16, INP
  2502. add $16, OUTP
  2503. cmp $16, LEN
  2504. jge .Lctr_enc_loop1
  2505. .Lctr_enc_ret:
  2506. movups IV, (IVP)
  2507. .Lctr_enc_just_ret:
  2508. FRAME_END
  2509. ret
  2510. ENDPROC(aesni_ctr_enc)
  2511. /*
  2512. * _aesni_gf128mul_x_ble: internal ABI
  2513. * Multiply in GF(2^128) for XTS IVs
  2514. * input:
  2515. * IV: current IV
  2516. * GF128MUL_MASK == mask with 0x87 and 0x01
  2517. * output:
  2518. * IV: next IV
  2519. * changed:
  2520. * CTR: == temporary value
  2521. */
  2522. #define _aesni_gf128mul_x_ble() \
  2523. pshufd $0x13, IV, CTR; \
  2524. paddq IV, IV; \
  2525. psrad $31, CTR; \
  2526. pand GF128MUL_MASK, CTR; \
  2527. pxor CTR, IV;
  2528. /*
  2529. * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
  2530. * bool enc, u8 *iv)
  2531. */
  2532. ENTRY(aesni_xts_crypt8)
  2533. FRAME_BEGIN
  2534. cmpb $0, %cl
  2535. movl $0, %ecx
  2536. movl $240, %r10d
  2537. leaq _aesni_enc4, %r11
  2538. leaq _aesni_dec4, %rax
  2539. cmovel %r10d, %ecx
  2540. cmoveq %rax, %r11
  2541. movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
  2542. movups (IVP), IV
  2543. mov 480(KEYP), KLEN
  2544. addq %rcx, KEYP
  2545. movdqa IV, STATE1
  2546. movdqu 0x00(INP), INC
  2547. pxor INC, STATE1
  2548. movdqu IV, 0x00(OUTP)
  2549. _aesni_gf128mul_x_ble()
  2550. movdqa IV, STATE2
  2551. movdqu 0x10(INP), INC
  2552. pxor INC, STATE2
  2553. movdqu IV, 0x10(OUTP)
  2554. _aesni_gf128mul_x_ble()
  2555. movdqa IV, STATE3
  2556. movdqu 0x20(INP), INC
  2557. pxor INC, STATE3
  2558. movdqu IV, 0x20(OUTP)
  2559. _aesni_gf128mul_x_ble()
  2560. movdqa IV, STATE4
  2561. movdqu 0x30(INP), INC
  2562. pxor INC, STATE4
  2563. movdqu IV, 0x30(OUTP)
  2564. CALL_NOSPEC %r11
  2565. movdqu 0x00(OUTP), INC
  2566. pxor INC, STATE1
  2567. movdqu STATE1, 0x00(OUTP)
  2568. _aesni_gf128mul_x_ble()
  2569. movdqa IV, STATE1
  2570. movdqu 0x40(INP), INC
  2571. pxor INC, STATE1
  2572. movdqu IV, 0x40(OUTP)
  2573. movdqu 0x10(OUTP), INC
  2574. pxor INC, STATE2
  2575. movdqu STATE2, 0x10(OUTP)
  2576. _aesni_gf128mul_x_ble()
  2577. movdqa IV, STATE2
  2578. movdqu 0x50(INP), INC
  2579. pxor INC, STATE2
  2580. movdqu IV, 0x50(OUTP)
  2581. movdqu 0x20(OUTP), INC
  2582. pxor INC, STATE3
  2583. movdqu STATE3, 0x20(OUTP)
  2584. _aesni_gf128mul_x_ble()
  2585. movdqa IV, STATE3
  2586. movdqu 0x60(INP), INC
  2587. pxor INC, STATE3
  2588. movdqu IV, 0x60(OUTP)
  2589. movdqu 0x30(OUTP), INC
  2590. pxor INC, STATE4
  2591. movdqu STATE4, 0x30(OUTP)
  2592. _aesni_gf128mul_x_ble()
  2593. movdqa IV, STATE4
  2594. movdqu 0x70(INP), INC
  2595. pxor INC, STATE4
  2596. movdqu IV, 0x70(OUTP)
  2597. _aesni_gf128mul_x_ble()
  2598. movups IV, (IVP)
  2599. CALL_NOSPEC %r11
  2600. movdqu 0x40(OUTP), INC
  2601. pxor INC, STATE1
  2602. movdqu STATE1, 0x40(OUTP)
  2603. movdqu 0x50(OUTP), INC
  2604. pxor INC, STATE2
  2605. movdqu STATE2, 0x50(OUTP)
  2606. movdqu 0x60(OUTP), INC
  2607. pxor INC, STATE3
  2608. movdqu STATE3, 0x60(OUTP)
  2609. movdqu 0x70(OUTP), INC
  2610. pxor INC, STATE4
  2611. movdqu STATE4, 0x70(OUTP)
  2612. FRAME_END
  2613. ret
  2614. ENDPROC(aesni_xts_crypt8)
  2615. #endif