/* SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Copyright:
 *   2018-2020 Evan Nemerson <evan@nemerson.com>
 *   2019-2020 Michael R. Crusoe <crusoe@debian.org>
 *   2020      Himanshi Mathur <himanshi18037@iiitd.ac.in>
 *   2020      Hidayat Khan <huk2209@gmail.com>
 */

#if !defined(SIMDE_X86_AVX2_H)
#define SIMDE_X86_AVX2_H

#include "avx.h"

HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
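/* Implementation pattern used throughout this file: each wrapper forwards to
 * the native intrinsic when SIMDE_X86_AVX2_NATIVE is defined; otherwise it
 * falls back to operating on the two 128-bit halves (r_.m128i[0] and
 * r_.m128i[1]) when only 128-bit vectors are natural, then to compiler vector
 * extensions (SIMDE_VECTOR_SUBSCRIPT_OPS) where available, and finally to a
 * SIMDE_VECTORIZE'd scalar loop over the lanes of the private union. */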
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_abs_epi8 (simde__m256i a) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_abs_epi8(a);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_abs_epi8(a_.m128i[0]);
      r_.m128i[1] = simde_mm_abs_epi8(a_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        r_.i8[i] = (a_.i8[i] < INT32_C(0)) ? -a_.i8[i] : a_.i8[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_abs_epi8
  #define _mm256_abs_epi8(a) simde_mm256_abs_epi8(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_abs_epi16 (simde__m256i a) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_abs_epi16(a);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_abs_epi16(a_.m128i[0]);
      r_.m128i[1] = simde_mm_abs_epi16(a_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = (a_.i16[i] < INT32_C(0)) ? -a_.i16[i] : a_.i16[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_abs_epi16
  #define _mm256_abs_epi16(a) simde_mm256_abs_epi16(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_abs_epi32 (simde__m256i a) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_abs_epi32(a);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_abs_epi32(a_.m128i[0]);
      r_.m128i[1] = simde_mm_abs_epi32(a_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        r_.i32[i] = (a_.i32[i] < INT32_C(0)) ? -a_.i32[i] : a_.i32[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_abs_epi32
  #define _mm256_abs_epi32(a) simde_mm256_abs_epi32(a)
#endif
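/* Note on the abs family: as with the hardware PABSB/PABSW/PABSD
 * instructions, the most negative element maps to itself (for example
 * simde_mm256_abs_epi8 leaves INT8_MIN unchanged); the scalar fallback above
 * produces the same result in practice because the narrowing conversion of
 * -INT8_MIN wraps on the usual two's-complement targets. */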
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_add_epi8 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_add_epi8(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_add_epi8(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_add_epi8(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i8 = a_.i8 + b_.i8;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        r_.i8[i] = a_.i8[i] + b_.i8[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_add_epi8
  #define _mm256_add_epi8(a, b) simde_mm256_add_epi8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_add_epi16 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_add_epi16(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_add_epi16(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_add_epi16(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i16 = a_.i16 + b_.i16;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = a_.i16[i] + b_.i16[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_add_epi16
  #define _mm256_add_epi16(a, b) simde_mm256_add_epi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_hadd_epi16 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_hadd_epi16(a, b);
  #else
    return simde_mm256_add_epi16(simde_x_mm256_deinterleaveeven_epi16(a, b), simde_x_mm256_deinterleaveodd_epi16(a, b));
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_hadd_epi16
  #define _mm256_hadd_epi16(a, b) simde_mm256_hadd_epi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_add_epi32 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_add_epi32(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_add_epi32(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_add_epi32(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32 = a_.i32 + b_.i32;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        r_.i32[i] = a_.i32[i] + b_.i32[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_add_epi32
  #define _mm256_add_epi32(a, b) simde_mm256_add_epi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_hadd_epi32 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_hadd_epi32(a, b);
  #else
    return simde_mm256_add_epi32(simde_x_mm256_deinterleaveeven_epi32(a, b), simde_x_mm256_deinterleaveodd_epi32(a, b));
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_hadd_epi32
  #define _mm256_hadd_epi32(a, b) simde_mm256_hadd_epi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_add_epi64 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_add_epi64(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_add_epi64(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_add_epi64(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_CLANG_BAD_VI64_OPS)
      r_.i64 = a_.i64 + b_.i64;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
        r_.i64[i] = a_.i64[i] + b_.i64[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_add_epi64
  #define _mm256_add_epi64(a, b) simde_mm256_add_epi64(a, b)
#endif
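/* Illustrative usage of the wrapping add family (the variable names below are
 * examples only and do not appear elsewhere in this file):
 *
 *   simde__m256i a   = simde_mm256_set1_epi32(40);
 *   simde__m256i b   = simde_mm256_set1_epi32(2);
 *   simde__m256i sum = simde_mm256_add_epi32(a, b);   // eight lanes of 42
 *
 * simde_mm256_set1_epi32 comes from the AVX header included above.  These
 * adds use ordinary modular (wrapping) arithmetic; the saturating variants
 * are the adds_* functions further down. */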
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_alignr_epi8 (simde__m256i a, simde__m256i b, int count)
    SIMDE_REQUIRE_CONSTANT_RANGE(count, 0, 255) {
  simde__m256i_private
    r_,
    a_ = simde__m256i_to_private(a),
    b_ = simde__m256i_to_private(b);

  if (HEDLEY_UNLIKELY(count > 31))
    return simde_mm256_setzero_si256();

  for (size_t h = 0 ; h < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; h++) {
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.m128i_private[h].i8) / sizeof(r_.m128i_private[h].i8[0])) ; i++) {
      const int srcpos = count + HEDLEY_STATIC_CAST(int, i);
      if (srcpos > 31) {
        r_.m128i_private[h].i8[i] = 0;
      } else if (srcpos > 15) {
        r_.m128i_private[h].i8[i] = a_.m128i_private[h].i8[(srcpos) & 15];
      } else {
        r_.m128i_private[h].i8[i] = b_.m128i_private[h].i8[srcpos];
      }
    }
  }

  return simde__m256i_from_private(r_);
}
#if defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_BUG_PGI_30106)
# define simde_mm256_alignr_epi8(a, b, count) _mm256_alignr_epi8(a, b, count)
#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
# define simde_mm256_alignr_epi8(a, b, count) \
    simde_mm256_set_m128i( \
        simde_mm_alignr_epi8(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (count)), \
        simde_mm_alignr_epi8(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (count)))
#endif
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_alignr_epi8
  #define _mm256_alignr_epi8(a, b, count) simde_mm256_alignr_epi8(a, b, (count))
#endif
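/* simde_mm256_alignr_epi8 works per 128-bit lane: within each lane the 16
 * bytes of b sit below the 16 bytes of a and the combined value is shifted
 * right by `count` bytes.  For count in [0, 15] a lane is b[count..15]
 * followed by a[0..count-1]; counts of 16-31 pull only from a, and anything
 * above 31 yields zero (handled explicitly above). */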
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_and_si256 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_and_si256(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_and_si128(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_and_si128(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32f = a_.i32f & b_.i32f;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
        r_.i64[i] = a_.i64[i] & b_.i64[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_and_si256
  #define _mm256_and_si256(a, b) simde_mm256_and_si256(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_andnot_si256 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_andnot_si256(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_andnot_si128(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_andnot_si128(a_.m128i[1], b_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
        r_.i32f[i] = ~(a_.i32f[i]) & b_.i32f[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_andnot_si256
  #define _mm256_andnot_si256(a, b) simde_mm256_andnot_si256(a, b)
#endif
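/* Note the operand order of andnot: following the PANDN semantics it computes
 * (~a) & b, i.e. the *first* argument is the one that gets complemented. */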
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_adds_epi8 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_adds_epi8(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_adds_epi8(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_adds_epi8(a_.m128i[1], b_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        r_.i8[i] = simde_math_adds_i8(a_.i8[i], b_.i8[i]);
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_adds_epi8
  #define _mm256_adds_epi8(a, b) simde_mm256_adds_epi8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_adds_epi16 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_adds_epi16(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_adds_epi16(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_adds_epi16(a_.m128i[1], b_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = simde_math_adds_i16(a_.i16[i], b_.i16[i]);
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_adds_epi16
  #define _mm256_adds_epi16(a, b) simde_mm256_adds_epi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_hadds_epi16 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_hadds_epi16(a, b);
  #else
    return simde_mm256_adds_epi16(simde_x_mm256_deinterleaveeven_epi16(a, b), simde_x_mm256_deinterleaveodd_epi16(a, b));
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_hadds_epi16
  #define _mm256_hadds_epi16(a, b) simde_mm256_hadds_epi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_adds_epu8 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_adds_epu8(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_adds_epu8(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_adds_epu8(a_.m128i[1], b_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
        r_.u8[i] = simde_math_adds_u8(a_.u8[i], b_.u8[i]);
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_adds_epu8
  #define _mm256_adds_epu8(a, b) simde_mm256_adds_epu8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_adds_epu16 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_adds_epu16(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_adds_epu16(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_adds_epu16(a_.m128i[1], b_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
        r_.u16[i] = simde_math_adds_u16(a_.u16[i], b_.u16[i]);
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_adds_epu16
  #define _mm256_adds_epu16(a, b) simde_mm256_adds_epu16(a, b)
#endif
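/* The adds_* family saturates instead of wrapping: results are clamped to the
 * element type's range by the simde_math_adds_i8/i16/u8/u16 helpers used
 * above.  An illustrative example (names are examples only):
 *
 *   simde__m256i x = simde_mm256_set1_epi8(100);
 *   simde__m256i y = simde_mm256_adds_epi8(x, x);   // every lane clamps to 127
 */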
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_avg_epu8 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_avg_epu8(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
      r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1;
    }

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_avg_epu8
  #define _mm256_avg_epu8(a, b) simde_mm256_avg_epu8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_avg_epu16 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_avg_epu16(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
      r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1;
    }

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_avg_epu16
  #define _mm256_avg_epu16(a, b) simde_mm256_avg_epu16(a, b)
#endif
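/* avg_epu8/avg_epu16 compute the same rounding average as PAVGB/PAVGW,
 * (a + b + 1) >> 1, without intermediate overflow because the u8/u16 operands
 * are promoted to int before the addition (assuming the usual 32-bit int). */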
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_blend_epi32 (simde__m128i a, simde__m128i b, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) {
  simde__m128i_private
    r_,
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b);

  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
    r_.i32[i] = ((imm8 >> i) & 1) ? b_.i32[i] : a_.i32[i];
  }

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm_blend_epi32(a, b, imm8) _mm_blend_epi32(a, b, imm8)
#elif SIMDE_NATURAL_FLOAT_VECTOR_SIZE_LE(128)
# define simde_mm_blend_epi32(a, b, imm8) \
    simde_mm_castps_si128(simde_mm_blend_ps(simde_mm_castsi128_ps(a), simde_mm_castsi128_ps(b), (imm8)))
#endif
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm_blend_epi32
  #define _mm_blend_epi32(a, b, imm8) simde_mm_blend_epi32(a, b, imm8)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_blend_epi16 (simde__m256i a, simde__m256i b, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  simde__m256i_private
    r_,
    a_ = simde__m256i_to_private(a),
    b_ = simde__m256i_to_private(b);

  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
    r_.i16[i] = ((imm8 >> (i % 8)) & 1) ? b_.i16[i] : a_.i16[i];
  }

  return simde__m256i_from_private(r_);
}
#if defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_BUG_CLANG_REV_234560)
# define simde_mm256_blend_epi16(a, b, imm8) _mm256_castpd_si256(_mm256_blend_epi16(a, b, imm8))
#elif defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_blend_epi16(a, b, imm8) _mm256_blend_epi16(a, b, imm8)
#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
# define simde_mm256_blend_epi16(a, b, imm8) \
    simde_mm256_set_m128i( \
        simde_mm_blend_epi16(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (imm8)), \
        simde_mm_blend_epi16(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (imm8)))
#endif
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_blend_epi16
  #define _mm256_blend_epi16(a, b, imm8) simde_mm256_blend_epi16(a, b, imm8)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_blend_epi32 (simde__m256i a, simde__m256i b, const int imm8)
    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  simde__m256i_private
    r_,
    a_ = simde__m256i_to_private(a),
    b_ = simde__m256i_to_private(b);

  SIMDE_VECTORIZE
  for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
    r_.i32[i] = ((imm8 >> i) & 1) ? b_.i32[i] : a_.i32[i];
  }

  return simde__m256i_from_private(r_);
}
#if defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_blend_epi32(a, b, imm8) _mm256_blend_epi32(a, b, imm8)
#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
# define simde_mm256_blend_epi32(a, b, imm8) \
    simde_mm256_set_m128i( \
        simde_mm_blend_epi32(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (imm8) >> 4), \
        simde_mm_blend_epi32(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (imm8) & 0x0F))
#endif
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_blend_epi32
  #define _mm256_blend_epi32(a, b, imm8) simde_mm256_blend_epi32(a, b, imm8)
#endif
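/* The blend_* functions select per element from a compile-time immediate: bit
 * i of imm8 picks b (1) or a (0) for element i.  The 256-bit epi16 variant
 * reuses the same 8-bit mask for both 128-bit halves (hence the i % 8 above),
 * while the 128-bit fallback for blend_epi32 splits imm8 into its low nibble
 * for the low half and its high nibble for the high half. */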
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_blendv_epi8 (simde__m256i a, simde__m256i b, simde__m256i mask) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_blendv_epi8(a, b, mask);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b),
      mask_ = simde__m256i_to_private(mask);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_blendv_epi8(a_.m128i[0], b_.m128i[0], mask_.m128i[0]);
      r_.m128i[1] = simde_mm_blendv_epi8(a_.m128i[1], b_.m128i[1], mask_.m128i[1]);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
      __typeof__(mask_.i8) tmp = mask_.i8 >> 7;
      r_.i8 = (tmp & b_.i8) | (~tmp & a_.i8);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
        int8_t tmp = mask_.i8[i] >> 7;
        r_.i8[i] = (tmp & b_.i8[i]) | (~tmp & a_.i8[i]);
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_blendv_epi8(a, b, imm8) _mm256_blendv_epi8(a, b, imm8)
#endif
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_blendv_epi8
  #define _mm256_blendv_epi8(a, b, mask) simde_mm256_blendv_epi8(a, b, mask)
#endif
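/* blendv_epi8 selects per byte at run time and only the sign bit of each mask
 * byte matters: the arithmetic shift `mask_.i8 >> 7` expands that bit into an
 * all-ones or all-zeros byte, which then picks b or a respectively. */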
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_broadcastb_epi8 (simde__m128i a) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm_broadcastb_epi8(a);
  #else
    simde__m128i_private r_;
    simde__m128i_private a_ = simde__m128i_to_private(a);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
      r_.i8[i] = a_.i8[0];
    }

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm_broadcastb_epi8
  #define _mm_broadcastb_epi8(a) simde_mm_broadcastb_epi8(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_broadcastb_epi8 (simde__m128i a) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_broadcastb_epi8(a);
  #else
    simde__m256i_private r_;
    simde__m128i_private a_ = simde__m128i_to_private(a);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
      r_.i8[i] = a_.i8[0];
    }

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_broadcastb_epi8
  #define _mm256_broadcastb_epi8(a) simde_mm256_broadcastb_epi8(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_broadcastw_epi16 (simde__m128i a) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm_broadcastw_epi16(a);
  #else
    simde__m128i_private r_;
    simde__m128i_private a_ = simde__m128i_to_private(a);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      r_.i16[i] = a_.i16[0];
    }

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm_broadcastw_epi16
  #define _mm_broadcastw_epi16(a) simde_mm_broadcastw_epi16(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_broadcastw_epi16 (simde__m128i a) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_broadcastw_epi16(a);
  #else
    simde__m256i_private r_;
    simde__m128i_private a_ = simde__m128i_to_private(a);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
      r_.i16[i] = a_.i16[0];
    }

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_broadcastw_epi16
  #define _mm256_broadcastw_epi16(a) simde_mm256_broadcastw_epi16(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_broadcastd_epi32 (simde__m128i a) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm_broadcastd_epi32(a);
  #else
    simde__m128i_private r_;
    simde__m128i_private a_ = simde__m128i_to_private(a);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = a_.i32[0];
    }

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm_broadcastd_epi32
  #define _mm_broadcastd_epi32(a) simde_mm_broadcastd_epi32(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_broadcastd_epi32 (simde__m128i a) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_broadcastd_epi32(a);
  #else
    simde__m256i_private r_;
    simde__m128i_private a_ = simde__m128i_to_private(a);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
      r_.i32[i] = a_.i32[0];
    }

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_broadcastd_epi32
  #define _mm256_broadcastd_epi32(a) simde_mm256_broadcastd_epi32(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_broadcastq_epi64 (simde__m128i a) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm_broadcastq_epi64(a);
  #else
    simde__m128i_private r_;
    simde__m128i_private a_ = simde__m128i_to_private(a);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
      r_.i64[i] = a_.i64[0];
    }

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm_broadcastq_epi64
  #define _mm_broadcastq_epi64(a) simde_mm_broadcastq_epi64(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_broadcastq_epi64 (simde__m128i a) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_broadcastq_epi64(a);
  #else
    simde__m256i_private r_;
    simde__m128i_private a_ = simde__m128i_to_private(a);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
      r_.i64[i] = a_.i64[0];
    }

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_broadcastq_epi64
  #define _mm256_broadcastq_epi64(a) simde_mm256_broadcastq_epi64(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_broadcastss_ps (simde__m128 a) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm_broadcastss_ps(a);
  #elif defined(SIMDE_X86_SSE_NATIVE)
    return simde_mm_shuffle_ps(a, a, 0);
  #else
    simde__m128_private r_;
    simde__m128_private a_ = simde__m128_to_private(a);

    #if defined(SIMDE_SHUFFLE_VECTOR_)
      r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 0, 0);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
        r_.f32[i] = a_.f32[0];
      }
    #endif

    return simde__m128_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm_broadcastss_ps
  #define _mm_broadcastss_ps(a) simde_mm_broadcastss_ps(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_broadcastss_ps (simde__m128 a) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_broadcastss_ps(a);
  #else
    simde__m256_private r_;
    simde__m128_private a_ = simde__m128_to_private(a);

    #if defined(SIMDE_X86_AVX_NATIVE)
      __m128 tmp = _mm_permute_ps(a_.n, 0);
      r_.n = _mm256_insertf128_ps(_mm256_castps128_ps256(tmp), tmp, 1);
    #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector)
      r_.f32 = __builtin_shufflevector(a_.f32, a_.f32, 0, 0, 0, 0, 0, 0, 0, 0);
    #elif SIMDE_NATURAL_FLOAT_VECTOR_SIZE_LE(128)
      r_.m128[0] = r_.m128[1] = simde_mm_broadcastss_ps(simde__m128_from_private(a_));
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
        r_.f32[i] = a_.f32[0];
      }
    #endif

    return simde__m256_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_broadcastss_ps
  #define _mm256_broadcastss_ps(a) simde_mm256_broadcastss_ps(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_broadcastsd_pd (simde__m128d a) {
  return simde_mm_movedup_pd(a);
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm_broadcastsd_pd
  #define _mm_broadcastsd_pd(a) simde_mm_broadcastsd_pd(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_broadcastsd_pd (simde__m128d a) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_broadcastsd_pd(a);
  #else
    simde__m256d_private r_;
    simde__m128d_private a_ = simde__m128d_to_private(a);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
      r_.f64[i] = a_.f64[0];
    }

    return simde__m256d_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_broadcastsd_pd
  #define _mm256_broadcastsd_pd(a) simde_mm256_broadcastsd_pd(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_broadcastsi128_si256 (simde__m128i a) {
  #if defined(SIMDE_X86_AVX2_NATIVE) && \
      (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,8,0))
    return _mm256_broadcastsi128_si256(a);
  #else
    simde__m256i_private r_;
    simde__m128i_private a_ = simde__m128i_to_private(a);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i_private[0] = a_;
      r_.m128i_private[1] = a_;
    #else
      r_.i64[0] = a_.i64[0];
      r_.i64[1] = a_.i64[1];
      r_.i64[2] = a_.i64[0];
      r_.i64[3] = a_.i64[1];
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#define simde_mm_broadcastsi128_si256(a) simde_mm256_broadcastsi128_si256(a)
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_broadcastsi128_si256
  #define _mm256_broadcastsi128_si256(a) simde_mm256_broadcastsi128_si256(a)
  #undef _mm_broadcastsi128_si256
  #define _mm_broadcastsi128_si256(a) simde_mm256_broadcastsi128_si256(a)
#endif
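/* The broadcast* family replicates element 0 of the 128-bit source across
 * every element of the destination (byte, word, dword, qword, float or
 * double), and broadcastsi128_si256 duplicates the whole 128-bit value into
 * both halves of a 256-bit vector; that intrinsic is exposed under both the
 * _mm_ and _mm256_ names, hence the extra alias above. */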
  871. SIMDE_FUNCTION_ATTRIBUTES
  872. simde__m256i
  873. simde_mm256_bslli_epi128 (simde__m256i a, const int imm8)
  874. SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  875. simde__m256i_private
  876. r_,
  877. a_ = simde__m256i_to_private(a);
  878. const int ssize = HEDLEY_STATIC_CAST(int, (sizeof(r_.i8) / sizeof(r_.i8[0])));
  879. SIMDE_VECTORIZE
  880. for (int i = 0 ; i < ssize ; i++) {
  881. const int e = i - imm8;
  882. if(i >= (ssize/2)) {
  883. if(e >= (ssize/2) && e < ssize)
  884. r_.i8[i] = a_.i8[e];
  885. else
  886. r_.i8[i] = 0;
  887. }
  888. else{
  889. if(e >= 0 && e < (ssize/2))
  890. r_.i8[i] = a_.i8[e];
  891. else
  892. r_.i8[i] = 0;
  893. }
  894. }
  895. return simde__m256i_from_private(r_);
  896. }
  897. #if defined(SIMDE_X86_AVX2_NATIVE) && \
  898. (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,8,0)) && \
  899. SIMDE_DETECT_CLANG_VERSION_CHECK(3,7,0)
  900. #define simde_mm256_bslli_epi128(a, imm8) _mm256_bslli_epi128(a, imm8)
  901. #endif
  902. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  903. #undef _mm256_bslli_epi128
  904. #define _mm256_bslli_epi128(a, imm8) simde_mm256_bslli_epi128(a, imm8)
  905. #endif
  906. SIMDE_FUNCTION_ATTRIBUTES
  907. simde__m256i
  908. simde_mm256_bsrli_epi128 (simde__m256i a, const int imm8)
  909. SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  910. simde__m256i_private
  911. r_,
  912. a_ = simde__m256i_to_private(a);
  913. const int ssize = HEDLEY_STATIC_CAST(int, (sizeof(r_.i8) / sizeof(r_.i8[0])));
  914. SIMDE_VECTORIZE
  915. for (int i = 0 ; i < ssize ; i++) {
  916. const int e = i + imm8;
917. if (i < (ssize/2)) {
918. if (e >= 0 && e < (ssize/2))
919. r_.i8[i] = a_.i8[e];
920. else
921. r_.i8[i] = 0;
922. }
923. else {
924. if (e >= (ssize/2) && e < ssize)
925. r_.i8[i] = a_.i8[e];
926. else
927. r_.i8[i] = 0;
928. }
  929. }
  930. return simde__m256i_from_private(r_);
  931. }
  932. #if defined(SIMDE_X86_AVX2_NATIVE) && \
  933. (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,8,0)) && \
  934. SIMDE_DETECT_CLANG_VERSION_CHECK(3,7,0)
  935. #define simde_mm256_bsrli_epi128(a, imm8) _mm256_bsrli_epi128(a, imm8)
  936. #endif
  937. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  938. #undef _mm256_bsrli_epi128
  939. #define _mm256_bsrli_epi128(a, imm8) simde_mm256_bsrli_epi128(a, imm8)
  940. #endif
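/* Editorial note: the cmpeq_epi8/16/32/64 family compares lanes for equality
 * and produces all-ones (~0) in matching lanes and 0 otherwise, matching the
 * AVX2 semantics; on targets without 256-bit integer vectors the work is
 * split across the two 128-bit halves. */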
  941. SIMDE_FUNCTION_ATTRIBUTES
  942. simde__m256i
  943. simde_mm256_cmpeq_epi8 (simde__m256i a, simde__m256i b) {
  944. #if defined(SIMDE_X86_AVX2_NATIVE)
  945. return _mm256_cmpeq_epi8(a, b);
  946. #else
  947. simde__m256i_private
  948. r_,
  949. a_ = simde__m256i_to_private(a),
  950. b_ = simde__m256i_to_private(b);
  951. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  952. r_.m128i[0] = simde_mm_cmpeq_epi8(a_.m128i[0], b_.m128i[0]);
  953. r_.m128i[1] = simde_mm_cmpeq_epi8(a_.m128i[1], b_.m128i[1]);
  954. #else
  955. SIMDE_VECTORIZE
  956. for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
  957. r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
  958. }
  959. #endif
  960. return simde__m256i_from_private(r_);
  961. #endif
  962. }
  963. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  964. #undef _mm256_cmpeq_epi8
  965. #define _mm256_cmpeq_epi8(a, b) simde_mm256_cmpeq_epi8(a, b)
  966. #endif
  967. SIMDE_FUNCTION_ATTRIBUTES
  968. simde__m256i
  969. simde_mm256_cmpeq_epi16 (simde__m256i a, simde__m256i b) {
  970. #if defined(SIMDE_X86_AVX2_NATIVE)
  971. return _mm256_cmpeq_epi16(a, b);
  972. #else
  973. simde__m256i_private
  974. r_,
  975. a_ = simde__m256i_to_private(a),
  976. b_ = simde__m256i_to_private(b);
  977. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  978. r_.m128i[0] = simde_mm_cmpeq_epi16(a_.m128i[0], b_.m128i[0]);
  979. r_.m128i[1] = simde_mm_cmpeq_epi16(a_.m128i[1], b_.m128i[1]);
  980. #else
  981. SIMDE_VECTORIZE
  982. for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
  983. r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
  984. }
  985. #endif
  986. return simde__m256i_from_private(r_);
  987. #endif
  988. }
  989. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  990. #undef _mm256_cmpeq_epi16
  991. #define _mm256_cmpeq_epi16(a, b) simde_mm256_cmpeq_epi16(a, b)
  992. #endif
  993. SIMDE_FUNCTION_ATTRIBUTES
  994. simde__m256i
  995. simde_mm256_cmpeq_epi32 (simde__m256i a, simde__m256i b) {
  996. #if defined(SIMDE_X86_AVX2_NATIVE)
  997. return _mm256_cmpeq_epi32(a, b);
  998. #else
  999. simde__m256i_private
  1000. r_,
  1001. a_ = simde__m256i_to_private(a),
  1002. b_ = simde__m256i_to_private(b);
  1003. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  1004. r_.m128i[0] = simde_mm_cmpeq_epi32(a_.m128i[0], b_.m128i[0]);
  1005. r_.m128i[1] = simde_mm_cmpeq_epi32(a_.m128i[1], b_.m128i[1]);
  1006. #else
  1007. SIMDE_VECTORIZE
  1008. for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
  1009. r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
  1010. }
  1011. #endif
  1012. return simde__m256i_from_private(r_);
  1013. #endif
  1014. }
  1015. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1016. #undef _mm256_cmpeq_epi32
  1017. #define _mm256_cmpeq_epi32(a, b) simde_mm256_cmpeq_epi32(a, b)
  1018. #endif
  1019. SIMDE_FUNCTION_ATTRIBUTES
  1020. simde__m256i
  1021. simde_mm256_cmpeq_epi64 (simde__m256i a, simde__m256i b) {
  1022. #if defined(SIMDE_X86_AVX2_NATIVE)
  1023. return _mm256_cmpeq_epi64(a, b);
  1024. #else
  1025. simde__m256i_private
  1026. r_,
  1027. a_ = simde__m256i_to_private(a),
  1028. b_ = simde__m256i_to_private(b);
  1029. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  1030. r_.m128i[0] = simde_mm_cmpeq_epi64(a_.m128i[0], b_.m128i[0]);
  1031. r_.m128i[1] = simde_mm_cmpeq_epi64(a_.m128i[1], b_.m128i[1]);
  1032. #else
  1033. SIMDE_VECTORIZE
  1034. for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
  1035. r_.i64[i] = (a_.i64[i] == b_.i64[i]) ? ~INT64_C(0) : INT64_C(0);
  1036. }
  1037. #endif
  1038. return simde__m256i_from_private(r_);
  1039. #endif
  1040. }
  1041. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1042. #undef _mm256_cmpeq_epi64
  1043. #define _mm256_cmpeq_epi64(a, b) simde_mm256_cmpeq_epi64(a, b)
  1044. #endif
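/* Editorial note: the cmpgt_epi8/16/32/64 family performs *signed*
 * greater-than comparisons, again yielding ~0 or 0 per lane; where the
 * compiler supports vector subscripting the comparison is expressed directly
 * on the vector members. */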
  1045. SIMDE_FUNCTION_ATTRIBUTES
  1046. simde__m256i
  1047. simde_mm256_cmpgt_epi8 (simde__m256i a, simde__m256i b) {
  1048. #if defined(SIMDE_X86_AVX2_NATIVE)
  1049. return _mm256_cmpgt_epi8(a, b);
  1050. #else
  1051. simde__m256i_private
  1052. r_,
  1053. a_ = simde__m256i_to_private(a),
  1054. b_ = simde__m256i_to_private(b);
  1055. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  1056. r_.m128i[0] = simde_mm_cmpgt_epi8(a_.m128i[0], b_.m128i[0]);
  1057. r_.m128i[1] = simde_mm_cmpgt_epi8(a_.m128i[1], b_.m128i[1]);
  1058. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  1059. r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), a_.i8 > b_.i8);
  1060. #else
  1061. SIMDE_VECTORIZE
  1062. for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
  1063. r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
  1064. }
  1065. #endif
  1066. return simde__m256i_from_private(r_);
  1067. #endif
  1068. }
  1069. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1070. #undef _mm256_cmpgt_epi8
  1071. #define _mm256_cmpgt_epi8(a, b) simde_mm256_cmpgt_epi8(a, b)
  1072. #endif
  1073. SIMDE_FUNCTION_ATTRIBUTES
  1074. simde__m256i
  1075. simde_mm256_cmpgt_epi16 (simde__m256i a, simde__m256i b) {
  1076. #if defined(SIMDE_X86_AVX2_NATIVE)
  1077. return _mm256_cmpgt_epi16(a, b);
  1078. #else
  1079. simde__m256i_private
  1080. r_,
  1081. a_ = simde__m256i_to_private(a),
  1082. b_ = simde__m256i_to_private(b);
  1083. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  1084. r_.m128i[0] = simde_mm_cmpgt_epi16(a_.m128i[0], b_.m128i[0]);
  1085. r_.m128i[1] = simde_mm_cmpgt_epi16(a_.m128i[1], b_.m128i[1]);
  1086. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1087. r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), a_.i16 > b_.i16);
  1088. #else
  1089. SIMDE_VECTORIZE
  1090. for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
  1091. r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
  1092. }
  1093. #endif
  1094. return simde__m256i_from_private(r_);
  1095. #endif
  1096. }
  1097. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1098. #undef _mm256_cmpgt_epi16
  1099. #define _mm256_cmpgt_epi16(a, b) simde_mm256_cmpgt_epi16(a, b)
  1100. #endif
  1101. SIMDE_FUNCTION_ATTRIBUTES
  1102. simde__m256i
  1103. simde_mm256_cmpgt_epi32 (simde__m256i a, simde__m256i b) {
  1104. #if defined(SIMDE_X86_AVX2_NATIVE)
  1105. return _mm256_cmpgt_epi32(a, b);
  1106. #else
  1107. simde__m256i_private
  1108. r_,
  1109. a_ = simde__m256i_to_private(a),
  1110. b_ = simde__m256i_to_private(b);
  1111. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  1112. r_.m128i[0] = simde_mm_cmpgt_epi32(a_.m128i[0], b_.m128i[0]);
  1113. r_.m128i[1] = simde_mm_cmpgt_epi32(a_.m128i[1], b_.m128i[1]);
  1114. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  1115. r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.i32 > b_.i32);
  1116. #else
  1117. SIMDE_VECTORIZE
  1118. for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
  1119. r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
  1120. }
  1121. #endif
  1122. return simde__m256i_from_private(r_);
  1123. #endif
  1124. }
  1125. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1126. #undef _mm256_cmpgt_epi32
  1127. #define _mm256_cmpgt_epi32(a, b) simde_mm256_cmpgt_epi32(a, b)
  1128. #endif
  1129. SIMDE_FUNCTION_ATTRIBUTES
  1130. simde__m256i
  1131. simde_mm256_cmpgt_epi64 (simde__m256i a, simde__m256i b) {
  1132. #if defined(SIMDE_X86_AVX2_NATIVE)
  1133. return _mm256_cmpgt_epi64(a, b);
  1134. #else
  1135. simde__m256i_private
  1136. r_,
  1137. a_ = simde__m256i_to_private(a),
  1138. b_ = simde__m256i_to_private(b);
  1139. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  1140. r_.m128i[0] = simde_mm_cmpgt_epi64(a_.m128i[0], b_.m128i[0]);
  1141. r_.m128i[1] = simde_mm_cmpgt_epi64(a_.m128i[1], b_.m128i[1]);
  1142. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  1143. r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 > b_.i64);
  1144. #else
  1145. SIMDE_VECTORIZE
  1146. for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
  1147. r_.i64[i] = (a_.i64[i] > b_.i64[i]) ? ~INT64_C(0) : INT64_C(0);
  1148. }
  1149. #endif
  1150. return simde__m256i_from_private(r_);
  1151. #endif
  1152. }
  1153. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1154. #undef _mm256_cmpgt_epi64
  1155. #define _mm256_cmpgt_epi64(a, b) simde_mm256_cmpgt_epi64(a, b)
  1156. #endif
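/* Editorial note: the cvtepi* conversions below sign-extend the low elements
 * of the 128-bit input into wider lanes of the 256-bit result (all 16 bytes
 * for epi8->epi16, the low 8 bytes for epi8->epi32, and so on).  Hedged
 * sketch (illustrative values only):
 *
 *   simde__m128i b = simde_mm_set1_epi8(-1);
 *   simde__m256i w = simde_mm256_cvtepi8_epi16(b);  // every int16 lane == -1
 */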
  1157. SIMDE_FUNCTION_ATTRIBUTES
  1158. simde__m256i
  1159. simde_mm256_cvtepi8_epi16 (simde__m128i a) {
  1160. #if defined(SIMDE_X86_AVX2_NATIVE)
  1161. return _mm256_cvtepi8_epi16(a);
  1162. #else
  1163. simde__m256i_private r_;
  1164. simde__m128i_private a_ = simde__m128i_to_private(a);
  1165. #if defined(SIMDE_CONVERT_VECTOR_)
  1166. SIMDE_CONVERT_VECTOR_(r_.i16, a_.i8);
  1167. #else
  1168. SIMDE_VECTORIZE
  1169. for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
  1170. r_.i16[i] = a_.i8[i];
  1171. }
  1172. #endif
  1173. return simde__m256i_from_private(r_);
  1174. #endif
  1175. }
  1176. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1177. #undef _mm256_cvtepi8_epi16
  1178. #define _mm256_cvtepi8_epi16(a) simde_mm256_cvtepi8_epi16(a)
  1179. #endif
  1180. SIMDE_FUNCTION_ATTRIBUTES
  1181. simde__m256i
  1182. simde_mm256_cvtepi8_epi32 (simde__m128i a) {
  1183. #if defined(SIMDE_X86_AVX2_NATIVE)
  1184. return _mm256_cvtepi8_epi32(a);
  1185. #else
  1186. simde__m256i_private r_;
  1187. simde__m128i_private a_ = simde__m128i_to_private(a);
  1188. #if defined(SIMDE_CONVERT_VECTOR_)
  1189. SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].i8);
  1190. #else
  1191. SIMDE_VECTORIZE
  1192. for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
  1193. r_.i32[i] = a_.i8[i];
  1194. }
  1195. #endif
  1196. return simde__m256i_from_private(r_);
  1197. #endif
  1198. }
  1199. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1200. #undef _mm256_cvtepi8_epi32
  1201. #define _mm256_cvtepi8_epi32(a) simde_mm256_cvtepi8_epi32(a)
  1202. #endif
  1203. SIMDE_FUNCTION_ATTRIBUTES
  1204. simde__m256i
  1205. simde_mm256_cvtepi8_epi64 (simde__m128i a) {
  1206. #if defined(SIMDE_X86_AVX2_NATIVE)
  1207. return _mm256_cvtepi8_epi64(a);
  1208. #else
  1209. simde__m256i_private r_;
  1210. simde__m128i_private a_ = simde__m128i_to_private(a);
  1211. SIMDE_VECTORIZE
  1212. for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
  1213. r_.i64[i] = a_.i8[i];
  1214. }
  1215. return simde__m256i_from_private(r_);
  1216. #endif
  1217. }
  1218. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1219. #undef _mm256_cvtepi8_epi64
  1220. #define _mm256_cvtepi8_epi64(a) simde_mm256_cvtepi8_epi64(a)
  1221. #endif
  1222. SIMDE_FUNCTION_ATTRIBUTES
  1223. simde__m256i
  1224. simde_mm256_cvtepi16_epi32 (simde__m128i a) {
  1225. #if defined(SIMDE_X86_AVX2_NATIVE)
  1226. return _mm256_cvtepi16_epi32(a);
  1227. #else
  1228. simde__m256i_private r_;
  1229. simde__m128i_private a_ = simde__m128i_to_private(a);
  1230. #if defined(SIMDE_CONVERT_VECTOR_)
  1231. SIMDE_CONVERT_VECTOR_(r_.i32, a_.i16);
  1232. #else
  1233. SIMDE_VECTORIZE
  1234. for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
  1235. r_.i32[i] = a_.i16[i];
  1236. }
  1237. #endif
  1238. return simde__m256i_from_private(r_);
  1239. #endif
  1240. }
  1241. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1242. #undef _mm256_cvtepi16_epi32
  1243. #define _mm256_cvtepi16_epi32(a) simde_mm256_cvtepi16_epi32(a)
  1244. #endif
  1245. SIMDE_FUNCTION_ATTRIBUTES
  1246. simde__m256i
  1247. simde_mm256_cvtepi16_epi64 (simde__m128i a) {
  1248. #if defined(SIMDE_X86_AVX2_NATIVE)
  1249. return _mm256_cvtepi16_epi64(a);
  1250. #else
  1251. simde__m256i_private r_;
  1252. simde__m128i_private a_ = simde__m128i_to_private(a);
  1253. #if defined(SIMDE_CONVERT_VECTOR_)
  1254. SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].i16);
  1255. #else
  1256. SIMDE_VECTORIZE
  1257. for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
  1258. r_.i64[i] = a_.i16[i];
  1259. }
  1260. #endif
  1261. return simde__m256i_from_private(r_);
  1262. #endif
  1263. }
  1264. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1265. #undef _mm256_cvtepi16_epi64
  1266. #define _mm256_cvtepi16_epi64(a) simde_mm256_cvtepi16_epi64(a)
  1267. #endif
  1268. SIMDE_FUNCTION_ATTRIBUTES
  1269. simde__m256i
  1270. simde_mm256_cvtepi32_epi64 (simde__m128i a) {
  1271. #if defined(SIMDE_X86_AVX2_NATIVE)
  1272. return _mm256_cvtepi32_epi64(a);
  1273. #else
  1274. simde__m256i_private r_;
  1275. simde__m128i_private a_ = simde__m128i_to_private(a);
  1276. #if defined(SIMDE_CONVERT_VECTOR_)
  1277. SIMDE_CONVERT_VECTOR_(r_.i64, a_.i32);
  1278. #else
  1279. SIMDE_VECTORIZE
  1280. for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
  1281. r_.i64[i] = a_.i32[i];
  1282. }
  1283. #endif
  1284. return simde__m256i_from_private(r_);
  1285. #endif
  1286. }
  1287. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1288. #undef _mm256_cvtepi32_epi64
  1289. #define _mm256_cvtepi32_epi64(a) simde_mm256_cvtepi32_epi64(a)
  1290. #endif
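/* Editorial note: the cvtepu* conversions are the unsigned counterparts of
 * the block above: the low unsigned elements are zero-extended, so e.g. a
 * uint8_t of 0xFF becomes 255 (not -1) in the widened lane. */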
  1291. SIMDE_FUNCTION_ATTRIBUTES
  1292. simde__m256i
  1293. simde_mm256_cvtepu8_epi16 (simde__m128i a) {
  1294. #if defined(SIMDE_X86_AVX2_NATIVE)
  1295. return _mm256_cvtepu8_epi16(a);
  1296. #else
  1297. simde__m256i_private r_;
  1298. simde__m128i_private a_ = simde__m128i_to_private(a);
  1299. #if defined(SIMDE_CONVERT_VECTOR_)
  1300. SIMDE_CONVERT_VECTOR_(r_.i16, a_.u8);
  1301. #else
  1302. SIMDE_VECTORIZE
  1303. for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
  1304. r_.i16[i] = a_.u8[i];
  1305. }
  1306. #endif
  1307. return simde__m256i_from_private(r_);
  1308. #endif
  1309. }
  1310. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1311. #undef _mm256_cvtepu8_epi16
  1312. #define _mm256_cvtepu8_epi16(a) simde_mm256_cvtepu8_epi16(a)
  1313. #endif
  1314. SIMDE_FUNCTION_ATTRIBUTES
  1315. simde__m256i
  1316. simde_mm256_cvtepu8_epi32 (simde__m128i a) {
  1317. #if defined(SIMDE_X86_AVX2_NATIVE)
  1318. return _mm256_cvtepu8_epi32(a);
  1319. #else
  1320. simde__m256i_private r_;
  1321. simde__m128i_private a_ = simde__m128i_to_private(a);
  1322. #if defined(SIMDE_CONVERT_VECTOR_)
  1323. SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].u8);
  1324. #else
  1325. SIMDE_VECTORIZE
  1326. for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
  1327. r_.i32[i] = a_.u8[i];
  1328. }
  1329. #endif
  1330. return simde__m256i_from_private(r_);
  1331. #endif
  1332. }
  1333. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1334. #undef _mm256_cvtepu8_epi32
  1335. #define _mm256_cvtepu8_epi32(a) simde_mm256_cvtepu8_epi32(a)
  1336. #endif
  1337. SIMDE_FUNCTION_ATTRIBUTES
  1338. simde__m256i
  1339. simde_mm256_cvtepu8_epi64 (simde__m128i a) {
  1340. #if defined(SIMDE_X86_AVX2_NATIVE)
  1341. return _mm256_cvtepu8_epi64(a);
  1342. #else
  1343. simde__m256i_private r_;
  1344. simde__m128i_private a_ = simde__m128i_to_private(a);
  1345. SIMDE_VECTORIZE
  1346. for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
  1347. r_.i64[i] = a_.u8[i];
  1348. }
  1349. return simde__m256i_from_private(r_);
  1350. #endif
  1351. }
  1352. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1353. #undef _mm256_cvtepu8_epi64
  1354. #define _mm256_cvtepu8_epi64(a) simde_mm256_cvtepu8_epi64(a)
  1355. #endif
  1356. SIMDE_FUNCTION_ATTRIBUTES
  1357. simde__m256i
  1358. simde_mm256_cvtepu16_epi32 (simde__m128i a) {
  1359. #if defined(SIMDE_X86_AVX2_NATIVE)
  1360. return _mm256_cvtepu16_epi32(a);
  1361. #else
  1362. simde__m256i_private r_;
  1363. simde__m128i_private a_ = simde__m128i_to_private(a);
  1364. #if defined(SIMDE_CONVERT_VECTOR_)
  1365. SIMDE_CONVERT_VECTOR_(r_.i32, a_.u16);
  1366. #else
  1367. SIMDE_VECTORIZE
  1368. for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
  1369. r_.i32[i] = a_.u16[i];
  1370. }
  1371. #endif
  1372. return simde__m256i_from_private(r_);
  1373. #endif
  1374. }
  1375. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1376. #undef _mm256_cvtepu16_epi32
  1377. #define _mm256_cvtepu16_epi32(a) simde_mm256_cvtepu16_epi32(a)
  1378. #endif
  1379. SIMDE_FUNCTION_ATTRIBUTES
  1380. simde__m256i
  1381. simde_mm256_cvtepu16_epi64 (simde__m128i a) {
  1382. #if defined(SIMDE_X86_AVX2_NATIVE)
  1383. return _mm256_cvtepu16_epi64(a);
  1384. #else
  1385. simde__m256i_private r_;
  1386. simde__m128i_private a_ = simde__m128i_to_private(a);
  1387. #if defined(SIMDE_CONVERT_VECTOR_)
  1388. SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].u16);
  1389. #else
  1390. SIMDE_VECTORIZE
  1391. for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
  1392. r_.i64[i] = a_.u16[i];
  1393. }
  1394. #endif
  1395. return simde__m256i_from_private(r_);
  1396. #endif
  1397. }
  1398. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1399. #undef _mm256_cvtepu16_epi64
  1400. #define _mm256_cvtepu16_epi64(a) simde_mm256_cvtepu16_epi64(a)
  1401. #endif
  1402. SIMDE_FUNCTION_ATTRIBUTES
  1403. simde__m256i
  1404. simde_mm256_cvtepu32_epi64 (simde__m128i a) {
  1405. #if defined(SIMDE_X86_AVX2_NATIVE)
  1406. return _mm256_cvtepu32_epi64(a);
  1407. #else
  1408. simde__m256i_private r_;
  1409. simde__m128i_private a_ = simde__m128i_to_private(a);
  1410. #if defined(SIMDE_CONVERT_VECTOR_)
  1411. SIMDE_CONVERT_VECTOR_(r_.i64, a_.u32);
  1412. #else
  1413. SIMDE_VECTORIZE
  1414. for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
  1415. r_.i64[i] = a_.u32[i];
  1416. }
  1417. #endif
  1418. return simde__m256i_from_private(r_);
  1419. #endif
  1420. }
  1421. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1422. #undef _mm256_cvtepu32_epi64
  1423. #define _mm256_cvtepu32_epi64(a) simde_mm256_cvtepu32_epi64(a)
  1424. #endif
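/* Editorial note: the extract helpers below simply index into the private
 * representation (i8/i16 lanes, or one of the two m128i halves for
 * extracti128_si256); native builds forward to the corresponding intrinsic
 * where the compiler supports it. */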
  1425. SIMDE_FUNCTION_ATTRIBUTES
  1426. int
  1427. simde_mm256_extract_epi8 (simde__m256i a, const int index)
1428. SIMDE_REQUIRE_RANGE(index, 0, 31) {
  1429. simde__m256i_private a_ = simde__m256i_to_private(a);
  1430. return a_.i8[index];
  1431. }
  1432. #if defined(SIMDE_X86_AVX2_NATIVE) && \
  1433. (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0))
  1434. #define simde_mm256_extract_epi8(a, index) _mm256_extract_epi8(a, index)
  1435. #endif
  1436. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1437. #undef _mm256_extract_epi8
  1438. #define _mm256_extract_epi8(a, index) simde_mm256_extract_epi8(a, index)
  1439. #endif
  1440. SIMDE_FUNCTION_ATTRIBUTES
  1441. int
  1442. simde_mm256_extract_epi16 (simde__m256i a, const int index)
  1443. SIMDE_REQUIRE_RANGE(index, 0, 15) {
  1444. simde__m256i_private a_ = simde__m256i_to_private(a);
  1445. return a_.i16[index];
  1446. }
  1447. #if defined(SIMDE_X86_AVX2_NATIVE) && \
  1448. (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0))
  1449. #define simde_mm256_extract_epi16(a, index) _mm256_extract_epi16(a, index)
  1450. #endif
  1451. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1452. #undef _mm256_extract_epi16
  1453. #define _mm256_extract_epi16(a, index) simde_mm256_extract_epi16(a, index)
  1454. #endif
  1455. SIMDE_FUNCTION_ATTRIBUTES
  1456. simde__m128i
  1457. simde_mm256_extracti128_si256 (simde__m256i a, const int imm8)
  1458. SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) {
  1459. simde__m256i_private a_ = simde__m256i_to_private(a);
  1460. return a_.m128i[imm8];
  1461. }
  1462. #if defined(SIMDE_X86_AVX2_NATIVE)
1463. #define simde_mm256_extracti128_si256(a, imm8) _mm256_extracti128_si256(a, imm8)
  1464. #endif
  1465. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1466. #undef _mm256_extracti128_si256
  1467. #define _mm256_extracti128_si256(a, imm8) simde_mm256_extracti128_si256(a, imm8)
  1468. #endif
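/* Editorial note: the gather emulations that follow all use the same pattern:
 * each element address is computed as
 *
 *   base_addr_bytes + (size_t)vindex[i] * (size_t)scale
 *
 * and the element is read with simde_memcpy, which avoids alignment and
 * strict-aliasing problems.  A hedged usage sketch for the first one (table
 * contents are illustrative, not from the original source):
 *
 *   int32_t table[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
 *   simde__m128i idx = simde_mm_set_epi32(6, 4, 2, 0);  // element 0 is the low lane
 *   simde__m128i v   = simde_mm_i32gather_epi32(table, idx, 4);
 *   // v == { 10, 12, 14, 16 } with 10 in the lowest lane
 */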
  1469. SIMDE_FUNCTION_ATTRIBUTES
  1470. simde__m128i
  1471. simde_mm_i32gather_epi32(const int32_t* base_addr, simde__m128i vindex, const int32_t scale)
  1472. SIMDE_REQUIRE_CONSTANT(scale)
  1473. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  1474. simde__m128i_private
  1475. vindex_ = simde__m128i_to_private(vindex),
  1476. r_;
  1477. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  1478. SIMDE_VECTORIZE
  1479. for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
  1480. const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  1481. int32_t dst;
  1482. simde_memcpy(&dst, src, sizeof(dst));
  1483. r_.i32[i] = dst;
  1484. }
  1485. return simde__m128i_from_private(r_);
  1486. }
  1487. #if defined(SIMDE_X86_AVX2_NATIVE)
  1488. #define simde_mm_i32gather_epi32(base_addr, vindex, scale) _mm_i32gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, scale)
  1489. #endif
  1490. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1491. #undef _mm_i32gather_epi32
  1492. #define _mm_i32gather_epi32(base_addr, vindex, scale) simde_mm_i32gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, scale)
  1493. #endif
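/* Editorial note: the mask_* gather variants test the sign bit of each mask
 * lane; when it is set the element is gathered, otherwise the corresponding
 * lane of `src` is passed through unchanged, matching the AVX2 mask
 * convention. */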
  1494. SIMDE_FUNCTION_ATTRIBUTES
  1495. simde__m128i
  1496. simde_mm_mask_i32gather_epi32(simde__m128i src, const int32_t* base_addr, simde__m128i vindex, simde__m128i mask, const int32_t scale)
  1497. SIMDE_REQUIRE_CONSTANT(scale)
  1498. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  1499. simde__m128i_private
  1500. vindex_ = simde__m128i_to_private(vindex),
  1501. src_ = simde__m128i_to_private(src),
  1502. mask_ = simde__m128i_to_private(mask),
  1503. r_;
  1504. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  1505. SIMDE_VECTORIZE
  1506. for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
  1507. if ((mask_.i32[i] >> 31) & 1) {
  1508. const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  1509. int32_t dst;
  1510. simde_memcpy(&dst, src1, sizeof(dst));
  1511. r_.i32[i] = dst;
  1512. }
  1513. else {
  1514. r_.i32[i] = src_.i32[i];
  1515. }
  1516. }
  1517. return simde__m128i_from_private(r_);
  1518. }
  1519. #if defined(SIMDE_X86_AVX2_NATIVE)
  1520. #define simde_mm_mask_i32gather_epi32(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, mask, scale)
  1521. #endif
  1522. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1523. #undef _mm_mask_i32gather_epi32
  1524. #define _mm_mask_i32gather_epi32(src, base_addr, vindex, mask, scale) simde_mm_mask_i32gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, mask, scale)
  1525. #endif
  1526. SIMDE_FUNCTION_ATTRIBUTES
  1527. simde__m256i
  1528. simde_mm256_i32gather_epi32(const int32_t* base_addr, simde__m256i vindex, const int32_t scale)
  1529. SIMDE_REQUIRE_CONSTANT(scale)
  1530. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  1531. simde__m256i_private
  1532. vindex_ = simde__m256i_to_private(vindex),
  1533. r_;
  1534. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  1535. SIMDE_VECTORIZE
  1536. for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
  1537. const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  1538. int32_t dst;
  1539. simde_memcpy(&dst, src, sizeof(dst));
  1540. r_.i32[i] = dst;
  1541. }
  1542. return simde__m256i_from_private(r_);
  1543. }
  1544. #if defined(SIMDE_X86_AVX2_NATIVE)
  1545. #define simde_mm256_i32gather_epi32(base_addr, vindex, scale) _mm256_i32gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, scale)
  1546. #endif
  1547. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1548. #undef _mm256_i32gather_epi32
  1549. #define _mm256_i32gather_epi32(base_addr, vindex, scale) simde_mm256_i32gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, scale)
  1550. #endif
  1551. SIMDE_FUNCTION_ATTRIBUTES
  1552. simde__m256i
  1553. simde_mm256_mask_i32gather_epi32(simde__m256i src, const int32_t* base_addr, simde__m256i vindex, simde__m256i mask, const int32_t scale)
  1554. SIMDE_REQUIRE_CONSTANT(scale)
  1555. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  1556. simde__m256i_private
  1557. vindex_ = simde__m256i_to_private(vindex),
  1558. src_ = simde__m256i_to_private(src),
  1559. mask_ = simde__m256i_to_private(mask),
  1560. r_;
  1561. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  1562. SIMDE_VECTORIZE
  1563. for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
  1564. if ((mask_.i32[i] >> 31) & 1) {
  1565. const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  1566. int32_t dst;
  1567. simde_memcpy(&dst, src1, sizeof(dst));
  1568. r_.i32[i] = dst;
  1569. }
  1570. else {
  1571. r_.i32[i] = src_.i32[i];
  1572. }
  1573. }
  1574. return simde__m256i_from_private(r_);
  1575. }
  1576. #if defined(SIMDE_X86_AVX2_NATIVE)
  1577. #define simde_mm256_mask_i32gather_epi32(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, mask, scale)
  1578. #endif
  1579. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1580. #undef _mm256_mask_i32gather_epi32
  1581. #define _mm256_mask_i32gather_epi32(src, base_addr, vindex, mask, scale) simde_mm256_mask_i32gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, mask, scale)
  1582. #endif
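/* Editorial note: simde_mm_i64gather_epi32 indexes with two 64-bit offsets
 * but gathers 32-bit elements, so only the low two lanes of the result are
 * written; the destination is pre-initialised with simde_mm_setzero_si128()
 * so the upper lanes stay defined. */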
  1583. SIMDE_FUNCTION_ATTRIBUTES
  1584. simde__m128i
  1585. simde_mm_i64gather_epi32(const int32_t* base_addr, simde__m128i vindex, const int32_t scale)
  1586. SIMDE_REQUIRE_CONSTANT(scale)
  1587. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  1588. simde__m128i_private
  1589. vindex_ = simde__m128i_to_private(vindex),
  1590. r_ = simde__m128i_to_private(simde_mm_setzero_si128());
  1591. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  1592. SIMDE_VECTORIZE
  1593. for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
  1594. const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  1595. int32_t dst;
  1596. simde_memcpy(&dst, src, sizeof(dst));
  1597. r_.i32[i] = dst;
  1598. }
  1599. return simde__m128i_from_private(r_);
  1600. }
  1601. #if defined(SIMDE_X86_AVX2_NATIVE)
  1602. #define simde_mm_i64gather_epi32(base_addr, vindex, scale) _mm_i64gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, scale)
  1603. #endif
  1604. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1605. #undef _mm_i64gather_epi32
  1606. #define _mm_i64gather_epi32(base_addr, vindex, scale) simde_mm_i64gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, scale)
  1607. #endif
  1608. SIMDE_FUNCTION_ATTRIBUTES
  1609. simde__m128i
  1610. simde_mm_mask_i64gather_epi32(simde__m128i src, const int32_t* base_addr, simde__m128i vindex, simde__m128i mask, const int32_t scale)
  1611. SIMDE_REQUIRE_CONSTANT(scale)
  1612. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  1613. simde__m128i_private
  1614. vindex_ = simde__m128i_to_private(vindex),
  1615. src_ = simde__m128i_to_private(src),
  1616. mask_ = simde__m128i_to_private(mask),
  1617. r_ = simde__m128i_to_private(simde_mm_setzero_si128());
  1618. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  1619. SIMDE_VECTORIZE
  1620. for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
  1621. if ((mask_.i32[i] >> 31) & 1) {
  1622. const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  1623. int32_t dst;
  1624. simde_memcpy(&dst, src1, sizeof(dst));
  1625. r_.i32[i] = dst;
  1626. }
  1627. else {
  1628. r_.i32[i] = src_.i32[i];
  1629. }
  1630. }
  1631. return simde__m128i_from_private(r_);
  1632. }
  1633. #if defined(SIMDE_X86_AVX2_NATIVE)
  1634. #define simde_mm_mask_i64gather_epi32(src, base_addr, vindex, mask, scale) _mm_mask_i64gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, mask, scale)
  1635. #endif
  1636. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1637. #undef _mm_mask_i64gather_epi32
  1638. #define _mm_mask_i64gather_epi32(src, base_addr, vindex, mask, scale) simde_mm_mask_i64gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, mask, scale)
  1639. #endif
  1640. SIMDE_FUNCTION_ATTRIBUTES
  1641. simde__m128i
  1642. simde_mm256_i64gather_epi32(const int32_t* base_addr, simde__m256i vindex, const int32_t scale)
  1643. SIMDE_REQUIRE_CONSTANT(scale)
  1644. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  1645. simde__m256i_private
  1646. vindex_ = simde__m256i_to_private(vindex);
  1647. simde__m128i_private
  1648. r_ = simde__m128i_to_private(simde_mm_setzero_si128());
  1649. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  1650. SIMDE_VECTORIZE
  1651. for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
  1652. const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  1653. int32_t dst;
  1654. simde_memcpy(&dst, src, sizeof(dst));
  1655. r_.i32[i] = dst;
  1656. }
  1657. return simde__m128i_from_private(r_);
  1658. }
  1659. #if defined(SIMDE_X86_AVX2_NATIVE)
  1660. #define simde_mm256_i64gather_epi32(base_addr, vindex, scale) _mm256_i64gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, scale)
  1661. #endif
  1662. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1663. #undef _mm256_i64gather_epi32
  1664. #define _mm256_i64gather_epi32(base_addr, vindex, scale) simde_mm256_i64gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, scale)
  1665. #endif
  1666. SIMDE_FUNCTION_ATTRIBUTES
  1667. simde__m128i
  1668. simde_mm256_mask_i64gather_epi32(simde__m128i src, const int32_t* base_addr, simde__m256i vindex, simde__m128i mask, const int32_t scale)
  1669. SIMDE_REQUIRE_CONSTANT(scale)
  1670. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  1671. simde__m256i_private
  1672. vindex_ = simde__m256i_to_private(vindex);
  1673. simde__m128i_private
  1674. src_ = simde__m128i_to_private(src),
  1675. mask_ = simde__m128i_to_private(mask),
  1676. r_ = simde__m128i_to_private(simde_mm_setzero_si128());
  1677. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  1678. SIMDE_VECTORIZE
  1679. for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
  1680. if ((mask_.i32[i] >> 31) & 1) {
  1681. const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  1682. int32_t dst;
  1683. simde_memcpy(&dst, src1, sizeof(dst));
  1684. r_.i32[i] = dst;
  1685. }
  1686. else {
  1687. r_.i32[i] = src_.i32[i];
  1688. }
  1689. }
  1690. return simde__m128i_from_private(r_);
  1691. }
  1692. #if defined(SIMDE_X86_AVX2_NATIVE)
  1693. #define simde_mm256_mask_i64gather_epi32(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, mask, scale)
  1694. #endif
  1695. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1696. #undef _mm256_mask_i64gather_epi32
  1697. #define _mm256_mask_i64gather_epi32(src, base_addr, vindex, mask, scale) simde_mm256_mask_i64gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, mask, scale)
  1698. #endif
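/* Editorial note: the 64-bit gathers below follow the same addressing scheme
 * but load int64_t elements.  The native wrappers cast the base pointer to
 * `long long const*` except on clang >= 3.8, whose prototypes take
 * `int64_t const*`; the emulated path is unaffected by this difference. */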
  1699. SIMDE_FUNCTION_ATTRIBUTES
  1700. simde__m128i
  1701. simde_mm_i32gather_epi64(const int64_t* base_addr, simde__m128i vindex, const int32_t scale)
  1702. SIMDE_REQUIRE_CONSTANT(scale)
  1703. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  1704. simde__m128i_private
  1705. vindex_ = simde__m128i_to_private(vindex),
  1706. r_;
  1707. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  1708. SIMDE_VECTORIZE
  1709. for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
  1710. const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  1711. int64_t dst;
  1712. simde_memcpy(&dst, src, sizeof(dst));
  1713. r_.i64[i] = dst;
  1714. }
  1715. return simde__m128i_from_private(r_);
  1716. }
  1717. #if defined(SIMDE_X86_AVX2_NATIVE)
  1718. #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
  1719. #define simde_mm_i32gather_epi64(base_addr, vindex, scale) _mm_i32gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale)
  1720. #else
  1721. #define simde_mm_i32gather_epi64(base_addr, vindex, scale) _mm_i32gather_epi64(HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, scale)
  1722. #endif
  1723. #endif
  1724. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1725. #undef _mm_i32gather_epi64
  1726. #define _mm_i32gather_epi64(base_addr, vindex, scale) simde_mm_i32gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale)
  1727. #endif
  1728. SIMDE_FUNCTION_ATTRIBUTES
  1729. simde__m128i
  1730. simde_mm_mask_i32gather_epi64(simde__m128i src, const int64_t* base_addr, simde__m128i vindex, simde__m128i mask, const int32_t scale)
  1731. SIMDE_REQUIRE_CONSTANT(scale)
  1732. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  1733. simde__m128i_private
  1734. vindex_ = simde__m128i_to_private(vindex),
  1735. src_ = simde__m128i_to_private(src),
  1736. mask_ = simde__m128i_to_private(mask),
  1737. r_;
  1738. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  1739. SIMDE_VECTORIZE
  1740. for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
  1741. if ((mask_.i64[i] >> 63) & 1) {
  1742. const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  1743. int64_t dst;
  1744. simde_memcpy(&dst, src1, sizeof(dst));
  1745. r_.i64[i] = dst;
  1746. }
  1747. else {
  1748. r_.i64[i] = src_.i64[i];
  1749. }
  1750. }
  1751. return simde__m128i_from_private(r_);
  1752. }
  1753. #if defined(SIMDE_X86_AVX2_NATIVE)
  1754. #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
  1755. #define simde_mm_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale)
  1756. #else
  1757. #define simde_mm_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, mask, scale)
  1758. #endif
  1759. #endif
  1760. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1761. #undef _mm_mask_i32gather_epi64
  1762. #define _mm_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) simde_mm_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale)
  1763. #endif
  1764. SIMDE_FUNCTION_ATTRIBUTES
  1765. simde__m256i
  1766. simde_mm256_i32gather_epi64(const int64_t* base_addr, simde__m128i vindex, const int32_t scale)
  1767. SIMDE_REQUIRE_CONSTANT(scale)
  1768. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  1769. simde__m128i_private
  1770. vindex_ = simde__m128i_to_private(vindex);
  1771. simde__m256i_private
  1772. r_;
  1773. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  1774. SIMDE_VECTORIZE
  1775. for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
  1776. const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  1777. int64_t dst;
  1778. simde_memcpy(&dst, src, sizeof(dst));
  1779. r_.i64[i] = dst;
  1780. }
  1781. return simde__m256i_from_private(r_);
  1782. }
  1783. #if defined(SIMDE_X86_AVX2_NATIVE)
  1784. #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
  1785. #define simde_mm256_i32gather_epi64(base_addr, vindex, scale) _mm256_i32gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale)
  1786. #else
  1787. #define simde_mm256_i32gather_epi64(base_addr, vindex, scale) _mm256_i32gather_epi64(HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, scale)
  1788. #endif
  1789. #endif
  1790. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1791. #undef _mm256_i32gather_epi64
  1792. #define _mm256_i32gather_epi64(base_addr, vindex, scale) simde_mm256_i32gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale)
  1793. #endif
  1794. SIMDE_FUNCTION_ATTRIBUTES
  1795. simde__m256i
  1796. simde_mm256_mask_i32gather_epi64(simde__m256i src, const int64_t* base_addr, simde__m128i vindex, simde__m256i mask, const int32_t scale)
  1797. SIMDE_REQUIRE_CONSTANT(scale)
  1798. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  1799. simde__m256i_private
  1800. src_ = simde__m256i_to_private(src),
  1801. mask_ = simde__m256i_to_private(mask),
  1802. r_;
  1803. simde__m128i_private
  1804. vindex_ = simde__m128i_to_private(vindex);
  1805. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  1806. SIMDE_VECTORIZE
  1807. for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
  1808. if ((mask_.i64[i] >> 63) & 1) {
  1809. const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  1810. int64_t dst;
  1811. simde_memcpy(&dst, src1, sizeof(dst));
  1812. r_.i64[i] = dst;
  1813. }
  1814. else {
  1815. r_.i64[i] = src_.i64[i];
  1816. }
  1817. }
  1818. return simde__m256i_from_private(r_);
  1819. }
  1820. #if defined(SIMDE_X86_AVX2_NATIVE)
  1821. #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
  1822. #define simde_mm256_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale)
  1823. #else
  1824. #define simde_mm256_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, mask, scale)
  1825. #endif
  1826. #endif
  1827. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1828. #undef _mm256_mask_i32gather_epi64
  1829. #define _mm256_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) simde_mm256_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale)
  1830. #endif
  1831. SIMDE_FUNCTION_ATTRIBUTES
  1832. simde__m128i
  1833. simde_mm_i64gather_epi64(const int64_t* base_addr, simde__m128i vindex, const int32_t scale)
  1834. SIMDE_REQUIRE_CONSTANT(scale)
  1835. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  1836. simde__m128i_private
  1837. vindex_ = simde__m128i_to_private(vindex),
  1838. r_ = simde__m128i_to_private(simde_mm_setzero_si128());
  1839. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  1840. SIMDE_VECTORIZE
  1841. for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
  1842. const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  1843. int64_t dst;
  1844. simde_memcpy(&dst, src, sizeof(dst));
  1845. r_.i64[i] = dst;
  1846. }
  1847. return simde__m128i_from_private(r_);
  1848. }
  1849. #if defined(SIMDE_X86_AVX2_NATIVE)
  1850. #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
  1851. #define simde_mm_i64gather_epi64(base_addr, vindex, scale) _mm_i64gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale)
  1852. #else
  1853. #define simde_mm_i64gather_epi64(base_addr, vindex, scale) _mm_i64gather_epi64(HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, scale)
  1854. #endif
  1855. #endif
  1856. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1857. #undef _mm_i64gather_epi64
  1858. #define _mm_i64gather_epi64(base_addr, vindex, scale) simde_mm_i64gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale)
  1859. #endif
  1860. SIMDE_FUNCTION_ATTRIBUTES
  1861. simde__m128i
  1862. simde_mm_mask_i64gather_epi64(simde__m128i src, const int64_t* base_addr, simde__m128i vindex, simde__m128i mask, const int32_t scale)
  1863. SIMDE_REQUIRE_CONSTANT(scale)
  1864. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  1865. simde__m128i_private
  1866. vindex_ = simde__m128i_to_private(vindex),
  1867. src_ = simde__m128i_to_private(src),
  1868. mask_ = simde__m128i_to_private(mask),
  1869. r_ = simde__m128i_to_private(simde_mm_setzero_si128());
  1870. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  1871. SIMDE_VECTORIZE
  1872. for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
  1873. if ((mask_.i64[i] >> 63) & 1) {
  1874. const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  1875. int64_t dst;
  1876. simde_memcpy(&dst, src1, sizeof(dst));
  1877. r_.i64[i] = dst;
  1878. }
  1879. else {
  1880. r_.i64[i] = src_.i64[i];
  1881. }
  1882. }
  1883. return simde__m128i_from_private(r_);
  1884. }
  1885. #if defined(SIMDE_X86_AVX2_NATIVE)
  1886. #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
  1887. #define simde_mm_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) _mm_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale)
  1888. #else
  1889. #define simde_mm_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) _mm_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, mask, scale)
  1890. #endif
  1891. #endif
  1892. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1893. #undef _mm_mask_i64gather_epi64
  1894. #define _mm_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) simde_mm_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale)
  1895. #endif
  1896. SIMDE_FUNCTION_ATTRIBUTES
  1897. simde__m256i
  1898. simde_mm256_i64gather_epi64(const int64_t* base_addr, simde__m256i vindex, const int32_t scale)
  1899. SIMDE_REQUIRE_CONSTANT(scale)
  1900. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  1901. simde__m256i_private
  1902. vindex_ = simde__m256i_to_private(vindex),
  1903. r_ = simde__m256i_to_private(simde_mm256_setzero_si256());
  1904. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  1905. SIMDE_VECTORIZE
  1906. for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
  1907. const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  1908. int64_t dst;
  1909. simde_memcpy(&dst, src, sizeof(dst));
  1910. r_.i64[i] = dst;
  1911. }
  1912. return simde__m256i_from_private(r_);
  1913. }
  1914. #if defined(SIMDE_X86_AVX2_NATIVE)
  1915. #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
  1916. #define simde_mm256_i64gather_epi64(base_addr, vindex, scale) _mm256_i64gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale)
  1917. #else
  1918. #define simde_mm256_i64gather_epi64(base_addr, vindex, scale) _mm256_i64gather_epi64(HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, scale)
  1919. #endif
  1920. #endif
  1921. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1922. #undef _mm256_i64gather_epi64
  1923. #define _mm256_i64gather_epi64(base_addr, vindex, scale) simde_mm256_i64gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale)
  1924. #endif
  1925. SIMDE_FUNCTION_ATTRIBUTES
  1926. simde__m256i
  1927. simde_mm256_mask_i64gather_epi64(simde__m256i src, const int64_t* base_addr, simde__m256i vindex, simde__m256i mask, const int32_t scale)
  1928. SIMDE_REQUIRE_CONSTANT(scale)
  1929. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  1930. simde__m256i_private
  1931. vindex_ = simde__m256i_to_private(vindex),
  1932. src_ = simde__m256i_to_private(src),
  1933. mask_ = simde__m256i_to_private(mask),
  1934. r_ = simde__m256i_to_private(simde_mm256_setzero_si256());
  1935. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  1936. SIMDE_VECTORIZE
  1937. for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
  1938. if ((mask_.i64[i] >> 63) & 1) {
  1939. const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  1940. int64_t dst;
  1941. simde_memcpy(&dst, src1, sizeof(dst));
  1942. r_.i64[i] = dst;
  1943. }
  1944. else {
  1945. r_.i64[i] = src_.i64[i];
  1946. }
  1947. }
  1948. return simde__m256i_from_private(r_);
  1949. }
  1950. #if defined(SIMDE_X86_AVX2_NATIVE)
  1951. #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
  1952. #define simde_mm256_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale)
  1953. #else
  1954. #define simde_mm256_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, mask, scale)
  1955. #endif
  1956. #endif
  1957. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1958. #undef _mm256_mask_i64gather_epi64
  1959. #define _mm256_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) simde_mm256_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale)
  1960. #endif
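/* Editorial note: the *_ps gathers mirror the integer versions but load
 * simde_float32 elements and, in the masked forms, test the sign bit of the
 * i32 view of the float mask.  Hedged sketch (values are illustrative only):
 *
 *   simde_float32 table[4] = { 0.5f, 1.5f, 2.5f, 3.5f };
 *   simde__m128i  idx      = simde_mm_set_epi32(3, 2, 1, 0);
 *   simde__m128   v        = simde_mm_i32gather_ps(table, idx, 4);
 *   // v == { 0.5f, 1.5f, 2.5f, 3.5f }
 */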
  1961. SIMDE_FUNCTION_ATTRIBUTES
  1962. simde__m128
  1963. simde_mm_i32gather_ps(const simde_float32* base_addr, simde__m128i vindex, const int32_t scale)
  1964. SIMDE_REQUIRE_CONSTANT(scale)
  1965. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  1966. simde__m128i_private
  1967. vindex_ = simde__m128i_to_private(vindex);
  1968. simde__m128_private
  1969. r_;
  1970. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  1971. SIMDE_VECTORIZE
  1972. for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
  1973. const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  1974. simde_float32 dst;
  1975. simde_memcpy(&dst, src, sizeof(dst));
  1976. r_.f32[i] = dst;
  1977. }
  1978. return simde__m128_from_private(r_);
  1979. }
  1980. #if defined(SIMDE_X86_AVX2_NATIVE)
  1981. #define simde_mm_i32gather_ps(base_addr, vindex, scale) _mm_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, scale)
  1982. #endif
  1983. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  1984. #undef _mm_i32gather_ps
  1985. #define _mm_i32gather_ps(base_addr, vindex, scale) simde_mm_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, scale)
  1986. #endif
  1987. SIMDE_FUNCTION_ATTRIBUTES
  1988. simde__m128
  1989. simde_mm_mask_i32gather_ps(simde__m128 src, const simde_float32* base_addr, simde__m128i vindex, simde__m128 mask, const int32_t scale)
  1990. SIMDE_REQUIRE_CONSTANT(scale)
  1991. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  1992. simde__m128i_private
  1993. vindex_ = simde__m128i_to_private(vindex);
  1994. simde__m128_private
  1995. src_ = simde__m128_to_private(src),
  1996. mask_ = simde__m128_to_private(mask),
  1997. r_;
  1998. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  1999. SIMDE_VECTORIZE
  2000. for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
  2001. if ((mask_.i32[i] >> 31) & 1) {
  2002. const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  2003. simde_float32 dst;
  2004. simde_memcpy(&dst, src1, sizeof(dst));
  2005. r_.f32[i] = dst;
  2006. }
  2007. else {
  2008. r_.f32[i] = src_.f32[i];
  2009. }
  2010. }
  2011. return simde__m128_from_private(r_);
  2012. }
  2013. #if defined(SIMDE_X86_AVX2_NATIVE)
  2014. #define simde_mm_mask_i32gather_ps(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, mask, scale)
  2015. #endif
  2016. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2017. #undef _mm_mask_i32gather_ps
  2018. #define _mm_mask_i32gather_ps(src, base_addr, vindex, mask, scale) simde_mm_mask_i32gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, mask, scale)
  2019. #endif
  2020. SIMDE_FUNCTION_ATTRIBUTES
  2021. simde__m256
  2022. simde_mm256_i32gather_ps(const simde_float32* base_addr, simde__m256i vindex, const int32_t scale)
  2023. SIMDE_REQUIRE_CONSTANT(scale)
  2024. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  2025. simde__m256i_private
  2026. vindex_ = simde__m256i_to_private(vindex);
  2027. simde__m256_private
  2028. r_;
  2029. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  2030. SIMDE_VECTORIZE
  2031. for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
  2032. const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  2033. simde_float32 dst;
  2034. simde_memcpy(&dst, src, sizeof(dst));
  2035. r_.f32[i] = dst;
  2036. }
  2037. return simde__m256_from_private(r_);
  2038. }
  2039. #if defined(SIMDE_X86_AVX2_NATIVE)
  2040. #define simde_mm256_i32gather_ps(base_addr, vindex, scale) _mm256_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, (base_addr)), (vindex), (scale))
  2041. #endif
  2042. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2043. #undef _mm256_i32gather_ps
  2044. #define _mm256_i32gather_ps(base_addr, vindex, scale) simde_mm256_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, (base_addr)), (vindex), (scale))
  2045. #endif
  2046. SIMDE_FUNCTION_ATTRIBUTES
  2047. simde__m256
  2048. simde_mm256_mask_i32gather_ps(simde__m256 src, const simde_float32* base_addr, simde__m256i vindex, simde__m256 mask, const int32_t scale)
  2049. SIMDE_REQUIRE_CONSTANT(scale)
  2050. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  2051. simde__m256i_private
  2052. vindex_ = simde__m256i_to_private(vindex);
  2053. simde__m256_private
  2054. src_ = simde__m256_to_private(src),
  2055. mask_ = simde__m256_to_private(mask),
  2056. r_;
  2057. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  2058. SIMDE_VECTORIZE
  2059. for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
  2060. if ((mask_.i32[i] >> 31) & 1) {
  2061. const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  2062. simde_float32 dst;
  2063. simde_memcpy(&dst, src1, sizeof(dst));
  2064. r_.f32[i] = dst;
  2065. }
  2066. else {
  2067. r_.f32[i] = src_.f32[i];
  2068. }
  2069. }
  2070. return simde__m256_from_private(r_);
  2071. }
  2072. #if defined(SIMDE_X86_AVX2_NATIVE)
  2073. #define simde_mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, mask, scale)
  2074. #endif
  2075. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2076. #undef _mm256_mask_i32gather_ps
  2077. #define _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale) simde_mm256_mask_i32gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, mask, scale)
  2078. #endif
  2079. SIMDE_FUNCTION_ATTRIBUTES
  2080. simde__m128
  2081. simde_mm_i64gather_ps(const simde_float32* base_addr, simde__m128i vindex, const int32_t scale)
  2082. SIMDE_REQUIRE_CONSTANT(scale)
  2083. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  2084. simde__m128i_private
  2085. vindex_ = simde__m128i_to_private(vindex);
  2086. simde__m128_private
  2087. r_ = simde__m128_to_private(simde_mm_setzero_ps());
  2088. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  2089. SIMDE_VECTORIZE
  2090. for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
  2091. const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  2092. simde_float32 dst;
  2093. simde_memcpy(&dst, src, sizeof(dst));
  2094. r_.f32[i] = dst;
  2095. }
  2096. return simde__m128_from_private(r_);
  2097. }
  2098. #if defined(SIMDE_X86_AVX2_NATIVE)
  2099. #define simde_mm_i64gather_ps(base_addr, vindex, scale) _mm_i64gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, scale)
  2100. #endif
  2101. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2102. #undef _mm_i64gather_ps
  2103. #define _mm_i64gather_ps(base_addr, vindex, scale) simde_mm_i64gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, scale)
  2104. #endif
  2105. SIMDE_FUNCTION_ATTRIBUTES
  2106. simde__m128
  2107. simde_mm_mask_i64gather_ps(simde__m128 src, const simde_float32* base_addr, simde__m128i vindex, simde__m128 mask, const int32_t scale)
  2108. SIMDE_REQUIRE_CONSTANT(scale)
  2109. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  2110. simde__m128i_private
  2111. vindex_ = simde__m128i_to_private(vindex);
  2112. simde__m128_private
  2113. src_ = simde__m128_to_private(src),
  2114. mask_ = simde__m128_to_private(mask),
  2115. r_ = simde__m128_to_private(simde_mm_setzero_ps());
  2116. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  2117. SIMDE_VECTORIZE
  2118. for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
  2119. if ((mask_.i32[i] >> 31) & 1) {
  2120. const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  2121. simde_float32 dst;
  2122. simde_memcpy(&dst, src1, sizeof(dst));
  2123. r_.f32[i] = dst;
  2124. }
  2125. else {
  2126. r_.f32[i] = src_.f32[i];
  2127. }
  2128. }
  2129. return simde__m128_from_private(r_);
  2130. }
  2131. #if defined(SIMDE_X86_AVX2_NATIVE)
2132. #define simde_mm_mask_i64gather_ps(src, base_addr, vindex, mask, scale) _mm_mask_i64gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, mask, scale)
  2133. #endif
  2134. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2135. #undef _mm_mask_i64gather_ps
  2136. #define _mm_mask_i64gather_ps(src, base_addr, vindex, mask, scale) simde_mm_mask_i64gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, mask, scale)
  2137. #endif
  2138. SIMDE_FUNCTION_ATTRIBUTES
  2139. simde__m128
  2140. simde_mm256_i64gather_ps(const simde_float32* base_addr, simde__m256i vindex, const int32_t scale)
  2141. SIMDE_REQUIRE_CONSTANT(scale)
  2142. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  2143. simde__m256i_private
  2144. vindex_ = simde__m256i_to_private(vindex);
  2145. simde__m128_private
  2146. r_ = simde__m128_to_private(simde_mm_setzero_ps());
  2147. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  2148. SIMDE_VECTORIZE
  2149. for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
  2150. const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  2151. simde_float32 dst;
  2152. simde_memcpy(&dst, src, sizeof(dst));
  2153. r_.f32[i] = dst;
  2154. }
  2155. return simde__m128_from_private(r_);
  2156. }
  2157. #if defined(SIMDE_X86_AVX2_NATIVE)
  2158. #define simde_mm256_i64gather_ps(base_addr, vindex, scale) _mm256_i64gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, scale)
  2159. #endif
  2160. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2161. #undef _mm256_i64gather_ps
  2162. #define _mm256_i64gather_ps(base_addr, vindex, scale) simde_mm256_i64gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, scale)
  2163. #endif
  2164. SIMDE_FUNCTION_ATTRIBUTES
  2165. simde__m128
  2166. simde_mm256_mask_i64gather_ps(simde__m128 src, const simde_float32* base_addr, simde__m256i vindex, simde__m128 mask, const int32_t scale)
  2167. SIMDE_REQUIRE_CONSTANT(scale)
  2168. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  2169. simde__m256i_private
  2170. vindex_ = simde__m256i_to_private(vindex);
  2171. simde__m128_private
  2172. src_ = simde__m128_to_private(src),
  2173. mask_ = simde__m128_to_private(mask),
  2174. r_ = simde__m128_to_private(simde_mm_setzero_ps());
  2175. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  2176. SIMDE_VECTORIZE
  2177. for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
  2178. if ((mask_.i32[i] >> 31) & 1) {
  2179. const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  2180. simde_float32 dst;
  2181. simde_memcpy(&dst, src1, sizeof(dst));
  2182. r_.f32[i] = dst;
  2183. }
  2184. else {
  2185. r_.f32[i] = src_.f32[i];
  2186. }
  2187. }
  2188. return simde__m128_from_private(r_);
  2189. }
  2190. #if defined(SIMDE_X86_AVX2_NATIVE)
  2191. #define simde_mm256_mask_i64gather_ps(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, mask, scale)
  2192. #endif
  2193. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2194. #undef _mm256_mask_i64gather_ps
  2195. #define _mm256_mask_i64gather_ps(src, base_addr, vindex, mask, scale) simde_mm256_mask_i64gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, mask, scale)
  2196. #endif
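/* The double-precision gathers below mirror the single-precision ones:
 * 32- or 64-bit indices select 64-bit elements, and the masked forms test
 * the sign bit of each 64-bit mask lane (mask_.i64[i] >> 63). */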
  2197. SIMDE_FUNCTION_ATTRIBUTES
  2198. simde__m128d
  2199. simde_mm_i32gather_pd(const simde_float64* base_addr, simde__m128i vindex, const int32_t scale)
  2200. SIMDE_REQUIRE_CONSTANT(scale)
  2201. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  2202. simde__m128i_private
  2203. vindex_ = simde__m128i_to_private(vindex);
  2204. simde__m128d_private
  2205. r_;
  2206. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  2207. SIMDE_VECTORIZE
  2208. for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
  2209. const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  2210. simde_float64 dst;
  2211. simde_memcpy(&dst, src, sizeof(dst));
  2212. r_.f64[i] = dst;
  2213. }
  2214. return simde__m128d_from_private(r_);
  2215. }
  2216. #if defined(SIMDE_X86_AVX2_NATIVE)
  2217. #define simde_mm_i32gather_pd(base_addr, vindex, scale) _mm_i32gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale)
  2218. #endif
  2219. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2220. #undef _mm_i32gather_pd
  2221. #define _mm_i32gather_pd(base_addr, vindex, scale) simde_mm_i32gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale)
  2222. #endif
  2223. SIMDE_FUNCTION_ATTRIBUTES
  2224. simde__m128d
  2225. simde_mm_mask_i32gather_pd(simde__m128d src, const simde_float64* base_addr, simde__m128i vindex, simde__m128d mask, const int32_t scale)
  2226. SIMDE_REQUIRE_CONSTANT(scale)
  2227. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  2228. simde__m128i_private
  2229. vindex_ = simde__m128i_to_private(vindex);
  2230. simde__m128d_private
  2231. src_ = simde__m128d_to_private(src),
  2232. mask_ = simde__m128d_to_private(mask),
  2233. r_;
  2234. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  2235. SIMDE_VECTORIZE
  2236. for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) {
  2237. if ((mask_.i64[i] >> 63) & 1) {
  2238. const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  2239. simde_float64 dst;
  2240. simde_memcpy(&dst, src1, sizeof(dst));
  2241. r_.f64[i] = dst;
  2242. }
  2243. else {
  2244. r_.f64[i] = src_.f64[i];
  2245. }
  2246. }
  2247. return simde__m128d_from_private(r_);
  2248. }
  2249. #if defined(SIMDE_X86_AVX2_NATIVE)
  2250. #define simde_mm_mask_i32gather_pd(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale)
  2251. #endif
  2252. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2253. #undef _mm_mask_i32gather_pd
  2254. #define _mm_mask_i32gather_pd(src, base_addr, vindex, mask, scale) simde_mm_mask_i32gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale)
  2255. #endif
  2256. SIMDE_FUNCTION_ATTRIBUTES
  2257. simde__m256d
  2258. simde_mm256_i32gather_pd(const simde_float64* base_addr, simde__m128i vindex, const int32_t scale)
  2259. SIMDE_REQUIRE_CONSTANT(scale)
  2260. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  2261. simde__m128i_private
  2262. vindex_ = simde__m128i_to_private(vindex);
  2263. simde__m256d_private
  2264. r_;
  2265. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  2266. SIMDE_VECTORIZE
  2267. for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
  2268. const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  2269. simde_float64 dst;
  2270. simde_memcpy(&dst, src, sizeof(dst));
  2271. r_.f64[i] = dst;
  2272. }
  2273. return simde__m256d_from_private(r_);
  2274. }
  2275. #if defined(SIMDE_X86_AVX2_NATIVE)
  2276. #define simde_mm256_i32gather_pd(base_addr, vindex, scale) _mm256_i32gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale)
  2277. #endif
  2278. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2279. #undef _mm256_i32gather_pd
  2280. #define _mm256_i32gather_pd(base_addr, vindex, scale) simde_mm256_i32gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale)
  2281. #endif
  2282. SIMDE_FUNCTION_ATTRIBUTES
  2283. simde__m256d
  2284. simde_mm256_mask_i32gather_pd(simde__m256d src, const simde_float64* base_addr, simde__m128i vindex, simde__m256d mask, const int32_t scale)
  2285. SIMDE_REQUIRE_CONSTANT(scale)
  2286. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  2287. simde__m256d_private
  2288. src_ = simde__m256d_to_private(src),
  2289. mask_ = simde__m256d_to_private(mask),
  2290. r_;
  2291. simde__m128i_private
  2292. vindex_ = simde__m128i_to_private(vindex);
  2293. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  2294. SIMDE_VECTORIZE
  2295. for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) {
  2296. if ((mask_.i64[i] >> 63) & 1) {
  2297. const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  2298. simde_float64 dst;
  2299. simde_memcpy(&dst, src1, sizeof(dst));
  2300. r_.f64[i] = dst;
  2301. }
  2302. else {
  2303. r_.f64[i] = src_.f64[i];
  2304. }
  2305. }
  2306. return simde__m256d_from_private(r_);
  2307. }
  2308. #if defined(SIMDE_X86_AVX2_NATIVE)
  2309. #define simde_mm256_mask_i32gather_pd(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale)
  2310. #endif
  2311. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2312. #undef _mm256_mask_i32gather_pd
  2313. #define _mm256_mask_i32gather_pd(src, base_addr, vindex, mask, scale) simde_mm256_mask_i32gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale)
  2314. #endif
  2315. SIMDE_FUNCTION_ATTRIBUTES
  2316. simde__m128d
  2317. simde_mm_i64gather_pd(const simde_float64* base_addr, simde__m128i vindex, const int32_t scale)
  2318. SIMDE_REQUIRE_CONSTANT(scale)
  2319. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  2320. simde__m128i_private
  2321. vindex_ = simde__m128i_to_private(vindex);
  2322. simde__m128d_private
  2323. r_ = simde__m128d_to_private(simde_mm_setzero_pd());
  2324. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  2325. SIMDE_VECTORIZE
  2326. for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
  2327. const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  2328. simde_float64 dst;
  2329. simde_memcpy(&dst, src, sizeof(dst));
  2330. r_.f64[i] = dst;
  2331. }
  2332. return simde__m128d_from_private(r_);
  2333. }
  2334. #if defined(SIMDE_X86_AVX2_NATIVE)
  2335. #define simde_mm_i64gather_pd(base_addr, vindex, scale) _mm_i64gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale)
  2336. #endif
  2337. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2338. #undef _mm_i64gather_pd
  2339. #define _mm_i64gather_pd(base_addr, vindex, scale) simde_mm_i64gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale)
  2340. #endif
  2341. SIMDE_FUNCTION_ATTRIBUTES
  2342. simde__m128d
  2343. simde_mm_mask_i64gather_pd(simde__m128d src, const simde_float64* base_addr, simde__m128i vindex, simde__m128d mask, const int32_t scale)
  2344. SIMDE_REQUIRE_CONSTANT(scale)
  2345. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  2346. simde__m128i_private
  2347. vindex_ = simde__m128i_to_private(vindex);
  2348. simde__m128d_private
  2349. src_ = simde__m128d_to_private(src),
  2350. mask_ = simde__m128d_to_private(mask),
  2351. r_ = simde__m128d_to_private(simde_mm_setzero_pd());
  2352. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  2353. SIMDE_VECTORIZE
  2354. for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
  2355. if ((mask_.i64[i] >> 63) & 1) {
  2356. const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  2357. simde_float64 dst;
  2358. simde_memcpy(&dst, src1, sizeof(dst));
  2359. r_.f64[i] = dst;
  2360. }
  2361. else {
  2362. r_.f64[i] = src_.f64[i];
  2363. }
  2364. }
  2365. return simde__m128d_from_private(r_);
  2366. }
  2367. #if defined(SIMDE_X86_AVX2_NATIVE)
  2368. #define simde_mm_mask_i64gather_pd(src, base_addr, vindex, mask, scale) _mm_mask_i64gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale)
  2369. #endif
  2370. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2371. #undef _mm_mask_i64gather_pd
  2372. #define _mm_mask_i64gather_pd(src, base_addr, vindex, mask, scale) simde_mm_mask_i64gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale)
  2373. #endif
  2374. SIMDE_FUNCTION_ATTRIBUTES
  2375. simde__m256d
  2376. simde_mm256_i64gather_pd(const simde_float64* base_addr, simde__m256i vindex, const int32_t scale)
  2377. SIMDE_REQUIRE_CONSTANT(scale)
  2378. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  2379. simde__m256i_private
  2380. vindex_ = simde__m256i_to_private(vindex);
  2381. simde__m256d_private
  2382. r_ = simde__m256d_to_private(simde_mm256_setzero_pd());
  2383. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  2384. SIMDE_VECTORIZE
  2385. for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
  2386. const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  2387. simde_float64 dst;
  2388. simde_memcpy(&dst, src, sizeof(dst));
  2389. r_.f64[i] = dst;
  2390. }
  2391. return simde__m256d_from_private(r_);
  2392. }
  2393. #if defined(SIMDE_X86_AVX2_NATIVE)
  2394. #define simde_mm256_i64gather_pd(base_addr, vindex, scale) _mm256_i64gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale)
  2395. #endif
  2396. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2397. #undef _mm256_i64gather_pd
  2398. #define _mm256_i64gather_pd(base_addr, vindex, scale) simde_mm256_i64gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale)
  2399. #endif
  2400. SIMDE_FUNCTION_ATTRIBUTES
  2401. simde__m256d
  2402. simde_mm256_mask_i64gather_pd(simde__m256d src, const simde_float64* base_addr, simde__m256i vindex, simde__m256d mask, const int32_t scale)
  2403. SIMDE_REQUIRE_CONSTANT(scale)
  2404. HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") {
  2405. simde__m256i_private
  2406. vindex_ = simde__m256i_to_private(vindex);
  2407. simde__m256d_private
  2408. src_ = simde__m256d_to_private(src),
  2409. mask_ = simde__m256d_to_private(mask),
  2410. r_ = simde__m256d_to_private(simde_mm256_setzero_pd());
  2411. const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr);
  2412. SIMDE_VECTORIZE
  2413. for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) {
  2414. if ((mask_.i64[i] >> 63) & 1) {
  2415. const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale));
  2416. simde_float64 dst;
  2417. simde_memcpy(&dst, src1, sizeof(dst));
  2418. r_.f64[i] = dst;
  2419. }
  2420. else {
  2421. r_.f64[i] = src_.f64[i];
  2422. }
  2423. }
  2424. return simde__m256d_from_private(r_);
  2425. }
  2426. #if defined(SIMDE_X86_AVX2_NATIVE)
  2427. #define simde_mm256_mask_i64gather_pd(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale)
  2428. #endif
  2429. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2430. #undef _mm256_mask_i64gather_pd
  2431. #define _mm256_mask_i64gather_pd(src, base_addr, vindex, mask, scale) simde_mm256_mask_i64gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale)
  2432. #endif
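/* A minimal usage sketch for the gather family (hypothetical data, not part
 * of this header): gather four doubles at 64-bit indices, with `scale' set
 * to sizeof(simde_float64):
 *
 *   simde_float64 table[32] = { 0 };
 *   simde__m256i  idx = simde_mm256_set_epi64x(12, 8, 4, 0);
 *   simde__m256d  v   = simde_mm256_i64gather_pd(table, idx, 8);
 *
 * `scale' must be 1, 2, 4 or 8, as enforced by HEDLEY_REQUIRE_MSG above. */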
  2433. SIMDE_FUNCTION_ATTRIBUTES
  2434. simde__m256i
  2435. simde_mm256_inserti128_si256(simde__m256i a, simde__m128i b, const int imm8)
  2436. SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) {
  2437. simde__m256i_private a_ = simde__m256i_to_private(a);
  2438. simde__m128i_private b_ = simde__m128i_to_private(b);
  2439. a_.m128i_private[ imm8 & 1 ] = b_;
  2440. return simde__m256i_from_private(a_);
  2441. }
  2442. #if defined(SIMDE_X86_AVX2_NATIVE)
  2443. #define simde_mm256_inserti128_si256(a, b, imm8) _mm256_inserti128_si256(a, b, imm8)
  2444. #endif
  2445. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2446. #undef _mm256_inserti128_si256
  2447. #define _mm256_inserti128_si256(a, b, imm8) simde_mm256_inserti128_si256(a, b, imm8)
  2448. #endif
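/* _mm256_madd_epi16: multiplies corresponding signed 16-bit elements and
 * horizontally adds each adjacent pair of 32-bit products, producing eight
 * 32-bit results.  The fallbacks below either split the work across the two
 * 128-bit halves or widen to 32 bits and add the even/odd product vectors. */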
  2449. SIMDE_FUNCTION_ATTRIBUTES
  2450. simde__m256i
  2451. simde_mm256_madd_epi16 (simde__m256i a, simde__m256i b) {
  2452. #if defined(SIMDE_X86_AVX2_NATIVE)
  2453. return _mm256_madd_epi16(a, b);
  2454. #else
  2455. simde__m256i_private
  2456. r_,
  2457. a_ = simde__m256i_to_private(a),
  2458. b_ = simde__m256i_to_private(b);
  2459. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  2460. r_.m128i[0] = simde_mm_madd_epi16(a_.m128i[0], b_.m128i[0]);
  2461. r_.m128i[1] = simde_mm_madd_epi16(a_.m128i[1], b_.m128i[1]);
  2462. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector)
  2463. SIMDE_ALIGN_TO_32 int32_t product SIMDE_VECTOR(64);
  2464. SIMDE_ALIGN_TO_32 int32_t a32x16 SIMDE_VECTOR(64);
  2465. SIMDE_ALIGN_TO_32 int32_t b32x16 SIMDE_VECTOR(64);
  2466. SIMDE_ALIGN_TO_32 int32_t even SIMDE_VECTOR(32);
  2467. SIMDE_ALIGN_TO_32 int32_t odd SIMDE_VECTOR(32);
  2468. SIMDE_CONVERT_VECTOR_(a32x16, a_.i16);
  2469. SIMDE_CONVERT_VECTOR_(b32x16, b_.i16);
  2470. product = a32x16 * b32x16;
  2471. even = __builtin_shufflevector(product, product, 0, 2, 4, 6, 8, 10, 12, 14);
  2472. odd = __builtin_shufflevector(product, product, 1, 3, 5, 7, 9, 11, 13, 15);
  2473. r_.i32 = even + odd;
  2474. #else
  2475. SIMDE_VECTORIZE
  2476. for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i += 2) {
  2477. r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) + (a_.i16[i + 1] * b_.i16[i + 1]);
  2478. }
  2479. #endif
  2480. return simde__m256i_from_private(r_);
  2481. #endif
  2482. }
  2483. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2484. #undef _mm256_madd_epi16
  2485. #define _mm256_madd_epi16(a, b) simde_mm256_madd_epi16(a, b)
  2486. #endif
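/* _mm256_maddubs_epi16 pairs unsigned 8-bit elements of `a' with signed
 * 8-bit elements of `b', adds each pair of 16-bit products, and saturates
 * the sum to the int16_t range. */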
  2487. SIMDE_FUNCTION_ATTRIBUTES
  2488. simde__m256i
  2489. simde_mm256_maddubs_epi16 (simde__m256i a, simde__m256i b) {
  2490. #if defined(SIMDE_X86_AVX2_NATIVE)
  2491. return _mm256_maddubs_epi16(a, b);
  2492. #else
  2493. simde__m256i_private
  2494. r_,
  2495. a_ = simde__m256i_to_private(a),
  2496. b_ = simde__m256i_to_private(b);
  2497. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  2498. r_.m128i[0] = simde_mm_maddubs_epi16(a_.m128i[0], b_.m128i[0]);
  2499. r_.m128i[1] = simde_mm_maddubs_epi16(a_.m128i[1], b_.m128i[1]);
  2500. #else
  2501. SIMDE_VECTORIZE
  2502. for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
  2503. const int idx = HEDLEY_STATIC_CAST(int, i) << 1;
  2504. int32_t ts =
  2505. (HEDLEY_STATIC_CAST(int16_t, a_.u8[ idx ]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[ idx ])) +
  2506. (HEDLEY_STATIC_CAST(int16_t, a_.u8[idx + 1]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[idx + 1]));
  2507. r_.i16[i] = (ts > INT16_MIN) ? ((ts < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, ts) : INT16_MAX) : INT16_MIN;
  2508. }
  2509. #endif
  2510. return simde__m256i_from_private(r_);
  2511. #endif
  2512. }
  2513. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2514. #undef _mm256_maddubs_epi16
  2515. #define _mm256_maddubs_epi16(a, b) simde_mm256_maddubs_epi16(a, b)
  2516. #endif
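/* maskload: an element is loaded only when the top bit of the matching mask
 * lane is set; unselected lanes are zeroed, and the scalar fallback only
 * reads the elements it keeps.  Sketch (hypothetical buffer and mask):
 *
 *   int32_t buf[4] = { 1, 2, 3, 4 };
 *   simde__m128i m = simde_mm_set_epi32(0, INT32_MIN, 0, INT32_MIN);
 *   simde__m128i v = simde_mm_maskload_epi32(buf, m);
 *   // lanes 0 and 2 hold 1 and 3; lanes 1 and 3 are zero
 */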
  2517. SIMDE_FUNCTION_ATTRIBUTES
  2518. simde__m128i
  2519. simde_mm_maskload_epi32 (const int32_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m128i mask) {
  2520. #if defined(SIMDE_X86_AVX2_NATIVE)
  2521. return _mm_maskload_epi32(mem_addr, mask);
  2522. #else
  2523. simde__m128i_private
  2524. r_,
  2525. mask_ = simde__m128i_to_private(mask),
  2526. mask_shr_;
  2527. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2528. mask_shr_.neon_i32 = vshrq_n_s32(mask_.neon_i32, 31);
  2529. #else
  2530. SIMDE_VECTORIZE
  2531. for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
  2532. mask_shr_.i32[i] = mask_.i32[i] >> 31;
  2533. }
  2534. #endif
  2535. SIMDE_VECTORIZE
  2536. for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
  2537. r_.i32[i] = mask_shr_.i32[i] ? mem_addr[i] : INT32_C(0);
  2538. }
  2539. return simde__m128i_from_private(r_);
  2540. #endif
  2541. }
  2542. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2543. #undef _mm_maskload_epi32
  2544. #define _mm_maskload_epi32(mem_addr, mask) simde_mm_maskload_epi32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr), mask)
  2545. #endif
  2546. SIMDE_FUNCTION_ATTRIBUTES
  2547. simde__m256i
2548. simde_mm256_maskload_epi32 (const int32_t mem_addr[HEDLEY_ARRAY_PARAM(8)], simde__m256i mask) {
  2549. #if defined(SIMDE_X86_AVX2_NATIVE)
  2550. return _mm256_maskload_epi32(mem_addr, mask);
  2551. #else
  2552. simde__m256i_private
  2553. mask_ = simde__m256i_to_private(mask),
  2554. r_;
  2555. SIMDE_VECTORIZE
  2556. for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
  2557. r_.i32[i] = (mask_.i32[i] >> 31) ? mem_addr[i] : INT32_C(0);
  2558. }
  2559. return simde__m256i_from_private(r_);
  2560. #endif
  2561. }
  2562. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2563. #undef _mm256_maskload_epi32
  2564. #define _mm256_maskload_epi32(mem_addr, mask) simde_mm256_maskload_epi32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr), mask)
  2565. #endif
  2566. SIMDE_FUNCTION_ATTRIBUTES
  2567. simde__m128i
  2568. simde_mm_maskload_epi64 (const int64_t mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128i mask) {
  2569. #if defined(SIMDE_X86_AVX2_NATIVE)
  2570. return _mm_maskload_epi64(HEDLEY_REINTERPRET_CAST(const long long *, mem_addr), mask);
  2571. #else
  2572. simde__m128i_private
  2573. r_,
  2574. mask_ = simde__m128i_to_private(mask),
  2575. mask_shr_;
  2576. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2577. mask_shr_.neon_i64 = vshrq_n_s64(mask_.neon_i64, 63);
  2578. #else
  2579. SIMDE_VECTORIZE
  2580. for (size_t i = 0 ; i < (sizeof(mask_.i64) / sizeof(mask_.i64[0])) ; i++) {
  2581. mask_shr_.i64[i] = mask_.i64[i] >> 63;
  2582. }
  2583. #endif
  2584. SIMDE_VECTORIZE
  2585. for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
  2586. r_.i64[i] = mask_shr_.i64[i] ? mem_addr[i] : INT64_C(0);
  2587. }
  2588. return simde__m128i_from_private(r_);
  2589. #endif
  2590. }
  2591. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2592. #undef _mm_maskload_epi64
  2593. #define _mm_maskload_epi64(mem_addr, mask) simde_mm_maskload_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr), mask)
  2594. #endif
  2595. SIMDE_FUNCTION_ATTRIBUTES
  2596. simde__m256i
  2597. simde_mm256_maskload_epi64 (const int64_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m256i mask) {
  2598. #if defined(SIMDE_X86_AVX2_NATIVE)
  2599. return _mm256_maskload_epi64(HEDLEY_REINTERPRET_CAST(const long long *, mem_addr), mask);
  2600. #else
  2601. simde__m256i_private
  2602. mask_ = simde__m256i_to_private(mask),
  2603. r_;
  2604. SIMDE_VECTORIZE
  2605. for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
  2606. r_.i64[i] = (mask_.i64[i] >> 63) ? mem_addr[i] : INT64_C(0);
  2607. }
  2608. return simde__m256i_from_private(r_);
  2609. #endif
  2610. }
  2611. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2612. #undef _mm256_maskload_epi64
  2613. #define _mm256_maskload_epi64(mem_addr, mask) simde_mm256_maskload_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr), mask)
  2614. #endif
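/* maskstore is the mirror image: an element of `a' is written to mem_addr
 * only when the top bit of the corresponding mask lane is set; other
 * destination elements are left untouched. */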
  2615. SIMDE_FUNCTION_ATTRIBUTES
  2616. void
  2617. simde_mm_maskstore_epi32 (int32_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m128i mask, simde__m128i a) {
  2618. #if defined(SIMDE_X86_AVX2_NATIVE)
  2619. _mm_maskstore_epi32(mem_addr, mask, a);
  2620. #else
  2621. simde__m128i_private mask_ = simde__m128i_to_private(mask);
  2622. simde__m128i_private a_ = simde__m128i_to_private(a);
  2623. SIMDE_VECTORIZE
  2624. for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) {
  2625. if (mask_.u32[i] & (UINT32_C(1) << 31))
  2626. mem_addr[i] = a_.i32[i];
  2627. }
  2628. #endif
  2629. }
  2630. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2631. #undef _mm_maskstore_epi32
  2632. #define _mm_maskstore_epi32(mem_addr, mask, a) simde_mm_maskstore_epi32(HEDLEY_REINTERPRET_CAST(int32_t *, mem_addr), mask, a)
  2633. #endif
  2634. SIMDE_FUNCTION_ATTRIBUTES
  2635. void
  2636. simde_mm256_maskstore_epi32 (int32_t mem_addr[HEDLEY_ARRAY_PARAM(8)], simde__m256i mask, simde__m256i a) {
  2637. #if defined(SIMDE_X86_AVX2_NATIVE)
  2638. _mm256_maskstore_epi32(mem_addr, mask, a);
  2639. #else
  2640. simde__m256i_private mask_ = simde__m256i_to_private(mask);
  2641. simde__m256i_private a_ = simde__m256i_to_private(a);
  2642. SIMDE_VECTORIZE
  2643. for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) {
  2644. if (mask_.u32[i] & (UINT32_C(1) << 31))
  2645. mem_addr[i] = a_.i32[i];
  2646. }
  2647. #endif
  2648. }
  2649. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2650. #undef _mm256_maskstore_epi32
  2651. #define _mm256_maskstore_epi32(mem_addr, mask, a) simde_mm256_maskstore_epi32(HEDLEY_REINTERPRET_CAST(int32_t *, mem_addr), mask, a)
  2652. #endif
  2653. SIMDE_FUNCTION_ATTRIBUTES
  2654. void
  2655. simde_mm_maskstore_epi64 (int64_t mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128i mask, simde__m128i a) {
  2656. #if defined(SIMDE_X86_AVX2_NATIVE)
  2657. _mm_maskstore_epi64(HEDLEY_REINTERPRET_CAST(long long *, mem_addr), mask, a);
  2658. #else
  2659. simde__m128i_private mask_ = simde__m128i_to_private(mask);
  2660. simde__m128i_private a_ = simde__m128i_to_private(a);
  2661. SIMDE_VECTORIZE
  2662. for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) {
  2663. if (mask_.u64[i] >> 63)
  2664. mem_addr[i] = a_.i64[i];
  2665. }
  2666. #endif
  2667. }
  2668. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2669. #undef _mm_maskstore_epi64
  2670. #define _mm_maskstore_epi64(mem_addr, mask, a) simde_mm_maskstore_epi64(HEDLEY_REINTERPRET_CAST(int64_t *, mem_addr), mask, a)
  2671. #endif
  2672. SIMDE_FUNCTION_ATTRIBUTES
  2673. void
  2674. simde_mm256_maskstore_epi64 (int64_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m256i mask, simde__m256i a) {
  2675. #if defined(SIMDE_X86_AVX2_NATIVE)
  2676. _mm256_maskstore_epi64(HEDLEY_REINTERPRET_CAST(long long *, mem_addr), mask, a);
  2677. #else
  2678. simde__m256i_private mask_ = simde__m256i_to_private(mask);
  2679. simde__m256i_private a_ = simde__m256i_to_private(a);
  2680. SIMDE_VECTORIZE
  2681. for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) {
  2682. if (mask_.u64[i] & (UINT64_C(1) << 63))
  2683. mem_addr[i] = a_.i64[i];
  2684. }
  2685. #endif
  2686. }
  2687. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2688. #undef _mm256_maskstore_epi64
  2689. #define _mm256_maskstore_epi64(mem_addr, mask, a) simde_mm256_maskstore_epi64(HEDLEY_REINTERPRET_CAST(int64_t *, mem_addr), mask, a)
  2690. #endif
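/* Element-wise min/max.  Each 256-bit operation either defers to the native
 * AVX2 intrinsic, is split into two simde_mm_* calls when the natural
 * vector size is 128 bits or less, or falls back to a scalar loop. */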
  2691. SIMDE_FUNCTION_ATTRIBUTES
  2692. simde__m256i
  2693. simde_mm256_max_epi8 (simde__m256i a, simde__m256i b) {
  2694. #if defined(SIMDE_X86_AVX2_NATIVE) && !defined(__PGI)
  2695. return _mm256_max_epi8(a, b);
  2696. #else
  2697. simde__m256i_private
  2698. r_,
  2699. a_ = simde__m256i_to_private(a),
  2700. b_ = simde__m256i_to_private(b);
  2701. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  2702. r_.m128i[0] = simde_mm_max_epi8(a_.m128i[0], b_.m128i[0]);
  2703. r_.m128i[1] = simde_mm_max_epi8(a_.m128i[1], b_.m128i[1]);
  2704. #else
  2705. SIMDE_VECTORIZE
  2706. for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
  2707. r_.i8[i] = a_.i8[i] > b_.i8[i] ? a_.i8[i] : b_.i8[i];
  2708. }
  2709. #endif
  2710. return simde__m256i_from_private(r_);
  2711. #endif
  2712. }
  2713. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2714. #undef _mm256_max_epi8
  2715. #define _mm256_max_epi8(a, b) simde_mm256_max_epi8(a, b)
  2716. #endif
  2717. SIMDE_FUNCTION_ATTRIBUTES
  2718. simde__m256i
  2719. simde_mm256_max_epu8 (simde__m256i a, simde__m256i b) {
  2720. #if defined(SIMDE_X86_AVX2_NATIVE)
  2721. return _mm256_max_epu8(a, b);
  2722. #else
  2723. simde__m256i_private
  2724. r_,
  2725. a_ = simde__m256i_to_private(a),
  2726. b_ = simde__m256i_to_private(b);
  2727. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  2728. r_.m128i[0] = simde_mm_max_epu8(a_.m128i[0], b_.m128i[0]);
  2729. r_.m128i[1] = simde_mm_max_epu8(a_.m128i[1], b_.m128i[1]);
  2730. #else
  2731. SIMDE_VECTORIZE
  2732. for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
  2733. r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i];
  2734. }
  2735. #endif
  2736. return simde__m256i_from_private(r_);
  2737. #endif
  2738. }
  2739. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2740. #undef _mm256_max_epu8
  2741. #define _mm256_max_epu8(a, b) simde_mm256_max_epu8(a, b)
  2742. #endif
  2743. SIMDE_FUNCTION_ATTRIBUTES
  2744. simde__m256i
  2745. simde_mm256_max_epu16 (simde__m256i a, simde__m256i b) {
  2746. #if defined(SIMDE_X86_AVX2_NATIVE)
  2747. return _mm256_max_epu16(a, b);
  2748. #else
  2749. simde__m256i_private
  2750. r_,
  2751. a_ = simde__m256i_to_private(a),
  2752. b_ = simde__m256i_to_private(b);
  2753. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  2754. r_.m128i[0] = simde_mm_max_epu16(a_.m128i[0], b_.m128i[0]);
  2755. r_.m128i[1] = simde_mm_max_epu16(a_.m128i[1], b_.m128i[1]);
  2756. #else
  2757. SIMDE_VECTORIZE
  2758. for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
  2759. r_.u16[i] = (a_.u16[i] > b_.u16[i]) ? a_.u16[i] : b_.u16[i];
  2760. }
  2761. #endif
  2762. return simde__m256i_from_private(r_);
  2763. #endif
  2764. }
  2765. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2766. #undef _mm256_max_epu16
  2767. #define _mm256_max_epu16(a, b) simde_mm256_max_epu16(a, b)
  2768. #endif
  2769. SIMDE_FUNCTION_ATTRIBUTES
  2770. simde__m256i
  2771. simde_mm256_max_epu32 (simde__m256i a, simde__m256i b) {
  2772. #if defined(SIMDE_X86_AVX2_NATIVE)
  2773. return _mm256_max_epu32(a, b);
  2774. #else
  2775. simde__m256i_private
  2776. r_,
  2777. a_ = simde__m256i_to_private(a),
  2778. b_ = simde__m256i_to_private(b);
  2779. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  2780. r_.m128i[0] = simde_mm_max_epu32(a_.m128i[0], b_.m128i[0]);
  2781. r_.m128i[1] = simde_mm_max_epu32(a_.m128i[1], b_.m128i[1]);
  2782. #else
  2783. SIMDE_VECTORIZE
  2784. for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
  2785. r_.u32[i] = (a_.u32[i] > b_.u32[i]) ? a_.u32[i] : b_.u32[i];
  2786. }
  2787. #endif
  2788. return simde__m256i_from_private(r_);
  2789. #endif
  2790. }
  2791. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2792. #undef _mm256_max_epu32
  2793. #define _mm256_max_epu32(a, b) simde_mm256_max_epu32(a, b)
  2794. #endif
  2795. SIMDE_FUNCTION_ATTRIBUTES
  2796. simde__m256i
  2797. simde_mm256_max_epi16 (simde__m256i a, simde__m256i b) {
  2798. #if defined(SIMDE_X86_AVX2_NATIVE)
  2799. return _mm256_max_epi16(a, b);
  2800. #else
  2801. simde__m256i_private
  2802. r_,
  2803. a_ = simde__m256i_to_private(a),
  2804. b_ = simde__m256i_to_private(b);
  2805. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  2806. r_.m128i[0] = simde_mm_max_epi16(a_.m128i[0], b_.m128i[0]);
  2807. r_.m128i[1] = simde_mm_max_epi16(a_.m128i[1], b_.m128i[1]);
  2808. #else
  2809. SIMDE_VECTORIZE
  2810. for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
  2811. r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i];
  2812. }
  2813. #endif
  2814. return simde__m256i_from_private(r_);
  2815. #endif
  2816. }
  2817. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2818. #undef _mm256_max_epi16
  2819. #define _mm256_max_epi16(a, b) simde_mm256_max_epi16(a, b)
  2820. #endif
  2821. SIMDE_FUNCTION_ATTRIBUTES
  2822. simde__m256i
  2823. simde_mm256_max_epi32 (simde__m256i a, simde__m256i b) {
  2824. #if defined(SIMDE_X86_AVX2_NATIVE)
  2825. return _mm256_max_epi32(a, b);
  2826. #else
  2827. simde__m256i_private
  2828. r_,
  2829. a_ = simde__m256i_to_private(a),
  2830. b_ = simde__m256i_to_private(b);
  2831. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  2832. r_.m128i[0] = simde_mm_max_epi32(a_.m128i[0], b_.m128i[0]);
  2833. r_.m128i[1] = simde_mm_max_epi32(a_.m128i[1], b_.m128i[1]);
  2834. #else
  2835. SIMDE_VECTORIZE
  2836. for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
  2837. r_.i32[i] = a_.i32[i] > b_.i32[i] ? a_.i32[i] : b_.i32[i];
  2838. }
  2839. #endif
  2840. return simde__m256i_from_private(r_);
  2841. #endif
  2842. }
  2843. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2844. #undef _mm256_max_epi32
  2845. #define _mm256_max_epi32(a, b) simde_mm256_max_epi32(a, b)
  2846. #endif
  2847. SIMDE_FUNCTION_ATTRIBUTES
  2848. simde__m256i
  2849. simde_mm256_min_epi8 (simde__m256i a, simde__m256i b) {
  2850. #if defined(SIMDE_X86_AVX2_NATIVE) && !defined(__PGI)
  2851. return _mm256_min_epi8(a, b);
  2852. #else
  2853. simde__m256i_private
  2854. r_,
  2855. a_ = simde__m256i_to_private(a),
  2856. b_ = simde__m256i_to_private(b);
  2857. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  2858. r_.m128i[0] = simde_mm_min_epi8(a_.m128i[0], b_.m128i[0]);
  2859. r_.m128i[1] = simde_mm_min_epi8(a_.m128i[1], b_.m128i[1]);
  2860. #else
  2861. SIMDE_VECTORIZE
  2862. for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
  2863. r_.i8[i] = a_.i8[i] < b_.i8[i] ? a_.i8[i] : b_.i8[i];
  2864. }
  2865. #endif
  2866. return simde__m256i_from_private(r_);
  2867. #endif
  2868. }
  2869. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2870. #undef _mm256_min_epi8
  2871. #define _mm256_min_epi8(a, b) simde_mm256_min_epi8(a, b)
  2872. #endif
  2873. SIMDE_FUNCTION_ATTRIBUTES
  2874. simde__m256i
  2875. simde_mm256_min_epi16 (simde__m256i a, simde__m256i b) {
  2876. #if defined(SIMDE_X86_AVX2_NATIVE)
  2877. return _mm256_min_epi16(a, b);
  2878. #else
  2879. simde__m256i_private
  2880. r_,
  2881. a_ = simde__m256i_to_private(a),
  2882. b_ = simde__m256i_to_private(b);
  2883. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  2884. r_.m128i[0] = simde_mm_min_epi16(a_.m128i[0], b_.m128i[0]);
  2885. r_.m128i[1] = simde_mm_min_epi16(a_.m128i[1], b_.m128i[1]);
  2886. #else
  2887. SIMDE_VECTORIZE
  2888. for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
  2889. r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i];
  2890. }
  2891. #endif
  2892. return simde__m256i_from_private(r_);
  2893. #endif
  2894. }
  2895. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2896. #undef _mm256_min_epi16
  2897. #define _mm256_min_epi16(a, b) simde_mm256_min_epi16(a, b)
  2898. #endif
  2899. SIMDE_FUNCTION_ATTRIBUTES
  2900. simde__m256i
  2901. simde_mm256_min_epi32 (simde__m256i a, simde__m256i b) {
  2902. #if defined(SIMDE_X86_AVX2_NATIVE)
  2903. return _mm256_min_epi32(a, b);
  2904. #else
  2905. simde__m256i_private
  2906. r_,
  2907. a_ = simde__m256i_to_private(a),
  2908. b_ = simde__m256i_to_private(b);
  2909. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  2910. r_.m128i[0] = simde_mm_min_epi32(a_.m128i[0], b_.m128i[0]);
  2911. r_.m128i[1] = simde_mm_min_epi32(a_.m128i[1], b_.m128i[1]);
  2912. #else
  2913. SIMDE_VECTORIZE
  2914. for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
  2915. r_.i32[i] = a_.i32[i] < b_.i32[i] ? a_.i32[i] : b_.i32[i];
  2916. }
  2917. #endif
  2918. return simde__m256i_from_private(r_);
  2919. #endif
  2920. }
  2921. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2922. #undef _mm256_min_epi32
  2923. #define _mm256_min_epi32(a, b) simde_mm256_min_epi32(a, b)
  2924. #endif
  2925. SIMDE_FUNCTION_ATTRIBUTES
  2926. simde__m256i
  2927. simde_mm256_min_epu8 (simde__m256i a, simde__m256i b) {
  2928. #if defined(SIMDE_X86_AVX2_NATIVE)
  2929. return _mm256_min_epu8(a, b);
  2930. #else
  2931. simde__m256i_private
  2932. r_,
  2933. a_ = simde__m256i_to_private(a),
  2934. b_ = simde__m256i_to_private(b);
  2935. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  2936. r_.m128i[0] = simde_mm_min_epu8(a_.m128i[0], b_.m128i[0]);
  2937. r_.m128i[1] = simde_mm_min_epu8(a_.m128i[1], b_.m128i[1]);
  2938. #else
  2939. SIMDE_VECTORIZE
  2940. for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
  2941. r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i];
  2942. }
  2943. #endif
  2944. return simde__m256i_from_private(r_);
  2945. #endif
  2946. }
  2947. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2948. #undef _mm256_min_epu8
  2949. #define _mm256_min_epu8(a, b) simde_mm256_min_epu8(a, b)
  2950. #endif
  2951. SIMDE_FUNCTION_ATTRIBUTES
  2952. simde__m256i
  2953. simde_mm256_min_epu16 (simde__m256i a, simde__m256i b) {
  2954. #if defined(SIMDE_X86_AVX2_NATIVE)
  2955. return _mm256_min_epu16(a, b);
  2956. #else
  2957. simde__m256i_private
  2958. r_,
  2959. a_ = simde__m256i_to_private(a),
  2960. b_ = simde__m256i_to_private(b);
  2961. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  2962. r_.m128i[0] = simde_mm_min_epu16(a_.m128i[0], b_.m128i[0]);
  2963. r_.m128i[1] = simde_mm_min_epu16(a_.m128i[1], b_.m128i[1]);
  2964. #else
  2965. SIMDE_VECTORIZE
  2966. for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
  2967. r_.u16[i] = (a_.u16[i] < b_.u16[i]) ? a_.u16[i] : b_.u16[i];
  2968. }
  2969. #endif
  2970. return simde__m256i_from_private(r_);
  2971. #endif
  2972. }
  2973. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  2974. #undef _mm256_min_epu16
  2975. #define _mm256_min_epu16(a, b) simde_mm256_min_epu16(a, b)
  2976. #endif
  2977. SIMDE_FUNCTION_ATTRIBUTES
  2978. simde__m256i
  2979. simde_mm256_min_epu32 (simde__m256i a, simde__m256i b) {
  2980. #if defined(SIMDE_X86_AVX2_NATIVE)
  2981. return _mm256_min_epu32(a, b);
  2982. #else
  2983. simde__m256i_private
  2984. r_,
  2985. a_ = simde__m256i_to_private(a),
  2986. b_ = simde__m256i_to_private(b);
  2987. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  2988. r_.m128i[0] = simde_mm_min_epu32(a_.m128i[0], b_.m128i[0]);
  2989. r_.m128i[1] = simde_mm_min_epu32(a_.m128i[1], b_.m128i[1]);
  2990. #else
  2991. SIMDE_VECTORIZE
  2992. for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
  2993. r_.u32[i] = (a_.u32[i] < b_.u32[i]) ? a_.u32[i] : b_.u32[i];
  2994. }
  2995. #endif
  2996. return simde__m256i_from_private(r_);
  2997. #endif
  2998. }
  2999. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3000. #undef _mm256_min_epu32
  3001. #define _mm256_min_epu32(a, b) simde_mm256_min_epu32(a, b)
  3002. #endif
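/* _mm256_movemask_epi8 packs the sign bit of each of the 32 bytes of `a'
 * into bit positions 0..31 of the result. */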
  3003. SIMDE_FUNCTION_ATTRIBUTES
  3004. int32_t
  3005. simde_mm256_movemask_epi8 (simde__m256i a) {
  3006. #if defined(SIMDE_X86_AVX2_NATIVE)
  3007. return _mm256_movemask_epi8(a);
  3008. #else
  3009. simde__m256i_private a_ = simde__m256i_to_private(a);
  3010. uint32_t r = 0;
  3011. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  3012. for (size_t i = 0 ; i < (sizeof(a_.m128i) / sizeof(a_.m128i[0])) ; i++) {
  3013. r |= HEDLEY_STATIC_CAST(uint32_t,simde_mm_movemask_epi8(a_.m128i[i])) << (16 * i);
  3014. }
  3015. #else
  3016. r = 0;
  3017. SIMDE_VECTORIZE_REDUCTION(|:r)
  3018. for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) {
  3019. r |= HEDLEY_STATIC_CAST(uint32_t, (a_.u8[31 - i] >> 7)) << (31 - i);
  3020. }
  3021. #endif
  3022. return HEDLEY_STATIC_CAST(int32_t, r);
  3023. #endif
  3024. }
  3025. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3026. #undef _mm256_movemask_epi8
  3027. #define _mm256_movemask_epi8(a) simde_mm256_movemask_epi8(a)
  3028. #endif
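/* mpsadbw: bits 1:0 of imm8 (4:3 for the upper lane) select a 32-bit block
 * of `b', and bit 2 (bit 5 for the upper lane) selects a 0- or 4-byte offset
 * into `a'; each result word is the sum of four absolute byte differences. */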
  3029. SIMDE_FUNCTION_ATTRIBUTES
  3030. simde__m256i
  3031. simde_mm256_mpsadbw_epu8 (simde__m256i a, simde__m256i b, const int imm8)
  3032. SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  3033. simde__m256i_private
  3034. r_,
  3035. a_ = simde__m256i_to_private(a),
  3036. b_ = simde__m256i_to_private(b);
  3037. const int a_offset1 = imm8 & 4;
  3038. const int b_offset1 = (imm8 & 3) << 2;
  3039. const int a_offset2 = (imm8 >> 3) & 4;
  3040. const int b_offset2 = ((imm8 >> 3) & 3) << 2;
  3041. #if defined(simde_math_abs)
  3042. const int halfway_point = HEDLEY_STATIC_CAST(int, (sizeof(r_.u16) / sizeof(r_.u16[0])) ) / 2;
  3043. for (int i = 0 ; i < halfway_point ; i++) {
  3044. r_.u16[i] =
  3045. HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset1 + i + 0] - b_.u8[b_offset1 + 0]))) +
  3046. HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset1 + i + 1] - b_.u8[b_offset1 + 1]))) +
  3047. HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset1 + i + 2] - b_.u8[b_offset1 + 2]))) +
  3048. HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset1 + i + 3] - b_.u8[b_offset1 + 3])));
  3049. r_.u16[halfway_point + i] =
  3050. HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[2 * halfway_point + a_offset2 + i + 0] - b_.u8[2 * halfway_point + b_offset2 + 0]))) +
  3051. HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[2 * halfway_point + a_offset2 + i + 1] - b_.u8[2 * halfway_point + b_offset2 + 1]))) +
  3052. HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[2 * halfway_point + a_offset2 + i + 2] - b_.u8[2 * halfway_point + b_offset2 + 2]))) +
  3053. HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[2 * halfway_point + a_offset2 + i + 3] - b_.u8[2 * halfway_point + b_offset2 + 3])));
  3054. }
  3055. #else
  3056. HEDLEY_UNREACHABLE();
  3057. #endif
  3058. return simde__m256i_from_private(r_);
  3059. }
  3060. #if defined(SIMDE_X86_AVX2_NATIVE) && SIMDE_DETECT_CLANG_VERSION_CHECK(3,9,0)
  3061. #define simde_mm256_mpsadbw_epu8(a, b, imm8) _mm256_mpsadbw_epu8(a, b, imm8)
  3062. #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  3063. #define simde_mm256_mpsadbw_epu8(a, b, imm8) \
  3064. simde_mm256_set_m128i( \
  3065. simde_mm_mpsadbw_epu8(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (imm8 >> 3)), \
  3066. simde_mm_mpsadbw_epu8(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (imm8)))
  3067. #endif
  3068. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3069. #undef _mm256_mpsadbw_epu8
  3070. #define _mm256_mpsadbw_epu8(a, b, imm8) simde_mm256_mpsadbw_epu8(a, b, imm8)
  3071. #endif
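/* _mm256_mul_epi32 / _mm256_mul_epu32 multiply the even-indexed 32-bit
 * elements (0, 2, 4, 6) of `a' and `b' to full 64-bit products. */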
  3072. SIMDE_FUNCTION_ATTRIBUTES
  3073. simde__m256i
  3074. simde_mm256_mul_epi32 (simde__m256i a, simde__m256i b) {
  3075. #if defined(SIMDE_X86_AVX2_NATIVE)
  3076. return _mm256_mul_epi32(a, b);
  3077. #else
  3078. simde__m256i_private
  3079. r_,
  3080. a_ = simde__m256i_to_private(a),
  3081. b_ = simde__m256i_to_private(b);
  3082. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  3083. r_.m128i[0] = simde_mm_mul_epi32(a_.m128i[0], b_.m128i[0]);
  3084. r_.m128i[1] = simde_mm_mul_epi32(a_.m128i[1], b_.m128i[1]);
  3085. #else
  3086. SIMDE_VECTORIZE
  3087. for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
  3088. r_.i64[i] =
  3089. HEDLEY_STATIC_CAST(int64_t, a_.i32[i * 2]) *
  3090. HEDLEY_STATIC_CAST(int64_t, b_.i32[i * 2]);
  3091. }
  3092. #endif
  3093. return simde__m256i_from_private(r_);
  3094. #endif
  3095. }
  3096. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3097. # define _mm256_mul_epi32(a, b) simde_mm256_mul_epi32(a, b)
  3098. #endif
  3099. SIMDE_FUNCTION_ATTRIBUTES
  3100. simde__m256i
  3101. simde_mm256_mul_epu32 (simde__m256i a, simde__m256i b) {
  3102. #if defined(SIMDE_X86_AVX2_NATIVE)
  3103. return _mm256_mul_epu32(a, b);
  3104. #else
  3105. simde__m256i_private
  3106. r_,
  3107. a_ = simde__m256i_to_private(a),
  3108. b_ = simde__m256i_to_private(b);
  3109. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  3110. r_.m128i[0] = simde_mm_mul_epu32(a_.m128i[0], b_.m128i[0]);
  3111. r_.m128i[1] = simde_mm_mul_epu32(a_.m128i[1], b_.m128i[1]);
  3112. #else
  3113. SIMDE_VECTORIZE
  3114. for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
  3115. r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[i * 2]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[i * 2]);
  3116. }
  3117. #endif
  3118. return simde__m256i_from_private(r_);
  3119. #endif
  3120. }
  3121. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3122. # define _mm256_mul_epu32(a, b) simde_mm256_mul_epu32(a, b)
  3123. #endif
  3124. SIMDE_FUNCTION_ATTRIBUTES
  3125. simde__m256i
  3126. simde_mm256_mulhi_epi16 (simde__m256i a, simde__m256i b) {
  3127. #if defined(SIMDE_X86_AVX2_NATIVE)
  3128. return _mm256_mulhi_epi16(a, b);
  3129. #else
  3130. simde__m256i_private
  3131. r_,
  3132. a_ = simde__m256i_to_private(a),
  3133. b_ = simde__m256i_to_private(b);
  3134. SIMDE_VECTORIZE
  3135. for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
  3136. r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (HEDLEY_STATIC_CAST(uint32_t, HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) >> 16));
  3137. }
  3138. return simde__m256i_from_private(r_);
  3139. #endif
  3140. }
  3141. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3142. # define _mm256_mulhi_epi16(a, b) simde_mm256_mulhi_epi16(a, b)
  3143. #endif
  3144. SIMDE_FUNCTION_ATTRIBUTES
  3145. simde__m256i
  3146. simde_mm256_mulhi_epu16 (simde__m256i a, simde__m256i b) {
  3147. #if defined(SIMDE_X86_AVX2_NATIVE)
  3148. return _mm256_mulhi_epu16(a, b);
  3149. #else
  3150. simde__m256i_private
  3151. r_,
  3152. a_ = simde__m256i_to_private(a),
  3153. b_ = simde__m256i_to_private(b);
  3154. SIMDE_VECTORIZE
  3155. for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
  3156. r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]) >> 16);
  3157. }
  3158. return simde__m256i_from_private(r_);
  3159. #endif
  3160. }
  3161. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3162. # define _mm256_mulhi_epu16(a, b) simde_mm256_mulhi_epu16(a, b)
  3163. #endif
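/* mulhrs: r = (int16_t)(((int32_t)a * b + 0x4000) >> 15), i.e. the high
 * half of the doubled product, rounded to nearest. */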
  3164. SIMDE_FUNCTION_ATTRIBUTES
  3165. simde__m256i
  3166. simde_mm256_mulhrs_epi16 (simde__m256i a, simde__m256i b) {
  3167. #if defined(SIMDE_X86_AVX2_NATIVE)
  3168. return _mm256_mulhrs_epi16(a, b);
  3169. #else
  3170. simde__m256i_private
  3171. r_,
  3172. a_ = simde__m256i_to_private(a),
  3173. b_ = simde__m256i_to_private(b);
  3174. SIMDE_VECTORIZE
  3175. for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
  3176. r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, (((HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) + 0x4000) >> 15));
  3177. }
  3178. return simde__m256i_from_private(r_);
  3179. #endif
  3180. }
  3181. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3182. # define _mm256_mulhrs_epi16(a, b) simde_mm256_mulhrs_epi16(a, b)
  3183. #endif
  3184. SIMDE_FUNCTION_ATTRIBUTES
  3185. simde__m256i
  3186. simde_mm256_mullo_epi16 (simde__m256i a, simde__m256i b) {
  3187. #if defined(SIMDE_X86_AVX2_NATIVE)
  3188. return _mm256_mullo_epi16(a, b);
  3189. #else
  3190. simde__m256i_private
  3191. a_ = simde__m256i_to_private(a),
  3192. b_ = simde__m256i_to_private(b),
  3193. r_;
  3194. SIMDE_VECTORIZE
  3195. for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
  3196. r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] * b_.i16[i]);
  3197. }
  3198. return simde__m256i_from_private(r_);
  3199. #endif
  3200. }
  3201. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3202. #undef _mm256_mullo_epi16
  3203. #define _mm256_mullo_epi16(a, b) simde_mm256_mullo_epi16(a, b)
  3204. #endif
  3205. SIMDE_FUNCTION_ATTRIBUTES
  3206. simde__m256i
  3207. simde_mm256_mullo_epi32 (simde__m256i a, simde__m256i b) {
  3208. #if defined(SIMDE_X86_AVX2_NATIVE)
  3209. return _mm256_mullo_epi32(a, b);
  3210. #else
  3211. simde__m256i_private
  3212. a_ = simde__m256i_to_private(a),
  3213. b_ = simde__m256i_to_private(b),
  3214. r_;
  3215. SIMDE_VECTORIZE
  3216. for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
  3217. r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.i32[i] * b_.i32[i]);
  3218. }
  3219. return simde__m256i_from_private(r_);
  3220. #endif
  3221. }
  3222. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3223. #undef _mm256_mullo_epi32
  3224. #define _mm256_mullo_epi32(a, b) simde_mm256_mullo_epi32(a, b)
  3225. #endif
  3226. SIMDE_FUNCTION_ATTRIBUTES
  3227. simde__m256i
  3228. simde_x_mm256_mullo_epu32 (simde__m256i a, simde__m256i b) {
  3229. simde__m256i_private
  3230. r_,
  3231. a_ = simde__m256i_to_private(a),
  3232. b_ = simde__m256i_to_private(b);
  3233. #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  3234. r_.u32 = a_.u32 * b_.u32;
  3235. #else
  3236. SIMDE_VECTORIZE
  3237. for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
  3238. r_.u32[i] = a_.u32[i] * b_.u32[i];
  3239. }
  3240. #endif
  3241. return simde__m256i_from_private(r_);
  3242. }
  3243. SIMDE_FUNCTION_ATTRIBUTES
  3244. simde__m256i
  3245. simde_mm256_or_si256 (simde__m256i a, simde__m256i b) {
  3246. #if defined(SIMDE_X86_AVX2_NATIVE)
  3247. return _mm256_or_si256(a, b);
  3248. #else
  3249. simde__m256i_private
  3250. r_,
  3251. a_ = simde__m256i_to_private(a),
  3252. b_ = simde__m256i_to_private(b);
  3253. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  3254. r_.m128i[0] = simde_mm_or_si128(a_.m128i[0], b_.m128i[0]);
  3255. r_.m128i[1] = simde_mm_or_si128(a_.m128i[1], b_.m128i[1]);
  3256. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  3257. r_.i32f = a_.i32f | b_.i32f;
  3258. #else
  3259. SIMDE_VECTORIZE
  3260. for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) {
  3261. r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
  3262. }
  3263. #endif
  3264. return simde__m256i_from_private(r_);
  3265. #endif
  3266. }
  3267. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3268. #undef _mm256_or_si256
  3269. #define _mm256_or_si256(a, b) simde_mm256_or_si256(a, b)
  3270. #endif
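/* The pack operations below narrow with saturation.  Note the AVX2 lane
 * ordering: within each 128-bit half the results are a's elements followed
 * by b's elements, which is what the quarter/halfway index arithmetic in
 * the scalar fallbacks reproduces. */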
  3271. SIMDE_FUNCTION_ATTRIBUTES
  3272. simde__m256i
  3273. simde_mm256_packs_epi16 (simde__m256i a, simde__m256i b) {
  3274. #if defined(SIMDE_X86_AVX2_NATIVE)
  3275. return _mm256_packs_epi16(a, b);
  3276. #else
  3277. simde__m256i_private
  3278. r_,
  3279. a_ = simde__m256i_to_private(a),
  3280. b_ = simde__m256i_to_private(b);
  3281. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  3282. r_.m128i[0] = simde_mm_packs_epi16(a_.m128i[0], b_.m128i[0]);
  3283. r_.m128i[1] = simde_mm_packs_epi16(a_.m128i[1], b_.m128i[1]);
  3284. #else
  3285. const size_t halfway_point = (sizeof(r_.i8) / sizeof(r_.i8[0]))/2;
  3286. const size_t quarter_point = (sizeof(r_.i8) / sizeof(r_.i8[0]))/4;
  3287. SIMDE_VECTORIZE
  3288. for (size_t i = 0 ; i < quarter_point ; i++) {
  3289. r_.i8[i] = (a_.i16[i] > INT8_MAX) ? INT8_MAX : ((a_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, a_.i16[i]));
  3290. r_.i8[i + quarter_point] = (b_.i16[i] > INT8_MAX) ? INT8_MAX : ((b_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, b_.i16[i]));
  3291. r_.i8[halfway_point + i] = (a_.i16[quarter_point + i] > INT8_MAX) ? INT8_MAX : ((a_.i16[quarter_point + i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, a_.i16[quarter_point + i]));
  3292. r_.i8[halfway_point + i + quarter_point] = (b_.i16[quarter_point + i] > INT8_MAX) ? INT8_MAX : ((b_.i16[quarter_point + i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, b_.i16[quarter_point + i]));
  3293. }
  3294. #endif
  3295. return simde__m256i_from_private(r_);
  3296. #endif
  3297. }
  3298. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3299. #undef _mm256_packs_epi16
  3300. #define _mm256_packs_epi16(a, b) simde_mm256_packs_epi16(a, b)
  3301. #endif
  3302. SIMDE_FUNCTION_ATTRIBUTES
  3303. simde__m256i
  3304. simde_mm256_packs_epi32 (simde__m256i a, simde__m256i b) {
  3305. #if defined(SIMDE_X86_AVX2_NATIVE)
  3306. return _mm256_packs_epi32(a, b);
  3307. #else
  3308. simde__m256i_private
  3309. r_,
  3310. v_[] = {
  3311. simde__m256i_to_private(a),
  3312. simde__m256i_to_private(b)
  3313. };
  3314. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  3315. r_.m128i[0] = simde_mm_packs_epi32(v_[0].m128i[0], v_[1].m128i[0]);
  3316. r_.m128i[1] = simde_mm_packs_epi32(v_[0].m128i[1], v_[1].m128i[1]);
  3317. #else
  3318. SIMDE_VECTORIZE
  3319. for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
  3320. const int32_t v = v_[(i >> 2) & 1].i32[(i & 11) - ((i & 8) >> 1)];
  3321. r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, (v > INT16_MAX) ? INT16_MAX : ((v < INT16_MIN) ? INT16_MIN : v));
  3322. }
  3323. #endif
  3324. return simde__m256i_from_private(r_);
  3325. #endif
  3326. }
  3327. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3328. #undef _mm256_packs_epi32
  3329. #define _mm256_packs_epi32(a, b) simde_mm256_packs_epi32(a, b)
  3330. #endif
  3331. SIMDE_FUNCTION_ATTRIBUTES
  3332. simde__m256i
  3333. simde_mm256_packus_epi16 (simde__m256i a, simde__m256i b) {
  3334. #if defined(SIMDE_X86_AVX2_NATIVE)
  3335. return _mm256_packus_epi16(a, b);
  3336. #else
  3337. simde__m256i_private
  3338. r_,
  3339. a_ = simde__m256i_to_private(a),
  3340. b_ = simde__m256i_to_private(b);
  3341. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  3342. r_.m128i[0] = simde_mm_packus_epi16(a_.m128i[0], b_.m128i[0]);
  3343. r_.m128i[1] = simde_mm_packus_epi16(a_.m128i[1], b_.m128i[1]);
  3344. #else
  3345. const size_t halfway_point = (sizeof(r_.i8) / sizeof(r_.i8[0])) / 2;
  3346. const size_t quarter_point = (sizeof(r_.i8) / sizeof(r_.i8[0])) / 4;
  3347. SIMDE_VECTORIZE
  3348. for (size_t i = 0 ; i < quarter_point ; i++) {
  3349. r_.u8[i] = (a_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((a_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, a_.i16[i]));
  3350. r_.u8[i + quarter_point] = (b_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((b_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, b_.i16[i]));
  3351. r_.u8[halfway_point + i] = (a_.i16[quarter_point + i] > UINT8_MAX) ? UINT8_MAX : ((a_.i16[quarter_point + i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, a_.i16[quarter_point + i]));
  3352. r_.u8[halfway_point + i + quarter_point] = (b_.i16[quarter_point + i] > UINT8_MAX) ? UINT8_MAX : ((b_.i16[quarter_point + i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, b_.i16[quarter_point + i]));
  3353. }
  3354. #endif
  3355. return simde__m256i_from_private(r_);
  3356. #endif
  3357. }
  3358. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3359. #undef _mm256_packus_epi16
  3360. #define _mm256_packus_epi16(a, b) simde_mm256_packus_epi16(a, b)
  3361. #endif
  3362. SIMDE_FUNCTION_ATTRIBUTES
  3363. simde__m256i
  3364. simde_mm256_packus_epi32 (simde__m256i a, simde__m256i b) {
  3365. #if defined(SIMDE_X86_AVX2_NATIVE)
  3366. return _mm256_packus_epi32(a, b);
  3367. #else
  3368. simde__m256i_private
  3369. r_,
  3370. a_ = simde__m256i_to_private(a),
  3371. b_ = simde__m256i_to_private(b);
  3372. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  3373. r_.m128i[0] = simde_mm_packus_epi32(a_.m128i[0], b_.m128i[0]);
  3374. r_.m128i[1] = simde_mm_packus_epi32(a_.m128i[1], b_.m128i[1]);
  3375. #else
  3376. const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2;
  3377. const size_t quarter_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 4;
  3378. SIMDE_VECTORIZE
  3379. for (size_t i = 0 ; i < quarter_point ; i++) {
  3380. r_.u16[i] = (a_.i32[i] > UINT16_MAX) ? UINT16_MAX : ((a_.i32[i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, a_.i32[i]));
  3381. r_.u16[i + quarter_point] = (b_.i32[i] > UINT16_MAX) ? UINT16_MAX : ((b_.i32[i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, b_.i32[i]));
  3382. r_.u16[halfway_point + i] = (a_.i32[quarter_point + i] > UINT16_MAX) ? UINT16_MAX : ((a_.i32[quarter_point + i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, a_.i32[quarter_point + i]));
  3383. r_.u16[halfway_point + i + quarter_point] = (b_.i32[quarter_point + i] > UINT16_MAX) ? UINT16_MAX : ((b_.i32[quarter_point + i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, b_.i32[quarter_point + i]));
  3384. }
  3385. #endif
  3386. return simde__m256i_from_private(r_);
  3387. #endif
  3388. }
  3389. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3390. #undef _mm256_packus_epi32
  3391. #define _mm256_packus_epi32(a, b) simde_mm256_packus_epi32(a, b)
  3392. #endif
  3393. SIMDE_FUNCTION_ATTRIBUTES
  3394. simde__m256i
  3395. simde_mm256_permute2x128_si256 (simde__m256i a, simde__m256i b, const int imm8)
  3396. SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  3397. simde__m256i_private
  3398. r_,
  3399. a_ = simde__m256i_to_private(a),
  3400. b_ = simde__m256i_to_private(b);
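/* imm8 is decoded once per 128-bit half of the result: bits 1:0 pick the source
   for the low half (bit 1 chooses a or b, bit 0 chooses that operand's low or
   high half) and bit 3 forces it to zero; bits 5:4 and bit 7 do the same for the
   high half.  For example imm8 == 0x21 makes the low 128 bits of the result the
   high half of a and the high 128 bits the low half of b, the usual building
   block for cross-lane shifts. */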
  3401. r_.m128i_private[0] = (imm8 & 0x08) ? simde__m128i_to_private(simde_mm_setzero_si128()) : ((imm8 & 0x02) ? b_.m128i_private[(imm8 ) & 1] : a_.m128i_private[(imm8 ) & 1]);
  3402. r_.m128i_private[1] = (imm8 & 0x80) ? simde__m128i_to_private(simde_mm_setzero_si128()) : ((imm8 & 0x20) ? b_.m128i_private[(imm8 >> 4) & 1] : a_.m128i_private[(imm8 >> 4) & 1]);
  3403. return simde__m256i_from_private(r_);
  3404. }
  3405. #if defined(SIMDE_X86_AVX2_NATIVE)
  3406. # define simde_mm256_permute2x128_si256(a, b, imm8) _mm256_permute2x128_si256(a, b, imm8)
  3407. #endif
  3408. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3409. #undef _mm256_permute2x128_si256
  3410. #define _mm256_permute2x128_si256(a, b, imm8) simde_mm256_permute2x128_si256(a, b, imm8)
  3411. #endif
  3412. SIMDE_FUNCTION_ATTRIBUTES
  3413. simde__m256i
  3414. simde_mm256_permute4x64_epi64 (simde__m256i a, const int imm8)
  3415. SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  3416. simde__m256i_private
  3417. r_,
  3418. a_ = simde__m256i_to_private(a);
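/* Each 2-bit field of imm8 (bits 1:0, 3:2, 5:4, 7:6) selects one of a's four
   64-bit elements, i.e. r_.i64[j] = a_.i64[(imm8 >> (2 * j)) & 3].  The code
   splits each field into its high bit (upper or lower pair of elements) and its
   low bit (element within that pair). */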
  3419. r_.i64[0] = (imm8 & 0x02) ? a_.i64[((imm8 ) & 1)+2] : a_.i64[(imm8 ) & 1];
  3420. r_.i64[1] = (imm8 & 0x08) ? a_.i64[((imm8 >> 2 ) & 1)+2] : a_.i64[(imm8 >> 2 ) & 1];
  3421. r_.i64[2] = (imm8 & 0x20) ? a_.i64[((imm8 >> 4 ) & 1)+2] : a_.i64[(imm8 >> 4 ) & 1];
  3422. r_.i64[3] = (imm8 & 0x80) ? a_.i64[((imm8 >> 6 ) & 1)+2] : a_.i64[(imm8 >> 6 ) & 1];
  3423. return simde__m256i_from_private(r_);
  3424. }
  3425. #if defined(SIMDE_X86_AVX2_NATIVE)
  3426. # define simde_mm256_permute4x64_epi64(a, imm8) _mm256_permute4x64_epi64(a, imm8)
  3427. #endif
  3428. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3429. #undef _mm256_permute4x64_epi64
  3430. #define _mm256_permute4x64_epi64(a, imm8) simde_mm256_permute4x64_epi64(a, imm8)
  3431. #endif
  3432. SIMDE_FUNCTION_ATTRIBUTES
  3433. simde__m256d
  3434. simde_mm256_permute4x64_pd (simde__m256d a, const int imm8)
  3435. SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  3436. simde__m256d_private
  3437. r_,
  3438. a_ = simde__m256d_to_private(a);
  3439. r_.f64[0] = (imm8 & 0x02) ? a_.f64[((imm8 ) & 1)+2] : a_.f64[(imm8 ) & 1];
  3440. r_.f64[1] = (imm8 & 0x08) ? a_.f64[((imm8 >> 2 ) & 1)+2] : a_.f64[(imm8 >> 2 ) & 1];
  3441. r_.f64[2] = (imm8 & 0x20) ? a_.f64[((imm8 >> 4 ) & 1)+2] : a_.f64[(imm8 >> 4 ) & 1];
  3442. r_.f64[3] = (imm8 & 0x80) ? a_.f64[((imm8 >> 6 ) & 1)+2] : a_.f64[(imm8 >> 6 ) & 1];
  3443. return simde__m256d_from_private(r_);
  3444. }
  3445. #if defined(SIMDE_X86_AVX2_NATIVE)
  3446. # define simde_mm256_permute4x64_pd(a, imm8) _mm256_permute4x64_pd(a, imm8)
  3447. #endif
  3448. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3449. #undef _mm256_permute4x64_pd
  3450. #define _mm256_permute4x64_pd(a, imm8) simde_mm256_permute4x64_pd(a, imm8)
  3451. #endif
  3452. SIMDE_FUNCTION_ATTRIBUTES
  3453. simde__m256i
  3454. simde_mm256_permutevar8x32_epi32 (simde__m256i a, simde__m256i idx) {
  3455. #if defined(SIMDE_X86_AVX2_NATIVE)
  3456. return _mm256_permutevar8x32_epi32(a, idx);
  3457. #else
  3458. simde__m256i_private
  3459. r_,
  3460. a_ = simde__m256i_to_private(a),
  3461. idx_ = simde__m256i_to_private(idx);
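/* Unlike the AVX permutevar operations, VPERMD permutes across the full 256-bit
   register; only the low three bits of each index element are used. */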
  3462. SIMDE_VECTORIZE
  3463. for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
  3464. r_.i32[i] = a_.i32[idx_.i32[i] & 7];
  3465. }
  3466. return simde__m256i_from_private(r_);
  3467. #endif
  3468. }
  3469. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3470. #undef _mm256_permutevar8x32_epi32
  3471. #define _mm256_permutevar8x32_epi32(a, idx) simde_mm256_permutevar8x32_epi32(a, idx)
  3472. #endif
  3473. SIMDE_FUNCTION_ATTRIBUTES
  3474. simde__m256
  3475. simde_mm256_permutevar8x32_ps (simde__m256 a, simde__m256i idx) {
  3476. #if defined(SIMDE_X86_AVX2_NATIVE)
  3477. #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0)
  3478. return _mm256_permutevar8x32_ps(a, HEDLEY_REINTERPRET_CAST(simde__m256, idx));
  3479. #else
  3480. return _mm256_permutevar8x32_ps(a, idx);
  3481. #endif
  3482. #else
  3483. simde__m256_private
  3484. r_,
  3485. a_ = simde__m256_to_private(a);
  3486. simde__m256i_private
  3487. idx_ = simde__m256i_to_private(idx);
  3488. SIMDE_VECTORIZE
  3489. for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
  3490. r_.f32[i] = a_.f32[idx_.i32[i] & 7];
  3491. }
  3492. return simde__m256_from_private(r_);
  3493. #endif
  3494. }
  3495. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3496. #undef _mm256_permutevar8x32_ps
  3497. #define _mm256_permutevar8x32_ps(a, idx) simde_mm256_permutevar8x32_ps(a, idx)
  3498. #endif
  3499. SIMDE_FUNCTION_ATTRIBUTES
  3500. simde__m256i
  3501. simde_mm256_sad_epu8 (simde__m256i a, simde__m256i b) {
  3502. #if defined(SIMDE_X86_AVX2_NATIVE)
  3503. return _mm256_sad_epu8(a, b);
  3504. #else
  3505. simde__m256i_private
  3506. r_,
  3507. a_ = simde__m256i_to_private(a),
  3508. b_ = simde__m256i_to_private(b);
  3509. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  3510. r_.m128i[0] = simde_mm_sad_epu8(a_.m128i[0], b_.m128i[0]);
  3511. r_.m128i[1] = simde_mm_sad_epu8(a_.m128i[1], b_.m128i[1]);
  3512. #else
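/* Sum of absolute differences: each of the four 64-bit results accumulates the
   absolute differences of the eight corresponding unsigned bytes.  Writing the
   difference as (a > b) ? (a - b) : (b - a) keeps everything unsigned, and a
   16-bit accumulator is enough since the maximum sum is 8 * 255 = 2040. */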
  3513. for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
  3514. uint16_t tmp = 0;
  3515. SIMDE_VECTORIZE_REDUCTION(+:tmp)
  3516. for (size_t j = 0 ; j < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 4) ; j++) {
  3517. const size_t e = j + (i * 8);
  3518. tmp += (a_.u8[e] > b_.u8[e]) ? (a_.u8[e] - b_.u8[e]) : (b_.u8[e] - a_.u8[e]);
  3519. }
  3520. r_.i64[i] = tmp;
  3521. }
  3522. #endif
  3523. return simde__m256i_from_private(r_);
  3524. #endif
  3525. }
  3526. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3527. #undef _mm256_sad_epu8
  3528. #define _mm256_sad_epu8(a, b) simde_mm256_sad_epu8(a, b)
  3529. #endif
  3530. SIMDE_FUNCTION_ATTRIBUTES
  3531. simde__m256i
  3532. simde_mm256_shuffle_epi8 (simde__m256i a, simde__m256i b) {
  3533. #if defined(SIMDE_X86_AVX2_NATIVE)
  3534. return _mm256_shuffle_epi8(a, b);
  3535. #else
  3536. simde__m256i_private
  3537. r_,
  3538. a_ = simde__m256i_to_private(a),
  3539. b_ = simde__m256i_to_private(b);
  3540. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  3541. r_.m128i[0] = simde_mm_shuffle_epi8(a_.m128i[0], b_.m128i[0]);
  3542. r_.m128i[1] = simde_mm_shuffle_epi8(a_.m128i[1], b_.m128i[1]);
  3543. #else
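/* Byte shuffle within each 128-bit lane: if the selector byte's high bit is set
   the result byte is zero, otherwise its low four bits index a byte in the same
   lane of a (hence the "+ 16" offsets for the upper half). */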
  3544. SIMDE_VECTORIZE
  3545. for (size_t i = 0 ; i < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 2) ; i++) {
  3546. r_.u8[ i ] = (b_.u8[ i ] & 0x80) ? 0 : a_.u8[(b_.u8[ i ] & 0x0f) ];
  3547. r_.u8[i + 16] = (b_.u8[i + 16] & 0x80) ? 0 : a_.u8[(b_.u8[i + 16] & 0x0f) + 16];
  3548. }
  3549. #endif
  3550. return simde__m256i_from_private(r_);
  3551. #endif
  3552. }
  3553. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3554. #undef _mm256_shuffle_epi8
  3555. #define _mm256_shuffle_epi8(a, b) simde_mm256_shuffle_epi8(a, b)
  3556. #endif
  3557. SIMDE_FUNCTION_ATTRIBUTES
  3558. simde__m256i
  3559. simde_mm256_shuffle_epi32 (simde__m256i a, const int imm8)
  3560. SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  3561. simde__m256i_private
  3562. r_,
  3563. a_ = simde__m256i_to_private(a);
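/* The same four 2-bit selectors from imm8 are applied to each 128-bit lane
   independently: the first loop fills elements 0..3 from a's low lane, the
   second fills 4..7 from a's high lane. */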
  3564. for (size_t i = 0 ; i < ((sizeof(r_.i32) / sizeof(r_.i32[0])) / 2) ; i++) {
  3565. r_.i32[i] = a_.i32[(imm8 >> (i * 2)) & 3];
  3566. }
  3567. for (size_t i = 0 ; i < ((sizeof(r_.i32) / sizeof(r_.i32[0])) / 2) ; i++) {
  3568. r_.i32[i + 4] = a_.i32[((imm8 >> (i * 2)) & 3) + 4];
  3569. }
  3570. return simde__m256i_from_private(r_);
  3571. }
  3572. #if defined(SIMDE_X86_AVX2_NATIVE)
  3573. # define simde_mm256_shuffle_epi32(a, imm8) _mm256_shuffle_epi32(a, imm8)
  3574. #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) && !defined(__PGI)
  3575. # define simde_mm256_shuffle_epi32(a, imm8) \
  3576. simde_mm256_set_m128i( \
  3577. simde_mm_shuffle_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \
  3578. simde_mm_shuffle_epi32(simde_mm256_extracti128_si256(a, 0), (imm8)))
  3579. #elif defined(SIMDE_SHUFFLE_VECTOR_)
  3580. # define simde_mm256_shuffle_epi32(a, imm8) (__extension__ ({ \
  3581. const simde__m256i_private simde_tmp_a_ = simde__m256i_to_private(a); \
  3582. simde__m256i_from_private((simde__m256i_private) { .i32 = \
  3583. SIMDE_SHUFFLE_VECTOR_(32, 32, \
  3584. (simde_tmp_a_).i32, \
  3585. (simde_tmp_a_).i32, \
  3586. ((imm8) ) & 3, \
  3587. ((imm8) >> 2) & 3, \
  3588. ((imm8) >> 4) & 3, \
  3589. ((imm8) >> 6) & 3, \
  3590. (((imm8) ) & 3) + 4, \
  3591. (((imm8) >> 2) & 3) + 4, \
  3592. (((imm8) >> 4) & 3) + 4, \
  3593. (((imm8) >> 6) & 3) + 4) }); }))
  3594. #endif
  3595. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3596. #undef _mm256_shuffle_epi32
  3597. #define _mm256_shuffle_epi32(a, imm8) simde_mm256_shuffle_epi32(a, imm8)
  3598. #endif
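/* shufflehi_epi16 reorders the four high 16-bit elements of each 128-bit lane
   (result indices 4..7 and 12..15) according to the 2-bit fields of imm8 while
   passing the low four of each lane through unchanged; shufflelo_epi16 below is
   the mirror image, which is why the SIMDE_SHUFFLE_VECTOR_ index lists keep
   0..3 / 8..11 (respectively 4..7 / 12..15) fixed. */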
  3599. #if defined(SIMDE_X86_AVX2_NATIVE)
  3600. # define simde_mm256_shufflehi_epi16(a, imm8) _mm256_shufflehi_epi16(a, imm8)
  3601. #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  3602. # define simde_mm256_shufflehi_epi16(a, imm8) \
  3603. simde_mm256_set_m128i( \
  3604. simde_mm_shufflehi_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \
  3605. simde_mm_shufflehi_epi16(simde_mm256_extracti128_si256(a, 0), (imm8)))
  3606. #elif defined(SIMDE_SHUFFLE_VECTOR_)
  3607. # define simde_mm256_shufflehi_epi16(a, imm8) (__extension__ ({ \
  3608. const simde__m256i_private simde_tmp_a_ = simde__m256i_to_private(a); \
  3609. simde__m256i_from_private((simde__m256i_private) { .i16 = \
  3610. SIMDE_SHUFFLE_VECTOR_(16, 32, \
  3611. (simde_tmp_a_).i16, \
  3612. (simde_tmp_a_).i16, \
  3613. 0, 1, 2, 3, \
  3614. (((imm8) ) & 3) + 4, \
  3615. (((imm8) >> 2) & 3) + 4, \
  3616. (((imm8) >> 4) & 3) + 4, \
  3617. (((imm8) >> 6) & 3) + 4, \
  3618. 8, 9, 10, 11, \
  3619. ((((imm8) ) & 3) + 8 + 4), \
  3620. ((((imm8) >> 2) & 3) + 8 + 4), \
  3621. ((((imm8) >> 4) & 3) + 8 + 4), \
  3622. ((((imm8) >> 6) & 3) + 8 + 4) \
  3623. ) }); }))
  3624. #else
  3625. # define simde_mm256_shufflehi_epi16(a, imm8) \
  3626. simde_mm256_set_m128i( \
  3627. simde_mm_shufflehi_epi16(simde_mm256_extracti128_si256(a, 1), imm8), \
  3628. simde_mm_shufflehi_epi16(simde_mm256_extracti128_si256(a, 0), imm8))
  3629. #endif
  3630. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3631. #undef _mm256_shufflehi_epi16
  3632. #define _mm256_shufflehi_epi16(a, imm8) simde_mm256_shufflehi_epi16(a, imm8)
  3633. #endif
  3634. #if defined(SIMDE_X86_AVX2_NATIVE)
  3635. # define simde_mm256_shufflelo_epi16(a, imm8) _mm256_shufflelo_epi16(a, imm8)
  3636. #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  3637. # define simde_mm256_shufflelo_epi16(a, imm8) \
  3638. simde_mm256_set_m128i( \
  3639. simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \
  3640. simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 0), (imm8)))
  3641. #elif defined(SIMDE_SHUFFLE_VECTOR_)
  3642. # define simde_mm256_shufflelo_epi16(a, imm8) (__extension__ ({ \
  3643. const simde__m256i_private simde_tmp_a_ = simde__m256i_to_private(a); \
  3644. simde__m256i_from_private((simde__m256i_private) { .i16 = \
  3645. SIMDE_SHUFFLE_VECTOR_(16, 32, \
  3646. (simde_tmp_a_).i16, \
  3647. (simde_tmp_a_).i16, \
  3648. (((imm8) ) & 3), \
  3649. (((imm8) >> 2) & 3), \
  3650. (((imm8) >> 4) & 3), \
  3651. (((imm8) >> 6) & 3), \
  3652. 4, 5, 6, 7, \
  3653. ((((imm8) ) & 3) + 8), \
  3654. ((((imm8) >> 2) & 3) + 8), \
  3655. ((((imm8) >> 4) & 3) + 8), \
  3656. ((((imm8) >> 6) & 3) + 8), \
  3657. 12, 13, 14, 15) }); }))
  3658. #else
  3659. # define simde_mm256_shufflelo_epi16(a, imm8) \
  3660. simde_mm256_set_m128i( \
  3661. simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 1), imm8), \
  3662. simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 0), imm8))
  3663. #endif
  3664. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3665. #undef _mm256_shufflelo_epi16
  3666. #define _mm256_shufflelo_epi16(a, imm8) simde_mm256_shufflelo_epi16(a, imm8)
  3667. #endif
  3668. SIMDE_FUNCTION_ATTRIBUTES
  3669. simde__m256i
  3670. simde_mm256_sign_epi8 (simde__m256i a, simde__m256i b) {
  3671. #if defined(SIMDE_X86_AVX2_NATIVE)
  3672. return _mm256_sign_epi8(a, b);
  3673. #else
  3674. simde__m256i_private
  3675. r_,
  3676. a_ = simde__m256i_to_private(a),
  3677. b_ = simde__m256i_to_private(b);
  3678. SIMDE_VECTORIZE
  3679. for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
  3680. r_.i8[i] = (b_.i8[i] == INT8_C(0)) ? INT8_C(0) : (b_.i8[i] < INT8_C(0)) ? -a_.i8[i] : a_.i8[i];
  3681. }
  3682. return simde__m256i_from_private(r_);
  3683. #endif
  3684. }
  3685. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3686. #undef _mm256_sign_epi8
  3687. #define _mm256_sign_epi8(a, b) simde_mm256_sign_epi8(a, b)
  3688. #endif
  3689. SIMDE_FUNCTION_ATTRIBUTES
  3690. simde__m256i
  3691. simde_mm256_sign_epi16 (simde__m256i a, simde__m256i b) {
  3692. #if defined(SIMDE_X86_AVX2_NATIVE)
  3693. return _mm256_sign_epi16(a, b);
  3694. #else
  3695. simde__m256i_private
  3696. r_,
  3697. a_ = simde__m256i_to_private(a),
  3698. b_ = simde__m256i_to_private(b);
  3699. SIMDE_VECTORIZE
  3700. for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
  3701. r_.i16[i] = (b_.i16[i] == INT16_C(0)) ? INT16_C(0) : (b_.i16[i] < INT16_C(0)) ? -a_.i16[i] : a_.i16[i];
  3702. }
  3703. return simde__m256i_from_private(r_);
  3704. #endif
  3705. }
  3706. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3707. #undef _mm256_sign_epi16
  3708. #define _mm256_sign_epi16(a, b) simde_mm256_sign_epi16(a, b)
  3709. #endif
  3710. SIMDE_FUNCTION_ATTRIBUTES
  3711. simde__m256i
  3712. simde_mm256_sign_epi32(simde__m256i a, simde__m256i b) {
  3713. #if defined(SIMDE_X86_AVX2_NATIVE)
  3714. return _mm256_sign_epi32(a, b);
  3715. #else
  3716. simde__m256i_private
  3717. r_,
  3718. a_ = simde__m256i_to_private(a),
  3719. b_ = simde__m256i_to_private(b);
  3720. SIMDE_VECTORIZE
  3721. for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
  3722. r_.i32[i] = (b_.i32[i] == INT32_C(0)) ? INT32_C(0) : (b_.i32[i] < INT32_C(0)) ? -a_.i32[i] : a_.i32[i];
  3723. }
  3724. return simde__m256i_from_private(r_);
  3725. #endif
  3726. }
  3727. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3728. #undef _mm256_sign_epi32
  3729. #define _mm256_sign_epi32(a, b) simde_mm256_sign_epi32(a, b)
  3730. #endif
  3731. SIMDE_FUNCTION_ATTRIBUTES
  3732. simde__m256i
  3733. simde_mm256_sll_epi16 (simde__m256i a, simde__m128i count) {
  3734. #if defined(SIMDE_X86_AVX2_NATIVE)
  3735. return _mm256_sll_epi16(a, count);
  3736. #else
  3737. simde__m256i_private
  3738. r_,
  3739. a_ = simde__m256i_to_private(a);
  3740. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  3741. r_.m128i[0] = simde_mm_sll_epi16(a_.m128i[0], count);
  3742. r_.m128i[1] = simde_mm_sll_epi16(a_.m128i[1], count);
  3743. #else
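/* The shift count is taken from the low 64 bits of "count"; any count above 15
   zeroes every element, matching what VPSLLW does for out-of-range counts. */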
  3744. simde__m128i_private
  3745. count_ = simde__m128i_to_private(count);
  3746. uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]);
  3747. if (shift > 15)
  3748. return simde_mm256_setzero_si256();
  3749. #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  3750. r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, shift);
  3751. #else
  3752. SIMDE_VECTORIZE
  3753. for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
  3754. r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << (shift));
  3755. }
  3756. #endif
  3757. #endif
  3758. return simde__m256i_from_private(r_);
  3759. #endif
  3760. }
  3761. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3762. #undef _mm256_sll_epi16
  3763. #define _mm256_sll_epi16(a, count) simde_mm256_sll_epi16(a, count)
  3764. #endif
  3765. SIMDE_FUNCTION_ATTRIBUTES
  3766. simde__m256i
  3767. simde_mm256_sll_epi32 (simde__m256i a, simde__m128i count) {
  3768. #if defined(SIMDE_X86_AVX2_NATIVE)
  3769. return _mm256_sll_epi32(a, count);
  3770. #else
  3771. simde__m256i_private
  3772. r_,
  3773. a_ = simde__m256i_to_private(a);
  3774. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  3775. r_.m128i[0] = simde_mm_sll_epi32(a_.m128i[0], count);
  3776. r_.m128i[1] = simde_mm_sll_epi32(a_.m128i[1], count);
  3777. #else
  3778. simde__m128i_private
  3779. count_ = simde__m128i_to_private(count);
  3780. uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]);
  3781. if (shift > 31)
  3782. return simde_mm256_setzero_si256();
  3783. #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  3784. r_.i32 = a_.i32 << HEDLEY_STATIC_CAST(int32_t, shift);
  3785. #else
  3786. SIMDE_VECTORIZE
  3787. for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
  3788. r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.i32[i] << (shift));
  3789. }
  3790. #endif
  3791. #endif
  3792. return simde__m256i_from_private(r_);
  3793. #endif
  3794. }
  3795. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3796. #undef _mm256_sll_epi32
  3797. #define _mm256_sll_epi32(a, count) simde_mm256_sll_epi32(a, count)
  3798. #endif
  3799. SIMDE_FUNCTION_ATTRIBUTES
  3800. simde__m256i
  3801. simde_mm256_sll_epi64 (simde__m256i a, simde__m128i count) {
  3802. #if defined(SIMDE_X86_AVX2_NATIVE)
  3803. return _mm256_sll_epi64(a, count);
  3804. #else
  3805. simde__m256i_private
  3806. r_,
  3807. a_ = simde__m256i_to_private(a);
  3808. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  3809. r_.m128i[0] = simde_mm_sll_epi64(a_.m128i[0], count);
  3810. r_.m128i[1] = simde_mm_sll_epi64(a_.m128i[1], count);
  3811. #else
  3812. simde__m128i_private
  3813. count_ = simde__m128i_to_private(count);
  3814. uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]);
  3815. if (shift > 63)
  3816. return simde_mm256_setzero_si256();
  3817. #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  3818. r_.i64 = a_.i64 << HEDLEY_STATIC_CAST(int64_t, shift);
  3819. #else
  3820. SIMDE_VECTORIZE
  3821. for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
  3822. r_.i64[i] = HEDLEY_STATIC_CAST(int64_t, a_.i64[i] << (shift));
  3823. }
  3824. #endif
  3825. #endif
  3826. return simde__m256i_from_private(r_);
  3827. #endif
  3828. }
  3829. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3830. #undef _mm256_sll_epi64
  3831. #define _mm256_sll_epi64(a, count) simde_mm256_sll_epi64(a, count)
  3832. #endif
  3833. SIMDE_FUNCTION_ATTRIBUTES
  3834. simde__m256i
  3835. simde_mm256_slli_epi16 (simde__m256i a, const int imm8)
  3836. SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
3837. /* Note: There is no consistency in how compilers handle values outside of
3838.    the expected range, hence the discrepancy between what we allow and what
3839.    Intel specifies.  Some compilers will return 0, others seem to just mask
3840.    off everything outside of the range. */
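/* Concretely: for imm8 == 17 the AVX2 instruction (VPSLLW) returns all zeros,
   and the scalar loop below also yields zeros because the int-promoted value is
   shifted and then truncated back to 16 bits, but the vector-extension path
   shifts 16-bit lanes by 17 directly, which C leaves undefined, hence the
   compiler-to-compiler differences described above. */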
  3841. simde__m256i_private
  3842. r_,
  3843. a_ = simde__m256i_to_private(a);
  3844. #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  3845. SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) sv = vec_splats(HEDLEY_STATIC_CAST(unsigned short, imm8));
  3846. for (size_t i = 0 ; i < (sizeof(a_.altivec_i16) / sizeof(a_.altivec_i16[0])) ; i++) {
  3847. r_.altivec_i16[i] = vec_sl(a_.altivec_i16[i], sv);
  3848. }
  3849. #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  3850. r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, imm8);
  3851. #else
  3852. SIMDE_VECTORIZE
  3853. for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
  3854. r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << (imm8 & 0xff));
  3855. }
  3856. #endif
  3857. return simde__m256i_from_private(r_);
  3858. }
  3859. #if defined(SIMDE_X86_AVX2_NATIVE)
  3860. # define simde_mm256_slli_epi16(a, imm8) _mm256_slli_epi16(a, imm8)
  3861. #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  3862. # define simde_mm256_slli_epi16(a, imm8) \
  3863. simde_mm256_set_m128i( \
  3864. simde_mm_slli_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \
  3865. simde_mm_slli_epi16(simde_mm256_extracti128_si256(a, 0), (imm8)))
  3866. #endif
  3867. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3868. #undef _mm256_slli_epi16
  3869. #define _mm256_slli_epi16(a, imm8) simde_mm256_slli_epi16(a, imm8)
  3870. #endif
  3871. SIMDE_FUNCTION_ATTRIBUTES
  3872. simde__m256i
  3873. simde_mm256_slli_epi32 (simde__m256i a, const int imm8)
  3874. SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  3875. simde__m256i_private
  3876. r_,
  3877. a_ = simde__m256i_to_private(a);
  3878. #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  3879. SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) sv = vec_splats(HEDLEY_STATIC_CAST(unsigned int, imm8));
  3880. for (size_t i = 0 ; i < (sizeof(a_.altivec_i32) / sizeof(a_.altivec_i32[0])) ; i++) {
  3881. r_.altivec_i32[i] = vec_sl(a_.altivec_i32[i], sv);
  3882. }
  3883. #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  3884. r_.i32 = a_.i32 << HEDLEY_STATIC_CAST(int32_t, imm8);
  3885. #else
  3886. SIMDE_VECTORIZE
  3887. for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
  3888. r_.i32[i] = a_.i32[i] << (imm8 & 0xff);
  3889. }
  3890. #endif
  3891. return simde__m256i_from_private(r_);
  3892. }
  3893. #if defined(SIMDE_X86_AVX2_NATIVE)
  3894. # define simde_mm256_slli_epi32(a, imm8) _mm256_slli_epi32(a, imm8)
  3895. #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  3896. # define simde_mm256_slli_epi32(a, imm8) \
  3897. simde_mm256_set_m128i( \
  3898. simde_mm_slli_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \
  3899. simde_mm_slli_epi32(simde_mm256_extracti128_si256(a, 0), (imm8)))
  3900. #endif
  3901. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3902. #undef _mm256_slli_epi32
  3903. #define _mm256_slli_epi32(a, imm8) simde_mm256_slli_epi32(a, imm8)
  3904. #endif
  3905. SIMDE_FUNCTION_ATTRIBUTES
  3906. simde__m256i
  3907. simde_mm256_slli_epi64 (simde__m256i a, const int imm8)
  3908. SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  3909. simde__m256i_private
  3910. r_,
  3911. a_ = simde__m256i_to_private(a);
  3912. #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  3913. r_.i64 = a_.i64 << HEDLEY_STATIC_CAST(int64_t, imm8);
  3914. #else
  3915. SIMDE_VECTORIZE
  3916. for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
  3917. r_.i64[i] = a_.i64[i] << (imm8 & 0xff);
  3918. }
  3919. #endif
  3920. return simde__m256i_from_private(r_);
  3921. }
  3922. #if defined(SIMDE_X86_AVX2_NATIVE)
  3923. # define simde_mm256_slli_epi64(a, imm8) _mm256_slli_epi64(a, imm8)
  3924. #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  3925. # define simde_mm256_slli_epi64(a, imm8) \
  3926. simde_mm256_set_m128i( \
  3927. simde_mm_slli_epi64(simde_mm256_extracti128_si256(a, 1), (imm8)), \
  3928. simde_mm_slli_epi64(simde_mm256_extracti128_si256(a, 0), (imm8)))
  3929. #endif
  3930. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3931. #undef _mm256_slli_epi64
  3932. #define _mm256_slli_epi64(a, imm8) simde_mm256_slli_epi64(a, imm8)
  3933. #endif
  3934. SIMDE_FUNCTION_ATTRIBUTES
  3935. simde__m256i
  3936. simde_mm256_slli_si256 (simde__m256i a, const int imm8)
  3937. SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  3938. simde__m256i_private
  3939. r_,
  3940. a_ = simde__m256i_to_private(a);
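/* The 256-bit byte shift is really two independent 128-bit byte shifts, one per
   lane, which is why the outer loop walks r_.m128i_private; bytes shifted in
   from outside a lane are zero, matching VPSLLDQ. */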
  3941. for (size_t h = 0 ; h < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; h++) {
  3942. SIMDE_VECTORIZE
  3943. for (size_t i = 0 ; i < (sizeof(r_.m128i_private[h].i8) / sizeof(r_.m128i_private[h].i8[0])) ; i++) {
  3944. const int e = HEDLEY_STATIC_CAST(int, i) - imm8;
  3945. r_.m128i_private[h].i8[i] = (e >= 0) ? a_.m128i_private[h].i8[e] : 0;
  3946. }
  3947. }
  3948. return simde__m256i_from_private(r_);
  3949. }
  3950. #if defined(SIMDE_X86_AVX2_NATIVE)
  3951. # define simde_mm256_slli_si256(a, imm8) _mm256_slli_si256(a, imm8)
  3952. #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) && !defined(__PGI)
  3953. # define simde_mm256_slli_si256(a, imm8) \
  3954. simde_mm256_set_m128i( \
  3955. simde_mm_slli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \
  3956. simde_mm_slli_si128(simde_mm256_extracti128_si256(a, 0), (imm8)))
  3957. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3958. # define simde_mm256_slli_si256(a, imm8) \
  3959. simde_mm256_set_m128i( \
  3960. simde_mm_bslli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \
  3961. simde_mm_bslli_si128(simde_mm256_extracti128_si256(a, 0), (imm8)))
  3962. #endif
  3963. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3964. #undef _mm256_slli_si256
  3965. #define _mm256_slli_si256(a, imm8) simde_mm256_slli_si256(a, imm8)
  3966. #endif
  3967. SIMDE_FUNCTION_ATTRIBUTES
  3968. simde__m128i
  3969. simde_mm_sllv_epi32 (simde__m128i a, simde__m128i b) {
  3970. simde__m128i_private
  3971. a_ = simde__m128i_to_private(a),
  3972. b_ = simde__m128i_to_private(b),
  3973. r_;
  3974. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
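/* vshlq_u32 shifts each lane by the signed count held in the low byte of the
   corresponding lane of the (reinterpreted) count vector, and negative counts
   shift right, so counts of 32 or more do not reliably produce zero on their
   own; the vcltq_u32/vandq_u32 mask forces those lanes to zero to match
   VPSLLVD. */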
  3975. r_.neon_u32 = vshlq_u32(a_.neon_u32, vreinterpretq_s32_u32(b_.neon_u32));
  3976. r_.neon_u32 = vandq_u32(r_.neon_u32, vcltq_u32(b_.neon_u32, vdupq_n_u32(32)));
  3977. #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  3978. r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (b_.u32 < UINT32_C(32))) & (a_.u32 << b_.u32);
  3979. #else
  3980. SIMDE_VECTORIZE
  3981. for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
  3982. r_.u32[i] = (b_.u32[i] < 32) ? (a_.u32[i] << b_.u32[i]) : 0;
  3983. }
  3984. #endif
  3985. return simde__m128i_from_private(r_);
  3986. }
  3987. #if defined(SIMDE_X86_AVX2_NATIVE)
  3988. #define simde_mm_sllv_epi32(a, b) _mm_sllv_epi32(a, b)
  3989. #endif
  3990. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  3991. #undef _mm_sllv_epi32
  3992. #define _mm_sllv_epi32(a, b) simde_mm_sllv_epi32(a, b)
  3993. #endif
  3994. SIMDE_FUNCTION_ATTRIBUTES
  3995. simde__m256i
  3996. simde_mm256_sllv_epi32 (simde__m256i a, simde__m256i b) {
  3997. simde__m256i_private
  3998. a_ = simde__m256i_to_private(a),
  3999. b_ = simde__m256i_to_private(b),
  4000. r_;
  4001. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  4002. r_.m128i[0] = simde_mm_sllv_epi32(a_.m128i[0], b_.m128i[0]);
  4003. r_.m128i[1] = simde_mm_sllv_epi32(a_.m128i[1], b_.m128i[1]);
  4004. #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  4005. r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (b_.u32 < 32)) & (a_.u32 << b_.u32);
  4006. #else
  4007. SIMDE_VECTORIZE
  4008. for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
  4009. r_.u32[i] = (b_.u32[i] < 32) ? (a_.u32[i] << b_.u32[i]) : 0;
  4010. }
  4011. #endif
  4012. return simde__m256i_from_private(r_);
  4013. }
  4014. #if defined(SIMDE_X86_AVX2_NATIVE)
  4015. #define simde_mm256_sllv_epi32(a, b) _mm256_sllv_epi32(a, b)
  4016. #endif
  4017. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  4018. #undef _mm256_sllv_epi32
  4019. #define _mm256_sllv_epi32(a, b) simde_mm256_sllv_epi32(a, b)
  4020. #endif
  4021. SIMDE_FUNCTION_ATTRIBUTES
  4022. simde__m128i
  4023. simde_mm_sllv_epi64 (simde__m128i a, simde__m128i b) {
  4024. simde__m128i_private
  4025. a_ = simde__m128i_to_private(a),
  4026. b_ = simde__m128i_to_private(b),
  4027. r_;
  4028. #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  4029. r_.neon_u64 = vshlq_u64(a_.neon_u64, vreinterpretq_s64_u64(b_.neon_u64));
  4030. r_.neon_u64 = vandq_u64(r_.neon_u64, vcltq_u64(b_.neon_u64, vdupq_n_u64(64)));
  4031. #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  4032. r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (b_.u64 < 64)) & (a_.u64 << b_.u64);
  4033. #else
  4034. SIMDE_VECTORIZE
  4035. for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
  4036. r_.u64[i] = (b_.u64[i] < 64) ? (a_.u64[i] << b_.u64[i]) : 0;
  4037. }
  4038. #endif
  4039. return simde__m128i_from_private(r_);
  4040. }
  4041. #if defined(SIMDE_X86_AVX2_NATIVE)
  4042. #define simde_mm_sllv_epi64(a, b) _mm_sllv_epi64(a, b)
  4043. #endif
  4044. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  4045. #undef _mm_sllv_epi64
  4046. #define _mm_sllv_epi64(a, b) simde_mm_sllv_epi64(a, b)
  4047. #endif
  4048. SIMDE_FUNCTION_ATTRIBUTES
  4049. simde__m256i
  4050. simde_mm256_sllv_epi64 (simde__m256i a, simde__m256i b) {
  4051. simde__m256i_private
  4052. a_ = simde__m256i_to_private(a),
  4053. b_ = simde__m256i_to_private(b),
  4054. r_;
  4055. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  4056. r_.m128i[0] = simde_mm_sllv_epi64(a_.m128i[0], b_.m128i[0]);
  4057. r_.m128i[1] = simde_mm_sllv_epi64(a_.m128i[1], b_.m128i[1]);
  4058. #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  4059. r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (b_.u64 < 64)) & (a_.u64 << b_.u64);
  4060. #else
  4061. SIMDE_VECTORIZE
  4062. for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
  4063. r_.u64[i] = (b_.u64[i] < 64) ? (a_.u64[i] << b_.u64[i]) : 0;
  4064. }
  4065. #endif
  4066. return simde__m256i_from_private(r_);
  4067. }
  4068. #if defined(SIMDE_X86_AVX2_NATIVE)
  4069. #define simde_mm256_sllv_epi64(a, b) _mm256_sllv_epi64(a, b)
  4070. #endif
  4071. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  4072. #undef _mm256_sllv_epi64
  4073. #define _mm256_sllv_epi64(a, b) simde_mm256_sllv_epi64(a, b)
  4074. #endif
  4075. SIMDE_FUNCTION_ATTRIBUTES
  4076. simde__m256i
  4077. simde_mm256_sra_epi16 (simde__m256i a, simde__m128i count) {
  4078. #if defined(SIMDE_X86_AVX2_NATIVE)
  4079. return _mm256_sra_epi16(a, count);
  4080. #else
  4081. simde__m256i_private
  4082. r_,
  4083. a_ = simde__m256i_to_private(a);
  4084. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  4085. r_.m128i[0] = simde_mm_sra_epi16(a_.m128i[0], count);
  4086. r_.m128i[1] = simde_mm_sra_epi16(a_.m128i[1], count);
  4087. #else
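/* For arithmetic shifts the count saturates at 15 instead of zeroing the
   result, so an oversized count leaves each element filled with copies of its
   sign bit, as VPSRAW does. */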
  4088. simde__m128i_private
  4089. count_ = simde__m128i_to_private(count);
  4090. uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]);
  4091. if (shift > 15) shift = 15;
  4092. #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  4093. r_.i16 = a_.i16 >> HEDLEY_STATIC_CAST(int16_t, shift);
  4094. #else
  4095. SIMDE_VECTORIZE
  4096. for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
  4097. r_.i16[i] = a_.i16[i] >> shift;
  4098. }
  4099. #endif
  4100. #endif
  4101. return simde__m256i_from_private(r_);
  4102. #endif
  4103. }
  4104. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  4105. #undef _mm256_sra_epi16
  4106. #define _mm256_sra_epi16(a, count) simde_mm256_sra_epi16(a, count)
  4107. #endif
  4108. SIMDE_FUNCTION_ATTRIBUTES
  4109. simde__m256i
  4110. simde_mm256_sra_epi32 (simde__m256i a, simde__m128i count) {
  4111. #if defined(SIMDE_X86_AVX2_NATIVE)
  4112. return _mm256_sra_epi32(a, count);
  4113. #else
  4114. simde__m256i_private
  4115. r_,
  4116. a_ = simde__m256i_to_private(a);
  4117. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  4118. r_.m128i[0] = simde_mm_sra_epi32(a_.m128i[0], count);
  4119. r_.m128i[1] = simde_mm_sra_epi32(a_.m128i[1], count);
  4120. #else
  4121. simde__m128i_private
  4122. count_ = simde__m128i_to_private(count);
  4123. uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]);
  4124. if (shift > 31) shift = 31;
  4125. #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  4126. r_.i32 = a_.i32 >> HEDLEY_STATIC_CAST(int16_t, shift);
  4127. #else
  4128. SIMDE_VECTORIZE
  4129. for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
  4130. r_.i32[i] = a_.i32[i] >> shift;
  4131. }
  4132. #endif
  4133. #endif
  4134. return simde__m256i_from_private(r_);
  4135. #endif
  4136. }
  4137. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  4138. #undef _mm256_sra_epi32
  4139. #define _mm256_sra_epi32(a, count) simde_mm256_sra_epi32(a, count)
  4140. #endif
  4141. SIMDE_FUNCTION_ATTRIBUTES
  4142. simde__m256i
  4143. simde_mm256_srai_epi16 (simde__m256i a, const int imm8)
  4144. SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  4145. simde__m256i_private
  4146. r_,
  4147. a_ = simde__m256i_to_private(a);
  4148. unsigned int shift = HEDLEY_STATIC_CAST(unsigned int, imm8);
  4149. if (shift > 15) shift = 15;
  4150. #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  4151. r_.i16 = a_.i16 >> HEDLEY_STATIC_CAST(int16_t, shift);
  4152. #else
  4153. SIMDE_VECTORIZE
  4154. for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
  4155. r_.i16[i] = a_.i16[i] >> shift;
  4156. }
  4157. #endif
  4158. return simde__m256i_from_private(r_);
  4159. }
  4160. #if defined(SIMDE_X86_AVX2_NATIVE)
  4161. # define simde_mm256_srai_epi16(a, imm8) _mm256_srai_epi16(a, imm8)
  4162. #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  4163. # define simde_mm256_srai_epi16(a, imm8) \
  4164. simde_mm256_set_m128i( \
  4165. simde_mm_srai_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \
  4166. simde_mm_srai_epi16(simde_mm256_extracti128_si256(a, 0), (imm8)))
  4167. #endif
  4168. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  4169. #undef _mm256_srai_epi16
  4170. #define _mm256_srai_epi16(a, imm8) simde_mm256_srai_epi16(a, imm8)
  4171. #endif
  4172. SIMDE_FUNCTION_ATTRIBUTES
  4173. simde__m256i
  4174. simde_mm256_srai_epi32 (simde__m256i a, const int imm8)
  4175. SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  4176. simde__m256i_private
  4177. r_,
  4178. a_ = simde__m256i_to_private(a);
  4179. unsigned int shift = HEDLEY_STATIC_CAST(unsigned int, imm8);
  4180. if (shift > 31) shift = 31;
  4181. #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  4182. r_.i32 = a_.i32 >> HEDLEY_STATIC_CAST(int16_t, shift);
  4183. #else
  4184. SIMDE_VECTORIZE
  4185. for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
  4186. r_.i32[i] = a_.i32[i] >> shift;
  4187. }
  4188. #endif
  4189. return simde__m256i_from_private(r_);
  4190. }
  4191. #if defined(SIMDE_X86_AVX2_NATIVE)
  4192. # define simde_mm256_srai_epi32(a, imm8) _mm256_srai_epi32(a, imm8)
  4193. #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  4194. # define simde_mm256_srai_epi32(a, imm8) \
  4195. simde_mm256_set_m128i( \
  4196. simde_mm_srai_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \
  4197. simde_mm_srai_epi32(simde_mm256_extracti128_si256(a, 0), (imm8)))
  4198. #endif
  4199. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  4200. #undef _mm256_srai_epi32
  4201. #define _mm256_srai_epi32(a, imm8) simde_mm256_srai_epi32(a, imm8)
  4202. #endif
  4203. SIMDE_FUNCTION_ATTRIBUTES
  4204. simde__m128i
  4205. simde_mm_srav_epi32 (simde__m128i a, simde__m128i count) {
  4206. #if defined(SIMDE_X86_AVX2_NATIVE)
  4207. return _mm_srav_epi32(a, count);
  4208. #else
  4209. simde__m128i_private
  4210. r_,
  4211. a_ = simde__m128i_to_private(a),
  4212. count_ = simde__m128i_to_private(count);
  4213. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
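/* NEON has no arithmetic right shift by a per-lane vector count, so the count
   is clamped to 31, negated, and passed to vshlq_s32: a negative count shifts
   right, and the clamp matches VPSRAVD, which sign-fills for counts above 31. */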
  4214. int32x4_t cnt = vreinterpretq_s32_u32(vminq_u32(count_.neon_u32, vdupq_n_u32(31)));
  4215. r_.neon_i32 = vshlq_s32(a_.neon_i32, vnegq_s32(cnt));
  4216. #else
  4217. SIMDE_VECTORIZE
  4218. for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
  4219. uint32_t shift = HEDLEY_STATIC_CAST(uint32_t, count_.i32[i]);
  4220. r_.i32[i] = a_.i32[i] >> HEDLEY_STATIC_CAST(int, shift > 31 ? 31 : shift);
  4221. }
  4222. #endif
  4223. return simde__m128i_from_private(r_);
  4224. #endif
  4225. }
  4226. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  4227. #undef _mm_srav_epi32
  4228. #define _mm_srav_epi32(a, count) simde_mm_srav_epi32(a, count)
  4229. #endif
  4230. SIMDE_FUNCTION_ATTRIBUTES
  4231. simde__m256i
  4232. simde_mm256_srav_epi32 (simde__m256i a, simde__m256i count) {
  4233. #if defined(SIMDE_X86_AVX2_NATIVE)
  4234. return _mm256_srav_epi32(a, count);
  4235. #else
  4236. simde__m256i_private
  4237. r_,
  4238. a_ = simde__m256i_to_private(a),
  4239. count_ = simde__m256i_to_private(count);
  4240. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  4241. r_.m128i[0] = simde_mm_srav_epi32(a_.m128i[0], count_.m128i[0]);
  4242. r_.m128i[1] = simde_mm_srav_epi32(a_.m128i[1], count_.m128i[1]);
  4243. #else
  4244. SIMDE_VECTORIZE
  4245. for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
  4246. uint32_t shift = HEDLEY_STATIC_CAST(uint32_t, count_.i32[i]);
  4247. if (shift > 31) shift = 31;
  4248. r_.i32[i] = a_.i32[i] >> shift;
  4249. }
  4250. #endif
  4251. return simde__m256i_from_private(r_);
  4252. #endif
  4253. }
  4254. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  4255. #undef _mm256_srav_epi32
  4256. #define _mm256_srav_epi32(a, count) simde_mm256_srav_epi32(a, count)
  4257. #endif
  4258. SIMDE_FUNCTION_ATTRIBUTES
  4259. simde__m256i
  4260. simde_mm256_srl_epi16 (simde__m256i a, simde__m128i count) {
  4261. #if defined(SIMDE_X86_AVX2_NATIVE)
  4262. return _mm256_srl_epi16(a, count);
  4263. #else
  4264. simde__m256i_private
  4265. r_,
  4266. a_ = simde__m256i_to_private(a);
  4267. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  4268. r_.m128i[0] = simde_mm_srl_epi16(a_.m128i[0], count);
  4269. r_.m128i[1] = simde_mm_srl_epi16(a_.m128i[1], count);
  4270. #else
  4271. simde__m128i_private
  4272. count_ = simde__m128i_to_private(count);
  4273. uint64_t shift = HEDLEY_STATIC_CAST(uint64_t , (count_.i64[0] > 16 ? 16 : count_.i64[0]));
  4274. #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  4275. r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, shift);
  4276. #else
  4277. SIMDE_VECTORIZE
  4278. for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
  4279. r_.u16[i] = a_.u16[i] >> (shift);
  4280. }
  4281. #endif
  4282. #endif
  4283. return simde__m256i_from_private(r_);
  4284. #endif
  4285. }
  4286. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  4287. #undef _mm256_srl_epi16
  4288. #define _mm256_srl_epi16(a, count) simde_mm256_srl_epi16(a, count)
  4289. #endif
  4290. SIMDE_FUNCTION_ATTRIBUTES
  4291. simde__m256i
  4292. simde_mm256_srl_epi32 (simde__m256i a, simde__m128i count) {
  4293. #if defined(SIMDE_X86_AVX2_NATIVE)
  4294. return _mm256_srl_epi32(a, count);
  4295. #else
  4296. simde__m256i_private
  4297. r_,
  4298. a_ = simde__m256i_to_private(a);
  4299. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  4300. r_.m128i[0] = simde_mm_srl_epi32(a_.m128i[0], count);
  4301. r_.m128i[1] = simde_mm_srl_epi32(a_.m128i[1], count);
  4302. #else
  4303. simde__m128i_private
  4304. count_ = simde__m128i_to_private(count);
  4305. uint64_t shift = HEDLEY_STATIC_CAST(uint64_t , (count_.i64[0] > 32 ? 32 : count_.i64[0]));
  4306. #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  4307. r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(32, shift);
  4308. #else
  4309. SIMDE_VECTORIZE
  4310. for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
  4311. r_.u32[i] = a_.u32[i] >> (shift);
  4312. }
  4313. #endif
  4314. #endif
  4315. return simde__m256i_from_private(r_);
  4316. #endif
  4317. }
  4318. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  4319. #undef _mm256_srl_epi32
  4320. #define _mm256_srl_epi32(a, count) simde_mm256_srl_epi32(a, count)
  4321. #endif
  4322. SIMDE_FUNCTION_ATTRIBUTES
  4323. simde__m256i
  4324. simde_mm256_srl_epi64 (simde__m256i a, simde__m128i count) {
  4325. #if defined(SIMDE_X86_AVX2_NATIVE)
  4326. return _mm256_srl_epi64(a, count);
  4327. #else
  4328. simde__m256i_private
  4329. r_,
  4330. a_ = simde__m256i_to_private(a);
  4331. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  4332. r_.m128i[0] = simde_mm_srl_epi64(a_.m128i[0], count);
  4333. r_.m128i[1] = simde_mm_srl_epi64(a_.m128i[1], count);
  4334. #else
  4335. simde__m128i_private
  4336. count_ = simde__m128i_to_private(count);
  4337. uint64_t shift = HEDLEY_STATIC_CAST(uint64_t , (count_.i64[0] > 64 ? 64 : count_.i64[0]));
  4338. #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  4339. r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(64, shift);
  4340. #else
  4341. SIMDE_VECTORIZE
  4342. for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
  4343. r_.u64[i] = a_.u64[i] >> (shift);
  4344. }
  4345. #endif
  4346. #endif
  4347. return simde__m256i_from_private(r_);
  4348. #endif
  4349. }
  4350. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  4351. #undef _mm256_srl_epi64
  4352. #define _mm256_srl_epi64(a, count) simde_mm256_srl_epi64(a, count)
  4353. #endif
  4354. SIMDE_FUNCTION_ATTRIBUTES
  4355. simde__m256i
  4356. simde_mm256_srli_epi16 (simde__m256i a, const int imm8)
  4357. SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  4358. simde__m256i_private
  4359. r_,
  4360. a_ = simde__m256i_to_private(a);
  4361. if (imm8 > 15)
  4362. return simde_mm256_setzero_si256();
  4363. #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  4364. SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) sv = vec_splats(HEDLEY_STATIC_CAST(unsigned short, imm8));
  4365. for (size_t i = 0 ; i < (sizeof(a_.altivec_u16) / sizeof(a_.altivec_u16[0])) ; i++) {
  4366. r_.altivec_u16[i] = vec_sr(a_.altivec_u16[i], sv);
  4367. }
  4368. #else
  4369. if (HEDLEY_STATIC_CAST(unsigned int, imm8) > 15) {
  4370. simde_memset(&r_, 0, sizeof(r_));
  4371. } else {
  4372. #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  4373. r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, imm8);
  4374. #else
  4375. SIMDE_VECTORIZE
  4376. for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
  4377. r_.u16[i] = a_.u16[i] >> imm8;
  4378. }
  4379. #endif
  4380. }
  4381. #endif
  4382. return simde__m256i_from_private(r_);
  4383. }
  4384. #if defined(SIMDE_X86_AVX2_NATIVE)
  4385. # define simde_mm256_srli_epi16(a, imm8) _mm256_srli_epi16(a, imm8)
  4386. #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  4387. # define simde_mm256_srli_epi16(a, imm8) \
  4388. simde_mm256_set_m128i( \
  4389. simde_mm_srli_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \
  4390. simde_mm_srli_epi16(simde_mm256_extracti128_si256(a, 0), (imm8)))
  4391. #endif
  4392. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  4393. #undef _mm256_srli_epi16
  4394. #define _mm256_srli_epi16(a, imm8) simde_mm256_srli_epi16(a, imm8)
  4395. #endif
  4396. SIMDE_FUNCTION_ATTRIBUTES
  4397. simde__m256i
  4398. simde_mm256_srli_epi32 (simde__m256i a, const int imm8)
  4399. SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  4400. simde__m256i_private
  4401. r_,
  4402. a_ = simde__m256i_to_private(a);
  4403. #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  4404. SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) sv = vec_splats(HEDLEY_STATIC_CAST(unsigned int, imm8));
  4405. for (size_t i = 0 ; i < (sizeof(a_.altivec_u32) / sizeof(a_.altivec_u32[0])) ; i++) {
  4406. r_.altivec_u32[i] = vec_sr(a_.altivec_u32[i], sv);
  4407. }
  4408. #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  4409. r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, imm8);
  4410. #else
  4411. SIMDE_VECTORIZE
  4412. for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
  4413. r_.u32[i] = a_.u32[i] >> imm8;
  4414. }
  4415. #endif
  4416. return simde__m256i_from_private(r_);
  4417. }
  4418. #if defined(SIMDE_X86_AVX2_NATIVE)
  4419. # define simde_mm256_srli_epi32(a, imm8) _mm256_srli_epi32(a, imm8)
  4420. #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  4421. # define simde_mm256_srli_epi32(a, imm8) \
  4422. simde_mm256_set_m128i( \
  4423. simde_mm_srli_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \
  4424. simde_mm_srli_epi32(simde_mm256_extracti128_si256(a, 0), (imm8)))
  4425. #endif
  4426. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  4427. #undef _mm256_srli_epi32
  4428. #define _mm256_srli_epi32(a, imm8) simde_mm256_srli_epi32(a, imm8)
  4429. #endif
  4430. SIMDE_FUNCTION_ATTRIBUTES
  4431. simde__m256i
  4432. simde_mm256_srli_epi64 (simde__m256i a, const int imm8)
  4433. SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
  4434. simde__m256i_private
  4435. r_,
  4436. a_ = simde__m256i_to_private(a);
  4437. #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  4438. r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(32, imm8);
  4439. #else
  4440. SIMDE_VECTORIZE
  4441. for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
  4442. r_.u64[i] = a_.u64[i] >> imm8;
  4443. }
  4444. #endif
  4445. return simde__m256i_from_private(r_);
  4446. }
  4447. #if defined(SIMDE_X86_AVX2_NATIVE)
  4448. # define simde_mm256_srli_epi64(a, imm8) _mm256_srli_epi64(a, imm8)
  4449. #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  4450. # define simde_mm256_srli_epi64(a, imm8) \
  4451. simde_mm256_set_m128i( \
  4452. simde_mm_srli_epi64(simde_mm256_extracti128_si256(a, 1), (imm8)), \
  4453. simde_mm_srli_epi64(simde_mm256_extracti128_si256(a, 0), (imm8)))
  4454. #endif
  4455. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  4456. #undef _mm256_srli_epi64
  4457. #define _mm256_srli_epi64(a, imm8) simde_mm256_srli_epi64(a, imm8)
  4458. #endif
  4459. SIMDE_FUNCTION_ATTRIBUTES
  4460. simde__m256i
  4461. simde_mm256_srli_si256 (simde__m256i a, const int imm8)
  4462. SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
  4463. simde__m256i_private
  4464. r_,
  4465. a_ = simde__m256i_to_private(a);
  4466. for (size_t h = 0 ; h < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; h++) {
  4467. SIMDE_VECTORIZE
  4468. for (size_t i = 0 ; i < (sizeof(r_.m128i_private[h].i8) / sizeof(r_.m128i_private[h].i8[0])) ; i++) {
  4469. const int e = imm8 + HEDLEY_STATIC_CAST(int, i);
  4470. r_.m128i_private[h].i8[i] = (e < 16) ? a_.m128i_private[h].i8[e] : 0;
  4471. }
  4472. }
  4473. return simde__m256i_from_private(r_);
  4474. }
  4475. #if defined(SIMDE_X86_AVX2_NATIVE)
  4476. # define simde_mm256_srli_si256(a, imm8) _mm256_srli_si256(a, imm8)
  4477. #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) && !defined(__PGI)
  4478. # define simde_mm256_srli_si256(a, imm8) \
  4479. simde_mm256_set_m128i( \
  4480. simde_mm_srli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \
  4481. simde_mm_srli_si128(simde_mm256_extracti128_si256(a, 0), (imm8)))
  4482. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  4483. # define simde_mm256_srli_si256(a, imm8) \
  4484. simde_mm256_set_m128i( \
  4485. simde_mm_bsrli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \
  4486. simde_mm_bsrli_si128(simde_mm256_extracti128_si256(a, 0), (imm8)))
  4487. #endif
  4488. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  4489. #undef _mm256_srli_si256
  4490. #define _mm256_srli_si256(a, imm8) simde_mm256_srli_si256(a, imm8)
  4491. #endif
  4492. SIMDE_FUNCTION_ATTRIBUTES
  4493. simde__m128i
  4494. simde_mm_srlv_epi32 (simde__m128i a, simde__m128i b) {
  4495. simde__m128i_private
  4496. a_ = simde__m128i_to_private(a),
  4497. b_ = simde__m128i_to_private(b),
  4498. r_;
  4499. #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  4500. r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (b_.u32 < 32)) & (a_.u32 >> b_.u32);
  4501. #else
  4502. SIMDE_VECTORIZE
  4503. for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
  4504. r_.u32[i] = (b_.u32[i] < 32) ? (a_.u32[i] >> b_.u32[i]) : 0;
  4505. }
  4506. #endif
  4507. return simde__m128i_from_private(r_);
  4508. }
  4509. #if defined(SIMDE_X86_AVX2_NATIVE)
  4510. #define simde_mm_srlv_epi32(a, b) _mm_srlv_epi32(a, b)
  4511. #endif
  4512. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  4513. #undef _mm_srlv_epi32
  4514. #define _mm_srlv_epi32(a, b) simde_mm_srlv_epi32(a, b)
  4515. #endif
  4516. SIMDE_FUNCTION_ATTRIBUTES
  4517. simde__m256i
  4518. simde_mm256_srlv_epi32 (simde__m256i a, simde__m256i b) {
  4519. simde__m256i_private
  4520. a_ = simde__m256i_to_private(a),
  4521. b_ = simde__m256i_to_private(b),
  4522. r_;
  4523. #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  4524. r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (b_.u32 < 32)) & (a_.u32 >> b_.u32);
  4525. #else
  4526. SIMDE_VECTORIZE
  4527. for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
  4528. r_.u32[i] = (b_.u32[i] < 32) ? (a_.u32[i] >> b_.u32[i]) : 0;
  4529. }
  4530. #endif
  4531. return simde__m256i_from_private(r_);
  4532. }
  4533. #if defined(SIMDE_X86_AVX2_NATIVE)
  4534. #define simde_mm256_srlv_epi32(a, b) _mm256_srlv_epi32(a, b)
  4535. #endif
  4536. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  4537. #undef _mm256_srlv_epi32
  4538. #define _mm256_srlv_epi32(a, b) simde_mm256_srlv_epi32(a, b)
  4539. #endif
  4540. SIMDE_FUNCTION_ATTRIBUTES
  4541. simde__m128i
  4542. simde_mm_srlv_epi64 (simde__m128i a, simde__m128i b) {
  4543. simde__m128i_private
  4544. a_ = simde__m128i_to_private(a),
  4545. b_ = simde__m128i_to_private(b),
  4546. r_;
  4547. #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  4548. r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (b_.u64 < 64)) & (a_.u64 >> b_.u64);
  4549. #else
  4550. SIMDE_VECTORIZE
  4551. for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
  4552. r_.u64[i] = (b_.u64[i] < 64) ? (a_.u64[i] >> b_.u64[i]) : 0;
  4553. }
  4554. #endif
  4555. return simde__m128i_from_private(r_);
  4556. }
  4557. #if defined(SIMDE_X86_AVX2_NATIVE)
  4558. #define simde_mm_srlv_epi64(a, b) _mm_srlv_epi64(a, b)
  4559. #endif
  4560. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  4561. #undef _mm_srlv_epi64
  4562. #define _mm_srlv_epi64(a, b) simde_mm_srlv_epi64(a, b)
  4563. #endif
  4564. SIMDE_FUNCTION_ATTRIBUTES
  4565. simde__m256i
  4566. simde_mm256_srlv_epi64 (simde__m256i a, simde__m256i b) {
  4567. simde__m256i_private
  4568. a_ = simde__m256i_to_private(a),
  4569. b_ = simde__m256i_to_private(b),
  4570. r_;
  4571. #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  4572. r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (b_.u64 < 64)) & (a_.u64 >> b_.u64);
  4573. #else
  4574. SIMDE_VECTORIZE
  4575. for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) {
  4576. r_.u64[i] = (b_.u64[i] < 64) ? (a_.u64[i] >> b_.u64[i]) : 0;
  4577. }
  4578. #endif
  4579. return simde__m256i_from_private(r_);
  4580. }
  4581. #if defined(SIMDE_X86_AVX2_NATIVE)
  4582. #define simde_mm256_srlv_epi64(a, b) _mm256_srlv_epi64(a, b)
  4583. #endif
  4584. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  4585. #undef _mm256_srlv_epi64
  4586. #define _mm256_srlv_epi64(a, b) simde_mm256_srlv_epi64(a, b)
  4587. #endif
  4588. SIMDE_FUNCTION_ATTRIBUTES
  4589. simde__m256i
  4590. simde_mm256_stream_load_si256 (const simde__m256i* mem_addr) {
  4591. #if defined(SIMDE_X86_AVX2_NATIVE)
  4592. return _mm256_stream_load_si256(HEDLEY_CONST_CAST(simde__m256i*, mem_addr));
  4593. #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT)
  4594. return __builtin_nontemporal_load(mem_addr);
  4595. #else
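/* _mm256_stream_load_si256 requires mem_addr to be 32-byte aligned (which
   SIMDE_ALIGN_ASSUME_LIKE communicates to the compiler); this fallback is just
   an ordinary cached load, so the non-temporal hint is only honoured on the
   native and __builtin_nontemporal_load paths above. */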
  4596. simde__m256i r;
  4597. simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), sizeof(r));
  4598. return r;
  4599. #endif
  4600. }
  4601. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  4602. # define _mm256_stream_load_si256(mem_addr) simde_mm256_stream_load_si256(mem_addr)
  4603. #endif
  4604. SIMDE_FUNCTION_ATTRIBUTES
  4605. simde__m256i
  4606. simde_mm256_sub_epi8 (simde__m256i a, simde__m256i b) {
  4607. #if defined(SIMDE_X86_AVX2_NATIVE)
  4608. return _mm256_sub_epi8(a, b);
  4609. #else
  4610. simde__m256i_private
  4611. r_,
  4612. a_ = simde__m256i_to_private(a),
  4613. b_ = simde__m256i_to_private(b);
  4614. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  4615. r_.m128i[0] = simde_mm_sub_epi8(a_.m128i[0], b_.m128i[0]);
  4616. r_.m128i[1] = simde_mm_sub_epi8(a_.m128i[1], b_.m128i[1]);
  4617. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  4618. r_.i8 = a_.i8 - b_.i8;
  4619. #else
  4620. SIMDE_VECTORIZE
  4621. for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
  4622. r_.i8[i] = a_.i8[i] - b_.i8[i];
  4623. }
  4624. #endif
  4625. return simde__m256i_from_private(r_);
  4626. #endif
  4627. }
  4628. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  4629. #undef _mm256_sub_epi8
  4630. #define _mm256_sub_epi8(a, b) simde_mm256_sub_epi8(a, b)
  4631. #endif
  4632. SIMDE_FUNCTION_ATTRIBUTES
  4633. simde__m256i
  4634. simde_mm256_sub_epi16 (simde__m256i a, simde__m256i b) {
  4635. #if defined(SIMDE_X86_AVX2_NATIVE)
  4636. return _mm256_sub_epi16(a, b);
  4637. #else
  4638. simde__m256i_private
  4639. r_,
  4640. a_ = simde__m256i_to_private(a),
  4641. b_ = simde__m256i_to_private(b);
  4642. #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
  4643. r_.m128i[0] = simde_mm_sub_epi16(a_.m128i[0], b_.m128i[0]);
  4644. r_.m128i[1] = simde_mm_sub_epi16(a_.m128i[1], b_.m128i[1]);
  4645. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  4646. r_.i16 = a_.i16 - b_.i16;
  4647. #else
  4648. SIMDE_VECTORIZE
  4649. for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
  4650. r_.i16[i] = a_.i16[i] - b_.i16[i];
  4651. }
  4652. #endif
  4653. return simde__m256i_from_private(r_);
  4654. #endif
  4655. }
  4656. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  4657. #undef _mm256_sub_epi16
  4658. #define _mm256_sub_epi16(a, b) simde_mm256_sub_epi16(a, b)
  4659. #endif
  4660. SIMDE_FUNCTION_ATTRIBUTES
  4661. simde__m256i
  4662. simde_mm256_hsub_epi16 (simde__m256i a, simde__m256i b) {
  4663. #if defined(SIMDE_X86_AVX2_NATIVE)
  4664. return _mm256_hsub_epi16(a, b);
  4665. #else
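/* Horizontal subtract takes adjacent pairs within each 128-bit lane, first from
   a and then from b.  Gathering the even-indexed elements and the odd-indexed
   elements with the deinterleave helpers and subtracting them element-wise
   produces exactly a[2i] - a[2i+1] (and likewise for b). */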
  4666. return simde_mm256_sub_epi16(simde_x_mm256_deinterleaveeven_epi16(a, b), simde_x_mm256_deinterleaveodd_epi16(a, b));
  4667. #endif
  4668. }
  4669. #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  4670. #undef _mm256_hsub_epi16
  4671. #define _mm256_hsub_epi16(a, b) simde_mm256_hsub_epi16(a, b)
  4672. #endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_sub_epi32 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_sub_epi32(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_sub_epi32(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_sub_epi32(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32 = a_.i32 - b_.i32;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        r_.i32[i] = a_.i32[i] - b_.i32[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_sub_epi32
  #define _mm256_sub_epi32(a, b) simde_mm256_sub_epi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_hsub_epi32 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_hsub_epi32(a, b);
  #else
    return simde_mm256_sub_epi32(simde_x_mm256_deinterleaveeven_epi32(a, b), simde_x_mm256_deinterleaveodd_epi32(a, b));
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_hsub_epi32
  #define _mm256_hsub_epi32(a, b) simde_mm256_hsub_epi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_sub_epi64 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_sub_epi64(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_sub_epi64(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_sub_epi64(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i64 = a_.i64 - b_.i64;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
        r_.i64[i] = a_.i64[i] - b_.i64[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_sub_epi64
  #define _mm256_sub_epi64(a, b) simde_mm256_sub_epi64(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_x_mm256_sub_epu32 (simde__m256i a, simde__m256i b) {
  simde__m256i_private
    r_,
    a_ = simde__m256i_to_private(a),
    b_ = simde__m256i_to_private(b);

  #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
    r_.u32 = a_.u32 - b_.u32;
  #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
    r_.m128i[0] = simde_x_mm_sub_epu32(a_.m128i[0], b_.m128i[0]);
    r_.m128i[1] = simde_x_mm_sub_epu32(a_.m128i[1], b_.m128i[1]);
  #else
    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) {
      r_.u32[i] = a_.u32[i] - b_.u32[i];
    }
  #endif

  return simde__m256i_from_private(r_);
}
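/* simde_x_mm256_sub_epu32 is a SIMDe-only helper: the simde_x_ prefix marks
 * functions with no corresponding native intrinsic, which is why no
 * SIMDE_X86_AVX2_NATIVE branch or native-alias block accompanies it. The
 * unsigned subtraction simply wraps modulo 2^32; keeping a u32-typed version
 * presumably just makes the element type explicit for internal callers. */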
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_subs_epi8 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_subs_epi8(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_subs_epi8(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_subs_epi8(a_.m128i[1], b_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        r_.i8[i] = simde_math_subs_i8(a_.i8[i], b_.i8[i]);
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_subs_epi8
  #define _mm256_subs_epi8(a, b) simde_mm256_subs_epi8(a, b)
#endif
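/* Behaviour sketch for the saturating forms (illustrative only; it assumes the
 * simde_mm256_set1_epi8 helper provided by the AVX header this file builds on):
 *
 *   simde__m256i lo  = simde_mm256_set1_epi8(INT8_MIN);
 *   simde__m256i one = simde_mm256_set1_epi8(1);
 *   simde__m256i d   = simde_mm256_subs_epi8(lo, one);
 *   // every lane of d is INT8_MIN: the result clamps instead of wrapping to INT8_MAX
 */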
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_subs_epi16(simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_subs_epi16(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_subs_epi16(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_subs_epi16(a_.m128i[1], b_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = simde_math_subs_i16(a_.i16[i], b_.i16[i]);
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_subs_epi16
  #define _mm256_subs_epi16(a, b) simde_mm256_subs_epi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_hsubs_epi16 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_hsubs_epi16(a, b);
  #else
    return simde_mm256_subs_epi16(simde_x_mm256_deinterleaveeven_epi16(a, b), simde_x_mm256_deinterleaveodd_epi16(a, b));
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_hsubs_epi16
  #define _mm256_hsubs_epi16(a, b) simde_mm256_hsubs_epi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_subs_epu8 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_subs_epu8(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_subs_epu8(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_subs_epu8(a_.m128i[1], b_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
        r_.u8[i] = simde_math_subs_u8(a_.u8[i], b_.u8[i]);
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_subs_epu8
  #define _mm256_subs_epu8(a, b) simde_mm256_subs_epu8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_subs_epu16(simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_subs_epu16(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_subs_epu16(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_subs_epu16(a_.m128i[1], b_.m128i[1]);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) {
        r_.u16[i] = simde_math_subs_u16(a_.u16[i], b_.u16[i]);
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_subs_epu16
  #define _mm256_subs_epu16(a, b) simde_mm256_subs_epu16(a, b)
#endif
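/* The epu8/epu16 variants clamp at zero rather than wrapping. A minimal sketch,
 * again assuming the AVX set1 helpers:
 *
 *   simde__m256i three = simde_mm256_set1_epi16(3);
 *   simde__m256i seven = simde_mm256_set1_epi16(7);
 *   simde__m256i d     = simde_mm256_subs_epu16(three, seven);
 *   // every 16-bit lane of d is 0, not the wrapped value 65532
 */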
SIMDE_FUNCTION_ATTRIBUTES
int
simde_x_mm256_test_all_ones (simde__m256i a) {
  simde__m256i_private a_ = simde__m256i_to_private(a);
  int r;
  int_fast32_t r_ = ~HEDLEY_STATIC_CAST(int_fast32_t, 0);

  SIMDE_VECTORIZE_REDUCTION(&:r_)
  for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
    r_ &= a_.i32f[i];
  }

  r = (r_ == ~HEDLEY_STATIC_CAST(int_fast32_t, 0));

  return r;
}
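/* simde_x_mm256_test_all_ones ANDs the i32f lanes of its argument together and
 * returns 1 only when every bit of the vector is set. Typical (illustrative)
 * use with a comparison result, relying on simde_mm256_cmpeq_epi32 defined
 * earlier in this header:
 *
 *   if (simde_x_mm256_test_all_ones(simde_mm256_cmpeq_epi32(x, y))) {
 *     // every 32-bit lane of x equals the corresponding lane of y
 *   }
 */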
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_unpacklo_epi8 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_unpacklo_epi8(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_unpacklo_epi8(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_unpacklo_epi8(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 32, a_.i8, b_.i8,
           0, 32,  1, 33,  2, 34,  3, 35,
           4, 36,  5, 37,  6, 38,  7, 39,
          16, 48, 17, 49, 18, 50, 19, 51,
          20, 52, 21, 53, 22, 54, 23, 55);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0]) / 2) ; i++) {
        r_.i8[2 * i]     = a_.i8[i + ~(~i | 7)];
        r_.i8[2 * i + 1] = b_.i8[i + ~(~i | 7)];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_unpacklo_epi8
  #define _mm256_unpacklo_epi8(a, b) simde_mm256_unpacklo_epi8(a, b)
#endif
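/* The unpack* family interleaves within each 128-bit lane rather than across
 * the whole 256-bit register, which is why the shuffle index list above jumps
 * from 7/39 straight to 16/48. In the portable loops, i + ~(~i | 7) is simply
 * i + (i & ~7): indices 0..7 are used as-is and indices 8..15 are shifted up by
 * 8, so the second half of the output comes from the low half of the upper
 * lane. The epi16 and epi32 versions below use the same trick with masks 3 and 1. */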
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_unpacklo_epi16 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_unpacklo_epi16(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_unpacklo_epi16(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_unpacklo_epi16(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16,
        0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0]) / 2) ; i++) {
        r_.i16[2 * i]     = a_.i16[i + ~(~i | 3)];
        r_.i16[2 * i + 1] = b_.i16[i + ~(~i | 3)];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_unpacklo_epi16
  #define _mm256_unpacklo_epi16(a, b) simde_mm256_unpacklo_epi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_unpacklo_epi32 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_unpacklo_epi32(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_unpacklo_epi32(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_unpacklo_epi32(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32,
        0, 8, 1, 9, 4, 12, 5, 13);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0]) / 2) ; i++) {
        r_.i32[2 * i]     = a_.i32[i + ~(~i | 1)];
        r_.i32[2 * i + 1] = b_.i32[i + ~(~i | 1)];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_unpacklo_epi32
  #define _mm256_unpacklo_epi32(a, b) simde_mm256_unpacklo_epi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_unpacklo_epi64 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_unpacklo_epi64(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_unpacklo_epi64(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_unpacklo_epi64(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.i64, b_.i64, 0, 4, 2, 6);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0]) / 2) ; i++) {
        r_.i64[2 * i]     = a_.i64[2 * i];
        r_.i64[2 * i + 1] = b_.i64[2 * i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_unpacklo_epi64
  #define _mm256_unpacklo_epi64(a, b) simde_mm256_unpacklo_epi64(a, b)
#endif
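/* The unpackhi_* functions below mirror the unpacklo_* ones but read the upper
 * half of each 128-bit lane: the portable loops add an extra per-lane offset
 * (8 elements for epi8, 4 for epi16, 2 for epi32, and the "+ 1" index for
 * epi64), so, for example, unpackhi_epi8 interleaves bytes 8..15 and 24..31 of
 * each input. */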
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_unpackhi_epi8 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_unpackhi_epi8(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_unpackhi_epi8(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_unpackhi_epi8(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 32, a_.i8, b_.i8,
           8, 40,  9, 41, 10, 42, 11, 43,
          12, 44, 13, 45, 14, 46, 15, 47,
          24, 56, 25, 57, 26, 58, 27, 59,
          28, 60, 29, 61, 30, 62, 31, 63);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0]) / 2) ; i++) {
        r_.i8[2 * i]     = a_.i8[i + 8 + ~(~i | 7)];
        r_.i8[2 * i + 1] = b_.i8[i + 8 + ~(~i | 7)];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_unpackhi_epi8
  #define _mm256_unpackhi_epi8(a, b) simde_mm256_unpackhi_epi8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_unpackhi_epi16 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_unpackhi_epi16(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_unpackhi_epi16(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_unpackhi_epi16(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16,
         4, 20,  5, 21,  6, 22,  7, 23,
        12, 28, 13, 29, 14, 30, 15, 31);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0]) / 2) ; i++) {
        r_.i16[2 * i]     = a_.i16[i + 4 + ~(~i | 3)];
        r_.i16[2 * i + 1] = b_.i16[i + 4 + ~(~i | 3)];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_unpackhi_epi16
  #define _mm256_unpackhi_epi16(a, b) simde_mm256_unpackhi_epi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_unpackhi_epi32 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_unpackhi_epi32(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_unpackhi_epi32(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_unpackhi_epi32(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32,
        2, 10, 3, 11, 6, 14, 7, 15);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0]) / 2) ; i++) {
        r_.i32[2 * i]     = a_.i32[i + 2 + ~(~i | 1)];
        r_.i32[2 * i + 1] = b_.i32[i + 2 + ~(~i | 1)];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_unpackhi_epi32
  #define _mm256_unpackhi_epi32(a, b) simde_mm256_unpackhi_epi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_unpackhi_epi64 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_unpackhi_epi64(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_unpackhi_epi64(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_unpackhi_epi64(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.i64, b_.i64, 1, 5, 3, 7);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0]) / 2) ; i++) {
        r_.i64[2 * i]     = a_.i64[2 * i + 1];
        r_.i64[2 * i + 1] = b_.i64[2 * i + 1];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_unpackhi_epi64
  #define _mm256_unpackhi_epi64(a, b) simde_mm256_unpackhi_epi64(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_xor_si256 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    return _mm256_xor_si256(a, b);
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
      r_.m128i[0] = simde_mm_xor_si128(a_.m128i[0], b_.m128i[0]);
      r_.m128i[1] = simde_mm_xor_si128(a_.m128i[1], b_.m128i[1]);
    #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
      r_.i32f = a_.i32f ^ b_.i32f;
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
        r_.i64[i] = a_.i64[i] ^ b_.i64[i];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
  #undef _mm256_xor_si256
  #define _mm256_xor_si256(a, b) simde_mm256_xor_si256(a, b)
#endif
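/* Because XOR is a pure bitwise operation, the lane type used by the fallback
 * paths above (i32f for the vector-extension path, i64 for the scalar loop) has
 * no effect on the result. One common idiom this enables is zeroing a register:
 *
 *   simde__m256i zero = simde_mm256_xor_si256(v, v);  // all bits cleared
 */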
SIMDE_END_DECLS_

HEDLEY_DIAGNOSTIC_POP

#endif /* !defined(SIMDE_X86_AVX2_H) */