// SPDX-FileCopyrightText: Copyright 2015-2024 SSE2NEON Contributors
// SPDX-License-Identifier: MIT
#ifndef SSE2NEON_H
#define SSE2NEON_H
/*
 * sse2neon is freely redistributable under the MIT License.
 *
 * Copyright (c) 2015-2024 SSE2NEON Contributors.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
// This header file provides a simple API translation layer
// between SSE intrinsics and their corresponding Arm/AArch64 NEON versions
//
// Contributors to this work are:
//   John W. Ratcliff <jratcliffscarab@gmail.com>
//   Brandon Rowlett <browlett@nvidia.com>
//   Ken Fast <kfast@gdeb.com>
//   Eric van Beurden <evanbeurden@nvidia.com>
//   Alexander Potylitsin <apotylitsin@nvidia.com>
//   Hasindu Gamaarachchi <hasindu2008@gmail.com>
//   Jim Huang <jserv@ccns.ncku.edu.tw>
//   Mark Cheng <marktwtn@gmail.com>
//   Malcolm James MacLeod <malcolm@gulden.com>
//   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
//   Sebastian Pop <spop@amazon.com>
//   Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
//   Danila Kutenin <danilak@google.com>
//   François Turban (JishinMaster) <francois.turban@gmail.com>
//   Pei-Hsuan Hung <afcidk@gmail.com>
//   Yang-Hao Yuan <yuanyanghau@gmail.com>
//   Syoyo Fujita <syoyo@lighttransport.com>
//   Brecht Van Lommel <brecht@blender.org>
//   Jonathan Hue <jhue@adobe.com>
//   Cuda Chen <clh960524@gmail.com>
//   Aymen Qader <aymen.qader@arm.com>
//   Anthony Roberts <anthony.roberts@linaro.org>
/* Tunable configurations */
/* Enable precise implementations of math operations.
 * This slows down the computation a bit, but gives results consistent with
 * x86 SSE (e.g. it can fix a hole or NaN pixel in a rendering result).
 */
/* _mm_min|max_ps|ss|pd|sd */
#ifndef SSE2NEON_PRECISE_MINMAX
#define SSE2NEON_PRECISE_MINMAX (0)
#endif
/* _mm_rcp_ps and _mm_div_ps */
#ifndef SSE2NEON_PRECISE_DIV
#define SSE2NEON_PRECISE_DIV (0)
#endif
/* _mm_sqrt_ps and _mm_rsqrt_ps */
#ifndef SSE2NEON_PRECISE_SQRT
#define SSE2NEON_PRECISE_SQRT (0)
#endif
/* _mm_dp_pd */
#ifndef SSE2NEON_PRECISE_DP
#define SSE2NEON_PRECISE_DP (0)
#endif
/* Enable inclusion of windows.h on MSVC platforms
 * This makes _mm_clflush functional on Windows, as there is no builtin.
 */
#ifndef SSE2NEON_INCLUDE_WINDOWS_H
#define SSE2NEON_INCLUDE_WINDOWS_H (0)
#endif
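/* Illustrative configuration sketch (not part of the header itself): the
 * tunables above are meant to be set before this header is included, either
 * from a wrapper header or on the compiler command line. For example, a
 * hypothetical translation unit that wants SSE-consistent min/max and
 * division could do:
 *
 *     #define SSE2NEON_PRECISE_MINMAX 1
 *     #define SSE2NEON_PRECISE_DIV 1
 *     #include "sse2neon.h"
 *
 * or, equivalently, pass -DSSE2NEON_PRECISE_MINMAX=1 -DSSE2NEON_PRECISE_DIV=1
 * when compiling.
 */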
/* compiler specific definitions */
#if defined(__GNUC__) || defined(__clang__)
#pragma push_macro("FORCE_INLINE")
#pragma push_macro("ALIGN_STRUCT")
#define FORCE_INLINE static inline __attribute__((always_inline))
#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
#elif defined(_MSC_VER)
#if _MSVC_TRADITIONAL
#error Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead.
#endif
#ifndef FORCE_INLINE
#define FORCE_INLINE static inline
#endif
#ifndef ALIGN_STRUCT
#define ALIGN_STRUCT(x) __declspec(align(x))
#endif
#define _sse2neon_likely(x) (x)
#define _sse2neon_unlikely(x) (x)
#else
#pragma message("Macro name collisions may happen with unsupported compilers.")
#endif
#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10
#warning "GCC versions earlier than 10 are not supported."
#endif
/* C language does not allow initializing a variable with a function call. */
#ifdef __cplusplus
#define _sse2neon_const static const
#else
#define _sse2neon_const const
#endif
#include <stdint.h>
#include <stdlib.h>
#if defined(_WIN32)
/* Definitions for _mm_{malloc,free} are provided by <malloc.h>
 * from both MinGW-w64 and MSVC.
 */
#define SSE2NEON_ALLOC_DEFINED
#endif
/* If using MSVC */
#ifdef _MSC_VER
#include <intrin.h>
#if SSE2NEON_INCLUDE_WINDOWS_H
#include <processthreadsapi.h>
#include <windows.h>
#endif
#if !defined(__cplusplus)
#error SSE2NEON only supports C++ compilation with this compiler
#endif
#ifdef SSE2NEON_ALLOC_DEFINED
#include <malloc.h>
#endif
#if (defined(_M_AMD64) || defined(__x86_64__)) || \
    (defined(_M_ARM64) || defined(__arm64__))
#define SSE2NEON_HAS_BITSCAN64
#endif
#endif
#if defined(__GNUC__) || defined(__clang__)
#define _sse2neon_define0(type, s, body) \
    __extension__({                      \
        type _a = (s);                   \
        body                             \
    })
#define _sse2neon_define1(type, s, body) \
    __extension__({                      \
        type _a = (s);                   \
        body                             \
    })
#define _sse2neon_define2(type, a, b, body) \
    __extension__({                         \
        type _a = (a), _b = (b);            \
        body                                \
    })
#define _sse2neon_return(ret) (ret)
#else
#define _sse2neon_define0(type, a, body) [=](type _a) { body }(a)
#define _sse2neon_define1(type, a, body) [](type _a) { body }(a)
#define _sse2neon_define2(type, a, b, body) \
    [](type _a, type _b) { body }((a), (b))
#define _sse2neon_return(ret) return ret
#endif
#define _sse2neon_init(...) \
    {                       \
        __VA_ARGS__         \
    }
/* Compiler barrier */
#if defined(_MSC_VER)
#define SSE2NEON_BARRIER() _ReadWriteBarrier()
#else
#define SSE2NEON_BARRIER()                     \
    do {                                       \
        __asm__ __volatile__("" ::: "memory"); \
        (void) 0;                              \
    } while (0)
#endif
/* Memory barriers
 * __atomic_thread_fence does not include a compiler barrier; instead,
 * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
 * semantics.
 */
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
#include <stdatomic.h>
#endif
FORCE_INLINE void _sse2neon_smp_mb(void)
{
    SSE2NEON_BARRIER();
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
    !defined(__STDC_NO_ATOMICS__)
    atomic_thread_fence(memory_order_seq_cst);
#elif defined(__GNUC__) || defined(__clang__)
    __atomic_thread_fence(__ATOMIC_SEQ_CST);
#else /* MSVC */
    __dmb(_ARM64_BARRIER_ISH);
#endif
}
/* Architecture-specific build options */
/* FIXME: #pragma GCC push_options is only available on GCC */
#if defined(__GNUC__)
#if defined(__arm__) && __ARM_ARCH == 7
/* According to ARM C Language Extensions Architecture specification,
 * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
 * architecture supported.
 */
#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
#endif
#if !defined(__clang__)
#pragma GCC push_options
#pragma GCC target("fpu=neon")
#endif
#elif defined(__aarch64__) || defined(_M_ARM64)
#if !defined(__clang__) && !defined(_MSC_VER)
#pragma GCC push_options
#pragma GCC target("+simd")
#endif
#elif __ARM_ARCH == 8
#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
#error \
    "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
#endif
#if !defined(__clang__) && !defined(_MSC_VER)
#pragma GCC push_options
#endif
#else
#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
#endif
#endif
#include <arm_neon.h>
#if (!defined(__aarch64__) && !defined(_M_ARM64)) && (__ARM_ARCH == 8)
#if defined __has_include && __has_include(<arm_acle.h>)
#include <arm_acle.h>
#endif
#endif
/* Apple Silicon cache lines are twice the size of what Intel, AMD, and other
 * Arm microarchitectures commonly use.
 * From sysctl -a on Apple M1:
 * hw.cachelinesize: 128
 */
#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
#define SSE2NEON_CACHELINE_SIZE 128
#else
#define SSE2NEON_CACHELINE_SIZE 64
#endif
/* Rounding functions require either AArch64 instructions or a libm fallback */
#if !defined(__aarch64__) && !defined(_M_ARM64)
#include <math.h>
#endif
/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
 * or not accessible at all in user mode.
 * To write to or read these registers in user mode,
 * we have to perform a syscall instead.
 */
#if (!defined(__aarch64__) && !defined(_M_ARM64))
#include <sys/time.h>
#endif
/* "__has_builtin" can be used to query support for built-in functions
 * provided by gcc/clang and other compilers that support it.
 */
#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
/* Compatibility with gcc <= 9 */
#if defined(__GNUC__) && (__GNUC__ <= 9)
#define __has_builtin(x) HAS##x
#define HAS__builtin_popcount 1
#define HAS__builtin_popcountll 1
// __builtin_shuffle introduced in GCC 4.7.0
#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
#define HAS__builtin_shuffle 1
#else
#define HAS__builtin_shuffle 0
#endif
#define HAS__builtin_shufflevector 0
#define HAS__builtin_nontemporal_store 0
#else
#define __has_builtin(x) 0
#endif
#endif
/**
 * MACRO for the shuffle parameter of _mm_shuffle_ps().
 * Argument fp3 is a digit [0123] that represents the fp from argument "b"
 * of _mm_shuffle_ps that will be placed in fp3 of the result. fp2 is the same
 * for fp2 of the result. fp1 is a digit [0123] that represents the fp from
 * argument "a" of _mm_shuffle_ps that will be placed in fp1 of the result.
 * fp0 is the same for fp0 of the result.
 */
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
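/* Worked example (illustrative, not part of the definition above): with this
 * encoding, _MM_SHUFFLE(1, 0, 3, 2) evaluates to
 * (1 << 6) | (0 << 4) | (3 << 2) | 2 = 0x4E, so
 *
 *     __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2));
 *
 * places lanes 2 and 3 of "a" in lanes 0 and 1 of r (fp0 = 2, fp1 = 3), and
 * lanes 0 and 1 of "b" in lanes 2 and 3 of r (fp2 = 0, fp3 = 1).
 */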
#if __has_builtin(__builtin_shufflevector)
#define _sse2neon_shuffle(type, a, b, ...) \
    __builtin_shufflevector(a, b, __VA_ARGS__)
#elif __has_builtin(__builtin_shuffle)
#define _sse2neon_shuffle(type, a, b, ...) \
    __extension__({                        \
        type tmp = {__VA_ARGS__};          \
        __builtin_shuffle(a, b, tmp);      \
    })
#endif
#ifdef _sse2neon_shuffle
#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
#endif
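/* Usage sketch for the helpers above (illustrative only): the lane indices
 * follow the __builtin_shufflevector / __builtin_shuffle convention, where
 * 0..N-1 select lanes from "a" and N..2N-1 select lanes from "b". For a
 * 4-lane vector,
 *
 *     int32x4_t lo = vshuffleq_s32(a, b, 0, 1, 4, 5);
 *
 * would produce {a[0], a[1], b[0], b[1]} whenever _sse2neon_shuffle is
 * available.
 */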
/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
#define _MM_FROUND_NO_EXC 0x08
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
#define _MM_ROUND_NEAREST 0x0000
#define _MM_ROUND_DOWN 0x2000
#define _MM_ROUND_UP 0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000
/* Flush zero mode macros. */
#define _MM_FLUSH_ZERO_MASK 0x8000
#define _MM_FLUSH_ZERO_ON 0x8000
#define _MM_FLUSH_ZERO_OFF 0x0000
/* Denormals are zeros mode macros. */
#define _MM_DENORMALS_ZERO_MASK 0x0040
#define _MM_DENORMALS_ZERO_ON 0x0040
#define _MM_DENORMALS_ZERO_OFF 0x0000
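/* Usage sketch (illustrative, not part of the definitions above): the
 * _MM_FROUND_* flags combine a rounding direction with an exception-control
 * bit and are passed to the SSE4.1 rounding intrinsics declared later in this
 * header, e.g.
 *
 *     __m128 r = _mm_round_ps(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 *
 * truncates each lane of x toward zero. The _MM_ROUND_*, _MM_FLUSH_ZERO_* and
 * _MM_DENORMALS_ZERO_* values are instead used with the MXCSR-style get/set
 * helpers defined further down (e.g. _MM_SET_FLUSH_ZERO_MODE).
 */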
/* indicate immediate constant argument in a given range */
#define __constrange(a, b) const
/* A few intrinsics accept traditional data types like ints or floats, but
 * most operate on data types that are specific to SSE.
 * If a vector type ends in d, it contains doubles, and if it does not have
 * a suffix, it contains floats. An integer vector type can contain any type
 * of integer, from chars to shorts to unsigned long longs.
 */
typedef int64x1_t __m64;
typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
// On the 32-bit Arm architecture, float64x2_t is not supported.
// The data type __m128d therefore has to be represented differently for the
// related intrinsic conversions.
#if defined(__aarch64__) || defined(_M_ARM64)
typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
#else
typedef float32x4_t __m128d;
#endif
typedef int64x2_t __m128i; /* 128-bit vector containing integers */
// __int64 is defined in the Intrinsics Guide and maps to a different data type
// in different data models
#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
#if (defined(__x86_64__) || defined(__i386__))
#define __int64 long long
#else
#define __int64 int64_t
#endif
#endif
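/* Quick orientation sketch (illustrative only): the SSE vector types above
 * are plain NEON vector types, so values built with the SSE constructors
 * declared in this header are directly usable from NEON code, e.g.
 *
 *     __m128 f = _mm_set_ps1(1.0f);    // a float32x4_t under the hood
 *     __m128i i = _mm_set1_epi32(42);  // an int64x2_t reused as raw 128 bits
 *
 * The vreinterpretq_* helpers below perform the type-safe conversions between
 * these aliases and the concrete NEON element types.
 */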
/* type-safe casting between types */
#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
#define vreinterpretq_m128_f32(x) (x)
#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
#define vreinterpretq_f32_m128(x) (x)
#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
#define vreinterpretq_m128i_s64(x) (x)
#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
#define vreinterpretq_s64_m128i(x) (x)
#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
#define vreinterpret_m64_s64(x) (x)
#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
#define vreinterpret_s64_m64(x) (x)
#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
#if defined(__aarch64__) || defined(_M_ARM64)
#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
#define vreinterpretq_m128d_f64(x) (x)
#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
#define vreinterpretq_f64_m128d(x) (x)
#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
#else
#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
#define vreinterpretq_m128d_f32(x) (x)
#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
#define vreinterpretq_f32_m128d(x) (x)
#endif
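/* Illustrative sketch of how these wrappers are used (the actual intrinsic
 * bodies appear later in the header): an integer-flavored operation on a
 * float vector round-trips through the unsigned view, roughly
 *
 *     // bitwise AND of two __m128 values, sketched with the helpers above
 *     vreinterpretq_m128_u32(vandq_u32(vreinterpretq_u32_m128(a),
 *                                      vreinterpretq_u32_m128(b)));
 *
 * The casts are type-checked by the compiler but generate no code, since the
 * underlying 128-bit register is unchanged.
 */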
  443. // A struct is defined in this header file called 'SIMDVec' which can be used
  444. // by applications which attempt to access the contents of an __m128 struct
  445. // directly. It is important to note that accessing the __m128 struct directly
  446. // is bad coding practice by Microsoft: @see:
  447. // https://learn.microsoft.com/en-us/cpp/cpp/m128
  448. //
  449. // However, some legacy source code may try to access the contents of an __m128
  450. // struct directly so the developer can use the SIMDVec as an alias for it. Any
  451. // casting must be done manually by the developer, as you cannot cast or
  452. // otherwise alias the base NEON data type for intrinsic operations.
  453. //
454. // This union is intended to allow direct access to an __m128 variable using the
455. // names that the MSVC compiler provides. It should really only be used when
  456. // trying to access the members of the vector as integer values. GCC/clang
  457. // allow native access to the float members through a simple array access
  458. // operator (in C since 4.6, in C++ since 4.8).
  459. //
  460. // Ideally direct accesses to SIMD vectors should not be used since it can cause
  461. // a performance hit. If it really is needed however, the original __m128
  462. // variable can be aliased with a pointer to this union and used to access
  463. // individual components. The use of this union should be hidden behind a macro
  464. // that is used throughout the codebase to access the members instead of always
  465. // declaring this type of variable.
  466. typedef union ALIGN_STRUCT(16) SIMDVec {
  467. float m128_f32[4]; // as floats - DON'T USE. Added for convenience.
  468. int8_t m128_i8[16]; // as signed 8-bit integers.
  469. int16_t m128_i16[8]; // as signed 16-bit integers.
  470. int32_t m128_i32[4]; // as signed 32-bit integers.
  471. int64_t m128_i64[2]; // as signed 64-bit integers.
  472. uint8_t m128_u8[16]; // as unsigned 8-bit integers.
  473. uint16_t m128_u16[8]; // as unsigned 16-bit integers.
  474. uint32_t m128_u32[4]; // as unsigned 32-bit integers.
  475. uint64_t m128_u64[2]; // as unsigned 64-bit integers.
  476. } SIMDVec;
  477. // casting using SIMDVec
  478. #define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
  479. #define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
  480. #define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
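// Illustrative sketch (not part of the public API): legacy code that needs to
// peek at the integer lanes of an __m128i can go through the macros above
// instead of casting the NEON type itself; v, lane0 and hi64 below are just
// example names.
//   __m128i v = _mm_set_epi32(3, 2, 1, 0);
//   uint32_t lane0 = vreinterpretq_nth_u32_m128i(v, 0); // == 0
//   uint64_t hi64 = vreinterpretq_nth_u64_m128i(v, 1);  // upper 64 bits of v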
  481. /* SSE macros */
  482. #define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
  483. #define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
  484. #define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
  485. #define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
  486. // Function declaration
  487. // SSE
  488. FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void);
  489. FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
  490. FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
  491. FORCE_INLINE __m128 _mm_set_ps1(float);
  492. FORCE_INLINE __m128 _mm_setzero_ps(void);
  493. // SSE2
  494. FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
  495. FORCE_INLINE __m128i _mm_castps_si128(__m128);
  496. FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
  497. FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
  498. FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
  499. FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
  500. FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
  501. FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
  502. FORCE_INLINE __m128d _mm_set_pd(double, double);
  503. FORCE_INLINE __m128i _mm_set1_epi32(int);
  504. FORCE_INLINE __m128i _mm_setzero_si128(void);
  505. // SSE4.1
  506. FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
  507. FORCE_INLINE __m128 _mm_ceil_ps(__m128);
  508. FORCE_INLINE __m128d _mm_floor_pd(__m128d);
  509. FORCE_INLINE __m128 _mm_floor_ps(__m128);
  510. FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
  511. FORCE_INLINE __m128 _mm_round_ps(__m128, int);
  512. // SSE4.2
  513. FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
514. /* Backwards compatibility for compilers lacking support for specific types */
515. // Older GCC versions do not provide the vld1q_u8_x4 intrinsic
  516. #if defined(__GNUC__) && !defined(__clang__) && \
  517. ((__GNUC__ <= 13 && defined(__arm__)) || \
  518. (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
  519. (__GNUC__ <= 9 && defined(__aarch64__)))
  520. FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
  521. {
  522. uint8x16x4_t ret;
  523. ret.val[0] = vld1q_u8(p + 0);
  524. ret.val[1] = vld1q_u8(p + 16);
  525. ret.val[2] = vld1q_u8(p + 32);
  526. ret.val[3] = vld1q_u8(p + 48);
  527. return ret;
  528. }
  529. #else
  530. // Wraps vld1q_u8_x4
  531. FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
  532. {
  533. return vld1q_u8_x4(p);
  534. }
  535. #endif
  536. #if !defined(__aarch64__) && !defined(_M_ARM64)
  537. /* emulate vaddv u8 variant */
  538. FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
  539. {
  540. const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
  541. return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
  542. }
  543. #else
  544. // Wraps vaddv_u8
  545. FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
  546. {
  547. return vaddv_u8(v8);
  548. }
  549. #endif
  550. #if !defined(__aarch64__) && !defined(_M_ARM64)
  551. /* emulate vaddvq u8 variant */
  552. FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
  553. {
  554. uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
  555. uint8_t res = 0;
  556. for (int i = 0; i < 8; ++i)
  557. res += tmp[i];
  558. return res;
  559. }
  560. #else
  561. // Wraps vaddvq_u8
  562. FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
  563. {
  564. return vaddvq_u8(a);
  565. }
  566. #endif
  567. #if !defined(__aarch64__) && !defined(_M_ARM64)
  568. /* emulate vaddvq u16 variant */
  569. FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
  570. {
  571. uint32x4_t m = vpaddlq_u16(a);
  572. uint64x2_t n = vpaddlq_u32(m);
  573. uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);
  574. return vget_lane_u32((uint32x2_t) o, 0);
  575. }
  576. #else
  577. // Wraps vaddvq_u16
  578. FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
  579. {
  580. return vaddvq_u16(a);
  581. }
  582. #endif
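// For reference, the three helpers above perform a horizontal add (reduction)
// across all lanes, matching the AArch64 vaddv/vaddvq semantics: for example,
// summing the eight lanes {1, 2, 3, 4, 5, 6, 7, 8} with _sse2neon_vaddv_u8
// yields 36.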
  583. /* Function Naming Conventions
  584. * The naming convention of SSE intrinsics is straightforward. A generic SSE
  585. * intrinsic function is given as follows:
  586. * _mm_<name>_<data_type>
  587. *
  588. * The parts of this format are given as follows:
  589. * 1. <name> describes the operation performed by the intrinsic
  590. * 2. <data_type> identifies the data type of the function's primary arguments
  591. *
  592. * This last part, <data_type>, is a little complicated. It identifies the
  593. * content of the input values, and can be set to any of the following values:
  594. * + ps - vectors contain floats (ps stands for packed single-precision)
  595. * + pd - vectors contain doubles (pd stands for packed double-precision)
  596. * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
  597. * signed integers
  598. * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
  599. * unsigned integers
600. * + si128 - unspecified 128-bit integer vector
  601. * + m128/m128i/m128d - identifies input vector types when they are different
  602. * than the type of the returned vector
  603. *
  604. * For example, _mm_setzero_ps. The _mm implies that the function returns
  605. * a 128-bit vector. The _ps at the end implies that the argument vectors
  606. * contain floats.
  607. *
  608. * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
609. * // Set packed 16-bit integers: 128 bits, 8 shorts, 16 bits each
  610. * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
  611. * // Set packed 8-bit integers
612. * // 128 bits, 16 chars, 8 bits each
  613. * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
  614. * 4, 5, 12, 13, 6, 7, 14, 15);
  615. * // Shuffle packed 8-bit integers
  616. * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
  617. */
  618. /* Constants for use with _mm_prefetch. */
  619. enum _mm_hint {
  620. _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
  621. _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */
  622. _MM_HINT_T1 = 2, /* load data to L2 cache only */
  623. _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */
  624. };
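// Illustrative usage (p is assumed to point at data that will be read soon):
//   _mm_prefetch((const char *) p, _MM_HINT_T0);
// Prefetching is only a hint; the cache levels actually touched depend on the
// target CPU.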
625. // The bit field mapping to the FPCR (floating-point control register)
  626. typedef struct {
  627. uint16_t res0;
  628. uint8_t res1 : 6;
  629. uint8_t bit22 : 1;
  630. uint8_t bit23 : 1;
  631. uint8_t bit24 : 1;
  632. uint8_t res2 : 7;
  633. #if defined(__aarch64__) || defined(_M_ARM64)
  634. uint32_t res3;
  635. #endif
  636. } fpcr_bitfield;
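// Sketch of how this bit field is used later in this header: overlay it on the
// raw register value and flip individual control bits, e.g. on AArch64 (the
// 32-bit path overlays a uint32_t instead):
//   union { fpcr_bitfield field; uint64_t value; } r;
//   r.value = _sse2neon_get_fpcr(); // read FPCR
//   r.field.bit24 = 1;              // e.g. set the flush-to-zero bit
//   _sse2neon_set_fpcr(r.value);    // write it back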
637. // Takes the upper 64 bits of a and places them in the low end of the result;
638. // takes the lower 64 bits of b and places them in the high end of the result.
  639. FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
  640. {
  641. float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
  642. float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
  643. return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
  644. }
645. // Takes the lower two 32-bit values from a, swaps them, and places them in the
646. // low end of the result; takes the upper two 32-bit values from b, swaps them,
647. // and places them in the high end of the result.
  648. FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
  649. {
  650. float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
  651. float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
  652. return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
  653. }
  654. FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
  655. {
  656. float32x2_t a21 = vget_high_f32(
  657. vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
  658. float32x2_t b03 = vget_low_f32(
  659. vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
  660. return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
  661. }
  662. FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
  663. {
  664. float32x2_t a03 = vget_low_f32(
  665. vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
  666. float32x2_t b21 = vget_high_f32(
  667. vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
  668. return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
  669. }
  670. FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
  671. {
  672. float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
  673. float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
  674. return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
  675. }
  676. FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
  677. {
  678. float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
  679. float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
  680. return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
  681. }
  682. FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
  683. {
  684. float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
  685. float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
  686. return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
  687. }
688. // Keeps the low 64 bits of a in the low half and puts the high 64 bits of b in
689. // the high half of the result.
  690. FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
  691. {
  692. float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
  693. float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
  694. return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
  695. }
  696. FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
  697. {
  698. float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
  699. float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
  700. return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
  701. }
  702. FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
  703. {
  704. float32x2_t a22 =
  705. vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
  706. float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
  707. return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
  708. }
  709. FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
  710. {
  711. float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
  712. float32x2_t b22 =
  713. vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
  714. return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
  715. }
  716. FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
  717. {
  718. float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
  719. float32x2_t a22 =
  720. vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
  721. float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
  722. float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
  723. return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
  724. }
  725. FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
  726. {
  727. float32x2_t a33 =
  728. vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
  729. float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
  730. return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
  731. }
  732. FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
  733. {
  734. float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
  735. float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
  736. float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
  737. float32x2_t b20 = vset_lane_f32(b2, b00, 1);
  738. return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
  739. }
  740. FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
  741. {
  742. float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
  743. float32_t b2 = vgetq_lane_f32(b, 2);
  744. float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
  745. float32x2_t b20 = vset_lane_f32(b2, b00, 1);
  746. return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
  747. }
  748. FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
  749. {
  750. float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
  751. float32_t b2 = vgetq_lane_f32(b, 2);
  752. float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
  753. float32x2_t b20 = vset_lane_f32(b2, b00, 1);
  754. return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
  755. }
  756. // For MSVC, we check only if it is ARM64, as every single ARM64 processor
  757. // supported by WoA has crypto extensions. If this changes in the future,
  758. // this can be verified via the runtime-only method of:
  759. // IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)
  760. #if (defined(_M_ARM64) && !defined(__clang__)) || \
  761. (defined(__ARM_FEATURE_CRYPTO) && \
  762. (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64)))
  763. // Wraps vmull_p64
  764. FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
  765. {
  766. poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
  767. poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
  768. #if defined(_MSC_VER)
  769. __n64 a1 = {a}, b1 = {b};
  770. return vreinterpretq_u64_p128(vmull_p64(a1, b1));
  771. #else
  772. return vreinterpretq_u64_p128(vmull_p64(a, b));
  773. #endif
  774. }
  775. #else // ARMv7 polyfill
776. // ARMv7 (and some AArch64 builds) lacks vmull_p64, but it has vmull_p8.
  777. //
  778. // vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
  779. // 64-bit->128-bit polynomial multiply.
  780. //
  781. // It needs some work and is somewhat slow, but it is still faster than all
  782. // known scalar methods.
  783. //
  784. // Algorithm adapted to C from
  785. // https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
  786. // from "Fast Software Polynomial Multiplication on ARM Processors Using the
  787. // NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
  788. // (https://hal.inria.fr/hal-01506572)
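//
// As a reminder, this is a carry-less (GF(2)) multiply: partial products are
// combined with XOR instead of being added with carries, e.g.
// 0b101 * 0b11 = (0b101 << 1) ^ (0b101 << 0) = 0b1111.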
  789. static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
  790. {
  791. poly8x8_t a = vreinterpret_p8_u64(_a);
  792. poly8x8_t b = vreinterpret_p8_u64(_b);
  793. // Masks
  794. uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
  795. vcreate_u8(0x00000000ffffffff));
  796. uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
  797. vcreate_u8(0x0000000000000000));
  798. // Do the multiplies, rotating with vext to get all combinations
  799. uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0
  800. uint8x16_t e =
  801. vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
  802. uint8x16_t f =
  803. vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
  804. uint8x16_t g =
  805. vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
  806. uint8x16_t h =
  807. vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
  808. uint8x16_t i =
  809. vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
  810. uint8x16_t j =
  811. vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
  812. uint8x16_t k =
813. vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // K = A0 * B4
  814. // Add cross products
  815. uint8x16_t l = veorq_u8(e, f); // L = E + F
  816. uint8x16_t m = veorq_u8(g, h); // M = G + H
  817. uint8x16_t n = veorq_u8(i, j); // N = I + J
  818. // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
  819. // instructions.
  820. #if defined(__aarch64__)
  821. uint8x16_t lm_p0 = vreinterpretq_u8_u64(
  822. vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
  823. uint8x16_t lm_p1 = vreinterpretq_u8_u64(
  824. vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
  825. uint8x16_t nk_p0 = vreinterpretq_u8_u64(
  826. vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
  827. uint8x16_t nk_p1 = vreinterpretq_u8_u64(
  828. vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
  829. #else
  830. uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
  831. uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
  832. uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
  833. uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
  834. #endif
  835. // t0 = (L) (P0 + P1) << 8
  836. // t1 = (M) (P2 + P3) << 16
  837. uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
  838. uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
  839. uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
  840. // t2 = (N) (P4 + P5) << 24
  841. // t3 = (K) (P6 + P7) << 32
  842. uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
  843. uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
  844. uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
  845. // De-interleave
  846. #if defined(__aarch64__)
  847. uint8x16_t t0 = vreinterpretq_u8_u64(
  848. vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
  849. uint8x16_t t1 = vreinterpretq_u8_u64(
  850. vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
  851. uint8x16_t t2 = vreinterpretq_u8_u64(
  852. vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
  853. uint8x16_t t3 = vreinterpretq_u8_u64(
  854. vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
  855. #else
  856. uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
  857. uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
  858. uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
  859. uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
  860. #endif
  861. // Shift the cross products
  862. uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
  863. uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
  864. uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
  865. uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
  866. // Accumulate the products
  867. uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
  868. uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
  869. uint8x16_t mix = veorq_u8(d, cross1);
  870. uint8x16_t r = veorq_u8(mix, cross2);
  871. return vreinterpretq_u64_u8(r);
  872. }
  873. #endif // ARMv7 polyfill
  874. // C equivalent:
  875. // __m128i _mm_shuffle_epi32_default(__m128i a,
  876. // __constrange(0, 255) int imm) {
  877. // __m128i ret;
  878. // ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
  879. // ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
  880. // return ret;
  881. // }
  882. #define _mm_shuffle_epi32_default(a, imm) \
  883. vreinterpretq_m128i_s32(vsetq_lane_s32( \
  884. vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
  885. vsetq_lane_s32( \
  886. vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
  887. vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), \
  888. ((imm) >> 2) & 0x3), \
  889. vmovq_n_s32(vgetq_lane_s32( \
  890. vreinterpretq_s32_m128i(a), (imm) & (0x3))), \
  891. 1), \
  892. 2), \
  893. 3))
894. // Takes the upper 64 bits of a and places them in the low end of the result;
895. // takes the lower 64 bits of a and places them in the high end of the result.
  896. FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
  897. {
  898. int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
  899. int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
  900. return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
  901. }
902. // Takes the lower two 32-bit values from a, swaps them, and places them in the
903. // low end of the result; takes the upper two 32-bit values from a, swaps them,
904. // and places them in the high end of the result.
  905. FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
  906. {
  907. int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
  908. int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
  909. return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
  910. }
  911. // rotates the least significant 32 bits into the most significant 32 bits, and
  912. // shifts the rest down
  913. FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
  914. {
  915. return vreinterpretq_m128i_s32(
  916. vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
  917. }
  918. // rotates the most significant 32 bits into the least significant 32 bits, and
  919. // shifts the rest up
  920. FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
  921. {
  922. return vreinterpretq_m128i_s32(
  923. vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
  924. }
925. // Gets the lower 64 bits of a and places them in the upper 64 bits;
926. // gets the lower 64 bits of a and places them in the lower 64 bits.
  927. FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
  928. {
  929. int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
  930. return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
  931. }
932. // Gets the lower 64 bits of a, swaps the 0 and 1 elements, and places them in the
933. // lower 64 bits; gets the lower 64 bits of a and places them in the upper 64 bits.
  934. FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
  935. {
  936. int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
  937. int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
  938. return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
  939. }
940. // Gets the lower 64 bits of a, swaps the 0 and 1 elements, and places them in
941. // the upper 64 bits; gets the lower 64 bits of a, swaps the 0 and 1 elements,
942. // and places them in the lower 64 bits.
  943. FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
  944. {
  945. int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
  946. return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
  947. }
  948. FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
  949. {
  950. int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
  951. int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
  952. return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
  953. }
  954. FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
  955. {
  956. int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
  957. int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
  958. return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
  959. }
  960. FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
  961. {
  962. int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
  963. int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
  964. return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
  965. }
  966. #if defined(__aarch64__) || defined(_M_ARM64)
  967. #define _mm_shuffle_epi32_splat(a, imm) \
  968. vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm)))
  969. #else
  970. #define _mm_shuffle_epi32_splat(a, imm) \
  971. vreinterpretq_m128i_s32( \
  972. vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))))
  973. #endif
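// For example, _mm_shuffle_epi32_splat(a, 2) broadcasts 32-bit lane 2 of a to
// all four lanes of the result.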
  974. // NEON does not support a general purpose permute intrinsic.
975. // Shuffle single-precision (32-bit) floating-point elements in a and b using
976. // the control in imm8, and store the results in dst.
  977. //
  978. // C equivalent:
  979. // __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
  980. // __constrange(0, 255) int imm) {
  981. // __m128 ret;
  982. // ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
  983. // ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
  984. // return ret;
  985. // }
  986. //
  987. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps
  988. #define _mm_shuffle_ps_default(a, b, imm) \
  989. vreinterpretq_m128_f32(vsetq_lane_f32( \
  990. vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
  991. vsetq_lane_f32( \
  992. vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
  993. vsetq_lane_f32( \
  994. vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
  995. vmovq_n_f32( \
  996. vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))), \
  997. 1), \
  998. 2), \
  999. 3))
  1000. // Shuffle 16-bit integers in the low 64 bits of a using the control in imm8.
  1001. // Store the results in the low 64 bits of dst, with the high 64 bits being
  1002. // copied from a to dst.
  1003. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16
  1004. #define _mm_shufflelo_epi16_function(a, imm) \
  1005. _sse2neon_define1( \
  1006. __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \
  1007. int16x4_t lowBits = vget_low_s16(ret); \
  1008. ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
  1009. ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
  1010. 1); \
  1011. ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
  1012. 2); \
  1013. ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
  1014. 3); \
  1015. _sse2neon_return(vreinterpretq_m128i_s16(ret));)
  1016. // Shuffle 16-bit integers in the high 64 bits of a using the control in imm8.
  1017. // Store the results in the high 64 bits of dst, with the low 64 bits being
  1018. // copied from a to dst.
  1019. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16
  1020. #define _mm_shufflehi_epi16_function(a, imm) \
  1021. _sse2neon_define1( \
  1022. __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \
  1023. int16x4_t highBits = vget_high_s16(ret); \
  1024. ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
  1025. ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
  1026. 5); \
  1027. ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
  1028. 6); \
  1029. ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
  1030. 7); \
  1031. _sse2neon_return(vreinterpretq_m128i_s16(ret));)
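// For example, imm == _MM_SHUFFLE(0, 1, 2, 3) reverses the four high 16-bit
// lanes while the low 64 bits pass through unchanged (one illustrative case,
// not a full description of the control encoding).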
  1032. /* MMX */
1033. // _mm_empty is a no-op on ARM.
  1034. FORCE_INLINE void _mm_empty(void) {}
  1035. /* SSE */
  1036. // Add packed single-precision (32-bit) floating-point elements in a and b, and
  1037. // store the results in dst.
  1038. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps
  1039. FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
  1040. {
  1041. return vreinterpretq_m128_f32(
  1042. vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
  1043. }
  1044. // Add the lower single-precision (32-bit) floating-point element in a and b,
  1045. // store the result in the lower element of dst, and copy the upper 3 packed
  1046. // elements from a to the upper elements of dst.
  1047. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss
  1048. FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
  1049. {
  1050. float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
  1051. float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
  1052. // the upper values in the result must be the remnants of <a>.
  1053. return vreinterpretq_m128_f32(vaddq_f32(a, value));
  1054. }
  1055. // Compute the bitwise AND of packed single-precision (32-bit) floating-point
  1056. // elements in a and b, and store the results in dst.
  1057. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps
  1058. FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
  1059. {
  1060. return vreinterpretq_m128_s32(
  1061. vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
  1062. }
  1063. // Compute the bitwise NOT of packed single-precision (32-bit) floating-point
  1064. // elements in a and then AND with b, and store the results in dst.
  1065. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps
  1066. FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
  1067. {
  1068. return vreinterpretq_m128_s32(
  1069. vbicq_s32(vreinterpretq_s32_m128(b),
  1070. vreinterpretq_s32_m128(a))); // *NOTE* argument swap
  1071. }
  1072. // Average packed unsigned 16-bit integers in a and b, and store the results in
  1073. // dst.
  1074. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16
  1075. FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
  1076. {
  1077. return vreinterpret_m64_u16(
  1078. vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
  1079. }
  1080. // Average packed unsigned 8-bit integers in a and b, and store the results in
  1081. // dst.
  1082. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8
  1083. FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
  1084. {
  1085. return vreinterpret_m64_u8(
  1086. vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
  1087. }
  1088. // Compare packed single-precision (32-bit) floating-point elements in a and b
  1089. // for equality, and store the results in dst.
  1090. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps
  1091. FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
  1092. {
  1093. return vreinterpretq_m128_u32(
  1094. vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
  1095. }
  1096. // Compare the lower single-precision (32-bit) floating-point elements in a and
  1097. // b for equality, store the result in the lower element of dst, and copy the
  1098. // upper 3 packed elements from a to the upper elements of dst.
  1099. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss
  1100. FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
  1101. {
  1102. return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
  1103. }
  1104. // Compare packed single-precision (32-bit) floating-point elements in a and b
  1105. // for greater-than-or-equal, and store the results in dst.
  1106. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps
  1107. FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
  1108. {
  1109. return vreinterpretq_m128_u32(
  1110. vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
  1111. }
  1112. // Compare the lower single-precision (32-bit) floating-point elements in a and
  1113. // b for greater-than-or-equal, store the result in the lower element of dst,
  1114. // and copy the upper 3 packed elements from a to the upper elements of dst.
  1115. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss
  1116. FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
  1117. {
  1118. return _mm_move_ss(a, _mm_cmpge_ps(a, b));
  1119. }
  1120. // Compare packed single-precision (32-bit) floating-point elements in a and b
  1121. // for greater-than, and store the results in dst.
  1122. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps
  1123. FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
  1124. {
  1125. return vreinterpretq_m128_u32(
  1126. vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
  1127. }
  1128. // Compare the lower single-precision (32-bit) floating-point elements in a and
  1129. // b for greater-than, store the result in the lower element of dst, and copy
  1130. // the upper 3 packed elements from a to the upper elements of dst.
  1131. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss
  1132. FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
  1133. {
  1134. return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
  1135. }
  1136. // Compare packed single-precision (32-bit) floating-point elements in a and b
  1137. // for less-than-or-equal, and store the results in dst.
  1138. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps
  1139. FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
  1140. {
  1141. return vreinterpretq_m128_u32(
  1142. vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
  1143. }
  1144. // Compare the lower single-precision (32-bit) floating-point elements in a and
  1145. // b for less-than-or-equal, store the result in the lower element of dst, and
  1146. // copy the upper 3 packed elements from a to the upper elements of dst.
  1147. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss
  1148. FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
  1149. {
  1150. return _mm_move_ss(a, _mm_cmple_ps(a, b));
  1151. }
  1152. // Compare packed single-precision (32-bit) floating-point elements in a and b
  1153. // for less-than, and store the results in dst.
  1154. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps
  1155. FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
  1156. {
  1157. return vreinterpretq_m128_u32(
  1158. vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
  1159. }
  1160. // Compare the lower single-precision (32-bit) floating-point elements in a and
  1161. // b for less-than, store the result in the lower element of dst, and copy the
  1162. // upper 3 packed elements from a to the upper elements of dst.
  1163. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss
  1164. FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
  1165. {
  1166. return _mm_move_ss(a, _mm_cmplt_ps(a, b));
  1167. }
  1168. // Compare packed single-precision (32-bit) floating-point elements in a and b
  1169. // for not-equal, and store the results in dst.
  1170. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps
  1171. FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
  1172. {
  1173. return vreinterpretq_m128_u32(vmvnq_u32(
  1174. vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
  1175. }
  1176. // Compare the lower single-precision (32-bit) floating-point elements in a and
  1177. // b for not-equal, store the result in the lower element of dst, and copy the
  1178. // upper 3 packed elements from a to the upper elements of dst.
  1179. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss
  1180. FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
  1181. {
  1182. return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
  1183. }
  1184. // Compare packed single-precision (32-bit) floating-point elements in a and b
  1185. // for not-greater-than-or-equal, and store the results in dst.
  1186. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps
  1187. FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
  1188. {
  1189. return vreinterpretq_m128_u32(vmvnq_u32(
  1190. vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
  1191. }
  1192. // Compare the lower single-precision (32-bit) floating-point elements in a and
  1193. // b for not-greater-than-or-equal, store the result in the lower element of
  1194. // dst, and copy the upper 3 packed elements from a to the upper elements of
  1195. // dst.
  1196. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss
  1197. FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
  1198. {
  1199. return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
  1200. }
  1201. // Compare packed single-precision (32-bit) floating-point elements in a and b
  1202. // for not-greater-than, and store the results in dst.
  1203. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps
  1204. FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
  1205. {
  1206. return vreinterpretq_m128_u32(vmvnq_u32(
  1207. vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
  1208. }
  1209. // Compare the lower single-precision (32-bit) floating-point elements in a and
  1210. // b for not-greater-than, store the result in the lower element of dst, and
  1211. // copy the upper 3 packed elements from a to the upper elements of dst.
  1212. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss
  1213. FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
  1214. {
  1215. return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
  1216. }
  1217. // Compare packed single-precision (32-bit) floating-point elements in a and b
  1218. // for not-less-than-or-equal, and store the results in dst.
  1219. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps
  1220. FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
  1221. {
  1222. return vreinterpretq_m128_u32(vmvnq_u32(
  1223. vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
  1224. }
  1225. // Compare the lower single-precision (32-bit) floating-point elements in a and
  1226. // b for not-less-than-or-equal, store the result in the lower element of dst,
  1227. // and copy the upper 3 packed elements from a to the upper elements of dst.
  1228. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss
  1229. FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
  1230. {
  1231. return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
  1232. }
  1233. // Compare packed single-precision (32-bit) floating-point elements in a and b
  1234. // for not-less-than, and store the results in dst.
  1235. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps
  1236. FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
  1237. {
  1238. return vreinterpretq_m128_u32(vmvnq_u32(
  1239. vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
  1240. }
  1241. // Compare the lower single-precision (32-bit) floating-point elements in a and
  1242. // b for not-less-than, store the result in the lower element of dst, and copy
  1243. // the upper 3 packed elements from a to the upper elements of dst.
  1244. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss
  1245. FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
  1246. {
  1247. return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
  1248. }
  1249. // Compare packed single-precision (32-bit) floating-point elements in a and b
  1250. // to see if neither is NaN, and store the results in dst.
  1251. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps
  1252. //
  1253. // See also:
  1254. // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
  1255. // http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
  1256. FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
  1257. {
  1258. // Note: NEON does not have ordered compare builtin
  1259. // Need to compare a eq a and b eq b to check for NaN
  1260. // Do AND of results to get final
  1261. uint32x4_t ceqaa =
  1262. vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
  1263. uint32x4_t ceqbb =
  1264. vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
  1265. return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
  1266. }
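// Illustrative example: with a = {1.0f, NAN, 2.0f, 3.0f} and
// b = {4.0f, 5.0f, NAN, 6.0f}, only lanes 0 and 3 are ordered (neither operand
// is NaN), so dst = {0xFFFFFFFF, 0, 0, 0xFFFFFFFF} when viewed as 32-bit lanes.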
  1267. // Compare the lower single-precision (32-bit) floating-point elements in a and
  1268. // b to see if neither is NaN, store the result in the lower element of dst, and
  1269. // copy the upper 3 packed elements from a to the upper elements of dst.
  1270. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss
  1271. FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
  1272. {
  1273. return _mm_move_ss(a, _mm_cmpord_ps(a, b));
  1274. }
  1275. // Compare packed single-precision (32-bit) floating-point elements in a and b
  1276. // to see if either is NaN, and store the results in dst.
  1277. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps
  1278. FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
  1279. {
  1280. uint32x4_t f32a =
  1281. vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
  1282. uint32x4_t f32b =
  1283. vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
  1284. return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
  1285. }
  1286. // Compare the lower single-precision (32-bit) floating-point elements in a and
  1287. // b to see if either is NaN, store the result in the lower element of dst, and
  1288. // copy the upper 3 packed elements from a to the upper elements of dst.
  1289. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss
  1290. FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
  1291. {
  1292. return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
  1293. }
  1294. // Compare the lower single-precision (32-bit) floating-point element in a and b
  1295. // for equality, and return the boolean result (0 or 1).
  1296. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss
  1297. FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
  1298. {
  1299. uint32x4_t a_eq_b =
  1300. vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
  1301. return vgetq_lane_u32(a_eq_b, 0) & 0x1;
  1302. }
  1303. // Compare the lower single-precision (32-bit) floating-point element in a and b
  1304. // for greater-than-or-equal, and return the boolean result (0 or 1).
  1305. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss
  1306. FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
  1307. {
  1308. uint32x4_t a_ge_b =
  1309. vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
  1310. return vgetq_lane_u32(a_ge_b, 0) & 0x1;
  1311. }
  1312. // Compare the lower single-precision (32-bit) floating-point element in a and b
  1313. // for greater-than, and return the boolean result (0 or 1).
  1314. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss
  1315. FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
  1316. {
  1317. uint32x4_t a_gt_b =
  1318. vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
  1319. return vgetq_lane_u32(a_gt_b, 0) & 0x1;
  1320. }
  1321. // Compare the lower single-precision (32-bit) floating-point element in a and b
  1322. // for less-than-or-equal, and return the boolean result (0 or 1).
  1323. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss
  1324. FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
  1325. {
  1326. uint32x4_t a_le_b =
  1327. vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
  1328. return vgetq_lane_u32(a_le_b, 0) & 0x1;
  1329. }
  1330. // Compare the lower single-precision (32-bit) floating-point element in a and b
  1331. // for less-than, and return the boolean result (0 or 1).
  1332. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss
  1333. FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
  1334. {
  1335. uint32x4_t a_lt_b =
  1336. vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
  1337. return vgetq_lane_u32(a_lt_b, 0) & 0x1;
  1338. }
  1339. // Compare the lower single-precision (32-bit) floating-point element in a and b
  1340. // for not-equal, and return the boolean result (0 or 1).
  1341. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss
  1342. FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
  1343. {
  1344. return !_mm_comieq_ss(a, b);
  1345. }
  1346. // Convert packed signed 32-bit integers in b to packed single-precision
  1347. // (32-bit) floating-point elements, store the results in the lower 2 elements
  1348. // of dst, and copy the upper 2 packed elements from a to the upper elements of
  1349. // dst.
  1350. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps
  1351. FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
  1352. {
  1353. return vreinterpretq_m128_f32(
  1354. vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
  1355. vget_high_f32(vreinterpretq_f32_m128(a))));
  1356. }
  1357. // Convert packed single-precision (32-bit) floating-point elements in a to
  1358. // packed 32-bit integers, and store the results in dst.
  1359. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
  1360. FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
  1361. {
  1362. #if (defined(__aarch64__) || defined(_M_ARM64)) || \
  1363. defined(__ARM_FEATURE_DIRECTED_ROUNDING)
  1364. return vreinterpret_m64_s32(
  1365. vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
  1366. #else
  1367. return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
  1368. vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
  1369. #endif
  1370. }
  1371. // Convert the signed 32-bit integer b to a single-precision (32-bit)
  1372. // floating-point element, store the result in the lower element of dst, and
  1373. // copy the upper 3 packed elements from a to the upper elements of dst.
  1374. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss
  1375. FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
  1376. {
  1377. return vreinterpretq_m128_f32(
  1378. vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
  1379. }
  1380. // Convert the lower single-precision (32-bit) floating-point element in a to a
  1381. // 32-bit integer, and store the result in dst.
  1382. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
  1383. FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
  1384. {
  1385. #if (defined(__aarch64__) || defined(_M_ARM64)) || \
  1386. defined(__ARM_FEATURE_DIRECTED_ROUNDING)
  1387. return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
  1388. 0);
  1389. #else
  1390. float32_t data = vgetq_lane_f32(
  1391. vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
  1392. return (int32_t) data;
  1393. #endif
  1394. }
  1395. // Convert packed 16-bit integers in a to packed single-precision (32-bit)
  1396. // floating-point elements, and store the results in dst.
  1397. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps
  1398. FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
  1399. {
  1400. return vreinterpretq_m128_f32(
  1401. vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
  1402. }
  1403. // Convert packed 32-bit integers in b to packed single-precision (32-bit)
  1404. // floating-point elements, store the results in the lower 2 elements of dst,
  1405. // and copy the upper 2 packed elements from a to the upper elements of dst.
  1406. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps
  1407. FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
  1408. {
  1409. return vreinterpretq_m128_f32(
  1410. vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
  1411. vget_high_f32(vreinterpretq_f32_m128(a))));
  1412. }
  1413. // Convert packed signed 32-bit integers in a to packed single-precision
  1414. // (32-bit) floating-point elements, store the results in the lower 2 elements
  1415. // of dst, then convert the packed signed 32-bit integers in b to
1416. // single-precision (32-bit) floating-point elements, and store the results in
  1417. // the upper 2 elements of dst.
  1418. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps
  1419. FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
  1420. {
  1421. return vreinterpretq_m128_f32(vcvtq_f32_s32(
  1422. vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
  1423. }
  1424. // Convert the lower packed 8-bit integers in a to packed single-precision
  1425. // (32-bit) floating-point elements, and store the results in dst.
  1426. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps
  1427. FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
  1428. {
  1429. return vreinterpretq_m128_f32(vcvtq_f32_s32(
  1430. vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
  1431. }
  1432. // Convert packed single-precision (32-bit) floating-point elements in a to
  1433. // packed 16-bit integers, and store the results in dst. Note: this intrinsic
  1434. // will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
  1435. // 0x7FFFFFFF.
  1436. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16
  1437. FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
  1438. {
  1439. return vreinterpret_m64_s16(
  1440. vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
  1441. }
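// For example, an input lane of 100000.0f saturates to 0x7FFF (32767) rather
// than wrapping, courtesy of the saturating narrow (vqmovn_s32) used above.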
  1442. // Convert packed single-precision (32-bit) floating-point elements in a to
  1443. // packed 32-bit integers, and store the results in dst.
  1444. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32
  1445. #define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
  1446. // Convert packed single-precision (32-bit) floating-point elements in a to
  1447. // packed 8-bit integers, and store the results in lower 4 elements of dst.
  1448. // Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
  1449. // between 0x7F and 0x7FFFFFFF.
  1450. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8
  1451. FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
  1452. {
  1453. return vreinterpret_m64_s8(vqmovn_s16(
  1454. vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0))));
  1455. }
  1456. // Convert packed unsigned 16-bit integers in a to packed single-precision
  1457. // (32-bit) floating-point elements, and store the results in dst.
  1458. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps
  1459. FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
  1460. {
  1461. return vreinterpretq_m128_f32(
  1462. vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
  1463. }
  1464. // Convert the lower packed unsigned 8-bit integers in a to packed
  1465. // single-precision (32-bit) floating-point elements, and store the results in
  1466. // dst.
  1467. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps
  1468. FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
  1469. {
  1470. return vreinterpretq_m128_f32(vcvtq_f32_u32(
  1471. vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
  1472. }
  1473. // Convert the signed 32-bit integer b to a single-precision (32-bit)
  1474. // floating-point element, store the result in the lower element of dst, and
  1475. // copy the upper 3 packed elements from a to the upper elements of dst.
  1476. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss
  1477. #define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
  1478. // Convert the signed 64-bit integer b to a single-precision (32-bit)
  1479. // floating-point element, store the result in the lower element of dst, and
  1480. // copy the upper 3 packed elements from a to the upper elements of dst.
  1481. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss
  1482. FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
  1483. {
  1484. return vreinterpretq_m128_f32(
  1485. vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
  1486. }
  1487. // Copy the lower single-precision (32-bit) floating-point element of a to dst.
  1488. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32
  1489. FORCE_INLINE float _mm_cvtss_f32(__m128 a)
  1490. {
  1491. return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
  1492. }
  1493. // Convert the lower single-precision (32-bit) floating-point element in a to a
  1494. // 32-bit integer, and store the result in dst.
  1495. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32
  1496. #define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
  1497. // Convert the lower single-precision (32-bit) floating-point element in a to a
  1498. // 64-bit integer, and store the result in dst.
  1499. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
  1500. FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
  1501. {
  1502. #if (defined(__aarch64__) || defined(_M_ARM64)) || \
  1503. defined(__ARM_FEATURE_DIRECTED_ROUNDING)
  1504. return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
  1505. #else
  1506. float32_t data = vgetq_lane_f32(
  1507. vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
  1508. return (int64_t) data;
  1509. #endif
  1510. }
  1511. // Convert packed single-precision (32-bit) floating-point elements in a to
  1512. // packed 32-bit integers with truncation, and store the results in dst.
  1513. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi
  1514. FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
  1515. {
  1516. return vreinterpret_m64_s32(
  1517. vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
  1518. }
  1519. // Convert the lower single-precision (32-bit) floating-point element in a to a
  1520. // 32-bit integer with truncation, and store the result in dst.
  1521. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si
  1522. FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
  1523. {
  1524. return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
  1525. }
  1526. // Convert packed single-precision (32-bit) floating-point elements in a to
  1527. // packed 32-bit integers with truncation, and store the results in dst.
  1528. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
  1529. #define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
  1530. // Convert the lower single-precision (32-bit) floating-point element in a to a
  1531. // 32-bit integer with truncation, and store the result in dst.
  1532. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
  1533. #define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
  1534. // Convert the lower single-precision (32-bit) floating-point element in a to a
  1535. // 64-bit integer with truncation, and store the result in dst.
  1536. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
  1537. FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
  1538. {
  1539. return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
  1540. }
  1541. // Divide packed single-precision (32-bit) floating-point elements in a by
  1542. // packed elements in b, and store the results in dst.
1543. // Due to ARMv7-A NEON's lack of a precise division intrinsic, we implement
1544. // division by multiplying a by an estimate of b's reciprocal that is refined
1545. // with the Newton-Raphson method.
  1546. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
  1547. FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
  1548. {
  1549. #if (defined(__aarch64__) || defined(_M_ARM64)) && !SSE2NEON_PRECISE_DIV
  1550. return vreinterpretq_m128_f32(
  1551. vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
  1552. #else
  1553. float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
  1554. recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1555. // Additional Newton-Raphson iteration for accuracy
  1556. recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
  1557. return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
  1558. #endif
  1559. }
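// Illustrative usage sketch (compiled out with #if 0; the example_* helper is
// not part of this header): element-wise division with _mm_div_ps. It assumes
// _mm_set_ps and _mm_storeu_ps, which are defined elsewhere in this file.
#if 0
static void sse2neon_example_div_ps(float out[4])
{
    __m128 num = _mm_set_ps(8.0f, 6.0f, 4.0f, 2.0f); // lanes (low..high): 2, 4, 6, 8
    __m128 den = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); // lanes (low..high): 1, 2, 3, 4
    _mm_storeu_ps(out, _mm_div_ps(num, den));        // every element becomes 2.0f
}
#endif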
  1560. // Divide the lower single-precision (32-bit) floating-point element in a by the
  1561. // lower single-precision (32-bit) floating-point element in b, store the result
  1562. // in the lower element of dst, and copy the upper 3 packed elements from a to
  1563. // the upper elements of dst.
1564. // Warning: on ARMv7-A this does not produce the same result as Intel and is
1565. // not IEEE-compliant.
  1566. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss
  1567. FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
  1568. {
  1569. float32_t value =
  1570. vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
  1571. return vreinterpretq_m128_f32(
  1572. vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
  1573. }
  1574. // Extract a 16-bit integer from a, selected with imm8, and store the result in
  1575. // the lower element of dst.
  1576. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16
  1577. #define _mm_extract_pi16(a, imm) \
  1578. (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
  1579. // Free aligned memory that was allocated with _mm_malloc.
  1580. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free
  1581. #if !defined(SSE2NEON_ALLOC_DEFINED)
  1582. FORCE_INLINE void _mm_free(void *addr)
  1583. {
  1584. free(addr);
  1585. }
  1586. #endif
  1587. FORCE_INLINE uint64_t _sse2neon_get_fpcr(void)
  1588. {
  1589. uint64_t value;
  1590. #if defined(_MSC_VER)
  1591. value = _ReadStatusReg(ARM64_FPCR);
  1592. #else
  1593. __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */
  1594. #endif
  1595. return value;
  1596. }
  1597. FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value)
  1598. {
  1599. #if defined(_MSC_VER)
  1600. _WriteStatusReg(ARM64_FPCR, value);
  1601. #else
  1602. __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */
  1603. #endif
  1604. }
  1605. // Macro: Get the flush zero bits from the MXCSR control and status register.
  1606. // The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
  1607. // _MM_FLUSH_ZERO_OFF
  1608. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE
  1609. FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
  1610. {
  1611. union {
  1612. fpcr_bitfield field;
  1613. #if defined(__aarch64__) || defined(_M_ARM64)
  1614. uint64_t value;
  1615. #else
  1616. uint32_t value;
  1617. #endif
  1618. } r;
  1619. #if defined(__aarch64__) || defined(_M_ARM64)
  1620. r.value = _sse2neon_get_fpcr();
  1621. #else
  1622. __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
  1623. #endif
  1624. return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
  1625. }
  1626. // Macro: Get the rounding mode bits from the MXCSR control and status register.
  1627. // The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
  1628. // _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
  1629. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
  1630. FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
  1631. {
  1632. union {
  1633. fpcr_bitfield field;
  1634. #if defined(__aarch64__) || defined(_M_ARM64)
  1635. uint64_t value;
  1636. #else
  1637. uint32_t value;
  1638. #endif
  1639. } r;
  1640. #if defined(__aarch64__) || defined(_M_ARM64)
  1641. r.value = _sse2neon_get_fpcr();
  1642. #else
  1643. __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
  1644. #endif
  1645. if (r.field.bit22) {
  1646. return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
  1647. } else {
  1648. return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
  1649. }
  1650. }
  1651. // Copy a to dst, and insert the 16-bit integer i into dst at the location
  1652. // specified by imm8.
  1653. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16
  1654. #define _mm_insert_pi16(a, b, imm) \
  1655. vreinterpret_m64_s16(vset_lane_s16((b), vreinterpret_s16_m64(a), (imm)))
  1656. // Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
  1657. // elements) from memory into dst. mem_addr must be aligned on a 16-byte
  1658. // boundary or a general-protection exception may be generated.
  1659. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps
  1660. FORCE_INLINE __m128 _mm_load_ps(const float *p)
  1661. {
  1662. return vreinterpretq_m128_f32(vld1q_f32(p));
  1663. }
  1664. // Load a single-precision (32-bit) floating-point element from memory into all
  1665. // elements of dst.
  1666. //
  1667. // dst[31:0] := MEM[mem_addr+31:mem_addr]
  1668. // dst[63:32] := MEM[mem_addr+31:mem_addr]
  1669. // dst[95:64] := MEM[mem_addr+31:mem_addr]
  1670. // dst[127:96] := MEM[mem_addr+31:mem_addr]
  1671. //
  1672. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1
  1673. #define _mm_load_ps1 _mm_load1_ps
  1674. // Load a single-precision (32-bit) floating-point element from memory into the
  1675. // lower of dst, and zero the upper 3 elements. mem_addr does not need to be
  1676. // aligned on any particular boundary.
  1677. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss
  1678. FORCE_INLINE __m128 _mm_load_ss(const float *p)
  1679. {
  1680. return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
  1681. }
  1682. // Load a single-precision (32-bit) floating-point element from memory into all
  1683. // elements of dst.
  1684. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps
  1685. FORCE_INLINE __m128 _mm_load1_ps(const float *p)
  1686. {
  1687. return vreinterpretq_m128_f32(vld1q_dup_f32(p));
  1688. }
  1689. // Load 2 single-precision (32-bit) floating-point elements from memory into the
  1690. // upper 2 elements of dst, and copy the lower 2 elements from a to dst.
  1691. // mem_addr does not need to be aligned on any particular boundary.
  1692. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi
  1693. FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
  1694. {
  1695. return vreinterpretq_m128_f32(
  1696. vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
  1697. }
  1698. // Load 2 single-precision (32-bit) floating-point elements from memory into the
  1699. // lower 2 elements of dst, and copy the upper 2 elements from a to dst.
  1700. // mem_addr does not need to be aligned on any particular boundary.
  1701. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi
  1702. FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
  1703. {
  1704. return vreinterpretq_m128_f32(
  1705. vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
  1706. }
  1707. // Load 4 single-precision (32-bit) floating-point elements from memory into dst
  1708. // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
  1709. // general-protection exception may be generated.
  1710. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps
  1711. FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
  1712. {
  1713. float32x4_t v = vrev64q_f32(vld1q_f32(p));
  1714. return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
  1715. }
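// Illustrative usage sketch (compiled out; the example_* helper is not part of
// this header): _mm_loadr_ps reverses the element order relative to memory.
// It assumes _mm_storeu_ps, defined later in this file.
#if 0
static void sse2neon_example_loadr_ps(float out[4])
{
    float ALIGN_STRUCT(16) in[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    __m128 v = _mm_loadr_ps(in); // lanes (low..high): 4, 3, 2, 1
    _mm_storeu_ps(out, v);       // out = {4.0f, 3.0f, 2.0f, 1.0f}
}
#endif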
  1716. // Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
  1717. // elements) from memory into dst. mem_addr does not need to be aligned on any
  1718. // particular boundary.
  1719. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps
  1720. FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
  1721. {
1722. // For NEON, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
1723. // equivalent.
  1724. return vreinterpretq_m128_f32(vld1q_f32(p));
  1725. }
  1726. // Load unaligned 16-bit integer from memory into the first element of dst.
  1727. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16
  1728. FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
  1729. {
  1730. return vreinterpretq_m128i_s16(
  1731. vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
  1732. }
  1733. // Load unaligned 64-bit integer from memory into the first element of dst.
  1734. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64
  1735. FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
  1736. {
  1737. return vreinterpretq_m128i_s64(
  1738. vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
  1739. }
  1740. // Allocate size bytes of memory, aligned to the alignment specified in align,
  1741. // and return a pointer to the allocated memory. _mm_free should be used to free
  1742. // memory that is allocated with _mm_malloc.
  1743. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc
  1744. #if !defined(SSE2NEON_ALLOC_DEFINED)
  1745. FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
  1746. {
  1747. void *ptr;
  1748. if (align == 1)
  1749. return malloc(size);
  1750. if (align == 2 || (sizeof(void *) == 8 && align == 4))
  1751. align = sizeof(void *);
  1752. if (!posix_memalign(&ptr, align, size))
  1753. return ptr;
  1754. return NULL;
  1755. }
  1756. #endif
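// Illustrative usage sketch (compiled out; the example_* helper is not part of
// this header): allocate a 16-byte-aligned buffer suitable for _mm_load_ps /
// _mm_store_ps and release it later with _mm_free.
#if 0
static float *sse2neon_example_aligned_alloc(size_t n)
{
    float *buf = (float *) _mm_malloc(n * sizeof(float), 16);
    if (!buf)
        return NULL;
    // ... use buf with the aligned load/store intrinsics ...
    // The caller is expected to release it with _mm_free(buf).
    return buf;
}
#endif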
  1757. // Conditionally store 8-bit integer elements from a into memory using mask
  1758. // (elements are not stored when the highest bit is not set in the corresponding
  1759. // element) and a non-temporal memory hint.
  1760. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64
  1761. FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
  1762. {
  1763. int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
  1764. __m128 b = _mm_load_ps((const float *) mem_addr);
  1765. int8x8_t masked =
  1766. vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
  1767. vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
  1768. vst1_s8((int8_t *) mem_addr, masked);
  1769. }
  1770. // Conditionally store 8-bit integer elements from a into memory using mask
  1771. // (elements are not stored when the highest bit is not set in the corresponding
  1772. // element) and a non-temporal memory hint.
  1773. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq
  1774. #define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
  1775. // Compare packed signed 16-bit integers in a and b, and store packed maximum
  1776. // values in dst.
  1777. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16
  1778. FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
  1779. {
  1780. return vreinterpret_m64_s16(
  1781. vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
  1782. }
  1783. // Compare packed single-precision (32-bit) floating-point elements in a and b,
  1784. // and store packed maximum values in dst. dst does not follow the IEEE Standard
  1785. // for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or
  1786. // signed-zero values.
  1787. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps
  1788. FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
  1789. {
  1790. #if SSE2NEON_PRECISE_MINMAX
  1791. float32x4_t _a = vreinterpretq_f32_m128(a);
  1792. float32x4_t _b = vreinterpretq_f32_m128(b);
  1793. return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));
  1794. #else
  1795. return vreinterpretq_m128_f32(
  1796. vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
  1797. #endif
  1798. }
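// Illustrative usage sketch (compiled out; the example_* helper is not part of
// this header): lane-wise maximum with _mm_max_ps. With SSE2NEON_PRECISE_MINMAX
// enabled, the bit-select path mirrors SSE's rule of returning the second
// operand for unordered (NaN) comparisons; the plain vmaxq_f32 path may differ.
#if 0
static void sse2neon_example_max_ps(float out[4])
{
    __m128 a = _mm_set_ps(1.0f, -5.0f, 3.0f, 0.0f);  // lanes: 0, 3, -5, 1
    __m128 b = _mm_set_ps(2.0f, -7.0f, 3.5f, -1.0f); // lanes: -1, 3.5, -7, 2
    _mm_storeu_ps(out, _mm_max_ps(a, b));            // out = {0.0f, 3.5f, -5.0f, 2.0f}
}
#endif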
  1799. // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
  1800. // values in dst.
  1801. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8
  1802. FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
  1803. {
  1804. return vreinterpret_m64_u8(
  1805. vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
  1806. }
  1807. // Compare the lower single-precision (32-bit) floating-point elements in a and
  1808. // b, store the maximum value in the lower element of dst, and copy the upper 3
  1809. // packed elements from a to the upper element of dst. dst does not follow the
  1810. // IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when
  1811. // inputs are NaN or signed-zero values.
  1812. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss
  1813. FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
  1814. {
  1815. float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
  1816. return vreinterpretq_m128_f32(
  1817. vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
  1818. }
  1819. // Compare packed signed 16-bit integers in a and b, and store packed minimum
  1820. // values in dst.
  1821. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16
  1822. FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
  1823. {
  1824. return vreinterpret_m64_s16(
  1825. vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
  1826. }
  1827. // Compare packed single-precision (32-bit) floating-point elements in a and b,
  1828. // and store packed minimum values in dst. dst does not follow the IEEE Standard
  1829. // for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or
  1830. // signed-zero values.
  1831. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps
  1832. FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
  1833. {
  1834. #if SSE2NEON_PRECISE_MINMAX
  1835. float32x4_t _a = vreinterpretq_f32_m128(a);
  1836. float32x4_t _b = vreinterpretq_f32_m128(b);
  1837. return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));
  1838. #else
  1839. return vreinterpretq_m128_f32(
  1840. vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
  1841. #endif
  1842. }
  1843. // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
  1844. // values in dst.
  1845. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8
  1846. FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
  1847. {
  1848. return vreinterpret_m64_u8(
  1849. vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
  1850. }
  1851. // Compare the lower single-precision (32-bit) floating-point elements in a and
  1852. // b, store the minimum value in the lower element of dst, and copy the upper 3
  1853. // packed elements from a to the upper element of dst. dst does not follow the
  1854. // IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
  1855. // inputs are NaN or signed-zero values.
  1856. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss
  1857. FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
  1858. {
  1859. float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
  1860. return vreinterpretq_m128_f32(
  1861. vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
  1862. }
  1863. // Move the lower single-precision (32-bit) floating-point element from b to the
  1864. // lower element of dst, and copy the upper 3 packed elements from a to the
  1865. // upper elements of dst.
  1866. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss
  1867. FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
  1868. {
  1869. return vreinterpretq_m128_f32(
  1870. vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
  1871. vreinterpretq_f32_m128(a), 0));
  1872. }
  1873. // Move the upper 2 single-precision (32-bit) floating-point elements from b to
  1874. // the lower 2 elements of dst, and copy the upper 2 elements from a to the
  1875. // upper 2 elements of dst.
  1876. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps
  1877. FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
  1878. {
1879. #if defined(__aarch64__) || defined(_M_ARM64)
  1880. return vreinterpretq_m128_u64(
  1881. vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a)));
  1882. #else
  1883. float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
  1884. float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
  1885. return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
  1886. #endif
  1887. }
  1888. // Move the lower 2 single-precision (32-bit) floating-point elements from b to
  1889. // the upper 2 elements of dst, and copy the lower 2 elements from a to the
  1890. // lower 2 elements of dst.
  1891. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps
  1892. FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
  1893. {
  1894. float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
  1895. float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
  1896. return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
  1897. }
  1898. // Create mask from the most significant bit of each 8-bit element in a, and
  1899. // store the result in dst.
  1900. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
  1901. FORCE_INLINE int _mm_movemask_pi8(__m64 a)
  1902. {
  1903. uint8x8_t input = vreinterpret_u8_m64(a);
  1904. #if defined(__aarch64__) || defined(_M_ARM64)
  1905. static const int8_t shift[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  1906. uint8x8_t tmp = vshr_n_u8(input, 7);
  1907. return vaddv_u8(vshl_u8(tmp, vld1_s8(shift)));
  1908. #else
1909. // Refer to the implementation of `_mm_movemask_epi8`
  1910. uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
  1911. uint32x2_t paired16 =
  1912. vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
  1913. uint8x8_t paired32 =
  1914. vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
  1915. return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
  1916. #endif
  1917. }
  1918. // Set each bit of mask dst based on the most significant bit of the
  1919. // corresponding packed single-precision (32-bit) floating-point element in a.
  1920. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps
  1921. FORCE_INLINE int _mm_movemask_ps(__m128 a)
  1922. {
  1923. uint32x4_t input = vreinterpretq_u32_m128(a);
  1924. #if defined(__aarch64__) || defined(_M_ARM64)
  1925. static const int32_t shift[4] = {0, 1, 2, 3};
  1926. uint32x4_t tmp = vshrq_n_u32(input, 31);
  1927. return vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)));
  1928. #else
  1929. // Uses the exact same method as _mm_movemask_epi8, see that for details.
  1930. // Shift out everything but the sign bits with a 32-bit unsigned shift
  1931. // right.
  1932. uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
  1933. // Merge the two pairs together with a 64-bit unsigned shift right + add.
  1934. uint8x16_t paired =
  1935. vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
  1936. // Extract the result.
  1937. return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
  1938. #endif
  1939. }
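// Illustrative usage sketch (compiled out; the example_* helper is not part of
// this header): _mm_movemask_ps packs the four sign bits into bits 0..3 of the
// returned integer, with lane 0 landing in bit 0.
#if 0
static int sse2neon_example_movemask_ps(void)
{
    __m128 v = _mm_set_ps(4.0f, -3.0f, 2.0f, -1.0f); // lanes: -1, 2, -3, 4
    return _mm_movemask_ps(v);                       // 0b0101 == 5
}
#endif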
  1940. // Multiply packed single-precision (32-bit) floating-point elements in a and b,
  1941. // and store the results in dst.
  1942. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps
  1943. FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
  1944. {
  1945. return vreinterpretq_m128_f32(
  1946. vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
  1947. }
  1948. // Multiply the lower single-precision (32-bit) floating-point element in a and
  1949. // b, store the result in the lower element of dst, and copy the upper 3 packed
  1950. // elements from a to the upper elements of dst.
  1951. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss
  1952. FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
  1953. {
  1954. return _mm_move_ss(a, _mm_mul_ps(a, b));
  1955. }
  1956. // Multiply the packed unsigned 16-bit integers in a and b, producing
  1957. // intermediate 32-bit integers, and store the high 16 bits of the intermediate
  1958. // integers in dst.
  1959. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16
  1960. FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
  1961. {
  1962. return vreinterpret_m64_u16(vshrn_n_u32(
  1963. vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
  1964. }
  1965. // Compute the bitwise OR of packed single-precision (32-bit) floating-point
  1966. // elements in a and b, and store the results in dst.
  1967. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps
  1968. FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
  1969. {
  1970. return vreinterpretq_m128_s32(
  1971. vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
  1972. }
  1973. // Average packed unsigned 8-bit integers in a and b, and store the results in
  1974. // dst.
  1975. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb
  1976. #define _m_pavgb(a, b) _mm_avg_pu8(a, b)
  1977. // Average packed unsigned 16-bit integers in a and b, and store the results in
  1978. // dst.
  1979. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw
  1980. #define _m_pavgw(a, b) _mm_avg_pu16(a, b)
  1981. // Extract a 16-bit integer from a, selected with imm8, and store the result in
  1982. // the lower element of dst.
  1983. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw
  1984. #define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
  1985. // Copy a to dst, and insert the 16-bit integer i into dst at the location
  1986. // specified by imm8.
1987. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pinsrw
  1988. #define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
  1989. // Compare packed signed 16-bit integers in a and b, and store packed maximum
  1990. // values in dst.
  1991. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw
  1992. #define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
  1993. // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
  1994. // values in dst.
  1995. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub
  1996. #define _m_pmaxub(a, b) _mm_max_pu8(a, b)
  1997. // Compare packed signed 16-bit integers in a and b, and store packed minimum
  1998. // values in dst.
  1999. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw
  2000. #define _m_pminsw(a, b) _mm_min_pi16(a, b)
  2001. // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
  2002. // values in dst.
  2003. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub
  2004. #define _m_pminub(a, b) _mm_min_pu8(a, b)
  2005. // Create mask from the most significant bit of each 8-bit element in a, and
  2006. // store the result in dst.
  2007. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
  2008. #define _m_pmovmskb(a) _mm_movemask_pi8(a)
  2009. // Multiply the packed unsigned 16-bit integers in a and b, producing
  2010. // intermediate 32-bit integers, and store the high 16 bits of the intermediate
  2011. // integers in dst.
  2012. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
  2013. #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
  2014. // Fetch the line of data from memory that contains address p to a location in
  2015. // the cache hierarchy specified by the locality hint i.
  2016. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch
  2017. FORCE_INLINE void _mm_prefetch(char const *p, int i)
  2018. {
  2019. (void) i;
  2020. #if defined(_MSC_VER)
  2021. switch (i) {
  2022. case _MM_HINT_NTA:
  2023. __prefetch2(p, 1);
  2024. break;
  2025. case _MM_HINT_T0:
  2026. __prefetch2(p, 0);
  2027. break;
  2028. case _MM_HINT_T1:
  2029. __prefetch2(p, 2);
  2030. break;
  2031. case _MM_HINT_T2:
  2032. __prefetch2(p, 4);
  2033. break;
  2034. }
  2035. #else
  2036. switch (i) {
  2037. case _MM_HINT_NTA:
  2038. __builtin_prefetch(p, 0, 0);
  2039. break;
  2040. case _MM_HINT_T0:
  2041. __builtin_prefetch(p, 0, 3);
  2042. break;
  2043. case _MM_HINT_T1:
  2044. __builtin_prefetch(p, 0, 2);
  2045. break;
  2046. case _MM_HINT_T2:
  2047. __builtin_prefetch(p, 0, 1);
  2048. break;
  2049. }
  2050. #endif
  2051. }
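// Illustrative usage sketch (compiled out; the example_* helper is not part of
// this header): prefetching a fixed distance ahead of a streaming loop. The
// hint is purely a performance suggestion; correctness never depends on it.
#if 0
static float sse2neon_example_prefetch_sum(const float *data, size_t n)
{
    float sum = 0.0f;
    for (size_t i = 0; i < n; i++) {
        if (i + 16 < n)
            _mm_prefetch((const char *) &data[i + 16], _MM_HINT_T0);
        sum += data[i];
    }
    return sum;
}
#endif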
  2052. // Compute the absolute differences of packed unsigned 8-bit integers in a and
  2053. // b, then horizontally sum each consecutive 8 differences to produce four
  2054. // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
  2055. // 16 bits of dst.
2056. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_psadbw
  2057. #define _m_psadbw(a, b) _mm_sad_pu8(a, b)
  2058. // Shuffle 16-bit integers in a using the control in imm8, and store the results
  2059. // in dst.
  2060. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
  2061. #define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
  2062. // Compute the approximate reciprocal of packed single-precision (32-bit)
  2063. // floating-point elements in a, and store the results in dst. The maximum
  2064. // relative error for this approximation is less than 1.5*2^-12.
  2065. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps
  2066. FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
  2067. {
  2068. float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
  2069. recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
  2070. #if SSE2NEON_PRECISE_DIV
2071. // Additional Newton-Raphson iteration for accuracy
  2072. recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
  2073. #endif
  2074. return vreinterpretq_m128_f32(recip);
  2075. }
  2076. // Compute the approximate reciprocal of the lower single-precision (32-bit)
  2077. // floating-point element in a, store the result in the lower element of dst,
  2078. // and copy the upper 3 packed elements from a to the upper elements of dst. The
  2079. // maximum relative error for this approximation is less than 1.5*2^-12.
  2080. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss
  2081. FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
  2082. {
  2083. return _mm_move_ss(a, _mm_rcp_ps(a));
  2084. }
  2085. // Compute the approximate reciprocal square root of packed single-precision
  2086. // (32-bit) floating-point elements in a, and store the results in dst. The
  2087. // maximum relative error for this approximation is less than 1.5*2^-12.
  2088. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps
  2089. FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
  2090. {
  2091. float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
  2092. // Generate masks for detecting whether input has any 0.0f/-0.0f
  2093. // (which becomes positive/negative infinity by IEEE-754 arithmetic rules).
  2094. const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
  2095. const uint32x4_t neg_inf = vdupq_n_u32(0xFF800000);
  2096. const uint32x4_t has_pos_zero =
  2097. vceqq_u32(pos_inf, vreinterpretq_u32_f32(out));
  2098. const uint32x4_t has_neg_zero =
  2099. vceqq_u32(neg_inf, vreinterpretq_u32_f32(out));
  2100. out = vmulq_f32(
  2101. out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
  2102. #if SSE2NEON_PRECISE_SQRT
2103. // Additional Newton-Raphson iteration for accuracy
  2104. out = vmulq_f32(
  2105. out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
  2106. #endif
  2107. // Set output vector element to infinity/negative-infinity if
  2108. // the corresponding input vector element is 0.0f/-0.0f.
  2109. out = vbslq_f32(has_pos_zero, (float32x4_t) pos_inf, out);
  2110. out = vbslq_f32(has_neg_zero, (float32x4_t) neg_inf, out);
  2111. return vreinterpretq_m128_f32(out);
  2112. }
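// Illustrative usage sketch (compiled out; the example_* helper is not part of
// this header): approximate reciprocal square roots. Expect values close to,
// but not exactly, the mathematical results (relative error below 1.5*2^-12).
#if 0
static void sse2neon_example_rsqrt_ps(float out[4])
{
    __m128 v = _mm_set_ps(64.0f, 16.0f, 4.0f, 1.0f); // lanes: 1, 4, 16, 64
    _mm_storeu_ps(out, _mm_rsqrt_ps(v));             // roughly {1, 0.5, 0.25, 0.125}
}
#endif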
  2113. // Compute the approximate reciprocal square root of the lower single-precision
  2114. // (32-bit) floating-point element in a, store the result in the lower element
  2115. // of dst, and copy the upper 3 packed elements from a to the upper elements of
  2116. // dst.
  2117. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss
  2118. FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
  2119. {
  2120. return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
  2121. }
  2122. // Compute the absolute differences of packed unsigned 8-bit integers in a and
  2123. // b, then horizontally sum each consecutive 8 differences to produce four
  2124. // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
  2125. // 16 bits of dst.
  2126. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8
  2127. FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
  2128. {
  2129. uint64x1_t t = vpaddl_u32(vpaddl_u16(
  2130. vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
  2131. return vreinterpret_m64_u16(
  2132. vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
  2133. }
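// Illustrative usage sketch (compiled out; the example_* helper is not part of
// this header): sum of absolute differences over eight bytes, extracting the
// 16-bit result from the low lane of the returned __m64.
#if 0
static uint16_t sse2neon_example_sad_pu8(void)
{
    const uint8_t da[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    const uint8_t db[8] = {0, 0, 0, 0, 7, 7, 7, 7};
    __m64 a = vreinterpret_m64_u8(vld1_u8(da));
    __m64 b = vreinterpret_m64_u8(vld1_u8(db));
    // (1+2+3+4) + (2+1+0+1) = 14
    return vget_lane_u16(vreinterpret_u16_m64(_mm_sad_pu8(a, b)), 0);
}
#endif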
  2134. // Macro: Set the flush zero bits of the MXCSR control and status register to
  2135. // the value in unsigned 32-bit integer a. The flush zero may contain any of the
  2136. // following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
  2137. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE
  2138. FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
  2139. {
  2140. // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
  2141. // regardless of the value of the FZ bit.
  2142. union {
  2143. fpcr_bitfield field;
  2144. #if defined(__aarch64__) || defined(_M_ARM64)
  2145. uint64_t value;
  2146. #else
  2147. uint32_t value;
  2148. #endif
  2149. } r;
  2150. #if defined(__aarch64__) || defined(_M_ARM64)
  2151. r.value = _sse2neon_get_fpcr();
  2152. #else
  2153. __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
  2154. #endif
  2155. r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
  2156. #if defined(__aarch64__) || defined(_M_ARM64)
  2157. _sse2neon_set_fpcr(r.value);
  2158. #else
  2159. __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
  2160. #endif
  2161. }
  2162. // Set packed single-precision (32-bit) floating-point elements in dst with the
  2163. // supplied values.
  2164. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps
  2165. FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
  2166. {
  2167. float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
  2168. return vreinterpretq_m128_f32(vld1q_f32(data));
  2169. }
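// Illustrative usage sketch (compiled out; the example_* helper is not part of
// this header): _mm_set_ps takes its arguments from the highest lane down to
// the lowest, so the last argument ends up in lane 0 (the first float in
// memory after a store). It assumes _mm_storeu_ps, defined later in this file.
#if 0
static void sse2neon_example_set_ps(float out[4])
{
    __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
    _mm_storeu_ps(out, v); // out = {1.0f, 2.0f, 3.0f, 4.0f}
}
#endif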
  2170. // Broadcast single-precision (32-bit) floating-point value a to all elements of
  2171. // dst.
  2172. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1
  2173. FORCE_INLINE __m128 _mm_set_ps1(float _w)
  2174. {
  2175. return vreinterpretq_m128_f32(vdupq_n_f32(_w));
  2176. }
  2177. // Macro: Set the rounding mode bits of the MXCSR control and status register to
  2178. // the value in unsigned 32-bit integer a. The rounding mode may contain any of
  2179. // the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
  2180. // _MM_ROUND_TOWARD_ZERO
  2181. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
  2182. FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
  2183. {
  2184. union {
  2185. fpcr_bitfield field;
  2186. #if defined(__aarch64__) || defined(_M_ARM64)
  2187. uint64_t value;
  2188. #else
  2189. uint32_t value;
  2190. #endif
  2191. } r;
  2192. #if defined(__aarch64__) || defined(_M_ARM64)
  2193. r.value = _sse2neon_get_fpcr();
  2194. #else
  2195. __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
  2196. #endif
  2197. switch (rounding) {
  2198. case _MM_ROUND_TOWARD_ZERO:
  2199. r.field.bit22 = 1;
  2200. r.field.bit23 = 1;
  2201. break;
  2202. case _MM_ROUND_DOWN:
  2203. r.field.bit22 = 0;
  2204. r.field.bit23 = 1;
  2205. break;
  2206. case _MM_ROUND_UP:
  2207. r.field.bit22 = 1;
  2208. r.field.bit23 = 0;
  2209. break;
  2210. default: //_MM_ROUND_NEAREST
  2211. r.field.bit22 = 0;
  2212. r.field.bit23 = 0;
  2213. }
  2214. #if defined(__aarch64__) || defined(_M_ARM64)
  2215. _sse2neon_set_fpcr(r.value);
  2216. #else
  2217. __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
  2218. #endif
  2219. }
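// Illustrative usage sketch (compiled out; the example_* helper is not part of
// this header): temporarily switching the rounding mode before a
// current-direction conversion, then restoring the previous mode. It assumes
// _mm_set_ss, defined just below in this file.
#if 0
static int sse2neon_example_round_toward_zero(void)
{
    unsigned int saved = _MM_GET_ROUNDING_MODE();
    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    int r = _mm_cvtss_si32(_mm_set_ss(1.7f)); // 1 here; 2 under round-to-nearest
    _MM_SET_ROUNDING_MODE(saved);
    return r;
}
#endif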
  2220. // Copy single-precision (32-bit) floating-point element a to the lower element
  2221. // of dst, and zero the upper 3 elements.
  2222. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss
  2223. FORCE_INLINE __m128 _mm_set_ss(float a)
  2224. {
  2225. return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));
  2226. }
  2227. // Broadcast single-precision (32-bit) floating-point value a to all elements of
  2228. // dst.
  2229. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps
  2230. FORCE_INLINE __m128 _mm_set1_ps(float _w)
  2231. {
  2232. return vreinterpretq_m128_f32(vdupq_n_f32(_w));
  2233. }
  2234. // Set the MXCSR control and status register with the value in unsigned 32-bit
  2235. // integer a.
  2236. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr
  2237. // FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
  2238. FORCE_INLINE void _mm_setcsr(unsigned int a)
  2239. {
  2240. _MM_SET_ROUNDING_MODE(a);
  2241. }
  2242. // Get the unsigned 32-bit value of the MXCSR control and status register.
  2243. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr
  2244. // FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
  2245. FORCE_INLINE unsigned int _mm_getcsr(void)
  2246. {
  2247. return _MM_GET_ROUNDING_MODE();
  2248. }
  2249. // Set packed single-precision (32-bit) floating-point elements in dst with the
  2250. // supplied values in reverse order.
  2251. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps
  2252. FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
  2253. {
  2254. float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
  2255. return vreinterpretq_m128_f32(vld1q_f32(data));
  2256. }
  2257. // Return vector of type __m128 with all elements set to zero.
  2258. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps
  2259. FORCE_INLINE __m128 _mm_setzero_ps(void)
  2260. {
  2261. return vreinterpretq_m128_f32(vdupq_n_f32(0));
  2262. }
  2263. // Shuffle 16-bit integers in a using the control in imm8, and store the results
  2264. // in dst.
  2265. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
  2266. #ifdef _sse2neon_shuffle
  2267. #define _mm_shuffle_pi16(a, imm) \
  2268. vreinterpret_m64_s16(vshuffle_s16( \
  2269. vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
  2270. ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3)))
  2271. #else
  2272. #define _mm_shuffle_pi16(a, imm) \
  2273. _sse2neon_define1( \
  2274. __m64, a, int16x4_t ret; \
  2275. ret = vmov_n_s16( \
  2276. vget_lane_s16(vreinterpret_s16_m64(_a), (imm) & (0x3))); \
  2277. ret = vset_lane_s16( \
  2278. vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 2) & 0x3), ret, \
  2279. 1); \
  2280. ret = vset_lane_s16( \
  2281. vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 4) & 0x3), ret, \
  2282. 2); \
  2283. ret = vset_lane_s16( \
  2284. vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 6) & 0x3), ret, \
  2285. 3); \
  2286. _sse2neon_return(vreinterpret_m64_s16(ret));)
  2287. #endif
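// Illustrative usage sketch (compiled out; the example_* helper is not part of
// this header): reversing four 16-bit lanes with _mm_shuffle_pi16 and the
// _MM_SHUFFLE selector macro.
#if 0
static void sse2neon_example_shuffle_pi16(int16_t out[4])
{
    const int16_t in[4] = {10, 20, 30, 40};
    __m64 a = vreinterpret_m64_s16(vld1_s16(in));
    __m64 r = _mm_shuffle_pi16(a, _MM_SHUFFLE(0, 1, 2, 3)); // pick lanes 3, 2, 1, 0
    vst1_s16(out, vreinterpret_s16_m64(r)); // out = {40, 30, 20, 10}
}
#endif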
2288. // Perform a serializing operation on all store-to-memory instructions that were
2289. // issued prior to this instruction. Guarantees that every store instruction
2290. // that precedes, in program order, the store fence instruction is globally
2291. // visible before any store instruction which follows the fence in program order.
  2292. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence
  2293. FORCE_INLINE void _mm_sfence(void)
  2294. {
  2295. _sse2neon_smp_mb();
  2296. }
  2297. // Perform a serializing operation on all load-from-memory and store-to-memory
  2298. // instructions that were issued prior to this instruction. Guarantees that
  2299. // every memory access that precedes, in program order, the memory fence
  2300. // instruction is globally visible before any memory instruction which follows
  2301. // the fence in program order.
  2302. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence
  2303. FORCE_INLINE void _mm_mfence(void)
  2304. {
  2305. _sse2neon_smp_mb();
  2306. }
2307. // Perform a serializing operation on all load-from-memory instructions that
2308. // were issued prior to this instruction. Guarantees that every load instruction
2309. // that precedes, in program order, the load fence instruction is globally
2310. // visible before any load instruction which follows the fence in program order.
  2311. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence
  2312. FORCE_INLINE void _mm_lfence(void)
  2313. {
  2314. _sse2neon_smp_mb();
  2315. }
  2316. // FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
  2317. // int imm)
  2318. #ifdef _sse2neon_shuffle
  2319. #define _mm_shuffle_ps(a, b, imm) \
  2320. __extension__({ \
  2321. float32x4_t _input1 = vreinterpretq_f32_m128(a); \
  2322. float32x4_t _input2 = vreinterpretq_f32_m128(b); \
  2323. float32x4_t _shuf = \
  2324. vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
  2325. (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
  2326. vreinterpretq_m128_f32(_shuf); \
  2327. })
  2328. #else // generic
  2329. #define _mm_shuffle_ps(a, b, imm) \
  2330. _sse2neon_define2( \
  2331. __m128, a, b, __m128 ret; switch (imm) { \
  2332. case _MM_SHUFFLE(1, 0, 3, 2): \
  2333. ret = _mm_shuffle_ps_1032(_a, _b); \
  2334. break; \
  2335. case _MM_SHUFFLE(2, 3, 0, 1): \
  2336. ret = _mm_shuffle_ps_2301(_a, _b); \
  2337. break; \
  2338. case _MM_SHUFFLE(0, 3, 2, 1): \
  2339. ret = _mm_shuffle_ps_0321(_a, _b); \
  2340. break; \
  2341. case _MM_SHUFFLE(2, 1, 0, 3): \
  2342. ret = _mm_shuffle_ps_2103(_a, _b); \
  2343. break; \
  2344. case _MM_SHUFFLE(1, 0, 1, 0): \
  2345. ret = _mm_movelh_ps(_a, _b); \
  2346. break; \
  2347. case _MM_SHUFFLE(1, 0, 0, 1): \
  2348. ret = _mm_shuffle_ps_1001(_a, _b); \
  2349. break; \
  2350. case _MM_SHUFFLE(0, 1, 0, 1): \
  2351. ret = _mm_shuffle_ps_0101(_a, _b); \
  2352. break; \
  2353. case _MM_SHUFFLE(3, 2, 1, 0): \
  2354. ret = _mm_shuffle_ps_3210(_a, _b); \
  2355. break; \
  2356. case _MM_SHUFFLE(0, 0, 1, 1): \
  2357. ret = _mm_shuffle_ps_0011(_a, _b); \
  2358. break; \
  2359. case _MM_SHUFFLE(0, 0, 2, 2): \
  2360. ret = _mm_shuffle_ps_0022(_a, _b); \
  2361. break; \
  2362. case _MM_SHUFFLE(2, 2, 0, 0): \
  2363. ret = _mm_shuffle_ps_2200(_a, _b); \
  2364. break; \
  2365. case _MM_SHUFFLE(3, 2, 0, 2): \
  2366. ret = _mm_shuffle_ps_3202(_a, _b); \
  2367. break; \
  2368. case _MM_SHUFFLE(3, 2, 3, 2): \
  2369. ret = _mm_movehl_ps(_b, _a); \
  2370. break; \
  2371. case _MM_SHUFFLE(1, 1, 3, 3): \
  2372. ret = _mm_shuffle_ps_1133(_a, _b); \
  2373. break; \
  2374. case _MM_SHUFFLE(2, 0, 1, 0): \
  2375. ret = _mm_shuffle_ps_2010(_a, _b); \
  2376. break; \
  2377. case _MM_SHUFFLE(2, 0, 0, 1): \
  2378. ret = _mm_shuffle_ps_2001(_a, _b); \
  2379. break; \
  2380. case _MM_SHUFFLE(2, 0, 3, 2): \
  2381. ret = _mm_shuffle_ps_2032(_a, _b); \
  2382. break; \
  2383. default: \
  2384. ret = _mm_shuffle_ps_default(_a, _b, (imm)); \
  2385. break; \
  2386. } _sse2neon_return(ret);)
  2387. #endif
  2388. // Compute the square root of packed single-precision (32-bit) floating-point
  2389. // elements in a, and store the results in dst.
  2390. // Due to ARMv7-A NEON's lack of a precise square root intrinsic, we implement
  2391. // square root by multiplying input in with its reciprocal square root before
  2392. // using the Newton-Raphson method to approximate the results.
  2393. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
  2394. FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
  2395. {
  2396. #if (defined(__aarch64__) || defined(_M_ARM64)) && !SSE2NEON_PRECISE_SQRT
  2397. return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
  2398. #else
  2399. float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
  2400. // Test for vrsqrteq_f32(0) -> positive infinity case.
  2401. // Change to zero, so that s * 1/sqrt(s) result is zero too.
  2402. const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
  2403. const uint32x4_t div_by_zero =
  2404. vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
  2405. recip = vreinterpretq_f32_u32(
  2406. vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
  2407. recip = vmulq_f32(
  2408. vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
  2409. recip);
2410. // Additional Newton-Raphson iteration for accuracy
  2411. recip = vmulq_f32(
  2412. vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
  2413. recip);
  2414. // sqrt(s) = s * 1/sqrt(s)
  2415. return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
  2416. #endif
  2417. }
  2418. // Compute the square root of the lower single-precision (32-bit) floating-point
  2419. // element in a, store the result in the lower element of dst, and copy the
  2420. // upper 3 packed elements from a to the upper elements of dst.
  2421. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss
  2422. FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
  2423. {
  2424. float32_t value =
  2425. vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
  2426. return vreinterpretq_m128_f32(
  2427. vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
  2428. }
  2429. // Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
  2430. // elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
  2431. // or a general-protection exception may be generated.
  2432. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps
  2433. FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
  2434. {
  2435. vst1q_f32(p, vreinterpretq_f32_m128(a));
  2436. }
  2437. // Store the lower single-precision (32-bit) floating-point element from a into
  2438. // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
  2439. // boundary or a general-protection exception may be generated.
  2440. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
  2441. FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
  2442. {
  2443. float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
  2444. vst1q_f32(p, vdupq_n_f32(a0));
  2445. }
  2446. // Store the lower single-precision (32-bit) floating-point element from a into
  2447. // memory. mem_addr does not need to be aligned on any particular boundary.
  2448. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss
  2449. FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
  2450. {
  2451. vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
  2452. }
  2453. // Store the lower single-precision (32-bit) floating-point element from a into
  2454. // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
  2455. // boundary or a general-protection exception may be generated.
  2456. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
  2457. #define _mm_store1_ps _mm_store_ps1
  2458. // Store the upper 2 single-precision (32-bit) floating-point elements from a
  2459. // into memory.
  2460. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi
  2461. FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
  2462. {
  2463. *p = vreinterpret_m64_f32(vget_high_f32(a));
  2464. }
  2465. // Store the lower 2 single-precision (32-bit) floating-point elements from a
  2466. // into memory.
  2467. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi
  2468. FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
  2469. {
  2470. *p = vreinterpret_m64_f32(vget_low_f32(a));
  2471. }
  2472. // Store 4 single-precision (32-bit) floating-point elements from a into memory
  2473. // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
  2474. // general-protection exception may be generated.
  2475. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps
  2476. FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
  2477. {
  2478. float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
  2479. float32x4_t rev = vextq_f32(tmp, tmp, 2);
  2480. vst1q_f32(p, rev);
  2481. }
  2482. // Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
  2483. // elements) from a into memory. mem_addr does not need to be aligned on any
  2484. // particular boundary.
  2485. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps
  2486. FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
  2487. {
  2488. vst1q_f32(p, vreinterpretq_f32_m128(a));
  2489. }
2490. // Store 16 bits of integer data from a at the address p.
  2491. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16
  2492. FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
  2493. {
  2494. vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
  2495. }
2496. // Store 64 bits of integer data from a at the address p.
  2497. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64
  2498. FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
  2499. {
  2500. vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
  2501. }
  2502. // Store 64-bits of integer data from a into memory using a non-temporal memory
  2503. // hint.
  2504. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi
  2505. FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
  2506. {
  2507. vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
  2508. }
  2509. // Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
  2510. // point elements) from a into memory using a non-temporal memory hint.
  2511. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps
  2512. FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
  2513. {
  2514. #if __has_builtin(__builtin_nontemporal_store)
  2515. __builtin_nontemporal_store(a, (float32x4_t *) p);
  2516. #else
  2517. vst1q_f32(p, vreinterpretq_f32_m128(a));
  2518. #endif
  2519. }
  2520. // Subtract packed single-precision (32-bit) floating-point elements in b from
  2521. // packed single-precision (32-bit) floating-point elements in a, and store the
  2522. // results in dst.
  2523. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps
  2524. FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
  2525. {
  2526. return vreinterpretq_m128_f32(
  2527. vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
  2528. }
  2529. // Subtract the lower single-precision (32-bit) floating-point element in b from
  2530. // the lower single-precision (32-bit) floating-point element in a, store the
  2531. // result in the lower element of dst, and copy the upper 3 packed elements from
  2532. // a to the upper elements of dst.
  2533. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss
  2534. FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
  2535. {
  2536. return _mm_move_ss(a, _mm_sub_ps(a, b));
  2537. }
  2538. // Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
  2539. // (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
  2540. // transposed matrix in these vectors (row0 now contains column 0, etc.).
2541. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_TRANSPOSE4_PS
  2542. #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
  2543. do { \
  2544. float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
  2545. float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
  2546. row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
  2547. vget_low_f32(ROW23.val[0])); \
  2548. row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
  2549. vget_low_f32(ROW23.val[1])); \
  2550. row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
  2551. vget_high_f32(ROW23.val[0])); \
  2552. row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
  2553. vget_high_f32(ROW23.val[1])); \
  2554. } while (0)
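// Illustrative usage sketch (compiled out; the example_* helper is not part of
// this header): transposing a 4x4 matrix held in four row registers.
#if 0
static void sse2neon_example_transpose(float out[16])
{
    __m128 row0 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 row1 = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 row2 = _mm_setr_ps(9.0f, 10.0f, 11.0f, 12.0f);
    __m128 row3 = _mm_setr_ps(13.0f, 14.0f, 15.0f, 16.0f);
    _MM_TRANSPOSE4_PS(row0, row1, row2, row3);
    _mm_storeu_ps(&out[0], row0);  // {1, 5, 9, 13}
    _mm_storeu_ps(&out[4], row1);  // {2, 6, 10, 14}
    _mm_storeu_ps(&out[8], row2);  // {3, 7, 11, 15}
    _mm_storeu_ps(&out[12], row3); // {4, 8, 12, 16}
}
#endif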
2555. // According to the documentation, these intrinsics behave the same as the
2556. // non-'u' versions, so we simply alias them here.
  2557. #define _mm_ucomieq_ss _mm_comieq_ss
  2558. #define _mm_ucomige_ss _mm_comige_ss
  2559. #define _mm_ucomigt_ss _mm_comigt_ss
  2560. #define _mm_ucomile_ss _mm_comile_ss
  2561. #define _mm_ucomilt_ss _mm_comilt_ss
  2562. #define _mm_ucomineq_ss _mm_comineq_ss
  2563. // Return vector of type __m128i with undefined elements.
2564. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128
  2565. FORCE_INLINE __m128i _mm_undefined_si128(void)
  2566. {
  2567. #if defined(__GNUC__) || defined(__clang__)
  2568. #pragma GCC diagnostic push
  2569. #pragma GCC diagnostic ignored "-Wuninitialized"
  2570. #endif
  2571. __m128i a;
  2572. #if defined(_MSC_VER)
  2573. a = _mm_setzero_si128();
  2574. #endif
  2575. return a;
  2576. #if defined(__GNUC__) || defined(__clang__)
  2577. #pragma GCC diagnostic pop
  2578. #endif
  2579. }
  2580. // Return vector of type __m128 with undefined elements.
  2581. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps
  2582. FORCE_INLINE __m128 _mm_undefined_ps(void)
  2583. {
  2584. #if defined(__GNUC__) || defined(__clang__)
  2585. #pragma GCC diagnostic push
  2586. #pragma GCC diagnostic ignored "-Wuninitialized"
  2587. #endif
  2588. __m128 a;
  2589. #if defined(_MSC_VER)
  2590. a = _mm_setzero_ps();
  2591. #endif
  2592. return a;
  2593. #if defined(__GNUC__) || defined(__clang__)
  2594. #pragma GCC diagnostic pop
  2595. #endif
  2596. }
  2597. // Unpack and interleave single-precision (32-bit) floating-point elements from
  2598. // the high half a and b, and store the results in dst.
  2599. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps
  2600. FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
  2601. {
  2602. #if defined(__aarch64__) || defined(_M_ARM64)
  2603. return vreinterpretq_m128_f32(
  2604. vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
  2605. #else
  2606. float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
  2607. float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
  2608. float32x2x2_t result = vzip_f32(a1, b1);
  2609. return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
  2610. #endif
  2611. }
  2612. // Unpack and interleave single-precision (32-bit) floating-point elements from
  2613. // the low half of a and b, and store the results in dst.
  2614. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps
  2615. FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
  2616. {
  2617. #if defined(__aarch64__) || defined(_M_ARM64)
  2618. return vreinterpretq_m128_f32(
  2619. vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
  2620. #else
  2621. float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
  2622. float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
  2623. float32x2x2_t result = vzip_f32(a1, b1);
  2624. return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
  2625. #endif
  2626. }
  2627. // Compute the bitwise XOR of packed single-precision (32-bit) floating-point
  2628. // elements in a and b, and store the results in dst.
  2629. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps
  2630. FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
  2631. {
  2632. return vreinterpretq_m128_s32(
  2633. veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
  2634. }
  2635. /* SSE2 */
  2636. // Add packed 16-bit integers in a and b, and store the results in dst.
  2637. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16
  2638. FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
  2639. {
  2640. return vreinterpretq_m128i_s16(
  2641. vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
  2642. }
  2643. // Add packed 32-bit integers in a and b, and store the results in dst.
  2644. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32
  2645. FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
  2646. {
  2647. return vreinterpretq_m128i_s32(
  2648. vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
  2649. }
  2650. // Add packed 64-bit integers in a and b, and store the results in dst.
  2651. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64
  2652. FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
  2653. {
  2654. return vreinterpretq_m128i_s64(
  2655. vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
  2656. }
  2657. // Add packed 8-bit integers in a and b, and store the results in dst.
  2658. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8
  2659. FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
  2660. {
  2661. return vreinterpretq_m128i_s8(
  2662. vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
  2663. }
  2664. // Add packed double-precision (64-bit) floating-point elements in a and b, and
  2665. // store the results in dst.
  2666. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
  2667. FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
  2668. {
  2669. #if defined(__aarch64__) || defined(_M_ARM64)
  2670. return vreinterpretq_m128d_f64(
  2671. vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
  2672. #else
  2673. double *da = (double *) &a;
  2674. double *db = (double *) &b;
  2675. double c[2];
  2676. c[0] = da[0] + db[0];
  2677. c[1] = da[1] + db[1];
  2678. return vld1q_f32((float32_t *) c);
  2679. #endif
  2680. }
  2681. // Add the lower double-precision (64-bit) floating-point element in a and b,
  2682. // store the result in the lower element of dst, and copy the upper element from
  2683. // a to the upper element of dst.
  2684. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
  2685. FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
  2686. {
  2687. #if defined(__aarch64__) || defined(_M_ARM64)
  2688. return _mm_move_sd(a, _mm_add_pd(a, b));
  2689. #else
  2690. double *da = (double *) &a;
  2691. double *db = (double *) &b;
  2692. double c[2];
  2693. c[0] = da[0] + db[0];
  2694. c[1] = da[1];
  2695. return vld1q_f32((float32_t *) c);
  2696. #endif
  2697. }
  2698. // Add 64-bit integers a and b, and store the result in dst.
  2699. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64
  2700. FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
  2701. {
  2702. return vreinterpret_m64_s64(
  2703. vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
  2704. }
  2705. // Add packed signed 16-bit integers in a and b using saturation, and store the
  2706. // results in dst.
  2707. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16
  2708. FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
  2709. {
  2710. return vreinterpretq_m128i_s16(
  2711. vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
  2712. }
  2713. // Add packed signed 8-bit integers in a and b using saturation, and store the
  2714. // results in dst.
  2715. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8
  2716. FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
  2717. {
  2718. return vreinterpretq_m128i_s8(
  2719. vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
  2720. }
  2721. // Add packed unsigned 16-bit integers in a and b using saturation, and store
  2722. // the results in dst.
  2723. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16
  2724. FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
  2725. {
  2726. return vreinterpretq_m128i_u16(
  2727. vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
  2728. }
  2729. // Add packed unsigned 8-bit integers in a and b using saturation, and store the
  2730. // results in dst.
  2731. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8
  2732. FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
  2733. {
  2734. return vreinterpretq_m128i_u8(
  2735. vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
  2736. }
  2737. // Compute the bitwise AND of packed double-precision (64-bit) floating-point
  2738. // elements in a and b, and store the results in dst.
  2739. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd
  2740. FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
  2741. {
  2742. return vreinterpretq_m128d_s64(
  2743. vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
  2744. }
  2745. // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
  2746. // and store the result in dst.
  2747. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128
  2748. FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
  2749. {
  2750. return vreinterpretq_m128i_s32(
  2751. vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
  2752. }
  2753. // Compute the bitwise NOT of packed double-precision (64-bit) floating-point
  2754. // elements in a and then AND with b, and store the results in dst.
  2755. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd
  2756. FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
  2757. {
  2758. // *NOTE* argument swap
  2759. return vreinterpretq_m128d_s64(
  2760. vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
  2761. }
  2762. // Compute the bitwise NOT of 128 bits (representing integer data) in a and then
  2763. // AND with b, and store the result in dst.
  2764. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128
  2765. FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
  2766. {
  2767. return vreinterpretq_m128i_s32(
  2768. vbicq_s32(vreinterpretq_s32_m128i(b),
  2769. vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
  2770. }
  2771. // Average packed unsigned 16-bit integers in a and b, and store the results in
  2772. // dst.
  2773. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16
  2774. FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
  2775. {
  2776. return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
  2777. vreinterpretq_u16_m128i(b));
  2778. }
  2779. // Average packed unsigned 8-bit integers in a and b, and store the results in
  2780. // dst.
  2781. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8
  2782. FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
  2783. {
  2784. return vreinterpretq_m128i_u8(
  2785. vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
  2786. }
  2787. // Shift a left by imm8 bytes while shifting in zeros, and store the results in
  2788. // dst.
  2789. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128
  2790. #define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
  2791. // Shift a right by imm8 bytes while shifting in zeros, and store the results in
  2792. // dst.
  2793. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128
  2794. #define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
  2795. // Cast vector of type __m128d to type __m128. This intrinsic is only used for
  2796. // compilation and does not generate any instructions, thus it has zero latency.
  2797. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps
  2798. FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
  2799. {
  2800. return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
  2801. }
  2802. // Cast vector of type __m128d to type __m128i. This intrinsic is only used for
  2803. // compilation and does not generate any instructions, thus it has zero latency.
  2804. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128
  2805. FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
  2806. {
  2807. return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
  2808. }
  2809. // Cast vector of type __m128 to type __m128d. This intrinsic is only used for
  2810. // compilation and does not generate any instructions, thus it has zero latency.
  2811. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd
  2812. FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
  2813. {
  2814. return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
  2815. }
  2816. // Cast vector of type __m128 to type __m128i. This intrinsic is only used for
  2817. // compilation and does not generate any instructions, thus it has zero latency.
  2818. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128
  2819. FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
  2820. {
  2821. return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
  2822. }
  2823. // Cast vector of type __m128i to type __m128d. This intrinsic is only used for
  2824. // compilation and does not generate any instructions, thus it has zero latency.
  2825. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
  2826. FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
  2827. {
  2828. #if defined(__aarch64__) || defined(_M_ARM64)
  2829. return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
  2830. #else
  2831. return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
  2832. #endif
  2833. }
  2834. // Cast vector of type __m128i to type __m128. This intrinsic is only used for
  2835. // compilation and does not generate any instructions, thus it has zero latency.
  2836. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps
  2837. FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
  2838. {
  2839. return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
  2840. }
  2841. // Invalidate and flush the cache line that contains p from all levels of the
  2842. // cache hierarchy.
  2843. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush
  2844. #if defined(__APPLE__)
  2845. #include <libkern/OSCacheControl.h>
  2846. #endif
  2847. FORCE_INLINE void _mm_clflush(void const *p)
  2848. {
  2849. (void) p;
  2850. /* sys_icache_invalidate is supported since macOS 10.5.
  2851. * However, it does not work on non-jailbroken iOS devices, although the
  2852. * compilation is successful.
  2853. */
  2854. #if defined(__APPLE__)
  2855. sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE);
  2856. #elif defined(__GNUC__) || defined(__clang__)
  2857. uintptr_t ptr = (uintptr_t) p;
  2858. __builtin___clear_cache((char *) ptr,
  2859. (char *) ptr + SSE2NEON_CACHELINE_SIZE);
  2860. #elif (_MSC_VER) && SSE2NEON_INCLUDE_WINDOWS_H
  2861. FlushInstructionCache(GetCurrentProcess(), p, SSE2NEON_CACHELINE_SIZE);
  2862. #endif
  2863. }
  2864. // Compare packed 16-bit integers in a and b for equality, and store the results
  2865. // in dst.
  2866. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16
  2867. FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
  2868. {
  2869. return vreinterpretq_m128i_u16(
  2870. vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
  2871. }
  2872. // Compare packed 32-bit integers in a and b for equality, and store the results
  2873. // in dst.
  2874. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32
  2875. FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
  2876. {
  2877. return vreinterpretq_m128i_u32(
  2878. vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
  2879. }
  2880. // Compare packed 8-bit integers in a and b for equality, and store the results
  2881. // in dst.
  2882. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8
  2883. FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
  2884. {
  2885. return vreinterpretq_m128i_u8(
  2886. vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
  2887. }
  2888. // Compare packed double-precision (64-bit) floating-point elements in a and b
  2889. // for equality, and store the results in dst.
  2890. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
  2891. FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
  2892. {
  2893. #if defined(__aarch64__) || defined(_M_ARM64)
  2894. return vreinterpretq_m128d_u64(
  2895. vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
  2896. #else
  2897. // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
  2898. uint32x4_t cmp =
  2899. vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
  2900. uint32x4_t swapped = vrev64q_u32(cmp);
  2901. return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
  2902. #endif
  2903. }
  2904. // Compare the lower double-precision (64-bit) floating-point elements in a and
  2905. // b for equality, store the result in the lower element of dst, and copy the
  2906. // upper element from a to the upper element of dst.
  2907. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd
  2908. FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
  2909. {
  2910. return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
  2911. }
  2912. // Compare packed double-precision (64-bit) floating-point elements in a and b
  2913. // for greater-than-or-equal, and store the results in dst.
  2914. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
  2915. FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
  2916. {
  2917. #if defined(__aarch64__) || defined(_M_ARM64)
  2918. return vreinterpretq_m128d_u64(
  2919. vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
  2920. #else
  2921. uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
  2922. uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
  2923. uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
  2924. uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
  2925. uint64_t d[2];
  2926. d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
  2927. d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
  2928. return vreinterpretq_m128d_u64(vld1q_u64(d));
  2929. #endif
  2930. }
  2931. // Compare the lower double-precision (64-bit) floating-point elements in a and
  2932. // b for greater-than-or-equal, store the result in the lower element of dst,
  2933. // and copy the upper element from a to the upper element of dst.
  2934. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
  2935. FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
  2936. {
  2937. #if defined(__aarch64__) || defined(_M_ARM64)
  2938. return _mm_move_sd(a, _mm_cmpge_pd(a, b));
  2939. #else
  2940. // expand "_mm_cmpge_pd()" to reduce unnecessary operations
  2941. uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
  2942. uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
  2943. uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
  2944. uint64_t d[2];
  2945. d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
  2946. d[1] = a1;
  2947. return vreinterpretq_m128d_u64(vld1q_u64(d));
  2948. #endif
  2949. }
  2950. // Compare packed signed 16-bit integers in a and b for greater-than, and store
  2951. // the results in dst.
  2952. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16
  2953. FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
  2954. {
  2955. return vreinterpretq_m128i_u16(
  2956. vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
  2957. }
  2958. // Compare packed signed 32-bit integers in a and b for greater-than, and store
  2959. // the results in dst.
  2960. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32
  2961. FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
  2962. {
  2963. return vreinterpretq_m128i_u32(
  2964. vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
  2965. }
  2966. // Compare packed signed 8-bit integers in a and b for greater-than, and store
  2967. // the results in dst.
  2968. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8
  2969. FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
  2970. {
  2971. return vreinterpretq_m128i_u8(
  2972. vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
  2973. }
  2974. // Compare packed double-precision (64-bit) floating-point elements in a and b
  2975. // for greater-than, and store the results in dst.
  2976. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
  2977. FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
  2978. {
  2979. #if defined(__aarch64__) || defined(_M_ARM64)
  2980. return vreinterpretq_m128d_u64(
  2981. vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
  2982. #else
  2983. uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
  2984. uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
  2985. uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
  2986. uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
  2987. uint64_t d[2];
  2988. d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
  2989. d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
  2990. return vreinterpretq_m128d_u64(vld1q_u64(d));
  2991. #endif
  2992. }
  2993. // Compare the lower double-precision (64-bit) floating-point elements in a and
  2994. // b for greater-than, store the result in the lower element of dst, and copy
  2995. // the upper element from a to the upper element of dst.
  2996. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
  2997. FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
  2998. {
  2999. #if defined(__aarch64__) || defined(_M_ARM64)
  3000. return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
  3001. #else
  3002. // expand "_mm_cmpge_pd()" to reduce unnecessary operations
  3003. uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
  3004. uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
  3005. uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
  3006. uint64_t d[2];
  3007. d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
  3008. d[1] = a1;
  3009. return vreinterpretq_m128d_u64(vld1q_u64(d));
  3010. #endif
  3011. }
  3012. // Compare packed double-precision (64-bit) floating-point elements in a and b
  3013. // for less-than-or-equal, and store the results in dst.
  3014. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
  3015. FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
  3016. {
  3017. #if defined(__aarch64__) || defined(_M_ARM64)
  3018. return vreinterpretq_m128d_u64(
  3019. vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
  3020. #else
  3021. uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
  3022. uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
  3023. uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
  3024. uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
  3025. uint64_t d[2];
  3026. d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
  3027. d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
  3028. return vreinterpretq_m128d_u64(vld1q_u64(d));
  3029. #endif
  3030. }
  3031. // Compare the lower double-precision (64-bit) floating-point elements in a and
  3032. // b for less-than-or-equal, store the result in the lower element of dst, and
  3033. // copy the upper element from a to the upper element of dst.
  3034. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
  3035. FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
  3036. {
  3037. #if defined(__aarch64__) || defined(_M_ARM64)
  3038. return _mm_move_sd(a, _mm_cmple_pd(a, b));
  3039. #else
  3040. // expand "_mm_cmpge_pd()" to reduce unnecessary operations
  3041. uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
  3042. uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
  3043. uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
  3044. uint64_t d[2];
  3045. d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
  3046. d[1] = a1;
  3047. return vreinterpretq_m128d_u64(vld1q_u64(d));
  3048. #endif
  3049. }
  3050. // Compare packed signed 16-bit integers in a and b for less-than, and store the
  3051. // results in dst. Note: This intrinsic emits the pcmpgtw instruction with the
  3052. // order of the operands switched.
  3053. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16
  3054. FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
  3055. {
  3056. return vreinterpretq_m128i_u16(
  3057. vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
  3058. }
  3059. // Compare packed signed 32-bit integers in a and b for less-than, and store the
  3060. // results in dst. Note: This intrinsic emits the pcmpgtd instruction with the
  3061. // order of the operands switched.
  3062. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32
  3063. FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
  3064. {
  3065. return vreinterpretq_m128i_u32(
  3066. vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
  3067. }
  3068. // Compare packed signed 8-bit integers in a and b for less-than, and store the
  3069. // results in dst. Note: This intrinsic emits the pcmpgtb instruction with the
  3070. // order of the operands switched.
  3071. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8
  3072. FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
  3073. {
  3074. return vreinterpretq_m128i_u8(
  3075. vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
  3076. }
  3077. // Compare packed double-precision (64-bit) floating-point elements in a and b
  3078. // for less-than, and store the results in dst.
  3079. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
  3080. FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
  3081. {
  3082. #if defined(__aarch64__) || defined(_M_ARM64)
  3083. return vreinterpretq_m128d_u64(
  3084. vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
  3085. #else
  3086. uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
  3087. uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
  3088. uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
  3089. uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
  3090. uint64_t d[2];
  3091. d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
  3092. d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
  3093. return vreinterpretq_m128d_u64(vld1q_u64(d));
  3094. #endif
  3095. }
  3096. // Compare the lower double-precision (64-bit) floating-point elements in a and
  3097. // b for less-than, store the result in the lower element of dst, and copy the
  3098. // upper element from a to the upper element of dst.
  3099. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
  3100. FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
  3101. {
  3102. #if defined(__aarch64__) || defined(_M_ARM64)
  3103. return _mm_move_sd(a, _mm_cmplt_pd(a, b));
  3104. #else
  3105. uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
  3106. uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
  3107. uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
  3108. uint64_t d[2];
  3109. d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
  3110. d[1] = a1;
  3111. return vreinterpretq_m128d_u64(vld1q_u64(d));
  3112. #endif
  3113. }
  3114. // Compare packed double-precision (64-bit) floating-point elements in a and b
  3115. // for not-equal, and store the results in dst.
  3116. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
  3117. FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
  3118. {
  3119. #if defined(__aarch64__) || defined(_M_ARM64)
  3120. return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
  3121. vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
  3122. #else
  3123. // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
  3124. uint32x4_t cmp =
  3125. vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
  3126. uint32x4_t swapped = vrev64q_u32(cmp);
  3127. return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
  3128. #endif
  3129. }
  3130. // Compare the lower double-precision (64-bit) floating-point elements in a and
  3131. // b for not-equal, store the result in the lower element of dst, and copy the
  3132. // upper element from a to the upper element of dst.
  3133. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd
  3134. FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
  3135. {
  3136. return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
  3137. }
  3138. // Compare packed double-precision (64-bit) floating-point elements in a and b
  3139. // for not-greater-than-or-equal, and store the results in dst.
  3140. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
  3141. FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
  3142. {
  3143. #if defined(__aarch64__) || defined(_M_ARM64)
  3144. return vreinterpretq_m128d_u64(veorq_u64(
  3145. vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
  3146. vdupq_n_u64(UINT64_MAX)));
  3147. #else
  3148. uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
  3149. uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
  3150. uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
  3151. uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
  3152. uint64_t d[2];
  3153. d[0] =
  3154. !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
  3155. d[1] =
  3156. !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
  3157. return vreinterpretq_m128d_u64(vld1q_u64(d));
  3158. #endif
  3159. }
  3160. // Compare the lower double-precision (64-bit) floating-point elements in a and
  3161. // b for not-greater-than-or-equal, store the result in the lower element of
  3162. // dst, and copy the upper element from a to the upper element of dst.
  3163. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd
  3164. FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
  3165. {
  3166. return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
  3167. }
  3168. // Compare packed double-precision (64-bit) floating-point elements in a and b
  3169. // for not-greater-than, and store the results in dst.
  3170. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd
  3171. FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
  3172. {
  3173. #if defined(__aarch64__) || defined(_M_ARM64)
  3174. return vreinterpretq_m128d_u64(veorq_u64(
  3175. vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
  3176. vdupq_n_u64(UINT64_MAX)));
  3177. #else
  3178. uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
  3179. uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
  3180. uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
  3181. uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
  3182. uint64_t d[2];
  3183. d[0] =
  3184. !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
  3185. d[1] =
  3186. !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
  3187. return vreinterpretq_m128d_u64(vld1q_u64(d));
  3188. #endif
  3189. }
  3190. // Compare the lower double-precision (64-bit) floating-point elements in a and
  3191. // b for not-greater-than, store the result in the lower element of dst, and
  3192. // copy the upper element from a to the upper element of dst.
  3193. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd
  3194. FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
  3195. {
  3196. return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
  3197. }
  3198. // Compare packed double-precision (64-bit) floating-point elements in a and b
  3199. // for not-less-than-or-equal, and store the results in dst.
  3200. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
  3201. FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
  3202. {
  3203. #if defined(__aarch64__) || defined(_M_ARM64)
  3204. return vreinterpretq_m128d_u64(veorq_u64(
  3205. vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
  3206. vdupq_n_u64(UINT64_MAX)));
  3207. #else
  3208. uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
  3209. uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
  3210. uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
  3211. uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
  3212. uint64_t d[2];
  3213. d[0] =
  3214. !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
  3215. d[1] =
  3216. !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
  3217. return vreinterpretq_m128d_u64(vld1q_u64(d));
  3218. #endif
  3219. }
  3220. // Compare the lower double-precision (64-bit) floating-point elements in a and
  3221. // b for not-less-than-or-equal, store the result in the lower element of dst,
  3222. // and copy the upper element from a to the upper element of dst.
  3223. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd
  3224. FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
  3225. {
  3226. return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
  3227. }
  3228. // Compare packed double-precision (64-bit) floating-point elements in a and b
  3229. // for not-less-than, and store the results in dst.
  3230. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
  3231. FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
  3232. {
  3233. #if defined(__aarch64__) || defined(_M_ARM64)
  3234. return vreinterpretq_m128d_u64(veorq_u64(
  3235. vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
  3236. vdupq_n_u64(UINT64_MAX)));
  3237. #else
  3238. uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
  3239. uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
  3240. uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
  3241. uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
  3242. uint64_t d[2];
  3243. d[0] =
  3244. !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
  3245. d[1] =
  3246. !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
  3247. return vreinterpretq_m128d_u64(vld1q_u64(d));
  3248. #endif
  3249. }
  3250. // Compare the lower double-precision (64-bit) floating-point elements in a and
  3251. // b for not-less-than, store the result in the lower element of dst, and copy
  3252. // the upper element from a to the upper element of dst.
  3253. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd
  3254. FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
  3255. {
  3256. return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
  3257. }
  3258. // Compare packed double-precision (64-bit) floating-point elements in a and b
  3259. // to see if neither is NaN, and store the results in dst.
  3260. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
  3261. FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
  3262. {
  3263. #if defined(__aarch64__) || defined(_M_ARM64)
  3264. // Excluding NaNs, any two floating point numbers can be compared.
  3265. uint64x2_t not_nan_a =
  3266. vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
  3267. uint64x2_t not_nan_b =
  3268. vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
  3269. return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
  3270. #else
  3271. uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
  3272. uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
  3273. uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
  3274. uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
  3275. uint64_t d[2];
  3276. d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
  3277. (*(double *) &b0) == (*(double *) &b0))
  3278. ? ~UINT64_C(0)
  3279. : UINT64_C(0);
  3280. d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
  3281. (*(double *) &b1) == (*(double *) &b1))
  3282. ? ~UINT64_C(0)
  3283. : UINT64_C(0);
  3284. return vreinterpretq_m128d_u64(vld1q_u64(d));
  3285. #endif
  3286. }
  3287. // Compare the lower double-precision (64-bit) floating-point elements in a and
  3288. // b to see if neither is NaN, store the result in the lower element of dst, and
  3289. // copy the upper element from a to the upper element of dst.
  3290. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
  3291. FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
  3292. {
  3293. #if defined(__aarch64__) || defined(_M_ARM64)
  3294. return _mm_move_sd(a, _mm_cmpord_pd(a, b));
  3295. #else
  3296. uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
  3297. uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
  3298. uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
  3299. uint64_t d[2];
  3300. d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
  3301. (*(double *) &b0) == (*(double *) &b0))
  3302. ? ~UINT64_C(0)
  3303. : UINT64_C(0);
  3304. d[1] = a1;
  3305. return vreinterpretq_m128d_u64(vld1q_u64(d));
  3306. #endif
  3307. }
  3308. // Compare packed double-precision (64-bit) floating-point elements in a and b
  3309. // to see if either is NaN, and store the results in dst.
  3310. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
  3311. FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
  3312. {
  3313. #if defined(__aarch64__) || defined(_M_ARM64)
  3314. // Two NaNs are not equal in comparison operation.
  3315. uint64x2_t not_nan_a =
  3316. vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
  3317. uint64x2_t not_nan_b =
  3318. vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
  3319. return vreinterpretq_m128d_s32(
  3320. vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
  3321. #else
  3322. uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
  3323. uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
  3324. uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
  3325. uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
  3326. uint64_t d[2];
  3327. d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
  3328. (*(double *) &b0) == (*(double *) &b0))
  3329. ? UINT64_C(0)
  3330. : ~UINT64_C(0);
  3331. d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
  3332. (*(double *) &b1) == (*(double *) &b1))
  3333. ? UINT64_C(0)
  3334. : ~UINT64_C(0);
  3335. return vreinterpretq_m128d_u64(vld1q_u64(d));
  3336. #endif
  3337. }
  3338. // Compare the lower double-precision (64-bit) floating-point elements in a and
  3339. // b to see if either is NaN, store the result in the lower element of dst, and
  3340. // copy the upper element from a to the upper element of dst.
  3341. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
  3342. FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
  3343. {
  3344. #if defined(__aarch64__) || defined(_M_ARM64)
  3345. return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
  3346. #else
  3347. uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
  3348. uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
  3349. uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
  3350. uint64_t d[2];
  3351. d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
  3352. (*(double *) &b0) == (*(double *) &b0))
  3353. ? UINT64_C(0)
  3354. : ~UINT64_C(0);
  3355. d[1] = a1;
  3356. return vreinterpretq_m128d_u64(vld1q_u64(d));
  3357. #endif
  3358. }
  3359. // Compare the lower double-precision (64-bit) floating-point element in a and b
  3360. // for greater-than-or-equal, and return the boolean result (0 or 1).
  3361. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
  3362. FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
  3363. {
  3364. #if defined(__aarch64__) || defined(_M_ARM64)
  3365. return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
  3366. #else
  3367. uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
  3368. uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
  3369. return (*(double *) &a0 >= *(double *) &b0);
  3370. #endif
  3371. }
  3372. // Compare the lower double-precision (64-bit) floating-point element in a and b
  3373. // for greater-than, and return the boolean result (0 or 1).
  3374. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
  3375. FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
  3376. {
  3377. #if defined(__aarch64__) || defined(_M_ARM64)
  3378. return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
  3379. #else
  3380. uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
  3381. uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
  3382. return (*(double *) &a0 > *(double *) &b0);
  3383. #endif
  3384. }
  3385. // Compare the lower double-precision (64-bit) floating-point element in a and b
  3386. // for less-than-or-equal, and return the boolean result (0 or 1).
  3387. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
  3388. FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
  3389. {
  3390. #if defined(__aarch64__) || defined(_M_ARM64)
  3391. return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
  3392. #else
  3393. uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
  3394. uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
  3395. return (*(double *) &a0 <= *(double *) &b0);
  3396. #endif
  3397. }
  3398. // Compare the lower double-precision (64-bit) floating-point element in a and b
  3399. // for less-than, and return the boolean result (0 or 1).
  3400. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
  3401. FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
  3402. {
  3403. #if defined(__aarch64__) || defined(_M_ARM64)
  3404. return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
  3405. #else
  3406. uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
  3407. uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
  3408. return (*(double *) &a0 < *(double *) &b0);
  3409. #endif
  3410. }
  3411. // Compare the lower double-precision (64-bit) floating-point element in a and b
  3412. // for equality, and return the boolean result (0 or 1).
  3413. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
  3414. FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
  3415. {
  3416. #if defined(__aarch64__) || defined(_M_ARM64)
  3417. return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
  3418. #else
  3419. uint32x4_t a_not_nan =
  3420. vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
  3421. uint32x4_t b_not_nan =
  3422. vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
  3423. uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
  3424. uint32x4_t a_eq_b =
  3425. vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
  3426. uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
  3427. vreinterpretq_u64_u32(a_eq_b));
  3428. return vgetq_lane_u64(and_results, 0) & 0x1;
  3429. #endif
  3430. }
  3431. // Compare the lower double-precision (64-bit) floating-point element in a and b
  3432. // for not-equal, and return the boolean result (0 or 1).
  3433. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd
  3434. FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
  3435. {
  3436. return !_mm_comieq_sd(a, b);
  3437. }
  3438. // Convert packed signed 32-bit integers in a to packed double-precision
  3439. // (64-bit) floating-point elements, and store the results in dst.
  3440. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
  3441. FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
  3442. {
  3443. #if defined(__aarch64__) || defined(_M_ARM64)
  3444. return vreinterpretq_m128d_f64(
  3445. vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
  3446. #else
  3447. double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
  3448. double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
  3449. return _mm_set_pd(a1, a0);
  3450. #endif
  3451. }
  3452. // Convert packed signed 32-bit integers in a to packed single-precision
  3453. // (32-bit) floating-point elements, and store the results in dst.
  3454. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps
  3455. FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
  3456. {
  3457. return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
  3458. }
  3459. // Convert packed double-precision (64-bit) floating-point elements in a to
  3460. // packed 32-bit integers, and store the results in dst.
  3461. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
  3462. FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
  3463. {
  3464. // vrnd32xq_f64 not supported on clang
  3465. #if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
  3466. float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a));
  3467. int64x2_t integers = vcvtq_s64_f64(rounded);
  3468. return vreinterpretq_m128i_s32(
  3469. vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
  3470. #else
  3471. __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
  3472. double d0 = ((double *) &rnd)[0];
  3473. double d1 = ((double *) &rnd)[1];
  3474. return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
  3475. #endif
  3476. }
  3477. // Convert packed double-precision (64-bit) floating-point elements in a to
  3478. // packed 32-bit integers, and store the results in dst.
  3479. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
  3480. FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
  3481. {
  3482. __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
  3483. double d0 = ((double *) &rnd)[0];
  3484. double d1 = ((double *) &rnd)[1];
  3485. int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
  3486. return vreinterpret_m64_s32(vld1_s32(data));
  3487. }
  3488. // Convert packed double-precision (64-bit) floating-point elements in a to
  3489. // packed single-precision (32-bit) floating-point elements, and store the
  3490. // results in dst.
  3491. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
  3492. FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
  3493. {
  3494. #if defined(__aarch64__) || defined(_M_ARM64)
  3495. float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
  3496. return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
  3497. #else
  3498. float a0 = (float) ((double *) &a)[0];
  3499. float a1 = (float) ((double *) &a)[1];
  3500. return _mm_set_ps(0, 0, a1, a0);
  3501. #endif
  3502. }
  3503. // Convert packed signed 32-bit integers in a to packed double-precision
  3504. // (64-bit) floating-point elements, and store the results in dst.
  3505. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
  3506. FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
  3507. {
  3508. #if defined(__aarch64__) || defined(_M_ARM64)
  3509. return vreinterpretq_m128d_f64(
  3510. vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
  3511. #else
  3512. double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
  3513. double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
  3514. return _mm_set_pd(a1, a0);
  3515. #endif
  3516. }
  3517. // Convert packed single-precision (32-bit) floating-point elements in a to
  3518. // packed 32-bit integers, and store the results in dst.
  3519. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32
  3520. // *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
  3521. // does not support! It is supported on ARMv8-A however.
  3522. FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
  3523. {
  3524. #if defined(__ARM_FEATURE_FRINT)
  3525. return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));
  3526. #elif (defined(__aarch64__) || defined(_M_ARM64)) || \
  3527. defined(__ARM_FEATURE_DIRECTED_ROUNDING)
  3528. switch (_MM_GET_ROUNDING_MODE()) {
  3529. case _MM_ROUND_NEAREST:
  3530. return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
  3531. case _MM_ROUND_DOWN:
  3532. return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
  3533. case _MM_ROUND_UP:
  3534. return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
  3535. default: // _MM_ROUND_TOWARD_ZERO
  3536. return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
  3537. }
  3538. #else
  3539. float *f = (float *) &a;
  3540. switch (_MM_GET_ROUNDING_MODE()) {
  3541. case _MM_ROUND_NEAREST: {
  3542. uint32x4_t signmask = vdupq_n_u32(0x80000000);
  3543. float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
  3544. vdupq_n_f32(0.5f)); /* +/- 0.5 */
  3545. int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
  3546. vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
  3547. int32x4_t r_trunc = vcvtq_s32_f32(
  3548. vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
  3549. int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
  3550. vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
  3551. int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
  3552. vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
  3553. float32x4_t delta = vsubq_f32(
  3554. vreinterpretq_f32_m128(a),
  3555. vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
  3556. uint32x4_t is_delta_half =
  3557. vceqq_f32(delta, half); /* delta == +/- 0.5 */
  3558. return vreinterpretq_m128i_s32(
  3559. vbslq_s32(is_delta_half, r_even, r_normal));
  3560. }
  3561. case _MM_ROUND_DOWN:
  3562. return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
  3563. floorf(f[0]));
  3564. case _MM_ROUND_UP:
  3565. return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
  3566. ceilf(f[0]));
  3567. default: // _MM_ROUND_TOWARD_ZERO
  3568. return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
  3569. (int32_t) f[0]);
  3570. }
  3571. #endif
  3572. }
  3573. // Convert packed single-precision (32-bit) floating-point elements in a to
  3574. // packed double-precision (64-bit) floating-point elements, and store the
  3575. // results in dst.
  3576. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
  3577. FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
  3578. {
  3579. #if defined(__aarch64__) || defined(_M_ARM64)
  3580. return vreinterpretq_m128d_f64(
  3581. vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
  3582. #else
  3583. double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
  3584. double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
  3585. return _mm_set_pd(a1, a0);
  3586. #endif
  3587. }
  3588. // Copy the lower double-precision (64-bit) floating-point element of a to dst.
  3589. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
  3590. FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
  3591. {
  3592. #if defined(__aarch64__) || defined(_M_ARM64)
  3593. return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
  3594. #else
  3595. return ((double *) &a)[0];
  3596. #endif
  3597. }
  3598. // Convert the lower double-precision (64-bit) floating-point element in a to a
  3599. // 32-bit integer, and store the result in dst.
  3600. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
  3601. FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
  3602. {
  3603. #if defined(__aarch64__) || defined(_M_ARM64)
  3604. return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
  3605. #else
  3606. __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
  3607. double ret = ((double *) &rnd)[0];
  3608. return (int32_t) ret;
  3609. #endif
  3610. }
  3611. // Convert the lower double-precision (64-bit) floating-point element in a to a
  3612. // 64-bit integer, and store the result in dst.
  3613. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
  3614. FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
  3615. {
  3616. #if defined(__aarch64__) || defined(_M_ARM64)
  3617. return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
  3618. #else
  3619. __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
  3620. double ret = ((double *) &rnd)[0];
  3621. return (int64_t) ret;
  3622. #endif
  3623. }
  3624. // Convert the lower double-precision (64-bit) floating-point element in a to a
  3625. // 64-bit integer, and store the result in dst.
  3626. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x
  3627. #define _mm_cvtsd_si64x _mm_cvtsd_si64
  3628. // Convert the lower double-precision (64-bit) floating-point element in b to a
  3629. // single-precision (32-bit) floating-point element, store the result in the
  3630. // lower element of dst, and copy the upper 3 packed elements from a to the
  3631. // upper elements of dst.
  3632. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
  3633. FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
  3634. {
  3635. #if defined(__aarch64__) || defined(_M_ARM64)
  3636. return vreinterpretq_m128_f32(vsetq_lane_f32(
  3637. vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
  3638. vreinterpretq_f32_m128(a), 0));
  3639. #else
  3640. return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
  3641. vreinterpretq_f32_m128(a), 0));
  3642. #endif
  3643. }
  3644. // Copy the lower 32-bit integer in a to dst.
  3645. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32
  3646. FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
  3647. {
  3648. return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
  3649. }
  3650. // Copy the lower 64-bit integer in a to dst.
  3651. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64
  3652. FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
  3653. {
  3654. return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
  3655. }
  3656. // Copy the lower 64-bit integer in a to dst.
  3657. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
  3658. #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
  3659. // Convert the signed 32-bit integer b to a double-precision (64-bit)
  3660. // floating-point element, store the result in the lower element of dst, and
  3661. // copy the upper element from a to the upper element of dst.
  3662. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
  3663. FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
  3664. {
  3665. #if defined(__aarch64__) || defined(_M_ARM64)
  3666. return vreinterpretq_m128d_f64(
  3667. vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
  3668. #else
  3669. double bf = (double) b;
  3670. return vreinterpretq_m128d_s64(
  3671. vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
  3672. #endif
  3673. }
  3674. // Copy the lower 64-bit integer in a to dst.
  3675. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
  3676. #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
  3677. // Copy 32-bit integer a to the lower elements of dst, and zero the upper
  3678. // elements of dst.
  3679. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128
  3680. FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
  3681. {
  3682. return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
  3683. }
  3684. // Convert the signed 64-bit integer b to a double-precision (64-bit)
  3685. // floating-point element, store the result in the lower element of dst, and
  3686. // copy the upper element from a to the upper element of dst.
  3687. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
  3688. FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
  3689. {
  3690. #if defined(__aarch64__) || defined(_M_ARM64)
  3691. return vreinterpretq_m128d_f64(
  3692. vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
  3693. #else
  3694. double bf = (double) b;
  3695. return vreinterpretq_m128d_s64(
  3696. vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
  3697. #endif
  3698. }
  3699. // Copy 64-bit integer a to the lower element of dst, and zero the upper
  3700. // element.
  3701. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128
  3702. FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
  3703. {
  3704. return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
  3705. }
  3706. // Copy 64-bit integer a to the lower element of dst, and zero the upper
  3707. // element.
  3708. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
  3709. #define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
  3710. // Convert the signed 64-bit integer b to a double-precision (64-bit)
  3711. // floating-point element, store the result in the lower element of dst, and
  3712. // copy the upper element from a to the upper element of dst.
  3713. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd
  3714. #define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
  3715. // Convert the lower single-precision (32-bit) floating-point element in b to a
  3716. // double-precision (64-bit) floating-point element, store the result in the
  3717. // lower element of dst, and copy the upper element from a to the upper element
  3718. // of dst.
  3719. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
  3720. FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
  3721. {
  3722. double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
  3723. #if defined(__aarch64__) || defined(_M_ARM64)
  3724. return vreinterpretq_m128d_f64(
  3725. vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
  3726. #else
  3727. return vreinterpretq_m128d_s64(
  3728. vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
  3729. #endif
  3730. }
  3731. // Convert packed double-precision (64-bit) floating-point elements in a to
  3732. // packed 32-bit integers with truncation, and store the results in dst.
  3733. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
  3734. FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
  3735. {
  3736. double a0 = ((double *) &a)[0];
  3737. double a1 = ((double *) &a)[1];
  3738. return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
  3739. }
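// Illustrative example (not part of this header): truncation keeps the integer
// part and drops the fraction, unlike a round-to-nearest conversion.
//
//   __m128d v = _mm_set_pd(-1.7, 2.9);   // elements = { 2.9, -1.7 }
//   __m128i t = _mm_cvttpd_epi32(v);     // 32-bit lanes = { 2, -1, 0, 0 }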
  3740. // Convert packed double-precision (64-bit) floating-point elements in a to
  3741. // packed 32-bit integers with truncation, and store the results in dst.
  3742. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
  3743. FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
  3744. {
  3745. double a0 = ((double *) &a)[0];
  3746. double a1 = ((double *) &a)[1];
  3747. int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
  3748. return vreinterpret_m64_s32(vld1_s32(data));
  3749. }
  3750. // Convert packed single-precision (32-bit) floating-point elements in a to
  3751. // packed 32-bit integers with truncation, and store the results in dst.
  3752. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32
  3753. FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
  3754. {
  3755. return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
  3756. }
  3757. // Convert the lower double-precision (64-bit) floating-point element in a to a
  3758. // 32-bit integer with truncation, and store the result in dst.
  3759. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
  3760. FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
  3761. {
  3762. double ret = *((double *) &a);
  3763. return (int32_t) ret;
  3764. }
  3765. // Convert the lower double-precision (64-bit) floating-point element in a to a
  3766. // 64-bit integer with truncation, and store the result in dst.
  3767. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
  3768. FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
  3769. {
  3770. #if defined(__aarch64__) || defined(_M_ARM64)
  3771. return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
  3772. #else
  3773. double ret = *((double *) &a);
  3774. return (int64_t) ret;
  3775. #endif
  3776. }
  3777. // Convert the lower double-precision (64-bit) floating-point element in a to a
  3778. // 64-bit integer with truncation, and store the result in dst.
  3779. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x
  3780. #define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
  3781. // Divide packed double-precision (64-bit) floating-point elements in a by
  3782. // packed elements in b, and store the results in dst.
  3783. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
  3784. FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
  3785. {
  3786. #if defined(__aarch64__) || defined(_M_ARM64)
  3787. return vreinterpretq_m128d_f64(
  3788. vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
  3789. #else
  3790. double *da = (double *) &a;
  3791. double *db = (double *) &b;
  3792. double c[2];
  3793. c[0] = da[0] / db[0];
  3794. c[1] = da[1] / db[1];
  3795. return vld1q_f32((float32_t *) c);
  3796. #endif
  3797. }
  3798. // Divide the lower double-precision (64-bit) floating-point element in a by the
  3799. // lower double-precision (64-bit) floating-point element in b, store the result
  3800. // in the lower element of dst, and copy the upper element from a to the upper
  3801. // element of dst.
  3802. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
  3803. FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
  3804. {
  3805. #if defined(__aarch64__) || defined(_M_ARM64)
  3806. float64x2_t tmp =
  3807. vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
  3808. return vreinterpretq_m128d_f64(
  3809. vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
  3810. #else
  3811. return _mm_move_sd(a, _mm_div_pd(a, b));
  3812. #endif
  3813. }
  3814. // Extract a 16-bit integer from a, selected with imm8, and store the result in
  3815. // the lower element of dst.
  3816. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16
  3817. // FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
  3818. #define _mm_extract_epi16(a, imm) \
  3819. vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
  3820. // Copy a to dst, and insert the 16-bit integer i into dst at the location
  3821. // specified by imm8.
  3822. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16
  3823. // FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
  3824. // __constrange(0,8) int imm)
  3825. #define _mm_insert_epi16(a, b, imm) \
  3826. vreinterpretq_m128i_s16( \
  3827. vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm)))
  3828. // Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
  3829. // elements) from memory into dst. mem_addr must be aligned on a 16-byte
  3830. // boundary or a general-protection exception may be generated.
  3831. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
  3832. FORCE_INLINE __m128d _mm_load_pd(const double *p)
  3833. {
  3834. #if defined(__aarch64__) || defined(_M_ARM64)
  3835. return vreinterpretq_m128d_f64(vld1q_f64(p));
  3836. #else
  3837. const float *fp = (const float *) p;
  3838. float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
  3839. return vreinterpretq_m128d_f32(vld1q_f32(data));
  3840. #endif
  3841. }
  3842. // Load a double-precision (64-bit) floating-point element from memory into both
  3843. // elements of dst.
  3844. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1
  3845. #define _mm_load_pd1 _mm_load1_pd
  3846. // Load a double-precision (64-bit) floating-point element from memory into the
  3847. // lower of dst, and zero the upper element. mem_addr does not need to be
  3848. // aligned on any particular boundary.
  3849. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
  3850. FORCE_INLINE __m128d _mm_load_sd(const double *p)
  3851. {
  3852. #if defined(__aarch64__) || defined(_M_ARM64)
  3853. return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
  3854. #else
  3855. const float *fp = (const float *) p;
  3856. float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
  3857. return vreinterpretq_m128d_f32(vld1q_f32(data));
  3858. #endif
  3859. }
  3860. // Load 128-bits of integer data from memory into dst. mem_addr must be aligned
  3861. // on a 16-byte boundary or a general-protection exception may be generated.
  3862. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128
  3863. FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
  3864. {
  3865. return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
  3866. }
  3867. // Load a double-precision (64-bit) floating-point element from memory into both
  3868. // elements of dst.
  3869. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
  3870. FORCE_INLINE __m128d _mm_load1_pd(const double *p)
  3871. {
  3872. #if defined(__aarch64__) || defined(_M_ARM64)
  3873. return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
  3874. #else
  3875. return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
  3876. #endif
  3877. }
  3878. // Load a double-precision (64-bit) floating-point element from memory into the
  3879. // upper element of dst, and copy the lower element from a to dst. mem_addr does
  3880. // not need to be aligned on any particular boundary.
  3881. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
  3882. FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
  3883. {
  3884. #if defined(__aarch64__) || defined(_M_ARM64)
  3885. return vreinterpretq_m128d_f64(
  3886. vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
  3887. #else
  3888. return vreinterpretq_m128d_f32(vcombine_f32(
  3889. vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
  3890. #endif
  3891. }
  3892. // Load 64-bit integer from memory into the first element of dst.
  3893. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64
  3894. FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
  3895. {
  3896. /* Load the lower 64 bits of the value pointed to by p into the
  3897. * lower 64 bits of the result, zeroing the upper 64 bits of the result.
  3898. */
  3899. return vreinterpretq_m128i_s32(
  3900. vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
  3901. }
  3902. // Load a double-precision (64-bit) floating-point element from memory into the
  3903. // lower element of dst, and copy the upper element from a to dst. mem_addr does
  3904. // not need to be aligned on any particular boundary.
  3905. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
  3906. FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
  3907. {
  3908. #if defined(__aarch64__) || defined(_M_ARM64)
  3909. return vreinterpretq_m128d_f64(
  3910. vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
  3911. #else
  3912. return vreinterpretq_m128d_f32(
  3913. vcombine_f32(vld1_f32((const float *) p),
  3914. vget_high_f32(vreinterpretq_f32_m128d(a))));
  3915. #endif
  3916. }
  3917. // Load 2 double-precision (64-bit) floating-point elements from memory into dst
  3918. // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
  3919. // general-protection exception may be generated.
  3920. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
  3921. FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
  3922. {
  3923. #if defined(__aarch64__) || defined(_M_ARM64)
  3924. float64x2_t v = vld1q_f64(p);
  3925. return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
  3926. #else
  3927. int64x2_t v = vld1q_s64((const int64_t *) p);
  3928. return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
  3929. #endif
  3930. }
// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
// elements) from memory into dst. mem_addr does not need to be aligned on any
// particular boundary.
  3932. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
  3933. FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
  3934. {
  3935. return _mm_load_pd(p);
  3936. }
  3937. // Load 128-bits of integer data from memory into dst. mem_addr does not need to
  3938. // be aligned on any particular boundary.
  3939. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
  3940. FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
  3941. {
  3942. return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
  3943. }
  3944. // Load unaligned 32-bit integer from memory into the first element of dst.
  3945. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
  3946. FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
  3947. {
  3948. return vreinterpretq_m128i_s32(
  3949. vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
  3950. }
  3951. // Multiply packed signed 16-bit integers in a and b, producing intermediate
  3952. // signed 32-bit integers. Horizontally add adjacent pairs of intermediate
  3953. // 32-bit integers, and pack the results in dst.
  3954. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16
  3955. FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
  3956. {
  3957. int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
  3958. vget_low_s16(vreinterpretq_s16_m128i(b)));
  3959. #if defined(__aarch64__) || defined(_M_ARM64)
  3960. int32x4_t high =
  3961. vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
  3962. return vreinterpretq_m128i_s32(vpaddq_s32(low, high));
  3963. #else
  3964. int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
  3965. vget_high_s16(vreinterpretq_s16_m128i(b)));
  3966. int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
  3967. int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
  3968. return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
  3969. #endif
  3970. }
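// Usage sketch (illustrative only): _mm_madd_epi16 is the usual building block
// for 16-bit dot products; `va` and `vb` are hypothetical inputs.
//
//   __m128i va = _mm_set1_epi16(3);
//   __m128i vb = _mm_set1_epi16(2);
//   __m128i sums = _mm_madd_epi16(va, vb);  // every 32-bit lane = 3*2 + 3*2 = 12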
  3971. // Conditionally store 8-bit integer elements from a into memory using mask
  3972. // (elements are not stored when the highest bit is not set in the corresponding
  3973. // element) and a non-temporal memory hint. mem_addr does not need to be aligned
  3974. // on any particular boundary.
  3975. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128
  3976. FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
  3977. {
  3978. int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
  3979. __m128 b = _mm_load_ps((const float *) mem_addr);
  3980. int8x16_t masked =
  3981. vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
  3982. vreinterpretq_s8_m128(b));
  3983. vst1q_s8((int8_t *) mem_addr, masked);
  3984. }
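// Illustrative usage (hypothetical 16-byte buffer `buf`): only bytes whose mask
// element has its most significant bit set are written; the rest of the
// destination is left untouched.
//
//   char buf[16] = {0};
//   __m128i data = _mm_set1_epi8(0x55);
//   __m128i mask = _mm_set_epi8(-1, 0, -1, 0, -1, 0, -1, 0,
//                               -1, 0, -1, 0, -1, 0, -1, 0);
//   _mm_maskmoveu_si128(data, mask, buf);  // every other byte becomes 0x55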
  3985. // Compare packed signed 16-bit integers in a and b, and store packed maximum
  3986. // values in dst.
  3987. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16
  3988. FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
  3989. {
  3990. return vreinterpretq_m128i_s16(
  3991. vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
  3992. }
  3993. // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
  3994. // values in dst.
  3995. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8
  3996. FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
  3997. {
  3998. return vreinterpretq_m128i_u8(
  3999. vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
  4000. }
  4001. // Compare packed double-precision (64-bit) floating-point elements in a and b,
  4002. // and store packed maximum values in dst.
  4003. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
  4004. FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
  4005. {
  4006. #if defined(__aarch64__) || defined(_M_ARM64)
  4007. #if SSE2NEON_PRECISE_MINMAX
  4008. float64x2_t _a = vreinterpretq_f64_m128d(a);
  4009. float64x2_t _b = vreinterpretq_f64_m128d(b);
  4010. return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
  4011. #else
  4012. return vreinterpretq_m128d_f64(
  4013. vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
  4014. #endif
  4015. #else
  4016. uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
  4017. uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
  4018. uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
  4019. uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
  4020. uint64_t d[2];
  4021. d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
  4022. d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
  4023. return vreinterpretq_m128d_u64(vld1q_u64(d));
  4024. #endif
  4025. }
  4026. // Compare the lower double-precision (64-bit) floating-point elements in a and
  4027. // b, store the maximum value in the lower element of dst, and copy the upper
  4028. // element from a to the upper element of dst.
  4029. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
  4030. FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
  4031. {
  4032. #if defined(__aarch64__) || defined(_M_ARM64)
  4033. return _mm_move_sd(a, _mm_max_pd(a, b));
  4034. #else
  4035. double *da = (double *) &a;
  4036. double *db = (double *) &b;
  4037. double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
  4038. return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
  4039. #endif
  4040. }
  4041. // Compare packed signed 16-bit integers in a and b, and store packed minimum
  4042. // values in dst.
  4043. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16
  4044. FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
  4045. {
  4046. return vreinterpretq_m128i_s16(
  4047. vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
  4048. }
  4049. // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
  4050. // values in dst.
  4051. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8
  4052. FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
  4053. {
  4054. return vreinterpretq_m128i_u8(
  4055. vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
  4056. }
  4057. // Compare packed double-precision (64-bit) floating-point elements in a and b,
  4058. // and store packed minimum values in dst.
  4059. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
  4060. FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
  4061. {
  4062. #if defined(__aarch64__) || defined(_M_ARM64)
  4063. #if SSE2NEON_PRECISE_MINMAX
  4064. float64x2_t _a = vreinterpretq_f64_m128d(a);
  4065. float64x2_t _b = vreinterpretq_f64_m128d(b);
  4066. return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
  4067. #else
  4068. return vreinterpretq_m128d_f64(
  4069. vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
  4070. #endif
  4071. #else
  4072. uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
  4073. uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
  4074. uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
  4075. uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
  4076. uint64_t d[2];
  4077. d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
  4078. d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
  4079. return vreinterpretq_m128d_u64(vld1q_u64(d));
  4080. #endif
  4081. }
  4082. // Compare the lower double-precision (64-bit) floating-point elements in a and
  4083. // b, store the minimum value in the lower element of dst, and copy the upper
  4084. // element from a to the upper element of dst.
  4085. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
  4086. FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
  4087. {
  4088. #if defined(__aarch64__) || defined(_M_ARM64)
  4089. return _mm_move_sd(a, _mm_min_pd(a, b));
  4090. #else
  4091. double *da = (double *) &a;
  4092. double *db = (double *) &b;
  4093. double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
  4094. return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
  4095. #endif
  4096. }
  4097. // Copy the lower 64-bit integer in a to the lower element of dst, and zero the
  4098. // upper element.
  4099. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64
  4100. FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
  4101. {
  4102. return vreinterpretq_m128i_s64(
  4103. vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
  4104. }
  4105. // Move the lower double-precision (64-bit) floating-point element from b to the
  4106. // lower element of dst, and copy the upper element from a to the upper element
  4107. // of dst.
  4108. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd
  4109. FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
  4110. {
  4111. return vreinterpretq_m128d_f32(
  4112. vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
  4113. vget_high_f32(vreinterpretq_f32_m128d(a))));
  4114. }
  4115. // Create mask from the most significant bit of each 8-bit element in a, and
  4116. // store the result in dst.
  4117. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8
  4118. FORCE_INLINE int _mm_movemask_epi8(__m128i a)
  4119. {
  4120. // Use increasingly wide shifts+adds to collect the sign bits
  4121. // together.
  4122. // Since the widening shifts would be rather confusing to follow in little
  4123. // endian, everything will be illustrated in big endian order instead. This
  4124. // has a different result - the bits would actually be reversed on a big
  4125. // endian machine.
  4126. // Starting input (only half the elements are shown):
  4127. // 89 ff 1d c0 00 10 99 33
  4128. uint8x16_t input = vreinterpretq_u8_m128i(a);
  4129. // Shift out everything but the sign bits with an unsigned shift right.
  4130. //
// Bytes of the vector:
  4132. // 89 ff 1d c0 00 10 99 33
  4133. // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)
  4134. // | | | | | | | |
  4135. // 01 01 00 01 00 00 01 00
  4136. //
  4137. // Bits of first important lane(s):
  4138. // 10001001 (89)
  4139. // \______
  4140. // |
  4141. // 00000001 (01)
  4142. uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
  4143. // Merge the even lanes together with a 16-bit unsigned shift right + add.
  4144. // 'xx' represents garbage data which will be ignored in the final result.
  4145. // In the important bytes, the add functions like a binary OR.
  4146. //
  4147. // 01 01 00 01 00 00 01 00
  4148. // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7))
  4149. // \| \| \| \|
  4150. // xx 03 xx 01 xx 00 xx 02
  4151. //
  4152. // 00000001 00000001 (01 01)
  4153. // \_______ |
  4154. // \|
  4155. // xxxxxxxx xxxxxx11 (xx 03)
  4156. uint32x4_t paired16 =
  4157. vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
  4158. // Repeat with a wider 32-bit shift + add.
  4159. // xx 03 xx 01 xx 00 xx 02
  4160. // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >>
  4161. // 14))
  4162. // \| \|
  4163. // xx xx xx 0d xx xx xx 02
  4164. //
  4165. // 00000011 00000001 (03 01)
  4166. // \\_____ ||
  4167. // '----.\||
  4168. // xxxxxxxx xxxx1101 (xx 0d)
  4169. uint64x2_t paired32 =
  4170. vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
  4171. // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
  4172. // lanes. xx xx xx 0d xx xx xx 02
  4173. // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >>
  4174. // 28))
  4175. // \|
  4176. // xx xx xx xx xx xx xx d2
  4177. //
  4178. // 00001101 00000010 (0d 02)
  4179. // \ \___ | |
  4180. // '---. \| |
  4181. // xxxxxxxx 11010010 (xx d2)
  4182. uint8x16_t paired64 =
  4183. vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
  4184. // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
  4185. // xx xx xx xx xx xx xx d2
  4186. // || return paired64[0]
  4187. // d2
  4188. // Note: Little endian would return the correct value 4b (01001011) instead.
  4189. return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
  4190. }
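// Usage sketch (illustrative only): a common pattern is combining a byte-wise
// compare with the movemask to locate matching bytes.
//
//   __m128i chunk = _mm_set1_epi8('x');                     // hypothetical input
//   __m128i eq = _mm_cmpeq_epi8(chunk, _mm_set1_epi8('x'));
//   int mask = _mm_movemask_epi8(eq);                       // 0xFFFF: all 16 match
//   // the lowest set bit of `mask` gives the index of the first matching byte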
  4191. // Set each bit of mask dst based on the most significant bit of the
  4192. // corresponding packed double-precision (64-bit) floating-point element in a.
  4193. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd
  4194. FORCE_INLINE int _mm_movemask_pd(__m128d a)
  4195. {
  4196. uint64x2_t input = vreinterpretq_u64_m128d(a);
  4197. uint64x2_t high_bits = vshrq_n_u64(input, 63);
  4198. return (int) (vgetq_lane_u64(high_bits, 0) |
  4199. (vgetq_lane_u64(high_bits, 1) << 1));
  4200. }
  4201. // Copy the lower 64-bit integer in a to dst.
  4202. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64
  4203. FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
  4204. {
  4205. return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
  4206. }
  4207. // Copy the 64-bit integer a to the lower element of dst, and zero the upper
  4208. // element.
  4209. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64
  4210. FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
  4211. {
  4212. return vreinterpretq_m128i_s64(
  4213. vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
  4214. }
  4215. // Multiply the low unsigned 32-bit integers from each packed 64-bit element in
  4216. // a and b, and store the unsigned 64-bit results in dst.
  4217. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32
  4218. FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
  4219. {
// vmull_u32 widens its 32-bit inputs instead of masking 64-bit ones, so
// narrow each 64-bit lane to its low 32 bits (vmovn_u64) before multiplying.
  4221. uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
  4222. uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
  4223. return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
  4224. }
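// Illustrative example: only the low (even) 32-bit lane of each 64-bit element
// takes part in the multiply, and the product is kept at full 64-bit width.
//
//   __m128i x = _mm_set_epi32(7, 100000, 9, 100000);
//   __m128i p = _mm_mul_epu32(x, x);
//   // 64-bit lane 0 = 100000ULL * 100000ULL = 10000000000 (wider than 32 bits)
//   // 64-bit lane 1 = the same; the 9 and 7 in the odd lanes are ignored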
  4225. // Multiply packed double-precision (64-bit) floating-point elements in a and b,
  4226. // and store the results in dst.
  4227. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
  4228. FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
  4229. {
  4230. #if defined(__aarch64__) || defined(_M_ARM64)
  4231. return vreinterpretq_m128d_f64(
  4232. vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
  4233. #else
  4234. double *da = (double *) &a;
  4235. double *db = (double *) &b;
  4236. double c[2];
  4237. c[0] = da[0] * db[0];
  4238. c[1] = da[1] * db[1];
  4239. return vld1q_f32((float32_t *) c);
  4240. #endif
  4241. }
  4242. // Multiply the lower double-precision (64-bit) floating-point element in a and
  4243. // b, store the result in the lower element of dst, and copy the upper element
  4244. // from a to the upper element of dst.
  4245. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd
  4246. FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
  4247. {
  4248. return _mm_move_sd(a, _mm_mul_pd(a, b));
  4249. }
  4250. // Multiply the low unsigned 32-bit integers from a and b, and store the
  4251. // unsigned 64-bit result in dst.
  4252. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32
  4253. FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
  4254. {
  4255. return vreinterpret_m64_u64(vget_low_u64(
  4256. vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
  4257. }
  4258. // Multiply the packed signed 16-bit integers in a and b, producing intermediate
  4259. // 32-bit integers, and store the high 16 bits of the intermediate integers in
  4260. // dst.
  4261. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16
  4262. FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
  4263. {
/* A shorter version based on vqdmulhq_s16 (saturating doubling multiply
 * returning the high half) is commented out below: it saturates for large
 * inputs, so the full widening multiply is used instead.
 * int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
 *                              vreinterpretq_s16_m128i(b)); // = 2*a*b
 * return vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
 */
  4268. int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
  4269. int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
  4270. int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
  4271. int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
  4272. int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
  4273. int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
  4274. uint16x8x2_t r =
  4275. vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
  4276. return vreinterpretq_m128i_u16(r.val[1]);
  4277. }
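// Usage sketch (not part of this header): the high half is often paired with
// _mm_mullo_epi16 to reconstruct the full 32-bit product.
//
//   __m128i va = _mm_set1_epi16(1000);
//   __m128i vb = _mm_set1_epi16(2000);
//   __m128i hi = _mm_mulhi_epi16(va, vb);  // 1000*2000 = 0x001E8480, so every
//   __m128i lo = _mm_mullo_epi16(va, vb);  // hi lane = 0x001E, lo lane = 0x8480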
  4278. // Multiply the packed unsigned 16-bit integers in a and b, producing
  4279. // intermediate 32-bit integers, and store the high 16 bits of the intermediate
  4280. // integers in dst.
  4281. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16
  4282. FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
  4283. {
  4284. uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
  4285. uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
  4286. uint32x4_t ab3210 = vmull_u16(a3210, b3210);
  4287. #if defined(__aarch64__) || defined(_M_ARM64)
  4288. uint32x4_t ab7654 =
  4289. vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
  4290. uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
  4291. vreinterpretq_u16_u32(ab7654));
  4292. return vreinterpretq_m128i_u16(r);
  4293. #else
  4294. uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
  4295. uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
  4296. uint32x4_t ab7654 = vmull_u16(a7654, b7654);
  4297. uint16x8x2_t r =
  4298. vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
  4299. return vreinterpretq_m128i_u16(r.val[1]);
  4300. #endif
  4301. }
  4302. // Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
  4303. // integers, and store the low 16 bits of the intermediate integers in dst.
  4304. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16
  4305. FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
  4306. {
  4307. return vreinterpretq_m128i_s16(
  4308. vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
  4309. }
  4310. // Compute the bitwise OR of packed double-precision (64-bit) floating-point
  4311. // elements in a and b, and store the results in dst.
  4312. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd
  4313. FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
  4314. {
  4315. return vreinterpretq_m128d_s64(
  4316. vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
  4317. }
  4318. // Compute the bitwise OR of 128 bits (representing integer data) in a and b,
  4319. // and store the result in dst.
  4320. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128
  4321. FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
  4322. {
  4323. return vreinterpretq_m128i_s32(
  4324. vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
  4325. }
  4326. // Convert packed signed 16-bit integers from a and b to packed 8-bit integers
  4327. // using signed saturation, and store the results in dst.
  4328. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16
  4329. FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
  4330. {
  4331. return vreinterpretq_m128i_s8(
  4332. vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
  4333. vqmovn_s16(vreinterpretq_s16_m128i(b))));
  4334. }
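// Illustrative example of the saturating behavior: values outside the int8_t
// range clamp to [-128, 127] instead of wrapping.
//
//   __m128i w = _mm_set_epi16(300, -300, 127, -128, 1, 0, -1, 42);
//   __m128i p = _mm_packs_epi16(w, w);
//   // the low 8 packed bytes become { 42, -1, 0, 1, -128, 127, -128, 127 }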
  4335. // Convert packed signed 32-bit integers from a and b to packed 16-bit integers
  4336. // using signed saturation, and store the results in dst.
  4337. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32
  4338. FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
  4339. {
  4340. return vreinterpretq_m128i_s16(
  4341. vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
  4342. vqmovn_s32(vreinterpretq_s32_m128i(b))));
  4343. }
  4344. // Convert packed signed 16-bit integers from a and b to packed 8-bit integers
  4345. // using unsigned saturation, and store the results in dst.
  4346. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16
  4347. FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
  4348. {
  4349. return vreinterpretq_m128i_u8(
  4350. vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
  4351. vqmovun_s16(vreinterpretq_s16_m128i(b))));
  4352. }
// Pause the processor. This is typically used in spin-wait loops; depending on
// the x86 processor, the delay is typically in the 40-100 cycle range. The
// 'yield' instruction isn't a good fit because it's effectively a nop on most
// Arm cores. Experience with several databases has shown that an 'isb' is a
// reasonable approximation.
  4358. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
  4359. FORCE_INLINE void _mm_pause(void)
  4360. {
  4361. #if defined(_MSC_VER)
  4362. __isb(_ARM64_BARRIER_SY);
  4363. #else
  4364. __asm__ __volatile__("isb\n");
  4365. #endif
  4366. }
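// Typical usage sketch (hypothetical C11 atomic flag `flag`):
//
//   while (atomic_load_explicit(&flag, memory_order_acquire) == 0)
//       _mm_pause();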
  4367. // Compute the absolute differences of packed unsigned 8-bit integers in a and
  4368. // b, then horizontally sum each consecutive 8 differences to produce two
  4369. // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
  4370. // 16 bits of 64-bit elements in dst.
  4371. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
  4372. FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
  4373. {
  4374. uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
  4375. return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
  4376. }
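// Illustrative example: summing absolute byte differences, as used in motion
// estimation or block matching.
//
//   __m128i x = _mm_set1_epi8(10);
//   __m128i y = _mm_set1_epi8(3);
//   __m128i sad = _mm_sad_epu8(x, y);
//   // each 64-bit lane holds 8 * |10 - 3| = 56 in its low 16 bits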
  4377. // Set packed 16-bit integers in dst with the supplied values.
  4378. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
  4379. FORCE_INLINE __m128i _mm_set_epi16(short i7,
  4380. short i6,
  4381. short i5,
  4382. short i4,
  4383. short i3,
  4384. short i2,
  4385. short i1,
  4386. short i0)
  4387. {
  4388. int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
  4389. return vreinterpretq_m128i_s16(vld1q_s16(data));
  4390. }
  4391. // Set packed 32-bit integers in dst with the supplied values.
  4392. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
  4393. FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
  4394. {
  4395. int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
  4396. return vreinterpretq_m128i_s32(vld1q_s32(data));
  4397. }
  4398. // Set packed 64-bit integers in dst with the supplied values.
  4399. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64
  4400. FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
  4401. {
  4402. return _mm_set_epi64x(vget_lane_s64(i1, 0), vget_lane_s64(i2, 0));
  4403. }
  4404. // Set packed 64-bit integers in dst with the supplied values.
  4405. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x
  4406. FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
  4407. {
  4408. return vreinterpretq_m128i_s64(
  4409. vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
  4410. }
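// Usage note (illustrative only): as with the x86 intrinsic, the first
// argument becomes the upper 64-bit element and the second the lower one.
//
//   __m128i v = _mm_set_epi64x(0x1122334455667788LL, 0x0102030405060708LL);
//   // _mm_cvtsi128_si64(v) == 0x0102030405060708LL (the second argument)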
  4411. // Set packed 8-bit integers in dst with the supplied values.
  4412. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8
  4413. FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
  4414. signed char b14,
  4415. signed char b13,
  4416. signed char b12,
  4417. signed char b11,
  4418. signed char b10,
  4419. signed char b9,
  4420. signed char b8,
  4421. signed char b7,
  4422. signed char b6,
  4423. signed char b5,
  4424. signed char b4,
  4425. signed char b3,
  4426. signed char b2,
  4427. signed char b1,
  4428. signed char b0)
  4429. {
  4430. int8_t ALIGN_STRUCT(16)
  4431. data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
  4432. (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
  4433. (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
  4434. (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
  4435. return (__m128i) vld1q_s8(data);
  4436. }
  4437. // Set packed double-precision (64-bit) floating-point elements in dst with the
  4438. // supplied values.
  4439. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd
  4440. FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
  4441. {
  4442. double ALIGN_STRUCT(16) data[2] = {e0, e1};
  4443. #if defined(__aarch64__) || defined(_M_ARM64)
  4444. return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
  4445. #else
  4446. return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
  4447. #endif
  4448. }
  4449. // Broadcast double-precision (64-bit) floating-point value a to all elements of
  4450. // dst.
  4451. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1
  4452. #define _mm_set_pd1 _mm_set1_pd
  4453. // Copy double-precision (64-bit) floating-point element a to the lower element
  4454. // of dst, and zero the upper element.
  4455. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
  4456. FORCE_INLINE __m128d _mm_set_sd(double a)
  4457. {
  4458. #if defined(__aarch64__) || defined(_M_ARM64)
  4459. return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
  4460. #else
  4461. return _mm_set_pd(0, a);
  4462. #endif
  4463. }
  4464. // Broadcast 16-bit integer a to all elements of dst.
  4465. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16
  4466. FORCE_INLINE __m128i _mm_set1_epi16(short w)
  4467. {
  4468. return vreinterpretq_m128i_s16(vdupq_n_s16(w));
  4469. }
  4470. // Broadcast 32-bit integer a to all elements of dst.
  4471. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32
  4472. FORCE_INLINE __m128i _mm_set1_epi32(int _i)
  4473. {
  4474. return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
  4475. }
  4476. // Broadcast 64-bit integer a to all elements of dst.
  4477. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64
  4478. FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
  4479. {
  4480. return vreinterpretq_m128i_s64(vdupq_lane_s64(_i, 0));
  4481. }
  4482. // Broadcast 64-bit integer a to all elements of dst.
  4483. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x
  4484. FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
  4485. {
  4486. return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
  4487. }
  4488. // Broadcast 8-bit integer a to all elements of dst.
  4489. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8
  4490. FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
  4491. {
  4492. return vreinterpretq_m128i_s8(vdupq_n_s8(w));
  4493. }
  4494. // Broadcast double-precision (64-bit) floating-point value a to all elements of
  4495. // dst.
  4496. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
  4497. FORCE_INLINE __m128d _mm_set1_pd(double d)
  4498. {
  4499. #if defined(__aarch64__) || defined(_M_ARM64)
  4500. return vreinterpretq_m128d_f64(vdupq_n_f64(d));
  4501. #else
  4502. return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
  4503. #endif
  4504. }
  4505. // Set packed 16-bit integers in dst with the supplied values in reverse order.
  4506. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16
  4507. FORCE_INLINE __m128i _mm_setr_epi16(short w0,
  4508. short w1,
  4509. short w2,
  4510. short w3,
  4511. short w4,
  4512. short w5,
  4513. short w6,
  4514. short w7)
  4515. {
  4516. int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
  4517. return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
  4518. }
  4519. // Set packed 32-bit integers in dst with the supplied values in reverse order.
  4520. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32
  4521. FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
  4522. {
  4523. int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
  4524. return vreinterpretq_m128i_s32(vld1q_s32(data));
  4525. }
  4526. // Set packed 64-bit integers in dst with the supplied values in reverse order.
  4527. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64
  4528. FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
  4529. {
  4530. return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
  4531. }
  4532. // Set packed 8-bit integers in dst with the supplied values in reverse order.
  4533. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8
  4534. FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
  4535. signed char b1,
  4536. signed char b2,
  4537. signed char b3,
  4538. signed char b4,
  4539. signed char b5,
  4540. signed char b6,
  4541. signed char b7,
  4542. signed char b8,
  4543. signed char b9,
  4544. signed char b10,
  4545. signed char b11,
  4546. signed char b12,
  4547. signed char b13,
  4548. signed char b14,
  4549. signed char b15)
  4550. {
  4551. int8_t ALIGN_STRUCT(16)
  4552. data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
  4553. (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
  4554. (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
  4555. (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
  4556. return (__m128i) vld1q_s8(data);
  4557. }
  4558. // Set packed double-precision (64-bit) floating-point elements in dst with the
  4559. // supplied values in reverse order.
  4560. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd
  4561. FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
  4562. {
  4563. return _mm_set_pd(e0, e1);
  4564. }
  4565. // Return vector of type __m128d with all elements set to zero.
  4566. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
  4567. FORCE_INLINE __m128d _mm_setzero_pd(void)
  4568. {
  4569. #if defined(__aarch64__) || defined(_M_ARM64)
  4570. return vreinterpretq_m128d_f64(vdupq_n_f64(0));
  4571. #else
  4572. return vreinterpretq_m128d_f32(vdupq_n_f32(0));
  4573. #endif
  4574. }
  4575. // Return vector of type __m128i with all elements set to zero.
  4576. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128
  4577. FORCE_INLINE __m128i _mm_setzero_si128(void)
  4578. {
  4579. return vreinterpretq_m128i_s32(vdupq_n_s32(0));
  4580. }
  4581. // Shuffle 32-bit integers in a using the control in imm8, and store the results
  4582. // in dst.
  4583. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32
  4584. // FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
  4585. // __constrange(0,255) int imm)
  4586. #if defined(_sse2neon_shuffle)
  4587. #define _mm_shuffle_epi32(a, imm) \
  4588. __extension__({ \
  4589. int32x4_t _input = vreinterpretq_s32_m128i(a); \
  4590. int32x4_t _shuf = \
  4591. vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
  4592. ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
  4593. vreinterpretq_m128i_s32(_shuf); \
  4594. })
  4595. #else // generic
  4596. #define _mm_shuffle_epi32(a, imm) \
  4597. _sse2neon_define1( \
  4598. __m128i, a, __m128i ret; switch (imm) { \
  4599. case _MM_SHUFFLE(1, 0, 3, 2): \
  4600. ret = _mm_shuffle_epi_1032(_a); \
  4601. break; \
  4602. case _MM_SHUFFLE(2, 3, 0, 1): \
  4603. ret = _mm_shuffle_epi_2301(_a); \
  4604. break; \
  4605. case _MM_SHUFFLE(0, 3, 2, 1): \
  4606. ret = _mm_shuffle_epi_0321(_a); \
  4607. break; \
  4608. case _MM_SHUFFLE(2, 1, 0, 3): \
  4609. ret = _mm_shuffle_epi_2103(_a); \
  4610. break; \
  4611. case _MM_SHUFFLE(1, 0, 1, 0): \
  4612. ret = _mm_shuffle_epi_1010(_a); \
  4613. break; \
  4614. case _MM_SHUFFLE(1, 0, 0, 1): \
  4615. ret = _mm_shuffle_epi_1001(_a); \
  4616. break; \
  4617. case _MM_SHUFFLE(0, 1, 0, 1): \
  4618. ret = _mm_shuffle_epi_0101(_a); \
  4619. break; \
  4620. case _MM_SHUFFLE(2, 2, 1, 1): \
  4621. ret = _mm_shuffle_epi_2211(_a); \
  4622. break; \
  4623. case _MM_SHUFFLE(0, 1, 2, 2): \
  4624. ret = _mm_shuffle_epi_0122(_a); \
  4625. break; \
  4626. case _MM_SHUFFLE(3, 3, 3, 2): \
  4627. ret = _mm_shuffle_epi_3332(_a); \
  4628. break; \
  4629. case _MM_SHUFFLE(0, 0, 0, 0): \
  4630. ret = _mm_shuffle_epi32_splat(_a, 0); \
  4631. break; \
  4632. case _MM_SHUFFLE(1, 1, 1, 1): \
  4633. ret = _mm_shuffle_epi32_splat(_a, 1); \
  4634. break; \
  4635. case _MM_SHUFFLE(2, 2, 2, 2): \
  4636. ret = _mm_shuffle_epi32_splat(_a, 2); \
  4637. break; \
  4638. case _MM_SHUFFLE(3, 3, 3, 3): \
  4639. ret = _mm_shuffle_epi32_splat(_a, 3); \
  4640. break; \
  4641. default: \
  4642. ret = _mm_shuffle_epi32_default(_a, (imm)); \
  4643. break; \
  4644. } _sse2neon_return(ret);)
  4645. #endif
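// Usage sketch (illustrative only): the immediate is usually built with
// _MM_SHUFFLE; each 2-bit field selects one source lane.
//
//   __m128i v = _mm_set_epi32(3, 2, 1, 0);                     // lanes = {0,1,2,3}
//   __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3)); // lanes = {3,2,1,0}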
  4646. // Shuffle double-precision (64-bit) floating-point elements using the control
  4647. // in imm8, and store the results in dst.
  4648. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd
  4649. #ifdef _sse2neon_shuffle
  4650. #define _mm_shuffle_pd(a, b, imm8) \
  4651. vreinterpretq_m128d_s64( \
  4652. vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
  4653. imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2))
  4654. #else
  4655. #define _mm_shuffle_pd(a, b, imm8) \
  4656. _mm_castsi128_pd(_mm_set_epi64x( \
  4657. vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
  4658. vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
  4659. #endif
  4660. // FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
  4661. // __constrange(0,255) int imm)
  4662. #if defined(_sse2neon_shuffle)
  4663. #define _mm_shufflehi_epi16(a, imm) \
  4664. __extension__({ \
  4665. int16x8_t _input = vreinterpretq_s16_m128i(a); \
  4666. int16x8_t _shuf = \
  4667. vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
  4668. (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
  4669. (((imm) >> 6) & 0x3) + 4); \
  4670. vreinterpretq_m128i_s16(_shuf); \
  4671. })
  4672. #else // generic
  4673. #define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
  4674. #endif
  4675. // FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
  4676. // __constrange(0,255) int imm)
  4677. #if defined(_sse2neon_shuffle)
  4678. #define _mm_shufflelo_epi16(a, imm) \
  4679. __extension__({ \
  4680. int16x8_t _input = vreinterpretq_s16_m128i(a); \
  4681. int16x8_t _shuf = vshuffleq_s16( \
  4682. _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
  4683. (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
  4684. vreinterpretq_m128i_s16(_shuf); \
  4685. })
  4686. #else // generic
  4687. #define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
  4688. #endif
  4689. // Shift packed 16-bit integers in a left by count while shifting in zeros, and
  4690. // store the results in dst.
  4691. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16
  4692. FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
  4693. {
  4694. uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
  4695. if (_sse2neon_unlikely(c & ~15))
  4696. return _mm_setzero_si128();
  4697. int16x8_t vc = vdupq_n_s16((int16_t) c);
  4698. return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
  4699. }
  4700. // Shift packed 32-bit integers in a left by count while shifting in zeros, and
  4701. // store the results in dst.
  4702. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32
  4703. FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
  4704. {
  4705. uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
  4706. if (_sse2neon_unlikely(c & ~31))
  4707. return _mm_setzero_si128();
  4708. int32x4_t vc = vdupq_n_s32((int32_t) c);
  4709. return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
  4710. }
  4711. // Shift packed 64-bit integers in a left by count while shifting in zeros, and
  4712. // store the results in dst.
  4713. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64
  4714. FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
  4715. {
  4716. uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
  4717. if (_sse2neon_unlikely(c & ~63))
  4718. return _mm_setzero_si128();
  4719. int64x2_t vc = vdupq_n_s64((int64_t) c);
  4720. return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
  4721. }
  4722. // Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
  4723. // store the results in dst.
  4724. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16
  4725. FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
  4726. {
  4727. if (_sse2neon_unlikely(imm & ~15))
  4728. return _mm_setzero_si128();
  4729. return vreinterpretq_m128i_s16(
  4730. vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
  4731. }
  4732. // Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
  4733. // store the results in dst.
  4734. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32
  4735. FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
  4736. {
  4737. if (_sse2neon_unlikely(imm & ~31))
  4738. return _mm_setzero_si128();
  4739. return vreinterpretq_m128i_s32(
  4740. vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
  4741. }
  4742. // Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
  4743. // store the results in dst.
  4744. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64
  4745. FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
  4746. {
  4747. if (_sse2neon_unlikely(imm & ~63))
  4748. return _mm_setzero_si128();
  4749. return vreinterpretq_m128i_s64(
  4750. vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
  4751. }
  4752. // Shift a left by imm8 bytes while shifting in zeros, and store the results in
  4753. // dst.
  4754. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
  4755. #define _mm_slli_si128(a, imm) \
  4756. _sse2neon_define1( \
  4757. __m128i, a, int8x16_t ret; \
  4758. if (_sse2neon_unlikely(imm == 0)) ret = vreinterpretq_s8_m128i(_a); \
  4759. else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \
  4760. else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a), \
  4761. ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \
  4762. _sse2neon_return(vreinterpretq_m128i_s8(ret));)
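// Illustrative example: the shift moves whole bytes towards the higher lanes,
// filling the vacated low bytes with zeros.
//
//   __m128i v = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
//                            7, 6, 5, 4, 3, 2, 1, 0);          // byte i = i
//   __m128i s = _mm_slli_si128(v, 4);  // bytes = {0,0,0,0, 0,1,2, ..., 11}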
  4763. // Compute the square root of packed double-precision (64-bit) floating-point
  4764. // elements in a, and store the results in dst.
  4765. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
  4766. FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
  4767. {
  4768. #if defined(__aarch64__) || defined(_M_ARM64)
  4769. return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
  4770. #else
  4771. double a0 = sqrt(((double *) &a)[0]);
  4772. double a1 = sqrt(((double *) &a)[1]);
  4773. return _mm_set_pd(a1, a0);
  4774. #endif
  4775. }
  4776. // Compute the square root of the lower double-precision (64-bit) floating-point
  4777. // element in b, store the result in the lower element of dst, and copy the
  4778. // upper element from a to the upper element of dst.
  4779. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
  4780. FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
  4781. {
  4782. #if defined(__aarch64__) || defined(_M_ARM64)
  4783. return _mm_move_sd(a, _mm_sqrt_pd(b));
  4784. #else
  4785. return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
  4786. #endif
  4787. }
  4788. // Shift packed 16-bit integers in a right by count while shifting in sign bits,
  4789. // and store the results in dst.
  4790. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16
  4791. FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
  4792. {
  4793. int64_t c = vgetq_lane_s64(count, 0);
  4794. if (_sse2neon_unlikely(c & ~15))
  4795. return _mm_cmplt_epi16(a, _mm_setzero_si128());
  4796. return vreinterpretq_m128i_s16(
  4797. vshlq_s16((int16x8_t) a, vdupq_n_s16((int) -c)));
  4798. }
  4799. // Shift packed 32-bit integers in a right by count while shifting in sign bits,
  4800. // and store the results in dst.
  4801. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32
  4802. FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
  4803. {
  4804. int64_t c = vgetq_lane_s64(count, 0);
  4805. if (_sse2neon_unlikely(c & ~31))
  4806. return _mm_cmplt_epi32(a, _mm_setzero_si128());
  4807. return vreinterpretq_m128i_s32(
  4808. vshlq_s32((int32x4_t) a, vdupq_n_s32((int) -c)));
  4809. }
  4810. // Shift packed 16-bit integers in a right by imm8 while shifting in sign
  4811. // bits, and store the results in dst.
  4812. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
  4813. FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
  4814. {
  4815. const int count = (imm & ~15) ? 15 : imm;
  4816. return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
  4817. }
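// Illustrative comparison (not part of this header): the arithmetic shift
// replicates the sign bit, unlike the logical _mm_srli_epi16.
//
//   __m128i v = _mm_set1_epi16(-16);                 // 0xFFF0
//   __m128i sa = _mm_srai_epi16(v, 2);               // every lane = -4 (0xFFFC)
//   __m128i sl = _mm_srli_epi16(v, 2);               // every lane = 0x3FFC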
  4818. // Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
  4819. // and store the results in dst.
  4820. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32
  4821. // FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
  4822. #define _mm_srai_epi32(a, imm) \
  4823. _sse2neon_define0( \
  4824. __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) == 0)) { \
  4825. ret = _a; \
  4826. } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \
  4827. ret = vreinterpretq_m128i_s32( \
  4828. vshlq_s32(vreinterpretq_s32_m128i(_a), vdupq_n_s32(-(imm)))); \
  4829. } else { \
  4830. ret = vreinterpretq_m128i_s32( \
  4831. vshrq_n_s32(vreinterpretq_s32_m128i(_a), 31)); \
  4832. } _sse2neon_return(ret);)

// Shift packed 16-bit integers in a right by count while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16
FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
{
    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
    if (_sse2neon_unlikely(c & ~15))
        return _mm_setzero_si128();

    int16x8_t vc = vdupq_n_s16(-(int16_t) c);
    return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
}

// Shift packed 32-bit integers in a right by count while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32
FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
{
    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
    if (_sse2neon_unlikely(c & ~31))
        return _mm_setzero_si128();

    int32x4_t vc = vdupq_n_s32(-(int32_t) c);
    return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
}

// Shift packed 64-bit integers in a right by count while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64
FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
{
    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
    if (_sse2neon_unlikely(c & ~63))
        return _mm_setzero_si128();

    int64x2_t vc = vdupq_n_s64(-(int64_t) c);
    return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
}

// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
#define _mm_srli_epi16(a, imm) \
    _sse2neon_define0( \
        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) { \
            ret = _mm_setzero_si128(); \
        } else { \
            ret = vreinterpretq_m128i_u16( \
                vshlq_u16(vreinterpretq_u16_m128i(_a), vdupq_n_s16(-(imm)))); \
        } _sse2neon_return(ret);)

// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32
// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
#define _mm_srli_epi32(a, imm) \
    _sse2neon_define0( \
        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~31)) { \
            ret = _mm_setzero_si128(); \
        } else { \
            ret = vreinterpretq_m128i_u32( \
                vshlq_u32(vreinterpretq_u32_m128i(_a), vdupq_n_s32(-(imm)))); \
        } _sse2neon_return(ret);)

// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64
#define _mm_srli_epi64(a, imm) \
    _sse2neon_define0( \
        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~63)) { \
            ret = _mm_setzero_si128(); \
        } else { \
            ret = vreinterpretq_m128i_u64( \
                vshlq_u64(vreinterpretq_u64_m128i(_a), vdupq_n_s64(-(imm)))); \
        } _sse2neon_return(ret);)

// Shift a right by imm8 bytes while shifting in zeros, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128
#define _mm_srli_si128(a, imm) \
    _sse2neon_define1( \
        __m128i, a, int8x16_t ret; \
        if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \
        else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \
                            (imm > 15 ? 0 : imm)); \
        _sse2neon_return(vreinterpretq_m128i_s8(ret));)
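
/* Illustrative note (added comment, not part of the upstream header):
 * _mm_srli_si128 shifts whole bytes, not bits. A minimal sketch of the byte
 * movement:
 *
 *     __m128i v = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
 *                               8, 9, 10, 11, 12, 13, 14, 15);
 *     __m128i r = _mm_srli_si128(v, 4);
 *     // byte 0 of r is 4, byte 11 is 15, bytes 12..15 are zero
 */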

// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
// or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
#else
    vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
#endif
}

// Store the lower double-precision (64-bit) floating-point element from a into
// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
// boundary or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
    vst1q_f64((float64_t *) mem_addr,
              vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
#else
    float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
    vst1q_f32((float32_t *) mem_addr,
              vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
#endif
}
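
/* Illustrative note (added comment, not part of the upstream header):
 * _mm_store_pd1 writes the low double to both slots of a 16-byte-aligned
 * buffer. A minimal sketch, assuming a GCC/Clang alignment attribute:
 *
 *     double buf[2] __attribute__((aligned(16)));
 *     _mm_store_pd1(buf, _mm_set_pd(2.0, 1.0));   // buf[0] == buf[1] == 1.0
 */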

// Store the lower double-precision (64-bit) floating-point element from a into
// memory. mem_addr does not need to be aligned on any particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
#else
    vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
#endif
}

// Store 128-bits of integer data from a into memory. mem_addr must be aligned
// on a 16-byte boundary or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
{
    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
}

// Store the lower double-precision (64-bit) floating-point element from a into
// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
// boundary or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd
#define _mm_store1_pd _mm_store_pd1

// Store the upper double-precision (64-bit) floating-point element from a into
// memory.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
#else
    vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
#endif
}

// Store 64-bit integer from the first element of a into memory.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64
FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
{
    vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));
}

// Store the lower double-precision (64-bit) floating-point element from a into
// memory.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
#else
    vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
#endif
}

// Store 2 double-precision (64-bit) floating-point elements from a into memory
// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
// general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd
FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
{
    float32x4_t f = vreinterpretq_f32_m128d(a);
    _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
}
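
/* Illustrative note (added comment, not part of the upstream header):
 * _mm_storer_pd writes the two doubles in reversed order. A minimal sketch:
 *
 *     double buf[2] __attribute__((aligned(16)));
 *     _mm_storer_pd(buf, _mm_set_pd(2.0, 1.0));   // buf[0] == 2.0, buf[1] == 1.0
 */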

// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
// elements) from a into memory. mem_addr does not need to be aligned on any
// particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd
FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
{
    _mm_store_pd(mem_addr, a);
}

// Store 128-bits of integer data from a into memory. mem_addr does not need to
// be aligned on any particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
{
    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
}

// Store 32-bit integer from the first element of a into memory. mem_addr does
// not need to be aligned on any particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32
FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
{
    vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
}

// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
// elements) from a into memory using a non-temporal memory hint. mem_addr must
// be aligned on a 16-byte boundary or a general-protection exception may be
// generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd
FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
{
#if __has_builtin(__builtin_nontemporal_store)
    __builtin_nontemporal_store(a, (__m128d *) p);
#elif defined(__aarch64__) || defined(_M_ARM64)
    vst1q_f64(p, vreinterpretq_f64_m128d(a));
#else
    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
#endif
}

// Store 128-bits of integer data from a into memory using a non-temporal memory
// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection
// exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128
FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
{
#if __has_builtin(__builtin_nontemporal_store)
    __builtin_nontemporal_store(a, p);
#else
    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
#endif
}
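
/* Note (added for clarity, not part of the upstream header): NEON has no
 * direct equivalent of x86 non-temporal stores, so when the compiler does not
 * provide __builtin_nontemporal_store these _mm_stream_* wrappers degrade to
 * ordinary stores (the scalar variants below always do). The data written is
 * the same; only the cache-bypass hint is lost.
 */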

// Store 32-bit integer a into memory using a non-temporal hint to minimize
// cache pollution. If the cache line containing address mem_addr is already in
// the cache, the cache will be updated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32
FORCE_INLINE void _mm_stream_si32(int *p, int a)
{
    vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
}

// Store 64-bit integer a into memory using a non-temporal hint to minimize
// cache pollution. If the cache line containing address mem_addr is already in
// the cache, the cache will be updated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64
FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
{
    vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
}

// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16
FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32
FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64
FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s64(
        vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
}

// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8
FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Subtract packed double-precision (64-bit) floating-point elements in b from
// packed double-precision (64-bit) floating-point elements in a, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd
FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] - db[0];
    c[1] = da[1] - db[1];
    return vld1q_f32((float32_t *) c);
#endif
}

// Subtract the lower double-precision (64-bit) floating-point element in b from
// the lower double-precision (64-bit) floating-point element in a, store the
// result in the lower element of dst, and copy the upper element from a to the
// upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd
FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_sub_pd(a, b));
}

// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64
FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
{
    return vreinterpret_m64_s64(
        vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
}

// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a
// using saturation, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16
FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a
// using saturation, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8
FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit
// integers in a using saturation, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16
FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
}

// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit
// integers in a using saturation, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8
FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}
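
/* Illustrative note (added comment, not part of the upstream header): the
 * saturating forms clamp instead of wrapping. A minimal sketch for the
 * unsigned 8-bit case:
 *
 *     __m128i a = _mm_set1_epi8(10);
 *     __m128i b = _mm_set1_epi8(20);
 *     __m128i r = _mm_subs_epu8(a, b);   // every lane is 0, not 246
 */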

#define _mm_ucomieq_sd _mm_comieq_sd
#define _mm_ucomige_sd _mm_comige_sd
#define _mm_ucomigt_sd _mm_comigt_sd
#define _mm_ucomile_sd _mm_comile_sd
#define _mm_ucomilt_sd _mm_comilt_sd
#define _mm_ucomineq_sd _mm_comineq_sd

// Return vector of type __m128d with undefined elements.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd
FORCE_INLINE __m128d _mm_undefined_pd(void)
{
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
    __m128d a;
#if defined(_MSC_VER)
    a = _mm_setzero_pd();
#endif
    return a;
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}

// Unpack and interleave 16-bit integers from the high half of a and b, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16
FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s16(
        vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
#else
    int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
    int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
    int16x4x2_t result = vzip_s16(a1, b1);
    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
#endif
}
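
/* Illustrative note (added comment, not part of the upstream header): unpackhi
 * interleaves the upper halves of its operands. A minimal sketch for 16-bit
 * lanes:
 *
 *     __m128i a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
 *     __m128i b = _mm_setr_epi16(10, 11, 12, 13, 14, 15, 16, 17);
 *     __m128i r = _mm_unpackhi_epi16(a, b);   // 4, 14, 5, 15, 6, 16, 7, 17
 */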

// Unpack and interleave 32-bit integers from the high half of a and b, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32
FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s32(
        vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
#else
    int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
    int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
    int32x2x2_t result = vzip_s32(a1, b1);
    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
#endif
}

// Unpack and interleave 64-bit integers from the high half of a and b, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64
FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s64(
        vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
#else
    int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
    int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
    return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
#endif
}

// Unpack and interleave 8-bit integers from the high half of a and b, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8
FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s8(
        vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
#else
    int8x8_t a1 =
        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
    int8x8_t b1 =
        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
    int8x8x2_t result = vzip_s8(a1, b1);
    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
#endif
}

// Unpack and interleave double-precision (64-bit) floating-point elements from
// the high half of a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    return vreinterpretq_m128d_s64(
        vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
                     vget_high_s64(vreinterpretq_s64_m128d(b))));
#endif
}

// Unpack and interleave 16-bit integers from the low half of a and b, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16
FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s16(
        vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
#else
    int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
    int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
    int16x4x2_t result = vzip_s16(a1, b1);
    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
#endif
}

// Unpack and interleave 32-bit integers from the low half of a and b, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32
FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s32(
        vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
#else
    int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
    int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
    int32x2x2_t result = vzip_s32(a1, b1);
    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
#endif
}

// Unpack and interleave 64-bit integers from the low half of a and b, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64
FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s64(
        vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
#else
    int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
    int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
    return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
#endif
}

// Unpack and interleave 8-bit integers from the low half of a and b, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8
FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s8(
        vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
#else
    int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
    int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
    int8x8x2_t result = vzip_s8(a1, b1);
    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
#endif
}
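
/* Illustrative note (added comment, not part of the upstream header): the
 * unpacklo forms do the same interleaving on the lower halves, e.g. for 8-bit
 * lanes _mm_unpacklo_epi8(a, b) yields a0, b0, a1, b1, ..., a7, b7.
 */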

// Unpack and interleave double-precision (64-bit) floating-point elements from
// the low half of a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    return vreinterpretq_m128d_s64(
        vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
                     vget_low_s64(vreinterpretq_s64_m128d(b))));
#endif
}

// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
// elements in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd
FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
{
    return vreinterpretq_m128d_s64(
        veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
}

// Compute the bitwise XOR of 128 bits (representing integer data) in a and b,
// and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128
FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

/* SSE3 */

// Alternatively add and subtract packed double-precision (64-bit)
// floating-point elements in a to/from packed elements in b, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd
FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
{
    _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
                                             vreinterpretq_f64_m128d(b),
                                             vreinterpretq_f64_m128d(mask)));
#else
    return _mm_add_pd(_mm_mul_pd(b, mask), a);
#endif
}

// Alternatively add and subtract packed single-precision (32-bit)
// floating-point elements in a to/from packed elements in b, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps
FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
{
    _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
    defined(__ARM_FEATURE_FMA) /* VFPv4+ */
    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
                                            vreinterpretq_f32_m128(mask),
                                            vreinterpretq_f32_m128(b)));
#else
    return _mm_add_ps(_mm_mul_ps(b, mask), a);
#endif
}
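
/* Illustrative note (added comment, not part of the upstream header): addsub
 * subtracts in the even lanes and adds in the odd lanes. A minimal sketch:
 *
 *     __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
 *     __m128 b = _mm_setr_ps(10.0f, 10.0f, 10.0f, 10.0f);
 *     __m128 r = _mm_addsub_ps(a, b);   // -9, 12, -7, 14
 */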

// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
// elements in a and b, and pack the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[] = {da[0] + da[1], db[0] + db[1]};
    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
#endif
}

// Horizontally add adjacent pairs of single-precision (32-bit) floating-point
// elements in a and b, and pack the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128_f32(
        vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(
        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
#endif
}
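
/* Illustrative note (added comment, not part of the upstream header):
 * horizontal add pairs adjacent lanes within each operand. A minimal sketch:
 *
 *     __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
 *     __m128 b = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
 *     __m128 r = _mm_hadd_ps(a, b);   // 3, 7, 11, 15
 */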

// Horizontally subtract adjacent pairs of double-precision (64-bit)
// floating-point elements in a and b, and pack the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    float64x2_t a = vreinterpretq_f64_m128d(_a);
    float64x2_t b = vreinterpretq_f64_m128d(_b);
    return vreinterpretq_m128d_f64(
        vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
#else
    double *da = (double *) &_a;
    double *db = (double *) &_b;
    double c[] = {da[0] - da[1], db[0] - db[1]};
    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
#endif
}

// Horizontally subtract adjacent pairs of single-precision (32-bit)
// floating-point elements in a and b, and pack the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps
FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
{
    float32x4_t a = vreinterpretq_f32_m128(_a);
    float32x4_t b = vreinterpretq_f32_m128(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128_f32(
        vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
#else
    float32x4x2_t c = vuzpq_f32(a, b);
    return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
#endif
}

// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
// may perform better than _mm_loadu_si128 when the data crosses a cache line
// boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128
#define _mm_lddqu_si128 _mm_loadu_si128

// Load a double-precision (64-bit) floating-point element from memory into both
// elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd
#define _mm_loaddup_pd _mm_load1_pd

// Duplicate the low double-precision (64-bit) floating-point element from a,
// and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
#else
    return vreinterpretq_m128d_u64(
        vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
#endif
}

// Duplicate odd-indexed single-precision (32-bit) floating-point elements
// from a, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128_f32(
        vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
#elif defined(_sse2neon_shuffle)
    return vreinterpretq_m128_f32(vshuffleq_s32(
        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
#else
    float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
    float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
    float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
    return vreinterpretq_m128_f32(vld1q_f32(data));
#endif
}

// Duplicate even-indexed single-precision (32-bit) floating-point elements
// from a, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128_f32(
        vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
#elif defined(_sse2neon_shuffle)
    return vreinterpretq_m128_f32(vshuffleq_s32(
        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
#else
    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
    float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
    float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
    return vreinterpretq_m128_f32(vld1q_f32(data));
#endif
}
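
/* Illustrative note (added comment, not part of the upstream header): given
 * lanes (x0, x1, x2, x3), _mm_moveldup_ps yields (x0, x0, x2, x2) and
 * _mm_movehdup_ps yields (x1, x1, x3, x3); the pair is commonly combined with
 * _mm_addsub_ps for complex multiplication.
 */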

/* SSSE3 */

// Compute the absolute value of packed signed 16-bit integers in a, and store
// the unsigned results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16
FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
{
    return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
}

// Compute the absolute value of packed signed 32-bit integers in a, and store
// the unsigned results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32
FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
{
    return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
}

// Compute the absolute value of packed signed 8-bit integers in a, and store
// the unsigned results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8
FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
{
    return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
}

// Compute the absolute value of packed signed 16-bit integers in a, and store
// the unsigned results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16
FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
{
    return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
}

// Compute the absolute value of packed signed 32-bit integers in a, and store
// the unsigned results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32
FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
{
    return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
}

// Compute the absolute value of packed signed 8-bit integers in a, and store
// the unsigned results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8
FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
{
    return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
}

// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
// the result right by imm8 bytes, and store the low 16 bytes in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
#if defined(__GNUC__) && !defined(__clang__)
#define _mm_alignr_epi8(a, b, imm) \
    __extension__({ \
        uint8x16_t _a = vreinterpretq_u8_m128i(a); \
        uint8x16_t _b = vreinterpretq_u8_m128i(b); \
        __m128i ret; \
        if (_sse2neon_unlikely((imm) & ~31)) \
            ret = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
        else if (imm >= 16) \
            ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0); \
        else \
            ret = \
                vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \
        ret; \
    })
#else
#define _mm_alignr_epi8(a, b, imm) \
    _sse2neon_define2( \
        __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a); \
        uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret; \
        if (_sse2neon_unlikely((imm) & ~31)) ret = \
            vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
        else if (imm >= 16) ret = \
            _mm_srli_si128(_a, imm >= 16 ? imm - 16 : 0); \
        else ret = \
            vreinterpretq_m128i_u8(vextq_u8(__b, __a, imm < 16 ? imm : 0)); \
        _sse2neon_return(ret);)
#endif
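
/* Illustrative note (added comment, not part of the upstream header): alignr
 * concatenates a:b (a in the upper 16 bytes) and extracts 16 bytes starting
 * imm8 bytes into the pair. A minimal sketch:
 *
 *     __m128i hi = _mm_set1_epi8(1);
 *     __m128i lo = _mm_set1_epi8(2);
 *     __m128i r = _mm_alignr_epi8(hi, lo, 14);
 *     // bytes 0..1 of r come from lo, bytes 2..15 come from hi
 */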

// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
// the result right by imm8 bytes, and store the low 8 bytes in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8
#define _mm_alignr_pi8(a, b, imm) \
    _sse2neon_define2( \
        __m64, a, b, __m64 ret; if (_sse2neon_unlikely((imm) >= 16)) { \
            ret = vreinterpret_m64_s8(vdup_n_s8(0)); \
        } else { \
            uint8x8_t tmp_low; \
            uint8x8_t tmp_high; \
            if ((imm) >= 8) { \
                const int idx = (imm) - 8; \
                tmp_low = vreinterpret_u8_m64(_a); \
                tmp_high = vdup_n_u8(0); \
                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
            } else { \
                const int idx = (imm); \
                tmp_low = vreinterpret_u8_m64(_b); \
                tmp_high = vreinterpret_u8_m64(_a); \
                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
            } \
        } _sse2neon_return(ret);)

// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
// signed 16-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16
FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
{
    int16x8_t a = vreinterpretq_s16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
#else
    return vreinterpretq_m128i_s16(
        vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
                     vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
#endif
}

// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
// signed 32-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32
FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
{
    int32x4_t a = vreinterpretq_s32_m128i(_a);
    int32x4_t b = vreinterpretq_s32_m128i(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s32(vpaddq_s32(a, b));
#else
    return vreinterpretq_m128i_s32(
        vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
                     vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
#endif
}

// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
// signed 16-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16
FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
{
    return vreinterpret_m64_s16(
        vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
}

// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
// signed 32-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32
FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
{
    return vreinterpret_m64_s32(
        vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
}

// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
// saturation, and pack the signed 16-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16
FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    int16x8_t a = vreinterpretq_s16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);
    return vreinterpretq_s64_s16(
        vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
#else
    int32x4_t a = vreinterpretq_s32_m128i(_a);
    int32x4_t b = vreinterpretq_s32_m128i(_b);
    // Interleave using vshrn/vmovn
    // [a0|a2|a4|a6|b0|b2|b4|b6]
    // [a1|a3|a5|a7|b1|b3|b5|b7]
    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
    // Saturated add
    return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
#endif
}

// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
// saturation, and pack the signed 16-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16
FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
{
    int16x4_t a = vreinterpret_s16_m64(_a);
    int16x4_t b = vreinterpret_s16_m64(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
#else
    int16x4x2_t res = vuzp_s16(a, b);
    return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
#endif
}

// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
// the signed 16-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16
FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
{
    int16x8_t a = vreinterpretq_s16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s16(
        vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
#else
    int16x8x2_t c = vuzpq_s16(a, b);
    return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));
#endif
}

// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
// the signed 32-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32
FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
{
    int32x4_t a = vreinterpretq_s32_m128i(_a);
    int32x4_t b = vreinterpretq_s32_m128i(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s32(
        vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
#else
    int32x4x2_t c = vuzpq_s32(a, b);
    return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1]));
#endif
}

// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
// the signed 16-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16
FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
{
    int16x4_t a = vreinterpret_s16_m64(_a);
    int16x4_t b = vreinterpret_s16_m64(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
#else
    int16x4x2_t c = vuzp_s16(a, b);
    return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1]));
#endif
}

// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
// the signed 32-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32
FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
{
    int32x2_t a = vreinterpret_s32_m64(_a);
    int32x2_t b = vreinterpret_s32_m64(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
#else
    int32x2x2_t c = vuzp_s32(a, b);
    return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1]));
#endif
}

// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
// using saturation, and pack the signed 16-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16
FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
{
    int16x8_t a = vreinterpretq_s16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s16(
        vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
#else
    int16x8x2_t c = vuzpq_s16(a, b);
    return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1]));
#endif
}

// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
// using saturation, and pack the signed 16-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16
FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
{
    int16x4_t a = vreinterpret_s16_m64(_a);
    int16x4_t b = vreinterpret_s16_m64(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
#else
    int16x4x2_t c = vuzp_s16(a, b);
    return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1]));
#endif
}

// Vertically multiply each unsigned 8-bit integer from a with the corresponding
// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
// and pack the saturated results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    uint8x16_t a = vreinterpretq_u8_m128i(_a);
    int8x16_t b = vreinterpretq_s8_m128i(_b);
    int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
                             vmovl_s8(vget_low_s8(b)));
    int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
                             vmovl_s8(vget_high_s8(b)));
    return vreinterpretq_m128i_s16(
        vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
#else
    // This would be much simpler if x86 would choose to zero extend OR sign
    // extend, not both. This could probably be optimized better.
    uint16x8_t a = vreinterpretq_u16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);

    // Zero extend a
    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
    int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));

    // Sign extend by shifting left then shifting right.
    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
    int16x8_t b_odd = vshrq_n_s16(b, 8);

    // multiply
    int16x8_t prod1 = vmulq_s16(a_even, b_even);
    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);

    // saturated add
    return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
#endif
}
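
/* Illustrative note (added comment, not part of the upstream header): each
 * 16-bit result is u0*s0 + u1*s1 for one pair of adjacent bytes, with a
 * treated as unsigned and b as signed, saturated to the int16_t range.
 * A minimal sketch:
 *
 *     __m128i a = _mm_set1_epi8((char) 200);   // bytes read as unsigned 200
 *     __m128i b = _mm_set1_epi8(-1);           // bytes read as signed -1
 *     __m128i r = _mm_maddubs_epi16(a, b);     // every 16-bit lane is -400
 */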

// Vertically multiply each unsigned 8-bit integer from a with the corresponding
// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
// pack the saturated results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16
FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
{
    uint16x4_t a = vreinterpret_u16_m64(_a);
    int16x4_t b = vreinterpret_s16_m64(_b);

    // Zero extend a
    int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
    int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));

    // Sign extend by shifting left then shifting right.
    int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
    int16x4_t b_odd = vshr_n_s16(b, 8);

    // multiply
    int16x4_t prod1 = vmul_s16(a_even, b_even);
    int16x4_t prod2 = vmul_s16(a_odd, b_odd);

    // saturated add
    return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
}

// Multiply packed signed 16-bit integers in a and b, producing intermediate
// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
// the packed 16-bit integers in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16
FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
{
    // Has issues due to saturation
    // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));

    // Multiply
    int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
                                 vget_low_s16(vreinterpretq_s16_m128i(b)));
    int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
                                 vget_high_s16(vreinterpretq_s16_m128i(b)));

    // Rounding narrowing shift right
    // narrow = (int16_t)((mul + 16384) >> 15);
    int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
    int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);

    // Join together
    return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
}
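
/* Illustrative note (added comment, not part of the upstream header): each
 * lane computes (int16_t)(((int32_t) a * b + 0x4000) >> 15), i.e. a Q15
 * fixed-point multiply with round-to-nearest. For example
 * 0x4000 * 0x4000 (0.5 * 0.5 in Q15) yields 0x2000 (0.25).
 */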

// Multiply packed signed 16-bit integers in a and b, producing intermediate
// signed 32-bit integers. Truncate each intermediate integer to the 18 most
// significant bits, round by adding 1, and store bits [16:1] to dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16
FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
{
    int32x4_t mul_extend =
        vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));

    // Rounding narrowing shift right
    return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
}

// Shuffle packed 8-bit integers in a according to shuffle control mask in the
// corresponding 8-bit element of b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
{
    int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
    uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
    uint8x16_t idx_masked =
        vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
#elif defined(__GNUC__)
    int8x16_t ret;
    // %e and %f represent the even and odd D registers
    // respectively.
    __asm__ __volatile__(
        "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
        "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
        : [ret] "=&w"(ret)
        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
    return vreinterpretq_m128i_s8(ret);
#else
    // use this line if testing on aarch64
    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
    return vreinterpretq_m128i_s8(
        vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
                    vtbl2_s8(a_split, vget_high_u8(idx_masked))));
#endif
}
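
/* Illustrative note (added comment, not part of the upstream header): each
 * selector byte in b picks one byte of a, and a selector with its high bit set
 * produces zero, which the 0x8F mask above preserves. A minimal sketch, given
 * some vector a:
 *
 *     __m128i idx = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
 *                                 7, 6, 5, 4, 3, 2, 1, -128);
 *     __m128i r = _mm_shuffle_epi8(a, idx);   // a reversed, last byte zeroed
 */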

// Shuffle packed 8-bit integers in a according to shuffle control mask in the
// corresponding 8-bit element of b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8
FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
{
    const int8x8_t controlMask =
        vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));
    int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
    return vreinterpret_m64_s8(res);
}

// Negate packed 16-bit integers in a when the corresponding signed
// 16-bit integer in b is negative, and store the results in dst.
// Elements in dst are zeroed out when the corresponding element
// in b is zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16
FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
{
    int16x8_t a = vreinterpretq_s16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);

    // signed shift right: faster than vclt
    // (b < 0) ? 0xFFFF : 0
    uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));

    // (b == 0) ? 0xFFFF : 0
#if defined(__aarch64__) || defined(_M_ARM64)
    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
#else
    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
#endif

    // bitwise select either a or negative 'a' (vnegq_s16(a) yields negative
    // 'a') based on ltMask
    int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
    // res = masked & (~zeroMask)
    int16x8_t res = vbicq_s16(masked, zeroMask);
    return vreinterpretq_m128i_s16(res);
}
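
/* Illustrative note (added comment, not part of the upstream header): sign
 * copies, negates or zeroes each lane of a depending on the sign of the
 * matching lane in b. A minimal sketch for 16-bit lanes:
 *
 *     __m128i a = _mm_set1_epi16(5);
 *     __m128i b = _mm_setr_epi16(-1, 0, 1, -2, 0, 2, -3, 3);
 *     __m128i r = _mm_sign_epi16(a, b);   // -5, 0, 5, -5, 0, 5, -5, 5
 */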
  5917. // Negate packed 32-bit integers in a when the corresponding signed
  5918. // 32-bit integer in b is negative, and store the results in dst.
  5919. // Element in dst are zeroed out when the corresponding element
  5920. // in b is zero.
  5921. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32
  5922. FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
  5923. {
  5924. int32x4_t a = vreinterpretq_s32_m128i(_a);
  5925. int32x4_t b = vreinterpretq_s32_m128i(_b);
  5926. // signed shift right: faster than vclt
  5927. // (b < 0) ? 0xFFFFFFFF : 0
  5928. uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
  5929. // (b == 0) ? 0xFFFFFFFF : 0
  5930. #if defined(__aarch64__) || defined(_M_ARM64)
  5931. int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
  5932. #else
  5933. int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
  5934. #endif
  5935. // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
  5936. // 'a') based on ltMask
  5937. int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
  5938. // res = masked & (~zeroMask)
  5939. int32x4_t res = vbicq_s32(masked, zeroMask);
  5940. return vreinterpretq_m128i_s32(res);
  5941. }
// Negate packed 8-bit integers in a when the corresponding signed
// 8-bit integer in b is negative, and store the results in dst.
// Elements in dst are zeroed out when the corresponding element
// in b is zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8
FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
{
    int8x16_t a = vreinterpretq_s8_m128i(_a);
    int8x16_t b = vreinterpretq_s8_m128i(_b);
    // signed shift right: faster than vclt
    // (b < 0) ? 0xFF : 0
    uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
    // (b == 0) ? 0xFF : 0
#if defined(__aarch64__) || defined(_M_ARM64)
    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
#else
    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
#endif
    // bitwise select either a or negated 'a' (vnegq_s8(a) returns the negated
    // 'a') based on ltMask
    int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
    // res = masked & (~zeroMask)
    int8x16_t res = vbicq_s8(masked, zeroMask);
    return vreinterpretq_m128i_s8(res);
}
// Negate packed 16-bit integers in a when the corresponding signed 16-bit
// integer in b is negative, and store the results in dst. Elements in dst are
// zeroed out when the corresponding element in b is zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16
FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
{
    int16x4_t a = vreinterpret_s16_m64(_a);
    int16x4_t b = vreinterpret_s16_m64(_b);
    // signed shift right: faster than vclt
    // (b < 0) ? 0xFFFF : 0
    uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
    // (b == 0) ? 0xFFFF : 0
#if defined(__aarch64__) || defined(_M_ARM64)
    int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
#else
    int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
#endif
    // bitwise select either a or negated 'a' (vneg_s16(a) returns the negated
    // 'a') based on ltMask
    int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
    // res = masked & (~zeroMask)
    int16x4_t res = vbic_s16(masked, zeroMask);
    return vreinterpret_m64_s16(res);
}
// Negate packed 32-bit integers in a when the corresponding signed 32-bit
// integer in b is negative, and store the results in dst. Elements in dst are
// zeroed out when the corresponding element in b is zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32
FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
{
    int32x2_t a = vreinterpret_s32_m64(_a);
    int32x2_t b = vreinterpret_s32_m64(_b);
    // signed shift right: faster than vclt
    // (b < 0) ? 0xFFFFFFFF : 0
    uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
    // (b == 0) ? 0xFFFFFFFF : 0
#if defined(__aarch64__) || defined(_M_ARM64)
    int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
#else
    int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
#endif
    // bitwise select either a or negated 'a' (vneg_s32(a) returns the negated
    // 'a') based on ltMask
    int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
    // res = masked & (~zeroMask)
    int32x2_t res = vbic_s32(masked, zeroMask);
    return vreinterpret_m64_s32(res);
}
// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
// in b is negative, and store the results in dst. Elements in dst are zeroed
// out when the corresponding element in b is zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8
FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
{
    int8x8_t a = vreinterpret_s8_m64(_a);
    int8x8_t b = vreinterpret_s8_m64(_b);
    // signed shift right: faster than vclt
    // (b < 0) ? 0xFF : 0
    uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
    // (b == 0) ? 0xFF : 0
#if defined(__aarch64__) || defined(_M_ARM64)
    int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
#else
    int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
#endif
    // bitwise select either a or negated 'a' (vneg_s8(a) returns the negated
    // 'a') based on ltMask
    int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
    // res = masked & (~zeroMask)
    int8x8_t res = vbic_s8(masked, zeroMask);
    return vreinterpret_m64_s8(res);
}
/* SSE4.1 */
// Blend packed 16-bit integers from a and b using control mask imm8, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16
// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
//                                      __constrange(0,255) int imm)
#define _mm_blend_epi16(a, b, imm)                                      \
    _sse2neon_define2(                                                  \
        __m128i, a, b,                                                  \
        const uint16_t _mask[8] =                                       \
            _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0,    \
                           ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0,    \
                           ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0,    \
                           ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0,    \
                           ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0,    \
                           ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0,    \
                           ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0,    \
                           ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0);   \
        uint16x8_t _mask_vec = vld1q_u16(_mask);                        \
        uint16x8_t __a = vreinterpretq_u16_m128i(_a);                   \
        uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \
            vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, __b, __a)));)
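// Usage sketch for _mm_blend_epi16 (illustrative): each of the low 8 bits of
// imm8 picks the corresponding 16-bit lane from b (bit set) or a (bit clear):
//   __m128i r = _mm_blend_epi16(a, b, 0x0F); // lanes 0-3 from b, 4-7 from a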
// Blend packed double-precision (64-bit) floating-point elements from a and b
// using control mask imm8, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd
#define _mm_blend_pd(a, b, imm)                                              \
    _sse2neon_define2(                                                       \
        __m128d, a, b,                                                       \
        const uint64_t _mask[2] =                                            \
            _sse2neon_init(((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0),  \
                           ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)); \
        uint64x2_t _mask_vec = vld1q_u64(_mask);                             \
        uint64x2_t __a = vreinterpretq_u64_m128d(_a);                        \
        uint64x2_t __b = vreinterpretq_u64_m128d(_b); _sse2neon_return(      \
            vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, __b, __a)));)
// Blend packed single-precision (32-bit) floating-point elements from a and b
// using control mask imm8, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
{
    const uint32_t ALIGN_STRUCT(16)
        data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
                   ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
                   ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
                   ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
    uint32x4_t mask = vld1q_u32(data);
    float32x4_t a = vreinterpretq_f32_m128(_a);
    float32x4_t b = vreinterpretq_f32_m128(_b);
    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
}
// Blend packed 8-bit integers from a and b using mask, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8
FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
{
    // Use a signed shift right to create a mask with the sign bit
    uint8x16_t mask =
        vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
    uint8x16_t a = vreinterpretq_u8_m128i(_a);
    uint8x16_t b = vreinterpretq_u8_m128i(_b);
    return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
}
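// Usage sketch for _mm_blendv_epi8 (illustrative): only the most significant
// bit of each mask byte selects the source, e.g.
//   __m128i m = _mm_set1_epi8((char) 0x80); // MSB set in every byte
//   __m128i r = _mm_blendv_epi8(a, b, m);   // r == b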
// Blend packed double-precision (64-bit) floating-point elements from a and b
// using mask, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd
FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
{
    uint64x2_t mask =
        vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
#if defined(__aarch64__) || defined(_M_ARM64)
    float64x2_t a = vreinterpretq_f64_m128d(_a);
    float64x2_t b = vreinterpretq_f64_m128d(_b);
    return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
#else
    uint64x2_t a = vreinterpretq_u64_m128d(_a);
    uint64x2_t b = vreinterpretq_u64_m128d(_b);
    return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
#endif
}
// Blend packed single-precision (32-bit) floating-point elements from a and b
// using mask, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps
FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
{
    // Use a signed shift right to create a mask with the sign bit
    uint32x4_t mask =
        vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
    float32x4_t a = vreinterpretq_f32_m128(_a);
    float32x4_t b = vreinterpretq_f32_m128(_b);
    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
}
// Round the packed double-precision (64-bit) floating-point elements in a up
// to an integer value, and store the results as packed double-precision
// floating-point elements in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
#else
    double *f = (double *) &a;
    return _mm_set_pd(ceil(f[1]), ceil(f[0]));
#endif
}
// Round the packed single-precision (32-bit) floating-point elements in a up to
// an integer value, and store the results as packed single-precision
// floating-point elements in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
{
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
    defined(__ARM_FEATURE_DIRECTED_ROUNDING)
    return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
#else
    float *f = (float *) &a;
    return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
#endif
}
// Round the lower double-precision (64-bit) floating-point element in b up to
// an integer value, store the result as a double-precision floating-point
// element in the lower element of dst, and copy the upper element from a to the
// upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_ceil_pd(b));
}
// Round the lower single-precision (32-bit) floating-point element in b up to
// an integer value, store the result as a single-precision floating-point
// element in the lower element of dst, and copy the upper 3 packed elements
// from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_ceil_ps(b));
}
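// Usage sketch for _mm_ceil_ss (illustrative):
//   __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
//   __m128 b = _mm_set_ps(9.9f, 9.9f, 9.9f, 0.2f);
//   __m128 r = _mm_ceil_ss(a, b); // lanes (0..3): 1.0, 2.0, 3.0, 4.0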
// Compare packed 64-bit integers in a and b for equality, and store the results
// in dst.
FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_u64(
        vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
#else
    // ARMv7 lacks vceqq_u64
    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
    uint32x4_t cmp =
        vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
    uint32x4_t swapped = vrev64q_u32(cmp);
    return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
#endif
}
// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32
FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
{
    return vreinterpretq_m128i_s32(
        vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
}
// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64
FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
{
    int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
    return vreinterpretq_m128i_s64(s64x2);
}
// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64
FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
{
    return vreinterpretq_m128i_s64(
        vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
}
// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16
FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
{
    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
    return vreinterpretq_m128i_s16(s16x8);
}
// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32
FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
{
    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
    return vreinterpretq_m128i_s32(s32x4);
}
// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit
// integers, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64
FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
{
    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
    return vreinterpretq_m128i_s64(s64x2);
}
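// Usage sketch for the sign-extending conversions (illustrative):
//   __m128i v = _mm_cvtsi32_si128(0x0000FE01); // byte lanes 0,1 = 0x01, 0xFE
//   __m128i r = _mm_cvtepi8_epi64(v);          // 64-bit lanes: 1, -2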
// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers,
// and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32
FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
{
    return vreinterpretq_m128i_u32(
        vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
}
// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers,
// and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64
FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
{
    uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
    return vreinterpretq_m128i_u64(u64x2);
}
// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers,
// and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64
FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
{
    return vreinterpretq_m128i_u64(
        vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
}
// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
// and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16
FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
{
    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx HGFE DCBA */
    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
    return vreinterpretq_m128i_u16(u16x8);
}
// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers,
// and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32
FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
{
    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
    return vreinterpretq_m128i_u32(u32x4);
}
// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed
// 64-bit integers, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64
FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
{
    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
    return vreinterpretq_m128i_u64(u64x2);
}
// Conditionally multiply the packed double-precision (64-bit) floating-point
// elements in a and b using the high 4 bits in imm8, sum the two products, and
// conditionally store the sum in dst using the low 4 bits of imm8.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd
FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
{
    // Generate mask value from constant immediate bit value
    const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
    const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
#if !SSE2NEON_PRECISE_DP
    const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
    const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
#endif
    // Conditional multiplication
#if !SSE2NEON_PRECISE_DP
    __m128d mul = _mm_mul_pd(a, b);
    const __m128d mulMask =
        _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
    __m128d tmp = _mm_and_pd(mul, mulMask);
#else
#if defined(__aarch64__) || defined(_M_ARM64)
    double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
                             : 0;
    double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
                             : 0;
#else
    double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
    double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
#endif
    __m128d tmp = _mm_set_pd(d1, d0);
#endif
    // Sum the products
#if defined(__aarch64__) || defined(_M_ARM64)
    double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
#else
    double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
#endif
    // Conditionally store the sum
    const __m128d sumMask =
        _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
    __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
    return res;
}
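// Usage sketch for _mm_dp_pd (illustrative): with imm8 = 0x31, bits 4-5 select
// both products and bit 0 stores the sum only in the low lane:
//   __m128d r = _mm_dp_pd(a, b, 0x31); // r = { a0*b0 + a1*b1, 0.0 }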
// Conditionally multiply the packed single-precision (32-bit) floating-point
// elements in a and b using the high 4 bits in imm8, sum the four products,
// and conditionally store the sum in dst using the low 4 bits of imm.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps
FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
{
    float32x4_t elementwise_prod = _mm_mul_ps(a, b);
#if defined(__aarch64__) || defined(_M_ARM64)
    /* shortcuts */
    if (imm == 0xFF) {
        return _mm_set1_ps(vaddvq_f32(elementwise_prod));
    }
    if ((imm & 0x0F) == 0x0F) {
        if (!(imm & (1 << 4)))
            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 0);
        if (!(imm & (1 << 5)))
            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 1);
        if (!(imm & (1 << 6)))
            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 2);
        if (!(imm & (1 << 7)))
            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 3);
        return _mm_set1_ps(vaddvq_f32(elementwise_prod));
    }
#endif
    float s = 0.0f;
    if (imm & (1 << 4))
        s += vgetq_lane_f32(elementwise_prod, 0);
    if (imm & (1 << 5))
        s += vgetq_lane_f32(elementwise_prod, 1);
    if (imm & (1 << 6))
        s += vgetq_lane_f32(elementwise_prod, 2);
    if (imm & (1 << 7))
        s += vgetq_lane_f32(elementwise_prod, 3);
    const float32_t res[4] = {
        (imm & 0x1) ? s : 0.0f,
        (imm & 0x2) ? s : 0.0f,
        (imm & 0x4) ? s : 0.0f,
        (imm & 0x8) ? s : 0.0f,
    };
    return vreinterpretq_m128_f32(vld1q_f32(res));
}
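// Usage sketch for _mm_dp_ps (illustrative): imm8 = 0x71 sums the products of
// lanes 0-2 and writes the result only to lane 0, a common 3-component dot:
//   __m128 dot = _mm_dp_ps(a, b, 0x71); // lane 0 = a0*b0 + a1*b1 + a2*b2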
// Extract a 32-bit integer from a, selected with imm8, and store the result in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32
// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
#define _mm_extract_epi32(a, imm) \
    vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
// Extract a 64-bit integer from a, selected with imm8, and store the result in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64
// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
#define _mm_extract_epi64(a, imm) \
    vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
// Extract an 8-bit integer from a, selected with imm8, and store the result in
// the lower element of dst.
// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8
#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
// Extract the selected single-precision (32-bit) floating-point element from a
// as an integer bit pattern.
// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
// Round the packed double-precision (64-bit) floating-point elements in a down
// to an integer value, and store the results as packed double-precision
// floating-point elements in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
#else
    double *f = (double *) &a;
    return _mm_set_pd(floor(f[1]), floor(f[0]));
#endif
}
// Round the packed single-precision (32-bit) floating-point elements in a down
// to an integer value, and store the results as packed single-precision
// floating-point elements in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps
FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
{
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
    defined(__ARM_FEATURE_DIRECTED_ROUNDING)
    return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
#else
    float *f = (float *) &a;
    return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
#endif
}
// Round the lower double-precision (64-bit) floating-point element in b down to
// an integer value, store the result as a double-precision floating-point
// element in the lower element of dst, and copy the upper element from a to the
// upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd
FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_floor_pd(b));
}
// Round the lower single-precision (32-bit) floating-point element in b down to
// an integer value, store the result as a single-precision floating-point
// element in the lower element of dst, and copy the upper 3 packed elements
// from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss
FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_floor_ps(b));
}
// Copy a to dst, and insert the 32-bit integer i into dst at the location
// specified by imm8.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32
// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
//                                       __constrange(0,4) int imm)
#define _mm_insert_epi32(a, b, imm) \
    vreinterpretq_m128i_s32(        \
        vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm)))
// Copy a to dst, and insert the 64-bit integer i into dst at the location
// specified by imm8.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64
// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
//                                       __constrange(0,2) int imm)
#define _mm_insert_epi64(a, b, imm) \
    vreinterpretq_m128i_s64(        \
        vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm)))
// Copy a to dst, and insert the lower 8-bit integer from i into dst at the
// location specified by imm8.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8
// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
//                                      __constrange(0,16) int imm)
#define _mm_insert_epi8(a, b, imm) \
    vreinterpretq_m128i_s8(vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm)))
// Copy a to tmp, then insert a single-precision (32-bit) floating-point
// element from b into tmp using the control in imm8. Store tmp to dst using
// the mask in imm8 (elements are zeroed out when the corresponding bit is set).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps
#define _mm_insert_ps(a, b, imm8)                                            \
    _sse2neon_define2(                                                       \
        __m128, a, b,                                                        \
        float32x4_t tmp1 =                                                   \
            vsetq_lane_f32(vgetq_lane_f32(_b, (imm8 >> 6) & 0x3),            \
                           vreinterpretq_f32_m128(_a), 0);                   \
        float32x4_t tmp2 =                                                   \
            vsetq_lane_f32(vgetq_lane_f32(tmp1, 0),                          \
                           vreinterpretq_f32_m128(_a), ((imm8 >> 4) & 0x3)); \
        const uint32_t data[4] =                                             \
            _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0,             \
                           ((imm8) & (1 << 1)) ? UINT32_MAX : 0,             \
                           ((imm8) & (1 << 2)) ? UINT32_MAX : 0,             \
                           ((imm8) & (1 << 3)) ? UINT32_MAX : 0);            \
        uint32x4_t mask = vld1q_u32(data);                                   \
        float32x4_t all_zeros = vdupq_n_f32(0);                              \
                                                                             \
        _sse2neon_return(vreinterpretq_m128_f32(                             \
            vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))));)
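// Usage sketch for _mm_insert_ps (illustrative): imm8[7:6] selects the source
// lane of b, imm8[5:4] the destination lane in a, and imm8[3:0] zeroes result
// lanes, e.g.
//   __m128 r = _mm_insert_ps(a, b, 0x30); // lane 3 of r = lane 0 of b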
// Compare packed signed 32-bit integers in a and b, and store packed maximum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32
FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}
// Compare packed signed 8-bit integers in a and b, and store packed maximum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8
FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}
// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16
FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
}
// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u32(
        vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
}
// Compare packed signed 32-bit integers in a and b, and store packed minimum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32
FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}
// Compare packed signed 8-bit integers in a and b, and store packed minimum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8
FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}
// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16
FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
}
// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32
FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u32(
        vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
}
// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
// in a, store the minimum and index in dst, and zero the remaining bits in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16
FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
{
    __m128i dst;
    uint16_t min, idx = 0;
#if defined(__aarch64__) || defined(_M_ARM64)
    // Find the minimum value
    min = vminvq_u16(vreinterpretq_u16_m128i(a));
    // Get the index of the minimum value
    static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};
    uint16x8_t minv = vdupq_n_u16(min);
    uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a));
    idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));
#else
    // Find the minimum value
    __m64 tmp;
    tmp = vreinterpret_m64_u16(
        vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
                 vget_high_u16(vreinterpretq_u16_m128i(a))));
    tmp = vreinterpret_m64_u16(
        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
    tmp = vreinterpret_m64_u16(
        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
    min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
    // Get the index of the minimum value
    int i;
    for (i = 0; i < 8; i++) {
        if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
            idx = (uint16_t) i;
            break;
        }
        a = _mm_srli_si128(a, 2);
    }
#endif
    // Generate result
    dst = _mm_setzero_si128();
    dst = vreinterpretq_m128i_u16(
        vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
    dst = vreinterpretq_m128i_u16(
        vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
    return dst;
}
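// Usage sketch for _mm_minpos_epu16 (illustrative):
//   __m128i v = _mm_setr_epi16(7, 5, 9, 5, 8, 6, 4, 4);
//   __m128i r = _mm_minpos_epu16(v); // lane 0 = 4 (minimum), lane 1 = 6 (index)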
// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
// 8-bit integers in a compared to those in b, and store the 16-bit results in
// dst. Eight SADs are performed using one quadruplet from b and eight
// quadruplets from a. One quadruplet is selected from b starting at the
// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
// integers selected from a starting at the offset specified in imm8.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8
FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
{
    uint8x16_t _a, _b;
    switch (imm & 0x4) {
    case 0:
        // do nothing
        _a = vreinterpretq_u8_m128i(a);
        break;
    case 4:
        _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),
                                            vreinterpretq_u32_m128i(a), 1));
        break;
    default:
#if defined(__GNUC__) || defined(__clang__)
        __builtin_unreachable();
#elif defined(_MSC_VER)
        __assume(0);
#endif
        break;
    }
    switch (imm & 0x3) {
    case 0:
        _b = vreinterpretq_u8_u32(
            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
        break;
    case 1:
        _b = vreinterpretq_u8_u32(
            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
        break;
    case 2:
        _b = vreinterpretq_u8_u32(
            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
        break;
    case 3:
        _b = vreinterpretq_u8_u32(
            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
        break;
    default:
#if defined(__GNUC__) || defined(__clang__)
        __builtin_unreachable();
#elif defined(_MSC_VER)
        __assume(0);
#endif
        break;
    }
    int16x8_t c04, c15, c26, c37;
    uint8x8_t low_b = vget_low_u8(_b);
    c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b));
    uint8x16_t _a_1 = vextq_u8(_a, _a, 1);
    c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b));
    uint8x16_t _a_2 = vextq_u8(_a, _a, 2);
    c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
    uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
    c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
#if defined(__aarch64__) || defined(_M_ARM64)
    // |0|4|2|6|
    c04 = vpaddq_s16(c04, c26);
    // |1|5|3|7|
    c15 = vpaddq_s16(c15, c37);
    int32x4_t trn1_c =
        vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
    int32x4_t trn2_c =
        vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
    return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),
                                              vreinterpretq_s16_s32(trn2_c)));
#else
    int16x4_t c01, c23, c45, c67;
    c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
    c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
    c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
    c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
    return vreinterpretq_m128i_s16(
        vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
#endif
}
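// Usage sketch for _mm_mpsadbw_epu8 (illustrative): with imm8 = 0, the four
// bytes b[0..3] are compared against the eight overlapping quadruplets
// a[0..3], a[1..4], ..., a[7..10], giving eight 16-bit SAD results:
//   __m128i sad = _mm_mpsadbw_epu8(a, b, 0);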
// Multiply the low signed 32-bit integers from each packed 64-bit element in
// a and b, and store the signed 64-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32
FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
{
    // vmull_s32 upcasts instead of masking, so we downcast.
    int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
    int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
    return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
}
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit
// integers, and store the low 32 bits of the intermediate integers in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32
FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}
// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
// using unsigned saturation, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32
FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
                     vqmovun_s32(vreinterpretq_s32_m128i(b))));
}
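// Usage sketch for _mm_packus_epi32 (illustrative): each 32-bit value is
// clamped to [0, 65535] before packing, e.g.
//   __m128i v = _mm_setr_epi32(-1, 70000, 5, 65535);
//   __m128i p = _mm_packus_epi32(v, v); // 16-bit lanes: 0, 65535, 5, 65535, ...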
// Round the packed double-precision (64-bit) floating-point elements in a using
// the rounding parameter, and store the results as packed double-precision
// floating-point elements in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    switch (rounding) {
    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
        return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
        return _mm_floor_pd(a);
    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
        return _mm_ceil_pd(a);
    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
        return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
    default:  //_MM_FROUND_CUR_DIRECTION
        return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
    }
#else
    double *v_double = (double *) &a;
    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
        (rounding == _MM_FROUND_CUR_DIRECTION &&
         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
        double res[2], tmp;
        for (int i = 0; i < 2; i++) {
            tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
            double roundDown = floor(tmp);  // Round down value
            double roundUp = ceil(tmp);     // Round up value
            double diffDown = tmp - roundDown;
            double diffUp = roundUp - tmp;
            if (diffDown < diffUp) {
                /* If it's closer to the round down value, then use it */
                res[i] = roundDown;
            } else if (diffDown > diffUp) {
                /* If it's closer to the round up value, then use it */
                res[i] = roundUp;
            } else {
                /* If it's equidistant between round up and round down value,
                 * pick the one which is an even number */
                double half = roundDown / 2;
                if (half != floor(half)) {
                    /* If the round down value is odd, return the round up
                     * value */
                    res[i] = roundUp;
                } else {
                    /* If the round down value is even, return it */
                    res[i] = roundDown;
                }
            }
            res[i] = (v_double[i] < 0) ? -res[i] : res[i];
        }
        return _mm_set_pd(res[1], res[0]);
    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
               (rounding == _MM_FROUND_CUR_DIRECTION &&
                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
        return _mm_floor_pd(a);
    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
               (rounding == _MM_FROUND_CUR_DIRECTION &&
                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
        return _mm_ceil_pd(a);
    }
    return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
                      v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
#endif
}
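// Usage sketch for _mm_round_pd (illustrative): rounding to nearest resolves
// halfway cases to the even integer, e.g.
//   __m128d v = _mm_set_pd(3.5, 2.5);
//   __m128d r = _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
//   // lanes (0..1): 2.0, 4.0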
// Round the packed single-precision (32-bit) floating-point elements in a using
// the rounding parameter, and store the results as packed single-precision
// floating-point elements in dst.
// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
{
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
    defined(__ARM_FEATURE_DIRECTED_ROUNDING)
    switch (rounding) {
    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
        return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
        return _mm_floor_ps(a);
    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
        return _mm_ceil_ps(a);
    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
        return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
    default:  //_MM_FROUND_CUR_DIRECTION
        return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
    }
#else
    float *v_float = (float *) &a;
    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
        (rounding == _MM_FROUND_CUR_DIRECTION &&
         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
        uint32x4_t signmask = vdupq_n_u32(0x80000000);
        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5] */
        int32x4_t r_trunc = vcvtq_s32_f32(
            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
        float32x4_t delta = vsubq_f32(
            vreinterpretq_f32_m128(a),
            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
        uint32x4_t is_delta_half =
            vceqq_f32(delta, half); /* delta == +/- 0.5 */
        return vreinterpretq_m128_f32(
            vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
               (rounding == _MM_FROUND_CUR_DIRECTION &&
                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
        return _mm_floor_ps(a);
    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
               (rounding == _MM_FROUND_CUR_DIRECTION &&
                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
        return _mm_ceil_ps(a);
    }
    return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
                      v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
                      v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
                      v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
#endif
}
// Round the lower double-precision (64-bit) floating-point element in b using
// the rounding parameter, store the result as a double-precision floating-point
// element in the lower element of dst, and copy the upper element from a to the
// upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd
FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
{
    return _mm_move_sd(a, _mm_round_pd(b, rounding));
}
// Round the lower single-precision (32-bit) floating-point element in b using
// the rounding parameter, store the result as a single-precision floating-point
// element in the lower element of dst, and copy the upper 3 packed elements
// from a to the upper elements of dst. Rounding is done according to the
// rounding[3:0] parameter, which can be one of:
//     (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and
//                                                     // suppress exceptions
//     (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and
//                                                     // suppress exceptions
//     (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and
//                                                     // suppress exceptions
//     (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and
//                                                     // suppress exceptions
//     _MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see
//                                                     // _MM_SET_ROUNDING_MODE
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss
FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
{
    return _mm_move_ss(a, _mm_round_ps(b, rounding));
}
// Load 128-bits of integer data from memory into dst using a non-temporal
// memory hint. mem_addr must be aligned on a 16-byte boundary or a
// general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128
FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
{
#if __has_builtin(__builtin_nontemporal_store)
    return __builtin_nontemporal_load(p);
#else
    return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
#endif
}
// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
// all 1's, and return 1 if the result is zero, otherwise return 0.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones
FORCE_INLINE int _mm_test_all_ones(__m128i a)
{
    return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
           ~(uint64_t) 0;
}
// Compute the bitwise AND of 128 bits (representing integer data) in a and
// mask, and return 1 if the result is zero, otherwise return 0.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros
FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
{
    int64x2_t a_and_mask =
        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
    return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
}
// Compute the bitwise AND of 128 bits (representing integer data) in a and
// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
// otherwise return 0.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero
// Note: Argument names may be wrong in the Intel intrinsics guide.
FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
{
    uint64x2_t v = vreinterpretq_u64_m128i(a);
    uint64x2_t m = vreinterpretq_u64_m128i(mask);
    // find ones (set-bits) and zeros (clear-bits) under clip mask
    uint64x2_t ones = vandq_u64(m, v);
    uint64x2_t zeros = vbicq_u64(m, v);
    // If both 128-bit variables are populated (non-zero) then return 1.
    // For comparison purposes, first compact each var down to 32-bits.
    uint32x2_t reduced = vpmax_u32(vqmovn_u64(ones), vqmovn_u64(zeros));
    // if folding minimum is non-zero then both vars must be non-zero
    return (vget_lane_u32(vpmin_u32(reduced, reduced), 0) != 0);
}
// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
// otherwise set CF to 0. Return the CF value.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
{
    int64x2_t s64 =
        vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a));
    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
}
// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
// otherwise return 0.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128
#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
// otherwise set CF to 0. Return the ZF value.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
{
    int64x2_t s64 =
        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
}
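// Usage sketch (illustrative): _mm_testz_si128 returns 1 when a & b has no bits
// set, and _mm_testc_si128 returns 1 when every bit set in b is also set in a:
//   int disjoint = _mm_testz_si128(a, b); // (a & b) == 0
//   int subset = _mm_testc_si128(a, b);   // (~a & b) == 0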
/* SSE4.2 */
static const uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = {
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
};
static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = {
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
};
/* specify the source data format */
#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */
#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */
#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */
#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */
/* specify the comparison operation */
#define _SIDD_CMP_EQUAL_ANY 0x00     /* compare equal any: strchr */
#define _SIDD_CMP_RANGES 0x04        /* compare ranges */
#define _SIDD_CMP_EQUAL_EACH 0x08    /* compare equal each: strcmp */
#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */
/* specify the polarity */
#define _SIDD_POSITIVE_POLARITY 0x00
#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */
#define _SIDD_MASKED_NEGATIVE_POLARITY \
    0x30 /* negate results only before end of string */
/* specify the output selection in _mm_cmpXstri */
#define _SIDD_LEAST_SIGNIFICANT 0x00
#define _SIDD_MOST_SIGNIFICANT 0x40
/* specify the output selection in _mm_cmpXstrm */
#define _SIDD_BIT_MASK 0x00
#define _SIDD_UNIT_MASK 0x40
/* Pattern Matching for C macros.
 * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms
 */
/* catenate */
#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__
#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b)
#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c)
/* run the 2nd parameter */
#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__
/* run the 1st parameter */
#define SSE2NEON_IIF_1(t, ...) t
#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b)
#define SSE2NEON_COMPL_0 1
#define SSE2NEON_COMPL_1 0
#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x)
#define SSE2NEON_DEC_1 0
#define SSE2NEON_DEC_2 1
#define SSE2NEON_DEC_3 2
#define SSE2NEON_DEC_4 3
#define SSE2NEON_DEC_5 4
#define SSE2NEON_DEC_6 5
#define SSE2NEON_DEC_7 6
#define SSE2NEON_DEC_8 7
#define SSE2NEON_DEC_9 8
#define SSE2NEON_DEC_10 9
#define SSE2NEON_DEC_11 10
#define SSE2NEON_DEC_12 11
#define SSE2NEON_DEC_13 12
#define SSE2NEON_DEC_14 13
#define SSE2NEON_DEC_15 14
#define SSE2NEON_DEC_16 15
/* detection */
#define SSE2NEON_CHECK_N(x, n, ...) n
#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, )
#define SSE2NEON_PROBE(x) x, 1,
#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x))
#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~)
#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x))
#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c))
#define SSE2NEON_EAT(...)
#define SSE2NEON_EXPAND(...) __VA_ARGS__
#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT)
/* recursion */
/* deferred expression */
#define SSE2NEON_EMPTY()
#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY()
#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)()
#define SSE2NEON_EXPAND(...) __VA_ARGS__
#define SSE2NEON_EVAL(...) \
    SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__)))
#define SSE2NEON_EVAL1(...) \
    SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__)))
#define SSE2NEON_EVAL2(...) \
    SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__)))
#define SSE2NEON_EVAL3(...) __VA_ARGS__
#define SSE2NEON_REPEAT(count, macro, ...)                         \
    SSE2NEON_WHEN(count)                                           \
    (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()(                \
        SSE2NEON_DEC(count), macro,                                \
        __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \
                                              __VA_ARGS__))
#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT
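// Illustrative note: SSE2NEON_EVAL(SSE2NEON_REPEAT(3, M, t)) expands to
// M(0, t) M(1, t) M(2, t); the PCMPSTR helpers below use this to stamp out one
// comparison per lane without writing each lane by hand.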
#define SSE2NEON_SIZE_OF_byte 8
#define SSE2NEON_NUMBER_OF_LANES_byte 16
#define SSE2NEON_SIZE_OF_word 16
#define SSE2NEON_NUMBER_OF_LANES_word 8
#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type)                          \
    mtx[i] = vreinterpretq_m128i_##type(vceqq_##type(                          \
        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \
        vreinterpretq_##type##_m128i(a)));
#define SSE2NEON_FILL_LANE(i, type) \
    vec_b[i] =                      \
        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i));
#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size,        \
                       number_of_lanes, byte_or_word)                         \
    do {                                                                      \
        SSE2NEON_CAT(                                                         \
            data_type_prefix,                                                 \
            SSE2NEON_CAT(size,                                                \
                         SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \
        vec_b[number_of_lanes];                                               \
        __m128i mask = SSE2NEON_IIF(byte_or_word)(                            \
            vreinterpretq_m128i_u16(vdupq_n_u16(0xff)),                       \
            vreinterpretq_m128i_u32(vdupq_n_u32(0xffff)));                    \
        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE,    \
                                      SSE2NEON_CAT(type_prefix, size)))       \
        for (int i = 0; i < number_of_lanes; i++) {                           \
            mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u,                      \
                                  size)(SSE2NEON_CAT(vbslq_u, size)(          \
                SSE2NEON_CAT(vreinterpretq_u,                                 \
                             SSE2NEON_CAT(size, _m128i))(mask),               \
                SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))(        \
                    vec_b[i],                                                 \
                    SSE2NEON_CAT(                                             \
                        vreinterpretq_,                                       \
                        SSE2NEON_CAT(type_prefix,                             \
                                     SSE2NEON_CAT(size, _m128i(a))))),        \
                SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))(        \
                    vec_b[i],                                                 \
                    SSE2NEON_CAT(                                             \
                        vreinterpretq_,                                       \
                        SSE2NEON_CAT(type_prefix,                             \
                                     SSE2NEON_CAT(size, _m128i(a)))))));      \
        }                                                                     \
    } while (0)
#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes)                         \
    do {                                                                     \
        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes,                       \
                                      SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \
                                      SSE2NEON_CAT(u, size)))                \
    } while (0)
#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type)                                     \
    static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \
                                                int lb)                       \
    {                                                                         \
        __m128i mtx[16];                                                      \
        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),          \
                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));            \
        return SSE2NEON_CAT(                                                  \
            _sse2neon_aggregate_equal_any_,                                   \
            SSE2NEON_CAT(                                                     \
                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                        \
                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,       \
                                             type))))(la, lb, mtx);           \
    }
#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word)            \
    static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \
                                                 int lb)                       \
    {                                                                          \
        __m128i mtx[16];                                                       \
        PCMPSTR_RANGES(                                                        \
            a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),   \
            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word);      \
        return SSE2NEON_CAT(                                                   \
            _sse2neon_aggregate_ranges_,                                       \
            SSE2NEON_CAT(                                                      \
                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                         \
                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,        \
                                             type))))(la, lb, mtx);            \
    }
#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type)                                   \
    static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la,          \
                                                    __m128i b, int lb)          \
    {                                                                           \
        __m128i mtx[16];                                                        \
        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),            \
                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));              \
        return SSE2NEON_CAT(                                                    \
            _sse2neon_aggregate_equal_ordered_,                                 \
            SSE2NEON_CAT(                                                       \
                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                          \
                SSE2NEON_CAT(x,                                                 \
                             SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))(  \
            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx);        \
    }
  7133. static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
  7134. {
  7135. int res = 0;
  7136. int m = (1 << la) - 1;
  7137. uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
  7138. uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
  7139. uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
  7140. uint8x16_t vec = vcombine_u8(t_lo, t_hi);
  7141. for (int j = 0; j < lb; j++) {
  7142. mtx[j] = vreinterpretq_m128i_u8(
  7143. vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
  7144. mtx[j] = vreinterpretq_m128i_u8(
  7145. vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
  7146. int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
  7147. res |= (tmp << j);
  7148. }
  7149. return res;
  7150. }
  7151. static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
  7152. {
  7153. int res = 0;
  7154. int m = (1 << la) - 1;
  7155. uint16x8_t vec =
  7156. vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
  7157. for (int j = 0; j < lb; j++) {
  7158. mtx[j] = vreinterpretq_m128i_u16(
  7159. vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
  7160. mtx[j] = vreinterpretq_m128i_u16(
  7161. vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
  7162. int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
  7163. res |= (tmp << j);
  7164. }
  7165. return res;
  7166. }
  7167. /* clang-format off */
  7168. #define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \
  7169. prefix##IMPL(byte) \
  7170. prefix##IMPL(word)
  7171. /* clang-format on */
  7172. SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)
  7173. static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
  7174. {
  7175. int res = 0;
  7176. int m = (1 << la) - 1;
  7177. uint16x8_t vec =
  7178. vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
  7179. for (int j = 0; j < lb; j++) {
  7180. mtx[j] = vreinterpretq_m128i_u16(
  7181. vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
  7182. mtx[j] = vreinterpretq_m128i_u16(
  7183. vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
  7184. __m128i tmp = vreinterpretq_m128i_u32(
  7185. vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));
  7186. uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
  7187. vreinterpretq_u32_m128i(tmp));
  7188. #if defined(__aarch64__) || defined(_M_ARM64)
  7189. int t = vaddvq_u32(vec_res) ? 1 : 0;
  7190. #else
  7191. uint64x2_t sumh = vpaddlq_u32(vec_res);
int t = (vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1)) ? 1 : 0;
  7193. #endif
  7194. res |= (t << j);
  7195. }
  7196. return res;
  7197. }
  7198. static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
  7199. {
  7200. int res = 0;
  7201. int m = (1 << la) - 1;
  7202. uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
  7203. uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
  7204. uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
  7205. uint8x16_t vec = vcombine_u8(t_lo, t_hi);
  7206. for (int j = 0; j < lb; j++) {
  7207. mtx[j] = vreinterpretq_m128i_u8(
  7208. vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
  7209. mtx[j] = vreinterpretq_m128i_u8(
  7210. vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
  7211. __m128i tmp = vreinterpretq_m128i_u16(
  7212. vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));
  7213. uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),
  7214. vreinterpretq_u16_m128i(tmp));
  7215. int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
  7216. res |= (t << j);
  7217. }
  7218. return res;
  7219. }
  7220. #define SSE2NEON_CMP_RANGES_IS_BYTE 1
  7221. #define SSE2NEON_CMP_RANGES_IS_WORD 0
  7222. /* clang-format off */
  7223. #define SSE2NEON_GENERATE_CMP_RANGES(prefix) \
  7224. prefix##IMPL(byte, uint, u, prefix##IS_BYTE) \
  7225. prefix##IMPL(byte, int, s, prefix##IS_BYTE) \
  7226. prefix##IMPL(word, uint, u, prefix##IS_WORD) \
  7227. prefix##IMPL(word, int, s, prefix##IS_WORD)
  7228. /* clang-format on */
  7229. SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)
  7230. #undef SSE2NEON_CMP_RANGES_IS_BYTE
  7231. #undef SSE2NEON_CMP_RANGES_IS_WORD
  7232. static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
  7233. {
  7234. uint8x16_t mtx =
  7235. vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
  7236. int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
  7237. int m1 = 0x10000 - (1 << la);
  7238. int tb = 0x10000 - (1 << lb);
  7239. uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
  7240. uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
  7241. vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
  7242. vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);
  7243. vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);
  7244. vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);
  7245. vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);
  7246. tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);
  7247. tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);
  7248. res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
  7249. res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
  7250. res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo);
  7251. res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi);
  7252. res_lo = vand_u8(res_lo, vec_mask);
  7253. res_hi = vand_u8(res_hi, vec_mask);
  7254. int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8);
  7255. return res;
  7256. }
  7257. static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
  7258. {
  7259. uint16x8_t mtx =
  7260. vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
  7261. int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
  7262. int m1 = 0x100 - (1 << la);
  7263. int tb = 0x100 - (1 << lb);
  7264. uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);
  7265. uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);
  7266. uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);
  7267. uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask);
  7268. mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx);
  7269. mtx = vbslq_u16(vec1, tmp, mtx);
  7270. mtx = vandq_u16(mtx, vec_mask);
  7271. return _sse2neon_vaddvq_u16(mtx);
  7272. }
  7273. #define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1
  7274. #define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0
  7275. #define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type) \
  7276. static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes( \
  7277. int bound, int la, int lb, __m128i mtx[16]) \
  7278. { \
  7279. int res = 0; \
  7280. int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la); \
  7281. uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)( \
  7282. vld1_u##size(_sse2neon_cmpestr_mask##size##b), \
  7283. vld1q_u##size(_sse2neon_cmpestr_mask##size##b)); \
  7284. uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)( \
  7285. vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask), \
  7286. vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \
  7287. vtstq_u##size(vdupq_n_u##size(m1), vec_mask)); \
  7288. uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
  7289. uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0); \
  7290. for (int j = 0; j < lb; j++) { \
  7291. mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size( \
  7292. vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j]))); \
  7293. } \
  7294. for (int j = lb; j < bound; j++) { \
  7295. mtx[j] = vreinterpretq_m128i_u##size( \
  7296. vbslq_u##size(vec1, vec_minusone, vec_zero)); \
  7297. } \
  7298. unsigned SSE2NEON_IIF(data_type)(char, short) *ptr = \
  7299. (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx; \
  7300. for (int i = 0; i < bound; i++) { \
  7301. int val = 1; \
  7302. for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) \
  7303. val &= ptr[k * bound + j]; \
  7304. res += val << i; \
  7305. } \
  7306. return res; \
  7307. }
  7308. /* clang-format off */
  7309. #define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \
  7310. prefix##IMPL(8, 16, prefix##IS_UBYTE) \
  7311. prefix##IMPL(16, 8, prefix##IS_UWORD)
  7312. /* clang-format on */
  7313. SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_)
  7314. #undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE
  7315. #undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD
  7316. /* clang-format off */
  7317. #define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \
  7318. prefix##IMPL(byte) \
  7319. prefix##IMPL(word)
  7320. /* clang-format on */
  7321. SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_)
  7322. #define SSE2NEON_CMPESTR_LIST \
  7323. _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \
  7324. _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \
  7325. _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \
  7326. _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \
  7327. _(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \
  7328. _(CMP_UWORD_RANGES, cmp_uword_ranges) \
  7329. _(CMP_SBYTE_RANGES, cmp_sbyte_ranges) \
  7330. _(CMP_SWORD_RANGES, cmp_sword_ranges) \
  7331. _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \
  7332. _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \
  7333. _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \
  7334. _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \
  7335. _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
  7336. _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \
  7337. _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
  7338. _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered)
  7339. enum {
  7340. #define _(name, func_suffix) name,
  7341. SSE2NEON_CMPESTR_LIST
  7342. #undef _
  7343. };
  7344. typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
  7345. static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
  7346. #define _(name, func_suffix) _sse2neon_##func_suffix,
  7347. SSE2NEON_CMPESTR_LIST
  7348. #undef _
  7349. };
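// The list order mirrors imm8[3:0] of the PCMPxSTRx control byte: bit 0
// selects word vs. byte elements, bit 1 signed vs. unsigned, and bits 3:2 the
// aggregation operation (equal any, ranges, equal each, equal ordered), so the
// dispatch is simply _sse2neon_cmpfunc_table[imm8 & 0x0f].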
  7350. FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
  7351. {
  7352. switch (imm8 & 0x30) {
  7353. case _SIDD_NEGATIVE_POLARITY:
  7354. res ^= 0xffffffff;
  7355. break;
  7356. case _SIDD_MASKED_NEGATIVE_POLARITY:
  7357. res ^= (1 << lb) - 1;
  7358. break;
  7359. default:
  7360. break;
  7361. }
  7362. return res & ((bound == 8) ? 0xFF : 0xFFFF);
  7363. }
  7364. FORCE_INLINE int _sse2neon_clz(unsigned int x)
  7365. {
  7366. #ifdef _MSC_VER
  7367. unsigned long cnt = 0;
  7368. if (_BitScanReverse(&cnt, x))
  7369. return 31 - cnt;
  7370. return 32;
  7371. #else
  7372. return x != 0 ? __builtin_clz(x) : 32;
  7373. #endif
  7374. }
  7375. FORCE_INLINE int _sse2neon_ctz(unsigned int x)
  7376. {
  7377. #ifdef _MSC_VER
  7378. unsigned long cnt = 0;
  7379. if (_BitScanForward(&cnt, x))
  7380. return cnt;
  7381. return 32;
  7382. #else
  7383. return x != 0 ? __builtin_ctz(x) : 32;
  7384. #endif
  7385. }
  7386. FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
  7387. {
  7388. #ifdef _MSC_VER
  7389. unsigned long cnt;
  7390. #if defined(SSE2NEON_HAS_BITSCAN64)
  7391. if (_BitScanForward64(&cnt, x))
  7392. return (int) (cnt);
  7393. #else
  7394. if (_BitScanForward(&cnt, (unsigned long) (x)))
  7395. return (int) cnt;
  7396. if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
  7397. return (int) (cnt + 32);
  7398. #endif /* SSE2NEON_HAS_BITSCAN64 */
  7399. return 64;
  7400. #else /* assume GNU compatible compilers */
  7401. return x != 0 ? __builtin_ctzll(x) : 64;
  7402. #endif
  7403. }
  7404. #define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y)
  7405. #define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
  7406. const int var = (imm & 0x01) ? 8 : 16
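// The xor/subtract pairs below compute |la| and |lb| without branching, then
// clamp both lengths to the lane count of the selected element width.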
  7407. #define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
  7408. int tmp1 = la ^ (la >> 31); \
  7409. la = tmp1 - (la >> 31); \
  7410. int tmp2 = lb ^ (lb >> 31); \
  7411. lb = tmp2 - (lb >> 31); \
  7412. la = SSE2NEON_MIN(la, bound); \
  7413. lb = SSE2NEON_MIN(lb, bound)
// Compare all pairs of characters in strings a and b,
// then aggregate the result.
// As the only difference between PCMPESTR* and PCMPISTR* is the way the string
// lengths are obtained, we use SSE2NEON_CMP{I,E}STRX_LEN_PAIR to get the
// lengths of strings a and b.
  7419. #define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE) \
  7420. SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); \
  7421. SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb); \
  7422. int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \
  7423. r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)
  7424. #define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \
  7425. return (r2 == 0) ? bound \
  7426. : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \
  7427. : _sse2neon_ctz(r2))
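// imm8 bit 6 (_SIDD_MOST_SIGNIFICANT) requests the index of the highest set
// bit of the result mask; otherwise the lowest set bit is returned, and
// `bound` when the mask is empty.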
  7428. #define SSE2NEON_CMPSTR_GENERATE_MASK(dst) \
  7429. __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
  7430. if (imm8 & 0x40) { \
  7431. if (bound == 8) { \
  7432. uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2), \
  7433. vld1q_u16(_sse2neon_cmpestr_mask16b)); \
  7434. dst = vreinterpretq_m128i_u16(vbslq_u16( \
  7435. tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst))); \
  7436. } else { \
  7437. uint8x16_t vec_r2 = \
  7438. vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8)); \
  7439. uint8x16_t tmp = \
  7440. vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b)); \
  7441. dst = vreinterpretq_m128i_u8( \
  7442. vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst))); \
  7443. } \
  7444. } else { \
  7445. if (bound == 16) { \
  7446. dst = vreinterpretq_m128i_u16( \
  7447. vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \
  7448. } else { \
  7449. dst = vreinterpretq_m128i_u8( \
  7450. vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0)); \
  7451. } \
  7452. } \
  7453. return dst
  7454. // Compare packed strings in a and b with lengths la and lb using the control
  7455. // in imm8, and returns 1 if b did not contain a null character and the
  7456. // resulting mask was zero, and 0 otherwise.
  7457. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra
  7458. FORCE_INLINE int _mm_cmpestra(__m128i a,
  7459. int la,
  7460. __m128i b,
  7461. int lb,
  7462. const int imm8)
  7463. {
  7464. int lb_cpy = lb;
  7465. SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
  7466. return !r2 & (lb_cpy > bound);
  7467. }
  7468. // Compare packed strings in a and b with lengths la and lb using the control in
  7469. // imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
  7470. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc
  7471. FORCE_INLINE int _mm_cmpestrc(__m128i a,
  7472. int la,
  7473. __m128i b,
  7474. int lb,
  7475. const int imm8)
  7476. {
  7477. SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
  7478. return r2 != 0;
  7479. }
  7480. // Compare packed strings in a and b with lengths la and lb using the control
  7481. // in imm8, and store the generated index in dst.
  7482. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri
  7483. FORCE_INLINE int _mm_cmpestri(__m128i a,
  7484. int la,
  7485. __m128i b,
  7486. int lb,
  7487. const int imm8)
  7488. {
  7489. SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
  7490. SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
  7491. }
  7492. // Compare packed strings in a and b with lengths la and lb using the control
  7493. // in imm8, and store the generated mask in dst.
  7494. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm
  7495. FORCE_INLINE __m128i
  7496. _mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
  7497. {
  7498. SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
  7499. SSE2NEON_CMPSTR_GENERATE_MASK(dst);
  7500. }
  7501. // Compare packed strings in a and b with lengths la and lb using the control in
  7502. // imm8, and returns bit 0 of the resulting bit mask.
  7503. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro
  7504. FORCE_INLINE int _mm_cmpestro(__m128i a,
  7505. int la,
  7506. __m128i b,
  7507. int lb,
  7508. const int imm8)
  7509. {
  7510. SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
  7511. return r2 & 1;
  7512. }
  7513. // Compare packed strings in a and b with lengths la and lb using the control in
  7514. // imm8, and returns 1 if any character in a was null, and 0 otherwise.
  7515. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs
  7516. FORCE_INLINE int _mm_cmpestrs(__m128i a,
  7517. int la,
  7518. __m128i b,
  7519. int lb,
  7520. const int imm8)
  7521. {
  7522. (void) a;
  7523. (void) b;
  7524. (void) lb;
  7525. SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
  7526. return la <= (bound - 1);
  7527. }
  7528. // Compare packed strings in a and b with lengths la and lb using the control in
  7529. // imm8, and returns 1 if any character in b was null, and 0 otherwise.
  7530. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz
  7531. FORCE_INLINE int _mm_cmpestrz(__m128i a,
  7532. int la,
  7533. __m128i b,
  7534. int lb,
  7535. const int imm8)
  7536. {
  7537. (void) a;
  7538. (void) b;
  7539. (void) la;
  7540. SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
  7541. return lb <= (bound - 1);
  7542. }
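// SSE2NEON_CMPISTRX_LENGTH computes the implicit string length: it compares
// every lane against zero, narrows the 16-bit comparison results with
// vshrn_n_u16(., 4) so that each lane contributes a nibble (bytes) or a byte
// (words) to a 64-bit mask, and counts trailing zero bits to find the first
// NUL.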
  7543. #define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8) \
  7544. do { \
  7545. if (imm8 & 0x01) { \
  7546. uint16x8_t equal_mask_##str = \
  7547. vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \
  7548. uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \
  7549. uint64_t matches_##str = \
  7550. vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \
  7551. len = _sse2neon_ctzll(matches_##str) >> 3; \
  7552. } else { \
  7553. uint16x8_t equal_mask_##str = vreinterpretq_u16_u8( \
  7554. vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0))); \
  7555. uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \
  7556. uint64_t matches_##str = \
  7557. vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \
  7558. len = _sse2neon_ctzll(matches_##str) >> 2; \
  7559. } \
  7560. } while (0)
  7561. #define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \
  7562. int la, lb; \
  7563. do { \
  7564. SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); \
  7565. SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); \
  7566. } while (0)
  7567. // Compare packed strings with implicit lengths in a and b using the control in
  7568. // imm8, and returns 1 if b did not contain a null character and the resulting
  7569. // mask was zero, and 0 otherwise.
  7570. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra
  7571. FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8)
  7572. {
  7573. SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
  7574. return !r2 & (lb >= bound);
  7575. }
  7576. // Compare packed strings with implicit lengths in a and b using the control in
  7577. // imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
  7578. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc
  7579. FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8)
  7580. {
  7581. SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
  7582. return r2 != 0;
  7583. }
  7584. // Compare packed strings with implicit lengths in a and b using the control in
  7585. // imm8, and store the generated index in dst.
  7586. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri
  7587. FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8)
  7588. {
  7589. SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
  7590. SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
  7591. }
  7592. // Compare packed strings with implicit lengths in a and b using the control in
  7593. // imm8, and store the generated mask in dst.
  7594. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm
  7595. FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8)
  7596. {
  7597. SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
  7598. SSE2NEON_CMPSTR_GENERATE_MASK(dst);
  7599. }
  7600. // Compare packed strings with implicit lengths in a and b using the control in
  7601. // imm8, and returns bit 0 of the resulting bit mask.
  7602. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro
  7603. FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8)
  7604. {
  7605. SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
  7606. return r2 & 1;
  7607. }
  7608. // Compare packed strings with implicit lengths in a and b using the control in
  7609. // imm8, and returns 1 if any character in a was null, and 0 otherwise.
  7610. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs
  7611. FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)
  7612. {
  7613. (void) b;
  7614. SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
  7615. int la;
  7616. SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);
  7617. return la <= (bound - 1);
  7618. }
  7619. // Compare packed strings with implicit lengths in a and b using the control in
  7620. // imm8, and returns 1 if any character in b was null, and 0 otherwise.
  7621. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz
  7622. FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
  7623. {
  7624. (void) a;
  7625. SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
  7626. int lb;
  7627. SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);
  7628. return lb <= (bound - 1);
  7629. }
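// Illustrative usage sketch (not part of the emulated SSE API): scan one
// 16-byte chunk of a NUL-terminated string for the first decimal digit with
// _mm_cmpistri. The helper name and its inputs are hypothetical; the _SIDD_*
// control-byte constants are the standard SSE4.2 ones provided by this header.
FORCE_INLINE int _sse2neon_example_find_first_digit(const char *chunk16)
{
    // Range set: a single pair {'0', '9'}; the remaining lanes stay zero so
    // the implicit length of the set is 2.
    const __m128i digit_range = _mm_setr_epi8('0', '9', 0, 0, 0, 0, 0, 0, 0, 0,
                                              0, 0, 0, 0, 0, 0);
    __m128i text = _mm_loadu_si128((const __m128i *) chunk16);
    // Returns the index of the first digit, or 16 when no digit occurs before
    // the terminating NUL of this chunk.
    return _mm_cmpistri(digit_range, text,
                        _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES |
                            _SIDD_LEAST_SIGNIFICANT);
}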
  7630. // Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
  7631. // in b for greater than.
  7632. FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
  7633. {
  7634. #if defined(__aarch64__) || defined(_M_ARM64)
  7635. return vreinterpretq_m128i_u64(
  7636. vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
  7637. #else
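// Without a 64-bit compare on ARMv7-A, compute b - a with saturation (so the
// difference cannot wrap) and arithmetic-shift the sign bit across the lane:
// the lane becomes all ones exactly when a > b and all zeros otherwise.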
  7638. return vreinterpretq_m128i_s64(vshrq_n_s64(
  7639. vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
  7640. 63));
  7641. #endif
  7642. }
  7643. // Starting with the initial value in crc, accumulates a CRC32 value for
  7644. // unsigned 16-bit integer v, and stores the result in dst.
  7645. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16
  7646. FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
  7647. {
  7648. #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
  7649. __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
  7650. : [c] "+r"(crc)
  7651. : [v] "r"(v));
  7652. #elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
  7653. (defined(_M_ARM64) && !defined(__clang__))
  7654. crc = __crc32ch(crc, v);
  7655. #else
  7656. crc = _mm_crc32_u8(crc, v & 0xff);
  7657. crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
  7658. #endif
  7659. return crc;
  7660. }
  7661. // Starting with the initial value in crc, accumulates a CRC32 value for
  7662. // unsigned 32-bit integer v, and stores the result in dst.
  7663. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32
  7664. FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
  7665. {
  7666. #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
  7667. __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
  7668. : [c] "+r"(crc)
  7669. : [v] "r"(v));
  7670. #elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
  7671. (defined(_M_ARM64) && !defined(__clang__))
  7672. crc = __crc32cw(crc, v);
  7673. #else
  7674. crc = _mm_crc32_u16(crc, v & 0xffff);
  7675. crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
  7676. #endif
  7677. return crc;
  7678. }
  7679. // Starting with the initial value in crc, accumulates a CRC32 value for
  7680. // unsigned 64-bit integer v, and stores the result in dst.
  7681. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
  7682. FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
  7683. {
  7684. #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
  7685. __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
  7686. : [c] "+r"(crc)
  7687. : [v] "r"(v));
  7688. #elif (defined(_M_ARM64) && !defined(__clang__))
  7689. crc = __crc32cd((uint32_t) crc, v);
  7690. #else
  7691. crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
  7692. crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
  7693. #endif
  7694. return crc;
  7695. }
  7696. // Starting with the initial value in crc, accumulates a CRC32 value for
  7697. // unsigned 8-bit integer v, and stores the result in dst.
  7698. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
  7699. FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
  7700. {
  7701. #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
  7702. __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
  7703. : [c] "+r"(crc)
  7704. : [v] "r"(v));
  7705. #elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
  7706. (defined(_M_ARM64) && !defined(__clang__))
  7707. crc = __crc32cb(crc, v);
  7708. #else
  7709. crc ^= v;
  7710. #if defined(__ARM_FEATURE_CRYPTO)
  7711. // Adapted from: https://mary.rs/lab/crc32/
// Barrett reduction
  7713. uint64x2_t orig =
  7714. vcombine_u64(vcreate_u64((uint64_t) (crc) << 24), vcreate_u64(0x0));
  7715. uint64x2_t tmp = orig;
  7716. // Polynomial P(x) of CRC32C
  7717. uint64_t p = 0x105EC76F1;
  7718. // Barrett Reduction (in bit-reflected form) constant mu_{64} = \lfloor
  7719. // 2^{64} / P(x) \rfloor = 0x11f91caf6
  7720. uint64_t mu = 0x1dea713f1;
  7721. // Multiply by mu_{64}
  7722. tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(mu));
  7723. // Divide by 2^{64} (mask away the unnecessary bits)
  7724. tmp =
  7725. vandq_u64(tmp, vcombine_u64(vcreate_u64(0xFFFFFFFF), vcreate_u64(0x0)));
  7726. // Multiply by P(x) (shifted left by 1 for alignment reasons)
  7727. tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(p));
  7728. // Subtract original from result
  7729. tmp = veorq_u64(tmp, orig);
  7730. // Extract the 'lower' (in bit-reflected sense) 32 bits
  7731. crc = vgetq_lane_u32(vreinterpretq_u32_u64(tmp), 1);
  7732. #else // Fall back to the generic table lookup approach
  7733. // Adapted from: https://create.stephan-brumme.com/crc32/
// Apply the half-byte lookup algorithm for a good trade-off between
// performance and lookup-table size.
// The lookup table only needs to store every 16th entry
// of the standard look-up table.
  7738. static const uint32_t crc32_half_byte_tbl[] = {
  7739. 0x00000000, 0x105ec76f, 0x20bd8ede, 0x30e349b1, 0x417b1dbc, 0x5125dad3,
  7740. 0x61c69362, 0x7198540d, 0x82f63b78, 0x92a8fc17, 0xa24bb5a6, 0xb21572c9,
  7741. 0xc38d26c4, 0xd3d3e1ab, 0xe330a81a, 0xf36e6f75,
  7742. };
  7743. crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F];
  7744. crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F];
  7745. #endif
  7746. #endif
  7747. return crc;
  7748. }
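// Illustrative usage sketch (not part of the emulated SSE API): accumulate a
// CRC-32C checksum over a buffer one byte at a time with _mm_crc32_u8. The
// helper name is hypothetical; production code would also use the wider
// _mm_crc32_u32/_mm_crc32_u64 steps for throughput.
FORCE_INLINE uint32_t _sse2neon_example_crc32c(const void *buf, size_t len)
{
    const uint8_t *p = (const uint8_t *) buf;
    uint32_t crc = 0xFFFFFFFF; /* conventional CRC-32C initial value */
    for (size_t i = 0; i < len; i++)
        crc = _mm_crc32_u8(crc, p[i]);
    return ~crc; /* conventional final inversion */
}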
  7749. /* AES */
  7750. #if !defined(__ARM_FEATURE_CRYPTO) && (!defined(_M_ARM64) || defined(__clang__))
  7751. /* clang-format off */
  7752. #define SSE2NEON_AES_SBOX(w) \
  7753. { \
  7754. w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
  7755. w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
  7756. w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
  7757. w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
  7758. w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
  7759. w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
  7760. w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
  7761. w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
  7762. w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
  7763. w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
  7764. w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
  7765. w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
  7766. w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
  7767. w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
  7768. w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
  7769. w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
  7770. w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
  7771. w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
  7772. w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
  7773. w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
  7774. w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
  7775. w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
  7776. w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
  7777. w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
  7778. w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
  7779. w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
  7780. w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
  7781. w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
  7782. w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
  7783. w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
  7784. w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
  7785. w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
  7786. w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
  7787. w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
  7788. w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
  7789. w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
  7790. w(0xb0), w(0x54), w(0xbb), w(0x16) \
  7791. }
  7792. #define SSE2NEON_AES_RSBOX(w) \
  7793. { \
  7794. w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \
  7795. w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \
  7796. w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \
  7797. w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \
  7798. w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \
  7799. w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \
  7800. w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \
  7801. w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \
  7802. w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
  7803. w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
  7804. w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
  7805. w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
  7806. w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
  7807. w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
  7808. w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
  7809. w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
  7810. w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
  7811. w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
  7812. w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
  7813. w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
  7814. w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
  7815. w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
  7816. w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
  7817. w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
  7818. w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
  7819. w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
  7820. w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
  7821. w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
  7822. w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
  7823. w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
  7824. w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
  7825. w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
  7826. w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
  7827. w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
  7828. w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
  7829. w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
  7830. w(0x55), w(0x21), w(0x0c), w(0x7d) \
  7831. }
  7832. /* clang-format on */
  7833. /* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
  7834. #define SSE2NEON_AES_H0(x) (x)
  7835. static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
  7836. static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
  7837. #undef SSE2NEON_AES_H0
  7838. /* x_time function and matrix multiply function */
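// SSE2NEON_XT multiplies its argument by x (i.e. by 2) in GF(2^8) modulo the
// AES polynomial 0x11b, and SSE2NEON_MULTIPLY chains xtime steps into a full
// GF(2^8) product; both are only needed by the ARMv7-A fallback paths below.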
  7839. #if !defined(__aarch64__) && !defined(_M_ARM64)
  7840. #define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
  7841. #define SSE2NEON_MULTIPLY(x, y) \
  7842. (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^ \
  7843. ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^ \
  7844. ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
  7845. ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
  7846. #endif
  7847. // In the absence of crypto extensions, implement aesenc using regular NEON
  7848. // intrinsics instead. See:
  7849. // https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
// and https://www.workofard.com/2017/07/ghash-for-low-end-cores/
// for more information.
  7852. FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
  7853. {
  7854. #if defined(__aarch64__) || defined(_M_ARM64)
  7855. static const uint8_t shift_rows[] = {
  7856. 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
  7857. 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
  7858. };
  7859. static const uint8_t ror32by8[] = {
  7860. 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
  7861. 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
  7862. };
  7863. uint8x16_t v;
  7864. uint8x16_t w = vreinterpretq_u8_m128i(a);
  7865. /* shift rows */
  7866. w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
  7867. /* sub bytes */
// Here we split the 256-byte S-box into four 64-byte tables and look them up
// one after another. Each subsequent lookup uses the table that starts 64
// bytes further in, so the index handed to `vqtbx4q_u8()` has to be reduced
// by the same offset as the table (w - 0x40, w - 0x80, w - 0xc0).
  7873. v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
// 'w - 0x40' is equivalent to 'vsubq_u8(w, vdupq_n_u8(0x40))'
  7875. v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
  7876. v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
  7877. v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
  7878. /* mix columns */
  7879. w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
  7880. w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
  7881. w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
  7882. /* add round key */
  7883. return vreinterpretq_m128i_u8(w) ^ RoundKey;
  7884. #else /* ARMv7-A implementation for a table-based AES */
  7885. #define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
  7886. (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
  7887. ((uint32_t) (b1) << 8) | (uint32_t) (b0))
// multiplying 'x' by 2 in GF(2^8)
#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
// multiplying 'x' by 3 in GF(2^8)
#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
  7892. #define SSE2NEON_AES_U0(p) \
  7893. SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
  7894. #define SSE2NEON_AES_U1(p) \
  7895. SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
  7896. #define SSE2NEON_AES_U2(p) \
  7897. SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
  7898. #define SSE2NEON_AES_U3(p) \
  7899. SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
// These tables combine sub_bytes() and the mix_columns() multiplication for
// every possible byte value; shift_rows() is folded into the byte indexing
// used below.
  7902. static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
  7903. SSE2NEON_AES_SBOX(SSE2NEON_AES_U0),
  7904. SSE2NEON_AES_SBOX(SSE2NEON_AES_U1),
  7905. SSE2NEON_AES_SBOX(SSE2NEON_AES_U2),
  7906. SSE2NEON_AES_SBOX(SSE2NEON_AES_U3),
  7907. };
  7908. #undef SSE2NEON_AES_B2W
  7909. #undef SSE2NEON_AES_F2
  7910. #undef SSE2NEON_AES_F3
  7911. #undef SSE2NEON_AES_U0
  7912. #undef SSE2NEON_AES_U1
  7913. #undef SSE2NEON_AES_U2
  7914. #undef SSE2NEON_AES_U3
  7915. uint32_t x0 = _mm_cvtsi128_si32(a); // get a[31:0]
  7916. uint32_t x1 =
  7917. _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); // get a[63:32]
  7918. uint32_t x2 =
  7919. _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA)); // get a[95:64]
  7920. uint32_t x3 =
  7921. _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF)); // get a[127:96]
  7922. // finish the modulo addition step in mix_columns()
  7923. __m128i out = _mm_set_epi32(
  7924. (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
  7925. aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
  7926. (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
  7927. aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
  7928. (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
  7929. aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
  7930. (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
  7931. aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
  7932. return _mm_xor_si128(out, RoundKey);
  7933. #endif
  7934. }
  7935. // Perform one round of an AES decryption flow on data (state) in a using the
  7936. // round key in RoundKey, and store the result in dst.
  7937. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
  7938. FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
  7939. {
  7940. #if defined(__aarch64__)
  7941. static const uint8_t inv_shift_rows[] = {
  7942. 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
  7943. 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
  7944. };
  7945. static const uint8_t ror32by8[] = {
  7946. 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
  7947. 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
  7948. };
  7949. uint8x16_t v;
  7950. uint8x16_t w = vreinterpretq_u8_m128i(a);
  7951. // inverse shift rows
  7952. w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
  7953. // inverse sub bytes
  7954. v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
  7955. v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
  7956. v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
  7957. v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
  7958. // inverse mix columns
  7959. // multiplying 'v' by 4 in GF(2^8)
  7960. w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
  7961. w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
  7962. v ^= w;
  7963. v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
  7964. w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
0x1b); // multiplying 'v' by 2 in GF(2^8)
  7966. w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
  7967. w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
  7968. // add round key
  7969. return vreinterpretq_m128i_u8(w) ^ RoundKey;
  7970. #else /* ARMv7-A NEON implementation */
/* FIXME: not yet optimized for NEON */
  7972. uint8_t i, e, f, g, h, v[4][4];
  7973. uint8_t *_a = (uint8_t *) &a;
  7974. for (i = 0; i < 16; ++i) {
  7975. v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
  7976. }
  7977. // inverse mix columns
  7978. for (i = 0; i < 4; ++i) {
  7979. e = v[i][0];
  7980. f = v[i][1];
  7981. g = v[i][2];
  7982. h = v[i][3];
  7983. v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
  7984. SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
  7985. v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
  7986. SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
  7987. v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
  7988. SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
  7989. v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
  7990. SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
  7991. }
  7992. return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
  7993. #endif
  7994. }
  7995. // Perform the last round of an AES encryption flow on data (state) in a using
  7996. // the round key in RoundKey, and store the result in dst.
  7997. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
  7998. FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
  7999. {
  8000. #if defined(__aarch64__)
  8001. static const uint8_t shift_rows[] = {
  8002. 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
  8003. 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
  8004. };
  8005. uint8x16_t v;
  8006. uint8x16_t w = vreinterpretq_u8_m128i(a);
  8007. // shift rows
  8008. w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
  8009. // sub bytes
  8010. v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
  8011. v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
  8012. v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
  8013. v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
  8014. // add round key
  8015. return vreinterpretq_m128i_u8(v) ^ RoundKey;
  8016. #else /* ARMv7-A implementation */
  8017. uint8_t v[16] = {
  8018. _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
  8019. _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
  8020. _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
  8021. _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
  8022. _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
  8023. _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
  8024. _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
  8025. _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
  8026. _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
  8027. _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
  8028. _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
  8029. _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
  8030. _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
  8031. _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
  8032. _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
  8033. _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
  8034. };
  8035. return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;
  8036. #endif
  8037. }
  8038. // Perform the last round of an AES decryption flow on data (state) in a using
  8039. // the round key in RoundKey, and store the result in dst.
  8040. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
  8041. FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
  8042. {
  8043. #if defined(__aarch64__)
  8044. static const uint8_t inv_shift_rows[] = {
  8045. 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
  8046. 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
  8047. };
  8048. uint8x16_t v;
  8049. uint8x16_t w = vreinterpretq_u8_m128i(a);
  8050. // inverse shift rows
  8051. w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
  8052. // inverse sub bytes
  8053. v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
  8054. v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
  8055. v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
  8056. v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
  8057. // add round key
  8058. return vreinterpretq_m128i_u8(v) ^ RoundKey;
  8059. #else /* ARMv7-A NEON implementation */
/* FIXME: not yet optimized for NEON */
  8061. uint8_t v[4][4];
  8062. uint8_t *_a = (uint8_t *) &a;
  8063. for (int i = 0; i < 16; ++i) {
  8064. v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
  8065. }
  8066. return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
  8067. #endif
  8068. }
  8069. // Perform the InvMixColumns transformation on a and store the result in dst.
  8070. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
  8071. FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
  8072. {
  8073. #if defined(__aarch64__)
  8074. static const uint8_t ror32by8[] = {
  8075. 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
  8076. 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
  8077. };
  8078. uint8x16_t v = vreinterpretq_u8_m128i(a);
  8079. uint8x16_t w;
  8080. // multiplying 'v' by 4 in GF(2^8)
  8081. w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
  8082. w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
  8083. v ^= w;
  8084. v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
  8085. // multiplying 'v' by 2 in GF(2^8)
  8086. w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
  8087. w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
  8088. w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
  8089. return vreinterpretq_m128i_u8(w);
  8090. #else /* ARMv7-A NEON implementation */
  8091. uint8_t i, e, f, g, h, v[4][4];
  8092. vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a));
  8093. for (i = 0; i < 4; ++i) {
  8094. e = v[i][0];
  8095. f = v[i][1];
  8096. g = v[i][2];
  8097. h = v[i][3];
  8098. v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
  8099. SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
  8100. v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
  8101. SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
  8102. v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
  8103. SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
  8104. v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
  8105. SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
  8106. }
  8107. return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v));
  8108. #endif
  8109. }
  8110. // Assist in expanding the AES cipher key by computing steps towards generating
  8111. // a round key for encryption cipher using data from a and an 8-bit round
  8112. // constant specified in imm8, and store the result in dst.
  8113. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
  8114. //
  8115. // Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
  8116. // This instruction generates a round key for AES encryption. See
  8117. // https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
  8118. // for details.
  8119. FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
  8120. {
  8121. #if defined(__aarch64__)
  8122. uint8x16_t _a = vreinterpretq_u8_m128i(a);
  8123. uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
  8124. v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
  8125. v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
  8126. v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);
  8127. uint32x4_t v_u32 = vreinterpretq_u32_u8(v);
  8128. uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24));
  8129. uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon));
  8130. return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v));
  8131. #else /* ARMv7-A NEON implementation */
  8132. uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
  8133. uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
  8134. for (int i = 0; i < 4; ++i) {
  8135. ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
  8136. ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
  8137. }
  8138. return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
  8139. ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
  8140. #endif
  8141. }
  8142. #undef SSE2NEON_AES_SBOX
  8143. #undef SSE2NEON_AES_RSBOX
  8144. #if defined(__aarch64__)
  8145. #undef SSE2NEON_XT
  8146. #undef SSE2NEON_MULTIPLY
  8147. #endif
  8148. #else /* __ARM_FEATURE_CRYPTO */
  8149. // Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
  8150. // AESMC and then manually applying the real key as an xor operation. This
  8151. // unfortunately means an additional xor op; the compiler should be able to
  8152. // optimize this away for repeated calls however. See
  8153. // https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
  8154. // for more details.
  8155. FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
  8156. {
  8157. return vreinterpretq_m128i_u8(veorq_u8(
  8158. vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
  8159. vreinterpretq_u8_m128i(b)));
  8160. }
  8161. // Perform one round of an AES decryption flow on data (state) in a using the
  8162. // round key in RoundKey, and store the result in dst.
  8163. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
  8164. FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
  8165. {
  8166. return vreinterpretq_m128i_u8(veorq_u8(
  8167. vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
  8168. vreinterpretq_u8_m128i(RoundKey)));
  8169. }
  8170. // Perform the last round of an AES encryption flow on data (state) in a using
  8171. // the round key in RoundKey, and store the result in dst.
  8172. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
  8173. FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
  8174. {
  8175. return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
  8176. vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
  8177. RoundKey);
  8178. }
  8179. // Perform the last round of an AES decryption flow on data (state) in a using
  8180. // the round key in RoundKey, and store the result in dst.
  8181. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
  8182. FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
  8183. {
  8184. return vreinterpretq_m128i_u8(
  8185. veorq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)),
  8186. vreinterpretq_u8_m128i(RoundKey)));
  8187. }
  8188. // Perform the InvMixColumns transformation on a and store the result in dst.
  8189. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
  8190. FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
  8191. {
  8192. return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
  8193. }
  8194. // Assist in expanding the AES cipher key by computing steps towards generating
  8195. // a round key for encryption cipher using data from a and an 8-bit round
// constant specified in imm8, and store the result in dst.
  8197. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
  8198. FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
  8199. {
  8200. // AESE does ShiftRows and SubBytes on A
  8201. uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
  8202. #ifndef _MSC_VER
  8203. uint8x16_t dest = {
  8204. // Undo ShiftRows step from AESE and extract X1 and X3
  8205. u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1)
  8206. u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1))
  8207. u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3)
  8208. u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3))
  8209. };
  8210. uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
  8211. return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
  8212. #else
  8213. // We have to do this hack because MSVC is strictly adhering to the CPP
  8214. // standard, in particular C++03 8.5.1 sub-section 15, which states that
  8215. // unions must be initialized by their first member type.
  8216. // As per the Windows ARM64 ABI, it is always little endian, so this works
  8217. __n128 dest{
  8218. ((uint64_t) u8.n128_u8[0x4] << 0) | ((uint64_t) u8.n128_u8[0x1] << 8) |
  8219. ((uint64_t) u8.n128_u8[0xE] << 16) |
  8220. ((uint64_t) u8.n128_u8[0xB] << 24) |
  8221. ((uint64_t) u8.n128_u8[0x1] << 32) |
  8222. ((uint64_t) u8.n128_u8[0xE] << 40) |
  8223. ((uint64_t) u8.n128_u8[0xB] << 48) |
  8224. ((uint64_t) u8.n128_u8[0x4] << 56),
  8225. ((uint64_t) u8.n128_u8[0xC] << 0) | ((uint64_t) u8.n128_u8[0x9] << 8) |
  8226. ((uint64_t) u8.n128_u8[0x6] << 16) |
  8227. ((uint64_t) u8.n128_u8[0x3] << 24) |
  8228. ((uint64_t) u8.n128_u8[0x9] << 32) |
  8229. ((uint64_t) u8.n128_u8[0x6] << 40) |
  8230. ((uint64_t) u8.n128_u8[0x3] << 48) |
  8231. ((uint64_t) u8.n128_u8[0xC] << 56)};
  8232. dest.n128_u32[1] = dest.n128_u32[1] ^ rcon;
  8233. dest.n128_u32[3] = dest.n128_u32[3] ^ rcon;
  8234. return dest;
  8235. #endif
  8236. }
  8237. #endif
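// Illustrative usage sketch (not part of the emulated SSE API): the usual way
// the AES intrinsics above are combined to encrypt one 16-byte block with
// AES-128. The eleven round keys in rk[] are assumed to have been expanded
// elsewhere (e.g. with the help of _mm_aeskeygenassist_si128); the helper name
// is hypothetical.
FORCE_INLINE __m128i _sse2neon_example_aes128_encrypt_block(__m128i block,
                                                            const __m128i rk[11])
{
    block = _mm_xor_si128(block, rk[0]); /* initial AddRoundKey */
    for (int i = 1; i < 10; i++)
        block = _mm_aesenc_si128(block, rk[i]); /* rounds 1..9 */
    return _mm_aesenclast_si128(block, rk[10]); /* final round, no MixColumns */
}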
  8238. /* Others */

// Perform a carry-less multiplication of two 64-bit integers, selected from a
// and b according to imm8, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128
FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
{
    uint64x2_t a = vreinterpretq_u64_m128i(_a);
    uint64x2_t b = vreinterpretq_u64_m128i(_b);
    switch (imm & 0x11) {
    case 0x00:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
    case 0x01:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
    case 0x10:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
    case 0x11:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
    default:
        abort();
    }
}
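
// Example (illustrative): multiplying the two low 64-bit lanes, the selector
// used by CRC/GHASH-style folding; 0x11 would multiply the two high lanes.
// The operand values here are arbitrary:
//
//   __m128i x = _mm_set_epi64x(0, 0x86);
//   __m128i y = _mm_set_epi64x(0, 0x03);
//   __m128i p = _mm_clmulepi64_si128(x, y, 0x00);
//   // low 64 bits of p == 0x18A (0x86 * 0x03 without carries, i.e. in GF(2)[x])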

// Return the current denormals-are-zero mode (_MM_DENORMALS_ZERO_ON or
// _MM_DENORMALS_ZERO_OFF), read from bit 24 of FPCR (AArch64) / FPSCR (AArch32).
FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void)
{
    union {
        fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__) || defined(_M_ARM64)
    r.value = _sse2neon_get_fpcr();
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
}

// Count the number of bits set to 1 in unsigned 32-bit integer a, and
// return that count in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
#if __has_builtin(__builtin_popcount)
    return __builtin_popcount(a);
#elif defined(_MSC_VER)
    return _CountOneBits(a);
#else
    return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
#endif
#else
    uint32_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;

    // Zero-extend a into a 64-bit lane so only the 32 bits of a are counted,
    // instead of loading 8 bytes from the 4-byte object &a.
    input_val = vcreate_u8((uint64_t) a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);
    // Lane 0 holds the bit count of a; storing both lanes would write past
    // the 4-byte count variable.
    vst1_lane_u32(&count, count32x2_val, 0);

    return count;
#endif
}

// Count the number of bits set to 1 in unsigned 64-bit integer a, and
// return that count in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
#if __has_builtin(__builtin_popcountll)
    return __builtin_popcountll(a);
#elif defined(_MSC_VER)
    return _CountOneBits64(a);
#else
    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
#endif
#else
    uint64_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;
    uint64x1_t count64x1_val;

    input_val = vld1_u8((uint8_t *) &a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);
    count64x1_val = vpaddl_u32(count32x2_val);
    vst1_u64(&count, count64x1_val);

    return count;
#endif
}
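
// Example: both population-count intrinsics simply return the number of set
// bits, e.g.
//
//   _mm_popcnt_u32(0xF0F0F0F0u) == 16
//   _mm_popcnt_u64(0x0123456789ABCDEFull) == 32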

// Set or clear the denormals-are-zero mode by writing bit 24 of FPCR
// (AArch64) / FPSCR (AArch32) according to flag.
FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
{
    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
    // regardless of the value of the FZ bit.
    union {
        fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__) || defined(_M_ARM64)
    r.value = _sse2neon_get_fpcr();
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;

#if defined(__aarch64__) || defined(_M_ARM64)
    _sse2neon_set_fpcr(r.value);
#else
    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r.value)); /* write */
#endif
}
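
// Example (illustrative): saving, setting and restoring the denormals-are-zero
// mode around a block of floating-point work. This is a minimal sketch calling
// the internal helpers directly:
//
//   unsigned int saved = _sse2neon_mm_get_denormals_zero_mode();
//   _sse2neon_mm_set_denormals_zero_mode(_MM_DENORMALS_ZERO_ON);
//   /* ... computation where denormal inputs may be flushed to zero ... */
//   _sse2neon_mm_set_denormals_zero_mode(saved);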

// Return the current 64-bit value of the processor's time-stamp counter.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
FORCE_INLINE uint64_t _rdtsc(void)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    uint64_t val;

    /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
     * system counter is at least 56 bits wide; from Armv8.6, the counter
     * must be 64 bits wide. So the system counter could be less than 64
     * bits wide, in which case it is attributed with the flag
     * 'cap_user_time_short'.
     */
#if defined(_MSC_VER)
    val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2));
#else
    __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));
#endif

    return val;
#else
    uint32_t pmccntr, pmuseren, pmcntenset;

    // Read the user mode Performance Monitoring Unit (PMU)
    // User Enable Register (PMUSERENR) access permissions.
    __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
    if (pmuseren & 1) {  // Allows reading PMUSERENR for user mode code.
        __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
        if (pmcntenset & 0x80000000UL) {  // Is it counting?
            __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
            // The counter is set up to count every 64th cycle
            return (uint64_t) (pmccntr) << 6;
        }
    }

    // Fall back to gettimeofday(), since we cannot enable PMUSERENR from
    // user mode.
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
#endif
}
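
// Example (illustrative): a coarse elapsed-tick measurement. Note that on
// AArch64 this reads the generic timer (CNTVCT_EL0), which ticks at the
// frequency given by CNTFRQ_EL0 rather than at the core clock, so the delta
// is not a cycle count:
//
//   uint64_t t0 = _rdtsc();
//   /* ... code under measurement ... */
//   uint64_t t1 = _rdtsc();
//   uint64_t elapsed_ticks = t1 - t0;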

#if defined(__GNUC__) || defined(__clang__)
#pragma pop_macro("ALIGN_STRUCT")
#pragma pop_macro("FORCE_INLINE")
#endif

#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC pop_options
#endif

#endif