// Copyright 2023 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 128-bit vectors for VSX/Z14
// External include guard in highway.h - see comment there.
// HWY_S390X_HAVE_Z14 is 1 when compiling for the IBM z14/z15 (s390x) targets,
// 0 when compiling for PPC VSX targets.
#if HWY_TARGET == HWY_Z14 || HWY_TARGET == HWY_Z15
#define HWY_S390X_HAVE_Z14 1
#else
#define HWY_S390X_HAVE_Z14 0
#endif

// <altivec.h>/<vecintrin.h> may define `vector`, `pixel` and `bool` as macros
// (context-sensitive keywords). Save any existing user definitions of these
// names, remove them while including the intrinsics header, then restore them
// so translation units including this header are unaffected.
#pragma push_macro("vector")
#pragma push_macro("pixel")
#pragma push_macro("bool")

#undef vector
#undef pixel
#undef bool

#if HWY_S390X_HAVE_Z14
#include <vecintrin.h>  // s390x vector intrinsics
#else
#include <altivec.h>  // PPC AltiVec/VSX intrinsics
#endif

#pragma pop_macro("vector")
#pragma pop_macro("pixel")
#pragma pop_macro("bool")

#include "hwy/ops/shared-inl.h"

// clang's altivec.h gates some intrinsics behind #ifdef __POWER10_VECTOR__, and
// some GCC do the same for _ARCH_PWR10.
// This means we can only use POWER10-specific intrinsics in static dispatch
// mode (where the -mpower10-vector compiler flag is passed). Same for PPC9.
// On other compilers, the usual target check is sufficient.
#if !HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_PPC9 && \
    (defined(_ARCH_PWR9) || defined(__POWER9_VECTOR__))
#define HWY_PPC_HAVE_9 1
#else
#define HWY_PPC_HAVE_9 0
#endif

#if !HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_PPC10 && \
    (defined(_ARCH_PWR10) || defined(__POWER10_VECTOR__))
#define HWY_PPC_HAVE_10 1
#else
#define HWY_PPC_HAVE_10 0
#endif

// HWY_S390X_HAVE_Z15 is 1 when targeting Z15 on an s390x build whose
// __ARCH__ is at least 13 (NOTE(review): arch level 13 presumably corresponds
// to z15 — confirm against the s390x toolchain docs).
#if HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_Z15 && __ARCH__ >= 13
#define HWY_S390X_HAVE_Z15 1
#else
#define HWY_S390X_HAVE_Z15 0
#endif
  59. HWY_BEFORE_NAMESPACE();
  60. namespace hwy {
  61. namespace HWY_NAMESPACE {
namespace detail {

// Maps a Highway lane type T to the raw Altivec/VSX vector types used to
// implement Vec128<T, N> / Mask128<T, N>. Specializations are generated by
// HWY_VSX_RAW128 below.
template <typename T>
struct Raw128;

// Each Raw128 specialization defines the following typedefs:
// - type:
//   the backing Altivec/VSX raw vector type of the Vec128<T, N> type
// - RawBoolVec:
//   the backing Altivec/VSX raw __bool vector type of the Mask128<T, N> type
// - RawT:
//   the lane type for intrinsics, in particular vec_splat
// - AlignedRawVec:
//   the 128-bit GCC/Clang vector type for aligned loads/stores
// - UnalignedRawVec:
//   the 128-bit GCC/Clang vector type for unaligned loads/stores
//   (__aligned__(alignof(LANE_TYPE)) relaxes the default 16-byte alignment;
//   __may_alias__ permits loads/stores through differently-typed pointers)
#define HWY_VSX_RAW128(LANE_TYPE, RAW_VECT_LANE_TYPE, RAW_BOOL_VECT_LANE_TYPE) \
  template <>                                                                  \
  struct Raw128<LANE_TYPE> {                                                   \
    using type = __vector RAW_VECT_LANE_TYPE;                                  \
    using RawBoolVec = __vector __bool RAW_BOOL_VECT_LANE_TYPE;                \
    using RawT = RAW_VECT_LANE_TYPE;                                           \
    typedef LANE_TYPE AlignedRawVec                                            \
        __attribute__((__vector_size__(16), __aligned__(16), __may_alias__));  \
    typedef LANE_TYPE UnalignedRawVec __attribute__((                          \
        __vector_size__(16), __aligned__(alignof(LANE_TYPE)), __may_alias__)); \
  };

HWY_VSX_RAW128(int8_t, signed char, char)
HWY_VSX_RAW128(uint8_t, unsigned char, char)
HWY_VSX_RAW128(int16_t, signed short, short)    // NOLINT(runtime/int)
HWY_VSX_RAW128(uint16_t, unsigned short, short)  // NOLINT(runtime/int)
HWY_VSX_RAW128(int32_t, signed int, int)
HWY_VSX_RAW128(uint32_t, unsigned int, int)
HWY_VSX_RAW128(int64_t, signed long long, long long)    // NOLINT(runtime/int)
HWY_VSX_RAW128(uint64_t, unsigned long long, long long)  // NOLINT(runtime/int)
HWY_VSX_RAW128(float, float, int)
HWY_VSX_RAW128(double, double, long long)  // NOLINT(runtime/int)

// bfloat16/float16 lanes reuse the u16 raw types (bit-pattern storage).
template <>
struct Raw128<bfloat16_t> : public Raw128<uint16_t> {};

template <>
struct Raw128<float16_t> : public Raw128<uint16_t> {};

#undef HWY_VSX_RAW128

}  // namespace detail
// 128-bit (or partial) vector of N lanes of type T, backed by an Altivec/VSX
// raw vector. N defaults to the full 16-byte capacity.
template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
  using Raw = typename detail::Raw128<T>::type;

 public:
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = N;  // only for DFromV

  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator%=(const Vec128 other) {
    return *this = (*this % other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  // Public so ops can access it directly; always a full 16-byte raw vector
  // even when N covers only part of it.
  Raw raw;
};
// Aliases for partial vectors holding a total of 8/4/2 bytes.
template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;
template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;
template <typename T>
using Vec16 = Vec128<T, 2 / sizeof(T)>;

// FF..FF or 0.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  typename detail::Raw128<T>::RawBoolVec raw;

  using PrivateT = T;                     // only for DFromM
  static constexpr size_t kPrivateN = N;  // only for DFromM
};

// Recover the Simd<> descriptor from a vector type.
template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

// Recover the Simd<> descriptor from a mask type.
template <class M>
using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;

// Lane type of a vector type.
template <class V>
using TFromV = typename V::PrivateT;
// ------------------------------ Zero

// Returns an all-zero vector/part.
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  // There is no vec_splats for 64-bit, so we cannot rely on casting the 0
  // argument in order to select the correct overload. We instead cast the
  // return vector type; see also the comment in BitCast.
  return Vec128<T, HWY_MAX_LANES_D(D)>{
      reinterpret_cast<typename detail::Raw128<T>::type>(vec_splats(0))};
}

// Vector type corresponding to a descriptor D.
template <class D>
using VFromD = decltype(Zero(D()));
// ------------------------------ BitCast

// Reinterprets the bits of a vector of FromT lanes as a vector of TFromD<D>
// lanes of the same total size.
template <class D, typename FromT>
HWY_API VFromD<D> BitCast(D /*d*/,
                          Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
  // C-style casts are not sufficient when compiling with
  // -fno-lax-vector-conversions, which will be the future default in Clang,
  // but reinterpret_cast is.
  return VFromD<D>{
      reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(v.raw)};
}

// ------------------------------ ResizeBitCast

// As BitCast, but the source and destination vector sizes may differ; the
// raw 16-byte register is reinterpreted as-is.
template <class D, typename FromV>
HWY_API VFromD<D> ResizeBitCast(D /*d*/, FromV v) {
  // C-style casts are not sufficient when compiling with
  // -fno-lax-vector-conversions, which will be the future default in Clang,
  // but reinterpret_cast is.
  return VFromD<D>{
      reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(v.raw)};
}
// ------------------------------ Set

// Returns a vector/part with all lanes set to "t".
template <class D, HWY_IF_NOT_SPECIAL_FLOAT(TFromD<D>)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  // Cast to the intrinsic lane type so vec_splats picks the right overload.
  using RawLane = typename detail::Raw128<TFromD<D>>::RawT;
  return VFromD<D>{vec_splats(static_cast<RawLane>(t))};
}

// Special floats (f16/bf16) are splatted via their u16 bit pattern.
template <class D, HWY_IF_SPECIAL_FLOAT(TFromD<D>)>
HWY_API VFromD<D> Set(D d, TFromD<D> t) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Set(du, BitCastScalar<TFromD<decltype(du)>>(t)));
}
// Returns a vector with uninitialized elements.
template <class D>
HWY_API VFromD<D> Undefined(D d) {
#if HWY_COMPILER_GCC_ACTUAL
  // Suppressing maybe-uninitialized both here and at the caller does not work,
  // so initialize.
  return Zero(d);
#elif HWY_HAS_BUILTIN(__builtin_nondeterministic_value)
  return VFromD<D>{__builtin_nondeterministic_value(Zero(d).raw)};
#else
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
  typename detail::Raw128<TFromD<D>>::type raw;
  return VFromD<decltype(d)>{raw};
  // Pragmas are lexical, so the pop after the return statement is fine.
  HWY_DIAGNOSTICS(pop)
#endif
}
// ------------------------------ GetLane

// Gets the single value stored in a vector/part.
template <typename T, size_t N>
HWY_API T GetLane(Vec128<T, N> v) {
  // GCC/Clang vector extension subscript extracts lane 0 directly.
  return static_cast<T>(v.raw[0]);
}
// ------------------------------ Dup128VecFromValues

// Builds a vector from 16 explicit byte lanes (lane 0 first).
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
                                      TFromD<D> t11, TFromD<D> t12,
                                      TFromD<D> t13, TFromD<D> t14,
                                      TFromD<D> t15) {
  const typename detail::Raw128<TFromD<D>>::type raw = {
      t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15};
  return VFromD<D>{raw};
}

// Builds a vector from 8 explicit u16/i16 lanes.
template <class D, HWY_IF_UI16_D(D)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1, t2, t3,
                                                        t4, t5, t6, t7};
  return VFromD<D>{raw};
}

// f16/bf16 lanes are routed through their u16 bit patterns.
template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, Dup128VecFromValues(
             du, BitCastScalar<uint16_t>(t0), BitCastScalar<uint16_t>(t1),
             BitCastScalar<uint16_t>(t2), BitCastScalar<uint16_t>(t3),
             BitCastScalar<uint16_t>(t4), BitCastScalar<uint16_t>(t5),
             BitCastScalar<uint16_t>(t6), BitCastScalar<uint16_t>(t7)));
}

// Builds a vector from 4 explicit 32-bit lanes.
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
  const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1, t2, t3};
  return VFromD<D>{raw};
}

// Builds a vector from 2 explicit 64-bit lanes.
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
  const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1};
  return VFromD<D>{raw};
}
// ================================================== LOGICAL

// All bitwise ops below operate on the unsigned representation so that one
// implementation covers both integer and floating-point lanes. On Z14 the
// plain GCC/Clang vector operators are used; otherwise the Altivec intrinsics.

// ------------------------------ And
template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
#if HWY_S390X_HAVE_Z14
  return BitCast(d, VU{BitCast(du, a).raw & BitCast(du, b).raw});
#else
  return BitCast(d, VU{vec_and(BitCast(du, a).raw, BitCast(du, b).raw)});
#endif
}

// ------------------------------ AndNot

// Returns ~not_mask & mask.
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
  const DFromV<decltype(mask)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  // vec_andc(a, b) computes a & ~b, hence the swapped argument order.
  return BitCast(
      d, VU{vec_andc(BitCast(du, mask).raw, BitCast(du, not_mask).raw)});
}

// ------------------------------ Or
template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
#if HWY_S390X_HAVE_Z14
  return BitCast(d, VU{BitCast(du, a).raw | BitCast(du, b).raw});
#else
  return BitCast(d, VU{vec_or(BitCast(du, a).raw, BitCast(du, b).raw)});
#endif
}

// ------------------------------ Xor
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
#if HWY_S390X_HAVE_Z14
  return BitCast(d, VU{BitCast(du, a).raw ^ BitCast(du, b).raw});
#else
  return BitCast(d, VU{vec_xor(BitCast(du, a).raw, BitCast(du, b).raw)});
#endif
}

// ------------------------------ Not
template <typename T, size_t N>
HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  // NOR of a value with itself is its complement.
  return BitCast(d, VU{vec_nor(BitCast(du, v).raw, BitCast(du, v).raw)});
}
// ------------------------------ IsConstantRawAltivecVect
namespace detail {

// Returns true if every lane of the raw vector is a compile-time constant,
// dispatched on lane size via SizeTag. Used to let the compiler fold logic
// ops on constant operands instead of emitting a ternary-logic instruction.

template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(
    hwy::SizeTag<1> /* lane_size_tag */, RawV v) {
  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
         __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
         __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
         __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]) &&
         __builtin_constant_p(v[8]) && __builtin_constant_p(v[9]) &&
         __builtin_constant_p(v[10]) && __builtin_constant_p(v[11]) &&
         __builtin_constant_p(v[12]) && __builtin_constant_p(v[13]) &&
         __builtin_constant_p(v[14]) && __builtin_constant_p(v[15]);
}

template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(
    hwy::SizeTag<2> /* lane_size_tag */, RawV v) {
  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
         __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
         __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
         __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]);
}

template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(
    hwy::SizeTag<4> /* lane_size_tag */, RawV v) {
  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
         __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]);
}

template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(
    hwy::SizeTag<8> /* lane_size_tag */, RawV v) {
  return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]);
}

// Entry point: selects the overload above from the lane size.
template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(RawV v) {
  return IsConstantRawAltivecVect(hwy::SizeTag<sizeof(decltype(v[0]))>(), v);
}

}  // namespace detail
// ------------------------------ TernaryLogic
#if HWY_PPC_HAVE_10
namespace detail {

// Evaluates an arbitrary 3-input bitwise function selected by kTernLogOp,
// using the POWER10 xxeval instruction.
// NOTE: the kTernLogOp bits of the PPC10 TernaryLogic operation are in reverse
// order of the kTernLogOp bits of AVX3
// _mm_ternarylogic_epi64(a, b, c, kTernLogOp)
template <uint8_t kTernLogOp, class V>
HWY_INLINE V TernaryLogic(V a, V b, V c) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const auto a_raw = BitCast(du, a).raw;
  const auto b_raw = BitCast(du, b).raw;
  const auto c_raw = BitCast(du, c).raw;
#if HWY_COMPILER_GCC_ACTUAL
  // Use inline assembly on GCC to work around GCC compiler bug
  typename detail::Raw128<TFromV<VU>>::type raw_ternlog_result;
  __asm__("xxeval %x0,%x1,%x2,%x3,%4"
          : "=wa"(raw_ternlog_result)
          : "wa"(a_raw), "wa"(b_raw), "wa"(c_raw),
            "n"(static_cast<unsigned>(kTernLogOp))
          :);
#else
  const auto raw_ternlog_result =
      vec_ternarylogic(a_raw, b_raw, c_raw, kTernLogOp);
#endif
  return BitCast(d, VU{raw_ternlog_result});
}

}  // namespace detail
#endif  // HWY_PPC_HAVE_10
// ------------------------------ Xor3

// Returns x1 ^ x2 ^ x3, as a single xxeval on PPC10 when profitable.
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
#if HWY_PPC_HAVE_10
#if defined(__OPTIMIZE__)
  // If at least two operands are compile-time constants, plain Xor lets the
  // compiler fold them, which beats a single ternary-logic instruction.
  if (static_cast<int>(detail::IsConstantRawAltivecVect(x1.raw)) +
          static_cast<int>(detail::IsConstantRawAltivecVect(x2.raw)) +
          static_cast<int>(detail::IsConstantRawAltivecVect(x3.raw)) >=
      2) {
    return Xor(x1, Xor(x2, x3));
  } else  // NOLINT
#endif
  {
    return detail::TernaryLogic<0x69>(x1, x2, x3);
  }
#else
  return Xor(x1, Xor(x2, x3));
#endif
}
// ------------------------------ Or3

// Returns o1 | o2 | o3, as a single xxeval on PPC10 when profitable.
template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
#if HWY_PPC_HAVE_10
#if defined(__OPTIMIZE__)
  // As in Xor3: with two or more constant operands, let the compiler fold.
  if (static_cast<int>(detail::IsConstantRawAltivecVect(o1.raw)) +
          static_cast<int>(detail::IsConstantRawAltivecVect(o2.raw)) +
          static_cast<int>(detail::IsConstantRawAltivecVect(o3.raw)) >=
      2) {
    return Or(o1, Or(o2, o3));
  } else  // NOLINT
#endif
  {
    return detail::TernaryLogic<0x7F>(o1, o2, o3);
  }
#else
  return Or(o1, Or(o2, o3));
#endif
}
// ------------------------------ OrAnd

// Returns o | (a1 & a2), as a single xxeval on PPC10 when profitable.
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
#if HWY_PPC_HAVE_10
#if defined(__OPTIMIZE__)
  // If both AND operands are constants, the compiler can fold a1 & a2.
  if (detail::IsConstantRawAltivecVect(a1.raw) &&
      detail::IsConstantRawAltivecVect(a2.raw)) {
    return Or(o, And(a1, a2));
  } else  // NOLINT
#endif
  {
    return detail::TernaryLogic<0x1F>(o, a1, a2);
  }
#else
  return Or(o, And(a1, a2));
#endif
}
// ------------------------------ IfVecThenElse

// Bit-wise select: for each bit, returns yes where mask bit is 1, else no.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  const DFromV<decltype(yes)> d;
  const RebindToUnsigned<decltype(d)> du;
  // vec_sel(a, b, c) selects b where c bits are set, else a.
  return BitCast(
      d, VFromD<decltype(du)>{vec_sel(BitCast(du, no).raw, BitCast(du, yes).raw,
                                      BitCast(du, mask).raw)});
}
// ------------------------------ BitwiseIfThenElse

// Advertise that this target implements BitwiseIfThenElse natively so
// generic_ops-inl does not synthesize it from And/AndNot/Or.
#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#else
#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
#endif

// Per-bit select, identical to IfVecThenElse.
template <class V>
HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
  return IfVecThenElse(mask, yes, no);
}
  466. // ------------------------------ Operator overloads (internal-only if float)
  467. template <typename T, size_t N>
  468. HWY_API Vec128<T, N> operator&(Vec128<T, N> a, Vec128<T, N> b) {
  469. return And(a, b);
  470. }
  471. template <typename T, size_t N>
  472. HWY_API Vec128<T, N> operator|(Vec128<T, N> a, Vec128<T, N> b) {
  473. return Or(a, b);
  474. }
  475. template <typename T, size_t N>
  476. HWY_API Vec128<T, N> operator^(Vec128<T, N> a, Vec128<T, N> b) {
  477. return Xor(a, b);
  478. }
// ================================================== SIGN

// ------------------------------ Neg

template <typename T, size_t N, HWY_IF_SIGNED(T)>
HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
  // If T is an signed integer type, use Zero(d) - v instead of vec_neg to
  // avoid undefined behavior in the case where v[i] == LimitsMin<T>()
  const DFromV<decltype(v)> d;
  return Zero(d) - v;
}

template <typename T, size_t N, HWY_IF_FLOAT3264(T)>
HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
#if HWY_S390X_HAVE_Z14
  // Flip the sign bit directly; Z14 path avoids vec_neg.
  return Xor(v, SignBit(DFromV<decltype(v)>()));
#else
  return Vec128<T, N>{vec_neg(v.raw)};
#endif
}

// f16/bf16: negation is a sign-bit flip on the u16 representation.
template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  return Xor(v, SignBit(DFromV<decltype(v)>()));
}
// ------------------------------ Abs

// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
template <class T, size_t N, HWY_IF_SIGNED(T)>
HWY_API Vec128<T, N> Abs(Vec128<T, N> v) {
  // If T is a signed integer type, use Max(v, Neg(v)) instead of vec_abs to
  // avoid undefined behavior in the case where v[i] == LimitsMin<T>().
  return Max(v, Neg(v));
}

template <class T, size_t N, HWY_IF_FLOAT3264(T)>
HWY_API Vec128<T, N> Abs(Vec128<T, N> v) {
  return Vec128<T, N>{vec_abs(v.raw)};
}
// ------------------------------ CopySign

// Returns a value with the magnitude of magn and the sign of sign.
#if HWY_S390X_HAVE_Z14
template <class V>
HWY_API V CopySign(const V magn, const V sign) {
  static_assert(IsFloat<TFromV<V>>(), "Only makes sense for floating-point");
  const DFromV<decltype(magn)> d;
  const auto msb = SignBit(d);
  // Truth table for msb, magn, sign | bitwise msb ? sign : mag
  //                  0    0     0  |  0
  //                  0    0     1  |  0
  //                  0    1     0  |  1
  //                  0    1     1  |  1
  //                  1    0     0  |  0
  //                  1    0     1  |  1
  //                  1    1     0  |  0
  //                  1    1     1  |  1
  return BitwiseIfThenElse(msb, sign, magn);
}
#else  // VSX
template <size_t N>
HWY_API Vec128<float, N> CopySign(Vec128<float, N> magn,
                                  Vec128<float, N> sign) {
  // Work around compiler bugs that are there with vec_cpsgn on older versions
  // of GCC/Clang
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200
  return Vec128<float, N>{__builtin_vec_copysign(magn.raw, sign.raw)};
#elif HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 && \
    HWY_HAS_BUILTIN(__builtin_vsx_xvcpsgnsp)
  return Vec128<float, N>{__builtin_vsx_xvcpsgnsp(magn.raw, sign.raw)};
#else
  // NOTE(review): vec_cpsgn is called with the sign source first, unlike the
  // builtins above -- confirm against the Power vector intrinsics reference.
  return Vec128<float, N>{vec_cpsgn(sign.raw, magn.raw)};
#endif
}

template <size_t N>
HWY_API Vec128<double, N> CopySign(Vec128<double, N> magn,
                                   Vec128<double, N> sign) {
  // Work around compiler bugs that are there with vec_cpsgn on older versions
  // of GCC/Clang
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200
  return Vec128<double, N>{__builtin_vec_copysign(magn.raw, sign.raw)};
#elif HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 && \
    HWY_HAS_BUILTIN(__builtin_vsx_xvcpsgndp)
  return Vec128<double, N>{__builtin_vsx_xvcpsgndp(magn.raw, sign.raw)};
#else
  return Vec128<double, N>{vec_cpsgn(sign.raw, magn.raw)};
#endif
}
#endif  // HWY_S390X_HAVE_Z14
// As CopySign, but the caller guarantees abs is non-negative.
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
  // PPC8 can also handle abs < 0, so no extra action needed.
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  return CopySign(abs, sign);
}
// ================================================== MEMORY (1)

// Note: type punning is safe because the types are tagged with may_alias.
// (https://godbolt.org/z/fqrWjfjsP)

// ------------------------------ Load

// Full 16-byte vector from an aligned pointer.
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
  // Suppress the ignoring attributes warning that is generated by
  // HWY_RCAST_ALIGNED(const LoadRaw*, aligned) with GCC
#if HWY_COMPILER_GCC
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
#endif
  using LoadRaw = typename detail::Raw128<T>::AlignedRawVec;
  const LoadRaw* HWY_RESTRICT p = HWY_RCAST_ALIGNED(const LoadRaw*, aligned);
  using ResultRaw = typename detail::Raw128<T>::type;
  return Vec128<T>{reinterpret_cast<ResultRaw>(*p)};
  // Pragmas are lexical, so the pop after the return statement is fine.
#if HWY_COMPILER_GCC
  HWY_DIAGNOSTICS(pop)
#endif
}

// Any <= 64 bit
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
HWY_API VFromD<D> Load(D d, const T* HWY_RESTRICT p) {
  // Copy the partial vector's bytes into an integer, then splat it; only the
  // low lanes are meaningful for a partial vector.
  using BitsT = UnsignedFromSize<d.MaxBytes()>;
  BitsT bits;
  const Repartition<BitsT, decltype(d)> d_bits;
  CopyBytes<d.MaxBytes()>(p, &bits);
  return BitCast(d, Set(d_bits, bits));
}
// ================================================== MASK

// ------------------------------ Mask

// Mask and Vec are both backed by vector types (true = FF..FF).
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(Vec128<T, N> v) {
  using Raw = typename detail::Raw128<T>::RawBoolVec;
  return Mask128<T, N>{reinterpret_cast<Raw>(v.raw)};
}

// Mask type corresponding to a descriptor D.
template <class D>
using MFromD = decltype(MaskFromVec(VFromD<D>()));

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(Mask128<T, N> v) {
  return Vec128<T, N>{
      reinterpret_cast<typename detail::Raw128<T>::type>(v.raw)};
}

template <class D>
HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) {
  return VFromD<D>{
      reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(v.raw)};
}
// mask ? yes : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  const DFromV<decltype(yes)> d;
  const RebindToUnsigned<decltype(d)> du;
  // vec_sel(a, b, c) selects b where c bits are set, else a.
  return BitCast(d, VFromD<decltype(du)>{vec_sel(
                        BitCast(du, no).raw, BitCast(du, yes).raw, mask.raw)});
}
  624. // mask ? yes : 0
  625. template <typename T, size_t N>
  626. HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  627. return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
  628. }
  629. // mask ? 0 : no
  630. template <typename T, size_t N>
  631. HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  632. return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
  633. }
// ------------------------------ Mask logical

// These operate directly on the raw __bool vectors; Z14 uses the GCC/Clang
// vector operators, other targets the Altivec intrinsics.

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(Mask128<T, N> m) {
  return Mask128<T, N>{vec_nor(m.raw, m.raw)};
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(Mask128<T, N> a, Mask128<T, N> b) {
#if HWY_S390X_HAVE_Z14
  return Mask128<T, N>{a.raw & b.raw};
#else
  return Mask128<T, N>{vec_and(a.raw, b.raw)};
#endif
}

// Returns ~a & b (note vec_andc computes first & ~second).
template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(Mask128<T, N> a, Mask128<T, N> b) {
  return Mask128<T, N>{vec_andc(b.raw, a.raw)};
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(Mask128<T, N> a, Mask128<T, N> b) {
#if HWY_S390X_HAVE_Z14
  return Mask128<T, N>{a.raw | b.raw};
#else
  return Mask128<T, N>{vec_or(a.raw, b.raw)};
#endif
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(Mask128<T, N> a, Mask128<T, N> b) {
#if HWY_S390X_HAVE_Z14
  return Mask128<T, N>{a.raw ^ b.raw};
#else
  return Mask128<T, N>{vec_xor(a.raw, b.raw)};
#endif
}

// Returns ~(a | b): true only where both masks are false.
template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(Mask128<T, N> a, Mask128<T, N> b) {
  return Mask128<T, N>{vec_nor(a.raw, b.raw)};
}
// ------------------------------ ShiftLeftSame

// Shifts every lane left by the same (runtime) bit count.
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, const int bits) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
#if HWY_S390X_HAVE_Z14
  return BitCast(d,
                 VFromD<decltype(du)>{BitCast(du, v).raw
                                      << Set(du, static_cast<TU>(bits)).raw});
#else
  // Do an unsigned vec_sl operation to avoid undefined behavior
  return BitCast(
      d, VFromD<decltype(du)>{
             vec_sl(BitCast(du, v).raw, Set(du, static_cast<TU>(bits)).raw)});
#endif
}
// ------------------------------ ShiftRightSame

// Unsigned lanes: logical (zero-filling) right shift.
template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, const int bits) {
  using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
#if HWY_S390X_HAVE_Z14
  return Vec128<T, N>{v.raw >> vec_splats(static_cast<TU>(bits))};
#else
  return Vec128<T, N>{vec_sr(v.raw, vec_splats(static_cast<TU>(bits)))};
#endif
}

// Signed lanes: arithmetic (sign-extending) right shift.
template <typename T, size_t N, HWY_IF_SIGNED(T)>
HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, const int bits) {
#if HWY_S390X_HAVE_Z14
  using TI = typename detail::Raw128<T>::RawT;
  return Vec128<T, N>{v.raw >> vec_splats(static_cast<TI>(bits))};
#else
  // vec_sra takes an unsigned shift-count vector even for signed lanes.
  using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
  return Vec128<T, N>{vec_sra(v.raw, vec_splats(static_cast<TU>(bits)))};
#endif
}
  708. // ------------------------------ ShiftLeft
  709. template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  710. HWY_API Vec128<T, N> ShiftLeft(Vec128<T, N> v) {
  711. static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  712. return ShiftLeftSame(v, kBits);
  713. }
  714. // ------------------------------ ShiftRight
  715. template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  716. HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) {
  717. static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  718. return ShiftRightSame(v, kBits);
  719. }
  720. // ------------------------------ BroadcastSignBit
  721. template <typename T, size_t N, HWY_IF_SIGNED(T)>
  722. HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
  723. return ShiftRightSame(v, static_cast<int>(sizeof(T) * 8 - 1));
  724. }
// ================================================== SWIZZLE (1)

// ------------------------------ TableLookupBytes

// For each byte of "from", selects the byte of "bytes" whose index is given
// by the low bits of that byte (vec_perm semantics).
template <typename T, size_t N, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytes(Vec128<T, N> bytes,
                                        Vec128<TI, NI> from) {
  const Repartition<uint8_t, DFromV<decltype(from)>> du8_from;
  return Vec128<TI, NI>{reinterpret_cast<typename detail::Raw128<TI>::type>(
      vec_perm(bytes.raw, bytes.raw, BitCast(du8_from, from).raw))};
}
  734. // ------------------------------ TableLookupBytesOr0
  735. // For all vector widths; Altivec/VSX needs zero out
  736. template <class V, class VI>
  737. HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
  738. const DFromV<VI> di;
  739. Repartition<int8_t, decltype(di)> di8;
  740. const VI zeroOutMask = BitCast(di, BroadcastSignBit(BitCast(di8, from)));
  741. return AndNot(zeroOutMask, TableLookupBytes(bytes, from));
  742. }
// ------------------------------ Reverse

// Reverses the order of all lanes in a full vector.
template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1)>
HWY_API Vec128<T> Reverse(D /* tag */, Vec128<T> v) {
  return Vec128<T>{vec_reve(v.raw)};
}
// ------------------------------ Shuffles (Reverse)

// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
// Shuffle0321 rotates one lane to the right (the previous least-significant
// lane is now most-significant). These could also be implemented via
// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.

// Swap 32-bit halves in 64-bit halves.
template <typename T, size_t N>
HWY_API Vec128<T, N> Shuffle2301(Vec128<T, N> v) {
  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  // Byte indices swapping each pair of adjacent 32-bit lanes.
  const __vector unsigned char kShuffle = {4,  5,  6,  7,  0, 1, 2,  3,
                                           12, 13, 14, 15, 8, 9, 10, 11};
  return Vec128<T, N>{vec_perm(v.raw, v.raw, kShuffle)};
}
// These are used by generic_ops-inl to implement LoadInterleaved3. As with
// Intel's shuffle* intrinsics and InterleaveLower, the lower half of the output
// comes from the first argument.
namespace detail {

// Byte indices >= 16 in the vec_perm tables below select from the second
// argument b; indices < 16 select from a.

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ShuffleTwo2301(Vec32<T> a, Vec32<T> b) {
  const __vector unsigned char kShuffle16 = {1, 0, 19, 18};
  return Vec32<T>{vec_perm(a.raw, b.raw, kShuffle16)};
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ShuffleTwo2301(Vec64<T> a, Vec64<T> b) {
  const __vector unsigned char kShuffle = {2, 3, 0, 1, 22, 23, 20, 21};
  return Vec64<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ShuffleTwo2301(Vec128<T> a, Vec128<T> b) {
  const __vector unsigned char kShuffle = {4,  5,  6,  7,  0,  1,  2,  3,
                                           28, 29, 30, 31, 24, 25, 26, 27};
  return Vec128<T>{vec_perm(a.raw, b.raw, kShuffle)};
}

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ShuffleTwo1230(Vec32<T> a, Vec32<T> b) {
  const __vector unsigned char kShuffle = {0, 3, 18, 17};
  return Vec32<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ShuffleTwo1230(Vec64<T> a, Vec64<T> b) {
  const __vector unsigned char kShuffle = {0, 1, 6, 7, 20, 21, 18, 19};
  return Vec64<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ShuffleTwo1230(Vec128<T> a, Vec128<T> b) {
  const __vector unsigned char kShuffle = {0,  1,  2,  3,  12, 13, 14, 15,
                                           24, 25, 26, 27, 20, 21, 22, 23};
  return Vec128<T>{vec_perm(a.raw, b.raw, kShuffle)};
}

template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ShuffleTwo3012(Vec32<T> a, Vec32<T> b) {
  const __vector unsigned char kShuffle = {2, 1, 16, 19};
  return Vec32<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ShuffleTwo3012(Vec64<T> a, Vec64<T> b) {
  const __vector unsigned char kShuffle = {4, 5, 2, 3, 16, 17, 22, 23};
  return Vec64<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ShuffleTwo3012(Vec128<T> a, Vec128<T> b) {
  const __vector unsigned char kShuffle = {8,  9,  10, 11, 4,  5,  6,  7,
                                           16, 17, 18, 19, 28, 29, 30, 31};
  return Vec128<T>{vec_perm(a.raw, b.raw, kShuffle)};
}

}  // namespace detail
// Swap 64-bit halves
template <class T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle1032(Vec128<T> v) {
  // Reversing the two u64 lanes swaps the 64-bit halves without reordering
  // the 32-bit lanes inside each half.
  const Full128<T> d;
  const Full128<uint64_t> du64;
  return BitCast(d, Reverse(du64, BitCast(du64, v)));
}

template <class T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> Shuffle01(Vec128<T> v) {
  return Reverse(Full128<T>(), v);
}
  826. // Rotate right 32 bits
  827. template <class T, HWY_IF_T_SIZE(T, 4)>
  828. HWY_API Vec128<T> Shuffle0321(Vec128<T> v) {
  829. #if HWY_IS_LITTLE_ENDIAN
  830. return Vec128<T>{vec_sld(v.raw, v.raw, 12)};
  831. #else
  832. return Vec128<T>{vec_sld(v.raw, v.raw, 4)};
  833. #endif
  834. }
  835. // Rotate left 32 bits
  836. template <class T, HWY_IF_T_SIZE(T, 4)>
  837. HWY_API Vec128<T> Shuffle2103(Vec128<T> v) {
  838. #if HWY_IS_LITTLE_ENDIAN
  839. return Vec128<T>{vec_sld(v.raw, v.raw, 4)};
  840. #else
  841. return Vec128<T>{vec_sld(v.raw, v.raw, 12)};
  842. #endif
  843. }
  844. template <class T, HWY_IF_T_SIZE(T, 4)>
  845. HWY_API Vec128<T> Shuffle0123(Vec128<T> v) {
  846. return Reverse(Full128<T>(), v);
  847. }
  848. // ================================================== COMPARE
  849. // Comparisons fill a lane with 1-bits if the condition is true, else 0.
  850. template <class DTo, typename TFrom, size_t NFrom>
  851. HWY_API MFromD<DTo> RebindMask(DTo /*dto*/, Mask128<TFrom, NFrom> m) {
  852. static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
  853. return MFromD<DTo>{m.raw};
  854. }
  855. template <typename T, size_t N>
  856. HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
  857. static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  858. return (v & bit) == bit;
  859. }
  860. // ------------------------------ Equality
  861. template <typename T, size_t N>
  862. HWY_API Mask128<T, N> operator==(Vec128<T, N> a, Vec128<T, N> b) {
  863. return Mask128<T, N>{vec_cmpeq(a.raw, b.raw)};
  864. }
  865. // ------------------------------ Inequality
  866. // This cannot have T as a template argument, otherwise it is not more
  867. // specialized than rewritten operator== in C++20, leading to compile
  868. // errors: https://gcc.godbolt.org/z/xsrPhPvPT.
  869. template <size_t N>
  870. HWY_API Mask128<uint8_t, N> operator!=(Vec128<uint8_t, N> a,
  871. Vec128<uint8_t, N> b) {
  872. #if HWY_PPC_HAVE_9
  873. return Mask128<uint8_t, N>{vec_cmpne(a.raw, b.raw)};
  874. #else
  875. return Not(a == b);
  876. #endif
  877. }
  878. template <size_t N>
  879. HWY_API Mask128<uint16_t, N> operator!=(Vec128<uint16_t, N> a,
  880. Vec128<uint16_t, N> b) {
  881. #if HWY_PPC_HAVE_9
  882. return Mask128<uint16_t, N>{vec_cmpne(a.raw, b.raw)};
  883. #else
  884. return Not(a == b);
  885. #endif
  886. }
  887. template <size_t N>
  888. HWY_API Mask128<uint32_t, N> operator!=(Vec128<uint32_t, N> a,
  889. Vec128<uint32_t, N> b) {
  890. #if HWY_PPC_HAVE_9
  891. return Mask128<uint32_t, N>{vec_cmpne(a.raw, b.raw)};
  892. #else
  893. return Not(a == b);
  894. #endif
  895. }
  896. template <size_t N>
  897. HWY_API Mask128<uint64_t, N> operator!=(Vec128<uint64_t, N> a,
  898. Vec128<uint64_t, N> b) {
  899. return Not(a == b);
  900. }
  901. template <size_t N>
  902. HWY_API Mask128<int8_t, N> operator!=(Vec128<int8_t, N> a,
  903. Vec128<int8_t, N> b) {
  904. #if HWY_PPC_HAVE_9
  905. return Mask128<int8_t, N>{vec_cmpne(a.raw, b.raw)};
  906. #else
  907. return Not(a == b);
  908. #endif
  909. }
  910. template <size_t N>
  911. HWY_API Mask128<int16_t, N> operator!=(Vec128<int16_t, N> a,
  912. Vec128<int16_t, N> b) {
  913. #if HWY_PPC_HAVE_9
  914. return Mask128<int16_t, N>{vec_cmpne(a.raw, b.raw)};
  915. #else
  916. return Not(a == b);
  917. #endif
  918. }
  919. template <size_t N>
  920. HWY_API Mask128<int32_t, N> operator!=(Vec128<int32_t, N> a,
  921. Vec128<int32_t, N> b) {
  922. #if HWY_PPC_HAVE_9
  923. return Mask128<int32_t, N>{vec_cmpne(a.raw, b.raw)};
  924. #else
  925. return Not(a == b);
  926. #endif
  927. }
  928. template <size_t N>
  929. HWY_API Mask128<int64_t, N> operator!=(Vec128<int64_t, N> a,
  930. Vec128<int64_t, N> b) {
  931. return Not(a == b);
  932. }
  933. template <size_t N>
  934. HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) {
  935. return Not(a == b);
  936. }
  937. template <size_t N>
  938. HWY_API Mask128<double, N> operator!=(Vec128<double, N> a,
  939. Vec128<double, N> b) {
  940. return Not(a == b);
  941. }
  942. // ------------------------------ Strict inequality
  943. template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
  944. HWY_INLINE Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
  945. return Mask128<T, N>{vec_cmpgt(a.raw, b.raw)};
  946. }
  947. // ------------------------------ Weak inequality
  948. template <typename T, size_t N, HWY_IF_FLOAT(T)>
  949. HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
  950. return Mask128<T, N>{vec_cmpge(a.raw, b.raw)};
  951. }
  952. template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  953. HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
  954. return Not(b > a);
  955. }
  956. // ------------------------------ Reversed comparisons
  957. template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
  958. HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) {
  959. return b > a;
  960. }
  961. template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
  962. HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) {
  963. return b >= a;
  964. }
  965. // ================================================== MEMORY (2)
  966. // ------------------------------ Load
  967. template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
  968. HWY_API Vec128<T> LoadU(D /* tag */, const T* HWY_RESTRICT p) {
  969. using LoadRaw = typename detail::Raw128<T>::UnalignedRawVec;
  970. const LoadRaw* HWY_RESTRICT praw = reinterpret_cast<const LoadRaw*>(p);
  971. using ResultRaw = typename detail::Raw128<T>::type;
  972. return Vec128<T>{reinterpret_cast<ResultRaw>(*praw)};
  973. }
  974. // For < 128 bit, LoadU == Load.
  975. template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
  976. HWY_API VFromD<D> LoadU(D d, const T* HWY_RESTRICT p) {
  977. return Load(d, p);
  978. }
  979. // 128-bit SIMD => nothing to duplicate, same as an unaligned load.
  980. template <class D, typename T = TFromD<D>>
  981. HWY_API VFromD<D> LoadDup128(D d, const T* HWY_RESTRICT p) {
  982. return LoadU(d, p);
  983. }
  984. #if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
#ifdef HWY_NATIVE_LOAD_N
#undef HWY_NATIVE_LOAD_N
#else
#define HWY_NATIVE_LOAD_N
#endif
// Loads at most `max_lanes_to_load` lanes (clamped to the vector size);
// remaining lanes are zero. Uses the target's length-limited load so no
// bytes past the requested count are touched.
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> LoadN(D d, const T* HWY_RESTRICT p,
                        size_t max_lanes_to_load) {
#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
  // Resolve compile-time-constant counts to the cheaper Zero/LoadU paths.
  if (__builtin_constant_p(max_lanes_to_load) && max_lanes_to_load == 0) {
    return Zero(d);
  }
  if (__builtin_constant_p(max_lanes_to_load >= HWY_MAX_LANES_D(D)) &&
      max_lanes_to_load >= HWY_MAX_LANES_D(D)) {
    return LoadU(d, p);
  }
#endif
  const size_t num_of_bytes_to_load =
      HWY_MIN(max_lanes_to_load, HWY_MAX_LANES_D(D)) * sizeof(TFromD<D>);
  const Repartition<uint8_t, decltype(d)> du8;
#if HWY_S390X_HAVE_Z14
  // vec_load_len takes the index of the last byte (inclusive), hence the
  // -1 and the separate zero-length check.
  return (num_of_bytes_to_load > 0)
             ? BitCast(d, VFromD<decltype(du8)>{vec_load_len(
                              const_cast<unsigned char*>(
                                  reinterpret_cast<const unsigned char*>(p)),
                              static_cast<unsigned>(num_of_bytes_to_load - 1))})
             : Zero(d);
#else
  // vec_xl_len takes a byte count and handles zero.
  return BitCast(
      d,
      VFromD<decltype(du8)>{vec_xl_len(
          const_cast<unsigned char*>(reinterpret_cast<const unsigned char*>(p)),
          num_of_bytes_to_load)});
#endif
}
// Like LoadN, but lanes past `max_lanes_to_load` take their value from `no`
// instead of zero.
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const T* HWY_RESTRICT p,
                          size_t max_lanes_to_load) {
#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
  // Resolve compile-time-constant counts to the cheaper paths.
  if (__builtin_constant_p(max_lanes_to_load) && max_lanes_to_load == 0) {
    return no;
  }
  if (__builtin_constant_p(max_lanes_to_load >= HWY_MAX_LANES_D(D)) &&
      max_lanes_to_load >= HWY_MAX_LANES_D(D)) {
    return LoadU(d, p);
  }
#endif
  // LoadN already avoids reading past the requested bytes; blend in `no`.
  return IfThenElse(FirstN(d, max_lanes_to_load),
                    LoadN(d, p, max_lanes_to_load), no);
}
  1035. #endif // HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
// Returns a vector with lane i=[0, N) set to "first" + i.
namespace detail {
// Iota0 returns {0, 1, 2, ...} in the lane type of D, built from a
// compile-time vector constant per lane size.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<D> Iota0(D d) {
  constexpr __vector unsigned char kU8Iota0 = {0, 1, 2, 3, 4, 5, 6, 7,
                                               8, 9, 10, 11, 12, 13, 14, 15};
  return BitCast(d, VFromD<RebindToUnsigned<D>>{kU8Iota0});
}
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
  constexpr __vector unsigned short kU16Iota0 = {0, 1, 2, 3, 4, 5, 6, 7};
  return BitCast(d, VFromD<RebindToUnsigned<D>>{kU16Iota0});
}
template <class D, HWY_IF_UI32_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
  constexpr __vector unsigned int kU32Iota0 = {0, 1, 2, 3};
  return BitCast(d, VFromD<RebindToUnsigned<D>>{kU32Iota0});
}
template <class D, HWY_IF_UI64_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
  constexpr __vector unsigned long long kU64Iota0 = {0, 1};
  return BitCast(d, VFromD<RebindToUnsigned<D>>{kU64Iota0});
}
// Float lanes use float constants directly; no BitCast needed.
template <class D, HWY_IF_F32_D(D)>
HWY_INLINE VFromD<D> Iota0(D /*d*/) {
  constexpr __vector float kF32Iota0 = {0.0f, 1.0f, 2.0f, 3.0f};
  return VFromD<D>{kF32Iota0};
}
template <class D, HWY_IF_F64_D(D)>
HWY_INLINE VFromD<D> Iota0(D /*d*/) {
  constexpr __vector double kF64Iota0 = {0.0, 1.0};
  return VFromD<D>{kF64Iota0};
}
}  // namespace detail
  1070. template <class D, typename T2>
  1071. HWY_API VFromD<D> Iota(D d, const T2 first) {
  1072. return detail::Iota0(d) + Set(d, static_cast<TFromD<D>>(first));
  1073. }
  1074. // ------------------------------ FirstN (Iota, Lt)
  1075. template <class D>
  1076. HWY_API MFromD<D> FirstN(D d, size_t num) {
  1077. const RebindToUnsigned<decltype(d)> du;
  1078. using TU = TFromD<decltype(du)>;
  1079. return RebindMask(d, Iota(du, 0) < Set(du, static_cast<TU>(num)));
  1080. }
  1081. // ------------------------------ MaskedLoad
  1082. template <class D, typename T = TFromD<D>>
  1083. HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, const T* HWY_RESTRICT p) {
  1084. return IfThenElseZero(m, LoadU(d, p));
  1085. }
  1086. // ------------------------------ MaskedLoadOr
  1087. template <class D, typename T = TFromD<D>>
  1088. HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
  1089. const T* HWY_RESTRICT p) {
  1090. return IfThenElse(m, LoadU(d, p), v);
  1091. }
  1092. // ------------------------------ Store
  1093. template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
  1094. HWY_API void Store(Vec128<T> v, D /* tag */, T* HWY_RESTRICT aligned) {
  1095. // Suppress the ignoring attributes warning that is generated by
  1096. // HWY_RCAST_ALIGNED(StoreRaw*, aligned) with GCC
  1097. #if HWY_COMPILER_GCC
  1098. HWY_DIAGNOSTICS(push)
  1099. HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
  1100. #endif
  1101. using StoreRaw = typename detail::Raw128<T>::AlignedRawVec;
  1102. *HWY_RCAST_ALIGNED(StoreRaw*, aligned) = reinterpret_cast<StoreRaw>(v.raw);
  1103. #if HWY_COMPILER_GCC
  1104. HWY_DIAGNOSTICS(pop)
  1105. #endif
  1106. }
  1107. template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
  1108. HWY_API void StoreU(Vec128<T> v, D /* tag */, T* HWY_RESTRICT p) {
  1109. using StoreRaw = typename detail::Raw128<T>::UnalignedRawVec;
  1110. *reinterpret_cast<StoreRaw*>(p) = reinterpret_cast<StoreRaw>(v.raw);
  1111. }
  1112. template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
  1113. HWY_API void Store(VFromD<D> v, D d, T* HWY_RESTRICT p) {
  1114. using BitsT = UnsignedFromSize<d.MaxBytes()>;
  1115. const Repartition<BitsT, decltype(d)> d_bits;
  1116. const BitsT bits = GetLane(BitCast(d_bits, v));
  1117. CopyBytes<d.MaxBytes()>(&bits, p);
  1118. }
  1119. // For < 128 bit, StoreU == Store.
  1120. template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
  1121. HWY_API void StoreU(VFromD<D> v, D d, T* HWY_RESTRICT p) {
  1122. Store(v, d, p);
  1123. }
  1124. #if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
#ifdef HWY_NATIVE_STORE_N
#undef HWY_NATIVE_STORE_N
#else
#define HWY_NATIVE_STORE_N
#endif
// Stores at most `max_lanes_to_store` lanes (clamped to the vector size)
// using the target's length-limited store; no bytes past the requested
// count are written.
template <class D, typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
  // Resolve compile-time-constant counts to the cheaper no-op/StoreU paths.
  if (__builtin_constant_p(max_lanes_to_store) && max_lanes_to_store == 0) {
    return;
  }
  if (__builtin_constant_p(max_lanes_to_store >= HWY_MAX_LANES_D(D)) &&
      max_lanes_to_store >= HWY_MAX_LANES_D(D)) {
    StoreU(v, d, p);
    return;
  }
#endif
  const size_t num_of_bytes_to_store =
      HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D)) * sizeof(TFromD<D>);
  const Repartition<uint8_t, decltype(d)> du8;
#if HWY_S390X_HAVE_Z14
  // vec_store_len takes the index of the last byte (inclusive), hence the
  // -1 and the zero-length guard.
  if (num_of_bytes_to_store > 0) {
    vec_store_len(BitCast(du8, v).raw, reinterpret_cast<unsigned char*>(p),
                  static_cast<unsigned>(num_of_bytes_to_store - 1));
  }
#else
  // vec_xst_len takes a byte count and handles zero.
  vec_xst_len(BitCast(du8, v).raw, reinterpret_cast<unsigned char*>(p),
              num_of_bytes_to_store);
#endif
}
  1156. #endif
  1157. // ------------------------------ BlendedStore
  1158. template <class D>
  1159. HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
  1160. TFromD<D>* HWY_RESTRICT p) {
  1161. const VFromD<D> old = LoadU(d, p);
  1162. StoreU(IfThenElse(RebindMask(d, m), v, old), d, p);
  1163. }
  1164. // ================================================== ARITHMETIC
  1165. namespace detail {
  1166. // If TFromD<D> is an integer type, detail::RebindToUnsignedIfNotFloat<D>
  1167. // rebinds D to MakeUnsigned<TFromD<D>>.
  1168. // Otherwise, if TFromD<D> is a floating-point type (including F16 and BF16),
  1169. // detail::RebindToUnsignedIfNotFloat<D> is the same as D.
  1170. template <class D>
  1171. using RebindToUnsignedIfNotFloat =
  1172. hwy::If<(!hwy::IsFloat<TFromD<D>>() && !hwy::IsSpecialFloat<TFromD<D>>()),
  1173. RebindToUnsigned<D>, D>;
  1174. } // namespace detail
  1175. // ------------------------------ Addition
  1176. template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
  1177. HWY_API Vec128<T, N> operator+(Vec128<T, N> a, Vec128<T, N> b) {
  1178. const DFromV<decltype(a)> d;
  1179. const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
  1180. // If T is an integer type, do an unsigned vec_add to avoid undefined behavior
  1181. #if HWY_S390X_HAVE_Z14
  1182. return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw +
  1183. BitCast(d_arith, b).raw});
  1184. #else
  1185. return BitCast(d, VFromD<decltype(d_arith)>{vec_add(
  1186. BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
  1187. #endif
  1188. }
  1189. // ------------------------------ Subtraction
  1190. template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
  1191. HWY_API Vec128<T, N> operator-(Vec128<T, N> a, Vec128<T, N> b) {
  1192. const DFromV<decltype(a)> d;
  1193. const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
  1194. // If T is an integer type, do an unsigned vec_sub to avoid undefined behavior
  1195. #if HWY_S390X_HAVE_Z14
  1196. return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw -
  1197. BitCast(d_arith, b).raw});
  1198. #else
  1199. return BitCast(d, VFromD<decltype(d_arith)>{vec_sub(
  1200. BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
  1201. #endif
  1202. }
  1203. // ------------------------------ SumsOf8
  1204. template <class V, HWY_IF_U8(TFromV<V>)>
  1205. HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8(V v) {
  1206. return SumsOf2(SumsOf4(v));
  1207. }
  1208. template <class V, HWY_IF_I8(TFromV<V>)>
  1209. HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8(V v) {
  1210. #if HWY_S390X_HAVE_Z14
  1211. const DFromV<decltype(v)> di8;
  1212. const RebindToUnsigned<decltype(di8)> du8;
  1213. const RepartitionToWideX3<decltype(di8)> di64;
  1214. return BitCast(di64, SumsOf8(BitCast(du8, Xor(v, SignBit(di8))))) +
  1215. Set(di64, int64_t{-1024});
  1216. #else
  1217. return SumsOf2(SumsOf4(v));
  1218. #endif
  1219. }
  1220. // ------------------------------ SaturatedAdd
  1221. // Returns a + b clamped to the destination range.
  1222. #if HWY_S390X_HAVE_Z14
// Z14/Z15/Z16 does not have I8/U8/I16/U16 SaturatedAdd instructions unlike most
// other integer SIMD instruction sets
// Unsigned: clamp b to the headroom Not(a) = MAX - a, so a + min(b, MAX-a)
// never wraps.
template <typename T, size_t N, HWY_IF_UNSIGNED(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
  return Add(a, Min(b, Not(a)));
}
// Signed: detect overflow (operands share a sign that differs from the
// sum's sign), then select MAX or MIN according to a's sign.
template <typename T, size_t N, HWY_IF_SIGNED(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const auto sum = Add(a, b);
  const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum));
  // a >= 0 -> MAX, a < 0 -> MIN (sign bits of a XOR MAX).
  const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax<T>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, sum);
}
  1239. #else // VSX
  1240. #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
  1241. #undef HWY_NATIVE_I32_SATURATED_ADDSUB
  1242. #else
  1243. #define HWY_NATIVE_I32_SATURATED_ADDSUB
  1244. #endif
  1245. #ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
  1246. #undef HWY_NATIVE_U32_SATURATED_ADDSUB
  1247. #else
  1248. #define HWY_NATIVE_U32_SATURATED_ADDSUB
  1249. #endif
// VSX: native saturating add for 8/16/32-bit integer lanes.
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_adds(a.raw, b.raw)};
}
  1255. #endif // HWY_S390X_HAVE_Z14
  1256. #if HWY_PPC_HAVE_10
  1257. #ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
  1258. #undef HWY_NATIVE_I64_SATURATED_ADDSUB
  1259. #else
  1260. #define HWY_NATIVE_I64_SATURATED_ADDSUB
  1261. #endif
// I64 saturating add, emulated via overflow detection.
template <class V, HWY_IF_I64_D(DFromV<V>)>
HWY_API V SaturatedAdd(V a, V b) {
  const DFromV<decltype(a)> d;
  const auto sum = Add(a, b);
  // TernaryLogic<0x42> computes the overflow condition: true when a and b
  // share a sign that differs from sum's sign.
  const auto overflow_mask =
      BroadcastSignBit(detail::TernaryLogic<0x42>(a, b, sum));
  // a >= 0 -> LimitsMax, a < 0 -> LimitsMin.
  const auto overflow_result =
      Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, sum);
}
  1272. #endif // HWY_PPC_HAVE_10
  1273. // ------------------------------ SaturatedSub
  1274. // Returns a - b clamped to the destination range.
  1275. #if HWY_S390X_HAVE_Z14
// Z14/Z15/Z16 does not have I8/U8/I16/U16 SaturatedSub instructions unlike most
// other integer SIMD instruction sets
// Unsigned: a - min(a, b) clamps at zero instead of wrapping.
template <typename T, size_t N, HWY_IF_UNSIGNED(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
  return Sub(a, Min(a, b));
}
// Signed: overflow iff a and b have different signs and the difference's
// sign differs from a's.
template <typename T, size_t N, HWY_IF_SIGNED(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const auto diff = Sub(a, b);
  const auto overflow_mask = And(Xor(a, b), Xor(a, diff));
  // a >= 0 -> MAX, a < 0 -> MIN.
  const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax<T>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, diff);
}
  1292. #else // VSX
// VSX: native saturating subtract for 8/16/32-bit integer lanes.
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_subs(a.raw, b.raw)};
}
  1298. #endif // HWY_S390X_HAVE_Z14
  1299. #if HWY_PPC_HAVE_10
// I64 saturating subtract, emulated via overflow detection.
template <class V, HWY_IF_I64_D(DFromV<V>)>
HWY_API V SaturatedSub(V a, V b) {
  const DFromV<decltype(a)> d;
  const auto diff = Sub(a, b);
  // TernaryLogic<0x18> computes the overflow condition for subtraction.
  const auto overflow_mask =
      BroadcastSignBit(detail::TernaryLogic<0x18>(a, b, diff));
  // a >= 0 -> LimitsMax, a < 0 -> LimitsMin.
  const auto overflow_result =
      Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, diff);
}
  1310. #endif // HWY_PPC_HAVE_10
  1311. // ------------------------------ AverageRound
  1312. // Returns (a + b + 1) / 2
  1313. template <typename T, size_t N, HWY_IF_UNSIGNED(T),
  1314. HWY_IF_T_SIZE_ONE_OF(T, 0x6)>
  1315. HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, Vec128<T, N> b) {
  1316. return Vec128<T, N>{vec_avg(a.raw, b.raw)};
  1317. }
  1318. // ------------------------------ Multiplication
  1319. // Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
  1320. #ifdef HWY_NATIVE_MUL_8
  1321. #undef HWY_NATIVE_MUL_8
  1322. #else
  1323. #define HWY_NATIVE_MUL_8
  1324. #endif
  1325. #ifdef HWY_NATIVE_MUL_64
  1326. #undef HWY_NATIVE_MUL_64
  1327. #else
  1328. #define HWY_NATIVE_MUL_64
  1329. #endif
// Lane-wise multiply for all integer sizes and floats.
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> operator*(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
// If T is an integer type, do an unsigned vec_mul to avoid undefined behavior
#if HWY_S390X_HAVE_Z14
  // Z14: plain operator* on the raw vectors.
  return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw *
                                              BitCast(d_arith, b).raw});
#else
  return BitCast(d, VFromD<decltype(d_arith)>{vec_mul(
                        BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
#endif
}
  1343. // Returns the upper sizeof(T)*8 bits of a * b in each lane.
  1344. #if HWY_S390X_HAVE_Z14
  1345. #define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
  1346. HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))
  1347. #define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
  1348. hwy::EnableIf<!hwy::IsSame<T, T>()>* = nullptr
  1349. #elif HWY_PPC_HAVE_10
  1350. #define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
  1351. HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))
  1352. #define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
  1353. HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))
  1354. #else
  1355. #define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
  1356. hwy::EnableIf<!hwy::IsSame<T, T>()>* = nullptr
  1357. #define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
  1358. HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))
  1359. #endif
#if HWY_S390X_HAVE_Z14 || HWY_PPC_HAVE_10
// Native high-half multiply where the target provides vec_mulh (sizes per
// the HWY_PPC_IF_MULHIGH_USING_VEC_MULH macro above).
template <typename T, size_t N, HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_mulh(a.raw, b.raw)};
}
#endif
// Single-lane fallback: compute the double-wide product of lane 0 via
// MulEven, then extract its upper half.
template <typename T, HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, 1> MulHigh(Vec128<T, 1> a, Vec128<T, 1> b) {
  const auto p_even = MulEven(a, b);
#if HWY_IS_LITTLE_ENDIAN
  // On LE the high half of the product is the upper lane; rotate it down
  // into lane 0.
  const auto p_even_full = ResizeBitCast(Full128<T>(), p_even);
  return Vec128<T, 1>{
      vec_sld(p_even_full.raw, p_even_full.raw, 16 - sizeof(T))};
#else
  // On BE the high half is already in the first lane.
  const DFromV<decltype(a)> d;
  return ResizeBitCast(d, p_even);
#endif
}
// Multi-lane fallback: the high halves of all products are assembled from
// the even- and odd-lane double-wide products.
template <typename T, size_t N,
          HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T), HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const auto p_even = BitCast(d, MulEven(a, b));
  const auto p_odd = BitCast(d, MulOdd(a, b));
  // Which half of each wide product holds the high bits depends on
  // endianness.
#if HWY_IS_LITTLE_ENDIAN
  return InterleaveOdd(d, p_even, p_odd);
#else
  return InterleaveEven(d, p_even, p_odd);
#endif
}
#if !HWY_PPC_HAVE_10
// Pre-Power10 64-bit MulHigh: scalar 128-bit multiply per lane via Mul128.
template <class T, HWY_IF_UI64(T)>
HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
  T p_hi;
  Mul128(GetLane(a), GetLane(b), &p_hi);
  return Set(Full64<T>(), p_hi);
}
// Full vector: process each 64-bit half independently and recombine.
template <class T, HWY_IF_UI64(T)>
HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
  const DFromV<decltype(a)> d;
  const Half<decltype(d)> dh;
  return Combine(d, MulHigh(UpperHalf(dh, a), UpperHalf(dh, b)),
                 MulHigh(LowerHalf(dh, a), LowerHalf(dh, b)));
}
#endif  // !HWY_PPC_HAVE_10
  1408. #undef HWY_PPC_IF_MULHIGH_USING_VEC_MULH
  1409. #undef HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH
// Multiplies even lanes (0, 2, ...); each double-wide product occupies one
// lane of the returned wide vector.
template <typename T, size_t N,
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulEven(Vec128<T, N> a,
                                                 Vec128<T, N> b) {
  return Vec128<MakeWide<T>, (N + 1) / 2>{vec_mule(a.raw, b.raw)};
}
// Multiplies odd lanes (1, 3, ...); each double-wide product occupies one
// lane of the returned wide vector.
template <typename T, size_t N,
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(Vec128<T, N> a,
                                                Vec128<T, N> b) {
  return Vec128<MakeWide<T>, (N + 1) / 2>{vec_mulo(a.raw, b.raw)};
}
  1428. // ------------------------------ Rol/Ror
  1429. #ifdef HWY_NATIVE_ROL_ROR_8
  1430. #undef HWY_NATIVE_ROL_ROR_8
  1431. #else
  1432. #define HWY_NATIVE_ROL_ROR_8
  1433. #endif
  1434. #ifdef HWY_NATIVE_ROL_ROR_16
  1435. #undef HWY_NATIVE_ROL_ROR_16
  1436. #else
  1437. #define HWY_NATIVE_ROL_ROR_16
  1438. #endif
  1439. #ifdef HWY_NATIVE_ROL_ROR_32_64
  1440. #undef HWY_NATIVE_ROL_ROR_32_64
  1441. #else
  1442. #define HWY_NATIVE_ROL_ROR_32_64
  1443. #endif
// Rotate left by a per-lane variable count, via the native vec_rl.
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> Rol(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, VFromD<decltype(du)>{vec_rl(BitCast(du, a).raw, BitCast(du, b).raw)});
}
// Rotate right = rotate left by the negated count (counts are effectively
// taken modulo the lane width).
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  return Rol(a, BitCast(d, Neg(BitCast(di, b))));
}
  1457. // ------------------------------ RotateRight
  1458. template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  1459. HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
  1460. const DFromV<decltype(v)> d;
  1461. constexpr size_t kSizeInBits = sizeof(T) * 8;
  1462. static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  1463. return (kBits == 0)
  1464. ? v
  1465. : Rol(v, Set(d, static_cast<T>(static_cast<int>(kSizeInBits) -
  1466. kBits)));
  1467. }
  1468. // ------------------------------ RotateLeftSame/RotateRightSame
  1469. #ifdef HWY_NATIVE_ROL_ROR_SAME_8
  1470. #undef HWY_NATIVE_ROL_ROR_SAME_8
  1471. #else
  1472. #define HWY_NATIVE_ROL_ROR_SAME_8
  1473. #endif
  1474. #ifdef HWY_NATIVE_ROL_ROR_SAME_16
  1475. #undef HWY_NATIVE_ROL_ROR_SAME_16
  1476. #else
  1477. #define HWY_NATIVE_ROL_ROR_SAME_16
  1478. #endif
  1479. #ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
  1480. #undef HWY_NATIVE_ROL_ROR_SAME_32_64
  1481. #else
  1482. #define HWY_NATIVE_ROL_ROR_SAME_32_64
  1483. #endif
// Rotate all lanes left by the same (runtime) count; vec_rl uses the count
// modulo the lane width, so any `bits` value is safe.
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> RotateLeftSame(Vec128<T, N> v, int bits) {
  const DFromV<decltype(v)> d;
  return Rol(v, Set(d, static_cast<T>(static_cast<unsigned>(bits))));
}
// Rotate right = rotate left by the (unsigned-wrapped) negated count.
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> RotateRightSame(Vec128<T, N> v, int bits) {
  const DFromV<decltype(v)> d;
  return Rol(v, Set(d, static_cast<T>(0u - static_cast<unsigned>(bits))));
}
  1494. // ------------------------------ IfNegativeThenElse
  1495. template <typename T, size_t N>
  1496. HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
  1497. Vec128<T, N> no) {
  1498. static_assert(IsSigned<T>(), "Only works for signed/float");
  1499. const DFromV<decltype(v)> d;
  1500. #if HWY_PPC_HAVE_10
  1501. const RebindToUnsigned<decltype(d)> du;
  1502. return BitCast(
  1503. d, VFromD<decltype(du)>{vec_blendv(
  1504. BitCast(du, no).raw, BitCast(du, yes).raw, BitCast(du, v).raw)});
  1505. #else
  1506. const RebindToSigned<decltype(d)> di;
  1507. return IfVecThenElse(BitCast(d, BroadcastSignBit(BitCast(di, v))), yes, no);
  1508. #endif
  1509. }
  1510. #if HWY_PPC_HAVE_10
  1511. #ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
  1512. #undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
  1513. #else
  1514. #define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
  1515. #endif
  1516. #ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
  1517. #undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
  1518. #else
  1519. #define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
  1520. #endif
  1521. template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
  1522. HWY_API V IfNegativeThenElseZero(V v, V yes) {
  1523. const DFromV<decltype(v)> d;
  1524. return IfNegativeThenElse(v, yes, Zero(d));
  1525. }
  1526. template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
  1527. HWY_API V IfNegativeThenZeroElse(V v, V no) {
  1528. const DFromV<decltype(v)> d;
  1529. return IfNegativeThenElse(v, Zero(d), no);
  1530. }
  1531. #endif
  1532. // generic_ops takes care of integer T.
  1533. template <typename T, size_t N, HWY_IF_FLOAT(T)>
  1534. HWY_API Vec128<T, N> AbsDiff(Vec128<T, N> a, Vec128<T, N> b) {
  1535. return Abs(a - b);
  1536. }
  1537. // ------------------------------ Floating-point multiply-add variants
  1538. // Returns mul * x + add
  1539. template <typename T, size_t N, HWY_IF_FLOAT(T)>
  1540. HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
  1541. Vec128<T, N> add) {
  1542. return Vec128<T, N>{vec_madd(mul.raw, x.raw, add.raw)};
  1543. }
  1544. // Returns add - mul * x
  1545. template <typename T, size_t N, HWY_IF_FLOAT(T)>
  1546. HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
  1547. Vec128<T, N> add) {
  1548. // NOTE: the vec_nmsub operation below computes -(mul * x - add),
  1549. // which is equivalent to add - mul * x in the round-to-nearest
  1550. // and round-towards-zero rounding modes
  1551. return Vec128<T, N>{vec_nmsub(mul.raw, x.raw, add.raw)};
  1552. }
  1553. // Returns mul * x - sub
  1554. template <typename T, size_t N, HWY_IF_FLOAT(T)>
  1555. HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
  1556. Vec128<T, N> sub) {
  1557. return Vec128<T, N>{vec_msub(mul.raw, x.raw, sub.raw)};
  1558. }
  1559. // Returns -mul * x - sub
  1560. template <typename T, size_t N, HWY_IF_FLOAT(T)>
  1561. HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
  1562. Vec128<T, N> sub) {
  1563. // NOTE: The vec_nmadd operation below computes -(mul * x + sub),
  1564. // which is equivalent to -mul * x - sub in the round-to-nearest
  1565. // and round-towards-zero rounding modes
  1566. return Vec128<T, N>{vec_nmadd(mul.raw, x.raw, sub.raw)};
  1567. }
  1568. // ------------------------------ Floating-point div
  1569. // Approximate reciprocal
  1570. #ifdef HWY_NATIVE_F64_APPROX_RECIP
  1571. #undef HWY_NATIVE_F64_APPROX_RECIP
  1572. #else
  1573. #define HWY_NATIVE_F64_APPROX_RECIP
  1574. #endif
// Lane-wise floating-point division.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
#if HWY_S390X_HAVE_Z14
  // Z14 vector extensions support the native '/' operator on vector types.
  return Vec128<T, N>{a.raw / b.raw};
#else
  return Vec128<T, N>{vec_div(a.raw, b.raw)};
#endif
}
// Approximate reciprocal 1/v. On Z14 there is no reciprocal-estimate
// instruction, so compute the full-precision quotient instead.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> ApproximateReciprocal(Vec128<T, N> v) {
#if HWY_S390X_HAVE_Z14
  const DFromV<decltype(v)> d;
  return Set(d, T(1.0)) / v;
#else
  return Vec128<T, N>{vec_re(v.raw)};
#endif
}
  1592. // ------------------------------ Floating-point square root
  1593. #if HWY_S390X_HAVE_Z14
  1594. // Approximate reciprocal square root
// Approximate reciprocal square root. Z14 lacks an rsqrt-estimate
// instruction, so use the classic "fast inverse square root" bit trick:
// an initial guess derived from the float's bit pattern, refined by one
// Newton-Raphson step.
template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto half = v * Set(d, 0.5f);
  // Initial guess based on log2(f)
  const auto guess = BitCast(
      d, Set(du, uint32_t{0x5F3759DFu}) - ShiftRight<1>(BitCast(du, v)));
  // One Newton-Raphson iteration
  return guess * NegMulAdd(half * guess, guess, Set(d, 1.5f));
}
  1606. #else // VSX
  1607. #ifdef HWY_NATIVE_F64_APPROX_RSQRT
  1608. #undef HWY_NATIVE_F64_APPROX_RSQRT
  1609. #else
  1610. #define HWY_NATIVE_F64_APPROX_RSQRT
  1611. #endif
  1612. // Approximate reciprocal square root
// Approximate reciprocal square root via the VSX estimate instruction.
template <class T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> ApproximateReciprocalSqrt(Vec128<T, N> v) {
  return Vec128<T, N>{vec_rsqrte(v.raw)};
}
  1617. #endif // HWY_S390X_HAVE_Z14
  1618. // Full precision square root
// Full precision square root
template <class T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
  return Vec128<T, N>{vec_sqrt(v.raw)};
}
  1623. // ------------------------------ Min (Gt, IfThenElse)
// Lane-wise minimum (excludes f16/bf16, which are handled elsewhere).
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> Min(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_min(a.raw, b.raw)};
}

// Lane-wise maximum (excludes f16/bf16, which are handled elsewhere).
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> Max(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_max(a.raw, b.raw)};
}
  1633. // ------------------------------- Integer AbsDiff for PPC9/PPC10
  1634. #if HWY_PPC_HAVE_9
  1635. #ifdef HWY_NATIVE_INTEGER_ABS_DIFF
  1636. #undef HWY_NATIVE_INTEGER_ABS_DIFF
  1637. #else
  1638. #define HWY_NATIVE_INTEGER_ABS_DIFF
  1639. #endif
// Unsigned 8/16/32-bit lanes: native absolute-difference instruction.
template <class V, HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API V AbsDiff(const V a, const V b) {
  return V{vec_absd(a.raw, b.raw)};
}

// u64 lanes: vec_absd does not cover 64-bit, so use max - min.
template <class V, HWY_IF_U64_D(DFromV<V>)>
HWY_API V AbsDiff(const V a, const V b) {
  return Sub(Max(a, b), Min(a, b));
}

// Signed lanes: max - min avoids overflow issues of Abs(a - b).
template <class V, HWY_IF_SIGNED_V(V)>
HWY_API V AbsDiff(const V a, const V b) {
  return Sub(Max(a, b), Min(a, b));
}
  1653. #endif // HWY_PPC_HAVE_9
  1654. // ------------------------------ Integer Div for PPC10
  1655. #if HWY_PPC_HAVE_10
  1656. #ifdef HWY_NATIVE_INT_DIV
  1657. #undef HWY_NATIVE_INT_DIV
  1658. #else
  1659. #define HWY_NATIVE_INT_DIV
  1660. #endif
// Lane-wise truncated i32 division.
template <size_t N>
HWY_API Vec128<int32_t, N> operator/(Vec128<int32_t, N> a,
                                     Vec128<int32_t, N> b) {
  // Inline assembly is used instead of vec_div for I32 Div on PPC10 to avoid
  // undefined behavior if b[i] == 0 or
  // (a[i] == LimitsMin<int32_t>() && b[i] == -1)
  // Clang will also optimize out I32 vec_div on PPC10 if optimizations are
  // enabled and any of the lanes of b are known to be zero (even in the unused
  // lanes of a partial vector)
  __vector signed int raw_result;
  __asm__("vdivsw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<int32_t, N>{raw_result};
}

// Lane-wise u32 division.
template <size_t N>
HWY_API Vec128<uint32_t, N> operator/(Vec128<uint32_t, N> a,
                                      Vec128<uint32_t, N> b) {
  // Inline assembly is used instead of vec_div for U32 Div on PPC10 to avoid
  // undefined behavior if b[i] == 0
  // Clang will also optimize out U32 vec_div on PPC10 if optimizations are
  // enabled and any of the lanes of b are known to be zero (even in the unused
  // lanes of a partial vector)
  __vector unsigned int raw_result;
  __asm__("vdivuw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<uint32_t, N>{raw_result};
}

// Lane-wise truncated i64 division.
template <size_t N>
HWY_API Vec128<int64_t, N> operator/(Vec128<int64_t, N> a,
                                     Vec128<int64_t, N> b) {
  // Inline assembly is used instead of vec_div for I64 Div on PPC10 to avoid
  // undefined behavior if b[i] == 0 or
  // (a[i] == LimitsMin<int64_t>() && b[i] == -1)
  // Clang will also optimize out I64 vec_div on PPC10 if optimizations are
  // enabled and any of the lanes of b are known to be zero (even in the unused
  // lanes of a partial vector)
  __vector signed long long raw_result;
  __asm__("vdivsd %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<int64_t, N>{raw_result};
}

// Lane-wise u64 division.
template <size_t N>
HWY_API Vec128<uint64_t, N> operator/(Vec128<uint64_t, N> a,
                                      Vec128<uint64_t, N> b) {
  // Inline assembly is used instead of vec_div for U64 Div on PPC10 to avoid
  // undefined behavior if b[i] == 0
  // Clang will also optimize out U64 vec_div on PPC10 if optimizations are
  // enabled and any of the lanes of b are known to be zero (even in the unused
  // lanes of a partial vector)
  __vector unsigned long long raw_result;
  __asm__("vdivud %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<uint64_t, N>{raw_result};
}
// 8/16-bit division, full vector: widen both halves, divide, then demote
// back (there is no narrow integer divide instruction).
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T> operator/(Vec128<T> a, Vec128<T> b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;
  return OrderedDemote2To(d, PromoteLowerTo(dw, a) / PromoteLowerTo(dw, b),
                          PromoteUpperTo(dw, a) / PromoteUpperTo(dw, b));
}

// 8/16-bit division, partial vector: all lanes fit after a single promotion.
template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
          HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const Rebind<MakeWide<T>, decltype(d)> dw;
  return DemoteTo(d, PromoteTo(dw, a) / PromoteTo(dw, b));
}
// Lane-wise i32 remainder (truncated division semantics).
template <size_t N>
HWY_API Vec128<int32_t, N> operator%(Vec128<int32_t, N> a,
                                     Vec128<int32_t, N> b) {
  // Inline assembly is used instead of vec_mod for I32 Mod on PPC10 to avoid
  // undefined behavior if b[i] == 0 or
  // (a[i] == LimitsMin<int32_t>() && b[i] == -1)
  // Clang will also optimize out I32 vec_mod on PPC10 if optimizations are
  // enabled and any of the lanes of b are known to be zero (even in the unused
  // lanes of a partial vector)
  __vector signed int raw_result;
  __asm__("vmodsw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<int32_t, N>{raw_result};
}

// Lane-wise u32 remainder.
template <size_t N>
HWY_API Vec128<uint32_t, N> operator%(Vec128<uint32_t, N> a,
                                      Vec128<uint32_t, N> b) {
  // Inline assembly is used instead of vec_mod for U32 Mod on PPC10 to avoid
  // undefined behavior if b[i] == 0
  // Clang will also optimize out U32 vec_mod on PPC10 if optimizations are
  // enabled and any of the lanes of b are known to be zero (even in the unused
  // lanes of a partial vector)
  __vector unsigned int raw_result;
  __asm__("vmoduw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<uint32_t, N>{raw_result};
}

// Lane-wise i64 remainder (truncated division semantics).
template <size_t N>
HWY_API Vec128<int64_t, N> operator%(Vec128<int64_t, N> a,
                                     Vec128<int64_t, N> b) {
  // Inline assembly is used instead of vec_mod for I64 Mod on PPC10 to avoid
  // undefined behavior if b[i] == 0 or
  // (a[i] == LimitsMin<int64_t>() && b[i] == -1)
  // Clang will also optimize out I64 vec_mod on PPC10 if optimizations are
  // enabled and any of the lanes of b are known to be zero (even in the unused
  // lanes of a partial vector)
  __vector signed long long raw_result;
  __asm__("vmodsd %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<int64_t, N>{raw_result};
}

// Lane-wise u64 remainder.
template <size_t N>
HWY_API Vec128<uint64_t, N> operator%(Vec128<uint64_t, N> a,
                                      Vec128<uint64_t, N> b) {
  // Inline assembly is used instead of vec_mod for U64 Mod on PPC10 to avoid
  // undefined behavior if b[i] == 0
  // Clang will also optimize out U64 vec_mod on PPC10 if optimizations are
  // enabled and any of the lanes of b are known to be zero (even in the unused
  // lanes of a partial vector)
  __vector unsigned long long raw_result;
  __asm__("vmodud %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<uint64_t, N>{raw_result};
}
// 8/16-bit remainder, full vector: widen both halves, take %, demote back.
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T> operator%(Vec128<T> a, Vec128<T> b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;
  return OrderedDemote2To(d, PromoteLowerTo(dw, a) % PromoteLowerTo(dw, b),
                          PromoteUpperTo(dw, a) % PromoteUpperTo(dw, b));
}

// 8/16-bit remainder, partial vector: all lanes fit after one promotion.
template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
          HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Vec128<T, N> operator%(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const Rebind<MakeWide<T>, decltype(d)> dw;
  return DemoteTo(d, PromoteTo(dw, a) % PromoteTo(dw, b));
}
  1793. #endif
  1794. // ================================================== MEMORY (3)
  1795. // ------------------------------ Non-temporal stores
// Non-temporal store. PPC has no true streaming store; hint to the
// hardware via prefetch (rw=1 means "will write", locality=0 means
// "no temporal locality"), then do a normal aligned store.
template <class D>
HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
  __builtin_prefetch(aligned, 1, 0);
  Store(v, d, aligned);
}
  1801. // ------------------------------ Scatter in generic_ops-inl.h
  1802. // ------------------------------ Gather in generic_ops-inl.h
  1803. // ================================================== SWIZZLE (2)
  1804. // ------------------------------ LowerHalf
  1805. // Returns upper/lower half of a vector.
// Returns the lower half of `v`. The raw vector is reused unchanged;
// only the compile-time lane count shrinks.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
  return VFromD<D>{v.raw};
}

// Tag-free overload.
template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
  return Vec128<T, N / 2>{v.raw};
}
  1814. // ------------------------------ ShiftLeftBytes
  1815. // NOTE: The ShiftLeftBytes operation moves the elements of v to the right
  1816. // by kBytes bytes and zeroes out the first kBytes bytes of v on both
  1817. // little-endian and big-endian PPC targets
  1818. // (same behavior as the HWY_EMU128 ShiftLeftBytes operation on both
  1819. // little-endian and big-endian targets)
// Shifts lanes towards higher indices by kBytes bytes, filling with zeros.
template <int kBytes, class D>
HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  if (kBytes == 0) return v;
  const auto zeros = Zero(d);
  // vec_sld concatenates in memory order, so the operand order and shift
  // amount differ between endiannesses.
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_sld(v.raw, zeros.raw, kBytes)};
#else
  return VFromD<D>{vec_sld(zeros.raw, v.raw, (-kBytes) & 15)};
#endif
}

// Tag-free overload.
template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
}
  1835. // ------------------------------ ShiftLeftLanes
  1836. // NOTE: The ShiftLeftLanes operation moves the elements of v to the right
  1837. // by kLanes lanes and zeroes out the first kLanes lanes of v on both
  1838. // little-endian and big-endian PPC targets
  1839. // (same behavior as the HWY_EMU128 ShiftLeftLanes operation on both
  1840. // little-endian and big-endian targets)
// Shifts lanes towards higher indices by kLanes, filling with zeros;
// implemented on top of ShiftLeftBytes via the u8 repartition.
template <int kLanes, class D, typename T = TFromD<D>>
HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}

// Tag-free overload.
template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
}
  1850. // ------------------------------ ShiftRightBytes
  1851. // NOTE: The ShiftRightBytes operation moves the elements of v to the left
  1852. // by kBytes bytes and zeroes out the last kBytes bytes of v on both
  1853. // little-endian and big-endian PPC targets
  1854. // (same behavior as the HWY_EMU128 ShiftRightBytes operation on both
  1855. // little-endian and big-endian targets)
// Shifts lanes towards lower indices by kBytes bytes, filling with zeros.
template <int kBytes, class D>
HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  if (kBytes == 0) return v;

  // For partial vectors, clear upper lanes so we shift in zeros.
  if (d.MaxBytes() != 16) {
    const Full128<TFromD<D>> dfull;
    VFromD<decltype(dfull)> vfull{v.raw};
    v = VFromD<D>{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw};
  }

  const auto zeros = Zero(d);
  // vec_sld concatenates in memory order; operand order and shift amount
  // are the mirror image of ShiftLeftBytes.
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_sld(zeros.raw, v.raw, (-kBytes) & 15)};
#else
  return VFromD<D>{vec_sld(v.raw, zeros.raw, kBytes)};
#endif
}
  1873. // ------------------------------ ShiftRightLanes
  1874. // NOTE: The ShiftRightLanes operation moves the elements of v to the left
  1875. // by kLanes lanes and zeroes out the last kLanes lanes of v on both
  1876. // little-endian and big-endian PPC targets
  1877. // (same behavior as the HWY_EMU128 ShiftRightLanes operation on both
  1878. // little-endian and big-endian targets)
// Shifts lanes towards lower indices by kLanes, filling with zeros;
// implemented on top of ShiftRightBytes via the u8 repartition.
template <int kLanes, class D>
HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
  return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
}

// ------------------------------ UpperHalf (ShiftRightBytes)

// Returns the upper half of `v`: shift its bytes down into the lower
// half, then truncate to the half-width vector.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
  return LowerHalf(d, ShiftRightBytes<d.MaxBytes()>(Twice<D>(), v));
}
  1890. // ------------------------------ ExtractLane
// Returns lane i of `v`. i must be in [0, N).
template <typename T, size_t N>
HWY_API T ExtractLane(Vec128<T, N> v, size_t i) {
  return static_cast<T>(v.raw[i]);
}
  1895. // ------------------------------ InsertLane
// Returns a copy of `v` with lane i replaced by `t`. i must be in [0, N).
template <typename T, size_t N>
HWY_API Vec128<T, N> InsertLane(Vec128<T, N> v, size_t i, T t) {
#if HWY_IS_LITTLE_ENDIAN
  // Direct element assignment on the raw vector; BitCastScalar converts T
  // to the raw lane representation.
  typename detail::Raw128<T>::type raw_result = v.raw;
  raw_result[i] = BitCastScalar<typename detail::Raw128<T>::RawT>(t);
  return Vec128<T, N>{raw_result};
#else
  // On ppc64be without this, mul_test fails, but swizzle_test passes.
  // Round-trip through memory instead of raw element assignment.
  DFromV<decltype(v)> d;
  alignas(16) T lanes[16 / sizeof(T)];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
#endif
}
  1911. // ------------------------------ CombineShiftRightBytes
  1912. // NOTE: The CombineShiftRightBytes operation below moves the elements of lo to
  1913. // the left by kBytes bytes and moves the elements of hi right by (d.MaxBytes()
  1914. // - kBytes) bytes on both little-endian and big-endian PPC targets.
// Returns the kBytes-byte right shift of the double-width [hi, lo] pair:
// lo's lanes move left by kBytes and hi's lanes fill the vacated top
// (kSize - kBytes) bytes.
template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API Vec128<T> CombineShiftRightBytes(D /*d*/, Vec128<T> hi, Vec128<T> lo) {
  constexpr size_t kSize = 16;
  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
  // vec_sld works in memory order, so operand order and the shift amount
  // differ between endiannesses.
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<T>{vec_sld(hi.raw, lo.raw, (-kBytes) & 15)};
#else
  return Vec128<T>{vec_sld(lo.raw, hi.raw, kBytes)};
#endif
}

// Partial vectors: perform the shift on full 128-bit vectors after moving
// the partial data into the most-significant bytes.
template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
  constexpr size_t kSize = d.MaxBytes();
  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
  const Repartition<uint8_t, decltype(d)> d8;
  using V8 = Vec128<uint8_t>;
  const DFromV<V8> dfull8;
  const Repartition<TFromD<D>, decltype(dfull8)> dfull;
  const V8 hi8{BitCast(d8, hi).raw};
  // Move into most-significant bytes
  const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
  const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(dfull8, hi8, lo8);
  return VFromD<D>{BitCast(dfull, r).raw};
}
  1939. // ------------------------------ Broadcast/splat any lane
// Broadcasts lane kLane into all lanes.
template <int kLane, typename T, size_t N>
HWY_API Vec128<T, N> Broadcast(Vec128<T, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<T, N>{vec_splat(v.raw, kLane)};
}

// ------------------------------ TableLookupLanes (Shuffle01)

// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
// Stores byte-level permute indices suitable for vec_perm.
template <typename T, size_t N = 16 / sizeof(T)>
struct Indices128 {
  __vector unsigned char raw;
};
  1951. namespace detail {
  1952. template <class D, HWY_IF_T_SIZE_D(D, 1)>
  1953. HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
  1954. D d) {
  1955. const Repartition<uint8_t, decltype(d)> d8;
  1956. return Iota(d8, 0);
  1957. }
  1958. template <class D, HWY_IF_T_SIZE_D(D, 2)>
  1959. HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
  1960. D d) {
  1961. const Repartition<uint8_t, decltype(d)> d8;
  1962. #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  1963. constexpr __vector unsigned char kBroadcastLaneBytes = {
  1964. 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
  1965. #else
  1966. constexpr __vector unsigned char kBroadcastLaneBytes = {
  1967. 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
  1968. #endif
  1969. return VFromD<decltype(d8)>{kBroadcastLaneBytes};
  1970. }
  1971. template <class D, HWY_IF_T_SIZE_D(D, 4)>
  1972. HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
  1973. D d) {
  1974. const Repartition<uint8_t, decltype(d)> d8;
  1975. #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  1976. constexpr __vector unsigned char kBroadcastLaneBytes = {
  1977. 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
  1978. #else
  1979. constexpr __vector unsigned char kBroadcastLaneBytes = {
  1980. 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15};
  1981. #endif
  1982. return VFromD<decltype(d8)>{kBroadcastLaneBytes};
  1983. }
  1984. template <class D, HWY_IF_T_SIZE_D(D, 8)>
  1985. HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
  1986. D d) {
  1987. const Repartition<uint8_t, decltype(d)> d8;
  1988. #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  1989. constexpr __vector unsigned char kBroadcastLaneBytes = {
  1990. 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
  1991. #else
  1992. constexpr __vector unsigned char kBroadcastLaneBytes = {
  1993. 7, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15};
  1994. #endif
  1995. return VFromD<decltype(d8)>{kBroadcastLaneBytes};
  1996. }
  1997. template <class D, HWY_IF_T_SIZE_D(D, 1)>
  1998. HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  1999. const Repartition<uint8_t, decltype(d)> d8;
  2000. return Zero(d8);
  2001. }
  2002. template <class D, HWY_IF_T_SIZE_D(D, 2)>
  2003. HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  2004. const Repartition<uint8_t, decltype(d)> d8;
  2005. constexpr __vector unsigned char kByteOffsets = {0, 1, 0, 1, 0, 1, 0, 1,
  2006. 0, 1, 0, 1, 0, 1, 0, 1};
  2007. return VFromD<decltype(d8)>{kByteOffsets};
  2008. }
  2009. template <class D, HWY_IF_T_SIZE_D(D, 4)>
  2010. HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  2011. const Repartition<uint8_t, decltype(d)> d8;
  2012. constexpr __vector unsigned char kByteOffsets = {0, 1, 2, 3, 0, 1, 2, 3,
  2013. 0, 1, 2, 3, 0, 1, 2, 3};
  2014. return VFromD<decltype(d8)>{kByteOffsets};
  2015. }
  2016. template <class D, HWY_IF_T_SIZE_D(D, 8)>
  2017. HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  2018. const Repartition<uint8_t, decltype(d)> d8;
  2019. constexpr __vector unsigned char kByteOffsets = {0, 1, 2, 3, 4, 5, 6, 7,
  2020. 0, 1, 2, 3, 4, 5, 6, 7};
  2021. return VFromD<decltype(d8)>{kByteOffsets};
  2022. }
  2023. } // namespace detail
// Converts a vector of lane indices into Indices128 for TableLookupLanes.
// 1-byte lanes: lane indices are already byte indices.
template <class D, typename TI, HWY_IF_T_SIZE_D(D, 1)>
HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
    D d, Vec128<TI, MaxLanes(D())> vec) {
  using T = TFromD<D>;
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  // Indices up to 2*MaxLanes are permitted (TwoTablesLookupLanes uses the
  // second half to select from the second table).
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  HWY_DASSERT(AllTrue(
      du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
#endif
  const Repartition<uint8_t, decltype(d)> d8;
  return Indices128<TFromD<D>, MaxLanes(D())>{BitCast(d8, vec).raw};
}

// 2/4/8-byte lanes: expand each lane index to per-byte indices
// (index * sizeof(T) + byte offset within the lane).
template <class D, typename TI,
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
    D d, Vec128<TI, MaxLanes(D())> vec) {
  using T = TFromD<D>;
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  HWY_DASSERT(AllTrue(
      du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
#endif
  const Repartition<uint8_t, decltype(d)> d8;
  using V8 = VFromD<decltype(d8)>;

  // Broadcast each lane index to all bytes of T and shift to bytes
  const V8 lane_indices = TableLookupBytes(
      BitCast(d8, vec), detail::IndicesFromVecBroadcastLaneBytes(d));
  constexpr int kIndexShiftAmt = static_cast<int>(FloorLog2(sizeof(T)));
  const V8 byte_indices = ShiftLeft<kIndexShiftAmt>(lane_indices);
  const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d));
  return Indices128<TFromD<D>, MaxLanes(D())>{sum.raw};
}
// Loads MaxLanes(d) indices from `idx` and converts them via IndicesFromVec.
template <class D, typename TI>
HWY_API Indices128<TFromD<D>, HWY_MAX_LANES_D(D)> SetTableIndices(
    D d, const TI* idx) {
  const Rebind<TI, decltype(d)> di;
  return IndicesFromVec(d, LoadU(di, idx));
}
// Returns v[idx[i]] per lane; idx already holds byte-level permute indices.
template <typename T, size_t N>
HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, TableLookupBytes(v, VFromD<decltype(d8)>{idx.raw}));
}

// Single lane: no change
template <typename T>
HWY_API Vec128<T, 1> TableLookupLanes(Vec128<T, 1> v,
                                      Indices128<T, 1> /* idx */) {
  return v;
}
// Looks up lanes from the concatenation of tables a (indices 0..N-1) and
// b (indices N..2N-1). Partial-vector version: combine a and b into one
// full vector and permute.
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b,
                                          Indices128<T, N> idx) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  const Repartition<uint8_t, decltype(dt)> dt_u8;
// TableLookupLanes currently requires table and index vectors to be the same
// size, though a half-length index vector would be sufficient here.
#if HWY_IS_MSAN
  // Duplicate the indices so the full index vector is fully initialized,
  // which keeps MSan from flagging the (unused) upper half.
  const Vec128<T, N> idx_vec{idx.raw};
  const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw};
#else
  // We only keep LowerHalf of the result, which is valid in idx.
  const Indices128<T, N * 2> idx2{idx.raw};
#endif
  return LowerHalf(
      d, TableLookupBytes(Combine(dt, b, a),
                          BitCast(dt, VFromD<decltype(dt_u8)>{idx2.raw})));
}

// Full vector: vec_perm natively selects from two source vectors.
template <typename T>
HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
                                       Indices128<T> idx) {
  return Vec128<T>{vec_perm(a.raw, b.raw, idx.raw)};
}
  2102. // ------------------------------ ReverseBlocks
  2103. // Single block: no change
// Reverses the order of 128-bit blocks. Single block: no change.
template <class D>
HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
  return v;
}

// ------------------------------ Reverse (Shuffle0123, Shuffle2301)

// Reverses lane order. Single lane: no change.
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
HWY_API Vec128<T, 1> Reverse(D /* tag */, Vec128<T, 1> v) {
  return v;
}
  2114. // 32-bit x2: shuffle
// 32-bit x2: shuffle (only the lower two lanes of the full shuffle matter).
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec64<T> Reverse(D /* tag */, Vec64<T> v) {
  return Vec64<T>{Shuffle2301(Vec128<T>{v.raw}).raw};
}

// 16-bit x4: byte shuffle reversing each 64-bit half's four lanes.
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> Reverse(D /* tag */, Vec64<T> v) {
  const __vector unsigned char kShuffle = {6,  7,  4,  5,  2,  3,  0, 1,
                                           14, 15, 12, 13, 10, 11, 8, 9};
  return Vec64<T>{vec_perm(v.raw, v.raw, kShuffle)};
}

// 16-bit x2: swapping the two lanes equals a 16-bit rotate of the u32.
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec32<T> Reverse(D d, Vec32<T> v) {
  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
  return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
}
  2132. // ------------------------------- ReverseLaneBytes
  2133. #if (HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14) && \
  2134. (HWY_COMPILER_GCC_ACTUAL >= 710 || HWY_COMPILER_CLANG >= 400)
  2135. // Per-target flag to prevent generic_ops-inl.h defining 8-bit ReverseLaneBytes.
  2136. #ifdef HWY_NATIVE_REVERSE_LANE_BYTES
  2137. #undef HWY_NATIVE_REVERSE_LANE_BYTES
  2138. #else
  2139. #define HWY_NATIVE_REVERSE_LANE_BYTES
  2140. #endif
// Reverses the byte order within each 16/32/64-bit lane.
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8))>
HWY_API V ReverseLaneBytes(V v) {
  return V{vec_revb(v.raw)};
}
  2146. // Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8.
  2147. #ifdef HWY_NATIVE_REVERSE2_8
  2148. #undef HWY_NATIVE_REVERSE2_8
  2149. #else
  2150. #define HWY_NATIVE_REVERSE2_8
  2151. #endif
// 8-bit Reverse2/4/8: reversing k bytes within each group equals a lane-byte
// reversal of a (k*8)-bit repartition.
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  const Repartition<uint16_t, decltype(d)> du16;
  return BitCast(d, ReverseLaneBytes(BitCast(du16, v)));
}

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
  const Repartition<uint32_t, decltype(d)> du32;
  return BitCast(d, ReverseLaneBytes(BitCast(du32, v)));
}

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
  const Repartition<uint64_t, decltype(d)> du64;
  return BitCast(d, ReverseLaneBytes(BitCast(du64, v)));
}
  2167. #endif // HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
// 8-bit Reverse for 2/4/8-lane vectors: delegate to the matching ReverseN.
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec16<T> Reverse(D d, Vec16<T> v) {
  return Reverse2(d, v);
}

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> Reverse(D d, Vec32<T> v) {
  return Reverse4(d, v);
}

template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec64<T> Reverse(D d, Vec64<T> v) {
  return Reverse8(d, v);
}
  2180. // ------------------------------ Reverse2
  2181. // Single lane: no change
// Swaps adjacent lane pairs. Single lane: no change.
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
HWY_API Vec128<T, 1> Reverse2(D /* tag */, Vec128<T, 1> v) {
  return v;
}

// 16-bit: swapping each pair equals a 16-bit rotate of each u32.
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  const Repartition<uint32_t, decltype(d)> du32;
  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
}

// 32-bit: swapping each pair equals a 32-bit rotate of each u64.
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  const Repartition<uint64_t, decltype(d)> du64;
  return BitCast(d, RotateRight<32>(BitCast(du64, v)));
}

// 64-bit: swap the two lanes.
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) {
  return Shuffle01(v);
}
  2200. // ------------------------------ Reverse4
// Reverses each group of 4 lanes. 16-bit: byte shuffle per 64-bit group.
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse4(D /*d*/, VFromD<D> v) {
  const __vector unsigned char kShuffle = {6,  7,  4,  5,  2,  3,  0, 1,
                                           14, 15, 12, 13, 10, 11, 8, 9};
  return VFromD<D>{vec_perm(v.raw, v.raw, kShuffle)};
}

// 32-bit: four lanes is the whole vector, so this is a full Reverse.
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
  return Reverse(d, v);
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Reverse4(D /* tag */, VFromD<D> /* v */) {
  HWY_ASSERT(0);  // don't have 4 u64 lanes
}
  2215. // ------------------------------ Reverse8
// Reverses each group of 8 lanes. 16-bit: eight lanes is the whole vector,
// so this is a full Reverse.
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
  return Reverse(d, v);
}

template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
HWY_API VFromD<D> Reverse8(D /* tag */, VFromD<D> /* v */) {
  HWY_ASSERT(0);  // don't have 8 lanes if larger than 16-bit
}
// ------------------------------ InterleaveLower
// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
// the least-significant lane) and "b". To concatenate two half-width integers
// into one, use ZipLower/Upper instead (also works with scalar).
template <typename T, size_t N>
HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_mergeh(a.raw, b.raw)};
}
// Additional overload for the optional tag; forwards to the tagless version.
template <class D>
HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) {
  return InterleaveLower(a, b);
}
// ------------------------------ InterleaveUpper (UpperHalf)
// Full vectors: interleave lanes from the upper halves of a and b.
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> InterleaveUpper(D /* tag */, Vec128<T> a, Vec128<T> b) {
  return Vec128<T>{vec_mergel(a.raw, b.raw)};
}
// Partial vectors: extract the upper halves, then reuse InterleaveLower on
// the resulting (lower-positioned) lanes.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
  const Half<decltype(d)> d2;
  return InterleaveLower(d, VFromD<D>{UpperHalf(d2, a).raw},
                         VFromD<D>{UpperHalf(d2, b).raw});
}
  2250. // ------------------------------ ZipLower/ZipUpper (InterleaveLower)
  2251. // Same as Interleave*, except that the return lanes are double-width integers;
  2252. // this is necessary because the single-lane scalar cannot return two values.
  2253. template <class V, class DW = RepartitionToWide<DFromV<V>>>
  2254. HWY_API VFromD<DW> ZipLower(V a, V b) {
  2255. return BitCast(DW(), InterleaveLower(a, b));
  2256. }
  2257. template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
  2258. HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
  2259. return BitCast(dw, InterleaveLower(D(), a, b));
  2260. }
  2261. template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
  2262. HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
  2263. return BitCast(dw, InterleaveUpper(D(), a, b));
  2264. }
// ------------------------------ Per4LaneBlkShufDupSet4xU32
// Used by hwy/ops/generic_ops-inl.h to implement Per4LaneBlockShuffle
namespace detail {
#ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#else
#define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#endif
// Builds a u32x4 vector holding {x0, x1, x2, x3} (lane order) and
// resize-bitcasts it to the requested tag d.
template <class D>
HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
                                                const uint32_t x2,
                                                const uint32_t x1,
                                                const uint32_t x0) {
  const __vector unsigned int raw = {x0, x1, x2, x3};
  return ResizeBitCast(d, Vec128<uint32_t>{raw});
}
}  // namespace detail
// ------------------------------ SlideUpLanes
// Moves v up by `amt` lanes by shifting the entire vector by
// amt * sizeof(T) bytes; vacated bytes are zeroed by the intrinsics.
template <class D>
HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
  const Repartition<uint8_t, decltype(d)> du8;
  using VU8 = VFromD<decltype(du8)>;
  // Shift count in bits, broadcast to all u32 lanes; the whole-vector shift
  // intrinsics below read their shift count from this vector.
  const auto v_shift_amt =
      BitCast(Full128<uint8_t>(),
              Set(Full128<uint32_t>(),
                  static_cast<uint32_t>(amt * sizeof(TFromD<D>) * 8)));
#if HWY_S390X_HAVE_Z14
  return BitCast(d, VU8{vec_srb(BitCast(du8, v).raw, v_shift_amt.raw)});
#else  // VSX
  // The register-byte direction of a lane "slide up" depends on endianness.
#if HWY_IS_LITTLE_ENDIAN
  return BitCast(d, VU8{vec_slo(BitCast(du8, v).raw, v_shift_amt.raw)});
#else
  return BitCast(d, VU8{vec_sro(BitCast(du8, v).raw, v_shift_amt.raw)});
#endif  // HWY_IS_LITTLE_ENDIAN
#endif  // HWY_S390X_HAVE_Z14
}
// ------------------------------ SlideDownLanes
// Partial vectors (<= 8 bytes): view the whole vector as one unsigned lane
// and shift it by amt * sizeof(T) bits; direction depends on endianness.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
  using TU = UnsignedFromSize<d.MaxBytes()>;
  const Repartition<TU, decltype(d)> du;
  const auto v_shift_amt =
      Set(du, static_cast<TU>(amt * sizeof(TFromD<D>) * 8));
#if HWY_IS_LITTLE_ENDIAN
  return BitCast(d, BitCast(du, v) >> v_shift_amt);
#else
  return BitCast(d, BitCast(du, v) << v_shift_amt);
#endif
}
// Full vectors: whole-vector byte shift, mirroring SlideUpLanes but in the
// opposite direction (note the swapped vec_sro/vec_slo and vec_slb).
template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
  const Repartition<uint8_t, decltype(d)> du8;
  using VU8 = VFromD<decltype(du8)>;
  // Shift count in bits, broadcast; consumed by the intrinsics below.
  const auto v_shift_amt =
      BitCast(Full128<uint8_t>(),
              Set(Full128<uint32_t>(),
                  static_cast<uint32_t>(amt * sizeof(TFromD<D>) * 8)));
#if HWY_S390X_HAVE_Z14
  return BitCast(d, VU8{vec_slb(BitCast(du8, v).raw, v_shift_amt.raw)});
#else  // VSX
#if HWY_IS_LITTLE_ENDIAN
  return BitCast(d, VU8{vec_sro(BitCast(du8, v).raw, v_shift_amt.raw)});
#else
  return BitCast(d, VU8{vec_slo(BitCast(du8, v).raw, v_shift_amt.raw)});
#endif  // HWY_IS_LITTLE_ENDIAN
#endif  // HWY_S390X_HAVE_Z14
}
// ================================================== COMBINE
// ------------------------------ Combine (InterleaveLower)
// N = N/2 + N/2 (upper half undefined)
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class VH = VFromD<Half<D>>>
HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
  const Half<decltype(d)> dh;
  // Treat half-width input as one lane, and expand to two lanes.
  using VU = Vec128<UnsignedFromSize<dh.MaxBytes()>, 2>;
  using Raw = typename detail::Raw128<TFromV<VU>>::type;
  const VU lo{reinterpret_cast<Raw>(lo_half.raw)};
  const VU hi{reinterpret_cast<Raw>(hi_half.raw)};
  // lo becomes the lower "lane", hi the upper.
  return BitCast(d, InterleaveLower(lo, hi));
}
// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
// Widens lo into a full vector: lower half = lo, upper half = zero.
// FirstN keeps only the lanes that came from lo.
template <class D>
HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
  const Half<D> dh;
  return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD<D>{lo.raw});
}
// ------------------------------ Concat full (InterleaveLower)
// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> ConcatLowerLower(D d, Vec128<T> hi, Vec128<T> lo) {
  // View each input as two u64 lanes, then interleave the lower ones.
  const Repartition<uint64_t, decltype(d)> d64;
  return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi)));
}
// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> ConcatUpperUpper(D d, Vec128<T> hi, Vec128<T> lo) {
  const Repartition<uint64_t, decltype(d)> d64;
  return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi)));
}
// hiH,hiL loH,loL |-> hiL,loH (= inner halves)
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> ConcatLowerUpper(D d, Vec128<T> hi, Vec128<T> lo) {
  return CombineShiftRightBytes<8>(d, hi, lo);
}
// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> ConcatUpperLower(D /*d*/, Vec128<T> hi, Vec128<T> lo) {
  // vec_perm indices 0-15 select from lo, 16-31 from hi: take lo's bytes 0-7
  // and hi's bytes 8-15.
  const __vector unsigned char kShuffle = {0, 1, 2, 3, 4, 5, 6, 7,
                                           24, 25, 26, 27, 28, 29, 30, 31};
  return Vec128<T>{vec_perm(lo.raw, hi.raw, kShuffle)};
}
  2376. // ------------------------------ Concat partial (Combine, LowerHalf)
  2377. template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
  2378. HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
  2379. const Half<decltype(d)> d2;
  2380. return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
  2381. }
  2382. template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
  2383. HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  2384. const Half<decltype(d)> d2;
  2385. return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
  2386. }
  2387. template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
  2388. HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  2389. const Half<decltype(d)> d2;
  2390. return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
  2391. }
  2392. template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
  2393. HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
  2394. const Half<decltype(d)> d2;
  2395. return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo));
  2396. }
// ------------------------------ TruncateTo
// Single-lane truncation: keep only the low-order bytes of the one lane.
template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 2)>* = nullptr,
          HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<FromT, 1> v) {
  using Raw = typename detail::Raw128<TFromD<D>>::type;
#if HWY_IS_LITTLE_ENDIAN
  // LE: the low-order bytes are already at the start; reinterpret suffices.
  return VFromD<D>{reinterpret_cast<Raw>(v.raw)};
#else
  // BE: rotate so the low-order bytes of the wide lane land in byte 0.
  return VFromD<D>{reinterpret_cast<Raw>(
      vec_sld(v.raw, v.raw, sizeof(FromT) - sizeof(TFromD<D>)))};
#endif
}
namespace detail {
// Truncates two wide vectors into one narrow vector: vec_pack keeps the
// low-order half of each wide lane; lo supplies the first half of the result
// lanes, hi the second half.
template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
          HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> Truncate2To(
    D /* tag */, Vec128<FromT, Repartition<FromT, D>().MaxLanes()> lo,
    Vec128<FromT, Repartition<FromT, D>().MaxLanes()> hi) {
  return VFromD<D>{vec_pack(lo.raw, hi.raw)};
}
}  // namespace detail
// Halving truncation: vec_pack keeps the low half of each wide lane; the
// second operand only fills the (unused) upper result lanes, so pass v twice.
template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
          HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> TruncateTo(D /* d */,
                             Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  return VFromD<D>{vec_pack(v.raw, v.raw)};
}
// Truncation by a factor of 4 or more: recurse one halving step at a time.
template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr,
          HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> TruncateTo(D d,
                             Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<MakeNarrow<FromT>, decltype(d)> d2;
  return TruncateTo(d, TruncateTo(d2, v));
}
// ------------------------------ ConcatOdd (TruncateTo)
// Returns the odd-indexed lanes of lo (lower result half) and hi (upper).
// 8-bit full
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T> ConcatOdd(D d, Vec128<T> hi, Vec128<T> lo) {
  const Repartition<uint16_t, decltype(d)> dw;
  const RebindToUnsigned<decltype(d)> du;
#if HWY_IS_LITTLE_ENDIAN
  // Right-shift 8 bits per u16 so we can pack.
  const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
  const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
#else
  // BE: the odd byte is already in the half that Truncate2To keeps.
  const Vec128<uint16_t> uH = BitCast(dw, hi);
  const Vec128<uint16_t> uL = BitCast(dw, lo);
#endif
  return BitCast(d, detail::Truncate2To(du, uL, uH));
}
// 8-bit x8
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec64<T> ConcatOdd(D /*d*/, Vec64<T> hi, Vec64<T> lo) {
  // Don't care about upper half, no need to zero.
  const __vector unsigned char kCompactOddU8 = {1, 3, 5, 7, 17, 19, 21, 23};
  return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactOddU8)};
}
// 8-bit x4
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ConcatOdd(D /*d*/, Vec32<T> hi, Vec32<T> lo) {
  // Don't care about upper half, no need to zero.
  const __vector unsigned char kCompactOddU8 = {1, 3, 17, 19};
  return Vec32<T>{vec_perm(lo.raw, hi.raw, kCompactOddU8)};
}
// 16-bit full
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T> ConcatOdd(D d, Vec128<T> hi, Vec128<T> lo) {
  const Repartition<uint32_t, decltype(d)> dw;
  const RebindToUnsigned<decltype(d)> du;
#if HWY_IS_LITTLE_ENDIAN
  // Move the odd u16 of each u32 into the half that Truncate2To keeps.
  const Vec128<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
  const Vec128<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
#else
  // BE: the odd u16 is already in the kept half.
  const Vec128<uint32_t> uH = BitCast(dw, hi);
  const Vec128<uint32_t> uL = BitCast(dw, lo);
#endif
  return BitCast(d, detail::Truncate2To(du, uL, uH));
}
// 16-bit x4
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ConcatOdd(D /*d*/, Vec64<T> hi, Vec64<T> lo) {
  // Don't care about upper half, no need to zero.
  const __vector unsigned char kCompactOddU16 = {2, 3, 6, 7, 18, 19, 22, 23};
  return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactOddU16)};
}
// 32-bit full
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ConcatOdd(D d, Vec128<T> hi, Vec128<T> lo) {
#if HWY_IS_LITTLE_ENDIAN
  (void)d;
  // Gather the odd u32 lanes of lo (result bytes 0-7) and hi (bytes 8-15).
  const __vector unsigned char kShuffle = {4, 5, 6, 7, 12, 13, 14, 15,
                                           20, 21, 22, 23, 28, 29, 30, 31};
  return Vec128<T>{vec_perm(lo.raw, hi.raw, kShuffle)};
#else
  // BE: the odd u32 is the half of each u64 that Truncate2To keeps.
  const RebindToUnsigned<decltype(d)> du;
  const Repartition<uint64_t, decltype(d)> dw;
  return BitCast(d, detail::Truncate2To(du, BitCast(dw, lo), BitCast(dw, hi)));
#endif
}
// Any type x2
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)>
HWY_API Vec128<T, 2> ConcatOdd(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) {
  // The single odd lane of each input is its upper lane.
  return InterleaveUpper(d, lo, hi);
}
// ------------------------------ ConcatEven (TruncateTo)
// Returns the even-indexed lanes of lo (lower result half) and hi (upper).
// 8-bit full
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T> ConcatEven(D d, Vec128<T> hi, Vec128<T> lo) {
  const Repartition<uint16_t, decltype(d)> dw;
  const RebindToUnsigned<decltype(d)> du;
#if HWY_IS_LITTLE_ENDIAN
  // LE: the even byte is already in the half that Truncate2To keeps.
  const Vec128<uint16_t> uH = BitCast(dw, hi);
  const Vec128<uint16_t> uL = BitCast(dw, lo);
#else
  // Right-shift 8 bits per u16 so we can pack.
  const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
  const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
#endif
  return BitCast(d, detail::Truncate2To(du, uL, uH));
}
// 8-bit x8
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec64<T> ConcatEven(D /*d*/, Vec64<T> hi, Vec64<T> lo) {
  // Don't care about upper half, no need to zero.
  const __vector unsigned char kCompactEvenU8 = {0, 2, 4, 6, 16, 18, 20, 22};
  return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactEvenU8)};
}
// 8-bit x4
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ConcatEven(D /*d*/, Vec32<T> hi, Vec32<T> lo) {
  // Don't care about upper half, no need to zero.
  const __vector unsigned char kCompactEvenU8 = {0, 2, 16, 18};
  return Vec32<T>{vec_perm(lo.raw, hi.raw, kCompactEvenU8)};
}
// 16-bit full
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T> ConcatEven(D d, Vec128<T> hi, Vec128<T> lo) {
  // Isolate lower 16 bits per u32 so we can pack.
  const Repartition<uint32_t, decltype(d)> dw;
  const RebindToUnsigned<decltype(d)> du;
#if HWY_IS_LITTLE_ENDIAN
  // LE: the even u16 is already in the half that Truncate2To keeps.
  const Vec128<uint32_t> uH = BitCast(dw, hi);
  const Vec128<uint32_t> uL = BitCast(dw, lo);
#else
  const Vec128<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
  const Vec128<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
#endif
  return BitCast(d, detail::Truncate2To(du, uL, uH));
}
// 16-bit x4
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ConcatEven(D /*d*/, Vec64<T> hi, Vec64<T> lo) {
  // Don't care about upper half, no need to zero.
  const __vector unsigned char kCompactEvenU16 = {0, 1, 4, 5, 16, 17, 20, 21};
  return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactEvenU16)};
}
  2555. // 32-bit full
  2556. template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
  2557. HWY_API Vec128<T> ConcatEven(D d, Vec128<T> hi, Vec128<T> lo) {
  2558. #if HWY_IS_LITTLE_ENDIAN
  2559. const Repartition<uint64_t, decltype(d)> dw;
  2560. const RebindToUnsigned<decltype(d)> du;
  2561. return BitCast(d, detail::Truncate2To(du, BitCast(dw, lo), BitCast(dw, hi)));
  2562. #else
  2563. (void)d;
  2564. constexpr __vector unsigned char kShuffle = {0, 1, 2, 3, 8, 9, 10, 11,
  2565. 16, 17, 18, 19, 24, 25, 26, 27};
  2566. return Vec128<T>{vec_perm(lo.raw, hi.raw, kShuffle)};
  2567. #endif
  2568. }
  2569. // Any T x2
  2570. template <typename D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)>
  2571. HWY_API Vec128<T, 2> ConcatEven(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) {
  2572. return InterleaveLower(d, lo, hi);
  2573. }
// ------------------------------ OrderedTruncate2To (ConcatEven, ConcatOdd)
#ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#else
#define HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#endif
// Truncates a (lower result half) and b (upper half), preserving lane order.
// Within each wide lane, the kept low-order half lies at the even narrow-lane
// position on LE but the odd position on BE, hence the differing Concat call.
template <class D, HWY_IF_UNSIGNED_D(D), class V, HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<D> OrderedTruncate2To(D d, V a, V b) {
#if HWY_IS_LITTLE_ENDIAN
  return ConcatEven(d, BitCast(d, b), BitCast(d, a));
#else
  return ConcatOdd(d, BitCast(d, b), BitCast(d, a));
#endif
}
// ------------------------------ DupEven (InterleaveLower)
// Copies each even-indexed lane into the following odd lane.
// Single lane: nothing to duplicate.
template <typename T>
HWY_API Vec128<T, 1> DupEven(Vec128<T, 1> v) {
  return v;
}
// Two lanes: broadcast lane 0.
template <typename T>
HWY_API Vec128<T, 2> DupEven(Vec128<T, 2> v) {
  return InterleaveLower(DFromV<decltype(v)>(), v, v);
}
// 8-bit lanes: byte shuffle duplicating each even byte.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  constexpr __vector unsigned char kShuffle = {0, 0, 2, 2, 4, 4, 6, 6,
                                               8, 8, 10, 10, 12, 12, 14, 14};
  return TableLookupBytes(v, BitCast(d, VFromD<decltype(du8)>{kShuffle}));
}
// 16-bit lanes: byte-pair shuffle duplicating each even u16.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  constexpr __vector unsigned char kShuffle = {0, 1, 0, 1, 4, 5, 4, 5,
                                               8, 9, 8, 9, 12, 13, 12, 13};
  return TableLookupBytes(v, BitCast(d, VFromD<decltype(du8)>{kShuffle}));
}
// 32-bit lanes, full vector.
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> DupEven(Vec128<T> v) {
#if HWY_S390X_HAVE_Z14
  // Z14 path: use a byte shuffle instead of vec_mergee.
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  return TableLookupBytes(
      v, BitCast(d, Dup128VecFromValues(du8, 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10,
                                        11, 8, 9, 10, 11)));
#else
  return Vec128<T>{vec_mergee(v.raw, v.raw)};
#endif
}
// ------------------------------ DupOdd (InterleaveUpper)
// Copies each odd-indexed lane into the preceding even lane.
// 8-bit lanes: byte shuffle duplicating each odd byte.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  constexpr __vector unsigned char kShuffle = {1, 1, 3, 3, 5, 5, 7, 7,
                                               9, 9, 11, 11, 13, 13, 15, 15};
  return TableLookupBytes(v, BitCast(d, VFromD<decltype(du8)>{kShuffle}));
}
// 16-bit lanes: byte-pair shuffle duplicating each odd u16.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  constexpr __vector unsigned char kShuffle = {2, 3, 2, 3, 6, 7, 6, 7,
                                               10, 11, 10, 11, 14, 15, 14, 15};
  return TableLookupBytes(v, BitCast(d, VFromD<decltype(du8)>{kShuffle}));
}
// 32-bit lanes.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
#if HWY_S390X_HAVE_Z14
  // Z14 path: use a byte shuffle instead of vec_mergeo.
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  return TableLookupBytes(
      v, BitCast(d, Dup128VecFromValues(du8, 4, 5, 6, 7, 4, 5, 6, 7, 12, 13, 14,
                                        15, 12, 13, 14, 15)));
#else
  return Vec128<T, N>{vec_mergeo(v.raw, v.raw)};
#endif
}
// 64-bit lanes: broadcast the upper lane.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
  return InterleaveUpper(DFromV<decltype(v)>(), v, v);
}
// ------------------------------ OddEven (IfThenElse)
// Returns a vector taking odd lanes from a and even lanes from b, selected
// by a constant byte mask (0xFF bytes select b in IfVecThenElse).
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const __vector unsigned char mask = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
                                       0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
  return IfVecThenElse(BitCast(d, Vec128<uint8_t, N>{mask}), b, a);
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const __vector unsigned char mask = {0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                       0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};
  return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 2>{mask}), b, a);
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const __vector unsigned char mask = {0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0,
                                       0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0};
  return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 4>{mask}), b, a);
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
  // Same as ConcatUpperLower for full vectors; do not call that because this
  // is more efficient for 64x1 vectors.
  const DFromV<decltype(a)> d;
  const __vector unsigned char mask = {
      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0};
  return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 8>{mask}), b, a);
}
// ------------------------------ InterleaveEven
// Interleaves the even-indexed lanes of a and b: result lane 2i = a[2i],
// lane 2i+1 = b[2i]. Implemented for sub-64-bit lanes via a two-table lookup
// whose byte indices address a (0-15) and b (16-31).
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
  const Full128<TFromD<D>> d_full;
  const Indices128<TFromD<D>> idx{
      Dup128VecFromValues(Full128<uint8_t>(), 0, 16, 2, 18, 4, 20, 6, 22, 8, 24,
                          10, 26, 12, 28, 14, 30)
          .raw};
  return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
                                               ResizeBitCast(d_full, b), idx));
}
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
  const Full128<TFromD<D>> d_full;
  const Indices128<TFromD<D>> idx{Dup128VecFromValues(Full128<uint8_t>(), 0, 1,
                                                      16, 17, 4, 5, 20, 21, 8,
                                                      9, 24, 25, 12, 13, 28, 29)
                                      .raw};
  return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
                                               ResizeBitCast(d_full, b), idx));
}
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
#if HWY_S390X_HAVE_Z14
  // Z14 path: two-table byte lookup instead of vec_mergee.
  const Full128<TFromD<D>> d_full;
  const Indices128<TFromD<D>> idx{Dup128VecFromValues(Full128<uint8_t>(), 0, 1,
                                                      2, 3, 16, 17, 18, 19, 8,
                                                      9, 10, 11, 24, 25, 26, 27)
                                      .raw};
  return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
                                               ResizeBitCast(d_full, b), idx));
#else
  (void)d;
  return VFromD<D>{vec_mergee(a.raw, b.raw)};
#endif
}
// 64-bit lanes: only lane 0 of each input is even, so this is
// InterleaveLower.
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
  return InterleaveLower(a, b);
}
// ------------------------------ InterleaveOdd
// Interleaves the odd-indexed lanes of a and b: result lane 2i = a[2i+1],
// lane 2i+1 = b[2i+1]. Same two-table lookup scheme as InterleaveEven.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
  const Full128<TFromD<D>> d_full;
  const Indices128<TFromD<D>> idx{
      Dup128VecFromValues(Full128<uint8_t>(), 1, 17, 3, 19, 5, 21, 7, 23, 9, 25,
                          11, 27, 13, 29, 15, 31)
          .raw};
  return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
                                               ResizeBitCast(d_full, b), idx));
}
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
  const Full128<TFromD<D>> d_full;
  const Indices128<TFromD<D>> idx{
      Dup128VecFromValues(Full128<uint8_t>(), 2, 3, 18, 19, 6, 7, 22, 23, 10,
                          11, 26, 27, 14, 15, 30, 31)
          .raw};
  return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
                                               ResizeBitCast(d_full, b), idx));
}
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
#if HWY_S390X_HAVE_Z14
  // Z14 path: two-table byte lookup instead of vec_mergeo.
  const Full128<TFromD<D>> d_full;
  const Indices128<TFromD<D>> idx{
      Dup128VecFromValues(Full128<uint8_t>(), 4, 5, 6, 7, 20, 21, 22, 23, 12,
                          13, 14, 15, 28, 29, 30, 31)
          .raw};
  return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
                                               ResizeBitCast(d_full, b), idx));
#else
  (void)d;
  return VFromD<D>{vec_mergeo(a.raw, b.raw)};
#endif
}
// 64-bit lanes: only lane 1 of each input is odd, so this is InterleaveUpper.
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
  return InterleaveUpper(d, a, b);
}
// ------------------------------ OddEvenBlocks
// At most one 128-bit block on this target, so selecting the even block is
// the identity on `even`.
template <typename T, size_t N>
HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
  return even;
}
// ------------------------------ SwapAdjacentBlocks
// A single block has no adjacent block to swap with: identity.
template <typename T, size_t N>
HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
  return v;
}
// ------------------------------ MulFixedPoint15 (OddEven)
// Q15 fixed-point multiply: per lane, roughly (a * b + 0x4000) >> 15.
#if HWY_S390X_HAVE_Z14
// Single lane: widen to i32, add the rounding increment, then double the
// product so its upper 16 bits hold the >>15 result; the truncating
// ResizeBitCast keeps lane 0's leading bytes (s390x is big-endian).
HWY_API Vec16<int16_t> MulFixedPoint15(Vec16<int16_t> a, Vec16<int16_t> b) {
  const DFromV<decltype(a)> di16;
  const RepartitionToWide<decltype(di16)> di32;
  const auto round_up_incr = Set(di32, 0x4000);
  const auto i32_product = MulEven(a, b) + round_up_incr;
  return ResizeBitCast(di16, ShiftLeft<1>(i32_product));
}
// Multi-lane: compute rounded 32-bit products for even and odd lane pairs,
// then recombine the 16-bit results with OddEven.
template <size_t N, HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
                                           Vec128<int16_t, N> b) {
  const DFromV<decltype(a)> di16;
  const RepartitionToWide<decltype(di16)> di32;
  const auto round_up_incr = Set(di32, 0x4000);
  const auto even_product = MulEven(a, b) + round_up_incr;
  const auto odd_product = MulOdd(a, b) + round_up_incr;
  return OddEven(BitCast(di16, ShiftRight<15>(odd_product)),
                 BitCast(di16, ShiftLeft<1>(even_product)));
}
#else
// AltiVec: vec_mradds is a rounding Q15 multiply-add; add a zero accumulator.
template <size_t N>
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
                                           Vec128<int16_t, N> b) {
  const Vec128<int16_t> zero = Zero(Full128<int16_t>());
  return Vec128<int16_t, N>{vec_mradds(a.raw, b.raw, zero.raw)};
}
#endif
// ------------------------------ Shl
namespace detail {
// Per-lane variable left shift for unsigned lanes.
template <typename T, size_t N>
HWY_API Vec128<T, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<T, N> v,
                         Vec128<T, N> bits) {
#if HWY_S390X_HAVE_Z14
  // GCC/Clang vector extension: element-wise shift.
  return Vec128<T, N>{v.raw << bits.raw};
#else
  return Vec128<T, N>{vec_sl(v.raw, bits.raw)};
#endif
}
// Signed left shift is the same as unsigned.
template <typename T, size_t N>
HWY_API Vec128<T, N> Shl(hwy::SignedTag /*tag*/, Vec128<T, N> v,
                         Vec128<T, N> bits) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  return BitCast(di,
                 Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
}
}  // namespace detail
// Dispatches on the lane type's signedness tag.
template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
  return detail::Shl(hwy::TypeTag<T>(), v, bits);
}
// ------------------------------ Shr
namespace detail {
// Per-lane variable logical right shift (unsigned lanes).
template <typename T, size_t N>
HWY_API Vec128<T, N> Shr(hwy::UnsignedTag /*tag*/, Vec128<T, N> v,
                         Vec128<T, N> bits) {
#if HWY_S390X_HAVE_Z14
  // GCC/Clang vector extension: element-wise shift.
  return Vec128<T, N>{v.raw >> bits.raw};
#else
  return Vec128<T, N>{vec_sr(v.raw, bits.raw)};
#endif
}
// Per-lane variable arithmetic right shift (signed lanes).
template <typename T, size_t N>
HWY_API Vec128<T, N> Shr(hwy::SignedTag /*tag*/, Vec128<T, N> v,
                         Vec128<T, N> bits) {
#if HWY_S390X_HAVE_Z14
  return Vec128<T, N>{v.raw >> bits.raw};
#else
  // vec_sra expects an unsigned shift-count vector.
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  return Vec128<T, N>{vec_sra(v.raw, BitCast(du, bits).raw)};
#endif
}
}  // namespace detail
// Dispatches on the lane type's signedness tag.
template <typename T, size_t N>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, Vec128<T, N> bits) {
  return detail::Shr(hwy::TypeTag<T>(), v, bits);
}
// ------------------------------ MulEven/Odd 64x64 (UpperHalf)
// Full 64x64 -> 128-bit product of the lower (lane 0) u64/i64 lanes.
template <class T, HWY_IF_UI64(T)>
HWY_INLINE Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  // POWER10: vec_mule yields the 128-bit product directly.
  using V64 = typename detail::Raw128<T>::type;
  const V64 mul128_result = reinterpret_cast<V64>(vec_mule(a.raw, b.raw));
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<T>{mul128_result};
#else
  // Need to swap the two halves of mul128_result on big-endian targets as
  // the upper 64 bits of the product are in lane 0 of mul128_result and
  // the lower 64 bits of the product are in lane 1 of mul128_result
  return Vec128<T>{vec_sld(mul128_result, mul128_result, 8)};
#endif
#else
  // Fallback: scalar 64x64 -> 128 multiply of lane 0 (low in mul[0],
  // high in mul[1]).
  alignas(16) T mul[2];
  mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
  return Load(Full128<T>(), mul);
#endif
}
// Full 64x64 -> 128-bit product of the upper (lane 1) u64/i64 lanes.
template <class T, HWY_IF_UI64(T)>
HWY_INLINE Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  // POWER10: vec_mulo yields the 128-bit product directly.
  using V64 = typename detail::Raw128<T>::type;
  const V64 mul128_result = reinterpret_cast<V64>(vec_mulo(a.raw, b.raw));
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<T>{mul128_result};
#else
  // Need to swap the two halves of mul128_result on big-endian targets as
  // the upper 64 bits of the product are in lane 0 of mul128_result and
  // the lower 64 bits of the product are in lane 1 of mul128_result
  return Vec128<T>{vec_sld(mul128_result, mul128_result, 8)};
#endif
#else
  // Fallback: extract lane 1 of each input via UpperHalf and multiply with
  // the scalar 64x64 -> 128 helper.
  alignas(16) T mul[2];
  const Full64<T> d2;
  mul[0] =
      Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
  return Load(Full128<T>(), mul);
#endif
}
  2902. // ------------------------------ PromoteEvenTo/PromoteOddTo
  2903. #include "hwy/ops/inside-inl.h"
  2904. // ------------------------------ WidenMulPairwiseAdd
  2905. template <class DF, HWY_IF_F32_D(DF),
  2906. class VBF = VFromD<Repartition<bfloat16_t, DF>>>
  2907. HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
  2908. return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
  2909. Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
  2910. }
// Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe.
template <class D32, HWY_IF_UI32_D(D32),
          class V16 = VFromD<RepartitionToNarrow<D32>>>
HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 d32, V16 a, V16 b) {
#if HWY_S390X_HAVE_Z14
  // Z14 path: widen the even and odd 16-bit pairs separately and add.
  (void)d32;
  return MulEven(a, b) + MulOdd(a, b);
#else
  // vec_msum multiplies 16-bit lane pairs, widens to 32 bits, and adds
  // adjacent products plus the accumulator (zero here) in one operation.
  return VFromD<D32>{vec_msum(a.raw, b.raw, Zero(d32).raw)};
#endif
}
  2922. // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
// Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe.
template <class D32, HWY_IF_UI32_D(D32),
          class V16 = VFromD<RepartitionToNarrow<D32>>>
HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 /*d32*/, V16 a, V16 b,
                                              VFromD<D32> sum0,
                                              VFromD<D32>& /*sum1*/) {
  // sum1 stays untouched: both paths below already produce the pairwise
  // odd+even sums in sum0, so RearrangeToOddPlusEven is a no-op for 32-bit.
#if HWY_S390X_HAVE_Z14
  return MulEven(a, b) + MulOdd(a, b) + sum0;
#else
  return VFromD<D32>{vec_msum(a.raw, b.raw, sum0.raw)};
#endif
}
  2935. // ------------------------------ RearrangeToOddPlusEven
// For the i32/u32 ReorderWidenMulAccumulate above, sum0 already contains the
// odd+even pairwise sums, so no rearrangement is required.
template <size_t N>
HWY_API Vec128<int32_t, N> RearrangeToOddPlusEven(Vec128<int32_t, N> sum0,
                                                  Vec128<int32_t, N> /*sum1*/) {
  return sum0;  // invariant already holds
}
template <size_t N>
HWY_API Vec128<uint32_t, N> RearrangeToOddPlusEven(
    Vec128<uint32_t, N> sum0, Vec128<uint32_t, N> /*sum1*/) {
  return sum0;  // invariant already holds
}
// Generic case: combine the two partial accumulators.
template <class VW>
HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
  return Add(sum0, sum1);
}
  2950. // ------------------------------ SatWidenMulPairwiseAccumulate
#if !HWY_S390X_HAVE_Z14
// Signal that a native saturating i16 pairwise widen-multiply-accumulate is
// available, so generic_ops-inl.h does not define its own.
#ifdef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
#undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
#else
#define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
#endif
// sum[i] = saturating(sum[i] + a[2i]*b[2i] + a[2i+1]*b[2i+1]) via vec_msums.
template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
    DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a,
    VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
  return VFromD<DI32>{vec_msums(a.raw, b.raw, sum.raw)};
}
#endif  // !HWY_S390X_HAVE_Z14
  2964. // ------------------------------ SumOfMulQuadAccumulate
#if !HWY_S390X_HAVE_Z14
#ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
#endif
// u32 accumulator += sum of the four adjacent u8*u8 products (vec_msum).
template <class DU32, HWY_IF_U32_D(DU32)>
HWY_API VFromD<DU32> SumOfMulQuadAccumulate(
    DU32 /*du32*/, VFromD<Repartition<uint8_t, DU32>> a,
    VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) {
  return VFromD<DU32>{vec_msum(a.raw, b.raw, sum.raw)};
}
#ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
#endif
// i32 accumulator += sum of the four adjacent u8*i8 products. The signed
// operand is passed first to select the mixed-sign vec_msum overload.
template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
HWY_API VFromD<DI32> SumOfMulQuadAccumulate(
    DI32 /*di32*/, VFromD<Repartition<uint8_t, DI32>> a_u,
    VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) {
  return VFromD<DI32>{vec_msum(b_i.raw, a_u.raw, sum.raw)};
}
#ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
#endif
// i8*i8 case: computed by treating a as unsigned and then subtracting the
// correction term. For a[i] < 0, the unsigned reinterpretation equals
// a[i] + 256, so subtract 256 * sum(b[i] where a[i] < 0).
template <class DI32, HWY_IF_I32_D(DI32)>
HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32,
                                            VFromD<Repartition<int8_t, DI32>> a,
                                            VFromD<Repartition<int8_t, DI32>> b,
                                            VFromD<DI32> sum) {
  const Repartition<uint8_t, decltype(di32)> du8;
  const auto result_sum_0 =
      SumOfMulQuadAccumulate(di32, BitCast(du8, a), b, sum);
  // BroadcastSignBit(a) is all-ones where a < 0, so And selects those b
  // lanes; SumsOf4 sums each quad and ShiftLeft<8> multiplies by 256.
  const auto result_sum_1 = ShiftLeft<8>(SumsOf4(And(b, BroadcastSignBit(a))));
  return result_sum_0 - result_sum_1;
}
#endif  // !HWY_S390X_HAVE_Z14
  3005. // ================================================== CONVERT
  3006. // ------------------------------ Promotions (part w/ narrow lanes -> full)
  3007. // Unsigned to signed/unsigned: zero-extend.
// Unsigned to signed/unsigned: zero-extend.
template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D), HWY_IF_UNSIGNED(FromT)>
HWY_API VFromD<D> PromoteTo(D /* d */,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  // First pretend the input has twice the lanes - the upper half will be
  // ignored by ZipLower.
  const Rebind<FromT, Twice<D>> d2;
  const VFromD<decltype(d2)> twice{v.raw};
  // Then cast to narrow as expected by ZipLower, in case the sign of FromT
  // differs from that of D.
  const RepartitionToNarrow<D> dn;
  // Interleaving with zero places zeros in the high half of every wide lane;
  // the operand order differs by endianness.
#if HWY_IS_LITTLE_ENDIAN
  return ZipLower(BitCast(dn, twice), Zero(dn));
#else
  return ZipLower(Zero(dn), BitCast(dn, twice));
#endif
}
// Signed: replicate sign bit.
template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D), HWY_IF_SIGNED(FromT)>
HWY_API VFromD<D> PromoteTo(D /* d */,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  // vec_unpackh sign-extends; reinterpret its result to the target raw type.
  using Raw = typename detail::Raw128<TFromD<D>>::type;
  return VFromD<D>{reinterpret_cast<Raw>(vec_unpackh(v.raw))};
}
// 8-bit to 32-bit: First, promote to 16-bit, and then convert to 32-bit.
template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 4), HWY_IF_NOT_FLOAT_D(D),
          HWY_IF_T_SIZE(FromT, 1)>
HWY_API VFromD<D> PromoteTo(D d32,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const DFromV<decltype(v)> d8;
  const Rebind<MakeWide<FromT>, decltype(d8)> d16;
  return PromoteTo(d32, PromoteTo(d16, v));
}
// 8-bit or 16-bit to 64-bit: First, promote to MakeWide<FromT>, and then
// convert to 64-bit.
template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 8), HWY_IF_NOT_FLOAT_D(D),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(FromT),
          HWY_IF_T_SIZE_ONE_OF(FromT, (1 << 1) | (1 << 2))>
HWY_API VFromD<D> PromoteTo(D d64,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<MakeWide<FromT>, decltype(d64)> dw;
  return PromoteTo(d64, PromoteTo(dw, v));
}
#if HWY_PPC_HAVE_9
// Per-target flag to prevent generic_ops-inl.h from defining f16 conversions.
#ifdef HWY_NATIVE_F16C
#undef HWY_NATIVE_F16C
#else
#define HWY_NATIVE_F16C
#endif
// f16 -> f32 using the PPC9 hardware conversion intrinsic.
template <class D, HWY_IF_F32_D(D)>
HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) {
  return VFromD<D>{vec_extract_fp32_from_shorth(v.raw)};
}
#endif  // HWY_PPC_HAVE_9
  3064. template <class D, HWY_IF_F32_D(D)>
  3065. HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) {
  3066. const Rebind<uint16_t, decltype(df32)> du16;
  3067. const RebindToSigned<decltype(df32)> di32;
  3068. return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
  3069. }
// f32 -> f64 (lower half). InterleaveLower duplicates each lower-half lane so
// that, depending on endianness, either the odd or even 32-bit positions hold
// the source values expected by vec_doubleo/vec_doublee.
template <class D, HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
  const __vector float raw_v = InterleaveLower(v, v).raw;
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_doubleo(raw_v)};
#else
  return VFromD<D>{vec_doublee(raw_v)};
#endif
}
// i32 -> f64 (lower half).
template <class D, HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<int32_t, D>> v) {
#if HWY_S390X_HAVE_Z14
  // Z14: widen to i64 first, then convert lanes to f64.
  const RebindToSigned<decltype(df64)> di64;
  return ConvertTo(df64, PromoteTo(di64, v));
#else  // VSX
  (void)df64;
  const __vector signed int raw_v = InterleaveLower(v, v).raw;
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_doubleo(raw_v)};
#else
  return VFromD<D>{vec_doublee(raw_v)};
#endif
#endif  // HWY_S390X_HAVE_Z14
}
// u32 -> f64 (lower half).
template <class D, HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<uint32_t, D>> v) {
#if HWY_S390X_HAVE_Z14
  const RebindToUnsigned<decltype(df64)> du64;
  return ConvertTo(df64, PromoteTo(du64, v));
#else  // VSX
  (void)df64;
  const __vector unsigned int raw_v = InterleaveLower(v, v).raw;
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_doubleo(raw_v)};
#else
  return VFromD<D>{vec_doublee(raw_v)};
#endif
#endif  // HWY_S390X_HAVE_Z14
}
#if !HWY_S390X_HAVE_Z14
namespace detail {
// Zeroes NaN lanes before VSX float->int conversions.
template <class V>
static HWY_INLINE V VsxF2INormalizeSrcVals(V v) {
#if !defined(HWY_DISABLE_PPC_VSX_QEMU_F2I_WORKAROUND)
  // Workaround for QEMU 7/8 VSX float to int conversion bug: v == v is false
  // only for NaN lanes, which are therefore replaced with zero.
  return IfThenElseZero(v == v, v);
#else
  return v;
#endif
}
}  // namespace detail
#endif  // !HWY_S390X_HAVE_Z14
// f32 -> i64 (lower half).
template <class D, HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
#if !HWY_S390X_HAVE_Z14 && \
    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
  // Duplicate the lower-half lanes into the positions the builtin reads,
  // zeroing NaNs first (see VsxF2INormalizeSrcVals).
  const __vector float raw_v =
      detail::VsxF2INormalizeSrcVals(InterleaveLower(v, v)).raw;
  return VFromD<decltype(di64)>{__builtin_vsx_xvcvspsxds(raw_v)};
#else
  // Fallback: promote to f64, then convert f64 -> i64.
  const RebindToFloat<decltype(di64)> df64;
  return ConvertTo(di64, PromoteTo(df64, v));
#endif
}
// f32 -> u64 (lower half).
template <class D, HWY_IF_U64_D(D)>
HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
#if !HWY_S390X_HAVE_Z14 && \
    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
  const __vector float raw_v =
      detail::VsxF2INormalizeSrcVals(InterleaveLower(v, v)).raw;
  // Reinterpret the builtin's return vector type to unsigned long long.
  return VFromD<decltype(du64)>{reinterpret_cast<__vector unsigned long long>(
      __builtin_vsx_xvcvspuxds(raw_v))};
#else
  const RebindToFloat<decltype(du64)> df64;
  return ConvertTo(du64, PromoteTo(df64, v));
#endif
}
  3147. // ------------------------------ PromoteUpperTo
#ifdef HWY_NATIVE_PROMOTE_UPPER_TO
#undef HWY_NATIVE_PROMOTE_UPPER_TO
#else
#define HWY_NATIVE_PROMOTE_UPPER_TO
#endif
// Unsigned to signed/unsigned: zero-extend.
template <class D, typename FromT, HWY_IF_V_SIZE_D(D, 16),
          HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D), HWY_IF_UNSIGNED(FromT)>
HWY_API VFromD<D> PromoteUpperTo(D d, Vec128<FromT> v) {
  const RebindToUnsigned<D> du;
  const RepartitionToNarrow<decltype(du)> dn;
  // Interleave with zero so the high half of each wide lane is zero; the
  // operand order depends on endianness.
#if HWY_IS_LITTLE_ENDIAN
  return BitCast(d, ZipUpper(du, v, Zero(dn)));
#else
  return BitCast(d, ZipUpper(du, Zero(dn), v));
#endif
}
// Signed: replicate sign bit.
template <class D, typename FromT, HWY_IF_V_SIZE_D(D, 16),
          HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D), HWY_IF_SIGNED(FromT)>
HWY_API VFromD<D> PromoteUpperTo(D /* d */, Vec128<FromT> v) {
  // vec_unpackl sign-extends; reinterpret its result to the target raw type.
  using Raw = typename detail::Raw128<TFromD<D>>::type;
  return VFromD<D>{reinterpret_cast<Raw>(vec_unpackl(v.raw))};
}
// F16 to F32
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D df32, Vec128<float16_t> v) {
#if HWY_PPC_HAVE_9
  // PPC9 hardware conversion of the upper-half f16 lanes.
  (void)df32;
  return VFromD<D>{vec_extract_fp32_from_shortl(v.raw)};
#else
  // Fallback: extract the upper half, then use the regular PromoteTo.
  const Rebind<float16_t, decltype(df32)> dh;
  return PromoteTo(df32, UpperHalf(dh, v));
#endif
}
  3185. // BF16 to F32
  3186. template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
  3187. HWY_API VFromD<D> PromoteUpperTo(D df32, Vec128<bfloat16_t> v) {
  3188. const Repartition<uint16_t, decltype(df32)> du16;
  3189. const RebindToSigned<decltype(df32)> di32;
  3190. return BitCast(df32, ShiftLeft<16>(PromoteUpperTo(di32, BitCast(du16, v))));
  3191. }
// f32 -> f64 (upper half). InterleaveUpper duplicates the upper-half lanes so
// the endian-appropriate vec_double{o,e} converts the source values.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D /*tag*/, Vec128<float> v) {
  const __vector float raw_v = InterleaveUpper(Full128<float>(), v, v).raw;
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_doubleo(raw_v)};
#else
  return VFromD<D>{vec_doublee(raw_v)};
#endif
}
// i32 -> f64 (upper half).
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D df64, Vec128<int32_t> v) {
#if HWY_S390X_HAVE_Z14
  // Z14: widen to i64 first, then convert lanes to f64.
  const RebindToSigned<decltype(df64)> di64;
  return ConvertTo(df64, PromoteUpperTo(di64, v));
#else  // VSX
  (void)df64;
  const __vector signed int raw_v =
      InterleaveUpper(Full128<int32_t>(), v, v).raw;
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_doubleo(raw_v)};
#else
  return VFromD<D>{vec_doublee(raw_v)};
#endif
#endif  // HWY_S390X_HAVE_Z14
}
// u32 -> f64 (upper half).
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D df64, Vec128<uint32_t> v) {
#if HWY_S390X_HAVE_Z14
  const RebindToUnsigned<decltype(df64)> du64;
  return ConvertTo(df64, PromoteUpperTo(du64, v));
#else  // VSX
  (void)df64;
  const __vector unsigned int raw_v =
      InterleaveUpper(Full128<uint32_t>(), v, v).raw;
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_doubleo(raw_v)};
#else
  return VFromD<D>{vec_doublee(raw_v)};
#endif
#endif  // HWY_S390X_HAVE_Z14
}
// f32 -> i64 (upper half).
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D di64, Vec128<float> v) {
#if !HWY_S390X_HAVE_Z14 && \
    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
  // Duplicate the upper-half lanes into the positions the builtin reads,
  // zeroing NaNs first (see VsxF2INormalizeSrcVals).
  const __vector float raw_v =
      detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v))
          .raw;
  return VFromD<decltype(di64)>{__builtin_vsx_xvcvspsxds(raw_v)};
#else
  // Fallback: promote to f64, then convert f64 -> i64.
  const RebindToFloat<decltype(di64)> df64;
  return ConvertTo(di64, PromoteUpperTo(df64, v));
#endif
}
// f32 -> u64 (upper half).
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D du64, Vec128<float> v) {
#if !HWY_S390X_HAVE_Z14 && \
    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
  const __vector float raw_v =
      detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v))
          .raw;
  return VFromD<decltype(du64)>{reinterpret_cast<__vector unsigned long long>(
      __builtin_vsx_xvcvspuxds(raw_v))};
#else
  const RebindToFloat<decltype(du64)> df64;
  return ConvertTo(du64, PromoteUpperTo(df64, v));
#endif
}
  3260. // Generic version for <=64 bit input/output
  3261. template <class D, HWY_IF_V_SIZE_LE_D(D, 8), class V>
  3262. HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
  3263. const Rebind<TFromV<V>, decltype(d)> dh;
  3264. return PromoteTo(d, UpperHalf(dh, v));
  3265. }
  3266. // ------------------------------ PromoteEvenTo/PromoteOddTo
namespace detail {
// Signed to Signed PromoteEvenTo/PromoteOddTo for PPC9/PPC10
#if HWY_PPC_HAVE_9 && \
    (HWY_COMPILER_GCC_ACTUAL >= 1200 || HWY_COMPILER_CLANG >= 1200)
// The same sign-extension intrinsics serve as PromoteEvenTo on little-endian
// and PromoteOddTo on big-endian, hence the endian-specific overload sets.
#if HWY_IS_LITTLE_ENDIAN
template <class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
                                   hwy::SizeTag<4> /*to_lane_size_tag*/,
                                   hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
                                   V v) {
  return VFromD<D>{vec_signexti(v.raw)};
}
template <class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
                                   hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
                                   V v) {
  return VFromD<D>{vec_signextll(v.raw)};
}
#else
template <class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
                                  hwy::SizeTag<4> /*to_lane_size_tag*/,
                                  hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
                                  V v) {
  return VFromD<D>{vec_signexti(v.raw)};
}
template <class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
                                  hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
                                  V v) {
  return VFromD<D>{vec_signextll(v.raw)};
}
#endif  // HWY_IS_LITTLE_ENDIAN
#endif  // HWY_PPC_HAVE_9
// I32/U32/F32->F64 PromoteEvenTo
#if HWY_S390X_HAVE_Z14
// f32 source: vec_doublee converts the even lanes to f64.
template <class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
                                   hwy::FloatTag /*from_type_tag*/, D /*d_to*/,
                                   V v) {
  return VFromD<D>{vec_doublee(v.raw)};
}
// i32/u32 source: widen the even lanes to 64-bit integers, then convert.
template <class D, class V, class FromTypeTag, HWY_IF_UI32(TFromV<V>)>
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
                                   FromTypeTag /*from_type_tag*/, D d_to, V v) {
  const Rebind<MakeWide<TFromV<V>>, decltype(d_to)> dw;
  return ConvertTo(d_to, PromoteEvenTo(dw, v));
}
#else  // VSX
// On VSX a single overload handles f32 as well as i32/u32 sources.
template <class D, class V, class FromTypeTag>
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
                                   FromTypeTag /*from_type_tag*/, D /*d_to*/,
                                   V v) {
  return VFromD<D>{vec_doublee(v.raw)};
}
#endif  // HWY_S390X_HAVE_Z14
// F32->I64 PromoteEvenTo
template <class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
                                   hwy::FloatTag /*from_type_tag*/, D d_to,
                                   V v) {
#if !HWY_S390X_HAVE_Z14 && \
    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
  (void)d_to;
  // Zero NaN lanes first (QEMU workaround; see VsxF2INormalizeSrcVals).
  const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
#if HWY_IS_LITTLE_ENDIAN
  // __builtin_vsx_xvcvspsxds expects the source values to be in the odd lanes
  // on little-endian PPC, and the vec_sld operation below will shift the even
  // lanes of normalized_v into the odd lanes.
  return VFromD<D>{
      __builtin_vsx_xvcvspsxds(vec_sld(normalized_v.raw, normalized_v.raw, 4))};
#else
  // __builtin_vsx_xvcvspsxds expects the source values to be in the even lanes
  // on big-endian PPC.
  return VFromD<D>{__builtin_vsx_xvcvspsxds(normalized_v.raw)};
#endif
#else
  // Fallback: promote the even lanes to f64, then convert f64 -> i64.
  const RebindToFloat<decltype(d_to)> df64;
  return ConvertTo(d_to, PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(),
                                       hwy::FloatTag(), df64, v));
#endif
}
// F32->U64 PromoteEvenTo
template <class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::UnsignedTag /*to_type_tag*/,
                                   hwy::SizeTag<8> /*to_lane_size_tag*/,
                                   hwy::FloatTag /*from_type_tag*/, D d_to,
                                   V v) {
#if !HWY_S390X_HAVE_Z14 && \
    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
  (void)d_to;
  const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
#if HWY_IS_LITTLE_ENDIAN
  // __builtin_vsx_xvcvspuxds expects the source values to be in the odd lanes
  // on little-endian PPC, and the vec_sld operation below will shift the even
  // lanes of normalized_v into the odd lanes.
  return VFromD<D>{
      reinterpret_cast<__vector unsigned long long>(__builtin_vsx_xvcvspuxds(
          vec_sld(normalized_v.raw, normalized_v.raw, 4)))};
#else
  // __builtin_vsx_xvcvspuxds expects the source values to be in the even lanes
  // on big-endian PPC.
  return VFromD<D>{reinterpret_cast<__vector unsigned long long>(
      __builtin_vsx_xvcvspuxds(normalized_v.raw))};
#endif
#else
  const RebindToFloat<decltype(d_to)> df64;
  return ConvertTo(d_to, PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(),
                                       hwy::FloatTag(), df64, v));
#endif
}
// I32/U32/F32->F64 PromoteOddTo
#if HWY_S390X_HAVE_Z14
// f32 source: rotate odd lanes into even position, then reuse PromoteEvenTo.
template <class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
                                  hwy::FloatTag /*from_type_tag*/, D d_to,
                                  V v) {
  return PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(), hwy::FloatTag(),
                       d_to, V{vec_sld(v.raw, v.raw, 4)});
}
// i32/u32 source: widen the odd lanes to 64-bit integers, then convert.
template <class D, class V, class FromTypeTag, HWY_IF_UI32(TFromV<V>)>
HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
                                  FromTypeTag /*from_type_tag*/, D d_to, V v) {
  const Rebind<MakeWide<TFromV<V>>, decltype(d_to)> dw;
  return ConvertTo(d_to, PromoteOddTo(dw, v));
}
#else
// VSX: vec_doubleo converts the odd 32-bit lanes directly.
template <class D, class V, class FromTypeTag>
HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
                                  FromTypeTag /*from_type_tag*/, D /*d_to*/,
                                  V v) {
  return VFromD<D>{vec_doubleo(v.raw)};
}
#endif
// F32->I64 PromoteOddTo
template <class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
                                  hwy::FloatTag /*from_type_tag*/, D d_to,
                                  V v) {
#if !HWY_S390X_HAVE_Z14 && \
    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
  (void)d_to;
  // Zero NaN lanes first (QEMU workaround; see VsxF2INormalizeSrcVals).
  const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
#if HWY_IS_LITTLE_ENDIAN
  // __builtin_vsx_xvcvspsxds expects the source values to be in the odd lanes
  // on little-endian PPC
  return VFromD<D>{__builtin_vsx_xvcvspsxds(normalized_v.raw)};
#else
  // __builtin_vsx_xvcvspsxds expects the source values to be in the even lanes
  // on big-endian PPC, and the vec_sld operation below will shift the odd lanes
  // of normalized_v into the even lanes.
  return VFromD<D>{
      __builtin_vsx_xvcvspsxds(vec_sld(normalized_v.raw, normalized_v.raw, 4))};
#endif
#else
  // Fallback: promote the odd lanes to f64, then convert f64 -> i64.
  const RebindToFloat<decltype(d_to)> df64;
  return ConvertTo(d_to, PromoteOddTo(hwy::FloatTag(), hwy::SizeTag<8>(),
                                      hwy::FloatTag(), df64, v));
#endif
}
// F32->U64 PromoteOddTo
template <class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(hwy::UnsignedTag /*to_type_tag*/,
                                  hwy::SizeTag<8> /*to_lane_size_tag*/,
                                  hwy::FloatTag /*from_type_tag*/, D d_to,
                                  V v) {
#if !HWY_S390X_HAVE_Z14 && \
    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
  (void)d_to;
  const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
#if HWY_IS_LITTLE_ENDIAN
  // __builtin_vsx_xvcvspuxds expects the source values to be in the odd lanes
  // on little-endian PPC
  return VFromD<D>{reinterpret_cast<__vector unsigned long long>(
      __builtin_vsx_xvcvspuxds(normalized_v.raw))};
#else
  // __builtin_vsx_xvcvspuxds expects the source values to be in the even lanes
  // on big-endian PPC, and the vec_sld operation below will shift the odd lanes
  // of normalized_v into the even lanes.
  return VFromD<D>{
      reinterpret_cast<__vector unsigned long long>(__builtin_vsx_xvcvspuxds(
          vec_sld(normalized_v.raw, normalized_v.raw, 4)))};
#endif
#else
  const RebindToFloat<decltype(d_to)> df64;
  return ConvertTo(d_to, PromoteOddTo(hwy::FloatTag(), hwy::SizeTag<8>(),
                                      hwy::FloatTag(), df64, v));
#endif
}
}  // namespace detail
  3467. // ------------------------------ Demotions (full -> part w/ narrow lanes)
// Signed -> unsigned half-width: saturating pack (negative clamps to 0).
template <class D, typename FromT, HWY_IF_UNSIGNED_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_SIGNED(FromT), HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2)>
HWY_API VFromD<D> DemoteTo(D /* tag */,
                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  return VFromD<D>{vec_packsu(v.raw, v.raw)};
}
// Signed -> signed half-width: saturating pack.
template <class D, typename FromT, HWY_IF_SIGNED_D(D), HWY_IF_SIGNED(FromT),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2)>
HWY_API VFromD<D> DemoteTo(D /* tag */,
                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  return VFromD<D>{vec_packs(v.raw, v.raw)};
}
// Unsigned -> unsigned half-width: saturating pack.
template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2)>
HWY_API VFromD<D> DemoteTo(D /* tag */,
                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  return VFromD<D>{vec_packs(v.raw, v.raw)};
}
// Demotions by a factor of 4 or more: demote one step, then recurse.
template <class D, class FromT, HWY_IF_SIGNED_D(D), HWY_IF_SIGNED(FromT),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr>
HWY_API VFromD<D> DemoteTo(D d,
                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<MakeNarrow<FromT>, D> d2;
  return DemoteTo(d, DemoteTo(d2, v));
}
template <class D, class FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr>
HWY_API VFromD<D> DemoteTo(D d,
                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<MakeNarrow<FromT>, D> d2;
  return DemoteTo(d, DemoteTo(d2, v));
}
// Signed -> unsigned by a factor >= 4: the first step saturates negatives to
// zero, so the intermediate type is the unsigned narrowed type.
template <class D, class FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_SIGNED(FromT),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr>
HWY_API VFromD<D> DemoteTo(D d,
                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<MakeUnsigned<MakeNarrow<FromT>>, D> d2;
  return DemoteTo(d, DemoteTo(d2, v));
}
#if HWY_PPC_HAVE_9 && \
    (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvsphp))
// We already toggled HWY_NATIVE_F16C above.
// f32 -> f16 using the PPC9 conversion instruction.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
  // Avoid vec_pack_to_short_fp32 on Clang because its implementation is buggy.
#if HWY_COMPILER_GCC_ACTUAL
  (void)df16;
  return VFromD<D>{vec_pack_to_short_fp32(v.raw, v.raw)};
#elif HWY_HAS_BUILTIN(__builtin_vsx_xvcvsphp)
  // Work around bug in the clang implementation of vec_pack_to_short_fp32
  // by using the __builtin_vsx_xvcvsphp builtin on PPC9/PPC10 targets
  // if the __builtin_vsx_xvcvsphp intrinsic is available
  const RebindToUnsigned<decltype(df16)> du16;
  const Rebind<uint32_t, D> du;
  // The f16 results land in u32 lanes; TruncateTo narrows them to u16.
  const VFromD<decltype(du)> bits16{
      reinterpret_cast<__vector unsigned int>(__builtin_vsx_xvcvsphp(v.raw))};
  return BitCast(df16, TruncateTo(du16, bits16));
#else
#error "Only define the function if we have a native implementation"
#endif
}
#endif  // HWY_PPC_HAVE_9
#if HWY_PPC_HAVE_9
#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
#undef HWY_NATIVE_DEMOTE_F64_TO_F16
#else
#define HWY_NATIVE_DEMOTE_F64_TO_F16
#endif
namespace detail {
// On big-endian PPC9, VsxXscvdphp converts vf64[0] to a F16, returned as an U64
// vector with the resulting F16 bits in the lower 16 bits of U64 lane 0
// On little-endian PPC9, VsxXscvdphp converts vf64[1] to a F16, returned as
// an U64 vector with the resulting F16 bits in the lower 16 bits of U64 lane 1
static HWY_INLINE Vec128<uint64_t> VsxXscvdphp(Vec128<double> vf64) {
  // Inline assembly is needed for the PPC9 xscvdphp instruction as there is
  // currently no intrinsic available for the PPC9 xscvdphp instruction
  __vector unsigned long long raw_result;
  __asm__("xscvdphp %x0, %x1" : "=wa"(raw_result) : "wa"(vf64.raw));
  return Vec128<uint64_t>{raw_result};
}
}  // namespace detail
// f64 -> f16, single lane.
template <class D, HWY_IF_F16_D(D), HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
  const RebindToUnsigned<decltype(df16)> du16;
  const Rebind<uint64_t, decltype(df16)> du64;
  const Full128<double> df64_full;
#if HWY_IS_LITTLE_ENDIAN
  // LE: VsxXscvdphp reads lane 1, so replicate v into both lanes and take
  // the upper half of the result.
  const auto bits16_as_u64 =
      UpperHalf(du64, detail::VsxXscvdphp(Combine(df64_full, v, v)));
#else
  // BE: VsxXscvdphp reads lane 0, so a resizing bitcast suffices.
  const auto bits16_as_u64 =
      LowerHalf(du64, detail::VsxXscvdphp(ResizeBitCast(df64_full, v)));
#endif
  return BitCast(df16, TruncateTo(du16, bits16_as_u64));
}
// f64 -> f16, two lanes: convert each source lane separately, then interleave
// the two 64-bit results so both f16 values end up adjacent.
template <class D, HWY_IF_F16_D(D), HWY_IF_LANES_D(D, 2)>
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
  const RebindToUnsigned<decltype(df16)> du16;
  const Rebind<uint64_t, decltype(df16)> du64;
  const Rebind<double, decltype(df16)> df64;
#if HWY_IS_LITTLE_ENDIAN
  const auto bits64_as_u64_0 = detail::VsxXscvdphp(InterleaveLower(df64, v, v));
  const auto bits64_as_u64_1 = detail::VsxXscvdphp(v);
  const auto bits64_as_u64 =
      InterleaveUpper(du64, bits64_as_u64_0, bits64_as_u64_1);
#else
  const auto bits64_as_u64_0 = detail::VsxXscvdphp(v);
  const auto bits64_as_u64_1 = detail::VsxXscvdphp(InterleaveUpper(df64, v, v));
  const auto bits64_as_u64 =
      InterleaveLower(du64, bits64_as_u64_0, bits64_as_u64_1);
#endif
  return BitCast(df16, TruncateTo(du16, bits64_as_u64));
}
#elif HWY_S390X_HAVE_Z14
#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
#undef HWY_NATIVE_DEMOTE_F64_TO_F16
#else
#define HWY_NATIVE_DEMOTE_F64_TO_F16
#endif
namespace detail {
// f64 -> f32 using the vledb instruction with rounding mode 3 (round to odd,
// per the function name), so the subsequent f32 -> f16 step rounds correctly.
template <class DF32, HWY_IF_F32_D(DF32)>
static HWY_INLINE VFromD<DF32> DemoteToF32WithRoundToOdd(
    DF32 df32, VFromD<Rebind<double, DF32>> v) {
  const Twice<DF32> dt_f32;
  __vector float raw_f32_in_even;
  __asm__("vledb %0,%1,0,3" : "=v"(raw_f32_in_even) : "v"(v.raw));
  // vledb leaves the f32 results in the even lanes; gather them together.
  const VFromD<decltype(dt_f32)> f32_in_even{raw_f32_in_even};
  return LowerHalf(df32, ConcatEven(dt_f32, f32_in_even, f32_in_even));
}
}  // namespace detail
// f64 -> f16 via a round-to-odd f32 intermediate.
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)>
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
  const Rebind<float, decltype(df16)> df32;
  return DemoteTo(df16, detail::DemoteToF32WithRoundToOdd(df32, v));
}
#endif  // HWY_PPC_HAVE_9
  3610. #if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
  3611. #ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
  3612. #undef HWY_NATIVE_DEMOTE_F32_TO_BF16
  3613. #else
  3614. #define HWY_NATIVE_DEMOTE_F32_TO_BF16
  3615. #endif
namespace detail {
// VsxXvcvspbf16 converts a F32 vector to a BF16 vector, bitcasted to an U32
// vector with the resulting BF16 bits in the lower 16 bits of each U32 lane
template <class D, HWY_IF_BF16_D(D)>
static HWY_INLINE VFromD<Rebind<uint32_t, D>> VsxXvcvspbf16(
    D dbf16, VFromD<Rebind<float, D>> v) {
  const Rebind<uint32_t, decltype(dbf16)> du32;
  const Repartition<uint8_t, decltype(du32)> du32_as_du8;
  using VU32 = __vector unsigned int;
  // Even though the __builtin_vsx_xvcvspbf16 builtin performs a F32 to BF16
  // conversion, the __builtin_vsx_xvcvspbf16 intrinsic expects a
  // __vector unsigned char argument (at least as of GCC 13 and Clang 17),
  // hence the byte-vector BitCast on input and the reinterpret_cast back.
  return VFromD<Rebind<uint32_t, D>>{reinterpret_cast<VU32>(
      __builtin_vsx_xvcvspbf16(BitCast(du32_as_du8, v).raw))};
}
}  // namespace detail
  3632. template <class D, HWY_IF_BF16_D(D)>
  3633. HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
  3634. const RebindToUnsigned<decltype(dbf16)> du16;
  3635. return BitCast(dbf16, TruncateTo(du16, detail::VsxXvcvspbf16(dbf16, v)));
  3636. }
  3637. #endif // HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
  3638. // Specializations for partial vectors because vec_packs sets lanes above 2*N.
  3639. template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4), HWY_IF_SIGNED_D(DN),
  3640. HWY_IF_SIGNED_V(V),
  3641. HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
  3642. HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
  3643. HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  3644. const DFromV<decltype(a)> d;
  3645. const Twice<decltype(d)> dt;
  3646. return DemoteTo(dn, Combine(dt, b, a));
  3647. }
// 64-bit result vectors: vec_packs produces a full 128-bit vector whose
// upper lanes come from b; gather the wanted 32-bit groups with ConcatEven
// and return the lower half.
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 8), HWY_IF_SIGNED_D(DN),
          HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const Twice<decltype(dn)> dn_full;
  const Repartition<uint32_t, decltype(dn_full)> du32_full;
  // Saturating pack of both (half-filled) sources into one full vector.
  const VFromD<decltype(dn_full)> v_full{vec_packs(a.raw, b.raw)};
  const auto vu32_full = BitCast(du32_full, v_full);
  return LowerHalf(
      BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full)));
}
// Full vectors: vec_packs saturates each source lane and packs a into the
// lower half, b into the upper half.
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 16), HWY_IF_SIGNED_D(DN),
          HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN /*dn*/, V a, V b) {
  return VFromD<DN>{vec_packs(a.raw, b.raw)};
}
  3667. template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4),
  3668. HWY_IF_UNSIGNED_D(DN), HWY_IF_SIGNED_V(V),
  3669. HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
  3670. HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
  3671. HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  3672. const DFromV<decltype(a)> d;
  3673. const Twice<decltype(d)> dt;
  3674. return DemoteTo(dn, Combine(dt, b, a));
  3675. }
// 64-bit result vectors, signed -> unsigned saturating pack (vec_packsu
// clamps negative inputs to zero); ConcatEven selects the wanted halves.
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 8), HWY_IF_UNSIGNED_D(DN),
          HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const Twice<decltype(dn)> dn_full;
  const Repartition<uint32_t, decltype(dn_full)> du32_full;
  const VFromD<decltype(dn_full)> v_full{vec_packsu(a.raw, b.raw)};
  const auto vu32_full = BitCast(du32_full, v_full);
  return LowerHalf(
      BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full)));
}
// Full vectors, signed -> unsigned saturating pack.
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 16), HWY_IF_UNSIGNED_D(DN),
          HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN /*dn*/, V a, V b) {
  return VFromD<DN>{vec_packsu(a.raw, b.raw)};
}
  3695. template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4),
  3696. HWY_IF_UNSIGNED_D(DN), HWY_IF_UNSIGNED_V(V),
  3697. HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
  3698. HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
  3699. HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  3700. const DFromV<decltype(a)> d;
  3701. const Twice<decltype(d)> dt;
  3702. return DemoteTo(dn, Combine(dt, b, a));
  3703. }
// 64-bit result vectors, unsigned -> unsigned saturating pack; ConcatEven
// selects the wanted 32-bit groups from the full packed vector.
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 8), HWY_IF_UNSIGNED_D(DN),
          HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const Twice<decltype(dn)> dn_full;
  const Repartition<uint32_t, decltype(dn_full)> du32_full;
  const VFromD<decltype(dn_full)> v_full{vec_packs(a.raw, b.raw)};
  const auto vu32_full = BitCast(du32_full, v_full);
  return LowerHalf(
      BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full)));
}
// Full vectors, unsigned -> unsigned saturating pack (vec_packs on unsigned
// operands performs the unsigned saturation).
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 16), HWY_IF_UNSIGNED_D(DN),
          HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN /*dn*/, V a, V b) {
  return VFromD<DN>{vec_packs(a.raw, b.raw)};
}
  3723. #if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
// Two-vector F32 -> BF16 demotion on PPC10: convert each source, then merge
// the 16-bit results with OrderedTruncate2To.
template <class D, class V, HWY_IF_BF16_D(D), HWY_IF_F32(TFromV<V>),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V) * 2)>
HWY_API VFromD<D> ReorderDemote2To(D dbf16, V a, V b) {
  const RebindToUnsigned<decltype(dbf16)> du16;
  const Half<decltype(dbf16)> dh_bf16;
  return BitCast(dbf16,
                 OrderedTruncate2To(du16, detail::VsxXvcvspbf16(dh_bf16, a),
                                    detail::VsxXvcvspbf16(dh_bf16, b)));
}
  3733. #endif
// On this target ReorderDemote2To already produces lanes in order, so
// OrderedDemote2To simply forwards to it.
template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), class V,
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
  return ReorderDemote2To(d, a, b);
}
  3741. #if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
// BF16 variant: the BF16 ReorderDemote2To above is already in order.
template <class D, HWY_IF_BF16_D(D), class V, HWY_IF_F32(TFromV<V>),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
  return ReorderDemote2To(d, a, b);
}
  3747. #endif
// Single-lane F64 -> F32 demotion; vec_floate writes results to even lanes,
// and only lane 0 is kept for a Vec32 result.
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) {
  return Vec32<float>{vec_floate(v.raw)};
}
// Two-lane F64 -> F32 demotion. vec_floate/vec_floato place results in
// even/odd lanes respectively; the choice depends on endianness so that the
// subsequent compaction picks the correct lanes.
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
HWY_API Vec64<float> DemoteTo(D d, Vec128<double> v) {
#if HWY_S390X_HAVE_Z14 || HWY_IS_LITTLE_ENDIAN
  const Vec128<float> f64_to_f32{vec_floate(v.raw)};
#else
  const Vec128<float> f64_to_f32{vec_floato(v.raw)};
#endif
#if HWY_S390X_HAVE_Z14
  // Compact the even lanes into the lower half.
  const Twice<decltype(d)> dt;
  return LowerHalf(d, ConcatEven(dt, f64_to_f32, f64_to_f32));
#else
  // On VSX, truncating the U64 view keeps the 32-bit results of each lane.
  const RebindToUnsigned<D> du;
  const Rebind<uint64_t, D> du64;
  return Vec64<float>{
      BitCast(d, TruncateTo(du, BitCast(du64, f64_to_f32))).raw};
#endif
}
// Single-lane F64 -> I32 demotion (saturating). On Z14 go through I64; on
// VSX use vec_signede after normalizing the source lanes
// (VsxF2INormalizeSrcVals is defined earlier; presumably it avoids UB for
// out-of-range/NaN inputs — confirm against its definition).
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I32_D(D)>
HWY_API Vec32<int32_t> DemoteTo(D di32, Vec64<double> v) {
#if HWY_S390X_HAVE_Z14
  const Rebind<int64_t, decltype(di32)> di64;
  return DemoteTo(di32, ConvertTo(di64, v));
#else
  (void)di32;
  return Vec32<int32_t>{vec_signede(detail::VsxF2INormalizeSrcVals(v).raw)};
#endif
}
// Two-lane F64 -> I32 demotion. vec_signede/vec_signedo differ in which
// lanes receive the results (endian-dependent); vec_pack then compacts the
// 64-bit view down to two I32 lanes.
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I32_D(D)>
HWY_API Vec64<int32_t> DemoteTo(D di32, Vec128<double> v) {
#if HWY_S390X_HAVE_Z14
  const Rebind<int64_t, decltype(di32)> di64;
  return DemoteTo(di32, ConvertTo(di64, v));
#else
  (void)di32;
#if HWY_IS_LITTLE_ENDIAN
  const Vec128<int32_t> f64_to_i32{
      vec_signede(detail::VsxF2INormalizeSrcVals(v).raw)};
#else
  const Vec128<int32_t> f64_to_i32{
      vec_signedo(detail::VsxF2INormalizeSrcVals(v).raw)};
#endif
  const Rebind<int64_t, D> di64;
  const Vec128<int64_t> vi64 = BitCast(di64, f64_to_i32);
  return Vec64<int32_t>{vec_pack(vi64.raw, vi64.raw)};
#endif
}
// Single-lane F64 -> U32 demotion (saturating); see the I32 variant above
// for the Z14-vs-VSX split.
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U32_D(D)>
HWY_API Vec32<uint32_t> DemoteTo(D du32, Vec64<double> v) {
#if HWY_S390X_HAVE_Z14
  const Rebind<uint64_t, decltype(du32)> du64;
  return DemoteTo(du32, ConvertTo(du64, v));
#else
  (void)du32;
  return Vec32<uint32_t>{vec_unsignede(detail::VsxF2INormalizeSrcVals(v).raw)};
#endif
}
// Two-lane F64 -> U32 demotion; mirrors the I32 variant with the unsigned
// conversion intrinsics.
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)>
HWY_API Vec64<uint32_t> DemoteTo(D du32, Vec128<double> v) {
#if HWY_S390X_HAVE_Z14
  const Rebind<uint64_t, decltype(du32)> du64;
  return DemoteTo(du32, ConvertTo(du64, v));
#else
  (void)du32;
#if HWY_IS_LITTLE_ENDIAN
  const Vec128<uint32_t> f64_to_u32{
      vec_unsignede(detail::VsxF2INormalizeSrcVals(v).raw)};
#else
  const Vec128<uint32_t> f64_to_u32{
      vec_unsignedo(detail::VsxF2INormalizeSrcVals(v).raw)};
#endif
  const Rebind<uint64_t, D> du64;
  const Vec128<uint64_t> vu64 = BitCast(du64, f64_to_u32);
  return Vec64<uint32_t>{vec_pack(vu64.raw, vu64.raw)};
#endif
}
  3827. #if HWY_S390X_HAVE_Z14
namespace detail {
// I64/U64 -> F64 conversions with round-to-odd (rounding-mode operand 3),
// used so that a following F64 -> F32 demotion does not double-round.
template <class V, HWY_IF_I64(TFromV<V>)>
HWY_INLINE VFromD<RebindToFloat<DFromV<V>>> ConvToF64WithRoundToOdd(V v) {
  __vector double raw_result;
  // Use inline assembly to do a round-to-odd I64->F64 conversion on Z14
  __asm__("vcdgb %0,%1,0,3" : "=v"(raw_result) : "v"(v.raw));
  return VFromD<RebindToFloat<DFromV<V>>>{raw_result};
}
template <class V, HWY_IF_U64(TFromV<V>)>
HWY_INLINE VFromD<RebindToFloat<DFromV<V>>> ConvToF64WithRoundToOdd(V v) {
  __vector double raw_result;
  // Use inline assembly to do a round-to-odd U64->F64 conversion on Z14
  __asm__("vcdlgb %0,%1,0,3" : "=v"(raw_result) : "v"(v.raw));
  return VFromD<RebindToFloat<DFromV<V>>>{raw_result};
}
}  // namespace detail
  3844. #endif // HWY_S390X_HAVE_Z14
// Single-lane I64 -> F32 demotion. Z14 goes through F64 with round-to-odd;
// VSX converts directly (results land in even lanes).
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
HWY_API Vec32<float> DemoteTo(D df32, Vec64<int64_t> v) {
#if HWY_S390X_HAVE_Z14
  return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
#else  // VSX
  (void)df32;
  return Vec32<float>{vec_floate(v.raw)};
#endif
}
// Two-lane I64 -> F32 demotion; even/odd intrinsic choice is
// endian-dependent, then the U64-view truncation compacts the lanes.
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
HWY_API Vec64<float> DemoteTo(D df32, Vec128<int64_t> v) {
#if HWY_S390X_HAVE_Z14
  return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
#else  // VSX
#if HWY_IS_LITTLE_ENDIAN
  const Vec128<float> i64_to_f32{vec_floate(v.raw)};
#else
  const Vec128<float> i64_to_f32{vec_floato(v.raw)};
#endif
  const RebindToUnsigned<decltype(df32)> du32;
  const Rebind<uint64_t, decltype(df32)> du64;
  return Vec64<float>{
      BitCast(df32, TruncateTo(du32, BitCast(du64, i64_to_f32))).raw};
#endif
}
// Single-lane U64 -> F32 demotion; mirrors the I64 variant.
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
HWY_API Vec32<float> DemoteTo(D df32, Vec64<uint64_t> v) {
#if HWY_S390X_HAVE_Z14
  return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
#else  // VSX
  (void)df32;
  return Vec32<float>{vec_floate(v.raw)};
#endif
}
// Two-lane U64 -> F32 demotion; mirrors the I64 variant.
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
HWY_API Vec64<float> DemoteTo(D df32, Vec128<uint64_t> v) {
#if HWY_S390X_HAVE_Z14
  return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
#else  // VSX
#if HWY_IS_LITTLE_ENDIAN
  const Vec128<float> u64_to_f32{vec_floate(v.raw)};
#else
  const Vec128<float> u64_to_f32{vec_floato(v.raw)};
#endif
  const RebindToUnsigned<decltype(df32)> du;
  const Rebind<uint64_t, decltype(df32)> du64;
  return Vec64<float>{
      BitCast(df32, TruncateTo(du, BitCast(du64, u64_to_f32))).raw};
#endif
}
  3895. // For already range-limited input [0, 255].
  3896. template <size_t N>
  3897. HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) {
  3898. const Rebind<uint16_t, DFromV<decltype(v)>> du16;
  3899. const Rebind<uint8_t, decltype(du16)> du8;
  3900. return TruncateTo(du8, TruncateTo(du16, v));
  3901. }
  3902. // ------------------------------ Integer <=> fp (ShiftRight, OddEven)
  3903. // Note: altivec.h vec_ct* currently contain C casts which triggers
  3904. // -Wdeprecate-lax-vec-conv-all warnings, so disable them.
  3905. #if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
// Z14 (without Z15): no direct I32/U32 -> F32 instruction, so widen to F64
// and demote. Partial vectors (<= 2 lanes).
template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT),
          HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConvertTo(D df32,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<double, decltype(df32)> df64;
  return DemoteTo(df32, PromoteTo(df64, v));
}
// Z14 full vector: convert each F64 half (results in even lanes) and
// recombine with ConcatEven.
template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT),
          HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> ConvertTo(D df32, Vec128<FromT> v) {
  const RepartitionToWide<decltype(df32)> df64;
  const VFromD<D> vf32_lo{vec_floate(PromoteLowerTo(df64, v).raw)};
  const VFromD<D> vf32_hi{vec_floate(PromoteUpperTo(df64, v).raw)};
  return ConcatEven(df32, vf32_hi, vf32_lo);
}
  3921. #else // Z15 or PPC
// I32/U32 -> F32 conversion on Z15 (vec_float) or PPC (vec_ctf).
// The diagnostics push/pop silences -Wdeprecate-lax-vec-conv-all, which
// altivec.h's C-cast-based vec_ct* would otherwise trigger.
template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT)>
HWY_API VFromD<D> ConvertTo(D /* tag */,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_CLANG
  HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
#endif
#if HWY_S390X_HAVE_Z15
  return VFromD<D>{vec_float(v.raw)};
#else
  return VFromD<D>{vec_ctf(v.raw, 0)};
#endif
  HWY_DIAGNOSTICS(pop)
}
#endif  // HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
// I64/U64 -> F64 conversion (same lane size, so a single vec_double).
template <class D, typename FromT, HWY_IF_F64_D(D), HWY_IF_NOT_FLOAT(FromT),
          HWY_IF_T_SIZE_D(D, sizeof(FromT))>
HWY_API VFromD<D> ConvertTo(D /* tag */,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  return VFromD<D>{vec_double(v.raw)};
}
  3943. // Truncates (rounds toward zero).
  3944. #if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
// Z14 (without Z15): no direct F32 -> I32 instruction; widen to I64 via F32
// promotion and demote (saturating). Partial vectors.
template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConvertTo(D di32,
                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
  const Rebind<int64_t, decltype(di32)> di64;
  return DemoteTo(di32, PromoteTo(di64, v));
}
// Z14 full vector: convert each half via I64 and merge in order.
template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> ConvertTo(D di32,
                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
  const RepartitionToWide<decltype(di32)> di64;
  return OrderedDemote2To(di32, PromoteLowerTo(di64, v),
                          PromoteUpperTo(di64, v));
}
  3958. #else // Z15 or PPC
// F32 -> I32 conversion, truncating toward zero with saturation
// (out-of-range -> kMin/kMaxI32, NaN -> 0).
template <class D, HWY_IF_I32_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */,
                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
#if defined(__OPTIMIZE__)
  // Constant-fold per lane; the ternaries implement the same saturation the
  // hardware conversion performs. NaN fails both comparisons and yields 0.
  if (detail::IsConstantRawAltivecVect(v.raw)) {
    constexpr int32_t kMinI32 = LimitsMin<int32_t>();
    constexpr int32_t kMaxI32 = LimitsMax<int32_t>();
    return Dup128VecFromValues(
        D(),
        (v.raw[0] >= -2147483648.0f)
            ? ((v.raw[0] < 2147483648.0f) ? static_cast<int32_t>(v.raw[0])
                                          : kMaxI32)
            : ((v.raw[0] < 0) ? kMinI32 : 0),
        (v.raw[1] >= -2147483648.0f)
            ? ((v.raw[1] < 2147483648.0f) ? static_cast<int32_t>(v.raw[1])
                                          : kMaxI32)
            : ((v.raw[1] < 0) ? kMinI32 : 0),
        (v.raw[2] >= -2147483648.0f)
            ? ((v.raw[2] < 2147483648.0f) ? static_cast<int32_t>(v.raw[2])
                                          : kMaxI32)
            : ((v.raw[2] < 0) ? kMinI32 : 0),
        (v.raw[3] >= -2147483648.0f)
            ? ((v.raw[3] < 2147483648.0f) ? static_cast<int32_t>(v.raw[3])
                                          : kMaxI32)
            : ((v.raw[3] < 0) ? kMinI32 : 0));
  }
#endif
#if HWY_S390X_HAVE_Z15
  // Use inline assembly on Z15 to avoid undefined behavior if v[i] is not in
  // the range of an int32_t
  __vector signed int raw_result;
  __asm__("vcfeb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
  return VFromD<D>{raw_result};
#else
  HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_CLANG
  HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
#endif
  return VFromD<D>{vec_cts(v.raw, 0)};
  HWY_DIAGNOSTICS(pop)
#endif  // HWY_S390X_HAVE_Z15
}
  4001. #endif // HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
// F64 -> I64 conversion, truncating toward zero with saturation
// (out-of-range -> kMin/kMaxI64, NaN -> 0).
template <class D, HWY_IF_I64_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */,
                            Vec128<double, Rebind<double, D>().MaxLanes()> v) {
#if defined(__OPTIMIZE__)
  // Constant-fold per lane; mirrors the hardware saturation semantics.
  if (detail::IsConstantRawAltivecVect(v.raw)) {
    constexpr int64_t kMinI64 = LimitsMin<int64_t>();
    constexpr int64_t kMaxI64 = LimitsMax<int64_t>();
    return Dup128VecFromValues(D(),
                               (v.raw[0] >= -9223372036854775808.0)
                                   ? ((v.raw[0] < 9223372036854775808.0)
                                          ? static_cast<int64_t>(v.raw[0])
                                          : kMaxI64)
                                   : ((v.raw[0] < 0) ? kMinI64 : 0LL),
                               (v.raw[1] >= -9223372036854775808.0)
                                   ? ((v.raw[1] < 9223372036854775808.0)
                                          ? static_cast<int64_t>(v.raw[1])
                                          : kMaxI64)
                                   : ((v.raw[1] < 0) ? kMinI64 : 0LL));
  }
#endif
  // Use inline assembly to avoid undefined behavior if v[i] is not within the
  // range of an int64_t
  __vector signed long long raw_result;
#if HWY_S390X_HAVE_Z14
  __asm__("vcgdb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
#else
  __asm__("xvcvdpsxds %x0,%x1"
          : "=wa"(raw_result)
          : "wa"(detail::VsxF2INormalizeSrcVals(v).raw));
#endif
  return VFromD<D>{raw_result};
}
  4034. #if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
// Z14 (without Z15): no direct F32 -> U32 instruction; widen via U64 and
// demote (saturating). Partial vectors.
template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConvertTo(D du32,
                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
  const Rebind<uint64_t, decltype(du32)> du64;
  return DemoteTo(du32, PromoteTo(du64, v));
}
// Z14 full vector: convert each half via U64 and merge in order.
template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> ConvertTo(D du32,
                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
  const RepartitionToWide<decltype(du32)> du64;
  return OrderedDemote2To(du32, PromoteLowerTo(du64, v),
                          PromoteUpperTo(du64, v));
}
  4048. #else // Z15 or VSX
// F32 -> U32 conversion, truncating toward zero with saturation
// (negative and NaN -> 0, too large -> kMaxU32).
template <class D, HWY_IF_U32_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */,
                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
#if defined(__OPTIMIZE__)
  // Constant-fold per lane; NaN fails the >= 0 comparison and yields 0.
  if (detail::IsConstantRawAltivecVect(v.raw)) {
    constexpr uint32_t kMaxU32 = LimitsMax<uint32_t>();
    return Dup128VecFromValues(
        D(),
        (v.raw[0] >= 0.0f)
            ? ((v.raw[0] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[0])
                                          : kMaxU32)
            : 0,
        (v.raw[1] >= 0.0f)
            ? ((v.raw[1] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[1])
                                          : kMaxU32)
            : 0,
        (v.raw[2] >= 0.0f)
            ? ((v.raw[2] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[2])
                                          : kMaxU32)
            : 0,
        (v.raw[3] >= 0.0f)
            ? ((v.raw[3] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[3])
                                          : kMaxU32)
            : 0);
  }
#endif
#if HWY_S390X_HAVE_Z15
  // Use inline assembly on Z15 to avoid undefined behavior if v[i] is not in
  // the range of an uint32_t
  __vector unsigned int raw_result;
  __asm__("vclfeb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
  return VFromD<D>{raw_result};
#else  // VSX
  HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_CLANG
  HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
#endif
  VFromD<D> result{vec_ctu(v.raw, 0)};
  HWY_DIAGNOSTICS(pop)
  return result;
#endif  // HWY_S390X_HAVE_Z15
}
  4091. #endif // HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
  4092. template <class D, HWY_IF_U64_D(D)>
  4093. HWY_API VFromD<D> ConvertTo(D /* tag */,
  4094. Vec128<double, Rebind<double, D>().MaxLanes()> v) {
  4095. HWY_DIAGNOSTICS(push)
  4096. #if HWY_COMPILER_CLANG
  4097. HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
  4098. #endif
  4099. #if defined(__OPTIMIZE__)
  4100. if (detail::IsConstantRawAltivecVect(v.raw)) {
  4101. constexpr uint64_t kMaxU64 = LimitsMax<uint64_t>();
  4102. return Dup128VecFromValues(
  4103. D(),
  4104. (v.raw[0] >= 0.0) ? ((v.raw[0] < 18446744073709551616.0)
  4105. ? static_cast<uint64_t>(v.raw[0])
  4106. : kMaxU64)
  4107. : 0,
  4108. (v.raw[1] >= 0.0) ? ((v.raw[1] < 18446744073709551616.0)
  4109. ? static_cast<uint64_t>(v.raw[1])
  4110. : kMaxU64)
  4111. : 0);
  4112. }
  4113. #endif
  4114. // Use inline assembly to avoid undefined behavior if v[i] is not within the
  4115. // range of an uint64_t
  4116. __vector unsigned long long raw_result;
  4117. #if HWY_S390X_HAVE_Z14
  4118. __asm__("vclgdb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
  4119. #else // VSX
  4120. __asm__("xvcvdpuxds %x0,%x1"
  4121. : "=wa"(raw_result)
  4122. : "wa"(detail::VsxF2INormalizeSrcVals(v).raw));
  4123. #endif
  4124. return VFromD<D>{raw_result};
  4125. }
  4126. // ------------------------------ Floating-point rounding (ConvertTo)
// Toward nearest integer, ties to even
template <size_t N>
HWY_API Vec128<float, N> Round(Vec128<float, N> v) {
  return Vec128<float, N>{vec_round(v.raw)};
}
// Round-to-nearest-even for F64. On VSX, vec_round on doubles does not give
// ties-to-even semantics, so vec_rint (current rounding mode, assumed
// round-to-nearest-even) is used instead; Z14's vec_round suffices.
template <size_t N>
HWY_API Vec128<double, N> Round(Vec128<double, N> v) {
#if HWY_S390X_HAVE_Z14
  return Vec128<double, N>{vec_round(v.raw)};
#else
  return Vec128<double, N>{vec_rint(v.raw)};
#endif
}
  4140. template <size_t N>
  4141. HWY_API Vec128<int32_t, N> NearestInt(Vec128<float, N> v) {
  4142. const DFromV<decltype(v)> d;
  4143. const RebindToSigned<decltype(d)> di;
  4144. return ConvertTo(di, Round(v));
  4145. }
// Toward zero, aka truncate
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Trunc(Vec128<T, N> v) {
  return Vec128<T, N>{vec_trunc(v.raw)};
}
// Toward +infinity, aka ceiling
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Ceil(Vec128<T, N> v) {
  return Vec128<T, N>{vec_ceil(v.raw)};
}
// Toward -infinity, aka floor
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Floor(Vec128<T, N> v) {
  return Vec128<T, N>{vec_floor(v.raw)};
}
  4161. // ------------------------------ Floating-point classification
// True for NaN lanes: NaN is the only value that compares unequal to itself.
template <typename T, size_t N>
HWY_API Mask128<T, N> IsNaN(Vec128<T, N> v) {
  static_assert(IsFloat<T>(), "Only for float");
  return v != v;
}
  4167. template <typename T, size_t N>
  4168. HWY_API Mask128<T, N> IsInf(Vec128<T, N> v) {
  4169. static_assert(IsFloat<T>(), "Only for float");
  4170. using TU = MakeUnsigned<T>;
  4171. const DFromV<decltype(v)> d;
  4172. const RebindToUnsigned<decltype(d)> du;
  4173. const VFromD<decltype(du)> vu = BitCast(du, v);
  4174. // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  4175. return RebindMask(
  4176. d,
  4177. Eq(Add(vu, vu), Set(du, static_cast<TU>(hwy::MaxExponentTimes2<T>()))));
  4178. }
  4179. // Returns whether normal/subnormal/zero.
  4180. template <typename T, size_t N>
  4181. HWY_API Mask128<T, N> IsFinite(Vec128<T, N> v) {
  4182. static_assert(IsFloat<T>(), "Only for float");
  4183. using TU = MakeUnsigned<T>;
  4184. const DFromV<decltype(v)> d;
  4185. const RebindToUnsigned<decltype(d)> du;
  4186. const VFromD<decltype(du)> vu = BitCast(du, v);
  4187. // 'Shift left' to clear the sign bit, check for exponent<max.
  4188. return RebindMask(
  4189. d,
  4190. Lt(Add(vu, vu), Set(du, static_cast<TU>(hwy::MaxExponentTimes2<T>()))));
  4191. }
  4192. // ================================================== CRYPTO
  4193. #if !HWY_S390X_HAVE_Z14 && !defined(HWY_DISABLE_PPC8_CRYPTO)
  4194. // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
  4195. #ifdef HWY_NATIVE_AES
  4196. #undef HWY_NATIVE_AES
  4197. #else
  4198. #define HWY_NATIVE_AES
  4199. #endif
namespace detail {
// Lane type expected by the vec_cipher* intrinsics; older Clang apparently
// declares them with 64-bit lanes rather than bytes — hence the split.
#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1600
using CipherTag = Full128<uint64_t>;
#else
using CipherTag = Full128<uint8_t>;
#endif  // HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1600
using CipherVec = VFromD<CipherTag>;
}  // namespace detail
// One AES encryption round. vec_cipher_be operates on big-endian byte order,
// so on little-endian targets the state/key bytes are reversed before and the
// result reversed after.
HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
                                 Vec128<uint8_t> round_key) {
  const detail::CipherTag dc;
  const Full128<uint8_t> du8;
#if HWY_IS_LITTLE_ENDIAN
  return Reverse(du8,
                 BitCast(du8, detail::CipherVec{vec_cipher_be(
                                  BitCast(dc, Reverse(du8, state)).raw,
                                  BitCast(dc, Reverse(du8, round_key)).raw)}));
#else
  return BitCast(du8, detail::CipherVec{vec_cipher_be(
                          BitCast(dc, state).raw, BitCast(dc, round_key).raw)});
#endif
}
// Final AES encryption round (no MixColumns); same byte-reversal handling as
// AESRound.
HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state,
                                     Vec128<uint8_t> round_key) {
  const detail::CipherTag dc;
  const Full128<uint8_t> du8;
#if HWY_IS_LITTLE_ENDIAN
  return Reverse(du8,
                 BitCast(du8, detail::CipherVec{vec_cipherlast_be(
                                  BitCast(dc, Reverse(du8, state)).raw,
                                  BitCast(dc, Reverse(du8, round_key)).raw)}));
#else
  return BitCast(du8, detail::CipherVec{vec_cipherlast_be(
                          BitCast(dc, state).raw, BitCast(dc, round_key).raw)});
#endif
}
// One AES decryption round. vec_ncipher_be is invoked with a zero key and the
// round key XORed afterwards, matching the AESRoundInv contract (key added
// after InvMixColumns rather than before).
HWY_API Vec128<uint8_t> AESRoundInv(Vec128<uint8_t> state,
                                    Vec128<uint8_t> round_key) {
  const detail::CipherTag dc;
  const Full128<uint8_t> du8;
#if HWY_IS_LITTLE_ENDIAN
  return Xor(Reverse(du8, BitCast(du8, detail::CipherVec{vec_ncipher_be(
                                           BitCast(dc, Reverse(du8, state)).raw,
                                           Zero(dc).raw)})),
             round_key);
#else
  return Xor(BitCast(du8, detail::CipherVec{vec_ncipher_be(
                              BitCast(dc, state).raw, Zero(dc).raw)}),
             round_key);
#endif
}
// Final AES decryption round (no InvMixColumns); same byte-reversal handling
// as AESRound.
HWY_API Vec128<uint8_t> AESLastRoundInv(Vec128<uint8_t> state,
                                        Vec128<uint8_t> round_key) {
  const detail::CipherTag dc;
  const Full128<uint8_t> du8;
#if HWY_IS_LITTLE_ENDIAN
  return Reverse(du8,
                 BitCast(du8, detail::CipherVec{vec_ncipherlast_be(
                                  BitCast(dc, Reverse(du8, state)).raw,
                                  BitCast(dc, Reverse(du8, round_key)).raw)}));
#else
  return BitCast(du8, detail::CipherVec{vec_ncipherlast_be(
                          BitCast(dc, state).raw, BitCast(dc, round_key).raw)});
#endif
}
  4265. HWY_API Vec128<uint8_t> AESInvMixColumns(Vec128<uint8_t> state) {
  4266. const Full128<uint8_t> du8;
  4267. const auto zero = Zero(du8);
  4268. // PPC8/PPC9/PPC10 does not have a single instruction for the AES
  4269. // InvMixColumns operation like ARM Crypto, SVE2 Crypto, or AES-NI do.
  4270. // The AESInvMixColumns operation can be carried out on PPC8/PPC9/PPC10
  4271. // by doing an AESLastRound operation with a zero round_key followed by an
  4272. // AESRoundInv operation with a zero round_key.
  4273. return AESRoundInv(AESLastRound(state, zero), zero);
  4274. }
// AES key-generation assist: SubWord via vec_sbox_be, then a RotWord-style
// byte shuffle of lanes 1 and 3, then XOR of the round constant kRcon.
template <uint8_t kRcon>
HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) {
  // kRcon is XORed into bytes 4 and 12 (the rotated words).
  constexpr __vector unsigned char kRconXorMask = {0, 0, 0, 0, kRcon, 0, 0, 0,
                                                   0, 0, 0, 0, kRcon, 0, 0, 0};
  // Duplicates each 32-bit word pair with the second copy rotated by a byte.
  constexpr __vector unsigned char kRotWordShuffle = {
      4, 5, 6, 7, 5, 6, 7, 4, 12, 13, 14, 15, 13, 14, 15, 12};
  const detail::CipherTag dc;
  const Full128<uint8_t> du8;
  const auto sub_word_result =
      BitCast(du8, detail::CipherVec{vec_sbox_be(BitCast(dc, v).raw)});
  const auto rot_word_result =
      TableLookupBytes(sub_word_result, Vec128<uint8_t>{kRotWordShuffle});
  return Xor(rot_word_result, Vec128<uint8_t>{kRconXorMask});
}
// Carryless multiplication of the lower 64-bit lanes of a and b.
template <size_t N>
HWY_API Vec128<uint64_t, N> CLMulLower(Vec128<uint64_t, N> a,
                                       Vec128<uint64_t, N> b) {
  // NOTE: Lane 1 of both a and b need to be zeroed out for the
  // vec_pmsum_be operation below as the vec_pmsum_be operation
  // does a carryless multiplication of each 64-bit half and then
  // adds the two halves using an bitwise XOR operation.
  const DFromV<decltype(a)> d;
  const auto zero = Zero(d);
  using VU64 = __vector unsigned long long;
  const VU64 pmsum_result = reinterpret_cast<VU64>(
      vec_pmsum_be(InterleaveLower(a, zero).raw, InterleaveLower(b, zero).raw));
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<uint64_t, N>{pmsum_result};
#else
  // Need to swap the two halves of pmsum_result on big-endian targets as
  // the upper 64 bits of the carryless multiplication result are in lane 0 of
  // pmsum_result and the lower 64 bits of the carryless multiplication result
  // are in lane 1 of pmsum_result
  return Vec128<uint64_t, N>{vec_sld(pmsum_result, pmsum_result, 8)};
#endif
}
// Carryless multiplication of the upper 64-bit lanes of a and b.
template <size_t N>
HWY_API Vec128<uint64_t, N> CLMulUpper(Vec128<uint64_t, N> a,
                                       Vec128<uint64_t, N> b) {
  // NOTE: Lane 0 of both a and b need to be zeroed out for the
  // vec_pmsum_be operation below as the vec_pmsum_be operation
  // does a carryless multiplication of each 64-bit half and then
  // adds the two halves using an bitwise XOR operation.
  const DFromV<decltype(a)> d;
  const auto zero = Zero(d);
  using VU64 = __vector unsigned long long;
  const VU64 pmsum_result = reinterpret_cast<VU64>(
      vec_pmsum_be(vec_mergel(zero.raw, a.raw), vec_mergel(zero.raw, b.raw)));
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<uint64_t, N>{pmsum_result};
#else
  // Need to swap the two halves of pmsum_result on big-endian targets as
  // the upper 64 bits of the carryless multiplication result are in lane 0 of
  // pmsum_result and the lower 64 bits of the carryless multiplication result
  // are in lane 1 of pmsum_result
  return Vec128<uint64_t, N>{vec_sld(pmsum_result, pmsum_result, 8)};
#endif
}
  4333. #endif // !defined(HWY_DISABLE_PPC8_CRYPTO)
  4334. // ================================================== MISC
  4335. // ------------------------------ LoadMaskBits (TestBit)
  4336. namespace detail {
// Builds a 128-bit mask for 8-bit lanes: lane i is true iff bit i of
// mask_bits is set.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) {
#if HWY_PPC_HAVE_10
  // PPC10 has a dedicated move-bits-to-byte-mask instruction.
  const Vec128<uint8_t> mask_vec{vec_genbm(mask_bits)};
#if HWY_IS_LITTLE_ENDIAN
  return MFromD<D>{MaskFromVec(mask_vec).raw};
#else
  // NOTE(review): on big-endian, vec_genbm apparently numbers lanes in the
  // opposite order, hence the Reverse so bit i still governs lane i — confirm.
  return MFromD<D>{MaskFromVec(Reverse(Full128<uint8_t>(), mask_vec)).raw};
#endif  // HWY_IS_LITTLE_ENDIAN
#else   // PPC9 or earlier
  const Full128<uint8_t> du8;
  const Full128<uintint16_t> du16;
  // Broadcast the low 16 bits of mask_bits to every u16 lane.
  const Vec128<uint8_t> vbits =
      BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits)));
  // Replicate bytes 8x such that each byte contains the bit that governs it.
#if HWY_IS_LITTLE_ENDIAN
  const __vector unsigned char kRep8 = {0, 0, 0, 0, 0, 0, 0, 0,
                                        1, 1, 1, 1, 1, 1, 1, 1};
#else
  const __vector unsigned char kRep8 = {1, 1, 1, 1, 1, 1, 1, 1,
                                        0, 0, 0, 0, 0, 0, 0, 0};
#endif  // HWY_IS_LITTLE_ENDIAN
  const Vec128<uint8_t> rep8{vec_perm(vbits.raw, vbits.raw, kRep8)};
  // Each byte then tests its own bit within its replicated mask byte.
  const __vector unsigned char kBit = {1, 2, 4, 8, 16, 32, 64, 128,
                                       1, 2, 4, 8, 16, 32, 64, 128};
  return MFromD<D>{TestBit(rep8, Vec128<uint8_t>{kBit}).raw};
#endif  // HWY_PPC_HAVE_10
}
// Builds a 128-bit mask for 16-bit lanes: lane i is true iff bit i of
// mask_bits is set.
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) {
#if HWY_PPC_HAVE_10
  // PPC10: move-bits-to-halfword-mask instruction.
  const Vec128<uint16_t> mask_vec{vec_genhm(mask_bits)};
#if HWY_IS_LITTLE_ENDIAN
  return MFromD<D>{MaskFromVec(mask_vec).raw};
#else
  // NOTE(review): Reverse compensates for opposite lane numbering of
  // vec_genhm on big-endian — confirm.
  return MFromD<D>{MaskFromVec(Reverse(Full128<uint16_t>(), mask_vec)).raw};
#endif  // HWY_IS_LITTLE_ENDIAN
#else   // PPC9 or earlier
  // Broadcast mask_bits and test one bit per lane.
  const __vector unsigned short kBit = {1, 2, 4, 8, 16, 32, 64, 128};
  const auto vmask_bits =
      Set(Full128<uint16_t>(), static_cast<uint16_t>(mask_bits));
  return MFromD<D>{TestBit(vmask_bits, Vec128<uint16_t>{kBit}).raw};
#endif  // HWY_PPC_HAVE_10
}
// Builds a 128-bit mask for 32-bit lanes: lane i is true iff bit i of
// mask_bits is set.
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) {
#if HWY_PPC_HAVE_10
  // PPC10: move-bits-to-word-mask instruction.
  const Vec128<uint32_t> mask_vec{vec_genwm(mask_bits)};
#if HWY_IS_LITTLE_ENDIAN
  return MFromD<D>{MaskFromVec(mask_vec).raw};
#else
  // NOTE(review): Reverse compensates for opposite lane numbering of
  // vec_genwm on big-endian — confirm.
  return MFromD<D>{MaskFromVec(Reverse(Full128<uint32_t>(), mask_vec)).raw};
#endif  // HWY_IS_LITTLE_ENDIAN
#else   // PPC9 or earlier
  // Broadcast mask_bits and test one bit per lane (4 lanes).
  const __vector unsigned int kBit = {1, 2, 4, 8};
  const auto vmask_bits =
      Set(Full128<uint32_t>(), static_cast<uint32_t>(mask_bits));
  return MFromD<D>{TestBit(vmask_bits, Vec128<uint32_t>{kBit}).raw};
#endif  // HWY_PPC_HAVE_10
}
// Builds a 128-bit mask for 64-bit lanes: lane i is true iff bit i of
// mask_bits is set.
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) {
#if HWY_PPC_HAVE_10
  // PPC10: move-bits-to-doubleword-mask instruction.
  const Vec128<uint64_t> mask_vec{vec_gendm(mask_bits)};
#if HWY_IS_LITTLE_ENDIAN
  return MFromD<D>{MaskFromVec(mask_vec).raw};
#else
  // NOTE(review): Reverse compensates for opposite lane numbering of
  // vec_gendm on big-endian — confirm.
  return MFromD<D>{MaskFromVec(Reverse(Full128<uint64_t>(), mask_vec)).raw};
#endif  // HWY_IS_LITTLE_ENDIAN
#else   // PPC9 or earlier
  // Broadcast mask_bits and test one bit per lane (2 lanes).
  const __vector unsigned long long kBit = {1, 2};
  const auto vmask_bits =
      Set(Full128<uint64_t>(), static_cast<uint64_t>(mask_bits));
  return MFromD<D>{TestBit(vmask_bits, Vec128<uint64_t>{kBit}).raw};
#endif  // HWY_PPC_HAVE_10
}
  4413. } // namespace detail
  4414. // `p` points to at least 8 readable bytes, not all of which need be valid.
  4415. template <class D, HWY_IF_LANES_LE_D(D, 8)>
  4416. HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
  4417. // If there are 8 or fewer lanes, simply convert bits[0] to a uint64_t
  4418. uint64_t mask_bits = bits[0];
  4419. constexpr size_t kN = MaxLanes(d);
  4420. if (kN < 8) mask_bits &= (1u << kN) - 1;
  4421. return detail::LoadMaskBits128(d, mask_bits);
  4422. }
// Loads 16 mask bits (stored little-endian in bits[0..1]) into a lane mask.
template <class D, HWY_IF_LANES_D(D, 16)>
HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
  // First, copy the mask bits to a uint16_t as there as there are at most
  // 16 lanes in a vector.
  // Copying the mask bits to a uint16_t first will also ensure that the
  // mask bits are loaded into the lower 16 bits on big-endian PPC targets.
  uint16_t u16_mask_bits;
  CopyBytes<sizeof(uint16_t)>(bits, &u16_mask_bits);
#if HWY_IS_LITTLE_ENDIAN
  return detail::LoadMaskBits128(d, u16_mask_bits);
#else
  // On big-endian targets, u16_mask_bits need to be byte swapped as bits
  // contains the mask bits in little-endian byte order
  // GCC/Clang will optimize the load of u16_mask_bits and byte swap to a
  // single lhbrx instruction on big-endian PPC targets when optimizations
  // are enabled.
#if HWY_HAS_BUILTIN(__builtin_bswap16)
  return detail::LoadMaskBits128(d, __builtin_bswap16(u16_mask_bits));
#else
  // Portable fallback byte swap when the builtin is unavailable.
  return detail::LoadMaskBits128(
      d, static_cast<uint16_t>((u16_mask_bits << 8) | (u16_mask_bits >> 8)));
#endif
#endif
}
// Trait: whether Compress on this lane type returns a partition (selected
// lanes first, in order, followed by the rejected lanes, in order).
template <typename T>
struct CompressIsPartition {
  // generic_ops-inl does not guarantee IsPartition for 8-bit.
  enum { value = (sizeof(T) != 1) };
};
  4452. // ------------------------------ Dup128MaskFromMaskBits
  4453. template <class D>
  4454. HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
  4455. constexpr size_t kN = MaxLanes(d);
  4456. if (kN < 8) mask_bits &= (1u << kN) - 1;
  4457. return detail::LoadMaskBits128(d, mask_bits);
  4458. }
  4459. // ------------------------------ StoreMaskBits
  4460. namespace detail {
  4461. #if !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN
  4462. // fallback for missing vec_extractm
// Gathers one bit per byte of sign_bits, selected by bit_shuffle, into a
// scalar. Used to materialize per-lane mask bits. bit_shuffle holds bit
// indices for vbpermq/vec_bperm_u128; the value 128 selects a zero bit.
template <size_t N>
HWY_INLINE uint64_t ExtractSignBits(Vec128<uint8_t, N> sign_bits,
                                    __vector unsigned char bit_shuffle) {
  // clang POWER8 and 9 targets appear to differ in their return type of
  // vec_vbpermq: unsigned or signed, so cast to avoid a warning.
  using VU64 = detail::Raw128<uint64_t>::type;
#if HWY_S390X_HAVE_Z14
  const Vec128<uint64_t> extracted{
      reinterpret_cast<VU64>(vec_bperm_u128(sign_bits.raw, bit_shuffle))};
#else
  const Vec128<uint64_t> extracted{
      reinterpret_cast<VU64>(vec_vbpermq(sign_bits.raw, bit_shuffle))};
#endif
  // The permuted bits land in different 64-bit halves depending on
  // endianness; index 1 on little-endian, 0 on big-endian.
  return extracted.raw[HWY_IS_LITTLE_ENDIAN];
}
  4478. #endif // !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN
// Returns one bit per 8-bit lane (bit i = lane i true). Bits above the lane
// count are not cleared here; see OnlyActive.
template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T, N> mask) {
  const DFromM<decltype(mask)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
  // PPC10 extracts all lane MSBs in one instruction.
  return static_cast<uint64_t>(vec_extractm(sign_bits.raw));
#else  // Z14, Z15, PPC8, PPC9, or big-endian PPC10
  // Gather the MSB of each of the 16 bytes (bit index = 8*byte + 0, counted
  // from the most-significant end, hence 120, 112, ..., 0).
  const __vector unsigned char kBitShuffle = {120, 112, 104, 96, 88, 80, 72, 64,
                                              56,  48,  40,  32, 24, 16, 8,  0};
  return ExtractSignBits(sign_bits, kBitShuffle);
#endif  // HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
}
// Returns one bit per 16-bit lane (bit i = lane i true).
template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T, N> mask) {
  const DFromM<decltype(mask)> d;
  const RebindToUnsigned<decltype(d)> du;
  const Repartition<uint8_t, decltype(d)> du8;
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
  return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
#else  // Z14, Z15, PPC8, PPC9, or big-endian PPC10
  (void)du;  // only used on the PPC10 little-endian path
  // Gather the MSB of each 16-bit lane; 128 selects a zero bit for the
  // unused result positions.
#if HWY_IS_LITTLE_ENDIAN
  const __vector unsigned char kBitShuffle = {
      112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128};
#else
  const __vector unsigned char kBitShuffle = {
      128, 128, 128, 128, 128, 128, 128, 128, 112, 96, 80, 64, 48, 32, 16, 0};
#endif
  return ExtractSignBits(sign_bits, kBitShuffle);
#endif  // HWY_PPC_HAVE_10
}
// Returns one bit per 32-bit lane (bit i = lane i true).
template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) {
  const DFromM<decltype(mask)> d;
  const RebindToUnsigned<decltype(d)> du;
  const Repartition<uint8_t, decltype(d)> du8;
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
  return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
#else  // Z14, Z15, PPC8, PPC9, or big-endian PPC10
  (void)du;  // only used on the PPC10 little-endian path
  // Gather the MSB of each 32-bit lane; 128 selects a zero bit.
#if HWY_IS_LITTLE_ENDIAN
  const __vector unsigned char kBitShuffle = {96,  64,  32,  0,   128, 128,
                                              128, 128, 128, 128, 128, 128,
                                              128, 128, 128, 128};
#else
  const __vector unsigned char kBitShuffle = {128, 128, 128, 128, 128, 128,
                                              128, 128, 128, 128, 128, 128,
                                              96,  64,  32,  0};
#endif
  return ExtractSignBits(sign_bits, kBitShuffle);
#endif  // HWY_PPC_HAVE_10
}
// Returns one bit per 64-bit lane (bit i = lane i true).
template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, N> mask) {
  const DFromM<decltype(mask)> d;
  const RebindToUnsigned<decltype(d)> du;
  const Repartition<uint8_t, decltype(d)> du8;
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
  return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
#else  // Z14, Z15, PPC8, PPC9, or big-endian PPC10
  (void)du;  // only used on the PPC10 little-endian path
  // Gather the MSB of each 64-bit lane; 128 selects a zero bit.
#if HWY_IS_LITTLE_ENDIAN
  const __vector unsigned char kBitShuffle = {64,  0,   128, 128, 128, 128,
                                              128, 128, 128, 128, 128, 128,
                                              128, 128, 128, 128};
#else
  const __vector unsigned char kBitShuffle = {128, 128, 128, 128, 128, 128,
                                              128, 128, 128, 128, 128, 128,
                                              128, 128, 64,  0};
#endif
  return ExtractSignBits(sign_bits, kBitShuffle);
#endif  // HWY_PPC_HAVE_10
}
  4556. // Returns the lowest N of the mask bits.
  4557. template <typename T, size_t N>
  4558. constexpr uint64_t OnlyActive(uint64_t mask_bits) {
  4559. return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
  4560. }
// Returns the mask as a bitfield (bit i = lane i), with bits beyond the lane
// count cleared. Dispatches on lane size.
template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(Mask128<T, N> mask) {
  return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
}
  4565. } // namespace detail
  4566. // `p` points to at least 8 writable bytes.
  4567. template <class D, HWY_IF_LANES_LE_D(D, 8)>
  4568. HWY_API size_t StoreMaskBits(D /*d*/, MFromD<D> mask, uint8_t* bits) {
  4569. // For vectors with 8 or fewer lanes, simply cast the result of BitsFromMask
  4570. // to an uint8_t and store the result in bits[0].
  4571. bits[0] = static_cast<uint8_t>(detail::BitsFromMask(mask));
  4572. return sizeof(uint8_t);
  4573. }
// Stores all 16 mask bits as two little-endian bytes; returns bytes written.
template <class D, HWY_IF_LANES_D(D, 16)>
HWY_API size_t StoreMaskBits(D /*d*/, MFromD<D> mask, uint8_t* bits) {
  const auto mask_bits = detail::BitsFromMask(mask);
  // First convert mask_bits to a uint16_t as we only want to store
  // the lower 16 bits of mask_bits as there are 16 lanes in mask.
  // Converting mask_bits to a uint16_t first will also ensure that
  // the lower 16 bits of mask_bits are stored instead of the upper 16 bits
  // of mask_bits on big-endian PPC targets.
#if HWY_IS_LITTLE_ENDIAN
  const uint16_t u16_mask_bits = static_cast<uint16_t>(mask_bits);
#else
  // On big-endian targets, the bytes of mask_bits need to be swapped
  // as StoreMaskBits expects the mask bits to be stored in little-endian
  // byte order.
  // GCC will also optimize the byte swap and CopyBytes operations below
  // to a single sthbrx instruction when optimizations are enabled on
  // big-endian PPC targets
#if HWY_HAS_BUILTIN(__builtin_bswap16)
  const uint16_t u16_mask_bits =
      __builtin_bswap16(static_cast<uint16_t>(mask_bits));
#else
  // Portable fallback byte swap when the builtin is unavailable.
  const uint16_t u16_mask_bits = static_cast<uint16_t>(
      (mask_bits << 8) | (static_cast<uint16_t>(mask_bits) >> 8));
#endif
#endif
  CopyBytes<sizeof(uint16_t)>(&u16_mask_bits, bits);
  return sizeof(uint16_t);
}
  4602. // ------------------------------ Mask testing
// Returns true if every lane of a full 128-bit mask is false.
template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API bool AllFalse(D d, MFromD<D> mask) {
  const RebindToUnsigned<decltype(d)> du;
  // Mask lanes are all-ones or all-zeros, so comparing every lane against
  // zero detects any true lane.
  return static_cast<bool>(
      vec_all_eq(VecFromMask(du, RebindMask(du, mask)).raw, Zero(du).raw));
}
// Returns true if every lane of a full 128-bit mask is true.
template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API bool AllTrue(D d, MFromD<D> mask) {
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  // A true mask lane is all-ones, i.e. equal to the unsigned maximum.
  return static_cast<bool>(vec_all_eq(VecFromMask(du, RebindMask(du, mask)).raw,
                                      Set(du, hwy::LimitsMax<TU>()).raw));
}
  4616. template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
  4617. HWY_API bool AllFalse(D d, MFromD<D> mask) {
  4618. const Full128<TFromD<D>> d_full;
  4619. constexpr size_t kN = MaxLanes(d);
  4620. return AllFalse(d_full,
  4621. And(MFromD<decltype(d_full)>{mask.raw}, FirstN(d_full, kN)));
  4622. }
  4623. template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
  4624. HWY_API bool AllTrue(D d, MFromD<D> mask) {
  4625. const Full128<TFromD<D>> d_full;
  4626. constexpr size_t kN = MaxLanes(d);
  4627. return AllTrue(
  4628. d_full, Or(MFromD<decltype(d_full)>{mask.raw}, Not(FirstN(d_full, kN))));
  4629. }
// Returns the number of true lanes in the mask.
template <class D>
HWY_API size_t CountTrue(D /* tag */, MFromD<D> mask) {
  // One bit per lane, so a popcount suffices.
  return PopCount(detail::BitsFromMask(mask));
}
  4634. #if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  4635. namespace detail {
// Counts, starting from lane 0, the bytes of v whose least-significant bit is
// clear — i.e. the byte index of the first byte with LSB set (16 if none).
// Callers pass mask bytes (all-ones/all-zeros), so this finds the first true
// byte in element order.
template <class V>
static HWY_INLINE size_t VsxCntlzLsbb(V v) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200 && \
    HWY_IS_LITTLE_ENDIAN
  // Use inline assembly to work around bug in GCC 11 and earlier on
  // little-endian PPC9.
  // NOTE(review): the "trailing" mnemonic vctzlsbb here is believed to be
  // intentional, not a typo: on little-endian, element 0 sits at the
  // register's opposite end, so counting from element 0 maps to the
  // hardware's trailing count — confirm against the PowerISA definition.
  int idx;
  __asm__("vctzlsbb %0,%1" : "=r"(idx) : "v"(v.raw));
  return static_cast<size_t>(idx);
#else
  // Compilers without the bug apply the endian correction inside the
  // vec_cntlz_lsbb intrinsic itself.
  return static_cast<size_t>(vec_cntlz_lsbb(v.raw));
#endif
}
// Counts, starting from the last lane, the bytes of v whose least-significant
// bit is clear — i.e. the distance of the last byte with LSB set from the end
// (16 if none). Mirror of VsxCntlzLsbb.
template <class V>
static HWY_INLINE size_t VsxCnttzLsbb(V v) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200 && \
    HWY_IS_LITTLE_ENDIAN
  // Use inline assembly to work around bug in GCC 11 and earlier on
  // little-endian PPC9.
  // NOTE(review): the "leading" mnemonic vclzlsbb is believed intentional —
  // the little-endian element-order reversal swaps leading/trailing; confirm
  // against the PowerISA definition.
  int idx;
  __asm__("vclzlsbb %0,%1" : "=r"(idx) : "v"(v.raw));
  return static_cast<size_t>(idx);
#else
  // Compilers without the bug apply the endian correction inside the
  // vec_cnttz_lsbb intrinsic itself.
  return static_cast<size_t>(vec_cnttz_lsbb(v.raw));
#endif
}
  4662. } // namespace detail
  4663. #endif
// Returns the index of the first (lowest-index) true lane.
// Precondition: the mask has at least one true lane.
template <class D, typename T = TFromD<D>>
HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
  // For little-endian PPC10, BitsFromMask is already efficient.
#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  if (detail::IsFull(d)) {
    // Count bytes before the first true mask byte, then convert to lanes.
    const Repartition<uint8_t, D> d8;
    const auto bytes = BitCast(d8, VecFromMask(d, mask));
    return detail::VsxCntlzLsbb(bytes) / sizeof(T);
  }
#endif  // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  (void)d;
  // Index of the lowest set mask bit; nonzero by precondition.
  return Num0BitsBelowLS1Bit_Nonzero64(detail::BitsFromMask(mask));
}
// Returns the index of the first true lane, or -1 if the mask is all-false.
template <class D, typename T = TFromD<D>>
HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
  // For little-endian PPC10, BitsFromMask is already efficient.
#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  constexpr size_t kN = 16 / sizeof(T);
  if (detail::IsFull(d)) {
    const Repartition<uint8_t, D> d8;
    const auto bytes = BitCast(d8, VecFromMask(d, mask));
    // A count of kN means no true lane was found.
    const size_t idx = detail::VsxCntlzLsbb(bytes) / sizeof(T);
    return idx == kN ? -1 : static_cast<intptr_t>(idx);
  }
#endif  // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  (void)d;
  const uint64_t mask_bits = detail::BitsFromMask(mask);
  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1;
}
// Returns the index of the last (highest-index) true lane.
// Precondition: the mask has at least one true lane.
template <class D, typename T = TFromD<D>>
HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
  // For little-endian PPC10, BitsFromMask is already efficient.
#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  if (detail::IsFull(d)) {
    const Repartition<uint8_t, D> d8;
    const auto bytes = BitCast(d8, VecFromMask(d, mask));
    // VsxCnttzLsbb counts from the last lane, so convert to a forward index.
    const size_t idx = detail::VsxCnttzLsbb(bytes) / sizeof(T);
    return 16 / sizeof(T) - 1 - idx;
  }
#endif  // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  (void)d;
  // Index of the highest set mask bit; nonzero by precondition.
  return 63 - Num0BitsAboveMS1Bit_Nonzero64(detail::BitsFromMask(mask));
}
// Returns the index of the last true lane, or -1 if the mask is all-false.
template <class D, typename T = TFromD<D>>
HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
  // For little-endian PPC10, BitsFromMask is already efficient.
#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  constexpr size_t kN = 16 / sizeof(T);
  if (detail::IsFull(d)) {
    const Repartition<uint8_t, D> d8;
    const auto bytes = BitCast(d8, VecFromMask(d, mask));
    // Count from the last lane; kN means no true lane was found.
    const size_t idx = detail::VsxCnttzLsbb(bytes) / sizeof(T);
    return idx == kN ? -1 : static_cast<intptr_t>(kN - 1 - idx);
  }
#endif  // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  (void)d;
  const uint64_t mask_bits = detail::BitsFromMask(mask);
  return mask_bits ? intptr_t(63 - Num0BitsAboveMS1Bit_Nonzero64(mask_bits))
                   : -1;
}
  4724. // ------------------------------ Compress, CompressBits
  4725. namespace detail {
  4726. #if HWY_PPC_HAVE_10
// PPC10: generates the byte-permute control vector used by Compress
// (kIsCompress=true) or Expand (kIsCompress=false) for 8-bit lanes.
template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
  // Immediate mode: bit 0 selects compress vs. expand, bit 1 selects
  // little-endian index numbering.
  constexpr unsigned kGenPcvmMode =
      (kIsCompress ? 1u : 0u) | (HWY_IS_LITTLE_ENDIAN ? 2u : 0u);
  // Inline assembly is used instead of the vec_genpcvm intrinsic to work around
  // compiler bugs on little-endian PPC10
  typename detail::Raw128<TFromD<D>>::type idx;
  __asm__("xxgenpcvbm %x0, %1, %2"
          : "=wa"(idx)
          : "v"(mask.raw), "i"(kGenPcvmMode));
  return VFromD<decltype(d)>{idx};
}
// PPC10: as above, but for 16-bit lanes (xxgenpcvhm).
template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
  // Immediate mode: bit 0 = compress vs. expand, bit 1 = little-endian.
  constexpr unsigned kGenPcvmMode =
      (kIsCompress ? 1u : 0u) | (HWY_IS_LITTLE_ENDIAN ? 2u : 0u);
  // Inline assembly is used instead of the vec_genpcvm intrinsic to work around
  // compiler bugs on little-endian PPC10
  typename detail::Raw128<TFromD<D>>::type idx;
  __asm__("xxgenpcvhm %x0, %1, %2"
          : "=wa"(idx)
          : "v"(mask.raw), "i"(kGenPcvmMode));
  return VFromD<decltype(d)>{idx};
}
// PPC10: as above, but for 32-bit lanes (xxgenpcvwm).
template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
  // Immediate mode: bit 0 = compress vs. expand, bit 1 = little-endian.
  constexpr unsigned kGenPcvmMode =
      (kIsCompress ? 1u : 0u) | (HWY_IS_LITTLE_ENDIAN ? 2u : 0u);
  // Inline assembly is used instead of the vec_genpcvm intrinsic to work around
  // compiler bugs on little-endian PPC10
  typename detail::Raw128<TFromD<D>>::type idx;
  __asm__("xxgenpcvwm %x0, %1, %2"
          : "=wa"(idx)
          : "v"(mask.raw), "i"(kGenPcvmMode));
  return VFromD<decltype(d)>{idx};
}
  4763. #endif
  4764. // Also works for N < 8 because the first 16 4-tuples only reference bytes 0-6.
// Returns byte-shuffle indices that gather the 16-bit lanes selected by
// mask_bits (one bit per lane, up to 8 lanes) to the front, preserving their
// order, followed by the unselected lanes in order. Table entry i (for
// mask_bits == i) lists the 8 lane offsets (already doubled to byte offsets)
// in that partition order; ZipLower + the per-byte increment expand each lane
// offset into its two byte indices.
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 256);
  const Rebind<uint8_t, decltype(d)> d8;
  const Twice<decltype(d8)> d8t;
  const RebindToUnsigned<decltype(d)> du;
  // To reduce cache footprint, store lane indices and convert to byte indices
  // (2*lane + 0..1), with the doubling baked into the table. It's not clear
  // that the additional cost of unpacking nibbles is worthwhile.
  alignas(16) static constexpr uint8_t table[2048] = {
      // PrintCompress16x8Tables
      0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14,  //
      2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14,  //
      4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14,  //
      2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14,  //
      6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14,  //
      2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14,  //
      4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14,  //
      2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14,  //
      8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14,  //
      2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14,  //
      4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14,  //
      2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14,  //
      6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14,  //
      2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14,  //
      4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14,  //
      2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14,  //
      10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14,  //
      2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14,  //
      4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14,  //
      2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14,  //
      6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14,  //
      2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14,  //
      4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14,  //
      2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14,  //
      8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14,  //
      2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14,  //
      4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14,  //
      2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14,  //
      6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14,  //
      2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14,  //
      4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14,  //
      2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14,  //
      12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14,  //
      2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14,  //
      4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14,  //
      2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14,  //
      6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14,  //
      2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14,  //
      4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14,  //
      2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14,  //
      8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14,  //
      2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14,  //
      4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14,  //
      2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14,  //
      6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14,  //
      2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14,  //
      4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14,  //
      2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14,  //
      10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14,  //
      2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14,  //
      4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14,  //
      2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14,  //
      6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14,  //
      2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14,  //
      4, 6, 10, 12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14,  //
      2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14,  //
      8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14,  //
      2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14,  //
      4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14,  //
      2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14,  //
      6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14,  //
      2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14,  //
      4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14,  //
      2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14,  //
      14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12,  //
      2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12,  //
      4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12,  //
      2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12,  //
      6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12,  //
      2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12,  //
      4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12,  //
      2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12,  //
      8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12,  //
      2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12,  //
      4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12,  //
      2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12,  //
      6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12,  //
      2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12,  //
      4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12,  //
      2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12,  //
      10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12,  //
      2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12,  //
      4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12,  //
      2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12,  //
      6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12,  //
      2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12,  //
      4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12,  //
      2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12,  //
      8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12,  //
      2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12,  //
      4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12,  //
      2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12,  //
      6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12,  //
      2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12,  //
      4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12,  //
      2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12,  //
      12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10,  //
      2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10,  //
      4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10,  //
      2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10,  //
      6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10,  //
      2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10,  //
      4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10,  //
      2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10,  //
      8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10,  //
      2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10,  //
      4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10,  //
      2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10,  //
      6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10,  //
      2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 10,  //
      4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10,  //
      2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10,  //
      10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8,  //
      2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8,  //
      4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8,  //
      2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8,  //
      6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8,  //
      2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8,  //
      4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8,  //
      2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8,  //
      8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6,  //
      2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6,  //
      4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6,  //
      2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6,  //
      6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4,  //
      2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4,  //
      4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2,  //
      2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14};
  // Load the 8 lane offsets for this mask value...
  const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw};
  // ...duplicate each into a 16-bit pair...
  const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx);
  // ...and turn each pair into (2*lane, 2*lane+1) byte indices; the +1 goes
  // on the high byte on little-endian, the low byte on big-endian.
  constexpr uint16_t kPairIndexIncrement =
      HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001;
  return BitCast(d, pairs + Set(du, kPairIndexIncrement));
}
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
  // Returns TableLookupBytes indices that move the 16-bit lanes whose mask
  // bit is CLEAR to the front (the complement of IndicesFromBits128).
  HWY_DASSERT(mask_bits < 256);  // at most 8 lanes of 2 bytes in 128 bits
  const Rebind<uint8_t, decltype(d)> d8;  // one u8 index per u16 lane
  const Twice<decltype(d8)> d8t;
  const RebindToUnsigned<decltype(d)> du;

  // To reduce cache footprint, store lane indices and convert to byte indices
  // (2*lane + 0..1), with the doubling baked into the table. It's not clear
  // that the additional cost of unpacking nibbles is worthwhile.
  // 256 mask patterns * 8 bytes per pattern; each byte is the offset of the
  // even byte of a source lane (already doubled).
  alignas(16) static constexpr uint8_t table[2048] = {
      // PrintCompressNot16x8Tables
      0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0,  //
      0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2,  //
      0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4,  //
      0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4,  //
      0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6,  //
      0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6,  //
      0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6,  //
      0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6,  //
      0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8,  //
      0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8,  //
      0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8,  //
      0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8,  //
      0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8,  //
      0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8,  //
      0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8,  //
      0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8,  //
      0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10,  //
      0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10,  //
      0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10,  //
      0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10,  //
      0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10,  //
      0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10,  //
      0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10,  //
      0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10,  //
      0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10,  //
      0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10,  //
      0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10,  //
      0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10,  //
      0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10,  //
      0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10,  //
      0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10,  //
      0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10,  //
      0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12,  //
      0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12,  //
      0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12,  //
      0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12,  //
      0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12,  //
      0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12,  //
      0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12,  //
      0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12,  //
      0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12,  //
      0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12,  //
      0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12,  //
      0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12,  //
      0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12,  //
      0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12,  //
      0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12,  //
      0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12,  //
      0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12,  //
      0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12,  //
      0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12,  //
      0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12,  //
      0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12,  //
      0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12,  //
      0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12,  //
      0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12,  //
      0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12,  //
      0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12,  //
      0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12,  //
      0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12,  //
      0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12,  //
      0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12,  //
      0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12,  //
      0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12,  //
      0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14,  //
      0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14,  //
      0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14,  //
      0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14,  //
      0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14,  //
      0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14,  //
      0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14,  //
      0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14,  //
      0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14,  //
      0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14,  //
      0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14,  //
      0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14,  //
      0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14,  //
      0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14,  //
      0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14,  //
      0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14,  //
      0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14,  //
      0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14,  //
      0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14,  //
      0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14,  //
      0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14,  //
      0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14,  //
      0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14,  //
      0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14,  //
      0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14,  //
      0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14,  //
      0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14,  //
      0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14,  //
      0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14,  //
      0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14,  //
      0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14,  //
      0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14,  //
      0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14,  //
      0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14,  //
      0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14,  //
      0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14,  //
      0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14,  //
      0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14,  //
      0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14,  //
      0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14,  //
      0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14,  //
      0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14,  //
      0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14,  //
      0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14,  //
      0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14,  //
      0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14,  //
      0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14,  //
      0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14,  //
      0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14,  //
      0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14,  //
      0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14,  //
      0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14,  //
      0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14,  //
      0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14,  //
      0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14,  //
      0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14,  //
      0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14,  //
      0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14,  //
      0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14,  //
      0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14,  //
      0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14,  //
      0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14,  //
      0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14,  //
      0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14};
  // Load the 8 lane indices for this mask pattern into the lower half.
  const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw};
  // Duplicate each index into both bytes of a u16 lane: {idx, idx}.
  const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx);
  // Add 1 to the byte holding the odd half of each pair so each u16 becomes
  // {idx, idx + 1}, i.e. the two byte offsets of the source lane. Which byte
  // of the u16 needs the +1 depends on byte order.
  constexpr uint16_t kPairIndexIncrement =
      HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001;
  return BitCast(d, pairs + Set(du, kPairIndexIncrement));
}
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
  // Returns TableLookupBytes indices that move the 32-bit lanes whose mask
  // bit is SET to the front, preserving their order.
  HWY_DASSERT(mask_bits < 16);  // at most 4 lanes of 4 bytes in 128 bits
  // There are only 4 lanes, so we can afford to load the index vector directly.
  // 16 mask patterns * 16 bytes each; entries are byte offsets.
  alignas(16) static constexpr uint8_t u8_indices[256] = {
      // PrintCompress32x4Tables
      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
      4,  5,  6,  7,  0,  1,  2,  3,  8,  9,  10, 11, 12, 13, 14, 15,  //
      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
      8,  9,  10, 11, 0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15,  //
      0,  1,  2,  3,  8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15,  //
      4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15,  //
      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
      12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,  //
      0,  1,  2,  3,  12, 13, 14, 15, 4,  5,  6,  7,  8,  9,  10, 11,  //
      4,  5,  6,  7,  12, 13, 14, 15, 0,  1,  2,  3,  8,  9,  10, 11,  //
      0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15, 8,  9,  10, 11,  //
      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,   //
      0,  1,  2,  3,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,   //
      4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,   //
      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15};
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
  // Returns TableLookupBytes indices that move the 32-bit lanes whose mask
  // bit is CLEAR to the front (the complement of IndicesFromBits128).
  HWY_DASSERT(mask_bits < 16);  // at most 4 lanes of 4 bytes in 128 bits
  // There are only 4 lanes, so we can afford to load the index vector directly.
  // 16 mask patterns * 16 bytes each; entries are byte offsets.
  alignas(16) static constexpr uint8_t u8_indices[256] = {
      // PrintCompressNot32x4Tables
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
      6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
      8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
      14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
      12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
      2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
      10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
      2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
      0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
      6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
      10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
      12, 13, 14, 15};
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
  // Returns TableLookupBytes indices that move the 64-bit lanes whose mask
  // bit is SET to the front.
  HWY_DASSERT(mask_bits < 4);  // at most 2 lanes of 8 bytes in 128 bits
  // There are only 2 lanes, so we can afford to load the index vector directly.
  // 4 mask patterns * 16 bytes; only pattern 0b10 requires a swap.
  alignas(16) static constexpr uint8_t u8_indices[64] = {
      // PrintCompress64x2Tables
      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
  // Returns TableLookupBytes indices that move the 64-bit lanes whose mask
  // bit is CLEAR to the front (the complement of IndicesFromBits128).
  HWY_DASSERT(mask_bits < 4);  // at most 2 lanes of 8 bytes in 128 bits
  // There are only 2 lanes, so we can afford to load the index vector directly.
  // 4 mask patterns * 16 bytes; only pattern 0b01 requires a swap.
  alignas(16) static constexpr uint8_t u8_indices[64] = {
      // PrintCompressNot64x2Tables
      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}
  5130. template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
  5131. HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, uint64_t mask_bits) {
  5132. const DFromV<decltype(v)> d;
  5133. const RebindToUnsigned<decltype(d)> du;
  5134. HWY_DASSERT(mask_bits < (1ull << N));
  5135. const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
  5136. return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
  5137. }
  5138. template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
  5139. HWY_API Vec128<T, N> CompressNotBits(Vec128<T, N> v, uint64_t mask_bits) {
  5140. const DFromV<decltype(v)> d;
  5141. const RebindToUnsigned<decltype(d)> du;
  5142. HWY_DASSERT(mask_bits < (1ull << N));
  5143. const auto indices = BitCast(du, detail::IndicesFromNotBits128(d, mask_bits));
  5144. return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
  5145. }
  5146. } // namespace detail
// Single lane: no-op
template <typename T>
HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
  // With one lane there is nothing to move, so the mask is irrelevant.
  return v;
}
  5152. // Two lanes: conditional swap
  5153. template <typename T, HWY_IF_T_SIZE(T, 8)>
  5154. HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
  5155. // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
  5156. const Full128<T> d;
  5157. const Vec128<T> m = VecFromMask(d, mask);
  5158. const Vec128<T> maskL = DupEven(m);
  5159. const Vec128<T> maskH = DupOdd(m);
  5160. const Vec128<T> swap = AndNot(maskL, maskH);
  5161. return IfVecThenElse(swap, Shuffle01(v), v);
  5162. }
  5163. #if HWY_PPC_HAVE_10
  5164. #ifdef HWY_NATIVE_COMPRESS8
  5165. #undef HWY_NATIVE_COMPRESS8
  5166. #else
  5167. #define HWY_NATIVE_COMPRESS8
  5168. #endif
  5169. // General case, 1 byte
  5170. template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
  5171. HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
  5172. const DFromV<decltype(v)> d;
  5173. return TableLookupBytes(
  5174. v, detail::CompressOrExpandIndicesFromMask<true>(d, mask));
  5175. }
  5176. #endif
  5177. // General case, 2 or 4 bytes
  5178. template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
  5179. HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
  5180. return detail::CompressBits(v, detail::BitsFromMask(mask));
  5181. }
// ------------------------------ CompressNot

// Single lane: no-op
template <typename T>
HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
  // With one lane there is nothing to move, so the mask is irrelevant.
  return v;
}
  5188. // Two lanes: conditional swap
  5189. template <typename T, HWY_IF_T_SIZE(T, 8)>
  5190. HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
  5191. // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
  5192. const Full128<T> d;
  5193. const Vec128<T> m = VecFromMask(d, mask);
  5194. const Vec128<T> maskL = DupEven(m);
  5195. const Vec128<T> maskH = DupOdd(m);
  5196. const Vec128<T> swap = AndNot(maskH, maskL);
  5197. return IfVecThenElse(swap, Shuffle01(v), v);
  5198. }
  5199. #if HWY_PPC_HAVE_10
  5200. // General case, 1 byte
  5201. template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
  5202. HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
  5203. const DFromV<decltype(v)> d;
  5204. return TableLookupBytes(
  5205. v, detail::CompressOrExpandIndicesFromMask<true>(d, Not(mask)));
  5206. }
  5207. #endif
// General case, 2 or 4 bytes: keeps lanes whose mask is FALSE.
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
  // For partial vectors, we cannot pull the Not() into the table because
  // BitsFromMask clears the upper bits.
  if (N < 16 / sizeof(T)) {
    // Invert the mask first so the cleared upper bits remain "not selected".
    return detail::CompressBits(v, detail::BitsFromMask(Not(mask)));
  }
  // Full vector: use the precomputed complement table directly.
  return detail::CompressNotBits(v, detail::BitsFromMask(mask));
}
// ------------------------------ CompressBlocksNot
HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
                                           Mask128<uint64_t> /* m */) {
  // A 128-bit vector is a single block, so there is nothing to compress.
  return v;
}
  5223. #if HWY_PPC_HAVE_10
  5224. template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
  5225. HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
  5226. const uint8_t* HWY_RESTRICT bits) {
  5227. const DFromV<decltype(v)> d;
  5228. return Compress(v, LoadMaskBits(d, bits));
  5229. }
  5230. #endif
  5231. template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
  5232. HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
  5233. const uint8_t* HWY_RESTRICT bits) {
  5234. // As there are at most 8 lanes in v if sizeof(TFromD<D>) > 1, simply
  5235. // convert bits[0] to a uint64_t
  5236. uint64_t mask_bits = bits[0];
  5237. if (N < 8) {
  5238. mask_bits &= (1ull << N) - 1;
  5239. }
  5240. return detail::CompressBits(v, mask_bits);
  5241. }
  5242. // ------------------------------ CompressStore, CompressBitsStore
  5243. #if HWY_PPC_HAVE_10
  5244. template <class D, HWY_IF_T_SIZE_D(D, 1)>
  5245. HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
  5246. TFromD<D>* HWY_RESTRICT unaligned) {
  5247. const size_t count = CountTrue(d, m);
  5248. const auto indices = detail::CompressOrExpandIndicesFromMask<true>(d, m);
  5249. const auto compressed = TableLookupBytes(v, indices);
  5250. StoreU(compressed, d, unaligned);
  5251. return count;
  5252. }
  5253. #endif
  5254. template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
  5255. HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
  5256. TFromD<D>* HWY_RESTRICT unaligned) {
  5257. const RebindToUnsigned<decltype(d)> du;
  5258. const uint64_t mask_bits = detail::BitsFromMask(m);
  5259. HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
  5260. const size_t count = PopCount(mask_bits);
  5261. const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
  5262. const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
  5263. StoreU(compressed, d, unaligned);
  5264. return count;
  5265. }
  5266. #if HWY_PPC_HAVE_10
  5267. template <class D, HWY_IF_T_SIZE_D(D, 1)>
  5268. HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
  5269. TFromD<D>* HWY_RESTRICT unaligned) {
  5270. const size_t count = CountTrue(d, m);
  5271. const auto indices = detail::CompressOrExpandIndicesFromMask<true>(d, m);
  5272. const auto compressed = TableLookupBytes(v, indices);
  5273. StoreN(compressed, d, unaligned, count);
  5274. return count;
  5275. }
  5276. #endif
  5277. template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
  5278. HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
  5279. TFromD<D>* HWY_RESTRICT unaligned) {
  5280. const RebindToUnsigned<decltype(d)> du;
  5281. const uint64_t mask_bits = detail::BitsFromMask(m);
  5282. HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
  5283. const size_t count = PopCount(mask_bits);
  5284. const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
  5285. const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
  5286. #if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
  5287. StoreN(compressed, d, unaligned, count);
  5288. #else
  5289. BlendedStore(compressed, FirstN(d, count), d, unaligned);
  5290. #endif
  5291. return count;
  5292. }
  5293. #if HWY_PPC_HAVE_10
  5294. template <class D, HWY_IF_T_SIZE_D(D, 1)>
  5295. HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
  5296. D d, TFromD<D>* HWY_RESTRICT unaligned) {
  5297. return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
  5298. }
  5299. #endif
  5300. template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
  5301. HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
  5302. D d, TFromD<D>* HWY_RESTRICT unaligned) {
  5303. const RebindToUnsigned<decltype(d)> du;
  5304. // As there are at most 8 lanes in v if sizeof(TFromD<D>) > 1, simply
  5305. // convert bits[0] to a uint64_t
  5306. uint64_t mask_bits = bits[0];
  5307. constexpr size_t kN = MaxLanes(d);
  5308. if (kN < 8) {
  5309. mask_bits &= (1ull << kN) - 1;
  5310. }
  5311. const size_t count = PopCount(mask_bits);
  5312. const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
  5313. const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
  5314. StoreU(compressed, d, unaligned);
  5315. return count;
  5316. }
  5317. // ------------------------------ Expand
  5318. #if HWY_PPC_HAVE_10
  5319. #ifdef HWY_NATIVE_EXPAND
  5320. #undef HWY_NATIVE_EXPAND
  5321. #else
  5322. #define HWY_NATIVE_EXPAND
  5323. #endif
  5324. template <typename T, size_t N,
  5325. HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
  5326. HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
  5327. const DFromV<decltype(v)> d;
  5328. const auto idx = detail::CompressOrExpandIndicesFromMask<false>(d, mask);
  5329. return IfThenElseZero(mask, TableLookupBytes(v, idx));
  5330. }
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> Expand(Vec128<T> v, Mask128<T> mask) {
  // Same as Compress, just zero out the mask=false lanes.
  // (With only two lanes, Compress and Expand place lanes identically.)
  return IfThenElseZero(mask, Compress(v, mask));
}
// For single-element vectors, this is at least as fast as native.
template <typename T>
HWY_API Vec128<T, 1> Expand(Vec128<T, 1> v, Mask128<T, 1> mask) {
  // One lane: expand reduces to masking the single value.
  return IfThenElseZero(mask, v);
}
  5341. template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
  5342. HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
  5343. const TFromD<D>* HWY_RESTRICT unaligned) {
  5344. return Expand(LoadU(d, unaligned), mask);
  5345. }
  5346. #endif // HWY_PPC_HAVE_10
  5347. // ------------------------------ StoreInterleaved2/3/4
  5348. // HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
  5349. // generic_ops-inl.h.
  5350. // ------------------------------ Additional mask logical operations
namespace detail {

// Helpers that put lanes into little-endian order so the integer bit tricks
// below (Neg / I128Subtract) see bit significance that matches lane order.
// On little-endian targets this already holds, so they are no-ops; on
// big-endian targets the lanes within each 64-bit (or 128-bit) block are
// reversed.
#if HWY_IS_LITTLE_ENDIAN
template <class V>
HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
  // Little-endian: lane order already matches bit order.
  return v;
}

template <class V>
HWY_INLINE V Per128BitBlkRevLanesOnBe(V v) {
  // Little-endian: lane order already matches bit order.
  return v;
}
#else
// Big-endian: reverse all lanes within each 64-bit block; the number of
// lanes per block depends on the lane size.
template <class V, HWY_IF_T_SIZE_V(V, 1)>
HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
  const DFromV<decltype(v)> d;
  return Reverse8(d, v);  // 8 one-byte lanes per 64-bit block
}

template <class V, HWY_IF_T_SIZE_V(V, 2)>
HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
  const DFromV<decltype(v)> d;
  return Reverse4(d, v);  // 4 two-byte lanes per 64-bit block
}

template <class V, HWY_IF_T_SIZE_V(V, 4)>
HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
  const DFromV<decltype(v)> d;
  return Reverse2(d, v);  // 2 four-byte lanes per 64-bit block
}

template <class V, HWY_IF_T_SIZE_V(V, 8)>
HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
  // A 64-bit lane fills the whole block; nothing to reverse.
  return v;
}

template <class V>
HWY_INLINE V Per128BitBlkRevLanesOnBe(V v) {
  const DFromV<decltype(v)> d;
  return Reverse(d, v);  // reverse all lanes of the 128-bit vector
}
#endif
// Returns a - b, treating each 128-bit vector as one 128-bit integer.
template <class V>
HWY_INLINE V I128Subtract(V a, V b) {
#if HWY_S390X_HAVE_Z14
  // Z14+: native 128-bit subtract on byte vectors.
  const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
      vec_sub_u128(reinterpret_cast<__vector unsigned char>(a.raw),
                   reinterpret_cast<__vector unsigned char>(b.raw)))};
#elif defined(__SIZEOF_INT128__)
  // Compilers with __int128: vec_sub on a single unsigned __int128 lane.
  using VU128 = __vector unsigned __int128;
  const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
      vec_sub(reinterpret_cast<VU128>(a.raw), reinterpret_cast<VU128>(b.raw)))};
#else
  // Fallback: subtract the 64-bit halves and propagate the borrow from the
  // low half into the high half.
  const DFromV<decltype(a)> d;
  const Repartition<uint64_t, decltype(d)> du64;
  const auto u64_a = BitCast(du64, a);
  const auto u64_b = BitCast(du64, b);
  const auto diff_u64 = u64_a - u64_b;
  // All-ones in a lane iff that 64-bit subtraction borrowed.
  const auto borrow_u64 = VecFromMask(du64, u64_a < u64_b);
  // Move the low half's borrow into the high half's lane (shift direction
  // depends on lane order); adding all-ones there subtracts one.
#if HWY_IS_LITTLE_ENDIAN
  const auto borrow_u64_shifted = ShiftLeftBytes<8>(du64, borrow_u64);
#else
  const auto borrow_u64_shifted = ShiftRightBytes<8>(du64, borrow_u64);
#endif
  const auto diff_i128 = BitCast(d, diff_u64 + borrow_u64_shifted);
#endif
  return diff_i128;
}
  5413. } // namespace detail
// Sets all lanes at or after the first true lane. One lane: identity.
template <class T>
HWY_API Mask128<T, 1> SetAtOrAfterFirst(Mask128<T, 1> mask) {
  return mask;
}
  5418. template <class T>
  5419. HWY_API Mask128<T, 2> SetAtOrAfterFirst(Mask128<T, 2> mask) {
  5420. const FixedTag<T, 2> d;
  5421. const auto vmask = VecFromMask(d, mask);
  5422. return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask)));
  5423. }
// Sets all lanes at or after the first true lane; partial vectors (> 2 lanes,
// total size <= 8 bytes) fit in a single 64-bit block.
template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) {
  const Simd<T, N, 0> d;
  const Full64<T> d_full64;
  const auto vmask = VecFromMask(d, mask);
  // View the mask vector as one little-endian int64 (reversing lanes on
  // big-endian targets so bit significance matches lane order).
  const auto vmask_le64 =
      BitCast(Full64<int64_t>(),
              detail::Per64BitBlkRevLanesOnBe(ResizeBitCast(d_full64, vmask)));
  // -x sets all bits at and above the lowest set bit of x, so x | -x covers
  // every lane at or after the first true lane.
  const auto neg_vmask_le64 = Neg(vmask_le64);
  // Undo the lane reversal and return to the original vector shape.
  const auto neg_vmask = ResizeBitCast(
      d, detail::Per64BitBlkRevLanesOnBe(BitCast(d_full64, neg_vmask_le64)));
  return MaskFromVec(Or(vmask, neg_vmask));
}
// Sets all lanes at or after the first true lane; full 128-bit vectors use
// a 128-bit negate (0 - x) instead of a 64-bit Neg.
template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
HWY_API Mask128<T> SetAtOrAfterFirst(Mask128<T> mask) {
  const Full128<T> d;
  auto vmask = VecFromMask(d, mask);
  // View the mask as one little-endian 128-bit integer (lane-reversed on
  // big-endian targets).
  const auto vmask_le128 = detail::Per128BitBlkRevLanesOnBe(vmask);
  // 0 - x sets all bits at and above the lowest set bit of x.
  const auto neg_vmask_le128 = detail::I128Subtract(Zero(d), vmask_le128);
  const auto neg_vmask = detail::Per128BitBlkRevLanesOnBe(neg_vmask_le128);
  // x | -x == all lanes at or after the first true lane.
  return MaskFromVec(BitCast(d, Or(vmask, neg_vmask)));
}
// Sets all lanes strictly before the first true lane: the complement of
// SetAtOrAfterFirst.
template <class T, size_t N>
HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) {
  return Not(SetAtOrAfterFirst(mask));
}
// Sets only the first true lane, clearing all others. One lane: identity.
template <class T>
HWY_API Mask128<T, 1> SetOnlyFirst(Mask128<T, 1> mask) {
  return mask;
}
// Sets only the first true lane. Two lanes: lane 0 stays as-is; lane 1 is
// kept only if lane 0 was false.
template <class T>
HWY_API Mask128<T, 2> SetOnlyFirst(Mask128<T, 2> mask) {
  const FixedTag<T, 2> d;
  const RebindToSigned<decltype(d)> di;
  const auto vmask = BitCast(di, VecFromMask(d, mask));
  const auto zero = Zero(di);
  // InterleaveLower(zero, vmask) = {0, vmask[0]}; comparing with zero yields
  // {all-ones, !vmask[0]}, i.e. "no earlier lane was set".
  const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero);
  return MaskFromVec(BitCast(d, And(vmask, vmask2)));
}
// Sets only the first true lane; partial vectors (> 2 lanes, <= 8 bytes) fit
// in a single 64-bit block.
template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) {
  const Simd<T, N, 0> d;
  const Full64<T> d_full64;
  const RebindToSigned<decltype(d)> di;
  const auto vmask = VecFromMask(d, mask);
  // View the mask vector as one little-endian int64 (lane-reversed on
  // big-endian targets).
  const auto vmask_le64 =
      BitCast(Full64<int64_t>(),
              detail::Per64BitBlkRevLanesOnBe(ResizeBitCast(d_full64, vmask)));
  const auto neg_vmask_le64 = Neg(vmask_le64);
  const auto neg_vmask = ResizeBitCast(
      d, detail::Per64BitBlkRevLanesOnBe(BitCast(d_full64, neg_vmask_le64)));
  // x & -x isolates the lowest set bit, i.e. bits only within the first true
  // lane; per-lane Neg then smears that bit across its whole lane.
  const auto first_vmask = BitCast(di, And(vmask, neg_vmask));
  return MaskFromVec(BitCast(d, Or(first_vmask, Neg(first_vmask))));
}
// Sets only the first true lane; full 128-bit vectors use a 128-bit negate.
template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
HWY_API Mask128<T> SetOnlyFirst(Mask128<T> mask) {
  const Full128<T> d;
  const RebindToSigned<decltype(d)> di;
  const auto vmask = VecFromMask(d, mask);
  // View as a little-endian 128-bit integer (lane-reversed on big-endian).
  const auto vmask_le128 = detail::Per128BitBlkRevLanesOnBe(vmask);
  const auto neg_vmask_le128 = detail::I128Subtract(Zero(d), vmask_le128);
  const auto neg_vmask = detail::Per128BitBlkRevLanesOnBe(neg_vmask_le128);
  // x & -x isolates the lowest set bit; the per-lane Neg smears it across
  // the lane containing it.
  return MaskFromVec(BitCast(d, Neg(BitCast(di, And(vmask, neg_vmask)))));
}
// Sets all lanes at or before the first true lane. One lane: always true
// (the single lane is trivially "at" the first true lane or before any).
template <class T>
HWY_API Mask128<T, 1> SetAtOrBeforeFirst(Mask128<T, 1> /*mask*/) {
  const FixedTag<T, 1> d;
  const RebindToSigned<decltype(d)> di;
  using TI = MakeSigned<T>;
  return RebindMask(d, MaskFromVec(Set(di, TI(-1))));
}
// Sets all lanes at or before the first true lane.
template <class T, size_t N, HWY_IF_LANES_GT(N, 1)>
HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
  const Simd<T, N, 0> d;
  // Shifting the mask up one lane turns "strictly before the first true
  // lane" into "at or before" relative to the original mask.
  return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask))));
}
  5500. // ------------------------------ SumsOf2 and SumsOf4
  5501. namespace detail {
  5502. #if !HWY_S390X_HAVE_Z14
// Casts nominally int32_t result to D.
// Emulatable wrapper around vec_vsum4sbs: each i32 result lane is the
// saturating sum of four signed bytes of `a` plus the corresponding i32 of
// `b`. The constant-input path computes the result at compile time, which
// the opaque intrinsic would prevent.
template <class D>
HWY_INLINE VFromD<D> AltivecVsum4sbs(D d, __vector signed char a,
                                     __vector signed int b) {
  const Repartition<int32_t, D> di32;
#ifdef __OPTIMIZE__
  if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
    // Widen to 64 bits so the raw sums cannot overflow before saturation.
    const int64_t sum0 =
        static_cast<int64_t>(a[0]) + static_cast<int64_t>(a[1]) +
        static_cast<int64_t>(a[2]) + static_cast<int64_t>(a[3]) +
        static_cast<int64_t>(b[0]);
    const int64_t sum1 =
        static_cast<int64_t>(a[4]) + static_cast<int64_t>(a[5]) +
        static_cast<int64_t>(a[6]) + static_cast<int64_t>(a[7]) +
        static_cast<int64_t>(b[1]);
    const int64_t sum2 =
        static_cast<int64_t>(a[8]) + static_cast<int64_t>(a[9]) +
        static_cast<int64_t>(a[10]) + static_cast<int64_t>(a[11]) +
        static_cast<int64_t>(b[2]);
    const int64_t sum3 =
        static_cast<int64_t>(a[12]) + static_cast<int64_t>(a[13]) +
        static_cast<int64_t>(a[14]) + static_cast<int64_t>(a[15]) +
        static_cast<int64_t>(b[3]);
    // sign == (sum >> 31) iff sum fits in int32; otherwise saturate.
    const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
    const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
    const int32_t sign2 = static_cast<int32_t>(sum2 >> 63);
    const int32_t sign3 = static_cast<int32_t>(sum3 >> 63);
    using Raw = typename detail::Raw128<int32_t>::type;
    return BitCast(
        d,
        VFromD<decltype(di32)>{Raw{
            // sign ^ 0x7FFFFFFF yields INT32_MAX for positive overflow and
            // INT32_MIN for negative overflow.
            (sign0 == (sum0 >> 31)) ? static_cast<int32_t>(sum0)
                                    : static_cast<int32_t>(sign0 ^ 0x7FFFFFFF),
            (sign1 == (sum1 >> 31)) ? static_cast<int32_t>(sum1)
                                    : static_cast<int32_t>(sign1 ^ 0x7FFFFFFF),
            (sign2 == (sum2 >> 31)) ? static_cast<int32_t>(sum2)
                                    : static_cast<int32_t>(sign2 ^ 0x7FFFFFFF),
            (sign3 == (sum3 >> 31))
                ? static_cast<int32_t>(sum3)
                : static_cast<int32_t>(sign3 ^ 0x7FFFFFFF)}});
  } else  // NOLINT
#endif
  {
    // Non-constant inputs: use the hardware instruction.
    return BitCast(d, VFromD<decltype(di32)>{vec_vsum4sbs(a, b)});
  }
}
// Casts nominally uint32_t result to D.
// Emulatable wrapper around vec_vsum4ubs: each u32 result lane is the
// saturating sum of four unsigned bytes of `a` plus the corresponding u32 of
// `b`. The constant-input path computes the result at compile time, which
// the opaque intrinsic would prevent.
template <class D>
HWY_INLINE VFromD<D> AltivecVsum4ubs(D d, __vector unsigned char a,
                                     __vector unsigned int b) {
  const Repartition<uint32_t, D> du32;
#ifdef __OPTIMIZE__
  if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
    // Widen to 64 bits so the raw sums cannot wrap before saturation.
    const uint64_t sum0 =
        static_cast<uint64_t>(a[0]) + static_cast<uint64_t>(a[1]) +
        static_cast<uint64_t>(a[2]) + static_cast<uint64_t>(a[3]) +
        static_cast<uint64_t>(b[0]);
    const uint64_t sum1 =
        static_cast<uint64_t>(a[4]) + static_cast<uint64_t>(a[5]) +
        static_cast<uint64_t>(a[6]) + static_cast<uint64_t>(a[7]) +
        static_cast<uint64_t>(b[1]);
    const uint64_t sum2 =
        static_cast<uint64_t>(a[8]) + static_cast<uint64_t>(a[9]) +
        static_cast<uint64_t>(a[10]) + static_cast<uint64_t>(a[11]) +
        static_cast<uint64_t>(b[2]);
    const uint64_t sum3 =
        static_cast<uint64_t>(a[12]) + static_cast<uint64_t>(a[13]) +
        static_cast<uint64_t>(a[14]) + static_cast<uint64_t>(a[15]) +
        static_cast<uint64_t>(b[3]);
    // Saturate each sum to UINT32_MAX.
    return BitCast(
        d,
        VFromD<decltype(du32)>{(__vector unsigned int){
            static_cast<unsigned int>(sum0 <= 0xFFFFFFFFu ? sum0 : 0xFFFFFFFFu),
            static_cast<unsigned int>(sum1 <= 0xFFFFFFFFu ? sum1 : 0xFFFFFFFFu),
            static_cast<unsigned int>(sum2 <= 0xFFFFFFFFu ? sum2 : 0xFFFFFFFFu),
            static_cast<unsigned int>(sum3 <= 0xFFFFFFFFu ? sum3
                                                          : 0xFFFFFFFFu)}});
  } else  // NOLINT
#endif
  {
    // Non-constant inputs: use the hardware instruction.
    return BitCast(d, VFromD<decltype(du32)>{vec_vsum4ubs(a, b)});
  }
}
// Casts nominally int32_t result to D.
//
// Wrapper for the AltiVec vsum2sws operation: within each 64-bit half, two
// consecutive i32 lanes of `a` plus one i32 lane of `b` are summed with
// signed saturation to [INT32_MIN, INT32_MAX].
template <class D>
HWY_INLINE VFromD<D> AltivecVsum2sws(D d, __vector signed int a,
                                     __vector signed int b) {
  const Repartition<int32_t, D> di32;
#ifdef __OPTIMIZE__
  const Repartition<uint64_t, D> du64;
  // Lane of b that contributes to each result; differs by endianness because
  // the inline-asm path below keeps the instruction's native lane placement.
  constexpr int kDestLaneOffset = HWY_IS_BIG_ENDIAN;
  // Constant-fold when the contributing lanes are compile-time constants.
  if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset]) &&
      __builtin_constant_p(b[kDestLaneOffset + 2])) {
    // Accumulate in i64 so the intermediate sum cannot overflow.
    const int64_t sum0 = static_cast<int64_t>(a[0]) +
                         static_cast<int64_t>(a[1]) +
                         static_cast<int64_t>(b[kDestLaneOffset]);
    const int64_t sum1 = static_cast<int64_t>(a[2]) +
                         static_cast<int64_t>(a[3]) +
                         static_cast<int64_t>(b[kDestLaneOffset + 2]);
    // sign is 0 for non-negative sums, -1 for negative sums.
    const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
    const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
    // (sign == (sum >> 31)) tests whether sum fits in i32; otherwise
    // sign ^ 0x7FFFFFFF yields INT32_MAX (positive overflow) or INT32_MIN
    // (negative overflow).
    return BitCast(d, VFromD<decltype(du64)>{(__vector unsigned long long){
                          (sign0 == (sum0 >> 31))
                              ? static_cast<uint32_t>(sum0)
                              : static_cast<uint32_t>(sign0 ^ 0x7FFFFFFF),
                          (sign1 == (sum1 >> 31))
                              ? static_cast<uint32_t>(sum1)
                              : static_cast<uint32_t>(sign1 ^ 0x7FFFFFFF)}});
  } else  // NOLINT
#endif
  {
    __vector signed int sum;

    // Inline assembly is used for vsum2sws to avoid unnecessary shuffling
    // on little-endian PowerPC targets as the result of the vsum2sws
    // instruction will already be in the correct lanes on little-endian
    // PowerPC targets.
    __asm__("vsum2sws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b));

    return BitCast(d, VFromD<decltype(di32)>{sum});
  }
}
// Casts nominally int32_t result to D.
//
// Wrapper for the AltiVec vsum4shs operation: each i32 result lane is the sum
// of two consecutive i16 lanes of `a` plus the corresponding i32 lane of `b`,
// with signed saturation.
template <class D>
HWY_INLINE VFromD<D> AltivecVsum4shs(D d, __vector signed short a,
                                     __vector signed int b) {
  const Repartition<int32_t, D> di32;
#ifdef __OPTIMIZE__
  // Constant-fold when both operands are compile-time constants (see
  // AltivecVsum4ubs for rationale).
  if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
    // i64 accumulation avoids intermediate overflow before saturation.
    const int64_t sum0 = static_cast<int64_t>(a[0]) +
                         static_cast<int64_t>(a[1]) +
                         static_cast<int64_t>(b[0]);
    const int64_t sum1 = static_cast<int64_t>(a[2]) +
                         static_cast<int64_t>(a[3]) +
                         static_cast<int64_t>(b[1]);
    const int64_t sum2 = static_cast<int64_t>(a[4]) +
                         static_cast<int64_t>(a[5]) +
                         static_cast<int64_t>(b[2]);
    const int64_t sum3 = static_cast<int64_t>(a[6]) +
                         static_cast<int64_t>(a[7]) +
                         static_cast<int64_t>(b[3]);
    // sign is 0 for non-negative sums, -1 for negative sums.
    const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
    const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
    const int32_t sign2 = static_cast<int32_t>(sum2 >> 63);
    const int32_t sign3 = static_cast<int32_t>(sum3 >> 63);
    using Raw = typename detail::Raw128<int32_t>::type;

    // (sign == (sum >> 31)) tests whether sum fits in i32; otherwise
    // sign ^ 0x7FFFFFFF yields INT32_MAX or INT32_MIN as appropriate.
    return BitCast(
        d,
        VFromD<decltype(di32)>{Raw{
            (sign0 == (sum0 >> 31)) ? static_cast<int32_t>(sum0)
                                    : static_cast<int32_t>(sign0 ^ 0x7FFFFFFF),
            (sign1 == (sum1 >> 31)) ? static_cast<int32_t>(sum1)
                                    : static_cast<int32_t>(sign1 ^ 0x7FFFFFFF),
            (sign2 == (sum2 >> 31)) ? static_cast<int32_t>(sum2)
                                    : static_cast<int32_t>(sign2 ^ 0x7FFFFFFF),
            (sign3 == (sum3 >> 31))
                ? static_cast<int32_t>(sum3)
                : static_cast<int32_t>(sign3 ^ 0x7FFFFFFF)}});
  } else  // NOLINT
#endif
  {
    return BitCast(d, VFromD<decltype(di32)>{vec_vsum4shs(a, b)});
  }
}
// Casts nominally int32_t result to D.
//
// Wrapper for the AltiVec vsumsws operation: sums all four i32 lanes of `a`
// plus one i32 lane of `b`, with signed saturation. The result is placed in
// lane 0 (little-endian) or lane 3 (big-endian); remaining lanes are zero.
template <class D>
HWY_INLINE VFromD<D> AltivecVsumsws(D d, __vector signed int a,
                                    __vector signed int b) {
  const Repartition<int32_t, D> di32;
#ifdef __OPTIMIZE__
  // Lane of b that contributes to the sum (and receives the result); the
  // inline-asm path below keeps the instruction's native lane placement.
  constexpr int kDestLaneOffset = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
  // Constant-fold when the contributing lanes are compile-time constants.
  if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset])) {
    // i64 accumulation avoids intermediate overflow before saturation.
    const int64_t sum =
        static_cast<int64_t>(a[0]) + static_cast<int64_t>(a[1]) +
        static_cast<int64_t>(a[2]) + static_cast<int64_t>(a[3]) +
        static_cast<int64_t>(b[kDestLaneOffset]);
    // sign is 0 for a non-negative sum, -1 for a negative sum.
    const int32_t sign = static_cast<int32_t>(sum >> 63);
#if HWY_IS_LITTLE_ENDIAN
    // (sign == (sum >> 31)) tests whether sum fits in i32; otherwise
    // sign ^ 0x7FFFFFFF saturates to INT32_MAX or INT32_MIN.
    return BitCast(
        d, VFromD<decltype(di32)>{(__vector signed int){
               (sign == (sum >> 31)) ? static_cast<int32_t>(sum)
                                     : static_cast<int32_t>(sign ^ 0x7FFFFFFF),
               0, 0, 0}});
#else
    return BitCast(d, VFromD<decltype(di32)>{(__vector signed int){
                          0, 0, 0,
                          (sign == (sum >> 31))
                              ? static_cast<int32_t>(sum)
                              : static_cast<int32_t>(sign ^ 0x7FFFFFFF)}});
#endif
  } else  // NOLINT
#endif
  {
    __vector signed int sum;

    // Inline assembly is used for vsumsws to avoid unnecessary shuffling
    // on little-endian PowerPC targets as the result of the vsumsws
    // instruction will already be in the correct lanes on little-endian
    // PowerPC targets.
    __asm__("vsumsws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b));

    return BitCast(d, VFromD<decltype(di32)>{sum});
  }
}
// Sums adjacent pairs of u16 lanes of v into i32 lanes.
//
// Xor with -32768 flips the sign bit, so each u16 lane reinterpreted as i16
// equals (original - 32768). vsum4shs then sums each pair; passing b = 65536
// (= 2 * 32768) adds back the two biases, restoring the exact unsigned pair
// sum. The result (at most 65535 + 65535 + 65536... actually biased inputs sum
// to at most 65534 + 65536) fits in i32, so no saturation can occur.
template <size_t N>
HWY_INLINE Vec128<int32_t, N / 2> AltivecU16SumsOf2(Vec128<uint16_t, N> v) {
  const RebindToSigned<DFromV<decltype(v)>> di16;
  const RepartitionToWide<decltype(di16)> di32;
  return AltivecVsum4shs(di32, Xor(BitCast(di16, v), Set(di16, -32768)).raw,
                         Set(di32, 65536).raw);
}
#endif  // !HWY_S390X_HAVE_Z14
// U16->U32 SumsOf2
// Sums each pair of adjacent u16 lanes into a u32 lane.
template <class V>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
  const DFromV<V> d;
  const RepartitionToWide<decltype(d)> dw;
#if HWY_S390X_HAVE_Z14
  // Z14 has a direct widening pair-sum (vec_sum4 with a zero addend).
  return VFromD<decltype(dw)>{vec_sum4(v.raw, Zero(d).raw)};
#else
  return BitCast(dw, AltivecU16SumsOf2(v));
#endif
}

// I16->I32 SumsOf2
// Sums each pair of adjacent i16 lanes into an i32 lane.
template <class V>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::SignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
  const DFromV<V> d;
  const RepartitionToWide<decltype(d)> dw;
#if HWY_S390X_HAVE_Z14
  // Bias each i16 lane to unsigned by flipping the sign bit (adds 32768),
  // do the unsigned pair sum, then subtract 2 * 32768 = 65536 per result.
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(dw, SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<2>(),
                             BitCast(du, Xor(v, SignBit(d))))) +
         Set(dw, int32_t{-65536});
#else
  // vsum4shs with a zero addend; pair sums of i16 cannot saturate i32.
  return AltivecVsum4shs(dw, v.raw, Zero(dw).raw);
#endif
}
#if HWY_S390X_HAVE_Z14
// U32->U64 SumsOf2
// Sums each pair of adjacent u32 lanes into a u64 lane (Z14 vec_sum2).
template <class V>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
  const DFromV<V> d;
  const RepartitionToWide<decltype(d)> dw;
  return VFromD<decltype(dw)>{vec_sum2(v.raw, Zero(d).raw)};
}

// I32->I64 SumsOf2
// Sums each pair of adjacent i32 lanes into an i64 lane.
template <class V>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::SignedTag /*type_tag*/, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
  const DFromV<V> d;
  const RepartitionToWide<decltype(d)> dw;
  // Bias each i32 lane to unsigned by flipping the sign bit (adds 2^31),
  // sum pairs unsigned, then subtract 2 * 2^31 = 2^32 per result.
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(dw, SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<4>(),
                             BitCast(du, Xor(v, SignBit(d))))) +
         Set(dw, int64_t{-4294967296LL});
}
#endif
// U8->U32 SumsOf4
// Sums each group of four adjacent u8 lanes into a u32 lane.
template <class V>
HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
    hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
  const DFromV<V> d;
  const RepartitionToWideX2<decltype(d)> dw2;
#if HWY_S390X_HAVE_Z14
  return VFromD<decltype(dw2)>{vec_sum4(v.raw, Zero(d).raw)};
#else
  // vsum4ubs with a zero addend; four u8 values cannot saturate u32.
  return AltivecVsum4ubs(dw2, v.raw, Zero(dw2).raw);
#endif
}

// I8->I32 SumsOf4
// Sums each group of four adjacent i8 lanes into an i32 lane.
template <class V>
HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
    hwy::SignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
  const DFromV<V> d;
  const RepartitionToWideX2<decltype(d)> dw2;
#if HWY_S390X_HAVE_Z14
  // Bias each i8 lane to unsigned by flipping the sign bit (adds 128), sum
  // unsigned, then subtract 4 * 128 = 512 per result.
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(dw2, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(),
                              BitCast(du, Xor(v, SignBit(d))))) +
         Set(dw2, int32_t{-512});
#else
  // vsum4sbs with a zero addend; four i8 values cannot saturate i32.
  return AltivecVsum4sbs(dw2, v.raw, Zero(dw2).raw);
#endif
}
// U16->U64 SumsOf4
// Sums each group of four adjacent u16 lanes into a u64 lane.
template <class V>
HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
    hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
  const DFromV<V> d;
  const RepartitionToWide<decltype(d)> dw;
  const RepartitionToWide<decltype(dw)> dw2;
#if HWY_S390X_HAVE_Z14
  return VFromD<decltype(dw2)>{vec_sum2(v.raw, Zero(d).raw)};
#else
  // First reduce pairs to i32 (exact for u16 inputs), then sum pairs of i32
  // with vsum2sws; the four u16 values sum to at most 4 * 65535, so the
  // saturating vsum2sws cannot saturate here.
  const RebindToSigned<decltype(dw)> dw_i;
  return AltivecVsum2sws(dw2, BitCast(dw_i, SumsOf2(v)).raw, Zero(dw_i).raw);
#endif
}

// I16->I64 SumsOf4
// Sums each group of four adjacent i16 lanes into an i64 lane.
template <class V>
HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
    hwy::SignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
  const DFromV<V> d;
  const RepartitionToWide<decltype(d)> dw;
  const RepartitionToWide<decltype(dw)> dw2;
#if HWY_S390X_HAVE_Z14
  // Bias each i16 lane to unsigned by flipping the sign bit (adds 32768),
  // sum unsigned, then subtract 4 * 32768 = 131072 per result.
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(dw2, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<2>(),
                              BitCast(du, Xor(v, SignBit(d))))) +
         Set(dw2, int64_t{-131072});
#else  // VSX
  // vsum2sws leaves each 32-bit sum in one half of a 64-bit group; which half
  // depends on endianness (see the inline-asm note in AltivecVsum2sws).
  const auto sums_of_4_in_lo32 =
      AltivecVsum2sws(dw, SumsOf2(v).raw, Zero(dw).raw);

#if HWY_IS_LITTLE_ENDIAN
  // Sign-extend the even (low) i32 lanes to i64.
  return PromoteEvenTo(dw2, sums_of_4_in_lo32);
#else
  // Sign-extend the odd i32 lanes to i64.
  return PromoteOddTo(dw2, sums_of_4_in_lo32);
#endif  // HWY_IS_LITTLE_ENDIAN
#endif  // HWY_S390X_HAVE_Z14
}
  5822. } // namespace detail
// ------------------------------ SumOfLanes

// We define SumOfLanes for 8/16-bit types (and I32/U32/I64/U64 on Z14/Z15/Z16);
// enable generic for the rest.
#undef HWY_IF_SUM_OF_LANES_D
#if HWY_S390X_HAVE_Z14
#define HWY_IF_SUM_OF_LANES_D(D) HWY_IF_LANES_GT_D(D, 1), HWY_IF_FLOAT3264_D(D)
#else
#define HWY_IF_SUM_OF_LANES_D(D) \
  HWY_IF_LANES_GT_D(D, 1), HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))
#endif

#if HWY_S390X_HAVE_Z14
namespace detail {

// Sums all 32-bit or 64-bit lanes of v into a single u128 (Z14 vec_sum_u128),
// bit-cast back to the input type.
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
HWY_INLINE Vec128<T> SumOfU32OrU64LanesAsU128(Vec128<T> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, Vec128<uint8_t>{vec_sum_u128(BitCast(du, v).raw, Zero(du).raw)});
}

}  // namespace detail

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)>
HWY_API VFromD<D> SumOfLanes(D /*d64*/, VFromD<D> v) {
  // The u128 sum occupies both u64 lanes' worth of bits; lane 1 holds the
  // low 64 bits of the sum, which is broadcast to all lanes.
  return Broadcast<1>(detail::SumOfU32OrU64LanesAsU128(v));
}
#endif
// 2-lane u16: pair sum via SumsOf2, then broadcast the u16 lane that holds
// the low half of the 32-bit sum (lane 0 on LE, lane 1 on BE).
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)>
HWY_API Vec32<uint16_t> SumOfLanes(D du16, Vec32<uint16_t> v) {
  constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN;
  return Broadcast<kSumLaneIdx>(
      BitCast(du16, detail::SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<2>(), v)));
}

// 4-lane u16: sum of four lanes lands in one 64-bit group; broadcast the u16
// lane holding the low 16 bits of that sum.
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
HWY_API Vec64<uint16_t> SumOfLanes(D du16, Vec64<uint16_t> v) {
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
  return Broadcast<kSumLaneIdx>(
      BitCast(du16, detail::SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<2>(), v)));
}

// 8-lane u16: full-vector sum broadcast to all lanes.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
HWY_API Vec128<uint16_t> SumOfLanes(D du16, Vec128<uint16_t> v) {
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
#if HWY_S390X_HAVE_Z14
  return Broadcast<kSumLaneIdx>(
      BitCast(du16, detail::SumOfU32OrU64LanesAsU128(detail::SumsOf4(
                        hwy::UnsignedTag(), hwy::SizeTag<2>(), v))));
#else  // VSX
  // Pair-sum u16 to i32 (exact), then vsumsws reduces the four i32 lanes.
  // The total is at most 8 * 65535, so the saturating vsumsws is exact here.
  const auto zero = Zero(Full128<int32_t>());
  return Broadcast<kSumLaneIdx>(
      detail::AltivecVsumsws(du16, detail::AltivecU16SumsOf2(v).raw, zero.raw));
#endif
}
// 2-lane i16: pair sum via SumsOf2, then broadcast the i16 lane holding the
// low half of the 32-bit sum. On Z14, delegates to the unsigned overload
// (same bits, sum of 2 lanes fits either way).
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I16_D(D)>
HWY_API Vec32<int16_t> SumOfLanes(D di16, Vec32<int16_t> v) {
#if HWY_S390X_HAVE_Z14
  const RebindToUnsigned<decltype(di16)> du16;
  return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
#else
  constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN;
  return Broadcast<kSumLaneIdx>(
      BitCast(di16, detail::SumsOf2(hwy::SignedTag(), hwy::SizeTag<2>(), v)));
#endif
}

// 4-lane i16: sum of four lanes; broadcast the i16 lane holding the low
// 16 bits of the 64-bit group sum.
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
HWY_API Vec64<int16_t> SumOfLanes(D di16, Vec64<int16_t> v) {
#if HWY_S390X_HAVE_Z14
  const RebindToUnsigned<decltype(di16)> du16;
  return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
#else
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
  return Broadcast<kSumLaneIdx>(
      BitCast(di16, detail::SumsOf4(hwy::SignedTag(), hwy::SizeTag<2>(), v)));
#endif
}

// 8-lane i16: full-vector sum broadcast to all lanes.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
HWY_API Vec128<int16_t> SumOfLanes(D di16, Vec128<int16_t> v) {
#if HWY_S390X_HAVE_Z14
  const RebindToUnsigned<decltype(di16)> du16;
  return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
#else
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
  // vsum4shs pair-sums to i32 (exact for i16 pairs), then vsumsws reduces
  // the four i32 lanes; the total of 8 i16 values fits in i32.
  const Full128<int32_t> di32;
  const auto zero = Zero(di32);
  return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
      di16, detail::AltivecVsum4shs(di32, v.raw, zero.raw).raw, zero.raw));
#endif
}
// 4-lane u8: quad sum via SumsOf4; broadcast the u8 lane holding the low
// 8 bits of the 32-bit sum.
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U8_D(D)>
HWY_API Vec32<uint8_t> SumOfLanes(D du8, Vec32<uint8_t> v) {
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
  return Broadcast<kSumLaneIdx>(
      BitCast(du8, detail::SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(), v)));
}

// 2-lane u8: widen to a 4-lane vector with zero upper lanes and reuse the
// 4-lane reduction.
template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_U8_D(D)>
HWY_API Vec16<uint8_t> SumOfLanes(D du8, Vec16<uint8_t> v) {
  const Twice<decltype(du8)> dt_u8;
  return LowerHalf(du8, SumOfLanes(dt_u8, Combine(dt_u8, Zero(du8), v)));
}

// 8-lane u8: SumsOf8 yields the 64-bit sum; broadcast its low byte lane.
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
HWY_API Vec64<uint8_t> SumOfLanes(D du8, Vec64<uint8_t> v) {
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
  return Broadcast<kSumLaneIdx>(BitCast(du8, SumsOf8(v)));
}

// 16-lane u8: full-vector sum broadcast to all lanes.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
HWY_API Vec128<uint8_t> SumOfLanes(D du8, Vec128<uint8_t> v) {
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15;
#if HWY_S390X_HAVE_Z14
  return Broadcast<kSumLaneIdx>(
      BitCast(du8, detail::SumOfU32OrU64LanesAsU128(detail::SumsOf4(
                       hwy::UnsignedTag(), hwy::SizeTag<1>(), v))));
#else
  // vsum4ubs quad-sums to u32 (exact), then vsumsws reduces the four lanes;
  // the total of 16 u8 values fits in i32, so saturation cannot occur.
  const Full128<uint32_t> du32;
  const RebindToSigned<decltype(du32)> di32;
  const Vec128<uint32_t> zero = Zero(du32);
  return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
      du8, detail::AltivecVsum4ubs(di32, v.raw, zero.raw).raw,
      BitCast(di32, zero).raw));
#endif
}
// 4-lane i8: quad sum via SumsOf4; broadcast the i8 lane holding the low
// 8 bits of the 32-bit sum. On Z14, delegates to the unsigned overload
// (identical low bits for a 4-lane sum).
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I8_D(D)>
HWY_API Vec32<int8_t> SumOfLanes(D di8, Vec32<int8_t> v) {
#if HWY_S390X_HAVE_Z14
  const RebindToUnsigned<decltype(di8)> du8;
  return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
#else
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
  return Broadcast<kSumLaneIdx>(
      BitCast(di8, detail::SumsOf4(hwy::SignedTag(), hwy::SizeTag<1>(), v)));
#endif
}

// 2-lane i8: widen to a 4-lane vector with zero upper lanes and reuse the
// 4-lane reduction.
template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_I8_D(D)>
HWY_API Vec16<int8_t> SumOfLanes(D di8, Vec16<int8_t> v) {
  const Twice<decltype(di8)> dt_i8;
  return LowerHalf(di8, SumOfLanes(dt_i8, Combine(dt_i8, Zero(di8), v)));
}

// 8-lane i8: SumsOf8 yields the 64-bit sum; broadcast its low byte lane.
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
HWY_API Vec64<int8_t> SumOfLanes(D di8, Vec64<int8_t> v) {
#if HWY_S390X_HAVE_Z14
  const RebindToUnsigned<decltype(di8)> du8;
  return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
#else
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
  return Broadcast<kSumLaneIdx>(BitCast(di8, SumsOf8(v)));
#endif
}

// 16-lane i8: full-vector sum broadcast to all lanes.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
HWY_API Vec128<int8_t> SumOfLanes(D di8, Vec128<int8_t> v) {
#if HWY_S390X_HAVE_Z14
  const RebindToUnsigned<decltype(di8)> du8;
  return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
#else
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15;
  // vsum4sbs quad-sums to i32 (exact), then vsumsws reduces the four lanes;
  // the total of 16 i8 values fits in i32, so saturation cannot occur.
  const Full128<int32_t> di32;
  const Vec128<int32_t> zero = Zero(di32);
  return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
      di8, detail::AltivecVsum4sbs(di32, v.raw, zero.raw).raw, zero.raw));
#endif
}
#if HWY_S390X_HAVE_Z14
// 2-lane u32/i32: pair sum into a u64 lane, then broadcast the 32-bit lane
// holding the low half of the sum (lane 1 of the u64 viewed as 2x u32 here).
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_UI32_D(D)>
HWY_API VFromD<D> SumOfLanes(D d32, VFromD<D> v) {
  const RebindToUnsigned<decltype(d32)> du32;
  return Broadcast<1>(
      BitCast(d32, detail::SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<4>(),
                                   BitCast(du32, v))));
}

// 4-lane u32/i32: u128 full sum; broadcast the u32 lane holding its low bits.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
HWY_API VFromD<D> SumOfLanes(D /*d32*/, VFromD<D> v) {
  return Broadcast<3>(detail::SumOfU32OrU64LanesAsU128(v));
}
#endif
// generic_ops defines MinOfLanes and MaxOfLanes.

// ------------------------------ ReduceSum for N=4 I8/U8

// GetLane(SumsOf4(v)) is more efficient on PPC/Z14 than the default N=4
// I8/U8 ReduceSum implementation in generic_ops-inl.h
#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
#undef HWY_NATIVE_REDUCE_SUM_4_UI8
#else
#define HWY_NATIVE_REDUCE_SUM_4_UI8
#endif

// Returns the scalar sum of the four 8-bit lanes of v; the cast truncates the
// 32-bit sum back to the lane type (matching wrap-around semantics).
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
HWY_API TFromD<D> ReduceSum(D /*d*/, VFromD<D> v) {
  return static_cast<TFromD<D>>(GetLane(SumsOf4(v)));
}
// ------------------------------ BitShuffle

#ifdef HWY_NATIVE_BITSHUFFLE
#undef HWY_NATIVE_BITSHUFFLE
#else
#define HWY_NATIVE_BITSHUFFLE
#endif

// For each u64 lane of v, gathers 8 bits selected by the corresponding 8 u8
// indices in idx (one result byte per u64 lane), implemented via the PPC
// vector bit-permute instructions (vec_bperm / vbpermq) or Z14 vec_bperm_u128.
template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
          HWY_IF_V_SIZE_V(VI, HWY_MAX_LANES_V(V) * 8)>
HWY_API V BitShuffle(V v, VI idx) {
  const DFromV<decltype(v)> d64;
  const RebindToUnsigned<decltype(d64)> du64;
  const Repartition<uint8_t, decltype(d64)> du8;

  const Full128<TFromD<decltype(du64)>> d_full_u64;
  const Full128<TFromD<decltype(du8)>> d_full_u8;

  using RawVU64 = __vector unsigned long long;

#if HWY_PPC_HAVE_9
#if HWY_IS_LITTLE_ENDIAN
  (void)d_full_u64;
  auto bit_idx = ResizeBitCast(d_full_u8, idx);
#else
  // On BE, reverse bytes within each u64 so index bytes line up with the
  // instruction's lane ordering.
  auto bit_idx =
      BitCast(d_full_u8, ReverseLaneBytes(ResizeBitCast(d_full_u64, idx)));
#endif
  // XOR with 0x3F converts the caller's LSB-based bit numbering into the
  // MSB-based numbering vec_bperm expects (0x3F = 63 = bits per u64 - 1).
  bit_idx = Xor(bit_idx, Set(d_full_u8, uint8_t{0x3F}));
  return BitCast(d64, VFromD<decltype(du64)>{reinterpret_cast<RawVU64>(
                          vec_bperm(BitCast(du64, v).raw, bit_idx.raw))});
#else  // !HWY_PPC_HAVE_9
  // Pre-Power9: only the 128-bit bit-permute (vbpermq/vec_bperm_u128) is
  // available, which numbers bits across the full vector; the per-half XOR
  // masks (0x7F vs 0x3F) remap each u64 lane's indices into that numbering.
#if HWY_IS_LITTLE_ENDIAN
  const auto bit_idx_xor_mask = BitCast(
      d_full_u8, Dup128VecFromValues(d_full_u64, uint64_t{0x7F7F7F7F7F7F7F7Fu},
                                     uint64_t{0x3F3F3F3F3F3F3F3Fu}));
  const auto bit_idx = Xor(ResizeBitCast(d_full_u8, idx), bit_idx_xor_mask);
  constexpr int kBitShufResultByteShrAmt = 8;
#else
  const auto bit_idx_xor_mask = BitCast(
      d_full_u8, Dup128VecFromValues(d_full_u64, uint64_t{0x3F3F3F3F3F3F3F3Fu},
                                     uint64_t{0x7F7F7F7F7F7F7F7Fu}));
  const auto bit_idx =
      Xor(BitCast(d_full_u8, ReverseLaneBytes(ResizeBitCast(d_full_u64, idx))),
          bit_idx_xor_mask);
  constexpr int kBitShufResultByteShrAmt = 6;
#endif

#if HWY_S390X_HAVE_Z14
  const VFromD<decltype(d_full_u64)> bit_shuf_result{reinterpret_cast<RawVU64>(
      vec_bperm_u128(BitCast(du8, v).raw, bit_idx.raw))};
#elif defined(__SIZEOF_INT128__)
  using RawVU128 = __vector unsigned __int128;
  const VFromD<decltype(d_full_u64)> bit_shuf_result{reinterpret_cast<RawVU64>(
      vec_vbpermq(reinterpret_cast<RawVU128>(v.raw), bit_idx.raw))};
#else
  // Older compilers lack __int128 vectors; vbpermq accepts a u8 vector cast.
  using RawVU128 = __vector unsigned char;
  const VFromD<decltype(d_full_u64)> bit_shuf_result{reinterpret_cast<RawVU64>(
      vec_vbpermq(reinterpret_cast<RawVU128>(v.raw), bit_idx.raw))};
#endif

  // The two result bytes are packed together in bit_shuf_result; shift them
  // into byte positions 0/1 and zero-extend each byte to a u64 lane.
  return ResizeBitCast(
      d64, PromoteTo(d_full_u64,
                     ResizeBitCast(
                         Rebind<uint8_t, decltype(d_full_u64)>(),
                         CombineShiftRightBytes<kBitShufResultByteShrAmt>(
                             d_full_u64, bit_shuf_result, bit_shuf_result))));
#endif  // HWY_PPC_HAVE_9
}
  6068. // ------------------------------ Lt128
  6069. namespace detail {
// Returns vector-mask for Lt128.
// Treats each 128-bit block (a pair of u64 lanes) as one unsigned 128-bit
// integer and returns all-ones in both lanes of a block if a < b.
template <class D, class V = VFromD<D>>
HWY_INLINE V Lt128Vec(D d, V a, V b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  // Power10: native u128 compare.
  (void)d;
  using VU64 = __vector unsigned long long;
  using VU128 = __vector unsigned __int128;
#if HWY_IS_LITTLE_ENDIAN
  const VU128 a_u128 = reinterpret_cast<VU128>(a.raw);
  const VU128 b_u128 = reinterpret_cast<VU128>(b.raw);
#else
  // NOTE: Need to swap the halves of both a and b on big-endian targets
  // as the upper 64 bits of a and b are in lane 1 and the lower 64 bits
  // of a and b are in lane 0 whereas the vec_cmplt operation below expects
  // the upper 64 bits in lane 0 and the lower 64 bits in lane 1 on
  // big-endian PPC targets.
  const VU128 a_u128 = reinterpret_cast<VU128>(vec_sld(a.raw, a.raw, 8));
  const VU128 b_u128 = reinterpret_cast<VU128>(vec_sld(b.raw, b.raw, 8));
#endif
  return V{reinterpret_cast<VU64>(vec_cmplt(a_u128, b_u128))};
#else  // !HWY_PPC_HAVE_10
  // Truth table of Eq and Lt for Hi and Lo u64.
  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
  // =H =L cH cL  | out = cH | (=H & cL)
  //  0  0  0  0  |  0
  //  0  0  0  1  |  0
  //  0  0  1  0  |  1
  //  0  0  1  1  |  1
  //  0  1  0  0  |  0
  //  0  1  0  1  |  0
  //  0  1  1  0  |  1
  //  1  0  0  0  |  0
  //  1  0  0  1  |  1
  //  1  1  0  0  |  0
  const auto eqHL = Eq(a, b);
  const V ltHL = VecFromMask(d, Lt(a, b));
  const V ltLX = ShiftLeftLanes<1>(ltHL);
  // Per-lane select of the 128-bit comparison: if the high halves are equal,
  // the low-half comparison (shifted into the high lane) decides.
  const V vecHx = IfThenElse(eqHL, ltLX, ltHL);
  // Replicate the high lane's verdict into both lanes of each block.
  return InterleaveUpper(d, vecHx, vecHx);
#endif
}
// Returns vector-mask for Eq128.
// Both u64 lanes of a 128-bit block are all-ones iff the blocks are equal.
template <class D, class V = VFromD<D>>
HWY_INLINE V Eq128Vec(D d, V a, V b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  // Power10: native u128 compare (no half-swap needed; equality is
  // lane-order independent).
  (void)d;
  using VU64 = __vector unsigned long long;
  using VU128 = __vector unsigned __int128;
  return V{reinterpret_cast<VU64>(vec_cmpeq(reinterpret_cast<VU128>(a.raw),
                                            reinterpret_cast<VU128>(b.raw)))};
#else
  // 128-bit equality = both 64-bit halves equal; AND each lane's result with
  // its partner's.
  const auto eqHL = VecFromMask(d, Eq(a, b));
  const auto eqLH = Reverse2(d, eqHL);
  return And(eqHL, eqLH);
#endif
}
// Returns vector-mask for Ne128.
// Both u64 lanes of a 128-bit block are all-ones iff the blocks differ.
template <class D, class V = VFromD<D>>
HWY_INLINE V Ne128Vec(D d, V a, V b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  // Power10: native u128 compare.
  (void)d;
  using VU64 = __vector unsigned long long;
  using VU128 = __vector unsigned __int128;
  return V{reinterpret_cast<VU64>(vec_cmpne(reinterpret_cast<VU128>(a.raw),
                                            reinterpret_cast<VU128>(b.raw)))};
#else
  // 128-bit inequality = either 64-bit half differs; OR each lane's result
  // with its partner's.
  const auto neHL = VecFromMask(d, Ne(a, b));
  const auto neLH = Reverse2(d, neHL);
  return Or(neHL, neLH);
#endif
}
  6143. template <class D, class V = VFromD<D>>
  6144. HWY_INLINE V Lt128UpperVec(D d, V a, V b) {
  6145. const V ltHL = VecFromMask(d, Lt(a, b));
  6146. return InterleaveUpper(d, ltHL, ltHL);
  6147. }
  6148. template <class D, class V = VFromD<D>>
  6149. HWY_INLINE V Eq128UpperVec(D d, V a, V b) {
  6150. const V eqHL = VecFromMask(d, Eq(a, b));
  6151. return InterleaveUpper(d, eqHL, eqHL);
  6152. }
  6153. template <class D, class V = VFromD<D>>
  6154. HWY_INLINE V Ne128UpperVec(D d, V a, V b) {
  6155. const V neHL = VecFromMask(d, Ne(a, b));
  6156. return InterleaveUpper(d, neHL, neHL);
  6157. }
  6158. } // namespace detail
  6159. template <class D, class V = VFromD<D>>
  6160. HWY_API MFromD<D> Lt128(D d, V a, V b) {
  6161. return MaskFromVec(detail::Lt128Vec(d, a, b));
  6162. }
  6163. template <class D, class V = VFromD<D>>
  6164. HWY_API MFromD<D> Eq128(D d, V a, V b) {
  6165. return MaskFromVec(detail::Eq128Vec(d, a, b));
  6166. }
  6167. template <class D, class V = VFromD<D>>
  6168. HWY_API MFromD<D> Ne128(D d, V a, V b) {
  6169. return MaskFromVec(detail::Ne128Vec(d, a, b));
  6170. }
  6171. template <class D, class V = VFromD<D>>
  6172. HWY_API MFromD<D> Lt128Upper(D d, V a, V b) {
  6173. return MaskFromVec(detail::Lt128UpperVec(d, a, b));
  6174. }
  6175. template <class D, class V = VFromD<D>>
  6176. HWY_API MFromD<D> Eq128Upper(D d, V a, V b) {
  6177. return MaskFromVec(detail::Eq128UpperVec(d, a, b));
  6178. }
  6179. template <class D, class V = VFromD<D>>
  6180. HWY_API MFromD<D> Ne128Upper(D d, V a, V b) {
  6181. return MaskFromVec(detail::Ne128UpperVec(d, a, b));
  6182. }
  6183. // ------------------------------ Min128, Max128 (Lt128)
  6184. // Avoids the extra MaskFromVec in Lt128.
  6185. template <class D, class V = VFromD<D>>
  6186. HWY_API V Min128(D d, const V a, const V b) {
  6187. return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
  6188. }
  6189. template <class D, class V = VFromD<D>>
  6190. HWY_API V Max128(D d, const V a, const V b) {
  6191. return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
  6192. }
  6193. template <class D, class V = VFromD<D>>
  6194. HWY_API V Min128Upper(D d, const V a, const V b) {
  6195. return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b);
  6196. }
  6197. template <class D, class V = VFromD<D>>
  6198. HWY_API V Max128Upper(D d, const V a, const V b) {
  6199. return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b);
  6200. }
// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex

#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
#undef HWY_NATIVE_LEADING_ZERO_COUNT
#else
#define HWY_NATIVE_LEADING_ZERO_COUNT
#endif

// Per-lane count of leading zero bits.
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V LeadingZeroCount(V v) {
#if HWY_S390X_HAVE_Z14
  // Z14 vec_cntlz operates on unsigned lanes; cast through unsigned.
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
#if HWY_COMPILER_GCC_ACTUAL && defined(__OPTIMIZE__)
  // Work around for GCC compiler bug in vec_cnttz on Z14/Z15 if v[i] is a
  // constant
  __asm__("" : "+v"(v.raw));
#endif
  return BitCast(d, VFromD<decltype(du)>{vec_cntlz(BitCast(du, v).raw)});
#else
  return V{vec_cntlz(v.raw)};
#endif
}

// Per-lane index of the highest set bit (bits-per-lane - 1 - leading zeros).
// For a zero lane this yields -1 (all leading zeros).
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V HighestSetBitIndex(V v) {
  const DFromV<decltype(v)> d;
  using T = TFromD<decltype(d)>;
  return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
}
#if HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
// Per-lane count of trailing zero bits, using the native count-trailing-zeros
// intrinsic available on Power9 and Z14+.
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V TrailingZeroCount(V v) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
  // GCC < 7 only provides the vec_vctz spelling.
  return V{vec_vctz(v.raw)};
#else
#if HWY_S390X_HAVE_Z14
  // Z14 vec_cnttz operates on unsigned lanes; cast through unsigned.
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
#if HWY_COMPILER_GCC_ACTUAL && defined(__OPTIMIZE__)
  // Work around for GCC compiler bug in vec_cnttz on Z14/Z15 if v[i] is a
  // constant
  __asm__("" : "+v"(v.raw));
#endif
  return BitCast(d, VFromD<decltype(du)>{vec_cnttz(BitCast(du, v).raw)});
#else
  return V{vec_cnttz(v.raw)};
#endif  // HWY_S390X_HAVE_Z14
#endif  // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
}
#else
// Fallback without a native instruction: isolate the lowest set bit with
// v & -v, then find its index. For v == 0, HighestSetBitIndex yields -1
// (sign bit set), which is mapped to the lane bit width.
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V TrailingZeroCount(V v) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;

  const auto vi = BitCast(di, v);
  // v & -v clears all but the lowest set bit.
  const auto lowest_bit = And(vi, Neg(vi));
  constexpr TI kNumOfBitsInT{sizeof(TI) * 8};
  const auto bit_idx = HighestSetBitIndex(lowest_bit);
  // Negative bit_idx (input lane was 0) -> full bit count.
  return BitCast(d, IfThenElse(MaskFromVec(BroadcastSignBit(bit_idx)),
                               Set(di, kNumOfBitsInT), bit_idx));
}
#endif
  6262. #undef HWY_PPC_HAVE_9
  6263. #undef HWY_PPC_HAVE_10
  6264. #undef HWY_S390X_HAVE_Z14
  6265. #undef HWY_S390X_HAVE_Z15
  6266. // NOLINTNEXTLINE(google-readability-namespace-comments)
  6267. } // namespace HWY_NAMESPACE
  6268. } // namespace hwy
  6269. HWY_AFTER_NAMESPACE();