| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128312931303131313231333134313531363137313831393140314131423143314431453146314731483149315031513152315331543155315631573158315931603161316231633164316531663167316831693170317131723173317431753176317731783179318031813182318331843185318631873188318931903191319231933194319531963197319831993200320132023203320432053206320732083209321032113212321332143215321632173218321932203221322232233224322532263227322832293230323132323233323432353236323732383239324032413242324332443245324632473248324932503251325232533254325532563257325832593260326132623263326432653266326732683269327032713272327332743275327632773278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896389738983899390039013902390339043905390639073908390939103911391239133914391539163917391839193920392139223923392439253926392739283929393039313932393339343935393639373938393939403941394239433944394539463947394839493950395139523953395439553956395739583959396039613962396339643965396639673968396939703971397239733974397539763977397839793980398139823983398439853986398739883989399039913992399339943995399639973998399940004001400240034004400540064007400840094010401140124013401440154016401740184019402040214022402340244025402640274028402940304031403240334034403540364037403840394040404140424043404440454046404740484049405040514052405340544055405640574058405940604061406240634064406540664067406840694070407140724073407440754076407740784079408040814082408340844085408640874088408940904091409240934094409540964097409840994100410141024103410441054106410741084109411041114112411341144115411641174118411941204121412241234124412541264127412841294130413141324133413441354136413741384139414041414142414341444145414641474148414941504151415241534154415541564157415841594160416141624163416441654166416741684169417041714172417341744175417641774178417941804181418241834184418541864187418841894190419141924193419441954196419741984199420042014202420342044205420642074208420942104211421242134214421542164217421842194220422142224223422442254226422742284229423042314232423342344235423642374238423942404241424242434244424542464247424842494250425142524253425442554256425742584259426042614262426342644265426642674268426942704271427242734274427542764277427842794280428142824283428442854286428742884289429042914292429342944295429642974298429943004301430243034304430543064307430843094310431143124313431443154316431743184319432043214322432343244325432643274328432943304331433243334334433543364337433843394340434143424343434443454346434743484349435043514352435343544355435643574358435943604361436243634364436543664367436843694370437143724373437443754376437743784379438043814382438343844385438643874388438943904391439243934394439543964397439843994400440144024403440444054406440744084409441044114412441344144415441644174418441944204421442244234424442544264427442844294430443144324433443444354436443744384439444044414442444344444445444644474448444944504451445244534454445544564457445844594460446144624463446444654466446744684469447044714472447344744475447644774478447944804481448244834484448544864487448844894490449144924493449444954496449744984499450045014502450345044505450645074508450945104511451245134514451545164517451845194520452145224523452445254526452745284529453045314532453345344535453645374538453945404541454245434544454545464547454845494550455145524553455445554556455745584559456045614562456345644565456645674568456945704571457245734574457545764577457845794580458145824583458445854586458745884589459045914592459345944595459645974598459946004601460246034604460546064607460846094610461146124613461446154616461746184619462046214622462346244625462646274628462946304631463246334634463546364637463846394640464146424643464446454646464746484649465046514652465346544655465646574658465946604661466246634664466546664667466846694670467146724673467446754676467746784679468046814682468346844685468646874688468946904691469246934694469546964697469846994700470147024703470447054706470747084709471047114712471347144715471647174718471947204721472247234724472547264727472847294730473147324733473447354736473747384739474047414742474347444745474647474748474947504751475247534754475547564757475847594760476147624763476447654766476747684769477047714772477347744775477647774778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527752785279528052815282528352845285528652875288528952905291529252935294529552965297529852995300530153025303530453055306530753085309531053115312531353145315531653175318531953205321532253235324532553265327532853295330533153325333533453355336533753385339534053415342534353445345534653475348534953505351535253535354535553565357535853595360536153625363536453655366536753685369537053715372537353745375537653775378537953805381538253835384538553865387538853895390539153925393539453955396539753985399540054015402540354045405540654075408540954105411541254135414541554165417541854195420542154225423542454255426542754285429543054315432543354345435543654375438543954405441544254435444544554465447544854495450545154525453545454555456545754585459546054615462546354645465546654675468546954705471547254735474547554765477547854795480548154825483548454855486548754885489549054915492549354945495549654975498549955005501550255035504550555065507550855095510551155125513551455155516551755185519552055215522552355245525552655275528552955305531553255335534553555365537553855395540554155425543554455455546554755485549555055515552555355545555555655575558555955605561556255635564556555665567556855695570557155725573557455755576557755785579558055815582558355845585558655875588558955905591559255935594559555965597559855995600560156025603560456055606560756085609561056115612561356145615561656175618561956205621562256235624562556265627562856295630563156325633563456355636563756385639564056415642564356445645564656475648564956505651565256535654565556565657565856595660566156625663566456655666566756685669567056715672567356745675567656775678567956805681568256835684568556865687568856895690569156925693569456955696569756985699570057015702570357045705570657075708570957105711571257135714571557165717571857195720572157225723572457255726572757285729573057315732573357345735573657375738573957405741574257435744574557465747574857495750575157525753575457555756575757585759576057615762576357645765576657675768576957705771577257735774577557765777577857795780578157825783578457855786578757885789579057915792579357945795579657975798579958005801580258035804580558065807580858095810581158125813581458155816581758185819582058215822582358245825582658275828582958305831583258335834583558365837583858395840584158425843584458455846584758485849585058515852585358545855585658575858585958605861586258635864586558665867586858695870587158725873587458755876587758785879588058815882588358845885588658875888588958905891589258935894589558965897589858995900590159025903590459055906590759085909591059115912591359145915591659175918591959205921592259235924592559265927592859295930593159325933593459355936593759385939594059415942594359445945594659475948594959505951595259535954595559565957595859595960596159625963596459655966596759685969597059715972597359745975597659775978597959805981598259835984598559865987598859895990599159925993599459955996599759985999600060016002600360046005600660076008600960106011601260136014601560166017601860196020602160226023602460256026602760286029603060316032603360346035603660376038603960406041604260436044604560466047604860496050605160526053605460556056605760586059606060616062606360646065606660676068606960706071607260736074607560766077607860796080608160826083608460856086608760886089609060916092609360946095609660976098609961006101610261036104610561066107610861096110611161126113611461156116611761186119612061216122612361246125612661276128612961306131613261336134613561366137613861396140614161426143614461456146614761486149615061516152615361546155615661576158615961606161616261636164616561666167616861696170617161726173617461756176617761786179618061816182618361846185618661876188618961906191619261936194619561966197619861996200620162026203620462056206620762086209621062116212621362146215621662176218621962206221622262236224622562266227622862296230623162326233623462356236623762386239624062416242624362446245624662476248624962506251625262536254625562566257625862596260626162626263626462656266626762686269627062716272627362746275627662776278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677767786779678067816782678367846785678667876788678967906791679267936794679567966797679867996800680168026803680468056806680768086809681068116812681368146815681668176818681968206821682268236824682568266827682868296830683168326833683468356836683768386839684068416842684368446845684668476848684968506851685268536854685568566857685868596860686168626863686468656866686768686869687068716872687368746875687668776878687968806881688268836884688568866887688868896890689168926893689468956896689768986899690069016902690369046905690669076908690969106911691269136914691569166917691869196920692169226923692469256926692769286929693069316932693369346935693669376938693969406941694269436944694569466947694869496950695169526953695469556956695769586959696069616962696369646965696669676968696969706971697269736974697569766977697869796980698169826983698469856986698769886989699069916992699369946995699669976998699970007001700270037004700570067007700870097010701170127013701470157016701770187019702070217022702370247025702670277028702970307031703270337034703570367037703870397040704170427043704470457046704770487049705070517052705370547055705670577058705970607061706270637064706570667067706870697070707170727073707470757076707770787079708070817082708370847085708670877088708970907091709270937094709570967097709870997100710171027103710471057106710771087109711071117112711371147115711671177118711971207121712271237124712571267127712871297130713171327133713471357136713771387139714071417142714371447145714671477148714971507151715271537154715571567157715871597160716171627163716471657166716771687169717071717172717371747175717671777178717971807181718271837184718571867187718871897190719171927193719471957196719771987199720072017202720372047205720672077208720972107211721272137214721572167217721872197220722172227223722472257226722772287229 |
- // Copyright 2023 Google LLC
- // SPDX-License-Identifier: Apache-2.0
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- // 128-bit vectors for VSX/Z14
- // External include guard in highway.h - see comment there.
- #if HWY_TARGET == HWY_Z14 || HWY_TARGET == HWY_Z15
- #define HWY_S390X_HAVE_Z14 1
- #else
- #define HWY_S390X_HAVE_Z14 0
- #endif
- #pragma push_macro("vector")
- #pragma push_macro("pixel")
- #pragma push_macro("bool")
- #undef vector
- #undef pixel
- #undef bool
- #if HWY_S390X_HAVE_Z14
- #include <vecintrin.h>
- #else
- #include <altivec.h>
- #endif
- #pragma pop_macro("vector")
- #pragma pop_macro("pixel")
- #pragma pop_macro("bool")
- #include "hwy/ops/shared-inl.h"
- // clang's altivec.h gates some intrinsics behind #ifdef __POWER10_VECTOR__, and
- // some GCC do the same for _ARCH_PWR10.
- // This means we can only use POWER10-specific intrinsics in static dispatch
- // mode (where the -mpower10-vector compiler flag is passed). Same for PPC9.
- // On other compilers, the usual target check is sufficient.
- #if !HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_PPC9 && \
- (defined(_ARCH_PWR9) || defined(__POWER9_VECTOR__))
- #define HWY_PPC_HAVE_9 1
- #else
- #define HWY_PPC_HAVE_9 0
- #endif
- #if !HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_PPC10 && \
- (defined(_ARCH_PWR10) || defined(__POWER10_VECTOR__))
- #define HWY_PPC_HAVE_10 1
- #else
- #define HWY_PPC_HAVE_10 0
- #endif
- #if HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_Z15 && __ARCH__ >= 13
- #define HWY_S390X_HAVE_Z15 1
- #else
- #define HWY_S390X_HAVE_Z15 0
- #endif
- HWY_BEFORE_NAMESPACE();
- namespace hwy {
- namespace HWY_NAMESPACE {
- namespace detail {
- template <typename T>
- struct Raw128;
- // Each Raw128 specialization defines the following typedefs:
- // - type:
- // the backing Altivec/VSX raw vector type of the Vec128<T, N> type
- // - RawBoolVec:
- // the backing Altivec/VSX raw __bool vector type of the Mask128<T, N> type
- // - RawT:
- // the lane type for intrinsics, in particular vec_splat
- // - AlignedRawVec:
- // the 128-bit GCC/Clang vector type for aligned loads/stores
- // - UnalignedRawVec:
- // the 128-bit GCC/Clang vector type for unaligned loads/stores
- #define HWY_VSX_RAW128(LANE_TYPE, RAW_VECT_LANE_TYPE, RAW_BOOL_VECT_LANE_TYPE) \
- template <> \
- struct Raw128<LANE_TYPE> { \
- using type = __vector RAW_VECT_LANE_TYPE; \
- using RawBoolVec = __vector __bool RAW_BOOL_VECT_LANE_TYPE; \
- using RawT = RAW_VECT_LANE_TYPE; \
- typedef LANE_TYPE AlignedRawVec \
- __attribute__((__vector_size__(16), __aligned__(16), __may_alias__)); \
- typedef LANE_TYPE UnalignedRawVec __attribute__(( \
- __vector_size__(16), __aligned__(alignof(LANE_TYPE)), __may_alias__)); \
- };
- HWY_VSX_RAW128(int8_t, signed char, char)
- HWY_VSX_RAW128(uint8_t, unsigned char, char)
- HWY_VSX_RAW128(int16_t, signed short, short) // NOLINT(runtime/int)
- HWY_VSX_RAW128(uint16_t, unsigned short, short) // NOLINT(runtime/int)
- HWY_VSX_RAW128(int32_t, signed int, int)
- HWY_VSX_RAW128(uint32_t, unsigned int, int)
- HWY_VSX_RAW128(int64_t, signed long long, long long) // NOLINT(runtime/int)
- HWY_VSX_RAW128(uint64_t, unsigned long long, long long) // NOLINT(runtime/int)
- HWY_VSX_RAW128(float, float, int)
- HWY_VSX_RAW128(double, double, long long) // NOLINT(runtime/int)
- template <>
- struct Raw128<bfloat16_t> : public Raw128<uint16_t> {};
- template <>
- struct Raw128<float16_t> : public Raw128<uint16_t> {};
- #undef HWY_VSX_RAW128
- } // namespace detail
- template <typename T, size_t N = 16 / sizeof(T)>
- class Vec128 {
- using Raw = typename detail::Raw128<T>::type;
- public:
- using PrivateT = T; // only for DFromV
- static constexpr size_t kPrivateN = N; // only for DFromV
- // Compound assignment. Only usable if there is a corresponding non-member
- // binary operator overload. For example, only f32 and f64 support division.
- HWY_INLINE Vec128& operator*=(const Vec128 other) {
- return *this = (*this * other);
- }
- HWY_INLINE Vec128& operator/=(const Vec128 other) {
- return *this = (*this / other);
- }
- HWY_INLINE Vec128& operator+=(const Vec128 other) {
- return *this = (*this + other);
- }
- HWY_INLINE Vec128& operator-=(const Vec128 other) {
- return *this = (*this - other);
- }
- HWY_INLINE Vec128& operator%=(const Vec128 other) {
- return *this = (*this % other);
- }
- HWY_INLINE Vec128& operator&=(const Vec128 other) {
- return *this = (*this & other);
- }
- HWY_INLINE Vec128& operator|=(const Vec128 other) {
- return *this = (*this | other);
- }
- HWY_INLINE Vec128& operator^=(const Vec128 other) {
- return *this = (*this ^ other);
- }
- Raw raw;
- };
- template <typename T>
- using Vec64 = Vec128<T, 8 / sizeof(T)>;
- template <typename T>
- using Vec32 = Vec128<T, 4 / sizeof(T)>;
- template <typename T>
- using Vec16 = Vec128<T, 2 / sizeof(T)>;
- // FF..FF or 0.
- template <typename T, size_t N = 16 / sizeof(T)>
- struct Mask128 {
- typename detail::Raw128<T>::RawBoolVec raw;
- using PrivateT = T; // only for DFromM
- static constexpr size_t kPrivateN = N; // only for DFromM
- };
- template <class V>
- using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
- template <class M>
- using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;
- template <class V>
- using TFromV = typename V::PrivateT;
- // ------------------------------ Zero
- // Returns an all-zero vector/part.
- template <class D, typename T = TFromD<D>>
- HWY_API Vec128<T, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
- // There is no vec_splats for 64-bit, so we cannot rely on casting the 0
- // argument in order to select the correct overload. We instead cast the
- // return vector type; see also the comment in BitCast.
- return Vec128<T, HWY_MAX_LANES_D(D)>{
- reinterpret_cast<typename detail::Raw128<T>::type>(vec_splats(0))};
- }
- template <class D>
- using VFromD = decltype(Zero(D()));
- // ------------------------------ BitCast
- template <class D, typename FromT>
- HWY_API VFromD<D> BitCast(D /*d*/,
- Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
- // C-style casts are not sufficient when compiling with
- // -fno-lax-vector-conversions, which will be the future default in Clang,
- // but reinterpret_cast is.
- return VFromD<D>{
- reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(v.raw)};
- }
- // ------------------------------ ResizeBitCast
- template <class D, typename FromV>
- HWY_API VFromD<D> ResizeBitCast(D /*d*/, FromV v) {
- // C-style casts are not sufficient when compiling with
- // -fno-lax-vector-conversions, which will be the future default in Clang,
- // but reinterpret_cast is.
- return VFromD<D>{
- reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(v.raw)};
- }
- // ------------------------------ Set
- // Returns a vector/part with all lanes set to "t".
- template <class D, HWY_IF_NOT_SPECIAL_FLOAT(TFromD<D>)>
- HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
- using RawLane = typename detail::Raw128<TFromD<D>>::RawT;
- return VFromD<D>{vec_splats(static_cast<RawLane>(t))};
- }
- template <class D, HWY_IF_SPECIAL_FLOAT(TFromD<D>)>
- HWY_API VFromD<D> Set(D d, TFromD<D> t) {
- const RebindToUnsigned<decltype(d)> du;
- return BitCast(d, Set(du, BitCastScalar<TFromD<decltype(du)>>(t)));
- }
- // Returns a vector with uninitialized elements.
- template <class D>
- HWY_API VFromD<D> Undefined(D d) {
- #if HWY_COMPILER_GCC_ACTUAL
- // Suppressing maybe-uninitialized both here and at the caller does not work,
- // so initialize.
- return Zero(d);
- #elif HWY_HAS_BUILTIN(__builtin_nondeterministic_value)
- return VFromD<D>{__builtin_nondeterministic_value(Zero(d).raw)};
- #else
- HWY_DIAGNOSTICS(push)
- HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
- typename detail::Raw128<TFromD<D>>::type raw;
- return VFromD<decltype(d)>{raw};
- HWY_DIAGNOSTICS(pop)
- #endif
- }
- // ------------------------------ GetLane
- // Gets the single value stored in a vector/part.
- template <typename T, size_t N>
- HWY_API T GetLane(Vec128<T, N> v) {
- return static_cast<T>(v.raw[0]);
- }
- // ------------------------------ Dup128VecFromValues
- template <class D, HWY_IF_T_SIZE_D(D, 1)>
- HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
- TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
- TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
- TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
- TFromD<D> t11, TFromD<D> t12,
- TFromD<D> t13, TFromD<D> t14,
- TFromD<D> t15) {
- const typename detail::Raw128<TFromD<D>>::type raw = {
- t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15};
- return VFromD<D>{raw};
- }
- template <class D, HWY_IF_UI16_D(D)>
- HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
- TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
- TFromD<D> t5, TFromD<D> t6,
- TFromD<D> t7) {
- const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1, t2, t3,
- t4, t5, t6, t7};
- return VFromD<D>{raw};
- }
- template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
- HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
- TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
- TFromD<D> t5, TFromD<D> t6,
- TFromD<D> t7) {
- const RebindToUnsigned<decltype(d)> du;
- return BitCast(
- d, Dup128VecFromValues(
- du, BitCastScalar<uint16_t>(t0), BitCastScalar<uint16_t>(t1),
- BitCastScalar<uint16_t>(t2), BitCastScalar<uint16_t>(t3),
- BitCastScalar<uint16_t>(t4), BitCastScalar<uint16_t>(t5),
- BitCastScalar<uint16_t>(t6), BitCastScalar<uint16_t>(t7)));
- }
- template <class D, HWY_IF_T_SIZE_D(D, 4)>
- HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
- TFromD<D> t2, TFromD<D> t3) {
- const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1, t2, t3};
- return VFromD<D>{raw};
- }
- template <class D, HWY_IF_T_SIZE_D(D, 8)>
- HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
- const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1};
- return VFromD<D>{raw};
- }
- // ================================================== LOGICAL
- // ------------------------------ And
- template <typename T, size_t N>
- HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
- const DFromV<decltype(a)> d;
- const RebindToUnsigned<decltype(d)> du;
- using VU = VFromD<decltype(du)>;
- #if HWY_S390X_HAVE_Z14
- return BitCast(d, VU{BitCast(du, a).raw & BitCast(du, b).raw});
- #else
- return BitCast(d, VU{vec_and(BitCast(du, a).raw, BitCast(du, b).raw)});
- #endif
- }
- // ------------------------------ AndNot
- // Returns ~not_mask & mask.
- template <typename T, size_t N>
- HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
- const DFromV<decltype(mask)> d;
- const RebindToUnsigned<decltype(d)> du;
- using VU = VFromD<decltype(du)>;
- return BitCast(
- d, VU{vec_andc(BitCast(du, mask).raw, BitCast(du, not_mask).raw)});
- }
- // ------------------------------ Or
- template <typename T, size_t N>
- HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
- const DFromV<decltype(a)> d;
- const RebindToUnsigned<decltype(d)> du;
- using VU = VFromD<decltype(du)>;
- #if HWY_S390X_HAVE_Z14
- return BitCast(d, VU{BitCast(du, a).raw | BitCast(du, b).raw});
- #else
- return BitCast(d, VU{vec_or(BitCast(du, a).raw, BitCast(du, b).raw)});
- #endif
- }
- // ------------------------------ Xor
- template <typename T, size_t N>
- HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
- const DFromV<decltype(a)> d;
- const RebindToUnsigned<decltype(d)> du;
- using VU = VFromD<decltype(du)>;
- #if HWY_S390X_HAVE_Z14
- return BitCast(d, VU{BitCast(du, a).raw ^ BitCast(du, b).raw});
- #else
- return BitCast(d, VU{vec_xor(BitCast(du, a).raw, BitCast(du, b).raw)});
- #endif
- }
- // ------------------------------ Not
- template <typename T, size_t N>
- HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
- const DFromV<decltype(v)> d;
- const RebindToUnsigned<decltype(d)> du;
- using VU = VFromD<decltype(du)>;
- return BitCast(d, VU{vec_nor(BitCast(du, v).raw, BitCast(du, v).raw)});
- }
- // ------------------------------ IsConstantRawAltivecVect
- namespace detail {
- template <class RawV>
- static HWY_INLINE bool IsConstantRawAltivecVect(
- hwy::SizeTag<1> /* lane_size_tag */, RawV v) {
- return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
- __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
- __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
- __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]) &&
- __builtin_constant_p(v[8]) && __builtin_constant_p(v[9]) &&
- __builtin_constant_p(v[10]) && __builtin_constant_p(v[11]) &&
- __builtin_constant_p(v[12]) && __builtin_constant_p(v[13]) &&
- __builtin_constant_p(v[14]) && __builtin_constant_p(v[15]);
- }
- template <class RawV>
- static HWY_INLINE bool IsConstantRawAltivecVect(
- hwy::SizeTag<2> /* lane_size_tag */, RawV v) {
- return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
- __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
- __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
- __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]);
- }
- template <class RawV>
- static HWY_INLINE bool IsConstantRawAltivecVect(
- hwy::SizeTag<4> /* lane_size_tag */, RawV v) {
- return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
- __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]);
- }
- template <class RawV>
- static HWY_INLINE bool IsConstantRawAltivecVect(
- hwy::SizeTag<8> /* lane_size_tag */, RawV v) {
- return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]);
- }
- template <class RawV>
- static HWY_INLINE bool IsConstantRawAltivecVect(RawV v) {
- return IsConstantRawAltivecVect(hwy::SizeTag<sizeof(decltype(v[0]))>(), v);
- }
- } // namespace detail
- // ------------------------------ TernaryLogic
- #if HWY_PPC_HAVE_10
- namespace detail {
- // NOTE: the kTernLogOp bits of the PPC10 TernaryLogic operation are in reverse
- // order of the kTernLogOp bits of AVX3
- // _mm_ternarylogic_epi64(a, b, c, kTernLogOp)
- template <uint8_t kTernLogOp, class V>
- HWY_INLINE V TernaryLogic(V a, V b, V c) {
- const DFromV<decltype(a)> d;
- const RebindToUnsigned<decltype(d)> du;
- using VU = VFromD<decltype(du)>;
- const auto a_raw = BitCast(du, a).raw;
- const auto b_raw = BitCast(du, b).raw;
- const auto c_raw = BitCast(du, c).raw;
- #if HWY_COMPILER_GCC_ACTUAL
- // Use inline assembly on GCC to work around GCC compiler bug
- typename detail::Raw128<TFromV<VU>>::type raw_ternlog_result;
- __asm__("xxeval %x0,%x1,%x2,%x3,%4"
- : "=wa"(raw_ternlog_result)
- : "wa"(a_raw), "wa"(b_raw), "wa"(c_raw),
- "n"(static_cast<unsigned>(kTernLogOp))
- :);
- #else
- const auto raw_ternlog_result =
- vec_ternarylogic(a_raw, b_raw, c_raw, kTernLogOp);
- #endif
- return BitCast(d, VU{raw_ternlog_result});
- }
- } // namespace detail
- #endif // HWY_PPC_HAVE_10
- // ------------------------------ Xor3
- template <typename T, size_t N>
- HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
- #if HWY_PPC_HAVE_10
- #if defined(__OPTIMIZE__)
- if (static_cast<int>(detail::IsConstantRawAltivecVect(x1.raw)) +
- static_cast<int>(detail::IsConstantRawAltivecVect(x2.raw)) +
- static_cast<int>(detail::IsConstantRawAltivecVect(x3.raw)) >=
- 2) {
- return Xor(x1, Xor(x2, x3));
- } else // NOLINT
- #endif
- {
- return detail::TernaryLogic<0x69>(x1, x2, x3);
- }
- #else
- return Xor(x1, Xor(x2, x3));
- #endif
- }
- // ------------------------------ Or3
- template <typename T, size_t N>
- HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
- #if HWY_PPC_HAVE_10
- #if defined(__OPTIMIZE__)
- if (static_cast<int>(detail::IsConstantRawAltivecVect(o1.raw)) +
- static_cast<int>(detail::IsConstantRawAltivecVect(o2.raw)) +
- static_cast<int>(detail::IsConstantRawAltivecVect(o3.raw)) >=
- 2) {
- return Or(o1, Or(o2, o3));
- } else // NOLINT
- #endif
- {
- return detail::TernaryLogic<0x7F>(o1, o2, o3);
- }
- #else
- return Or(o1, Or(o2, o3));
- #endif
- }
- // ------------------------------ OrAnd
- template <typename T, size_t N>
- HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
- #if HWY_PPC_HAVE_10
- #if defined(__OPTIMIZE__)
- if (detail::IsConstantRawAltivecVect(a1.raw) &&
- detail::IsConstantRawAltivecVect(a2.raw)) {
- return Or(o, And(a1, a2));
- } else // NOLINT
- #endif
- {
- return detail::TernaryLogic<0x1F>(o, a1, a2);
- }
- #else
- return Or(o, And(a1, a2));
- #endif
- }
- // ------------------------------ IfVecThenElse
- template <typename T, size_t N>
- HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
- Vec128<T, N> no) {
- const DFromV<decltype(yes)> d;
- const RebindToUnsigned<decltype(d)> du;
- return BitCast(
- d, VFromD<decltype(du)>{vec_sel(BitCast(du, no).raw, BitCast(du, yes).raw,
- BitCast(du, mask).raw)});
- }
- // ------------------------------ BitwiseIfThenElse
- #ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
- #undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
- #else
- #define HWY_NATIVE_BITWISE_IF_THEN_ELSE
- #endif
- template <class V>
- HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
- return IfVecThenElse(mask, yes, no);
- }
- // ------------------------------ Operator overloads (internal-only if float)
- template <typename T, size_t N>
- HWY_API Vec128<T, N> operator&(Vec128<T, N> a, Vec128<T, N> b) {
- return And(a, b);
- }
- template <typename T, size_t N>
- HWY_API Vec128<T, N> operator|(Vec128<T, N> a, Vec128<T, N> b) {
- return Or(a, b);
- }
- template <typename T, size_t N>
- HWY_API Vec128<T, N> operator^(Vec128<T, N> a, Vec128<T, N> b) {
- return Xor(a, b);
- }
- // ================================================== SIGN
- // ------------------------------ Neg
- template <typename T, size_t N, HWY_IF_SIGNED(T)>
- HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
- // If T is an signed integer type, use Zero(d) - v instead of vec_neg to
- // avoid undefined behavior in the case where v[i] == LimitsMin<T>()
- const DFromV<decltype(v)> d;
- return Zero(d) - v;
- }
- template <typename T, size_t N, HWY_IF_FLOAT3264(T)>
- HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
- #if HWY_S390X_HAVE_Z14
- return Xor(v, SignBit(DFromV<decltype(v)>()));
- #else
- return Vec128<T, N>{vec_neg(v.raw)};
- #endif
- }
- template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)>
- HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
- return Xor(v, SignBit(DFromV<decltype(v)>()));
- }
- // ------------------------------ Abs
- // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
- template <class T, size_t N, HWY_IF_SIGNED(T)>
- HWY_API Vec128<T, N> Abs(Vec128<T, N> v) {
- // If T is a signed integer type, use Max(v, Neg(v)) instead of vec_abs to
- // avoid undefined behavior in the case where v[i] == LimitsMin<T>().
- return Max(v, Neg(v));
- }
- template <class T, size_t N, HWY_IF_FLOAT3264(T)>
- HWY_API Vec128<T, N> Abs(Vec128<T, N> v) {
- return Vec128<T, N>{vec_abs(v.raw)};
- }
- // ------------------------------ CopySign
- #if HWY_S390X_HAVE_Z14
- template <class V>
- HWY_API V CopySign(const V magn, const V sign) {
- static_assert(IsFloat<TFromV<V>>(), "Only makes sense for floating-point");
- const DFromV<decltype(magn)> d;
- const auto msb = SignBit(d);
- // Truth table for msb, magn, sign | bitwise msb ? sign : mag
- // 0 0 0 | 0
- // 0 0 1 | 0
- // 0 1 0 | 1
- // 0 1 1 | 1
- // 1 0 0 | 0
- // 1 0 1 | 1
- // 1 1 0 | 0
- // 1 1 1 | 1
- return BitwiseIfThenElse(msb, sign, magn);
- }
- #else // VSX
- template <size_t N>
- HWY_API Vec128<float, N> CopySign(Vec128<float, N> magn,
- Vec128<float, N> sign) {
- // Work around compiler bugs that are there with vec_cpsgn on older versions
- // of GCC/Clang
- #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200
- return Vec128<float, N>{__builtin_vec_copysign(magn.raw, sign.raw)};
- #elif HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 && \
- HWY_HAS_BUILTIN(__builtin_vsx_xvcpsgnsp)
- return Vec128<float, N>{__builtin_vsx_xvcpsgnsp(magn.raw, sign.raw)};
- #else
- return Vec128<float, N>{vec_cpsgn(sign.raw, magn.raw)};
- #endif
- }
- template <size_t N>
- HWY_API Vec128<double, N> CopySign(Vec128<double, N> magn,
- Vec128<double, N> sign) {
- // Work around compiler bugs that are there with vec_cpsgn on older versions
- // of GCC/Clang
- #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200
- return Vec128<double, N>{__builtin_vec_copysign(magn.raw, sign.raw)};
- #elif HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 && \
- HWY_HAS_BUILTIN(__builtin_vsx_xvcpsgndp)
- return Vec128<double, N>{__builtin_vsx_xvcpsgndp(magn.raw, sign.raw)};
- #else
- return Vec128<double, N>{vec_cpsgn(sign.raw, magn.raw)};
- #endif
- }
- #endif // HWY_S390X_HAVE_Z14
- template <typename T, size_t N>
- HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
- // PPC8 can also handle abs < 0, so no extra action needed.
- static_assert(IsFloat<T>(), "Only makes sense for floating-point");
- return CopySign(abs, sign);
- }
- // ================================================== MEMORY (1)
- // Note: type punning is safe because the types are tagged with may_alias.
- // (https://godbolt.org/z/fqrWjfjsP)
- // ------------------------------ Load
- template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
- HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
- // Suppress the ignoring attributes warning that is generated by
- // HWY_RCAST_ALIGNED(const LoadRaw*, aligned) with GCC
- #if HWY_COMPILER_GCC
- HWY_DIAGNOSTICS(push)
- HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
- #endif
- using LoadRaw = typename detail::Raw128<T>::AlignedRawVec;
- const LoadRaw* HWY_RESTRICT p = HWY_RCAST_ALIGNED(const LoadRaw*, aligned);
- using ResultRaw = typename detail::Raw128<T>::type;
- return Vec128<T>{reinterpret_cast<ResultRaw>(*p)};
- #if HWY_COMPILER_GCC
- HWY_DIAGNOSTICS(pop)
- #endif
- }
- // Any <= 64 bit
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
- HWY_API VFromD<D> Load(D d, const T* HWY_RESTRICT p) {
- using BitsT = UnsignedFromSize<d.MaxBytes()>;
- BitsT bits;
- const Repartition<BitsT, decltype(d)> d_bits;
- CopyBytes<d.MaxBytes()>(p, &bits);
- return BitCast(d, Set(d_bits, bits));
- }
- // ================================================== MASK
- // ------------------------------ Mask
- // Mask and Vec are both backed by vector types (true = FF..FF).
- template <typename T, size_t N>
- HWY_API Mask128<T, N> MaskFromVec(Vec128<T, N> v) {
- using Raw = typename detail::Raw128<T>::RawBoolVec;
- return Mask128<T, N>{reinterpret_cast<Raw>(v.raw)};
- }
- template <class D>
- using MFromD = decltype(MaskFromVec(VFromD<D>()));
- template <typename T, size_t N>
- HWY_API Vec128<T, N> VecFromMask(Mask128<T, N> v) {
- return Vec128<T, N>{
- reinterpret_cast<typename detail::Raw128<T>::type>(v.raw)};
- }
- template <class D>
- HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) {
- return VFromD<D>{
- reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(v.raw)};
- }
- // mask ? yes : no
- template <typename T, size_t N>
- HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
- Vec128<T, N> no) {
- const DFromV<decltype(yes)> d;
- const RebindToUnsigned<decltype(d)> du;
- return BitCast(d, VFromD<decltype(du)>{vec_sel(
- BitCast(du, no).raw, BitCast(du, yes).raw, mask.raw)});
- }
- // mask ? yes : 0
- template <typename T, size_t N>
- HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
- return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
- }
- // mask ? 0 : no
- template <typename T, size_t N>
- HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
- return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
- }
- // ------------------------------ Mask logical
- template <typename T, size_t N>
- HWY_API Mask128<T, N> Not(Mask128<T, N> m) {
- return Mask128<T, N>{vec_nor(m.raw, m.raw)};
- }
- template <typename T, size_t N>
- HWY_API Mask128<T, N> And(Mask128<T, N> a, Mask128<T, N> b) {
- #if HWY_S390X_HAVE_Z14
- return Mask128<T, N>{a.raw & b.raw};
- #else
- return Mask128<T, N>{vec_and(a.raw, b.raw)};
- #endif
- }
- template <typename T, size_t N>
- HWY_API Mask128<T, N> AndNot(Mask128<T, N> a, Mask128<T, N> b) {
- return Mask128<T, N>{vec_andc(b.raw, a.raw)};
- }
- template <typename T, size_t N>
- HWY_API Mask128<T, N> Or(Mask128<T, N> a, Mask128<T, N> b) {
- #if HWY_S390X_HAVE_Z14
- return Mask128<T, N>{a.raw | b.raw};
- #else
- return Mask128<T, N>{vec_or(a.raw, b.raw)};
- #endif
- }
- template <typename T, size_t N>
- HWY_API Mask128<T, N> Xor(Mask128<T, N> a, Mask128<T, N> b) {
- #if HWY_S390X_HAVE_Z14
- return Mask128<T, N>{a.raw ^ b.raw};
- #else
- return Mask128<T, N>{vec_xor(a.raw, b.raw)};
- #endif
- }
- template <typename T, size_t N>
- HWY_API Mask128<T, N> ExclusiveNeither(Mask128<T, N> a, Mask128<T, N> b) {
- return Mask128<T, N>{vec_nor(a.raw, b.raw)};
- }
- // ------------------------------ ShiftLeftSame
- template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
- HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, const int bits) {
- const DFromV<decltype(v)> d;
- const RebindToUnsigned<decltype(d)> du;
- using TU = TFromD<decltype(du)>;
- #if HWY_S390X_HAVE_Z14
- return BitCast(d,
- VFromD<decltype(du)>{BitCast(du, v).raw
- << Set(du, static_cast<TU>(bits)).raw});
- #else
- // Do an unsigned vec_sl operation to avoid undefined behavior
- return BitCast(
- d, VFromD<decltype(du)>{
- vec_sl(BitCast(du, v).raw, Set(du, static_cast<TU>(bits)).raw)});
- #endif
- }
- // ------------------------------ ShiftRightSame
- template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
- HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, const int bits) {
- using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
- #if HWY_S390X_HAVE_Z14
- return Vec128<T, N>{v.raw >> vec_splats(static_cast<TU>(bits))};
- #else
- return Vec128<T, N>{vec_sr(v.raw, vec_splats(static_cast<TU>(bits)))};
- #endif
- }
- template <typename T, size_t N, HWY_IF_SIGNED(T)>
- HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, const int bits) {
- #if HWY_S390X_HAVE_Z14
- using TI = typename detail::Raw128<T>::RawT;
- return Vec128<T, N>{v.raw >> vec_splats(static_cast<TI>(bits))};
- #else
- using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
- return Vec128<T, N>{vec_sra(v.raw, vec_splats(static_cast<TU>(bits)))};
- #endif
- }
- // ------------------------------ ShiftLeft
- template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
- HWY_API Vec128<T, N> ShiftLeft(Vec128<T, N> v) {
- static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
- return ShiftLeftSame(v, kBits);
- }
- // ------------------------------ ShiftRight
- template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
- HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) {
- static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
- return ShiftRightSame(v, kBits);
- }
- // ------------------------------ BroadcastSignBit
- template <typename T, size_t N, HWY_IF_SIGNED(T)>
- HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
- return ShiftRightSame(v, static_cast<int>(sizeof(T) * 8 - 1));
- }
- // ================================================== SWIZZLE (1)
- // ------------------------------ TableLookupBytes
- template <typename T, size_t N, typename TI, size_t NI>
- HWY_API Vec128<TI, NI> TableLookupBytes(Vec128<T, N> bytes,
- Vec128<TI, NI> from) {
- const Repartition<uint8_t, DFromV<decltype(from)>> du8_from;
- return Vec128<TI, NI>{reinterpret_cast<typename detail::Raw128<TI>::type>(
- vec_perm(bytes.raw, bytes.raw, BitCast(du8_from, from).raw))};
- }
- // ------------------------------ TableLookupBytesOr0
- // For all vector widths; Altivec/VSX needs zero out
- template <class V, class VI>
- HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
- const DFromV<VI> di;
- Repartition<int8_t, decltype(di)> di8;
- const VI zeroOutMask = BitCast(di, BroadcastSignBit(BitCast(di8, from)));
- return AndNot(zeroOutMask, TableLookupBytes(bytes, from));
- }
- // ------------------------------ Reverse
- template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1)>
- HWY_API Vec128<T> Reverse(D /* tag */, Vec128<T> v) {
- return Vec128<T>{vec_reve(v.raw)};
- }
- // ------------------------------ Shuffles (Reverse)
- // Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
- // Shuffle0321 rotates one lane to the right (the previous least-significant
- // lane is now most-significant). These could also be implemented via
- // CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
- // Swap 32-bit halves in 64-bit halves.
- template <typename T, size_t N>
- HWY_API Vec128<T, N> Shuffle2301(Vec128<T, N> v) {
- static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
- static_assert(N == 2 || N == 4, "Does not make sense for N=1");
- const __vector unsigned char kShuffle = {4, 5, 6, 7, 0, 1, 2, 3,
- 12, 13, 14, 15, 8, 9, 10, 11};
- return Vec128<T, N>{vec_perm(v.raw, v.raw, kShuffle)};
- }
- // These are used by generic_ops-inl to implement LoadInterleaved3. As with
- // Intel's shuffle* intrinsics and InterleaveLower, the lower half of the output
- // comes from the first argument.
- namespace detail {
- template <typename T, HWY_IF_T_SIZE(T, 1)>
- HWY_API Vec32<T> ShuffleTwo2301(Vec32<T> a, Vec32<T> b) {
- const __vector unsigned char kShuffle16 = {1, 0, 19, 18};
- return Vec32<T>{vec_perm(a.raw, b.raw, kShuffle16)};
- }
- template <typename T, HWY_IF_T_SIZE(T, 2)>
- HWY_API Vec64<T> ShuffleTwo2301(Vec64<T> a, Vec64<T> b) {
- const __vector unsigned char kShuffle = {2, 3, 0, 1, 22, 23, 20, 21};
- return Vec64<T>{vec_perm(a.raw, b.raw, kShuffle)};
- }
- template <typename T, HWY_IF_T_SIZE(T, 4)>
- HWY_API Vec128<T> ShuffleTwo2301(Vec128<T> a, Vec128<T> b) {
- const __vector unsigned char kShuffle = {4, 5, 6, 7, 0, 1, 2, 3,
- 28, 29, 30, 31, 24, 25, 26, 27};
- return Vec128<T>{vec_perm(a.raw, b.raw, kShuffle)};
- }
- template <typename T, HWY_IF_T_SIZE(T, 1)>
- HWY_API Vec32<T> ShuffleTwo1230(Vec32<T> a, Vec32<T> b) {
- const __vector unsigned char kShuffle = {0, 3, 18, 17};
- return Vec32<T>{vec_perm(a.raw, b.raw, kShuffle)};
- }
- template <typename T, HWY_IF_T_SIZE(T, 2)>
- HWY_API Vec64<T> ShuffleTwo1230(Vec64<T> a, Vec64<T> b) {
- const __vector unsigned char kShuffle = {0, 1, 6, 7, 20, 21, 18, 19};
- return Vec64<T>{vec_perm(a.raw, b.raw, kShuffle)};
- }
- template <typename T, HWY_IF_T_SIZE(T, 4)>
- HWY_API Vec128<T> ShuffleTwo1230(Vec128<T> a, Vec128<T> b) {
- const __vector unsigned char kShuffle = {0, 1, 2, 3, 12, 13, 14, 15,
- 24, 25, 26, 27, 20, 21, 22, 23};
- return Vec128<T>{vec_perm(a.raw, b.raw, kShuffle)};
- }
- template <typename T, HWY_IF_T_SIZE(T, 1)>
- HWY_API Vec32<T> ShuffleTwo3012(Vec32<T> a, Vec32<T> b) {
- const __vector unsigned char kShuffle = {2, 1, 16, 19};
- return Vec32<T>{vec_perm(a.raw, b.raw, kShuffle)};
- }
- template <typename T, HWY_IF_T_SIZE(T, 2)>
- HWY_API Vec64<T> ShuffleTwo3012(Vec64<T> a, Vec64<T> b) {
- const __vector unsigned char kShuffle = {4, 5, 2, 3, 16, 17, 22, 23};
- return Vec64<T>{vec_perm(a.raw, b.raw, kShuffle)};
- }
- template <typename T, HWY_IF_T_SIZE(T, 4)>
- HWY_API Vec128<T> ShuffleTwo3012(Vec128<T> a, Vec128<T> b) {
- const __vector unsigned char kShuffle = {8, 9, 10, 11, 4, 5, 6, 7,
- 16, 17, 18, 19, 28, 29, 30, 31};
- return Vec128<T>{vec_perm(a.raw, b.raw, kShuffle)};
- }
- } // namespace detail
- // Swap 64-bit halves
- template <class T, HWY_IF_T_SIZE(T, 4)>
- HWY_API Vec128<T> Shuffle1032(Vec128<T> v) {
- const Full128<T> d;
- const Full128<uint64_t> du64;
- return BitCast(d, Reverse(du64, BitCast(du64, v)));
- }
- template <class T, HWY_IF_T_SIZE(T, 8)>
- HWY_API Vec128<T> Shuffle01(Vec128<T> v) {
- return Reverse(Full128<T>(), v);
- }
- // Rotate right 32 bits
- template <class T, HWY_IF_T_SIZE(T, 4)>
- HWY_API Vec128<T> Shuffle0321(Vec128<T> v) {
- #if HWY_IS_LITTLE_ENDIAN
- return Vec128<T>{vec_sld(v.raw, v.raw, 12)};
- #else
- return Vec128<T>{vec_sld(v.raw, v.raw, 4)};
- #endif
- }
- // Rotate left 32 bits
- template <class T, HWY_IF_T_SIZE(T, 4)>
- HWY_API Vec128<T> Shuffle2103(Vec128<T> v) {
- #if HWY_IS_LITTLE_ENDIAN
- return Vec128<T>{vec_sld(v.raw, v.raw, 4)};
- #else
- return Vec128<T>{vec_sld(v.raw, v.raw, 12)};
- #endif
- }
- template <class T, HWY_IF_T_SIZE(T, 4)>
- HWY_API Vec128<T> Shuffle0123(Vec128<T> v) {
- return Reverse(Full128<T>(), v);
- }
- // ================================================== COMPARE
- // Comparisons fill a lane with 1-bits if the condition is true, else 0.
- template <class DTo, typename TFrom, size_t NFrom>
- HWY_API MFromD<DTo> RebindMask(DTo /*dto*/, Mask128<TFrom, NFrom> m) {
- static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
- return MFromD<DTo>{m.raw};
- }
- template <typename T, size_t N>
- HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
- static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
- return (v & bit) == bit;
- }
- // ------------------------------ Equality
- template <typename T, size_t N>
- HWY_API Mask128<T, N> operator==(Vec128<T, N> a, Vec128<T, N> b) {
- return Mask128<T, N>{vec_cmpeq(a.raw, b.raw)};
- }
- // ------------------------------ Inequality
- // This cannot have T as a template argument, otherwise it is not more
- // specialized than rewritten operator== in C++20, leading to compile
- // errors: https://gcc.godbolt.org/z/xsrPhPvPT.
- template <size_t N>
- HWY_API Mask128<uint8_t, N> operator!=(Vec128<uint8_t, N> a,
- Vec128<uint8_t, N> b) {
- #if HWY_PPC_HAVE_9
- return Mask128<uint8_t, N>{vec_cmpne(a.raw, b.raw)};
- #else
- return Not(a == b);
- #endif
- }
- template <size_t N>
- HWY_API Mask128<uint16_t, N> operator!=(Vec128<uint16_t, N> a,
- Vec128<uint16_t, N> b) {
- #if HWY_PPC_HAVE_9
- return Mask128<uint16_t, N>{vec_cmpne(a.raw, b.raw)};
- #else
- return Not(a == b);
- #endif
- }
- template <size_t N>
- HWY_API Mask128<uint32_t, N> operator!=(Vec128<uint32_t, N> a,
- Vec128<uint32_t, N> b) {
- #if HWY_PPC_HAVE_9
- return Mask128<uint32_t, N>{vec_cmpne(a.raw, b.raw)};
- #else
- return Not(a == b);
- #endif
- }
- template <size_t N>
- HWY_API Mask128<uint64_t, N> operator!=(Vec128<uint64_t, N> a,
- Vec128<uint64_t, N> b) {
- return Not(a == b);
- }
- template <size_t N>
- HWY_API Mask128<int8_t, N> operator!=(Vec128<int8_t, N> a,
- Vec128<int8_t, N> b) {
- #if HWY_PPC_HAVE_9
- return Mask128<int8_t, N>{vec_cmpne(a.raw, b.raw)};
- #else
- return Not(a == b);
- #endif
- }
- template <size_t N>
- HWY_API Mask128<int16_t, N> operator!=(Vec128<int16_t, N> a,
- Vec128<int16_t, N> b) {
- #if HWY_PPC_HAVE_9
- return Mask128<int16_t, N>{vec_cmpne(a.raw, b.raw)};
- #else
- return Not(a == b);
- #endif
- }
- template <size_t N>
- HWY_API Mask128<int32_t, N> operator!=(Vec128<int32_t, N> a,
- Vec128<int32_t, N> b) {
- #if HWY_PPC_HAVE_9
- return Mask128<int32_t, N>{vec_cmpne(a.raw, b.raw)};
- #else
- return Not(a == b);
- #endif
- }
- template <size_t N>
- HWY_API Mask128<int64_t, N> operator!=(Vec128<int64_t, N> a,
- Vec128<int64_t, N> b) {
- return Not(a == b);
- }
- template <size_t N>
- HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) {
- return Not(a == b);
- }
- template <size_t N>
- HWY_API Mask128<double, N> operator!=(Vec128<double, N> a,
- Vec128<double, N> b) {
- return Not(a == b);
- }
- // ------------------------------ Strict inequality
- template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
- HWY_INLINE Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
- return Mask128<T, N>{vec_cmpgt(a.raw, b.raw)};
- }
- // ------------------------------ Weak inequality
- template <typename T, size_t N, HWY_IF_FLOAT(T)>
- HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
- return Mask128<T, N>{vec_cmpge(a.raw, b.raw)};
- }
- template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
- HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
- return Not(b > a);
- }
- // ------------------------------ Reversed comparisons
- template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
- HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) {
- return b > a;
- }
- template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
- HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) {
- return b >= a;
- }
- // ================================================== MEMORY (2)
- // ------------------------------ Load
- template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
- HWY_API Vec128<T> LoadU(D /* tag */, const T* HWY_RESTRICT p) {
- using LoadRaw = typename detail::Raw128<T>::UnalignedRawVec;
- const LoadRaw* HWY_RESTRICT praw = reinterpret_cast<const LoadRaw*>(p);
- using ResultRaw = typename detail::Raw128<T>::type;
- return Vec128<T>{reinterpret_cast<ResultRaw>(*praw)};
- }
- // For < 128 bit, LoadU == Load.
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
- HWY_API VFromD<D> LoadU(D d, const T* HWY_RESTRICT p) {
- return Load(d, p);
- }
- // 128-bit SIMD => nothing to duplicate, same as an unaligned load.
- template <class D, typename T = TFromD<D>>
- HWY_API VFromD<D> LoadDup128(D d, const T* HWY_RESTRICT p) {
- return LoadU(d, p);
- }
- #if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
- #ifdef HWY_NATIVE_LOAD_N
- #undef HWY_NATIVE_LOAD_N
- #else
- #define HWY_NATIVE_LOAD_N
- #endif
- template <class D, typename T = TFromD<D>>
- HWY_API VFromD<D> LoadN(D d, const T* HWY_RESTRICT p,
- size_t max_lanes_to_load) {
- #if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
- if (__builtin_constant_p(max_lanes_to_load) && max_lanes_to_load == 0) {
- return Zero(d);
- }
- if (__builtin_constant_p(max_lanes_to_load >= HWY_MAX_LANES_D(D)) &&
- max_lanes_to_load >= HWY_MAX_LANES_D(D)) {
- return LoadU(d, p);
- }
- #endif
- const size_t num_of_bytes_to_load =
- HWY_MIN(max_lanes_to_load, HWY_MAX_LANES_D(D)) * sizeof(TFromD<D>);
- const Repartition<uint8_t, decltype(d)> du8;
- #if HWY_S390X_HAVE_Z14
- return (num_of_bytes_to_load > 0)
- ? BitCast(d, VFromD<decltype(du8)>{vec_load_len(
- const_cast<unsigned char*>(
- reinterpret_cast<const unsigned char*>(p)),
- static_cast<unsigned>(num_of_bytes_to_load - 1))})
- : Zero(d);
- #else
- return BitCast(
- d,
- VFromD<decltype(du8)>{vec_xl_len(
- const_cast<unsigned char*>(reinterpret_cast<const unsigned char*>(p)),
- num_of_bytes_to_load)});
- #endif
- }
- template <class D, typename T = TFromD<D>>
- HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const T* HWY_RESTRICT p,
- size_t max_lanes_to_load) {
- #if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
- if (__builtin_constant_p(max_lanes_to_load) && max_lanes_to_load == 0) {
- return no;
- }
- if (__builtin_constant_p(max_lanes_to_load >= HWY_MAX_LANES_D(D)) &&
- max_lanes_to_load >= HWY_MAX_LANES_D(D)) {
- return LoadU(d, p);
- }
- #endif
- return IfThenElse(FirstN(d, max_lanes_to_load),
- LoadN(d, p, max_lanes_to_load), no);
- }
- #endif // HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
- // Returns a vector with lane i=[0, N) set to "first" + i.
- namespace detail {
- template <class D, HWY_IF_T_SIZE_D(D, 1)>
- HWY_INLINE VFromD<D> Iota0(D d) {
- constexpr __vector unsigned char kU8Iota0 = {0, 1, 2, 3, 4, 5, 6, 7,
- 8, 9, 10, 11, 12, 13, 14, 15};
- return BitCast(d, VFromD<RebindToUnsigned<D>>{kU8Iota0});
- }
- template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
- HWY_INLINE VFromD<D> Iota0(D d) {
- constexpr __vector unsigned short kU16Iota0 = {0, 1, 2, 3, 4, 5, 6, 7};
- return BitCast(d, VFromD<RebindToUnsigned<D>>{kU16Iota0});
- }
- template <class D, HWY_IF_UI32_D(D)>
- HWY_INLINE VFromD<D> Iota0(D d) {
- constexpr __vector unsigned int kU32Iota0 = {0, 1, 2, 3};
- return BitCast(d, VFromD<RebindToUnsigned<D>>{kU32Iota0});
- }
- template <class D, HWY_IF_UI64_D(D)>
- HWY_INLINE VFromD<D> Iota0(D d) {
- constexpr __vector unsigned long long kU64Iota0 = {0, 1};
- return BitCast(d, VFromD<RebindToUnsigned<D>>{kU64Iota0});
- }
- template <class D, HWY_IF_F32_D(D)>
- HWY_INLINE VFromD<D> Iota0(D /*d*/) {
- constexpr __vector float kF32Iota0 = {0.0f, 1.0f, 2.0f, 3.0f};
- return VFromD<D>{kF32Iota0};
- }
- template <class D, HWY_IF_F64_D(D)>
- HWY_INLINE VFromD<D> Iota0(D /*d*/) {
- constexpr __vector double kF64Iota0 = {0.0, 1.0};
- return VFromD<D>{kF64Iota0};
- }
- } // namespace detail
- template <class D, typename T2>
- HWY_API VFromD<D> Iota(D d, const T2 first) {
- return detail::Iota0(d) + Set(d, static_cast<TFromD<D>>(first));
- }
- // ------------------------------ FirstN (Iota, Lt)
- template <class D>
- HWY_API MFromD<D> FirstN(D d, size_t num) {
- const RebindToUnsigned<decltype(d)> du;
- using TU = TFromD<decltype(du)>;
- return RebindMask(d, Iota(du, 0) < Set(du, static_cast<TU>(num)));
- }
- // ------------------------------ MaskedLoad
- template <class D, typename T = TFromD<D>>
- HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, const T* HWY_RESTRICT p) {
- return IfThenElseZero(m, LoadU(d, p));
- }
- // ------------------------------ MaskedLoadOr
- template <class D, typename T = TFromD<D>>
- HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
- const T* HWY_RESTRICT p) {
- return IfThenElse(m, LoadU(d, p), v);
- }
- // ------------------------------ Store
- template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
- HWY_API void Store(Vec128<T> v, D /* tag */, T* HWY_RESTRICT aligned) {
- // Suppress the ignoring attributes warning that is generated by
- // HWY_RCAST_ALIGNED(StoreRaw*, aligned) with GCC
- #if HWY_COMPILER_GCC
- HWY_DIAGNOSTICS(push)
- HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
- #endif
- using StoreRaw = typename detail::Raw128<T>::AlignedRawVec;
- *HWY_RCAST_ALIGNED(StoreRaw*, aligned) = reinterpret_cast<StoreRaw>(v.raw);
- #if HWY_COMPILER_GCC
- HWY_DIAGNOSTICS(pop)
- #endif
- }
- template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
- HWY_API void StoreU(Vec128<T> v, D /* tag */, T* HWY_RESTRICT p) {
- using StoreRaw = typename detail::Raw128<T>::UnalignedRawVec;
- *reinterpret_cast<StoreRaw*>(p) = reinterpret_cast<StoreRaw>(v.raw);
- }
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
- HWY_API void Store(VFromD<D> v, D d, T* HWY_RESTRICT p) {
- using BitsT = UnsignedFromSize<d.MaxBytes()>;
- const Repartition<BitsT, decltype(d)> d_bits;
- const BitsT bits = GetLane(BitCast(d_bits, v));
- CopyBytes<d.MaxBytes()>(&bits, p);
- }
- // For < 128 bit, StoreU == Store.
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
- HWY_API void StoreU(VFromD<D> v, D d, T* HWY_RESTRICT p) {
- Store(v, d, p);
- }
- #if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
- #ifdef HWY_NATIVE_STORE_N
- #undef HWY_NATIVE_STORE_N
- #else
- #define HWY_NATIVE_STORE_N
- #endif
- template <class D, typename T = TFromD<D>>
- HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
- size_t max_lanes_to_store) {
- #if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
- if (__builtin_constant_p(max_lanes_to_store) && max_lanes_to_store == 0) {
- return;
- }
- if (__builtin_constant_p(max_lanes_to_store >= HWY_MAX_LANES_D(D)) &&
- max_lanes_to_store >= HWY_MAX_LANES_D(D)) {
- StoreU(v, d, p);
- return;
- }
- #endif
- const size_t num_of_bytes_to_store =
- HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D)) * sizeof(TFromD<D>);
- const Repartition<uint8_t, decltype(d)> du8;
- #if HWY_S390X_HAVE_Z14
- if (num_of_bytes_to_store > 0) {
- vec_store_len(BitCast(du8, v).raw, reinterpret_cast<unsigned char*>(p),
- static_cast<unsigned>(num_of_bytes_to_store - 1));
- }
- #else
- vec_xst_len(BitCast(du8, v).raw, reinterpret_cast<unsigned char*>(p),
- num_of_bytes_to_store);
- #endif
- }
- #endif
- // ------------------------------ BlendedStore
- template <class D>
- HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
- TFromD<D>* HWY_RESTRICT p) {
- const VFromD<D> old = LoadU(d, p);
- StoreU(IfThenElse(RebindMask(d, m), v, old), d, p);
- }
- // ================================================== ARITHMETIC
- namespace detail {
- // If TFromD<D> is an integer type, detail::RebindToUnsignedIfNotFloat<D>
- // rebinds D to MakeUnsigned<TFromD<D>>.
- // Otherwise, if TFromD<D> is a floating-point type (including F16 and BF16),
- // detail::RebindToUnsignedIfNotFloat<D> is the same as D.
- template <class D>
- using RebindToUnsignedIfNotFloat =
- hwy::If<(!hwy::IsFloat<TFromD<D>>() && !hwy::IsSpecialFloat<TFromD<D>>()),
- RebindToUnsigned<D>, D>;
- } // namespace detail
- // ------------------------------ Addition
- template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
- HWY_API Vec128<T, N> operator+(Vec128<T, N> a, Vec128<T, N> b) {
- const DFromV<decltype(a)> d;
- const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
- // If T is an integer type, do an unsigned vec_add to avoid undefined behavior
- #if HWY_S390X_HAVE_Z14
- return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw +
- BitCast(d_arith, b).raw});
- #else
- return BitCast(d, VFromD<decltype(d_arith)>{vec_add(
- BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
- #endif
- }
- // ------------------------------ Subtraction
- template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
- HWY_API Vec128<T, N> operator-(Vec128<T, N> a, Vec128<T, N> b) {
- const DFromV<decltype(a)> d;
- const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
- // If T is an integer type, do an unsigned vec_sub to avoid undefined behavior
- #if HWY_S390X_HAVE_Z14
- return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw -
- BitCast(d_arith, b).raw});
- #else
- return BitCast(d, VFromD<decltype(d_arith)>{vec_sub(
- BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
- #endif
- }
- // ------------------------------ SumsOf8
- template <class V, HWY_IF_U8(TFromV<V>)>
- HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8(V v) {
- return SumsOf2(SumsOf4(v));
- }
- template <class V, HWY_IF_I8(TFromV<V>)>
- HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8(V v) {
- #if HWY_S390X_HAVE_Z14
- const DFromV<decltype(v)> di8;
- const RebindToUnsigned<decltype(di8)> du8;
- const RepartitionToWideX3<decltype(di8)> di64;
- return BitCast(di64, SumsOf8(BitCast(du8, Xor(v, SignBit(di8))))) +
- Set(di64, int64_t{-1024});
- #else
- return SumsOf2(SumsOf4(v));
- #endif
- }
- // ------------------------------ SaturatedAdd
- // Returns a + b clamped to the destination range.
- #if HWY_S390X_HAVE_Z14
- // Z14/Z15/Z16 does not have I8/U8/I16/U16 SaturatedAdd instructions unlike most
- // other integer SIMD instruction sets
- template <typename T, size_t N, HWY_IF_UNSIGNED(T),
- HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
- HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
- return Add(a, Min(b, Not(a)));
- }
- template <typename T, size_t N, HWY_IF_SIGNED(T),
- HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
- HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
- const DFromV<decltype(a)> d;
- const auto sum = Add(a, b);
- const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum));
- const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax<T>()));
- return IfNegativeThenElse(overflow_mask, overflow_result, sum);
- }
- #else // VSX
- #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
- #undef HWY_NATIVE_I32_SATURATED_ADDSUB
- #else
- #define HWY_NATIVE_I32_SATURATED_ADDSUB
- #endif
- #ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
- #undef HWY_NATIVE_U32_SATURATED_ADDSUB
- #else
- #define HWY_NATIVE_U32_SATURATED_ADDSUB
- #endif
- template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
- HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
- HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
- return Vec128<T, N>{vec_adds(a.raw, b.raw)};
- }
- #endif // HWY_S390X_HAVE_Z14
- #if HWY_PPC_HAVE_10
- #ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
- #undef HWY_NATIVE_I64_SATURATED_ADDSUB
- #else
- #define HWY_NATIVE_I64_SATURATED_ADDSUB
- #endif
- template <class V, HWY_IF_I64_D(DFromV<V>)>
- HWY_API V SaturatedAdd(V a, V b) {
- const DFromV<decltype(a)> d;
- const auto sum = Add(a, b);
- const auto overflow_mask =
- BroadcastSignBit(detail::TernaryLogic<0x42>(a, b, sum));
- const auto overflow_result =
- Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>()));
- return IfNegativeThenElse(overflow_mask, overflow_result, sum);
- }
- #endif // HWY_PPC_HAVE_10
- // ------------------------------ SaturatedSub
- // Returns a - b clamped to the destination range.
- #if HWY_S390X_HAVE_Z14
- // Z14/Z15/Z16 does not have I8/U8/I16/U16 SaturatedSub instructions unlike most
- // other integer SIMD instruction sets
- template <typename T, size_t N, HWY_IF_UNSIGNED(T),
- HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
- HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
- return Sub(a, Min(a, b));
- }
- template <typename T, size_t N, HWY_IF_SIGNED(T),
- HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
- HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
- const DFromV<decltype(a)> d;
- const auto diff = Sub(a, b);
- const auto overflow_mask = And(Xor(a, b), Xor(a, diff));
- const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax<T>()));
- return IfNegativeThenElse(overflow_mask, overflow_result, diff);
- }
- #else // VSX
- template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
- HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
- HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
- return Vec128<T, N>{vec_subs(a.raw, b.raw)};
- }
- #endif // HWY_S390X_HAVE_Z14
- #if HWY_PPC_HAVE_10
- template <class V, HWY_IF_I64_D(DFromV<V>)>
- HWY_API V SaturatedSub(V a, V b) {
- const DFromV<decltype(a)> d;
- const auto diff = Sub(a, b);
- const auto overflow_mask =
- BroadcastSignBit(detail::TernaryLogic<0x18>(a, b, diff));
- const auto overflow_result =
- Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>()));
- return IfNegativeThenElse(overflow_mask, overflow_result, diff);
- }
- #endif // HWY_PPC_HAVE_10
- // ------------------------------ AverageRound
- // Returns (a + b + 1) / 2
- template <typename T, size_t N, HWY_IF_UNSIGNED(T),
- HWY_IF_T_SIZE_ONE_OF(T, 0x6)>
- HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, Vec128<T, N> b) {
- return Vec128<T, N>{vec_avg(a.raw, b.raw)};
- }
- // ------------------------------ Multiplication
- // Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
- #ifdef HWY_NATIVE_MUL_8
- #undef HWY_NATIVE_MUL_8
- #else
- #define HWY_NATIVE_MUL_8
- #endif
- #ifdef HWY_NATIVE_MUL_64
- #undef HWY_NATIVE_MUL_64
- #else
- #define HWY_NATIVE_MUL_64
- #endif
- template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
- HWY_API Vec128<T, N> operator*(Vec128<T, N> a, Vec128<T, N> b) {
- const DFromV<decltype(a)> d;
- const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
- // If T is an integer type, do an unsigned vec_mul to avoid undefined behavior
- #if HWY_S390X_HAVE_Z14
- return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw *
- BitCast(d_arith, b).raw});
- #else
- return BitCast(d, VFromD<decltype(d_arith)>{vec_mul(
- BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
- #endif
- }
- // Returns the upper sizeof(T)*8 bits of a * b in each lane.
- #if HWY_S390X_HAVE_Z14
- #define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
- HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))
- #define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
- hwy::EnableIf<!hwy::IsSame<T, T>()>* = nullptr
- #elif HWY_PPC_HAVE_10
- #define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
- HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))
- #define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
- HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))
- #else
- #define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
- hwy::EnableIf<!hwy::IsSame<T, T>()>* = nullptr
- #define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
- HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))
- #endif
- #if HWY_S390X_HAVE_Z14 || HWY_PPC_HAVE_10
- template <typename T, size_t N, HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T),
- HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
- HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
- return Vec128<T, N>{vec_mulh(a.raw, b.raw)};
- }
- #endif
- template <typename T, HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T),
- HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
- HWY_API Vec128<T, 1> MulHigh(Vec128<T, 1> a, Vec128<T, 1> b) {
- const auto p_even = MulEven(a, b);
- #if HWY_IS_LITTLE_ENDIAN
- const auto p_even_full = ResizeBitCast(Full128<T>(), p_even);
- return Vec128<T, 1>{
- vec_sld(p_even_full.raw, p_even_full.raw, 16 - sizeof(T))};
- #else
- const DFromV<decltype(a)> d;
- return ResizeBitCast(d, p_even);
- #endif
- }
- template <typename T, size_t N,
- HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T),
- HWY_IF_NOT_FLOAT_NOR_SPECIAL(T), HWY_IF_LANES_GT(N, 1)>
- HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
- const DFromV<decltype(a)> d;
- const auto p_even = BitCast(d, MulEven(a, b));
- const auto p_odd = BitCast(d, MulOdd(a, b));
- #if HWY_IS_LITTLE_ENDIAN
- return InterleaveOdd(d, p_even, p_odd);
- #else
- return InterleaveEven(d, p_even, p_odd);
- #endif
- }
- #if !HWY_PPC_HAVE_10
- template <class T, HWY_IF_UI64(T)>
- HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
- T p_hi;
- Mul128(GetLane(a), GetLane(b), &p_hi);
- return Set(Full64<T>(), p_hi);
- }
- template <class T, HWY_IF_UI64(T)>
- HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
- const DFromV<decltype(a)> d;
- const Half<decltype(d)> dh;
- return Combine(d, MulHigh(UpperHalf(dh, a), UpperHalf(dh, b)),
- MulHigh(LowerHalf(dh, a), LowerHalf(dh, b)));
- }
- #endif // !HWY_PPC_HAVE_10
- #undef HWY_PPC_IF_MULHIGH_USING_VEC_MULH
- #undef HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH
- // Multiplies even lanes (0, 2, ..) and places the double-wide result into
- // even and the upper half into its odd neighbor lane.
- template <typename T, size_t N,
- HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
- HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
- HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulEven(Vec128<T, N> a,
- Vec128<T, N> b) {
- return Vec128<MakeWide<T>, (N + 1) / 2>{vec_mule(a.raw, b.raw)};
- }
- // Multiplies odd lanes (1, 3, ..) and places the double-wide result into
- // even and the upper half into its odd neighbor lane.
- template <typename T, size_t N,
- HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
- HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
- HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(Vec128<T, N> a,
- Vec128<T, N> b) {
- return Vec128<MakeWide<T>, (N + 1) / 2>{vec_mulo(a.raw, b.raw)};
- }
- // ------------------------------ Rol/Ror
- #ifdef HWY_NATIVE_ROL_ROR_8
- #undef HWY_NATIVE_ROL_ROR_8
- #else
- #define HWY_NATIVE_ROL_ROR_8
- #endif
- #ifdef HWY_NATIVE_ROL_ROR_16
- #undef HWY_NATIVE_ROL_ROR_16
- #else
- #define HWY_NATIVE_ROL_ROR_16
- #endif
- #ifdef HWY_NATIVE_ROL_ROR_32_64
- #undef HWY_NATIVE_ROL_ROR_32_64
- #else
- #define HWY_NATIVE_ROL_ROR_32_64
- #endif
- template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
- HWY_API Vec128<T, N> Rol(Vec128<T, N> a, Vec128<T, N> b) {
- const DFromV<decltype(a)> d;
- const RebindToUnsigned<decltype(d)> du;
- return BitCast(
- d, VFromD<decltype(du)>{vec_rl(BitCast(du, a).raw, BitCast(du, b).raw)});
- }
- template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
- HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
- const DFromV<decltype(a)> d;
- const RebindToSigned<decltype(d)> di;
- return Rol(a, BitCast(d, Neg(BitCast(di, b))));
- }
- // ------------------------------ RotateRight
- template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
- HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
- const DFromV<decltype(v)> d;
- constexpr size_t kSizeInBits = sizeof(T) * 8;
- static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
- return (kBits == 0)
- ? v
- : Rol(v, Set(d, static_cast<T>(static_cast<int>(kSizeInBits) -
- kBits)));
- }
- // ------------------------------ RotateLeftSame/RotateRightSame
- #ifdef HWY_NATIVE_ROL_ROR_SAME_8
- #undef HWY_NATIVE_ROL_ROR_SAME_8
- #else
- #define HWY_NATIVE_ROL_ROR_SAME_8
- #endif
- #ifdef HWY_NATIVE_ROL_ROR_SAME_16
- #undef HWY_NATIVE_ROL_ROR_SAME_16
- #else
- #define HWY_NATIVE_ROL_ROR_SAME_16
- #endif
- #ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
- #undef HWY_NATIVE_ROL_ROR_SAME_32_64
- #else
- #define HWY_NATIVE_ROL_ROR_SAME_32_64
- #endif
- template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
- HWY_API Vec128<T, N> RotateLeftSame(Vec128<T, N> v, int bits) {
- const DFromV<decltype(v)> d;
- return Rol(v, Set(d, static_cast<T>(static_cast<unsigned>(bits))));
- }
- template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
- HWY_API Vec128<T, N> RotateRightSame(Vec128<T, N> v, int bits) {
- const DFromV<decltype(v)> d;
- return Rol(v, Set(d, static_cast<T>(0u - static_cast<unsigned>(bits))));
- }
- // ------------------------------ IfNegativeThenElse
- template <typename T, size_t N>
- HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
- Vec128<T, N> no) {
- static_assert(IsSigned<T>(), "Only works for signed/float");
- const DFromV<decltype(v)> d;
- #if HWY_PPC_HAVE_10
- const RebindToUnsigned<decltype(d)> du;
- return BitCast(
- d, VFromD<decltype(du)>{vec_blendv(
- BitCast(du, no).raw, BitCast(du, yes).raw, BitCast(du, v).raw)});
- #else
- const RebindToSigned<decltype(d)> di;
- return IfVecThenElse(BitCast(d, BroadcastSignBit(BitCast(di, v))), yes, no);
- #endif
- }
- #if HWY_PPC_HAVE_10
- #ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
- #undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
- #else
- #define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
- #endif
- #ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
- #undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
- #else
- #define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
- #endif
- template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
- HWY_API V IfNegativeThenElseZero(V v, V yes) {
- const DFromV<decltype(v)> d;
- return IfNegativeThenElse(v, yes, Zero(d));
- }
- template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
- HWY_API V IfNegativeThenZeroElse(V v, V no) {
- const DFromV<decltype(v)> d;
- return IfNegativeThenElse(v, Zero(d), no);
- }
- #endif
- // generic_ops takes care of integer T.
- template <typename T, size_t N, HWY_IF_FLOAT(T)>
- HWY_API Vec128<T, N> AbsDiff(Vec128<T, N> a, Vec128<T, N> b) {
- return Abs(a - b);
- }
- // ------------------------------ Floating-point multiply-add variants
- // Returns mul * x + add
- template <typename T, size_t N, HWY_IF_FLOAT(T)>
- HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
- Vec128<T, N> add) {
- return Vec128<T, N>{vec_madd(mul.raw, x.raw, add.raw)};
- }
- // Returns add - mul * x
- template <typename T, size_t N, HWY_IF_FLOAT(T)>
- HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
- Vec128<T, N> add) {
- // NOTE: the vec_nmsub operation below computes -(mul * x - add),
- // which is equivalent to add - mul * x in the round-to-nearest
- // and round-towards-zero rounding modes
- return Vec128<T, N>{vec_nmsub(mul.raw, x.raw, add.raw)};
- }
- // Returns mul * x - sub
- template <typename T, size_t N, HWY_IF_FLOAT(T)>
- HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
- Vec128<T, N> sub) {
- return Vec128<T, N>{vec_msub(mul.raw, x.raw, sub.raw)};
- }
- // Returns -mul * x - sub
- template <typename T, size_t N, HWY_IF_FLOAT(T)>
- HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
- Vec128<T, N> sub) {
- // NOTE: The vec_nmadd operation below computes -(mul * x + sub),
- // which is equivalent to -mul * x - sub in the round-to-nearest
- // and round-towards-zero rounding modes
- return Vec128<T, N>{vec_nmadd(mul.raw, x.raw, sub.raw)};
- }
- // ------------------------------ Floating-point div
- // Approximate reciprocal
- #ifdef HWY_NATIVE_F64_APPROX_RECIP
- #undef HWY_NATIVE_F64_APPROX_RECIP
- #else
- #define HWY_NATIVE_F64_APPROX_RECIP
- #endif
- template <typename T, size_t N, HWY_IF_FLOAT(T)>
- HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
- #if HWY_S390X_HAVE_Z14
- return Vec128<T, N>{a.raw / b.raw};
- #else
- return Vec128<T, N>{vec_div(a.raw, b.raw)};
- #endif
- }
- template <typename T, size_t N, HWY_IF_FLOAT(T)>
- HWY_API Vec128<T, N> ApproximateReciprocal(Vec128<T, N> v) {
- #if HWY_S390X_HAVE_Z14
- const DFromV<decltype(v)> d;
- return Set(d, T(1.0)) / v;
- #else
- return Vec128<T, N>{vec_re(v.raw)};
- #endif
- }
- // ------------------------------ Floating-point square root
- #if HWY_S390X_HAVE_Z14
- // Approximate reciprocal square root
- template <size_t N>
- HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
- const DFromV<decltype(v)> d;
- const RebindToUnsigned<decltype(d)> du;
- const auto half = v * Set(d, 0.5f);
- // Initial guess based on log2(f)
- const auto guess = BitCast(
- d, Set(du, uint32_t{0x5F3759DFu}) - ShiftRight<1>(BitCast(du, v)));
- // One Newton-Raphson iteration
- return guess * NegMulAdd(half * guess, guess, Set(d, 1.5f));
- }
- #else // VSX
- #ifdef HWY_NATIVE_F64_APPROX_RSQRT
- #undef HWY_NATIVE_F64_APPROX_RSQRT
- #else
- #define HWY_NATIVE_F64_APPROX_RSQRT
- #endif
- // Approximate reciprocal square root
- template <class T, size_t N, HWY_IF_FLOAT(T)>
- HWY_API Vec128<T, N> ApproximateReciprocalSqrt(Vec128<T, N> v) {
- return Vec128<T, N>{vec_rsqrte(v.raw)};
- }
- #endif // HWY_S390X_HAVE_Z14
- // Full precision square root
- template <class T, size_t N, HWY_IF_FLOAT(T)>
- HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
- return Vec128<T, N>{vec_sqrt(v.raw)};
- }
- // ------------------------------ Min (Gt, IfThenElse)
- template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
- HWY_API Vec128<T, N> Min(Vec128<T, N> a, Vec128<T, N> b) {
- return Vec128<T, N>{vec_min(a.raw, b.raw)};
- }
- // ------------------------------ Max (Gt, IfThenElse)
- template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
- HWY_API Vec128<T, N> Max(Vec128<T, N> a, Vec128<T, N> b) {
- return Vec128<T, N>{vec_max(a.raw, b.raw)};
- }
- // ------------------------------- Integer AbsDiff for PPC9/PPC10
- #if HWY_PPC_HAVE_9
- #ifdef HWY_NATIVE_INTEGER_ABS_DIFF
- #undef HWY_NATIVE_INTEGER_ABS_DIFF
- #else
- #define HWY_NATIVE_INTEGER_ABS_DIFF
- #endif
- template <class V, HWY_IF_UNSIGNED_V(V),
- HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))>
- HWY_API V AbsDiff(const V a, const V b) {
- return V{vec_absd(a.raw, b.raw)};
- }
- template <class V, HWY_IF_U64_D(DFromV<V>)>
- HWY_API V AbsDiff(const V a, const V b) {
- return Sub(Max(a, b), Min(a, b));
- }
- template <class V, HWY_IF_SIGNED_V(V)>
- HWY_API V AbsDiff(const V a, const V b) {
- return Sub(Max(a, b), Min(a, b));
- }
- #endif // HWY_PPC_HAVE_9
- // ------------------------------ Integer Div for PPC10
- #if HWY_PPC_HAVE_10
- #ifdef HWY_NATIVE_INT_DIV
- #undef HWY_NATIVE_INT_DIV
- #else
- #define HWY_NATIVE_INT_DIV
- #endif
- template <size_t N>
- HWY_API Vec128<int32_t, N> operator/(Vec128<int32_t, N> a,
- Vec128<int32_t, N> b) {
- // Inline assembly is used instead of vec_div for I32 Div on PPC10 to avoid
- // undefined behavior if b[i] == 0 or
- // (a[i] == LimitsMin<int32_t>() && b[i] == -1)
- // Clang will also optimize out I32 vec_div on PPC10 if optimizations are
- // enabled and any of the lanes of b are known to be zero (even in the unused
- // lanes of a partial vector)
- __vector signed int raw_result;
- __asm__("vdivsw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
- return Vec128<int32_t, N>{raw_result};
- }
- template <size_t N>
- HWY_API Vec128<uint32_t, N> operator/(Vec128<uint32_t, N> a,
- Vec128<uint32_t, N> b) {
- // Inline assembly is used instead of vec_div for U32 Div on PPC10 to avoid
- // undefined behavior if b[i] == 0
- // Clang will also optimize out U32 vec_div on PPC10 if optimizations are
- // enabled and any of the lanes of b are known to be zero (even in the unused
- // lanes of a partial vector)
- __vector unsigned int raw_result;
- __asm__("vdivuw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
- return Vec128<uint32_t, N>{raw_result};
- }
- template <size_t N>
- HWY_API Vec128<int64_t, N> operator/(Vec128<int64_t, N> a,
- Vec128<int64_t, N> b) {
- // Inline assembly is used instead of vec_div for I64 Div on PPC10 to avoid
- // undefined behavior if b[i] == 0 or
- // (a[i] == LimitsMin<int64_t>() && b[i] == -1)
- // Clang will also optimize out I64 vec_div on PPC10 if optimizations are
- // enabled and any of the lanes of b are known to be zero (even in the unused
- // lanes of a partial vector)
- __vector signed long long raw_result;
- __asm__("vdivsd %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
- return Vec128<int64_t, N>{raw_result};
- }
- template <size_t N>
- HWY_API Vec128<uint64_t, N> operator/(Vec128<uint64_t, N> a,
- Vec128<uint64_t, N> b) {
- // Inline assembly is used instead of vec_div for U64 Div on PPC10 to avoid
- // undefined behavior if b[i] == 0
- // Clang will also optimize out U64 vec_div on PPC10 if optimizations are
- // enabled and any of the lanes of b are known to be zero (even in the unused
- // lanes of a partial vector)
- __vector unsigned long long raw_result;
- __asm__("vdivud %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
- return Vec128<uint64_t, N>{raw_result};
- }
- template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
- HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
- HWY_API Vec128<T> operator/(Vec128<T> a, Vec128<T> b) {
- const DFromV<decltype(a)> d;
- const RepartitionToWide<decltype(d)> dw;
- return OrderedDemote2To(d, PromoteLowerTo(dw, a) / PromoteLowerTo(dw, b),
- PromoteUpperTo(dw, a) / PromoteUpperTo(dw, b));
- }
- template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
- HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
- HWY_IF_V_SIZE_LE(T, N, 8)>
- HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
- const DFromV<decltype(a)> d;
- const Rebind<MakeWide<T>, decltype(d)> dw;
- return DemoteTo(d, PromoteTo(dw, a) / PromoteTo(dw, b));
- }
- template <size_t N>
- HWY_API Vec128<int32_t, N> operator%(Vec128<int32_t, N> a,
- Vec128<int32_t, N> b) {
- // Inline assembly is used instead of vec_mod for I32 Mod on PPC10 to avoid
- // undefined behavior if b[i] == 0 or
- // (a[i] == LimitsMin<int32_t>() && b[i] == -1)
- // Clang will also optimize out I32 vec_mod on PPC10 if optimizations are
- // enabled and any of the lanes of b are known to be zero (even in the unused
- // lanes of a partial vector)
- __vector signed int raw_result;
- __asm__("vmodsw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
- return Vec128<int32_t, N>{raw_result};
- }
- template <size_t N>
- HWY_API Vec128<uint32_t, N> operator%(Vec128<uint32_t, N> a,
- Vec128<uint32_t, N> b) {
- // Inline assembly is used instead of vec_mod for U32 Mod on PPC10 to avoid
- // undefined behavior if b[i] == 0
- // Clang will also optimize out U32 vec_mod on PPC10 if optimizations are
- // enabled and any of the lanes of b are known to be zero (even in the unused
- // lanes of a partial vector)
- __vector unsigned int raw_result;
- __asm__("vmoduw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
- return Vec128<uint32_t, N>{raw_result};
- }
- template <size_t N>
- HWY_API Vec128<int64_t, N> operator%(Vec128<int64_t, N> a,
- Vec128<int64_t, N> b) {
- // Inline assembly is used instead of vec_mod for I64 Mod on PPC10 to avoid
- // undefined behavior if b[i] == 0 or
- // (a[i] == LimitsMin<int64_t>() && b[i] == -1)
- // Clang will also optimize out I64 vec_mod on PPC10 if optimizations are
- // enabled and any of the lanes of b are known to be zero (even in the unused
- // lanes of a partial vector)
- __vector signed long long raw_result;
- __asm__("vmodsd %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
- return Vec128<int64_t, N>{raw_result};
- }
- template <size_t N>
- HWY_API Vec128<uint64_t, N> operator%(Vec128<uint64_t, N> a,
- Vec128<uint64_t, N> b) {
- // Inline assembly is used instead of vec_mod for U64 Mod on PPC10 to avoid
- // undefined behavior if b[i] == 0
- // Clang will also optimize out U64 vec_mod on PPC10 if optimizations are
- // enabled and any of the lanes of b are known to be zero (even in the unused
- // lanes of a partial vector)
- __vector unsigned long long raw_result;
- __asm__("vmodud %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
- return Vec128<uint64_t, N>{raw_result};
- }
- template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
- HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
- HWY_API Vec128<T> operator%(Vec128<T> a, Vec128<T> b) {
- const DFromV<decltype(a)> d;
- const RepartitionToWide<decltype(d)> dw;
- return OrderedDemote2To(d, PromoteLowerTo(dw, a) % PromoteLowerTo(dw, b),
- PromoteUpperTo(dw, a) % PromoteUpperTo(dw, b));
- }
- template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
- HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
- HWY_IF_V_SIZE_LE(T, N, 8)>
- HWY_API Vec128<T, N> operator%(Vec128<T, N> a, Vec128<T, N> b) {
- const DFromV<decltype(a)> d;
- const Rebind<MakeWide<T>, decltype(d)> dw;
- return DemoteTo(d, PromoteTo(dw, a) % PromoteTo(dw, b));
- }
- #endif
- // ================================================== MEMORY (3)
- // ------------------------------ Non-temporal stores
- template <class D>
- HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
- __builtin_prefetch(aligned, 1, 0);
- Store(v, d, aligned);
- }
- // ------------------------------ Scatter in generic_ops-inl.h
- // ------------------------------ Gather in generic_ops-inl.h
- // ================================================== SWIZZLE (2)
- // ------------------------------ LowerHalf
- // Returns upper/lower half of a vector.
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
- HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
- return VFromD<D>{v.raw};
- }
- template <typename T, size_t N>
- HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
- return Vec128<T, N / 2>{v.raw};
- }
- // ------------------------------ ShiftLeftBytes
- // NOTE: The ShiftLeftBytes operation moves the elements of v to the right
- // by kBytes bytes and zeroes out the first kBytes bytes of v on both
- // little-endian and big-endian PPC targets
- // (same behavior as the HWY_EMU128 ShiftLeftBytes operation on both
- // little-endian and big-endian targets)
- template <int kBytes, class D>
- HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
- static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
- if (kBytes == 0) return v;
- const auto zeros = Zero(d);
- #if HWY_IS_LITTLE_ENDIAN
- return VFromD<D>{vec_sld(v.raw, zeros.raw, kBytes)};
- #else
- return VFromD<D>{vec_sld(zeros.raw, v.raw, (-kBytes) & 15)};
- #endif
- }
- template <int kBytes, typename T, size_t N>
- HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
- return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
- }
- // ------------------------------ ShiftLeftLanes
- // NOTE: The ShiftLeftLanes operation moves the elements of v to the right
- // by kLanes lanes and zeroes out the first kLanes lanes of v on both
- // little-endian and big-endian PPC targets
- // (same behavior as the HWY_EMU128 ShiftLeftLanes operation on both
- // little-endian and big-endian targets)
- template <int kLanes, class D, typename T = TFromD<D>>
- HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
- const Repartition<uint8_t, decltype(d)> d8;
- return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
- }
- template <int kLanes, typename T, size_t N>
- HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
- return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
- }
- // ------------------------------ ShiftRightBytes
- // NOTE: The ShiftRightBytes operation moves the elements of v to the left
- // by kBytes bytes and zeroes out the last kBytes bytes of v on both
- // little-endian and big-endian PPC targets
- // (same behavior as the HWY_EMU128 ShiftRightBytes operation on both
- // little-endian and big-endian targets)
- template <int kBytes, class D>
- HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
- static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
- if (kBytes == 0) return v;
- // For partial vectors, clear upper lanes so we shift in zeros.
- if (d.MaxBytes() != 16) {
- const Full128<TFromD<D>> dfull;
- VFromD<decltype(dfull)> vfull{v.raw};
- v = VFromD<D>{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw};
- }
- const auto zeros = Zero(d);
- #if HWY_IS_LITTLE_ENDIAN
- return VFromD<D>{vec_sld(zeros.raw, v.raw, (-kBytes) & 15)};
- #else
- return VFromD<D>{vec_sld(v.raw, zeros.raw, kBytes)};
- #endif
- }
- // ------------------------------ ShiftRightLanes
- // NOTE: The ShiftRightLanes operation moves the elements of v to the left
- // by kLanes lanes and zeroes out the last kLanes lanes of v on both
- // little-endian and big-endian PPC targets
- // (same behavior as the HWY_EMU128 ShiftRightLanes operation on both
- // little-endian and big-endian targets)
- template <int kLanes, class D>
- HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
- const Repartition<uint8_t, decltype(d)> d8;
- constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
- return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
- }
- // ------------------------------ UpperHalf (ShiftRightBytes)
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
- HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
- return LowerHalf(d, ShiftRightBytes<d.MaxBytes()>(Twice<D>(), v));
- }
- // ------------------------------ ExtractLane
- template <typename T, size_t N>
- HWY_API T ExtractLane(Vec128<T, N> v, size_t i) {
- return static_cast<T>(v.raw[i]);
- }
- // ------------------------------ InsertLane
- template <typename T, size_t N>
- HWY_API Vec128<T, N> InsertLane(Vec128<T, N> v, size_t i, T t) {
- #if HWY_IS_LITTLE_ENDIAN
- typename detail::Raw128<T>::type raw_result = v.raw;
- raw_result[i] = BitCastScalar<typename detail::Raw128<T>::RawT>(t);
- return Vec128<T, N>{raw_result};
- #else
- // On ppc64be without this, mul_test fails, but swizzle_test passes.
- DFromV<decltype(v)> d;
- alignas(16) T lanes[16 / sizeof(T)];
- Store(v, d, lanes);
- lanes[i] = t;
- return Load(d, lanes);
- #endif
- }
- // ------------------------------ CombineShiftRightBytes
- // NOTE: The CombineShiftRightBytes operation below moves the elements of lo to
- // the left by kBytes bytes and moves the elements of hi right by (d.MaxBytes()
- // - kBytes) bytes on both little-endian and big-endian PPC targets.
- template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
- HWY_API Vec128<T> CombineShiftRightBytes(D /*d*/, Vec128<T> hi, Vec128<T> lo) {
- constexpr size_t kSize = 16;
- static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
- #if HWY_IS_LITTLE_ENDIAN
- return Vec128<T>{vec_sld(hi.raw, lo.raw, (-kBytes) & 15)};
- #else
- return Vec128<T>{vec_sld(lo.raw, hi.raw, kBytes)};
- #endif
- }
- template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
- HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
- constexpr size_t kSize = d.MaxBytes();
- static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
- const Repartition<uint8_t, decltype(d)> d8;
- using V8 = Vec128<uint8_t>;
- const DFromV<V8> dfull8;
- const Repartition<TFromD<D>, decltype(dfull8)> dfull;
- const V8 hi8{BitCast(d8, hi).raw};
- // Move into most-significant bytes
- const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
- const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(dfull8, hi8, lo8);
- return VFromD<D>{BitCast(dfull, r).raw};
- }
- // ------------------------------ Broadcast/splat any lane
- template <int kLane, typename T, size_t N>
- HWY_API Vec128<T, N> Broadcast(Vec128<T, N> v) {
- static_assert(0 <= kLane && kLane < N, "Invalid lane");
- return Vec128<T, N>{vec_splat(v.raw, kLane)};
- }
- // ------------------------------ TableLookupLanes (Shuffle01)
- // Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
- template <typename T, size_t N = 16 / sizeof(T)>
- struct Indices128 {
- __vector unsigned char raw;
- };
- namespace detail {
- template <class D, HWY_IF_T_SIZE_D(D, 1)>
- HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
- D d) {
- const Repartition<uint8_t, decltype(d)> d8;
- return Iota(d8, 0);
- }
- template <class D, HWY_IF_T_SIZE_D(D, 2)>
- HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
- D d) {
- const Repartition<uint8_t, decltype(d)> d8;
- #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- constexpr __vector unsigned char kBroadcastLaneBytes = {
- 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
- #else
- constexpr __vector unsigned char kBroadcastLaneBytes = {
- 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
- #endif
- return VFromD<decltype(d8)>{kBroadcastLaneBytes};
- }
- template <class D, HWY_IF_T_SIZE_D(D, 4)>
- HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
- D d) {
- const Repartition<uint8_t, decltype(d)> d8;
- #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- constexpr __vector unsigned char kBroadcastLaneBytes = {
- 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
- #else
- constexpr __vector unsigned char kBroadcastLaneBytes = {
- 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15};
- #endif
- return VFromD<decltype(d8)>{kBroadcastLaneBytes};
- }
- template <class D, HWY_IF_T_SIZE_D(D, 8)>
- HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
- D d) {
- const Repartition<uint8_t, decltype(d)> d8;
- #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- constexpr __vector unsigned char kBroadcastLaneBytes = {
- 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
- #else
- constexpr __vector unsigned char kBroadcastLaneBytes = {
- 7, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15};
- #endif
- return VFromD<decltype(d8)>{kBroadcastLaneBytes};
- }
- template <class D, HWY_IF_T_SIZE_D(D, 1)>
- HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
- const Repartition<uint8_t, decltype(d)> d8;
- return Zero(d8);
- }
- template <class D, HWY_IF_T_SIZE_D(D, 2)>
- HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
- const Repartition<uint8_t, decltype(d)> d8;
- constexpr __vector unsigned char kByteOffsets = {0, 1, 0, 1, 0, 1, 0, 1,
- 0, 1, 0, 1, 0, 1, 0, 1};
- return VFromD<decltype(d8)>{kByteOffsets};
- }
- template <class D, HWY_IF_T_SIZE_D(D, 4)>
- HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
- const Repartition<uint8_t, decltype(d)> d8;
- constexpr __vector unsigned char kByteOffsets = {0, 1, 2, 3, 0, 1, 2, 3,
- 0, 1, 2, 3, 0, 1, 2, 3};
- return VFromD<decltype(d8)>{kByteOffsets};
- }
- template <class D, HWY_IF_T_SIZE_D(D, 8)>
- HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
- const Repartition<uint8_t, decltype(d)> d8;
- constexpr __vector unsigned char kByteOffsets = {0, 1, 2, 3, 4, 5, 6, 7,
- 0, 1, 2, 3, 4, 5, 6, 7};
- return VFromD<decltype(d8)>{kByteOffsets};
- }
- } // namespace detail
- template <class D, typename TI, HWY_IF_T_SIZE_D(D, 1)>
- HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
- D d, Vec128<TI, MaxLanes(D())> vec) {
- using T = TFromD<D>;
- static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
- #if HWY_IS_DEBUG_BUILD
- const RebindToUnsigned<decltype(d)> du;
- using TU = TFromD<decltype(du)>;
- HWY_DASSERT(AllTrue(
- du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
- #endif
- const Repartition<uint8_t, decltype(d)> d8;
- return Indices128<TFromD<D>, MaxLanes(D())>{BitCast(d8, vec).raw};
- }
- template <class D, typename TI,
- HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
- HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
- D d, Vec128<TI, MaxLanes(D())> vec) {
- using T = TFromD<D>;
- static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
- #if HWY_IS_DEBUG_BUILD
- const RebindToUnsigned<decltype(d)> du;
- using TU = TFromD<decltype(du)>;
- HWY_DASSERT(AllTrue(
- du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
- #endif
- const Repartition<uint8_t, decltype(d)> d8;
- using V8 = VFromD<decltype(d8)>;
- // Broadcast each lane index to all bytes of T and shift to bytes
- const V8 lane_indices = TableLookupBytes(
- BitCast(d8, vec), detail::IndicesFromVecBroadcastLaneBytes(d));
- constexpr int kIndexShiftAmt = static_cast<int>(FloorLog2(sizeof(T)));
- const V8 byte_indices = ShiftLeft<kIndexShiftAmt>(lane_indices);
- const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d));
- return Indices128<TFromD<D>, MaxLanes(D())>{sum.raw};
- }
- template <class D, typename TI>
- HWY_API Indices128<TFromD<D>, HWY_MAX_LANES_D(D)> SetTableIndices(
- D d, const TI* idx) {
- const Rebind<TI, decltype(d)> di;
- return IndicesFromVec(d, LoadU(di, idx));
- }
- template <typename T, size_t N>
- HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
- const DFromV<decltype(v)> d;
- const Repartition<uint8_t, decltype(d)> d8;
- return BitCast(d, TableLookupBytes(v, VFromD<decltype(d8)>{idx.raw}));
- }
- // Single lane: no change
- template <typename T>
- HWY_API Vec128<T, 1> TableLookupLanes(Vec128<T, 1> v,
- Indices128<T, 1> /* idx */) {
- return v;
- }
- template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
- HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b,
- Indices128<T, N> idx) {
- const DFromV<decltype(a)> d;
- const Twice<decltype(d)> dt;
- const Repartition<uint8_t, decltype(dt)> dt_u8;
- // TableLookupLanes currently requires table and index vectors to be the same
- // size, though a half-length index vector would be sufficient here.
- #if HWY_IS_MSAN
- const Vec128<T, N> idx_vec{idx.raw};
- const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw};
- #else
- // We only keep LowerHalf of the result, which is valid in idx.
- const Indices128<T, N * 2> idx2{idx.raw};
- #endif
- return LowerHalf(
- d, TableLookupBytes(Combine(dt, b, a),
- BitCast(dt, VFromD<decltype(dt_u8)>{idx2.raw})));
- }
- template <typename T>
- HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
- Indices128<T> idx) {
- return Vec128<T>{vec_perm(a.raw, b.raw, idx.raw)};
- }
- // ------------------------------ ReverseBlocks
- // Single block: no change
- template <class D>
- HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
- return v;
- }
- // ------------------------------ Reverse (Shuffle0123, Shuffle2301)
- // Single lane: no change
- template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
- HWY_API Vec128<T, 1> Reverse(D /* tag */, Vec128<T, 1> v) {
- return v;
- }
- // 32-bit x2: shuffle
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
- HWY_API Vec64<T> Reverse(D /* tag */, Vec64<T> v) {
- return Vec64<T>{Shuffle2301(Vec128<T>{v.raw}).raw};
- }
- // 16-bit x4: shuffle
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
- HWY_API Vec64<T> Reverse(D /* tag */, Vec64<T> v) {
- const __vector unsigned char kShuffle = {6, 7, 4, 5, 2, 3, 0, 1,
- 14, 15, 12, 13, 10, 11, 8, 9};
- return Vec64<T>{vec_perm(v.raw, v.raw, kShuffle)};
- }
- // 16-bit x2: rotate bytes
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
- HWY_API Vec32<T> Reverse(D d, Vec32<T> v) {
- const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
- return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
- }
- // ------------------------------- ReverseLaneBytes
- #if (HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14) && \
- (HWY_COMPILER_GCC_ACTUAL >= 710 || HWY_COMPILER_CLANG >= 400)
- // Per-target flag to prevent generic_ops-inl.h defining 8-bit ReverseLaneBytes.
- #ifdef HWY_NATIVE_REVERSE_LANE_BYTES
- #undef HWY_NATIVE_REVERSE_LANE_BYTES
- #else
- #define HWY_NATIVE_REVERSE_LANE_BYTES
- #endif
- template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
- HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8))>
- HWY_API V ReverseLaneBytes(V v) {
- return V{vec_revb(v.raw)};
- }
- // Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8.
- #ifdef HWY_NATIVE_REVERSE2_8
- #undef HWY_NATIVE_REVERSE2_8
- #else
- #define HWY_NATIVE_REVERSE2_8
- #endif
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
- HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
- const Repartition<uint16_t, decltype(d)> du16;
- return BitCast(d, ReverseLaneBytes(BitCast(du16, v)));
- }
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
- HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
- const Repartition<uint32_t, decltype(d)> du32;
- return BitCast(d, ReverseLaneBytes(BitCast(du32, v)));
- }
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
- HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
- const Repartition<uint64_t, decltype(d)> du64;
- return BitCast(d, ReverseLaneBytes(BitCast(du64, v)));
- }
- #endif // HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
- HWY_API Vec16<T> Reverse(D d, Vec16<T> v) {
- return Reverse2(d, v);
- }
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
- HWY_API Vec32<T> Reverse(D d, Vec32<T> v) {
- return Reverse4(d, v);
- }
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
- HWY_API Vec64<T> Reverse(D d, Vec64<T> v) {
- return Reverse8(d, v);
- }
- // ------------------------------ Reverse2
- // Single lane: no change
- template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
- HWY_API Vec128<T, 1> Reverse2(D /* tag */, Vec128<T, 1> v) {
- return v;
- }
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
- HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
- const Repartition<uint32_t, decltype(d)> du32;
- return BitCast(d, RotateRight<16>(BitCast(du32, v)));
- }
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
- HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
- const Repartition<uint64_t, decltype(d)> du64;
- return BitCast(d, RotateRight<32>(BitCast(du64, v)));
- }
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
- HWY_API VFromD<D> Reverse2(D /* tag */, VFromD<D> v) {
- return Shuffle01(v);
- }
- // ------------------------------ Reverse4
- template <class D, HWY_IF_T_SIZE_D(D, 2)>
- HWY_API VFromD<D> Reverse4(D /*d*/, VFromD<D> v) {
- const __vector unsigned char kShuffle = {6, 7, 4, 5, 2, 3, 0, 1,
- 14, 15, 12, 13, 10, 11, 8, 9};
- return VFromD<D>{vec_perm(v.raw, v.raw, kShuffle)};
- }
- template <class D, HWY_IF_T_SIZE_D(D, 4)>
- HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
- return Reverse(d, v);
- }
- template <class D, HWY_IF_T_SIZE_D(D, 8)>
- HWY_API VFromD<D> Reverse4(D /* tag */, VFromD<D> /* v */) {
- HWY_ASSERT(0); // don't have 4 u64 lanes
- }
- // ------------------------------ Reverse8
- template <class D, HWY_IF_T_SIZE_D(D, 2)>
- HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
- return Reverse(d, v);
- }
- template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
- HWY_API VFromD<D> Reverse8(D /* tag */, VFromD<D> /* v */) {
- HWY_ASSERT(0); // don't have 8 lanes if larger than 16-bit
- }
- // ------------------------------ InterleaveLower
- // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
- // the least-significant lane) and "b". To concatenate two half-width integers
- // into one, use ZipLower/Upper instead (also works with scalar).
- template <typename T, size_t N>
- HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
- return Vec128<T, N>{vec_mergeh(a.raw, b.raw)};
- }
- // Additional overload for the optional tag
- template <class D>
- HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) {
- return InterleaveLower(a, b);
- }
- // ------------------------------ InterleaveUpper (UpperHalf)
- // Full
- template <class D, typename T = TFromD<D>>
- HWY_API Vec128<T> InterleaveUpper(D /* tag */, Vec128<T> a, Vec128<T> b) {
- return Vec128<T>{vec_mergel(a.raw, b.raw)};
- }
- // Partial
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
- HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
- const Half<decltype(d)> d2;
- return InterleaveLower(d, VFromD<D>{UpperHalf(d2, a).raw},
- VFromD<D>{UpperHalf(d2, b).raw});
- }
- // ------------------------------ ZipLower/ZipUpper (InterleaveLower)
- // Same as Interleave*, except that the return lanes are double-width integers;
- // this is necessary because the single-lane scalar cannot return two values.
- template <class V, class DW = RepartitionToWide<DFromV<V>>>
- HWY_API VFromD<DW> ZipLower(V a, V b) {
- return BitCast(DW(), InterleaveLower(a, b));
- }
- template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
- HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
- return BitCast(dw, InterleaveLower(D(), a, b));
- }
- template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
- HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
- return BitCast(dw, InterleaveUpper(D(), a, b));
- }
- // ------------------------------ Per4LaneBlkShufDupSet4xU32
- // Used by hwy/ops/generic_ops-inl.h to implement Per4LaneBlockShuffle
- namespace detail {
- #ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
- #undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
- #else
- #define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
- #endif
- template <class D>
- HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
- const uint32_t x2,
- const uint32_t x1,
- const uint32_t x0) {
- const __vector unsigned int raw = {x0, x1, x2, x3};
- return ResizeBitCast(d, Vec128<uint32_t>{raw});
- }
- } // namespace detail
- // ------------------------------ SlideUpLanes
- template <class D>
- HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
- const Repartition<uint8_t, decltype(d)> du8;
- using VU8 = VFromD<decltype(du8)>;
- const auto v_shift_amt =
- BitCast(Full128<uint8_t>(),
- Set(Full128<uint32_t>(),
- static_cast<uint32_t>(amt * sizeof(TFromD<D>) * 8)));
- #if HWY_S390X_HAVE_Z14
- return BitCast(d, VU8{vec_srb(BitCast(du8, v).raw, v_shift_amt.raw)});
- #else // VSX
- #if HWY_IS_LITTLE_ENDIAN
- return BitCast(d, VU8{vec_slo(BitCast(du8, v).raw, v_shift_amt.raw)});
- #else
- return BitCast(d, VU8{vec_sro(BitCast(du8, v).raw, v_shift_amt.raw)});
- #endif // HWY_IS_LITTLE_ENDIAN
- #endif // HWY_S390X_HAVE_Z14
- }
- // ------------------------------ SlideDownLanes
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
- HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
- using TU = UnsignedFromSize<d.MaxBytes()>;
- const Repartition<TU, decltype(d)> du;
- const auto v_shift_amt =
- Set(du, static_cast<TU>(amt * sizeof(TFromD<D>) * 8));
- #if HWY_IS_LITTLE_ENDIAN
- return BitCast(d, BitCast(du, v) >> v_shift_amt);
- #else
- return BitCast(d, BitCast(du, v) << v_shift_amt);
- #endif
- }
- template <class D, HWY_IF_V_SIZE_D(D, 16)>
- HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
- const Repartition<uint8_t, decltype(d)> du8;
- using VU8 = VFromD<decltype(du8)>;
- const auto v_shift_amt =
- BitCast(Full128<uint8_t>(),
- Set(Full128<uint32_t>(),
- static_cast<uint32_t>(amt * sizeof(TFromD<D>) * 8)));
- #if HWY_S390X_HAVE_Z14
- return BitCast(d, VU8{vec_slb(BitCast(du8, v).raw, v_shift_amt.raw)});
- #else // VSX
- #if HWY_IS_LITTLE_ENDIAN
- return BitCast(d, VU8{vec_sro(BitCast(du8, v).raw, v_shift_amt.raw)});
- #else
- return BitCast(d, VU8{vec_slo(BitCast(du8, v).raw, v_shift_amt.raw)});
- #endif // HWY_IS_LITTLE_ENDIAN
- #endif // HWY_S390X_HAVE_Z14
- }
- // ================================================== COMBINE
- // ------------------------------ Combine (InterleaveLower)
- // N = N/2 + N/2 (upper half undefined)
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class VH = VFromD<Half<D>>>
- HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
- const Half<decltype(d)> dh;
- // Treat half-width input as one lane, and expand to two lanes.
- using VU = Vec128<UnsignedFromSize<dh.MaxBytes()>, 2>;
- using Raw = typename detail::Raw128<TFromV<VU>>::type;
- const VU lo{reinterpret_cast<Raw>(lo_half.raw)};
- const VU hi{reinterpret_cast<Raw>(hi_half.raw)};
- return BitCast(d, InterleaveLower(lo, hi));
- }
- // ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
- template <class D>
- HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
- const Half<D> dh;
- return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD<D>{lo.raw});
- }
- // ------------------------------ Concat full (InterleaveLower)
- // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
- template <class D, typename T = TFromD<D>>
- HWY_API Vec128<T> ConcatLowerLower(D d, Vec128<T> hi, Vec128<T> lo) {
- const Repartition<uint64_t, decltype(d)> d64;
- return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi)));
- }
- // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
- template <class D, typename T = TFromD<D>>
- HWY_API Vec128<T> ConcatUpperUpper(D d, Vec128<T> hi, Vec128<T> lo) {
- const Repartition<uint64_t, decltype(d)> d64;
- return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi)));
- }
- // hiH,hiL loH,loL |-> hiL,loH (= inner halves)
- template <class D, typename T = TFromD<D>>
- HWY_API Vec128<T> ConcatLowerUpper(D d, Vec128<T> hi, Vec128<T> lo) {
- return CombineShiftRightBytes<8>(d, hi, lo);
- }
- // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
- template <class D, typename T = TFromD<D>>
- HWY_API Vec128<T> ConcatUpperLower(D /*d*/, Vec128<T> hi, Vec128<T> lo) {
- const __vector unsigned char kShuffle = {0, 1, 2, 3, 4, 5, 6, 7,
- 24, 25, 26, 27, 28, 29, 30, 31};
- return Vec128<T>{vec_perm(lo.raw, hi.raw, kShuffle)};
- }
- // ------------------------------ Concat partial (Combine, LowerHalf)
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
- HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
- const Half<decltype(d)> d2;
- return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
- }
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
- HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
- const Half<decltype(d)> d2;
- return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
- }
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
- HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
- const Half<decltype(d)> d2;
- return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
- }
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
- HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
- const Half<decltype(d)> d2;
- return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo));
- }
- // ------------------------------ TruncateTo
- template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
- hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 2)>* = nullptr,
- HWY_IF_LANES_D(D, 1)>
- HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<FromT, 1> v) {
- using Raw = typename detail::Raw128<TFromD<D>>::type;
- #if HWY_IS_LITTLE_ENDIAN
- return VFromD<D>{reinterpret_cast<Raw>(v.raw)};
- #else
- return VFromD<D>{reinterpret_cast<Raw>(
- vec_sld(v.raw, v.raw, sizeof(FromT) - sizeof(TFromD<D>)))};
- #endif
- }
- namespace detail {
- template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
- HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2), HWY_IF_LANES_GT_D(D, 1)>
- HWY_API VFromD<D> Truncate2To(
- D /* tag */, Vec128<FromT, Repartition<FromT, D>().MaxLanes()> lo,
- Vec128<FromT, Repartition<FromT, D>().MaxLanes()> hi) {
- return VFromD<D>{vec_pack(lo.raw, hi.raw)};
- }
- } // namespace detail
- template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
- HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2), HWY_IF_LANES_GT_D(D, 1)>
- HWY_API VFromD<D> TruncateTo(D /* d */,
- Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
- return VFromD<D>{vec_pack(v.raw, v.raw)};
- }
- template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
- hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr,
- HWY_IF_LANES_GT_D(D, 1)>
- HWY_API VFromD<D> TruncateTo(D d,
- Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
- const Rebind<MakeNarrow<FromT>, decltype(d)> d2;
- return TruncateTo(d, TruncateTo(d2, v));
- }
- // ------------------------------ ConcatOdd (TruncateTo)
- // 8-bit full
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
- HWY_API Vec128<T> ConcatOdd(D d, Vec128<T> hi, Vec128<T> lo) {
- const Repartition<uint16_t, decltype(d)> dw;
- const RebindToUnsigned<decltype(d)> du;
- #if HWY_IS_LITTLE_ENDIAN
- // Right-shift 8 bits per u16 so we can pack.
- const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
- const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
- #else
- const Vec128<uint16_t> uH = BitCast(dw, hi);
- const Vec128<uint16_t> uL = BitCast(dw, lo);
- #endif
- return BitCast(d, detail::Truncate2To(du, uL, uH));
- }
- // 8-bit x8
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
- HWY_API Vec64<T> ConcatOdd(D /*d*/, Vec64<T> hi, Vec64<T> lo) {
- // Don't care about upper half, no need to zero.
- const __vector unsigned char kCompactOddU8 = {1, 3, 5, 7, 17, 19, 21, 23};
- return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactOddU8)};
- }
- // 8-bit x4
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
- HWY_API Vec32<T> ConcatOdd(D /*d*/, Vec32<T> hi, Vec32<T> lo) {
- // Don't care about upper half, no need to zero.
- const __vector unsigned char kCompactOddU8 = {1, 3, 17, 19};
- return Vec32<T>{vec_perm(lo.raw, hi.raw, kCompactOddU8)};
- }
- // 16-bit full
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
- HWY_API Vec128<T> ConcatOdd(D d, Vec128<T> hi, Vec128<T> lo) {
- const Repartition<uint32_t, decltype(d)> dw;
- const RebindToUnsigned<decltype(d)> du;
- #if HWY_IS_LITTLE_ENDIAN
- const Vec128<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
- const Vec128<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
- #else
- const Vec128<uint32_t> uH = BitCast(dw, hi);
- const Vec128<uint32_t> uL = BitCast(dw, lo);
- #endif
- return BitCast(d, detail::Truncate2To(du, uL, uH));
- }
- // 16-bit x4
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
- HWY_API Vec64<T> ConcatOdd(D /*d*/, Vec64<T> hi, Vec64<T> lo) {
- // Don't care about upper half, no need to zero.
- const __vector unsigned char kCompactOddU16 = {2, 3, 6, 7, 18, 19, 22, 23};
- return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactOddU16)};
- }
- // 32-bit full
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
- HWY_API Vec128<T> ConcatOdd(D d, Vec128<T> hi, Vec128<T> lo) {
- #if HWY_IS_LITTLE_ENDIAN
- (void)d;
- const __vector unsigned char kShuffle = {4, 5, 6, 7, 12, 13, 14, 15,
- 20, 21, 22, 23, 28, 29, 30, 31};
- return Vec128<T>{vec_perm(lo.raw, hi.raw, kShuffle)};
- #else
- const RebindToUnsigned<decltype(d)> du;
- const Repartition<uint64_t, decltype(d)> dw;
- return BitCast(d, detail::Truncate2To(du, BitCast(dw, lo), BitCast(dw, hi)));
- #endif
- }
- // Any type x2
- template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)>
- HWY_API Vec128<T, 2> ConcatOdd(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) {
- return InterleaveUpper(d, lo, hi);
- }
- // ------------------------------ ConcatEven (TruncateTo)
- // 8-bit full
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
- HWY_API Vec128<T> ConcatEven(D d, Vec128<T> hi, Vec128<T> lo) {
- const Repartition<uint16_t, decltype(d)> dw;
- const RebindToUnsigned<decltype(d)> du;
- #if HWY_IS_LITTLE_ENDIAN
- const Vec128<uint16_t> uH = BitCast(dw, hi);
- const Vec128<uint16_t> uL = BitCast(dw, lo);
- #else
- // Right-shift 8 bits per u16 so we can pack.
- const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
- const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
- #endif
- return BitCast(d, detail::Truncate2To(du, uL, uH));
- }
- // 8-bit x8
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
- HWY_API Vec64<T> ConcatEven(D /*d*/, Vec64<T> hi, Vec64<T> lo) {
- // Don't care about upper half, no need to zero.
- const __vector unsigned char kCompactEvenU8 = {0, 2, 4, 6, 16, 18, 20, 22};
- return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactEvenU8)};
- }
- // 8-bit x4
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
- HWY_API Vec32<T> ConcatEven(D /*d*/, Vec32<T> hi, Vec32<T> lo) {
- // Don't care about upper half, no need to zero.
- const __vector unsigned char kCompactEvenU8 = {0, 2, 16, 18};
- return Vec32<T>{vec_perm(lo.raw, hi.raw, kCompactEvenU8)};
- }
- // 16-bit full
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
- HWY_API Vec128<T> ConcatEven(D d, Vec128<T> hi, Vec128<T> lo) {
- // Isolate lower 16 bits per u32 so we can pack.
- const Repartition<uint32_t, decltype(d)> dw;
- const RebindToUnsigned<decltype(d)> du;
- #if HWY_IS_LITTLE_ENDIAN
- const Vec128<uint32_t> uH = BitCast(dw, hi);
- const Vec128<uint32_t> uL = BitCast(dw, lo);
- #else
- const Vec128<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
- const Vec128<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
- #endif
- return BitCast(d, detail::Truncate2To(du, uL, uH));
- }
- // 16-bit x4
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
- HWY_API Vec64<T> ConcatEven(D /*d*/, Vec64<T> hi, Vec64<T> lo) {
- // Don't care about upper half, no need to zero.
- const __vector unsigned char kCompactEvenU16 = {0, 1, 4, 5, 16, 17, 20, 21};
- return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactEvenU16)};
- }
- // 32-bit full
- template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
- HWY_API Vec128<T> ConcatEven(D d, Vec128<T> hi, Vec128<T> lo) {
- #if HWY_IS_LITTLE_ENDIAN
- const Repartition<uint64_t, decltype(d)> dw;
- const RebindToUnsigned<decltype(d)> du;
- return BitCast(d, detail::Truncate2To(du, BitCast(dw, lo), BitCast(dw, hi)));
- #else
- (void)d;
- constexpr __vector unsigned char kShuffle = {0, 1, 2, 3, 8, 9, 10, 11,
- 16, 17, 18, 19, 24, 25, 26, 27};
- return Vec128<T>{vec_perm(lo.raw, hi.raw, kShuffle)};
- #endif
- }
- // Any T x2
- template <typename D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)>
- HWY_API Vec128<T, 2> ConcatEven(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) {
- return InterleaveLower(d, lo, hi);
- }
- // ------------------------------ OrderedTruncate2To (ConcatEven, ConcatOdd)
- #ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
- #undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
- #else
- #define HWY_NATIVE_ORDERED_TRUNCATE_2_TO
- #endif
- template <class D, HWY_IF_UNSIGNED_D(D), class V, HWY_IF_UNSIGNED_V(V),
- HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
- HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
- HWY_API VFromD<D> OrderedTruncate2To(D d, V a, V b) {
- #if HWY_IS_LITTLE_ENDIAN
- return ConcatEven(d, BitCast(d, b), BitCast(d, a));
- #else
- return ConcatOdd(d, BitCast(d, b), BitCast(d, a));
- #endif
- }
- // ------------------------------ DupEven (InterleaveLower)
- template <typename T>
- HWY_API Vec128<T, 1> DupEven(Vec128<T, 1> v) {
- return v;
- }
- template <typename T>
- HWY_API Vec128<T, 2> DupEven(Vec128<T, 2> v) {
- return InterleaveLower(DFromV<decltype(v)>(), v, v);
- }
- template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
- HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
- const DFromV<decltype(v)> d;
- const Repartition<uint8_t, decltype(d)> du8;
- constexpr __vector unsigned char kShuffle = {0, 0, 2, 2, 4, 4, 6, 6,
- 8, 8, 10, 10, 12, 12, 14, 14};
- return TableLookupBytes(v, BitCast(d, VFromD<decltype(du8)>{kShuffle}));
- }
- template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
- HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
- const DFromV<decltype(v)> d;
- const Repartition<uint8_t, decltype(d)> du8;
- constexpr __vector unsigned char kShuffle = {0, 1, 0, 1, 4, 5, 4, 5,
- 8, 9, 8, 9, 12, 13, 12, 13};
- return TableLookupBytes(v, BitCast(d, VFromD<decltype(du8)>{kShuffle}));
- }
- template <typename T, HWY_IF_T_SIZE(T, 4)>
- HWY_API Vec128<T> DupEven(Vec128<T> v) {
- #if HWY_S390X_HAVE_Z14
- const DFromV<decltype(v)> d;
- const Repartition<uint8_t, decltype(d)> du8;
- return TableLookupBytes(
- v, BitCast(d, Dup128VecFromValues(du8, 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10,
- 11, 8, 9, 10, 11)));
- #else
- return Vec128<T>{vec_mergee(v.raw, v.raw)};
- #endif
- }
- // ------------------------------ DupOdd (InterleaveUpper)
- template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
- HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
- const DFromV<decltype(v)> d;
- const Repartition<uint8_t, decltype(d)> du8;
- constexpr __vector unsigned char kShuffle = {1, 1, 3, 3, 5, 5, 7, 7,
- 9, 9, 11, 11, 13, 13, 15, 15};
- return TableLookupBytes(v, BitCast(d, VFromD<decltype(du8)>{kShuffle}));
- }
- template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
- HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
- const DFromV<decltype(v)> d;
- const Repartition<uint8_t, decltype(d)> du8;
- constexpr __vector unsigned char kShuffle = {2, 3, 2, 3, 6, 7, 6, 7,
- 10, 11, 10, 11, 14, 15, 14, 15};
- return TableLookupBytes(v, BitCast(d, VFromD<decltype(du8)>{kShuffle}));
- }
- template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
- HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
- #if HWY_S390X_HAVE_Z14
- const DFromV<decltype(v)> d;
- const Repartition<uint8_t, decltype(d)> du8;
- return TableLookupBytes(
- v, BitCast(d, Dup128VecFromValues(du8, 4, 5, 6, 7, 4, 5, 6, 7, 12, 13, 14,
- 15, 12, 13, 14, 15)));
- #else
- return Vec128<T, N>{vec_mergeo(v.raw, v.raw)};
- #endif
- }
- template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
- HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
- return InterleaveUpper(DFromV<decltype(v)>(), v, v);
- }
- // ------------------------------ OddEven (IfThenElse)
- template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
- HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
- const DFromV<decltype(a)> d;
- const __vector unsigned char mask = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
- 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
- return IfVecThenElse(BitCast(d, Vec128<uint8_t, N>{mask}), b, a);
- }
- template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
- HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
- const DFromV<decltype(a)> d;
- const __vector unsigned char mask = {0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
- 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};
- return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 2>{mask}), b, a);
- }
- template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
- HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
- const DFromV<decltype(a)> d;
- const __vector unsigned char mask = {0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0,
- 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0};
- return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 4>{mask}), b, a);
- }
- template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
- HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
- // Same as ConcatUpperLower for full vectors; do not call that because this
- // is more efficient for 64x1 vectors.
- const DFromV<decltype(a)> d;
- const __vector unsigned char mask = {
- 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0};
- return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 8>{mask}), b, a);
- }
- // ------------------------------ InterleaveEven
- template <class D, HWY_IF_T_SIZE_D(D, 1)>
- HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
- const Full128<TFromD<D>> d_full;
- const Indices128<TFromD<D>> idx{
- Dup128VecFromValues(Full128<uint8_t>(), 0, 16, 2, 18, 4, 20, 6, 22, 8, 24,
- 10, 26, 12, 28, 14, 30)
- .raw};
- return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
- ResizeBitCast(d_full, b), idx));
- }
- template <class D, HWY_IF_T_SIZE_D(D, 2)>
- HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
- const Full128<TFromD<D>> d_full;
- const Indices128<TFromD<D>> idx{Dup128VecFromValues(Full128<uint8_t>(), 0, 1,
- 16, 17, 4, 5, 20, 21, 8,
- 9, 24, 25, 12, 13, 28, 29)
- .raw};
- return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
- ResizeBitCast(d_full, b), idx));
- }
- template <class D, HWY_IF_T_SIZE_D(D, 4)>
- HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
- #if HWY_S390X_HAVE_Z14
- const Full128<TFromD<D>> d_full;
- const Indices128<TFromD<D>> idx{Dup128VecFromValues(Full128<uint8_t>(), 0, 1,
- 2, 3, 16, 17, 18, 19, 8,
- 9, 10, 11, 24, 25, 26, 27)
- .raw};
- return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
- ResizeBitCast(d_full, b), idx));
- #else
- (void)d;
- return VFromD<D>{vec_mergee(a.raw, b.raw)};
- #endif
- }
- template <class D, HWY_IF_T_SIZE_D(D, 8)>
- HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
- return InterleaveLower(a, b);
- }
- // ------------------------------ InterleaveOdd
- template <class D, HWY_IF_T_SIZE_D(D, 1)>
- HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
- const Full128<TFromD<D>> d_full;
- const Indices128<TFromD<D>> idx{
- Dup128VecFromValues(Full128<uint8_t>(), 1, 17, 3, 19, 5, 21, 7, 23, 9, 25,
- 11, 27, 13, 29, 15, 31)
- .raw};
- return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
- ResizeBitCast(d_full, b), idx));
- }
- template <class D, HWY_IF_T_SIZE_D(D, 2)>
- HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
- const Full128<TFromD<D>> d_full;
- const Indices128<TFromD<D>> idx{
- Dup128VecFromValues(Full128<uint8_t>(), 2, 3, 18, 19, 6, 7, 22, 23, 10,
- 11, 26, 27, 14, 15, 30, 31)
- .raw};
- return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
- ResizeBitCast(d_full, b), idx));
- }
- template <class D, HWY_IF_T_SIZE_D(D, 4)>
- HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
- #if HWY_S390X_HAVE_Z14
- const Full128<TFromD<D>> d_full;
- const Indices128<TFromD<D>> idx{
- Dup128VecFromValues(Full128<uint8_t>(), 4, 5, 6, 7, 20, 21, 22, 23, 12,
- 13, 14, 15, 28, 29, 30, 31)
- .raw};
- return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
- ResizeBitCast(d_full, b), idx));
- #else
- (void)d;
- return VFromD<D>{vec_mergeo(a.raw, b.raw)};
- #endif
- }
- template <class D, HWY_IF_T_SIZE_D(D, 8)>
- HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
- return InterleaveUpper(d, a, b);
- }
- // ------------------------------ OddEvenBlocks
- template <typename T, size_t N>
- HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
- return even;
- }
- // ------------------------------ SwapAdjacentBlocks
- template <typename T, size_t N>
- HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
- return v;
- }
- // ------------------------------ MulFixedPoint15 (OddEven)
- #if HWY_S390X_HAVE_Z14
- HWY_API Vec16<int16_t> MulFixedPoint15(Vec16<int16_t> a, Vec16<int16_t> b) {
- const DFromV<decltype(a)> di16;
- const RepartitionToWide<decltype(di16)> di32;
- const auto round_up_incr = Set(di32, 0x4000);
- const auto i32_product = MulEven(a, b) + round_up_incr;
- return ResizeBitCast(di16, ShiftLeft<1>(i32_product));
- }
- template <size_t N, HWY_IF_LANES_GT(N, 1)>
- HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
- Vec128<int16_t, N> b) {
- const DFromV<decltype(a)> di16;
- const RepartitionToWide<decltype(di16)> di32;
- const auto round_up_incr = Set(di32, 0x4000);
- const auto even_product = MulEven(a, b) + round_up_incr;
- const auto odd_product = MulOdd(a, b) + round_up_incr;
- return OddEven(BitCast(di16, ShiftRight<15>(odd_product)),
- BitCast(di16, ShiftLeft<1>(even_product)));
- }
- #else
- template <size_t N>
- HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
- Vec128<int16_t, N> b) {
- const Vec128<int16_t> zero = Zero(Full128<int16_t>());
- return Vec128<int16_t, N>{vec_mradds(a.raw, b.raw, zero.raw)};
- }
- #endif
- // ------------------------------ Shl
- namespace detail {
- template <typename T, size_t N>
- HWY_API Vec128<T, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<T, N> v,
- Vec128<T, N> bits) {
- #if HWY_S390X_HAVE_Z14
- return Vec128<T, N>{v.raw << bits.raw};
- #else
- return Vec128<T, N>{vec_sl(v.raw, bits.raw)};
- #endif
- }
- // Signed left shift is the same as unsigned.
- template <typename T, size_t N>
- HWY_API Vec128<T, N> Shl(hwy::SignedTag /*tag*/, Vec128<T, N> v,
- Vec128<T, N> bits) {
- const DFromV<decltype(v)> di;
- const RebindToUnsigned<decltype(di)> du;
- return BitCast(di,
- Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
- }
- } // namespace detail
- template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
- HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
- return detail::Shl(hwy::TypeTag<T>(), v, bits);
- }
- // ------------------------------ Shr
- namespace detail {
- template <typename T, size_t N>
- HWY_API Vec128<T, N> Shr(hwy::UnsignedTag /*tag*/, Vec128<T, N> v,
- Vec128<T, N> bits) {
- #if HWY_S390X_HAVE_Z14
- return Vec128<T, N>{v.raw >> bits.raw};
- #else
- return Vec128<T, N>{vec_sr(v.raw, bits.raw)};
- #endif
- }
- template <typename T, size_t N>
- HWY_API Vec128<T, N> Shr(hwy::SignedTag /*tag*/, Vec128<T, N> v,
- Vec128<T, N> bits) {
- #if HWY_S390X_HAVE_Z14
- return Vec128<T, N>{v.raw >> bits.raw};
- #else
- const DFromV<decltype(v)> di;
- const RebindToUnsigned<decltype(di)> du;
- return Vec128<T, N>{vec_sra(v.raw, BitCast(du, bits).raw)};
- #endif
- }
- } // namespace detail
- template <typename T, size_t N>
- HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, Vec128<T, N> bits) {
- return detail::Shr(hwy::TypeTag<T>(), v, bits);
- }
- // ------------------------------ MulEven/Odd 64x64 (UpperHalf)
- template <class T, HWY_IF_UI64(T)>
- HWY_INLINE Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
- #if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
- using V64 = typename detail::Raw128<T>::type;
- const V64 mul128_result = reinterpret_cast<V64>(vec_mule(a.raw, b.raw));
- #if HWY_IS_LITTLE_ENDIAN
- return Vec128<T>{mul128_result};
- #else
- // Need to swap the two halves of mul128_result on big-endian targets as
- // the upper 64 bits of the product are in lane 0 of mul128_result and
- // the lower 64 bits of the product are in lane 1 of mul128_result
- return Vec128<T>{vec_sld(mul128_result, mul128_result, 8)};
- #endif
- #else
- alignas(16) T mul[2];
- mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
- return Load(Full128<T>(), mul);
- #endif
- }
- template <class T, HWY_IF_UI64(T)>
- HWY_INLINE Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
- #if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
- using V64 = typename detail::Raw128<T>::type;
- const V64 mul128_result = reinterpret_cast<V64>(vec_mulo(a.raw, b.raw));
- #if HWY_IS_LITTLE_ENDIAN
- return Vec128<T>{mul128_result};
- #else
- // Need to swap the two halves of mul128_result on big-endian targets as
- // the upper 64 bits of the product are in lane 0 of mul128_result and
- // the lower 64 bits of the product are in lane 1 of mul128_result
- return Vec128<T>{vec_sld(mul128_result, mul128_result, 8)};
- #endif
- #else
- alignas(16) T mul[2];
- const Full64<T> d2;
- mul[0] =
- Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
- return Load(Full128<T>(), mul);
- #endif
- }
- // ------------------------------ PromoteEvenTo/PromoteOddTo
- #include "hwy/ops/inside-inl.h"
- // ------------------------------ WidenMulPairwiseAdd
- template <class DF, HWY_IF_F32_D(DF),
- class VBF = VFromD<Repartition<bfloat16_t, DF>>>
- HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
- return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
- Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
- }
- // Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe.
- template <class D32, HWY_IF_UI32_D(D32),
- class V16 = VFromD<RepartitionToNarrow<D32>>>
- HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 d32, V16 a, V16 b) {
- #if HWY_S390X_HAVE_Z14
- (void)d32;
- return MulEven(a, b) + MulOdd(a, b);
- #else
- return VFromD<D32>{vec_msum(a.raw, b.raw, Zero(d32).raw)};
- #endif
- }
- // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
- // Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe.
- template <class D32, HWY_IF_UI32_D(D32),
- class V16 = VFromD<RepartitionToNarrow<D32>>>
- HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 /*d32*/, V16 a, V16 b,
- VFromD<D32> sum0,
- VFromD<D32>& /*sum1*/) {
- #if HWY_S390X_HAVE_Z14
- return MulEven(a, b) + MulOdd(a, b) + sum0;
- #else
- return VFromD<D32>{vec_msum(a.raw, b.raw, sum0.raw)};
- #endif
- }
- // ------------------------------ RearrangeToOddPlusEven
- template <size_t N>
- HWY_API Vec128<int32_t, N> RearrangeToOddPlusEven(Vec128<int32_t, N> sum0,
- Vec128<int32_t, N> /*sum1*/) {
- return sum0; // invariant already holds
- }
- template <size_t N>
- HWY_API Vec128<uint32_t, N> RearrangeToOddPlusEven(
- Vec128<uint32_t, N> sum0, Vec128<uint32_t, N> /*sum1*/) {
- return sum0; // invariant already holds
- }
- template <class VW>
- HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
- return Add(sum0, sum1);
- }
- // ------------------------------ SatWidenMulPairwiseAccumulate
- #if !HWY_S390X_HAVE_Z14
- #ifdef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
- #undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
- #else
- #define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
- #endif
- template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
- HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
- DI32 /* tag */, VFromD<Repartition<int16_t, DI32>> a,
- VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
- return VFromD<DI32>{vec_msums(a.raw, b.raw, sum.raw)};
- }
- #endif // !HWY_S390X_HAVE_Z14
- // ------------------------------ SumOfMulQuadAccumulate
- #if !HWY_S390X_HAVE_Z14
- #ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
- #undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
- #else
- #define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
- #endif
- template <class DU32, HWY_IF_U32_D(DU32)>
- HWY_API VFromD<DU32> SumOfMulQuadAccumulate(
- DU32 /*du32*/, VFromD<Repartition<uint8_t, DU32>> a,
- VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) {
- return VFromD<DU32>{vec_msum(a.raw, b.raw, sum.raw)};
- }
- #ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
- #undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
- #else
- #define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
- #endif
- template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
- HWY_API VFromD<DI32> SumOfMulQuadAccumulate(
- DI32 /*di32*/, VFromD<Repartition<uint8_t, DI32>> a_u,
- VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) {
- return VFromD<DI32>{vec_msum(b_i.raw, a_u.raw, sum.raw)};
- }
- #ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
- #undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
- #else
- #define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
- #endif
- template <class DI32, HWY_IF_I32_D(DI32)>
- HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32,
- VFromD<Repartition<int8_t, DI32>> a,
- VFromD<Repartition<int8_t, DI32>> b,
- VFromD<DI32> sum) {
- const Repartition<uint8_t, decltype(di32)> du8;
- const auto result_sum_0 =
- SumOfMulQuadAccumulate(di32, BitCast(du8, a), b, sum);
- const auto result_sum_1 = ShiftLeft<8>(SumsOf4(And(b, BroadcastSignBit(a))));
- return result_sum_0 - result_sum_1;
- }
- #endif // !HWY_S390X_HAVE_Z14
- // ================================================== CONVERT
- // ------------------------------ Promotions (part w/ narrow lanes -> full)
- // Unsigned to signed/unsigned: zero-extend.
- template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)),
- HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D), HWY_IF_UNSIGNED(FromT)>
- HWY_API VFromD<D> PromoteTo(D /* d */,
- Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
- // First pretend the input has twice the lanes - the upper half will be
- // ignored by ZipLower.
- const Rebind<FromT, Twice<D>> d2;
- const VFromD<decltype(d2)> twice{v.raw};
- // Then cast to narrow as expected by ZipLower, in case the sign of FromT
- // differs from that of D.
- const RepartitionToNarrow<D> dn;
- #if HWY_IS_LITTLE_ENDIAN
- return ZipLower(BitCast(dn, twice), Zero(dn));
- #else
- return ZipLower(Zero(dn), BitCast(dn, twice));
- #endif
- }
- // Signed: replicate sign bit.
- template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)),
- HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D), HWY_IF_SIGNED(FromT)>
- HWY_API VFromD<D> PromoteTo(D /* d */,
- Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
- using Raw = typename detail::Raw128<TFromD<D>>::type;
- return VFromD<D>{reinterpret_cast<Raw>(vec_unpackh(v.raw))};
- }
- // 8-bit to 32-bit: First, promote to 16-bit, and then convert to 32-bit.
- template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 4), HWY_IF_NOT_FLOAT_D(D),
- HWY_IF_T_SIZE(FromT, 1)>
- HWY_API VFromD<D> PromoteTo(D d32,
- Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
- const DFromV<decltype(v)> d8;
- const Rebind<MakeWide<FromT>, decltype(d8)> d16;
- return PromoteTo(d32, PromoteTo(d16, v));
- }
- // 8-bit or 16-bit to 64-bit: First, promote to MakeWide<FromT>, and then
- // convert to 64-bit.
- template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 8), HWY_IF_NOT_FLOAT_D(D),
- HWY_IF_NOT_FLOAT_NOR_SPECIAL(FromT),
- HWY_IF_T_SIZE_ONE_OF(FromT, (1 << 1) | (1 << 2))>
- HWY_API VFromD<D> PromoteTo(D d64,
- Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
- const Rebind<MakeWide<FromT>, decltype(d64)> dw;
- return PromoteTo(d64, PromoteTo(dw, v));
- }
- #if HWY_PPC_HAVE_9
- // Per-target flag to prevent generic_ops-inl.h from defining f16 conversions.
- #ifdef HWY_NATIVE_F16C
- #undef HWY_NATIVE_F16C
- #else
- #define HWY_NATIVE_F16C
- #endif
- template <class D, HWY_IF_F32_D(D)>
- HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) {
- return VFromD<D>{vec_extract_fp32_from_shorth(v.raw)};
- }
- #endif // HWY_PPC_HAVE_9
- template <class D, HWY_IF_F32_D(D)>
- HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) {
- const Rebind<uint16_t, decltype(df32)> du16;
- const RebindToSigned<decltype(df32)> di32;
- return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
- }
- template <class D, HWY_IF_F64_D(D)>
- HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
- const __vector float raw_v = InterleaveLower(v, v).raw;
- #if HWY_IS_LITTLE_ENDIAN
- return VFromD<D>{vec_doubleo(raw_v)};
- #else
- return VFromD<D>{vec_doublee(raw_v)};
- #endif
- }
- template <class D, HWY_IF_F64_D(D)>
- HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<int32_t, D>> v) {
- #if HWY_S390X_HAVE_Z14
- const RebindToSigned<decltype(df64)> di64;
- return ConvertTo(df64, PromoteTo(di64, v));
- #else // VSX
- (void)df64;
- const __vector signed int raw_v = InterleaveLower(v, v).raw;
- #if HWY_IS_LITTLE_ENDIAN
- return VFromD<D>{vec_doubleo(raw_v)};
- #else
- return VFromD<D>{vec_doublee(raw_v)};
- #endif
- #endif // HWY_S390X_HAVE_Z14
- }
- template <class D, HWY_IF_F64_D(D)>
- HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<uint32_t, D>> v) {
- #if HWY_S390X_HAVE_Z14
- const RebindToUnsigned<decltype(df64)> du64;
- return ConvertTo(df64, PromoteTo(du64, v));
- #else // VSX
- (void)df64;
- const __vector unsigned int raw_v = InterleaveLower(v, v).raw;
- #if HWY_IS_LITTLE_ENDIAN
- return VFromD<D>{vec_doubleo(raw_v)};
- #else
- return VFromD<D>{vec_doublee(raw_v)};
- #endif
- #endif // HWY_S390X_HAVE_Z14
- }
- #if !HWY_S390X_HAVE_Z14
- namespace detail {
- template <class V>
- static HWY_INLINE V VsxF2INormalizeSrcVals(V v) {
- #if !defined(HWY_DISABLE_PPC_VSX_QEMU_F2I_WORKAROUND)
- // Workaround for QEMU 7/8 VSX float to int conversion bug
- return IfThenElseZero(v == v, v);
- #else
- return v;
- #endif
- }
- } // namespace detail
- #endif // !HWY_S390X_HAVE_Z14
- template <class D, HWY_IF_I64_D(D)>
- HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
- #if !HWY_S390X_HAVE_Z14 && \
- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
- const __vector float raw_v =
- detail::VsxF2INormalizeSrcVals(InterleaveLower(v, v)).raw;
- return VFromD<decltype(di64)>{__builtin_vsx_xvcvspsxds(raw_v)};
- #else
- const RebindToFloat<decltype(di64)> df64;
- return ConvertTo(di64, PromoteTo(df64, v));
- #endif
- }
- template <class D, HWY_IF_U64_D(D)>
- HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
- #if !HWY_S390X_HAVE_Z14 && \
- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
- const __vector float raw_v =
- detail::VsxF2INormalizeSrcVals(InterleaveLower(v, v)).raw;
- return VFromD<decltype(du64)>{reinterpret_cast<__vector unsigned long long>(
- __builtin_vsx_xvcvspuxds(raw_v))};
- #else
- const RebindToFloat<decltype(du64)> df64;
- return ConvertTo(du64, PromoteTo(df64, v));
- #endif
- }
- // ------------------------------ PromoteUpperTo
- #ifdef HWY_NATIVE_PROMOTE_UPPER_TO
- #undef HWY_NATIVE_PROMOTE_UPPER_TO
- #else
- #define HWY_NATIVE_PROMOTE_UPPER_TO
- #endif
- // Unsigned to signed/unsigned: zero-extend.
- template <class D, typename FromT, HWY_IF_V_SIZE_D(D, 16),
- HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)),
- HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D), HWY_IF_UNSIGNED(FromT)>
- HWY_API VFromD<D> PromoteUpperTo(D d, Vec128<FromT> v) {
- const RebindToUnsigned<D> du;
- const RepartitionToNarrow<decltype(du)> dn;
- #if HWY_IS_LITTLE_ENDIAN
- return BitCast(d, ZipUpper(du, v, Zero(dn)));
- #else
- return BitCast(d, ZipUpper(du, Zero(dn), v));
- #endif
- }
- // Signed: replicate sign bit.
- template <class D, typename FromT, HWY_IF_V_SIZE_D(D, 16),
- HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)),
- HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D), HWY_IF_SIGNED(FromT)>
- HWY_API VFromD<D> PromoteUpperTo(D /* d */, Vec128<FromT> v) {
- using Raw = typename detail::Raw128<TFromD<D>>::type;
- return VFromD<D>{reinterpret_cast<Raw>(vec_unpackl(v.raw))};
- }
- // F16 to F32
- template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
- HWY_API VFromD<D> PromoteUpperTo(D df32, Vec128<float16_t> v) {
- #if HWY_PPC_HAVE_9
- (void)df32;
- return VFromD<D>{vec_extract_fp32_from_shortl(v.raw)};
- #else
- const Rebind<float16_t, decltype(df32)> dh;
- return PromoteTo(df32, UpperHalf(dh, v));
- #endif
- }
- // BF16 to F32
- template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
- HWY_API VFromD<D> PromoteUpperTo(D df32, Vec128<bfloat16_t> v) {
- const Repartition<uint16_t, decltype(df32)> du16;
- const RebindToSigned<decltype(df32)> di32;
- return BitCast(df32, ShiftLeft<16>(PromoteUpperTo(di32, BitCast(du16, v))));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
- HWY_API VFromD<D> PromoteUpperTo(D /*tag*/, Vec128<float> v) {
- const __vector float raw_v = InterleaveUpper(Full128<float>(), v, v).raw;
- #if HWY_IS_LITTLE_ENDIAN
- return VFromD<D>{vec_doubleo(raw_v)};
- #else
- return VFromD<D>{vec_doublee(raw_v)};
- #endif
- }
- template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
- HWY_API VFromD<D> PromoteUpperTo(D df64, Vec128<int32_t> v) {
- #if HWY_S390X_HAVE_Z14
- const RebindToSigned<decltype(df64)> di64;
- return ConvertTo(df64, PromoteUpperTo(di64, v));
- #else // VSX
- (void)df64;
- const __vector signed int raw_v =
- InterleaveUpper(Full128<int32_t>(), v, v).raw;
- #if HWY_IS_LITTLE_ENDIAN
- return VFromD<D>{vec_doubleo(raw_v)};
- #else
- return VFromD<D>{vec_doublee(raw_v)};
- #endif
- #endif // HWY_S390X_HAVE_Z14
- }
- template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
- HWY_API VFromD<D> PromoteUpperTo(D df64, Vec128<uint32_t> v) {
- #if HWY_S390X_HAVE_Z14
- const RebindToUnsigned<decltype(df64)> du64;
- return ConvertTo(df64, PromoteUpperTo(du64, v));
- #else // VSX
- (void)df64;
- const __vector unsigned int raw_v =
- InterleaveUpper(Full128<uint32_t>(), v, v).raw;
- #if HWY_IS_LITTLE_ENDIAN
- return VFromD<D>{vec_doubleo(raw_v)};
- #else
- return VFromD<D>{vec_doublee(raw_v)};
- #endif
- #endif // HWY_S390X_HAVE_Z14
- }
- template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
- HWY_API VFromD<D> PromoteUpperTo(D di64, Vec128<float> v) {
- #if !HWY_S390X_HAVE_Z14 && \
- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
- const __vector float raw_v =
- detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v))
- .raw;
- return VFromD<decltype(di64)>{__builtin_vsx_xvcvspsxds(raw_v)};
- #else
- const RebindToFloat<decltype(di64)> df64;
- return ConvertTo(di64, PromoteUpperTo(df64, v));
- #endif
- }
- template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
- HWY_API VFromD<D> PromoteUpperTo(D du64, Vec128<float> v) {
- #if !HWY_S390X_HAVE_Z14 && \
- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
- const __vector float raw_v =
- detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v))
- .raw;
- return VFromD<decltype(du64)>{reinterpret_cast<__vector unsigned long long>(
- __builtin_vsx_xvcvspuxds(raw_v))};
- #else
- const RebindToFloat<decltype(du64)> df64;
- return ConvertTo(du64, PromoteUpperTo(df64, v));
- #endif
- }
- // Generic version for <=64 bit input/output
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), class V>
- HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
- const Rebind<TFromV<V>, decltype(d)> dh;
- return PromoteTo(d, UpperHalf(dh, v));
- }
- // ------------------------------ PromoteEvenTo/PromoteOddTo
- namespace detail {
- // Signed to Signed PromoteEvenTo/PromoteOddTo for PPC9/PPC10
- #if HWY_PPC_HAVE_9 && \
- (HWY_COMPILER_GCC_ACTUAL >= 1200 || HWY_COMPILER_CLANG >= 1200)
- #if HWY_IS_LITTLE_ENDIAN
- template <class D, class V>
- HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
- hwy::SizeTag<4> /*to_lane_size_tag*/,
- hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
- V v) {
- return VFromD<D>{vec_signexti(v.raw)};
- }
- template <class D, class V>
- HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
- hwy::SizeTag<8> /*to_lane_size_tag*/,
- hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
- V v) {
- return VFromD<D>{vec_signextll(v.raw)};
- }
- #else
- template <class D, class V>
- HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
- hwy::SizeTag<4> /*to_lane_size_tag*/,
- hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
- V v) {
- return VFromD<D>{vec_signexti(v.raw)};
- }
- template <class D, class V>
- HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
- hwy::SizeTag<8> /*to_lane_size_tag*/,
- hwy::SignedTag /*from_type_tag*/, D /*d_to*/,
- V v) {
- return VFromD<D>{vec_signextll(v.raw)};
- }
- #endif // HWY_IS_LITTLE_ENDIAN
- #endif // HWY_PPC_HAVE_9
- // I32/U32/F32->F64 PromoteEvenTo
- #if HWY_S390X_HAVE_Z14
- template <class D, class V>
- HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
- hwy::SizeTag<8> /*to_lane_size_tag*/,
- hwy::FloatTag /*from_type_tag*/, D /*d_to*/,
- V v) {
- return VFromD<D>{vec_doublee(v.raw)};
- }
- template <class D, class V, class FromTypeTag, HWY_IF_UI32(TFromV<V>)>
- HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
- hwy::SizeTag<8> /*to_lane_size_tag*/,
- FromTypeTag /*from_type_tag*/, D d_to, V v) {
- const Rebind<MakeWide<TFromV<V>>, decltype(d_to)> dw;
- return ConvertTo(d_to, PromoteEvenTo(dw, v));
- }
- #else // VSX
- template <class D, class V, class FromTypeTag>
- HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
- hwy::SizeTag<8> /*to_lane_size_tag*/,
- FromTypeTag /*from_type_tag*/, D /*d_to*/,
- V v) {
- return VFromD<D>{vec_doublee(v.raw)};
- }
- #endif // HWY_S390X_HAVE_Z14
- // F32->I64 PromoteEvenTo
- template <class D, class V>
- HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
- hwy::SizeTag<8> /*to_lane_size_tag*/,
- hwy::FloatTag /*from_type_tag*/, D d_to,
- V v) {
- #if !HWY_S390X_HAVE_Z14 && \
- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
- (void)d_to;
- const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
- #if HWY_IS_LITTLE_ENDIAN
- // __builtin_vsx_xvcvspsxds expects the source values to be in the odd lanes
- // on little-endian PPC, and the vec_sld operation below will shift the even
- // lanes of normalized_v into the odd lanes.
- return VFromD<D>{
- __builtin_vsx_xvcvspsxds(vec_sld(normalized_v.raw, normalized_v.raw, 4))};
- #else
- // __builtin_vsx_xvcvspsxds expects the source values to be in the even lanes
- // on big-endian PPC.
- return VFromD<D>{__builtin_vsx_xvcvspsxds(normalized_v.raw)};
- #endif
- #else
- const RebindToFloat<decltype(d_to)> df64;
- return ConvertTo(d_to, PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(),
- hwy::FloatTag(), df64, v));
- #endif
- }
- // F32->U64 PromoteEvenTo
- template <class D, class V>
- HWY_INLINE VFromD<D> PromoteEvenTo(hwy::UnsignedTag /*to_type_tag*/,
- hwy::SizeTag<8> /*to_lane_size_tag*/,
- hwy::FloatTag /*from_type_tag*/, D d_to,
- V v) {
- #if !HWY_S390X_HAVE_Z14 && \
- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
- (void)d_to;
- const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
- #if HWY_IS_LITTLE_ENDIAN
- // __builtin_vsx_xvcvspuxds expects the source values to be in the odd lanes
- // on little-endian PPC, and the vec_sld operation below will shift the even
- // lanes of normalized_v into the odd lanes.
- return VFromD<D>{
- reinterpret_cast<__vector unsigned long long>(__builtin_vsx_xvcvspuxds(
- vec_sld(normalized_v.raw, normalized_v.raw, 4)))};
- #else
- // __builtin_vsx_xvcvspuxds expects the source values to be in the even lanes
- // on big-endian PPC.
- return VFromD<D>{reinterpret_cast<__vector unsigned long long>(
- __builtin_vsx_xvcvspuxds(normalized_v.raw))};
- #endif
- #else
- const RebindToFloat<decltype(d_to)> df64;
- return ConvertTo(d_to, PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(),
- hwy::FloatTag(), df64, v));
- #endif
- }
- // I32/U32/F32->F64 PromoteOddTo
- #if HWY_S390X_HAVE_Z14
- template <class D, class V>
- HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
- hwy::SizeTag<8> /*to_lane_size_tag*/,
- hwy::FloatTag /*from_type_tag*/, D d_to,
- V v) {
- return PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(), hwy::FloatTag(),
- d_to, V{vec_sld(v.raw, v.raw, 4)});
- }
- template <class D, class V, class FromTypeTag, HWY_IF_UI32(TFromV<V>)>
- HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
- hwy::SizeTag<8> /*to_lane_size_tag*/,
- FromTypeTag /*from_type_tag*/, D d_to, V v) {
- const Rebind<MakeWide<TFromV<V>>, decltype(d_to)> dw;
- return ConvertTo(d_to, PromoteOddTo(dw, v));
- }
- #else
- template <class D, class V, class FromTypeTag>
- HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
- hwy::SizeTag<8> /*to_lane_size_tag*/,
- FromTypeTag /*from_type_tag*/, D /*d_to*/,
- V v) {
- return VFromD<D>{vec_doubleo(v.raw)};
- }
- #endif
- // F32->I64 PromoteOddTo
- template <class D, class V>
- HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
- hwy::SizeTag<8> /*to_lane_size_tag*/,
- hwy::FloatTag /*from_type_tag*/, D d_to,
- V v) {
- #if !HWY_S390X_HAVE_Z14 && \
- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds))
- (void)d_to;
- const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
- #if HWY_IS_LITTLE_ENDIAN
- // __builtin_vsx_xvcvspsxds expects the source values to be in the odd lanes
- // on little-endian PPC
- return VFromD<D>{__builtin_vsx_xvcvspsxds(normalized_v.raw)};
- #else
- // __builtin_vsx_xvcvspsxds expects the source values to be in the even lanes
- // on big-endian PPC, and the vec_sld operation below will shift the odd lanes
- // of normalized_v into the even lanes.
- return VFromD<D>{
- __builtin_vsx_xvcvspsxds(vec_sld(normalized_v.raw, normalized_v.raw, 4))};
- #endif
- #else
- const RebindToFloat<decltype(d_to)> df64;
- return ConvertTo(d_to, PromoteOddTo(hwy::FloatTag(), hwy::SizeTag<8>(),
- hwy::FloatTag(), df64, v));
- #endif
- }
- // F32->U64 PromoteOddTo
- template <class D, class V>
- HWY_INLINE VFromD<D> PromoteOddTo(hwy::UnsignedTag /*to_type_tag*/,
- hwy::SizeTag<8> /*to_lane_size_tag*/,
- hwy::FloatTag /*from_type_tag*/, D d_to,
- V v) {
- #if !HWY_S390X_HAVE_Z14 && \
- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds))
- (void)d_to;
- const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
- #if HWY_IS_LITTLE_ENDIAN
- // __builtin_vsx_xvcvspuxds expects the source values to be in the odd lanes
- // on little-endian PPC
- return VFromD<D>{reinterpret_cast<__vector unsigned long long>(
- __builtin_vsx_xvcvspuxds(normalized_v.raw))};
- #else
- // __builtin_vsx_xvcvspuxds expects the source values to be in the even lanes
- // on big-endian PPC, and the vec_sld operation below will shift the odd lanes
- // of normalized_v into the even lanes.
- return VFromD<D>{
- reinterpret_cast<__vector unsigned long long>(__builtin_vsx_xvcvspuxds(
- vec_sld(normalized_v.raw, normalized_v.raw, 4)))};
- #endif
- #else
- const RebindToFloat<decltype(d_to)> df64;
- return ConvertTo(d_to, PromoteOddTo(hwy::FloatTag(), hwy::SizeTag<8>(),
- hwy::FloatTag(), df64, v));
- #endif
- }
- } // namespace detail
- // ------------------------------ Demotions (full -> part w/ narrow lanes)
- template <class D, typename FromT, HWY_IF_UNSIGNED_D(D),
- HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
- HWY_IF_SIGNED(FromT), HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2)>
- HWY_API VFromD<D> DemoteTo(D /* tag */,
- Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
- return VFromD<D>{vec_packsu(v.raw, v.raw)};
- }
- template <class D, typename FromT, HWY_IF_SIGNED_D(D), HWY_IF_SIGNED(FromT),
- HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
- HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2)>
- HWY_API VFromD<D> DemoteTo(D /* tag */,
- Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
- return VFromD<D>{vec_packs(v.raw, v.raw)};
- }
- template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
- HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
- HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2)>
- HWY_API VFromD<D> DemoteTo(D /* tag */,
- Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
- return VFromD<D>{vec_packs(v.raw, v.raw)};
- }
- template <class D, class FromT, HWY_IF_SIGNED_D(D), HWY_IF_SIGNED(FromT),
- HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
- hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr>
- HWY_API VFromD<D> DemoteTo(D d,
- Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
- const Rebind<MakeNarrow<FromT>, D> d2;
- return DemoteTo(d, DemoteTo(d2, v));
- }
- template <class D, class FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
- HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
- hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr>
- HWY_API VFromD<D> DemoteTo(D d,
- Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
- const Rebind<MakeNarrow<FromT>, D> d2;
- return DemoteTo(d, DemoteTo(d2, v));
- }
- template <class D, class FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_SIGNED(FromT),
- HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
- hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr>
- HWY_API VFromD<D> DemoteTo(D d,
- Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
- const Rebind<MakeUnsigned<MakeNarrow<FromT>>, D> d2;
- return DemoteTo(d, DemoteTo(d2, v));
- }
- #if HWY_PPC_HAVE_9 && \
- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvsphp))
- // We already toggled HWY_NATIVE_F16C above.
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
- HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
- // Avoid vec_pack_to_short_fp32 on Clang because its implementation is buggy.
- #if HWY_COMPILER_GCC_ACTUAL
- (void)df16;
- return VFromD<D>{vec_pack_to_short_fp32(v.raw, v.raw)};
- #elif HWY_HAS_BUILTIN(__builtin_vsx_xvcvsphp)
- // Work around bug in the clang implementation of vec_pack_to_short_fp32
- // by using the __builtin_vsx_xvcvsphp builtin on PPC9/PPC10 targets
- // if the __builtin_vsx_xvcvsphp intrinsic is available
- const RebindToUnsigned<decltype(df16)> du16;
- const Rebind<uint32_t, D> du;
- const VFromD<decltype(du)> bits16{
- reinterpret_cast<__vector unsigned int>(__builtin_vsx_xvcvsphp(v.raw))};
- return BitCast(df16, TruncateTo(du16, bits16));
- #else
- #error "Only define the function if we have a native implementation"
- #endif
- }
- #endif // HWY_PPC_HAVE_9
- #if HWY_PPC_HAVE_9
- #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
- #undef HWY_NATIVE_DEMOTE_F64_TO_F16
- #else
- #define HWY_NATIVE_DEMOTE_F64_TO_F16
- #endif
- namespace detail {
- // On big-endian PPC9, VsxXscvdphp converts vf64[0] to a F16, returned as an U64
- // vector with the resulting F16 bits in the lower 16 bits of U64 lane 0
- // On little-endian PPC9, VsxXscvdphp converts vf64[1] to a F16, returned as
- // an U64 vector with the resulting F16 bits in the lower 16 bits of U64 lane 1
- static HWY_INLINE Vec128<uint64_t> VsxXscvdphp(Vec128<double> vf64) {
- // Inline assembly is needed for the PPC9 xscvdphp instruction as there is
- // currently no intrinsic available for the PPC9 xscvdphp instruction
- __vector unsigned long long raw_result;
- __asm__("xscvdphp %x0, %x1" : "=wa"(raw_result) : "wa"(vf64.raw));
- return Vec128<uint64_t>{raw_result};
- }
- } // namespace detail
- template <class D, HWY_IF_F16_D(D), HWY_IF_LANES_D(D, 1)>
- HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
- const RebindToUnsigned<decltype(df16)> du16;
- const Rebind<uint64_t, decltype(df16)> du64;
- const Full128<double> df64_full;
- #if HWY_IS_LITTLE_ENDIAN
- const auto bits16_as_u64 =
- UpperHalf(du64, detail::VsxXscvdphp(Combine(df64_full, v, v)));
- #else
- const auto bits16_as_u64 =
- LowerHalf(du64, detail::VsxXscvdphp(ResizeBitCast(df64_full, v)));
- #endif
- return BitCast(df16, TruncateTo(du16, bits16_as_u64));
- }
- template <class D, HWY_IF_F16_D(D), HWY_IF_LANES_D(D, 2)>
- HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
- const RebindToUnsigned<decltype(df16)> du16;
- const Rebind<uint64_t, decltype(df16)> du64;
- const Rebind<double, decltype(df16)> df64;
- #if HWY_IS_LITTLE_ENDIAN
- const auto bits64_as_u64_0 = detail::VsxXscvdphp(InterleaveLower(df64, v, v));
- const auto bits64_as_u64_1 = detail::VsxXscvdphp(v);
- const auto bits64_as_u64 =
- InterleaveUpper(du64, bits64_as_u64_0, bits64_as_u64_1);
- #else
- const auto bits64_as_u64_0 = detail::VsxXscvdphp(v);
- const auto bits64_as_u64_1 = detail::VsxXscvdphp(InterleaveUpper(df64, v, v));
- const auto bits64_as_u64 =
- InterleaveLower(du64, bits64_as_u64_0, bits64_as_u64_1);
- #endif
- return BitCast(df16, TruncateTo(du16, bits64_as_u64));
- }
- #elif HWY_S390X_HAVE_Z14
- #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
- #undef HWY_NATIVE_DEMOTE_F64_TO_F16
- #else
- #define HWY_NATIVE_DEMOTE_F64_TO_F16
- #endif
- namespace detail {
- template <class DF32, HWY_IF_F32_D(DF32)>
- static HWY_INLINE VFromD<DF32> DemoteToF32WithRoundToOdd(
- DF32 df32, VFromD<Rebind<double, DF32>> v) {
- const Twice<DF32> dt_f32;
- __vector float raw_f32_in_even;
- __asm__("vledb %0,%1,0,3" : "=v"(raw_f32_in_even) : "v"(v.raw));
- const VFromD<decltype(dt_f32)> f32_in_even{raw_f32_in_even};
- return LowerHalf(df32, ConcatEven(dt_f32, f32_in_even, f32_in_even));
- }
- } // namespace detail
- template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)>
- HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
- const Rebind<float, decltype(df16)> df32;
- return DemoteTo(df16, detail::DemoteToF32WithRoundToOdd(df32, v));
- }
- #endif // HWY_PPC_HAVE_9
- #if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
- #ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
- #undef HWY_NATIVE_DEMOTE_F32_TO_BF16
- #else
- #define HWY_NATIVE_DEMOTE_F32_TO_BF16
- #endif
- namespace detail {
- // VsxXvcvspbf16 converts a F32 vector to a BF16 vector, bitcasted to an U32
- // vector with the resulting BF16 bits in the lower 16 bits of each U32 lane
- template <class D, HWY_IF_BF16_D(D)>
- static HWY_INLINE VFromD<Rebind<uint32_t, D>> VsxXvcvspbf16(
- D dbf16, VFromD<Rebind<float, D>> v) {
- const Rebind<uint32_t, decltype(dbf16)> du32;
- const Repartition<uint8_t, decltype(du32)> du32_as_du8;
- using VU32 = __vector unsigned int;
- // Even though the __builtin_vsx_xvcvspbf16 builtin performs a F32 to BF16
- // conversion, the __builtin_vsx_xvcvspbf16 intrinsic expects a
- // __vector unsigned char argument (at least as of GCC 13 and Clang 17)
- return VFromD<Rebind<uint32_t, D>>{reinterpret_cast<VU32>(
- __builtin_vsx_xvcvspbf16(BitCast(du32_as_du8, v).raw))};
- }
- } // namespace detail
- template <class D, HWY_IF_BF16_D(D)>
- HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
- const RebindToUnsigned<decltype(dbf16)> du16;
- return BitCast(dbf16, TruncateTo(du16, detail::VsxXvcvspbf16(dbf16, v)));
- }
- #endif // HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
- // Specializations for partial vectors because vec_packs sets lanes above 2*N.
- template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4), HWY_IF_SIGNED_D(DN),
- HWY_IF_SIGNED_V(V),
- HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
- HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
- HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
- const DFromV<decltype(a)> d;
- const Twice<decltype(d)> dt;
- return DemoteTo(dn, Combine(dt, b, a));
- }
- template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 8), HWY_IF_SIGNED_D(DN),
- HWY_IF_SIGNED_V(V),
- HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
- HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
- HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
- const Twice<decltype(dn)> dn_full;
- const Repartition<uint32_t, decltype(dn_full)> du32_full;
- const VFromD<decltype(dn_full)> v_full{vec_packs(a.raw, b.raw)};
- const auto vu32_full = BitCast(du32_full, v_full);
- return LowerHalf(
- BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full)));
- }
- template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 16), HWY_IF_SIGNED_D(DN),
- HWY_IF_SIGNED_V(V),
- HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
- HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
- HWY_API VFromD<DN> ReorderDemote2To(DN /*dn*/, V a, V b) {
- return VFromD<DN>{vec_packs(a.raw, b.raw)};
- }
- template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4),
- HWY_IF_UNSIGNED_D(DN), HWY_IF_SIGNED_V(V),
- HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
- HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
- HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
- const DFromV<decltype(a)> d;
- const Twice<decltype(d)> dt;
- return DemoteTo(dn, Combine(dt, b, a));
- }
- template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 8), HWY_IF_UNSIGNED_D(DN),
- HWY_IF_SIGNED_V(V),
- HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
- HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
- HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
- const Twice<decltype(dn)> dn_full;
- const Repartition<uint32_t, decltype(dn_full)> du32_full;
- const VFromD<decltype(dn_full)> v_full{vec_packsu(a.raw, b.raw)};
- const auto vu32_full = BitCast(du32_full, v_full);
- return LowerHalf(
- BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full)));
- }
- template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 16), HWY_IF_UNSIGNED_D(DN),
- HWY_IF_SIGNED_V(V),
- HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
- HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
- HWY_API VFromD<DN> ReorderDemote2To(DN /*dn*/, V a, V b) {
- return VFromD<DN>{vec_packsu(a.raw, b.raw)};
- }
- template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4),
- HWY_IF_UNSIGNED_D(DN), HWY_IF_UNSIGNED_V(V),
- HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
- HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
- HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
- const DFromV<decltype(a)> d;
- const Twice<decltype(d)> dt;
- return DemoteTo(dn, Combine(dt, b, a));
- }
- template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 8), HWY_IF_UNSIGNED_D(DN),
- HWY_IF_UNSIGNED_V(V),
- HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
- HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
- HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
- const Twice<decltype(dn)> dn_full;
- const Repartition<uint32_t, decltype(dn_full)> du32_full;
- const VFromD<decltype(dn_full)> v_full{vec_packs(a.raw, b.raw)};
- const auto vu32_full = BitCast(du32_full, v_full);
- return LowerHalf(
- BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full)));
- }
- template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 16), HWY_IF_UNSIGNED_D(DN),
- HWY_IF_UNSIGNED_V(V),
- HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
- HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
- HWY_API VFromD<DN> ReorderDemote2To(DN /*dn*/, V a, V b) {
- return VFromD<DN>{vec_packs(a.raw, b.raw)};
- }
- #if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
- template <class D, class V, HWY_IF_BF16_D(D), HWY_IF_F32(TFromV<V>),
- HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V) * 2)>
- HWY_API VFromD<D> ReorderDemote2To(D dbf16, V a, V b) {
- const RebindToUnsigned<decltype(dbf16)> du16;
- const Half<decltype(dbf16)> dh_bf16;
- return BitCast(dbf16,
- OrderedTruncate2To(du16, detail::VsxXvcvspbf16(dh_bf16, a),
- detail::VsxXvcvspbf16(dh_bf16, b)));
- }
- #endif
- template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), class V,
- HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
- HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
- HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
- HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
- return ReorderDemote2To(d, a, b);
- }
- #if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
- template <class D, HWY_IF_BF16_D(D), class V, HWY_IF_F32(TFromV<V>),
- HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
- HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
- return ReorderDemote2To(d, a, b);
- }
- #endif
- template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
- HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) {
- return Vec32<float>{vec_floate(v.raw)};
- }
- template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
- HWY_API Vec64<float> DemoteTo(D d, Vec128<double> v) {
- #if HWY_S390X_HAVE_Z14 || HWY_IS_LITTLE_ENDIAN
- const Vec128<float> f64_to_f32{vec_floate(v.raw)};
- #else
- const Vec128<float> f64_to_f32{vec_floato(v.raw)};
- #endif
- #if HWY_S390X_HAVE_Z14
- const Twice<decltype(d)> dt;
- return LowerHalf(d, ConcatEven(dt, f64_to_f32, f64_to_f32));
- #else
- const RebindToUnsigned<D> du;
- const Rebind<uint64_t, D> du64;
- return Vec64<float>{
- BitCast(d, TruncateTo(du, BitCast(du64, f64_to_f32))).raw};
- #endif
- }
- template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I32_D(D)>
- HWY_API Vec32<int32_t> DemoteTo(D di32, Vec64<double> v) {
- #if HWY_S390X_HAVE_Z14
- const Rebind<int64_t, decltype(di32)> di64;
- return DemoteTo(di32, ConvertTo(di64, v));
- #else
- (void)di32;
- return Vec32<int32_t>{vec_signede(detail::VsxF2INormalizeSrcVals(v).raw)};
- #endif
- }
- template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I32_D(D)>
- HWY_API Vec64<int32_t> DemoteTo(D di32, Vec128<double> v) {
- #if HWY_S390X_HAVE_Z14
- const Rebind<int64_t, decltype(di32)> di64;
- return DemoteTo(di32, ConvertTo(di64, v));
- #else
- (void)di32;
- #if HWY_IS_LITTLE_ENDIAN
- const Vec128<int32_t> f64_to_i32{
- vec_signede(detail::VsxF2INormalizeSrcVals(v).raw)};
- #else
- const Vec128<int32_t> f64_to_i32{
- vec_signedo(detail::VsxF2INormalizeSrcVals(v).raw)};
- #endif
- const Rebind<int64_t, D> di64;
- const Vec128<int64_t> vi64 = BitCast(di64, f64_to_i32);
- return Vec64<int32_t>{vec_pack(vi64.raw, vi64.raw)};
- #endif
- }
- template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U32_D(D)>
- HWY_API Vec32<uint32_t> DemoteTo(D du32, Vec64<double> v) {
- #if HWY_S390X_HAVE_Z14
- const Rebind<uint64_t, decltype(du32)> du64;
- return DemoteTo(du32, ConvertTo(du64, v));
- #else
- (void)du32;
- return Vec32<uint32_t>{vec_unsignede(detail::VsxF2INormalizeSrcVals(v).raw)};
- #endif
- }
- template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)>
- HWY_API Vec64<uint32_t> DemoteTo(D du32, Vec128<double> v) {
- #if HWY_S390X_HAVE_Z14
- const Rebind<uint64_t, decltype(du32)> du64;
- return DemoteTo(du32, ConvertTo(du64, v));
- #else
- (void)du32;
- #if HWY_IS_LITTLE_ENDIAN
- const Vec128<uint32_t> f64_to_u32{
- vec_unsignede(detail::VsxF2INormalizeSrcVals(v).raw)};
- #else
- const Vec128<uint32_t> f64_to_u32{
- vec_unsignedo(detail::VsxF2INormalizeSrcVals(v).raw)};
- #endif
- const Rebind<uint64_t, D> du64;
- const Vec128<uint64_t> vu64 = BitCast(du64, f64_to_u32);
- return Vec64<uint32_t>{vec_pack(vu64.raw, vu64.raw)};
- #endif
- }
- #if HWY_S390X_HAVE_Z14
- namespace detail {
- template <class V, HWY_IF_I64(TFromV<V>)>
- HWY_INLINE VFromD<RebindToFloat<DFromV<V>>> ConvToF64WithRoundToOdd(V v) {
- __vector double raw_result;
- // Use inline assembly to do a round-to-odd I64->F64 conversion on Z14
- __asm__("vcdgb %0,%1,0,3" : "=v"(raw_result) : "v"(v.raw));
- return VFromD<RebindToFloat<DFromV<V>>>{raw_result};
- }
- template <class V, HWY_IF_U64(TFromV<V>)>
- HWY_INLINE VFromD<RebindToFloat<DFromV<V>>> ConvToF64WithRoundToOdd(V v) {
- __vector double raw_result;
- // Use inline assembly to do a round-to-odd U64->F64 conversion on Z14
- __asm__("vcdlgb %0,%1,0,3" : "=v"(raw_result) : "v"(v.raw));
- return VFromD<RebindToFloat<DFromV<V>>>{raw_result};
- }
- } // namespace detail
- #endif // HWY_S390X_HAVE_Z14
- template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
- HWY_API Vec32<float> DemoteTo(D df32, Vec64<int64_t> v) {
- #if HWY_S390X_HAVE_Z14
- return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
- #else // VSX
- (void)df32;
- return Vec32<float>{vec_floate(v.raw)};
- #endif
- }
- template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
- HWY_API Vec64<float> DemoteTo(D df32, Vec128<int64_t> v) {
- #if HWY_S390X_HAVE_Z14
- return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
- #else // VSX
- #if HWY_IS_LITTLE_ENDIAN
- const Vec128<float> i64_to_f32{vec_floate(v.raw)};
- #else
- const Vec128<float> i64_to_f32{vec_floato(v.raw)};
- #endif
- const RebindToUnsigned<decltype(df32)> du32;
- const Rebind<uint64_t, decltype(df32)> du64;
- return Vec64<float>{
- BitCast(df32, TruncateTo(du32, BitCast(du64, i64_to_f32))).raw};
- #endif
- }
- template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
- HWY_API Vec32<float> DemoteTo(D df32, Vec64<uint64_t> v) {
- #if HWY_S390X_HAVE_Z14
- return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
- #else // VSX
- (void)df32;
- return Vec32<float>{vec_floate(v.raw)};
- #endif
- }
- template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
- HWY_API Vec64<float> DemoteTo(D df32, Vec128<uint64_t> v) {
- #if HWY_S390X_HAVE_Z14
- return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
- #else // VSX
- #if HWY_IS_LITTLE_ENDIAN
- const Vec128<float> u64_to_f32{vec_floate(v.raw)};
- #else
- const Vec128<float> u64_to_f32{vec_floato(v.raw)};
- #endif
- const RebindToUnsigned<decltype(df32)> du;
- const Rebind<uint64_t, decltype(df32)> du64;
- return Vec64<float>{
- BitCast(df32, TruncateTo(du, BitCast(du64, u64_to_f32))).raw};
- #endif
- }
- // For already range-limited input [0, 255].
- template <size_t N>
- HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) {
- const Rebind<uint16_t, DFromV<decltype(v)>> du16;
- const Rebind<uint8_t, decltype(du16)> du8;
- return TruncateTo(du8, TruncateTo(du16, v));
- }
- // ------------------------------ Integer <=> fp (ShiftRight, OddEven)
- // Note: altivec.h vec_ct* currently contain C casts which triggers
- // -Wdeprecate-lax-vec-conv-all warnings, so disable them.
- #if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
- template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT),
- HWY_IF_V_SIZE_LE_D(D, 8)>
- HWY_API VFromD<D> ConvertTo(D df32,
- Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
- const Rebind<double, decltype(df32)> df64;
- return DemoteTo(df32, PromoteTo(df64, v));
- }
- template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT),
- HWY_IF_V_SIZE_D(D, 16)>
- HWY_API VFromD<D> ConvertTo(D df32, Vec128<FromT> v) {
- const RepartitionToWide<decltype(df32)> df64;
- const VFromD<D> vf32_lo{vec_floate(PromoteLowerTo(df64, v).raw)};
- const VFromD<D> vf32_hi{vec_floate(PromoteUpperTo(df64, v).raw)};
- return ConcatEven(df32, vf32_hi, vf32_lo);
- }
- #else // Z15 or PPC
- template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT)>
- HWY_API VFromD<D> ConvertTo(D /* tag */,
- Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
- HWY_DIAGNOSTICS(push)
- #if HWY_COMPILER_CLANG
- HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
- #endif
- #if HWY_S390X_HAVE_Z15
- return VFromD<D>{vec_float(v.raw)};
- #else
- return VFromD<D>{vec_ctf(v.raw, 0)};
- #endif
- HWY_DIAGNOSTICS(pop)
- }
- #endif // HWY_TARGET == HWY_Z14
- template <class D, typename FromT, HWY_IF_F64_D(D), HWY_IF_NOT_FLOAT(FromT),
- HWY_IF_T_SIZE_D(D, sizeof(FromT))>
- HWY_API VFromD<D> ConvertTo(D /* tag */,
- Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
- return VFromD<D>{vec_double(v.raw)};
- }
- // Truncates (rounds toward zero).
- #if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
- template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
- HWY_API VFromD<D> ConvertTo(D di32,
- Vec128<float, Rebind<float, D>().MaxLanes()> v) {
- const Rebind<int64_t, decltype(di32)> di64;
- return DemoteTo(di32, PromoteTo(di64, v));
- }
- template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_D(D, 16)>
- HWY_API VFromD<D> ConvertTo(D di32,
- Vec128<float, Rebind<float, D>().MaxLanes()> v) {
- const RepartitionToWide<decltype(di32)> di64;
- return OrderedDemote2To(di32, PromoteLowerTo(di64, v),
- PromoteUpperTo(di64, v));
- }
- #else // Z15 or PPC
- template <class D, HWY_IF_I32_D(D)>
- HWY_API VFromD<D> ConvertTo(D /* tag */,
- Vec128<float, Rebind<float, D>().MaxLanes()> v) {
- #if defined(__OPTIMIZE__)
- if (detail::IsConstantRawAltivecVect(v.raw)) {
- constexpr int32_t kMinI32 = LimitsMin<int32_t>();
- constexpr int32_t kMaxI32 = LimitsMax<int32_t>();
- return Dup128VecFromValues(
- D(),
- (v.raw[0] >= -2147483648.0f)
- ? ((v.raw[0] < 2147483648.0f) ? static_cast<int32_t>(v.raw[0])
- : kMaxI32)
- : ((v.raw[0] < 0) ? kMinI32 : 0),
- (v.raw[1] >= -2147483648.0f)
- ? ((v.raw[1] < 2147483648.0f) ? static_cast<int32_t>(v.raw[1])
- : kMaxI32)
- : ((v.raw[1] < 0) ? kMinI32 : 0),
- (v.raw[2] >= -2147483648.0f)
- ? ((v.raw[2] < 2147483648.0f) ? static_cast<int32_t>(v.raw[2])
- : kMaxI32)
- : ((v.raw[2] < 0) ? kMinI32 : 0),
- (v.raw[3] >= -2147483648.0f)
- ? ((v.raw[3] < 2147483648.0f) ? static_cast<int32_t>(v.raw[3])
- : kMaxI32)
- : ((v.raw[3] < 0) ? kMinI32 : 0));
- }
- #endif
- #if HWY_S390X_HAVE_Z15
- // Use inline assembly on Z15 to avoid undefined behavior if v[i] is not in
- // the range of an int32_t
- __vector signed int raw_result;
- __asm__("vcfeb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
- return VFromD<D>{raw_result};
- #else
- HWY_DIAGNOSTICS(push)
- #if HWY_COMPILER_CLANG
- HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
- #endif
- return VFromD<D>{vec_cts(v.raw, 0)};
- HWY_DIAGNOSTICS(pop)
- #endif // HWY_S390X_HAVE_Z15
- }
- #endif // HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
- template <class D, HWY_IF_I64_D(D)>
- HWY_API VFromD<D> ConvertTo(D /* tag */,
- Vec128<double, Rebind<double, D>().MaxLanes()> v) {
- #if defined(__OPTIMIZE__)
- if (detail::IsConstantRawAltivecVect(v.raw)) {
- constexpr int64_t kMinI64 = LimitsMin<int64_t>();
- constexpr int64_t kMaxI64 = LimitsMax<int64_t>();
- return Dup128VecFromValues(D(),
- (v.raw[0] >= -9223372036854775808.0)
- ? ((v.raw[0] < 9223372036854775808.0)
- ? static_cast<int64_t>(v.raw[0])
- : kMaxI64)
- : ((v.raw[0] < 0) ? kMinI64 : 0LL),
- (v.raw[1] >= -9223372036854775808.0)
- ? ((v.raw[1] < 9223372036854775808.0)
- ? static_cast<int64_t>(v.raw[1])
- : kMaxI64)
- : ((v.raw[1] < 0) ? kMinI64 : 0LL));
- }
- #endif
- // Use inline assembly to avoid undefined behavior if v[i] is not within the
- // range of an int64_t
- __vector signed long long raw_result;
- #if HWY_S390X_HAVE_Z14
- __asm__("vcgdb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
- #else
- __asm__("xvcvdpsxds %x0,%x1"
- : "=wa"(raw_result)
- : "wa"(detail::VsxF2INormalizeSrcVals(v).raw));
- #endif
- return VFromD<D>{raw_result};
- }
- #if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
- template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
- HWY_API VFromD<D> ConvertTo(D du32,
- Vec128<float, Rebind<float, D>().MaxLanes()> v) {
- const Rebind<uint64_t, decltype(du32)> du64;
- return DemoteTo(du32, PromoteTo(du64, v));
- }
- template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_D(D, 16)>
- HWY_API VFromD<D> ConvertTo(D du32,
- Vec128<float, Rebind<float, D>().MaxLanes()> v) {
- const RepartitionToWide<decltype(du32)> du64;
- return OrderedDemote2To(du32, PromoteLowerTo(du64, v),
- PromoteUpperTo(du64, v));
- }
- #else // Z15 or VSX
- template <class D, HWY_IF_U32_D(D)>
- HWY_API VFromD<D> ConvertTo(D /* tag */,
- Vec128<float, Rebind<float, D>().MaxLanes()> v) {
- #if defined(__OPTIMIZE__)
- if (detail::IsConstantRawAltivecVect(v.raw)) {
- constexpr uint32_t kMaxU32 = LimitsMax<uint32_t>();
- return Dup128VecFromValues(
- D(),
- (v.raw[0] >= 0.0f)
- ? ((v.raw[0] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[0])
- : kMaxU32)
- : 0,
- (v.raw[1] >= 0.0f)
- ? ((v.raw[1] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[1])
- : kMaxU32)
- : 0,
- (v.raw[2] >= 0.0f)
- ? ((v.raw[2] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[2])
- : kMaxU32)
- : 0,
- (v.raw[3] >= 0.0f)
- ? ((v.raw[3] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[3])
- : kMaxU32)
- : 0);
- }
- #endif
- #if HWY_S390X_HAVE_Z15
- // Use inline assembly on Z15 to avoid undefined behavior if v[i] is not in
- // the range of an uint32_t
- __vector unsigned int raw_result;
- __asm__("vclfeb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
- return VFromD<D>{raw_result};
- #else // VSX
- HWY_DIAGNOSTICS(push)
- #if HWY_COMPILER_CLANG
- HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
- #endif
- VFromD<D> result{vec_ctu(v.raw, 0)};
- HWY_DIAGNOSTICS(pop)
- return result;
- #endif // HWY_S390X_HAVE_Z15
- }
- #endif // HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
- template <class D, HWY_IF_U64_D(D)>
- HWY_API VFromD<D> ConvertTo(D /* tag */,
- Vec128<double, Rebind<double, D>().MaxLanes()> v) {
- HWY_DIAGNOSTICS(push)
- #if HWY_COMPILER_CLANG
- HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
- #endif
- #if defined(__OPTIMIZE__)
- if (detail::IsConstantRawAltivecVect(v.raw)) {
- constexpr uint64_t kMaxU64 = LimitsMax<uint64_t>();
- return Dup128VecFromValues(
- D(),
- (v.raw[0] >= 0.0) ? ((v.raw[0] < 18446744073709551616.0)
- ? static_cast<uint64_t>(v.raw[0])
- : kMaxU64)
- : 0,
- (v.raw[1] >= 0.0) ? ((v.raw[1] < 18446744073709551616.0)
- ? static_cast<uint64_t>(v.raw[1])
- : kMaxU64)
- : 0);
- }
- #endif
- // Use inline assembly to avoid undefined behavior if v[i] is not within the
- // range of an uint64_t
- __vector unsigned long long raw_result;
- #if HWY_S390X_HAVE_Z14
- __asm__("vclgdb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
- #else // VSX
- __asm__("xvcvdpuxds %x0,%x1"
- : "=wa"(raw_result)
- : "wa"(detail::VsxF2INormalizeSrcVals(v).raw));
- #endif
- return VFromD<D>{raw_result};
- }
- // ------------------------------ Floating-point rounding (ConvertTo)
- // Toward nearest integer, ties to even
- template <size_t N>
- HWY_API Vec128<float, N> Round(Vec128<float, N> v) {
- return Vec128<float, N>{vec_round(v.raw)};
- }
- template <size_t N>
- HWY_API Vec128<double, N> Round(Vec128<double, N> v) {
- #if HWY_S390X_HAVE_Z14
- return Vec128<double, N>{vec_round(v.raw)};
- #else
- return Vec128<double, N>{vec_rint(v.raw)};
- #endif
- }
- template <size_t N>
- HWY_API Vec128<int32_t, N> NearestInt(Vec128<float, N> v) {
- const DFromV<decltype(v)> d;
- const RebindToSigned<decltype(d)> di;
- return ConvertTo(di, Round(v));
- }
- // Toward zero, aka truncate
- template <typename T, size_t N, HWY_IF_FLOAT(T)>
- HWY_API Vec128<T, N> Trunc(Vec128<T, N> v) {
- return Vec128<T, N>{vec_trunc(v.raw)};
- }
- // Toward +infinity, aka ceiling
- template <typename T, size_t N, HWY_IF_FLOAT(T)>
- HWY_API Vec128<T, N> Ceil(Vec128<T, N> v) {
- return Vec128<T, N>{vec_ceil(v.raw)};
- }
- // Toward -infinity, aka floor
- template <typename T, size_t N, HWY_IF_FLOAT(T)>
- HWY_API Vec128<T, N> Floor(Vec128<T, N> v) {
- return Vec128<T, N>{vec_floor(v.raw)};
- }
- // ------------------------------ Floating-point classification
- template <typename T, size_t N>
- HWY_API Mask128<T, N> IsNaN(Vec128<T, N> v) {
- static_assert(IsFloat<T>(), "Only for float");
- return v != v;
- }
- template <typename T, size_t N>
- HWY_API Mask128<T, N> IsInf(Vec128<T, N> v) {
- static_assert(IsFloat<T>(), "Only for float");
- using TU = MakeUnsigned<T>;
- const DFromV<decltype(v)> d;
- const RebindToUnsigned<decltype(d)> du;
- const VFromD<decltype(du)> vu = BitCast(du, v);
- // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
- return RebindMask(
- d,
- Eq(Add(vu, vu), Set(du, static_cast<TU>(hwy::MaxExponentTimes2<T>()))));
- }
- // Returns whether normal/subnormal/zero.
- template <typename T, size_t N>
- HWY_API Mask128<T, N> IsFinite(Vec128<T, N> v) {
- static_assert(IsFloat<T>(), "Only for float");
- using TU = MakeUnsigned<T>;
- const DFromV<decltype(v)> d;
- const RebindToUnsigned<decltype(d)> du;
- const VFromD<decltype(du)> vu = BitCast(du, v);
- // 'Shift left' to clear the sign bit, check for exponent<max.
- return RebindMask(
- d,
- Lt(Add(vu, vu), Set(du, static_cast<TU>(hwy::MaxExponentTimes2<T>()))));
- }
- // ================================================== CRYPTO
- #if !HWY_S390X_HAVE_Z14 && !defined(HWY_DISABLE_PPC8_CRYPTO)
- // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
- #ifdef HWY_NATIVE_AES
- #undef HWY_NATIVE_AES
- #else
- #define HWY_NATIVE_AES
- #endif
- namespace detail {
- #if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1600
- using CipherTag = Full128<uint64_t>;
- #else
- using CipherTag = Full128<uint8_t>;
- #endif // !HWY_COMPILER_CLANG
- using CipherVec = VFromD<CipherTag>;
- } // namespace detail
- HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
- Vec128<uint8_t> round_key) {
- const detail::CipherTag dc;
- const Full128<uint8_t> du8;
- #if HWY_IS_LITTLE_ENDIAN
- return Reverse(du8,
- BitCast(du8, detail::CipherVec{vec_cipher_be(
- BitCast(dc, Reverse(du8, state)).raw,
- BitCast(dc, Reverse(du8, round_key)).raw)}));
- #else
- return BitCast(du8, detail::CipherVec{vec_cipher_be(
- BitCast(dc, state).raw, BitCast(dc, round_key).raw)});
- #endif
- }
- HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state,
- Vec128<uint8_t> round_key) {
- const detail::CipherTag dc;
- const Full128<uint8_t> du8;
- #if HWY_IS_LITTLE_ENDIAN
- return Reverse(du8,
- BitCast(du8, detail::CipherVec{vec_cipherlast_be(
- BitCast(dc, Reverse(du8, state)).raw,
- BitCast(dc, Reverse(du8, round_key)).raw)}));
- #else
- return BitCast(du8, detail::CipherVec{vec_cipherlast_be(
- BitCast(dc, state).raw, BitCast(dc, round_key).raw)});
- #endif
- }
- HWY_API Vec128<uint8_t> AESRoundInv(Vec128<uint8_t> state,
- Vec128<uint8_t> round_key) {
- const detail::CipherTag dc;
- const Full128<uint8_t> du8;
- #if HWY_IS_LITTLE_ENDIAN
- return Xor(Reverse(du8, BitCast(du8, detail::CipherVec{vec_ncipher_be(
- BitCast(dc, Reverse(du8, state)).raw,
- Zero(dc).raw)})),
- round_key);
- #else
- return Xor(BitCast(du8, detail::CipherVec{vec_ncipher_be(
- BitCast(dc, state).raw, Zero(dc).raw)}),
- round_key);
- #endif
- }
- HWY_API Vec128<uint8_t> AESLastRoundInv(Vec128<uint8_t> state,
- Vec128<uint8_t> round_key) {
- const detail::CipherTag dc;
- const Full128<uint8_t> du8;
- #if HWY_IS_LITTLE_ENDIAN
- return Reverse(du8,
- BitCast(du8, detail::CipherVec{vec_ncipherlast_be(
- BitCast(dc, Reverse(du8, state)).raw,
- BitCast(dc, Reverse(du8, round_key)).raw)}));
- #else
- return BitCast(du8, detail::CipherVec{vec_ncipherlast_be(
- BitCast(dc, state).raw, BitCast(dc, round_key).raw)});
- #endif
- }
- HWY_API Vec128<uint8_t> AESInvMixColumns(Vec128<uint8_t> state) {
- const Full128<uint8_t> du8;
- const auto zero = Zero(du8);
- // PPC8/PPC9/PPC10 does not have a single instruction for the AES
- // InvMixColumns operation like ARM Crypto, SVE2 Crypto, or AES-NI do.
- // The AESInvMixColumns operation can be carried out on PPC8/PPC9/PPC10
- // by doing an AESLastRound operation with a zero round_key followed by an
- // AESRoundInv operation with a zero round_key.
- return AESRoundInv(AESLastRound(state, zero), zero);
- }
- template <uint8_t kRcon>
- HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) {
- constexpr __vector unsigned char kRconXorMask = {0, 0, 0, 0, kRcon, 0, 0, 0,
- 0, 0, 0, 0, kRcon, 0, 0, 0};
- constexpr __vector unsigned char kRotWordShuffle = {
- 4, 5, 6, 7, 5, 6, 7, 4, 12, 13, 14, 15, 13, 14, 15, 12};
- const detail::CipherTag dc;
- const Full128<uint8_t> du8;
- const auto sub_word_result =
- BitCast(du8, detail::CipherVec{vec_sbox_be(BitCast(dc, v).raw)});
- const auto rot_word_result =
- TableLookupBytes(sub_word_result, Vec128<uint8_t>{kRotWordShuffle});
- return Xor(rot_word_result, Vec128<uint8_t>{kRconXorMask});
- }
- template <size_t N>
- HWY_API Vec128<uint64_t, N> CLMulLower(Vec128<uint64_t, N> a,
- Vec128<uint64_t, N> b) {
- // NOTE: Lane 1 of both a and b need to be zeroed out for the
- // vec_pmsum_be operation below as the vec_pmsum_be operation
- // does a carryless multiplication of each 64-bit half and then
- // adds the two halves using an bitwise XOR operation.
- const DFromV<decltype(a)> d;
- const auto zero = Zero(d);
- using VU64 = __vector unsigned long long;
- const VU64 pmsum_result = reinterpret_cast<VU64>(
- vec_pmsum_be(InterleaveLower(a, zero).raw, InterleaveLower(b, zero).raw));
- #if HWY_IS_LITTLE_ENDIAN
- return Vec128<uint64_t, N>{pmsum_result};
- #else
- // Need to swap the two halves of pmsum_result on big-endian targets as
- // the upper 64 bits of the carryless multiplication result are in lane 0 of
- // pmsum_result and the lower 64 bits of the carryless multiplication result
- // are in lane 1 of mul128_result
- return Vec128<uint64_t, N>{vec_sld(pmsum_result, pmsum_result, 8)};
- #endif
- }
- template <size_t N>
- HWY_API Vec128<uint64_t, N> CLMulUpper(Vec128<uint64_t, N> a,
- Vec128<uint64_t, N> b) {
- // NOTE: Lane 0 of both a and b need to be zeroed out for the
- // vec_pmsum_be operation below as the vec_pmsum_be operation
- // does a carryless multiplication of each 64-bit half and then
- // adds the two halves using an bitwise XOR operation.
- const DFromV<decltype(a)> d;
- const auto zero = Zero(d);
- using VU64 = __vector unsigned long long;
- const VU64 pmsum_result = reinterpret_cast<VU64>(
- vec_pmsum_be(vec_mergel(zero.raw, a.raw), vec_mergel(zero.raw, b.raw)));
- #if HWY_IS_LITTLE_ENDIAN
- return Vec128<uint64_t, N>{pmsum_result};
- #else
- // Need to swap the two halves of pmsum_result on big-endian targets as
- // the upper 64 bits of the carryless multiplication result are in lane 0 of
- // pmsum_result and the lower 64 bits of the carryless multiplication result
- // are in lane 1 of mul128_result
- return Vec128<uint64_t, N>{vec_sld(pmsum_result, pmsum_result, 8)};
- #endif
- }
- #endif // !defined(HWY_DISABLE_PPC8_CRYPTO)
- // ================================================== MISC
- // ------------------------------ LoadMaskBits (TestBit)
- namespace detail {
- template <class D, HWY_IF_T_SIZE_D(D, 1)>
- HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) {
- #if HWY_PPC_HAVE_10
- const Vec128<uint8_t> mask_vec{vec_genbm(mask_bits)};
- #if HWY_IS_LITTLE_ENDIAN
- return MFromD<D>{MaskFromVec(mask_vec).raw};
- #else
- return MFromD<D>{MaskFromVec(Reverse(Full128<uint8_t>(), mask_vec)).raw};
- #endif // HWY_IS_LITTLE_ENDIAN
- #else // PPC9 or earlier
- const Full128<uint8_t> du8;
- const Full128<uint16_t> du16;
- const Vec128<uint8_t> vbits =
- BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits)));
- // Replicate bytes 8x such that each byte contains the bit that governs it.
- #if HWY_IS_LITTLE_ENDIAN
- const __vector unsigned char kRep8 = {0, 0, 0, 0, 0, 0, 0, 0,
- 1, 1, 1, 1, 1, 1, 1, 1};
- #else
- const __vector unsigned char kRep8 = {1, 1, 1, 1, 1, 1, 1, 1,
- 0, 0, 0, 0, 0, 0, 0, 0};
- #endif // HWY_IS_LITTLE_ENDIAN
- const Vec128<uint8_t> rep8{vec_perm(vbits.raw, vbits.raw, kRep8)};
- const __vector unsigned char kBit = {1, 2, 4, 8, 16, 32, 64, 128,
- 1, 2, 4, 8, 16, 32, 64, 128};
- return MFromD<D>{TestBit(rep8, Vec128<uint8_t>{kBit}).raw};
- #endif // HWY_PPC_HAVE_10
- }
- template <class D, HWY_IF_T_SIZE_D(D, 2)>
- HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) {
- #if HWY_PPC_HAVE_10
- const Vec128<uint16_t> mask_vec{vec_genhm(mask_bits)};
- #if HWY_IS_LITTLE_ENDIAN
- return MFromD<D>{MaskFromVec(mask_vec).raw};
- #else
- return MFromD<D>{MaskFromVec(Reverse(Full128<uint16_t>(), mask_vec)).raw};
- #endif // HWY_IS_LITTLE_ENDIAN
- #else // PPC9 or earlier
- const __vector unsigned short kBit = {1, 2, 4, 8, 16, 32, 64, 128};
- const auto vmask_bits =
- Set(Full128<uint16_t>(), static_cast<uint16_t>(mask_bits));
- return MFromD<D>{TestBit(vmask_bits, Vec128<uint16_t>{kBit}).raw};
- #endif // HWY_PPC_HAVE_10
- }
- template <class D, HWY_IF_T_SIZE_D(D, 4)>
- HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) {
- #if HWY_PPC_HAVE_10
- const Vec128<uint32_t> mask_vec{vec_genwm(mask_bits)};
- #if HWY_IS_LITTLE_ENDIAN
- return MFromD<D>{MaskFromVec(mask_vec).raw};
- #else
- return MFromD<D>{MaskFromVec(Reverse(Full128<uint32_t>(), mask_vec)).raw};
- #endif // HWY_IS_LITTLE_ENDIAN
- #else // PPC9 or earlier
- const __vector unsigned int kBit = {1, 2, 4, 8};
- const auto vmask_bits =
- Set(Full128<uint32_t>(), static_cast<uint32_t>(mask_bits));
- return MFromD<D>{TestBit(vmask_bits, Vec128<uint32_t>{kBit}).raw};
- #endif // HWY_PPC_HAVE_10
- }
- template <class D, HWY_IF_T_SIZE_D(D, 8)>
- HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) {
- #if HWY_PPC_HAVE_10
- const Vec128<uint64_t> mask_vec{vec_gendm(mask_bits)};
- #if HWY_IS_LITTLE_ENDIAN
- return MFromD<D>{MaskFromVec(mask_vec).raw};
- #else
- return MFromD<D>{MaskFromVec(Reverse(Full128<uint64_t>(), mask_vec)).raw};
- #endif // HWY_IS_LITTLE_ENDIAN
- #else // PPC9 or earlier
- const __vector unsigned long long kBit = {1, 2};
- const auto vmask_bits =
- Set(Full128<uint64_t>(), static_cast<uint64_t>(mask_bits));
- return MFromD<D>{TestBit(vmask_bits, Vec128<uint64_t>{kBit}).raw};
- #endif // HWY_PPC_HAVE_10
- }
- } // namespace detail
- // `p` points to at least 8 readable bytes, not all of which need be valid.
- template <class D, HWY_IF_LANES_LE_D(D, 8)>
- HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
- // If there are 8 or fewer lanes, simply convert bits[0] to a uint64_t
- uint64_t mask_bits = bits[0];
- constexpr size_t kN = MaxLanes(d);
- if (kN < 8) mask_bits &= (1u << kN) - 1;
- return detail::LoadMaskBits128(d, mask_bits);
- }
- template <class D, HWY_IF_LANES_D(D, 16)>
- HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
- // First, copy the mask bits to a uint16_t as there as there are at most
- // 16 lanes in a vector.
- // Copying the mask bits to a uint16_t first will also ensure that the
- // mask bits are loaded into the lower 16 bits on big-endian PPC targets.
- uint16_t u16_mask_bits;
- CopyBytes<sizeof(uint16_t)>(bits, &u16_mask_bits);
- #if HWY_IS_LITTLE_ENDIAN
- return detail::LoadMaskBits128(d, u16_mask_bits);
- #else
- // On big-endian targets, u16_mask_bits need to be byte swapped as bits
- // contains the mask bits in little-endian byte order
- // GCC/Clang will optimize the load of u16_mask_bits and byte swap to a
- // single lhbrx instruction on big-endian PPC targets when optimizations
- // are enabled.
- #if HWY_HAS_BUILTIN(__builtin_bswap16)
- return detail::LoadMaskBits128(d, __builtin_bswap16(u16_mask_bits));
- #else
- return detail::LoadMaskBits128(
- d, static_cast<uint16_t>((u16_mask_bits << 8) | (u16_mask_bits >> 8)));
- #endif
- #endif
- }
- template <typename T>
- struct CompressIsPartition {
- // generic_ops-inl does not guarantee IsPartition for 8-bit.
- enum { value = (sizeof(T) != 1) };
- };
- // ------------------------------ Dup128MaskFromMaskBits
- template <class D>
- HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
- constexpr size_t kN = MaxLanes(d);
- if (kN < 8) mask_bits &= (1u << kN) - 1;
- return detail::LoadMaskBits128(d, mask_bits);
- }
- // ------------------------------ StoreMaskBits
- namespace detail {
- #if !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN
- // fallback for missing vec_extractm
- template <size_t N>
- HWY_INLINE uint64_t ExtractSignBits(Vec128<uint8_t, N> sign_bits,
- __vector unsigned char bit_shuffle) {
- // clang POWER8 and 9 targets appear to differ in their return type of
- // vec_vbpermq: unsigned or signed, so cast to avoid a warning.
- using VU64 = detail::Raw128<uint64_t>::type;
- #if HWY_S390X_HAVE_Z14
- const Vec128<uint64_t> extracted{
- reinterpret_cast<VU64>(vec_bperm_u128(sign_bits.raw, bit_shuffle))};
- #else
- const Vec128<uint64_t> extracted{
- reinterpret_cast<VU64>(vec_vbpermq(sign_bits.raw, bit_shuffle))};
- #endif
- return extracted.raw[HWY_IS_LITTLE_ENDIAN];
- }
- #endif // !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN
- template <typename T, size_t N>
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128<T, N> mask) {
- const DFromM<decltype(mask)> d;
- const Repartition<uint8_t, decltype(d)> du8;
- const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
- #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
- return static_cast<uint64_t>(vec_extractm(sign_bits.raw));
- #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
- const __vector unsigned char kBitShuffle = {120, 112, 104, 96, 88, 80, 72, 64,
- 56, 48, 40, 32, 24, 16, 8, 0};
- return ExtractSignBits(sign_bits, kBitShuffle);
- #endif // HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
- }
- template <typename T, size_t N>
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128<T, N> mask) {
- const DFromM<decltype(mask)> d;
- const RebindToUnsigned<decltype(d)> du;
- const Repartition<uint8_t, decltype(d)> du8;
- const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
- #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
- return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
- #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
- (void)du;
- #if HWY_IS_LITTLE_ENDIAN
- const __vector unsigned char kBitShuffle = {
- 112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128};
- #else
- const __vector unsigned char kBitShuffle = {
- 128, 128, 128, 128, 128, 128, 128, 128, 112, 96, 80, 64, 48, 32, 16, 0};
- #endif
- return ExtractSignBits(sign_bits, kBitShuffle);
- #endif // HWY_PPC_HAVE_10
- }
- template <typename T, size_t N>
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128<T, N> mask) {
- const DFromM<decltype(mask)> d;
- const RebindToUnsigned<decltype(d)> du;
- const Repartition<uint8_t, decltype(d)> du8;
- const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
- #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
- return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
- #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
- (void)du;
- #if HWY_IS_LITTLE_ENDIAN
- const __vector unsigned char kBitShuffle = {96, 64, 32, 0, 128, 128,
- 128, 128, 128, 128, 128, 128,
- 128, 128, 128, 128};
- #else
- const __vector unsigned char kBitShuffle = {128, 128, 128, 128, 128, 128,
- 128, 128, 128, 128, 128, 128,
- 96, 64, 32, 0};
- #endif
- return ExtractSignBits(sign_bits, kBitShuffle);
- #endif // HWY_PPC_HAVE_10
- }
- template <typename T, size_t N>
- HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128<T, N> mask) {
- const DFromM<decltype(mask)> d;
- const RebindToUnsigned<decltype(d)> du;
- const Repartition<uint8_t, decltype(d)> du8;
- const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
- #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
- return static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw));
- #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10
- (void)du;
- #if HWY_IS_LITTLE_ENDIAN
- const __vector unsigned char kBitShuffle = {64, 0, 128, 128, 128, 128,
- 128, 128, 128, 128, 128, 128,
- 128, 128, 128, 128};
- #else
- const __vector unsigned char kBitShuffle = {128, 128, 128, 128, 128, 128,
- 128, 128, 128, 128, 128, 128,
- 128, 128, 64, 0};
- #endif
- return ExtractSignBits(sign_bits, kBitShuffle);
- #endif // HWY_PPC_HAVE_10
- }
- // Returns the lowest N of the mask bits.
- template <typename T, size_t N>
- constexpr uint64_t OnlyActive(uint64_t mask_bits) {
- return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
- }
- template <typename T, size_t N>
- HWY_INLINE uint64_t BitsFromMask(Mask128<T, N> mask) {
- return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
- }
- } // namespace detail
- // `p` points to at least 8 writable bytes.
- template <class D, HWY_IF_LANES_LE_D(D, 8)>
- HWY_API size_t StoreMaskBits(D /*d*/, MFromD<D> mask, uint8_t* bits) {
- // For vectors with 8 or fewer lanes, simply cast the result of BitsFromMask
- // to an uint8_t and store the result in bits[0].
- bits[0] = static_cast<uint8_t>(detail::BitsFromMask(mask));
- return sizeof(uint8_t);
- }
- template <class D, HWY_IF_LANES_D(D, 16)>
- HWY_API size_t StoreMaskBits(D /*d*/, MFromD<D> mask, uint8_t* bits) {
- const auto mask_bits = detail::BitsFromMask(mask);
- // First convert mask_bits to a uint16_t as we only want to store
- // the lower 16 bits of mask_bits as there are 16 lanes in mask.
- // Converting mask_bits to a uint16_t first will also ensure that
- // the lower 16 bits of mask_bits are stored instead of the upper 16 bits
- // of mask_bits on big-endian PPC targets.
- #if HWY_IS_LITTLE_ENDIAN
- const uint16_t u16_mask_bits = static_cast<uint16_t>(mask_bits);
- #else
- // On big-endian targets, the bytes of mask_bits need to be swapped
- // as StoreMaskBits expects the mask bits to be stored in little-endian
- // byte order.
- // GCC will also optimize the byte swap and CopyBytes operations below
- // to a single sthbrx instruction when optimizations are enabled on
- // big-endian PPC targets
- #if HWY_HAS_BUILTIN(__builtin_bswap16)
- const uint16_t u16_mask_bits =
- __builtin_bswap16(static_cast<uint16_t>(mask_bits));
- #else
- const uint16_t u16_mask_bits = static_cast<uint16_t>(
- (mask_bits << 8) | (static_cast<uint16_t>(mask_bits) >> 8));
- #endif
- #endif
- CopyBytes<sizeof(uint16_t)>(&u16_mask_bits, bits);
- return sizeof(uint16_t);
- }
- // ------------------------------ Mask testing
- template <class D, HWY_IF_V_SIZE_D(D, 16)>
- HWY_API bool AllFalse(D d, MFromD<D> mask) {
- const RebindToUnsigned<decltype(d)> du;
- return static_cast<bool>(
- vec_all_eq(VecFromMask(du, RebindMask(du, mask)).raw, Zero(du).raw));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 16)>
- HWY_API bool AllTrue(D d, MFromD<D> mask) {
- const RebindToUnsigned<decltype(d)> du;
- using TU = TFromD<decltype(du)>;
- return static_cast<bool>(vec_all_eq(VecFromMask(du, RebindMask(du, mask)).raw,
- Set(du, hwy::LimitsMax<TU>()).raw));
- }
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
- HWY_API bool AllFalse(D d, MFromD<D> mask) {
- const Full128<TFromD<D>> d_full;
- constexpr size_t kN = MaxLanes(d);
- return AllFalse(d_full,
- And(MFromD<decltype(d_full)>{mask.raw}, FirstN(d_full, kN)));
- }
- template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
- HWY_API bool AllTrue(D d, MFromD<D> mask) {
- const Full128<TFromD<D>> d_full;
- constexpr size_t kN = MaxLanes(d);
- return AllTrue(
- d_full, Or(MFromD<decltype(d_full)>{mask.raw}, Not(FirstN(d_full, kN))));
- }
- template <class D>
- HWY_API size_t CountTrue(D /* tag */, MFromD<D> mask) {
- return PopCount(detail::BitsFromMask(mask));
- }
- #if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
- namespace detail {
- template <class V>
- static HWY_INLINE size_t VsxCntlzLsbb(V v) {
- #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200 && \
- HWY_IS_LITTLE_ENDIAN
- // Use inline assembly to work around bug in GCC 11 and earlier on
- // little-endian PPC9
- int idx;
- __asm__("vctzlsbb %0,%1" : "=r"(idx) : "v"(v.raw));
- return static_cast<size_t>(idx);
- #else
- return static_cast<size_t>(vec_cntlz_lsbb(v.raw));
- #endif
- }
- template <class V>
- static HWY_INLINE size_t VsxCnttzLsbb(V v) {
- #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200 && \
- HWY_IS_LITTLE_ENDIAN
- // Use inline assembly to work around bug in GCC 11 and earlier on
- // little-endian PPC9
- int idx;
- __asm__("vclzlsbb %0,%1" : "=r"(idx) : "v"(v.raw));
- return static_cast<size_t>(idx);
- #else
- return static_cast<size_t>(vec_cnttz_lsbb(v.raw));
- #endif
- }
- } // namespace detail
- #endif
- template <class D, typename T = TFromD<D>>
- HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
- // For little-endian PPC10, BitsFromMask is already efficient.
- #if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
- if (detail::IsFull(d)) {
- const Repartition<uint8_t, D> d8;
- const auto bytes = BitCast(d8, VecFromMask(d, mask));
- return detail::VsxCntlzLsbb(bytes) / sizeof(T);
- }
- #endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
- (void)d;
- return Num0BitsBelowLS1Bit_Nonzero64(detail::BitsFromMask(mask));
- }
- template <class D, typename T = TFromD<D>>
- HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
- // For little-endian PPC10, BitsFromMask is already efficient.
- #if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
- constexpr size_t kN = 16 / sizeof(T);
- if (detail::IsFull(d)) {
- const Repartition<uint8_t, D> d8;
- const auto bytes = BitCast(d8, VecFromMask(d, mask));
- const size_t idx = detail::VsxCntlzLsbb(bytes) / sizeof(T);
- return idx == kN ? -1 : static_cast<intptr_t>(idx);
- }
- #endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
- (void)d;
- const uint64_t mask_bits = detail::BitsFromMask(mask);
- return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1;
- }
- template <class D, typename T = TFromD<D>>
- HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
- // For little-endian PPC10, BitsFromMask is already efficient.
- #if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
- if (detail::IsFull(d)) {
- const Repartition<uint8_t, D> d8;
- const auto bytes = BitCast(d8, VecFromMask(d, mask));
- const size_t idx = detail::VsxCnttzLsbb(bytes) / sizeof(T);
- return 16 / sizeof(T) - 1 - idx;
- }
- #endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
- (void)d;
- return 63 - Num0BitsAboveMS1Bit_Nonzero64(detail::BitsFromMask(mask));
- }
- template <class D, typename T = TFromD<D>>
- HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
- // For little-endian PPC10, BitsFromMask is already efficient.
- #if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
- constexpr size_t kN = 16 / sizeof(T);
- if (detail::IsFull(d)) {
- const Repartition<uint8_t, D> d8;
- const auto bytes = BitCast(d8, VecFromMask(d, mask));
- const size_t idx = detail::VsxCnttzLsbb(bytes) / sizeof(T);
- return idx == kN ? -1 : static_cast<intptr_t>(kN - 1 - idx);
- }
- #endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
- (void)d;
- const uint64_t mask_bits = detail::BitsFromMask(mask);
- return mask_bits ? intptr_t(63 - Num0BitsAboveMS1Bit_Nonzero64(mask_bits))
- : -1;
- }
- // ------------------------------ Compress, CompressBits
- namespace detail {
- #if HWY_PPC_HAVE_10
- template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 1)>
- HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
- constexpr unsigned kGenPcvmMode =
- (kIsCompress ? 1u : 0u) | (HWY_IS_LITTLE_ENDIAN ? 2u : 0u);
- // Inline assembly is used instead of the vec_genpcvm intrinsic to work around
- // compiler bugs on little-endian PPC10
- typename detail::Raw128<TFromD<D>>::type idx;
- __asm__("xxgenpcvbm %x0, %1, %2"
- : "=wa"(idx)
- : "v"(mask.raw), "i"(kGenPcvmMode));
- return VFromD<decltype(d)>{idx};
- }
- template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 2)>
- HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
- constexpr unsigned kGenPcvmMode =
- (kIsCompress ? 1u : 0u) | (HWY_IS_LITTLE_ENDIAN ? 2u : 0u);
- // Inline assembly is used instead of the vec_genpcvm intrinsic to work around
- // compiler bugs on little-endian PPC10
- typename detail::Raw128<TFromD<D>>::type idx;
- __asm__("xxgenpcvhm %x0, %1, %2"
- : "=wa"(idx)
- : "v"(mask.raw), "i"(kGenPcvmMode));
- return VFromD<decltype(d)>{idx};
- }
- template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 4)>
- HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
- constexpr unsigned kGenPcvmMode =
- (kIsCompress ? 1u : 0u) | (HWY_IS_LITTLE_ENDIAN ? 2u : 0u);
- // Inline assembly is used instead of the vec_genpcvm intrinsic to work around
- // compiler bugs on little-endian PPC10
- typename detail::Raw128<TFromD<D>>::type idx;
- __asm__("xxgenpcvwm %x0, %1, %2"
- : "=wa"(idx)
- : "v"(mask.raw), "i"(kGenPcvmMode));
- return VFromD<decltype(d)>{idx};
- }
- #endif
- // Also works for N < 8 because the first 16 4-tuples only reference bytes 0-6.
- template <class D, HWY_IF_T_SIZE_D(D, 2)>
- HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
- HWY_DASSERT(mask_bits < 256);
- const Rebind<uint8_t, decltype(d)> d8;
- const Twice<decltype(d8)> d8t;
- const RebindToUnsigned<decltype(d)> du;
- // To reduce cache footprint, store lane indices and convert to byte indices
- // (2*lane + 0..1), with the doubling baked into the table. It's not clear
- // that the additional cost of unpacking nibbles is worthwhile.
- alignas(16) static constexpr uint8_t table[2048] = {
- // PrintCompress16x8Tables
- 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
- 2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
- 4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, //
- 2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
- 6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, //
- 2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, //
- 4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, //
- 2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
- 8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, //
- 2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, //
- 4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, //
- 2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, //
- 6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, //
- 2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, //
- 4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, //
- 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
- 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, //
- 2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, //
- 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, //
- 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, //
- 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, //
- 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, //
- 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, //
- 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, //
- 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, //
- 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, //
- 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, //
- 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, //
- 6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, //
- 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, //
- 4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, //
- 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
- 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, //
- 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, //
- 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, //
- 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, //
- 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, //
- 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, //
- 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, //
- 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, //
- 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, //
- 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, //
- 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, //
- 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, //
- 6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, //
- 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, //
- 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, //
- 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, //
- 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, //
- 2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, //
- 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, //
- 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, //
- 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, //
- 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, //
- 4, 6, 10, 12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, //
- 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, //
- 8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, //
- 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, //
- 4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, //
- 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, //
- 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, //
- 2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, //
- 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, //
- 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
- 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, //
- 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, //
- 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, //
- 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, //
- 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, //
- 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, //
- 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, //
- 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, //
- 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, //
- 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, //
- 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, //
- 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, //
- 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, //
- 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, //
- 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, //
- 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, //
- 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, //
- 2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, //
- 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, //
- 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, //
- 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, //
- 2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, //
- 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, //
- 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, //
- 8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, //
- 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, //
- 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, //
- 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, //
- 6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, //
- 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, //
- 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, //
- 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, //
- 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, //
- 2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, //
- 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, //
- 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, //
- 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, //
- 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, //
- 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, //
- 2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, //
- 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, //
- 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, //
- 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, //
- 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, //
- 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, //
- 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 10, //
- 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, //
- 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, //
- 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, //
- 2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, //
- 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, //
- 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, //
- 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, //
- 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, //
- 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, //
- 2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, //
- 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, //
- 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, //
- 4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, //
- 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, //
- 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, //
- 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, //
- 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, //
- 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14};
- const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw};
- const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx);
- constexpr uint16_t kPairIndexIncrement =
- HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001;
- return BitCast(d, pairs + Set(du, kPairIndexIncrement));
- }
- template <class D, HWY_IF_T_SIZE_D(D, 2)>
- HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
- HWY_DASSERT(mask_bits < 256);
- const Rebind<uint8_t, decltype(d)> d8;
- const Twice<decltype(d8)> d8t;
- const RebindToUnsigned<decltype(d)> du;
- // To reduce cache footprint, store lane indices and convert to byte indices
- // (2*lane + 0..1), with the doubling baked into the table. It's not clear
- // that the additional cost of unpacking nibbles is worthwhile.
- alignas(16) static constexpr uint8_t table[2048] = {
- // PrintCompressNot16x8Tables
- 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, //
- 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, //
- 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, //
- 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, //
- 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, //
- 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, //
- 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, //
- 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, //
- 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, //
- 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, //
- 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, //
- 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, //
- 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, //
- 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, //
- 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, //
- 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, //
- 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, //
- 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, //
- 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, //
- 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, //
- 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, //
- 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, //
- 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, //
- 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, //
- 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, //
- 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, //
- 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, //
- 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, //
- 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, //
- 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, //
- 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, //
- 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, //
- 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, //
- 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, //
- 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, //
- 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, //
- 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, //
- 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, //
- 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, //
- 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, //
- 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, //
- 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, //
- 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, //
- 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, //
- 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, //
- 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, //
- 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, //
- 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, //
- 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, //
- 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, //
- 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, //
- 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, //
- 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, //
- 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, //
- 0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, //
- 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, //
- 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, //
- 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, //
- 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, //
- 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, //
- 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, //
- 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, //
- 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, //
- 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, //
- 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, //
- 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, //
- 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, //
- 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, //
- 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, //
- 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, //
- 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, //
- 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, //
- 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, //
- 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, //
- 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, //
- 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, //
- 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, //
- 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, //
- 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, //
- 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, //
- 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, //
- 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, //
- 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, //
- 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, //
- 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, //
- 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, //
- 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, //
- 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, //
- 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, //
- 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, //
- 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, //
- 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, //
- 0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, //
- 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, //
- 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, //
- 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, //
- 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, //
- 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, //
- 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, //
- 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, //
- 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, //
- 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, //
- 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, //
- 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, //
- 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, //
- 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, //
- 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, //
- 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, //
- 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, //
- 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14, //
- 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, //
- 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, //
- 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, //
- 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, //
- 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, //
- 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, //
- 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, //
- 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, //
- 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, //
- 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, //
- 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, //
- 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, //
- 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, //
- 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, //
- 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, //
- 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, //
- 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, //
- 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14};
- const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw};
- const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx);
- constexpr uint16_t kPairIndexIncrement =
- HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001;
- return BitCast(d, pairs + Set(du, kPairIndexIncrement));
- }
- template <class D, HWY_IF_T_SIZE_D(D, 4)>
- HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
- HWY_DASSERT(mask_bits < 16);
- // There are only 4 lanes, so we can afford to load the index vector directly.
- alignas(16) static constexpr uint8_t u8_indices[256] = {
- // PrintCompress32x4Tables
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
- 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, //
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
- 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, //
- 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, //
- 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, //
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
- 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, //
- 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, //
- 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, //
- 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, //
- 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
- 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, //
- 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
- const Repartition<uint8_t, decltype(d)> d8;
- return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
- }
- template <class D, HWY_IF_T_SIZE_D(D, 4)>
- HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
- HWY_DASSERT(mask_bits < 16);
- // There are only 4 lanes, so we can afford to load the index vector directly.
- alignas(16) static constexpr uint8_t u8_indices[256] = {
- // PrintCompressNot32x4Tables
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
- 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
- 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
- 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
- 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
- 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
- 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
- 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
- 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
- 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
- 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
- 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
- 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
- 12, 13, 14, 15};
- const Repartition<uint8_t, decltype(d)> d8;
- return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
- }
- template <class D, HWY_IF_T_SIZE_D(D, 8)>
- HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
- HWY_DASSERT(mask_bits < 4);
- // There are only 2 lanes, so we can afford to load the index vector directly.
- alignas(16) static constexpr uint8_t u8_indices[64] = {
- // PrintCompress64x2Tables
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
- const Repartition<uint8_t, decltype(d)> d8;
- return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
- }
- template <class D, HWY_IF_T_SIZE_D(D, 8)>
- HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
- HWY_DASSERT(mask_bits < 4);
- // There are only 2 lanes, so we can afford to load the index vector directly.
- alignas(16) static constexpr uint8_t u8_indices[64] = {
- // PrintCompressNot64x2Tables
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
- const Repartition<uint8_t, decltype(d)> d8;
- return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
- }
- template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
- HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, uint64_t mask_bits) {
- const DFromV<decltype(v)> d;
- const RebindToUnsigned<decltype(d)> du;
- HWY_DASSERT(mask_bits < (1ull << N));
- const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
- return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
- }
- template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
- HWY_API Vec128<T, N> CompressNotBits(Vec128<T, N> v, uint64_t mask_bits) {
- const DFromV<decltype(v)> d;
- const RebindToUnsigned<decltype(d)> du;
- HWY_DASSERT(mask_bits < (1ull << N));
- const auto indices = BitCast(du, detail::IndicesFromNotBits128(d, mask_bits));
- return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
- }
- } // namespace detail
- // Single lane: no-op
- template <typename T>
- HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
- return v;
- }
- // Two lanes: conditional swap
- template <typename T, HWY_IF_T_SIZE(T, 8)>
- HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
- // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
- const Full128<T> d;
- const Vec128<T> m = VecFromMask(d, mask);
- const Vec128<T> maskL = DupEven(m);
- const Vec128<T> maskH = DupOdd(m);
- const Vec128<T> swap = AndNot(maskL, maskH);
- return IfVecThenElse(swap, Shuffle01(v), v);
- }
- #if HWY_PPC_HAVE_10
- #ifdef HWY_NATIVE_COMPRESS8
- #undef HWY_NATIVE_COMPRESS8
- #else
- #define HWY_NATIVE_COMPRESS8
- #endif
- // General case, 1 byte
- template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
- HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
- const DFromV<decltype(v)> d;
- return TableLookupBytes(
- v, detail::CompressOrExpandIndicesFromMask<true>(d, mask));
- }
- #endif
- // General case, 2 or 4 bytes
- template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
- HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
- return detail::CompressBits(v, detail::BitsFromMask(mask));
- }
- // ------------------------------ CompressNot
- // Single lane: no-op
- template <typename T>
- HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
- return v;
- }
- // Two lanes: conditional swap
- template <typename T, HWY_IF_T_SIZE(T, 8)>
- HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
- // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
- const Full128<T> d;
- const Vec128<T> m = VecFromMask(d, mask);
- const Vec128<T> maskL = DupEven(m);
- const Vec128<T> maskH = DupOdd(m);
- const Vec128<T> swap = AndNot(maskH, maskL);
- return IfVecThenElse(swap, Shuffle01(v), v);
- }
- #if HWY_PPC_HAVE_10
- // General case, 1 byte
- template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
- HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
- const DFromV<decltype(v)> d;
- return TableLookupBytes(
- v, detail::CompressOrExpandIndicesFromMask<true>(d, Not(mask)));
- }
- #endif
- // General case, 2 or 4 bytes
- template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
- HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
- // For partial vectors, we cannot pull the Not() into the table because
- // BitsFromMask clears the upper bits.
- if (N < 16 / sizeof(T)) {
- return detail::CompressBits(v, detail::BitsFromMask(Not(mask)));
- }
- return detail::CompressNotBits(v, detail::BitsFromMask(mask));
- }
- // ------------------------------ CompressBlocksNot
- HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
- Mask128<uint64_t> /* m */) {
- return v;
- }
- #if HWY_PPC_HAVE_10
- template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
- HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
- const uint8_t* HWY_RESTRICT bits) {
- const DFromV<decltype(v)> d;
- return Compress(v, LoadMaskBits(d, bits));
- }
- #endif
- template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
- HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
- const uint8_t* HWY_RESTRICT bits) {
- // As there are at most 8 lanes in v if sizeof(TFromD<D>) > 1, simply
- // convert bits[0] to a uint64_t
- uint64_t mask_bits = bits[0];
- if (N < 8) {
- mask_bits &= (1ull << N) - 1;
- }
- return detail::CompressBits(v, mask_bits);
- }
- // ------------------------------ CompressStore, CompressBitsStore
- #if HWY_PPC_HAVE_10
- template <class D, HWY_IF_T_SIZE_D(D, 1)>
- HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
- TFromD<D>* HWY_RESTRICT unaligned) {
- const size_t count = CountTrue(d, m);
- const auto indices = detail::CompressOrExpandIndicesFromMask<true>(d, m);
- const auto compressed = TableLookupBytes(v, indices);
- StoreU(compressed, d, unaligned);
- return count;
- }
- #endif
- template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
- HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
- TFromD<D>* HWY_RESTRICT unaligned) {
- const RebindToUnsigned<decltype(d)> du;
- const uint64_t mask_bits = detail::BitsFromMask(m);
- HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
- const size_t count = PopCount(mask_bits);
- const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
- const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
- StoreU(compressed, d, unaligned);
- return count;
- }
- #if HWY_PPC_HAVE_10
- template <class D, HWY_IF_T_SIZE_D(D, 1)>
- HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
- TFromD<D>* HWY_RESTRICT unaligned) {
- const size_t count = CountTrue(d, m);
- const auto indices = detail::CompressOrExpandIndicesFromMask<true>(d, m);
- const auto compressed = TableLookupBytes(v, indices);
- StoreN(compressed, d, unaligned, count);
- return count;
- }
- #endif
- template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
- HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
- TFromD<D>* HWY_RESTRICT unaligned) {
- const RebindToUnsigned<decltype(d)> du;
- const uint64_t mask_bits = detail::BitsFromMask(m);
- HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
- const size_t count = PopCount(mask_bits);
- const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
- const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
- #if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
- StoreN(compressed, d, unaligned, count);
- #else
- BlendedStore(compressed, FirstN(d, count), d, unaligned);
- #endif
- return count;
- }
- #if HWY_PPC_HAVE_10
- template <class D, HWY_IF_T_SIZE_D(D, 1)>
- HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
- D d, TFromD<D>* HWY_RESTRICT unaligned) {
- return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
- }
- #endif
- template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
- HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
- D d, TFromD<D>* HWY_RESTRICT unaligned) {
- const RebindToUnsigned<decltype(d)> du;
- // As there are at most 8 lanes in v if sizeof(TFromD<D>) > 1, simply
- // convert bits[0] to a uint64_t
- uint64_t mask_bits = bits[0];
- constexpr size_t kN = MaxLanes(d);
- if (kN < 8) {
- mask_bits &= (1ull << kN) - 1;
- }
- const size_t count = PopCount(mask_bits);
- const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
- const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
- StoreU(compressed, d, unaligned);
- return count;
- }
- // ------------------------------ Expand
- #if HWY_PPC_HAVE_10
- #ifdef HWY_NATIVE_EXPAND
- #undef HWY_NATIVE_EXPAND
- #else
- #define HWY_NATIVE_EXPAND
- #endif
- template <typename T, size_t N,
- HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
- HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
- const DFromV<decltype(v)> d;
- const auto idx = detail::CompressOrExpandIndicesFromMask<false>(d, mask);
- return IfThenElseZero(mask, TableLookupBytes(v, idx));
- }
- template <typename T, HWY_IF_T_SIZE(T, 8)>
- HWY_API Vec128<T> Expand(Vec128<T> v, Mask128<T> mask) {
- // Same as Compress, just zero out the mask=false lanes.
- return IfThenElseZero(mask, Compress(v, mask));
- }
- // For single-element vectors, this is at least as fast as native.
- template <typename T>
- HWY_API Vec128<T, 1> Expand(Vec128<T, 1> v, Mask128<T, 1> mask) {
- return IfThenElseZero(mask, v);
- }
- template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
- HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
- const TFromD<D>* HWY_RESTRICT unaligned) {
- return Expand(LoadU(d, unaligned), mask);
- }
- #endif // HWY_PPC_HAVE_10
- // ------------------------------ StoreInterleaved2/3/4
- // HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
- // generic_ops-inl.h.
- // ------------------------------ Additional mask logical operations
- namespace detail {
- #if HWY_IS_LITTLE_ENDIAN
- template <class V>
- HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
- return v;
- }
- template <class V>
- HWY_INLINE V Per128BitBlkRevLanesOnBe(V v) {
- return v;
- }
- #else
- template <class V, HWY_IF_T_SIZE_V(V, 1)>
- HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
- const DFromV<decltype(v)> d;
- return Reverse8(d, v);
- }
- template <class V, HWY_IF_T_SIZE_V(V, 2)>
- HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
- const DFromV<decltype(v)> d;
- return Reverse4(d, v);
- }
- template <class V, HWY_IF_T_SIZE_V(V, 4)>
- HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
- const DFromV<decltype(v)> d;
- return Reverse2(d, v);
- }
- template <class V, HWY_IF_T_SIZE_V(V, 8)>
- HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
- return v;
- }
- template <class V>
- HWY_INLINE V Per128BitBlkRevLanesOnBe(V v) {
- const DFromV<decltype(v)> d;
- return Reverse(d, v);
- }
- #endif
- template <class V>
- HWY_INLINE V I128Subtract(V a, V b) {
- #if HWY_S390X_HAVE_Z14
- const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
- vec_sub_u128(reinterpret_cast<__vector unsigned char>(a.raw),
- reinterpret_cast<__vector unsigned char>(b.raw)))};
- #elif defined(__SIZEOF_INT128__)
- using VU128 = __vector unsigned __int128;
- const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
- vec_sub(reinterpret_cast<VU128>(a.raw), reinterpret_cast<VU128>(b.raw)))};
- #else
- const DFromV<decltype(a)> d;
- const Repartition<uint64_t, decltype(d)> du64;
- const auto u64_a = BitCast(du64, a);
- const auto u64_b = BitCast(du64, b);
- const auto diff_u64 = u64_a - u64_b;
- const auto borrow_u64 = VecFromMask(du64, u64_a < u64_b);
- #if HWY_IS_LITTLE_ENDIAN
- const auto borrow_u64_shifted = ShiftLeftBytes<8>(du64, borrow_u64);
- #else
- const auto borrow_u64_shifted = ShiftRightBytes<8>(du64, borrow_u64);
- #endif
- const auto diff_i128 = BitCast(d, diff_u64 + borrow_u64_shifted);
- #endif
- return diff_i128;
- }
- } // namespace detail
- template <class T>
- HWY_API Mask128<T, 1> SetAtOrAfterFirst(Mask128<T, 1> mask) {
- return mask;
- }
- template <class T>
- HWY_API Mask128<T, 2> SetAtOrAfterFirst(Mask128<T, 2> mask) {
- const FixedTag<T, 2> d;
- const auto vmask = VecFromMask(d, mask);
- return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask)));
- }
- template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
- HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) {
- const Simd<T, N, 0> d;
- const Full64<T> d_full64;
- const auto vmask = VecFromMask(d, mask);
- const auto vmask_le64 =
- BitCast(Full64<int64_t>(),
- detail::Per64BitBlkRevLanesOnBe(ResizeBitCast(d_full64, vmask)));
- const auto neg_vmask_le64 = Neg(vmask_le64);
- const auto neg_vmask = ResizeBitCast(
- d, detail::Per64BitBlkRevLanesOnBe(BitCast(d_full64, neg_vmask_le64)));
- return MaskFromVec(Or(vmask, neg_vmask));
- }
- template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
- HWY_API Mask128<T> SetAtOrAfterFirst(Mask128<T> mask) {
- const Full128<T> d;
- auto vmask = VecFromMask(d, mask);
- const auto vmask_le128 = detail::Per128BitBlkRevLanesOnBe(vmask);
- const auto neg_vmask_le128 = detail::I128Subtract(Zero(d), vmask_le128);
- const auto neg_vmask = detail::Per128BitBlkRevLanesOnBe(neg_vmask_le128);
- return MaskFromVec(BitCast(d, Or(vmask, neg_vmask)));
- }
- template <class T, size_t N>
- HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) {
- return Not(SetAtOrAfterFirst(mask));
- }
- template <class T>
- HWY_API Mask128<T, 1> SetOnlyFirst(Mask128<T, 1> mask) {
- return mask;
- }
- template <class T>
- HWY_API Mask128<T, 2> SetOnlyFirst(Mask128<T, 2> mask) {
- const FixedTag<T, 2> d;
- const RebindToSigned<decltype(d)> di;
- const auto vmask = BitCast(di, VecFromMask(d, mask));
- const auto zero = Zero(di);
- const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero);
- return MaskFromVec(BitCast(d, And(vmask, vmask2)));
- }
- template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
- HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) {
- const Simd<T, N, 0> d;
- const Full64<T> d_full64;
- const RebindToSigned<decltype(d)> di;
- const auto vmask = VecFromMask(d, mask);
- const auto vmask_le64 =
- BitCast(Full64<int64_t>(),
- detail::Per64BitBlkRevLanesOnBe(ResizeBitCast(d_full64, vmask)));
- const auto neg_vmask_le64 = Neg(vmask_le64);
- const auto neg_vmask = ResizeBitCast(
- d, detail::Per64BitBlkRevLanesOnBe(BitCast(d_full64, neg_vmask_le64)));
- const auto first_vmask = BitCast(di, And(vmask, neg_vmask));
- return MaskFromVec(BitCast(d, Or(first_vmask, Neg(first_vmask))));
- }
- template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
- HWY_API Mask128<T> SetOnlyFirst(Mask128<T> mask) {
- const Full128<T> d;
- const RebindToSigned<decltype(d)> di;
- const auto vmask = VecFromMask(d, mask);
- const auto vmask_le128 = detail::Per128BitBlkRevLanesOnBe(vmask);
- const auto neg_vmask_le128 = detail::I128Subtract(Zero(d), vmask_le128);
- const auto neg_vmask = detail::Per128BitBlkRevLanesOnBe(neg_vmask_le128);
- return MaskFromVec(BitCast(d, Neg(BitCast(di, And(vmask, neg_vmask)))));
- }
- template <class T>
- HWY_API Mask128<T, 1> SetAtOrBeforeFirst(Mask128<T, 1> /*mask*/) {
- const FixedTag<T, 1> d;
- const RebindToSigned<decltype(d)> di;
- using TI = MakeSigned<T>;
- return RebindMask(d, MaskFromVec(Set(di, TI(-1))));
- }
- template <class T, size_t N, HWY_IF_LANES_GT(N, 1)>
- HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
- const Simd<T, N, 0> d;
- return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask))));
- }
- // ------------------------------ SumsOf2 and SumsOf4
- namespace detail {
- #if !HWY_S390X_HAVE_Z14
- // Casts nominally int32_t result to D.
- template <class D>
- HWY_INLINE VFromD<D> AltivecVsum4sbs(D d, __vector signed char a,
- __vector signed int b) {
- const Repartition<int32_t, D> di32;
- #ifdef __OPTIMIZE__
- if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
- const int64_t sum0 =
- static_cast<int64_t>(a[0]) + static_cast<int64_t>(a[1]) +
- static_cast<int64_t>(a[2]) + static_cast<int64_t>(a[3]) +
- static_cast<int64_t>(b[0]);
- const int64_t sum1 =
- static_cast<int64_t>(a[4]) + static_cast<int64_t>(a[5]) +
- static_cast<int64_t>(a[6]) + static_cast<int64_t>(a[7]) +
- static_cast<int64_t>(b[1]);
- const int64_t sum2 =
- static_cast<int64_t>(a[8]) + static_cast<int64_t>(a[9]) +
- static_cast<int64_t>(a[10]) + static_cast<int64_t>(a[11]) +
- static_cast<int64_t>(b[2]);
- const int64_t sum3 =
- static_cast<int64_t>(a[12]) + static_cast<int64_t>(a[13]) +
- static_cast<int64_t>(a[14]) + static_cast<int64_t>(a[15]) +
- static_cast<int64_t>(b[3]);
- const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
- const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
- const int32_t sign2 = static_cast<int32_t>(sum2 >> 63);
- const int32_t sign3 = static_cast<int32_t>(sum3 >> 63);
- using Raw = typename detail::Raw128<int32_t>::type;
- return BitCast(
- d,
- VFromD<decltype(di32)>{Raw{
- (sign0 == (sum0 >> 31)) ? static_cast<int32_t>(sum0)
- : static_cast<int32_t>(sign0 ^ 0x7FFFFFFF),
- (sign1 == (sum1 >> 31)) ? static_cast<int32_t>(sum1)
- : static_cast<int32_t>(sign1 ^ 0x7FFFFFFF),
- (sign2 == (sum2 >> 31)) ? static_cast<int32_t>(sum2)
- : static_cast<int32_t>(sign2 ^ 0x7FFFFFFF),
- (sign3 == (sum3 >> 31))
- ? static_cast<int32_t>(sum3)
- : static_cast<int32_t>(sign3 ^ 0x7FFFFFFF)}});
- } else // NOLINT
- #endif
- {
- return BitCast(d, VFromD<decltype(di32)>{vec_vsum4sbs(a, b)});
- }
- }
- // Casts nominally uint32_t result to D.
- template <class D>
- HWY_INLINE VFromD<D> AltivecVsum4ubs(D d, __vector unsigned char a,
- __vector unsigned int b) {
- const Repartition<uint32_t, D> du32;
- #ifdef __OPTIMIZE__
- if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
- const uint64_t sum0 =
- static_cast<uint64_t>(a[0]) + static_cast<uint64_t>(a[1]) +
- static_cast<uint64_t>(a[2]) + static_cast<uint64_t>(a[3]) +
- static_cast<uint64_t>(b[0]);
- const uint64_t sum1 =
- static_cast<uint64_t>(a[4]) + static_cast<uint64_t>(a[5]) +
- static_cast<uint64_t>(a[6]) + static_cast<uint64_t>(a[7]) +
- static_cast<uint64_t>(b[1]);
- const uint64_t sum2 =
- static_cast<uint64_t>(a[8]) + static_cast<uint64_t>(a[9]) +
- static_cast<uint64_t>(a[10]) + static_cast<uint64_t>(a[11]) +
- static_cast<uint64_t>(b[2]);
- const uint64_t sum3 =
- static_cast<uint64_t>(a[12]) + static_cast<uint64_t>(a[13]) +
- static_cast<uint64_t>(a[14]) + static_cast<uint64_t>(a[15]) +
- static_cast<uint64_t>(b[3]);
- return BitCast(
- d,
- VFromD<decltype(du32)>{(__vector unsigned int){
- static_cast<unsigned int>(sum0 <= 0xFFFFFFFFu ? sum0 : 0xFFFFFFFFu),
- static_cast<unsigned int>(sum1 <= 0xFFFFFFFFu ? sum1 : 0xFFFFFFFFu),
- static_cast<unsigned int>(sum2 <= 0xFFFFFFFFu ? sum2 : 0xFFFFFFFFu),
- static_cast<unsigned int>(sum3 <= 0xFFFFFFFFu ? sum3
- : 0xFFFFFFFFu)}});
- } else // NOLINT
- #endif
- {
- return BitCast(d, VFromD<decltype(du32)>{vec_vsum4ubs(a, b)});
- }
- }
- // Casts nominally int32_t result to D.
- template <class D>
- HWY_INLINE VFromD<D> AltivecVsum2sws(D d, __vector signed int a,
- __vector signed int b) {
- const Repartition<int32_t, D> di32;
- #ifdef __OPTIMIZE__
- const Repartition<uint64_t, D> du64;
- constexpr int kDestLaneOffset = HWY_IS_BIG_ENDIAN;
- if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset]) &&
- __builtin_constant_p(b[kDestLaneOffset + 2])) {
- const int64_t sum0 = static_cast<int64_t>(a[0]) +
- static_cast<int64_t>(a[1]) +
- static_cast<int64_t>(b[kDestLaneOffset]);
- const int64_t sum1 = static_cast<int64_t>(a[2]) +
- static_cast<int64_t>(a[3]) +
- static_cast<int64_t>(b[kDestLaneOffset + 2]);
- const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
- const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
- return BitCast(d, VFromD<decltype(du64)>{(__vector unsigned long long){
- (sign0 == (sum0 >> 31))
- ? static_cast<uint32_t>(sum0)
- : static_cast<uint32_t>(sign0 ^ 0x7FFFFFFF),
- (sign1 == (sum1 >> 31))
- ? static_cast<uint32_t>(sum1)
- : static_cast<uint32_t>(sign1 ^ 0x7FFFFFFF)}});
- } else // NOLINT
- #endif
- {
- __vector signed int sum;
- // Inline assembly is used for vsum2sws to avoid unnecessary shuffling
- // on little-endian PowerPC targets as the result of the vsum2sws
- // instruction will already be in the correct lanes on little-endian
- // PowerPC targets.
- __asm__("vsum2sws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b));
- return BitCast(d, VFromD<decltype(di32)>{sum});
- }
- }
- // Casts nominally int32_t result to D.
- template <class D>
- HWY_INLINE VFromD<D> AltivecVsum4shs(D d, __vector signed short a,
- __vector signed int b) {
- const Repartition<int32_t, D> di32;
- #ifdef __OPTIMIZE__
- if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
- const int64_t sum0 = static_cast<int64_t>(a[0]) +
- static_cast<int64_t>(a[1]) +
- static_cast<int64_t>(b[0]);
- const int64_t sum1 = static_cast<int64_t>(a[2]) +
- static_cast<int64_t>(a[3]) +
- static_cast<int64_t>(b[1]);
- const int64_t sum2 = static_cast<int64_t>(a[4]) +
- static_cast<int64_t>(a[5]) +
- static_cast<int64_t>(b[2]);
- const int64_t sum3 = static_cast<int64_t>(a[6]) +
- static_cast<int64_t>(a[7]) +
- static_cast<int64_t>(b[3]);
- const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
- const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
- const int32_t sign2 = static_cast<int32_t>(sum2 >> 63);
- const int32_t sign3 = static_cast<int32_t>(sum3 >> 63);
- using Raw = typename detail::Raw128<int32_t>::type;
- return BitCast(
- d,
- VFromD<decltype(di32)>{Raw{
- (sign0 == (sum0 >> 31)) ? static_cast<int32_t>(sum0)
- : static_cast<int32_t>(sign0 ^ 0x7FFFFFFF),
- (sign1 == (sum1 >> 31)) ? static_cast<int32_t>(sum1)
- : static_cast<int32_t>(sign1 ^ 0x7FFFFFFF),
- (sign2 == (sum2 >> 31)) ? static_cast<int32_t>(sum2)
- : static_cast<int32_t>(sign2 ^ 0x7FFFFFFF),
- (sign3 == (sum3 >> 31))
- ? static_cast<int32_t>(sum3)
- : static_cast<int32_t>(sign3 ^ 0x7FFFFFFF)}});
- } else // NOLINT
- #endif
- {
- return BitCast(d, VFromD<decltype(di32)>{vec_vsum4shs(a, b)});
- }
- }
- // Casts nominally int32_t result to D.
- template <class D>
- HWY_INLINE VFromD<D> AltivecVsumsws(D d, __vector signed int a,
- __vector signed int b) {
- const Repartition<int32_t, D> di32;
- #ifdef __OPTIMIZE__
- constexpr int kDestLaneOffset = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
- if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset])) {
- const int64_t sum =
- static_cast<int64_t>(a[0]) + static_cast<int64_t>(a[1]) +
- static_cast<int64_t>(a[2]) + static_cast<int64_t>(a[3]) +
- static_cast<int64_t>(b[kDestLaneOffset]);
- const int32_t sign = static_cast<int32_t>(sum >> 63);
- #if HWY_IS_LITTLE_ENDIAN
- return BitCast(
- d, VFromD<decltype(di32)>{(__vector signed int){
- (sign == (sum >> 31)) ? static_cast<int32_t>(sum)
- : static_cast<int32_t>(sign ^ 0x7FFFFFFF),
- 0, 0, 0}});
- #else
- return BitCast(d, VFromD<decltype(di32)>{(__vector signed int){
- 0, 0, 0,
- (sign == (sum >> 31))
- ? static_cast<int32_t>(sum)
- : static_cast<int32_t>(sign ^ 0x7FFFFFFF)}});
- #endif
- } else // NOLINT
- #endif
- {
- __vector signed int sum;
- // Inline assembly is used for vsumsws to avoid unnecessary shuffling
- // on little-endian PowerPC targets as the result of the vsumsws
- // instruction will already be in the correct lanes on little-endian
- // PowerPC targets.
- __asm__("vsumsws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b));
- return BitCast(d, VFromD<decltype(di32)>{sum});
- }
- }
- template <size_t N>
- HWY_INLINE Vec128<int32_t, N / 2> AltivecU16SumsOf2(Vec128<uint16_t, N> v) {
- const RebindToSigned<DFromV<decltype(v)>> di16;
- const RepartitionToWide<decltype(di16)> di32;
- return AltivecVsum4shs(di32, Xor(BitCast(di16, v), Set(di16, -32768)).raw,
- Set(di32, 65536).raw);
- }
- #endif // !HWY_S390X_HAVE_Z14
- // U16->U32 SumsOf2
- template <class V>
- HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
- hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
- const DFromV<V> d;
- const RepartitionToWide<decltype(d)> dw;
- #if HWY_S390X_HAVE_Z14
- return VFromD<decltype(dw)>{vec_sum4(v.raw, Zero(d).raw)};
- #else
- return BitCast(dw, AltivecU16SumsOf2(v));
- #endif
- }
- // I16->I32 SumsOf2
- template <class V>
- HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
- hwy::SignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
- const DFromV<V> d;
- const RepartitionToWide<decltype(d)> dw;
- #if HWY_S390X_HAVE_Z14
- const RebindToUnsigned<decltype(d)> du;
- return BitCast(dw, SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<2>(),
- BitCast(du, Xor(v, SignBit(d))))) +
- Set(dw, int32_t{-65536});
- #else
- return AltivecVsum4shs(dw, v.raw, Zero(dw).raw);
- #endif
- }
- #if HWY_S390X_HAVE_Z14
- // U32->U64 SumsOf2
- template <class V>
- HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
- hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
- const DFromV<V> d;
- const RepartitionToWide<decltype(d)> dw;
- return VFromD<decltype(dw)>{vec_sum2(v.raw, Zero(d).raw)};
- }
- // I32->I64 SumsOf2
- template <class V>
- HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
- hwy::SignedTag /*type_tag*/, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
- const DFromV<V> d;
- const RepartitionToWide<decltype(d)> dw;
- const RebindToUnsigned<decltype(d)> du;
- return BitCast(dw, SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<4>(),
- BitCast(du, Xor(v, SignBit(d))))) +
- Set(dw, int64_t{-4294967296LL});
- }
- #endif
- // U8->U32 SumsOf4
- template <class V>
- HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
- hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
- const DFromV<V> d;
- const RepartitionToWideX2<decltype(d)> dw2;
- #if HWY_S390X_HAVE_Z14
- return VFromD<decltype(dw2)>{vec_sum4(v.raw, Zero(d).raw)};
- #else
- return AltivecVsum4ubs(dw2, v.raw, Zero(dw2).raw);
- #endif
- }
- // I8->I32 SumsOf4
- template <class V>
- HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
- hwy::SignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
- const DFromV<V> d;
- const RepartitionToWideX2<decltype(d)> dw2;
- #if HWY_S390X_HAVE_Z14
- const RebindToUnsigned<decltype(d)> du;
- return BitCast(dw2, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(),
- BitCast(du, Xor(v, SignBit(d))))) +
- Set(dw2, int32_t{-512});
- #else
- return AltivecVsum4sbs(dw2, v.raw, Zero(dw2).raw);
- #endif
- }
- // U16->U64 SumsOf4
- template <class V>
- HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
- hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
- const DFromV<V> d;
- const RepartitionToWide<decltype(d)> dw;
- const RepartitionToWide<decltype(dw)> dw2;
- #if HWY_S390X_HAVE_Z14
- return VFromD<decltype(dw2)>{vec_sum2(v.raw, Zero(d).raw)};
- #else
- const RebindToSigned<decltype(dw)> dw_i;
- return AltivecVsum2sws(dw2, BitCast(dw_i, SumsOf2(v)).raw, Zero(dw_i).raw);
- #endif
- }
- // I16->I64 SumsOf4
- template <class V>
- HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
- hwy::SignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
- const DFromV<V> d;
- const RepartitionToWide<decltype(d)> dw;
- const RepartitionToWide<decltype(dw)> dw2;
- #if HWY_S390X_HAVE_Z14
- const RebindToUnsigned<decltype(d)> du;
- return BitCast(dw2, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<2>(),
- BitCast(du, Xor(v, SignBit(d))))) +
- Set(dw2, int64_t{-131072});
- #else // VSX
- const auto sums_of_4_in_lo32 =
- AltivecVsum2sws(dw, SumsOf2(v).raw, Zero(dw).raw);
- #if HWY_IS_LITTLE_ENDIAN
- return PromoteEvenTo(dw2, sums_of_4_in_lo32);
- #else
- return PromoteOddTo(dw2, sums_of_4_in_lo32);
- #endif // HWY_IS_LITTLE_ENDIAN
- #endif // HWY_S390X_HAVE_Z14
- }
- } // namespace detail
- // ------------------------------ SumOfLanes
- // We define SumOfLanes for 8/16-bit types (and I32/U32/I64/U64 on Z14/Z15/Z16);
- // enable generic for the rest.
- #undef HWY_IF_SUM_OF_LANES_D
- #if HWY_S390X_HAVE_Z14
- #define HWY_IF_SUM_OF_LANES_D(D) HWY_IF_LANES_GT_D(D, 1), HWY_IF_FLOAT3264_D(D)
- #else
- #define HWY_IF_SUM_OF_LANES_D(D) \
- HWY_IF_LANES_GT_D(D, 1), HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))
- #endif
- #if HWY_S390X_HAVE_Z14
- namespace detail {
- template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
- HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
- HWY_INLINE Vec128<T> SumOfU32OrU64LanesAsU128(Vec128<T> v) {
- const DFromV<decltype(v)> d;
- const RebindToUnsigned<decltype(d)> du;
- return BitCast(
- d, Vec128<uint8_t>{vec_sum_u128(BitCast(du, v).raw, Zero(du).raw)});
- }
- } // namespace detail
- template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)>
- HWY_API VFromD<D> SumOfLanes(D /*d64*/, VFromD<D> v) {
- return Broadcast<1>(detail::SumOfU32OrU64LanesAsU128(v));
- }
- #endif
- template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)>
- HWY_API Vec32<uint16_t> SumOfLanes(D du16, Vec32<uint16_t> v) {
- constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN;
- return Broadcast<kSumLaneIdx>(
- BitCast(du16, detail::SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<2>(), v)));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
- HWY_API Vec64<uint16_t> SumOfLanes(D du16, Vec64<uint16_t> v) {
- constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
- return Broadcast<kSumLaneIdx>(
- BitCast(du16, detail::SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<2>(), v)));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
- HWY_API Vec128<uint16_t> SumOfLanes(D du16, Vec128<uint16_t> v) {
- constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
- #if HWY_S390X_HAVE_Z14
- return Broadcast<kSumLaneIdx>(
- BitCast(du16, detail::SumOfU32OrU64LanesAsU128(detail::SumsOf4(
- hwy::UnsignedTag(), hwy::SizeTag<2>(), v))));
- #else // VSX
- const auto zero = Zero(Full128<int32_t>());
- return Broadcast<kSumLaneIdx>(
- detail::AltivecVsumsws(du16, detail::AltivecU16SumsOf2(v).raw, zero.raw));
- #endif
- }
- template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I16_D(D)>
- HWY_API Vec32<int16_t> SumOfLanes(D di16, Vec32<int16_t> v) {
- #if HWY_S390X_HAVE_Z14
- const RebindToUnsigned<decltype(di16)> du16;
- return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
- #else
- constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN;
- return Broadcast<kSumLaneIdx>(
- BitCast(di16, detail::SumsOf2(hwy::SignedTag(), hwy::SizeTag<2>(), v)));
- #endif
- }
- template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
- HWY_API Vec64<int16_t> SumOfLanes(D di16, Vec64<int16_t> v) {
- #if HWY_S390X_HAVE_Z14
- const RebindToUnsigned<decltype(di16)> du16;
- return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
- #else
- constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
- return Broadcast<kSumLaneIdx>(
- BitCast(di16, detail::SumsOf4(hwy::SignedTag(), hwy::SizeTag<2>(), v)));
- #endif
- }
- template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
- HWY_API Vec128<int16_t> SumOfLanes(D di16, Vec128<int16_t> v) {
- #if HWY_S390X_HAVE_Z14
- const RebindToUnsigned<decltype(di16)> du16;
- return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
- #else
- constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
- const Full128<int32_t> di32;
- const auto zero = Zero(di32);
- return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
- di16, detail::AltivecVsum4shs(di32, v.raw, zero.raw).raw, zero.raw));
- #endif
- }
- template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U8_D(D)>
- HWY_API Vec32<uint8_t> SumOfLanes(D du8, Vec32<uint8_t> v) {
- constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
- return Broadcast<kSumLaneIdx>(
- BitCast(du8, detail::SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(), v)));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_U8_D(D)>
- HWY_API Vec16<uint8_t> SumOfLanes(D du8, Vec16<uint8_t> v) {
- const Twice<decltype(du8)> dt_u8;
- return LowerHalf(du8, SumOfLanes(dt_u8, Combine(dt_u8, Zero(du8), v)));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
- HWY_API Vec64<uint8_t> SumOfLanes(D du8, Vec64<uint8_t> v) {
- constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
- return Broadcast<kSumLaneIdx>(BitCast(du8, SumsOf8(v)));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
- HWY_API Vec128<uint8_t> SumOfLanes(D du8, Vec128<uint8_t> v) {
- constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15;
- #if HWY_S390X_HAVE_Z14
- return Broadcast<kSumLaneIdx>(
- BitCast(du8, detail::SumOfU32OrU64LanesAsU128(detail::SumsOf4(
- hwy::UnsignedTag(), hwy::SizeTag<1>(), v))));
- #else
- const Full128<uint32_t> du32;
- const RebindToSigned<decltype(du32)> di32;
- const Vec128<uint32_t> zero = Zero(du32);
- return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
- du8, detail::AltivecVsum4ubs(di32, v.raw, zero.raw).raw,
- BitCast(di32, zero).raw));
- #endif
- }
- template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I8_D(D)>
- HWY_API Vec32<int8_t> SumOfLanes(D di8, Vec32<int8_t> v) {
- #if HWY_S390X_HAVE_Z14
- const RebindToUnsigned<decltype(di8)> du8;
- return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
- #else
- constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
- return Broadcast<kSumLaneIdx>(
- BitCast(di8, detail::SumsOf4(hwy::SignedTag(), hwy::SizeTag<1>(), v)));
- #endif
- }
- template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_I8_D(D)>
- HWY_API Vec16<int8_t> SumOfLanes(D di8, Vec16<int8_t> v) {
- const Twice<decltype(di8)> dt_i8;
- return LowerHalf(di8, SumOfLanes(dt_i8, Combine(dt_i8, Zero(di8), v)));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
- HWY_API Vec64<int8_t> SumOfLanes(D di8, Vec64<int8_t> v) {
- #if HWY_S390X_HAVE_Z14
- const RebindToUnsigned<decltype(di8)> du8;
- return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
- #else
- constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
- return Broadcast<kSumLaneIdx>(BitCast(di8, SumsOf8(v)));
- #endif
- }
- template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
- HWY_API Vec128<int8_t> SumOfLanes(D di8, Vec128<int8_t> v) {
- #if HWY_S390X_HAVE_Z14
- const RebindToUnsigned<decltype(di8)> du8;
- return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
- #else
- constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15;
- const Full128<int32_t> di32;
- const Vec128<int32_t> zero = Zero(di32);
- return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
- di8, detail::AltivecVsum4sbs(di32, v.raw, zero.raw).raw, zero.raw));
- #endif
- }
- #if HWY_S390X_HAVE_Z14
- template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_UI32_D(D)>
- HWY_API VFromD<D> SumOfLanes(D d32, VFromD<D> v) {
- const RebindToUnsigned<decltype(d32)> du32;
- return Broadcast<1>(
- BitCast(d32, detail::SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<4>(),
- BitCast(du32, v))));
- }
- template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
- HWY_API VFromD<D> SumOfLanes(D /*d32*/, VFromD<D> v) {
- return Broadcast<3>(detail::SumOfU32OrU64LanesAsU128(v));
- }
- #endif
- // generic_ops defines MinOfLanes and MaxOfLanes.
- // ------------------------------ ReduceSum for N=4 I8/U8
- // GetLane(SumsOf4(v)) is more efficient on PPC/Z14 than the default N=4
- // I8/U8 ReduceSum implementation in generic_ops-inl.h
- #ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
- #undef HWY_NATIVE_REDUCE_SUM_4_UI8
- #else
- #define HWY_NATIVE_REDUCE_SUM_4_UI8
- #endif
- template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
- HWY_API TFromD<D> ReduceSum(D /*d*/, VFromD<D> v) {
- return static_cast<TFromD<D>>(GetLane(SumsOf4(v)));
- }
- // ------------------------------ BitShuffle
- #ifdef HWY_NATIVE_BITSHUFFLE
- #undef HWY_NATIVE_BITSHUFFLE
- #else
- #define HWY_NATIVE_BITSHUFFLE
- #endif
- template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
- HWY_IF_V_SIZE_V(VI, HWY_MAX_LANES_V(V) * 8)>
- HWY_API V BitShuffle(V v, VI idx) {
- const DFromV<decltype(v)> d64;
- const RebindToUnsigned<decltype(d64)> du64;
- const Repartition<uint8_t, decltype(d64)> du8;
- const Full128<TFromD<decltype(du64)>> d_full_u64;
- const Full128<TFromD<decltype(du8)>> d_full_u8;
- using RawVU64 = __vector unsigned long long;
- #if HWY_PPC_HAVE_9
- #if HWY_IS_LITTLE_ENDIAN
- (void)d_full_u64;
- auto bit_idx = ResizeBitCast(d_full_u8, idx);
- #else
- auto bit_idx =
- BitCast(d_full_u8, ReverseLaneBytes(ResizeBitCast(d_full_u64, idx)));
- #endif
- bit_idx = Xor(bit_idx, Set(d_full_u8, uint8_t{0x3F}));
- return BitCast(d64, VFromD<decltype(du64)>{reinterpret_cast<RawVU64>(
- vec_bperm(BitCast(du64, v).raw, bit_idx.raw))});
- #else // !HWY_PPC_HAVE_9
- #if HWY_IS_LITTLE_ENDIAN
- const auto bit_idx_xor_mask = BitCast(
- d_full_u8, Dup128VecFromValues(d_full_u64, uint64_t{0x7F7F7F7F7F7F7F7Fu},
- uint64_t{0x3F3F3F3F3F3F3F3Fu}));
- const auto bit_idx = Xor(ResizeBitCast(d_full_u8, idx), bit_idx_xor_mask);
- constexpr int kBitShufResultByteShrAmt = 8;
- #else
- const auto bit_idx_xor_mask = BitCast(
- d_full_u8, Dup128VecFromValues(d_full_u64, uint64_t{0x3F3F3F3F3F3F3F3Fu},
- uint64_t{0x7F7F7F7F7F7F7F7Fu}));
- const auto bit_idx =
- Xor(BitCast(d_full_u8, ReverseLaneBytes(ResizeBitCast(d_full_u64, idx))),
- bit_idx_xor_mask);
- constexpr int kBitShufResultByteShrAmt = 6;
- #endif
- #if HWY_S390X_HAVE_Z14
- const VFromD<decltype(d_full_u64)> bit_shuf_result{reinterpret_cast<RawVU64>(
- vec_bperm_u128(BitCast(du8, v).raw, bit_idx.raw))};
- #elif defined(__SIZEOF_INT128__)
- using RawVU128 = __vector unsigned __int128;
- const VFromD<decltype(d_full_u64)> bit_shuf_result{reinterpret_cast<RawVU64>(
- vec_vbpermq(reinterpret_cast<RawVU128>(v.raw), bit_idx.raw))};
- #else
- using RawVU128 = __vector unsigned char;
- const VFromD<decltype(d_full_u64)> bit_shuf_result{reinterpret_cast<RawVU64>(
- vec_vbpermq(reinterpret_cast<RawVU128>(v.raw), bit_idx.raw))};
- #endif
- return ResizeBitCast(
- d64, PromoteTo(d_full_u64,
- ResizeBitCast(
- Rebind<uint8_t, decltype(d_full_u64)>(),
- CombineShiftRightBytes<kBitShufResultByteShrAmt>(
- d_full_u64, bit_shuf_result, bit_shuf_result))));
- #endif // HWY_PPC_HAVE_9
- }
- // ------------------------------ Lt128
- namespace detail {
- // Returns vector-mask for Lt128.
- template <class D, class V = VFromD<D>>
- HWY_INLINE V Lt128Vec(D d, V a, V b) {
- static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
- #if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
- (void)d;
- using VU64 = __vector unsigned long long;
- using VU128 = __vector unsigned __int128;
- #if HWY_IS_LITTLE_ENDIAN
- const VU128 a_u128 = reinterpret_cast<VU128>(a.raw);
- const VU128 b_u128 = reinterpret_cast<VU128>(b.raw);
- #else
- // NOTE: Need to swap the halves of both a and b on big-endian targets
- // as the upper 64 bits of a and b are in lane 1 and the lower 64 bits
- // of a and b are in lane 0 whereas the vec_cmplt operation below expects
- // the upper 64 bits in lane 0 and the lower 64 bits in lane 1 on
- // big-endian PPC targets.
- const VU128 a_u128 = reinterpret_cast<VU128>(vec_sld(a.raw, a.raw, 8));
- const VU128 b_u128 = reinterpret_cast<VU128>(vec_sld(b.raw, b.raw, 8));
- #endif
- return V{reinterpret_cast<VU64>(vec_cmplt(a_u128, b_u128))};
- #else // !HWY_PPC_HAVE_10
- // Truth table of Eq and Lt for Hi and Lo u64.
- // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
- // =H =L cH cL | out = cH | (=H & cL)
- // 0 0 0 0 | 0
- // 0 0 0 1 | 0
- // 0 0 1 0 | 1
- // 0 0 1 1 | 1
- // 0 1 0 0 | 0
- // 0 1 0 1 | 0
- // 0 1 1 0 | 1
- // 1 0 0 0 | 0
- // 1 0 0 1 | 1
- // 1 1 0 0 | 0
- const auto eqHL = Eq(a, b);
- const V ltHL = VecFromMask(d, Lt(a, b));
- const V ltLX = ShiftLeftLanes<1>(ltHL);
- const V vecHx = IfThenElse(eqHL, ltLX, ltHL);
- return InterleaveUpper(d, vecHx, vecHx);
- #endif
- }
- // Returns vector-mask for Eq128.
- template <class D, class V = VFromD<D>>
- HWY_INLINE V Eq128Vec(D d, V a, V b) {
- static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
- #if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
- (void)d;
- using VU64 = __vector unsigned long long;
- using VU128 = __vector unsigned __int128;
- return V{reinterpret_cast<VU64>(vec_cmpeq(reinterpret_cast<VU128>(a.raw),
- reinterpret_cast<VU128>(b.raw)))};
- #else
- const auto eqHL = VecFromMask(d, Eq(a, b));
- const auto eqLH = Reverse2(d, eqHL);
- return And(eqHL, eqLH);
- #endif
- }
- template <class D, class V = VFromD<D>>
- HWY_INLINE V Ne128Vec(D d, V a, V b) {
- static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
- #if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
- (void)d;
- using VU64 = __vector unsigned long long;
- using VU128 = __vector unsigned __int128;
- return V{reinterpret_cast<VU64>(vec_cmpne(reinterpret_cast<VU128>(a.raw),
- reinterpret_cast<VU128>(b.raw)))};
- #else
- const auto neHL = VecFromMask(d, Ne(a, b));
- const auto neLH = Reverse2(d, neHL);
- return Or(neHL, neLH);
- #endif
- }
- template <class D, class V = VFromD<D>>
- HWY_INLINE V Lt128UpperVec(D d, V a, V b) {
- const V ltHL = VecFromMask(d, Lt(a, b));
- return InterleaveUpper(d, ltHL, ltHL);
- }
- template <class D, class V = VFromD<D>>
- HWY_INLINE V Eq128UpperVec(D d, V a, V b) {
- const V eqHL = VecFromMask(d, Eq(a, b));
- return InterleaveUpper(d, eqHL, eqHL);
- }
- template <class D, class V = VFromD<D>>
- HWY_INLINE V Ne128UpperVec(D d, V a, V b) {
- const V neHL = VecFromMask(d, Ne(a, b));
- return InterleaveUpper(d, neHL, neHL);
- }
- } // namespace detail
- template <class D, class V = VFromD<D>>
- HWY_API MFromD<D> Lt128(D d, V a, V b) {
- return MaskFromVec(detail::Lt128Vec(d, a, b));
- }
- template <class D, class V = VFromD<D>>
- HWY_API MFromD<D> Eq128(D d, V a, V b) {
- return MaskFromVec(detail::Eq128Vec(d, a, b));
- }
- template <class D, class V = VFromD<D>>
- HWY_API MFromD<D> Ne128(D d, V a, V b) {
- return MaskFromVec(detail::Ne128Vec(d, a, b));
- }
- template <class D, class V = VFromD<D>>
- HWY_API MFromD<D> Lt128Upper(D d, V a, V b) {
- return MaskFromVec(detail::Lt128UpperVec(d, a, b));
- }
- template <class D, class V = VFromD<D>>
- HWY_API MFromD<D> Eq128Upper(D d, V a, V b) {
- return MaskFromVec(detail::Eq128UpperVec(d, a, b));
- }
- template <class D, class V = VFromD<D>>
- HWY_API MFromD<D> Ne128Upper(D d, V a, V b) {
- return MaskFromVec(detail::Ne128UpperVec(d, a, b));
- }
- // ------------------------------ Min128, Max128 (Lt128)
- // Avoids the extra MaskFromVec in Lt128.
- template <class D, class V = VFromD<D>>
- HWY_API V Min128(D d, const V a, const V b) {
- return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
- }
- template <class D, class V = VFromD<D>>
- HWY_API V Max128(D d, const V a, const V b) {
- return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
- }
- template <class D, class V = VFromD<D>>
- HWY_API V Min128Upper(D d, const V a, const V b) {
- return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b);
- }
- template <class D, class V = VFromD<D>>
- HWY_API V Max128Upper(D d, const V a, const V b) {
- return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b);
- }
- // -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex
- #ifdef HWY_NATIVE_LEADING_ZERO_COUNT
- #undef HWY_NATIVE_LEADING_ZERO_COUNT
- #else
- #define HWY_NATIVE_LEADING_ZERO_COUNT
- #endif
- template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
- HWY_API V LeadingZeroCount(V v) {
- #if HWY_S390X_HAVE_Z14
- const DFromV<decltype(v)> d;
- const RebindToUnsigned<decltype(d)> du;
- #if HWY_COMPILER_GCC_ACTUAL && defined(__OPTIMIZE__)
- // Work around for GCC compiler bug in vec_cnttz on Z14/Z15 if v[i] is a
- // constant
- __asm__("" : "+v"(v.raw));
- #endif
- return BitCast(d, VFromD<decltype(du)>{vec_cntlz(BitCast(du, v).raw)});
- #else
- return V{vec_cntlz(v.raw)};
- #endif
- }
- template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
- HWY_API V HighestSetBitIndex(V v) {
- const DFromV<decltype(v)> d;
- using T = TFromD<decltype(d)>;
- return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
- }
- #if HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
- template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
- HWY_API V TrailingZeroCount(V v) {
- #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
- return V{vec_vctz(v.raw)};
- #else
- #if HWY_S390X_HAVE_Z14
- const DFromV<decltype(v)> d;
- const RebindToUnsigned<decltype(d)> du;
- #if HWY_COMPILER_GCC_ACTUAL && defined(__OPTIMIZE__)
- // Work around for GCC compiler bug in vec_cnttz on Z14/Z15 if v[i] is a
- // constant
- __asm__("" : "+v"(v.raw));
- #endif
- return BitCast(d, VFromD<decltype(du)>{vec_cnttz(BitCast(du, v).raw)});
- #else
- return V{vec_cnttz(v.raw)};
- #endif // HWY_S390X_HAVE_Z14
- #endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
- }
- #else
- template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
- HWY_API V TrailingZeroCount(V v) {
- const DFromV<decltype(v)> d;
- const RebindToSigned<decltype(d)> di;
- using TI = TFromD<decltype(di)>;
- const auto vi = BitCast(di, v);
- const auto lowest_bit = And(vi, Neg(vi));
- constexpr TI kNumOfBitsInT{sizeof(TI) * 8};
- const auto bit_idx = HighestSetBitIndex(lowest_bit);
- return BitCast(d, IfThenElse(MaskFromVec(BroadcastSignBit(bit_idx)),
- Set(di, kNumOfBitsInT), bit_idx));
- }
- #endif
- #undef HWY_PPC_HAVE_9
- #undef HWY_PPC_HAVE_10
- #undef HWY_S390X_HAVE_Z14
- #undef HWY_S390X_HAVE_Z15
- // NOLINTNEXTLINE(google-readability-namespace-comments)
- } // namespace HWY_NAMESPACE
- } // namespace hwy
- HWY_AFTER_NAMESPACE();
|