  1. // Copyright 2020 Google LLC
  2. // SPDX-License-Identifier: Apache-2.0
  3. //
  4. // Licensed under the Apache License, Version 2.0 (the "License");
  5. // you may not use this file except in compliance with the License.
  6. // You may obtain a copy of the License at
  7. //
  8. // http://www.apache.org/licenses/LICENSE-2.0
  9. //
  10. // Unless required by applicable law or agreed to in writing, software
  11. // distributed under the License is distributed on an "AS IS" BASIS,
  12. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. // See the License for the specific language governing permissions and
  14. // limitations under the License.
  15. #ifndef HIGHWAY_HWY_BASE_H_
  16. #define HIGHWAY_HWY_BASE_H_
  17. // Target-independent definitions.
  18. // IWYU pragma: begin_exports
  19. #include <stddef.h>
  20. #include <stdint.h>
  21. #include "hwy/detect_compiler_arch.h"
  22. #include "hwy/highway_export.h"
  23. // API version (https://semver.org/); keep in sync with CMakeLists.txt.
  24. #define HWY_MAJOR 1
  25. #define HWY_MINOR 2
  26. #define HWY_PATCH 0
  27. // True if the Highway version >= major.minor.0. Added in 1.2.0.
  28. #define HWY_VERSION_GE(major, minor) \
  29. (HWY_MAJOR > (major) || (HWY_MAJOR == (major) && HWY_MINOR >= (minor)))
  30. // True if the Highway version < major.minor.0. Added in 1.2.0.
  31. #define HWY_VERSION_LT(major, minor) \
  32. (HWY_MAJOR < (major) || (HWY_MAJOR == (major) && HWY_MINOR < (minor)))
  33. // "IWYU pragma: keep" does not work for these includes, so hide from the IDE.
  34. #if !HWY_IDE
  35. #if !defined(HWY_NO_LIBCXX)
  36. #ifndef __STDC_FORMAT_MACROS
  37. #define __STDC_FORMAT_MACROS // before inttypes.h
  38. #endif
  39. #include <inttypes.h>
  40. #endif
  41. #if (HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX)) || HWY_COMPILER_MSVC
  42. #include <atomic>
  43. #endif
  44. #endif // !HWY_IDE
  45. #ifndef HWY_HAVE_COMPARE_HEADER // allow override
  46. #define HWY_HAVE_COMPARE_HEADER 0
  47. #if defined(__has_include) // note: wrapper macro fails on Clang ~17
  48. #if __has_include(<compare>)
  49. #undef HWY_HAVE_COMPARE_HEADER
  50. #define HWY_HAVE_COMPARE_HEADER 1
  51. #endif // __has_include
  52. #endif // defined(__has_include)
  53. #endif // HWY_HAVE_COMPARE_HEADER
  54. #ifndef HWY_HAVE_CXX20_THREE_WAY_COMPARE // allow override
  55. #if !defined(HWY_NO_LIBCXX) && defined(__cpp_impl_three_way_comparison) && \
  56. __cpp_impl_three_way_comparison >= 201907L && HWY_HAVE_COMPARE_HEADER
  57. #include <compare>
  58. #define HWY_HAVE_CXX20_THREE_WAY_COMPARE 1
  59. #else
  60. #define HWY_HAVE_CXX20_THREE_WAY_COMPARE 0
  61. #endif
  62. #endif // HWY_HAVE_CXX20_THREE_WAY_COMPARE
  63. // IWYU pragma: end_exports
  64. #if HWY_COMPILER_MSVC
  65. #include <string.h> // memcpy
  66. #endif
  67. //------------------------------------------------------------------------------
  68. // Compiler-specific definitions
  69. #define HWY_STR_IMPL(macro) #macro
  70. #define HWY_STR(macro) HWY_STR_IMPL(macro)
  71. #if HWY_COMPILER_MSVC
  72. #include <intrin.h>
  73. #define HWY_FUNCTION __FUNCSIG__ // function name + template args
  74. #define HWY_RESTRICT __restrict
  75. #define HWY_INLINE __forceinline
  76. #define HWY_NOINLINE __declspec(noinline)
  77. #define HWY_FLATTEN
  78. #define HWY_NORETURN __declspec(noreturn)
  79. #define HWY_LIKELY(expr) (expr)
  80. #define HWY_UNLIKELY(expr) (expr)
  81. #define HWY_PRAGMA(tokens) __pragma(tokens)
  82. #define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
  83. #define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
  84. #define HWY_MAYBE_UNUSED
  85. #define HWY_HAS_ASSUME_ALIGNED 0
  86. #if (_MSC_VER >= 1700)
  87. #define HWY_MUST_USE_RESULT _Check_return_
  88. #else
  89. #define HWY_MUST_USE_RESULT
  90. #endif
  91. #else
  92. #define HWY_FUNCTION __PRETTY_FUNCTION__ // function name + template args
  93. #define HWY_RESTRICT __restrict__
  94. // force inlining without optimization enabled creates very inefficient code
  95. // that can cause compiler timeout
  96. #ifdef __OPTIMIZE__
  97. #define HWY_INLINE inline __attribute__((always_inline))
  98. #else
  99. #define HWY_INLINE inline
  100. #endif
  101. #define HWY_NOINLINE __attribute__((noinline))
  102. #define HWY_FLATTEN __attribute__((flatten))
  103. #define HWY_NORETURN __attribute__((noreturn))
  104. #define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
  105. #define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
  106. #define HWY_PRAGMA(tokens) _Pragma(#tokens)
  107. #define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
  108. #define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
  109. // Encountered "attribute list cannot appear here" when using the C++17
  110. // [[maybe_unused]], so only use the old style attribute for now.
  111. #define HWY_MAYBE_UNUSED __attribute__((unused))
  112. #define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))
  113. #endif // !HWY_COMPILER_MSVC
  114. //------------------------------------------------------------------------------
  115. // Builtin/attributes (no more #include after this point due to namespace!)
  116. namespace hwy {
  117. // Enables error-checking of format strings.
  118. #if HWY_HAS_ATTRIBUTE(__format__)
  119. #define HWY_FORMAT(idx_fmt, idx_arg) \
  120. __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
  121. #else
  122. #define HWY_FORMAT(idx_fmt, idx_arg)
  123. #endif
  124. // Returns a void* pointer which the compiler then assumes is N-byte aligned.
  125. // Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
  126. //
  127. // The assignment semantics are required by GCC/Clang. ICC provides an in-place
  128. // __assume_aligned, whereas MSVC's __assume appears unsuitable.
  129. #if HWY_HAS_BUILTIN(__builtin_assume_aligned)
  130. #define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
  131. #else
  132. #define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
  133. #endif
  134. // Returns a pointer whose type is `type` (T*), while allowing the compiler to
  135. // assume that the untyped pointer `ptr` is aligned to a multiple of sizeof(T).
  136. #define HWY_RCAST_ALIGNED(type, ptr) \
  137. reinterpret_cast<type>(HWY_ASSUME_ALIGNED((ptr), alignof(RemovePtr<type>)))
  138. // Clang and GCC require attributes on each function into which SIMD intrinsics
  139. // are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
  140. // automatic annotation via pragmas.
  141. #if HWY_COMPILER_ICC
  142. // As of ICC 2021.{1-9} the pragma is neither implemented nor required.
  143. #define HWY_PUSH_ATTRIBUTES(targets_str)
  144. #define HWY_POP_ATTRIBUTES
  145. #elif HWY_COMPILER_CLANG
  146. #define HWY_PUSH_ATTRIBUTES(targets_str) \
  147. HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
  148. apply_to = function))
  149. #define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
  150. #elif HWY_COMPILER_GCC_ACTUAL
  151. #define HWY_PUSH_ATTRIBUTES(targets_str) \
  152. HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
  153. #define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
  154. #else
  155. #define HWY_PUSH_ATTRIBUTES(targets_str)
  156. #define HWY_POP_ATTRIBUTES
  157. #endif
  158. //------------------------------------------------------------------------------
  159. // Macros
  160. #define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED
  161. #define HWY_CONCAT_IMPL(a, b) a##b
  162. #define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)
  163. #define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
  164. #define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))
  165. #if HWY_COMPILER_GCC_ACTUAL
  166. // nielskm: GCC does not support '#pragma GCC unroll' without the factor.
  167. #define HWY_UNROLL(factor) HWY_PRAGMA(GCC unroll factor)
  168. #define HWY_DEFAULT_UNROLL HWY_UNROLL(4)
  169. #elif HWY_COMPILER_CLANG || HWY_COMPILER_ICC || HWY_COMPILER_ICX
  170. #define HWY_UNROLL(factor) HWY_PRAGMA(unroll factor)
  171. #define HWY_DEFAULT_UNROLL HWY_UNROLL()
  172. #else
  173. #define HWY_UNROLL(factor)
  174. #define HWY_DEFAULT_UNROLL
  175. #endif
  176. // Tell a compiler that the expression always evaluates to true.
  177. // The expression should be free from any side effects.
  178. // Some older compilers may have trouble with complex expressions, therefore
  179. // it is advisable to split multiple conditions into separate assume statements,
  180. // and manually check the generated code.
  181. // OK but could fail:
  182. // HWY_ASSUME(x == 2 && y == 3);
  183. // Better:
  184. // HWY_ASSUME(x == 2);
  185. // HWY_ASSUME(y == 3);
  186. #if HWY_HAS_CPP_ATTRIBUTE(assume)
  187. #define HWY_ASSUME(expr) [[assume(expr)]]
  188. #elif HWY_COMPILER_MSVC || HWY_COMPILER_ICC
  189. #define HWY_ASSUME(expr) __assume(expr)
  190. // __builtin_assume() was added in clang 3.6.
  191. #elif HWY_COMPILER_CLANG && HWY_HAS_BUILTIN(__builtin_assume)
  192. #define HWY_ASSUME(expr) __builtin_assume(expr)
  193. // __builtin_unreachable() was added in GCC 4.5, but __has_builtin() was added
  194. // later, so check for the compiler version directly.
  195. #elif HWY_COMPILER_GCC_ACTUAL >= 405
  196. #define HWY_ASSUME(expr) \
  197. ((expr) ? static_cast<void>(0) : __builtin_unreachable())
  198. #else
  199. #define HWY_ASSUME(expr) static_cast<void>(0)
  200. #endif
  201. // Compile-time fence to prevent undesirable code reordering. On Clang x86, the
  202. // typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
  203. // does, without generating code.
  204. #if HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX)
  205. #define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
  206. #else
  207. // TODO(janwas): investigate alternatives. On Arm, the above generates barriers.
  208. #define HWY_FENCE
  209. #endif
  210. // 4 instances of a given literal value, useful as input to LoadDup128.
  211. #define HWY_REP4(literal) literal, literal, literal, literal
  212. HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
  213. Abort(const char* file, int line, const char* format, ...);
  214. #define HWY_ABORT(format, ...) \
  215. ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)
  216. // Always enabled.
  217. #define HWY_ASSERT(condition) \
  218. do { \
  219. if (!(condition)) { \
  220. HWY_ABORT("Assert %s", #condition); \
  221. } \
  222. } while (0)
  223. #if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER) || \
  224. defined(__SANITIZE_MEMORY__)
  225. #define HWY_IS_MSAN 1
  226. #else
  227. #define HWY_IS_MSAN 0
  228. #endif
  229. #if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER) || \
  230. defined(__SANITIZE_ADDRESS__)
  231. #define HWY_IS_ASAN 1
  232. #else
  233. #define HWY_IS_ASAN 0
  234. #endif
  235. #if HWY_HAS_FEATURE(hwaddress_sanitizer) || defined(HWADDRESS_SANITIZER) || \
  236. defined(__SANITIZE_HWADDRESS__)
  237. #define HWY_IS_HWASAN 1
  238. #else
  239. #define HWY_IS_HWASAN 0
  240. #endif
  241. #if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER) || \
  242. defined(__SANITIZE_THREAD__)
  243. #define HWY_IS_TSAN 1
  244. #else
  245. #define HWY_IS_TSAN 0
  246. #endif
  247. #if HWY_HAS_FEATURE(undefined_behavior_sanitizer) || \
  248. defined(UNDEFINED_BEHAVIOR_SANITIZER)
  249. #define HWY_IS_UBSAN 1
  250. #else
  251. #define HWY_IS_UBSAN 0
  252. #endif
  253. // MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo.
  254. // You can disable MSAN by adding this attribute to the function that fails.
  255. #if HWY_IS_MSAN
  256. #define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory))
  257. #else
  258. #define HWY_ATTR_NO_MSAN
  259. #endif
  260. // For enabling HWY_DASSERT and shortening tests in slower debug builds
  261. #if !defined(HWY_IS_DEBUG_BUILD)
  262. // Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
  263. // MSVC defines NDEBUG (if not, could instead check _DEBUG).
  264. #if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
  265. HWY_IS_HWASAN || HWY_IS_MSAN || HWY_IS_TSAN || HWY_IS_UBSAN || \
  266. defined(__clang_analyzer__)
  267. #define HWY_IS_DEBUG_BUILD 1
  268. #else
  269. #define HWY_IS_DEBUG_BUILD 0
  270. #endif
  271. #endif // HWY_IS_DEBUG_BUILD
  272. #if HWY_IS_DEBUG_BUILD
  273. #define HWY_DASSERT(condition) HWY_ASSERT(condition)
  274. #else
  275. #define HWY_DASSERT(condition) \
  276. do { \
  277. } while (0)
  278. #endif
  279. //------------------------------------------------------------------------------
  280. // CopyBytes / ZeroBytes
  281. #if HWY_COMPILER_MSVC
  282. #pragma intrinsic(memcpy)
  283. #pragma intrinsic(memset)
  284. #endif
  285. template <size_t kBytes, typename From, typename To>
  286. HWY_API void CopyBytes(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
  287. #if HWY_COMPILER_MSVC
  288. memcpy(to, from, kBytes);
  289. #else
  290. __builtin_memcpy(to, from, kBytes);
  291. #endif
  292. }
  293. HWY_API void CopyBytes(const void* HWY_RESTRICT from, void* HWY_RESTRICT to,
  294. size_t num_of_bytes_to_copy) {
  295. #if HWY_COMPILER_MSVC
  296. memcpy(to, from, num_of_bytes_to_copy);
  297. #else
  298. __builtin_memcpy(to, from, num_of_bytes_to_copy);
  299. #endif
  300. }
  301. // Same as CopyBytes, but for same-sized objects; avoids a size argument.
  302. template <typename From, typename To>
  303. HWY_API void CopySameSize(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
  304. static_assert(sizeof(From) == sizeof(To), "");
  305. CopyBytes<sizeof(From)>(from, to);
  306. }
  307. template <size_t kBytes, typename To>
  308. HWY_API void ZeroBytes(To* to) {
  309. #if HWY_COMPILER_MSVC
  310. memset(to, 0, kBytes);
  311. #else
  312. __builtin_memset(to, 0, kBytes);
  313. #endif
  314. }
  315. HWY_API void ZeroBytes(void* to, size_t num_bytes) {
  316. #if HWY_COMPILER_MSVC
  317. memset(to, 0, num_bytes);
  318. #else
  319. __builtin_memset(to, 0, num_bytes);
  320. #endif
  321. }
  322. //------------------------------------------------------------------------------
  323. // kMaxVectorSize (undocumented, pending removal)
#if HWY_ARCH_X86
// x86: the widest target (AVX-512) uses 64-byte vectors.
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64;  // AVX-512
#elif HWY_ARCH_RISCV && defined(__riscv_v_intrinsic) && \
    __riscv_v_intrinsic >= 11000
// Not actually an upper bound on the size.
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
#else
// All other targets: 128-bit vectors.
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
#endif
  333. //------------------------------------------------------------------------------
  334. // Alignment
  335. // Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
  336. // should be allocated dynamically via aligned_allocator.h because Lanes() may
  337. // exceed the stack size.
  338. #if HWY_ARCH_X86
  339. #define HWY_ALIGN_MAX alignas(64)
  340. #elif HWY_ARCH_RISCV && defined(__riscv_v_intrinsic) && \
  341. __riscv_v_intrinsic >= 11000
  342. #define HWY_ALIGN_MAX alignas(8) // only elements need be aligned
  343. #else
  344. #define HWY_ALIGN_MAX alignas(16)
  345. #endif
  346. //------------------------------------------------------------------------------
  347. // Lane types
  348. // hwy::float16_t and hwy::bfloat16_t are forward declared here to allow
  349. // BitCastScalar to be implemented before the implementations of the
  350. // hwy::float16_t and hwy::bfloat16_t types
// Forward declarations; the full definitions appear later so that
// BitCastScalar can be implemented first.
struct float16_t;
struct bfloat16_t;
using float32_t = float;
using float64_t = double;
// pack(1): ensure the structs below contain no padding between fields.
#pragma pack(push, 1)
// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it:
// https://reviews.llvm.org/D86310
struct alignas(16) uint128_t {
  uint64_t lo;  // little-endian layout
  uint64_t hi;
};
// 64 bit key plus 64 bit value. Faster than using uint128_t when only the key
// field is to be compared (Lt128Upper instead of Lt128).
struct alignas(16) K64V64 {
  uint64_t value;  // little-endian layout
  uint64_t key;
};
// 32 bit key plus 32 bit value. Allows vqsort recursions to terminate earlier
// than when considering both to be a 64-bit key.
struct alignas(8) K32V32 {
  uint32_t value;  // little-endian layout
  uint32_t key;
};
#pragma pack(pop)
  375. static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
  376. const uint128_t& b) {
  377. return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi;
  378. }
  379. // Required for std::greater.
  380. static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
  381. const uint128_t& b) {
  382. return b < a;
  383. }
  384. static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a,
  385. const uint128_t& b) {
  386. return a.lo == b.lo && a.hi == b.hi;
  387. }
  388. static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
  389. const K64V64& b) {
  390. return a.key < b.key;
  391. }
  392. // Required for std::greater.
  393. static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
  394. const K64V64& b) {
  395. return b < a;
  396. }
  397. static inline HWY_MAYBE_UNUSED bool operator==(const K64V64& a,
  398. const K64V64& b) {
  399. return a.key == b.key;
  400. }
  401. static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a,
  402. const K32V32& b) {
  403. return a.key < b.key;
  404. }
  405. // Required for std::greater.
  406. static inline HWY_MAYBE_UNUSED bool operator>(const K32V32& a,
  407. const K32V32& b) {
  408. return b < a;
  409. }
  410. static inline HWY_MAYBE_UNUSED bool operator==(const K32V32& a,
  411. const K32V32& b) {
  412. return a.key == b.key;
  413. }
  414. //------------------------------------------------------------------------------
  415. // Controlling overload resolution (SFINAE)
  416. template <bool Condition>
  417. struct EnableIfT {};
  418. template <>
  419. struct EnableIfT<true> {
  420. using type = void;
  421. };
  422. template <bool Condition>
  423. using EnableIf = typename EnableIfT<Condition>::type;
  424. template <typename T, typename U>
  425. struct IsSameT {
  426. enum { value = 0 };
  427. };
  428. template <typename T>
  429. struct IsSameT<T, T> {
  430. enum { value = 1 };
  431. };
  432. template <typename T, typename U>
  433. HWY_API constexpr bool IsSame() {
  434. return IsSameT<T, U>::value;
  435. }
  436. // Returns whether T matches either of U1 or U2
  437. template <typename T, typename U1, typename U2>
  438. HWY_API constexpr bool IsSameEither() {
  439. return IsSameT<T, U1>::value || IsSameT<T, U2>::value;
  440. }
  441. template <bool Condition, typename Then, typename Else>
  442. struct IfT {
  443. using type = Then;
  444. };
  445. template <class Then, class Else>
  446. struct IfT<false, Then, Else> {
  447. using type = Else;
  448. };
  449. template <bool Condition, typename Then, typename Else>
  450. using If = typename IfT<Condition, Then, Else>::type;
  451. template <typename T>
  452. struct IsConstT {
  453. enum { value = 0 };
  454. };
  455. template <typename T>
  456. struct IsConstT<const T> {
  457. enum { value = 1 };
  458. };
  459. template <typename T>
  460. HWY_API constexpr bool IsConst() {
  461. return IsConstT<T>::value;
  462. }
  463. template <class T>
  464. struct RemoveConstT {
  465. using type = T;
  466. };
  467. template <class T>
  468. struct RemoveConstT<const T> {
  469. using type = T;
  470. };
  471. template <class T>
  472. using RemoveConst = typename RemoveConstT<T>::type;
  473. template <class T>
  474. struct RemoveVolatileT {
  475. using type = T;
  476. };
  477. template <class T>
  478. struct RemoveVolatileT<volatile T> {
  479. using type = T;
  480. };
  481. template <class T>
  482. using RemoveVolatile = typename RemoveVolatileT<T>::type;
  483. template <class T>
  484. struct RemoveRefT {
  485. using type = T;
  486. };
  487. template <class T>
  488. struct RemoveRefT<T&> {
  489. using type = T;
  490. };
  491. template <class T>
  492. struct RemoveRefT<T&&> {
  493. using type = T;
  494. };
  495. template <class T>
  496. using RemoveRef = typename RemoveRefT<T>::type;
  497. template <class T>
  498. using RemoveCvRef = RemoveConst<RemoveVolatile<RemoveRef<T>>>;
  499. template <class T>
  500. struct RemovePtrT {
  501. using type = T;
  502. };
  503. template <class T>
  504. struct RemovePtrT<T*> {
  505. using type = T;
  506. };
  507. template <class T>
  508. struct RemovePtrT<const T*> {
  509. using type = T;
  510. };
  511. template <class T>
  512. struct RemovePtrT<volatile T*> {
  513. using type = T;
  514. };
  515. template <class T>
  516. struct RemovePtrT<const volatile T*> {
  517. using type = T;
  518. };
  519. template <class T>
  520. using RemovePtr = typename RemovePtrT<T>::type;
// Insert into template/function arguments to enable this overload only for
// vectors of exactly, at most (LE), or more than (GT) this many bytes.
//
// As an example, checking for a total size of 16 bytes will match both
// Simd<uint8_t, 16, 0> and Simd<uint8_t, 8, 1>.
#define HWY_IF_V_SIZE(T, kN, bytes) \
  hwy::EnableIf<kN * sizeof(T) == bytes>* = nullptr
#define HWY_IF_V_SIZE_LE(T, kN, bytes) \
  hwy::EnableIf<kN * sizeof(T) <= bytes>* = nullptr
#define HWY_IF_V_SIZE_GT(T, kN, bytes) \
  hwy::EnableIf<(kN * sizeof(T) > bytes)>* = nullptr
// Enable only for a given lane count (exact / LE / GT).
#define HWY_IF_LANES(kN, lanes) hwy::EnableIf<(kN == lanes)>* = nullptr
#define HWY_IF_LANES_LE(kN, lanes) hwy::EnableIf<(kN <= lanes)>* = nullptr
#define HWY_IF_LANES_GT(kN, lanes) hwy::EnableIf<(kN > lanes)>* = nullptr
// Lane-type category predicates. Note HWY_IF_SIGNED excludes all float types,
// whereas HWY_IF_NOT_UNSIGNED is merely the negation of HWY_IF_UNSIGNED.
#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
#define HWY_IF_NOT_UNSIGNED(T) hwy::EnableIf<hwy::IsSigned<T>()>* = nullptr
#define HWY_IF_SIGNED(T)                                    \
  hwy::EnableIf<hwy::IsSigned<T>() && !hwy::IsFloat<T>() && \
                !hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_FLOAT3264(T) hwy::EnableIf<hwy::IsFloat3264<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT3264(T) hwy::EnableIf<!hwy::IsFloat3264<T>()>* = nullptr
// "Special" floats are the 16-bit float types (see IsSpecialFloat below).
#define HWY_IF_SPECIAL_FLOAT(T) \
  hwy::EnableIf<hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_NOT_SPECIAL_FLOAT(T) \
  hwy::EnableIf<!hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_FLOAT_OR_SPECIAL(T) \
  hwy::EnableIf<hwy::IsFloat<T>() || hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T) \
  hwy::EnableIf<!hwy::IsFloat<T>() && !hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_INTEGER(T) hwy::EnableIf<hwy::IsInteger<T>()>* = nullptr
// Enable by lane size in bytes.
#define HWY_IF_T_SIZE(T, bytes) hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
#define HWY_IF_NOT_T_SIZE(T, bytes) \
  hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
// bit_array = 0x102 means 1 or 8 bytes. There is no NONE_OF because it sounds
// too similar. If you want the opposite of this (2 or 4 bytes), ask for those
// bits explicitly (0x14) instead of attempting to 'negate' 0x102.
#define HWY_IF_T_SIZE_ONE_OF(T, bit_array) \
  hwy::EnableIf<((size_t{1} << sizeof(T)) & (bit_array)) != 0>* = nullptr
#define HWY_IF_T_SIZE_LE(T, bytes) \
  hwy::EnableIf<(sizeof(T) <= (bytes))>* = nullptr
#define HWY_IF_T_SIZE_GT(T, bytes) \
  hwy::EnableIf<(sizeof(T) > (bytes))>* = nullptr
// Exact-type matches; cv-qualifiers and references on T are ignored.
#define HWY_IF_SAME(T, expected) \
  hwy::EnableIf<hwy::IsSame<hwy::RemoveCvRef<T>, expected>()>* = nullptr
#define HWY_IF_NOT_SAME(T, expected) \
  hwy::EnableIf<!hwy::IsSame<hwy::RemoveCvRef<T>, expected>()>* = nullptr
// One of two expected types
#define HWY_IF_SAME2(T, expected1, expected2)                              \
  hwy::EnableIf<                                                           \
      hwy::IsSameEither<hwy::RemoveCvRef<T>, expected1, expected2>()>* = \
      nullptr
#define HWY_IF_U8(T) HWY_IF_SAME(T, uint8_t)
#define HWY_IF_U16(T) HWY_IF_SAME(T, uint16_t)
#define HWY_IF_U32(T) HWY_IF_SAME(T, uint32_t)
#define HWY_IF_U64(T) HWY_IF_SAME(T, uint64_t)
#define HWY_IF_I8(T) HWY_IF_SAME(T, int8_t)
#define HWY_IF_I16(T) HWY_IF_SAME(T, int16_t)
#define HWY_IF_I32(T) HWY_IF_SAME(T, int32_t)
#define HWY_IF_I64(T) HWY_IF_SAME(T, int64_t)
#define HWY_IF_BF16(T) HWY_IF_SAME(T, hwy::bfloat16_t)
#define HWY_IF_NOT_BF16(T) HWY_IF_NOT_SAME(T, hwy::bfloat16_t)
#define HWY_IF_F16(T) HWY_IF_SAME(T, hwy::float16_t)
#define HWY_IF_NOT_F16(T) HWY_IF_NOT_SAME(T, hwy::float16_t)
#define HWY_IF_F32(T) HWY_IF_SAME(T, float)
#define HWY_IF_F64(T) HWY_IF_SAME(T, double)
// Use instead of HWY_IF_T_SIZE to avoid ambiguity with float16_t/float/double
// overloads.
#define HWY_IF_UI8(T) HWY_IF_SAME2(T, uint8_t, int8_t)
#define HWY_IF_UI16(T) HWY_IF_SAME2(T, uint16_t, int16_t)
#define HWY_IF_UI32(T) HWY_IF_SAME2(T, uint32_t, int32_t)
#define HWY_IF_UI64(T) HWY_IF_SAME2(T, uint64_t, int64_t)
// LANES is the number of lanes of type T in one 16-byte block (or fewer if
// the whole vector is smaller than a block).
#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
  hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
// Empty struct used as a size tag type.
template <size_t N>
struct SizeTag {};

// Helper for DeclVal(): computes T&& via a default template argument so that
// the transformation also works for types (e.g. void) to which a reference
// cannot be formed; in that case the second overload returns plain U.
template <class T>
class DeclValT {
 private:
  // Preferred overload (int argument): yields U&& when reference-collapsing
  // is possible.
  template <class U, class URef = U&&>
  static URef TryAddRValRef(int);
  // Fallback overload: yields U unchanged (e.g. for void).
  template <class U, class Arg>
  static U TryAddRValRef(Arg);

 public:
  using type = decltype(TryAddRValRef<T>(0));
  // Used by DeclVal()'s static_assert to reject evaluated use.
  enum { kDisableDeclValEvaluation = 1 };
};
// hwy::DeclVal<T>() can only be used in unevaluated contexts such as within an
// expression of a decltype specifier.
// hwy::DeclVal<T>() does not require that T have a public default constructor
template <class T>
HWY_API typename DeclValT<T>::type DeclVal() noexcept {
  // Intentionally no return statement: the static_assert fires on any
  // odr-use, mirroring std::declval's "evaluated use is ill-formed" rule.
  static_assert(!DeclValT<T>::kDisableDeclValEvaluation,
                "DeclVal() cannot be used in an evaluated context");
}
// IsArray<T>: true for arrays of known (T[N]) or unknown (T[]) bound.
template <class T>
struct IsArrayT {
  enum { value = 0 };
};
template <class T>
struct IsArrayT<T[]> {
  enum { value = 1 };
};
template <class T, size_t N>
struct IsArrayT<T[N]> {
  enum { value = 1 };
};
template <class T>
static constexpr bool IsArray() {
  return IsArrayT<T>::value;
}
// IsConvertible<From, To>: equivalent of std::is_convertible, implemented
// without <type_traits>. MSVC warns (C4180) about qualifiers applied to
// function types during the SFINAE probing below; suppress while defining.
#if HWY_COMPILER_MSVC
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4180, ignored "-Wignored-qualifiers")
#endif
template <class From, class To>
class IsConvertibleT {
 private:
  // Accepts an argument implicitly convertible to T.
  template <class T>
  static hwy::SizeTag<1> TestFuncWithToArg(T);
  // Preferred overload: well-formed only if DeclVal<T>() implicitly converts
  // to U (i.e. can be passed as the U argument of TestFuncWithToArg).
  template <class T, class U>
  static decltype(IsConvertibleT<T, U>::template TestFuncWithToArg<U>(
      DeclVal<T>()))
  TryConvTest(int);
  // Fallback when the conversion is ill-formed.
  template <class T, class U, class Arg>
  static hwy::SizeTag<0> TryConvTest(Arg);

 public:
  enum {
    // void -> void (with any cv) is convertible; otherwise To must not be an
    // array, must be returnable (or const-qualified, handled by the IsSame
    // disjunct), and the SFINAE probe must have selected the int overload.
    value = (IsSame<RemoveConst<RemoveVolatile<From>>, void>() &&
             IsSame<RemoveConst<RemoveVolatile<To>>, void>()) ||
            (!IsArray<To>() &&
             (IsSame<To, decltype(DeclVal<To>())>() ||
              !IsSame<const RemoveConst<To>, RemoveConst<To>>()) &&
             IsSame<decltype(TryConvTest<From, To>(0)), hwy::SizeTag<1>>())
  };
};
#if HWY_COMPILER_MSVC
HWY_DIAGNOSTICS(pop)
#endif

// Function-style wrapper around IsConvertibleT.
template <class From, class To>
HWY_API constexpr bool IsConvertible() {
  return IsConvertibleT<From, To>::value;
}
// IsStaticCastable<From, To>: true if static_cast<To>(DeclVal<From>()) is
// well-formed, detected via the usual int/fallback SFINAE overload pair.
template <class From, class To>
class IsStaticCastableT {
 private:
  template <class T, class U, class = decltype(static_cast<U>(DeclVal<T>()))>
  static hwy::SizeTag<1> TryStaticCastTest(int);
  template <class T, class U, class Arg>
  static hwy::SizeTag<0> TryStaticCastTest(Arg);

 public:
  enum {
    value = IsSame<decltype(TryStaticCastTest<From, To>(0)), hwy::SizeTag<1>>()
  };
};
template <class From, class To>
static constexpr bool IsStaticCastable() {
  return IsStaticCastableT<From, To>::value;
}
// Enable an overload only when static_cast<To>(From) is valid.
#define HWY_IF_CASTABLE(From, To) \
  hwy::EnableIf<IsStaticCastable<From, To>()>* = nullptr
// Enable only when the result of (Native op T) can be cast back to Native.
#define HWY_IF_OP_CASTABLE(op, T, Native) \
  HWY_IF_CASTABLE(decltype(DeclVal<Native>() op DeclVal<T>()), Native)
// IsAssignable<T, From>: true if DeclVal<T>() = DeclVal<From>() is
// well-formed (equivalent of std::is_assignable).
template <class T, class From>
class IsAssignableT {
 private:
  template <class T1, class T2, class = decltype(DeclVal<T1>() = DeclVal<T2>())>
  static hwy::SizeTag<1> TryAssignTest(int);
  template <class T1, class T2, class Arg>
  static hwy::SizeTag<0> TryAssignTest(Arg);

 public:
  enum {
    value = IsSame<decltype(TryAssignTest<T, From>(0)), hwy::SizeTag<1>>()
  };
};
template <class T, class From>
static constexpr bool IsAssignable() {
  return IsAssignableT<T, From>::value;
}
// Enable an overload only when T is assignable from From.
#define HWY_IF_ASSIGNABLE(T, From) \
  hwy::EnableIf<IsAssignable<T, From>()>* = nullptr
  704. // ----------------------------------------------------------------------------
  705. // IsSpecialFloat
  706. // These types are often special-cased and not supported in all ops.
  707. template <typename T>
  708. HWY_API constexpr bool IsSpecialFloat() {
  709. return IsSameEither<RemoveCvRef<T>, hwy::float16_t, hwy::bfloat16_t>();
  710. }
  711. // -----------------------------------------------------------------------------
  712. // IsIntegerLaneType and IsInteger
  713. template <class T>
  714. HWY_API constexpr bool IsIntegerLaneType() {
  715. return false;
  716. }
  717. template <>
  718. HWY_INLINE constexpr bool IsIntegerLaneType<int8_t>() {
  719. return true;
  720. }
  721. template <>
  722. HWY_INLINE constexpr bool IsIntegerLaneType<uint8_t>() {
  723. return true;
  724. }
  725. template <>
  726. HWY_INLINE constexpr bool IsIntegerLaneType<int16_t>() {
  727. return true;
  728. }
  729. template <>
  730. HWY_INLINE constexpr bool IsIntegerLaneType<uint16_t>() {
  731. return true;
  732. }
  733. template <>
  734. HWY_INLINE constexpr bool IsIntegerLaneType<int32_t>() {
  735. return true;
  736. }
  737. template <>
  738. HWY_INLINE constexpr bool IsIntegerLaneType<uint32_t>() {
  739. return true;
  740. }
  741. template <>
  742. HWY_INLINE constexpr bool IsIntegerLaneType<int64_t>() {
  743. return true;
  744. }
  745. template <>
  746. HWY_INLINE constexpr bool IsIntegerLaneType<uint64_t>() {
  747. return true;
  748. }
// IsInteger<T>: true for all built-in integral types (including bool, the
// char variants and the [u]intptr_t/size_t family), via the lane-type check
// plus explicit specializations for the remaining standard integer types.
template <class T>
HWY_API constexpr bool IsInteger() {
  // NOTE: Do not add a IsInteger<wchar_t>() specialization below as it is
  // possible for IsSame<wchar_t, uint16_t>() to be true when compiled with MSVC
  // with the /Zc:wchar_t- option.
  return IsIntegerLaneType<T>() || IsSame<RemoveCvRef<T>, wchar_t>() ||
         IsSameEither<RemoveCvRef<T>, size_t, ptrdiff_t>() ||
         IsSameEither<RemoveCvRef<T>, intptr_t, uintptr_t>();
}
template <>
HWY_INLINE constexpr bool IsInteger<bool>() {
  return true;
}
template <>
HWY_INLINE constexpr bool IsInteger<char>() {
  return true;
}
template <>
HWY_INLINE constexpr bool IsInteger<signed char>() {
  return true;
}
template <>
HWY_INLINE constexpr bool IsInteger<unsigned char>() {
  return true;
}
template <>
HWY_INLINE constexpr bool IsInteger<short>() {  // NOLINT
  return true;
}
template <>
HWY_INLINE constexpr bool IsInteger<unsigned short>() {  // NOLINT
  return true;
}
template <>
HWY_INLINE constexpr bool IsInteger<int>() {
  return true;
}
template <>
HWY_INLINE constexpr bool IsInteger<unsigned>() {
  return true;
}
template <>
HWY_INLINE constexpr bool IsInteger<long>() {  // NOLINT
  return true;
}
template <>
HWY_INLINE constexpr bool IsInteger<unsigned long>() {  // NOLINT
  return true;
}
template <>
HWY_INLINE constexpr bool IsInteger<long long>() {  // NOLINT
  return true;
}
template <>
HWY_INLINE constexpr bool IsInteger<unsigned long long>() {  // NOLINT
  return true;
}
// char8_t only exists in C++20 mode.
#if defined(__cpp_char8_t) && __cpp_char8_t >= 201811L
template <>
HWY_INLINE constexpr bool IsInteger<char8_t>() {
  return true;
}
#endif
template <>
HWY_INLINE constexpr bool IsInteger<char16_t>() {
  return true;
}
template <>
HWY_INLINE constexpr bool IsInteger<char32_t>() {
  return true;
}
  820. // -----------------------------------------------------------------------------
  821. // BitCastScalar
  822. #if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
  823. #define HWY_BITCASTSCALAR_CONSTEXPR constexpr
  824. #else
  825. #define HWY_BITCASTSCALAR_CONSTEXPR
  826. #endif
  827. #if __cpp_constexpr >= 201304L
  828. #define HWY_BITCASTSCALAR_CXX14_CONSTEXPR HWY_BITCASTSCALAR_CONSTEXPR
  829. #else
  830. #define HWY_BITCASTSCALAR_CXX14_CONSTEXPR
  831. #endif
  832. #if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
  833. namespace detail {
  834. template <class From>
  835. struct BitCastScalarSrcCastHelper {
  836. static HWY_INLINE constexpr const From& CastSrcValRef(const From& val) {
  837. return val;
  838. }
  839. };
  840. #if HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
  841. // Workaround for Clang 9 constexpr __builtin_bit_cast bug
  842. template <class To, class From,
  843. hwy::EnableIf<hwy::IsInteger<RemoveCvRef<To>>() &&
  844. hwy::IsInteger<RemoveCvRef<From>>()>* = nullptr>
  845. static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR To
  846. BuiltinBitCastScalar(const From& val) {
  847. static_assert(sizeof(To) == sizeof(From),
  848. "sizeof(To) == sizeof(From) must be true");
  849. return static_cast<To>(val);
  850. }
  851. template <class To, class From,
  852. hwy::EnableIf<!(hwy::IsInteger<RemoveCvRef<To>>() &&
  853. hwy::IsInteger<RemoveCvRef<From>>())>* = nullptr>
  854. static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR To
  855. BuiltinBitCastScalar(const From& val) {
  856. return __builtin_bit_cast(To, val);
  857. }
  858. #endif // HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
  859. } // namespace detail
  860. template <class To, class From, HWY_IF_NOT_SPECIAL_FLOAT(To)>
  861. HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
  862. // If From is hwy::float16_t or hwy::bfloat16_t, first cast val to either
  863. // const typename From::Native& or const uint16_t& using
  864. // detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef to
  865. // allow BitCastScalar from hwy::float16_t or hwy::bfloat16_t to be constexpr
  866. // if To is not a pointer type, union type, or a struct/class containing a
  867. // pointer, union, or reference subobject
  868. #if HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
  869. return detail::BuiltinBitCastScalar<To>(
  870. detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef(
  871. val));
  872. #else
  873. return __builtin_bit_cast(
  874. To, detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef(
  875. val));
  876. #endif
  877. }
  878. template <class To, class From, HWY_IF_SPECIAL_FLOAT(To)>
  879. HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
  880. // If To is hwy::float16_t or hwy::bfloat16_t, first do a BitCastScalar of val
  881. // to uint16_t, and then bit cast the uint16_t value to To using To::FromBits
  882. // as hwy::float16_t::FromBits and hwy::bfloat16_t::FromBits are guaranteed to
  883. // be constexpr if the __builtin_bit_cast intrinsic is available.
  884. return To::FromBits(BitCastScalar<uint16_t>(val));
  885. }
  886. #else
  887. template <class To, class From>
  888. HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
  889. To result;
  890. CopySameSize(&val, &result);
  891. return result;
  892. }
  893. #endif
  894. //------------------------------------------------------------------------------
  895. // F16 lane type
  896. #pragma pack(push, 1)
  897. // Compiler supports __fp16 and load/store/conversion NEON intrinsics, which are
  898. // included in Armv8 and VFPv4 (except with MSVC). On Armv7 Clang requires
  899. // __ARM_FP & 2 whereas Armv7 GCC requires -mfp16-format=ieee.
  900. #if (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC) || \
  901. (HWY_COMPILER_CLANG && defined(__ARM_FP) && (__ARM_FP & 2)) || \
  902. (HWY_COMPILER_GCC_ACTUAL && defined(__ARM_FP16_FORMAT_IEEE))
  903. #define HWY_NEON_HAVE_F16C 1
  904. #else
  905. #define HWY_NEON_HAVE_F16C 0
  906. #endif
  907. // RVV with f16 extension supports _Float16 and f16 vector ops. If set, implies
  908. // HWY_HAVE_FLOAT16.
  909. #if HWY_ARCH_RISCV && defined(__riscv_zvfh) && HWY_COMPILER_CLANG >= 1600
  910. #define HWY_RVV_HAVE_F16_VEC 1
  911. #else
  912. #define HWY_RVV_HAVE_F16_VEC 0
  913. #endif
  914. // x86 compiler supports _Float16, not necessarily with operators.
  915. // Avoid clang-cl because it lacks __extendhfsf2.
  916. #if HWY_ARCH_X86 && defined(__SSE2__) && defined(__FLT16_MAX__) && \
  917. ((HWY_COMPILER_CLANG >= 1500 && !HWY_COMPILER_CLANGCL) || \
  918. HWY_COMPILER_GCC_ACTUAL >= 1200)
  919. #define HWY_SSE2_HAVE_F16_TYPE 1
  920. #else
  921. #define HWY_SSE2_HAVE_F16_TYPE 0
  922. #endif
  923. #ifndef HWY_HAVE_SCALAR_F16_TYPE
  924. // Compiler supports _Float16, not necessarily with operators.
  925. #if HWY_NEON_HAVE_F16C || HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE
  926. #define HWY_HAVE_SCALAR_F16_TYPE 1
  927. #else
  928. #define HWY_HAVE_SCALAR_F16_TYPE 0
  929. #endif
  930. #endif // HWY_HAVE_SCALAR_F16_TYPE
  931. #ifndef HWY_HAVE_SCALAR_F16_OPERATORS
  932. // Recent enough compiler also has operators.
  933. #if HWY_HAVE_SCALAR_F16_TYPE && \
  934. (HWY_COMPILER_CLANG >= 1800 || HWY_COMPILER_GCC_ACTUAL >= 1200 || \
  935. (HWY_COMPILER_CLANG >= 1500 && !HWY_COMPILER_CLANGCL && \
  936. !defined(_WIN32)) || \
  937. (HWY_ARCH_ARM && \
  938. (HWY_COMPILER_CLANG >= 900 || HWY_COMPILER_GCC_ACTUAL >= 800)))
  939. #define HWY_HAVE_SCALAR_F16_OPERATORS 1
  940. #else
  941. #define HWY_HAVE_SCALAR_F16_OPERATORS 0
  942. #endif
  943. #endif // HWY_HAVE_SCALAR_F16_OPERATORS
namespace detail {
// Maps an arithmetic-operator operand type to the type actually used in the
// operation: special-float wrappers are "unwrapped" to their Native type by a
// specialization (defined after float16_t below); everything else passes
// through. The primary template has no `type`, so SFINAE rejects special
// floats until that specialization is visible.
template <class T, class TVal = RemoveCvRef<T>, bool = IsSpecialFloat<TVal>()>
struct SpecialFloatUnwrapArithOpOperandT {};
template <class T, class TVal>
struct SpecialFloatUnwrapArithOpOperandT<T, TVal, false> {
  using type = T;
};
template <class T>
using SpecialFloatUnwrapArithOpOperand =
    typename SpecialFloatUnwrapArithOpOperandT<T>::type;

// Inverse direction: maps a native special-float result type back to its
// wrapper (specialized after float16_t below); identity otherwise.
template <class T, class TVal = RemoveCvRef<T>>
struct NativeSpecialFloatToWrapperT {
  using type = T;
};
template <class T>
using NativeSpecialFloatToWrapper =
    typename NativeSpecialFloatToWrapperT<T>::type;
}  // namespace detail
// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
// by concatenating base type and bits. We use a wrapper class instead of a
// typedef to the native type to ensure that the same symbols, e.g. for VQSort,
// are generated regardless of F16 support; see #1684.
struct alignas(2) float16_t {
#if HWY_HAVE_SCALAR_F16_TYPE
#if HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE
  using Native = _Float16;
#elif HWY_NEON_HAVE_F16C
  using Native = __fp16;
#else
#error "Logic error: condition should be 'all but NEON_HAVE_F16C'"
#endif
#endif  // HWY_HAVE_SCALAR_F16_TYPE
  // Storage: the native f16 (when available) and its raw bit pattern share
  // the same two bytes.
  union {
#if HWY_HAVE_SCALAR_F16_TYPE
    // Accessed via NativeLaneType, and used directly if
    // HWY_HAVE_SCALAR_F16_OPERATORS.
    Native native;
#endif
    // Only accessed via NativeLaneType or U16LaneType.
    uint16_t bits;
  };
  // Default init and copying.
  float16_t() noexcept = default;
  constexpr float16_t(const float16_t&) noexcept = default;
  constexpr float16_t(float16_t&&) noexcept = default;
  float16_t& operator=(const float16_t&) noexcept = default;
  float16_t& operator=(float16_t&&) noexcept = default;
#if HWY_HAVE_SCALAR_F16_TYPE
  // NEON vget/set_lane intrinsics and SVE `svaddv` could use explicit
  // float16_t(intrinsic()), but user code expects implicit conversions.
  constexpr float16_t(Native arg) noexcept : native(arg) {}
  constexpr operator Native() const noexcept { return native; }
#endif
#if HWY_HAVE_SCALAR_F16_TYPE
  // Constructs a float16_t from its raw IEEE binary16 bit pattern.
  static HWY_BITCASTSCALAR_CONSTEXPR float16_t FromBits(uint16_t bits) {
    return float16_t(BitCastScalar<Native>(bits));
  }
#else
 private:
  // Tag to disambiguate the bits-constructor from value constructors.
  struct F16FromU16BitsTag {};
  constexpr float16_t(F16FromU16BitsTag /*tag*/, uint16_t u16_bits)
      : bits(u16_bits) {}

 public:
  static constexpr float16_t FromBits(uint16_t bits) {
    return float16_t(F16FromU16BitsTag(), bits);
  }
#endif
  // When backed by a native type, ensure the wrapper behaves like the native
  // type by forwarding all operators. Unfortunately it seems difficult to reuse
  // this code in a base class, so we repeat it in float16_t.
#if HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
  // Implicit conversion from any type implicitly convertible to Native
  // (excluding float16_t itself, which is handled by the copy ctor).
  template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, float16_t>() &&
                                      IsConvertible<T, Native>()>* = nullptr>
  constexpr float16_t(T&& arg) noexcept
      : native(static_cast<Native>(static_cast<T&&>(arg))) {}
  // Explicit conversion for types that are only static_cast-able to Native.
  template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, float16_t>() &&
                                      !IsConvertible<T, Native>() &&
                                      IsStaticCastable<T, Native>()>* = nullptr>
  explicit constexpr float16_t(T&& arg) noexcept
      : native(static_cast<Native>(static_cast<T&&>(arg))) {}
  // pre-decrement operator (--x)
  HWY_CXX14_CONSTEXPR float16_t& operator--() noexcept {
    native = static_cast<Native>(native - Native{1});
    return *this;
  }
  // post-decrement operator (x--)
  HWY_CXX14_CONSTEXPR float16_t operator--(int) noexcept {
    float16_t result = *this;
    native = static_cast<Native>(native - Native{1});
    return result;
  }
  // pre-increment operator (++x)
  HWY_CXX14_CONSTEXPR float16_t& operator++() noexcept {
    native = static_cast<Native>(native + Native{1});
    return *this;
  }
  // post-increment operator (x++)
  HWY_CXX14_CONSTEXPR float16_t operator++(int) noexcept {
    float16_t result = *this;
    native = static_cast<Native>(native + Native{1});
    return result;
  }
  constexpr float16_t operator-() const noexcept {
    return float16_t(static_cast<Native>(-native));
  }
  constexpr float16_t operator+() const noexcept { return *this; }
  // Reduce clutter by generating `operator+` and `operator+=` etc. Note that
  // we cannot token-paste `operator` and `+`, so pass it in as `op_func`.
  // Each expansion yields: f16 op f16, f16 op other (with special-float
  // operands unwrapped to Native and the result re-wrapped), compound-assign
  // from f16, and compound-assign from any type assignable back to Native.
#define HWY_FLOAT16_BINARY_OP(op, op_func, assign_func)                      \
  constexpr float16_t op_func(const float16_t& rhs) const noexcept {         \
    return float16_t(static_cast<Native>(native op rhs.native));             \
  }                                                                          \
  template <typename T, HWY_IF_NOT_F16(T),                                   \
            typename UnwrappedT =                                            \
                detail::SpecialFloatUnwrapArithOpOperand<const T&>,          \
            typename RawResultT =                                            \
                decltype(DeclVal<Native>() op DeclVal<UnwrappedT>()),        \
            typename ResultT =                                               \
                detail::NativeSpecialFloatToWrapper<RawResultT>,             \
            HWY_IF_CASTABLE(RawResultT, ResultT)>                            \
  constexpr ResultT op_func(const T& rhs) const noexcept(noexcept(           \
      static_cast<ResultT>(DeclVal<Native>() op DeclVal<UnwrappedT>()))) {   \
    return static_cast<ResultT>(native op static_cast<UnwrappedT>(rhs));     \
  }                                                                          \
  HWY_CXX14_CONSTEXPR hwy::float16_t& assign_func(                           \
      const hwy::float16_t& rhs) noexcept {                                  \
    native = static_cast<Native>(native op rhs.native);                      \
    return *this;                                                            \
  }                                                                          \
  template <typename T, HWY_IF_NOT_F16(T),                                   \
            HWY_IF_OP_CASTABLE(op, const T&, Native),                        \
            HWY_IF_ASSIGNABLE(                                               \
                Native, decltype(DeclVal<Native>() op DeclVal<const T&>()))> \
  HWY_CXX14_CONSTEXPR hwy::float16_t& assign_func(const T& rhs) noexcept(    \
      noexcept(                                                              \
          static_cast<Native>(DeclVal<Native>() op DeclVal<const T&>()))) {  \
    native = static_cast<Native>(native op rhs);                             \
    return *this;                                                            \
  }
  HWY_FLOAT16_BINARY_OP(+, operator+, operator+=)
  HWY_FLOAT16_BINARY_OP(-, operator-, operator-=)
  HWY_FLOAT16_BINARY_OP(*, operator*, operator*=)
  HWY_FLOAT16_BINARY_OP(/, operator/, operator/=)
#undef HWY_FLOAT16_BINARY_OP
#endif  // HWY_HAVE_SCALAR_F16_OPERATORS
};
static_assert(sizeof(hwy::float16_t) == 2, "Wrong size of float16_t");
// Specializations promised above: unwrap float16_t operands to Native for
// arithmetic, and map Native results back to the wrapper.
#if HWY_HAVE_SCALAR_F16_TYPE
namespace detail {
#if HWY_HAVE_SCALAR_F16_OPERATORS
template <class T>
struct SpecialFloatUnwrapArithOpOperandT<T, hwy::float16_t, true> {
  using type = hwy::float16_t::Native;
};
#endif
template <class T>
struct NativeSpecialFloatToWrapperT<T, hwy::float16_t::Native> {
  using type = hwy::float16_t;
};
}  // namespace detail
#endif  // HWY_HAVE_SCALAR_F16_TYPE
// Teach BitCastScalar to read the active union member of float16_t so the
// cast can be constexpr (reading an inactive member is not).
#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
namespace detail {
template <>
struct BitCastScalarSrcCastHelper<hwy::float16_t> {
#if HWY_HAVE_SCALAR_F16_TYPE
  static HWY_INLINE constexpr const hwy::float16_t::Native& CastSrcValRef(
      const hwy::float16_t& val) {
    return val.native;
  }
#else
  static HWY_INLINE constexpr const uint16_t& CastSrcValRef(
      const hwy::float16_t& val) {
    return val.bits;
  }
#endif
};
}  // namespace detail
#endif  // HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
// F16 conversion helpers are fully constexpr with native f16 operators;
// otherwise they need C++14 relaxed constexpr plus __builtin_bit_cast.
#if HWY_HAVE_SCALAR_F16_OPERATORS
#define HWY_F16_CONSTEXPR constexpr
#else
#define HWY_F16_CONSTEXPR HWY_BITCASTSCALAR_CXX14_CONSTEXPR
#endif  // HWY_HAVE_SCALAR_F16_OPERATORS
  1128. HWY_API HWY_F16_CONSTEXPR float F32FromF16(float16_t f16) {
  1129. #if HWY_HAVE_SCALAR_F16_OPERATORS && !HWY_IDE
  1130. return static_cast<float>(f16);
  1131. #endif
  1132. #if !HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
  1133. const uint16_t bits16 = BitCastScalar<uint16_t>(f16);
  1134. const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
  1135. const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
  1136. const uint32_t mantissa = bits16 & 0x3FF;
  1137. // Subnormal or zero
  1138. if (biased_exp == 0) {
  1139. const float subnormal =
  1140. (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
  1141. return sign ? -subnormal : subnormal;
  1142. }
  1143. // Normalized, infinity or NaN: convert the representation directly
  1144. // (faster than ldexp/tables).
  1145. const uint32_t biased_exp32 =
  1146. biased_exp == 31 ? 0xFF : biased_exp + (127 - 15);
  1147. const uint32_t mantissa32 = mantissa << (23 - 10);
  1148. const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
  1149. return BitCastScalar<float>(bits32);
  1150. #endif // !HWY_HAVE_SCALAR_F16_OPERATORS
  1151. }
// HWY_F16_FROM_F32_DASSERT: a HWY_DASSERT variant that is suppressed inside
// constant evaluation, because HWY_DASSERT's runtime machinery would make the
// constexpr F16FromF32 ill-formed when __builtin_bit_cast enables constexpr.
#if HWY_IS_DEBUG_BUILD && \
    (HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926)
#if defined(__cpp_if_consteval) && __cpp_if_consteval >= 202106L
// If C++23 if !consteval support is available, only execute
// HWY_DASSERT(condition) if F16FromF32 is not called from a constant-evaluated
// context to avoid compilation errors.
#define HWY_F16_FROM_F32_DASSERT(condition) \
  do {                                      \
    if !consteval {                         \
      HWY_DASSERT(condition);               \
    }                                       \
  } while (0)
#elif HWY_HAS_BUILTIN(__builtin_is_constant_evaluated) || \
    HWY_COMPILER_MSVC >= 1926
// If the __builtin_is_constant_evaluated() intrinsic is available,
// only do HWY_DASSERT(condition) if __builtin_is_constant_evaluated() returns
// false to avoid compilation errors if F16FromF32 is called from a
// constant-evaluated context.
#define HWY_F16_FROM_F32_DASSERT(condition)   \
  do {                                        \
    if (!__builtin_is_constant_evaluated()) { \
      HWY_DASSERT(condition);                 \
    }                                         \
  } while (0)
#else
// If C++23 if !consteval support is not available,
// the __builtin_is_constant_evaluated() intrinsic is not available,
// HWY_IS_DEBUG_BUILD is 1, and the __builtin_bit_cast intrinsic is available,
// do not do a HWY_DASSERT to avoid compilation errors if F16FromF32 is
// called from a constant-evaluated context.
#define HWY_F16_FROM_F32_DASSERT(condition) \
  do {                                      \
  } while (0)
#endif  // defined(__cpp_if_consteval) && __cpp_if_consteval >= 202106L
#else
// If HWY_IS_DEBUG_BUILD is 0 or the __builtin_bit_cast intrinsic is not
// available, define HWY_F16_FROM_F32_DASSERT(condition) as
// HWY_DASSERT(condition)
#define HWY_F16_FROM_F32_DASSERT(condition) HWY_DASSERT(condition)
#endif  // HWY_IS_DEBUG_BUILD && (HWY_HAS_BUILTIN(__builtin_bit_cast) ||
        // HWY_COMPILER_MSVC >= 1926)
// Narrows a float to IEEE binary16 with round-to-nearest-even, handling
// subnormals, overflow to infinity and NaN. Uses the native conversion when
// the compiler supports f16 arithmetic.
HWY_API HWY_F16_CONSTEXPR float16_t F16FromF32(float f32) {
#if HWY_HAVE_SCALAR_F16_OPERATORS && !HWY_IDE
  return float16_t(static_cast<float16_t::Native>(f32));
#endif
#if !HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
  const uint32_t bits32 = BitCastScalar<uint32_t>(f32);
  const uint32_t sign = bits32 >> 31;
  const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
  constexpr uint32_t kMantissaMask = 0x7FFFFF;
  const uint32_t mantissa32 = bits32 & kMantissaMask;
  // Before shifting (truncation), round to nearest even to reduce bias. If
  // the lowest remaining mantissa bit is odd, increase the offset. Example
  // with the lowest remaining bit (left) and next lower two bits; the
  // latter, plus two more, will be truncated.
  // 0[00] + 1 = 0[01]
  // 0[01] + 1 = 0[10]
  // 0[10] + 1 = 0[11] (round down toward even)
  // 0[11] + 1 = 1[00] (round up)
  // 1[00] + 10 = 1[10]
  // 1[01] + 10 = 1[11]
  // 1[10] + 10 = C0[00] (round up toward even with C=1 carry out)
  // 1[11] + 10 = C0[01] (round up toward even with C=1 carry out)
  // If |f32| >= 2^-24, f16_ulp_bit_idx is the index of the F32 mantissa bit
  // that will be shifted down into the ULP bit of the rounded down F16 result
  // The biased F32 exponent of 2^-14 (the smallest positive normal F16 value)
  // is 113, and bit 13 of the F32 mantissa will be shifted down to into the ULP
  // bit of the rounded down F16 result if |f32| >= 2^14
  // If |f32| < 2^-24, f16_ulp_bit_idx is equal to 24 as there are 24 mantissa
  // bits (including the implied 1 bit) in the mantissa of a normal F32 value
  // and as we want to round up the mantissa if |f32| > 2^-25 && |f32| < 2^-24
  const int32_t f16_ulp_bit_idx =
      HWY_MIN(HWY_MAX(126 - static_cast<int32_t>(biased_exp32), 13), 24);
  // 0x800000 restores the implied leading 1 before locating the ULP bit.
  const uint32_t odd_bit = ((mantissa32 | 0x800000u) >> f16_ulp_bit_idx) & 1;
  const uint32_t rounded =
      mantissa32 + odd_bit + (uint32_t{1} << (f16_ulp_bit_idx - 1)) - 1u;
  // Rounding may carry out of the mantissa field, bumping the exponent.
  const bool carry = rounded >= (1u << 23);
  const int32_t exp = static_cast<int32_t>(biased_exp32) - 127 + carry;
  // Tiny or zero => zero.
  if (exp < -24) {
    // restore original sign
    return float16_t::FromBits(static_cast<uint16_t>(sign << 15));
  }
  // If biased_exp16 would be >= 31, first check whether the input was NaN so we
  // can set the mantissa to nonzero.
  const bool is_nan = (biased_exp32 == 255) && mantissa32 != 0;
  const bool overflowed = exp >= 16;
  const uint32_t biased_exp16 =
      static_cast<uint32_t>(HWY_MIN(HWY_MAX(0, exp + 15), 31));
  // exp = [-24, -15] => subnormal, shift the mantissa.
  const uint32_t sub_exp = static_cast<uint32_t>(HWY_MAX(-14 - exp, 0));
  HWY_F16_FROM_F32_DASSERT(sub_exp < 11);
  const uint32_t shifted_mantissa =
      (rounded & kMantissaMask) >> (23 - 10 + sub_exp);
  // `leading` is the formerly-implicit 1 bit, now explicit for subnormals.
  const uint32_t leading = sub_exp == 0u ? 0u : (1024u >> sub_exp);
  const uint32_t mantissa16 = is_nan       ? 0x3FF
                              : overflowed ? 0u
                                           : (leading + shifted_mantissa);
#if HWY_IS_DEBUG_BUILD
  if (exp < -14) {
    HWY_F16_FROM_F32_DASSERT(biased_exp16 == 0);
    HWY_F16_FROM_F32_DASSERT(sub_exp >= 1);
  } else if (exp <= 15) {
    HWY_F16_FROM_F32_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
    HWY_F16_FROM_F32_DASSERT(sub_exp == 0);
  }
#endif
  HWY_F16_FROM_F32_DASSERT(mantissa16 < 1024);
  const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
  HWY_F16_FROM_F32_DASSERT(bits16 < 0x10000);
  const uint16_t narrowed = static_cast<uint16_t>(bits16);  // big-endian safe
  return float16_t::FromBits(narrowed);
#endif  // !HWY_HAVE_SCALAR_F16_OPERATORS
}
// Converts f64 to the nearest float16_t. Overall rounding is
// round-to-nearest-even; an intermediate round-to-odd step avoids the
// double-rounding error that a plain f64 -> f32 -> f16 chain could incur.
HWY_API HWY_F16_CONSTEXPR float16_t F16FromF64(double f64) {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  // Native F16: a single, correctly rounded conversion.
  return float16_t(static_cast<float16_t::Native>(f64));
#else
  // The mantissa bits of f64 are first rounded using round-to-odd rounding
  // to the nearest f64 value that has the lower 29 bits zeroed out to
  // ensure that the result is correctly rounded to a F16.
  // The F64 round-to-odd operation below will round a normal F64 value
  // (using round-to-odd rounding) to a F64 value that has 24 bits of precision.
  // It is okay if the magnitude of a denormal F64 value is rounded up in the
  // F64 round-to-odd step below as the magnitude of a denormal F64 value is
  // much smaller than 2^(-24) (the smallest positive denormal F16 value).
  // It is also okay if bit 29 of a NaN F64 value is changed by the F64
  // round-to-odd step below as the lower 13 bits of a F32 NaN value are usually
  // discarded or ignored by the conversion of a F32 NaN value to a F16.
  // If f64 is a NaN value, the result of the F64 round-to-odd step will be a
  // NaN value as the result of the F64 round-to-odd step will have at least one
  // mantissa bit if f64 is a NaN value.
  // The F64 round-to-odd step will ensure that the F64 to F32 conversion is
  // exact if the magnitude of the rounded F64 value (using round-to-odd
  // rounding) is between 2^(-126) (the smallest normal F32 value) and
  // HighestValue<float>() (the largest finite F32 value)
  // It is okay if the F64 to F32 conversion is inexact for F64 values that have
  // a magnitude that is less than 2^(-126) as the magnitude of a denormal F32
  // value is much smaller than 2^(-24) (the smallest positive denormal F16
  // value).
  // Implementation of round-to-odd: the AND clears the low 29 mantissa bits;
  // adding 0x1FFFFFFF carries into bit 29 exactly when any of bits [0, 28]
  // was nonzero, so the OR leaves bit 29 set whenever discarded bits were
  // nonzero ("sticky" bit) and unchanged otherwise.
  return F16FromF32(
      static_cast<float>(BitCastScalar<double>(static_cast<uint64_t>(
          (BitCastScalar<uint64_t>(f64) & 0xFFFFFFFFE0000000ULL) |
          ((BitCastScalar<uint64_t>(f64) + 0x000000001FFFFFFFULL) &
           0x0000000020000000ULL)))));
#endif
}
// More convenient to define outside float16_t because these may use
// F32FromF16, which is defined after the struct.
// Each comparison either uses the native F16 operator or widens both
// operands to float (exact, since every F16 is representable as F32). Both
// paths follow IEEE semantics: any ordered comparison involving NaN is
// false, and != with NaN is true.
HWY_F16_CONSTEXPR inline bool operator==(float16_t lhs,
                                         float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  return lhs.native == rhs.native;
#else
  return F32FromF16(lhs) == F32FromF16(rhs);
#endif
}
HWY_F16_CONSTEXPR inline bool operator!=(float16_t lhs,
                                         float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  return lhs.native != rhs.native;
#else
  return F32FromF16(lhs) != F32FromF16(rhs);
#endif
}
HWY_F16_CONSTEXPR inline bool operator<(float16_t lhs, float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  return lhs.native < rhs.native;
#else
  return F32FromF16(lhs) < F32FromF16(rhs);
#endif
}
HWY_F16_CONSTEXPR inline bool operator<=(float16_t lhs,
                                         float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  return lhs.native <= rhs.native;
#else
  return F32FromF16(lhs) <= F32FromF16(rhs);
#endif
}
HWY_F16_CONSTEXPR inline bool operator>(float16_t lhs, float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  return lhs.native > rhs.native;
#else
  return F32FromF16(lhs) > F32FromF16(rhs);
#endif
}
HWY_F16_CONSTEXPR inline bool operator>=(float16_t lhs,
                                         float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  return lhs.native >= rhs.native;
#else
  return F32FromF16(lhs) >= F32FromF16(rhs);
#endif
}
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
// Three-way comparison; returns std::partial_ordering because NaN operands
// compare unordered.
HWY_F16_CONSTEXPR inline std::partial_ordering operator<=>(
    float16_t lhs, float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
  return lhs.native <=> rhs.native;
#else
  return F32FromF16(lhs) <=> F32FromF16(rhs);
#endif
}
#endif  // HWY_HAVE_CXX20_THREE_WAY_COMPARE
  1357. //------------------------------------------------------------------------------
  1358. // BF16 lane type
  1359. // Compiler supports ACLE __bf16, not necessarily with operators.
  1360. // Disable the __bf16 type on AArch64 with GCC 13 or earlier as there is a bug
  1361. // in GCC 13 and earlier that sometimes causes BF16 constant values to be
  1362. // incorrectly loaded on AArch64, and this GCC bug on AArch64 is
  1363. // described at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111867.
  1364. #if HWY_ARCH_ARM_A64 && \
  1365. (HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400)
  1366. #define HWY_ARM_HAVE_SCALAR_BF16_TYPE 1
  1367. #else
  1368. #define HWY_ARM_HAVE_SCALAR_BF16_TYPE 0
  1369. #endif
  1370. // x86 compiler supports __bf16, not necessarily with operators.
  1371. #ifndef HWY_SSE2_HAVE_SCALAR_BF16_TYPE
  1372. #if HWY_ARCH_X86 && defined(__SSE2__) && \
  1373. ((HWY_COMPILER_CLANG >= 1700 && !HWY_COMPILER_CLANGCL) || \
  1374. HWY_COMPILER_GCC_ACTUAL >= 1300)
  1375. #define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 1
  1376. #else
  1377. #define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 0
  1378. #endif
  1379. #endif // HWY_SSE2_HAVE_SCALAR_BF16_TYPE
  1380. // Compiler supports __bf16, not necessarily with operators.
  1381. #if HWY_ARM_HAVE_SCALAR_BF16_TYPE || HWY_SSE2_HAVE_SCALAR_BF16_TYPE
  1382. #define HWY_HAVE_SCALAR_BF16_TYPE 1
  1383. #else
  1384. #define HWY_HAVE_SCALAR_BF16_TYPE 0
  1385. #endif
  1386. #ifndef HWY_HAVE_SCALAR_BF16_OPERATORS
  1387. // Recent enough compiler also has operators. aarch64 clang 18 hits internal
  1388. // compiler errors on bf16 ToString, hence only enable on GCC for now.
  1389. #if HWY_HAVE_SCALAR_BF16_TYPE && (HWY_COMPILER_GCC_ACTUAL >= 1300)
  1390. #define HWY_HAVE_SCALAR_BF16_OPERATORS 1
  1391. #else
  1392. #define HWY_HAVE_SCALAR_BF16_OPERATORS 0
  1393. #endif
  1394. #endif // HWY_HAVE_SCALAR_BF16_OPERATORS
  1395. #if HWY_HAVE_SCALAR_BF16_OPERATORS
  1396. #define HWY_BF16_CONSTEXPR constexpr
  1397. #else
  1398. #define HWY_BF16_CONSTEXPR HWY_BITCASTSCALAR_CONSTEXPR
  1399. #endif
// bfloat16: 1 sign, 8 exponent and 7 mantissa bits, i.e. the upper 16 bits
// of an IEEE binary32. Wraps the compiler's native __bf16 where available so
// the wrapper behaves like a builtin type; otherwise stores the raw bit
// pattern in a uint16_t.
struct alignas(2) bfloat16_t {
#if HWY_HAVE_SCALAR_BF16_TYPE
  using Native = __bf16;
#endif
  // Single storage location, viewed either as the native type or raw bits.
  union {
#if HWY_HAVE_SCALAR_BF16_TYPE
    // Accessed via NativeLaneType, and used directly if
    // HWY_HAVE_SCALAR_BF16_OPERATORS.
    Native native;
#endif
    // Only accessed via NativeLaneType or U16LaneType.
    uint16_t bits;
  };
  // Default init and copying
  bfloat16_t() noexcept = default;
  constexpr bfloat16_t(bfloat16_t&&) noexcept = default;
  constexpr bfloat16_t(const bfloat16_t&) noexcept = default;
  bfloat16_t& operator=(bfloat16_t&& arg) noexcept = default;
  bfloat16_t& operator=(const bfloat16_t& arg) noexcept = default;
  // Only enable implicit conversions if we have a native type.
#if HWY_HAVE_SCALAR_BF16_TYPE
  constexpr bfloat16_t(Native arg) noexcept : native(arg) {}
  constexpr operator Native() const noexcept { return native; }
#endif
  // FromBits reinterprets a 16-bit pattern as bfloat16_t (no numeric
  // conversion).
#if HWY_HAVE_SCALAR_BF16_TYPE
  static HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t FromBits(uint16_t bits) {
    return bfloat16_t(BitCastScalar<Native>(bits));
  }
#else
 private:
  // Tag type so the raw-bits constructor cannot be selected accidentally.
  struct BF16FromU16BitsTag {};
  constexpr bfloat16_t(BF16FromU16BitsTag /*tag*/, uint16_t u16_bits)
      : bits(u16_bits) {}

 public:
  static constexpr bfloat16_t FromBits(uint16_t bits) {
    return bfloat16_t(BF16FromU16BitsTag(), bits);
  }
#endif
  // When backed by a native type, ensure the wrapper behaves like the native
  // type by forwarding all operators. Unfortunately it seems difficult to reuse
  // this code in a base class, so we repeat it in float16_t.
#if HWY_HAVE_SCALAR_BF16_OPERATORS || HWY_IDE
  // Implicit construction from anything implicitly convertible to Native.
  template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, Native>() &&
                                      !IsSame<RemoveCvRef<T>, bfloat16_t>() &&
                                      IsConvertible<T, Native>()>* = nullptr>
  constexpr bfloat16_t(T&& arg) noexcept(
      noexcept(static_cast<Native>(DeclVal<T>())))
      : native(static_cast<Native>(static_cast<T&&>(arg))) {}
  // Explicit construction from anything only static_cast-able to Native.
  template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, Native>() &&
                                      !IsSame<RemoveCvRef<T>, bfloat16_t>() &&
                                      !IsConvertible<T, Native>() &&
                                      IsStaticCastable<T, Native>()>* = nullptr>
  explicit constexpr bfloat16_t(T&& arg) noexcept(
      noexcept(static_cast<Native>(DeclVal<T>())))
      : native(static_cast<Native>(static_cast<T&&>(arg))) {}
  HWY_CXX14_CONSTEXPR bfloat16_t& operator=(Native arg) noexcept {
    native = arg;
    return *this;
  }
  // pre-decrement operator (--x)
  HWY_CXX14_CONSTEXPR bfloat16_t& operator--() noexcept {
    native = static_cast<Native>(native - Native{1});
    return *this;
  }
  // post-decrement operator (x--)
  HWY_CXX14_CONSTEXPR bfloat16_t operator--(int) noexcept {
    bfloat16_t result = *this;
    native = static_cast<Native>(native - Native{1});
    return result;
  }
  // pre-increment operator (++x)
  HWY_CXX14_CONSTEXPR bfloat16_t& operator++() noexcept {
    native = static_cast<Native>(native + Native{1});
    return *this;
  }
  // post-increment operator (x++)
  HWY_CXX14_CONSTEXPR bfloat16_t operator++(int) noexcept {
    bfloat16_t result = *this;
    native = static_cast<Native>(native + Native{1});
    return result;
  }
  constexpr bfloat16_t operator-() const noexcept {
    return bfloat16_t(static_cast<Native>(-native));
  }
  constexpr bfloat16_t operator+() const noexcept { return *this; }

  // Reduce clutter by generating `operator+` and `operator+=` etc. Note that
  // we cannot token-paste `operator` and `+`, so pass it in as `op_func`.
  // For each arithmetic operator this emits: bf16 (op) bf16, bf16 (op) other
  // arithmetic type (result type follows the native promotion rules, mapped
  // back to the wrapper type), and the two matching compound assignments.
#define HWY_BFLOAT16_BINARY_OP(op, op_func, assign_func)                     \
  constexpr bfloat16_t op_func(const bfloat16_t& rhs) const noexcept {       \
    return bfloat16_t(static_cast<Native>(native op rhs.native));            \
  }                                                                          \
  template <typename T, HWY_IF_NOT_BF16(T),                                  \
            typename UnwrappedT =                                            \
                detail::SpecialFloatUnwrapArithOpOperand<const T&>,          \
            typename RawResultT =                                            \
                decltype(DeclVal<Native>() op DeclVal<UnwrappedT>()),        \
            typename ResultT =                                               \
                detail::NativeSpecialFloatToWrapper<RawResultT>,             \
            HWY_IF_CASTABLE(RawResultT, ResultT)>                            \
  constexpr ResultT op_func(const T& rhs) const noexcept(noexcept(           \
      static_cast<ResultT>(DeclVal<Native>() op DeclVal<UnwrappedT>()))) {   \
    return static_cast<ResultT>(native op static_cast<UnwrappedT>(rhs));     \
  }                                                                          \
  HWY_CXX14_CONSTEXPR hwy::bfloat16_t& assign_func(                          \
      const hwy::bfloat16_t& rhs) noexcept {                                 \
    native = static_cast<Native>(native op rhs.native);                      \
    return *this;                                                            \
  }                                                                          \
  template <typename T, HWY_IF_NOT_BF16(T),                                  \
            HWY_IF_OP_CASTABLE(op, const T&, Native),                        \
            HWY_IF_ASSIGNABLE(                                               \
                Native, decltype(DeclVal<Native>() op DeclVal<const T&>()))> \
  HWY_CXX14_CONSTEXPR hwy::bfloat16_t& assign_func(const T& rhs) noexcept(   \
      noexcept(                                                              \
          static_cast<Native>(DeclVal<Native>() op DeclVal<const T&>()))) {  \
    native = static_cast<Native>(native op rhs);                             \
    return *this;                                                            \
  }

  HWY_BFLOAT16_BINARY_OP(+, operator+, operator+=)
  HWY_BFLOAT16_BINARY_OP(-, operator-, operator-=)
  HWY_BFLOAT16_BINARY_OP(*, operator*, operator*=)
  HWY_BFLOAT16_BINARY_OP(/, operator/, operator/=)

#undef HWY_BFLOAT16_BINARY_OP
#endif  // HWY_HAVE_SCALAR_BF16_OPERATORS
};
static_assert(sizeof(hwy::bfloat16_t) == 2, "Wrong size of bfloat16_t");
// Restores the packing changed by the matching #pragma pack(push) earlier in
// this header.
#pragma pack(pop)
#if HWY_HAVE_SCALAR_BF16_TYPE
namespace detail {

#if HWY_HAVE_SCALAR_BF16_OPERATORS
// Mixed-type arithmetic on bfloat16_t unwraps the wrapper to its Native type
// (primary template defined earlier in this header).
template <class T>
struct SpecialFloatUnwrapArithOpOperandT<T, hwy::bfloat16_t, true> {
  using type = hwy::bfloat16_t::Native;
};
#endif

// Maps the native __bf16 result of an arithmetic op back to the wrapper.
template <class T>
struct NativeSpecialFloatToWrapperT<T, hwy::bfloat16_t::Native> {
  using type = hwy::bfloat16_t;
};

}  // namespace detail
#endif  // HWY_HAVE_SCALAR_BF16_TYPE

#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
namespace detail {

// Tells BitCastScalar which union member of bfloat16_t to read as the source
// of a bit cast: the native member if one exists, otherwise the raw bits.
template <>
struct BitCastScalarSrcCastHelper<hwy::bfloat16_t> {
#if HWY_HAVE_SCALAR_BF16_TYPE
  static HWY_INLINE constexpr const hwy::bfloat16_t::Native& CastSrcValRef(
      const hwy::bfloat16_t& val) {
    return val.native;
  }
#else
  static HWY_INLINE constexpr const uint16_t& CastSrcValRef(
      const hwy::bfloat16_t& val) {
    return val.bits;
  }
#endif
};

}  // namespace detail
#endif  // HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
// Widens a bfloat16_t to float. Always exact: BF16 is the upper 16 bits of a
// binary32, so appending 16 zero bits reproduces the value (including signed
// zero, infinities, and the upper NaN payload bits).
HWY_API HWY_BF16_CONSTEXPR float F32FromBF16(bfloat16_t bf) {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
  return static_cast<float>(bf);
#else
  return BitCastScalar<float>(static_cast<uint32_t>(
      static_cast<uint32_t>(BitCastScalar<uint16_t>(bf)) << 16));
#endif
}
namespace detail {

// Returns the increment to add to the bits of a finite F32 value to round a
// finite F32 to the nearest BF16 value.
// The increment is 0x7FFF plus the current ULP bit (bit 16), which after the
// caller's add-and-shift implements round-to-nearest with ties-to-even.
// For Inf/NaN inputs (biased exponent 255) the increment is 0 so the
// exponent field and payload are left untouched.
static HWY_INLINE HWY_MAYBE_UNUSED constexpr uint32_t F32BitsToBF16RoundIncr(
    const uint32_t f32_bits) {
  return static_cast<uint32_t>(((f32_bits & 0x7FFFFFFFu) < 0x7F800000u)
                                   ? (0x7FFFu + ((f32_bits >> 16) & 1u))
                                   : 0u);
}

// Converts f32_bits (which is the bits of a F32 value) to BF16 bits,
// rounded to the nearest BF16 value
static HWY_INLINE HWY_MAYBE_UNUSED constexpr uint16_t F32BitsToBF16Bits(
    const uint32_t f32_bits) {
  // Round f32_bits to the nearest BF16 by first adding
  // F32BitsToBF16RoundIncr(f32_bits) to f32_bits and then right shifting
  // f32_bits + F32BitsToBF16RoundIncr(f32_bits) by 16
  // If f32_bits is the bit representation of a NaN F32 value, make sure that
  // bit 6 of the BF16 result is set to convert SNaN F32 values to QNaN BF16
  // values and to prevent NaN F32 values from being converted to an infinite
  // BF16 value
  // (the NaN test is `> 0x7F800000`: exponent all ones AND nonzero mantissa)
  return static_cast<uint16_t>(
      ((f32_bits + F32BitsToBF16RoundIncr(f32_bits)) >> 16) |
      (static_cast<uint32_t>((f32_bits & 0x7FFFFFFFu) > 0x7F800000u) << 6));
}

}  // namespace detail
// Converts f to the nearest bfloat16_t (round-to-nearest-even). NaN inputs
// yield NaN outputs: F32BitsToBF16Bits sets bit 6 (the BF16 quiet bit) for
// any NaN input.
HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF32(float f) {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
  return static_cast<bfloat16_t>(f);
#else
  return bfloat16_t::FromBits(
      detail::F32BitsToBF16Bits(BitCastScalar<uint32_t>(f)));
#endif
}
// Converts f64 to the nearest bfloat16_t. As in F16FromF64, an intermediate
// round-to-odd step avoids double rounding through float.
HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF64(double f64) {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
  return static_cast<bfloat16_t>(f64);
#else
  // The mantissa bits of f64 are first rounded using round-to-odd rounding
  // to the nearest f64 value that has the lower 38 bits zeroed out to
  // ensure that the result is correctly rounded to a BF16.
  // The F64 round-to-odd operation below will round a normal F64 value
  // (using round-to-odd rounding) to a F64 value that has 15 bits of precision.
  // It is okay if the magnitude of a denormal F64 value is rounded up in the
  // F64 round-to-odd step below as the magnitude of a denormal F64 value is
  // much smaller than 2^(-133) (the smallest positive denormal BF16 value).
  // It is also okay if bit 38 of a NaN F64 value is changed by the F64
  // round-to-odd step below as the lower 16 bits of a F32 NaN value are usually
  // discarded or ignored by the conversion of a F32 NaN value to a BF16.
  // If f64 is a NaN value, the result of the F64 round-to-odd step will be a
  // NaN value as the result of the F64 round-to-odd step will have at least one
  // mantissa bit if f64 is a NaN value.
  // The F64 round-to-odd step below will ensure that the F64 to F32 conversion
  // is exact if the magnitude of the rounded F64 value (using round-to-odd
  // rounding) is between 2^(-135) (one-fourth of the smallest positive denormal
  // BF16 value) and HighestValue<float>() (the largest finite F32 value).
  // If |f64| is less than 2^(-135), the magnitude of the result of the F64 to
  // F32 conversion is guaranteed to be less than or equal to 2^(-135), which
  // ensures that the F32 to BF16 conversion is correctly rounded, even if the
  // conversion of a rounded F64 value whose magnitude is less than 2^(-135)
  // to a F32 is inexact.
  // Implementation of round-to-odd: the AND clears the low 38 mantissa bits;
  // adding 0x3FFFFFFFFF carries into bit 38 exactly when any of bits [0, 37]
  // was nonzero, so the OR leaves bit 38 set whenever discarded bits were
  // nonzero ("sticky" bit) and unchanged otherwise.
  return BF16FromF32(
      static_cast<float>(BitCastScalar<double>(static_cast<uint64_t>(
          (BitCastScalar<uint64_t>(f64) & 0xFFFFFFC000000000ULL) |
          ((BitCastScalar<uint64_t>(f64) + 0x0000003FFFFFFFFFULL) &
           0x0000004000000000ULL)))));
#endif
}
// More convenient to define outside bfloat16_t because these may use
// F32FromBF16, which is defined after the struct.
// Each comparison either uses the native BF16 operator or widens both
// operands to float (exact, since every BF16 is representable as F32). Both
// paths follow IEEE semantics: any ordered comparison involving NaN is
// false, and != with NaN is true.
HWY_BF16_CONSTEXPR inline bool operator==(bfloat16_t lhs,
                                          bfloat16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
  return lhs.native == rhs.native;
#else
  return F32FromBF16(lhs) == F32FromBF16(rhs);
#endif
}
HWY_BF16_CONSTEXPR inline bool operator!=(bfloat16_t lhs,
                                          bfloat16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
  return lhs.native != rhs.native;
#else
  return F32FromBF16(lhs) != F32FromBF16(rhs);
#endif
}
HWY_BF16_CONSTEXPR inline bool operator<(bfloat16_t lhs,
                                         bfloat16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
  return lhs.native < rhs.native;
#else
  return F32FromBF16(lhs) < F32FromBF16(rhs);
#endif
}
HWY_BF16_CONSTEXPR inline bool operator<=(bfloat16_t lhs,
                                          bfloat16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
  return lhs.native <= rhs.native;
#else
  return F32FromBF16(lhs) <= F32FromBF16(rhs);
#endif
}
HWY_BF16_CONSTEXPR inline bool operator>(bfloat16_t lhs,
                                         bfloat16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
  return lhs.native > rhs.native;
#else
  return F32FromBF16(lhs) > F32FromBF16(rhs);
#endif
}
HWY_BF16_CONSTEXPR inline bool operator>=(bfloat16_t lhs,
                                          bfloat16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
  return lhs.native >= rhs.native;
#else
  return F32FromBF16(lhs) >= F32FromBF16(rhs);
#endif
}
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
// Three-way comparison; returns std::partial_ordering because NaN operands
// compare unordered.
HWY_BF16_CONSTEXPR inline std::partial_ordering operator<=>(
    bfloat16_t lhs, bfloat16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
  return lhs.native <=> rhs.native;
#else
  return F32FromBF16(lhs) <=> F32FromBF16(rhs);
#endif
}
#endif  // HWY_HAVE_CXX20_THREE_WAY_COMPARE
  1694. //------------------------------------------------------------------------------
  1695. // Type relations
namespace detail {

// Relations<T> maps a lane type T to the same-size unsigned/signed/float
// types (Unsigned, Signed, Float) and to the same-category type of twice or
// half the size (Wide, Narrow). An alias is omitted where no such type is
// defined here (e.g. no 128-bit signed/float types, no float narrower than
// float16_t). The enum members feed TypeTag()/IsFloatTag() below.
template <typename T>
struct Relations;
template <>
struct Relations<uint8_t> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
  using Wide = uint16_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int8_t> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
  using Wide = int16_t;
  enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
  using Wide = uint32_t;
  using Narrow = uint8_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
  using Wide = int32_t;
  using Narrow = int8_t;
  enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint32_t> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = uint64_t;
  using Narrow = uint16_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int32_t> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = int64_t;
  using Narrow = int16_t;
  enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint64_t> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Wide = uint128_t;
  using Narrow = uint32_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int64_t> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  // Note: no Wide alias; there is no signed 128-bit type here.
  using Narrow = int32_t;
  enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint128_t> {
  using Unsigned = uint128_t;
  using Narrow = uint64_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<float16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
  // Wide is float: promoting a lane doubles its size.
  using Wide = float;
  enum { is_signed = 1, is_float = 1, is_bf16 = 0 };
};
template <>
struct Relations<bfloat16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  // Note: no Float alias; is_bf16 distinguishes it from float16_t.
  using Wide = float;
  enum { is_signed = 1, is_float = 1, is_bf16 = 1 };
};
template <>
struct Relations<float> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = double;
  using Narrow = float16_t;
  enum { is_signed = 1, is_float = 1, is_bf16 = 0 };
};
template <>
struct Relations<double> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Narrow = float;
  enum { is_signed = 1, is_float = 1, is_bf16 = 0 };
};

// TypeFromSize<N> maps a size in bytes to the unsigned/signed/float lane
// types of that size; Float is omitted where none exists.
template <size_t N>
struct TypeFromSize;
template <>
struct TypeFromSize<1> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
};
template <>
struct TypeFromSize<2> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
};
template <>
struct TypeFromSize<4> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
};
template <>
struct TypeFromSize<8> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
};
template <>
struct TypeFromSize<16> {
  using Unsigned = uint128_t;
};

}  // namespace detail
// Aliases for types of a different category, but the same size.
template <typename T>
using MakeUnsigned = typename detail::Relations<T>::Unsigned;
template <typename T>
using MakeSigned = typename detail::Relations<T>::Signed;
template <typename T>
using MakeFloat = typename detail::Relations<T>::Float;

// Aliases for types of the same category, but different size.
template <typename T>
using MakeWide = typename detail::Relations<T>::Wide;
template <typename T>
using MakeNarrow = typename detail::Relations<T>::Narrow;

// Obtain type from its size [bytes].
template <size_t N>
using UnsignedFromSize = typename detail::TypeFromSize<N>::Unsigned;
template <size_t N>
using SignedFromSize = typename detail::TypeFromSize<N>::Signed;
template <size_t N>
using FloatFromSize = typename detail::TypeFromSize<N>::Float;

// Avoid confusion with SizeTag where the parameter is a lane size.
using UnsignedTag = SizeTag<0>;
using SignedTag = SizeTag<0x100>;  // integer
using FloatTag = SizeTag<0x200>;
using SpecialTag = SizeTag<0x300>;

// Returns one of the four tags above for T: is_signed+is_float+is_bf16 is 0
// for unsigned, 1 for signed integer, 2 for float, 3 for bf16 ("special").
template <typename T, class R = detail::Relations<T>>
constexpr auto TypeTag()
    -> hwy::SizeTag<((R::is_signed + R::is_float + R::is_bf16) << 8)> {
  return hwy::SizeTag<((R::is_signed + R::is_float + R::is_bf16) << 8)>();
}

// For when we only want to distinguish FloatTag from everything else.
using NonFloatTag = SizeTag<0x400>;
template <typename T, class R = detail::Relations<T>>
constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {
  return hwy::SizeTag<(R::is_float ? 0x200 : 0x400)>();
}
  1869. //------------------------------------------------------------------------------
  1870. // Type traits
  1871. template <typename T>
  1872. HWY_API constexpr bool IsFloat3264() {
  1873. return IsSameEither<RemoveCvRef<T>, float, double>();
  1874. }
// Returns true if T is a floating-point lane type: float16_t, float or
// double. Note: returns false for bfloat16_t.
template <typename T>
HWY_API constexpr bool IsFloat() {
  // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
  // from a float, not compared. Include float16_t in case HWY_HAVE_FLOAT16=1.
  return IsSame<RemoveCvRef<T>, float16_t>() || IsFloat3264<T>();
}
  1881. template <typename T>
  1882. HWY_API constexpr bool IsSigned() {
  1883. return static_cast<T>(0) > static_cast<T>(-1);
  1884. }
// The cast-based test above cannot be applied to float16_t/bfloat16_t (they
// are not comparable unless native operators exist), hence specializations.
// Both carry a sign bit, so they are signed.
template <>
constexpr bool IsSigned<float16_t>() {
  return true;
}
template <>
constexpr bool IsSigned<bfloat16_t>() {
  return true;
}
// The 128-bit and key-value pair types are not signed.
template <>
constexpr bool IsSigned<hwy::uint128_t>() {
  return false;
}
template <>
constexpr bool IsSigned<hwy::K64V64>() {
  return false;
}
template <>
constexpr bool IsSigned<hwy::K32V32>() {
  return false;
}

// Maps an integer type that is not one of the fixed-width lane types to the
// lane type of the same size and signedness; any other T is left unchanged.
template <typename T, bool = IsInteger<T>() && !IsIntegerLaneType<T>()>
struct MakeLaneTypeIfIntegerT {
  using type = T;
};
template <typename T>
struct MakeLaneTypeIfIntegerT<T, true> {
  using type = hwy::If<IsSigned<T>(), SignedFromSize<sizeof(T)>,
                       UnsignedFromSize<sizeof(T)>>;
};
template <typename T>
using MakeLaneTypeIfInteger = typename MakeLaneTypeIfIntegerT<T>::type;
  1916. // Largest/smallest representable integer values.
  1917. template <typename T>
  1918. HWY_API constexpr T LimitsMax() {
  1919. static_assert(IsInteger<T>(), "Only for integer types");
  1920. using TU = UnsignedFromSize<sizeof(T)>;
  1921. return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~TU(0)) >> 1)
  1922. : static_cast<TU>(~TU(0)));
  1923. }
  1924. template <typename T>
  1925. HWY_API constexpr T LimitsMin() {
  1926. static_assert(IsInteger<T>(), "Only for integer types");
  1927. return IsSigned<T>() ? static_cast<T>(-1) - LimitsMax<T>()
  1928. : static_cast<T>(0);
  1929. }
// Largest/smallest representable value (integer or float). This naming avoids
// confusion with numeric_limits<float>::min() (the smallest positive value).
// Cannot be constexpr because we use CopySameSize for [b]float16_t.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T LowestValue() {
  // Generic case: integer types delegate to LimitsMin.
  return LimitsMin<T>();
}
// Floating-point specializations return the negative of the largest finite
// value (floats are symmetric about zero).
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t LowestValue<bfloat16_t>() {
  return bfloat16_t::FromBits(uint16_t{0xFF7Fu});  // -1.1111111 x 2^127
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t LowestValue<float16_t>() {
  return float16_t::FromBits(uint16_t{0xFBFFu});  // -1.1111111111 x 2^15
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float LowestValue<float>() {
  return -3.402823466e+38F;
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double LowestValue<double>() {
  return -1.7976931348623158e+308;
}
// Largest representable finite value; counterpart of LowestValue above.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T HighestValue() {
  // Generic case: integer types delegate to LimitsMax.
  return LimitsMax<T>();
}
// Floating-point specializations return the largest finite value.
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t HighestValue<bfloat16_t>() {
  return bfloat16_t::FromBits(uint16_t{0x7F7Fu});  // 1.1111111 x 2^127
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t HighestValue<float16_t>() {
  return float16_t::FromBits(uint16_t{0x7BFFu});  // 1.1111111111 x 2^15
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float HighestValue<float>() {
  return 3.402823466e+38F;
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double HighestValue<double>() {
  return 1.7976931348623158e+308;
}
// Difference between 1.0 and the next representable value. Equal to
// 1 / (1ULL << MantissaBits<T>()), but hard-coding ensures precision.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T Epsilon() {
  // Generic (integer) case: adjacent representable values differ by 1.
  return 1;
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t Epsilon<bfloat16_t>() {
  return bfloat16_t::FromBits(uint16_t{0x3C00u});  // 0.0078125 = 2^-7
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t Epsilon<float16_t>() {
  return float16_t::FromBits(uint16_t{0x1400u});  // 0.0009765625 = 2^-10
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float Epsilon<float>() {
  return 1.192092896e-7f;  // 2^-23
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double Epsilon<double>() {
  return 2.2204460492503131e-16;  // 2^-52
}
// Returns width in bits of the mantissa field in IEEE binary16/32/64.
// (The implicit leading 1 is not counted.)
template <typename T>
constexpr int MantissaBits() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
constexpr int MantissaBits<bfloat16_t>() {
  return 7;  // bfloat16: 1 sign + 8 exponent + 7 mantissa
}
template <>
constexpr int MantissaBits<float16_t>() {
  return 10;  // binary16: 1 sign + 5 exponent + 10 mantissa
}
template <>
constexpr int MantissaBits<float>() {
  return 23;  // binary32: 1 sign + 8 exponent + 23 mantissa
}
template <>
constexpr int MantissaBits<double>() {
  return 52;  // binary64: 1 sign + 11 exponent + 52 mantissa
}
// Returns the (left-shifted by one bit) IEEE binary16/32/64 representation with
// the largest possible (biased) exponent field. Used by IsInf.
// In two's complement, -(1 << (MantissaBits+1)) sets all bits from
// MantissaBits+1 upward, which is exactly ExponentMask << 1 (e.g. for float:
// -(1 << 24) == 0xFF000000 == 0x7F800000 << 1).
template <typename T>
constexpr MakeSigned<T> MaxExponentTimes2() {
  return -(MakeSigned<T>{1} << (MantissaBits<T>() + 1));
}
// Returns bitmask of the sign bit in IEEE binary16/32/64 (the MSB of the
// same-sized unsigned integer type).
template <typename T>
constexpr MakeUnsigned<T> SignMask() {
  return MakeUnsigned<T>{1} << (sizeof(T) * 8 - 1);
}
// Returns bitmask of the exponent field in IEEE binary16/32/64.
// `~(1 << M) + 1` is the two's-complement negation of (1 << MantissaBits),
// i.e. all bits at positions >= MantissaBits; clearing the sign bit leaves
// exactly the exponent field.
template <typename T>
constexpr MakeUnsigned<T> ExponentMask() {
  return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) &
         static_cast<MakeUnsigned<T>>(~SignMask<T>());
}
// Returns bitmask of the mantissa field in IEEE binary16/32/64: the low
// MantissaBits<T>() bits all set.
template <typename T>
constexpr MakeUnsigned<T> MantissaMask() {
  return (MakeUnsigned<T>{1} << MantissaBits<T>()) - 1;
}
// Returns 1 << mantissa_bits as a floating-point number. All integers whose
// absolute value are less than this can be represented exactly.
template <typename T>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T MantissaEnd() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t MantissaEnd<bfloat16_t>() {
  return bfloat16_t::FromBits(uint16_t{0x4300u});  // 1.0 x 2^7 = 128
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t MantissaEnd<float16_t>() {
  return float16_t::FromBits(uint16_t{0x6400u});  // 1.0 x 2^10 = 1024
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float MantissaEnd<float>() {
  return 8388608.0f;  // 1 << 23
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double MantissaEnd<double>() {
  // floating point literal with p52 requires C++17.
  return 4503599627370496.0;  // 1 << 52
}
// Returns width in bits of the exponent field in IEEE binary16/32/64
// (5 for binary16, 8 for binary32/bfloat16, 11 for binary64).
template <typename T>
constexpr int ExponentBits() {
  // Exponent := remaining bits after deducting sign and mantissa.
  return 8 * sizeof(T) - 1 - MantissaBits<T>();
}
// Returns largest value of the biased exponent field in IEEE binary16/32/64,
// right-shifted so that the LSB is bit zero. Example: 0xFF for float.
// This is expressed as a signed integer for more efficient comparison.
template <typename T>
constexpr MakeSigned<T> MaxExponentField() {
  return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
}
//------------------------------------------------------------------------------
// Additional F16/BF16 operators

// The macros below add mixed-type operator overloads between the f16/bf16
// wrapper types and integer or float/double operands. They operate on the
// wrapper's `.native` member, and results whose native type is a special
// float are wrapped back via detail::NativeSpecialFloatToWrapper.

#if HWY_HAVE_SCALAR_F16_OPERATORS || HWY_HAVE_SCALAR_BF16_OPERATORS

// Defines `op_func` for (integer or f32/f64) `op` T2, where T2 is a special
// float wrapper type; the wrapper-on-the-left direction is presumably
// provided elsewhere (e.g. by the wrapper class itself) - see its definition.
#define HWY_RHS_SPECIAL_FLOAT_ARITH_OP(op, op_func, T2)                       \
  template <                                                                  \
      typename T1,                                                            \
      hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T1>>() ||                      \
                    hwy::IsFloat3264<RemoveCvRef<T1>>()>* = nullptr,          \
      typename RawResultT = decltype(DeclVal<T1>() op DeclVal<T2::Native>()), \
      typename ResultT = detail::NativeSpecialFloatToWrapper<RawResultT>,     \
      HWY_IF_CASTABLE(RawResultT, ResultT)>                                   \
  static HWY_INLINE constexpr ResultT op_func(T1 a, T2 b) noexcept {          \
    return static_cast<ResultT>(a op b.native);                               \
  }

// For comparisons, defines both directions: (non-special op special) via the
// macro above, plus (special op non-special) below.
#define HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(op, op_func, T1)         \
  HWY_RHS_SPECIAL_FLOAT_ARITH_OP(op, op_func, T1)                             \
  template <                                                                  \
      typename T2,                                                            \
      hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T2>>() ||                      \
                    hwy::IsFloat3264<RemoveCvRef<T2>>()>* = nullptr,          \
      typename RawResultT = decltype(DeclVal<T1::Native>() op DeclVal<T2>()), \
      typename ResultT = detail::NativeSpecialFloatToWrapper<RawResultT>,     \
      HWY_IF_CASTABLE(RawResultT, ResultT)>                                   \
  static HWY_INLINE constexpr ResultT op_func(T1 a, T2 b) noexcept {          \
    return static_cast<ResultT>(a.native op b);                               \
  }

#if HWY_HAVE_SCALAR_F16_OPERATORS
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, float16_t)
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, float16_t)
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, float16_t)
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, float16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, float16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, float16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, float16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=, operator<=, float16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>, operator>, float16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>=, operator>=, float16_t)
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=>, operator<=>, float16_t)
#endif
#endif  // HWY_HAVE_SCALAR_F16_OPERATORS

#if HWY_HAVE_SCALAR_BF16_OPERATORS
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, bfloat16_t)
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, bfloat16_t)
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, bfloat16_t)
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, bfloat16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, bfloat16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, bfloat16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, bfloat16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=, operator<=, bfloat16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>, operator>, bfloat16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>=, operator>=, bfloat16_t)
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=>, operator<=>, bfloat16_t)
#endif
#endif  // HWY_HAVE_SCALAR_BF16_OPERATORS

#undef HWY_RHS_SPECIAL_FLOAT_ARITH_OP
#undef HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP

#endif  // HWY_HAVE_SCALAR_F16_OPERATORS || HWY_HAVE_SCALAR_BF16_OPERATORS
//------------------------------------------------------------------------------
// Type conversions (after IsSpecialFloat)

// Loads a float16_t from the 2 bytes at `ptr` (which must be 2-byte aligned)
// and widens it to float.
HWY_API float F32FromF16Mem(const void* ptr) {
  float16_t f16;
  CopyBytes<2>(HWY_ASSUME_ALIGNED(ptr, 2), &f16);
  return F32FromF16(f16);
}
// Loads a bfloat16_t from the 2 bytes at `ptr` (which must be 2-byte aligned)
// and widens it to float.
HWY_API float F32FromBF16Mem(const void* ptr) {
  bfloat16_t bf;
  CopyBytes<2>(HWY_ASSUME_ALIGNED(ptr, 2), &bf);
  return F32FromBF16(bf);
}
// bf16 <-> f16 conversions go through float, so they are constexpr only when
// both widening and narrowing are.
#if HWY_HAVE_SCALAR_F16_OPERATORS
#define HWY_BF16_TO_F16_CONSTEXPR HWY_BF16_CONSTEXPR
#else
#define HWY_BF16_TO_F16_CONSTEXPR HWY_F16_CONSTEXPR
#endif

// For casting from TFrom to TTo. Overloads below cover every combination of
// special (f16/bf16) and non-special types; the SFINAE constraints keep them
// mutually exclusive.

// Neither side is f16/bf16 and the types differ: plain static_cast.
template <typename TTo, typename TFrom, HWY_IF_NOT_SPECIAL_FLOAT(TTo),
          HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TTo, TFrom)>
HWY_API constexpr TTo ConvertScalarTo(const TFrom in) {
  return F16FromF32(static_cast<float>(in));
}
// To f16 from non-special types other than double: via float.
template <typename TTo, typename TFrom, HWY_IF_F16(TTo),
          HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TFrom, double)>
HWY_API constexpr TTo ConvertScalarTo(const TFrom in) {
  return F16FromF32(static_cast<float>(in));
}
// To f16 from bf16: widen to float, then narrow.
template <typename TTo, HWY_IF_F16(TTo)>
HWY_API HWY_BF16_TO_F16_CONSTEXPR TTo
ConvertScalarTo(const hwy::bfloat16_t in) {
  return F16FromF32(F32FromBF16(in));
}
// To f16 from double: dedicated F64 path (presumably to avoid the double
// rounding a float intermediate could introduce - see F16FromF64).
template <typename TTo, HWY_IF_F16(TTo)>
HWY_API HWY_F16_CONSTEXPR TTo ConvertScalarTo(const double in) {
  return F16FromF64(in);
}
// To bf16 from non-special types other than double: via float.
template <typename TTo, typename TFrom, HWY_IF_BF16(TTo),
          HWY_IF_NOT_SPECIAL_FLOAT(TFrom), HWY_IF_NOT_SAME(TFrom, double)>
HWY_API HWY_BF16_CONSTEXPR TTo ConvertScalarTo(const TFrom in) {
  return BF16FromF32(static_cast<float>(in));
}
// To bf16 from f16: widen to float, then narrow.
template <typename TTo, HWY_IF_BF16(TTo)>
HWY_API HWY_BF16_TO_F16_CONSTEXPR TTo ConvertScalarTo(const hwy::float16_t in) {
  return BF16FromF32(F32FromF16(in));
}
// To bf16 from double: dedicated F64 path.
template <typename TTo, HWY_IF_BF16(TTo)>
HWY_API HWY_BF16_CONSTEXPR TTo ConvertScalarTo(const double in) {
  return BF16FromF64(in);
}
// From f16 to any non-special type: widen to float, then convert.
template <typename TTo, typename TFrom, HWY_IF_F16(TFrom),
          HWY_IF_NOT_SPECIAL_FLOAT(TTo)>
HWY_API HWY_F16_CONSTEXPR TTo ConvertScalarTo(const TFrom in) {
  return static_cast<TTo>(F32FromF16(in));
}
// From bf16 to any non-special type: widen to float, then convert.
template <typename TTo, typename TFrom, HWY_IF_BF16(TFrom),
          HWY_IF_NOT_SPECIAL_FLOAT(TTo)>
HWY_API HWY_BF16_CONSTEXPR TTo ConvertScalarTo(TFrom in) {
  return static_cast<TTo>(F32FromBF16(in));
}
// Same type: return unchanged.
template <typename TTo>
HWY_API constexpr TTo ConvertScalarTo(TTo in) {
  return in;
}
  2200. //------------------------------------------------------------------------------
  2201. // Helper functions
  2202. template <typename T1, typename T2>
  2203. constexpr inline T1 DivCeil(T1 a, T2 b) {
  2204. return (a + b - 1) / b;
  2205. }
  2206. // Works for any `align`; if a power of two, compiler emits ADD+AND.
  2207. constexpr inline size_t RoundUpTo(size_t what, size_t align) {
  2208. return DivCeil(what, align) * align;
  2209. }
  2210. // Works for any `align`; if a power of two, compiler emits AND.
  2211. constexpr inline size_t RoundDownTo(size_t what, size_t align) {
  2212. return what - (what % align);
  2213. }
namespace detail {

// T is unsigned, or T is signed and (val >> shift_amt) is an arithmetic right
// shift: the native shift already does the right thing.
template <class T>
static HWY_INLINE constexpr T ScalarShr(hwy::UnsignedTag /*type_tag*/, T val,
                                        int shift_amt) {
  return static_cast<T>(val >> shift_amt);
}

// T is signed and (val >> shift_amt) is a non-arithmetic right shift.
// Emulates sign extension for negative values: complement, logical shift,
// complement again, so that ones are shifted in from the top.
template <class T>
static HWY_INLINE constexpr T ScalarShr(hwy::SignedTag /*type_tag*/, T val,
                                        int shift_amt) {
  using TU = MakeUnsigned<MakeLaneTypeIfInteger<T>>;
  return static_cast<T>(
      (val < 0) ? static_cast<TU>(
                      ~(static_cast<TU>(~static_cast<TU>(val)) >> shift_amt))
                : static_cast<TU>(static_cast<TU>(val) >> shift_amt));
}

}  // namespace detail
// If T is a signed integer type, ScalarShr is guaranteed to perform an
// arithmetic right shift.
// Otherwise, if T is an unsigned integer type, ScalarShr is guaranteed to
// perform a logical right shift.
template <class T, HWY_IF_INTEGER(RemoveCvRef<T>)>
HWY_API constexpr RemoveCvRef<T> ScalarShr(T val, int shift_amt) {
  using NonCvRefT = RemoveCvRef<T>;
  // Compile-time probe: shifting the most negative value right by
  // (bit width - 1) yields -1 iff the implementation's >> sign-extends.
  // If it does (or T is unsigned), dispatch to the plain-shift overload;
  // otherwise to the emulating SignedTag overload.
  // NOTE(review): SizeTag<0x100> appears to correspond to hwy::SignedTag and
  // SizeTag<0> to hwy::UnsignedTag (defined earlier in this header) - the
  // detail overloads above are selected via these aliases.
  return detail::ScalarShr(
      hwy::SizeTag<((IsSigned<NonCvRefT>() &&
                     (LimitsMin<NonCvRefT>() >> (sizeof(T) * 8 - 1)) !=
                         static_cast<NonCvRefT>(-1))
                        ? 0x100
                        : 0)>(),
      static_cast<NonCvRefT>(val), shift_amt);
}
// Returns the number of trailing zero bits (index of the lowest set bit).
// Undefined results for x == 0.
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
  HWY_DASSERT(x != 0);
#if HWY_COMPILER_MSVC
  unsigned long index;  // NOLINT
  _BitScanForward(&index, x);
  return index;
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_ctz(x));
#endif  // HWY_COMPILER_MSVC
}
// 64-bit variant of the above; undefined results for x == 0.
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
  HWY_DASSERT(x != 0);
#if HWY_COMPILER_MSVC
#if HWY_ARCH_X86_64
  unsigned long index;  // NOLINT
  _BitScanForward64(&index, x);
  return index;
#else   // HWY_ARCH_X86_64
  // _BitScanForward64 not available: scan the low half first; if it is zero,
  // the lowest set bit must be in the high half, offset by 32.
  uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
  unsigned long index;  // NOLINT
  if (lsb == 0) {
    uint32_t msb = static_cast<uint32_t>(x >> 32u);
    _BitScanForward(&index, msb);
    return 32 + index;
  } else {
    _BitScanForward(&index, lsb);
    return index;
  }
#endif  // HWY_ARCH_X86_64
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_ctzll(x));
#endif  // HWY_COMPILER_MSVC
}
// Returns the number of leading zero bits (31 minus the index of the highest
// set bit). Undefined results for x == 0.
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
  HWY_DASSERT(x != 0);
#if HWY_COMPILER_MSVC
  unsigned long index;  // NOLINT
  _BitScanReverse(&index, x);
  return 31 - index;
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_clz(x));
#endif  // HWY_COMPILER_MSVC
}
// 64-bit variant of the above; undefined results for x == 0.
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
  HWY_DASSERT(x != 0);
#if HWY_COMPILER_MSVC
#if HWY_ARCH_X86_64
  unsigned long index;  // NOLINT
  _BitScanReverse64(&index, x);
  return 63 - index;
#else   // HWY_ARCH_X86_64
  // _BitScanReverse64 not available: scan the high half first; if it is zero,
  // the highest set bit is in the low half (add 32 leading zeros: 63-index).
  const uint32_t msb = static_cast<uint32_t>(x >> 32u);
  unsigned long index;  // NOLINT
  if (msb == 0) {
    const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
    _BitScanReverse(&index, lsb);
    return 63 - index;
  } else {
    _BitScanReverse(&index, msb);
    return 31 - index;
  }
#endif  // HWY_ARCH_X86_64
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_clzll(x));
#endif  // HWY_COMPILER_MSVC
}
// Returns the number of set bits in x, for integer types of 1/2/4 bytes.
// Signed inputs are first converted to the same-sized unsigned type, then
// zero-extended to 32 bits.
template <class T, HWY_IF_INTEGER(RemoveCvRef<T>),
          HWY_IF_T_SIZE_ONE_OF(RemoveCvRef<T>, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API size_t PopCount(T x) {
  uint32_t u32_x = static_cast<uint32_t>(
      static_cast<UnsignedFromSize<sizeof(RemoveCvRef<T>)>>(x));
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
  return static_cast<size_t>(__builtin_popcountl(u32_x));
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
  return static_cast<size_t>(_mm_popcnt_u32(u32_x));
#else
  // Portable SWAR fallback: sum bits in 2/4/8-bit groups, then fold.
  u32_x -= ((u32_x >> 1) & 0x55555555u);
  u32_x = (((u32_x >> 2) & 0x33333333u) + (u32_x & 0x33333333u));
  u32_x = (((u32_x >> 4) + u32_x) & 0x0F0F0F0Fu);
  u32_x += (u32_x >> 8);
  u32_x += (u32_x >> 16);
  return static_cast<size_t>(u32_x & 0x3Fu);
#endif
}
// Returns the number of set bits in x, for 8-byte integer types.
template <class T, HWY_IF_INTEGER(RemoveCvRef<T>),
          HWY_IF_T_SIZE(RemoveCvRef<T>, 8)>
HWY_API size_t PopCount(T x) {
  uint64_t u64_x = static_cast<uint64_t>(
      static_cast<UnsignedFromSize<sizeof(RemoveCvRef<T>)>>(x));
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
  return static_cast<size_t>(__builtin_popcountll(u64_x));
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
  return _mm_popcnt_u64(u64_x);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
  // 32-bit MSVC lacks _mm_popcnt_u64: count each half separately.
  return _mm_popcnt_u32(static_cast<uint32_t>(u64_x & 0xFFFFFFFFu)) +
         _mm_popcnt_u32(static_cast<uint32_t>(u64_x >> 32));
#else
  // Portable SWAR fallback, 64-bit version of the 32-bit routine above.
  u64_x -= ((u64_x >> 1) & 0x5555555555555555ULL);
  u64_x = (((u64_x >> 2) & 0x3333333333333333ULL) +
           (u64_x & 0x3333333333333333ULL));
  u64_x = (((u64_x >> 4) + u64_x) & 0x0F0F0F0F0F0F0F0FULL);
  u64_x += (u64_x >> 8);
  u64_x += (u64_x >> 16);
  u64_x += (u64_x >> 32);
  return static_cast<size_t>(u64_x & 0x7Fu);
#endif
}
  2359. // Skip HWY_API due to GCC "function not considered for inlining". Previously
  2360. // such errors were caused by underlying type mismatches, but it's not clear
  2361. // what is still mismatched despite all the casts.
  2362. template <typename TI>
  2363. /*HWY_API*/ constexpr size_t FloorLog2(TI x) {
  2364. return x == TI{1}
  2365. ? 0
  2366. : static_cast<size_t>(FloorLog2(static_cast<TI>(x >> 1)) + 1);
  2367. }
  2368. template <typename TI>
  2369. /*HWY_API*/ constexpr size_t CeilLog2(TI x) {
  2370. return x == TI{1}
  2371. ? 0
  2372. : static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1);
  2373. }
// Adds `increment` to `t`. For integers, wraps around modulo 2^bits instead
// of invoking signed-overflow UB; for floating types it is plain addition.
template <typename T, typename T2, HWY_IF_FLOAT(T), HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment) {
  return t + static_cast<T>(increment);
}
// f16/bf16: widen to float, add, narrow back.
template <typename T, typename T2, HWY_IF_SPECIAL_FLOAT(T)>
HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment) {
  return ConvertScalarTo<T>(ConvertScalarTo<float>(t) +
                            ConvertScalarTo<float>(increment));
}
// Integers: add in a wide unsigned type, then mask back down to T's width.
template <typename T, typename T2, HWY_IF_NOT_FLOAT(T)>
HWY_INLINE constexpr T AddWithWraparound(T t, T2 n) {
  using TU = MakeUnsigned<T>;
  // Sub-int types would promote to int, not unsigned, which would trigger
  // warnings, so first promote to the largest unsigned type. Due to
  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87519, which affected GCC 8
  // until fixed in 9.3, we use built-in types rather than uint64_t.
  return static_cast<T>(static_cast<TU>(
      static_cast<unsigned long long>(static_cast<unsigned long long>(t) +
                                      static_cast<unsigned long long>(n)) &
      uint64_t{hwy::LimitsMax<TU>()}));
}
#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
#pragma intrinsic(_mul128)
#pragma intrinsic(_umul128)
#endif

// 64 x 64 = 128 bit multiplication. Returns the low 64 bits of a*b and
// stores the high 64 bits in *upper.
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
#if defined(__SIZEOF_INT128__)
  __uint128_t product = (__uint128_t)a * (__uint128_t)b;
  *upper = (uint64_t)(product >> 64);
  return (uint64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
  return _umul128(a, b, upper);
#else
  // Portable fallback: schoolbook multiplication of 32-bit halves.
  constexpr uint64_t kLo32 = 0xFFFFFFFFU;
  const uint64_t lo_lo = (a & kLo32) * (b & kLo32);
  const uint64_t hi_lo = (a >> 32) * (b & kLo32);
  const uint64_t lo_hi = (a & kLo32) * (b >> 32);
  const uint64_t hi_hi = (a >> 32) * (b >> 32);
  // Partial products contributing to bits [32, 96). The sum cannot overflow:
  // it is at most (2^32-1)^2 + 2*(2^32-1) = 2^64 - 1.
  const uint64_t t = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi;
  *upper = (hi_lo >> 32) + (t >> 32) + hi_hi;
  return (t << 32) | (lo_lo & kLo32);
#endif
}
// Signed 64 x 64 = 128 bit multiplication; returns the low 64 bits and
// stores the (signed) high 64 bits in *upper.
HWY_API int64_t Mul128(int64_t a, int64_t b, int64_t* HWY_RESTRICT upper) {
#if defined(__SIZEOF_INT128__)
  __int128_t product = (__int128_t)a * (__int128_t)b;
  *upper = (int64_t)(product >> 64);
  return (int64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
  return _mul128(a, b, upper);
#else
  // Reuse the unsigned multiply: the low half is identical; the signed high
  // half equals the unsigned high half minus b if a < 0 and minus a if b < 0
  // (ScalarShr(x, 63) yields an all-ones mask for negative x).
  uint64_t unsigned_upper;
  const int64_t lower = static_cast<int64_t>(Mul128(
      static_cast<uint64_t>(a), static_cast<uint64_t>(b), &unsigned_upper));
  *upper = static_cast<int64_t>(
      unsigned_upper -
      (static_cast<uint64_t>(ScalarShr(a, 63)) & static_cast<uint64_t>(b)) -
      (static_cast<uint64_t>(ScalarShr(b, 63)) & static_cast<uint64_t>(a)));
  return lower;
#endif
}
// Precomputation for fast n / divisor and n % divisor, where n is a variable
// and divisor is unchanging but unknown at compile-time.
// Replaces division with a multiply-high plus two shifts (round-up variant of
// Granlund & Montgomery, "Division by Invariant Integers using
// Multiplication").
class Divisor {
 public:
  explicit Divisor(uint32_t divisor) : divisor_(divisor) {
    // For divisor 0 or 1, keep the defaults mul_=1/shift1_=0/shift2_=0,
    // which make Divide(n) return n (identity for 1; 0 is not meaningful).
    if (divisor <= 1) return;

    // len satisfies 2^len < divisor <= 2^(len+1), i.e. ceil(log2(d)) - 1.
    const uint32_t len =
        static_cast<uint32_t>(31 - Num0BitsAboveMS1Bit_Nonzero32(divisor - 1));
    // mul_ = floor((2^(len+1) - divisor) * 2^32 / divisor) + 1.
    const uint64_t u_hi = (2ULL << len) - divisor;
    const uint32_t q = Truncate((u_hi << 32) / divisor);

    mul_ = q + 1;
    shift1_ = 1;
    shift2_ = len;
  }

  uint32_t GetDivisor() const { return divisor_; }

  // Returns n / divisor_.
  uint32_t Divide(uint32_t n) const {
    const uint64_t mul = mul_;
    // t = mulhi(mul_, n); the averaging step (t + (n - t) / 2) >> len
    // completes the multiply-shift without needing a 33-bit multiplier.
    const uint32_t t = Truncate((mul * n) >> 32);
    return (t + ((n - t) >> shift1_)) >> shift2_;
  }

  // Returns n % divisor_.
  uint32_t Remainder(uint32_t n) const { return n - (Divide(n) * divisor_); }

 private:
  // Low 32 bits of x.
  static uint32_t Truncate(uint64_t x) {
    return static_cast<uint32_t>(x & 0xFFFFFFFFu);
  }

  uint32_t divisor_;
  uint32_t mul_ = 1;
  uint32_t shift1_ = 0;
  uint32_t shift2_ = 0;
};
namespace detail {

// Float: clear the sign bit via the same-sized unsigned representation.
// Also turns -0 into +0 and preserves NaN payloads.
template <typename T>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T ScalarAbs(hwy::FloatTag /*tag*/,
                                                          T val) {
  using TU = MakeUnsigned<T>;
  return BitCastScalar<T>(
      static_cast<TU>(BitCastScalar<TU>(val) & (~SignMask<T>())));
}

// Special floats (f16/bf16): same layout assumption (sign bit is the MSB),
// so reuse the float implementation.
template <typename T>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
ScalarAbs(hwy::SpecialTag /*tag*/, T val) {
  return ScalarAbs(hwy::FloatTag(), val);
}

// Signed integer: negate via unsigned arithmetic to avoid signed-overflow UB
// for LimitsMin (whose "absolute value" wraps back to itself).
template <typename T>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
ScalarAbs(hwy::SignedTag /*tag*/, T val) {
  using TU = MakeUnsigned<T>;
  return (val < T{0}) ? static_cast<T>(TU{0} - static_cast<TU>(val)) : val;
}

// Unsigned: already non-negative.
template <typename T>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
ScalarAbs(hwy::UnsignedTag /*tag*/, T val) {
  return val;
}

}  // namespace detail
// Returns |val| for any arithmetic type (integer, float, f16/bf16), with
// well-defined results for -0 and LimitsMin. Dispatches on the type category.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR RemoveCvRef<T> ScalarAbs(T val) {
  using TVal = MakeLaneTypeIfInteger<
      detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
  return detail::ScalarAbs(hwy::TypeTag<TVal>(), static_cast<TVal>(val));
}
// Returns whether val is NaN: the bits of |val| exceed the exponent mask iff
// the exponent field is all-ones AND the mantissa is nonzero.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsNaN(T val) {
  using TF = detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>;
  using TU = MakeUnsigned<TF>;
  return (BitCastScalar<TU>(ScalarAbs(val)) > ExponentMask<TF>());
}
// Returns whether val is +/- infinity: shifting out the sign bit leaves
// exactly MaxExponentTimes2 (all-ones exponent, zero mantissa, shifted by 1).
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsInf(T val) {
  using TF = detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>;
  using TU = MakeUnsigned<TF>;
  return static_cast<TU>(BitCastScalar<TU>(static_cast<TF>(val)) << 1) ==
         static_cast<TU>(MaxExponentTimes2<TF>());
}
namespace detail {

// Float: finite iff the exponent field is not all-ones (which would indicate
// inf or NaN); checked via |val|'s bits being below the exponent mask.
template <typename T>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(
    hwy::FloatTag /*tag*/, T val) {
  using TU = MakeUnsigned<T>;
  return (BitCastScalar<TU>(hwy::ScalarAbs(val)) < ExponentMask<T>());
}

template <typename T>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(
    hwy::NonFloatTag /*tag*/, T /*val*/) {
  // Integer values are always finite
  return true;
}

}  // namespace detail
// Returns whether val is finite (neither inf nor NaN); always true for
// integer types.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(T val) {
  using TVal = MakeLaneTypeIfInteger<
      detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
  return detail::ScalarIsFinite(hwy::IsFloatTag<TVal>(),
                                static_cast<TVal>(val));
}
// Returns a value with the magnitude (exponent + mantissa bits) of `magn` and
// the sign bit of `sign`, like std::copysign.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR RemoveCvRef<T> ScalarCopySign(T magn,
                                                                  T sign) {
  using TF = RemoveCvRef<detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
  using TU = MakeUnsigned<TF>;
  return BitCastScalar<TF>(static_cast<TU>(
      (BitCastScalar<TU>(static_cast<TF>(magn)) & (~SignMask<TF>())) |
      (BitCastScalar<TU>(static_cast<TF>(sign)) & SignMask<TF>())));
}
// Returns whether the sign bit (MSB) of val is set: true for negative values,
// including -0.0 and negative NaN for floating types.
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarSignBit(T val) {
  using TVal = MakeLaneTypeIfInteger<
      detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
  using TU = MakeUnsigned<TVal>;
  return ((BitCastScalar<TU>(static_cast<TVal>(val)) & SignMask<TVal>()) != 0);
}
// Prevents the compiler from eliding the computations that led to "output".
#if HWY_ARCH_PPC && (HWY_COMPILER_GCC || HWY_COMPILER_CLANG) && \
    !defined(_SOFT_FLOAT)
// Workaround to avoid test failures on PPC if compiled with Clang: pin
// float/double operands in FP registers ("f"/"d" asm constraints) instead of
// the generic "r" (GPR) constraint used below.
template <class T, HWY_IF_F32(T)>
HWY_API void PreventElision(T&& output) {
  asm volatile("" : "+f"(output)::"memory");
}
template <class T, HWY_IF_F64(T)>
HWY_API void PreventElision(T&& output) {
  asm volatile("" : "+d"(output)::"memory");
}
template <class T, HWY_IF_NOT_FLOAT3264(T)>
HWY_API void PreventElision(T&& output) {
  asm volatile("" : "+r"(output)::"memory");
}
#else
template <class T>
HWY_API void PreventElision(T&& output) {
#if HWY_COMPILER_MSVC
  // MSVC does not support inline assembly anymore (and never supported GCC's
  // RTL constraints). Self-assignment with #pragma optimize("off") might be
  // expected to prevent elision, but it does not with MSVC 2015. Type-punning
  // with volatile pointers generates inefficient code on MSVC 2017.
  static std::atomic<RemoveCvRef<T>> sink;
  sink.store(output, std::memory_order_relaxed);
#else
  // Works by indicating to the compiler that "output" is being read and
  // modified. The +r constraint avoids unnecessary writes to memory, but only
  // works for built-in types (typically FuncOutput).
  asm volatile("" : "+r"(output) : : "memory");
#endif
}
#endif
  2583. } // namespace hwy
  2584. #endif // HIGHWAY_HWY_BASE_H_