set_macros-inl.h 20 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726
  1. // Copyright 2020 Google LLC
  2. // Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
  3. // SPDX-License-Identifier: Apache-2.0
  4. // SPDX-License-Identifier: BSD-3-Clause
  5. //
  6. // Licensed under the Apache License, Version 2.0 (the "License");
  7. // you may not use this file except in compliance with the License.
  8. // You may obtain a copy of the License at
  9. //
  10. // http://www.apache.org/licenses/LICENSE-2.0
  11. //
  12. // Unless required by applicable law or agreed to in writing, software
  13. // distributed under the License is distributed on an "AS IS" BASIS,
  14. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. // See the License for the specific language governing permissions and
  16. // limitations under the License.
  17. // Sets macros based on HWY_TARGET.
  18. // This include guard is toggled by foreach_target, so avoid the usual _H_
  19. // suffix to prevent copybara from renaming it.
  // The guard is flipped (defined <-> undefined) whenever its defined-state
  // equals that of HWY_TARGET_TOGGLE; after the flip the two states differ, so
  // this test stays false until one of them changes again.
  20. #if defined(HWY_SET_MACROS_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
  21. #ifdef HWY_SET_MACROS_PER_TARGET
  22. #undef HWY_SET_MACROS_PER_TARGET
  23. #else
  24. #define HWY_SET_MACROS_PER_TARGET
  25. #endif
  26. #endif // HWY_SET_MACROS_PER_TARGET
  27. #include "hwy/detect_compiler_arch.h" // IWYU: export
  28. #include "hwy/detect_targets.h" // IWYU: export
  // Clear the macros that are (re)defined further down for the current
  // HWY_TARGET, so that this header can set them afresh on every inclusion.
  29. #undef HWY_NAMESPACE
  30. #undef HWY_ALIGN
  31. #undef HWY_MAX_BYTES
  32. #undef HWY_LANES
  33. #undef HWY_HAVE_SCALABLE
  34. #undef HWY_HAVE_TUPLE
  35. #undef HWY_HAVE_INTEGER64
  36. #undef HWY_HAVE_FLOAT16
  37. #undef HWY_HAVE_FLOAT64
  38. #undef HWY_MEM_OPS_MIGHT_FAULT
  39. #undef HWY_NATIVE_FMA
  40. #undef HWY_NATIVE_DOT_BF16
  41. #undef HWY_CAP_GE256
  42. #undef HWY_CAP_GE512
  // Target-family flags: each HWY_TARGET_IS_* is 1 if HWY_TARGET shares at least
  // one bit with the corresponding HWY_ALL_* mask, otherwise 0. They are used
  // below to select the per-target settings for a whole family at once.
  43. #undef HWY_TARGET_IS_SVE
  44. #if HWY_TARGET & HWY_ALL_SVE
  45. #define HWY_TARGET_IS_SVE 1
  46. #else
  47. #define HWY_TARGET_IS_SVE 0
  48. #endif
  49. #undef HWY_TARGET_IS_NEON
  50. #if HWY_TARGET & HWY_ALL_NEON
  51. #define HWY_TARGET_IS_NEON 1
  52. #else
  53. #define HWY_TARGET_IS_NEON 0
  54. #endif
  55. #undef HWY_TARGET_IS_PPC
  56. #if HWY_TARGET & HWY_ALL_PPC
  57. #define HWY_TARGET_IS_PPC 1
  58. #else
  59. #define HWY_TARGET_IS_PPC 0
  60. #endif
  61. // HWY_HAVE_TUPLE is 1 on all targets except RVV when built with GCC older
  // than 14 or with any Clang (RVV requires GCC 14 or an upcoming Clang).
  62. #if HWY_TARGET == HWY_RVV && \
  63. ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400) || \
  64. (HWY_COMPILER_CLANG))
  65. #define HWY_HAVE_TUPLE 0
  66. #else
  67. #define HWY_HAVE_TUPLE 1
  68. #endif
  69. // For internal use (clamping/validating N for Simd<>): 1 for the SCALAR
  // target, 65536 for every other target.
  70. #undef HWY_MAX_N
  71. #if HWY_TARGET == HWY_SCALAR
  72. #define HWY_MAX_N 1
  73. #else
  74. #define HWY_MAX_N 65536
  75. #endif
  76. // For internal use (clamping kPow2 for Simd<>). The same value (3) is used
  // for every target.
  77. #undef HWY_MAX_POW2
  78. // For HWY_TARGET == HWY_RVV, LMUL <= 8. Even on other targets, we want to
  79. // support, say, Rebind<uint64_t, Simd<uint8_t, 1, 0>> d; whose kPow2 is also 3.
  80. // However, those other targets do not actually support multiple vectors, and
  81. // thus Lanes(d) must not exceed Lanes(ScalableTag<T>()).
  82. #define HWY_MAX_POW2 3
  83. // User-visible. Loose lower bound that guarantees HWY_MAX_BYTES >>
  84. // (-HWY_MIN_POW2) <= 1. Useful for terminating compile-time recursions.
  // -16 on RVV (where HWY_MAX_BYTES is 65536 = 2^16), -8 on all other targets.
  85. #undef HWY_MIN_POW2
  86. #if HWY_TARGET == HWY_RVV
  87. #define HWY_MIN_POW2 -16
  88. #else
  89. // Tighter bound for other targets, whose vectors are smaller, to potentially
  90. // save compile time.
  91. #define HWY_MIN_POW2 -8
  92. #endif // HWY_TARGET == HWY_RVV
  // Building blocks for the x86 values of HWY_TARGET_STR. Each string is a
  // comma-separated feature list formed by string-literal concatenation.
  // HWY_TARGET_STR itself is cleared here and set in the per-target chain below.
  93. #undef HWY_TARGET_STR
  // Optional feature groups: defining the matching HWY_DISABLE_* macro replaces
  // the group with an empty string, which removes it from every target string
  // that includes it.
  94. #if defined(HWY_DISABLE_PCLMUL_AES)
  95. #define HWY_TARGET_STR_PCLMUL_AES ""
  96. #else
  97. #define HWY_TARGET_STR_PCLMUL_AES ",pclmul,aes"
  98. #endif
  99. #if defined(HWY_DISABLE_BMI2_FMA)
  100. #define HWY_TARGET_STR_BMI2_FMA ""
  101. #else
  102. #define HWY_TARGET_STR_BMI2_FMA ",bmi,bmi2,fma"
  103. #endif
  104. #if defined(HWY_DISABLE_F16C)
  105. #define HWY_TARGET_STR_F16C ""
  106. #else
  107. #define HWY_TARGET_STR_F16C ",f16c"
  108. #endif
  109. #define HWY_TARGET_STR_SSE2 "sse2"
  110. #define HWY_TARGET_STR_SSSE3 "sse2,ssse3"
  111. #define HWY_TARGET_STR_SSE4 \
  112. HWY_TARGET_STR_SSSE3 ",sse4.1,sse4.2" HWY_TARGET_STR_PCLMUL_AES
  113. // Include previous targets, which are the half-vectors of the next target.
  114. #define HWY_TARGET_STR_AVX2 \
  115. HWY_TARGET_STR_SSE4 ",avx,avx2" HWY_TARGET_STR_BMI2_FMA HWY_TARGET_STR_F16C
  116. #define HWY_TARGET_STR_AVX3 \
  117. HWY_TARGET_STR_AVX2 ",avx512f,avx512cd,avx512vl,avx512dq,avx512bw"
  118. #define HWY_TARGET_STR_AVX3_DL \
  119. HWY_TARGET_STR_AVX3 \
  120. ",vpclmulqdq,avx512vbmi,avx512vbmi2,vaes,avx512vnni,avx512bitalg," \
  121. "avx512vpopcntdq,gfni"
  122. // Force-disable for compilers that do not properly support avx512bf16:
  // clang-cl, GCC before 10 and Clang before 9. Users may also define
  // HWY_AVX3_DISABLE_AVX512BF16 themselves.
  123. #if !defined(HWY_AVX3_DISABLE_AVX512BF16) && \
  124. (HWY_COMPILER_CLANGCL || \
  125. (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000) || \
  126. (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 900))
  127. #define HWY_AVX3_DISABLE_AVX512BF16
  128. #endif
  // AVX3_ZEN4 adds avx512bf16 to AVX3_DL unless it has been disabled above.
  129. #if !defined(HWY_AVX3_DISABLE_AVX512BF16)
  130. #define HWY_TARGET_STR_AVX3_ZEN4 HWY_TARGET_STR_AVX3_DL ",avx512bf16"
  131. #else
  132. #define HWY_TARGET_STR_AVX3_ZEN4 HWY_TARGET_STR_AVX3_DL
  133. #endif
  134. #define HWY_TARGET_STR_AVX3_SPR HWY_TARGET_STR_AVX3_ZEN4 ",avx512fp16"
  // Target strings for PPC8/PPC9/PPC10; each builds on the previous one.
  // Defining HWY_DISABLE_PPC8_CRYPTO removes ",crypto" from all of them.
  135. #if defined(HWY_DISABLE_PPC8_CRYPTO)
  136. #define HWY_TARGET_STR_PPC8_CRYPTO ""
  137. #else
  138. #define HWY_TARGET_STR_PPC8_CRYPTO ",crypto"
  139. #endif
  140. #define HWY_TARGET_STR_PPC8 \
  141. "altivec,vsx,power8-vector" HWY_TARGET_STR_PPC8_CRYPTO
  142. #define HWY_TARGET_STR_PPC9 HWY_TARGET_STR_PPC8 ",power9-vector"
  143. #if HWY_COMPILER_CLANG
  144. #define HWY_TARGET_STR_PPC10 HWY_TARGET_STR_PPC9 ",power10-vector"
  145. #else
  146. // See #1707 and https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102059#c35.
  147. // When the baseline is PPC 8 or 9, inlining functions such as PreventElision
  148. // into PPC10 code fails because PPC10 defaults to no-htm and is thus worse than
  149. // the baseline, which has htm. We cannot have pragma target on functions
  150. // outside HWY_NAMESPACE such as those in base.h. It would be possible for users
  151. // to set -mno-htm globally, but we can also work around this at the library
  152. // level by claiming that PPC10 still has HTM, thus avoiding the mismatch. This
  153. // seems to be safe because HTM uses builtins rather than modifying codegen, see
  154. // https://gcc.gnu.org/legacy-ml/gcc-patches/2013-07/msg00167.html.
  155. #define HWY_TARGET_STR_PPC10 HWY_TARGET_STR_PPC9 ",cpu=power10,htm"
  156. #endif
  // Target strings for the Z14 and Z15 targets; used by the Z14/Z15 branch of
  // the per-target chain below.
  157. #define HWY_TARGET_STR_Z14 "arch=z14"
  158. #define HWY_TARGET_STR_Z15 "arch=z15"
  159. // Before include guard so we redefine HWY_TARGET_STR on each include,
  160. // governed by the current HWY_TARGET.
  // The #if below opens the per-target chain; exactly one branch defines the
  // macros that were #undef'd near the top of this header.
  161. //-----------------------------------------------------------------------------
  162. // SSE2: HWY_MAX_BYTES is 16; no native FMA, no native BF16 dot product, no
  // float16; HWY_TARGET_STR is "sse2".
  163. #if HWY_TARGET == HWY_SSE2
  164. #define HWY_NAMESPACE N_SSE2
  165. #define HWY_ALIGN alignas(16)
  166. #define HWY_MAX_BYTES 16
  167. #define HWY_LANES(T) (16 / sizeof(T))
  168. #define HWY_HAVE_SCALABLE 0
  169. #define HWY_HAVE_INTEGER64 1
  170. #define HWY_HAVE_FLOAT16 0
  171. #define HWY_HAVE_FLOAT64 1
  172. #define HWY_MEM_OPS_MIGHT_FAULT 1
  173. #define HWY_NATIVE_FMA 0
  174. #define HWY_NATIVE_DOT_BF16 0
  175. #define HWY_CAP_GE256 0
  176. #define HWY_CAP_GE512 0
  177. #define HWY_TARGET_STR HWY_TARGET_STR_SSE2
  178. //-----------------------------------------------------------------------------
  179. // SSSE3: same macro values as the SSE2 branch; only the namespace and the
  // target string ("sse2,ssse3") differ.
  180. #elif HWY_TARGET == HWY_SSSE3
  181. #define HWY_NAMESPACE N_SSSE3
  182. #define HWY_ALIGN alignas(16)
  183. #define HWY_MAX_BYTES 16
  184. #define HWY_LANES(T) (16 / sizeof(T))
  185. #define HWY_HAVE_SCALABLE 0
  186. #define HWY_HAVE_INTEGER64 1
  187. #define HWY_HAVE_FLOAT16 0
  188. #define HWY_HAVE_FLOAT64 1
  189. #define HWY_MEM_OPS_MIGHT_FAULT 1
  190. #define HWY_NATIVE_FMA 0
  191. #define HWY_NATIVE_DOT_BF16 0
  192. #define HWY_CAP_GE256 0
  193. #define HWY_CAP_GE512 0
  194. #define HWY_TARGET_STR HWY_TARGET_STR_SSSE3
  195. //-----------------------------------------------------------------------------
  196. // SSE4: same macro values as the SSE2/SSSE3 branches; the target string adds
  // sse4.1, sse4.2 and (unless HWY_DISABLE_PCLMUL_AES is defined) pclmul, aes.
  197. #elif HWY_TARGET == HWY_SSE4
  198. #define HWY_NAMESPACE N_SSE4
  199. #define HWY_ALIGN alignas(16)
  200. #define HWY_MAX_BYTES 16
  201. #define HWY_LANES(T) (16 / sizeof(T))
  202. #define HWY_HAVE_SCALABLE 0
  203. #define HWY_HAVE_INTEGER64 1
  204. #define HWY_HAVE_FLOAT16 0
  205. #define HWY_HAVE_FLOAT64 1
  206. #define HWY_MEM_OPS_MIGHT_FAULT 1
  207. #define HWY_NATIVE_FMA 0
  208. #define HWY_NATIVE_DOT_BF16 0
  209. #define HWY_CAP_GE256 0
  210. #define HWY_CAP_GE512 0
  211. #define HWY_TARGET_STR HWY_TARGET_STR_SSE4
  212. //-----------------------------------------------------------------------------
  213. // AVX2: HWY_MAX_BYTES is 32 and HWY_CAP_GE256 is 1.
  214. #elif HWY_TARGET == HWY_AVX2
  215. #define HWY_NAMESPACE N_AVX2
  216. #define HWY_ALIGN alignas(32)
  217. #define HWY_MAX_BYTES 32
  218. #define HWY_LANES(T) (32 / sizeof(T))
  219. #define HWY_HAVE_SCALABLE 0
  220. #define HWY_HAVE_INTEGER64 1
  221. #define HWY_HAVE_FLOAT16 0
  222. #define HWY_HAVE_FLOAT64 1
  223. #define HWY_MEM_OPS_MIGHT_FAULT 1
  // FMA is only reported as native if "fma" is part of the target string, i.e.
  // HWY_DISABLE_BMI2_FMA is not defined.
  224. #ifdef HWY_DISABLE_BMI2_FMA
  225. #define HWY_NATIVE_FMA 0
  226. #else
  227. #define HWY_NATIVE_FMA 1
  228. #endif
  229. #define HWY_NATIVE_DOT_BF16 0
  230. #define HWY_CAP_GE256 1
  231. #define HWY_CAP_GE512 0
  232. #define HWY_TARGET_STR HWY_TARGET_STR_AVX2
  233. //-----------------------------------------------------------------------------
  234. // AVX3, AVX3_DL, AVX3_ZEN4, AVX3_SPR: shared settings (HWY_MAX_BYTES is 64),
  // followed by the per-target namespace and target string.
  235. #elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL || \
  236. HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR
  237. #define HWY_ALIGN alignas(64)
  238. #define HWY_MAX_BYTES 64
  239. #define HWY_LANES(T) (64 / sizeof(T))
  240. #define HWY_HAVE_SCALABLE 0
  241. #define HWY_HAVE_INTEGER64 1
  // float16 is only enabled for AVX3_SPR when compiling with GCC and a scalar
  // f16 type is available (HWY_HAVE_SCALAR_F16_TYPE).
  242. #if HWY_TARGET == HWY_AVX3_SPR && HWY_COMPILER_GCC_ACTUAL && \
  243. HWY_HAVE_SCALAR_F16_TYPE
  244. // TODO: enable F16 for AVX3_SPR target with Clang once compilation issues are
  245. // fixed
  246. #define HWY_HAVE_FLOAT16 1
  247. #else
  248. #define HWY_HAVE_FLOAT16 0
  249. #endif
  250. #define HWY_HAVE_FLOAT64 1
  251. #define HWY_MEM_OPS_MIGHT_FAULT 0
  252. #define HWY_NATIVE_FMA 1
  // NOTE(review): the <= comparison relies on the numeric ordering of the
  // HWY_TARGET constants defined elsewhere; presumably it selects AVX3_ZEN4 and
  // AVX3_SPR (whose target strings include avx512bf16) - confirm against
  // detect_targets.h.
  253. #if (HWY_TARGET <= HWY_AVX3_ZEN4) && !defined(HWY_AVX3_DISABLE_AVX512BF16)
  254. #define HWY_NATIVE_DOT_BF16 1
  255. #else
  256. #define HWY_NATIVE_DOT_BF16 0
  257. #endif
  258. #define HWY_CAP_GE256 1
  259. #define HWY_CAP_GE512 1
  260. #if HWY_TARGET == HWY_AVX3
  261. #define HWY_NAMESPACE N_AVX3
  262. #define HWY_TARGET_STR HWY_TARGET_STR_AVX3
  263. #elif HWY_TARGET == HWY_AVX3_DL
  264. #define HWY_NAMESPACE N_AVX3_DL
  265. #define HWY_TARGET_STR HWY_TARGET_STR_AVX3_DL
  266. #elif HWY_TARGET == HWY_AVX3_ZEN4
  267. #define HWY_NAMESPACE N_AVX3_ZEN4
  268. #define HWY_TARGET_STR HWY_TARGET_STR_AVX3_ZEN4
  269. #elif HWY_TARGET == HWY_AVX3_SPR
  270. #define HWY_NAMESPACE N_AVX3_SPR
  271. #define HWY_TARGET_STR HWY_TARGET_STR_AVX3_SPR
  272. #else
  // Unreachable unless the enclosing #elif and this list get out of sync.
  273. #error "Logic error"
  274. #endif // HWY_TARGET
  275. //-----------------------------------------------------------------------------
  276. // PPC8, PPC9, PPC10: shared settings (HWY_MAX_BYTES is 16, native FMA),
  // followed by the per-target namespace and target string.
  277. #elif HWY_TARGET_IS_PPC
  278. #define HWY_ALIGN alignas(16)
  279. #define HWY_MAX_BYTES 16
  280. #define HWY_LANES(T) (16 / sizeof(T))
  281. #define HWY_HAVE_SCALABLE 0
  282. #define HWY_HAVE_INTEGER64 1
  283. #define HWY_HAVE_FLOAT16 0
  284. #define HWY_HAVE_FLOAT64 1
  285. #define HWY_MEM_OPS_MIGHT_FAULT 1
  286. #define HWY_NATIVE_FMA 1
  287. #define HWY_NATIVE_DOT_BF16 0
  288. #define HWY_CAP_GE256 0
  289. #define HWY_CAP_GE512 0
  290. #if HWY_TARGET == HWY_PPC8
  291. #define HWY_NAMESPACE N_PPC8
  292. #define HWY_TARGET_STR HWY_TARGET_STR_PPC8
  293. #elif HWY_TARGET == HWY_PPC9
  294. #define HWY_NAMESPACE N_PPC9
  295. #define HWY_TARGET_STR HWY_TARGET_STR_PPC9
  296. #elif HWY_TARGET == HWY_PPC10
  297. #define HWY_NAMESPACE N_PPC10
  298. #define HWY_TARGET_STR HWY_TARGET_STR_PPC10
  299. #else
  // A target matched HWY_ALL_PPC but is not handled above.
  300. #error "Logic error"
  301. #endif // HWY_TARGET
  302. //-----------------------------------------------------------------------------
  303. // Z14, Z15: shared settings (HWY_MAX_BYTES is 16, native FMA), followed by
  // the per-target namespace and target string.
  304. #elif HWY_TARGET == HWY_Z14 || HWY_TARGET == HWY_Z15
  305. #define HWY_ALIGN alignas(16)
  306. #define HWY_MAX_BYTES 16
  307. #define HWY_LANES(T) (16 / sizeof(T))
  308. #define HWY_HAVE_SCALABLE 0
  309. #define HWY_HAVE_INTEGER64 1
  310. #define HWY_HAVE_FLOAT16 0
  311. #define HWY_HAVE_FLOAT64 1
  312. #define HWY_MEM_OPS_MIGHT_FAULT 1
  313. #define HWY_NATIVE_FMA 1
  314. #define HWY_NATIVE_DOT_BF16 0
  315. #define HWY_CAP_GE256 0
  316. #define HWY_CAP_GE512 0
  317. #if HWY_TARGET == HWY_Z14
  318. #define HWY_NAMESPACE N_Z14
  319. #define HWY_TARGET_STR HWY_TARGET_STR_Z14
  320. #elif HWY_TARGET == HWY_Z15
  321. #define HWY_NAMESPACE N_Z15
  322. #define HWY_TARGET_STR HWY_TARGET_STR_Z15
  323. #else
  // Unreachable unless the enclosing #elif and this list get out of sync.
  324. #error "Logic error"
  325. #endif // HWY_TARGET == HWY_Z15
  326. //-----------------------------------------------------------------------------
  327. // NEON (NEON_WITHOUT_AES, NEON, NEON_BF16): HWY_MAX_BYTES is 16; several
  // capabilities depend on compiler-provided feature macros and on whether the
  // architecture is AArch64.
  328. #elif HWY_TARGET_IS_NEON
  329. #define HWY_ALIGN alignas(16)
  330. #define HWY_MAX_BYTES 16
  331. #define HWY_LANES(T) (16 / sizeof(T))
  332. #define HWY_HAVE_SCALABLE 0
  333. #define HWY_HAVE_INTEGER64 1
  // float16: enabled if the compiler reports FP16 vector arithmetic, or always
  // for the NEON_BF16 target.
  334. #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || HWY_TARGET == HWY_NEON_BF16
  335. #define HWY_HAVE_FLOAT16 1
  336. #else
  337. #define HWY_HAVE_FLOAT16 0
  338. #endif
  // float64 is only enabled on AArch64.
  339. #if HWY_ARCH_ARM_A64
  340. #define HWY_HAVE_FLOAT64 1
  341. #else
  342. #define HWY_HAVE_FLOAT64 0
  343. #endif
  344. #define HWY_MEM_OPS_MIGHT_FAULT 1
  // FMA: enabled on AArch64, or if the compiler defines __ARM_FEATURE_FMA or
  // __ARM_VFPV4__.
  345. #if defined(__ARM_FEATURE_FMA) || defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
  346. #define HWY_NATIVE_FMA 1
  347. #else
  348. #define HWY_NATIVE_FMA 0
  349. #endif
  350. #if HWY_NEON_HAVE_F32_TO_BF16C || HWY_TARGET == HWY_NEON_BF16
  351. #define HWY_NATIVE_DOT_BF16 1
  352. #else
  353. #define HWY_NATIVE_DOT_BF16 0
  354. #endif
  355. #define HWY_CAP_GE256 0
  356. #define HWY_CAP_GE512 0
  357. #if HWY_TARGET == HWY_NEON_WITHOUT_AES
  358. #define HWY_NAMESPACE N_NEON_WITHOUT_AES
  359. #elif HWY_TARGET == HWY_NEON
  360. #define HWY_NAMESPACE N_NEON
  361. #elif HWY_TARGET == HWY_NEON_BF16
  362. #define HWY_NAMESPACE N_NEON_BF16
  363. #else
  364. #error "Logic error, missing case"
  365. #endif // HWY_TARGET
  366. // Can use pragmas instead of -march compiler flag
  367. #if HWY_HAVE_RUNTIME_DISPATCH
  368. #if HWY_ARCH_ARM_V7
  369. // __attribute__((target("+neon-vfpv4"))) was introduced in GCC 8.
  370. #if HWY_COMPILER_GCC_ACTUAL >= 800
  371. #define HWY_TARGET_STR "+neon-vfpv4"
  372. #else // GCC < 8 (or HWY_COMPILER_GCC_ACTUAL is 0)
  373. // Do not define HWY_TARGET_STR (no pragma).
  374. #endif // HWY_COMPILER_GCC_ACTUAL
  375. #else // !HWY_ARCH_ARM_V7
  376. #if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1300) || \
  377. (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1300)
  378. // GCC 12 or earlier and Clang 12 or earlier require +crypto be added to the
  379. // target string to enable AArch64 AES intrinsics
  380. #define HWY_TARGET_STR_NEON "+crypto"
  381. #else
  382. #define HWY_TARGET_STR_NEON "+aes"
  383. #endif
  384. // Clang >= 16 requires +fullfp16 instead of fp16, but Apple Clang 15 = 1600
  385. // fails to parse unless the string starts with armv8, whereas 1700 refuses it.
  386. #if HWY_COMPILER_CLANG >= 1700
  387. #define HWY_TARGET_STR_FP16 "+fullfp16"
  388. #elif HWY_COMPILER_CLANG >= 1600 && defined(__apple_build_version__)
  389. #define HWY_TARGET_STR_FP16 "armv8.4-a+fullfp16"
  390. #else
  391. #define HWY_TARGET_STR_FP16 "+fp16"
  392. #endif
  // Per-target string: none for NEON_WITHOUT_AES, the AES string for NEON, and
  // FP16 + "+bf16+dotprod" + the AES string for NEON_BF16.
  393. #if HWY_TARGET == HWY_NEON_WITHOUT_AES
  394. // Do not define HWY_TARGET_STR (no pragma).
  395. #elif HWY_TARGET == HWY_NEON
  396. #define HWY_TARGET_STR HWY_TARGET_STR_NEON
  397. #elif HWY_TARGET == HWY_NEON_BF16
  398. #define HWY_TARGET_STR HWY_TARGET_STR_FP16 "+bf16+dotprod" HWY_TARGET_STR_NEON
  399. #else
  400. #error "Logic error, missing case"
  401. #endif // HWY_TARGET
  402. #endif // !HWY_ARCH_ARM_V7
  403. #else // !HWY_HAVE_RUNTIME_DISPATCH
  404. // HWY_TARGET_STR remains undefined
  405. #endif
  406. //-----------------------------------------------------------------------------
  407. // SVE family (SVE2, SVE_256, SVE2_128, and any other target in HWY_ALL_SVE):
  // shared settings first, then per-target namespace, HWY_MAX_BYTES and
  // HWY_HAVE_SCALABLE.
  408. #elif HWY_TARGET_IS_SVE
  409. // SVE only requires lane alignment, not natural alignment of the entire vector.
  410. #define HWY_ALIGN alignas(8)
  411. // Value ensures MaxLanes() is the tightest possible upper bound to reduce
  412. // overallocation. HWY_MAX_BYTES is defined per target further down; this is
  // fine because it is only expanded when HWY_LANES is used.
  413. #define HWY_LANES(T) ((HWY_MAX_BYTES) / sizeof(T))
  414. #define HWY_HAVE_INTEGER64 1
  415. #define HWY_HAVE_FLOAT16 1
  416. #define HWY_HAVE_FLOAT64 1
  417. #define HWY_MEM_OPS_MIGHT_FAULT 0
  418. #define HWY_NATIVE_FMA 1
  419. #if HWY_SVE_HAVE_BF16_FEATURE
  420. #define HWY_NATIVE_DOT_BF16 1
  421. #else
  422. #define HWY_NATIVE_DOT_BF16 0
  423. #endif
  424. #define HWY_CAP_GE256 0
  425. #define HWY_CAP_GE512 0
  // The fixed-size targets (SVE_256: 32 bytes, SVE2_128: 16 bytes) are not
  // scalable; the others use a 256-byte upper bound and are scalable.
  426. #if HWY_TARGET == HWY_SVE2
  427. #define HWY_NAMESPACE N_SVE2
  428. #define HWY_MAX_BYTES 256
  429. #define HWY_HAVE_SCALABLE 1
  430. #elif HWY_TARGET == HWY_SVE_256
  431. #define HWY_NAMESPACE N_SVE_256
  432. #define HWY_MAX_BYTES 32
  433. #define HWY_HAVE_SCALABLE 0
  434. #elif HWY_TARGET == HWY_SVE2_128
  435. #define HWY_NAMESPACE N_SVE2_128
  436. #define HWY_MAX_BYTES 16
  437. #define HWY_HAVE_SCALABLE 0
  438. #else // any other target in HWY_ALL_SVE
  439. #define HWY_NAMESPACE N_SVE
  440. #define HWY_MAX_BYTES 256
  441. #define HWY_HAVE_SCALABLE 1
  442. #endif
  443. // Can use pragmas instead of -march compiler flag
  444. #if HWY_HAVE_RUNTIME_DISPATCH
  445. #if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
  446. // Static dispatch with -march=armv8-a+sve2+aes, or no baseline, hence dynamic
  447. // dispatch, which checks for AES support at runtime.
  448. #if defined(__ARM_FEATURE_SVE2_AES) || (HWY_BASELINE_SVE2 == 0)
  449. #define HWY_TARGET_STR "+sve2+sve2-aes,+sve"
  450. #else // SVE2 without AES
  451. #define HWY_TARGET_STR "+sve2,+sve"
  452. #endif
  453. #else // not SVE2 target
  454. #define HWY_TARGET_STR "+sve"
  455. #endif
  456. #else // !HWY_HAVE_RUNTIME_DISPATCH
  457. // HWY_TARGET_STR remains undefined
  458. #endif
  459. //-----------------------------------------------------------------------------
  460. // WASM: HWY_MAX_BYTES is 16; no native FMA, BF16 dot product or float16;
  // HWY_TARGET_STR is "simd128".
  461. #elif HWY_TARGET == HWY_WASM
  462. #define HWY_ALIGN alignas(16)
  463. #define HWY_MAX_BYTES 16
  464. #define HWY_LANES(T) (16 / sizeof(T))
  465. #define HWY_HAVE_SCALABLE 0
  466. #define HWY_HAVE_INTEGER64 1
  467. #define HWY_HAVE_FLOAT16 0
  468. #define HWY_HAVE_FLOAT64 1
  469. #define HWY_MEM_OPS_MIGHT_FAULT 1
  470. #define HWY_NATIVE_FMA 0
  471. #define HWY_NATIVE_DOT_BF16 0
  472. #define HWY_CAP_GE256 0
  473. #define HWY_CAP_GE512 0
  474. #define HWY_NAMESPACE N_WASM
  475. #define HWY_TARGET_STR "simd128"
  476. //-----------------------------------------------------------------------------
  477. // WASM_EMU256: HWY_MAX_BYTES is 32 and HWY_CAP_GE256 is 1, but the target
  // string is the same "simd128" as for WASM. Unlike WASM, HWY_HAVE_FLOAT64 is 0.
  478. #elif HWY_TARGET == HWY_WASM_EMU256
  479. #define HWY_ALIGN alignas(32)
  480. #define HWY_MAX_BYTES 32
  481. #define HWY_LANES(T) (32 / sizeof(T))
  482. #define HWY_HAVE_SCALABLE 0
  483. #define HWY_HAVE_INTEGER64 1
  484. #define HWY_HAVE_FLOAT16 0
  485. #define HWY_HAVE_FLOAT64 0
  486. #define HWY_MEM_OPS_MIGHT_FAULT 1
  487. #define HWY_NATIVE_FMA 0
  488. #define HWY_NATIVE_DOT_BF16 0
  489. #define HWY_CAP_GE256 1
  490. #define HWY_CAP_GE512 0
  491. #define HWY_NAMESPACE N_WASM_EMU256
  492. #define HWY_TARGET_STR "simd128"
  493. //-----------------------------------------------------------------------------
  494. // RVV: scalable target; no alignment specifier and no HWY_TARGET_STR.
  495. #elif HWY_TARGET == HWY_RVV
  496. // RVV only requires lane alignment, not natural alignment of the entire vector,
  497. // and the compiler already aligns builtin types, so nothing to do here.
  498. #define HWY_ALIGN
  499. // The spec requires VLEN <= 2^16 bits, so the limit is 2^16 bytes (LMUL=8).
  500. #define HWY_MAX_BYTES 65536
  501. // = HWY_MAX_BYTES divided by max LMUL=8 because MaxLanes includes the actual
  502. // LMUL. This is the tightest possible upper bound.
  503. #define HWY_LANES(T) (8192 / sizeof(T))
  504. #define HWY_HAVE_SCALABLE 1
  505. #define HWY_HAVE_INTEGER64 1
  506. #define HWY_HAVE_FLOAT64 1
  507. #define HWY_MEM_OPS_MIGHT_FAULT 0
  508. #define HWY_NATIVE_FMA 1
  509. #define HWY_NATIVE_DOT_BF16 0
  510. #define HWY_CAP_GE256 0
  511. #define HWY_CAP_GE512 0
  // float16 follows HWY_RVV_HAVE_F16_VEC, which is defined outside this header.
  512. #if HWY_RVV_HAVE_F16_VEC
  513. #define HWY_HAVE_FLOAT16 1
  514. #else
  515. #define HWY_HAVE_FLOAT16 0
  516. #endif
  517. #define HWY_NAMESPACE N_RVV
  518. // HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
  519. // (rv64gcv is not a valid target)
  520. //-----------------------------------------------------------------------------
  521. // EMU128: HWY_MAX_BYTES is 16; no native FMA, BF16 dot product or float16;
  // no target string.
  522. #elif HWY_TARGET == HWY_EMU128
  523. #define HWY_ALIGN alignas(16)
  524. #define HWY_MAX_BYTES 16
  525. #define HWY_LANES(T) (16 / sizeof(T))
  526. #define HWY_HAVE_SCALABLE 0
  527. #define HWY_HAVE_INTEGER64 1
  528. #define HWY_HAVE_FLOAT16 0
  529. #define HWY_HAVE_FLOAT64 1
  530. #define HWY_MEM_OPS_MIGHT_FAULT 1
  531. #define HWY_NATIVE_FMA 0
  532. #define HWY_NATIVE_DOT_BF16 0
  533. #define HWY_CAP_GE256 0
  534. #define HWY_CAP_GE512 0
  535. #define HWY_NAMESPACE N_EMU128
  536. // HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
  537. //-----------------------------------------------------------------------------
  538. // SCALAR: a single lane per vector (HWY_LANES is always 1), HWY_MAX_BYTES is
  // 8, no alignment specifier and no target string.
  539. #elif HWY_TARGET == HWY_SCALAR
  540. #define HWY_ALIGN
  541. #define HWY_MAX_BYTES 8
  542. #define HWY_LANES(T) 1
  543. #define HWY_HAVE_SCALABLE 0
  544. #define HWY_HAVE_INTEGER64 1
  545. #define HWY_HAVE_FLOAT16 0
  546. #define HWY_HAVE_FLOAT64 1
  547. #define HWY_MEM_OPS_MIGHT_FAULT 0
  548. #define HWY_NATIVE_FMA 0
  549. #define HWY_NATIVE_DOT_BF16 0
  550. #define HWY_CAP_GE256 0
  551. #define HWY_CAP_GE512 0
  552. #define HWY_NAMESPACE N_SCALAR
  553. // HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
  554. #else
  // Unknown HWY_TARGET: only a compile-time message is emitted (not an error),
  // and none of the macros set by the branches above are defined.
  555. #pragma message("HWY_TARGET does not match any known target")
  556. #endif // HWY_TARGET
  557. //-----------------------------------------------------------------------------
  558. // Sanity check: if we have f16 vector support, then base.h should also be
  559. // using a built-in type for f16 scalars. Otherwise compilation stops here.
  560. #if HWY_HAVE_FLOAT16 && !HWY_HAVE_SCALAR_F16_TYPE
  561. #error "Logic error: f16 vectors but no scalars"
  562. #endif
  563. // Override HWY_MEM_OPS_MIGHT_FAULT to 1 in asan/msan builds, which will still
  // fault, regardless of the per-target value chosen above.
  564. #if HWY_IS_ASAN || HWY_IS_MSAN
  565. #undef HWY_MEM_OPS_MIGHT_FAULT
  566. #define HWY_MEM_OPS_MIGHT_FAULT 1
  567. #endif
  568. // Clang <9 requires this be invoked at file scope, before any namespace.
  // With a target string, expands to HWY_PUSH_ATTRIBUTES(HWY_TARGET_STR). In
  // both variants the trailing static_assert(true, ...) exists only so that the
  // invocation must be followed by a semicolon.
  569. #undef HWY_BEFORE_NAMESPACE
  570. #if defined(HWY_TARGET_STR)
  571. #define HWY_BEFORE_NAMESPACE() \
  572. HWY_PUSH_ATTRIBUTES(HWY_TARGET_STR) \
  573. static_assert(true, "For requiring trailing semicolon")
  574. #else
  575. // avoids compiler warning if no HWY_TARGET_STR
  576. #define HWY_BEFORE_NAMESPACE() \
  577. static_assert(true, "For requiring trailing semicolon")
  578. #endif
  579. // Clang <9 requires any namespaces be closed before this macro.
  // With a target string, expands to HWY_POP_ATTRIBUTES. In both variants the
  // trailing static_assert(true, ...) exists only so that the invocation must
  // be followed by a semicolon.
  580. #undef HWY_AFTER_NAMESPACE
  581. #if defined(HWY_TARGET_STR)
  582. #define HWY_AFTER_NAMESPACE() \
  583. HWY_POP_ATTRIBUTES \
  584. static_assert(true, "For requiring trailing semicolon")
  585. #else
  586. // avoids compiler warning if no HWY_TARGET_STR
  587. #define HWY_AFTER_NAMESPACE() \
  588. static_assert(true, "For requiring trailing semicolon")
  589. #endif
  // HWY_ATTR expands to __attribute__((target(HWY_TARGET_STR))) if a target
  // string is defined and HWY_HAS_ATTRIBUTE(target) is true; otherwise it
  // expands to nothing.
  590. #undef HWY_ATTR
  591. #if defined(HWY_TARGET_STR) && HWY_HAS_ATTRIBUTE(target)
  592. #define HWY_ATTR __attribute__((target(HWY_TARGET_STR)))
  593. #else
  594. #define HWY_ATTR
  595. #endif