aboutsummaryrefslogtreecommitdiff
path: root/configure.ac
diff options
context:
space:
mode:
authorNathan Bossart <nathan@postgresql.org>2024-04-06 21:56:23 -0500
committerNathan Bossart <nathan@postgresql.org>2024-04-06 21:56:23 -0500
commit792752af4eb5cf7b5b8b0470dbf22901c5178fe5 (patch)
tree2090baf57be8c2bf773386571587aa6511d8cb27 /configure.ac
parent158f5819236806b7c9cab323658c231e9371c458 (diff)
downloadpostgresql-792752af4eb5cf7b5b8b0470dbf22901c5178fe5.tar.gz
postgresql-792752af4eb5cf7b5b8b0470dbf22901c5178fe5.zip
Optimize pg_popcount() with AVX-512 instructions.
Presently, pg_popcount() processes data in 32-bit or 64-bit chunks when possible. Newer hardware that supports AVX-512 instructions can use 512-bit chunks, which provides a nice speedup, especially for larger buffers. This commit introduces the infrastructure required to detect compiler and CPU support for the required AVX-512 intrinsic functions, and it adds a new pg_popcount() implementation that uses these functions. If CPU support for this optimized implementation is detected at runtime, a function pointer is updated so that it is used by subsequent calls to pg_popcount(). Most of the existing in-tree calls to pg_popcount() should benefit from these instructions, and calls with smaller buffers should at least not regress compared to v16. The new infrastructure introduced by this commit can also be used to optimize visibilitymap_count(), but that is left for a follow-up commit. Co-authored-by: Paul Amonson, Ants Aasma Reviewed-by: Matthias van de Meent, Tom Lane, Noah Misch, Akash Shankaran, Alvaro Herrera, Andres Freund, David Rowley Discussion: https://postgr.es/m/BL1PR11MB5304097DF7EA81D04C33F3D1DCA6A%40BL1PR11MB5304.namprd11.prod.outlook.com
Diffstat (limited to 'configure.ac')
-rw-r--r--configure.ac51
1 files changed, 51 insertions, 0 deletions
diff --git a/configure.ac b/configure.ac
index 57f734879e1..67e738d92b1 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2052,6 +2052,17 @@ if test x"$pgac_cv__get_cpuid" = x"yes"; then
AC_DEFINE(HAVE__GET_CPUID, 1, [Define to 1 if you have __get_cpuid.])
fi
+AC_CACHE_CHECK([for __get_cpuid_count], [pgac_cv__get_cpuid_count],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <cpuid.h>],
+ [[unsigned int exx[4] = {0, 0, 0, 0};
+ __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+ ]])],
+ [pgac_cv__get_cpuid_count="yes"],
+ [pgac_cv__get_cpuid_count="no"])])
+if test x"$pgac_cv__get_cpuid_count" = x"yes"; then
+ AC_DEFINE(HAVE__GET_CPUID_COUNT, 1, [Define to 1 if you have __get_cpuid_count.])
+fi
+
AC_CACHE_CHECK([for __cpuid], [pgac_cv__cpuid],
[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <intrin.h>],
[[unsigned int exx[4] = {0, 0, 0, 0};
@@ -2063,6 +2074,46 @@ if test x"$pgac_cv__cpuid" = x"yes"; then
AC_DEFINE(HAVE__CPUID, 1, [Define to 1 if you have __cpuid.])
fi
+AC_CACHE_CHECK([for __cpuidex], [pgac_cv__cpuidex],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <intrin.h>],
+ [[unsigned int exx[4] = {0, 0, 0, 0};
+ __get_cpuidex(exx[0], 7, 0);
+ ]])],
+ [pgac_cv__cpuidex="yes"],
+ [pgac_cv__cpuidex="no"])])
+if test x"$pgac_cv__cpuidex" = x"yes"; then
+ AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.])
+fi
+
+# Check for XSAVE intrinsics
+#
+CFLAGS_XSAVE=""
+PGAC_XSAVE_INTRINSICS([])
+if test x"$pgac_xsave_intrinsics" != x"yes"; then
+ PGAC_XSAVE_INTRINSICS([-mxsave])
+fi
+if test x"$pgac_xsave_intrinsics" = x"yes"; then
+ AC_DEFINE(HAVE_XSAVE_INTRINSICS, 1, [Define to 1 if you have XSAVE intrinsics.])
+fi
+AC_SUBST(CFLAGS_XSAVE)
+
+# Check for AVX-512 popcount intrinsics
+#
+CFLAGS_POPCNT=""
+PG_POPCNT_OBJS=""
+if test x"$host_cpu" = x"x86_64"; then
+ PGAC_AVX512_POPCNT_INTRINSICS([])
+ if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then
+ PGAC_AVX512_POPCNT_INTRINSICS([-mavx512vpopcntdq -mavx512bw])
+ fi
+ if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then
+ PG_POPCNT_OBJS="pg_popcount_avx512.o pg_popcount_avx512_choose.o"
+ AC_DEFINE(USE_AVX512_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX-512 popcount instructions with a runtime check.])
+ fi
+fi
+AC_SUBST(CFLAGS_POPCNT)
+AC_SUBST(PG_POPCNT_OBJS)
+
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
#
# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used