aboutsummaryrefslogtreecommitdiff
path: root/cpu.c
diff options
context:
space:
mode:
authorAnton Khirnov <anton@khirnov.net>2017-11-16 13:11:07 +0100
committerAnton Khirnov <anton@khirnov.net>2017-11-19 16:30:14 +0100
commit0007a0b0c11fa7c12b228883453368f105a4324b (patch)
treea5ac8016c58c13668bf87931dd921bea933cf07f /cpu.c
Initial commit.
The following code is present:
* the basis API
* the BiCGSTAB solver
* the pseudospectral linear system solver
* helper APIs:
  - threadpool
  - logging
  - cpuid
Diffstat (limited to 'cpu.c')
-rw-r--r--cpu.c220
1 files changed, 220 insertions, 0 deletions
diff --git a/cpu.c b/cpu.c
new file mode 100644
index 0000000..173a025
--- /dev/null
+++ b/cpu.c
@@ -0,0 +1,220 @@
+/*
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#if HAVE_SCHED_GETAFFINITY
+#define _GNU_SOURCE
+#include <sched.h>
+#endif
+#if HAVE_GETPROCESSAFFINITYMASK
+#include <windows.h>
+#endif
+#if HAVE_SYSCTL
+#if HAVE_SYS_PARAM_H
+#include <sys/param.h>
+#endif
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+#if HAVE_SYSCONF
+#include <unistd.h>
+#endif
+
+#include <string.h>
+
+#include "cpu.h"
+
+#if ARCH_X86
+/**
+ * Detect the instruction-set extensions of the x86 CPU this code runs on.
+ *
+ * Standard CPUID leaves 0, 1 and 7 and extended leaves 0x80000000 and
+ * 0x80000001 are queried through tdi_cpu_cpuid(). tdi_cpu_xgetbv() is used to
+ * verify that the OS enabled the state needed by AVX-encoded instructions.
+ * Vendor- and model-specific adjustments (the *SLOW and ATOM flags) are
+ * applied on top of the raw feature bits.
+ *
+ * @return a bitmask of TDI_CPU_FLAG_* values
+ */
+static int get_cpu_flags_x86(void)
+{
+    int rval = 0;
+
+    int eax, ebx, ecx, edx;
+    int max_std_level, max_ext_level, std_caps = 0, ext_caps = 0;
+    int family = 0, model = 0;
+    union { int i[3]; char c[12]; } vendor;
+
+    /* Leaf 0: highest standard leaf plus the vendor string. The string is
+     * laid out as EBX, EDX, ECX, hence the swapped last two outputs. */
+    tdi_cpu_cpuid(0, &max_std_level, &vendor.i[0], &vendor.i[2], &vendor.i[1]);
+
+    if (max_std_level >= 1) {
+        tdi_cpu_cpuid(1, &eax, &ebx, &ecx, &std_caps);
+        /* Base family/model combined with their extended fields. */
+        family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+        model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
+        if (std_caps & (1 << 15))
+            rval |= TDI_CPU_FLAG_CMOV;
+        if (std_caps & (1 << 23))
+            rval |= TDI_CPU_FLAG_MMX;
+        /* The SSE bit also implies the MMXEXT integer extensions. */
+        if (std_caps & (1 << 25))
+            rval |= TDI_CPU_FLAG_MMXEXT;
+#if HAVE_SSE
+        if (std_caps & (1 << 25))
+            rval |= TDI_CPU_FLAG_SSE;
+        if (std_caps & (1 << 26))
+            rval |= TDI_CPU_FLAG_SSE2;
+        if (ecx & 1)
+            rval |= TDI_CPU_FLAG_SSE3;
+        if (ecx & 0x00000200)
+            rval |= TDI_CPU_FLAG_SSSE3;
+        if (ecx & 0x00080000)
+            rval |= TDI_CPU_FLAG_SSE4;
+        if (ecx & 0x00100000)
+            rval |= TDI_CPU_FLAG_SSE42;
+#if HAVE_AVX
+        /* Check OXSAVE and AVX bits */
+        if ((ecx & 0x18000000) == 0x18000000) {
+            /* Check for OS support */
+            tdi_cpu_xgetbv(0, &eax, &edx);
+            if ((eax & 0x6) == 0x6) {
+                rval |= TDI_CPU_FLAG_AVX;
+                /* ecx still holds the leaf 1 feature bits here. */
+                if (ecx & 0x00001000)
+                    rval |= TDI_CPU_FLAG_FMA3;
+            }
+        }
+#endif /* HAVE_AVX */
+#endif /* HAVE_SSE */
+    }
+    if (max_std_level >= 7) {
+        tdi_cpu_cpuid(7, &eax, &ebx, &ecx, &edx);
+#if HAVE_AVX2
+        /* AVX2 operates on YMM registers, so it is usable only if the AVX
+         * check above confirmed OS support; the CPUID bit alone is not
+         * sufficient. */
+        if ((rval & TDI_CPU_FLAG_AVX) && (ebx & 0x00000020))
+            rval |= TDI_CPU_FLAG_AVX2;
+#endif /* HAVE_AVX2 */
+        /* BMI1/2 don't need OS support */
+        if (ebx & 0x00000008) {
+            rval |= TDI_CPU_FLAG_BMI1;
+            if (ebx & 0x00000100)
+                rval |= TDI_CPU_FLAG_BMI2;
+        }
+    }
+
+    /* Extended leaf 0x80000000 reports the highest extended leaf. */
+    tdi_cpu_cpuid(0x80000000, &max_ext_level, &ebx, &ecx, &edx);
+
+    if (max_ext_level >= 0x80000001) {
+        tdi_cpu_cpuid(0x80000001, &eax, &ebx, &ecx, &ext_caps);
+        if (ext_caps & (1U << 31))
+            rval |= TDI_CPU_FLAG_3DNOW;
+        if (ext_caps & (1 << 30))
+            rval |= TDI_CPU_FLAG_3DNOWEXT;
+        if (ext_caps & (1 << 23))
+            rval |= TDI_CPU_FLAG_MMX;
+        if (ext_caps & (1 << 22))
+            rval |= TDI_CPU_FLAG_MMXEXT;
+
+        if (!strncmp(vendor.c, "AuthenticAMD", 12)) {
+            /* Allow for selectively disabling SSE2 functions on AMD processors
+               with SSE2 support but not SSE4a. This includes Athlon64, some
+               Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are faster
+               than SSE2 often enough to utilize this special-case flag.
+               TDI_CPU_FLAG_SSE2 and TDI_CPU_FLAG_SSE2SLOW are both set in this case
+               so that SSE2 is used unless explicitly disabled by checking
+               TDI_CPU_FLAG_SSE2SLOW. */
+            if (rval & TDI_CPU_FLAG_SSE2 && !(ecx & 0x00000040))
+                rval |= TDI_CPU_FLAG_SSE2SLOW;
+
+            /* Similar to the above but for AVX functions on AMD processors.
+               This is necessary only for functions using YMM registers on Bulldozer
+               based CPUs as they lack 256-bit execution units. SSE/AVX functions
+               using XMM registers are always faster on them.
+               TDI_CPU_FLAG_AVX and TDI_CPU_FLAG_AVXSLOW are both set so that AVX is
+               used unless explicitly disabled by checking TDI_CPU_FLAG_AVXSLOW.
+               TODO: Confirm if Excavator is affected or not by this once it's
+               released, and update the check if necessary. Same for btver2. */
+            if (family == 0x15 && (rval & TDI_CPU_FLAG_AVX))
+                rval |= TDI_CPU_FLAG_AVXSLOW;
+        }
+
+        /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
+         * used unless the OS has AVX support. */
+        if (rval & TDI_CPU_FLAG_AVX) {
+            if (ecx & 0x00000800)
+                rval |= TDI_CPU_FLAG_XOP;
+            if (ecx & 0x00010000)
+                rval |= TDI_CPU_FLAG_FMA4;
+        }
+    }
+
+    if (!strncmp(vendor.c, "GenuineIntel", 12)) {
+        if (family == 6 && (model == 9 || model == 13 || model == 14)) {
+            /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and
+             * 6/14 (core1 "yonah") theoretically support sse2, but it's
+             * usually slower than mmx, so let's just pretend they don't.
+             * TDI_CPU_FLAG_SSE2 is disabled and TDI_CPU_FLAG_SSE2SLOW is
+             * enabled so that SSE2 is not used unless explicitly enabled
+             * by checking TDI_CPU_FLAG_SSE2SLOW. The same situation
+             * applies for TDI_CPU_FLAG_SSE3 and TDI_CPU_FLAG_SSE3SLOW. */
+            if (rval & TDI_CPU_FLAG_SSE2)
+                rval ^= TDI_CPU_FLAG_SSE2SLOW | TDI_CPU_FLAG_SSE2;
+            if (rval & TDI_CPU_FLAG_SSE3)
+                rval ^= TDI_CPU_FLAG_SSE3SLOW | TDI_CPU_FLAG_SSE3;
+        }
+        /* The Atom processor supports SSSE3, which is useful in many cases.
+         * However, some SSSE3 functions are slower than their SSE2
+         * equivalents on the Atom while being faster on other processors
+         * supporting SSSE3. This flag allows for selectively disabling
+         * such SSSE3 functions on the Atom. */
+        if (family == 6 && model == 28)
+            rval |= TDI_CPU_FLAG_ATOM;
+
+        /* Conroe has a slow shuffle unit. Check the model number to ensure not
+         * to include crippled low-end Penryns and Nehalems that lack SSE4. */
+        if ((rval & TDI_CPU_FLAG_SSSE3) && !(rval & TDI_CPU_FLAG_SSE4) &&
+            family == 6 && model < 23)
+            rval |= TDI_CPU_FLAG_SSSE3SLOW;
+    }
+
+    return rval;
+}
+#endif
+
+/**
+ * Detect the capabilities of the CPU this code is running on.
+ *
+ * @return the flags reported by get_cpu_flags_x86() when built with
+ *         ARCH_X86, 0 otherwise
+ */
+int tdi_init_cpu_flags(void)
+{
+#if ARCH_X86
+    return get_cpu_flags_x86();
+#else
+    /* No detection code exists for other architectures. */
+    return 0;
+#endif
+}
+
+/**
+ * Get the number of CPUs available to the calling process.
+ *
+ * The first detection method enabled by the build configuration is used:
+ * sched_getaffinity(), GetProcessAffinityMask(), sysctl(HW_NCPU) or
+ * sysconf().
+ *
+ * @return the detected CPU count. 1 is returned when no detection method is
+ *         compiled in or when the affinity/sysconf query fails; the sysctl()
+ *         path returns 0 on failure.
+ */
+unsigned int tdi_cpu_count(void)
+{
+    unsigned int nb_cpus = 1;
+#if HAVE_SCHED_GETAFFINITY && defined(CPU_COUNT)
+    cpu_set_t cpuset;
+
+    CPU_ZERO(&cpuset);
+
+    if (!sched_getaffinity(0, sizeof(cpuset), &cpuset))
+        nb_cpus = CPU_COUNT(&cpuset);
+#elif HAVE_GETPROCESSAFFINITYMASK
+    DWORD_PTR proc_aff, sys_aff;
+
+    if (GetProcessAffinityMask(GetCurrentProcess(), &proc_aff, &sys_aff)) {
+        /* Count the bits set in the process affinity mask; every iteration
+         * clears the lowest set bit. */
+        for (nb_cpus = 0; proc_aff; proc_aff &= proc_aff - 1)
+            nb_cpus++;
+    }
+#elif HAVE_SYSCTL && defined(HW_NCPU)
+    int mib[2] = { CTL_HW, HW_NCPU };
+    size_t len = sizeof(nb_cpus);
+
+    if (sysctl(mib, 2, &nb_cpus, &len, NULL, 0) == -1)
+        nb_cpus = 0;
+#elif HAVE_SYSCONF && defined(_SC_NPROC_ONLN)
+    const long online = sysconf(_SC_NPROC_ONLN);
+
+    /* sysconf() returns -1 on error; never let that wrap around to a huge
+     * unsigned count. */
+    if (online > 0)
+        nb_cpus = online;
+#elif HAVE_SYSCONF && defined(_SC_NPROCESSORS_ONLN)
+    const long online = sysconf(_SC_NPROCESSORS_ONLN);
+
+    /* sysconf() returns -1 on error; never let that wrap around to a huge
+     * unsigned count. */
+    if (online > 0)
+        nb_cpus = online;
+#endif
+
+    return nb_cpus;
+}