From 0007a0b0c11fa7c12b228883453368f105a4324b Mon Sep 17 00:00:00 2001
From: Anton Khirnov
Date: Thu, 16 Nov 2017 13:11:07 +0100
Subject: Initial commit.

The following code is present:
* the basis API
* the BiCGSTAB solver
* the pseudospectral linear system solver
* helper APIs:
  - threadpool
  - logging
  - cpuid
---
 cpu.c | 220 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 220 insertions(+)
 create mode 100644 cpu.c

diff --git a/cpu.c b/cpu.c
new file mode 100644
index 0000000..173a025
--- /dev/null
+++ b/cpu.c
@@ -0,0 +1,220 @@
+/*
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#if HAVE_SCHED_GETAFFINITY
+#define _GNU_SOURCE
+#include <sched.h>
+#endif
+#if HAVE_GETPROCESSAFFINITYMASK
+#include <windows.h>
+#endif
+#if HAVE_SYSCTL
+#if HAVE_SYS_PARAM_H
+#include <sys/param.h>
+#endif
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+#if HAVE_SYSCONF
+#include <unistd.h>
+#endif
+
+#include <string.h>
+
+#include "cpu.h"
+
+#if ARCH_X86
+static int get_cpu_flags_x86(void)
+{
+    int rval = 0;
+
+    int eax, ebx, ecx, edx;
+    int max_std_level, max_ext_level, std_caps = 0, ext_caps = 0;
+    int family = 0, model = 0;
+    union { int i[3]; char c[12]; } vendor;
+
+    tdi_cpu_cpuid(0, &max_std_level, &vendor.i[0], &vendor.i[2], &vendor.i[1]);
+
+    if (max_std_level >= 1) {
+        tdi_cpu_cpuid(1, &eax, &ebx, &ecx, &std_caps);
+        family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+        model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
+        if (std_caps & (1 << 15))
+            rval |= TDI_CPU_FLAG_CMOV;
+        if (std_caps & (1 << 23))
+            rval |= TDI_CPU_FLAG_MMX;
+        if (std_caps & (1 << 25))
+            rval |= TDI_CPU_FLAG_MMXEXT;
+#if HAVE_SSE
+        if (std_caps & (1 << 25))
+            rval |= TDI_CPU_FLAG_SSE;
+        if (std_caps & (1 << 26))
+            rval |= TDI_CPU_FLAG_SSE2;
+        if (ecx & 1)
+            rval |= TDI_CPU_FLAG_SSE3;
+        if (ecx & 0x00000200)
+            rval |= TDI_CPU_FLAG_SSSE3;
+        if (ecx & 0x00080000)
+            rval |= TDI_CPU_FLAG_SSE4;
+        if (ecx & 0x00100000)
+            rval |= TDI_CPU_FLAG_SSE42;
+#if HAVE_AVX
+        /* Check OSXSAVE and AVX bits */
+        if ((ecx & 0x18000000) == 0x18000000) {
+            /* Check for OS support */
+            tdi_cpu_xgetbv(0, &eax, &edx);
+            if ((eax & 0x6) == 0x6) {
+                rval |= TDI_CPU_FLAG_AVX;
+                if (ecx & 0x00001000)
+                    rval |= TDI_CPU_FLAG_FMA3;
+            }
+        }
+#endif /* HAVE_AVX */
+#endif /* HAVE_SSE */
+    }
+    if (max_std_level >= 7) {
+        tdi_cpu_cpuid(7, &eax, &ebx, &ecx, &edx);
+#if HAVE_AVX2
+        if (ebx & 0x00000020)
+            rval |= TDI_CPU_FLAG_AVX2;
+#endif /* HAVE_AVX2 */
+        /* BMI1/2 don't need OS support */
+        if (ebx & 0x00000008) {
+            rval |= TDI_CPU_FLAG_BMI1;
+            if (ebx & 0x00000100)
+                rval |= TDI_CPU_FLAG_BMI2;
+        }
+    }
+
+    tdi_cpu_cpuid(0x80000000, &max_ext_level, &ebx, &ecx, &edx);
+
+    if (max_ext_level >= 0x80000001) {
+        tdi_cpu_cpuid(0x80000001, &eax, &ebx, &ecx, &ext_caps);
+        if (ext_caps & (1U << 31))
+            rval |= TDI_CPU_FLAG_3DNOW;
+        if (ext_caps & (1 << 30))
+            rval |= TDI_CPU_FLAG_3DNOWEXT;
+        if (ext_caps & (1 << 23))
+            rval |= TDI_CPU_FLAG_MMX;
+        if (ext_caps & (1 << 22))
+            rval |= TDI_CPU_FLAG_MMXEXT;
+
+        if (!strncmp(vendor.c, "AuthenticAMD", 12)) {
+            /* Allow for selectively disabling SSE2 functions on AMD processors
+               with SSE2 support but not SSE4a. This includes Athlon64, some
+               Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are faster
+               than SSE2 often enough to utilize this special-case flag.
+               TDI_CPU_FLAG_SSE2 and TDI_CPU_FLAG_SSE2SLOW are both set in this case
+               so that SSE2 is used unless explicitly disabled by checking
+               TDI_CPU_FLAG_SSE2SLOW. */
+            if (rval & TDI_CPU_FLAG_SSE2 && !(ecx & 0x00000040))
+                rval |= TDI_CPU_FLAG_SSE2SLOW;
+
+            /* Similar to the above, but for AVX functions on AMD processors.
+               This is necessary only for functions using YMM registers on Bulldozer-
+               based CPUs, as they lack 256-bit execution units. SSE/AVX functions
+               using XMM registers are always faster on them.
+               TDI_CPU_FLAG_AVX and TDI_CPU_FLAG_AVXSLOW are both set so that AVX is
+               used unless explicitly disabled by checking TDI_CPU_FLAG_AVXSLOW.
+               TODO: Confirm whether Excavator is affected by this once it is
+               released, and update the check if necessary. Same for btver2. */
+            if (family == 0x15 && (rval & TDI_CPU_FLAG_AVX))
+                rval |= TDI_CPU_FLAG_AVXSLOW;
+        }
+
+        /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
+         * used unless the OS has AVX support. */
+        if (rval & TDI_CPU_FLAG_AVX) {
+            if (ecx & 0x00000800)
+                rval |= TDI_CPU_FLAG_XOP;
+            if (ecx & 0x00010000)
+                rval |= TDI_CPU_FLAG_FMA4;
+        }
+    }
+
+    if (!strncmp(vendor.c, "GenuineIntel", 12)) {
+        if (family == 6 && (model == 9 || model == 13 || model == 14)) {
+            /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and
+             * 6/14 (core1 "yonah") theoretically support sse2, but it's
+             * usually slower than mmx, so let's just pretend they don't.
+             * TDI_CPU_FLAG_SSE2 is disabled and TDI_CPU_FLAG_SSE2SLOW is
+             * enabled so that SSE2 is not used unless explicitly enabled
+             * by checking TDI_CPU_FLAG_SSE2SLOW. The same situation
+             * applies for TDI_CPU_FLAG_SSE3 and TDI_CPU_FLAG_SSE3SLOW. */
+            if (rval & TDI_CPU_FLAG_SSE2)
+                rval ^= TDI_CPU_FLAG_SSE2SLOW | TDI_CPU_FLAG_SSE2;
+            if (rval & TDI_CPU_FLAG_SSE3)
+                rval ^= TDI_CPU_FLAG_SSE3SLOW | TDI_CPU_FLAG_SSE3;
+        }
+        /* The Atom processor has SSSE3 support, which is useful in many cases,
+         * but sometimes the SSSE3 version is slower than the SSE2 equivalent
+         * on the Atom, while it is generally faster on other processors that
+         * support SSSE3. This flag allows for selectively disabling certain
+         * SSSE3 functions on the Atom. */
+        if (family == 6 && model == 28)
+            rval |= TDI_CPU_FLAG_ATOM;
+
+        /* Conroe has a slow shuffle unit. Check the model number to avoid
+         * including crippled low-end Penryns and Nehalems that lack SSE4. */
+        if ((rval & TDI_CPU_FLAG_SSSE3) && !(rval & TDI_CPU_FLAG_SSE4) &&
+            family == 6 && model < 23)
+            rval |= TDI_CPU_FLAG_SSSE3SLOW;
+    }
+
+    return rval;
+}
+#endif
+
+int tdi_init_cpu_flags(void)
+{
+    int flags = 0;
+
+#if ARCH_X86
+    flags = get_cpu_flags_x86();
+#endif
+
+    return flags;
+}
+
+unsigned int tdi_cpu_count(void)
+{
+    unsigned int nb_cpus = 1;
+#if HAVE_SCHED_GETAFFINITY && defined(CPU_COUNT)
+    cpu_set_t cpuset;
+
+    CPU_ZERO(&cpuset);
+
+    if (!sched_getaffinity(0, sizeof(cpuset), &cpuset))
+        nb_cpus = CPU_COUNT(&cpuset);
+#elif HAVE_GETPROCESSAFFINITYMASK
+    DWORD_PTR proc_aff, sys_aff;
+    if (GetProcessAffinityMask(GetCurrentProcess(), &proc_aff, &sys_aff))
+        nb_cpus = av_popcount64(proc_aff);
+#elif HAVE_SYSCTL && defined(HW_NCPU)
+    int mib[2] = { CTL_HW, HW_NCPU };
+    size_t len = sizeof(nb_cpus);
+
+    if (sysctl(mib, 2, &nb_cpus, &len, NULL, 0) == -1)
+        nb_cpus = 0;
+#elif HAVE_SYSCONF && defined(_SC_NPROC_ONLN)
+    nb_cpus = sysconf(_SC_NPROC_ONLN);
+#elif HAVE_SYSCONF && defined(_SC_NPROCESSORS_ONLN)
+    nb_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+#endif
+
+    return nb_cpus;
+}
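
For reference, a minimal usage sketch of the two public entry points added by this
commit. It assumes that cpu.h (included by cpu.c above) declares tdi_init_cpu_flags(),
tdi_cpu_count() and the TDI_CPU_FLAG_* constants; the main() wrapper and the kernel
selection shown here are purely illustrative and are not part of the commit.

    /* usage sketch, not part of the patch above */
    #include <stdio.h>

    #include "cpu.h"

    int main(void)
    {
        int          flags = tdi_init_cpu_flags();  /* detected instruction-set flags */
        unsigned int ncpus = tdi_cpu_count();       /* CPUs available to this process */

        /* tdi_cpu_count() may report 0 if the sysctl() query fails, so guard
         * before using the value, e.g. to size a thread pool. */
        if (!ncpus)
            ncpus = 1;
        printf("detected %u CPU(s)\n", ncpus);

        /* Prefer AVX2 only when it is both supported and not flagged as slow
         * (e.g. on Bulldozer-family CPUs); otherwise fall back to SSE2 or C. */
        if ((flags & TDI_CPU_FLAG_AVX2) && !(flags & TDI_CPU_FLAG_AVXSLOW))
            printf("selecting AVX2 kernels\n");
        else if ((flags & TDI_CPU_FLAG_SSE2) && !(flags & TDI_CPU_FLAG_SSE2SLOW))
            printf("selecting SSE2 kernels\n");
        else
            printf("selecting plain C kernels\n");

        return 0;
    }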