From f4e59032c42c1df52717d2663760072cd0510f30 Mon Sep 17 00:00:00 2001
From: eschnett
Date: Wed, 16 Jan 2013 20:17:39 +0000
Subject: Major update

Disable AVX emulation
Set default for streaming stores to "no"
Correct QPX vectorisation (IBM Blue Gene/Q)
Add MIC vectorisation (Intel Xeon Phi)
Convert SSE and AVX vectorisation to using inline functions instead of macros for code clarity
Define CCTK_BOOLEAN, CCTK_INTEGER and CCTK_BOOLEAN_VEC, CCTK_INTEGER_VEC to make boolean and integer vectors explicit

git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@77 105869f7-3296-0410-a4ea-f4349344b45a
---
 configuration.ccl              |    3 +-
 configure.sh                   |   14 +-
 src/avxintrin_emu.h            | 1061 ----------------------------------------
 src/macros/vectors-4-SSE.h     |  457 +++++++++++++++++
 src/macros/vectors-4-default.h |  134 +++++
 src/macros/vectors-8-AVX.h     |  325 ++++++++++++
 src/macros/vectors-8-SSE2.h    |  427 ++++++++++++++++
 src/macros/vectors-8-default.h |  132 +++++
 src/vectors-4-AVX.h            |  659 +++++++++++++++++++++++++
 src/vectors-4-SSE.h            |  938 ++++++++++++++++++++++-----------
 src/vectors-4-default.h        |    3 +
 src/vectors-8-AVX.h            |  752 +++++++++++++++++++---------
 src/vectors-8-DoubleHummer.h   |    4 +-
 src/vectors-8-MIC.h            |  652 ++++++++++++++++++++++++
 src/vectors-8-QPX.h            |   17 +-
 src/vectors-8-SSE2.h           |  831 ++++++++++++++++++++-----------
 src/vectors-8-VSX.h            |    4 +-
 src/vectors-8-default.h        |   43 +-
 src/vectors.h                  |  214 ++++++--
 19 files changed, 4675 insertions(+), 1995 deletions(-)
 delete mode 100644 src/avxintrin_emu.h
 create mode 100644 src/macros/vectors-4-SSE.h
 create mode 100644 src/macros/vectors-4-default.h
 create mode 100644 src/macros/vectors-8-AVX.h
 create mode 100644 src/macros/vectors-8-SSE2.h
 create mode 100644 src/macros/vectors-8-default.h
 create mode 100644 src/vectors-4-AVX.h
 create mode 100644 src/vectors-8-MIC.h

diff --git a/configuration.ccl b/configuration.ccl
index 9468abc..e2963f0 100644
--- a/configuration.ccl
+++ b/configuration.ccl
@@ -10,8 +10,7 @@ PROVIDES Vectors
         VECTORISE_ALWAYS_USE_UNALIGNED_LOADS \
         VECTORISE_ALWAYS_USE_ALIGNED_LOADS \
         VECTORISE_INLINE \
-        VECTORISE_STREAMING_STORES \
-        VECTORISE_EMULATE_AVX
+        VECTORISE_STREAMING_STORES
 }

 REQUIRES Vectors

diff --git a/configure.sh b/configure.sh
index 1555570..96f34cd 100644
--- a/configure.sh
+++ b/configure.sh
@@ -71,7 +71,7 @@ esac
 case $(echo "x$VECTORISE_STREAMING_STORES" | tr '[:upper:]' '[:lower:]') in
     (xyes) VECTORISE_STREAMING_STORES=1 ;;
     (xno)  VECTORISE_STREAMING_STORES=0 ;;
-    (x)    VECTORISE_STREAMING_STORES=1 ;; # default
+    (x)    VECTORISE_STREAMING_STORES=0 ;; # default
     (*)    echo "BEGIN ERROR"
            echo "Illegal value of option VECTORISE_STREAMING_STORES"
            echo "END ERROR"
            exit 1
 esac
@@ -88,16 +88,6 @@ case $(echo "x$VECTORISE_INLINE" | tr '[:upper:]' '[:lower:]') in
            exit 1
 esac

-case $(echo "x$VECTORISE_EMULATE_AVX" | tr '[:upper:]' '[:lower:]') in
-    (xyes) VECTORISE_EMULATE_AVX=1 ;;
-    (xno)  VECTORISE_EMULATE_AVX=0 ;;
-    (x)    VECTORISE_EMULATE_AVX=0 ;; # default
-    (*)    echo "BEGIN ERROR"
-           echo "Illegal value of option VECTORISE_EMULATE_AVX"
-           echo "END ERROR"
-           exit 1
-esac
-


 ################################################################################
@@ -112,7 +102,6 @@ echo "VECTORISE_ALWAYS_USE_UNALIGNED_LOADS $VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
 echo "VECTORISE_ALWAYS_USE_ALIGNED_LOADS $VECTORISE_ALWAYS_USE_ALIGNED_LOADS"
 echo "VECTORISE_INLINE $VECTORISE_INLINE"
 echo "VECTORISE_STREAMING_STORES $VECTORISE_STREAMING_STORES"
-echo "VECTORISE_EMULATE_AVX $VECTORISE_EMULATE_AVX"
 echo "END DEFINE"

 echo "BEGIN MAKE_DEFINITION"
@@
-122,5 +111,4 @@ echo "VECTORISE_ALWAYS_USE_UNALIGNED_LOADS = $VECTORISE_ALWAYS_USE_UNALIGNED_LOA echo "VECTORISE_ALWAYS_USE_ALIGNED_LOADS = $VECTORISE_ALWAYS_USE_ALIGNED_LOADS" echo "VECTORISE_INLINE = $VECTORISE_INLINE" echo "VECTORISE_STREAMING_STORES = $VECTORISE_STREAMING_STORES" -echo "VECTORISE_EMULATE_AVX = $VECTORISE_EMULATE_AVX" echo "END MAKE_DEFINITION" diff --git a/src/avxintrin_emu.h b/src/avxintrin_emu.h deleted file mode 100644 index 3097cd7..0000000 --- a/src/avxintrin_emu.h +++ /dev/null @@ -1,1061 +0,0 @@ -/* - Copyright (c) 2010, Intel Corporation. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its contributors may - be used to endorse or promote products derived from this software without - specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - THE POSSIBILITY OF SUCH DAMAGE. -*/ - -/*** - - Provide feedback to: maxim.locktyukhin intel com, phil.j.kerly intel com - - Version 1.0 - Initial release. - - This AVX intrinsics emulation header file designed to work with Intel C/C++ - as well as GCC compilers. - - Known Issues and limitations: - - - does not support immediate values higher than 0x7 for _mm[256]_cmp_[ps|pd] - intrinsics, UD2 instruction will be generated instead - - - -O0 optimization level may _sometimes_ result with compile time errors due - to failed forced inline and compiler not being able to generate instruction - with constant immediate operand becasue of it, compiling with -O1 and/or - -finline-functions should help. - -***/ - - -#ifndef __EMU_M256_AVXIMMINTRIN_EMU_H__ -#define __EMU_M256_AVXIMMINTRIN_EMU_H__ - -#ifdef __GNUC__ - -#ifdef __SSE__ -#include -#endif - -#ifdef __SSE2__ -#include -#endif - -#ifdef __SSE3__ -#include -#endif - -#ifdef __SSSE3__ -#include -#endif - -#if defined (__SSE4_2__) || defined (__SSE4_1__) -#include -#endif - -#if defined (__AES__) || defined (__PCLMUL__) -#include -#endif - -#else - -#include - -#endif - -#pragma message (" --- Intel remark: AVX intrinsics are emulated with SSE ---") - -/* - * Intel(R) AVX compiler intrinsics. 
- */ - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * This is an emulation of Intel AVX - */ - -#if defined( _MSC_VER ) || defined( __INTEL_COMPILER ) - #define __EMU_M256_ALIGN( a ) __declspec(align(a)) - #define __emu_inline __forceinline - #define __emu_int64_t __int64 -#elif defined( __GNUC__ ) - #define __EMU_M256_ALIGN( a ) __attribute__((__aligned__(a))) - #define __emu_inline __inline __attribute__((__always_inline__)) - #define __emu_int64_t long long -#else - #error "unsupported platform" -#endif - -typedef union __EMU_M256_ALIGN(32) __emu__m256 -{ - float __emu_arr[8]; - __m128 __emu_m128[2]; -} __emu__m256; - -typedef union __EMU_M256_ALIGN(32) __emu__m256d -{ - double __emu_arr[4]; - __m128d __emu_m128[2]; -} __emu__m256d; - -typedef union __EMU_M256_ALIGN(32) __emu__m256i -{ - int __emu_arr[8]; - __m128i __emu_m128[2]; -} __emu__m256i; - -static __emu_inline __emu__m256 __emu_set_m128( const __m128 arr[] ) { __emu__m256 ret; ret.__emu_m128[0] = arr[0]; ret.__emu_m128[1] = arr[1]; return (ret); } -static __emu_inline __emu__m256d __emu_set_m128d( const __m128d arr[] ) { __emu__m256d ret; ret.__emu_m128[0] = arr[0]; ret.__emu_m128[1] = arr[1]; return (ret); } -static __emu_inline __emu__m256i __emu_set_m128i( const __m128i arr[] ) { __emu__m256i ret; ret.__emu_m128[0] = arr[0]; ret.__emu_m128[1] = arr[1]; return (ret); } - - -#define __EMU_M256_IMPL_M1( type, func ) \ -static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1 ) \ -{ __emu##type res; \ - res.__emu_m128[0] = _mm_##func( m256_param1.__emu_m128[0] ); \ - res.__emu_m128[1] = _mm_##func( m256_param1.__emu_m128[1] ); \ - return ( res ); \ -} - -#define __EMU_M256_IMPL_M1_RET( ret_type, type, func ) \ -static __emu_inline __emu##ret_type __emu_mm256_##func( __emu##type m256_param1 ) \ -{ __emu##ret_type res; \ - res.__emu_m128[0] = _mm_##func( m256_param1.__emu_m128[0] ); \ - res.__emu_m128[1] = _mm_##func( m256_param1.__emu_m128[1] ); \ - return ( res ); \ -} - -#define __EMU_M256_IMPL_M1_RET_NAME( ret_type, type, func, name ) \ - static __emu_inline __emu##ret_type __emu_mm256_##name( __emu##type m256_param1 ) \ -{ __emu##ret_type res; \ - res.__emu_m128[0] = _mm_##func( m256_param1.__emu_m128[0] ); \ - res.__emu_m128[1] = _mm_##func( m256_param1.__emu_m128[1] ); \ - return ( res ); \ -} - -#define __EMU_M256_IMPL_M1_LH( type, type_128, func ) \ -static __emu_inline __emu##type __emu_mm256_##func( type_128 m128_param ) \ -{ __emu##type res; \ - res.__emu_m128[0] = _mm_##func( m128_param ); \ - __m128 m128_param_high = _mm_movehl_ps( *(__m128*)&m128_param, *(__m128*)&m128_param ); \ - res.__emu_m128[1] = _mm_##func( *(type_128*)&m128_param_high ); \ - return ( res ); \ -} - -#define __EMU_M256_IMPL_M1_HL( type_128, type, func ) \ -static __emu_inline type_128 __emu_mm256_##func( __emu##type m256_param1 ) \ -{ type_128 res, tmp; \ - res = _mm_##func( m256_param1.__emu_m128[0] ); \ - tmp = _mm_##func( m256_param1.__emu_m128[1] ); \ - *(((__emu_int64_t*)&res)+1) = *(__emu_int64_t*)&tmp; \ - return ( res ); \ -} - -#define __EMU_M256_IMPL_M1P_DUP( type, type_param, func ) \ -static __emu_inline __emu##type __emu_mm256_##func( type_param param ) \ -{ __emu##type res; \ - res.__emu_m128[0] = _mm_##func( param ); \ - res.__emu_m128[1] = _mm_##func( param ); \ - return ( res ); \ -} - -#define __EMU_M256_IMPL_M1I_DUP( type, func ) \ - static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, const int param2 ) \ -{ __emu##type res; \ - res.__emu_m128[0] = _mm_##func( 
m256_param1.__emu_m128[0], param2 ); \ - res.__emu_m128[1] = _mm_##func( m256_param1.__emu_m128[1], param2 ); \ - return ( res ); \ -} - -#define __EMU_M256_IMPL2_M1I_DUP( type, func ) \ -static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, const int param2 ) \ -{ __emu##type res; \ - res.__emu_m128[0] = __emu_mm_##func( m256_param1.__emu_m128[0], param2 ); \ - res.__emu_m128[1] = __emu_mm_##func( m256_param1.__emu_m128[1], param2 ); \ - return ( res ); \ -} - -#define __EMU_M256_IMPL2_M1I_SHIFT( type, func, shift_for_hi ) \ -static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, const int param2 ) \ -{ __emu##type res; \ - res.__emu_m128[0] = __emu_mm_##func( m256_param1.__emu_m128[0], param2 & ((1<> shift_for_hi); \ - return ( res ); \ -} - -#define __EMU_M256_IMPL_M2( type, func ) \ -static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, __emu##type m256_param2 ) \ -{ __emu##type res; \ - res.__emu_m128[0] = _mm_##func( m256_param1.__emu_m128[0], m256_param2.__emu_m128[0] ); \ - res.__emu_m128[1] = _mm_##func( m256_param1.__emu_m128[1], m256_param2.__emu_m128[1] ); \ - return ( res ); \ -} - -#define __EMU_M256_IMPL2_M2T( type, type_2, func ) \ -static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, __emu##type_2 m256_param2 ) \ -{ __emu##type res; \ - res.__emu_m128[0] = __emu_mm_##func( m256_param1.__emu_m128[0], m256_param2.__emu_m128[0] ); \ - res.__emu_m128[1] = __emu_mm_##func( m256_param1.__emu_m128[1], m256_param2.__emu_m128[1] ); \ - return ( res ); \ -} - -#define __EMU_M256_IMPL_M2I_DUP( type, func ) \ -static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, __emu##type m256_param2, const int param3 ) \ -{ __emu##type res; \ - res.__emu_m128[0] = _mm_##func( m256_param1.__emu_m128[0], m256_param2.__emu_m128[0], param3 ); \ - res.__emu_m128[1] = _mm_##func( m256_param1.__emu_m128[1], m256_param2.__emu_m128[1], param3 ); \ - return ( res ); \ -} - -#define __EMU_M256_IMPL2_M2I_DUP( type, func ) \ -static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, __emu##type m256_param2, const int param3 ) \ -{ __emu##type res; \ - res.__emu_m128[0] = __emu_mm_##func( m256_param1.__emu_m128[0], m256_param2.__emu_m128[0], param3 ); \ - res.__emu_m128[1] = __emu_mm_##func( m256_param1.__emu_m128[1], m256_param2.__emu_m128[1], param3 ); \ - return ( res ); \ -} - -#define __EMU_M256_IMPL_M2I_SHIFT( type, func, shift_for_hi ) \ -static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, __emu##type m256_param2, const int param3 ) \ -{ __emu##type res; \ - res.__emu_m128[0] = _mm_##func( m256_param1.__emu_m128[0], m256_param2.__emu_m128[0], param3 & ((1<> shift_for_hi ); \ - return ( res ); \ -} - -#define __EMU_M256_IMPL_M3( type, func ) \ -static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, __emu##type m256_param2, __emu##type m256_param3 ) \ -{ __emu##type res; \ - res.__emu_m128[0] = _mm_##func( m256_param1.__emu_m128[0], m256_param2.__emu_m128[0], m256_param3.__emu_m128[0] ); \ - res.__emu_m128[1] = _mm_##func( m256_param1.__emu_m128[1], m256_param2.__emu_m128[1], m256_param3.__emu_m128[1] ); \ - return ( res ); \ -} - - -/* - * Compare predicates for scalar and packed compare intrinsics - */ -#define _CMP_EQ_OQ 0x00 /* Equal (ordered, nonsignaling) */ -#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */ -#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */ -#define _CMP_UNORD_Q 0x03 /* 
Unordered (nonsignaling) */ -#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, nonsignaling) */ -#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */ -#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */ -#define _CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */ - -#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */ -#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unordered, signaling) */ -#define _CMP_NGT_US 0x0A /* Not-greater-than (unordered, signaling) */ -#define _CMP_FALSE_OQ 0x0B /* False (ordered, nonsignaling) */ -#define _CMP_NEQ_OQ 0x0C /* Not-equal (ordered, non-signaling) */ -#define _CMP_GE_OS 0x0D /* Greater-than-or-equal (ordered, signaling) */ -#define _CMP_GT_OS 0x0E /* Greater-than (ordered, signaling) */ -#define _CMP_TRUE_UQ 0x0F /* True (unordered, non-signaling) */ -#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */ -#define _CMP_LT_OQ 0x11 /* Less-than (ordered, nonsignaling) */ -#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, nonsignaling) */ -#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */ -#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */ -#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, nonsignaling) */ -#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unordered, nonsignaling) */ -#define _CMP_ORD_S 0x17 /* Ordered (signaling) */ -#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */ -#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unordered, nonsignaling) */ -#define _CMP_NGT_UQ 0x1A /* Not-greater-than (unordered, nonsignaling) */ -#define _CMP_FALSE_OS 0x1B /* False (ordered, signaling) */ -#define _CMP_NEQ_OS 0x1C /* Not-equal (ordered, signaling) */ -#define _CMP_GE_OQ 0x1D /* Greater-than-or-equal (ordered, nonsignaling) */ -#define _CMP_GT_OQ 0x1E /* Greater-than (ordered, nonsignaling) */ -#define _CMP_TRUE_US 0x1F /* True (unordered, signaling) */ - -__EMU_M256_IMPL_M2( __m256d, add_pd ); -__EMU_M256_IMPL_M2( __m256, add_ps ); - -__EMU_M256_IMPL_M2( __m256d, addsub_pd ); -__EMU_M256_IMPL_M2( __m256, addsub_ps ); - -__EMU_M256_IMPL_M2( __m256d, and_pd ); -__EMU_M256_IMPL_M2( __m256, and_ps ); - -__EMU_M256_IMPL_M2( __m256d, andnot_pd ); -__EMU_M256_IMPL_M2( __m256, andnot_ps ); - -__EMU_M256_IMPL_M2( __m256d, div_pd ); -__EMU_M256_IMPL_M2( __m256, div_ps ); - -__EMU_M256_IMPL_M2( __m256d, hadd_pd ); -__EMU_M256_IMPL_M2( __m256, hadd_ps ); - -__EMU_M256_IMPL_M2( __m256d, hsub_pd ); -__EMU_M256_IMPL_M2( __m256, hsub_ps ); - -__EMU_M256_IMPL_M2( __m256d, max_pd ); -__EMU_M256_IMPL_M2( __m256, max_ps ); - -__EMU_M256_IMPL_M2( __m256d, min_pd ); -__EMU_M256_IMPL_M2( __m256, min_ps ); - -__EMU_M256_IMPL_M2( __m256d, mul_pd ); -__EMU_M256_IMPL_M2( __m256, mul_ps ); - -__EMU_M256_IMPL_M2( __m256d, or_pd ); -__EMU_M256_IMPL_M2( __m256, or_ps ); - -__EMU_M256_IMPL_M2I_SHIFT( __m256d, shuffle_pd, 2 ); -__EMU_M256_IMPL_M2I_DUP( __m256, shuffle_ps ); - -__EMU_M256_IMPL_M2( __m256d, sub_pd ); -__EMU_M256_IMPL_M2( __m256, sub_ps ); - -__EMU_M256_IMPL_M2( __m256d, xor_pd ); -__EMU_M256_IMPL_M2( __m256, xor_ps ); - -#if defined (__SSE4_2__) || defined (__SSE4_1__) - -__EMU_M256_IMPL_M2I_SHIFT( __m256d, blend_pd, 2 ); -__EMU_M256_IMPL_M2I_SHIFT( __m256, blend_ps, 4 ); - -__EMU_M256_IMPL_M3( __m256d, blendv_pd ); -__EMU_M256_IMPL_M3( __m256, blendv_ps ); - -__EMU_M256_IMPL_M2I_DUP( __m256, dp_ps ); - -__EMU_M256_IMPL_M1I_DUP( __m256d, round_pd ); -#define _mm256_ceil_pd(val) _mm256_round_pd((val), 0x0A); -#define _mm256_floor_pd(val) _mm256_round_pd((val), 0x09); - 
-__EMU_M256_IMPL_M1I_DUP( __m256, round_ps ); -#define _mm256_ceil_ps(val) _mm256_round_ps((val), 0x0A); -#define _mm256_floor_ps(val) _mm256_round_ps((val), 0x09); - -#define __emu_mm_test_impl( op, sfx, vec_type ) \ -static __emu_inline int __emu_mm_test##op##_##sfx(vec_type s1, vec_type s2) { \ - __m128d sign_bits_pd = _mm_castsi128_pd( _mm_set_epi32( 1 << 31, 0, 1 << 31, 0 ) ); \ - __m128 sign_bits_ps = _mm_castsi128_ps( _mm_set1_epi32( 1 << 31 ) ); \ - \ - s1 = _mm_and_##sfx( s1, sign_bits_##sfx ); \ - s2 = _mm_and_##sfx( s2, sign_bits_##sfx ); \ - return _mm_test##op##_si128( _mm_cast##sfx##_si128( s1 ), _mm_cast##sfx##_si128( s2 ) ); \ -} - -__emu_mm_test_impl( z, pd, __m128d ); -__emu_mm_test_impl( c, pd, __m128d ); -__emu_mm_test_impl( nzc, pd, __m128d ); - -__emu_mm_test_impl( z, ps, __m128 ); -__emu_mm_test_impl( c, ps, __m128 ); -__emu_mm_test_impl( nzc, ps, __m128 ); - - - -#define __emu_mm256_test_impl( prfx, op, sfx, sfx_impl, vec_type ) \ -static __emu_inline int __emu_mm256_test##op##_##sfx(vec_type s1, vec_type s2) { \ - int ret1 = prfx##_test##op##_##sfx_impl( s1.__emu_m128[0], s2.__emu_m128[0] ); \ - int ret2 = prfx##_test##op##_##sfx_impl( s1.__emu_m128[1], s2.__emu_m128[1] ); \ - return ( ret1 && ret2 ); \ -}; - -__emu_mm256_test_impl( _mm, z, si256, si128, __emu__m256i ); -__emu_mm256_test_impl( _mm, c, si256, si128, __emu__m256i ); -__emu_mm256_test_impl( _mm, nzc, si256, si128, __emu__m256i ); - -__emu_mm256_test_impl( __emu_mm, z, pd, pd, __emu__m256d ); -__emu_mm256_test_impl( __emu_mm, c, pd, pd, __emu__m256d ); -__emu_mm256_test_impl( __emu_mm, nzc, pd, pd, __emu__m256d ); - -__emu_mm256_test_impl( __emu_mm, z, ps, ps, __emu__m256 ); -__emu_mm256_test_impl( __emu_mm, c, ps, ps, __emu__m256 ); -__emu_mm256_test_impl( __emu_mm, nzc, ps, ps, __emu__m256 ); - -#endif - -#if defined( __GNUC__ ) && ( __GNUC__ == 4 ) && (__GNUC_MINOR__ < 4 ) -/* use macro implementation instead of inline functions to allow -O0 for GCC pre 4.4 */ - -#pragma message ("Using macro for GCC <4.4" ) - -#define __emu_mm_cmp_ps(m1, m2, predicate) \ -({ \ - __m128 res_ = (m1), m2_ = (m2); \ - if ( 7 < (unsigned)predicate ) __asm__ __volatile__ ( "ud2" : : : "memory" ); \ - __asm__ ( "cmpps %[pred_], %[m2_], %[res_]" : [res_] "+x" (res_) : [m2_] "xm" (m2_), [pred_] "i" (predicate) ); \ - res_; }) - -#define __emu_mm256_cmp_ps(m1, m2, predicate) \ -({ \ - __emu__m256 res_ = (m1), m2_ = (m2); \ - if ( 7 < (unsigned)predicate ) __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */ \ - __asm__ ( "cmpps %[pred_], %[m2_], %[res_]" : [res_] "+x" (res_.__emu_m128[0]) : [m2_] "xm" (m2_.__emu_m128[0]), [pred_] "i" (predicate) ); \ - __asm__ ( "cmpps %[pred_], %[m2_], %[res_]" : [res_] "+x" (res_.__emu_m128[1]) : [m2_] "xm" (m2_.__emu_m128[1]), [pred_] "i" (predicate) ); \ - res_; }) - - -#define __emu_mm_cmp_pd(m1, m2, predicate) \ -({ \ - __m128 res_ = (m1), m2_ = (m2); \ - if ( 7 < (unsigned)predicate ) __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */ \ - __asm__ ( "cmppd %[pred_], %[m2_], %[res_]" : [res_] "+x" (res_) : [m2_] "xm" (m2_), [pred_] "i" (predicate) ); \ - res_; }) - -#define __emu_mm256_cmp_pd(m1, m2, predicate) \ -({ \ - __emu__m256 res_ = (m1), m2_ = (m2); \ - if ( 7 < (unsigned)predicate ) __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */ \ - __asm__ ( "cmppd %[pred_], %[m2_], %[res_]" : [res_] "+x" (res_.__emu_m128[0]) : [m2_] "xm" (m2_.__emu_m128[0]), [pred_] "i" (predicate) ); \ - __asm__ ( "cmppd %[pred_], %[m2_], %[res_]" 
: [res_] "+x" (res_.__emu_m128[1]) : [m2_] "xm" (m2_.__emu_m128[1]), [pred_] "i" (predicate) ); \ - res_; }) - - -#define __emu_mm_cmp_ss(m1, m2, predicate) \ -({ \ - __m128 res_ = (m1), m2_ = (m2); \ - if ( 7 < (unsigned)predicate ) __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */ \ - __asm__ ( "cmpss %[pred_], %[m2_], %[res_]" : [res_] "+x" (res_) : [m2_] "xm" (m2_), [pred_] "i" (predicate) ); \ - res_; }) - -#define __emu_mm_cmp_sd(m1, m2, predicate) \ -({ \ - __m128 res_ = (m1), m2_ = (m2); \ - if ( 7 < (unsigned)predicate ) __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */ \ - __asm__ ( "cmpsd %[pred_], %[m2_], %[res_]" : [res_] "+x" (res_) : [m2_] "xm" (m2_), [pred_] "i" (predicate) ); \ - res_; }) - - - -#else /* __GNUC__==4 && __GNUC_MINOR__ <4 */ - - -static __emu_inline __m128 __emu_mm_cmp_ps(__m128 m1, __m128 m2, const int predicate) -{ - __m128 res; - - if ( predicate >= 0 && predicate <= 7 ) { - res = m1; - __asm__ ( "cmpps %[pred_], %[m2_], %[res_]" : [res_] "+x" (res) : [m2_] "xm" (m2), [pred_] "i" (predicate) ); - } else { - __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */ - } - - return ( res ); -} -__EMU_M256_IMPL2_M2I_DUP( __m256, cmp_ps ) - -static __emu_inline __m128d __emu_mm_cmp_pd(__m128d m1, __m128d m2, const int predicate) -{ - __m128d res; - - if ( predicate >= 0 && predicate <= 7 ) { - res = m1; - __asm__ ( "cmppd %[pred_], %[m2_], %[res_]" : [res_] "+x" (res) : [m2_] "xm" (m2), [pred_] "i" (predicate) ); - } else { - __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */ - } - - return ( res ); -} -__EMU_M256_IMPL2_M2I_DUP( __m256d, cmp_pd ) - - -static __emu_inline __m128d __emu_mm_cmp_sd(__m128d m1, __m128d m2, const int predicate) -{ - __m128d res; - - if ( predicate >= 0 && predicate <= 7 ) { - res = m1; - __asm__ ( "cmpsd %[pred_], %[m2_], %[res_]" : [res_] "+x" (res) : [m2_] "xm" (m2), [pred_] "i" (predicate) ); - } else { - __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */ - } - - return ( res ); -} - -static __emu_inline __m128 __emu_mm_cmp_ss(__m128 m1, __m128 m2, const int predicate) -{ - __m128 res; - - if ( predicate >= 0 && predicate <= 7 ) { - res = m1; - __asm__ ( "cmpss %[pred_], %[m2_], %[res_]" : [res_] "+x" (res) : [m2_] "xm" (m2), [pred_] "i" (predicate) ); - } else { - __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */ - } - - return ( res ); -} - -#endif - - -__EMU_M256_IMPL_M1_LH( __m256d, __m128i, cvtepi32_pd ); -__EMU_M256_IMPL_M1_RET( __m256, __m256i, cvtepi32_ps ); -__EMU_M256_IMPL_M1_HL( __m128, __m256d, cvtpd_ps ); -__EMU_M256_IMPL_M1_RET( __m256i, __m256, cvtps_epi32 ); -__EMU_M256_IMPL_M1_LH( __m256d, __m128, cvtps_pd ); -__EMU_M256_IMPL_M1_HL( __m128i, __m256d, cvttpd_epi32); -__EMU_M256_IMPL_M1_HL( __m128i, __m256d, cvtpd_epi32); -__EMU_M256_IMPL_M1_RET( __m256i, __m256, cvttps_epi32 ); - -static __emu_inline __m128 __emu_mm256_extractf128_ps(__emu__m256 m1, const int offset) { return m1.__emu_m128[ offset ]; } -static __emu_inline __m128d __emu_mm256_extractf128_pd(__emu__m256d m1, const int offset) { return m1.__emu_m128[ offset ]; } -static __emu_inline __m128i __emu_mm256_extractf128_si256(__emu__m256i m1, const int offset) { return m1.__emu_m128[ offset ]; } - -static __emu_inline void __emu_mm256_zeroall(void) {} -static __emu_inline void __emu_mm256_zeroupper(void) {} - -static __emu_inline __m128 __emu_mm_permutevar_ps(__m128 a, __m128i control) -{ - int const* sel = (int const*)&control; - float 
const* src = (float const*)&a; - __EMU_M256_ALIGN(16) float dest[4]; - int i=0; - - for (; i<4; ++i) - dest[i] = src[ 3 & sel[i] ]; - - return ( *(__m128*)dest ); -} -__EMU_M256_IMPL2_M2T( __m256, __m256i, permutevar_ps ); - -static __emu_inline __m128 __emu_mm_permute_ps(__m128 a, int control) { return _mm_castsi128_ps( _mm_shuffle_epi32( *(__m128i*)&a, control ) ); } -__EMU_M256_IMPL2_M1I_DUP( __m256, permute_ps ); - - -static __emu_inline __m128d __emu_mm_permutevar_pd(__m128d a, __m128i control) -{ - __emu_int64_t const* sel = (__emu_int64_t const*)&control; - double const* src = (double const*)&a; - __EMU_M256_ALIGN(16) double dest[2]; - int i=0; - - for (; i<2; ++i) - dest[i] = src[ (2 & sel[i]) >> 1 ]; - - return ( *(__m128d*)dest ); -} -__EMU_M256_IMPL2_M2T( __m256d, __m256i, permutevar_pd ); - -static __emu_inline __m128d __emu_mm_permute_pd(__m128d a, int control) -{ - double const* src = (double const*)&a; - __EMU_M256_ALIGN(16) double dest[2]; - int i=0; - - for (; i<2; ++i) - dest[i] = src[ 1 & (control >> i) ]; - - return ( *(__m128d*)dest ); -} -__EMU_M256_IMPL2_M1I_SHIFT( __m256d, permute_pd, 2 ); - - -#define __emu_mm256_permute2f128_impl( name, m128_type, m256_type ) \ -static __emu_inline m256_type name( m256_type m1, m256_type m2, int control) { \ - m256_type res; \ - __m128 zero = _mm_setzero_ps(); \ - const m128_type param[4] = { m1.__emu_m128[0], m1.__emu_m128[1], m2.__emu_m128[0], m2.__emu_m128[1] }; \ - res.__emu_m128[0] = (control & 8) ? *(m128_type*)&zero : param[ control & 0x3 ]; control >>= 4; \ - res.__emu_m128[1] = (control & 8) ? *(m128_type*)&zero : param[ control & 0x3 ]; \ - return ( res ); \ -} - -__emu_mm256_permute2f128_impl( __emu_mm256_permute2f128_ps, __m128, __emu__m256 ); -__emu_mm256_permute2f128_impl( __emu_mm256_permute2f128_pd, __m128d, __emu__m256d ); -__emu_mm256_permute2f128_impl( __emu_mm256_permute2f128_si256, __m128i, __emu__m256i ); - - -#define __emu_mm_broadcast_impl( name, res_type, type ) \ -static __emu_inline res_type name(type const *a) { \ - const size_t size = sizeof( res_type ) / sizeof( type );\ - __EMU_M256_ALIGN(32) type res[ size ]; \ - size_t i = 0; \ - for ( ; i < size; ++i ) \ - res[ i ] = *a; \ - return (*(res_type*)&res); \ -} - -__emu_mm_broadcast_impl( __emu_mm_broadcast_ss, __m128, float ) -__emu_mm_broadcast_impl( __emu_mm256_broadcast_ss, __emu__m256, float ) - -__emu_mm_broadcast_impl( __emu_mm_broadcast_sd, __m128, double ) -__emu_mm_broadcast_impl( __emu_mm256_broadcast_sd, __emu__m256d, double ) - -__emu_mm_broadcast_impl( __emu_mm256_broadcast_ps, __emu__m256, __m128 ) -__emu_mm_broadcast_impl( __emu_mm256_broadcast_pd, __emu__m256d, __m128d ) - - -static __emu_inline __emu__m256 __emu_mm256_insertf128_ps(__emu__m256 a, __m128 b, int offset) { a.__emu_m128[ offset ] = b; return a; } -static __emu_inline __emu__m256d __emu_mm256_insertf128_pd(__emu__m256d a, __m128d b, int offset) { a.__emu_m128[ offset ] = b; return a; } -static __emu_inline __emu__m256i __emu_mm256_insertf128_si256(__emu__m256i a, __m128i b, int offset) { a.__emu_m128[ offset ] = b; return a; } - - -#define __emu_mm_load_impl( name, sfx, m256_sfx, m256_type, type_128, type ) \ -static __emu_inline __emu##m256_type __emu_mm256_##name##_##m256_sfx(const type* a) { \ - __emu##m256_type res; \ - res.__emu_m128[0] = _mm_##name##_##sfx( (const type_128 *)a ); \ - res.__emu_m128[1] = _mm_##name##_##sfx( (const type_128 *)(1+(const __m128 *)a) ); \ - return (res); \ -} - -#define __emu_mm_store_impl( name, sfx, m256_sfx, m256_type, type_128, type ) 
\ -static __emu_inline void __emu_mm256_##name##_##m256_sfx(type *a, __emu##m256_type b) { \ - _mm_##name##_##sfx( (type_128*)a, b.__emu_m128[0] ); \ - _mm_##name##_##sfx( (type_128*)(1+(__m128*)a), b.__emu_m128[1] ); \ -} - -__emu_mm_load_impl( load, pd, pd, __m256d, double, double ); -__emu_mm_store_impl( store, pd, pd, __m256d, double, double ); - -__emu_mm_load_impl( load, ps, ps, __m256, float, float ); -__emu_mm_store_impl( store, ps, ps, __m256, float, float ); - -__emu_mm_load_impl( loadu, pd, pd, __m256d, double, double ); -__emu_mm_store_impl( storeu, pd, pd, __m256d, double, double ); - -__emu_mm_load_impl( loadu, ps, ps, __m256, float, float ); -__emu_mm_store_impl( storeu, ps, ps, __m256, float, float ); - -__emu_mm_load_impl( load, si128, si256, __m256i, __m128i, __emu__m256i ); -__emu_mm_store_impl( store, si128, si256, __m256i, __m128i, __emu__m256i ); - -__emu_mm_load_impl( loadu, si128, si256, __m256i, __m128i, __emu__m256i ); -__emu_mm_store_impl( storeu, si128, si256, __m256i, __m128i, __emu__m256i ); - - -#define __emu_maskload_impl( name, vec_type, mask_vec_type, type, mask_type ) \ -static __emu_inline vec_type name(type const *a, mask_vec_type mask) { \ - const size_t size_type = sizeof( type ); \ - const size_t size = sizeof( vec_type ) / size_type; \ - __EMU_M256_ALIGN(32) type res[ size ]; \ - const mask_type* p_mask = (const mask_type*)&mask; \ - size_t i = 0; \ - mask_type sign_bit = 1; \ - sign_bit <<= (8*size_type - 1); \ - for ( ; i < size; ++i ) \ - res[ i ] = (sign_bit & *(p_mask + i)) ? *(a+i) : 0; \ - return (*(vec_type*)&res); \ -} - -#define __emu_maskstore_impl( name, vec_type, mask_vec_type, type, mask_type ) \ -static __emu_inline void name(type *a, mask_vec_type mask, vec_type data) { \ - const size_t size_type = sizeof( type ); \ - const size_t size = sizeof( vec_type ) / sizeof( type ); \ - type* p_data = (type*)&data; \ - const mask_type* p_mask = (const mask_type*)&mask; \ - size_t i = 0; \ - mask_type sign_bit = 1; \ - sign_bit <<= (8*size_type - 1); \ - for ( ; i < size; ++i ) \ - if ( *(p_mask + i ) & sign_bit) \ - *(a + i) = *(p_data + i); \ -} - -__emu_maskload_impl( __emu_mm256_maskload_pd, __emu__m256d, __emu__m256i, double, __emu_int64_t ); -__emu_maskstore_impl( __emu_mm256_maskstore_pd, __emu__m256d, __emu__m256i, double, __emu_int64_t ); - -__emu_maskload_impl( __emu_mm_maskload_pd, __m128d, __m128i, double, __emu_int64_t ); -__emu_maskstore_impl( __emu_mm_maskstore_pd, __m128d, __m128i, double, __emu_int64_t ); - -__emu_maskload_impl( __emu_mm256_maskload_ps, __emu__m256, __emu__m256i, float, int ); -__emu_maskstore_impl( __emu_mm256_maskstore_ps, __emu__m256, __emu__m256i, float, int ); - -__emu_maskload_impl( __emu_mm_maskload_ps, __m128, __m128i, float, int ); -__emu_maskstore_impl( __emu_mm_maskstore_ps, __m128, __m128i, float, int ); - - -__EMU_M256_IMPL_M1( __m256, movehdup_ps ); -__EMU_M256_IMPL_M1( __m256, moveldup_ps ); -__EMU_M256_IMPL_M1( __m256d, movedup_pd ); - -__emu_mm_load_impl( lddqu, si128, si256, __m256i, __m128i, __emu__m256i ); - -__emu_mm_store_impl( stream, si128, si256, __m256i, __m128i, __emu__m256i ); -__emu_mm_store_impl( stream, pd, pd, __m256d, double, double ); -__emu_mm_store_impl( stream, ps, ps, __m256, float, float ); - - -__EMU_M256_IMPL_M1( __m256, rcp_ps ); -__EMU_M256_IMPL_M1( __m256, rsqrt_ps ); - -__EMU_M256_IMPL_M1( __m256d, sqrt_pd ); -__EMU_M256_IMPL_M1( __m256, sqrt_ps ); - -__EMU_M256_IMPL_M2( __m256d, unpackhi_pd ); -__EMU_M256_IMPL_M2( __m256, unpackhi_ps ); -__EMU_M256_IMPL_M2( 
__m256d, unpacklo_pd ); -__EMU_M256_IMPL_M2( __m256, unpacklo_ps ); - - -static __emu_inline int __emu_mm256_movemask_pd(__emu__m256d a) -{ - return - (_mm_movemask_pd( a.__emu_m128[1] ) << 2) | - _mm_movemask_pd( a.__emu_m128[0] ); -} - -static __emu_inline int __emu_mm256_movemask_ps(__emu__m256 a) -{ - return - (_mm_movemask_ps( a.__emu_m128[1] ) << 4) | - _mm_movemask_ps( a.__emu_m128[0] ); -} - -static __emu_inline __emu__m256d __emu_mm256_setzero_pd(void) { __m128d ret[2] = { _mm_setzero_pd(), _mm_setzero_pd() }; return __emu_set_m128d( ret ); } -static __emu_inline __emu__m256 __emu_mm256_setzero_ps(void) { __m128 ret[2] = { _mm_setzero_ps(), _mm_setzero_ps() }; return __emu_set_m128( ret ); } -static __emu_inline __emu__m256i __emu_mm256_setzero_si256(void) { __m128i ret[2] = { _mm_setzero_si128(), _mm_setzero_si128() }; return __emu_set_m128i( ret ); } - -static __emu_inline __emu__m256d __emu_mm256_set_pd(double a1, double a2, double a3, double a4) -{ __m128d ret[2] = { _mm_set_pd( a3, a4 ), _mm_set_pd( a1, a2 ) }; return __emu_set_m128d( ret ); } - -static __emu_inline __emu__m256 __emu_mm256_set_ps(float a1, float a2, float a3, float a4, float a5, float a6, float a7, float a8) -{ __m128 ret[2] = { _mm_set_ps( a5, a6, a7, a8 ), _mm_set_ps( a1, a2, a3, a4 ) }; return __emu_set_m128( ret ); } - -static __emu_inline __emu__m256i __emu_mm256_set_epi8(char a1, char a2, char a3, char a4, char a5, char a6, char a7, char a8, - char a9, char a10, char a11, char a12, char a13, char a14, char a15, char a16, - char a17, char a18, char a19, char a20, char a21, char a22, char a23, char a24, - char a25, char a26, char a27, char a28, char a29, char a30, char a31, char a32) -{ __m128i ret[2] = { _mm_set_epi8( a17, a18, a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32 ), - _mm_set_epi8( a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16 ) }; - return __emu_set_m128i( ret ); -} - -static __emu_inline __emu__m256i __emu_mm256_set_epi16(short a1, short a2, short a3, short a4, short a5, short a6, short a7, short a8, - short a9, short a10, short a11, short a12, short a13, short a14, short a15, short a16) -{ __m128i ret[2] = { _mm_set_epi16( a9, a10, a11, a12, a13, a14, a15, a16 ), - _mm_set_epi16( a1, a2, a3, a4, a5, a6, a7, a8 ) }; - return __emu_set_m128i( ret ); -} - -static __emu_inline __emu__m256i __emu_mm256_set_epi32(int a1, int a2, int a3, int a4, int a5, int a6, int a7, int a8) -{ __m128i ret[2] = { _mm_set_epi32( a5, a6, a7, a8 ), _mm_set_epi32( a1, a2, a3, a4 ) }; return __emu_set_m128i( ret ); } - -static __emu_inline __m128i __emu_mm_set_epi64x( __emu_int64_t a, __emu_int64_t b ) { return _mm_set_epi64( *(__m64*)&a, *(__m64*)&b ); } - -static __emu_inline __emu__m256i __emu_mm256_set_epi64x(__emu_int64_t a1, __emu_int64_t a2, __emu_int64_t a3, __emu_int64_t a4) -{ __m128i ret[2] = { __emu_mm_set_epi64x( a3, a4 ), __emu_mm_set_epi64x( a1, a2 ) }; return __emu_set_m128i( ret ); } - - -static __emu_inline __emu__m256d __emu_mm256_setr_pd(double a1, double a2, double a3, double a4) -{ __m128d ret[2] = { _mm_setr_pd( a1, a2 ), _mm_setr_pd( a3, a4 ) }; return __emu_set_m128d( ret ); } - -static __emu_inline __emu__m256 __emu_mm256_setr_ps(float a1, float a2, float a3, float a4, float a5, float a6, float a7, float a8) -{ __m128 ret[2] = { _mm_setr_ps( a1, a2, a3, a4 ), _mm_setr_ps( a5, a6, a7, a8 ) }; return __emu_set_m128( ret ); } - -static __emu_inline __emu__m256i __emu_mm256_setr_epi8(char a1, char a2, char a3, char a4, char a5, char a6, char a7, char 
a8, - char a9, char a10, char a11, char a12, char a13, char a14, char a15, char a16, - char a17, char a18, char a19, char a20, char a21, char a22, char a23, char a24, - char a25, char a26, char a27, char a28, char a29, char a30, char a31, char a32) -{ __m128i ret[2] = { _mm_setr_epi8( a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16 ), - _mm_setr_epi8( a17, a18, a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32 ) }; - return __emu_set_m128i( ret ); -} - -static __emu_inline __emu__m256i __emu_mm256_setr_epi16(short a1, short a2, short a3, short a4, short a5, short a6, short a7, short a8, - short a9, short a10, short a11, short a12, short a13, short a14, short a15, short a16) -{ __m128i ret[2] = { _mm_setr_epi16( a1, a2, a3, a4, a5, a6, a7, a8 ), - _mm_setr_epi16( a9, a10, a11, a12, a13, a14, a15, a16 ) }; return __emu_set_m128i( ret ); -} - -static __emu_inline __emu__m256i __emu_mm256_setr_epi32(int a1, int a2, int a3, int a4, int a5, int a6, int a7, int a8) -{ __m128i ret[2] = { _mm_setr_epi32( a1, a2, a3, a4 ), _mm_setr_epi32( a5, a6, a7, a8 ), }; return __emu_set_m128i( ret ); } - -static __emu_inline __emu__m256i __emu_mm256_setr_epi64x(__emu_int64_t a1, __emu_int64_t a2, __emu_int64_t a3, __emu_int64_t a4) -{ __m128i ret[2] = { __emu_mm_set_epi64x( a2, a1 ), __emu_mm_set_epi64x( a4, a3 ) }; return __emu_set_m128i( ret ); } - - - -__EMU_M256_IMPL_M1P_DUP( __m256d, double, set1_pd ); -__EMU_M256_IMPL_M1P_DUP( __m256, float, set1_ps ); -__EMU_M256_IMPL_M1P_DUP( __m256i, char, set1_epi8 ); -__EMU_M256_IMPL_M1P_DUP( __m256i, short, set1_epi16 ); -__EMU_M256_IMPL_M1P_DUP( __m256i, int, set1_epi32 ); - -static __emu__m256i __emu_mm256_set1_epi64x(__emu_int64_t a) -{ - __emu_int64_t res[4] = { a, a, a, a }; - return *((__emu__m256i*)res); -} - -/* - * Support intrinsics to do vector type casts. These intrinsics do not introduce - * extra moves to generated code. 
When cast is done from a 128 to 256-bit type - * the low 128 bits of the 256-bit result contain source parameter value; the - * upper 128 bits of the result are undefined - */ -__EMU_M256_IMPL_M1_RET( __m256, __m256d, castpd_ps ); -__EMU_M256_IMPL_M1_RET( __m256d, __m256, castps_pd ); - -__EMU_M256_IMPL_M1_RET_NAME( __m256i, __m256, castps_si128, castps_si256 ); -__EMU_M256_IMPL_M1_RET_NAME( __m256i, __m256d, castpd_si128, castpd_si256 ); - -__EMU_M256_IMPL_M1_RET_NAME( __m256, __m256i, castsi128_ps, castsi256_ps ); -__EMU_M256_IMPL_M1_RET_NAME( __m256d, __m256i, castsi128_pd, castsi256_pd ); - -static __emu_inline __m128 __emu_mm256_castps256_ps128(__emu__m256 a) { return ( a.__emu_m128[0] ); } -static __emu_inline __m128d __emu_mm256_castpd256_pd128(__emu__m256d a) { return ( a.__emu_m128[0] ); } -static __emu_inline __m128i __emu_mm256_castsi256_si128(__emu__m256i a) { return ( a.__emu_m128[0] ); } - -static __emu_inline __emu__m256 __emu_mm256_castps128_ps256(__m128 a) { __m128 ret[2] = { a, _mm_setzero_ps() }; return __emu_set_m128( ret ); }; -static __emu_inline __emu__m256d __emu_mm256_castpd128_pd256(__m128d a) { __m128d ret[2] = { a, _mm_setzero_pd() }; return __emu_set_m128d( ret ); }; -static __emu_inline __emu__m256i __emu_mm256_castsi128_si256(__m128i a) { __m128i ret[2] = { a, _mm_setzero_si128() }; return __emu_set_m128i( ret ); }; - -#if defined __cplusplus -}; /* End "C" */ -#endif /* __cplusplus */ - - - - - - -#ifndef __EMU_M256_NOMAP - -#define __m256 __emu__m256 -#define __m256i __emu__m256i -#define __m256d __emu__m256d - -#define _mm256_add_pd __emu_mm256_add_pd -#define _mm256_add_ps __emu_mm256_add_ps - -#define _mm256_addsub_pd __emu_mm256_addsub_pd -#define _mm256_addsub_ps __emu_mm256_addsub_ps - -#define _mm256_and_pd __emu_mm256_and_pd -#define _mm256_and_ps __emu_mm256_and_ps - -#define _mm256_andnot_pd __emu_mm256_andnot_pd -#define _mm256_andnot_ps __emu_mm256_andnot_ps - -#define _mm256_blend_pd __emu_mm256_blend_pd -#define _mm256_blend_ps __emu_mm256_blend_ps - -#define _mm256_blendv_pd __emu_mm256_blendv_pd -#define _mm256_blendv_ps __emu_mm256_blendv_ps - -#define _mm256_div_pd __emu_mm256_div_pd -#define _mm256_div_ps __emu_mm256_div_ps - -#define _mm256_dp_ps __emu_mm256_dp_ps - -#define _mm256_hadd_pd __emu_mm256_hadd_pd -#define _mm256_hadd_ps __emu_mm256_hadd_ps - -#define _mm256_hsub_pd __emu_mm256_hsub_pd -#define _mm256_hsub_ps __emu_mm256_hsub_ps - -#define _mm256_max_pd __emu_mm256_max_pd -#define _mm256_max_ps __emu_mm256_max_ps - -#define _mm256_min_pd __emu_mm256_min_pd -#define _mm256_min_ps __emu_mm256_min_ps - -#define _mm256_mul_pd __emu_mm256_mul_pd -#define _mm256_mul_ps __emu_mm256_mul_ps - -#define _mm256_or_pd __emu_mm256_or_pd -#define _mm256_or_ps __emu_mm256_or_ps - -#define _mm256_shuffle_pd __emu_mm256_shuffle_pd -#define _mm256_shuffle_ps __emu_mm256_shuffle_ps - -#define _mm256_sub_pd __emu_mm256_sub_pd -#define _mm256_sub_ps __emu_mm256_sub_ps - -#define _mm256_xor_pd __emu_mm256_xor_pd -#define _mm256_xor_ps __emu_mm256_xor_ps - - -#define _mm_cmp_pd __emu_mm_cmp_pd -#define _mm256_cmp_pd __emu_mm256_cmp_pd - -#define _mm_cmp_ps __emu_mm_cmp_ps -#define _mm256_cmp_ps __emu_mm256_cmp_ps - -#define _mm_cmp_sd __emu_mm_cmp_sd -#define _mm_cmp_ss __emu_mm_cmp_ss - -#define _mm256_cvtepi32_pd __emu_mm256_cvtepi32_pd -#define _mm256_cvtepi32_ps __emu_mm256_cvtepi32_ps - -#define _mm256_cvtpd_ps __emu_mm256_cvtpd_ps -#define _mm256_cvtps_epi32 __emu_mm256_cvtps_epi32 -#define _mm256_cvtps_pd __emu_mm256_cvtps_pd - -#define 
_mm256_cvttpd_epi32 __emu_mm256_cvttpd_epi32 -#define _mm256_cvtpd_epi32 __emu_mm256_cvtpd_epi32 -#define _mm256_cvttps_epi32 __emu_mm256_cvttps_epi32 - -#define _mm256_extractf128_ps __emu_mm256_extractf128_ps -#define _mm256_extractf128_pd __emu_mm256_extractf128_pd -#define _mm256_extractf128_si256 __emu_mm256_extractf128_si256 - -#define _mm256_zeroall __emu_mm256_zeroall -#define _mm256_zeroupper __emu_mm256_zeroupper - -#define _mm256_permutevar_ps __emu_mm256_permutevar_ps -#define _mm_permutevar_ps __emu_mm_permutevar_ps - -#define _mm256_permute_ps __emu_mm256_permute_ps -#define _mm_permute_ps __emu_mm_permute_ps - -#define _mm256_permutevar_pd __emu_mm256_permutevar_pd -#define _mm_permutevar_pd __emu_mm_permutevar_pd - -#define _mm256_permute_pd __emu_mm256_permute_pd -#define _mm_permute_pd __emu_mm_permute_pd - -#define _mm256_permute2f128_ps __emu_mm256_permute2f128_ps -#define _mm256_permute2f128_pd __emu_mm256_permute2f128_pd -#define _mm256_permute2f128_si256 __emu_mm256_permute2f128_si256 - -#define _mm256_broadcast_ss __emu_mm256_broadcast_ss -#define _mm_broadcast_ss __emu_mm_broadcast_ss - -#define _mm256_broadcast_sd __emu_mm256_broadcast_sd - -#define _mm256_broadcast_ps __emu_mm256_broadcast_ps -#define _mm256_broadcast_pd __emu_mm256_broadcast_pd - -#define _mm256_insertf128_ps __emu_mm256_insertf128_ps -#define _mm256_insertf128_pd __emu_mm256_insertf128_pd -#define _mm256_insertf128_si256 __emu_mm256_insertf128_si256 - -#define _mm256_load_pd __emu_mm256_load_pd -#define _mm256_store_pd __emu_mm256_store_pd -#define _mm256_load_ps __emu_mm256_load_ps -#define _mm256_store_ps __emu_mm256_store_ps - -#define _mm256_loadu_pd __emu_mm256_loadu_pd -#define _mm256_storeu_pd __emu_mm256_storeu_pd -#define _mm256_loadu_ps __emu_mm256_loadu_ps -#define _mm256_storeu_ps __emu_mm256_storeu_ps - -#define _mm256_load_si256 __emu_mm256_load_si256 -#define _mm256_store_si256 __emu_mm256_store_si256 -#define _mm256_loadu_si256 __emu_mm256_loadu_si256 -#define _mm256_storeu_si256 __emu_mm256_storeu_si256 - -#define _mm256_maskload_pd __emu_mm256_maskload_pd -#define _mm256_maskstore_pd __emu_mm256_maskstore_pd -#define _mm_maskload_pd __emu_mm_maskload_pd -#define _mm_maskstore_pd __emu_mm_maskstore_pd - -#define _mm256_maskload_ps __emu_mm256_maskload_ps -#define _mm256_maskstore_ps __emu_mm256_maskstore_ps -#define _mm_maskload_ps __emu_mm_maskload_ps -#define _mm_maskstore_ps __emu_mm_maskstore_ps - -#define _mm256_movehdup_ps __emu_mm256_movehdup_ps -#define _mm256_moveldup_ps __emu_mm256_moveldup_ps - -#define _mm256_movedup_pd __emu_mm256_movedup_pd -#define _mm256_lddqu_si256 __emu_mm256_lddqu_si256 - -#define _mm256_stream_si256 __emu_mm256_stream_si256 -#define _mm256_stream_pd __emu_mm256_stream_pd -#define _mm256_stream_ps __emu_mm256_stream_ps - -#define _mm256_rcp_ps __emu_mm256_rcp_ps -#define _mm256_rsqrt_ps __emu_mm256_rsqrt_ps - -#define _mm256_sqrt_pd __emu_mm256_sqrt_pd -#define _mm256_sqrt_ps __emu_mm256_sqrt_ps - -#define _mm256_round_pd __emu_mm256_round_pd - -#define _mm256_round_ps __emu_mm256_round_ps - -#define _mm256_unpackhi_pd __emu_mm256_unpackhi_pd -#define _mm256_unpackhi_ps __emu_mm256_unpackhi_ps - -#define _mm256_unpacklo_pd __emu_mm256_unpacklo_pd -#define _mm256_unpacklo_ps __emu_mm256_unpacklo_ps - -#define _mm256_testz_si256 __emu_mm256_testz_si256 -#define _mm256_testc_si256 __emu_mm256_testc_si256 -#define _mm256_testnzc_si256 __emu_mm256_testnzc_si256 - -#define _mm256_testz_pd __emu_mm256_testz_pd -#define _mm256_testc_pd 
__emu_mm256_testc_pd -#define _mm256_testnzc_pd __emu_mm256_testnzc_pd -#define _mm_testz_pd __emu_mm_testz_pd -#define _mm_testc_pd __emu_mm_testc_pd -#define _mm_testnzc_pd __emu_mm_testnzc_pd - -#define _mm256_testz_ps __emu_mm256_testz_ps -#define _mm256_testc_ps __emu_mm256_testc_ps -#define _mm256_testnzc_ps __emu_mm256_testnzc_ps -#define _mm_testz_ps __emu_mm_testz_ps -#define _mm_testc_ps __emu_mm_testc_ps -#define _mm_testnzc_ps __emu_mm_testnzc_ps - -#define _mm256_movemask_pd __emu_mm256_movemask_pd -#define _mm256_movemask_ps __emu_mm256_movemask_ps - -#define _mm256_setzero_pd __emu_mm256_setzero_pd -#define _mm256_setzero_ps __emu_mm256_setzero_ps -#define _mm256_setzero_si256 __emu_mm256_setzero_si256 - -#define _mm256_set_pd __emu_mm256_set_pd -#define _mm256_set_ps __emu_mm256_set_ps -#define _mm256_set_epi8 __emu_mm256_set_epi8 -#define _mm256_set_epi16 __emu_mm256_set_epi16 -#define _mm256_set_epi32 __emu_mm256_set_epi32 -#define _mm256_set_epi64x __emu_mm256_set_epi64x - -#define _mm256_setr_pd __emu_mm256_setr_pd -#define _mm256_setr_ps __emu_mm256_setr_ps -#define _mm256_setr_epi8 __emu_mm256_setr_epi8 -#define _mm256_setr_epi16 __emu_mm256_setr_epi16 -#define _mm256_setr_epi32 __emu_mm256_setr_epi32 -#define _mm256_setr_epi64x __emu_mm256_setr_epi64x - -#define _mm256_set1_pd __emu_mm256_set1_pd -#define _mm256_set1_ps __emu_mm256_set1_ps -#define _mm256_set1_epi8 __emu_mm256_set1_epi8 -#define _mm256_set1_epi16 __emu_mm256_set1_epi16 -#define _mm256_set1_epi32 __emu_mm256_set1_epi32 -#define _mm256_set1_epi64x __emu_mm256_set1_epi64x - -#define _mm256_castpd_ps __emu_mm256_castpd_ps -#define _mm256_castps_pd __emu_mm256_castps_pd -#define _mm256_castps_si256 __emu_mm256_castps_si256 -#define _mm256_castpd_si256 __emu_mm256_castpd_si256 -#define _mm256_castsi256_ps __emu_mm256_castsi256_ps -#define _mm256_castsi256_pd __emu_mm256_castsi256_pd -#define _mm256_castps256_ps128 __emu_mm256_castps256_ps128 -#define _mm256_castpd256_pd128 __emu_mm256_castpd256_pd128 -#define _mm256_castsi256_si128 __emu_mm256_castsi256_si128 -#define _mm256_castps128_ps256 __emu_mm256_castps128_ps256 -#define _mm256_castpd128_pd256 __emu_mm256_castpd128_pd256 -#define _mm256_castsi128_si256 __emu_mm256_castsi128_si256 - -#endif /* __EMU_M256_NOMAP */ - - - -#endif /* __EMU_M256_AVXIMMINTRIN_EMU_H__ */ diff --git a/src/macros/vectors-4-SSE.h b/src/macros/vectors-4-SSE.h new file mode 100644 index 0000000..2be477b --- /dev/null +++ b/src/macros/vectors-4-SSE.h @@ -0,0 +1,457 @@ +// Vectorise using Intel's or AMD's SSE + +// Use the type __m128 directly, without introducing a wrapper class +// Use macros instead of inline functions + + + +#include +#include + +#include +#ifdef __SSE4_1__ +// Intel's SSE 4.1 +# include +#endif +#ifdef __SSE4A__ +// AMD's SSE 4a +# include +#endif +#ifdef __FMA4__ +# include +#endif + + + +#ifdef __SSE4_1__ +# define vec4_architecture_SSE4_1 "+SSE4.1" +#else +# define vec4_architecture_SSE4_1 "" +#endif +#ifdef __SSE4A__ +# define vec4_architecture_SSE4a "+SSE4A" +#else +# define vec4_architecture_SSE4a "" +#endif +#ifdef __FMA4__ +# define vec4_architecture_FMA4 "+FMA4" +#else +# define vec4_architecture_FMA4 "" +#endif +#define vec4_architecture "SSE" vec4_architecture_SSE4_1 vec4_architecture_SSE4a vec4_architecture_FMA4 " (32-bit precision)" + + + +// Vector type corresponding to CCTK_REAL +#define CCTK_REAL4_VEC __m128 + +// Number of vector elements in a CCTK_REAL_VEC +#define CCTK_REAL4_VEC_SIZE 4 + +// Integer and boolean types corresponding to this 
real type +#define CCTK_INTEGER4 CCTK_REAL4 +#define CCTK_BOOLEAN4 CCTK_REAL4 +#define CCTK_INTEGER4_VEC CCTK_REAL4_VEC +#define CCTK_BOOLEAN4_VEC CCTK_REAL4_VEC + + + +union k4const_t { + unsigned i[4]; + float f[4]; + __m128i vi; + __m128 vf; +}; + +#define K4_ZERO 0x00000000UL +#define K4_IMIN 0x80000000UL +#define K4_IMAX 0x7fffffffUL + + + +// Create vectors, extract vector elements + +#define vec4_set1(a) (_mm_set1_ps(a)) +#define vec4_set(a,b,c,d) (_mm_set_ps(d,c,b,a)) // note reversed arguments + +// original order is 0123 +#define vec4_swap1032(x_) \ + ({ \ + CCTK_REAL4_VEC const x__=(x_); \ + CCTK_REAL4_VEC const x=x__; \ + _mm_shuffle_ps(x,x, _MM_SHUFFLE(2,3,0,1)); \ + }) +#define vec4_swap2301(x_) \ + ({ \ + CCTK_REAL4_VEC const x__=(x_); \ + CCTK_REAL4_VEC const x=x__; \ + _mm_shuffle_ps(x,x, _MM_SHUFFLE(1,0,3,2)); \ + }) +#define vec4_swap3210(x_) \ + ({ \ + CCTK_REAL4_VEC const x__=(x_); \ + CCTK_REAL4_VEC const x=x__; \ + _mm_shuffle_ps(x,x, _MM_SHUFFLE(0,1,2,3)); \ + }) + +#if defined(__PGI) +// _mm_cvtss_f32 does not exist on PGI compilers +# define vec4_elt0(x) \ + ({ \ + CCTK_REAL4 a; \ + asm ("" : "=x" (a) : "0" (x)); \ + a; \ + }) +#else +# define vec4_elt0(x) (_mm_cvtss_f32(x)) // this is a no-op +#endif +#define vec4_elt1(x) vec4_elt0(vec4_swap1032(x)) +#define vec4_elt2(x) vec4_elt0(vec4_swap2301(x)) +#define vec4_elt3(x) vec4_elt0(vec4_swap3210(x)) +#if defined(__PGI) +# define vec4_elt(x_,d) \ + ({ \ + CCTK_REAL4_VEC const x__=(x_); \ + CCTK_REAL4_VEC const x=x__; \ + CCTK_REAL4 a; \ + if (d==0) a=vec4_elt0(x); \ + else if (d==1) a=vec4_elt1(x); \ + else if (d==2) a=vec4_elt2(x); \ + else if (d==3) a=vec4_elt3(x); \ + a; \ + }) +#else +# define vec4_elt(x_,d) \ + ({ \ + CCTK_REAL4_VEC const x__=(x_); \ + CCTK_REAL4_VEC const x=x__; \ + CCTK_REAL4 a; \ + switch (d) { \ + case 0: a=vec4_elt0(x); break; \ + case 1: a=vec4_elt1(x); break; \ + case 2: a=vec4_elt2(x); break; \ + case 3: a=vec4_elt3(x); break; \ + } \ + a; \ + }) +#endif + + + +// Load and store vectors + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +#define vec4_load(p) (_mm_load_ps(&(p))) +#define vec4_loadu(p) (_mm_loadu_ps(&(p))) +#if ! 
VECTORISE_ALWAYS_USE_ALIGNED_LOADS +# define vec4_load_off1(p) vec_loadu(p) +# define vec4_load_off2(p) vec_loadu(p) +# define vec4_load_off3(p) vec_loadu(p) +#else +# define vec4_load_off1(p_) \ + ({ \ + CCTK_REAL4 const& p__=(p_); \ + CCTK_REAL4 const& p=p__; \ + CCTK_REAL4_VEC const lo=vec4_load((&p)[-1]); \ + CCTK_REAL4_VEC const hi=vec4_load((&p)[+3]); \ + assert(0); \ + CCTK_REAL4_VEC const hi2=_mm_shuffle_ps(lo,hi, _MM_SHUFFLE(0,1,2,3)); \ + _mm_shuffle_ps(lo,hi2, _MM_SHUFFLE(2,1,3,0)); \ + }) +# define vec4_load_off2(p_) \ + ({ \ + CCTK_REAL4 const& p__=(p_); \ + CCTK_REAL4 const& p=p__; \ + CCTK_REAL4_VEC const lo=vec4_load((&p)[-2]); \ + CCTK_REAL4_VEC const hi=vec4_load((&p)[+2]); \ + _mm_shuffle_ps(lo,hi, _MM_SHUFFLE(1,0,3,2)); \ + }) +# define vec4_load_off3(p_) \ + ({ \ + CCTK_REAL4 const& p__=(p_); \ + CCTK_REAL4 const& p=p__; \ + CCTK_REAL4_VEC const lo=vec4_load((&p)[-1]); \ + CCTK_REAL4_VEC const hi=vec4_load((&p)[+3]); \ + assert(0); \ + CCTK_REAL4_VEC const lo2=_mm_shuffle_ps(lo,hi, _MM_SHUFFLE(0,1,2,3)); \ + _mm_shuffle_ps(lo2,hi, _MM_SHUFFLE(3,0,2,1)); \ + }) +#endif + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS +// Implementation: Always use unaligned load +# define vec4_loadu_maybe(off,p) vec4_loadu(p) +# define vec4_loadu_maybe3(off1,off2,off3,p) vec4_loadu(p) +#else +# define vec4_loadu_maybe(off,p_) \ + ({ \ + CCTK_REAL4 const& p__=(p_); \ + CCTK_REAL4 const& p=p__; \ + (off) % CCTK_REAL4_VEC_SIZE == 0 ? \ + vec4_load(p) : \ + vec4_loadu(p); \ + }) +# if VECTORISE_ALIGNED_ARRAYS +// Assume all array x sizes are multiples of the vector size +# define vec4_loadu_maybe3(off1,off2,off3,p) \ + vec4_loadu_maybe(off1,p) +# else +# define vec4_loadu_maybe3(off1,off2,off3,p) \ + vec4_loadu_maybe((off1)|(off2)|(off3),p) +# endif +#endif + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +#define vec4_store(p,x) (_mm_store_ps(&(p),x)) +#define vec4_storeu(p,x) (_mm_storeu_ps(&(p),x)) +#if ! 
VECTORISE_STREAMING_STORES +# define vec4_store_nta(p,x) vec4_store(p,x) +#else +# define vec4_store_nta(p,x) (_mm_stream_ps(&(p),x)) +#endif + +// Store a partial vector (aligned and non-temporal) +#define vec4_store_partial_prepare(i,imin,imax) \ + int v4stp_lo_skip = (imin)-(i); \ + int v4stp_hi_skip = (i)+CCTK_REAL_VEC_SIZE-(imax); \ + if (CCTK_BUILTIN_EXPECT(v4stp_lo_skip < 0, true)) v4stp_lo_skip = 0; \ + if (CCTK_BUILTIN_EXPECT(v4stp_hi_skip < 0, true)) v4stp_hi_skip = 0; +// Ignoring VECTORISE_STREAMING_STORES for partial stores +#define vec4_store_nta_partial(p_,x_) \ + ({ \ + CCTK_REAL4& p__=(p_); \ + CCTK_REAL4& p=p__; \ + CCTK_REAL4_VEC const x__=(x_); \ + CCTK_REAL4_VEC const x=x__; \ + if (CCTK_BUILTIN_EXPECT(v4stp_lo_skip==0 and v4stp_hi_skip==0, true)) { \ + vec4_store_nta(p,x); \ + } else { \ + /* these cases fall through */ \ + switch (v4stp_lo_skip) { \ + case 0: \ + (&p)[0] = vec4_elt0(x); \ + case 1: \ + if (v4stp_hi_skip>=3) break; \ + (&p)[1] = vec4_elt1(x); \ + case 2: \ + if (v4stp_hi_skip>=2) break; \ + (&p)[2] = vec4_elt2(x); \ + case 3: \ + if (v4stp_hi_skip>=1) break; \ + (&p)[3] = vec4_elt3(x); \ + } \ + } \ + }) + +// Ignoring VECTORISE_STREAMING_STORES for partial stores +#define vec4_store_nta_partial_lo(p_,x_,n) \ + ({ \ + CCTK_REAL4 & p__=(p_); \ + CCTK_REAL4_VEC const x__=(x_); \ + CCTK_REAL4 & p=p__; \ + CCTK_REAL4_VEC const x=x__; \ + /* these cases fall through */ \ + switch (n) { \ + case 3: (&p)[2] = vec4_elt2(x); \ + case 2: (&p)[1] = vec4_elt1(x); \ + case 1: (&p)[0] = vec4_elt0(x); \ + } \ + }) +#define vec4_store_nta_partial_hi(p_,x_,n) \ + ({ \ + CCTK_REAL4 & p__=(p_); \ + CCTK_REAL4_VEC const x__=(x_); \ + CCTK_REAL4 & p=p__; \ + CCTK_REAL4_VEC const x=x__; \ + /* these cases fall through */ \ + switch (n) { \ + case 3: (&p)[1]=vec4_elt1(x); \ + case 2: (&p)[2]=vec4_elt2(x); \ + case 1: (&p)[3]=vec4_elt3(x); \ + } \ + }) +#define vec4_store_nta_partial_mid(p_,x_,nlo,nhi) \ + ({ \ + CCTK_REAL4 & p__=(p_); \ + CCTK_REAL4_VEC const x__=(x_); \ + CCTK_REAL4 & p=p__; \ + CCTK_REAL4_VEC const x=x__; \ + /* these cases fall through */ \ + switch (nhi) { \ + case 3: if (nlo<2) break; (&p)[1] = vec4_elt1(x); \ + case 2: if (nlo<3) break; (&p)[2] = vec4_elt2(x); \ + } \ + }) + + + +// Functions and operators + +static const k4const_t k4sign_mask = {{ K4_IMIN, K4_IMIN, K4_IMIN, K4_IMIN, }}; + +// Operators +#define k4neg(x) (_mm_xor_ps(k4sign_mask.vf,x)) +// #define k4inv(x) +// TODO: provide k4inv via rcp and Newton-Raphson +// This is described in AMD's publication 47414. +// This should apply for AVX as well. 
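
For reference, a reciprocal along the lines of the TODO above could look roughly like the following sketch. It is not part of this patch: the helper name k4inv_nr is hypothetical, and it assumes a single Newton-Raphson step on top of RCPPS (which alone yields only about 12 accurate bits) is sufficient for the intended use.

#include <xmmintrin.h>

/* Sketch only: reciprocal via RCPPS plus one Newton-Raphson refinement,
   r1 = r0*(2 - x*r0); each refinement step roughly doubles the accurate bits. */
static inline __m128 k4inv_nr(__m128 const x)
{
  __m128 const r0 = _mm_rcp_ps(x);                      /* ~12-bit estimate */
  __m128 const r1 =
    _mm_mul_ps(r0, _mm_sub_ps(_mm_set1_ps(2.0f), _mm_mul_ps(x, r0)));
  return r1;                                            /* refined reciprocal */
}

/* A division built on this would then be k4mul(x, k4inv_nr(y)) instead of k4div(x,y). */
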
+ +#define k4add(x,y) (_mm_add_ps(x,y)) +#define k4sub(x,y) (_mm_sub_ps(x,y)) +#define k4mul(x,y) (_mm_mul_ps(x,y)) +// TODO: use k4inv and k4mul instead +#define k4div(x,y) (_mm_div_ps(x,y)) + +// Fused multiply-add, defined as [+-]x*y[+-]z +#ifdef __FMA4__ +# define k4madd(x,y,z) (_mm_macc_ps(x,y,z)) +# define k4msub(x,y,z) (_mm_msub_ps(x,y,z)) +# define k4nmadd(x,y,z) (_mm_nmsub_ps(x,y,z)) +# define k4nmsub(x,y,z) (_mm_nmacc_ps(x,y,z)) +#else +# define k4madd(x,y,z) (k4add(k4mul(x,y),z)) +# define k4msub(x,y,z) (k4sub(k4mul(x,y),z)) +# define k4nmadd(x,y,z) (k4sub(k4neg(z),k4mul(x,y))) +# define k4nmsub(x,y,z) (k4sub(z,k4mul(x,y))) +#endif + +// Cheap functions +#define k4copysign(x,y) \ + (_mm_or_ps(_mm_andnot_ps(k4sign_mask.vf,x), \ + _mm_and_ps(k4sign_mask.vf,y))) +#define k4fabs(x) (_mm_andnot_ps(k4sign_mask.vf,x)) +#define k4fmax(x,y) (_mm_max_ps(x,y)) +#define k4fmin(x,y) (_mm_min_ps(x,y)) +#define k4fnabs(x) (_mm_or_ps(k4sign_mask.vf,x)) +#define k4sgn(x_) \ + ({ \ + CCTK_REAL_VEC const x__=(x_); \ + CCTK_REAL_VEC const x=x__; \ + CCTK_REAL_VEC const iszero = _mm_cmpeq_ps(vec4_set1(0.0f), x); \ + CCTK_REAL_VEC const sign = _mm_and_ps(k4sign_mask.vf, x); \ + CCTK_REAL_VEC const signedone = _mm_or_ps(vec4_set1(1.0f), sign); \ + k4ifthen(iszero, vec4_set1(0.0f), signedone); \ + }) +// TODO: maybe use rsqrt and Newton-Raphson +#define k4sqrt(x) (_mm_sqrt_ps(x)) + +// Expensive functions +#define K4REPL(f,x_) \ + ({ \ + CCTK_REAL4_VEC const x__=(x_); \ + CCTK_REAL4_VEC const x=x__; \ + vec4_set(f(vec4_elt0(x)), \ + f(vec4_elt1(x)), \ + f(vec4_elt2(x)), \ + f(vec4_elt3(x))); \ + }) +#define K4REPL2S(f,x_,a_) \ + ({ \ + CCTK_REAL4_VEC const x__=(x_); \ + CCTK_REAL4 const a__=(a_); \ + CCTK_REAL4_VEC const x=x__; \ + CCTK_REAL4 const a=a__; \ + vec4_set(f(vec4_elt0(x),a), \ + f(vec4_elt1(x),a), \ + f(vec4_elt2(x),a), \ + f(vec4_elt3(x),a)); \ + }) +#define K4REPL2(f,x_,y_) \ + ({ \ + CCTK_REAL4_VEC const x__=(x_); \ + CCTK_REAL4_VEC const y__=(y_); \ + CCTK_REAL4_VEC const x=x__; \ + CCTK_REAL4_VEC const y=y__; \ + vec4_set(f(vec4_elt0(x),vec4_elt0(y)), \ + f(vec4_elt1(x),vec4_elt1(y)), \ + f(vec4_elt2(x),vec4_elt2(y)), \ + f(vec4_elt3(x),vec4_elt3(y))); \ + }) + +#define k4acos(x) K4REPL(acosf,x) +#define k4acosh(x) K4REPL(acoshf,x) +#define k4asin(x) K4REPL(asinf,x) +#define k4asinh(x) K4REPL(asinhf,x) +#define k4atan(x) K4REPL(atanf,x) +#define k4atan2(x,y) K4REPL2(atan2f,x,y) +#define k4atanh(x) K4REPL(atanhf,x) +#define k4cos(x) K4REPL(cosf,x) +#define k4cosh(x) K4REPL(coshf,x) +#define k4exp(x) K4REPL(expf,x) +#define k4log(x) K4REPL(logf,x) +#define k4pow(x,a) K4REPL2S(powf,x,a) +#define k4sin(x) K4REPL(sinf,x) +#define k4sinh(x) K4REPL(sinhf,x) +#define k4tan(x) K4REPL(tanf,x) +#define k4tanh(x) K4REPL(tanhf,x) + +static const k4const_t k4lfalse_ = {{ 0U, 0U, 0U, 0U, }}; +static const k4const_t k4ltrue_ = {{ ~0U, ~0U, ~0U, ~0U, }}; +#define k4lfalse (k4lfalse_.vf) +#define k4ltrue (k4ltrue_.vf) +#define k4lnot(x) (_mm_xor_ps(k4ltrue,x)) +#define k4land(x,y) (_mm_and_ps(x,y)) +#define k4lor(x,y) (_mm_or_ps(x,y)) +#define k4lxor(x,y) (_mm_xor_ps(x,y)) + +#ifdef __SSE4_1__ +# define k4ifthen(x,y,z) (_mm_blendv_ps(z,y,x)) +#elif 0 +# ifdef __cplusplus +# define k4signbit(x) ({ using namespace std; signbit(x); }) +# else +# define k4signbit(x) (signbitf(x)) +# endif +# define k4ifthen(x,y,z) \ + ({ \ + CCTK_REAL4_VEC const x__=(x_); \ + CCTK_REAL4_VEC const y__=(y_); \ + CCTK_REAL4_VEC const z__=(z_); \ + CCTK_REAL4_VEC const x=x__; \ + CCTK_REAL4_VEC const y=y__; \ + CCTK_REAL4_VEC 
const z=z__; \ + vec4_set(k4signbit(vec4_elt0(x)) ? vec4_elt0(y) : vec4_elt0(z), \ + k4signbit(vec4_elt1(x)) ? vec4_elt1(y) : vec4_elt1(z), \ + k4signbit(vec4_elt2(x)) ? vec4_elt2(y) : vec4_elt2(z), \ + k4signbit(vec4_elt3(x)) ? vec4_elt3(y) : vec4_elt3(z)); \ + }) +#elif 0 +// We don't need to shift -- the condition (mask) will be either all +// zeros or all ones +# define k4ifthen(x_,y_,z_) \ + ({ \ + CCTK_REAL4_VEC const x__=(x_); \ + CCTK_REAL4_VEC const y__=(y_); \ + CCTK_REAL4_VEC const z__=(z_); \ + CCTK_REAL4_VEC const x=x__; \ + CCTK_REAL4_VEC const y=y__; \ + CCTK_REAL4_VEC const z=z__; \ + CCTK_REAL4_VEC const mask = \ + (__m128)_mm_srai_epi32((__m128i)x, 31); \ + /* (z & ~mask) | (y & mask) */ \ + _mm_or_ps(_mm_andnot_ps(mask, z), _mm_and_ps(mask, y)); \ + }) +#else +# define k4ifthen(x_,y_,z_) \ + ({ \ + CCTK_REAL4_VEC const x__=(x_); \ + CCTK_REAL4_VEC const y__=(y_); \ + CCTK_REAL4_VEC const z__=(z_); \ + CCTK_REAL4_VEC const x=x__; \ + CCTK_REAL4_VEC const y=y__; \ + CCTK_REAL4_VEC const z=z__; \ + /* (z & ~mask) | (y & mask) where imask = ~mask */ \ + _mm_or_ps(_mm_and_ps(x, y), _mm_andnot_ps(x, z)); \ + }) +#endif + +#define k4cmpeq(x,y) (_mm_cmpeq_ps(x,y)) +#define k4cmpne(x,y) (_mm_cmpneq_ps(x,y)) +#define k4cmpgt(x,y) (_mm_cmpgt_ps(x,y)) +#define k4cmpge(x,y) (_mm_cmpge_ps(x,y)) +#define k4cmplt(x,y) (_mm_cmplt_ps(x,y)) +#define k4cmple(x,y) (_mm_cmple_ps(x,y)) diff --git a/src/macros/vectors-4-default.h b/src/macros/vectors-4-default.h new file mode 100644 index 0000000..0cd49ac --- /dev/null +++ b/src/macros/vectors-4-default.h @@ -0,0 +1,134 @@ +// Fallback vectorisation implementation: Do not vectorise + +// We use macros here, so that we are not surprised by compilers which +// don't like to inline functions. This should also make debug builds +// (which may not inline) more efficient. + + + +#include +#include + + + +#define vec4_architecture "scalar (no vectorisation, 32-bit precision)" + +// Use CCTK_REAL4 +#define CCTK_REAL4_VEC CCTK_REAL4 + +// Number of vector elements in a vector +#define CCTK_REAL4_VEC_SIZE 1 + +// Integer and boolean types corresponding to this real type +#define CCTK_INTEGER4 CCTK_REAL4 +#define CCTK_BOOLEAN4 CCTK_REAL4 +#define CCTK_INTEGER4_VEC CCTK_REAL4_VEC +#define CCTK_BOOLEAN4_VEC CCTK_REAL4_VEC + + + +// Create a vector replicating a scalar +#define vec4_set1(a) (a) +// Create a vector from N scalars +#define vec4_set(a) (a) + +// Access vectors elements +#define vec4_elt0(x) (x) +#define vec4_elt(x,d) (x) + + + +// Load an aligned vector from memory +#define vec4_load(p) (p) +// Load an unaligned vector from memory +#define vec4_loadu(p) (p) + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset and the vector size. These functions are +// useful e.g. for loading neightbouring grid points while evaluating +// finite differencing stencils. +#define vec4_loadu_maybe(off,p) (p) +#define vec4_loadu_maybe3(off1,off2,off3,p) (p) + +// Aligned store +#define vec4_store(p,x) ((p)=(x)) +#define vec4_storeu(p,x) ((p)=(x)) + +// Unaligned store +#define vec4_store_nta(p,x) ((p)=(x)) + +#define vec4_store_partial_prepare(i,imin,imax) (0) +#define vec4_store_nta_partial(p,x) (vec4_store_nta(p,x)) +// Store the n lower elements of a vector to memory +#define vec4_store_nta_partial_lo(p,x,n) (assert(0)) +// Store the n higher elements of a vector into memory. This stores +// the vector elements into memory locations as if element 0 were +// stored at p. 
+#define vec4_store_nta_partial_hi(p,x,n) (assert(0)) +#define vec4_store_nta_partial_mid(p,x,nlo,nhi) (assert(0)) + + + +// Operators +#define k4neg(x) (-(x)) + +#define k4add(x,y) ((x)+(y)) +#define k4sub(x,y) ((x)-(y)) +#define k4mul(x,y) ((x)*(y)) +#define k4div(x,y) ((x)/(y)) + +// Fused multiply-add, defined as [+-]x*y[+-]z +#define k4madd(x,y,z) (+(x)*(y)+(z)) +#define k4msub(x,y,z) (+(x)*(y)-(z)) +#define k4nmadd(x,y,z) (-(x)*(y)-(z)) +#define k4nmsub(x,y,z) (-(x)*(y)+(z)) + +// Functions +#define k4acos(x) (acosf(x)) +#define k4acosh(x) (acoshf(x)) +#define k4asin(x) (asinf(x)) +#define k4asinh(x) (asinhf(x)) +#define k4atan(x) (atanf(x)) +#define k4atan2(x,y) (atan2f(x,y)) +#define k4atanh(x) (atanhf(x)) +#define k4copysign(x,y) (copysign(x,y)) +#define k4cos(x) (cosf(x)) +#define k4cosh(x) (coshf(x)) +#define k4exp(x) (expf(x)) +#define k4fabs(x) (fabsf(x)) +#define k4fmax(x,y) (fmaxf(x,y)) +#define k4fmin(x,y) (fminf(x,y)) +#define k4fnabs(x) (-fabsf(x)) +#define k4log(x) (logf(x)) +#define k4pow(x,a) (powf(x,a)) +#define k4sin(x) (sinf(x)) +#define k4sinh(x) (sinhf(x)) +#define k4sqrt(x) (sqrtf(x)) +#define k4tan(x) (tanf(x)) +#define k4tanh(x) (tanhf(x)) + +#define k4sgn(x_) \ + ({ \ + CCTK_REAL x__=(x_); \ + CCTK_REAL x=x__; \ + x==(CCTK_REAL)0.0 ? (CCTK_REAL)0.0 : std::copysign((CCTK_REAL)1.0, x); \ + }) +#define k4signbit(x) (std::signbit(x)) + +#define k4l2r(x_) ({ CCTK_INT4 x__=(x_); CCTK_INT4 x=x__; *(CCTK_REAL4*)&x; }) +#define k4r2l(x_) ({ CCTK_REAL4 x__=(x_); CCTK_REAL4 x=x__; *(CCTK_INT4*)&x; }) +#define k4lfalse k4l2r(0) +#define k4ltrue k4l2r(1) +#define k4lnot(x) k4l2r(!k4r2l(x)) +#define k4land(x,y) k4l2r(k4r2l(x) && k4r2l(y)) +#define k4lor(x,y) k4l2r(k4r2l(x) || k4r2l(y)) +#define k4lxor(x,y) k4l2r(!k4r2l(x) != !k4r2l(y)) + +#define k4ifthen(x,y,z) (k4r2l(x)?(y):(z)) + +#define k4cmpeq(x,y) k4l2r((x)==(y)) +#define k4cmpne(x,y) k4l2r((x)!=(y)) +#define k4cmpgt(x,y) k4l2r((x)>(y)) +#define k4cmpge(x,y) k4l2r((x)>=(y)) +#define k4cmplt(x,y) k4l2r((x)<(y)) +#define k4cmple(x,y) k4l2r((x)<=(y)) diff --git a/src/macros/vectors-8-AVX.h b/src/macros/vectors-8-AVX.h new file mode 100644 index 0000000..6882523 --- /dev/null +++ b/src/macros/vectors-8-AVX.h @@ -0,0 +1,325 @@ +// Vectorise using Intel's or AMD's AVX + +// Use the type __m256d directly, without introducing a wrapper class +// Use macros instead of inline functions + + + +#if VECTORISE_EMULATE_AVX +# include "avxintrin_emu.h" +#else +# include +#endif +#ifdef __FMA4__ +# include +#endif + + + +#ifdef __FMA4__ +# define vec8_architecture_FMA4 "+FMA4" +#else +# define vec8_architecture_FMA4 "" +#endif +#define vec8_architecture "AVX" vec8_architecture_FMA4 " (64-bit precision)" + + + +// Vector type corresponding to CCTK_REAL +#define CCTK_REAL8_VEC __m256d + +// Number of vector elements in a CCTK_REAL_VEC +#define CCTK_REAL8_VEC_SIZE 4 + +// Integer and boolean types corresponding to this real type +#define CCTK_INTEGER8 CCTK_REAL8 +#define CCTK_BOOLEAN8 CCTK_REAL8 +#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC +#define CCTK_BOOLEAN8_VEC CCTK_REAL8_VEC + + + +union k8const_t { + unsigned long long i[4]; + double f[4]; + __m256i vi; + __m256d vf; +}; + +#define K8_ZERO 0x0000000000000000ULL +#define K8_NOTZERO 0xffffffffffffffffULL +#define K8_IMIN 0x8000000000000000ULL +#define K8_IMAX 0x7fffffffffffffffULL + + + +// Create vectors, extract vector elements + +#define vec8_set1(a) (_mm256_set1_pd(a)) +#define vec8_set(a,b,c,d) (_mm256_set_pd(d,c,b,a)) // note reversed arguments + +#define vec8_elt0(x) 
(((CCTK_REAL8 const*)&(x))[0]) +#define vec8_elt1(x) (((CCTK_REAL8 const*)&(x))[1]) +#define vec8_elt2(x) (((CCTK_REAL8 const*)&(x))[2]) +#define vec8_elt3(x) (((CCTK_REAL8 const*)&(x))[3]) +#define vec8_elt(x,d) (((CCTK_REAL8 const*)&(x))[d]) + + + +// Load and store vectors + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +#define vec8_load(p) (_mm256_load_pd(&(p))) +#define vec8_loadu(p) (_mm256_loadu_pd(&(p))) +#if ! VECTORISE_ALWAYS_USE_ALIGNED_LOADS +# define vec8_load_off1(p) vec_loadu(p) +#else +# error "VECTORISE_ALWAYS_USE_ALIGNED_LOADS not yet supported" +#endif + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS +// Implementation: Always use unaligned load +# define vec8_loadu_maybe(off,p) (vec8_loadu(p)) +# define vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p)) +#else +# define vec8_loadu_maybe(off,p_) \ + ({ \ + CCTK_REAL8 const& p__=(p_); \ + CCTK_REAL8 const& p=p__; \ + (off) % CCTK_REAL8_VEC_SIZE == 0 ? \ + vec8_load(p) : \ + vec8_load_off1(p); \ + }) +# if VECTORISE_ALIGNED_ARRAYS +// Assume all array x sizes are multiples of the vector size +# define vec8_loadu_maybe3(off1,off2,off3,p) \ + vec8_loadu_maybe(off1,p) +# else +# define vec8_loadu_maybe3(off1,off2,off3,p_) \ + ({ \ + CCTK_REAL8 const& p__=(p_); \ + CCTK_REAL8 const& p=p__; \ + ((off2) % CCTK_REAL8_VEC_SIZE != 0 or \ + (off3) % CCTK_REAL8_VEC_SIZE != 0) ? \ + vec8_loadu(p) : \ + vec8_loadu_maybe(off1,p); \ + }) +# endif +#endif + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +#define vec8_store(p,x) (_mm256_store_pd(&(p),x)) +#define vec8_storeu(p,x) (_mm256_storeu_pd(&(p),x)) +#if ! VECTORISE_STREAMING_STORES +# define vec8_store_nta(p,x) (vec8_store(p,x)) +#else +# define vec8_store_nta(p,x) (_mm256_stream_pd(&(p),x)) +#endif + +// Store a partial vector (aligned and non-temporal) +#define vec8_store_partial_prepare(i,imin_,imax_) \ + bool v8stp_all; \ + __m256i v8stp_mask; \ + ({ \ + ptrdiff_t const imin__=(imin_); \ + ptrdiff_t const imin=imin__; \ + ptrdiff_t const imax__=(imax_); \ + ptrdiff_t const imax=imax__; \ + \ + v8stp_all = i>=imin and i+CCTK_REAL_VEC_SIZE-1 +#include + +#include +#ifdef __SSE4_1__ +// Intel's SSE 4.1 +# include +#endif +#ifdef __SSE4A__ +// AMD's SSE 4a +# include + +// Intel compilers don't support SSE 4a. 
Here is how we can implement +// these instructions in assembler instead: + +// inline void __attribute__((__always_inline__)) +// _mm_stream_sd (double *p, __m128d x) +// { +// asm ("movntsd %[x],%[p]" : "=m" (*p) : [p] "m" (*p), [x] "x" (x)); +// } + +#endif +#ifdef __FMA4__ +# include +#endif + + + +#ifdef __SSE4_1__ +# define vec8_architecture_SSE4_1 "+SSE4.1" +#else +# define vec8_architecture_SSE4_1 "" +#endif +#ifdef __SSE4A__ +# define vec8_architecture_SSE4a "+SSE4A" +#else +# define vec8_architecture_SSE4a "" +#endif +#ifdef __FMA4__ +# define vec8_architecture_FMA4 "+FMA4" +#else +# define vec8_architecture_FMA4 "" +#endif +#define vec8_architecture "SSE2" vec8_architecture_SSE4_1 vec8_architecture_SSE4a vec8_architecture_FMA4 " (64-bit precision)" + + + +// Vector type corresponding to CCTK_REAL +#define CCTK_REAL8_VEC __m128d + +// Number of vector elements in a CCTK_REAL_VEC +#define CCTK_REAL8_VEC_SIZE 2 + +// Integer and boolean types corresponding to this real type +#define CCTK_INTEGER8 CCTK_REAL8 +#define CCTK_BOOLEAN8 CCTK_REAL8 +#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC +#define CCTK_BOOLEAN8_VEC CCTK_REAL8_VEC + + + +union k8const_t { + long long i[2]; + double f[2]; + __m128i vi; + __m128d vf; +}; + +#define K8_IMIN ((long long)0x8000000000000000ULL) + + + +// Create vectors, extract vector elements + +#define vec8_set1(a) (_mm_set1_pd(a)) +#define vec8_set(a,b) (_mm_set_pd(b,a)) // note reversed arguments + +// original order is 01 +#define vec8_swap10(x_) \ + ({ \ + CCTK_REAL8_VEC const x__=(x_); \ + CCTK_REAL8_VEC const x=x__; \ + _mm_shuffle_pd(x,x, _MM_SHUFFLE2(0,1)); \ + }) + +#define vec8_elt0(x) (((CCTK_REAL8 const*)&(x))[0]) +#define vec8_elt1(x) (((CCTK_REAL8 const*)&(x))[1]) +#define vec8_elt(x,d) (((CCTK_REAL8 const*)&(x))[d]) + + + +// Load and store vectors + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +#define vec8_load(p) (_mm_load_pd(&(p))) +#define vec8_loadu(p) (_mm_loadu_pd(&(p))) +#if ! VECTORISE_ALWAYS_USE_ALIGNED_LOADS +# define vec8_load_off1(p) vec_loadu(p) +#else +# define vec8_load_off1(p_) \ + ({ \ + CCTK_REAL8 const& p__=(p_); \ + CCTK_REAL8 const& p=p__; \ + _mm_shuffle_pd(vec8_load((&p)[-1]), \ + vec8_load((&p)[+1]), _MM_SHUFFLE2(0,1)); \ + }) +#endif + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS +// Implementation: Always use unaligned load +# define vec8_loadu_maybe(off,p) vec8_loadu(p) +# define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu(p) +#else +# define vec8_loadu_maybe(off,p_) \ + ({ \ + CCTK_REAL8 const& p__=(p_); \ + CCTK_REAL8 const& p=p__; \ + (off) % CCTK_REAL8_VEC_SIZE == 0 ? \ + vec8_load(p) : \ + vec8_load_off1(p); \ + }) +# if VECTORISE_ALIGNED_ARRAYS +// Assume all array x sizes are multiples of the vector size +# define vec8_loadu_maybe3(off1,off2,off3,p) \ + vec8_loadu_maybe(off1,p) +# else +# define vec8_loadu_maybe3(off1,off2,off3,p_) \ + ({ \ + CCTK_REAL8 const& p__=(p_); \ + CCTK_REAL8 const& p=p__; \ + ((off2) % CCTK_REAL8_VEC_SIZE != 0 or \ + (off3) % CCTK_REAL8_VEC_SIZE != 0) ? \ + vec8_loadu(p) : \ + vec8_loadu_maybe(off1,p); \ + }) +# endif +#endif + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +#define vec8_store(p,x) (_mm_store_pd(&(p),x)) +#define vec8_storeu(p,x) (_mm_storeu_pd(&(p),x)) +#if ! 
VECTORISE_STREAMING_STORES +# define vec8_store_nta(p,x) vec8_store(p,x) +#else +# define vec8_store_nta(p,x) (_mm_stream_pd(&(p),x)) +#endif + +// Store a partial vector (aligned and non-temporal) +#define vec8_store_partial_prepare(i,imin,imax) \ + bool const v8stp_lo = (i)>=(imin); \ + bool const v8stp_hi = (i)+CCTK_REAL_VEC_SIZE-1<(imax) +#if VECTORISE_STREAMING_STORES && defined(__SSE4A__) +# define vec8_store_nta_partial(p_,x_) \ + ({ \ + CCTK_REAL8& p__=(p_); \ + CCTK_REAL8& p=p__; \ + CCTK_REAL8_VEC const x__=(x_); \ + CCTK_REAL8_VEC const x=x__; \ + if (CCTK_BUILTIN_EXPECT(v8stp_lo and v8stp_hi, true)) { \ + vec8_store_nta(p,x); \ + } else if (v8stp_lo) { \ + _mm_stream_sd(&p,x); \ + } else if (v8stp_hi) { \ + _mm_stream_sd(&p+1, vec8_swap10(x)); \ + } \ + }) +#else +# define vec8_store_nta_partial(p_,x_) \ + ({ \ + CCTK_REAL8& p__=(p_); \ + CCTK_REAL8& p=p__; \ + CCTK_REAL8_VEC const x__=(x_); \ + CCTK_REAL8_VEC const x=x__; \ + if (CCTK_BUILTIN_EXPECT(v8stp_lo and v8stp_hi, true)) { \ + vec8_store_nta(p,x); \ + } else if (v8stp_lo) { \ + _mm_storel_pd(&p,x); \ + } else if (v8stp_hi) { \ + _mm_storeh_pd(&p+1,x); \ + } \ + }) +#endif + +// Store a lower or higher partial vector (aligned and non-temporal) +#if ! VECTORISE_STREAMING_STORES +# define vec8_store_nta_partial_lo(p,x,n) (_mm_storel_pd(&(p),x)) +# define vec8_store_nta_partial_hi(p,x,n) (_mm_storeh_pd(&(p)+1,x)) +#else +# if defined(__SSE4A__) +# define vec8_store_nta_partial_lo(p,x,n) (_mm_stream_sd(&(p),x)) +# define vec8_store_nta_partial_hi(p,x,n) \ + (_mm_stream_sd(&(p)+1, vec8_swap10(x))) +# else +// TODO: use clflush once a whole cache line has been written (cache +// lines are usually larger than the CPU vector size) +# define vec8_store_nta_partial_lo(p_,x,n) \ + ({ \ + CCTK_REAL8& p__=(p_); \ + CCTK_REAL8& p=p__; \ + _mm_storel_pd(&p,x); \ + /* _mm_clflush(&p); */ \ + }) +# define vec8_store_nta_partial_hi(p_,x,n) \ + ({ \ + CCTK_REAL8& p__=(p_); \ + CCTK_REAL8& p=p__; \ + _mm_storeh_pd(&p+1,x); \ + /* _mm_clflush(&p+1); */ \ + }) +# endif +#endif +#if 0 +// This is slower; we would need a non-temporal read +#define vec8_store_nta_partial_lo(p,x,n) \ + vec8_store_nta(p, _mm_loadh_pd(x,&(p)+1)) +#define vec8_store_nta_partial_hi(p,x,n) \ + vec8_store_nta(p, _mm_loadl_pd(x,&(p))) +#endif +#define vec8_store_nta_partial_mid(p,x,nlo,nhi) assert(0) + + + +// Functions and operators + +static const k8const_t k8sign_mask = {{ K8_IMIN, K8_IMIN, }}; + +// Operators + +// #define k8inot(x) (_mm_xor_si128(k8all_mask,x)) +// +// #define k8iand(x,y) (_mm_and_si128(x,y)) +// #define k8ior(x,y) (_mm_or_si128(x,y)) +// #define k8ixor(x,y) (_mm_xor_si128(x,y)) +// +// #define k8ineg(x) (_mm_xor_pd(k8sign_mask,x)) +// +// #define k8iadd(x,y) (_mm_add_epi64(x,y)) +// #define k8isub(x,y) (_mm_sub_epi64(x,y)) +// +// #define k8not(x) (_mm_xor_pd(k8all_mask,x)) +// +// #define k8and(x,y) (_mm_and_pd(x,y)) +// #define k8or(x,y) (_mm_or_pd(x,y)) +// #define k8xor(x,y) (_mm_xor_pd(x,y)) + +#define k8neg(x) (_mm_xor_pd(k8sign_mask.vf,x)) + +#define k8add(x,y) (_mm_add_pd(x,y)) +#define k8sub(x,y) (_mm_sub_pd(x,y)) +#define k8mul(x,y) (_mm_mul_pd(x,y)) +#define k8div(x,y) (_mm_div_pd(x,y)) + +// Fused multiply-add, defined as [+-]x*y[+-]z +#ifdef __FMA4__ +# define k8madd(x,y,z) (_mm_macc_pd(x,y,z)) +# define k8msub(x,y,z) (_mm_msub_pd(x,y,z)) +# define k8nmadd(x,y,z) (_mm_nmsub_pd(x,y,z)) +# define k8nmsub(x,y,z) (_mm_nmacc_pd(x,y,z)) +#else +# define k8madd(x,y,z) (k8add(k8mul(x,y),z)) +# define k8msub(x,y,z) (k8sub(k8mul(x,y),z)) +# 
define k8nmadd(x,y,z) (k8sub(k8neg(z),k8mul(x,y))) +# define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y))) +#endif + +// Cheap functions +#define k8copysign(x,y) \ + (_mm_or_pd(_mm_andnot_pd(k8sign_mask.vf,x), \ + _mm_and_pd(k8sign_mask.vf,y))) +#define k8fabs(x) (_mm_andnot_pd(k8sign_mask.vf,x)) +#define k8fmax(x,y) (_mm_max_pd(x,y)) +#define k8fmin(x,y) (_mm_min_pd(x,y)) +#define k8fnabs(x) (_mm_or_pd(k8sign_mask.vf,x)) +#define k8sgn(x_) \ + ({ \ + CCTK_REAL_VEC const x__=(x_); \ + CCTK_REAL_VEC const x=x__; \ + CCTK_REAL_VEC const iszero = _mm_cmpeq_pd(vec8_set1(0.0), x); \ + CCTK_REAL_VEC const sign = _mm_and_pd(k8sign_mask.vf, x); \ + CCTK_REAL_VEC const signedone = _mm_or_pd(vec8_set1(1.0), sign); \ + k8ifthen(iszero, vec8_set1(0.0), signedone); \ + }) +#define k8sqrt(x) (_mm_sqrt_pd(x)) + +// Expensive functions +#define K8REPL(f,x_) \ + ({ \ + CCTK_REAL8_VEC const x__=(x_); \ + CCTK_REAL8_VEC const x=x__; \ + vec8_set(f(vec8_elt0(x)), \ + f(vec8_elt1(x))); \ + }) +#define K8REPL2S(f,x_,a_) \ + ({ \ + CCTK_REAL8_VEC const x__=(x_); \ + CCTK_REAL8 const a__=(a_); \ + CCTK_REAL8_VEC const x=x__; \ + CCTK_REAL8 const a=a__; \ + vec8_set(f(vec8_elt0(x),a), \ + f(vec8_elt1(x),a)); \ + }) +#define K8REPL2(f,x_,y_) \ + ({ \ + CCTK_REAL8_VEC const x__=(x_); \ + CCTK_REAL8_VEC const y__=(y_); \ + CCTK_REAL8_VEC const x=x__; \ + CCTK_REAL8_VEC const y=y__; \ + vec8_set(f(vec8_elt0(x),vec8_elt0(y)), \ + f(vec8_elt1(x),vec8_elt1(y))); \ + }) + +#define k8acos(x) K8REPL(acos,x) +#define k8acosh(x) K8REPL(acosh,x) +#define k8asin(x) K8REPL(asin,x) +#define k8asinh(x) K8REPL(asinh,x) +#define k8atan(x) K8REPL(atan,x) +#define k8atan2(x,y) K8REPL2(atan2,x,y) +#define k8atanh(x) K8REPL(atanh,x) +#define k8cos(x) K8REPL(cos,x) +#define k8cosh(x) K8REPL(cosh,x) +#define k8exp(x) K8REPL(exp,x) +#define k8log(x) K8REPL(log,x) +#define k8pow(x,a) K8REPL2S(pow,x,a) +#define k8sin(x) K8REPL(sin,x) +#define k8sinh(x) K8REPL(sinh,x) +#define k8tan(x) K8REPL(tan,x) +#define k8tanh(x) K8REPL(tanh,x) + +static const k8const_t k8lfalse_ = {{ +0LL, +0LL, }}; +static const k8const_t k8ltrue_ = {{ -1LL, -1LL, }}; +#define k8lfalse (k8lfalse_.vf) +#define k8ltrue (k8ltrue_.vf) +#define k8lnot(x) (_mm_xor_pd(k8ltrue,x)) +#define k8land(x,y) (_mm_and_pd(x,y)) +#define k8lor(x,y) (_mm_or_pd(x,y)) +#define k8lxor(x,y) (_mm_xor_pd(x,y)) + +#ifdef __SSE4_1__ +# define k8ifthen(x,y,z) (_mm_blendv_pd(z,y,x)) +#elif 0 +// This is slow (but this is what Intel/PGI produce by themselves) +# define k8ifthen(x_,y_,z_) \ + ({ \ + CCTK_REAL8_VEC const x__=(x_); \ + CCTK_REAL8_VEC const y__=(y_); \ + CCTK_REAL8_VEC const z__=(z_); \ + CCTK_REAL8_VEC const x=x__; \ + CCTK_REAL8_VEC const y=y__; \ + CCTK_REAL8_VEC const z=z__; \ + int const m = _mm_movemask_pd(x); \ + CCTK_REAL8_VEC r; \ + switch (m) { \ + case 0: r = y; break; \ + case 1: r = _mm_move_sd(y,z); break; \ + case 2: r = _mm_move_sd(z,y); break; \ + case 3: r = z; break; \ + } \ + r; \ + }) +#elif 0 +# ifdef __cplusplus +# define k8signbit(x) ({ using namespace std; signbit(x); }) +# else +# define k8signbit(x) (signbit(x)) +# endif +# define k8ifthen(x_,y_,z_) \ + ({ \ + CCTK_REAL8_VEC const x__=(x_); \ + CCTK_REAL8_VEC const y__=(y_); \ + CCTK_REAL8_VEC const z__=(z_); \ + CCTK_REAL8_VEC const x=x__; \ + CCTK_REAL8_VEC const y=y__; \ + CCTK_REAL8_VEC const z=z__; \ + vec8_set(k8signbit(vec8_elt0(x)) ? vec8_elt0(y) : vec8_elt0(z), \ + k8signbit(vec8_elt1(x)) ? 
vec8_elt1(y) : vec8_elt1(z)); \ + }) +#elif 0 +// We don't need to shift -- the condition (mask) will be either all +// zeros or all ones +static const k8const_t k8ione = {{ 0x1ULL, 0x1ULL, }}; +# define k8ifthen(x_,y_,z_) \ + ({ \ + CCTK_REAL8_VEC const x__=(x_); \ + CCTK_REAL8_VEC const y__=(y_); \ + CCTK_REAL8_VEC const z__=(z_); \ + CCTK_REAL8_VEC const x=x__; \ + CCTK_REAL8_VEC const y=y__; \ + CCTK_REAL8_VEC const z=z__; \ + /* there is no _mm_srai_epi64(x, 63); we therefore calculate srli(x)-1 */ \ + __m128i const x_int = *(__m128i const*)&x; \ + __m128i const imask_int = \ + _mm_sub_epi64(_mm_srli_epi64(x_int, 63), k8ione.vi); \ + CCTK_REAL8_VEC const imask = *(CCTK_REAL8_VEC const*)&imask_int; \ + /* (z & ~mask) | (y & mask) where imask = ~mask */ \ + _mm_or_pd(_mm_and_pd(imask, z), _mm_andnot_pd(imask, y)); \ + }) +#else +# define k8ifthen(x_,y_,z_) \ + ({ \ + CCTK_REAL8_VEC const x__=(x_); \ + CCTK_REAL8_VEC const y__=(y_); \ + CCTK_REAL8_VEC const z__=(z_); \ + CCTK_REAL8_VEC const x=x__; \ + CCTK_REAL8_VEC const y=y__; \ + CCTK_REAL8_VEC const z=z__; \ + /* (z & ~mask) | (y & mask) where imask = ~mask */ \ + _mm_or_pd(_mm_and_pd(x, y), _mm_andnot_pd(x, z)); \ + }) +#endif + +#define k8cmpeq(x,y) (_mm_cmpeq_pd(x,y)) +#define k8cmpne(x,y) (_mm_cmpneq_pd(x,y)) +#define k8cmpgt(x,y) (_mm_cmpgt_pd(x,y)) +#define k8cmpge(x,y) (_mm_cmpge_pd(x,y)) +#define k8cmplt(x,y) (_mm_cmplt_pd(x,y)) +#define k8cmple(x,y) (_mm_cmple_pd(x,y)) diff --git a/src/macros/vectors-8-default.h b/src/macros/vectors-8-default.h new file mode 100644 index 0000000..7ff6c8c --- /dev/null +++ b/src/macros/vectors-8-default.h @@ -0,0 +1,132 @@ +// Fallback vectorisation implementation: Do not vectorise + +// We use macros here, so that we are not surprised by compilers which +// don't like to inline functions. This should also make debug builds +// (which may not inline) more efficient. + + + +#include +#include + + + +#define vec8_architecture "scalar (no vectorisation, 64-bit precision)" + +// Use CCTK_REAL8 +#define CCTK_REAL8_VEC CCTK_REAL8 + +// Number of vector elements in a vector +#define CCTK_REAL8_VEC_SIZE 1 + +// Integer and boolean types corresponding to this real type +#define CCTK_INTEGER8 CCTK_REAL8 +#define CCTK_BOOLEAN8 CCTK_REAL8 +#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC +#define CCTK_BOOLEAN8_VEC CCTK_REAL8_VEC + + + +// Create a vector replicating a scalar +#define vec8_set1(a) (a) +// Create a vector from N scalars +#define vec8_set(a) (a) + +// Access vectors elements +#define vec8_elt0(x) (x) +#define vec8_elt(x,d) (x) + + + +// Load an aligned vector from memory +#define vec8_load(p) (p) +// Load an unaligned vector from memory +#define vec8_loadu(p) (p) + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset and the vector size. These functions are +// useful e.g. for loading neightbouring grid points while evaluating +// finite differencing stencils. +#define vec8_loadu_maybe(off,p) (p) +#define vec8_loadu_maybe3(off1,off2,off3,p) (p) + +// Aligned store +#define vec8_store(p,x) ((p)=(x)) +// Unaligned store +#define vec8_store_nta(p,x) ((p)=(x)) + +#define vec8_store_partial_prepare(i,imin,imax) ((void)0) +#define vec8_store_nta_partial(p,x) (vec8_store_nta(p,x)) +// Store the n lower elements of a vector to memory +#define vec8_store_nta_partial_lo(p,x,n) (assert(0)) +// Store the n higher elements of a vector into memory. This stores +// the vector elements into memory locations as if element 0 were +// stored at p. 
+#define vec8_store_nta_partial_hi(p,x,n) (assert(0)) +#define vec8_store_nta_partial_mid(p,x,nlo,nhi) (assert(0)) + + + +// Operators +#define k8neg(x) (-(x)) + +#define k8add(x,y) ((x)+(y)) +#define k8sub(x,y) ((x)-(y)) +#define k8mul(x,y) ((x)*(y)) +#define k8div(x,y) ((x)/(y)) + +// Fused multiply-add, defined as [+-]x*y[+-]z +#define k8madd(x,y,z) (+(x)*(y)+(z)) +#define k8msub(x,y,z) (+(x)*(y)-(z)) +#define k8nmadd(x,y,z) (-(x)*(y)-(z)) +#define k8nmsub(x,y,z) (-(x)*(y)+(z)) + +// Functions +#define k8acos(x) (acos(x)) +#define k8acosh(x) (acosh(x)) +#define k8asin(x) (asin(x)) +#define k8asinh(x) (asinh(x)) +#define k8atan(x) (atan(x)) +#define k8atan2(x,y) (atan2(x,y)) +#define k8atanh(x) (atanh(x)) +#define k8copysign(x,y) (copysign(x,y)) +#define k8cos(x) (cos(x)) +#define k8cosh(x) (cosh(x)) +#define k8exp(x) (exp(x)) +#define k8fabs(x) (fabs(x)) +#define k8fmax(x,y) (fmax(x,y)) +#define k8fmin(x,y) (fmin(x,y)) +#define k8fnabs(x) (-fabs(x)) +#define k8log(x) (log(x)) +#define k8pow(x,a) (pow(x,a)) +#define k8sin(x) (sin(x)) +#define k8sinh(x) (sinh(x)) +#define k8sqrt(x) (sqrt(x)) +#define k8tan(x) (tan(x)) +#define k8tanh(x) (tanh(x)) + +#define k8sgn(x_) \ + ({ \ + CCTK_REAL x__=(x_); \ + CCTK_REAL x=x__; \ + x==(CCTK_REAL)0.0 ? (CCTK_REAL)0.0 : std::copysign((CCTK_REAL)1.0, x); \ + }) +#define k8signbit(x) (std::signbit(x)) + +#define k8l2r(x_) ({ CCTK_INT8 x__=(x_); CCTK_INT8 x=x__; *(CCTK_REAL8*)&x; }) +#define k8r2l(x_) ({ CCTK_REAL8 x__=(x_); CCTK_REAL8 x=x__; *(CCTK_INT8*)&x; }) +#define k8lfalse k8l2r(0) +#define k8ltrue k8l2r(1) +#define k8lnot(x) k8l2r(!k8r2l(x)) +#define k8land(x,y) k8l2r(k8r2l(x) && k8r2l(y)) +#define k8lor(x,y) k8l2r(k8r2l(x) || k8r2l(y)) +#define k8lxor(x,y) k8l2r(!k8r2l(x) != !k8r2l(y)) + +#define k8ifthen(x,y,z) (k8r2l(x)?(y):(z)) + +#define k8cmpeq(x,y) k8l2r((x)==(y)) +#define k8cmpne(x,y) k8l2r((x)!=(y)) +#define k8cmpgt(x,y) k8l2r((x)>(y)) +#define k8cmpge(x,y) k8l2r((x)>=(y)) +#define k8cmplt(x,y) k8l2r((x)<(y)) +#define k8cmple(x,y) k8l2r((x)<=(y)) diff --git a/src/vectors-4-AVX.h b/src/vectors-4-AVX.h new file mode 100644 index 0000000..641a74b --- /dev/null +++ b/src/vectors-4-AVX.h @@ -0,0 +1,659 @@ +// Vectorise using Intel's or AMD's AVX + +// Use the type __m256 directly, without introducing a wrapper class + + + +#include + + + +#include +#ifdef __FMA4__ +# include +#endif + + + +#ifdef __FMA4__ +# define vec4_architecture_FMA4 "+FMA4" +#else +# define vec4_architecture_FMA4 "" +#endif +#define vec4_architecture "AVX" vec4_architecture_FMA4 " (32-bit precision)" + + + +// Vector type corresponding to CCTK_REAL +typedef __m256 CCTK_REAL4_VEC; +typedef __m256i CCTK_INTEGER4_VEC; +typedef __m256 CCTK_BOOLEAN4_VEC; + +// Number of vector elements in a CCTK_REAL_VEC +#define CCTK_REAL4_VEC_SIZE 8 + +vec_static_assert(sizeof(CCTK_REAL4_VEC) == + sizeof(CCTK_REAL4) * CCTK_REAL4_VEC_SIZE); + +// Integer and boolean types corresponding to this real type +typedef CCTK_INT4 CCTK_INTEGER4; +typedef CCTK_REAL4 CCTK_BOOLEAN4; + + + +union k4const_t { + CCTK_INTEGER4 i[CCTK_REAL4_VEC_SIZE]; + CCTK_REAL4 f[CCTK_REAL4_VEC_SIZE]; + CCTK_INTEGER4_VEC vi; + CCTK_REAL4_VEC vf; +}; + +#define k4sign (vec4_set1i( (CCTK_INTEGER4)(1UL << 31UL))) +#define k4notsign (vec4_set1i(~ (CCTK_INTEGER4)(1UL << 31UL))) + + + +// Create vectors, extract vector elements + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_set1(CCTK_REAL4 const a) +{ + return _mm256_set1_ps(a); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC 
vec4_set1i(CCTK_INT4 const a) +{ + return _mm256_castsi256_ps(_mm256_set1_epi32(a)); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_set(CCTK_REAL4 const a, + CCTK_REAL4 const b, + CCTK_REAL4 const c, + CCTK_REAL4 const d, + CCTK_REAL4 const e, + CCTK_REAL4 const f, + CCTK_REAL4 const g, + CCTK_REAL4 const h) +{ + return _mm256_set_ps(h,g,f,e,d,c,b,a); // note reversed arguments +} + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4 vec4_elt0(CCTK_REAL4_VEC const x) +{ + return ((CCTK_REAL4 const*)&x)[0]; +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4 vec4_elt1(CCTK_REAL4_VEC const x) +{ + return ((CCTK_REAL4 const*)&x)[1]; +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4 vec4_elt2(CCTK_REAL4_VEC const x) +{ + return ((CCTK_REAL4 const*)&x)[2]; +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4 vec4_elt3(CCTK_REAL4_VEC const x) +{ + return ((CCTK_REAL4 const*)&x)[3]; +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4 vec4_elt4(CCTK_REAL4_VEC const x) +{ + return ((CCTK_REAL4 const*)&x)[4]; +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4 vec4_elt5(CCTK_REAL4_VEC const x) +{ + return ((CCTK_REAL4 const*)&x)[5]; +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4 vec4_elt6(CCTK_REAL4_VEC const x) +{ + return ((CCTK_REAL4 const*)&x)[6]; +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4 vec4_elt7(CCTK_REAL4_VEC const x) +{ + return ((CCTK_REAL4 const*)&x)[7]; +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4 vec4_elt(CCTK_REAL4_VEC const x, std::ptrdiff_t const d) +{ + return ((CCTK_REAL4 const*)&x)[d]; +} + + + +// Load and store vectors + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_load(CCTK_REAL4 const& p) +{ + return _mm256_load_ps(&p); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_loadu(CCTK_REAL4 const& p) +{ + return _mm256_loadu_ps(&p); +} +#if VECTORISE_ALWAYS_USE_ALIGNED_LOADS +# error "VECTORISE_ALWAYS_USE_ALIGNED_LOADS not yet supported" +#endif + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS +// Implementation: Always use unaligned load +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL4 const& p) +{ + return vec4_loadu(p); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_loadu_maybe3(std::ptrdiff_t const off1, + std::ptrdiff_t const off2, + std::ptrdiff_t const off3, + CCTK_REAL4 const& p) +{ + return vec4_loadu(p); +} +#else +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL4 const& p) +{ + return off % CCTK_REAL4_VEC_SIZE == 0 ? vec4_load(p) : vec4_loadu(p); +} +# if VECTORISE_ALIGNED_ARRAYS +// Assume all array x sizes are multiples of the vector size +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_loadu_maybe3(std::ptrdiff_t const off1, + std::ptrdiff_t const off2, + std::ptrdiff_t const off3, + CCTK_REAL4 const& p) +{ + return vec4_loadu_maybe(off1, p); +} +# else +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_loadu_maybe3(std::ptrdiff_t const off1, + std::ptrdiff_t const off2, + std::ptrdiff_t const off3, + CCTK_REAL4 const& p) +{ + return + off2 % CCTK_REAL4_VEC_SIZE != 0 or + off3 % CCTK_REAL4_VEC_SIZE != 0 ? 
+ vec4_loadu(p) : + vec4_loadu_maybe(off1, p); +} +# endif +#endif + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec4_store(CCTK_REAL4& p, CCTK_REAL4_VEC const x) +{ + return _mm256_store_ps(&p, x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec4_storeu(CCTK_REAL4& p, CCTK_REAL4_VEC const x) +{ + return _mm256_storeu_ps(&p, x); +} +#if ! VECTORISE_STREAMING_STORES +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec4_store_nta(CCTK_REAL4& p, CCTK_REAL4_VEC const x) +{ + return vec4_store(p, x); +} +#else +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec4_store_nta(CCTK_REAL4& p, CCTK_REAL4_VEC const x) +{ + return _mm256_stream_ps(&p, x); +} +#endif + +// Store a partial vector (aligned and non-temporal) +#define vec4_store_partial_prepare(i,imin,imax) \ + bool v4stp_all; \ + __m256i v4stp_mask; \ + vec4_store_partial_prepare_(v4stp_all, v4stp_mask, i, imin, imax); +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec4_store_partial_prepare_(bool& all, __m256i& mask, + std::ptrdiff_t const i, + std::ptrdiff_t const imin, + std::ptrdiff_t const imax) +{ + all = i>=imin and i+CCTK_REAL4_VEC_SIZE-1 +#include +#include -#include -#include #include #ifdef __SSE4_1__ @@ -18,7 +25,7 @@ # include #endif #ifdef __FMA4__ -# include +# include #endif @@ -43,98 +50,121 @@ // Vector type corresponding to CCTK_REAL -#define CCTK_REAL4_VEC __m128 +typedef __m128 CCTK_REAL4_VEC; +typedef __m128i CCTK_INTEGER4_VEC; +typedef __m128 CCTK_BOOLEAN4_VEC; // Number of vector elements in a CCTK_REAL_VEC #define CCTK_REAL4_VEC_SIZE 4 +vec_static_assert(sizeof(CCTK_REAL4_VEC) == + sizeof(CCTK_REAL4) * CCTK_REAL4_VEC_SIZE); + // Integer and boolean types corresponding to this real type -#define CCTK_INTEGER4 CCTK_REAL4 -#define CCTK_BOOLEAN4 CCTK_REAL4 -#define CCTK_INTEGER4_VEC CCTK_REAL4_VEC -#define CCTK_BOOLEAN4_VEC CCTK_REAL4_VEC +typedef CCTK_INT4 CCTK_INTEGER4; +typedef CCTK_REAL4 CCTK_BOOLEAN4; union k4const_t { - unsigned i[4]; - float f[4]; - __m128i vi; - __m128 vf; + CCTK_INTEGER4 i[CCTK_REAL4_VEC_SIZE]; + CCTK_REAL4 f[CCTK_REAL4_VEC_SIZE]; + CCTK_INTEGER4_VEC vi; + CCTK_REAL4_VEC vf; }; -#define K4_ZERO 0x00000000UL -#define K4_IMIN 0x80000000UL -#define K4_IMAX 0x7fffffffUL +#define k4sign (vec4_set1i( (CCTK_INTEGER4)(1UL << 31UL))) +#define k4notsign (vec4_set1i(~ (CCTK_INTEGER4)(1UL << 31UL))) // Create vectors, extract vector elements -#define vec4_set1(a) (_mm_set1_ps(a)) -#define vec4_set(a,b,c,d) (_mm_set_ps(d,c,b,a)) // note reversed arguments +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_set1(CCTK_REAL4 const a) +{ + return _mm_set1_ps(a); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_set1i(CCTK_INT4 const a) +{ + return _mm_castsi128_ps(_mm_set1_epi32(a)); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_set(CCTK_REAL4 const a, + CCTK_REAL4 const b, + CCTK_REAL4 const c, + CCTK_REAL4 const d) +{ + return _mm_set_ps(d,c,b,a); // note reversed arguments +} // original order is 0123 -#define vec4_swap1032(x_) \ - ({ \ - CCTK_REAL4_VEC const x__=(x_); \ - CCTK_REAL4_VEC const x=x__; \ - _mm_shuffle_ps(x,x, _MM_SHUFFLE(2,3,0,1)); \ - }) -#define vec4_swap2301(x_) \ - ({ \ - CCTK_REAL4_VEC const x__=(x_); \ - CCTK_REAL4_VEC const x=x__; \ - _mm_shuffle_ps(x,x, _MM_SHUFFLE(1,0,3,2)); \ - }) -#define vec4_swap3210(x_) \ - ({ \ - CCTK_REAL4_VEC const x__=(x_); \ - CCTK_REAL4_VEC const x=x__; \ - _mm_shuffle_ps(x,x, 
_MM_SHUFFLE(0,1,2,3)); \ - }) - -#if defined(__PGI) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_swap1032(CCTK_REAL4_VEC const x) +{ + return _mm_shuffle_ps(x, x, _MM_SHUFFLE(2,3,0,1)); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_swap2301(CCTK_REAL4_VEC const x) +{ + return _mm_shuffle_ps(x, x, _MM_SHUFFLE(1,0,3,2)); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_swap3210(CCTK_REAL4_VEC const x) +{ + return _mm_shuffle_ps(x, x, _MM_SHUFFLE(0,1,2,3)); +} + +#if defined __PGI // _mm_cvtss_f32 does not exist on PGI compilers -# define vec4_elt0(x) \ - ({ \ - CCTK_REAL4 a; \ - asm ("" : "=x" (a) : "0" (x)); \ - a; \ - }) -#else -# define vec4_elt0(x) (_mm_cvtss_f32(x)) // this is a no-op +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4 _mm_cvtss_f32(CCTK_REAL4_VEC const x) +{ + CCTK_REAL4 a; + asm ("" : "=x" (a) : "0" (x)); + return a; +} #endif -#define vec4_elt1(x) vec4_elt0(vec4_swap1032(x)) -#define vec4_elt2(x) vec4_elt0(vec4_swap2301(x)) -#define vec4_elt3(x) vec4_elt0(vec4_swap3210(x)) -#if defined(__PGI) -# define vec4_elt(x_,d) \ - ({ \ - CCTK_REAL4_VEC const x__=(x_); \ - CCTK_REAL4_VEC const x=x__; \ - CCTK_REAL4 a; \ - if (d==0) a=vec4_elt0(x); \ - else if (d==1) a=vec4_elt1(x); \ - else if (d==2) a=vec4_elt2(x); \ - else if (d==3) a=vec4_elt3(x); \ - a; \ - }) + +// TODO: Why not ((CCTK_REAL4 const*)&x)[d] ? +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4 vec4_elt0(CCTK_REAL4_VEC const x) +{ + return _mm_cvtss_f32(x); // this is a no-op +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4 vec4_elt1(CCTK_REAL4_VEC const x) +{ + return vec4_elt0(vec4_swap1032(x)); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4 vec4_elt2(CCTK_REAL4_VEC const x) +{ + return vec4_elt0(vec4_swap2301(x)); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4 vec4_elt3(CCTK_REAL4_VEC const x) +{ + return vec4_elt0(vec4_swap3210(x)); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4 vec4_elt(CCTK_REAL4_VEC const x, std::ptrdiff_t const d) +{ +#if defined __PGI + if (d==0) return vec4_elt0(x); + if (d==1) return vec4_elt1(x); + if (d==2) return vec4_elt2(x); + return vec4_elt3(x); #else -# define vec4_elt(x_,d) \ - ({ \ - CCTK_REAL4_VEC const x__=(x_); \ - CCTK_REAL4_VEC const x=x__; \ - CCTK_REAL4 a; \ - switch (d) { \ - case 0: a=vec4_elt0(x); break; \ - case 1: a=vec4_elt1(x); break; \ - case 2: a=vec4_elt2(x); break; \ - case 3: a=vec4_elt3(x); break; \ - } \ - a; \ - }) + switch (d) { + case 0: return vec4_elt0(x); + case 1: return vec4_elt1(x); + case 2: return vec4_elt2(x); + } + return vec4_elt3(x); #endif +} @@ -142,318 +172,540 @@ union k4const_t { // Load a vector from memory (aligned and unaligned); this loads from // a reference to a scalar -#define vec4_load(p) (_mm_load_ps(&(p))) -#define vec4_loadu(p) (_mm_loadu_ps(&(p))) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_load(CCTK_REAL4 const& p) +{ + return _mm_load_ps(&p); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_loadu(CCTK_REAL4 const& p) +{ + return _mm_loadu_ps(&p); +} #if ! 
VECTORISE_ALWAYS_USE_ALIGNED_LOADS -# define vec4_load_off1(p) vec_loadu(p) -# define vec4_load_off2(p) vec_loadu(p) -# define vec4_load_off3(p) vec_loadu(p) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_load_off1(CCTK_REAL4 const& p) +{ + return vec4_loadu(p); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_load_off2(CCTK_REAL4 const& p) +{ + return vec4_loadu(p); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_load_off3(CCTK_REAL4 const& p) +{ + return vec4_loadu(p); +} #else -# define vec4_load_off1(p_) \ - ({ \ - CCTK_REAL4 const& p__=(p_); \ - CCTK_REAL4 const& p=p__; \ - CCTK_REAL4_VEC const lo=vec4_load((&p)[-1]); \ - CCTK_REAL4_VEC const hi=vec4_load((&p)[+3]); \ - assert(0); \ - CCTK_REAL4_VEC const hi2=_mm_shuffle_ps(lo,hi, _MM_SHUFFLE(0,1,2,3)); \ - _mm_shuffle_ps(lo,hi2, _MM_SHUFFLE(2,1,3,0)); \ - }) -# define vec4_load_off2(p_) \ - ({ \ - CCTK_REAL4 const& p__=(p_); \ - CCTK_REAL4 const& p=p__; \ - CCTK_REAL4_VEC const lo=vec4_load((&p)[-2]); \ - CCTK_REAL4_VEC const hi=vec4_load((&p)[+2]); \ - _mm_shuffle_ps(lo,hi, _MM_SHUFFLE(1,0,3,2)); \ - }) -# define vec4_load_off1(p_) \ - ({ \ - CCTK_REAL4 const& p__=(p_); \ - CCTK_REAL4 const& p=p__; \ - CCTK_REAL4_VEC const lo=vec4_load((&p)[-1]); \ - CCTK_REAL4_VEC const hi=vec4_load((&p)[+3]); \ - assert(0); \ - CCTK_REAL4_VEC const lo2=_mm_shuffle_ps(lo,hi, _MM_SHUFFLE(0,1,2,3)); \ - _mm_shuffle_ps(lo2,hi, _MM_SHUFFLE(3,0,2,1)); \ - }) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_load_off1(CCTK_REAL4 const& p) +{ + CCTK_REAL4_VEC const lo = vec4_load((&p)[-1]); + CCTK_REAL4_VEC const hi = vec4_load((&p)[+3]); + CCTK_REAL4_VEC const hi2 = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(0,1,2,3)); + return _mm_shuffle_ps(lo, hi2, _MM_SHUFFLE(2,1,3,0)); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_load_off2(CCTK_REAL4 const& p) +{ + CCTK_REAL4_VEC const lo = vec4_load((&p)[-2]); + CCTK_REAL4_VEC const hi = vec4_load((&p)[+2]); + return _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(1,0,3,2)); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_load_off3(CCTK_REAL4 const& p) +{ + CCTK_REAL4_VEC const lo = vec4_load((&p)[-1]); + CCTK_REAL4_VEC const hi = vec4_load((&p)[+3]); + CCTK_REAL4_VEC const lo2 = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(0,1,2,3)); + return _mm_shuffle_ps(lo2, hi, _MM_SHUFFLE(3,0,2,1)); +} #endif // Load a vector from memory that may or may not be aligned, as // decided by the offset off and the vector size #if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS // Implementation: Always use unaligned load -# define vec4_loadu_maybe(off,p) vec4_loadu(p) -# define vec4_loadu_maybe3(off1,off2,off3,p) vec4_loadu(p) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL4 const& p) +{ + return vec4_loadu(p); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_loadu_maybe3(std::ptrdiff_t const off1, + std::ptrdiff_t const off2, + std::ptrdiff_t const off3, + CCTK_REAL4 const& p) +{ + return vec4_loadu(p); +} #else -# define vec4_loadu_maybe(off,p_) \ - ({ \ - CCTK_REAL4 const& p__=(p_); \ - CCTK_REAL4 const& p=p__; \ - (off) % CCTK_REAL4_VEC_SIZE == 0 ? \ - vec4_load(p) : \ - vec4_loadu(p); \ - }) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL4 const& p) +{ + // The :? operator probably breaks with the Intel compiler + //return off % CCTK_REAL4_VEC_SIZE == 0 ? 
vec4_load(p) : vec4_loadu(p); + if (off % CCTK_REAL4_VEC_SIZE == 0) return vec4_load(p); + return vec4_loadu(p); +} # if VECTORISE_ALIGNED_ARRAYS // Assume all array x sizes are multiples of the vector size -# define vec4_loadu_maybe3(off1,off2,off3,p) \ - vec4_loadu_maybe(off1,p) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_loadu_maybe3(std::ptrdiff_t const off1, + std::ptrdiff_t const off2, + std::ptrdiff_t const off3, + CCTK_REAL4 const& p) +{ + return vec4_loadu_maybe(off1, p); +} # else -# define vec4_loadu_maybe3(off1,off2,off3,p) \ - vec4_loadu_maybe((off1)|(off2)|(off3),p) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC vec4_loadu_maybe3(std::ptrdiff_t const off1, + std::ptrdiff_t const off2, + std::ptrdiff_t const off3, + CCTK_REAL4 const& p) +{ + return + off2 % CCTK_REAL4_VEC_SIZE != 0 or + off3 % CCTK_REAL4_VEC_SIZE != 0 ? + vec4_loadu(p) : + vec4_loadu_maybe(off1, p); +} # endif #endif // Store a vector to memory (aligned and non-temporal); this stores to // a reference to a scalar -#define vec4_store(p,x) (_mm_store_ps(&(p),x)) -#define vec4_storeu(p,x) (_mm_storeu_ps(&(p),x)) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec4_store(CCTK_REAL4& p, CCTK_REAL4_VEC const x) +{ + return _mm_store_ps(&p, x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec4_storeu(CCTK_REAL4& p, CCTK_REAL4_VEC const x) +{ + return _mm_storeu_ps(&p, x); +} #if ! VECTORISE_STREAMING_STORES -# define vec4_store_nta(p,x) vec4_store(p,x) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec4_store_nta(CCTK_REAL4& p, CCTK_REAL4_VEC const x) +{ + return vec4_store(p, x); +} #else -# define vec4_store_nta(p,x) (_mm_stream_ps(&(p),x)) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec4_store_nta(CCTK_REAL4& p, CCTK_REAL4_VEC const x) +{ + return _mm_stream_ps(&p, x); +} #endif // Store a partial vector (aligned and non-temporal) -#define vec4_store_partial_prepare(i,imin,imax) \ - int v4stp_lo_skip = (imin)-(i); \ - int v4stp_hi_skip = (i)+CCTK_REAL_VEC_SIZE-(imax); \ - if (CCTK_BUILTIN_EXPECT(v4stp_lo_skip < 0, true)) v4stp_lo_skip = 0; \ - if (CCTK_BUILTIN_EXPECT(v4stp_hi_skip < 0, true)) v4stp_hi_skip = 0; -// Ignoring VECTORISE_STREAMING_STORES for partial stores -#define vec4_store_nta_partial(p_,x_) \ - ({ \ - CCTK_REAL4& p__=(p_); \ - CCTK_REAL4& p=p__; \ - CCTK_REAL4_VEC const x__=(x_); \ - CCTK_REAL4_VEC const x=x__; \ - if (CCTK_BUILTIN_EXPECT(v4stp_lo_skip==0 and v4stp_hi_skip==0, true)) { \ - vec4_store_nta(p,x); \ - } else { \ - /* these cases fall through */ \ - switch (v4stp_lo_skip) { \ - case 0: \ - (&p)[0] = vec4_elt0(x); \ - case 1: \ - if (v4stp_hi_skip>=3) break; \ - (&p)[1] = vec4_elt1(x); \ - case 2: \ - if (v4stp_hi_skip>=2) break; \ - (&p)[2] = vec4_elt2(x); \ - case 3: \ - if (v4stp_hi_skip>=1) break; \ - (&p)[3] = vec4_elt3(x); \ - } \ - } \ - }) - -// Ignoring VECTORISE_STREAMING_STORES for partial stores -#define vec4_store_nta_partial_lo(p_,x_,n) \ - ({ \ - CCTK_REAL4 & p__=(p_); \ - CCTK_REAL4_VEC const x__=(x_); \ - CCTK_REAL4 & p=p__; \ - CCTK_REAL4_VEC const x=x__; \ - /* these cases fall through */ \ - switch (n) { \ - case 3: (&p)[2] = vec4_elt2(x); \ - case 2: (&p)[1] = vec4_elt1(x); \ - case 1: (&p)[0] = vec4_elt0(x); \ - } \ - }) -#define vec4_store_nta_partial_hi(p_,x_,n) \ - ({ \ - CCTK_REAL4 & p__=(p_); \ - CCTK_REAL4_VEC const x__=(x_); \ - CCTK_REAL4 & p=p__; \ - CCTK_REAL4_VEC const x=x__; \ - /* these cases fall through */ \ - switch (n) { \ - case 3: (&p)[1]=vec4_elt1(x); \ - case 2: (&p)[2]=vec4_elt2(x); \ - case 
1: (&p)[3]=vec4_elt3(x); \
-  } \
-  })
-#define vec4_store_nta_partial_mid(p_,x_,nlo,nhi) \
-  ({ \
-    CCTK_REAL4 & p__=(p_); \
-    CCTK_REAL4_VEC const x__=(x_); \
-    CCTK_REAL4 & p=p__; \
-    CCTK_REAL4_VEC const x=x__; \
-    /* these cases fall through */ \
-    switch (nhi) { \
-    case 3: if (nlo<2) break; (&p)[1] = vec4_elt1(x); \
-    case 2: if (nlo<3) break; (&p)[2] = vec4_elt2(x); \
-    } \
-  })
+// We ignore VECTORISE_STREAMING_STORES for partial stores
+#define vec4_store_partial_prepare(i, imin, imax) \
+  std::ptrdiff_t v4stp_lo_skip, v4stp_hi_skip; \
+  vec4_store_partial_prepare_(v4stp_lo_skip, v4stp_hi_skip, i, imin, imax)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_partial_prepare_(std::ptrdiff_t& lo_skip,
+                                 std::ptrdiff_t& hi_skip,
+                                 std::ptrdiff_t const i,
+                                 std::ptrdiff_t const imin,
+                                 std::ptrdiff_t const imax)
+{
+  lo_skip = std::max(std::ptrdiff_t(0), imin - i);
+  hi_skip = std::max(std::ptrdiff_t(0), i+CCTK_REAL4_VEC_SIZE - imax);
+}
+#define vec4_store_nta_partial(p, x) \
+  vec4_store_nta_partial_(v4stp_lo_skip, v4stp_hi_skip, p, x)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta_partial_(std::ptrdiff_t const lo_skip,
+                             std::ptrdiff_t const hi_skip,
+                             CCTK_REAL4& p,
+                             CCTK_REAL4_VEC const x)
+{
+  if (CCTK_BUILTIN_EXPECT(lo_skip==0 and hi_skip==0, true)) {
+    vec4_store_nta(p, x);
+  } else {
+    // these cases fall through
+    switch (lo_skip) {
+    case 0:
+      (&p)[0] = vec4_elt0(x);
+    case 1:
+      if (hi_skip>=3) break;
+      (&p)[1] = vec4_elt1(x);
+    case 2:
+      if (hi_skip>=2) break;
+      (&p)[2] = vec4_elt2(x);
+    case 3:
+      if (hi_skip>=1) break;
+      (&p)[3] = vec4_elt3(x);
+    }
+  }
+}
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta_partial_lo(CCTK_REAL4& p,
+                               CCTK_REAL4_VEC const x,
+                               std::ptrdiff_t const n)
+{
+  // these cases fall through
+  switch (n) {
+  case 3: (&p)[2] = vec4_elt2(x);
+  case 2: (&p)[1] = vec4_elt1(x);
+  case 1: (&p)[0] = vec4_elt0(x);
+  }
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta_partial_hi(CCTK_REAL4& p,
+                               CCTK_REAL4_VEC const x,
+                               std::ptrdiff_t const n)
+{
+  // these cases fall through
+  switch (n) {
+  case 3: (&p)[1]=vec4_elt1(x);
+  case 2: (&p)[2]=vec4_elt2(x);
+  case 1: (&p)[3]=vec4_elt3(x);
+  }
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta_partial_mid(CCTK_REAL4& p,
+                                CCTK_REAL4_VEC const x,
+                                std::ptrdiff_t const nlo,
+                                std::ptrdiff_t const nhi)
+{
+  // these cases fall through
+  switch (nhi) {
+  case 3:
+    if (nlo<2) break;
+    (&p)[1] = vec4_elt1(x);
+  case 2:
+    if (nlo<3) break;
+    (&p)[2] = vec4_elt2(x);
+  }
+}
 // Functions and operators
-static const k4const_t k4sign_mask = {{ K4_IMIN, K4_IMIN, K4_IMIN, K4_IMIN, }};
-
 // Operators
-#define k4neg(x) (_mm_xor_ps(k4sign_mask.vf,x))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4neg(CCTK_REAL4_VEC const x)
+{
+  return _mm_xor_ps(k4sign, x);
+}
 // #define k4inv(x)
 // TODO: provide k4inv via rcp and Newton-Raphson
 // This is described in AMD's publication 47414.
-// This should apply for AVX as well.
-
-#define k4add(x,y) (_mm_add_ps(x,y))
-#define k4sub(x,y) (_mm_sub_ps(x,y))
-#define k4mul(x,y) (_mm_mul_ps(x,y))
-// TODO: use k4inv and k4mul instead
-#define k4div(x,y) (_mm_div_ps(x,y))
+// This should apply to AVX as well.
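The same TODO recurs in this inline-function version of the file; a corresponding sketch in the new static-inline style, again illustrative only and not part of the patch. The name k4inv comes from the TODO; raw intrinsics are used because k4mul and k4sub are defined further down, and the single Newton-Raphson step is an assumption about the required accuracy.

// Illustrative sketch (not enabled): reciprocal via rcp plus one
// Newton-Raphson step; k4div could then be expressed as k4mul(x, k4inv(y))
static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
CCTK_REAL4_VEC k4inv(CCTK_REAL4_VEC const a)
{
  CCTK_REAL4_VEC const x0 = _mm_rcp_ps(a);
  // one refinement step: x1 = x0*(2 - a*x0)
  return _mm_mul_ps(x0, _mm_sub_ps(_mm_set1_ps(2.0f), _mm_mul_ps(a, x0)));
}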
+ +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4add(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) +{ + return _mm_add_ps(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4sub(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) +{ + return _mm_sub_ps(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4mul(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) +{ + return _mm_mul_ps(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4div(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) +{ + // TODO: maybe use k4inv and k4mul instead + return _mm_div_ps(x, y); +} // Fused multiply-add, defined as [+-]x*y[+-]z #ifdef __FMA4__ -# define k4madd(x,y,z) (_mm_macc_ps(x,y,z)) -# define k4msub(x,y,z) (_mm_msub_ps(x,y,z)) -# define k4nmadd(x,y,z) (_mm_nmsub_ps(x,y,z)) -# define k4nmsub(x,y,z) (_mm_nmacc_ps(x,y,z)) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4madd(CCTK_REAL4_VEC const x, + CCTK_REAL4_VEC const y, + CCTK_REAL4_VEC const z) +{ + return _mm_macc_ps(x, y, z); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4msub(CCTK_REAL4_VEC const x, + CCTK_REAL4_VEC const y, + CCTK_REAL4_VEC const z) +{ + return _mm_msub_ps(x, y, z); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4nmadd(CCTK_REAL4_VEC const x, + CCTK_REAL4_VEC const y, + CCTK_REAL4_VEC const z) +{ + return _mm_nmsub_ps(x, y, z); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4nmsub(CCTK_REAL4_VEC const x, + CCTK_REAL4_VEC const y, + CCTK_REAL4_VEC const z) +{ + return _mm_nmacc_ps(x, y, z); +} #else -# define k4madd(x,y,z) (k4add(k4mul(x,y),z)) -# define k4msub(x,y,z) (k4sub(k4mul(x,y),z)) -# define k4nmadd(x,y,z) (k4sub(k4neg(z),k4mul(x,y))) -# define k4nmsub(x,y,z) (k4sub(z,k4mul(x,y))) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4madd(CCTK_REAL4_VEC const x, + CCTK_REAL4_VEC const y, + CCTK_REAL4_VEC const z) +{ + return k4add(k4mul(x, y), z); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4msub(CCTK_REAL4_VEC const x, + CCTK_REAL4_VEC const y, + CCTK_REAL4_VEC const z) +{ + return k4sub(k4mul(x, y), z); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4nmadd(CCTK_REAL4_VEC const x, + CCTK_REAL4_VEC const y, + CCTK_REAL4_VEC const z) +{ + return k4sub(k4neg(z), k4mul(x, y)); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4nmsub(CCTK_REAL4_VEC const x, + CCTK_REAL4_VEC const y, + CCTK_REAL4_VEC const z) +{ + return k4sub(z, k4mul(x, y)); +} #endif // Cheap functions -#define k4copysign(x,y) \ - (_mm_or_ps(_mm_andnot_ps(k4sign_mask.vf,x), \ - _mm_and_ps(k4sign_mask.vf,y))) -#define k4fabs(x) (_mm_andnot_ps(k4sign_mask.vf,x)) -#define k4fmax(x,y) (_mm_max_ps(x,y)) -#define k4fmin(x,y) (_mm_min_ps(x,y)) -#define k4fnabs(x) (_mm_or_ps(k4sign_mask.vf,x)) -static const k4const_t k4zero = { f: { 0.0f, 0.0f, 0.0f, 0.0f, }}; -static const k4const_t k4one = { f: { 1.0f, 1.0f, 1.0f, 1.0f, }}; -#define k4sgn(x_) \ - ({ \ - CCTK_REAL_VEC const x__=(x_); \ - CCTK_REAL_VEC const x=x__; \ - CCTK_REAL_VEC const iszero = _mm_cmpeq_ps(k4zero.vf, x); \ - CCTK_REAL_VEC const sign = _mm_and_ps(k4sign_mask.vf, x); \ - CCTK_REAL_VEC const signedone = _mm_or_ps(k4one.vf, sign); \ - k4ifthen(iszero, k4zero.vf, signedone); \ - }) -// TODO: maybe use rsqrt and Newton-Raphson -#define k4sqrt(x) (_mm_sqrt_ps(x)) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4copysign(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) +{ + return 
_mm_or_ps(_mm_and_ps(k4notsign, x), + _mm_and_ps(k4sign , y)); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4fabs(CCTK_REAL4_VEC const x) +{ + return _mm_and_ps(k4notsign, x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4fmax(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) +{ + return _mm_max_ps(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4fmin(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) +{ + return _mm_min_ps(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4fnabs(CCTK_REAL4_VEC const x) +{ + return _mm_or_ps(k4sign, x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4sqrt(CCTK_REAL4_VEC const x) +{ + // TODO: maybe use rsqrt and Newton-Raphson + return _mm_sqrt_ps(x); +} // Expensive functions -#define K4REPL(f,x_) \ - ({ \ - CCTK_REAL4_VEC const x__=(x_); \ - CCTK_REAL4_VEC const x=x__; \ - vec4_set(f(vec4_elt0(x)), \ - f(vec4_elt1(x)), \ - f(vec4_elt2(x)), \ - f(vec4_elt3(x))); \ - }) -#define K4REPL2S(f,x_,a_) \ - ({ \ - CCTK_REAL4_VEC const x__=(x_); \ - CCTK_REAL4 const a__=(a_); \ - CCTK_REAL4_VEC const x=x__; \ - CCTK_REAL4 const a=a__; \ - vec4_set(f(vec4_elt0(x),a), \ - f(vec4_elt1(x),a), \ - f(vec4_elt2(x),a), \ - f(vec4_elt3(x),a)); \ - }) -#define K4REPL2(f,x_,y_) \ - ({ \ - CCTK_REAL4_VEC const x__=(x_); \ - CCTK_REAL4_VEC const y__=(y_); \ - CCTK_REAL4_VEC const x=x__; \ - CCTK_REAL4_VEC const y=y__; \ - vec4_set(f(vec4_elt0(x),vec4_elt0(y)), \ - f(vec4_elt1(x),vec4_elt1(y)), \ - f(vec4_elt2(x),vec4_elt2(y)), \ - f(vec4_elt3(x),vec4_elt3(y))); \ - }) - -#define k4acos(x) K4REPL(acosf,x) -#define k4acosh(x) K4REPL(acoshf,x) -#define k4asin(x) K4REPL(asinf,x) -#define k4asinh(x) K4REPL(asinhf,x) -#define k4atan(x) K4REPL(atanf,x) -#define k4atan2(x,y) K4REPL2(atan2f,x,y) -#define k4atanh(x) K4REPL(atanhf,x) -#define k4cos(x) K4REPL(cosf,x) -#define k4cosh(x) K4REPL(coshf,x) -#define k4exp(x) K4REPL(expf,x) -#define k4log(x) K4REPL(logf,x) -#define k4pow(x,a) K4REPL2S(powf,x,a) -#define k4sin(x) K4REPL(sinf,x) -#define k4sinh(x) K4REPL(sinhf,x) -#define k4tan(x) K4REPL(tanf,x) -#define k4tanh(x) K4REPL(tanhf,x) - -static const k4const_t k4lfalse_ = {{ 0U, 0U, 0U, 0U, }}; -static const k4const_t k4ltrue_ = {{ ~0U, ~0U, ~0U, ~0U, }}; -#define k4lfalse (k4lfalse_.vf) -#define k4ltrue (k4ltrue_.vf) -#define k4lnot(x) (_mm_xor_ps(k4ltrue,x)) -#define k4land(x,y) (_mm_and_ps(x,y)) -#define k4lor(x,y) (_mm_or_ps(x,y)) -#define k4lxor(x,y) (_mm_xor_ps(x,y)) - +#define K4REPL(f,x) \ + vec4_set(f(vec4_elt0(x)), \ + f(vec4_elt1(x)), \ + f(vec4_elt2(x)), \ + f(vec4_elt3(x))); +#define K4REPL2S(f,x,a) \ + vec4_set(f(vec4_elt0(x),a), \ + f(vec4_elt1(x),a), \ + f(vec4_elt2(x),a), \ + f(vec4_elt3(x),a)); +#define K4REPL2(f,x,y) \ + vec4_set(f(vec4_elt0(x),vec4_elt0(y)), \ + f(vec4_elt1(x),vec4_elt1(y)), \ + f(vec4_elt2(x),vec4_elt2(y)), \ + f(vec4_elt3(x),vec4_elt3(y))); + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4acos(CCTK_REAL4_VEC const x) +{ + return K4REPL(acos,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4acosh(CCTK_REAL4_VEC const x) +{ + return K4REPL(acosh,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4asin(CCTK_REAL4_VEC const x) +{ + return K4REPL(asin,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4asinh(CCTK_REAL4_VEC const x) +{ + return K4REPL(asinh,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4atan(CCTK_REAL4_VEC const x) +{ + return K4REPL(atan,x); +} +static 
inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4atan2(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) +{ + return K4REPL2(atan2,x,y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4atanh(CCTK_REAL4_VEC const x) +{ + return K4REPL(atanh,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4cos(CCTK_REAL4_VEC const x) +{ + return K4REPL(cos,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4cosh(CCTK_REAL4_VEC const x) +{ + return K4REPL(cosh,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4exp(CCTK_REAL4_VEC const x) +{ + return K4REPL(exp,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4log(CCTK_REAL4_VEC const x) +{ + return K4REPL(log,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4pow(CCTK_REAL4_VEC const x, CCTK_REAL4 const a) +{ + return K4REPL2S(pow,x,a); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4sin(CCTK_REAL4_VEC const x) +{ + return K4REPL(sin,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4sinh(CCTK_REAL4_VEC const x) +{ + return K4REPL(sinh,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4tan(CCTK_REAL4_VEC const x) +{ + return K4REPL(tan,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4tanh(CCTK_REAL4_VEC const x) +{ + return K4REPL(tanh,x); +} + + + +#define k4lfalse (vec4_set1i( 0)) +#define k4ltrue (vec4_set1i(~0)) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN4_VEC k4lnot(CCTK_BOOLEAN4_VEC const x) +{ + return _mm_xor_ps(k4ltrue, x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN4_VEC k4land(CCTK_BOOLEAN4_VEC const x, CCTK_BOOLEAN4_VEC const y) +{ + return _mm_and_ps(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN4_VEC k4lor(CCTK_BOOLEAN4_VEC const x, CCTK_BOOLEAN4_VEC const y) +{ + return _mm_or_ps(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN4_VEC k4lxor(CCTK_BOOLEAN4_VEC const x, CCTK_BOOLEAN4_VEC const y) +{ + return _mm_xor_ps(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4ifthen(CCTK_BOOLEAN4_VEC const x, + CCTK_REAL4_VEC const y, + CCTK_REAL4_VEC const z) +{ #ifdef __SSE4_1__ -# define k4ifthen(x,y,z) (_mm_blendv_ps(z,y,x)) + return _mm_blendv_ps(z,y,x); #elif 0 -# ifdef __cplusplus -# define k4signbit(x) ({ using namespace std; signbit(x); }) -# else -# define k4signbit(x) (signbitf(x)) -# endif -# define k4ifthen(x,y,z) \ - ({ \ - CCTK_REAL4_VEC const x__=(x_); \ - CCTK_REAL4_VEC const y__=(y_); \ - CCTK_REAL4_VEC const z__=(z_); \ - CCTK_REAL4_VEC const x=x__; \ - CCTK_REAL4_VEC const y=y__; \ - CCTK_REAL4_VEC const z=z__; \ - vec4_set(k4signbit(vec4_elt0(x)) ? vec4_elt0(y) : vec4_elt0(z), \ - k4signbit(vec4_elt1(x)) ? vec4_elt1(y) : vec4_elt1(z), \ - k4signbit(vec4_elt2(x)) ? vec4_elt2(y) : vec4_elt2(z), \ - k4signbit(vec4_elt3(x)) ? vec4_elt3(y) : vec4_elt3(z)); \ - }) + return vec4_set(std::signbit(vec4_elt0(x)) ? vec4_elt0(y) : vec4_elt0(z), + std::signbit(vec4_elt1(x)) ? vec4_elt1(y) : vec4_elt1(z), + std::signbit(vec4_elt2(x)) ? vec4_elt2(y) : vec4_elt2(z), + std::signbit(vec4_elt3(x)) ? 
vec4_elt3(y) : vec4_elt3(z)); #elif 0 -// We don't need to shift -- the condition (mask) will be either all -// zeros or all ones -# define k4ifthen(x_,y_,z_) \ - ({ \ - CCTK_REAL4_VEC const x__=(x_); \ - CCTK_REAL4_VEC const y__=(y_); \ - CCTK_REAL4_VEC const z__=(z_); \ - CCTK_REAL4_VEC const x=x__; \ - CCTK_REAL4_VEC const y=y__; \ - CCTK_REAL4_VEC const z=z__; \ - CCTK_REAL4_VEC const mask = \ - (__m128)_mm_srai_epi32((__m128i)x, 31); \ - /* (z & ~mask) | (y & mask) */ \ - _mm_or_ps(_mm_andnot_ps(mask, z), _mm_and_ps(mask, y)); \ - }) + // We don't need to shift -- the condition (mask) will be either all + // zeros or all ones + CCTK_REAL4_VEC const mask = (__m128)_mm_srai_epi32((__m128i)x, 31); + // (z & ~mask) | (y & mask) + return _mm_or_ps(_mm_andnot_ps(mask, z), _mm_and_ps(mask, y)); #else -# define k4ifthen(x_,y_,z_) \ - ({ \ - CCTK_REAL4_VEC const x__=(x_); \ - CCTK_REAL4_VEC const y__=(y_); \ - CCTK_REAL4_VEC const z__=(z_); \ - CCTK_REAL4_VEC const x=x__; \ - CCTK_REAL4_VEC const y=y__; \ - CCTK_REAL4_VEC const z=z__; \ - /* (z & ~mask) | (y & mask) where imask = ~mask */ \ - _mm_or_ps(_mm_and_ps(x, y), _mm_andnot_ps(x, z)); \ - }) + // This assumes that all logical operations always return either + // lfalse or ltrue, and nothing "in between" + // (z & ~mask) | (y & mask) where imask = ~mask + return _mm_or_ps(_mm_and_ps(x, y), _mm_andnot_ps(x, z)); #endif +} + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN4_VEC k4cmpeq(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) +{ + return _mm_cmpeq_ps(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN4_VEC k4cmpne(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) +{ + return _mm_cmpneq_ps(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN4_VEC k4cmpgt(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) +{ + return _mm_cmpgt_ps(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN4_VEC k4cmpge(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) +{ + return _mm_cmpge_ps(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN4_VEC k4cmplt(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) +{ + return _mm_cmplt_ps(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN4_VEC k4cmple(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) +{ + return _mm_cmple_ps(x, y); +} + + + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4sgn(CCTK_REAL4_VEC const x) +{ + CCTK_BOOLEAN4_VEC const iszero = k4cmpeq(x, vec4_set1(0.0)); + CCTK_REAL4_VEC const sign = _mm_and_ps(k4sign, x); + CCTK_REAL4_VEC const signedone = _mm_or_ps(sign, vec4_set1(1.0)); + return k4ifthen(iszero, vec4_set1(0.0), signedone); +} -#define k4cmpeq(x,y) (_mm_cmpeq_ps(x,y)) -#define k4cmpne(x,y) (_mm_cmpneq_ps(x,y)) -#define k4cmpgt(x,y) (_mm_cmpgt_ps(x,y)) -#define k4cmpge(x,y) (_mm_cmpge_ps(x,y)) -#define k4cmplt(x,y) (_mm_cmplt_ps(x,y)) -#define k4cmple(x,y) (_mm_cmple_ps(x,y)) +#endif diff --git a/src/vectors-4-default.h b/src/vectors-4-default.h index 0cd49ac..28fae04 100644 --- a/src/vectors-4-default.h +++ b/src/vectors-4-default.h @@ -19,6 +19,9 @@ // Number of vector elements in a vector #define CCTK_REAL4_VEC_SIZE 1 +vec_static_assert(sizeof(CCTK_REAL4_VEC) == + sizeof(CCTK_REAL4) * CCTK_REAL4_VEC_SIZE); + // Integer and boolean types corresponding to this real type #define CCTK_INTEGER4 CCTK_REAL4 #define CCTK_BOOLEAN4 CCTK_REAL4 diff --git a/src/vectors-8-AVX.h b/src/vectors-8-AVX.h index 6882523..ce43542 100644 --- a/src/vectors-8-AVX.h +++ b/src/vectors-8-AVX.h @@ -1,17 +1,16 @@ // Vectorise 
using Intel's or AMD's AVX // Use the type __m256d directly, without introducing a wrapper class -// Use macros instead of inline functions -#if VECTORISE_EMULATE_AVX -# include "avxintrin_emu.h" -#else -# include -#endif +#include + + + +#include #ifdef __FMA4__ -# include +# include #endif @@ -26,43 +25,80 @@ // Vector type corresponding to CCTK_REAL -#define CCTK_REAL8_VEC __m256d +typedef __m256d CCTK_REAL8_VEC; +typedef __m256i CCTK_INTEGER8_VEC; +typedef __m256d CCTK_BOOLEAN8_VEC; // Number of vector elements in a CCTK_REAL_VEC #define CCTK_REAL8_VEC_SIZE 4 +vec_static_assert(sizeof(CCTK_REAL8_VEC) == + sizeof(CCTK_REAL8) * CCTK_REAL8_VEC_SIZE); + // Integer and boolean types corresponding to this real type -#define CCTK_INTEGER8 CCTK_REAL8 -#define CCTK_BOOLEAN8 CCTK_REAL8 -#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC -#define CCTK_BOOLEAN8_VEC CCTK_REAL8_VEC +typedef CCTK_INT8 CCTK_INTEGER8; +typedef CCTK_REAL8 CCTK_BOOLEAN8; union k8const_t { - unsigned long long i[4]; - double f[4]; - __m256i vi; - __m256d vf; + CCTK_INTEGER8 i[CCTK_REAL8_VEC_SIZE]; + CCTK_REAL8 f[CCTK_REAL8_VEC_SIZE]; + CCTK_INTEGER8_VEC vi; + CCTK_REAL8_VEC vf; }; -#define K8_ZERO 0x0000000000000000ULL -#define K8_NOTZERO 0xffffffffffffffffULL -#define K8_IMIN 0x8000000000000000ULL -#define K8_IMAX 0x7fffffffffffffffULL +#define k8sign (vec8_set1i( (CCTK_INTEGER8)(1ULL << 63ULL))) +#define k8notsign (vec8_set1i(~ (CCTK_INTEGER8)(1ULL << 63ULL))) // Create vectors, extract vector elements -#define vec8_set1(a) (_mm256_set1_pd(a)) -#define vec8_set(a,b,c,d) (_mm256_set_pd(d,c,b,a)) // note reversed arguments - -#define vec8_elt0(x) (((CCTK_REAL8 const*)&(x))[0]) -#define vec8_elt1(x) (((CCTK_REAL8 const*)&(x))[1]) -#define vec8_elt2(x) (((CCTK_REAL8 const*)&(x))[2]) -#define vec8_elt3(x) (((CCTK_REAL8 const*)&(x))[3]) -#define vec8_elt(x,d) (((CCTK_REAL8 const*)&(x))[d]) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_set1(CCTK_REAL8 const a) +{ + return _mm256_set1_pd(a); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_set1i(CCTK_INT8 const a) +{ + return _mm256_castsi256_pd(_mm256_set1_epi64x(a)); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_set(CCTK_REAL8 const a, + CCTK_REAL8 const b, + CCTK_REAL8 const c, + CCTK_REAL8 const d) +{ + return _mm256_set_pd(d,c,b,a); // note reversed arguments +} + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8 vec8_elt0(CCTK_REAL8_VEC const x) +{ + return ((CCTK_REAL8 const*)&x)[0]; +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8 vec8_elt1(CCTK_REAL8_VEC const x) +{ + return ((CCTK_REAL8 const*)&x)[1]; +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8 vec8_elt2(CCTK_REAL8_VEC const x) +{ + return ((CCTK_REAL8 const*)&x)[2]; +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8 vec8_elt3(CCTK_REAL8_VEC const x) +{ + return ((CCTK_REAL8 const*)&x)[3]; +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8 vec8_elt(CCTK_REAL8_VEC const x, std::ptrdiff_t const d) +{ + return ((CCTK_REAL8 const*)&x)[d]; +} @@ -70,11 +106,17 @@ union k8const_t { // Load a vector from memory (aligned and unaligned); this loads from // a reference to a scalar -#define vec8_load(p) (_mm256_load_pd(&(p))) -#define vec8_loadu(p) (_mm256_loadu_pd(&(p))) -#if ! 
VECTORISE_ALWAYS_USE_ALIGNED_LOADS -# define vec8_load_off1(p) vec_loadu(p) -#else +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_load(CCTK_REAL8 const& p) +{ + return _mm256_load_pd(&p); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_loadu(CCTK_REAL8 const& p) +{ + return _mm256_loadu_pd(&p); +} +#if VECTORISE_ALWAYS_USE_ALIGNED_LOADS # error "VECTORISE_ALWAYS_USE_ALIGNED_LOADS not yet supported" #endif @@ -82,244 +124,492 @@ union k8const_t { // decided by the offset off and the vector size #if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS // Implementation: Always use unaligned load -# define vec8_loadu_maybe(off,p) (vec8_loadu(p)) -# define vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p)) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL8 const& p) +{ + return vec8_loadu(p); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1, + std::ptrdiff_t const off2, + std::ptrdiff_t const off3, + CCTK_REAL8 const& p) +{ + return vec8_loadu(p); +} #else -# define vec8_loadu_maybe(off,p_) \ - ({ \ - CCTK_REAL8 const& p__=(p_); \ - CCTK_REAL8 const& p=p__; \ - (off) % CCTK_REAL8_VEC_SIZE == 0 ? \ - vec8_load(p) : \ - vec8_load_off1(p); \ - }) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL8 const& p) +{ + return off % CCTK_REAL8_VEC_SIZE == 0 ? vec8_load(p) : vec8_loadu(p); +} # if VECTORISE_ALIGNED_ARRAYS // Assume all array x sizes are multiples of the vector size -# define vec8_loadu_maybe3(off1,off2,off3,p) \ - vec8_loadu_maybe(off1,p) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1, + std::ptrdiff_t const off2, + std::ptrdiff_t const off3, + CCTK_REAL8 const& p) +{ + return vec8_loadu_maybe(off1, p); +} # else -# define vec8_loadu_maybe3(off1,off2,off3,p_) \ - ({ \ - CCTK_REAL8 const& p__=(p_); \ - CCTK_REAL8 const& p=p__; \ - ((off2) % CCTK_REAL8_VEC_SIZE != 0 or \ - (off3) % CCTK_REAL8_VEC_SIZE != 0) ? \ - vec8_loadu(p) : \ - vec8_loadu_maybe(off1,p); \ - }) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1, + std::ptrdiff_t const off2, + std::ptrdiff_t const off3, + CCTK_REAL8 const& p) +{ + return + off2 % CCTK_REAL8_VEC_SIZE != 0 or + off3 % CCTK_REAL8_VEC_SIZE != 0 ? + vec8_loadu(p) : + vec8_loadu_maybe(off1, p); +} # endif #endif // Store a vector to memory (aligned and non-temporal); this stores to // a reference to a scalar -#define vec8_store(p,x) (_mm256_store_pd(&(p),x)) -#define vec8_storeu(p,x) (_mm256_storeu_pd(&(p),x)) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store(CCTK_REAL8& p, CCTK_REAL8_VEC const x) +{ + _mm256_store_pd(&p, x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_storeu(CCTK_REAL8& p, CCTK_REAL8_VEC const x) +{ + _mm256_storeu_pd(&p, x); +} #if ! 
VECTORISE_STREAMING_STORES -# define vec8_store_nta(p,x) (vec8_store(p,x)) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta(CCTK_REAL8& p, CCTK_REAL8_VEC const x) +{ + vec8_store(p, x); +} #else -# define vec8_store_nta(p,x) (_mm256_stream_pd(&(p),x)) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta(CCTK_REAL8& p, CCTK_REAL8_VEC const x) +{ + _mm256_stream_pd(&p, x); +} #endif // Store a partial vector (aligned and non-temporal) -#define vec8_store_partial_prepare(i,imin_,imax_) \ +#define vec8_store_partial_prepare(i, imin,imax) \ bool v8stp_all; \ __m256i v8stp_mask; \ - ({ \ - ptrdiff_t const imin__=(imin_); \ - ptrdiff_t const imin=imin__; \ - ptrdiff_t const imax__=(imax_); \ - ptrdiff_t const imax=imax__; \ - \ - v8stp_all = i>=imin and i+CCTK_REAL_VEC_SIZE-1=imin and i+CCTK_REAL8_VEC_SIZE-1 + +#include + + + +#define vec8_architecture "MIC (64-bit precision)" + + + +// Vector type corresponding to CCTK_REAL +typedef __m512d CCTK_REAL8_VEC; +typedef __m512i CCTK_INTEGER8_VEC; +typedef __mmask8 CCTK_BOOLEAN8_VEC; + +// Number of vector elements in a CCTK_REAL_VEC +#define CCTK_REAL8_VEC_SIZE 8 + +vec_static_assert(sizeof(CCTK_REAL8_VEC) == + sizeof(CCTK_REAL8) * CCTK_REAL8_VEC_SIZE); + +// Integer and boolean types corresponding to this real type +typedef CCTK_INT8 CCTK_INTEGER8; +typedef bool CCTK_BOOLEAN8; + + + +union k8const_t { + CCTK_INTEGER8 i[CCTK_REAL8_VEC_SIZE]; + CCTK_REAL8 f[CCTK_REAL8_VEC_SIZE]; + CCTK_INTEGER8_VEC vi; + CCTK_REAL8_VEC vf; +}; + +#define k8sign (vec8i_set1i( (CCTK_INTEGER8)(1ULL << 63ULL))) +#define k8notsign (vec8i_set1i(~ (CCTK_INTEGER8)(1ULL << 63ULL))) + + + +// Create vectors, extract vector elements + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_set1(CCTK_REAL8 const a) +{ + return _mm512_set1_pd(a); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_INTEGER8_VEC vec8i_set1i(CCTK_INT8 const a) +{ + return _mm512_set1_epi64(a); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_set(CCTK_REAL8 const a0, + CCTK_REAL8 const a1, + CCTK_REAL8 const a2, + CCTK_REAL8 const a3, + CCTK_REAL8 const a4, + CCTK_REAL8 const a5, + CCTK_REAL8 const a6, + CCTK_REAL8 const a7) +{ + return _mm512_set_pd(a7,a6,a5,a4,a3,a2,a1,a0); // note reversed arguments +} + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8 vec8_elt0(CCTK_REAL8_VEC const x) +{ + return ((CCTK_REAL8 const*)&x)[0]; +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8 vec8_elt1(CCTK_REAL8_VEC const x) +{ + return ((CCTK_REAL8 const*)&x)[1]; +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8 vec8_elt2(CCTK_REAL8_VEC const x) +{ + return ((CCTK_REAL8 const*)&x)[2]; +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8 vec8_elt3(CCTK_REAL8_VEC const x) +{ + return ((CCTK_REAL8 const*)&x)[3]; +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8 vec8_elt4(CCTK_REAL8_VEC const x) +{ + return ((CCTK_REAL8 const*)&x)[4]; +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8 vec8_elt5(CCTK_REAL8_VEC const x) +{ + return ((CCTK_REAL8 const*)&x)[5]; +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8 vec8_elt6(CCTK_REAL8_VEC const x) +{ + return ((CCTK_REAL8 const*)&x)[6]; +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8 vec8_elt7(CCTK_REAL8_VEC const x) +{ + return ((CCTK_REAL8 const*)&x)[7]; +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8 vec8_elt(CCTK_REAL8_VEC const x, std::ptrdiff_t const d) +{ + return ((CCTK_REAL8 const*)&x)[d]; +} +static inline 
CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8 vec8_elt(CCTK_BOOLEAN8_VEC const x, std::ptrdiff_t const d) +{ + return _mm512_mask2int(x) & (1 << d); +} + + + +// Load and store vectors + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_load(CCTK_REAL8 const& p) +{ + return _mm512_load_pd(&p); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_loadu(CCTK_REAL8 const& p) +{ + CCTK_REAL8_VEC x = _mm512_undefined_pd(); + x = _mm512_loadunpacklo_pd(x, &p); + x = _mm512_loadunpackhi_pd(x, &p+8); + return x; +} +#if VECTORISE_ALWAYS_USE_ALIGNED_LOADS +# error "VECTORISE_ALWAYS_USE_ALIGNED_LOADS is not yet supported" +#endif + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS +// Implementation: Always use unaligned load +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL8 const& p) +{ + return vec8_loadu(p); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1, + std::ptrdiff_t const off2, + std::ptrdiff_t const off3, + CCTK_REAL8 const& p) +{ + return vec8_loadu(p); +} +#else +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL8 const& p) +{ + return off % CCTK_REAL8_VEC_SIZE == 0 ? vec8_load(p) : vec8_loadu(p); +} +# if VECTORISE_ALIGNED_ARRAYS +// Assume all array x sizes are multiples of the vector size +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1, + std::ptrdiff_t const off2, + std::ptrdiff_t const off3, + CCTK_REAL8 const& p) +{ + return vec8_loadu_maybe(off1, p); +} +# else +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1, + std::ptrdiff_t const off2, + std::ptrdiff_t const off3, + CCTK_REAL8 const& p) +{ + return + off2 % CCTK_REAL8_VEC_SIZE != 0 or + off3 % CCTK_REAL8_VEC_SIZE != 0 ? 
+ vec8_loadu(p) : + vec8_loadu_maybe(off1, p); +} +# endif +#endif + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store(CCTK_REAL8& p, CCTK_REAL8_VEC const x) +{ + _mm512_store_pd(&p, x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_storeu(CCTK_REAL8& p, CCTK_REAL8_VEC const x) +{ + _mm512_packstorelo_pd(&p , x); + _mm512_packstorehi_pd(&p+8, x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta(CCTK_REAL8& p, CCTK_REAL8_VEC const x) +{ +#if VECTORISE_STREAMING_STORES + _mm512_extstore_pd(&p, x, _MM_DOWNCONV_PD_NONE, _MM_HINT_NT); +#else + _mm512_store_pd(&p, x); +#endif +} + +// Store a partial vector (aligned and non-temporal) +#define vec8_store_partial_prepare(i, imin,imax) \ + __mmask8 v8stp_mask; \ + vec8_store_partial_prepare_(v8stp_mask, i, imin, imax) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_partial_prepare_(__mmask8& mask, + std::ptrdiff_t const i, + std::ptrdiff_t const imin, + std::ptrdiff_t const imax) +{ + unsigned char m = 255; + if (i < imin) { + /* clear lower imin-i bits */ + m &= 255 << (imin-i); + } + if (i+CCTK_REAL8_VEC_SIZE > imax) { + /* clear upper i+CCTK_REAL8_VEC_SIZE-imax bits */ + m &= 255 >> (i+CCTK_REAL8_VEC_SIZE-imax); + } + mask = _mm512_int2mask(m); +} + +#define vec8_store_nta_partial(p, x) \ + vec8_store_nta_partial_(v8stp_mask, p, x) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_(__mmask8 const mask, + CCTK_REAL8& p, + CCTK_REAL8_VEC const x) +{ + _mm512_mask_store_pd(&p, mask, x); +} + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_lo(CCTK_REAL8& p, + CCTK_REAL8_VEC const x, + ptrdiff_t const n) +{ + _mm512_mask_store_pd(&p, _mm512_int2mask(255 >> (8-n)), x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_hi(CCTK_REAL8& p, + CCTK_REAL8_VEC const x, + ptrdiff_t const n) +{ + _mm512_mask_store_pd(&p, _mm512_int2mask(255 << (8-n)), x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_mid(CCTK_REAL8& p, + CCTK_REAL8_VEC const x, + ptrdiff_t const nlo, + ptrdiff_t const nhi) +{ + _mm512_mask_store_pd + (&p, _mm512_int2mask((255 >> (8-nlo)) & (255 << (8-nhi))), x); +} + + + +// Functions and operators + +// Operators +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8neg(CCTK_REAL8_VEC const x) +{ + // Could also multiply by -1 + // Could also invert sign bit + return _mm512_sub_pd(_mm512_set1_pd(0.0), x); +} + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8add(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm512_add_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8sub(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm512_sub_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8mul(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm512_mul_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8div(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm512_div_pd(x, y); +} + +// Fused multiply-add, defined as [+-]x*y[+-]z +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8madd(CCTK_REAL8_VEC const x, + CCTK_REAL8_VEC const y, + CCTK_REAL8_VEC const z) +{ + return _mm512_fmadd_pd(x, y, z); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8msub(CCTK_REAL8_VEC const x, + CCTK_REAL8_VEC const y, + CCTK_REAL8_VEC const z) +{ + return 
_mm512_fmsub_pd(x, y, z); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8nmadd(CCTK_REAL8_VEC const x, + CCTK_REAL8_VEC const y, + CCTK_REAL8_VEC const z) +{ + return _mm512_fnmsub_pd(x, y, z); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8nmsub(CCTK_REAL8_VEC const x, + CCTK_REAL8_VEC const y, + CCTK_REAL8_VEC const z) +{ + return _mm512_fnmadd_pd(x, y, z); +} + +// Cheap functions +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8copysign(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + CCTK_INTEGER8_VEC ix = _mm512_castpd_si512(x); + CCTK_INTEGER8_VEC iy = _mm512_castpd_si512(y); + CCTK_INTEGER8_VEC ir = _mm512_or_epi64(_mm512_and_epi64(k8notsign, ix), + _mm512_and_epi64(k8sign , iy)); + return _mm512_castsi512_pd(ir); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8fabs(CCTK_REAL8_VEC const x) +{ + // Could also do k8fmax(x, k8neg(x)) + CCTK_INTEGER8_VEC ix = _mm512_castpd_si512(x); + CCTK_INTEGER8_VEC ir = _mm512_and_epi64(k8notsign, ix); + return _mm512_castsi512_pd(ir); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8fmax(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm512_gmax_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8fmin(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm512_gmin_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8fnabs(CCTK_REAL8_VEC const x) +{ + // Could also do k8fmin(x, k8neg(x)) + CCTK_INTEGER8_VEC ix = _mm512_castpd_si512(x); + CCTK_INTEGER8_VEC ir = _mm512_or_epi64(k8sign, ix); + return _mm512_castsi512_pd(ir); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8sqrt(CCTK_REAL8_VEC const x) +{ + return _mm512_sqrt_pd(x); +} + +// Expensive functions + +#if 0 +// These implementations lead to an ICE with icpc 13.0.1 +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8acos(CCTK_REAL8_VEC const x) +{ + return _mm512_acos_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8acosh(CCTK_REAL8_VEC const x) +{ + return _mm512_acosh_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8asin(CCTK_REAL8_VEC const x) +{ + return _mm512_asin_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8asinh(CCTK_REAL8_VEC const x) +{ + return _mm512_asinh_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8atan(CCTK_REAL8_VEC const x) +{ + return _mm512_atan_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8atan2(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm512_atan2_pd(x,y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8atanh(CCTK_REAL8_VEC const x) +{ + return _mm512_atanh_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8cos(CCTK_REAL8_VEC const x) +{ + return _mm512_cos_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8cosh(CCTK_REAL8_VEC const x) +{ + return _mm512_cosh_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8exp(CCTK_REAL8_VEC const x) +{ + return _mm512_exp_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8log(CCTK_REAL8_VEC const x) +{ + return _mm512_log_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8pow(CCTK_REAL8_VEC const x, CCTK_REAL8 const a) +{ + return _mm512_pow_pd(x, _mm512_set1_pd(a)); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8sin(CCTK_REAL8_VEC const x) +{ + return _mm512_sin_pd(x); +} 
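Aside: k8copysign, k8fabs, and k8fnabs above all rely on the same observation: the sign of an IEEE double is a single bit (bit 63), so sign manipulation reduces to AND/OR with the k8sign/k8notsign masks. Below is a minimal scalar sketch of that idea, with hypothetical helper names, intended only to illustrate what the _mm512_and_epi64/_mm512_or_epi64 calls compute per element; it is not part of the patch.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Hypothetical scalar helpers: the sign of an IEEE double is bit 63, so
// copysign/fabs/fnabs reduce to AND/OR with a sign mask on the raw bits.
static std::uint64_t to_bits(double x)   { std::uint64_t i; std::memcpy(&i, &x, sizeof i); return i; }
static double from_bits(std::uint64_t i) { double x; std::memcpy(&x, &i, sizeof x); return x; }

static const std::uint64_t sign_bit = 1ULL << 63;

static double scalar_copysign(double x, double y)
{
  // magnitude bits of x, sign bit of y (the k8notsign/k8sign masks above)
  return from_bits((to_bits(x) & ~sign_bit) | (to_bits(y) & sign_bit));
}
static double scalar_fabs(double x)  { return from_bits(to_bits(x) & ~sign_bit); }
static double scalar_fnabs(double x) { return from_bits(to_bits(x) |  sign_bit); }

int main()
{
  std::printf("%g %g %g\n",
              scalar_copysign(2.0, -3.0),   // prints -2
              scalar_fabs(-2.0),            // prints 2
              scalar_fnabs(2.0));           // prints -2
  return 0;
}

The vector versions perform the same bit operations eight lanes at a time; the cast to CCTK_INTEGER8_VEC and back is presumably needed because the bitwise intrinsics used here operate on integer vectors.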
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8sinh(CCTK_REAL8_VEC const x) +{ + return _mm512_sinh_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8tan(CCTK_REAL8_VEC const x) +{ + return _mm512_tan_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8tanh(CCTK_REAL8_VEC const x) +{ + return _mm512_tanh_pd(x); +} + +#else + +// These implementations are very expensive +#define K8REPL(f,x) \ + vec8_set(f(vec8_elt0(x)), \ + f(vec8_elt1(x)), \ + f(vec8_elt2(x)), \ + f(vec8_elt3(x)), \ + f(vec8_elt4(x)), \ + f(vec8_elt5(x)), \ + f(vec8_elt6(x)), \ + f(vec8_elt7(x))); +#define K8REPL2S(f,x,a) \ + vec8_set(f(vec8_elt0(x),a), \ + f(vec8_elt1(x),a), \ + f(vec8_elt2(x),a), \ + f(vec8_elt3(x),a), \ + f(vec8_elt4(x),a), \ + f(vec8_elt5(x),a), \ + f(vec8_elt6(x),a), \ + f(vec8_elt7(x),a)); +#define K8REPL2(f,x,y) \ + vec8_set(f(vec8_elt0(x),vec8_elt0(y)), \ + f(vec8_elt1(x),vec8_elt1(y)), \ + f(vec8_elt2(x),vec8_elt2(y)), \ + f(vec8_elt3(x),vec8_elt3(y)), \ + f(vec8_elt4(x),vec8_elt4(y)), \ + f(vec8_elt5(x),vec8_elt5(y)), \ + f(vec8_elt6(x),vec8_elt6(y)), \ + f(vec8_elt7(x),vec8_elt7(y))); + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8acos(CCTK_REAL8_VEC const x) +{ + return K8REPL(acos,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8acosh(CCTK_REAL8_VEC const x) +{ + return K8REPL(acosh,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8asin(CCTK_REAL8_VEC const x) +{ + return K8REPL(asin,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8asinh(CCTK_REAL8_VEC const x) +{ + return K8REPL(asinh,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8atan(CCTK_REAL8_VEC const x) +{ + return K8REPL(atan,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8atan2(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return K8REPL2(atan2,x,y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8atanh(CCTK_REAL8_VEC const x) +{ + return K8REPL(atanh,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8cos(CCTK_REAL8_VEC const x) +{ + return K8REPL(cos,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8cosh(CCTK_REAL8_VEC const x) +{ + return K8REPL(cosh,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8exp(CCTK_REAL8_VEC const x) +{ + return K8REPL(exp,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8log(CCTK_REAL8_VEC const x) +{ + return K8REPL(log,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8pow(CCTK_REAL8_VEC const x, CCTK_REAL8 const a) +{ + return K8REPL2S(pow,x,a); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8sin(CCTK_REAL8_VEC const x) +{ + return K8REPL(sin,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8sinh(CCTK_REAL8_VEC const x) +{ + return K8REPL(sinh,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8tan(CCTK_REAL8_VEC const x) +{ + return K8REPL(tan,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8tanh(CCTK_REAL8_VEC const x) +{ + return K8REPL(tanh,x); +} + +#endif + + + +#define k8lfalse (_mm512_int2mask( 0)) +#define k8ltrue (_mm512_int2mask(~0)) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8lnot(CCTK_BOOLEAN8_VEC const x) +{ + return _mm512_knot(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8land(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y) +{ + return _mm512_kand(x, y); +} +static inline 
CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8lor(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y) +{ + return _mm512_kor(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8lxor(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y) +{ + return _mm512_kxor(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8ifthen(CCTK_BOOLEAN8_VEC const x, + CCTK_REAL8_VEC const y, + CCTK_REAL8_VEC const z) +{ + // This leads to an ICE + // return _mm512_mask_blend_pd(x, z, y); + return _mm512_mask_mov_pd(z, x, y); +} + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8cmpeq(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm512_cmpeq_pd_mask(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8cmpne(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm512_cmpneq_pd_mask(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8cmpgt(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm512_cmpnle_pd_mask(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8cmpge(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm512_cmpnlt_pd_mask(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8cmplt(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm512_cmplt_pd_mask(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8cmple(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm512_cmple_pd_mask(x, y); +} + + + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8sgn(CCTK_REAL8_VEC const x) +{ + CCTK_BOOLEAN8_VEC const iszero = k8cmpeq(x, vec8_set1(0.0)); + CCTK_BOOLEAN8_VEC const isneg = k8cmplt(x, vec8_set1(0.0)); + return k8ifthen(iszero, vec8_set1(0.0), + k8ifthen(isneg, vec8_set1(-1.0), vec8_set1(+1.0))); +} diff --git a/src/vectors-8-QPX.h b/src/vectors-8-QPX.h index 631c974..7639476 100644 --- a/src/vectors-8-QPX.h +++ b/src/vectors-8-QPX.h @@ -45,11 +45,16 @@ struct CCTK_REAL8_VEC { // Create vectors, extract vector elements #define vec8_set1(a) (vec_splats(a)) -#define vec8_set(a,b,c,d) ((vector4double){a,b,c,d}) +#define vec8_set(a,b,c,d) \ + (vec_insert \ + (d,vec_insert \ + (c,vec_insert \ + (b,vec_insert \ + (a,CCTK_REAL8_VEC(),0),1),2),3)) #define vec8_b2r(b) ((b)?+1.0:-1.0) -#define vec8b_set(a,b,c,d) \ - ((vector4double){vec8_b2r(a),vec8_b2r(b),vec8_b2r(c),vec8_b2r(d)}) +#define vec8b_set(a,b,c,d) \ + (vec8_set(vec8_b2r(a),vec8_b2r(b),vec8_b2r(c),vec8_b2r(d))) #define vec8_elt0(x) (vec_extract(x,0)) #define vec8_elt1(x) (vec_extract(x,1)) @@ -351,8 +356,8 @@ struct CCTK_REAL8_VEC { #define k8ifthen(x,y,z) (vec_sel(z,y,x)) #define k8cmpeq(x,y) (vec_cmpeq(x,y)) -#define k8cmpne(x,y) (k8lnot(vec_cmpeq(x,y))) +#define k8cmpne(x,y) (k8lnot(k8cmpeq(x,y))) #define k8cmpgt(x,y) (vec_cmpgt(x,y)) -#define k8cmpge(x,y) (k8lnot(vec_cmplt(x,y))) +#define k8cmpge(x,y) (k8lnot(k8cmplt(x,y))) #define k8cmplt(x,y) (vec_cmplt(x,y)) -#define k8cmple(x,y) (vec_not(vec_cmpgt(x,y))) +#define k8cmple(x,y) (k8lnot(k8cmpgt(x,y))) diff --git a/src/vectors-8-SSE2.h b/src/vectors-8-SSE2.h index 6dfe89f..2326e49 100644 --- a/src/vectors-8-SSE2.h +++ b/src/vectors-8-SSE2.h @@ -1,12 +1,18 @@ - +// Vectorise using Intel's or AMD's SSE2 // Use the type __m128d directly, without introducing a wrapper class -// Use macros instead of inline functions + +#ifdef __PGI +// PGI doesn't want to inline functions +# include "macros/vectors-8-SSE2.h" +#else + +#include +#include + -#include -#include 
#include #ifdef __SSE4_1__ @@ -28,7 +34,7 @@ #endif #ifdef __FMA4__ -# include +# include #endif @@ -53,46 +59,79 @@ // Vector type corresponding to CCTK_REAL -#define CCTK_REAL8_VEC __m128d +typedef __m128d CCTK_REAL8_VEC; +typedef __m128i CCTK_INTEGER8_VEC; +typedef __m128d CCTK_BOOLEAN8_VEC; // Number of vector elements in a CCTK_REAL_VEC #define CCTK_REAL8_VEC_SIZE 2 +vec_static_assert(sizeof(CCTK_REAL8_VEC) == + sizeof(CCTK_REAL8) * CCTK_REAL8_VEC_SIZE); + // Integer and boolean types corresponding to this real type -#define CCTK_INTEGER8 CCTK_REAL8 -#define CCTK_BOOLEAN8 CCTK_REAL8 -#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC -#define CCTK_BOOLEAN8_VEC CCTK_REAL8_VEC +typedef CCTK_INT8 CCTK_INTEGER8; +typedef CCTK_REAL8 CCTK_BOOLEAN8; union k8const_t { - long long i[2]; - double f[2]; - __m128i vi; - __m128d vf; + CCTK_INTEGER8 i[CCTK_REAL8_VEC_SIZE]; + CCTK_REAL8 f[CCTK_REAL8_VEC_SIZE]; + CCTK_INTEGER8_VEC vi; + CCTK_REAL8_VEC vf; }; -#define K8_IMIN ((long long)0x8000000000000000ULL) +#define k8sign (vec8_set1i( (CCTK_INTEGER8)(1ULL << 63ULL))) +#define k8notsign (vec8_set1i(~ (CCTK_INTEGER8)(1ULL << 63ULL))) // Create vectors, extract vector elements -#define vec8_set1(a) (_mm_set1_pd(a)) -#define vec8_set(a,b) (_mm_set_pd(b,a)) // note reversed arguments +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_set1(CCTK_REAL8 const a) +{ + return _mm_set1_pd(a); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_set1i(CCTK_INT8 const a) +{ +#if defined(__INTEL_COMPILER) + // Intel 11.1 does not support _mm_set1_epi64x + return _mm_set1_pd(*(CCTK_REAL8 const*)&a); +#else + return _mm_castsi128_pd(_mm_set1_epi64x(a)); +#endif +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_set(CCTK_REAL8 const a, CCTK_REAL8 const b) +{ + return _mm_set_pd(b,a); // note reversed arguments +} // original order is 01 -#define vec8_swap10(x_) \ - ({ \ - CCTK_REAL8_VEC const x__=(x_); \ - CCTK_REAL8_VEC const x=x__; \ - _mm_shuffle_pd(x,x, _MM_SHUFFLE2(0,1)); \ - }) - -#define vec8_elt0(x) (((CCTK_REAL8 const*)&(x))[0]) -#define vec8_elt1(x) (((CCTK_REAL8 const*)&(x))[1]) -#define vec8_elt(x,d) (((CCTK_REAL8 const*)&(x))[d]) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_swap10(CCTK_REAL8_VEC const x) +{ + return _mm_shuffle_pd(x,x, _MM_SHUFFLE2(0,1)); +} + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8 vec8_elt0(CCTK_REAL8_VEC const x) +{ + return ((CCTK_REAL8 const*)&x)[0]; +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8 vec8_elt1(CCTK_REAL8_VEC const x) +{ + return ((CCTK_REAL8 const*)&x)[1]; +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8 vec8_elt(CCTK_REAL8_VEC const x, std::ptrdiff_t const d) +{ + return ((CCTK_REAL8 const*)&x)[d]; +} @@ -100,141 +139,237 @@ union k8const_t { // Load a vector from memory (aligned and unaligned); this loads from // a reference to a scalar -#define vec8_load(p) (_mm_load_pd(&(p))) -#define vec8_loadu(p) (_mm_loadu_pd(&(p))) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_load(CCTK_REAL8 const& p) +{ + return _mm_load_pd(&p); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_loadu(CCTK_REAL8 const& p) +{ + return _mm_loadu_pd(&p); +} #if ! 
VECTORISE_ALWAYS_USE_ALIGNED_LOADS -# define vec8_load_off1(p) vec_loadu(p) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_load_off1(CCTK_REAL8 const& p) +{ + return vec8_loadu(p); +} #else -# define vec8_load_off1(p_) \ - ({ \ - CCTK_REAL8 const& p__=(p_); \ - CCTK_REAL8 const& p=p__; \ - _mm_shuffle_pd(vec8_load((&p)[-1]), \ - vec8_load((&p)[+1]), _MM_SHUFFLE2(0,1)); \ - }) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_load_off1(CCTK_REAL8 const& p) +{ + return _mm_shuffle_pd(vec8_load((&p)[-1]), + vec8_load((&p)[+1]), _MM_SHUFFLE2(0,1)); +} #endif // Load a vector from memory that may or may not be aligned, as // decided by the offset off and the vector size #if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS // Implementation: Always use unaligned load -# define vec8_loadu_maybe(off,p) vec8_loadu(p) -# define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu(p) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL8 const& p) +{ + return vec8_loadu(p); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1, + std::ptrdiff_t const off2, + std::ptrdiff_t const off3, + CCTK_REAL8 const& p) +{ + return vec8_loadu(p); +} #else -# define vec8_loadu_maybe(off,p_) \ - ({ \ - CCTK_REAL8 const& p__=(p_); \ - CCTK_REAL8 const& p=p__; \ - (off) % CCTK_REAL8_VEC_SIZE == 0 ? \ - vec8_load(p) : \ - vec8_load_off1(p); \ - }) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL8 const& p) +{ + // The :? operator breaks with the Intel compiler + // return off % CCTK_REAL8_VEC_SIZE == 0 ? vec8_load(p) : vec8_load_off1(p); + if (off % CCTK_REAL8_VEC_SIZE == 0) return vec8_load(p); + return vec8_load_off1(p); +} # if VECTORISE_ALIGNED_ARRAYS // Assume all array x sizes are multiples of the vector size -# define vec8_loadu_maybe3(off1,off2,off3,p) \ - vec8_loadu_maybe(off1,p) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1, + std::ptrdiff_t const off2, + std::ptrdiff_t const off3, + CCTK_REAL8 const& p) +{ + return vec8_loadu_maybe(off1, p); +} # else -# define vec8_loadu_maybe3(off1,off2,off3,p_) \ - ({ \ - CCTK_REAL8 const& p__=(p_); \ - CCTK_REAL8 const& p=p__; \ - ((off2) % CCTK_REAL8_VEC_SIZE != 0 or \ - (off3) % CCTK_REAL8_VEC_SIZE != 0) ? \ - vec8_loadu(p) : \ - vec8_loadu_maybe(off1,p); \ - }) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1, + std::ptrdiff_t const off2, + std::ptrdiff_t const off3, + CCTK_REAL8 const& p) +{ + return + off2 % CCTK_REAL8_VEC_SIZE != 0 or + off3 % CCTK_REAL8_VEC_SIZE != 0 ? + vec8_loadu(p) : + vec8_loadu_maybe(off1, p); +} # endif #endif // Store a vector to memory (aligned and non-temporal); this stores to // a reference to a scalar -#define vec8_store(p,x) (_mm_store_pd(&(p),x)) -#define vec8_storeu(p,x) (_mm_storeu_pd(&(p),x)) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store(CCTK_REAL8& p, CCTK_REAL8_VEC const x) +{ + _mm_store_pd(&p, x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_storeu(CCTK_REAL8& p, CCTK_REAL8_VEC const x) +{ + _mm_storeu_pd(&p, x); +} #if ! 
VECTORISE_STREAMING_STORES -# define vec8_store_nta(p,x) vec8_store(p,x) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta(CCTK_REAL8& p, CCTK_REAL8_VEC const x) +{ + vec8_store(p, x); +} #else -# define vec8_store_nta(p,x) (_mm_stream_pd(&(p),x)) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta(CCTK_REAL8& p, CCTK_REAL8_VEC const x) +{ + _mm_stream_pd(&p, x); +} #endif // Store a partial vector (aligned and non-temporal) -#define vec8_store_partial_prepare(i,imin,imax) \ - bool const v8stp_lo = (i)>=(imin); \ - bool const v8stp_hi = (i)+CCTK_REAL_VEC_SIZE-1<(imax) +#define vec8_store_partial_prepare(i, imin,imax) \ + bool v8stp_lo, v8stp_hi; \ + vec8_store_partial_prepare_(v8stp_lo, v8stp_hi, i, imin, imax); +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_partial_prepare_(bool& lo, bool& hi, + std::ptrdiff_t const i, + std::ptrdiff_t const imin, + std::ptrdiff_t const imax) +{ + lo = i >= imin; + hi = i+CCTK_REAL8_VEC_SIZE-1 < imax; +} +#define vec8_store_nta_partial(p, x) \ + vec8_store_nta_partial_(v8stp_lo, v8stp_hi, p, x) #if VECTORISE_STREAMING_STORES && defined(__SSE4A__) -# define vec8_store_nta_partial(p_,x_) \ - ({ \ - CCTK_REAL8& p__=(p_); \ - CCTK_REAL8& p=p__; \ - CCTK_REAL8_VEC const x__=(x_); \ - CCTK_REAL8_VEC const x=x__; \ - if (CCTK_BUILTIN_EXPECT(v8stp_lo and v8stp_hi, true)) { \ - vec8_store_nta(p,x); \ - } else if (v8stp_lo) { \ - _mm_stream_sd(&p,x); \ - } else if (v8stp_hi) { \ - _mm_stream_sd(&p+1, vec8_swap10(x)); \ - } \ - }) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_(bool const lo, bool const hi, + CCTK_REAL8& p, + CCTK_REAL8_VEC const x) +{ + if (CCTK_BUILTIN_EXPECT(lo and hi, true)) { + vec8_store_nta(p, x); + } else if (lo) { + _mm_stream_sd(&p, x); + } else if (hi) { + _mm_stream_sd(&p+1, vec8_swap10(x)); + } +} #else -# define vec8_store_nta_partial(p_,x_) \ - ({ \ - CCTK_REAL8& p__=(p_); \ - CCTK_REAL8& p=p__; \ - CCTK_REAL8_VEC const x__=(x_); \ - CCTK_REAL8_VEC const x=x__; \ - if (CCTK_BUILTIN_EXPECT(v8stp_lo and v8stp_hi, true)) { \ - vec8_store_nta(p,x); \ - } else if (v8stp_lo) { \ - _mm_storel_pd(&p,x); \ - } else if (v8stp_hi) { \ - _mm_storeh_pd(&p+1,x); \ - } \ - }) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_(bool const lo, bool const hi, + CCTK_REAL8& p, + CCTK_REAL8_VEC const x) +{ + if (CCTK_BUILTIN_EXPECT(lo and hi, true)) { + vec8_store_nta(p, x); + } else if (lo) { + _mm_storel_pd(&p, x); + } else if (hi) { + _mm_storeh_pd(&p+1, x); + } +} #endif // Store a lower or higher partial vector (aligned and non-temporal) #if ! 
VECTORISE_STREAMING_STORES -# define vec8_store_nta_partial_lo(p,x,n) (_mm_storel_pd(&(p),x)) -# define vec8_store_nta_partial_hi(p,x,n) (_mm_storeh_pd(&(p)+1,x)) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_lo(CCTK_REAL8& p, + CCTK_REAL8_VEC const x, + ptrdiff_t const n) +{ + _mm_storel_pd(&p, x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_hi(CCTK_REAL8& p, + CCTK_REAL8_VEC const x, + ptrdiff_t const n) +{ + _mm_storeh_pd(&p+1, x); +} #else # if defined(__SSE4A__) -# define vec8_store_nta_partial_lo(p,x,n) (_mm_stream_sd(&(p),x)) -# define vec8_store_nta_partial_hi(p,x,n) \ - (_mm_stream_sd(&(p)+1, vec8_swap10(x))) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_lo(CCTK_REAL8& p, + CCTK_REAL8_VEC const x, + ptrdiff_t const n) +{ + _mm_stream_sd(&p, x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_hi(CCTK_REAL8& p, + CCTK_REAL8_VEC const x, + ptrdiff_t const n) +{ + _mm_stream_sd(&p+1, vec8_swap10(x)); +} # else // TODO: use clflush once a whole cache line has been written (cache // lines are usually larger than the CPU vector size) -# define vec8_store_nta_partial_lo(p_,x,n) \ - ({ \ - CCTK_REAL8& p__=(p_); \ - CCTK_REAL8& p=p__; \ - _mm_storel_pd(&p,x); \ - /* _mm_clflush(&p); */ \ - }) -# define vec8_store_nta_partial_hi(p_,x,n) \ - ({ \ - CCTK_REAL8& p__=(p_); \ - CCTK_REAL8& p=p__; \ - _mm_storeh_pd(&p+1,x); \ - /* _mm_clflush(&p+1); */ \ - }) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_lo(CCTK_REAL8& p, + CCTK_REAL8_VEC const x, + ptrdiff_t const n) +{ + _mm_storel_pd(&p, x); + // _mm_clflush(&p); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_hi(CCTK_REAL8& p, + CCTK_REAL8_VEC const x, + ptrdiff_t const n) +{ + _mm_storeh_pd(&p+1, x); + // _mm_clflush(&p+1); +} # endif #endif #if 0 // This is slower; we would need a non-temporal read -#define vec8_store_nta_partial_lo(p,x,n) \ - vec8_store_nta(p, _mm_loadh_pd(x,&(p)+1)) -#define vec8_store_nta_partial_hi(p,x,n) \ - vec8_store_nta(p, _mm_loadl_pd(x,&(p))) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_lo(CCTK_REAL8& p, + CCTK_REAL8_VEC const x, + ptrdiff_t const n) +{ + vec8_store_nta(p, _mm_loadh_pd(x, &p+1)); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_hi(CCTK_REAL8& p, + CCTK_REAL8_VEC const x, + ptrdiff_t const n) +{ + vec8_store_nta(p, _mm_loadl_pd(x, &p)); +} #endif -#define vec8_store_nta_partial_mid(p,x,nlo,nhi) assert(0) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_mid(CCTK_REAL8& p, + CCTK_REAL8_VEC const x, + ptrdiff_t const nlo, + ptrdiff_t const nhi) +{ + assert(0); +} // Functions and operators -static const k8const_t k8sign_mask = {{ K8_IMIN, K8_IMIN, }}; - // Operators // #define k8inot(x) (_mm_xor_si128(k8all_mask,x)) @@ -254,176 +389,320 @@ static const k8const_t k8sign_mask = {{ K8_IMIN, K8_IMIN, }}; // #define k8or(x,y) (_mm_or_pd(x,y)) // #define k8xor(x,y) (_mm_xor_pd(x,y)) -#define k8neg(x) (_mm_xor_pd(k8sign_mask.vf,x)) - -#define k8add(x,y) (_mm_add_pd(x,y)) -#define k8sub(x,y) (_mm_sub_pd(x,y)) -#define k8mul(x,y) (_mm_mul_pd(x,y)) -#define k8div(x,y) (_mm_div_pd(x,y)) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8neg(CCTK_REAL8_VEC const x) +{ + return _mm_xor_pd(k8sign, x); +} + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8add(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_add_pd(x, y); +} +static 
inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8sub(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_sub_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8mul(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_mul_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8div(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_div_pd(x, y); +} // Fused multiply-add, defined as [+-]x*y[+-]z #ifdef __FMA4__ -# define k8madd(x,y,z) (_mm_macc_pd(x,y,z)) -# define k8msub(x,y,z) (_mm_msub_pd(x,y,z)) -# define k8nmadd(x,y,z) (_mm_nmsub_pd(x,y,z)) -# define k8nmsub(x,y,z) (_mm_nmacc_pd(x,y,z)) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8madd(CCTK_REAL8_VEC const x, + CCTK_REAL8_VEC const y, + CCTK_REAL8_VEC const z) +{ + return _mm_macc_pd(x, y, z); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8msub(CCTK_REAL8_VEC const x, + CCTK_REAL8_VEC const y, + CCTK_REAL8_VEC const z) +{ + return _mm_msub_pd(x, y, z); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8nmadd(CCTK_REAL8_VEC const x, + CCTK_REAL8_VEC const y, + CCTK_REAL8_VEC const z) +{ + return _mm_nmsub_pd(x, y, z); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8nmsub(CCTK_REAL8_VEC const x, + CCTK_REAL8_VEC const y, + CCTK_REAL8_VEC const z) +{ + return _mm_nmacc_pd(x, y, z); +} #else -# define k8madd(x,y,z) (k8add(k8mul(x,y),z)) -# define k8msub(x,y,z) (k8sub(k8mul(x,y),z)) -# define k8nmadd(x,y,z) (k8sub(k8neg(z),k8mul(x,y))) -# define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y))) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8madd(CCTK_REAL8_VEC const x, + CCTK_REAL8_VEC const y, + CCTK_REAL8_VEC const z) +{ + return k8add(k8mul(x, y), z); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8msub(CCTK_REAL8_VEC const x, + CCTK_REAL8_VEC const y, + CCTK_REAL8_VEC const z) +{ + return k8sub(k8mul(x, y), z); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8nmadd(CCTK_REAL8_VEC const x, + CCTK_REAL8_VEC const y, + CCTK_REAL8_VEC const z) +{ + return k8sub(k8neg(z), k8mul(x, y)); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8nmsub(CCTK_REAL8_VEC const x, + CCTK_REAL8_VEC const y, + CCTK_REAL8_VEC const z) +{ + return k8sub(z, k8mul(x, y)); +} #endif // Cheap functions -#define k8copysign(x,y) \ - (_mm_or_pd(_mm_andnot_pd(k8sign_mask.vf,x), \ - _mm_and_pd(k8sign_mask.vf,y))) -#define k8fabs(x) (_mm_andnot_pd(k8sign_mask.vf,x)) -#define k8fmax(x,y) (_mm_max_pd(x,y)) -#define k8fmin(x,y) (_mm_min_pd(x,y)) -#define k8fnabs(x) (_mm_or_pd(k8sign_mask.vf,x)) -static const k8const_t k8zero = { f: { 0.0, 0.0, }}; -static const k8const_t k8one = { f: { 1.0, 1.0, }}; -#define k8sgn(x_) \ - ({ \ - CCTK_REAL_VEC const x__=(x_); \ - CCTK_REAL_VEC const x=x__; \ - CCTK_REAL_VEC const iszero = _mm_cmpeq_pd(k8zero.vf, x); \ - CCTK_REAL_VEC const sign = _mm_and_pd(k8sign_mask.vf, x); \ - CCTK_REAL_VEC const signedone = _mm_or_pd(k8one.vf, sign); \ - k8ifthen(iszero, k8zero.vf, signedone); \ - }) -#define k8sqrt(x) (_mm_sqrt_pd(x)) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8copysign(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_or_pd(_mm_and_pd(k8notsign, x), + _mm_and_pd(k8sign , y)); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8fabs(CCTK_REAL8_VEC const x) +{ + return _mm_and_pd(k8notsign, x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8fmax(CCTK_REAL8_VEC const x, 
CCTK_REAL8_VEC const y) +{ + return _mm_max_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8fmin(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_min_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8fnabs(CCTK_REAL8_VEC const x) +{ + return _mm_or_pd(k8sign, x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8sqrt(CCTK_REAL8_VEC const x) +{ + return _mm_sqrt_pd(x); +} // Expensive functions -#define K8REPL(f,x_) \ - ({ \ - CCTK_REAL8_VEC const x__=(x_); \ - CCTK_REAL8_VEC const x=x__; \ - vec8_set(f(vec8_elt0(x)), \ - f(vec8_elt1(x))); \ - }) -#define K8REPL2S(f,x_,a_) \ - ({ \ - CCTK_REAL8_VEC const x__=(x_); \ - CCTK_REAL8 const a__=(a_); \ - CCTK_REAL8_VEC const x=x__; \ - CCTK_REAL8 const a=a__; \ - vec8_set(f(vec8_elt0(x),a), \ - f(vec8_elt1(x),a)); \ - }) -#define K8REPL2(f,x_,y_) \ - ({ \ - CCTK_REAL8_VEC const x__=(x_); \ - CCTK_REAL8_VEC const y__=(y_); \ - CCTK_REAL8_VEC const x=x__; \ - CCTK_REAL8_VEC const y=y__; \ - vec8_set(f(vec8_elt0(x),vec8_elt0(y)), \ - f(vec8_elt1(x),vec8_elt1(y))); \ - }) - -#define k8acos(x) K8REPL(acos,x) -#define k8acosh(x) K8REPL(acosh,x) -#define k8asin(x) K8REPL(asin,x) -#define k8asinh(x) K8REPL(asinh,x) -#define k8atan(x) K8REPL(atan,x) -#define k8atan2(x,y) K8REPL2(atan2,x,y) -#define k8atanh(x) K8REPL(atanh,x) -#define k8cos(x) K8REPL(cos,x) -#define k8cosh(x) K8REPL(cosh,x) -#define k8exp(x) K8REPL(exp,x) -#define k8log(x) K8REPL(log,x) -#define k8pow(x,a) K8REPL2S(pow,x,a) -#define k8sin(x) K8REPL(sin,x) -#define k8sinh(x) K8REPL(sinh,x) -#define k8tan(x) K8REPL(tan,x) -#define k8tanh(x) K8REPL(tanh,x) - -static const k8const_t k8lfalse_ = {{ +0LL, +0LL, }}; -static const k8const_t k8ltrue_ = {{ -1LL, -1LL, }}; -#define k8lfalse (k8lfalse_.vf) -#define k8ltrue (k8ltrue_.vf) -#define k8lnot(x) (_mm_xor_pd(k8ltrue,x)) -#define k8land(x,y) (_mm_and_pd(x,y)) -#define k8lor(x,y) (_mm_or_pd(x,y)) -#define k8lxor(x,y) (_mm_xor_pd(x,y)) - +#define K8REPL(f,x) \ + vec8_set(f(vec8_elt0(x)), \ + f(vec8_elt1(x))); +#define K8REPL2S(f,x,a) \ + vec8_set(f(vec8_elt0(x),a), \ + f(vec8_elt1(x),a)); +#define K8REPL2(f,x,y) \ + vec8_set(f(vec8_elt0(x),vec8_elt0(y)), \ + f(vec8_elt1(x),vec8_elt1(y))); + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8acos(CCTK_REAL8_VEC const x) +{ + return K8REPL(acos,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8acosh(CCTK_REAL8_VEC const x) +{ + return K8REPL(acosh,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8asin(CCTK_REAL8_VEC const x) +{ + return K8REPL(asin,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8asinh(CCTK_REAL8_VEC const x) +{ + return K8REPL(asinh,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8atan(CCTK_REAL8_VEC const x) +{ + return K8REPL(atan,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8atan2(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return K8REPL2(atan2,x,y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8atanh(CCTK_REAL8_VEC const x) +{ + return K8REPL(atanh,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8cos(CCTK_REAL8_VEC const x) +{ + return K8REPL(cos,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8cosh(CCTK_REAL8_VEC const x) +{ + return K8REPL(cosh,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8exp(CCTK_REAL8_VEC const x) +{ + return K8REPL(exp,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE 
+CCTK_REAL8_VEC k8log(CCTK_REAL8_VEC const x) +{ + return K8REPL(log,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8pow(CCTK_REAL8_VEC const x, CCTK_REAL8 const a) +{ + return K8REPL2S(pow,x,a); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8sin(CCTK_REAL8_VEC const x) +{ + return K8REPL(sin,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8sinh(CCTK_REAL8_VEC const x) +{ + return K8REPL(sinh,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8tan(CCTK_REAL8_VEC const x) +{ + return K8REPL(tan,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8tanh(CCTK_REAL8_VEC const x) +{ + return K8REPL(tanh,x); +} + + + +#define k8lfalse (vec8_set1i( 0)) +#define k8ltrue (vec8_set1i(~0)) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8lnot(CCTK_BOOLEAN8_VEC const x) +{ + return _mm_xor_pd(k8ltrue, x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8land(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y) +{ + return _mm_and_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8lor(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y) +{ + return _mm_or_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8lxor(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y) +{ + return _mm_xor_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8ifthen(CCTK_BOOLEAN8_VEC const x, + CCTK_REAL8_VEC const y, + CCTK_REAL8_VEC const z) +{ #ifdef __SSE4_1__ -# define k8ifthen(x,y,z) (_mm_blendv_pd(z,y,x)) + return _mm_blendv_pd(z,y,x); #elif 0 -// This is slow (but this is what Intel/PGI produce by themselves) -# define k8ifthen(x_,y_,z_) \ - ({ \ - CCTK_REAL8_VEC const x__=(x_); \ - CCTK_REAL8_VEC const y__=(y_); \ - CCTK_REAL8_VEC const z__=(z_); \ - CCTK_REAL8_VEC const x=x__; \ - CCTK_REAL8_VEC const y=y__; \ - CCTK_REAL8_VEC const z=z__; \ - int const m = _mm_movemask_pd(x); \ - CCTK_REAL8_VEC r; \ - switch (m) { \ - case 0: r = y; break; \ - case 1: r = _mm_move_sd(y,z); break; \ - case 2: r = _mm_move_sd(z,y); break; \ - case 3: r = z; break; \ - } \ - r; \ - }) + // This is slow (but this is what Intel/PGI produce by themselves) + int const m = _mm_movemask_pd(x); + switch (m) { + case 0: return y; + case 1: return _mm_move_sd(y,z); + case 2: return _mm_move_sd(z,y); + } + return z; #elif 0 -# ifdef __cplusplus -# define k8signbit(x) ({ using namespace std; signbit(x); }) -# else -# define k8signbit(x) (signbit(x)) -# endif -# define k8ifthen(x_,y_,z_) \ - ({ \ - CCTK_REAL8_VEC const x__=(x_); \ - CCTK_REAL8_VEC const y__=(y_); \ - CCTK_REAL8_VEC const z__=(z_); \ - CCTK_REAL8_VEC const x=x__; \ - CCTK_REAL8_VEC const y=y__; \ - CCTK_REAL8_VEC const z=z__; \ - vec8_set(k8signbit(vec8_elt0(x)) ? vec8_elt0(y) : vec8_elt0(z), \ - k8signbit(vec8_elt1(x)) ? vec8_elt1(y) : vec8_elt1(z)); \ - }) + return vec8_set(std::signbit(vec8_elt0(x)) ? vec8_elt0(y) : vec8_elt0(z), + std::signbit(vec8_elt1(x)) ? 
vec8_elt1(y) : vec8_elt1(z)); #elif 0 -// We don't need to shift -- the condition (mask) will be either all -// zeros or all ones -static const k8const_t k8ione = {{ 0x1ULL, 0x1ULL, }}; -# define k8ifthen(x_,y_,z_) \ - ({ \ - CCTK_REAL8_VEC const x__=(x_); \ - CCTK_REAL8_VEC const y__=(y_); \ - CCTK_REAL8_VEC const z__=(z_); \ - CCTK_REAL8_VEC const x=x__; \ - CCTK_REAL8_VEC const y=y__; \ - CCTK_REAL8_VEC const z=z__; \ - /* there is no _mm_srai_epi64(x, 63); we therefore calculate srli(x)-1 */ \ - __m128i const x_int = *(__m128i const*)&x; \ - __m128i const imask_int = \ - _mm_sub_epi64(_mm_srli_epi64(x_int, 63), k8ione.vi); \ - CCTK_REAL8_VEC const imask = *(CCTK_REAL8_VEC const*)&imask_int; \ - /* (z & ~mask) | (y & mask) where imask = ~mask */ \ - _mm_or_pd(_mm_and_pd(imask, z), _mm_andnot_pd(imask, y)); \ - }) + // We don't need to shift -- the condition (mask) will be either all + // zeros or all ones + k8const_t const k8ione = { i: { 1, 1, }}; + // there is no _mm_srai_epi64(x, 63); we therefore calculate srli(x)-1 + __m128i const x_int = *(__m128i const*)&x; + __m128i const imask_int = _mm_sub_epi64(_mm_srli_epi64(x_int, 63), k8ione.vi); + CCTK_REAL8_VEC const imask = *(CCTK_REAL8_VEC const*)&imask_int; + // (z & ~mask) | (y & mask) where imask = ~mask + return _mm_or_pd(_mm_and_pd(imask, z), _mm_andnot_pd(imask, y)); #else -# define k8ifthen(x_,y_,z_) \ - ({ \ - CCTK_REAL8_VEC const x__=(x_); \ - CCTK_REAL8_VEC const y__=(y_); \ - CCTK_REAL8_VEC const z__=(z_); \ - CCTK_REAL8_VEC const x=x__; \ - CCTK_REAL8_VEC const y=y__; \ - CCTK_REAL8_VEC const z=z__; \ - /* (z & ~mask) | (y & mask) where imask = ~mask */ \ - _mm_or_pd(_mm_and_pd(x, y), _mm_andnot_pd(x, z)); \ - }) + // This assumes that all logical operations always return either + // lfalse or ltrue, and nothing "in between" + // (z & ~mask) | (y & mask) where imask = ~mask + return _mm_or_pd(_mm_and_pd(x, y), _mm_andnot_pd(x, z)); #endif +} + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8cmpeq(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_cmpeq_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8cmpne(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_cmpneq_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8cmpgt(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_cmpgt_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8cmpge(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_cmpge_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8cmplt(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_cmplt_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8cmple(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_cmple_pd(x, y); +} + + + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8sgn(CCTK_REAL8_VEC const x) +{ + CCTK_BOOLEAN8_VEC const iszero = k8cmpeq(x, vec8_set1(0.0)); + CCTK_REAL8_VEC const sign = _mm_and_pd(k8sign, x); + CCTK_REAL8_VEC const signedone = _mm_or_pd(sign, vec8_set1(1.0)); + return k8ifthen(iszero, vec8_set1(0.0), signedone); +} -#define k8cmpeq(x,y) (_mm_cmpeq_pd(x,y)) -#define k8cmpne(x,y) (_mm_cmpneq_pd(x,y)) -#define k8cmpgt(x,y) (_mm_cmpgt_pd(x,y)) -#define k8cmpge(x,y) (_mm_cmpge_pd(x,y)) -#define k8cmplt(x,y) (_mm_cmplt_pd(x,y)) -#define k8cmple(x,y) (_mm_cmple_pd(x,y)) +#endif diff --git a/src/vectors-8-VSX.h b/src/vectors-8-VSX.h index 35af574..1faae76 100644 
--- a/src/vectors-8-VSX.h
+++ b/src/vectors-8-VSX.h
@@ -20,9 +20,9 @@
 #define CCTK_REAL8_VEC_SIZE 2
 
 // Integer and boolean types corresponding to this real type
-#define CCTK_INTEGER8 long long
+//#define CCTK_INTEGER8 long long
 #define CCTK_BOOLEAN8 long long
-#define CCTK_INTEGER8_VEC vector long long
+//#define CCTK_INTEGER8_VEC vector long long
 #define CCTK_BOOLEAN8_VEC vector bool long long
 
diff --git a/src/vectors-8-default.h b/src/vectors-8-default.h
index 7ff6c8c..5c07bfb 100644
--- a/src/vectors-8-default.h
+++ b/src/vectors-8-default.h
@@ -1,13 +1,9 @@
 // Fallback vectorisation implementation: Do not vectorise
 
-// We use macros here, so that we are not surprised by compilers which
-// don't like to inline functions. This should also make debug builds
-// (which may not inline) more efficient.
-
-#include <assert.h>
-#include <math.h>
+#include <cassert>
+#include <cmath>
 
@@ -19,10 +15,13 @@
 // Number of vector elements in a vector
 #define CCTK_REAL8_VEC_SIZE 1
 
+vec_static_assert(sizeof(CCTK_REAL8_VEC) ==
+                  sizeof(CCTK_REAL8) * CCTK_REAL8_VEC_SIZE);
+
 // Integer and boolean types corresponding to this real type
-#define CCTK_INTEGER8 CCTK_REAL8
+//#define CCTK_INTEGER8 CCTK_REAL8
 #define CCTK_BOOLEAN8 CCTK_REAL8
-#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC
+//#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC
 #define CCTK_BOOLEAN8_VEC CCTK_REAL8_VEC
 
@@ -89,7 +88,7 @@
 #define k8atan(x) (atan(x))
 #define k8atan2(x,y) (atan2(x,y))
 #define k8atanh(x) (atanh(x))
-#define k8copysign(x,y) (copysign(x,y))
+#define k8copysign(x,y) (std::copysign(x,y))
 #define k8cos(x) (cos(x))
 #define k8cosh(x) (cosh(x))
 #define k8exp(x) (exp(x))
@@ -104,17 +103,16 @@
 #define k8sqrt(x) (sqrt(x))
 #define k8tan(x) (tan(x))
 #define k8tanh(x) (tanh(x))
-
-#define k8sgn(x_)                                                      \
-  ({                                                                   \
-    CCTK_REAL x__=(x_);                                                \
-    CCTK_REAL x=x__;                                                   \
-    x==(CCTK_REAL)0.0 ? (CCTK_REAL)0.0 : std::copysign((CCTK_REAL)1.0, x); \
-  })
-#define k8signbit(x) (std::signbit(x))
-
-#define k8l2r(x_) ({ CCTK_INT8 x__=(x_); CCTK_INT8 x=x__; *(CCTK_REAL8*)&x; })
-#define k8r2l(x_) ({ CCTK_REAL8 x__=(x_); CCTK_REAL8 x=x__; *(CCTK_INT8*)&x; })
+#define k8signbit(x) (std::signbit(x))
+
+static inline CCTK_REAL8_VEC k8l2r(CCTK_INT8 const x)
+{
+  return *(CCTK_REAL8 const*)&x;
+}
+static inline CCTK_INT8 k8r2l(CCTK_REAL8_VEC const x)
+{
+  return *(CCTK_INT8 const*)&x;
+}
 #define k8lfalse k8l2r(0)
 #define k8ltrue k8l2r(1)
 #define k8lnot(x) k8l2r(!k8r2l(x))
@@ -130,3 +128,8 @@
 #define k8cmpge(x,y) k8l2r((x)>=(y))
 #define k8cmplt(x,y) k8l2r((x)<(y))
 #define k8cmple(x,y) k8l2r((x)<=(y))
+
+static inline CCTK_REAL8_VEC k8sgn(CCTK_REAL8_VEC const x)
+{
+  return x==(CCTK_REAL8)0.0 ? (CCTK_REAL8)0.0 : k8copysign((CCTK_REAL8)1.0, x);
+}
diff --git a/src/vectors.h b/src/vectors.h
index c87446e..08b9f91 100644
--- a/src/vectors.h
+++ b/src/vectors.h
@@ -1,3 +1,5 @@
+// -*-C++-*-
+
 #ifndef VECTORS_H
 #define VECTORS_H
 
@@ -5,23 +7,26 @@
 
+#define vec_static_assert(x) namespace { typedef int vsa[(x) ? 1 : -1]; }
+
+
+
 #if VECTORISE
 
-/* TODO: support AVX */
-# if defined(__SSE__) // Intel SSE
+# if defined(__AVX__) // Intel AVX
+# include "vectors-4-AVX.h"
+# elif defined(__SSE__) // Intel SSE
 # include "vectors-4-SSE.h"
 # elif defined(__ALTIVEC__) // Power Altivec
 # include "vectors-4-Altivec.h"
 # endif
 
-# if defined(__AVX__) // Intel AVX
+# if defined(__MIC__) // Intel MIC
+# include "vectors-8-MIC.h"
+# elif defined(__AVX__) && !defined(DISABLE_AVX) // Intel AVX
 # include "vectors-8-AVX.h"
 # elif defined(__SSE2__) // Intel SSE2
-# if VECTORISE_EMULATE_AVX
-# include "vectors-8-AVX.h"
-# else
-# include "vectors-8-SSE2.h"
-# endif
+# include "vectors-8-SSE2.h"
 # elif defined(__bgq__) && defined(__VECTOR4DOUBLE__) // Blue Gene/Q QPX
 # include "vectors-8-QPX.h"
 # elif defined(__ALTIVEC__) && defined(_ARCH_PWR7) // Power VSX
@@ -33,10 +38,10 @@
 #endif
 
 // Default implementation, do not vectorise
-#if ! defined(CCTK_REAL4_VEC_SIZE)
+#ifndef CCTK_REAL4_VEC_SIZE
 # include "vectors-4-default.h"
 #endif
-#if ! defined(CCTK_REAL8_VEC_SIZE)
+#ifndef CCTK_REAL8_VEC_SIZE
 # include "vectors-8-default.h"
 #endif
 
@@ -130,9 +135,9 @@
 # define CCTK_REAL_VEC CCTK_REAL8_VEC
 # define CCTK_REAL_VEC_SIZE CCTK_REAL8_VEC_SIZE
 
-# define CCTK_INTEGER CCTK_INTEGER8
+//# define CCTK_INTEGER CCTK_INTEGER8
 # define CCTK_BOOLEAN CCTK_BOOLEAN8
-# define CCTK_INTEGER_VEC CCTK_INTEGER8_VEC
+//# define CCTK_INTEGER_VEC CCTK_INTEGER8_VEC
 # define CCTK_BOOLEAN_VEC CCTK_BOOLEAN8_VEC
 
 # define vec_set1 vec8_set1
@@ -241,43 +246,54 @@
 #ifdef __cplusplus
 
+#include <cstddef>
+
 template<typename T> struct vecprops {
   typedef T scalar_t;
   typedef T vector_t;
-  static inline int size()
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  std::size_t size()
   {
     return 1;
   }
-  static inline vector_t load (scalar_t const& a)
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  vector_t load (scalar_t const& a)
   {
     return a;
   }
-  static inline vector_t loadu (scalar_t const& a)
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  vector_t loadu (scalar_t const& a)
   {
     return a;
  }
-  static inline scalar_t elt (vector_t const& x, int const d)
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  scalar_t elt (vector_t const& x, std::ptrdiff_t const d)
   {
     return x;
   }
-  static inline vector_t neg (vector_t const& x)
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  vector_t neg (vector_t const& x)
   {
     return -x;
   }
-  static inline vector_t add (vector_t const& x, vector_t const& y)
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  vector_t add (vector_t const& x, vector_t const& y)
   {
     return x+y;
   }
-  static inline vector_t sub (vector_t const& x, vector_t const& y)
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  vector_t sub (vector_t const& x, vector_t const& y)
   {
     return x-y;
   }
-  static inline vector_t mul (vector_t const& x, vector_t const& y)
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  vector_t mul (vector_t const& x, vector_t const& y)
   {
     return x*y;
   }
-  static inline vector_t div (vector_t const& x, vector_t const& y)
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  vector_t div (vector_t const& x, vector_t const& y)
   {
     return x/y;
   }
@@ -287,39 +303,48 @@
 template<> struct vecprops<CCTK_REAL4> {
   typedef CCTK_REAL4 scalar_t;
   typedef CCTK_REAL4_VEC vector_t;
-  static inline int size()
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  int size()
   {
     return CCTK_REAL4_VEC_SIZE;
   }
-  static inline vector_t load (scalar_t const& a)
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  vector_t load (scalar_t const& a)
   {
     return vec4_load(a);
   }
-  static inline vector_t loadu (scalar_t const& a)
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  vector_t loadu (scalar_t const& a)
   {
     return vec4_loadu(a);
   }
-  static inline scalar_t elt (vector_t const& x, int const d)
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  scalar_t elt (vector_t const& x, int const d)
   {
     return vec4_elt(x,d);
   }
-  static inline vector_t neg (vector_t const& x)
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  vector_t neg (vector_t const& x)
   {
     return k4neg(x);
   }
-  static inline vector_t add (vector_t const& x, vector_t const& y)
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  vector_t add (vector_t const& x, vector_t const& y)
   {
     return k4add(x,y);
   }
-  static inline vector_t sub (vector_t const& x, vector_t const& y)
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  vector_t sub (vector_t const& x, vector_t const& y)
  {
     return k4sub(x,y);
   }
-  static inline vector_t mul (vector_t const& x, vector_t const& y)
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  vector_t mul (vector_t const& x, vector_t const& y)
   {
     return k4mul(x,y);
   }
-  static inline vector_t div (vector_t const& x, vector_t const& y)
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  vector_t div (vector_t const& x, vector_t const& y)
   {
     return k4div(x,y);
   }
 };
 
@@ -329,44 +354,143 @@
 template<> struct vecprops<CCTK_REAL8> {
   typedef CCTK_REAL8 scalar_t;
   typedef CCTK_REAL8_VEC vector_t;
-  static inline int size()
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  int size()
   {
     return CCTK_REAL8_VEC_SIZE;
   }
-  static inline vector_t load (scalar_t const& a)
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  vector_t load (scalar_t const& a)
   {
     return vec8_load(a);
   }
-  static inline vector_t loadu (scalar_t const& a)
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  vector_t loadu (scalar_t const& a)
   {
     return vec8_loadu(a);
   }
-  static inline scalar_t elt (vector_t const& x, int const d)
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  scalar_t elt (vector_t const& x, int const d)
   {
     return vec8_elt(x,d);
   }
-  static inline vector_t neg (vector_t const& x)
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  vector_t neg (vector_t const& x)
   {
     return k8neg(x);
   }
-  static inline vector_t add (vector_t const& x, vector_t const& y)
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  vector_t add (vector_t const& x, vector_t const& y)
   {
     return k8add(x,y);
   }
-  static inline vector_t sub (vector_t const& x, vector_t const& y)
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  vector_t sub (vector_t const& x, vector_t const& y)
   {
     return k8sub(x,y);
   }
-  static inline vector_t mul (vector_t const& x, vector_t const& y)
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  vector_t mul (vector_t const& x, vector_t const& y)
   {
     return k8mul(x,y);
   }
-  static inline vector_t div (vector_t const& x, vector_t const& y)
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  vector_t div (vector_t const& x, vector_t const& y)
   {
     return k8div(x,y);
   }
 };
+template<typename T>
+struct vectype {
+private:
+  typedef vecprops<T> props;
+public:
+  typedef typename props::vector_t vector_t;
+  typedef typename props::scalar_t scalar_t;
+  vector_t v;
+  vectype() { }
+  vectype(vectype const& x): v(x.v) { }
+  vectype(vector_t const& x): v(x) { }
+  operator vector_t() const { return v; }
+  vectype& operator=(vectype const& x) { v=x.v; return *this; }
+
+  inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  std::size_t size() const {
+    return props::size();
+  }
+
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  vectype load(scalar_t const& a)
+  {
+    return props::load(a);
+  }
+  static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+  vectype loadu(scalar_t const& a)
+  {
+    return props::loadu(a);
+  }
+
+  inline
CCTK_ATTRIBUTE_ALWAYS_INLINE + scalar_t elt(std::ptrdiff_t const d) const + { + return props::elt(*this, d); + } + + inline CCTK_ATTRIBUTE_ALWAYS_INLINE + vectype operator+() const + { + return *this; + } + inline CCTK_ATTRIBUTE_ALWAYS_INLINE + vectype operator-() const + { + return props::neg(*this); + } + + inline CCTK_ATTRIBUTE_ALWAYS_INLINE + vectype operator+(vectype const& x) const + { + return props::add(*this, x); + } + inline CCTK_ATTRIBUTE_ALWAYS_INLINE + vectype operator-(vectype const& x) const + { + return props::sub(*this, x); + } + inline CCTK_ATTRIBUTE_ALWAYS_INLINE + vectype operator*(vectype const& x) const + { + return props::mul(*this, x); + } + inline CCTK_ATTRIBUTE_ALWAYS_INLINE + vectype operator/(vectype const& x) const + { + return props::div(*this, x); + } + + inline CCTK_ATTRIBUTE_ALWAYS_INLINE + vectype& operator+=(vectype const& x) + { + return *this = *this+x; + } + inline CCTK_ATTRIBUTE_ALWAYS_INLINE + vectype& operator-=(vectype const& x) + { + return *this = *this-x; + } + inline CCTK_ATTRIBUTE_ALWAYS_INLINE + vectype& operator*=(vectype const& x) + { + return *this = *this*x; + } + inline CCTK_ATTRIBUTE_ALWAYS_INLINE + vectype& operator/=(vectype const& x) + { + return *this = *this/x; + } +}; + #endif @@ -383,6 +507,18 @@ struct vecprops { # undef ToReal # define ToReal(x) (vec_set1(CCTK_REAL(x))) +# undef IfThen +# ifdef __PGI +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL_VEC vec_IfThen(CCTK_BOOLEAN x, CCTK_REAL_VEC y, CCTK_REAL_VEC z) +{ + if (x) return y; else return z; +} +# define IfThen(x,y,z) vec_IfThen(x,y,z) +# else +# define IfThen(x,y,z) ((x) ? CCTK_REAL_VEC(y) : CCTK_REAL_VEC(z)) +# endif + # undef KRANC_GFOFFSET3D # define KRANC_GFOFFSET3D(var,i,j,k) \ vec_loadu_maybe3((i),(j),(k), \ -- cgit v1.2.3
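
Note on the SSE2 k8ifthen in vectors-8-SSE2.h above: the #elif 0 variant builds
a full-width mask from the sign bit of the condition because SSE2 has no 64-bit
arithmetic right shift (_mm_srai_epi64); it computes srli(x,63)-1 instead.  A
minimal stand-alone sketch of that idiom follows; the name select_pd and the
use of the cast intrinsics instead of the patch's pointer casts are choices
made here, not the thorn's code:

    #include <emmintrin.h>

    // Per lane: return y where the sign bit of x is set, else z.
    static __m128d select_pd(__m128d x, __m128d y, __m128d z)
    {
      __m128i x_int = _mm_castpd_si128(x);
      // srli(x,63) yields 0 or 1 per lane; subtracting 1 turns that into
      // all-ones (sign clear) or all-zeros (sign set), i.e. ~mask.
      __m128i imask_int = _mm_sub_epi64(_mm_srli_epi64(x_int, 63),
                                        _mm_set1_epi64x(1));
      __m128d imask = _mm_castsi128_pd(imask_int);
      // (z & ~mask) | (y & mask)
      return _mm_or_pd(_mm_and_pd(imask, z), _mm_andnot_pd(imask, y));
    }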
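
Note on the default (non-vectorised) implementation in vectors-8-default.h:
booleans are CCTK_REAL8 values whose bit pattern is set via k8l2r/k8r2l, and
k8sgn is built from copysign.  A minimal scalar sketch of the same idea; the
names are illustrative only, and memcpy is used here instead of the pointer
casts in the patch to sidestep strict-aliasing concerns:

    #include <cmath>
    #include <cstring>

    // Reinterpret an integer bit pattern as a double, as k8l2r does.
    static inline double int_bits_to_real(long long b)
    {
      double r;
      std::memcpy(&r, &b, sizeof r);
      return r;
    }

    // Sign function: 0 for 0, otherwise +1 or -1 carrying the sign of x.
    static inline double real_sgn(double x)
    {
      return x == 0.0 ? 0.0 : std::copysign(1.0, x);
    }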
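
Note on the vec_static_assert macro added to vectors.h: it relies on the fact
that an array type with a negative size is ill-formed, so a failing condition
stops compilation.  A stand-alone sketch of the same idiom (my_static_assert
is a made-up name):

    #define my_static_assert(x) \
      namespace { typedef int my_static_assert_failed[(x) ? 1 : -1]; }

    my_static_assert(sizeof(double) == 8);    // compiles
    // my_static_assert(sizeof(double) == 4); // would not compile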
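
Note on the vectype<T> wrapper added to vectors.h: it forwards the overloaded
operators to the vecprops<T> traits, so simple loops can be written against a
single type.  A usage sketch under the assumptions that vectors.h is included,
the arrays are suitably aligned, and the length is a multiple of the vector
size; add_arrays is a made-up example, and since vectype has no store member
in this patch the result is written back element by element:

    #include <cstddef>

    template<typename T>
    void add_arrays(T const* a, T const* b, T* c, std::ptrdiff_t n)
    {
      typedef vectype<T> V;
      std::ptrdiff_t const vs = vecprops<T>::size();
      for (std::ptrdiff_t i = 0; i < n; i += vs) {
        V const x = V::load(a[i]);   // aligned load of one vector's worth
        V const y = V::load(b[i]);
        V const z = x + y;           // forwards to k4add/k8add (or scalar +)
        for (std::ptrdiff_t d = 0; d < vs; ++d)
          c[i+d] = z.elt(d);
      }
    }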