author    eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a>  2013-01-16 20:17:39 +0000
committer eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a>  2013-01-16 20:17:39 +0000
commit    f4e59032c42c1df52717d2663760072cd0510f30 (patch)
tree      e6d3021b3fb440b20087655844f013bee9b3b522
parent    33e3cc3261b81b28f83a6e4d41d1ae97944dfa49 (diff)
Major update
Disable AVX emulation.
Set default for streaming stores to "no".
Correct QPX vectorisation (IBM Blue Gene/Q).
Add MIC vectorisation (Intel Xeon Phi).
Convert SSE and AVX vectorisation to using inline functions instead of macros for code clarity.
Define CCTK_BOOLEAN, CCTK_INTEGER and CCTK_BOOLEAN_VEC, CCTK_INTEGER_VEC to make boolean and integer vectors explicit.

git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@77 105869f7-3296-0410-a4ea-f4349344b45a
-rw-r--r--  configuration.ccl                 3
-rw-r--r--  configure.sh                     14
-rw-r--r--  src/avxintrin_emu.h            1061
-rw-r--r--  src/macros/vectors-4-SSE.h      457
-rw-r--r--  src/macros/vectors-4-default.h  134
-rw-r--r--  src/macros/vectors-8-AVX.h      325
-rw-r--r--  src/macros/vectors-8-SSE2.h     427
-rw-r--r--  src/macros/vectors-8-default.h  132
-rw-r--r--  src/vectors-4-AVX.h             659
-rw-r--r--  src/vectors-4-SSE.h             938
-rw-r--r--  src/vectors-4-default.h           3
-rw-r--r--  src/vectors-8-AVX.h             752
-rw-r--r--  src/vectors-8-DoubleHummer.h      4
-rw-r--r--  src/vectors-8-MIC.h             652
-rw-r--r--  src/vectors-8-QPX.h              17
-rw-r--r--  src/vectors-8-SSE2.h            831
-rw-r--r--  src/vectors-8-VSX.h               4
-rw-r--r--  src/vectors-8-default.h          43
-rw-r--r--  src/vectors.h                   214
19 files changed, 4675 insertions, 1995 deletions
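
The macro-to-inline-function conversion and the new CCTK_BOOLEAN*/CCTK_INTEGER* names announced in the commit message follow the pattern sketched below. This is an illustration only, using the k4 names that appear in src/macros/vectors-4-SSE.h further down; the inline-function counterpart is paraphrased, not copied from src/vectors-4-SSE.h, and the typedefs stand in for the real Cactus type definitions.

    // Sketch (not verbatim thorn code): the same SSE operation written as a
    // macro (old style, kept under src/macros/) and as an inline function
    // (new style, used in the non-macro headers), plus an explicitly typed
    // boolean vector result for comparisons.
    #include <xmmintrin.h>

    typedef float  CCTK_REAL4;
    typedef __m128 CCTK_REAL4_VEC;
    typedef __m128 CCTK_BOOLEAN4_VEC;   // boolean vectors made explicit

    // old style: macro, no type checking
    #define k4add_macro(x,y) (_mm_add_ps(x,y))

    // new style: inline function, type-checked and easier to debug
    static inline CCTK_REAL4_VEC k4add(CCTK_REAL4_VEC const x,
                                       CCTK_REAL4_VEC const y)
    {
      return _mm_add_ps(x, y);
    }

    // comparisons now advertise a boolean vector type instead of a
    // plain CCTK_REAL4_VEC
    static inline CCTK_BOOLEAN4_VEC k4cmplt(CCTK_REAL4_VEC const x,
                                            CCTK_REAL4_VEC const y)
    {
      return _mm_cmplt_ps(x, y);
    }
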
diff --git a/configuration.ccl b/configuration.ccl
index 9468abc..e2963f0 100644
--- a/configuration.ccl
+++ b/configuration.ccl
@@ -10,8 +10,7 @@ PROVIDES Vectors
VECTORISE_ALWAYS_USE_UNALIGNED_LOADS \
VECTORISE_ALWAYS_USE_ALIGNED_LOADS \
VECTORISE_INLINE \
- VECTORISE_STREAMING_STORES \
- VECTORISE_EMULATE_AVX
+ VECTORISE_STREAMING_STORES
}
REQUIRES Vectors
diff --git a/configure.sh b/configure.sh
index 1555570..96f34cd 100644
--- a/configure.sh
+++ b/configure.sh
@@ -71,7 +71,7 @@ esac
case $(echo "x$VECTORISE_STREAMING_STORES" | tr '[:upper:]' '[:lower:]') in
(xyes) VECTORISE_STREAMING_STORES=1 ;;
(xno) VECTORISE_STREAMING_STORES=0 ;;
- (x) VECTORISE_STREAMING_STORES=1 ;; # default
+ (x) VECTORISE_STREAMING_STORES=0 ;; # default
(*) echo "BEGIN ERROR"
echo "Illegal value of option VECTORISE_STREAMING_STORES"
echo "END ERROR"
@@ -88,16 +88,6 @@ case $(echo "x$VECTORISE_INLINE" | tr '[:upper:]' '[:lower:]') in
exit 1
esac
-case $(echo "x$VECTORISE_EMULATE_AVX" | tr '[:upper:]' '[:lower:]') in
- (xyes) VECTORISE_EMULATE_AVX=1 ;;
- (xno) VECTORISE_EMULATE_AVX=0 ;;
- (x) VECTORISE_EMULATE_AVX=0 ;; # default
- (*) echo "BEGIN ERROR"
- echo "Illegal value of option VECTORISE_EMULATE_AVX"
- echo "END ERROR"
- exit 1
-esac
-
################################################################################
@@ -112,7 +102,6 @@ echo "VECTORISE_ALWAYS_USE_UNALIGNED_LOADS $VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
echo "VECTORISE_ALWAYS_USE_ALIGNED_LOADS $VECTORISE_ALWAYS_USE_ALIGNED_LOADS"
echo "VECTORISE_INLINE $VECTORISE_INLINE"
echo "VECTORISE_STREAMING_STORES $VECTORISE_STREAMING_STORES"
-echo "VECTORISE_EMULATE_AVX $VECTORISE_EMULATE_AVX"
echo "END DEFINE"
echo "BEGIN MAKE_DEFINITION"
@@ -122,5 +111,4 @@ echo "VECTORISE_ALWAYS_USE_UNALIGNED_LOADS = $VECTORISE_ALWAYS_USE_UNALIGNED_LOA
echo "VECTORISE_ALWAYS_USE_ALIGNED_LOADS = $VECTORISE_ALWAYS_USE_ALIGNED_LOADS"
echo "VECTORISE_INLINE = $VECTORISE_INLINE"
echo "VECTORISE_STREAMING_STORES = $VECTORISE_STREAMING_STORES"
-echo "VECTORISE_EMULATE_AVX = $VECTORISE_EMULATE_AVX"
echo "END MAKE_DEFINITION"
diff --git a/src/avxintrin_emu.h b/src/avxintrin_emu.h
deleted file mode 100644
index 3097cd7..0000000
--- a/src/avxintrin_emu.h
+++ /dev/null
@@ -1,1061 +0,0 @@
-/*
- Copyright (c) 2010, Intel Corporation. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
- * Neither the name of Intel Corporation nor the names of its contributors may
- be used to endorse or promote products derived from this software without
- specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
- THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-/***
-
- Provide feedback to: maxim.locktyukhin intel com, phil.j.kerly intel com
-
- Version 1.0 - Initial release.
-
- This AVX intrinsics emulation header file designed to work with Intel C/C++
- as well as GCC compilers.
-
- Known Issues and limitations:
-
- - does not support immediate values higher than 0x7 for _mm[256]_cmp_[ps|pd]
- intrinsics, UD2 instruction will be generated instead
-
- - -O0 optimization level may _sometimes_ result with compile time errors due
- to failed forced inline and compiler not being able to generate instruction
- with constant immediate operand becasue of it, compiling with -O1 and/or
- -finline-functions should help.
-
-***/
-
-
-#ifndef __EMU_M256_AVXIMMINTRIN_EMU_H__
-#define __EMU_M256_AVXIMMINTRIN_EMU_H__
-
-#ifdef __GNUC__
-
-#ifdef __SSE__
-#include <xmmintrin.h>
-#endif
-
-#ifdef __SSE2__
-#include <emmintrin.h>
-#endif
-
-#ifdef __SSE3__
-#include <pmmintrin.h>
-#endif
-
-#ifdef __SSSE3__
-#include <tmmintrin.h>
-#endif
-
-#if defined (__SSE4_2__) || defined (__SSE4_1__)
-#include <smmintrin.h>
-#endif
-
-#if defined (__AES__) || defined (__PCLMUL__)
-#include <wmmintrin.h>
-#endif
-
-#else
-
-#include <wmmintrin.h>
-
-#endif
-
-#pragma message (" --- Intel remark: AVX intrinsics are emulated with SSE ---")
-
-/*
- * Intel(R) AVX compiler intrinsics.
- */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * This is an emulation of Intel AVX
- */
-
-#if defined( _MSC_VER ) || defined( __INTEL_COMPILER )
- #define __EMU_M256_ALIGN( a ) __declspec(align(a))
- #define __emu_inline __forceinline
- #define __emu_int64_t __int64
-#elif defined( __GNUC__ )
- #define __EMU_M256_ALIGN( a ) __attribute__((__aligned__(a)))
- #define __emu_inline __inline __attribute__((__always_inline__))
- #define __emu_int64_t long long
-#else
- #error "unsupported platform"
-#endif
-
-typedef union __EMU_M256_ALIGN(32) __emu__m256
-{
- float __emu_arr[8];
- __m128 __emu_m128[2];
-} __emu__m256;
-
-typedef union __EMU_M256_ALIGN(32) __emu__m256d
-{
- double __emu_arr[4];
- __m128d __emu_m128[2];
-} __emu__m256d;
-
-typedef union __EMU_M256_ALIGN(32) __emu__m256i
-{
- int __emu_arr[8];
- __m128i __emu_m128[2];
-} __emu__m256i;
-
-static __emu_inline __emu__m256 __emu_set_m128( const __m128 arr[] ) { __emu__m256 ret; ret.__emu_m128[0] = arr[0]; ret.__emu_m128[1] = arr[1]; return (ret); }
-static __emu_inline __emu__m256d __emu_set_m128d( const __m128d arr[] ) { __emu__m256d ret; ret.__emu_m128[0] = arr[0]; ret.__emu_m128[1] = arr[1]; return (ret); }
-static __emu_inline __emu__m256i __emu_set_m128i( const __m128i arr[] ) { __emu__m256i ret; ret.__emu_m128[0] = arr[0]; ret.__emu_m128[1] = arr[1]; return (ret); }
-
-
-#define __EMU_M256_IMPL_M1( type, func ) \
-static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1 ) \
-{ __emu##type res; \
- res.__emu_m128[0] = _mm_##func( m256_param1.__emu_m128[0] ); \
- res.__emu_m128[1] = _mm_##func( m256_param1.__emu_m128[1] ); \
- return ( res ); \
-}
-
-#define __EMU_M256_IMPL_M1_RET( ret_type, type, func ) \
-static __emu_inline __emu##ret_type __emu_mm256_##func( __emu##type m256_param1 ) \
-{ __emu##ret_type res; \
- res.__emu_m128[0] = _mm_##func( m256_param1.__emu_m128[0] ); \
- res.__emu_m128[1] = _mm_##func( m256_param1.__emu_m128[1] ); \
- return ( res ); \
-}
-
-#define __EMU_M256_IMPL_M1_RET_NAME( ret_type, type, func, name ) \
- static __emu_inline __emu##ret_type __emu_mm256_##name( __emu##type m256_param1 ) \
-{ __emu##ret_type res; \
- res.__emu_m128[0] = _mm_##func( m256_param1.__emu_m128[0] ); \
- res.__emu_m128[1] = _mm_##func( m256_param1.__emu_m128[1] ); \
- return ( res ); \
-}
-
-#define __EMU_M256_IMPL_M1_LH( type, type_128, func ) \
-static __emu_inline __emu##type __emu_mm256_##func( type_128 m128_param ) \
-{ __emu##type res; \
- res.__emu_m128[0] = _mm_##func( m128_param ); \
- __m128 m128_param_high = _mm_movehl_ps( *(__m128*)&m128_param, *(__m128*)&m128_param ); \
- res.__emu_m128[1] = _mm_##func( *(type_128*)&m128_param_high ); \
- return ( res ); \
-}
-
-#define __EMU_M256_IMPL_M1_HL( type_128, type, func ) \
-static __emu_inline type_128 __emu_mm256_##func( __emu##type m256_param1 ) \
-{ type_128 res, tmp; \
- res = _mm_##func( m256_param1.__emu_m128[0] ); \
- tmp = _mm_##func( m256_param1.__emu_m128[1] ); \
- *(((__emu_int64_t*)&res)+1) = *(__emu_int64_t*)&tmp; \
- return ( res ); \
-}
-
-#define __EMU_M256_IMPL_M1P_DUP( type, type_param, func ) \
-static __emu_inline __emu##type __emu_mm256_##func( type_param param ) \
-{ __emu##type res; \
- res.__emu_m128[0] = _mm_##func( param ); \
- res.__emu_m128[1] = _mm_##func( param ); \
- return ( res ); \
-}
-
-#define __EMU_M256_IMPL_M1I_DUP( type, func ) \
- static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, const int param2 ) \
-{ __emu##type res; \
- res.__emu_m128[0] = _mm_##func( m256_param1.__emu_m128[0], param2 ); \
- res.__emu_m128[1] = _mm_##func( m256_param1.__emu_m128[1], param2 ); \
- return ( res ); \
-}
-
-#define __EMU_M256_IMPL2_M1I_DUP( type, func ) \
-static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, const int param2 ) \
-{ __emu##type res; \
- res.__emu_m128[0] = __emu_mm_##func( m256_param1.__emu_m128[0], param2 ); \
- res.__emu_m128[1] = __emu_mm_##func( m256_param1.__emu_m128[1], param2 ); \
- return ( res ); \
-}
-
-#define __EMU_M256_IMPL2_M1I_SHIFT( type, func, shift_for_hi ) \
-static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, const int param2 ) \
-{ __emu##type res; \
- res.__emu_m128[0] = __emu_mm_##func( m256_param1.__emu_m128[0], param2 & ((1<<shift_for_hi)-1) ); \
- res.__emu_m128[1] = __emu_mm_##func( m256_param1.__emu_m128[1], param2 >> shift_for_hi); \
- return ( res ); \
-}
-
-#define __EMU_M256_IMPL_M2( type, func ) \
-static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, __emu##type m256_param2 ) \
-{ __emu##type res; \
- res.__emu_m128[0] = _mm_##func( m256_param1.__emu_m128[0], m256_param2.__emu_m128[0] ); \
- res.__emu_m128[1] = _mm_##func( m256_param1.__emu_m128[1], m256_param2.__emu_m128[1] ); \
- return ( res ); \
-}
-
-#define __EMU_M256_IMPL2_M2T( type, type_2, func ) \
-static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, __emu##type_2 m256_param2 ) \
-{ __emu##type res; \
- res.__emu_m128[0] = __emu_mm_##func( m256_param1.__emu_m128[0], m256_param2.__emu_m128[0] ); \
- res.__emu_m128[1] = __emu_mm_##func( m256_param1.__emu_m128[1], m256_param2.__emu_m128[1] ); \
- return ( res ); \
-}
-
-#define __EMU_M256_IMPL_M2I_DUP( type, func ) \
-static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, __emu##type m256_param2, const int param3 ) \
-{ __emu##type res; \
- res.__emu_m128[0] = _mm_##func( m256_param1.__emu_m128[0], m256_param2.__emu_m128[0], param3 ); \
- res.__emu_m128[1] = _mm_##func( m256_param1.__emu_m128[1], m256_param2.__emu_m128[1], param3 ); \
- return ( res ); \
-}
-
-#define __EMU_M256_IMPL2_M2I_DUP( type, func ) \
-static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, __emu##type m256_param2, const int param3 ) \
-{ __emu##type res; \
- res.__emu_m128[0] = __emu_mm_##func( m256_param1.__emu_m128[0], m256_param2.__emu_m128[0], param3 ); \
- res.__emu_m128[1] = __emu_mm_##func( m256_param1.__emu_m128[1], m256_param2.__emu_m128[1], param3 ); \
- return ( res ); \
-}
-
-#define __EMU_M256_IMPL_M2I_SHIFT( type, func, shift_for_hi ) \
-static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, __emu##type m256_param2, const int param3 ) \
-{ __emu##type res; \
- res.__emu_m128[0] = _mm_##func( m256_param1.__emu_m128[0], m256_param2.__emu_m128[0], param3 & ((1<<shift_for_hi)-1) ); \
- res.__emu_m128[1] = _mm_##func( m256_param1.__emu_m128[1], m256_param2.__emu_m128[1], param3 >> shift_for_hi ); \
- return ( res ); \
-}
-
-#define __EMU_M256_IMPL_M3( type, func ) \
-static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, __emu##type m256_param2, __emu##type m256_param3 ) \
-{ __emu##type res; \
- res.__emu_m128[0] = _mm_##func( m256_param1.__emu_m128[0], m256_param2.__emu_m128[0], m256_param3.__emu_m128[0] ); \
- res.__emu_m128[1] = _mm_##func( m256_param1.__emu_m128[1], m256_param2.__emu_m128[1], m256_param3.__emu_m128[1] ); \
- return ( res ); \
-}
-
-
-/*
- * Compare predicates for scalar and packed compare intrinsics
- */
-#define _CMP_EQ_OQ 0x00 /* Equal (ordered, nonsignaling) */
-#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */
-#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */
-#define _CMP_UNORD_Q 0x03 /* Unordered (nonsignaling) */
-#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, nonsignaling) */
-#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
-#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
-#define _CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */
-
-#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */
-#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unordered, signaling) */
-#define _CMP_NGT_US 0x0A /* Not-greater-than (unordered, signaling) */
-#define _CMP_FALSE_OQ 0x0B /* False (ordered, nonsignaling) */
-#define _CMP_NEQ_OQ 0x0C /* Not-equal (ordered, non-signaling) */
-#define _CMP_GE_OS 0x0D /* Greater-than-or-equal (ordered, signaling) */
-#define _CMP_GT_OS 0x0E /* Greater-than (ordered, signaling) */
-#define _CMP_TRUE_UQ 0x0F /* True (unordered, non-signaling) */
-#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */
-#define _CMP_LT_OQ 0x11 /* Less-than (ordered, nonsignaling) */
-#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, nonsignaling) */
-#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */
-#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */
-#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, nonsignaling) */
-#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unordered, nonsignaling) */
-#define _CMP_ORD_S 0x17 /* Ordered (signaling) */
-#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */
-#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unordered, nonsignaling) */
-#define _CMP_NGT_UQ 0x1A /* Not-greater-than (unordered, nonsignaling) */
-#define _CMP_FALSE_OS 0x1B /* False (ordered, signaling) */
-#define _CMP_NEQ_OS 0x1C /* Not-equal (ordered, signaling) */
-#define _CMP_GE_OQ 0x1D /* Greater-than-or-equal (ordered, nonsignaling) */
-#define _CMP_GT_OQ 0x1E /* Greater-than (ordered, nonsignaling) */
-#define _CMP_TRUE_US 0x1F /* True (unordered, signaling) */
-
-__EMU_M256_IMPL_M2( __m256d, add_pd );
-__EMU_M256_IMPL_M2( __m256, add_ps );
-
-__EMU_M256_IMPL_M2( __m256d, addsub_pd );
-__EMU_M256_IMPL_M2( __m256, addsub_ps );
-
-__EMU_M256_IMPL_M2( __m256d, and_pd );
-__EMU_M256_IMPL_M2( __m256, and_ps );
-
-__EMU_M256_IMPL_M2( __m256d, andnot_pd );
-__EMU_M256_IMPL_M2( __m256, andnot_ps );
-
-__EMU_M256_IMPL_M2( __m256d, div_pd );
-__EMU_M256_IMPL_M2( __m256, div_ps );
-
-__EMU_M256_IMPL_M2( __m256d, hadd_pd );
-__EMU_M256_IMPL_M2( __m256, hadd_ps );
-
-__EMU_M256_IMPL_M2( __m256d, hsub_pd );
-__EMU_M256_IMPL_M2( __m256, hsub_ps );
-
-__EMU_M256_IMPL_M2( __m256d, max_pd );
-__EMU_M256_IMPL_M2( __m256, max_ps );
-
-__EMU_M256_IMPL_M2( __m256d, min_pd );
-__EMU_M256_IMPL_M2( __m256, min_ps );
-
-__EMU_M256_IMPL_M2( __m256d, mul_pd );
-__EMU_M256_IMPL_M2( __m256, mul_ps );
-
-__EMU_M256_IMPL_M2( __m256d, or_pd );
-__EMU_M256_IMPL_M2( __m256, or_ps );
-
-__EMU_M256_IMPL_M2I_SHIFT( __m256d, shuffle_pd, 2 );
-__EMU_M256_IMPL_M2I_DUP( __m256, shuffle_ps );
-
-__EMU_M256_IMPL_M2( __m256d, sub_pd );
-__EMU_M256_IMPL_M2( __m256, sub_ps );
-
-__EMU_M256_IMPL_M2( __m256d, xor_pd );
-__EMU_M256_IMPL_M2( __m256, xor_ps );
-
-#if defined (__SSE4_2__) || defined (__SSE4_1__)
-
-__EMU_M256_IMPL_M2I_SHIFT( __m256d, blend_pd, 2 );
-__EMU_M256_IMPL_M2I_SHIFT( __m256, blend_ps, 4 );
-
-__EMU_M256_IMPL_M3( __m256d, blendv_pd );
-__EMU_M256_IMPL_M3( __m256, blendv_ps );
-
-__EMU_M256_IMPL_M2I_DUP( __m256, dp_ps );
-
-__EMU_M256_IMPL_M1I_DUP( __m256d, round_pd );
-#define _mm256_ceil_pd(val) _mm256_round_pd((val), 0x0A);
-#define _mm256_floor_pd(val) _mm256_round_pd((val), 0x09);
-
-__EMU_M256_IMPL_M1I_DUP( __m256, round_ps );
-#define _mm256_ceil_ps(val) _mm256_round_ps((val), 0x0A);
-#define _mm256_floor_ps(val) _mm256_round_ps((val), 0x09);
-
-#define __emu_mm_test_impl( op, sfx, vec_type ) \
-static __emu_inline int __emu_mm_test##op##_##sfx(vec_type s1, vec_type s2) { \
- __m128d sign_bits_pd = _mm_castsi128_pd( _mm_set_epi32( 1 << 31, 0, 1 << 31, 0 ) ); \
- __m128 sign_bits_ps = _mm_castsi128_ps( _mm_set1_epi32( 1 << 31 ) ); \
- \
- s1 = _mm_and_##sfx( s1, sign_bits_##sfx ); \
- s2 = _mm_and_##sfx( s2, sign_bits_##sfx ); \
- return _mm_test##op##_si128( _mm_cast##sfx##_si128( s1 ), _mm_cast##sfx##_si128( s2 ) ); \
-}
-
-__emu_mm_test_impl( z, pd, __m128d );
-__emu_mm_test_impl( c, pd, __m128d );
-__emu_mm_test_impl( nzc, pd, __m128d );
-
-__emu_mm_test_impl( z, ps, __m128 );
-__emu_mm_test_impl( c, ps, __m128 );
-__emu_mm_test_impl( nzc, ps, __m128 );
-
-
-
-#define __emu_mm256_test_impl( prfx, op, sfx, sfx_impl, vec_type ) \
-static __emu_inline int __emu_mm256_test##op##_##sfx(vec_type s1, vec_type s2) { \
- int ret1 = prfx##_test##op##_##sfx_impl( s1.__emu_m128[0], s2.__emu_m128[0] ); \
- int ret2 = prfx##_test##op##_##sfx_impl( s1.__emu_m128[1], s2.__emu_m128[1] ); \
- return ( ret1 && ret2 ); \
-};
-
-__emu_mm256_test_impl( _mm, z, si256, si128, __emu__m256i );
-__emu_mm256_test_impl( _mm, c, si256, si128, __emu__m256i );
-__emu_mm256_test_impl( _mm, nzc, si256, si128, __emu__m256i );
-
-__emu_mm256_test_impl( __emu_mm, z, pd, pd, __emu__m256d );
-__emu_mm256_test_impl( __emu_mm, c, pd, pd, __emu__m256d );
-__emu_mm256_test_impl( __emu_mm, nzc, pd, pd, __emu__m256d );
-
-__emu_mm256_test_impl( __emu_mm, z, ps, ps, __emu__m256 );
-__emu_mm256_test_impl( __emu_mm, c, ps, ps, __emu__m256 );
-__emu_mm256_test_impl( __emu_mm, nzc, ps, ps, __emu__m256 );
-
-#endif
-
-#if defined( __GNUC__ ) && ( __GNUC__ == 4 ) && (__GNUC_MINOR__ < 4 )
-/* use macro implementation instead of inline functions to allow -O0 for GCC pre 4.4 */
-
-#pragma message ("Using macro for GCC <4.4" )
-
-#define __emu_mm_cmp_ps(m1, m2, predicate) \
-({ \
- __m128 res_ = (m1), m2_ = (m2); \
- if ( 7 < (unsigned)predicate ) __asm__ __volatile__ ( "ud2" : : : "memory" ); \
- __asm__ ( "cmpps %[pred_], %[m2_], %[res_]" : [res_] "+x" (res_) : [m2_] "xm" (m2_), [pred_] "i" (predicate) ); \
- res_; })
-
-#define __emu_mm256_cmp_ps(m1, m2, predicate) \
-({ \
- __emu__m256 res_ = (m1), m2_ = (m2); \
- if ( 7 < (unsigned)predicate ) __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */ \
- __asm__ ( "cmpps %[pred_], %[m2_], %[res_]" : [res_] "+x" (res_.__emu_m128[0]) : [m2_] "xm" (m2_.__emu_m128[0]), [pred_] "i" (predicate) ); \
- __asm__ ( "cmpps %[pred_], %[m2_], %[res_]" : [res_] "+x" (res_.__emu_m128[1]) : [m2_] "xm" (m2_.__emu_m128[1]), [pred_] "i" (predicate) ); \
- res_; })
-
-
-#define __emu_mm_cmp_pd(m1, m2, predicate) \
-({ \
- __m128 res_ = (m1), m2_ = (m2); \
- if ( 7 < (unsigned)predicate ) __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */ \
- __asm__ ( "cmppd %[pred_], %[m2_], %[res_]" : [res_] "+x" (res_) : [m2_] "xm" (m2_), [pred_] "i" (predicate) ); \
- res_; })
-
-#define __emu_mm256_cmp_pd(m1, m2, predicate) \
-({ \
- __emu__m256 res_ = (m1), m2_ = (m2); \
- if ( 7 < (unsigned)predicate ) __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */ \
- __asm__ ( "cmppd %[pred_], %[m2_], %[res_]" : [res_] "+x" (res_.__emu_m128[0]) : [m2_] "xm" (m2_.__emu_m128[0]), [pred_] "i" (predicate) ); \
- __asm__ ( "cmppd %[pred_], %[m2_], %[res_]" : [res_] "+x" (res_.__emu_m128[1]) : [m2_] "xm" (m2_.__emu_m128[1]), [pred_] "i" (predicate) ); \
- res_; })
-
-
-#define __emu_mm_cmp_ss(m1, m2, predicate) \
-({ \
- __m128 res_ = (m1), m2_ = (m2); \
- if ( 7 < (unsigned)predicate ) __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */ \
- __asm__ ( "cmpss %[pred_], %[m2_], %[res_]" : [res_] "+x" (res_) : [m2_] "xm" (m2_), [pred_] "i" (predicate) ); \
- res_; })
-
-#define __emu_mm_cmp_sd(m1, m2, predicate) \
-({ \
- __m128 res_ = (m1), m2_ = (m2); \
- if ( 7 < (unsigned)predicate ) __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */ \
- __asm__ ( "cmpsd %[pred_], %[m2_], %[res_]" : [res_] "+x" (res_) : [m2_] "xm" (m2_), [pred_] "i" (predicate) ); \
- res_; })
-
-
-
-#else /* __GNUC__==4 && __GNUC_MINOR__ <4 */
-
-
-static __emu_inline __m128 __emu_mm_cmp_ps(__m128 m1, __m128 m2, const int predicate)
-{
- __m128 res;
-
- if ( predicate >= 0 && predicate <= 7 ) {
- res = m1;
- __asm__ ( "cmpps %[pred_], %[m2_], %[res_]" : [res_] "+x" (res) : [m2_] "xm" (m2), [pred_] "i" (predicate) );
- } else {
- __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */
- }
-
- return ( res );
-}
-__EMU_M256_IMPL2_M2I_DUP( __m256, cmp_ps )
-
-static __emu_inline __m128d __emu_mm_cmp_pd(__m128d m1, __m128d m2, const int predicate)
-{
- __m128d res;
-
- if ( predicate >= 0 && predicate <= 7 ) {
- res = m1;
- __asm__ ( "cmppd %[pred_], %[m2_], %[res_]" : [res_] "+x" (res) : [m2_] "xm" (m2), [pred_] "i" (predicate) );
- } else {
- __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */
- }
-
- return ( res );
-}
-__EMU_M256_IMPL2_M2I_DUP( __m256d, cmp_pd )
-
-
-static __emu_inline __m128d __emu_mm_cmp_sd(__m128d m1, __m128d m2, const int predicate)
-{
- __m128d res;
-
- if ( predicate >= 0 && predicate <= 7 ) {
- res = m1;
- __asm__ ( "cmpsd %[pred_], %[m2_], %[res_]" : [res_] "+x" (res) : [m2_] "xm" (m2), [pred_] "i" (predicate) );
- } else {
- __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */
- }
-
- return ( res );
-}
-
-static __emu_inline __m128 __emu_mm_cmp_ss(__m128 m1, __m128 m2, const int predicate)
-{
- __m128 res;
-
- if ( predicate >= 0 && predicate <= 7 ) {
- res = m1;
- __asm__ ( "cmpss %[pred_], %[m2_], %[res_]" : [res_] "+x" (res) : [m2_] "xm" (m2), [pred_] "i" (predicate) );
- } else {
- __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */
- }
-
- return ( res );
-}
-
-#endif
-
-
-__EMU_M256_IMPL_M1_LH( __m256d, __m128i, cvtepi32_pd );
-__EMU_M256_IMPL_M1_RET( __m256, __m256i, cvtepi32_ps );
-__EMU_M256_IMPL_M1_HL( __m128, __m256d, cvtpd_ps );
-__EMU_M256_IMPL_M1_RET( __m256i, __m256, cvtps_epi32 );
-__EMU_M256_IMPL_M1_LH( __m256d, __m128, cvtps_pd );
-__EMU_M256_IMPL_M1_HL( __m128i, __m256d, cvttpd_epi32);
-__EMU_M256_IMPL_M1_HL( __m128i, __m256d, cvtpd_epi32);
-__EMU_M256_IMPL_M1_RET( __m256i, __m256, cvttps_epi32 );
-
-static __emu_inline __m128 __emu_mm256_extractf128_ps(__emu__m256 m1, const int offset) { return m1.__emu_m128[ offset ]; }
-static __emu_inline __m128d __emu_mm256_extractf128_pd(__emu__m256d m1, const int offset) { return m1.__emu_m128[ offset ]; }
-static __emu_inline __m128i __emu_mm256_extractf128_si256(__emu__m256i m1, const int offset) { return m1.__emu_m128[ offset ]; }
-
-static __emu_inline void __emu_mm256_zeroall(void) {}
-static __emu_inline void __emu_mm256_zeroupper(void) {}
-
-static __emu_inline __m128 __emu_mm_permutevar_ps(__m128 a, __m128i control)
-{
- int const* sel = (int const*)&control;
- float const* src = (float const*)&a;
- __EMU_M256_ALIGN(16) float dest[4];
- int i=0;
-
- for (; i<4; ++i)
- dest[i] = src[ 3 & sel[i] ];
-
- return ( *(__m128*)dest );
-}
-__EMU_M256_IMPL2_M2T( __m256, __m256i, permutevar_ps );
-
-static __emu_inline __m128 __emu_mm_permute_ps(__m128 a, int control) { return _mm_castsi128_ps( _mm_shuffle_epi32( *(__m128i*)&a, control ) ); }
-__EMU_M256_IMPL2_M1I_DUP( __m256, permute_ps );
-
-
-static __emu_inline __m128d __emu_mm_permutevar_pd(__m128d a, __m128i control)
-{
- __emu_int64_t const* sel = (__emu_int64_t const*)&control;
- double const* src = (double const*)&a;
- __EMU_M256_ALIGN(16) double dest[2];
- int i=0;
-
- for (; i<2; ++i)
- dest[i] = src[ (2 & sel[i]) >> 1 ];
-
- return ( *(__m128d*)dest );
-}
-__EMU_M256_IMPL2_M2T( __m256d, __m256i, permutevar_pd );
-
-static __emu_inline __m128d __emu_mm_permute_pd(__m128d a, int control)
-{
- double const* src = (double const*)&a;
- __EMU_M256_ALIGN(16) double dest[2];
- int i=0;
-
- for (; i<2; ++i)
- dest[i] = src[ 1 & (control >> i) ];
-
- return ( *(__m128d*)dest );
-}
-__EMU_M256_IMPL2_M1I_SHIFT( __m256d, permute_pd, 2 );
-
-
-#define __emu_mm256_permute2f128_impl( name, m128_type, m256_type ) \
-static __emu_inline m256_type name( m256_type m1, m256_type m2, int control) { \
- m256_type res; \
- __m128 zero = _mm_setzero_ps(); \
- const m128_type param[4] = { m1.__emu_m128[0], m1.__emu_m128[1], m2.__emu_m128[0], m2.__emu_m128[1] }; \
- res.__emu_m128[0] = (control & 8) ? *(m128_type*)&zero : param[ control & 0x3 ]; control >>= 4; \
- res.__emu_m128[1] = (control & 8) ? *(m128_type*)&zero : param[ control & 0x3 ]; \
- return ( res ); \
-}
-
-__emu_mm256_permute2f128_impl( __emu_mm256_permute2f128_ps, __m128, __emu__m256 );
-__emu_mm256_permute2f128_impl( __emu_mm256_permute2f128_pd, __m128d, __emu__m256d );
-__emu_mm256_permute2f128_impl( __emu_mm256_permute2f128_si256, __m128i, __emu__m256i );
-
-
-#define __emu_mm_broadcast_impl( name, res_type, type ) \
-static __emu_inline res_type name(type const *a) { \
- const size_t size = sizeof( res_type ) / sizeof( type );\
- __EMU_M256_ALIGN(32) type res[ size ]; \
- size_t i = 0; \
- for ( ; i < size; ++i ) \
- res[ i ] = *a; \
- return (*(res_type*)&res); \
-}
-
-__emu_mm_broadcast_impl( __emu_mm_broadcast_ss, __m128, float )
-__emu_mm_broadcast_impl( __emu_mm256_broadcast_ss, __emu__m256, float )
-
-__emu_mm_broadcast_impl( __emu_mm_broadcast_sd, __m128, double )
-__emu_mm_broadcast_impl( __emu_mm256_broadcast_sd, __emu__m256d, double )
-
-__emu_mm_broadcast_impl( __emu_mm256_broadcast_ps, __emu__m256, __m128 )
-__emu_mm_broadcast_impl( __emu_mm256_broadcast_pd, __emu__m256d, __m128d )
-
-
-static __emu_inline __emu__m256 __emu_mm256_insertf128_ps(__emu__m256 a, __m128 b, int offset) { a.__emu_m128[ offset ] = b; return a; }
-static __emu_inline __emu__m256d __emu_mm256_insertf128_pd(__emu__m256d a, __m128d b, int offset) { a.__emu_m128[ offset ] = b; return a; }
-static __emu_inline __emu__m256i __emu_mm256_insertf128_si256(__emu__m256i a, __m128i b, int offset) { a.__emu_m128[ offset ] = b; return a; }
-
-
-#define __emu_mm_load_impl( name, sfx, m256_sfx, m256_type, type_128, type ) \
-static __emu_inline __emu##m256_type __emu_mm256_##name##_##m256_sfx(const type* a) { \
- __emu##m256_type res; \
- res.__emu_m128[0] = _mm_##name##_##sfx( (const type_128 *)a ); \
- res.__emu_m128[1] = _mm_##name##_##sfx( (const type_128 *)(1+(const __m128 *)a) ); \
- return (res); \
-}
-
-#define __emu_mm_store_impl( name, sfx, m256_sfx, m256_type, type_128, type ) \
-static __emu_inline void __emu_mm256_##name##_##m256_sfx(type *a, __emu##m256_type b) { \
- _mm_##name##_##sfx( (type_128*)a, b.__emu_m128[0] ); \
- _mm_##name##_##sfx( (type_128*)(1+(__m128*)a), b.__emu_m128[1] ); \
-}
-
-__emu_mm_load_impl( load, pd, pd, __m256d, double, double );
-__emu_mm_store_impl( store, pd, pd, __m256d, double, double );
-
-__emu_mm_load_impl( load, ps, ps, __m256, float, float );
-__emu_mm_store_impl( store, ps, ps, __m256, float, float );
-
-__emu_mm_load_impl( loadu, pd, pd, __m256d, double, double );
-__emu_mm_store_impl( storeu, pd, pd, __m256d, double, double );
-
-__emu_mm_load_impl( loadu, ps, ps, __m256, float, float );
-__emu_mm_store_impl( storeu, ps, ps, __m256, float, float );
-
-__emu_mm_load_impl( load, si128, si256, __m256i, __m128i, __emu__m256i );
-__emu_mm_store_impl( store, si128, si256, __m256i, __m128i, __emu__m256i );
-
-__emu_mm_load_impl( loadu, si128, si256, __m256i, __m128i, __emu__m256i );
-__emu_mm_store_impl( storeu, si128, si256, __m256i, __m128i, __emu__m256i );
-
-
-#define __emu_maskload_impl( name, vec_type, mask_vec_type, type, mask_type ) \
-static __emu_inline vec_type name(type const *a, mask_vec_type mask) { \
- const size_t size_type = sizeof( type ); \
- const size_t size = sizeof( vec_type ) / size_type; \
- __EMU_M256_ALIGN(32) type res[ size ]; \
- const mask_type* p_mask = (const mask_type*)&mask; \
- size_t i = 0; \
- mask_type sign_bit = 1; \
- sign_bit <<= (8*size_type - 1); \
- for ( ; i < size; ++i ) \
- res[ i ] = (sign_bit & *(p_mask + i)) ? *(a+i) : 0; \
- return (*(vec_type*)&res); \
-}
-
-#define __emu_maskstore_impl( name, vec_type, mask_vec_type, type, mask_type ) \
-static __emu_inline void name(type *a, mask_vec_type mask, vec_type data) { \
- const size_t size_type = sizeof( type ); \
- const size_t size = sizeof( vec_type ) / sizeof( type ); \
- type* p_data = (type*)&data; \
- const mask_type* p_mask = (const mask_type*)&mask; \
- size_t i = 0; \
- mask_type sign_bit = 1; \
- sign_bit <<= (8*size_type - 1); \
- for ( ; i < size; ++i ) \
- if ( *(p_mask + i ) & sign_bit) \
- *(a + i) = *(p_data + i); \
-}
-
-__emu_maskload_impl( __emu_mm256_maskload_pd, __emu__m256d, __emu__m256i, double, __emu_int64_t );
-__emu_maskstore_impl( __emu_mm256_maskstore_pd, __emu__m256d, __emu__m256i, double, __emu_int64_t );
-
-__emu_maskload_impl( __emu_mm_maskload_pd, __m128d, __m128i, double, __emu_int64_t );
-__emu_maskstore_impl( __emu_mm_maskstore_pd, __m128d, __m128i, double, __emu_int64_t );
-
-__emu_maskload_impl( __emu_mm256_maskload_ps, __emu__m256, __emu__m256i, float, int );
-__emu_maskstore_impl( __emu_mm256_maskstore_ps, __emu__m256, __emu__m256i, float, int );
-
-__emu_maskload_impl( __emu_mm_maskload_ps, __m128, __m128i, float, int );
-__emu_maskstore_impl( __emu_mm_maskstore_ps, __m128, __m128i, float, int );
-
-
-__EMU_M256_IMPL_M1( __m256, movehdup_ps );
-__EMU_M256_IMPL_M1( __m256, moveldup_ps );
-__EMU_M256_IMPL_M1( __m256d, movedup_pd );
-
-__emu_mm_load_impl( lddqu, si128, si256, __m256i, __m128i, __emu__m256i );
-
-__emu_mm_store_impl( stream, si128, si256, __m256i, __m128i, __emu__m256i );
-__emu_mm_store_impl( stream, pd, pd, __m256d, double, double );
-__emu_mm_store_impl( stream, ps, ps, __m256, float, float );
-
-
-__EMU_M256_IMPL_M1( __m256, rcp_ps );
-__EMU_M256_IMPL_M1( __m256, rsqrt_ps );
-
-__EMU_M256_IMPL_M1( __m256d, sqrt_pd );
-__EMU_M256_IMPL_M1( __m256, sqrt_ps );
-
-__EMU_M256_IMPL_M2( __m256d, unpackhi_pd );
-__EMU_M256_IMPL_M2( __m256, unpackhi_ps );
-__EMU_M256_IMPL_M2( __m256d, unpacklo_pd );
-__EMU_M256_IMPL_M2( __m256, unpacklo_ps );
-
-
-static __emu_inline int __emu_mm256_movemask_pd(__emu__m256d a)
-{
- return
- (_mm_movemask_pd( a.__emu_m128[1] ) << 2) |
- _mm_movemask_pd( a.__emu_m128[0] );
-}
-
-static __emu_inline int __emu_mm256_movemask_ps(__emu__m256 a)
-{
- return
- (_mm_movemask_ps( a.__emu_m128[1] ) << 4) |
- _mm_movemask_ps( a.__emu_m128[0] );
-}
-
-static __emu_inline __emu__m256d __emu_mm256_setzero_pd(void) { __m128d ret[2] = { _mm_setzero_pd(), _mm_setzero_pd() }; return __emu_set_m128d( ret ); }
-static __emu_inline __emu__m256 __emu_mm256_setzero_ps(void) { __m128 ret[2] = { _mm_setzero_ps(), _mm_setzero_ps() }; return __emu_set_m128( ret ); }
-static __emu_inline __emu__m256i __emu_mm256_setzero_si256(void) { __m128i ret[2] = { _mm_setzero_si128(), _mm_setzero_si128() }; return __emu_set_m128i( ret ); }
-
-static __emu_inline __emu__m256d __emu_mm256_set_pd(double a1, double a2, double a3, double a4)
-{ __m128d ret[2] = { _mm_set_pd( a3, a4 ), _mm_set_pd( a1, a2 ) }; return __emu_set_m128d( ret ); }
-
-static __emu_inline __emu__m256 __emu_mm256_set_ps(float a1, float a2, float a3, float a4, float a5, float a6, float a7, float a8)
-{ __m128 ret[2] = { _mm_set_ps( a5, a6, a7, a8 ), _mm_set_ps( a1, a2, a3, a4 ) }; return __emu_set_m128( ret ); }
-
-static __emu_inline __emu__m256i __emu_mm256_set_epi8(char a1, char a2, char a3, char a4, char a5, char a6, char a7, char a8,
- char a9, char a10, char a11, char a12, char a13, char a14, char a15, char a16,
- char a17, char a18, char a19, char a20, char a21, char a22, char a23, char a24,
- char a25, char a26, char a27, char a28, char a29, char a30, char a31, char a32)
-{ __m128i ret[2] = { _mm_set_epi8( a17, a18, a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32 ),
- _mm_set_epi8( a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16 ) };
- return __emu_set_m128i( ret );
-}
-
-static __emu_inline __emu__m256i __emu_mm256_set_epi16(short a1, short a2, short a3, short a4, short a5, short a6, short a7, short a8,
- short a9, short a10, short a11, short a12, short a13, short a14, short a15, short a16)
-{ __m128i ret[2] = { _mm_set_epi16( a9, a10, a11, a12, a13, a14, a15, a16 ),
- _mm_set_epi16( a1, a2, a3, a4, a5, a6, a7, a8 ) };
- return __emu_set_m128i( ret );
-}
-
-static __emu_inline __emu__m256i __emu_mm256_set_epi32(int a1, int a2, int a3, int a4, int a5, int a6, int a7, int a8)
-{ __m128i ret[2] = { _mm_set_epi32( a5, a6, a7, a8 ), _mm_set_epi32( a1, a2, a3, a4 ) }; return __emu_set_m128i( ret ); }
-
-static __emu_inline __m128i __emu_mm_set_epi64x( __emu_int64_t a, __emu_int64_t b ) { return _mm_set_epi64( *(__m64*)&a, *(__m64*)&b ); }
-
-static __emu_inline __emu__m256i __emu_mm256_set_epi64x(__emu_int64_t a1, __emu_int64_t a2, __emu_int64_t a3, __emu_int64_t a4)
-{ __m128i ret[2] = { __emu_mm_set_epi64x( a3, a4 ), __emu_mm_set_epi64x( a1, a2 ) }; return __emu_set_m128i( ret ); }
-
-
-static __emu_inline __emu__m256d __emu_mm256_setr_pd(double a1, double a2, double a3, double a4)
-{ __m128d ret[2] = { _mm_setr_pd( a1, a2 ), _mm_setr_pd( a3, a4 ) }; return __emu_set_m128d( ret ); }
-
-static __emu_inline __emu__m256 __emu_mm256_setr_ps(float a1, float a2, float a3, float a4, float a5, float a6, float a7, float a8)
-{ __m128 ret[2] = { _mm_setr_ps( a1, a2, a3, a4 ), _mm_setr_ps( a5, a6, a7, a8 ) }; return __emu_set_m128( ret ); }
-
-static __emu_inline __emu__m256i __emu_mm256_setr_epi8(char a1, char a2, char a3, char a4, char a5, char a6, char a7, char a8,
- char a9, char a10, char a11, char a12, char a13, char a14, char a15, char a16,
- char a17, char a18, char a19, char a20, char a21, char a22, char a23, char a24,
- char a25, char a26, char a27, char a28, char a29, char a30, char a31, char a32)
-{ __m128i ret[2] = { _mm_setr_epi8( a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16 ),
- _mm_setr_epi8( a17, a18, a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32 ) };
- return __emu_set_m128i( ret );
-}
-
-static __emu_inline __emu__m256i __emu_mm256_setr_epi16(short a1, short a2, short a3, short a4, short a5, short a6, short a7, short a8,
- short a9, short a10, short a11, short a12, short a13, short a14, short a15, short a16)
-{ __m128i ret[2] = { _mm_setr_epi16( a1, a2, a3, a4, a5, a6, a7, a8 ),
- _mm_setr_epi16( a9, a10, a11, a12, a13, a14, a15, a16 ) }; return __emu_set_m128i( ret );
-}
-
-static __emu_inline __emu__m256i __emu_mm256_setr_epi32(int a1, int a2, int a3, int a4, int a5, int a6, int a7, int a8)
-{ __m128i ret[2] = { _mm_setr_epi32( a1, a2, a3, a4 ), _mm_setr_epi32( a5, a6, a7, a8 ), }; return __emu_set_m128i( ret ); }
-
-static __emu_inline __emu__m256i __emu_mm256_setr_epi64x(__emu_int64_t a1, __emu_int64_t a2, __emu_int64_t a3, __emu_int64_t a4)
-{ __m128i ret[2] = { __emu_mm_set_epi64x( a2, a1 ), __emu_mm_set_epi64x( a4, a3 ) }; return __emu_set_m128i( ret ); }
-
-
-
-__EMU_M256_IMPL_M1P_DUP( __m256d, double, set1_pd );
-__EMU_M256_IMPL_M1P_DUP( __m256, float, set1_ps );
-__EMU_M256_IMPL_M1P_DUP( __m256i, char, set1_epi8 );
-__EMU_M256_IMPL_M1P_DUP( __m256i, short, set1_epi16 );
-__EMU_M256_IMPL_M1P_DUP( __m256i, int, set1_epi32 );
-
-static __emu__m256i __emu_mm256_set1_epi64x(__emu_int64_t a)
-{
- __emu_int64_t res[4] = { a, a, a, a };
- return *((__emu__m256i*)res);
-}
-
-/*
- * Support intrinsics to do vector type casts. These intrinsics do not introduce
- * extra moves to generated code. When cast is done from a 128 to 256-bit type
- * the low 128 bits of the 256-bit result contain source parameter value; the
- * upper 128 bits of the result are undefined
- */
-__EMU_M256_IMPL_M1_RET( __m256, __m256d, castpd_ps );
-__EMU_M256_IMPL_M1_RET( __m256d, __m256, castps_pd );
-
-__EMU_M256_IMPL_M1_RET_NAME( __m256i, __m256, castps_si128, castps_si256 );
-__EMU_M256_IMPL_M1_RET_NAME( __m256i, __m256d, castpd_si128, castpd_si256 );
-
-__EMU_M256_IMPL_M1_RET_NAME( __m256, __m256i, castsi128_ps, castsi256_ps );
-__EMU_M256_IMPL_M1_RET_NAME( __m256d, __m256i, castsi128_pd, castsi256_pd );
-
-static __emu_inline __m128 __emu_mm256_castps256_ps128(__emu__m256 a) { return ( a.__emu_m128[0] ); }
-static __emu_inline __m128d __emu_mm256_castpd256_pd128(__emu__m256d a) { return ( a.__emu_m128[0] ); }
-static __emu_inline __m128i __emu_mm256_castsi256_si128(__emu__m256i a) { return ( a.__emu_m128[0] ); }
-
-static __emu_inline __emu__m256 __emu_mm256_castps128_ps256(__m128 a) { __m128 ret[2] = { a, _mm_setzero_ps() }; return __emu_set_m128( ret ); };
-static __emu_inline __emu__m256d __emu_mm256_castpd128_pd256(__m128d a) { __m128d ret[2] = { a, _mm_setzero_pd() }; return __emu_set_m128d( ret ); };
-static __emu_inline __emu__m256i __emu_mm256_castsi128_si256(__m128i a) { __m128i ret[2] = { a, _mm_setzero_si128() }; return __emu_set_m128i( ret ); };
-
-#if defined __cplusplus
-}; /* End "C" */
-#endif /* __cplusplus */
-
-
-
-
-
-
-#ifndef __EMU_M256_NOMAP
-
-#define __m256 __emu__m256
-#define __m256i __emu__m256i
-#define __m256d __emu__m256d
-
-#define _mm256_add_pd __emu_mm256_add_pd
-#define _mm256_add_ps __emu_mm256_add_ps
-
-#define _mm256_addsub_pd __emu_mm256_addsub_pd
-#define _mm256_addsub_ps __emu_mm256_addsub_ps
-
-#define _mm256_and_pd __emu_mm256_and_pd
-#define _mm256_and_ps __emu_mm256_and_ps
-
-#define _mm256_andnot_pd __emu_mm256_andnot_pd
-#define _mm256_andnot_ps __emu_mm256_andnot_ps
-
-#define _mm256_blend_pd __emu_mm256_blend_pd
-#define _mm256_blend_ps __emu_mm256_blend_ps
-
-#define _mm256_blendv_pd __emu_mm256_blendv_pd
-#define _mm256_blendv_ps __emu_mm256_blendv_ps
-
-#define _mm256_div_pd __emu_mm256_div_pd
-#define _mm256_div_ps __emu_mm256_div_ps
-
-#define _mm256_dp_ps __emu_mm256_dp_ps
-
-#define _mm256_hadd_pd __emu_mm256_hadd_pd
-#define _mm256_hadd_ps __emu_mm256_hadd_ps
-
-#define _mm256_hsub_pd __emu_mm256_hsub_pd
-#define _mm256_hsub_ps __emu_mm256_hsub_ps
-
-#define _mm256_max_pd __emu_mm256_max_pd
-#define _mm256_max_ps __emu_mm256_max_ps
-
-#define _mm256_min_pd __emu_mm256_min_pd
-#define _mm256_min_ps __emu_mm256_min_ps
-
-#define _mm256_mul_pd __emu_mm256_mul_pd
-#define _mm256_mul_ps __emu_mm256_mul_ps
-
-#define _mm256_or_pd __emu_mm256_or_pd
-#define _mm256_or_ps __emu_mm256_or_ps
-
-#define _mm256_shuffle_pd __emu_mm256_shuffle_pd
-#define _mm256_shuffle_ps __emu_mm256_shuffle_ps
-
-#define _mm256_sub_pd __emu_mm256_sub_pd
-#define _mm256_sub_ps __emu_mm256_sub_ps
-
-#define _mm256_xor_pd __emu_mm256_xor_pd
-#define _mm256_xor_ps __emu_mm256_xor_ps
-
-
-#define _mm_cmp_pd __emu_mm_cmp_pd
-#define _mm256_cmp_pd __emu_mm256_cmp_pd
-
-#define _mm_cmp_ps __emu_mm_cmp_ps
-#define _mm256_cmp_ps __emu_mm256_cmp_ps
-
-#define _mm_cmp_sd __emu_mm_cmp_sd
-#define _mm_cmp_ss __emu_mm_cmp_ss
-
-#define _mm256_cvtepi32_pd __emu_mm256_cvtepi32_pd
-#define _mm256_cvtepi32_ps __emu_mm256_cvtepi32_ps
-
-#define _mm256_cvtpd_ps __emu_mm256_cvtpd_ps
-#define _mm256_cvtps_epi32 __emu_mm256_cvtps_epi32
-#define _mm256_cvtps_pd __emu_mm256_cvtps_pd
-
-#define _mm256_cvttpd_epi32 __emu_mm256_cvttpd_epi32
-#define _mm256_cvtpd_epi32 __emu_mm256_cvtpd_epi32
-#define _mm256_cvttps_epi32 __emu_mm256_cvttps_epi32
-
-#define _mm256_extractf128_ps __emu_mm256_extractf128_ps
-#define _mm256_extractf128_pd __emu_mm256_extractf128_pd
-#define _mm256_extractf128_si256 __emu_mm256_extractf128_si256
-
-#define _mm256_zeroall __emu_mm256_zeroall
-#define _mm256_zeroupper __emu_mm256_zeroupper
-
-#define _mm256_permutevar_ps __emu_mm256_permutevar_ps
-#define _mm_permutevar_ps __emu_mm_permutevar_ps
-
-#define _mm256_permute_ps __emu_mm256_permute_ps
-#define _mm_permute_ps __emu_mm_permute_ps
-
-#define _mm256_permutevar_pd __emu_mm256_permutevar_pd
-#define _mm_permutevar_pd __emu_mm_permutevar_pd
-
-#define _mm256_permute_pd __emu_mm256_permute_pd
-#define _mm_permute_pd __emu_mm_permute_pd
-
-#define _mm256_permute2f128_ps __emu_mm256_permute2f128_ps
-#define _mm256_permute2f128_pd __emu_mm256_permute2f128_pd
-#define _mm256_permute2f128_si256 __emu_mm256_permute2f128_si256
-
-#define _mm256_broadcast_ss __emu_mm256_broadcast_ss
-#define _mm_broadcast_ss __emu_mm_broadcast_ss
-
-#define _mm256_broadcast_sd __emu_mm256_broadcast_sd
-
-#define _mm256_broadcast_ps __emu_mm256_broadcast_ps
-#define _mm256_broadcast_pd __emu_mm256_broadcast_pd
-
-#define _mm256_insertf128_ps __emu_mm256_insertf128_ps
-#define _mm256_insertf128_pd __emu_mm256_insertf128_pd
-#define _mm256_insertf128_si256 __emu_mm256_insertf128_si256
-
-#define _mm256_load_pd __emu_mm256_load_pd
-#define _mm256_store_pd __emu_mm256_store_pd
-#define _mm256_load_ps __emu_mm256_load_ps
-#define _mm256_store_ps __emu_mm256_store_ps
-
-#define _mm256_loadu_pd __emu_mm256_loadu_pd
-#define _mm256_storeu_pd __emu_mm256_storeu_pd
-#define _mm256_loadu_ps __emu_mm256_loadu_ps
-#define _mm256_storeu_ps __emu_mm256_storeu_ps
-
-#define _mm256_load_si256 __emu_mm256_load_si256
-#define _mm256_store_si256 __emu_mm256_store_si256
-#define _mm256_loadu_si256 __emu_mm256_loadu_si256
-#define _mm256_storeu_si256 __emu_mm256_storeu_si256
-
-#define _mm256_maskload_pd __emu_mm256_maskload_pd
-#define _mm256_maskstore_pd __emu_mm256_maskstore_pd
-#define _mm_maskload_pd __emu_mm_maskload_pd
-#define _mm_maskstore_pd __emu_mm_maskstore_pd
-
-#define _mm256_maskload_ps __emu_mm256_maskload_ps
-#define _mm256_maskstore_ps __emu_mm256_maskstore_ps
-#define _mm_maskload_ps __emu_mm_maskload_ps
-#define _mm_maskstore_ps __emu_mm_maskstore_ps
-
-#define _mm256_movehdup_ps __emu_mm256_movehdup_ps
-#define _mm256_moveldup_ps __emu_mm256_moveldup_ps
-
-#define _mm256_movedup_pd __emu_mm256_movedup_pd
-#define _mm256_lddqu_si256 __emu_mm256_lddqu_si256
-
-#define _mm256_stream_si256 __emu_mm256_stream_si256
-#define _mm256_stream_pd __emu_mm256_stream_pd
-#define _mm256_stream_ps __emu_mm256_stream_ps
-
-#define _mm256_rcp_ps __emu_mm256_rcp_ps
-#define _mm256_rsqrt_ps __emu_mm256_rsqrt_ps
-
-#define _mm256_sqrt_pd __emu_mm256_sqrt_pd
-#define _mm256_sqrt_ps __emu_mm256_sqrt_ps
-
-#define _mm256_round_pd __emu_mm256_round_pd
-
-#define _mm256_round_ps __emu_mm256_round_ps
-
-#define _mm256_unpackhi_pd __emu_mm256_unpackhi_pd
-#define _mm256_unpackhi_ps __emu_mm256_unpackhi_ps
-
-#define _mm256_unpacklo_pd __emu_mm256_unpacklo_pd
-#define _mm256_unpacklo_ps __emu_mm256_unpacklo_ps
-
-#define _mm256_testz_si256 __emu_mm256_testz_si256
-#define _mm256_testc_si256 __emu_mm256_testc_si256
-#define _mm256_testnzc_si256 __emu_mm256_testnzc_si256
-
-#define _mm256_testz_pd __emu_mm256_testz_pd
-#define _mm256_testc_pd __emu_mm256_testc_pd
-#define _mm256_testnzc_pd __emu_mm256_testnzc_pd
-#define _mm_testz_pd __emu_mm_testz_pd
-#define _mm_testc_pd __emu_mm_testc_pd
-#define _mm_testnzc_pd __emu_mm_testnzc_pd
-
-#define _mm256_testz_ps __emu_mm256_testz_ps
-#define _mm256_testc_ps __emu_mm256_testc_ps
-#define _mm256_testnzc_ps __emu_mm256_testnzc_ps
-#define _mm_testz_ps __emu_mm_testz_ps
-#define _mm_testc_ps __emu_mm_testc_ps
-#define _mm_testnzc_ps __emu_mm_testnzc_ps
-
-#define _mm256_movemask_pd __emu_mm256_movemask_pd
-#define _mm256_movemask_ps __emu_mm256_movemask_ps
-
-#define _mm256_setzero_pd __emu_mm256_setzero_pd
-#define _mm256_setzero_ps __emu_mm256_setzero_ps
-#define _mm256_setzero_si256 __emu_mm256_setzero_si256
-
-#define _mm256_set_pd __emu_mm256_set_pd
-#define _mm256_set_ps __emu_mm256_set_ps
-#define _mm256_set_epi8 __emu_mm256_set_epi8
-#define _mm256_set_epi16 __emu_mm256_set_epi16
-#define _mm256_set_epi32 __emu_mm256_set_epi32
-#define _mm256_set_epi64x __emu_mm256_set_epi64x
-
-#define _mm256_setr_pd __emu_mm256_setr_pd
-#define _mm256_setr_ps __emu_mm256_setr_ps
-#define _mm256_setr_epi8 __emu_mm256_setr_epi8
-#define _mm256_setr_epi16 __emu_mm256_setr_epi16
-#define _mm256_setr_epi32 __emu_mm256_setr_epi32
-#define _mm256_setr_epi64x __emu_mm256_setr_epi64x
-
-#define _mm256_set1_pd __emu_mm256_set1_pd
-#define _mm256_set1_ps __emu_mm256_set1_ps
-#define _mm256_set1_epi8 __emu_mm256_set1_epi8
-#define _mm256_set1_epi16 __emu_mm256_set1_epi16
-#define _mm256_set1_epi32 __emu_mm256_set1_epi32
-#define _mm256_set1_epi64x __emu_mm256_set1_epi64x
-
-#define _mm256_castpd_ps __emu_mm256_castpd_ps
-#define _mm256_castps_pd __emu_mm256_castps_pd
-#define _mm256_castps_si256 __emu_mm256_castps_si256
-#define _mm256_castpd_si256 __emu_mm256_castpd_si256
-#define _mm256_castsi256_ps __emu_mm256_castsi256_ps
-#define _mm256_castsi256_pd __emu_mm256_castsi256_pd
-#define _mm256_castps256_ps128 __emu_mm256_castps256_ps128
-#define _mm256_castpd256_pd128 __emu_mm256_castpd256_pd128
-#define _mm256_castsi256_si128 __emu_mm256_castsi256_si128
-#define _mm256_castps128_ps256 __emu_mm256_castps128_ps256
-#define _mm256_castpd128_pd256 __emu_mm256_castpd128_pd256
-#define _mm256_castsi128_si256 __emu_mm256_castsi128_si256
-
-#endif /* __EMU_M256_NOMAP */
-
-
-
-#endif /* __EMU_M256_AVXIMMINTRIN_EMU_H__ */
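
For reference, the scheme that the deleted header implemented: a 256-bit AVX vector is represented as a union of two 128-bit SSE halves, and each emulated AVX intrinsic applies the corresponding SSE intrinsic to both halves. Hand-expanding the __EMU_M256_IMPL_M2( __m256d, add_pd ) invocation above gives approximately the following (simplified sketch; the alignment attributes and the _mm256_* name mapping are omitted).

    // Simplified expansion of the removed emulation pattern: one emulated
    // 256-bit add becomes two 128-bit SSE2 adds on the halves of a union.
    #include <emmintrin.h>

    typedef union {
      double  __emu_arr[4];
      __m128d __emu_m128[2];
    } __emu__m256d;

    static inline __emu__m256d __emu_mm256_add_pd(__emu__m256d a, __emu__m256d b)
    {
      __emu__m256d res;
      res.__emu_m128[0] = _mm_add_pd(a.__emu_m128[0], b.__emu_m128[0]); // low half
      res.__emu_m128[1] = _mm_add_pd(a.__emu_m128[1], b.__emu_m128[1]); // high half
      return res;
    }
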
diff --git a/src/macros/vectors-4-SSE.h b/src/macros/vectors-4-SSE.h
new file mode 100644
index 0000000..2be477b
--- /dev/null
+++ b/src/macros/vectors-4-SSE.h
@@ -0,0 +1,457 @@
+// Vectorise using Intel's or AMD's SSE
+
+// Use the type __m128 directly, without introducing a wrapper class
+// Use macros instead of inline functions
+
+
+
+#include <assert.h>
+#include <math.h>
+
+#include <xmmintrin.h>
+#ifdef __SSE4_1__
+// Intel's SSE 4.1
+# include <smmintrin.h>
+#endif
+#ifdef __SSE4A__
+// AMD's SSE 4a
+# include <ammintrin.h>
+#endif
+#ifdef __FMA4__
+# include <fma4intrin.h>
+#endif
+
+
+
+#ifdef __SSE4_1__
+# define vec4_architecture_SSE4_1 "+SSE4.1"
+#else
+# define vec4_architecture_SSE4_1 ""
+#endif
+#ifdef __SSE4A__
+# define vec4_architecture_SSE4a "+SSE4A"
+#else
+# define vec4_architecture_SSE4a ""
+#endif
+#ifdef __FMA4__
+# define vec4_architecture_FMA4 "+FMA4"
+#else
+# define vec4_architecture_FMA4 ""
+#endif
+#define vec4_architecture "SSE" vec4_architecture_SSE4_1 vec4_architecture_SSE4a vec4_architecture_FMA4 " (32-bit precision)"
+
+
+
+// Vector type corresponding to CCTK_REAL
+#define CCTK_REAL4_VEC __m128
+
+// Number of vector elements in a CCTK_REAL_VEC
+#define CCTK_REAL4_VEC_SIZE 4
+
+// Integer and boolean types corresponding to this real type
+#define CCTK_INTEGER4 CCTK_REAL4
+#define CCTK_BOOLEAN4 CCTK_REAL4
+#define CCTK_INTEGER4_VEC CCTK_REAL4_VEC
+#define CCTK_BOOLEAN4_VEC CCTK_REAL4_VEC
+
+
+
+union k4const_t {
+ unsigned i[4];
+ float f[4];
+ __m128i vi;
+ __m128 vf;
+};
+
+#define K4_ZERO 0x00000000UL
+#define K4_IMIN 0x80000000UL
+#define K4_IMAX 0x7fffffffUL
+
+
+
+// Create vectors, extract vector elements
+
+#define vec4_set1(a) (_mm_set1_ps(a))
+#define vec4_set(a,b,c,d) (_mm_set_ps(d,c,b,a)) // note reversed arguments
+
+// original order is 0123
+#define vec4_swap1032(x_) \
+ ({ \
+ CCTK_REAL4_VEC const x__=(x_); \
+ CCTK_REAL4_VEC const x=x__; \
+ _mm_shuffle_ps(x,x, _MM_SHUFFLE(2,3,0,1)); \
+ })
+#define vec4_swap2301(x_) \
+ ({ \
+ CCTK_REAL4_VEC const x__=(x_); \
+ CCTK_REAL4_VEC const x=x__; \
+ _mm_shuffle_ps(x,x, _MM_SHUFFLE(1,0,3,2)); \
+ })
+#define vec4_swap3210(x_) \
+ ({ \
+ CCTK_REAL4_VEC const x__=(x_); \
+ CCTK_REAL4_VEC const x=x__; \
+ _mm_shuffle_ps(x,x, _MM_SHUFFLE(0,1,2,3)); \
+ })
+
+#if defined(__PGI)
+// _mm_cvtss_f32 does not exist on PGI compilers
+# define vec4_elt0(x) \
+ ({ \
+ CCTK_REAL4 a; \
+ asm ("" : "=x" (a) : "0" (x)); \
+ a; \
+ })
+#else
+# define vec4_elt0(x) (_mm_cvtss_f32(x)) // this is a no-op
+#endif
+#define vec4_elt1(x) vec4_elt0(vec4_swap1032(x))
+#define vec4_elt2(x) vec4_elt0(vec4_swap2301(x))
+#define vec4_elt3(x) vec4_elt0(vec4_swap3210(x))
+#if defined(__PGI)
+# define vec4_elt(x_,d) \
+ ({ \
+ CCTK_REAL4_VEC const x__=(x_); \
+ CCTK_REAL4_VEC const x=x__; \
+ CCTK_REAL4 a; \
+ if (d==0) a=vec4_elt0(x); \
+ else if (d==1) a=vec4_elt1(x); \
+ else if (d==2) a=vec4_elt2(x); \
+ else if (d==3) a=vec4_elt3(x); \
+ a; \
+ })
+#else
+# define vec4_elt(x_,d) \
+ ({ \
+ CCTK_REAL4_VEC const x__=(x_); \
+ CCTK_REAL4_VEC const x=x__; \
+ CCTK_REAL4 a; \
+ switch (d) { \
+ case 0: a=vec4_elt0(x); break; \
+ case 1: a=vec4_elt1(x); break; \
+ case 2: a=vec4_elt2(x); break; \
+ case 3: a=vec4_elt3(x); break; \
+ } \
+ a; \
+ })
+#endif
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+#define vec4_load(p) (_mm_load_ps(&(p)))
+#define vec4_loadu(p) (_mm_loadu_ps(&(p)))
+#if ! VECTORISE_ALWAYS_USE_ALIGNED_LOADS
+# define vec4_load_off1(p) vec_loadu(p)
+# define vec4_load_off2(p) vec_loadu(p)
+# define vec4_load_off3(p) vec_loadu(p)
+#else
+# define vec4_load_off1(p_) \
+ ({ \
+ CCTK_REAL4 const& p__=(p_); \
+ CCTK_REAL4 const& p=p__; \
+ CCTK_REAL4_VEC const lo=vec4_load((&p)[-1]); \
+ CCTK_REAL4_VEC const hi=vec4_load((&p)[+3]); \
+ assert(0); \
+ CCTK_REAL4_VEC const hi2=_mm_shuffle_ps(lo,hi, _MM_SHUFFLE(0,1,2,3)); \
+ _mm_shuffle_ps(lo,hi2, _MM_SHUFFLE(2,1,3,0)); \
+ })
+# define vec4_load_off2(p_) \
+ ({ \
+ CCTK_REAL4 const& p__=(p_); \
+ CCTK_REAL4 const& p=p__; \
+ CCTK_REAL4_VEC const lo=vec4_load((&p)[-2]); \
+ CCTK_REAL4_VEC const hi=vec4_load((&p)[+2]); \
+ _mm_shuffle_ps(lo,hi, _MM_SHUFFLE(1,0,3,2)); \
+ })
+# define vec4_load_off3(p_) \
+ ({ \
+ CCTK_REAL4 const& p__=(p_); \
+ CCTK_REAL4 const& p=p__; \
+ CCTK_REAL4_VEC const lo=vec4_load((&p)[-1]); \
+ CCTK_REAL4_VEC const hi=vec4_load((&p)[+3]); \
+ assert(0); \
+ CCTK_REAL4_VEC const lo2=_mm_shuffle_ps(lo,hi, _MM_SHUFFLE(0,1,2,3)); \
+ _mm_shuffle_ps(lo2,hi, _MM_SHUFFLE(3,0,2,1)); \
+ })
+#endif
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
+// Implementation: Always use unaligned load
+# define vec4_loadu_maybe(off,p) vec4_loadu(p)
+# define vec4_loadu_maybe3(off1,off2,off3,p) vec4_loadu(p)
+#else
+# define vec4_loadu_maybe(off,p_) \
+ ({ \
+ CCTK_REAL4 const& p__=(p_); \
+ CCTK_REAL4 const& p=p__; \
+ (off) % CCTK_REAL4_VEC_SIZE == 0 ? \
+ vec4_load(p) : \
+ vec4_loadu(p); \
+ })
+# if VECTORISE_ALIGNED_ARRAYS
+// Assume all array x sizes are multiples of the vector size
+# define vec4_loadu_maybe3(off1,off2,off3,p) \
+ vec4_loadu_maybe(off1,p)
+# else
+# define vec4_loadu_maybe3(off1,off2,off3,p) \
+ vec4_loadu_maybe((off1)|(off2)|(off3),p)
+# endif
+#endif
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+#define vec4_store(p,x) (_mm_store_ps(&(p),x))
+#define vec4_storeu(p,x) (_mm_storeu_ps(&(p),x))
+#if ! VECTORISE_STREAMING_STORES
+# define vec4_store_nta(p,x) vec4_store(p,x)
+#else
+# define vec4_store_nta(p,x) (_mm_stream_ps(&(p),x))
+#endif
+
+// Store a partial vector (aligned and non-temporal)
+#define vec4_store_partial_prepare(i,imin,imax) \
+ int v4stp_lo_skip = (imin)-(i); \
+ int v4stp_hi_skip = (i)+CCTK_REAL_VEC_SIZE-(imax); \
+ if (CCTK_BUILTIN_EXPECT(v4stp_lo_skip < 0, true)) v4stp_lo_skip = 0; \
+ if (CCTK_BUILTIN_EXPECT(v4stp_hi_skip < 0, true)) v4stp_hi_skip = 0;
+// Ignoring VECTORISE_STREAMING_STORES for partial stores
+#define vec4_store_nta_partial(p_,x_) \
+ ({ \
+ CCTK_REAL4& p__=(p_); \
+ CCTK_REAL4& p=p__; \
+ CCTK_REAL4_VEC const x__=(x_); \
+ CCTK_REAL4_VEC const x=x__; \
+ if (CCTK_BUILTIN_EXPECT(v4stp_lo_skip==0 and v4stp_hi_skip==0, true)) { \
+ vec4_store_nta(p,x); \
+ } else { \
+ /* these cases fall through */ \
+ switch (v4stp_lo_skip) { \
+ case 0: \
+ (&p)[0] = vec4_elt0(x); \
+ case 1: \
+ if (v4stp_hi_skip>=3) break; \
+ (&p)[1] = vec4_elt1(x); \
+ case 2: \
+ if (v4stp_hi_skip>=2) break; \
+ (&p)[2] = vec4_elt2(x); \
+ case 3: \
+ if (v4stp_hi_skip>=1) break; \
+ (&p)[3] = vec4_elt3(x); \
+ } \
+ } \
+ })
+
+// Ignoring VECTORISE_STREAMING_STORES for partial stores
+#define vec4_store_nta_partial_lo(p_,x_,n) \
+ ({ \
+ CCTK_REAL4 & p__=(p_); \
+ CCTK_REAL4_VEC const x__=(x_); \
+ CCTK_REAL4 & p=p__; \
+ CCTK_REAL4_VEC const x=x__; \
+ /* these cases fall through */ \
+ switch (n) { \
+ case 3: (&p)[2] = vec4_elt2(x); \
+ case 2: (&p)[1] = vec4_elt1(x); \
+ case 1: (&p)[0] = vec4_elt0(x); \
+ } \
+ })
+#define vec4_store_nta_partial_hi(p_,x_,n) \
+ ({ \
+ CCTK_REAL4 & p__=(p_); \
+ CCTK_REAL4_VEC const x__=(x_); \
+ CCTK_REAL4 & p=p__; \
+ CCTK_REAL4_VEC const x=x__; \
+ /* these cases fall through */ \
+ switch (n) { \
+ case 3: (&p)[1]=vec4_elt1(x); \
+ case 2: (&p)[2]=vec4_elt2(x); \
+ case 1: (&p)[3]=vec4_elt3(x); \
+ } \
+ })
+#define vec4_store_nta_partial_mid(p_,x_,nlo,nhi) \
+ ({ \
+ CCTK_REAL4 & p__=(p_); \
+ CCTK_REAL4_VEC const x__=(x_); \
+ CCTK_REAL4 & p=p__; \
+ CCTK_REAL4_VEC const x=x__; \
+ /* these cases fall through */ \
+ switch (nhi) { \
+ case 3: if (nlo<2) break; (&p)[1] = vec4_elt1(x); \
+ case 2: if (nlo<3) break; (&p)[2] = vec4_elt2(x); \
+ } \
+ })
+
+
+
+// Functions and operators
+
+static const k4const_t k4sign_mask = {{ K4_IMIN, K4_IMIN, K4_IMIN, K4_IMIN, }};
+
+// Operators
+#define k4neg(x) (_mm_xor_ps(k4sign_mask.vf,x))
+// #define k4inv(x)
+// TODO: provide k4inv via rcp and Newton-Raphson
+// This is described in AMD's publication 47414.
+// This should apply for AVX as well.
+
+#define k4add(x,y) (_mm_add_ps(x,y))
+#define k4sub(x,y) (_mm_sub_ps(x,y))
+#define k4mul(x,y) (_mm_mul_ps(x,y))
+// TODO: use k4inv and k4mul instead
+#define k4div(x,y) (_mm_div_ps(x,y))
+
+// Fused multiply-add, defined as [+-]x*y[+-]z
+#ifdef __FMA4__
+# define k4madd(x,y,z) (_mm_macc_ps(x,y,z))
+# define k4msub(x,y,z) (_mm_msub_ps(x,y,z))
+# define k4nmadd(x,y,z) (_mm_nmsub_ps(x,y,z))
+# define k4nmsub(x,y,z) (_mm_nmacc_ps(x,y,z))
+#else
+# define k4madd(x,y,z) (k4add(k4mul(x,y),z))
+# define k4msub(x,y,z) (k4sub(k4mul(x,y),z))
+# define k4nmadd(x,y,z) (k4sub(k4neg(z),k4mul(x,y)))
+# define k4nmsub(x,y,z) (k4sub(z,k4mul(x,y)))
+#endif
+
+// Cheap functions
+#define k4copysign(x,y) \
+ (_mm_or_ps(_mm_andnot_ps(k4sign_mask.vf,x), \
+ _mm_and_ps(k4sign_mask.vf,y)))
+#define k4fabs(x) (_mm_andnot_ps(k4sign_mask.vf,x))
+#define k4fmax(x,y) (_mm_max_ps(x,y))
+#define k4fmin(x,y) (_mm_min_ps(x,y))
+#define k4fnabs(x) (_mm_or_ps(k4sign_mask.vf,x))
+#define k4sgn(x_) \
+ ({ \
+ CCTK_REAL_VEC const x__=(x_); \
+ CCTK_REAL_VEC const x=x__; \
+ CCTK_REAL_VEC const iszero = _mm_cmpeq_ps(vec4_set1(0.0f), x); \
+ CCTK_REAL_VEC const sign = _mm_and_ps(k4sign_mask.vf, x); \
+ CCTK_REAL_VEC const signedone = _mm_or_ps(vec4_set1(1.0f), sign); \
+ k4ifthen(iszero, vec4_set1(0.0f), signedone); \
+ })
+// TODO: maybe use rsqrt and Newton-Raphson
+#define k4sqrt(x) (_mm_sqrt_ps(x))
+
+// Expensive functions
+#define K4REPL(f,x_) \
+ ({ \
+ CCTK_REAL4_VEC const x__=(x_); \
+ CCTK_REAL4_VEC const x=x__; \
+ vec4_set(f(vec4_elt0(x)), \
+ f(vec4_elt1(x)), \
+ f(vec4_elt2(x)), \
+ f(vec4_elt3(x))); \
+ })
+#define K4REPL2S(f,x_,a_) \
+ ({ \
+ CCTK_REAL4_VEC const x__=(x_); \
+ CCTK_REAL4 const a__=(a_); \
+ CCTK_REAL4_VEC const x=x__; \
+ CCTK_REAL4 const a=a__; \
+ vec4_set(f(vec4_elt0(x),a), \
+ f(vec4_elt1(x),a), \
+ f(vec4_elt2(x),a), \
+ f(vec4_elt3(x),a)); \
+ })
+#define K4REPL2(f,x_,y_) \
+ ({ \
+ CCTK_REAL4_VEC const x__=(x_); \
+ CCTK_REAL4_VEC const y__=(y_); \
+ CCTK_REAL4_VEC const x=x__; \
+ CCTK_REAL4_VEC const y=y__; \
+ vec4_set(f(vec4_elt0(x),vec4_elt0(y)), \
+ f(vec4_elt1(x),vec4_elt1(y)), \
+ f(vec4_elt2(x),vec4_elt2(y)), \
+ f(vec4_elt3(x),vec4_elt3(y))); \
+ })
+
+#define k4acos(x) K4REPL(acosf,x)
+#define k4acosh(x) K4REPL(acoshf,x)
+#define k4asin(x) K4REPL(asinf,x)
+#define k4asinh(x) K4REPL(asinhf,x)
+#define k4atan(x) K4REPL(atanf,x)
+#define k4atan2(x,y) K4REPL2(atan2f,x,y)
+#define k4atanh(x) K4REPL(atanhf,x)
+#define k4cos(x) K4REPL(cosf,x)
+#define k4cosh(x) K4REPL(coshf,x)
+#define k4exp(x) K4REPL(expf,x)
+#define k4log(x) K4REPL(logf,x)
+#define k4pow(x,a) K4REPL2S(powf,x,a)
+#define k4sin(x) K4REPL(sinf,x)
+#define k4sinh(x) K4REPL(sinhf,x)
+#define k4tan(x) K4REPL(tanf,x)
+#define k4tanh(x) K4REPL(tanhf,x)
+
+static const k4const_t k4lfalse_ = {{ 0U, 0U, 0U, 0U, }};
+static const k4const_t k4ltrue_ = {{ ~0U, ~0U, ~0U, ~0U, }};
+#define k4lfalse (k4lfalse_.vf)
+#define k4ltrue (k4ltrue_.vf)
+#define k4lnot(x) (_mm_xor_ps(k4ltrue,x))
+#define k4land(x,y) (_mm_and_ps(x,y))
+#define k4lor(x,y) (_mm_or_ps(x,y))
+#define k4lxor(x,y) (_mm_xor_ps(x,y))
+
+#ifdef __SSE4_1__
+# define k4ifthen(x,y,z) (_mm_blendv_ps(z,y,x))
+#elif 0
+# ifdef __cplusplus
+# define k4signbit(x) ({ using namespace std; signbit(x); })
+# else
+# define k4signbit(x) (signbit(x))
+# endif
+# define k4ifthen(x_,y_,z_)                                            \
+ ({ \
+ CCTK_REAL4_VEC const x__=(x_); \
+ CCTK_REAL4_VEC const y__=(y_); \
+ CCTK_REAL4_VEC const z__=(z_); \
+ CCTK_REAL4_VEC const x=x__; \
+ CCTK_REAL4_VEC const y=y__; \
+ CCTK_REAL4_VEC const z=z__; \
+ vec4_set(k4signbit(vec4_elt0(x)) ? vec4_elt0(y) : vec4_elt0(z), \
+ k4signbit(vec4_elt1(x)) ? vec4_elt1(y) : vec4_elt1(z), \
+ k4signbit(vec4_elt2(x)) ? vec4_elt2(y) : vec4_elt2(z), \
+ k4signbit(vec4_elt3(x)) ? vec4_elt3(y) : vec4_elt3(z)); \
+ })
+#elif 0
+// We don't need to shift -- the condition (mask) will be either all
+// zeros or all ones
+# define k4ifthen(x_,y_,z_) \
+ ({ \
+ CCTK_REAL4_VEC const x__=(x_); \
+ CCTK_REAL4_VEC const y__=(y_); \
+ CCTK_REAL4_VEC const z__=(z_); \
+ CCTK_REAL4_VEC const x=x__; \
+ CCTK_REAL4_VEC const y=y__; \
+ CCTK_REAL4_VEC const z=z__; \
+ CCTK_REAL4_VEC const mask = \
+ (__m128)_mm_srai_epi32((__m128i)x, 31); \
+ /* (z & ~mask) | (y & mask) */ \
+ _mm_or_ps(_mm_andnot_ps(mask, z), _mm_and_ps(mask, y)); \
+ })
+#else
+# define k4ifthen(x_,y_,z_) \
+ ({ \
+ CCTK_REAL4_VEC const x__=(x_); \
+ CCTK_REAL4_VEC const y__=(y_); \
+ CCTK_REAL4_VEC const z__=(z_); \
+ CCTK_REAL4_VEC const x=x__; \
+ CCTK_REAL4_VEC const y=y__; \
+ CCTK_REAL4_VEC const z=z__; \
+    /* (y & mask) | (z & ~mask); x itself serves as the mask */        \
+ _mm_or_ps(_mm_and_ps(x, y), _mm_andnot_ps(x, z)); \
+ })
+#endif
+
+#define k4cmpeq(x,y) (_mm_cmpeq_ps(x,y))
+#define k4cmpne(x,y) (_mm_cmpneq_ps(x,y))
+#define k4cmpgt(x,y) (_mm_cmpgt_ps(x,y))
+#define k4cmpge(x,y) (_mm_cmpge_ps(x,y))
+#define k4cmplt(x,y) (_mm_cmplt_ps(x,y))
+#define k4cmple(x,y) (_mm_cmple_ps(x,y))
diff --git a/src/macros/vectors-4-default.h b/src/macros/vectors-4-default.h
new file mode 100644
index 0000000..0cd49ac
--- /dev/null
+++ b/src/macros/vectors-4-default.h
@@ -0,0 +1,134 @@
+// Fallback vectorisation implementation: Do not vectorise
+
+// We use macros here, so that we are not surprised by compilers which
+// don't like to inline functions. This should also make debug builds
+// (which may not inline) more efficient.
+
+
+
+#include <assert.h>
+#include <math.h>
+
+
+
+#define vec4_architecture "scalar (no vectorisation, 32-bit precision)"
+
+// Use CCTK_REAL4
+#define CCTK_REAL4_VEC CCTK_REAL4
+
+// Number of vector elements in a vector
+#define CCTK_REAL4_VEC_SIZE 1
+
+// Integer and boolean types corresponding to this real type
+#define CCTK_INTEGER4 CCTK_REAL4
+#define CCTK_BOOLEAN4 CCTK_REAL4
+#define CCTK_INTEGER4_VEC CCTK_REAL4_VEC
+#define CCTK_BOOLEAN4_VEC CCTK_REAL4_VEC
+
+
+
+// Create a vector replicating a scalar
+#define vec4_set1(a) (a)
+// Create a vector from N scalars
+#define vec4_set(a) (a)
+
+// Access vector elements
+#define vec4_elt0(x) (x)
+#define vec4_elt(x,d) (x)
+
+
+
+// Load an aligned vector from memory
+#define vec4_load(p) (p)
+// Load an unaligned vector from memory
+#define vec4_loadu(p) (p)
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset and the vector size. These functions are
+// useful e.g. for loading neighbouring grid points while evaluating
+// finite differencing stencils.
+#define vec4_loadu_maybe(off,p) (p)
+#define vec4_loadu_maybe3(off1,off2,off3,p) (p)
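+// For example, a finite-difference kernel might load the grid point
+// shifted by one in the x direction as follows (u and ijk are
+// hypothetical kernel variables; in this scalar fallback the call
+// reduces to a plain access):
+//   CCTK_REAL4_VEC const up = vec4_loadu_maybe3(1,0,0, u[ijk+1]);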
+
+// Aligned store
+#define vec4_store(p,x) ((p)=(x))
+// Unaligned store
+#define vec4_storeu(p,x) ((p)=(x))
+
+// Non-temporal store (reduces to an ordinary store in this fallback)
+#define vec4_store_nta(p,x) ((p)=(x))
+
+#define vec4_store_partial_prepare(i,imin,imax) (0)
+#define vec4_store_nta_partial(p,x) (vec4_store_nta(p,x))
+// Store the n lower elements of a vector to memory
+#define vec4_store_nta_partial_lo(p,x,n) (assert(0))
+// Store the n higher elements of a vector into memory. This stores
+// the vector elements into memory locations as if element 0 were
+// stored at p.
+#define vec4_store_nta_partial_hi(p,x,n) (assert(0))
+#define vec4_store_nta_partial_mid(p,x,nlo,nhi) (assert(0))
+
+
+
+// Operators
+#define k4neg(x) (-(x))
+
+#define k4add(x,y) ((x)+(y))
+#define k4sub(x,y) ((x)-(y))
+#define k4mul(x,y) ((x)*(y))
+#define k4div(x,y) ((x)/(y))
+
+// Fused multiply-add, defined as [+-]x*y[+-]z
+#define k4madd(x,y,z) (+(x)*(y)+(z))
+#define k4msub(x,y,z) (+(x)*(y)-(z))
+#define k4nmadd(x,y,z) (-(x)*(y)-(z))
+#define k4nmsub(x,y,z) (-(x)*(y)+(z))
+
+// Functions
+#define k4acos(x) (acosf(x))
+#define k4acosh(x) (acoshf(x))
+#define k4asin(x) (asinf(x))
+#define k4asinh(x) (asinhf(x))
+#define k4atan(x) (atanf(x))
+#define k4atan2(x,y) (atan2f(x,y))
+#define k4atanh(x) (atanhf(x))
+#define k4copysign(x,y) (copysign(x,y))
+#define k4cos(x) (cosf(x))
+#define k4cosh(x) (coshf(x))
+#define k4exp(x) (expf(x))
+#define k4fabs(x) (fabsf(x))
+#define k4fmax(x,y) (fmaxf(x,y))
+#define k4fmin(x,y) (fminf(x,y))
+#define k4fnabs(x) (-fabsf(x))
+#define k4log(x) (logf(x))
+#define k4pow(x,a) (powf(x,a))
+#define k4sin(x) (sinf(x))
+#define k4sinh(x) (sinhf(x))
+#define k4sqrt(x) (sqrtf(x))
+#define k4tan(x) (tanf(x))
+#define k4tanh(x) (tanhf(x))
+
+#define k4sgn(x_) \
+ ({ \
+ CCTK_REAL x__=(x_); \
+ CCTK_REAL x=x__; \
+ x==(CCTK_REAL)0.0 ? (CCTK_REAL)0.0 : std::copysign((CCTK_REAL)1.0, x); \
+ })
+#define k4signbit(x) (std::signbit(x))
+
+#define k4l2r(x_) ({ CCTK_INT4 x__=(x_); CCTK_INT4 x=x__; *(CCTK_REAL4*)&x; })
+#define k4r2l(x_) ({ CCTK_REAL4 x__=(x_); CCTK_REAL4 x=x__; *(CCTK_INT4*)&x; })
+#define k4lfalse k4l2r(0)
+#define k4ltrue k4l2r(1)
+#define k4lnot(x) k4l2r(!k4r2l(x))
+#define k4land(x,y) k4l2r(k4r2l(x) && k4r2l(y))
+#define k4lor(x,y) k4l2r(k4r2l(x) || k4r2l(y))
+#define k4lxor(x,y) k4l2r(!k4r2l(x) != !k4r2l(y))
+
+#define k4ifthen(x,y,z) (k4r2l(x)?(y):(z))
+
+#define k4cmpeq(x,y) k4l2r((x)==(y))
+#define k4cmpne(x,y) k4l2r((x)!=(y))
+#define k4cmpgt(x,y) k4l2r((x)>(y))
+#define k4cmpge(x,y) k4l2r((x)>=(y))
+#define k4cmplt(x,y) k4l2r((x)<(y))
+#define k4cmple(x,y) k4l2r((x)<=(y))
diff --git a/src/macros/vectors-8-AVX.h b/src/macros/vectors-8-AVX.h
new file mode 100644
index 0000000..6882523
--- /dev/null
+++ b/src/macros/vectors-8-AVX.h
@@ -0,0 +1,325 @@
+// Vectorise using Intel's or AMD's AVX
+
+// Use the type __m256d directly, without introducing a wrapper class
+// Use macros instead of inline functions
+
+
+
+#if VECTORISE_EMULATE_AVX
+# include "avxintrin_emu.h"
+#else
+# include <immintrin.h>
+#endif
+#ifdef __FMA4__
+# include <fma4intrin.h>
+#endif
+
+
+
+#ifdef __FMA4__
+# define vec8_architecture_FMA4 "+FMA4"
+#else
+# define vec8_architecture_FMA4 ""
+#endif
+#define vec8_architecture "AVX" vec8_architecture_FMA4 " (64-bit precision)"
+
+
+
+// Vector type corresponding to CCTK_REAL
+#define CCTK_REAL8_VEC __m256d
+
+// Number of vector elements in a CCTK_REAL_VEC
+#define CCTK_REAL8_VEC_SIZE 4
+
+// Integer and boolean types corresponding to this real type
+#define CCTK_INTEGER8 CCTK_REAL8
+#define CCTK_BOOLEAN8 CCTK_REAL8
+#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC
+#define CCTK_BOOLEAN8_VEC CCTK_REAL8_VEC
+
+
+
+union k8const_t {
+ unsigned long long i[4];
+ double f[4];
+ __m256i vi;
+ __m256d vf;
+};
+
+#define K8_ZERO 0x0000000000000000ULL
+#define K8_NOTZERO 0xffffffffffffffffULL
+#define K8_IMIN 0x8000000000000000ULL
+#define K8_IMAX 0x7fffffffffffffffULL
+
+
+
+// Create vectors, extract vector elements
+
+#define vec8_set1(a) (_mm256_set1_pd(a))
+#define vec8_set(a,b,c,d) (_mm256_set_pd(d,c,b,a)) // note reversed arguments
+
+#define vec8_elt0(x) (((CCTK_REAL8 const*)&(x))[0])
+#define vec8_elt1(x) (((CCTK_REAL8 const*)&(x))[1])
+#define vec8_elt2(x) (((CCTK_REAL8 const*)&(x))[2])
+#define vec8_elt3(x) (((CCTK_REAL8 const*)&(x))[3])
+#define vec8_elt(x,d) (((CCTK_REAL8 const*)&(x))[d])
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+#define vec8_load(p) (_mm256_load_pd(&(p)))
+#define vec8_loadu(p) (_mm256_loadu_pd(&(p)))
+#if ! VECTORISE_ALWAYS_USE_ALIGNED_LOADS
+# define vec8_load_off1(p) vec8_loadu(p)
+#else
+# error "VECTORISE_ALWAYS_USE_ALIGNED_LOADS not yet supported"
+#endif
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
+// Implementation: Always use unaligned load
+# define vec8_loadu_maybe(off,p) (vec8_loadu(p))
+# define vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p))
+#else
+# define vec8_loadu_maybe(off,p_) \
+ ({ \
+ CCTK_REAL8 const& p__=(p_); \
+ CCTK_REAL8 const& p=p__; \
+ (off) % CCTK_REAL8_VEC_SIZE == 0 ? \
+ vec8_load(p) : \
+ vec8_load_off1(p); \
+ })
+# if VECTORISE_ALIGNED_ARRAYS
+// Assume all array x sizes are multiples of the vector size
+# define vec8_loadu_maybe3(off1,off2,off3,p) \
+ vec8_loadu_maybe(off1,p)
+# else
+# define vec8_loadu_maybe3(off1,off2,off3,p_) \
+ ({ \
+ CCTK_REAL8 const& p__=(p_); \
+ CCTK_REAL8 const& p=p__; \
+ ((off2) % CCTK_REAL8_VEC_SIZE != 0 or \
+ (off3) % CCTK_REAL8_VEC_SIZE != 0) ? \
+ vec8_loadu(p) : \
+ vec8_loadu_maybe(off1,p); \
+ })
+# endif
+#endif
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+#define vec8_store(p,x) (_mm256_store_pd(&(p),x))
+#define vec8_storeu(p,x) (_mm256_storeu_pd(&(p),x))
+#if ! VECTORISE_STREAMING_STORES
+# define vec8_store_nta(p,x) (vec8_store(p,x))
+#else
+# define vec8_store_nta(p,x) (_mm256_stream_pd(&(p),x))
+#endif
+
+// Store a partial vector (aligned and non-temporal)
+#define vec8_store_partial_prepare(i,imin_,imax_) \
+ bool v8stp_all; \
+ __m256i v8stp_mask; \
+ ({ \
+ ptrdiff_t const imin__=(imin_); \
+ ptrdiff_t const imin=imin__; \
+ ptrdiff_t const imax__=(imax_); \
+ ptrdiff_t const imax=imax__; \
+ \
+ v8stp_all = i>=imin and i+CCTK_REAL_VEC_SIZE-1<imax; \
+ \
+ if (not CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \
+ /* \
+ __m256i const v8stp_mask = \
+ _mm256_andnot_pd(_mm256_add_epi64(_mm256_set1_epi64x(i-imin), \
+ vec_index), \
+ _mm256_add_epi64(_mm256_set1_epi64x(i-imax), \
+ vec_index)); \
+ */ \
+ __m128i const termlo0 = \
+ _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(1, 0)); \
+ __m128i const termup0 = \
+ _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(1, 0)); \
+ __m128i const term0 = _mm_andnot_si128(termlo0, termup0); \
+ __m128i const termlo1 = \
+ _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(3, 2)); \
+ __m128i const termup1 = \
+ _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(3, 2)); \
+ __m128i const term1 = _mm_andnot_si128(termlo1, termup1); \
+ v8stp_mask = \
+ _mm256_insertf128_si256(_mm256_castsi128_si256(term0), term1, 1); \
+ } \
+ })
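+// In the mask above, lane j has its sign bit set iff imin <= i+j < imax:
+// (i-imax+j) is negative exactly when i+j < imax, and the andnot with
+// (i-imin+j) clears the bit again when i+j < imin.  _mm256_maskstore_pd
+// below stores only the lanes whose sign bit is set.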
+
+#define vec8_store_nta_partial(p,x) \
+ ({ \
+ if (CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \
+ vec8_store_nta(p,x); \
+ } else { \
+ _mm256_maskstore_pd(&p,v8stp_mask,x); \
+ } \
+ })
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+// Masks indicating which vector elements should be stored:
+static const k8const_t k8store_lo[5] =
+ {
+ {{ K8_ZERO , K8_ZERO , K8_ZERO , K8_ZERO , }},
+ {{ K8_NOTZERO, K8_ZERO , K8_ZERO , K8_ZERO , }},
+ {{ K8_NOTZERO, K8_NOTZERO, K8_ZERO , K8_ZERO , }},
+ {{ K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, K8_ZERO , }},
+ {{ K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, }},
+ };
+static const k8const_t k8store_hi[5] =
+ {
+ {{ K8_ZERO , K8_ZERO , K8_ZERO , K8_ZERO , }},
+ {{ K8_ZERO , K8_ZERO , K8_ZERO , K8_NOTZERO, }},
+ {{ K8_ZERO , K8_ZERO , K8_NOTZERO, K8_NOTZERO, }},
+ {{ K8_ZERO , K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, }},
+ {{ K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, }},
+ };
+#if !defined(__INTEL_COMPILER) && defined(__GNUC__) && __GNUC__==4 && __GNUC_MINOR__<=4
+// gcc 4.4 uses a wrong prototype for _mm256_maskstore_pd
+# define vec8_store_nta_partial_lo(p,x,n) \
+ (_mm256_maskstore_pd(&(p),_mm256_castsi256_pd(k8store_lo[n].vi),x))
+# define vec8_store_nta_partial_hi(p,x,n) \
+ (_mm256_maskstore_pd(&(p),_mm256_castsi256_pd(k8store_hi[n].vi),x))
+# define vec8_store_nta_partial_mid(p,x,nlo,nhi) \
+ (_mm256_maskstore_pd \
+ (&(p), \
+ _mm256_castsi256_pd(k8store_lo[nlo].vi & k8store_hi[nhi].vi), \
+ x))
+#else
+# define vec8_store_nta_partial_lo(p,x,n) \
+ (_mm256_maskstore_pd(&(p),k8store_lo[n].vi,x))
+# define vec8_store_nta_partial_hi(p,x,n) \
+ (_mm256_maskstore_pd(&(p),k8store_hi[n].vi,x))
+# define vec8_store_nta_partial_mid(p,x,nlo,nhi) \
+ (_mm256_maskstore_pd \
+ (&(p), \
+ _mm256_castpd_si256(_mm256_and_pd(k8store_lo[nlo].vf, \
+ k8store_hi[nhi].vf)), \
+ x))
+#endif
+
+
+
+// Functions and operators
+
+static const k8const_t k8sign_mask = {{ K8_IMIN, K8_IMIN, K8_IMIN, K8_IMIN, }};
+
+// Operators
+#define k8neg(x) (_mm256_xor_pd(x,k8sign_mask.vf))
+
+#define k8add(x,y) (_mm256_add_pd(x,y))
+#define k8sub(x,y) (_mm256_sub_pd(x,y))
+#define k8mul(x,y) (_mm256_mul_pd(x,y))
+#define k8div(x,y) (_mm256_div_pd(x,y))
+
+// Fused multiply-add, defined as [+-]x*y[+-]z
+#ifdef __FMA4__
+# define k8madd(x,y,z) (_mm256_macc_pd(x,y,z))
+# define k8msub(x,y,z) (_mm256_msub_pd(x,y,z))
+# define k8nmadd(x,y,z) (_mm256_nmsub_pd(x,y,z))
+# define k8nmsub(x,y,z) (_mm256_nmacc_pd(x,y,z))
+#else
+# define k8madd(x,y,z) (k8add(k8mul(x,y),z))
+# define k8msub(x,y,z) (k8sub(k8mul(x,y),z))
+# define k8nmadd(x,y,z) (k8sub(k8neg(z),k8mul(x,y)))
+# define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y)))
+#endif
+
+// Cheap functions
+#define k8copysign(x,y) \
+ (_mm256_or_pd(_mm256_andnot_pd(k8sign_mask.vf,x), \
+ _mm256_and_pd(k8sign_mask.vf,y)))
+#define k8fabs(x) (_mm256_andnot_pd(k8sign_mask.vf,x))
+#define k8fmax(x,y) (_mm256_max_pd(x,y))
+#define k8fmin(x,y) (_mm256_min_pd(x,y))
+#define k8fnabs(x) (_mm256_or_pd(x,k8sign_mask.vf))
+static const k8const_t k8zero = { f: { 0.0, 0.0, 0.0, 0.0, }};
+static const k8const_t k8one = { f: { 1.0, 1.0, 1.0, 1.0, }};
+#define k8sgn(x_) \
+ ({ \
+ CCTK_REAL_VEC x__=(x_); \
+ CCTK_REAL_VEC x=x__; \
+ CCTK_REAL_VEC iszero = _mm256_cmp_pd(x, k8zero.vf, _CMP_EQ_OQ); \
+ CCTK_REAL_VEC sign = _mm256_and_pd(k8sign_mask.vf, x); \
+ CCTK_REAL_VEC signedone = _mm256_or_pd(sign, k8one.vf); \
+ k8ifthen(iszero, k8zero.vf, signedone); \
+ })
+#define k8sqrt(x) (_mm256_sqrt_pd(x))
+
+// Expensive functions
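+// There are no AVX intrinsics for these; K8REPL and friends evaluate
+// the scalar libm function once per vector element and re-assemble
+// the result with vec8_set.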
+#define K8REPL(f,x_) \
+ ({ \
+ CCTK_REAL8_VEC const x__=(x_); \
+ CCTK_REAL8_VEC const x=x__; \
+ vec8_set(f(vec8_elt0(x)), \
+ f(vec8_elt1(x)), \
+ f(vec8_elt2(x)), \
+ f(vec8_elt3(x))); \
+ })
+#define K8REPL2S(f,x_,a_) \
+ ({ \
+ CCTK_REAL8_VEC const x__=(x_); \
+ CCTK_REAL8 const a__=(a_); \
+ CCTK_REAL8_VEC const x=x__; \
+ CCTK_REAL8 const a=a__; \
+ vec8_set(f(vec8_elt0(x),a), \
+ f(vec8_elt1(x),a), \
+ f(vec8_elt2(x),a), \
+ f(vec8_elt3(x),a)); \
+ })
+#define K8REPL2(f,x_,y_) \
+ ({ \
+ CCTK_REAL8_VEC const x__=(x_); \
+ CCTK_REAL8_VEC const y__=(y_); \
+ CCTK_REAL8_VEC const x=x__; \
+ CCTK_REAL8_VEC const y=y__; \
+ vec8_set(f(vec8_elt0(x),vec8_elt0(y)), \
+ f(vec8_elt1(x),vec8_elt1(y)), \
+ f(vec8_elt2(x),vec8_elt2(y)), \
+ f(vec8_elt3(x),vec8_elt3(y))); \
+ })
+
+#define k8acos(x) K8REPL(acos,x)
+#define k8acosh(x) K8REPL(acosh,x)
+#define k8asin(x) K8REPL(asin,x)
+#define k8asinh(x) K8REPL(asinh,x)
+#define k8atan(x) K8REPL(atan,x)
+#define k8atan2(x,y) K8REPL2(atan2,x,y)
+#define k8atanh(x) K8REPL(atanh,x)
+#define k8cos(x) K8REPL(cos,x)
+#define k8cosh(x) K8REPL(cosh,x)
+#define k8exp(x) K8REPL(exp,x)
+#define k8log(x) K8REPL(log,x)
+#define k8pow(x,a) K8REPL2S(pow,x,a)
+#define k8sin(x) K8REPL(sin,x)
+#define k8sinh(x) K8REPL(sinh,x)
+#define k8tan(x) K8REPL(tan,x)
+#define k8tanh(x) K8REPL(tanh,x)
+
+static const k8const_t k8lfalse_ =
+ {{ K8_ZERO, K8_ZERO, K8_ZERO, K8_ZERO, }};
+static const k8const_t k8ltrue_ =
+ {{ K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, }};
+#define k8lfalse (k8lfalse_.vf)
+#define k8ltrue (k8ltrue_.vf)
+#define k8lnot(x) (_mm256_xor_pd(k8ltrue,x))
+#define k8land(x,y) (_mm256_and_pd(x,y))
+#define k8lor(x,y) (_mm256_or_pd(x,y))
+#define k8lxor(x,y) (_mm256_xor_pd(x,y))
+#define k8ifthen(x,y,z) (_mm256_blendv_pd(z,y,x))
+
+#define k8cmpeq(x,y) (_mm256_cmp_pd(x,y,_CMP_EQ_OQ))
+#define k8cmpne(x,y) (_mm256_cmp_pd(x,y,_CMP_NEQ_OQ))
+#define k8cmpgt(x,y) (_mm256_cmp_pd(x,y,_CMP_GT_OQ))
+#define k8cmpge(x,y) (_mm256_cmp_pd(x,y,_CMP_GE_OQ))
+#define k8cmplt(x,y) (_mm256_cmp_pd(x,y,_CMP_LT_OQ))
+#define k8cmple(x,y) (_mm256_cmp_pd(x,y,_CMP_LE_OQ))
diff --git a/src/macros/vectors-8-SSE2.h b/src/macros/vectors-8-SSE2.h
new file mode 100644
index 0000000..a8823ab
--- /dev/null
+++ b/src/macros/vectors-8-SSE2.h
@@ -0,0 +1,427 @@
+// Vectorise using Intel's or AMD's SSE2
+
+// Use the type __m128d directly, without introducing a wrapper class
+// Use macros instead of inline functions
+
+
+
+#include <assert.h>
+#include <math.h>
+
+#include <emmintrin.h>
+#ifdef __SSE4_1__
+// Intel's SSE 4.1
+# include <smmintrin.h>
+#endif
+#ifdef __SSE4A__
+// AMD's SSE 4a
+# include <ammintrin.h>
+
+// Intel compilers don't support SSE 4a. Here is how we can implement
+// these instructions in assembler instead:
+
+// inline void __attribute__((__always_inline__))
+// _mm_stream_sd (double *p, __m128d x)
+// {
+// asm ("movntsd %[x],%[p]" : "=m" (*p) : [p] "m" (*p), [x] "x" (x));
+// }
+
+#endif
+#ifdef __FMA4__
+# include <fma4intrin.h>
+#endif
+
+
+
+#ifdef __SSE4_1__
+# define vec8_architecture_SSE4_1 "+SSE4.1"
+#else
+# define vec8_architecture_SSE4_1 ""
+#endif
+#ifdef __SSE4A__
+# define vec8_architecture_SSE4a "+SSE4A"
+#else
+# define vec8_architecture_SSE4a ""
+#endif
+#ifdef __FMA4__
+# define vec8_architecture_FMA4 "+FMA4"
+#else
+# define vec8_architecture_FMA4 ""
+#endif
+#define vec8_architecture "SSE2" vec8_architecture_SSE4_1 vec8_architecture_SSE4a vec8_architecture_FMA4 " (64-bit precision)"
+
+
+
+// Vector type corresponding to CCTK_REAL
+#define CCTK_REAL8_VEC __m128d
+
+// Number of vector elements in a CCTK_REAL_VEC
+#define CCTK_REAL8_VEC_SIZE 2
+
+// Integer and boolean types corresponding to this real type
+#define CCTK_INTEGER8 CCTK_REAL8
+#define CCTK_BOOLEAN8 CCTK_REAL8
+#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC
+#define CCTK_BOOLEAN8_VEC CCTK_REAL8_VEC
+
+
+
+union k8const_t {
+ long long i[2];
+ double f[2];
+ __m128i vi;
+ __m128d vf;
+};
+
+#define K8_IMIN ((long long)0x8000000000000000ULL)
+
+
+
+// Create vectors, extract vector elements
+
+#define vec8_set1(a) (_mm_set1_pd(a))
+#define vec8_set(a,b) (_mm_set_pd(b,a)) // note reversed arguments
+
+// original order is 01
+#define vec8_swap10(x_) \
+ ({ \
+ CCTK_REAL8_VEC const x__=(x_); \
+ CCTK_REAL8_VEC const x=x__; \
+ _mm_shuffle_pd(x,x, _MM_SHUFFLE2(0,1)); \
+ })
+
+#define vec8_elt0(x) (((CCTK_REAL8 const*)&(x))[0])
+#define vec8_elt1(x) (((CCTK_REAL8 const*)&(x))[1])
+#define vec8_elt(x,d) (((CCTK_REAL8 const*)&(x))[d])
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+#define vec8_load(p) (_mm_load_pd(&(p)))
+#define vec8_loadu(p) (_mm_loadu_pd(&(p)))
+#if ! VECTORISE_ALWAYS_USE_ALIGNED_LOADS
+# define vec8_load_off1(p) vec8_loadu(p)
+#else
+# define vec8_load_off1(p_) \
+ ({ \
+ CCTK_REAL8 const& p__=(p_); \
+ CCTK_REAL8 const& p=p__; \
+ _mm_shuffle_pd(vec8_load((&p)[-1]), \
+ vec8_load((&p)[+1]), _MM_SHUFFLE2(0,1)); \
+ })
+#endif
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
+// Implementation: Always use unaligned load
+# define vec8_loadu_maybe(off,p) vec8_loadu(p)
+# define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu(p)
+#else
+# define vec8_loadu_maybe(off,p_) \
+ ({ \
+ CCTK_REAL8 const& p__=(p_); \
+ CCTK_REAL8 const& p=p__; \
+ (off) % CCTK_REAL8_VEC_SIZE == 0 ? \
+ vec8_load(p) : \
+ vec8_load_off1(p); \
+ })
+# if VECTORISE_ALIGNED_ARRAYS
+// Assume all array x sizes are multiples of the vector size
+# define vec8_loadu_maybe3(off1,off2,off3,p) \
+ vec8_loadu_maybe(off1,p)
+# else
+# define vec8_loadu_maybe3(off1,off2,off3,p_) \
+ ({ \
+ CCTK_REAL8 const& p__=(p_); \
+ CCTK_REAL8 const& p=p__; \
+ ((off2) % CCTK_REAL8_VEC_SIZE != 0 or \
+ (off3) % CCTK_REAL8_VEC_SIZE != 0) ? \
+ vec8_loadu(p) : \
+ vec8_loadu_maybe(off1,p); \
+ })
+# endif
+#endif
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+#define vec8_store(p,x) (_mm_store_pd(&(p),x))
+#define vec8_storeu(p,x) (_mm_storeu_pd(&(p),x))
+#if ! VECTORISE_STREAMING_STORES
+# define vec8_store_nta(p,x) vec8_store(p,x)
+#else
+# define vec8_store_nta(p,x) (_mm_stream_pd(&(p),x))
+#endif
+
+// Store a partial vector (aligned and non-temporal)
+#define vec8_store_partial_prepare(i,imin,imax) \
+ bool const v8stp_lo = (i)>=(imin); \
+ bool const v8stp_hi = (i)+CCTK_REAL_VEC_SIZE-1<(imax)
+#if VECTORISE_STREAMING_STORES && defined(__SSE4A__)
+# define vec8_store_nta_partial(p_,x_) \
+ ({ \
+ CCTK_REAL8& p__=(p_); \
+ CCTK_REAL8& p=p__; \
+ CCTK_REAL8_VEC const x__=(x_); \
+ CCTK_REAL8_VEC const x=x__; \
+ if (CCTK_BUILTIN_EXPECT(v8stp_lo and v8stp_hi, true)) { \
+ vec8_store_nta(p,x); \
+ } else if (v8stp_lo) { \
+ _mm_stream_sd(&p,x); \
+ } else if (v8stp_hi) { \
+ _mm_stream_sd(&p+1, vec8_swap10(x)); \
+ } \
+ })
+#else
+# define vec8_store_nta_partial(p_,x_) \
+ ({ \
+ CCTK_REAL8& p__=(p_); \
+ CCTK_REAL8& p=p__; \
+ CCTK_REAL8_VEC const x__=(x_); \
+ CCTK_REAL8_VEC const x=x__; \
+ if (CCTK_BUILTIN_EXPECT(v8stp_lo and v8stp_hi, true)) { \
+ vec8_store_nta(p,x); \
+ } else if (v8stp_lo) { \
+ _mm_storel_pd(&p,x); \
+ } else if (v8stp_hi) { \
+ _mm_storeh_pd(&p+1,x); \
+ } \
+ })
+#endif
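+// Typical usage at a loop boundary (sketch; a, i, imin, imax and xvec
+// are hypothetical loop variables):
+//   vec8_store_partial_prepare(i, imin, imax);
+//   vec8_store_nta_partial(a[i], xvec);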
+
+// Store a lower or higher partial vector (aligned and non-temporal)
+#if ! VECTORISE_STREAMING_STORES
+# define vec8_store_nta_partial_lo(p,x,n) (_mm_storel_pd(&(p),x))
+# define vec8_store_nta_partial_hi(p,x,n) (_mm_storeh_pd(&(p)+1,x))
+#else
+# if defined(__SSE4A__)
+# define vec8_store_nta_partial_lo(p,x,n) (_mm_stream_sd(&(p),x))
+# define vec8_store_nta_partial_hi(p,x,n) \
+ (_mm_stream_sd(&(p)+1, vec8_swap10(x)))
+# else
+// TODO: use clflush once a whole cache line has been written (cache
+// lines are usually larger than the CPU vector size)
+# define vec8_store_nta_partial_lo(p_,x,n) \
+ ({ \
+ CCTK_REAL8& p__=(p_); \
+ CCTK_REAL8& p=p__; \
+ _mm_storel_pd(&p,x); \
+ /* _mm_clflush(&p); */ \
+ })
+# define vec8_store_nta_partial_hi(p_,x,n) \
+ ({ \
+ CCTK_REAL8& p__=(p_); \
+ CCTK_REAL8& p=p__; \
+ _mm_storeh_pd(&p+1,x); \
+ /* _mm_clflush(&p+1); */ \
+ })
+# endif
+#endif
+#if 0
+// This is slower; we would need a non-temporal read
+#define vec8_store_nta_partial_lo(p,x,n) \
+ vec8_store_nta(p, _mm_loadh_pd(x,&(p)+1))
+#define vec8_store_nta_partial_hi(p,x,n) \
+ vec8_store_nta(p, _mm_loadl_pd(x,&(p)))
+#endif
+#define vec8_store_nta_partial_mid(p,x,nlo,nhi) assert(0)
+
+
+
+// Functions and operators
+
+static const k8const_t k8sign_mask = {{ K8_IMIN, K8_IMIN, }};
+
+// Operators
+
+// #define k8inot(x) (_mm_xor_si128(k8all_mask,x))
+//
+// #define k8iand(x,y) (_mm_and_si128(x,y))
+// #define k8ior(x,y) (_mm_or_si128(x,y))
+// #define k8ixor(x,y) (_mm_xor_si128(x,y))
+//
+// #define k8ineg(x) (_mm_xor_pd(k8sign_mask,x))
+//
+// #define k8iadd(x,y) (_mm_add_epi64(x,y))
+// #define k8isub(x,y) (_mm_sub_epi64(x,y))
+//
+// #define k8not(x) (_mm_xor_pd(k8all_mask,x))
+//
+// #define k8and(x,y) (_mm_and_pd(x,y))
+// #define k8or(x,y) (_mm_or_pd(x,y))
+// #define k8xor(x,y) (_mm_xor_pd(x,y))
+
+#define k8neg(x) (_mm_xor_pd(k8sign_mask.vf,x))
+
+#define k8add(x,y) (_mm_add_pd(x,y))
+#define k8sub(x,y) (_mm_sub_pd(x,y))
+#define k8mul(x,y) (_mm_mul_pd(x,y))
+#define k8div(x,y) (_mm_div_pd(x,y))
+
+// Fused multiply-add, defined as [+-]x*y[+-]z
+#ifdef __FMA4__
+# define k8madd(x,y,z) (_mm_macc_pd(x,y,z))
+# define k8msub(x,y,z) (_mm_msub_pd(x,y,z))
+# define k8nmadd(x,y,z) (_mm_nmsub_pd(x,y,z))
+# define k8nmsub(x,y,z) (_mm_nmacc_pd(x,y,z))
+#else
+# define k8madd(x,y,z) (k8add(k8mul(x,y),z))
+# define k8msub(x,y,z) (k8sub(k8mul(x,y),z))
+# define k8nmadd(x,y,z) (k8sub(k8neg(z),k8mul(x,y)))
+# define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y)))
+#endif
+
+// Cheap functions
+#define k8copysign(x,y) \
+ (_mm_or_pd(_mm_andnot_pd(k8sign_mask.vf,x), \
+ _mm_and_pd(k8sign_mask.vf,y)))
+#define k8fabs(x) (_mm_andnot_pd(k8sign_mask.vf,x))
+#define k8fmax(x,y) (_mm_max_pd(x,y))
+#define k8fmin(x,y) (_mm_min_pd(x,y))
+#define k8fnabs(x) (_mm_or_pd(k8sign_mask.vf,x))
+#define k8sgn(x_) \
+ ({ \
+ CCTK_REAL_VEC const x__=(x_); \
+ CCTK_REAL_VEC const x=x__; \
+ CCTK_REAL_VEC const iszero = _mm_cmpeq_pd(vec8_set1(0.0), x); \
+ CCTK_REAL_VEC const sign = _mm_and_pd(k8sign_mask.vf, x); \
+ CCTK_REAL_VEC const signedone = _mm_or_pd(vec8_set1(1.0), sign); \
+ k8ifthen(iszero, vec8_set1(0.0), signedone); \
+ })
+#define k8sqrt(x) (_mm_sqrt_pd(x))
+
+// Expensive functions
+#define K8REPL(f,x_) \
+ ({ \
+ CCTK_REAL8_VEC const x__=(x_); \
+ CCTK_REAL8_VEC const x=x__; \
+ vec8_set(f(vec8_elt0(x)), \
+ f(vec8_elt1(x))); \
+ })
+#define K8REPL2S(f,x_,a_) \
+ ({ \
+ CCTK_REAL8_VEC const x__=(x_); \
+ CCTK_REAL8 const a__=(a_); \
+ CCTK_REAL8_VEC const x=x__; \
+ CCTK_REAL8 const a=a__; \
+ vec8_set(f(vec8_elt0(x),a), \
+ f(vec8_elt1(x),a)); \
+ })
+#define K8REPL2(f,x_,y_) \
+ ({ \
+ CCTK_REAL8_VEC const x__=(x_); \
+ CCTK_REAL8_VEC const y__=(y_); \
+ CCTK_REAL8_VEC const x=x__; \
+ CCTK_REAL8_VEC const y=y__; \
+ vec8_set(f(vec8_elt0(x),vec8_elt0(y)), \
+ f(vec8_elt1(x),vec8_elt1(y))); \
+ })
+
+#define k8acos(x) K8REPL(acos,x)
+#define k8acosh(x) K8REPL(acosh,x)
+#define k8asin(x) K8REPL(asin,x)
+#define k8asinh(x) K8REPL(asinh,x)
+#define k8atan(x) K8REPL(atan,x)
+#define k8atan2(x,y) K8REPL2(atan2,x,y)
+#define k8atanh(x) K8REPL(atanh,x)
+#define k8cos(x) K8REPL(cos,x)
+#define k8cosh(x) K8REPL(cosh,x)
+#define k8exp(x) K8REPL(exp,x)
+#define k8log(x) K8REPL(log,x)
+#define k8pow(x,a) K8REPL2S(pow,x,a)
+#define k8sin(x) K8REPL(sin,x)
+#define k8sinh(x) K8REPL(sinh,x)
+#define k8tan(x) K8REPL(tan,x)
+#define k8tanh(x) K8REPL(tanh,x)
+
+static const k8const_t k8lfalse_ = {{ +0LL, +0LL, }};
+static const k8const_t k8ltrue_ = {{ -1LL, -1LL, }};
+#define k8lfalse (k8lfalse_.vf)
+#define k8ltrue (k8ltrue_.vf)
+#define k8lnot(x) (_mm_xor_pd(k8ltrue,x))
+#define k8land(x,y) (_mm_and_pd(x,y))
+#define k8lor(x,y) (_mm_or_pd(x,y))
+#define k8lxor(x,y) (_mm_xor_pd(x,y))
+
+#ifdef __SSE4_1__
+# define k8ifthen(x,y,z) (_mm_blendv_pd(z,y,x))
+#elif 0
+// This is slow (but this is what Intel/PGI produce by themselves)
+# define k8ifthen(x_,y_,z_) \
+ ({ \
+ CCTK_REAL8_VEC const x__=(x_); \
+ CCTK_REAL8_VEC const y__=(y_); \
+ CCTK_REAL8_VEC const z__=(z_); \
+ CCTK_REAL8_VEC const x=x__; \
+ CCTK_REAL8_VEC const y=y__; \
+ CCTK_REAL8_VEC const z=z__; \
+ int const m = _mm_movemask_pd(x); \
+ CCTK_REAL8_VEC r; \
+ switch (m) { \
+ case 0: r = y; break; \
+ case 1: r = _mm_move_sd(y,z); break; \
+ case 2: r = _mm_move_sd(z,y); break; \
+ case 3: r = z; break; \
+ } \
+ r; \
+ })
+#elif 0
+# ifdef __cplusplus
+# define k8signbit(x) ({ using namespace std; signbit(x); })
+# else
+# define k8signbit(x) (signbit(x))
+# endif
+# define k8ifthen(x_,y_,z_) \
+ ({ \
+ CCTK_REAL8_VEC const x__=(x_); \
+ CCTK_REAL8_VEC const y__=(y_); \
+ CCTK_REAL8_VEC const z__=(z_); \
+ CCTK_REAL8_VEC const x=x__; \
+ CCTK_REAL8_VEC const y=y__; \
+ CCTK_REAL8_VEC const z=z__; \
+ vec8_set(k8signbit(vec8_elt0(x)) ? vec8_elt0(y) : vec8_elt0(z), \
+ k8signbit(vec8_elt1(x)) ? vec8_elt1(y) : vec8_elt1(z)); \
+ })
+#elif 0
+// We don't need to shift -- the condition (mask) will be either all
+// zeros or all ones
+static const k8const_t k8ione = {{ 0x1ULL, 0x1ULL, }};
+# define k8ifthen(x_,y_,z_) \
+ ({ \
+ CCTK_REAL8_VEC const x__=(x_); \
+ CCTK_REAL8_VEC const y__=(y_); \
+ CCTK_REAL8_VEC const z__=(z_); \
+ CCTK_REAL8_VEC const x=x__; \
+ CCTK_REAL8_VEC const y=y__; \
+ CCTK_REAL8_VEC const z=z__; \
+ /* there is no _mm_srai_epi64(x, 63); we therefore calculate srli(x)-1 */ \
+ __m128i const x_int = *(__m128i const*)&x; \
+ __m128i const imask_int = \
+ _mm_sub_epi64(_mm_srli_epi64(x_int, 63), k8ione.vi); \
+ CCTK_REAL8_VEC const imask = *(CCTK_REAL8_VEC const*)&imask_int; \
+ /* (z & ~mask) | (y & mask) where imask = ~mask */ \
+ _mm_or_pd(_mm_and_pd(imask, z), _mm_andnot_pd(imask, y)); \
+ })
+#else
+# define k8ifthen(x_,y_,z_) \
+ ({ \
+ CCTK_REAL8_VEC const x__=(x_); \
+ CCTK_REAL8_VEC const y__=(y_); \
+ CCTK_REAL8_VEC const z__=(z_); \
+ CCTK_REAL8_VEC const x=x__; \
+ CCTK_REAL8_VEC const y=y__; \
+ CCTK_REAL8_VEC const z=z__; \
+    /* (y & mask) | (z & ~mask); x itself serves as the mask */        \
+ _mm_or_pd(_mm_and_pd(x, y), _mm_andnot_pd(x, z)); \
+ })
+#endif
+
+#define k8cmpeq(x,y) (_mm_cmpeq_pd(x,y))
+#define k8cmpne(x,y) (_mm_cmpneq_pd(x,y))
+#define k8cmpgt(x,y) (_mm_cmpgt_pd(x,y))
+#define k8cmpge(x,y) (_mm_cmpge_pd(x,y))
+#define k8cmplt(x,y) (_mm_cmplt_pd(x,y))
+#define k8cmple(x,y) (_mm_cmple_pd(x,y))
diff --git a/src/macros/vectors-8-default.h b/src/macros/vectors-8-default.h
new file mode 100644
index 0000000..7ff6c8c
--- /dev/null
+++ b/src/macros/vectors-8-default.h
@@ -0,0 +1,132 @@
+// Fallback vectorisation implementation: Do not vectorise
+
+// We use macros here, so that we are not surprised by compilers which
+// don't like to inline functions. This should also make debug builds
+// (which may not inline) more efficient.
+
+
+
+#include <assert.h>
+#include <math.h>
+
+
+
+#define vec8_architecture "scalar (no vectorisation, 64-bit precision)"
+
+// Use CCTK_REAL8
+#define CCTK_REAL8_VEC CCTK_REAL8
+
+// Number of vector elements in a vector
+#define CCTK_REAL8_VEC_SIZE 1
+
+// Integer and boolean types corresponding to this real type
+#define CCTK_INTEGER8 CCTK_REAL8
+#define CCTK_BOOLEAN8 CCTK_REAL8
+#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC
+#define CCTK_BOOLEAN8_VEC CCTK_REAL8_VEC
+
+
+
+// Create a vector replicating a scalar
+#define vec8_set1(a) (a)
+// Create a vector from N scalars
+#define vec8_set(a) (a)
+
+// Access vector elements
+#define vec8_elt0(x) (x)
+#define vec8_elt(x,d) (x)
+
+
+
+// Load an aligned vector from memory
+#define vec8_load(p) (p)
+// Load an unaligned vector from memory
+#define vec8_loadu(p) (p)
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset and the vector size. These functions are
+// useful e.g. for loading neighbouring grid points while evaluating
+// finite differencing stencils.
+#define vec8_loadu_maybe(off,p) (p)
+#define vec8_loadu_maybe3(off1,off2,off3,p) (p)
+
+// Aligned store
+#define vec8_store(p,x) ((p)=(x))
+// Non-temporal store (reduces to an ordinary store in this fallback)
+#define vec8_store_nta(p,x) ((p)=(x))
+
+#define vec8_store_partial_prepare(i,imin,imax) ((void)0)
+#define vec8_store_nta_partial(p,x) (vec8_store_nta(p,x))
+// Store the n lower elements of a vector to memory
+#define vec8_store_nta_partial_lo(p,x,n) (assert(0))
+// Store the n higher elements of a vector into memory. This stores
+// the vector elements into memory locations as if element 0 were
+// stored at p.
+#define vec8_store_nta_partial_hi(p,x,n) (assert(0))
+#define vec8_store_nta_partial_mid(p,x,nlo,nhi) (assert(0))
+
+
+
+// Operators
+#define k8neg(x) (-(x))
+
+#define k8add(x,y) ((x)+(y))
+#define k8sub(x,y) ((x)-(y))
+#define k8mul(x,y) ((x)*(y))
+#define k8div(x,y) ((x)/(y))
+
+// Fused multiply-add, defined as [+-]x*y[+-]z
+#define k8madd(x,y,z) (+(x)*(y)+(z))
+#define k8msub(x,y,z) (+(x)*(y)-(z))
+#define k8nmadd(x,y,z) (-(x)*(y)-(z))
+#define k8nmsub(x,y,z) (-(x)*(y)+(z))
+
+// Functions
+#define k8acos(x) (acos(x))
+#define k8acosh(x) (acosh(x))
+#define k8asin(x) (asin(x))
+#define k8asinh(x) (asinh(x))
+#define k8atan(x) (atan(x))
+#define k8atan2(x,y) (atan2(x,y))
+#define k8atanh(x) (atanh(x))
+#define k8copysign(x,y) (copysign(x,y))
+#define k8cos(x) (cos(x))
+#define k8cosh(x) (cosh(x))
+#define k8exp(x) (exp(x))
+#define k8fabs(x) (fabs(x))
+#define k8fmax(x,y) (fmax(x,y))
+#define k8fmin(x,y) (fmin(x,y))
+#define k8fnabs(x) (-fabs(x))
+#define k8log(x) (log(x))
+#define k8pow(x,a) (pow(x,a))
+#define k8sin(x) (sin(x))
+#define k8sinh(x) (sinh(x))
+#define k8sqrt(x) (sqrt(x))
+#define k8tan(x) (tan(x))
+#define k8tanh(x) (tanh(x))
+
+#define k8sgn(x_) \
+ ({ \
+ CCTK_REAL x__=(x_); \
+ CCTK_REAL x=x__; \
+ x==(CCTK_REAL)0.0 ? (CCTK_REAL)0.0 : std::copysign((CCTK_REAL)1.0, x); \
+ })
+#define k8signbit(x) (std::signbit(x))
+
+#define k8l2r(x_) ({ CCTK_INT8 x__=(x_); CCTK_INT8 x=x__; *(CCTK_REAL8*)&x; })
+#define k8r2l(x_) ({ CCTK_REAL8 x__=(x_); CCTK_REAL8 x=x__; *(CCTK_INT8*)&x; })
+#define k8lfalse k8l2r(0)
+#define k8ltrue k8l2r(1)
+#define k8lnot(x) k8l2r(!k8r2l(x))
+#define k8land(x,y) k8l2r(k8r2l(x) && k8r2l(y))
+#define k8lor(x,y) k8l2r(k8r2l(x) || k8r2l(y))
+#define k8lxor(x,y) k8l2r(!k8r2l(x) != !k8r2l(y))
+
+#define k8ifthen(x,y,z) (k8r2l(x)?(y):(z))
+
+#define k8cmpeq(x,y) k8l2r((x)==(y))
+#define k8cmpne(x,y) k8l2r((x)!=(y))
+#define k8cmpgt(x,y) k8l2r((x)>(y))
+#define k8cmpge(x,y) k8l2r((x)>=(y))
+#define k8cmplt(x,y) k8l2r((x)<(y))
+#define k8cmple(x,y) k8l2r((x)<=(y))
diff --git a/src/vectors-4-AVX.h b/src/vectors-4-AVX.h
new file mode 100644
index 0000000..641a74b
--- /dev/null
+++ b/src/vectors-4-AVX.h
@@ -0,0 +1,659 @@
+// Vectorise using Intel's or AMD's AVX
+
+// Use the type __m256 directly, without introducing a wrapper class
+
+
+
+#include <cstddef>
+#include <cstdlib>
+
+
+
+#include <immintrin.h>
+#ifdef __FMA4__
+# include <x86intrin.h>
+#endif
+
+
+
+#ifdef __FMA4__
+# define vec4_architecture_FMA4 "+FMA4"
+#else
+# define vec4_architecture_FMA4 ""
+#endif
+#define vec4_architecture "AVX" vec4_architecture_FMA4 " (32-bit precision)"
+
+
+
+// Vector type corresponding to CCTK_REAL
+typedef __m256 CCTK_REAL4_VEC;
+typedef __m256i CCTK_INTEGER4_VEC;
+typedef __m256 CCTK_BOOLEAN4_VEC;
+
+// Number of vector elements in a CCTK_REAL_VEC
+#define CCTK_REAL4_VEC_SIZE 8
+
+vec_static_assert(sizeof(CCTK_REAL4_VEC) ==
+ sizeof(CCTK_REAL4) * CCTK_REAL4_VEC_SIZE);
+
+// Integer and boolean types corresponding to this real type
+typedef CCTK_INT4 CCTK_INTEGER4;
+typedef CCTK_REAL4 CCTK_BOOLEAN4;
+
+
+
+union k4const_t {
+ CCTK_INTEGER4 i[CCTK_REAL4_VEC_SIZE];
+ CCTK_REAL4 f[CCTK_REAL4_VEC_SIZE];
+ CCTK_INTEGER4_VEC vi;
+ CCTK_REAL4_VEC vf;
+};
+
+#define k4sign (vec4_set1i( (CCTK_INTEGER4)(1UL << 31UL)))
+#define k4notsign (vec4_set1i(~ (CCTK_INTEGER4)(1UL << 31UL)))
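+// k4sign has only the sign bit set in each element; k4notsign is its
+// complement.  They are used below to flip the sign bit (k4neg), clear
+// it (k4fabs), set it (k4fnabs), and combine the sign of one argument
+// with the magnitude of the other (k4copysign).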
+
+
+
+// Create vectors, extract vector elements
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_set1(CCTK_REAL4 const a)
+{
+ return _mm256_set1_ps(a);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_set1i(CCTK_INT4 const a)
+{
+ return _mm256_castsi256_ps(_mm256_set1_epi32(a));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_set(CCTK_REAL4 const a,
+ CCTK_REAL4 const b,
+ CCTK_REAL4 const c,
+ CCTK_REAL4 const d,
+ CCTK_REAL4 const e,
+ CCTK_REAL4 const f,
+ CCTK_REAL4 const g,
+ CCTK_REAL4 const h)
+{
+ return _mm256_set_ps(h,g,f,e,d,c,b,a); // note reversed arguments
+}
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4 vec4_elt0(CCTK_REAL4_VEC const x)
+{
+ return ((CCTK_REAL4 const*)&x)[0];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4 vec4_elt1(CCTK_REAL4_VEC const x)
+{
+ return ((CCTK_REAL4 const*)&x)[1];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4 vec4_elt2(CCTK_REAL4_VEC const x)
+{
+ return ((CCTK_REAL4 const*)&x)[2];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4 vec4_elt3(CCTK_REAL4_VEC const x)
+{
+ return ((CCTK_REAL4 const*)&x)[3];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4 vec4_elt4(CCTK_REAL4_VEC const x)
+{
+ return ((CCTK_REAL4 const*)&x)[4];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4 vec4_elt5(CCTK_REAL4_VEC const x)
+{
+ return ((CCTK_REAL4 const*)&x)[5];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4 vec4_elt6(CCTK_REAL4_VEC const x)
+{
+ return ((CCTK_REAL4 const*)&x)[6];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4 vec4_elt7(CCTK_REAL4_VEC const x)
+{
+ return ((CCTK_REAL4 const*)&x)[7];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4 vec4_elt(CCTK_REAL4_VEC const x, std::ptrdiff_t const d)
+{
+ return ((CCTK_REAL4 const*)&x)[d];
+}
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_load(CCTK_REAL4 const& p)
+{
+ return _mm256_load_ps(&p);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_loadu(CCTK_REAL4 const& p)
+{
+ return _mm256_loadu_ps(&p);
+}
+#if VECTORISE_ALWAYS_USE_ALIGNED_LOADS
+# error "VECTORISE_ALWAYS_USE_ALIGNED_LOADS not yet supported"
+#endif
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
+// Implementation: Always use unaligned load
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL4 const& p)
+{
+ return vec4_loadu(p);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_loadu_maybe3(std::ptrdiff_t const off1,
+ std::ptrdiff_t const off2,
+ std::ptrdiff_t const off3,
+ CCTK_REAL4 const& p)
+{
+ return vec4_loadu(p);
+}
+#else
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL4 const& p)
+{
+ return off % CCTK_REAL4_VEC_SIZE == 0 ? vec4_load(p) : vec4_loadu(p);
+}
+# if VECTORISE_ALIGNED_ARRAYS
+// Assume all array x sizes are multiples of the vector size
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_loadu_maybe3(std::ptrdiff_t const off1,
+ std::ptrdiff_t const off2,
+ std::ptrdiff_t const off3,
+ CCTK_REAL4 const& p)
+{
+ return vec4_loadu_maybe(off1, p);
+}
+# else
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_loadu_maybe3(std::ptrdiff_t const off1,
+ std::ptrdiff_t const off2,
+ std::ptrdiff_t const off3,
+ CCTK_REAL4 const& p)
+{
+ return
+ off2 % CCTK_REAL4_VEC_SIZE != 0 or
+ off3 % CCTK_REAL4_VEC_SIZE != 0 ?
+ vec4_loadu(p) :
+ vec4_loadu_maybe(off1, p);
+}
+# endif
+#endif
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store(CCTK_REAL4& p, CCTK_REAL4_VEC const x)
+{
+ return _mm256_store_ps(&p, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_storeu(CCTK_REAL4& p, CCTK_REAL4_VEC const x)
+{
+ return _mm256_storeu_ps(&p, x);
+}
+#if ! VECTORISE_STREAMING_STORES
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta(CCTK_REAL4& p, CCTK_REAL4_VEC const x)
+{
+ return vec4_store(p, x);
+}
+#else
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta(CCTK_REAL4& p, CCTK_REAL4_VEC const x)
+{
+ return _mm256_stream_ps(&p, x);
+}
+#endif
+
+// Store a partial vector (aligned and non-temporal)
+#define vec4_store_partial_prepare(i,imin,imax) \
+ bool v4stp_all; \
+ __m256i v4stp_mask; \
+ vec4_store_partial_prepare_(v4stp_all, v4stp_mask, i, imin, imax);
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_partial_prepare_(bool& all, __m256i& mask,
+ std::ptrdiff_t const i,
+ std::ptrdiff_t const imin,
+ std::ptrdiff_t const imax)
+{
+ all = i>=imin and i+CCTK_REAL4_VEC_SIZE-1<imax;
+
+ if (not CCTK_BUILTIN_EXPECT(all, true)) {
+ /* __m256i const v4stp_mask = */
+ /* _mm256_andnot_ps(_mm256_add_epi64(_mm256_set1_epi64x(i-imin), */
+ /* vec_index), */
+ /* _mm256_add_epi64(_mm256_set1_epi64x(i-imax), */
+ /* vec_index)); */
+ __m128i const termlo0123 =
+ _mm_add_epi32(_mm_set1_epi32(i-imin), _mm_set_epi32(3, 2, 1, 0));
+ __m128i const termup0123 =
+ _mm_add_epi32(_mm_set1_epi32(i-imax), _mm_set_epi32(3, 2, 1, 0));
+ __m128i const term0123 = _mm_andnot_si128(termlo0123, termup0123);
+ __m128i const termlo4567 =
+ _mm_add_epi32(_mm_set1_epi32(i-imin), _mm_set_epi32(7, 6, 5, 4));
+ __m128i const termup4567 =
+ _mm_add_epi32(_mm_set1_epi32(i-imax), _mm_set_epi32(7, 6, 5, 4));
+ __m128i const term4567 = _mm_andnot_si128(termlo4567, termup4567);
+ mask =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(term0123), term4567, 1);
+ }
+}
+
+#define vec4_store_nta_partial(p, x) \
+ vec4_store_nta_partial_(v4stp_all, v4stp_mask, p, x)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta_partial_(bool const all, __m256i const mask,
+ CCTK_REAL4& p, CCTK_REAL4_VEC const x)
+{
+ if (CCTK_BUILTIN_EXPECT(all, true)) {
+ vec4_store_nta(p, x);
+ } else {
+ _mm256_maskstore_ps(&p, mask, x);
+ }
+}
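+// Typical usage (sketch; a, i, imin, imax and xvec are hypothetical
+// loop variables; the macros introduce the hidden locals v4stp_all and
+// v4stp_mask):
+//   vec4_store_partial_prepare(i, imin, imax);
+//   vec4_store_nta_partial(a[i], xvec);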
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+// Masks indicating which vector elements should be stored:
+static k4const_t const k4store_lo[9] =
+ {
+ { i: { 0, 0, 0, 0, 0, 0, 0, 0, }},
+ { i: { ~0, 0, 0, 0, 0, 0, 0, 0, }},
+ { i: { ~0, ~0, 0, 0, 0, 0, 0, 0, }},
+ { i: { ~0, ~0, ~0, 0, 0, 0, 0, 0, }},
+ { i: { ~0, ~0, ~0, ~0, 0, 0, 0, 0, }},
+ { i: { ~0, ~0, ~0, ~0, ~0, 0, 0, 0, }},
+ { i: { ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, }},
+ { i: { ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, }},
+ { i: { ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, }},
+ };
+static k4const_t const k4store_hi[9] =
+ {
+ { i: { 0, 0, 0, 0, 0, 0, 0, 0, }},
+ { i: { 0, 0, 0, 0, 0, 0, 0, ~0, }},
+ { i: { 0, 0, 0, 0, 0, 0, ~0, ~0, }},
+ { i: { 0, 0, 0, 0, 0, ~0, ~0, ~0, }},
+ { i: { 0, 0, 0, 0, ~0, ~0, ~0, ~0, }},
+ { i: { 0, 0, 0, ~0, ~0, ~0, ~0, ~0, }},
+ { i: { 0, 0, ~0, ~0, ~0, ~0, ~0, ~0, }},
+ { i: { 0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, }},
+ { i: { ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, }},
+ };
+#if !defined(__INTEL_COMPILER) && defined(__GNUC__) && __GNUC__==4 && __GNUC_MINOR__<=4
+// gcc 4.4 uses a wrong prototype for _mm256_maskstore_ps
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta_partial_lo(CCTK_REAL4& p,
+ CCTK_REAL4_VEC const x,
+ ptrdiff_t const n)
+{
+ _mm256_maskstore_ps(&p, _mm256_castsi256_ps(k4store_lo[n].vi), x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta_partial_hi(CCTK_REAL4& p,
+ CCTK_REAL4_VEC const x,
+ ptrdiff_t const n)
+{
+ _mm256_maskstore_ps(&p, _mm256_castsi256_ps(k4store_hi[n].vi), x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta_partial_mid(CCTK_REAL4& p,
+ CCTK_REAL4_VEC const x,
+ ptrdiff_t const nlo,
+ ptrdiff_t const nhi)
+{
+ _mm256_maskstore_ps
+ (&p,
+ _mm256_castsi256_ps(k4store_lo[nlo].vi & k4store_hi[nhi].vi),
+ x);
+}
+#else
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta_partial_lo(CCTK_REAL4& p,
+ CCTK_REAL4_VEC const x,
+ ptrdiff_t const n)
+{
+ _mm256_maskstore_ps(&p, k4store_lo[n].vi, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta_partial_hi(CCTK_REAL4& p,
+ CCTK_REAL4_VEC const x,
+ ptrdiff_t const n)
+{
+ _mm256_maskstore_ps(&p, k4store_hi[n].vi, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta_partial_mid(CCTK_REAL4& p,
+ CCTK_REAL4_VEC const x,
+ ptrdiff_t const nlo,
+ ptrdiff_t const nhi)
+{
+ _mm256_maskstore_ps
+ (&p,
+ _mm256_castps_si256(_mm256_and_ps(k4store_lo[nlo].vf, k4store_hi[nhi].vf)),
+ x);
+}
+#endif
+
+
+
+// Functions and operators
+
+// Operators
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4neg(CCTK_REAL4_VEC const x)
+{
+ return _mm256_xor_ps(x, k4sign);
+}
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4add(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_add_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4sub(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_sub_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4mul(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_mul_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4div(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_div_ps(x, y);
+}
+
+// Fused multiply-add, defined as [+-]x*y[+-]z
+#ifdef __FMA4__
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4madd(CCTK_REAL4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return _mm256_macc_ps(x, y, z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4msub(CCTK_REAL4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return _mm256_msub_ps(x, y, z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4nmadd(CCTK_REAL4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return _mm256_nmsub_ps(x, y, z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4nmsub(CCTK_REAL4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return _mm256_nmacc_ps(x, y, z);
+}
+#else
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4madd(CCTK_REAL4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return k4add(k4mul(x, y), z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4msub(CCTK_REAL4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return k4sub(k4mul(x, y), z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4nmadd(CCTK_REAL4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return k4sub(k4neg(z), k4mul(x, y));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4nmsub(CCTK_REAL4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return k4sub(z, k4mul(x, y));
+}
+#endif
+
+// Cheap functions
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4copysign(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_or_ps(_mm256_and_ps(k4notsign, x),
+ _mm256_and_ps(k4sign , y));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4fabs(CCTK_REAL4_VEC const x)
+{
+ return _mm256_and_ps(k4notsign, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4fmax(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_max_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4fmin(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_min_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4fnabs(CCTK_REAL4_VEC const x)
+{
+ return _mm256_or_ps(x, k4sign);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4sqrt(CCTK_REAL4_VEC const x)
+{
+ return _mm256_sqrt_ps(x);
+}
+
+// Expensive functions
+#define K4REPL(f,x) \
+ vec4_set(f(vec4_elt0(x)), \
+ f(vec4_elt1(x)), \
+ f(vec4_elt2(x)), \
+ f(vec4_elt3(x)), \
+ f(vec4_elt4(x)), \
+ f(vec4_elt5(x)), \
+ f(vec4_elt6(x)), \
+ f(vec4_elt7(x)));
+#define K4REPL2S(f,x,a) \
+ vec4_set(f(vec4_elt0(x),a), \
+ f(vec4_elt1(x),a), \
+ f(vec4_elt2(x),a), \
+ f(vec4_elt3(x),a), \
+ f(vec4_elt4(x),a), \
+ f(vec4_elt5(x),a), \
+ f(vec4_elt6(x),a), \
+ f(vec4_elt7(x),a));
+#define K4REPL2(f,x,y) \
+ vec4_set(f(vec4_elt0(x),vec4_elt0(y)), \
+ f(vec4_elt1(x),vec4_elt1(y)), \
+ f(vec4_elt2(x),vec4_elt2(y)), \
+ f(vec4_elt3(x),vec4_elt3(y)), \
+ f(vec4_elt4(x),vec4_elt4(y)), \
+ f(vec4_elt5(x),vec4_elt5(y)), \
+ f(vec4_elt6(x),vec4_elt6(y)), \
+ f(vec4_elt7(x),vec4_elt7(y)));
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4acos(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(acos,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4acosh(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(acosh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4asin(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(asin,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4asinh(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(asinh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4atan(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(atan,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4atan2(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return K4REPL2(atan2,x,y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4atanh(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(atanh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4cos(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(cos,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4cosh(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(cosh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4exp(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(exp,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4log(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(log,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4pow(CCTK_REAL4_VEC const x, CCTK_REAL4 const a)
+{
+ return K4REPL2S(pow,x,a);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4sin(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(sin,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4sinh(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(sinh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4tan(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(tan,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4tanh(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(tanh,x);
+}
+
+
+
+#define k4lfalse (vec4_set1i( 0))
+#define k4ltrue (vec4_set1i(~0))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4lnot(CCTK_BOOLEAN4_VEC const x)
+{
+ return _mm256_xor_ps(k4ltrue, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4land(CCTK_BOOLEAN4_VEC const x, CCTK_BOOLEAN4_VEC const y)
+{
+ return _mm256_and_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4lor(CCTK_BOOLEAN4_VEC const x, CCTK_BOOLEAN4_VEC const y)
+{
+ return _mm256_or_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4lxor(CCTK_BOOLEAN4_VEC const x, CCTK_BOOLEAN4_VEC const y)
+{
+ return _mm256_xor_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4ifthen(CCTK_BOOLEAN4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return _mm256_blendv_ps(z, y, x);
+}
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4cmpeq(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_cmp_ps(x, y, _CMP_EQ_OQ);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4cmpne(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_cmp_ps(x, y, _CMP_NEQ_OQ);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4cmpgt(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_cmp_ps(x, y, _CMP_GT_OQ);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4cmpge(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_cmp_ps(x, y, _CMP_GE_OQ);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4cmplt(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_cmp_ps(x, y, _CMP_LT_OQ);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4cmple(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_cmp_ps(x, y, _CMP_LE_OQ);
+}
+
+
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4sgn(CCTK_REAL4_VEC const x)
+{
+ CCTK_BOOLEAN4_VEC const iszero = k4cmpeq(x, vec4_set1(0.0));
+ CCTK_REAL4_VEC const sign = _mm256_and_ps(k4sign, x);
+ CCTK_REAL4_VEC const signedone = _mm256_or_ps(sign, vec4_set1(1.0));
+ return k4ifthen(iszero, vec4_set1(0.0), signedone);
+}
diff --git a/src/vectors-4-SSE.h b/src/vectors-4-SSE.h
index e420140..7d0d9c3 100644
--- a/src/vectors-4-SSE.h
+++ b/src/vectors-4-SSE.h
@@ -1,12 +1,19 @@
// Vectorise using Intel's or AMD's SSE
// Use the type __m128 directly, without introducing a wrapper class
-// Use macros instead of inline functions
+#ifdef __PGI
+// PGI doesn't want to inline functions
+# include "macros/vectors-4-SSE.h"
+#else
+
+
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstddef>
-#include <assert.h>
-#include <math.h>
#include <xmmintrin.h>
#ifdef __SSE4_1__
@@ -18,7 +25,7 @@
# include <ammintrin.h>
#endif
#ifdef __FMA4__
-# include <fma4intrin.h>
+# include <x86intrin.h>
#endif
@@ -43,98 +50,121 @@
// Vector type corresponding to CCTK_REAL
-#define CCTK_REAL4_VEC __m128
+typedef __m128 CCTK_REAL4_VEC;
+typedef __m128i CCTK_INTEGER4_VEC;
+typedef __m128 CCTK_BOOLEAN4_VEC;
// Number of vector elements in a CCTK_REAL_VEC
#define CCTK_REAL4_VEC_SIZE 4
+vec_static_assert(sizeof(CCTK_REAL4_VEC) ==
+ sizeof(CCTK_REAL4) * CCTK_REAL4_VEC_SIZE);
+
// Integer and boolean types corresponding to this real type
-#define CCTK_INTEGER4 CCTK_REAL4
-#define CCTK_BOOLEAN4 CCTK_REAL4
-#define CCTK_INTEGER4_VEC CCTK_REAL4_VEC
-#define CCTK_BOOLEAN4_VEC CCTK_REAL4_VEC
+typedef CCTK_INT4 CCTK_INTEGER4;
+typedef CCTK_REAL4 CCTK_BOOLEAN4;
union k4const_t {
- unsigned i[4];
- float f[4];
- __m128i vi;
- __m128 vf;
+ CCTK_INTEGER4 i[CCTK_REAL4_VEC_SIZE];
+ CCTK_REAL4 f[CCTK_REAL4_VEC_SIZE];
+ CCTK_INTEGER4_VEC vi;
+ CCTK_REAL4_VEC vf;
};
-#define K4_ZERO 0x00000000UL
-#define K4_IMIN 0x80000000UL
-#define K4_IMAX 0x7fffffffUL
+#define k4sign (vec4_set1i( (CCTK_INTEGER4)(1UL << 31UL)))
+#define k4notsign (vec4_set1i(~ (CCTK_INTEGER4)(1UL << 31UL)))
// Create vectors, extract vector elements
-#define vec4_set1(a) (_mm_set1_ps(a))
-#define vec4_set(a,b,c,d) (_mm_set_ps(d,c,b,a)) // note reversed arguments
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_set1(CCTK_REAL4 const a)
+{
+ return _mm_set1_ps(a);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_set1i(CCTK_INT4 const a)
+{
+ return _mm_castsi128_ps(_mm_set1_epi32(a));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_set(CCTK_REAL4 const a,
+ CCTK_REAL4 const b,
+ CCTK_REAL4 const c,
+ CCTK_REAL4 const d)
+{
+ return _mm_set_ps(d,c,b,a); // note reversed arguments
+}
// original order is 0123
-#define vec4_swap1032(x_) \
- ({ \
- CCTK_REAL4_VEC const x__=(x_); \
- CCTK_REAL4_VEC const x=x__; \
- _mm_shuffle_ps(x,x, _MM_SHUFFLE(2,3,0,1)); \
- })
-#define vec4_swap2301(x_) \
- ({ \
- CCTK_REAL4_VEC const x__=(x_); \
- CCTK_REAL4_VEC const x=x__; \
- _mm_shuffle_ps(x,x, _MM_SHUFFLE(1,0,3,2)); \
- })
-#define vec4_swap3210(x_) \
- ({ \
- CCTK_REAL4_VEC const x__=(x_); \
- CCTK_REAL4_VEC const x=x__; \
- _mm_shuffle_ps(x,x, _MM_SHUFFLE(0,1,2,3)); \
- })
-
-#if defined(__PGI)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_swap1032(CCTK_REAL4_VEC const x)
+{
+ return _mm_shuffle_ps(x, x, _MM_SHUFFLE(2,3,0,1));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_swap2301(CCTK_REAL4_VEC const x)
+{
+ return _mm_shuffle_ps(x, x, _MM_SHUFFLE(1,0,3,2));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_swap3210(CCTK_REAL4_VEC const x)
+{
+ return _mm_shuffle_ps(x, x, _MM_SHUFFLE(0,1,2,3));
+}
+
+#if defined __PGI
// _mm_cvtss_f32 does not exist on PGI compilers
-# define vec4_elt0(x) \
- ({ \
- CCTK_REAL4 a; \
- asm ("" : "=x" (a) : "0" (x)); \
- a; \
- })
-#else
-# define vec4_elt0(x) (_mm_cvtss_f32(x)) // this is a no-op
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4 _mm_cvtss_f32(CCTK_REAL4_VEC const x)
+{
+ CCTK_REAL4 a;
+ asm ("" : "=x" (a) : "0" (x));
+ return a;
+}
#endif
-#define vec4_elt1(x) vec4_elt0(vec4_swap1032(x))
-#define vec4_elt2(x) vec4_elt0(vec4_swap2301(x))
-#define vec4_elt3(x) vec4_elt0(vec4_swap3210(x))
-#if defined(__PGI)
-# define vec4_elt(x_,d) \
- ({ \
- CCTK_REAL4_VEC const x__=(x_); \
- CCTK_REAL4_VEC const x=x__; \
- CCTK_REAL4 a; \
- if (d==0) a=vec4_elt0(x); \
- else if (d==1) a=vec4_elt1(x); \
- else if (d==2) a=vec4_elt2(x); \
- else if (d==3) a=vec4_elt3(x); \
- a; \
- })
+
+// TODO: Why not ((CCTK_REAL4 const*)&x)[d] ?
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4 vec4_elt0(CCTK_REAL4_VEC const x)
+{
+ return _mm_cvtss_f32(x); // this is a no-op
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4 vec4_elt1(CCTK_REAL4_VEC const x)
+{
+ return vec4_elt0(vec4_swap1032(x));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4 vec4_elt2(CCTK_REAL4_VEC const x)
+{
+ return vec4_elt0(vec4_swap2301(x));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4 vec4_elt3(CCTK_REAL4_VEC const x)
+{
+ return vec4_elt0(vec4_swap3210(x));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4 vec4_elt(CCTK_REAL4_VEC const x, std::ptrdiff_t const d)
+{
+#if defined __PGI
+ if (d==0) return vec4_elt0(x);
+ if (d==1) return vec4_elt1(x);
+ if (d==2) return vec4_elt2(x);
+ return vec4_elt3(x);
#else
-# define vec4_elt(x_,d) \
- ({ \
- CCTK_REAL4_VEC const x__=(x_); \
- CCTK_REAL4_VEC const x=x__; \
- CCTK_REAL4 a; \
- switch (d) { \
- case 0: a=vec4_elt0(x); break; \
- case 1: a=vec4_elt1(x); break; \
- case 2: a=vec4_elt2(x); break; \
- case 3: a=vec4_elt3(x); break; \
- } \
- a; \
- })
+ switch (d) {
+ case 0: return vec4_elt0(x);
+ case 1: return vec4_elt1(x);
+ case 2: return vec4_elt2(x);
+ }
+ return vec4_elt3(x);
#endif
+}
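
For reference, a minimal standalone sketch (not part of the patch) of the shuffle-based extraction that the inline functions above implement. It assumes an SSE-capable compiler and uses only intrinsics that already appear in this diff.

    #include <xmmintrin.h>
    #include <cstdio>

    int main()
    {
      __m128 const v = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); // lanes 0,1,2,3
      // vec4_elt2: rotate as in vec4_swap2301, then read lane 0
      __m128 const s = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1,0,3,2));
      std::printf("elt2 = %g\n", _mm_cvtss_f32(s));         // prints 2
      return 0;
    }
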
@@ -142,318 +172,540 @@ union k4const_t {
// Load a vector from memory (aligned and unaligned); this loads from
// a reference to a scalar
-#define vec4_load(p) (_mm_load_ps(&(p)))
-#define vec4_loadu(p) (_mm_loadu_ps(&(p)))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_load(CCTK_REAL4 const& p)
+{
+ return _mm_load_ps(&p);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_loadu(CCTK_REAL4 const& p)
+{
+ return _mm_loadu_ps(&p);
+}
#if ! VECTORISE_ALWAYS_USE_ALIGNED_LOADS
-# define vec4_load_off1(p) vec_loadu(p)
-# define vec4_load_off2(p) vec_loadu(p)
-# define vec4_load_off3(p) vec_loadu(p)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_load_off1(CCTK_REAL4 const& p)
+{
+ return vec4_loadu(p);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_load_off2(CCTK_REAL4 const& p)
+{
+ return vec4_loadu(p);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_load_off3(CCTK_REAL4 const& p)
+{
+ return vec4_loadu(p);
+}
#else
-# define vec4_load_off1(p_) \
- ({ \
- CCTK_REAL4 const& p__=(p_); \
- CCTK_REAL4 const& p=p__; \
- CCTK_REAL4_VEC const lo=vec4_load((&p)[-1]); \
- CCTK_REAL4_VEC const hi=vec4_load((&p)[+3]); \
- assert(0); \
- CCTK_REAL4_VEC const hi2=_mm_shuffle_ps(lo,hi, _MM_SHUFFLE(0,1,2,3)); \
- _mm_shuffle_ps(lo,hi2, _MM_SHUFFLE(2,1,3,0)); \
- })
-# define vec4_load_off2(p_) \
- ({ \
- CCTK_REAL4 const& p__=(p_); \
- CCTK_REAL4 const& p=p__; \
- CCTK_REAL4_VEC const lo=vec4_load((&p)[-2]); \
- CCTK_REAL4_VEC const hi=vec4_load((&p)[+2]); \
- _mm_shuffle_ps(lo,hi, _MM_SHUFFLE(1,0,3,2)); \
- })
-# define vec4_load_off1(p_) \
- ({ \
- CCTK_REAL4 const& p__=(p_); \
- CCTK_REAL4 const& p=p__; \
- CCTK_REAL4_VEC const lo=vec4_load((&p)[-1]); \
- CCTK_REAL4_VEC const hi=vec4_load((&p)[+3]); \
- assert(0); \
- CCTK_REAL4_VEC const lo2=_mm_shuffle_ps(lo,hi, _MM_SHUFFLE(0,1,2,3)); \
- _mm_shuffle_ps(lo2,hi, _MM_SHUFFLE(3,0,2,1)); \
- })
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_load_off1(CCTK_REAL4 const& p)
+{
+ CCTK_REAL4_VEC const lo = vec4_load((&p)[-1]);
+ CCTK_REAL4_VEC const hi = vec4_load((&p)[+3]);
+ CCTK_REAL4_VEC const hi2 = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(0,1,2,3));
+ return _mm_shuffle_ps(lo, hi2, _MM_SHUFFLE(2,1,3,0));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_load_off2(CCTK_REAL4 const& p)
+{
+ CCTK_REAL4_VEC const lo = vec4_load((&p)[-2]);
+ CCTK_REAL4_VEC const hi = vec4_load((&p)[+2]);
+ return _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(1,0,3,2));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_load_off3(CCTK_REAL4 const& p)
+{
+ CCTK_REAL4_VEC const lo = vec4_load((&p)[-1]);
+ CCTK_REAL4_VEC const hi = vec4_load((&p)[+3]);
+ CCTK_REAL4_VEC const lo2 = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(0,1,2,3));
+ return _mm_shuffle_ps(lo2, hi, _MM_SHUFFLE(3,0,2,1));
+}
#endif
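
A standalone sketch of the off2 case above (assumptions: a GCC-compatible compiler and a 16-byte aligned buffer): two aligned loads that straddle the wanted window are merged with a single shuffle, so no unaligned access is issued.

    #include <xmmintrin.h>
    #include <cstdio>

    int main()
    {
      __attribute__((aligned(16))) float a[8] = {0,1,2,3,4,5,6,7};
      float const& p = a[2];                     // vector wanted at offset 2
      __m128 const lo = _mm_load_ps(&(&p)[-2]);  // aligned load of a[0..3]
      __m128 const hi = _mm_load_ps(&(&p)[+2]);  // aligned load of a[4..7]
      __m128 const r  = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(1,0,3,2));
      float out[4];
      _mm_storeu_ps(out, r);
      std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 2 3 4 5
      return 0;
    }
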
// Load a vector from memory that may or may not be aligned, as
// decided by the offset off and the vector size
#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
// Implementation: Always use unaligned load
-# define vec4_loadu_maybe(off,p) vec4_loadu(p)
-# define vec4_loadu_maybe3(off1,off2,off3,p) vec4_loadu(p)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL4 const& p)
+{
+ return vec4_loadu(p);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_loadu_maybe3(std::ptrdiff_t const off1,
+ std::ptrdiff_t const off2,
+ std::ptrdiff_t const off3,
+ CCTK_REAL4 const& p)
+{
+ return vec4_loadu(p);
+}
#else
-# define vec4_loadu_maybe(off,p_) \
- ({ \
- CCTK_REAL4 const& p__=(p_); \
- CCTK_REAL4 const& p=p__; \
- (off) % CCTK_REAL4_VEC_SIZE == 0 ? \
- vec4_load(p) : \
- vec4_loadu(p); \
- })
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL4 const& p)
+{
+  // The ?: operator probably breaks with the Intel compiler
+ //return off % CCTK_REAL4_VEC_SIZE == 0 ? vec4_load(p) : vec4_loadu(p);
+ if (off % CCTK_REAL4_VEC_SIZE == 0) return vec4_load(p);
+ return vec4_loadu(p);
+}
# if VECTORISE_ALIGNED_ARRAYS
// Assume all array x sizes are multiples of the vector size
-# define vec4_loadu_maybe3(off1,off2,off3,p) \
- vec4_loadu_maybe(off1,p)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_loadu_maybe3(std::ptrdiff_t const off1,
+ std::ptrdiff_t const off2,
+ std::ptrdiff_t const off3,
+ CCTK_REAL4 const& p)
+{
+ return vec4_loadu_maybe(off1, p);
+}
# else
-# define vec4_loadu_maybe3(off1,off2,off3,p) \
- vec4_loadu_maybe((off1)|(off2)|(off3),p)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_loadu_maybe3(std::ptrdiff_t const off1,
+ std::ptrdiff_t const off2,
+ std::ptrdiff_t const off3,
+ CCTK_REAL4 const& p)
+{
+ return
+ off2 % CCTK_REAL4_VEC_SIZE != 0 or
+ off3 % CCTK_REAL4_VEC_SIZE != 0 ?
+ vec4_loadu(p) :
+ vec4_loadu_maybe(off1, p);
+}
# endif
#endif
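
The decision logic above, modelled on scalars (a sketch only; the offsets and vector size below are arbitrary example values): a misaligned off2 or off3 forces an unaligned load, otherwise off1 alone selects between vec4_load and vec4_loadu.

    #include <cstddef>
    #include <cstdio>

    static bool needs_unaligned(std::ptrdiff_t off1, std::ptrdiff_t off2,
                                std::ptrdiff_t off3, std::ptrdiff_t vecsize)
    {
      if (off2 % vecsize != 0 or off3 % vecsize != 0) return true;
      return off1 % vecsize != 0;
    }

    int main()
    {
      std::printf("%d %d %d\n",
                  (int)needs_unaligned(4, 8, 12, 4),  // 0: aligned load
                  (int)needs_unaligned(5, 8, 12, 4),  // 1: off1 misaligned
                  (int)needs_unaligned(4, 8, 13, 4)); // 1: off3 misaligned
      return 0;
    }
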
// Store a vector to memory (aligned and non-temporal); this stores to
// a reference to a scalar
-#define vec4_store(p,x) (_mm_store_ps(&(p),x))
-#define vec4_storeu(p,x) (_mm_storeu_ps(&(p),x))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store(CCTK_REAL4& p, CCTK_REAL4_VEC const x)
+{
+  _mm_store_ps(&p, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_storeu(CCTK_REAL4& p, CCTK_REAL4_VEC const x)
+{
+  _mm_storeu_ps(&p, x);
+}
#if ! VECTORISE_STREAMING_STORES
-# define vec4_store_nta(p,x) vec4_store(p,x)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta(CCTK_REAL4& p, CCTK_REAL4_VEC const x)
+{
+  vec4_store(p, x);
+}
#else
-# define vec4_store_nta(p,x) (_mm_stream_ps(&(p),x))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta(CCTK_REAL4& p, CCTK_REAL4_VEC const x)
+{
+  _mm_stream_ps(&p, x);
+}
#endif
// Store a partial vector (aligned and non-temporal)
-#define vec4_store_partial_prepare(i,imin,imax) \
- int v4stp_lo_skip = (imin)-(i); \
- int v4stp_hi_skip = (i)+CCTK_REAL_VEC_SIZE-(imax); \
- if (CCTK_BUILTIN_EXPECT(v4stp_lo_skip < 0, true)) v4stp_lo_skip = 0; \
- if (CCTK_BUILTIN_EXPECT(v4stp_hi_skip < 0, true)) v4stp_hi_skip = 0;
-// Ignoring VECTORISE_STREAMING_STORES for partial stores
-#define vec4_store_nta_partial(p_,x_) \
- ({ \
- CCTK_REAL4& p__=(p_); \
- CCTK_REAL4& p=p__; \
- CCTK_REAL4_VEC const x__=(x_); \
- CCTK_REAL4_VEC const x=x__; \
- if (CCTK_BUILTIN_EXPECT(v4stp_lo_skip==0 and v4stp_hi_skip==0, true)) { \
- vec4_store_nta(p,x); \
- } else { \
- /* these cases fall through */ \
- switch (v4stp_lo_skip) { \
- case 0: \
- (&p)[0] = vec4_elt0(x); \
- case 1: \
- if (v4stp_hi_skip>=3) break; \
- (&p)[1] = vec4_elt1(x); \
- case 2: \
- if (v4stp_hi_skip>=2) break; \
- (&p)[2] = vec4_elt2(x); \
- case 3: \
- if (v4stp_hi_skip>=1) break; \
- (&p)[3] = vec4_elt3(x); \
- } \
- } \
- })
-
-// Ignoring VECTORISE_STREAMING_STORES for partial stores
-#define vec4_store_nta_partial_lo(p_,x_,n) \
- ({ \
- CCTK_REAL4 & p__=(p_); \
- CCTK_REAL4_VEC const x__=(x_); \
- CCTK_REAL4 & p=p__; \
- CCTK_REAL4_VEC const x=x__; \
- /* these cases fall through */ \
- switch (n) { \
- case 3: (&p)[2] = vec4_elt2(x); \
- case 2: (&p)[1] = vec4_elt1(x); \
- case 1: (&p)[0] = vec4_elt0(x); \
- } \
- })
-#define vec4_store_nta_partial_hi(p_,x_,n) \
- ({ \
- CCTK_REAL4 & p__=(p_); \
- CCTK_REAL4_VEC const x__=(x_); \
- CCTK_REAL4 & p=p__; \
- CCTK_REAL4_VEC const x=x__; \
- /* these cases fall through */ \
- switch (n) { \
- case 3: (&p)[1]=vec4_elt1(x); \
- case 2: (&p)[2]=vec4_elt2(x); \
- case 1: (&p)[3]=vec4_elt3(x); \
- } \
- })
-#define vec4_store_nta_partial_mid(p_,x_,nlo,nhi) \
- ({ \
- CCTK_REAL4 & p__=(p_); \
- CCTK_REAL4_VEC const x__=(x_); \
- CCTK_REAL4 & p=p__; \
- CCTK_REAL4_VEC const x=x__; \
- /* these cases fall through */ \
- switch (nhi) { \
- case 3: if (nlo<2) break; (&p)[1] = vec4_elt1(x); \
- case 2: if (nlo<3) break; (&p)[2] = vec4_elt2(x); \
- } \
- })
+// We ignore VECTORISE_STREAMING_STORES for partial stores
+#define vec4_store_partial_prepare(i, imin, imax) \
+ std::ptrdiff_t v4stp_lo_skip, v4stp_hi_skip; \
+ vec4_store_partial_prepare_(v4stp_lo_skip, v4stp_hi_skip, i, imin, imax)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_partial_prepare_(std::ptrdiff_t& lo_skip,
+ std::ptrdiff_t& hi_skip,
+ std::ptrdiff_t const i,
+ std::ptrdiff_t const imin,
+ std::ptrdiff_t const imax)
+{
+ lo_skip = std::max(std::ptrdiff_t(0), imin - i);
+ hi_skip = std::max(std::ptrdiff_t(0), i+CCTK_REAL4_VEC_SIZE - imax);
+}
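
A scalar sketch of how lo_skip and hi_skip behave over a loop with example bounds imin=1, imax=11 (values chosen only for illustration):

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    int main()
    {
      std::ptrdiff_t const vecsize = 4;
      std::ptrdiff_t const imin = 1, imax = 11;
      for (std::ptrdiff_t i = 0; i < 12; i += vecsize) {
        std::ptrdiff_t const lo_skip = std::max(std::ptrdiff_t(0), imin - i);
        std::ptrdiff_t const hi_skip = std::max(std::ptrdiff_t(0), i + vecsize - imax);
        // i=0: skip element 0;  i=4: full store;  i=8: skip element 3
        std::printf("i=%2ld  lo_skip=%ld  hi_skip=%ld\n",
                    (long)i, (long)lo_skip, (long)hi_skip);
      }
      return 0;
    }
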
+#define vec4_store_nta_partial(p, x) \
+  vec4_store_nta_partial_(v4stp_lo_skip, v4stp_hi_skip, p, x)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta_partial_(std::ptrdiff_t const lo_skip,
+ std::ptrdiff_t const hi_skip,
+ CCTK_REAL4& p,
+ CCTK_REAL4_VEC const x)
+{
+ if (CCTK_BUILTIN_EXPECT(lo_skip==0 and hi_skip==0, true)) {
+ vec4_store_nta(p, x);
+ } else {
+ // these cases fall through
+ switch (lo_skip) {
+ case 0:
+ (&p)[0] = vec4_elt0(x);
+ case 1:
+ if (hi_skip>=3) break;
+ (&p)[1] = vec4_elt1(x);
+ case 2:
+ if (hi_skip>=2) break;
+ (&p)[2] = vec4_elt2(x);
+ case 3:
+ if (hi_skip>=1) break;
+ (&p)[3] = vec4_elt3(x);
+ }
+ }
+}
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta_partial_lo(CCTK_REAL4& p,
+ CCTK_REAL4_VEC const x,
+ std::ptrdiff_t const n)
+{
+ // these cases fall through
+ switch (n) {
+ case 3: (&p)[2] = vec4_elt2(x);
+ case 2: (&p)[1] = vec4_elt1(x);
+ case 1: (&p)[0] = vec4_elt0(x);
+ }
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta_partial_hi(CCTK_REAL4& p,
+ CCTK_REAL4_VEC const x,
+ std::ptrdiff_t const n)
+{
+ // these cases fall through
+ switch (n) {
+ case 3: (&p)[1]=vec4_elt1(x);
+ case 2: (&p)[2]=vec4_elt2(x);
+ case 1: (&p)[3]=vec4_elt3(x);
+ }
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta_partial_mid(CCTK_REAL4& p,
+                                CCTK_REAL4_VEC const x,
+                                std::ptrdiff_t const nlo,
+                                std::ptrdiff_t const nhi)
+{
+ // these cases fall through
+ switch (nhi) {
+ case 3:
+ if (nlo<2) break;
+ (&p)[1] = vec4_elt1(x);
+ case 2:
+ if (nlo<3) break;
+ (&p)[2] = vec4_elt2(x);
+ }
+}
// Functions and operators
-static const k4const_t k4sign_mask = {{ K4_IMIN, K4_IMIN, K4_IMIN, K4_IMIN, }};
-
// Operators
-#define k4neg(x) (_mm_xor_ps(k4sign_mask.vf,x))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4neg(CCTK_REAL4_VEC const x)
+{
+ return _mm_xor_ps(k4sign, x);
+}
// #define k4inv(x)
// TODO: provide k4inv via rcp and Newton-Raphson
// This is described in AMD's publication 47414.
-// This should apply for AVX as well.
-
-#define k4add(x,y) (_mm_add_ps(x,y))
-#define k4sub(x,y) (_mm_sub_ps(x,y))
-#define k4mul(x,y) (_mm_mul_ps(x,y))
-// TODO: use k4inv and k4mul instead
-#define k4div(x,y) (_mm_div_ps(x,y))
+// This should apply to AVX as well.
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4add(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm_add_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4sub(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm_sub_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4mul(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm_mul_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4div(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ // TODO: maybe use k4inv and k4mul instead
+ return _mm_div_ps(x, y);
+}
// Fused multiply-add, defined as [+-]x*y[+-]z
#ifdef __FMA4__
-# define k4madd(x,y,z) (_mm_macc_ps(x,y,z))
-# define k4msub(x,y,z) (_mm_msub_ps(x,y,z))
-# define k4nmadd(x,y,z) (_mm_nmsub_ps(x,y,z))
-# define k4nmsub(x,y,z) (_mm_nmacc_ps(x,y,z))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4madd(CCTK_REAL4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return _mm_macc_ps(x, y, z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4msub(CCTK_REAL4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return _mm_msub_ps(x, y, z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4nmadd(CCTK_REAL4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return _mm_nmsub_ps(x, y, z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4nmsub(CCTK_REAL4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return _mm_nmacc_ps(x, y, z);
+}
#else
-# define k4madd(x,y,z) (k4add(k4mul(x,y),z))
-# define k4msub(x,y,z) (k4sub(k4mul(x,y),z))
-# define k4nmadd(x,y,z) (k4sub(k4neg(z),k4mul(x,y)))
-# define k4nmsub(x,y,z) (k4sub(z,k4mul(x,y)))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4madd(CCTK_REAL4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return k4add(k4mul(x, y), z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4msub(CCTK_REAL4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return k4sub(k4mul(x, y), z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4nmadd(CCTK_REAL4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return k4sub(k4neg(z), k4mul(x, y));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4nmsub(CCTK_REAL4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return k4sub(z, k4mul(x, y));
+}
#endif
// Cheap functions
-#define k4copysign(x,y) \
- (_mm_or_ps(_mm_andnot_ps(k4sign_mask.vf,x), \
- _mm_and_ps(k4sign_mask.vf,y)))
-#define k4fabs(x) (_mm_andnot_ps(k4sign_mask.vf,x))
-#define k4fmax(x,y) (_mm_max_ps(x,y))
-#define k4fmin(x,y) (_mm_min_ps(x,y))
-#define k4fnabs(x) (_mm_or_ps(k4sign_mask.vf,x))
-static const k4const_t k4zero = { f: { 0.0f, 0.0f, 0.0f, 0.0f, }};
-static const k4const_t k4one = { f: { 1.0f, 1.0f, 1.0f, 1.0f, }};
-#define k4sgn(x_) \
- ({ \
- CCTK_REAL_VEC const x__=(x_); \
- CCTK_REAL_VEC const x=x__; \
- CCTK_REAL_VEC const iszero = _mm_cmpeq_ps(k4zero.vf, x); \
- CCTK_REAL_VEC const sign = _mm_and_ps(k4sign_mask.vf, x); \
- CCTK_REAL_VEC const signedone = _mm_or_ps(k4one.vf, sign); \
- k4ifthen(iszero, k4zero.vf, signedone); \
- })
-// TODO: maybe use rsqrt and Newton-Raphson
-#define k4sqrt(x) (_mm_sqrt_ps(x))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4copysign(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm_or_ps(_mm_and_ps(k4notsign, x),
+ _mm_and_ps(k4sign , y));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4fabs(CCTK_REAL4_VEC const x)
+{
+ return _mm_and_ps(k4notsign, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4fmax(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm_max_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4fmin(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm_min_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4fnabs(CCTK_REAL4_VEC const x)
+{
+ return _mm_or_ps(k4sign, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4sqrt(CCTK_REAL4_VEC const x)
+{
+ // TODO: maybe use rsqrt and Newton-Raphson
+ return _mm_sqrt_ps(x);
+}
// Expensive functions
-#define K4REPL(f,x_) \
- ({ \
- CCTK_REAL4_VEC const x__=(x_); \
- CCTK_REAL4_VEC const x=x__; \
- vec4_set(f(vec4_elt0(x)), \
- f(vec4_elt1(x)), \
- f(vec4_elt2(x)), \
- f(vec4_elt3(x))); \
- })
-#define K4REPL2S(f,x_,a_) \
- ({ \
- CCTK_REAL4_VEC const x__=(x_); \
- CCTK_REAL4 const a__=(a_); \
- CCTK_REAL4_VEC const x=x__; \
- CCTK_REAL4 const a=a__; \
- vec4_set(f(vec4_elt0(x),a), \
- f(vec4_elt1(x),a), \
- f(vec4_elt2(x),a), \
- f(vec4_elt3(x),a)); \
- })
-#define K4REPL2(f,x_,y_) \
- ({ \
- CCTK_REAL4_VEC const x__=(x_); \
- CCTK_REAL4_VEC const y__=(y_); \
- CCTK_REAL4_VEC const x=x__; \
- CCTK_REAL4_VEC const y=y__; \
- vec4_set(f(vec4_elt0(x),vec4_elt0(y)), \
- f(vec4_elt1(x),vec4_elt1(y)), \
- f(vec4_elt2(x),vec4_elt2(y)), \
- f(vec4_elt3(x),vec4_elt3(y))); \
- })
-
-#define k4acos(x) K4REPL(acosf,x)
-#define k4acosh(x) K4REPL(acoshf,x)
-#define k4asin(x) K4REPL(asinf,x)
-#define k4asinh(x) K4REPL(asinhf,x)
-#define k4atan(x) K4REPL(atanf,x)
-#define k4atan2(x,y) K4REPL2(atan2f,x,y)
-#define k4atanh(x) K4REPL(atanhf,x)
-#define k4cos(x) K4REPL(cosf,x)
-#define k4cosh(x) K4REPL(coshf,x)
-#define k4exp(x) K4REPL(expf,x)
-#define k4log(x) K4REPL(logf,x)
-#define k4pow(x,a) K4REPL2S(powf,x,a)
-#define k4sin(x) K4REPL(sinf,x)
-#define k4sinh(x) K4REPL(sinhf,x)
-#define k4tan(x) K4REPL(tanf,x)
-#define k4tanh(x) K4REPL(tanhf,x)
-
-static const k4const_t k4lfalse_ = {{ 0U, 0U, 0U, 0U, }};
-static const k4const_t k4ltrue_ = {{ ~0U, ~0U, ~0U, ~0U, }};
-#define k4lfalse (k4lfalse_.vf)
-#define k4ltrue (k4ltrue_.vf)
-#define k4lnot(x) (_mm_xor_ps(k4ltrue,x))
-#define k4land(x,y) (_mm_and_ps(x,y))
-#define k4lor(x,y) (_mm_or_ps(x,y))
-#define k4lxor(x,y) (_mm_xor_ps(x,y))
-
+#define K4REPL(f,x) \
+ vec4_set(f(vec4_elt0(x)), \
+ f(vec4_elt1(x)), \
+ f(vec4_elt2(x)), \
+           f(vec4_elt3(x)))
+#define K4REPL2S(f,x,a) \
+ vec4_set(f(vec4_elt0(x),a), \
+ f(vec4_elt1(x),a), \
+ f(vec4_elt2(x),a), \
+           f(vec4_elt3(x),a))
+#define K4REPL2(f,x,y) \
+ vec4_set(f(vec4_elt0(x),vec4_elt0(y)), \
+ f(vec4_elt1(x),vec4_elt1(y)), \
+ f(vec4_elt2(x),vec4_elt2(y)), \
+           f(vec4_elt3(x),vec4_elt3(y)))
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4acos(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(acos,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4acosh(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(acosh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4asin(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(asin,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4asinh(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(asinh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4atan(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(atan,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4atan2(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return K4REPL2(atan2,x,y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4atanh(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(atanh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4cos(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(cos,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4cosh(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(cosh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4exp(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(exp,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4log(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(log,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4pow(CCTK_REAL4_VEC const x, CCTK_REAL4 const a)
+{
+ return K4REPL2S(pow,x,a);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4sin(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(sin,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4sinh(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(sinh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4tan(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(tan,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4tanh(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(tanh,x);
+}
+
+
+
+#define k4lfalse (vec4_set1i( 0))
+#define k4ltrue (vec4_set1i(~0))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4lnot(CCTK_BOOLEAN4_VEC const x)
+{
+ return _mm_xor_ps(k4ltrue, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4land(CCTK_BOOLEAN4_VEC const x, CCTK_BOOLEAN4_VEC const y)
+{
+ return _mm_and_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4lor(CCTK_BOOLEAN4_VEC const x, CCTK_BOOLEAN4_VEC const y)
+{
+ return _mm_or_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4lxor(CCTK_BOOLEAN4_VEC const x, CCTK_BOOLEAN4_VEC const y)
+{
+ return _mm_xor_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4ifthen(CCTK_BOOLEAN4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
#ifdef __SSE4_1__
-# define k4ifthen(x,y,z) (_mm_blendv_ps(z,y,x))
+ return _mm_blendv_ps(z,y,x);
#elif 0
-# ifdef __cplusplus
-# define k4signbit(x) ({ using namespace std; signbit(x); })
-# else
-# define k4signbit(x) (signbitf(x))
-# endif
-# define k4ifthen(x,y,z) \
- ({ \
- CCTK_REAL4_VEC const x__=(x_); \
- CCTK_REAL4_VEC const y__=(y_); \
- CCTK_REAL4_VEC const z__=(z_); \
- CCTK_REAL4_VEC const x=x__; \
- CCTK_REAL4_VEC const y=y__; \
- CCTK_REAL4_VEC const z=z__; \
- vec4_set(k4signbit(vec4_elt0(x)) ? vec4_elt0(y) : vec4_elt0(z), \
- k4signbit(vec4_elt1(x)) ? vec4_elt1(y) : vec4_elt1(z), \
- k4signbit(vec4_elt2(x)) ? vec4_elt2(y) : vec4_elt2(z), \
- k4signbit(vec4_elt3(x)) ? vec4_elt3(y) : vec4_elt3(z)); \
- })
+ return vec4_set(std::signbit(vec4_elt0(x)) ? vec4_elt0(y) : vec4_elt0(z),
+ std::signbit(vec4_elt1(x)) ? vec4_elt1(y) : vec4_elt1(z),
+ std::signbit(vec4_elt2(x)) ? vec4_elt2(y) : vec4_elt2(z),
+ std::signbit(vec4_elt3(x)) ? vec4_elt3(y) : vec4_elt3(z));
#elif 0
-// We don't need to shift -- the condition (mask) will be either all
-// zeros or all ones
-# define k4ifthen(x_,y_,z_) \
- ({ \
- CCTK_REAL4_VEC const x__=(x_); \
- CCTK_REAL4_VEC const y__=(y_); \
- CCTK_REAL4_VEC const z__=(z_); \
- CCTK_REAL4_VEC const x=x__; \
- CCTK_REAL4_VEC const y=y__; \
- CCTK_REAL4_VEC const z=z__; \
- CCTK_REAL4_VEC const mask = \
- (__m128)_mm_srai_epi32((__m128i)x, 31); \
- /* (z & ~mask) | (y & mask) */ \
- _mm_or_ps(_mm_andnot_ps(mask, z), _mm_and_ps(mask, y)); \
- })
+ // We don't need to shift -- the condition (mask) will be either all
+ // zeros or all ones
+ CCTK_REAL4_VEC const mask = (__m128)_mm_srai_epi32((__m128i)x, 31);
+ // (z & ~mask) | (y & mask)
+ return _mm_or_ps(_mm_andnot_ps(mask, z), _mm_and_ps(mask, y));
#else
-# define k4ifthen(x_,y_,z_) \
- ({ \
- CCTK_REAL4_VEC const x__=(x_); \
- CCTK_REAL4_VEC const y__=(y_); \
- CCTK_REAL4_VEC const z__=(z_); \
- CCTK_REAL4_VEC const x=x__; \
- CCTK_REAL4_VEC const y=y__; \
- CCTK_REAL4_VEC const z=z__; \
- /* (z & ~mask) | (y & mask) where imask = ~mask */ \
- _mm_or_ps(_mm_and_ps(x, y), _mm_andnot_ps(x, z)); \
- })
+ // This assumes that all logical operations always return either
+ // lfalse or ltrue, and nothing "in between"
+ // (z & ~mask) | (y & mask) where imask = ~mask
+ return _mm_or_ps(_mm_and_ps(x, y), _mm_andnot_ps(x, z));
#endif
+}
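
The final fallback is the classic bitwise select; a scalar sketch with 32-bit masks (assuming, as the comment above says, that the mask is either all zeros or all ones):

    #include <cstdio>

    // (x & y) | (~x & z): take y where mask x is all ones, z where it is all zeros
    static unsigned select32(unsigned x, unsigned y, unsigned z)
    {
      return (x & y) | (~x & z);
    }

    int main()
    {
      std::printf("%08x %08x\n",
                  select32(0xffffffffu, 0x11111111u, 0x22222222u),  // 11111111
                  select32(0x00000000u, 0x11111111u, 0x22222222u)); // 22222222
      return 0;
    }
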
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4cmpeq(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm_cmpeq_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4cmpne(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm_cmpneq_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4cmpgt(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm_cmpgt_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4cmpge(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm_cmpge_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4cmplt(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm_cmplt_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4cmple(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm_cmple_ps(x, y);
+}
+
+
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4sgn(CCTK_REAL4_VEC const x)
+{
+ CCTK_BOOLEAN4_VEC const iszero = k4cmpeq(x, vec4_set1(0.0));
+ CCTK_REAL4_VEC const sign = _mm_and_ps(k4sign, x);
+ CCTK_REAL4_VEC const signedone = _mm_or_ps(sign, vec4_set1(1.0));
+ return k4ifthen(iszero, vec4_set1(0.0), signedone);
+}
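
A scalar model of the three steps in k4sgn (sketch only; assumes IEEE-754 single precision and a 32-bit unsigned int):

    #include <cstring>
    #include <cstdio>

    static float sgn(float x)
    {
      unsigned xi, onei;
      float const one = 1.0f;
      std::memcpy(&xi, &x, sizeof xi);
      std::memcpy(&onei, &one, sizeof onei);
      unsigned const sign       = xi & 0x80000000u; // k4sign & x
      unsigned const signedonei = onei | sign;      // 1.0f carrying x's sign bit
      float signedone;
      std::memcpy(&signedone, &signedonei, sizeof signedone);
      return x == 0.0f ? 0.0f : signedone;          // k4ifthen(iszero, 0, signedone)
    }

    int main()
    {
      std::printf("%g %g %g\n", sgn(-3.5f), sgn(0.0f), sgn(2.0f)); // -1 0 1
      return 0;
    }
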
-#define k4cmpeq(x,y) (_mm_cmpeq_ps(x,y))
-#define k4cmpne(x,y) (_mm_cmpneq_ps(x,y))
-#define k4cmpgt(x,y) (_mm_cmpgt_ps(x,y))
-#define k4cmpge(x,y) (_mm_cmpge_ps(x,y))
-#define k4cmplt(x,y) (_mm_cmplt_ps(x,y))
-#define k4cmple(x,y) (_mm_cmple_ps(x,y))
+#endif
diff --git a/src/vectors-4-default.h b/src/vectors-4-default.h
index 0cd49ac..28fae04 100644
--- a/src/vectors-4-default.h
+++ b/src/vectors-4-default.h
@@ -19,6 +19,9 @@
// Number of vector elements in a vector
#define CCTK_REAL4_VEC_SIZE 1
+vec_static_assert(sizeof(CCTK_REAL4_VEC) ==
+ sizeof(CCTK_REAL4) * CCTK_REAL4_VEC_SIZE);
+
// Integer and boolean types corresponding to this real type
#define CCTK_INTEGER4 CCTK_REAL4
#define CCTK_BOOLEAN4 CCTK_REAL4
diff --git a/src/vectors-8-AVX.h b/src/vectors-8-AVX.h
index 6882523..ce43542 100644
--- a/src/vectors-8-AVX.h
+++ b/src/vectors-8-AVX.h
@@ -1,17 +1,16 @@
// Vectorise using Intel's or AMD's AVX
// Use the type __m256d directly, without introducing a wrapper class
-// Use macros instead of inline functions
-#if VECTORISE_EMULATE_AVX
-# include "avxintrin_emu.h"
-#else
-# include <immintrin.h>
-#endif
+#include <cstddef>
+#include <cstdlib>
+
+
+
+#include <immintrin.h>
#ifdef __FMA4__
-# include <fma4intrin.h>
+# include <x86intrin.h>
#endif
@@ -26,43 +25,80 @@
// Vector type corresponding to CCTK_REAL
-#define CCTK_REAL8_VEC __m256d
+typedef __m256d CCTK_REAL8_VEC;
+typedef __m256i CCTK_INTEGER8_VEC;
+typedef __m256d CCTK_BOOLEAN8_VEC;
// Number of vector elements in a CCTK_REAL_VEC
#define CCTK_REAL8_VEC_SIZE 4
+vec_static_assert(sizeof(CCTK_REAL8_VEC) ==
+ sizeof(CCTK_REAL8) * CCTK_REAL8_VEC_SIZE);
+
// Integer and boolean types corresponding to this real type
-#define CCTK_INTEGER8 CCTK_REAL8
-#define CCTK_BOOLEAN8 CCTK_REAL8
-#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC
-#define CCTK_BOOLEAN8_VEC CCTK_REAL8_VEC
+typedef CCTK_INT8 CCTK_INTEGER8;
+typedef CCTK_REAL8 CCTK_BOOLEAN8;
union k8const_t {
- unsigned long long i[4];
- double f[4];
- __m256i vi;
- __m256d vf;
+ CCTK_INTEGER8 i[CCTK_REAL8_VEC_SIZE];
+ CCTK_REAL8 f[CCTK_REAL8_VEC_SIZE];
+ CCTK_INTEGER8_VEC vi;
+ CCTK_REAL8_VEC vf;
};
-#define K8_ZERO 0x0000000000000000ULL
-#define K8_NOTZERO 0xffffffffffffffffULL
-#define K8_IMIN 0x8000000000000000ULL
-#define K8_IMAX 0x7fffffffffffffffULL
+#define k8sign (vec8_set1i( (CCTK_INTEGER8)(1ULL << 63ULL)))
+#define k8notsign (vec8_set1i(~ (CCTK_INTEGER8)(1ULL << 63ULL)))
// Create vectors, extract vector elements
-#define vec8_set1(a) (_mm256_set1_pd(a))
-#define vec8_set(a,b,c,d) (_mm256_set_pd(d,c,b,a)) // note reversed arguments
-
-#define vec8_elt0(x) (((CCTK_REAL8 const*)&(x))[0])
-#define vec8_elt1(x) (((CCTK_REAL8 const*)&(x))[1])
-#define vec8_elt2(x) (((CCTK_REAL8 const*)&(x))[2])
-#define vec8_elt3(x) (((CCTK_REAL8 const*)&(x))[3])
-#define vec8_elt(x,d) (((CCTK_REAL8 const*)&(x))[d])
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_set1(CCTK_REAL8 const a)
+{
+ return _mm256_set1_pd(a);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_set1i(CCTK_INT8 const a)
+{
+ return _mm256_castsi256_pd(_mm256_set1_epi64x(a));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_set(CCTK_REAL8 const a,
+ CCTK_REAL8 const b,
+ CCTK_REAL8 const c,
+ CCTK_REAL8 const d)
+{
+ return _mm256_set_pd(d,c,b,a); // note reversed arguments
+}
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8 vec8_elt0(CCTK_REAL8_VEC const x)
+{
+ return ((CCTK_REAL8 const*)&x)[0];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8 vec8_elt1(CCTK_REAL8_VEC const x)
+{
+ return ((CCTK_REAL8 const*)&x)[1];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8 vec8_elt2(CCTK_REAL8_VEC const x)
+{
+ return ((CCTK_REAL8 const*)&x)[2];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8 vec8_elt3(CCTK_REAL8_VEC const x)
+{
+ return ((CCTK_REAL8 const*)&x)[3];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8 vec8_elt(CCTK_REAL8_VEC const x, std::ptrdiff_t const d)
+{
+ return ((CCTK_REAL8 const*)&x)[d];
+}
@@ -70,11 +106,17 @@ union k8const_t {
// Load a vector from memory (aligned and unaligned); this loads from
// a reference to a scalar
-#define vec8_load(p) (_mm256_load_pd(&(p)))
-#define vec8_loadu(p) (_mm256_loadu_pd(&(p)))
-#if ! VECTORISE_ALWAYS_USE_ALIGNED_LOADS
-# define vec8_load_off1(p) vec_loadu(p)
-#else
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_load(CCTK_REAL8 const& p)
+{
+ return _mm256_load_pd(&p);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_loadu(CCTK_REAL8 const& p)
+{
+ return _mm256_loadu_pd(&p);
+}
+#if VECTORISE_ALWAYS_USE_ALIGNED_LOADS
# error "VECTORISE_ALWAYS_USE_ALIGNED_LOADS not yet supported"
#endif
@@ -82,244 +124,492 @@ union k8const_t {
// decided by the offset off and the vector size
#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
// Implementation: Always use unaligned load
-# define vec8_loadu_maybe(off,p) (vec8_loadu(p))
-# define vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL8 const& p)
+{
+ return vec8_loadu(p);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1,
+ std::ptrdiff_t const off2,
+ std::ptrdiff_t const off3,
+ CCTK_REAL8 const& p)
+{
+ return vec8_loadu(p);
+}
#else
-# define vec8_loadu_maybe(off,p_) \
- ({ \
- CCTK_REAL8 const& p__=(p_); \
- CCTK_REAL8 const& p=p__; \
- (off) % CCTK_REAL8_VEC_SIZE == 0 ? \
- vec8_load(p) : \
- vec8_load_off1(p); \
- })
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL8 const& p)
+{
+ return off % CCTK_REAL8_VEC_SIZE == 0 ? vec8_load(p) : vec8_loadu(p);
+}
# if VECTORISE_ALIGNED_ARRAYS
// Assume all array x sizes are multiples of the vector size
-# define vec8_loadu_maybe3(off1,off2,off3,p) \
- vec8_loadu_maybe(off1,p)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1,
+ std::ptrdiff_t const off2,
+ std::ptrdiff_t const off3,
+ CCTK_REAL8 const& p)
+{
+ return vec8_loadu_maybe(off1, p);
+}
# else
-# define vec8_loadu_maybe3(off1,off2,off3,p_) \
- ({ \
- CCTK_REAL8 const& p__=(p_); \
- CCTK_REAL8 const& p=p__; \
- ((off2) % CCTK_REAL8_VEC_SIZE != 0 or \
- (off3) % CCTK_REAL8_VEC_SIZE != 0) ? \
- vec8_loadu(p) : \
- vec8_loadu_maybe(off1,p); \
- })
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1,
+ std::ptrdiff_t const off2,
+ std::ptrdiff_t const off3,
+ CCTK_REAL8 const& p)
+{
+ return
+ off2 % CCTK_REAL8_VEC_SIZE != 0 or
+ off3 % CCTK_REAL8_VEC_SIZE != 0 ?
+ vec8_loadu(p) :
+ vec8_loadu_maybe(off1, p);
+}
# endif
#endif
// Store a vector to memory (aligned and non-temporal); this stores to
// a reference to a scalar
-#define vec8_store(p,x) (_mm256_store_pd(&(p),x))
-#define vec8_storeu(p,x) (_mm256_storeu_pd(&(p),x))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store(CCTK_REAL8& p, CCTK_REAL8_VEC const x)
+{
+ _mm256_store_pd(&p, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_storeu(CCTK_REAL8& p, CCTK_REAL8_VEC const x)
+{
+ _mm256_storeu_pd(&p, x);
+}
#if ! VECTORISE_STREAMING_STORES
-# define vec8_store_nta(p,x) (vec8_store(p,x))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta(CCTK_REAL8& p, CCTK_REAL8_VEC const x)
+{
+ vec8_store(p, x);
+}
#else
-# define vec8_store_nta(p,x) (_mm256_stream_pd(&(p),x))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta(CCTK_REAL8& p, CCTK_REAL8_VEC const x)
+{
+ _mm256_stream_pd(&p, x);
+}
#endif
// Store a partial vector (aligned and non-temporal)
-#define vec8_store_partial_prepare(i,imin_,imax_) \
+#define vec8_store_partial_prepare(i, imin,imax) \
bool v8stp_all; \
__m256i v8stp_mask; \
- ({ \
- ptrdiff_t const imin__=(imin_); \
- ptrdiff_t const imin=imin__; \
- ptrdiff_t const imax__=(imax_); \
- ptrdiff_t const imax=imax__; \
- \
- v8stp_all = i>=imin and i+CCTK_REAL_VEC_SIZE-1<imax; \
- \
- if (not CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \
- /* \
- __m256i const v8stp_mask = \
- _mm256_andnot_pd(_mm256_add_epi64(_mm256_set1_epi64x(i-imin), \
- vec_index), \
- _mm256_add_epi64(_mm256_set1_epi64x(i-imax), \
- vec_index)); \
- */ \
- __m128i const termlo0 = \
- _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(1, 0)); \
- __m128i const termup0 = \
- _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(1, 0)); \
- __m128i const term0 = _mm_andnot_si128(termlo0, termup0); \
- __m128i const termlo1 = \
- _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(3, 2)); \
- __m128i const termup1 = \
- _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(3, 2)); \
- __m128i const term1 = _mm_andnot_si128(termlo1, termup1); \
- v8stp_mask = \
- _mm256_insertf128_si256(_mm256_castsi128_si256(term0), term1, 1); \
- } \
- })
-
-#define vec8_store_nta_partial(p,x) \
- ({ \
- if (CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \
- vec8_store_nta(p,x); \
- } else { \
- _mm256_maskstore_pd(&p,v8stp_mask,x); \
- } \
- })
+ vec8_store_partial_prepare_(v8stp_all, v8stp_mask, i, imin, imax)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_partial_prepare_(bool& all, __m256i& mask,
+ std::ptrdiff_t const i,
+ std::ptrdiff_t const imin,
+ std::ptrdiff_t const imax)
+{
+ all = i>=imin and i+CCTK_REAL8_VEC_SIZE-1<imax;
+
+ if (not CCTK_BUILTIN_EXPECT(all, true)) {
+ /* __m256i const v8stp_mask = */
+ /* _mm256_andnot_pd(_mm256_add_epi64(_mm256_set1_epi64x(i-imin), */
+ /* vec_index), */
+ /* _mm256_add_epi64(_mm256_set1_epi64x(i-imax), */
+ /* vec_index)); */
+ __m128i const termlo01 =
+ _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(1, 0));
+ __m128i const termup01 =
+ _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(1, 0));
+ __m128i const term01 = _mm_andnot_si128(termlo01, termup01);
+ __m128i const termlo23 =
+ _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(3, 2));
+ __m128i const termup23 =
+ _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(3, 2));
+ __m128i const term23 = _mm_andnot_si128(termlo23, termup23);
+ mask = _mm256_insertf128_si256(_mm256_castsi128_si256(term01), term23, 1);
+ }
+}
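
The per-lane mask relies on _mm256_maskstore_pd testing only the sign bit of each 64-bit lane. A scalar sketch with example values i=4, imin=5, imax=7 (arbitrary) of why ~((i+l)-imin) & ((i+l)-imax) has its sign bit set exactly when imin <= i+l < imax:

    #include <cstdio>

    int main()
    {
      long long const i = 4, imin = 5, imax = 7;
      for (long long l = 0; l < 4; ++l) {
        long long const termlo = i - imin + l;  // negative iff i+l < imin
        long long const termup = i - imax + l;  // negative iff i+l < imax
        long long const term   = ~termlo & termup;
        std::printf("lane %lld: store=%d\n", l, (int)(term < 0)); // 0 1 1 0
      }
      return 0;
    }
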
+
+#define vec8_store_nta_partial(p, x) \
+ vec8_store_nta_partial_(v8stp_all, v8stp_mask, p, x)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_(bool const all, __m256i const mask,
+ CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x)
+{
+ if (CCTK_BUILTIN_EXPECT(all, true)) {
+ vec8_store_nta(p, x);
+ } else {
+ _mm256_maskstore_pd(&p, mask, x);
+ }
+}
// Store a lower or higher partial vector (aligned and non-temporal);
// the non-temporal hint is probably ignored
// Masks indicating which vector element should be stored:
-static const k8const_t k8store_lo[5] =
+/*static*/ k8const_t const k8store_lo[5] =
{
- {{ K8_ZERO , K8_ZERO , K8_ZERO , K8_ZERO , }},
- {{ K8_NOTZERO, K8_ZERO , K8_ZERO , K8_ZERO , }},
- {{ K8_NOTZERO, K8_NOTZERO, K8_ZERO , K8_ZERO , }},
- {{ K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, K8_ZERO , }},
- {{ K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, }},
+ { i: { 0, 0, 0, 0, }},
+ { i: { ~0, 0, 0, 0, }},
+ { i: { ~0, ~0, 0, 0, }},
+ { i: { ~0, ~0, ~0, 0, }},
+ { i: { ~0, ~0, ~0, ~0, }},
};
-static const k8const_t k8store_hi[5] =
+/*static*/ k8const_t const k8store_hi[5] =
{
- {{ K8_ZERO , K8_ZERO , K8_ZERO , K8_ZERO , }},
- {{ K8_ZERO , K8_ZERO , K8_ZERO , K8_NOTZERO, }},
- {{ K8_ZERO , K8_ZERO , K8_NOTZERO, K8_NOTZERO, }},
- {{ K8_ZERO , K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, }},
- {{ K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, }},
+ { i: { 0, 0, 0, 0, }},
+ { i: { 0, 0, 0, ~0, }},
+ { i: { 0, 0, ~0, ~0, }},
+ { i: { 0, ~0, ~0, ~0, }},
+ { i: { ~0, ~0, ~0, ~0, }},
};
#if !defined(__INTEL_COMPILER) && defined(__GNUC__) && __GNUC__==4 && __GNUC_MINOR__<=4
// gcc 4.4 uses a wrong prototype for _mm256_maskstore_pd
-# define vec8_store_nta_partial_lo(p,x,n) \
- (_mm256_maskstore_pd(&(p),_mm256_castsi256_pd(k8store_lo[n].vi),x))
-# define vec8_store_nta_partial_hi(p,x,n) \
- (_mm256_maskstore_pd(&(p),_mm256_castsi256_pd(k8store_hi[n].vi),x))
-# define vec8_store_nta_partial_mid(p,x,nlo,nhi) \
- (_mm256_maskstore_pd \
- (&(p), \
- _mm256_castsi256_pd(k8store_lo[nlo].vi & k8store_hi[nhi].vi), \
- x))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_lo(CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x,
+ ptrdiff_t const n)
+{
+ _mm256_maskstore_pd(&p, _mm256_castsi256_pd(k8store_lo[n].vi), x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_hi(CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x,
+ ptrdiff_t const n)
+{
+ _mm256_maskstore_pd(&p, _mm256_castsi256_pd(k8store_hi[n].vi), x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_mid(CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x,
+ ptrdiff_t const nlo,
+ ptrdiff_t const nhi)
+{
+ _mm256_maskstore_pd
+ (&p,
+ _mm256_castsi256_pd(k8store_lo[nlo].vi & k8store_hi[nhi].vi),
+ x);
+}
#else
-# define vec8_store_nta_partial_lo(p,x,n) \
- (_mm256_maskstore_pd(&(p),k8store_lo[n].vi,x))
-# define vec8_store_nta_partial_hi(p,x,n) \
- (_mm256_maskstore_pd(&(p),k8store_hi[n].vi,x))
-# define vec8_store_nta_partial_mid(p,x,nlo,nhi) \
- (_mm256_maskstore_pd \
- (&(p), \
- _mm256_castpd_si256(_mm256_and_pd(k8store_lo[nlo].vf, \
- k8store_hi[nhi].vf)), \
- x))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_lo(CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x,
+ ptrdiff_t const n)
+{
+ _mm256_maskstore_pd(&p, k8store_lo[n].vi, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_hi(CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x,
+ ptrdiff_t const n)
+{
+ _mm256_maskstore_pd(&p, k8store_hi[n].vi, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_mid(CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x,
+ ptrdiff_t const nlo,
+ ptrdiff_t const nhi)
+{
+ _mm256_maskstore_pd
+ (&p,
+ _mm256_castpd_si256(_mm256_and_pd(k8store_lo[nlo].vf, k8store_hi[nhi].vf)),
+ x);
+}
#endif
// Functions and operators
-static const k8const_t k8sign_mask = {{ K8_IMIN, K8_IMIN, K8_IMIN, K8_IMIN, }};
-
// Operators
-#define k8neg(x) (_mm256_xor_pd(x,k8sign_mask.vf))
-
-#define k8add(x,y) (_mm256_add_pd(x,y))
-#define k8sub(x,y) (_mm256_sub_pd(x,y))
-#define k8mul(x,y) (_mm256_mul_pd(x,y))
-#define k8div(x,y) (_mm256_div_pd(x,y))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8neg(CCTK_REAL8_VEC const x)
+{
+ return _mm256_xor_pd(k8sign, x);
+}
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8add(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_add_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8sub(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_sub_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8mul(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_mul_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8div(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_div_pd(x, y);
+}
// Fused multiply-add, defined as [+-]x*y[+-]z
#ifdef __FMA4__
-# define k8madd(x,y,z) (_mm256_macc_pd(x,y,z))
-# define k8msub(x,y,z) (_mm256_msub_pd(x,y,z))
-# define k8nmadd(x,y,z) (_mm256_nmsub_pd(x,y,z))
-# define k8nmsub(x,y,z) (_mm256_nmacc_pd(x,y,z))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8madd(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return _mm256_macc_pd(x, y, z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8msub(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return _mm256_msub_pd(x, y, z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8nmadd(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return _mm256_nmsub_pd(x, y, z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8nmsub(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return _mm256_nmacc_pd(x, y, z);
+}
#else
-# define k8madd(x,y,z) (k8add(k8mul(x,y),z))
-# define k8msub(x,y,z) (k8sub(k8mul(x,y),z))
-# define k8nmadd(x,y,z) (k8sub(k8neg(z),k8mul(x,y)))
-# define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y)))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8madd(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return k8add(k8mul(x, y), z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8msub(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return k8sub(k8mul(x, y), z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8nmadd(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return k8sub(k8neg(z), k8mul(x, y));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8nmsub(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return k8sub(z, k8mul(x, y));
+}
#endif
// Cheap functions
-#define k8copysign(x,y) \
- (_mm256_or_pd(_mm256_andnot_pd(k8sign_mask.vf,x), \
- _mm256_and_pd(k8sign_mask.vf,y)))
-#define k8fabs(x) (_mm256_andnot_pd(k8sign_mask.vf,x))
-#define k8fmax(x,y) (_mm256_max_pd(x,y))
-#define k8fmin(x,y) (_mm256_min_pd(x,y))
-#define k8fnabs(x) (_mm256_or_pd(x,k8sign_mask.vf))
-static const k8const_t k8zero = { f: { 0.0, 0.0, 0.0, 0.0, }};
-static const k8const_t k8one = { f: { 1.0, 1.0, 1.0, 1.0, }};
-#define k8sgn(x_) \
- ({ \
- CCTK_REAL_VEC x__=(x_); \
- CCTK_REAL_VEC x=x__; \
- CCTK_REAL_VEC iszero = _mm256_cmp_pd(x, k8zero.vf, _CMP_EQ_OQ); \
- CCTK_REAL_VEC sign = _mm256_and_pd(k8sign_mask.vf, x); \
- CCTK_REAL_VEC signedone = _mm256_or_pd(sign, k8one.vf); \
- k8ifthen(iszero, k8zero.vf, signedone); \
- })
-#define k8sqrt(x) (_mm256_sqrt_pd(x))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8copysign(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_or_pd(_mm256_and_pd(k8notsign, x),
+ _mm256_and_pd(k8sign , y));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8fabs(CCTK_REAL8_VEC const x)
+{
+ return _mm256_and_pd(k8notsign, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8fmax(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_max_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8fmin(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_min_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8fnabs(CCTK_REAL8_VEC const x)
+{
+ return _mm256_or_pd(k8sign, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8sqrt(CCTK_REAL8_VEC const x)
+{
+ return _mm256_sqrt_pd(x);
+}
// Expensive functions
-#define K8REPL(f,x_) \
- ({ \
- CCTK_REAL8_VEC const x__=(x_); \
- CCTK_REAL8_VEC const x=x__; \
- vec8_set(f(vec8_elt0(x)), \
- f(vec8_elt1(x)), \
- f(vec8_elt2(x)), \
- f(vec8_elt3(x))); \
- })
-#define K8REPL2S(f,x_,a_) \
- ({ \
- CCTK_REAL8_VEC const x__=(x_); \
- CCTK_REAL8 const a__=(a_); \
- CCTK_REAL8_VEC const x=x__; \
- CCTK_REAL8 const a=a__; \
- vec8_set(f(vec8_elt0(x),a), \
- f(vec8_elt1(x),a), \
- f(vec8_elt2(x),a), \
- f(vec8_elt3(x),a)); \
- })
-#define K8REPL2(f,x_,y_) \
- ({ \
- CCTK_REAL8_VEC const x__=(x_); \
- CCTK_REAL8_VEC const y__=(y_); \
- CCTK_REAL8_VEC const x=x__; \
- CCTK_REAL8_VEC const y=y__; \
- vec8_set(f(vec8_elt0(x),vec8_elt0(y)), \
- f(vec8_elt1(x),vec8_elt1(y)), \
- f(vec8_elt2(x),vec8_elt2(y)), \
- f(vec8_elt3(x),vec8_elt3(y))); \
- })
-
-#define k8acos(x) K8REPL(acos,x)
-#define k8acosh(x) K8REPL(acosh,x)
-#define k8asin(x) K8REPL(asin,x)
-#define k8asinh(x) K8REPL(asinh,x)
-#define k8atan(x) K8REPL(atan,x)
-#define k8atan2(x,y) K8REPL2(atan2,x,y)
-#define k8atanh(x) K8REPL(atanh,x)
-#define k8cos(x) K8REPL(cos,x)
-#define k8cosh(x) K8REPL(cosh,x)
-#define k8exp(x) K8REPL(exp,x)
-#define k8log(x) K8REPL(log,x)
-#define k8pow(x,a) K8REPL2S(pow,x,a)
-#define k8sin(x) K8REPL(sin,x)
-#define k8sinh(x) K8REPL(sinh,x)
-#define k8tan(x) K8REPL(tan,x)
-#define k8tanh(x) K8REPL(tanh,x)
-
-static const k8const_t k8lfalse_ =
- {{ K8_ZERO, K8_ZERO, K8_ZERO, K8_ZERO, }};
-static const k8const_t k8ltrue_ =
- {{ K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, }};
-#define k8lfalse (k8lfalse_.vf)
-#define k8ltrue (k8ltrue_.vf)
-#define k8lnot(x) (_mm256_xor_pd(k8ltrue,x))
-#define k8land(x,y) (_mm256_and_pd(x,y))
-#define k8lor(x,y) (_mm256_or_pd(x,y))
-#define k8lxor(x,y) (_mm256_xor_pd(x,y))
-#define k8ifthen(x,y,z) (_mm256_blendv_pd(z,y,x))
-
-#define k8cmpeq(x,y) (_mm256_cmp_pd(x,y,_CMP_EQ_OQ))
-#define k8cmpne(x,y) (_mm256_cmp_pd(x,y,_CMP_NEQ_OQ))
-#define k8cmpgt(x,y) (_mm256_cmp_pd(x,y,_CMP_GT_OQ))
-#define k8cmpge(x,y) (_mm256_cmp_pd(x,y,_CMP_GE_OQ))
-#define k8cmplt(x,y) (_mm256_cmp_pd(x,y,_CMP_LT_OQ))
-#define k8cmple(x,y) (_mm256_cmp_pd(x,y,_CMP_LE_OQ))
+#define K8REPL(f,x) \
+ vec8_set(f(vec8_elt0(x)), \
+ f(vec8_elt1(x)), \
+ f(vec8_elt2(x)), \
+           f(vec8_elt3(x)))
+#define K8REPL2S(f,x,a) \
+ vec8_set(f(vec8_elt0(x),a), \
+ f(vec8_elt1(x),a), \
+ f(vec8_elt2(x),a), \
+           f(vec8_elt3(x),a))
+#define K8REPL2(f,x,y) \
+ vec8_set(f(vec8_elt0(x),vec8_elt0(y)), \
+ f(vec8_elt1(x),vec8_elt1(y)), \
+ f(vec8_elt2(x),vec8_elt2(y)), \
+           f(vec8_elt3(x),vec8_elt3(y)))
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8acos(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(acos,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8acosh(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(acosh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8asin(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(asin,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8asinh(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(asinh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8atan(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(atan,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8atan2(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return K8REPL2(atan2,x,y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8atanh(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(atanh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8cos(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(cos,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8cosh(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(cosh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8exp(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(exp,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8log(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(log,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8pow(CCTK_REAL8_VEC const x, CCTK_REAL8 const a)
+{
+ return K8REPL2S(pow,x,a);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8sin(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(sin,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8sinh(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(sinh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8tan(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(tan,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8tanh(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(tanh,x);
+}
+
+
+
+#define k8lfalse (vec8_set1i( 0))
+#define k8ltrue (vec8_set1i(~0))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8lnot(CCTK_BOOLEAN8_VEC const x)
+{
+ return _mm256_xor_pd(k8ltrue, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8land(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y)
+{
+ return _mm256_and_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8lor(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y)
+{
+ return _mm256_or_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8lxor(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y)
+{
+ return _mm256_xor_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8ifthen(CCTK_BOOLEAN8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return _mm256_blendv_pd(z, y, x);
+}
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8cmpeq(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_cmp_pd(x, y, _CMP_EQ_OQ);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8cmpne(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_cmp_pd(x, y, _CMP_NEQ_OQ);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8cmpgt(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_cmp_pd(x, y, _CMP_GT_OQ);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8cmpge(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_cmp_pd(x, y, _CMP_GE_OQ);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8cmplt(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_cmp_pd(x, y, _CMP_LT_OQ);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8cmple(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_cmp_pd(x, y, _CMP_LE_OQ);
+}
+
+
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8sgn(CCTK_REAL8_VEC const x)
+{
+ CCTK_BOOLEAN8_VEC const iszero = k8cmpeq(x, vec8_set1(0.0));
+ CCTK_REAL8_VEC const sign = _mm256_and_pd(k8sign, x);
+ CCTK_REAL8_VEC const signedone = _mm256_or_pd(sign, vec8_set1(1.0));
+ return k8ifthen(iszero, vec8_set1(0.0), signedone);
+}
diff --git a/src/vectors-8-DoubleHummer.h b/src/vectors-8-DoubleHummer.h
index 0189989..7b9c50d 100644
--- a/src/vectors-8-DoubleHummer.h
+++ b/src/vectors-8-DoubleHummer.h
@@ -24,9 +24,9 @@
#define CCTK_REAL8_VEC_SIZE 2
// Integer and boolean types corresponding to this real type
-#define CCTK_INTEGER8 CCTK_REAL8
+//#define CCTK_INTEGER8 CCTK_REAL8
#define CCTK_BOOLEAN8 CCTK_REAL8
-#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC
+//#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC
#define CCTK_BOOLEAN8_VEC CCTK_REAL8_VEC
diff --git a/src/vectors-8-MIC.h b/src/vectors-8-MIC.h
new file mode 100644
index 0000000..3f85119
--- /dev/null
+++ b/src/vectors-8-MIC.h
@@ -0,0 +1,652 @@
+// Vectorise using Intel's MIC
+
+// Use the type __m512d directly, without introducing a wrapper class
+
+
+
+#include <cstddef>
+#include <cstdlib>
+
+#include <immintrin.h>
+
+
+
+#define vec8_architecture "MIC (64-bit precision)"
+
+
+
+// Vector type corresponding to CCTK_REAL
+typedef __m512d CCTK_REAL8_VEC;
+typedef __m512i CCTK_INTEGER8_VEC;
+typedef __mmask8 CCTK_BOOLEAN8_VEC;
+
+// Number of vector elements in a CCTK_REAL_VEC
+#define CCTK_REAL8_VEC_SIZE 8
+
+vec_static_assert(sizeof(CCTK_REAL8_VEC) ==
+ sizeof(CCTK_REAL8) * CCTK_REAL8_VEC_SIZE);
+
+// Integer and boolean types corresponding to this real type
+typedef CCTK_INT8 CCTK_INTEGER8;
+typedef bool CCTK_BOOLEAN8;
+
+
+
+union k8const_t {
+ CCTK_INTEGER8 i[CCTK_REAL8_VEC_SIZE];
+ CCTK_REAL8 f[CCTK_REAL8_VEC_SIZE];
+ CCTK_INTEGER8_VEC vi;
+ CCTK_REAL8_VEC vf;
+};
+
+#define k8sign (vec8i_set1i( (CCTK_INTEGER8)(1ULL << 63ULL)))
+#define k8notsign (vec8i_set1i(~ (CCTK_INTEGER8)(1ULL << 63ULL)))
+
+
+
+// Create vectors, extract vector elements
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_set1(CCTK_REAL8 const a)
+{
+ return _mm512_set1_pd(a);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_INTEGER8_VEC vec8i_set1i(CCTK_INT8 const a)
+{
+ return _mm512_set1_epi64(a);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_set(CCTK_REAL8 const a0,
+ CCTK_REAL8 const a1,
+ CCTK_REAL8 const a2,
+ CCTK_REAL8 const a3,
+ CCTK_REAL8 const a4,
+ CCTK_REAL8 const a5,
+ CCTK_REAL8 const a6,
+ CCTK_REAL8 const a7)
+{
+ return _mm512_set_pd(a7,a6,a5,a4,a3,a2,a1,a0); // note reversed arguments
+}
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8 vec8_elt0(CCTK_REAL8_VEC const x)
+{
+ return ((CCTK_REAL8 const*)&x)[0];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8 vec8_elt1(CCTK_REAL8_VEC const x)
+{
+ return ((CCTK_REAL8 const*)&x)[1];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8 vec8_elt2(CCTK_REAL8_VEC const x)
+{
+ return ((CCTK_REAL8 const*)&x)[2];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8 vec8_elt3(CCTK_REAL8_VEC const x)
+{
+ return ((CCTK_REAL8 const*)&x)[3];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8 vec8_elt4(CCTK_REAL8_VEC const x)
+{
+ return ((CCTK_REAL8 const*)&x)[4];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8 vec8_elt5(CCTK_REAL8_VEC const x)
+{
+ return ((CCTK_REAL8 const*)&x)[5];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8 vec8_elt6(CCTK_REAL8_VEC const x)
+{
+ return ((CCTK_REAL8 const*)&x)[6];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8 vec8_elt7(CCTK_REAL8_VEC const x)
+{
+ return ((CCTK_REAL8 const*)&x)[7];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8 vec8_elt(CCTK_REAL8_VEC const x, std::ptrdiff_t const d)
+{
+ return ((CCTK_REAL8 const*)&x)[d];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8 vec8_elt(CCTK_BOOLEAN8_VEC const x, std::ptrdiff_t const d)
+{
+ return _mm512_mask2int(x) & (1 << d);
+}
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_load(CCTK_REAL8 const& p)
+{
+ return _mm512_load_pd(&p);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_loadu(CCTK_REAL8 const& p)
+{
+ CCTK_REAL8_VEC x = _mm512_undefined_pd();
+ x = _mm512_loadunpacklo_pd(x, &p);
+ x = _mm512_loadunpackhi_pd(x, &p+8);
+ return x;
+}
+#if VECTORISE_ALWAYS_USE_ALIGNED_LOADS
+# error "VECTORISE_ALWAYS_USE_ALIGNED_LOADS is not yet supported"
+#endif
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
+// Implementation: Always use unaligned load
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL8 const& p)
+{
+ return vec8_loadu(p);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1,
+ std::ptrdiff_t const off2,
+ std::ptrdiff_t const off3,
+ CCTK_REAL8 const& p)
+{
+ return vec8_loadu(p);
+}
+#else
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL8 const& p)
+{
+ return off % CCTK_REAL8_VEC_SIZE == 0 ? vec8_load(p) : vec8_loadu(p);
+}
+# if VECTORISE_ALIGNED_ARRAYS
+// Assume all array x sizes are multiples of the vector size
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1,
+ std::ptrdiff_t const off2,
+ std::ptrdiff_t const off3,
+ CCTK_REAL8 const& p)
+{
+ return vec8_loadu_maybe(off1, p);
+}
+# else
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1,
+ std::ptrdiff_t const off2,
+ std::ptrdiff_t const off3,
+ CCTK_REAL8 const& p)
+{
+ return
+ off2 % CCTK_REAL8_VEC_SIZE != 0 or
+ off3 % CCTK_REAL8_VEC_SIZE != 0 ?
+ vec8_loadu(p) :
+ vec8_loadu_maybe(off1, p);
+}
+# endif
+#endif
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store(CCTK_REAL8& p, CCTK_REAL8_VEC const x)
+{
+ _mm512_store_pd(&p, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_storeu(CCTK_REAL8& p, CCTK_REAL8_VEC const x)
+{
+ _mm512_packstorelo_pd(&p , x);
+ _mm512_packstorehi_pd(&p+8, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta(CCTK_REAL8& p, CCTK_REAL8_VEC const x)
+{
+#if VECTORISE_STREAMING_STORES
+ _mm512_extstore_pd(&p, x, _MM_DOWNCONV_PD_NONE, _MM_HINT_NT);
+#else
+ _mm512_store_pd(&p, x);
+#endif
+}
+
+// Store a partial vector (aligned and non-temporal)
+#define vec8_store_partial_prepare(i, imin,imax) \
+ __mmask8 v8stp_mask; \
+ vec8_store_partial_prepare_(v8stp_mask, i, imin, imax)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_partial_prepare_(__mmask8& mask,
+ std::ptrdiff_t const i,
+ std::ptrdiff_t const imin,
+ std::ptrdiff_t const imax)
+{
+ unsigned char m = 255;
+ if (i < imin) {
+ /* clear lower imin-i bits */
+ m &= 255 << (imin-i);
+ }
+ if (i+CCTK_REAL8_VEC_SIZE > imax) {
+ /* clear upper i+CCTK_REAL8_VEC_SIZE-imax bits */
+ m &= 255 >> (i+CCTK_REAL8_VEC_SIZE-imax);
+ }
+ mask = _mm512_int2mask(m);
+}
+
+#define vec8_store_nta_partial(p, x) \
+ vec8_store_nta_partial_(v8stp_mask, p, x)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_(__mmask8 const mask,
+ CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x)
+{
+ _mm512_mask_store_pd(&p, mask, x);
+}
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_lo(CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x,
+ ptrdiff_t const n)
+{
+ _mm512_mask_store_pd(&p, _mm512_int2mask(255 >> (8-n)), x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_hi(CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x,
+ ptrdiff_t const n)
+{
+ _mm512_mask_store_pd(&p, _mm512_int2mask(255 << (8-n)), x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_mid(CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x,
+ ptrdiff_t const nlo,
+ ptrdiff_t const nhi)
+{
+ _mm512_mask_store_pd
+ (&p, _mm512_int2mask((255 >> (8-nlo)) & (255 << (8-nhi))), x);
+}
+
+
+
+// Functions and operators
+
+// Operators
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8neg(CCTK_REAL8_VEC const x)
+{
+ // Could also multiply by -1
+ // Could also invert sign bit
+ return _mm512_sub_pd(_mm512_set1_pd(0.0), x);
+}
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8add(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm512_add_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8sub(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm512_sub_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8mul(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm512_mul_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8div(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm512_div_pd(x, y);
+}
+
+// Fused multiply-add, defined as [+-]x*y[+-]z
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8madd(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return _mm512_fmadd_pd(x, y, z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8msub(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return _mm512_fmsub_pd(x, y, z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8nmadd(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return _mm512_fnmsub_pd(x, y, z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8nmsub(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return _mm512_fnmadd_pd(x, y, z);
+}
+
+// Cheap functions
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8copysign(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ CCTK_INTEGER8_VEC ix = _mm512_castpd_si512(x);
+ CCTK_INTEGER8_VEC iy = _mm512_castpd_si512(y);
+ CCTK_INTEGER8_VEC ir = _mm512_or_epi64(_mm512_and_epi64(k8notsign, ix),
+ _mm512_and_epi64(k8sign , iy));
+ return _mm512_castsi512_pd(ir);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8fabs(CCTK_REAL8_VEC const x)
+{
+ // Could also do k8fmax(x, k8neg(x))
+ CCTK_INTEGER8_VEC ix = _mm512_castpd_si512(x);
+ CCTK_INTEGER8_VEC ir = _mm512_and_epi64(k8notsign, ix);
+ return _mm512_castsi512_pd(ir);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8fmax(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm512_gmax_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8fmin(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm512_gmin_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8fnabs(CCTK_REAL8_VEC const x)
+{
+ // Could also do k8fmin(x, k8neg(x))
+ CCTK_INTEGER8_VEC ix = _mm512_castpd_si512(x);
+ CCTK_INTEGER8_VEC ir = _mm512_or_epi64(k8sign, ix);
+ return _mm512_castsi512_pd(ir);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8sqrt(CCTK_REAL8_VEC const x)
+{
+ return _mm512_sqrt_pd(x);
+}
+
+// Expensive functions
+
+#if 0
+// These implementations lead to an ICE with icpc 13.0.1
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8acos(CCTK_REAL8_VEC const x)
+{
+ return _mm512_acos_pd(x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8acosh(CCTK_REAL8_VEC const x)
+{
+ return _mm512_acosh_pd(x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8asin(CCTK_REAL8_VEC const x)
+{
+ return _mm512_asin_pd(x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8asinh(CCTK_REAL8_VEC const x)
+{
+ return _mm512_asinh_pd(x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8atan(CCTK_REAL8_VEC const x)
+{
+ return _mm512_atan_pd(x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8atan2(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm512_atan2_pd(x,y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8atanh(CCTK_REAL8_VEC const x)
+{
+ return _mm512_atanh_pd(x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8cos(CCTK_REAL8_VEC const x)
+{
+ return _mm512_cos_pd(x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8cosh(CCTK_REAL8_VEC const x)
+{
+ return _mm512_cosh_pd(x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8exp(CCTK_REAL8_VEC const x)
+{
+ return _mm512_exp_pd(x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8log(CCTK_REAL8_VEC const x)
+{
+ return _mm512_log_pd(x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8pow(CCTK_REAL8_VEC const x, CCTK_REAL8 const a)
+{
+ return _mm512_pow_pd(x, _mm512_set1_pd(a));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8sin(CCTK_REAL8_VEC const x)
+{
+ return _mm512_sin_pd(x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8sinh(CCTK_REAL8_VEC const x)
+{
+ return _mm512_sinh_pd(x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8tan(CCTK_REAL8_VEC const x)
+{
+ return _mm512_tan_pd(x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8tanh(CCTK_REAL8_VEC const x)
+{
+ return _mm512_tanh_pd(x);
+}
+
+#else
+
+// These implementations are very expensive
+#define K8REPL(f,x) \
+ vec8_set(f(vec8_elt0(x)), \
+ f(vec8_elt1(x)), \
+ f(vec8_elt2(x)), \
+ f(vec8_elt3(x)), \
+ f(vec8_elt4(x)), \
+ f(vec8_elt5(x)), \
+ f(vec8_elt6(x)), \
+ f(vec8_elt7(x)));
+#define K8REPL2S(f,x,a) \
+ vec8_set(f(vec8_elt0(x),a), \
+ f(vec8_elt1(x),a), \
+ f(vec8_elt2(x),a), \
+ f(vec8_elt3(x),a), \
+ f(vec8_elt4(x),a), \
+ f(vec8_elt5(x),a), \
+ f(vec8_elt6(x),a), \
+ f(vec8_elt7(x),a));
+#define K8REPL2(f,x,y) \
+ vec8_set(f(vec8_elt0(x),vec8_elt0(y)), \
+ f(vec8_elt1(x),vec8_elt1(y)), \
+ f(vec8_elt2(x),vec8_elt2(y)), \
+ f(vec8_elt3(x),vec8_elt3(y)), \
+ f(vec8_elt4(x),vec8_elt4(y)), \
+ f(vec8_elt5(x),vec8_elt5(y)), \
+ f(vec8_elt6(x),vec8_elt6(y)), \
+ f(vec8_elt7(x),vec8_elt7(y)));
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8acos(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(acos,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8acosh(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(acosh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8asin(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(asin,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8asinh(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(asinh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8atan(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(atan,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8atan2(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return K8REPL2(atan2,x,y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8atanh(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(atanh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8cos(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(cos,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8cosh(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(cosh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8exp(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(exp,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8log(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(log,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8pow(CCTK_REAL8_VEC const x, CCTK_REAL8 const a)
+{
+ return K8REPL2S(pow,x,a);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8sin(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(sin,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8sinh(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(sinh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8tan(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(tan,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8tanh(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(tanh,x);
+}
+
+#endif
+
+
+
+#define k8lfalse (_mm512_int2mask( 0))
+#define k8ltrue (_mm512_int2mask(~0))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8lnot(CCTK_BOOLEAN8_VEC const x)
+{
+ return _mm512_knot(x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8land(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y)
+{
+ return _mm512_kand(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8lor(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y)
+{
+ return _mm512_kor(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8lxor(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y)
+{
+ return _mm512_kxor(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8ifthen(CCTK_BOOLEAN8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ // This leads to an ICE
+ // return _mm512_mask_blend_pd(x, z, y);
+ return _mm512_mask_mov_pd(z, x, y);
+}
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8cmpeq(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm512_cmpeq_pd_mask(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8cmpne(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm512_cmpneq_pd_mask(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8cmpgt(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm512_cmpnle_pd_mask(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8cmpge(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm512_cmpnlt_pd_mask(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8cmplt(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm512_cmplt_pd_mask(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8cmple(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm512_cmple_pd_mask(x, y);
+}
+
+
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8sgn(CCTK_REAL8_VEC const x)
+{
+ CCTK_BOOLEAN8_VEC const iszero = k8cmpeq(x, vec8_set1(0.0));
+ CCTK_BOOLEAN8_VEC const isneg = k8cmplt(x, vec8_set1(0.0));
+ return k8ifthen(iszero, vec8_set1(0.0),
+ k8ifthen(isneg, vec8_set1(-1.0), vec8_set1(+1.0)));
+}
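[Editorial note, not part of the patch] The MIC partial-store interface above is
meant to be driven from a loop that walks aligned, vector-sized chunks covering an
interval [imin,imax), with the elements outside the interval masked off. A minimal
usage sketch, assuming 64-byte-aligned CCTK_REAL8 arrays a and b and nonnegative
bounds imin, imax (these names are illustrative, not taken from the patch):

    // Sketch only: square a[] into b[] over [imin,imax) using masked stores.
    for (std::ptrdiff_t i = imin - imin % CCTK_REAL8_VEC_SIZE;
         i < imax; i += CCTK_REAL8_VEC_SIZE) {
      vec8_store_partial_prepare(i, imin, imax);  // sets the __mmask8 v8stp_mask
      CCTK_REAL8_VEC const x = vec8_load(a[i]);   // aligned 512-bit load
      vec8_store_nta_partial(b[i], k8mul(x, x));  // masked, possibly streaming store
    }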
diff --git a/src/vectors-8-QPX.h b/src/vectors-8-QPX.h
index 631c974..7639476 100644
--- a/src/vectors-8-QPX.h
+++ b/src/vectors-8-QPX.h
@@ -45,11 +45,16 @@ struct CCTK_REAL8_VEC {
// Create vectors, extract vector elements
#define vec8_set1(a) (vec_splats(a))
-#define vec8_set(a,b,c,d) ((vector4double){a,b,c,d})
+#define vec8_set(a,b,c,d) \
+ (vec_insert \
+ (d,vec_insert \
+ (c,vec_insert \
+ (b,vec_insert \
+ (a,CCTK_REAL8_VEC(),0),1),2),3))
#define vec8_b2r(b) ((b)?+1.0:-1.0)
-#define vec8b_set(a,b,c,d) \
- ((vector4double){vec8_b2r(a),vec8_b2r(b),vec8_b2r(c),vec8_b2r(d)})
+#define vec8b_set(a,b,c,d) \
+ (vec8_set(vec8_b2r(a),vec8_b2r(b),vec8_b2r(c),vec8_b2r(d)))
#define vec8_elt0(x) (vec_extract(x,0))
#define vec8_elt1(x) (vec_extract(x,1))
@@ -351,8 +356,8 @@ struct CCTK_REAL8_VEC {
#define k8ifthen(x,y,z) (vec_sel(z,y,x))
#define k8cmpeq(x,y) (vec_cmpeq(x,y))
-#define k8cmpne(x,y) (k8lnot(vec_cmpeq(x,y)))
+#define k8cmpne(x,y) (k8lnot(k8cmpeq(x,y)))
#define k8cmpgt(x,y) (vec_cmpgt(x,y))
-#define k8cmpge(x,y) (k8lnot(vec_cmplt(x,y)))
+#define k8cmpge(x,y) (k8lnot(k8cmplt(x,y)))
#define k8cmplt(x,y) (vec_cmplt(x,y))
-#define k8cmple(x,y) (vec_not(vec_cmpgt(x,y)))
+#define k8cmple(x,y) (k8lnot(k8cmpgt(x,y)))
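[Editorial note, not part of the patch] The QPX comparisons above obtain k8cmpne,
k8cmpge and k8cmple by negating their counterparts. This relies on identities such
as (x >= y) == !(x < y), which hold for ordinary values but not when an operand is
NaN (then !(x < y) is true while (x >= y) is false). A scalar sketch of the
assumption, in standalone C++:

    #include <cassert>
    #include <cmath>
    // The negated forms agree with the direct comparisons only for non-NaN inputs.
    void check_cmp_identities(double x, double y) {
      if (!std::isnan(x) && !std::isnan(y)) {
        assert((x != y) == !(x == y));
        assert((x >= y) == !(x <  y));
        assert((x <= y) == !(x >  y));
      }
    }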
diff --git a/src/vectors-8-SSE2.h b/src/vectors-8-SSE2.h
index 6dfe89f..2326e49 100644
--- a/src/vectors-8-SSE2.h
+++ b/src/vectors-8-SSE2.h
@@ -1,12 +1,18 @@
-
+// Vectorise using Intel's or AMD's SSE2
// Use the type __m128d directly, without introducing a wrapper class
-// Use macros instead of inline functions
+
+#ifdef __PGI
+// PGI doesn't want to inline functions
+# include "macros/vectors-8-SSE2.h"
+#else
+
+#include <cassert>
+#include <cmath>
+
-#include <assert.h>
-#include <math.h>
#include <emmintrin.h>
#ifdef __SSE4_1__
@@ -28,7 +34,7 @@
#endif
#ifdef __FMA4__
-# include <fma4intrin.h>
+# include <x86intrin.h>
#endif
@@ -53,46 +59,79 @@
// Vector type corresponding to CCTK_REAL
-#define CCTK_REAL8_VEC __m128d
+typedef __m128d CCTK_REAL8_VEC;
+typedef __m128i CCTK_INTEGER8_VEC;
+typedef __m128d CCTK_BOOLEAN8_VEC;
// Number of vector elements in a CCTK_REAL_VEC
#define CCTK_REAL8_VEC_SIZE 2
+vec_static_assert(sizeof(CCTK_REAL8_VEC) ==
+ sizeof(CCTK_REAL8) * CCTK_REAL8_VEC_SIZE);
+
// Integer and boolean types corresponding to this real type
-#define CCTK_INTEGER8 CCTK_REAL8
-#define CCTK_BOOLEAN8 CCTK_REAL8
-#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC
-#define CCTK_BOOLEAN8_VEC CCTK_REAL8_VEC
+typedef CCTK_INT8 CCTK_INTEGER8;
+typedef CCTK_REAL8 CCTK_BOOLEAN8;
union k8const_t {
- long long i[2];
- double f[2];
- __m128i vi;
- __m128d vf;
+ CCTK_INTEGER8 i[CCTK_REAL8_VEC_SIZE];
+ CCTK_REAL8 f[CCTK_REAL8_VEC_SIZE];
+ CCTK_INTEGER8_VEC vi;
+ CCTK_REAL8_VEC vf;
};
-#define K8_IMIN ((long long)0x8000000000000000ULL)
+#define k8sign (vec8_set1i( (CCTK_INTEGER8)(1ULL << 63ULL)))
+#define k8notsign (vec8_set1i(~ (CCTK_INTEGER8)(1ULL << 63ULL)))
// Create vectors, extract vector elements
-#define vec8_set1(a) (_mm_set1_pd(a))
-#define vec8_set(a,b) (_mm_set_pd(b,a)) // note reversed arguments
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_set1(CCTK_REAL8 const a)
+{
+ return _mm_set1_pd(a);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_set1i(CCTK_INT8 const a)
+{
+#if defined(__INTEL_COMPILER)
+ // Intel 11.1 does not support _mm_set1_epi64x
+ return _mm_set1_pd(*(CCTK_REAL8 const*)&a);
+#else
+ return _mm_castsi128_pd(_mm_set1_epi64x(a));
+#endif
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_set(CCTK_REAL8 const a, CCTK_REAL8 const b)
+{
+ return _mm_set_pd(b,a); // note reversed arguments
+}
// original order is 01
-#define vec8_swap10(x_) \
- ({ \
- CCTK_REAL8_VEC const x__=(x_); \
- CCTK_REAL8_VEC const x=x__; \
- _mm_shuffle_pd(x,x, _MM_SHUFFLE2(0,1)); \
- })
-
-#define vec8_elt0(x) (((CCTK_REAL8 const*)&(x))[0])
-#define vec8_elt1(x) (((CCTK_REAL8 const*)&(x))[1])
-#define vec8_elt(x,d) (((CCTK_REAL8 const*)&(x))[d])
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_swap10(CCTK_REAL8_VEC const x)
+{
+ return _mm_shuffle_pd(x,x, _MM_SHUFFLE2(0,1));
+}
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8 vec8_elt0(CCTK_REAL8_VEC const x)
+{
+ return ((CCTK_REAL8 const*)&x)[0];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8 vec8_elt1(CCTK_REAL8_VEC const x)
+{
+ return ((CCTK_REAL8 const*)&x)[1];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8 vec8_elt(CCTK_REAL8_VEC const x, std::ptrdiff_t const d)
+{
+ return ((CCTK_REAL8 const*)&x)[d];
+}
@@ -100,141 +139,237 @@ union k8const_t {
// Load a vector from memory (aligned and unaligned); this loads from
// a reference to a scalar
-#define vec8_load(p) (_mm_load_pd(&(p)))
-#define vec8_loadu(p) (_mm_loadu_pd(&(p)))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_load(CCTK_REAL8 const& p)
+{
+ return _mm_load_pd(&p);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_loadu(CCTK_REAL8 const& p)
+{
+ return _mm_loadu_pd(&p);
+}
#if ! VECTORISE_ALWAYS_USE_ALIGNED_LOADS
-# define vec8_load_off1(p) vec_loadu(p)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_load_off1(CCTK_REAL8 const& p)
+{
+ return vec8_loadu(p);
+}
#else
-# define vec8_load_off1(p_) \
- ({ \
- CCTK_REAL8 const& p__=(p_); \
- CCTK_REAL8 const& p=p__; \
- _mm_shuffle_pd(vec8_load((&p)[-1]), \
- vec8_load((&p)[+1]), _MM_SHUFFLE2(0,1)); \
- })
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_load_off1(CCTK_REAL8 const& p)
+{
+ return _mm_shuffle_pd(vec8_load((&p)[-1]),
+ vec8_load((&p)[+1]), _MM_SHUFFLE2(0,1));
+}
#endif
// Load a vector from memory that may or may not be aligned, as
// decided by the offset off and the vector size
#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
// Implementation: Always use unaligned load
-# define vec8_loadu_maybe(off,p) vec8_loadu(p)
-# define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu(p)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL8 const& p)
+{
+ return vec8_loadu(p);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1,
+ std::ptrdiff_t const off2,
+ std::ptrdiff_t const off3,
+ CCTK_REAL8 const& p)
+{
+ return vec8_loadu(p);
+}
#else
-# define vec8_loadu_maybe(off,p_) \
- ({ \
- CCTK_REAL8 const& p__=(p_); \
- CCTK_REAL8 const& p=p__; \
- (off) % CCTK_REAL8_VEC_SIZE == 0 ? \
- vec8_load(p) : \
- vec8_load_off1(p); \
- })
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL8 const& p)
+{
+ // The ?: operator breaks with the Intel compiler
+ // return off % CCTK_REAL8_VEC_SIZE == 0 ? vec8_load(p) : vec8_load_off1(p);
+ if (off % CCTK_REAL8_VEC_SIZE == 0) return vec8_load(p);
+ return vec8_load_off1(p);
+}
# if VECTORISE_ALIGNED_ARRAYS
// Assume all array x sizes are multiples of the vector size
-# define vec8_loadu_maybe3(off1,off2,off3,p) \
- vec8_loadu_maybe(off1,p)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1,
+ std::ptrdiff_t const off2,
+ std::ptrdiff_t const off3,
+ CCTK_REAL8 const& p)
+{
+ return vec8_loadu_maybe(off1, p);
+}
# else
-# define vec8_loadu_maybe3(off1,off2,off3,p_) \
- ({ \
- CCTK_REAL8 const& p__=(p_); \
- CCTK_REAL8 const& p=p__; \
- ((off2) % CCTK_REAL8_VEC_SIZE != 0 or \
- (off3) % CCTK_REAL8_VEC_SIZE != 0) ? \
- vec8_loadu(p) : \
- vec8_loadu_maybe(off1,p); \
- })
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1,
+ std::ptrdiff_t const off2,
+ std::ptrdiff_t const off3,
+ CCTK_REAL8 const& p)
+{
+ return
+ off2 % CCTK_REAL8_VEC_SIZE != 0 or
+ off3 % CCTK_REAL8_VEC_SIZE != 0 ?
+ vec8_loadu(p) :
+ vec8_loadu_maybe(off1, p);
+}
# endif
#endif
// Store a vector to memory (aligned and non-temporal); this stores to
// a reference to a scalar
-#define vec8_store(p,x) (_mm_store_pd(&(p),x))
-#define vec8_storeu(p,x) (_mm_storeu_pd(&(p),x))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store(CCTK_REAL8& p, CCTK_REAL8_VEC const x)
+{
+ _mm_store_pd(&p, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_storeu(CCTK_REAL8& p, CCTK_REAL8_VEC const x)
+{
+ _mm_storeu_pd(&p, x);
+}
#if ! VECTORISE_STREAMING_STORES
-# define vec8_store_nta(p,x) vec8_store(p,x)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta(CCTK_REAL8& p, CCTK_REAL8_VEC const x)
+{
+ vec8_store(p, x);
+}
#else
-# define vec8_store_nta(p,x) (_mm_stream_pd(&(p),x))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta(CCTK_REAL8& p, CCTK_REAL8_VEC const x)
+{
+ _mm_stream_pd(&p, x);
+}
#endif
// Store a partial vector (aligned and non-temporal)
-#define vec8_store_partial_prepare(i,imin,imax) \
- bool const v8stp_lo = (i)>=(imin); \
- bool const v8stp_hi = (i)+CCTK_REAL_VEC_SIZE-1<(imax)
+#define vec8_store_partial_prepare(i, imin,imax) \
+ bool v8stp_lo, v8stp_hi; \
+ vec8_store_partial_prepare_(v8stp_lo, v8stp_hi, i, imin, imax);
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_partial_prepare_(bool& lo, bool& hi,
+ std::ptrdiff_t const i,
+ std::ptrdiff_t const imin,
+ std::ptrdiff_t const imax)
+{
+ lo = i >= imin;
+ hi = i+CCTK_REAL8_VEC_SIZE-1 < imax;
+}
+#define vec8_store_nta_partial(p, x) \
+ vec8_store_nta_partial_(v8stp_lo, v8stp_hi, p, x)
#if VECTORISE_STREAMING_STORES && defined(__SSE4A__)
-# define vec8_store_nta_partial(p_,x_) \
- ({ \
- CCTK_REAL8& p__=(p_); \
- CCTK_REAL8& p=p__; \
- CCTK_REAL8_VEC const x__=(x_); \
- CCTK_REAL8_VEC const x=x__; \
- if (CCTK_BUILTIN_EXPECT(v8stp_lo and v8stp_hi, true)) { \
- vec8_store_nta(p,x); \
- } else if (v8stp_lo) { \
- _mm_stream_sd(&p,x); \
- } else if (v8stp_hi) { \
- _mm_stream_sd(&p+1, vec8_swap10(x)); \
- } \
- })
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_(bool const lo, bool const hi,
+ CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x)
+{
+ if (CCTK_BUILTIN_EXPECT(lo and hi, true)) {
+ vec8_store_nta(p, x);
+ } else if (lo) {
+ _mm_stream_sd(&p, x);
+ } else if (hi) {
+ _mm_stream_sd(&p+1, vec8_swap10(x));
+ }
+}
#else
-# define vec8_store_nta_partial(p_,x_) \
- ({ \
- CCTK_REAL8& p__=(p_); \
- CCTK_REAL8& p=p__; \
- CCTK_REAL8_VEC const x__=(x_); \
- CCTK_REAL8_VEC const x=x__; \
- if (CCTK_BUILTIN_EXPECT(v8stp_lo and v8stp_hi, true)) { \
- vec8_store_nta(p,x); \
- } else if (v8stp_lo) { \
- _mm_storel_pd(&p,x); \
- } else if (v8stp_hi) { \
- _mm_storeh_pd(&p+1,x); \
- } \
- })
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_(bool const lo, bool const hi,
+ CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x)
+{
+ if (CCTK_BUILTIN_EXPECT(lo and hi, true)) {
+ vec8_store_nta(p, x);
+ } else if (lo) {
+ _mm_storel_pd(&p, x);
+ } else if (hi) {
+ _mm_storeh_pd(&p+1, x);
+ }
+}
#endif
// Store a lower or higher partial vector (aligned and non-temporal)
#if ! VECTORISE_STREAMING_STORES
-# define vec8_store_nta_partial_lo(p,x,n) (_mm_storel_pd(&(p),x))
-# define vec8_store_nta_partial_hi(p,x,n) (_mm_storeh_pd(&(p)+1,x))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_lo(CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x,
+ ptrdiff_t const n)
+{
+ _mm_storel_pd(&p, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_hi(CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x,
+ ptrdiff_t const n)
+{
+ _mm_storeh_pd(&p+1, x);
+}
#else
# if defined(__SSE4A__)
-# define vec8_store_nta_partial_lo(p,x,n) (_mm_stream_sd(&(p),x))
-# define vec8_store_nta_partial_hi(p,x,n) \
- (_mm_stream_sd(&(p)+1, vec8_swap10(x)))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_lo(CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x,
+ ptrdiff_t const n)
+{
+ _mm_stream_sd(&p, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_hi(CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x,
+ ptrdiff_t const n)
+{
+ _mm_stream_sd(&p+1, vec8_swap10(x));
+}
# else
// TODO: use clflush once a whole cache line has been written (cache
// lines are usually larger than the CPU vector size)
-# define vec8_store_nta_partial_lo(p_,x,n) \
- ({ \
- CCTK_REAL8& p__=(p_); \
- CCTK_REAL8& p=p__; \
- _mm_storel_pd(&p,x); \
- /* _mm_clflush(&p); */ \
- })
-# define vec8_store_nta_partial_hi(p_,x,n) \
- ({ \
- CCTK_REAL8& p__=(p_); \
- CCTK_REAL8& p=p__; \
- _mm_storeh_pd(&p+1,x); \
- /* _mm_clflush(&p+1); */ \
- })
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_lo(CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x,
+ ptrdiff_t const n)
+{
+ _mm_storel_pd(&p, x);
+ // _mm_clflush(&p);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_hi(CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x,
+ ptrdiff_t const n)
+{
+ _mm_storeh_pd(&p+1, x);
+ // _mm_clflush(&p+1);
+}
# endif
#endif
#if 0
// This is slower; we would need a non-temporal read
-#define vec8_store_nta_partial_lo(p,x,n) \
- vec8_store_nta(p, _mm_loadh_pd(x,&(p)+1))
-#define vec8_store_nta_partial_hi(p,x,n) \
- vec8_store_nta(p, _mm_loadl_pd(x,&(p)))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_lo(CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x,
+ ptrdiff_t const n)
+{
+ vec8_store_nta(p, _mm_loadh_pd(x, &p+1));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_hi(CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x,
+ ptrdiff_t const n)
+{
+ vec8_store_nta(p, _mm_loadl_pd(x, &p));
+}
#endif
-#define vec8_store_nta_partial_mid(p,x,nlo,nhi) assert(0)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_mid(CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x,
+ ptrdiff_t const nlo,
+ ptrdiff_t const nhi)
+{
+ assert(0);
+}
// Functions and operators
-static const k8const_t k8sign_mask = {{ K8_IMIN, K8_IMIN, }};
-
// Operators
// #define k8inot(x) (_mm_xor_si128(k8all_mask,x))
@@ -254,176 +389,320 @@ static const k8const_t k8sign_mask = {{ K8_IMIN, K8_IMIN, }};
// #define k8or(x,y) (_mm_or_pd(x,y))
// #define k8xor(x,y) (_mm_xor_pd(x,y))
-#define k8neg(x) (_mm_xor_pd(k8sign_mask.vf,x))
-
-#define k8add(x,y) (_mm_add_pd(x,y))
-#define k8sub(x,y) (_mm_sub_pd(x,y))
-#define k8mul(x,y) (_mm_mul_pd(x,y))
-#define k8div(x,y) (_mm_div_pd(x,y))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8neg(CCTK_REAL8_VEC const x)
+{
+ return _mm_xor_pd(k8sign, x);
+}
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8add(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm_add_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8sub(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm_sub_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8mul(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm_mul_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8div(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm_div_pd(x, y);
+}
// Fused multiply-add, defined as [+-]x*y[+-]z
#ifdef __FMA4__
-# define k8madd(x,y,z) (_mm_macc_pd(x,y,z))
-# define k8msub(x,y,z) (_mm_msub_pd(x,y,z))
-# define k8nmadd(x,y,z) (_mm_nmsub_pd(x,y,z))
-# define k8nmsub(x,y,z) (_mm_nmacc_pd(x,y,z))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8madd(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return _mm_macc_pd(x, y, z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8msub(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return _mm_msub_pd(x, y, z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8nmadd(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return _mm_nmsub_pd(x, y, z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8nmsub(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return _mm_nmacc_pd(x, y, z);
+}
#else
-# define k8madd(x,y,z) (k8add(k8mul(x,y),z))
-# define k8msub(x,y,z) (k8sub(k8mul(x,y),z))
-# define k8nmadd(x,y,z) (k8sub(k8neg(z),k8mul(x,y)))
-# define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y)))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8madd(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return k8add(k8mul(x, y), z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8msub(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return k8sub(k8mul(x, y), z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8nmadd(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return k8sub(k8neg(z), k8mul(x, y));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8nmsub(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return k8sub(z, k8mul(x, y));
+}
#endif
// Cheap functions
-#define k8copysign(x,y) \
- (_mm_or_pd(_mm_andnot_pd(k8sign_mask.vf,x), \
- _mm_and_pd(k8sign_mask.vf,y)))
-#define k8fabs(x) (_mm_andnot_pd(k8sign_mask.vf,x))
-#define k8fmax(x,y) (_mm_max_pd(x,y))
-#define k8fmin(x,y) (_mm_min_pd(x,y))
-#define k8fnabs(x) (_mm_or_pd(k8sign_mask.vf,x))
-static const k8const_t k8zero = { f: { 0.0, 0.0, }};
-static const k8const_t k8one = { f: { 1.0, 1.0, }};
-#define k8sgn(x_) \
- ({ \
- CCTK_REAL_VEC const x__=(x_); \
- CCTK_REAL_VEC const x=x__; \
- CCTK_REAL_VEC const iszero = _mm_cmpeq_pd(k8zero.vf, x); \
- CCTK_REAL_VEC const sign = _mm_and_pd(k8sign_mask.vf, x); \
- CCTK_REAL_VEC const signedone = _mm_or_pd(k8one.vf, sign); \
- k8ifthen(iszero, k8zero.vf, signedone); \
- })
-#define k8sqrt(x) (_mm_sqrt_pd(x))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8copysign(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm_or_pd(_mm_and_pd(k8notsign, x),
+ _mm_and_pd(k8sign , y));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8fabs(CCTK_REAL8_VEC const x)
+{
+ return _mm_and_pd(k8notsign, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8fmax(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm_max_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8fmin(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm_min_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8fnabs(CCTK_REAL8_VEC const x)
+{
+ return _mm_or_pd(k8sign, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8sqrt(CCTK_REAL8_VEC const x)
+{
+ return _mm_sqrt_pd(x);
+}
// Expensive functions
-#define K8REPL(f,x_) \
- ({ \
- CCTK_REAL8_VEC const x__=(x_); \
- CCTK_REAL8_VEC const x=x__; \
- vec8_set(f(vec8_elt0(x)), \
- f(vec8_elt1(x))); \
- })
-#define K8REPL2S(f,x_,a_) \
- ({ \
- CCTK_REAL8_VEC const x__=(x_); \
- CCTK_REAL8 const a__=(a_); \
- CCTK_REAL8_VEC const x=x__; \
- CCTK_REAL8 const a=a__; \
- vec8_set(f(vec8_elt0(x),a), \
- f(vec8_elt1(x),a)); \
- })
-#define K8REPL2(f,x_,y_) \
- ({ \
- CCTK_REAL8_VEC const x__=(x_); \
- CCTK_REAL8_VEC const y__=(y_); \
- CCTK_REAL8_VEC const x=x__; \
- CCTK_REAL8_VEC const y=y__; \
- vec8_set(f(vec8_elt0(x),vec8_elt0(y)), \
- f(vec8_elt1(x),vec8_elt1(y))); \
- })
-
-#define k8acos(x) K8REPL(acos,x)
-#define k8acosh(x) K8REPL(acosh,x)
-#define k8asin(x) K8REPL(asin,x)
-#define k8asinh(x) K8REPL(asinh,x)
-#define k8atan(x) K8REPL(atan,x)
-#define k8atan2(x,y) K8REPL2(atan2,x,y)
-#define k8atanh(x) K8REPL(atanh,x)
-#define k8cos(x) K8REPL(cos,x)
-#define k8cosh(x) K8REPL(cosh,x)
-#define k8exp(x) K8REPL(exp,x)
-#define k8log(x) K8REPL(log,x)
-#define k8pow(x,a) K8REPL2S(pow,x,a)
-#define k8sin(x) K8REPL(sin,x)
-#define k8sinh(x) K8REPL(sinh,x)
-#define k8tan(x) K8REPL(tan,x)
-#define k8tanh(x) K8REPL(tanh,x)
-
-static const k8const_t k8lfalse_ = {{ +0LL, +0LL, }};
-static const k8const_t k8ltrue_ = {{ -1LL, -1LL, }};
-#define k8lfalse (k8lfalse_.vf)
-#define k8ltrue (k8ltrue_.vf)
-#define k8lnot(x) (_mm_xor_pd(k8ltrue,x))
-#define k8land(x,y) (_mm_and_pd(x,y))
-#define k8lor(x,y) (_mm_or_pd(x,y))
-#define k8lxor(x,y) (_mm_xor_pd(x,y))
-
+#define K8REPL(f,x) \
+ vec8_set(f(vec8_elt0(x)), \
+ f(vec8_elt1(x)));
+#define K8REPL2S(f,x,a) \
+ vec8_set(f(vec8_elt0(x),a), \
+ f(vec8_elt1(x),a));
+#define K8REPL2(f,x,y) \
+ vec8_set(f(vec8_elt0(x),vec8_elt0(y)), \
+ f(vec8_elt1(x),vec8_elt1(y)));
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8acos(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(acos,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8acosh(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(acosh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8asin(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(asin,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8asinh(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(asinh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8atan(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(atan,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8atan2(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return K8REPL2(atan2,x,y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8atanh(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(atanh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8cos(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(cos,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8cosh(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(cosh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8exp(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(exp,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8log(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(log,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8pow(CCTK_REAL8_VEC const x, CCTK_REAL8 const a)
+{
+ return K8REPL2S(pow,x,a);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8sin(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(sin,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8sinh(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(sinh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8tan(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(tan,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8tanh(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(tanh,x);
+}
+
+
+
+#define k8lfalse (vec8_set1i( 0))
+#define k8ltrue (vec8_set1i(~0))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8lnot(CCTK_BOOLEAN8_VEC const x)
+{
+ return _mm_xor_pd(k8ltrue, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8land(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y)
+{
+ return _mm_and_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8lor(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y)
+{
+ return _mm_or_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8lxor(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y)
+{
+ return _mm_xor_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8ifthen(CCTK_BOOLEAN8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
#ifdef __SSE4_1__
-# define k8ifthen(x,y,z) (_mm_blendv_pd(z,y,x))
+ return _mm_blendv_pd(z,y,x);
#elif 0
-// This is slow (but this is what Intel/PGI produce by themselves)
-# define k8ifthen(x_,y_,z_) \
- ({ \
- CCTK_REAL8_VEC const x__=(x_); \
- CCTK_REAL8_VEC const y__=(y_); \
- CCTK_REAL8_VEC const z__=(z_); \
- CCTK_REAL8_VEC const x=x__; \
- CCTK_REAL8_VEC const y=y__; \
- CCTK_REAL8_VEC const z=z__; \
- int const m = _mm_movemask_pd(x); \
- CCTK_REAL8_VEC r; \
- switch (m) { \
- case 0: r = y; break; \
- case 1: r = _mm_move_sd(y,z); break; \
- case 2: r = _mm_move_sd(z,y); break; \
- case 3: r = z; break; \
- } \
- r; \
- })
+ // This is slow (but it is what the Intel and PGI compilers generate on their own)
+ int const m = _mm_movemask_pd(x);
+ switch (m) {
+ case 0: return y;
+ case 1: return _mm_move_sd(y,z);
+ case 2: return _mm_move_sd(z,y);
+ }
+ return z;
#elif 0
-# ifdef __cplusplus
-# define k8signbit(x) ({ using namespace std; signbit(x); })
-# else
-# define k8signbit(x) (signbit(x))
-# endif
-# define k8ifthen(x_,y_,z_) \
- ({ \
- CCTK_REAL8_VEC const x__=(x_); \
- CCTK_REAL8_VEC const y__=(y_); \
- CCTK_REAL8_VEC const z__=(z_); \
- CCTK_REAL8_VEC const x=x__; \
- CCTK_REAL8_VEC const y=y__; \
- CCTK_REAL8_VEC const z=z__; \
- vec8_set(k8signbit(vec8_elt0(x)) ? vec8_elt0(y) : vec8_elt0(z), \
- k8signbit(vec8_elt1(x)) ? vec8_elt1(y) : vec8_elt1(z)); \
- })
+ return vec8_set(std::signbit(vec8_elt0(x)) ? vec8_elt0(y) : vec8_elt0(z),
+ std::signbit(vec8_elt1(x)) ? vec8_elt1(y) : vec8_elt1(z));
#elif 0
-// We don't need to shift -- the condition (mask) will be either all
-// zeros or all ones
-static const k8const_t k8ione = {{ 0x1ULL, 0x1ULL, }};
-# define k8ifthen(x_,y_,z_) \
- ({ \
- CCTK_REAL8_VEC const x__=(x_); \
- CCTK_REAL8_VEC const y__=(y_); \
- CCTK_REAL8_VEC const z__=(z_); \
- CCTK_REAL8_VEC const x=x__; \
- CCTK_REAL8_VEC const y=y__; \
- CCTK_REAL8_VEC const z=z__; \
- /* there is no _mm_srai_epi64(x, 63); we therefore calculate srli(x)-1 */ \
- __m128i const x_int = *(__m128i const*)&x; \
- __m128i const imask_int = \
- _mm_sub_epi64(_mm_srli_epi64(x_int, 63), k8ione.vi); \
- CCTK_REAL8_VEC const imask = *(CCTK_REAL8_VEC const*)&imask_int; \
- /* (z & ~mask) | (y & mask) where imask = ~mask */ \
- _mm_or_pd(_mm_and_pd(imask, z), _mm_andnot_pd(imask, y)); \
- })
+ // We don't need to shift -- the condition (mask) will be either all
+ // zeros or all ones
+ k8const_t const k8ione = { i: { 1, 1, }};
+ // there is no _mm_srai_epi64(x, 63); we therefore calculate srli(x)-1
+ __m128i const x_int = *(__m128i const*)&x;
+ __m128i const imask_int = _mm_sub_epi64(_mm_srli_epi64(x_int, 63), k8ione.vi);
+ CCTK_REAL8_VEC const imask = *(CCTK_REAL8_VEC const*)&imask_int;
+ // (z & ~mask) | (y & mask) where imask = ~mask
+ return _mm_or_pd(_mm_and_pd(imask, z), _mm_andnot_pd(imask, y));
#else
-# define k8ifthen(x_,y_,z_) \
- ({ \
- CCTK_REAL8_VEC const x__=(x_); \
- CCTK_REAL8_VEC const y__=(y_); \
- CCTK_REAL8_VEC const z__=(z_); \
- CCTK_REAL8_VEC const x=x__; \
- CCTK_REAL8_VEC const y=y__; \
- CCTK_REAL8_VEC const z=z__; \
- /* (z & ~mask) | (y & mask) where imask = ~mask */ \
- _mm_or_pd(_mm_and_pd(x, y), _mm_andnot_pd(x, z)); \
- })
+ // This assumes that all logical operations always return either
+ // lfalse or ltrue, and nothing "in between"
+ // (y & mask) | (z & ~mask) where mask = x
+ return _mm_or_pd(_mm_and_pd(x, y), _mm_andnot_pd(x, z));
#endif
+}
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8cmpeq(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm_cmpeq_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8cmpne(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm_cmpneq_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8cmpgt(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm_cmpgt_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8cmpge(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm_cmpge_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8cmplt(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm_cmplt_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8cmple(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm_cmple_pd(x, y);
+}
+
+
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8sgn(CCTK_REAL8_VEC const x)
+{
+ CCTK_BOOLEAN8_VEC const iszero = k8cmpeq(x, vec8_set1(0.0));
+ CCTK_REAL8_VEC const sign = _mm_and_pd(k8sign, x);
+ CCTK_REAL8_VEC const signedone = _mm_or_pd(sign, vec8_set1(1.0));
+ return k8ifthen(iszero, vec8_set1(0.0), signedone);
+}
-#define k8cmpeq(x,y) (_mm_cmpeq_pd(x,y))
-#define k8cmpne(x,y) (_mm_cmpneq_pd(x,y))
-#define k8cmpgt(x,y) (_mm_cmpgt_pd(x,y))
-#define k8cmpge(x,y) (_mm_cmpge_pd(x,y))
-#define k8cmplt(x,y) (_mm_cmplt_pd(x,y))
-#define k8cmple(x,y) (_mm_cmple_pd(x,y))
+#endif
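[Editorial note, not part of the patch] The final k8ifthen branch above selects
lanes with plain bitwise operations and therefore assumes the boolean vector is
all-ones or all-zeros per lane, as its comment states. A scalar sketch of the same
bit-select trick (the function name is an illustrative assumption):

    #include <cstdint>
    #include <cstring>
    // (mask & y) | (~mask & z): yields y where mask is all-ones, z where all-zeros.
    double select_bits(std::uint64_t mask, double y, double z) {
      std::uint64_t iy, iz, ir;
      std::memcpy(&iy, &y, sizeof iy);
      std::memcpy(&iz, &z, sizeof iz);
      ir = (mask & iy) | (~mask & iz);
      double r;
      std::memcpy(&r, &ir, sizeof r);
      return r;
    }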
diff --git a/src/vectors-8-VSX.h b/src/vectors-8-VSX.h
index 35af574..1faae76 100644
--- a/src/vectors-8-VSX.h
+++ b/src/vectors-8-VSX.h
@@ -20,9 +20,9 @@
#define CCTK_REAL8_VEC_SIZE 2
// Integer and boolean types corresponding to this real type
-#define CCTK_INTEGER8 long long
+//#define CCTK_INTEGER8 long long
#define CCTK_BOOLEAN8 long long
-#define CCTK_INTEGER8_VEC vector long long
+//#define CCTK_INTEGER8_VEC vector long long
#define CCTK_BOOLEAN8_VEC vector bool long long
diff --git a/src/vectors-8-default.h b/src/vectors-8-default.h
index 7ff6c8c..5c07bfb 100644
--- a/src/vectors-8-default.h
+++ b/src/vectors-8-default.h
@@ -1,13 +1,9 @@
// Fallback vectorisation implementation: Do not vectorise
-// We use macros here, so that we are not surprised by compilers which
-// don't like to inline functions. This should also make debug builds
-// (which may not inline) more efficient.
-
-#include <assert.h>
-#include <math.h>
+#include <cassert>
+#include <cmath>
@@ -19,10 +15,13 @@
// Number of vector elements in a vector
#define CCTK_REAL8_VEC_SIZE 1
+vec_static_assert(sizeof(CCTK_REAL8_VEC) ==
+ sizeof(CCTK_REAL8) * CCTK_REAL8_VEC_SIZE);
+
// Integer and boolean types corresponding to this real type
-#define CCTK_INTEGER8 CCTK_REAL8
+//#define CCTK_INTEGER8 CCTK_REAL8
#define CCTK_BOOLEAN8 CCTK_REAL8
-#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC
+//#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC
#define CCTK_BOOLEAN8_VEC CCTK_REAL8_VEC
@@ -89,7 +88,7 @@
#define k8atan(x) (atan(x))
#define k8atan2(x,y) (atan2(x,y))
#define k8atanh(x) (atanh(x))
-#define k8copysign(x,y) (copysign(x,y))
+#define k8copysign(x,y) (std::copysign(x,y))
#define k8cos(x) (cos(x))
#define k8cosh(x) (cosh(x))
#define k8exp(x) (exp(x))
@@ -104,17 +103,16 @@
#define k8sqrt(x) (sqrt(x))
#define k8tan(x) (tan(x))
#define k8tanh(x) (tanh(x))
-
-#define k8sgn(x_) \
- ({ \
- CCTK_REAL x__=(x_); \
- CCTK_REAL x=x__; \
- x==(CCTK_REAL)0.0 ? (CCTK_REAL)0.0 : std::copysign((CCTK_REAL)1.0, x); \
- })
-#define k8signbit(x) (std::signbit(x))
-
-#define k8l2r(x_) ({ CCTK_INT8 x__=(x_); CCTK_INT8 x=x__; *(CCTK_REAL8*)&x; })
-#define k8r2l(x_) ({ CCTK_REAL8 x__=(x_); CCTK_REAL8 x=x__; *(CCTK_INT8*)&x; })
+#define k8signbit(x) (std::signbit(x))
+
+static inline CCTK_REAL8_VEC k8l2r(CCTK_INT8 const x)
+{
+ return *(CCTK_REAL8 const*)&x;
+}
+static inline CCTK_INT8 k8r2l(CCTK_REAL8_VEC const x)
+{
+ return *(CCTK_INT8 const*)&x;
+}
#define k8lfalse k8l2r(0)
#define k8ltrue k8l2r(1)
#define k8lnot(x) k8l2r(!k8r2l(x))
@@ -130,3 +128,8 @@
#define k8cmpge(x,y) k8l2r((x)>=(y))
#define k8cmplt(x,y) k8l2r((x)<(y))
#define k8cmple(x,y) k8l2r((x)<=(y))
+
+static inline CCTK_REAL8_VEC k8sgn(CCTK_REAL8_VEC const x)
+{
+ return x==(CCTK_REAL8)0.0 ? (CCTK_REAL8)0.0 : k8copysign((CCTK_REAL8)1.0, x);
+}
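[Editorial note, not part of the patch] In the scalar fallback above, booleans are
CCTK_REAL8 values whose bit patterns are the integers 0 and 1, so the logical
operations round-trip through k8r2l/k8l2r rather than comparing floating-point
values. A standalone sketch of that encoding (the names below are illustrative):

    #include <cstdint>
    #include <cstring>
    // Reinterpret an integer truth value as a double bit pattern and back.
    double l2r(std::int64_t b) { double r; std::memcpy(&r, &b, sizeof r); return r; }
    std::int64_t r2l(double x) { std::int64_t b; std::memcpy(&b, &x, sizeof b); return b; }
    double lnot(double x) { return l2r(!r2l(x)); }  // mirrors k8lnot above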
diff --git a/src/vectors.h b/src/vectors.h
index c87446e..08b9f91 100644
--- a/src/vectors.h
+++ b/src/vectors.h
@@ -1,3 +1,5 @@
+// -*-C++-*-
+
#ifndef VECTORS_H
#define VECTORS_H
@@ -5,23 +7,26 @@
+#define vec_static_assert(x) namespace { typedef int vsa[(x) ? 1 : -1]; }
+
+
+
#if VECTORISE
-/* TODO: support AVX */
-# if defined(__SSE__) // Intel SSE
+# if defined(__AVX__) // Intel AVX
+# include "vectors-4-AVX.h"
+# elif defined(__SSE__) // Intel SSE
# include "vectors-4-SSE.h"
# elif defined(__ALTIVEC__) // Power Altivec
# include "vectors-4-Altivec.h"
# endif
-# if defined(__AVX__) // Intel AVX
+# if defined(__MIC__) // Intel MIC
+# include "vectors-8-MIC.h"
+# elif defined(__AVX__) && !defined(DISABLE_AVX) // Intel AVX
# include "vectors-8-AVX.h"
# elif defined(__SSE2__) // Intel SSE2
-# if VECTORISE_EMULATE_AVX
-# include "vectors-8-AVX.h"
-# else
-# include "vectors-8-SSE2.h"
-# endif
+# include "vectors-8-SSE2.h"
# elif defined(__bgq__) && defined(__VECTOR4DOUBLE__) // Blue Gene/Q QPX
# include "vectors-8-QPX.h"
# elif defined(__ALTIVEC__) && defined(_ARCH_PWR7) // Power VSX
@@ -33,10 +38,10 @@
#endif
// Default implementation, do not vectorise
-#if ! defined(CCTK_REAL4_VEC_SIZE)
+#ifndef CCTK_REAL4_VEC_SIZE
# include "vectors-4-default.h"
#endif
-#if ! defined(CCTK_REAL8_VEC_SIZE)
+#ifndef CCTK_REAL8_VEC_SIZE
# include "vectors-8-default.h"
#endif
@@ -130,9 +135,9 @@
# define CCTK_REAL_VEC CCTK_REAL8_VEC
# define CCTK_REAL_VEC_SIZE CCTK_REAL8_VEC_SIZE
-# define CCTK_INTEGER CCTK_INTEGER8
+//# define CCTK_INTEGER CCTK_INTEGER8
# define CCTK_BOOLEAN CCTK_BOOLEAN8
-# define CCTK_INTEGER_VEC CCTK_INTEGER8_VEC
+//# define CCTK_INTEGER_VEC CCTK_INTEGER8_VEC
# define CCTK_BOOLEAN_VEC CCTK_BOOLEAN8_VEC
# define vec_set1 vec8_set1
@@ -241,43 +246,54 @@
#ifdef __cplusplus
+#include <cstdlib>
+
template<typename T>
struct vecprops {
typedef T scalar_t;
typedef T vector_t;
- static inline int size()
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ std::size_t size()
{
return 1;
}
- static inline vector_t load (scalar_t const& a)
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vector_t load (scalar_t const& a)
{
return a;
}
- static inline vector_t loadu (scalar_t const& a)
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vector_t loadu (scalar_t const& a)
{
return a;
}
- static inline scalar_t elt (vector_t const& x, int const d)
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ scalar_t elt (vector_t const& x, std::ptrdiff_t const d)
{
return x;
}
- static inline vector_t neg (vector_t const& x)
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vector_t neg (vector_t const& x)
{
return -x;
}
- static inline vector_t add (vector_t const& x, vector_t const& y)
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vector_t add (vector_t const& x, vector_t const& y)
{
return x+y;
}
- static inline vector_t sub (vector_t const& x, vector_t const& y)
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vector_t sub (vector_t const& x, vector_t const& y)
{
return x-y;
}
- static inline vector_t mul (vector_t const& x, vector_t const& y)
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vector_t mul (vector_t const& x, vector_t const& y)
{
return x*y;
}
- static inline vector_t div (vector_t const& x, vector_t const& y)
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vector_t div (vector_t const& x, vector_t const& y)
{
return x/y;
}
@@ -287,39 +303,48 @@ template<>
struct vecprops<CCTK_REAL4> {
typedef CCTK_REAL4 scalar_t;
typedef CCTK_REAL4_VEC vector_t;
- static inline int size()
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ int size()
{
return CCTK_REAL4_VEC_SIZE;
}
- static inline vector_t load (scalar_t const& a)
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vector_t load (scalar_t const& a)
{
return vec4_load(a);
}
- static inline vector_t loadu (scalar_t const& a)
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vector_t loadu (scalar_t const& a)
{
return vec4_loadu(a);
}
- static inline scalar_t elt (vector_t const& x, int const d)
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ scalar_t elt (vector_t const& x, int const d)
{
return vec4_elt(x,d);
}
- static inline vector_t neg (vector_t const& x)
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vector_t neg (vector_t const& x)
{
return k4neg(x);
}
- static inline vector_t add (vector_t const& x, vector_t const& y)
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vector_t add (vector_t const& x, vector_t const& y)
{
return k4add(x,y);
}
- static inline vector_t sub (vector_t const& x, vector_t const& y)
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vector_t sub (vector_t const& x, vector_t const& y)
{
return k4sub(x,y);
}
- static inline vector_t mul (vector_t const& x, vector_t const& y)
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vector_t mul (vector_t const& x, vector_t const& y)
{
return k4mul(x,y);
}
- static inline vector_t div (vector_t const& x, vector_t const& y)
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vector_t div (vector_t const& x, vector_t const& y)
{
return k4div(x,y);
}
@@ -329,44 +354,143 @@ template<>
struct vecprops<CCTK_REAL8> {
typedef CCTK_REAL8 scalar_t;
typedef CCTK_REAL8_VEC vector_t;
- static inline int size()
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ int size()
{
return CCTK_REAL8_VEC_SIZE;
}
- static inline vector_t load (scalar_t const& a)
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vector_t load (scalar_t const& a)
{
return vec8_load(a);
}
- static inline vector_t loadu (scalar_t const& a)
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vector_t loadu (scalar_t const& a)
{
return vec8_loadu(a);
}
- static inline scalar_t elt (vector_t const& x, int const d)
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ scalar_t elt (vector_t const& x, int const d)
{
return vec8_elt(x,d);
}
- static inline vector_t neg (vector_t const& x)
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vector_t neg (vector_t const& x)
{
return k8neg(x);
}
- static inline vector_t add (vector_t const& x, vector_t const& y)
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vector_t add (vector_t const& x, vector_t const& y)
{
return k8add(x,y);
}
- static inline vector_t sub (vector_t const& x, vector_t const& y)
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vector_t sub (vector_t const& x, vector_t const& y)
{
return k8sub(x,y);
}
- static inline vector_t mul (vector_t const& x, vector_t const& y)
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vector_t mul (vector_t const& x, vector_t const& y)
{
return k8mul(x,y);
}
- static inline vector_t div (vector_t const& x, vector_t const& y)
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vector_t div (vector_t const& x, vector_t const& y)
{
return k8div(x,y);
}
};
+template<typename T>
+struct vectype {
+private:
+ typedef vecprops<T> props;
+public:
+ typedef typename props::vector_t vector_t;
+ typedef typename props::scalar_t scalar_t;
+ vector_t v;
+ vectype() { }
+ vectype(vectype const& x): v(x.v) { }
+ vectype(vector_t const& x): v(x) { }
+ operator vector_t() const { return v; }
+ vectype& operator=(vectype const& x) { v=x.v; return *this; }
+
+ inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ std::size_t size() const {
+ return props::size();
+ }
+
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vectype load(scalar_t const& a)
+ {
+ return props::load(a);
+ }
+ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vectype loadu(scalar_t const& a)
+ {
+ return props::loadu(a);
+ }
+
+ inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ scalar_t elt(std::ptrdiff_t const d) const
+ {
+ return props::elt(*this, d);
+ }
+
+ inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vectype operator+() const
+ {
+ return *this;
+ }
+ inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vectype operator-() const
+ {
+ return props::neg(*this);
+ }
+
+ inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vectype operator+(vectype const& x) const
+ {
+ return props::add(*this, x);
+ }
+ inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vectype operator-(vectype const& x) const
+ {
+ return props::sub(*this, x);
+ }
+ inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vectype operator*(vectype const& x) const
+ {
+ return props::mul(*this, x);
+ }
+ inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vectype operator/(vectype const& x) const
+ {
+ return props::div(*this, x);
+ }
+
+ inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vectype& operator+=(vectype const& x)
+ {
+ return *this = *this+x;
+ }
+ inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vectype& operator-=(vectype const& x)
+ {
+ return *this = *this-x;
+ }
+ inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vectype& operator*=(vectype const& x)
+ {
+ return *this = *this*x;
+ }
+ inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+ vectype& operator/=(vectype const& x)
+ {
+ return *this = *this/x;
+ }
+};
+
#endif
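[Editorial note, not part of the patch] The vectype<T> wrapper added above lets
generic C++ code reach the k*-kernels through ordinary operators. A usage sketch,
assuming p and q point to suitably aligned CCTK_REAL8 data (both names are
illustrative):

    vectype<CCTK_REAL8> u = vectype<CCTK_REAL8>::load(p[0]);   // forwards to vec8_load
    vectype<CCTK_REAL8> v = vectype<CCTK_REAL8>::loadu(q[0]);  // forwards to vec8_loadu
    u += v * v;                         // operator overloads map onto k8add/k8mul
    CCTK_REAL8 const first = u.elt(0);  // forwards to vec8_elt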
@@ -383,6 +507,18 @@ struct vecprops<CCTK_REAL8> {
# undef ToReal
# define ToReal(x) (vec_set1(CCTK_REAL(x)))
+# undef IfThen
+# ifdef __PGI
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL_VEC vec_IfThen(CCTK_BOOLEAN x, CCTK_REAL_VEC y, CCTK_REAL_VEC z)
+{
+ if (x) return y; else return z;
+}
+# define IfThen(x,y,z) vec_IfThen(x,y,z)
+# else
+# define IfThen(x,y,z) ((x) ? CCTK_REAL_VEC(y) : CCTK_REAL_VEC(z))
+# endif
+
# undef KRANC_GFOFFSET3D
# define KRANC_GFOFFSET3D(var,i,j,k) \
vec_loadu_maybe3((i),(j),(k), \