From 5d4858e0736a0c0881c65b9e9ac0983d3b5bb24b Mon Sep 17 00:00:00 2001 From: eschnett Date: Thu, 20 Jan 2011 20:22:34 +0000 Subject: Change naming scheme of architecture files Add support for AVX (next-generation SSE) Add support for Double Hummer (Blue Gene/P) git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@7 105869f7-3296-0410-a4ea-f4349344b45a --- src/avxintrin_emu.h | 1061 ++++++++++++++++++++++++++++++++++++++++++ src/vectors-4-Altivec.h | 132 ++++++ src/vectors-4-SSE.h | 173 +++++++ src/vectors-4-default.h | 79 ++++ src/vectors-8-AVX.h | 163 +++++++ src/vectors-8-DoubleHummer.h | 108 +++++ src/vectors-8-SSE2.h | 148 ++++++ src/vectors-8-VSX.h | 110 +++++ src/vectors-8-default.h | 79 ++++ src/vectors-default-4.h | 79 ---- src/vectors-default-8.h | 79 ---- src/vectors-intel-4.h | 173 ------- src/vectors-intel-8.h | 148 ------ src/vectors-power-4.h | 128 ----- src/vectors-power-8.h | 106 ----- src/vectors.h | 32 +- 16 files changed, 2075 insertions(+), 723 deletions(-) create mode 100644 src/avxintrin_emu.h create mode 100644 src/vectors-4-Altivec.h create mode 100644 src/vectors-4-SSE.h create mode 100644 src/vectors-4-default.h create mode 100644 src/vectors-8-AVX.h create mode 100644 src/vectors-8-DoubleHummer.h create mode 100644 src/vectors-8-SSE2.h create mode 100644 src/vectors-8-VSX.h create mode 100644 src/vectors-8-default.h delete mode 100644 src/vectors-default-4.h delete mode 100644 src/vectors-default-8.h delete mode 100644 src/vectors-intel-4.h delete mode 100644 src/vectors-intel-8.h delete mode 100644 src/vectors-power-4.h delete mode 100644 src/vectors-power-8.h diff --git a/src/avxintrin_emu.h b/src/avxintrin_emu.h new file mode 100644 index 0000000..3097cd7 --- /dev/null +++ b/src/avxintrin_emu.h @@ -0,0 +1,1061 @@ +/* + Copyright (c) 2010, Intel Corporation. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/*** + + Provide feedback to: maxim.locktyukhin intel com, phil.j.kerly intel com + + Version 1.0 - Initial release. 
+
+   This AVX intrinsics emulation header file is designed to work with Intel C/C++
+   as well as GCC compilers.
+
+   Known Issues and limitations:
+
+   - does not support immediate values higher than 0x7 for _mm[256]_cmp_[ps|pd]
+     intrinsics, UD2 instruction will be generated instead
+
+   - -O0 optimization level may _sometimes_ result with compile time errors due
+     to failed forced inline and compiler not being able to generate instruction
+     with constant immediate operand because of it, compiling with -O1 and/or
+     -finline-functions should help.
+
+***/
+
+
+#ifndef __EMU_M256_AVXIMMINTRIN_EMU_H__
+#define __EMU_M256_AVXIMMINTRIN_EMU_H__
+
+#ifdef __GNUC__
+
+#ifdef __SSE__
+#include <xmmintrin.h>
+#endif
+
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+#ifdef __SSE3__
+#include <pmmintrin.h>
+#endif
+
+#ifdef __SSSE3__
+#include <tmmintrin.h>
+#endif
+
+#if defined (__SSE4_2__) || defined (__SSE4_1__)
+#include <smmintrin.h>
+#endif
+
+#if defined (__AES__) || defined (__PCLMUL__)
+#include <wmmintrin.h>
+#endif
+
+#else
+
+#include <immintrin.h>
+
+#endif
+
+#pragma message (" --- Intel remark: AVX intrinsics are emulated with SSE ---")
+
+/*
+ * Intel(R) AVX compiler intrinsics.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This is an emulation of Intel AVX
+ */
+
+#if defined( _MSC_VER ) || defined( __INTEL_COMPILER )
+    #define __EMU_M256_ALIGN( a ) __declspec(align(a))
+    #define __emu_inline __forceinline
+    #define __emu_int64_t __int64
+#elif defined( __GNUC__ )
+    #define __EMU_M256_ALIGN( a ) __attribute__((__aligned__(a)))
+    #define __emu_inline __inline __attribute__((__always_inline__))
+    #define __emu_int64_t long long
+#else
+    #error "unsupported platform"
+#endif
+
+typedef union __EMU_M256_ALIGN(32) __emu__m256
+{
+    float __emu_arr[8];
+    __m128 __emu_m128[2];
+} __emu__m256;
+
+typedef union __EMU_M256_ALIGN(32) __emu__m256d
+{
+    double __emu_arr[4];
+    __m128d __emu_m128[2];
+} __emu__m256d;
+
+typedef union __EMU_M256_ALIGN(32) __emu__m256i
+{
+    int __emu_arr[8];
+    __m128i __emu_m128[2];
+} __emu__m256i;
+
+static __emu_inline __emu__m256 __emu_set_m128( const __m128 arr[] ) { __emu__m256 ret; ret.__emu_m128[0] = arr[0]; ret.__emu_m128[1] = arr[1]; return (ret); }
+static __emu_inline __emu__m256d __emu_set_m128d( const __m128d arr[] ) { __emu__m256d ret; ret.__emu_m128[0] = arr[0]; ret.__emu_m128[1] = arr[1]; return (ret); }
+static __emu_inline __emu__m256i __emu_set_m128i( const __m128i arr[] ) { __emu__m256i ret; ret.__emu_m128[0] = arr[0]; ret.__emu_m128[1] = arr[1]; return (ret); }
+
+
+#define __EMU_M256_IMPL_M1( type, func ) \
+static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1 ) \
+{   __emu##type res; \
+    res.__emu_m128[0] = _mm_##func( m256_param1.__emu_m128[0] ); \
+    res.__emu_m128[1] = _mm_##func( m256_param1.__emu_m128[1] ); \
+    return ( res ); \
+}
+
+#define __EMU_M256_IMPL_M1_RET( ret_type, type, func ) \
+static __emu_inline __emu##ret_type __emu_mm256_##func( __emu##type m256_param1 ) \
+{   __emu##ret_type res; \
+    res.__emu_m128[0] = _mm_##func( m256_param1.__emu_m128[0] ); \
+    res.__emu_m128[1] = _mm_##func( m256_param1.__emu_m128[1] ); \
+    return ( res ); \
+}
+
+#define __EMU_M256_IMPL_M1_RET_NAME( ret_type, type, func, name ) \
+    static __emu_inline __emu##ret_type __emu_mm256_##name( __emu##type m256_param1 ) \
+{   __emu##ret_type res; \
+    res.__emu_m128[0] = _mm_##func( m256_param1.__emu_m128[0] ); \
+    res.__emu_m128[1] = _mm_##func( m256_param1.__emu_m128[1] ); \
+    return ( res ); \
+}
+
+#define __EMU_M256_IMPL_M1_LH( type, type_128, func ) \
+static __emu_inline __emu##type __emu_mm256_##func( type_128 m128_param ) \
+{   __emu##type res; \
+    res.__emu_m128[0] = _mm_##func( m128_param ); \
+    __m128 m128_param_high = _mm_movehl_ps( *(__m128*)&m128_param, *(__m128*)&m128_param ); \
+    res.__emu_m128[1] = _mm_##func( *(type_128*)&m128_param_high ); \
+    return ( res ); \
+}
+
+#define __EMU_M256_IMPL_M1_HL( type_128, type, func ) \
+static __emu_inline type_128 __emu_mm256_##func( __emu##type m256_param1 ) \
+{   type_128 res, tmp; \
+    res = _mm_##func( m256_param1.__emu_m128[0] ); \
+    tmp = _mm_##func( m256_param1.__emu_m128[1] ); \
+    *(((__emu_int64_t*)&res)+1) = *(__emu_int64_t*)&tmp; \
+    return ( res ); \
+}
+
+#define __EMU_M256_IMPL_M1P_DUP( type, type_param, func ) \
+static __emu_inline __emu##type __emu_mm256_##func( type_param param ) \
+{   __emu##type res; \
+    res.__emu_m128[0] = _mm_##func( param ); \
+    res.__emu_m128[1] = _mm_##func( param ); \
+    return ( res ); \
+}
+
+#define __EMU_M256_IMPL_M1I_DUP( type, func ) \
+    static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, const int param2 ) \
+{   __emu##type res; \
+    res.__emu_m128[0] = _mm_##func( m256_param1.__emu_m128[0], param2 ); \
+    res.__emu_m128[1] = _mm_##func( m256_param1.__emu_m128[1], param2 ); \
+    return ( res ); \
+}
+
+#define __EMU_M256_IMPL2_M1I_DUP( type, func ) \
+static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, const int param2 ) \
+{   __emu##type res; \
+    res.__emu_m128[0] = __emu_mm_##func( m256_param1.__emu_m128[0], param2 ); \
+    res.__emu_m128[1] = __emu_mm_##func( m256_param1.__emu_m128[1], param2 ); \
+    return ( res ); \
+}
+
+#define __EMU_M256_IMPL2_M1I_SHIFT( type, func, shift_for_hi ) \
+static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, const int param2 ) \
+{   __emu##type res; \
+    res.__emu_m128[0] = __emu_mm_##func( m256_param1.__emu_m128[0], param2 & ((1<<shift_for_hi)-1) ); \
+    res.__emu_m128[1] = __emu_mm_##func( m256_param1.__emu_m128[1], param2 >> shift_for_hi); \
+    return ( res ); \
+}
+
+#define __EMU_M256_IMPL_M2( type, func ) \
+static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, __emu##type m256_param2 ) \
+{   __emu##type res; \
+    res.__emu_m128[0] = _mm_##func( m256_param1.__emu_m128[0], m256_param2.__emu_m128[0] ); \
+    res.__emu_m128[1] = _mm_##func( m256_param1.__emu_m128[1], m256_param2.__emu_m128[1] ); \
+    return ( res ); \
+}
+
+#define __EMU_M256_IMPL2_M2T( type, type_2, func ) \
+static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, __emu##type_2 m256_param2 ) \
+{   __emu##type res; \
+    res.__emu_m128[0] = __emu_mm_##func( m256_param1.__emu_m128[0], m256_param2.__emu_m128[0] ); \
+    res.__emu_m128[1] = __emu_mm_##func( m256_param1.__emu_m128[1], m256_param2.__emu_m128[1] ); \
+    return ( res ); \
+}
+
+#define __EMU_M256_IMPL_M2I_DUP( type, func ) \
+static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, __emu##type m256_param2, const int param3 ) \
+{   __emu##type res; \
+    res.__emu_m128[0] = _mm_##func( m256_param1.__emu_m128[0], m256_param2.__emu_m128[0], param3 ); \
+    res.__emu_m128[1] = _mm_##func( m256_param1.__emu_m128[1], m256_param2.__emu_m128[1], param3 ); \
+    return ( res ); \
+}
+
+#define __EMU_M256_IMPL2_M2I_DUP( type, func ) \
+static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, __emu##type m256_param2, const int param3 ) \
+{   __emu##type res; \
+    res.__emu_m128[0] = __emu_mm_##func( m256_param1.__emu_m128[0], m256_param2.__emu_m128[0], param3 ); \
+    res.__emu_m128[1] = __emu_mm_##func( m256_param1.__emu_m128[1], m256_param2.__emu_m128[1], param3 ); \
+    return ( res ); \
+}
+
+#define __EMU_M256_IMPL_M2I_SHIFT( type, func, shift_for_hi ) \
+static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, __emu##type m256_param2, const int param3 ) \
+{   __emu##type res; \
+    res.__emu_m128[0] = _mm_##func( m256_param1.__emu_m128[0], m256_param2.__emu_m128[0], param3 & ((1<<shift_for_hi)-1) ); \
+    res.__emu_m128[1] = _mm_##func( m256_param1.__emu_m128[1], m256_param2.__emu_m128[1], param3 >> shift_for_hi ); \
+    return ( res ); \
+}
+
+#define __EMU_M256_IMPL_M3( type, func ) \
+static __emu_inline __emu##type __emu_mm256_##func( __emu##type m256_param1, __emu##type m256_param2, __emu##type m256_param3 ) \
+{   __emu##type res; \
+    res.__emu_m128[0] = _mm_##func( m256_param1.__emu_m128[0], m256_param2.__emu_m128[0], m256_param3.__emu_m128[0] ); \
+    res.__emu_m128[1] = _mm_##func( m256_param1.__emu_m128[1], m256_param2.__emu_m128[1], m256_param3.__emu_m128[1] ); \
+    return ( res ); \
+}
+
+
+/*
+ * Compare predicates for scalar and packed compare intrinsics
+ */
+#define _CMP_EQ_OQ 0x00 /* Equal (ordered, nonsignaling) */
+#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */
+#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */
+#define _CMP_UNORD_Q 0x03 /* Unordered (nonsignaling) */
+#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, nonsignaling) */
+#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
+#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
+#define _CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */
+
+#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */
+#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unordered, signaling) */
+#define _CMP_NGT_US 0x0A /* Not-greater-than (unordered, signaling) */
+#define _CMP_FALSE_OQ 0x0B /* False (ordered, nonsignaling) */
+#define _CMP_NEQ_OQ 0x0C /* Not-equal (ordered, non-signaling) */
+#define _CMP_GE_OS 0x0D /* Greater-than-or-equal (ordered, signaling) */
+#define _CMP_GT_OS 0x0E /* Greater-than (ordered, signaling) */
+#define _CMP_TRUE_UQ 0x0F /* True (unordered, non-signaling) */
+#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */
+#define _CMP_LT_OQ 0x11 /* Less-than (ordered, nonsignaling) */
+#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, nonsignaling) */
+#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */
+#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */
+#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, nonsignaling) */
+#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unordered, nonsignaling) */
+#define _CMP_ORD_S 0x17 /* Ordered (signaling) */
+#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */
+#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unordered, nonsignaling) */
+#define _CMP_NGT_UQ 0x1A /* Not-greater-than (unordered, nonsignaling) */
+#define _CMP_FALSE_OS 0x1B /* False (ordered, signaling) */
+#define _CMP_NEQ_OS 0x1C /* Not-equal (ordered, signaling) */
+#define _CMP_GE_OQ 0x1D /* Greater-than-or-equal (ordered, nonsignaling) */
+#define _CMP_GT_OQ 0x1E /* Greater-than (ordered, nonsignaling) */
+#define _CMP_TRUE_US 0x1F /* True (unordered, signaling) */
+
+__EMU_M256_IMPL_M2( __m256d, add_pd );
+__EMU_M256_IMPL_M2( __m256, add_ps );
+
+__EMU_M256_IMPL_M2( __m256d, addsub_pd );
+__EMU_M256_IMPL_M2( __m256, addsub_ps );
+
+__EMU_M256_IMPL_M2( __m256d, and_pd );
+__EMU_M256_IMPL_M2( __m256, and_ps );
+
+__EMU_M256_IMPL_M2( __m256d, andnot_pd );
+__EMU_M256_IMPL_M2( __m256, andnot_ps );
+
+__EMU_M256_IMPL_M2( __m256d, div_pd );
+__EMU_M256_IMPL_M2( __m256, div_ps );
+
+__EMU_M256_IMPL_M2( __m256d, hadd_pd );
+__EMU_M256_IMPL_M2( __m256, hadd_ps );
+
+__EMU_M256_IMPL_M2(
__m256d, hsub_pd ); +__EMU_M256_IMPL_M2( __m256, hsub_ps ); + +__EMU_M256_IMPL_M2( __m256d, max_pd ); +__EMU_M256_IMPL_M2( __m256, max_ps ); + +__EMU_M256_IMPL_M2( __m256d, min_pd ); +__EMU_M256_IMPL_M2( __m256, min_ps ); + +__EMU_M256_IMPL_M2( __m256d, mul_pd ); +__EMU_M256_IMPL_M2( __m256, mul_ps ); + +__EMU_M256_IMPL_M2( __m256d, or_pd ); +__EMU_M256_IMPL_M2( __m256, or_ps ); + +__EMU_M256_IMPL_M2I_SHIFT( __m256d, shuffle_pd, 2 ); +__EMU_M256_IMPL_M2I_DUP( __m256, shuffle_ps ); + +__EMU_M256_IMPL_M2( __m256d, sub_pd ); +__EMU_M256_IMPL_M2( __m256, sub_ps ); + +__EMU_M256_IMPL_M2( __m256d, xor_pd ); +__EMU_M256_IMPL_M2( __m256, xor_ps ); + +#if defined (__SSE4_2__) || defined (__SSE4_1__) + +__EMU_M256_IMPL_M2I_SHIFT( __m256d, blend_pd, 2 ); +__EMU_M256_IMPL_M2I_SHIFT( __m256, blend_ps, 4 ); + +__EMU_M256_IMPL_M3( __m256d, blendv_pd ); +__EMU_M256_IMPL_M3( __m256, blendv_ps ); + +__EMU_M256_IMPL_M2I_DUP( __m256, dp_ps ); + +__EMU_M256_IMPL_M1I_DUP( __m256d, round_pd ); +#define _mm256_ceil_pd(val) _mm256_round_pd((val), 0x0A); +#define _mm256_floor_pd(val) _mm256_round_pd((val), 0x09); + +__EMU_M256_IMPL_M1I_DUP( __m256, round_ps ); +#define _mm256_ceil_ps(val) _mm256_round_ps((val), 0x0A); +#define _mm256_floor_ps(val) _mm256_round_ps((val), 0x09); + +#define __emu_mm_test_impl( op, sfx, vec_type ) \ +static __emu_inline int __emu_mm_test##op##_##sfx(vec_type s1, vec_type s2) { \ + __m128d sign_bits_pd = _mm_castsi128_pd( _mm_set_epi32( 1 << 31, 0, 1 << 31, 0 ) ); \ + __m128 sign_bits_ps = _mm_castsi128_ps( _mm_set1_epi32( 1 << 31 ) ); \ + \ + s1 = _mm_and_##sfx( s1, sign_bits_##sfx ); \ + s2 = _mm_and_##sfx( s2, sign_bits_##sfx ); \ + return _mm_test##op##_si128( _mm_cast##sfx##_si128( s1 ), _mm_cast##sfx##_si128( s2 ) ); \ +} + +__emu_mm_test_impl( z, pd, __m128d ); +__emu_mm_test_impl( c, pd, __m128d ); +__emu_mm_test_impl( nzc, pd, __m128d ); + +__emu_mm_test_impl( z, ps, __m128 ); +__emu_mm_test_impl( c, ps, __m128 ); +__emu_mm_test_impl( nzc, ps, __m128 ); + + + +#define __emu_mm256_test_impl( prfx, op, sfx, sfx_impl, vec_type ) \ +static __emu_inline int __emu_mm256_test##op##_##sfx(vec_type s1, vec_type s2) { \ + int ret1 = prfx##_test##op##_##sfx_impl( s1.__emu_m128[0], s2.__emu_m128[0] ); \ + int ret2 = prfx##_test##op##_##sfx_impl( s1.__emu_m128[1], s2.__emu_m128[1] ); \ + return ( ret1 && ret2 ); \ +}; + +__emu_mm256_test_impl( _mm, z, si256, si128, __emu__m256i ); +__emu_mm256_test_impl( _mm, c, si256, si128, __emu__m256i ); +__emu_mm256_test_impl( _mm, nzc, si256, si128, __emu__m256i ); + +__emu_mm256_test_impl( __emu_mm, z, pd, pd, __emu__m256d ); +__emu_mm256_test_impl( __emu_mm, c, pd, pd, __emu__m256d ); +__emu_mm256_test_impl( __emu_mm, nzc, pd, pd, __emu__m256d ); + +__emu_mm256_test_impl( __emu_mm, z, ps, ps, __emu__m256 ); +__emu_mm256_test_impl( __emu_mm, c, ps, ps, __emu__m256 ); +__emu_mm256_test_impl( __emu_mm, nzc, ps, ps, __emu__m256 ); + +#endif + +#if defined( __GNUC__ ) && ( __GNUC__ == 4 ) && (__GNUC_MINOR__ < 4 ) +/* use macro implementation instead of inline functions to allow -O0 for GCC pre 4.4 */ + +#pragma message ("Using macro for GCC <4.4" ) + +#define __emu_mm_cmp_ps(m1, m2, predicate) \ +({ \ + __m128 res_ = (m1), m2_ = (m2); \ + if ( 7 < (unsigned)predicate ) __asm__ __volatile__ ( "ud2" : : : "memory" ); \ + __asm__ ( "cmpps %[pred_], %[m2_], %[res_]" : [res_] "+x" (res_) : [m2_] "xm" (m2_), [pred_] "i" (predicate) ); \ + res_; }) + +#define __emu_mm256_cmp_ps(m1, m2, predicate) \ +({ \ + __emu__m256 res_ = (m1), m2_ = (m2); \ + if ( 7 < 
(unsigned)predicate ) __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */ \ + __asm__ ( "cmpps %[pred_], %[m2_], %[res_]" : [res_] "+x" (res_.__emu_m128[0]) : [m2_] "xm" (m2_.__emu_m128[0]), [pred_] "i" (predicate) ); \ + __asm__ ( "cmpps %[pred_], %[m2_], %[res_]" : [res_] "+x" (res_.__emu_m128[1]) : [m2_] "xm" (m2_.__emu_m128[1]), [pred_] "i" (predicate) ); \ + res_; }) + + +#define __emu_mm_cmp_pd(m1, m2, predicate) \ +({ \ + __m128 res_ = (m1), m2_ = (m2); \ + if ( 7 < (unsigned)predicate ) __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */ \ + __asm__ ( "cmppd %[pred_], %[m2_], %[res_]" : [res_] "+x" (res_) : [m2_] "xm" (m2_), [pred_] "i" (predicate) ); \ + res_; }) + +#define __emu_mm256_cmp_pd(m1, m2, predicate) \ +({ \ + __emu__m256 res_ = (m1), m2_ = (m2); \ + if ( 7 < (unsigned)predicate ) __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */ \ + __asm__ ( "cmppd %[pred_], %[m2_], %[res_]" : [res_] "+x" (res_.__emu_m128[0]) : [m2_] "xm" (m2_.__emu_m128[0]), [pred_] "i" (predicate) ); \ + __asm__ ( "cmppd %[pred_], %[m2_], %[res_]" : [res_] "+x" (res_.__emu_m128[1]) : [m2_] "xm" (m2_.__emu_m128[1]), [pred_] "i" (predicate) ); \ + res_; }) + + +#define __emu_mm_cmp_ss(m1, m2, predicate) \ +({ \ + __m128 res_ = (m1), m2_ = (m2); \ + if ( 7 < (unsigned)predicate ) __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */ \ + __asm__ ( "cmpss %[pred_], %[m2_], %[res_]" : [res_] "+x" (res_) : [m2_] "xm" (m2_), [pred_] "i" (predicate) ); \ + res_; }) + +#define __emu_mm_cmp_sd(m1, m2, predicate) \ +({ \ + __m128 res_ = (m1), m2_ = (m2); \ + if ( 7 < (unsigned)predicate ) __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */ \ + __asm__ ( "cmpsd %[pred_], %[m2_], %[res_]" : [res_] "+x" (res_) : [m2_] "xm" (m2_), [pred_] "i" (predicate) ); \ + res_; }) + + + +#else /* __GNUC__==4 && __GNUC_MINOR__ <4 */ + + +static __emu_inline __m128 __emu_mm_cmp_ps(__m128 m1, __m128 m2, const int predicate) +{ + __m128 res; + + if ( predicate >= 0 && predicate <= 7 ) { + res = m1; + __asm__ ( "cmpps %[pred_], %[m2_], %[res_]" : [res_] "+x" (res) : [m2_] "xm" (m2), [pred_] "i" (predicate) ); + } else { + __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */ + } + + return ( res ); +} +__EMU_M256_IMPL2_M2I_DUP( __m256, cmp_ps ) + +static __emu_inline __m128d __emu_mm_cmp_pd(__m128d m1, __m128d m2, const int predicate) +{ + __m128d res; + + if ( predicate >= 0 && predicate <= 7 ) { + res = m1; + __asm__ ( "cmppd %[pred_], %[m2_], %[res_]" : [res_] "+x" (res) : [m2_] "xm" (m2), [pred_] "i" (predicate) ); + } else { + __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */ + } + + return ( res ); +} +__EMU_M256_IMPL2_M2I_DUP( __m256d, cmp_pd ) + + +static __emu_inline __m128d __emu_mm_cmp_sd(__m128d m1, __m128d m2, const int predicate) +{ + __m128d res; + + if ( predicate >= 0 && predicate <= 7 ) { + res = m1; + __asm__ ( "cmpsd %[pred_], %[m2_], %[res_]" : [res_] "+x" (res) : [m2_] "xm" (m2), [pred_] "i" (predicate) ); + } else { + __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */ + } + + return ( res ); +} + +static __emu_inline __m128 __emu_mm_cmp_ss(__m128 m1, __m128 m2, const int predicate) +{ + __m128 res; + + if ( predicate >= 0 && predicate <= 7 ) { + res = m1; + __asm__ ( "cmpss %[pred_], %[m2_], %[res_]" : [res_] "+x" (res) : [m2_] "xm" (m2), [pred_] "i" (predicate) ); + } else { + __asm__ __volatile__ ( "ud2" : : : "memory" ); /* not supported yet */ + } + + 
return ( res ); +} + +#endif + + +__EMU_M256_IMPL_M1_LH( __m256d, __m128i, cvtepi32_pd ); +__EMU_M256_IMPL_M1_RET( __m256, __m256i, cvtepi32_ps ); +__EMU_M256_IMPL_M1_HL( __m128, __m256d, cvtpd_ps ); +__EMU_M256_IMPL_M1_RET( __m256i, __m256, cvtps_epi32 ); +__EMU_M256_IMPL_M1_LH( __m256d, __m128, cvtps_pd ); +__EMU_M256_IMPL_M1_HL( __m128i, __m256d, cvttpd_epi32); +__EMU_M256_IMPL_M1_HL( __m128i, __m256d, cvtpd_epi32); +__EMU_M256_IMPL_M1_RET( __m256i, __m256, cvttps_epi32 ); + +static __emu_inline __m128 __emu_mm256_extractf128_ps(__emu__m256 m1, const int offset) { return m1.__emu_m128[ offset ]; } +static __emu_inline __m128d __emu_mm256_extractf128_pd(__emu__m256d m1, const int offset) { return m1.__emu_m128[ offset ]; } +static __emu_inline __m128i __emu_mm256_extractf128_si256(__emu__m256i m1, const int offset) { return m1.__emu_m128[ offset ]; } + +static __emu_inline void __emu_mm256_zeroall(void) {} +static __emu_inline void __emu_mm256_zeroupper(void) {} + +static __emu_inline __m128 __emu_mm_permutevar_ps(__m128 a, __m128i control) +{ + int const* sel = (int const*)&control; + float const* src = (float const*)&a; + __EMU_M256_ALIGN(16) float dest[4]; + int i=0; + + for (; i<4; ++i) + dest[i] = src[ 3 & sel[i] ]; + + return ( *(__m128*)dest ); +} +__EMU_M256_IMPL2_M2T( __m256, __m256i, permutevar_ps ); + +static __emu_inline __m128 __emu_mm_permute_ps(__m128 a, int control) { return _mm_castsi128_ps( _mm_shuffle_epi32( *(__m128i*)&a, control ) ); } +__EMU_M256_IMPL2_M1I_DUP( __m256, permute_ps ); + + +static __emu_inline __m128d __emu_mm_permutevar_pd(__m128d a, __m128i control) +{ + __emu_int64_t const* sel = (__emu_int64_t const*)&control; + double const* src = (double const*)&a; + __EMU_M256_ALIGN(16) double dest[2]; + int i=0; + + for (; i<2; ++i) + dest[i] = src[ (2 & sel[i]) >> 1 ]; + + return ( *(__m128d*)dest ); +} +__EMU_M256_IMPL2_M2T( __m256d, __m256i, permutevar_pd ); + +static __emu_inline __m128d __emu_mm_permute_pd(__m128d a, int control) +{ + double const* src = (double const*)&a; + __EMU_M256_ALIGN(16) double dest[2]; + int i=0; + + for (; i<2; ++i) + dest[i] = src[ 1 & (control >> i) ]; + + return ( *(__m128d*)dest ); +} +__EMU_M256_IMPL2_M1I_SHIFT( __m256d, permute_pd, 2 ); + + +#define __emu_mm256_permute2f128_impl( name, m128_type, m256_type ) \ +static __emu_inline m256_type name( m256_type m1, m256_type m2, int control) { \ + m256_type res; \ + __m128 zero = _mm_setzero_ps(); \ + const m128_type param[4] = { m1.__emu_m128[0], m1.__emu_m128[1], m2.__emu_m128[0], m2.__emu_m128[1] }; \ + res.__emu_m128[0] = (control & 8) ? *(m128_type*)&zero : param[ control & 0x3 ]; control >>= 4; \ + res.__emu_m128[1] = (control & 8) ? 
*(m128_type*)&zero : param[ control & 0x3 ]; \ + return ( res ); \ +} + +__emu_mm256_permute2f128_impl( __emu_mm256_permute2f128_ps, __m128, __emu__m256 ); +__emu_mm256_permute2f128_impl( __emu_mm256_permute2f128_pd, __m128d, __emu__m256d ); +__emu_mm256_permute2f128_impl( __emu_mm256_permute2f128_si256, __m128i, __emu__m256i ); + + +#define __emu_mm_broadcast_impl( name, res_type, type ) \ +static __emu_inline res_type name(type const *a) { \ + const size_t size = sizeof( res_type ) / sizeof( type );\ + __EMU_M256_ALIGN(32) type res[ size ]; \ + size_t i = 0; \ + for ( ; i < size; ++i ) \ + res[ i ] = *a; \ + return (*(res_type*)&res); \ +} + +__emu_mm_broadcast_impl( __emu_mm_broadcast_ss, __m128, float ) +__emu_mm_broadcast_impl( __emu_mm256_broadcast_ss, __emu__m256, float ) + +__emu_mm_broadcast_impl( __emu_mm_broadcast_sd, __m128, double ) +__emu_mm_broadcast_impl( __emu_mm256_broadcast_sd, __emu__m256d, double ) + +__emu_mm_broadcast_impl( __emu_mm256_broadcast_ps, __emu__m256, __m128 ) +__emu_mm_broadcast_impl( __emu_mm256_broadcast_pd, __emu__m256d, __m128d ) + + +static __emu_inline __emu__m256 __emu_mm256_insertf128_ps(__emu__m256 a, __m128 b, int offset) { a.__emu_m128[ offset ] = b; return a; } +static __emu_inline __emu__m256d __emu_mm256_insertf128_pd(__emu__m256d a, __m128d b, int offset) { a.__emu_m128[ offset ] = b; return a; } +static __emu_inline __emu__m256i __emu_mm256_insertf128_si256(__emu__m256i a, __m128i b, int offset) { a.__emu_m128[ offset ] = b; return a; } + + +#define __emu_mm_load_impl( name, sfx, m256_sfx, m256_type, type_128, type ) \ +static __emu_inline __emu##m256_type __emu_mm256_##name##_##m256_sfx(const type* a) { \ + __emu##m256_type res; \ + res.__emu_m128[0] = _mm_##name##_##sfx( (const type_128 *)a ); \ + res.__emu_m128[1] = _mm_##name##_##sfx( (const type_128 *)(1+(const __m128 *)a) ); \ + return (res); \ +} + +#define __emu_mm_store_impl( name, sfx, m256_sfx, m256_type, type_128, type ) \ +static __emu_inline void __emu_mm256_##name##_##m256_sfx(type *a, __emu##m256_type b) { \ + _mm_##name##_##sfx( (type_128*)a, b.__emu_m128[0] ); \ + _mm_##name##_##sfx( (type_128*)(1+(__m128*)a), b.__emu_m128[1] ); \ +} + +__emu_mm_load_impl( load, pd, pd, __m256d, double, double ); +__emu_mm_store_impl( store, pd, pd, __m256d, double, double ); + +__emu_mm_load_impl( load, ps, ps, __m256, float, float ); +__emu_mm_store_impl( store, ps, ps, __m256, float, float ); + +__emu_mm_load_impl( loadu, pd, pd, __m256d, double, double ); +__emu_mm_store_impl( storeu, pd, pd, __m256d, double, double ); + +__emu_mm_load_impl( loadu, ps, ps, __m256, float, float ); +__emu_mm_store_impl( storeu, ps, ps, __m256, float, float ); + +__emu_mm_load_impl( load, si128, si256, __m256i, __m128i, __emu__m256i ); +__emu_mm_store_impl( store, si128, si256, __m256i, __m128i, __emu__m256i ); + +__emu_mm_load_impl( loadu, si128, si256, __m256i, __m128i, __emu__m256i ); +__emu_mm_store_impl( storeu, si128, si256, __m256i, __m128i, __emu__m256i ); + + +#define __emu_maskload_impl( name, vec_type, mask_vec_type, type, mask_type ) \ +static __emu_inline vec_type name(type const *a, mask_vec_type mask) { \ + const size_t size_type = sizeof( type ); \ + const size_t size = sizeof( vec_type ) / size_type; \ + __EMU_M256_ALIGN(32) type res[ size ]; \ + const mask_type* p_mask = (const mask_type*)&mask; \ + size_t i = 0; \ + mask_type sign_bit = 1; \ + sign_bit <<= (8*size_type - 1); \ + for ( ; i < size; ++i ) \ + res[ i ] = (sign_bit & *(p_mask + i)) ? 
*(a+i) : 0; \ + return (*(vec_type*)&res); \ +} + +#define __emu_maskstore_impl( name, vec_type, mask_vec_type, type, mask_type ) \ +static __emu_inline void name(type *a, mask_vec_type mask, vec_type data) { \ + const size_t size_type = sizeof( type ); \ + const size_t size = sizeof( vec_type ) / sizeof( type ); \ + type* p_data = (type*)&data; \ + const mask_type* p_mask = (const mask_type*)&mask; \ + size_t i = 0; \ + mask_type sign_bit = 1; \ + sign_bit <<= (8*size_type - 1); \ + for ( ; i < size; ++i ) \ + if ( *(p_mask + i ) & sign_bit) \ + *(a + i) = *(p_data + i); \ +} + +__emu_maskload_impl( __emu_mm256_maskload_pd, __emu__m256d, __emu__m256i, double, __emu_int64_t ); +__emu_maskstore_impl( __emu_mm256_maskstore_pd, __emu__m256d, __emu__m256i, double, __emu_int64_t ); + +__emu_maskload_impl( __emu_mm_maskload_pd, __m128d, __m128i, double, __emu_int64_t ); +__emu_maskstore_impl( __emu_mm_maskstore_pd, __m128d, __m128i, double, __emu_int64_t ); + +__emu_maskload_impl( __emu_mm256_maskload_ps, __emu__m256, __emu__m256i, float, int ); +__emu_maskstore_impl( __emu_mm256_maskstore_ps, __emu__m256, __emu__m256i, float, int ); + +__emu_maskload_impl( __emu_mm_maskload_ps, __m128, __m128i, float, int ); +__emu_maskstore_impl( __emu_mm_maskstore_ps, __m128, __m128i, float, int ); + + +__EMU_M256_IMPL_M1( __m256, movehdup_ps ); +__EMU_M256_IMPL_M1( __m256, moveldup_ps ); +__EMU_M256_IMPL_M1( __m256d, movedup_pd ); + +__emu_mm_load_impl( lddqu, si128, si256, __m256i, __m128i, __emu__m256i ); + +__emu_mm_store_impl( stream, si128, si256, __m256i, __m128i, __emu__m256i ); +__emu_mm_store_impl( stream, pd, pd, __m256d, double, double ); +__emu_mm_store_impl( stream, ps, ps, __m256, float, float ); + + +__EMU_M256_IMPL_M1( __m256, rcp_ps ); +__EMU_M256_IMPL_M1( __m256, rsqrt_ps ); + +__EMU_M256_IMPL_M1( __m256d, sqrt_pd ); +__EMU_M256_IMPL_M1( __m256, sqrt_ps ); + +__EMU_M256_IMPL_M2( __m256d, unpackhi_pd ); +__EMU_M256_IMPL_M2( __m256, unpackhi_ps ); +__EMU_M256_IMPL_M2( __m256d, unpacklo_pd ); +__EMU_M256_IMPL_M2( __m256, unpacklo_ps ); + + +static __emu_inline int __emu_mm256_movemask_pd(__emu__m256d a) +{ + return + (_mm_movemask_pd( a.__emu_m128[1] ) << 2) | + _mm_movemask_pd( a.__emu_m128[0] ); +} + +static __emu_inline int __emu_mm256_movemask_ps(__emu__m256 a) +{ + return + (_mm_movemask_ps( a.__emu_m128[1] ) << 4) | + _mm_movemask_ps( a.__emu_m128[0] ); +} + +static __emu_inline __emu__m256d __emu_mm256_setzero_pd(void) { __m128d ret[2] = { _mm_setzero_pd(), _mm_setzero_pd() }; return __emu_set_m128d( ret ); } +static __emu_inline __emu__m256 __emu_mm256_setzero_ps(void) { __m128 ret[2] = { _mm_setzero_ps(), _mm_setzero_ps() }; return __emu_set_m128( ret ); } +static __emu_inline __emu__m256i __emu_mm256_setzero_si256(void) { __m128i ret[2] = { _mm_setzero_si128(), _mm_setzero_si128() }; return __emu_set_m128i( ret ); } + +static __emu_inline __emu__m256d __emu_mm256_set_pd(double a1, double a2, double a3, double a4) +{ __m128d ret[2] = { _mm_set_pd( a3, a4 ), _mm_set_pd( a1, a2 ) }; return __emu_set_m128d( ret ); } + +static __emu_inline __emu__m256 __emu_mm256_set_ps(float a1, float a2, float a3, float a4, float a5, float a6, float a7, float a8) +{ __m128 ret[2] = { _mm_set_ps( a5, a6, a7, a8 ), _mm_set_ps( a1, a2, a3, a4 ) }; return __emu_set_m128( ret ); } + +static __emu_inline __emu__m256i __emu_mm256_set_epi8(char a1, char a2, char a3, char a4, char a5, char a6, char a7, char a8, + char a9, char a10, char a11, char a12, char a13, char a14, char a15, char a16, + char a17, char a18, 
char a19, char a20, char a21, char a22, char a23, char a24, + char a25, char a26, char a27, char a28, char a29, char a30, char a31, char a32) +{ __m128i ret[2] = { _mm_set_epi8( a17, a18, a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32 ), + _mm_set_epi8( a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16 ) }; + return __emu_set_m128i( ret ); +} + +static __emu_inline __emu__m256i __emu_mm256_set_epi16(short a1, short a2, short a3, short a4, short a5, short a6, short a7, short a8, + short a9, short a10, short a11, short a12, short a13, short a14, short a15, short a16) +{ __m128i ret[2] = { _mm_set_epi16( a9, a10, a11, a12, a13, a14, a15, a16 ), + _mm_set_epi16( a1, a2, a3, a4, a5, a6, a7, a8 ) }; + return __emu_set_m128i( ret ); +} + +static __emu_inline __emu__m256i __emu_mm256_set_epi32(int a1, int a2, int a3, int a4, int a5, int a6, int a7, int a8) +{ __m128i ret[2] = { _mm_set_epi32( a5, a6, a7, a8 ), _mm_set_epi32( a1, a2, a3, a4 ) }; return __emu_set_m128i( ret ); } + +static __emu_inline __m128i __emu_mm_set_epi64x( __emu_int64_t a, __emu_int64_t b ) { return _mm_set_epi64( *(__m64*)&a, *(__m64*)&b ); } + +static __emu_inline __emu__m256i __emu_mm256_set_epi64x(__emu_int64_t a1, __emu_int64_t a2, __emu_int64_t a3, __emu_int64_t a4) +{ __m128i ret[2] = { __emu_mm_set_epi64x( a3, a4 ), __emu_mm_set_epi64x( a1, a2 ) }; return __emu_set_m128i( ret ); } + + +static __emu_inline __emu__m256d __emu_mm256_setr_pd(double a1, double a2, double a3, double a4) +{ __m128d ret[2] = { _mm_setr_pd( a1, a2 ), _mm_setr_pd( a3, a4 ) }; return __emu_set_m128d( ret ); } + +static __emu_inline __emu__m256 __emu_mm256_setr_ps(float a1, float a2, float a3, float a4, float a5, float a6, float a7, float a8) +{ __m128 ret[2] = { _mm_setr_ps( a1, a2, a3, a4 ), _mm_setr_ps( a5, a6, a7, a8 ) }; return __emu_set_m128( ret ); } + +static __emu_inline __emu__m256i __emu_mm256_setr_epi8(char a1, char a2, char a3, char a4, char a5, char a6, char a7, char a8, + char a9, char a10, char a11, char a12, char a13, char a14, char a15, char a16, + char a17, char a18, char a19, char a20, char a21, char a22, char a23, char a24, + char a25, char a26, char a27, char a28, char a29, char a30, char a31, char a32) +{ __m128i ret[2] = { _mm_setr_epi8( a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16 ), + _mm_setr_epi8( a17, a18, a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32 ) }; + return __emu_set_m128i( ret ); +} + +static __emu_inline __emu__m256i __emu_mm256_setr_epi16(short a1, short a2, short a3, short a4, short a5, short a6, short a7, short a8, + short a9, short a10, short a11, short a12, short a13, short a14, short a15, short a16) +{ __m128i ret[2] = { _mm_setr_epi16( a1, a2, a3, a4, a5, a6, a7, a8 ), + _mm_setr_epi16( a9, a10, a11, a12, a13, a14, a15, a16 ) }; return __emu_set_m128i( ret ); +} + +static __emu_inline __emu__m256i __emu_mm256_setr_epi32(int a1, int a2, int a3, int a4, int a5, int a6, int a7, int a8) +{ __m128i ret[2] = { _mm_setr_epi32( a1, a2, a3, a4 ), _mm_setr_epi32( a5, a6, a7, a8 ), }; return __emu_set_m128i( ret ); } + +static __emu_inline __emu__m256i __emu_mm256_setr_epi64x(__emu_int64_t a1, __emu_int64_t a2, __emu_int64_t a3, __emu_int64_t a4) +{ __m128i ret[2] = { __emu_mm_set_epi64x( a2, a1 ), __emu_mm_set_epi64x( a4, a3 ) }; return __emu_set_m128i( ret ); } + + + +__EMU_M256_IMPL_M1P_DUP( __m256d, double, set1_pd ); +__EMU_M256_IMPL_M1P_DUP( __m256, float, set1_ps ); +__EMU_M256_IMPL_M1P_DUP( __m256i, char, set1_epi8 
); +__EMU_M256_IMPL_M1P_DUP( __m256i, short, set1_epi16 ); +__EMU_M256_IMPL_M1P_DUP( __m256i, int, set1_epi32 ); + +static __emu__m256i __emu_mm256_set1_epi64x(__emu_int64_t a) +{ + __emu_int64_t res[4] = { a, a, a, a }; + return *((__emu__m256i*)res); +} + +/* + * Support intrinsics to do vector type casts. These intrinsics do not introduce + * extra moves to generated code. When cast is done from a 128 to 256-bit type + * the low 128 bits of the 256-bit result contain source parameter value; the + * upper 128 bits of the result are undefined + */ +__EMU_M256_IMPL_M1_RET( __m256, __m256d, castpd_ps ); +__EMU_M256_IMPL_M1_RET( __m256d, __m256, castps_pd ); + +__EMU_M256_IMPL_M1_RET_NAME( __m256i, __m256, castps_si128, castps_si256 ); +__EMU_M256_IMPL_M1_RET_NAME( __m256i, __m256d, castpd_si128, castpd_si256 ); + +__EMU_M256_IMPL_M1_RET_NAME( __m256, __m256i, castsi128_ps, castsi256_ps ); +__EMU_M256_IMPL_M1_RET_NAME( __m256d, __m256i, castsi128_pd, castsi256_pd ); + +static __emu_inline __m128 __emu_mm256_castps256_ps128(__emu__m256 a) { return ( a.__emu_m128[0] ); } +static __emu_inline __m128d __emu_mm256_castpd256_pd128(__emu__m256d a) { return ( a.__emu_m128[0] ); } +static __emu_inline __m128i __emu_mm256_castsi256_si128(__emu__m256i a) { return ( a.__emu_m128[0] ); } + +static __emu_inline __emu__m256 __emu_mm256_castps128_ps256(__m128 a) { __m128 ret[2] = { a, _mm_setzero_ps() }; return __emu_set_m128( ret ); }; +static __emu_inline __emu__m256d __emu_mm256_castpd128_pd256(__m128d a) { __m128d ret[2] = { a, _mm_setzero_pd() }; return __emu_set_m128d( ret ); }; +static __emu_inline __emu__m256i __emu_mm256_castsi128_si256(__m128i a) { __m128i ret[2] = { a, _mm_setzero_si128() }; return __emu_set_m128i( ret ); }; + +#if defined __cplusplus +}; /* End "C" */ +#endif /* __cplusplus */ + + + + + + +#ifndef __EMU_M256_NOMAP + +#define __m256 __emu__m256 +#define __m256i __emu__m256i +#define __m256d __emu__m256d + +#define _mm256_add_pd __emu_mm256_add_pd +#define _mm256_add_ps __emu_mm256_add_ps + +#define _mm256_addsub_pd __emu_mm256_addsub_pd +#define _mm256_addsub_ps __emu_mm256_addsub_ps + +#define _mm256_and_pd __emu_mm256_and_pd +#define _mm256_and_ps __emu_mm256_and_ps + +#define _mm256_andnot_pd __emu_mm256_andnot_pd +#define _mm256_andnot_ps __emu_mm256_andnot_ps + +#define _mm256_blend_pd __emu_mm256_blend_pd +#define _mm256_blend_ps __emu_mm256_blend_ps + +#define _mm256_blendv_pd __emu_mm256_blendv_pd +#define _mm256_blendv_ps __emu_mm256_blendv_ps + +#define _mm256_div_pd __emu_mm256_div_pd +#define _mm256_div_ps __emu_mm256_div_ps + +#define _mm256_dp_ps __emu_mm256_dp_ps + +#define _mm256_hadd_pd __emu_mm256_hadd_pd +#define _mm256_hadd_ps __emu_mm256_hadd_ps + +#define _mm256_hsub_pd __emu_mm256_hsub_pd +#define _mm256_hsub_ps __emu_mm256_hsub_ps + +#define _mm256_max_pd __emu_mm256_max_pd +#define _mm256_max_ps __emu_mm256_max_ps + +#define _mm256_min_pd __emu_mm256_min_pd +#define _mm256_min_ps __emu_mm256_min_ps + +#define _mm256_mul_pd __emu_mm256_mul_pd +#define _mm256_mul_ps __emu_mm256_mul_ps + +#define _mm256_or_pd __emu_mm256_or_pd +#define _mm256_or_ps __emu_mm256_or_ps + +#define _mm256_shuffle_pd __emu_mm256_shuffle_pd +#define _mm256_shuffle_ps __emu_mm256_shuffle_ps + +#define _mm256_sub_pd __emu_mm256_sub_pd +#define _mm256_sub_ps __emu_mm256_sub_ps + +#define _mm256_xor_pd __emu_mm256_xor_pd +#define _mm256_xor_ps __emu_mm256_xor_ps + + +#define _mm_cmp_pd __emu_mm_cmp_pd +#define _mm256_cmp_pd __emu_mm256_cmp_pd + +#define _mm_cmp_ps __emu_mm_cmp_ps 
+#define _mm256_cmp_ps __emu_mm256_cmp_ps + +#define _mm_cmp_sd __emu_mm_cmp_sd +#define _mm_cmp_ss __emu_mm_cmp_ss + +#define _mm256_cvtepi32_pd __emu_mm256_cvtepi32_pd +#define _mm256_cvtepi32_ps __emu_mm256_cvtepi32_ps + +#define _mm256_cvtpd_ps __emu_mm256_cvtpd_ps +#define _mm256_cvtps_epi32 __emu_mm256_cvtps_epi32 +#define _mm256_cvtps_pd __emu_mm256_cvtps_pd + +#define _mm256_cvttpd_epi32 __emu_mm256_cvttpd_epi32 +#define _mm256_cvtpd_epi32 __emu_mm256_cvtpd_epi32 +#define _mm256_cvttps_epi32 __emu_mm256_cvttps_epi32 + +#define _mm256_extractf128_ps __emu_mm256_extractf128_ps +#define _mm256_extractf128_pd __emu_mm256_extractf128_pd +#define _mm256_extractf128_si256 __emu_mm256_extractf128_si256 + +#define _mm256_zeroall __emu_mm256_zeroall +#define _mm256_zeroupper __emu_mm256_zeroupper + +#define _mm256_permutevar_ps __emu_mm256_permutevar_ps +#define _mm_permutevar_ps __emu_mm_permutevar_ps + +#define _mm256_permute_ps __emu_mm256_permute_ps +#define _mm_permute_ps __emu_mm_permute_ps + +#define _mm256_permutevar_pd __emu_mm256_permutevar_pd +#define _mm_permutevar_pd __emu_mm_permutevar_pd + +#define _mm256_permute_pd __emu_mm256_permute_pd +#define _mm_permute_pd __emu_mm_permute_pd + +#define _mm256_permute2f128_ps __emu_mm256_permute2f128_ps +#define _mm256_permute2f128_pd __emu_mm256_permute2f128_pd +#define _mm256_permute2f128_si256 __emu_mm256_permute2f128_si256 + +#define _mm256_broadcast_ss __emu_mm256_broadcast_ss +#define _mm_broadcast_ss __emu_mm_broadcast_ss + +#define _mm256_broadcast_sd __emu_mm256_broadcast_sd + +#define _mm256_broadcast_ps __emu_mm256_broadcast_ps +#define _mm256_broadcast_pd __emu_mm256_broadcast_pd + +#define _mm256_insertf128_ps __emu_mm256_insertf128_ps +#define _mm256_insertf128_pd __emu_mm256_insertf128_pd +#define _mm256_insertf128_si256 __emu_mm256_insertf128_si256 + +#define _mm256_load_pd __emu_mm256_load_pd +#define _mm256_store_pd __emu_mm256_store_pd +#define _mm256_load_ps __emu_mm256_load_ps +#define _mm256_store_ps __emu_mm256_store_ps + +#define _mm256_loadu_pd __emu_mm256_loadu_pd +#define _mm256_storeu_pd __emu_mm256_storeu_pd +#define _mm256_loadu_ps __emu_mm256_loadu_ps +#define _mm256_storeu_ps __emu_mm256_storeu_ps + +#define _mm256_load_si256 __emu_mm256_load_si256 +#define _mm256_store_si256 __emu_mm256_store_si256 +#define _mm256_loadu_si256 __emu_mm256_loadu_si256 +#define _mm256_storeu_si256 __emu_mm256_storeu_si256 + +#define _mm256_maskload_pd __emu_mm256_maskload_pd +#define _mm256_maskstore_pd __emu_mm256_maskstore_pd +#define _mm_maskload_pd __emu_mm_maskload_pd +#define _mm_maskstore_pd __emu_mm_maskstore_pd + +#define _mm256_maskload_ps __emu_mm256_maskload_ps +#define _mm256_maskstore_ps __emu_mm256_maskstore_ps +#define _mm_maskload_ps __emu_mm_maskload_ps +#define _mm_maskstore_ps __emu_mm_maskstore_ps + +#define _mm256_movehdup_ps __emu_mm256_movehdup_ps +#define _mm256_moveldup_ps __emu_mm256_moveldup_ps + +#define _mm256_movedup_pd __emu_mm256_movedup_pd +#define _mm256_lddqu_si256 __emu_mm256_lddqu_si256 + +#define _mm256_stream_si256 __emu_mm256_stream_si256 +#define _mm256_stream_pd __emu_mm256_stream_pd +#define _mm256_stream_ps __emu_mm256_stream_ps + +#define _mm256_rcp_ps __emu_mm256_rcp_ps +#define _mm256_rsqrt_ps __emu_mm256_rsqrt_ps + +#define _mm256_sqrt_pd __emu_mm256_sqrt_pd +#define _mm256_sqrt_ps __emu_mm256_sqrt_ps + +#define _mm256_round_pd __emu_mm256_round_pd + +#define _mm256_round_ps __emu_mm256_round_ps + +#define _mm256_unpackhi_pd __emu_mm256_unpackhi_pd +#define _mm256_unpackhi_ps 
__emu_mm256_unpackhi_ps + +#define _mm256_unpacklo_pd __emu_mm256_unpacklo_pd +#define _mm256_unpacklo_ps __emu_mm256_unpacklo_ps + +#define _mm256_testz_si256 __emu_mm256_testz_si256 +#define _mm256_testc_si256 __emu_mm256_testc_si256 +#define _mm256_testnzc_si256 __emu_mm256_testnzc_si256 + +#define _mm256_testz_pd __emu_mm256_testz_pd +#define _mm256_testc_pd __emu_mm256_testc_pd +#define _mm256_testnzc_pd __emu_mm256_testnzc_pd +#define _mm_testz_pd __emu_mm_testz_pd +#define _mm_testc_pd __emu_mm_testc_pd +#define _mm_testnzc_pd __emu_mm_testnzc_pd + +#define _mm256_testz_ps __emu_mm256_testz_ps +#define _mm256_testc_ps __emu_mm256_testc_ps +#define _mm256_testnzc_ps __emu_mm256_testnzc_ps +#define _mm_testz_ps __emu_mm_testz_ps +#define _mm_testc_ps __emu_mm_testc_ps +#define _mm_testnzc_ps __emu_mm_testnzc_ps + +#define _mm256_movemask_pd __emu_mm256_movemask_pd +#define _mm256_movemask_ps __emu_mm256_movemask_ps + +#define _mm256_setzero_pd __emu_mm256_setzero_pd +#define _mm256_setzero_ps __emu_mm256_setzero_ps +#define _mm256_setzero_si256 __emu_mm256_setzero_si256 + +#define _mm256_set_pd __emu_mm256_set_pd +#define _mm256_set_ps __emu_mm256_set_ps +#define _mm256_set_epi8 __emu_mm256_set_epi8 +#define _mm256_set_epi16 __emu_mm256_set_epi16 +#define _mm256_set_epi32 __emu_mm256_set_epi32 +#define _mm256_set_epi64x __emu_mm256_set_epi64x + +#define _mm256_setr_pd __emu_mm256_setr_pd +#define _mm256_setr_ps __emu_mm256_setr_ps +#define _mm256_setr_epi8 __emu_mm256_setr_epi8 +#define _mm256_setr_epi16 __emu_mm256_setr_epi16 +#define _mm256_setr_epi32 __emu_mm256_setr_epi32 +#define _mm256_setr_epi64x __emu_mm256_setr_epi64x + +#define _mm256_set1_pd __emu_mm256_set1_pd +#define _mm256_set1_ps __emu_mm256_set1_ps +#define _mm256_set1_epi8 __emu_mm256_set1_epi8 +#define _mm256_set1_epi16 __emu_mm256_set1_epi16 +#define _mm256_set1_epi32 __emu_mm256_set1_epi32 +#define _mm256_set1_epi64x __emu_mm256_set1_epi64x + +#define _mm256_castpd_ps __emu_mm256_castpd_ps +#define _mm256_castps_pd __emu_mm256_castps_pd +#define _mm256_castps_si256 __emu_mm256_castps_si256 +#define _mm256_castpd_si256 __emu_mm256_castpd_si256 +#define _mm256_castsi256_ps __emu_mm256_castsi256_ps +#define _mm256_castsi256_pd __emu_mm256_castsi256_pd +#define _mm256_castps256_ps128 __emu_mm256_castps256_ps128 +#define _mm256_castpd256_pd128 __emu_mm256_castpd256_pd128 +#define _mm256_castsi256_si128 __emu_mm256_castsi256_si128 +#define _mm256_castps128_ps256 __emu_mm256_castps128_ps256 +#define _mm256_castpd128_pd256 __emu_mm256_castpd128_pd256 +#define _mm256_castsi128_si256 __emu_mm256_castsi128_si256 + +#endif /* __EMU_M256_NOMAP */ + + + +#endif /* __EMU_M256_AVXIMMINTRIN_EMU_H__ */ diff --git a/src/vectors-4-Altivec.h b/src/vectors-4-Altivec.h new file mode 100644 index 0000000..06cea58 --- /dev/null +++ b/src/vectors-4-Altivec.h @@ -0,0 +1,132 @@ +// Vectorise using IBM's Altivec (Power) + +// Use the type vector double directly, without introducing a wrapper class +// Use macros instead of inline functions + + + +#include + + + +// Vector type corresponding to CCTK_REAL +#define CCTK_REAL4_VEC vector float + +// Number of vector elements in a CCTK_REAL_VEC +#define CCTK_REAL4_VEC_SIZE 4 + + + +// Create vectors, extract vector elements + +#define vec4_set1(a) (vec_splats(a)) +#define vec4_set(a,b,c,d) \ +({ \ + CCTK_REAL4_VEC x; \ + x[0]=(a); \ + x[1]=(b); \ + x[2]=(c); \ + x[3]=(d); \ + x; \ +}) + +#define vec4_elt0(x) ((x)[0]) +#define vec4_elt1(x) ((x)[1]) +#define vec4_elt2(x) ((x)[2]) +#define 
vec4_elt3(x) ((x)[3]) +#define vec4_elt(x,d) ((x)[d]) + + + +// Load and store vectors + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +#define vec4_load(p) (*(CCTK_REAL4_VEC const*)&(p)) +#define vec4_loadu(p) (*(CCTK_REAL4_VEC const*)&(p)) + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset and the vector size +#define vec4_loadu_maybe(off,p) (vec4_loadu(p)) +#define vec4_loadu_maybe3(off1,off2,off3,p) (vec4_loadu(p)) + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +#define vec4_store(p,x) (*(CCTK_REAL4_VEC*)&(p)=(x)) +#define vec4_storeu(p,x) (*(CCTK_REAL4_VEC*)&(p)=(x)) +#if 0 +# define vec4_store_nta(p,x) (*(CCTK_REAL4_VEC*)&(p)=(x)) +#else +// use stvxl instruction +# define vec4_store_nta(p,x) (vec_stl(x,0,(CCTK_REAL4_VEC*)&(p))) +#endif + +// Store a lower or higher partial vector (aligned and non-temporal); +// the non-temporal hint is probably ignored +#define vec4_store_nta_partial_lo(p,x,n) \ +({ \ + switch (n) { \ + case 3: ((&(p))[2]=(x)[2]); \ + case 2: ((&(p))[1]=(x)[1]); \ + case 1: ((&(p))[0]=(x)[0]); \ + } \ +}) +#define vec4_store_nta_partial_hi(p,x,n) \ +({ \ + switch (n) { \ + case 3: ((&(p))[1]=(x)[1]); \ + case 2: ((&(p))[2]=(x)[2]); \ + case 1: ((&(p))[3]=(x)[3]); \ + } \ +}) + + + +// Functions and operators + +// Operators +#define k4pos(x) (+(x)) +#define k4neg(x) (-(x)) + +#define k4add(x,y) ((x)+(y)) +#define k4sub(x,y) ((x)-(y)) +#define k4mul(x,y) ((x)*(y)) +#define k4div(x,y) ((x)/(y)) + +// Fused multiply-add, defined as [+-]x*y[+-]z +#define k4madd(x,y,z) (vec_madd(x,y,z)) +#define k4msub(x,y,z) (vec_msub(x,y,z)) +#define k4nmadd(x,y,z) (vec_nmadd(x,y,z)) +#define k4nmsub(x,y,z) (vec_nmsub(x,y,z)) + +// Cheap functions +#define k4fabs(x) (vec_abs(x)) +#define k4fmax(x,y) (vec_max(x,y)) +#define k4fmin(x,y) (vec_min(x,y)) +#define k4fnabs(x) (vec_nabs(x)) + +#define k4exp(x) \ +({ \ + CCTK_REAL4_VEC const xexp=(x); \ + vec4_set(exp(vec4_elt0(xexp)), exp(vec4_elt1(xexp)), \ + exp(vec4_elt2(xexp)), exp(vec4_elt3(xexp))); \ +}) +#define k4log(x) \ +({ \ + CCTK_REAL4_VEC const xlog=(x); \ + vec4_set(log(vec4_elt0(xlog)), log(vec4_elt1(xlog)), \ + log(vec4_elt2(xlog)), log(vec4_elt3(xlog))); \ +}) +#define k4pow(x,a) \ +({ \ + CCTK_REAL4_VEC const xpow=(x); \ + CCTK_REAL4 const apow=(a); \ + vec4_set(pow(vec4_elt0(xpow),apow), pow(vec4_elt1(xpow),apow), \ + pow(vec4_elt2(xpow),apow), pow(vec4_elt3(xpow),apow)); \ +}) +#define k4sqrt(x) \ +({ \ + CCTK_REAL4_VEC const xsqrt=(x); \ + vec4_set(sqrt(vec4_elt0(xsqrt)), sqrt(vec4_elt1(xsqrt)), \ + sqrt(vec4_elt2(xsqrt)), sqrt(vec4_elt3(xsqrt))); \ +}) diff --git a/src/vectors-4-SSE.h b/src/vectors-4-SSE.h new file mode 100644 index 0000000..bc50e68 --- /dev/null +++ b/src/vectors-4-SSE.h @@ -0,0 +1,173 @@ +// Vectorise using Intel's or AMD's SSE + +// Use the type __m128 directly, without introducing a wrapper class +// Use macros instead of inline functions + + + +#include + + + +// Vector type corresponding to CCTK_REAL +#define CCTK_REAL4_VEC __m128 + +// Number of vector elements in a CCTK_REAL_VEC +#define CCTK_REAL4_VEC_SIZE 4 + + + +// Create vectors, extract vector elements + +#define vec4_set1(a) (_mm_set1_ps(a)) +#define vec4_set(a,b,c,d) (_mm_set_ps(d,c,b,a)) // note reversed arguments + +#if defined(__PGI) && defined (__amd64__) +// _mm_cvtss_f32 does not exist on PGI compilers +# define vec4_elt0(x) \ +({ \ + CCTK_REAL4 aelt0; \ + asm ("" : "=x" (aelt0) : "0" 
(x)); \ + aelt0; \ +}) +#else +# define vec4_elt0(x) (_mm_cvtss_f32(x)) // this is a no-op +#endif +#define vec4_elt1(x) \ +({ \ + CCTK_REAL4_VEC const xelt1=(x); \ + vec4_elt0(_mm_shuffle_ps(xelt1,xelt1,_MM_SHUFFLE(1,0,3,2))); \ +}) +#define vec4_elt2(x) \ +({ \ + CCTK_REAL4_VEC const xelt2=(x); \ + vec4_elt0(_mm_unpackhi_ps(xelt2,xelt2)); \ +}) +#define vec4_elt3(x) \ +({ \ + CCTK_REAL4_VEC const xelt3=(x); \ + vec4_elt0(_mm_shuffle_ps(xelt3,xelt3,_MM_SHUFFLE(3,2,1,0))); \ +}) +#if defined(__PGI) && defined (__amd64__) +# define vec4_elt(x,d) \ +({ \ + CCTK_REAL4_VEC const xelt=(x); \ + CCTK_REAL4 aelt; \ + if (d==0) aelt=vec4_elt0(xelt); \ + else if (d==1) aelt=vec4_elt1(xelt); \ + else if (d==2) aelt=vec4_elt2(xelt); \ + else if (d==3) aelt=vec4_elt3(xelt); \ + aelt; \ +}) +#else +# define vec4_elt(x,d) \ +({ \ + CCTK_REAL4_VEC const xelt=(x); \ + CCTK_REAL4 aelt; \ + switch (d) { \ + case 0: aelt=vec4_elt0(xelt); break; \ + case 1: aelt=vec4_elt1(xelt); break; \ + case 2: aelt=vec4_elt2(xelt); break; \ + case 3: aelt=vec4_elt3(xelt); break; \ + } \ + aelt; \ +}) +#endif + + + +// Load and store vectors + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +#define vec4_load(p) (_mm_load_ps(&(p))) +#define vec4_loadu(p) (_mm_loadu_ps(&(p))) + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +// Implementation: Always use unaligned load +#define vec4_loadu_maybe(off,p) (vec4_loadu(p)) +#define vec4_loadu_maybe3(off1,off2,off3,p) (vec4_loadu(p)) + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +#define vec4_store(p,x) (_mm_store_ps(&(p),x)) +#define vec4_storeu(p,x) (_mm_storeu_ps(&(p),x)) +#define vec4_store_nta(p,x) (_mm_stream_ps(&(p),x)) + +// Store a lower or higher partial vector (aligned and non-temporal); +// the non-temporal hint is probably ignored +#define vec4_store_nta_partial_lo(p,x,n) \ +({ \ + switch (n) { \ + case 3: (&(p))[2]=vec_elt2(p); \ + case 2: _mm_storel_pi(&(p),x); break; \ + case 1: (&(p))[0]=vec_elt0(p); \ + } \ +}) +#define vec4_store_nta_partial_hi(p,x,n) \ +({ \ + switch (n) { \ + case 3: (&(p))[1]=vec_elt1(p); \ + case 2: _mm_storeh_pi(&(p)+2,x); break; \ + case 1: (&(p))[3]=vec_elt3(p); \ + } \ +}) + + + +// Functions and operators + +static const union { + unsigned i[4]; + __m128 v; +} k4sign_mask_union = {{ 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U }}; +#define k4sign_mask (k4sign_mask_union.v) +static const union { + unsigned i[4]; + __m128 v; +} k4abs_mask_union = {{ 0x7fffffffU, 0x7fffffffU, 0x7fffffffU, 0x7fffffffU }}; +#define k4abs_mask (k4abs_mask_union.v) + +// Operators +#define k4pos(x) (x) +#define k4neg(x) (_mm_xor_ps(x,k4sign_mask)) + +#define k4add(x,y) (_mm_add_ps(x,y)) +#define k4sub(x,y) (_mm_sub_ps(x,y)) +#define k4mul(x,y) (_mm_mul_ps(x,y)) +#define k4div(x,y) (_mm_div_ps(x,y)) + +// Fused multiply-add, defined as [+-]x*y[+-]z +#define k4madd(x,y,z) (k4add(k4mul(x,y),z)) +#define k4msub(x,y,z) (k4sub(k4mul(x,y),z)) +#define k4nmadd(x,y,z) (k4sub(k4neg(z),k4mul(x,y))) +#define k4nmsub(x,y,z) (k4sub(z,k4mul(x,y))) + +// Cheap functions +#define k4fabs(x) (_mm_and_ps(x,k4abs_mask)) +#define k4fmax(x,y) (_mm_max_ps(x,y)) +#define k4fmin(x,y) (_mm_min_ps(x,y)) +#define k4fnabs(x) (_mm_or_ps(x,k4sign_mask)) +#define k4sqrt(x) (_mm_sqrt_ps(x)) + +// Expensive functions +#define k4exp(x) \ +({ \ + CCTK_REAL4_VEC const xexp=(x); \ + vec4_set(exp(vec4_elt0(xexp)), 
exp(vec4_elt1(xexp)), \ + exp(vec4_elt2(xexp)), exp(vec4_elt3(xexp))); \ +}) +#define k4log(x) \ +({ \ + CCTK_REAL4_VEC const xlog=(x); \ + vec4_set(log(vec4_elt0(xlog)), log(vec4_elt1(xlog)), \ + log(vec4_elt2(xlog)), log(vec4_elt3(xlog))); \ +}) +#define k4pow(x,a) \ +({ \ + CCTK_REAL4_VEC const xpow=(x); \ + CCTK_REAL4 const apow=(a); \ + vec4_set(pow(vec4_elt0(xpow),apow), pow(vec4_elt1(xpow),apow), \ + pow(vec4_elt2(xpow),apow), pow(vec4_elt3(xpow),apow)); \ +}) diff --git a/src/vectors-4-default.h b/src/vectors-4-default.h new file mode 100644 index 0000000..e20109d --- /dev/null +++ b/src/vectors-4-default.h @@ -0,0 +1,79 @@ +// Fallback vectorisation implementation: Do not vectorise + + + +// We use macros here, so that we are not surprised by compilers which +// don't like to inline functions. This should also make debug builds +// (which may not inline) more efficient. + + + +// Use CCTK_REAL4 +#define CCTK_REAL4_VEC CCTK_REAL4 + +// Number of vector elements in a vector +#define CCTK_REAL4_VEC_SIZE 1 + + + +// Create a vector replicating a scalar +#define vec4_set1(a) (a) +// Create a vector from N scalars +#define vec4_set(a) (a) + +// Access vectors elements +#define vec4_elt0(x) (x) +#define vec4_elt(x,d) (x) + + + +// Load an aligned vector from memory +#define vec4_load(p) (p) +// Load an unaligned vector from memory +#define vec4_loadu(p) (p) + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset and the vector size. These functions are +// useful e.g. for loading neightbouring grid points while evaluating +// finite differencing stencils. +#define vec4_loadu_maybe(off,p) (p) +#define vec4_loadu_maybe3(off1,off2,off3,p) (p) + +// Aligned store +#define vec4_store(p,x) ((p)=(x)) +// Unaligned store +#define vec4_store_nta(p,x) ((p)=(x)) + +// Store the n lower elements of a vector to memory +#define vec4_store_nta_partial_lo(p,x,n) (assert(0)) +// Store the n higher elements of a vector into memory. This stores +// the vector elements into memory locations as if element 0 were +// stored at p. 
+#define vec4_store_nta_partial_hi(p,x,n) (assert(0)) + + + +// Operators +#define k4pos(x) (+(x)) +#define k4neg(x) (-(x)) + +#define k4add(x,y) ((x)+(y)) +#define k4sub(x,y) ((x)-(y)) +#define k4mul(x,y) ((x)*(y)) +#define k4div(x,y) ((x)/(y)) + +// Fused multiply-add, defined as [+-]x*y[+-]z +#define k4madd(x,y,z) (+(x)*(y)+(z)) +#define k4msub(x,y,z) (+(x)*(y)-(z)) +#define k4nmadd(x,y,z) (-(x)*(y)-(z)) +#define k4nmsub(x,y,z) (-(x)*(y)+(z)) + +// Functions +#define k4exp(x) (expf(x)) +#define k4fabs(x) (fabsf(x)) +#define k4fmax(x,y) (fmaxf(x,y)) +#define k4fmin(x,y) (fminf(x,y)) +#define k4fnabs(x) (-fabsf(x)) +#define k4log(x) (logf(x)) +#define k4pow(x,a) (powf(x,a)) +#define k4sqrt(x) (sqrtf(x)) diff --git a/src/vectors-8-AVX.h b/src/vectors-8-AVX.h new file mode 100644 index 0000000..78c00d4 --- /dev/null +++ b/src/vectors-8-AVX.h @@ -0,0 +1,163 @@ +// Vectorise using Intel's or AMD's AVX + +// Use the type __m256d directly, without introducing a wrapper class +// Use macros instead of inline functions + + + +#if defined(EMULATE_AVX) +# include "avxintrin_emu.h" +#else +# include +#endif + + + +// Vector type corresponding to CCTK_REAL +#define CCTK_REAL8_VEC __m256d + +// Number of vector elements in a CCTK_REAL_VEC +#define CCTK_REAL8_VEC_SIZE 4 + + + +union k8const_t { + unsigned long long i[4]; + double d[4]; + __m256i vi; + __m256d vd; +}; + +#define K8_ZERO 0x0000000000000000ULL +#define K8_IMIN 0x8000000000000000ULL +#define K8_IMAX 0x7fffffffffffffffULL + + + +// Create vectors, extract vector elements + +#define vec8_set1(a) (_mm256_set1_pd(a)) +#define vec8_set(a,b,c,d) (_mm256_set_pd(d,c,b,a)) // note reversed arguments + +#define vec8_elt0(x) (_mm_cvtsd_f64(_mm256_extractf128_pd(x,0))) +#define vec8_elt1(x) \ +({ \ + __m128d const xelt1=_mm256_extractf128_pd(x,0); \ + _mm_cvtsd_f64(_mm_unpackhi_pd(xelt1,xelt1)); \ +}) +#define vec8_elt2(x) (_mm_cvtsd_f64(_mm256_extractf128_pd(x,1))) +#define vec8_elt3(x) \ +({ \ + __m128d const xelt3=_mm256_extractf128_pd(x,1); \ + _mm_cvtsd_f64(_mm_unpackhi_pd(xelt3,xelt3)); \ +}) + +#define vec8_elt(x,d) \ +({ \ + CCTK_REAL8_VEC const xelt=(x); \ + CCTK_REAL8 aelt; \ + switch (d) { \ + case 0: aelt=vec8_elt0(xelt); break; \ + case 1: aelt=vec8_elt1(xelt); break; \ + case 2: aelt=vec8_elt2(xelt); break; \ + case 3: aelt=vec8_elt3(xelt); break; \ + } \ + aelt; \ +}) + + + +// Load and store vectors + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +#define vec8_load(p) (_mm256_load_pd(&(p))) +#define vec8_loadu(p) (_mm256_loadu_pd(&(p))) + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +// Implementation: Always use unaligned load +#define vec8_loadu_maybe(off,p) (vec8_loadu(p)) +#define vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p)) + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +#define vec8_store(p,x) (_mm256_store_pd(&(p),x)) +#define vec8_storeu(p,x) (_mm256_storeu_pd(&(p),x)) +#define vec8_store_nta(p,x) (_mm256_stream_pd(&(p),x)) + +// Store a lower or higher partial vector (aligned and non-temporal); +// the non-temporal hint is probably ignored +static const k8const_t k8store_lo_union[5] = + { + {{ K8_ZERO, K8_ZERO, K8_ZERO, K8_ZERO, }}, + {{ K8_IMIN, K8_ZERO, K8_ZERO, K8_ZERO, }}, + {{ K8_IMIN, K8_IMIN, K8_ZERO, K8_ZERO, }}, + {{ K8_IMIN, K8_IMIN, K8_IMIN, K8_ZERO, }}, + {{ K8_IMIN, K8_IMIN, K8_IMIN, K8_IMIN, }}, + }; +#define 
vec8_store_nta_partial_lo(p,x,n) \ + (_mm256_maskstore_pd(&(p),k8store_lo_union[n].vi,x)) +static const k8const_t k8store_hi_union[5] = + { + {{ K8_ZERO, K8_ZERO, K8_ZERO, K8_ZERO, }}, + {{ K8_ZERO, K8_ZERO, K8_ZERO, K8_IMIN, }}, + {{ K8_ZERO, K8_ZERO, K8_IMIN, K8_IMIN, }}, + {{ K8_ZERO, K8_IMIN, K8_IMIN, K8_IMIN, }}, + {{ K8_IMIN, K8_IMIN, K8_IMIN, K8_IMIN, }}, + }; +#define vec8_store_nta_partial_hi(p,x,n) \ + (_mm256_maskstore_pd(&(p),k8store_hi_union[n].vi,x)) + + + +// Functions and operators + +static const k8const_t k8sign_mask_union = + {{ K8_IMIN, K8_IMIN, K8_IMIN, K8_IMIN, }}; +static const k8const_t k8abs_mask_union = + {{ K8_IMAX, K8_IMAX, K8_IMAX, K8_IMAX, }}; + +// Operators +#define k8pos(x) (x) +#define k8neg(x) (_mm256_xor_pd(x,k8sign_mask_union.vd)) + +#define k8add(x,y) (_mm256_add_pd(x,y)) +#define k8sub(x,y) (_mm256_sub_pd(x,y)) +#define k8mul(x,y) (_mm256_mul_pd(x,y)) +#define k8div(x,y) (_mm256_div_pd(x,y)) + +// Fused multiply-add, defined as [+-]x*y[+-]z +#define k8madd(x,y,z) (k8add(k8mul(x,y),z)) +#define k8msub(x,y,z) (k8sub(k8mul(x,y),z)) +#define k8nmadd(x,y,z) (k8sub(k8neg(z),k8mul(x,y))) +#define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y))) + +// Cheap functions +#define k8fabs(x) (_mm256_and_pd(x,k8abs_mask_union.vd)) +#define k8fmax(x,y) (_mm256_max_pd(x,y)) +#define k8fmin(x,y) (_mm256_min_pd(x,y)) +#define k8fnabs(x) (_mm256_or_pd(x,k8sign_mask_union.vd)) +#define k8sqrt(x) (_mm256_sqrt_pd(x)) + +// Expensive functions +#define K8REPL(x,func) \ +({ \ + CCTK_REAL8_VEC const xfunc=(x); \ + vec8_set(func(vec8_elt0(xfunc)), \ + func(vec8_elt1(xfunc)), \ + func(vec8_elt2(xfunc)), \ + func(vec8_elt3(xfunc))); \ +}) +#define K8REPL2(x,a,func) \ +({ \ + CCTK_REAL8_VEC const xfunc=(x); \ + CCTK_REAL8 const afunc=(a); \ + vec8_set(func(vec8_elt0(xfunc),afunc), \ + func(vec8_elt1(xfunc),afunc), \ + func(vec8_elt2(xfunc),afunc), \ + func(vec8_elt3(xfunc),afunc)); \ +}) +#define k8exp(x) K8REPL(x,exp) +#define k8log(x) K8REPL(x,log) +#define k8pow(x,a) K8REPL2(x,a,pow) diff --git a/src/vectors-8-DoubleHummer.h b/src/vectors-8-DoubleHummer.h new file mode 100644 index 0000000..9311f62 --- /dev/null +++ b/src/vectors-8-DoubleHummer.h @@ -0,0 +1,108 @@ +// Vectorise using IBM's Blue Gene/P Double Hummer (Power) + +// Use the type double _Complex directly, without introducing a wrapper class +// Use macros instead of inline functions + + + +#include + + + +// Vector type corresponding to CCTK_REAL +#define CCTK_REAL8_VEC double _Complex + +// Number of vector elements in a CCTK_REAL_VEC +#define CCTK_REAL8_VEC_SIZE 2 + + + +// Create vectors, extract vector elements + +#define vec8_set1(a) (__cmplx(a,a)) +#define vec8_set(a,b) (__cmplx(a,b)) + +#define vec8_elt0(x) (__creal(x)) +#define vec8_elt1(x) (__cimag(x)) +#define vec8_elt(x,d) \ +({ \ + CCTK_REAL8_VEC const xelt=(x); \ + CCTK_REAL8 aelt; \ + switch (d) { \ + case 0: aelt=vec8_elt0(xelt); break; \ + case 1: aelt=vec8_elt1(xelt); break; \ + } \ + aelt; \ +}) + + + +// Load and store vectors + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +#define vec8_load(p) (__lfpd((double *)&(p))) +#define vec8_loadu(p) (__lfpd((double *)&(p))) // this may not work + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset and the vector size +#define vec8_loadu_maybe(off,p) (vec8_loadu(p)) +#define vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p)) + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +#define vec8_store(p,x) 
(__stfpd(&(p),x)) +#define vec8_storeu(p,x) (__stfpd(&(p),x)) // this may not work +#define vec8_store_nta(p,x) (__stfpd(&(p),x)) // this doesn't avoid the cache + +// Store a lower or higher partial vector (aligned and non-temporal); +// the non-temporal hint is probably ignored +#define vec8_store_nta_partial_lo(p,x,n) ((&(p))[0]=vec8_elt0(x)) +#define vec8_store_nta_partial_hi(p,x,n) ((&(p))[1]=vec8_elt1(x)) + + + +// Functions and operators + +// Operators +#define k8pos(x) (x) +#define k8neg(x) (__fpneg(x)) + +#define k8add(x,y) (__fpadd(x,y)) +#define k8sub(x,y) (__fpsub(x,y)) +#define k8mul(x,y) (__fpmul(x,y)) +#define k8div(x,y) (__fpmul(x,__fpre(y))) + +// Fused multiply-add, defined as [+-]x*y[+-]z +#define k8madd(x,y,z) (__fpmadd(z,x,y)) +#define k8msub(x,y,z) (__fpmsub(z,x,y)) +#define k8nmadd(x,y,z) (__fpnmadd(z,x,y)) +#define k8nmsub(x,y,z) (__fpnmsub(z,x,y)) + +// Cheap functions +#define k8fabs(x) (__fpabs(x)) +#define k8fmax(x,y) (__fpsel(__fpsub(y,x),x,y)) +#define k8fmin(x,y) (__fpsel(__fpsub(x,y),x,y)) +#define k8fnabs(x) (__fpnabs(x)) + +#define k8exp(x) \ +({ \ + CCTK_REAL8_VEC const xexp=(x); \ + vec8_set(exp(vec8_elt0(xexp)), exp(vec8_elt1(xexp))); \ +}) +#define k8log(x) \ +({ \ + CCTK_REAL8_VEC const xlog=(x); \ + vec8_set(log(vec8_elt0(xlog)), log(vec8_elt1(xlog))); \ +}) +#define k8pow(x,a) \ +({ \ + CCTK_REAL8_VEC const xpow=(x); \ + CCTK_REAL8 const apow=(a); \ + vec8_set(pow(vec8_elt0(xpow),apow), pow(vec8_elt1(xpow),apow)); \ +}) +#define k8sqrt(x) \ +({ \ + CCTK_REAL8_VEC const xsqrt=(x); \ + vec8_set(sqrt(vec8_elt0(xsqrt)), sqrt(vec8_elt1(xsqrt))); \ +}) diff --git a/src/vectors-8-SSE2.h b/src/vectors-8-SSE2.h new file mode 100644 index 0000000..34aa24f --- /dev/null +++ b/src/vectors-8-SSE2.h @@ -0,0 +1,148 @@ +// Vectorise using Intel's or AMD's SSE2 + +// Use the type __m128d directly, without introducing a wrapper class +// Use macros instead of inline functions + + + +#include + + + +// Vector type corresponding to CCTK_REAL +#define CCTK_REAL8_VEC __m128d + +// Number of vector elements in a CCTK_REAL_VEC +#define CCTK_REAL8_VEC_SIZE 2 + + + +// Create vectors, extract vector elements + +#define vec8_set1(a) (_mm_set1_pd(a)) +#define vec8_set(a,b) (_mm_set_pd(b,a)) // note reversed arguments + +#if defined(__PGI) && defined (__amd64__) +// _mm_cvtsd_f64 does not exist on PGI 9 compilers +# define vec8_elt0(x) \ +({ \ + CCTK_REAL8 aelt0; \ + asm ("" : "=x" (aelt0) : "0" (x)); \ + aelt0; \ +}) +#else +# define vec8_elt0(x) (_mm_cvtsd_f64(x)) // this is a no-op +#endif +#define vec8_elt1(x) \ +({ \ + CCTK_REAL8_VEC const xelt1=(x); \ + vec8_elt0(_mm_unpackhi_pd(xelt1,xelt1)); \ +}) +#if defined(__PGI) && defined (__amd64__) +# define vec8_elt(x,d) \ +({ \ + CCTK_REAL8_VEC const xelt=(x); \ + CCTK_REAL8 aelt; \ + if (d==0) aelt=vec8_elt0(xelt); \ + else if (d==1) aelt=vec8_elt1(xelt); \ + aelt; \ +}) +#else +# define vec8_elt(x,d) \ +({ \ + CCTK_REAL8_VEC const xelt=(x); \ + CCTK_REAL8 aelt; \ + switch (d) { \ + case 0: aelt=vec8_elt0(xelt); break; \ + case 1: aelt=vec8_elt1(xelt); break; \ + } \ + aelt; \ +}) +#endif + + + +// Load and store vectors + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +#define vec8_load(p) (_mm_load_pd(&(p))) +#define vec8_loadu(p) (_mm_loadu_pd(&(p))) + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +// Implementation: Always use unaligned load +#define vec8_loadu_maybe(off,p) (vec8_loadu(p)) +#define 
vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p)) + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +#define vec8_store(p,x) (_mm_store_pd(&(p),x)) +#define vec8_storeu(p,x) (_mm_storeu_pd(&(p),x)) +#define vec8_store_nta(p,x) (_mm_stream_pd(&(p),x)) + +// Store a lower or higher partial vector (aligned and non-temporal); +// the non-temporal hint is probably ignored +#if 1 +# define vec8_store_nta_partial_lo(p,x,n) (_mm_storel_pd(&(p),x)) +# define vec8_store_nta_partial_hi(p,x,n) (_mm_storeh_pd(&(p)+1,x)) +#else +// This is slower; we would need a non-temporal read +# define vec8_store_nta_partial_lo(p,x,n) (vec8_store_nta(p,_mm_loadh_pd(x,&(p)+1))) +# define vec8_store_nta_partial_hi(p,x,n) (vec8_store_nta(p,_mm_loadl_pd(x,&(p)))) +#endif + + + +// Functions and operators + +static const union { + unsigned long long i[2]; + __m128d v; +} k8sign_mask_union = {{ 0x8000000000000000ULL, 0x8000000000000000ULL }}; +#define k8sign_mask (k8sign_mask_union.v) +static const union { + unsigned long long i[2]; + __m128d v; +} k8abs_mask_union = {{ 0x7fffffffffffffffULL, 0x7fffffffffffffffULL }}; +#define k8abs_mask (k8abs_mask_union.v) + +// Operators +#define k8pos(x) (x) +#define k8neg(x) (_mm_xor_pd(x,k8sign_mask)) + +#define k8add(x,y) (_mm_add_pd(x,y)) +#define k8sub(x,y) (_mm_sub_pd(x,y)) +#define k8mul(x,y) (_mm_mul_pd(x,y)) +#define k8div(x,y) (_mm_div_pd(x,y)) + +// Fused multiply-add, defined as [+-]x*y[+-]z +#define k8madd(x,y,z) (k8add(k8mul(x,y),z)) +#define k8msub(x,y,z) (k8sub(k8mul(x,y),z)) +#define k8nmadd(x,y,z) (k8sub(k8neg(z),k8mul(x,y))) +#define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y))) + +// Cheap functions +#define k8fabs(x) (_mm_and_pd(x,k8abs_mask)) +#define k8fmax(x,y) (_mm_max_pd(x,y)) +#define k8fmin(x,y) (_mm_min_pd(x,y)) +#define k8fnabs(x) (_mm_or_pd(x,k8sign_mask)) +#define k8sqrt(x) (_mm_sqrt_pd(x)) + +// Expensive functions +#define k8exp(x) \ +({ \ + CCTK_REAL8_VEC const xexp=(x); \ + vec8_set(exp(vec8_elt0(xexp)), exp(vec8_elt1(xexp))); \ +}) +#define k8log(x) \ +({ \ + CCTK_REAL8_VEC const xlog=(x); \ + vec8_set(log(vec8_elt0(xlog)), log(vec8_elt1(xlog))); \ +}) +#define k8pow(x,a) \ +({ \ + CCTK_REAL8_VEC const xpow=(x); \ + CCTK_REAL8 const apow=(a); \ + vec8_set(pow(vec8_elt0(xpow),apow), pow(vec8_elt1(xpow),apow)); \ +}) diff --git a/src/vectors-8-VSX.h b/src/vectors-8-VSX.h new file mode 100644 index 0000000..9d7c17c --- /dev/null +++ b/src/vectors-8-VSX.h @@ -0,0 +1,110 @@ +// Vectorise using IBM's Altivec VSX (Power) + +// Use the type vector double directly, without introducing a wrapper class +// Use macros instead of inline functions + + + +#include <altivec.h> + + + +// Vector type corresponding to CCTK_REAL +#define CCTK_REAL8_VEC vector double + +// Number of vector elements in a CCTK_REAL_VEC +#define CCTK_REAL8_VEC_SIZE 2 + + + +// Create vectors, extract vector elements + +#define vec8_set1(a) (vec_splats(a)) +#define vec8_set(a,b) \ +({ \ + CCTK_REAL8_VEC x; \ + x[0]=(a); \ + x[1]=(b); \ + x; \ +}) + +#define vec8_elt0(x) ((x)[0]) +#define vec8_elt1(x) ((x)[1]) +#define vec8_elt(x,d) ((x)[d]) + + + +// Load and store vectors + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +#define vec8_load(p) (*(CCTK_REAL8_VEC const*)&(p)) +#define vec8_loadu(p) (*(CCTK_REAL8_VEC const*)&(p)) + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset and the vector size +#define vec8_loadu_maybe(off,p) (vec8_loadu(p)) +#define
vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p)) + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +#define vec8_store(p,x) (*(CCTK_REAL8_VEC*)&(p)=(x)) +#define vec8_storeu(p,x) (*(CCTK_REAL8_VEC*)&(p)=(x)) +#if 1 +# define vec8_store_nta(p,x) (*(CCTK_REAL8_VEC*)&(p)=(x)) +#else +// stvxl instruction doesn't exist for double precision +# define vec8_store_nta(p,x) (vec_stl(x,0,(CCTK_REAL8_VEC*)&(p))) +#endif + +// Store a lower or higher partial vector (aligned and non-temporal); +// the non-temporal hint is probably ignored +#define vec8_store_nta_partial_lo(p,x,n) ((&(p))[0]=(x)[0]) +#define vec8_store_nta_partial_hi(p,x,n) ((&(p))[1]=(x)[1]) + + + +// Functions and operators + +// Operators +#define k8pos(x) (+(x)) +#define k8neg(x) (-(x)) + +#define k8add(x,y) ((x)+(y)) +#define k8sub(x,y) ((x)-(y)) +#define k8mul(x,y) ((x)*(y)) +#define k8div(x,y) ((x)/(y)) + +// Fused multiply-add, defined as [+-]x*y[+-]z +#define k8madd(x,y,z) (vec_madd(x,y,z)) +#define k8msub(x,y,z) (vec_msub(x,y,z)) +#define k8nmadd(x,y,z) (vec_nmadd(x,y,z)) +#define k8nmsub(x,y,z) (vec_nmsub(x,y,z)) + +// Cheap functions +#define k8fabs(x) (vec_abs(x)) +#define k8fmax(x,y) (vec_max(x,y)) +#define k8fmin(x,y) (vec_min(x,y)) +#define k8fnabs(x) (vec_nabs(x)) + +#define k8exp(x) \ +({ \ + CCTK_REAL8_VEC const xexp=(x); \ + vec8_set(exp(vec8_elt0(xexp)), exp(vec8_elt1(xexp))); \ +}) +#define k8log(x) \ +({ \ + CCTK_REAL8_VEC const xlog=(x); \ + vec8_set(log(vec8_elt0(xlog)), log(vec8_elt1(xlog))); \ +}) +#define k8pow(x,a) \ +({ \ + CCTK_REAL8_VEC const xpow=(x); \ + CCTK_REAL8 const apow=(a); \ + vec8_set(pow(vec8_elt0(xpow),apow), pow(vec8_elt1(xpow),apow)); \ +}) +#define k8sqrt(x) \ +({ \ + CCTK_REAL8_VEC const xsqrt=(x); \ + vec8_set(sqrt(vec8_elt0(xsqrt)), sqrt(vec8_elt1(xsqrt))); \ +}) diff --git a/src/vectors-8-default.h b/src/vectors-8-default.h new file mode 100644 index 0000000..8ea3ac8 --- /dev/null +++ b/src/vectors-8-default.h @@ -0,0 +1,79 @@ +// Fallback vectorisation implementation: Do not vectorise + + + +// We use macros here, so that we are not surprised by compilers which +// don't like to inline functions. This should also make debug builds +// (which may not inline) more efficient. + + + +// Use CCTK_REAL8 +#define CCTK_REAL8_VEC CCTK_REAL8 + +// Number of vector elements in a vector +#define CCTK_REAL8_VEC_SIZE 1 + + + +// Create a vector replicating a scalar +#define vec8_set1(a) (a) +// Create a vector from N scalars +#define vec8_set(a) (a) + +// Access vector elements +#define vec8_elt0(x) (x) +#define vec8_elt(x,d) (x) + + + +// Load an aligned vector from memory +#define vec8_load(p) (p) +// Load an unaligned vector from memory +#define vec8_loadu(p) (p) + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset and the vector size. These functions are +// useful e.g. for loading neighbouring grid points while evaluating +// finite differencing stencils. +#define vec8_loadu_maybe(off,p) (p) +#define vec8_loadu_maybe3(off1,off2,off3,p) (p) + +// Aligned store +#define vec8_store(p,x) ((p)=(x)) +// Unaligned store +#define vec8_store_nta(p,x) ((p)=(x)) + +// Store the n lower elements of a vector to memory +#define vec8_store_nta_partial_lo(p,x,n) (assert(0)) +// Store the n higher elements of a vector into memory. This stores +// the vector elements into memory locations as if element 0 were +// stored at p. 
+#define vec8_store_nta_partial_hi(p,x,n) (assert(0)) + + + +// Operators +#define k8pos(x) (+(x)) +#define k8neg(x) (-(x)) + +#define k8add(x,y) ((x)+(y)) +#define k8sub(x,y) ((x)-(y)) +#define k8mul(x,y) ((x)*(y)) +#define k8div(x,y) ((x)/(y)) + +// Fused multiply-add, defined as [+-]x*y[+-]z +#define k8madd(x,y,z) (+(x)*(y)+(z)) +#define k8msub(x,y,z) (+(x)*(y)-(z)) +#define k8nmadd(x,y,z) (-(x)*(y)-(z)) +#define k8nmsub(x,y,z) (-(x)*(y)+(z)) + +// Functions +#define k8exp(x) (exp(x)) +#define k8fabs(x) (fabs(x)) +#define k8fmax(x,y) (fmax(x,y)) +#define k8fmin(x,y) (fmin(x,y)) +#define k8fnabs(x) (-fabs(x)) +#define k8log(x) (log(x)) +#define k8pow(x,a) (pow(x,a)) +#define k8sqrt(x) (sqrt(x)) diff --git a/src/vectors-default-4.h b/src/vectors-default-4.h deleted file mode 100644 index e20109d..0000000 --- a/src/vectors-default-4.h +++ /dev/null @@ -1,79 +0,0 @@ -// Fallback vectorisation implementation: Do not vectorise - - - -// We use macros here, so that we are not surprised by compilers which -// don't like to inline functions. This should also make debug builds -// (which may not inline) more efficient. - - - -// Use CCTK_REAL4 -#define CCTK_REAL4_VEC CCTK_REAL4 - -// Number of vector elements in a vector -#define CCTK_REAL4_VEC_SIZE 1 - - - -// Create a vector replicating a scalar -#define vec4_set1(a) (a) -// Create a vector from N scalars -#define vec4_set(a) (a) - -// Access vectors elements -#define vec4_elt0(x) (x) -#define vec4_elt(x,d) (x) - - - -// Load an aligned vector from memory -#define vec4_load(p) (p) -// Load an unaligned vector from memory -#define vec4_loadu(p) (p) - -// Load a vector from memory that may or may not be aligned, as -// decided by the offset and the vector size. These functions are -// useful e.g. for loading neightbouring grid points while evaluating -// finite differencing stencils. -#define vec4_loadu_maybe(off,p) (p) -#define vec4_loadu_maybe3(off1,off2,off3,p) (p) - -// Aligned store -#define vec4_store(p,x) ((p)=(x)) -// Unaligned store -#define vec4_store_nta(p,x) ((p)=(x)) - -// Store the n lower elements of a vector to memory -#define vec4_store_nta_partial_lo(p,x,n) (assert(0)) -// Store the n higher elements of a vector into memory. This stores -// the vector elements into memory locations as if element 0 were -// stored at p. -#define vec4_store_nta_partial_hi(p,x,n) (assert(0)) - - - -// Operators -#define k4pos(x) (+(x)) -#define k4neg(x) (-(x)) - -#define k4add(x,y) ((x)+(y)) -#define k4sub(x,y) ((x)-(y)) -#define k4mul(x,y) ((x)*(y)) -#define k4div(x,y) ((x)/(y)) - -// Fused multiply-add, defined as [+-]x*y[+-]z -#define k4madd(x,y,z) (+(x)*(y)+(z)) -#define k4msub(x,y,z) (+(x)*(y)-(z)) -#define k4nmadd(x,y,z) (-(x)*(y)-(z)) -#define k4nmsub(x,y,z) (-(x)*(y)+(z)) - -// Functions -#define k4exp(x) (expf(x)) -#define k4fabs(x) (fabsf(x)) -#define k4fmax(x,y) (fmaxf(x,y)) -#define k4fmin(x,y) (fminf(x,y)) -#define k4fnabs(x) (-fabsf(x)) -#define k4log(x) (logf(x)) -#define k4pow(x,a) (powf(x,a)) -#define k4sqrt(x) (sqrtf(x)) diff --git a/src/vectors-default-8.h b/src/vectors-default-8.h deleted file mode 100644 index 8ea3ac8..0000000 --- a/src/vectors-default-8.h +++ /dev/null @@ -1,79 +0,0 @@ -// Fallback vectorisation implementation: Do not vectorise - - - -// We use macros here, so that we are not surprised by compilers which -// don't like to inline functions. This should also make debug builds -// (which may not inline) more efficient. 
- - - -// Use CCTK_REAL8 -#define CCTK_REAL8_VEC CCTK_REAL8 - -// Number of vector elements in a vector -#define CCTK_REAL8_VEC_SIZE 1 - - - -// Create a vector replicating a scalar -#define vec8_set1(a) (a) -// Create a vector from N scalars -#define vec8_set(a) (a) - -// Access vectors elements -#define vec8_elt0(x) (x) -#define vec8_elt(x,d) (x) - - - -// Load an aligned vector from memory -#define vec8_load(p) (p) -// Load an unaligned vector from memory -#define vec8_loadu(p) (p) - -// Load a vector from memory that may or may not be aligned, as -// decided by the offset and the vector size. These functions are -// useful e.g. for loading neightbouring grid points while evaluating -// finite differencing stencils. -#define vec8_loadu_maybe(off,p) (p) -#define vec8_loadu_maybe3(off1,off2,off3,p) (p) - -// Aligned store -#define vec8_store(p,x) ((p)=(x)) -// Unaligned store -#define vec8_store_nta(p,x) ((p)=(x)) - -// Store the n lower elements of a vector to memory -#define vec8_store_nta_partial_lo(p,x,n) (assert(0)) -// Store the n higher elements of a vector into memory. This stores -// the vector elements into memory locations as if element 0 were -// stored at p. -#define vec8_store_nta_partial_hi(p,x,n) (assert(0)) - - - -// Operators -#define k8pos(x) (+(x)) -#define k8neg(x) (-(x)) - -#define k8add(x,y) ((x)+(y)) -#define k8sub(x,y) ((x)-(y)) -#define k8mul(x,y) ((x)*(y)) -#define k8div(x,y) ((x)/(y)) - -// Fused multiply-add, defined as [+-]x*y[+-]z -#define k8madd(x,y,z) (+(x)*(y)+(z)) -#define k8msub(x,y,z) (+(x)*(y)-(z)) -#define k8nmadd(x,y,z) (-(x)*(y)-(z)) -#define k8nmsub(x,y,z) (-(x)*(y)+(z)) - -// Functions -#define k8exp(x) (exp(x)) -#define k8fabs(x) (fabs(x)) -#define k8fmax(x,y) (fmax(x,y)) -#define k8fmin(x,y) (fmin(x,y)) -#define k8fnabs(x) (-fabs(x)) -#define k8log(x) (log(x)) -#define k8pow(x,a) (pow(x,a)) -#define k8sqrt(x) (sqrt(x)) diff --git a/src/vectors-intel-4.h b/src/vectors-intel-4.h deleted file mode 100644 index bc50e68..0000000 --- a/src/vectors-intel-4.h +++ /dev/null @@ -1,173 +0,0 @@ -// Vectorise using Intel's or AMD's SSE - -// Use the type __m128 directly, without introducing a wrapper class -// Use macros instead of inline functions - - - -#include - - - -// Vector type corresponding to CCTK_REAL -#define CCTK_REAL4_VEC __m128 - -// Number of vector elements in a CCTK_REAL_VEC -#define CCTK_REAL4_VEC_SIZE 4 - - - -// Create vectors, extract vector elements - -#define vec4_set1(a) (_mm_set1_ps(a)) -#define vec4_set(a,b,c,d) (_mm_set_ps(d,c,b,a)) // note reversed arguments - -#if defined(__PGI) && defined (__amd64__) -// _mm_cvtss_f32 does not exist on PGI compilers -# define vec4_elt0(x) \ -({ \ - CCTK_REAL4 aelt0; \ - asm ("" : "=x" (aelt0) : "0" (x)); \ - aelt0; \ -}) -#else -# define vec4_elt0(x) (_mm_cvtss_f32(x)) // this is a no-op -#endif -#define vec4_elt1(x) \ -({ \ - CCTK_REAL4_VEC const xelt1=(x); \ - vec4_elt0(_mm_shuffle_ps(xelt1,xelt1,_MM_SHUFFLE(1,0,3,2))); \ -}) -#define vec4_elt2(x) \ -({ \ - CCTK_REAL4_VEC const xelt2=(x); \ - vec4_elt0(_mm_unpackhi_ps(xelt2,xelt2)); \ -}) -#define vec4_elt3(x) \ -({ \ - CCTK_REAL4_VEC const xelt3=(x); \ - vec4_elt0(_mm_shuffle_ps(xelt3,xelt3,_MM_SHUFFLE(3,2,1,0))); \ -}) -#if defined(__PGI) && defined (__amd64__) -# define vec4_elt(x,d) \ -({ \ - CCTK_REAL4_VEC const xelt=(x); \ - CCTK_REAL4 aelt; \ - if (d==0) aelt=vec4_elt0(xelt); \ - else if (d==1) aelt=vec4_elt1(xelt); \ - else if (d==2) aelt=vec4_elt2(xelt); \ - else if (d==3) aelt=vec4_elt3(xelt); \ - aelt; \ -}) -#else -# define 
vec4_elt(x,d) \ -({ \ - CCTK_REAL4_VEC const xelt=(x); \ - CCTK_REAL4 aelt; \ - switch (d) { \ - case 0: aelt=vec4_elt0(xelt); break; \ - case 1: aelt=vec4_elt1(xelt); break; \ - case 2: aelt=vec4_elt2(xelt); break; \ - case 3: aelt=vec4_elt3(xelt); break; \ - } \ - aelt; \ -}) -#endif - - - -// Load and store vectors - -// Load a vector from memory (aligned and unaligned); this loads from -// a reference to a scalar -#define vec4_load(p) (_mm_load_ps(&(p))) -#define vec4_loadu(p) (_mm_loadu_ps(&(p))) - -// Load a vector from memory that may or may not be aligned, as -// decided by the offset off and the vector size -// Implementation: Always use unaligned load -#define vec4_loadu_maybe(off,p) (vec4_loadu(p)) -#define vec4_loadu_maybe3(off1,off2,off3,p) (vec4_loadu(p)) - -// Store a vector to memory (aligned and non-temporal); this stores to -// a reference to a scalar -#define vec4_store(p,x) (_mm_store_ps(&(p),x)) -#define vec4_storeu(p,x) (_mm_storeu_ps(&(p),x)) -#define vec4_store_nta(p,x) (_mm_stream_ps(&(p),x)) - -// Store a lower or higher partial vector (aligned and non-temporal); -// the non-temporal hint is probably ignored -#define vec4_store_nta_partial_lo(p,x,n) \ -({ \ - switch (n) { \ - case 3: (&(p))[2]=vec_elt2(p); \ - case 2: _mm_storel_pi(&(p),x); break; \ - case 1: (&(p))[0]=vec_elt0(p); \ - } \ -}) -#define vec4_store_nta_partial_hi(p,x,n) \ -({ \ - switch (n) { \ - case 3: (&(p))[1]=vec_elt1(p); \ - case 2: _mm_storeh_pi(&(p)+2,x); break; \ - case 1: (&(p))[3]=vec_elt3(p); \ - } \ -}) - - - -// Functions and operators - -static const union { - unsigned i[4]; - __m128 v; -} k4sign_mask_union = {{ 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U }}; -#define k4sign_mask (k4sign_mask_union.v) -static const union { - unsigned i[4]; - __m128 v; -} k4abs_mask_union = {{ 0x7fffffffU, 0x7fffffffU, 0x7fffffffU, 0x7fffffffU }}; -#define k4abs_mask (k4abs_mask_union.v) - -// Operators -#define k4pos(x) (x) -#define k4neg(x) (_mm_xor_ps(x,k4sign_mask)) - -#define k4add(x,y) (_mm_add_ps(x,y)) -#define k4sub(x,y) (_mm_sub_ps(x,y)) -#define k4mul(x,y) (_mm_mul_ps(x,y)) -#define k4div(x,y) (_mm_div_ps(x,y)) - -// Fused multiply-add, defined as [+-]x*y[+-]z -#define k4madd(x,y,z) (k4add(k4mul(x,y),z)) -#define k4msub(x,y,z) (k4sub(k4mul(x,y),z)) -#define k4nmadd(x,y,z) (k4sub(k4neg(z),k4mul(x,y))) -#define k4nmsub(x,y,z) (k4sub(z,k4mul(x,y))) - -// Cheap functions -#define k4fabs(x) (_mm_and_ps(x,k4abs_mask)) -#define k4fmax(x,y) (_mm_max_ps(x,y)) -#define k4fmin(x,y) (_mm_min_ps(x,y)) -#define k4fnabs(x) (_mm_or_ps(x,k4sign_mask)) -#define k4sqrt(x) (_mm_sqrt_ps(x)) - -// Expensive functions -#define k4exp(x) \ -({ \ - CCTK_REAL4_VEC const xexp=(x); \ - vec4_set(exp(vec4_elt0(xexp)), exp(vec4_elt1(xexp)), \ - exp(vec4_elt2(xexp)), exp(vec4_elt3(xexp))); \ -}) -#define k4log(x) \ -({ \ - CCTK_REAL4_VEC const xlog=(x); \ - vec4_set(log(vec4_elt0(xlog)), log(vec4_elt1(xlog)), \ - log(vec4_elt2(xlog)), log(vec4_elt3(xlog))); \ -}) -#define k4pow(x,a) \ -({ \ - CCTK_REAL4_VEC const xpow=(x); \ - CCTK_REAL4 const apow=(a); \ - vec4_set(pow(vec4_elt0(xpow),apow), pow(vec4_elt1(xpow),apow), \ - pow(vec4_elt2(xpow),apow), pow(vec4_elt3(xpow),apow)); \ -}) diff --git a/src/vectors-intel-8.h b/src/vectors-intel-8.h deleted file mode 100644 index 34aa24f..0000000 --- a/src/vectors-intel-8.h +++ /dev/null @@ -1,148 +0,0 @@ -// Vectorise using Intel's or AMD's SSE2 - -// Use the type __m128d directly, without introducing a wrapper class -// Use macros instead of inline functions - - - -#include 
- - - -// Vector type corresponding to CCTK_REAL -#define CCTK_REAL8_VEC __m128d - -// Number of vector elements in a CCTK_REAL_VEC -#define CCTK_REAL8_VEC_SIZE 2 - - - -// Create vectors, extract vector elements - -#define vec8_set1(a) (_mm_set1_pd(a)) -#define vec8_set(a,b) (_mm_set_pd(b,a)) // note reversed arguments - -#if defined(__PGI) && defined (__amd64__) -// _mm_cvtsd_f64 does not exist on PGI 9 compilers -# define vec8_elt0(x) \ -({ \ - CCTK_REAL8 aelt0; \ - asm ("" : "=x" (aelt0) : "0" (x)); \ - aelt0; \ -}) -#else -# define vec8_elt0(x) (_mm_cvtsd_f64(x)) // this is a no-op -#endif -#define vec8_elt1(x) \ -({ \ - CCTK_REAL8_VEC const xelt1=(x); \ - vec8_elt0(_mm_unpackhi_pd(xelt1,xelt1)); \ -}) -#if defined(__PGI) && defined (__amd64__) -# define vec8_elt(x,d) \ -({ \ - CCTK_REAL8_VEC const xelt=(x); \ - CCTK_REAL8 aelt; \ - if (d==0) aelt=vec8_elt0(xelt); \ - else if (d==1) aelt=vec8_elt1(xelt); \ - aelt; \ -}) -#else -# define vec8_elt(x,d) \ -({ \ - CCTK_REAL8_VEC const xelt=(x); \ - CCTK_REAL8 aelt; \ - switch (d) { \ - case 0: aelt=vec8_elt0(xelt); break; \ - case 1: aelt=vec8_elt1(xelt); break; \ - } \ - aelt; \ -}) -#endif - - - -// Load and store vectors - -// Load a vector from memory (aligned and unaligned); this loads from -// a reference to a scalar -#define vec8_load(p) (_mm_load_pd(&(p))) -#define vec8_loadu(p) (_mm_loadu_pd(&(p))) - -// Load a vector from memory that may or may not be aligned, as -// decided by the offset off and the vector size -// Implementation: Always use unaligned load -#define vec8_loadu_maybe(off,p) (vec8_loadu(p)) -#define vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p)) - -// Store a vector to memory (aligned and non-temporal); this stores to -// a reference to a scalar -#define vec8_store(p,x) (_mm_store_pd(&(p),x)) -#define vec8_storeu(p,x) (_mm_storeu_pd(&(p),x)) -#define vec8_store_nta(p,x) (_mm_stream_pd(&(p),x)) - -// Store a lower or higher partial vector (aligned and non-temporal); -// the non-temporal hint is probably ignored -#if 1 -# define vec8_store_nta_partial_lo(p,x,n) (_mm_storel_pd(&(p),x)) -# define vec8_store_nta_partial_hi(p,x,n) (_mm_storeh_pd(&(p)+1,x)) -#else -// This is slower; we would need a non-temporal read -# define vec8_store_nta_partial_lo(p,x,n) (vec8_store_nta(p,_mm_loadh_pd(x,&(p)+1))) -# define vec8_store_nta_partial_hi(p,x,n) (vec8_store_nta(p,_mm_loadl_pd(x,&(p)))) -#endif - - - -// Functions and operators - -static const union { - unsigned long long i[2]; - __m128d v; -} k8sign_mask_union = {{ 0x8000000000000000ULL, 0x8000000000000000ULL }}; -#define k8sign_mask (k8sign_mask_union.v) -static const union { - unsigned long long i[2]; - __m128d v; -} k8abs_mask_union = {{ 0x7fffffffffffffffULL, 0x7fffffffffffffffULL }}; -#define k8abs_mask (k8sign_mask_union.v) - -// Operators -#define k8pos(x) (x) -#define k8neg(x) (_mm_xor_pd(x,k8sign_mask)) - -#define k8add(x,y) (_mm_add_pd(x,y)) -#define k8sub(x,y) (_mm_sub_pd(x,y)) -#define k8mul(x,y) (_mm_mul_pd(x,y)) -#define k8div(x,y) (_mm_div_pd(x,y)) - -// Fused multiply-add, defined as [+-]x*y[+-]z -#define k8madd(x,y,z) (k8add(k8mul(x,y),z)) -#define k8msub(x,y,z) (k8sub(k8mul(x,y),z)) -#define k8nmadd(x,y,z) (k8sub(k8neg(z),k8mul(x,y))) -#define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y))) - -// Cheap functions -#define k8fabs(x) (_mm_and_pd(x,k8abs_mask)) -#define k8fmax(x,y) (_mm_max_pd(x,y)) -#define k8fmin(x,y) (_mm_min_pd(x,y)) -#define k8fnabs(x) (_mm_or_pd(x,k8sign_mask)) -#define k8sqrt(x) (_mm_sqrt_pd(x)) - -// Expensive functions -#define k8exp(x) \ 
-({ \ - CCTK_REAL8_VEC const xexp=(x); \ - vec8_set(exp(vec8_elt0(xexp)), exp(vec8_elt1(xexp))); \ -}) -#define k8log(x) \ -({ \ - CCTK_REAL8_VEC const xlog=(x); \ - vec8_set(log(vec8_elt0(xlog)), log(vec8_elt1(xlog))); \ -}) -#define k8pow(x,a) \ -({ \ - CCTK_REAL8_VEC const xpow=(x); \ - CCTK_REAL8 const apow=(a); \ - vec8_set(pow(vec8_elt0(xpow),apow), pow(vec8_elt1(xpow),apow)); \ -}) diff --git a/src/vectors-power-4.h b/src/vectors-power-4.h deleted file mode 100644 index 009b0f4..0000000 --- a/src/vectors-power-4.h +++ /dev/null @@ -1,128 +0,0 @@ -// Vectorise using IBM's Altivec (Power) - -// Use the type vector double directly, without introducing a wrapper class -// Use macros instead of inline functions - - - -#include - - - -// Vector type corresponding to CCTK_REAL -#define CCTK_REAL4_VEC vector float - -// Number of vector elements in a CCTK_REAL_VEC -#define CCTK_REAL4_VEC_SIZE 4 - - - -// Create vectors, extract vector elements - -#define vec4_set1(a) (vec_splats(a)) -#define vec4_set(a,b,c,d) \ -({ \ - CCTK_REAL4_VEC x; \ - x[0]=(a); \ - x[1]=(b); \ - x[2]=(c); \ - x[3]=(d); \ - x; \ -}) - -#define vec4_elt0(x) ((x)[0]) -#define vec4_elt1(x) ((x)[1]) -#define vec4_elt2(x) ((x)[2]) -#define vec4_elt3(x) ((x)[3]) -#define vec4_elt(x,d) ((x)[d]) - - - -// Load and store vectors - -// Load a vector from memory (aligned and unaligned); this loads from -// a reference to a scalar -#define vec4_load(p) (*(CCTK_REAL4_VEC const*)&(p)) -#define vec4_loadu(p) (*(CCTK_REAL4_VEC const*)&(p)) - -// Load a vector from memory that may or may not be aligned, as -// decided by the offset and the vector size -#define vec4_loadu_maybe(off,p) (vec4_loadu(p)) -#define vec4_loadu_maybe3(off1,off2,off3,p) (vec4_loadu(p)) - -// Store a vector to memory (aligned and non-temporal); this stores to -// a reference to a scalar -#define vec4_store(p,x) (*(CCTK_REAL4_VEC*)&(p)=(x)) -#define vec4_storeu(p,x) (*(CCTK_REAL4_VEC*)&(p)=(x)) -// TODO: Use stvxl instruction? 
-#define vec4_store_nta(p,x) (*(CCTK_REAL4_VEC*)&(p)=(x)) - -// Store a lower or higher partial vector (aligned and non-temporal); -// the non-temporal hint is probably ignored -#define vec4_store_nta_partial_lo(p,x,n) \ -({ \ - switch (n) { \ - case 3: ((&(p))[2]=(x)[2]); \ - case 2: ((&(p))[1]=(x)[1]); \ - case 1: ((&(p))[0]=(x)[0]); \ - } \ -}) -#define vec4_store_nta_partial_hi(p,x,n) \ -({ \ - switch (n) { \ - case 3: ((&(p))[1]=(x)[1]); \ - case 2: ((&(p))[2]=(x)[2]); \ - case 1: ((&(p))[3]=(x)[3]); \ - } \ -}) - - - -// Functions and operators - -// Operators -#define k4pos(x) (+(x)) -#define k4neg(x) (-(x)) - -#define k4add(x,y) ((x)+(y)) -#define k4sub(x,y) ((x)-(y)) -#define k4mul(x,y) ((x)*(y)) -#define k4div(x,y) ((x)/(y)) - -// Fused multiply-add, defined as [+-]x*y[+-]z -#define k4madd(x,y,z) (vec_madd(x,y,z)) -#define k4msub(x,y,z) (vec_msub(x,y,z)) -#define k4nmadd(x,y,z) (vec_nmadd(x,y,z)) -#define k4nmsub(x,y,z) (vec_nmsub(x,y,z)) - -// Cheap functions -#define k4fabs(x) (vec_abs(x)) -#define k4fmax(x,y) (vec_max(x,y)) -#define k4fmin(x,y) (vec_min(x,y)) -#define k4fnabs(x) (vec_nabs(x)) - -#define k4exp(x) \ -({ \ - CCTK_REAL4_VEC const xexp=(x); \ - vec4_set(exp(vec4_elt0(xexp)), exp(vec4_elt1(xexp)), \ - exp(vec4_elt2(xexp)), exp(vec4_elt3(xexp))); \ -}) -#define k4log(x) \ -({ \ - CCTK_REAL4_VEC const xlog=(x); \ - vec4_set(log(vec4_elt0(xlog)), log(vec4_elt1(xlog)), \ - log(vec4_elt2(xlog)), log(vec4_elt3(xlog))); \ -}) -#define k4pow(x,a) \ -({ \ - CCTK_REAL4_VEC const xpow=(x); \ - CCTK_REAL4 const apow=(a); \ - vec4_set(pow(vec4_elt0(xpow),apow), pow(vec4_elt1(xpow),apow), \ - pow(vec4_elt2(xpow),apow), pow(vec4_elt3(xpow),apow)); \ -}) -#define k4sqrt(x) \ -({ \ - CCTK_REAL4_VEC const xsqrt=(x); \ - vec4_set(sqrt(vec4_elt0(xsqrt)), sqrt(vec4_elt1(xsqrt)), \ - sqrt(vec4_elt2(xsqrt)), sqrt(vec4_elt3(xsqrt))); \ -}) diff --git a/src/vectors-power-8.h b/src/vectors-power-8.h deleted file mode 100644 index 8313168..0000000 --- a/src/vectors-power-8.h +++ /dev/null @@ -1,106 +0,0 @@ -// Vectorise using IBM's Altivec VSX (Power) - -// Use the type vector double directly, without introducing a wrapper class -// Use macros instead of inline functions - - - -#include - - - -// Vector type corresponding to CCTK_REAL -#define CCTK_REAL8_VEC vector double - -// Number of vector elements in a CCTK_REAL_VEC -#define CCTK_REAL8_VEC_SIZE 2 - - - -// Create vectors, extract vector elements - -#define vec8_set1(a) (vec_splats(a)) -#define vec8_set(a,b) \ -({ \ - CCTK_REAL8_VEC x; \ - x[0]=(a); \ - x[1]=(b); \ - x; \ -}) - -#define vec8_elt0(x) ((x)[0]) -#define vec8_elt1(x) ((x)[1]) -#define vec8_elt(x,d) ((x)[d]) - - - -// Load and store vectors - -// Load a vector from memory (aligned and unaligned); this loads from -// a reference to a scalar -#define vec8_load(p) (*(CCTK_REAL8_VEC const*)&(p)) -#define vec8_loadu(p) (*(CCTK_REAL8_VEC const*)&(p)) - -// Load a vector from memory that may or may not be aligned, as -// decided by the offset and the vector size -#define vec8_loadu_maybe(off,p) (vec8_loadu(p)) -#define vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p)) - -// Store a vector to memory (aligned and non-temporal); this stores to -// a reference to a scalar -#define vec8_store(p,x) (*(CCTK_REAL8_VEC*)&(p)=(x)) -#define vec8_storeu(p,x) (*(CCTK_REAL8_VEC*)&(p)=(x)) -// TODO: Use stvxl instruction? 
-#define vec8_store_nta(p,x) (*(CCTK_REAL8_VEC*)&(p)=(x)) - -// Store a lower or higher partial vector (aligned and non-temporal); -// the non-temporal hint is probably ignored -#define vec8_store_nta_partial_lo(p,x,n) ((&(p))[0]=(x)[0]) -#define vec8_store_nta_partial_hi(p,x,n) ((&(p))[1]=(x)[1]) - - - -// Functions and operators - -// Operators -#define k8pos(x) (+(x)) -#define k8neg(x) (-(x)) - -#define k8add(x,y) ((x)+(y)) -#define k8sub(x,y) ((x)-(y)) -#define k8mul(x,y) ((x)*(y)) -#define k8div(x,y) ((x)/(y)) - -// Fused multiply-add, defined as [+-]x*y[+-]z -#define k8madd(x,y,z) (vec_madd(x,y,z)) -#define k8msub(x,y,z) (vec_msub(x,y,z)) -#define k8nmadd(x,y,z) (vec_nmadd(x,y,z)) -#define k8nmsub(x,y,z) (vec_nmsub(x,y,z)) - -// Cheap functions -#define k8fabs(x) (vec_abs(x)) -#define k8fmax(x,y) (vec_max(x,y)) -#define k8fmin(x,y) (vec_min(x,y)) -#define k8fnabs(x) (vec_nabs(x)) - -#define k8exp(x) \ -({ \ - CCTK_REAL8_VEC const xexp=(x); \ - vec8_set(exp(vec8_elt0(xexp)), exp(vec8_elt1(xexp))); \ -}) -#define k8log(x) \ -({ \ - CCTK_REAL8_VEC const xlog=(x); \ - vec8_set(log(vec8_elt0(xlog)), log(vec8_elt1(xlog))); \ -}) -#define k8pow(x,a) \ -({ \ - CCTK_REAL8_VEC const xpow=(x); \ - CCTK_REAL8 const apow=(a); \ - vec8_set(pow(vec8_elt0(xpow),apow), pow(vec8_elt1(xpow),apow)); \ -}) -#define k8sqrt(x) \ -({ \ - CCTK_REAL8_VEC const xsqrt=(x); \ - vec8_set(sqrt(vec8_elt0(xsqrt)), sqrt(vec8_elt1(xsqrt))); \ -}) diff --git a/src/vectors.h b/src/vectors.h index 6fe909f..a3cad46 100644 --- a/src/vectors.h +++ b/src/vectors.h @@ -5,28 +5,40 @@ +#undef EMULATE_AVX + + + #if defined(KRANC_VECTORS) -# if defined(__SSE__) // Intel SSE vector instructions -# include "vectors-intel-4.h" -# elif defined(__ALTIVEC__) // Altivec (Power) -# include "vectors-power-4.h" +# if defined(__SSE__) // Intel SSE +# include "vectors-4-SSE.h" +# elif defined(__ALTIVEC__) // Power Altivec +# include "vectors-4-Altivec.h" # endif -# if defined(__SSE2__) // Intel SSE2 vector instructions -# include "vectors-intel-8.h" -# elif defined(__ALTIVEC__) && defined(_ARCH_PWR7) // Altivec (Power) -# include "vectors-power-8.h" +# if defined(__AVX__) // Intel AVX +# include "vectors-8-AVX.h" +# elif defined(__SSE2__) // Intel SSE2 +# if defined(EMULATE_AVX) +# include "vectors-8-AVX.h" +# else +# include "vectors-8-SSE2.h" +# endif +# elif defined(_ARCH_450D) // Blue Gene/P Double Hummer +# include "vectors-8-DoubleHummer.h" +# elif defined(__ALTIVEC__) && defined(_ARCH_PWR7) // Power VSX +# include "vectors-8-VSX.h" # endif #endif // Default implementation, do not vectorise #if ! defined(CCTK_REAL4_VEC_SIZE) -# include "vectors-default-4.h" +# include "vectors-4-default.h" #endif #if ! defined(CCTK_REAL8_VEC_SIZE) -# include "vectors-default-8.h" +# include "vectors-8-default.h" #endif -- cgit v1.2.3
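
The headers in this patch only define the vector macros; none of them shows the macros in use. The following is a minimal, hypothetical sketch (not part of the thorn) of how a generated kernel might call them. The function name example_deriv, its arguments, and the local typedef are assumptions made for illustration; in a real Cactus build CCTK_REAL8 comes from cctk.h, and defining KRANC_VECTORS selects one of the SIMD implementations above instead of the scalar fallback.

typedef double CCTK_REAL8;   /* illustrative only; Cactus normally provides this via cctk.h */
#include "vectors.h"         /* selects a vectors-8-*.h implementation, or the scalar fallback */

/* Central first derivative du[i] = (u[i+di] - u[i-di]) * idx, with idx = 1/(2*dx).
   Assumes imin and imax keep all accesses in bounds, imax-imin is a multiple of
   CCTK_REAL8_VEC_SIZE, and (for the SIMD variants) the stores are suitably aligned. */
void example_deriv(CCTK_REAL8 const *restrict const u,
                   CCTK_REAL8 *restrict const du,
                   int const imin, int const imax, int const di,
                   CCTK_REAL8 const idx)
{
  CCTK_REAL8_VEC const idxv = vec8_set1(idx);
  for (int i = imin; i < imax; i += CCTK_REAL8_VEC_SIZE) {
    /* neighbouring points may be unaligned; the "maybe" loads decide how to load */
    CCTK_REAL8_VEC const up = vec8_loadu_maybe(+di, u[i+di]);
    CCTK_REAL8_VEC const um = vec8_loadu_maybe(-di, u[i-di]);
    /* idx*up - idx*um, written with the fused multiply-subtract macro */
    vec8_store_nta(du[i], k8msub(idxv, up, k8mul(idxv, um)));
  }
}

With the fallback header every macro degenerates to ordinary scalar arithmetic, so the same kernel compiles unchanged whether or not vectorisation is enabled; leftover iterations at the end of a loop would use vec8_store_nta_partial_lo/hi, which this sketch omits. The #undef EMULATE_AVX toggle in vectors.h can be switched to #define to exercise the AVX code path via avxintrin_emu.h on SSE2-only hardware.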