00001
00002
00003
00004
00005 #ifndef __SSEPLUS_EMULATION_SSE3_H__
00006 #define __SSEPLUS_EMULATION_SSE3_H__
00007
00008 #include "../SSEPlus_base.h"
00009 #include "../native/SSEPlus_native_SSE3.h"
00010 #include "../arithmetic/SSEPlus_arithmetic_SSE3.h"
00011
00018 SSP_FORCEINLINE
00019 __m128 ssp_dp_ps_SSE3( __m128 a, __m128 b, const int mask )
00020 {
00021 const static __m128i mulShiftImm_0123 = SSP_CONST_SET_32I( 0x010000, 0x020000, 0x040000, 0x080000 );
00022 const static __m128i mulShiftImm_4567 = SSP_CONST_SET_32I( 0x100000, 0x200000, 0x400000, 0x800000 );
00023
00024
00025 ssp_m128 mHi, mLo;
00026 mLo.i = _mm_set1_epi32( mask );
00027 mLo.i = _mm_slli_si128( mLo.i, 3 );
00028
00029 mHi.i = _mm_mullo_epi16( mLo.i, mulShiftImm_0123 );
00030 mLo.i = _mm_mullo_epi16( mLo.i, mulShiftImm_4567 );
00031
00032 mHi.i = _mm_cmplt_epi32( mHi.i, _mm_setzero_si128() );
00033 mLo.i = _mm_cmplt_epi32( mLo.i, _mm_setzero_si128() );
00034
00035
00036 a = _mm_and_ps( a, mHi.f );
00037 a = _mm_mul_ps( a, b );
00038
00039 a = ssp_arithmetic_hadd4_dup_ps_SSE3( a );
00040 a = _mm_and_ps( a, mLo.f );
00041 return a;
00042 }
00043
00048 #endif // __SSEPLUS_EMULATION_SSE3_H__