SSE[4A,...,5] implemented in SSE3
SSP_FORCEINLINE __m128 ssp_dp_ps_SSE3 (__m128 a, __m128 b, const int mask)
SSP_FORCEINLINE __m128 ssp_dp_ps_SSE3( __m128 a, __m128 b, const int mask )
SSE3 implementation of _mm_dp_ps [SSE4.1]. (Searches MSDN)
Definition at line 19 of file SSEPlus_emulation_SSE3.h.
00020 { 00021 const static __m128i mulShiftImm_0123 = SSP_CONST_SET_32I( 0x010000, 0x020000, 0x040000, 0x080000 ); // Shift mask multiply moves 0,1,2,3 bits to left, becomes MSB 00022 const static __m128i mulShiftImm_4567 = SSP_CONST_SET_32I( 0x100000, 0x200000, 0x400000, 0x800000 ); // Shift mask multiply moves 4,5,6,7 bits to left, becomes MSB 00023 00024 // Begin mask preparation 00025 ssp_m128 mHi, mLo; 00026 mLo.i = _mm_set1_epi32( mask ); // Load the mask into register 00027 mLo.i = _mm_slli_si128( mLo.i, 3 ); // Shift into reach of the 16 bit multiply 00028 00029 mHi.i = _mm_mullo_epi16( mLo.i, mulShiftImm_0123 ); // Shift the bits 00030 mLo.i = _mm_mullo_epi16( mLo.i, mulShiftImm_4567 ); // Shift the bits 00031 00032 mHi.i = _mm_cmplt_epi32( mHi.i, _mm_setzero_si128() ); // FFFFFFFF if bit set, 00000000 if not set 00033 mLo.i = _mm_cmplt_epi32( mLo.i, _mm_setzero_si128() ); // FFFFFFFF if bit set, 00000000 if not set 00034 // End mask preparation - Mask bits 0-3 in mLo, 4-7 in mHi 00035 00036 a = _mm_and_ps( a, mHi.f ); // Clear input using the high bits of the mask 00037 a = _mm_mul_ps( a, b ); 00038 00039 a = ssp_arithmetic_hadd4_dup_ps_SSE3( a ); // Horizontally add the 4 values 00040 a = _mm_and_ps( a, mLo.f ); // Clear output using low bits of the mask 00041 return a; 00042 }