SSE3
[Emulated]


SSE[4A,...,5] implemented in SSE3

SSP_FORCEINLINE __m128 ssp_dp_ps_SSE3 (__m128 a, __m128 b, const int mask)

Function Documentation

SSP_FORCEINLINE __m128 ssp_dp_ps_SSE3 ( __m128  a,
__m128  b,
const int  mask 
)

SSE3 implementation of _mm_dp_ps [SSE4.1], the mask-controlled dot product of packed single-precision floats. (Searches MSDN)

Definition at line 19 of file SSEPlus_emulation_SSE3.h.

00020 {   // SSE3 emulation of _mm_dp_ps: expand the two nibbles of mask into per-lane bit masks, gate a, multiply by b, sum, then gate the result
00021     const static __m128i mulShiftImm_0123 = SSP_CONST_SET_32I( 0x010000, 0x020000, 0x040000, 0x080000 );   // Upper 16-bit word of each lane is 1,2,4,8, so a 16-bit multiply by it shifts left by 0,1,2,3 bits
00022     const static __m128i mulShiftImm_4567 = SSP_CONST_SET_32I( 0x100000, 0x200000, 0x400000, 0x800000 );   // Upper 16-bit word of each lane is 0x10,0x20,0x40,0x80, so a 16-bit multiply by it shifts left by 4,5,6,7 bits
00023     // NOTE(review): which lane receives which multiplier depends on the argument order of SSP_CONST_SET_32I, which is not visible here - confirm
00024     // Begin mask preparation
00025     ssp_m128 mHi, mLo;
00026     mLo.i = _mm_set1_epi32( mask );                                 // Broadcast the mask into all four 32-bit lanes
00027     mLo.i = _mm_slli_si128( mLo.i, 3 );                         // Byte-shift the register left by 3: the mask's low byte lands in the top byte of every lane, inside the upper 16-bit word
00028 
00029     mHi.i = _mm_mullo_epi16( mLo.i, mulShiftImm_0123 );   // Shifts of 0-3 bits: one of mask bits 7..4 reaches the sign bit of each lane
00030     mLo.i = _mm_mullo_epi16( mLo.i, mulShiftImm_4567 );   // Shifts of 4-7 bits: one of mask bits 3..0 reaches the sign bit of each lane
00031 
00032     mHi.i = _mm_cmplt_epi32( mHi.i, _mm_setzero_si128() );    // Signed compare against zero: FFFFFFFF if the lane's sign bit (its mask bit) is set, 00000000 if not
00033     mLo.i = _mm_cmplt_epi32( mLo.i, _mm_setzero_si128() );    // Signed compare against zero: FFFFFFFF if the lane's sign bit (its mask bit) is set, 00000000 if not
00034     // End mask preparation - low nibble (mask bits 0-3) expanded in mLo, high nibble (mask bits 4-7) expanded in mHi
00035     // NOTE(review): a is zeroed before the multiply, so a NaN/Inf in a deselected lane of b still yields 0*b = NaN in the sum; native _mm_dp_ps is documented to use +0.0 for deselected products - confirm and consider masking the product instead
00036     a = _mm_and_ps( a, mHi.f );                                       // Zero the lanes of a whose high-nibble mask bit is clear
00037     a = _mm_mul_ps( a, b );                                           // Lane-wise products; lanes zeroed above contribute 0 for finite b
00038 
00039     a = ssp_arithmetic_hadd4_dup_ps_SSE3( a );                            // Horizontally add the 4 values (helper defined elsewhere; presumably leaves the sum in every lane - verify)
00040     a = _mm_and_ps( a, mLo.f );                                      // Keep the sum only in lanes whose low-nibble mask bit is set; other lanes become 0
00041     return a;
00042 }


Generated on Wed May 21 13:44:15 2008 for "SSEPlus" by  doxygen 1.5.4