"SSEPlus": include/logical/SSEPlus_logical

00001 //
00002 // Copyright (c) 2006-2008 Advanced Micro Devices, Inc. All Rights Reserved.
00003 // This software is subject to the Apache v2.0 License.
00004 //
00005 #ifndef __SSEPLUS_LOGICAL_SSE2_H__
00006 #define __SSEPLUS_LOGICAL_SSE2_H__
00007 
00008 #include "../native/SSEPlus_native_SSE2.h"
00009 
00017 SSP_FORCEINLINE __m128i ssp_logical_signinvert_16_SSE2( __m128i mask, __m128i a, __m128i b)
00018 {
00019     __m128i signMask;   
00020     signMask = _mm_xor_si128  ( a, b );              // Signbit is 1 where signs differ 
00021     signMask = _mm_srai_epi16 ( signMask, 15 );      // fill all fields with sign bit     
00022     mask     = _mm_xor_si128  ( mask, signMask );    // Invert output where signs differed
00023     return mask;  
00024 }
00025 
00027 SSP_FORCEINLINE __m128i ssp_logical_signinvert_32_SSE2( __m128i mask, __m128i a, __m128i b)
00028 {
00029     __m128i signMask;   
00030     signMask = _mm_xor_si128  ( a, b );              // Signbit is 1 where signs differ 
00031     signMask = _mm_srai_epi32 ( signMask, 31 );      // fill all fields with sign bit     
00032     mask     = _mm_xor_si128  ( mask, signMask );    // Invert output where signs differed
00033     return mask;  
00034 }
00035 
00036 
00037 SSP_FORCEINLINE __m128i ssp_logical_invert_si128_SSE2( __m128i a )
00038 {
00039     const static __m128i mask = SSP_CONST_SET_32I( 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF );
00040     a = _mm_xor_si128( a, mask );
00041     return a;
00042 }
00043 
00044 SSP_FORCEINLINE __m128d ssp_logical_invert_sd_SSE2( __m128d a )
00045 {
00046     const static __m128i mask = SSP_CONST_SET_32I( 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 );
00047     ssp_m128 A;
00048     A.d    = a;
00049     A.i = _mm_xor_si128( A.i, mask );
00050     return A.d;
00051 }
00052 
00053 SSP_FORCEINLINE __m128 ssp_logical_invert_ss_SSE2( __m128 a )
00054 {
00055     const static __m128i mask = SSP_CONST_SET_32I( 0xFFFFFFFF, 0, 0, 0 );
00056     ssp_m128 A;
00057     A.f    = a;
00058     A.i = _mm_xor_si128( A.i, mask );
00059     return A.f;
00060 }
00061 
00062 
00063 SSP_FORCEINLINE __m128i ssp_logical_bitwise_select_SSE2( __m128i a, __m128i b, __m128i mask )   // Bitwise (mask ? a : b) 
00064 {
00065     a = _mm_and_si128   ( a,    mask );                                 // clear a where mask = 0
00066     b = _mm_andnot_si128( mask, b    );                                 // clear b where mask = 1
00067     a = _mm_or_si128    ( a,    b    );                                 // a = a OR b                         
00068     return a; 
00069 }
00070 
00071 
00072 //SSP_FORCEINLINE
00073 //__m128i ssp_generate_mask_imm8_to_epi16_SSE2( int mask )
00074 //{
00075 //    __m128i screen;
00076 //    const static __m128i mulShiftImm = SSP_CONST_SET_16I( 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, 0x8000 ); // Shift mask multiply moves all bits to left, becomes MSB
00077 //    screen = _mm_set1_epi16  ( mask                );   // Load the mask into register
00078 //    screen = _mm_mullo_epi16 ( screen, mulShiftImm );   // Shift bits to MSB
00079 //    screen = _mm_srai_epi16  ( screen, 15          );   // Shift bits to obtain 0xFFFF or 0x0000
00080 //    return screen;
00081 //}
00082 
00083 SSP_FORCEINLINE
00084 __m128i ssp_movmask_imm8_to_epi32_SSE2( int mask )
00085 {
00086     __m128i screen;
00087     const static __m128i mulShiftImm = SSP_CONST_SET_16I( 0x1000, 0x0000, 0x2000, 0x0000, 0x4000, 0x0000, 0x8000, 0x0000 ); // Shift mask multiply moves all bits to left, becomes MSB
00088     screen = _mm_set1_epi16 ( mask                );   // Load the mask into register
00089     screen = _mm_mullo_epi16( screen, mulShiftImm );   // Shift bits to MSB
00090     screen = _mm_srai_epi32 ( screen, 31          );   // Shift bits to obtain all F's or all 0's
00091     return screen;
00092 }
00093 
00094 
00098 SSP_FORCEINLINE __m128i ssp_slli_epi8_SSE2(__m128i a, const int b)
00099 {                                            //  a = VfVeVdVcVbVaV9V8V7V6V5V4V3V2V1V0
00100     __m128i t1 = _mm_srli_epi16( a, 8 );     // t1 =   Vf  Vd  Vb  V9  V7  V5  V3  V1
00101     __m128i t2 = _mm_slli_epi16( a, b + 8 ); // t2 = Re  Rc  Ra  R8  R6  R4  R2  R0
00102     t1 = _mm_slli_epi16( t1, b + 8 );        // t1 = Rf  Rd  Rb  R9  R7  R5  R3  R1
00103     t2 = _mm_srli_epi16( t1, 8 );            // t2 =   Re  Rc  Ra  R8  R6  R4  R2  R0
00104     t1 = _mm_or_si128( t1, t2 );             // t1 = RfReRdRcRbRaR9R8R7R6R5R4R3R2R1R0
00105     return t1;
00106 }
00107 
00111 SSP_FORCEINLINE __m128i ssp_srli_epi8_SSE2(__m128i a, const int b)
00112 {                                            //  a = VfVeVdVcVbVaV9V8V7V6V5V4V3V2V1V0
00113     __m128i t1 = _mm_slli_epi16( a, 8 );     // t1 = Ve  Vc  Va  V8  V6  V4  V2  V0
00114     __m128i t2 = _mm_srli_epi16( a, b + 8 ); // t2 =   Rf  Rd  Rb  R9  R7  R5  R3  R1
00115     t1 = _mm_srli_epi16( t1, b + 8 );        // t1 =   Re  Rc  Ra  R8  R6  R4  R2  R0
00116     t2 = _mm_slli_epi16( t1, 8 );            // t2 = Rf  Rd  Rb  R9  R7  R5  R3  R1
00117     t1 = _mm_or_si128( t1, t2 );             // t1 = RfReRdRcRbRaR9R8R7R6R5R4R3R2R1R0
00118     return t1;
00119 }
00120 
00125 #endif // __SSEPLUS_LOGICAL_SSE2_H__
include/logical/SSEPlus_logical_SSE2.h