SSE2
[Supplemental]


Arithmetic Operations

SSP_FORCEINLINE __m128 ssp_arithmetic_hadd4_dup_ps_SSE2 (__m128 a)
SSP_FORCEINLINE __m128i ssp_arithmetic_hadd4_epi16_SSE2 (__m128i a, const unsigned int offset)
SSP_FORCEINLINE __m128 ssp_round_ps_neg_zero_SSE2 (__m128 a, int iRoundMode)

Convert Operations

SSP_FORCEINLINE void ssp_convert_odd_even_epi16_SSE2 (__m128i *a, __m128i *b)
SSP_FORCEINLINE void ssp_convert_odd_even_ps_SSE2 (__m128 *a, __m128 *b)
SSP_FORCEINLINE void ssp_convert_odd_even_epi32_SSE2 (__m128i *a, __m128i *b)
SSP_FORCEINLINE void ssp_convert_3c_3p_epi8_SSE2 (__m128i *rgb1, __m128i *rgb2, __m128i *rgb3)
SSP_FORCEINLINE void ssp_convert_reverse_transpose_SSE2 (__m128i *a, __m128i *b, __m128i *c)
SSP_FORCEINLINE void ssp_convert_3p_3c_epi8_SSE2 (__m128i *r, __m128i *g, __m128i *b)
SSP_FORCEINLINE void ssp_convert_3c_3p_epi16_SSE2 (__m128i *rgb1, __m128i *rgb2, __m128i *rgb3)
SSP_FORCEINLINE void ssp_convert_3p_3c_epi16_SSE2 (__m128i *r, __m128i *g, __m128i *b)
SSP_FORCEINLINE void ssp_convert_3c_3p_epi32_SSE2 (__m128i *rgb1, __m128i *rgb2, __m128i *rgb3)
SSP_FORCEINLINE void ssp_convert_3p_3c_epi32_SSE2 (__m128i *r, __m128i *g, __m128i *b)
SSP_FORCEINLINE void ssp_convert_4c_4p_epi8_SSE2 (__m128i *rgba1, __m128i *rgba2, __m128i *rgba3, __m128i *rgba4)
SSP_FORCEINLINE void ssp_convert_4p_4c_epi8_SSE2 (__m128i *r, __m128i *g, __m128i *b, __m128i *a)
SSP_FORCEINLINE void ssp_convert_4c_4p_epi16_SSE2 (__m128i *rgba1, __m128i *rgba2, __m128i *rgba3, __m128i *rgba4)
SSP_FORCEINLINE void ssp_convert_4p_4c_epi16_SSE2 (__m128i *r, __m128i *g, __m128i *b, __m128i *a)
SSP_FORCEINLINE void ssp_convert_4c_4p_epi32_SSE2 (__m128i *rgba1, __m128i *rgba2, __m128i *rgba3, __m128i *rgba4)
SSP_FORCEINLINE void ssp_convert_4p_4c_epi32_SSE2 (__m128i *r, __m128i *g, __m128i *b, __m128i *a)

Logical Operations

SSP_FORCEINLINE __m128i ssp_logical_signinvert_16_SSE2 (__m128i mask, __m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_logical_signinvert_32_SSE2 (__m128i mask, __m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_logical_invert_si128_SSE2 (__m128i a)
SSP_FORCEINLINE __m128d ssp_logical_invert_sd_SSE2 (__m128d a)
SSP_FORCEINLINE __m128 ssp_logical_invert_ss_SSE2 (__m128 a)
SSP_FORCEINLINE __m128i ssp_logical_bitwise_select_SSE2 (__m128i a, __m128i b, __m128i mask)
SSP_FORCEINLINE __m128i ssp_movmask_imm8_to_epi32_SSE2 (int mask)
SSP_FORCEINLINE __m128i ssp_slli_epi8_SSE2 (__m128i a, const int b)
SSP_FORCEINLINE __m128i ssp_srli_epi8_SSE2 (__m128i a, const int b)

Memory Operations

__m128i ssp_memory_load1_epu8_SSE2 (unsigned char a)

Function Documentation

SSP_FORCEINLINE __m128 ssp_arithmetic_hadd4_dup_ps_SSE2 ( __m128  a  ) 

Definition at line 17 of file SSEPlus_arithmetic_SSE2.h.

00018 {                                                                   // Horizontal add: every lane of the result holds the sum of the four floats in a
00019     __m128 t;
00020     t = _mm_shuffle_ps( a, a, _MM_SHUFFLE(2, 3, 0, 1) );                // t = a2,a3,a0,a1 (neighbours swapped within each pair)   TODO shuflo, shuf hi
00021     a = _mm_add_ps( a, t );                                             // a = a3+a2, a2+a3, a1+a0, a0+a1
00022 
00023     t = _mm_shuffle_ps( a, a, _MM_SHUFFLE(1, 0, 3, 2) );                // t = pair sums with the two 64-bit halves swapped        TODO shuflo, shuf hi
00024     a = _mm_add_ps( a, t );                                             // each lane = (low pair sum) + (high pair sum)
00025     return a;
00026 } 

SSP_FORCEINLINE __m128i ssp_arithmetic_hadd4_epi16_SSE2 ( __m128i  a,
const unsigned int  offset 
)

in = a,b,c,d | e,f,g,h, 0 out = x,x,x,a+b+c+d | x,x,x,e+f+g+h

in = a,b,c,d | e,f,g,h, 3 out = a+b+c+d,x,x,x | e+f+g+h,x,x,x

offset indicates desired position of sum (0,1,2,3)

Definition at line 39 of file SSEPlus_arithmetic_SSE2.h.

00040 {                                                      // Sums each group of four 16-bit lanes; offset (0..3, counted from the low lane of the group) picks where the sum lands
00041     ssp_m128 A,B;
00042     A.i = a;                                           //A = a, b, c, d | e, f, g, h
00043 
00044     if( offset >= 2 ) B.i = _mm_slli_si128( A.i, 4 );  //B = c, d, x, x | g, h, x, x
00045     else              B.i = _mm_srli_si128( A.i, 4 );  //B = x, x, a, b | x, x, e, f
00046 
00047     A.i = _mm_add_epi16 ( A.i, B.i );                  // each lane += the lane two 16-bit positions away (direction chosen above)
00048 
00049     if( offset & 1 )  B.i = _mm_slli_si128( A.i, 2 );  // B = partial sums moved up one 16-bit lane
00050     else              B.i = _mm_srli_si128( A.i, 2 );  // B = partial sums moved down one 16-bit lane
00051   
00052     A.i = _mm_add_epi16 ( A.i, B.i );                  // lane 'offset' of each group now holds the 4-element sum; the byte shifts cross group boundaries, so the other lanes are don't-care (x)
00053     return A.i;
00054 }  

SSP_FORCEINLINE void ssp_convert_3c_3p_epi16_SSE2 ( __m128i *  rgb1,
__m128i *  rgb2,
__m128i *  rgb3 
)

Definition at line 192 of file SSEPlus_convert_SSE2.h.

00193 {               // 3-channel -> 3-planar, 16-bit elements: per the lane comments (listed high..low), packed r,g,b in rgb1..rgb3 become rgb1 = r7..r0, rgb2 = g7..g0, rgb3 = b7..b0 (in place)
00194                 __m128i temp1, temp2;
00195 
00196                 *rgb2  = _mm_shuffle_epi32(*rgb2, _MM_SHUFFLE(1,0,3,2));//b3,g3,r3,b2,r5,b4,g4,r4
00197                 temp1 = _mm_unpacklo_epi16(*rgb1, *rgb2);                               //r5,r1,b4,b0,g4,g0,r4,r0
00198                 temp2 = _mm_unpackhi_epi16(*rgb2, *rgb3);                               //b7,b3,g7,g3,r7,r3,b6,b2
00199                 *rgb3  = _mm_slli_si128(*rgb3, 8);                                              //g6,r6,b5,g5, 0, 0, 0, 0
00200                 *rgb2  = _mm_unpackhi_epi16(*rgb1, *rgb3);                              //g6,g2,r6,r2,b5,b1,g5,g1
00201                 
00202                 *rgb3  = _mm_shuffle_epi32(*rgb2, _MM_SHUFFLE(1,0,3,2));//b5,b1,g5,g1,g6,g2,r6,r2
00203                 *rgb1  = _mm_unpacklo_epi16(temp1, *rgb3);                              //g6,g4,g2,g0,r6,r4,r2,r0
00204                 temp1 = _mm_srli_si128(temp1, 8);                                               // 0, 0, 0, 0,r5,r1,b4,b0
00205                 temp1 = _mm_unpacklo_epi16(temp1, temp2);                               //r7,r5,r3,r1,b6,b4,b2,b0
00206                 temp2 = _mm_unpackhi_epi16(*rgb3, temp2);                               //b7,b5,b3,b1,g7,g5,g3,g1
00207 
00208                 temp1 = _mm_shuffle_epi32(temp1, _MM_SHUFFLE(1,0,3,2)); //b6,b4,b2,b0,r7,r5,r3,r1
00209                 *rgb3  = _mm_unpackhi_epi16(temp1, temp2);                              //b7,b6,b5,b4,b3,b2,b1,b0                               
00210                 temp2 = _mm_slli_si128(temp2, 8);                                               //g7,g5,g3,g1, 0, 0, 0, 0
00211                 *rgb2  = _mm_unpackhi_epi16(*rgb1, temp2);                              //g7,g6,g5,g4,g3,g2,g1,g0                               
00212                 *rgb1  = _mm_unpacklo_epi16(*rgb1, temp1);                              //r7,r6,r5,r4,r3,r2,r1,r0                               
00213 }

SSP_FORCEINLINE void ssp_convert_3c_3p_epi32_SSE2 ( __m128i *  rgb1,
__m128i *  rgb2,
__m128i *  rgb3 
)

Definition at line 259 of file SSEPlus_convert_SSE2.h.

00260 {               // 3-channel -> 3-planar, 32-bit elements: per the lane comments (listed high..low), the outputs are rgb1 = r3..r0, rgb2 = g3..g0, rgb3 = b3..b0 (in place)
00261                 __m128i temp1, temp2;
00262                 
00263                 *rgb2  = _mm_shuffle_epi32(*rgb2, _MM_SHUFFLE(1,0,3,2));//b1,g1,g2,r2
00264                 temp1 = _mm_unpacklo_epi32(*rgb1, *rgb2);                               //g2,g0,r2,r0
00265                 temp2 = _mm_unpackhi_epi32(*rgb2, *rgb3);                               //b3,b1,g3,g1
00266                 *rgb3  = _mm_slli_si128(*rgb3, 8);                                              //r3,b2, 0, 0
00267                 *rgb2  = _mm_unpackhi_epi32(*rgb1, *rgb3);                              //r3,r1,b2,b0
00268                 
00269                 *rgb3  = _mm_shuffle_epi32(*rgb2, _MM_SHUFFLE(1,0,3,2));//b2,b0,r3,r1
00270                 *rgb1  = _mm_unpacklo_epi32(temp1, *rgb3);                              //r3,r2,r1,r0
00271                 temp1 = _mm_srli_si128(temp1, 8);                                               // 0, 0,g2,g0
00272                 *rgb2  = _mm_unpacklo_epi32(temp1, temp2);                              //g3,g2,g1,g0
00273                 *rgb3  = _mm_unpackhi_epi32(*rgb3, temp2);                              //b3,b2,b1,b0
00274 }

SSP_FORCEINLINE void ssp_convert_3c_3p_epi8_SSE2 ( __m128i *  rgb1,
__m128i *  rgb2,
__m128i *  rgb3 
)

Definition at line 90 of file SSEPlus_convert_SSE2.h.

00091 {   // 3-channel -> 3-planar, 8-bit elements: per the lane comments (listed high..low), the outputs are rgb1 = r15..r0, rgb2 = g15..g0, rgb3 = b15..b0 (in place)
00092     __m128i temp1, temp2;
00093                                                             // RGB1 =         r5 , b4  g4  r4 , b3  g3  r3 , b2  g2  r2 , b1  g1  r1 , b0  g0 r0
00094                                                             // RGB2 =     g10 r10, b9  g9  r9 , b8  g8  r8 , b7  g7  r7 , b6  g6  r6 , b5  g5   
00095                                                             // RGB3 = b15 g15 r15, b14 g14 r14, b13 g13 r13, b12 g12 r12, b11 g11 r11, b10 
00096 
00097 
00098     *rgb2 = _mm_shuffle_epi32(*rgb2, _MM_SHUFFLE(1,0,3,2));     // b7, g7, r7, b6, g6, r6, b5, g5,g10,r10, b9, g9, r9, b8, g8, r8
00099     temp1 = _mm_unpacklo_epi8(*rgb1, *rgb2);                            //g10, g2,r10, r2, b9, b1, g9, g1, r9, r1, b8, b0, g8, g0, r8, r0
00100     temp2 = _mm_unpackhi_epi8(*rgb2, *rgb3);                            //b15, b7,g15, g7,r15, r7,b14, b6,g14, g6,r14, r6,b13, b5,g13, g5
00101     *rgb3 = _mm_slli_si128   (*rgb3, 8    );                            //r13,b12,g12,r12,b11,g11,r11,b10,  0,  0,  0,  0,  0,  0,  0,  0
00102     *rgb2 = _mm_unpackhi_epi8(*rgb1, *rgb3);                            //r13, r5,b12, b4,g12, g4,r12, r4,b11, b3,g11, g3,r11, r3,b10, b2
00103 
00104     *rgb3 = _mm_shuffle_epi32(*rgb2, _MM_SHUFFLE(1,0,3,2));     //b11, b3,g11, g3,r11, r3,b10, b2,r13, r5,b12, b4,g12, g4,r12, r4
00105     *rgb1 = _mm_unpacklo_epi8(temp1, *rgb3);                            //r13, r9, r5, r1,b12, b8, b4, b0,g12, g8, g4, g0,r12, r8, r4, r0
00106     temp1 = _mm_srli_si128   (temp1, 8    );                            //  0,  0,  0,  0,  0,  0,  0,  0,g10, g2,r10, r2, b9, b1, g9, g1
00107     temp1 = _mm_unpacklo_epi8(temp1, temp2);                            //g14,g10, g6, g2,r14,r10, r6, r2,b13, b9, b5, b1,g13, g9, g5, g1
00108     temp2 = _mm_unpackhi_epi8(*rgb3, temp2);                            //b15,b11, b7, b3,g15,g11, g7, g3,r15,r11, r7, r3,b14,b10, b6, b2
00109 
00110     temp1 = _mm_shuffle_epi32(temp1, _MM_SHUFFLE(1,0,3,2)); //b13, b9, b5, b1,g13, g9, g5, g1,g14,g10, g6, g2,r14,r10, r6, r2
00111     *rgb3 = _mm_unpackhi_epi8(temp1, temp2);                            //b15,b13,b11, b9, b7, b5, b3, b1,g15,g13,g11, g9, g7, g5, g3, g1
00112     temp2 = _mm_slli_si128   (temp2, 8    );                            //r15,r11, r7, r3,b14,b10, b6, b2,  0,  0,  0,  0,  0,  0,  0,  0
00113     temp2 = _mm_unpackhi_epi8(*rgb1, temp2);                            //r15,r13,r11, r9, r7, r5, r3, r1,b14,b12,b10, b8, b6, b4, b2, b0
00114     temp1 = _mm_unpacklo_epi8(*rgb1, temp1);                            //g14,g12,g10, g8, g6, g4, g2, g0,r14,r12,r10, r8, r6, r4, r2, r0
00115 
00116     temp2 = _mm_shuffle_epi32(temp2, _MM_SHUFFLE(1,0,3,2)); //b14,b12,b10, b8, b6, b4, b2, b0,r15,r13,r11, r9, r7, r5, r3, r1
00117     *rgb1 = _mm_unpacklo_epi8(temp1, temp2);                            //r15,r14,r13,r12,r11,r10, r9, r8, r7, r6, r5, r4, r3, r2, r1, r0       
00118     temp1 = _mm_srli_si128   (temp1, 8    );                            //  0,  0,  0,  0,  0,  0,  0,  0,g14,g12,g10, g8, g6, g4, g2, g0
00119     *rgb2 = _mm_unpacklo_epi8(temp1, *rgb3);                            //g15,g14,g13,g12,g11,g10, g9, g8, g7, g6, g5, g4, g3, g2, g1, g0       
00120     *rgb3 = _mm_unpackhi_epi8(temp2, *rgb3);                            //b15,b14,b13,b12,b11,b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0       
00121 }

SSP_FORCEINLINE void ssp_convert_3p_3c_epi16_SSE2 ( __m128i *  r,
__m128i *  g,
__m128i *  b 
)

Definition at line 216 of file SSEPlus_convert_SSE2.h.

00217 {               // 3-planar -> 3-channel, 16-bit elements: per the lane comments (listed high..low), planar r, g, b are rewritten in place as packed pixels; *r gets the lowest lanes, then *g, then *b
00218                 __m128i temp;
00219 
00220                 temp = _mm_srli_si128(*r, 8);                   // 0, 0, 0, 0,r7,r6,r5,r4
00221                 *r    = _mm_unpacklo_epi16(*r, temp);   //r7,r3,r6,r2,r5,r1,r4,r0
00222                 temp = _mm_srli_si128(*r, 8);                   // 0, 0, 0, 0,r7,r3,r6,r2
00223                 *r    = _mm_unpacklo_epi16(*r, temp);   //r7,r5,r3,r1,r6,r4,r2,r0
00224 
00225                 temp = _mm_srli_si128(*g, 8);                   // 0, 0, 0, 0,g7,g6,g5,g4
00226                 *g    = _mm_unpacklo_epi16(*g, temp);   //g7,g3,g6,g2,g5,g1,g4,g0
00227                 temp = _mm_srli_si128(*g, 8);                   // 0, 0, 0, 0,g7,g3,g6,g2
00228                 *g    = _mm_unpacklo_epi16(*g, temp);   //g7,g5,g3,g1,g6,g4,g2,g0
00229 
00230                 temp = _mm_srli_si128(*b, 8);                   // 0, 0, 0, 0,b7,b6,b5,b4
00231                 *b    = _mm_unpacklo_epi16(*b, temp);   //b7,b3,b6,b2,b5,b1,b4,b0
00232                 temp = _mm_srli_si128(*b, 8);                   // 0, 0, 0, 0,b7,b3,b6,b2
00233                 *b    = _mm_unpacklo_epi16(*b, temp);   //b7,b5,b3,b1,b6,b4,b2,b0
00234 
00235                 temp = _mm_unpacklo_epi16(*r, *g);              //g6,r6,g4,r4,g2,r2,g0,r0
00236                 *r    = _mm_srli_si128(*r , 8);                 // 0, 0, 0, 0,r7,r5,r3,r1
00237                 *r    = _mm_unpacklo_epi16(*b, *r);             //r7,b6,r5,b4,r3,b2,r1,b0
00238                 *g    = _mm_unpackhi_epi16(*g, *b);             //b7,g7,b5,g5,b3,g3,b1,g1
00239 
00240                 *b    = _mm_srli_si128(*r, 8);                  // 0, 0, 0, 0,r7,b6,r5,b4
00241                 *r    = _mm_unpacklo_epi32(*r, *b);             //r7,b6,r3,b2,r5,b4,r1,b0
00242                 *b    = _mm_srli_si128(*g, 8);                  // 0, 0, 0, 0,b7,g7,b5,g5
00243                 *g    = _mm_unpacklo_epi32(*g, *b);             //b7,g7,b3,g3,b5,g5,b1,g1
00244                 *b    = _mm_srli_si128(temp, 8);                // 0, 0, 0, 0,g6,r6,g4,r4
00245                 temp = _mm_unpacklo_epi32(temp, *b);    //g6,r6,g2,r2,g4,r4,g0,r0
00246 
00247                 *b    = _mm_unpacklo_epi32(temp, *g);   //b5,g5,g4,r4,b1,g1,g0,r0
00248                 temp = _mm_srli_si128(temp, 8);                 // 0, 0, 0, 0,g6,r6,g2,r2
00249                 temp = _mm_unpacklo_epi32(*r, temp);    //g6,r6,r5,b4,g2,r2,r1,b0
00250                 *g    = _mm_unpackhi_epi32(*r, *g);             //b7,g7,r7,b6,b3,g3,r3,b2
00251                 
00252                 *r    = _mm_unpacklo_epi32(*b, temp);   //g2,r2,b1,g1,r1,b0,g0,r0
00253                 temp = _mm_unpackhi_epi32(*b, temp);    //g6,r6,b5,g5,r5,b4,g4,r4
00254                 *b    = _mm_unpackhi_epi64(temp, *g);   //b7,g7,r7,b6,g6,r6,b5,g5
00255                 *g    = _mm_unpacklo_epi64(*g, temp);   //r5,b4,g4,r4,b3,g3,r3,b2
00256 }

SSP_FORCEINLINE void ssp_convert_3p_3c_epi32_SSE2 ( __m128i *  r,
__m128i *  g,
__m128i *  b 
)

Definition at line 277 of file SSEPlus_convert_SSE2.h.

00278 {               // 3-planar -> 3-channel, 32-bit elements: per the lane comments (listed high..low), planar r, g, b are rewritten in place as packed pixels; *r gets the lowest lanes, then *g, then *b
00279                 __m128i temp;
00280 
00281                 temp = _mm_srli_si128(*r, 8);                   // 0, 0,r3,r2
00282                 *r    = _mm_unpacklo_epi32(*r, temp);   //r3,r1,r2,r0
00283                 temp = _mm_srli_si128(*g, 8);                   // 0, 0,g3,g2
00284                 *g    = _mm_unpacklo_epi32(*g, temp);   //g3,g1,g2,g0
00285                 temp = _mm_srli_si128(*b, 8);                   // 0, 0,b3,b2
00286                 *b    = _mm_unpacklo_epi32(*b, temp);   //b3,b1,b2,b0
00287 
00288                 temp = _mm_unpacklo_epi32(*r, *g);              //g2,r2,g0,r0
00289                 *g    = _mm_unpackhi_epi32(*g, *b);             //b3,g3,b1,g1
00290                 *r    = _mm_srli_si128(*r, 8);                  // 0, 0,r3,r1
00291                 *b    = _mm_unpacklo_epi32(*b, *r);             //r3,b2,r1,b0
00292 
00293                 *r    = _mm_unpacklo_epi64(temp, *b);   //r1,b0,g0,r0
00294                 *b    = _mm_unpackhi_epi64(*b, *g);             //b3,g3,r3,b2
00295                 *g    = _mm_slli_si128(*g, 8);                  //b1,g1, 0, 0
00296                 *g    = _mm_unpackhi_epi64(*g, temp);   //g2,r2,b1,g1
00297 }

SSP_FORCEINLINE void ssp_convert_3p_3c_epi8_SSE2 ( __m128i *  r,
__m128i *  g,
__m128i *  b 
)

Definition at line 150 of file SSEPlus_convert_SSE2.h.

00151 {   // 3-planar -> 3-channel, 8-bit elements: merges r, g, b into byte pairs, then 4-byte groups, and lets ssp_convert_reverse_transpose_SSE2 put the 32-bit groups in final order
00152     const static __m128i odd_8  = SSP_CONST_SET_8I(   0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0 );
00153     const static __m128i even_8 = SSP_CONST_SET_8I( 0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF   );
00154 
00155     const static __m128i odd_16  = SSP_CONST_SET_16I(   0xFFFF,0,0xFFFF,0,0xFFFF,0,0xFFFF,0 );
00156     const static __m128i even_16 = SSP_CONST_SET_16I( 0,0xFFFF,0,0xFFFF,0,0xFFFF,0,0xFFFF   );    
00157 
00158    ssp_m128 T, RG, GB, BR, RGBR, GBRG, BRGB;
00159     
00160      RG.i = _mm_and_si128 (     *r, even_8  );  // Keep the even r bytes (mask out the odd ones)
00161       T.i = _mm_slli_epi16(     *g, 8       );  // Move the even g bytes to the odd position
00162      RG.i = _mm_or_si128  (   RG.i, T.i     );  // G14 R14 ... G2 R2 G0 R0
00163 
00164      GB.i = _mm_srli_epi16(     *g, 8       );  // Move the odd g bytes to the even position
00165       T.i = _mm_and_si128 (     *b, odd_8   );  // Keep the odd b bytes
00166      GB.i = _mm_or_si128  (   GB.i, T.i     );  // B15 G15 ... B3 G3 B1 G1
00167 
00168      BR.i = _mm_and_si128 (     *b, even_8  );  // Keep the even b bytes
00169       T.i = _mm_and_si128 (     *r, odd_8   );  // Keep the odd r bytes
00170      BR.i = _mm_or_si128  (   BR.i, T.i     );  // R15 B14 ... R3 B2 R1 B0
00171 
00172    RGBR.i = _mm_and_si128 (   RG.i, even_16 );  // Even 16-bit pairs of RG ...
00173       T.i = _mm_slli_epi32(   BR.i, 16      );  // ... joined with the even pairs of BR moved to the odd pair position
00174    RGBR.i = _mm_or_si128  ( RGBR.i, T.i     );  // per 32-bit lane: R B G R
00175 
00176    GBRG.i = _mm_and_si128 (   GB.i, even_16 );  // Even pairs of GB ...
00177       T.i = _mm_and_si128 (   RG.i, odd_16  );  // ... joined with the odd pairs of RG
00178    GBRG.i = _mm_or_si128  ( GBRG.i, T.i     );  // per 32-bit lane: G R B G
00179 
00180    BRGB.i = _mm_srli_epi32(   BR.i, 16      );  // Odd pairs of BR moved to the even pair position ...
00181       T.i = _mm_and_si128 (   GB.i, odd_16  );  // ... joined with the odd pairs of GB
00182    BRGB.i = _mm_or_si128  ( BRGB.i, T.i     );  // per 32-bit lane: B G R B
00183 
00184    ssp_convert_reverse_transpose_SSE2( &RGBR.i, &GBRG.i, &BRGB.i );  // reorder the twelve 32-bit groups across the three registers
00185 
00186    *r = RGBR.i;
00187    *g = GBRG.i;
00188    *b = BRGB.i; 
00189 }

SSP_FORCEINLINE void ssp_convert_4c_4p_epi16_SSE2 ( __m128i *  rgba1,
__m128i *  rgba2,
__m128i *  rgba3,
__m128i *  rgba4 
)

Definition at line 344 of file SSEPlus_convert_SSE2.h.

00345 {               // 4-channel -> 4-planar, 16-bit elements: three rounds of unpacklo/unpackhi; per the lane comments (high..low) the outputs are rgba1 = r7..r0, rgba2 = g7..g0, rgba3 = b7..b0, rgba4 = a7..a0
00346                 __m128i temp1, temp2;
00347 
00348                 temp1  = _mm_unpacklo_epi16(*rgba1, *rgba3);            //a4,a0,b4,b0,g4,g0,r4,r0
00349                 *rgba1  = _mm_unpackhi_epi16(*rgba1, *rgba3);           //a5,a1,b5,b1,g5,g1,r5,r1
00350                 *rgba3  = _mm_unpacklo_epi16(*rgba2, *rgba4);           //a6,a2,b6,b2,g6,g2,r6,r2
00351                 *rgba2  = _mm_unpackhi_epi16(*rgba2, *rgba4);           //a7,a3,b7,b3,g7,g3,r7,r3
00352 
00353                 *rgba4  = _mm_unpackhi_epi16(*rgba1, *rgba2);           //a7,a5,a3,a1,b7,b5,b3,b1
00354                 *rgba1  = _mm_unpacklo_epi16(*rgba1, *rgba2);           //g7,g5,g3,g1,r7,r5,r3,r1
00355                 temp2  = _mm_unpacklo_epi16(temp1, *rgba3);                     //g6,g4,g2,g0,r6,r4,r2,r0
00356                 temp1  = _mm_unpackhi_epi16(temp1, *rgba3);                     //a6,a4,a2,a0,b6,b4,b2,b0
00357 
00358                 *rgba3  = _mm_unpacklo_epi16(temp1, *rgba4);            //b7,b6,b5,b4,b3,b2,b1,b0
00359                 *rgba4  = _mm_unpackhi_epi16(temp1, *rgba4);            //a7,a6,a5,a4,a3,a2,a1,a0
00360                 *rgba2  = _mm_unpackhi_epi16(temp2, *rgba1);            //g7,g6,g5,g4,g3,g2,g1,g0
00361                 *rgba1  = _mm_unpacklo_epi16(temp2, *rgba1);            //r7,r6,r5,r4,r3,r2,r1,r0
00362 }

SSP_FORCEINLINE void ssp_convert_4c_4p_epi32_SSE2 ( __m128i *  rgba1,
__m128i *  rgba2,
__m128i *  rgba3,
__m128i *  rgba4 
)

Definition at line 381 of file SSEPlus_convert_SSE2.h.

00382 {               // 4-channel -> 4-planar, 32-bit elements (a 4x4 transpose): per the lane comments (high..low) the outputs are rgba1 = r3..r0, rgba2 = g3..g0, rgba3 = b3..b0, rgba4 = a3..a0
00383                 __m128i temp1, temp2;
00384 
00385                 temp1  = _mm_unpacklo_epi32(*rgba1, *rgba3);            //g2,g0,r2,r0
00386                 *rgba1  = _mm_unpackhi_epi32(*rgba1, *rgba3);           //a2,a0,b2,b0
00387                 temp2  = _mm_unpacklo_epi32(*rgba2, *rgba4);            //g3,g1,r3,r1
00388                 *rgba2  = _mm_unpackhi_epi32(*rgba2, *rgba4);           //a3,a1,b3,b1
00389 
00390                 *rgba4  = _mm_unpackhi_epi32(*rgba1, *rgba2);           //a3,a2,a1,a0
00391                 *rgba3  = _mm_unpacklo_epi32(*rgba1, *rgba2);           //b3,b2,b1,b0
00392                 *rgba1  = _mm_unpacklo_epi32(temp1, temp2);                     //r3,r2,r1,r0
00393                 *rgba2  = _mm_unpackhi_epi32(temp1, temp2);                     //g3,g2,g1,g0
00394 }

SSP_FORCEINLINE void ssp_convert_4c_4p_epi8_SSE2 ( __m128i *  rgba1,
__m128i *  rgba2,
__m128i *  rgba3,
__m128i *  rgba4 
)

Definition at line 301 of file SSEPlus_convert_SSE2.h.

00302 {               // 4-channel -> 4-planar, 8-bit elements: four rounds of unpacklo/unpackhi; per the lane comments (high..low) the outputs are rgba1 = r15..r0, rgba2 = g15..g0, rgba3 = b15..b0, rgba4 = a15..a0
00303                 __m128i temp1,temp2;
00304 
00305                 temp1 = _mm_unpacklo_epi8(*rgba1, *rgba3);                      // a9, a1, b9, b1, g9, g1, r9, r1, a8, a0, b8, b0, g8, g0, r8, r0
00306                 *rgba1 = _mm_unpackhi_epi8(*rgba1, *rgba3);                     //a11, a3,b11, b3,g11, g3,r11, r3,a10, a2,b10, b2,g10, g2,r10, r2
00307                 *rgba3 = _mm_unpacklo_epi8(*rgba2, *rgba4);                     //a13, a5,b13, b5,g13, g5,r13, r5,a12, a4,b12, b4,g12, g4,r12, r4
00308                 temp2 = _mm_unpackhi_epi8(*rgba2, *rgba4);                      //a15, a7,b15, b7,g15, g7,r15, r7,a14, a6,b14, b6,g14, g6,r14, r6
00309 
00310                 *rgba4 = _mm_unpackhi_epi8(*rgba1, temp2);                      //a15,a11, a7, a3,b15,b11, b7, b3,g15,g11, g7, g3,r15,r11, r7, r3
00311                 *rgba1 = _mm_unpacklo_epi8(*rgba1, temp2);                      //a14,a10, a6, a2,b14,b10, b6, b2,g14,g10, g6, g2,r14,r10, r6, r2
00312                 *rgba2 = _mm_unpacklo_epi8(temp1, *rgba3);                      //a12, a8, a4, a0,b12, b8, b4, b0,g12, g8, g4, g0,r12, r8, r4, r0
00313                 *rgba3 = _mm_unpackhi_epi8(temp1, *rgba3);                      //a13, a9, a5, a1,b13, b9, b5, b1,g13, g9, g5, g1,r13, r9, r5, r1
00314 
00315                 temp1 = _mm_unpacklo_epi8(*rgba3, *rgba4);                      //g15,g13,g11, g9, g7, g5, g3, g1,r15,r13,r11, r9, r7, r5, r3, r1
00316                 *rgba3 = _mm_unpackhi_epi8(*rgba3, *rgba4);                     //a15,a13,a11, a9, a7, a5, a3, a1,b15,b13,b11, b9, b7, b5, b3, b1
00317                 temp2 = _mm_unpackhi_epi8(*rgba2, *rgba1);                      //a14,a12,a10, a8, a6, a4, a2, a0,b14,b12,b10, b8, b6, b4, b2, b0
00318                 *rgba2 = _mm_unpacklo_epi8(*rgba2, *rgba1);                     //g14,g12,g10, g8, g6, g4, g2, g0,r14,r12,r10, r8, r6, r4, r2, r0
00319 
00320                 *rgba1 = _mm_unpacklo_epi8(*rgba2, temp1);                      //r15,r14,r13,r12,r11,r10, r9, r8, r7, r6, r5, r4, r3, r2, r1, r0
00321                 *rgba2 = _mm_unpackhi_epi8(*rgba2, temp1);                      //g15,g14,g13,g12,g11,g10, g9, g8, g7, g6, g5, g4, g3, g2, g1, g0
00322                 *rgba4 = _mm_unpackhi_epi8(temp2, *rgba3);                      //a15,a14,a13,a12,a11,a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0
00323                 *rgba3 = _mm_unpacklo_epi8(temp2, *rgba3);                      //b15,b14,b13,b12,b11,b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0
00324 }

SSP_FORCEINLINE void ssp_convert_4p_4c_epi16_SSE2 ( __m128i *  r,
__m128i *  g,
__m128i *  b,
__m128i *  a 
)

Definition at line 365 of file SSEPlus_convert_SSE2.h.

00366 {               // 4-planar -> 4-channel, 16-bit elements: interleaves r, g, b, a in place; per the lane comments (high..low) pixels 0-1 end in *r, 2-3 in *g, 4-5 in *b, 6-7 in *a
00367                 __m128i temp1, temp2;
00368 
00369                 temp1 = _mm_unpacklo_epi16(*r, *b);                     //b3,r3,b2,r2,b1,r1,b0,r0
00370                 *r     = _mm_unpackhi_epi16(*r, *b);        //b7,r7,b6,r6,b5,r5,b4,r4
00371                 temp2 = _mm_unpacklo_epi16(*g, *a);                     //a3,g3,a2,g2,a1,g1,a0,g0
00372                 *g     = _mm_unpackhi_epi16(*g, *a);            //a7,g7,a6,g6,a5,g5,a4,g4
00373 
00374                 *b     = _mm_unpacklo_epi16(*r, *g);            //a5,b5,g5,r5,a4,b4,g4,r4
00375                 *a     = _mm_unpackhi_epi16(*r, *g);            //a7,b7,g7,r7,a6,b6,g6,r6
00376                 *r     = _mm_unpacklo_epi16(temp1, temp2);      //a1,b1,g1,r1,a0,b0,g0,r0
00377                 *g     = _mm_unpackhi_epi16(temp1, temp2);      //a3,b3,g3,r3,a2,b2,g2,r2
00378 }

SSP_FORCEINLINE void ssp_convert_4p_4c_epi32_SSE2 ( __m128i *  r,
__m128i *  g,
__m128i *  b,
__m128i *  a 
)

Definition at line 397 of file SSEPlus_convert_SSE2.h.

00398 {               // 4-planar -> 4-channel, 32-bit elements (a 4x4 transpose): per the lane comments (high..low) pixel 0 ends in *r, pixel 1 in *g, pixel 2 in *b, pixel 3 in *a
00399                 __m128i temp1, temp2;
00400 
00401                 temp1 = _mm_unpacklo_epi32(*r, *b);                     //b1,r1,b0,r0
00402                 *r     = _mm_unpackhi_epi32(*r, *b);        //b3,r3,b2,r2
00403                 temp2 = _mm_unpacklo_epi32(*g, *a);                     //a1,g1,a0,g0
00404                 *g     = _mm_unpackhi_epi32(*g, *a);            //a3,g3,a2,g2
00405 
00406                 *b     = _mm_unpacklo_epi32(*r, *g);            //a2,b2,g2,r2
00407                 *a     = _mm_unpackhi_epi32(*r, *g);            //a3,b3,g3,r3
00408                 *r     = _mm_unpacklo_epi32(temp1, temp2);      //a0,b0,g0,r0
00409                 *g     = _mm_unpackhi_epi32(temp1, temp2);      //a1,b1,g1,r1
00410 }

SSP_FORCEINLINE void ssp_convert_4p_4c_epi8_SSE2 ( __m128i *  r,
__m128i *  g,
__m128i *  b,
__m128i *  a 
)

Definition at line 328 of file SSEPlus_convert_SSE2.h.

00329 {               // 4-planar -> 4-channel, 8-bit elements: interleaves r, g, b, a in place; per the lane comments (high..low) pixels 0-3 end in *r, 4-7 in *g, 8-11 in *b, 12-15 in *a
00330                 __m128i temp1, temp2;
00331 
00332                 temp1 = _mm_unpacklo_epi8(*r, *b);                      // b7, r7, b6, r6, b5, r5, b4, r4, b3, r3, b2, r2, b1, r1, b0, r0
00333                 *r     = _mm_unpackhi_epi8(*r, *b);         //b15,r15,b14,r14,b13,r13,b12,r12,b11,r11,b10,r10, b9, r9, b8, r8
00334                 temp2 = _mm_unpacklo_epi8(*g, *a);                      // a7, g7, a6, g6, a5, g5, a4, g4, a3, g3, a2, g2, a1, g1, a0, g0
00335                 *g     = _mm_unpackhi_epi8(*g, *a);                     //a15,g15,a14,g14,a13,g13,a12,g12,a11,g11,a10,g10, a9, g9, a8, g8
00336 
00337                 *b     = _mm_unpacklo_epi8(*r, *g);                     //a11,b11,g11,r11,a10,b10,g10,r10, a9, b9, g9, r9, a8, b8, g8, r8
00338                 *a     = _mm_unpackhi_epi8(*r, *g);                     //a15,b15,g15,r15,a14,b14,g14,r14,a13,b13,g13,r13,a12,b12,g12,r12
00339                 *r     = _mm_unpacklo_epi8(temp1, temp2);       // a3, b3, g3, r3, a2, b2, g2, r2, a1, b1, g1, r1, a0, b0, g0, r0
00340                 *g     = _mm_unpackhi_epi8(temp1, temp2);       // a7, b7, g7, r7, a6, b6, g6, r6, a5, b5, g5, r5, a4, b4, g4, r4
00341 }

SSP_FORCEINLINE void ssp_convert_odd_even_epi16_SSE2 ( __m128i *  a,
__m128i *  b 
)

Definition at line 18 of file SSEPlus_convert_SSE2.h.

00019 {   // Splits the 16-bit lanes of a and b into even-indexed lanes (returned in *a) and odd-indexed lanes (returned in *b)
00020     // IN
00021     // a = a7,a6,a5,a4,a3,a2,a1,a0
00022     // b = b7,b6,b5,b4,b3,b2,b1,b0
00023 
00024     // OUT
00025     // a = b6,b4,b2,b0,a6,a4,a2,a0  // even
00026     // b = b7,b5,b3,b1,a7,a5,a3,a1  // odd
00027 
00028     __m128i A = *a;
00029     __m128i B = *b;
00030     __m128i ta, tb, odd, even;
00031 
00032     ta   = _mm_srai_epi32 ( A, 16 );    // sign,a7,sign,a5,sign,a3,sign,a1
00033     tb   = _mm_srai_epi32 ( B, 16 );    // sign,b7,sign,b5,sign,b3,sign,b1
00034     odd  = _mm_packs_epi32( ta, tb );   //   b7,b5,  b3,b1,  a7,a5,  a3,a1   (values are sign-extended 16-bit, so the signed saturation never clips)
00035 
00036     A    = _mm_slli_si128 ( A, 2 );     //   a6,a5,  a4,a3,  a2,a1,  a0, 0   (whole register shifted up one 16-bit lane)
00037     B    = _mm_slli_si128 ( B, 2 );     //   b6,b5,  b4,b3,  b2,b1,  b0, 0
00038     A    = _mm_srai_epi32 ( A, 16 );    // sign,a6,sign,a4,sign,a2,sign,a0
00039     B    = _mm_srai_epi32 ( B, 16 );    // sign,b6,sign,b4,sign,b2,sign,b0                                        
00040     even = _mm_packs_epi32( A, B );     //   b6,b4,  b2,b0,  a6,a4,  a2,a0
00041 
00042     *a = even;
00043     *b = odd;
00044 }

SSP_FORCEINLINE void ssp_convert_odd_even_epi32_SSE2 ( __m128i *  a,
__m128i *  b 
)

Definition at line 68 of file SSEPlus_convert_SSE2.h.

00069 {   // 32-bit integer variant: reinterprets the registers through ssp_m128 and forwards to ssp_convert_odd_even_ps_SSE2
00070     // IN
00071     // a = a3,a2,a1,a0
00072     // b = b3,b2,b1,b0
00073 
00074     // OUT (as produced by ssp_convert_odd_even_ps_SSE2, which stores its (3,1,3,1) shuffle in *a)
00075     // a = b3,b1,a3,a1  // odd
00076     // b = b2,b0,a2,a0  // even
00077     // NOTE(review): ssp_convert_odd_even_epi16_SSE2 returns even in *a and odd in *b -- confirm which order callers expect.
00078     ssp_m128 A,B;
00079     A.i = *a;
00080     B.i = *b;  
00081 
00082     ssp_convert_odd_even_ps_SSE2( &A.f, &B.f );
00083 
00084     *a = A.i;
00085     *b = B.i;       
00086 }

SSP_FORCEINLINE void ssp_convert_odd_even_ps_SSE2 ( __m128 *  a,
__m128 *  b 
)

Definition at line 49 of file SSEPlus_convert_SSE2.h.

00050 {   // Splits the float lanes of a and b into odd-indexed lanes (returned in *a) and even-indexed lanes (returned in *b)
00051     // IN
00052     // a = a3,a2,a1,a0
00053     // b = b3,b2,b1,b0
00054 
00055     // OUT
00056     // a = b3,b1,a3,a1  // odd  (selector 3,1,3,1)
00057     // b = b2,b0,a2,a0  // even (selector 2,0,2,0)
00058     // NOTE(review): ssp_convert_odd_even_epi16_SSE2 returns even in *a and odd in *b -- confirm which order callers expect.
00059     __m128 c, d;  
00060     c = _mm_shuffle_ps( *a, *b, _MM_SHUFFLE(3,1,3,1) );   // c = b3,b1,a3,a1
00061     d = _mm_shuffle_ps( *a, *b, _MM_SHUFFLE(2,0,2,0) );   // d = b2,b0,a2,a0
00062     *a = c;
00063     *b = d;     
00064 }

SSP_FORCEINLINE void ssp_convert_reverse_transpose_SSE2 ( __m128i *  a,
__m128i *  b,
__m128i *  c 
)

Definition at line 128 of file SSEPlus_convert_SSE2.h.

00129 {   // Reorders the twelve 32-bit lanes held in a, b, c with two rounds of float shuffles; the numbers in the comments are lane labels 0..11, listed high..low
00130     ssp_m128 A, B, C, T1, T2, T3;
00131     A.i = *a;   
00132     B.i = *b;   
00133     C.i = *c;  
00134 
00135     T1.f = _mm_shuffle_ps( C.f,  A.f,  _MM_SHUFFLE( 3,1,2,0) ); // 9  3  8  2
00136     T2.f = _mm_shuffle_ps( B.f,  A.f,  _MM_SHUFFLE( 2,0,2,0) ); // 6  0  7  1
00137     T3.f = _mm_shuffle_ps( C.f,  B.f,  _MM_SHUFFLE( 3,1,3,1) ); // 10 4  11 5
00138 
00139     A.f  = _mm_shuffle_ps( T2.f, T1.f, _MM_SHUFFLE( 2,0,0,2 ) ); //3  2  1  0  
00140     B.f  = _mm_shuffle_ps( T3.f, T2.f, _MM_SHUFFLE( 1,3,0,2 ) ); //7  6  5  4  
00141     C.f  = _mm_shuffle_ps( T1.f, T3.f, _MM_SHUFFLE( 1,3,3,1 ) ); //11 10 9  8   
00142 
00143     *a = A.i;   // lanes 3..0
00144     *b = B.i;   // lanes 7..4
00145     *c = C.i;   // lanes 11..8
00146 }

SSP_FORCEINLINE __m128i ssp_logical_bitwise_select_SSE2 ( __m128i  a,
__m128i  b,
__m128i  mask 
)

Definition at line 63 of file SSEPlus_logical_SSE2.h.

00063 // Bitwise select: r := (mask ? a : b)
00064 {   // Per-bit select: result bit = a where the mask bit is 1, b where the mask bit is 0
00065     a = _mm_and_si128   ( a,    mask );                                 // clear a where mask = 0
00066     b = _mm_andnot_si128( mask, b    );                                 // clear b where mask = 1
00067     a = _mm_or_si128    ( a,    b    );                                 // a = a OR b                         
00068     return a; 
00069 }

SSP_FORCEINLINE __m128d ssp_logical_invert_sd_SSE2 ( __m128d  a  ) 

Definition at line 44 of file SSEPlus_logical_SSE2.h.

00045 {   // Bitwise-inverts 64 of the 128 bits of a (one double); which half depends on SSP_CONST_SET_32I's argument order, not visible here -- TODO confirm
00046     const static __m128i mask = SSP_CONST_SET_32I( 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 );
00047     ssp_m128 A;
00048     A.d    = a;                          // reinterpret the doubles as integer bits
00049     A.i = _mm_xor_si128( A.i, mask );    // XOR with 1-bits flips them; XOR with 0-bits leaves the other half unchanged
00050     return A.d;
00051 }

SSP_FORCEINLINE __m128i ssp_logical_invert_si128_SSE2 ( __m128i  a  ) 

Definition at line 37 of file SSEPlus_logical_SSE2.h.

00038 {   // Bitwise NOT of all 128 bits of a
00039     const static __m128i mask = SSP_CONST_SET_32I( 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF );
00040     a = _mm_xor_si128( a, mask );   // XOR with all ones flips every bit
00041     return a;
00042 }

SSP_FORCEINLINE __m128 ssp_logical_invert_ss_SSE2 ( __m128  a  ) 

Definition at line 53 of file SSEPlus_logical_SSE2.h.

00054 {   // Bitwise-inverts 32 of the 128 bits of a (one float); which lane depends on SSP_CONST_SET_32I's argument order, not visible here -- TODO confirm
00055     const static __m128i mask = SSP_CONST_SET_32I( 0xFFFFFFFF, 0, 0, 0 );
00056     ssp_m128 A;
00057     A.f    = a;                          // reinterpret the floats as integer bits
00058     A.i = _mm_xor_si128( A.i, mask );    // XOR with 1-bits flips them; the other three lanes are unchanged
00059     return A.f;
00060 }

SSP_FORCEINLINE __m128i ssp_logical_signinvert_16_SSE2 ( __m128i  mask,
__m128i  a,
__m128i  b 
)

Invert 'mask' if 'a' and 'b' have different signs.

Definition at line 17 of file SSEPlus_logical_SSE2.h.

00018 {   // Per 16-bit lane: returns ~mask where a and b have different sign bits, mask unchanged otherwise
00019     __m128i signMask;   
00020     signMask = _mm_xor_si128  ( a, b );              // Signbit is 1 where signs differ 
00021     signMask = _mm_srai_epi16 ( signMask, 15 );      // fill all fields with sign bit     
00022     mask     = _mm_xor_si128  ( mask, signMask );    // Invert output where signs differed
00023     return mask;  
00024 }

SSP_FORCEINLINE __m128i ssp_logical_signinvert_32_SSE2 ( __m128i  mask,
__m128i  a,
__m128i  b 
)

Invert 'mask' if 'a' and 'b' have different signs.

Definition at line 27 of file SSEPlus_logical_SSE2.h.

00028 {   // Per 32-bit lane: returns ~mask where a and b have different sign bits, mask unchanged otherwise
00029     __m128i signMask;   
00030     signMask = _mm_xor_si128  ( a, b );              // Signbit is 1 where signs differ 
00031     signMask = _mm_srai_epi32 ( signMask, 31 );      // fill all fields with sign bit     
00032     mask     = _mm_xor_si128  ( mask, signMask );    // Invert output where signs differed
00033     return mask;  
00034 }

__m128i ssp_memory_load1_epu8_SSE2 ( unsigned char  a  ) 

Definition at line 18 of file SSEPlus_memory_SSE2.h.

00019 {   // Builds a register whose bytes are all copies of a, using the u8 / u32 views of ssp_m128
00020     ssp_m128 A;
00021     
00022     A.u8[0] = a;            // write a into u8 elements 0..3 ...
00023     A.u8[1] = a;
00024     A.u8[2] = a;
00025     A.u8[3] = a;
00026 
00027     A.u32[1] = A.u32[0];    // ... then copy u32 element 0 into elements 1..3 (presumably u8 and u32 alias the same 16 bytes -- confirm against the ssp_m128 definition)
00028     A.u32[2] = A.u32[0];
00029     A.u32[3] = A.u32[0]; 
00030     
00031     return A.i;
00032 }

SSP_FORCEINLINE __m128i ssp_movmask_imm8_to_epi32_SSE2 ( int  mask  ) 

Definition at line 84 of file SSEPlus_logical_SSE2.h.

00085 {   // Expands bits of mask into 32-bit lanes that are all ones or all zeros (presumably bit i -> lane i; depends on SSP_CONST_SET_16I's argument order -- TODO confirm)
00086     __m128i screen;
00087     const static __m128i mulShiftImm = SSP_CONST_SET_16I( 0x1000, 0x0000, 0x2000, 0x0000, 0x4000, 0x0000, 0x8000, 0x0000 ); // Shift mask multiply moves all bits to left, becomes MSB
00088     screen = _mm_set1_epi16 ( mask                );   // Load the mask into register
00089     screen = _mm_mullo_epi16( screen, mulShiftImm );   // Shift bits to MSB
00090     screen = _mm_srai_epi32 ( screen, 31          );   // Shift bits to obtain all F's or all 0's
00091     return screen;
00092 }

SSP_FORCEINLINE __m128 ssp_round_ps_neg_zero_SSE2 ( __m128  a,
int  iRoundMode 
)

This function wraps ssp_round_ps_SSE2. It guarantees that numbers rounding to 0 from a negative will generate a negative zero.

Definition at line 91 of file SSEPlus_arithmetic_SSE2.h.

00092 {   // Rounds via ssp_round_ps_SSE2, then ORs each lane's original sign bit back in so a negative input that rounds to zero yields -0
00093     const static __m128i SIGN_BIT = SSP_CONST_SET_32I( 0x80000000, 0x80000000, 0x80000000,0x80000000 );
00094     ssp_m128 A, sign;
00095     A.f = a;
00096     
00097     sign.i = _mm_and_si128    ( A.i, SIGN_BIT );  // Store the sign bits
00098     A.f    = ssp_round_ps_SSE2( A.f, iRoundMode );   
00099     A.i    = _mm_or_si128     ( A.i, sign.i );    // Restore the sign bits (preserves -0)
00100    
00101     return A.f;
00102 }

SSP_FORCEINLINE __m128i ssp_slli_epi8_SSE2 ( __m128i  a,
const int  b 
)

r_:= a_ << b; (logical left shift)

Definition at line 98 of file SSEPlus_logical_SSE2.h.

00099 {                                            //  a = VfVeVdVcVbVaV9V8V7V6V5V4V3V2V1V0   (per-byte logical left shift by b, built from 16-bit shifts)
00100     __m128i t1 = _mm_srli_epi16( a, 8 );     // t1 =   Vf  Vd  Vb  V9  V7  V5  V3  V1   (odd bytes moved to the low byte of each 16-bit lane)
00101     __m128i t2 = _mm_slli_epi16( a, b + 8 ); // t2 = Re  Rc  Ra  R8  R6  R4  R2  R0     (even bytes shifted; bits above 8 fall off the 16-bit lane)
00102     t1 = _mm_slli_epi16( t1, b + 8 );        // t1 = Rf  Rd  Rb  R9  R7  R5  R3  R1     (odd bytes shifted, back in the high byte)
00103     t2 = _mm_srli_epi16( t2, 8 );            // t2 =   Re  Rc  Ra  R8  R6  R4  R2  R0   (must shift t2: shifting t1 here would copy the odd results into the even bytes)
00104     t1 = _mm_or_si128( t1, t2 );             // t1 = RfReRdRcRbRaR9R8R7R6R5R4R3R2R1R0
00105     return t1;
00106 }

SSP_FORCEINLINE __m128i ssp_srli_epi8_SSE2 ( __m128i  a,
const int  b 
)

r_:= a_ >> b; (logical right shift)

Definition at line 111 of file SSEPlus_logical_SSE2.h.

00112 {                                            //  a = VfVeVdVcVbVaV9V8V7V6V5V4V3V2V1V0   (per-byte logical right shift by b, built from 16-bit shifts)
00113     __m128i t1 = _mm_slli_epi16( a, 8 );     // t1 = Ve  Vc  Va  V8  V6  V4  V2  V0     (even bytes moved to the high byte of each 16-bit lane)
00114     __m128i t2 = _mm_srli_epi16( a, b + 8 ); // t2 =   Rf  Rd  Rb  R9  R7  R5  R3  R1   (odd bytes shifted; bits below 0 fall off the 16-bit lane)
00115     t1 = _mm_srli_epi16( t1, b + 8 );        // t1 =   Re  Rc  Ra  R8  R6  R4  R2  R0   (even bytes shifted, back in the low byte)
00116     t2 = _mm_slli_epi16( t2, 8 );            // t2 = Rf  Rd  Rb  R9  R7  R5  R3  R1     (must shift t2: shifting t1 here would copy the even results into the odd bytes)
00117     t1 = _mm_or_si128( t1, t2 );             // t1 = RfReRdRcRbRaR9R8R7R6R5R4R3R2R1R0
00118     return t1;
00119 }


Generated on Wed May 21 13:44:15 2008 for "SSEPlus" by  doxygen 1.5.4