include/convert/SSEPlus_convert_SSE2.h

//
// Copyright (c) 2006-2008 Advanced Micro Devices, Inc. All Rights Reserved.
// This software is subject to the Apache v2.0 License.
//
#ifndef __SSEPLUS_CONVERT_SSE2_H__
#define __SSEPLUS_CONVERT_SSE2_H__

#include "../native/SSEPlus_native_SSE2.h"

SSP_FORCEINLINE
void ssp_convert_odd_even_epi16_SSE2( __m128i *a, __m128i *b )
{
    // IN
    // a = a7,a6,a5,a4,a3,a2,a1,a0
    // b = b7,b6,b5,b4,b3,b2,b1,b0

    // OUT
    // a = b6,b4,b2,b0,a6,a4,a2,a0  // even
    // b = b7,b5,b3,b1,a7,a5,a3,a1  // odd

    __m128i A = *a;
    __m128i B = *b;
    __m128i ta, tb, odd, even;

    ta   = _mm_srai_epi32 ( A, 16 );    // sign,a7,sign,a5,sign,a3,sign,a1
    tb   = _mm_srai_epi32 ( B, 16 );    // sign,b7,sign,b5,sign,b3,sign,b1
    odd  = _mm_packs_epi32( ta, tb );   //   b7,b5,  b3,b1,  a7,a5,  a3,a1

    A    = _mm_slli_si128 ( A, 2 );     //   a6,a5,  a4,a3,  a2,a1,  a0, 0
    B    = _mm_slli_si128 ( B, 2 );     //   b6,b5,  b4,b3,  b2,b1,  b0, 0
    A    = _mm_srai_epi32 ( A, 16 );    // sign,a6,sign,a4,sign,a2,sign,a0
    B    = _mm_srai_epi32 ( B, 16 );    // sign,b6,sign,b4,sign,b2,sign,b0
    even = _mm_packs_epi32( A, B );     //   b6,b4,  b2,b0,  a6,a4,  a2,a0

    *a = even;
    *b = odd;
}
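
// Illustrative usage sketch added for clarity; it is not part of the original
// header, and the ssp_example_* name and the unaligned loads/stores are
// assumptions. It de-interleaves sixteen contiguous 16-bit samples into their
// even- and odd-indexed halves using the routine above.
SSP_FORCEINLINE
void ssp_example_split_even_odd_epi16( const short *src, short *even, short *odd )
{
    __m128i lo = _mm_loadu_si128( (const __m128i*)(src + 0) );  // samples 7..0
    __m128i hi = _mm_loadu_si128( (const __m128i*)(src + 8) );  // samples 15..8
    ssp_convert_odd_even_epi16_SSE2( &lo, &hi );                // lo = even-indexed, hi = odd-indexed
    _mm_storeu_si128( (__m128i*)even, lo );                     // even[0..7] = src[0],src[2],...,src[14]
    _mm_storeu_si128( (__m128i*)odd,  hi );                     // odd[0..7]  = src[1],src[3],...,src[15]
}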


SSP_FORCEINLINE
void ssp_convert_odd_even_ps_SSE2( __m128 *a, __m128 *b )
{
    // IN
    // a = a3,a2,a1,a0
    // b = b3,b2,b1,b0

    // OUT
    // a = b3,b1,a3,a1  // odd
    // b = b2,b0,a2,a0  // even

    __m128 c, d;
    c = _mm_shuffle_ps( *a, *b, _MM_SHUFFLE(3,1,3,1) );  // b3,b1,a3,a1  (odd elements)
    d = _mm_shuffle_ps( *a, *b, _MM_SHUFFLE(2,0,2,0) );  // b2,b0,a2,a0  (even elements)
    *a = c;
    *b = d;
}

SSP_FORCEINLINE
void ssp_convert_odd_even_epi32_SSE2( __m128i *a, __m128i *b )
{
    // IN
    // a = a3,a2,a1,a0
    // b = b3,b2,b1,b0

    // OUT
    // a = b3,b1,a3,a1  // odd
    // b = b2,b0,a2,a0  // even

    ssp_m128 A,B;
    A.i = *a;
    B.i = *b;

    ssp_convert_odd_even_ps_SSE2( &A.f, &B.f );

    *a = A.i;
    *b = B.i;
}


SSP_FORCEINLINE
void ssp_convert_3c_3p_epi8_SSE2( __m128i *rgb1, __m128i *rgb2, __m128i *rgb3)
{
    __m128i temp1, temp2;
                                                            // RGB1 =         r5 , b4  g4  r4 , b3  g3  r3 , b2  g2  r2 , b1  g1  r1 , b0  g0 r0
                                                            // RGB2 =     g10 r10, b9  g9  r9 , b8  g8  r8 , b7  g7  r7 , b6  g6  r6 , b5  g5
                                                            // RGB3 = b15 g15 r15, b14 g14 r14, b13 g13 r13, b12 g12 r12, b11 g11 r11, b10

    *rgb2 = _mm_shuffle_epi32(*rgb2, _MM_SHUFFLE(1,0,3,2)); // b7, g7, r7, b6, g6, r6, b5, g5,g10,r10, b9, g9, r9, b8, g8, r8
    temp1 = _mm_unpacklo_epi8(*rgb1, *rgb2);                //g10, g2,r10, r2, b9, b1, g9, g1, r9, r1, b8, b0, g8, g0, r8, r0
    temp2 = _mm_unpackhi_epi8(*rgb2, *rgb3);                //b15, b7,g15, g7,r15, r7,b14, b6,g14, g6,r14, r6,b13, b5,g13, g5
    *rgb3 = _mm_slli_si128   (*rgb3, 8    );                //r13,b12,g12,r12,b11,g11,r11,b10,  0,  0,  0,  0,  0,  0,  0,  0
    *rgb2 = _mm_unpackhi_epi8(*rgb1, *rgb3);                //r13, r5,b12, b4,g12, g4,r12, r4,b11, b3,g11, g3,r11, r3,b10, b2

    *rgb3 = _mm_shuffle_epi32(*rgb2, _MM_SHUFFLE(1,0,3,2)); //b11, b3,g11, g3,r11, r3,b10, b2,r13, r5,b12, b4,g12, g4,r12, r4
    *rgb1 = _mm_unpacklo_epi8(temp1, *rgb3);                //r13, r9, r5, r1,b12, b8, b4, b0,g12, g8, g4, g0,r12, r8, r4, r0
    temp1 = _mm_srli_si128   (temp1, 8    );                //  0,  0,  0,  0,  0,  0,  0,  0,g10, g2,r10, r2, b9, b1, g9, g1
    temp1 = _mm_unpacklo_epi8(temp1, temp2);                //g14,g10, g6, g2,r14,r10, r6, r2,b13, b9, b5, b1,g13, g9, g5, g1
    temp2 = _mm_unpackhi_epi8(*rgb3, temp2);                //b15,b11, b7, b3,g15,g11, g7, g3,r15,r11, r7, r3,b14,b10, b6, b2

    temp1 = _mm_shuffle_epi32(temp1, _MM_SHUFFLE(1,0,3,2)); //b13, b9, b5, b1,g13, g9, g5, g1,g14,g10, g6, g2,r14,r10, r6, r2
    *rgb3 = _mm_unpackhi_epi8(temp1, temp2);                //b15,b13,b11, b9, b7, b5, b3, b1,g15,g13,g11, g9, g7, g5, g3, g1
    temp2 = _mm_slli_si128   (temp2, 8    );                //r15,r11, r7, r3,b14,b10, b6, b2,  0,  0,  0,  0,  0,  0,  0,  0
    temp2 = _mm_unpackhi_epi8(*rgb1, temp2);                //r15,r13,r11, r9, r7, r5, r3, r1,b14,b12,b10, b8, b6, b4, b2, b0
    temp1 = _mm_unpacklo_epi8(*rgb1, temp1);                //g14,g12,g10, g8, g6, g4, g2, g0,r14,r12,r10, r8, r6, r4, r2, r0

    temp2 = _mm_shuffle_epi32(temp2, _MM_SHUFFLE(1,0,3,2)); //b14,b12,b10, b8, b6, b4, b2, b0,r15,r13,r11, r9, r7, r5, r3, r1
    *rgb1 = _mm_unpacklo_epi8(temp1, temp2);                //r15,r14,r13,r12,r11,r10, r9, r8, r7, r6, r5, r4, r3, r2, r1, r0
    temp1 = _mm_srli_si128   (temp1, 8    );                //  0,  0,  0,  0,  0,  0,  0,  0,g14,g12,g10, g8, g6, g4, g2, g0
    *rgb2 = _mm_unpacklo_epi8(temp1, *rgb3);                //g15,g14,g13,g12,g11,g10, g9, g8, g7, g6, g5, g4, g3, g2, g1, g0
    *rgb3 = _mm_unpackhi_epi8(temp2, *rgb3);                //b15,b14,b13,b12,b11,b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0
}
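
// Illustrative usage sketch added for clarity; it is not part of the original
// header, and the ssp_example_* name and the unaligned loads/stores are
// assumptions. It de-interleaves 16 packed RGB pixels (48 bytes, R0 G0 B0 R1 ...)
// into separate R, G and B planes.
SSP_FORCEINLINE
void ssp_example_rgb_to_planes_epi8( const unsigned char *rgb,
                                     unsigned char *r, unsigned char *g, unsigned char *b )
{
    __m128i v0 = _mm_loadu_si128( (const __m128i*)(rgb +  0) );  // bytes  0..15
    __m128i v1 = _mm_loadu_si128( (const __m128i*)(rgb + 16) );  // bytes 16..31
    __m128i v2 = _mm_loadu_si128( (const __m128i*)(rgb + 32) );  // bytes 32..47
    ssp_convert_3c_3p_epi8_SSE2( &v0, &v1, &v2 );                // v0 = R15..R0, v1 = G15..G0, v2 = B15..B0
    _mm_storeu_si128( (__m128i*)r, v0 );
    _mm_storeu_si128( (__m128i*)g, v1 );
    _mm_storeu_si128( (__m128i*)b, v2 );
}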


//a: 9  6 3 0    3  2  1 0
//b: 10 7 4 1 -> 7  6  5 4
//c: 11 8 5 2    11 10 9 8
SSP_FORCEINLINE
void ssp_convert_reverse_transpose_SSE2( __m128i *a, __m128i *b, __m128i *c )
{
    ssp_m128 A, B, C, T1, T2, T3;
    A.i = *a;
    B.i = *b;
    C.i = *c;

    T1.f = _mm_shuffle_ps( C.f,  A.f,  _MM_SHUFFLE( 3,1,2,0) );  // 9  3  8  2
    T2.f = _mm_shuffle_ps( B.f,  A.f,  _MM_SHUFFLE( 2,0,2,0) );  // 6  0  7  1
    T3.f = _mm_shuffle_ps( C.f,  B.f,  _MM_SHUFFLE( 3,1,3,1) );  // 10 4  11 5

    A.f  = _mm_shuffle_ps( T2.f, T1.f, _MM_SHUFFLE( 2,0,0,2 ) ); // 3  2  1  0
    B.f  = _mm_shuffle_ps( T3.f, T2.f, _MM_SHUFFLE( 1,3,0,2 ) ); // 7  6  5  4
    C.f  = _mm_shuffle_ps( T1.f, T3.f, _MM_SHUFFLE( 1,3,3,1 ) ); // 11 10 9  8

    *a = A.i;
    *b = B.i;
    *c = C.i;
}
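
// Scalar reference for the transform above, added as an illustrative sketch; it
// is not part of the original header and the ssp_example_* name is hypothetical.
// With the twelve 32-bit values labelled 0..11 as in the diagram, in[reg][elem]
// holds value 3*elem + reg on input, and value k ends up at out[k/4][k%4].
SSP_FORCEINLINE
void ssp_example_reverse_transpose_scalar( unsigned int in[3][4], unsigned int out[3][4] )
{
    int k;
    for( k = 0; k < 12; ++k )
        out[k/4][k%4] = in[k%3][k/3];   // gather value k from its planar position
}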


SSP_FORCEINLINE
void ssp_convert_3p_3c_epi8_SSE2( __m128i *r, __m128i *g, __m128i *b )
{
    const static __m128i odd_8  = SSP_CONST_SET_8I(   0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0 );
    const static __m128i even_8 = SSP_CONST_SET_8I( 0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF   );

    const static __m128i odd_16  = SSP_CONST_SET_16I(   0xFFFF,0,0xFFFF,0,0xFFFF,0,0xFFFF,0 );
    const static __m128i even_16 = SSP_CONST_SET_16I( 0,0xFFFF,0,0xFFFF,0,0xFFFF,0,0xFFFF   );

    ssp_m128 T, RG, GB, BR, RGBR, GBRG, BRGB;

     RG.i = _mm_and_si128 (     *r, even_8  );  // Keep the even-position r bytes
      T.i = _mm_slli_epi16(     *g, 8       );  // Move the even g bytes into the odd positions
     RG.i = _mm_or_si128  (   RG.i, T.i     );  // G14 R14 ... G2 R2 G0 R0

     GB.i = _mm_srli_epi16(     *g, 8       );  // Move the odd g bytes into the even positions
      T.i = _mm_and_si128 (     *b, odd_8   );  // Keep the odd-position b bytes
     GB.i = _mm_or_si128  (   GB.i, T.i     );  // B15 G15 ... B3 G3 B1 G1

     BR.i = _mm_and_si128 (     *b, even_8  );  // Keep the even-position b bytes
      T.i = _mm_and_si128 (     *r, odd_8   );  // Keep the odd-position r bytes
     BR.i = _mm_or_si128  (   BR.i, T.i     );  // R15 B14 ... R3 B2 R1 B0

   RGBR.i = _mm_and_si128 (   RG.i, even_16 );  // Keep words [G0 R0],[G4 R4],[G8 R8],[G12 R12]
      T.i = _mm_slli_epi32(   BR.i, 16      );  // Move [R1 B0],[R5 B4],... into the odd word positions
   RGBR.i = _mm_or_si128  ( RGBR.i, T.i     );  // R13 B12 G12 R12 ... R1 B0 G0 R0

   GBRG.i = _mm_and_si128 (   GB.i, even_16 );  // Keep words [B1 G1],[B5 G5],[B9 G9],[B13 G13]
      T.i = _mm_and_si128 (   RG.i, odd_16  );  // Keep words [G2 R2],[G6 R6],[G10 R10],[G14 R14]
   GBRG.i = _mm_or_si128  ( GBRG.i, T.i     );  // G14 R14 B13 G13 ... G2 R2 B1 G1

   BRGB.i = _mm_srli_epi32(   BR.i, 16      );  // Move [R3 B2],[R7 B6],... into the even word positions
      T.i = _mm_and_si128 (   GB.i, odd_16  );  // Keep words [B3 G3],[B7 G7],[B11 G11],[B15 G15]
   BRGB.i = _mm_or_si128  ( BRGB.i, T.i     );  // B15 G15 R15 B14 ... B3 G3 R3 B2

   ssp_convert_reverse_transpose_SSE2( &RGBR.i, &GBRG.i, &BRGB.i );

   *r = RGBR.i;
   *g = GBRG.i;
   *b = BRGB.i;
}
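
// Illustrative round-trip sketch added for clarity; it is not part of the
// original header, and the ssp_example_* name and the unaligned loads/stores are
// assumptions. It re-interleaves the R, G and B planes produced by
// ssp_convert_3c_3p_epi8_SSE2 back into 48 bytes of packed RGB.
SSP_FORCEINLINE
void ssp_example_planes_to_rgb_epi8( const unsigned char *r, const unsigned char *g,
                                     const unsigned char *b, unsigned char *rgb )
{
    __m128i vr = _mm_loadu_si128( (const __m128i*)r );           // R15..R0
    __m128i vg = _mm_loadu_si128( (const __m128i*)g );           // G15..G0
    __m128i vb = _mm_loadu_si128( (const __m128i*)b );           // B15..B0
    ssp_convert_3p_3c_epi8_SSE2( &vr, &vg, &vb );                 // vr,vg,vb now hold packed RGB
    _mm_storeu_si128( (__m128i*)(rgb +  0), vr );                 // bytes  0..15: R0 G0 B0 ... R5
    _mm_storeu_si128( (__m128i*)(rgb + 16), vg );                 // bytes 16..31: G5 B5 ... R10 G10
    _mm_storeu_si128( (__m128i*)(rgb + 32), vb );                 // bytes 32..47: B10 ... B15
}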

SSP_FORCEINLINE
void ssp_convert_3c_3p_epi16_SSE2(__m128i *rgb1,__m128i *rgb2,__m128i *rgb3)
{
    __m128i temp1, temp2;

    *rgb2 = _mm_shuffle_epi32(*rgb2, _MM_SHUFFLE(1,0,3,2));  //b3,g3,r3,b2,r5,b4,g4,r4
    temp1 = _mm_unpacklo_epi16(*rgb1, *rgb2);                //r5,r1,b4,b0,g4,g0,r4,r0
    temp2 = _mm_unpackhi_epi16(*rgb2, *rgb3);                //b7,b3,g7,g3,r7,r3,b6,b2
    *rgb3 = _mm_slli_si128(*rgb3, 8);                        //g6,r6,b5,g5, 0, 0, 0, 0
    *rgb2 = _mm_unpackhi_epi16(*rgb1, *rgb3);                //g6,g2,r6,r2,b5,b1,g5,g1

    *rgb3 = _mm_shuffle_epi32(*rgb2, _MM_SHUFFLE(1,0,3,2));  //b5,b1,g5,g1,g6,g2,r6,r2
    *rgb1 = _mm_unpacklo_epi16(temp1, *rgb3);                //g6,g4,g2,g0,r6,r4,r2,r0
    temp1 = _mm_srli_si128(temp1, 8);                        // 0, 0, 0, 0,r5,r1,b4,b0
    temp1 = _mm_unpacklo_epi16(temp1, temp2);                //r7,r5,r3,r1,b6,b4,b2,b0
    temp2 = _mm_unpackhi_epi16(*rgb3, temp2);                //b7,b5,b3,b1,g7,g5,g3,g1

    temp1 = _mm_shuffle_epi32(temp1, _MM_SHUFFLE(1,0,3,2));  //b6,b4,b2,b0,r7,r5,r3,r1
    *rgb3 = _mm_unpackhi_epi16(temp1, temp2);                //b7,b6,b5,b4,b3,b2,b1,b0
    temp2 = _mm_slli_si128(temp2, 8);                        //g7,g5,g3,g1, 0, 0, 0, 0
    *rgb2 = _mm_unpackhi_epi16(*rgb1, temp2);                //g7,g6,g5,g4,g3,g2,g1,g0
    *rgb1 = _mm_unpacklo_epi16(*rgb1, temp1);                //r7,r6,r5,r4,r3,r2,r1,r0
}
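
// Illustrative usage sketch added for clarity; it is not part of the original
// header, and the ssp_example_* name and the unaligned loads/stores are
// assumptions. It de-interleaves 8 packed 16-bit RGB pixels (24 words), e.g.
// high-bit-depth image data, into separate R, G and B planes.
SSP_FORCEINLINE
void ssp_example_rgb_to_planes_epi16( const unsigned short *rgb,
                                      unsigned short *r, unsigned short *g, unsigned short *b )
{
    __m128i v0 = _mm_loadu_si128( (const __m128i*)(rgb +  0) );  // words  0..7
    __m128i v1 = _mm_loadu_si128( (const __m128i*)(rgb +  8) );  // words  8..15
    __m128i v2 = _mm_loadu_si128( (const __m128i*)(rgb + 16) );  // words 16..23
    ssp_convert_3c_3p_epi16_SSE2( &v0, &v1, &v2 );               // v0 = R7..R0, v1 = G7..G0, v2 = B7..B0
    _mm_storeu_si128( (__m128i*)r, v0 );
    _mm_storeu_si128( (__m128i*)g, v1 );
    _mm_storeu_si128( (__m128i*)b, v2 );
}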

SSP_FORCEINLINE
void ssp_convert_3p_3c_epi16_SSE2(__m128i *r,__m128i *g,__m128i *b)
{
    __m128i temp;

    temp = _mm_srli_si128(*r, 8);            // 0, 0, 0, 0,r7,r6,r5,r4
    *r   = _mm_unpacklo_epi16(*r, temp);     //r7,r3,r6,r2,r5,r1,r4,r0
    temp = _mm_srli_si128(*r, 8);            // 0, 0, 0, 0,r7,r3,r6,r2
    *r   = _mm_unpacklo_epi16(*r, temp);     //r7,r5,r3,r1,r6,r4,r2,r0

    temp = _mm_srli_si128(*g, 8);            // 0, 0, 0, 0,g7,g6,g5,g4
    *g   = _mm_unpacklo_epi16(*g, temp);     //g7,g3,g6,g2,g5,g1,g4,g0
    temp = _mm_srli_si128(*g, 8);            // 0, 0, 0, 0,g7,g3,g6,g2
    *g   = _mm_unpacklo_epi16(*g, temp);     //g7,g5,g3,g1,g6,g4,g2,g0

    temp = _mm_srli_si128(*b, 8);            // 0, 0, 0, 0,b7,b6,b5,b4
    *b   = _mm_unpacklo_epi16(*b, temp);     //b7,b3,b6,b2,b5,b1,b4,b0
    temp = _mm_srli_si128(*b, 8);            // 0, 0, 0, 0,b7,b3,b6,b2
    *b   = _mm_unpacklo_epi16(*b, temp);     //b7,b5,b3,b1,b6,b4,b2,b0

    temp = _mm_unpacklo_epi16(*r, *g);       //g6,r6,g4,r4,g2,r2,g0,r0
    *r   = _mm_srli_si128(*r , 8);           // 0, 0, 0, 0,r7,r5,r3,r1
    *r   = _mm_unpacklo_epi16(*b, *r);       //r7,b6,r5,b4,r3,b2,r1,b0
    *g   = _mm_unpackhi_epi16(*g, *b);       //b7,g7,b5,g5,b3,g3,b1,g1

    *b   = _mm_srli_si128(*r, 8);            // 0, 0, 0, 0,r7,b6,r5,b4
    *r   = _mm_unpacklo_epi32(*r, *b);       //r7,b6,r3,b2,r5,b4,r1,b0
    *b   = _mm_srli_si128(*g, 8);            // 0, 0, 0, 0,b7,g7,b5,g5
    *g   = _mm_unpacklo_epi32(*g, *b);       //b7,g7,b3,g3,b5,g5,b1,g1
    *b   = _mm_srli_si128(temp, 8);          // 0, 0, 0, 0,g6,r6,g4,r4
    temp = _mm_unpacklo_epi32(temp, *b);     //g6,r6,g2,r2,g4,r4,g0,r0

    *b   = _mm_unpacklo_epi32(temp, *g);     //b5,g5,g4,r4,b1,g1,g0,r0
    temp = _mm_srli_si128(temp, 8);          // 0, 0, 0, 0,g6,r6,g2,r2
    temp = _mm_unpacklo_epi32(*r, temp);     //g6,r6,r5,b4,g2,r2,r1,b0
    *g   = _mm_unpackhi_epi32(*r, *g);       //b7,g7,r7,b6,b3,g3,r3,b2

    *r   = _mm_unpacklo_epi32(*b, temp);     //g2,r2,b1,g1,r1,b0,g0,r0
    temp = _mm_unpackhi_epi32(*b, temp);     //g6,r6,b5,g5,r5,b4,g4,r4
    *b   = _mm_unpackhi_epi64(temp, *g);     //b7,g7,r7,b6,g6,r6,b5,g5
    *g   = _mm_unpacklo_epi64(*g, temp);     //r5,b4,g4,r4,b3,g3,r3,b2
}

SSP_FORCEINLINE
void ssp_convert_3c_3p_epi32_SSE2(__m128i *rgb1,__m128i *rgb2,__m128i *rgb3)
{
    __m128i temp1, temp2;

    *rgb2 = _mm_shuffle_epi32(*rgb2, _MM_SHUFFLE(1,0,3,2));  //b1,g1,g2,r2
    temp1 = _mm_unpacklo_epi32(*rgb1, *rgb2);                //g2,g0,r2,r0
    temp2 = _mm_unpackhi_epi32(*rgb2, *rgb3);                //b3,b1,g3,g1
    *rgb3 = _mm_slli_si128(*rgb3, 8);                        //r3,b2, 0, 0
    *rgb2 = _mm_unpackhi_epi32(*rgb1, *rgb3);                //r3,r1,b2,b0

    *rgb3 = _mm_shuffle_epi32(*rgb2, _MM_SHUFFLE(1,0,3,2));  //b2,b0,r3,r1
    *rgb1 = _mm_unpacklo_epi32(temp1, *rgb3);                //r3,r2,r1,r0
    temp1 = _mm_srli_si128(temp1, 8);                        // 0, 0,g2,g0
    *rgb2 = _mm_unpacklo_epi32(temp1, temp2);                //g3,g2,g1,g0
    *rgb3 = _mm_unpackhi_epi32(*rgb3, temp2);                //b3,b2,b1,b0
}

SSP_FORCEINLINE
void ssp_convert_3p_3c_epi32_SSE2(__m128i *r,__m128i *g,__m128i *b)
{
    __m128i temp;

    temp = _mm_srli_si128(*r, 8);            // 0, 0,r3,r2
    *r   = _mm_unpacklo_epi32(*r, temp);     //r3,r1,r2,r0
    temp = _mm_srli_si128(*g, 8);            // 0, 0,g3,g2
    *g   = _mm_unpacklo_epi32(*g, temp);     //g3,g1,g2,g0
    temp = _mm_srli_si128(*b, 8);            // 0, 0,b3,b2
    *b   = _mm_unpacklo_epi32(*b, temp);     //b3,b1,b2,b0

    temp = _mm_unpacklo_epi32(*r, *g);       //g2,r2,g0,r0
    *g   = _mm_unpackhi_epi32(*g, *b);       //b3,g3,b1,g1
    *r   = _mm_srli_si128(*r, 8);            // 0, 0,r3,r1
    *b   = _mm_unpacklo_epi32(*b, *r);       //r3,b2,r1,b0

    *r   = _mm_unpacklo_epi64(temp, *b);     //r1,b0,g0,r0
    *b   = _mm_unpackhi_epi64(*b, *g);       //b3,g3,r3,b2
    *g   = _mm_slli_si128(*g, 8);            //b1,g1, 0, 0
    *g   = _mm_unpackhi_epi64(*g, temp);     //g2,r2,b1,g1
}

/* convert 4-channel RGBA to 4-planar format */
SSP_FORCEINLINE
void ssp_convert_4c_4p_epi8_SSE2( __m128i *rgba1, __m128i *rgba2, __m128i *rgba3, __m128i *rgba4 )
{
    __m128i temp1,temp2;

    temp1  = _mm_unpacklo_epi8(*rgba1, *rgba3);              // a9, a1, b9, b1, g9, g1, r9, r1, a8, a0, b8, b0, g8, g0, r8, r0
    *rgba1 = _mm_unpackhi_epi8(*rgba1, *rgba3);              //a11, a3,b11, b3,g11, g3,r11, r3,a10, a2,b10, b2,g10, g2,r10, r2
    *rgba3 = _mm_unpacklo_epi8(*rgba2, *rgba4);              //a13, a5,b13, b5,g13, g5,r13, r5,a12, a4,b12, b4,g12, g4,r12, r4
    temp2  = _mm_unpackhi_epi8(*rgba2, *rgba4);              //a15, a7,b15, b7,g15, g7,r15, r7,a14, a6,b14, b6,g14, g6,r14, r6

    *rgba4 = _mm_unpackhi_epi8(*rgba1, temp2);               //a15,a11, a7, a3,b15,b11, b7, b3,g15,g11, g7, g3,r15,r11, r7, r3
    *rgba1 = _mm_unpacklo_epi8(*rgba1, temp2);               //a14,a10, a6, a2,b14,b10, b6, b2,g14,g10, g6, g2,r14,r10, r6, r2
    *rgba2 = _mm_unpacklo_epi8(temp1, *rgba3);               //a12, a8, a4, a0,b12, b8, b4, b0,g12, g8, g4, g0,r12, r8, r4, r0
    *rgba3 = _mm_unpackhi_epi8(temp1, *rgba3);               //a13, a9, a5, a1,b13, b9, b5, b1,g13, g9, g5, g1,r13, r9, r5, r1

    temp1  = _mm_unpacklo_epi8(*rgba3, *rgba4);              //g15,g13,g11, g9, g7, g5, g3, g1,r15,r13,r11, r9, r7, r5, r3, r1
    *rgba3 = _mm_unpackhi_epi8(*rgba3, *rgba4);              //a15,a13,a11, a9, a7, a5, a3, a1,b15,b13,b11, b9, b7, b5, b3, b1
    temp2  = _mm_unpackhi_epi8(*rgba2, *rgba1);              //a14,a12,a10, a8, a6, a4, a2, a0,b14,b12,b10, b8, b6, b4, b2, b0
    *rgba2 = _mm_unpacklo_epi8(*rgba2, *rgba1);              //g14,g12,g10, g8, g6, g4, g2, g0,r14,r12,r10, r8, r6, r4, r2, r0

    *rgba1 = _mm_unpacklo_epi8(*rgba2, temp1);               //r15,r14,r13,r12,r11,r10, r9, r8, r7, r6, r5, r4, r3, r2, r1, r0
    *rgba2 = _mm_unpackhi_epi8(*rgba2, temp1);               //g15,g14,g13,g12,g11,g10, g9, g8, g7, g6, g5, g4, g3, g2, g1, g0
    *rgba4 = _mm_unpackhi_epi8(temp2, *rgba3);               //a15,a14,a13,a12,a11,a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0
    *rgba3 = _mm_unpacklo_epi8(temp2, *rgba3);               //b15,b14,b13,b12,b11,b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0
}
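
// Illustrative usage sketch added for clarity; it is not part of the original
// header, and the ssp_example_* name and the unaligned loads/stores are
// assumptions. It de-interleaves 16 packed RGBA pixels (64 bytes) into separate
// R, G, B and A planes.
SSP_FORCEINLINE
void ssp_example_rgba_to_planes_epi8( const unsigned char *rgba,
                                      unsigned char *r, unsigned char *g,
                                      unsigned char *b, unsigned char *a )
{
    __m128i v0 = _mm_loadu_si128( (const __m128i*)(rgba +  0) );  // pixels  0..3
    __m128i v1 = _mm_loadu_si128( (const __m128i*)(rgba + 16) );  // pixels  4..7
    __m128i v2 = _mm_loadu_si128( (const __m128i*)(rgba + 32) );  // pixels  8..11
    __m128i v3 = _mm_loadu_si128( (const __m128i*)(rgba + 48) );  // pixels 12..15
    ssp_convert_4c_4p_epi8_SSE2( &v0, &v1, &v2, &v3 );            // v0=R, v1=G, v2=B, v3=A plane
    _mm_storeu_si128( (__m128i*)r, v0 );
    _mm_storeu_si128( (__m128i*)g, v1 );
    _mm_storeu_si128( (__m128i*)b, v2 );
    _mm_storeu_si128( (__m128i*)a, v3 );
}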

/* convert 4-planar RGBA to 4-channel format */
SSP_FORCEINLINE
void ssp_convert_4p_4c_epi8_SSE2(__m128i *r,__m128i *g,__m128i *b,__m128i *a)
{
    __m128i temp1, temp2;

    temp1 = _mm_unpacklo_epi8(*r, *b);           // b7, r7, b6, r6, b5, r5, b4, r4, b3, r3, b2, r2, b1, r1, b0, r0
    *r    = _mm_unpackhi_epi8(*r, *b);           //b15,r15,b14,r14,b13,r13,b12,r12,b11,r11,b10,r10, b9, r9, b8, r8
    temp2 = _mm_unpacklo_epi8(*g, *a);           // a7, g7, a6, g6, a5, g5, a4, g4, a3, g3, a2, g2, a1, g1, a0, g0
    *g    = _mm_unpackhi_epi8(*g, *a);           //a15,g15,a14,g14,a13,g13,a12,g12,a11,g11,a10,g10, a9, g9, a8, g8

    *b    = _mm_unpacklo_epi8(*r, *g);           //a11,b11,g11,r11,a10,b10,g10,r10, a9, b9, g9, r9, a8, b8, g8, r8
    *a    = _mm_unpackhi_epi8(*r, *g);           //a15,b15,g15,r15,a14,b14,g14,r14,a13,b13,g13,r13,a12,b12,g12,r12
    *r    = _mm_unpacklo_epi8(temp1, temp2);     // a3, b3, g3, r3, a2, b2, g2, r2, a1, b1, g1, r1, a0, b0, g0, r0
    *g    = _mm_unpackhi_epi8(temp1, temp2);     // a7, b7, g7, r7, a6, b6, g6, r6, a5, b5, g5, r5, a4, b4, g4, r4
}

SSP_FORCEINLINE
void ssp_convert_4c_4p_epi16_SSE2(__m128i *rgba1,__m128i *rgba2,__m128i *rgba3,__m128i *rgba4)
{
    __m128i temp1, temp2;

    temp1  = _mm_unpacklo_epi16(*rgba1, *rgba3);   //a4,a0,b4,b0,g4,g0,r4,r0
    *rgba1 = _mm_unpackhi_epi16(*rgba1, *rgba3);   //a5,a1,b5,b1,g5,g1,r5,r1
    *rgba3 = _mm_unpacklo_epi16(*rgba2, *rgba4);   //a6,a2,b6,b2,g6,g2,r6,r2
    *rgba2 = _mm_unpackhi_epi16(*rgba2, *rgba4);   //a7,a3,b7,b3,g7,g3,r7,r3

    *rgba4 = _mm_unpackhi_epi16(*rgba1, *rgba2);   //a7,a5,a3,a1,b7,b5,b3,b1
    *rgba1 = _mm_unpacklo_epi16(*rgba1, *rgba2);   //g7,g5,g3,g1,r7,r5,r3,r1
    temp2  = _mm_unpacklo_epi16(temp1, *rgba3);    //g6,g4,g2,g0,r6,r4,r2,r0
    temp1  = _mm_unpackhi_epi16(temp1, *rgba3);    //a6,a4,a2,a0,b6,b4,b2,b0

    *rgba3 = _mm_unpacklo_epi16(temp1, *rgba4);    //b7,b6,b5,b4,b3,b2,b1,b0
    *rgba4 = _mm_unpackhi_epi16(temp1, *rgba4);    //a7,a6,a5,a4,a3,a2,a1,a0
    *rgba2 = _mm_unpackhi_epi16(temp2, *rgba1);    //g7,g6,g5,g4,g3,g2,g1,g0
    *rgba1 = _mm_unpacklo_epi16(temp2, *rgba1);    //r7,r6,r5,r4,r3,r2,r1,r0
}

SSP_FORCEINLINE
void ssp_convert_4p_4c_epi16_SSE2(__m128i *r,__m128i *g,__m128i *b,__m128i *a)
{
    __m128i temp1, temp2;

    temp1 = _mm_unpacklo_epi16(*r, *b);          //b3,r3,b2,r2,b1,r1,b0,r0
    *r    = _mm_unpackhi_epi16(*r, *b);          //b7,r7,b6,r6,b5,r5,b4,r4
    temp2 = _mm_unpacklo_epi16(*g, *a);          //a3,g3,a2,g2,a1,g1,a0,g0
    *g    = _mm_unpackhi_epi16(*g, *a);          //a7,g7,a6,g6,a5,g5,a4,g4

    *b    = _mm_unpacklo_epi16(*r, *g);          //a5,b5,g5,r5,a4,b4,g4,r4
    *a    = _mm_unpackhi_epi16(*r, *g);          //a7,b7,g7,r7,a6,b6,g6,r6
    *r    = _mm_unpacklo_epi16(temp1, temp2);    //a1,b1,g1,r1,a0,b0,g0,r0
    *g    = _mm_unpackhi_epi16(temp1, temp2);    //a3,b3,g3,r3,a2,b2,g2,r2
}

SSP_FORCEINLINE
void ssp_convert_4c_4p_epi32_SSE2(__m128i *rgba1,__m128i *rgba2,__m128i *rgba3, __m128i *rgba4)
{
    __m128i temp1, temp2;

    temp1  = _mm_unpacklo_epi32(*rgba1, *rgba3);   //g2,g0,r2,r0
    *rgba1 = _mm_unpackhi_epi32(*rgba1, *rgba3);   //a2,a0,b2,b0
    temp2  = _mm_unpacklo_epi32(*rgba2, *rgba4);   //g3,g1,r3,r1
    *rgba2 = _mm_unpackhi_epi32(*rgba2, *rgba4);   //a3,a1,b3,b1

    *rgba4 = _mm_unpackhi_epi32(*rgba1, *rgba2);   //a3,a2,a1,a0
    *rgba3 = _mm_unpacklo_epi32(*rgba1, *rgba2);   //b3,b2,b1,b0
    *rgba1 = _mm_unpacklo_epi32(temp1, temp2);     //r3,r2,r1,r0
    *rgba2 = _mm_unpackhi_epi32(temp1, temp2);     //g3,g2,g1,g0
}

SSP_FORCEINLINE
void ssp_convert_4p_4c_epi32_SSE2(__m128i *r,__m128i *g,__m128i *b,__m128i *a)
{
    __m128i temp1, temp2;

    temp1 = _mm_unpacklo_epi32(*r, *b);          //b1,r1,b0,r0
    *r    = _mm_unpackhi_epi32(*r, *b);          //b3,r3,b2,r2
    temp2 = _mm_unpacklo_epi32(*g, *a);          //a1,g1,a0,g0
    *g    = _mm_unpackhi_epi32(*g, *a);          //a3,g3,a2,g2

    *b    = _mm_unpacklo_epi32(*r, *g);          //a2,b2,g2,r2
    *a    = _mm_unpackhi_epi32(*r, *g);          //a3,b3,g3,r3
    *r    = _mm_unpacklo_epi32(temp1, temp2);    //a0,b0,g0,r0
    *g    = _mm_unpackhi_epi32(temp1, temp2);    //a1,b1,g1,r1
}
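
// Illustrative usage sketch added for clarity; it is not part of the original
// header, and the ssp_example_* name and the unaligned loads/stores are
// assumptions. For 32-bit data the 4-plane <-> 4-channel conversions are 4x4
// transposes; this re-interleaves four planes of four values into four RGBA pixels.
SSP_FORCEINLINE
void ssp_example_planes_to_rgba_epi32( const unsigned int *r, const unsigned int *g,
                                       const unsigned int *b, const unsigned int *a,
                                       unsigned int *rgba )
{
    __m128i vr = _mm_loadu_si128( (const __m128i*)r );            // R3..R0
    __m128i vg = _mm_loadu_si128( (const __m128i*)g );            // G3..G0
    __m128i vb = _mm_loadu_si128( (const __m128i*)b );            // B3..B0
    __m128i va = _mm_loadu_si128( (const __m128i*)a );            // A3..A0
    ssp_convert_4p_4c_epi32_SSE2( &vr, &vg, &vb, &va );            // vr,vg,vb,va = pixels 0,1,2,3
    _mm_storeu_si128( (__m128i*)(rgba +  0), vr );                 // R0 G0 B0 A0
    _mm_storeu_si128( (__m128i*)(rgba +  4), vg );                 // R1 G1 B1 A1
    _mm_storeu_si128( (__m128i*)(rgba +  8), vb );                 // R2 G2 B2 A2
    _mm_storeu_si128( (__m128i*)(rgba + 12), va );                 // R3 G3 B3 A3
}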


#endif // __SSEPLUS_CONVERT_SSE2_H__
