00001
00002
00003
00004
00005 #ifndef __SSEPLUS_CONVERT_SSE2_H__
00006 #define __SSEPLUS_CONVERT_SSE2_H__
00007
00008 #include "../native/SSEPlus_native_SSE2.h"
00009
00017 SSP_FORCEINLINE
00018 void ssp_convert_odd_even_epi16_SSE2( __m128i *a, __m128i *b )
00019 {
00020
00021
00022
00023
00024
00025
00026
00027
00028 __m128i A = *a;
00029 __m128i B = *b;
00030 __m128i ta, tb, odd, even;
00031
00032 ta = _mm_srai_epi32 ( A, 16 );
00033 tb = _mm_srai_epi32 ( B, 16 );
00034 odd = _mm_packs_epi32( ta, tb );
00035
00036 A = _mm_slli_si128 ( A, 2 );
00037 B = _mm_slli_si128 ( B, 2 );
00038 A = _mm_srai_epi32 ( A, 16 );
00039 B = _mm_srai_epi32 ( B, 16 );
00040 even = _mm_packs_epi32( A, B );
00041
00042 *a = even;
00043 *b = odd;
00044 }
00045
00046
00048 SSP_FORCEINLINE
00049 void ssp_convert_odd_even_ps_SSE2( __m128 *a, __m128 *b )
00050 {
00051
00052
00053
00054
00055
00056
00057
00058
00059 __m128 c, d;
00060 c = _mm_shuffle_ps( *a, *b, _MM_SHUFFLE(3,1,3,1) );
00061 d = _mm_shuffle_ps( *a, *b, _MM_SHUFFLE(2,0,2,0) );
00062 *a = c;
00063 *b = d;
00064 }
00065
00067 SSP_FORCEINLINE
00068 void ssp_convert_odd_even_epi32_SSE2( __m128i *a, __m128i *b )
00069 {
00070
00071
00072
00073
00074
00075
00076
00077
00078 ssp_m128 A,B;
00079 A.i = *a;
00080 B.i = *b;
00081
00082 ssp_convert_odd_even_ps_SSE2( &A.f, &B.f );
00083
00084 *a = A.i;
00085 *b = B.i;
00086 }
00087
00088
00089 SSP_FORCEINLINE
00090 void ssp_convert_3c_3p_epi8_SSE2( __m128i *rgb1, __m128i *rgb2, __m128i *rgb3)
00091 {
00092 __m128i temp1, temp2;
00093
00094
00095
00096
00097
00098 *rgb2 = _mm_shuffle_epi32(*rgb2, _MM_SHUFFLE(1,0,3,2));
00099 temp1 = _mm_unpacklo_epi8(*rgb1, *rgb2);
00100 temp2 = _mm_unpackhi_epi8(*rgb2, *rgb3);
00101 *rgb3 = _mm_slli_si128 (*rgb3, 8 );
00102 *rgb2 = _mm_unpackhi_epi8(*rgb1, *rgb3);
00103
00104 *rgb3 = _mm_shuffle_epi32(*rgb2, _MM_SHUFFLE(1,0,3,2));
00105 *rgb1 = _mm_unpacklo_epi8(temp1, *rgb3);
00106 temp1 = _mm_srli_si128 (temp1, 8 );
00107 temp1 = _mm_unpacklo_epi8(temp1, temp2);
00108 temp2 = _mm_unpackhi_epi8(*rgb3, temp2);
00109
00110 temp1 = _mm_shuffle_epi32(temp1, _MM_SHUFFLE(1,0,3,2));
00111 *rgb3 = _mm_unpackhi_epi8(temp1, temp2);
00112 temp2 = _mm_slli_si128 (temp2, 8 );
00113 temp2 = _mm_unpackhi_epi8(*rgb1, temp2);
00114 temp1 = _mm_unpacklo_epi8(*rgb1, temp1);
00115
00116 temp2 = _mm_shuffle_epi32(temp2, _MM_SHUFFLE(1,0,3,2));
00117 *rgb1 = _mm_unpacklo_epi8(temp1, temp2);
00118 temp1 = _mm_srli_si128 (temp1, 8 );
00119 *rgb2 = _mm_unpacklo_epi8(temp1, *rgb3);
00120 *rgb3 = _mm_unpackhi_epi8(temp2, *rgb3);
00121 }
00122
00123
00124
00125
00126
00127 SSP_FORCEINLINE
00128 void ssp_convert_reverse_transpose_SSE2( __m128i *a, __m128i *b, __m128i *c )
00129 {
00130 ssp_m128 A, B, C, T1, T2, T3;
00131 A.i = *a;
00132 B.i = *b;
00133 C.i = *c;
00134
00135 T1.f = _mm_shuffle_ps( C.f, A.f, _MM_SHUFFLE( 3,1,2,0) );
00136 T2.f = _mm_shuffle_ps( B.f, A.f, _MM_SHUFFLE( 2,0,2,0) );
00137 T3.f = _mm_shuffle_ps( C.f, B.f, _MM_SHUFFLE( 3,1,3,1) );
00138
00139 A.f = _mm_shuffle_ps( T2.f, T1.f, _MM_SHUFFLE( 2,0,0,2 ) );
00140 B.f = _mm_shuffle_ps( T3.f, T2.f, _MM_SHUFFLE( 1,3,0,2 ) );
00141 C.f = _mm_shuffle_ps( T1.f, T3.f, _MM_SHUFFLE( 1,3,3,1 ) );
00142
00143 *a = A.i;
00144 *b = B.i;
00145 *c = C.i;
00146 }
00147
00148
00149 SSP_FORCEINLINE
00150 void ssp_convert_3p_3c_epi8_SSE2( __m128i *r, __m128i *g, __m128i *b )
00151 {
00152 const static __m128i odd_8 = SSP_CONST_SET_8I( 0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0 );
00153 const static __m128i even_8 = SSP_CONST_SET_8I( 0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF );
00154
00155 const static __m128i odd_16 = SSP_CONST_SET_16I( 0xFFFF,0,0xFFFF,0,0xFFFF,0,0xFFFF,0 );
00156 const static __m128i even_16 = SSP_CONST_SET_16I( 0,0xFFFF,0,0xFFFF,0,0xFFFF,0,0xFFFF );
00157
00158 ssp_m128 T, RG, GB, BR, RGBR, GBRG, BRGB;
00159
00160 RG.i = _mm_and_si128 ( *r, even_8 );
00161 T.i = _mm_slli_epi16( *g, 8 );
00162 RG.i = _mm_or_si128 ( RG.i, T.i );
00163
00164 GB.i = _mm_srli_epi16( *g, 8 );
00165 T.i = _mm_and_si128 ( *b, odd_8 );
00166 GB.i = _mm_or_si128 ( GB.i, T.i );
00167
00168 BR.i = _mm_and_si128 ( *b, even_8 );
00169 T.i = _mm_and_si128 ( *r, odd_8 );
00170 BR.i = _mm_or_si128 ( BR.i, T.i );
00171
00172 RGBR.i = _mm_and_si128 ( RG.i, even_16 );
00173 T.i = _mm_slli_epi32( BR.i, 16 );
00174 RGBR.i = _mm_or_si128 ( RGBR.i, T.i );
00175
00176 GBRG.i = _mm_and_si128 ( GB.i, even_16 );
00177 T.i = _mm_and_si128 ( RG.i, odd_16 );
00178 GBRG.i = _mm_or_si128 ( GBRG.i, T.i );
00179
00180 BRGB.i = _mm_srli_epi32( BR.i, 16 );
00181 T.i = _mm_and_si128 ( GB.i, odd_16 );
00182 BRGB.i = _mm_or_si128 ( BRGB.i, T.i );
00183
00184 ssp_convert_reverse_transpose_SSE2( &RGBR.i, &GBRG.i, &BRGB.i );
00185
00186 *r = RGBR.i;
00187 *g = GBRG.i;
00188 *b = BRGB.i;
00189 }
00190
00191 SSP_FORCEINLINE
00192 void ssp_convert_3c_3p_epi16_SSE2(__m128i *rgb1,__m128i *rgb2,__m128i *rgb3)
00193 {
00194 __m128i temp1, temp2;
00195
00196 *rgb2 = _mm_shuffle_epi32(*rgb2, _MM_SHUFFLE(1,0,3,2));
00197 temp1 = _mm_unpacklo_epi16(*rgb1, *rgb2);
00198 temp2 = _mm_unpackhi_epi16(*rgb2, *rgb3);
00199 *rgb3 = _mm_slli_si128(*rgb3, 8);
00200 *rgb2 = _mm_unpackhi_epi16(*rgb1, *rgb3);
00201
00202 *rgb3 = _mm_shuffle_epi32(*rgb2, _MM_SHUFFLE(1,0,3,2));
00203 *rgb1 = _mm_unpacklo_epi16(temp1, *rgb3);
00204 temp1 = _mm_srli_si128(temp1, 8);
00205 temp1 = _mm_unpacklo_epi16(temp1, temp2);
00206 temp2 = _mm_unpackhi_epi16(*rgb3, temp2);
00207
00208 temp1 = _mm_shuffle_epi32(temp1, _MM_SHUFFLE(1,0,3,2));
00209 *rgb3 = _mm_unpackhi_epi16(temp1, temp2);
00210 temp2 = _mm_slli_si128(temp2, 8);
00211 *rgb2 = _mm_unpackhi_epi16(*rgb1, temp2);
00212 *rgb1 = _mm_unpacklo_epi16(*rgb1, temp1);
00213 }
00214
00215 SSP_FORCEINLINE
00216 void ssp_convert_3p_3c_epi16_SSE2(__m128i *r,__m128i *g,__m128i *b)
00217 {
00218 __m128i temp;
00219
00220 temp = _mm_srli_si128(*r, 8);
00221 *r = _mm_unpacklo_epi16(*r, temp);
00222 temp = _mm_srli_si128(*r, 8);
00223 *r = _mm_unpacklo_epi16(*r, temp);
00224
00225 temp = _mm_srli_si128(*g, 8);
00226 *g = _mm_unpacklo_epi16(*g, temp);
00227 temp = _mm_srli_si128(*g, 8);
00228 *g = _mm_unpacklo_epi16(*g, temp);
00229
00230 temp = _mm_srli_si128(*b, 8);
00231 *b = _mm_unpacklo_epi16(*b, temp);
00232 temp = _mm_srli_si128(*b, 8);
00233 *b = _mm_unpacklo_epi16(*b, temp);
00234
00235 temp = _mm_unpacklo_epi16(*r, *g);
00236 *r = _mm_srli_si128(*r , 8);
00237 *r = _mm_unpacklo_epi16(*b, *r);
00238 *g = _mm_unpackhi_epi16(*g, *b);
00239
00240 *b = _mm_srli_si128(*r, 8);
00241 *r = _mm_unpacklo_epi32(*r, *b);
00242 *b = _mm_srli_si128(*g, 8);
00243 *g = _mm_unpacklo_epi32(*g, *b);
00244 *b = _mm_srli_si128(temp, 8);
00245 temp = _mm_unpacklo_epi32(temp, *b);
00246
00247 *b = _mm_unpacklo_epi32(temp, *g);
00248 temp = _mm_srli_si128(temp, 8);
00249 temp = _mm_unpacklo_epi32(*r, temp);
00250 *g = _mm_unpackhi_epi32(*r, *g);
00251
00252 *r = _mm_unpacklo_epi32(*b, temp);
00253 temp = _mm_unpackhi_epi32(*b, temp);
00254 *b = _mm_unpackhi_epi64(temp, *g);
00255 *g = _mm_unpacklo_epi64(*g, temp);
00256 }
00257
00258 SSP_FORCEINLINE
00259 void ssp_convert_3c_3p_epi32_SSE2(__m128i *rgb1,__m128i *rgb2,__m128i *rgb3)
00260 {
00261 __m128i temp1, temp2;
00262
00263 *rgb2 = _mm_shuffle_epi32(*rgb2, _MM_SHUFFLE(1,0,3,2));
00264 temp1 = _mm_unpacklo_epi32(*rgb1, *rgb2);
00265 temp2 = _mm_unpackhi_epi32(*rgb2, *rgb3);
00266 *rgb3 = _mm_slli_si128(*rgb3, 8);
00267 *rgb2 = _mm_unpackhi_epi32(*rgb1, *rgb3);
00268
00269 *rgb3 = _mm_shuffle_epi32(*rgb2, _MM_SHUFFLE(1,0,3,2));
00270 *rgb1 = _mm_unpacklo_epi32(temp1, *rgb3);
00271 temp1 = _mm_srli_si128(temp1, 8);
00272 *rgb2 = _mm_unpacklo_epi32(temp1, temp2);
00273 *rgb3 = _mm_unpackhi_epi32(*rgb3, temp2);
00274 }
00275
00276 SSP_FORCEINLINE
00277 void ssp_convert_3p_3c_epi32_SSE2(__m128i *r,__m128i *g,__m128i *b)
00278 {
00279 __m128i temp;
00280
00281 temp = _mm_srli_si128(*r, 8);
00282 *r = _mm_unpacklo_epi32(*r, temp);
00283 temp = _mm_srli_si128(*g, 8);
00284 *g = _mm_unpacklo_epi32(*g, temp);
00285 temp = _mm_srli_si128(*b, 8);
00286 *b = _mm_unpacklo_epi32(*b, temp);
00287
00288 temp = _mm_unpacklo_epi32(*r, *g);
00289 *g = _mm_unpackhi_epi32(*g, *b);
00290 *r = _mm_srli_si128(*r, 8);
00291 *b = _mm_unpacklo_epi32(*b, *r);
00292
00293 *r = _mm_unpacklo_epi64(temp, *b);
00294 *b = _mm_unpackhi_epi64(*b, *g);
00295 *g = _mm_slli_si128(*g, 8);
00296 *g = _mm_unpackhi_epi64(*g, temp);
00297 }
00298
00299
00300 SSP_FORCEINLINE
00301 void ssp_convert_4c_4p_epi8_SSE2( __m128i *rgba1, __m128i *rgba2, __m128i *rgba3, __m128i *rgba4 )
00302 {
00303 __m128i temp1,temp2;
00304
00305 temp1 = _mm_unpacklo_epi8(*rgba1, *rgba3);
00306 *rgba1 = _mm_unpackhi_epi8(*rgba1, *rgba3);
00307 *rgba3 = _mm_unpacklo_epi8(*rgba2, *rgba4);
00308 temp2 = _mm_unpackhi_epi8(*rgba2, *rgba4);
00309
00310 *rgba4 = _mm_unpackhi_epi8(*rgba1, temp2);
00311 *rgba1 = _mm_unpacklo_epi8(*rgba1, temp2);
00312 *rgba2 = _mm_unpacklo_epi8(temp1, *rgba3);
00313 *rgba3 = _mm_unpackhi_epi8(temp1, *rgba3);
00314
00315 temp1 = _mm_unpacklo_epi8(*rgba3, *rgba4);
00316 *rgba3 = _mm_unpackhi_epi8(*rgba3, *rgba4);
00317 temp2 = _mm_unpackhi_epi8(*rgba2, *rgba1);
00318 *rgba2 = _mm_unpacklo_epi8(*rgba2, *rgba1);
00319
00320 *rgba1 = _mm_unpacklo_epi8(*rgba2, temp1);
00321 *rgba2 = _mm_unpackhi_epi8(*rgba2, temp1);
00322 *rgba4 = _mm_unpackhi_epi8(temp2, *rgba3);
00323 *rgba3 = _mm_unpacklo_epi8(temp2, *rgba3);
00324 }
00325
00326
00327 SSP_FORCEINLINE
00328 void ssp_convert_4p_4c_epi8_SSE2(__m128i *r,__m128i *g,__m128i *b,__m128i *a)
00329 {
00330 __m128i temp1, temp2;
00331
00332 temp1 = _mm_unpacklo_epi8(*r, *b);
00333 *r = _mm_unpackhi_epi8(*r, *b);
00334 temp2 = _mm_unpacklo_epi8(*g, *a);
00335 *g = _mm_unpackhi_epi8(*g, *a);
00336
00337 *b = _mm_unpacklo_epi8(*r, *g);
00338 *a = _mm_unpackhi_epi8(*r, *g);
00339 *r = _mm_unpacklo_epi8(temp1, temp2);
00340 *g = _mm_unpackhi_epi8(temp1, temp2);
00341 }
00342
00343 SSP_FORCEINLINE
00344 void ssp_convert_4c_4p_epi16_SSE2(__m128i *rgba1,__m128i *rgba2,__m128i *rgba3,__m128i *rgba4)
00345 {
00346 __m128i temp1, temp2;
00347
00348 temp1 = _mm_unpacklo_epi16(*rgba1, *rgba3);
00349 *rgba1 = _mm_unpackhi_epi16(*rgba1, *rgba3);
00350 *rgba3 = _mm_unpacklo_epi16(*rgba2, *rgba4);
00351 *rgba2 = _mm_unpackhi_epi16(*rgba2, *rgba4);
00352
00353 *rgba4 = _mm_unpackhi_epi16(*rgba1, *rgba2);
00354 *rgba1 = _mm_unpacklo_epi16(*rgba1, *rgba2);
00355 temp2 = _mm_unpacklo_epi16(temp1, *rgba3);
00356 temp1 = _mm_unpackhi_epi16(temp1, *rgba3);
00357
00358 *rgba3 = _mm_unpacklo_epi16(temp1, *rgba4);
00359 *rgba4 = _mm_unpackhi_epi16(temp1, *rgba4);
00360 *rgba2 = _mm_unpackhi_epi16(temp2, *rgba1);
00361 *rgba1 = _mm_unpacklo_epi16(temp2, *rgba1);
00362 }
00363
00364 SSP_FORCEINLINE
00365 void ssp_convert_4p_4c_epi16_SSE2(__m128i *r,__m128i *g,__m128i *b,__m128i *a)
00366 {
00367 __m128i temp1, temp2;
00368
00369 temp1 = _mm_unpacklo_epi16(*r, *b);
00370 *r = _mm_unpackhi_epi16(*r, *b);
00371 temp2 = _mm_unpacklo_epi16(*g, *a);
00372 *g = _mm_unpackhi_epi16(*g, *a);
00373
00374 *b = _mm_unpacklo_epi16(*r, *g);
00375 *a = _mm_unpackhi_epi16(*r, *g);
00376 *r = _mm_unpacklo_epi16(temp1, temp2);
00377 *g = _mm_unpackhi_epi16(temp1, temp2);
00378 }
00379
00380 SSP_FORCEINLINE
00381 void ssp_convert_4c_4p_epi32_SSE2(__m128i *rgba1,__m128i *rgba2,__m128i *rgba3, __m128i *rgba4)
00382 {
00383 __m128i temp1, temp2;
00384
00385 temp1 = _mm_unpacklo_epi32(*rgba1, *rgba3);
00386 *rgba1 = _mm_unpackhi_epi32(*rgba1, *rgba3);
00387 temp2 = _mm_unpacklo_epi32(*rgba2, *rgba4);
00388 *rgba2 = _mm_unpackhi_epi32(*rgba2, *rgba4);
00389
00390 *rgba4 = _mm_unpackhi_epi32(*rgba1, *rgba2);
00391 *rgba3 = _mm_unpacklo_epi32(*rgba1, *rgba2);
00392 *rgba1 = _mm_unpacklo_epi32(temp1, temp2);
00393 *rgba2 = _mm_unpackhi_epi32(temp1, temp2);
00394 }
00395
00396 SSP_FORCEINLINE
00397 void ssp_convert_4p_4c_epi32_SSE2(__m128i *r,__m128i *g,__m128i *b,__m128i *a)
00398 {
00399 __m128i temp1, temp2;
00400
00401 temp1 = _mm_unpacklo_epi32(*r, *b);
00402 *r = _mm_unpackhi_epi32(*r, *b);
00403 temp2 = _mm_unpacklo_epi32(*g, *a);
00404 *g = _mm_unpackhi_epi32(*g, *a);
00405
00406 *b = _mm_unpacklo_epi32(*r, *g);
00407 *a = _mm_unpackhi_epi32(*r, *g);
00408 *r = _mm_unpacklo_epi32(temp1, temp2);
00409 *g = _mm_unpackhi_epi32(temp1, temp2);
00410 }
00411
00412
00417 #endif // __SSEPLUS_CONVERT_SSE2_H__