00005 #ifndef __SSEPLUS_EMULATION_SSE2_H__
00006 #define __SSEPLUS_EMULATION_SSE2_H__
00007
00008 #include "../SSEPlus_SSE2.h"
00009 #include "../native/SSEPlus_native_SSE2.h"
00010 #include "../logical/SSEPlus_logical_SSE2.h"
00011 #include "../convert/SSEPlus_convert_SSE2.h"
00012 #include "../arithmetic/SSEPlus_arithmetic_SSE2.h"
00013 #include "SSEPlus_emulation_comps_SSE2.h"
00014
00015
00021
00022
00023
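/** Multiply-accumulate of packed signed 16-bit integers: each product is truncated to 16 bits and added to c. */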
00025 SSP_FORCEINLINE __m128i ssp_macc_epi16_SSE2( __m128i a, __m128i b, __m128i c )
00026 {
00027 a = _mm_mullo_epi16( a, b );
00028 a = _mm_add_epi16( a, c );
00029 return a;
00030 }
00031
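/** 32-bit multiply-accumulate: _mm_mul_epu32 covers lanes 0/2, the odd lanes are shifted down and multiplied separately, and the low 32 bits of each product are recombined before the add with c. */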
00033 SSP_FORCEINLINE __m128i ssp_macc_epi32_SSE2( __m128i a, __m128i b, __m128i c )
00034 {
00035 __m128i ab02, ab13, mask;
00036
00037 mask = _mm_set_epi32(0, 0xFFFFFFFF, 0, 0xFFFFFFFF);
00038 ab02 = _mm_mul_epu32(a, b);
00039 ab02 = _mm_and_si128(ab02, mask);
00040 a = _mm_srli_epi64(a, 32);
00041 b = _mm_srli_epi64(b, 32);
00042 ab13 = _mm_mul_epu32(a, b);
00043 ab13 = _mm_slli_epi64(ab13, 32);
00044
00045 a = _mm_add_epi32(ab02, ab13);
00046
00047 return _mm_add_epi32(a, c);
00048 }
00049
00051 SSP_FORCEINLINE __m128d ssp_macc_pd_SSE2(__m128d a, __m128d b, __m128d c)
00052 {
00053 a = _mm_mul_pd( a, b );
00054 a = _mm_add_pd( a, c );
00055 return a;
00056 }
00057
00059 SSP_FORCEINLINE __m128 ssp_macc_ps_SSE2( __m128 a, __m128 b, __m128 c )
00060 {
00061 a = _mm_mul_ps( a, b );
00062 a = _mm_add_ps( a, c );
00063 return a;
00064 }
00065
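/** Scalar-double multiply-accumulate: the packed result is merged with a so only the low element is replaced. */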
00067 SSP_FORCEINLINE __m128d ssp_macc_sd_SSE2(__m128d a, __m128d b, __m128d c)
00068 {
00069 const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0, 0 );
00070
00071 ssp_m128 A,B;
00072 A.d = a;
00073 B.d = b;
00074 B.d = ssp_macc_pd_SSE2( A.d, B.d, c );
00075 B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask );
00076 return B.d;
00077 }
00078
00080 SSP_FORCEINLINE __m128 ssp_macc_ss_SSE2(__m128 a, __m128 b, __m128 c)
00081 {
00082 const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0 );
00083
00084 ssp_m128 A,B;
00085 A.f = a;
00086 B.f = b;
00087 B.f = ssp_macc_ps_SSE2( A.f, B.f, c );
00088 B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask );
00089 return B.f;
00090 }
00091
00093 SSP_FORCEINLINE __m128i ssp_maccd_epi16_SSE2( __m128i a, __m128i b, __m128i c )
00094 {
00095 __m128i ab_lo, ab_hi;
00096 __m128i mask = _mm_set1_epi32(0xFFFF);
00097
00098 ab_lo = _mm_mullo_epi16(a, b);
00099 ab_hi = _mm_mulhi_epi16(a, b);
00100
00101 ab_lo = _mm_and_si128(ab_lo, mask);
00102 ab_hi = _mm_and_si128(ab_hi, mask);
00103 ab_hi = _mm_slli_epi32(ab_hi, 16);
00104 a = _mm_add_epi32( ab_lo, ab_hi );
00105 return _mm_add_epi32 (a, c);
00114 }
00115
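/** Signed 32x32->64 multiply-accumulate of the high 32-bit element of each 64-bit lane: operands are conditionally negated to absolute values so _mm_mul_epu32 can be used, and the product sign is restored before the 64-bit add with c. */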
00117 SSP_FORCEINLINE __m128i ssp_macchi_epi32_SSE2( __m128i a, __m128i b, __m128i c )
00118 {
00119 __m128i mask, mask_A, mask_B, mask_C, ab;
00120
00121 a = _mm_srli_epi64(a, 32);
00122 b = _mm_srli_epi64(b, 32);
00123 mask = _mm_set_epi32(0x7FFFFFFF, 0, 0x7FFFFFFF, 0);
00124
00125
00126 mask_A = _mm_cmplt_epi32( a, mask);
00127 a = _mm_xor_si128 ( a, mask_A );
00128 mask_C = _mm_srli_epi32( mask_A, 31 );
00129 a = _mm_add_epi32( a, mask_C );
00130
00131
00132 mask_B = _mm_cmplt_epi32( b, mask);
00133 b = _mm_xor_si128 ( b, mask_B );
00134 mask_C = _mm_srli_epi32( mask_B, 31 );
00135 b = _mm_add_epi32( b, mask_C );
00136
00137 ab = _mm_mul_epu32(a, b);
00138
00139
00140 mask_A = _mm_xor_si128(mask_A, mask_B);
00141 mask_C = _mm_srli_epi32(mask_A, 31 );
00142 mask_B = _mm_slli_epi64(mask_A, 32);
00143 mask = _mm_add_epi32(mask_A, mask_B);
00144 a = _mm_xor_si128(ab, mask);
00145 a = _mm_add_epi64(a, mask_C);
00146
00147 return _mm_add_epi64(a, c);
00148 }
00149
00151 SSP_FORCEINLINE __m128i ssp_macclo_epi32_SSE2( __m128i a, __m128i b, __m128i c )
00152 {
00153 __m128i mask, mask_A, mask_B, mask_C, ab;
00154
00155 mask = _mm_set_epi32(0x7FFFFFFF, 0, 0x7FFFFFFF, 0);
00156
00157 mask_A = _mm_cmplt_epi32( a, mask);
00158 a = _mm_xor_si128 ( a, mask_A );
00159 mask_C = _mm_srli_epi32( mask_A, 31 );
00160 a = _mm_add_epi32( a, mask_C );
00161
00162
00163 mask_B = _mm_cmplt_epi32( b, mask);
00164 b = _mm_xor_si128 ( b, mask_B );
00165 mask_C = _mm_srli_epi32( mask_B, 31 );
00166 b = _mm_add_epi32( b, mask_C );
00167
00168 ab = _mm_mul_epu32(a, b);
00169
00170
00171 mask_A = _mm_xor_si128(mask_A, mask_B);
00172 mask_C = _mm_srli_epi32(mask_A, 31 );
00173 mask_B = _mm_slli_epi64(mask_A, 32);
00174 mask = _mm_add_epi32(mask_A, mask_B);
00175 a = _mm_xor_si128(ab, mask);
00176 a = _mm_add_epi64(a, mask_C);
00177
00178 return _mm_add_epi64(a, c);
00179 }
00180
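/** Saturated 16-bit multiply-accumulate: the products and c are widened to 32 bits, added, then re-packed with signed saturation. */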
00182 SSP_FORCEINLINE __m128i ssp_maccs_epi16_SSE2( __m128i a, __m128i b, __m128i c )
00183 {
00184
00185 __m128i ablo, abhi, unlo, unhi, signC, clo, chi;
00186
00187 ablo = _mm_mullo_epi16( a, b );
00188 abhi = _mm_mulhi_epi16( a, b );
00189 unlo = _mm_unpacklo_epi16( ablo, abhi );
00190 unhi = _mm_unpackhi_epi16( ablo, abhi );
00191
00192
00193 signC = _mm_srai_epi16 (c, 15);
00194 chi = _mm_unpackhi_epi16(c, signC);
00195 clo = _mm_unpacklo_epi16(c, signC);
00196
00197 chi = _mm_add_epi32(chi, unhi);
00198 clo = _mm_add_epi32(clo, unlo);
00199
00200 return _mm_packs_epi32(clo, chi);
00201 }
00202
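/** Saturated 32-bit multiply-accumulate computed in double precision, clamped to the signed 32-bit range before converting back. */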
00204 SSP_FORCEINLINE __m128i ssp_maccs_epi32_SSE2( __m128i a, __m128i b, __m128i c )
00205 {
00206
00207 ssp_m128 s1lo,s1hi,s2lo,s2hi,s3lo,s3hi, sl, sh;
00208 static const __m128d max_val = {(double)0x7FFFFFFFl, (double)0x7FFFFFFFl};
00209 static const __m128d min_val = {(-(double)0x80000000l), (-(double)0x80000000l)};
00210
00211 s1lo.d = _mm_cvtepi32_pd(a);
00212 s1hi.d = _mm_cvtepi32_pd(_mm_srli_si128(a, 8));
00213
00214 s2lo.d = _mm_cvtepi32_pd(b);
00215 s2hi.d = _mm_cvtepi32_pd(_mm_srli_si128(b,8));
00216
00217 s1lo.d = _mm_mul_pd(s1lo.d,s2lo.d);
00218 s1hi.d = _mm_mul_pd(s1hi.d,s2hi.d);
00219
00220 s3lo.d = _mm_cvtepi32_pd(c);
00221 s3hi.d = _mm_cvtepi32_pd(_mm_srli_si128(c,8));
00222
00223 s1lo.d = _mm_add_pd(s1lo.d,s3lo.d);
00224 s1hi.d = _mm_add_pd(s1hi.d,s3hi.d);
00225
00226 sl.d = _mm_min_pd(s1lo.d, max_val);
00227 sl.d = _mm_max_pd(sl.d, min_val);
00228
00229 sh.d = _mm_min_pd(s1hi.d, max_val);
00230 sh.d = _mm_max_pd(sh.d, min_val);
00231
00232 sl.i = _mm_cvtpd_epi32(sl.d);
00233 sh.i = _mm_cvtpd_epi32(sh.d);
00234
00235 sh.i = _mm_slli_si128(sh.i, 8);
00236 sl.i = _mm_or_si128(sl.i, sh.i);
00237
00238 return sl.i;
00239 }
00240
00241
00242
00243
00245 SSP_FORCEINLINE __m128 ssp_nmacc_ps_SSE2( __m128 a, __m128 b, __m128 c )
00246 {
00247 const static __m128 neg1 = SSP_CONST_SET_32F( -1.0f, -1.0f, -1.0f, -1.0f );
00248
00249 a = _mm_mul_ps( a, b );
00250 a = _mm_mul_ps( a, neg1 );
00251 a = _mm_add_ps( a, c );
00252 return a;
00253 }
00254
00256 SSP_FORCEINLINE __m128d ssp_nmacc_pd_SSE2(__m128d a, __m128d b, __m128d c)
00257 {
00258 const static __m128d neg1 = SSP_CONST_SET_64F( -1.0, -1.0 );
00259
00260 a = _mm_mul_pd( a, b );
00261 a = _mm_mul_pd( a, neg1 );
00262 a = _mm_add_pd( a, c );
00263 return a;
00264 }
00265
00267 SSP_FORCEINLINE __m128 ssp_nmacc_ss_SSE2(__m128 a, __m128 b, __m128 c)
00268 {
00269 const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0 );
00270
00271 ssp_m128 A,B;
00272 A.f = a;
00273 B.f = b;
00274 B.f = ssp_nmacc_ps_SSE2( A.f, B.f, c );
00275 B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask );
00276 return B.f;
00277 }
00278
00280 SSP_FORCEINLINE __m128d ssp_nmacc_sd_SSE2(__m128d a, __m128d b, __m128d c)
00281 {
00282 const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0, 0 );
00283
00284 ssp_m128 A,B;
00285 A.d = a;
00286 B.d = b;
00287 B.d = ssp_nmacc_pd_SSE2( A.d, B.d, c );
00288 B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask );
00289 return B.d;
00290 }
00291
00292
00293
00294
00295
00297 SSP_FORCEINLINE __m128 ssp_msub_ps_SSE2(__m128 a, __m128 b, __m128 c)
00298 {
00299 a = _mm_mul_ps( a, b );
00300 a = _mm_sub_ps( a, c );
00301 return a;
00302 }
00303
00305 SSP_FORCEINLINE __m128d ssp_msub_pd_SSE2(__m128d a, __m128d b, __m128d c)
00306 {
00307 a = _mm_mul_pd( a, b );
00308 a = _mm_sub_pd( a, c );
00309 return a;
00310 }
00311
00313 SSP_FORCEINLINE __m128 ssp_msub_ss_SSE2(__m128 a, __m128 b, __m128 c)
00314 {
00315 const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0 );
00316
00317 ssp_m128 A,B;
00318 A.f = a;
00319 B.f = b;
00320 B.f = ssp_msub_ps_SSE2( A.f, B.f, c );
00321 B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask );
00322 return B.f;
00323 }
00324
00326 SSP_FORCEINLINE __m128d ssp_msub_sd_SSE2(__m128d a, __m128d b, __m128d c)
00327 {
00328 const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0, 0 );
00329
00330 ssp_m128 A,B;
00331 A.d = a;
00332 B.d = b;
00333 B.d = ssp_msub_pd_SSE2( A.d, B.d, c );
00334 B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask );
00335 return B.d;
00336 }
00337
00338
00339
00340
00341
00343 SSP_FORCEINLINE __m128 ssp_nmsub_ps_SSE2(__m128 a, __m128 b, __m128 c)
00344 {
00345 const static __m128 neg1 = SSP_CONST_SET_32F( -1.0f, -1.0f, -1.0f, -1.0f );
00346
00347 a = _mm_mul_ps( a, b );
00348 a = _mm_mul_ps( a, neg1 );
00349 a = _mm_sub_ps( a, c );
00350 return a;
00351 }
00352
00354 SSP_FORCEINLINE __m128d ssp_nmsub_pd_SSE2(__m128d a, __m128d b, __m128d c)
00355 {
00356 const static __m128d neg1 = SSP_CONST_SET_64F( -1.0, -1.0 );
00357
00358 a = _mm_mul_pd( a, b );
00359 a = _mm_mul_pd( a, neg1 );
00360 a = _mm_sub_pd( a, c );
00361 return a;
00362 }
00363
00365 SSP_FORCEINLINE __m128 ssp_nmsub_ss_SSE2(__m128 a, __m128 b, __m128 c)
00366 {
00367 const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0 );
00368
00369 ssp_m128 A,B;
00370 A.f = a;
00371 B.f = b;
00372 B.f = ssp_nmsub_ps_SSE2( A.f, B.f, c );
00373 B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask );
00374 return B.f;
00375 }
00376
00378 SSP_FORCEINLINE __m128d ssp_nmsub_sd_SSE2(__m128d a, __m128d b, __m128d c)
00379 {
00380 const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0, 0 );
00381
00382 ssp_m128 A,B;
00383 A.d = a;
00384 B.d = b;
00385 B.d = ssp_nmsub_pd_SSE2( A.d, B.d, c );
00386 B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask );
00387 return B.d;
00388 }
00389
00390
00391
00392
00393
00394
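/** Two's-complement absolute values: XOR with the sign mask, then add one in the negative lanes. */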
00396 SSP_FORCEINLINE
00397 __m128i ssp_abs_epi8_SSE2 (__m128i a)
00398 {
00399 __m128i mask = _mm_cmplt_epi8( a, _mm_setzero_si128() );
00400 __m128i one = _mm_set1_epi8(1);
00401 a = _mm_xor_si128 ( a, mask );
00402 mask = _mm_and_si128 ( mask, one );
00403 a = _mm_add_epi8 ( a, mask );
00404 return a;
00405 }
00406
00408 SSP_FORCEINLINE
00409 __m128i ssp_abs_epi16_SSE2 (__m128i a)
00410 {
00411 __m128i mask = _mm_cmplt_epi16( a, _mm_setzero_si128() );
00412 a = _mm_xor_si128 ( a, mask );
00413 mask = _mm_srli_epi16( mask, 15 );
00414 a = _mm_add_epi16 ( a, mask );
00415 return a;
00416 }
00417
00419 SSP_FORCEINLINE
00420 __m128i ssp_abs_epi32_SSE2 (__m128i a)
00421 {
00422 __m128i mask = _mm_cmplt_epi32( a, _mm_setzero_si128() );
00423 a = _mm_xor_si128 ( a, mask );
00424 mask = _mm_srli_epi32( mask, 31 );
00425 a = _mm_add_epi32( a, mask );
00426 return a;
00427 }
00428
00429
00431 SSP_FORCEINLINE
00432 __m128 ssp_addsub_ps_SSE2(__m128 a, __m128 b)
00433 {
00434 const static __m128 neg = SSP_CONST_SET_32F( 1, -1, 1, -1 );
00435
00436 b = _mm_mul_ps( b, neg );
00437 a = _mm_add_ps( a, b );
00438 return a;
00439 }
00440
00442 SSP_FORCEINLINE
00443 __m128d ssp_addsub_pd_SSE2(__m128d a, __m128d b)
00444 {
00445 const static __m128d const_addSub_pd_neg = SSP_CONST_SET_64F( 1, -1 );
00446
00447 b = _mm_mul_pd( b, const_addSub_pd_neg );
00448 a = _mm_add_pd( a, b );
00449 return a;
00450 }
00451
00452
00453
00454
00455
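/** Immediate blend: each mask bit is moved into the sign position of its word with a multiply, then arithmetic-shifted into a full-width select mask. */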
00457 SSP_FORCEINLINE
00458 __m128i ssp_blend_epi16_SSE2( __m128i a, __m128i b, const int mask )
00459 {
00460 __m128i screen;
00461 const static __m128i mulShiftImm = SSP_CONST_SET_16I( 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, 0x8000 );
00462
00463 screen = _mm_set1_epi16 ( mask );
00464 screen = _mm_mullo_epi16 ( screen, mulShiftImm );
00465 screen = _mm_srai_epi16 ( screen, 15 );
00466 b = _mm_and_si128 ( screen, b );
00467 a = _mm_andnot_si128( screen, a );
00468 a = _mm_or_si128 ( a, b );
00469 return a;
00470 }
00471
00473 SSP_FORCEINLINE
00474 __m128d ssp_blend_pd_SSE2( __m128d a, __m128d b, const int mask )
00475 {
00476 __m128d screen;
00477 screen = _mm_set_pd( (mask&0x2)>>1, mask&0x1 );
00478 b = _mm_mul_pd( b, screen );
00479 screen = _mm_set_pd( (~mask&0x2)>>1, (~mask&0x1) );
00480 a = _mm_mul_pd( a, screen );
00481 a = _mm_or_pd ( a, b );
00482 return a;
00483 }
00484
00486 SSP_FORCEINLINE
00487 __m128 ssp_blend_ps_SSE2( __m128 a, __m128 b, const int mask )
00488 {
00489 ssp_m128 screen, A, B;
00490 A.f = a;
00491 B.f = b;
00492 screen.i = ssp_movmask_imm8_to_epi32_SSE2( mask );
00493 screen.i = ssp_logical_bitwise_select_SSE2( B.i, A.i, screen.i );
00494 return screen.f;
00495 }
00496
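/** Variable byte blend on the sign bit of each mask byte: the mask is widened, arithmetic-shifted, and re-packed with unsigned saturation into a 0xFF/0x00 select mask. */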
00498 SSP_FORCEINLINE
00499 __m128i ssp_blendv_epi8_SSE2( __m128i a, __m128i b, __m128i mask )
00500 {
00501 __m128i mHi, mLo;
00502 __m128i zero = _mm_setzero_si128 ();
00503
00504 mHi = _mm_unpacklo_epi8( zero, mask );
00505 mHi = _mm_srai_epi16 ( mHi, 15 );
00506 mHi = _mm_srli_epi16 ( mHi, 1 );
00507
00508 mLo = _mm_unpackhi_epi8( zero, mask );
00509 mLo = _mm_srai_epi16 ( mLo, 15 );
00510 mLo = _mm_srli_epi16 ( mLo, 1 );
00511
00512 mHi = _mm_packus_epi16 ( mHi, mLo );
00513
00514 b = _mm_and_si128 ( b, mHi );
00515 a = _mm_andnot_si128 ( mHi, a );
00516 a = _mm_or_si128 ( a, b );
00517 return a;
00518 }
00519
00521 SSP_FORCEINLINE __m128d ssp_blendv_pd_SSE2( __m128d a, __m128d b, __m128d mask )
00522 {
00523 ssp_m128 A, B, Mask;
00524 A.d = a;
00525 B.d = b;
00526 Mask.d = mask;
00527
00528 Mask.i = _mm_shuffle_epi32( Mask.i, _MM_SHUFFLE(3, 3, 1, 1) );
00529 Mask.i = _mm_srai_epi32 ( Mask.i, 31 );
00530
00531 B.i = _mm_and_si128( B.i, Mask.i );
00532 A.i = _mm_andnot_si128( Mask.i, A.i );
00533 A.i = _mm_or_si128( A.i, B.i );
00534 return A.d;
00535 }
00537 SSP_FORCEINLINE __m128 ssp_blendv_ps_SSE2( __m128 a, __m128 b, __m128 mask )
00538 {
00539 ssp_m128 A, B, Mask;
00540 A.f = a;
00541 B.f = b;
00542 Mask.f = mask;
00543
00544 Mask.i = _mm_srai_epi32( Mask.i, 31 );
00545 B.i = _mm_and_si128( B.i, Mask.i );
00546 A.i = _mm_andnot_si128( Mask.i, A.i );
00547 A.i = _mm_or_si128( A.i, B.i );
00548 return A.f;
00549 }
00550
00551
00552
00553
00554
00556 SSP_FORCEINLINE
00557 __m128i ssp_cmpeq_epi64_SSE2( __m128i a, __m128i b )
00558 {
00559 return ssp_comeq_epi64_SSE2( a, b );
00560 }
00561
00562
00563
00564
00565
00566
00568 SSP_FORCEINLINE
00569 __m128i ssp_hadd_epi16_SSE2( __m128i a, __m128i b )
00570 {
00571 ssp_convert_odd_even_epi16_SSE2( &a, &b );
00572 a = _mm_add_epi16( a, b );
00573 return a;
00574 }
00575
00577 SSP_FORCEINLINE __m128i ssp_hadds_epi16_SSE2 ( __m128i a, __m128i b )
00578 {
00579 ssp_convert_odd_even_epi16_SSE2( &a, &b );
00580 a = _mm_adds_epi16( a, b );
00581 return a;
00582 }
00583
00584
00586 SSP_FORCEINLINE
00587 __m128i ssp_hsub_epi16_SSE2 ( __m128i a, __m128i b )
00588 {
00589 ssp_convert_odd_even_epi16_SSE2( &a, &b );
00590 a = _mm_sub_epi16( a, b );
00591 return a;
00592 }
00593
00595 SSP_FORCEINLINE
00596 __m128i ssp_hsubs_epi16_SSE2 ( __m128i a, __m128i b )
00597 {
00598 ssp_convert_odd_even_epi16_SSE2( &a, &b );
00599 a = _mm_subs_epi16( a, b );
00600 return a;
00601 }
00602
00603
00604
00606 SSP_FORCEINLINE __m128i ssp_hadd_epi32_SSE2( __m128i a, __m128i b )
00607 {
00608 ssp_convert_odd_even_epi32_SSE2( &a, &b );
00609 a = _mm_add_epi32( a, b );
00610 return a;
00611 }
00612
00614 SSP_FORCEINLINE __m128i ssp_hsub_epi32_SSE2 ( __m128i a, __m128i b )
00615 {
00616 ssp_convert_odd_even_epi32_SSE2( &a, &b );
00617 a = _mm_sub_epi32( b, a );
00618 return a;
00619 }
00620
00621
00623 SSP_FORCEINLINE
00624 __m128 ssp_hadd_ps_SSE2(__m128 a, __m128 b)
00625 {
00626 ssp_convert_odd_even_ps_SSE2( &a, &b );
00627 a = _mm_add_ps( a, b );
00628 return a;
00629 }
00630
00632 SSP_FORCEINLINE
00633 __m128 ssp_hsub_ps_SSE2(__m128 a, __m128 b)
00634 {
00635 ssp_convert_odd_even_ps_SSE2( &a, &b );
00636 a = _mm_sub_ps( b, a );
00637 return a;
00638 }
00639
00640
00642 SSP_FORCEINLINE
00643 __m128d ssp_hadd_pd_SSE2(__m128d a, __m128d b)
00644 {
00645 ssp_m128 A,B,C;
00646 A.d = a;
00647 C.d = a;
00648 B.d = b;
00649
00650 A.f = _mm_movelh_ps( A.f, B.f );
00651 B.f = _mm_movehl_ps( B.f, C.f );
00652 A.d = _mm_add_pd ( A.d, B.d );
00653 return A.d;
00654 }
00655
00657 SSP_FORCEINLINE
00658 __m128d ssp_hsub_pd_SSE2(__m128d a, __m128d b)
00659 {
00660 ssp_m128 A,B,C;
00661 A.d = a;
00662 C.d = a;
00663 B.d = b;
00664
00665 A.f = _mm_movelh_ps( A.f, B.f );
00666 B.f = _mm_movehl_ps( B.f, C.f );
00667 A.d = _mm_sub_pd ( A.d, B.d );
00668 return A.d;
00669 }
00670
00671
00672
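/** Rounded high multiply: (a*b + 0x4000) >> 15 computed in 32 bits, then re-packed with signed saturation. */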
00674 SSP_FORCEINLINE __m128i ssp_mulhrs_epi16_SSE2( __m128i a, __m128i b )
00675 {
00676 const static __m128i VAL = SSP_CONST_SET_32I( 0x4000, 0x4000, 0x4000, 0x4000 );
00677 __m128i c,d;
00678
00679 c = _mm_mullo_epi16( a, b );
00680 d = _mm_mulhi_epi16( a, b );
00681
00682 a = _mm_unpackhi_epi16( c, d );
00683 b = _mm_unpacklo_epi16( c, d );
00684
00685 a = _mm_add_epi32( a, VAL );
00686 b = _mm_add_epi32( b, VAL );
00687
00688 a = _mm_srai_epi32( a, 15 );
00689 b = _mm_srai_epi32( b, 15 );
00690
00691 a = _mm_packs_epi32( b, a );
00692 return a;
00693 }
00694
00695
00697 SSP_FORCEINLINE
00698 __m128i ssp_insert_epi32_SSE2( __m128i a, int b, const int ndx )
00699 {
00700 switch( ndx & 0x3 )
00701 {
00702 case 0: a = _mm_insert_epi16( a, b , 0 );
00703 a = _mm_insert_epi16( a, b>>16, 1 ); break;
00704 case 1: a = _mm_insert_epi16( a, b , 2 );
00705 a = _mm_insert_epi16( a, b>>16, 3 ); break;
00706 case 2: a = _mm_insert_epi16( a, b , 4 );
00707 a = _mm_insert_epi16( a, b>>16, 5 ); break;
00708 case 3: a = _mm_insert_epi16( a, b , 6 );
00709 a = _mm_insert_epi16( a, b>>16, 7 ); break;
00710 }
00711 return a;
00712 }
00713
00714
00715
00716
00717
00719 SSP_FORCEINLINE
00720 __m128i ssp_min_epi8_SSE2( __m128i a, __m128i b )
00721 {
00722 __m128i mask = _mm_cmplt_epi8( a, b );
00723 a = ssp_logical_bitwise_select_SSE2( a, b, mask );
00724 return a;
00725 }
00726
00728 SSP_FORCEINLINE
00729 __m128i ssp_max_epi8_SSE2( __m128i a, __m128i b )
00730 {
00731 __m128i mask = _mm_cmpgt_epi8( a, b );
00732 a = ssp_logical_bitwise_select_SSE2( a, b, mask );
00733 return a;
00734 }
00735
00737 SSP_FORCEINLINE
00738 __m128i ssp_min_epu16_SSE2( __m128i a, __m128i b )
00739 {
00740 __m128i mask = ssp_comlt_epu16_SSE2( a, b );
00741 a = ssp_logical_bitwise_select_SSE2( a, b, mask );
00742 return a;
00743 }
00744
00746 SSP_FORCEINLINE
00747 __m128i ssp_max_epu16_SSE2( __m128i a, __m128i b )
00748 {
00749 __m128i mask = ssp_comgt_epu16_SSE2( a, b );
00750 a = ssp_logical_bitwise_select_SSE2( a, b, mask );
00751 return a;
00752 }
00753
00755 SSP_FORCEINLINE
00756 __m128i ssp_min_epi32_SSE2( __m128i a, __m128i b )
00757 {
00758 __m128i mask = _mm_cmplt_epi32( a, b );
00759 a = ssp_logical_bitwise_select_SSE2( a, b, mask );
00760 return a;
00761 }
00762
00764 SSP_FORCEINLINE
00765 __m128i ssp_max_epi32_SSE2( __m128i a, __m128i b )
00766 {
00767 __m128i mask = _mm_cmpgt_epi32( a, b );
00768 a = ssp_logical_bitwise_select_SSE2( a, b, mask );
00769 return a;
00770 }
00771
00773 SSP_FORCEINLINE
00774 __m128i ssp_min_epu32_SSE2 ( __m128i a, __m128i b )
00775 {
00776 __m128i mask = ssp_comlt_epu32_SSE2( a, b );
00777 a = ssp_logical_bitwise_select_SSE2( a, b, mask );
00778 return a;
00779 }
00780
00782 SSP_FORCEINLINE
00783 __m128i ssp_max_epu32_SSE2 ( __m128i a, __m128i b )
00784 {
00785 __m128i mask = ssp_comgt_epu32_SSE2( a, b );
00786 a = ssp_logical_bitwise_select_SSE2( a, b, mask );
00787 return a;
00788 }
00789
00790
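/** Multiply unsigned bytes of a by signed bytes of b and add adjacent product pairs with signed saturation, using separate odd/even 16-bit multiplies. */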
00799 SSP_FORCEINLINE __m128i ssp_maddubs_epi16_SSE2( __m128i a, __m128i b)
00800 {
00801 const static __m128i EVEN_8 = SSP_CONST_SET_8I( 0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF);
00802 __m128i Aodd, Aeven, Beven, Bodd;
00803
00804
00805 Aodd = _mm_srli_epi16( a, 8 );
00806 Bodd = _mm_srai_epi16( b, 8 );
00807
00808 Aeven = _mm_and_si128 ( a, EVEN_8 );
00809 Beven = _mm_slli_si128( b, 1 );
00810 Beven = _mm_srai_epi16( Beven, 8 );
00811
00812 a = _mm_mullo_epi16( Aodd , Bodd );
00813 b = _mm_mullo_epi16( Aeven, Beven );
00814 a = _mm_adds_epi16 ( a, b );
00815 return a;
00816 }
00817
00818
00819
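/** MPSADBW emulation: the 4-byte block of b selected by msk is compared with a at eight successive byte offsets; each pass below produces two of the eight sums of absolute differences. */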
00821 SSP_FORCEINLINE
00822 __m128i ssp_mpsadbw_epu8_SSE2 ( __m128i a, __m128i b, const int msk )
00823 {
00824 const static __m128i MASK_BITS04 = SSP_CONST_SET_16I( 0,0,0,0xFFFF,0,0,0,0xFFFF );
00825 const static __m128i MASK_BITS15 = SSP_CONST_SET_16I( 0,0,0xFFFF,0,0,0,0xFFFF,0 );
00826 const static __m128i MASK_BITS26 = SSP_CONST_SET_16I( 0,0xFFFF,0,0,0,0xFFFF,0,0 );
00827 const static __m128i MASK_BITS37 = SSP_CONST_SET_16I( 0xFFFF,0,0,0,0xFFFF,0,0,0 );
00828
00829 ssp_m128 A,B,A16,tmp,out;
00830 A.i = a;
00831 B.i = b;
00832
00833 switch( msk & 0x4 )
00834 {
00835 case 4: A.i = _mm_srli_si128( A.i, 4 );
00836 }
00837
00838 switch( (msk & 0x3) * 4 )
00839 {
00840 case 0: B.i = _mm_shuffle_epi32( B.i, _MM_SHUFFLE(0,0,0,0) ); break;
00841 case 4: B.i = _mm_shuffle_epi32( B.i, _MM_SHUFFLE(1,1,1,1) ); break;
00842 case 8: B.i = _mm_shuffle_epi32( B.i, _MM_SHUFFLE(2,2,2,2) ); break;
00843 case 12: B.i = _mm_shuffle_epi32( B.i, _MM_SHUFFLE(3,3,3,3) ); break;
00844
00845 }
00846
00847
00848 B.i = _mm_unpacklo_epi8( B.i, _mm_setzero_si128() );
00849 A16.i = _mm_unpacklo_epi8( A.i, _mm_setzero_si128() );
00850 tmp.i = _mm_subs_epi16 ( A16.i, B.i );
00851 tmp.i = ssp_abs_epi16_SSE2 ( tmp.i );
00852 tmp.i = ssp_arithmetic_hadd4_epi16_SSE2( tmp.i, 0 );
00853 tmp.i = _mm_and_si128 ( tmp.i, MASK_BITS04 );
00854 out.i = tmp.i;
00855
00856
00857 A16.i = _mm_srli_si128 ( A.i, 1 );
00858 A16.i = _mm_unpacklo_epi8( A16.i, _mm_setzero_si128() );
00859 tmp.i = _mm_subs_epi16 ( A16.i, B.i );
00860 tmp.i = ssp_abs_epi16_SSE2 ( tmp.i );
00861 tmp.i = ssp_arithmetic_hadd4_epi16_SSE2( tmp.i, 1 );
00862 tmp.i = _mm_and_si128 ( tmp.i, MASK_BITS15 );
00863 out.i = _mm_or_si128 ( out.i, tmp.i );
00864
00865
00866 A16.i = _mm_srli_si128 ( A.i, 2 );
00867 A16.i = _mm_unpacklo_epi8( A16.i, _mm_setzero_si128() );
00868 tmp.i = _mm_subs_epi16 ( A16.i, B.i );
00869 tmp.i = ssp_abs_epi16_SSE2 ( tmp.i );
00870 tmp.i = ssp_arithmetic_hadd4_epi16_SSE2( tmp.i, 2 );
00871 tmp.i = _mm_and_si128 ( tmp.i, MASK_BITS26 );
00872 out.i = _mm_or_si128 ( out.i, tmp.i );
00873
00874
00875 A16.i = _mm_srli_si128 ( A.i, 3 );
00876 A16.i = _mm_unpacklo_epi8( A16.i, _mm_setzero_si128() );
00877 tmp.i = _mm_subs_epi16 ( A16.i, B.i );
00878 tmp.i = ssp_abs_epi16_SSE2 ( tmp.i );
00879 tmp.i = ssp_arithmetic_hadd4_epi16_SSE2( tmp.i, 3 );
00880 tmp.i = _mm_and_si128 ( tmp.i, MASK_BITS37 );
00881 out.i = _mm_or_si128 ( out.i, tmp.i );
00882
00883 return out.i;
00884 }
00885
00886
00887
00888
00889
00890
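/** Double-precision dot product: the mask bits are expanded into full-width lane masks with a multiply trick, the selected products are summed, and the result is written only to the lanes enabled by the low mask bits. */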
00892 SSP_FORCEINLINE
00893 __m128d ssp_dp_pd_SSE2( __m128d a, __m128d b, const int mask )
00894 {
00895 int smallMask = (mask & 0x33)<<16;
00896 const static __m128i mulShiftImm_01 = SSP_CONST_SET_32I( 0x40000000, 0x40000000, 0x80000000, 0x80000000 );
00897 const static __m128i mulShiftImm_45 = SSP_CONST_SET_32I( 0x04000000, 0x04000000, 0x08000000, 0x08000000 );
00898 ssp_m128 mHi, mLo;
00899
00900 mLo.i = _mm_set1_epi32( smallMask );
00901 mHi.i = _mm_mullo_epi16( mLo.i, mulShiftImm_01 );
00902 mLo.i = _mm_mullo_epi16( mLo.i, mulShiftImm_45 );
00903
00904 mHi.i = _mm_cmplt_epi32( mHi.i, _mm_setzero_si128() );
00905 mLo.i = _mm_cmplt_epi32( mLo.i, _mm_setzero_si128() );
00906
00907 a = _mm_and_pd( a, mHi.d );
00908 a = _mm_mul_pd( a, b );
00909
00910 b = _mm_shuffle_pd( a, a, _MM_SHUFFLE2(0, 1) );
00911 a = _mm_add_pd( a, b );
00912 a = _mm_and_pd( a, mLo.d );
00913 return a;
00914 }
00915
00917 SSP_FORCEINLINE
00918 __m128 ssp_dp_ps_SSE2( __m128 a, __m128 b, const int mask )
00919 {
00920 const static __m128i mulShiftImm_0123 = SSP_CONST_SET_32I( 0x010000, 0x020000, 0x040000, 0x080000 );
00921 const static __m128i mulShiftImm_4567 = SSP_CONST_SET_32I( 0x100000, 0x200000, 0x400000, 0x800000 );
00922
00923
00924 ssp_m128 mHi, mLo;
00925 mLo.i = _mm_set1_epi32( mask );
00926 mLo.i = _mm_slli_si128( mLo.i, 3 );
00927
00928 mHi.i = _mm_mullo_epi16( mLo.i, mulShiftImm_0123 );
00929 mLo.i = _mm_mullo_epi16( mLo.i, mulShiftImm_4567 );
00930
00931 mHi.i = _mm_cmplt_epi32( mHi.i, _mm_setzero_si128() );
00932 mLo.i = _mm_cmplt_epi32( mLo.i, _mm_setzero_si128() );
00933
00934
00935 a = _mm_and_ps( a, mHi.f );
00936 a = _mm_mul_ps( a, b );
00937
00938 a = ssp_arithmetic_hadd4_dup_ps_SSE2( a );
00939 a = _mm_and_ps( a, mLo.f );
00940 return a;
00941 }
00942
00949 SSP_FORCEINLINE
00950 __m128 ssp_round_ps_SSE2( __m128 a, int iRoundMode )
00951 {
00952 #pragma message( "" WARN() "SSEPlus SSE2 rounding functions overflow if the input is outside the 32-bit integer range" )
00953
00954 enum ENUM_MXCSR
00955 {
00956 CSR_ROUND_TO_EVEN = 0x00001F80,
00957 CSR_ROUND_DOWN = 0x00003F80,
00958 CSR_ROUND_UP = 0x00005F80,
00959 CSR_ROUND_TRUNC = 0x00007F80,
00960 };
00961
00962 ssp_u32 bak = _mm_getcsr();
00963 ssp_m128 A, i;
00964 A.f = a;
00965
00966 switch( iRoundMode & 0x7 )
00967 {
00968 case SSP_FROUND_CUR_DIRECTION: break;
00969 case SSP_FROUND_TO_ZERO: _mm_setcsr( CSR_ROUND_TRUNC ); break;
00970 case SSP_FROUND_TO_POS_INF: _mm_setcsr( CSR_ROUND_UP ); break;
00971 case SSP_FROUND_TO_NEG_INF: _mm_setcsr( CSR_ROUND_DOWN ); break;
00972 default: _mm_setcsr( CSR_ROUND_TO_EVEN); break;
00973 }
00974
00975 i.i = _mm_cvtps_epi32( A.f );
00976 A.f = _mm_cvtepi32_ps( i.i );
00977
00979 _mm_setcsr( bak );
00980 return A.f;
00981 }
00982
00983
00984 SSP_FORCEINLINE
00985 __m128d ssp_round_pd_SSE2( __m128d a, int iRoundMode )
00986 {
00987 #pragma message( "" WARN() "SSEPlus SSE2 rounding functions overflow if the input is outside the 32-bit integer range" )
00988
00989 enum ENUM_MXCSR
00990 {
00991 CSR_ROUND_TO_EVEN = 0x00001F80,
00992 CSR_ROUND_DOWN = 0x00003F80,
00993 CSR_ROUND_UP = 0x00005F80,
00994 CSR_ROUND_TRUNC = 0x00007F80,
00995 };
00996
00997 ssp_u32 bak = _mm_getcsr();
00998 ssp_m128 A, i;
00999 A.d = a;
01000
01001
01002 switch( iRoundMode & 0x7 )
01003 {
01004 case SSP_FROUND_CUR_DIRECTION: break;
01005 case SSP_FROUND_TO_ZERO: _mm_setcsr( CSR_ROUND_TRUNC ); break;
01006 case SSP_FROUND_TO_POS_INF: _mm_setcsr( CSR_ROUND_UP ); break;
01007 case SSP_FROUND_TO_NEG_INF: _mm_setcsr( CSR_ROUND_DOWN ); break;
01008 default: _mm_setcsr( CSR_ROUND_TO_EVEN); break;
01009 }
01010
01011 i.i = _mm_cvtpd_epi32( A.d );
01012 A.d = _mm_cvtepi32_pd( i.i );
01013
01015 _mm_setcsr( bak );
01016 return A.d;
01017 }
01018
01020 SSP_FORCEINLINE
01021 __m128 ssp_round_ss_SSE2( __m128 a, __m128 b, int iRoundMode )
01022 {
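/* Only element 0 of the result comes from the rounded b; elements 1..3 are passed through from a. */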
01032 b = ssp_round_ps_SSE2(b, iRoundMode);
01033 b = _mm_shuffle_ps(b, a, _MM_SHUFFLE(1,1,0,0));
01034 return _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,2,0));
01035 }
01036
01038 SSP_FORCEINLINE
01039 __m128 ssp_ceil_ps_SSE2( __m128 a )
01040 {
01041 return ssp_round_ps_SSE2( a, SSP_FROUND_TO_POS_INF );
01042 }
01043
01045 SSP_FORCEINLINE
01046 __m128 ssp_floor_ps_SSE2( __m128 a )
01047 {
01048 return ssp_round_ps_SSE2( a, SSP_FROUND_TO_NEG_INF );
01049 }
01050
01052 SSP_FORCEINLINE
01053 __m128d ssp_floor_pd_SSE2( __m128d a )
01054 {
01055 return ssp_round_pd_SSE2( a, SSP_FROUND_TO_NEG_INF );
01056 }
01057
01059 SSP_FORCEINLINE
01060 __m128d ssp_ceil_pd_SSE2( __m128d a )
01061 {
01062 return ssp_round_pd_SSE2( a, SSP_FROUND_TO_POS_INF );
01063 }
01064
01066 SSP_FORCEINLINE __m128d ssp_floor_sd_SSE2( __m128d a, __m128d b)
01067 {
01068 b = ssp_round_pd_SSE2(b, SSP_FROUND_TO_NEG_INF );
01069
01070 return _mm_shuffle_pd(b, a, _MM_SHUFFLE2(1,0));
01071 }
01072
01074 SSP_FORCEINLINE __m128d ssp_ceil_sd_SSE2( __m128d a, __m128d b)
01075 {
01076 b = ssp_round_pd_SSE2(b, SSP_FROUND_TO_POS_INF );
01077
01078 return _mm_shuffle_pd(b, a, _MM_SHUFFLE2(1,0));
01079 }
01080
01082 SSP_FORCEINLINE __m128 ssp_floor_ss_SSE2( __m128 a, __m128 b)
01083 {
01084 b = ssp_round_ps_SSE2(b, SSP_FROUND_TO_NEG_INF );
01085 b = _mm_shuffle_ps(b, a, _MM_SHUFFLE(1,1,0,0));
01086 return _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,2,0));
01087 }
01088
01090 SSP_FORCEINLINE __m128 ssp_ceil_ss_SSE2( __m128 a, __m128 b)
01091 {
01092 b = ssp_round_ps_SSE2(b, SSP_FROUND_TO_POS_INF );
01093 b = _mm_shuffle_ps(b, a, _MM_SHUFFLE(1,1,0,0));
01094 return _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,2,0));
01095 }
01096
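/** Sign- and zero-extending conversions (SSE4.1 pmovsx/pmovzx forms) built from unpack operations; the signed variants reconstruct the sign extension arithmetically. */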
01123 SSP_FORCEINLINE
01124 __m128i ssp_cvtepi8_epi16_SSE2 ( __m128i a)
01125 {
01126 __m128i b = _mm_setzero_si128 ();
01127 __m128i c = _mm_unpacklo_epi8(a, b);
01128 __m128i d = _mm_set1_epi16 (128);
01129
01130 b = _mm_and_si128(d, c);
01131 d = _mm_set1_epi16(0x1FE);
01132 b = _mm_mullo_epi16(b, d);
01133
01134 return _mm_add_epi16(c, b);
01143 }
01144
01146 SSP_FORCEINLINE
01147 __m128i ssp_cvtepi8_epi32_SSE2 ( __m128i a)
01148 {
01149 __m128i b = _mm_set1_epi32 (-1);
01150 __m128i c = _mm_unpacklo_epi8(a, b);
01151 __m128i d = _mm_set1_epi32 (128);
01152
01153 c = _mm_unpacklo_epi16(c, b);
01154 b = _mm_andnot_si128(c, d);
01155 d = _mm_slli_epi32(b, 1);
01156
01157 return _mm_add_epi32(c, d);
01158 }
01159
01161 SSP_FORCEINLINE
01162 __m128i ssp_cvtepi8_epi64_SSE2 ( __m128i a)
01163 {
01164 __m128i b = _mm_set1_epi32 (-1);
01165 __m128i c = _mm_unpacklo_epi8(a, b);
01166 __m128i d = _mm_set_epi32 (0, 128, 0, 128);
01167
01168 c = _mm_unpacklo_epi16(c, b);
01169 c = _mm_unpacklo_epi32(c, b);
01170 b = _mm_andnot_si128(c, d);
01171 d = _mm_slli_epi64(b, 1);
01172
01173 return _mm_add_epi64(c, d);
01174 }
01175
01177 SSP_FORCEINLINE
01178 __m128i ssp_cvtepi16_epi32_SSE2 ( __m128i a)
01179 {
01180 __m128i b = _mm_set1_epi32 (-1);
01181 __m128i c = _mm_unpacklo_epi16(a, b);
01182 __m128i d = _mm_set1_epi32 (0x8000);
01183
01184 b = _mm_andnot_si128(c, d);
01185 d = _mm_slli_epi32(b, 1);
01186
01187 return _mm_add_epi32(c, d);
01188 }
01189
01191 SSP_FORCEINLINE
01192 __m128i ssp_cvtepi16_epi64_SSE2 ( __m128i a)
01193 {
01194 __m128i b = _mm_set1_epi32 (-1);
01195 __m128i c = _mm_unpacklo_epi16(a, b);
01196 __m128i d = _mm_set_epi32(0,0x8000, 0,0x8000);
01197
01198 c = _mm_unpacklo_epi32(c, b);
01199 b = _mm_andnot_si128(c, d);
01200 d = _mm_slli_epi64(b, 1);
01201
01202 return _mm_add_epi64(c, d);
01203 }
01204
01206 SSP_FORCEINLINE
01207 __m128i ssp_cvtepi32_epi64_SSE2 ( __m128i a)
01208 {
01209 __m128i b = _mm_set1_epi32 (-1);
01210 __m128i c = _mm_unpacklo_epi32(a, b);
01211 __m128i d = _mm_set_epi32(0, 0x80000000,0,0x80000000);
01212
01213 b = _mm_andnot_si128(c, d);
01214 d = _mm_slli_epi64(b, 1);
01215
01216 return _mm_add_epi64(c, d);
01217 }
01218
01220 SSP_FORCEINLINE
01221 __m128i ssp_cvtepu8_epi16_SSE2 ( __m128i a)
01222 {
01223 __m128i b =_mm_setzero_si128 ();
01224
01225 return _mm_unpacklo_epi8(a, b);
01226 }
01227
01229 SSP_FORCEINLINE
01230 __m128i ssp_cvtepu8_epi32_SSE2 ( __m128i a)
01231 {
01232 __m128i b = _mm_setzero_si128 ();
01233
01234 a = _mm_unpacklo_epi8(a, b);
01235
01236 return _mm_unpacklo_epi16(a, b);
01237 }
01238
01240 SSP_FORCEINLINE
01241 __m128i ssp_cvtepu8_epi64_SSE2 ( __m128i a)
01242 {
01243 __m128i b = _mm_setzero_si128 ();
01244
01245 a = _mm_unpacklo_epi8(a, b);
01246
01247 a = _mm_unpacklo_epi16(a, b);
01248
01249 return _mm_unpacklo_epi32(a, b);
01250 }
01251
01253 SSP_FORCEINLINE
01254 __m128i ssp_cvtepu16_epi32_SSE2 ( __m128i a)
01255 {
01256 __m128i b = _mm_setzero_si128 ();
01257
01258 return _mm_unpacklo_epi16(a, b);
01259 }
01260
01262 SSP_FORCEINLINE
01263 __m128i ssp_cvtepu16_epi64_SSE2 ( __m128i a)
01264 {
01265 __m128i b = _mm_setzero_si128 ();
01266
01267 a = _mm_unpacklo_epi16(a, b);
01268
01269 return _mm_unpacklo_epi32(a, b);
01270 }
01271
01273 SSP_FORCEINLINE
01274 __m128i ssp_cvtepu32_epi64_SSE2 ( __m128i a)
01275 {
01276 __m128i b = _mm_setzero_si128 ();
01277
01278 return _mm_unpacklo_epi32(a, b);
01279 }
01280
01282 SSP_FORCEINLINE
01283 __m128i ssp_packus_epi32_SSE2( __m128i a, __m128i b )
01284 {
01285 const static __m128i val_32 = SSP_CONST_SET_32I( 0x8000, 0x8000, 0x8000, 0x8000 );
01286 const static __m128i val_16 = SSP_CONST_SET_16I( 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000 );
01287
01288 a = _mm_sub_epi32( a, val_32 );
01289 b = _mm_sub_epi32( b, val_32 );
01290 a = _mm_packs_epi32( a, b );
01291 a = _mm_add_epi16( a, val_16 );
01292 return a;
01293 }
01294
01295
01296
01297
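/** Byte-wise right shift of the concatenated pair a:b; each immediate value is handled with a pair of fixed byte shifts and an OR. */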
01299 SSP_FORCEINLINE
01300 __m128i ssp_alignr_epi8_SSE2 (__m128i a, __m128i b, const int ralign)
01301 {
01302 if (ralign < 0) return b;
01303
01304
01305
01306
01307
01308
01309 switch (ralign) {
01310 case 0:
01311 return b;
01312 case 1:
01313 b = _mm_srli_si128(b, 1);
01314 a = _mm_slli_si128(a, 15);
01315 return _mm_or_si128( a, b );
01316 case 2:
01317 b = _mm_srli_si128(b, 2);
01318 a = _mm_slli_si128(a, 14);
01319 return _mm_or_si128( a, b );
01320 case 3:
01321 b = _mm_srli_si128(b, 3);
01322 a = _mm_slli_si128(a, 13);
01323 return _mm_or_si128( a, b );
01324 case 4:
01325 b = _mm_srli_si128(b, 4);
01326 a = _mm_slli_si128(a, 12);
01327 return _mm_or_si128( a, b );
01328 case 5:
01329 b = _mm_srli_si128(b, 5);
01330 a = _mm_slli_si128(a, 11);
01331 return _mm_or_si128( a, b );
01332 case 6:
01333 b = _mm_srli_si128(b, 6);
01334 a = _mm_slli_si128(a, 10);
01335 return _mm_or_si128( a, b );
01336 case 7:
01337 b = _mm_srli_si128(b, 7);
01338 a = _mm_slli_si128(a, 9);
01339 return _mm_or_si128( a, b );
01340 case 8:
01341 b = _mm_srli_si128(b, 8);
01342 a = _mm_slli_si128(a, 8);
01343 return _mm_or_si128( a, b );
01344 case 9:
01345 b = _mm_srli_si128(b, 9);
01346 a = _mm_slli_si128(a, 7);
01347 return _mm_or_si128( a, b );
01348 case 10:
01349 b = _mm_srli_si128(b, 10);
01350 a = _mm_slli_si128(a, 6);
01351 return _mm_or_si128( a, b );
01352 case 11:
01353 b = _mm_srli_si128(b, 11);
01354 a = _mm_slli_si128(a, 5);
01355 return _mm_or_si128( a, b );
01356 case 12:
01357 b = _mm_srli_si128(b, 12);
01358 a = _mm_slli_si128(a, 4);
01359 return _mm_or_si128( a, b );
01360 case 13:
01361 b = _mm_srli_si128(b, 13);
01362 a = _mm_slli_si128(a, 3);
01363 return _mm_or_si128( a, b );
01364 case 14:
01365 b = _mm_srli_si128(b, 14);
01366 a = _mm_slli_si128(a, 2);
01367 return _mm_or_si128( a, b );
01368 case 15:
01369 b = _mm_srli_si128(b, 15);
01370 a = _mm_slli_si128(a, 1);
01371 return _mm_or_si128( a, b );
01372 case 16:
01373 return a;
01374 case 17:
01375 a = _mm_slli_si128(a, 1);
01376 return _mm_srli_si128(a, 1);
01377 case 18:
01378 a = _mm_slli_si128(a, 2);
01379 return _mm_srli_si128(a, 2);
01380 case 19:
01381 a = _mm_slli_si128(a, 3);
01382 return _mm_srli_si128(a, 3);
01383 case 20:
01384 a = _mm_slli_si128(a, 4);
01385 return _mm_srli_si128(a, 4);
01386 case 21:
01387 a = _mm_slli_si128(a, 5);
01388 return _mm_srli_si128(a, 5);
01389 case 22:
01390 a = _mm_slli_si128(a, 6);
01391 return _mm_srli_si128(a, 6);
01392 case 23:
01393 a = _mm_slli_si128(a, 7);
01394 return _mm_srli_si128(a, 7);
01395 case 24:
01396 a = _mm_slli_si128(a, 8);
01397 return _mm_srli_si128(a, 8);
01398 case 25:
01399 a = _mm_slli_si128(a, 9);
01400 return _mm_srli_si128(a, 9);
01401 case 26:
01402 a = _mm_slli_si128(a, 10);
01403 return _mm_srli_si128(a, 10);
01404 case 27:
01405 a = _mm_slli_si128(a, 11);
01406 return _mm_srli_si128(a, 11);
01407 case 28:
01408 a = _mm_slli_si128(a, 12);
01409 return _mm_srli_si128(a, 12);
01410 case 29:
01411 a = _mm_slli_si128(a, 13);
01412 return _mm_srli_si128(a, 13);
01413 case 30:
01414 a = _mm_slli_si128(a, 14);
01415 return _mm_srli_si128(a, 14);
01416 case 31:
01417 a = _mm_slli_si128(a, 15);
01418 return _mm_srli_si128(a, 15);
01419 default:
01420 return _mm_setzero_si128();
01421 }
01422 }
01423
01424
01425
01426
01428 SSP_FORCEINLINE __m128i ssp_insert_epi8_SSE2( __m128i a, int b, const int ndx )
01429 {
01430 ssp_m128 Ahi, Alo;
01431 b = b & 0xFF;
01432 Ahi.i = _mm_unpackhi_epi8( a, _mm_setzero_si128() );
01433 Alo.i = _mm_unpacklo_epi8( a, _mm_setzero_si128() );
01434
01435
01436 switch( ndx & 0xF )
01437 {
01438 case 0: Alo.i = _mm_insert_epi16( Alo.i, b, 0 ); break;
01439 case 1: Alo.i = _mm_insert_epi16( Alo.i, b, 1 ); break;
01440 case 2: Alo.i = _mm_insert_epi16( Alo.i, b, 2 ); break;
01441 case 3: Alo.i = _mm_insert_epi16( Alo.i, b, 3 ); break;
01442 case 4: Alo.i = _mm_insert_epi16( Alo.i, b, 4 ); break;
01443 case 5: Alo.i = _mm_insert_epi16( Alo.i, b, 5 ); break;
01444 case 6: Alo.i = _mm_insert_epi16( Alo.i, b, 6 ); break;
01445 case 7: Alo.i = _mm_insert_epi16( Alo.i, b, 7 ); break;
01446 case 8: Ahi.i = _mm_insert_epi16( Ahi.i, b, 0 ); break;
01447 case 9: Ahi.i = _mm_insert_epi16( Ahi.i, b, 1 ); break;
01448 case 10: Ahi.i = _mm_insert_epi16( Ahi.i, b, 2 ); break;
01449 case 11: Ahi.i = _mm_insert_epi16( Ahi.i, b, 3 ); break;
01450 case 12: Ahi.i = _mm_insert_epi16( Ahi.i, b, 4 ); break;
01451 case 13: Ahi.i = _mm_insert_epi16( Ahi.i, b, 5 ); break;
01452 case 14: Ahi.i = _mm_insert_epi16( Ahi.i, b, 6 ); break;
01453 default: Ahi.i = _mm_insert_epi16( Ahi.i, b, 7 );
01454 }
01455 return _mm_packus_epi16( Alo.i, Ahi.i );
01456
01466 }
01468 SSP_FORCEINLINE __m128i ssp_inserti_si64_SSE2( __m128i a, __m128i b, int len, int ndx )
01469 {
01470 const static __m128i MASK = SSP_CONST_SET_32I( 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF );
01471
01472 int left = ndx + len;
01473 __m128i m;
01474 m = _mm_slli_epi64( MASK, 64-left );
01475 m = _mm_srli_epi64( m, 64-len );
01476 m = _mm_slli_epi64( m, ndx );
01477 b = _mm_slli_epi64( b, ndx );
01478
01479 a = ssp_logical_bitwise_select_SSE2( b, a, m );
01480 return a;
01481 }
01482
01483
01485 SSP_FORCEINLINE __m128i ssp_insert_si64_SSE2( __m128i a, __m128i b )
01486 {
01487 ssp_u32 ndx, len;
01488 ssp_m128 B;
01489 B.i = b;
01490
01491 ndx = (ssp_u32)((B.u64[1] & 0x3F00) >> 8);
01492 len = (ssp_u32)((B.u64[1] & 0x003F));
01493
01494 a = ssp_inserti_si64_SSE2( a, b, len, ndx );
01495 return a;
01496 }
01497
01498
01499
01500
01501
01503 SSP_FORCEINLINE int ssp_extract_epi8_SSE2( __m128i a, const int ndx )
01504 {
01505 ssp_m128 mask;
01506 switch( ndx & 0xF )
01507 {
01508 case 15: a = _mm_srli_si128( a, 15 ); break;
01509 case 14: a = _mm_srli_si128( a, 14 ); break;
01510 case 13: a = _mm_srli_si128( a, 13 ); break;
01511 case 12: a = _mm_srli_si128( a, 12 ); break;
01512 case 11: a = _mm_srli_si128( a, 11 ); break;
01513 case 10: a = _mm_srli_si128( a, 10 ); break;
01514 case 9: a = _mm_srli_si128( a, 9 ); break;
01515 case 8: a = _mm_srli_si128( a, 8 ); break;
01516 case 7: a = _mm_srli_si128( a, 7 ); break;
01517 case 6: a = _mm_srli_si128( a, 6 ); break;
01518 case 5: a = _mm_srli_si128( a, 5 ); break;
01519 case 4: a = _mm_srli_si128( a, 4 ); break;
01520 case 3: a = _mm_srli_si128( a, 3 ); break;
01521 case 2: a = _mm_srli_si128( a, 2 ); break;
01522 case 1: a = _mm_srli_si128( a, 1 ); break;
01523 }
01524
01525 mask.i = _mm_setr_epi8 ( -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 );
01526
01527 mask.i = _mm_and_si128 ( mask.i, a );
01528 return mask.u8[0];
01529 }
01530
01532 SSP_FORCEINLINE int ssp_extract_epi32_SSE2( __m128i a, const int imm )
01533 {
01534 ssp_m128 mask;
01535 switch( imm & 0x3 )
01536 {
01537 case 3: a = _mm_srli_si128( a, 12 ); break;
01538 case 2: a = _mm_srli_si128( a, 8 ); break;
01539 case 1: a = _mm_srli_si128( a, 4 ); break;
01540 }
01541
01542 mask.i = _mm_set_epi32 ( 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF );
01543 mask.i = _mm_and_si128 ( mask.i, a );
01544
01545 return mask.s32[0];
01546 }
01547
01549 SSP_FORCEINLINE int ssp_extract_ps_SSE2( __m128 a, const int ndx )
01550 {
01551 ssp_m128 A;
01552 A.f = a;
01553 return ssp_extract_epi32_SSE2( A.i, ndx );
01554 }
01555
01557 SSP_FORCEINLINE ssp_s64 ssp_extract_epi64_SSE2( __m128i a, const int ndx )
01558 {
01559 ssp_m128 mask;
01560 switch( ndx & 0x1 )
01561 {
01562 case 1: a = _mm_srli_si128( a, 8 ); break;
01563 }
01564
01565 mask.i = _mm_set_epi32 ( 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF );
01566 mask.i = _mm_and_si128 ( mask.i, a );
01567
01568 return mask.s64[0];
01569 }
01570
01574 SSP_FORCEINLINE __m128i ssp_extracti_si64_SSE2( __m128i a, int len, int ndx )
01575 {
01576 int left = ndx + len;
01577 a = _mm_slli_epi64( a, 64-left );
01578 a = _mm_srli_epi64( a, 64-len );
01579 return a;
01580 }
01581
01582
01586 SSP_FORCEINLINE __m128i ssp_extract_si64_SSE2( __m128i a ,__m128i b )
01587 {
01588 ssp_u32 len, ndx;
01589 ssp_m128 B;
01590 B.i = b;
01591
01592 ndx = (ssp_u32)((B.u64[0] & 0x3F00) >> 8);
01593 len = (ssp_u32)((B.u64[0] & 0x003F));
01594
01595 a = ssp_extracti_si64_SSE2( a, len, ndx );
01596 return a;
01597 }
01598
01599
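/** PSHUFB emulation via a scalar per-byte table lookup; bytes whose mask value is negative are zeroed afterwards. */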
01601 SSP_FORCEINLINE __m128i ssp_shuffle_epi8_SSE2 (__m128i a, __m128i mask)
01602 {
01603 ssp_m128 A,B, MASK, maskZero;
01604 A.i = a;
01605 maskZero.i = ssp_comge_epi8_SSE2( mask, _mm_setzero_si128() );
01606 MASK.i = _mm_and_si128 ( mask, _mm_set1_epi8( (char)0x0F) );
01607
01608 B.s8[ 0] = A.s8[ (MASK.s8[ 0]) ];
01609 B.s8[ 1] = A.s8[ (MASK.s8[ 1]) ];
01610 B.s8[ 2] = A.s8[ (MASK.s8[ 2]) ];
01611 B.s8[ 3] = A.s8[ (MASK.s8[ 3]) ];
01612 B.s8[ 4] = A.s8[ (MASK.s8[ 4]) ];
01613 B.s8[ 5] = A.s8[ (MASK.s8[ 5]) ];
01614 B.s8[ 6] = A.s8[ (MASK.s8[ 6]) ];
01615 B.s8[ 7] = A.s8[ (MASK.s8[ 7]) ];
01616 B.s8[ 8] = A.s8[ (MASK.s8[ 8]) ];
01617 B.s8[ 9] = A.s8[ (MASK.s8[ 9]) ];
01618 B.s8[10] = A.s8[ (MASK.s8[10]) ];
01619 B.s8[11] = A.s8[ (MASK.s8[11]) ];
01620 B.s8[12] = A.s8[ (MASK.s8[12]) ];
01621 B.s8[13] = A.s8[ (MASK.s8[13]) ];
01622 B.s8[14] = A.s8[ (MASK.s8[14]) ];
01623 B.s8[15] = A.s8[ (MASK.s8[15]) ];
01624
01625 B.i = _mm_and_si128( B.i, maskZero.i );
01626 return B.i;
01627 }
01628
01629
01631 SSP_FORCEINLINE
01632 __m128i ssp_sign_epi8_SSE2 (__m128i a, __m128i b)
01633 {
01634 __m128i ap, an, c, d, zero, one;
01635
01636 zero=_mm_setzero_si128();
01637
01638 d = _mm_cmpgt_epi8(b, zero);
01639 ap = _mm_and_si128(a, d);
01640
01641
01642 c = _mm_cmplt_epi8(b, zero);
01643 one = _mm_set1_epi8(1);
01644 an = _mm_and_si128(a, c);
01645 an = _mm_xor_si128(an, c);
01646 one = _mm_and_si128(one, c);
01647 an = _mm_add_epi8(an, one);
01648
01649 return _mm_or_si128(an, ap);
01650 }
01651
01653 SSP_FORCEINLINE
01654 __m128i ssp_sign_epi16_SSE2 (__m128i a, __m128i b)
01655 {
01656 __m128i c, d, zero;
01657
01658 zero=_mm_setzero_si128();
01659 d = _mm_cmpgt_epi16(b, zero);
01660 c = _mm_cmplt_epi16(b, zero);
01661 d = _mm_srli_epi16(d, 15);
01662 c = _mm_or_si128(c, d);
01663 a = _mm_mullo_epi16(a, c);
01664
01674 return a;
01675 }
01676
01678 SSP_FORCEINLINE
01679 __m128i ssp_sign_epi32_SSE2 (__m128i a, __m128i b)
01680 {
01681 __m128i ap, an, c, d, zero, one;
01682
01683 zero=_mm_setzero_si128();
01684
01685 d = _mm_cmpgt_epi32(b, zero);
01686 ap = _mm_and_si128(a, d);
01687
01688
01689 c = _mm_cmplt_epi32(b, zero);
01690 one = _mm_set1_epi32(1);
01691 an = _mm_and_si128(a, c);
01692 an = _mm_xor_si128(an, c);
01693 one = _mm_and_si128(one, c);
01694 an = _mm_add_epi32(an, one);
01695
01696 return _mm_or_si128(an, ap);
01697 }
01698
01699
01700
01701
01703 SSP_FORCEINLINE int ssp_testc_si128_SSE2( __m128i a, __m128i b)
01704 {
01705 a = _mm_andnot_si128( a, b ); /* CF is set when (~a & b) == 0 */
01706 return ssp_testz_si128_SSE2( a, a );
01707 }
01708
01710 SSP_FORCEINLINE
01711 int ssp_testz_si128_SSE2( __m128i a, __m128i b)
01712 {
01713 ssp_m128 t;
01714 t.i = _mm_and_si128 ( a, b );
01715 t.i = _mm_packs_epi32( t.i, _mm_setzero_si128() );
01716 return t.u64[0] == 0;
01717 }
01718
01720 SSP_FORCEINLINE
01721 int ssp_testnzc_si128_SSE2( __m128i a, __m128i b)
01722 {
01723 ssp_m128 zf, cf;
01724
01725 zf.i = _mm_and_si128 ( a, b );
01726 zf.i = _mm_packs_epi32( zf.i, _mm_setzero_si128() );
01727
01728 cf.i = _mm_andnot_si128( a, b );
01729 cf.i = _mm_packs_epi32( cf.i, _mm_setzero_si128() );
01730
01731 return ( !(zf.u64[0] == 0) && !(cf.u64[0] == 0));
01732 }
01733
01734
01735
01736
01737
01739 SSP_FORCEINLINE __m128 ssp_movehdup_ps_SSE2(__m128 a)
01740 {
01741 ssp_m128 A;
01742 A.f = a;
01743 A.i = _mm_shuffle_epi32( A.i, _MM_SHUFFLE( 3, 3, 1, 1) );
01744 return A.f;
01745 }
01746
01748 SSP_FORCEINLINE __m128 ssp_moveldup_ps_SSE2(__m128 a)
01749 {
01750 ssp_m128 A;
01751 A.f = a;
01752 A.i = _mm_shuffle_epi32( A.i, _MM_SHUFFLE( 2, 2, 0, 0) );
01753 return A.f;
01754 }
01755
01757 SSP_FORCEINLINE __m128d ssp_movedup_pd_SSE2(__m128d a)
01758 {
01759 ssp_m128 A;
01760 A.d = a;
01761 return _mm_set_pd( A.f64[0], A.f64[0] );
01762 }
01763
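/** Per-element rotates done element by element in scalar code; negative counts in b rotate right. */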
01765 SSP_FORCEINLINE __m128i ssp_rot_epi8_SSE2(__m128i a, __m128i b )
01766 {
01767 int n;
01768 ssp_m128 A,B;
01769 A.i = a;
01770 B.i = b;
01771
01772 for( n = 0; n < 16; n++ )
01773 {
01774 if( B.s8[n] < 0 )
01775 {
01776 unsigned int count = (-B.s8[n]) % 8;
01777 unsigned int carry_count = (8 - count) % 8;
01778 unsigned char carry = A.u8[n] << carry_count;
01779 A.u8[n] = A.u8[n] >> count;
01780 A.u8[n] = A.u8[n] | carry;
01781 }
01782 else
01783 {
01784 unsigned int count = B.s8[n] % 8;
01785 unsigned int carry_count = (8 - count) % 8;
01786 unsigned char carry = A.u8[n] >> carry_count;
01787 A.u8[n] = A.u8[n] << count;
01788 A.u8[n] = A.u8[n] | carry;
01789 }
01790 }
01791 return A.i;
01792 }
01794 SSP_FORCEINLINE __m128i ssp_rot_epi16_SSE2(__m128i a, __m128i b )
01795 {
01796 int n;
01797 ssp_m128 A,B;
01798 A.i = a;
01799 B.i = b;
01800
01801 for( n = 0; n < 8; n++ )
01802 {
01803 if( B.s16[n] < 0 )
01804 {
01805 unsigned int count = (-B.s16[n]) % 16;
01806 unsigned int carry_count = (16 - count) % 16;
01807 ssp_u16 carry = A.u16[n] << carry_count;
01808 A.u16[n] = A.u16[n] >> count;
01809 A.u16[n] = A.u16[n] | carry;
01810 }
01811 else
01812 {
01813 unsigned int count = B.s16[n] % 16;
01814 unsigned int carry_count = (16 - count) % 16;
01815 ssp_u16 carry = A.u16[n] >> carry_count;
01816 A.u16[n] = A.u16[n] << count;
01817 A.u16[n] = A.u16[n] | carry;
01818 }
01819 }
01820 return A.i;
01821 }
01823 SSP_FORCEINLINE __m128i ssp_rot_epi32_SSE2(__m128i a, __m128i b )
01824 {
01825 int n;
01826 ssp_m128 A,B;
01827 A.i = a;
01828 B.i = b;
01829
01830 for( n = 0; n < 4; n++ )
01831 {
01832 if( B.s32[n] < 0 )
01833 {
01834 unsigned int count = (-B.s32[n]) % 32;
01835 unsigned int carry_count = (32 - count) % 32;
01836 ssp_u32 carry = A.u32[n] << carry_count;
01837 A.u32[n] = A.u32[n] >> count;
01838 A.u32[n] = A.u32[n] | carry;
01839 }
01840 else
01841 {
01842 unsigned int count = B.s32[n] % 32;
01843 unsigned int carry_count = (32 - count) % 32;
01844 ssp_u32 carry = A.u32[n] >> carry_count;
01845 A.u32[n] = A.u32[n] << count;
01846 A.u32[n] = A.u32[n] | carry;
01847 }
01848 }
01849 return A.i;
01850 }
01852 SSP_FORCEINLINE __m128i ssp_rot_epi64_SSE2(__m128i a, __m128i b )
01853 {
01854 int n;
01855 ssp_m128 A,B;
01856 A.i = a;
01857 B.i = b;
01858
01859 for( n = 0; n < 2; n++ )
01860 {
01861 if( B.s64[n] < 0 )
01862 {
01863 unsigned int count = (unsigned int)((-B.s64[n]) % 64);
01864 unsigned int carry_count = (64 - count) % 64;
01865 ssp_u64 carry = A.u64[n] << carry_count;
01866 A.u64[n] = A.u64[n] >> count;
01867 A.u64[n] = A.u64[n] | carry;
01868 }
01869 else
01870 {
01871 unsigned int count = (unsigned int)(B.s64[n] % 64);
01872 unsigned int carry_count = (64 - count) % 64;
01873 ssp_u64 carry = A.u64[n] >> carry_count;
01874 A.u64[n] = A.u64[n] << count;
01875 A.u64[n] = A.u64[n] | carry;
01876 }
01877 }
01878 return A.i;
01879 }
01880
01882 SSP_FORCEINLINE __m128i ssp_roti_epi8_SSE2(__m128i a, const int b)
01883 {
01884 ssp_m128 A;
01885 A.i = a;
01886
01887 if( b < 0 )
01888 {
01889 const unsigned int count = (-b) % 8;
01890 const unsigned int carry_count = (8 - count) % 8;
01891 __m128i t = ssp_slli_epi8_SSE2( A.i, carry_count );
01892 A.i = ssp_srli_epi8_SSE2( A.i, count );
01893 A.i = _mm_or_si128( A.i, t );
01894 }
01895 else
01896 {
01897 const unsigned int count = b % 8;
01898 const unsigned int carry_count = (8 - count) % 8;
01899 __m128i t = ssp_srli_epi8_SSE2( A.i, carry_count );
01900 A.i = ssp_slli_epi8_SSE2( A.i, count );
01901 A.i = _mm_or_si128( A.i, t );
01902 }
01903
01904 return A.i;
01905 }
01907 SSP_FORCEINLINE __m128i ssp_roti_epi16_SSE2(__m128i a, const int b)
01908 {
01909 ssp_m128 A;
01910 A.i = a;
01911
01912 if( b < 0 )
01913 {
01914 const unsigned int count = (-b) % 16;
01915 const unsigned int carry_count = (16 - count) % 16;
01916 __m128i t = _mm_slli_epi16( A.i, carry_count );
01917 A.i = _mm_srli_epi16( A.i, count );
01918 A.i = _mm_or_si128( A.i, t );
01919 }
01920 else
01921 {
01922 const unsigned int count = b % 16;
01923 const unsigned int carry_count = (16 - count) % 16;
01924 __m128i t = _mm_srli_epi16( A.i, carry_count );
01925 A.i = _mm_slli_epi16( A.i, count );
01926 A.i = _mm_or_si128( A.i, t );
01927 }
01928
01929 return A.i;
01930 }
01932 SSP_FORCEINLINE __m128i ssp_roti_epi32_SSE2(__m128i a, const int b)
01933 {
01934 ssp_m128 A;
01935 A.i = a;
01936
01937 if( b < 0 )
01938 {
01939 const unsigned int count = (-b) % 32;
01940 const unsigned int carry_count = (32 - count) % 32;
01941 __m128i t = _mm_slli_epi32( A.i, carry_count );
01942 A.i = _mm_srli_epi32( A.i, count );
01943 A.i = _mm_or_si128( A.i, t );
01944 }
01945 else
01946 {
01947 const unsigned int count = b % 32;
01948 const unsigned int carry_count = (32 - count) % 32;
01949 __m128i t = _mm_srli_epi32( A.i, carry_count );
01950 A.i = _mm_slli_epi32( A.i, count );
01951 A.i = _mm_or_si128( A.i, t );
01952 }
01953
01954 return A.i;
01955 }
01957 SSP_FORCEINLINE __m128i ssp_roti_epi64_SSE2(__m128i a, const int b)
01958 {
01959 ssp_m128 A;
01960 A.i = a;
01961
01962 if( b < 0 )
01963 {
01964 const unsigned int count = (-b) % 64;
01965 const unsigned int carry_count = (64 - count) % 64;
01966 __m128i t = _mm_slli_epi64( A.i, carry_count );
01967 A.i = _mm_srli_epi64( A.i, count );
01968 A.i = _mm_or_si128( A.i, t );
01969 }
01970 else
01971 {
01972 const unsigned int count = b % 64;
01973 const unsigned int carry_count = (64 - count) % 64;
01974 __m128i t = _mm_srli_epi64( A.i, carry_count );
01975 A.i = _mm_slli_epi64( A.i, count );
01976 A.i = _mm_or_si128( A.i, t );
01977 }
01978
01979 return A.i;
01980 }
01981
01982
01983
01984
01985
01987 SSP_FORCEINLINE __m128i ssp_shl_epi8_SSE2(__m128i a, __m128i b)
01988 {
01989 int n;
01990 ssp_m128 A,B;
01991 A.i = a;
01992 B.i = b;
01993
01994 for( n = 0; n < 16; n++ )
01995 {
01996 if( B.s8[n] < 0 )
01997 {
01998 unsigned int count = (-B.s8[n]) % 8;
01999 A.u8[n] = A.u8[n] >> count;
02000 }
02001 else
02002 {
02003 unsigned int count = B.s8[n] % 8;
02004 A.u8[n] = A.u8[n] << count;
02005 }
02006 }
02007 return A.i;
02008 }
02009
02011 SSP_FORCEINLINE __m128i ssp_sha_epi8_SSE2(__m128i a, __m128i b)
02012 {
02013 int n;
02014 ssp_m128 A,B;
02015 A.i = a;
02016 B.i = b;
02017
02018 for( n = 0; n < 16; n++ )
02019 {
02020 if( B.s8[n] < 0 )
02021 {
02022 unsigned int count = (-B.s8[n]) % 8;
02023 A.s8[n] = A.s8[n] >> count;
02024 }
02025 else
02026 {
02027 unsigned int count = B.s8[n] % 8;
02028 A.s8[n] = A.s8[n] << count;
02029 }
02030 }
02031
02032 return A.i;
02033 }
02034
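/** Per-element variable 16-bit shift: each word is shifted by its own count using _mm_sll/_mm_srl under a moving one-word mask; the sign of b selects the left- or right-shifted result. */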
02036 SSP_FORCEINLINE __m128i ssp_shl_epi16_SSE2(__m128i a, __m128i b)
02037 {
02038 __m128i v1, v2, mask, mask2, b1, b2;
02039 b1 = ssp_abs_epi8_SSE2( b );
02040 mask = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, -1 );
02041 mask2 = _mm_srli_epi16( mask, 12 );
02042
02043 b2 = _mm_and_si128( b1, mask2 );
02044 v1 = _mm_and_si128( _mm_srl_epi16( a, b2 ), mask );
02045 v2 = _mm_and_si128( _mm_sll_epi16( a, b2 ), mask );
02046 mask = _mm_slli_si128( mask, 2 );
02047 b1 = _mm_srli_si128( b1, 2 );
02048
02049 b2 = _mm_and_si128( b1, mask2 );
02050 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) );
02051 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) );
02052 mask = _mm_slli_si128( mask, 2 );
02053 b1 = _mm_srli_si128( b1, 2 );
02054
02055 b2 = _mm_and_si128( b1, mask2 );
02056 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) );
02057 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) );
02058 mask = _mm_slli_si128( mask, 2 );
02059 b1 = _mm_srli_si128( b1, 2 );
02060
02061 b2 = _mm_and_si128( b1, mask2 );
02062 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) );
02063 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) );
02064 mask = _mm_slli_si128( mask, 2 );
02065 b1 = _mm_srli_si128( b1, 2 );
02066
02067 b2 = _mm_and_si128( b1, mask2 );
02068 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) );
02069 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) );
02070 mask = _mm_slli_si128( mask, 2 );
02071 b1 = _mm_srli_si128( b1, 2 );
02072
02073 b2 = _mm_and_si128( b1, mask2 );
02074 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) );
02075 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) );
02076 mask = _mm_slli_si128( mask, 2 );
02077 b1 = _mm_srli_si128( b1, 2 );
02078
02079 b2 = _mm_and_si128( b1, mask2 );
02080 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) );
02081 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) );
02082 mask = _mm_slli_si128( mask, 2 );
02083 b1 = _mm_srli_si128( b1, 2 );
02084
02085 b2 = _mm_and_si128( b1, mask2 );
02086 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) );
02087 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) );
02088
02089 mask = _mm_setzero_si128();
02090 mask = _mm_cmpgt_epi8( mask, b );
02091 mask2 = _mm_slli_epi16( mask, 8 );
02092 mask = _mm_or_si128( mask, mask2 );
02093 v1 = _mm_and_si128( v1, mask );
02094 mask = _mm_andnot_si128( mask, v2 );
02095 v1 = _mm_or_si128( v1, mask );
02096 return v1;
02097 }
02098
02100 SSP_FORCEINLINE __m128i ssp_sha_epi16_SSE2(__m128i a, __m128i b)
02101 {
02102 __m128i v1, v2, mask, mask2, b1, b2;
02103 b1 = ssp_abs_epi8_SSE2( b );
02104 mask = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, -1 );
02105 mask2 = _mm_srli_epi16( mask, 12 );
02106
02107 b2 = _mm_and_si128( b1, mask2 );
02108 v1 = _mm_and_si128( _mm_sra_epi16( a, b2 ), mask );
02109 v2 = _mm_and_si128( _mm_sll_epi16( a, b2 ), mask );
02110 mask = _mm_slli_si128( mask, 2 );
02111 b1 = _mm_srli_si128( b1, 2 );
02112
02113 b2 = _mm_and_si128( b1, mask2 );
02114 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) );
02115 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) );
02116 mask = _mm_slli_si128( mask, 2 );
02117 b1 = _mm_srli_si128( b1, 2 );
02118
02119 b2 = _mm_and_si128( b1, mask2 );
02120 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) );
02121 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) );
02122 mask = _mm_slli_si128( mask, 2 );
02123 b1 = _mm_srli_si128( b1, 2 );
02124
02125 b2 = _mm_and_si128( b1, mask2 );
02126 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) );
02127 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) );
02128 mask = _mm_slli_si128( mask, 2 );
02129 b1 = _mm_srli_si128( b1, 2 );
02130
02131 b2 = _mm_and_si128( b1, mask2 );
02132 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) );
02133 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) );
02134 mask = _mm_slli_si128( mask, 2 );
02135 b1 = _mm_srli_si128( b1, 2 );
02136
02137 b2 = _mm_and_si128( b1, mask2 );
02138 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) );
02139 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) );
02140 mask = _mm_slli_si128( mask, 2 );
02141 b1 = _mm_srli_si128( b1, 2 );
02142
02143 b2 = _mm_and_si128( b1, mask2 );
02144 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) );
02145 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) );
02146 mask = _mm_slli_si128( mask, 2 );
02147 b1 = _mm_srli_si128( b1, 2 );
02148
02149 b2 = _mm_and_si128( b1, mask2 );
02150 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) );
02151 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) );
02152
02153 mask = _mm_setzero_si128();
02154 mask = _mm_cmpgt_epi8( mask, b );
02155 mask2 = _mm_slli_epi16( mask, 8 );
02156 mask = _mm_or_si128( mask, mask2 );
02157 v1 = _mm_and_si128( v1, mask );
02158 mask = _mm_andnot_si128( mask, v2 );
02159 v1 = _mm_or_si128( v1, mask );
02160 return v1;
02161 }
02162
02164 SSP_FORCEINLINE __m128i ssp_shl_epi32_SSE2(__m128i a, __m128i b)
02165 {
02166 __m128i v1, v2, mask, mask2, b1, b2;
02167 b1 = ssp_abs_epi8_SSE2( b );
02168 mask = _mm_set_epi32( 0, 0, 0, -1 );
02169 mask2 = _mm_srli_epi32( mask, 27 );
02170
02171 b2 = _mm_and_si128( b1, mask2 );
02172 v1 = _mm_and_si128( _mm_srl_epi32( a, b2 ), mask );
02173 v2 = _mm_and_si128( _mm_sll_epi32( a, b2 ), mask );
02174 mask = _mm_slli_si128( mask, 4 );
02175 b1 = _mm_srli_si128( b1, 4 );
02176
02177 b2 = _mm_and_si128( b1, mask2 );
02178 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi32( a, b2 ), mask ) );
02179 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ) );
02180 mask = _mm_slli_si128( mask, 4 );
02181 b1 = _mm_srli_si128( b1, 4 );
02182
02183 b2 = _mm_and_si128( b1, mask2 );
02184 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi32( a, b2 ), mask ) );
02185 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ) );
02186 mask = _mm_slli_si128( mask, 4 );
02187 b1 = _mm_srli_si128( b1, 4 );
02188
02189 b2 = _mm_and_si128( b1, mask2 );
02190 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi32( a, b2 ), mask ) );
02191 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ) );
02192
02193 mask = _mm_setzero_si128();
02194 mask = _mm_cmpgt_epi8( mask, b );
02195 mask = _mm_slli_epi32( mask, 24 );
02196 mask = _mm_srai_epi32( mask, 24 );
02197 v1 = _mm_and_si128( v1, mask );
02198 mask = _mm_andnot_si128( mask, v2 );
02199 v1 = _mm_or_si128( v1, mask );
02200 return v1;
02201 }
02202
02204 SSP_FORCEINLINE __m128i ssp_sha_epi32_SSE2(__m128i a, __m128i b)
02205 {
02206 __m128i v1, v2, mask, mask2, b1, b2;
02207 b1 = ssp_abs_epi8_SSE2( b );
02208 mask = _mm_set_epi32( 0, 0, 0, -1 );
02209 mask2 = _mm_srli_epi32( mask, 27 );
02210
02211 b2 = _mm_and_si128( b1, mask2 );
02212 v1 = _mm_and_si128( _mm_sra_epi32( a, b2 ), mask );
02213 v2 = _mm_and_si128( _mm_sll_epi32( a, b2 ), mask );
02214 mask = _mm_slli_si128( mask, 4 );
02215 b1 = _mm_srli_si128( b1, 4 );
02216
02217 b2 = _mm_and_si128( b1, mask2 );
02218 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi32( a, b2 ), mask ) );
02219 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ) );
02220 mask = _mm_slli_si128( mask, 4 );
02221 b1 = _mm_srli_si128( b1, 4 );
02222
02223 b2 = _mm_and_si128( b1, mask2 );
02224 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi32( a, b2 ), mask ) );
02225 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ) );
02226 mask = _mm_slli_si128( mask, 4 );
02227 b1 = _mm_srli_si128( b1, 4 );
02228
02229 b2 = _mm_and_si128( b1, mask2 );
02230 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi32( a, b2 ), mask ) );
02231 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ) );
02232
02233 mask = _mm_setzero_si128();
02234 mask = _mm_cmpgt_epi8( mask, b );
02235 mask = _mm_slli_epi32( mask, 24 );
02236 mask = _mm_srai_epi32( mask, 24 );
02237 v1 = _mm_and_si128( v1, mask );
02238 mask = _mm_andnot_si128( mask, v2 );
02239 v1 = _mm_or_si128( v1, mask );
02240 return v1;
02241 }
02242
02244 SSP_FORCEINLINE __m128i ssp_shl_epi64_SSE2(__m128i a, __m128i b)
02245 {
02246 __m128i v1, v2, mask, mask2, b1, b2;
02247 b1 = ssp_abs_epi8_SSE2( b );
02248 mask = _mm_set_epi32( 0, 0, -1, -1 );
02249 mask2 = _mm_srli_epi64( mask, 58 );
02250
02251 b2 = _mm_and_si128( b1, mask2 );
02252 v1 = _mm_and_si128( _mm_srl_epi64( a, b2 ), mask );
02253 v2 = _mm_and_si128( _mm_sll_epi64( a, b2 ), mask );
02254 mask = _mm_slli_si128( mask, 8 );
02255 b1 = _mm_srli_si128( b1, 8 );
02256
02257 b2 = _mm_and_si128( b1, mask2 );
02258 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi64( a, b2 ), mask ) );
02259 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi64( a, b2 ), mask ) );
02260
02261 mask = _mm_setzero_si128();
02262 mask = _mm_cmpgt_epi8( mask, b );
02263 mask = _mm_slli_epi16( mask, 8 );
02264 mask = _mm_srai_epi16( mask, 8 );
02265 mask = _mm_shufflelo_epi16( mask, _MM_SHUFFLE(0,0,0,0) );
02266 mask = _mm_shufflehi_epi16( mask, _MM_SHUFFLE(0,0,0,0) );
02267 v1 = _mm_and_si128( v1, mask );
02268 mask = _mm_andnot_si128( mask, v2 );
02269 v1 = _mm_or_si128( v1, mask );
02270 return v1;
02271 }
02272
02274 SSP_FORCEINLINE __m128i ssp_sha_epi64_SSE2(__m128i a, __m128i b)
02275 {
02276 int n;
02277 ssp_m128 A,B;
02278 A.i = a;
02279 B.i = b;
02280
02281 for( n = 0; n < 2; n++ )
02282 {
02283 if( B.s8[n*8] < 0 )
02284 {
02285 unsigned int count = (-B.s8[n*8]) % 64;
02286 A.s64[n] = A.s64[n] >> count;
02287 }
02288 else
02289 {
02290 unsigned int count = B.s8[n*8] % 64;
02291 A.s64[n] = A.s64[n] << count;
02292 }
02293 }
02294
02295 return A.i;
02296 }
02297
02303 #endif // __SSEPLUS_EMULATION_SSE2_H__