include/emulation/SSEPlus_emulation_SSE2.h

00001 //
00002 // Copyright (c) 2006-2008 Advanced Micro Devices, Inc. All Rights Reserved.
00003 // This software is subject to the Apache v2.0 License.
00004 //
00005 #ifndef __SSEPLUS_EMULATION_SSE2_H__
00006 #define __SSEPLUS_EMULATION_SSE2_H__
00007 
00008 #include "../SSEPlus_SSE2.h"
00009 #include "../native/SSEPlus_native_SSE2.h"
00010 #include "../logical/SSEPlus_logical_SSE2.h"
00011 #include "../convert/SSEPlus_convert_SSE2.h"
00012 #include "../arithmetic/SSEPlus_arithmetic_SSE2.h"
00013 #include "SSEPlus_emulation_comps_SSE2.h"
00014 
00015 
00021 //
00022 // Multiply Add
00023 //
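// Multiply-add on packed 16-bit integers: (a * b) + c, keeping the low 16 bits of each product
// (presumably matching the non-saturating SSE5/XOP multiply-add this header emulates).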
00025 SSP_FORCEINLINE __m128i ssp_macc_epi16_SSE2( __m128i a, __m128i b, __m128i c )
00026 {
00027     a = _mm_mullo_epi16( a, b );
00028     a = _mm_add_epi16( a, c );
00029     return a;
00030 }
00031 
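// 32-bit multiply-add: SSE2 has no 32-bit mullo, so the even and odd lanes are multiplied
// separately with _mm_mul_epu32 (the low 32 bits of a product are the same for signed and
// unsigned operands), recombined with a mask and a 64-bit shift, and then c is added.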
00033 SSP_FORCEINLINE __m128i ssp_macc_epi32_SSE2( __m128i a, __m128i b, __m128i c )
00034 {
00035         __m128i ab02, ab13, mask;
00036 
00037         mask = _mm_set_epi32(0, 0xFFFFFFFF, 0, 0xFFFFFFFF);
00038         ab02 = _mm_mul_epu32(a, b);
00039         ab02 = _mm_and_si128(ab02, mask);
00040         a    = _mm_srli_epi64(a, 32);
00041         b    = _mm_srli_epi64(b, 32);
00042         ab13 = _mm_mul_epu32(a, b);
00043         ab13 = _mm_slli_epi64(ab13, 32);
00044 
00045         a    = _mm_add_epi32(ab02, ab13);
00046 
00047         return _mm_add_epi32(a, c);
00048 }
00049 
00051 SSP_FORCEINLINE __m128d ssp_macc_pd_SSE2(__m128d a, __m128d b, __m128d c)
00052 {
00053     a = _mm_mul_pd( a, b );
00054     a = _mm_add_pd( a, c );
00055     return a;
00056 }
00057 
00059 SSP_FORCEINLINE __m128 ssp_macc_ps_SSE2( __m128 a, __m128 b, __m128 c )
00060 {
00061     a = _mm_mul_ps( a, b );
00062     a = _mm_add_ps( a, c );
00063     return a;
00064 }
00065 
00067 SSP_FORCEINLINE __m128d ssp_macc_sd_SSE2(__m128d a, __m128d b, __m128d c)
00068 {
00069     const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0, 0 );
00070 
00071     ssp_m128 A,B;
00072     A.d = a;
00073     B.d = b;
00074     B.d = ssp_macc_pd_SSE2( A.d, B.d, c );
00075     B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask ); // This was faster than using 2 shuffles
00076     return B.d;
00077 }
00078 
00080 SSP_FORCEINLINE __m128 ssp_macc_ss_SSE2(__m128 a, __m128 b, __m128 c)   // Assuming SSE5 *_ss semantics are similar to _mm_add_ss. TODO: confirm
00081 {
00082     const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0 );
00083 
00084     ssp_m128 A,B;
00085     A.f = a;
00086     B.f = b;
00087     B.f = ssp_macc_ps_SSE2( A.f, B.f, c );
00088     B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask ); // This was faster than using 2 shuffles
00089     return B.f;
00090 }
00091 
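// Multiplies the low 16-bit word of each 32-bit lane of a and b into a full 32-bit product
// (reassembled from _mm_mullo_epi16/_mm_mulhi_epi16), then adds c.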
00093 SSP_FORCEINLINE __m128i ssp_maccd_epi16_SSE2( __m128i a, __m128i b, __m128i c )
00094 {
00095         __m128i ab_lo, ab_hi;
00096         __m128i mask = _mm_set1_epi32(0xFFFF);
00097 
00098         ab_lo = _mm_mullo_epi16(a, b);
00099         ab_hi = _mm_mulhi_epi16(a, b);
00100 
00101         ab_lo = _mm_and_si128(ab_lo, mask);
00102         ab_hi = _mm_and_si128(ab_hi, mask);
00103         ab_hi = _mm_slli_epi32(ab_hi, 16);
00104         a = _mm_add_epi32( ab_lo, ab_hi );
00105         return _mm_add_epi32 (a, c);
00106 
00108         //b     = _mm_unpacklo_epi16(ab_lo, ab_hi);
00109         //ab_hi = _mm_unpackhi_epi16(ab_lo, ab_hi);
00110         //ab_lo = _mm_unpacklo_epi32(b,     ab_hi);
00111         //ab_hi = _mm_unpackhi_epi32(b,     ab_hi);
00112         //ab_lo = _mm_unpacklo_epi32(ab_lo, ab_hi);
00113         //return _mm_add_epi32(ab_lo, c);
00114 }
00115 
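// Signed 32x32->64 multiply-add on the high dword of each 64-bit lane. _mm_mul_epu32 is unsigned,
// so both operands are made non-negative first and the sign of the 64-bit product is restored
// afterwards; lanes 1 and 3 of the comparison masks are don't-care since _mm_mul_epu32 ignores them.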
00117 SSP_FORCEINLINE __m128i ssp_macchi_epi32_SSE2( __m128i a, __m128i b, __m128i c )
00118 {
00119         __m128i mask, mask_A, mask_B, mask_C, ab;
00120 
00121         a = _mm_srli_epi64(a, 32);
00122         b = _mm_srli_epi64(b, 32);
00123         mask   = _mm_set_epi32(0x7FFFFFFF, 0, 0x7FFFFFFF, 0);
00124 
00125         //abs(A)
00126         mask_A = _mm_cmplt_epi32( a, mask);     //FFF...F when a < 0
00127         a      = _mm_xor_si128 ( a, mask_A );   //Invert  when a < 0
00128         mask_C = _mm_srli_epi32( mask_A, 31 );  // 1      when a < 0
00129         a      = _mm_add_epi32( a, mask_C );    //Add 1   when a < 0
00130 
00131         //abs(B)
00132         mask_B = _mm_cmplt_epi32( b, mask);     //FFF...F when b < 0
00133         b      = _mm_xor_si128 ( b, mask_B );   //Invert  when b < 0
00134         mask_C = _mm_srli_epi32( mask_B, 31 );  // 1      when b < 0
00135         b      = _mm_add_epi32( b, mask_C );    //Add 1   when b < 0
00136 
00137         ab     = _mm_mul_epu32(a, b);
00138 
00139         //correct negative cases
00140         mask_A = _mm_xor_si128(mask_A, mask_B);
00141         mask_C = _mm_srli_epi32(mask_A, 31 );
00142         mask_B = _mm_slli_epi64(mask_A, 32);
00143         mask   = _mm_add_epi32(mask_A, mask_B);
00144         a      = _mm_xor_si128(ab, mask);
00145         a      = _mm_add_epi64(a, mask_C);
00146 
00147         return _mm_add_epi64(a, c);
00148 }
00149 
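// Same absolute-value / sign-restore approach as ssp_macchi_epi32_SSE2 above, applied to the low
// dword of each 64-bit lane (no pre-shift needed).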
00151 SSP_FORCEINLINE __m128i ssp_macclo_epi32_SSE2( __m128i a, __m128i b, __m128i c )
00152 {
00153         __m128i mask, mask_A, mask_B, mask_C, ab;
00154 
00155         mask   = _mm_set_epi32(0x7FFFFFFF, 0, 0x7FFFFFFF, 0);
00156         //abs(A)
00157         mask_A = _mm_cmplt_epi32( a, mask);     //FFF...F when a < 0
00158         a      = _mm_xor_si128 ( a, mask_A );   //Invert  when a < 0
00159         mask_C = _mm_srli_epi32( mask_A, 31 );  // 1      when a < 0
00160         a      = _mm_add_epi32( a, mask_C );    //Add 1   when a < 0
00161 
00162         //abs(B)
00163         mask_B = _mm_cmplt_epi32( b, mask);     //FFF...F when b < 0
00164         b      = _mm_xor_si128 ( b, mask_B );   //Invert  when b < 0
00165         mask_C = _mm_srli_epi32( mask_B, 31 );  // 1      when b < 0
00166         b      = _mm_add_epi32( b, mask_C );    //Add 1   when b < 0
00167 
00168         ab     = _mm_mul_epu32(a, b);
00169 
00170         //correct negative cases
00171         mask_A = _mm_xor_si128(mask_A, mask_B);
00172         mask_C = _mm_srli_epi32(mask_A, 31 );
00173         mask_B = _mm_slli_epi64(mask_A, 32);
00174         mask   = _mm_add_epi32(mask_A, mask_B);
00175         a      = _mm_xor_si128(ab, mask);
00176         a      = _mm_add_epi64(a, mask_C);
00177 
00178         return _mm_add_epi64(a, c);
00179 }
00180 
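// Saturating version: the 16x16 products and c are widened to 32 bits, added, and packed back with
// _mm_packs_epi32 so the result saturates to the signed 16-bit range.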
00182 SSP_FORCEINLINE __m128i ssp_maccs_epi16_SSE2( __m128i a, __m128i b, __m128i c )
00183 {
00184         //similar to the version in Framewave CBL
00185         __m128i ablo, abhi, unlo, unhi, signC, clo, chi;
00186 
00187         ablo  = _mm_mullo_epi16( a, b );
00188         abhi  = _mm_mulhi_epi16( a, b );
00189         unlo  = _mm_unpacklo_epi16( ablo, abhi );
00190         unhi  = _mm_unpackhi_epi16( ablo, abhi );
00191 
00192         //unpack and keep the sign of C
00193         signC = _mm_srai_epi16 (c, 15);
00194         chi   = _mm_unpackhi_epi16(c, signC);
00195         clo   = _mm_unpacklo_epi16(c, signC);
00196 
00197         chi   = _mm_add_epi32(chi, unhi);
00198         clo   = _mm_add_epi32(clo, unlo);
00199 
00200         return _mm_packs_epi32(clo, chi);
00201 }
00202 
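// Saturating 32-bit multiply-add computed in double precision: convert to double, multiply and add,
// clamp to [INT32_MIN, INT32_MAX], then convert back. Note that products needing more than the
// 53-bit double significand can lose low-order bits to rounding.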
00204 SSP_FORCEINLINE __m128i ssp_maccs_epi32_SSE2( __m128i a, __m128i b, __m128i c )
00205 {
00206         //Version 1, slightly modified from Framewave CBL
00207         ssp_m128 s1lo,s1hi,s2lo,s2hi,s3lo,s3hi, sl, sh;
00208         static const __m128d max_val = {(double)0x7FFFFFFFl, (double)0x7FFFFFFFl};
00209         static const __m128d min_val = {(-(double)0x80000000l), (-(double)0x80000000l)};
00210 
00211         s1lo.d =  _mm_cvtepi32_pd(a);
00212         s1hi.d = _mm_cvtepi32_pd(_mm_srli_si128(a, 8)); 
00213 
00214         s2lo.d =  _mm_cvtepi32_pd(b);
00215         s2hi.d = _mm_cvtepi32_pd(_mm_srli_si128(b,8)); 
00216 
00217         s1lo.d = _mm_mul_pd(s1lo.d,s2lo.d);
00218         s1hi.d = _mm_mul_pd(s1hi.d,s2hi.d);
00219 
00220         s3lo.d =  _mm_cvtepi32_pd(c);
00221         s3hi.d = _mm_cvtepi32_pd(_mm_srli_si128(c,8)); 
00222         
00223         s1lo.d = _mm_add_pd(s1lo.d,s3lo.d);
00224         s1hi.d = _mm_add_pd(s1hi.d,s3hi.d);
00225 
00226         sl.d   = _mm_min_pd(s1lo.d, max_val);
00227         sl.d   = _mm_max_pd(sl.d, min_val);
00228 
00229         sh.d   = _mm_min_pd(s1hi.d, max_val);
00230         sh.d   = _mm_max_pd(sh.d, min_val);
00231 
00232         sl.i   = _mm_cvtpd_epi32(sl.d); 
00233         sh.i   = _mm_cvtpd_epi32(sh.d);
00234 
00235         sh.i   = _mm_slli_si128(sh.i, 8); 
00236         sl.i   = _mm_or_si128(sl.i, sh.i);
00237 
00238     return sl.i;
00239 }
00240 
00241 //
00242 // Negative Multiply Add
00243 //
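// The nmacc/nmsub emulations below compute -(a*b)+c and -(a*b)-c by multiplying the product by -1.0.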
00245 SSP_FORCEINLINE __m128 ssp_nmacc_ps_SSE2( __m128 a, __m128 b, __m128 c )
00246 {
00247     const static __m128 neg1 = SSP_CONST_SET_32F( -1.0f, -1.0f, -1.0f, -1.0f );
00248 
00249     a = _mm_mul_ps( a, b    );
00250     a = _mm_mul_ps( a, neg1 );
00251     a = _mm_add_ps( a, c    );
00252     return a;
00253 }
00254 
00256 SSP_FORCEINLINE __m128d ssp_nmacc_pd_SSE2(__m128d a, __m128d b, __m128d c)
00257 {
00258     const static __m128d neg1 = SSP_CONST_SET_64F( -1.0, -1.0 );
00259 
00260     a = _mm_mul_pd( a, b    );
00261     a = _mm_mul_pd( a, neg1 );
00262     a = _mm_add_pd( a, c    );
00263     return a;
00264 }
00265 
00267 SSP_FORCEINLINE __m128 ssp_nmacc_ss_SSE2(__m128 a, __m128 b, __m128 c)   // Assuming SSE5 *_ss semantics are similar to _mm_add_ss. TODO: confirm
00268 {
00269     const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0 );
00270 
00271     ssp_m128 A,B;
00272     A.f = a;
00273     B.f = b;
00274     B.f = ssp_nmacc_ps_SSE2( A.f, B.f, c );
00275     B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask ); // This was faster than using 2 shuffles
00276     return B.f;
00277 }
00278 
00280 SSP_FORCEINLINE __m128d ssp_nmacc_sd_SSE2(__m128d a, __m128d b, __m128d c)
00281 {
00282     const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0, 0 );
00283 
00284     ssp_m128 A,B;
00285     A.d = a;
00286     B.d = b;
00287     B.d = ssp_nmacc_pd_SSE2( A.d, B.d, c );
00288     B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask ); // This was faster than using 2 shuffles
00289     return B.d;
00290 }
00291 
00292 //
00293 // Multiply Subtract
00294 //
00295 
00297 SSP_FORCEINLINE __m128 ssp_msub_ps_SSE2(__m128 a, __m128 b, __m128 c)
00298 {
00299     a = _mm_mul_ps( a, b );
00300     a = _mm_sub_ps( a, c );
00301     return a;
00302 }
00303 
00305 SSP_FORCEINLINE __m128d ssp_msub_pd_SSE2(__m128d a, __m128d b, __m128d c)
00306 {
00307     a = _mm_mul_pd( a, b );
00308     a = _mm_sub_pd( a, c );
00309     return a;
00310 }
00311 
00313 SSP_FORCEINLINE __m128 ssp_msub_ss_SSE2(__m128 a, __m128 b, __m128 c)
00314 {
00315     const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0 );
00316 
00317     ssp_m128 A,B;
00318     A.f = a;
00319     B.f = b;
00320     B.f = ssp_msub_ps_SSE2( A.f, B.f, c );
00321     B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask ); // This was faster than using 2 shuffles
00322     return B.f;
00323 }
00324 
00326 SSP_FORCEINLINE __m128d ssp_msub_sd_SSE2(__m128d a, __m128d b, __m128d c)
00327 {
00328     const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0, 0 );
00329 
00330     ssp_m128 A,B;
00331     A.d = a;
00332     B.d = b;
00333     B.d = ssp_msub_pd_SSE2( A.d, B.d, c );
00334     B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask ); // This was faster than using 2 shuffles
00335     return B.d;
00336 }
00337 
00338 //
00339 // Negative Multiply Subtract
00340 //
00341 
00343 SSP_FORCEINLINE __m128 ssp_nmsub_ps_SSE2(__m128 a, __m128 b, __m128 c)
00344 {
00345     const static __m128 neg1 = SSP_CONST_SET_32F( -1.0f, -1.0f, -1.0f, -1.0f );
00346 
00347     a = _mm_mul_ps( a, b    );
00348     a = _mm_mul_ps( a, neg1 );
00349     a = _mm_sub_ps( a, c    );
00350     return a;
00351 }
00352 
00354 SSP_FORCEINLINE __m128d ssp_nmsub_pd_SSE2(__m128d a, __m128d b, __m128d c)
00355 {
00356     const static __m128d neg1 = SSP_CONST_SET_64F( -1.0, -1.0 );
00357 
00358     a = _mm_mul_pd( a, b    );
00359     a = _mm_mul_pd( a, neg1 );
00360     a = _mm_sub_pd( a, c    );
00361     return a;
00362 }
00363 
00365 SSP_FORCEINLINE __m128 ssp_nmsub_ss_SSE2(__m128 a, __m128 b, __m128 c)
00366 {
00367     const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0 );
00368 
00369     ssp_m128 A,B;
00370     A.f = a;
00371     B.f = b;
00372     B.f = ssp_nmsub_ps_SSE2( A.f, B.f, c );
00373     B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask ); // This was faster than using 2 shuffles
00374     return B.f;
00375 }
00376 
00378 SSP_FORCEINLINE __m128d ssp_nmsub_sd_SSE2(__m128d a, __m128d b, __m128d c)
00379 {
00380     const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0, 0 );
00381 
00382     ssp_m128 A,B;
00383     A.d = a;
00384     B.d = b;
00385     B.d = ssp_nmsub_pd_SSE2( A.d, B.d, c );
00386     B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask ); // This was faster than using 2 shuffles
00387     return B.d;
00388 }
00389 
00390 //
00391 // Abs
00392 //
00393 
00394 
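// Two's-complement absolute value: build an all-ones mask where a < 0, XOR to invert those lanes,
// then add 1 (the epi8 version extracts the 1 with an AND because there is no 8-bit shift).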
00396 SSP_FORCEINLINE
00397 __m128i ssp_abs_epi8_SSE2 (__m128i a)
00398 {
00399     __m128i mask = _mm_cmplt_epi8( a, _mm_setzero_si128() );  // 0xFF   where a < 0
00400     __m128i one  = _mm_set1_epi8(1);
00401     a    = _mm_xor_si128 ( a, mask  );                        // Invert where a < 0
00402     mask = _mm_and_si128 ( mask, one );                       // 0x01   where a < 0
00403     a    = _mm_add_epi8  ( a, mask  );                        // Add 1  where a < 0
00404     return a;
00405 }
00406 
00408 SSP_FORCEINLINE
00409 __m128i ssp_abs_epi16_SSE2 (__m128i a)
00410 {
00411     __m128i mask = _mm_cmplt_epi16( a, _mm_setzero_si128() ); // FFFF   where a < 0
00412     a    = _mm_xor_si128 ( a, mask  );                        // Invert where a < 0
00413     mask = _mm_srli_epi16( mask, 15 );                        // 0001   where a < 0
00414     a    = _mm_add_epi16 ( a, mask  );                        // Add 1  where a < 0
00415     return a;
00416 }
00417 
00419 SSP_FORCEINLINE
00420 __m128i ssp_abs_epi32_SSE2 (__m128i a)
00421 {
00422     __m128i mask = _mm_cmplt_epi32( a, _mm_setzero_si128() ); // FFFFFFFF where a < 0
00423     a    = _mm_xor_si128 ( a, mask );                         // Invert   where a < 0
00424     mask = _mm_srli_epi32( mask, 31 );                        // 00000001 where a < 0
00425     a    = _mm_add_epi32 ( a, mask );                         // Add 1    where a < 0
00426     return a;
00427 }
00428 
00429 
00431 SSP_FORCEINLINE
00432 __m128 ssp_addsub_ps_SSE2(__m128 a, __m128 b)
00433 {
00434     const static __m128 neg = SSP_CONST_SET_32F(  1, -1, 1, -1 );
00435 
00436     b = _mm_mul_ps( b, neg );
00437     a = _mm_add_ps( a, b   );
00438     return a;
00439 }
00440 
00442 SSP_FORCEINLINE
00443 __m128d ssp_addsub_pd_SSE2(__m128d a, __m128d b)
00444 {
00445     const static __m128d const_addSub_pd_neg = SSP_CONST_SET_64F( 1, -1 );
00446 
00447     b = _mm_mul_pd( b, const_addSub_pd_neg );
00448     a = _mm_add_pd( a, b   );
00449     return a;
00450 }
00451 
00452 //
00453 // Blend
00454 //
00455 
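// The immediate mask is broadcast to every 16-bit lane; multiplying by a per-lane power of two moves
// that lane's selection bit into the MSB, and an arithmetic shift right by 15 turns it into an
// all-ones/all-zeros mask used to select between a and b.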
00457 SSP_FORCEINLINE
00458 __m128i ssp_blend_epi16_SSE2( __m128i a, __m128i b, const int mask )
00459 {
00460     __m128i screen;
00461     const static __m128i mulShiftImm = SSP_CONST_SET_16I( 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, 0x8000 ); // Shift mask multiply moves all bits to left, becomes MSB
00462 
00463     screen = _mm_set1_epi16  ( mask                );   // Load the mask into register
00464     screen = _mm_mullo_epi16 ( screen, mulShiftImm );   // Shift bits to MSB
00465     screen = _mm_srai_epi16  ( screen, 15          );   // Shift bits to obtain 0xFFFF or 0x0000
00466     b      = _mm_and_si128   ( screen,  b          );   // Mask out the correct values from b
00467     a      = _mm_andnot_si128( screen,  a          );   // Mask out the correct values from a (invert the mask)
00468     a      = _mm_or_si128    (      a,  b          );   // Or the 2 results.
00469     return a;
00470 }
00471 
00473 SSP_FORCEINLINE
00474 __m128d ssp_blend_pd_SSE2(  __m128d a, __m128d b, const int mask )
00475 {
00476     __m128d screen;
00477     screen = _mm_set_pd(  (mask&0x2)>>1,    mask&0x1 );
00478     b      = _mm_mul_pd(              b,      screen );
00479     screen = _mm_set_pd( (~mask&0x2)>>1, (~mask&0x1) );
00480     a      = _mm_mul_pd(              a,      screen );
00481     a      = _mm_or_pd (              a,           b );
00482     return a;
00483 }
00484 
00486 SSP_FORCEINLINE
00487 __m128  ssp_blend_ps_SSE2( __m128  a, __m128  b, const int mask )               // _mm_blend_ps [SSE4.1]
00488 {
00489     ssp_m128 screen, A, B;
00490     A.f = a;
00491     B.f = b;
00492     screen.i = ssp_movmask_imm8_to_epi32_SSE2( mask );
00493     screen.i = ssp_logical_bitwise_select_SSE2( B.i, A.i, screen.i );
00494     return screen.f;
00495 }
00496 
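// Per-byte select on the sign bit of mask. SSE2 has no 8-bit arithmetic shift, so each mask byte is
// placed in the high half of a 16-bit lane, shifted to 0xFFFF/0x0000, then shifted right once so
// _mm_packus_epi16 saturates set lanes to 0xFF instead of clamping the negative value to 0.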
00498 SSP_FORCEINLINE
00499 __m128i ssp_blendv_epi8_SSE2( __m128i a, __m128i b, __m128i mask )
00500 {
00501     __m128i mHi, mLo;
00502     __m128i zero = _mm_setzero_si128 ();
00503 
00504     mHi = _mm_unpacklo_epi8( zero, mask );
00505     mHi = _mm_srai_epi16   (  mHi,   15 );
00506     mHi = _mm_srli_epi16   (  mHi,    1 );
00507 
00508     mLo = _mm_unpackhi_epi8( zero, mask );
00509     mLo = _mm_srai_epi16   (  mLo,   15 );
00510     mLo = _mm_srli_epi16   (  mLo,    1 );
00511 
00512     mHi = _mm_packus_epi16  ( mHi,  mLo );
00513 
00514     b   = _mm_and_si128     (    b, mHi  );
00515     a   = _mm_andnot_si128  (  mHi,  a   );
00516     a   = _mm_or_si128      (    a,  b   );
00517     return a;
00518 }
00519 
00521 SSP_FORCEINLINE __m128d ssp_blendv_pd_SSE2( __m128d a, __m128d b, __m128d mask )
00522 {
00523     ssp_m128 A, B, Mask;
00524     A.d = a;
00525     B.d = b;
00526     Mask.d = mask;
00527 
00528     Mask.i = _mm_shuffle_epi32( Mask.i, _MM_SHUFFLE(3, 3, 1, 1) );
00529     Mask.i = _mm_srai_epi32   ( Mask.i, 31                      );
00530 
00531     B.i = _mm_and_si128( B.i, Mask.i );
00532     A.i = _mm_andnot_si128( Mask.i, A.i );
00533     A.i = _mm_or_si128( A.i, B.i );
00534     return A.d;
00535 }
00537 SSP_FORCEINLINE __m128  ssp_blendv_ps_SSE2( __m128  a, __m128  b, __m128 mask )     
00538 {
00539     ssp_m128 A, B, Mask;
00540     A.f = a;
00541     B.f = b;
00542     Mask.f = mask;
00543 
00544     Mask.i = _mm_srai_epi32( Mask.i, 31 );
00545     B.i = _mm_and_si128( B.i, Mask.i );
00546     A.i = _mm_andnot_si128( Mask.i, A.i );
00547     A.i = _mm_or_si128( A.i, B.i );
00548     return A.f;
00549 }
00550 
00551 //
00552 // Compare
00553 //
00554 
00556 SSP_FORCEINLINE
00557 __m128i ssp_cmpeq_epi64_SSE2( __m128i a, __m128i b )
00558 {
00559     return ssp_comeq_epi64_SSE2( a, b );
00560 }
00561 
00562 
00563 //
00564 // Horizontal Operations
00565 //
00566 
00568 SSP_FORCEINLINE
00569 __m128i ssp_hadd_epi16_SSE2( __m128i a, __m128i b )
00570 {
00571     ssp_convert_odd_even_epi16_SSE2( &a, &b );
00572     a = _mm_add_epi16( a, b );     
00573     return a;
00574 }
00575 
00577 SSP_FORCEINLINE __m128i ssp_hadds_epi16_SSE2 ( __m128i a, __m128i b )                     
00578 {
00579     ssp_convert_odd_even_epi16_SSE2( &a, &b );
00580     a = _mm_adds_epi16( a, b );    
00581     return a;
00582 }
00583 
00584 
00586 SSP_FORCEINLINE
00587 __m128i ssp_hsub_epi16_SSE2 ( __m128i a, __m128i b )
00588 {
00589     ssp_convert_odd_even_epi16_SSE2( &a, &b ); 
00590     a = _mm_sub_epi16( a, b );     
00591     return a;  
00592 }
00593 
00595 SSP_FORCEINLINE
00596 __m128i ssp_hsubs_epi16_SSE2 ( __m128i a, __m128i b )
00597 {
00598     ssp_convert_odd_even_epi16_SSE2( &a, &b ); 
00599     a = _mm_subs_epi16( a, b );     
00600     return a;  
00601 }
00602 
00603 
00604 
00606 SSP_FORCEINLINE __m128i ssp_hadd_epi32_SSE2( __m128i a, __m128i b )                        
00607 {
00608    ssp_convert_odd_even_epi32_SSE2( &a, &b );
00609    a = _mm_add_epi32( a, b );
00610    return a; 
00611 }
00612 
00614 SSP_FORCEINLINE __m128i ssp_hsub_epi32_SSE2 ( __m128i a, __m128i b )                        
00615 {
00616    ssp_convert_odd_even_epi32_SSE2( &a, &b );
00617    a = _mm_sub_epi32( b, a );
00618    return a;
00619 }
00620 
00621 
00623 SSP_FORCEINLINE
00624 __m128 ssp_hadd_ps_SSE2(__m128 a, __m128 b)
00625 {
00626     ssp_convert_odd_even_ps_SSE2( &a, &b );
00627     a = _mm_add_ps( a, b );
00628     return a;
00629 }
00630 
00632 SSP_FORCEINLINE
00633 __m128 ssp_hsub_ps_SSE2(__m128 a, __m128 b)
00634 {
00635     ssp_convert_odd_even_ps_SSE2( &a, &b );
00636     a = _mm_sub_ps( b, a );
00637     return a;
00638 }
00639 
00640 
00642 SSP_FORCEINLINE
00643 __m128d ssp_hadd_pd_SSE2(__m128d a, __m128d b)
00644 {
00645     ssp_m128 A,B,C;
00646     A.d = a;
00647     C.d = a;
00648     B.d = b;
00649 
00650     A.f = _mm_movelh_ps( A.f, B.f );
00651     B.f = _mm_movehl_ps( B.f, C.f );
00652     A.d = _mm_add_pd   ( A.d, B.d );
00653     return A.d;
00654 }
00655 
00657 SSP_FORCEINLINE
00658 __m128d ssp_hsub_pd_SSE2(__m128d a, __m128d b)
00659 {
00660     ssp_m128 A,B,C;
00661     A.d = a;
00662     C.d = a;
00663     B.d = b;
00664 
00665     A.f = _mm_movelh_ps( A.f, B.f );
00666     B.f = _mm_movehl_ps( B.f, C.f );
00667     A.d = _mm_sub_pd   ( A.d, B.d );
00668     return A.d;
00669 }
00670 
00671 
00672 //__m128i _mm_mulhrs_epi16( __m128i a,  __m128i b);
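// Rounded high multiply: products are widened to 32 bits via mullo/mulhi + unpack, 0x4000 is added
// and the sum arithmetically shifted right by 15, i.e. (a*b + 0x4000) >> 15, then packed back to
// 16 bits with signed saturation.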
00674 SSP_FORCEINLINE __m128i ssp_mulhrs_epi16_SSE2( __m128i a, __m128i b )
00675 {
00676     const static __m128i VAL = SSP_CONST_SET_32I( 0x4000, 0x4000, 0x4000, 0x4000 );
00677     __m128i c,d;   
00678 
00679     c = _mm_mullo_epi16( a, b );
00680     d = _mm_mulhi_epi16( a, b );
00681    
00682     a = _mm_unpackhi_epi16( c, d );
00683     b = _mm_unpacklo_epi16( c, d );
00684 
00685     a = _mm_add_epi32( a, VAL );
00686     b = _mm_add_epi32( b, VAL );
00687 
00688     a = _mm_srai_epi32( a, 15 );
00689     b = _mm_srai_epi32( b, 15 );
00690 
00691     a = _mm_packs_epi32( b, a );
00692     return a;
00693 }
00694 
00695 
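// Emulates a 32-bit insert with two 16-bit inserts: the low word of b, then the high word into the
// adjacent position (_mm_insert_epi16 only uses the low 16 bits of its integer operand).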
00697 SSP_FORCEINLINE
00698 __m128i ssp_insert_epi32_SSE2( __m128i a, int b, const int ndx )            // TODO: Verify behavior on Intel Hardware
00699 {
00700     switch( ndx & 0x3 )
00701     {
00702     case 0: a = _mm_insert_epi16( a, b    , 0 );
00703             a = _mm_insert_epi16( a, b>>16, 1 ); break;   // insert the high word of b
00704     case 1: a = _mm_insert_epi16( a, b    , 2 );
00705             a = _mm_insert_epi16( a, b>>16, 3 ); break;
00706     case 2: a = _mm_insert_epi16( a, b    , 4 );
00707             a = _mm_insert_epi16( a, b>>16, 5 ); break;
00708     case 3: a = _mm_insert_epi16( a, b    , 6 );
00709             a = _mm_insert_epi16( a, b>>16, 7 ); break;
00710     }
00711     return a;
00712 }
00713 
00714 //
00715 // Min / Max
00716 //
00717 
00719 SSP_FORCEINLINE
00720 __m128i ssp_min_epi8_SSE2( __m128i a, __m128i b )
00721 {
00722     __m128i mask  = _mm_cmplt_epi8( a, b );                             // FFFFFFFF where a < b
00723     a = ssp_logical_bitwise_select_SSE2( a, b, mask );
00724     return a;
00725 }
00726 
00728 SSP_FORCEINLINE
00729 __m128i ssp_max_epi8_SSE2( __m128i a, __m128i b )
00730 {
00731     __m128i mask  = _mm_cmpgt_epi8( a, b );                             // FFFFFFFF where a > b
00732     a = ssp_logical_bitwise_select_SSE2( a, b, mask );
00733     return a;
00734 }
00735 
00737 SSP_FORCEINLINE
00738 __m128i ssp_min_epu16_SSE2( __m128i a, __m128i b )
00739 {
00740     __m128i mask = ssp_comlt_epu16_SSE2( a, b );
00741     a = ssp_logical_bitwise_select_SSE2( a, b, mask );
00742     return a;
00743 }
00744 
00746 SSP_FORCEINLINE
00747 __m128i ssp_max_epu16_SSE2( __m128i a, __m128i b )
00748 {
00749     __m128i mask = ssp_comgt_epu16_SSE2( a, b );
00750     a = ssp_logical_bitwise_select_SSE2( a, b, mask );
00751     return a;
00752 }
00753 
00755 SSP_FORCEINLINE
00756 __m128i ssp_min_epi32_SSE2( __m128i a, __m128i b )
00757 {
00758     __m128i mask  = _mm_cmplt_epi32( a, b );                            // FFFFFFFF where a < b
00759     a = ssp_logical_bitwise_select_SSE2( a, b, mask );
00760     return a;
00761 }
00762 
00764 SSP_FORCEINLINE
00765 __m128i ssp_max_epi32_SSE2( __m128i a, __m128i b )
00766 {
00767     __m128i mask  = _mm_cmpgt_epi32( a, b );                            // FFFFFFFF where a > b
00768     a = ssp_logical_bitwise_select_SSE2( a, b, mask );
00769     return a;
00770 }
00771 
00773 SSP_FORCEINLINE
00774 __m128i ssp_min_epu32_SSE2 ( __m128i a, __m128i b )
00775 {
00776     __m128i mask = ssp_comlt_epu32_SSE2( a, b );
00777     a = ssp_logical_bitwise_select_SSE2( a, b, mask );
00778     return a;
00779 }
00780 
00782 SSP_FORCEINLINE
00783 __m128i ssp_max_epu32_SSE2 ( __m128i a, __m128i b )
00784 {
00785     __m128i mask = ssp_comgt_epu32_SSE2( a, b );
00786     a = ssp_logical_bitwise_select_SSE2( a, b, mask );
00787     return a;
00788 }
00789 
00790 
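// Multiplies unsigned bytes of a by signed bytes of b and adds adjacent 16-bit products with signed
// saturation. Odd bytes are isolated with 16-bit shifts; the even bytes of b are sign-extended by
// shifting the whole register left one byte and arithmetic-shifting each 16-bit lane back by 8.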
00799 SSP_FORCEINLINE __m128i ssp_maddubs_epi16_SSE2( __m128i a,  __m128i b)
00800 {
00801     const static __m128i EVEN_8 = SSP_CONST_SET_8I( 0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF);
00802     __m128i Aodd, Aeven, Beven, Bodd;
00803 
00804     // Convert the 8 bit inputs into 16 bits by dropping every other value
00805     Aodd  = _mm_srli_epi16( a, 8 );             // A is unsigned  
00806     Bodd  = _mm_srai_epi16( b, 8 );             // B is signed
00807 
00808     Aeven = _mm_and_si128 ( a, EVEN_8 );        // A is unsigned   
00809     Beven = _mm_slli_si128( b,     1  );        // B is signed
00810     Beven = _mm_srai_epi16( Beven, 8  );
00811 
00812     a = _mm_mullo_epi16( Aodd , Bodd  );        // Will always fit in lower 16
00813     b = _mm_mullo_epi16( Aeven, Beven );  
00814     a = _mm_adds_epi16 ( a, b );
00815     return a;
00816 }
00817 
00818 
00819 
00821 SSP_FORCEINLINE
00822 __m128i ssp_mpsadbw_epu8_SSE2 ( __m128i a, __m128i b, const int msk  ) // _mm_mpsadbw_epu8
00823 {
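    // Eight sums of absolute differences: bits 0-1 of msk select one 4-byte group of b (broadcast to
    // all lanes) and bit 2 selects the starting offset in a; each of the four output pairs is
    // computed on 16-bit widened values with abs + horizontal add, then merged under a lane mask.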
00824     const static __m128i MASK_BITS04 = SSP_CONST_SET_16I( 0,0,0,0xFFFF,0,0,0,0xFFFF );
00825     const static __m128i MASK_BITS15 = SSP_CONST_SET_16I( 0,0,0xFFFF,0,0,0,0xFFFF,0 );
00826     const static __m128i MASK_BITS26 = SSP_CONST_SET_16I( 0,0xFFFF,0,0,0,0xFFFF,0,0 );
00827     const static __m128i MASK_BITS37 = SSP_CONST_SET_16I( 0xFFFF,0,0,0,0xFFFF,0,0,0 );
00828 
00829     ssp_m128 A,B,A16,tmp,out;
00830     A.i = a;
00831     B.i = b;
00832 
00833     switch( msk & 0x4 )         // Possible values: 0, 4
00834     {
00835     case 4: A.i = _mm_srli_si128( A.i, 4 );
00836     }
00837 
00838     switch( (msk & 0x3) * 4 )   // Possible values: 0, 4, 8, 12
00839     {
00840     case 0:     B.i = _mm_shuffle_epi32( B.i, _MM_SHUFFLE(0,0,0,0) ); break;
00841     case 4:     B.i = _mm_shuffle_epi32( B.i, _MM_SHUFFLE(1,1,1,1) ); break;
00842     case 8:     B.i = _mm_shuffle_epi32( B.i, _MM_SHUFFLE(2,2,2,2) ); break;
00843     case 12:    B.i = _mm_shuffle_epi32( B.i, _MM_SHUFFLE(3,3,3,3) ); break;
00844     //default: ASSERT( false );
00845     }
00846 
00847     // out[0,4]
00848     B.i   = _mm_unpacklo_epi8( B.i, _mm_setzero_si128() );          // 1 2 3 4 | 1 2 3 4
00849     A16.i = _mm_unpacklo_epi8( A.i, _mm_setzero_si128() );          // a b c d | e f g h
00850     tmp.i = _mm_subs_epi16                 ( A16.i, B.i );          // a-1,b-2,c-3,d-4 | e-1,f-2,g-3,h-4
00851     tmp.i = ssp_abs_epi16_SSE2             ( tmp.i    );            // abs(a-1),abs(b-2),...,abs(h-4) | ...
00852     tmp.i = ssp_arithmetic_hadd4_epi16_SSE2( tmp.i, 0 );            // x,x,x,abs(a-1)+abs(b-2)+abs(c-3)+abs(d-4) | ...
00853     tmp.i = _mm_and_si128                  ( tmp.i, MASK_BITS04 );  // 0,0,0,abs(a-1)+abs(b-2)+abs(c-3)+abs(d-4) | ...
00854     out.i = tmp.i;
00855 
00856     // out[1,5]
00857     A16.i = _mm_srli_si128   ( A.i, 1 );
00858     A16.i = _mm_unpacklo_epi8( A16.i, _mm_setzero_si128() );        // b c | d e | f g | h i
00859     tmp.i = _mm_subs_epi16                 ( A16.i, B.i );
00860     tmp.i = ssp_abs_epi16_SSE2             ( tmp.i    );
00861     tmp.i = ssp_arithmetic_hadd4_epi16_SSE2( tmp.i, 1 );
00862     tmp.i = _mm_and_si128                  ( tmp.i, MASK_BITS15 );
00863     out.i = _mm_or_si128                   ( out.i, tmp.i );
00864 
00865     // out[2,6]
00866     A16.i = _mm_srli_si128   ( A.i, 2 );
00867     A16.i = _mm_unpacklo_epi8( A16.i, _mm_setzero_si128() );        // c d | e f | g h | i j
00868     tmp.i = _mm_subs_epi16                 ( A16.i, B.i );
00869     tmp.i = ssp_abs_epi16_SSE2             ( tmp.i    );
00870     tmp.i = ssp_arithmetic_hadd4_epi16_SSE2( tmp.i, 2 );
00871     tmp.i = _mm_and_si128                  ( tmp.i, MASK_BITS26 );
00872     out.i = _mm_or_si128                   ( out.i, tmp.i );
00873 
00874     // out[3,7]
00875     A16.i = _mm_srli_si128   ( A.i, 3 );
00876     A16.i = _mm_unpacklo_epi8( A16.i, _mm_setzero_si128() );        // d e | f g | h i | j k
00877     tmp.i = _mm_subs_epi16                 ( A16.i, B.i );
00878     tmp.i = ssp_abs_epi16_SSE2             ( tmp.i    );
00879     tmp.i = ssp_arithmetic_hadd4_epi16_SSE2( tmp.i, 3 );
00880     tmp.i = _mm_and_si128                  ( tmp.i, MASK_BITS37 );
00881     out.i = _mm_or_si128                   ( out.i, tmp.i );
00882 
00883     return out.i;
00884 }
00885 
00886 
00887 //---------------------------------------
00888 // Dot Product
00889 //---------------------------------------
00890 
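// Both dot-product emulations expand the immediate mask without branches: the mask is broadcast,
// per-lane 16-bit multiplies move the relevant mask bits into each lane's sign position, and a
// signed compare against zero turns them into full-lane masks. The upper mask bits (imm[5:4] for
// pd, imm[7:4] for ps) gate the inputs before the multiply; the lower bits gate the broadcast sum.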
00892 SSP_FORCEINLINE
00893 __m128d ssp_dp_pd_SSE2( __m128d a, __m128d b, const int mask )               // _mm_dp_pd [SSE4,  cycles]
00894 {
00895     int smallMask = (mask & 0x33)<<16;
00896     const static __m128i mulShiftImm_01 = SSP_CONST_SET_32I( 0x40000000, 0x40000000, 0x80000000, 0x80000000 );   // Shift mask multiply moves 0,1, bits to left, becomes MSB
00897     const static __m128i mulShiftImm_45 = SSP_CONST_SET_32I( 0x04000000, 0x04000000, 0x08000000, 0x08000000 );   // Shift mask multiply moves 4,5, bits to left, becomes MSB
00898     ssp_m128 mHi, mLo;
00899 
00900     mLo.i = _mm_set1_epi32( smallMask );// Load the mask into register
00901     mHi.i = _mm_mullo_epi16( mLo.i, mulShiftImm_45 );       // Move imm bits 4,5 (multiply mask) to each lane's MSB
00902     mLo.i = _mm_mullo_epi16( mLo.i, mulShiftImm_01 );       // Move imm bits 0,1 (broadcast mask) to each lane's MSB
00903 
00904     mHi.i = _mm_cmplt_epi32( mHi.i, _mm_setzero_si128() );  // FFFFFFFF if bit set, 00000000 if not set
00905     mLo.i = _mm_cmplt_epi32( mLo.i, _mm_setzero_si128() );  // FFFFFFFF if bit set, 00000000 if not set
00906 
00907     a = _mm_and_pd( a, mHi.d );                                     // Clear input using the high bits of the mask
00908     a = _mm_mul_pd( a, b );
00909 
00910     b = _mm_shuffle_pd( a, a, _MM_SHUFFLE2(0, 1) );                 // Swap the two doubles so that b = { a[1], a[0] }
00911     a = _mm_add_pd( a, b );                                         // Horizontally add the 2 values
00912     a = _mm_and_pd( a, mLo.d );                                     // Clear output using low bits of the mask
00913     return a;
00914 }
00915 
00917 SSP_FORCEINLINE
00918 __m128 ssp_dp_ps_SSE2( __m128 a, __m128 b, const int mask )                  // _mm_dp_ps() [SSE4, 28 cycles]
00919 {
00920     const static __m128i mulShiftImm_0123 = SSP_CONST_SET_32I( 0x010000, 0x020000, 0x040000, 0x080000 );   // Shift mask multiply moves 0,1,2,3 bits to left, becomes MSB
00921     const static __m128i mulShiftImm_4567 = SSP_CONST_SET_32I( 0x100000, 0x200000, 0x400000, 0x800000 );   // Shift mask multiply moves 4,5,6,7 bits to left, becomes MSB
00922 
00923     // Begin mask preparation
00924     ssp_m128 mHi, mLo;
00925     mLo.i = _mm_set1_epi32( mask );                                 // Load the mask into register
00926     mLo.i = _mm_slli_si128( mLo.i, 3 );                             // Shift into reach of the 16 bit multiply
00927 
00928     mHi.i = _mm_mullo_epi16( mLo.i, mulShiftImm_0123 );             // Shift the bits
00929     mLo.i = _mm_mullo_epi16( mLo.i, mulShiftImm_4567 );             // Shift the bits
00930 
00931     mHi.i = _mm_cmplt_epi32( mHi.i, _mm_setzero_si128() );          // FFFFFFFF if bit set, 00000000 if not set
00932     mLo.i = _mm_cmplt_epi32( mLo.i, _mm_setzero_si128() );          // FFFFFFFF if bit set, 00000000 if not set
00933     // End mask preparation - Mask bits 0-3 in mLo, 4-7 in mHi
00934 
00935     a = _mm_and_ps( a, mHi.f );                                     // Clear input using the high bits of the mask
00936     a = _mm_mul_ps( a, b );
00937 
00938     a = ssp_arithmetic_hadd4_dup_ps_SSE2( a );                      // Horizontally add the 4 values
00939     a = _mm_and_ps( a, mLo.f );                                     // Clear output using low bits of the mask
00940     return a;
00941 }
00942 
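// The SSE2 rounding emulations temporarily switch the MXCSR rounding mode, convert to 32-bit
// integers and back, then restore MXCSR; hence the #pragma warning inside each function about
// inputs outside the signed 32-bit integer range.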
00949 SSP_FORCEINLINE
00950 __m128 ssp_round_ps_SSE2( __m128  a, int iRoundMode )
00951 {
00952     #pragma message( "" WARN() "SSEPlus SSE2 rounding functions overflow if input outside 32 bit integer range" )
00953 
00954     enum ENUM_MXCSR
00955     {
00956         CSR_ROUND_TO_EVEN = 0x00001F80, //
00957         CSR_ROUND_DOWN    = 0x00003F80, //
00958         CSR_ROUND_UP      = 0x00005F80, //
00959         CSR_ROUND_TRUNC   = 0x00007F80, //(_mm_getcsr() & ~_mm_ROUND_MASK) | _mm_ROUND_TOWARD_ZERO;
00960     }; 
00961 
00962     ssp_u32 bak = _mm_getcsr();
00963     ssp_m128 A, i;
00964     A.f = a;
00965 
00966     switch( iRoundMode & 0x3 )
00967     {
00968     case SSP_FROUND_CUR_DIRECTION:                                      break;
00969     case SSP_FROUND_TO_ZERO:            _mm_setcsr( CSR_ROUND_TRUNC  ); break;
00970     case SSP_FROUND_TO_POS_INF:         _mm_setcsr( CSR_ROUND_UP     ); break;
00971     case SSP_FROUND_TO_NEG_INF:         _mm_setcsr( CSR_ROUND_DOWN   ); break;
00972     default:                            _mm_setcsr( CSR_ROUND_TO_EVEN); break;
00973     }
00974     
00975     i.i    = _mm_cvtps_epi32( A.f );    // Convert to integer
00976     A.f    = _mm_cvtepi32_ps( i.i );    // Convert back to float
00977 
00978     i.u32[0] = bak;                     // Workaround for a bug in the MSVC compiler. MSVC was hoisting the mxcsr restore above the converts. 
00979     _mm_setcsr( i.u32[0] );
00980     return A.f;
00981 }
00982 
00983 
00984 SSP_FORCEINLINE
00985 __m128d ssp_round_pd_SSE2( __m128d  a, int iRoundMode )
00986 {
00987     #pragma message( "" WARN() "SSEPlus SSE2 rounding functions overflow if input outside 32 bit integer range" )
00988 
00989     enum ENUM_MXCSR
00990     {
00991         CSR_ROUND_TO_EVEN = 0x00001F80, //
00992         CSR_ROUND_DOWN    = 0x00003F80, //
00993         CSR_ROUND_UP      = 0x00005F80, //
00994         CSR_ROUND_TRUNC   = 0x00007F80, //(_mm_getcsr() & ~_mm_ROUND_MASK) | _mm_ROUND_TOWARD_ZERO;
00995     }; 
00996 
00997     ssp_u32 bak = _mm_getcsr();
00998     ssp_m128 A, i;
00999     A.d = a;
01000     
01001 
01002     switch( iRoundMode & 0x3 )
01003     {
01004     case SSP_FROUND_CUR_DIRECTION:                                      break;
01005     case SSP_FROUND_TO_ZERO:            _mm_setcsr( CSR_ROUND_TRUNC  ); break;
01006     case SSP_FROUND_TO_POS_INF:         _mm_setcsr( CSR_ROUND_UP     ); break;
01007     case SSP_FROUND_TO_NEG_INF:         _mm_setcsr( CSR_ROUND_DOWN   ); break;
01008     default:                            _mm_setcsr( CSR_ROUND_TO_EVEN); break;
01009     }
01010     
01011     i.i    = _mm_cvtpd_epi32( A.d );    // Convert to integer
01012     A.d    = _mm_cvtepi32_pd( i.i );    // Convert back to float
01013 
01014     i.u32[0] = bak;                     // Workaround for a bug in the MSVC compiler. MSVC was hoisting the mxcsr restore above the converts. 
01015     _mm_setcsr( i.u32[0] );             
01016     return A.d;
01017 }
01018 
01020 SSP_FORCEINLINE
01021 __m128 ssp_round_ss_SSE2( __m128  a, __m128  b, int iRoundMode )
01022 {
01023         //The commented-out code below generates a linker error on x64 platforms
01024     //ssp_m128 A,B;
01025     //A.f = a;
01026     //B.f = ssp_round_ps_SSE2( b, iRoundMode );
01027 
01028     //A.f = _mm_move_ss( A.f, B.f );
01029 
01031         //return A.f;
01032         b = ssp_round_ps_SSE2(b, iRoundMode);               // b now holds rounded values in all four lanes
01033         b =    _mm_shuffle_ps(b, a, _MM_SHUFFLE(1,1,0,0));  
01034     return _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,2,0)); 
01035 }
01036 
01038 SSP_FORCEINLINE
01039 __m128 ssp_ceil_ps_SSE2( __m128 a )
01040 {
01041     return ssp_round_ps_SSE2( a, SSP_FROUND_TO_POS_INF );
01042 }
01043 
01045 SSP_FORCEINLINE
01046 __m128 ssp_floor_ps_SSE2( __m128 a )
01047 {
01048     return ssp_round_ps_SSE2( a, SSP_FROUND_TO_NEG_INF );
01049 }
01050 
01052 SSP_FORCEINLINE
01053 __m128d ssp_floor_pd_SSE2( __m128d a )
01054 {
01055     return ssp_round_pd_SSE2( a, SSP_FROUND_TO_NEG_INF );
01056 }
01057 
01059 SSP_FORCEINLINE
01060 __m128d ssp_ceil_pd_SSE2( __m128d a )
01061 {
01062     return ssp_round_pd_SSE2( a, SSP_FROUND_TO_POS_INF );
01063 }
01064 
01066 SSP_FORCEINLINE __m128d ssp_floor_sd_SSE2( __m128d a, __m128d b)                              
01067 {
01068         b = ssp_round_pd_SSE2(b, SSP_FROUND_TO_NEG_INF );
01069 
01070     return _mm_shuffle_pd(b, a, _MM_SHUFFLE2(1,0));
01071 }
01072 
01074 SSP_FORCEINLINE __m128d ssp_ceil_sd_SSE2( __m128d a, __m128d b)                              
01075 {
01076         b = ssp_round_pd_SSE2(b, SSP_FROUND_TO_POS_INF );
01077 
01078     return _mm_shuffle_pd(b, a, _MM_SHUFFLE2(1,0));
01079 }
01080 
01082 SSP_FORCEINLINE __m128 ssp_floor_ss_SSE2( __m128 a, __m128 b)                              
01083 {
01084         b = ssp_round_ps_SSE2(b, SSP_FROUND_TO_NEG_INF );
01085         b = _mm_shuffle_ps(b, a, _MM_SHUFFLE(1,1,0,0));
01086     return _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,2,0));
01087 }
01088 
01090 SSP_FORCEINLINE __m128 ssp_ceil_ss_SSE2( __m128 a, __m128 b)                              
01091 {
01092         b = ssp_round_ps_SSE2(b, SSP_FROUND_TO_POS_INF );
01093         b = _mm_shuffle_ps(b, a, _MM_SHUFFLE(1,1,0,0));
01094     return _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,2,0));
01095 }
01096 
01097 
01098 //SSP_FORCEINLINE
01099 //__m128i _mm_mul_epi32( __m128i a, __m128i b ) //_mm_mul_epi32
01100 //{
01101 //    //ssp_m128 sign;
01102 //
01103 //    //sign.i = ::_mm_xor_si128( a, b );
01104 //
01105 //    //__m128i t1;
01106 //
01107 //    a = _mm_min_epi32( a, b );
01108 //
01109 //    //a = _mm_slli_si128( a, 4 );
01110 //    //b = _mm_slli_si128( b, 4 );
01111 //
01112 //    //a = _mm_mul_epu32( a, b );
01113 //
01114 //    //a = _mm_slli_si128( a, 4 );
01115 //
01116 //    return a;
01117 //}
01118 
01119 
01120 //type conversion
01121 
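// The sign-extending conversions below unpack against an all-ones register so the widened upper
// bytes start out as 0xFF, then add a power-of-two correction (0x100, 0x10000, ...) for
// non-negative values so the carry clears that fill; the epi8->epi16 variant instead zero-extends
// and adds 0xFF00 where the sign bit is set. The cvtepu* versions simply unpack against zero.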
01123 SSP_FORCEINLINE
01124 __m128i ssp_cvtepi8_epi16_SSE2 ( __m128i a)
01125 {
01126         __m128i b = _mm_setzero_si128 ();
01127         __m128i c = _mm_unpacklo_epi8(a, b);
01128         __m128i d = _mm_set1_epi16 (128);
01129 
01130         b = _mm_and_si128(d, c);
01131         d = _mm_set1_epi16(0x1FE);
01132         b = _mm_mullo_epi16(b, d);
01133 
01134         return _mm_add_epi16(c, b);
01135 
01136         //Another way, slower
01137         //__m128i b = _mm_set1_epi32 (-1);                              //0xFFFFFFFF
01138         //__m128i c = _mm_unpacklo_epi8(a, b);                  //FFa0FFa1....
01139         //__m128i d = _mm_set1_epi16 (128);                             //0x80
01140         //b = _mm_andnot_si128(c, d);                                   // 0x80 for positive, 0x00 for negative
01141         //d = _mm_slli_epi16(b, 1);                                             // 0x100 for positive, 0x000 for negative
01142         //return _mm_add_epi16(c, d);
01143 }
01144 
01146 SSP_FORCEINLINE
01147 __m128i ssp_cvtepi8_epi32_SSE2 ( __m128i a)
01148 {
01149         __m128i b = _mm_set1_epi32 (-1);                                //0xFFFFFFFF
01150         __m128i c = _mm_unpacklo_epi8(a, b);                    //FFa0FFa1....
01151         __m128i d = _mm_set1_epi32 (128);                               //0x80
01152 
01153         c = _mm_unpacklo_epi16(c, b);                                   //FFFFFFa0FFFFFFa1...
01154         b = _mm_andnot_si128(c, d);                                             // 0x80 for positive, 0x00 for negative
01155         d = _mm_slli_epi32(b, 1);                                               // 0x100 for positive, 0x000 for negative
01156 
01157         return _mm_add_epi32(c, d);
01158 }
01159 
01161 SSP_FORCEINLINE
01162 __m128i ssp_cvtepi8_epi64_SSE2 ( __m128i a)
01163 {
01164         __m128i b = _mm_set1_epi32 (-1);                                //0xFFFFFFFF
01165         __m128i c = _mm_unpacklo_epi8(a, b);                    //FFa0FFa1....
01166         __m128i d = _mm_set_epi32 (0, 128, 0, 128);             //0x80
01167 
01168         c = _mm_unpacklo_epi16(c, b);                                   //FFFFFFa0FFFFFFa1...
01169         c = _mm_unpacklo_epi32(c, b);                                   //FFFFFFFFFFFFFFa0...
01170         b = _mm_andnot_si128(c, d);                                             // 0x80 for positive, 0x00 for negative
01171         d = _mm_slli_epi64(b, 1);                                               // 0x100 for positive, 0x000 for negative
01172 
01173         return _mm_add_epi64(c, d);
01174 }
01175 
01177 SSP_FORCEINLINE
01178 __m128i ssp_cvtepi16_epi32_SSE2 ( __m128i a)
01179 {
01180         __m128i b = _mm_set1_epi32 (-1);                                //0xFFFFFFFF
01181         __m128i c = _mm_unpacklo_epi16(a, b);                   //FFFFa0**FFFFa1**....
01182         __m128i d = _mm_set1_epi32 (0x8000);                    //0x8000
01183 
01184         b = _mm_andnot_si128(c, d);                                             // 0x80 for positive, 0x00 for negative
01185         d = _mm_slli_epi32(b, 1);                                               // 0x100 for positive, 0x000 for negative
01186 
01187         return _mm_add_epi32(c, d);
01188 }
01189 
01191 SSP_FORCEINLINE
01192 __m128i ssp_cvtepi16_epi64_SSE2 ( __m128i a)
01193 {
01194         __m128i b = _mm_set1_epi32 (-1);                                //0xFFFFFFFF
01195         __m128i c = _mm_unpacklo_epi16(a, b);                   //FFFFa0**FFFFa1**....
01196         __m128i d = _mm_set_epi32(0,0x8000, 0,0x8000);  //0x8000
01197 
01198         c = _mm_unpacklo_epi32(c, b);                                   //FFFFFFFFFFFFFFa0...
01199         b = _mm_andnot_si128(c, d);                                             // 0x80 for positive, 0x00 for negative
01200         d = _mm_slli_epi64(b, 1);                                               // 0x100 for positive, 0x000 for negative
01201 
01202         return _mm_add_epi64(c, d);
01203 }
01204 
01206 SSP_FORCEINLINE
01207 __m128i ssp_cvtepi32_epi64_SSE2 ( __m128i a)
01208 {
01209         __m128i b = _mm_set1_epi32 (-1);                                //0xFFFFFFFF
01210         __m128i c = _mm_unpacklo_epi32(a, b);                   //FFFFFFFFa0******FFFFFFFFa1******....
01211         __m128i d = _mm_set_epi32(0, 0x80000000,0,0x80000000);  //0x80000000
01212 
01213         b = _mm_andnot_si128(c, d);                                             // 0x80 for positive, 0x00 for negative
01214         d = _mm_slli_epi64(b, 1);                                               // 0x100 for positive, 0x000 for negative
01215 
01216         return _mm_add_epi64(c, d);
01217 }
01218 
01220 SSP_FORCEINLINE
01221 __m128i ssp_cvtepu8_epi16_SSE2 ( __m128i a)
01222 {
01223         __m128i b =_mm_setzero_si128 ();
01224 
01225         return _mm_unpacklo_epi8(a, b);
01226 }
01227 
01229 SSP_FORCEINLINE
01230 __m128i ssp_cvtepu8_epi32_SSE2 ( __m128i a)
01231 {
01232         __m128i b = _mm_setzero_si128 ();
01233 
01234         a = _mm_unpacklo_epi8(a, b);
01235 
01236         return _mm_unpacklo_epi16(a, b);
01237 }
01238 
01240 SSP_FORCEINLINE
01241 __m128i ssp_cvtepu8_epi64_SSE2 ( __m128i a)
01242 {
01243         __m128i b = _mm_setzero_si128 ();
01244 
01245         a = _mm_unpacklo_epi8(a, b);
01246 
01247         a = _mm_unpacklo_epi16(a, b);
01248 
01249         return _mm_unpacklo_epi32(a, b);
01250 }
01251 
01253 SSP_FORCEINLINE
01254 __m128i ssp_cvtepu16_epi32_SSE2 ( __m128i a)
01255 {
01256         __m128i b = _mm_setzero_si128 ();
01257 
01258         return _mm_unpacklo_epi16(a, b);
01259 }
01260 
01262 SSP_FORCEINLINE
01263 __m128i ssp_cvtepu16_epi64_SSE2 ( __m128i a)
01264 {
01265         __m128i b = _mm_setzero_si128 ();
01266 
01267         a = _mm_unpacklo_epi16(a, b);
01268 
01269         return _mm_unpacklo_epi32(a, b);
01270 }
01271 
01273 SSP_FORCEINLINE
01274 __m128i ssp_cvtepu32_epi64_SSE2 ( __m128i a)
01275 {
01276         __m128i b = _mm_setzero_si128 ();
01277 
01278         return _mm_unpacklo_epi32(a, b);
01279 }
01280 
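// Unsigned saturating pack built from the signed one: bias both inputs down by 0x8000, pack with
// signed saturation, then add the bias back in 16-bit lanes.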
01282 SSP_FORCEINLINE
01283 __m128i ssp_packus_epi32_SSE2( __m128i a, __m128i b )
01284 {
01285     const static __m128i val_32 = SSP_CONST_SET_32I(  0x8000, 0x8000, 0x8000, 0x8000 );
01286     const static __m128i val_16 = SSP_CONST_SET_16I(  0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000 );
01287 
01288     a = _mm_sub_epi32( a, val_32 );
01289     b = _mm_sub_epi32( b, val_32 );
01290     a = _mm_packs_epi32( a, b );
01291     a = _mm_add_epi16( a, val_16 );
01292     return a;
01293 }
01294 
01295 //SSSE3
01296 // bit manipulation
01297 //__m128i _mm_alignr_epi8(__m128i a, __m128i b, const int ralign);
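// _mm_srli_si128/_mm_slli_si128 require immediate shift counts, so every byte count 0-31 gets its
// own case; counts of 16-31 return bytes drawn from a only, anything larger returns zero, and a
// negative count falls back to returning b.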
01299 SSP_FORCEINLINE
01300 __m128i ssp_alignr_epi8_SSE2 (__m128i a, __m128i b, const int ralign)
01301 {
01302         if (ralign < 0)  return b; //only right shifts are defined; a negative count just returns b
01303         //if (ralign > 32) return _mm_setzero_si128();
01304         //
01305         //if (ralign > 16) return _mm_srli_si128(a, ralign-16);
01306 
01307         //b = _mm_srli_si128(b, ralign);
01308         //a = _mm_slli_si128(a, 16-ralign);
01309         switch (ralign) {
01310         case 0: 
01311                 return b;                       
01312         case 1: 
01313                 b = _mm_srli_si128(b, 1);       
01314                 a = _mm_slli_si128(a, 15);          
01315                 return _mm_or_si128( a, b );    
01316         case 2: 
01317                 b = _mm_srli_si128(b, 2);       
01318                 a = _mm_slli_si128(a, 14);          
01319                 return _mm_or_si128( a, b );    
01320         case 3: 
01321                 b = _mm_srli_si128(b, 3);       
01322                 a = _mm_slli_si128(a, 13);          
01323                 return _mm_or_si128( a, b );    
01324         case 4:                                         
01325                 b = _mm_srli_si128(b, 4);           
01326                 a = _mm_slli_si128(a, 12);          
01327                 return _mm_or_si128( a, b );    
01328         case 5:                                         
01329                 b = _mm_srli_si128(b, 5);           
01330                 a = _mm_slli_si128(a, 11);          
01331                 return _mm_or_si128( a, b );    
01332         case 6:                                         
01333                 b = _mm_srli_si128(b, 6);           
01334                  a = _mm_slli_si128(a, 10);         
01335                 return _mm_or_si128( a, b );    
01336         case 7:                                         
01337                 b = _mm_srli_si128(b, 7);           
01338                 a = _mm_slli_si128(a, 9);           
01339                 return _mm_or_si128( a, b );    
01340         case 8:                                         
01341                 b = _mm_srli_si128(b, 8);           
01342                 a = _mm_slli_si128(a, 8);           
01343                 return _mm_or_si128( a, b );    
01344         case 9:                                         
01345                 b = _mm_srli_si128(b, 9);           
01346                 a = _mm_slli_si128(a, 7);           
01347                 return _mm_or_si128( a, b );    
01348         case 10:                                            
01349                 b = _mm_srli_si128(b, 10);          
01350                 a = _mm_slli_si128(a,  6);          
01351                 return _mm_or_si128( a, b );    
01352         case 11:                                            
01353                 b = _mm_srli_si128(b, 11);      
01354                 a = _mm_slli_si128(a,  5);      
01355                 return _mm_or_si128( a, b );    
01356         case 12:                                            
01357                 b = _mm_srli_si128(b, 12);      
01358                 a = _mm_slli_si128(a,  4);      
01359                 return _mm_or_si128( a, b );    
01360         case 13:                                            
01361                 b = _mm_srli_si128(b, 13);      
01362                 a = _mm_slli_si128(a,  3);      
01363                 return _mm_or_si128( a, b );    
01364         case 14:                                            
01365                 b = _mm_srli_si128(b, 14);          
01366                 a = _mm_slli_si128(a,  2);      
01367                 return _mm_or_si128( a, b );    
01368         case 15:                                            
01369                 b = _mm_srli_si128(b, 15);          
01370                 a = _mm_slli_si128(a,  1);          
01371                 return _mm_or_si128( a, b );    
01372         case 16:                            
01373                 return a;                       
01374         case 17:                            
01375                     a    = _mm_slli_si128(a,  1);   
01376                     return _mm_srli_si128(a,  1);   
01377         case 18:                            
01378                     a    = _mm_slli_si128(a,  2);   
01379                     return _mm_srli_si128(a,  2);   
01380         case 19:                            
01381                     a    = _mm_slli_si128(a,  3);   
01382                     return _mm_srli_si128(a,  3);   
01383         case 20:                            
01384                     a    = _mm_slli_si128(a,  4);   
01385                     return _mm_srli_si128(a,  4);   
01386         case 21:                            
01387                     a    = _mm_slli_si128(a,  5);   
01388                     return _mm_srli_si128(a,  5);   
01389         case 22:                            
01390                     a    = _mm_slli_si128(a,  6);   
01391                     return _mm_srli_si128(a,  6);   
01392         case 23:                            
01393                     a    = _mm_slli_si128(a,  7);   
01394                     return _mm_srli_si128(a,  7);   
01395         case 24:                            
01396                     a    = _mm_slli_si128(a,  8);   
01397                     return _mm_srli_si128(a,  8);   
01398         case 25:                            
01399                     a    = _mm_slli_si128(a,  9);   
01400                     return _mm_srli_si128(a,  9);   
01401         case 26:                            
01402                     a    = _mm_slli_si128(a, 10);   
01403                     return _mm_srli_si128(a, 10);   
01404         case 27:                            
01405                     a    = _mm_slli_si128(a, 11);   
01406                     return _mm_srli_si128(a, 11);   
01407         case 28:                            
01408                     a    = _mm_slli_si128(a, 12);   
01409                     return _mm_srli_si128(a, 12);   
01410         case 29:                            
01411                     a    = _mm_slli_si128(a, 13);   
01412                     return _mm_srli_si128(a, 13);   
01413         case 30:                            
01414                     a    = _mm_slli_si128(a, 14);   
01415                     return _mm_srli_si128(a, 14);   
01416         case 31:                            
01417                     a    = _mm_slli_si128(a, 15);   
01418                     return _mm_srli_si128(a, 15);   
01419         default:                            
01420                     return _mm_setzero_si128(); 
01421         }
01422 }
01423 
01424 //---------------------------------------
01425 //Insert
01426 //---------------------------------------
01428 SSP_FORCEINLINE __m128i ssp_insert_epi8_SSE2( __m128i a, int b, const int ndx )
01429 {
01430     ssp_m128 Ahi, Alo;
01431     b = b & 0xFF;                                           /* Convert to 8-bit integer */
01432     Ahi.i = _mm_unpackhi_epi8( a, _mm_setzero_si128() );    /* Ahi = a_8[8:15]  Simulate 8bit integers as 16-bit integers */
01433     Alo.i = _mm_unpacklo_epi8( a, _mm_setzero_si128() );    /* Alo = a_8[0:7]   Simulate 8bit integers as 16-bit integers */
01434 
01435     /* Insert b as a 16-bit integer to upper or lower half of a */
01436     switch( ndx & 0xF )
01437     {
01438     case 0:  Alo.i = _mm_insert_epi16( Alo.i, b, 0 ); break;
01439     case 1:  Alo.i = _mm_insert_epi16( Alo.i, b, 1 ); break;
01440     case 2:  Alo.i = _mm_insert_epi16( Alo.i, b, 2 ); break;
01441     case 3:  Alo.i = _mm_insert_epi16( Alo.i, b, 3 ); break;
01442     case 4:  Alo.i = _mm_insert_epi16( Alo.i, b, 4 ); break;
01443     case 5:  Alo.i = _mm_insert_epi16( Alo.i, b, 5 ); break;
01444     case 6:  Alo.i = _mm_insert_epi16( Alo.i, b, 6 ); break;
01445     case 7:  Alo.i = _mm_insert_epi16( Alo.i, b, 7 ); break;
01446     case 8:  Ahi.i = _mm_insert_epi16( Ahi.i, b, 0 ); break;
01447     case 9:  Ahi.i = _mm_insert_epi16( Ahi.i, b, 1 ); break;
01448     case 10: Ahi.i = _mm_insert_epi16( Ahi.i, b, 2 ); break;
01449     case 11: Ahi.i = _mm_insert_epi16( Ahi.i, b, 3 ); break;
01450     case 12: Ahi.i = _mm_insert_epi16( Ahi.i, b, 4 ); break;
01451     case 13: Ahi.i = _mm_insert_epi16( Ahi.i, b, 5 ); break;
01452     case 14: Ahi.i = _mm_insert_epi16( Ahi.i, b, 6 ); break;
01453     default: Ahi.i = _mm_insert_epi16( Ahi.i, b, 7 );
01454     }
01455     return _mm_packus_epi16( Alo.i, Ahi.i ); // Pack the 16-bit integers to 8bit again.
01456 
01458     //ssp_m128 A, B, mask;
01459     //mask.i = _mm_setzero_si128();
01460     //mask.s8[ ndx & 0x0F ] = (ssp_s8)0xFF;
01461     //B.i    = _mm_set1_epi8( (ssp_s8)b );
01462     //A.i    = _mm_andnot_si128( mask.i, a );
01463     //mask.i = _mm_and_si128( mask.i, B.i );
01464     //A.i = _mm_or_si128( A.i, mask.i );
01465     //return A.i;
01466 }
01468 SSP_FORCEINLINE __m128i ssp_inserti_si64_SSE2( __m128i a, __m128i b, int len, int ndx )
01469 {
01470     const static __m128i MASK = SSP_CONST_SET_32I( 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF );
01471 
01472     int left = ndx + len;
01473     __m128i m;
01474     m = _mm_slli_epi64( MASK, 64-left );    // clear the mask to the left
01475     m = _mm_srli_epi64( m,    64-len  );    // clear the mask to the right
01476     m = _mm_slli_epi64( m,    ndx     );    // put the mask into the proper position
01477     b = _mm_slli_epi64( b,    ndx     );    // put the insert bits into the proper position
01478 
01479     a = ssp_logical_bitwise_select_SSE2( b, a, m );
01480     return a;
01481 }
01482 
01483 
01485 SSP_FORCEINLINE __m128i ssp_insert_si64_SSE2( __m128i a, __m128i b )
01486 {
01487     ssp_u32 ndx, len;
01488     ssp_m128 B;
01489     B.i = b;
01490 
01491     ndx = (ssp_u32)((B.u64[1] & 0x3F00) >> 8);    // Mask ndx field.
01492     len = (ssp_u32)((B.u64[1] & 0x003F));         // Mask len field.
01493 
01494     a = ssp_inserti_si64_SSE2( a, b, len, ndx );
01495     return a;
01496 }
01497 
01498 //---------------------------------------
01499 //Extract
01500 //---------------------------------------
01501 
01503 SSP_FORCEINLINE int ssp_extract_epi8_SSE2( __m128i a, const int ndx )                       
01504 {
01505     ssp_m128 mask;
01506     switch( ndx & 0xF )
01507     {
01508     case 15:  a = _mm_srli_si128( a, 15 ); break;
01509     case 14:  a = _mm_srli_si128( a, 14 ); break;
01510     case 13:  a = _mm_srli_si128( a, 13 ); break;
01511     case 12:  a = _mm_srli_si128( a, 12 ); break;
01512     case 11:  a = _mm_srli_si128( a, 11 ); break;
01513     case 10:  a = _mm_srli_si128( a, 10 ); break;
01514     case 9:   a = _mm_srli_si128( a,  9 ); break;
01515     case 8:   a = _mm_srli_si128( a,  8 ); break;
01516     case 7:   a = _mm_srli_si128( a,  7 ); break;
01517     case 6:   a = _mm_srli_si128( a,  6 ); break;
01518     case 5:   a = _mm_srli_si128( a,  5 ); break;
01519     case 4:   a = _mm_srli_si128( a,  4 ); break;
01520     case 3:   a = _mm_srli_si128( a,  3 ); break;
01521     case 2:   a = _mm_srli_si128( a,  2 ); break;
01522     case 1:   a = _mm_srli_si128( a,  1 ); break;
01523     }
01524 
01525     mask.i = _mm_setr_epi8 ( -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ); 
01526     // mask = { 00,00,00,00,00,00,00,00,00,00,00,00,00,00,00,FF }
01527     mask.i = _mm_and_si128 ( mask.i, a   );
01528     return mask.s8[0];
01529 }
01530 
01532 SSP_FORCEINLINE int ssp_extract_epi32_SSE2( __m128i a, const int imm )                            
01533 {
01534     ssp_m128 mask;
01535     switch( imm & 0x3 )
01536     {
01537     case 3:  a = _mm_srli_si128( a, 12 ); break;
01538     case 2:  a = _mm_srli_si128( a, 8  ); break;
01539     case 1:  a = _mm_srli_si128( a, 4  ); break;
01540     }
01541 
01542     mask.i = _mm_set_epi32 ( 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF );
01543     mask.i = _mm_and_si128 ( mask.i, a   );
01544 
01545     return mask.s32[0];
01546 }
01547 
01549 SSP_FORCEINLINE int ssp_extract_ps_SSE2( __m128 a, const int ndx )                          
01550 {
01551     ssp_m128 A;
01552     A.f = a;
01553     return ssp_extract_epi32_SSE2( A.i, ndx );
01554 }
01555 
01557 SSP_FORCEINLINE ssp_s64 ssp_extract_epi64_SSE2( __m128i a, const int ndx )                  
01558 {
01559     ssp_m128 mask;
01560     switch( ndx & 0x1 )
01561     {
01562     case 1:  a = _mm_srli_si128( a, 8  ); break;
01563     }
01564 
01565     mask.i = _mm_set_epi32 ( 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF );
01566     mask.i = _mm_and_si128 ( mask.i, a   );
01567 
01568     return mask.s64[0];
01569 }
01570 
01574 SSP_FORCEINLINE __m128i ssp_extracti_si64_SSE2( __m128i a, int len, int ndx )   
01575 {
01576     int left = ndx + len;   
01577     a = _mm_slli_epi64( a, 64-left );    // clear the mask to the left
01578     a = _mm_srli_epi64( a, 64-len  );    // clear the mask to the right      
01579     return a;
01580 }
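// Illustrative usage sketch (added for clarity; not part of the original SSEPlus
// API): pulling an 8-bit field at bit offset 16 back out of the low quadword.
// The function name and constants are example values only.
SSP_FORCEINLINE __m128i ssp_example_extracti_si64( void )
{
    __m128i a = _mm_set_epi32( 0, 0, 0x00000000, 0xFFABFFFF ); // low 64 bits = 0x00000000FFABFFFF
    return ssp_extracti_si64_SSE2( a, 8, 16 );                 // low 64 bits -> 0x00000000000000AB
}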
01581 
01582 
01586 SSP_FORCEINLINE __m128i ssp_extract_si64_SSE2( __m128i a ,__m128i b )        
01587 {
01588     ssp_u32 len, ndx;   
01589     ssp_m128 B;
01590     B.i = b;
01591 
01592     ndx = (ssp_u32)((B.u64[0] & 0x3F00) >> 8);    // Mask ndx field.
01593     len = (ssp_u32)((B.u64[0] & 0x003F));         // Mask len field.
01594 
01595     a = ssp_extracti_si64_SSE2( a, len, ndx );   
01596     return a;
01597 }
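// Illustrative usage sketch (added for clarity; not part of the original SSEPlus
// API): the register form above decodes the field length from bits [5:0] and the
// bit index from bits [13:8] of b's low quadword, so the call below is equivalent
// to ssp_extracti_si64_SSE2( a, 8, 16 ).
SSP_FORCEINLINE __m128i ssp_example_extract_si64( __m128i a )
{
    __m128i b = _mm_set_epi32( 0, 0, 0, (16 << 8) | 8 );  // ndx = 16, len = 8
    return ssp_extract_si64_SSE2( a, b );
}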
01598 
01599 
01601 SSP_FORCEINLINE __m128i ssp_shuffle_epi8_SSE2 (__m128i a, __m128i mask)
01602 {  
01603     ssp_m128 A,B, MASK, maskZero;       
01604     A.i        = a;
01605     maskZero.i = ssp_comge_epi8_SSE2( mask, _mm_setzero_si128()        );    
01606     MASK.i     = _mm_and_si128      ( mask, _mm_set1_epi8( (char)0x0F) );
01607 
01608     B.s8[ 0] = A.s8[ (MASK.s8[ 0]) ];
01609     B.s8[ 1] = A.s8[ (MASK.s8[ 1]) ];
01610     B.s8[ 2] = A.s8[ (MASK.s8[ 2]) ];
01611     B.s8[ 3] = A.s8[ (MASK.s8[ 3]) ];
01612     B.s8[ 4] = A.s8[ (MASK.s8[ 4]) ];
01613     B.s8[ 5] = A.s8[ (MASK.s8[ 5]) ];
01614     B.s8[ 6] = A.s8[ (MASK.s8[ 6]) ];
01615     B.s8[ 7] = A.s8[ (MASK.s8[ 7]) ];
01616     B.s8[ 8] = A.s8[ (MASK.s8[ 8]) ];
01617     B.s8[ 9] = A.s8[ (MASK.s8[ 9]) ];
01618     B.s8[10] = A.s8[ (MASK.s8[10]) ];
01619     B.s8[11] = A.s8[ (MASK.s8[11]) ];
01620     B.s8[12] = A.s8[ (MASK.s8[12]) ];
01621     B.s8[13] = A.s8[ (MASK.s8[13]) ];
01622     B.s8[14] = A.s8[ (MASK.s8[14]) ];
01623     B.s8[15] = A.s8[ (MASK.s8[15]) ];
01624 
01625     B.i = _mm_and_si128( B.i, maskZero.i );
01626     return B.i;
01627 }
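// Illustrative usage sketch (added for clarity; not part of the original SSEPlus
// API): reversing the byte order of a vector. A mask byte with its sign bit set
// zeroes the corresponding result byte, as with PSHUFB.
SSP_FORCEINLINE __m128i ssp_example_byte_reverse( __m128i a )
{
    const __m128i rev = _mm_set_epi8(  0,  1,  2,  3,  4,  5,  6,  7,
                                       8,  9, 10, 11, 12, 13, 14, 15 );
    return ssp_shuffle_epi8_SSE2( a, rev );
}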
01628 
01629 
01631 SSP_FORCEINLINE 
01632 __m128i ssp_sign_epi8_SSE2 (__m128i a, __m128i b)
01633 {
01634     __m128i ap, an, c, d, zero, one;
01635 
01636     zero = _mm_setzero_si128();
01637     // Greater-than-zero part: keep a unchanged where b > 0
01638     d  = _mm_cmpgt_epi8(b, zero);
01639     ap = _mm_and_si128(a, d);
01640 
01641     // Less-than-zero part: negate a where b < 0
01642     c   = _mm_cmplt_epi8(b, zero);
01643     one = _mm_set1_epi8(1);
01644     an  = _mm_and_si128(a, c);   // select the elements that need to be negated
01645     an  = _mm_xor_si128(an, c);  // bitwise NOT of the selected elements ...
01646     one = _mm_and_si128(one, c);
01647     an  = _mm_add_epi8(an, one); // ... plus one gives the two's-complement negation
01648 
01649     return _mm_or_si128(an, ap); // equivalent to _mm_add_epi8(an, ap); the two parts never overlap
01650 }
01651 
01653 SSP_FORCEINLINE 
01654 __m128i ssp_sign_epi16_SSE2 (__m128i a, __m128i b)
01655 {
01656     __m128i c, d, zero;
01657 
01658     zero = _mm_setzero_si128();
01659     d    = _mm_cmpgt_epi16(b, zero);
01660     c    = _mm_cmplt_epi16(b, zero);
01661     d    = _mm_srli_epi16(d, 15);   // +1 where b > 0
01662     c    = _mm_or_si128(c, d);      // per element: -1 where b < 0, +1 where b > 0, 0 where b == 0
01663     a    = _mm_mullo_epi16(a, c);
01664 
01665     // The following method has the same performance:
01666     //zero = _mm_setzero_si128();
01667     //d    = _mm_cmpgt_epi16(b, zero);
01668     //c    = _mm_cmplt_epi16(b, zero);
01669     //one  = _mm_set1_epi16(1);
01670     //d    = _mm_and_si128(d, one);
01671     //c    = _mm_add_epi16(c, d);
01672     //a    = _mm_mullo_epi16(a, c);
01673 
01674     return a;
01675 }
01676 
01678 SSP_FORCEINLINE 
01679 __m128i ssp_sign_epi32_SSE2 (__m128i a, __m128i b)
01680 {
01681     __m128i ap, an, c, d, zero, one;
01682 
01683     zero = _mm_setzero_si128();
01684     // Greater-than-zero part: keep a unchanged where b > 0
01685     d  = _mm_cmpgt_epi32(b, zero);
01686     ap = _mm_and_si128(a, d);
01687 
01688     // Less-than-zero part: negate a where b < 0
01689     c   = _mm_cmplt_epi32(b, zero);
01690     one = _mm_set1_epi32(1);
01691     an  = _mm_and_si128(a, c);    // select the elements that need to be negated
01692     an  = _mm_xor_si128(an, c);   // bitwise NOT of the selected elements ...
01693     one = _mm_and_si128(one, c);
01694     an  = _mm_add_epi32(an, one); // ... plus one; _mm_add_epi8 here would drop the carry between bytes
01695 
01696     return _mm_or_si128(an, ap);
01697 }
01698 
01699 //---------------------------------------
01700 // Test
01701 //---------------------------------------
01703 SSP_FORCEINLINE int ssp_testc_si128_SSE2( __m128i a, __m128i b)                              
01704 {
01705     a = _mm_andnot_si128( a, b );        // CF is set when ((~a) & b) == 0; XOR here would only test a == b
01706     return ssp_testz_si128_SSE2( a, a );
01707 }
01708 
01710 SSP_FORCEINLINE 
01711 int ssp_testz_si128_SSE2( __m128i a, __m128i b)   // This is much faster in 64-bit builds
01712 {
01713     ssp_m128 t;
01714     t.i = _mm_and_si128  ( a, b );   
01715     t.i = _mm_packs_epi32( t.i, _mm_setzero_si128() );   
01716     return t.u64[0] == 0;
01717 }
01718 
01720 SSP_FORCEINLINE 
01721 int ssp_testnzc_si128_SSE2( __m128i a, __m128i b)                            
01722 {
01723     ssp_m128 zf, cf;    
01724 
01725     zf.i = _mm_and_si128  ( a, b );   
01726     zf.i = _mm_packs_epi32( zf.i, _mm_setzero_si128() ); 
01727  
01728     cf.i = _mm_andnot_si128( a, b );
01729     cf.i = _mm_packs_epi32( cf.i, _mm_setzero_si128() );  
01730 
01731     return ( !(zf.u64[0] == 0) && !(cf.u64[0] == 0));
01732 }
01733 
01734 
01735 //---------------------------------------
01736 // Move
01737 //---------------------------------------
01739 SSP_FORCEINLINE __m128 ssp_movehdup_ps_SSE2(__m128 a)                                   
01740 {
01741     ssp_m128 A;
01742     A.f = a;
01743     A.i = _mm_shuffle_epi32( A.i, _MM_SHUFFLE( 3, 3, 1, 1) );
01744     return A.f;
01745 }
01746 
01748 SSP_FORCEINLINE __m128 ssp_moveldup_ps_SSE2(__m128 a)                                   
01749 {
01750     ssp_m128 A;
01751     A.f = a;
01752     A.i = _mm_shuffle_epi32( A.i, _MM_SHUFFLE( 2, 2, 0, 0) );
01753     return A.f;
01754 }
01755 
01757 SSP_FORCEINLINE __m128d ssp_movedup_pd_SSE2(__m128d a)                                  
01758 {
01759     ssp_m128 A;
01760     A.d = a;
01761     return _mm_set_pd( A.f64[0], A.f64[0] );
01762 }
01763 
01765 SSP_FORCEINLINE __m128i ssp_rot_epi8_SSE2(__m128i a, __m128i b  )
01766 {
01767     int n;
01768     ssp_m128 A,B;
01769     A.i = a;
01770     B.i = b;
01771 
01772     for( n = 0; n < 16; n++ )
01773     {
01774       if( B.s8[n] < 0 )
01775       {
01776         unsigned int count = (-B.s8[n]) % 8;
01777         unsigned int carry_count = (8 - count) % 8;
01778         unsigned char carry = A.u8[n] << carry_count;
01779         A.u8[n] = A.u8[n] >> count;
01780         A.u8[n] = A.u8[n] | carry;
01781       }
01782       else
01783       {
01784         unsigned int count = B.s8[n] % 8;
01785         unsigned int carry_count = (8 - count) % 8;
01786         unsigned char carry = A.u8[n] >> carry_count;
01787         A.u8[n] = A.u8[n] << count;
01788         A.u8[n] = A.u8[n] | carry;
01789       }
01790     }
01791     return A.i;
01792 }
01794 SSP_FORCEINLINE __m128i ssp_rot_epi16_SSE2(__m128i a, __m128i b  )
01795 {
01796     int n;
01797     ssp_m128 A,B;
01798     A.i = a;
01799     B.i = b;
01800 
01801     for( n = 0; n < 8; n++ )
01802     {
01803       if( B.s16[n] < 0 )
01804       {
01805         unsigned int count = (-B.s16[n]) % 16;
01806         unsigned int carry_count = (16 - count) % 16;
01807         ssp_u16 carry = A.u16[n] << carry_count;
01808         A.u16[n] = A.u16[n] >> count;
01809         A.u16[n] = A.u16[n] | carry;
01810       }
01811       else
01812       {
01813         unsigned int count = B.s16[n] % 16;
01814         unsigned int carry_count = (16 - count) % 16;
01815         ssp_u16 carry = A.u16[n] >> carry_count;
01816         A.u16[n] = A.u16[n] << count;
01817         A.u16[n] = A.u16[n] | carry;
01818       }
01819     }
01820     return A.i;
01821 }
01823 SSP_FORCEINLINE __m128i ssp_rot_epi32_SSE2(__m128i a, __m128i b  )
01824 {
01825     int n;
01826     ssp_m128 A,B;
01827     A.i = a;
01828     B.i = b;
01829 
01830     for( n = 0; n < 4; n++ )
01831     {
01832       if( B.s32[n] < 0 )
01833       {
01834         unsigned int count = (-B.s32[n]) % 32;
01835         unsigned int carry_count = (32 - count) % 32;
01836         ssp_u32 carry = A.u32[n] << carry_count;
01837         A.u32[n] = A.u32[n] >> count;
01838         A.u32[n] = A.u32[n] | carry;
01839       }
01840       else
01841       {
01842         unsigned int count = B.s32[n] % 32;
01843         unsigned int carry_count = (32 - count) % 32;
01844         ssp_u32 carry = A.u32[n] >> carry_count;
01845         A.u32[n] = A.u32[n] << count;
01846         A.u32[n] = A.u32[n] | carry;
01847       }
01848     }
01849     return A.i;
01850 }
01852 SSP_FORCEINLINE __m128i ssp_rot_epi64_SSE2(__m128i a, __m128i b  )
01853 {
01854     int n;
01855     ssp_m128 A,B;
01856     A.i = a;
01857     B.i = b;
01858 
01859     for( n = 0; n < 2; n++ )
01860     {
01861       if( B.s64[n] < 0 )
01862       {
01863         unsigned int count = (unsigned int)((-B.s64[n]) % 64);
01864         unsigned int carry_count = (64 - count) % 64;
01865         ssp_u64 carry = A.u64[n] << carry_count;
01866         A.u64[n] = A.u64[n] >> count;
01867         A.u64[n] = A.u64[n] | carry;
01868       }
01869       else
01870       {
01871         unsigned int count = (unsigned int)(B.s64[n] % 64);
01872         unsigned int carry_count = (64 - count) % 64;
01873         ssp_u64 carry = A.u64[n] >> carry_count;
01874         A.u64[n] = A.u64[n] << count;
01875         A.u64[n] = A.u64[n] | carry;
01876       }
01877     }
01878     return A.i;
01879 }
01880 
01882 SSP_FORCEINLINE __m128i ssp_roti_epi8_SSE2(__m128i a, const int b)
01883 {
01884     ssp_m128 A;
01885     A.i = a;
01886 
01887     if( b < 0 )
01888     {
01889         const unsigned int count = (-b) % 8;
01890         const unsigned int carry_count = (8 - count) % 8;
01891         __m128i t = ssp_slli_epi8_SSE2( A.i, carry_count );
01892         A.i = ssp_srli_epi8_SSE2( A.i, count );
01893         A.i = _mm_or_si128( A.i, t );
01894     }
01895     else
01896     {
01897         const unsigned int count = b % 8;
01898         const unsigned int carry_count = (8 - count) % 8;
01899         __m128i t = ssp_srli_epi8_SSE2( A.i, carry_count );
01900         A.i = ssp_slli_epi8_SSE2( A.i, count );
01901         A.i = _mm_or_si128( A.i, t );
01902     }
01903 
01904     return A.i;
01905 }
01907 SSP_FORCEINLINE __m128i ssp_roti_epi16_SSE2(__m128i a, const int b)
01908 {
01909     ssp_m128 A;
01910     A.i = a;
01911 
01912     if( b < 0 )
01913     {
01914         const unsigned int count = (-b) % 16;
01915         const unsigned int carry_count = (16 - count) % 16;
01916         __m128i t = _mm_slli_epi16( A.i, carry_count );
01917         A.i = _mm_srli_epi16( A.i, count );
01918         A.i = _mm_or_si128( A.i, t );
01919     }
01920     else
01921     {
01922         const unsigned int count = b % 16;
01923         const unsigned int carry_count = (16 - count) % 16;
01924         __m128i t = _mm_srli_epi16( A.i, carry_count );
01925         A.i = _mm_slli_epi16( A.i, count );
01926         A.i = _mm_or_si128( A.i, t );
01927     }
01928 
01929     return A.i;
01930 }
01932 SSP_FORCEINLINE __m128i ssp_roti_epi32_SSE2(__m128i a, const int b)
01933 {
01934     ssp_m128 A;
01935     A.i = a;
01936 
01937     if( b < 0 )
01938     {
01939         const unsigned int count = (-b) % 32;
01940         const unsigned int carry_count = (32 - count) % 32;
01941         __m128i t = _mm_slli_epi32( A.i, carry_count );
01942         A.i = _mm_srli_epi32( A.i, count );
01943         A.i = _mm_or_si128( A.i, t );
01944     }
01945     else
01946     {
01947         const unsigned int count = b % 32;
01948         const unsigned int carry_count = (32 - count) % 32;
01949         __m128i t = _mm_srli_epi32( A.i, carry_count );
01950         A.i = _mm_slli_epi32( A.i, count );
01951         A.i = _mm_or_si128( A.i, t );
01952     }
01953 
01954     return A.i;
01955 }
01957 SSP_FORCEINLINE __m128i ssp_roti_epi64_SSE2(__m128i a, const int b)
01958 {
01959     ssp_m128 A;
01960     A.i = a;
01961 
01962     if( b < 0 )
01963     {
01964         const unsigned int count = (-b) % 64;
01965         const unsigned int carry_count = (64 - count) % 64;
01966         __m128i t = _mm_slli_epi64( A.i, carry_count );
01967         A.i = _mm_srli_epi64( A.i, count );
01968         A.i = _mm_or_si128( A.i, t );
01969     }
01970     else
01971     {
01972         const unsigned int count = b % 64;
01973         const unsigned int carry_count = (64 - count) % 64;
01974         __m128i t = _mm_srli_epi64( A.i, carry_count );
01975         A.i = _mm_slli_epi64( A.i, count );
01976         A.i = _mm_or_si128( A.i, t );
01977     }
01978 
01979     return A.i;
01980 }
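// Illustrative usage sketch (added for clarity; not part of the original SSEPlus
// API): with the immediate-count forms a positive count rotates each lane left
// and a negative count rotates it right, so the two calls below produce the
// same result.
SSP_FORCEINLINE __m128i ssp_example_roti_epi32( __m128i a )
{
    __m128i left  = ssp_roti_epi32_SSE2( a,   8 ); // rotate each 32-bit lane left by 8
    __m128i right = ssp_roti_epi32_SSE2( a, -24 ); // rotate each 32-bit lane right by 24
    return _mm_or_si128( left, right );            // left == right, so this is just the left rotation
}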
01981 
01982 //--------------------------------------
01983 // Packed Shift Logical & Arithmetic
01984 //--------------------------------------
01985 
01987 SSP_FORCEINLINE __m128i ssp_shl_epi8_SSE2(__m128i a, __m128i b)
01988 {
01989     int n;
01990     ssp_m128 A,B;
01991     A.i = a;
01992     B.i = b;
01993 
01994     for( n = 0; n < 16; n++ )
01995     {
01996       if( B.s8[n] < 0 )
01997       {
01998         unsigned int count = (-B.s8[n]) % 8;
01999         A.u8[n] = A.u8[n] >> count;
02000       }
02001       else
02002       {
02003         unsigned int count = B.s8[n] % 8;
02004         A.u8[n] = A.u8[n] << count;
02005       }
02006     }
02007     return A.i;
02008 }
02009 
02011 SSP_FORCEINLINE __m128i ssp_sha_epi8_SSE2(__m128i a, __m128i b)
02012 {
02013     int n;
02014     ssp_m128 A,B;
02015     A.i = a;
02016     B.i = b;
02017 
02018     for( n = 0; n < 16; n++ )
02019     {
02020       if( B.s8[n] < 0 )
02021       {
02022         unsigned int count = (-B.s8[n]) % 8;
02023         A.s8[n] = A.s8[n] >> count;
02024       }
02025       else
02026       {
02027         unsigned int count = B.s8[n] % 8;
02028         A.s8[n] = A.s8[n] << count;
02029       }
02030     }
02031 
02032     return A.i;
02033 }
02034 
02036 SSP_FORCEINLINE __m128i ssp_shl_epi16_SSE2(__m128i a, __m128i b)
02037 {
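    // Emulation strategy (comment added for clarity): _mm_sll_epi16/_mm_srl_epi16
    // shift every lane by the same count taken from the low bits of their second
    // operand, so each of the eight unrolled steps below isolates one lane's
    // 4-bit count from |b| (mask2), computes both a right shift (used for negative
    // counts) and a left shift (used for positive counts), and keeps only that
    // lane (mask). The final block uses the sign of each count byte in b to select
    // between the two accumulated results. The ssp_sha_epi16, ssp_shl_epi32,
    // ssp_sha_epi32 and ssp_shl_epi64 routines below follow the same pattern.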
02038     __m128i v1, v2, mask, mask2, b1, b2;
02039     b1 = ssp_abs_epi8_SSE2( b );
02040     mask = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, -1 );
02041     mask2 = _mm_srli_epi16( mask, 12 ); // the shift count is a 4-bit value
02042 
02043     b2 = _mm_and_si128( b1, mask2 );
02044     v1 = _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ); // negative shift
02045     v2 = _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ); // positive shift
02046     mask = _mm_slli_si128( mask, 2 );
02047     b1 = _mm_srli_si128( b1, 2 );
02048 
02049     b2 = _mm_and_si128( b1, mask2 );
02050     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) ); // negative shift
02051     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02052     mask = _mm_slli_si128( mask, 2 );
02053     b1 = _mm_srli_si128( b1, 2 );
02054 
02055     b2 = _mm_and_si128( b1, mask2 );
02056     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) ); // negative shift
02057     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02058     mask = _mm_slli_si128( mask, 2 );
02059     b1 = _mm_srli_si128( b1, 2 );
02060 
02061     b2 = _mm_and_si128( b1, mask2 );
02062     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) ); // negative shift
02063     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02064     mask = _mm_slli_si128( mask, 2 );
02065     b1 = _mm_srli_si128( b1, 2 );
02066 
02067     b2 = _mm_and_si128( b1, mask2 );
02068     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) ); // negative shift
02069     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02070     mask = _mm_slli_si128( mask, 2 );
02071     b1 = _mm_srli_si128( b1, 2 );
02072 
02073     b2 = _mm_and_si128( b1, mask2 );
02074     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) ); // negative shift
02075     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02076     mask = _mm_slli_si128( mask, 2 );
02077     b1 = _mm_srli_si128( b1, 2 );
02078 
02079     b2 = _mm_and_si128( b1, mask2 );
02080     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) ); // negative shift
02081     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02082     mask = _mm_slli_si128( mask, 2 );
02083     b1 = _mm_srli_si128( b1, 2 );
02084 
02085     b2 = _mm_and_si128( b1, mask2 );
02086     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) ); // negative shift
02087     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02088 
02089     mask = _mm_setzero_si128();
02090     mask = _mm_cmpgt_epi8( mask, b ); // set mask to 0xFF for all negative shift counts in b
02091     mask2 = _mm_slli_epi16( mask, 8 );
02092     mask = _mm_or_si128( mask, mask2 );
02093     v1 = _mm_and_si128( v1, mask );
02094     mask = _mm_andnot_si128( mask, v2 );
02095     v1 = _mm_or_si128( v1, mask );
02096     return v1;
02097 }
02098 
02100 SSP_FORCEINLINE __m128i ssp_sha_epi16_SSE2(__m128i a, __m128i b)
02101 {
02102     __m128i v1, v2, mask, mask2, b1, b2;
02103     b1 = ssp_abs_epi8_SSE2( b );
02104     mask = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, -1 );
02105     mask2 = _mm_srli_epi16( mask, 12 ); // the shift count is a 4-bit value
02106 
02107     b2 = _mm_and_si128( b1, mask2 );
02108     v1 = _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ); // negative shift
02109     v2 = _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ); // positive shift
02110     mask = _mm_slli_si128( mask, 2 );
02111     b1 = _mm_srli_si128( b1, 2 );
02112 
02113     b2 = _mm_and_si128( b1, mask2 );
02114     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) ); // negative shift
02115     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02116     mask = _mm_slli_si128( mask, 2 );
02117     b1 = _mm_srli_si128( b1, 2 );
02118 
02119     b2 = _mm_and_si128( b1, mask2 );
02120     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) ); // negative shift
02121     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02122     mask = _mm_slli_si128( mask, 2 );
02123     b1 = _mm_srli_si128( b1, 2 );
02124 
02125     b2 = _mm_and_si128( b1, mask2 );
02126     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) ); // negative shift
02127     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02128     mask = _mm_slli_si128( mask, 2 );
02129     b1 = _mm_srli_si128( b1, 2 );
02130 
02131     b2 = _mm_and_si128( b1, mask2 );
02132     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) ); // negative shift
02133     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02134     mask = _mm_slli_si128( mask, 2 );
02135     b1 = _mm_srli_si128( b1, 2 );
02136 
02137     b2 = _mm_and_si128( b1, mask2 );
02138     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) ); // negative shift
02139     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02140     mask = _mm_slli_si128( mask, 2 );
02141     b1 = _mm_srli_si128( b1, 2 );
02142 
02143     b2 = _mm_and_si128( b1, mask2 );
02144     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) ); // negative shift
02145     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02146     mask = _mm_slli_si128( mask, 2 );
02147     b1 = _mm_srli_si128( b1, 2 );
02148 
02149     b2 = _mm_and_si128( b1, mask2 );
02150     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) ); // negative shift
02151     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02152 
02153     mask = _mm_setzero_si128();
02154     mask = _mm_cmpgt_epi8( mask, b ); // set mask to 0xFF for all negative shift counts in b
02155     mask2 = _mm_slli_epi16( mask, 8 );
02156     mask = _mm_or_si128( mask, mask2 );
02157     v1 = _mm_and_si128( v1, mask );
02158     mask = _mm_andnot_si128( mask, v2 );
02159     v1 = _mm_or_si128( v1, mask );
02160     return v1;
02161 }
02162 
02164 SSP_FORCEINLINE __m128i ssp_shl_epi32_SSE2(__m128i a, __m128i b)
02165 {
02166     __m128i v1, v2, mask, mask2, b1, b2;
02167     b1 = ssp_abs_epi8_SSE2( b );
02168     mask = _mm_set_epi32( 0, 0, 0, -1 );
02169     mask2 = _mm_srli_epi32( mask, 27 ); // the shift count is a 5-bit value
02170 
02171     b2 = _mm_and_si128( b1, mask2 );
02172     v1 = _mm_and_si128( _mm_srl_epi32( a, b2 ), mask ); // negative shift
02173     v2 = _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ); // positive shift
02174     mask = _mm_slli_si128( mask, 4 );
02175     b1 = _mm_srli_si128( b1, 4 );
02176 
02177     b2 = _mm_and_si128( b1, mask2 );
02178     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi32( a, b2 ), mask ) ); // negative shift
02179     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ) ); // positive shift
02180     mask = _mm_slli_si128( mask, 4 );
02181     b1 = _mm_srli_si128( b1, 4 );
02182 
02183     b2 = _mm_and_si128( b1, mask2 );
02184     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi32( a, b2 ), mask ) ); // negative shift
02185     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ) ); // positive shift
02186     mask = _mm_slli_si128( mask, 4 );
02187     b1 = _mm_srli_si128( b1, 4 );
02188 
02189     b2 = _mm_and_si128( b1, mask2 );
02190     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi32( a, b2 ), mask ) ); // negative shift
02191     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ) ); // positive shift
02192 
02193     mask = _mm_setzero_si128();
02194     mask = _mm_cmpgt_epi8( mask, b ); // set mask to 0xFF for all negative shift counts in b
02195     mask = _mm_slli_epi32( mask, 24 );
02196     mask = _mm_srai_epi32( mask, 24 );
02197     v1 = _mm_and_si128( v1, mask );
02198     mask = _mm_andnot_si128( mask, v2 );
02199     v1 = _mm_or_si128( v1, mask );
02200     return v1;
02201 }
02202 
02204 SSP_FORCEINLINE __m128i ssp_sha_epi32_SSE2(__m128i a, __m128i b)
02205 {
02206     __m128i v1, v2, mask, mask2, b1, b2;
02207     b1 = ssp_abs_epi8_SSE2( b );
02208     mask = _mm_set_epi32( 0, 0, 0, -1 );
02209     mask2 = _mm_srli_epi32( mask, 27 ); // the shift count is a 5-bit value
02210 
02211     b2 = _mm_and_si128( b1, mask2 );
02212     v1 = _mm_and_si128( _mm_sra_epi32( a, b2 ), mask ); // negative shift
02213     v2 = _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ); // positive shift
02214     mask = _mm_slli_si128( mask, 4 );
02215     b1 = _mm_srli_si128( b1, 4 );
02216 
02217     b2 = _mm_and_si128( b1, mask2 );
02218     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi32( a, b2 ), mask ) ); // negative shift
02219     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ) ); // positive shift
02220     mask = _mm_slli_si128( mask, 4 );
02221     b1 = _mm_srli_si128( b1, 4 );
02222 
02223     b2 = _mm_and_si128( b1, mask2 );
02224     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi32( a, b2 ), mask ) ); // negative shift
02225     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ) ); // positive shift
02226     mask = _mm_slli_si128( mask, 4 );
02227     b1 = _mm_srli_si128( b1, 4 );
02228 
02229     b2 = _mm_and_si128( b1, mask2 );
02230     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi32( a, b2 ), mask ) ); // negative shift
02231     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ) ); // positive shift
02232 
02233     mask = _mm_setzero_si128();
02234     mask = _mm_cmpgt_epi8( mask, b ); // set mask to 0xFF for all negative shift counts in b
02235     mask = _mm_slli_epi32( mask, 24 );
02236     mask = _mm_srai_epi32( mask, 24 );
02237     v1 = _mm_and_si128( v1, mask );
02238     mask = _mm_andnot_si128( mask, v2 );
02239     v1 = _mm_or_si128( v1, mask );
02240     return v1;
02241 }
02242 
02244 SSP_FORCEINLINE __m128i ssp_shl_epi64_SSE2(__m128i a, __m128i b)
02245 {
02246     __m128i v1, v2, mask, mask2, b1, b2;
02247     b1 = ssp_abs_epi8_SSE2( b );
02248     mask = _mm_set_epi32( 0, 0, -1, -1 );
02249     mask2 = _mm_srli_epi64( mask, 58 ); // the shift count is a 6-bit value
02250 
02251     b2 = _mm_and_si128( b1, mask2 );
02252     v1 = _mm_and_si128( _mm_srl_epi64( a, b2 ), mask ); // negative shift
02253     v2 = _mm_and_si128( _mm_sll_epi64( a, b2 ), mask ); // positive shift
02254     mask = _mm_slli_si128( mask, 8 );
02255     b1 = _mm_srli_si128( b1, 8 );
02256 
02257     b2 = _mm_and_si128( b1, mask2 );
02258     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi64( a, b2 ), mask ) ); // negative shift
02259     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi64( a, b2 ), mask ) ); // positive shift
02260 
02261     mask = _mm_setzero_si128();
02262     mask = _mm_cmpgt_epi8( mask, b ); // set mask to 0xFF for all negative shift counts in b
02263     mask = _mm_slli_epi16( mask, 8 );
02264     mask = _mm_srai_epi16( mask, 8 );
02265     mask = _mm_shufflelo_epi16( mask, _MM_SHUFFLE(0,0,0,0) );
02266     mask = _mm_shufflehi_epi16( mask, _MM_SHUFFLE(0,0,0,0) );
02267     v1 = _mm_and_si128( v1, mask );
02268     mask = _mm_andnot_si128( mask, v2 );
02269     v1 = _mm_or_si128( v1, mask );
02270     return v1;
02271 }
02272 
02274 SSP_FORCEINLINE __m128i ssp_sha_epi64_SSE2(__m128i a, __m128i b)
02275 {
02276     int n;
02277     ssp_m128 A,B;
02278     A.i = a;
02279     B.i = b;
02280 
02281     for( n = 0; n < 2; n++ )
02282     {
02283       if( B.s8[n*8] < 0 )
02284       {
02285         unsigned int count = (-B.s8[n*8]) % 64;
02286         A.s64[n] = A.s64[n] >> count;
02287       }
02288       else
02289       {
02290         unsigned int count = B.s8[n*8] % 64;
02291         A.s64[n] = A.s64[n] << count;
02292       }
02293     }
02294 
02295     return A.i;
02296 }
02297 
02303 #endif // __SSEPLUS_EMULATION_SSE2_H__
