include/emulation/SSEPlus_emulation_REF.h

Go to the documentation of this file.
00001 //
00002 // Copyright (c) 2006-2008 Advanced Micro Devices, Inc. All Rights Reserved.
00003 // This software is subject to the Apache v2.0 License.
00004 //
00005 #ifndef __SSEPLUS_EMULATION_REF_H__
00006 #define __SSEPLUS_EMULATION_REF_H__
00007 
00008 #include "../SSEPlus_base.h"
00009 #include "../number/SSEPlus_number_REF.h"
00010 #include "../arithmetic/SSEPlus_arithmetic_REF.h"
00011 #include "SSEPlus_emulation_comps_REF.h"
00012 #include <math.h>
00013 
00020 //--------------------------------------
00021 // Extract Fraction
00022 //--------------------------------------
00024 SSP_FORCEINLINE __m128d ssp_frcz_pd_REF(__m128d a)
00025 {
00026         ssp_m128 A;
00027         long long temp;
00028 
00029         A.d = a;
00030 
00031         temp = (long long) A.f64[0];
00032         A.f64[0] -= temp;
00033         temp = (long long) A.f64[1];
00034         A.f64[1] -= temp;
00035 
00036         return A.d;
00037 }
00038 
00040 SSP_FORCEINLINE __m128 ssp_frcz_ps_REF(__m128 a)
00041 {
00042         ssp_m128 A;
00043         int temp;
00044         A.f = a;
00045 
00046         temp = (int) A.f32[0];
00047         A.f32[0] -= temp;
00048         temp = (int) A.f32[1];
00049         A.f32[1] -= temp;
00050         temp = (int) A.f32[2];
00051         A.f32[2] -= temp;
00052         temp = (int) A.f32[3];
00053         A.f32[3] -= temp;
00054 
00055         return A.f;
00056 }
00057 
00059 SSP_FORCEINLINE __m128d ssp_frcz_sd_REF(__m128d a, __m128d b)
00060 {
00061         ssp_m128 A, B;
00062         long long temp;
00063 
00064         A.d = a;
00065         B.d = b;
00066 
00067         temp = (long long) A.f64[0];
00068         B.f64[0] = A.f64[0] - temp;
00069 
00070         return B.d;
00071 }
00072 
00074 SSP_FORCEINLINE __m128 ssp_frcz_ss_REF(__m128 a, __m128 b)
00075 {
00076         ssp_m128 A, B;
00077         int temp;
00078 
00079         A.f = a;
00080         B.f = b;
00081 
00082         temp = (int) A.f32[0];
00083         B.f32[0] = A.f32[0] - temp;
00084 
00085         return B.f;
00086 }
00087 
00088 //--------------------------------------
00089 // Horizontal Add and Sub
00090 //--------------------------------------
00092 SSP_FORCEINLINE __m128i ssp_haddd_epi16_REF(__m128i a)
00093 {
00094         ssp_m128 A, B;
00095         A.i = a;
00096 
00097         B.s32[0] = A.s16[0] + A.s16[1];
00098         B.s32[1] = A.s16[2] + A.s16[3];
00099         B.s32[2] = A.s16[4] + A.s16[5];
00100         B.s32[3] = A.s16[6] + A.s16[7];
00101 
00102         return B.i;
00103 }
00104 
00106 SSP_FORCEINLINE __m128i ssp_haddd_epi8_REF(__m128i a)
00107 {
00108         ssp_m128 A, B;
00109         A.i = a;
00110 
00111         B.s32[0] = A.s8[ 0] + A.s8[ 1] + A.s8[ 2] + A.s8[ 3];
00112         B.s32[1] = A.s8[ 4] + A.s8[ 5] + A.s8[ 6] + A.s8[ 7];
00113         B.s32[2] = A.s8[ 8] + A.s8[ 9] + A.s8[10] + A.s8[11];
00114         B.s32[3] = A.s8[12] + A.s8[13] + A.s8[14] + A.s8[15];
00115 
00116         return B.i;
00117 }
00118 
00120 SSP_FORCEINLINE __m128i ssp_haddd_epu16_REF(__m128i a)
00121 {
00122         ssp_m128 A, B;
00123         A.i = a;
00124 
00125         B.u32[0] = A.u16[0] + A.u16[1];
00126         B.u32[1] = A.u16[2] + A.u16[3];
00127         B.u32[2] = A.u16[4] + A.u16[5];
00128         B.u32[3] = A.u16[6] + A.u16[7];
00129 
00130         return B.i;
00131 }
00132 
00134 SSP_FORCEINLINE __m128i ssp_haddd_epu8_REF(__m128i a)
00135 {
00136         ssp_m128 A, B;
00137         A.i = a;
00138 
00139         B.u32[0] = A.u8[ 0] + A.u8[ 1] + A.u8[ 2] + A.u8[ 3];
00140         B.u32[1] = A.u8[ 4] + A.u8[ 5] + A.u8[ 6] + A.u8[ 7];
00141         B.u32[2] = A.u8[ 8] + A.u8[ 9] + A.u8[10] + A.u8[11];
00142         B.u32[3] = A.u8[12] + A.u8[13] + A.u8[14] + A.u8[15];
00143 
00144         return B.i;
00145 }
00146 
00148 SSP_FORCEINLINE __m128i ssp_haddq_epi16_REF(__m128i a)
00149 {
00150         ssp_m128 A, B;
00151         A.i = a;
00152 
00153         B.s64[0] = A.s16[0] + A.s16[1] + A.s16[2] + A.s16[3];
00154         B.s64[1] = A.s16[4] + A.s16[5] + A.s16[6] + A.s16[7];
00155 
00156         return B.i;
00157 }
00158 
00160 SSP_FORCEINLINE __m128i ssp_haddq_epi32_REF(__m128i a)
00161 {
00162         ssp_m128 A, B;
00163         A.i = a;
00164 
00165         B.s64[0] = A.s32[0] + (long long)A.s32[1];
00166         B.s64[1] = A.s32[2] + (long long)A.s32[3];
00167 
00168         return B.i;
00169 }
00170 
00172 SSP_FORCEINLINE __m128i ssp_haddq_epi8_REF(__m128i a)
00173 {
00174         ssp_m128 A, B;
00175         A.i = a;
00176 
00177         B.s64[0] = A.s8[0] + A.s8[1] + A.s8[2] + A.s8[3] + A.s8[4] + A.s8[5] + A.s8[6] + A.s8[7];
00178         B.s64[1] = A.s8[8] + A.s8[9] + A.s8[10] + A.s8[11] + A.s8[12] + A.s8[13] + A.s8[14] + A.s8[15];
00179 
00180         return B.i;
00181 }
00182 
00184 SSP_FORCEINLINE __m128i ssp_haddq_epu16_REF(__m128i a)
00185 {
00186         ssp_m128 A, B;
00187         A.i = a;
00188 
00189         B.u64[0] = A.u16[0] + A.u16[1] + A.u16[2] + A.u16[3];
00190         B.u64[1] = A.u16[4] + A.u16[5] + A.u16[6] + A.u16[7];
00191 
00192         return B.i;
00193 }
00194 
00196 SSP_FORCEINLINE __m128i ssp_haddq_epu32_REF(__m128i a)
00197 {
00198         ssp_m128 A, B;
00199         A.i = a;
00200 
00201         B.u64[0] = A.u32[0] + (long long)A.u32[1];
00202         B.u64[1] = A.u32[2] + (long long)A.u32[3];
00203 
00204         return B.i;
00205 }
00206 
00208 SSP_FORCEINLINE __m128i ssp_haddq_epu8_REF(__m128i a)
00209 {
00210         ssp_m128 A, B;
00211         A.i = a;
00212 
00213         B.u64[0] = A.u8[0] + A.u8[1] + A.u8[2] + A.u8[3] + A.u8[4] + A.u8[5] + A.u8[6] + A.u8[7];
00214         B.u64[1] = A.u8[8] + A.u8[9] + A.u8[10] + A.u8[11] + A.u8[12] + A.u8[13] + A.u8[14] + A.u8[15];
00215 
00216         return B.i;
00217 }
00218 
00220 SSP_FORCEINLINE __m128i ssp_haddw_epi8_REF(__m128i a)
00221 {
00222         ssp_m128 A, B;
00223         A.i = a;
00224 
00225         B.s16[0] = A.s8[0] + A.s8[1];
00226         B.s16[1] = A.s8[2] + A.s8[3];
00227         B.s16[2] = A.s8[4] + A.s8[5];
00228         B.s16[3] = A.s8[6] + A.s8[7];
00229         B.s16[4] = A.s8[8] + A.s8[9];
00230         B.s16[5] = A.s8[10] + A.s8[11];
00231         B.s16[6] = A.s8[12] + A.s8[13];
00232         B.s16[7] = A.s8[14] + A.s8[15];
00233 
00234         return B.i;
00235 }
00236 
00238 SSP_FORCEINLINE __m128i ssp_haddw_epu8_REF(__m128i a)
00239 {
00240         ssp_m128 A, B;
00241         A.i = a;
00242 
00243         B.u16[0] = A.u8[0] + A.u8[1];
00244         B.u16[1] = A.u8[2] + A.u8[3];
00245         B.u16[2] = A.u8[4] + A.u8[5];
00246         B.u16[3] = A.u8[6] + A.u8[7];
00247         B.u16[4] = A.u8[8] + A.u8[9];
00248         B.u16[5] = A.u8[10] + A.u8[11];
00249         B.u16[6] = A.u8[12] + A.u8[13];
00250         B.u16[7] = A.u8[14] + A.u8[15];
00251 
00252         return B.i;
00253 }
00254 
00256 SSP_FORCEINLINE __m128i ssp_hsubd_epi16_REF(__m128i a)
00257 {
00258         ssp_m128 A, B;
00259         A.i = a;
00260 
00261         B.s32[0] = A.s16[1] - A.s16[0];
00262         B.s32[1] = A.s16[3] - A.s16[2];
00263         B.s32[2] = A.s16[5] - A.s16[4];
00264         B.s32[3] = A.s16[7] - A.s16[6];
00265 
00266         return B.i;
00267 }
00268 
00270 SSP_FORCEINLINE __m128i ssp_hsubq_epi32_REF(__m128i a)
00271 {
00272         ssp_m128 A, B;
00273         A.i = a;
00274 
00275         B.s64[0] = (long long)A.s32[1] - A.s32[0];
00276         B.s64[1] = (long long)A.s32[3] - A.s32[2];
00277 
00278         return B.i;
00279 }
00280 
00282 SSP_FORCEINLINE __m128i ssp_hsubw_epi8_REF(__m128i a)
00283 {
00284         ssp_m128 A, B;
00285         A.i = a;
00286 
00287         B.s16[0] = A.s8[1] - A.s8[0];
00288         B.s16[1] = A.s8[3] - A.s8[2];
00289         B.s16[2] = A.s8[5] - A.s8[4];
00290         B.s16[3] = A.s8[7] - A.s8[6];
00291         B.s16[4] = A.s8[9] - A.s8[8];
00292         B.s16[5] = A.s8[11] - A.s8[10];
00293         B.s16[6] = A.s8[13] - A.s8[12];
00294         B.s16[7] = A.s8[15] - A.s8[14];
00295 
00296         return B.i;
00297 }
00298 
00299 //--------------------------------------
00300 // Multiply Add
00301 //--------------------------------------
00302 
00304 SSP_FORCEINLINE __m128i ssp_macc_epi16_REF( __m128i a, __m128i b, __m128i c )
00305 {
00306     ssp_m128 A,B,C;
00307     A.i = a;
00308     B.i = b;
00309     C.i = c;
00310 
00311     A.s16[0] = A.s16[0] * B.s16[0] + C.s16[0];
00312     A.s16[1] = A.s16[1] * B.s16[1] + C.s16[1];
00313     A.s16[2] = A.s16[2] * B.s16[2] + C.s16[2];
00314     A.s16[3] = A.s16[3] * B.s16[3] + C.s16[3];
00315     A.s16[4] = A.s16[4] * B.s16[4] + C.s16[4];
00316     A.s16[5] = A.s16[5] * B.s16[5] + C.s16[5];
00317     A.s16[6] = A.s16[6] * B.s16[6] + C.s16[6];
00318     A.s16[7] = A.s16[7] * B.s16[7] + C.s16[7];
00319 
00320     return A.i;
00321 }
00322 
00324 SSP_FORCEINLINE __m128i ssp_macc_epi32_REF( __m128i a, __m128i b, __m128i c )
00325 {
00326     ssp_m128 A,B,C;
00327     A.i = a;
00328     B.i = b;
00329     C.i = c;
00330 
00331     A.s32[0] = A.s32[0] * B.s32[0] + C.s32[0];
00332     A.s32[1] = A.s32[1] * B.s32[1] + C.s32[1];
00333     A.s32[2] = A.s32[2] * B.s32[2] + C.s32[2];
00334     A.s32[3] = A.s32[3] * B.s32[3] + C.s32[3];
00335 
00336     return A.i;
00337 }
00338 
00340 SSP_FORCEINLINE __m128 ssp_macc_ps_REF( __m128 a, __m128 b, __m128 c )
00341 {
00342     ssp_m128 A,B,C;
00343     A.f = a;
00344     B.f = b;
00345     C.f = c;
00346 
00347     A.f32[0] = A.f32[0] * B.f32[0] + C.f32[0];
00348     A.f32[1] = A.f32[1] * B.f32[1] + C.f32[1];
00349     A.f32[2] = A.f32[2] * B.f32[2] + C.f32[2];
00350     A.f32[3] = A.f32[3] * B.f32[3] + C.f32[3];
00351     return A.f;
00352 }
00353 
00355 SSP_FORCEINLINE __m128d ssp_macc_pd_REF( __m128d a, __m128d b, __m128d c )
00356 {
00357     ssp_m128 A,B,C;
00358     A.d = a;
00359     B.d = b;
00360     C.d = c;
00361 
00362     A.f64[0] = A.f64[0] * B.f64[0] + C.f64[0];
00363     A.f64[1] = A.f64[1] * B.f64[1] + C.f64[1]; 
00364     return A.d;
00365 }
00366 
00368 SSP_FORCEINLINE __m128 ssp_macc_ss_REF(__m128 a, __m128 b, __m128 c)   // Assuming SSE5 *_ss semantics are similar to _mm_add_ss. TODO: confirm
00369 {
00370     ssp_m128 A,B,C;
00371     A.f = a;
00372     B.f = b;
00373     C.f = c;
00374 
00375     A.f32[0] = A.f32[0] * B.f32[0] + C.f32[0];   
00376     return A.f;
00377 }
00378 
00380 SSP_FORCEINLINE __m128d ssp_macc_sd_REF(__m128d a, __m128d b, __m128d c)   // Assuming SSE5 *_ss semantics are similar to _mm_add_ss. TODO: confirm
00381 {
00382     ssp_m128 A,B,C;
00383     A.d = a;
00384     B.d = b;
00385     C.d = c;
00386 
00387     A.f64[0] = A.f64[0] * B.f64[0] + C.f64[0];   
00388     return A.d;
00389 }
00390 
00392 SSP_FORCEINLINE __m128i ssp_maccd_epi16_REF( __m128i a, __m128i b, __m128i c )
00393 {
00394     ssp_m128 A, B, C, D;
00395     A.i = a;
00396     B.i = b;
00397     C.i = c;
00398 
00399     D.s32[0] = A.s16[0] * B.s16[0] + C.s32[0];
00400     D.s32[1] = A.s16[2] * B.s16[2] + C.s32[1];
00401     D.s32[2] = A.s16[4] * B.s16[4] + C.s32[2];
00402     D.s32[3] = A.s16[6] * B.s16[6] + C.s32[3];
00403 
00404     return D.i;
00405 }
00406 
00408 SSP_FORCEINLINE __m128i ssp_macchi_epi32_REF( __m128i a, __m128i b, __m128i c )
00409 {
00410     ssp_m128 A, B, C, D;
00411     A.i = a;
00412     B.i = b;
00413     C.i = c;
00414 
00415     D.s64[0] = A.s32[1] * B.s32[1] + C.s64[0];
00416     D.s64[1] = A.s32[3] * B.s32[3] + C.s64[1];
00417 
00418     return D.i;
00419 }
00420 
00422 SSP_FORCEINLINE __m128i ssp_macclo_epi32_REF( __m128i a, __m128i b, __m128i c )
00423 {
00424     ssp_m128 A, B, C, D;
00425     A.i = a;
00426     B.i = b;
00427     C.i = c;
00428 
00429     D.s64[0] = A.s32[0] * B.s32[0] + C.s64[0];
00430     D.s64[1] = A.s32[2] * B.s32[2] + C.s64[1];
00431 
00432     return D.i;
00433 }
00434 
/* Clamp a to the closed range [neg_limit, pos_limit].
   Every argument and the whole expansion are parenthesised so the macro can
   be used inside a larger expression and with compound arguments.
   Arguments may be evaluated more than once: do not pass expressions with
   side effects. */
#define SSP_SATURATION(a, pos_limit, neg_limit) \
    ( ((a) > (pos_limit)) ? (pos_limit) : ( ((a) < (neg_limit)) ? (neg_limit) : (a) ) )
00436 
00438 SSP_FORCEINLINE __m128i ssp_maccs_epi16_REF( __m128i a, __m128i b, __m128i c )
00439 {
00440     ssp_m128 A, B, C;
00441         int temp;
00442     A.i = a;
00443     B.i = b;
00444     C.i = c;
00445 
00446         temp = A.s16[0] * B.s16[0] + C.s16[0];
00447         A.s16[0] = SSP_SATURATION(temp, 32767, -32768);
00448         temp = A.s16[1] * B.s16[1] + C.s16[1];
00449     A.s16[1] = SSP_SATURATION(temp, 32767, -32768);
00450         temp = A.s16[2] * B.s16[2] + C.s16[2];
00451     A.s16[2] = SSP_SATURATION(temp, 32767, -32768);
00452         temp = A.s16[3] * B.s16[3] + C.s16[3];
00453     A.s16[3] = SSP_SATURATION(temp, 32767, -32768);
00454         temp = A.s16[4] * B.s16[4] + C.s16[4];
00455     A.s16[4] = SSP_SATURATION(temp, 32767, -32768);
00456         temp = A.s16[5] * B.s16[5] + C.s16[5];
00457     A.s16[5] = SSP_SATURATION(temp, 32767, -32768);
00458         temp = A.s16[6] * B.s16[6] + C.s16[6];
00459     A.s16[6] = SSP_SATURATION(temp, 32767, -32768);
00460         temp = A.s16[7] * B.s16[7] + C.s16[7];
00461     A.s16[7] = SSP_SATURATION(temp, 32767, -32768);
00462 
00463     return A.i;
00464 }
00465 
00467 SSP_FORCEINLINE __m128i ssp_maccs_epi32_REF( __m128i a, __m128i b, __m128i c )
00468 {
00469     ssp_m128 A, B, C;
00470         long long temp;
00471     A.i = a;
00472     B.i = b;
00473     C.i = c;
00474 
00475         temp = (long long)A.s32[0] * B.s32[0] + C.s32[0];
00476         A.s32[0] = (ssp_s32)(SSP_SATURATION(temp, 2147483647LL, -2147483648LL));
00477         temp = (long long)A.s32[1] * B.s32[1] + C.s32[1];
00478     A.s32[1] = (ssp_s32)(SSP_SATURATION(temp, 2147483647LL, -2147483648LL));
00479         temp = (long long)A.s32[2] * B.s32[2] + C.s32[2];
00480     A.s32[2] = (ssp_s32)(SSP_SATURATION(temp, 2147483647LL, -2147483648LL));
00481         temp = (long long)A.s32[3] * B.s32[3] + C.s32[3];
00482     A.s32[3] = (ssp_s32)(SSP_SATURATION(temp, 2147483647LL, -2147483648LL));
00483 
00484     return A.i;
00485 }
00486 
00488 SSP_FORCEINLINE __m128i ssp_maccsd_epi16_REF( __m128i a, __m128i b, __m128i c )
00489 {
00490     ssp_m128 A, B, C, D;
00491         long long temp;
00492     A.i = a;
00493     B.i = b;
00494     C.i = c;
00495 
00496         //should be able to compare data to see whether overflow/underflow
00497         temp = A.s16[0] * B.s16[0] + (long long)C.s32[0];
00498     D.s32[0] = (ssp_s32)(SSP_SATURATION(temp, 2147483647LL, -2147483648LL));
00499         temp = A.s16[2] * B.s16[2] + (long long)C.s32[1];
00500     D.s32[1] = (ssp_s32)(SSP_SATURATION(temp, 2147483647LL, -2147483648LL));
00501         temp = A.s16[4] * B.s16[4] + (long long)C.s32[2];
00502     D.s32[2] = (ssp_s32)(SSP_SATURATION(temp, 2147483647LL, -2147483648LL));
00503         temp = A.s16[6] * B.s16[6] + (long long)C.s32[3];
00504     D.s32[3] = (ssp_s32)(SSP_SATURATION(temp, 2147483647LL, -2147483648LL));
00505 
00506     return D.i;
00507 }
00508 
00510 SSP_FORCEINLINE __m128i ssp_maccshi_epi32_REF( __m128i a, __m128i b, __m128i c )
00511 {
00512     ssp_m128 A, B, C, D;
00513         long long temp;
00514         unsigned long long signT, signC;
00515     A.i = a;
00516     B.i = b;
00517     C.i = c;
00518 
00519         temp = (long long)A.s32[1] * B.s32[1];
00520         signT = temp & 0x8000000000000000LL;
00521         signC = C.s64[0] & 0x8000000000000000LL;
00522         temp += C.s64[0];
00523         D.s64[0] = (signT==signC) ? ((signT >0) ? ((temp > C.s64[0]) ? 0x8000000000000000LL : temp) 
00524                 : ((temp < C.s64[0])? 0x7FFFFFFFFFFFFFFFLL : temp)) : temp;
00525         temp = (long long)A.s32[3] * B.s32[3];
00526         signT = temp & 0x8000000000000000LL;
00527         signC = C.s64[1] & 0x8000000000000000LL;
00528         temp += C.s64[1];
00529         D.s64[1] = (signT==signC) ? ((signT >0) ? ((temp > C.s64[1]) ? 0x8000000000000000LL : temp) 
00530                 : ((temp < C.s64[1])? 0x7FFFFFFFFFFFFFFFLL : temp)) : temp;
00531 
00532     return D.i;
00533 }
00534 
00536 SSP_FORCEINLINE __m128i ssp_maccslo_epi32_REF( __m128i a, __m128i b, __m128i c )
00537 {
00538     ssp_m128 A, B, C, D;
00539         long long temp;
00540         unsigned long long signT, signC;
00541     A.i = a;
00542     B.i = b;
00543     C.i = c;
00544 
00545         temp = (long long)A.s32[0] * B.s32[0];
00546         signT = temp & 0x8000000000000000LL;
00547         signC = C.s64[0] & 0x8000000000000000LL;
00548         temp += C.s64[0];
00549         D.s64[0] = (signT==signC) ? ((signT >0) ? ((temp > C.s64[0]) ? 0x8000000000000000LL : temp) 
00550                 : ((temp < C.s64[0])? 0x7FFFFFFFFFFFFFFFLL : temp)) : temp;
00551         temp = (long long)A.s32[2] * B.s32[2];
00552         signT = temp & 0x8000000000000000LL;
00553         signC = C.s64[1] & 0x8000000000000000LL;
00554         temp += C.s64[1];
00555         D.s64[1] = (signT==signC) ? ((signT >0) ? ((temp > C.s64[1]) ? 0x8000000000000000LL : temp) 
00556                 : ((temp < C.s64[1])? 0x7FFFFFFFFFFFFFFFLL : temp)) : temp;
00557 
00558     return D.i;
00559 }
00560 
00562 SSP_FORCEINLINE __m128i ssp_maddd_epi16_REF( __m128i a, __m128i b, __m128i c )
00563 {
00564     ssp_m128 A, B, C, D;
00565     A.i = a;
00566     B.i = b;
00567     C.i = c;
00568 
00569     D.s32[0] = A.s16[0] * B.s16[0] + A.s16[1] * B.s16[1] + C.s32[0];
00570     D.s32[1] = A.s16[2] * B.s16[2] + A.s16[3] * B.s16[3] + C.s32[1];
00571     D.s32[2] = A.s16[4] * B.s16[4] + A.s16[5] * B.s16[5] + C.s32[2];
00572     D.s32[3] = A.s16[6] * B.s16[6] + A.s16[7] * B.s16[7] + C.s32[3];
00573 
00574     return D.i;
00575 }
00576 
00578 SSP_FORCEINLINE __m128i ssp_maddsd_epi16_REF( __m128i a, __m128i b, __m128i c )
00579 {
00580     ssp_m128 A, B, C, D;
00581         long long temp;
00582 
00583     A.i = a;
00584     B.i = b;
00585     C.i = c;
00586 
00587         temp = A.s16[0] * B.s16[0] + A.s16[1] * B.s16[1] + (long long)C.s32[0];
00588     D.s32[0] = (ssp_s32)(SSP_SATURATION(temp, 2147483647LL, -2147483648LL));
00589         temp = A.s16[2] * B.s16[2] + A.s16[3] * B.s16[3] + (long long)C.s32[1];
00590     D.s32[1] = (ssp_s32)(SSP_SATURATION(temp, 2147483647LL, -2147483648LL));;
00591         temp = A.s16[4] * B.s16[4] + A.s16[5] * B.s16[5] + (long long)C.s32[2];
00592     D.s32[2] = (ssp_s32)(SSP_SATURATION(temp, 2147483647LL, -2147483648LL));;
00593         temp = A.s16[6] * B.s16[6] + A.s16[7] * B.s16[7] + (long long)C.s32[3];
00594     D.s32[3] = (ssp_s32)(SSP_SATURATION(temp, 2147483647LL, -2147483648LL));;
00595 
00596     return D.i;
00597 }
00598 
00599 //--------------------------------------
00600 // Negative Multiply Add
00601 //--------------------------------------
00602 
00604 SSP_FORCEINLINE __m128 ssp_nmacc_ps_REF(__m128 a, __m128 b, __m128 c)
00605 {
00606     ssp_m128 A,B,C;
00607     A.f = a;
00608     B.f = b;
00609     C.f = c;
00610 
00611     A.f32[0] = -(A.f32[0] * B.f32[0]) + C.f32[0];
00612     A.f32[1] = -(A.f32[1] * B.f32[1]) + C.f32[1];
00613     A.f32[2] = -(A.f32[2] * B.f32[2]) + C.f32[2];
00614     A.f32[3] = -(A.f32[3] * B.f32[3]) + C.f32[3];
00615     return A.f;
00616 }
00617 
00619 SSP_FORCEINLINE __m128d ssp_nmacc_pd_REF(__m128d a, __m128d b, __m128d c)
00620 {
00621     ssp_m128 A,B,C;
00622     A.d = a;
00623     B.d = b;
00624     C.d = c;
00625 
00626     A.f64[0] = -(A.f64[0] * B.f64[0]) + C.f64[0];
00627     A.f64[1] = -(A.f64[1] * B.f64[1]) + C.f64[1]; 
00628     return A.d;
00629 }
00630 
00632 SSP_FORCEINLINE __m128 ssp_nmacc_ss_REF(__m128 a, __m128 b, __m128 c)
00633 {
00634     ssp_m128 A,B,C;
00635     A.f = a;
00636     B.f = b;
00637     C.f = c;
00638 
00639     A.f32[0] = -(A.f32[0] * B.f32[0]) + C.f32[0];   
00640     return A.f;
00641 }
00642 
00644 SSP_FORCEINLINE __m128d ssp_nmacc_sd_REF(__m128d a, __m128d b, __m128d c)
00645 {
00646     ssp_m128 A,B,C;
00647     A.d = a;
00648     B.d = b;
00649     C.d = c;
00650 
00651     A.f64[0] = -(A.f64[0] * B.f64[0]) + C.f64[0];   
00652     return A.d;
00653 }
00654 
00655 
00656 //--------------------------------------
00657 // Multiply Subtract
00658 //--------------------------------------
00659 
00661 SSP_FORCEINLINE __m128 ssp_msub_ps_REF( __m128 a, __m128 b, __m128 c )
00662 {
00663     ssp_m128 A,B,C;
00664     A.f = a;
00665     B.f = b;
00666     C.f = c;
00667 
00668     A.f32[0] = A.f32[0] * B.f32[0] - C.f32[0];
00669     A.f32[1] = A.f32[1] * B.f32[1] - C.f32[1];
00670     A.f32[2] = A.f32[2] * B.f32[2] - C.f32[2];
00671     A.f32[3] = A.f32[3] * B.f32[3] - C.f32[3];
00672     return A.f;
00673 }
00674 
00676 SSP_FORCEINLINE __m128d ssp_msub_pd_REF( __m128d a, __m128d b, __m128d c )
00677 {
00678     ssp_m128 A,B,C;
00679     A.d = a;
00680     B.d = b;
00681     C.d = c;
00682 
00683     A.f64[0] = A.f64[0] * B.f64[0] - C.f64[0];
00684     A.f64[1] = A.f64[1] * B.f64[1] - C.f64[1]; 
00685     return A.d;
00686 }
00687 
00689 SSP_FORCEINLINE __m128 ssp_msub_ss_REF(__m128 a, __m128 b, __m128 c)   // Assuming SSE5 *_ss semantics are similar to _mm_add_ss. TODO: confirm
00690 {
00691     ssp_m128 A,B,C;
00692     A.f = a;
00693     B.f = b;
00694     C.f = c;
00695 
00696     A.f32[0] = A.f32[0] * B.f32[0] - C.f32[0];   
00697     return A.f;
00698 }
00699 
00701 SSP_FORCEINLINE __m128d ssp_msub_sd_REF(__m128d a, __m128d b, __m128d c)   // Assuming SSE5 *_ss semantics are similar to _mm_add_ss. TODO: confirm
00702 {
00703     ssp_m128 A,B,C;
00704     A.d = a;
00705     B.d = b;
00706     C.d = c;
00707 
00708     A.f64[0] = A.f64[0] * B.f64[0] - C.f64[0];   
00709     return A.d;
00710 }
00711 
00712 //--------------------------------------
00713 // Negative Multiply Subtract
00714 //--------------------------------------
00715 
00717 SSP_FORCEINLINE __m128 ssp_nmsub_ps_REF(__m128 a, __m128 b, __m128 c)
00718 {
00719     ssp_m128 A,B,C;
00720     A.f = a;
00721     B.f = b;
00722     C.f = c;
00723 
00724     A.f32[0] = -(A.f32[0] * B.f32[0]) - C.f32[0];
00725     A.f32[1] = -(A.f32[1] * B.f32[1]) - C.f32[1];
00726     A.f32[2] = -(A.f32[2] * B.f32[2]) - C.f32[2];
00727     A.f32[3] = -(A.f32[3] * B.f32[3]) - C.f32[3];
00728     return A.f;
00729 }
00730 
00732 SSP_FORCEINLINE __m128d ssp_nmsub_pd_REF(__m128d a, __m128d b, __m128d c)
00733 {
00734     ssp_m128 A,B,C;
00735     A.d = a;
00736     B.d = b;
00737     C.d = c;
00738 
00739     A.f64[0] = -(A.f64[0] * B.f64[0]) - C.f64[0];
00740     A.f64[1] = -(A.f64[1] * B.f64[1]) - C.f64[1]; 
00741     return A.d;
00742 }
00743 
00745 SSP_FORCEINLINE __m128 ssp_nmsub_ss_REF(__m128 a, __m128 b, __m128 c)
00746 {
00747     ssp_m128 A,B,C;
00748     A.f = a;
00749     B.f = b;
00750     C.f = c;
00751 
00752     A.f32[0] = -(A.f32[0] * B.f32[0]) - C.f32[0];   
00753     return A.f;
00754 }
00755 
00757 SSP_FORCEINLINE __m128d ssp_nmsub_sd_REF(__m128d a, __m128d b, __m128d c)
00758 {
00759     ssp_m128 A,B,C;
00760     A.d = a;
00761     B.d = b;
00762     C.d = c;
00763 
00764     A.f64[0] = -(A.f64[0] * B.f64[0]) - C.f64[0];   
00765     return A.d;
00766 }
00767 
00768 
00769 
00770 //---------------------------------------
00771 // AddSubtract
00772 //---------------------------------------
00773 
00775 SSP_FORCEINLINE __m128 ssp_addsub_ps_REF(__m128 a, __m128 b)
00776 {
00777     ssp_m128 A, B;
00778     A.f = a;
00779     B.f = b;
00780 
00781     A.f32[0] -= B.f32[0];
00782     A.f32[1] += B.f32[1];
00783     A.f32[2] -= B.f32[2];
00784     A.f32[3] += B.f32[3];
00785     return A.f;
00786 }
00787 
00789 SSP_FORCEINLINE __m128d ssp_addsub_pd_REF(__m128d a, __m128d b)
00790 {
00791     ssp_m128 A, B;
00792     A.d = a;
00793     B.d = b;
00794 
00795     A.f64[0] -= B.f64[0];
00796     A.f64[1] += B.f64[1];
00797     return A.d;
00798 }
00799 
00800 //---------------------------------------
00801 //Blend
00802 //---------------------------------------
00803 
00805 SSP_FORCEINLINE __m128i ssp_blend_epi16_REF     ( __m128i a, __m128i b, const int mask )
00806 {
00807     ssp_m128 A, B;
00808     A.i = a;
00809     B.i = b;
00810 
00811     A.s16[0] = (mask & 0x01) ? B.s16[0] : A.s16[0];
00812     A.s16[1] = (mask & 0x02) ? B.s16[1] : A.s16[1];
00813     A.s16[2] = (mask & 0x04) ? B.s16[2] : A.s16[2];
00814     A.s16[3] = (mask & 0x08) ? B.s16[3] : A.s16[3];
00815     A.s16[4] = (mask & 0x10) ? B.s16[4] : A.s16[4];
00816     A.s16[5] = (mask & 0x20) ? B.s16[5] : A.s16[5];
00817     A.s16[6] = (mask & 0x40) ? B.s16[6] : A.s16[6];
00818     A.s16[7] = (mask & 0x80) ? B.s16[7] : A.s16[7];
00819     return A.i;
00820 }
00821 
00823 SSP_FORCEINLINE __m128d ssp_blend_pd_REF        ( __m128d a, __m128d b, const int mask )
00824 {
00825     ssp_m128 A, B;
00826     A.d = a;
00827     B.d = b;
00828 
00829     A.f64[0] = (mask & 0x1) ? B.f64[0] : A.f64[0];
00830     A.f64[1] = (mask & 0x2) ? B.f64[1] : A.f64[1];
00831     return A.d;
00832 }
00833 
00835 SSP_FORCEINLINE __m128 ssp_blend_ps_REF        ( __m128 a, __m128 b, const int mask )
00836 {
00837     ssp_m128 A, B;
00838     A.f = a;
00839     B.f = b;
00840 
00841     A.f32[0] = (mask & 0x1) ? B.f32[0] : A.f32[0];
00842     A.f32[1] = (mask & 0x2) ? B.f32[1] : A.f32[1];
00843     A.f32[2] = (mask & 0x4) ? B.f32[2] : A.f32[2];
00844     A.f32[3] = (mask & 0x8) ? B.f32[3] : A.f32[3];
00845     return A.f;
00846 }
00847 
00849 SSP_FORCEINLINE __m128i ssp_blendv_epi8_REF     ( __m128i a, __m128i b, __m128i mask )
00850 {
00851     ssp_m128 A, B, Mask;
00852     A.i = a;
00853     B.i = b;
00854     Mask.i = mask;
00855 
00856     A.s8[0]  = (Mask.s8[0]  & 0x80) ? B.s8[0]  : A.s8[0];
00857     A.s8[1]  = (Mask.s8[1]  & 0x80) ? B.s8[1]  : A.s8[1];
00858     A.s8[2]  = (Mask.s8[2]  & 0x80) ? B.s8[2]  : A.s8[2];
00859     A.s8[3]  = (Mask.s8[3]  & 0x80) ? B.s8[3]  : A.s8[3];
00860     A.s8[4]  = (Mask.s8[4]  & 0x80) ? B.s8[4]  : A.s8[4];
00861     A.s8[5]  = (Mask.s8[5]  & 0x80) ? B.s8[5]  : A.s8[5];
00862     A.s8[6]  = (Mask.s8[6]  & 0x80) ? B.s8[6]  : A.s8[6];
00863     A.s8[7]  = (Mask.s8[7]  & 0x80) ? B.s8[7]  : A.s8[7];
00864     A.s8[8]  = (Mask.s8[8]  & 0x80) ? B.s8[8]  : A.s8[8];
00865     A.s8[9]  = (Mask.s8[9]  & 0x80) ? B.s8[9]  : A.s8[9];
00866     A.s8[10] = (Mask.s8[10] & 0x80) ? B.s8[10] : A.s8[10];
00867     A.s8[11] = (Mask.s8[11] & 0x80) ? B.s8[11] : A.s8[11];
00868     A.s8[12] = (Mask.s8[12] & 0x80) ? B.s8[12] : A.s8[12];
00869     A.s8[13] = (Mask.s8[13] & 0x80) ? B.s8[13] : A.s8[13];
00870     A.s8[14] = (Mask.s8[14] & 0x80) ? B.s8[14] : A.s8[14];
00871     A.s8[15] = (Mask.s8[15] & 0x80) ? B.s8[15] : A.s8[15];
00872     return A.i;
00873 }
00874 
00876 SSP_FORCEINLINE __m128d ssp_blendv_pd_REF       ( __m128d a, __m128d b, __m128d mask )
00877 {
00878     ssp_m128 A, B, Mask;
00879     A.d = a;
00880     B.d = b;
00881     Mask.d = mask;
00882 
00883     A.f64[0] = (Mask.u64[0] & 0x8000000000000000ll) ? B.f64[0] : A.f64[0];
00884     A.f64[1] = (Mask.u64[1] & 0x8000000000000000ll) ? B.f64[1] : A.f64[1];
00885     return A.d;
00886 }
00887 
00889 SSP_FORCEINLINE __m128 ssp_blendv_ps_REF       ( __m128 a, __m128 b, __m128 mask )     
00890 {
00891     ssp_m128 A, B, Mask;
00892     A.f = a;
00893     B.f = b;
00894     Mask.f = mask;
00895 
00896     A.f32[0] = (Mask.u32[0] & 0x80000000) ? B.f32[0] : A.f32[0];
00897     A.f32[1] = (Mask.u32[1] & 0x80000000) ? B.f32[1] : A.f32[1];
00898     A.f32[2] = (Mask.u32[2] & 0x80000000) ? B.f32[2] : A.f32[2];
00899     A.f32[3] = (Mask.u32[3] & 0x80000000) ? B.f32[3] : A.f32[3];
00900     return A.f;
00901 }
00902 
00903 
00904 //---------------------------------------
00905 //Compare
00906 //---------------------------------------
00908 SSP_FORCEINLINE __m128i ssp_cmpeq_epi64_REF( __m128i a, __m128i b )                       
00909 {
00910     ssp_m128 A, B;
00911     A.i = a;
00912     B.i = b;
00913 
00914     if( A.s64[0] == B.s64[0] )
00915         A.s64[0] = 0xFFFFFFFFFFFFFFFFll;
00916     else
00917         A.s64[0] = 0x0ll;
00918 
00919     if( A.s64[1] == B.s64[1] )
00920         A.s64[1] = 0xFFFFFFFFFFFFFFFFll;
00921     else
00922         A.s64[1] = 0x0ll;
00923     return A.i;
00924 }
00925 
00926 //---------------------------------------
00927 // Dot Product
00928 //---------------------------------------
00930 SSP_FORCEINLINE __m128d ssp_dp_pd_REF( __m128d a, __m128d b, const int mask )             
00931 {
00932     ssp_f64 tmp[3];
00933     ssp_m128 A, B;
00934     A.d = a;
00935     B.d = b;
00936 
00937     tmp[0] = (mask & 0x10) ? (A.f64[0] * B.f64[0]) : 0.0;
00938     tmp[1] = (mask & 0x20) ? (A.f64[1] * B.f64[1]) : 0.0;
00939 
00940     tmp[2] = tmp[0] + tmp[1];
00941 
00942     A.f64[0] = (mask & 0x1) ? tmp[2] : 0.0;
00943     A.f64[1] = (mask & 0x2) ? tmp[2] : 0.0;
00944     return A.d;
00945 }
00946 
00948 SSP_FORCEINLINE __m128 ssp_dp_ps_REF( __m128 a, __m128 b, const int mask )                
00949 {
00950     ssp_f32 tmp[5];
00951     ssp_m128 A, B;
00952     A.f = a;
00953     B.f = b;
00954 
00955     tmp[0] = (mask & 0x10) ? (A.f32[0] * B.f32[0]) : 0.0f;
00956     tmp[1] = (mask & 0x20) ? (A.f32[1] * B.f32[1]) : 0.0f;
00957     tmp[2] = (mask & 0x40) ? (A.f32[2] * B.f32[2]) : 0.0f;
00958     tmp[3] = (mask & 0x80) ? (A.f32[3] * B.f32[3]) : 0.0f;
00959 
00960     tmp[4] = tmp[0] + tmp[1] + tmp[2] + tmp[3];
00961 
00962     A.f32[0] = (mask & 0x1) ? tmp[4] : 0.0f;
00963     A.f32[1] = (mask & 0x2) ? tmp[4] : 0.0f;
00964     A.f32[2] = (mask & 0x4) ? tmp[4] : 0.0f;
00965     A.f32[3] = (mask & 0x8) ? tmp[4] : 0.0f;
00966     return A.f;
00967 }
00968 
00970 SSP_FORCEINLINE __m128i ssp_maddubs_epi16_REF( __m128i a,  __m128i b)
00971 {
00972     ssp_m128 A, B, C;
00973         int tmp[8];
00974     A.i = a;
00975     B.i = b;
00976 
00977         // a is 8 bit unsigned integer, b is signed integer
00978         tmp[0] = A.u8[0] * B.s8[0] +  A.u8[1] * B.s8[1];
00979         C.s16[0] = (ssp_s16)(SSP_SATURATION(tmp[0], 32767, -32768));
00980 
00981         tmp[1] = A.u8[2] * B.s8[2] +  A.u8[3] * B.s8[3];
00982         C.s16[1] = (ssp_s16)(SSP_SATURATION(tmp[1], 32767, -32768));
00983 
00984         tmp[2] = A.u8[4] * B.s8[4] +  A.u8[5] * B.s8[5];
00985         C.s16[2] = (ssp_s16)(SSP_SATURATION(tmp[2], 32767, -32768));
00986 
00987         tmp[3] = A.u8[6] * B.s8[6] +  A.u8[7] * B.s8[7];
00988         C.s16[3] = (ssp_s16)(SSP_SATURATION(tmp[3], 32767, -32768));
00989 
00990         tmp[4] = A.u8[8] * B.s8[8] +  A.u8[9] * B.s8[9];
00991         C.s16[4] = (ssp_s16)(SSP_SATURATION(tmp[4], 32767, -32768));
00992 
00993         tmp[5] = A.u8[10] * B.s8[10] +  A.u8[11] * B.s8[11];
00994         C.s16[5] = (ssp_s16)(SSP_SATURATION(tmp[5], 32767, -32768));
00995 
00996         tmp[6] = A.u8[12] * B.s8[12] +  A.u8[13] * B.s8[13];
00997         C.s16[6] = (ssp_s16)(SSP_SATURATION(tmp[6], 32767, -32768));
00998 
00999         tmp[7] = A.u8[14] * B.s8[14] +  A.u8[15] * B.s8[15];
01000         C.s16[7] = (ssp_s16)(SSP_SATURATION(tmp[7], 32767, -32768));
01001 
01002         return C.i;
01003 }
01004 
01008 //__m64 _mm_maddubs_pi16( __m64 a,  __m64 b); [SSSE3]
01009 SSP_FORCEINLINE __m64 ssp_maddubs_pi16_REF( __m64 a,  __m64 b)
01010 {
01011     ssp_m64 A, B, C;
01012         int tmp[4];
01013     A.m64 = a;
01014     B.m64 = b;
01015 
01016         // a is 8 bit unsigned integer, b is signed integer
01017         tmp[0] = A.u8[0] * B.s8[0] +  A.u8[1] * B.s8[1];
01018         C.s16[0] = (ssp_s16)(SSP_SATURATION(tmp[0], 32767, -32768));
01019 
01020         tmp[1] = A.u8[2] * B.s8[2] +  A.u8[3] * B.s8[3];
01021         C.s16[1] = (ssp_s16)(SSP_SATURATION(tmp[1], 32767, -32768));
01022 
01023         tmp[2] = A.u8[4] * B.s8[4] +  A.u8[5] * B.s8[5];
01024         C.s16[2] = (ssp_s16)(SSP_SATURATION(tmp[2], 32767, -32768));
01025 
01026         tmp[3] = A.u8[6] * B.s8[6] +  A.u8[7] * B.s8[7];
01027         C.s16[3] = (ssp_s16)(SSP_SATURATION(tmp[3], 32767, -32768));
01028 
01029         return C.m64;
01030 }
01031 
01032 //__m128i _mm_mulhrs_epi16( __m128i a,  __m128i b);
01034 SSP_FORCEINLINE __m128i ssp_mulhrs_epi16_REF( __m128i a, __m128i b )
01035 {
01036     ssp_m128 A,B;
01037     A.i = a;
01038     B.i = b;
01039 
01040         A.s16[0] = (ssp_s16) ((A.s16[0] * B.s16[0] + 0x4000) >> 15);
01041         A.s16[1] = (ssp_s16) ((A.s16[1] * B.s16[1] + 0x4000) >> 15);
01042         A.s16[2] = (ssp_s16) ((A.s16[2] * B.s16[2] + 0x4000) >> 15);
01043         A.s16[3] = (ssp_s16) ((A.s16[3] * B.s16[3] + 0x4000) >> 15);
01044         A.s16[4] = (ssp_s16) ((A.s16[4] * B.s16[4] + 0x4000) >> 15);
01045         A.s16[5] = (ssp_s16) ((A.s16[5] * B.s16[5] + 0x4000) >> 15);
01046         A.s16[6] = (ssp_s16) ((A.s16[6] * B.s16[6] + 0x4000) >> 15);
01047         A.s16[7] = (ssp_s16) ((A.s16[7] * B.s16[7] + 0x4000) >> 15);
01048 
01049     return A.i;
01050 }
01051 
01052 //__m64 _mm_mulhrs_epi16( __m64 a,  __m64 b);
01056 SSP_FORCEINLINE __m64 ssp_mulhrs_pi16_REF( __m64 a, __m64 b )
01057 {
01058     ssp_m64 A,B;
01059     A.m64 = a;
01060     B.m64 = b;
01061 
01062         A.s16[0] = (ssp_s16) ((A.s16[0] * B.s16[0] + 0x4000) >> 15);
01063         A.s16[1] = (ssp_s16) ((A.s16[1] * B.s16[1] + 0x4000) >> 15);
01064         A.s16[2] = (ssp_s16) ((A.s16[2] * B.s16[2] + 0x4000) >> 15);
01065         A.s16[3] = (ssp_s16) ((A.s16[3] * B.s16[3] + 0x4000) >> 15);
01066 
01067     return A.m64;
01068 }
01069 
01070 //---------------------------------------
01071 //Extract
01072 //---------------------------------------
01073 
01074 // TODO PHS: Test the actual intrinsic to determine what value is returned if the ndx/imm is a large number.
01075 //           i.e. for _mm_extract_epi8, what is returned if ndx = 20 [since 20=0x14 > 0x0F]?
01076 //           Repeat the procedure for the other extract functions.
01078 SSP_FORCEINLINE int ssp_extract_epi8_REF( __m128i a, const int ndx )                       
01079 {
01080     ssp_m128 A;
01081     A.i = a;
01082     return (int)A.u8[ndx&0xF];
01083 }
01084 
01086 SSP_FORCEINLINE int ssp_extract_epi32_REF( __m128i a, const int imm )                            
01087 {
01088     ssp_m128 A;
01089     A.i = a;
01090     return (int)A.u32[imm&0x3];
01091 }
01092 
01094 SSP_FORCEINLINE ssp_s64 ssp_extract_epi64_REF( __m128i a, const int ndx )                  
01095 {
01096     ssp_m128 A;
01097     A.i = a;
01098     return A.s64[ndx & 0x1];
01099 }
01100 
01102 SSP_FORCEINLINE int ssp_extract_ps_REF( __m128 a, const int ndx )                          
01103 { 
01104     ssp_m128 A;
01105     A.f = a; 
01106     return A.s32[ndx&0x3];
01107 }
01108 
01109 
01113 SSP_FORCEINLINE __m128i ssp_extract_si64_REF( __m128i a ,__m128i b )        
01114 {
01115     ssp_u32 len, ndx;
01116     ssp_s64 mask;
01117     ssp_m128 A, B;
01118     A.i = a;
01119     B.i = b;
01120     ndx = (ssp_u32)((B.u64[0] & 0x3F00) >> 8);    // Mask ndx field.
01121     len = (ssp_u32)((B.u64[0] & 0x003F));         // Mask len field.
01122 
01123     len = (len) ? len : 64;    
01124     if( (ndx+len) > 64 )               // If the sum of ndx and length is greater than 64, the results are undefined.
01125         return a;                      // If index = 0 and length = 0/64, extract all lower bits.
01126     mask = ~(-1 << len);
01127     A.u64[0] = A.u64[0] >> ndx;
01128     A.u64[0] = A.u64[0] & mask;
01129     return A.i;
01130 }
01134 SSP_FORCEINLINE __m128i ssp_extracti_si64_REF( __m128i a, int len, int ndx )   
01135 {
01136     ssp_s64 mask;
01137     ssp_m128 A;
01138     A.i = a;
01139     ndx = ndx & 0x3F; // ndx % 64
01140     len = len & 0x3F; // len % 64
01141 
01142     len = (len) ? len : 64;    
01143     if( (ndx+len) > 64 )               // If the sum of ndx and length is greater than 64, the results are undefined.
01144         return a;                      // If index = 0 and length = 0/64, extract all lower bits.
01145     mask = ~(-1 << len);
01146     A.u64[0] = A.u64[0] >> ndx;
01147     A.u64[0] = A.u64[0] & mask;
01148     return A.i;
01149 }
01150 
01151 
01152 
01153 //---------------------------------------
01154 // Horizontal Add
01155 //---------------------------------------
01157 SSP_FORCEINLINE __m128i ssp_hadd_epi16_REF ( __m128i a, __m128i b )                       
01158 {
01159     ssp_m128 A, B;
01160     A.i = a;
01161     B.i = b;
01162 
01163     A.s16[0] = A.s16[0] + A.s16[1];
01164     A.s16[1] = A.s16[2] + A.s16[3];
01165     A.s16[2] = A.s16[4] + A.s16[5];
01166     A.s16[3] = A.s16[6] + A.s16[7];
01167     A.s16[4] = B.s16[0] + B.s16[1];
01168     A.s16[5] = B.s16[2] + B.s16[3];
01169     A.s16[6] = B.s16[4] + B.s16[5];
01170     A.s16[7] = B.s16[6] + B.s16[7];
01171     return A.i;
01172 }
01173 
01175 SSP_FORCEINLINE __m128i ssp_hadd_epi32_REF ( __m128i a, __m128i b )                        
01176 {
01177     ssp_m128 A, B;
01178     A.i = a;
01179     B.i = b;
01180 
01181     A.s32[0] = A.s32[0] + A.s32[1];
01182     A.s32[1] = A.s32[2] + A.s32[3];
01183     A.s32[2] = B.s32[0] + B.s32[1];
01184     A.s32[3] = B.s32[2] + B.s32[3];
01185 
01186     return A.i;
01187 }
01188 
01192 SSP_FORCEINLINE __m64 ssp_hadd_pi16_REF ( __m64 a, __m64 b )                        
01193 {
01194     ssp_m64 A, B;
01195     A.m64 = a;
01196     B.m64 = b;
01197 
01198     A.s16[0] = A.s16[0] + A.s16[1];
01199     A.s16[1] = A.s16[2] + A.s16[3];
01200     A.s16[2] = B.s16[0] + B.s16[1];
01201     A.s16[3] = B.s16[2] + B.s16[3];
01202 
01203     return A.m64;
01204 }
01205 
01209 SSP_FORCEINLINE __m64 ssp_hadd_pi32_REF ( __m64 a, __m64 b )                        
01210 {
01211     ssp_m64 A, B;
01212     A.m64 = a;
01213     B.m64 = b;
01214 
01215     A.s32[0] = A.s32[0] + A.s32[1];
01216     A.s32[1] = B.s32[0] + B.s32[1];
01217 
01218     return A.m64;
01219 }
01220 
01222 SSP_FORCEINLINE __m128i ssp_hadds_epi16_REF ( __m128i a, __m128i b )                         
01223 {
01224     ssp_m128 A, B;
01225         int answer[8];
01226     A.i = a;
01227     B.i = b;
01228 
01229         answer[0] = A.s16[0] + A.s16[1];
01230     A.s16[0]  = (ssp_s16) (SSP_SATURATION(answer[0], 32767, -32768));
01231         answer[1] = A.s16[2] + A.s16[3];
01232     A.s16[1]  = (ssp_s16) (SSP_SATURATION(answer[1], 32767, -32768));
01233         answer[2] = A.s16[4] + A.s16[5];
01234     A.s16[2]  = (ssp_s16) (SSP_SATURATION(answer[2], 32767, -32768));
01235         answer[3] = A.s16[6] + A.s16[7];
01236     A.s16[3]  = (ssp_s16) (SSP_SATURATION(answer[3], 32767, -32768));
01237         answer[4] = B.s16[0] + B.s16[1];
01238     A.s16[4]  = (ssp_s16) (SSP_SATURATION(answer[4], 32767, -32768));
01239         answer[5] = B.s16[2] + B.s16[3];
01240     A.s16[5]  = (ssp_s16) (SSP_SATURATION(answer[5], 32767, -32768));
01241         answer[6] = B.s16[4] + B.s16[5];
01242     A.s16[6]  = (ssp_s16) (SSP_SATURATION(answer[6], 32767, -32768));
01243         answer[7] = B.s16[6] + B.s16[7];
01244     A.s16[7]  = (ssp_s16) (SSP_SATURATION(answer[7], 32767, -32768));
01245 
01246         return A.i;
01247 }
01248 
01252 SSP_FORCEINLINE __m64 ssp_hadds_pi16_REF ( __m64 a, __m64 b )                         
01253 {
01254     ssp_m64 A, B;
01255         int answer[4];
01256     A.m64 = a;
01257     B.m64 = b;
01258 
01259         answer[0] = A.s16[0] + A.s16[1];
01260     A.s16[0]  = (ssp_s16) (SSP_SATURATION(answer[0], 32767, -32768));
01261         answer[1] = A.s16[2] + A.s16[3];
01262     A.s16[1]  = (ssp_s16) (SSP_SATURATION(answer[1], 32767, -32768));
01263         answer[2] = B.s16[0] + B.s16[1];
01264     A.s16[2]  = (ssp_s16) (SSP_SATURATION(answer[2], 32767, -32768));
01265         answer[3] = B.s16[2] + B.s16[3];
01266     A.s16[3]  = (ssp_s16) (SSP_SATURATION(answer[3], 32767, -32768));
01267 
01268         return A.m64;
01269 }
01270 
01272 SSP_FORCEINLINE __m128 ssp_hadd_ps_REF(__m128 a, __m128 b)                                 
01273 {
01274     ssp_m128 A, B;
01275     A.f = a;
01276     B.f = b;
01277 
01278     A.f32[0] = A.f32[0] + A.f32[1];
01279     A.f32[1] = A.f32[2] + A.f32[3];
01280     A.f32[2] = B.f32[0] + B.f32[1];
01281     A.f32[3] = B.f32[2] + B.f32[3];
01282     return A.f;
01283 }
01284 
01286 SSP_FORCEINLINE __m128d ssp_hadd_pd_REF(__m128d a, __m128d b)                               
01287 {
01288     ssp_m128 A, B;
01289     A.d = a;
01290     B.d = b;
01291 
01292     A.f64[0] = A.f64[0] + A.f64[1];
01293     A.f64[1] = B.f64[0] + B.f64[1];
01294     return A.d;
01295 }
01296 
01297 
01298 //---------------------------------------
01299 // Horizontal Subtract
01300 //---------------------------------------
01302 SSP_FORCEINLINE __m128i ssp_hsub_epi16_REF ( __m128i a, __m128i b )                        
01303 {
01304     ssp_m128 A, B;
01305     A.i = a;
01306     B.i = b;
01307 
01308     A.s16[0] = A.s16[0] - A.s16[1];
01309     A.s16[1] = A.s16[2] - A.s16[3];
01310     A.s16[2] = A.s16[4] - A.s16[5];
01311     A.s16[3] = A.s16[6] - A.s16[7];
01312     A.s16[4] = B.s16[0] - B.s16[1];
01313     A.s16[5] = B.s16[2] - B.s16[3];
01314     A.s16[6] = B.s16[4] - B.s16[5];
01315     A.s16[7] = B.s16[6] - B.s16[7];
01316 
01317         return A.i;
01318 }
01319 
01321 SSP_FORCEINLINE __m128i ssp_hsub_epi32_REF ( __m128i a, __m128i b )                        
01322 {
01323     ssp_m128 A, B;
01324     A.i = a;
01325     B.i = b;
01326 
01327     A.s32[0] = A.s32[0] - A.s32[1];
01328     A.s32[1] = A.s32[2] - A.s32[3];
01329     A.s32[2] = B.s32[0] - B.s32[1];
01330     A.s32[3] = B.s32[2] - B.s32[3];
01331 
01332     return A.i;
01333 }
01334 
01338 SSP_FORCEINLINE __m64 ssp_hsub_pi16_REF ( __m64 a, __m64 b )                         
01339 {
01340     ssp_m64 A, B;
01341     A.m64 = a;
01342     B.m64 = b;
01343 
01344     A.s16[0] = A.s16[0] - A.s16[1];
01345     A.s16[1] = A.s16[2] - A.s16[3];
01346     A.s16[2] = B.s16[0] - B.s16[1];
01347     A.s16[3] = B.s16[2] - B.s16[3];
01348 
01349         return A.m64;
01350 }
01351 
01355 SSP_FORCEINLINE __m64 ssp_hsub_pi32_REF ( __m64 a, __m64 b )                         
01356 {
01357     ssp_m64 A, B;
01358     A.m64 = a;
01359     B.m64 = b;
01360 
01361     A.s32[0] = A.s32[0] - A.s32[1];
01362     A.s32[1] = B.s32[0] - B.s32[1];
01363 
01364     return A.m64;
01365 }
01366 
01368 SSP_FORCEINLINE __m128i ssp_hsubs_epi16_REF ( __m128i a, __m128i b )                 
01369 {
01370     ssp_m128 A, B;
01371         int answer[8];
01372     A.i = a;
01373     B.i = b;
01374 
01375         answer[0] = A.s16[0] - A.s16[1];
01376     A.s16[0]  = (ssp_s16) (SSP_SATURATION(answer[0], 32767, -32768));
01377         answer[1] = A.s16[2] - A.s16[3];
01378     A.s16[1]  = (ssp_s16) (SSP_SATURATION(answer[1], 32767, -32768));
01379         answer[2] = A.s16[4] - A.s16[5];
01380     A.s16[2]  = (ssp_s16) (SSP_SATURATION(answer[2], 32767, -32768));
01381         answer[3] = A.s16[6] - A.s16[7];
01382     A.s16[3]  = (ssp_s16) (SSP_SATURATION(answer[3], 32767, -32768));
01383         answer[4] = B.s16[0] - B.s16[1];
01384     A.s16[4]  = (ssp_s16) (SSP_SATURATION(answer[4], 32767, -32768));
01385         answer[5] = B.s16[2] - B.s16[3];
01386     A.s16[5]  = (ssp_s16) (SSP_SATURATION(answer[5], 32767, -32768));
01387         answer[6] = B.s16[4] - B.s16[5];
01388     A.s16[6]  = (ssp_s16) (SSP_SATURATION(answer[6], 32767, -32768));
01389         answer[7] = B.s16[6] - B.s16[7];
01390     A.s16[7]  = (ssp_s16) (SSP_SATURATION(answer[7], 32767, -32768));
01391 
01392         return A.i;
01393 }
01394 
01398 SSP_FORCEINLINE __m64 ssp_hsubs_pi16_REF ( __m64 a, __m64 b )                        
01399 {
01400     ssp_m64 A, B;
01401         int answer[4];
01402     A.m64 = a;
01403     B.m64 = b;
01404 
01405         answer[0] = A.s16[0] - A.s16[1];
01406     A.s16[0]  = (ssp_s16) (SSP_SATURATION(answer[0], 32767, -32768));
01407         answer[1] = A.s16[2] - A.s16[3];
01408     A.s16[1]  = (ssp_s16) (SSP_SATURATION(answer[1], 32767, -32768));
01409         answer[2] = B.s16[0] - B.s16[1];
01410     A.s16[2]  = (ssp_s16) (SSP_SATURATION(answer[2], 32767, -32768));
01411         answer[3] = B.s16[2] - B.s16[3];
01412     A.s16[3]  = (ssp_s16) (SSP_SATURATION(answer[3], 32767, -32768));
01413 
01414         return A.m64;
01415 }
01416 
01418 SSP_FORCEINLINE __m128 ssp_hsub_ps_REF(__m128 a, __m128 b)                           
01419 {
01420     ssp_m128 A, B;
01421     A.f = a;
01422     B.f = b;
01423 
01424     A.f32[0] = A.f32[0] - A.f32[1];
01425     A.f32[1] = A.f32[2] - A.f32[3];
01426     A.f32[2] = B.f32[0] - B.f32[1];
01427     A.f32[3] = B.f32[2] - B.f32[3];
01428     return A.f;
01429 }
01430 
01432 SSP_FORCEINLINE __m128d ssp_hsub_pd_REF(__m128d a, __m128d b)                        
01433 {
01434     ssp_m128 A, B;
01435     A.d = a;
01436     B.d = b;
01437 
01438     A.f64[0] = A.f64[0] - A.f64[1];
01439     A.f64[1] = B.f64[0] - B.f64[1];
01440     return A.d;
01441 }
01442 
01443 //---------------------------------------
01444 //Insert
01445 //---------------------------------------
01447 SSP_FORCEINLINE __m128i ssp_insert_epi8_REF( __m128i a, int b, const int ndx )       // Verify behavior on Intel Hardware
01448 {
01449     ssp_m128 A;
01450     A.i = a;
01451 
01452     A.s8[ndx & 0xF] = (ssp_s8)b;
01453     return A.i;
01454 }
01455 
01457 SSP_FORCEINLINE __m128i ssp_insert_epi32_REF( __m128i a, int b, const int ndx )      // Verify behavior on Intel Hardware
01458 {
01459     ssp_m128 A;
01460     A.i = a;
01461 
01462     A.s32[ndx & 0x3] = b;
01463     return A.i;
01464 }
01465 
01467 SSP_FORCEINLINE __m128i ssp_insert_epi64_REF( __m128i a, ssp_s64 b, const int ndx )  // Verify behavior on Intel Hardware
01468 {
01469     ssp_m128 A;
01470     A.i = a;
01471 
01472     A.s64[ndx & 0x1] = b;
01473     return A.i;
01474 }
01475 
01477 SSP_FORCEINLINE __m128 ssp_insert_ps_REF( __m128 a, __m128 b, const int sel )          // Verify behavior on Intel Hardware
01478 {
01479     ssp_f32 tmp;
01480     int count_d,zmask;
01481 
01482     ssp_m128 A,B;
01483     A.f = a;
01484     B.f = b;
01485 
01486     tmp     = B.f32[(sel & 0xC0)>>6];   // 0xC0 = sel[7:6]
01487     count_d = (sel & 0x30)>>4;          // 0x30 = sel[5:4]
01488     zmask   = sel & 0x0F;               // 0x0F = sel[3:0]
01489 
01490     A.f32[count_d] = tmp;
01491 
01492     A.f32[0] = (zmask & 0x1) ? 0 : A.f32[0];
01493     A.f32[1] = (zmask & 0x2) ? 0 : A.f32[1];
01494     A.f32[2] = (zmask & 0x4) ? 0 : A.f32[2];
01495     A.f32[3] = (zmask & 0x8) ? 0 : A.f32[3];
01496     return A.f;
01497 }
01498 
01500 SSP_FORCEINLINE __m128i ssp_insert_si64_REF( __m128i a, __m128i b )
01501 {
01502     ssp_u32  ndx, len;
01503     ssp_s64  mask;
01504     ssp_m128 A, B;
01505     B.i = b;
01506     ndx = (ssp_u32)((B.u64[1] & 0x3F00) >> 8);    // Mask length field.
01507     len = (ssp_u32)((B.u64[1] & 0x003F));         // Mask ndx field.
01508 
01509     if( ( (ndx + len) > 64 ) ||
01510         ( (len == 0) && (ndx > 0) ) )
01511         return a;
01512 
01513     A.i = a;
01514     if( (len == 0 ) && (ndx == 0) )
01515     {
01516         A.u64[0] = B.u64[0];
01517         return A.i;
01518     }
01519 
01520     len = (len) ? len : 64;         // A value of zero for field length is interpreted as 64.
01521     mask = ~(-1 << len);
01522     B.u64[0]  = B.u64[0] & mask;
01523     B.u64[0]  = B.u64[0] << ndx;
01524     mask      = ~(mask << ndx);
01525     A.u64[0]  = A.u64[0] & mask;
01526     A.u64[0] |= B.u64[0];
01527     return A.i;
01528 }
01529 
01531 SSP_FORCEINLINE __m128i ssp_inserti_si64_REF( __m128i a, __m128i b, int len, int ndx )
01532 {
01533     ssp_s64 mask;
01534     ssp_m128 A, B;
01535     A.i = a;
01536     ndx = ndx & 0x3F; // ndx % 64
01537     len = len & 0x3F; // len % 64
01538 
01539     if( ( (ndx + len) > 64 ) ||
01540         ( (len == 0) && (ndx > 0) ) )
01541         return a;
01542 
01543     B.i = b;
01544     if( (len == 0 ) && (ndx == 0) )
01545     {
01546         A.u64[0] = B.u64[0];
01547         return A.i;
01548     }
01549 
01550     len = (len) ? len : 64;         // A value of zero for field length is interpreted as 64.
01551     mask = ~(-1 << len);
01552     B.u64[0]  = B.u64[0] & mask;
01553     B.u64[0]  = B.u64[0] << ndx;
01554     mask      = ~(mask << ndx);
01555     A.u64[0]  = A.u64[0] & mask;
01556     A.u64[0] |= B.u64[0];
01557     return A.i;
01558 }
01559 
01560 
01561 
01562 //---------------------------------------
01563 // Load
01564 //---------------------------------------
01566 SSP_FORCEINLINE __m128d ssp_loaddup_pd_REF(double const * dp)                               
01567 {
01568     ssp_m128 a;
01569     a.f64[0] = *dp;
01570     a.f64[1] = *dp;
01571     return a.d;
01572 }
01573 
01575 SSP_FORCEINLINE __m128i ssp_lddqu_si128_REF(__m128i const *p)                               
01576 {
01577     return *p;
01578 }
01579 
01581 SSP_FORCEINLINE __m128i ssp_stream_load_si128_REF( __m128i *p )                             
01582 {
01583     return *p;
01584 }
01585 
01586 
01587 //---------------------------------------
01588 // Min / Max
01589 //---------------------------------------
01590 
// Helpers for the min/max reference functions: store into `sd` the smaller
// (SSP_SET_MIN) or larger (SSP_SET_MAX) of `sd` and `s`.
// Both arguments are evaluated more than once, so pass side-effect-free
// lvalues/expressions only. The macros expand to a single parenthesised
// expression; the caller supplies the terminating semicolon.
#define SSP_SET_MIN( sd, s ) ((sd) = ((sd) < (s)) ? (sd) : (s))
#define SSP_SET_MAX( sd, s ) ((sd) = ((sd) > (s)) ? (sd) : (s))
01593 
01594 //8 bit min/max
01596 SSP_FORCEINLINE __m128i ssp_min_epi8_REF( __m128i a, __m128i b )
01597 {
01598     ssp_m128 A,B;
01599     A.i = a;
01600     B.i = b;
01601 
01602     SSP_SET_MIN( A.s8[ 0], B.s8[ 0] );
01603     SSP_SET_MIN( A.s8[ 1], B.s8[ 1] );
01604     SSP_SET_MIN( A.s8[ 2], B.s8[ 2] );
01605     SSP_SET_MIN( A.s8[ 3], B.s8[ 3] );
01606     SSP_SET_MIN( A.s8[ 4], B.s8[ 4] );
01607     SSP_SET_MIN( A.s8[ 5], B.s8[ 5] );
01608     SSP_SET_MIN( A.s8[ 6], B.s8[ 6] );
01609     SSP_SET_MIN( A.s8[ 7], B.s8[ 7] );
01610     SSP_SET_MIN( A.s8[ 8], B.s8[ 8] );
01611     SSP_SET_MIN( A.s8[ 9], B.s8[ 9] );
01612     SSP_SET_MIN( A.s8[10], B.s8[10] );
01613     SSP_SET_MIN( A.s8[11], B.s8[11] );
01614     SSP_SET_MIN( A.s8[12], B.s8[12] );
01615     SSP_SET_MIN( A.s8[13], B.s8[13] );
01616     SSP_SET_MIN( A.s8[14], B.s8[14] );
01617     SSP_SET_MIN( A.s8[15], B.s8[15] );
01618     return A.i;
01619 }
01620 
01622 SSP_FORCEINLINE __m128i ssp_max_epi8_REF( __m128i a, __m128i b )
01623 {
01624     ssp_m128 A,B;
01625     A.i = a;
01626     B.i = b;
01627 
01628     SSP_SET_MAX( A.s8[ 0], B.s8[ 0] );
01629     SSP_SET_MAX( A.s8[ 1], B.s8[ 1] );
01630     SSP_SET_MAX( A.s8[ 2], B.s8[ 2] );
01631     SSP_SET_MAX( A.s8[ 3], B.s8[ 3] );
01632     SSP_SET_MAX( A.s8[ 4], B.s8[ 4] );
01633     SSP_SET_MAX( A.s8[ 5], B.s8[ 5] );
01634     SSP_SET_MAX( A.s8[ 6], B.s8[ 6] );
01635     SSP_SET_MAX( A.s8[ 7], B.s8[ 7] );
01636     SSP_SET_MAX( A.s8[ 8], B.s8[ 8] );
01637     SSP_SET_MAX( A.s8[ 9], B.s8[ 9] );
01638     SSP_SET_MAX( A.s8[10], B.s8[10] );
01639     SSP_SET_MAX( A.s8[11], B.s8[11] );
01640     SSP_SET_MAX( A.s8[12], B.s8[12] );
01641     SSP_SET_MAX( A.s8[13], B.s8[13] );
01642     SSP_SET_MAX( A.s8[14], B.s8[14] );
01643     SSP_SET_MAX( A.s8[15], B.s8[15] );
01644     return A.i;
01645 }
01646 
01647 //16 bit min/max
01649 SSP_FORCEINLINE __m128i ssp_min_epu16_REF ( __m128i a, __m128i b )
01650 {
01651     ssp_m128 A,B;
01652     A.i = a;
01653     B.i = b;
01654 
01655     SSP_SET_MIN( A.u16[ 0], B.u16[ 0] );
01656     SSP_SET_MIN( A.u16[ 1], B.u16[ 1] );
01657     SSP_SET_MIN( A.u16[ 2], B.u16[ 2] );
01658     SSP_SET_MIN( A.u16[ 3], B.u16[ 3] );
01659     SSP_SET_MIN( A.u16[ 4], B.u16[ 4] );
01660     SSP_SET_MIN( A.u16[ 5], B.u16[ 5] );
01661     SSP_SET_MIN( A.u16[ 6], B.u16[ 6] );
01662     SSP_SET_MIN( A.u16[ 7], B.u16[ 7] );
01663     return A.i;
01664 }
01665 
01667 SSP_FORCEINLINE __m128i ssp_max_epu16_REF ( __m128i a, __m128i b )
01668 {
01669     ssp_m128 A,B;
01670     A.i = a;
01671     B.i = b;
01672 
01673     SSP_SET_MAX( A.u16[ 0], B.u16[ 0] );
01674     SSP_SET_MAX( A.u16[ 1], B.u16[ 1] );
01675     SSP_SET_MAX( A.u16[ 2], B.u16[ 2] );
01676     SSP_SET_MAX( A.u16[ 3], B.u16[ 3] );
01677     SSP_SET_MAX( A.u16[ 4], B.u16[ 4] );
01678     SSP_SET_MAX( A.u16[ 5], B.u16[ 5] );
01679     SSP_SET_MAX( A.u16[ 6], B.u16[ 6] );
01680     SSP_SET_MAX( A.u16[ 7], B.u16[ 7] );
01681     return A.i;
01682 }
01683 
01684 //32 bit min/max
01686 SSP_FORCEINLINE __m128i ssp_min_epi32_REF( __m128i a, __m128i b )                     
01687 {
01688     ssp_m128 A,B;
01689     A.i = a;
01690     B.i = b;
01691 
01692     SSP_SET_MIN( A.s32[ 0], B.s32[ 0] );
01693     SSP_SET_MIN( A.s32[ 1], B.s32[ 1] );
01694     SSP_SET_MIN( A.s32[ 2], B.s32[ 2] );
01695     SSP_SET_MIN( A.s32[ 3], B.s32[ 3] );
01696     return A.i;
01697 }
01698 
01700 SSP_FORCEINLINE __m128i ssp_max_epi32_REF( __m128i a, __m128i b )                     
01701 {
01702     ssp_m128 A,B;
01703     A.i = a;
01704     B.i = b;
01705 
01706     SSP_SET_MAX( A.s32[ 0], B.s32[ 0] );
01707     SSP_SET_MAX( A.s32[ 1], B.s32[ 1] );
01708     SSP_SET_MAX( A.s32[ 2], B.s32[ 2] );
01709     SSP_SET_MAX( A.s32[ 3], B.s32[ 3] );
01710     return A.i;
01711 }
01712 
01714 SSP_FORCEINLINE __m128i ssp_min_epu32_REF ( __m128i a, __m128i b )                    
01715 {
01716     ssp_m128 A,B;
01717     A.i = a;
01718     B.i = b;
01719 
01720     SSP_SET_MIN( A.u32[ 0], B.u32[ 0] );
01721     SSP_SET_MIN( A.u32[ 1], B.u32[ 1] );
01722     SSP_SET_MIN( A.u32[ 2], B.u32[ 2] );
01723     SSP_SET_MIN( A.u32[ 3], B.u32[ 3] );
01724     return A.i;
01725 }
01726 
01728 SSP_FORCEINLINE __m128i ssp_max_epu32_REF ( __m128i a, __m128i b )                    
01729 {
01730     ssp_m128 A,B;
01731     A.i = a;
01732     B.i = b;
01733 
01734     SSP_SET_MAX( A.u32[ 0], B.u32[ 0] );
01735     SSP_SET_MAX( A.u32[ 1], B.u32[ 1] );
01736     SSP_SET_MAX( A.u32[ 2], B.u32[ 2] );
01737     SSP_SET_MAX( A.u32[ 3], B.u32[ 3] );
01738     return A.i;
01739 }
01740 
01741 #undef SSP_SET_MIN
01742 #undef SSP_SET_MAX
01743 
01745 SSP_FORCEINLINE __m128i ssp_minpos_epu16_REF( __m128i shortValues )                   
01746 {
01747     ssp_m128 ShortValues;
01748     ShortValues.i = shortValues;
01749 
01750     if( ShortValues.u16[1] < ShortValues.u16[0] )
01751     {
01752         ShortValues.u16[0] = ShortValues.u16[1];
01753         ShortValues.u16[1] = 1;
01754     }
01755     else
01756         ShortValues.u16[1] = 0;
01757 
01758 
01759 #define FN( I )                                     \
01760     if( ShortValues.u16[I] < ShortValues.u16[0] )   \
01761     {                                               \
01762         ShortValues.u16[0] = ShortValues.u16[I];    \
01763         ShortValues.u16[1] = I;                     \
01764     }
01765 
01766     FN( 2 );
01767     FN( 3 );
01768     FN( 4 );
01769     FN( 5 );
01770     FN( 6 );
01771     FN( 7 );
01772 
01773     ShortValues.u32[1] = 0;
01774     ShortValues.u64[1] = 0;
01775 
01776 #undef FN
01777 
01778     return ShortValues.i;
01779 }
01780 
01782 SSP_FORCEINLINE __m128i ssp_minpos_epu16_REFb( __m128i shortValues )                   
01783 {
01784     ssp_m128 ShortValues;
01785     ssp_u32 i;
01786     ssp_u16 pos = 0;
01787     ssp_u16 minVal;
01788     ShortValues.i = shortValues;
01789     minVal = ShortValues.u16[0];
01790 
01791     for( i=1; i<8; ++i )
01792     {
01793         if( ShortValues.u16[i] < minVal )
01794         {
01795             minVal = ShortValues.u16[i];
01796             pos    = i;
01797         }
01798 
01799         ShortValues.u16[i] = 0;
01800     }
01801 
01802     ShortValues.u16[0] = minVal;
01803     ShortValues.u16[1] = pos;
01804     return ShortValues.i;
01805 }
01806 
01807 
01808 //---------------------------------------
01809 // Move
01810 //---------------------------------------
01812 SSP_FORCEINLINE __m128 ssp_movehdup_ps_REF(__m128 a)                                   
01813 {
01814     ssp_m128 A;
01815     A.f = a;
01816 
01817     A.f32[0] = A.f32[1];
01818     A.f32[2] = A.f32[3];
01819     return A.f;
01820 }
01821 
01823 SSP_FORCEINLINE __m128 ssp_moveldup_ps_REF(__m128 a)                                   
01824 {
01825     ssp_m128 A;
01826     A.f = a;
01827 
01828     A.f32[1] = A.f32[0];
01829     A.f32[3] = A.f32[2];
01830     return A.f;
01831 }
01832 
01834 SSP_FORCEINLINE __m128d ssp_movedup_pd_REF(__m128d a)                                  
01835 {
01836     ssp_m128 A;
01837     A.d = a;
01838 
01839     A.f64[1] = A.f64[0];
01840     return A.d;
01841 }
01842 
01843 //---------------------------------------
01844 // Multiply
01845 //---------------------------------------
01847 SSP_FORCEINLINE __m128i ssp_mul_epi32_REF( __m128i a, __m128i b )                      
01848 {
01849     ssp_m128 A,B;
01850     A.i = a;
01851     B.i = b;
01852 
01853     A.s64[0] = A.s32[0] * B.s32[0];
01854     A.s64[1] = A.s32[2] * B.s32[2];
01855     return A.i;
01856 }
01857 
01859 SSP_FORCEINLINE __m128i ssp_mullo_epi32_REF( __m128i a, __m128i b )                    
01860 {
01861     ssp_m128 t[2];
01862     ssp_m128 A,B;
01863     A.i = a;
01864     B.i = b;
01865 
01866     t[0].s64[0] = A.s32[0] * B.s32[0];
01867     t[0].s64[1] = A.s32[1] * B.s32[1];
01868     t[1].s64[0] = A.s32[2] * B.s32[2];
01869     t[1].s64[1] = A.s32[3] * B.s32[3];    
01870 
01871     A.s32[0] = t[0].s32[0];
01872     A.s32[1] = t[0].s32[2];
01873     A.s32[2] = t[1].s32[0];
01874     A.s32[3] = t[1].s32[2];
01875     return A.i;
01876 }
01877 
01879 SSP_FORCEINLINE __m128i ssp_mpsadbw_epu8_REF ( __m128i a,   __m128i b,   const int msk  ) 
01880 {
01881         ssp_u8 Abyte[11], Bbyte[4], tmp[4];
01882         ssp_u8 Boffset, Aoffset;
01883         int i;
01884 
01885     ssp_m128 A,B;
01886     A.i = a;
01887     B.i = b;
01888 
01889         Boffset = (msk & 0x3) << 2; // *32/8,   for byte size count
01890         Aoffset = (msk & 0x4);      // *32/8/4, for byte size count and shift msk to bit 2
01891 
01892         for (i=0; i<11; i++)
01893         {
01894                 Abyte[i] = A.u8[i+Aoffset];
01895         }
01896         
01897         Bbyte[0] = B.u8[Boffset  ];
01898         Bbyte[1] = B.u8[Boffset+1];
01899         Bbyte[2] = B.u8[Boffset+2];
01900         Bbyte[3] = B.u8[Boffset+3];
01901 
01902         for (i=0; i<8; i++)
01903         {
01904                 tmp[0] = (Abyte[i  ] > Bbyte[0]) ? (Abyte[i  ] - Bbyte[0]) :  (Bbyte[0] - Abyte[i  ]);        //abs diff
01905                 tmp[1] = (Abyte[i+1] > Bbyte[1]) ? (Abyte[i+1] - Bbyte[1]) :  (Bbyte[1] - Abyte[i+1]);
01906                 tmp[2] = (Abyte[i+2] > Bbyte[2]) ? (Abyte[i+2] - Bbyte[2]) :  (Bbyte[2] - Abyte[i+2]);
01907                 tmp[3] = (Abyte[i+3] > Bbyte[3]) ? (Abyte[i+3] - Bbyte[3]) :  (Bbyte[3] - Abyte[i+3]);
01908 
01909                 A.u16[i] = tmp[0] + tmp[1] + tmp[2] + tmp[3];
01910         }
01911 
01912         return A.i;
01913 }
01914 
01915 //---------------------------------------
01916 // Pack
01917 //---------------------------------------
01919 SSP_FORCEINLINE __m128i ssp_packus_epi32_REF( __m128i a, __m128i b )                       
01920 {
01921     ssp_m128 A,B;
01922     A.i = a;
01923     B.i = b;
01924 
01925     if( A.s32[0] < 0 )
01926         A.u16[0] = 0;
01927     else
01928         if( A.s32[0] > 0xFFFF )
01929             A.u16[0] = 0xFFFF;
01930         else
01931             A.s16[0] = (ssp_u16)A.s32[0];
01932 
01933     if( A.s32[1] < 0 )
01934         A.u16[1] = 0;
01935     else
01936         if( A.s32[1] > 0xFFFF )
01937             A.u16[1] = 0xFFFF;
01938         else
01939             A.s16[1] = (ssp_u16)A.s32[1];
01940 
01941     if( A.s32[2] < 0 )
01942         A.u16[2] = 0;
01943     else
01944         if( A.s32[2] > 0xFFFF )
01945             A.u16[2] = 0xFFFF;
01946         else
01947             A.s16[2] = (ssp_u16)A.s32[2];
01948 
01949 
01950     if( A.s32[3] < 0 )
01951         A.u16[3] = 0;
01952     else
01953         if( A.s32[3] > 0xFFFF )
01954             A.u16[3] = 0xFFFF;
01955         else
01956             A.s16[3] = (ssp_u16)A.s32[3];
01957 
01958     if( B.s32[0] < 0 )
01959         A.u16[4] = 0;
01960     else
01961         if( B.s32[0] > 0xFFFF )
01962             A.u16[4] = 0xFFFF;
01963         else
01964             A.s16[4] = (ssp_u16)B.s32[0];
01965 
01966     if( B.s32[1] < 0 )
01967         A.u16[5] = 0;
01968     else
01969         if( B.s32[1] > 0xFFFF )
01970             A.u16[5] = 0xFFFF;
01971         else
01972             A.s16[5] = (ssp_u16)B.s32[1];
01973 
01974     if( B.s32[2] < 0 )
01975         A.u16[6] = 0;
01976     else
01977         if( B.s32[2] > 0xFFFF )
01978             A.u16[6] = 0xFFFF;
01979         else
01980             A.s16[6] = (ssp_u16)B.s32[2];
01981 
01982 
01983     if( B.s32[3] < 0 )
01984         A.u16[7] = 0;
01985     else
01986         if( B.s32[3] > 0xFFFF )
01987             A.u16[7] = 0xFFFF;
01988         else
01989             A.s16[7] = (ssp_u16)B.s32[3];
01990 
01991     return A.i;
01992 }
01993 
01994 
01995 //---------------------------------------
01996 // Round
01997 //---------------------------------------
01999 SSP_FORCEINLINE __m128d ssp_ceil_pd_REF( __m128d a )                                        
02000 {
02001     ssp_m128 A;
02002     A.d = a;    
02003 
02004     A.f64[0] = ceil( A.f64[0] );
02005     A.f64[1] = ceil( A.f64[1] );
02006     return A.d;
02007 }
02008 
02010 SSP_FORCEINLINE __m128 ssp_ceil_ps_REF( __m128 a )                                          
02011 {
02012     ssp_m128 A;
02013     A.f = a;
02014 
02015     A.f32[0] = (ssp_f32)ceil( A.f32[0] );
02016     A.f32[1] = (ssp_f32)ceil( A.f32[1] );
02017     A.f32[2] = (ssp_f32)ceil( A.f32[2] );
02018     A.f32[3] = (ssp_f32)ceil( A.f32[3] );
02019     return A.f;
02020 }
02021 
02023 SSP_FORCEINLINE __m128d ssp_ceil_sd_REF( __m128d a, __m128d b)                              
02024 {
02025     ssp_m128 A,B;
02026     A.d = a;
02027     B.d = b;
02028 
02029     A.f64[0] = ceil( B.f64[0] );
02030     return A.d;
02031 }
02032 
02034 SSP_FORCEINLINE __m128 ssp_ceil_ss_REF( __m128 a, __m128 b)                               
02035 {
02036     ssp_m128 A,B;
02037     A.f = a;
02038     B.f = b;
02039 
02040     A.f32[0] = (ssp_f32)ceil( B.f32[0] );
02041     return A.f;
02042 }
02043 
02045 SSP_FORCEINLINE __m128d ssp_floor_pd_REF( __m128d a )                                       
02046 {
02047     ssp_m128 A;
02048     A.d = a;
02049 
02050     A.f64[0] = floor( A.f64[0] );
02051     A.f64[1] = floor( A.f64[1] );
02052     return A.d;
02053 }
02054 
02056 SSP_FORCEINLINE __m128 ssp_floor_ps_REF( __m128 a )                                         
02057 {
02058     ssp_m128 A;
02059     A.f = a;
02060 
02061     A.f32[0] = (float)floor( A.f32[0] );
02062     A.f32[1] = (float)floor( A.f32[1] );
02063     A.f32[2] = (float)floor( A.f32[2] );
02064     A.f32[3] = (float)floor( A.f32[3] );
02065     return A.f;
02066 }
02067 
02069 SSP_FORCEINLINE __m128d ssp_floor_sd_REF( __m128d a, __m128d b )                            
02070 {
02071     ssp_m128 A,B;
02072     A.d = a;
02073     B.d = b;
02074 
02075     A.f64[0] = floor( B.f64[0] );
02076     return A.d;
02077 }
02078 
02080 SSP_FORCEINLINE __m128 ssp_floor_ss_REF( __m128 a, __m128 b )                            
02081 {
02082     ssp_m128 A,B;
02083     A.f = a;
02084     B.f = b;
02085 
02086     A.f32[0] = (float)floor( B.f32[0] );
02087     return A.f;
02088 }
02089 
02091 SSP_FORCEINLINE __m128d ssp_round_pd_REF( __m128d val, int iRoundMode )                     
02092 {
02093     ssp_s64 *valPtr;
02094     ssp_m128 Val;
02095     Val.d = val;
02096 
02097     switch( iRoundMode & 0x3 )
02098     {
02099     case SSP_FROUND_CUR_DIRECTION:
02100         break;
02101     case SSP_FROUND_TO_ZERO:
02102         valPtr = (ssp_s64*)(&Val.f64[0]);
02103         if( ssp_number_isValidNumber_F64_REF( valPtr ) )
02104             Val.f64[0] = (ssp_f64)( (ssp_s64)Val.f64[0] );
02105 
02106         valPtr = (ssp_s64*)(&Val.f64[1]);
02107         if( ssp_number_isValidNumber_F64_REF( valPtr ) )
02108             Val.f64[1] = (ssp_f64)( (ssp_s64)Val.f64[1] );
02109         break;
02110     case SSP_FROUND_TO_POS_INF:
02111         valPtr = (ssp_s64*)(&Val.f64[0]);
02112         if( ssp_number_isValidNumber_F64_REF( valPtr ) )
02113             Val.f64[0] = ceil( Val.f64[0] );
02114 
02115         valPtr = (ssp_s64*)(&Val.f64[1]);
02116         if( ssp_number_isValidNumber_F64_REF( valPtr ) )
02117             Val.f64[1] = ceil( Val.f64[1] );
02118         break;
02119     case SSP_FROUND_TO_NEG_INF:
02120         valPtr = (ssp_s64*)(&Val.f64[0]);
02121         if( ssp_number_isValidNumber_F64_REF( valPtr ) )
02122             Val.f64[0] = floor( Val.f64[0] );
02123 
02124         valPtr = (ssp_s64*)(&Val.f64[1]);
02125         if( ssp_number_isValidNumber_F64_REF( valPtr ) )
02126             Val.f64[1] = floor( Val.f64[1] );
02127         break;
02128     default: // SSP_FROUND_TO_NEAREST_INT
02129         valPtr = (ssp_s64*)(&Val.f64[0]);
02130         if( ssp_number_isValidNumber_F64_REF( valPtr ) )
02131             Val.f64[0] = (ssp_f64)( (Val.f64[0]>0) ? (ssp_s64)(Val.f64[0]+0.5) : (ssp_s64)(Val.f64[0]-0.5) );
02132         else
02133             Val.f64[0] = ssp_number_changeSNanToQNaN_F64_REF( valPtr );
02134 
02135         valPtr = (ssp_s64*)(&Val.f64[1]);
02136         if( ssp_number_isValidNumber_F64_REF( valPtr ) )
02137             Val.f64[1] = (ssp_f64)( (Val.f64[1]>0) ? (ssp_s64)(Val.f64[1]+0.5) : (ssp_s64)(Val.f64[1]-0.5) );
02138         else
02139             Val.f64[1] = ssp_number_changeSNanToQNaN_F64_REF( valPtr );
02140     }
02141     return Val.d;
02142 }
02143 
02145 SSP_FORCEINLINE __m128 ssp_round_ps_REF( __m128 val, int iRoundMode )                     
02146 {
02147     ssp_s32 *valPtr;
02148     ssp_m128 Val;
02149     Val.f = val;
02150 
02151     switch( iRoundMode & 0x3 )
02152     {
02153     case SSP_FROUND_CUR_DIRECTION:
02154         break;
02155     case SSP_FROUND_TO_ZERO:
02156         valPtr = (ssp_s32*)(&Val.f32[0]);
02157         if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02158         {
02159             if( Val.f32[0] >= 0 )
02160                 Val.f32[0] = (ssp_f32)( (ssp_s32)Val.f32[0] );
02161             else
02162             {
02163                 Val.f32[0] = (ssp_f32)( (ssp_s32)Val.f32[0] );
02164                 //Val.s32[0] = Val.s32[0] | 0x80000000;
02165             }
02166         }
02167 
02168         valPtr = (ssp_s32*)(&Val.f32[1]);
02169         if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02170         {
02171             if( Val.f32[1] >= 0 )
02172                 Val.f32[1] = (ssp_f32)( (ssp_s32)Val.f32[1] );
02173             else
02174             {
02175                 Val.f32[1] = (ssp_f32)( (ssp_s32)Val.f32[1] );
02176                 //Val.s32[1] = Val.s32[1] | 0x80000000;
02177             }
02178         }
02179 
02180         valPtr = (ssp_s32*)(&Val.f32[2]);
02181         if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02182         {
02183             if( Val.f32[2] >= 0 )
02184                 Val.f32[2] = (ssp_f32)( (ssp_s32)Val.f32[2] );
02185             else
02186             {
02187                 Val.f32[2] = (ssp_f32)( (ssp_s32)Val.f32[2] );
02188                 //Val.s32[2] = Val.s32[2] | 0x80000000;
02189             }
02190         }
02191 
02192         valPtr = (ssp_s32*)(&Val.f32[3]);
02193         if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02194         {
02195             if( Val.f32[3] >= 0 )
02196                 Val.f32[3] = (ssp_f32)( (ssp_s32)Val.f32[3] );
02197             else
02198             {
02199                 Val.f32[3] = (ssp_f32)( (ssp_s32)Val.f32[3] );
02200                 //Val.s32[3] = Val.s32[3] | 0x80000000;
02201             }
02202         }
02203         break;
02204     case SSP_FROUND_TO_POS_INF:
02205         valPtr = (ssp_s32*)(&Val.f32[0]);
02206         if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02207             Val.f32[0] = (ssp_f32)ceil( Val.f32[0] );
02208 
02209         valPtr = (ssp_s32*)(&Val.f32[1]);
02210         if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02211             Val.f32[1] = (ssp_f32)ceil( Val.f32[1] );
02212 
02213         valPtr = (ssp_s32*)(&Val.f32[2]);
02214         if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02215             Val.f32[2] = (ssp_f32)ceil( Val.f32[2] );
02216 
02217         valPtr = (ssp_s32*)(&Val.f32[3]);
02218         if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02219             Val.f32[3] = (ssp_f32)ceil( Val.f32[3] );
02220         break;
02221     case SSP_FROUND_TO_NEG_INF:
02222         valPtr = (ssp_s32*)(&Val.f32[0]);
02223         if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02224             Val.f32[0] = (ssp_f32)floor( Val.f32[0] );
02225 
02226         valPtr = (ssp_s32*)(&Val.f32[1]);
02227         if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02228             Val.f32[1] = (ssp_f32)floor( Val.f32[1] );
02229 
02230         valPtr = (ssp_s32*)(&Val.f32[2]);
02231         if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02232             Val.f32[2] = (ssp_f32)floor( Val.f32[2] );
02233 
02234         valPtr = (ssp_s32*)(&Val.f32[3]);
02235         if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02236             Val.f32[3] = (ssp_f32)floor( Val.f32[3] );
02237         break;
02238     default: // SSP_FROUND_TO_NEAREST_INT
02239         valPtr = (ssp_s32*)(&Val.f32[0]);
02240         if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02241             Val.f32[0] = (ssp_f32)( (Val.f32[0]>0) ? (ssp_s32)(Val.f32[0]+0.5) : (ssp_s32)(Val.f32[0]-0.5) );
02242         else
02243             Val.f32[0] = ssp_number_changeSNanToQNaN_F32_REF( valPtr );
02244 
02245         valPtr = (ssp_s32*)(&Val.f32[1]);
02246         if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02247             Val.f32[1] = (ssp_f32)( (Val.f32[1]>0) ? (ssp_s32)(Val.f32[1]+0.5) : (ssp_s32)(Val.f32[1]-0.5) );
02248         else
02249             Val.f32[1] = ssp_number_changeSNanToQNaN_F32_REF( valPtr );
02250 
02251         valPtr = (ssp_s32*)(&Val.f32[2]);
02252         if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02253             Val.f32[2] = (ssp_f32)( (Val.f32[2]>0) ? (ssp_s32)(Val.f32[2]+0.5) : (ssp_s32)(Val.f32[2]-0.5) );
02254         else
02255             Val.f32[2] = ssp_number_changeSNanToQNaN_F32_REF( valPtr );
02256 
02257         valPtr = (ssp_s32*)(&Val.f32[3]);
02258         if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02259             Val.f32[3] = (ssp_f32)( (Val.f32[3]>0) ? (ssp_s32)(Val.f32[3]+0.5) : (ssp_s32)(Val.f32[3]-0.5) );
02260         else
02261             Val.f32[3] = ssp_number_changeSNanToQNaN_F32_REF( valPtr );
02262     }
02263 
02264     if( -0.0f == Val.f32[0] ) Val.f32[0]=+0.0f;
02265     if( -0.0f == Val.f32[1] ) Val.f32[1]=+0.0f;
02266     if( -0.0f == Val.f32[2] ) Val.f32[2]=+0.0f;
02267     if( -0.0f == Val.f32[3] ) Val.f32[3]=+0.0f;
02268 
02269     return Val.f;
02270 }
02271 
02273 SSP_FORCEINLINE __m128d ssp_round_sd_REF( __m128d dst, __m128d val, int iRoundMode )        
02274 {
02275     ssp_s64 *valPtr;
02276     ssp_m128 Dst, Val;
02277     Dst.d = dst;
02278     Val.d = val;
02279 
02280     switch( iRoundMode & 0x3 )
02281     {
02282     case SSP_FROUND_CUR_DIRECTION:
02283         break;
02284     case SSP_FROUND_TO_ZERO:
02285         valPtr = (ssp_s64*)(&Val.f64[0]);
02286         if( ssp_number_isValidNumber_F64_REF( valPtr ) )
02287             Dst.f64[0] = (ssp_f64)( (ssp_s64)Val.f64[0] );
02288         break;
02289     case SSP_FROUND_TO_POS_INF:
02290         valPtr = (ssp_s64*)(&Val.f64[0]);
02291         if( ssp_number_isValidNumber_F64_REF( valPtr ) )
02292             Dst.f64[0] = ceil( Val.f64[0] );
02293         break;
02294     case SSP_FROUND_TO_NEG_INF:
02295         valPtr = (ssp_s64*)(&Val.f64[0]);
02296         if( ssp_number_isValidNumber_F64_REF( valPtr ) )
02297             Dst.f64[0] = floor( Val.f64[0] );
02298         break;
02299     default: // SSP_FROUND_TO_NEAREST_INT
02300         valPtr = (ssp_s64*)(&Val.f64[0]);
02301         if( ssp_number_isValidNumber_F64_REF( valPtr ) )
02302             Dst.f64[0] = (ssp_f64)( (Val.f64[0]>0) ? (ssp_s64)(Val.f64[0]+0.5) : (ssp_s64)(Val.f64[0]-0.5) );
02303         else
02304             Dst.f64[0] = ssp_number_changeSNanToQNaN_F64_REF( valPtr );
02305     }
02306     return Dst.d;
02307 }
02308 
02310 SSP_FORCEINLINE __m128 ssp_round_ss_REF( __m128 dst, __m128 val, int iRoundMode )        //_mm_round_ss
02311 {
02312     ssp_s32 *valPtr;
02313     ssp_m128 Dst, Val;
02314     Dst.f = dst;
02315     Val.f = val;
02316 
02317     switch( iRoundMode & 0x3 )
02318     {
02319     case SSP_FROUND_CUR_DIRECTION:
02320         break;
02321     case SSP_FROUND_TO_ZERO:
02322         valPtr = (ssp_s32*)(&Val.f32[0]);
02323         if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02324         {
02325             Dst.f32[0] = (ssp_f32)( (ssp_s32)Val.f32[0] );
02326             if( Val.f32[0] <= -0 )
02327                 Dst.s32[0] = Dst.s32[0] | 0x80000000;
02328         }
02329         break;
02330     case SSP_FROUND_TO_POS_INF:
02331         valPtr = (ssp_s32*)(&Val.f32[0]);
02332         if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02333             Dst.f32[0] = (ssp_f32)ceil( Val.f32[0] );
02334         break;
02335     case SSP_FROUND_TO_NEG_INF:
02336         valPtr = (ssp_s32*)(&Val.f32[0]);
02337         if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02338             Dst.f32[0] = (ssp_f32)floor( Val.f32[0] );
02339         break;
02340     default: // SSP_FROUND_TO_NEAREST_INT
02341         valPtr = (ssp_s32*)(&Val.f32[0]);
02342         if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02343             Dst.f32[0] = (ssp_f32)( (Val.f32[0]>0) ? (ssp_s32)(Val.f32[0]+0.5) : (ssp_s32)(Val.f32[0]-0.5) );
02344         else
02345             Dst.f32[0] = ssp_number_changeSNanToQNaN_F32_REF( valPtr );
02346     }
02347     return Dst.f;
02348 }
02349 
02350 //---------------------------------------
02351 // Test
02352 //---------------------------------------
02354 SSP_FORCEINLINE int ssp_testc_si128_REF( __m128i a, __m128i b)                              
02355 {
02356     ssp_m128 A,B;
02357     A.i = a;
02358     B.i = b;
02359 
02360     return ( (A.s64[0] & B.s64[0]) == A.s64[0] ) &&
02361            ( (A.s64[1] & B.s64[1]) == A.s64[1] ) ;
02362 }
02363 
02365 SSP_FORCEINLINE int ssp_testz_si128_REF( __m128i a, __m128i b)                              
02366 {
02367     ssp_m128 A,B;
02368     A.i = a;
02369     B.i = b;
02370 
02371     return ( (A.s64[0] & B.s64[0]) == 0 ) &&
02372            ( (A.s64[1] & B.s64[1]) == 0 ) ;
02373 }
02374 
02376 SSP_FORCEINLINE int ssp_testnzc_si128_REF( __m128i a, __m128i b)                            
02377 {
02378     int zf, cf;
02379     ssp_m128 A,B;
02380     A.i = a;
02381     B.i = b;
02382 
02383     zf = ssp_testz_si128_REF( A.i, B.i);
02384 
02385     cf = ( (~A.s64[0] & B.s64[0]) == 0 ) &&
02386          ( (~A.s64[1] & B.s64[1]) == 0 ) ;
02387     return ((int)!zf & (int)!cf);
02388 }
02389 
02390 //---------------------------------------
02391 // Type Conversion
02392 //---------------------------------------
02394 SSP_FORCEINLINE __m128i ssp_cvtepi8_epi16_REF ( __m128i a)                                  
02395 {
02396     ssp_m128 A;
02397     A.i = a;
02398 
02399         A.s16[7] = A.s8[7];
02400         A.s16[6] = A.s8[6];
02401         A.s16[5] = A.s8[5];
02402         A.s16[4] = A.s8[4];
02403         A.s16[3] = A.s8[3];
02404         A.s16[2] = A.s8[2];
02405         A.s16[1] = A.s8[1];
02406         A.s16[0] = A.s8[0];
02407         return A.i;
02408 }
02409 
02411 SSP_FORCEINLINE __m128i ssp_cvtepi8_epi32_REF ( __m128i a)                                  
02412 {
02413     ssp_m128 A;
02414     A.i = a;
02415 
02416         A.s32[3] = A.s8[3];
02417         A.s32[2] = A.s8[2];
02418         A.s32[1] = A.s8[1];
02419         A.s32[0] = A.s8[0];
02420         return A.i;
02421 }
02422 
02424 SSP_FORCEINLINE __m128i ssp_cvtepi8_epi64_REF ( __m128i a)                                  
02425 {
02426     ssp_m128 A;
02427     A.i = a;
02428 
02429         A.s64[1] = A.s8[1];
02430         A.s64[0] = A.s8[0];
02431         return A.i;
02432 }
02433 
02435 SSP_FORCEINLINE __m128i ssp_cvtepi16_epi32_REF ( __m128i a)                                 
02436 {
02437     ssp_m128 A;
02438     A.i = a;
02439 
02440         A.s32[3] = A.s16[3];
02441         A.s32[2] = A.s16[2];
02442         A.s32[1] = A.s16[1];
02443         A.s32[0] = A.s16[0];
02444         return A.i;
02445 }
02446 
02448 SSP_FORCEINLINE __m128i ssp_cvtepi16_epi64_REF ( __m128i a)                                 
02449 {
02450     ssp_m128 A;
02451     A.i = a;
02452 
02453         A.s64[1] = A.s16[1];
02454         A.s64[0] = A.s16[0];
02455         return A.i;
02456 }
02457 
02459 SSP_FORCEINLINE __m128i ssp_cvtepi32_epi64_REF ( __m128i a)                                 
02460 {
02461     ssp_m128 A;
02462     A.i = a;
02463 
02464         A.s64[1] = A.s32[1];
02465     A.s64[0] = A.s32[0];
02466         return A.i;
02467 }
02468 
02470 SSP_FORCEINLINE __m128i ssp_cvtepu8_epi16_REF ( __m128i a)                                  
02471 {
02472     ssp_m128 A;
02473     A.i = a;
02474 
02475         A.s16[7] = A.u8[7];
02476         A.s16[6] = A.u8[6];
02477         A.s16[5] = A.u8[5];
02478         A.s16[4] = A.u8[4];
02479         A.s16[3] = A.u8[3];
02480         A.s16[2] = A.u8[2];
02481         A.s16[1] = A.u8[1];
02482         A.s16[0] = A.u8[0];
02483         return A.i;
02484 }
02485 
02487 SSP_FORCEINLINE __m128i ssp_cvtepu8_epi32_REF ( __m128i a)                                  
02488 {
02489     ssp_m128 A;
02490     A.i = a;
02491 
02492         A.s32[3] = A.u8[3];
02493         A.s32[2] = A.u8[2];
02494         A.s32[1] = A.u8[1];
02495         A.s32[0] = A.u8[0];
02496         return A.i;
02497 }
02498 
02500 SSP_FORCEINLINE __m128i ssp_cvtepu8_epi64_REF ( __m128i a)                                  
02501 {
02502     ssp_m128 A;
02503     A.i = a;
02504 
02505         A.s64[1] = A.u8[1];
02506         A.s64[0] = A.u8[0];
02507         return A.i;
02508 }
02509 
02511 SSP_FORCEINLINE __m128i ssp_cvtepu16_epi32_REF ( __m128i a)                                 
02512 {
02513     ssp_m128 A;
02514     A.i = a;
02515 
02516         A.s32[3] = A.u16[3];
02517         A.s32[2] = A.u16[2];
02518         A.s32[1] = A.u16[1];
02519         A.s32[0] = A.u16[0];
02520         return A.i;
02521 }
02522 
02524 SSP_FORCEINLINE __m128i ssp_cvtepu16_epi64_REF ( __m128i a)                                 
02525 {
02526     ssp_m128 A;
02527     A.i = a;
02528 
02529         A.s64[1] = A.u16[1];
02530         A.s64[0] = A.u16[0];
02531         return A.i;
02532 }
02533 
02535 SSP_FORCEINLINE __m128i ssp_cvtepu32_epi64_REF ( __m128i a)                                 
02536 {
02537     ssp_m128 A;
02538     A.i = a;
02539 
02540         A.s64[1] = A.u32[1];
02541         A.s64[0] = A.u32[0];
02542         return A.i;
02543 }
02544 
02545 //SSSE3
02546 //__m128i _mm_abs_epi8(__m128i a);
02548 SSP_FORCEINLINE __m128i ssp_abs_epi8_REF (__m128i a)
02549 {
02550     ssp_m128 A;
02551     A.i = a;
02552 
02553         A.s8[0]  = (A.s8[0] < 0) ? -A.s8[0]  : A.s8[0];
02554         A.s8[1]  = (A.s8[1] < 0) ? -A.s8[1]  : A.s8[1];
02555         A.s8[2]  = (A.s8[2] < 0) ? -A.s8[2]  : A.s8[2];
02556         A.s8[3]  = (A.s8[3] < 0) ? -A.s8[3]  : A.s8[3];
02557         A.s8[4]  = (A.s8[4] < 0) ? -A.s8[4]  : A.s8[4];
02558         A.s8[5]  = (A.s8[5] < 0) ? -A.s8[5]  : A.s8[5];
02559         A.s8[6]  = (A.s8[6] < 0) ? -A.s8[6]  : A.s8[6];
02560         A.s8[7]  = (A.s8[7] < 0) ? -A.s8[7]  : A.s8[7];
02561         A.s8[8]  = (A.s8[8] < 0) ? -A.s8[8]  : A.s8[8];
02562         A.s8[9]  = (A.s8[9] < 0) ? -A.s8[9]  : A.s8[9];
02563         A.s8[10] = (A.s8[10]< 0) ? -A.s8[10] : A.s8[10];
02564         A.s8[11] = (A.s8[11]< 0) ? -A.s8[11] : A.s8[11];
02565         A.s8[12] = (A.s8[12]< 0) ? -A.s8[12] : A.s8[12];
02566         A.s8[13] = (A.s8[13]< 0) ? -A.s8[13] : A.s8[13];
02567         A.s8[14] = (A.s8[14]< 0) ? -A.s8[14] : A.s8[14];
02568         A.s8[15] = (A.s8[15]< 0) ? -A.s8[15] : A.s8[15];
02569 
02570         return A.i;
02571 }
02572 
02573 //__m128i _mm_abs_epi16(__m128i a);
02575 SSP_FORCEINLINE __m128i ssp_abs_epi16_REF (__m128i a)
02576 {
02577     ssp_m128 A;
02578     A.i = a;
02579 
02580         A.s16[0]  = (A.s16[0] < 0) ? -A.s16[0]  : A.s16[0];
02581         A.s16[1]  = (A.s16[1] < 0) ? -A.s16[1]  : A.s16[1];
02582         A.s16[2]  = (A.s16[2] < 0) ? -A.s16[2]  : A.s16[2];
02583         A.s16[3]  = (A.s16[3] < 0) ? -A.s16[3]  : A.s16[3];
02584         A.s16[4]  = (A.s16[4] < 0) ? -A.s16[4]  : A.s16[4];
02585         A.s16[5]  = (A.s16[5] < 0) ? -A.s16[5]  : A.s16[5];
02586         A.s16[6]  = (A.s16[6] < 0) ? -A.s16[6]  : A.s16[6];
02587         A.s16[7]  = (A.s16[7] < 0) ? -A.s16[7]  : A.s16[7];
02588 
02589         return A.i;
02590 }
02591 
02593 SSP_FORCEINLINE __m128i ssp_abs_epi32_REF (__m128i a)
02594 {
02595     ssp_m128 A;
02596     A.i = a;
02597 
02598         A.s32[0]  = (A.s32[0] < 0) ? -A.s32[0]  : A.s32[0];
02599         A.s32[1]  = (A.s32[1] < 0) ? -A.s32[1]  : A.s32[1];
02600         A.s32[2]  = (A.s32[2] < 0) ? -A.s32[2]  : A.s32[2];
02601         A.s32[3]  = (A.s32[3] < 0) ? -A.s32[3]  : A.s32[3];
02602 
02603         return A.i;
02604 }
02605 
02609 SSP_FORCEINLINE __m64 ssp_abs_pi8_REF (__m64 a)
02610 {
02611     ssp_m64 A;
02612     A.m64 = a;
02613 
02614         A.s8[0]  = (A.s8[0] < 0) ? -A.s8[0]  : A.s8[0];
02615         A.s8[1]  = (A.s8[1] < 0) ? -A.s8[1]  : A.s8[1];
02616         A.s8[2]  = (A.s8[2] < 0) ? -A.s8[2]  : A.s8[2];
02617         A.s8[3]  = (A.s8[3] < 0) ? -A.s8[3]  : A.s8[3];
02618         A.s8[4]  = (A.s8[4] < 0) ? -A.s8[4]  : A.s8[4];
02619         A.s8[5]  = (A.s8[5] < 0) ? -A.s8[5]  : A.s8[5];
02620         A.s8[6]  = (A.s8[6] < 0) ? -A.s8[6]  : A.s8[6];
02621         A.s8[7]  = (A.s8[7] < 0) ? -A.s8[7]  : A.s8[7];
02622 
02623         return A.m64;
02624 }
02625 
02626 //__m64 _mm_abs_pi16( __m64 a);
02630 SSP_FORCEINLINE __m64 ssp_abs_pi16_REF (__m64 a)
02631 {
02632     ssp_m64 A;
02633     A.m64 = a;
02634 
02635         A.s16[0]  = (A.s16[0] < 0) ? -A.s16[0]  : A.s16[0];
02636         A.s16[1]  = (A.s16[1] < 0) ? -A.s16[1]  : A.s16[1];
02637         A.s16[2]  = (A.s16[2] < 0) ? -A.s16[2]  : A.s16[2];
02638         A.s16[3]  = (A.s16[3] < 0) ? -A.s16[3]  : A.s16[3];
02639 
02640         return A.m64;
02641 }
02642 
02643 //__m64 _mm_abs_pi32( __m64 a);
02647 SSP_FORCEINLINE __m64 ssp_abs_pi32_REF (__m64 a)
02648 {
02649     ssp_m64 A;
02650     A.m64 = a;
02651 
02652         A.s32[0]  = (A.s32[0] < 0) ? -A.s32[0]  : A.s32[0];
02653         A.s32[1]  = (A.s32[1] < 0) ? -A.s32[1]  : A.s32[1];
02654 
02655         return A.m64;
02656 }
02657 
02658 // bit manipulation
02659 //__m128i _mm_alignr_epi8(__m128i a, __m128i b, const int ralign);
02661 SSP_FORCEINLINE __m128i ssp_alignr_epi8_REF (__m128i a, __m128i b, const int ralign)
02662 {
02663     ssp_m128 C[3];
02664         ssp_s8 * tmp;
02665         int i, j;
02666 
02667         if (ralign <0) return b; //only shift to right, no negative
02668         C[2].i = _mm_setzero_si128();
02669         if (ralign > 32) return C[2].i;
02670     C[1].i = a;
02671         C[0].i = b;
02672         tmp = & (C[0].s8[0]);
02673 
02674         for (i=ralign+15, j=15; i >=ralign; i--, j--) {
02675                 C[2].s8[j] = tmp[i];
02676         }
02677 
02678         return C[2].i;
02679 }
02680 
02684 SSP_FORCEINLINE __m64 ssp_alignr_pi8_REF (__m64 a, __m64 b, const int ralign)
02685 {
02686     ssp_m64 C[3];
02687         ssp_s8 * tmp;
02688         int i, j;
02689 
02690         if (ralign <0) return b; //only shift to right, no negative
02691         C[2].u32[0] = 0;
02692         C[2].u32[1] = 0;
02693         if (ralign > 16) return C[2].m64;
02694     C[1].m64 = a;
02695         C[0].m64 = b;
02696         tmp = & (C[0].s8[0]);
02697 
02698         for (i=ralign+7, j=7; i >=ralign; i--, j--) {
02699                 C[2].s8[j] = tmp[i];
02700         }
02701 
02702         return C[2].m64;
02703 }
02704 
02705 //__m128i _mm_shuffle_epi8( __m128i a, __m128i mask);
02707 SSP_FORCEINLINE __m128i ssp_shuffle_epi8_REF (__m128i a, __m128i mask)
02708 {
02709     ssp_m128 A, MSK, B;
02710         A.i = a;
02711         MSK.i = mask;
02712 
02713         B.s8[0]  = (MSK.s8[0]  & 0x80) ? 0 : A.s8[(MSK.s8[0]  & 0xf)];
02714         B.s8[1]  = (MSK.s8[1]  & 0x80) ? 0 : A.s8[(MSK.s8[1]  & 0xf)];
02715         B.s8[2]  = (MSK.s8[2]  & 0x80) ? 0 : A.s8[(MSK.s8[2]  & 0xf)];
02716         B.s8[3]  = (MSK.s8[3]  & 0x80) ? 0 : A.s8[(MSK.s8[3]  & 0xf)];
02717         B.s8[4]  = (MSK.s8[4]  & 0x80) ? 0 : A.s8[(MSK.s8[4]  & 0xf)];
02718         B.s8[5]  = (MSK.s8[5]  & 0x80) ? 0 : A.s8[(MSK.s8[5]  & 0xf)];
02719         B.s8[6]  = (MSK.s8[6]  & 0x80) ? 0 : A.s8[(MSK.s8[6]  & 0xf)];
02720         B.s8[7]  = (MSK.s8[7]  & 0x80) ? 0 : A.s8[(MSK.s8[7]  & 0xf)];
02721         B.s8[8]  = (MSK.s8[8]  & 0x80) ? 0 : A.s8[(MSK.s8[8]  & 0xf)];
02722         B.s8[9]  = (MSK.s8[9]  & 0x80) ? 0 : A.s8[(MSK.s8[9]  & 0xf)];
02723         B.s8[10] = (MSK.s8[10] & 0x80) ? 0 : A.s8[(MSK.s8[10] & 0xf)];
02724         B.s8[11] = (MSK.s8[11] & 0x80) ? 0 : A.s8[(MSK.s8[11] & 0xf)];
02725         B.s8[12] = (MSK.s8[12] & 0x80) ? 0 : A.s8[(MSK.s8[12] & 0xf)];
02726         B.s8[13] = (MSK.s8[13] & 0x80) ? 0 : A.s8[(MSK.s8[13] & 0xf)];
02727         B.s8[14] = (MSK.s8[14] & 0x80) ? 0 : A.s8[(MSK.s8[14] & 0xf)];
02728         B.s8[15] = (MSK.s8[15] & 0x80) ? 0 : A.s8[(MSK.s8[15] & 0xf)];
02729 
02730         return B.i;
02731 }
02732 
02736 SSP_FORCEINLINE __m64 ssp_shuffle_pi8_REF (__m64 a, __m64 mask)
02737 {
02738     ssp_m64 A, MSK, B;
02739         A.m64 = a;
02740         MSK.m64 = mask;
02741 
02742         B.s8[0]  = (MSK.s8[0]  & 0x80) ? 0 : A.s8[(MSK.s8[0]  & 0xf)];
02743         B.s8[1]  = (MSK.s8[1]  & 0x80) ? 0 : A.s8[(MSK.s8[1]  & 0xf)];
02744         B.s8[2]  = (MSK.s8[2]  & 0x80) ? 0 : A.s8[(MSK.s8[2]  & 0xf)];
02745         B.s8[3]  = (MSK.s8[3]  & 0x80) ? 0 : A.s8[(MSK.s8[3]  & 0xf)];
02746         B.s8[4]  = (MSK.s8[4]  & 0x80) ? 0 : A.s8[(MSK.s8[4]  & 0xf)];
02747         B.s8[5]  = (MSK.s8[5]  & 0x80) ? 0 : A.s8[(MSK.s8[5]  & 0xf)];
02748         B.s8[6]  = (MSK.s8[6]  & 0x80) ? 0 : A.s8[(MSK.s8[6]  & 0xf)];
02749         B.s8[7]  = (MSK.s8[7]  & 0x80) ? 0 : A.s8[(MSK.s8[7]  & 0xf)];
02750 
02751         return B.m64;
02752 }
02753 
// Sign transfer: negate, zero, or keep each element of a according to the sign of the matching element of b
02755 //__m128i _mm_sign_epi8( __m128i a, __m128i b);
02757 SSP_FORCEINLINE __m128i ssp_sign_epi8_REF (__m128i a, __m128i b)
02758 {
02759     ssp_m128 A, B;
02760         A.i = a;
02761         B.i = b;
02762 
02763         A.s8[0]  = (B.s8[0]<0)  ? (-A.s8[0])  :((B.s8[0]==0) ? 0: A.s8[0]);
02764         A.s8[1]  = (B.s8[1]<0)  ? (-A.s8[1])  :((B.s8[1]==0) ? 0: A.s8[1]);
02765         A.s8[2]  = (B.s8[2]<0)  ? (-A.s8[2])  :((B.s8[2]==0) ? 0: A.s8[2]);
02766         A.s8[3]  = (B.s8[3]<0)  ? (-A.s8[3])  :((B.s8[3]==0) ? 0: A.s8[3]);
02767         A.s8[4]  = (B.s8[4]<0)  ? (-A.s8[4])  :((B.s8[4]==0) ? 0: A.s8[4]);
02768         A.s8[5]  = (B.s8[5]<0)  ? (-A.s8[5])  :((B.s8[5]==0) ? 0: A.s8[5]);
02769         A.s8[6]  = (B.s8[6]<0)  ? (-A.s8[6])  :((B.s8[6]==0) ? 0: A.s8[6]);
02770         A.s8[7]  = (B.s8[7]<0)  ? (-A.s8[7])  :((B.s8[7]==0) ? 0: A.s8[7]);
02771         A.s8[8]  = (B.s8[8]<0)  ? (-A.s8[8])  :((B.s8[8]==0) ? 0: A.s8[8]);
02772         A.s8[9]  = (B.s8[9]<0)  ? (-A.s8[9])  :((B.s8[9]==0) ? 0: A.s8[9]);
02773         A.s8[10] = (B.s8[10]<0) ? (-A.s8[10]) :((B.s8[10]==0)? 0: A.s8[10]);
02774         A.s8[11] = (B.s8[11]<0) ? (-A.s8[11]) :((B.s8[11]==0)? 0: A.s8[11]);
02775         A.s8[12] = (B.s8[12]<0) ? (-A.s8[12]) :((B.s8[12]==0)? 0: A.s8[12]);
02776         A.s8[13] = (B.s8[13]<0) ? (-A.s8[13]) :((B.s8[13]==0)? 0: A.s8[13]);
02777         A.s8[14] = (B.s8[14]<0) ? (-A.s8[14]) :((B.s8[14]==0)? 0: A.s8[14]);
02778         A.s8[15] = (B.s8[15]<0) ? (-A.s8[15]) :((B.s8[15]==0)? 0: A.s8[15]);
02779 
02780         return A.i;
02781 }
02782 
02783 //__m128i _mm_sign_epi16( __m128i a, __m128i b);
02785 SSP_FORCEINLINE __m128i ssp_sign_epi16_REF (__m128i a, __m128i b)
02786 {
02787     ssp_m128 A, B;
02788         A.i = a;
02789         B.i = b;
02790 
02791         A.s16[0]  = (B.s16[0]<0)  ? (-A.s16[0])  :((B.s16[0]==0) ? 0: A.s16[0]);
02792         A.s16[1]  = (B.s16[1]<0)  ? (-A.s16[1])  :((B.s16[1]==0) ? 0: A.s16[1]);
02793         A.s16[2]  = (B.s16[2]<0)  ? (-A.s16[2])  :((B.s16[2]==0) ? 0: A.s16[2]);
02794         A.s16[3]  = (B.s16[3]<0)  ? (-A.s16[3])  :((B.s16[3]==0) ? 0: A.s16[3]);
02795         A.s16[4]  = (B.s16[4]<0)  ? (-A.s16[4])  :((B.s16[4]==0) ? 0: A.s16[4]);
02796         A.s16[5]  = (B.s16[5]<0)  ? (-A.s16[5])  :((B.s16[5]==0) ? 0: A.s16[5]);
02797         A.s16[6]  = (B.s16[6]<0)  ? (-A.s16[6])  :((B.s16[6]==0) ? 0: A.s16[6]);
02798         A.s16[7]  = (B.s16[7]<0)  ? (-A.s16[7])  :((B.s16[7]==0) ? 0: A.s16[7]);
02799 
02800         return A.i;
02801 }
02802 
02803 //__m128i _mm_sign_epi32( __m128i a, __m128i b);
02805 SSP_FORCEINLINE __m128i ssp_sign_epi32_REF (__m128i a, __m128i b)
02806 {
02807     ssp_m128 A, B;
02808         A.i = a;
02809         B.i = b;
02810 
02811         A.s32[0]  = (B.s32[0]<0)  ? (-A.s32[0])  :((B.s32[0]==0) ? 0: A.s32[0]);
02812         A.s32[1]  = (B.s32[1]<0)  ? (-A.s32[1])  :((B.s32[1]==0) ? 0: A.s32[1]);
02813         A.s32[2]  = (B.s32[2]<0)  ? (-A.s32[2])  :((B.s32[2]==0) ? 0: A.s32[2]);
02814         A.s32[3]  = (B.s32[3]<0)  ? (-A.s32[3])  :((B.s32[3]==0) ? 0: A.s32[3]);
02815 
02816         return A.i;
02817 }
02818 
02819 //__m64 _mm_sign_pi8( __m64 a, __m64 b);
02821 SSP_FORCEINLINE __m64 ssp_sign_pi8_REF (__m64 a, __m64 b)
02822 {
02823     ssp_m64 A, B;
02824         A.m64 = a;
02825         B.m64 = b;
02826 
02827         A.s8[0]  = (B.s8[0]<0)  ? (-A.s8[0])  :((B.s8[0]==0) ? 0: A.s8[0]);
02828         A.s8[1]  = (B.s8[1]<0)  ? (-A.s8[1])  :((B.s8[1]==0) ? 0: A.s8[1]);
02829         A.s8[2]  = (B.s8[2]<0)  ? (-A.s8[2])  :((B.s8[2]==0) ? 0: A.s8[2]);
02830         A.s8[3]  = (B.s8[3]<0)  ? (-A.s8[3])  :((B.s8[3]==0) ? 0: A.s8[3]);
02831         A.s8[4]  = (B.s8[4]<0)  ? (-A.s8[4])  :((B.s8[4]==0) ? 0: A.s8[4]);
02832         A.s8[5]  = (B.s8[5]<0)  ? (-A.s8[5])  :((B.s8[5]==0) ? 0: A.s8[5]);
02833         A.s8[6]  = (B.s8[6]<0)  ? (-A.s8[6])  :((B.s8[6]==0) ? 0: A.s8[6]);
02834         A.s8[7]  = (B.s8[7]<0)  ? (-A.s8[7])  :((B.s8[7]==0) ? 0: A.s8[7]);
02835 
02836         return A.m64;
02837 }
02838 
02839 //__m64 _mm_sign_pi16( __m64 a, __m64 b);
02843 SSP_FORCEINLINE __m64 ssp_sign_pi16_REF (__m64 a, __m64 b)
02844 {
02845     ssp_m64 A, B;
02846         A.m64 = a;
02847         B.m64 = b;
02848 
02849         A.s16[0]  = (B.s16[0]<0)  ? (-A.s16[0])  :((B.s16[0]==0) ? 0: A.s16[0]);
02850         A.s16[1]  = (B.s16[1]<0)  ? (-A.s16[1])  :((B.s16[1]==0) ? 0: A.s16[1]);
02851         A.s16[2]  = (B.s16[2]<0)  ? (-A.s16[2])  :((B.s16[2]==0) ? 0: A.s16[2]);
02852         A.s16[3]  = (B.s16[3]<0)  ? (-A.s16[3])  :((B.s16[3]==0) ? 0: A.s16[3]);
02853 
02854         return A.m64;
02855 }
02856 
02857 //__m64 _mm_sign_pi32( __m64 a, __m64 b);
02861 SSP_FORCEINLINE __m64 ssp_sign_pi32_REF (__m64 a, __m64 b)
02862 {
02863     ssp_m64 A, B;
02864         A.m64 = a;
02865         B.m64 = b;
02866 
02867         A.s32[0]  = (B.s32[0]<0)  ? (-A.s32[0])  :((B.s32[0]==0) ? 0: A.s32[0]);
02868         A.s32[1]  = (B.s32[1]<0)  ? (-A.s32[1])  :((B.s32[1]==0) ? 0: A.s32[1]);
02869 
02870         return A.m64;
02871 }
02872 
02874 SSP_FORCEINLINE void ssp_stream_sd_REF( double *dst ,__m128d src )
02875 {
02876     ssp_m128 SRC;
02877     SRC.d = src;
02878     *dst = SRC.f64[0];
02879 }
02880 
02882 SSP_FORCEINLINE void ssp_stream_ss_REF( float *dst, __m128 src )
02883 {
02884     ssp_m128 SRC;
02885     SRC.f = src;
02886     *dst = SRC.f32[0];
02887 }
02888 
02889 //---------------------------------------
02890 // Leading Zeros Count
02891 //---------------------------------------
02893 SSP_FORCEINLINE unsigned short ssp_lzcnt16_REF( unsigned short val )
02894 {
02895     
02896     if( !val )
02897         return 16;
02898     // Binary Search Tree of possible output values
02899     else if( val > 0x00FF )
02900     {
02901         if( val > 0x0FFF )
02902         {
02903             if( val > 0x3FFF )
02904             {
02905                 if( val > 0x7FFF )
02906                     return 0;
02907                 else
02908                     return 1;
02909             }
02910             else // val < 0x3FFF
02911             {
02912                 if( val > 0x1FFF )
02913                     return 2;
02914                 else
02915                     return 3;
02916             }
02917         }
02918         else // val < 0x0FFF
02919         {
02920             if( val > 0x03FF )
02921             {
02922                 if( val > 0x07FF )
02923                     return 4;
02924                 else
02925                     return 5;
02926             }
02927             else // val < 0x03FF
02928             {
02929                 if( val > 0x01FF )
02930                     return 6;
02931                 else
02932                     return 7;
02933             }
02934         }
02935     }
02936     else // val < 0x00FF
02937     {
02938         if( val > 0x000F )
02939         {
02940             if( val > 0x003F  )
02941             {
02942                 if( val > 0x007F  )
02943                     return 8;
02944                 else
02945                     return 9;
02946             }
02947             else // val < 0x003F
02948             {
02949                 if( val > 0x001F)
02950                     return 10;
02951                 else
02952                     return 11;
02953             }
02954         }
02955         else // val < 0x000F
02956         {
02957             if( val > 0x0003  )
02958             {
02959                 if( val > 0x0007  )
02960                     return 12;
02961                 else
02962                     return 13;
02963             }
02964             else // val < 0x0003
02965             {
02966                 if( val > 0x0001)
02967                     return 14;
02968                 else
02969                     return 15;
02970             }
02971         }
02972     }
02973 }
02975 SSP_FORCEINLINE unsigned int ssp_lzcnt_REF( unsigned int val )
02976 {
02977     ssp_u32 cnt;
02978     cnt = ssp_lzcnt16_REF( (ssp_u16)(val>>16) );
02979     if( cnt == 16 )
02980         cnt += ssp_lzcnt16_REF( (ssp_u16)(val & 0x0000FFFF) );
02981     return cnt;
02982 }
02984 SSP_FORCEINLINE ssp_u64 ssp_lzcnt64_REF( ssp_u64 val )
02985 {
02986     ssp_u64 cnt;
02987     cnt = ssp_lzcnt_REF( (ssp_u32)(val>>32) );
02988     if( cnt == 32 )
02989         cnt += ssp_lzcnt_REF( (ssp_u32)(val & 0x00000000FFFFFFFF) );
02990     return cnt;
02991 }
02992 
02993 //---------------------------------------
02994 // Population Count
02995 //---------------------------------------
02997 SSP_FORCEINLINE unsigned short ssp_popcnt16_REF( unsigned short val )
02998 {
02999     int i;
03000     ssp_u16 cnt=0;
03001     for( i=0; i<15, val; ++i, val = val>>1 )
03002         cnt += val & 0x1;
03003     return cnt;
03004 }
03006 SSP_FORCEINLINE unsigned int ssp_popcnt_REF( unsigned int val )
03007 {
03008     int i;
03009     ssp_u32 cnt = 0;
03010     for( i=0; i<31, val; ++i, val = val>>1 )
03011         cnt += val & 0x1;
03012     return cnt;
03013 }
03015 SSP_FORCEINLINE ssp_u64 ssp_popcnt64_REF( ssp_u64 val )
03016 {
03017     int i;
03018     ssp_u64 cnt = 0;
03019     for( i=0; i<63, val; ++i, val = val>>1 )
03020         cnt += val & 0x1;
03021     return cnt;
03022 }
03023 
03024 //--------------------------------------
03025 // Packed permute
03026 //--------------------------------------
03027 
03029 SSP_FORCEINLINE __m128i ssp_perm_epi8_REF(__m128i a, __m128i b, __m128i c)
03030 {
03031     int n;
03032     ssp_m128 A,B,C,R;
03033     A.i = a;
03034     B.i = b;
03035     C.i = c;
03036 
03037     for( n = 0; n < 16; n++ )
03038     {
03039         int op = C.u8[n] >> 5;
03040         switch( op )
03041         {
03042         case 0: // source byte (no logical opeartion)
03043             R.u8[n] = ( C.u8[n] & 0x10 ) ? ( B.u8[C.u8[n] & 0xF] ) : ( A.u8[C.u8[n] & 0xF] );
03044             break;
03045         case 1: // invert source byte
03046             {
03047                 ssp_u8 src = ( C.u8[n] & 0x10 ) ? ( B.u8[C.u8[n] & 0xF] ) : ( A.u8[C.u8[n] & 0xF] );
03048                 R.u8[n] = ~src;
03049             }
03050             break;
03051         case 2: // bit reverse of source byte
03052             {
03053                 ssp_u8 src = ( C.u8[n] & 0x10 ) ? ( B.u8[C.u8[n] & 0xF] ) : ( A.u8[C.u8[n] & 0xF] );
03054                 R.u8[n] = ( (src & 0x0F) << 4 ) | ( (src & 0xF0) >> 4 );
03055                 R.u8[n] = ( (R.u8[n] & 0x33) << 2 ) | ( (R.u8[n] & 0xCC) >> 2 );
03056                 R.u8[n] = ( (R.u8[n] & 0x55) << 1 ) | ( (R.u8[n] & 0xAA) >> 1 );
03057             }
03058             break;
03059         case 3: // bit reverse of inverted source byte
03060             {
03061                 ssp_u8 src = ( C.u8[n] & 0x10 ) ? ( B.u8[C.u8[n] & 0xF] ) : ( A.u8[C.u8[n] & 0xF] );
03062                 R.u8[n] = ( (src & 0x0F) << 4 ) | ( (src & 0xF0) >> 4 );
03063                 R.u8[n] = ( (R.u8[n] & 0x33) << 2 ) | ( (R.u8[n] & 0xCC) >> 2 );
03064                 R.u8[n] = ( (R.u8[n] & 0x55) << 1 ) | ( (R.u8[n] & 0xAA) >> 1 );
03065                 R.u8[n] = ~R.u8[n];
03066             }
03067             break;
03068         case 4: // 0x00
03069             R.u8[n] = 0x00;
03070             break;
03071         case 5: // 0xFF
03072             R.u8[n] = 0xFF;
03073             break;
03074         case 6: // most significant bit of source byte replicated in all bit positions
03075             {
03076                 ssp_s8 src = ( C.u8[n] & 0x10 ) ? ( B.s8[C.u8[n] & 0xF] ) : ( A.s8[C.u8[n] & 0xF] );
03077                 R.s8[n] = src >> 7;
03078             }
03079             break;
03080         case 7: // invert most significant bit of source byte and replicate in all bit positions
03081             {
03082                 ssp_s8 src = ( C.u8[n] & 0x10 ) ? ( B.s8[C.u8[n] & 0xF] ) : ( A.s8[C.u8[n] & 0xF] );
03083                 R.s8[n] = src >> 7;
03084                 R.u8[n] = ~R.u8[n];
03085             }
03086             break;
03087         }
03088     }
03089     return R.i;
03090 }
03092 SSP_FORCEINLINE __m128 ssp_perm_ps_REF(__m128 a, __m128 b, __m128i c)
03093 {
03094     int n;
03095     ssp_m128 A,B,C,R;
03096     A.f = a;
03097     B.f = b;
03098     C.i = c;
03099 
03100     for( n = 0; n < 4; n++ )
03101     {
03102         unsigned char cb = C.u8[n*4];
03103         int op = (cb >> 5) & 0x7;
03104         switch( op )
03105         {
03106         case 0: // single-precision source operand
03107             R.f32[n] = ( cb & 0x04 ) ? ( B.f32[cb & 0x03] ) : ( A.f32[cb & 0x03] );
03108             break;
03109         case 1: // absolute value of single-precision source operand
03110             {
03111                 ssp_f32 src = ( cb & 0x04 ) ? ( B.f32[cb & 0x03] ) : ( A.f32[cb & 0x03] );
03112                 R.f32[n] = ( src < 0.0f ) ? (-src) : src;
03113             }
03114             break;
03115         case 2: // negative value of single-precision source operand
03116             {
03117                 ssp_f32 src = ( cb & 0x04 ) ? ( B.f32[cb & 0x03] ) : ( A.f32[cb & 0x03] );
03118                 R.f32[n] = -src;
03119             }
03120             break;
03121         case 3: // negative of absolute value of single-precision source operand
03122             {
03123                 ssp_f32 src = ( cb & 0x04 ) ? ( B.f32[cb & 0x03] ) : ( A.f32[cb & 0x03] );
03124                 R.f32[n] = ( src < 0.0f ) ? src : (-src);
03125             }
03126             break;
03127         case 4: // +0.0
03128             R.f32[n] = 0.0f;
03129             break;
03130         case 5: // -1.0
03131             R.f32[n] = -1.0f;
03132             break;
03133         case 6: // +1.0
03134             R.f32[n] = 1.0f;
03135             break;
03136         case 7: // +0.0
03137             R.u32[n] = 0x40490FDB; //(for mxcsr.rc 00 or 10 use 0x40490FDB, for 01 or 11 use 0x40490FDA)
03138             break;
03139         }
03140     }
03141     return R.f;
03142 }
03144 SSP_FORCEINLINE __m128d ssp_perm_pd_REF(__m128d a, __m128d b, __m128i c)
03145 {
03146     int n;
03147     ssp_m128 A,B,C,R;
03148     A.d = a;
03149     B.d = b;
03150     C.i = c;
03151 
03152     for( n = 0; n < 2; n++ )
03153     {
03154         unsigned char cb = C.u8[n*8];
03155         int op = (cb >> 5) & 0x7;
03156         switch( op )
03157         {
03158         case 0: // single-precision source operand
03159             R.f64[n] = ( cb & 0x02 ) ? ( B.f64[cb & 0x01] ) : ( A.f64[cb & 0x01] );
03160             break;
03161         case 1: // absolute value of single-precision source operand
03162             {
03163                 ssp_f64 src = ( cb & 0x02 ) ? ( B.f64[cb & 0x01] ) : ( A.f64[cb & 0x01] );
03164                 R.f64[n] = ( src < 0.0 ) ? (-src) : src;
03165             }
03166             break;
03167         case 2: // negative value of single-precision source operand
03168             {
03169                 ssp_f64 src = ( cb & 0x02 ) ? ( B.f64[cb & 0x01] ) : ( A.f64[cb & 0x01] );
03170                 R.f64[n] = -src;
03171             }
03172             break;
03173         case 3: // negative of absolute value of single-precision source operand
03174             {
03175                 ssp_f64 src = ( cb & 0x02 ) ? ( B.f64[cb & 0x01] ) : ( A.f64[cb & 0x01] );
03176                 R.f64[n] = ( src < 0.0 ) ? src : (-src);
03177             }
03178             break;
03179         case 4: // +0.0
03180             R.f64[n] = 0.0;
03181             break;
03182         case 5: // -1.0
03183             R.f64[n] = -1.0;
03184             break;
03185         case 6: // +1.0
03186             R.f64[n] = 1.0;
03187             break;
03188         case 7: // +0.0
03189             R.u64[n] = 0x400921FB54442D18; //(for mxcsr.rc 00, 01 or 11 use 0x400921FB54442D18, for 10 use 0x400921FB54442D19)
03190             break;
03191         }
03192     }
03193     return R.d;
03194 }
03195 
03196 //--------------------------------------
03197 // Packed rotates
03198 //--------------------------------------
03199 
03201 SSP_FORCEINLINE __m128i ssp_rot_epi8_REF(__m128i a, __m128i b  )
03202 {
03203     int n;
03204     ssp_m128 A,B;
03205     A.i = a;
03206     B.i = b;
03207 
03208     for( n = 0; n < 16; n++ )
03209     {
03210       if( B.s8[n] < 0 )
03211       {
03212         unsigned int count = (-B.s8[n]) % 8;
03213         unsigned int carry_count = (8 - count) % 8;
03214         ssp_u8 carry = A.u8[n] << carry_count;
03215         A.u8[n] = A.u8[n] >> count;
03216         A.u8[n] = A.u8[n] | carry;
03217       }
03218       else
03219       {
03220         unsigned int count = B.s8[n] % 8;
03221         unsigned int carry_count = (8 - count) % 8;
03222         ssp_u8 carry = A.u8[n] >> carry_count;
03223         A.u8[n] = A.u8[n] << count;
03224         A.u8[n] = A.u8[n] | carry;
03225       }
03226     }
03227     return A.i;
03228 }
03230 SSP_FORCEINLINE __m128i ssp_rot_epi16_REF(__m128i a, __m128i b  )
03231 {
03232     int n;
03233     ssp_m128 A,B;
03234     A.i = a;
03235     B.i = b;
03236 
03237     for( n = 0; n < 8; n++ )
03238     {
03239       if( B.s16[n] < 0 )
03240       {
03241         unsigned int count = (-B.s16[n]) % 16;
03242         unsigned int carry_count = (16 - count) % 16;
03243         ssp_u16 carry = A.u16[n] << carry_count;
03244         A.u16[n] = A.u16[n] >> count;
03245         A.u16[n] = A.u16[n] | carry;
03246       }
03247       else
03248       {
03249         unsigned int count = B.s16[n] % 8;
03250         unsigned int carry_count = (16 - count) % 16;
03251         ssp_u16 carry = A.u16[n] >> carry_count;
03252         A.u16[n] = A.u16[n] << count;
03253         A.u16[n] = A.u16[n] | carry;
03254       }
03255     }
03256     return A.i;
03257 }
03259 SSP_FORCEINLINE __m128i ssp_rot_epi32_REF(__m128i a, __m128i b  )
03260 {
03261     int n;
03262     ssp_m128 A,B;
03263     A.i = a;
03264     B.i = b;
03265 
03266     for( n = 0; n < 4; n++ )
03267     {
03268       if( B.s32[n] < 0 )
03269       {
03270         unsigned int count = (-B.s32[n]) % 32;
03271         unsigned int carry_count = (32 - count) % 32;
03272         ssp_u32 carry = A.u32[n] << carry_count;
03273         A.u32[n] = A.u32[n] >> count;
03274         A.u32[n] = A.u32[n] | carry;
03275       }
03276       else
03277       {
03278         unsigned int count = B.s32[n] % 32;
03279         unsigned int carry_count = (32 - count) % 32;
03280         ssp_u32 carry = A.u32[n] >> carry_count;
03281         A.u32[n] = A.u32[n] << count;
03282         A.u32[n] = A.u32[n] | carry;
03283       }
03284     }
03285     return A.i;
03286 }
03288 SSP_FORCEINLINE __m128i ssp_rot_epi64_REF(__m128i a, __m128i b  )
03289 {
03290     int n;
03291     ssp_m128 A,B;
03292     A.i = a;
03293     B.i = b;
03294 
03295     for( n = 0; n < 2; n++ )
03296     {
03297       if( B.s64[n] < 0 )
03298       {
03299         unsigned int count = (unsigned int)((-B.s64[n]) % 64);
03300         unsigned int carry_count = (64 - count) % 64;
03301         ssp_u64 carry = A.u64[n] << carry_count;
03302         A.u64[n] = A.u64[n] >> count;
03303         A.u64[n] = A.u64[n] | carry;
03304       }
03305       else
03306       {
03307         unsigned int count = (unsigned int)(B.s64[n] % 64);
03308         unsigned int carry_count = (64 - count) % 64;
03309         ssp_u64 carry = A.u64[n] >> carry_count;
03310         A.u64[n] = A.u64[n] << count;
03311         A.u64[n] = A.u64[n] | carry;
03312       }
03313     }
03314     return A.i;
03315 }
03317 SSP_FORCEINLINE __m128i ssp_roti_epi8_REF(__m128i a, const int b)
03318 {
03319     int n;
03320     ssp_m128 A;
03321     A.i = a;
03322 
03323     if( b < 0 )
03324     {
03325         unsigned int count = (-b) % 8;
03326         unsigned int carry_count = (8 - count) % 8;
03327         for( n = 0; n < 16; n++ )
03328         {
03329             ssp_u8 carry = A.u8[n] << carry_count;
03330             A.u8[n] = A.u8[n] >> count;
03331             A.u8[n] = A.u8[n] | carry;
03332         }
03333     }
03334     else
03335     {
03336         unsigned int count = b % 8;
03337         unsigned int carry_count = (8 - count) % 8;
03338         for( n = 0; n < 16; n++ )
03339         {
03340             ssp_u8 carry = A.u8[n] >> carry_count;
03341             A.u8[n] = A.u8[n] << count;
03342             A.u8[n] = A.u8[n] | carry;
03343         }
03344     }
03345     return A.i;
03346 }
03348 SSP_FORCEINLINE __m128i ssp_roti_epi16_REF(__m128i a, const int b)
03349 {
03350     int n;
03351     ssp_m128 A;
03352     A.i = a;
03353 
03354     if( b < 0 )
03355     {
03356         unsigned int count = (-b) % 16;
03357         unsigned int carry_count = (16 - count) % 16;
03358         for( n = 0; n < 8; n++ )
03359         {
03360             ssp_u16 carry = A.u16[n] << carry_count;
03361             A.u16[n] = A.u16[n] >> count;
03362             A.u16[n] = A.u16[n] | carry;
03363         }
03364     }
03365     else
03366     {
03367         unsigned int count = b % 16;
03368         unsigned int carry_count = (16 - count) % 16;
03369         for( n = 0; n < 8; n++ )
03370         {
03371             ssp_u16 carry = A.u16[n] >> carry_count;
03372             A.u16[n] = A.u16[n] << count;
03373             A.u16[n] = A.u16[n] | carry;
03374         }
03375     }
03376     return A.i;
03377 }
03379 SSP_FORCEINLINE __m128i ssp_roti_epi32_REF(__m128i a, const int b)
03380 {
03381     int n;
03382     ssp_m128 A;
03383     A.i = a;
03384 
03385     if( b < 0 )
03386     {
03387         unsigned int count = (-b) % 32;
03388         unsigned int carry_count = (32 - count) % 32;
03389         for( n = 0; n < 4; n++ )
03390         {
03391             ssp_u32 carry = A.u32[n] << carry_count;
03392             A.u32[n] = A.u32[n] >> count;
03393             A.u32[n] = A.u32[n] | carry;
03394         }
03395     }
03396     else
03397     {
03398         unsigned int count = b % 32;
03399         unsigned int carry_count = (32 - count) % 32;
03400         for( n = 0; n < 4; n++ )
03401         {
03402             ssp_u32 carry = A.u32[n] >> carry_count;
03403             A.u32[n] = A.u32[n] << count;
03404             A.u32[n] = A.u32[n] | carry;
03405         }
03406     }
03407     return A.i;
03408 }
03410 SSP_FORCEINLINE __m128i ssp_roti_epi64_REF(__m128i a, const int b)
03411 {
03412     int n;
03413     ssp_m128 A;
03414     A.i = a;
03415 
03416     if( b < 0 )
03417     {
03418         unsigned int count = (-b) % 64;
03419         unsigned int carry_count = (64 - count) % 64;
03420         for( n = 0; n < 2; n++ )
03421         {
03422             ssp_u64 carry = A.u64[n] << carry_count;
03423             A.u64[n] = A.u64[n] >> count;
03424             A.u64[n] = A.u64[n] | carry;
03425         }
03426     }
03427     else
03428     {
03429         unsigned int count = b % 64;
03430         unsigned int carry_count = (64 - count) % 64;
03431         for( n = 0; n < 2; n++ )
03432         {
03433             ssp_u64 carry = A.u64[n] >> carry_count;
03434             A.u64[n] = A.u64[n] << count;
03435             A.u64[n] = A.u64[n] | carry;
03436         }
03437     }
03438     return A.i;
03439 }
03440 
03441 
03442 //--------------------------------------
03443 // Packed Shift Logical (bytes, words, dwords, qwords)
03444 //--------------------------------------
03445 
03447 SSP_FORCEINLINE __m128i ssp_shl_epi8_REF(__m128i a, __m128i b)
03448 {
03449     int n;
03450     ssp_m128 A,B;
03451     A.i = a;
03452     B.i = b;
03453 
03454     for( n = 0; n < 16; n++ )
03455     {
03456       if( B.s8[n] < 0 )
03457       {
03458         unsigned int count = (-B.s8[n]) % 8;
03459         A.u8[n] = A.u8[n] >> count;
03460       }
03461       else
03462       {
03463         unsigned int count = B.s8[n] % 8;
03464         A.u8[n] = A.u8[n] << count;
03465       }
03466     }
03467     return A.i;
03468 }
03469 
03471 SSP_FORCEINLINE __m128i ssp_sha_epi8_REF(__m128i a, __m128i b)
03472 {
03473     int n;
03474     ssp_m128 A,B;
03475     A.i = a;
03476     B.i = b;
03477 
03478     for( n = 0; n < 16; n++ )
03479     {
03480       if( B.s8[n] < 0 )
03481       {
03482         unsigned int count = (-B.s8[n]) % 8;
03483         A.s8[n] = A.s8[n] >> count;
03484       }
03485       else
03486       {
03487         unsigned int count = B.s8[n] % 8;
03488         A.s8[n] = A.s8[n] << count;
03489       }
03490     }
03491 
03492     return A.i;
03493 }
03494 
03496 SSP_FORCEINLINE __m128i ssp_shl_epi16_REF(__m128i a, __m128i b)
03497 {
03498     int n;
03499     ssp_m128 A,B;
03500     A.i = a;
03501     B.i = b;
03502 
03503     for( n = 0; n < 8; n++ )
03504     {
03505       if( B.s8[n*2] < 0 )
03506       {
03507         unsigned int count = (-B.s8[n*2]) % 16;
03508         A.u16[n] = A.u16[n] >> count;
03509       }
03510       else
03511       {
03512         unsigned int count = B.s8[n*2] % 16;
03513         A.u16[n] = A.u16[n] << count;
03514       }
03515     }
03516     return A.i;
03517 }
03518 
03520 SSP_FORCEINLINE __m128i ssp_sha_epi16_REF(__m128i a, __m128i b)
03521 {
03522     int n;
03523     ssp_m128 A,B;
03524     A.i = a;
03525     B.i = b;
03526 
03527     for( n = 0; n < 8; n++ )
03528     {
03529       if( B.s8[n*2] < 0 )
03530       {
03531         unsigned int count = (-B.s8[n*2]) % 16;
03532         A.s16[n] = A.s16[n] >> count;
03533       }
03534       else
03535       {
03536         unsigned int count = B.s8[n*2] % 16;
03537         A.s16[n] = A.s16[n] << count;
03538       }
03539     }
03540 
03541     return A.i;
03542 }
03543 
03545 SSP_FORCEINLINE __m128i ssp_shl_epi32_REF(__m128i a, __m128i b)
03546 {
03547     int n;
03548     ssp_m128 A,B;
03549     A.i = a;
03550     B.i = b;
03551 
03552     for( n = 0; n < 4; n++ )
03553     {
03554       if( B.s8[n*4] < 0 )
03555       {
03556         unsigned int count = (-B.s8[n*4]) % 32;
03557         A.u32[n] = A.u32[n] >> count;
03558       }
03559       else
03560       {
03561         unsigned int count = B.s8[n*4] % 32;
03562         A.u32[n] = A.u32[n] << count;
03563       }
03564     }
03565     return A.i;
03566 }
03567 
03569 SSP_FORCEINLINE __m128i ssp_sha_epi32_REF(__m128i a, __m128i b)
03570 {
03571     int n;
03572     ssp_m128 A,B;
03573     A.i = a;
03574     B.i = b;
03575 
03576     for( n = 0; n < 4; n++ )
03577     {
03578       if( B.s8[n*4] < 0 )
03579       {
03580         unsigned int count = (-B.s8[n*4]) % 32;
03581         A.s32[n] = A.s32[n] >> count;
03582       }
03583       else
03584       {
03585         unsigned int count = B.s8[n*4] % 32;
03586         A.s32[n] = A.s32[n] << count;
03587       }
03588     }
03589 
03590     return A.i;
03591 }
03592 
03594 SSP_FORCEINLINE __m128i ssp_shl_epi64_REF(__m128i a, __m128i b)
03595 {
03596     int n;
03597     ssp_m128 A,B;
03598     A.i = a;
03599     B.i = b;
03600 
03601     for( n = 0; n < 2; n++ )
03602     {
03603       if( B.s8[n*8] < 0 )
03604       {
03605         unsigned int count = (-B.s8[n*8]) % 64;
03606         A.u64[n] = A.u64[n] >> count;
03607       }
03608       else
03609       {
03610         unsigned int count = B.s8[n*8] % 64;
03611         A.u64[n] = A.u64[n] << count;
03612       }
03613     }
03614     return A.i;
03615 }
03616 
03618 SSP_FORCEINLINE __m128i ssp_sha_epi64_REF(__m128i a, __m128i b)
03619 {
03620     int n;
03621     ssp_m128 A,B;
03622     A.i = a;
03623     B.i = b;
03624 
03625     for( n = 0; n < 2; n++ )
03626     {
03627       if( B.s8[n*8] < 0 )
03628       {
03629         unsigned int count = (-B.s8[n*8]) % 64;
03630         A.s64[n] = A.s64[n] >> count;
03631       }
03632       else
03633       {
03634         unsigned int count = B.s8[n*8] % 64;
03635         A.s64[n] = A.s64[n] << count;
03636       }
03637     }
03638 
03639     return A.i;
03640 }
03641 
03646 #endif // __SSEPLUS_EMULATION_REF_H__

Generated on Wed May 21 13:44:11 2008 for "SSEPlus" by  doxygen 1.5.4