00001
00002
00003
00004
00005 #ifndef __SSEPLUS_EMULATION_REF_H__
00006 #define __SSEPLUS_EMULATION_REF_H__
00007
00008 #include "../SSEPlus_base.h"
00009 #include "../number/SSEPlus_number_REF.h"
00010 #include "../arithmetic/SSEPlus_arithmetic_REF.h"
00011 #include "SSEPlus_emulation_comps_REF.h"
00012 #include <math.h>
00013
00020
00021
00022
00024 SSP_FORCEINLINE __m128d ssp_frcz_pd_REF(__m128d a)
00025 {
00026 ssp_m128 A;
00027 long long temp;
00028
00029 A.d = a;
00030
00031 temp = (long long) A.f64[0];
00032 A.f64[0] -= temp;
00033 temp = (long long) A.f64[1];
00034 A.f64[1] -= temp;
00035
00036 return A.d;
00037 }
00038
00040 SSP_FORCEINLINE __m128 ssp_frcz_ps_REF(__m128 a)
00041 {
00042 ssp_m128 A;
00043 int temp;
00044 A.f = a;
00045
00046 temp = (int) A.f32[0];
00047 A.f32[0] -= temp;
00048 temp = (int) A.f32[1];
00049 A.f32[1] -= temp;
00050 temp = (int) A.f32[2];
00051 A.f32[2] -= temp;
00052 temp = (int) A.f32[3];
00053 A.f32[3] -= temp;
00054
00055 return A.f;
00056 }
00057
00059 SSP_FORCEINLINE __m128d ssp_frcz_sd_REF(__m128d a, __m128d b)
00060 {
00061 ssp_m128 A, B;
00062 long long temp;
00063
00064 A.d = a;
00065 B.d = b;
00066
00067 temp = (long long) A.f64[0];
00068 B.f64[0] = A.f64[0] - temp;
00069
00070 return B.d;
00071 }
00072
00074 SSP_FORCEINLINE __m128 ssp_frcz_ss_REF(__m128 a, __m128 b)
00075 {
00076 ssp_m128 A, B;
00077 int temp;
00078
00079 A.f = a;
00080 B.f = b;
00081
00082 temp = (int) A.f32[0];
00083 B.f32[0] = A.f32[0] - temp;
00084
00085 return B.f;
00086 }
00087
00088
00089
00090
00092 SSP_FORCEINLINE __m128i ssp_haddd_epi16_REF(__m128i a)
00093 {
00094 ssp_m128 A, B;
00095 A.i = a;
00096
00097 B.s32[0] = A.s16[0] + A.s16[1];
00098 B.s32[1] = A.s16[2] + A.s16[3];
00099 B.s32[2] = A.s16[4] + A.s16[5];
00100 B.s32[3] = A.s16[6] + A.s16[7];
00101
00102 return B.i;
00103 }
00104
00106 SSP_FORCEINLINE __m128i ssp_haddd_epi8_REF(__m128i a)
00107 {
00108 ssp_m128 A, B;
00109 A.i = a;
00110
00111 B.s32[0] = A.s8[ 0] + A.s8[ 1] + A.s8[ 2] + A.s8[ 3];
00112 B.s32[1] = A.s8[ 4] + A.s8[ 5] + A.s8[ 6] + A.s8[ 7];
00113 B.s32[2] = A.s8[ 8] + A.s8[ 9] + A.s8[10] + A.s8[11];
00114 B.s32[3] = A.s8[12] + A.s8[13] + A.s8[14] + A.s8[15];
00115
00116 return B.i;
00117 }
00118
00120 SSP_FORCEINLINE __m128i ssp_haddd_epu16_REF(__m128i a)
00121 {
00122 ssp_m128 A, B;
00123 A.i = a;
00124
00125 B.u32[0] = A.u16[0] + A.u16[1];
00126 B.u32[1] = A.u16[2] + A.u16[3];
00127 B.u32[2] = A.u16[4] + A.u16[5];
00128 B.u32[3] = A.u16[6] + A.u16[7];
00129
00130 return B.i;
00131 }
00132
00134 SSP_FORCEINLINE __m128i ssp_haddd_epu8_REF(__m128i a)
00135 {
00136 ssp_m128 A, B;
00137 A.i = a;
00138
00139 B.u32[0] = A.u8[ 0] + A.u8[ 1] + A.u8[ 2] + A.u8[ 3];
00140 B.u32[1] = A.u8[ 4] + A.u8[ 5] + A.u8[ 6] + A.u8[ 7];
00141 B.u32[2] = A.u8[ 8] + A.u8[ 9] + A.u8[10] + A.u8[11];
00142 B.u32[3] = A.u8[12] + A.u8[13] + A.u8[14] + A.u8[15];
00143
00144 return B.i;
00145 }
00146
00148 SSP_FORCEINLINE __m128i ssp_haddq_epi16_REF(__m128i a)
00149 {
00150 ssp_m128 A, B;
00151 A.i = a;
00152
00153 B.s64[0] = A.s16[0] + A.s16[1] + A.s16[2] + A.s16[3];
00154 B.s64[1] = A.s16[4] + A.s16[5] + A.s16[6] + A.s16[7];
00155
00156 return B.i;
00157 }
00158
00160 SSP_FORCEINLINE __m128i ssp_haddq_epi32_REF(__m128i a)
00161 {
00162 ssp_m128 A, B;
00163 A.i = a;
00164
00165 B.s64[0] = A.s32[0] + (long long)A.s32[1];
00166 B.s64[1] = A.s32[2] + (long long)A.s32[3];
00167
00168 return B.i;
00169 }
00170
00172 SSP_FORCEINLINE __m128i ssp_haddq_epi8_REF(__m128i a)
00173 {
00174 ssp_m128 A, B;
00175 A.i = a;
00176
00177 B.s64[0] = A.s8[0] + A.s8[1] + A.s8[2] + A.s8[3] + A.s8[4] + A.s8[5] + A.s8[6] + A.s8[7];
00178 B.s64[1] = A.s8[8] + A.s8[9] + A.s8[10] + A.s8[11] + A.s8[12] + A.s8[13] + A.s8[14] + A.s8[15];
00179
00180 return B.i;
00181 }
00182
00184 SSP_FORCEINLINE __m128i ssp_haddq_epu16_REF(__m128i a)
00185 {
00186 ssp_m128 A, B;
00187 A.i = a;
00188
00189 B.u64[0] = A.u16[0] + A.u16[1] + A.u16[2] + A.u16[3];
00190 B.u64[1] = A.u16[4] + A.u16[5] + A.u16[6] + A.u16[7];
00191
00192 return B.i;
00193 }
00194
00196 SSP_FORCEINLINE __m128i ssp_haddq_epu32_REF(__m128i a)
00197 {
00198 ssp_m128 A, B;
00199 A.i = a;
00200
00201 B.u64[0] = A.u32[0] + (long long)A.u32[1];
00202 B.u64[1] = A.u32[2] + (long long)A.u32[3];
00203
00204 return B.i;
00205 }
00206
00208 SSP_FORCEINLINE __m128i ssp_haddq_epu8_REF(__m128i a)
00209 {
00210 ssp_m128 A, B;
00211 A.i = a;
00212
00213 B.u64[0] = A.u8[0] + A.u8[1] + A.u8[2] + A.u8[3] + A.u8[4] + A.u8[5] + A.u8[6] + A.u8[7];
00214 B.u64[1] = A.u8[8] + A.u8[9] + A.u8[10] + A.u8[11] + A.u8[12] + A.u8[13] + A.u8[14] + A.u8[15];
00215
00216 return B.i;
00217 }
00218
00220 SSP_FORCEINLINE __m128i ssp_haddw_epi8_REF(__m128i a)
00221 {
00222 ssp_m128 A, B;
00223 A.i = a;
00224
00225 B.s16[0] = A.s8[0] + A.s8[1];
00226 B.s16[1] = A.s8[2] + A.s8[3];
00227 B.s16[2] = A.s8[4] + A.s8[5];
00228 B.s16[3] = A.s8[6] + A.s8[7];
00229 B.s16[4] = A.s8[8] + A.s8[9];
00230 B.s16[5] = A.s8[10] + A.s8[11];
00231 B.s16[6] = A.s8[12] + A.s8[13];
00232 B.s16[7] = A.s8[14] + A.s8[15];
00233
00234 return B.i;
00235 }
00236
00238 SSP_FORCEINLINE __m128i ssp_haddw_epu8_REF(__m128i a)
00239 {
00240 ssp_m128 A, B;
00241 A.i = a;
00242
00243 B.u16[0] = A.u8[0] + A.u8[1];
00244 B.u16[1] = A.u8[2] + A.u8[3];
00245 B.u16[2] = A.u8[4] + A.u8[5];
00246 B.u16[3] = A.u8[6] + A.u8[7];
00247 B.u16[4] = A.u8[8] + A.u8[9];
00248 B.u16[5] = A.u8[10] + A.u8[11];
00249 B.u16[6] = A.u8[12] + A.u8[13];
00250 B.u16[7] = A.u8[14] + A.u8[15];
00251
00252 return B.i;
00253 }
00254
00256 SSP_FORCEINLINE __m128i ssp_hsubd_epi16_REF(__m128i a)
00257 {
00258 ssp_m128 A, B;
00259 A.i = a;
00260
00261 B.s32[0] = A.s16[1] - A.s16[0];
00262 B.s32[1] = A.s16[3] - A.s16[2];
00263 B.s32[2] = A.s16[5] - A.s16[4];
00264 B.s32[3] = A.s16[7] - A.s16[6];
00265
00266 return B.i;
00267 }
00268
00270 SSP_FORCEINLINE __m128i ssp_hsubq_epi32_REF(__m128i a)
00271 {
00272 ssp_m128 A, B;
00273 A.i = a;
00274
00275 B.s64[0] = (long long)A.s32[1] - A.s32[0];
00276 B.s64[1] = (long long)A.s32[3] - A.s32[2];
00277
00278 return B.i;
00279 }
00280
00282 SSP_FORCEINLINE __m128i ssp_hsubw_epi8_REF(__m128i a)
00283 {
00284 ssp_m128 A, B;
00285 A.i = a;
00286
00287 B.s16[0] = A.s8[1] - A.s8[0];
00288 B.s16[1] = A.s8[3] - A.s8[2];
00289 B.s16[2] = A.s8[5] - A.s8[4];
00290 B.s16[3] = A.s8[7] - A.s8[6];
00291 B.s16[4] = A.s8[9] - A.s8[8];
00292 B.s16[5] = A.s8[11] - A.s8[10];
00293 B.s16[6] = A.s8[13] - A.s8[12];
00294 B.s16[7] = A.s8[15] - A.s8[14];
00295
00296 return B.i;
00297 }
00298
00299
00300
00301
00302
00304 SSP_FORCEINLINE __m128i ssp_macc_epi16_REF( __m128i a, __m128i b, __m128i c )
00305 {
00306 ssp_m128 A,B,C;
00307 A.i = a;
00308 B.i = b;
00309 C.i = c;
00310
00311 A.s16[0] = A.s16[0] * B.s16[0] + C.s16[0];
00312 A.s16[1] = A.s16[1] * B.s16[1] + C.s16[1];
00313 A.s16[2] = A.s16[2] * B.s16[2] + C.s16[2];
00314 A.s16[3] = A.s16[3] * B.s16[3] + C.s16[3];
00315 A.s16[4] = A.s16[4] * B.s16[4] + C.s16[4];
00316 A.s16[5] = A.s16[5] * B.s16[5] + C.s16[5];
00317 A.s16[6] = A.s16[6] * B.s16[6] + C.s16[6];
00318 A.s16[7] = A.s16[7] * B.s16[7] + C.s16[7];
00319
00320 return A.i;
00321 }
00322
00324 SSP_FORCEINLINE __m128i ssp_macc_epi32_REF( __m128i a, __m128i b, __m128i c )
00325 {
00326 ssp_m128 A,B,C;
00327 A.i = a;
00328 B.i = b;
00329 C.i = c;
00330
00331 A.s32[0] = A.s32[0] * B.s32[0] + C.s32[0];
00332 A.s32[1] = A.s32[1] * B.s32[1] + C.s32[1];
00333 A.s32[2] = A.s32[2] * B.s32[2] + C.s32[2];
00334 A.s32[3] = A.s32[3] * B.s32[3] + C.s32[3];
00335
00336 return A.i;
00337 }
00338
00340 SSP_FORCEINLINE __m128 ssp_macc_ps_REF( __m128 a, __m128 b, __m128 c )
00341 {
00342 ssp_m128 A,B,C;
00343 A.f = a;
00344 B.f = b;
00345 C.f = c;
00346
00347 A.f32[0] = A.f32[0] * B.f32[0] + C.f32[0];
00348 A.f32[1] = A.f32[1] * B.f32[1] + C.f32[1];
00349 A.f32[2] = A.f32[2] * B.f32[2] + C.f32[2];
00350 A.f32[3] = A.f32[3] * B.f32[3] + C.f32[3];
00351 return A.f;
00352 }
00353
00355 SSP_FORCEINLINE __m128d ssp_macc_pd_REF( __m128d a, __m128d b, __m128d c )
00356 {
00357 ssp_m128 A,B,C;
00358 A.d = a;
00359 B.d = b;
00360 C.d = c;
00361
00362 A.f64[0] = A.f64[0] * B.f64[0] + C.f64[0];
00363 A.f64[1] = A.f64[1] * B.f64[1] + C.f64[1];
00364 return A.d;
00365 }
00366
00368 SSP_FORCEINLINE __m128 ssp_macc_ss_REF(__m128 a, __m128 b, __m128 c)
00369 {
00370 ssp_m128 A,B,C;
00371 A.f = a;
00372 B.f = b;
00373 C.f = c;
00374
00375 A.f32[0] = A.f32[0] * B.f32[0] + C.f32[0];
00376 return A.f;
00377 }
00378
00380 SSP_FORCEINLINE __m128d ssp_macc_sd_REF(__m128d a, __m128d b, __m128d c)
00381 {
00382 ssp_m128 A,B,C;
00383 A.d = a;
00384 B.d = b;
00385 C.d = c;
00386
00387 A.f64[0] = A.f64[0] * B.f64[0] + C.f64[0];
00388 return A.d;
00389 }
00390
00392 SSP_FORCEINLINE __m128i ssp_maccd_epi16_REF( __m128i a, __m128i b, __m128i c )
00393 {
00394 ssp_m128 A, B, C, D;
00395 A.i = a;
00396 B.i = b;
00397 C.i = c;
00398
00399 D.s32[0] = A.s16[0] * B.s16[0] + C.s32[0];
00400 D.s32[1] = A.s16[2] * B.s16[2] + C.s32[1];
00401 D.s32[2] = A.s16[4] * B.s16[4] + C.s32[2];
00402 D.s32[3] = A.s16[6] * B.s16[6] + C.s32[3];
00403
00404 return D.i;
00405 }
00406
00408 SSP_FORCEINLINE __m128i ssp_macchi_epi32_REF( __m128i a, __m128i b, __m128i c )
00409 {
00410 ssp_m128 A, B, C, D;
00411 A.i = a;
00412 B.i = b;
00413 C.i = c;
00414
00415 D.s64[0] = A.s32[1] * B.s32[1] + C.s64[0];
00416 D.s64[1] = A.s32[3] * B.s32[3] + C.s64[1];
00417
00418 return D.i;
00419 }
00420
00422 SSP_FORCEINLINE __m128i ssp_macclo_epi32_REF( __m128i a, __m128i b, __m128i c )
00423 {
00424 ssp_m128 A, B, C, D;
00425 A.i = a;
00426 B.i = b;
00427 C.i = c;
00428
00429 D.s64[0] = A.s32[0] * B.s32[0] + C.s64[0];
00430 D.s64[1] = A.s32[2] * B.s32[2] + C.s64[1];
00431
00432 return D.i;
00433 }
00434
/**
 * Clamp a to the closed range [neg_limit, pos_limit].
 * Every argument and the whole expansion are parenthesized so the macro can
 * be used inside larger expressions and with compound arguments.
 * Note: arguments may be evaluated more than once; do not pass expressions
 * with side effects.
 */
#define SSP_SATURATION(a, pos_limit, neg_limit) \
    ( ((a) > (pos_limit)) ? (pos_limit) : ( ((a) < (neg_limit)) ? (neg_limit) : (a) ) )
00436
00438 SSP_FORCEINLINE __m128i ssp_maccs_epi16_REF( __m128i a, __m128i b, __m128i c )
00439 {
00440 ssp_m128 A, B, C;
00441 int temp;
00442 A.i = a;
00443 B.i = b;
00444 C.i = c;
00445
00446 temp = A.s16[0] * B.s16[0] + C.s16[0];
00447 A.s16[0] = SSP_SATURATION(temp, 32767, -32768);
00448 temp = A.s16[1] * B.s16[1] + C.s16[1];
00449 A.s16[1] = SSP_SATURATION(temp, 32767, -32768);
00450 temp = A.s16[2] * B.s16[2] + C.s16[2];
00451 A.s16[2] = SSP_SATURATION(temp, 32767, -32768);
00452 temp = A.s16[3] * B.s16[3] + C.s16[3];
00453 A.s16[3] = SSP_SATURATION(temp, 32767, -32768);
00454 temp = A.s16[4] * B.s16[4] + C.s16[4];
00455 A.s16[4] = SSP_SATURATION(temp, 32767, -32768);
00456 temp = A.s16[5] * B.s16[5] + C.s16[5];
00457 A.s16[5] = SSP_SATURATION(temp, 32767, -32768);
00458 temp = A.s16[6] * B.s16[6] + C.s16[6];
00459 A.s16[6] = SSP_SATURATION(temp, 32767, -32768);
00460 temp = A.s16[7] * B.s16[7] + C.s16[7];
00461 A.s16[7] = SSP_SATURATION(temp, 32767, -32768);
00462
00463 return A.i;
00464 }
00465
00467 SSP_FORCEINLINE __m128i ssp_maccs_epi32_REF( __m128i a, __m128i b, __m128i c )
00468 {
00469 ssp_m128 A, B, C;
00470 long long temp;
00471 A.i = a;
00472 B.i = b;
00473 C.i = c;
00474
00475 temp = (long long)A.s32[0] * B.s32[0] + C.s32[0];
00476 A.s32[0] = (ssp_s32)(SSP_SATURATION(temp, 2147483647LL, -2147483648LL));
00477 temp = (long long)A.s32[1] * B.s32[1] + C.s32[1];
00478 A.s32[1] = (ssp_s32)(SSP_SATURATION(temp, 2147483647LL, -2147483648LL));
00479 temp = (long long)A.s32[2] * B.s32[2] + C.s32[2];
00480 A.s32[2] = (ssp_s32)(SSP_SATURATION(temp, 2147483647LL, -2147483648LL));
00481 temp = (long long)A.s32[3] * B.s32[3] + C.s32[3];
00482 A.s32[3] = (ssp_s32)(SSP_SATURATION(temp, 2147483647LL, -2147483648LL));
00483
00484 return A.i;
00485 }
00486
00488 SSP_FORCEINLINE __m128i ssp_maccsd_epi16_REF( __m128i a, __m128i b, __m128i c )
00489 {
00490 ssp_m128 A, B, C, D;
00491 long long temp;
00492 A.i = a;
00493 B.i = b;
00494 C.i = c;
00495
00496
00497 temp = A.s16[0] * B.s16[0] + (long long)C.s32[0];
00498 D.s32[0] = (ssp_s32)(SSP_SATURATION(temp, 2147483647LL, -2147483648LL));
00499 temp = A.s16[2] * B.s16[2] + (long long)C.s32[1];
00500 D.s32[1] = (ssp_s32)(SSP_SATURATION(temp, 2147483647LL, -2147483648LL));
00501 temp = A.s16[4] * B.s16[4] + (long long)C.s32[2];
00502 D.s32[2] = (ssp_s32)(SSP_SATURATION(temp, 2147483647LL, -2147483648LL));
00503 temp = A.s16[6] * B.s16[6] + (long long)C.s32[3];
00504 D.s32[3] = (ssp_s32)(SSP_SATURATION(temp, 2147483647LL, -2147483648LL));
00505
00506 return D.i;
00507 }
00508
00510 SSP_FORCEINLINE __m128i ssp_maccshi_epi32_REF( __m128i a, __m128i b, __m128i c )
00511 {
00512 ssp_m128 A, B, C, D;
00513 long long temp;
00514 unsigned long long signT, signC;
00515 A.i = a;
00516 B.i = b;
00517 C.i = c;
00518
00519 temp = (long long)A.s32[1] * B.s32[1];
00520 signT = temp & 0x8000000000000000LL;
00521 signC = C.s64[0] & 0x8000000000000000LL;
00522 temp += C.s64[0];
00523 D.s64[0] = (signT==signC) ? ((signT >0) ? ((temp > C.s64[0]) ? 0x8000000000000000LL : temp)
00524 : ((temp < C.s64[0])? 0x7FFFFFFFFFFFFFFFLL : temp)) : temp;
00525 temp = (long long)A.s32[3] * B.s32[3];
00526 signT = temp & 0x8000000000000000LL;
00527 signC = C.s64[1] & 0x8000000000000000LL;
00528 temp += C.s64[1];
00529 D.s64[1] = (signT==signC) ? ((signT >0) ? ((temp > C.s64[1]) ? 0x8000000000000000LL : temp)
00530 : ((temp < C.s64[1])? 0x7FFFFFFFFFFFFFFFLL : temp)) : temp;
00531
00532 return D.i;
00533 }
00534
00536 SSP_FORCEINLINE __m128i ssp_maccslo_epi32_REF( __m128i a, __m128i b, __m128i c )
00537 {
00538 ssp_m128 A, B, C, D;
00539 long long temp;
00540 unsigned long long signT, signC;
00541 A.i = a;
00542 B.i = b;
00543 C.i = c;
00544
00545 temp = (long long)A.s32[0] * B.s32[0];
00546 signT = temp & 0x8000000000000000LL;
00547 signC = C.s64[0] & 0x8000000000000000LL;
00548 temp += C.s64[0];
00549 D.s64[0] = (signT==signC) ? ((signT >0) ? ((temp > C.s64[0]) ? 0x8000000000000000LL : temp)
00550 : ((temp < C.s64[0])? 0x7FFFFFFFFFFFFFFFLL : temp)) : temp;
00551 temp = (long long)A.s32[2] * B.s32[2];
00552 signT = temp & 0x8000000000000000LL;
00553 signC = C.s64[1] & 0x8000000000000000LL;
00554 temp += C.s64[1];
00555 D.s64[1] = (signT==signC) ? ((signT >0) ? ((temp > C.s64[1]) ? 0x8000000000000000LL : temp)
00556 : ((temp < C.s64[1])? 0x7FFFFFFFFFFFFFFFLL : temp)) : temp;
00557
00558 return D.i;
00559 }
00560
00562 SSP_FORCEINLINE __m128i ssp_maddd_epi16_REF( __m128i a, __m128i b, __m128i c )
00563 {
00564 ssp_m128 A, B, C, D;
00565 A.i = a;
00566 B.i = b;
00567 C.i = c;
00568
00569 D.s32[0] = A.s16[0] * B.s16[0] + A.s16[1] * B.s16[1] + C.s32[0];
00570 D.s32[1] = A.s16[2] * B.s16[2] + A.s16[3] * B.s16[3] + C.s32[1];
00571 D.s32[2] = A.s16[4] * B.s16[4] + A.s16[5] * B.s16[5] + C.s32[2];
00572 D.s32[3] = A.s16[6] * B.s16[6] + A.s16[7] * B.s16[7] + C.s32[3];
00573
00574 return D.i;
00575 }
00576
00578 SSP_FORCEINLINE __m128i ssp_maddsd_epi16_REF( __m128i a, __m128i b, __m128i c )
00579 {
00580 ssp_m128 A, B, C, D;
00581 long long temp;
00582
00583 A.i = a;
00584 B.i = b;
00585 C.i = c;
00586
00587 temp = A.s16[0] * B.s16[0] + A.s16[1] * B.s16[1] + (long long)C.s32[0];
00588 D.s32[0] = (ssp_s32)(SSP_SATURATION(temp, 2147483647LL, -2147483648LL));
00589 temp = A.s16[2] * B.s16[2] + A.s16[3] * B.s16[3] + (long long)C.s32[1];
00590 D.s32[1] = (ssp_s32)(SSP_SATURATION(temp, 2147483647LL, -2147483648LL));;
00591 temp = A.s16[4] * B.s16[4] + A.s16[5] * B.s16[5] + (long long)C.s32[2];
00592 D.s32[2] = (ssp_s32)(SSP_SATURATION(temp, 2147483647LL, -2147483648LL));;
00593 temp = A.s16[6] * B.s16[6] + A.s16[7] * B.s16[7] + (long long)C.s32[3];
00594 D.s32[3] = (ssp_s32)(SSP_SATURATION(temp, 2147483647LL, -2147483648LL));;
00595
00596 return D.i;
00597 }
00598
00599
00600
00601
00602
00604 SSP_FORCEINLINE __m128 ssp_nmacc_ps_REF(__m128 a, __m128 b, __m128 c)
00605 {
00606 ssp_m128 A,B,C;
00607 A.f = a;
00608 B.f = b;
00609 C.f = c;
00610
00611 A.f32[0] = -(A.f32[0] * B.f32[0]) + C.f32[0];
00612 A.f32[1] = -(A.f32[1] * B.f32[1]) + C.f32[1];
00613 A.f32[2] = -(A.f32[2] * B.f32[2]) + C.f32[2];
00614 A.f32[3] = -(A.f32[3] * B.f32[3]) + C.f32[3];
00615 return A.f;
00616 }
00617
00619 SSP_FORCEINLINE __m128d ssp_nmacc_pd_REF(__m128d a, __m128d b, __m128d c)
00620 {
00621 ssp_m128 A,B,C;
00622 A.d = a;
00623 B.d = b;
00624 C.d = c;
00625
00626 A.f64[0] = -(A.f64[0] * B.f64[0]) + C.f64[0];
00627 A.f64[1] = -(A.f64[1] * B.f64[1]) + C.f64[1];
00628 return A.d;
00629 }
00630
00632 SSP_FORCEINLINE __m128 ssp_nmacc_ss_REF(__m128 a, __m128 b, __m128 c)
00633 {
00634 ssp_m128 A,B,C;
00635 A.f = a;
00636 B.f = b;
00637 C.f = c;
00638
00639 A.f32[0] = -(A.f32[0] * B.f32[0]) + C.f32[0];
00640 return A.f;
00641 }
00642
00644 SSP_FORCEINLINE __m128d ssp_nmacc_sd_REF(__m128d a, __m128d b, __m128d c)
00645 {
00646 ssp_m128 A,B,C;
00647 A.d = a;
00648 B.d = b;
00649 C.d = c;
00650
00651 A.f64[0] = -(A.f64[0] * B.f64[0]) + C.f64[0];
00652 return A.d;
00653 }
00654
00655
00656
00657
00658
00659
00661 SSP_FORCEINLINE __m128 ssp_msub_ps_REF( __m128 a, __m128 b, __m128 c )
00662 {
00663 ssp_m128 A,B,C;
00664 A.f = a;
00665 B.f = b;
00666 C.f = c;
00667
00668 A.f32[0] = A.f32[0] * B.f32[0] - C.f32[0];
00669 A.f32[1] = A.f32[1] * B.f32[1] - C.f32[1];
00670 A.f32[2] = A.f32[2] * B.f32[2] - C.f32[2];
00671 A.f32[3] = A.f32[3] * B.f32[3] - C.f32[3];
00672 return A.f;
00673 }
00674
00676 SSP_FORCEINLINE __m128d ssp_msub_pd_REF( __m128d a, __m128d b, __m128d c )
00677 {
00678 ssp_m128 A,B,C;
00679 A.d = a;
00680 B.d = b;
00681 C.d = c;
00682
00683 A.f64[0] = A.f64[0] * B.f64[0] - C.f64[0];
00684 A.f64[1] = A.f64[1] * B.f64[1] - C.f64[1];
00685 return A.d;
00686 }
00687
00689 SSP_FORCEINLINE __m128 ssp_msub_ss_REF(__m128 a, __m128 b, __m128 c)
00690 {
00691 ssp_m128 A,B,C;
00692 A.f = a;
00693 B.f = b;
00694 C.f = c;
00695
00696 A.f32[0] = A.f32[0] * B.f32[0] - C.f32[0];
00697 return A.f;
00698 }
00699
00701 SSP_FORCEINLINE __m128d ssp_msub_sd_REF(__m128d a, __m128d b, __m128d c)
00702 {
00703 ssp_m128 A,B,C;
00704 A.d = a;
00705 B.d = b;
00706 C.d = c;
00707
00708 A.f64[0] = A.f64[0] * B.f64[0] - C.f64[0];
00709 return A.d;
00710 }
00711
00712
00713
00714
00715
00717 SSP_FORCEINLINE __m128 ssp_nmsub_ps_REF(__m128 a, __m128 b, __m128 c)
00718 {
00719 ssp_m128 A,B,C;
00720 A.f = a;
00721 B.f = b;
00722 C.f = c;
00723
00724 A.f32[0] = -(A.f32[0] * B.f32[0]) - C.f32[0];
00725 A.f32[1] = -(A.f32[1] * B.f32[1]) - C.f32[1];
00726 A.f32[2] = -(A.f32[2] * B.f32[2]) - C.f32[2];
00727 A.f32[3] = -(A.f32[3] * B.f32[3]) - C.f32[3];
00728 return A.f;
00729 }
00730
00732 SSP_FORCEINLINE __m128d ssp_nmsub_pd_REF(__m128d a, __m128d b, __m128d c)
00733 {
00734 ssp_m128 A,B,C;
00735 A.d = a;
00736 B.d = b;
00737 C.d = c;
00738
00739 A.f64[0] = -(A.f64[0] * B.f64[0]) - C.f64[0];
00740 A.f64[1] = -(A.f64[1] * B.f64[1]) - C.f64[1];
00741 return A.d;
00742 }
00743
00745 SSP_FORCEINLINE __m128 ssp_nmsub_ss_REF(__m128 a, __m128 b, __m128 c)
00746 {
00747 ssp_m128 A,B,C;
00748 A.f = a;
00749 B.f = b;
00750 C.f = c;
00751
00752 A.f32[0] = -(A.f32[0] * B.f32[0]) - C.f32[0];
00753 return A.f;
00754 }
00755
00757 SSP_FORCEINLINE __m128d ssp_nmsub_sd_REF(__m128d a, __m128d b, __m128d c)
00758 {
00759 ssp_m128 A,B,C;
00760 A.d = a;
00761 B.d = b;
00762 C.d = c;
00763
00764 A.f64[0] = -(A.f64[0] * B.f64[0]) - C.f64[0];
00765 return A.d;
00766 }
00767
00768
00769
00770
00771
00772
00773
00775 SSP_FORCEINLINE __m128 ssp_addsub_ps_REF(__m128 a, __m128 b)
00776 {
00777 ssp_m128 A, B;
00778 A.f = a;
00779 B.f = b;
00780
00781 A.f32[0] -= B.f32[0];
00782 A.f32[1] += B.f32[1];
00783 A.f32[2] -= B.f32[2];
00784 A.f32[3] += B.f32[3];
00785 return A.f;
00786 }
00787
00789 SSP_FORCEINLINE __m128d ssp_addsub_pd_REF(__m128d a, __m128d b)
00790 {
00791 ssp_m128 A, B;
00792 A.d = a;
00793 B.d = b;
00794
00795 A.f64[0] -= B.f64[0];
00796 A.f64[1] += B.f64[1];
00797 return A.d;
00798 }
00799
00800
00801
00802
00803
00805 SSP_FORCEINLINE __m128i ssp_blend_epi16_REF ( __m128i a, __m128i b, const int mask )
00806 {
00807 ssp_m128 A, B;
00808 A.i = a;
00809 B.i = b;
00810
00811 A.s16[0] = (mask & 0x01) ? B.s16[0] : A.s16[0];
00812 A.s16[1] = (mask & 0x02) ? B.s16[1] : A.s16[1];
00813 A.s16[2] = (mask & 0x04) ? B.s16[2] : A.s16[2];
00814 A.s16[3] = (mask & 0x08) ? B.s16[3] : A.s16[3];
00815 A.s16[4] = (mask & 0x10) ? B.s16[4] : A.s16[4];
00816 A.s16[5] = (mask & 0x20) ? B.s16[5] : A.s16[5];
00817 A.s16[6] = (mask & 0x40) ? B.s16[6] : A.s16[6];
00818 A.s16[7] = (mask & 0x80) ? B.s16[7] : A.s16[7];
00819 return A.i;
00820 }
00821
00823 SSP_FORCEINLINE __m128d ssp_blend_pd_REF ( __m128d a, __m128d b, const int mask )
00824 {
00825 ssp_m128 A, B;
00826 A.d = a;
00827 B.d = b;
00828
00829 A.f64[0] = (mask & 0x1) ? B.f64[0] : A.f64[0];
00830 A.f64[1] = (mask & 0x2) ? B.f64[1] : A.f64[1];
00831 return A.d;
00832 }
00833
00835 SSP_FORCEINLINE __m128 ssp_blend_ps_REF ( __m128 a, __m128 b, const int mask )
00836 {
00837 ssp_m128 A, B;
00838 A.f = a;
00839 B.f = b;
00840
00841 A.f32[0] = (mask & 0x1) ? B.f32[0] : A.f32[0];
00842 A.f32[1] = (mask & 0x2) ? B.f32[1] : A.f32[1];
00843 A.f32[2] = (mask & 0x4) ? B.f32[2] : A.f32[2];
00844 A.f32[3] = (mask & 0x8) ? B.f32[3] : A.f32[3];
00845 return A.f;
00846 }
00847
00849 SSP_FORCEINLINE __m128i ssp_blendv_epi8_REF ( __m128i a, __m128i b, __m128i mask )
00850 {
00851 ssp_m128 A, B, Mask;
00852 A.i = a;
00853 B.i = b;
00854 Mask.i = mask;
00855
00856 A.s8[0] = (Mask.s8[0] & 0x80) ? B.s8[0] : A.s8[0];
00857 A.s8[1] = (Mask.s8[1] & 0x80) ? B.s8[1] : A.s8[1];
00858 A.s8[2] = (Mask.s8[2] & 0x80) ? B.s8[2] : A.s8[2];
00859 A.s8[3] = (Mask.s8[3] & 0x80) ? B.s8[3] : A.s8[3];
00860 A.s8[4] = (Mask.s8[4] & 0x80) ? B.s8[4] : A.s8[4];
00861 A.s8[5] = (Mask.s8[5] & 0x80) ? B.s8[5] : A.s8[5];
00862 A.s8[6] = (Mask.s8[6] & 0x80) ? B.s8[6] : A.s8[6];
00863 A.s8[7] = (Mask.s8[7] & 0x80) ? B.s8[7] : A.s8[7];
00864 A.s8[8] = (Mask.s8[8] & 0x80) ? B.s8[8] : A.s8[8];
00865 A.s8[9] = (Mask.s8[9] & 0x80) ? B.s8[9] : A.s8[9];
00866 A.s8[10] = (Mask.s8[10] & 0x80) ? B.s8[10] : A.s8[10];
00867 A.s8[11] = (Mask.s8[11] & 0x80) ? B.s8[11] : A.s8[11];
00868 A.s8[12] = (Mask.s8[12] & 0x80) ? B.s8[12] : A.s8[12];
00869 A.s8[13] = (Mask.s8[13] & 0x80) ? B.s8[13] : A.s8[13];
00870 A.s8[14] = (Mask.s8[14] & 0x80) ? B.s8[14] : A.s8[14];
00871 A.s8[15] = (Mask.s8[15] & 0x80) ? B.s8[15] : A.s8[15];
00872 return A.i;
00873 }
00874
00876 SSP_FORCEINLINE __m128d ssp_blendv_pd_REF ( __m128d a, __m128d b, __m128d mask )
00877 {
00878 ssp_m128 A, B, Mask;
00879 A.d = a;
00880 B.d = b;
00881 Mask.d = mask;
00882
00883 A.f64[0] = (Mask.u64[0] & 0x8000000000000000ll) ? B.f64[0] : A.f64[0];
00884 A.f64[1] = (Mask.u64[1] & 0x8000000000000000ll) ? B.f64[1] : A.f64[1];
00885 return A.d;
00886 }
00887
00889 SSP_FORCEINLINE __m128 ssp_blendv_ps_REF ( __m128 a, __m128 b, __m128 mask )
00890 {
00891 ssp_m128 A, B, Mask;
00892 A.f = a;
00893 B.f = b;
00894 Mask.f = mask;
00895
00896 A.f32[0] = (Mask.u32[0] & 0x80000000) ? B.f32[0] : A.f32[0];
00897 A.f32[1] = (Mask.u32[1] & 0x80000000) ? B.f32[1] : A.f32[1];
00898 A.f32[2] = (Mask.u32[2] & 0x80000000) ? B.f32[2] : A.f32[2];
00899 A.f32[3] = (Mask.u32[3] & 0x80000000) ? B.f32[3] : A.f32[3];
00900 return A.f;
00901 }
00902
00903
00904
00905
00906
00908 SSP_FORCEINLINE __m128i ssp_cmpeq_epi64_REF( __m128i a, __m128i b )
00909 {
00910 ssp_m128 A, B;
00911 A.i = a;
00912 B.i = b;
00913
00914 if( A.s64[0] == B.s64[0] )
00915 A.s64[0] = 0xFFFFFFFFFFFFFFFFll;
00916 else
00917 A.s64[0] = 0x0ll;
00918
00919 if( A.s64[1] == B.s64[1] )
00920 A.s64[1] = 0xFFFFFFFFFFFFFFFFll;
00921 else
00922 A.s64[1] = 0x0ll;
00923 return A.i;
00924 }
00925
00926
00927
00928
00930 SSP_FORCEINLINE __m128d ssp_dp_pd_REF( __m128d a, __m128d b, const int mask )
00931 {
00932 ssp_f64 tmp[3];
00933 ssp_m128 A, B;
00934 A.d = a;
00935 B.d = b;
00936
00937 tmp[0] = (mask & 0x10) ? (A.f64[0] * B.f64[0]) : 0.0;
00938 tmp[1] = (mask & 0x20) ? (A.f64[1] * B.f64[1]) : 0.0;
00939
00940 tmp[2] = tmp[0] + tmp[1];
00941
00942 A.f64[0] = (mask & 0x1) ? tmp[2] : 0.0;
00943 A.f64[1] = (mask & 0x2) ? tmp[2] : 0.0;
00944 return A.d;
00945 }
00946
00948 SSP_FORCEINLINE __m128 ssp_dp_ps_REF( __m128 a, __m128 b, const int mask )
00949 {
00950 ssp_f32 tmp[5];
00951 ssp_m128 A, B;
00952 A.f = a;
00953 B.f = b;
00954
00955 tmp[0] = (mask & 0x10) ? (A.f32[0] * B.f32[0]) : 0.0f;
00956 tmp[1] = (mask & 0x20) ? (A.f32[1] * B.f32[1]) : 0.0f;
00957 tmp[2] = (mask & 0x40) ? (A.f32[2] * B.f32[2]) : 0.0f;
00958 tmp[3] = (mask & 0x80) ? (A.f32[3] * B.f32[3]) : 0.0f;
00959
00960 tmp[4] = tmp[0] + tmp[1] + tmp[2] + tmp[3];
00961
00962 A.f32[0] = (mask & 0x1) ? tmp[4] : 0.0f;
00963 A.f32[1] = (mask & 0x2) ? tmp[4] : 0.0f;
00964 A.f32[2] = (mask & 0x4) ? tmp[4] : 0.0f;
00965 A.f32[3] = (mask & 0x8) ? tmp[4] : 0.0f;
00966 return A.f;
00967 }
00968
00970 SSP_FORCEINLINE __m128i ssp_maddubs_epi16_REF( __m128i a, __m128i b)
00971 {
00972 ssp_m128 A, B, C;
00973 int tmp[8];
00974 A.i = a;
00975 B.i = b;
00976
00977
00978 tmp[0] = A.u8[0] * B.s8[0] + A.u8[1] * B.s8[1];
00979 C.s16[0] = (ssp_s16)(SSP_SATURATION(tmp[0], 32767, -32768));
00980
00981 tmp[1] = A.u8[2] * B.s8[2] + A.u8[3] * B.s8[3];
00982 C.s16[1] = (ssp_s16)(SSP_SATURATION(tmp[1], 32767, -32768));
00983
00984 tmp[2] = A.u8[4] * B.s8[4] + A.u8[5] * B.s8[5];
00985 C.s16[2] = (ssp_s16)(SSP_SATURATION(tmp[2], 32767, -32768));
00986
00987 tmp[3] = A.u8[6] * B.s8[6] + A.u8[7] * B.s8[7];
00988 C.s16[3] = (ssp_s16)(SSP_SATURATION(tmp[3], 32767, -32768));
00989
00990 tmp[4] = A.u8[8] * B.s8[8] + A.u8[9] * B.s8[9];
00991 C.s16[4] = (ssp_s16)(SSP_SATURATION(tmp[4], 32767, -32768));
00992
00993 tmp[5] = A.u8[10] * B.s8[10] + A.u8[11] * B.s8[11];
00994 C.s16[5] = (ssp_s16)(SSP_SATURATION(tmp[5], 32767, -32768));
00995
00996 tmp[6] = A.u8[12] * B.s8[12] + A.u8[13] * B.s8[13];
00997 C.s16[6] = (ssp_s16)(SSP_SATURATION(tmp[6], 32767, -32768));
00998
00999 tmp[7] = A.u8[14] * B.s8[14] + A.u8[15] * B.s8[15];
01000 C.s16[7] = (ssp_s16)(SSP_SATURATION(tmp[7], 32767, -32768));
01001
01002 return C.i;
01003 }
01004
01008
01009 SSP_FORCEINLINE __m64 ssp_maddubs_pi16_REF( __m64 a, __m64 b)
01010 {
01011 ssp_m64 A, B, C;
01012 int tmp[4];
01013 A.m64 = a;
01014 B.m64 = b;
01015
01016
01017 tmp[0] = A.u8[0] * B.s8[0] + A.u8[1] * B.s8[1];
01018 C.s16[0] = (ssp_s16)(SSP_SATURATION(tmp[0], 32767, -32768));
01019
01020 tmp[1] = A.u8[2] * B.s8[2] + A.u8[3] * B.s8[3];
01021 C.s16[1] = (ssp_s16)(SSP_SATURATION(tmp[1], 32767, -32768));
01022
01023 tmp[2] = A.u8[4] * B.s8[4] + A.u8[5] * B.s8[5];
01024 C.s16[2] = (ssp_s16)(SSP_SATURATION(tmp[2], 32767, -32768));
01025
01026 tmp[3] = A.u8[6] * B.s8[6] + A.u8[7] * B.s8[7];
01027 C.s16[3] = (ssp_s16)(SSP_SATURATION(tmp[3], 32767, -32768));
01028
01029 return C.m64;
01030 }
01031
01032
01034 SSP_FORCEINLINE __m128i ssp_mulhrs_epi16_REF( __m128i a, __m128i b )
01035 {
01036 ssp_m128 A,B;
01037 A.i = a;
01038 B.i = b;
01039
01040 A.s16[0] = (ssp_s16) ((A.s16[0] * B.s16[0] + 0x4000) >> 15);
01041 A.s16[1] = (ssp_s16) ((A.s16[1] * B.s16[1] + 0x4000) >> 15);
01042 A.s16[2] = (ssp_s16) ((A.s16[2] * B.s16[2] + 0x4000) >> 15);
01043 A.s16[3] = (ssp_s16) ((A.s16[3] * B.s16[3] + 0x4000) >> 15);
01044 A.s16[4] = (ssp_s16) ((A.s16[4] * B.s16[4] + 0x4000) >> 15);
01045 A.s16[5] = (ssp_s16) ((A.s16[5] * B.s16[5] + 0x4000) >> 15);
01046 A.s16[6] = (ssp_s16) ((A.s16[6] * B.s16[6] + 0x4000) >> 15);
01047 A.s16[7] = (ssp_s16) ((A.s16[7] * B.s16[7] + 0x4000) >> 15);
01048
01049 return A.i;
01050 }
01051
01052
01056 SSP_FORCEINLINE __m64 ssp_mulhrs_pi16_REF( __m64 a, __m64 b )
01057 {
01058 ssp_m64 A,B;
01059 A.m64 = a;
01060 B.m64 = b;
01061
01062 A.s16[0] = (ssp_s16) ((A.s16[0] * B.s16[0] + 0x4000) >> 15);
01063 A.s16[1] = (ssp_s16) ((A.s16[1] * B.s16[1] + 0x4000) >> 15);
01064 A.s16[2] = (ssp_s16) ((A.s16[2] * B.s16[2] + 0x4000) >> 15);
01065 A.s16[3] = (ssp_s16) ((A.s16[3] * B.s16[3] + 0x4000) >> 15);
01066
01067 return A.m64;
01068 }
01069
01070
01071
01072
01073
01074
01075
01076
01078 SSP_FORCEINLINE int ssp_extract_epi8_REF( __m128i a, const int ndx )
01079 {
01080 ssp_m128 A;
01081 A.i = a;
01082 return (int)A.u8[ndx&0xF];
01083 }
01084
01086 SSP_FORCEINLINE int ssp_extract_epi32_REF( __m128i a, const int imm )
01087 {
01088 ssp_m128 A;
01089 A.i = a;
01090 return (int)A.u32[imm&0x3];
01091 }
01092
01094 SSP_FORCEINLINE ssp_s64 ssp_extract_epi64_REF( __m128i a, const int ndx )
01095 {
01096 ssp_m128 A;
01097 A.i = a;
01098 return A.s64[ndx & 0x1];
01099 }
01100
01102 SSP_FORCEINLINE int ssp_extract_ps_REF( __m128 a, const int ndx )
01103 {
01104 ssp_m128 A;
01105 A.f = a;
01106 return A.s32[ndx&0x3];
01107 }
01108
01109
01113 SSP_FORCEINLINE __m128i ssp_extract_si64_REF( __m128i a ,__m128i b )
01114 {
01115 ssp_u32 len, ndx;
01116 ssp_s64 mask;
01117 ssp_m128 A, B;
01118 A.i = a;
01119 B.i = b;
01120 ndx = (ssp_u32)((B.u64[0] & 0x3F00) >> 8);
01121 len = (ssp_u32)((B.u64[0] & 0x003F));
01122
01123 len = (len) ? len : 64;
01124 if( (ndx+len) > 64 )
01125 return a;
01126 mask = ~(-1 << len);
01127 A.u64[0] = A.u64[0] >> ndx;
01128 A.u64[0] = A.u64[0] & mask;
01129 return A.i;
01130 }
01134 SSP_FORCEINLINE __m128i ssp_extracti_si64_REF( __m128i a, int len, int ndx )
01135 {
01136 ssp_s64 mask;
01137 ssp_m128 A;
01138 A.i = a;
01139 ndx = ndx & 0x3F;
01140 len = len & 0x3F;
01141
01142 len = (len) ? len : 64;
01143 if( (ndx+len) > 64 )
01144 return a;
01145 mask = ~(-1 << len);
01146 A.u64[0] = A.u64[0] >> ndx;
01147 A.u64[0] = A.u64[0] & mask;
01148 return A.i;
01149 }
01150
01151
01152
01153
01154
01155
01157 SSP_FORCEINLINE __m128i ssp_hadd_epi16_REF ( __m128i a, __m128i b )
01158 {
01159 ssp_m128 A, B;
01160 A.i = a;
01161 B.i = b;
01162
01163 A.s16[0] = A.s16[0] + A.s16[1];
01164 A.s16[1] = A.s16[2] + A.s16[3];
01165 A.s16[2] = A.s16[4] + A.s16[5];
01166 A.s16[3] = A.s16[6] + A.s16[7];
01167 A.s16[4] = B.s16[0] + B.s16[1];
01168 A.s16[5] = B.s16[2] + B.s16[3];
01169 A.s16[6] = B.s16[4] + B.s16[5];
01170 A.s16[7] = B.s16[6] + B.s16[7];
01171 return A.i;
01172 }
01173
01175 SSP_FORCEINLINE __m128i ssp_hadd_epi32_REF ( __m128i a, __m128i b )
01176 {
01177 ssp_m128 A, B;
01178 A.i = a;
01179 B.i = b;
01180
01181 A.s32[0] = A.s32[0] + A.s32[1];
01182 A.s32[1] = A.s32[2] + A.s32[3];
01183 A.s32[2] = B.s32[0] + B.s32[1];
01184 A.s32[3] = B.s32[2] + B.s32[3];
01185
01186 return A.i;
01187 }
01188
01192 SSP_FORCEINLINE __m64 ssp_hadd_pi16_REF ( __m64 a, __m64 b )
01193 {
01194 ssp_m64 A, B;
01195 A.m64 = a;
01196 B.m64 = b;
01197
01198 A.s16[0] = A.s16[0] + A.s16[1];
01199 A.s16[1] = A.s16[2] + A.s16[3];
01200 A.s16[2] = B.s16[0] + B.s16[1];
01201 A.s16[3] = B.s16[2] + B.s16[3];
01202
01203 return A.m64;
01204 }
01205
01209 SSP_FORCEINLINE __m64 ssp_hadd_pi32_REF ( __m64 a, __m64 b )
01210 {
01211 ssp_m64 A, B;
01212 A.m64 = a;
01213 B.m64 = b;
01214
01215 A.s32[0] = A.s32[0] + A.s32[1];
01216 A.s32[1] = B.s32[0] + B.s32[1];
01217
01218 return A.m64;
01219 }
01220
01222 SSP_FORCEINLINE __m128i ssp_hadds_epi16_REF ( __m128i a, __m128i b )
01223 {
01224 ssp_m128 A, B;
01225 int answer[8];
01226 A.i = a;
01227 B.i = b;
01228
01229 answer[0] = A.s16[0] + A.s16[1];
01230 A.s16[0] = (ssp_s16) (SSP_SATURATION(answer[0], 32767, -32768));
01231 answer[1] = A.s16[2] + A.s16[3];
01232 A.s16[1] = (ssp_s16) (SSP_SATURATION(answer[1], 32767, -32768));
01233 answer[2] = A.s16[4] + A.s16[5];
01234 A.s16[2] = (ssp_s16) (SSP_SATURATION(answer[2], 32767, -32768));
01235 answer[3] = A.s16[6] + A.s16[7];
01236 A.s16[3] = (ssp_s16) (SSP_SATURATION(answer[3], 32767, -32768));
01237 answer[4] = B.s16[0] + B.s16[1];
01238 A.s16[4] = (ssp_s16) (SSP_SATURATION(answer[4], 32767, -32768));
01239 answer[5] = B.s16[2] + B.s16[3];
01240 A.s16[5] = (ssp_s16) (SSP_SATURATION(answer[5], 32767, -32768));
01241 answer[6] = B.s16[4] + B.s16[5];
01242 A.s16[6] = (ssp_s16) (SSP_SATURATION(answer[6], 32767, -32768));
01243 answer[7] = B.s16[6] + B.s16[7];
01244 A.s16[7] = (ssp_s16) (SSP_SATURATION(answer[7], 32767, -32768));
01245
01246 return A.i;
01247 }
01248
01252 SSP_FORCEINLINE __m64 ssp_hadds_pi16_REF ( __m64 a, __m64 b )
01253 {
01254 ssp_m64 A, B;
01255 int answer[4];
01256 A.m64 = a;
01257 B.m64 = b;
01258
01259 answer[0] = A.s16[0] + A.s16[1];
01260 A.s16[0] = (ssp_s16) (SSP_SATURATION(answer[0], 32767, -32768));
01261 answer[1] = A.s16[2] + A.s16[3];
01262 A.s16[1] = (ssp_s16) (SSP_SATURATION(answer[1], 32767, -32768));
01263 answer[2] = B.s16[0] + B.s16[1];
01264 A.s16[2] = (ssp_s16) (SSP_SATURATION(answer[2], 32767, -32768));
01265 answer[3] = B.s16[2] + B.s16[3];
01266 A.s16[3] = (ssp_s16) (SSP_SATURATION(answer[3], 32767, -32768));
01267
01268 return A.m64;
01269 }
01270
01272 SSP_FORCEINLINE __m128 ssp_hadd_ps_REF(__m128 a, __m128 b)
01273 {
01274 ssp_m128 A, B;
01275 A.f = a;
01276 B.f = b;
01277
01278 A.f32[0] = A.f32[0] + A.f32[1];
01279 A.f32[1] = A.f32[2] + A.f32[3];
01280 A.f32[2] = B.f32[0] + B.f32[1];
01281 A.f32[3] = B.f32[2] + B.f32[3];
01282 return A.f;
01283 }
01284
01286 SSP_FORCEINLINE __m128d ssp_hadd_pd_REF(__m128d a, __m128d b)
01287 {
01288 ssp_m128 A, B;
01289 A.d = a;
01290 B.d = b;
01291
01292 A.f64[0] = A.f64[0] + A.f64[1];
01293 A.f64[1] = B.f64[0] + B.f64[1];
01294 return A.d;
01295 }
01296
01297
01298
01299
01300
01302 SSP_FORCEINLINE __m128i ssp_hsub_epi16_REF ( __m128i a, __m128i b )
01303 {
01304 ssp_m128 A, B;
01305 A.i = a;
01306 B.i = b;
01307
01308 A.s16[0] = A.s16[0] - A.s16[1];
01309 A.s16[1] = A.s16[2] - A.s16[3];
01310 A.s16[2] = A.s16[4] - A.s16[5];
01311 A.s16[3] = A.s16[6] - A.s16[7];
01312 A.s16[4] = B.s16[0] - B.s16[1];
01313 A.s16[5] = B.s16[2] - B.s16[3];
01314 A.s16[6] = B.s16[4] - B.s16[5];
01315 A.s16[7] = B.s16[6] - B.s16[7];
01316
01317 return A.i;
01318 }
01319
01321 SSP_FORCEINLINE __m128i ssp_hsub_epi32_REF ( __m128i a, __m128i b )
01322 {
01323 ssp_m128 A, B;
01324 A.i = a;
01325 B.i = b;
01326
01327 A.s32[0] = A.s32[0] - A.s32[1];
01328 A.s32[1] = A.s32[2] - A.s32[3];
01329 A.s32[2] = B.s32[0] - B.s32[1];
01330 A.s32[3] = B.s32[2] - B.s32[3];
01331
01332 return A.i;
01333 }
01334
01338 SSP_FORCEINLINE __m64 ssp_hsub_pi16_REF ( __m64 a, __m64 b )
01339 {
01340 ssp_m64 A, B;
01341 A.m64 = a;
01342 B.m64 = b;
01343
01344 A.s16[0] = A.s16[0] - A.s16[1];
01345 A.s16[1] = A.s16[2] - A.s16[3];
01346 A.s16[2] = B.s16[0] - B.s16[1];
01347 A.s16[3] = B.s16[2] - B.s16[3];
01348
01349 return A.m64;
01350 }
01351
01355 SSP_FORCEINLINE __m64 ssp_hsub_pi32_REF ( __m64 a, __m64 b )
01356 {
01357 ssp_m64 A, B;
01358 A.m64 = a;
01359 B.m64 = b;
01360
01361 A.s32[0] = A.s32[0] - A.s32[1];
01362 A.s32[1] = B.s32[0] - B.s32[1];
01363
01364 return A.m64;
01365 }
01366
01368 SSP_FORCEINLINE __m128i ssp_hsubs_epi16_REF ( __m128i a, __m128i b )
01369 {
01370 ssp_m128 A, B;
01371 int answer[8];
01372 A.i = a;
01373 B.i = b;
01374
01375 answer[0] = A.s16[0] - A.s16[1];
01376 A.s16[0] = (ssp_s16) (SSP_SATURATION(answer[0], 32767, -32768));
01377 answer[1] = A.s16[2] - A.s16[3];
01378 A.s16[1] = (ssp_s16) (SSP_SATURATION(answer[1], 32767, -32768));
01379 answer[2] = A.s16[4] - A.s16[5];
01380 A.s16[2] = (ssp_s16) (SSP_SATURATION(answer[2], 32767, -32768));
01381 answer[3] = A.s16[6] - A.s16[7];
01382 A.s16[3] = (ssp_s16) (SSP_SATURATION(answer[3], 32767, -32768));
01383 answer[4] = B.s16[0] - B.s16[1];
01384 A.s16[4] = (ssp_s16) (SSP_SATURATION(answer[4], 32767, -32768));
01385 answer[5] = B.s16[2] - B.s16[3];
01386 A.s16[5] = (ssp_s16) (SSP_SATURATION(answer[5], 32767, -32768));
01387 answer[6] = B.s16[4] - B.s16[5];
01388 A.s16[6] = (ssp_s16) (SSP_SATURATION(answer[6], 32767, -32768));
01389 answer[7] = B.s16[6] - B.s16[7];
01390 A.s16[7] = (ssp_s16) (SSP_SATURATION(answer[7], 32767, -32768));
01391
01392 return A.i;
01393 }
01394
01398 SSP_FORCEINLINE __m64 ssp_hsubs_pi16_REF ( __m64 a, __m64 b )
01399 {
01400 ssp_m64 A, B;
01401 int answer[4];
01402 A.m64 = a;
01403 B.m64 = b;
01404
01405 answer[0] = A.s16[0] - A.s16[1];
01406 A.s16[0] = (ssp_s16) (SSP_SATURATION(answer[0], 32767, -32768));
01407 answer[1] = A.s16[2] - A.s16[3];
01408 A.s16[1] = (ssp_s16) (SSP_SATURATION(answer[1], 32767, -32768));
01409 answer[2] = B.s16[0] - B.s16[1];
01410 A.s16[2] = (ssp_s16) (SSP_SATURATION(answer[2], 32767, -32768));
01411 answer[3] = B.s16[2] - B.s16[3];
01412 A.s16[3] = (ssp_s16) (SSP_SATURATION(answer[3], 32767, -32768));
01413
01414 return A.m64;
01415 }
01416
01418 SSP_FORCEINLINE __m128 ssp_hsub_ps_REF(__m128 a, __m128 b)
01419 {
01420 ssp_m128 A, B;
01421 A.f = a;
01422 B.f = b;
01423
01424 A.f32[0] = A.f32[0] - A.f32[1];
01425 A.f32[1] = A.f32[2] - A.f32[3];
01426 A.f32[2] = B.f32[0] - B.f32[1];
01427 A.f32[3] = B.f32[2] - B.f32[3];
01428 return A.f;
01429 }
01430
01432 SSP_FORCEINLINE __m128d ssp_hsub_pd_REF(__m128d a, __m128d b)
01433 {
01434 ssp_m128 A, B;
01435 A.d = a;
01436 B.d = b;
01437
01438 A.f64[0] = A.f64[0] - A.f64[1];
01439 A.f64[1] = B.f64[0] - B.f64[1];
01440 return A.d;
01441 }
01442
01443
01444
01445
01447 SSP_FORCEINLINE __m128i ssp_insert_epi8_REF( __m128i a, int b, const int ndx )
01448 {
01449 ssp_m128 A;
01450 A.i = a;
01451
01452 A.s8[ndx & 0xF] = (ssp_s8)b;
01453 return A.i;
01454 }
01455
01457 SSP_FORCEINLINE __m128i ssp_insert_epi32_REF( __m128i a, int b, const int ndx )
01458 {
01459 ssp_m128 A;
01460 A.i = a;
01461
01462 A.s32[ndx & 0x3] = b;
01463 return A.i;
01464 }
01465
01467 SSP_FORCEINLINE __m128i ssp_insert_epi64_REF( __m128i a, ssp_s64 b, const int ndx )
01468 {
01469 ssp_m128 A;
01470 A.i = a;
01471
01472 A.s64[ndx & 0x1] = b;
01473 return A.i;
01474 }
01475
01477 SSP_FORCEINLINE __m128 ssp_insert_ps_REF( __m128 a, __m128 b, const int sel )
01478 {
01479 ssp_f32 tmp;
01480 int count_d,zmask;
01481
01482 ssp_m128 A,B;
01483 A.f = a;
01484 B.f = b;
01485
01486 tmp = B.f32[(sel & 0xC0)>>6];
01487 count_d = (sel & 0x30)>>4;
01488 zmask = sel & 0x0F;
01489
01490 A.f32[count_d] = tmp;
01491
01492 A.f32[0] = (zmask & 0x1) ? 0 : A.f32[0];
01493 A.f32[1] = (zmask & 0x2) ? 0 : A.f32[1];
01494 A.f32[2] = (zmask & 0x4) ? 0 : A.f32[2];
01495 A.f32[3] = (zmask & 0x8) ? 0 : A.f32[3];
01496 return A.f;
01497 }
01498
01500 SSP_FORCEINLINE __m128i ssp_insert_si64_REF( __m128i a, __m128i b )
01501 {
01502 ssp_u32 ndx, len;
01503 ssp_s64 mask;
01504 ssp_m128 A, B;
01505 B.i = b;
01506 ndx = (ssp_u32)((B.u64[1] & 0x3F00) >> 8);
01507 len = (ssp_u32)((B.u64[1] & 0x003F));
01508
01509 if( ( (ndx + len) > 64 ) ||
01510 ( (len == 0) && (ndx > 0) ) )
01511 return a;
01512
01513 A.i = a;
01514 if( (len == 0 ) && (ndx == 0) )
01515 {
01516 A.u64[0] = B.u64[0];
01517 return A.i;
01518 }
01519
01520 len = (len) ? len : 64;
01521 mask = ~(-1 << len);
01522 B.u64[0] = B.u64[0] & mask;
01523 B.u64[0] = B.u64[0] << ndx;
01524 mask = ~(mask << ndx);
01525 A.u64[0] = A.u64[0] & mask;
01526 A.u64[0] |= B.u64[0];
01527 return A.i;
01528 }
01529
01531 SSP_FORCEINLINE __m128i ssp_inserti_si64_REF( __m128i a, __m128i b, int len, int ndx )
01532 {
01533 ssp_s64 mask;
01534 ssp_m128 A, B;
01535 A.i = a;
01536 ndx = ndx & 0x3F;
01537 len = len & 0x3F;
01538
01539 if( ( (ndx + len) > 64 ) ||
01540 ( (len == 0) && (ndx > 0) ) )
01541 return a;
01542
01543 B.i = b;
01544 if( (len == 0 ) && (ndx == 0) )
01545 {
01546 A.u64[0] = B.u64[0];
01547 return A.i;
01548 }
01549
01550 len = (len) ? len : 64;
01551 mask = ~(-1 << len);
01552 B.u64[0] = B.u64[0] & mask;
01553 B.u64[0] = B.u64[0] << ndx;
01554 mask = ~(mask << ndx);
01555 A.u64[0] = A.u64[0] & mask;
01556 A.u64[0] |= B.u64[0];
01557 return A.i;
01558 }
01559
01560
01561
01562
01563
01564
01566 SSP_FORCEINLINE __m128d ssp_loaddup_pd_REF(double const * dp)
01567 {
01568 ssp_m128 a;
01569 a.f64[0] = *dp;
01570 a.f64[1] = *dp;
01571 return a.d;
01572 }
01573
01575 SSP_FORCEINLINE __m128i ssp_lddqu_si128_REF(__m128i const *p)
01576 {
01577 return *p;
01578 }
01579
01581 SSP_FORCEINLINE __m128i ssp_stream_load_si128_REF( __m128i *p )
01582 {
01583 return *p;
01584 }
01585
01586
01587
01588
01589
01590
/*
 * In-place min / max helpers: store into `sd` the smaller (or larger) of `sd` and `s`.
 * Arguments are parenthesized and the expansion is a single expression with no
 * trailing ';', so the macros are safe with compound arguments and inside
 * unbraced if/else. `sd` is evaluated more than once - pass a plain lvalue.
 */
#define SSP_SET_MIN( sd, s) ( (sd) = ( (sd) < (s) ) ? (sd) : (s) )
#define SSP_SET_MAX( sd, s) ( (sd) = ( (sd) > (s) ) ? (sd) : (s) )
01593
01594
01596 SSP_FORCEINLINE __m128i ssp_min_epi8_REF( __m128i a, __m128i b )
01597 {
01598 ssp_m128 A,B;
01599 A.i = a;
01600 B.i = b;
01601
01602 SSP_SET_MIN( A.s8[ 0], B.s8[ 0] );
01603 SSP_SET_MIN( A.s8[ 1], B.s8[ 1] );
01604 SSP_SET_MIN( A.s8[ 2], B.s8[ 2] );
01605 SSP_SET_MIN( A.s8[ 3], B.s8[ 3] );
01606 SSP_SET_MIN( A.s8[ 4], B.s8[ 4] );
01607 SSP_SET_MIN( A.s8[ 5], B.s8[ 5] );
01608 SSP_SET_MIN( A.s8[ 6], B.s8[ 6] );
01609 SSP_SET_MIN( A.s8[ 7], B.s8[ 7] );
01610 SSP_SET_MIN( A.s8[ 8], B.s8[ 8] );
01611 SSP_SET_MIN( A.s8[ 9], B.s8[ 9] );
01612 SSP_SET_MIN( A.s8[10], B.s8[10] );
01613 SSP_SET_MIN( A.s8[11], B.s8[11] );
01614 SSP_SET_MIN( A.s8[12], B.s8[12] );
01615 SSP_SET_MIN( A.s8[13], B.s8[13] );
01616 SSP_SET_MIN( A.s8[14], B.s8[14] );
01617 SSP_SET_MIN( A.s8[15], B.s8[15] );
01618 return A.i;
01619 }
01620
01622 SSP_FORCEINLINE __m128i ssp_max_epi8_REF( __m128i a, __m128i b )
01623 {
01624 ssp_m128 A,B;
01625 A.i = a;
01626 B.i = b;
01627
01628 SSP_SET_MAX( A.s8[ 0], B.s8[ 0] );
01629 SSP_SET_MAX( A.s8[ 1], B.s8[ 1] );
01630 SSP_SET_MAX( A.s8[ 2], B.s8[ 2] );
01631 SSP_SET_MAX( A.s8[ 3], B.s8[ 3] );
01632 SSP_SET_MAX( A.s8[ 4], B.s8[ 4] );
01633 SSP_SET_MAX( A.s8[ 5], B.s8[ 5] );
01634 SSP_SET_MAX( A.s8[ 6], B.s8[ 6] );
01635 SSP_SET_MAX( A.s8[ 7], B.s8[ 7] );
01636 SSP_SET_MAX( A.s8[ 8], B.s8[ 8] );
01637 SSP_SET_MAX( A.s8[ 9], B.s8[ 9] );
01638 SSP_SET_MAX( A.s8[10], B.s8[10] );
01639 SSP_SET_MAX( A.s8[11], B.s8[11] );
01640 SSP_SET_MAX( A.s8[12], B.s8[12] );
01641 SSP_SET_MAX( A.s8[13], B.s8[13] );
01642 SSP_SET_MAX( A.s8[14], B.s8[14] );
01643 SSP_SET_MAX( A.s8[15], B.s8[15] );
01644 return A.i;
01645 }
01646
01647
01649 SSP_FORCEINLINE __m128i ssp_min_epu16_REF ( __m128i a, __m128i b )
01650 {
01651 ssp_m128 A,B;
01652 A.i = a;
01653 B.i = b;
01654
01655 SSP_SET_MIN( A.u16[ 0], B.u16[ 0] );
01656 SSP_SET_MIN( A.u16[ 1], B.u16[ 1] );
01657 SSP_SET_MIN( A.u16[ 2], B.u16[ 2] );
01658 SSP_SET_MIN( A.u16[ 3], B.u16[ 3] );
01659 SSP_SET_MIN( A.u16[ 4], B.u16[ 4] );
01660 SSP_SET_MIN( A.u16[ 5], B.u16[ 5] );
01661 SSP_SET_MIN( A.u16[ 6], B.u16[ 6] );
01662 SSP_SET_MIN( A.u16[ 7], B.u16[ 7] );
01663 return A.i;
01664 }
01665
01667 SSP_FORCEINLINE __m128i ssp_max_epu16_REF ( __m128i a, __m128i b )
01668 {
01669 ssp_m128 A,B;
01670 A.i = a;
01671 B.i = b;
01672
01673 SSP_SET_MAX( A.u16[ 0], B.u16[ 0] );
01674 SSP_SET_MAX( A.u16[ 1], B.u16[ 1] );
01675 SSP_SET_MAX( A.u16[ 2], B.u16[ 2] );
01676 SSP_SET_MAX( A.u16[ 3], B.u16[ 3] );
01677 SSP_SET_MAX( A.u16[ 4], B.u16[ 4] );
01678 SSP_SET_MAX( A.u16[ 5], B.u16[ 5] );
01679 SSP_SET_MAX( A.u16[ 6], B.u16[ 6] );
01680 SSP_SET_MAX( A.u16[ 7], B.u16[ 7] );
01681 return A.i;
01682 }
01683
01684
01686 SSP_FORCEINLINE __m128i ssp_min_epi32_REF( __m128i a, __m128i b )
01687 {
01688 ssp_m128 A,B;
01689 A.i = a;
01690 B.i = b;
01691
01692 SSP_SET_MIN( A.s32[ 0], B.s32[ 0] );
01693 SSP_SET_MIN( A.s32[ 1], B.s32[ 1] );
01694 SSP_SET_MIN( A.s32[ 2], B.s32[ 2] );
01695 SSP_SET_MIN( A.s32[ 3], B.s32[ 3] );
01696 return A.i;
01697 }
01698
01700 SSP_FORCEINLINE __m128i ssp_max_epi32_REF( __m128i a, __m128i b )
01701 {
01702 ssp_m128 A,B;
01703 A.i = a;
01704 B.i = b;
01705
01706 SSP_SET_MAX( A.s32[ 0], B.s32[ 0] );
01707 SSP_SET_MAX( A.s32[ 1], B.s32[ 1] );
01708 SSP_SET_MAX( A.s32[ 2], B.s32[ 2] );
01709 SSP_SET_MAX( A.s32[ 3], B.s32[ 3] );
01710 return A.i;
01711 }
01712
01714 SSP_FORCEINLINE __m128i ssp_min_epu32_REF ( __m128i a, __m128i b )
01715 {
01716 ssp_m128 A,B;
01717 A.i = a;
01718 B.i = b;
01719
01720 SSP_SET_MIN( A.u32[ 0], B.u32[ 0] );
01721 SSP_SET_MIN( A.u32[ 1], B.u32[ 1] );
01722 SSP_SET_MIN( A.u32[ 2], B.u32[ 2] );
01723 SSP_SET_MIN( A.u32[ 3], B.u32[ 3] );
01724 return A.i;
01725 }
01726
01728 SSP_FORCEINLINE __m128i ssp_max_epu32_REF ( __m128i a, __m128i b )
01729 {
01730 ssp_m128 A,B;
01731 A.i = a;
01732 B.i = b;
01733
01734 SSP_SET_MAX( A.u32[ 0], B.u32[ 0] );
01735 SSP_SET_MAX( A.u32[ 1], B.u32[ 1] );
01736 SSP_SET_MAX( A.u32[ 2], B.u32[ 2] );
01737 SSP_SET_MAX( A.u32[ 3], B.u32[ 3] );
01738 return A.i;
01739 }
01740
01741 #undef SSP_SET_MIN
01742 #undef SSP_SET_MAX
01743
01745 SSP_FORCEINLINE __m128i ssp_minpos_epu16_REF( __m128i shortValues )
01746 {
01747 ssp_m128 ShortValues;
01748 ShortValues.i = shortValues;
01749
01750 if( ShortValues.u16[1] < ShortValues.u16[0] )
01751 {
01752 ShortValues.u16[0] = ShortValues.u16[1];
01753 ShortValues.u16[1] = 1;
01754 }
01755 else
01756 ShortValues.u16[1] = 0;
01757
01758
01759 #define FN( I ) \
01760 if( ShortValues.u16[I] < ShortValues.u16[0] ) \
01761 { \
01762 ShortValues.u16[0] = ShortValues.u16[I]; \
01763 ShortValues.u16[1] = I; \
01764 }
01765
01766 FN( 2 );
01767 FN( 3 );
01768 FN( 4 );
01769 FN( 5 );
01770 FN( 6 );
01771 FN( 7 );
01772
01773 ShortValues.u32[1] = 0;
01774 ShortValues.u64[1] = 0;
01775
01776 #undef FN
01777
01778 return ShortValues.i;
01779 }
01780
01782 SSP_FORCEINLINE __m128i ssp_minpos_epu16_REFb( __m128i shortValues )
01783 {
01784 ssp_m128 ShortValues;
01785 ssp_u32 i;
01786 ssp_u16 pos = 0;
01787 ssp_u16 minVal;
01788 ShortValues.i = shortValues;
01789 minVal = ShortValues.u16[0];
01790
01791 for( i=1; i<8; ++i )
01792 {
01793 if( ShortValues.u16[i] < minVal )
01794 {
01795 minVal = ShortValues.u16[i];
01796 pos = i;
01797 }
01798
01799 ShortValues.u16[i] = 0;
01800 }
01801
01802 ShortValues.u16[0] = minVal;
01803 ShortValues.u16[1] = pos;
01804 return ShortValues.i;
01805 }
01806
01807
01808
01809
01810
01812 SSP_FORCEINLINE __m128 ssp_movehdup_ps_REF(__m128 a)
01813 {
01814 ssp_m128 A;
01815 A.f = a;
01816
01817 A.f32[0] = A.f32[1];
01818 A.f32[2] = A.f32[3];
01819 return A.f;
01820 }
01821
01823 SSP_FORCEINLINE __m128 ssp_moveldup_ps_REF(__m128 a)
01824 {
01825 ssp_m128 A;
01826 A.f = a;
01827
01828 A.f32[1] = A.f32[0];
01829 A.f32[3] = A.f32[2];
01830 return A.f;
01831 }
01832
01834 SSP_FORCEINLINE __m128d ssp_movedup_pd_REF(__m128d a)
01835 {
01836 ssp_m128 A;
01837 A.d = a;
01838
01839 A.f64[1] = A.f64[0];
01840 return A.d;
01841 }
01842
01843
01844
01845
01847 SSP_FORCEINLINE __m128i ssp_mul_epi32_REF( __m128i a, __m128i b )
01848 {
01849 ssp_m128 A,B;
01850 A.i = a;
01851 B.i = b;
01852
01853 A.s64[0] = A.s32[0] * B.s32[0];
01854 A.s64[1] = A.s32[2] * B.s32[2];
01855 return A.i;
01856 }
01857
01859 SSP_FORCEINLINE __m128i ssp_mullo_epi32_REF( __m128i a, __m128i b )
01860 {
01861 ssp_m128 t[2];
01862 ssp_m128 A,B;
01863 A.i = a;
01864 B.i = b;
01865
01866 t[0].s64[0] = A.s32[0] * B.s32[0];
01867 t[0].s64[1] = A.s32[1] * B.s32[1];
01868 t[1].s64[0] = A.s32[2] * B.s32[2];
01869 t[1].s64[1] = A.s32[3] * B.s32[3];
01870
01871 A.s32[0] = t[0].s32[0];
01872 A.s32[1] = t[0].s32[2];
01873 A.s32[2] = t[1].s32[0];
01874 A.s32[3] = t[1].s32[2];
01875 return A.i;
01876 }
01877
01879 SSP_FORCEINLINE __m128i ssp_mpsadbw_epu8_REF ( __m128i a, __m128i b, const int msk )
01880 {
01881 ssp_u8 Abyte[11], Bbyte[4], tmp[4];
01882 ssp_u8 Boffset, Aoffset;
01883 int i;
01884
01885 ssp_m128 A,B;
01886 A.i = a;
01887 B.i = b;
01888
01889 Boffset = (msk & 0x3) << 2;
01890 Aoffset = (msk & 0x4);
01891
01892 for (i=0; i<11; i++)
01893 {
01894 Abyte[i] = A.u8[i+Aoffset];
01895 }
01896
01897 Bbyte[0] = B.u8[Boffset ];
01898 Bbyte[1] = B.u8[Boffset+1];
01899 Bbyte[2] = B.u8[Boffset+2];
01900 Bbyte[3] = B.u8[Boffset+3];
01901
01902 for (i=0; i<8; i++)
01903 {
01904 tmp[0] = (Abyte[i ] > Bbyte[0]) ? (Abyte[i ] - Bbyte[0]) : (Bbyte[0] - Abyte[i ]);
01905 tmp[1] = (Abyte[i+1] > Bbyte[1]) ? (Abyte[i+1] - Bbyte[1]) : (Bbyte[1] - Abyte[i+1]);
01906 tmp[2] = (Abyte[i+2] > Bbyte[2]) ? (Abyte[i+2] - Bbyte[2]) : (Bbyte[2] - Abyte[i+2]);
01907 tmp[3] = (Abyte[i+3] > Bbyte[3]) ? (Abyte[i+3] - Bbyte[3]) : (Bbyte[3] - Abyte[i+3]);
01908
01909 A.u16[i] = tmp[0] + tmp[1] + tmp[2] + tmp[3];
01910 }
01911
01912 return A.i;
01913 }
01914
01915
01916
01917
01919 SSP_FORCEINLINE __m128i ssp_packus_epi32_REF( __m128i a, __m128i b )
01920 {
01921 ssp_m128 A,B;
01922 A.i = a;
01923 B.i = b;
01924
01925 if( A.s32[0] < 0 )
01926 A.u16[0] = 0;
01927 else
01928 if( A.s32[0] > 0xFFFF )
01929 A.u16[0] = 0xFFFF;
01930 else
01931 A.s16[0] = (ssp_u16)A.s32[0];
01932
01933 if( A.s32[1] < 0 )
01934 A.u16[1] = 0;
01935 else
01936 if( A.s32[1] > 0xFFFF )
01937 A.u16[1] = 0xFFFF;
01938 else
01939 A.s16[1] = (ssp_u16)A.s32[1];
01940
01941 if( A.s32[2] < 0 )
01942 A.u16[2] = 0;
01943 else
01944 if( A.s32[2] > 0xFFFF )
01945 A.u16[2] = 0xFFFF;
01946 else
01947 A.s16[2] = (ssp_u16)A.s32[2];
01948
01949
01950 if( A.s32[3] < 0 )
01951 A.u16[3] = 0;
01952 else
01953 if( A.s32[3] > 0xFFFF )
01954 A.u16[3] = 0xFFFF;
01955 else
01956 A.s16[3] = (ssp_u16)A.s32[3];
01957
01958 if( B.s32[0] < 0 )
01959 A.u16[4] = 0;
01960 else
01961 if( B.s32[0] > 0xFFFF )
01962 A.u16[4] = 0xFFFF;
01963 else
01964 A.s16[4] = (ssp_u16)B.s32[0];
01965
01966 if( B.s32[1] < 0 )
01967 A.u16[5] = 0;
01968 else
01969 if( B.s32[1] > 0xFFFF )
01970 A.u16[5] = 0xFFFF;
01971 else
01972 A.s16[5] = (ssp_u16)B.s32[1];
01973
01974 if( B.s32[2] < 0 )
01975 A.u16[6] = 0;
01976 else
01977 if( B.s32[2] > 0xFFFF )
01978 A.u16[6] = 0xFFFF;
01979 else
01980 A.s16[6] = (ssp_u16)B.s32[2];
01981
01982
01983 if( B.s32[3] < 0 )
01984 A.u16[7] = 0;
01985 else
01986 if( B.s32[3] > 0xFFFF )
01987 A.u16[7] = 0xFFFF;
01988 else
01989 A.s16[7] = (ssp_u16)B.s32[3];
01990
01991 return A.i;
01992 }
01993
01994
01995
01996
01997
01999 SSP_FORCEINLINE __m128d ssp_ceil_pd_REF( __m128d a )
02000 {
02001 ssp_m128 A;
02002 A.d = a;
02003
02004 A.f64[0] = ceil( A.f64[0] );
02005 A.f64[1] = ceil( A.f64[1] );
02006 return A.d;
02007 }
02008
02010 SSP_FORCEINLINE __m128 ssp_ceil_ps_REF( __m128 a )
02011 {
02012 ssp_m128 A;
02013 A.f = a;
02014
02015 A.f32[0] = (ssp_f32)ceil( A.f32[0] );
02016 A.f32[1] = (ssp_f32)ceil( A.f32[1] );
02017 A.f32[2] = (ssp_f32)ceil( A.f32[2] );
02018 A.f32[3] = (ssp_f32)ceil( A.f32[3] );
02019 return A.f;
02020 }
02021
02023 SSP_FORCEINLINE __m128d ssp_ceil_sd_REF( __m128d a, __m128d b)
02024 {
02025 ssp_m128 A,B;
02026 A.d = a;
02027 B.d = b;
02028
02029 A.f64[0] = ceil( B.f64[0] );
02030 return A.d;
02031 }
02032
02034 SSP_FORCEINLINE __m128 ssp_ceil_ss_REF( __m128 a, __m128 b)
02035 {
02036 ssp_m128 A,B;
02037 A.f = a;
02038 B.f = b;
02039
02040 A.f32[0] = (ssp_f32)ceil( B.f32[0] );
02041 return A.f;
02042 }
02043
02045 SSP_FORCEINLINE __m128d ssp_floor_pd_REF( __m128d a )
02046 {
02047 ssp_m128 A;
02048 A.d = a;
02049
02050 A.f64[0] = floor( A.f64[0] );
02051 A.f64[1] = floor( A.f64[1] );
02052 return A.d;
02053 }
02054
02056 SSP_FORCEINLINE __m128 ssp_floor_ps_REF( __m128 a )
02057 {
02058 ssp_m128 A;
02059 A.f = a;
02060
02061 A.f32[0] = (float)floor( A.f32[0] );
02062 A.f32[1] = (float)floor( A.f32[1] );
02063 A.f32[2] = (float)floor( A.f32[2] );
02064 A.f32[3] = (float)floor( A.f32[3] );
02065 return A.f;
02066 }
02067
02069 SSP_FORCEINLINE __m128d ssp_floor_sd_REF( __m128d a, __m128d b )
02070 {
02071 ssp_m128 A,B;
02072 A.d = a;
02073 B.d = b;
02074
02075 A.f64[0] = floor( B.f64[0] );
02076 return A.d;
02077 }
02078
02080 SSP_FORCEINLINE __m128 ssp_floor_ss_REF( __m128 a, __m128 b )
02081 {
02082 ssp_m128 A,B;
02083 A.f = a;
02084 B.f = b;
02085
02086 A.f32[0] = (float)floor( B.f32[0] );
02087 return A.f;
02088 }
02089
02091 SSP_FORCEINLINE __m128d ssp_round_pd_REF( __m128d val, int iRoundMode )
02092 {
02093 ssp_s64 *valPtr;
02094 ssp_m128 Val;
02095 Val.d = val;
02096
02097 switch( iRoundMode & 0x3 )
02098 {
02099 case SSP_FROUND_CUR_DIRECTION:
02100 break;
02101 case SSP_FROUND_TO_ZERO:
02102 valPtr = (ssp_s64*)(&Val.f64[0]);
02103 if( ssp_number_isValidNumber_F64_REF( valPtr ) )
02104 Val.f64[0] = (ssp_f64)( (ssp_s64)Val.f64[0] );
02105
02106 valPtr = (ssp_s64*)(&Val.f64[1]);
02107 if( ssp_number_isValidNumber_F64_REF( valPtr ) )
02108 Val.f64[1] = (ssp_f64)( (ssp_s64)Val.f64[1] );
02109 break;
02110 case SSP_FROUND_TO_POS_INF:
02111 valPtr = (ssp_s64*)(&Val.f64[0]);
02112 if( ssp_number_isValidNumber_F64_REF( valPtr ) )
02113 Val.f64[0] = ceil( Val.f64[0] );
02114
02115 valPtr = (ssp_s64*)(&Val.f64[1]);
02116 if( ssp_number_isValidNumber_F64_REF( valPtr ) )
02117 Val.f64[1] = ceil( Val.f64[1] );
02118 break;
02119 case SSP_FROUND_TO_NEG_INF:
02120 valPtr = (ssp_s64*)(&Val.f64[0]);
02121 if( ssp_number_isValidNumber_F64_REF( valPtr ) )
02122 Val.f64[0] = floor( Val.f64[0] );
02123
02124 valPtr = (ssp_s64*)(&Val.f64[1]);
02125 if( ssp_number_isValidNumber_F64_REF( valPtr ) )
02126 Val.f64[1] = floor( Val.f64[1] );
02127 break;
02128 default:
02129 valPtr = (ssp_s64*)(&Val.f64[0]);
02130 if( ssp_number_isValidNumber_F64_REF( valPtr ) )
02131 Val.f64[0] = (ssp_f64)( (Val.f64[0]>0) ? (ssp_s64)(Val.f64[0]+0.5) : (ssp_s64)(Val.f64[0]-0.5) );
02132 else
02133 Val.f64[0] = ssp_number_changeSNanToQNaN_F64_REF( valPtr );
02134
02135 valPtr = (ssp_s64*)(&Val.f64[1]);
02136 if( ssp_number_isValidNumber_F64_REF( valPtr ) )
02137 Val.f64[1] = (ssp_f64)( (Val.f64[1]>0) ? (ssp_s64)(Val.f64[1]+0.5) : (ssp_s64)(Val.f64[1]-0.5) );
02138 else
02139 Val.f64[1] = ssp_number_changeSNanToQNaN_F64_REF( valPtr );
02140 }
02141 return Val.d;
02142 }
02143
02145 SSP_FORCEINLINE __m128 ssp_round_ps_REF( __m128 val, int iRoundMode )
02146 {
02147 ssp_s32 *valPtr;
02148 ssp_m128 Val;
02149 Val.f = val;
02150
02151 switch( iRoundMode & 0x3 )
02152 {
02153 case SSP_FROUND_CUR_DIRECTION:
02154 break;
02155 case SSP_FROUND_TO_ZERO:
02156 valPtr = (ssp_s32*)(&Val.f32[0]);
02157 if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02158 {
02159 if( Val.f32[0] >= 0 )
02160 Val.f32[0] = (ssp_f32)( (ssp_s32)Val.f32[0] );
02161 else
02162 {
02163 Val.f32[0] = (ssp_f32)( (ssp_s32)Val.f32[0] );
02164
02165 }
02166 }
02167
02168 valPtr = (ssp_s32*)(&Val.f32[1]);
02169 if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02170 {
02171 if( Val.f32[1] >= 0 )
02172 Val.f32[1] = (ssp_f32)( (ssp_s32)Val.f32[1] );
02173 else
02174 {
02175 Val.f32[1] = (ssp_f32)( (ssp_s32)Val.f32[1] );
02176
02177 }
02178 }
02179
02180 valPtr = (ssp_s32*)(&Val.f32[2]);
02181 if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02182 {
02183 if( Val.f32[2] >= 0 )
02184 Val.f32[2] = (ssp_f32)( (ssp_s32)Val.f32[2] );
02185 else
02186 {
02187 Val.f32[2] = (ssp_f32)( (ssp_s32)Val.f32[2] );
02188
02189 }
02190 }
02191
02192 valPtr = (ssp_s32*)(&Val.f32[3]);
02193 if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02194 {
02195 if( Val.f32[3] >= 0 )
02196 Val.f32[3] = (ssp_f32)( (ssp_s32)Val.f32[3] );
02197 else
02198 {
02199 Val.f32[3] = (ssp_f32)( (ssp_s32)Val.f32[3] );
02200
02201 }
02202 }
02203 break;
02204 case SSP_FROUND_TO_POS_INF:
02205 valPtr = (ssp_s32*)(&Val.f32[0]);
02206 if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02207 Val.f32[0] = (ssp_f32)ceil( Val.f32[0] );
02208
02209 valPtr = (ssp_s32*)(&Val.f32[1]);
02210 if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02211 Val.f32[1] = (ssp_f32)ceil( Val.f32[1] );
02212
02213 valPtr = (ssp_s32*)(&Val.f32[2]);
02214 if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02215 Val.f32[2] = (ssp_f32)ceil( Val.f32[2] );
02216
02217 valPtr = (ssp_s32*)(&Val.f32[3]);
02218 if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02219 Val.f32[3] = (ssp_f32)ceil( Val.f32[3] );
02220 break;
02221 case SSP_FROUND_TO_NEG_INF:
02222 valPtr = (ssp_s32*)(&Val.f32[0]);
02223 if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02224 Val.f32[0] = (ssp_f32)floor( Val.f32[0] );
02225
02226 valPtr = (ssp_s32*)(&Val.f32[1]);
02227 if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02228 Val.f32[1] = (ssp_f32)floor( Val.f32[1] );
02229
02230 valPtr = (ssp_s32*)(&Val.f32[2]);
02231 if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02232 Val.f32[2] = (ssp_f32)floor( Val.f32[2] );
02233
02234 valPtr = (ssp_s32*)(&Val.f32[3]);
02235 if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02236 Val.f32[3] = (ssp_f32)floor( Val.f32[3] );
02237 break;
02238 default:
02239 valPtr = (ssp_s32*)(&Val.f32[0]);
02240 if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02241 Val.f32[0] = (ssp_f32)( (Val.f32[0]>0) ? (ssp_s32)(Val.f32[0]+0.5) : (ssp_s32)(Val.f32[0]-0.5) );
02242 else
02243 Val.f32[0] = ssp_number_changeSNanToQNaN_F32_REF( valPtr );
02244
02245 valPtr = (ssp_s32*)(&Val.f32[1]);
02246 if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02247 Val.f32[1] = (ssp_f32)( (Val.f32[1]>0) ? (ssp_s32)(Val.f32[1]+0.5) : (ssp_s32)(Val.f32[1]-0.5) );
02248 else
02249 Val.f32[1] = ssp_number_changeSNanToQNaN_F32_REF( valPtr );
02250
02251 valPtr = (ssp_s32*)(&Val.f32[2]);
02252 if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02253 Val.f32[2] = (ssp_f32)( (Val.f32[2]>0) ? (ssp_s32)(Val.f32[2]+0.5) : (ssp_s32)(Val.f32[2]-0.5) );
02254 else
02255 Val.f32[2] = ssp_number_changeSNanToQNaN_F32_REF( valPtr );
02256
02257 valPtr = (ssp_s32*)(&Val.f32[3]);
02258 if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02259 Val.f32[3] = (ssp_f32)( (Val.f32[3]>0) ? (ssp_s32)(Val.f32[3]+0.5) : (ssp_s32)(Val.f32[3]-0.5) );
02260 else
02261 Val.f32[3] = ssp_number_changeSNanToQNaN_F32_REF( valPtr );
02262 }
02263
02264 if( -0.0f == Val.f32[0] ) Val.f32[0]=+0.0f;
02265 if( -0.0f == Val.f32[1] ) Val.f32[1]=+0.0f;
02266 if( -0.0f == Val.f32[2] ) Val.f32[2]=+0.0f;
02267 if( -0.0f == Val.f32[3] ) Val.f32[3]=+0.0f;
02268
02269 return Val.f;
02270 }
02271
02273 SSP_FORCEINLINE __m128d ssp_round_sd_REF( __m128d dst, __m128d val, int iRoundMode )
02274 {
02275 ssp_s64 *valPtr;
02276 ssp_m128 Dst, Val;
02277 Dst.d = dst;
02278 Val.d = val;
02279
02280 switch( iRoundMode & 0x3 )
02281 {
02282 case SSP_FROUND_CUR_DIRECTION:
02283 break;
02284 case SSP_FROUND_TO_ZERO:
02285 valPtr = (ssp_s64*)(&Val.f64[0]);
02286 if( ssp_number_isValidNumber_F64_REF( valPtr ) )
02287 Dst.f64[0] = (ssp_f64)( (ssp_s64)Val.f64[0] );
02288 break;
02289 case SSP_FROUND_TO_POS_INF:
02290 valPtr = (ssp_s64*)(&Val.f64[0]);
02291 if( ssp_number_isValidNumber_F64_REF( valPtr ) )
02292 Dst.f64[0] = ceil( Val.f64[0] );
02293 break;
02294 case SSP_FROUND_TO_NEG_INF:
02295 valPtr = (ssp_s64*)(&Val.f64[0]);
02296 if( ssp_number_isValidNumber_F64_REF( valPtr ) )
02297 Dst.f64[0] = floor( Val.f64[0] );
02298 break;
02299 default:
02300 valPtr = (ssp_s64*)(&Val.f64[0]);
02301 if( ssp_number_isValidNumber_F64_REF( valPtr ) )
02302 Dst.f64[0] = (ssp_f64)( (Val.f64[0]>0) ? (ssp_s64)(Val.f64[0]+0.5) : (ssp_s64)(Val.f64[0]-0.5) );
02303 else
02304 Dst.f64[0] = ssp_number_changeSNanToQNaN_F64_REF( valPtr );
02305 }
02306 return Dst.d;
02307 }
02308
02310 SSP_FORCEINLINE __m128 ssp_round_ss_REF( __m128 dst, __m128 val, int iRoundMode )
02311 {
02312 ssp_s32 *valPtr;
02313 ssp_m128 Dst, Val;
02314 Dst.f = dst;
02315 Val.f = val;
02316
02317 switch( iRoundMode & 0x3 )
02318 {
02319 case SSP_FROUND_CUR_DIRECTION:
02320 break;
02321 case SSP_FROUND_TO_ZERO:
02322 valPtr = (ssp_s32*)(&Val.f32[0]);
02323 if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02324 {
02325 Dst.f32[0] = (ssp_f32)( (ssp_s32)Val.f32[0] );
02326 if( Val.f32[0] <= -0 )
02327 Dst.s32[0] = Dst.s32[0] | 0x80000000;
02328 }
02329 break;
02330 case SSP_FROUND_TO_POS_INF:
02331 valPtr = (ssp_s32*)(&Val.f32[0]);
02332 if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02333 Dst.f32[0] = (ssp_f32)ceil( Val.f32[0] );
02334 break;
02335 case SSP_FROUND_TO_NEG_INF:
02336 valPtr = (ssp_s32*)(&Val.f32[0]);
02337 if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02338 Dst.f32[0] = (ssp_f32)floor( Val.f32[0] );
02339 break;
02340 default:
02341 valPtr = (ssp_s32*)(&Val.f32[0]);
02342 if( ssp_number_isValidNumber_F32_REF( valPtr ) )
02343 Dst.f32[0] = (ssp_f32)( (Val.f32[0]>0) ? (ssp_s32)(Val.f32[0]+0.5) : (ssp_s32)(Val.f32[0]-0.5) );
02344 else
02345 Dst.f32[0] = ssp_number_changeSNanToQNaN_F32_REF( valPtr );
02346 }
02347 return Dst.f;
02348 }
02349
02350
02351
02352
02354 SSP_FORCEINLINE int ssp_testc_si128_REF( __m128i a, __m128i b)
02355 {
02356 ssp_m128 A,B;
02357 A.i = a;
02358 B.i = b;
02359
02360 return ( (A.s64[0] & B.s64[0]) == A.s64[0] ) &&
02361 ( (A.s64[1] & B.s64[1]) == A.s64[1] ) ;
02362 }
02363
02365 SSP_FORCEINLINE int ssp_testz_si128_REF( __m128i a, __m128i b)
02366 {
02367 ssp_m128 A,B;
02368 A.i = a;
02369 B.i = b;
02370
02371 return ( (A.s64[0] & B.s64[0]) == 0 ) &&
02372 ( (A.s64[1] & B.s64[1]) == 0 ) ;
02373 }
02374
02376 SSP_FORCEINLINE int ssp_testnzc_si128_REF( __m128i a, __m128i b)
02377 {
02378 int zf, cf;
02379 ssp_m128 A,B;
02380 A.i = a;
02381 B.i = b;
02382
02383 zf = ssp_testz_si128_REF( A.i, B.i);
02384
02385 cf = ( (~A.s64[0] & B.s64[0]) == 0 ) &&
02386 ( (~A.s64[1] & B.s64[1]) == 0 ) ;
02387 return ((int)!zf & (int)!cf);
02388 }
02389
02390
02391
02392
02394 SSP_FORCEINLINE __m128i ssp_cvtepi8_epi16_REF ( __m128i a)
02395 {
02396 ssp_m128 A;
02397 A.i = a;
02398
02399 A.s16[7] = A.s8[7];
02400 A.s16[6] = A.s8[6];
02401 A.s16[5] = A.s8[5];
02402 A.s16[4] = A.s8[4];
02403 A.s16[3] = A.s8[3];
02404 A.s16[2] = A.s8[2];
02405 A.s16[1] = A.s8[1];
02406 A.s16[0] = A.s8[0];
02407 return A.i;
02408 }
02409
02411 SSP_FORCEINLINE __m128i ssp_cvtepi8_epi32_REF ( __m128i a)
02412 {
02413 ssp_m128 A;
02414 A.i = a;
02415
02416 A.s32[3] = A.s8[3];
02417 A.s32[2] = A.s8[2];
02418 A.s32[1] = A.s8[1];
02419 A.s32[0] = A.s8[0];
02420 return A.i;
02421 }
02422
02424 SSP_FORCEINLINE __m128i ssp_cvtepi8_epi64_REF ( __m128i a)
02425 {
02426 ssp_m128 A;
02427 A.i = a;
02428
02429 A.s64[1] = A.s8[1];
02430 A.s64[0] = A.s8[0];
02431 return A.i;
02432 }
02433
02435 SSP_FORCEINLINE __m128i ssp_cvtepi16_epi32_REF ( __m128i a)
02436 {
02437 ssp_m128 A;
02438 A.i = a;
02439
02440 A.s32[3] = A.s16[3];
02441 A.s32[2] = A.s16[2];
02442 A.s32[1] = A.s16[1];
02443 A.s32[0] = A.s16[0];
02444 return A.i;
02445 }
02446
02448 SSP_FORCEINLINE __m128i ssp_cvtepi16_epi64_REF ( __m128i a)
02449 {
02450 ssp_m128 A;
02451 A.i = a;
02452
02453 A.s64[1] = A.s16[1];
02454 A.s64[0] = A.s16[0];
02455 return A.i;
02456 }
02457
02459 SSP_FORCEINLINE __m128i ssp_cvtepi32_epi64_REF ( __m128i a)
02460 {
02461 ssp_m128 A;
02462 A.i = a;
02463
02464 A.s64[1] = A.s32[1];
02465 A.s64[0] = A.s32[0];
02466 return A.i;
02467 }
02468
02470 SSP_FORCEINLINE __m128i ssp_cvtepu8_epi16_REF ( __m128i a)
02471 {
02472 ssp_m128 A;
02473 A.i = a;
02474
02475 A.s16[7] = A.u8[7];
02476 A.s16[6] = A.u8[6];
02477 A.s16[5] = A.u8[5];
02478 A.s16[4] = A.u8[4];
02479 A.s16[3] = A.u8[3];
02480 A.s16[2] = A.u8[2];
02481 A.s16[1] = A.u8[1];
02482 A.s16[0] = A.u8[0];
02483 return A.i;
02484 }
02485
02487 SSP_FORCEINLINE __m128i ssp_cvtepu8_epi32_REF ( __m128i a)
02488 {
02489 ssp_m128 A;
02490 A.i = a;
02491
02492 A.s32[3] = A.u8[3];
02493 A.s32[2] = A.u8[2];
02494 A.s32[1] = A.u8[1];
02495 A.s32[0] = A.u8[0];
02496 return A.i;
02497 }
02498
02500 SSP_FORCEINLINE __m128i ssp_cvtepu8_epi64_REF ( __m128i a)
02501 {
02502 ssp_m128 A;
02503 A.i = a;
02504
02505 A.s64[1] = A.u8[1];
02506 A.s64[0] = A.u8[0];
02507 return A.i;
02508 }
02509
02511 SSP_FORCEINLINE __m128i ssp_cvtepu16_epi32_REF ( __m128i a)
02512 {
02513 ssp_m128 A;
02514 A.i = a;
02515
02516 A.s32[3] = A.u16[3];
02517 A.s32[2] = A.u16[2];
02518 A.s32[1] = A.u16[1];
02519 A.s32[0] = A.u16[0];
02520 return A.i;
02521 }
02522
02524 SSP_FORCEINLINE __m128i ssp_cvtepu16_epi64_REF ( __m128i a)
02525 {
02526 ssp_m128 A;
02527 A.i = a;
02528
02529 A.s64[1] = A.u16[1];
02530 A.s64[0] = A.u16[0];
02531 return A.i;
02532 }
02533
02535 SSP_FORCEINLINE __m128i ssp_cvtepu32_epi64_REF ( __m128i a)
02536 {
02537 ssp_m128 A;
02538 A.i = a;
02539
02540 A.s64[1] = A.u32[1];
02541 A.s64[0] = A.u32[0];
02542 return A.i;
02543 }
02544
02545
02546
02548 SSP_FORCEINLINE __m128i ssp_abs_epi8_REF (__m128i a)
02549 {
02550 ssp_m128 A;
02551 A.i = a;
02552
02553 A.s8[0] = (A.s8[0] < 0) ? -A.s8[0] : A.s8[0];
02554 A.s8[1] = (A.s8[1] < 0) ? -A.s8[1] : A.s8[1];
02555 A.s8[2] = (A.s8[2] < 0) ? -A.s8[2] : A.s8[2];
02556 A.s8[3] = (A.s8[3] < 0) ? -A.s8[3] : A.s8[3];
02557 A.s8[4] = (A.s8[4] < 0) ? -A.s8[4] : A.s8[4];
02558 A.s8[5] = (A.s8[5] < 0) ? -A.s8[5] : A.s8[5];
02559 A.s8[6] = (A.s8[6] < 0) ? -A.s8[6] : A.s8[6];
02560 A.s8[7] = (A.s8[7] < 0) ? -A.s8[7] : A.s8[7];
02561 A.s8[8] = (A.s8[8] < 0) ? -A.s8[8] : A.s8[8];
02562 A.s8[9] = (A.s8[9] < 0) ? -A.s8[9] : A.s8[9];
02563 A.s8[10] = (A.s8[10]< 0) ? -A.s8[10] : A.s8[10];
02564 A.s8[11] = (A.s8[11]< 0) ? -A.s8[11] : A.s8[11];
02565 A.s8[12] = (A.s8[12]< 0) ? -A.s8[12] : A.s8[12];
02566 A.s8[13] = (A.s8[13]< 0) ? -A.s8[13] : A.s8[13];
02567 A.s8[14] = (A.s8[14]< 0) ? -A.s8[14] : A.s8[14];
02568 A.s8[15] = (A.s8[15]< 0) ? -A.s8[15] : A.s8[15];
02569
02570 return A.i;
02571 }
02572
02573
02575 SSP_FORCEINLINE __m128i ssp_abs_epi16_REF (__m128i a)
02576 {
02577 ssp_m128 A;
02578 A.i = a;
02579
02580 A.s16[0] = (A.s16[0] < 0) ? -A.s16[0] : A.s16[0];
02581 A.s16[1] = (A.s16[1] < 0) ? -A.s16[1] : A.s16[1];
02582 A.s16[2] = (A.s16[2] < 0) ? -A.s16[2] : A.s16[2];
02583 A.s16[3] = (A.s16[3] < 0) ? -A.s16[3] : A.s16[3];
02584 A.s16[4] = (A.s16[4] < 0) ? -A.s16[4] : A.s16[4];
02585 A.s16[5] = (A.s16[5] < 0) ? -A.s16[5] : A.s16[5];
02586 A.s16[6] = (A.s16[6] < 0) ? -A.s16[6] : A.s16[6];
02587 A.s16[7] = (A.s16[7] < 0) ? -A.s16[7] : A.s16[7];
02588
02589 return A.i;
02590 }
02591
02593 SSP_FORCEINLINE __m128i ssp_abs_epi32_REF (__m128i a)
02594 {
02595 ssp_m128 A;
02596 A.i = a;
02597
02598 A.s32[0] = (A.s32[0] < 0) ? -A.s32[0] : A.s32[0];
02599 A.s32[1] = (A.s32[1] < 0) ? -A.s32[1] : A.s32[1];
02600 A.s32[2] = (A.s32[2] < 0) ? -A.s32[2] : A.s32[2];
02601 A.s32[3] = (A.s32[3] < 0) ? -A.s32[3] : A.s32[3];
02602
02603 return A.i;
02604 }
02605
02609 SSP_FORCEINLINE __m64 ssp_abs_pi8_REF (__m64 a)
02610 {
02611 ssp_m64 A;
02612 A.m64 = a;
02613
02614 A.s8[0] = (A.s8[0] < 0) ? -A.s8[0] : A.s8[0];
02615 A.s8[1] = (A.s8[1] < 0) ? -A.s8[1] : A.s8[1];
02616 A.s8[2] = (A.s8[2] < 0) ? -A.s8[2] : A.s8[2];
02617 A.s8[3] = (A.s8[3] < 0) ? -A.s8[3] : A.s8[3];
02618 A.s8[4] = (A.s8[4] < 0) ? -A.s8[4] : A.s8[4];
02619 A.s8[5] = (A.s8[5] < 0) ? -A.s8[5] : A.s8[5];
02620 A.s8[6] = (A.s8[6] < 0) ? -A.s8[6] : A.s8[6];
02621 A.s8[7] = (A.s8[7] < 0) ? -A.s8[7] : A.s8[7];
02622
02623 return A.m64;
02624 }
02625
02626
02630 SSP_FORCEINLINE __m64 ssp_abs_pi16_REF (__m64 a)
02631 {
02632 ssp_m64 A;
02633 A.m64 = a;
02634
02635 A.s16[0] = (A.s16[0] < 0) ? -A.s16[0] : A.s16[0];
02636 A.s16[1] = (A.s16[1] < 0) ? -A.s16[1] : A.s16[1];
02637 A.s16[2] = (A.s16[2] < 0) ? -A.s16[2] : A.s16[2];
02638 A.s16[3] = (A.s16[3] < 0) ? -A.s16[3] : A.s16[3];
02639
02640 return A.m64;
02641 }
02642
02643
02647 SSP_FORCEINLINE __m64 ssp_abs_pi32_REF (__m64 a)
02648 {
02649 ssp_m64 A;
02650 A.m64 = a;
02651
02652 A.s32[0] = (A.s32[0] < 0) ? -A.s32[0] : A.s32[0];
02653 A.s32[1] = (A.s32[1] < 0) ? -A.s32[1] : A.s32[1];
02654
02655 return A.m64;
02656 }
02657
02658
02659
02661 SSP_FORCEINLINE __m128i ssp_alignr_epi8_REF (__m128i a, __m128i b, const int ralign)
02662 {
02663 ssp_m128 C[3];
02664 ssp_s8 * tmp;
02665 int i, j;
02666
02667 if (ralign <0) return b;
02668 C[2].i = _mm_setzero_si128();
02669 if (ralign > 32) return C[2].i;
02670 C[1].i = a;
02671 C[0].i = b;
02672 tmp = & (C[0].s8[0]);
02673
02674 for (i=ralign+15, j=15; i >=ralign; i--, j--) {
02675 C[2].s8[j] = tmp[i];
02676 }
02677
02678 return C[2].i;
02679 }
02680
02684 SSP_FORCEINLINE __m64 ssp_alignr_pi8_REF (__m64 a, __m64 b, const int ralign)
02685 {
02686 ssp_m64 C[3];
02687 ssp_s8 * tmp;
02688 int i, j;
02689
02690 if (ralign <0) return b;
02691 C[2].u32[0] = 0;
02692 C[2].u32[1] = 0;
02693 if (ralign > 16) return C[2].m64;
02694 C[1].m64 = a;
02695 C[0].m64 = b;
02696 tmp = & (C[0].s8[0]);
02697
02698 for (i=ralign+7, j=7; i >=ralign; i--, j--) {
02699 C[2].s8[j] = tmp[i];
02700 }
02701
02702 return C[2].m64;
02703 }
02704
02705
02707 SSP_FORCEINLINE __m128i ssp_shuffle_epi8_REF (__m128i a, __m128i mask)
02708 {
02709 ssp_m128 A, MSK, B;
02710 A.i = a;
02711 MSK.i = mask;
02712
02713 B.s8[0] = (MSK.s8[0] & 0x80) ? 0 : A.s8[(MSK.s8[0] & 0xf)];
02714 B.s8[1] = (MSK.s8[1] & 0x80) ? 0 : A.s8[(MSK.s8[1] & 0xf)];
02715 B.s8[2] = (MSK.s8[2] & 0x80) ? 0 : A.s8[(MSK.s8[2] & 0xf)];
02716 B.s8[3] = (MSK.s8[3] & 0x80) ? 0 : A.s8[(MSK.s8[3] & 0xf)];
02717 B.s8[4] = (MSK.s8[4] & 0x80) ? 0 : A.s8[(MSK.s8[4] & 0xf)];
02718 B.s8[5] = (MSK.s8[5] & 0x80) ? 0 : A.s8[(MSK.s8[5] & 0xf)];
02719 B.s8[6] = (MSK.s8[6] & 0x80) ? 0 : A.s8[(MSK.s8[6] & 0xf)];
02720 B.s8[7] = (MSK.s8[7] & 0x80) ? 0 : A.s8[(MSK.s8[7] & 0xf)];
02721 B.s8[8] = (MSK.s8[8] & 0x80) ? 0 : A.s8[(MSK.s8[8] & 0xf)];
02722 B.s8[9] = (MSK.s8[9] & 0x80) ? 0 : A.s8[(MSK.s8[9] & 0xf)];
02723 B.s8[10] = (MSK.s8[10] & 0x80) ? 0 : A.s8[(MSK.s8[10] & 0xf)];
02724 B.s8[11] = (MSK.s8[11] & 0x80) ? 0 : A.s8[(MSK.s8[11] & 0xf)];
02725 B.s8[12] = (MSK.s8[12] & 0x80) ? 0 : A.s8[(MSK.s8[12] & 0xf)];
02726 B.s8[13] = (MSK.s8[13] & 0x80) ? 0 : A.s8[(MSK.s8[13] & 0xf)];
02727 B.s8[14] = (MSK.s8[14] & 0x80) ? 0 : A.s8[(MSK.s8[14] & 0xf)];
02728 B.s8[15] = (MSK.s8[15] & 0x80) ? 0 : A.s8[(MSK.s8[15] & 0xf)];
02729
02730 return B.i;
02731 }
02732
02736 SSP_FORCEINLINE __m64 ssp_shuffle_pi8_REF (__m64 a, __m64 mask)
02737 {
02738 ssp_m64 A, MSK, B;
02739 A.m64 = a;
02740 MSK.m64 = mask;
02741
02742 B.s8[0] = (MSK.s8[0] & 0x80) ? 0 : A.s8[(MSK.s8[0] & 0xf)];
02743 B.s8[1] = (MSK.s8[1] & 0x80) ? 0 : A.s8[(MSK.s8[1] & 0xf)];
02744 B.s8[2] = (MSK.s8[2] & 0x80) ? 0 : A.s8[(MSK.s8[2] & 0xf)];
02745 B.s8[3] = (MSK.s8[3] & 0x80) ? 0 : A.s8[(MSK.s8[3] & 0xf)];
02746 B.s8[4] = (MSK.s8[4] & 0x80) ? 0 : A.s8[(MSK.s8[4] & 0xf)];
02747 B.s8[5] = (MSK.s8[5] & 0x80) ? 0 : A.s8[(MSK.s8[5] & 0xf)];
02748 B.s8[6] = (MSK.s8[6] & 0x80) ? 0 : A.s8[(MSK.s8[6] & 0xf)];
02749 B.s8[7] = (MSK.s8[7] & 0x80) ? 0 : A.s8[(MSK.s8[7] & 0xf)];
02750
02751 return B.m64;
02752 }
02753
02754
02755
02757 SSP_FORCEINLINE __m128i ssp_sign_epi8_REF (__m128i a, __m128i b)
02758 {
02759 ssp_m128 A, B;
02760 A.i = a;
02761 B.i = b;
02762
02763 A.s8[0] = (B.s8[0]<0) ? (-A.s8[0]) :((B.s8[0]==0) ? 0: A.s8[0]);
02764 A.s8[1] = (B.s8[1]<0) ? (-A.s8[1]) :((B.s8[1]==0) ? 0: A.s8[1]);
02765 A.s8[2] = (B.s8[2]<0) ? (-A.s8[2]) :((B.s8[2]==0) ? 0: A.s8[2]);
02766 A.s8[3] = (B.s8[3]<0) ? (-A.s8[3]) :((B.s8[3]==0) ? 0: A.s8[3]);
02767 A.s8[4] = (B.s8[4]<0) ? (-A.s8[4]) :((B.s8[4]==0) ? 0: A.s8[4]);
02768 A.s8[5] = (B.s8[5]<0) ? (-A.s8[5]) :((B.s8[5]==0) ? 0: A.s8[5]);
02769 A.s8[6] = (B.s8[6]<0) ? (-A.s8[6]) :((B.s8[6]==0) ? 0: A.s8[6]);
02770 A.s8[7] = (B.s8[7]<0) ? (-A.s8[7]) :((B.s8[7]==0) ? 0: A.s8[7]);
02771 A.s8[8] = (B.s8[8]<0) ? (-A.s8[8]) :((B.s8[8]==0) ? 0: A.s8[8]);
02772 A.s8[9] = (B.s8[9]<0) ? (-A.s8[9]) :((B.s8[9]==0) ? 0: A.s8[9]);
02773 A.s8[10] = (B.s8[10]<0) ? (-A.s8[10]) :((B.s8[10]==0)? 0: A.s8[10]);
02774 A.s8[11] = (B.s8[11]<0) ? (-A.s8[11]) :((B.s8[11]==0)? 0: A.s8[11]);
02775 A.s8[12] = (B.s8[12]<0) ? (-A.s8[12]) :((B.s8[12]==0)? 0: A.s8[12]);
02776 A.s8[13] = (B.s8[13]<0) ? (-A.s8[13]) :((B.s8[13]==0)? 0: A.s8[13]);
02777 A.s8[14] = (B.s8[14]<0) ? (-A.s8[14]) :((B.s8[14]==0)? 0: A.s8[14]);
02778 A.s8[15] = (B.s8[15]<0) ? (-A.s8[15]) :((B.s8[15]==0)? 0: A.s8[15]);
02779
02780 return A.i;
02781 }
02782
02783
02785 SSP_FORCEINLINE __m128i ssp_sign_epi16_REF (__m128i a, __m128i b)
02786 {
02787 ssp_m128 A, B;
02788 A.i = a;
02789 B.i = b;
02790
02791 A.s16[0] = (B.s16[0]<0) ? (-A.s16[0]) :((B.s16[0]==0) ? 0: A.s16[0]);
02792 A.s16[1] = (B.s16[1]<0) ? (-A.s16[1]) :((B.s16[1]==0) ? 0: A.s16[1]);
02793 A.s16[2] = (B.s16[2]<0) ? (-A.s16[2]) :((B.s16[2]==0) ? 0: A.s16[2]);
02794 A.s16[3] = (B.s16[3]<0) ? (-A.s16[3]) :((B.s16[3]==0) ? 0: A.s16[3]);
02795 A.s16[4] = (B.s16[4]<0) ? (-A.s16[4]) :((B.s16[4]==0) ? 0: A.s16[4]);
02796 A.s16[5] = (B.s16[5]<0) ? (-A.s16[5]) :((B.s16[5]==0) ? 0: A.s16[5]);
02797 A.s16[6] = (B.s16[6]<0) ? (-A.s16[6]) :((B.s16[6]==0) ? 0: A.s16[6]);
02798 A.s16[7] = (B.s16[7]<0) ? (-A.s16[7]) :((B.s16[7]==0) ? 0: A.s16[7]);
02799
02800 return A.i;
02801 }
02802
02803
02805 SSP_FORCEINLINE __m128i ssp_sign_epi32_REF (__m128i a, __m128i b)
02806 {
02807 ssp_m128 A, B;
02808 A.i = a;
02809 B.i = b;
02810
02811 A.s32[0] = (B.s32[0]<0) ? (-A.s32[0]) :((B.s32[0]==0) ? 0: A.s32[0]);
02812 A.s32[1] = (B.s32[1]<0) ? (-A.s32[1]) :((B.s32[1]==0) ? 0: A.s32[1]);
02813 A.s32[2] = (B.s32[2]<0) ? (-A.s32[2]) :((B.s32[2]==0) ? 0: A.s32[2]);
02814 A.s32[3] = (B.s32[3]<0) ? (-A.s32[3]) :((B.s32[3]==0) ? 0: A.s32[3]);
02815
02816 return A.i;
02817 }
02818
02819
02821 SSP_FORCEINLINE __m64 ssp_sign_pi8_REF (__m64 a, __m64 b)
02822 {
02823 ssp_m64 A, B;
02824 A.m64 = a;
02825 B.m64 = b;
02826
02827 A.s8[0] = (B.s8[0]<0) ? (-A.s8[0]) :((B.s8[0]==0) ? 0: A.s8[0]);
02828 A.s8[1] = (B.s8[1]<0) ? (-A.s8[1]) :((B.s8[1]==0) ? 0: A.s8[1]);
02829 A.s8[2] = (B.s8[2]<0) ? (-A.s8[2]) :((B.s8[2]==0) ? 0: A.s8[2]);
02830 A.s8[3] = (B.s8[3]<0) ? (-A.s8[3]) :((B.s8[3]==0) ? 0: A.s8[3]);
02831 A.s8[4] = (B.s8[4]<0) ? (-A.s8[4]) :((B.s8[4]==0) ? 0: A.s8[4]);
02832 A.s8[5] = (B.s8[5]<0) ? (-A.s8[5]) :((B.s8[5]==0) ? 0: A.s8[5]);
02833 A.s8[6] = (B.s8[6]<0) ? (-A.s8[6]) :((B.s8[6]==0) ? 0: A.s8[6]);
02834 A.s8[7] = (B.s8[7]<0) ? (-A.s8[7]) :((B.s8[7]==0) ? 0: A.s8[7]);
02835
02836 return A.m64;
02837 }
02838
02839
02843 SSP_FORCEINLINE __m64 ssp_sign_pi16_REF (__m64 a, __m64 b)
02844 {
02845 ssp_m64 A, B;
02846 A.m64 = a;
02847 B.m64 = b;
02848
02849 A.s16[0] = (B.s16[0]<0) ? (-A.s16[0]) :((B.s16[0]==0) ? 0: A.s16[0]);
02850 A.s16[1] = (B.s16[1]<0) ? (-A.s16[1]) :((B.s16[1]==0) ? 0: A.s16[1]);
02851 A.s16[2] = (B.s16[2]<0) ? (-A.s16[2]) :((B.s16[2]==0) ? 0: A.s16[2]);
02852 A.s16[3] = (B.s16[3]<0) ? (-A.s16[3]) :((B.s16[3]==0) ? 0: A.s16[3]);
02853
02854 return A.m64;
02855 }
02856
02857
02861 SSP_FORCEINLINE __m64 ssp_sign_pi32_REF (__m64 a, __m64 b)
02862 {
02863 ssp_m64 A, B;
02864 A.m64 = a;
02865 B.m64 = b;
02866
02867 A.s32[0] = (B.s32[0]<0) ? (-A.s32[0]) :((B.s32[0]==0) ? 0: A.s32[0]);
02868 A.s32[1] = (B.s32[1]<0) ? (-A.s32[1]) :((B.s32[1]==0) ? 0: A.s32[1]);
02869
02870 return A.m64;
02871 }
02872
02874 SSP_FORCEINLINE void ssp_stream_sd_REF( double *dst ,__m128d src )
02875 {
02876 ssp_m128 SRC;
02877 SRC.d = src;
02878 *dst = SRC.f64[0];
02879 }
02880
02882 SSP_FORCEINLINE void ssp_stream_ss_REF( float *dst, __m128 src )
02883 {
02884 ssp_m128 SRC;
02885 SRC.f = src;
02886 *dst = SRC.f32[0];
02887 }
02888
02889
02890
02891
02893 SSP_FORCEINLINE unsigned short ssp_lzcnt16_REF( unsigned short val )
02894 {
02895
02896 if( !val )
02897 return 16;
02898
02899 else if( val > 0x00FF )
02900 {
02901 if( val > 0x0FFF )
02902 {
02903 if( val > 0x3FFF )
02904 {
02905 if( val > 0x7FFF )
02906 return 0;
02907 else
02908 return 1;
02909 }
02910 else
02911 {
02912 if( val > 0x1FFF )
02913 return 2;
02914 else
02915 return 3;
02916 }
02917 }
02918 else
02919 {
02920 if( val > 0x03FF )
02921 {
02922 if( val > 0x07FF )
02923 return 4;
02924 else
02925 return 5;
02926 }
02927 else
02928 {
02929 if( val > 0x01FF )
02930 return 6;
02931 else
02932 return 7;
02933 }
02934 }
02935 }
02936 else
02937 {
02938 if( val > 0x000F )
02939 {
02940 if( val > 0x003F )
02941 {
02942 if( val > 0x007F )
02943 return 8;
02944 else
02945 return 9;
02946 }
02947 else
02948 {
02949 if( val > 0x001F)
02950 return 10;
02951 else
02952 return 11;
02953 }
02954 }
02955 else
02956 {
02957 if( val > 0x0003 )
02958 {
02959 if( val > 0x0007 )
02960 return 12;
02961 else
02962 return 13;
02963 }
02964 else
02965 {
02966 if( val > 0x0001)
02967 return 14;
02968 else
02969 return 15;
02970 }
02971 }
02972 }
02973 }
02975 SSP_FORCEINLINE unsigned int ssp_lzcnt_REF( unsigned int val )
02976 {
02977 ssp_u32 cnt;
02978 cnt = ssp_lzcnt16_REF( (ssp_u16)(val>>16) );
02979 if( cnt == 16 )
02980 cnt += ssp_lzcnt16_REF( (ssp_u16)(val & 0x0000FFFF) );
02981 return cnt;
02982 }
02984 SSP_FORCEINLINE ssp_u64 ssp_lzcnt64_REF( ssp_u64 val )
02985 {
02986 ssp_u64 cnt;
02987 cnt = ssp_lzcnt_REF( (ssp_u32)(val>>32) );
02988 if( cnt == 32 )
02989 cnt += ssp_lzcnt_REF( (ssp_u32)(val & 0x00000000FFFFFFFF) );
02990 return cnt;
02991 }
02992
02993
02994
02995
02997 SSP_FORCEINLINE unsigned short ssp_popcnt16_REF( unsigned short val )
02998 {
02999 int i;
03000 ssp_u16 cnt=0;
03001 for( i=0; i<15, val; ++i, val = val>>1 )
03002 cnt += val & 0x1;
03003 return cnt;
03004 }
03006 SSP_FORCEINLINE unsigned int ssp_popcnt_REF( unsigned int val )
03007 {
03008 int i;
03009 ssp_u32 cnt = 0;
03010 for( i=0; i<31, val; ++i, val = val>>1 )
03011 cnt += val & 0x1;
03012 return cnt;
03013 }
03015 SSP_FORCEINLINE ssp_u64 ssp_popcnt64_REF( ssp_u64 val )
03016 {
03017 int i;
03018 ssp_u64 cnt = 0;
03019 for( i=0; i<63, val; ++i, val = val>>1 )
03020 cnt += val & 0x1;
03021 return cnt;
03022 }
03023
03024
03025
03026
03027
03029 SSP_FORCEINLINE __m128i ssp_perm_epi8_REF(__m128i a, __m128i b, __m128i c)
03030 {
03031 int n;
03032 ssp_m128 A,B,C,R;
03033 A.i = a;
03034 B.i = b;
03035 C.i = c;
03036
03037 for( n = 0; n < 16; n++ )
03038 {
03039 int op = C.u8[n] >> 5;
03040 switch( op )
03041 {
03042 case 0:
03043 R.u8[n] = ( C.u8[n] & 0x10 ) ? ( B.u8[C.u8[n] & 0xF] ) : ( A.u8[C.u8[n] & 0xF] );
03044 break;
03045 case 1:
03046 {
03047 ssp_u8 src = ( C.u8[n] & 0x10 ) ? ( B.u8[C.u8[n] & 0xF] ) : ( A.u8[C.u8[n] & 0xF] );
03048 R.u8[n] = ~src;
03049 }
03050 break;
03051 case 2:
03052 {
03053 ssp_u8 src = ( C.u8[n] & 0x10 ) ? ( B.u8[C.u8[n] & 0xF] ) : ( A.u8[C.u8[n] & 0xF] );
03054 R.u8[n] = ( (src & 0x0F) << 4 ) | ( (src & 0xF0) >> 4 );
03055 R.u8[n] = ( (R.u8[n] & 0x33) << 2 ) | ( (R.u8[n] & 0xCC) >> 2 );
03056 R.u8[n] = ( (R.u8[n] & 0x55) << 1 ) | ( (R.u8[n] & 0xAA) >> 1 );
03057 }
03058 break;
03059 case 3:
03060 {
03061 ssp_u8 src = ( C.u8[n] & 0x10 ) ? ( B.u8[C.u8[n] & 0xF] ) : ( A.u8[C.u8[n] & 0xF] );
03062 R.u8[n] = ( (src & 0x0F) << 4 ) | ( (src & 0xF0) >> 4 );
03063 R.u8[n] = ( (R.u8[n] & 0x33) << 2 ) | ( (R.u8[n] & 0xCC) >> 2 );
03064 R.u8[n] = ( (R.u8[n] & 0x55) << 1 ) | ( (R.u8[n] & 0xAA) >> 1 );
03065 R.u8[n] = ~R.u8[n];
03066 }
03067 break;
03068 case 4:
03069 R.u8[n] = 0x00;
03070 break;
03071 case 5:
03072 R.u8[n] = 0xFF;
03073 break;
03074 case 6:
03075 {
03076 ssp_s8 src = ( C.u8[n] & 0x10 ) ? ( B.s8[C.u8[n] & 0xF] ) : ( A.s8[C.u8[n] & 0xF] );
03077 R.s8[n] = src >> 7;
03078 }
03079 break;
03080 case 7:
03081 {
03082 ssp_s8 src = ( C.u8[n] & 0x10 ) ? ( B.s8[C.u8[n] & 0xF] ) : ( A.s8[C.u8[n] & 0xF] );
03083 R.s8[n] = src >> 7;
03084 R.u8[n] = ~R.u8[n];
03085 }
03086 break;
03087 }
03088 }
03089 return R.i;
03090 }
03092 SSP_FORCEINLINE __m128 ssp_perm_ps_REF(__m128 a, __m128 b, __m128i c)
03093 {
03094 int n;
03095 ssp_m128 A,B,C,R;
03096 A.f = a;
03097 B.f = b;
03098 C.i = c;
03099
03100 for( n = 0; n < 4; n++ )
03101 {
03102 unsigned char cb = C.u8[n*4];
03103 int op = (cb >> 5) & 0x7;
03104 switch( op )
03105 {
03106 case 0:
03107 R.f32[n] = ( cb & 0x04 ) ? ( B.f32[cb & 0x03] ) : ( A.f32[cb & 0x03] );
03108 break;
03109 case 1:
03110 {
03111 ssp_f32 src = ( cb & 0x04 ) ? ( B.f32[cb & 0x03] ) : ( A.f32[cb & 0x03] );
03112 R.f32[n] = ( src < 0.0f ) ? (-src) : src;
03113 }
03114 break;
03115 case 2:
03116 {
03117 ssp_f32 src = ( cb & 0x04 ) ? ( B.f32[cb & 0x03] ) : ( A.f32[cb & 0x03] );
03118 R.f32[n] = -src;
03119 }
03120 break;
03121 case 3:
03122 {
03123 ssp_f32 src = ( cb & 0x04 ) ? ( B.f32[cb & 0x03] ) : ( A.f32[cb & 0x03] );
03124 R.f32[n] = ( src < 0.0f ) ? src : (-src);
03125 }
03126 break;
03127 case 4:
03128 R.f32[n] = 0.0f;
03129 break;
03130 case 5:
03131 R.f32[n] = -1.0f;
03132 break;
03133 case 6:
03134 R.f32[n] = 1.0f;
03135 break;
03136 case 7:
03137 R.u32[n] = 0x40490FDB;
03138 break;
03139 }
03140 }
03141 return R.f;
03142 }
03144 SSP_FORCEINLINE __m128d ssp_perm_pd_REF(__m128d a, __m128d b, __m128i c)
03145 {
03146 int n;
03147 ssp_m128 A,B,C,R;
03148 A.d = a;
03149 B.d = b;
03150 C.i = c;
03151
03152 for( n = 0; n < 2; n++ )
03153 {
03154 unsigned char cb = C.u8[n*8];
03155 int op = (cb >> 5) & 0x7;
03156 switch( op )
03157 {
03158 case 0:
03159 R.f64[n] = ( cb & 0x02 ) ? ( B.f64[cb & 0x01] ) : ( A.f64[cb & 0x01] );
03160 break;
03161 case 1:
03162 {
03163 ssp_f64 src = ( cb & 0x02 ) ? ( B.f64[cb & 0x01] ) : ( A.f64[cb & 0x01] );
03164 R.f64[n] = ( src < 0.0 ) ? (-src) : src;
03165 }
03166 break;
03167 case 2:
03168 {
03169 ssp_f64 src = ( cb & 0x02 ) ? ( B.f64[cb & 0x01] ) : ( A.f64[cb & 0x01] );
03170 R.f64[n] = -src;
03171 }
03172 break;
03173 case 3:
03174 {
03175 ssp_f64 src = ( cb & 0x02 ) ? ( B.f64[cb & 0x01] ) : ( A.f64[cb & 0x01] );
03176 R.f64[n] = ( src < 0.0 ) ? src : (-src);
03177 }
03178 break;
03179 case 4:
03180 R.f64[n] = 0.0;
03181 break;
03182 case 5:
03183 R.f64[n] = -1.0;
03184 break;
03185 case 6:
03186 R.f64[n] = 1.0;
03187 break;
03188 case 7:
03189 R.u64[n] = 0x400921FB54442D18;
03190 break;
03191 }
03192 }
03193 return R.d;
03194 }
03195
03196
03197
03198
03199
03201 SSP_FORCEINLINE __m128i ssp_rot_epi8_REF(__m128i a, __m128i b )
03202 {
03203 int n;
03204 ssp_m128 A,B;
03205 A.i = a;
03206 B.i = b;
03207
03208 for( n = 0; n < 16; n++ )
03209 {
03210 if( B.s8[n] < 0 )
03211 {
03212 unsigned int count = (-B.s8[n]) % 8;
03213 unsigned int carry_count = (8 - count) % 8;
03214 ssp_u8 carry = A.u8[n] << carry_count;
03215 A.u8[n] = A.u8[n] >> count;
03216 A.u8[n] = A.u8[n] | carry;
03217 }
03218 else
03219 {
03220 unsigned int count = B.s8[n] % 8;
03221 unsigned int carry_count = (8 - count) % 8;
03222 ssp_u8 carry = A.u8[n] >> carry_count;
03223 A.u8[n] = A.u8[n] << count;
03224 A.u8[n] = A.u8[n] | carry;
03225 }
03226 }
03227 return A.i;
03228 }
03230 SSP_FORCEINLINE __m128i ssp_rot_epi16_REF(__m128i a, __m128i b )
03231 {
03232 int n;
03233 ssp_m128 A,B;
03234 A.i = a;
03235 B.i = b;
03236
03237 for( n = 0; n < 8; n++ )
03238 {
03239 if( B.s16[n] < 0 )
03240 {
03241 unsigned int count = (-B.s16[n]) % 16;
03242 unsigned int carry_count = (16 - count) % 16;
03243 ssp_u16 carry = A.u16[n] << carry_count;
03244 A.u16[n] = A.u16[n] >> count;
03245 A.u16[n] = A.u16[n] | carry;
03246 }
03247 else
03248 {
03249 unsigned int count = B.s16[n] % 8;
03250 unsigned int carry_count = (16 - count) % 16;
03251 ssp_u16 carry = A.u16[n] >> carry_count;
03252 A.u16[n] = A.u16[n] << count;
03253 A.u16[n] = A.u16[n] | carry;
03254 }
03255 }
03256 return A.i;
03257 }
03259 SSP_FORCEINLINE __m128i ssp_rot_epi32_REF(__m128i a, __m128i b )
03260 {
03261 int n;
03262 ssp_m128 A,B;
03263 A.i = a;
03264 B.i = b;
03265
03266 for( n = 0; n < 4; n++ )
03267 {
03268 if( B.s32[n] < 0 )
03269 {
03270 unsigned int count = (-B.s32[n]) % 32;
03271 unsigned int carry_count = (32 - count) % 32;
03272 ssp_u32 carry = A.u32[n] << carry_count;
03273 A.u32[n] = A.u32[n] >> count;
03274 A.u32[n] = A.u32[n] | carry;
03275 }
03276 else
03277 {
03278 unsigned int count = B.s32[n] % 32;
03279 unsigned int carry_count = (32 - count) % 32;
03280 ssp_u32 carry = A.u32[n] >> carry_count;
03281 A.u32[n] = A.u32[n] << count;
03282 A.u32[n] = A.u32[n] | carry;
03283 }
03284 }
03285 return A.i;
03286 }
03288 SSP_FORCEINLINE __m128i ssp_rot_epi64_REF(__m128i a, __m128i b )
03289 {
03290 int n;
03291 ssp_m128 A,B;
03292 A.i = a;
03293 B.i = b;
03294
03295 for( n = 0; n < 2; n++ )
03296 {
03297 if( B.s64[n] < 0 )
03298 {
03299 unsigned int count = (unsigned int)((-B.s64[n]) % 64);
03300 unsigned int carry_count = (64 - count) % 64;
03301 ssp_u64 carry = A.u64[n] << carry_count;
03302 A.u64[n] = A.u64[n] >> count;
03303 A.u64[n] = A.u64[n] | carry;
03304 }
03305 else
03306 {
03307 unsigned int count = (unsigned int)(B.s64[n] % 64);
03308 unsigned int carry_count = (64 - count) % 64;
03309 ssp_u64 carry = A.u64[n] >> carry_count;
03310 A.u64[n] = A.u64[n] << count;
03311 A.u64[n] = A.u64[n] | carry;
03312 }
03313 }
03314 return A.i;
03315 }
03317 SSP_FORCEINLINE __m128i ssp_roti_epi8_REF(__m128i a, const int b)
03318 {
03319 int n;
03320 ssp_m128 A;
03321 A.i = a;
03322
03323 if( b < 0 )
03324 {
03325 unsigned int count = (-b) % 8;
03326 unsigned int carry_count = (8 - count) % 8;
03327 for( n = 0; n < 16; n++ )
03328 {
03329 ssp_u8 carry = A.u8[n] << carry_count;
03330 A.u8[n] = A.u8[n] >> count;
03331 A.u8[n] = A.u8[n] | carry;
03332 }
03333 }
03334 else
03335 {
03336 unsigned int count = b % 8;
03337 unsigned int carry_count = (8 - count) % 8;
03338 for( n = 0; n < 16; n++ )
03339 {
03340 ssp_u8 carry = A.u8[n] >> carry_count;
03341 A.u8[n] = A.u8[n] << count;
03342 A.u8[n] = A.u8[n] | carry;
03343 }
03344 }
03345 return A.i;
03346 }
03348 SSP_FORCEINLINE __m128i ssp_roti_epi16_REF(__m128i a, const int b)
03349 {
03350 int n;
03351 ssp_m128 A;
03352 A.i = a;
03353
03354 if( b < 0 )
03355 {
03356 unsigned int count = (-b) % 16;
03357 unsigned int carry_count = (16 - count) % 16;
03358 for( n = 0; n < 8; n++ )
03359 {
03360 ssp_u16 carry = A.u16[n] << carry_count;
03361 A.u16[n] = A.u16[n] >> count;
03362 A.u16[n] = A.u16[n] | carry;
03363 }
03364 }
03365 else
03366 {
03367 unsigned int count = b % 16;
03368 unsigned int carry_count = (16 - count) % 16;
03369 for( n = 0; n < 8; n++ )
03370 {
03371 ssp_u16 carry = A.u16[n] >> carry_count;
03372 A.u16[n] = A.u16[n] << count;
03373 A.u16[n] = A.u16[n] | carry;
03374 }
03375 }
03376 return A.i;
03377 }
03379 SSP_FORCEINLINE __m128i ssp_roti_epi32_REF(__m128i a, const int b)
03380 {
03381 int n;
03382 ssp_m128 A;
03383 A.i = a;
03384
03385 if( b < 0 )
03386 {
03387 unsigned int count = (-b) % 32;
03388 unsigned int carry_count = (32 - count) % 32;
03389 for( n = 0; n < 4; n++ )
03390 {
03391 ssp_u32 carry = A.u32[n] << carry_count;
03392 A.u32[n] = A.u32[n] >> count;
03393 A.u32[n] = A.u32[n] | carry;
03394 }
03395 }
03396 else
03397 {
03398 unsigned int count = b % 32;
03399 unsigned int carry_count = (32 - count) % 32;
03400 for( n = 0; n < 4; n++ )
03401 {
03402 ssp_u32 carry = A.u32[n] >> carry_count;
03403 A.u32[n] = A.u32[n] << count;
03404 A.u32[n] = A.u32[n] | carry;
03405 }
03406 }
03407 return A.i;
03408 }
03410 SSP_FORCEINLINE __m128i ssp_roti_epi64_REF(__m128i a, const int b)
03411 {
03412 int n;
03413 ssp_m128 A;
03414 A.i = a;
03415
03416 if( b < 0 )
03417 {
03418 unsigned int count = (-b) % 64;
03419 unsigned int carry_count = (64 - count) % 64;
03420 for( n = 0; n < 2; n++ )
03421 {
03422 ssp_u64 carry = A.u64[n] << carry_count;
03423 A.u64[n] = A.u64[n] >> count;
03424 A.u64[n] = A.u64[n] | carry;
03425 }
03426 }
03427 else
03428 {
03429 unsigned int count = b % 64;
03430 unsigned int carry_count = (64 - count) % 64;
03431 for( n = 0; n < 2; n++ )
03432 {
03433 ssp_u64 carry = A.u64[n] >> carry_count;
03434 A.u64[n] = A.u64[n] << count;
03435 A.u64[n] = A.u64[n] | carry;
03436 }
03437 }
03438 return A.i;
03439 }
03440
03441
03442
03443
03444
03445
03447 SSP_FORCEINLINE __m128i ssp_shl_epi8_REF(__m128i a, __m128i b)
03448 {
03449 int n;
03450 ssp_m128 A,B;
03451 A.i = a;
03452 B.i = b;
03453
03454 for( n = 0; n < 16; n++ )
03455 {
03456 if( B.s8[n] < 0 )
03457 {
03458 unsigned int count = (-B.s8[n]) % 8;
03459 A.u8[n] = A.u8[n] >> count;
03460 }
03461 else
03462 {
03463 unsigned int count = B.s8[n] % 8;
03464 A.u8[n] = A.u8[n] << count;
03465 }
03466 }
03467 return A.i;
03468 }
03469
03471 SSP_FORCEINLINE __m128i ssp_sha_epi8_REF(__m128i a, __m128i b)
03472 {
03473 int n;
03474 ssp_m128 A,B;
03475 A.i = a;
03476 B.i = b;
03477
03478 for( n = 0; n < 16; n++ )
03479 {
03480 if( B.s8[n] < 0 )
03481 {
03482 unsigned int count = (-B.s8[n]) % 8;
03483 A.s8[n] = A.s8[n] >> count;
03484 }
03485 else
03486 {
03487 unsigned int count = B.s8[n] % 8;
03488 A.s8[n] = A.s8[n] << count;
03489 }
03490 }
03491
03492 return A.i;
03493 }
03494
03496 SSP_FORCEINLINE __m128i ssp_shl_epi16_REF(__m128i a, __m128i b)
03497 {
03498 int n;
03499 ssp_m128 A,B;
03500 A.i = a;
03501 B.i = b;
03502
03503 for( n = 0; n < 8; n++ )
03504 {
03505 if( B.s8[n*2] < 0 )
03506 {
03507 unsigned int count = (-B.s8[n*2]) % 16;
03508 A.u16[n] = A.u16[n] >> count;
03509 }
03510 else
03511 {
03512 unsigned int count = B.s8[n*2] % 16;
03513 A.u16[n] = A.u16[n] << count;
03514 }
03515 }
03516 return A.i;
03517 }
03518
03520 SSP_FORCEINLINE __m128i ssp_sha_epi16_REF(__m128i a, __m128i b)
03521 {
03522 int n;
03523 ssp_m128 A,B;
03524 A.i = a;
03525 B.i = b;
03526
03527 for( n = 0; n < 8; n++ )
03528 {
03529 if( B.s8[n*2] < 0 )
03530 {
03531 unsigned int count = (-B.s8[n*2]) % 16;
03532 A.s16[n] = A.s16[n] >> count;
03533 }
03534 else
03535 {
03536 unsigned int count = B.s8[n*2] % 16;
03537 A.s16[n] = A.s16[n] << count;
03538 }
03539 }
03540
03541 return A.i;
03542 }
03543
03545 SSP_FORCEINLINE __m128i ssp_shl_epi32_REF(__m128i a, __m128i b)
03546 {
03547 int n;
03548 ssp_m128 A,B;
03549 A.i = a;
03550 B.i = b;
03551
03552 for( n = 0; n < 4; n++ )
03553 {
03554 if( B.s8[n*4] < 0 )
03555 {
03556 unsigned int count = (-B.s8[n*4]) % 32;
03557 A.u32[n] = A.u32[n] >> count;
03558 }
03559 else
03560 {
03561 unsigned int count = B.s8[n*4] % 32;
03562 A.u32[n] = A.u32[n] << count;
03563 }
03564 }
03565 return A.i;
03566 }
03567
03569 SSP_FORCEINLINE __m128i ssp_sha_epi32_REF(__m128i a, __m128i b)
03570 {
03571 int n;
03572 ssp_m128 A,B;
03573 A.i = a;
03574 B.i = b;
03575
03576 for( n = 0; n < 4; n++ )
03577 {
03578 if( B.s8[n*4] < 0 )
03579 {
03580 unsigned int count = (-B.s8[n*4]) % 32;
03581 A.s32[n] = A.s32[n] >> count;
03582 }
03583 else
03584 {
03585 unsigned int count = B.s8[n*4] % 32;
03586 A.s32[n] = A.s32[n] << count;
03587 }
03588 }
03589
03590 return A.i;
03591 }
03592
03594 SSP_FORCEINLINE __m128i ssp_shl_epi64_REF(__m128i a, __m128i b)
03595 {
03596 int n;
03597 ssp_m128 A,B;
03598 A.i = a;
03599 B.i = b;
03600
03601 for( n = 0; n < 2; n++ )
03602 {
03603 if( B.s8[n*8] < 0 )
03604 {
03605 unsigned int count = (-B.s8[n*8]) % 64;
03606 A.u64[n] = A.u64[n] >> count;
03607 }
03608 else
03609 {
03610 unsigned int count = B.s8[n*8] % 64;
03611 A.u64[n] = A.u64[n] << count;
03612 }
03613 }
03614 return A.i;
03615 }
03616
03618 SSP_FORCEINLINE __m128i ssp_sha_epi64_REF(__m128i a, __m128i b)
03619 {
03620 int n;
03621 ssp_m128 A,B;
03622 A.i = a;
03623 B.i = b;
03624
03625 for( n = 0; n < 2; n++ )
03626 {
03627 if( B.s8[n*8] < 0 )
03628 {
03629 unsigned int count = (-B.s8[n*8]) % 64;
03630 A.s64[n] = A.s64[n] >> count;
03631 }
03632 else
03633 {
03634 unsigned int count = B.s8[n*8] % 64;
03635 A.s64[n] = A.s64[n] << count;
03636 }
03637 }
03638
03639 return A.i;
03640 }
03641
03646 #endif // __SSP_EMULATION_REF_H__