00001
00002
00003
00004
00005 #ifndef __SSEPLUS_NATIVE_SSE_H__
00006 #define __SSEPLUS_NATIVE_SSE_H__
00007
00008 #include "../SSEPlus_base.h"
00009 #include <xmmintrin.h>
00010
00016
00017
00018
00019
00021 SSP_FORCEINLINE __m128 ssp_add_ss_SSE( __m128 a, __m128 b )
00022 {
00023 return _mm_add_ss( a, b );
00024 }
00025
00027 SSP_FORCEINLINE __m128 ssp_add_ps_SSE( __m128 a, __m128 b )
00028 {
00029 return _mm_add_ps( a, b );
00030 }
00031
00033 SSP_FORCEINLINE __m128 ssp_sub_ss_SSE( __m128 a, __m128 b )
00034 {
00035 return _mm_sub_ss( a, b );
00036 }
00037
00039 SSP_FORCEINLINE __m128 ssp_sub_ps_SSE( __m128 a, __m128 b )
00040 {
00041 return _mm_sub_ps( a, b );
00042 }
00043
00045 SSP_FORCEINLINE __m128 ssp_mul_ss_SSE( __m128 a, __m128 b )
00046 {
00047 return _mm_mul_ss( a, b );
00048 }
00049
00051 SSP_FORCEINLINE __m128 ssp_mul_ps_SSE( __m128 a, __m128 b )
00052 {
00053 return _mm_mul_ps( a, b );
00054 }
00055
00057 SSP_FORCEINLINE __m128 ssp_div_ss_SSE( __m128 a, __m128 b )
00058 {
00059 return _mm_div_ss( a, b );
00060 }
00061
00063 SSP_FORCEINLINE __m128 ssp_div_ps_SSE( __m128 a, __m128 b )
00064 {
00065 return _mm_div_ps( a, b );
00066 }
00067
00069 SSP_FORCEINLINE __m128 ssp_sqrt_ss_SSE( __m128 a )
00070 {
00071 return _mm_sqrt_ss( a );
00072 }
00073
00075 SSP_FORCEINLINE __m128 ssp_sqrt_ps_SSE( __m128 a )
00076 {
00077 return _mm_sqrt_ps( a );
00078 }
00079
00081 SSP_FORCEINLINE __m128 ssp_rcp_ss_SSE( __m128 a )
00082 {
00083 return _mm_rcp_ss( a );
00084 }
00085
00087 SSP_FORCEINLINE __m128 ssp_rcp_ps_SSE( __m128 a )
00088 {
00089 return _mm_rcp_ps( a );
00090 }
00091
00093 SSP_FORCEINLINE __m128 ssp_rsqrt_ss_SSE( __m128 a )
00094 {
00095 return _mm_rsqrt_ss( a );
00096 }
00097
00099 SSP_FORCEINLINE __m128 ssp_rsqrt_ps_SSE( __m128 a )
00100 {
00101 return _mm_rsqrt_ps( a );
00102 }
00103
00105 SSP_FORCEINLINE __m128 ssp_min_ss_SSE( __m128 a, __m128 b )
00106 {
00107 return _mm_min_ss( a, b );
00108 }
00109
00111 SSP_FORCEINLINE __m128 ssp_min_ps_SSE( __m128 a, __m128 b )
00112 {
00113 return _mm_min_ps( a, b );
00114 }
00115
00117 SSP_FORCEINLINE __m128 ssp_max_ss_SSE( __m128 a, __m128 b )
00118 {
00119 return _mm_max_ss( a, b );
00120 }
00121
00123 SSP_FORCEINLINE __m128 ssp_max_ps_SSE( __m128 a, __m128 b )
00124 {
00125 return _mm_max_ps( a, b );
00126 }
00127
00129 SSP_FORCEINLINE __m128 ssp_and_ps_SSE( __m128 a, __m128 b )
00130 {
00131 return _mm_and_ps( a, b );
00132 }
00133
00135 SSP_FORCEINLINE __m128 ssp_andnot_ps_SSE( __m128 a, __m128 b )
00136 {
00137 return _mm_andnot_ps( a, b );
00138 }
00139
00141 SSP_FORCEINLINE __m128 ssp_or_ps_SSE( __m128 a, __m128 b )
00142 {
00143 return _mm_or_ps( a, b );
00144 }
00145
00147 SSP_FORCEINLINE __m128 ssp_xor_ps_SSE( __m128 a, __m128 b )
00148 {
00149 return _mm_xor_ps( a, b );
00150 }
00151
00153 SSP_FORCEINLINE __m128 ssp_cmpeq_ss_SSE( __m128 a, __m128 b )
00154 {
00155 return _mm_cmpeq_ss( a, b );
00156 }
00157
00159 SSP_FORCEINLINE __m128 ssp_cmpeq_ps_SSE( __m128 a, __m128 b )
00160 {
00161 return _mm_cmpeq_ps( a, b );
00162 }
00163
00165 SSP_FORCEINLINE __m128 ssp_cmplt_ss_SSE( __m128 a, __m128 b )
00166 {
00167 return _mm_cmplt_ss( a, b );
00168 }
00169
00171 SSP_FORCEINLINE __m128 ssp_cmplt_ps_SSE( __m128 a, __m128 b )
00172 {
00173 return _mm_cmplt_ps( a, b );
00174 }
00175
00177 SSP_FORCEINLINE __m128 ssp_cmple_ss_SSE( __m128 a, __m128 b )
00178 {
00179 return _mm_cmple_ss( a, b );
00180 }
00181
00183 SSP_FORCEINLINE __m128 ssp_cmple_ps_SSE( __m128 a, __m128 b )
00184 {
00185 return _mm_cmple_ps( a, b );
00186 }
00187
00189 SSP_FORCEINLINE __m128 ssp_cmpgt_ss_SSE( __m128 a, __m128 b )
00190 {
00191 return _mm_cmpgt_ss( a, b );
00192 }
00193
00195 SSP_FORCEINLINE __m128 ssp_cmpgt_ps_SSE( __m128 a, __m128 b )
00196 {
00197 return _mm_cmpgt_ps( a, b );
00198 }
00199
00201 SSP_FORCEINLINE __m128 ssp_cmpge_ss_SSE( __m128 a, __m128 b )
00202 {
00203 return _mm_cmpge_ss( a, b );
00204 }
00205
00207 SSP_FORCEINLINE __m128 ssp_cmpge_ps_SSE( __m128 a, __m128 b )
00208 {
00209 return _mm_cmpge_ps( a, b );
00210 }
00211
00213 SSP_FORCEINLINE __m128 ssp_cmpneq_ss_SSE( __m128 a, __m128 b )
00214 {
00215 return _mm_cmpneq_ss( a, b );
00216 }
00217
00219 SSP_FORCEINLINE __m128 ssp_cmpneq_ps_SSE( __m128 a, __m128 b )
00220 {
00221 return _mm_cmpneq_ps( a, b );
00222 }
00223
00225 SSP_FORCEINLINE __m128 ssp_cmpnlt_ss_SSE( __m128 a, __m128 b )
00226 {
00227 return _mm_cmpnlt_ss( a, b );
00228 }
00229
00231 SSP_FORCEINLINE __m128 ssp_cmpnlt_ps_SSE( __m128 a, __m128 b )
00232 {
00233 return _mm_cmpnlt_ps( a, b );
00234 }
00235
00237 SSP_FORCEINLINE __m128 ssp_cmpnle_ss_SSE( __m128 a, __m128 b )
00238 {
00239 return _mm_cmpnle_ss( a, b );
00240 }
00241
00243 SSP_FORCEINLINE __m128 ssp_cmpnle_ps_SSE( __m128 a, __m128 b )
00244 {
00245 return _mm_cmpnle_ps( a, b );
00246 }
00247
00249 SSP_FORCEINLINE __m128 ssp_cmpngt_ss_SSE( __m128 a, __m128 b )
00250 {
00251 return _mm_cmpngt_ss( a, b );
00252 }
00253
00255 SSP_FORCEINLINE __m128 ssp_cmpngt_ps_SSE( __m128 a, __m128 b )
00256 {
00257 return _mm_cmpngt_ps( a, b );
00258 }
00259
00261 SSP_FORCEINLINE __m128 ssp_cmpnge_ss_SSE( __m128 a, __m128 b )
00262 {
00263 return _mm_cmpnge_ss( a, b );
00264 }
00265
00267 SSP_FORCEINLINE __m128 ssp_cmpnge_ps_SSE( __m128 a, __m128 b )
00268 {
00269 return _mm_cmpnge_ps( a, b );
00270 }
00271
00273 SSP_FORCEINLINE __m128 ssp_cmpord_ss_SSE( __m128 a, __m128 b )
00274 {
00275 return _mm_cmpord_ss( a, b );
00276 }
00277
00279 SSP_FORCEINLINE __m128 ssp_cmpord_ps_SSE( __m128 a, __m128 b )
00280 {
00281 return _mm_cmpord_ps( a, b );
00282 }
00283
00285 SSP_FORCEINLINE __m128 ssp_cmpunord_ss_SSE( __m128 a, __m128 b )
00286 {
00287 return _mm_cmpunord_ss( a, b );
00288 }
00289
00291 SSP_FORCEINLINE __m128 ssp_cmpunord_ps_SSE( __m128 a, __m128 b )
00292 {
00293 return _mm_cmpunord_ps( a, b );
00294 }
00295
00297 SSP_FORCEINLINE int ssp_comieq_ss_SSE( __m128 a, __m128 b )
00298 {
00299 return _mm_comieq_ss( a, b );
00300 }
00301
00303 SSP_FORCEINLINE int ssp_comilt_ss_SSE( __m128 a, __m128 b )
00304 {
00305 return _mm_comilt_ss( a, b );
00306 }
00307
00309 SSP_FORCEINLINE int ssp_comile_ss_SSE( __m128 a, __m128 b )
00310 {
00311 return _mm_comile_ss( a, b );
00312 }
00313
00315 SSP_FORCEINLINE int ssp_comigt_ss_SSE( __m128 a, __m128 b )
00316 {
00317 return _mm_comigt_ss( a, b );
00318 }
00319
00321 SSP_FORCEINLINE int ssp_comige_ss_SSE( __m128 a, __m128 b )
00322 {
00323 return _mm_comige_ss( a, b );
00324 }
00325
00327 SSP_FORCEINLINE int ssp_comineq_ss_SSE( __m128 a, __m128 b )
00328 {
00329 return _mm_comineq_ss( a, b );
00330 }
00331
00333 SSP_FORCEINLINE int ssp_ucomieq_ss_SSE( __m128 a, __m128 b )
00334 {
00335 return _mm_ucomieq_ss( a, b );
00336 }
00337
00339 SSP_FORCEINLINE int ssp_ucomilt_ss_SSE( __m128 a, __m128 b )
00340 {
00341 return _mm_ucomilt_ss( a, b );
00342 }
00343
00345 SSP_FORCEINLINE int ssp_ucomile_ss_SSE( __m128 a, __m128 b )
00346 {
00347 return _mm_ucomile_ss( a, b );
00348 }
00349
00351 SSP_FORCEINLINE int ssp_ucomigt_ss_SSE( __m128 a, __m128 b )
00352 {
00353 return _mm_ucomigt_ss( a, b );
00354 }
00355
00357 SSP_FORCEINLINE int ssp_ucomige_ss_SSE( __m128 a, __m128 b )
00358 {
00359 return _mm_ucomige_ss( a, b );
00360 }
00361
00363 SSP_FORCEINLINE int ssp_ucomineq_ss_SSE( __m128 a, __m128 b )
00364 {
00365 return _mm_ucomineq_ss( a, b );
00366 }
00367
00369 SSP_FORCEINLINE int ssp_cvt_ss2si_SSE( __m128 a )
00370 {
00371 return _mm_cvt_ss2si( a );
00372 }
00373
00375 SSP_FORCEINLINE __m64 ssp_cvt_ps2pi_SSE( __m128 a )
00376 {
00377 return _mm_cvt_ps2pi( a );
00378 }
00379
00381 SSP_FORCEINLINE int ssp_cvtt_ss2si_SSE( __m128 a )
00382 {
00383 return _mm_cvtt_ss2si( a );
00384 }
00385
00387 SSP_FORCEINLINE __m64 ssp_cvtt_ps2pi_SSE( __m128 a )
00388 {
00389 return _mm_cvtt_ps2pi( a );
00390 }
00391
00393 SSP_FORCEINLINE __m128 ssp_cvt_si2ss_SSE( __m128 a, int imm )
00394 {
00395 return _mm_cvt_si2ss( a, imm );
00396 }
00397
00399 SSP_FORCEINLINE __m128 ssp_cvt_pi2ps_SSE( __m128 a, __m64 b )
00400 {
00401 return _mm_cvt_pi2ps( a, b );
00402 }
00403
00404
00405
00406
00407
00408
00409
00410
00411
00412 #if defined(SYS64)
00413
00414
00415
00416
00417
00418
00419
00420
00421
00422
00423
00424
00425
00426 #endif
00427
00428
00429
00430
00431
00433 SSP_FORCEINLINE __m128 ssp_shuffle_ps_SSE( __m128 a, __m128 b, unsigned int imm8 )
00434 {
00435 switch( imm8 & 0xFF )
00436 {
00437 CASE_256( _mm_shuffle_ps, a, b );
00438 }
00439 }
00440
00442 SSP_FORCEINLINE __m128 ssp_unpackhi_ps_SSE( __m128 a, __m128 b )
00443 {
00444 return _mm_unpackhi_ps( a, b );
00445 }
00446
00448 SSP_FORCEINLINE __m128 ssp_unpacklo_ps_SSE( __m128 a, __m128 b )
00449 {
00450 return _mm_unpacklo_ps( a, b );
00451 }
00452
00454 SSP_FORCEINLINE __m128 ssp_loadh_pi_SSE( __m128 a, __m64 const* b )
00455 {
00456 return _mm_loadh_pi( a, b );
00457 }
00458
00460 SSP_FORCEINLINE __m128 ssp_movehl_ps_SSE( __m128 a, __m128 b )
00461 {
00462 return _mm_movehl_ps( a, b );
00463 }
00464
00466 SSP_FORCEINLINE __m128 ssp_movelh_ps_SSE( __m128 a, __m128 b )
00467 {
00468 return _mm_movelh_ps( a, b );
00469 }
00470
00472 SSP_FORCEINLINE void ssp_storeh_pi_SSE( __m64 *a, __m128 b )
00473 {
00474 _mm_storeh_pi( a, b );
00475 }
00476
00478 SSP_FORCEINLINE __m128 ssp_loadl_pi_SSE( __m128 a, __m64 const* b )
00479 {
00480 return _mm_loadl_pi( a, b );
00481 }
00482
00484 SSP_FORCEINLINE void ssp_storel_pi_SSE( __m64 *a, __m128 b )
00485 {
00486 _mm_storel_pi( a, b );
00487 }
00488
00490 SSP_FORCEINLINE int ssp_movemask_ps_SSE( __m128 a )
00491 {
00492 return _mm_movemask_ps( a );
00493 }
00494
00495
00496
00497
00498
00499
00501 SSP_FORCEINLINE int ssp_pextrw_SSE( __m64 a, int imm )
00502 {
00503 switch( imm & 0x3 )
00504 {
00505 CASE_4( _m_pextrw, a );
00506 }
00507 }
00508
00510 SSP_FORCEINLINE __m64 ssp_pinsrw_SSE( __m64 a, int b, int imm )
00511 {
00512 switch( imm & 0x3 )
00513 {
00514 CASE_4( _m_pinsrw, a, b );
00515 }
00516 }
00517
00519 SSP_FORCEINLINE __m64 ssp_pmaxsw_SSE( __m64 a, __m64 b )
00520 {
00521 return _m_pmaxsw( a, b );
00522 }
00523
00525 SSP_FORCEINLINE __m64 ssp_pmaxub_SSE( __m64 a, __m64 b )
00526 {
00527 return _m_pmaxub( a, b );
00528 }
00529
00531 SSP_FORCEINLINE __m64 ssp_pminsw_SSE( __m64 a, __m64 b )
00532 {
00533 return _m_pminsw( a, b );
00534 }
00535
00537 SSP_FORCEINLINE __m64 ssp_pminub_SSE( __m64 a, __m64 b )
00538 {
00539 return _m_pminub( a, b );
00540 }
00541
00543 SSP_FORCEINLINE int ssp_pmovmskb_SSE( __m64 a )
00544 {
00545 return _m_pmovmskb( a );
00546 }
00547
00549 SSP_FORCEINLINE __m64 ssp_pmulhuw_SSE( __m64 a, __m64 b )
00550 {
00551 return _m_pmulhuw( a, b );
00552 }
00553
00555 SSP_FORCEINLINE __m64 ssp_pshufw_SSE( __m64 a, int imm )
00556 {
00557 switch( imm & 0xFF )
00558 {
00559 CASE_256( _m_pshufw, a );
00560 }
00561 }
00562
00564 SSP_FORCEINLINE void ssp_maskmovq_SSE( __m64 a, __m64 b, char *c )
00565 {
00566 _m_maskmovq( a, a, c );
00567 }
00568
00570 SSP_FORCEINLINE __m64 ssp_pavgb_SSE( __m64 a, __m64 b )
00571 {
00572 return _m_pavgb( a, b );
00573 }
00574
00576 SSP_FORCEINLINE __m64 ssp_pavgw_SSE( __m64 a, __m64 b )
00577 {
00578 return _m_pavgw( a, b );
00579 }
00580
00582 SSP_FORCEINLINE __m64 ssp_psadbw_SSE( __m64 a, __m64 b )
00583 {
00584 return _m_psadbw( a, b );
00585 }
00586
00587
00588
00589
00590
00592 SSP_FORCEINLINE __m128 ssp_set_ss_SSE( float a )
00593 {
00594 return _mm_set_ss( a );
00595 }
00596
00598 SSP_FORCEINLINE __m128 ssp_set_ps1_SSE( float a )
00599 {
00600 return _mm_set_ps1( a );
00601 }
00602
00604 SSP_FORCEINLINE __m128 _mm_set_ps_SSE( float a, float b, float c, float d )
00605 {
00606 return _mm_set_ps( a, b, c, d );
00607 }
00608
00610 SSP_FORCEINLINE __m128 ssp_setr_ps_SSE( float a, float b, float c, float d )
00611 {
00612 return _mm_setr_ps( a, b, c, d );
00613 }
00614
00616 SSP_FORCEINLINE __m128 ssp_setzero_ps_SSE( void )
00617 {
00618 return _mm_setzero_ps( );
00619 }
00620
00622 SSP_FORCEINLINE __m128 ssp_load_ss_SSE( float const*a )
00623 {
00624 return _mm_load_ss( a );
00625 }
00626
00628 SSP_FORCEINLINE __m128 ssp_load_ps1_SSE( float const*a )
00629 {
00630 return _mm_load_ps1( a );
00631 }
00632
00634 SSP_FORCEINLINE __m128 ssp_load_ps_SSE( float const*a )
00635 {
00636 return _mm_load_ps( a );
00637 }
00638
00640 SSP_FORCEINLINE __m128 ssp_loadr_ps_SSE( float const*a )
00641 {
00642 return _mm_loadr_ps( a );
00643 }
00644
00646 SSP_FORCEINLINE __m128 ssp_loadu_ps_SSE( float const*a )
00647 {
00648 return _mm_loadu_ps( a );
00649 }
00650
00652 SSP_FORCEINLINE void ssp_store_ss_SSE( float *v, __m128 a )
00653 {
00654 _mm_store_ss( v, a );
00655 }
00656
00658 SSP_FORCEINLINE void ssp_store_ps1_SSE( float *v, __m128 a )
00659 {
00660 _mm_store_ps1( v, a );
00661 }
00662
00664 SSP_FORCEINLINE void ssp_store_ps_SSE( float *v, __m128 a )
00665 {
00666 _mm_store_ps( v, a );
00667 }
00668
00670 SSP_FORCEINLINE void ssp_storer_ps_SSE( float *v, __m128 a )
00671 {
00672 _mm_storer_ps( v, a );
00673 }
00674
00676 SSP_FORCEINLINE void ssp_storeu_ps_SSE( float *v, __m128 a )
00677 {
00678 _mm_storeu_ps( v, a );
00679 }
00680
00682 SSP_FORCEINLINE void ssp_prefetch_SSE( char *a, int sel )
00683 {
00684 switch( sel & 0x3 )
00685 {
00686 case 1: _mm_prefetch( a, _MM_HINT_T0 ); break;
00687 case 2: _mm_prefetch( a, _MM_HINT_T1 ); break;
00688 case 3: _mm_prefetch( a, _MM_HINT_T2 ); break;
00689 default: _mm_prefetch( a, _MM_HINT_NTA );
00690 }
00691 }
00692
00694 SSP_FORCEINLINE void ssp_stream_pi_SSE( __m64 *a, __m64 b )
00695 {
00696 _mm_stream_pi( a, b );
00697 }
00698
00700 SSP_FORCEINLINE void ssp_stream_ps_SSE( float *a, __m128 b )
00701 {
00702 _mm_stream_ps( a, b );
00703 }
00704
00706 SSP_FORCEINLINE __m128 ssp_move_ss_SSE( __m128 a, __m128 b )
00707 {
00708 return _mm_move_ss( a, b );
00709 }
00710
00712 SSP_FORCEINLINE void ssp_sfence_SSE( void )
00713 {
00714 _mm_sfence( );
00715 }
00716
00718 unsigned int ssp_getcsr_SSE( void )
00719 {
00720 return _mm_getcsr( );
00721 }
00722
00724 SSP_FORCEINLINE void ssp_setcsr_SSE( unsigned int a )
00725 {
00726 _mm_setcsr( a );
00727 }
00728
00729
00731 SSP_FORCEINLINE __m128 ssp_cvtpi16_ps_SSE( __m64 a )
00732 {
00733 return _mm_cvtpi16_ps( a );
00734 }
00735
00737 SSP_FORCEINLINE __m128 ssp_cvtpu16_ps_SSE( __m64 a )
00738 {
00739 return _mm_cvtpu16_ps( a );
00740 }
00741
00743 SSP_FORCEINLINE __m64 ssp_cvtps_pi16_SSE( __m128 a )
00744 {
00745 return _mm_cvtps_pi16( a );
00746 }
00747
00749 SSP_FORCEINLINE __m128 ssp_cvtpi8_ps_SSE( __m64 a )
00750 {
00751 return _mm_cvtpi8_ps( a );
00752 }
00753
00755 SSP_FORCEINLINE __m128 ssp_cvtpu8_ps_SSE( __m64 a )
00756 {
00757 return _mm_cvtpu8_ps( a );
00758 }
00759
00761 SSP_FORCEINLINE __m64 ssp_cvtps_pi8_SSE( __m128 a )
00762 {
00763 return _mm_cvtps_pi8( a );
00764 }
00765
00767 SSP_FORCEINLINE __m128 ssp_cvtpi32x2_ps_SSE( __m64 a, __m64 b )
00768 {
00769 return _mm_cvtpi32x2_ps( a, b );
00770 }
00771
00773
/*
 * Alternate-name aliases.
 * Each macro maps a second spelling onto one of the ssp_*_SSE wrappers
 * defined earlier in this header; no new behavior is introduced here.
 */

/* Conversion wrappers */
#define ssp_cvtss_si32_SSE      ssp_cvt_ss2si_SSE
#define ssp_cvtps_pi32_SSE      ssp_cvt_ps2pi_SSE
#define ssp_cvttss_si32_SSE     ssp_cvtt_ss2si_SSE
#define ssp_cvttps_pi32_SSE     ssp_cvtt_ps2pi_SSE
#define ssp_cvtsi32_ss_SSE      ssp_cvt_si2ss_SSE
#define ssp_cvtpi32_ps_SSE      ssp_cvt_pi2ps_SSE

/* Wrappers operating on __m64 values */
#define ssp_extract_pi16_SSE    ssp_pextrw_SSE
#define ssp_insert_pi16_SSE     ssp_pinsrw_SSE
#define ssp_max_pi16_SSE        ssp_pmaxsw_SSE
#define ssp_max_pu8_SSE         ssp_pmaxub_SSE
#define ssp_min_pi16_SSE        ssp_pminsw_SSE
#define ssp_min_pu8_SSE         ssp_pminub_SSE
#define ssp_movemask_pi8_SSE    ssp_pmovmskb_SSE
#define ssp_mulhi_pu16_SSE      ssp_pmulhuw_SSE
#define ssp_shuffle_pi16_SSE    ssp_pshufw_SSE
#define ssp_maskmove_si64_SSE   ssp_maskmovq_SSE
#define ssp_avg_pu8_SSE         ssp_pavgb_SSE
#define ssp_avg_pu16_SSE        ssp_pavgw_SSE
#define ssp_sad_pu8_SSE         ssp_psadbw_SSE

/* "1" spellings of the *_ps1 set / load / store wrappers */
#define ssp_set1_ps_SSE         ssp_set_ps1_SSE
#define ssp_load1_ps_SSE        ssp_load_ps1_SSE
#define ssp_store1_ps_SSE       ssp_store_ps1_SSE
00799
00804 #endif // __SSEPLUS_NATIVE_SSE_H__