00001
00002
00003
00004
00005 #ifndef __SSEPLUS_NATIVE_SSE4_1_H__
00006 #define __SSEPLUS_NATIVE_SSE4_1_H__
00007
00008 #include "../SSEPlus_base.h"
00009 #include <smmintrin.h>
00010
00018 SSP_FORCEINLINE __m128d ssp_ceil_pd_SSE4_1(__m128d a)
00019 {
00020 return _mm_ceil_pd( a );
00021 }
00023 SSP_FORCEINLINE __m128d ssp_ceil_sd_SSE4_1(__m128d dst, __m128d a)
00024 {
00025 return _mm_ceil_sd( dst, a );
00026 }
00028 SSP_FORCEINLINE __m128d ssp_floor_pd_SSE4_1(__m128d a)
00029 {
00030 return _mm_floor_pd( a );
00031 }
00033 SSP_FORCEINLINE __m128d ssp_floor_sd_SSE4_1(__m128d dst, __m128d a)
00034 {
00035 return _mm_floor_sd( dst, a );
00036 }
00038 SSP_FORCEINLINE __m128 ssp_ceil_ps_SSE4_1(__m128 a)
00039 {
00040 return _mm_ceil_ps( a );
00041 }
00043 SSP_FORCEINLINE __m128 ssp_ceil_ss_SSE4_1(__m128 dst, __m128 a)
00044 {
00045 return _mm_ceil_ss( dst, a );
00046 }
00048 SSP_FORCEINLINE __m128 ssp_floor_ps_SSE4_1(__m128 a)
00049 {
00050 return _mm_floor_ps( a );
00051 }
00053 SSP_FORCEINLINE __m128 ssp_floor_ss_SSE4_1(__m128 dst, __m128 a)
00054 {
00055 return _mm_floor_ss( dst, a );
00056 }
00058 SSP_FORCEINLINE __m128i ssp_blend_epi16_SSE4_1(__m128i a, __m128i b,const int mask)
00059 {
00060 switch( mask & 0xFF )
00061 {
00062 CASE_256( _mm_blend_epi16, a, b );
00063 }
00064 }
00066 SSP_FORCEINLINE __m128i ssp_blendv_epi8_SSE4_1(__m128i a, __m128i b, __m128i mask)
00067 {
00068 return _mm_blendv_epi8( a, b, mask );
00069 }
00071 SSP_FORCEINLINE __m128 ssp_blend_ps_SSE4_1(__m128 a, __m128 b, const int mask)
00072 {
00073 switch( mask & 0x0F )
00074 {
00075 CASE_16( _mm_blend_ps, a, b );
00076 }
00077 }
00079 SSP_FORCEINLINE __m128 ssp_blendv_ps_SSE4_1(__m128 a, __m128 b, __m128 mask)
00080 {
00081 return _mm_blendv_ps( a, b, mask);
00082 }
00084 SSP_FORCEINLINE __m128d ssp_blend_pd_SSE4_1(__m128d a, __m128d b, const int mask)
00085 {
00086 switch(mask&0x3)
00087 {
00088 CASE_4( _mm_blend_pd, a, b );
00089 }
00090 }
00092 SSP_FORCEINLINE __m128d ssp_blendv_pd_SSE4_1(__m128d a, __m128d b, __m128d mask)
00093 {
00094 return _mm_blendv_pd( a, b, mask);
00095 }
00097 SSP_FORCEINLINE __m128 ssp_dp_ps_SSE4_1(__m128 a, __m128 b, const int mask)
00098 {
00099 switch( mask & 0xFF )
00100 {
00101 CASE_256( _mm_dp_ps, a, b );
00102 }
00103 }
00105 SSP_FORCEINLINE __m128d ssp_dp_pd_SSE4_1(__m128d a, __m128d b, const int mask)
00106 {
00107 switch( mask & 0x3F )
00108 {
00109 CASE_128( _mm_dp_pd, a, b );
00110 }
00111 }
00113 SSP_FORCEINLINE __m128i ssp_cmpeq_epi64_SSE4_1(__m128i a, __m128i b)
00114 {
00115 return _mm_cmpeq_epi64( a, b);
00116 }
00118 SSP_FORCEINLINE __m128i ssp_min_epi8_SSE4_1(__m128i a, __m128i b)
00119 {
00120 return _mm_min_epi8( a, b);
00121 }
00123 SSP_FORCEINLINE __m128i ssp_max_epi8_SSE4_1(__m128i a, __m128i b)
00124 {
00125 return _mm_max_epi8( a, b);
00126 }
00128 SSP_FORCEINLINE __m128i ssp_min_epu16_SSE4_1(__m128i a, __m128i b)
00129 {
00130 return _mm_min_epu16( a, b);
00131 }
00133 SSP_FORCEINLINE __m128i ssp_max_epu16_SSE4_1(__m128i a, __m128i b)
00134 {
00135 return _mm_max_epu16( a, b);
00136 }
00138 SSP_FORCEINLINE __m128i ssp_min_epi32_SSE4_1(__m128i a, __m128i b)
00139 {
00140 return _mm_min_epi32( a, b);
00141 }
00143 SSP_FORCEINLINE __m128i ssp_max_epi32_SSE4_1(__m128i a, __m128i b)
00144 {
00145 return _mm_max_epi32( a, b);
00146 }
00148 SSP_FORCEINLINE __m128i ssp_min_epu32_SSE4_1(__m128i a, __m128i b)
00149 {
00150 return _mm_min_epu32( a, b);
00151 }
00153 SSP_FORCEINLINE __m128i ssp_max_epu32_SSE4_1(__m128i a, __m128i b)
00154 {
00155 return _mm_max_epu32( a, b);
00156 }
00158 SSP_FORCEINLINE __m128i ssp_mullo_epi32_SSE4_1(__m128i a, __m128i b)
00159 {
00160 return _mm_mullo_epi32( a, b);
00161 }
00163 SSP_FORCEINLINE __m128i ssp_mul_epi32_SSE4_1(__m128i a, __m128i b)
00164 {
00165 return _mm_mul_epi32( a, b);
00166 }
00167
00169 SSP_FORCEINLINE __m128 ssp_insert_ps_SSE4_1(__m128 dst, __m128 src, const int ndx)
00170 {
00171 switch( ndx & 0xFF )
00172 {
00173 CASE_256( _mm_insert_ps, dst, src );
00174 }
00175 }
00177 SSP_FORCEINLINE int ssp_extract_ps_SSE4_1(__m128 src, const int ndx)
00178 {
00179 switch(ndx&0x3)
00180 {
00181 CASE_4( _mm_extract_ps, src )
00182 }
00183 }
00185 SSP_FORCEINLINE __m128i ssp_insert_epi8_SSE4_1(__m128i dst, int s, const int ndx)
00186 {
00187 switch( ndx & 0xF )
00188 {
00189 CASE_16( _mm_insert_epi8, dst, s );
00190 }
00191 }
00193 SSP_FORCEINLINE __m128i ssp_insert_epi32_SSE4_1(__m128i dst, int s, const int ndx)
00194 {
00195 switch( ndx & 0x3 )
00196 {
00197 CASE_4( _mm_insert_epi32, dst, s );
00198 }
00199 }
00200
00202 SSP_FORCEINLINE int ssp_extract_epi8_SSE4_1(__m128i src, const int ndx)
00203 {
00204 switch( ndx & 0xF )
00205 {
00206 CASE_16( _mm_extract_epi8, src );
00207 }
00208 }
00210 SSP_FORCEINLINE int ssp_extract_epi32_SSE4_1(__m128i src, const int ndx)
00211 {
00212 switch( ndx & 0x3 )
00213 {
00214 CASE_4( _mm_extract_epi32, src );
00215 }
00216 }
00217
00219 SSP_FORCEINLINE __m128i ssp_minpos_epu16_SSE4_1(__m128i shortValues)
00220 {
00221 return _mm_minpos_epu16( shortValues );
00222 }
00223
00225 SSP_FORCEINLINE __m128i ssp_cvtepi8_epi32_SSE4_1(__m128i byteValues)
00226 {
00227 return _mm_cvtepi8_epi32( byteValues );
00228 }
00230 SSP_FORCEINLINE __m128i ssp_cvtepi16_epi32_SSE4_1(__m128i shortValues)
00231 {
00232 return _mm_cvtepi16_epi32( shortValues );
00233 }
00235 SSP_FORCEINLINE __m128i ssp_cvtepi8_epi64_SSE4_1(__m128i byteValues)
00236 {
00237 return _mm_cvtepi8_epi64( byteValues );
00238 }
00240 SSP_FORCEINLINE __m128i ssp_cvtepi32_epi64_SSE4_1(__m128i intValues)
00241 {
00242 return _mm_cvtepi32_epi64( intValues );
00243 }
00245 SSP_FORCEINLINE __m128i ssp_cvtepi16_epi64_SSE4_1(__m128i shortValues)
00246 {
00247 return _mm_cvtepi16_epi64( shortValues );
00248 }
00250 SSP_FORCEINLINE __m128i ssp_cvtepi8_epi16_SSE4_1(__m128i byteValues)
00251 {
00252 return _mm_cvtepi8_epi16( byteValues );
00253 }
00255 SSP_FORCEINLINE __m128i ssp_cvtepu8_epi32_SSE4_1(__m128i byteValues)
00256 {
00257 return _mm_cvtepu8_epi32( byteValues );
00258 }
00260 SSP_FORCEINLINE __m128i ssp_cvtepu16_epi32_SSE4_1(__m128i shortValues)
00261 {
00262 return _mm_cvtepu16_epi32( shortValues );
00263 }
00265 SSP_FORCEINLINE __m128i ssp_cvtepu8_epi64_SSE4_1(__m128i shortValues)
00266 {
00267 return _mm_cvtepu8_epi64( shortValues );
00268 }
00270 SSP_FORCEINLINE __m128i ssp_cvtepu32_epi64_SSE4_1(__m128i intValues)
00271 {
00272 return _mm_cvtepu32_epi64( intValues );
00273 }
00275 SSP_FORCEINLINE __m128i ssp_cvtepu16_epi64_SSE4_1(__m128i shortValues)
00276 {
00277 return _mm_cvtepu16_epi64( shortValues );
00278 }
00280 SSP_FORCEINLINE __m128i ssp_cvtepu8_epi16_SSE4_1(__m128i byteValues)
00281 {
00282 return _mm_cvtepu8_epi16( byteValues );
00283 }
00285 SSP_FORCEINLINE __m128i ssp_packus_epi32_SSE4_1(__m128i a, __m128i b)
00286 {
00287 return _mm_packus_epi32( a, b );
00288 }
00290 SSP_FORCEINLINE __m128i ssp_mpsadbw_epu8_SSE4_1(__m128i a, __m128i b, const int msk)
00291 {
00292 switch( msk & 0x7 )
00293 {
00294 CASE_8( _mm_mpsadbw_epu8, a, b );
00295 }
00296 }
00298 SSP_FORCEINLINE __m128i ssp_stream_load_si128_SSE4_1(__m128i* a)
00299 {
00300 return _mm_stream_load_si128( a );
00301 }
00302
00303
00304
00305
00306
00307
00309 SSP_FORCEINLINE int ssp_testz_si128_SSE4_1(__m128i mask, __m128i a)
00310 {
00311 return _mm_testz_si128( mask, a);
00312 }
00314 SSP_FORCEINLINE int ssp_testc_si128_SSE4_1(__m128i mask, __m128i a)
00315 {
00316 return _mm_testc_si128( mask, a);
00317 }
00319 SSP_FORCEINLINE int ssp_testnzc_si128_SSE4_1(__m128i mask, __m128i b)
00320 {
00321 return _mm_testnzc_si128( mask, b);
00322 }
00323
00325 SSP_FORCEINLINE __m128d ssp_round_pd_SSE4_1(__m128d a, int iRoundMode)
00326 {
00327 switch( iRoundMode & 0xF )
00328 {
00329 CASE_16( _mm_round_pd, a );
00330 }
00331 }
00333 SSP_FORCEINLINE __m128d ssp_round_sd_SSE4_1(__m128d dst, __m128d a, int iRoundMode)
00334 {
00335 switch( iRoundMode & 0xF )
00336 {
00337 CASE_16( _mm_round_sd, dst, a );
00338 }
00339 }
00341 SSP_FORCEINLINE __m128 ssp_round_ps_SSE4_1(__m128 a, int iRoundMode)
00342 {
00343 switch( iRoundMode & 0xF )
00344 {
00345 CASE_16( _mm_round_ps, a );
00346 }
00347 }
00349 SSP_FORCEINLINE __m128 ssp_round_ss_SSE4_1(__m128 dst, __m128 a, int iRoundMode)
00350 {
00351 switch( iRoundMode & 0xF )
00352 {
00353 CASE_16( _mm_round_ss, dst, a );
00354 }
00355 }
00356
00357
00358 #ifdef SYS64
00359
00360 SSP_FORCEINLINE __m128i ssp_insert_epi64_SSE4_1(__m128i dst, ssp_s64 s, const int ndx)
00361 {
00362 switch( ndx & 0x1 )
00363 {
00364 CASE_2( _mm_insert_epi64, dst, s );
00365 }
00366 }
00367
00369 SSP_FORCEINLINE ssp_s64 ssp_extract_epi64_SSE4_1(__m128i src, const int ndx)
00370 {
00371 switch( ndx & 0x1 )
00372 {
00373 CASE_2( _mm_extract_epi64, src );
00374 }
00375 }
00376 #endif
00377
00382 #endif // __SSEPLUS_NATIVE_SSE4_1_H__