include/native/SSEPlus_native_SSE4.1.h

Go to the documentation of this file.
00001 //
00002 // Copyright (c) 2006-2008 Advanced Micro Devices, Inc. All Rights Reserved.
00003 // This software is subject to the Apache v2.0 License.
00004 //
00005 #ifndef __SSEPLUS_NATIVE_SSE4_1_H__
00006 #define __SSEPLUS_NATIVE_SSE4_1_H__
00007 
00008 #include "../SSEPlus_base.h"
00009 #include <smmintrin.h> // SSE4.1
00010 
00018 SSP_FORCEINLINE __m128d ssp_ceil_pd_SSE4_1(__m128d a)
00019 {
00020     return _mm_ceil_pd( a );
00021 }
00023 SSP_FORCEINLINE __m128d ssp_ceil_sd_SSE4_1(__m128d dst, __m128d a)
00024 {
00025     return _mm_ceil_sd( dst, a );
00026 }
00028 SSP_FORCEINLINE __m128d ssp_floor_pd_SSE4_1(__m128d a)
00029 {
00030     return _mm_floor_pd( a );
00031 }
00033 SSP_FORCEINLINE __m128d ssp_floor_sd_SSE4_1(__m128d dst, __m128d a)
00034 {
00035     return _mm_floor_sd( dst, a );
00036 }
00038 SSP_FORCEINLINE __m128 ssp_ceil_ps_SSE4_1(__m128 a)
00039 {
00040     return _mm_ceil_ps( a );
00041 }
00043 SSP_FORCEINLINE __m128 ssp_ceil_ss_SSE4_1(__m128 dst, __m128 a)
00044 {
00045     return _mm_ceil_ss( dst, a );
00046 }
00048 SSP_FORCEINLINE __m128 ssp_floor_ps_SSE4_1(__m128 a)
00049 {
00050     return _mm_floor_ps( a );
00051 }
00053 SSP_FORCEINLINE __m128 ssp_floor_ss_SSE4_1(__m128 dst, __m128 a)
00054 {
00055     return _mm_floor_ss( dst, a );
00056 }
00058 SSP_FORCEINLINE __m128i ssp_blend_epi16_SSE4_1(__m128i a, __m128i b,const int mask)
00059 {
00060     switch( mask & 0xFF )
00061     {
00062         CASE_256( _mm_blend_epi16, a, b );
00063     }
00064 }
00066 SSP_FORCEINLINE __m128i ssp_blendv_epi8_SSE4_1(__m128i a, __m128i b, __m128i mask)
00067 {
00068     return _mm_blendv_epi8( a, b, mask );
00069 }
00071 SSP_FORCEINLINE __m128 ssp_blend_ps_SSE4_1(__m128  a, __m128  b, const int mask)
00072 {
00073     switch( mask & 0x0F )
00074     {
00075         CASE_16( _mm_blend_ps, a, b );
00076     }
00077 }
00079 SSP_FORCEINLINE __m128 ssp_blendv_ps_SSE4_1(__m128  a, __m128  b, __m128 mask)
00080 {
00081     return _mm_blendv_ps( a, b, mask);
00082 }
00084 SSP_FORCEINLINE __m128d ssp_blend_pd_SSE4_1(__m128d a, __m128d b, const int mask)
00085 {
00086     switch(mask&0x3)
00087     {
00088         CASE_4( _mm_blend_pd, a, b );
00089     }
00090 }
00092 SSP_FORCEINLINE __m128d ssp_blendv_pd_SSE4_1(__m128d a, __m128d b, __m128d mask)
00093 {
00094     return _mm_blendv_pd( a, b, mask);
00095 }
00097 SSP_FORCEINLINE __m128 ssp_dp_ps_SSE4_1(__m128  a, __m128  b, const int mask)
00098 {
00099     switch( mask & 0xFF )
00100     {
00101         CASE_256( _mm_dp_ps, a, b );
00102     }
00103 }
00105 SSP_FORCEINLINE __m128d ssp_dp_pd_SSE4_1(__m128d a, __m128d b, const int mask)
00106 {
00107     switch( mask & 0x3F )
00108     {
00109         CASE_128( _mm_dp_pd, a, b );
00110     }
00111 }
00113 SSP_FORCEINLINE __m128i ssp_cmpeq_epi64_SSE4_1(__m128i a, __m128i b)
00114 {
00115     return _mm_cmpeq_epi64( a, b);
00116 }
00118 SSP_FORCEINLINE __m128i ssp_min_epi8_SSE4_1(__m128i a, __m128i b)
00119 {
00120     return _mm_min_epi8( a, b);
00121 }
00123 SSP_FORCEINLINE __m128i ssp_max_epi8_SSE4_1(__m128i a, __m128i b)
00124 {
00125     return _mm_max_epi8( a, b);
00126 }
00128 SSP_FORCEINLINE __m128i ssp_min_epu16_SSE4_1(__m128i a, __m128i b)
00129 {
00130     return _mm_min_epu16( a, b);
00131 }
00133 SSP_FORCEINLINE __m128i ssp_max_epu16_SSE4_1(__m128i a, __m128i b)
00134 {
00135     return _mm_max_epu16( a, b);
00136 }
00138 SSP_FORCEINLINE __m128i ssp_min_epi32_SSE4_1(__m128i a, __m128i b)
00139 {
00140     return _mm_min_epi32( a, b);
00141 }
00143 SSP_FORCEINLINE __m128i ssp_max_epi32_SSE4_1(__m128i a, __m128i b)
00144 {
00145     return _mm_max_epi32( a, b);
00146 }
00148 SSP_FORCEINLINE __m128i ssp_min_epu32_SSE4_1(__m128i a, __m128i b)
00149 {
00150     return _mm_min_epu32( a, b);
00151 }
00153 SSP_FORCEINLINE __m128i ssp_max_epu32_SSE4_1(__m128i a, __m128i b)
00154 {
00155     return _mm_max_epu32( a, b);
00156 }
00158 SSP_FORCEINLINE __m128i ssp_mullo_epi32_SSE4_1(__m128i a, __m128i b)
00159 {
00160     return _mm_mullo_epi32( a, b);
00161 }
00163 SSP_FORCEINLINE __m128i ssp_mul_epi32_SSE4_1(__m128i a, __m128i b)
00164 {
00165     return _mm_mul_epi32( a, b);
00166 }
00167 
00169 SSP_FORCEINLINE __m128 ssp_insert_ps_SSE4_1(__m128 dst, __m128 src, const int ndx)
00170 {
00171     switch( ndx & 0xFF )
00172     {
00173         CASE_256( _mm_insert_ps, dst, src );
00174     }
00175 }
00177 SSP_FORCEINLINE int ssp_extract_ps_SSE4_1(__m128 src, const int ndx)
00178 {
00179     switch(ndx&0x3)
00180     {
00181         CASE_4( _mm_extract_ps, src )
00182     }
00183 }
00185 SSP_FORCEINLINE __m128i ssp_insert_epi8_SSE4_1(__m128i dst, int s, const int ndx)
00186 {
00187     switch( ndx & 0xF )
00188     {
00189         CASE_16( _mm_insert_epi8, dst, s );
00190     }
00191 }
00193 SSP_FORCEINLINE __m128i ssp_insert_epi32_SSE4_1(__m128i dst, int s, const int ndx)
00194 {
00195     switch( ndx & 0x3 )
00196     {
00197         CASE_4( _mm_insert_epi32, dst, s );
00198     }
00199 }
00200 
00202 SSP_FORCEINLINE int ssp_extract_epi8_SSE4_1(__m128i src, const int ndx)
00203 {
00204     switch( ndx & 0xF )
00205     {
00206         CASE_16( _mm_extract_epi8, src );
00207     }
00208 }
00210 SSP_FORCEINLINE int ssp_extract_epi32_SSE4_1(__m128i src, const int ndx)
00211 {
00212     switch( ndx & 0x3 )
00213     {
00214         CASE_4( _mm_extract_epi32, src );
00215     }
00216 }
00217 
00219 SSP_FORCEINLINE __m128i ssp_minpos_epu16_SSE4_1(__m128i shortValues)
00220 {
00221     return _mm_minpos_epu16( shortValues );
00222 }
00223 
00225 SSP_FORCEINLINE __m128i ssp_cvtepi8_epi32_SSE4_1(__m128i byteValues)
00226 {
00227     return _mm_cvtepi8_epi32( byteValues );
00228 }
00230 SSP_FORCEINLINE __m128i ssp_cvtepi16_epi32_SSE4_1(__m128i shortValues)
00231 {
00232     return _mm_cvtepi16_epi32( shortValues );
00233 }
00235 SSP_FORCEINLINE __m128i ssp_cvtepi8_epi64_SSE4_1(__m128i byteValues)
00236 {
00237     return _mm_cvtepi8_epi64( byteValues );
00238 }
00240 SSP_FORCEINLINE __m128i ssp_cvtepi32_epi64_SSE4_1(__m128i intValues)
00241 {
00242     return _mm_cvtepi32_epi64( intValues );
00243 }
00245 SSP_FORCEINLINE __m128i ssp_cvtepi16_epi64_SSE4_1(__m128i shortValues)
00246 {
00247     return _mm_cvtepi16_epi64( shortValues );
00248 }
00250 SSP_FORCEINLINE __m128i ssp_cvtepi8_epi16_SSE4_1(__m128i byteValues)
00251 {
00252     return _mm_cvtepi8_epi16( byteValues );
00253 }
00255 SSP_FORCEINLINE __m128i ssp_cvtepu8_epi32_SSE4_1(__m128i byteValues)
00256 {
00257     return _mm_cvtepu8_epi32( byteValues );
00258 }
00260 SSP_FORCEINLINE __m128i ssp_cvtepu16_epi32_SSE4_1(__m128i shortValues)
00261 {
00262     return _mm_cvtepu16_epi32( shortValues );
00263 }
00265 SSP_FORCEINLINE __m128i ssp_cvtepu8_epi64_SSE4_1(__m128i shortValues)
00266 {
00267     return _mm_cvtepu8_epi64( shortValues );
00268 }
00270 SSP_FORCEINLINE __m128i ssp_cvtepu32_epi64_SSE4_1(__m128i intValues)
00271 {
00272     return _mm_cvtepu32_epi64( intValues );
00273 }
00275 SSP_FORCEINLINE __m128i ssp_cvtepu16_epi64_SSE4_1(__m128i shortValues)
00276 {
00277     return _mm_cvtepu16_epi64( shortValues );
00278 }
00280 SSP_FORCEINLINE __m128i ssp_cvtepu8_epi16_SSE4_1(__m128i byteValues)
00281 {
00282     return _mm_cvtepu8_epi16( byteValues );
00283 }
00285 SSP_FORCEINLINE __m128i ssp_packus_epi32_SSE4_1(__m128i a, __m128i b)
00286 {
00287     return _mm_packus_epi32( a, b );
00288 }
00290 SSP_FORCEINLINE __m128i ssp_mpsadbw_epu8_SSE4_1(__m128i a, __m128i b, const int msk)
00291 {
00292     switch( msk & 0x7 )
00293     {
00294         CASE_8( _mm_mpsadbw_epu8, a, b );
00295     }
00296 }
00298 SSP_FORCEINLINE __m128i ssp_stream_load_si128_SSE4_1(__m128i* a)
00299 {
00300     return _mm_stream_load_si128( a );
00301 }
00302 
00303 
00304 //
00305 // Functions common with SSE5
00306 //
00307 
00309 SSP_FORCEINLINE int ssp_testz_si128_SSE4_1(__m128i mask, __m128i a)
00310 {
00311     return _mm_testz_si128( mask, a);
00312 }
00314 SSP_FORCEINLINE int ssp_testc_si128_SSE4_1(__m128i mask, __m128i a)
00315 {
00316     return _mm_testc_si128( mask, a);
00317 }
00319 SSP_FORCEINLINE int ssp_testnzc_si128_SSE4_1(__m128i mask, __m128i b)
00320 {
00321     return _mm_testnzc_si128( mask, b);
00322 }
00323 
00325 SSP_FORCEINLINE __m128d ssp_round_pd_SSE4_1(__m128d a, int iRoundMode)
00326 {
00327     switch( iRoundMode & 0xF )
00328     {
00329         CASE_16( _mm_round_pd, a );
00330     }
00331 }
00333 SSP_FORCEINLINE __m128d ssp_round_sd_SSE4_1(__m128d dst, __m128d a, int iRoundMode)
00334 {
00335     switch( iRoundMode & 0xF )
00336     {
00337         CASE_16( _mm_round_sd, dst, a );
00338     }
00339 }
00341 SSP_FORCEINLINE __m128 ssp_round_ps_SSE4_1(__m128  a, int iRoundMode)
00342 {
00343     switch( iRoundMode & 0xF )
00344     {
00345         CASE_16( _mm_round_ps, a );
00346     }
00347 }
00349 SSP_FORCEINLINE __m128 ssp_round_ss_SSE4_1(__m128 dst, __m128  a, int iRoundMode)
00350 {
00351     switch( iRoundMode & 0xF )
00352     {
00353         CASE_16( _mm_round_ss, dst, a );
00354     }
00355 }
00356 
00357 
00358 #ifdef SYS64
00359 
00360 SSP_FORCEINLINE __m128i ssp_insert_epi64_SSE4_1(__m128i dst, ssp_s64 s, const int ndx)
00361 {
00362     switch( ndx & 0x1 )
00363     {
00364         CASE_2( _mm_insert_epi64, dst, s );
00365     }
00366 }
00367 
00369 SSP_FORCEINLINE ssp_s64 ssp_extract_epi64_SSE4_1(__m128i src, const int ndx)
00370 {
00371     switch( ndx & 0x1 )
00372     {
00373         CASE_2( _mm_extract_epi64, src );
00374     }
00375 }
00376 #endif
00377 
00382 #endif // __SSEPLUS_NATIVE_SSE4_1_H__

Generated on Wed May 21 13:44:11 2008 for "SSEPlus" by  doxygen 1.5.4