SSE[3,4A,...,5] implemented in SSE2 | |
SSP_FORCEINLINE __m128i | ssp_comeq_epi16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comeq_epi32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comeq_epi64_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comeq_epi8_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comeq_epu16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comeq_epu32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comeq_epu64_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comeq_epu8_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128d | ssp_comeq_pd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comeq_ps_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_comeq_sd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comeq_ss_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128i | ssp_comlt_epi16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comlt_epi32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comlt_epi64_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comlt_epi8_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comlt_epu16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comlt_epu32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comlt_epu64_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comlt_epu8_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128d | ssp_comlt_pd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comlt_ps_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_comlt_sd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comlt_ss_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128i | ssp_comle_epi16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comle_epi32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comle_epi64_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comle_epi8_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comle_epu16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comle_epu32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comle_epu64_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comle_epu8_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128d | ssp_comle_pd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comle_ps_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_comle_sd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comle_ss_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_comunord_pd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comunord_ps_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_comunord_sd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comunord_ss_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128i | ssp_comneq_epi16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comneq_epi32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comneq_epi64_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comneq_epi8_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comneq_epu16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comneq_epu32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comneq_epu64_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comneq_epu8_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128d | ssp_comneq_pd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comneq_ps_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_comneq_sd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comneq_ss_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_comnlt_pd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comnlt_ps_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_comnlt_sd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comnlt_ss_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_comnle_pd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comnle_ps_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_comnle_sd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comnle_ss_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_comord_pd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comord_ps_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_comord_sd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comord_ss_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_comueq_pd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comueq_ps_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_comueq_sd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comueq_ss_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_comnge_pd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comnge_ps_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_comnge_sd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comnge_ss_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_comngt_pd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comngt_ps_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_comngt_sd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comngt_ss_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128i | ssp_comfalse_epi16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comfalse_epi32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comfalse_epi64_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comfalse_epi8_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comfalse_epu16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comfalse_epu32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comfalse_epu64_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comfalse_epu8_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128d | ssp_comfalse_pd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comfalse_ps_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_comfalse_sd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comfalse_ss_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_comoneq_pd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comoneq_ps_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_comoneq_sd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comoneq_ss_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128i | ssp_comge_epi16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comge_epi32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comge_epi64_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comge_epi8_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comge_epu16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comge_epu32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comge_epu64_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comge_epu8_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128d | ssp_comge_pd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comge_ps_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_comge_sd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comge_ss_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128i | ssp_comgt_epi16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comgt_epi32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comgt_epi64_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comgt_epi8_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comgt_epu16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comgt_epu32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comgt_epu64_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comgt_epu8_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128d | ssp_comgt_pd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comgt_ps_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_comgt_sd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comgt_ss_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128i | ssp_comtrue_epi16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comtrue_epi32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comtrue_epi64_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comtrue_epi8_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comtrue_epu16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comtrue_epu32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comtrue_epu64_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_comtrue_epu8_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128d | ssp_comtrue_pd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comtrue_ps_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_comtrue_sd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_comtrue_ss_SSE2 (__m128 a, __m128 b) |
SSE[3,4A,...,5] implemented in SSE2 | |
SSP_FORCEINLINE __m128i | ssp_macc_epi16_SSE2 (__m128i a, __m128i b, __m128i c) |
SSP_FORCEINLINE __m128i | ssp_macc_epi32_SSE2 (__m128i a, __m128i b, __m128i c) |
SSP_FORCEINLINE __m128d | ssp_macc_pd_SSE2 (__m128d a, __m128d b, __m128d c) |
SSP_FORCEINLINE __m128 | ssp_macc_ps_SSE2 (__m128 a, __m128 b, __m128 c) |
SSP_FORCEINLINE __m128d | ssp_macc_sd_SSE2 (__m128d a, __m128d b, __m128d c) |
SSP_FORCEINLINE __m128 | ssp_macc_ss_SSE2 (__m128 a, __m128 b, __m128 c) |
SSP_FORCEINLINE __m128i | ssp_maccd_epi16_SSE2 (__m128i a, __m128i b, __m128i c) |
SSP_FORCEINLINE __m128i | ssp_macchi_epi32_SSE2 (__m128i a, __m128i b, __m128i c) |
SSP_FORCEINLINE __m128i | ssp_macclo_epi32_SSE2 (__m128i a, __m128i b, __m128i c) |
SSP_FORCEINLINE __m128i | ssp_maccs_epi16_SSE2 (__m128i a, __m128i b, __m128i c) |
SSP_FORCEINLINE __m128i | ssp_maccs_epi32_SSE2 (__m128i a, __m128i b, __m128i c) |
SSP_FORCEINLINE __m128 | ssp_nmacc_ps_SSE2 (__m128 a, __m128 b, __m128 c) |
SSP_FORCEINLINE __m128d | ssp_nmacc_pd_SSE2 (__m128d a, __m128d b, __m128d c) |
SSP_FORCEINLINE __m128 | ssp_nmacc_ss_SSE2 (__m128 a, __m128 b, __m128 c) |
SSP_FORCEINLINE __m128d | ssp_nmacc_sd_SSE2 (__m128d a, __m128d b, __m128d c) |
SSP_FORCEINLINE __m128 | ssp_msub_ps_SSE2 (__m128 a, __m128 b, __m128 c) |
SSP_FORCEINLINE __m128d | ssp_msub_pd_SSE2 (__m128d a, __m128d b, __m128d c) |
SSP_FORCEINLINE __m128 | ssp_msub_ss_SSE2 (__m128 a, __m128 b, __m128 c) |
SSP_FORCEINLINE __m128d | ssp_msub_sd_SSE2 (__m128d a, __m128d b, __m128d c) |
SSP_FORCEINLINE __m128 | ssp_nmsub_ps_SSE2 (__m128 a, __m128 b, __m128 c) |
SSP_FORCEINLINE __m128d | ssp_nmsub_pd_SSE2 (__m128d a, __m128d b, __m128d c) |
SSP_FORCEINLINE __m128 | ssp_nmsub_ss_SSE2 (__m128 a, __m128 b, __m128 c) |
SSP_FORCEINLINE __m128d | ssp_nmsub_sd_SSE2 (__m128d a, __m128d b, __m128d c) |
SSP_FORCEINLINE __m128i | ssp_abs_epi8_SSE2 (__m128i a) |
SSP_FORCEINLINE __m128i | ssp_abs_epi16_SSE2 (__m128i a) |
SSP_FORCEINLINE __m128i | ssp_abs_epi32_SSE2 (__m128i a) |
SSP_FORCEINLINE __m128 | ssp_addsub_ps_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_addsub_pd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128i | ssp_blend_epi16_SSE2 (__m128i a, __m128i b, const int mask) |
SSP_FORCEINLINE __m128d | ssp_blend_pd_SSE2 (__m128d a, __m128d b, const int mask) |
SSP_FORCEINLINE __m128 | ssp_blend_ps_SSE2 (__m128 a, __m128 b, const int mask) |
SSP_FORCEINLINE __m128i | ssp_blendv_epi8_SSE2 (__m128i a, __m128i b, __m128i mask) |
SSP_FORCEINLINE __m128d | ssp_blendv_pd_SSE2 (__m128d a, __m128d b, __m128d mask) |
SSP_FORCEINLINE __m128 | ssp_blendv_ps_SSE2 (__m128 a, __m128 b, __m128 mask) |
SSP_FORCEINLINE __m128i | ssp_cmpeq_epi64_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_hadd_epi16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_hadds_epi16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_hsub_epi16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_hsubs_epi16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_hadd_epi32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_hsub_epi32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128 | ssp_hadd_ps_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128 | ssp_hsub_ps_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128d | ssp_hadd_pd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128d | ssp_hsub_pd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128i | ssp_mulhrs_epi16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_insert_epi32_SSE2 (__m128i a, int b, const int ndx) |
SSP_FORCEINLINE __m128i | ssp_min_epi8_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_max_epi8_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_min_epu16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_max_epu16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_min_epi32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_max_epi32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_min_epu32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_max_epu32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_maddubs_epi16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_mpsadbw_epu8_SSE2 (__m128i a, __m128i b, const int msk) |
SSP_FORCEINLINE __m128d | ssp_dp_pd_SSE2 (__m128d a, __m128d b, const int mask) |
SSP_FORCEINLINE __m128 | ssp_dp_ps_SSE2 (__m128 a, __m128 b, const int mask) |
SSP_FORCEINLINE __m128 | ssp_round_ps_SSE2 (__m128 a, int iRoundMode) |
SSP_FORCEINLINE __m128d | ssp_round_pd_SSE2 (__m128d a, int iRoundMode) |
SSP_FORCEINLINE __m128 | ssp_round_ss_SSE2 (__m128 a, __m128 b, int iRoundMode) |
SSP_FORCEINLINE __m128 | ssp_ceil_ps_SSE2 (__m128 a) |
SSP_FORCEINLINE __m128 | ssp_floor_ps_SSE2 (__m128 a) |
SSP_FORCEINLINE __m128d | ssp_floor_pd_SSE2 (__m128d a) |
SSP_FORCEINLINE __m128d | ssp_ceil_pd_SSE2 (__m128d a) |
SSP_FORCEINLINE __m128d | ssp_floor_sd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128d | ssp_ceil_sd_SSE2 (__m128d a, __m128d b) |
SSP_FORCEINLINE __m128 | ssp_floor_ss_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128 | ssp_ceil_ss_SSE2 (__m128 a, __m128 b) |
SSP_FORCEINLINE __m128i | ssp_cvtepi8_epi16_SSE2 (__m128i a) |
SSP_FORCEINLINE __m128i | ssp_cvtepi8_epi32_SSE2 (__m128i a) |
SSP_FORCEINLINE __m128i | ssp_cvtepi8_epi64_SSE2 (__m128i a) |
SSP_FORCEINLINE __m128i | ssp_cvtepi16_epi32_SSE2 (__m128i a) |
SSP_FORCEINLINE __m128i | ssp_cvtepi16_epi64_SSE2 (__m128i a) |
SSP_FORCEINLINE __m128i | ssp_cvtepi32_epi64_SSE2 (__m128i a) |
SSP_FORCEINLINE __m128i | ssp_cvtepu8_epi16_SSE2 (__m128i a) |
SSP_FORCEINLINE __m128i | ssp_cvtepu8_epi32_SSE2 (__m128i a) |
SSP_FORCEINLINE __m128i | ssp_cvtepu8_epi64_SSE2 (__m128i a) |
SSP_FORCEINLINE __m128i | ssp_cvtepu16_epi32_SSE2 (__m128i a) |
SSP_FORCEINLINE __m128i | ssp_cvtepu16_epi64_SSE2 (__m128i a) |
SSP_FORCEINLINE __m128i | ssp_cvtepu32_epi64_SSE2 (__m128i a) |
SSP_FORCEINLINE __m128i | ssp_packus_epi32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_alignr_epi8_SSE2 (__m128i a, __m128i b, const int ralign) |
SSP_FORCEINLINE __m128i | ssp_insert_epi8_SSE2 (__m128i a, int b, const int ndx) |
SSP_FORCEINLINE __m128i | ssp_inserti_si64_SSE2 (__m128i a, __m128i b, int len, int ndx) |
SSP_FORCEINLINE __m128i | ssp_insert_si64_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE int | ssp_extract_epi8_SSE2 (__m128i a, const int ndx) |
SSP_FORCEINLINE int | ssp_extract_epi32_SSE2 (__m128i a, const int imm) |
SSP_FORCEINLINE int | ssp_extract_ps_SSE2 (__m128 a, const int ndx) |
SSP_FORCEINLINE ssp_s64 | ssp_extract_epi64_SSE2 (__m128i a, const int ndx) |
SSP_FORCEINLINE __m128i | ssp_extracti_si64_SSE2 (__m128i a, int len, int ndx) |
SSP_FORCEINLINE __m128i | ssp_extract_si64_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_shuffle_epi8_SSE2 (__m128i a, __m128i mask) |
SSP_FORCEINLINE __m128i | ssp_sign_epi8_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_sign_epi16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_sign_epi32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE int | ssp_testc_si128_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE int | ssp_testz_si128_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE int | ssp_testnzc_si128_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128 | ssp_movehdup_ps_SSE2 (__m128 a) |
SSP_FORCEINLINE __m128 | ssp_moveldup_ps_SSE2 (__m128 a) |
SSP_FORCEINLINE __m128d | ssp_movedup_pd_SSE2 (__m128d a) |
SSP_FORCEINLINE __m128i | ssp_rot_epi8_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_rot_epi16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_rot_epi32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_rot_epi64_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_roti_epi8_SSE2 (__m128i a, const int b) |
SSP_FORCEINLINE __m128i | ssp_roti_epi16_SSE2 (__m128i a, const int b) |
SSP_FORCEINLINE __m128i | ssp_roti_epi32_SSE2 (__m128i a, const int b) |
SSP_FORCEINLINE __m128i | ssp_roti_epi64_SSE2 (__m128i a, const int b) |
SSP_FORCEINLINE __m128i | ssp_shl_epi8_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_sha_epi8_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_shl_epi16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_sha_epi16_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_shl_epi32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_sha_epi32_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_shl_epi64_SSE2 (__m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_sha_epi64_SSE2 (__m128i a, __m128i b) |
/**
 * SSE2 implementation of _mm_abs_epi16 [SSSE3].
 *
 * Computes the absolute value of every signed 16-bit lane of a.
 * Original location: SSEPlus_emulation_SSE2.h, line 409.
 *
 * @param a  Eight signed 16-bit integers.
 * @return   Per-lane absolute value (the most negative value wraps to itself).
 */
SSP_FORCEINLINE __m128i ssp_abs_epi16_SSE2( __m128i a )
{
    /* 0xFFFF in every lane that holds a negative value, 0x0000 elsewhere. */
    const __m128i sign = _mm_cmplt_epi16( a, _mm_setzero_si128() );

    /* Two's-complement negate of the negative lanes only:
       (a ^ sign) - sign is ~a + 1 where sign is all ones, and a where it is zero. */
    return _mm_sub_epi16( _mm_xor_si128( a, sign ), sign );
}
/**
 * SSE2 implementation of _mm_abs_epi32 [SSSE3].
 *
 * Computes the absolute value of every signed 32-bit lane of a.
 * Original location: SSEPlus_emulation_SSE2.h, line 420.
 *
 * @param a  Four signed 32-bit integers.
 * @return   Per-lane absolute value (the most negative value wraps to itself).
 */
SSP_FORCEINLINE __m128i ssp_abs_epi32_SSE2( __m128i a )
{
    /* 0xFFFFFFFF in every lane that holds a negative value, 0 elsewhere. */
    const __m128i sign = _mm_cmplt_epi32( a, _mm_setzero_si128() );

    /* Two's-complement negate of the negative lanes only:
       (a ^ sign) - sign is ~a + 1 where sign is all ones, and a where it is zero. */
    return _mm_sub_epi32( _mm_xor_si128( a, sign ), sign );
}
/**
 * SSE2 implementation of _mm_abs_epi8 [SSSE3].
 *
 * Computes the absolute value of every signed 8-bit lane of a.
 * Original location: SSEPlus_emulation_SSE2.h, line 397.
 *
 * @param a  Sixteen signed 8-bit integers.
 * @return   Per-lane absolute value (the most negative value wraps to itself).
 */
SSP_FORCEINLINE __m128i ssp_abs_epi8_SSE2( __m128i a )
{
    /* 0xFF in every lane that holds a negative value, 0x00 elsewhere. */
    const __m128i sign = _mm_cmplt_epi8( a, _mm_setzero_si128() );

    /* Two's-complement negate of the negative lanes only:
       (a ^ sign) - sign is ~a + 1 where sign is all ones, and a where it is zero. */
    return _mm_sub_epi8( _mm_xor_si128( a, sign ), sign );
}
/**
 * SSE2 implementation of _mm_addsub_pd [SSE3].
 *
 * Scales b by the constant pair (1, -1) and adds the product to a, so one
 * lane ends up as a + b and the other as a - b.
 * Original location: SSEPlus_emulation_SSE2.h, line 443.
 *
 * NOTE(review): the lane order of SSP_CONST_SET_64F is defined elsewhere;
 * presumably the -1 lands in the low lane (low = a - b, high = a + b) - confirm.
 *
 * @param a  Two doubles.
 * @param b  Two doubles.
 * @return   a combined with the sign-adjusted b.
 */
SSP_FORCEINLINE __m128d ssp_addsub_pd_SSE2( __m128d a, __m128d b )
{
    static const __m128d sign_pattern = SSP_CONST_SET_64F( 1, -1 );

    return _mm_add_pd( a, _mm_mul_pd( b, sign_pattern ) );
}
/**
 * SSE2 implementation of _mm_addsub_ps [SSE3].
 *
 * Scales b by the constant pattern (1, -1, 1, -1) and adds the product to a,
 * so lanes alternate between a + b and a - b.
 * Original location: SSEPlus_emulation_SSE2.h, line 432.
 *
 * NOTE(review): the lane order of SSP_CONST_SET_32F is defined elsewhere;
 * presumably the -1 entries land in the even lanes (even = a - b,
 * odd = a + b) - confirm.
 *
 * @param a  Four floats.
 * @param b  Four floats.
 * @return   a combined with the sign-adjusted b.
 */
SSP_FORCEINLINE __m128 ssp_addsub_ps_SSE2( __m128 a, __m128 b )
{
    static const __m128 sign_pattern = SSP_CONST_SET_32F( 1, -1, 1, -1 );

    return _mm_add_ps( a, _mm_mul_ps( b, sign_pattern ) );
}
/**
 * SSE2 implementation of _mm_alignr_epi8 [SSSE3].
 *
 * Treats a:b as one 32-byte value (a in the upper half), shifts it right by
 * ralign bytes and returns the low 16 bytes.
 * Original location: SSEPlus_emulation_SSE2.h, line 1300.
 *
 * The byte-shift intrinsics need compile-time immediates, hence the explicit
 * switch over every supported shift count.
 *
 * For 17 <= ralign <= 31 the result is a shifted right by (ralign - 16) bytes
 * with zero fill. The earlier listing shifted a left and then right by the
 * same count, which merely cleared the top bytes of a without moving the
 * remaining bytes down.
 *
 * @param a       Upper 16 bytes of the concatenated value.
 * @param b       Lower 16 bytes of the concatenated value.
 * @param ralign  Right shift in bytes. Negative values return b unchanged
 *                (behaviour kept from the original); values above 31 yield zero.
 * @return        The low 16 bytes of (a:b) >> (ralign * 8).
 */
SSP_FORCEINLINE __m128i ssp_alignr_epi8_SSE2( __m128i a, __m128i b, const int ralign )
{
    if( ralign < 0 )
    {
        return b;   /* only right shifts are supported; no negative counts */
    }

    switch( ralign )
    {
    /* 0..15: low bytes come from b, high bytes from a. */
    case  0: return b;
    case  1: return _mm_or_si128( _mm_slli_si128( a, 15 ), _mm_srli_si128( b,  1 ) );
    case  2: return _mm_or_si128( _mm_slli_si128( a, 14 ), _mm_srli_si128( b,  2 ) );
    case  3: return _mm_or_si128( _mm_slli_si128( a, 13 ), _mm_srli_si128( b,  3 ) );
    case  4: return _mm_or_si128( _mm_slli_si128( a, 12 ), _mm_srli_si128( b,  4 ) );
    case  5: return _mm_or_si128( _mm_slli_si128( a, 11 ), _mm_srli_si128( b,  5 ) );
    case  6: return _mm_or_si128( _mm_slli_si128( a, 10 ), _mm_srli_si128( b,  6 ) );
    case  7: return _mm_or_si128( _mm_slli_si128( a,  9 ), _mm_srli_si128( b,  7 ) );
    case  8: return _mm_or_si128( _mm_slli_si128( a,  8 ), _mm_srli_si128( b,  8 ) );
    case  9: return _mm_or_si128( _mm_slli_si128( a,  7 ), _mm_srli_si128( b,  9 ) );
    case 10: return _mm_or_si128( _mm_slli_si128( a,  6 ), _mm_srli_si128( b, 10 ) );
    case 11: return _mm_or_si128( _mm_slli_si128( a,  5 ), _mm_srli_si128( b, 11 ) );
    case 12: return _mm_or_si128( _mm_slli_si128( a,  4 ), _mm_srli_si128( b, 12 ) );
    case 13: return _mm_or_si128( _mm_slli_si128( a,  3 ), _mm_srli_si128( b, 13 ) );
    case 14: return _mm_or_si128( _mm_slli_si128( a,  2 ), _mm_srli_si128( b, 14 ) );
    case 15: return _mm_or_si128( _mm_slli_si128( a,  1 ), _mm_srli_si128( b, 15 ) );

    /* 16: b has been shifted out completely. */
    case 16: return a;

    /* 17..31: only a contributes, shifted right by (ralign - 16) bytes. */
    case 17: return _mm_srli_si128( a,  1 );
    case 18: return _mm_srli_si128( a,  2 );
    case 19: return _mm_srli_si128( a,  3 );
    case 20: return _mm_srli_si128( a,  4 );
    case 21: return _mm_srli_si128( a,  5 );
    case 22: return _mm_srli_si128( a,  6 );
    case 23: return _mm_srli_si128( a,  7 );
    case 24: return _mm_srli_si128( a,  8 );
    case 25: return _mm_srli_si128( a,  9 );
    case 26: return _mm_srli_si128( a, 10 );
    case 27: return _mm_srli_si128( a, 11 );
    case 28: return _mm_srli_si128( a, 12 );
    case 29: return _mm_srli_si128( a, 13 );
    case 30: return _mm_srli_si128( a, 14 );
    case 31: return _mm_srli_si128( a, 15 );

    /* 32 and above: everything has been shifted out. */
    default: return _mm_setzero_si128();
    }
}
/**
 * SSE2 implementation of _mm_blend_epi16 [SSE4.1].
 *
 * Builds a per-lane all-ones/all-zeros selector from the bits of mask and
 * uses it to pick each 16-bit lane from b (selector set) or a (selector clear).
 * Original location: SSEPlus_emulation_SSE2.h, line 458.
 *
 * NOTE(review): the lane order of SSP_CONST_SET_16I is defined elsewhere;
 * presumably each lane i receives the multiplier that moves mask bit i into
 * bit 15 - confirm.
 *
 * @param a     Lanes used where the selector is clear.
 * @param b     Lanes used where the selector is set.
 * @param mask  Lane-selection bits.
 * @return      The blended vector.
 */
SSP_FORCEINLINE __m128i ssp_blend_epi16_SSE2( __m128i a, __m128i b, const int mask )
{
    /* Multiplying by a power of two acts as a per-lane left shift that moves
       one mask bit into the most significant bit of the lane. */
    static const __m128i bit_to_msb = SSP_CONST_SET_16I( 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, 0x8000 );

    __m128i select = _mm_mullo_epi16( _mm_set1_epi16( mask ), bit_to_msb );

    /* Arithmetic shift smears the MSB across the lane: 0xFFFF or 0x0000. */
    select = _mm_srai_epi16( select, 15 );

    return _mm_or_si128( _mm_andnot_si128( select, a ),
                         _mm_and_si128   ( select, b ) );
}
/**
 * SSE2 implementation of _mm_blend_pd [SSE4.1].
 *
 * Bit 0 of mask selects the low double and bit 1 the high double: a set bit
 * takes the lane from b, a clear bit takes it from a.
 * Original location: SSEPlus_emulation_SSE2.h, line 474.
 *
 * The selection is purely bitwise. The earlier listing multiplied the inputs
 * by 0.0/1.0 and OR-ed the products, which is wrong for two reasons: a
 * rejected negative lane becomes -0.0, whose sign bit is then OR-ed into the
 * selected value, and a rejected Inf or NaN lane becomes NaN.
 *
 * @param a     Lanes used where the mask bit is clear.
 * @param b     Lanes used where the mask bit is set.
 * @param mask  Two-bit lane selector (bit 0 = low lane, bit 1 = high lane).
 * @return      The blended vector, bit-exact copies of the chosen lanes.
 */
SSP_FORCEINLINE __m128d ssp_blend_pd_SSE2( __m128d a, __m128d b, const int mask )
{
    ssp_m128 screen;

    /* 0 or -1 (all bits set) per lane, depending on the corresponding mask bit. */
    const int lo = -( mask & 0x1 );
    const int hi = -( ( mask & 0x2 ) >> 1 );

    /* Each 64-bit lane is covered by two identical 32-bit words. */
    screen.i = _mm_set_epi32( hi, hi, lo, lo );

    b = _mm_and_pd   ( screen.d, b );   /* keep the selected lanes of b   */
    a = _mm_andnot_pd( screen.d, a );   /* keep the remaining lanes of a  */
    return _mm_or_pd( a, b );
}
/**
 * SSE2 implementation of _mm_blend_ps [SSE4.1].
 *
 * Converts the immediate mask to a vector selector with
 * ssp_movmask_imm8_to_epi32_SSE2 and combines the integer views of b and a
 * through ssp_logical_bitwise_select_SSE2.
 * Original location: SSEPlus_emulation_SSE2.h, line 487.
 *
 * NOTE(review): both helpers are defined elsewhere; presumably the first
 * expands each mask bit to a full 32-bit lane and the second returns its first
 * argument where the selector is set - confirm.
 *
 * @param a     First source vector.
 * @param b     Second source vector.
 * @param mask  Lane-selection bits.
 * @return      The blended vector.
 */
SSP_FORCEINLINE __m128 ssp_blend_ps_SSE2( __m128 a, __m128 b, const int mask )
{
    ssp_m128 src_a, src_b, out;

    /* Reinterpret the float vectors as integers through the union. */
    src_a.f = a;
    src_b.f = b;

    out.i = ssp_logical_bitwise_select_SSE2( src_b.i, src_a.i,
                                             ssp_movmask_imm8_to_epi32_SSE2( mask ) );
    return out.f;
}
/**
 * SSE2 implementation of _mm_blendv_epi8 [SSE4.1].
 *
 * For every byte, the most significant bit of the mask byte decides the
 * source: set takes the byte from b, clear takes it from a.
 * Original location: SSEPlus_emulation_SSE2.h, line 499.
 *
 * @param a     Bytes used where the mask byte's MSB is clear.
 * @param b     Bytes used where the mask byte's MSB is set.
 * @param mask  Per-byte selector; only bit 7 of each byte is examined.
 * @return      The blended vector.
 */
SSP_FORCEINLINE __m128i ssp_blendv_epi8_SSE2( __m128i a, __m128i b, __m128i mask )
{
    /* A byte with its MSB set is negative as a signed value, so the signed
       compare yields 0xFF exactly for those bytes and 0x00 for the others. */
    const __m128i pick_b = _mm_cmplt_epi8( mask, _mm_setzero_si128() );

    return _mm_or_si128( _mm_andnot_si128( pick_b, a ),
                         _mm_and_si128   ( b, pick_b ) );
}
/**
 * SSE2 implementation of _mm_blendv_pd [SSE4.1].
 *
 * For each 64-bit lane, the sign bit of the mask lane decides the source:
 * set takes the lane from b, clear takes it from a.
 * Original location: SSEPlus_emulation_SSE2.h, line 521.
 *
 * @param a     Lanes used where the mask sign bit is clear.
 * @param b     Lanes used where the mask sign bit is set.
 * @param mask  Per-lane selector; only the sign bit of each double is examined.
 * @return      The blended vector.
 */
SSP_FORCEINLINE __m128d ssp_blendv_pd_SSE2( __m128d a, __m128d b, __m128d mask )
{
    ssp_m128 src_a, src_b, sel;

    /* Reinterpret the double vectors as integers through the union. */
    src_a.d = a;
    src_b.d = b;
    sel.d   = mask;

    /* Copy the upper 32-bit word of each 64-bit lane over the lower one, then
       smear its sign bit so that the whole lane is all ones or all zeros. */
    sel.i = _mm_srai_epi32( _mm_shuffle_epi32( sel.i, _MM_SHUFFLE(3, 3, 1, 1) ), 31 );

    src_a.i = _mm_or_si128( _mm_andnot_si128( sel.i, src_a.i ),
                            _mm_and_si128   ( src_b.i, sel.i ) );
    return src_a.d;
}
/**
 * SSE2 implementation of _mm_blendv_ps [SSE4.1].
 *
 * For each 32-bit lane, the sign bit of the mask lane decides the source:
 * set takes the lane from b, clear takes it from a.
 * Original location: SSEPlus_emulation_SSE2.h, line 537.
 *
 * @param a     Lanes used where the mask sign bit is clear.
 * @param b     Lanes used where the mask sign bit is set.
 * @param mask  Per-lane selector; only the sign bit of each float is examined.
 * @return      The blended vector.
 */
SSP_FORCEINLINE __m128 ssp_blendv_ps_SSE2( __m128 a, __m128 b, __m128 mask )
{
    ssp_m128 src_a, src_b, sel;

    /* Reinterpret the float vectors as integers through the union. */
    src_a.f = a;
    src_b.f = b;
    sel.f   = mask;

    /* Smear each lane's sign bit: all ones where set, all zeros otherwise. */
    sel.i = _mm_srai_epi32( sel.i, 31 );

    src_a.i = _mm_or_si128( _mm_andnot_si128( sel.i, src_a.i ),
                            _mm_and_si128   ( src_b.i, sel.i ) );
    return src_a.f;
}
/**
 * SSE2 implementation of _mm_ceil_pd [SSE4.1].
 *
 * Delegates to ssp_round_pd_SSE2 with the SSP_FROUND_TO_POS_INF rounding mode.
 * Original location: SSEPlus_emulation_SSE2.h, line 1060.
 *
 * @param a  Two doubles.
 * @return   The result of rounding a with SSP_FROUND_TO_POS_INF.
 */
SSP_FORCEINLINE __m128d ssp_ceil_pd_SSE2( __m128d a )
{
    const __m128d rounded_up = ssp_round_pd_SSE2( a, SSP_FROUND_TO_POS_INF );
    return rounded_up;
}
/**
 * SSE2 implementation of _mm_ceil_ps [SSE4.1].
 *
 * Delegates to ssp_round_ps_SSE2 with the SSP_FROUND_TO_POS_INF rounding mode.
 * Original location: SSEPlus_emulation_SSE2.h, line 1039.
 *
 * @param a  Four floats.
 * @return   The result of rounding a with SSP_FROUND_TO_POS_INF.
 */
SSP_FORCEINLINE __m128 ssp_ceil_ps_SSE2( __m128 a )
{
    const __m128 rounded_up = ssp_round_ps_SSE2( a, SSP_FROUND_TO_POS_INF );
    return rounded_up;
}
/**
 * SSE2 implementation of _mm_ceil_sd [SSE4.1].
 *
 * Rounds b with SSP_FROUND_TO_POS_INF and returns a vector whose low double
 * is the rounded low double of b and whose high double is copied from a.
 * Original location: SSEPlus_emulation_SSE2.h, line 1074.
 *
 * @param a  Supplies the high double of the result.
 * @param b  Its low double is rounded and becomes the low double of the result.
 * @return   { low: rounded b[0], high: a[1] }.
 */
SSP_FORCEINLINE __m128d ssp_ceil_sd_SSE2( __m128d a, __m128d b )
{
    const __m128d rounded_up = ssp_round_pd_SSE2( b, SSP_FROUND_TO_POS_INF );

    /* Low lane from the rounded vector, high lane from a. */
    return _mm_move_sd( a, rounded_up );
}
/**
 * SSE2 implementation of _mm_ceil_ss [SSE4.1].
 *
 * Rounds b with SSP_FROUND_TO_POS_INF and returns a vector whose lowest float
 * is the rounded lowest float of b and whose upper three floats are copied
 * from a.
 * Original location: SSEPlus_emulation_SSE2.h, line 1090.
 *
 * @param a  Supplies the upper three floats of the result.
 * @param b  Its lowest float is rounded and becomes the lowest float of the result.
 * @return   { rounded b[0], a[1], a[2], a[3] }.
 */
SSP_FORCEINLINE __m128 ssp_ceil_ss_SSE2( __m128 a, __m128 b )
{
    const __m128 rounded_up = ssp_round_ps_SSE2( b, SSP_FROUND_TO_POS_INF );

    /* Lowest lane from the rounded vector, remaining lanes from a. */
    return _mm_move_ss( a, rounded_up );
}
/**
 * SSE2 implementation of _mm_cmpeq_epi64 [SSE4.1].
 *
 * Thin wrapper that forwards to ssp_comeq_epi64_SSE2.
 * Original location: SSEPlus_emulation_SSE2.h, line 557.
 *
 * @param a  First operand.
 * @param b  Second operand.
 * @return   Whatever ssp_comeq_epi64_SSE2 returns for (a, b).
 */
SSP_FORCEINLINE __m128i ssp_cmpeq_epi64_SSE2( __m128i a, __m128i b )
{
    const __m128i equal_mask = ssp_comeq_epi64_SSE2( a, b );
    return equal_mask;
}
SSP_FORCEINLINE __m128i ssp_comeq_epi16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comeq_epi16/ pcomw [SSE5]. (SSE5 .pdf documentation here)
Definition at line 21 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comeq_epi32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comeq_epi32/ pcomd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 28 of file SSEPlus_emulation_comps_SSE2.h.
/**
 * SSE2 implementation of _mm_comeq_epi64 / pcomq [SSE5].
 *
 * Compares the two 64-bit lanes of a and b for equality. Each result lane is
 * all ones when the lanes are equal and all zeros otherwise.
 * Original location: SSEPlus_emulation_comps_SSE2.h, line 35.
 *
 * Uses SSE2 intrinsics only. The earlier listing duplicated the 32-bit
 * compare results with _mm_movehdup_ps / _mm_moveldup_ps, which are SSE3
 * intrinsics and therefore unavailable on the SSE2-only targets this
 * emulation path exists for.
 *
 * @param a  Two 64-bit integers.
 * @param b  Two 64-bit integers.
 * @return   Per-lane equality mask.
 */
SSP_FORCEINLINE __m128i ssp_comeq_epi64_SSE2( __m128i a, __m128i b )
{
    /* 32-bit equality per word: [ A0==B0, A1==B1, A2==B2, A3==B3 ]. */
    const __m128i eq32 = _mm_cmpeq_epi32( a, b );

    /* Swap the two words inside each 64-bit lane: [ A1==B1, A0==B0, A3==B3, A2==B2 ]. */
    const __m128i eq32_swapped = _mm_shuffle_epi32( eq32, _MM_SHUFFLE(2, 3, 0, 1) );

    /* A 64-bit lane is equal only when both of its 32-bit halves are equal. */
    return _mm_and_si128( eq32, eq32_swapped );
}
SSP_FORCEINLINE __m128i ssp_comeq_epi8_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comeq_epi8/ pcomb [SSE5]. (SSE5 .pdf documentation here)
Definition at line 48 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comeq_epu16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comeq_epu16/ pcomuw [SSE5]. (SSE5 .pdf documentation here)
Definition at line 55 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comeq_epu32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comeq_epu32/ pcomud [SSE5]. (SSE5 .pdf documentation here)
Definition at line 62 of file SSEPlus_emulation_comps_SSE2.h.
/**
 * SSE2 implementation of _mm_comeq_epu64 / pcomuq [SSE5].
 *
 * Forwards to ssp_comeq_epi64_SSE2 (the signed 64-bit equality compare).
 * Original location: SSEPlus_emulation_comps_SSE2.h, line 69.
 *
 * @param a  First operand.
 * @param b  Second operand.
 * @return   Whatever ssp_comeq_epi64_SSE2 returns for (a, b).
 */
SSP_FORCEINLINE __m128i ssp_comeq_epu64_SSE2( __m128i a, __m128i b )
{
    return ssp_comeq_epi64_SSE2( a, b );
}
SSP_FORCEINLINE __m128i ssp_comeq_epu8_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comeq_epu8/ pcomub [SSE5]. (SSE5 .pdf documentation here)
Definition at line 76 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128d ssp_comeq_pd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comeq_pd/ compd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 83 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comeq_ps_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comeq_ps/ comps [SSE5]. (SSE5 .pdf documentation here)
Definition at line 90 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128d ssp_comeq_sd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comeq_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 97 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comeq_ss_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comeq_ss/ comss [SSE5]. (SSE5 .pdf documentation here)
Definition at line 104 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comfalse_epi16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comfalse_epi16/ pcomw [SSE5]. (SSE5 .pdf documentation here)
Definition at line 655 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comfalse_epi32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comfalse_epi32/ pcomd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 661 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comfalse_epi64_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comfalse_epi64/ pcomq [SSE5]. (SSE5 .pdf documentation here)
Definition at line 667 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comfalse_epi8_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comfalse_epi8/ pcomb [SSE5]. (SSE5 .pdf documentation here)
Definition at line 673 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comfalse_epu16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comfalse_epu16/ pcomuw [SSE5]. (SSE5 .pdf documentation here)
Definition at line 679 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comfalse_epu32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comfalse_epu32/ pcomud [SSE5]. (SSE5 .pdf documentation here)
Definition at line 685 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comfalse_epu64_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comfalse_epu64/ pcomuq [SSE5]. (SSE5 .pdf documentation here)
Definition at line 691 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comfalse_epu8_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comfalse_epu8/ pcomub [SSE5]. (SSE5 .pdf documentation here)
Definition at line 697 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128d ssp_comfalse_pd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comfalse_pd/ compd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 703 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comfalse_ps_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comfalse_ps/ comps [SSE5]. (SSE5 .pdf documentation here)
Definition at line 709 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128d ssp_comfalse_sd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSP_FORCEINLINE __m128 ssp_comfalse_ss_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSP_FORCEINLINE __m128i ssp_comge_epi16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comge_epi16/ pcomw [SSE5]. (SSE5 .pdf documentation here)
Definition at line 784 of file SSEPlus_emulation_comps_SSE2.h.
00785 { 00786 __m128i c; 00787 c = _mm_cmpgt_epi16( a, b ); 00788 a = _mm_cmpeq_epi16( a, b ); 00789 a = _mm_or_si128 ( a, c ); 00790 return a; 00791 }
SSP_FORCEINLINE __m128i ssp_comge_epi32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comge_epi32/ pcomd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 794 of file SSEPlus_emulation_comps_SSE2.h.
00795 { 00796 __m128i c; 00797 c = _mm_cmpgt_epi32( a, b ); 00798 a = _mm_cmpeq_epi32( a, b ); 00799 a = _mm_or_si128 ( a, c ); 00800 return a; 00801 }
SSP_FORCEINLINE __m128i ssp_comge_epi64_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comge_epi64/ pcomq [SSE5]. (SSE5 .pdf documentation here)
Definition at line 804 of file SSEPlus_emulation_comps_SSE2.h.
00805 { 00806 a = ssp_comge_epi64_REF( a, b ); 00807 return a; 00808 }
SSP_FORCEINLINE __m128i ssp_comge_epi8_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comge_epi8/ pcomb [SSE5]. (SSE5 .pdf documentation here)
Definition at line 811 of file SSEPlus_emulation_comps_SSE2.h.
00812 { 00813 __m128i c; 00814 c = _mm_cmpgt_epi8( a, b ); 00815 a = _mm_cmpeq_epi8( a, b ); 00816 a = _mm_or_si128 ( a, c ); 00817 return a; 00818 }
SSP_FORCEINLINE __m128i ssp_comge_epu16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comge_epu16/ pcomuw [SSE5]. (SSE5 .pdf documentation here)
Definition at line 822 of file SSEPlus_emulation_comps_SSE2.h.
00823 { 00824 __m128i mask; 00825 mask = ssp_comge_epi16_SSE2( a, b ); // FFFF where a >= b (signed) 00826 mask = ssp_logical_signinvert_16_SSE2( mask, a, b ); 00827 return mask; 00828 }
SSP_FORCEINLINE __m128i ssp_comge_epu32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comge_epu32/ pcomud [SSE5]. (SSE5 .pdf documentation here)
Definition at line 831 of file SSEPlus_emulation_comps_SSE2.h.
00832 { 00833 __m128i mask; 00834 mask = ssp_comge_epi32_SSE2( a, b ); // FFFF where a >= b (signed) 00835 mask = ssp_logical_signinvert_32_SSE2( mask, a, b ); 00836 return mask; 00837 }
SSP_FORCEINLINE __m128i ssp_comge_epu64_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comge_epu64/ pcomuq [SSE5]. (SSE5 .pdf documentation here)
Definition at line 840 of file SSEPlus_emulation_comps_SSE2.h.
00841 { 00842 a = ssp_comge_epu64_REF( a, b ); 00843 return a; 00844 }
SSP_FORCEINLINE __m128i ssp_comge_epu8_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comge_epu8/ pcomub [SSE5]. (SSE5 .pdf documentation here)
Definition at line 847 of file SSEPlus_emulation_comps_SSE2.h.
00848 { 00849 a = ssp_comge_epu8_REF( a, b ); 00850 return a; 00851 }
SSP_FORCEINLINE __m128d ssp_comge_pd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comge_pd/ compd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 854 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comge_ps_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comge_ps/ comps [SSE5]. (SSE5 .pdf documentation here)
Definition at line 861 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128d ssp_comge_sd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comge_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 868 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comge_ss_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comge_ss/ comss [SSE5]. (SSE5 .pdf documentation here)
Definition at line 875 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comgt_epi16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comgt_epi16/ pcomw [SSE5]. (SSE5 .pdf documentation here)
Definition at line 887 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comgt_epi32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comgt_epi32/ pcomd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 894 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comgt_epi64_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comgt_epi64/ pcomq [SSE5]. (SSE5 .pdf documentation here)
Definition at line 901 of file SSEPlus_emulation_comps_SSE2.h.
00902 { 00903 a = ssp_comgt_epi64_REF( a, b ); 00904 return a; 00905 }
SSP_FORCEINLINE __m128i ssp_comgt_epi8_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comgt_epi8/ pcomb [SSE5]. (SSE5 .pdf documentation here)
Definition at line 908 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comgt_epu16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comgt_epu16/ pcomuw [SSE5]. (SSE5 .pdf documentation here)
Definition at line 915 of file SSEPlus_emulation_comps_SSE2.h.
00916 { 00917 __m128i signMask, mask; 00918 00919 mask = _mm_cmpgt_epi16( a, b ); // FFFF where a > b (signed) 00920 signMask = _mm_xor_si128 ( a, b ); // Signbit is 1 where signs differ 00921 signMask = _mm_srai_epi16 ( signMask, 15 ); // fill all fields with sign bit 00922 mask = _mm_xor_si128 ( mask, signMask ); // Invert output where signs differed 00923 return mask; 00924 }
SSP_FORCEINLINE __m128i ssp_comgt_epu32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comgt_epu32/ pcomud [SSE5]. (SSE5 .pdf documentation here)
Definition at line 927 of file SSEPlus_emulation_comps_SSE2.h.
00928 { 00929 __m128i signMask, mask; 00930 00931 mask = _mm_cmpgt_epi32( a, b ); // FFFF where a > b (signed) 00932 signMask = _mm_xor_si128 ( a, b ); // Signbit is 1 where signs differ 00933 signMask = _mm_srai_epi32 ( signMask, 31 ); // fill all fields with sign bit 00934 mask = _mm_xor_si128 ( mask, signMask ); // Invert output where signs differed 00935 return mask; 00936 }
SSP_FORCEINLINE __m128i ssp_comgt_epu64_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comgt_epu64/ pcomuq [SSE5]. (SSE5 .pdf documentation here)
Definition at line 939 of file SSEPlus_emulation_comps_SSE2.h.
00940 { 00941 a = ssp_comgt_epu64_REF( a, b ); 00942 return a; 00943 }
SSP_FORCEINLINE __m128i ssp_comgt_epu8_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comgt_epu8/ pcomub [SSE5]. (SSE5 .pdf documentation here)
Definition at line 946 of file SSEPlus_emulation_comps_SSE2.h.
00947 { 00948 a = ssp_comgt_epu8_REF( a, b ); 00949 return a; 00950 }
SSP_FORCEINLINE __m128d ssp_comgt_pd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comgt_pd/ compd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 953 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comgt_ps_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comgt_ps/ comps [SSE5]. (SSE5 .pdf documentation here)
Definition at line 960 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128d ssp_comgt_sd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comgt_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 967 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comgt_ss_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comgt_ss/ comss [SSE5]. (SSE5 .pdf documentation here)
Definition at line 974 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comle_epi16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comle_epi16/ pcomw [SSE5]. (SSE5 .pdf documentation here)
Definition at line 213 of file SSEPlus_emulation_comps_SSE2.h.
00214 { 00215 __m128i c; 00216 c = _mm_cmplt_epi16( a, b ); 00217 a = _mm_cmpeq_epi16( a, b ); 00218 a = _mm_or_si128 ( a, c ); 00219 return a; 00220 }
SSP_FORCEINLINE __m128i ssp_comle_epi32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comle_epi32/ pcomd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 223 of file SSEPlus_emulation_comps_SSE2.h.
00224 { 00225 __m128i c; 00226 c = _mm_cmplt_epi32( a, b ); 00227 a = _mm_cmpeq_epi32( a, b ); 00228 a = _mm_or_si128 ( a, c ); 00229 return a; 00230 }
SSP_FORCEINLINE __m128i ssp_comle_epi64_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comle_epi64/ pcomq [SSE5]. (SSE5 .pdf documentation here)
Definition at line 233 of file SSEPlus_emulation_comps_SSE2.h.
00234 { 00235 a = ssp_comle_epi64_REF( a, b ); 00236 return a; 00237 }
SSP_FORCEINLINE __m128i ssp_comle_epi8_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comle_epi8/ pcomb [SSE5]. (SSE5 .pdf documentation here)
Definition at line 240 of file SSEPlus_emulation_comps_SSE2.h.
00241 { 00242 __m128i c; 00243 c = _mm_cmplt_epi8( a, b ); 00244 a = _mm_cmpeq_epi8( a, b ); 00245 a = _mm_or_si128 ( a, c ); 00246 return a; 00247 }
SSP_FORCEINLINE __m128i ssp_comle_epu16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comle_epu16/ pcomuw [SSE5]. (SSE5 .pdf documentation here)
Definition at line 250 of file SSEPlus_emulation_comps_SSE2.h.
00251 { 00252 a = ssp_comle_epu16_REF( a, b ); 00253 return a; 00254 }
SSP_FORCEINLINE __m128i ssp_comle_epu32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comle_epu32/ pcomud [SSE5]. (SSE5 .pdf documentation here)
Definition at line 257 of file SSEPlus_emulation_comps_SSE2.h.
00258 { 00259 a = ssp_comle_epu32_REF( a, b ); 00260 return a; 00261 }
SSP_FORCEINLINE __m128i ssp_comle_epu64_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comle_epu64/ pcomuq [SSE5]. (SSE5 .pdf documentation here)
Definition at line 264 of file SSEPlus_emulation_comps_SSE2.h.
00265 { 00266 a = ssp_comle_epu64_REF( a, b ); 00267 return a; 00268 }
SSP_FORCEINLINE __m128i ssp_comle_epu8_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comle_epu8/ pcomub [SSE5]. (SSE5 .pdf documentation here)
Definition at line 271 of file SSEPlus_emulation_comps_SSE2.h.
00272 { 00273 a = ssp_comle_epu8_REF( a, b ); 00274 return a; 00275 }
SSP_FORCEINLINE __m128d ssp_comle_pd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comle_pd/ compd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 278 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comle_ps_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comle_ps/ comps [SSE5]. (SSE5 .pdf documentation here)
Definition at line 285 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128d ssp_comle_sd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comle_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 292 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comle_ss_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comle_ss/ comss [SSE5]. (SSE5 .pdf documentation here)
Definition at line 299 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comlt_epi16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comlt_epi16/ pcomw [SSE5]. (SSE5 .pdf documentation here)
Definition at line 115 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comlt_epi32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comlt_epi32/ pcomd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 122 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comlt_epi64_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comlt_epi64/ pcomq [SSE5]. (SSE5 .pdf documentation here)
Definition at line 129 of file SSEPlus_emulation_comps_SSE2.h.
00130 { 00131 a = ssp_comlt_epi64_REF( a, b ); 00132 return a; 00133 }
SSP_FORCEINLINE __m128i ssp_comlt_epi8_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comlt_epi8/ pcomb [SSE5]. (SSE5 .pdf documentation here)
Definition at line 136 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comlt_epu16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comlt_epu16/ pcomuw [SSE5]. (SSE5 .pdf documentation here)
Definition at line 143 of file SSEPlus_emulation_comps_SSE2.h.
00144 { 00145 __m128i signMask, mask; 00146 00147 mask = _mm_cmplt_epi16( a, b ); // FFFF where a < b (signed) 00148 signMask = _mm_xor_si128 ( a, b ); // Signbit is 1 where signs differ 00149 signMask = _mm_srai_epi16 ( signMask, 15 ); // fill all fields with sign bit 00150 mask = _mm_xor_si128 ( mask, signMask ); // Invert output where signs differed 00151 return mask; 00152 }
SSP_FORCEINLINE __m128i ssp_comlt_epu32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comlt_epu32/ pcomud [SSE5]. (SSE5 .pdf documentation here)
Definition at line 155 of file SSEPlus_emulation_comps_SSE2.h.
00156 { 00157 __m128i signMask, mask; 00158 00159 mask = _mm_cmplt_epi32( a, b ); // FFFF where a < b (signed) 00160 signMask = _mm_xor_si128 ( a, b ); // Signbit is 1 where signs differ 00161 signMask = _mm_srai_epi32 ( signMask, 31 ); // fill all fields with sign bit 00162 mask = _mm_xor_si128 ( mask, signMask ); // Invert output where signs differed 00163 return mask; 00164 }
SSP_FORCEINLINE __m128i ssp_comlt_epu64_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comlt_epu64/ pcomuq [SSE5]. (SSE5 .pdf documentation here)
Definition at line 167 of file SSEPlus_emulation_comps_SSE2.h.
00168 { 00169 a = ssp_comlt_epu64_REF( a, b ); 00170 return a; 00171 }
SSP_FORCEINLINE __m128i ssp_comlt_epu8_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comlt_epu8/ pcomub [SSE5]. (SSE5 .pdf documentation here)
Definition at line 174 of file SSEPlus_emulation_comps_SSE2.h.
00175 { 00176 a = ssp_comlt_epu8_REF( a, b ); 00177 return a; 00178 }
SSP_FORCEINLINE __m128d ssp_comlt_pd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comlt_pd/ compd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 181 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comlt_ps_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comlt_ps/ comps [SSE5]. (SSE5 .pdf documentation here)
Definition at line 188 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128d ssp_comlt_sd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comlt_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 195 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comlt_ss_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comlt_ss/ comss [SSE5]. (SSE5 .pdf documentation here)
Definition at line 202 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comneq_epi16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comneq_epi16/ pcomw [SSE5]. (SSE5 .pdf documentation here)
Definition at line 347 of file SSEPlus_emulation_comps_SSE2.h.
00348 { 00349 a = ssp_comeq_epi16_SSE2( a, b ); 00350 a = ssp_logical_invert_si128_SSE2( a ); 00351 return a; 00352 }
SSP_FORCEINLINE __m128i ssp_comneq_epi32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comneq_epi32/ pcomd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 355 of file SSEPlus_emulation_comps_SSE2.h.
00356 { 00357 a = ssp_comeq_epi32_SSE2( a, b ); 00358 a = ssp_logical_invert_si128_SSE2( a ); 00359 return a; 00360 }
SSP_FORCEINLINE __m128i ssp_comneq_epi64_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comneq_epi64/ pcomq [SSE5]. (SSE5 .pdf documentation here)
Definition at line 363 of file SSEPlus_emulation_comps_SSE2.h.
00364 { 00365 a = ssp_comeq_epi64_SSE2( a, b ); 00366 a = ssp_logical_invert_si128_SSE2( a ); 00367 return a; 00368 }
SSP_FORCEINLINE __m128i ssp_comneq_epi8_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comneq_epi8/ pcomb [SSE5]. (SSE5 .pdf documentation here)
Definition at line 371 of file SSEPlus_emulation_comps_SSE2.h.
00372 { 00373 a = ssp_comeq_epi8_SSE2( a, b ); 00374 a = ssp_logical_invert_si128_SSE2( a ); 00375 return a; 00376 }
SSP_FORCEINLINE __m128i ssp_comneq_epu16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comneq_epu16/ pcomuw [SSE5]. (SSE5 .pdf documentation here)
Definition at line 379 of file SSEPlus_emulation_comps_SSE2.h.
00380 { 00381 a = ssp_comeq_epu16_SSE2( a, b ); 00382 a = ssp_logical_invert_si128_SSE2( a ); 00383 return a; 00384 }
SSP_FORCEINLINE __m128i ssp_comneq_epu32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comneq_epu32/ pcomud [SSE5]. (SSE5 .pdf documentation here)
Definition at line 387 of file SSEPlus_emulation_comps_SSE2.h.
00388 { 00389 a = ssp_comeq_epu32_SSE2( a, b ); 00390 a = ssp_logical_invert_si128_SSE2( a ); 00391 return a; 00392 }
SSP_FORCEINLINE __m128i ssp_comneq_epu64_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comneq_epu64/ pcomuq [SSE5]. (SSE5 .pdf documentation here)
Definition at line 395 of file SSEPlus_emulation_comps_SSE2.h.
00396 { 00397 a = ssp_comeq_epu64_SSE2( a, b ); 00398 a = ssp_logical_invert_si128_SSE2( a ); 00399 return a; 00400 }
SSP_FORCEINLINE __m128i ssp_comneq_epu8_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comneq_epu8/ pcomub [SSE5]. (SSE5 .pdf documentation here)
Definition at line 403 of file SSEPlus_emulation_comps_SSE2.h.
00404 { 00405 a = ssp_comeq_epu8_SSE2( a, b ); 00406 a = ssp_logical_invert_si128_SSE2( a ); 00407 return a; 00408 }
SSP_FORCEINLINE __m128d ssp_comneq_pd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comneq_pd/ compd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 411 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comneq_ps_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comneq_ps/ comps [SSE5]. (SSE5 .pdf documentation here)
Definition at line 418 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128d ssp_comneq_sd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comneq_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 425 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comneq_ss_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comneq_ss/ comss [SSE5]. (SSE5 .pdf documentation here)
Definition at line 432 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128d ssp_comnge_pd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comnge_pd/ compd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 589 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comnge_ps_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comnge_ps/ comps [SSE5]. (SSE5 .pdf documentation here)
Definition at line 596 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128d ssp_comnge_sd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comnge_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 603 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comnge_ss_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comnge_ss/ comss [SSE5]. (SSE5 .pdf documentation here)
Definition at line 610 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128d ssp_comngt_pd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comngt_pd/ compd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 622 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comngt_ps_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comngt_ps/ comps [SSE5]. (SSE5 .pdf documentation here)
Definition at line 629 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128d ssp_comngt_sd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comngt_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 636 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comngt_ss_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comngt_ss/ comss [SSE5]. (SSE5 .pdf documentation here)
Definition at line 643 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128d ssp_comnle_pd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comnle_pd/ compd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 476 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comnle_ps_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comnle_ps/ comps [SSE5]. (SSE5 .pdf documentation here)
Definition at line 483 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128d ssp_comnle_sd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comnle_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 490 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comnle_ss_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comnle_ss/ comss [SSE5]. (SSE5 .pdf documentation here)
Definition at line 497 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128d ssp_comnlt_pd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comnlt_pd/ compd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 443 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comnlt_ps_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comnlt_ps/ comps [SSE5]. (SSE5 .pdf documentation here)
Definition at line 450 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128d ssp_comnlt_sd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comnlt_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 457 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comnlt_ss_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comnlt_ss/ comss [SSE5]. (SSE5 .pdf documentation here)
Definition at line 464 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128d ssp_comoneq_pd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comoneq_pd/ compd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 736 of file SSEPlus_emulation_comps_SSE2.h.
00737 { 00738 __m128d c; 00739 c = _mm_cmpord_pd( a, b ); 00740 a = _mm_cmpneq_pd( a, b ); 00741 a = _mm_and_pd ( a, c ); 00742 return a; 00743 }
SSP_FORCEINLINE __m128 ssp_comoneq_ps_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comoneq_ps/ comps [SSE5]. (SSE5 .pdf documentation here)
Definition at line 746 of file SSEPlus_emulation_comps_SSE2.h.
00747 { 00748 __m128 c; 00749 c = _mm_cmpord_ps( a, b ); 00750 a = _mm_cmpneq_ps( a, b ); 00751 a = _mm_and_ps ( a, c ); 00752 return a; 00753 }
SSP_FORCEINLINE __m128d ssp_comoneq_sd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comoneq_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 757 of file SSEPlus_emulation_comps_SSE2.h.
00758 { 00759 __m128d c; 00760 c = _mm_cmpord_pd( a, b ); 00761 b = _mm_cmpneq_pd( a, b ); 00762 b = _mm_and_pd ( b, c ); 00763 a = _mm_move_sd ( a, b ); 00764 return a; 00765 }
SSP_FORCEINLINE __m128 ssp_comoneq_ss_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comoneq_ss/ comss [SSE5]. (SSE5 .pdf documentation here)
Definition at line 768 of file SSEPlus_emulation_comps_SSE2.h.
00769 { 00770 __m128 c; 00771 c = _mm_cmpord_ps( a, b ); 00772 b = _mm_cmpneq_ps( a, b ); 00773 b = _mm_and_ps ( b, c ); 00774 a = _mm_move_ss ( a, b ); 00775 return a; 00776 }
SSP_FORCEINLINE __m128d ssp_comord_pd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comord_pd/ compd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 509 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comord_ps_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comord_ps/ comps [SSE5]. (SSE5 .pdf documentation here)
Definition at line 516 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128d ssp_comord_sd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comord_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 523 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comord_ss_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comord_ss/ comss [SSE5]. (SSE5 .pdf documentation here)
Definition at line 530 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comtrue_epi16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comtrue_epi16/ pcomw [SSE5]. (SSE5 .pdf documentation here)
Definition at line 986 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comtrue_epi32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comtrue_epi32/ pcomd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 992 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comtrue_epi64_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comtrue_epi64/ pcomq [SSE5]. (SSE5 .pdf documentation here)
Definition at line 998 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comtrue_epi8_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comtrue_epi8/ pcomb [SSE5]. (SSE5 .pdf documentation here)
Definition at line 1004 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comtrue_epu16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comtrue_epu16/ pcomuw [SSE5]. (SSE5 .pdf documentation here)
Definition at line 1010 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comtrue_epu32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comtrue_epu32/ pcomud [SSE5]. (SSE5 .pdf documentation here)
Definition at line 1016 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comtrue_epu64_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comtrue_epu64/ pcomuq [SSE5]. (SSE5 .pdf documentation here)
Definition at line 1022 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_comtrue_epu8_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_comtrue_epu8/ pcomub [SSE5]. (SSE5 .pdf documentation here)
Definition at line 1028 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128d ssp_comtrue_pd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSP_FORCEINLINE __m128 ssp_comtrue_ps_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSP_FORCEINLINE __m128d ssp_comtrue_sd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSP_FORCEINLINE __m128 ssp_comtrue_ss_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSP_FORCEINLINE __m128d ssp_comueq_pd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comueq_pd/ compd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 542 of file SSEPlus_emulation_comps_SSE2.h.
00543 { 00544 __m128d c; 00545 c = _mm_cmpunord_pd( a, b ); 00546 a = _mm_cmpeq_pd ( a, b ); 00547 a = _mm_or_pd ( a, c ); 00548 return a; 00549 }
SSP_FORCEINLINE __m128 ssp_comueq_ps_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comueq_ps/ comps [SSE5]. (SSE5 .pdf documentation here)
Definition at line 552 of file SSEPlus_emulation_comps_SSE2.h.
00553 { 00554 __m128 c; 00555 c = _mm_cmpunord_ps( a, b ); 00556 a = _mm_cmpeq_ps ( a, b ); 00557 a = _mm_or_ps ( a, c ); 00558 return a; 00559 }
SSP_FORCEINLINE __m128d ssp_comueq_sd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comueq_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 562 of file SSEPlus_emulation_comps_SSE2.h.
00563 { 00564 __m128d c; 00565 c = _mm_cmpunord_sd( a, b ); 00566 b = _mm_cmpeq_sd ( a, b ); 00567 b = _mm_or_pd ( b, c ); 00568 a = _mm_move_sd ( a, b ); 00569 return a; 00570 }
SSP_FORCEINLINE __m128 ssp_comueq_ss_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comueq_ss/ comss [SSE5]. (SSE5 .pdf documentation here)
Definition at line 573 of file SSEPlus_emulation_comps_SSE2.h.
00574 { 00575 __m128 c; 00576 c = _mm_cmpunord_ss( a, b ); 00577 b = _mm_cmpeq_ss ( a, b ); 00578 b = _mm_or_ps ( b, c ); 00579 a = _mm_move_ss ( a, b ); 00580 return a; 00581 }
SSP_FORCEINLINE __m128d ssp_comunord_pd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comunord_pd/ compd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 310 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comunord_ps_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comunord_ps/ comps [SSE5]. (SSE5 .pdf documentation here)
Definition at line 318 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128d ssp_comunord_sd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_comunord_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 326 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128 ssp_comunord_ss_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_comunord_ss/ comss [SSE5]. (SSE5 .pdf documentation here)
Definition at line 334 of file SSEPlus_emulation_comps_SSE2.h.
SSP_FORCEINLINE __m128i ssp_cvtepi16_epi32_SSE2 | ( | __m128i | a | ) |
SSE2 implementation of _mm_cvtepi16_epi32 [SSE4.1]. (Searches MSDN)
Definition at line 1178 of file SSEPlus_emulation_SSE2.h.
01179 { 01180 __m128i b = _mm_set1_epi32 (-1); //0xFFFFFFFF 01181 __m128i c = _mm_unpacklo_epi16(a, b); //FFFFa0**FFFFa1**.... 01182 __m128i d = _mm_set1_epi32 (0x8000); //0x8000 01183 01184 b = _mm_andnot_si128(c, d); // 0x80 for positive, 0x00 for negative 01185 d = _mm_slli_epi32(b, 1); // 0x100 for positive, 0x000 for negative 01186 01187 return _mm_add_epi32(c, d); 01188 }
SSP_FORCEINLINE __m128i ssp_cvtepi16_epi64_SSE2 | ( | __m128i | a | ) |
SSE2 implementation of _mm_cvtepi16_epi64 [SSE4.1]. (Searches MSDN)
Definition at line 1192 of file SSEPlus_emulation_SSE2.h.
01193 { 01194 __m128i b = _mm_set1_epi32 (-1); //0xFFFFFFFF 01195 __m128i c = _mm_unpacklo_epi16(a, b); //FFFFa0**FFFFa1**.... 01196 __m128i d = _mm_set_epi32(0,0x8000, 0,0x8000); //0x8000 01197 01198 c = _mm_unpacklo_epi32(c, b); //FFFFFFFFFFFFFFa0... 01199 b = _mm_andnot_si128(c, d); // 0x80 for positive, 0x00 for negative 01200 d = _mm_slli_epi64(b, 1); // 0x100 for positive, 0x000 for negative 01201 01202 return _mm_add_epi64(c, d); 01203 }
SSP_FORCEINLINE __m128i ssp_cvtepi32_epi64_SSE2 | ( | __m128i | a | ) |
SSE2 implementation of _mm_cvtepi32_epi64 [SSE4.1]. (Searches MSDN)
Definition at line 1207 of file SSEPlus_emulation_SSE2.h.
01208 { 01209 __m128i b = _mm_set1_epi32 (-1); //0xFFFFFFFF 01210 __m128i c = _mm_unpacklo_epi32(a, b); //FFFFFFFFa0******FFFFFFFFa1******.... 01211 __m128i d = _mm_set_epi32(0, 0x80000000,0,0x80000000); //0x80000000 01212 01213 b = _mm_andnot_si128(c, d); // 0x80 for positive, 0x00 for negative 01214 d = _mm_slli_epi64(b, 1); // 0x100 for positive, 0x000 for negative 01215 01216 return _mm_add_epi64(c, d); 01217 }
SSP_FORCEINLINE __m128i ssp_cvtepi8_epi16_SSE2 | ( | __m128i | a | ) |
SSE2 implementation of _mm_cvtepi8_epi16 [SSE4.1]. (Searches MSDN)
Definition at line 1124 of file SSEPlus_emulation_SSE2.h.
01125 { 01126 __m128i b = _mm_setzero_si128 (); 01127 __m128i c = _mm_unpacklo_epi8(a, b); 01128 __m128i d = _mm_set1_epi16 (128); 01129 01130 b = _mm_and_si128(d, c); 01131 d = _mm_set1_epi16(0x1FE); 01132 b = _mm_mullo_epi16(b, d); 01133 01134 return _mm_add_epi16(c, b); 01135 01136 //Another way, slower 01137 //__m128i b = _mm_set1_epi32 (-1); //0xFFFFFFFF 01138 //__m128i c = _mm_unpacklo_epi8(a, b); //FFa0FFa1.... 01139 //__m128i d = _mm_set1_epi16 (128); //0x80 01140 //b = _mm_andnot_si128(c, d); // 0x80 for positive, 0x00 for negative 01141 //d = _mm_slli_epi16(b, 1); // 0x100 for positive, 0x000 for negative 01142 //return _mm_add_epi16(c, d); 01143 }
SSP_FORCEINLINE __m128i ssp_cvtepi8_epi32_SSE2 | ( | __m128i | a | ) |
SSE2 implementation of _mm_cvtepi8_epi32 [SSE4.1]. (Searches MSDN)
Definition at line 1147 of file SSEPlus_emulation_SSE2.h.
01148 { 01149 __m128i b = _mm_set1_epi32 (-1); //0xFFFFFFFF 01150 __m128i c = _mm_unpacklo_epi8(a, b); //FFa0FFa1.... 01151 __m128i d = _mm_set1_epi32 (128); //0x80 01152 01153 c = _mm_unpacklo_epi16(c, b); //FFFFFFa0FFFFFFa1... 01154 b = _mm_andnot_si128(c, d); // 0x80 for positive, 0x00 for negative 01155 d = _mm_slli_epi32(b, 1); // 0x100 for positive, 0x000 for negative 01156 01157 return _mm_add_epi32(c, d); 01158 }
SSP_FORCEINLINE __m128i ssp_cvtepi8_epi64_SSE2 | ( | __m128i | a | ) |
SSE2 implementation of _mm_cvtepi8_epi64 [SSE4.1]. (Searches MSDN)
Definition at line 1162 of file SSEPlus_emulation_SSE2.h.
01163 { 01164 __m128i b = _mm_set1_epi32 (-1); //0xFFFFFFFF 01165 __m128i c = _mm_unpacklo_epi8(a, b); //FFa0FFa1.... 01166 __m128i d = _mm_set_epi32 (0, 128, 0, 128); //0x80 01167 01168 c = _mm_unpacklo_epi16(c, b); //FFFFFFa0FFFFFFa1... 01169 c = _mm_unpacklo_epi32(c, b); //FFFFFFFFFFFFFFa0... 01170 b = _mm_andnot_si128(c, d); // 0x80 for positive, 0x00 for negative 01171 d = _mm_slli_epi64(b, 1); // 0x100 for positive, 0x000 for negative 01172 01173 return _mm_add_epi64(c, d); 01174 }
SSP_FORCEINLINE __m128i ssp_cvtepu16_epi32_SSE2 | ( | __m128i | a | ) |
SSE2 implementation of _mm_cvtepu16_epi32 [SSE4.1]. (Searches MSDN)
Definition at line 1254 of file SSEPlus_emulation_SSE2.h.
01255 { 01256 __m128i b = _mm_setzero_si128 (); 01257 01258 return _mm_unpacklo_epi16(a, b); 01259 }
SSP_FORCEINLINE __m128i ssp_cvtepu16_epi64_SSE2 | ( | __m128i | a | ) |
SSE2 implementation of _mm_cvtepu16_epi64 [SSE4.1]. (Searches MSDN)
Definition at line 1263 of file SSEPlus_emulation_SSE2.h.
01264 { 01265 __m128i b = _mm_setzero_si128 (); 01266 01267 a = _mm_unpacklo_epi16(a, b); 01268 01269 return _mm_unpacklo_epi32(a, b); 01270 }
SSP_FORCEINLINE __m128i ssp_cvtepu32_epi64_SSE2 | ( | __m128i | a | ) |
SSE2 implementation of _mm_cvtepu32_epi64 [SSE4.1]. (Searches MSDN)
Definition at line 1274 of file SSEPlus_emulation_SSE2.h.
01275 { 01276 __m128i b = _mm_setzero_si128 (); 01277 01278 return _mm_unpacklo_epi32(a, b); 01279 }
SSP_FORCEINLINE __m128i ssp_cvtepu8_epi16_SSE2 | ( | __m128i | a | ) |
SSE2 implementation of _mm_cvtepu8_epi16 [SSE4.1]. (Searches MSDN)
Definition at line 1221 of file SSEPlus_emulation_SSE2.h.
SSP_FORCEINLINE __m128i ssp_cvtepu8_epi32_SSE2 | ( | __m128i | a | ) |
SSE2 implementation of _mm_cvtepu8_epi32 [SSE4.1]. (Searches MSDN)
Definition at line 1230 of file SSEPlus_emulation_SSE2.h.
01231 { 01232 __m128i b = _mm_setzero_si128 (); 01233 01234 a = _mm_unpacklo_epi8(a, b); 01235 01236 return _mm_unpacklo_epi16(a, b); 01237 }
SSP_FORCEINLINE __m128i ssp_cvtepu8_epi64_SSE2 | ( | __m128i | a | ) |
SSE2 implementation of _mm_cvtepu8_epi64 [SSE4.1]. (Searches MSDN)
Definition at line 1241 of file SSEPlus_emulation_SSE2.h.
01242 { 01243 __m128i b = _mm_setzero_si128 (); 01244 01245 a = _mm_unpacklo_epi8(a, b); 01246 01247 a = _mm_unpacklo_epi16(a, b); 01248 01249 return _mm_unpacklo_epi32(a, b); 01250 }
SSP_FORCEINLINE __m128d ssp_dp_pd_SSE2 | ( | __m128d | a, | |
__m128d | b, | |||
const int | mask | |||
) |
SSE2 implementation of _mm_dp_pd [SSE4.1]. (Searches MSDN)
Definition at line 893 of file SSEPlus_emulation_SSE2.h.
00894 { 00895 int smallMask = (mask & 0x33)<<16; 00896 const static __m128i mulShiftImm_01 = SSP_CONST_SET_32I( 0x40000000, 0x40000000, 0x80000000, 0x80000000 ); // Shift mask multiply moves 0,1, bits to left, becomes MSB 00897 const static __m128i mulShiftImm_45 = SSP_CONST_SET_32I( 0x04000000, 0x04000000, 0x08000000, 0x08000000 ); // Shift mask multiply moves 4,5, bits to left, becomes MSB 00898 ssp_m128 mHi, mLo; 00899 00900 mLo.i = _mm_set1_epi32( smallMask );// Load the mask into register 00901 mHi.i = _mm_mullo_epi16( mLo.i, mulShiftImm_01 ); // Shift the bits 00902 mLo.i = _mm_mullo_epi16( mLo.i, mulShiftImm_45 ); // Shift the bits 00903 00904 mHi.i = _mm_cmplt_epi32( mHi.i, _mm_setzero_si128() ); // FFFFFFFF if bit set, 00000000 if not set 00905 mLo.i = _mm_cmplt_epi32( mLo.i, _mm_setzero_si128() ); // FFFFFFFF if bit set, 00000000 if not set 00906 00907 a = _mm_and_pd( a, mHi.d ); // Clear input using the high bits of the mask 00908 a = _mm_mul_pd( a, b ); 00909 00910 b = _mm_shuffle_pd( a, a, _MM_SHUFFLE2(0, 1) ); // Shuffle the values so that we b = { a[0], a[1] } and a = { a[1], a[0] } 00911 a = _mm_add_pd( a, b ); // Horizontally add the 4 values 00912 a = _mm_and_pd( a, mLo.d ); // Clear output using low bits of the mask 00913 return a; 00914 }
SSP_FORCEINLINE __m128 ssp_dp_ps_SSE2 | ( | __m128 | a, | |
__m128 | b, | |||
const int | mask | |||
) |
SSE2 implementation of _mm_dp_ps [SSE4.1]. (Searches MSDN)
Definition at line 918 of file SSEPlus_emulation_SSE2.h.
00919 { 00920 const static __m128i mulShiftImm_0123 = SSP_CONST_SET_32I( 0x010000, 0x020000, 0x040000, 0x080000 ); // Shift mask multiply moves 0,1,2,3 bits to left, becomes MSB 00921 const static __m128i mulShiftImm_4567 = SSP_CONST_SET_32I( 0x100000, 0x200000, 0x400000, 0x800000 ); // Shift mask multiply moves 4,5,6,7 bits to left, becomes MSB 00922 00923 // Begin mask preparation 00924 ssp_m128 mHi, mLo; 00925 mLo.i = _mm_set1_epi32( mask ); // Load the mask into register 00926 mLo.i = _mm_slli_si128( mLo.i, 3 ); // Shift into reach of the 16 bit multiply 00927 00928 mHi.i = _mm_mullo_epi16( mLo.i, mulShiftImm_0123 ); // Shift the bits 00929 mLo.i = _mm_mullo_epi16( mLo.i, mulShiftImm_4567 ); // Shift the bits 00930 00931 mHi.i = _mm_cmplt_epi32( mHi.i, _mm_setzero_si128() ); // FFFFFFFF if bit set, 00000000 if not set 00932 mLo.i = _mm_cmplt_epi32( mLo.i, _mm_setzero_si128() ); // FFFFFFFF if bit set, 00000000 if not set 00933 // End mask preparation - Mask bits 0-3 in mLo, 4-7 in mHi 00934 00935 a = _mm_and_ps( a, mHi.f ); // Clear input using the high bits of the mask 00936 a = _mm_mul_ps( a, b ); 00937 00938 a = ssp_arithmetic_hadd4_dup_ps_SSE2( a ); // Horizontally add the 4 values 00939 a = _mm_and_ps( a, mLo.f ); // Clear output using low bits of the mask 00940 return a; 00941 }
SSP_FORCEINLINE int ssp_extract_epi32_SSE2 | ( | __m128i | a, | |
const int | imm | |||
) |
SSE2 implementation of _mm_extract_epi32 [SSE4.1]. (Searches MSDN)
Definition at line 1532 of file SSEPlus_emulation_SSE2.h.
01533 { 01534 ssp_m128 mask; 01535 switch( imm & 0x3 ) 01536 { 01537 case 3: a = _mm_srli_si128( a, 12 ); break; 01538 case 2: a = _mm_srli_si128( a, 8 ); break; 01539 case 1: a = _mm_srli_si128( a, 4 ); break; 01540 } 01541 01542 mask.i = _mm_set_epi32 ( 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF ); 01543 mask.i = _mm_and_si128 ( mask.i, a ); 01544 01545 return mask.s32[0]; 01546 }
SSP_FORCEINLINE ssp_s64 ssp_extract_epi64_SSE2 | ( | __m128i | a, | |
const int | ndx | |||
) |
SSE2 implementation of _mm_extract_epi64 [SSE4.1]. (Searches MSDN)
Definition at line 1557 of file SSEPlus_emulation_SSE2.h.
01558 { 01559 ssp_m128 mask; 01560 switch( ndx & 0x1 ) 01561 { 01562 case 1: a = _mm_srli_si128( a, 8 ); break; 01563 } 01564 01565 mask.i = _mm_set_epi32 ( 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF ); 01566 mask.i = _mm_and_si128 ( mask.i, a ); 01567 01568 return mask.s64[0]; 01569 }
/**
 * SSE2 implementation of _mm_extract_epi8 [SSE4.1].
 * (Definition at line 1503 of file SSEPlus_emulation_SSE2.h.)
 *
 * @param a    Source vector viewed as sixteen 8-bit elements.
 * @param ndx  Element selector; only the low four bits are used.
 * @return     The selected byte zero-extended to int (0..255), matching the
 *             native intrinsic. The previous version returned the byte through
 *             a signed 8-bit union member, so values >= 0x80 came back negative.
 */
SSP_FORCEINLINE int ssp_extract_epi8_SSE2( __m128i a, const int ndx )
{
    // _mm_srli_si128 requires a compile-time byte count, hence one case per index.
    switch( ndx & 0xF )
    {
    case 15: a = _mm_srli_si128( a, 15 ); break;
    case 14: a = _mm_srli_si128( a, 14 ); break;
    case 13: a = _mm_srli_si128( a, 13 ); break;
    case 12: a = _mm_srli_si128( a, 12 ); break;
    case 11: a = _mm_srli_si128( a, 11 ); break;
    case 10: a = _mm_srli_si128( a, 10 ); break;
    case  9: a = _mm_srli_si128( a,  9 ); break;
    case  8: a = _mm_srli_si128( a,  8 ); break;
    case  7: a = _mm_srli_si128( a,  7 ); break;
    case  6: a = _mm_srli_si128( a,  6 ); break;
    case  5: a = _mm_srli_si128( a,  5 ); break;
    case  4: a = _mm_srli_si128( a,  4 ); break;
    case  3: a = _mm_srli_si128( a,  3 ); break;
    case  2: a = _mm_srli_si128( a,  2 ); break;
    case  1: a = _mm_srli_si128( a,  1 ); break;
    default: break;                       // index 0: byte is already in position
    }

    // Read the low 32 bits and keep only the lowest byte, zero-extended.
    return _mm_cvtsi128_si32( a ) & 0xFF;
}
SSP_FORCEINLINE int ssp_extract_ps_SSE2 | ( | __m128 | a, | |
const int | ndx | |||
) |
SSE2 implementation of _mm_extract_ps [SSE4.1]. (Searches MSDN)
Definition at line 1549 of file SSEPlus_emulation_SSE2.h.
01550 { 01551 ssp_m128 A; 01552 A.f = a; 01553 return ssp_extract_epi32_SSE2( A.i, ndx ); 01554 }
SSP_FORCEINLINE __m128i ssp_extract_si64_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_extract_si64 [SSE4a]. (Searches MSDN)
NOTE: The upper 64 bits of the destination register are undefined.
Definition at line 1586 of file SSEPlus_emulation_SSE2.h.
01587 { 01588 ssp_u32 len, ndx; 01589 ssp_m128 B; 01590 B.i = b; 01591 01592 ndx = (ssp_u32)((B.u64[0] & 0x3F00) >> 8); // Mask ndx field. 01593 len = (ssp_u32)((B.u64[0] & 0x003F)); // Mask len field. 01594 01595 a = ssp_extracti_si64_SSE2( a, len, ndx ); 01596 return a; 01597 }
SSP_FORCEINLINE __m128i ssp_extracti_si64_SSE2 | ( | __m128i | a, | |
int | len, | |||
int | ndx | |||
) |
SSE2 implementation of _mm_extracti_si64 [SSE4a]. (Searches MSDN)
NOTE: The upper 64 bits of the destination register are undefined.
Definition at line 1574 of file SSEPlus_emulation_SSE2.h.
01575 { 01576 int left = ndx + len; 01577 a = _mm_slli_epi64( a, 64-left ); // clear the mask to the left 01578 a = _mm_srli_epi64( a, 64-len ); // clear the mask to the right 01579 return a; 01580 }
SSP_FORCEINLINE __m128d ssp_floor_pd_SSE2 | ( | __m128d | a | ) |
SSE2 implementation of _mm_floor_pd [SSE4.1]. (Searches MSDN)
Definition at line 1053 of file SSEPlus_emulation_SSE2.h.
01054 { 01055 return ssp_round_pd_SSE2( a, SSP_FROUND_TO_NEG_INF ); 01056 }
SSP_FORCEINLINE __m128 ssp_floor_ps_SSE2 | ( | __m128 | a | ) |
SSE2 implementation of _mm_floor_ps [SSE4.1]. (Searches MSDN)
Definition at line 1046 of file SSEPlus_emulation_SSE2.h.
01047 { 01048 return ssp_round_ps_SSE2( a, SSP_FROUND_TO_NEG_INF ); 01049 }
SSP_FORCEINLINE __m128d ssp_floor_sd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_floor_sd [SSE4.1]. (Searches MSDN)
Definition at line 1066 of file SSEPlus_emulation_SSE2.h.
01067 { 01068 b = ssp_round_pd_SSE2(b, SSP_FROUND_TO_NEG_INF ); 01069 01070 return _mm_shuffle_pd(b, a, _MM_SHUFFLE2(1,0)); 01071 }
SSP_FORCEINLINE __m128 ssp_floor_ss_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_floor_ss [SSE4.1]. (Searches MSDN)
Definition at line 1082 of file SSEPlus_emulation_SSE2.h.
01083 { 01084 b = ssp_round_ps_SSE2(b, SSP_FROUND_TO_NEG_INF ); 01085 b = _mm_shuffle_ps(b, a, _MM_SHUFFLE(1,1,0,0)); 01086 return _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,2,0)); 01087 }
SSP_FORCEINLINE __m128i ssp_hadd_epi16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_hadd_epi16 [SSSE3]. (Searches MSDN)
Definition at line 569 of file SSEPlus_emulation_SSE2.h.
00570 { 00571 ssp_convert_odd_even_epi16_SSE2( &a, &b ); 00572 a = _mm_add_epi16( a, b ); 00573 return a; 00574 }
SSP_FORCEINLINE __m128i ssp_hadd_epi32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_hadd_epi32 [SSSE3]. (Searches MSDN)
Definition at line 606 of file SSEPlus_emulation_SSE2.h.
00607 { 00608 ssp_convert_odd_even_epi32_SSE2( &a, &b ); 00609 a = _mm_add_epi32( a, b ); 00610 return a; 00611 }
SSP_FORCEINLINE __m128d ssp_hadd_pd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_hadd_pd [SSE3]. (Searches MSDN)
Definition at line 643 of file SSEPlus_emulation_SSE2.h.
00644 { 00645 ssp_m128 A,B,C; 00646 A.d = a; 00647 C.d = a; 00648 B.d = b; 00649 00650 A.f = _mm_movelh_ps( A.f, B.f ); 00651 B.f = _mm_movehl_ps( B.f, C.f ); 00652 A.d = _mm_add_pd ( A.d, B.d ); 00653 return A.d; 00654 }
SSP_FORCEINLINE __m128 ssp_hadd_ps_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_hadd_ps [SSE3]. (Searches MSDN)
Definition at line 624 of file SSEPlus_emulation_SSE2.h.
00625 { 00626 ssp_convert_odd_even_ps_SSE2( &a, &b ); 00627 a = _mm_add_ps( a, b ); 00628 return a; 00629 }
SSP_FORCEINLINE __m128i ssp_hadds_epi16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_hadds_epi16 [SSSE3]. (Searches MSDN)
Definition at line 577 of file SSEPlus_emulation_SSE2.h.
00578 { 00579 ssp_convert_odd_even_epi16_SSE2( &a, &b ); 00580 a = _mm_adds_epi16( a, b ); 00581 return a; 00582 }
SSP_FORCEINLINE __m128i ssp_hsub_epi16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_hsub_epi16 [SSSE3]. (Searches MSDN)
Definition at line 587 of file SSEPlus_emulation_SSE2.h.
00588 { 00589 ssp_convert_odd_even_epi16_SSE2( &a, &b ); 00590 a = _mm_sub_epi16( a, b ); 00591 return a; 00592 }
SSP_FORCEINLINE __m128i ssp_hsub_epi32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_hsub_epi32 [SSSE3]. (Searches MSDN)
Definition at line 614 of file SSEPlus_emulation_SSE2.h.
00615 { 00616 ssp_convert_odd_even_epi32_SSE2( &a, &b ); 00617 a = _mm_sub_epi32( b, a ); 00618 return a; 00619 }
SSP_FORCEINLINE __m128d ssp_hsub_pd_SSE2 | ( | __m128d | a, | |
__m128d | b | |||
) |
SSE2 implementation of _mm_hsub_pd [SSE3]. (Searches MSDN)
Definition at line 658 of file SSEPlus_emulation_SSE2.h.
00659 { 00660 ssp_m128 A,B,C; 00661 A.d = a; 00662 C.d = a; 00663 B.d = b; 00664 00665 A.f = _mm_movelh_ps( A.f, B.f ); 00666 B.f = _mm_movehl_ps( B.f, C.f ); 00667 A.d = _mm_sub_pd ( A.d, B.d ); 00668 return A.d; 00669 }
SSP_FORCEINLINE __m128 ssp_hsub_ps_SSE2 | ( | __m128 | a, | |
__m128 | b | |||
) |
SSE2 implementation of _mm_hsub_ps [SSE3]. (Searches MSDN)
Definition at line 633 of file SSEPlus_emulation_SSE2.h.
00634 { 00635 ssp_convert_odd_even_ps_SSE2( &a, &b ); 00636 a = _mm_sub_ps( b, a ); 00637 return a; 00638 }
SSP_FORCEINLINE __m128i ssp_hsubs_epi16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_hsubs_epi16 [SSSE3]. (Searches MSDN)
Definition at line 596 of file SSEPlus_emulation_SSE2.h.
00597 { 00598 ssp_convert_odd_even_epi16_SSE2( &a, &b ); 00599 a = _mm_subs_epi16( a, b ); 00600 return a; 00601 }
/**
 * SSE2 implementation of _mm_insert_epi32 [SSE4.1].
 * (Definition at line 698 of file SSEPlus_emulation_SSE2.h.)
 *
 * Replaces one 32-bit element of a with b by writing its two 16-bit halves.
 *
 * @param a    Vector to modify (passed by value).
 * @param b    32-bit value to insert.
 * @param ndx  Destination element; only the low two bits are used.
 * @return     Copy of a with element (ndx & 3) replaced by b.
 *
 * TODO: Verify behavior on Intel Hardware
 */
SSP_FORCEINLINE __m128i ssp_insert_epi32_SSE2( __m128i a, int b, const int ndx )
{
    // _mm_insert_epi16 stores only the low 16 bits of its int argument, so the
    // upper half of b has to be shifted DOWN. The previous b<<16 always had
    // zero low bits and therefore cleared the high word of the target element.
    // The shift is done on unsigned to avoid an implementation-defined result.
    const int lo = b;
    const int hi = (int)( (unsigned int)b >> 16 );

    // _mm_insert_epi16 needs a compile-time lane index, hence the switch.
    switch( ndx & 0x3 )
    {
    case 0: a = _mm_insert_epi16( a, lo, 0 );
            a = _mm_insert_epi16( a, hi, 1 ); break;
    case 1: a = _mm_insert_epi16( a, lo, 2 );
            a = _mm_insert_epi16( a, hi, 3 ); break;
    case 2: a = _mm_insert_epi16( a, lo, 4 );
            a = _mm_insert_epi16( a, hi, 5 ); break;
    case 3: a = _mm_insert_epi16( a, lo, 6 );
            a = _mm_insert_epi16( a, hi, 7 ); break;
    }
    return a;
}
SSP_FORCEINLINE __m128i ssp_insert_epi8_SSE2 | ( | __m128i | a, | |
int | b, | |||
const int | ndx | |||
) |
SSE2 implementation of _mm_insert_epi8 [SSE4.1]. (Searches MSDN)
Note: a slower alternative implementation is kept commented out at the end of the function body.
Definition at line 1428 of file SSEPlus_emulation_SSE2.h.
01429 { 01430 ssp_m128 Ahi, Alo; 01431 b = b & 0xFF; /* Convert to 8-bit integer */ 01432 Ahi.i = _mm_unpackhi_epi8( a, _mm_setzero_si128() ); /* Ahi = a_8[8:15] Simulate 8bit integers as 16-bit integers */ 01433 Alo.i = _mm_unpacklo_epi8( a, _mm_setzero_si128() ); /* Alo = a_8[0:7] Simulate 8bit integers as 16-bit integers */ 01434 01435 /* Insert b as a 16-bit integer to upper or lower half of a */ 01436 switch( ndx & 0xF ) 01437 { 01438 case 0: Alo.i = _mm_insert_epi16( Alo.i, b, 0 ); break; 01439 case 1: Alo.i = _mm_insert_epi16( Alo.i, b, 1 ); break; 01440 case 2: Alo.i = _mm_insert_epi16( Alo.i, b, 2 ); break; 01441 case 3: Alo.i = _mm_insert_epi16( Alo.i, b, 3 ); break; 01442 case 4: Alo.i = _mm_insert_epi16( Alo.i, b, 4 ); break; 01443 case 5: Alo.i = _mm_insert_epi16( Alo.i, b, 5 ); break; 01444 case 6: Alo.i = _mm_insert_epi16( Alo.i, b, 6 ); break; 01445 case 7: Alo.i = _mm_insert_epi16( Alo.i, b, 7 ); break; 01446 case 8: Ahi.i = _mm_insert_epi16( Ahi.i, b, 0 ); break; 01447 case 9: Ahi.i = _mm_insert_epi16( Ahi.i, b, 1 ); break; 01448 case 10: Ahi.i = _mm_insert_epi16( Ahi.i, b, 2 ); break; 01449 case 11: Ahi.i = _mm_insert_epi16( Ahi.i, b, 3 ); break; 01450 case 12: Ahi.i = _mm_insert_epi16( Ahi.i, b, 4 ); break; 01451 case 13: Ahi.i = _mm_insert_epi16( Ahi.i, b, 5 ); break; 01452 case 14: Ahi.i = _mm_insert_epi16( Ahi.i, b, 6 ); break; 01453 default: Ahi.i = _mm_insert_epi16( Ahi.i, b, 7 ); 01454 } 01455 return _mm_packus_epi16( Alo.i, Ahi.i ); // Pack the 16-bit integers to 8bit again. 01456 01458 //ssp_m128 A, B, mask; 01459 //mask.i = _mm_setzero_si128(); 01460 //mask.s8[ ndx & 0x0F ] = (ssp_s8)0xFF; 01461 //B.i = _mm_set1_epi8( (ssp_s8)b ); 01462 //A.i = _mm_andnot_si128( mask.i, a ); 01463 //mask.i = _mm_and_si128( mask.i, B.i ); 01464 //A.i = _mm_or_si128( A.i, mask.i ); 01465 //return A.i; 01466 }
SSP_FORCEINLINE __m128i ssp_insert_si64_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_insert_si64 [SSE4a]. (Searches MSDN)
Definition at line 1485 of file SSEPlus_emulation_SSE2.h.
01486 { 01487 ssp_u32 ndx, len; 01488 ssp_m128 B; 01489 B.i = b; 01490 01491 ndx = (ssp_u32)((B.u64[1] & 0x3F00) >> 8); // Mask ndx field (bits 8-13 of the upper 64 bits of b). 01492 len = (ssp_u32)((B.u64[1] & 0x003F)); // Mask len field (bits 0-5 of the upper 64 bits of b). 01493 01494 a = ssp_inserti_si64_SSE2( a, b, len, ndx ); 01495 return a; 01496 }
SSP_FORCEINLINE __m128i ssp_inserti_si64_SSE2 | ( | __m128i | a, | |
__m128i | b, | |||
int | len, | |||
int | ndx | |||
) |
SSE2 implementation of _mm_inserti_si64 [SSE4a]. (Searches MSDN)
Definition at line 1468 of file SSEPlus_emulation_SSE2.h.
01469 { 01470 const static __m128i MASK = SSP_CONST_SET_32I( 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF ); 01471 01472 int left = ndx + len; 01473 __m128i m; 01474 m = _mm_slli_epi64( MASK, 64-left ); // clear the mask to the left 01475 m = _mm_srli_epi64( m, 64-len ); // clear the mask to the right 01476 m = _mm_slli_epi64( m, ndx ); // put the mask into the proper position 01477 b = _mm_slli_epi64( b, ndx ); // put the insert bits into the proper position 01478 01479 a = ssp_logical_bitwise_select_SSE2( b, a, m ); 01480 return a; 01481 }
SSP_FORCEINLINE __m128i ssp_macc_epi16_SSE2 | ( | __m128i | a, | |
__m128i | b, | |||
__m128i | c | |||
) |
SSE2 implementation of _mm_macc_epi16/ pmacsww [SSE5]. (SSE5 .pdf documentation here)
Definition at line 25 of file SSEPlus_emulation_SSE2.h.
SSP_FORCEINLINE __m128i ssp_macc_epi32_SSE2 | ( | __m128i | a, | |
__m128i | b, | |||
__m128i | c | |||
) |
SSE2 implementation of _mm_macc_epi32/ pmacsdd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 33 of file SSEPlus_emulation_SSE2.h.
00034 { 00035 __m128i ab02, ab13, mask; 00036 00037 mask = _mm_set_epi32(0, 0xFFFFFFFF, 0, 0xFFFFFFFF); 00038 ab02 = _mm_mul_epu32(a, b); 00039 ab02 = _mm_and_si128(ab02, mask); 00040 a = _mm_srli_epi64(a, 32); 00041 b = _mm_srli_epi64(b, 32); 00042 ab13 = _mm_mul_epu32(a, b); 00043 ab13 = _mm_slli_epi64(ab13, 32); 00044 00045 a = _mm_add_epi32(ab02, ab13); 00046 00047 return _mm_add_epi32(a, c); 00048 }
SSP_FORCEINLINE __m128d ssp_macc_pd_SSE2 | ( | __m128d | a, | |
__m128d | b, | |||
__m128d | c | |||
) |
SSE2 implementation of _mm_macc_pd/fmaddpd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 51 of file SSEPlus_emulation_SSE2.h.
SSP_FORCEINLINE __m128 ssp_macc_ps_SSE2 | ( | __m128 | a, | |
__m128 | b, | |||
__m128 | c | |||
) |
SSE2 implementation of _mm_macc_ps/fmaddps [SSE5]. (SSE5 .pdf documentation here)
Definition at line 59 of file SSEPlus_emulation_SSE2.h.
SSP_FORCEINLINE __m128d ssp_macc_sd_SSE2 | ( | __m128d | a, | |
__m128d | b, | |||
__m128d | c | |||
) |
SSE2 implementation of _mm_macc_sd/fmaddsd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 67 of file SSEPlus_emulation_SSE2.h.
00068 { 00069 const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0, 0 ); 00070 00071 ssp_m128 A,B; 00072 A.d = a; 00073 B.d = b; 00074 B.d = ssp_macc_pd_SSE2( A.d, B.d, c ); 00075 B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask ); // This was faster than using 2 shuffles 00076 return B.d; 00077 }
SSP_FORCEINLINE __m128 ssp_macc_ss_SSE2 | ( | __m128 | a, | |
__m128 | b, | |||
__m128 | c | |||
) |
SSE2 implementation of _mm_macc_ss/fmaddss [SSE5]. (SSE5 .pdf documentation here)
Definition at line 80 of file SSEPlus_emulation_SSE2.h.
00080 : confirm 00081 { 00082 const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0 ); 00083 00084 ssp_m128 A,B; 00085 A.f = a; 00086 B.f = b; 00087 B.f = ssp_macc_ps_SSE2( A.f, B.f, c ); 00088 B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask ); // This was faster than using 2 shuffles 00089 return B.f; 00090 }
SSP_FORCEINLINE __m128i ssp_maccd_epi16_SSE2 | ( | __m128i | a, | |
__m128i | b, | |||
__m128i | c | |||
) |
SSE2 implementation of _mm_maccd_epi16/ pmacswd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 93 of file SSEPlus_emulation_SSE2.h.
00094 { 00095 __m128i ab_lo, ab_hi; 00096 __m128i mask = _mm_set1_epi32(0xFFFF); 00097 00098 ab_lo = _mm_mullo_epi16(a, b); 00099 ab_hi = _mm_mulhi_epi16(a, b); 00100 00101 ab_lo = _mm_and_si128(ab_lo, mask); 00102 ab_hi = _mm_and_si128(ab_hi, mask); 00103 ab_hi = _mm_slli_epi32(ab_hi, 16); 00104 a = _mm_add_epi32( ab_lo, ab_hi ); 00105 return _mm_add_epi32 (a, c); 00106 00108 //b = _mm_unpacklo_epi16(ab_lo, ab_hi); 00109 //ab_hi = _mm_unpackhi_epi16(ab_lo, ab_hi); 00110 //ab_lo = _mm_unpacklo_epi32(b, ab_hi); 00111 //ab_hi = _mm_unpackhi_epi32(b, ab_hi); 00112 //ab_lo = _mm_unpacklo_epi32(ab_lo, ab_hi); 00113 //return _mm_add_epi32(ab_lo, c); 00114 }
SSP_FORCEINLINE __m128i ssp_macchi_epi32_SSE2 | ( | __m128i | a, | |
__m128i | b, | |||
__m128i | c | |||
) |
SSE2 implementation of _mm_macchi_epi32/ pmacsdqh [SSE5]. (SSE5 .pdf documentation here)
Definition at line 117 of file SSEPlus_emulation_SSE2.h.
00118 { 00119 __m128i mask, mask_A, mask_B, mask_C, ab; 00120 00121 a = _mm_srli_epi64(a, 32); 00122 b = _mm_srli_epi64(b, 32); 00123 mask = _mm_set_epi32(0x7FFFFFFF, 0, 0x7FFFFFFF, 0); 00124 00125 //abs(A) 00126 mask_A = _mm_cmplt_epi32( a, mask); //FFF...F when a < 0 00127 a = _mm_xor_si128 ( a, mask_A ); //Invert when a < 0 00128 mask_C = _mm_srli_epi32( mask_A, 31 ); // 1 when a < 0 00129 a = _mm_add_epi32( a, mask_C ); //Add 1 when a < 0 00130 00131 //abs(B) 00132 mask_B = _mm_cmplt_epi32( b, mask); //FFF...F when b < 0 00133 b = _mm_xor_si128 ( b, mask_B ); //Invert when b < 0 00134 mask_C = _mm_srli_epi32( mask_B, 31 ); // 1 when b < 0 00135 b = _mm_add_epi32( b, mask_C ); //Add 1 when b < 0 00136 00137 ab = _mm_mul_epu32(a, b); 00138 00139 //correct negative cases 00140 mask_A = _mm_xor_si128(mask_A, mask_B); 00141 mask_C = _mm_srli_epi32(mask_A, 31 ); 00142 mask_B = _mm_slli_epi64(mask_A, 32); 00143 mask = _mm_add_epi32(mask_A, mask_B); 00144 a = _mm_xor_si128(ab, mask); 00145 a = _mm_add_epi64(a, mask_C); 00146 00147 return _mm_add_epi64(a, c); 00148 }
SSP_FORCEINLINE __m128i ssp_macclo_epi32_SSE2 | ( | __m128i | a, | |
__m128i | b, | |||
__m128i | c | |||
) |
SSE2 implementation of _mm_macclo_epi32/ pmacsdql [SSE5]. (SSE5 .pdf documentation here)
Definition at line 151 of file SSEPlus_emulation_SSE2.h.
00152 { 00153 __m128i mask, mask_A, mask_B, mask_C, ab; 00154 00155 mask = _mm_set_epi32(0x7FFFFFFF, 0, 0x7FFFFFFF, 0); 00156 //abs(A) 00157 mask_A = _mm_cmplt_epi32( a, mask); //FFF...F when a < 0 00158 a = _mm_xor_si128 ( a, mask_A ); //Invert when a < 0 00159 mask_C = _mm_srli_epi32( mask_A, 31 ); // 1 when a < 0 00160 a = _mm_add_epi32( a, mask_C ); //Add 1 when a < 0 00161 00162 //abs(B) 00163 mask_B = _mm_cmplt_epi32( b, mask); //FFF...F when b < 0 00164 b = _mm_xor_si128 ( b, mask_B ); //Invert when b < 0 00165 mask_C = _mm_srli_epi32( mask_B, 31 ); // 1 when b < 0 00166 b = _mm_add_epi32( b, mask_C ); //Add 1 when b < 0 00167 00168 ab = _mm_mul_epu32(a, b); 00169 00170 //correct negative cases 00171 mask_A = _mm_xor_si128(mask_A, mask_B); 00172 mask_C = _mm_srli_epi32(mask_A, 31 ); 00173 mask_B = _mm_slli_epi64(mask_A, 32); 00174 mask = _mm_add_epi32(mask_A, mask_B); 00175 a = _mm_xor_si128(ab, mask); 00176 a = _mm_add_epi64(a, mask_C); 00177 00178 return _mm_add_epi64(a, c); 00179 }
SSP_FORCEINLINE __m128i ssp_maccs_epi16_SSE2 | ( | __m128i | a, | |
__m128i | b, | |||
__m128i | c | |||
) |
SSE2 implementation of _mm_maccs_epi16/ pmacssww [SSE5]. (SSE5 .pdf documentation here)
Definition at line 182 of file SSEPlus_emulation_SSE2.h.
00183 { 00184 //similar to the version in Framewave CBL 00185 __m128i ablo, abhi, unlo, unhi, signC, clo, chi; 00186 00187 ablo = _mm_mullo_epi16( a, b ); 00188 abhi = _mm_mulhi_epi16( a, b ); 00189 unlo = _mm_unpacklo_epi16( ablo, abhi ); 00190 unhi = _mm_unpackhi_epi16( ablo, abhi ); 00191 00192 //unpack and keep the sign of C 00193 signC = _mm_srai_epi16 (c, 15); 00194 chi = _mm_unpackhi_epi16(c, signC); 00195 clo = _mm_unpacklo_epi16(c, signC); 00196 00197 chi = _mm_add_epi32(chi, unhi); 00198 clo = _mm_add_epi32(clo, unlo); 00199 00200 return _mm_packs_epi32(clo, chi); 00201 }
SSP_FORCEINLINE __m128i ssp_maccs_epi32_SSE2 | ( | __m128i | a, | |
__m128i | b, | |||
__m128i | c | |||
) |
SSE2 implementation of _mm_maccs_epi32/ pmacssdd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 204 of file SSEPlus_emulation_SSE2.h.
00205 { 00206 //Version 1, slightly modified from Framewave CBL 00207 ssp_m128 s1lo,s1hi,s2lo,s2hi,s3lo,s3hi, sl, sh; 00208 static const __m128d max_val = {(double)0x7FFFFFFFl, (double)0x7FFFFFFFl}; 00209 static const __m128d min_val = {(-(double)0x80000000l), (-(double)0x80000000l)}; 00210 00211 s1lo.d = _mm_cvtepi32_pd(a); 00212 s1hi.d = _mm_cvtepi32_pd(_mm_srli_si128(a, 8)); 00213 00214 s2lo.d = _mm_cvtepi32_pd(b); 00215 s2hi.d = _mm_cvtepi32_pd(_mm_srli_si128(b,8)); 00216 00217 s1lo.d = _mm_mul_pd(s1lo.d,s2lo.d); 00218 s1hi.d = _mm_mul_pd(s1hi.d,s2hi.d); 00219 00220 s3lo.d = _mm_cvtepi32_pd(c); 00221 s3hi.d = _mm_cvtepi32_pd(_mm_srli_si128(c,8)); 00222 00223 s1lo.d = _mm_add_pd(s1lo.d,s3lo.d); 00224 s1hi.d = _mm_add_pd(s1hi.d,s3hi.d); 00225 00226 sl.d = _mm_min_pd(s1lo.d, max_val); 00227 sl.d = _mm_max_pd(sl.d, min_val); 00228 00229 sh.d = _mm_min_pd(s1hi.d, max_val); 00230 sh.d = _mm_max_pd(sh.d, min_val); 00231 00232 sl.i = _mm_cvtpd_epi32(sl.d); 00233 sh.i = _mm_cvtpd_epi32(sh.d); 00234 00235 sh.i = _mm_slli_si128(sh.i, 8); 00236 sl.i = _mm_or_si128(sl.i, sh.i); 00237 00238 return sl.i; 00239 }
SSP_FORCEINLINE __m128i ssp_maddubs_epi16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_maddubs_epi16 [SSSE3]. (Searches MSDN)
in: 2 registers x 16 x 8 bit values (a is unsigned, b is signed) out: 1 register x 8 x 16 bit values
r0 := SATURATE_16((a0 * b0) + (a1 * b1))
Definition at line 799 of file SSEPlus_emulation_SSE2.h.
00800 { 00801 const static __m128i EVEN_8 = SSP_CONST_SET_8I( 0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF); 00802 __m128i Aodd, Aeven, Beven, Bodd; 00803 00804 // Convert the 8 bit inputs into 16 bits by dropping every other value 00805 Aodd = _mm_srli_epi16( a, 8 ); // A is unsigned 00806 Bodd = _mm_srai_epi16( b, 8 ); // B is signed 00807 00808 Aeven = _mm_and_si128 ( a, EVEN_8 ); // A is unsigned 00809 Beven = _mm_slli_si128( b, 1 ); // B is signed 00810 Beven = _mm_srai_epi16( Beven, 8 ); 00811 00812 a = _mm_mullo_epi16( Aodd , Bodd ); // Will always fit in lower 16 00813 b = _mm_mullo_epi16( Aeven, Beven ); 00814 a = _mm_adds_epi16 ( a, b ); 00815 return a; 00816 }
SSP_FORCEINLINE __m128i ssp_max_epi32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_max_epi32 [SSE4.1]. (Searches MSDN)
Definition at line 765 of file SSEPlus_emulation_SSE2.h.
00766 { 00767 __m128i mask = _mm_cmpgt_epi32( a, b ); // FFFFFFFF where a > b 00768 a = ssp_logical_bitwise_select_SSE2( a, b, mask ); 00769 return a; 00770 }
SSP_FORCEINLINE __m128i ssp_max_epi8_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_max_epi8 [SSE4.1]. (Searches MSDN)
Definition at line 729 of file SSEPlus_emulation_SSE2.h.
00730 { 00731 __m128i mask = _mm_cmpgt_epi8( a, b ); // FFFFFFFF where a > b 00732 a = ssp_logical_bitwise_select_SSE2( a, b, mask ); 00733 return a; 00734 }
SSP_FORCEINLINE __m128i ssp_max_epu16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_max_epu16 [SSE4.1]. (Searches MSDN)
Definition at line 747 of file SSEPlus_emulation_SSE2.h.
00748 { 00749 __m128i mask = ssp_comgt_epu16_SSE2( a, b ); 00750 a = ssp_logical_bitwise_select_SSE2( a, b, mask ); 00751 return a; 00752 }
SSP_FORCEINLINE __m128i ssp_max_epu32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_max_epu32 [SSE4.1]. (Searches MSDN)
Definition at line 783 of file SSEPlus_emulation_SSE2.h.
00784 { 00785 __m128i mask = ssp_comgt_epu32_SSE2( a, b ); 00786 a = ssp_logical_bitwise_select_SSE2( a, b, mask ); 00787 return a; 00788 }
SSP_FORCEINLINE __m128i ssp_min_epi32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_min_epi32 [SSE4.1]. (Searches MSDN)
Definition at line 756 of file SSEPlus_emulation_SSE2.h.
00757 { 00758 __m128i mask = _mm_cmplt_epi32( a, b ); // FFFFFFFF where a < b 00759 a = ssp_logical_bitwise_select_SSE2( a, b, mask ); 00760 return a; 00761 }
SSP_FORCEINLINE __m128i ssp_min_epi8_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_min_epi8 [SSE4.1]. (Searches MSDN)
Definition at line 720 of file SSEPlus_emulation_SSE2.h.
00721 { 00722 __m128i mask = _mm_cmplt_epi8( a, b ); // FFFFFFFF where a < b 00723 a = ssp_logical_bitwise_select_SSE2( a, b, mask ); 00724 return a; 00725 }
SSP_FORCEINLINE __m128i ssp_min_epu16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_min_epu16 [SSE4.1]. (Searches MSDN)
Definition at line 738 of file SSEPlus_emulation_SSE2.h.
00739 { 00740 __m128i mask = ssp_comlt_epu16_SSE2( a, b ); 00741 a = ssp_logical_bitwise_select_SSE2( a, b, mask ); 00742 return a; 00743 }
SSP_FORCEINLINE __m128i ssp_min_epu32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_min_epu32 [SSE4.1]. (Searches MSDN)
Definition at line 774 of file SSEPlus_emulation_SSE2.h.
00775 { 00776 __m128i mask = ssp_comlt_epu32_SSE2( a, b ); 00777 a = ssp_logical_bitwise_select_SSE2( a, b, mask ); 00778 return a; 00779 }
SSP_FORCEINLINE __m128d ssp_movedup_pd_SSE2 | ( | __m128d | a | ) |
SSE2 implementation of _mm_movedup_pd [SSE3]. (Searches MSDN)
Definition at line 1757 of file SSEPlus_emulation_SSE2.h.
SSP_FORCEINLINE __m128 ssp_movehdup_ps_SSE2 | ( | __m128 | a | ) |
SSE2 implementation of _mm_movehdup_ps [SSE3]. (Searches MSDN)
Definition at line 1739 of file SSEPlus_emulation_SSE2.h.
01740 { 01741 ssp_m128 A; 01742 A.f = a; 01743 A.i = _mm_shuffle_epi32( A.i, _MM_SHUFFLE( 3, 3, 1, 1) ); 01744 return A.f; 01745 }
SSP_FORCEINLINE __m128 ssp_moveldup_ps_SSE2 | ( | __m128 | a | ) |
SSE2 implementation of _mm_moveldup_ps [SSE3]. (Searches MSDN)
Definition at line 1748 of file SSEPlus_emulation_SSE2.h.
01749 { 01750 ssp_m128 A; 01751 A.f = a; 01752 A.i = _mm_shuffle_epi32( A.i, _MM_SHUFFLE( 2, 2, 0, 0) ); 01753 return A.f; 01754 }
/** SSE2 implementation of _mm_mpsadbw_epu8 [SSE4.1].
 *
 * \param a    source bytes; bit 2 of \a msk selects a 4-byte offset into it.
 * \param b    reference bytes; bits 0..1 of \a msk select which 32-bit group
 *             is broadcast and used as the 4-byte reference block.
 * \param msk  control value (only bits 0..2 are examined).
 *
 * Four passes are made, one per byte offset 0..3 into \a a. Each pass widens
 * eight bytes to 16 bits, takes absolute differences against the widened
 * reference, and lets ssp_arithmetic_hadd4_epi16_SSE2 (defined elsewhere;
 * presumably sums groups of four words into the slot named by its second
 * argument) reduce them. The MASK_BITSxy constants keep only the two output
 * words each pass is responsible for, and the passes are OR-ed together.
 *
 * Defined at line 822 of SSEPlus_emulation_SSE2.h.
 */
SSP_FORCEINLINE __m128i ssp_mpsadbw_epu8_SSE2( __m128i a, __m128i b, const int msk )
{
    static const __m128i MASK_BITS04 = SSP_CONST_SET_16I( 0,0,0,0xFFFF,0,0,0,0xFFFF );
    static const __m128i MASK_BITS15 = SSP_CONST_SET_16I( 0,0,0xFFFF,0,0,0,0xFFFF,0 );
    static const __m128i MASK_BITS26 = SSP_CONST_SET_16I( 0,0xFFFF,0,0,0,0xFFFF,0,0 );
    static const __m128i MASK_BITS37 = SSP_CONST_SET_16I( 0xFFFF,0,0,0,0xFFFF,0,0,0 );

    __m128i src = a;
    __m128i ref = b;
    __m128i window, sad, result;

    // Bit 2 of the control value: start four bytes further into the source.
    if( (msk & 0x4) == 4 )
    {
        src = _mm_srli_si128( src, 4 );
    }

    // Bits 0..1 of the control value: broadcast the chosen 32-bit reference group.
    switch( msk & 0x3 )
    {
    case 0: ref = _mm_shuffle_epi32( ref, _MM_SHUFFLE(0,0,0,0) ); break;
    case 1: ref = _mm_shuffle_epi32( ref, _MM_SHUFFLE(1,1,1,1) ); break;
    case 2: ref = _mm_shuffle_epi32( ref, _MM_SHUFFLE(2,2,2,2) ); break;
    case 3: ref = _mm_shuffle_epi32( ref, _MM_SHUFFLE(3,3,3,3) ); break;
    }

    // Widen the reference bytes to words: r0 r1 r2 r3 | r0 r1 r2 r3
    ref = _mm_unpacklo_epi8( ref, _mm_setzero_si128() );

    // Pass 0 -> output words 0 and 4
    window = _mm_unpacklo_epi8( src, _mm_setzero_si128() );
    sad    = _mm_subs_epi16( window, ref );
    sad    = ssp_abs_epi16_SSE2( sad );
    sad    = ssp_arithmetic_hadd4_epi16_SSE2( sad, 0 );
    result = _mm_and_si128( sad, MASK_BITS04 );

    // Pass 1 -> output words 1 and 5 (source advanced by one byte)
    window = _mm_srli_si128( src, 1 );
    window = _mm_unpacklo_epi8( window, _mm_setzero_si128() );
    sad    = _mm_subs_epi16( window, ref );
    sad    = ssp_abs_epi16_SSE2( sad );
    sad    = ssp_arithmetic_hadd4_epi16_SSE2( sad, 1 );
    result = _mm_or_si128( result, _mm_and_si128( sad, MASK_BITS15 ) );

    // Pass 2 -> output words 2 and 6 (source advanced by two bytes)
    window = _mm_srli_si128( src, 2 );
    window = _mm_unpacklo_epi8( window, _mm_setzero_si128() );
    sad    = _mm_subs_epi16( window, ref );
    sad    = ssp_abs_epi16_SSE2( sad );
    sad    = ssp_arithmetic_hadd4_epi16_SSE2( sad, 2 );
    result = _mm_or_si128( result, _mm_and_si128( sad, MASK_BITS26 ) );

    // Pass 3 -> output words 3 and 7 (source advanced by three bytes)
    window = _mm_srli_si128( src, 3 );
    window = _mm_unpacklo_epi8( window, _mm_setzero_si128() );
    sad    = _mm_subs_epi16( window, ref );
    sad    = ssp_abs_epi16_SSE2( sad );
    sad    = ssp_arithmetic_hadd4_epi16_SSE2( sad, 3 );
    result = _mm_or_si128( result, _mm_and_si128( sad, MASK_BITS37 ) );

    return result;
}
SSP_FORCEINLINE __m128d ssp_msub_pd_SSE2 | ( | __m128d | a, | |
__m128d | b, | |||
__m128d | c | |||
) |
SSE2 implementation of _mm_msub_pd/fmsubpd [SSE5]. (SSE5 .pdf documentation here)
Definition at line 305 of file SSEPlus_emulation_SSE2.h.
SSP_FORCEINLINE __m128 ssp_msub_ps_SSE2 | ( | __m128 | a, | |
__m128 | b, | |||
__m128 | c | |||
) |
SSE2 implementation of _mm_msub_ps/fmsubps [SSE5]. (SSE5 .pdf documentation here)
Definition at line 297 of file SSEPlus_emulation_SSE2.h.
/** SSE2 implementation of _mm_msub_sd/fmsubsd [SSE5].
 *
 * Runs the packed variant ssp_msub_pd_SSE2( a, b, c ) on the whole vector and
 * then merges it with \a a through ssp_logical_bitwise_select_SSE2 under a
 * mask whose upper 64 bits are set (presumably: upper double from \a a, lower
 * double from the computed result -- the select helper is defined elsewhere).
 * The original author noted this was faster than using two shuffles.
 *
 * Defined at line 326 of SSEPlus_emulation_SSE2.h.
 */
SSP_FORCEINLINE __m128d ssp_msub_sd_SSE2( __m128d a, __m128d b, __m128d c )
{
    static const __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0, 0 );

    ssp_m128 passthrough, computed;

    passthrough.d = a;
    computed.d    = ssp_msub_pd_SSE2( a, b, c );
    computed.i    = ssp_logical_bitwise_select_SSE2( passthrough.i, computed.i, mask );
    return computed.d;
}
/** SSE2 implementation of _mm_msub_ss/fmsubss [SSE5].
 *
 * Runs the packed variant ssp_msub_ps_SSE2( a, b, c ) on the whole vector and
 * then merges it with \a a through ssp_logical_bitwise_select_SSE2 under a
 * mask whose upper 96 bits are set (presumably: upper three floats from \a a,
 * lowest float from the computed result -- the select helper is defined
 * elsewhere). The original author noted this was faster than two shuffles.
 *
 * Defined at line 313 of SSEPlus_emulation_SSE2.h.
 */
SSP_FORCEINLINE __m128 ssp_msub_ss_SSE2( __m128 a, __m128 b, __m128 c )
{
    static const __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0 );

    ssp_m128 passthrough, computed;

    passthrough.f = a;
    computed.f    = ssp_msub_ps_SSE2( a, b, c );
    computed.i    = ssp_logical_bitwise_select_SSE2( passthrough.i, computed.i, mask );
    return computed.f;
}
/** SSE2 implementation of _mm_mulhrs_epi16 [SSSE3].
 *
 * For each signed 16-bit lane computes (a * b + 0x4000) >> 15 on the full
 * 32-bit product and returns the low 16 bits of that value.
 *
 * The 32-bit products are rebuilt by interleaving the low and high halves
 * produced by _mm_mullo_epi16 / _mm_mulhi_epi16.
 *
 * Defined at line 674 of SSEPlus_emulation_SSE2.h.
 */
SSP_FORCEINLINE __m128i ssp_mulhrs_epi16_SSE2( __m128i a, __m128i b )
{
    static const __m128i VAL = SSP_CONST_SET_32I( 0x4000, 0x4000, 0x4000, 0x4000 );
    __m128i prod_lo16, prod_hi16, upper, lower;

    prod_lo16 = _mm_mullo_epi16( a, b );
    prod_hi16 = _mm_mulhi_epi16( a, b );

    upper = _mm_unpackhi_epi16( prod_lo16, prod_hi16 );   // 32-bit products of lanes 4..7
    lower = _mm_unpacklo_epi16( prod_lo16, prod_hi16 );   // 32-bit products of lanes 0..3

    upper = _mm_srai_epi32( _mm_add_epi32( upper, VAL ), 15 );
    lower = _mm_srai_epi32( _mm_add_epi32( lower, VAL ), 15 );

    // -32768 * -32768 rounds to +32768, which does not fit in a signed word.
    // The hardware instruction keeps the low 16 bits (0x8000); a saturating
    // pack would turn it into 0x7FFF. Sign-extend from bit 15 first so the
    // pack below never has anything to saturate.
    upper = _mm_srai_epi32( _mm_slli_epi32( upper, 16 ), 16 );
    lower = _mm_srai_epi32( _mm_slli_epi32( lower, 16 ), 16 );

    return _mm_packs_epi32( lower, upper );
}
/** SSE2 implementation of _mm_nmacc_pd/fnmaddpd [SSE5].
 *
 * Per double lane: ((a * b) * -1.0) + c, evaluated as three separate
 * (individually rounded) SSE2 operations.
 *
 * Defined at line 256 of SSEPlus_emulation_SSE2.h.
 */
SSP_FORCEINLINE __m128d ssp_nmacc_pd_SSE2( __m128d a, __m128d b, __m128d c )
{
    static const __m128d neg1 = SSP_CONST_SET_64F( -1.0, -1.0 );

    const __m128d product = _mm_mul_pd( a, b );
    const __m128d negated = _mm_mul_pd( product, neg1 );
    return _mm_add_pd( negated, c );
}
/** SSE2 implementation of _mm_nmacc_ps/fnmaddps [SSE5].
 *
 * Per float lane: ((a * b) * -1.0f) + c, evaluated as three separate
 * (individually rounded) SSE operations.
 *
 * Defined at line 245 of SSEPlus_emulation_SSE2.h.
 */
SSP_FORCEINLINE __m128 ssp_nmacc_ps_SSE2( __m128 a, __m128 b, __m128 c )
{
    static const __m128 neg1 = SSP_CONST_SET_32F( -1.0f, -1.0f, -1.0f, -1.0f );

    const __m128 product = _mm_mul_ps( a, b );
    const __m128 negated = _mm_mul_ps( product, neg1 );
    return _mm_add_ps( negated, c );
}
/** SSE2 implementation of _mm_nmacc_sd/fnmaddsd [SSE5].
 *
 * Runs the packed variant ssp_nmacc_pd_SSE2( a, b, c ) on the whole vector and
 * then merges it with \a a through ssp_logical_bitwise_select_SSE2 under a
 * mask whose upper 64 bits are set (presumably: upper double from \a a, lower
 * double from the computed result -- the select helper is defined elsewhere).
 * The original author noted this was faster than using two shuffles.
 *
 * Defined at line 280 of SSEPlus_emulation_SSE2.h.
 */
SSP_FORCEINLINE __m128d ssp_nmacc_sd_SSE2( __m128d a, __m128d b, __m128d c )
{
    static const __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0, 0 );

    ssp_m128 passthrough, computed;

    passthrough.d = a;
    computed.d    = ssp_nmacc_pd_SSE2( a, b, c );
    computed.i    = ssp_logical_bitwise_select_SSE2( passthrough.i, computed.i, mask );
    return computed.d;
}
/** SSE2 implementation of _mm_nmacc_ss/fnmaddss [SSE5].
 *
 * Runs the packed variant ssp_nmacc_ps_SSE2( a, b, c ) on the whole vector and
 * then merges it with \a a through ssp_logical_bitwise_select_SSE2 under a
 * mask whose upper 96 bits are set (presumably: upper three floats from \a a,
 * lowest float from the computed result -- the select helper is defined
 * elsewhere). The original author noted this was faster than two shuffles.
 *
 * TODO: confirm -- the original listing carries an open "confirm" marker on
 * this function without saying what is to be confirmed.
 *
 * Defined at line 267 of SSEPlus_emulation_SSE2.h.
 */
SSP_FORCEINLINE __m128 ssp_nmacc_ss_SSE2( __m128 a, __m128 b, __m128 c )
{
    static const __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0 );

    ssp_m128 passthrough, computed;

    passthrough.f = a;
    computed.f    = ssp_nmacc_ps_SSE2( a, b, c );
    computed.i    = ssp_logical_bitwise_select_SSE2( passthrough.i, computed.i, mask );
    return computed.f;
}
/** SSE2 implementation of _mm_nmsub_pd/fnmsubpd [SSE5].
 *
 * Per double lane: ((a * b) * -1.0) - c, evaluated as three separate
 * (individually rounded) SSE2 operations.
 *
 * Defined at line 354 of SSEPlus_emulation_SSE2.h.
 */
SSP_FORCEINLINE __m128d ssp_nmsub_pd_SSE2( __m128d a, __m128d b, __m128d c )
{
    static const __m128d neg1 = SSP_CONST_SET_64F( -1.0, -1.0 );

    const __m128d product = _mm_mul_pd( a, b );
    const __m128d negated = _mm_mul_pd( product, neg1 );
    return _mm_sub_pd( negated, c );
}
/** SSE2 implementation of _mm_nmsub_ps/fnmsubps [SSE5].
 *
 * Per float lane: ((a * b) * -1.0f) - c, evaluated as three separate
 * (individually rounded) SSE operations.
 *
 * Defined at line 343 of SSEPlus_emulation_SSE2.h.
 */
SSP_FORCEINLINE __m128 ssp_nmsub_ps_SSE2( __m128 a, __m128 b, __m128 c )
{
    static const __m128 neg1 = SSP_CONST_SET_32F( -1.0f, -1.0f, -1.0f, -1.0f );

    const __m128 product = _mm_mul_ps( a, b );
    const __m128 negated = _mm_mul_ps( product, neg1 );
    return _mm_sub_ps( negated, c );
}
/** SSE2 implementation of _mm_nmsub_sd/fnmsubsd [SSE5].
 *
 * Runs the packed variant ssp_nmsub_pd_SSE2( a, b, c ) on the whole vector and
 * then merges it with \a a through ssp_logical_bitwise_select_SSE2 under a
 * mask whose upper 64 bits are set (presumably: upper double from \a a, lower
 * double from the computed result -- the select helper is defined elsewhere).
 * The original author noted this was faster than using two shuffles.
 *
 * Defined at line 378 of SSEPlus_emulation_SSE2.h.
 */
SSP_FORCEINLINE __m128d ssp_nmsub_sd_SSE2( __m128d a, __m128d b, __m128d c )
{
    static const __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0, 0 );

    ssp_m128 passthrough, computed;

    passthrough.d = a;
    computed.d    = ssp_nmsub_pd_SSE2( a, b, c );
    computed.i    = ssp_logical_bitwise_select_SSE2( passthrough.i, computed.i, mask );
    return computed.d;
}
/** SSE2 implementation of _mm_nmsub_ss/fnmsubss [SSE5].
 *
 * Runs the packed variant ssp_nmsub_ps_SSE2( a, b, c ) on the whole vector and
 * then merges it with \a a through ssp_logical_bitwise_select_SSE2 under a
 * mask whose upper 96 bits are set (presumably: upper three floats from \a a,
 * lowest float from the computed result -- the select helper is defined
 * elsewhere). The original author noted this was faster than two shuffles.
 *
 * Defined at line 365 of SSEPlus_emulation_SSE2.h.
 */
SSP_FORCEINLINE __m128 ssp_nmsub_ss_SSE2( __m128 a, __m128 b, __m128 c )
{
    static const __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0 );

    ssp_m128 passthrough, computed;

    passthrough.f = a;
    computed.f    = ssp_nmsub_ps_SSE2( a, b, c );
    computed.i    = ssp_logical_bitwise_select_SSE2( passthrough.i, computed.i, mask );
    return computed.f;
}
/** SSE2 implementation of _mm_packus_epi32 [SSE4.1].
 *
 * SSE2 only offers a signed-saturating 32->16 pack, so the inputs are biased
 * down by 0x8000 before _mm_packs_epi32 and the packed words are biased back
 * up by 0x8000 (wrapping 16-bit add) afterwards.
 *
 * Defined at line 1283 of SSEPlus_emulation_SSE2.h.
 */
SSP_FORCEINLINE __m128i ssp_packus_epi32_SSE2( __m128i a, __m128i b )
{
    static const __m128i val_32 = SSP_CONST_SET_32I( 0x8000, 0x8000, 0x8000, 0x8000 );
    static const __m128i val_16 = SSP_CONST_SET_16I( 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000 );

    const __m128i biased_a = _mm_sub_epi32( a, val_32 );
    const __m128i biased_b = _mm_sub_epi32( b, val_32 );
    const __m128i packed   = _mm_packs_epi32( biased_a, biased_b );

    return _mm_add_epi16( packed, val_16 );
}
/** SSE2 implementation of _mm_rot_epi16/protw [SSE5].
 *
 * Rotates each 16-bit lane of \a a by the signed count held in the matching
 * lane of \a b: a non-negative count rotates left, a negative count rotates
 * right by its magnitude. Counts are reduced modulo 16.
 *
 * The lanes are processed one at a time through the ssp_m128 union.
 *
 * Defined at line 1794 of SSEPlus_emulation_SSE2.h.
 */
SSP_FORCEINLINE __m128i ssp_rot_epi16_SSE2( __m128i a, __m128i b )
{
    int lane;
    ssp_m128 A, B;
    A.i = a;
    B.i = b;

    for( lane = 0; lane < 8; lane++ )
    {
        const unsigned int value = A.u16[lane];
        unsigned int left, right;

        if( B.s16[lane] < 0 )
        {
            // Negation happens after promotion to int, so -(-32768) cannot overflow.
            right = (unsigned int)( (-B.s16[lane]) % 16 );
            left  = (16 - right) % 16;
        }
        else
        {
            // Reduce modulo the lane width (16), not 8: counts 8..15 are valid left rotates.
            left  = (unsigned int)( B.s16[lane] % 16 );
            right = (16 - left) % 16;
        }

        // When the count is 0 both shifts are 0 and the OR returns the value unchanged.
        A.u16[lane] = (ssp_u16)( (value << left) | (value >> right) );
    }
    return A.i;
}
/** SSE2 implementation of _mm_rot_epi32/protd [SSE5].
 *
 * Rotates each 32-bit lane of \a a by the signed count held in the matching
 * lane of \a b: a non-negative count rotates left, a negative count rotates
 * right by its magnitude. Counts are reduced modulo 32.
 *
 * The lanes are processed one at a time through the ssp_m128 union.
 *
 * Defined at line 1823 of SSEPlus_emulation_SSE2.h.
 */
SSP_FORCEINLINE __m128i ssp_rot_epi32_SSE2( __m128i a, __m128i b )
{
    int lane;
    ssp_m128 A, B;
    A.i = a;
    B.i = b;

    for( lane = 0; lane < 4; lane++ )
    {
        const ssp_u32 value = A.u32[lane];
        unsigned int left, right;

        if( B.s32[lane] < 0 )
        {
            // Take the magnitude in unsigned arithmetic: negating the most
            // negative 32-bit value as a signed int would overflow.
            right = ( 0u - (unsigned int)B.s32[lane] ) % 32;
            left  = (32 - right) % 32;
        }
        else
        {
            left  = (unsigned int)B.s32[lane] % 32;
            right = (32 - left) % 32;
        }

        // When the count is 0 both shifts are 0 and the OR returns the value unchanged.
        A.u32[lane] = (value << left) | (value >> right);
    }
    return A.i;
}
/** SSE2 implementation of _mm_rot_epi64/protq [SSE5].
 *
 * Rotates each 64-bit lane of \a a by the signed count held in the matching
 * lane of \a b: a non-negative count rotates left, a negative count rotates
 * right by its magnitude. Counts are reduced modulo 64.
 *
 * The lanes are processed one at a time through the ssp_m128 union.
 *
 * Defined at line 1852 of SSEPlus_emulation_SSE2.h.
 */
SSP_FORCEINLINE __m128i ssp_rot_epi64_SSE2( __m128i a, __m128i b )
{
    int lane;
    ssp_m128 A, B;
    A.i = a;
    B.i = b;

    for( lane = 0; lane < 2; lane++ )
    {
        const ssp_u64 value = A.u64[lane];
        unsigned int left, right;

        if( B.s64[lane] < 0 )
        {
            // Take the magnitude in unsigned arithmetic: negating the most
            // negative 64-bit value as a signed integer would overflow.
            right = (unsigned int)( ( 0 - (ssp_u64)B.s64[lane] ) % 64 );
            left  = (64 - right) % 64;
        }
        else
        {
            left  = (unsigned int)( B.s64[lane] % 64 );
            right = (64 - left) % 64;
        }

        // When the count is 0 both shifts are 0 and the OR returns the value unchanged.
        A.u64[lane] = (value << left) | (value >> right);
    }
    return A.i;
}
/** SSE2 implementation of _mm_rot_epi8/protb [SSE5].
 *
 * Rotates each byte lane of \a a by the signed count held in the matching
 * lane of \a b: a non-negative count rotates left, a negative count rotates
 * right by its magnitude. Counts are reduced modulo 8.
 *
 * The lanes are processed one at a time through the ssp_m128 union.
 *
 * Defined at line 1765 of SSEPlus_emulation_SSE2.h.
 */
SSP_FORCEINLINE __m128i ssp_rot_epi8_SSE2( __m128i a, __m128i b )
{
    int lane;
    ssp_m128 A, B;
    A.i = a;
    B.i = b;

    for( lane = 0; lane < 16; lane++ )
    {
        const unsigned int value = A.u8[lane];
        unsigned int left, right;

        if( B.s8[lane] < 0 )
        {
            // Negation happens after promotion to int, so -(-128) cannot overflow.
            right = (unsigned int)( (-B.s8[lane]) % 8 );
            left  = (8 - right) % 8;
        }
        else
        {
            left  = (unsigned int)( B.s8[lane] % 8 );
            right = (8 - left) % 8;
        }

        // Bits shifted past bit 7 are dropped by the narrowing cast.
        A.u8[lane] = (unsigned char)( (value << left) | (value >> right) );
    }
    return A.i;
}
/** SSE2 implementation of _mm_roti_epi16/protw [SSE5].
 *
 * Rotates every 16-bit lane of \a a by the same count \a b: a non-negative
 * count rotates left, a negative count rotates right by its magnitude.
 * The count is reduced modulo 16.
 *
 * Defined at line 1907 of SSEPlus_emulation_SSE2.h.
 */
SSP_FORCEINLINE __m128i ssp_roti_epi16_SSE2( __m128i a, const int b )
{
    unsigned int left, right;

    if( b < 0 )
    {
        // Magnitude in unsigned arithmetic: -b would overflow for the most negative int.
        right = ( 0u - (unsigned int)b ) % 16;
        left  = (16 - right) % 16;
    }
    else
    {
        left  = (unsigned int)b % 16;
        right = (16 - left) % 16;
    }

    // A zero count makes both shifts zero, so the OR returns the input unchanged.
    return _mm_or_si128( _mm_slli_epi16( a, (int)left ), _mm_srli_epi16( a, (int)right ) );
}
/** SSE2 implementation of _mm_roti_epi32/protd [SSE5].
 *
 * Rotates every 32-bit lane of \a a by the same count \a b: a non-negative
 * count rotates left, a negative count rotates right by its magnitude.
 * The count is reduced modulo 32.
 *
 * Defined at line 1932 of SSEPlus_emulation_SSE2.h.
 */
SSP_FORCEINLINE __m128i ssp_roti_epi32_SSE2( __m128i a, const int b )
{
    unsigned int left, right;

    if( b < 0 )
    {
        // Magnitude in unsigned arithmetic: -b would overflow for the most negative int.
        right = ( 0u - (unsigned int)b ) % 32;
        left  = (32 - right) % 32;
    }
    else
    {
        left  = (unsigned int)b % 32;
        right = (32 - left) % 32;
    }

    // A zero count makes both shifts zero, so the OR returns the input unchanged.
    return _mm_or_si128( _mm_slli_epi32( a, (int)left ), _mm_srli_epi32( a, (int)right ) );
}
/** SSE2 implementation of _mm_roti_epi64/protq [SSE5].
 *
 * Rotates every 64-bit lane of \a a by the same count \a b: a non-negative
 * count rotates left, a negative count rotates right by its magnitude.
 * The count is reduced modulo 64.
 *
 * Defined at line 1957 of SSEPlus_emulation_SSE2.h.
 */
SSP_FORCEINLINE __m128i ssp_roti_epi64_SSE2( __m128i a, const int b )
{
    unsigned int left, right;

    if( b < 0 )
    {
        // Magnitude in unsigned arithmetic: -b would overflow for the most negative int.
        right = ( 0u - (unsigned int)b ) % 64;
        left  = (64 - right) % 64;
    }
    else
    {
        left  = (unsigned int)b % 64;
        right = (64 - left) % 64;
    }

    // A zero count makes both shifts zero, so the OR returns the input unchanged.
    return _mm_or_si128( _mm_slli_epi64( a, (int)left ), _mm_srli_epi64( a, (int)right ) );
}
/** SSE2 implementation of _mm_roti_epi8/protb [SSE5].
 *
 * Rotates every byte lane of \a a by the same count \a b: a non-negative
 * count rotates left, a negative count rotates right by its magnitude.
 * The count is reduced modulo 8. The per-byte shifts come from
 * ssp_slli_epi8_SSE2 / ssp_srli_epi8_SSE2, which are defined elsewhere.
 *
 * Defined at line 1882 of SSEPlus_emulation_SSE2.h.
 */
SSP_FORCEINLINE __m128i ssp_roti_epi8_SSE2( __m128i a, const int b )
{
    unsigned int left, right;

    if( b < 0 )
    {
        // Magnitude in unsigned arithmetic: -b would overflow for the most negative int.
        right = ( 0u - (unsigned int)b ) % 8;
        left  = (8 - right) % 8;
    }
    else
    {
        left  = (unsigned int)b % 8;
        right = (8 - left) % 8;
    }

    // A zero count makes both shifts zero, so the OR returns the input unchanged.
    return _mm_or_si128( ssp_slli_epi8_SSE2( a, left ), ssp_srli_epi8_SSE2( a, right ) );
}
/** Rounds both doubles of \a a to integral values using SSE2 only
 * (presumably the SSE2 stand-in for _mm_round_pd [SSE4.1]; the source page
 * gives no description for this function).
 *
 * The rounding is done by converting to 32-bit integers and back while MXCSR
 * is temporarily set to the requested mode, so inputs outside the 32-bit
 * integer range overflow (see the compile-time message below).
 *
 * \param a           values to round.
 * \param iRoundMode  SSP_FROUND_* selector; only its low two bits are examined.
 *
 * NOTE(review): the selector is masked with 0x3 before being compared with
 * the SSP_FROUND_* constants, and the CSR_* values replace the whole MXCSR
 * rather than just its rounding field -- confirm both are intended.
 *
 * Defined at line 985 of SSEPlus_emulation_SSE2.h.
 */
SSP_FORCEINLINE __m128d ssp_round_pd_SSE2( __m128d a, int iRoundMode )
{
    #pragma message( "" WARN() "SSEPlus SSE2 rounding functions overflow if input outside 32 bit integer range" )

    enum ENUM_MXCSR
    {
        CSR_ROUND_TO_EVEN = 0x00001F80,
        CSR_ROUND_DOWN    = 0x00003F80,
        CSR_ROUND_UP      = 0x00005F80,
        CSR_ROUND_TRUNC   = 0x00007F80,
    };

    ssp_u32  saved_csr = _mm_getcsr();   // restored before returning
    ssp_m128 value, as_int;
    value.d = a;

    switch( iRoundMode & 0x3 )
    {
    case SSP_FROUND_CUR_DIRECTION:                                     break;
    case SSP_FROUND_TO_ZERO:       _mm_setcsr( CSR_ROUND_TRUNC );      break;
    case SSP_FROUND_TO_POS_INF:    _mm_setcsr( CSR_ROUND_UP );         break;
    case SSP_FROUND_TO_NEG_INF:    _mm_setcsr( CSR_ROUND_DOWN );       break;
    default:                       _mm_setcsr( CSR_ROUND_TO_EVEN );    break;
    }

    as_int.i = _mm_cvtpd_epi32( value.d );   // to integer under the selected rounding mode
    value.d  = _mm_cvtepi32_pd( as_int.i );  // and back to double

    // Route the saved MXCSR through the conversion temporary: the original
    // author reports MSVC hoisting the restore above the converts otherwise.
    as_int.u32[0] = saved_csr;
    _mm_setcsr( as_int.u32[0] );
    return value.d;
}
/** SSE2 implementation of _mm_round_ps [SSE4.1].
 *
 * NOTE_1: When rounding from negative numbers to zero, this function returns
 *         0 and NOT -0. If that behavior is desired, use the slower function
 *         ssp_round_ps_neg_zero_SSE2().
 * NOTE_2: This function should be used only with input in the range
 *         (-2,147,483,648 -> 2,147,483,647). If a greater range is desired,
 *         use the slower function ssp_round_ps_REF().
 *
 * The rounding is done by converting to 32-bit integers and back while MXCSR
 * is temporarily set to the requested mode.
 *
 * \param a           values to round.
 * \param iRoundMode  SSP_FROUND_* selector; only its low two bits are examined.
 *
 * NOTE(review): the selector is masked with 0x3 before being compared with
 * the SSP_FROUND_* constants, and the CSR_* values replace the whole MXCSR
 * rather than just its rounding field -- confirm both are intended.
 *
 * Defined at line 950 of SSEPlus_emulation_SSE2.h.
 */
SSP_FORCEINLINE __m128 ssp_round_ps_SSE2( __m128 a, int iRoundMode )
{
    #pragma message( "" WARN() "SSEPlus SSE2 rounding functions overflow if input outside 32 bit integer range" )

    enum ENUM_MXCSR
    {
        CSR_ROUND_TO_EVEN = 0x00001F80,
        CSR_ROUND_DOWN    = 0x00003F80,
        CSR_ROUND_UP      = 0x00005F80,
        CSR_ROUND_TRUNC   = 0x00007F80,
    };

    ssp_u32  saved_csr = _mm_getcsr();   // restored before returning
    ssp_m128 value, as_int;
    value.f = a;

    switch( iRoundMode & 0x3 )
    {
    case SSP_FROUND_CUR_DIRECTION:                                     break;
    case SSP_FROUND_TO_ZERO:       _mm_setcsr( CSR_ROUND_TRUNC );      break;
    case SSP_FROUND_TO_POS_INF:    _mm_setcsr( CSR_ROUND_UP );         break;
    case SSP_FROUND_TO_NEG_INF:    _mm_setcsr( CSR_ROUND_DOWN );       break;
    default:                       _mm_setcsr( CSR_ROUND_TO_EVEN );    break;
    }

    as_int.i = _mm_cvtps_epi32( value.f );   // to integer under the selected rounding mode
    value.f  = _mm_cvtepi32_ps( as_int.i );  // and back to float

    // Route the saved MXCSR through the conversion temporary: the original
    // author reports MSVC hoisting the restore above the converts otherwise.
    as_int.u32[0] = saved_csr;
    _mm_setcsr( as_int.u32[0] );
    return value.f;
}
/** SSE2 implementation of _mm_round_ss [SSE4.1].
 *
 * Rounds \a b with ssp_round_ps_SSE2 and returns { round(b)[0], a1, a2, a3 }
 * (lowest lane first).
 *
 * Two shuffles are used to merge the vectors; according to the original
 * author, the earlier variant built on _mm_move_ss produced a linker error
 * on the x64 platform.
 *
 * Defined at line 1021 of SSEPlus_emulation_SSE2.h.
 */
SSP_FORCEINLINE __m128 ssp_round_ss_SSE2( __m128 a, __m128 b, int iRoundMode )
{
    const __m128 rounded = ssp_round_ps_SSE2( b, iRoundMode );                   // every lane of b rounded
    const __m128 staged  = _mm_shuffle_ps( rounded, a, _MM_SHUFFLE(1,1,0,0) );  // { r0, r0, a1, a1 }

    return _mm_shuffle_ps( staged, a, _MM_SHUFFLE(3,2,2,0) );                   // { r0, a1, a2, a3 }
}
SSP_FORCEINLINE __m128i ssp_sha_epi16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of ssp_sha_epi16/pshaw [SSE5]. (SSE5 .pdf documentation here)
Definition at line 2100 of file SSEPlus_emulation_SSE2.h.
02101 { 02102 __m128i v1, v2, mask, mask2, b1, b2; 02103 b1 = ssp_abs_epi8_SSE2( b ); 02104 mask = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, -1 ); 02105 mask2 = _mm_srli_epi16( mask, 12 ); // the shfit count is a 4 bit value 02106 02107 b2 = _mm_and_si128( b1, mask2 ); 02108 v1 = _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ); // negative shift 02109 v2 = _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ); // positive shift 02110 mask = _mm_slli_si128( mask, 2 ); 02111 b1 = _mm_srli_si128( b1, 2 ); 02112 02113 b2 = _mm_and_si128( b1, mask2 ); 02114 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) ); // negative shift 02115 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift 02116 mask = _mm_slli_si128( mask, 2 ); 02117 b1 = _mm_srli_si128( b1, 2 ); 02118 02119 b2 = _mm_and_si128( b1, mask2 ); 02120 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) ); // negative shift 02121 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift 02122 mask = _mm_slli_si128( mask, 2 ); 02123 b1 = _mm_srli_si128( b1, 2 ); 02124 02125 b2 = _mm_and_si128( b1, mask2 ); 02126 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) ); // negative shift 02127 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift 02128 mask = _mm_slli_si128( mask, 2 ); 02129 b1 = _mm_srli_si128( b1, 2 ); 02130 02131 b2 = _mm_and_si128( b1, mask2 ); 02132 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) ); // negative shift 02133 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift 02134 mask = _mm_slli_si128( mask, 2 ); 02135 b1 = _mm_srli_si128( b1, 2 ); 02136 02137 b2 = _mm_and_si128( b1, mask2 ); 02138 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) ); // negative shift 02139 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift 02140 mask = 
_mm_slli_si128( mask, 2 ); 02141 b1 = _mm_srli_si128( b1, 2 ); 02142 02143 b2 = _mm_and_si128( b1, mask2 ); 02144 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) ); // negative shift 02145 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift 02146 mask = _mm_slli_si128( mask, 2 ); 02147 b1 = _mm_srli_si128( b1, 2 ); 02148 02149 b2 = _mm_and_si128( b1, mask2 ); 02150 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) ); // negative shift 02151 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift 02152 02153 mask = _mm_setzero_si128(); 02154 mask = _mm_cmpgt_epi8( mask, b ); // set mask to 0xFF for all negative shift counts in b 02155 mask2 = _mm_slli_epi16( mask, 8 ); 02156 mask = _mm_or_si128( mask, mask2 ); 02157 v1 = _mm_and_si128( v1, mask ); 02158 mask = _mm_andnot_si128( mask, v2 ); 02159 v1 = _mm_or_si128( v1, mask ); 02160 return v1; 02161 }
SSP_FORCEINLINE __m128i ssp_sha_epi32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of ssp_sha_epi32/pshad [SSE5]. (SSE5 .pdf documentation here)
Definition at line 2204 of file SSEPlus_emulation_SSE2.h.
02205 { 02206 __m128i v1, v2, mask, mask2, b1, b2; 02207 b1 = ssp_abs_epi8_SSE2( b ); 02208 mask = _mm_set_epi32( 0, 0, 0, -1 ); 02209 mask2 = _mm_srli_epi32( mask, 27 ); // the shfit count is a 5 bit value 02210 02211 b2 = _mm_and_si128( b1, mask2 ); 02212 v1 = _mm_and_si128( _mm_sra_epi32( a, b2 ), mask ); // negative shift 02213 v2 = _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ); // positive shift 02214 mask = _mm_slli_si128( mask, 4 ); 02215 b1 = _mm_srli_si128( b1, 4 ); 02216 02217 b2 = _mm_and_si128( b1, mask2 ); 02218 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi32( a, b2 ), mask ) ); // negative shift 02219 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ) ); // positive shift 02220 mask = _mm_slli_si128( mask, 4 ); 02221 b1 = _mm_srli_si128( b1, 4 ); 02222 02223 b2 = _mm_and_si128( b1, mask2 ); 02224 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi32( a, b2 ), mask ) ); // negative shift 02225 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ) ); // positive shift 02226 mask = _mm_slli_si128( mask, 4 ); 02227 b1 = _mm_srli_si128( b1, 4 ); 02228 02229 b2 = _mm_and_si128( b1, mask2 ); 02230 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi32( a, b2 ), mask ) ); // negative shift 02231 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ) ); // positive shift 02232 02233 mask = _mm_setzero_si128(); 02234 mask = _mm_cmpgt_epi8( mask, b ); // set mask to 0xFF for all negative shift counts in b 02235 mask = _mm_slli_epi32( mask, 24 ); 02236 mask = _mm_srai_epi32( mask, 24 ); 02237 v1 = _mm_and_si128( v1, mask ); 02238 mask = _mm_andnot_si128( mask, v2 ); 02239 v1 = _mm_or_si128( v1, mask ); 02240 return v1; 02241 }
SSP_FORCEINLINE __m128i ssp_sha_epi64_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of ssp_sha_epi64/pshaq [SSE5]. (SSE5 .pdf documentation here)
Definition at line 2274 of file SSEPlus_emulation_SSE2.h.
02275 { 02276 int n; 02277 ssp_m128 A,B; 02278 A.i = a; 02279 B.i = b; 02280 02281 for( n = 0; n < 2; n++ ) 02282 { 02283 if( B.s8[n*8] < 0 ) 02284 { 02285 unsigned int count = (-B.s8[n*8]) % 64; 02286 A.s64[n] = A.s64[n] >> count; 02287 } 02288 else 02289 { 02290 unsigned int count = B.s8[n*8] % 64; 02291 A.s64[n] = A.s64[n] << count; 02292 } 02293 } 02294 02295 return A.i; 02296 }
SSP_FORCEINLINE __m128i ssp_sha_epi8_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of ssp_sha_epi8/pshab [SSE5]. (SSE5 .pdf documentation here)
Definition at line 2011 of file SSEPlus_emulation_SSE2.h.
02012 { 02013 int n; 02014 ssp_m128 A,B; 02015 A.i = a; 02016 B.i = b; 02017 02018 for( n = 0; n < 16; n++ ) 02019 { 02020 if( B.s8[n] < 0 ) 02021 { 02022 unsigned int count = (-B.s8[n]) % 8; 02023 A.s8[n] = A.s8[n] >> count; 02024 } 02025 else 02026 { 02027 unsigned int count = B.s8[n] % 8; 02028 A.s8[n] = A.s8[n] << count; 02029 } 02030 } 02031 02032 return A.i; 02033 }
SSP_FORCEINLINE __m128i ssp_shl_epi16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of ssp_shl_epi16/pshlw [SSE5]. (SSE5 .pdf documentation here)
Definition at line 2036 of file SSEPlus_emulation_SSE2.h.
02037 { 02038 __m128i v1, v2, mask, mask2, b1, b2; 02039 b1 = ssp_abs_epi8_SSE2( b ); 02040 mask = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, -1 ); 02041 mask2 = _mm_srli_epi16( mask, 12 ); // the shfit count is a 4 bit value 02042 02043 b2 = _mm_and_si128( b1, mask2 ); 02044 v1 = _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ); // negative shift 02045 v2 = _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ); // positive shift 02046 mask = _mm_slli_si128( mask, 2 ); 02047 b1 = _mm_srli_si128( b1, 2 ); 02048 02049 b2 = _mm_and_si128( b1, mask2 ); 02050 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) ); // negative shift 02051 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift 02052 mask = _mm_slli_si128( mask, 2 ); 02053 b1 = _mm_srli_si128( b1, 2 ); 02054 02055 b2 = _mm_and_si128( b1, mask2 ); 02056 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) ); // negative shift 02057 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift 02058 mask = _mm_slli_si128( mask, 2 ); 02059 b1 = _mm_srli_si128( b1, 2 ); 02060 02061 b2 = _mm_and_si128( b1, mask2 ); 02062 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) ); // negative shift 02063 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift 02064 mask = _mm_slli_si128( mask, 2 ); 02065 b1 = _mm_srli_si128( b1, 2 ); 02066 02067 b2 = _mm_and_si128( b1, mask2 ); 02068 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) ); // negative shift 02069 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift 02070 mask = _mm_slli_si128( mask, 2 ); 02071 b1 = _mm_srli_si128( b1, 2 ); 02072 02073 b2 = _mm_and_si128( b1, mask2 ); 02074 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) ); // negative shift 02075 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift 02076 mask = 
_mm_slli_si128( mask, 2 ); 02077 b1 = _mm_srli_si128( b1, 2 ); 02078 02079 b2 = _mm_and_si128( b1, mask2 ); 02080 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) ); // negative shift 02081 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift 02082 mask = _mm_slli_si128( mask, 2 ); 02083 b1 = _mm_srli_si128( b1, 2 ); 02084 02085 b2 = _mm_and_si128( b1, mask2 ); 02086 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) ); // negative shift 02087 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift 02088 02089 mask = _mm_setzero_si128(); 02090 mask = _mm_cmpgt_epi8( mask, b ); // set mask to 0xFF for all negative shift counts in b 02091 mask2 = _mm_slli_epi16( mask, 8 ); 02092 mask = _mm_or_si128( mask, mask2 ); 02093 v1 = _mm_and_si128( v1, mask ); 02094 mask = _mm_andnot_si128( mask, v2 ); 02095 v1 = _mm_or_si128( v1, mask ); 02096 return v1; 02097 }
SSP_FORCEINLINE __m128i ssp_shl_epi32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of ssp_shl_epi32/pshld [SSE5]. (SSE5 .pdf documentation here)
Definition at line 2164 of file SSEPlus_emulation_SSE2.h.
02165 { 02166 __m128i v1, v2, mask, mask2, b1, b2; 02167 b1 = ssp_abs_epi8_SSE2( b ); 02168 mask = _mm_set_epi32( 0, 0, 0, -1 ); 02169 mask2 = _mm_srli_epi32( mask, 27 ); // the shfit count is a 5 bit value 02170 02171 b2 = _mm_and_si128( b1, mask2 ); 02172 v1 = _mm_and_si128( _mm_srl_epi32( a, b2 ), mask ); // negative shift 02173 v2 = _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ); // positive shift 02174 mask = _mm_slli_si128( mask, 4 ); 02175 b1 = _mm_srli_si128( b1, 4 ); 02176 02177 b2 = _mm_and_si128( b1, mask2 ); 02178 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi32( a, b2 ), mask ) ); // negative shift 02179 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ) ); // positive shift 02180 mask = _mm_slli_si128( mask, 4 ); 02181 b1 = _mm_srli_si128( b1, 4 ); 02182 02183 b2 = _mm_and_si128( b1, mask2 ); 02184 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi32( a, b2 ), mask ) ); // negative shift 02185 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ) ); // positive shift 02186 mask = _mm_slli_si128( mask, 4 ); 02187 b1 = _mm_srli_si128( b1, 4 ); 02188 02189 b2 = _mm_and_si128( b1, mask2 ); 02190 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi32( a, b2 ), mask ) ); // negative shift 02191 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ) ); // positive shift 02192 02193 mask = _mm_setzero_si128(); 02194 mask = _mm_cmpgt_epi8( mask, b ); // set mask to 0xFF for all negative shift counts in b 02195 mask = _mm_slli_epi32( mask, 24 ); 02196 mask = _mm_srai_epi32( mask, 24 ); 02197 v1 = _mm_and_si128( v1, mask ); 02198 mask = _mm_andnot_si128( mask, v2 ); 02199 v1 = _mm_or_si128( v1, mask ); 02200 return v1; 02201 }
SSP_FORCEINLINE __m128i ssp_shl_epi64_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of ssp_shl_epi64/pshlq [SSE5]. (SSE5 .pdf documentation here)
Definition at line 2244 of file SSEPlus_emulation_SSE2.h.
02245 { 02246 __m128i v1, v2, mask, mask2, b1, b2; 02247 b1 = ssp_abs_epi8_SSE2( b ); 02248 mask = _mm_set_epi32( 0, 0, -1, -1 ); 02249 mask2 = _mm_srli_epi64( mask, 58 ); // the shfit count is a 6 bit value 02250 02251 b2 = _mm_and_si128( b1, mask2 ); 02252 v1 = _mm_and_si128( _mm_srl_epi64( a, b2 ), mask ); // negative shift 02253 v2 = _mm_and_si128( _mm_sll_epi64( a, b2 ), mask ); // positive shift 02254 mask = _mm_slli_si128( mask, 8 ); 02255 b1 = _mm_srli_si128( b1, 8 ); 02256 02257 b2 = _mm_and_si128( b1, mask2 ); 02258 v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi64( a, b2 ), mask ) ); // negative shift 02259 v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi64( a, b2 ), mask ) ); // positive shift 02260 02261 mask = _mm_setzero_si128(); 02262 mask = _mm_cmpgt_epi8( mask, b ); // set mask to 0xFF for all negative shift counts in b 02263 mask = _mm_slli_epi16( mask, 8 ); 02264 mask = _mm_srai_epi16( mask, 8 ); 02265 mask = _mm_shufflelo_epi16( mask, _MM_SHUFFLE(0,0,0,0) ); 02266 mask = _mm_shufflehi_epi16( mask, _MM_SHUFFLE(0,0,0,0) ); 02267 v1 = _mm_and_si128( v1, mask ); 02268 mask = _mm_andnot_si128( mask, v2 ); 02269 v1 = _mm_or_si128( v1, mask ); 02270 return v1; 02271 }
SSP_FORCEINLINE __m128i ssp_shl_epi8_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of ssp_shl_epi8/pshlb [SSE5]. (SSE5 .pdf documentation here)
Definition at line 1987 of file SSEPlus_emulation_SSE2.h.
01988 { 01989 int n; 01990 ssp_m128 A,B; 01991 A.i = a; 01992 B.i = b; 01993 01994 for( n = 0; n < 16; n++ ) 01995 { 01996 if( B.s8[n] < 0 ) 01997 { 01998 unsigned int count = (-B.s8[n]) % 8; 01999 A.u8[n] = A.u8[n] >> count; 02000 } 02001 else 02002 { 02003 unsigned int count = B.s8[n] % 8; 02004 A.u8[n] = A.u8[n] << count; 02005 } 02006 } 02007 return A.i; 02008 }
SSP_FORCEINLINE __m128i ssp_shuffle_epi8_SSE2 | ( | __m128i | a, | |
__m128i | mask | |||
) |
SSE2 implementation of _mm_shuffle_epi8 [SSSE3]. (Searches MSDN)
Definition at line 1601 of file SSEPlus_emulation_SSE2.h.
01602 { 01603 ssp_m128 A,B, MASK, maskZero; 01604 A.i = a; 01605 maskZero.i = ssp_comge_epi8_SSE2( mask, _mm_setzero_si128() ); 01606 MASK.i = _mm_and_si128 ( mask, _mm_set1_epi8( (char)0x0F) ); 01607 01608 B.s8[ 0] = A.s8[ (MASK.s8[ 0]) ]; 01609 B.s8[ 1] = A.s8[ (MASK.s8[ 1]) ]; 01610 B.s8[ 2] = A.s8[ (MASK.s8[ 2]) ]; 01611 B.s8[ 3] = A.s8[ (MASK.s8[ 3]) ]; 01612 B.s8[ 4] = A.s8[ (MASK.s8[ 4]) ]; 01613 B.s8[ 5] = A.s8[ (MASK.s8[ 5]) ]; 01614 B.s8[ 6] = A.s8[ (MASK.s8[ 6]) ]; 01615 B.s8[ 7] = A.s8[ (MASK.s8[ 7]) ]; 01616 B.s8[ 8] = A.s8[ (MASK.s8[ 8]) ]; 01617 B.s8[ 9] = A.s8[ (MASK.s8[ 9]) ]; 01618 B.s8[10] = A.s8[ (MASK.s8[10]) ]; 01619 B.s8[11] = A.s8[ (MASK.s8[11]) ]; 01620 B.s8[12] = A.s8[ (MASK.s8[12]) ]; 01621 B.s8[13] = A.s8[ (MASK.s8[13]) ]; 01622 B.s8[14] = A.s8[ (MASK.s8[14]) ]; 01623 B.s8[15] = A.s8[ (MASK.s8[15]) ]; 01624 01625 B.i = _mm_and_si128( B.i, maskZero.i ); 01626 return B.i; 01627 }
SSP_FORCEINLINE __m128i ssp_sign_epi16_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_sign_epi16 [SSSE3]. (Searches MSDN)
Definition at line 1654 of file SSEPlus_emulation_SSE2.h.
01655 { 01656 __m128i c, d, zero; 01657 01658 zero=_mm_setzero_si128(); 01659 d = _mm_cmpgt_epi16(b, zero); 01660 c = _mm_cmplt_epi16(b, zero); 01661 d = _mm_srli_epi16(d, 15); 01662 c = _mm_or_si128(c, d); 01663 a = _mm_mullo_epi16(a, c); 01664 01665 //The following method has same performance 01666 //zero=_mm_setzero_si128(); 01667 //d = _mm_cmpgt_epi16(b, zero); 01668 //c = _mm_cmplt_epi16(b, zero); 01669 //one = _mm_set1_epi16(1); 01670 //d = _mm_and_si128(d, one); 01671 //c = _mm_add_epi16(c, d); 01672 //a = _mm_mullo_epi16(a, c); 01673 01674 return a; 01675 }
SSP_FORCEINLINE __m128i ssp_sign_epi32_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
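SSE2 implementation of _mm_sign_epi32 [SSSE3]. (Searches MSDN) NOTE: The listing below completes the two's-complement negation with _mm_add_epi8, so the +1 carry does not propagate across byte boundaries; when an element of b is negative and the low byte of the matching element of a is zero, the result differs from _mm_sign_epi32 (e.g. a = 256 yields -512 instead of -256, and a = 0 yields -256 instead of 0).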
Definition at line 1679 of file SSEPlus_emulation_SSE2.h.
01680 { 01681 __m128i ap, an, c, d, zero, one; 01682 01683 zero=_mm_setzero_si128(); 01684 //Great than zero part 01685 d = _mm_cmpgt_epi32(b, zero); 01686 ap = _mm_and_si128(a, d); 01687 01688 //Less than zero 01689 c = _mm_cmplt_epi32(b, zero); 01690 one = _mm_set1_epi32(1); 01691 an = _mm_and_si128(a, c); //get the all number which needs to be negated 01692 an = _mm_xor_si128(an, c); 01693 one = _mm_and_si128(one, c); 01694 an = _mm_add_epi8(an, one); 01695 01696 return _mm_or_si128(an, ap); 01697 }
SSP_FORCEINLINE __m128i ssp_sign_epi8_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_sign_epi8 [SSSE3]. (Searches MSDN)
Definition at line 1632 of file SSEPlus_emulation_SSE2.h.
01633 { 01634 __m128i ap, an, c, d, zero, one; 01635 01636 zero=_mm_setzero_si128(); 01637 //Great than zero part 01638 d = _mm_cmpgt_epi8(b, zero); 01639 ap = _mm_and_si128(a, d); 01640 01641 //Less than zero 01642 c = _mm_cmplt_epi8(b, zero); 01643 one = _mm_set1_epi8(1); 01644 an = _mm_and_si128(a, c); //get the all number which needs to be negated 01645 an = _mm_xor_si128(an, c); 01646 one = _mm_and_si128(one, c); 01647 an = _mm_add_epi8(an, one); 01648 01649 return _mm_or_si128(an, ap);//_mm_add_epi8(an, ap); 01650 }
SSP_FORCEINLINE int ssp_testc_si128_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
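SSE2 implementation of _mm_testc_si128 [SSE4.1]. (Searches MSDN) NOTE: The listing below XORs a with b and tests the result for zero, so it returns 1 only when a == b; _mm_testc_si128 is specified to return 1 whenever (~a & b) == 0 (e.g. a = all ones with any b).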
Definition at line 1703 of file SSEPlus_emulation_SSE2.h.
01704 { 01705 a = _mm_xor_si128( a, b ); 01706 return ssp_testz_si128_SSE2( a, a ); 01707 }
SSP_FORCEINLINE int ssp_testnzc_si128_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_testnzc_si128 [SSE4.1]. (Searches MSDN)
Definition at line 1721 of file SSEPlus_emulation_SSE2.h.
01722 { 01723 ssp_m128 zf, cf; 01724 01725 zf.i = _mm_and_si128 ( a, b ); 01726 zf.i = _mm_packs_epi32( zf.i, _mm_setzero_si128() ); 01727 01728 cf.i = _mm_andnot_si128( a, b ); 01729 cf.i = _mm_packs_epi32( cf.i, _mm_setzero_si128() ); 01730 01731 return ( !(zf.u64[0] == 0) && !(cf.u64[0] == 0)); 01732 }
SSP_FORCEINLINE int ssp_testz_si128_SSE2 | ( | __m128i | a, | |
__m128i | b | |||
) |
SSE2 implementation of _mm_testz_si128 [SSE4.1]. (Searches MSDN)
Definition at line 1711 of file SSEPlus_emulation_SSE2.h.
01712 { 01713 ssp_m128 t; 01714 t.i = _mm_and_si128 ( a, b ); 01715 t.i = _mm_packs_epi32( t.i, _mm_setzero_si128() ); 01716 return t.u64[0] == 0; 01717 }