SSE2
[Emulated]


SSE[3,4A,...,5] implemented in SSE2

SSP_FORCEINLINE __m128i ssp_comeq_epi16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comeq_epi32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comeq_epi64_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comeq_epi8_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comeq_epu16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comeq_epu32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comeq_epu64_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comeq_epu8_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128d ssp_comeq_pd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comeq_ps_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_comeq_sd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comeq_ss_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128i ssp_comlt_epi16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comlt_epi32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comlt_epi64_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comlt_epi8_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comlt_epu16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comlt_epu32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comlt_epu64_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comlt_epu8_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128d ssp_comlt_pd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comlt_ps_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_comlt_sd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comlt_ss_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128i ssp_comle_epi16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comle_epi32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comle_epi64_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comle_epi8_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comle_epu16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comle_epu32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comle_epu64_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comle_epu8_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128d ssp_comle_pd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comle_ps_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_comle_sd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comle_ss_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_comunord_pd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comunord_ps_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_comunord_sd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comunord_ss_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128i ssp_comneq_epi16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comneq_epi32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comneq_epi64_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comneq_epi8_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comneq_epu16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comneq_epu32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comneq_epu64_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comneq_epu8_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128d ssp_comneq_pd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comneq_ps_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_comneq_sd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comneq_ss_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_comnlt_pd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comnlt_ps_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_comnlt_sd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comnlt_ss_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_comnle_pd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comnle_ps_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_comnle_sd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comnle_ss_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_comord_pd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comord_ps_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_comord_sd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comord_ss_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_comueq_pd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comueq_ps_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_comueq_sd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comueq_ss_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_comnge_pd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comnge_ps_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_comnge_sd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comnge_ss_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_comngt_pd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comngt_ps_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_comngt_sd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comngt_ss_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128i ssp_comfalse_epi16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comfalse_epi32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comfalse_epi64_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comfalse_epi8_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comfalse_epu16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comfalse_epu32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comfalse_epu64_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comfalse_epu8_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128d ssp_comfalse_pd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comfalse_ps_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_comfalse_sd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comfalse_ss_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_comoneq_pd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comoneq_ps_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_comoneq_sd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comoneq_ss_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128i ssp_comge_epi16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comge_epi32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comge_epi64_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comge_epi8_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comge_epu16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comge_epu32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comge_epu64_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comge_epu8_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128d ssp_comge_pd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comge_ps_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_comge_sd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comge_ss_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128i ssp_comgt_epi16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comgt_epi32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comgt_epi64_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comgt_epi8_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comgt_epu16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comgt_epu32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comgt_epu64_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comgt_epu8_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128d ssp_comgt_pd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comgt_ps_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_comgt_sd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comgt_ss_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128i ssp_comtrue_epi16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comtrue_epi32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comtrue_epi64_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comtrue_epi8_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comtrue_epu16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comtrue_epu32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comtrue_epu64_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_comtrue_epu8_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128d ssp_comtrue_pd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comtrue_ps_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_comtrue_sd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_comtrue_ss_SSE2 (__m128 a, __m128 b)

SSE[3,4A,...,5] implemented in SSE2

SSP_FORCEINLINE __m128i ssp_macc_epi16_SSE2 (__m128i a, __m128i b, __m128i c)
SSP_FORCEINLINE __m128i ssp_macc_epi32_SSE2 (__m128i a, __m128i b, __m128i c)
SSP_FORCEINLINE __m128d ssp_macc_pd_SSE2 (__m128d a, __m128d b, __m128d c)
SSP_FORCEINLINE __m128 ssp_macc_ps_SSE2 (__m128 a, __m128 b, __m128 c)
SSP_FORCEINLINE __m128d ssp_macc_sd_SSE2 (__m128d a, __m128d b, __m128d c)
SSP_FORCEINLINE __m128 ssp_macc_ss_SSE2 (__m128 a, __m128 b, __m128 c)
SSP_FORCEINLINE __m128i ssp_maccd_epi16_SSE2 (__m128i a, __m128i b, __m128i c)
SSP_FORCEINLINE __m128i ssp_macchi_epi32_SSE2 (__m128i a, __m128i b, __m128i c)
SSP_FORCEINLINE __m128i ssp_macclo_epi32_SSE2 (__m128i a, __m128i b, __m128i c)
SSP_FORCEINLINE __m128i ssp_maccs_epi16_SSE2 (__m128i a, __m128i b, __m128i c)
SSP_FORCEINLINE __m128i ssp_maccs_epi32_SSE2 (__m128i a, __m128i b, __m128i c)
SSP_FORCEINLINE __m128 ssp_nmacc_ps_SSE2 (__m128 a, __m128 b, __m128 c)
SSP_FORCEINLINE __m128d ssp_nmacc_pd_SSE2 (__m128d a, __m128d b, __m128d c)
SSP_FORCEINLINE __m128 ssp_nmacc_ss_SSE2 (__m128 a, __m128 b, __m128 c)
SSP_FORCEINLINE __m128d ssp_nmacc_sd_SSE2 (__m128d a, __m128d b, __m128d c)
SSP_FORCEINLINE __m128 ssp_msub_ps_SSE2 (__m128 a, __m128 b, __m128 c)
SSP_FORCEINLINE __m128d ssp_msub_pd_SSE2 (__m128d a, __m128d b, __m128d c)
SSP_FORCEINLINE __m128 ssp_msub_ss_SSE2 (__m128 a, __m128 b, __m128 c)
SSP_FORCEINLINE __m128d ssp_msub_sd_SSE2 (__m128d a, __m128d b, __m128d c)
SSP_FORCEINLINE __m128 ssp_nmsub_ps_SSE2 (__m128 a, __m128 b, __m128 c)
SSP_FORCEINLINE __m128d ssp_nmsub_pd_SSE2 (__m128d a, __m128d b, __m128d c)
SSP_FORCEINLINE __m128 ssp_nmsub_ss_SSE2 (__m128 a, __m128 b, __m128 c)
SSP_FORCEINLINE __m128d ssp_nmsub_sd_SSE2 (__m128d a, __m128d b, __m128d c)
SSP_FORCEINLINE __m128i ssp_abs_epi8_SSE2 (__m128i a)
SSP_FORCEINLINE __m128i ssp_abs_epi16_SSE2 (__m128i a)
SSP_FORCEINLINE __m128i ssp_abs_epi32_SSE2 (__m128i a)
SSP_FORCEINLINE __m128 ssp_addsub_ps_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_addsub_pd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128i ssp_blend_epi16_SSE2 (__m128i a, __m128i b, const int mask)
SSP_FORCEINLINE __m128d ssp_blend_pd_SSE2 (__m128d a, __m128d b, const int mask)
SSP_FORCEINLINE __m128 ssp_blend_ps_SSE2 (__m128 a, __m128 b, const int mask)
SSP_FORCEINLINE __m128i ssp_blendv_epi8_SSE2 (__m128i a, __m128i b, __m128i mask)
SSP_FORCEINLINE __m128d ssp_blendv_pd_SSE2 (__m128d a, __m128d b, __m128d mask)
SSP_FORCEINLINE __m128 ssp_blendv_ps_SSE2 (__m128 a, __m128 b, __m128 mask)
SSP_FORCEINLINE __m128i ssp_cmpeq_epi64_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_hadd_epi16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_hadds_epi16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_hsub_epi16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_hsubs_epi16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_hadd_epi32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_hsub_epi32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128 ssp_hadd_ps_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128 ssp_hsub_ps_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128d ssp_hadd_pd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128d ssp_hsub_pd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128i ssp_mulhrs_epi16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_insert_epi32_SSE2 (__m128i a, int b, const int ndx)
SSP_FORCEINLINE __m128i ssp_min_epi8_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_max_epi8_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_min_epu16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_max_epu16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_min_epi32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_max_epi32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_min_epu32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_max_epu32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_maddubs_epi16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_mpsadbw_epu8_SSE2 (__m128i a, __m128i b, const int msk)
SSP_FORCEINLINE __m128d ssp_dp_pd_SSE2 (__m128d a, __m128d b, const int mask)
SSP_FORCEINLINE __m128 ssp_dp_ps_SSE2 (__m128 a, __m128 b, const int mask)
SSP_FORCEINLINE __m128 ssp_round_ps_SSE2 (__m128 a, int iRoundMode)
SSP_FORCEINLINE __m128d ssp_round_pd_SSE2 (__m128d a, int iRoundMode)
SSP_FORCEINLINE __m128 ssp_round_ss_SSE2 (__m128 a, __m128 b, int iRoundMode)
SSP_FORCEINLINE __m128 ssp_ceil_ps_SSE2 (__m128 a)
SSP_FORCEINLINE __m128 ssp_floor_ps_SSE2 (__m128 a)
SSP_FORCEINLINE __m128d ssp_floor_pd_SSE2 (__m128d a)
SSP_FORCEINLINE __m128d ssp_ceil_pd_SSE2 (__m128d a)
SSP_FORCEINLINE __m128d ssp_floor_sd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128d ssp_ceil_sd_SSE2 (__m128d a, __m128d b)
SSP_FORCEINLINE __m128 ssp_floor_ss_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128 ssp_ceil_ss_SSE2 (__m128 a, __m128 b)
SSP_FORCEINLINE __m128i ssp_cvtepi8_epi16_SSE2 (__m128i a)
SSP_FORCEINLINE __m128i ssp_cvtepi8_epi32_SSE2 (__m128i a)
SSP_FORCEINLINE __m128i ssp_cvtepi8_epi64_SSE2 (__m128i a)
SSP_FORCEINLINE __m128i ssp_cvtepi16_epi32_SSE2 (__m128i a)
SSP_FORCEINLINE __m128i ssp_cvtepi16_epi64_SSE2 (__m128i a)
SSP_FORCEINLINE __m128i ssp_cvtepi32_epi64_SSE2 (__m128i a)
SSP_FORCEINLINE __m128i ssp_cvtepu8_epi16_SSE2 (__m128i a)
SSP_FORCEINLINE __m128i ssp_cvtepu8_epi32_SSE2 (__m128i a)
SSP_FORCEINLINE __m128i ssp_cvtepu8_epi64_SSE2 (__m128i a)
SSP_FORCEINLINE __m128i ssp_cvtepu16_epi32_SSE2 (__m128i a)
SSP_FORCEINLINE __m128i ssp_cvtepu16_epi64_SSE2 (__m128i a)
SSP_FORCEINLINE __m128i ssp_cvtepu32_epi64_SSE2 (__m128i a)
SSP_FORCEINLINE __m128i ssp_packus_epi32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_alignr_epi8_SSE2 (__m128i a, __m128i b, const int ralign)
SSP_FORCEINLINE __m128i ssp_insert_epi8_SSE2 (__m128i a, int b, const int ndx)
SSP_FORCEINLINE __m128i ssp_inserti_si64_SSE2 (__m128i a, __m128i b, int len, int ndx)
SSP_FORCEINLINE __m128i ssp_insert_si64_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE int ssp_extract_epi8_SSE2 (__m128i a, const int ndx)
SSP_FORCEINLINE int ssp_extract_epi32_SSE2 (__m128i a, const int imm)
SSP_FORCEINLINE int ssp_extract_ps_SSE2 (__m128 a, const int ndx)
SSP_FORCEINLINE ssp_s64 ssp_extract_epi64_SSE2 (__m128i a, const int ndx)
SSP_FORCEINLINE __m128i ssp_extracti_si64_SSE2 (__m128i a, int len, int ndx)
SSP_FORCEINLINE __m128i ssp_extract_si64_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_shuffle_epi8_SSE2 (__m128i a, __m128i mask)
SSP_FORCEINLINE __m128i ssp_sign_epi8_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_sign_epi16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_sign_epi32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE int ssp_testc_si128_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE int ssp_testz_si128_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE int ssp_testnzc_si128_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128 ssp_movehdup_ps_SSE2 (__m128 a)
SSP_FORCEINLINE __m128 ssp_moveldup_ps_SSE2 (__m128 a)
SSP_FORCEINLINE __m128d ssp_movedup_pd_SSE2 (__m128d a)
SSP_FORCEINLINE __m128i ssp_rot_epi8_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_rot_epi16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_rot_epi32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_rot_epi64_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_roti_epi8_SSE2 (__m128i a, const int b)
SSP_FORCEINLINE __m128i ssp_roti_epi16_SSE2 (__m128i a, const int b)
SSP_FORCEINLINE __m128i ssp_roti_epi32_SSE2 (__m128i a, const int b)
SSP_FORCEINLINE __m128i ssp_roti_epi64_SSE2 (__m128i a, const int b)
SSP_FORCEINLINE __m128i ssp_shl_epi8_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_sha_epi8_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_shl_epi16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_sha_epi16_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_shl_epi32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_sha_epi32_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_shl_epi64_SSE2 (__m128i a, __m128i b)
SSP_FORCEINLINE __m128i ssp_sha_epi64_SSE2 (__m128i a, __m128i b)

Function Documentation

SSP_FORCEINLINE __m128i ssp_abs_epi16_SSE2 ( __m128i  a  ) 

SSE2 implementation of _mm_abs_epi16 [SSSE3]. (Searches MSDN)

Definition at line 409 of file SSEPlus_emulation_SSE2.h.

00410 {
00411     __m128i mask = _mm_cmplt_epi16( a, _mm_setzero_si128() ); // 0xFFFF in each 16-bit lane where a < 0, else 0x0000
00412     a    = _mm_xor_si128 ( a, mask  );                        // One's complement (~a) where a < 0, unchanged elsewhere
00413     mask = _mm_srli_epi16( mask, 15 );                        // 0x0001 where a < 0, else 0x0000
00414     a    = _mm_add_epi16 ( a, mask  );                        // ~a + 1 == -a where a < 0 (two's-complement negate)
00415     return a;                                                 // The add wraps, so 0x8000 (-32768) comes back as 0x8000
00416 }

SSP_FORCEINLINE __m128i ssp_abs_epi32_SSE2 ( __m128i  a  ) 

SSE2 implementation of _mm_abs_epi32 [SSSE3]. (Searches MSDN)

Definition at line 420 of file SSEPlus_emulation_SSE2.h.

00421 {
00422     __m128i mask = _mm_cmplt_epi32( a, _mm_setzero_si128() ); // 0xFFFFFFFF in each 32-bit lane where a < 0, else 0
00423     a    = _mm_xor_si128 ( a, mask );                         // One's complement (~a) where a < 0, unchanged elsewhere
00424     mask = _mm_srli_epi32( mask, 31 );                        // 0x00000001 where a < 0, else 0
00425     a = _mm_add_epi32( a, mask );                             // ~a + 1 == -a where a < 0 (two's-complement negate)
00426         return a;                                             // The add wraps, so 0x80000000 comes back as 0x80000000
00427 }

SSP_FORCEINLINE __m128i ssp_abs_epi8_SSE2 ( __m128i  a  ) 

SSE2 implementation of _mm_abs_epi8 [SSSE3]. (Searches MSDN)

Definition at line 397 of file SSEPlus_emulation_SSE2.h.

00398 {
00399     __m128i mask = _mm_cmplt_epi8( a, _mm_setzero_si128() );  // 0xFF in each 8-bit lane where a < 0, else 0x00
00400         __m128i one  = _mm_set1_epi8(1);                      // 0x01 in every byte
00401     a    = _mm_xor_si128 ( a, mask  );                        // One's complement (~a) where a < 0, unchanged elsewhere
00402     mask = _mm_and_si128 ( mask, one );                       // 0x01 where a < 0, else 0x00 (AND used here; the 16/32-bit variants use a lane shift instead)
00403     a    = _mm_add_epi8  ( a, mask  );                        // ~a + 1 == -a where a < 0; the add wraps, so 0x80 stays 0x80
00404     return a;
00405 }

SSP_FORCEINLINE __m128d ssp_addsub_pd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_addsub_pd [SSE3]. (Searches MSDN)

Definition at line 443 of file SSEPlus_emulation_SSE2.h.

00444 {
00445     const static __m128d const_addSub_pd_neg = SSP_CONST_SET_64F( 1, -1 ); // Per-lane sign factors. NOTE(review): which lane receives -1 depends on SSP_CONST_SET_64F's argument order (macro not shown) - confirm
00446 
00447     b = _mm_mul_pd( b, const_addSub_pd_neg );                 // Negate b in the lane whose factor is -1, keep it in the other
00448     a = _mm_add_pd( a, b   );                                 // a + b where the factor is +1, a - b where it is -1
00449     return a;
00450 }

SSP_FORCEINLINE __m128 ssp_addsub_ps_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_addsub_ps [SSE3]. (Searches MSDN)

Definition at line 432 of file SSEPlus_emulation_SSE2.h.

00433 {
00434     const static __m128 neg = SSP_CONST_SET_32F(  1, -1, 1, -1 ); // Alternating per-lane sign factors. NOTE(review): lane order depends on SSP_CONST_SET_32F's argument order (macro not shown) - confirm
00435 
00436     b = _mm_mul_ps( b, neg );                                 // Negate b in the lanes whose factor is -1
00437     a = _mm_add_ps( a, b   );                                 // a + b where the factor is +1, a - b where it is -1
00438     return a;
00439 }

SSP_FORCEINLINE __m128i ssp_alignr_epi8_SSE2 ( __m128i  a,
__m128i  b,
const int  ralign 
)

SSE2 implementation of _mm_alignr_epi8 [SSSE3]. (Searches MSDN)

Definition at line 1300 of file SSEPlus_emulation_SSE2.h.

01301 {   // Returns the low 16 bytes of the 32-byte value a:b (a = high half, b = low half) shifted right by ralign bytes.
01302         if (ralign < 0)  return b; // Negative counts are not a right shift: b is returned unchanged
01303         // _mm_srli_si128 / _mm_slli_si128 take their byte count as an immediate,
01304         // so each supported count gets its own case instead of one computed shift:
01305         //    0..15 : (b >> ralign bytes) | (a << (16 - ralign) bytes)
01306         //   16..31 : a >> (ralign - 16) bytes  (b is shifted out entirely)
01307         //   >= 32  : zero                      (both halves are shifted out)
01308         //
01309         switch (ralign) {
01310         case 0:
01311                 return b;
01312         case 1:
01313                 b = _mm_srli_si128(b, 1);
01314                 a = _mm_slli_si128(a, 15);
01315                 return _mm_or_si128( a, b );
01316         case 2:
01317                 b = _mm_srli_si128(b, 2);
01318                 a = _mm_slli_si128(a, 14);
01319                 return _mm_or_si128( a, b );
01320         case 3:
01321                 b = _mm_srli_si128(b, 3);
01322                 a = _mm_slli_si128(a, 13);
01323                 return _mm_or_si128( a, b );
01324         case 4:
01325                 b = _mm_srli_si128(b, 4);
01326                 a = _mm_slli_si128(a, 12);
01327                 return _mm_or_si128( a, b );
01328         case 5:
01329                 b = _mm_srli_si128(b, 5);
01330                 a = _mm_slli_si128(a, 11);
01331                 return _mm_or_si128( a, b );
01332         case 6:
01333                 b = _mm_srli_si128(b, 6);
01334                 a = _mm_slli_si128(a, 10);
01335                 return _mm_or_si128( a, b );
01336         case 7:
01337                 b = _mm_srli_si128(b, 7);
01338                 a = _mm_slli_si128(a, 9);
01339                 return _mm_or_si128( a, b );
01340         case 8:
01341                 b = _mm_srli_si128(b, 8);
01342                 a = _mm_slli_si128(a, 8);
01343                 return _mm_or_si128( a, b );
01344         case 9:
01345                 b = _mm_srli_si128(b, 9);
01346                 a = _mm_slli_si128(a, 7);
01347                 return _mm_or_si128( a, b );
01348         case 10:
01349                 b = _mm_srli_si128(b, 10);
01350                 a = _mm_slli_si128(a,  6);
01351                 return _mm_or_si128( a, b );
01352         case 11:
01353                 b = _mm_srli_si128(b, 11);
01354                 a = _mm_slli_si128(a,  5);
01355                 return _mm_or_si128( a, b );
01356         case 12:
01357                 b = _mm_srli_si128(b, 12);
01358                 a = _mm_slli_si128(a,  4);
01359                 return _mm_or_si128( a, b );
01360         case 13:
01361                 b = _mm_srli_si128(b, 13);
01362                 a = _mm_slli_si128(a,  3);
01363                 return _mm_or_si128( a, b );
01364         case 14:
01365                 b = _mm_srli_si128(b, 14);
01366                 a = _mm_slli_si128(a,  2);
01367                 return _mm_or_si128( a, b );
01368         case 15:
01369                 b = _mm_srli_si128(b, 15);
01370                 a = _mm_slli_si128(a,  1);
01371                 return _mm_or_si128( a, b );
01372         case 16:
01373                 return a;
01374         case 17:
01375                 // From here on all of b is gone; the result is a >> (ralign - 16) bytes, zero-filled from the top
01376                 return _mm_srli_si128(a,  1);
01377         case 18:
01378                 // a >> 2 bytes
01379                 return _mm_srli_si128(a,  2);
01380         case 19:
01381                 // a >> 3 bytes
01382                 return _mm_srli_si128(a,  3);
01383         case 20:
01384                 // a >> 4 bytes
01385                 return _mm_srli_si128(a,  4);
01386         case 21:
01387                 // a >> 5 bytes
01388                 return _mm_srli_si128(a,  5);
01389         case 22:
01390                 // a >> 6 bytes
01391                 return _mm_srli_si128(a,  6);
01392         case 23:
01393                 // a >> 7 bytes
01394                 return _mm_srli_si128(a,  7);
01395         case 24:
01396                 // a >> 8 bytes
01397                 return _mm_srli_si128(a,  8);
01398         case 25:
01399                 // a >> 9 bytes
01400                 return _mm_srli_si128(a,  9);
01401         case 26:
01402                 // a >> 10 bytes
01403                 return _mm_srli_si128(a, 10);
01404         case 27:
01405                 // a >> 11 bytes
01406                 return _mm_srli_si128(a, 11);
01407         case 28:
01408                 // a >> 12 bytes
01409                 return _mm_srli_si128(a, 12);
01410         case 29:
01411                 // a >> 13 bytes
01412                 return _mm_srli_si128(a, 13);
01413         case 30:
01414                 // a >> 14 bytes
01415                 return _mm_srli_si128(a, 14);
01416         case 31:
01417                 // a >> 15 bytes
01418                 return _mm_srli_si128(a, 15);
01419         default:
01420                 return _mm_setzero_si128(); // ralign >= 32: everything is shifted out
01421         }
01422 }

SSP_FORCEINLINE __m128i ssp_blend_epi16_SSE2 ( __m128i  a,
__m128i  b,
const int  mask 
)

SSE2 implementation of _mm_blend_epi16 [SSE4.1]. (Searches MSDN)

Definition at line 458 of file SSEPlus_emulation_SSE2.h.

00459 {
00460     __m128i screen;
00461     const static __m128i mulShiftImm = SSP_CONST_SET_16I( 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, 0x8000 ); // Per-lane multipliers 2^8..2^15: multiplying by 2^k moves mask bit (15-k) into the lane's MSB. NOTE(review): which lane gets which multiplier depends on SSP_CONST_SET_16I's argument order (macro not shown) - confirm
00462 
00463     screen = _mm_set1_epi16  ( mask                );   // Broadcast the mask (truncated to 16 bits) to every 16-bit lane
00464     screen = _mm_mullo_epi16 ( screen, mulShiftImm );   // Low 16 bits of the product: one selected mask bit ends up in each lane's MSB
00465     screen = _mm_srai_epi16  ( screen, 15          );   // Arithmetic shift broadcasts the MSB: 0xFFFF or 0x0000 per lane
00466     b      = _mm_and_si128   ( screen,  b          );   // Keep b in the lanes where screen is 0xFFFF
00467     a      = _mm_andnot_si128( screen,  a          );   // Keep a in the lanes where screen is 0x0000 (~screen & a)
00468     a      = _mm_or_si128    (      a,  b          );   // Merge the two disjoint selections
00469     return a;
00470 }

SSP_FORCEINLINE __m128d ssp_blend_pd_SSE2 ( __m128d  a,
__m128d  b,
const int  mask 
)

SSE2 implementation of _mm_blend_pd [SSE4.1]. (Searches MSDN)

Definition at line 474 of file SSEPlus_emulation_SSE2.h.

00475 {   // Per 64-bit lane i: result takes b where bit i of mask is set, a where it is clear (only bits 0 and 1 are used).
00476     __m128d screen;
00477     screen = _mm_set_pd    ( (mask&0x2)>>1,    mask&0x1 );   // 1.0 in the lanes to take from b, 0.0 in the others
00478     screen = _mm_cmpneq_pd (        screen, _mm_setzero_pd() ); // Turn 1.0 / 0.0 into an all-ones / all-zero bit mask
00479     b      = _mm_and_pd    (        screen,           b );   // Bitwise select, not multiply by 0/1: keeps sign bits, Inf and NaN intact
00480     a      = _mm_andnot_pd (        screen,           a );   // ~screen & a: keep a in the lanes not taken from b
00481     a      = _mm_or_pd     (             a,           b );   // Merge the two disjoint selections
00482     return a;
00483 }

SSP_FORCEINLINE __m128 ssp_blend_ps_SSE2 ( __m128  a,
__m128  b,
const int  mask 
)

SSE2 implementation of _mm_blend_ps [SSE4.1]. (Searches MSDN)

Definition at line 487 of file SSEPlus_emulation_SSE2.h.

00488 {
00489     ssp_m128 screen, A, B;
00490     A.f = a;
00491     B.f = b;
00492     screen.i = ssp_movmask_imm8_to_epi32_SSE2( mask );
00493     screen.i = ssp_logical_bitwise_select_SSE2( B.i, A.i, screen.i );
00494     return screen.f;
00495 }

SSP_FORCEINLINE __m128i ssp_blendv_epi8_SSE2 ( __m128i  a,
__m128i  b,
__m128i  mask 
)

SSE2 implementation of _mm_blendv_epi8 [SSE4.1]. (Searches MSDN)

Definition at line 499 of file SSEPlus_emulation_SSE2.h.

00500 {
00501     __m128i mHi, mLo;
00502     __m128i zero = _mm_setzero_si128 ();
00503 
00504     mHi = _mm_unpacklo_epi8( zero, mask );
00505     mHi = _mm_srai_epi16   (  mHi,   15 );
00506     mHi = _mm_srli_epi16   (  mHi,    1 );
00507 
00508     mLo = _mm_unpackhi_epi8( zero, mask );
00509     mLo = _mm_srai_epi16   (  mLo,   15 );
00510     mLo = _mm_srli_epi16   (  mLo,    1 );
00511 
00512     mHi = _mm_packus_epi16  ( mHi,  mLo );
00513 
00514     b   = _mm_and_si128     (    b, mHi  );
00515     a   = _mm_andnot_si128  (  mHi,  a   );
00516     a   = _mm_or_si128      (    a,  b   );
00517     return a;
00518 }

SSP_FORCEINLINE __m128d ssp_blendv_pd_SSE2 ( __m128d  a,
__m128d  b,
__m128d  mask 
)

SSE2 implementation of _mm_blendv_pd [SSE4.1]. (Searches MSDN)

Definition at line 521 of file SSEPlus_emulation_SSE2.h.

00522 {
00523     ssp_m128 A, B, Mask;
00524     A.d = a;
00525     B.d = b;
00526     Mask.d = mask;
00527 
00528     Mask.i = _mm_shuffle_epi32( Mask.i, _MM_SHUFFLE(3, 3, 1, 1) );
00529     Mask.i = _mm_srai_epi32   ( Mask.i, 31                      );
00530 
00531     B.i = _mm_and_si128( B.i, Mask.i );
00532     A.i = _mm_andnot_si128( Mask.i, A.i );
00533     A.i = _mm_or_si128( A.i, B.i );
00534     return A.d;
00535 }

SSP_FORCEINLINE __m128 ssp_blendv_ps_SSE2 ( __m128  a,
__m128  b,
__m128  mask 
)

SSE2 implementation of _mm_blendv_ps [SSE4.1]. (Searches MSDN)

Definition at line 537 of file SSEPlus_emulation_SSE2.h.

00538 {
00539     ssp_m128 A, B, Mask;
00540     A.f = a;
00541     B.f = b;
00542     Mask.f = mask;
00543 
00544     Mask.i = _mm_srai_epi32( Mask.i, 31 );
00545     B.i = _mm_and_si128( B.i, Mask.i );
00546     A.i = _mm_andnot_si128( Mask.i, A.i );
00547     A.i = _mm_or_si128( A.i, B.i );
00548     return A.f;
00549 }

SSP_FORCEINLINE __m128d ssp_ceil_pd_SSE2 ( __m128d  a  ) 

SSE2 implementation of _mm_ceil_pd [SSE4.1]. (Searches MSDN)

Definition at line 1060 of file SSEPlus_emulation_SSE2.h.

01061 {
01062     return ssp_round_pd_SSE2( a, SSP_FROUND_TO_POS_INF );
01063 }

SSP_FORCEINLINE __m128 ssp_ceil_ps_SSE2 ( __m128  a  ) 

SSE2 implementation of _mm_ceil_ps [SSE4.1]. (Searches MSDN)

Definition at line 1039 of file SSEPlus_emulation_SSE2.h.

01040 {
01041     return ssp_round_ps_SSE2( a, SSP_FROUND_TO_POS_INF );
01042 }

SSP_FORCEINLINE __m128d ssp_ceil_sd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_ceil_sd [SSE4.1]. (Searches MSDN)

Definition at line 1074 of file SSEPlus_emulation_SSE2.h.

01075 {
01076         b = ssp_round_pd_SSE2(b, SSP_FROUND_TO_POS_INF );
01077 
01078     return _mm_shuffle_pd(b, a, _MM_SHUFFLE2(1,0));
01079 }

SSP_FORCEINLINE __m128 ssp_ceil_ss_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_ceil_ss [SSE4.1]. (Searches MSDN)

Definition at line 1090 of file SSEPlus_emulation_SSE2.h.

01091 {
01092         b = ssp_round_ps_SSE2(b, SSP_FROUND_TO_POS_INF );
01093         b = _mm_shuffle_ps(b, a, _MM_SHUFFLE(1,1,0,0));
01094     return _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,2,0));
01095 }

SSP_FORCEINLINE __m128i ssp_cmpeq_epi64_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_cmpeq_epi64 [SSE4.1]. (Searches MSDN)

Definition at line 557 of file SSEPlus_emulation_SSE2.h.

00558 {
00559     return ssp_comeq_epi64_SSE2( a, b );
00560 }

SSP_FORCEINLINE __m128i ssp_comeq_epi16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comeq_epi16/ pcomw [SSE5]. (SSE5 .pdf documentation here)

Definition at line 21 of file SSEPlus_emulation_comps_SSE2.h.

00022 {
00023     a = _mm_cmpeq_epi16( a, b );
00024     return a;   
00025 }

SSP_FORCEINLINE __m128i ssp_comeq_epi32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comeq_epi32/ pcomd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 28 of file SSEPlus_emulation_comps_SSE2.h.

00029 {
00030     a = _mm_cmpeq_epi32( a, b );
00031     return a;   
00032 }

SSP_FORCEINLINE __m128i ssp_comeq_epi64_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comeq_epi64/ pcomq [SSE5]. (SSE5 .pdf documentation here)

Each 64-bit lane of the result is all ones when the corresponding lanes of a and b are equal, and zero otherwise. Only SSE2 instructions are used: the 32-bit compare results are duplicated with _mm_shuffle_epi32 instead of the SSE3 movehdup/moveldup intrinsics.

Definition at line 35 of file SSEPlus_emulation_comps_SSE2.h.

00036 {
00037     __m128i eq, hi, lo;
00038     eq = _mm_cmpeq_epi32  ( a, b );                          // A0=B0,  A1=B1, A2=B2,  A3=B3
00039     hi = _mm_shuffle_epi32( eq, _MM_SHUFFLE(3, 3, 1, 1) );   // A1=B1,  A1=B1, A3=B3,  A3=B3
00040     lo = _mm_shuffle_epi32( eq, _MM_SHUFFLE(2, 2, 0, 0) );   // A0=B0,  A0=B0, A2=B2,  A2=B2
00041     return _mm_and_si128  ( lo, hi );                        // A0=B0 & A1=B1, A2=B2 & A3=B3
00042 }

SSP_FORCEINLINE __m128i ssp_comeq_epi8_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comeq_epi8/ pcomb [SSE5]. (SSE5 .pdf documentation here)

Definition at line 48 of file SSEPlus_emulation_comps_SSE2.h.

00049 {
00050     a = _mm_cmpeq_epi8( a, b );
00051     return a;
00052 }

SSP_FORCEINLINE __m128i ssp_comeq_epu16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comeq_epu16/ pcomuw [SSE5]. (SSE5 .pdf documentation here)

Definition at line 55 of file SSEPlus_emulation_comps_SSE2.h.

00056 {
00057     a = _mm_cmpeq_epi16( a, b );
00058     return a; 
00059 }

SSP_FORCEINLINE __m128i ssp_comeq_epu32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comeq_epu32/ pcomud [SSE5]. (SSE5 .pdf documentation here)

Definition at line 62 of file SSEPlus_emulation_comps_SSE2.h.

00063 {
00064     a = _mm_cmpeq_epi32( a, b );
00065     return a; 
00066 }

SSP_FORCEINLINE __m128i ssp_comeq_epu64_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comeq_epu64/ pcomuq [SSE5]. (SSE5 .pdf documentation here)

Definition at line 69 of file SSEPlus_emulation_comps_SSE2.h.

00070 {
00071     a = ssp_comeq_epi64_SSE2( a, b );  
00072     return a;
00073 }

SSP_FORCEINLINE __m128i ssp_comeq_epu8_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comeq_epu8/ pcomub [SSE5]. (SSE5 .pdf documentation here)

Definition at line 76 of file SSEPlus_emulation_comps_SSE2.h.

00077 {
00078     a = _mm_cmpeq_epi8( a, b );
00079     return a;
00080 }

SSP_FORCEINLINE __m128d ssp_comeq_pd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comeq_pd/ compd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 83 of file SSEPlus_emulation_comps_SSE2.h.

00084 {
00085     a = _mm_cmpeq_pd( a, b );
00086     return a;
00087 }

SSP_FORCEINLINE __m128 ssp_comeq_ps_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comeq_ps/ comps [SSE5]. (SSE5 .pdf documentation here)

Definition at line 90 of file SSEPlus_emulation_comps_SSE2.h.

00091 {
00092     a = _mm_cmpeq_ps( a, b );
00093     return a;
00094 }

SSP_FORCEINLINE __m128d ssp_comeq_sd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comeq_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 97 of file SSEPlus_emulation_comps_SSE2.h.

00098 {
00099     a = _mm_cmpeq_sd( a, b );
00100     return a;
00101 }

SSP_FORCEINLINE __m128 ssp_comeq_ss_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comeq_ss/ comss [SSE5]. (SSE5 .pdf documentation here)

Definition at line 104 of file SSEPlus_emulation_comps_SSE2.h.

00105 {
00106     a = _mm_cmpeq_ss( a, b );
00107     return a;
00108 }

SSP_FORCEINLINE __m128i ssp_comfalse_epi16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comfalse_epi16/ pcomw [SSE5]. (SSE5 .pdf documentation here)

Definition at line 655 of file SSEPlus_emulation_comps_SSE2.h.

00656 {
00657         return _mm_setzero_si128();
00658 }

SSP_FORCEINLINE __m128i ssp_comfalse_epi32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comfalse_epi32/ pcomd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 661 of file SSEPlus_emulation_comps_SSE2.h.

00662 {
00663         return _mm_setzero_si128();
00664 }

SSP_FORCEINLINE __m128i ssp_comfalse_epi64_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comfalse_epi64/ pcomq [SSE5]. (SSE5 .pdf documentation here)

Definition at line 667 of file SSEPlus_emulation_comps_SSE2.h.

00668 {
00669         return _mm_setzero_si128();
00670 }

SSP_FORCEINLINE __m128i ssp_comfalse_epi8_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comfalse_epi8/ pcomb [SSE5]. (SSE5 .pdf documentation here)

Definition at line 673 of file SSEPlus_emulation_comps_SSE2.h.

00674 {
00675         return _mm_setzero_si128();
00676 }

SSP_FORCEINLINE __m128i ssp_comfalse_epu16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comfalse_epu16/ pcomuw [SSE5]. (SSE5 .pdf documentation here)

Definition at line 679 of file SSEPlus_emulation_comps_SSE2.h.

00680 {
00681         return _mm_setzero_si128();
00682 }

SSP_FORCEINLINE __m128i ssp_comfalse_epu32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comfalse_epu32/ pcomud [SSE5]. (SSE5 .pdf documentation here)

Definition at line 685 of file SSEPlus_emulation_comps_SSE2.h.

00686 {
00687         return _mm_setzero_si128();
00688 }

SSP_FORCEINLINE __m128i ssp_comfalse_epu64_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comfalse_epu64/ pcomuq [SSE5]. (SSE5 .pdf documentation here)

Definition at line 691 of file SSEPlus_emulation_comps_SSE2.h.

00692 {
00693         return _mm_setzero_si128();
00694 }

SSP_FORCEINLINE __m128i ssp_comfalse_epu8_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comfalse_epu8/ pcomub [SSE5]. (SSE5 .pdf documentation here)

Definition at line 697 of file SSEPlus_emulation_comps_SSE2.h.

00698 {
00699         return _mm_setzero_si128();
00700 }

SSP_FORCEINLINE __m128d ssp_comfalse_pd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comfalse_pd/ compd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 703 of file SSEPlus_emulation_comps_SSE2.h.

00704 {
00705         return _mm_setzero_pd();
00706 }

SSP_FORCEINLINE __m128 ssp_comfalse_ps_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comfalse_ps/ comps [SSE5]. (SSE5 .pdf documentation here)

Definition at line 709 of file SSEPlus_emulation_comps_SSE2.h.

00710 {
00711         return _mm_setzero_ps();
00712 }

SSP_FORCEINLINE __m128d ssp_comfalse_sd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comfalse_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 715 of file SSEPlus_emulation_comps_SSE2.h.

00716 {
00717         ssp_m128 B;
00718         B.i = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
00719         return _mm_and_pd(a, B.d);
00720 }

SSP_FORCEINLINE __m128 ssp_comfalse_ss_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comfalse_ss/ comss [SSE5]. (SSE5 .pdf documentation here)

Definition at line 723 of file SSEPlus_emulation_comps_SSE2.h.

00724 {
00725         ssp_m128 B;
00726         B.i = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0);
00727         return _mm_and_ps(a, B.f);
00728 }

SSP_FORCEINLINE __m128i ssp_comge_epi16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comge_epi16/ pcomw [SSE5]. (SSE5 .pdf documentation here)

Definition at line 784 of file SSEPlus_emulation_comps_SSE2.h.

00785 {
00786     __m128i c;
00787     c = _mm_cmpgt_epi16( a, b );
00788     a = _mm_cmpeq_epi16( a, b );
00789     a = _mm_or_si128  ( a, c );
00790     return a;
00791 }

SSP_FORCEINLINE __m128i ssp_comge_epi32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comge_epi32/ pcomd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 794 of file SSEPlus_emulation_comps_SSE2.h.

00795 {
00796     __m128i c;
00797     c = _mm_cmpgt_epi32( a, b );
00798     a = _mm_cmpeq_epi32( a, b );
00799     a = _mm_or_si128   ( a, c );
00800     return a;
00801 }

SSP_FORCEINLINE __m128i ssp_comge_epi64_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comge_epi64/ pcomq [SSE5]. (SSE5 .pdf documentation here)

Definition at line 804 of file SSEPlus_emulation_comps_SSE2.h.

00805 {
00806     a = ssp_comge_epi64_REF( a, b );
00807     return a;
00808 }

SSP_FORCEINLINE __m128i ssp_comge_epi8_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comge_epi8/ pcomb [SSE5]. (SSE5 .pdf documentation here)

Definition at line 811 of file SSEPlus_emulation_comps_SSE2.h.

00812 {
00813     __m128i c;
00814     c = _mm_cmpgt_epi8( a, b );
00815     a = _mm_cmpeq_epi8( a, b );
00816     a = _mm_or_si128  ( a, c );
00817     return a;
00818 }

SSP_FORCEINLINE __m128i ssp_comge_epu16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comge_epu16/ pcomuw [SSE5]. (SSE5 .pdf documentation here)

Definition at line 822 of file SSEPlus_emulation_comps_SSE2.h.

00823 {
00824     __m128i mask;
00825     mask = ssp_comge_epi16_SSE2( a, b );         // FFFF where a >= b (signed)
00826     mask = ssp_logical_signinvert_16_SSE2( mask, a, b );   // NOTE(review): presumably inverts the result where the sign bits of a and b differ (signed -> unsigned compare) - confirm against the helper
00827     return mask;
00828 }

SSP_FORCEINLINE __m128i ssp_comge_epu32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comge_epu32/ pcomud [SSE5]. (SSE5 .pdf documentation here)

Definition at line 831 of file SSEPlus_emulation_comps_SSE2.h.

00832 {
00833     __m128i mask;
00834     mask = ssp_comge_epi32_SSE2( a, b );         // FFFFFFFF where a >= b (signed)
00835     mask = ssp_logical_signinvert_32_SSE2( mask, a, b );   // NOTE(review): presumably inverts the result where the sign bits of a and b differ (signed -> unsigned compare) - confirm against the helper
00836     return mask;
00837 }

SSP_FORCEINLINE __m128i ssp_comge_epu64_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comge_epu64/ pcomuq [SSE5]. (SSE5 .pdf documentation here)

Definition at line 840 of file SSEPlus_emulation_comps_SSE2.h.

00841 {
00842     a = ssp_comge_epu64_REF( a, b );
00843     return a;
00844 }

SSP_FORCEINLINE __m128i ssp_comge_epu8_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comge_epu8/ pcomub [SSE5]. (SSE5 .pdf documentation here)

Definition at line 847 of file SSEPlus_emulation_comps_SSE2.h.

00848 {
00849     a = ssp_comge_epu8_REF( a, b );
00850     return a;
00851 }

SSP_FORCEINLINE __m128d ssp_comge_pd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comge_pd/ compd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 854 of file SSEPlus_emulation_comps_SSE2.h.

00855 {
00856     a = _mm_cmpge_pd( a, b );
00857     return a;    
00858 }

SSP_FORCEINLINE __m128 ssp_comge_ps_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comge_ps/ comps [SSE5]. (SSE5 .pdf documentation here)

Definition at line 861 of file SSEPlus_emulation_comps_SSE2.h.

00862 {
00863     a = _mm_cmpge_ps( a, b );
00864     return a;   
00865 }

SSP_FORCEINLINE __m128d ssp_comge_sd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comge_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 868 of file SSEPlus_emulation_comps_SSE2.h.

00869 {
00870     a = _mm_cmpge_sd( a, b );
00871     return a;   
00872 }

SSP_FORCEINLINE __m128 ssp_comge_ss_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comge_ss/ comss [SSE5]. (SSE5 .pdf documentation here)

Definition at line 875 of file SSEPlus_emulation_comps_SSE2.h.

00876 {
00877     a = _mm_cmpge_ss( a, b );
00878     return a;   
00879 }

SSP_FORCEINLINE __m128i ssp_comgt_epi16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comgt_epi16/ pcomw [SSE5]. (SSE5 .pdf documentation here)

Definition at line 887 of file SSEPlus_emulation_comps_SSE2.h.

00888 {
00889     a = _mm_cmpgt_epi16( a, b );
00890     return a;  
00891 }

SSP_FORCEINLINE __m128i ssp_comgt_epi32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comgt_epi32/ pcomd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 894 of file SSEPlus_emulation_comps_SSE2.h.

00895 {
00896     a = _mm_cmpgt_epi32( a, b );
00897     return a;  
00898 }

SSP_FORCEINLINE __m128i ssp_comgt_epi64_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comgt_epi64/ pcomq [SSE5]. (SSE5 .pdf documentation here)

Definition at line 901 of file SSEPlus_emulation_comps_SSE2.h.

00902 {
00903     a = ssp_comgt_epi64_REF( a, b );
00904     return a;
00905 }

SSP_FORCEINLINE __m128i ssp_comgt_epi8_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comgt_epi8/ pcomb [SSE5]. (SSE5 .pdf documentation here)

Definition at line 908 of file SSEPlus_emulation_comps_SSE2.h.

00909 {
00910      a = _mm_cmpgt_epi8( a, b );
00911     return a;  
00912 }

SSP_FORCEINLINE __m128i ssp_comgt_epu16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comgt_epu16/ pcomuw [SSE5]. (SSE5 .pdf documentation here)

Definition at line 915 of file SSEPlus_emulation_comps_SSE2.h.

00916 {
00917     __m128i signMask, mask;
00918 
00919     mask     = _mm_cmpgt_epi16( a, b );              // FFFF where a > b (signed)
00920     signMask = _mm_xor_si128  ( a, b );              // Signbit is 1 where signs differ 
00921     signMask = _mm_srai_epi16 ( signMask, 15 );      // fill all fields with sign bit     
00922     mask     = _mm_xor_si128  ( mask, signMask );    // Invert output where signs differed
00923     return mask;
00924 }

SSP_FORCEINLINE __m128i ssp_comgt_epu32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comgt_epu32/ pcomud [SSE5]. (SSE5 .pdf documentation here)

Definition at line 927 of file SSEPlus_emulation_comps_SSE2.h.

00928 {
00929     __m128i signMask, mask;
00930 
00931     mask     = _mm_cmpgt_epi32( a, b );              // FFFFFFFF where a > b (signed)
00932     signMask = _mm_xor_si128  ( a, b );              // Signbit is 1 where signs differ 
00933     signMask = _mm_srai_epi32 ( signMask, 31 );      // fill all fields with sign bit     
00934     mask     = _mm_xor_si128  ( mask, signMask );    // Invert output where signs differed (signed result is wrong exactly there for unsigned operands)
00935     return mask;
00936 }

SSP_FORCEINLINE __m128i ssp_comgt_epu64_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comgt_epu64/ pcomuq [SSE5]. (SSE5 .pdf documentation here)

Definition at line 939 of file SSEPlus_emulation_comps_SSE2.h.

00940 {
00941     a = ssp_comgt_epu64_REF( a, b );
00942     return a;
00943 }

SSP_FORCEINLINE __m128i ssp_comgt_epu8_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comgt_epu8/ pcomub [SSE5]. (SSE5 .pdf documentation here)

Definition at line 946 of file SSEPlus_emulation_comps_SSE2.h.

00947 {
00948     a = ssp_comgt_epu8_REF( a, b );
00949     return a;
00950 }

SSP_FORCEINLINE __m128d ssp_comgt_pd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comgt_pd/ compd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 953 of file SSEPlus_emulation_comps_SSE2.h.

00954 {
00955     a = _mm_cmpgt_pd( a, b );
00956     return a;
00957 }

SSP_FORCEINLINE __m128 ssp_comgt_ps_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comgt_ps/ comps [SSE5]. (SSE5 .pdf documentation here)

Definition at line 960 of file SSEPlus_emulation_comps_SSE2.h.

00961 {
00962     a = _mm_cmpgt_ps( a, b );
00963     return a;
00964 }

SSP_FORCEINLINE __m128d ssp_comgt_sd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comgt_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 967 of file SSEPlus_emulation_comps_SSE2.h.

00968 {
00969     a = _mm_cmpgt_sd( a, b );
00970     return a;
00971 }

SSP_FORCEINLINE __m128 ssp_comgt_ss_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comgt_ss/ comss [SSE5]. (SSE5 .pdf documentation here)

Definition at line 974 of file SSEPlus_emulation_comps_SSE2.h.

00975 {
00976     a = _mm_cmpgt_ss( a, b );
00977     return a;
00978 }

SSP_FORCEINLINE __m128i ssp_comle_epi16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comle_epi16/ pcomw [SSE5]. (SSE5 .pdf documentation here)

Definition at line 213 of file SSEPlus_emulation_comps_SSE2.h.

00214 {
00215     __m128i c;    
00216     c = _mm_cmplt_epi16( a, b );
00217     a = _mm_cmpeq_epi16( a, b );
00218     a = _mm_or_si128   ( a, c );
00219     return a;
00220 }

SSP_FORCEINLINE __m128i ssp_comle_epi32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comle_epi32/ pcomd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 223 of file SSEPlus_emulation_comps_SSE2.h.

00224 {
00225     __m128i c;    
00226     c = _mm_cmplt_epi32( a, b );
00227     a = _mm_cmpeq_epi32( a, b );
00228     a = _mm_or_si128   ( a, c );
00229     return a;
00230 }

SSP_FORCEINLINE __m128i ssp_comle_epi64_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comle_epi64/ pcomq [SSE5]. (SSE5 .pdf documentation here)

Definition at line 233 of file SSEPlus_emulation_comps_SSE2.h.

00234 {
00235     a = ssp_comle_epi64_REF( a, b );
00236     return a;
00237 }

SSP_FORCEINLINE __m128i ssp_comle_epi8_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comle_epi8/ pcomb [SSE5]. (SSE5 .pdf documentation here)

Definition at line 240 of file SSEPlus_emulation_comps_SSE2.h.

00241 {
00242     __m128i c;    
00243     c = _mm_cmplt_epi8( a, b );
00244     a = _mm_cmpeq_epi8( a, b );
00245     a = _mm_or_si128  ( a, c );
00246     return a;
00247 }

SSP_FORCEINLINE __m128i ssp_comle_epu16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comle_epu16/ pcomuw [SSE5]. (SSE5 .pdf documentation here)

Definition at line 250 of file SSEPlus_emulation_comps_SSE2.h.

00251 {
00252     a = ssp_comle_epu16_REF( a, b );
00253     return a;
00254 }

SSP_FORCEINLINE __m128i ssp_comle_epu32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comle_epu32/ pcomud [SSE5]. (SSE5 .pdf documentation here)

Definition at line 257 of file SSEPlus_emulation_comps_SSE2.h.

00258 {
00259     a = ssp_comle_epu32_REF( a, b );
00260     return a;
00261 }

SSP_FORCEINLINE __m128i ssp_comle_epu64_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comle_epu64/ pcomuq [SSE5]. (SSE5 .pdf documentation here)

Definition at line 264 of file SSEPlus_emulation_comps_SSE2.h.

00265 {
00266     a = ssp_comle_epu64_REF( a, b );
00267     return a;
00268 }

SSP_FORCEINLINE __m128i ssp_comle_epu8_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comle_epu8/ pcomub [SSE5]. (SSE5 .pdf documentation here)

Definition at line 271 of file SSEPlus_emulation_comps_SSE2.h.

00272 {
00273     a = ssp_comle_epu8_REF( a, b );
00274     return a;
00275 }

SSP_FORCEINLINE __m128d ssp_comle_pd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comle_pd/ compd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 278 of file SSEPlus_emulation_comps_SSE2.h.

00279 {
00280    a = _mm_cmple_pd( a, b );
00281    return a;
00282 }

SSP_FORCEINLINE __m128 ssp_comle_ps_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comle_ps/ comps [SSE5]. (SSE5 .pdf documentation here)

Definition at line 285 of file SSEPlus_emulation_comps_SSE2.h.

00286 {
00287    a = _mm_cmple_ps( a, b );
00288    return a;
00289 }

SSP_FORCEINLINE __m128d ssp_comle_sd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comle_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 292 of file SSEPlus_emulation_comps_SSE2.h.

00293 {
00294    a = _mm_cmple_sd( a, b );
00295    return a;
00296 }

SSP_FORCEINLINE __m128 ssp_comle_ss_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comle_ss/ comss [SSE5]. (SSE5 .pdf documentation here)

Definition at line 299 of file SSEPlus_emulation_comps_SSE2.h.

00300 {
00301    a = _mm_cmple_ss( a, b );
00302    return a;
00303 }

SSP_FORCEINLINE __m128i ssp_comlt_epi16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comlt_epi16/ pcomw [SSE5]. (SSE5 .pdf documentation here)

Definition at line 115 of file SSEPlus_emulation_comps_SSE2.h.

00116 {
00117     a = _mm_cmplt_epi16( a, b );
00118     return a;
00119 }

SSP_FORCEINLINE __m128i ssp_comlt_epi32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comlt_epi32/ pcomd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 122 of file SSEPlus_emulation_comps_SSE2.h.

00123 {
00124     a = _mm_cmplt_epi32( a, b );
00125     return a;
00126 }

SSP_FORCEINLINE __m128i ssp_comlt_epi64_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comlt_epi64/ pcomq [SSE5]. (SSE5 .pdf documentation here)

Definition at line 129 of file SSEPlus_emulation_comps_SSE2.h.

00130 {
00131     a = ssp_comlt_epi64_REF( a, b );
00132     return a;
00133 }

SSP_FORCEINLINE __m128i ssp_comlt_epi8_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comlt_epi8/ pcomb [SSE5]. (SSE5 .pdf documentation here)

Definition at line 136 of file SSEPlus_emulation_comps_SSE2.h.

00137 {
00138     a = _mm_cmplt_epi8( a, b );
00139     return a;
00140 }

SSP_FORCEINLINE __m128i ssp_comlt_epu16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comlt_epu16/ pcomuw [SSE5]. (SSE5 .pdf documentation here)

Definition at line 143 of file SSEPlus_emulation_comps_SSE2.h.

00144 {
00145     __m128i signMask, mask;
00146 
00147     mask     = _mm_cmplt_epi16( a, b );              // FFFF where a < b (signed)
00148     signMask = _mm_xor_si128  ( a, b );              // Signbit is 1 where signs differ 
00149     signMask = _mm_srai_epi16 ( signMask, 15 );      // fill all fields with sign bit     
00150     mask     = _mm_xor_si128  ( mask, signMask );    // Invert output where signs differed
00151     return mask;
00152 }

SSP_FORCEINLINE __m128i ssp_comlt_epu32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comlt_epu32/ pcomud [SSE5]. (SSE5 .pdf documentation here)

Definition at line 155 of file SSEPlus_emulation_comps_SSE2.h.

00156 {
00157     __m128i signMask, mask;
00158 
00159     mask     = _mm_cmplt_epi32( a, b );              // FFFF where a < b (signed)
00160     signMask = _mm_xor_si128  ( a, b );              // Signbit is 1 where signs differ 
00161     signMask = _mm_srai_epi32 ( signMask, 31 );      // fill all fields with sign bit     
00162     mask     = _mm_xor_si128  ( mask, signMask );    // Invert output where signs differed
00163     return mask;
00164 }

SSP_FORCEINLINE __m128i ssp_comlt_epu64_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comlt_epu64/ pcomuq [SSE5]. (SSE5 .pdf documentation here)

Definition at line 167 of file SSEPlus_emulation_comps_SSE2.h.

00168 {
00169     a = ssp_comlt_epu64_REF( a, b );
00170     return a;
00171 }

SSP_FORCEINLINE __m128i ssp_comlt_epu8_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comlt_epu8/ pcomub [SSE5]. (SSE5 .pdf documentation here)

Definition at line 174 of file SSEPlus_emulation_comps_SSE2.h.

00175 {
00176     a = ssp_comlt_epu8_REF( a, b );
00177     return a;
00178 }

SSP_FORCEINLINE __m128d ssp_comlt_pd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comlt_pd/ compd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 181 of file SSEPlus_emulation_comps_SSE2.h.

00182 {
00183     a = _mm_cmplt_pd( a, b );
00184     return a;
00185 }

SSP_FORCEINLINE __m128 ssp_comlt_ps_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comlt_ps/ comps [SSE5]. (SSE5 .pdf documentation here)

Definition at line 188 of file SSEPlus_emulation_comps_SSE2.h.

00189 {
00190     a = _mm_cmplt_ps( a, b );
00191     return a;
00192 }

SSP_FORCEINLINE __m128d ssp_comlt_sd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comlt_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 195 of file SSEPlus_emulation_comps_SSE2.h.

00196 {
00197     a = _mm_cmplt_sd( a, b );
00198     return a;
00199 }

SSP_FORCEINLINE __m128 ssp_comlt_ss_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comlt_ss/ comss [SSE5]. (SSE5 .pdf documentation here)

Definition at line 202 of file SSEPlus_emulation_comps_SSE2.h.

00203 {
00204     a = _mm_cmplt_ss( a, b );
00205     return a;
00206 }

SSP_FORCEINLINE __m128i ssp_comneq_epi16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comneq_epi16/ pcomw [SSE5]. (SSE5 .pdf documentation here)

Definition at line 347 of file SSEPlus_emulation_comps_SSE2.h.

00348 {
00349     a = ssp_comeq_epi16_SSE2( a, b );
00350     a = ssp_logical_invert_si128_SSE2( a );
00351     return a;   
00352 }

SSP_FORCEINLINE __m128i ssp_comneq_epi32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comneq_epi32/ pcomd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 355 of file SSEPlus_emulation_comps_SSE2.h.

00356 {
00357     a = ssp_comeq_epi32_SSE2( a, b );
00358     a = ssp_logical_invert_si128_SSE2( a );
00359     return a;   
00360 }

SSP_FORCEINLINE __m128i ssp_comneq_epi64_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comneq_epi64/ pcomq [SSE5]. (SSE5 .pdf documentation here)

Definition at line 363 of file SSEPlus_emulation_comps_SSE2.h.

00364 {
00365     a = ssp_comeq_epi64_SSE2( a, b );
00366     a = ssp_logical_invert_si128_SSE2( a );
00367     return a;   
00368 }

SSP_FORCEINLINE __m128i ssp_comneq_epi8_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comneq_epi8/ pcomb [SSE5]. (SSE5 .pdf documentation here)

Definition at line 371 of file SSEPlus_emulation_comps_SSE2.h.

00372 {
00373     a = ssp_comeq_epi8_SSE2( a, b );
00374     a = ssp_logical_invert_si128_SSE2( a );
00375     return a;   
00376 }

SSP_FORCEINLINE __m128i ssp_comneq_epu16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comneq_epu16/ pcomuw [SSE5]. (SSE5 .pdf documentation here)

Definition at line 379 of file SSEPlus_emulation_comps_SSE2.h.

00380 {
00381     a = ssp_comeq_epu16_SSE2( a, b );
00382     a = ssp_logical_invert_si128_SSE2( a );
00383     return a;   
00384 }

SSP_FORCEINLINE __m128i ssp_comneq_epu32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comneq_epu32/ pcomud [SSE5]. (SSE5 .pdf documentation here)

Definition at line 387 of file SSEPlus_emulation_comps_SSE2.h.

00388 {
00389     a = ssp_comeq_epu32_SSE2( a, b );
00390     a = ssp_logical_invert_si128_SSE2( a );
00391     return a;   
00392 }

SSP_FORCEINLINE __m128i ssp_comneq_epu64_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comneq_epu64/ pcomuq [SSE5]. (SSE5 .pdf documentation here)

Definition at line 395 of file SSEPlus_emulation_comps_SSE2.h.

00396 {
00397     a = ssp_comeq_epu64_SSE2( a, b );
00398     a = ssp_logical_invert_si128_SSE2( a );
00399     return a;   
00400 }

SSP_FORCEINLINE __m128i ssp_comneq_epu8_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comneq_epu8/ pcomub [SSE5]. (SSE5 .pdf documentation here)

Definition at line 403 of file SSEPlus_emulation_comps_SSE2.h.

00404 {
00405     a = ssp_comeq_epu8_SSE2( a, b );
00406     a = ssp_logical_invert_si128_SSE2( a );
00407     return a;   
00408 }

SSP_FORCEINLINE __m128d ssp_comneq_pd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comneq_pd/ compd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 411 of file SSEPlus_emulation_comps_SSE2.h.

00412 {
00413     a = _mm_cmpneq_pd( a, b );
00414     return a;
00415 }

SSP_FORCEINLINE __m128 ssp_comneq_ps_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comneq_ps/ comps [SSE5]. (SSE5 .pdf documentation here)

Definition at line 418 of file SSEPlus_emulation_comps_SSE2.h.

00419 {
00420     a = _mm_cmpneq_ps( a, b );
00421     return a;
00422 }

SSP_FORCEINLINE __m128d ssp_comneq_sd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comneq_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 425 of file SSEPlus_emulation_comps_SSE2.h.

00426 {
00427     a = _mm_cmpneq_sd( a, b );
00428     return a;
00429 }

SSP_FORCEINLINE __m128 ssp_comneq_ss_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comneq_ss/ comss [SSE5]. (SSE5 .pdf documentation here)

Definition at line 432 of file SSEPlus_emulation_comps_SSE2.h.

00433 {
00434     a = _mm_cmpneq_ss( a, b );
00435     return a;
00436 }

SSP_FORCEINLINE __m128d ssp_comnge_pd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comnge_pd/ compd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 589 of file SSEPlus_emulation_comps_SSE2.h.

00590 {
00591     a = _mm_cmpnge_pd( a, b );
00592     return a;
00593 }

SSP_FORCEINLINE __m128 ssp_comnge_ps_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comnge_ps/ comps [SSE5]. (SSE5 .pdf documentation here)

Definition at line 596 of file SSEPlus_emulation_comps_SSE2.h.

00597 {
00598     a = _mm_cmpnge_ps( a, b );
00599     return a;
00600 }

SSP_FORCEINLINE __m128d ssp_comnge_sd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comnge_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 603 of file SSEPlus_emulation_comps_SSE2.h.

00604 {
00605     a = _mm_cmpnge_sd( a, b );
00606     return a;
00607 }

SSP_FORCEINLINE __m128 ssp_comnge_ss_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comnge_ss/ comss [SSE5]. (SSE5 .pdf documentation here)

Definition at line 610 of file SSEPlus_emulation_comps_SSE2.h.

00611 {
00612     a = _mm_cmpnge_ss( a, b );
00613     return a;
00614 }

SSP_FORCEINLINE __m128d ssp_comngt_pd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comngt_pd/ compd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 622 of file SSEPlus_emulation_comps_SSE2.h.

00623 {
00624     a = _mm_cmpngt_pd( a, b );
00625     return a;
00626 }

SSP_FORCEINLINE __m128 ssp_comngt_ps_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comngt_ps/ comps [SSE5]. (SSE5 .pdf documentation here)

Definition at line 629 of file SSEPlus_emulation_comps_SSE2.h.

00630 {
00631     a = _mm_cmpngt_ps( a, b );
00632     return a;
00633 }

SSP_FORCEINLINE __m128d ssp_comngt_sd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comngt_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 636 of file SSEPlus_emulation_comps_SSE2.h.

00637 {
00638     a = _mm_cmpngt_sd( a, b );
00639     return a;
00640 }

SSP_FORCEINLINE __m128 ssp_comngt_ss_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comngt_ss/ comss [SSE5]. (SSE5 .pdf documentation here)

Definition at line 643 of file SSEPlus_emulation_comps_SSE2.h.

00644 {
00645     a = _mm_cmpngt_ss( a, b );
00646     return a;
00647 }

SSP_FORCEINLINE __m128d ssp_comnle_pd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comnle_pd/ compd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 476 of file SSEPlus_emulation_comps_SSE2.h.

00477 {    
00478     a = _mm_cmpnle_pd( a, b );
00479     return a;
00480 }

SSP_FORCEINLINE __m128 ssp_comnle_ps_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comnle_ps/ comps [SSE5]. (SSE5 .pdf documentation here)

Definition at line 483 of file SSEPlus_emulation_comps_SSE2.h.

00484 {
00485     a = _mm_cmpnle_ps( a, b );
00486     return a;
00487 }

SSP_FORCEINLINE __m128d ssp_comnle_sd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comnle_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 490 of file SSEPlus_emulation_comps_SSE2.h.

00491 {
00492     a = _mm_cmpnle_sd( a, b );
00493     return a;
00494 }

SSP_FORCEINLINE __m128 ssp_comnle_ss_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comnle_ss/ comss [SSE5]. (SSE5 .pdf documentation here)

Definition at line 497 of file SSEPlus_emulation_comps_SSE2.h.

00498 {
00499     a = _mm_cmpnle_ss( a, b );
00500     return a;
00501 }

SSP_FORCEINLINE __m128d ssp_comnlt_pd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comnlt_pd/ compd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 443 of file SSEPlus_emulation_comps_SSE2.h.

00444 {
00445     a = _mm_cmpnlt_pd( a, b );    
00446     return a;
00447 }

SSP_FORCEINLINE __m128 ssp_comnlt_ps_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comnlt_ps/ comps [SSE5]. (SSE5 .pdf documentation here)

Definition at line 450 of file SSEPlus_emulation_comps_SSE2.h.

00451 {
00452     a = _mm_cmpnlt_ps( a, b );    
00453     return a;
00454 }

SSP_FORCEINLINE __m128d ssp_comnlt_sd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comnlt_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 457 of file SSEPlus_emulation_comps_SSE2.h.

00458 {
00459     a = _mm_cmpnlt_sd( a, b );    
00460     return a;
00461 }

SSP_FORCEINLINE __m128 ssp_comnlt_ss_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comnlt_ss/ comss [SSE5]. (SSE5 .pdf documentation here)

Definition at line 464 of file SSEPlus_emulation_comps_SSE2.h.

00465 {
00466     a = _mm_cmpnlt_ss( a, b );    
00467     return a;
00468 }

SSP_FORCEINLINE __m128d ssp_comoneq_pd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comoneq_pd/ compd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 736 of file SSEPlus_emulation_comps_SSE2.h.

00737 {
00738     __m128d c;
00739     c = _mm_cmpord_pd( a, b );
00740     a = _mm_cmpneq_pd( a, b );
00741     a = _mm_and_pd   ( a, c );
00742     return a;
00743 }

SSP_FORCEINLINE __m128 ssp_comoneq_ps_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comoneq_ps/ comps [SSE5]. (SSE5 .pdf documentation here)

Definition at line 746 of file SSEPlus_emulation_comps_SSE2.h.

00747 {
00748     __m128 c;
00749     c = _mm_cmpord_ps( a, b );
00750     a = _mm_cmpneq_ps( a, b );
00751     a = _mm_and_ps   ( a, c );
00752     return a;
00753 }

SSP_FORCEINLINE __m128d ssp_comoneq_sd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comoneq_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 757 of file SSEPlus_emulation_comps_SSE2.h.

00758 {
00759     __m128d c;
00760     c = _mm_cmpord_pd( a, b );
00761     b = _mm_cmpneq_pd( a, b );
00762     b = _mm_and_pd   ( b, c );
00763     a = _mm_move_sd  ( a, b );
00764     return a;   
00765 }

SSP_FORCEINLINE __m128 ssp_comoneq_ss_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comoneq_ss/ comss [SSE5]. (SSE5 .pdf documentation here)

Definition at line 768 of file SSEPlus_emulation_comps_SSE2.h.

00769 {
00770     __m128 c;
00771     c = _mm_cmpord_ps( a, b );
00772     b = _mm_cmpneq_ps( a, b );
00773     b = _mm_and_ps   ( b, c );
00774     a = _mm_move_ss  ( a, b );
00775     return a; 
00776 }

SSP_FORCEINLINE __m128d ssp_comord_pd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comord_pd/ compd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 509 of file SSEPlus_emulation_comps_SSE2.h.

00510 {
00511     a = _mm_cmpord_pd( a, b );
00512     return a;
00513 }

SSP_FORCEINLINE __m128 ssp_comord_ps_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comord_ps/ comps [SSE5]. (SSE5 .pdf documentation here)

Definition at line 516 of file SSEPlus_emulation_comps_SSE2.h.

00517 {
00518     a = _mm_cmpord_ps( a, b );
00519     return a;
00520 }

SSP_FORCEINLINE __m128d ssp_comord_sd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comord_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 523 of file SSEPlus_emulation_comps_SSE2.h.

00524 {
00525     a = _mm_cmpord_sd( a, b );
00526     return a;
00527 }

SSP_FORCEINLINE __m128 ssp_comord_ss_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comord_ss/ comss [SSE5]. (SSE5 .pdf documentation here)

Definition at line 530 of file SSEPlus_emulation_comps_SSE2.h.

00531 {
00532     a = _mm_cmpord_ss( a, b );
00533     return a;
00534 }

SSP_FORCEINLINE __m128i ssp_comtrue_epi16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comtrue_epi16/ pcomw [SSE5]. (SSE5 .pdf documentation here)

Definition at line 986 of file SSEPlus_emulation_comps_SSE2.h.

00987 {
00988         return _mm_set1_epi32(0xFFFFFFFF);
00989 }

SSP_FORCEINLINE __m128i ssp_comtrue_epi32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comtrue_epi32/ pcomd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 992 of file SSEPlus_emulation_comps_SSE2.h.

00993 {
00994         return _mm_set1_epi32(0xFFFFFFFF);
00995 }

SSP_FORCEINLINE __m128i ssp_comtrue_epi64_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comtrue_epi64/ pcomq [SSE5]. (SSE5 .pdf documentation here)

Definition at line 998 of file SSEPlus_emulation_comps_SSE2.h.

00999 {
01000         return _mm_set1_epi32(0xFFFFFFFF);
01001 }

SSP_FORCEINLINE __m128i ssp_comtrue_epi8_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comtrue_epi8/ pcomb [SSE5]. (SSE5 .pdf documentation here)

Definition at line 1004 of file SSEPlus_emulation_comps_SSE2.h.

01005 {
01006         return _mm_set1_epi32(0xFFFFFFFF);
01007 }

SSP_FORCEINLINE __m128i ssp_comtrue_epu16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comtrue_epu16/ pcomuw [SSE5]. (SSE5 .pdf documentation here)

Definition at line 1010 of file SSEPlus_emulation_comps_SSE2.h.

01011 {
01012         return _mm_set1_epi32(0xFFFFFFFF);
01013 }

SSP_FORCEINLINE __m128i ssp_comtrue_epu32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comtrue_epu32/ pcomud [SSE5]. (SSE5 .pdf documentation here)

Definition at line 1016 of file SSEPlus_emulation_comps_SSE2.h.

01017 {
01018         return _mm_set1_epi32(0xFFFFFFFF);
01019 }

SSP_FORCEINLINE __m128i ssp_comtrue_epu64_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comtrue_epu64/ pcomuq [SSE5]. (SSE5 .pdf documentation here)

Definition at line 1022 of file SSEPlus_emulation_comps_SSE2.h.

01023 {
01024         return _mm_set1_epi32(0xFFFFFFFF);
01025 }

SSP_FORCEINLINE __m128i ssp_comtrue_epu8_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_comtrue_epu8/ pcomub [SSE5]. (SSE5 .pdf documentation here)

Definition at line 1028 of file SSEPlus_emulation_comps_SSE2.h.

01029 {
01030         return _mm_set1_epi32(0xFFFFFFFF);
01031 }

SSP_FORCEINLINE __m128d ssp_comtrue_pd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comtrue_pd/ compd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 1034 of file SSEPlus_emulation_comps_SSE2.h.

01035 {
01036         ssp_m128 B;
01037         B.i = _mm_set1_epi32(0xFFFFFFFF);
01038         return B.d;
01039 }

SSP_FORCEINLINE __m128 ssp_comtrue_ps_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comtrue_ps/ comps [SSE5]. (SSE5 .pdf documentation here)

Definition at line 1042 of file SSEPlus_emulation_comps_SSE2.h.

01043 {
01044         ssp_m128 B;
01045         B.i = _mm_set1_epi32(0xFFFFFFFF);
01046         return B.f;
01047 }

SSP_FORCEINLINE __m128d ssp_comtrue_sd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comtrue_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 1050 of file SSEPlus_emulation_comps_SSE2.h.

01051 {
01052         ssp_m128 B;
01053         B.i = _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
01054         return _mm_or_pd(a, B.d);
01055 }

SSP_FORCEINLINE __m128 ssp_comtrue_ss_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comtrue_ss/ comss [SSE5]. (SSE5 .pdf documentation here)

Definition at line 1058 of file SSEPlus_emulation_comps_SSE2.h.

01059 {
01060         ssp_m128 B;
01061         B.i = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF);
01062         return _mm_or_ps(a, B.f);
01063 }

SSP_FORCEINLINE __m128d ssp_comueq_pd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comueq_pd/ compd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 542 of file SSEPlus_emulation_comps_SSE2.h.

00543 {
00544     __m128d c;
00545     c = _mm_cmpunord_pd( a, b );
00546     a = _mm_cmpeq_pd   ( a, b );
00547     a = _mm_or_pd      ( a, c );
00548     return a;   
00549 }

SSP_FORCEINLINE __m128 ssp_comueq_ps_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comueq_ps/ comps [SSE5]. (SSE5 .pdf documentation here)

Definition at line 552 of file SSEPlus_emulation_comps_SSE2.h.

00553 {
00554     __m128 c;
00555     c = _mm_cmpunord_ps( a, b );
00556     a = _mm_cmpeq_ps   ( a, b );
00557     a = _mm_or_ps      ( a, c );
00558     return a;   
00559 }

SSP_FORCEINLINE __m128d ssp_comueq_sd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comueq_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 562 of file SSEPlus_emulation_comps_SSE2.h.

00563 {
00564     __m128d c;
00565     c = _mm_cmpunord_sd( a, b );
00566     b = _mm_cmpeq_sd   ( a, b );
00567     b = _mm_or_pd      ( b, c );
00568     a = _mm_move_sd    ( a, b );
00569     return a;   
00570 }

SSP_FORCEINLINE __m128 ssp_comueq_ss_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comueq_ss/ comss [SSE5]. (SSE5 .pdf documentation here)

Definition at line 573 of file SSEPlus_emulation_comps_SSE2.h.

00574 {
00575     __m128 c;
00576     c = _mm_cmpunord_ss( a, b );    // lane 0: all ones when either low operand is NaN
00577     b = _mm_cmpeq_ss   ( a, b );    // lane 0: all ones when the low operands compare equal
00578     b = _mm_or_ps      ( b, c );    // "unordered OR equal"; must combine b (not the raw input a) with c
00579     a = _mm_move_ss    ( a, b );    // lane 0 from the predicate, lanes 1-3 passed through from a
00580     return a;   
00581 }

SSP_FORCEINLINE __m128d ssp_comunord_pd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comunord_pd/ compd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 310 of file SSEPlus_emulation_comps_SSE2.h.

00311 {
00312     // Use the native unordered compare: OR-ing raw bit patterns fabricates a NaN from ordered inputs (2.0 | 1.5).
00313     a = _mm_cmpunord_pd( a, b );    // all ones in each lane where a or b is NaN, zero otherwise
00314     return a;   
00315 }

SSP_FORCEINLINE __m128 ssp_comunord_ps_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comunord_ps/ comps [SSE5]. (SSE5 .pdf documentation here)

Definition at line 318 of file SSEPlus_emulation_comps_SSE2.h.

00319 {
00320     // Use the native unordered compare: OR-ing raw bit patterns fabricates a NaN from ordered inputs (2.0f | 1.5f).
00321     a = _mm_cmpunord_ps( a, b );    // all ones in each lane where a or b is NaN, zero otherwise
00322     return a;      
00323 }

SSP_FORCEINLINE __m128d ssp_comunord_sd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_comunord_sd/ comsd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 326 of file SSEPlus_emulation_comps_SSE2.h.

00327 {
00328     // Test the low lanes for NaN directly: comparing a against (a | b) also fired for ordered pairs such as 1.0 and 2.0.
00329     a = _mm_cmpunord_sd( a, b );    // lane 0: all ones if a or b is NaN; lane 1 passed through from a
00330     return a; 
00331 }

SSP_FORCEINLINE __m128 ssp_comunord_ss_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_comunord_ss/ comss [SSE5]. (SSE5 .pdf documentation here)

Definition at line 334 of file SSEPlus_emulation_comps_SSE2.h.

00335 {
00336     // Test the low lanes for NaN directly: comparing a against (a | b) also fired for ordered pairs such as 1.0f and 2.0f.
00337     a = _mm_cmpunord_ss( a, b );    // lane 0: all ones if a or b is NaN; lanes 1-3 passed through from a
00338     return a; 
00339 }

SSP_FORCEINLINE __m128i ssp_cvtepi16_epi32_SSE2 ( __m128i  a  ) 

SSE2 implementation of _mm_cvtepi16_epi32 [SSE4.1]. (Searches MSDN)

Definition at line 1178 of file SSEPlus_emulation_SSE2.h.

01179 {
01180         __m128i b = _mm_set1_epi32 (-1);                                //0xFFFFFFFF
01181         __m128i c = _mm_unpacklo_epi16(a, b);                   //FFFFa0**FFFFa1**.... (each dword: upper half pre-filled with ones)
01182         __m128i d = _mm_set1_epi32 (0x8000);                    //0x8000 = sign bit of the 16-bit source value
01183 
01184         b = _mm_andnot_si128(c, d);                                             // ~c & d: 0x8000 when the sign bit is clear, 0x0000 for negative
01185         d = _mm_slli_epi32(b, 1);                                               // 0x10000 when the sign bit is clear, 0 for negative
01186 
01187         return _mm_add_epi32(c, d);                                             // adding 0x10000 wraps the 0xFFFF upper half to 0x0000 for non-negative inputs
01188 }

SSP_FORCEINLINE __m128i ssp_cvtepi16_epi64_SSE2 ( __m128i  a  ) 

SSE2 implementation of _mm_cvtepi16_epi64 [SSE4.1]. (Searches MSDN)

Definition at line 1192 of file SSEPlus_emulation_SSE2.h.

01193 {
01194         __m128i b = _mm_set1_epi32 (-1);                                //0xFFFFFFFF
01195         __m128i c = _mm_unpacklo_epi16(a, b);                   //FFFFa0**FFFFa1**....
01196         __m128i d = _mm_set_epi32(0,0x8000, 0,0x8000);  //0x8000 in each 64-bit lane = sign bit of the 16-bit source value
01197 
01198         c = _mm_unpacklo_epi32(c, b);                                   //FFFFFFFFFFFFa0**... (each qword: upper 48 bits pre-filled with ones)
01199         b = _mm_andnot_si128(c, d);                                             // ~c & d: 0x8000 when the sign bit is clear, 0x0000 for negative
01200         d = _mm_slli_epi64(b, 1);                                               // 0x10000 when the sign bit is clear, 0 for negative
01201 
01202         return _mm_add_epi64(c, d);                                             // adding 0x10000 wraps the all-ones upper 48 bits to zero for non-negative inputs
01203 }

SSP_FORCEINLINE __m128i ssp_cvtepi32_epi64_SSE2 ( __m128i  a  ) 

SSE2 implementation of _mm_cvtepi32_epi64 [SSE4.1]. (Searches MSDN)

Definition at line 1207 of file SSEPlus_emulation_SSE2.h.

01208 {
01209         __m128i b = _mm_set1_epi32 (-1);                                //0xFFFFFFFF
01210         __m128i c = _mm_unpacklo_epi32(a, b);                   //FFFFFFFFa0******FFFFFFFFa1******....
01211         __m128i d = _mm_set_epi32(0, 0x80000000,0,0x80000000);  //0x80000000 in each 64-bit lane = sign bit of the 32-bit source value
01212 
01213         b = _mm_andnot_si128(c, d);                                             // ~c & d: 0x80000000 when the sign bit is clear, 0 for negative
01214         d = _mm_slli_epi64(b, 1);                                               // 0x100000000 when the sign bit is clear, 0 for negative
01215 
01216         return _mm_add_epi64(c, d);                                             // adding 0x100000000 wraps the all-ones upper dword to zero for non-negative inputs
01217 }

SSP_FORCEINLINE __m128i ssp_cvtepi8_epi16_SSE2 ( __m128i  a  ) 

SSE2 implementation of _mm_cvtepi8_epi16 [SSE4.1]. (Searches MSDN)

Definition at line 1124 of file SSEPlus_emulation_SSE2.h.

01125 {
01126         __m128i b = _mm_setzero_si128 ();
01127         __m128i c = _mm_unpacklo_epi8(a, b);
01128         __m128i d = _mm_set1_epi16 (128);
01129 
01130         b = _mm_and_si128(d, c);
01131         d = _mm_set1_epi16(0x1FE);
01132         b = _mm_mullo_epi16(b, d);
01133 
01134         return _mm_add_epi16(c, b);
01135 
01136         //Another way, slower
01137         //__m128i b = _mm_set1_epi32 (-1);                              //0xFFFFFFFF
01138         //__m128i c = _mm_unpacklo_epi8(a, b);                  //FFa0FFa1....
01139         //__m128i d = _mm_set1_epi16 (128);                             //0x80
01140         //b = _mm_andnot_si128(c, d);                                   // 0x80 for positive, 0x00 for negative
01141         //d = _mm_slli_epi16(b, 1);                                             // 0x100 for positive, 0x000 for negative
01142         //return _mm_add_epi16(c, d);
01143 }

SSP_FORCEINLINE __m128i ssp_cvtepi8_epi32_SSE2 ( __m128i  a  ) 

SSE2 implementation of _mm_cvtepi8_epi32 [SSE4.1]. (Searches MSDN)

Definition at line 1147 of file SSEPlus_emulation_SSE2.h.

01148 {
01149         __m128i b = _mm_set1_epi32 (-1);                                //0xFFFFFFFF
01150         __m128i c = _mm_unpacklo_epi8(a, b);                    //FFa0FFa1....
01151         __m128i d = _mm_set1_epi32 (128);                               //0x80
01152 
01153         c = _mm_unpacklo_epi16(c, b);                                   //FFFFFFa0FFFFFFa1...
01154         b = _mm_andnot_si128(c, d);                                             // 0x80 for positive, 0x00 for negative
01155         d = _mm_slli_epi32(b, 1);                                               // 0x100 for positive, 0x000 for negative
01156 
01157         return _mm_add_epi32(c, d);
01158 }

SSP_FORCEINLINE __m128i ssp_cvtepi8_epi64_SSE2 ( __m128i  a  ) 

SSE2 implementation of _mm_cvtepi8_epi64 [SSE4.1]. (Searches MSDN)

Definition at line 1162 of file SSEPlus_emulation_SSE2.h.

01163 {
01164         __m128i b = _mm_set1_epi32 (-1);                                //0xFFFFFFFF
01165         __m128i c = _mm_unpacklo_epi8(a, b);                    //FFa0FFa1....
01166         __m128i d = _mm_set_epi32 (0, 128, 0, 128);             //0x80
01167 
01168         c = _mm_unpacklo_epi16(c, b);                                   //FFFFFFa0FFFFFFa1...
01169         c = _mm_unpacklo_epi32(c, b);                                   //FFFFFFFFFFFFFFa0...
01170         b = _mm_andnot_si128(c, d);                                             // 0x80 for positive, 0x00 for negative
01171         d = _mm_slli_epi64(b, 1);                                               // 0x100 for positive, 0x000 for negative
01172 
01173         return _mm_add_epi64(c, d);
01174 }

SSP_FORCEINLINE __m128i ssp_cvtepu16_epi32_SSE2 ( __m128i  a  ) 

SSE2 implementation of _mm_cvtepu16_epi32 [SSE4.1]. (Searches MSDN)

Definition at line 1254 of file SSEPlus_emulation_SSE2.h.

01255 {
01256         __m128i b = _mm_setzero_si128 ();
01257 
01258         return _mm_unpacklo_epi16(a, b);
01259 }

SSP_FORCEINLINE __m128i ssp_cvtepu16_epi64_SSE2 ( __m128i  a  ) 

SSE2 implementation of _mm_cvtepu16_epi64 [SSE4.1]. (Searches MSDN)

Definition at line 1263 of file SSEPlus_emulation_SSE2.h.

01264 {
01265         __m128i b = _mm_setzero_si128 ();
01266 
01267         a = _mm_unpacklo_epi16(a, b);
01268 
01269         return _mm_unpacklo_epi32(a, b);
01270 }

SSP_FORCEINLINE __m128i ssp_cvtepu32_epi64_SSE2 ( __m128i  a  ) 

SSE2 implementation of _mm_cvtepu32_epi64 [SSE4.1]. (Searches MSDN)

Definition at line 1274 of file SSEPlus_emulation_SSE2.h.

01275 {
01276         __m128i b = _mm_setzero_si128 ();
01277 
01278         return _mm_unpacklo_epi32(a, b);
01279 }

SSP_FORCEINLINE __m128i ssp_cvtepu8_epi16_SSE2 ( __m128i  a  ) 

SSE2 implementation of _mm_cvtepu8_epi16 [SSE4.1]. (Searches MSDN)

Definition at line 1221 of file SSEPlus_emulation_SSE2.h.

01222 {
01223         __m128i b =_mm_setzero_si128 ();
01224 
01225         return _mm_unpacklo_epi8(a, b);
01226 }

SSP_FORCEINLINE __m128i ssp_cvtepu8_epi32_SSE2 ( __m128i  a  ) 

SSE2 implementation of _mm_cvtepu8_epi32 [SSE4.1]. (Searches MSDN)

Definition at line 1230 of file SSEPlus_emulation_SSE2.h.

01231 {
01232         __m128i b = _mm_setzero_si128 ();
01233 
01234         a = _mm_unpacklo_epi8(a, b);
01235 
01236         return _mm_unpacklo_epi16(a, b);
01237 }

SSP_FORCEINLINE __m128i ssp_cvtepu8_epi64_SSE2 ( __m128i  a  ) 

SSE2 implementation of _mm_cvtepu8_epi64 [SSE4.1]. (Searches MSDN)

Definition at line 1241 of file SSEPlus_emulation_SSE2.h.

01242 {
01243         __m128i b = _mm_setzero_si128 ();
01244 
01245         a = _mm_unpacklo_epi8(a, b);
01246 
01247         a = _mm_unpacklo_epi16(a, b);
01248 
01249         return _mm_unpacklo_epi32(a, b);
01250 }

SSP_FORCEINLINE __m128d ssp_dp_pd_SSE2 ( __m128d  a,
__m128d  b,
const int  mask 
)

SSE2 implementation of _mm_dp_pd [SSE4.1]. (Searches MSDN)

Definition at line 893 of file SSEPlus_emulation_SSE2.h.

00894 {
00895     int smallMask = (mask & 0x33)<<16;                      // Keep mask bits 0,1,4,5 and move them into the upper 16-bit word of each dword
00896     const static __m128i mulShiftImm_01 = SSP_CONST_SET_32I( 0x40000000, 0x40000000, 0x80000000, 0x80000000 );   // Shift mask multiply moves 0,1, bits to left, becomes MSB
00897     const static __m128i mulShiftImm_45 = SSP_CONST_SET_32I( 0x04000000, 0x04000000, 0x08000000, 0x08000000 );   // Shift mask multiply moves 4,5, bits to left, becomes MSB
00898     ssp_m128 mHi, mLo;
00899 
00900     mLo.i = _mm_set1_epi32( smallMask );// Load the mask into register
00901     mHi.i = _mm_mullo_epi16( mLo.i, mulShiftImm_45 );       // High mask bits 4,5 (which products are formed) -> sign bit; was wrongly using the 0,1 constant
00902     mLo.i = _mm_mullo_epi16( mLo.i, mulShiftImm_01 );       // Low mask bits 0,1 (which lanes receive the sum) -> sign bit; was wrongly using the 4,5 constant
00903 
00904     mHi.i = _mm_cmplt_epi32( mHi.i, _mm_setzero_si128() );  // FFFFFFFF if bit set, 00000000 if not set
00905     mLo.i = _mm_cmplt_epi32( mLo.i, _mm_setzero_si128() );  // FFFFFFFF if bit set, 00000000 if not set
00906 
00907     a = _mm_mul_pd( a, b );
00908     a = _mm_and_pd( a, mHi.d );                             // Zero deselected products after multiplying, so 0*Inf or 0*NaN from b cannot leak into the sum
00909 
00910     b = _mm_shuffle_pd( a, a, _MM_SHUFFLE2(0, 1) );         // b = a with its two lanes swapped
00911     a = _mm_add_pd( a, b );                                 // Both lanes now hold the sum of the two products
00912     a = _mm_and_pd( a, mLo.d );                             // Clear output using low bits of the mask
00913     return a;
00914 }

SSP_FORCEINLINE __m128 ssp_dp_ps_SSE2 ( __m128  a,
__m128  b,
const int  mask 
)

SSE2 implementation of _mm_dp_ps [SSE4.1]. (Searches MSDN)

Definition at line 918 of file SSEPlus_emulation_SSE2.h.

00919 {
00920     const static __m128i mulShiftImm_0123 = SSP_CONST_SET_32I( 0x010000, 0x020000, 0x040000, 0x080000 );   // Shift mask multiply moves 0,1,2,3 bits to left, becomes MSB
00921     const static __m128i mulShiftImm_4567 = SSP_CONST_SET_32I( 0x100000, 0x200000, 0x400000, 0x800000 );   // Shift mask multiply moves 4,5,6,7 bits to left, becomes MSB
00922 
00923     // Begin mask preparation
00924     ssp_m128 mHi, mLo;
00925     mLo.i = _mm_set1_epi32( mask );                                 // Load the mask into register
00926     mLo.i = _mm_slli_si128( mLo.i, 3 );                             // Shift into reach of the 16 bit multiply
00927 
00928     mHi.i = _mm_mullo_epi16( mLo.i, mulShiftImm_0123 );             // Shift the bits
00929     mLo.i = _mm_mullo_epi16( mLo.i, mulShiftImm_4567 );             // Shift the bits
00930 
00931     mHi.i = _mm_cmplt_epi32( mHi.i, _mm_setzero_si128() );          // FFFFFFFF if bit set, 00000000 if not set
00932     mLo.i = _mm_cmplt_epi32( mLo.i, _mm_setzero_si128() );          // FFFFFFFF if bit set, 00000000 if not set
00933     // End mask preparation - Mask bits 0-3 in mLo, 4-7 in mHi
00934 
00935     a = _mm_mul_ps( a, b );
00936     a = _mm_and_ps( a, mHi.f );                                     // Zero deselected products after multiplying, so 0*Inf or 0*NaN from b cannot leak into the sum
00937 
00938     a = ssp_arithmetic_hadd4_dup_ps_SSE2( a );                      // Horizontally add the 4 values
00939     a = _mm_and_ps( a, mLo.f );                                     // Clear output using low bits of the mask
00940     return a;
00941 }

SSP_FORCEINLINE int ssp_extract_epi32_SSE2 ( __m128i  a,
const int  imm 
)

SSE2 implementation of _mm_extract_epi32 [SSE4.1]. (Searches MSDN)

Definition at line 1532 of file SSEPlus_emulation_SSE2.h.

01533 {
01534     ssp_m128 mask;
01535     switch( imm & 0x3 )
01536     {
01537     case 3:  a = _mm_srli_si128( a, 12 ); break;
01538     case 2:  a = _mm_srli_si128( a, 8  ); break;
01539     case 1:  a = _mm_srli_si128( a, 4  ); break;
01540     }
01541 
01542     mask.i = _mm_set_epi32 ( 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF );
01543     mask.i = _mm_and_si128 ( mask.i, a   );
01544 
01545     return mask.s32[0];
01546 }

SSP_FORCEINLINE ssp_s64 ssp_extract_epi64_SSE2 ( __m128i  a,
const int  ndx 
)

SSE2 implementation of _mm_extract_epi64 [SSE4.1]. (Searches MSDN)

Definition at line 1557 of file SSEPlus_emulation_SSE2.h.

01558 {
01559     ssp_m128 mask;
01560     switch( ndx & 0x1 )
01561     {
01562     case 1:  a = _mm_srli_si128( a, 8  ); break;
01563     }
01564 
01565     mask.i = _mm_set_epi32 ( 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF );
01566     mask.i = _mm_and_si128 ( mask.i, a   );
01567 
01568     return mask.s64[0];
01569 }

SSP_FORCEINLINE int ssp_extract_epi8_SSE2 ( __m128i  a,
const int  ndx 
)

SSE2 implementation of _mm_extract_epi8 [SSE4.1]. (Searches MSDN)

Definition at line 1503 of file SSEPlus_emulation_SSE2.h.

01504 {
01505     ssp_m128 mask;
01506     switch( ndx & 0xF )                                     // only the low four bits of ndx select the byte; index 0 needs no shift
01507     {
01508     case 15:  a = _mm_srli_si128( a, 15 ); break;
01509     case 14:  a = _mm_srli_si128( a, 14 ); break;
01510     case 13:  a = _mm_srli_si128( a, 13 ); break;
01511     case 12:  a = _mm_srli_si128( a, 12 ); break;
01512     case 11:  a = _mm_srli_si128( a, 11 ); break;
01513     case 10:  a = _mm_srli_si128( a, 10 ); break;
01514     case 9:   a = _mm_srli_si128( a,  9 ); break;
01515     case 8:   a = _mm_srli_si128( a,  8 ); break;
01516     case 7:   a = _mm_srli_si128( a,  7 ); break;
01517     case 6:   a = _mm_srli_si128( a,  6 ); break;
01518     case 5:   a = _mm_srli_si128( a,  5 ); break;
01519     case 4:   a = _mm_srli_si128( a,  4 ); break;
01520     case 3:   a = _mm_srli_si128( a,  3 ); break;
01521     case 2:   a = _mm_srli_si128( a,  2 ); break;
01522     case 1:   a = _mm_srli_si128( a,  1 ); break;
01523     }
01524 
01525     mask.i = _mm_setr_epi8 ( -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ); 
01526     // mask = { 00,00,00,00,00,00,00,00,00,00,00,00,00,00,00,FF }
01527     mask.i = _mm_and_si128 ( mask.i, a   );                 // keep only the selected byte, now in position 0
01528     return mask.s8[0] & 0xFF;                               // zero-extend like _mm_extract_epi8; a signed s8 element would otherwise sign-extend bytes >= 0x80
01529 }

SSP_FORCEINLINE int ssp_extract_ps_SSE2 ( __m128  a,
const int  ndx 
)

SSE2 implementation of _mm_extract_ps [SSE4.1]. (Searches MSDN)

Definition at line 1549 of file SSEPlus_emulation_SSE2.h.

01550 {
01551     ssp_m128 A;
01552     A.f = a;
01553     return ssp_extract_epi32_SSE2( A.i, ndx );
01554 }

SSP_FORCEINLINE __m128i ssp_extract_si64_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_extract_si64 [SSE4a]. (Searches MSDN)
NOTE: The upper 64-bit of the destination register are undefined.

Definition at line 1586 of file SSEPlus_emulation_SSE2.h.

01587 {
01588     ssp_u32 len, ndx;   
01589     ssp_m128 B;
01590     B.i = b;
01591 
01592     ndx = (ssp_u32)((B.u64[0] & 0x3F00) >> 8);    // Mask ndx field.
01593     len = (ssp_u32)((B.u64[0] & 0x003F));         // Mask len field.
01594 
01595     a = ssp_extracti_si64_SSE2( a, len, ndx );   
01596     return a;
01597 }

SSP_FORCEINLINE __m128i ssp_extracti_si64_SSE2 ( __m128i  a,
int  len,
int  ndx 
)

SSE2 implementation of _mm_extracti_si64 [SSE4a]. (Searches MSDN)
NOTE: The upper 64-bits of the destination register are undefined.

Definition at line 1574 of file SSEPlus_emulation_SSE2.h.

01575 {
01576     int left = ndx + len;   
01577     a = _mm_slli_epi64( a, 64-left );    // clear the mask to the left
01578     a = _mm_srli_epi64( a, 64-len  );    // clear the mask to the right      
01579     return a;
01580 }

SSP_FORCEINLINE __m128d ssp_floor_pd_SSE2 ( __m128d  a  ) 

SSE2 implementation of _mm_floor_pd [SSE4.1]. (Searches MSDN)

Definition at line 1053 of file SSEPlus_emulation_SSE2.h.

01054 {
01055     return ssp_round_pd_SSE2( a, SSP_FROUND_TO_NEG_INF );
01056 }

SSP_FORCEINLINE __m128 ssp_floor_ps_SSE2 ( __m128  a  ) 

SSE2 implementation of _mm_floor_ps [SSE4.1]. (Searches MSDN)

Definition at line 1046 of file SSEPlus_emulation_SSE2.h.

01047 {
01048     return ssp_round_ps_SSE2( a, SSP_FROUND_TO_NEG_INF );
01049 }

SSP_FORCEINLINE __m128d ssp_floor_sd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_floor_sd [SSE4.1]. (Searches MSDN)

Definition at line 1066 of file SSEPlus_emulation_SSE2.h.

01067 {
01068         b = ssp_round_pd_SSE2(b, SSP_FROUND_TO_NEG_INF );
01069 
01070     return _mm_shuffle_pd(b, a, _MM_SHUFFLE2(1,0));
01071 }

SSP_FORCEINLINE __m128 ssp_floor_ss_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_floor_ss [SSE4.1]. (Searches MSDN)

Definition at line 1082 of file SSEPlus_emulation_SSE2.h.

01083 {
01084         b = ssp_round_ps_SSE2(b, SSP_FROUND_TO_NEG_INF );
01085         b = _mm_shuffle_ps(b, a, _MM_SHUFFLE(1,1,0,0));
01086     return _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,2,0));
01087 }

SSP_FORCEINLINE __m128i ssp_hadd_epi16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_hadd_epi16 [SSSE3]. (Searches MSDN)

Definition at line 569 of file SSEPlus_emulation_SSE2.h.

00570 {
00571     ssp_convert_odd_even_epi16_SSE2( &a, &b );
00572     a = _mm_add_epi16( a, b );     
00573     return a;
00574 }

SSP_FORCEINLINE __m128i ssp_hadd_epi32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_hadd_epi32 [SSSE3]. (Searches MSDN)

Definition at line 606 of file SSEPlus_emulation_SSE2.h.

00607 {
00608    ssp_convert_odd_even_epi32_SSE2( &a, &b );
00609    a = _mm_add_epi32( a, b );
00610    return a; 
00611 }

SSP_FORCEINLINE __m128d ssp_hadd_pd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_hadd_pd [SSE3]. (Searches MSDN)

Definition at line 643 of file SSEPlus_emulation_SSE2.h.

00644 {
00645     ssp_m128 A,B,C;
00646     A.d = a;
00647     C.d = a;
00648     B.d = b;
00649 
00650     A.f = _mm_movelh_ps( A.f, B.f );
00651     B.f = _mm_movehl_ps( B.f, C.f );
00652     A.d = _mm_add_pd   ( A.d, B.d );
00653     return A.d;
00654 }

SSP_FORCEINLINE __m128 ssp_hadd_ps_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_hadd_ps [SSE3]. (Searches MSDN)

Definition at line 624 of file SSEPlus_emulation_SSE2.h.

00625 {
00626     ssp_convert_odd_even_ps_SSE2( &a, &b );
00627     a = _mm_add_ps( a, b );
00628     return a;
00629 }

SSP_FORCEINLINE __m128i ssp_hadds_epi16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_hadds_epi16 [SSSE3]. (Searches MSDN)

Definition at line 577 of file SSEPlus_emulation_SSE2.h.

00578 {
00579     ssp_convert_odd_even_epi16_SSE2( &a, &b );
00580     a = _mm_adds_epi16( a, b );    
00581     return a;
00582 }

SSP_FORCEINLINE __m128i ssp_hsub_epi16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_hsub_epi16 [SSSE3]. (Searches MSDN)

Definition at line 587 of file SSEPlus_emulation_SSE2.h.

00588 {
00589     ssp_convert_odd_even_epi16_SSE2( &a, &b ); 
00590     a = _mm_sub_epi16( a, b );     
00591     return a;  
00592 }

SSP_FORCEINLINE __m128i ssp_hsub_epi32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_hsub_epi32 [SSSE3]. (Searches MSDN)

Definition at line 614 of file SSEPlus_emulation_SSE2.h.

00615 {
00616    ssp_convert_odd_even_epi32_SSE2( &a, &b );
00617    a = _mm_sub_epi32( b, a );
00618    return a;
00619 }

SSP_FORCEINLINE __m128d ssp_hsub_pd_SSE2 ( __m128d  a,
__m128d  b 
)

SSE2 implementation of _mm_hsub_pd [SSE3]. (Searches MSDN)

Definition at line 658 of file SSEPlus_emulation_SSE2.h.

00659 {
00660     ssp_m128 A,B,C;
00661     A.d = a;
00662     C.d = a;
00663     B.d = b;
00664 
00665     A.f = _mm_movelh_ps( A.f, B.f );
00666     B.f = _mm_movehl_ps( B.f, C.f );
00667     A.d = _mm_sub_pd   ( A.d, B.d );
00668     return A.d;
00669 }

SSP_FORCEINLINE __m128 ssp_hsub_ps_SSE2 ( __m128  a,
__m128  b 
)

SSE2 implementation of _mm_hsub_ps [SSE3]. (Searches MSDN)

Definition at line 633 of file SSEPlus_emulation_SSE2.h.

00634 {
00635     ssp_convert_odd_even_ps_SSE2( &a, &b );
00636     a = _mm_sub_ps( b, a );
00637     return a;
00638 }

SSP_FORCEINLINE __m128i ssp_hsubs_epi16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_hsubs_epi16 [SSSE3]. (Searches MSDN)

Definition at line 596 of file SSEPlus_emulation_SSE2.h.

00597 {
00598     ssp_convert_odd_even_epi16_SSE2( &a, &b ); 
00599     a = _mm_subs_epi16( a, b );     
00600     return a;  
00601 }

SSP_FORCEINLINE __m128i ssp_insert_epi32_SSE2 ( __m128i  a,
int  b,
const int  ndx 
)

SSE2 implementation of _mm_insert_epi32 [SSE4.1]. (Searches MSDN)

Definition at line 698 of file SSEPlus_emulation_SSE2.h.

00698                                                                                    // TODO: Verify behavior on Intel Hardware
00699 {
00700     switch( ndx & 0x3 )
00701     {
00702     case 0: a = _mm_insert_epi16( a, b    , 0 );
00703             a = _mm_insert_epi16( a, b<<16, 1 ); break;
00704     case 1: a = _mm_insert_epi16( a, b    , 2 );
00705             a = _mm_insert_epi16( a, b<<16, 3 ); break;
00706     case 2: a = _mm_insert_epi16( a, b    , 4 );
00707             a = _mm_insert_epi16( a, b<<16, 5 ); break;
00708     case 3: a = _mm_insert_epi16( a, b    , 6 );
00709             a = _mm_insert_epi16( a, b<<16, 7 ); break;
00710     }
00711     return a;
00712 }

SSP_FORCEINLINE __m128i ssp_insert_epi8_SSE2 ( __m128i  a,
int  b,
const int  ndx 
)

SSE2 implementation of _mm_insert_epi8 [SSE4.1]. (Searches MSDN)

Another implementation, but slower, is kept as commented-out code at the end of the listing below.

Definition at line 1428 of file SSEPlus_emulation_SSE2.h.

01429 {
01430     ssp_m128 Ahi, Alo;
01431     b = b & 0xFF;                                           /* Convert to 8-bit integer */
01432     Ahi.i = _mm_unpackhi_epi8( a, _mm_setzero_si128() );    /* Ahi = a_8[8:15]  Simulate 8bit integers as 16-bit integers */
01433     Alo.i = _mm_unpacklo_epi8( a, _mm_setzero_si128() );    /* Alo = a_8[0:7]   Simulate 8bit integers as 16-bit integers */
01434 
01435     /* Insert b as a 16-bit integer to upper or lower half of a */
01436     switch( ndx & 0xF )
01437     {
01438     case 0:  Alo.i = _mm_insert_epi16( Alo.i, b, 0 ); break;
01439     case 1:  Alo.i = _mm_insert_epi16( Alo.i, b, 1 ); break;
01440     case 2:  Alo.i = _mm_insert_epi16( Alo.i, b, 2 ); break;
01441     case 3:  Alo.i = _mm_insert_epi16( Alo.i, b, 3 ); break;
01442     case 4:  Alo.i = _mm_insert_epi16( Alo.i, b, 4 ); break;
01443     case 5:  Alo.i = _mm_insert_epi16( Alo.i, b, 5 ); break;
01444     case 6:  Alo.i = _mm_insert_epi16( Alo.i, b, 6 ); break;
01445     case 7:  Alo.i = _mm_insert_epi16( Alo.i, b, 7 ); break;
01446     case 8:  Ahi.i = _mm_insert_epi16( Ahi.i, b, 0 ); break;
01447     case 9:  Ahi.i = _mm_insert_epi16( Ahi.i, b, 1 ); break;
01448     case 10: Ahi.i = _mm_insert_epi16( Ahi.i, b, 2 ); break;
01449     case 11: Ahi.i = _mm_insert_epi16( Ahi.i, b, 3 ); break;
01450     case 12: Ahi.i = _mm_insert_epi16( Ahi.i, b, 4 ); break;
01451     case 13: Ahi.i = _mm_insert_epi16( Ahi.i, b, 5 ); break;
01452     case 14: Ahi.i = _mm_insert_epi16( Ahi.i, b, 6 ); break;
01453     default: Ahi.i = _mm_insert_epi16( Ahi.i, b, 7 );
01454     }
01455     return _mm_packus_epi16( Alo.i, Ahi.i ); // Pack the 16-bit integers to 8bit again.
01456 
01458     //ssp_m128 A, B, mask;
01459     //mask.i = _mm_setzero_si128();
01460     //mask.s8[ ndx & 0x0F ] = (ssp_s8)0xFF;
01461     //B.i    = _mm_set1_epi8( (ssp_s8)b );
01462     //A.i    = _mm_andnot_si128( mask.i, a );
01463     //mask.i = _mm_and_si128( mask.i, B.i );
01464     //A.i = _mm_or_si128( A.i, mask.i );
01465     //return A.i;
01466 }

SSP_FORCEINLINE __m128i ssp_insert_si64_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_insert_si64 [SSE4a]. (Searches MSDN)

Definition at line 1485 of file SSEPlus_emulation_SSE2.h.

01486 {
01487     ssp_u32 ndx, len;
01488     ssp_m128 B;
01489     B.i = b;
01490 
01491     ndx = (ssp_u32)((B.u64[1] & 0x3F00) >> 8);    // Mask ndx (bit index) field.
01492     len = (ssp_u32)((B.u64[1] & 0x003F));         // Mask length field.
01493 
01494     a = ssp_inserti_si64_SSE2( a, b, len, ndx );
01495     return a;
01496 }

SSP_FORCEINLINE __m128i ssp_inserti_si64_SSE2 ( __m128i  a,
__m128i  b,
int  len,
int  ndx 
)

SSE2 implementation of _mm_inserti_si64 [SSE4a]. (Searches MSDN)

Definition at line 1468 of file SSEPlus_emulation_SSE2.h.

01469 {
01470     const static __m128i MASK = SSP_CONST_SET_32I( 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF );
01471 
01472     int left = ndx + len;
01473     __m128i m;
01474     m = _mm_slli_epi64( MASK, 64-left );    // clear the mask to the left
01475     m = _mm_srli_epi64( m,    64-len  );    // clear the mask to the right
01476     m = _mm_slli_epi64( m,    ndx     );    // put the mask into the proper position
01477     b = _mm_slli_epi64( b,    ndx     );    // put the insert bits into the proper position
01478 
01479     a = ssp_logical_bitwise_select_SSE2( b, a, m );
01480     return a;
01481 }

SSP_FORCEINLINE __m128i ssp_macc_epi16_SSE2 ( __m128i  a,
__m128i  b,
__m128i  c 
)

SSE2 implementation of _mm_macc_epi16/ pmacsww [SSE5]. (SSE5 .pdf documentation here)

Definition at line 25 of file SSEPlus_emulation_SSE2.h.

00026 {
00027     a = _mm_mullo_epi16( a, b );
00028     a = _mm_add_epi16( a, c );
00029     return a;
00030 }

SSP_FORCEINLINE __m128i ssp_macc_epi32_SSE2 ( __m128i  a,
__m128i  b,
__m128i  c 
)

SSE2 implementation of _mm_macc_epi32/ pmacsdd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 33 of file SSEPlus_emulation_SSE2.h.

00034 {
00035         __m128i ab02, ab13, mask;
00036 
00037         mask = _mm_set_epi32(0, 0xFFFFFFFF, 0, 0xFFFFFFFF);
00038         ab02 = _mm_mul_epu32(a, b);
00039         ab02 = _mm_and_si128(ab02, mask);
00040         a    = _mm_srli_epi64(a, 32);
00041         b    = _mm_srli_epi64(b, 32);
00042         ab13 = _mm_mul_epu32(a, b);
00043         ab13 = _mm_slli_epi64(ab13, 32);
00044 
00045         a    = _mm_add_epi32(ab02, ab13);
00046 
00047         return _mm_add_epi32(a, c);
00048 }

SSP_FORCEINLINE __m128d ssp_macc_pd_SSE2 ( __m128d  a,
__m128d  b,
__m128d  c 
)

SSE2 implementation of _mm_macc_pd/fmaddpd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 51 of file SSEPlus_emulation_SSE2.h.

00052 {
00053     a = _mm_mul_pd( a, b );
00054     a = _mm_add_pd( a, c );
00055     return a;
00056 }

SSP_FORCEINLINE __m128 ssp_macc_ps_SSE2 ( __m128  a,
__m128  b,
__m128  c 
)

SSE2 implementation of _mm_macc_ps/fmaddps [SSE5]. (SSE5 .pdf documentation here)

Definition at line 59 of file SSEPlus_emulation_SSE2.h.

00060 {
00061     a = _mm_mul_ps( a, b );
00062     a = _mm_add_ps( a, c );
00063     return a;
00064 }

SSP_FORCEINLINE __m128d ssp_macc_sd_SSE2 ( __m128d  a,
__m128d  b,
__m128d  c 
)

SSE2 implementation of _mm_macc_sd/fmaddsd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 67 of file SSEPlus_emulation_SSE2.h.

00068 {
00069     const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0, 0 );
00070 
00071     ssp_m128 A,B;
00072     A.d = a;
00073     B.d = b;
00074     B.d = ssp_macc_pd_SSE2( A.d, B.d, c );
00075     B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask ); // This was faster than using 2 shuffles
00076     return B.d;
00077 }

SSP_FORCEINLINE __m128 ssp_macc_ss_SSE2 ( __m128  a,
__m128  b,
__m128  c 
)

SSE2 implementation of _mm_macc_ss/fmaddss [SSE5]. (SSE5 .pdf documentation here)

Definition at line 80 of file SSEPlus_emulation_SSE2.h.

00080                                                                                                                                        : confirm
00081 {
00082     const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0 );
00083 
00084     ssp_m128 A,B;
00085     A.f = a;
00086     B.f = b;
00087     B.f = ssp_macc_ps_SSE2( A.f, B.f, c );
00088     B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask ); // This was faster than using 2 shuffles
00089     return B.f;
00090 }

SSP_FORCEINLINE __m128i ssp_maccd_epi16_SSE2 ( __m128i  a,
__m128i  b,
__m128i  c 
)

SSE2 implementation of _mm_maccd_epi16/ pmacswd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 93 of file SSEPlus_emulation_SSE2.h.

00094 {
00095         __m128i ab_lo, ab_hi;
00096         __m128i mask = _mm_set1_epi32(0xFFFF);
00097 
00098     ab_lo = _mm_mullo_epi16(a, b);
00099         ab_hi = _mm_mulhi_epi16(a, b);
00100 
00101         ab_lo = _mm_and_si128(ab_lo, mask);
00102         ab_hi = _mm_and_si128(ab_hi, mask);
00103         ab_hi = _mm_slli_epi32(ab_hi, 16);
00104         a = _mm_add_epi32( ab_lo, ab_hi );
00105         return _mm_add_epi32 (a, c);
00106 
00108         //b     = _mm_unpacklo_epi16(ab_lo, ab_hi);
00109         //ab_hi = _mm_unpackhi_epi16(ab_lo, ab_hi);
00110         //ab_lo = _mm_unpacklo_epi32(b,     ab_hi);
00111         //ab_hi = _mm_unpackhi_epi32(b,     ab_hi);
00112         //ab_lo = _mm_unpacklo_epi32(ab_lo, ab_hi);
00113         //return _mm_add_epi32(ab_lo, c);
00114 }

SSP_FORCEINLINE __m128i ssp_macchi_epi32_SSE2 ( __m128i  a,
__m128i  b,
__m128i  c 
)

SSE2 implementation of _mm_macchi_epi32/ pmacsdqh [SSE5]. (SSE5 .pdf documentation here)

Definition at line 117 of file SSEPlus_emulation_SSE2.h.

00118 {
00119         __m128i mask, mask_A, mask_B, mask_C, ab;
00120 
00121         a = _mm_srli_epi64(a, 32);
00122         b = _mm_srli_epi64(b, 32);
00123         mask   = _mm_set_epi32(0x7FFFFFFF, 0, 0x7FFFFFFF, 0);
00124 
00125         //abs(A)
00126         mask_A = _mm_cmplt_epi32( a, mask);     //FFF...F when a < 0
00127         a      = _mm_xor_si128 ( a, mask_A );   //Invert  when a < 0
00128         mask_C = _mm_srli_epi32( mask_A, 31 );  // 1      when a < 0
00129         a      = _mm_add_epi32( a, mask_C );    //Add 1   when a < 0
00130 
00131         //abs(B)
00132         mask_B = _mm_cmplt_epi32( b, mask);     //FFF...F when b < 0
00133         b      = _mm_xor_si128 ( b, mask_B );   //Invert  when b < 0
00134         mask_C = _mm_srli_epi32( mask_B, 31 );  // 1      when b < 0
00135         b      = _mm_add_epi32( b, mask_C );    //Add 1   when b < 0
00136 
00137         ab     = _mm_mul_epu32(a, b);
00138 
00139         //correct negative cases
00140         mask_A = _mm_xor_si128(mask_A, mask_B);
00141         mask_C = _mm_srli_epi32(mask_A, 31 );
00142         mask_B = _mm_slli_epi64(mask_A, 32);
00143         mask   = _mm_add_epi32(mask_A, mask_B);
00144         a      = _mm_xor_si128(ab, mask);
00145         a      = _mm_add_epi64(a, mask_C);
00146 
00147         return _mm_add_epi64(a, c);
00148 }

SSP_FORCEINLINE __m128i ssp_macclo_epi32_SSE2 ( __m128i  a,
__m128i  b,
__m128i  c 
)

SSE2 implementation of _mm_macclo_epi32/ pmacsdql [SSE5]. (SSE5 .pdf documentation here)

Definition at line 151 of file SSEPlus_emulation_SSE2.h.

00152 {
00153         __m128i mask, mask_A, mask_B, mask_C, ab;
00154 
00155         mask   = _mm_set_epi32(0x7FFFFFFF, 0, 0x7FFFFFFF, 0);
00156         //abs(A)
00157         mask_A = _mm_cmplt_epi32( a, mask);     //FFF...F when a < 0
00158         a      = _mm_xor_si128 ( a, mask_A );   //Invert  when a < 0
00159         mask_C = _mm_srli_epi32( mask_A, 31 );  // 1      when a < 0
00160         a      = _mm_add_epi32( a, mask_C );    //Add 1   when a < 0
00161 
00162         //abs(B)
00163         mask_B = _mm_cmplt_epi32( b, mask);     //FFF...F when b < 0
00164         b      = _mm_xor_si128 ( b, mask_B );   //Invert  when b < 0
00165         mask_C = _mm_srli_epi32( mask_B, 31 );  // 1      when b < 0
00166         b      = _mm_add_epi32( b, mask_C );    //Add 1   when b < 0
00167 
00168         ab     = _mm_mul_epu32(a, b);
00169 
00170         //correct negative cases
00171         mask_A = _mm_xor_si128(mask_A, mask_B);
00172         mask_C = _mm_srli_epi32(mask_A, 31 );
00173         mask_B = _mm_slli_epi64(mask_A, 32);
00174         mask   = _mm_add_epi32(mask_A, mask_B);
00175         a      = _mm_xor_si128(ab, mask);
00176         a      = _mm_add_epi64(a, mask_C);
00177 
00178         return _mm_add_epi64(a, c);
00179 }

SSP_FORCEINLINE __m128i ssp_maccs_epi16_SSE2 ( __m128i  a,
__m128i  b,
__m128i  c 
)

SSE2 implementation of _mm_maccs_epi16/ pmacssww [SSE5]. (SSE5 .pdf documentation here)

Definition at line 182 of file SSEPlus_emulation_SSE2.h.

00183 {
00184         //similar to the version in Framewave CBL
00185         __m128i ablo, abhi, unlo, unhi, signC, clo, chi;
00186 
00187         ablo  = _mm_mullo_epi16( a, b );
00188         abhi  = _mm_mulhi_epi16( a, b );
00189         unlo  = _mm_unpacklo_epi16( ablo, abhi );
00190         unhi  = _mm_unpackhi_epi16( ablo, abhi );
00191 
00192         //unpack and keep the sign of C
00193         signC = _mm_srai_epi16 (c, 15);
00194         chi   = _mm_unpackhi_epi16(c, signC);
00195         clo   = _mm_unpacklo_epi16(c, signC);
00196 
00197         chi   = _mm_add_epi32(chi, unhi);
00198         clo   = _mm_add_epi32(clo, unlo);
00199 
00200         return _mm_packs_epi32(clo, chi);
00201 }

SSP_FORCEINLINE __m128i ssp_maccs_epi32_SSE2 ( __m128i  a,
__m128i  b,
__m128i  c 
)

SSE2 implementation of _mm_maccs_epi32/ pmacssdd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 204 of file SSEPlus_emulation_SSE2.h.

00205 {
00206         //Version 1, slightly modified from Framewave CBL
00207         ssp_m128 s1lo,s1hi,s2lo,s2hi,s3lo,s3hi, sl, sh;
00208         static const __m128d max_val = {(double)0x7FFFFFFFl, (double)0x7FFFFFFFl};
00209         static const __m128d min_val = {(-(double)0x80000000l), (-(double)0x80000000l)};
00210 
00211         s1lo.d =  _mm_cvtepi32_pd(a);
00212         s1hi.d = _mm_cvtepi32_pd(_mm_srli_si128(a, 8)); 
00213 
00214         s2lo.d =  _mm_cvtepi32_pd(b);
00215         s2hi.d = _mm_cvtepi32_pd(_mm_srli_si128(b,8)); 
00216 
00217         s1lo.d = _mm_mul_pd(s1lo.d,s2lo.d);
00218         s1hi.d = _mm_mul_pd(s1hi.d,s2hi.d);
00219 
00220         s3lo.d =  _mm_cvtepi32_pd(c);
00221         s3hi.d = _mm_cvtepi32_pd(_mm_srli_si128(c,8)); 
00222         
00223         s1lo.d = _mm_add_pd(s1lo.d,s3lo.d);
00224         s1hi.d = _mm_add_pd(s1hi.d,s3hi.d);
00225 
00226         sl.d   = _mm_min_pd(s1lo.d, max_val);
00227         sl.d   = _mm_max_pd(sl.d, min_val);
00228 
00229         sh.d   = _mm_min_pd(s1hi.d, max_val);
00230         sh.d   = _mm_max_pd(sh.d, min_val);
00231 
00232         sl.i   = _mm_cvtpd_epi32(sl.d); 
00233         sh.i   = _mm_cvtpd_epi32(sh.d);
00234 
00235         sh.i   = _mm_slli_si128(sh.i, 8); 
00236         sl.i   = _mm_or_si128(sl.i, sh.i);
00237 
00238     return sl.i;
00239 }

SSP_FORCEINLINE __m128i ssp_maddubs_epi16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_maddubs_epi16 [SSSE3]. (Searches MSDN)

In: 2 registers x 16 x 8-bit values (a is unsigned, b is signed). Out: 1 register x 8 x 16-bit values.

r0 := SATURATE_16((a0 * b0) + (a1 * b1))

Definition at line 799 of file SSEPlus_emulation_SSE2.h.

00800 {
00801     const static __m128i EVEN_8 = SSP_CONST_SET_8I( 0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF);
00802     __m128i Aodd, Aeven, Beven, Bodd;
00803 
00804     // Convert the 8 bit inputs into 16 bits by dropping every other value
00805     Aodd  = _mm_srli_epi16( a, 8 );             // A is unsigned  
00806     Bodd  = _mm_srai_epi16( b, 8 );             // B is signed
00807 
00808     Aeven = _mm_and_si128 ( a, EVEN_8 );        // A is unsigned   
00809     Beven = _mm_slli_si128( b,     1  );        // B is signed
00810     Beven = _mm_srai_epi16( Beven, 8  );
00811 
00812     a = _mm_mullo_epi16( Aodd , Bodd  );        // Will always fit in lower 16
00813     b = _mm_mullo_epi16( Aeven, Beven );  
00814     a = _mm_adds_epi16 ( a, b );
00815         return a;
00816 }

SSP_FORCEINLINE __m128i ssp_max_epi32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_max_epi32 [SSE4.1]. (Searches MSDN)

Definition at line 765 of file SSEPlus_emulation_SSE2.h.

00766 {
00767     __m128i mask  = _mm_cmpgt_epi32( a, b );                            // FFFFFFFF where a > b
00768     a = ssp_logical_bitwise_select_SSE2( a, b, mask );
00769     return a;
00770 }

SSP_FORCEINLINE __m128i ssp_max_epi8_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_max_epi8 [SSE4.1]. (Searches MSDN)

Definition at line 729 of file SSEPlus_emulation_SSE2.h.

00730 {
00731     __m128i mask  = _mm_cmpgt_epi8( a, b );                             // FFFFFFFF where a > b
00732     a = ssp_logical_bitwise_select_SSE2( a, b, mask );
00733     return a;
00734 }

SSP_FORCEINLINE __m128i ssp_max_epu16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_max_epu16 [SSE4.1]. (Searches MSDN)

Definition at line 747 of file SSEPlus_emulation_SSE2.h.

00748 {
00749     __m128i mask = ssp_comgt_epu16_SSE2( a, b );
00750     a = ssp_logical_bitwise_select_SSE2( a, b, mask );
00751     return a;
00752 }

SSP_FORCEINLINE __m128i ssp_max_epu32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_max_epu32 [SSE4.1]. (Searches MSDN)

Definition at line 783 of file SSEPlus_emulation_SSE2.h.

00784 {
00785    __m128i mask = ssp_comgt_epu32_SSE2( a, b );
00786     a = ssp_logical_bitwise_select_SSE2( a, b, mask );
00787     return a;
00788 }

SSP_FORCEINLINE __m128i ssp_min_epi32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_min_epi32 [SSE4.1]. (Searches MSDN)

Definition at line 756 of file SSEPlus_emulation_SSE2.h.

00757 {
00758     __m128i mask  = _mm_cmplt_epi32( a, b );                            // FFFFFFFF where a < b
00759     a = ssp_logical_bitwise_select_SSE2( a, b, mask );
00760     return a;
00761 }

SSP_FORCEINLINE __m128i ssp_min_epi8_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_min_epi8 [SSE4.1]. (Searches MSDN)

Definition at line 720 of file SSEPlus_emulation_SSE2.h.

00721 {
00722     __m128i mask  = _mm_cmplt_epi8( a, b );                             // FFFFFFFF where a < b
00723     a = ssp_logical_bitwise_select_SSE2( a, b, mask );
00724     return a;
00725 }

SSP_FORCEINLINE __m128i ssp_min_epu16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_min_epu16 [SSE4.1]. (Searches MSDN)

Definition at line 738 of file SSEPlus_emulation_SSE2.h.

00739 {
00740     __m128i mask = ssp_comlt_epu16_SSE2( a, b );
00741     a = ssp_logical_bitwise_select_SSE2( a, b, mask );
00742     return a;
00743 }

SSP_FORCEINLINE __m128i ssp_min_epu32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_min_epu32 [SSE4.1]. (Searches MSDN)

Definition at line 774 of file SSEPlus_emulation_SSE2.h.

00775 {
00776     __m128i mask = ssp_comlt_epu32_SSE2( a, b );
00777     a = ssp_logical_bitwise_select_SSE2( a, b, mask );
00778     return a;
00779 }

SSP_FORCEINLINE __m128d ssp_movedup_pd_SSE2 ( __m128d  a  ) 

SSE2 implementation of _mm_movedup_pd [SSE3]. (Searches MSDN)

Definition at line 1757 of file SSEPlus_emulation_SSE2.h.

01758 {
01759     ssp_m128 A;
01760     A.d = a;
01761     return _mm_set_pd( A.f64[0], A.f64[0] );
01762 }

SSP_FORCEINLINE __m128 ssp_movehdup_ps_SSE2 ( __m128  a  ) 

SSE2 implementation of _mm_movehdup_ps [SSE3]. (Searches MSDN)

Definition at line 1739 of file SSEPlus_emulation_SSE2.h.

01740 {
01741     ssp_m128 A;
01742     A.f = a;
01743     A.i = _mm_shuffle_epi32( A.i, _MM_SHUFFLE( 3, 3, 1, 1) );
01744     return A.f;
01745 }

SSP_FORCEINLINE __m128 ssp_moveldup_ps_SSE2 ( __m128  a  ) 

SSE2 implementation of _mm_moveldup_ps [SSE3]. (Searches MSDN)

Definition at line 1748 of file SSEPlus_emulation_SSE2.h.

01749 {
01750     ssp_m128 A;
01751     A.f = a;
01752     A.i = _mm_shuffle_epi32( A.i, _MM_SHUFFLE( 2, 2, 0, 0) );
01753     return A.f;
01754 }

SSP_FORCEINLINE __m128i ssp_mpsadbw_epu8_SSE2 ( __m128i  a,
__m128i  b,
const int  msk 
)

SSE2 implementation of _mm_mpsadbw_epu8 [SSE4.1]. (Searches MSDN)

Definition at line 822 of file SSEPlus_emulation_SSE2.h.

00823 {
00824     const static __m128i MASK_BITS04 = SSP_CONST_SET_16I( 0,0,0,0xFFFF,0,0,0,0xFFFF );
00825     const static __m128i MASK_BITS15 = SSP_CONST_SET_16I( 0,0,0xFFFF,0,0,0,0xFFFF,0 );
00826     const static __m128i MASK_BITS26 = SSP_CONST_SET_16I( 0,0xFFFF,0,0,0,0xFFFF,0,0 );
00827     const static __m128i MASK_BITS37 = SSP_CONST_SET_16I( 0xFFFF,0,0,0,0xFFFF,0,0,0 );
00828 
00829     ssp_m128 A,B,A16,tmp,out;
00830     A.i = a;
00831     B.i = b;
00832 
00833     switch( msk & 0x4 )         // Possible values: 0, 4
00834     {
00835     case 4: A.i = _mm_srli_si128( A.i, 4 );
00836     }
00837 
00838     switch( (msk & 0x3) * 4 )   // Possible values: 0, 4, 8, 12
00839     {
00840     case 0:     B.i = _mm_shuffle_epi32( B.i, _MM_SHUFFLE(0,0,0,0) ); break;
00841     case 4:     B.i = _mm_shuffle_epi32( B.i, _MM_SHUFFLE(1,1,1,1) ); break;
00842     case 8:     B.i = _mm_shuffle_epi32( B.i, _MM_SHUFFLE(2,2,2,2) ); break;
00843     case 12:    B.i = _mm_shuffle_epi32( B.i, _MM_SHUFFLE(3,3,3,3) ); break;
00844     //default: ASSERT( false );
00845     }
00846 
00847     // out[0,4]
00848     B.i   = _mm_unpacklo_epi8( B.i, _mm_setzero_si128() );          // 1 2 3 4 | 1 2 3 4
00849     A16.i = _mm_unpacklo_epi8( A.i, _mm_setzero_si128() );          // a b c d | e f g h
00850     tmp.i = _mm_subs_epi16                 ( A16.i, B.i );          // a-1,b-2,c-3,d-4 | e-1,f-2,g-3,h-4
00851     tmp.i = ssp_abs_epi16_SSE2             ( tmp.i    );            // abs(a-1),abs(b-2),...,abs(h-4) | ...
00852     tmp.i = ssp_arithmetic_hadd4_epi16_SSE2( tmp.i, 0 );            // x,x,x,abs(a-1)+abs(b-2)+abs(c-3)+abs(d-4) | ...
00853     tmp.i = _mm_and_si128                  ( tmp.i, MASK_BITS04 );  // 0,0,0,abs(a-1)+abs(b-2)+abs(c-3)+abs(d-4) | ...
00854     out.i = tmp.i;
00855 
00856     // out[1,5]
00857     A16.i = _mm_srli_si128   ( A.i, 1 );
00858     A16.i = _mm_unpacklo_epi8( A16.i, _mm_setzero_si128() );        // b c | d e | f g | h i
00859     tmp.i = _mm_subs_epi16                 ( A16.i, B.i );
00860     tmp.i = ssp_abs_epi16_SSE2             ( tmp.i    );
00861     tmp.i = ssp_arithmetic_hadd4_epi16_SSE2( tmp.i, 1 );
00862     tmp.i = _mm_and_si128                  ( tmp.i, MASK_BITS15 );
00863     out.i = _mm_or_si128                   ( out.i, tmp.i );
00864 
00865     // out[2,6]
00866     A16.i = _mm_srli_si128   ( A.i, 2 );
00867     A16.i = _mm_unpacklo_epi8( A16.i, _mm_setzero_si128() );        // c d | e f | g h | i j
00868     tmp.i = _mm_subs_epi16                 ( A16.i, B.i );
00869     tmp.i = ssp_abs_epi16_SSE2             ( tmp.i    );
00870     tmp.i = ssp_arithmetic_hadd4_epi16_SSE2( tmp.i, 2 );
00871     tmp.i = _mm_and_si128                  ( tmp.i, MASK_BITS26 );
00872     out.i = _mm_or_si128                   ( out.i, tmp.i );
00873 
00874     // out[3,7]
00875     A16.i = _mm_srli_si128   ( A.i, 3 );
00876     A16.i = _mm_unpacklo_epi8( A16.i, _mm_setzero_si128() );        // d e | f g | h i | j k
00877     tmp.i = _mm_subs_epi16                 ( A16.i, B.i );
00878     tmp.i = ssp_abs_epi16_SSE2             ( tmp.i    );
00879     tmp.i = ssp_arithmetic_hadd4_epi16_SSE2( tmp.i, 3 );
00880     tmp.i = _mm_and_si128                  ( tmp.i, MASK_BITS37 );
00881     out.i = _mm_or_si128                   ( out.i, tmp.i );
00882 
00883     return out.i;
00884 }

SSP_FORCEINLINE __m128d ssp_msub_pd_SSE2 ( __m128d  a,
__m128d  b,
__m128d  c 
)

SSE2 implementation of _mm_msub_pd/fmsubpd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 305 of file SSEPlus_emulation_SSE2.h.

00306 {
00307     a = _mm_mul_pd( a, b );
00308     a = _mm_sub_pd( a, c );
00309     return a;
00310 }

SSP_FORCEINLINE __m128 ssp_msub_ps_SSE2 ( __m128  a,
__m128  b,
__m128  c 
)

SSE2 implementation of _mm_msub_ps/fmsubps [SSE5]. (SSE5 .pdf documentation here)

Definition at line 297 of file SSEPlus_emulation_SSE2.h.

00298 {
00299     a = _mm_mul_ps( a, b );
00300     a = _mm_sub_ps( a, c );
00301     return a;
00302 }

SSP_FORCEINLINE __m128d ssp_msub_sd_SSE2 ( __m128d  a,
__m128d  b,
__m128d  c 
)

SSE2 implementation of _mm_msub_sd/fmsubsd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 326 of file SSEPlus_emulation_SSE2.h.

00327 {
00328     const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0, 0 );
00329 
00330     ssp_m128 A,B;
00331     A.d = a;
00332     B.d = b;
00333     B.d = ssp_msub_pd_SSE2( A.d, B.d, c );
00334     B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask ); // This was faster than using 2 shuffles
00335     return B.d;
00336 }

SSP_FORCEINLINE __m128 ssp_msub_ss_SSE2 ( __m128  a,
__m128  b,
__m128  c 
)

SSE2 implementation of _mm_msub_ss/fmsubss [SSE5]. (SSE5 .pdf documentation here)

Definition at line 313 of file SSEPlus_emulation_SSE2.h.

00314 {
00315     const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0 );
00316 
00317     ssp_m128 A,B;
00318     A.f = a;
00319     B.f = b;
00320     B.f = ssp_msub_ps_SSE2( A.f, B.f, c );
00321     B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask ); // This was faster than using 2 shuffles
00322     return B.f;
00323 }

SSP_FORCEINLINE __m128i ssp_mulhrs_epi16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_mulhrs_epi16 [SSSE3]. (Searches MSDN)

Definition at line 674 of file SSEPlus_emulation_SSE2.h.

00675 {
00676     const static __m128i VAL = SSP_CONST_SET_32I( 0x4000, 0x4000, 0x4000, 0x4000 );
00677     __m128i c,d;   
00678 
00679     c = _mm_mullo_epi16( a, b );
00680     d = _mm_mulhi_epi16( a, b );
00681    
00682     a = _mm_unpackhi_epi16( c, d );
00683     b = _mm_unpacklo_epi16( c, d );
00684 
00685     a = _mm_add_epi32( a, VAL );
00686     b = _mm_add_epi32( b, VAL );
00687 
00688     a = _mm_srai_epi32( a, 15 );
00689     b = _mm_srai_epi32( b, 15 );
00690 
00691     a = _mm_packs_epi32( b, a );
00692     return a;
00693 }

SSP_FORCEINLINE __m128d ssp_nmacc_pd_SSE2 ( __m128d  a,
__m128d  b,
__m128d  c 
)

SSE2 implementation of _mm_nmacc_pd/fnmaddpd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 256 of file SSEPlus_emulation_SSE2.h.

00257 {
00258     const static __m128d neg1 = SSP_CONST_SET_64F( -1.0, -1.0 );
00259 
00260     a = _mm_mul_pd( a, b    );
00261     a = _mm_mul_pd( a, neg1 );
00262     a = _mm_add_pd( a, c    );
00263     return a;
00264 }

SSP_FORCEINLINE __m128 ssp_nmacc_ps_SSE2 ( __m128  a,
__m128  b,
__m128  c 
)

SSE2 implementation of _mm_nmacc_ps/fnmaddps [SSE5]. (SSE5 .pdf documentation here)

Definition at line 245 of file SSEPlus_emulation_SSE2.h.

00246 {
00247     const static __m128 neg1 = SSP_CONST_SET_32F( -1.0f, -1.0f, -1.0f, -1.0f );
00248 
00249     a = _mm_mul_ps( a, b    );
00250     a = _mm_mul_ps( a, neg1 );
00251     a = _mm_add_ps( a, c    );
00252     return a;
00253 }

SSP_FORCEINLINE __m128d ssp_nmacc_sd_SSE2 ( __m128d  a,
__m128d  b,
__m128d  c 
)

SSE2 implementation of _mm_nmacc_sd/fnmaddsd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 280 of file SSEPlus_emulation_SSE2.h.

00281 {
00282     const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0, 0 );
00283 
00284     ssp_m128 A,B;
00285     A.d = a;
00286     B.d = b;
00287     B.d = ssp_nmacc_pd_SSE2( A.d, B.d, c );
00288     B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask ); // This was faster than using 2 shuffles
00289     return B.d;
00290 }

SSP_FORCEINLINE __m128 ssp_nmacc_ss_SSE2 ( __m128  a,
__m128  b,
__m128  c 
)

SSE2 implementation of _mm_nmacc_ss/fnmaddss [SSE5]. (SSE5 .pdf documentation here)

Definition at line 267 of file SSEPlus_emulation_SSE2.h.

00267                                                                                                                                         : confirm
00268 {
00269     const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0 );
00270 
00271     ssp_m128 A,B;
00272     A.f = a;
00273     B.f = b;
00274     B.f = ssp_nmacc_ps_SSE2( A.f, B.f, c );
00275     B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask ); // This was faster than using 2 shuffles
00276     return B.f;
00277 }

SSP_FORCEINLINE __m128d ssp_nmsub_pd_SSE2 ( __m128d  a,
__m128d  b,
__m128d  c 
)

SSE2 implementation of _mm_nmsub_pd/fnmsubpd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 354 of file SSEPlus_emulation_SSE2.h.

00355 {
00356     const static __m128d neg1 = SSP_CONST_SET_64F( -1.0, -1.0 );
00357 
00358     a = _mm_mul_pd( a, b    );
00359     a = _mm_mul_pd( a, neg1 );
00360     a = _mm_sub_pd( a, c    );
00361     return a;
00362 }

SSP_FORCEINLINE __m128 ssp_nmsub_ps_SSE2 ( __m128  a,
__m128  b,
__m128  c 
)

SSE2 implementation of _mm_nmsub_ps/fnmsubps [SSE5]. (SSE5 .pdf documentation here)

Definition at line 343 of file SSEPlus_emulation_SSE2.h.

00344 {
00345     const static __m128 neg1 = SSP_CONST_SET_32F( -1.0f, -1.0f, -1.0f, -1.0f );
00346 
00347     a = _mm_mul_ps( a, b    );
00348     a = _mm_mul_ps( a, neg1 );
00349     a = _mm_sub_ps( a, c    );
00350     return a;
00351 }

SSP_FORCEINLINE __m128d ssp_nmsub_sd_SSE2 ( __m128d  a,
__m128d  b,
__m128d  c 
)

SSE2 implementation of _mm_nmsub_sd/fnmsubsd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 378 of file SSEPlus_emulation_SSE2.h.

00379 {
00380     const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0, 0 );
00381 
00382     ssp_m128 A,B;
00383     A.d = a;
00384     B.d = b;
00385     B.d = ssp_nmsub_pd_SSE2( A.d, B.d, c );
00386     B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask ); // This was faster than using 2 shuffles
00387     return B.d;
00388 }

SSP_FORCEINLINE __m128 ssp_nmsub_ss_SSE2 ( __m128  a,
__m128  b,
__m128  c 
)

SSE2 implementation of _mm_nmsub_ss/fnmsubss [SSE5]. (SSE5 .pdf documentation here)

Definition at line 365 of file SSEPlus_emulation_SSE2.h.

00366 {
00367     const static __m128i mask = SSP_CONST_SET_32I( SSP_ALL_SET_32I, SSP_ALL_SET_32I, SSP_ALL_SET_32I, 0 );
00368 
00369     ssp_m128 A,B;
00370     A.f = a;
00371     B.f = b;
00372     B.f = ssp_nmsub_ps_SSE2( A.f, B.f, c );
00373     B.i = ssp_logical_bitwise_select_SSE2( A.i, B.i, mask ); // This was faster than using 2 shuffles
00374     return B.f;
00375 }

SSP_FORCEINLINE __m128i ssp_packus_epi32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_packus_epi32 [SSE4.1]. (Searches MSDN)

Definition at line 1283 of file SSEPlus_emulation_SSE2.h.

01284 {
01285     const static __m128i val_32 = SSP_CONST_SET_32I(  0x8000, 0x8000, 0x8000, 0x8000 );
01286     const static __m128i val_16 = SSP_CONST_SET_16I(  0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000 );
01287 
01288     a = _mm_sub_epi32( a, val_32 );
01289     b = _mm_sub_epi32( b, val_32 );
01290     a = _mm_packs_epi32( a, b );
01291     a = _mm_add_epi16( a, val_16 );
01292     return a;
01293 }

SSP_FORCEINLINE __m128i ssp_rot_epi16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_rot_epi16/ protw [SSE5]. (SSE5 .pdf documentation here)

Definition at line 1794 of file SSEPlus_emulation_SSE2.h.

01795 {
01796     int n;
01797     ssp_m128 A,B;
01798     A.i = a;
01799     B.i = b;
01800 
01801     for( n = 0; n < 8; n++ )
01802     {
01803       if( B.s16[n] < 0 )
01804       {
01805         unsigned int count = (-B.s16[n]) % 16;
01806         unsigned int carry_count = (16 - count) % 16;
01807         ssp_u16 carry = A.u16[n] << carry_count;
01808         A.u16[n] = A.u16[n] >> count;
01809         A.u16[n] = A.u16[n] | carry;
01810       }
01811       else
01812       {
01813         unsigned int count = B.s16[n] % 8;
01814         unsigned int carry_count = (16 - count) % 16;
01815         ssp_u16 carry = A.u16[n] >> carry_count;
01816         A.u16[n] = A.u16[n] << count;
01817         A.u16[n] = A.u16[n] | carry;
01818       }
01819     }
01820     return A.i;
01821 }

SSP_FORCEINLINE __m128i ssp_rot_epi32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_rot_epi32/ protd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 1823 of file SSEPlus_emulation_SSE2.h.

01824 {
01825     int n;
01826     ssp_m128 A,B;
01827     A.i = a;
01828     B.i = b;
01829 
01830     for( n = 0; n < 4; n++ )
01831     {
01832       if( B.s32[n] < 0 )
01833       {
01834         unsigned int count = (-B.s32[n]) % 32;
01835         unsigned int carry_count = (32 - count) % 32;
01836         ssp_u32 carry = A.u32[n] << carry_count;
01837         A.u32[n] = A.u32[n] >> count;
01838         A.u32[n] = A.u32[n] | carry;
01839       }
01840       else
01841       {
01842         unsigned int count = B.s32[n] % 32;
01843         unsigned int carry_count = (32 - count) % 32;
01844         ssp_u32 carry = A.u32[n] >> carry_count;
01845         A.u32[n] = A.u32[n] << count;
01846         A.u32[n] = A.u32[n] | carry;
01847       }
01848     }
01849     return A.i;
01850 }

SSP_FORCEINLINE __m128i ssp_rot_epi64_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_rot_epi64/ protq [SSE5]. (SSE5 .pdf documentation here)

Definition at line 1852 of file SSEPlus_emulation_SSE2.h.

01853 {
01854     int n;
01855     ssp_m128 A,B;
01856     A.i = a;
01857     B.i = b;
01858 
01859     for( n = 0; n < 2; n++ )
01860     {
01861       if( B.s64[n] < 0 )
01862       {
01863         unsigned int count = (unsigned int)((-B.s64[n]) % 64);
01864         unsigned int carry_count = (64 - count) % 64;
01865         ssp_u64 carry = A.u64[n] << carry_count;
01866         A.u64[n] = A.u64[n] >> count;
01867         A.u64[n] = A.u64[n] | carry;
01868       }
01869       else
01870       {
01871         unsigned int count = (unsigned int)(B.s64[n] % 64);
01872         unsigned int carry_count = (64 - count) % 64;
01873         ssp_u64 carry = A.u64[n] >> carry_count;
01874         A.u64[n] = A.u64[n] << count;
01875         A.u64[n] = A.u64[n] | carry;
01876       }
01877     }
01878     return A.i;
01879 }

SSP_FORCEINLINE __m128i ssp_rot_epi8_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_rot_epi8/ protb [SSE5]. (SSE5 .pdf documentation here)

Definition at line 1765 of file SSEPlus_emulation_SSE2.h.

01766 {
01767     int n;
01768     ssp_m128 A,B;
01769     A.i = a;
01770     B.i = b;
01771 
01772     for( n = 0; n < 16; n++ )
01773     {
01774       if( B.s8[n] < 0 )
01775       {
01776         unsigned int count = (-B.s8[n]) % 8;
01777         unsigned int carry_count = (8 - count) % 8;
01778         unsigned char carry = A.u8[n] << carry_count;
01779         A.u8[n] = A.u8[n] >> count;
01780         A.u8[n] = A.u8[n] | carry;
01781       }
01782       else
01783       {
01784         unsigned int count = B.s8[n] % 8;
01785         unsigned int carry_count = (8 - count) % 8;
01786         unsigned char carry = A.u8[n] >> carry_count;
01787         A.u8[n] = A.u8[n] << count;
01788         A.u8[n] = A.u8[n] | carry;
01789       }
01790     }
01791     return A.i;
01792 }

SSP_FORCEINLINE __m128i ssp_roti_epi16_SSE2 ( __m128i  a,
const int  b 
)

SSE2 implementation of _mm_roti_epi16/ protw [SSE5]. (SSE5 .pdf documentation here)

Definition at line 1907 of file SSEPlus_emulation_SSE2.h.

01908 {   // Rotates all eight 16-bit lanes of a by the same signed count b (negative = right, otherwise left).
01909     ssp_m128 A;
01910     A.i = a;
01911 
01912     if( b < 0 )   // negative count: rotate right by |b| mod 16
01913     {
01914         const unsigned int count = (-b) % 16;   // NOTE(review): -b overflows when b == INT_MIN
01915         const unsigned int carry_count = (16 - count) % 16;   // outer % 16 keeps this at 0 when count is 0
01916         __m128i t = _mm_slli_epi16( A.i, carry_count );   // bits that wrap around to the high end of each lane
01917         A.i = _mm_srli_epi16( A.i, count );
01918         A.i = _mm_or_si128( A.i, t );
01919     }
01920     else   // zero or positive count: rotate left by b mod 16
01921     {
01922         const unsigned int count = b % 16;
01923         const unsigned int carry_count = (16 - count) % 16;
01924         __m128i t = _mm_srli_epi16( A.i, carry_count );   // bits that wrap around to the low end of each lane
01925         A.i = _mm_slli_epi16( A.i, count );
01926         A.i = _mm_or_si128( A.i, t );
01927     }
01928 
01929     return A.i;
01930 }

SSP_FORCEINLINE __m128i ssp_roti_epi32_SSE2 ( __m128i  a,
const int  b 
)

SSE2 implementation of _mm_roti_epi32/ protd [SSE5]. (SSE5 .pdf documentation here)

Definition at line 1932 of file SSEPlus_emulation_SSE2.h.

01933 {   // Rotates all four 32-bit lanes of a by the same signed count b (negative = right, otherwise left).
01934     ssp_m128 A;
01935     A.i = a;
01936 
01937     if( b < 0 )   // negative count: rotate right by |b| mod 32
01938     {
01939         const unsigned int count = (-b) % 32;   // NOTE(review): -b overflows when b == INT_MIN
01940         const unsigned int carry_count = (32 - count) % 32;   // outer % 32 keeps this at 0 when count is 0
01941         __m128i t = _mm_slli_epi32( A.i, carry_count );   // bits that wrap around to the high end of each lane
01942         A.i = _mm_srli_epi32( A.i, count );
01943         A.i = _mm_or_si128( A.i, t );
01944     }
01945     else   // zero or positive count: rotate left by b mod 32
01946     {
01947         const unsigned int count = b % 32;
01948         const unsigned int carry_count = (32 - count) % 32;
01949         __m128i t = _mm_srli_epi32( A.i, carry_count );   // bits that wrap around to the low end of each lane
01950         A.i = _mm_slli_epi32( A.i, count );
01951         A.i = _mm_or_si128( A.i, t );
01952     }
01953 
01954     return A.i;
01955 }

SSP_FORCEINLINE __m128i ssp_roti_epi64_SSE2 ( __m128i  a,
const int  b 
)

SSE2 implementation of _mm_roti_epi64/ protq [SSE5]. (SSE5 .pdf documentation here)

Definition at line 1957 of file SSEPlus_emulation_SSE2.h.

01958 {   // Rotates both 64-bit lanes of a by the same signed count b (negative = right, otherwise left).
01959     ssp_m128 A;
01960     A.i = a;
01961 
01962     if( b < 0 )   // negative count: rotate right by |b| mod 64
01963     {
01964         const unsigned int count = (-b) % 64;   // NOTE(review): -b overflows when b == INT_MIN
01965         const unsigned int carry_count = (64 - count) % 64;   // outer % 64 keeps this at 0 when count is 0
01966         __m128i t = _mm_slli_epi64( A.i, carry_count );   // bits that wrap around to the high end of each lane
01967         A.i = _mm_srli_epi64( A.i, count );
01968         A.i = _mm_or_si128( A.i, t );
01969     }
01970     else   // zero or positive count: rotate left by b mod 64
01971     {
01972         const unsigned int count = b % 64;
01973         const unsigned int carry_count = (64 - count) % 64;
01974         __m128i t = _mm_srli_epi64( A.i, carry_count );   // bits that wrap around to the low end of each lane
01975         A.i = _mm_slli_epi64( A.i, count );
01976         A.i = _mm_or_si128( A.i, t );
01977     }
01978 
01979     return A.i;
01980 }

SSP_FORCEINLINE __m128i ssp_roti_epi8_SSE2 ( __m128i  a,
const int  b 
)

SSE2 implementation of _mm_roti_epi8/ protb [SSE5]. (SSE5 .pdf documentation here)

Definition at line 1882 of file SSEPlus_emulation_SSE2.h.

01883 {   // Rotates all sixteen bytes of a by the same signed count b (negative = right, otherwise left).
01884     ssp_m128 A;
01885     A.i = a;
01886 
01887     if( b < 0 )   // negative count: rotate right by |b| mod 8
01888     {
01889         const unsigned int count = (-b) % 8;   // NOTE(review): -b overflows when b == INT_MIN
01890         const unsigned int carry_count = (8 - count) % 8;   // outer % 8 keeps this at 0 when count is 0
01891         __m128i t = ssp_slli_epi8_SSE2( A.i, carry_count );   // presumably a per-byte left shift (helper defined elsewhere)
01892         A.i = ssp_srli_epi8_SSE2( A.i, count );   // presumably a per-byte logical right shift (helper defined elsewhere)
01893         A.i = _mm_or_si128( A.i, t );
01894     }
01895     else   // zero or positive count: rotate left by b mod 8
01896     {
01897         const unsigned int count = b % 8;
01898         const unsigned int carry_count = (8 - count) % 8;
01899         __m128i t = ssp_srli_epi8_SSE2( A.i, carry_count );   // bits that wrap around to the low end of each byte
01900         A.i = ssp_slli_epi8_SSE2( A.i, count );
01901         A.i = _mm_or_si128( A.i, t );
01902     }
01903 
01904     return A.i;
01905 }

SSP_FORCEINLINE __m128d ssp_round_pd_SSE2 ( __m128d  a,
int  iRoundMode 
)

Definition at line 985 of file SSEPlus_emulation_SSE2.h.

00986 {   // Rounds the doubles in a by converting them to 32-bit integers and back under a temporarily selected MXCSR value.
00987     #pragma message( "" WARN() "SSEPlus SSE2 rounding functions overflow if input outside 32 bit integer range" )
00988 
00989     enum ENUM_MXCSR   // whole MXCSR values handed to _mm_setcsr() below, one per rounding mode
00990     {
00991         CSR_ROUND_TO_EVEN = 0x00001F80, //
00992         CSR_ROUND_DOWN    = 0x00003F80, //
00993         CSR_ROUND_UP      = 0x00005F80, //
00994         CSR_ROUND_TRUNC   = 0x00007F80, //(_mm_getcsr() & ~_mm_ROUND_MASK) | _mm_ROUND_TOWARD_ZERO;
00995     }; 
00996 
00997     ssp_u32 bak = _mm_getcsr();   // caller's MXCSR, restored before returning
00998     ssp_m128 A, i;
00999     A.d = a;
01000     
01001 
01002     switch( iRoundMode & 0x3 )   // only the two low bits of iRoundMode pick the mode
01003     {
01004     case SSP_FROUND_CUR_DIRECTION:                                      break;   // NOTE(review): unreachable if this constant has bits above 0x3 - confirm its definition
01005     case SSP_FROUND_TO_ZERO:            _mm_setcsr( CSR_ROUND_TRUNC  ); break;
01006     case SSP_FROUND_TO_POS_INF:         _mm_setcsr( CSR_ROUND_UP     ); break;
01007     case SSP_FROUND_TO_NEG_INF:         _mm_setcsr( CSR_ROUND_DOWN   ); break;
01008     default:                            _mm_setcsr( CSR_ROUND_TO_EVEN); break;
01009     }
01010     
01011     i.i    = _mm_cvtpd_epi32( A.d );    // Convert to integer
01012     A.d    = _mm_cvtepi32_pd( i.i );    // Convert back to double
01013 
01014     i.u32[0] = bak;                     // Workaround for a bug in the MSVC compiler. MSVC was hoisting the mxcsr restore above the converts. 
01015     _mm_setcsr( i.u32[0] );             
01016     return A.d;
01017 }

SSP_FORCEINLINE __m128 ssp_round_ps_SSE2 ( __m128  a,
int  iRoundMode 
)

SSE2 implementation of _mm_round_ps [SSE4.1]. (Searches MSDN)
NOTE_1: When rounding from negative numbers to zero, this function returns 0 and NOT -0.
If this behavior is desired, use the slower function ssp_round_ps_neg_zero_SSE2().
NOTE_2: This function should be used only with input in the range (-2,147,483,648 -> 2,147,483,647)
If a greater range is desired, use the slower function ssp_round_ps_REF().

Definition at line 950 of file SSEPlus_emulation_SSE2.h.

00951 {   // Rounds the four floats in a by converting them to 32-bit integers and back under a temporarily selected MXCSR value.
00952     #pragma message( "" WARN() "SSEPlus SSE2 rounding functions overflow if input outside 32 bit integer range" )
00953 
00954     enum ENUM_MXCSR   // whole MXCSR values handed to _mm_setcsr() below, one per rounding mode
00955     {
00956         CSR_ROUND_TO_EVEN = 0x00001F80, //
00957         CSR_ROUND_DOWN    = 0x00003F80, //
00958         CSR_ROUND_UP      = 0x00005F80, //
00959         CSR_ROUND_TRUNC   = 0x00007F80, //(_mm_getcsr() & ~_mm_ROUND_MASK) | _mm_ROUND_TOWARD_ZERO;
00960     }; 
00961 
00962     ssp_u32 bak = _mm_getcsr();   // caller's MXCSR, restored before returning
00963     ssp_m128 A, i;
00964     A.f = a;
00965 
00966     switch( iRoundMode & 0x3 )   // only the two low bits of iRoundMode pick the mode
00967     {
00968     case SSP_FROUND_CUR_DIRECTION:                                      break;   // NOTE(review): unreachable if this constant has bits above 0x3 - confirm its definition
00969     case SSP_FROUND_TO_ZERO:            _mm_setcsr( CSR_ROUND_TRUNC  ); break;
00970     case SSP_FROUND_TO_POS_INF:         _mm_setcsr( CSR_ROUND_UP     ); break;
00971     case SSP_FROUND_TO_NEG_INF:         _mm_setcsr( CSR_ROUND_DOWN   ); break;
00972     default:                            _mm_setcsr( CSR_ROUND_TO_EVEN); break;
00973     }
00974     
00975     i.i    = _mm_cvtps_epi32( A.f );    // Convert to integer
00976     A.f    = _mm_cvtepi32_ps( i.i );    // Convert back to float
00977 
00978     i.u32[0] = bak;                     // Workaround for a bug in the MSVC compiler. MSVC was hoisting the mxcsr restore above the converts. 
00979     _mm_setcsr( i.u32[0] );
00980     return A.f;
00981 }

SSP_FORCEINLINE __m128 ssp_round_ss_SSE2 ( __m128  a,
__m128  b,
int  iRoundMode 
)

SSE2 implementation of _mm_round_ss [SSE4.1]. (Searches MSDN)

Definition at line 1021 of file SSEPlus_emulation_SSE2.h.

01022 {   // Returns { round(b0), a1, a2, a3 }: lane 0 is the rounded low float of b, lanes 1-3 are copied from a.
01023         //Commented code will generate linker error in x64 platform
01024     //ssp_m128 A,B;
01025     //A.f = a;
01026     //B.f = ssp_round_ps_SSE2( b, iRoundMode );
01027 
01028     //A.f = _mm_move_ss( A.f, B.f );
01029 
01031         //return A.f;
01032         b = ssp_round_ps_SSE2(b, iRoundMode);               // all four lanes of b are rounded; only lane 0 is kept below
01033         b =    _mm_shuffle_ps(b, a, _MM_SHUFFLE(1,1,0,0));  // b = { b0, b0, a1, a1 }
01034     return _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,2,0)); // picks b[0], b[2], a[2], a[3] = { b0, a1, a2, a3 }
01035 }

SSP_FORCEINLINE __m128i ssp_sha_epi16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of ssp_sha_epi16/pshaw [SSE5]. (SSE5 .pdf documentation here)

Definition at line 2100 of file SSEPlus_emulation_SSE2.h.

02101 {   // Per-word arithmetic shift of a: each 16-bit lane is shifted left for a non-negative count and right (sign-filling) for a negative one; the count is the signed low byte of the matching word of b.
02102     __m128i v1, v2, mask, mask2, b1, b2;   // v1 collects right-shifted lanes, v2 left-shifted lanes
02103     b1 = ssp_abs_epi8_SSE2( b );   // shift magnitudes (presumably per-byte |b|; helper defined elsewhere)
02104     mask = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, -1 );   // selects word 0; slid up one word after every step
02105     mask2 = _mm_srli_epi16( mask, 12 ); // the shift count is a 4 bit value
02106 
02107     b2 = _mm_and_si128( b1, mask2 );   // count for word 0, isolated in the low bits of the vector
02108     v1 = _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ); // negative shift
02109     v2 = _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ); // positive shift
02110     mask = _mm_slli_si128( mask, 2 );   // move the lane selector to the next word
02111     b1 = _mm_srli_si128( b1, 2 );   // bring the next word's count down to word 0
02112 
02113     b2 = _mm_and_si128( b1, mask2 );   // count for word 1
02114     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) ); // negative shift
02115     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02116     mask = _mm_slli_si128( mask, 2 );
02117     b1 = _mm_srli_si128( b1, 2 );
02118 
02119     b2 = _mm_and_si128( b1, mask2 );   // count for word 2
02120     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) ); // negative shift
02121     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02122     mask = _mm_slli_si128( mask, 2 );
02123     b1 = _mm_srli_si128( b1, 2 );
02124 
02125     b2 = _mm_and_si128( b1, mask2 );   // count for word 3
02126     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) ); // negative shift
02127     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02128     mask = _mm_slli_si128( mask, 2 );
02129     b1 = _mm_srli_si128( b1, 2 );
02130 
02131     b2 = _mm_and_si128( b1, mask2 );   // count for word 4
02132     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) ); // negative shift
02133     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02134     mask = _mm_slli_si128( mask, 2 );
02135     b1 = _mm_srli_si128( b1, 2 );
02136 
02137     b2 = _mm_and_si128( b1, mask2 );   // count for word 5
02138     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) ); // negative shift
02139     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02140     mask = _mm_slli_si128( mask, 2 );
02141     b1 = _mm_srli_si128( b1, 2 );
02142 
02143     b2 = _mm_and_si128( b1, mask2 );   // count for word 6
02144     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) ); // negative shift
02145     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02146     mask = _mm_slli_si128( mask, 2 );
02147     b1 = _mm_srli_si128( b1, 2 );
02148 
02149     b2 = _mm_and_si128( b1, mask2 );   // count for word 7
02150     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi16( a, b2 ), mask ) ); // negative shift
02151     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02152 
02153     mask = _mm_setzero_si128();
02154     mask = _mm_cmpgt_epi8( mask, b ); // set mask to 0xFF for all negative shift counts in b
02155     mask = _mm_slli_epi16( mask, 8 );   // keep only the flag of each word's low (count) byte ...
02156     mask = _mm_srai_epi16( mask, 8 );   // ... and sign-extend it over the whole word, so the unused high byte of b cannot produce a half-set mask
02157     v1 = _mm_and_si128( v1, mask );   // keep the right-shifted result where the count is negative
02158     mask = _mm_andnot_si128( mask, v2 );   // and the left-shifted result everywhere else
02159     v1 = _mm_or_si128( v1, mask );
02160     return v1;
02161 }

SSP_FORCEINLINE __m128i ssp_sha_epi32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of ssp_sha_epi32/pshad [SSE5]. (SSE5 .pdf documentation here)

Definition at line 2204 of file SSEPlus_emulation_SSE2.h.

02205 {   // Per-dword arithmetic shift of a: each 32-bit lane is shifted left for a non-negative count and right (sign-filling) for a negative one; the count is the signed low byte of the matching dword of b.
02206     __m128i v1, v2, mask, mask2, b1, b2;   // v1 collects right-shifted lanes, v2 left-shifted lanes
02207     b1 = ssp_abs_epi8_SSE2( b );   // shift magnitudes (presumably per-byte |b|; helper defined elsewhere)
02208     mask = _mm_set_epi32( 0, 0, 0, -1 );   // selects dword 0; slid up one dword after every step
02209     mask2 = _mm_srli_epi32( mask, 27 ); // the shift count is a 5 bit value
02210 
02211     b2 = _mm_and_si128( b1, mask2 );   // count for dword 0, isolated in the low bits of the vector
02212     v1 = _mm_and_si128( _mm_sra_epi32( a, b2 ), mask ); // negative shift
02213     v2 = _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ); // positive shift
02214     mask = _mm_slli_si128( mask, 4 );   // move the lane selector to the next dword
02215     b1 = _mm_srli_si128( b1, 4 );   // bring the next dword's count down to dword 0
02216 
02217     b2 = _mm_and_si128( b1, mask2 );   // count for dword 1
02218     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi32( a, b2 ), mask ) ); // negative shift
02219     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ) ); // positive shift
02220     mask = _mm_slli_si128( mask, 4 );
02221     b1 = _mm_srli_si128( b1, 4 );
02222 
02223     b2 = _mm_and_si128( b1, mask2 );   // count for dword 2
02224     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi32( a, b2 ), mask ) ); // negative shift
02225     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ) ); // positive shift
02226     mask = _mm_slli_si128( mask, 4 );
02227     b1 = _mm_srli_si128( b1, 4 );
02228 
02229     b2 = _mm_and_si128( b1, mask2 );   // count for dword 3
02230     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_sra_epi32( a, b2 ), mask ) ); // negative shift
02231     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ) ); // positive shift
02232 
02233     mask = _mm_setzero_si128();
02234     mask = _mm_cmpgt_epi8( mask, b ); // set mask to 0xFF for all negative shift counts in b
02235     mask = _mm_slli_epi32( mask, 24 );   // keep only the flag of each dword's low (count) byte ...
02236     mask = _mm_srai_epi32( mask, 24 );   // ... and sign-extend it over the whole dword
02237     v1 = _mm_and_si128( v1, mask );   // keep the right-shifted result where the count is negative
02238     mask = _mm_andnot_si128( mask, v2 );   // and the left-shifted result everywhere else
02239     v1 = _mm_or_si128( v1, mask );
02240     return v1;
02241 }

SSP_FORCEINLINE __m128i ssp_sha_epi64_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of ssp_sha_epi64/pshaq [SSE5]. (SSE5 .pdf documentation here)

Definition at line 2274 of file SSEPlus_emulation_SSE2.h.

02275 {   // Shifts each signed 64-bit lane of a by the signed byte stored at byte index n*8 of b: left for a non-negative count, right for a negative one.
02276     int n;
02277     ssp_m128 A,B;
02278     A.i = a;
02279     B.i = b;
02280 
02281     for( n = 0; n < 2; n++ )
02282     {
02283       if( B.s8[n*8] < 0 )   // only byte n*8 of b is consulted for lane n
02284       {
02285         unsigned int count = (-B.s8[n*8]) % 64;
02286         A.s64[n] = A.s64[n] >> count;   // right shift of a signed value; sign fill relies on the compiler's arithmetic shift
02287       }
02288       else
02289       {
02290         unsigned int count = B.s8[n*8] % 64;
02291         A.s64[n] = A.s64[n] << count;   // NOTE(review): left-shifting a negative signed value is undefined in ISO C
02292       }
02293     }
02294 
02295     return A.i;
02296 }

SSP_FORCEINLINE __m128i ssp_sha_epi8_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of ssp_sha_epi8/pshab [SSE5]. (SSE5 .pdf documentation here)

Definition at line 2011 of file SSEPlus_emulation_SSE2.h.

02012 {   // Shifts each signed byte of a by the signed count in the same byte of b: left for a non-negative count, right for a negative one.
02013     int n;
02014     ssp_m128 A,B;
02015     A.i = a;
02016     B.i = b;
02017 
02018     for( n = 0; n < 16; n++ )
02019     {
02020       if( B.s8[n] < 0 )   // negative count: shift right by |count| mod 8
02021       {
02022         unsigned int count = (-B.s8[n]) % 8;
02023         A.s8[n] = A.s8[n] >> count;   // right shift of a signed value; sign fill relies on the compiler's arithmetic shift
02024       }
02025       else   // zero or positive count: shift left by count mod 8
02026       {
02027         unsigned int count = B.s8[n] % 8;
02028         A.s8[n] = A.s8[n] << count;   // result is truncated back to 8 bits on assignment
02029       }
02030     }
02031 
02032     return A.i;
02033 }

SSP_FORCEINLINE __m128i ssp_shl_epi16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of ssp_shl_epi16/pshlw [SSE5]. (SSE5 .pdf documentation here)

Definition at line 2036 of file SSEPlus_emulation_SSE2.h.

02037 {   // Per-word logical shift of a: each 16-bit lane is shifted left for a non-negative count and right (zero-filling) for a negative one; the count is the signed low byte of the matching word of b.
02038     __m128i v1, v2, mask, mask2, b1, b2;   // v1 collects right-shifted lanes, v2 left-shifted lanes
02039     b1 = ssp_abs_epi8_SSE2( b );   // shift magnitudes (presumably per-byte |b|; helper defined elsewhere)
02040     mask = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, -1 );   // selects word 0; slid up one word after every step
02041     mask2 = _mm_srli_epi16( mask, 12 ); // the shift count is a 4 bit value
02042 
02043     b2 = _mm_and_si128( b1, mask2 );   // count for word 0, isolated in the low bits of the vector
02044     v1 = _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ); // negative shift
02045     v2 = _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ); // positive shift
02046     mask = _mm_slli_si128( mask, 2 );   // move the lane selector to the next word
02047     b1 = _mm_srli_si128( b1, 2 );   // bring the next word's count down to word 0
02048 
02049     b2 = _mm_and_si128( b1, mask2 );   // count for word 1
02050     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) ); // negative shift
02051     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02052     mask = _mm_slli_si128( mask, 2 );
02053     b1 = _mm_srli_si128( b1, 2 );
02054 
02055     b2 = _mm_and_si128( b1, mask2 );   // count for word 2
02056     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) ); // negative shift
02057     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02058     mask = _mm_slli_si128( mask, 2 );
02059     b1 = _mm_srli_si128( b1, 2 );
02060 
02061     b2 = _mm_and_si128( b1, mask2 );   // count for word 3
02062     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) ); // negative shift
02063     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02064     mask = _mm_slli_si128( mask, 2 );
02065     b1 = _mm_srli_si128( b1, 2 );
02066 
02067     b2 = _mm_and_si128( b1, mask2 );   // count for word 4
02068     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) ); // negative shift
02069     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02070     mask = _mm_slli_si128( mask, 2 );
02071     b1 = _mm_srli_si128( b1, 2 );
02072 
02073     b2 = _mm_and_si128( b1, mask2 );   // count for word 5
02074     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) ); // negative shift
02075     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02076     mask = _mm_slli_si128( mask, 2 );
02077     b1 = _mm_srli_si128( b1, 2 );
02078 
02079     b2 = _mm_and_si128( b1, mask2 );   // count for word 6
02080     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) ); // negative shift
02081     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02082     mask = _mm_slli_si128( mask, 2 );
02083     b1 = _mm_srli_si128( b1, 2 );
02084 
02085     b2 = _mm_and_si128( b1, mask2 );   // count for word 7
02086     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi16( a, b2 ), mask ) ); // negative shift
02087     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi16( a, b2 ), mask ) ); // positive shift
02088 
02089     mask = _mm_setzero_si128();
02090     mask = _mm_cmpgt_epi8( mask, b ); // set mask to 0xFF for all negative shift counts in b
02091     mask = _mm_slli_epi16( mask, 8 );   // keep only the flag of each word's low (count) byte ...
02092     mask = _mm_srai_epi16( mask, 8 );   // ... and sign-extend it over the whole word, so the unused high byte of b cannot produce a half-set mask
02093     v1 = _mm_and_si128( v1, mask );   // keep the right-shifted result where the count is negative
02094     mask = _mm_andnot_si128( mask, v2 );   // and the left-shifted result everywhere else
02095     v1 = _mm_or_si128( v1, mask );
02096     return v1;
02097 }

SSP_FORCEINLINE __m128i ssp_shl_epi32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of ssp_shl_epi32/pshld [SSE5]. (SSE5 .pdf documentation here)

Definition at line 2164 of file SSEPlus_emulation_SSE2.h.

02165 {   // Per-dword logical shift of a: each 32-bit lane is shifted left for a non-negative count and right (zero-filling) for a negative one; the count is the signed low byte of the matching dword of b.
02166     __m128i v1, v2, mask, mask2, b1, b2;   // v1 collects right-shifted lanes, v2 left-shifted lanes
02167     b1 = ssp_abs_epi8_SSE2( b );   // shift magnitudes (presumably per-byte |b|; helper defined elsewhere)
02168     mask = _mm_set_epi32( 0, 0, 0, -1 );   // selects dword 0; slid up one dword after every step
02169     mask2 = _mm_srli_epi32( mask, 27 ); // the shift count is a 5 bit value
02170 
02171     b2 = _mm_and_si128( b1, mask2 );   // count for dword 0, isolated in the low bits of the vector
02172     v1 = _mm_and_si128( _mm_srl_epi32( a, b2 ), mask ); // negative shift
02173     v2 = _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ); // positive shift
02174     mask = _mm_slli_si128( mask, 4 );   // move the lane selector to the next dword
02175     b1 = _mm_srli_si128( b1, 4 );   // bring the next dword's count down to dword 0
02176 
02177     b2 = _mm_and_si128( b1, mask2 );   // count for dword 1
02178     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi32( a, b2 ), mask ) ); // negative shift
02179     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ) ); // positive shift
02180     mask = _mm_slli_si128( mask, 4 );
02181     b1 = _mm_srli_si128( b1, 4 );
02182 
02183     b2 = _mm_and_si128( b1, mask2 );   // count for dword 2
02184     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi32( a, b2 ), mask ) ); // negative shift
02185     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ) ); // positive shift
02186     mask = _mm_slli_si128( mask, 4 );
02187     b1 = _mm_srli_si128( b1, 4 );
02188 
02189     b2 = _mm_and_si128( b1, mask2 );   // count for dword 3
02190     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi32( a, b2 ), mask ) ); // negative shift
02191     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi32( a, b2 ), mask ) ); // positive shift
02192 
02193     mask = _mm_setzero_si128();
02194     mask = _mm_cmpgt_epi8( mask, b ); // set mask to 0xFF for all negative shift counts in b
02195     mask = _mm_slli_epi32( mask, 24 );   // keep only the flag of each dword's low (count) byte ...
02196     mask = _mm_srai_epi32( mask, 24 );   // ... and sign-extend it over the whole dword
02197     v1 = _mm_and_si128( v1, mask );   // keep the right-shifted result where the count is negative
02198     mask = _mm_andnot_si128( mask, v2 );   // and the left-shifted result everywhere else
02199     v1 = _mm_or_si128( v1, mask );
02200     return v1;
02201 }

SSP_FORCEINLINE __m128i ssp_shl_epi64_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of ssp_shl_epi64/pshlq [SSE5]. (SSE5 .pdf documentation here)

Definition at line 2244 of file SSEPlus_emulation_SSE2.h.

02245 {   // Per-qword logical shift of a: each 64-bit lane is shifted left for a non-negative count and right (zero-filling) for a negative one; the count is the signed low byte of the matching qword of b.
02246     __m128i v1, v2, mask, mask2, b1, b2;   // v1 collects right-shifted lanes, v2 left-shifted lanes
02247     b1 = ssp_abs_epi8_SSE2( b );   // shift magnitudes (presumably per-byte |b|; helper defined elsewhere)
02248     mask = _mm_set_epi32( 0, 0, -1, -1 );   // selects the low qword; slid to the high qword for the second step
02249     mask2 = _mm_srli_epi64( mask, 58 ); // the shift count is a 6 bit value
02250 
02251     b2 = _mm_and_si128( b1, mask2 );   // count for qword 0
02252     v1 = _mm_and_si128( _mm_srl_epi64( a, b2 ), mask ); // negative shift
02253     v2 = _mm_and_si128( _mm_sll_epi64( a, b2 ), mask ); // positive shift
02254     mask = _mm_slli_si128( mask, 8 );
02255     b1 = _mm_srli_si128( b1, 8 );
02256 
02257     b2 = _mm_and_si128( b1, mask2 );   // count for qword 1
02258     v1 = _mm_or_si128( v1, _mm_and_si128( _mm_srl_epi64( a, b2 ), mask ) ); // negative shift
02259     v2 = _mm_or_si128( v2, _mm_and_si128( _mm_sll_epi64( a, b2 ), mask ) ); // positive shift
02260 
02261     mask = _mm_setzero_si128();
02262     mask = _mm_cmpgt_epi8( mask, b ); // set mask to 0xFF for all negative shift counts in b
02263     mask = _mm_slli_epi16( mask, 8 );   // keep only the low-byte flag of each word ...
02264     mask = _mm_srai_epi16( mask, 8 );   // ... and sign-extend it over the word
02265     mask = _mm_shufflelo_epi16( mask, _MM_SHUFFLE(0,0,0,0) );   // broadcast word 0 across the low qword
02266     mask = _mm_shufflehi_epi16( mask, _MM_SHUFFLE(0,0,0,0) );   // broadcast word 4 across the high qword
02267     v1 = _mm_and_si128( v1, mask );   // keep the right-shifted result where the count is negative
02268     mask = _mm_andnot_si128( mask, v2 );   // and the left-shifted result everywhere else
02269     v1 = _mm_or_si128( v1, mask );
02270     return v1;
02271 }

SSP_FORCEINLINE __m128i ssp_shl_epi8_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of ssp_shl_epi8/pshlb [SSE5]. (SSE5 .pdf documentation here)

Definition at line 1987 of file SSEPlus_emulation_SSE2.h.

01988 {   // Shifts each unsigned byte of a by the signed count in the same byte of b: left for a non-negative count, right (zero-filling) for a negative one.
01989     int n;
01990     ssp_m128 A,B;
01991     A.i = a;
01992     B.i = b;
01993 
01994     for( n = 0; n < 16; n++ )
01995     {
01996       if( B.s8[n] < 0 )   // negative count: shift right by |count| mod 8
01997       {
01998         unsigned int count = (-B.s8[n]) % 8;
01999         A.u8[n] = A.u8[n] >> count;
02000       }
02001       else   // zero or positive count: shift left by count mod 8
02002       {
02003         unsigned int count = B.s8[n] % 8;
02004         A.u8[n] = A.u8[n] << count;   // result is truncated back to 8 bits on assignment
02005       }
02006     }
02007     return A.i;
02008 }

SSP_FORCEINLINE __m128i ssp_shuffle_epi8_SSE2 ( __m128i  a,
__m128i  mask 
)

SSE2 implementation of _mm_shuffle_epi8 [SSSE3]. (Searches MSDN)

Definition at line 1601 of file SSEPlus_emulation_SSE2.h.

01602 {   // Builds each result byte n as a[mask[n] & 0x0F], then zeroes result bytes according to maskZero.
01603     ssp_m128 A,B, MASK, maskZero;       
01604     A.i        = a;
01605     maskZero.i = ssp_comge_epi8_SSE2( mask, _mm_setzero_si128()        );    // presumably 0xFF where the mask byte is >= 0 (helper defined elsewhere)
01606     MASK.i     = _mm_and_si128      ( mask, _mm_set1_epi8( (char)0x0F) );   // low four bits = source byte index 0..15
01607 
01608     B.s8[ 0] = A.s8[ (MASK.s8[ 0]) ];   // scalar gather, one byte at a time
01609         B.s8[ 1] = A.s8[ (MASK.s8[ 1]) ];
01610         B.s8[ 2] = A.s8[ (MASK.s8[ 2]) ];
01611         B.s8[ 3] = A.s8[ (MASK.s8[ 3]) ];
01612         B.s8[ 4] = A.s8[ (MASK.s8[ 4]) ];
01613         B.s8[ 5] = A.s8[ (MASK.s8[ 5]) ];
01614         B.s8[ 6] = A.s8[ (MASK.s8[ 6]) ];
01615         B.s8[ 7] = A.s8[ (MASK.s8[ 7]) ];
01616         B.s8[ 8] = A.s8[ (MASK.s8[ 8]) ];
01617         B.s8[ 9] = A.s8[ (MASK.s8[ 9]) ];
01618         B.s8[10] = A.s8[ (MASK.s8[10]) ];
01619         B.s8[11] = A.s8[ (MASK.s8[11]) ];
01620         B.s8[12] = A.s8[ (MASK.s8[12]) ];
01621         B.s8[13] = A.s8[ (MASK.s8[13]) ];
01622         B.s8[14] = A.s8[ (MASK.s8[14]) ];
01623         B.s8[15] = A.s8[ (MASK.s8[15]) ];  
01624 
01625     B.i = _mm_and_si128( B.i, maskZero.i );   // clear the bytes that maskZero does not select
01626         return B.i;
01627 }

SSP_FORCEINLINE __m128i ssp_sign_epi16_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_sign_epi16 [SSSE3]. (Searches MSDN)

Definition at line 1654 of file SSEPlus_emulation_SSE2.h.

01655 {   // Multiplies each 16-bit lane of a by -1, 0 or +1 according to whether the matching lane of b is negative, zero or positive.
01656     __m128i c, d, zero;
01657 
01658         zero=_mm_setzero_si128();
01659         d   = _mm_cmpgt_epi16(b, zero);   // 0xFFFF where b > 0
01660         c   = _mm_cmplt_epi16(b, zero);   // 0xFFFF (-1) where b < 0
01661         d   = _mm_srli_epi16(d, 15);      // 0xFFFF -> 1 where b > 0
01662         c   = _mm_or_si128(c, d);         // per-lane multiplier: -1, 0 or +1
01663         a   = _mm_mullo_epi16(a, c);
01664 
01665         //The following method has same performance
01666         //zero=_mm_setzero_si128();
01667         //d   = _mm_cmpgt_epi16(b, zero);
01668         //c   = _mm_cmplt_epi16(b, zero);
01669         //one = _mm_set1_epi16(1);
01670         //d   = _mm_and_si128(d, one);
01671         //c   = _mm_add_epi16(c, d);
01672         //a   = _mm_mullo_epi16(a, c);
01673 
01674         return a;
01675 }

SSP_FORCEINLINE __m128i ssp_sign_epi32_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_sign_epi32 [SSSE3]. (Searches MSDN)

Definition at line 1679 of file SSEPlus_emulation_SSE2.h.

01680 {   // For each 32-bit lane: returns a where b > 0, -a where b < 0, and 0 where b == 0.
01681     __m128i ap, an, c, d, zero, one;
01682 
01683         zero=_mm_setzero_si128();
01684         //Greater than zero part
01685         d  = _mm_cmpgt_epi32(b, zero);
01686         ap = _mm_and_si128(a, d);   // a kept unchanged where b > 0
01687 
01688         //Less than zero
01689         c   = _mm_cmplt_epi32(b, zero);
01690         one = _mm_set1_epi32(1);
01691         an  = _mm_and_si128(a, c);  //get all the numbers which need to be negated 
01692         an  = _mm_xor_si128(an, c); // ~a in the selected lanes, 0 elsewhere
01693         one = _mm_and_si128(one, c);
01694         an  = _mm_add_epi32(an, one); // ~a + 1 = -a; must be a 32-bit add so the carry can leave the low byte
01695 
01696         return _mm_or_si128(an, ap);
01697 }

SSP_FORCEINLINE __m128i ssp_sign_epi8_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_sign_epi8 [SSSE3]. (Searches MSDN)

Definition at line 1632 of file SSEPlus_emulation_SSE2.h.

01633 {   // For each byte lane: returns a where b > 0, -a where b < 0, and 0 where b == 0.
01634     __m128i ap, an, c, d, zero, one;
01635 
01636         zero=_mm_setzero_si128();
01637         //Greater than zero part
01638         d  = _mm_cmpgt_epi8(b, zero);
01639         ap = _mm_and_si128(a, d);   // a kept unchanged where b > 0
01640 
01641         //Less than zero
01642         c   = _mm_cmplt_epi8(b, zero);
01643         one = _mm_set1_epi8(1);
01644         an  = _mm_and_si128(a, c);  //get all the numbers which need to be negated 
01645         an  = _mm_xor_si128(an, c); // ~a in the selected lanes, 0 elsewhere
01646         one = _mm_and_si128(one, c);
01647         an  = _mm_add_epi8(an, one); // ~a + 1 = -a (two's complement negate)
01648 
01649         return _mm_or_si128(an, ap);//_mm_add_epi8(an, ap);
01650 }

SSP_FORCEINLINE int ssp_testc_si128_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_testc_si128 [SSE4.1]. (Searches MSDN)

Definition at line 1703 of file SSEPlus_emulation_SSE2.h.

01704 {   // Returns 1 when every bit set in b is also set in a, i.e. (~a & b) == 0; otherwise 0.
01705     a = _mm_andnot_si128( a, b );   // ~a & b: the bits of b that are missing from a
01706     return ssp_testz_si128_SSE2( a, a );   // 1 only if no such bit exists
01707 }

SSP_FORCEINLINE int ssp_testnzc_si128_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_testnzc_si128 [SSE4.1]. (Searches MSDN)

Definition at line 1721 of file SSEPlus_emulation_SSE2.h.

01722 {   // Returns 1 when both (a & b) and (~a & b) are non-zero; otherwise 0.
01723     ssp_m128 zf, cf;    
01724 
01725     zf.i = _mm_and_si128  ( a, b );   
01726     zf.i = _mm_packs_epi32( zf.i, _mm_setzero_si128() ); // pack the four dwords into the low 64 bits; saturation keeps non-zero lanes non-zero
01727  
01728     cf.i = _mm_andnot_si128( a, b );   // ~a & b
01729     cf.i = _mm_packs_epi32( cf.i, _mm_setzero_si128() );  
01730 
01731     return ( !(zf.u64[0] == 0) && !(cf.u64[0] == 0));
01732 }

SSP_FORCEINLINE int ssp_testz_si128_SSE2 ( __m128i  a,
__m128i  b 
)

SSE2 implementation of _mm_testz_si128 [SSE4.1]. (Searches MSDN)

Definition at line 1711 of file SSEPlus_emulation_SSE2.h.

01712 {   // Returns 1 when (a & b) is all zero bits; otherwise 0.
01713     ssp_m128 t;
01714     t.i = _mm_and_si128  ( a, b );   
01715     t.i = _mm_packs_epi32( t.i, _mm_setzero_si128() );   // pack the four dwords into the low 64 bits; saturation keeps non-zero lanes non-zero
01716     return t.u64[0] == 0;   // a single 64-bit compare now covers all four lanes
01717 }


Generated on Wed May 21 13:44:15 2008 for "SSEPlus" by  doxygen 1.5.4