Arithmetic Operations | |
SSP_FORCEINLINE __m128 | ssp_arithmetic_hadd4_dup_ps_SSE2 (__m128 a) |
SSP_FORCEINLINE __m128i | ssp_arithmetic_hadd4_epi16_SSE2 (__m128i a, const unsigned int offset) |
SSP_FORCEINLINE __m128 | ssp_round_ps_neg_zero_SSE2 (__m128 a, int iRoundMode) |
Convert Operations | |
SSP_FORCEINLINE void | ssp_convert_odd_even_epi16_SSE2 (__m128i *a, __m128i *b) |
SSP_FORCEINLINE void | ssp_convert_odd_even_ps_SSE2 (__m128 *a, __m128 *b) |
SSP_FORCEINLINE void | ssp_convert_odd_even_epi32_SSE2 (__m128i *a, __m128i *b) |
SSP_FORCEINLINE void | ssp_convert_3c_3p_epi8_SSE2 (__m128i *rgb1, __m128i *rgb2, __m128i *rgb3) |
SSP_FORCEINLINE void | ssp_convert_reverse_transpose_SSE2 (__m128i *a, __m128i *b, __m128i *c) |
SSP_FORCEINLINE void | ssp_convert_3p_3c_epi8_SSE2 (__m128i *r, __m128i *g, __m128i *b) |
SSP_FORCEINLINE void | ssp_convert_3c_3p_epi16_SSE2 (__m128i *rgb1, __m128i *rgb2, __m128i *rgb3) |
SSP_FORCEINLINE void | ssp_convert_3p_3c_epi16_SSE2 (__m128i *r, __m128i *g, __m128i *b) |
SSP_FORCEINLINE void | ssp_convert_3c_3p_epi32_SSE2 (__m128i *rgb1, __m128i *rgb2, __m128i *rgb3) |
SSP_FORCEINLINE void | ssp_convert_3p_3c_epi32_SSE2 (__m128i *r, __m128i *g, __m128i *b) |
SSP_FORCEINLINE void | ssp_convert_4c_4p_epi8_SSE2 (__m128i *rgba1, __m128i *rgba2, __m128i *rgba3, __m128i *rgba4) |
SSP_FORCEINLINE void | ssp_convert_4p_4c_epi8_SSE2 (__m128i *r, __m128i *g, __m128i *b, __m128i *a) |
SSP_FORCEINLINE void | ssp_convert_4c_4p_epi16_SSE2 (__m128i *rgba1, __m128i *rgba2, __m128i *rgba3, __m128i *rgba4) |
SSP_FORCEINLINE void | ssp_convert_4p_4c_epi16_SSE2 (__m128i *r, __m128i *g, __m128i *b, __m128i *a) |
SSP_FORCEINLINE void | ssp_convert_4c_4p_epi32_SSE2 (__m128i *rgba1, __m128i *rgba2, __m128i *rgba3, __m128i *rgba4) |
SSP_FORCEINLINE void | ssp_convert_4p_4c_epi32_SSE2 (__m128i *r, __m128i *g, __m128i *b, __m128i *a) |
Logical Operations | |
SSP_FORCEINLINE __m128i | ssp_logical_signinvert_16_SSE2 (__m128i mask, __m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_logical_signinvert_32_SSE2 (__m128i mask, __m128i a, __m128i b) |
SSP_FORCEINLINE __m128i | ssp_logical_invert_si128_SSE2 (__m128i a) |
SSP_FORCEINLINE __m128d | ssp_logical_invert_sd_SSE2 (__m128d a) |
SSP_FORCEINLINE __m128 | ssp_logical_invert_ss_SSE2 (__m128 a) |
SSP_FORCEINLINE __m128i | ssp_logical_bitwise_select_SSE2 (__m128i a, __m128i b, __m128i mask) |
SSP_FORCEINLINE __m128i | ssp_movmask_imm8_to_epi32_SSE2 (int mask) |
SSP_FORCEINLINE __m128i | ssp_slli_epi8_SSE2 (__m128i a, const int b) |
SSP_FORCEINLINE __m128i | ssp_srli_epi8_SSE2 (__m128i a, const int b) |
Memory Operations | |
__m128i | ssp_memory_load1_epu8_SSE2 (unsigned char a) |
/**
 * Definition at line 17 of file SSEPlus_arithmetic_SSE2.h.
 *
 * Horizontal add of the four floats in `a`; the total is duplicated
 * into every lane of the result.
 */
SSP_FORCEINLINE __m128 ssp_arithmetic_hadd4_dup_ps_SSE2( __m128 a )
{
    /* Swap neighbours inside each pair (0<->1, 2<->3) and add, so both
       lanes of a pair hold that pair's sum. */
    const __m128 pair_swapped = _mm_shuffle_ps( a, a, _MM_SHUFFLE(2, 3, 0, 1) ); //TODO shuflo, shuf hi
    const __m128 pair_sums    = _mm_add_ps( a, pair_swapped );

    /* Swap the two 64-bit halves and add, so every lane holds the total. */
    const __m128 half_swapped = _mm_shuffle_ps( pair_sums, pair_sums, _MM_SHUFFLE(1, 0, 3, 2) ); //TODO shuflo, shuf hi
    return _mm_add_ps( pair_sums, half_swapped );
}
/**
 * Definition at line 39 of file SSEPlus_arithmetic_SSE2.h.
 *
 * Sums each group of four 16-bit words (words 0..3 and words 4..7).
 *
 *   in = a,b,c,d | e,f,g,h, 0   out = x,x,x,a+b+c+d | x,x,x,e+f+g+h
 *   in = a,b,c,d | e,f,g,h, 3   out = a+b+c+d,x,x,x,| x,x,x,e+f+g+h
 *
 * `offset` indicates the desired position of the sum (0,1,2,3): the
 * group total ends up in word index `offset` of each group, counted from
 * the least-significant word. All other words ("x") hold partial sums
 * and must be treated as don't-care.
 */
SSP_FORCEINLINE __m128i ssp_arithmetic_hadd4_epi16_SSE2( __m128i a, const unsigned int offset )
{
    __m128i sum = a;
    __m128i shifted;

    /* Distance-2 add: the high bit of `offset` picks the shift direction
       (shifts are whole-register byte shifts: 4 bytes = 2 words). */
    if( offset < 2 )
        shifted = _mm_srli_si128( sum, 4 );
    else
        shifted = _mm_slli_si128( sum, 4 );
    sum = _mm_add_epi16( sum, shifted );

    /* Distance-1 add: the low bit of `offset` picks the shift direction. */
    if( (offset & 1) == 0 )
        shifted = _mm_srli_si128( sum, 2 );
    else
        shifted = _mm_slli_si128( sum, 2 );

    return _mm_add_epi16( sum, shifted );
}
/**
 * Definition at line 192 of file SSEPlus_convert_SSE2.h.
 *
 * Converts eight interleaved 3-channel pixels of 16-bit samples
 * (r0,g0,b0,r1,... spread over *rgb1, *rgb2, *rgb3 in memory order) into
 * three planes, in place:
 *   *rgb1 = r7..r0,  *rgb2 = g7..g0,  *rgb3 = b7..b0
 *
 * Lane comments list elements from most-significant to least-significant.
 * Every statement reuses registers written by earlier ones, so the order
 * of statements must not be changed.
 */
SSP_FORCEINLINE void ssp_convert_3c_3p_epi16_SSE2( __m128i *rgb1, __m128i *rgb2, __m128i *rgb3 )
{
    __m128i temp1, temp2;

    *rgb2 = _mm_shuffle_epi32(*rgb2, _MM_SHUFFLE(1,0,3,2));//b3,g3,r3,b2,r5,b4,g4,r4
    temp1 = _mm_unpacklo_epi16(*rgb1, *rgb2);               //r5,r1,b4,b0,g4,g0,r4,r0
    temp2 = _mm_unpackhi_epi16(*rgb2, *rgb3);               //b7,b3,g7,g3,r7,r3,b6,b2
    *rgb3 = _mm_slli_si128(*rgb3, 8);                       //g6,r6,b5,g5, 0, 0, 0, 0
    *rgb2 = _mm_unpackhi_epi16(*rgb1, *rgb3);               //g6,g2,r6,r2,b5,b1,g5,g1

    *rgb3 = _mm_shuffle_epi32(*rgb2, _MM_SHUFFLE(1,0,3,2));//b5,b1,g5,g1,g6,g2,r6,r2
    *rgb1 = _mm_unpacklo_epi16(temp1, *rgb3);               //g6,g4,g2,g0,r6,r4,r2,r0
    temp1 = _mm_srli_si128(temp1, 8);                       // 0, 0, 0, 0,r5,r1,b4,b0
    temp1 = _mm_unpacklo_epi16(temp1, temp2);               //r7,r5,r3,r1,b6,b4,b2,b0
    temp2 = _mm_unpackhi_epi16(*rgb3, temp2);               //b7,b5,b3,b1,g7,g5,g3,g1

    temp1 = _mm_shuffle_epi32(temp1, _MM_SHUFFLE(1,0,3,2)); //b6,b4,b2,b0,r7,r5,r3,r1
    *rgb3 = _mm_unpackhi_epi16(temp1, temp2);               //b7,b6,b5,b4,b3,b2,b1,b0
    temp2 = _mm_slli_si128(temp2, 8);                       //g7,g5,g3,g1, 0, 0, 0, 0
    *rgb2 = _mm_unpackhi_epi16(*rgb1, temp2);               //g7,g6,g5,g4,g3,g2,g1,g0
    *rgb1 = _mm_unpacklo_epi16(*rgb1, temp1);               //r7,r6,r5,r4,r3,r2,r1,r0
}
/**
 * Definition at line 259 of file SSEPlus_convert_SSE2.h.
 *
 * Converts four interleaved 3-channel pixels of 32-bit samples
 * (r0,g0,b0,r1,... spread over *rgb1, *rgb2, *rgb3 in memory order) into
 * three planes, in place:
 *   *rgb1 = r3..r0,  *rgb2 = g3..g0,  *rgb3 = b3..b0
 *
 * Lane comments list elements from most-significant to least-significant.
 * The statements form an in-place shuffle network; do not reorder them.
 */
SSP_FORCEINLINE void ssp_convert_3c_3p_epi32_SSE2( __m128i *rgb1, __m128i *rgb2, __m128i *rgb3 )
{
    __m128i temp1, temp2;

    *rgb2 = _mm_shuffle_epi32(*rgb2, _MM_SHUFFLE(1,0,3,2));//b1,g1,g2,r2
    temp1 = _mm_unpacklo_epi32(*rgb1, *rgb2);               //g2,g0,r2,r0
    temp2 = _mm_unpackhi_epi32(*rgb2, *rgb3);               //b3,b1,g3,g1
    *rgb3 = _mm_slli_si128(*rgb3, 8);                       //r3,b2, 0, 0
    *rgb2 = _mm_unpackhi_epi32(*rgb1, *rgb3);               //r3,r1,b2,b0

    *rgb3 = _mm_shuffle_epi32(*rgb2, _MM_SHUFFLE(1,0,3,2));//b2,b0,r3,r1
    *rgb1 = _mm_unpacklo_epi32(temp1, *rgb3);               //r3,r2,r1,r0
    temp1 = _mm_srli_si128(temp1, 8);                       // 0, 0,g2,g0
    *rgb2 = _mm_unpacklo_epi32(temp1, temp2);               //g3,g2,g1,g0
    *rgb3 = _mm_unpackhi_epi32(*rgb3, temp2);               //b3,b2,b1,b0
}
/**
 * Definition at line 90 of file SSEPlus_convert_SSE2.h.
 *
 * Converts sixteen interleaved 3-channel pixels of 8-bit samples into
 * three planes, in place. Per the lane comments below, on return
 *   *rgb1 = r15..r0,  *rgb2 = g15..g0,  *rgb3 = b15..b0
 *
 * Lane comments list bytes from most-significant to least-significant.
 * The statements form an in-place shuffle network (four unpack rounds,
 * each preceded by a 64-bit half swap); do not reorder them.
 */
SSP_FORCEINLINE void ssp_convert_3c_3p_epi8_SSE2( __m128i *rgb1, __m128i *rgb2, __m128i *rgb3 )
{
    __m128i temp1, temp2;
    // RGB1 =         r5 , b4  g4  r4 , b3  g3  r3 , b2  g2  r2 , b1  g1  r1 , b0  g0  r0
    // RGB2 =     g10 r10, b9  g9  r9 , b8  g8  r8 , b7  g7  r7 , b6  g6  r6 , b5  g5
    // RGB3 = b15 g15 r15, b14 g14 r14, b13 g13 r13, b12 g12 r12, b11 g11 r11, b10


    *rgb2 = _mm_shuffle_epi32(*rgb2, _MM_SHUFFLE(1,0,3,2)); // b7, g7, r7, b6, g6, r6, b5, g5,g10,r10, b9, g9, r9, b8, g8, r8
    temp1 = _mm_unpacklo_epi8(*rgb1, *rgb2);                //g10, g2,r10, r2, b9, b1, g9, g1, r9, r1, b8, b0, g8, g0, r8, r0
    temp2 = _mm_unpackhi_epi8(*rgb2, *rgb3);                //b15, b7,g15, g7,r15, r7,b14, b6,g14, g6,r14, r6,b13, b5,g13, g5
    *rgb3 = _mm_slli_si128 (*rgb3, 8 );                     //r13,b12,g12,r12,b11,g11,r11,b10,  0,  0,  0,  0,  0,  0,  0,  0
    *rgb2 = _mm_unpackhi_epi8(*rgb1, *rgb3);                //r13, r5,b12, b4,g12, g4,r12, r4,b11, b3,g11, g3,r11, r3,b10, b2

    *rgb3 = _mm_shuffle_epi32(*rgb2, _MM_SHUFFLE(1,0,3,2)); //b11, b3,g11, g3,r11, r3,b10, b2,r13, r5,b12, b4,g12, g4,r12, r4
    *rgb1 = _mm_unpacklo_epi8(temp1, *rgb3);                //r13, r9, r5, r1,b12, b8, b4, b0,g12, g8, g4, g0,r12, r8, r4, r0
    temp1 = _mm_srli_si128 (temp1, 8 );                     //  0,  0,  0,  0,  0,  0,  0,  0,g10, g2,r10, r2, b9, b1, g9, g1
    temp1 = _mm_unpacklo_epi8(temp1, temp2);                //g14,g10, g6, g2,r14,r10, r6, r2,b13, b9, b5, b1,g13, g9, g5, g1
    temp2 = _mm_unpackhi_epi8(*rgb3, temp2);                //b15,b11, b7, b3,g15,g11, g7, g3,r15,r11, r7, r3,b14,b10, b6, b2

    temp1 = _mm_shuffle_epi32(temp1, _MM_SHUFFLE(1,0,3,2)); //b13, b9, b5, b1,g13, g9, g5, g1,g14,g10, g6, g2,r14,r10, r6, r2
    *rgb3 = _mm_unpackhi_epi8(temp1, temp2);                //b15,b13,b11, b9, b7, b5, b3, b1,g15,g13,g11, g9, g7, g5, g3, g1
    temp2 = _mm_slli_si128 (temp2, 8 );                     //r15,r11, r7, r3,b14,b10, b6, b2,  0,  0,  0,  0,  0,  0,  0,  0
    temp2 = _mm_unpackhi_epi8(*rgb1, temp2);                //r15,r13,r11, r9, r7, r5, r3, r1,b14,b12,b10, b8, b6, b4, b2, b0
    temp1 = _mm_unpacklo_epi8(*rgb1, temp1);                //g14,g12,g10, g8, g6, g4, g2, g0,r14,r12,r10, r8, r6, r4, r2, r0

    temp2 = _mm_shuffle_epi32(temp2, _MM_SHUFFLE(1,0,3,2)); //b14,b12,b10, b8, b6, b4, b2, b0,r15,r13,r11, r9, r7, r5, r3, r1
    *rgb1 = _mm_unpacklo_epi8(temp1, temp2);                //r15,r14,r13,r12,r11,r10, r9, r8, r7, r6, r5, r4, r3, r2, r1, r0
    temp1 = _mm_srli_si128 (temp1, 8 );                     //  0,  0,  0,  0,  0,  0,  0,  0,g14,g12,g10, g8, g6, g4, g2, g0
    *rgb2 = _mm_unpacklo_epi8(temp1, *rgb3);                //g15,g14,g13,g12,g11,g10, g9, g8, g7, g6, g5, g4, g3, g2, g1, g0
    *rgb3 = _mm_unpackhi_epi8(temp2, *rgb3);                //b15,b14,b13,b12,b11,b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0
}
/**
 * Definition at line 216 of file SSEPlus_convert_SSE2.h.
 *
 * Converts three planes of eight 16-bit samples (*r = r7..r0,
 * *g = g7..g0, *b = b7..b0) into interleaved 3-channel pixels, in place.
 * On return the three registers hold, in memory order,
 * r0,g0,b0,r1,g1,b1,...,r7,g7,b7:
 *   *r = g2,r2,b1,g1,r1,b0,g0,r0
 *   *g = r5,b4,g4,r4,b3,g3,r3,b2
 *   *b = b7,g7,r7,b6,g6,r6,b5,g5
 *
 * Lane comments list elements from most-significant to least-significant.
 * The statements form an in-place shuffle network; do not reorder them.
 */
SSP_FORCEINLINE void ssp_convert_3p_3c_epi16_SSE2( __m128i *r, __m128i *g, __m128i *b )
{
    __m128i temp;

    // Split each plane into even-indexed samples (low half) and
    // odd-indexed samples (high half).
    temp = _mm_srli_si128(*r, 8);         // 0, 0, 0, 0,r7,r6,r5,r4
    *r   = _mm_unpacklo_epi16(*r, temp);  //r7,r3,r6,r2,r5,r1,r4,r0
    temp = _mm_srli_si128(*r, 8);         // 0, 0, 0, 0,r7,r3,r6,r2
    *r   = _mm_unpacklo_epi16(*r, temp);  //r7,r5,r3,r1,r6,r4,r2,r0

    temp = _mm_srli_si128(*g, 8);         // 0, 0, 0, 0,g7,g6,g5,g4
    *g   = _mm_unpacklo_epi16(*g, temp);  //g7,g3,g6,g2,g5,g1,g4,g0
    temp = _mm_srli_si128(*g, 8);         // 0, 0, 0, 0,g7,g3,g6,g2
    *g   = _mm_unpacklo_epi16(*g, temp);  //g7,g5,g3,g1,g6,g4,g2,g0

    temp = _mm_srli_si128(*b, 8);         // 0, 0, 0, 0,b7,b6,b5,b4
    *b   = _mm_unpacklo_epi16(*b, temp);  //b7,b3,b6,b2,b5,b1,b4,b0
    temp = _mm_srli_si128(*b, 8);         // 0, 0, 0, 0,b7,b3,b6,b2
    *b   = _mm_unpacklo_epi16(*b, temp);  //b7,b5,b3,b1,b6,b4,b2,b0

    // Build the three kinds of 32-bit sample pairs: (r,g), (b,r), (g,b).
    temp = _mm_unpacklo_epi16(*r, *g);    //g6,r6,g4,r4,g2,r2,g0,r0
    *r   = _mm_srli_si128(*r , 8);        // 0, 0, 0, 0,r7,r5,r3,r1
    *r   = _mm_unpacklo_epi16(*b, *r);    //r7,b6,r5,b4,r3,b2,r1,b0
    *g   = _mm_unpackhi_epi16(*g, *b);    //b7,g7,b5,g5,b3,g3,b1,g1

    *b   = _mm_srli_si128(*r, 8);         // 0, 0, 0, 0,r7,b6,r5,b4
    *r   = _mm_unpacklo_epi32(*r, *b);    //r7,b6,r3,b2,r5,b4,r1,b0
    *b   = _mm_srli_si128(*g, 8);         // 0, 0, 0, 0,b7,g7,b5,g5
    *g   = _mm_unpacklo_epi32(*g, *b);    //b7,g7,b3,g3,b5,g5,b1,g1
    *b   = _mm_srli_si128(temp, 8);       // 0, 0, 0, 0,g6,r6,g4,r4
    temp = _mm_unpacklo_epi32(temp, *b);  //g6,r6,g2,r2,g4,r4,g0,r0

    *b   = _mm_unpacklo_epi32(temp, *g);  //b5,g5,g4,r4,b1,g1,g0,r0
    temp = _mm_srli_si128(temp, 8);       // 0, 0, 0, 0,g6,r6,g2,r2
    temp = _mm_unpacklo_epi32(*r, temp);  //g6,r6,r5,b4,g2,r2,r1,b0
    *g   = _mm_unpackhi_epi32(*r, *g);    //b7,g7,r7,b6,b3,g3,r3,b2

    // Stitch the pairs into the final interleaved order.
    *r   = _mm_unpacklo_epi32(*b, temp);  //g2,r2,b1,g1,r1,b0,g0,r0
    temp = _mm_unpackhi_epi32(*b, temp);  //g6,r6,b5,g5,r5,b4,g4,r4
    *b   = _mm_unpackhi_epi64(temp, *g);  //b7,g7,r7,b6,g6,r6,b5,g5
    *g   = _mm_unpacklo_epi64(*g, temp);  //r5,b4,g4,r4,b3,g3,r3,b2
}
/**
 * Definition at line 277 of file SSEPlus_convert_SSE2.h.
 *
 * Converts three planes of four 32-bit samples (*r = r3..r0,
 * *g = g3..g0, *b = b3..b0) into interleaved 3-channel pixels, in place.
 * On return the registers hold, in memory order, r0,g0,b0,r1,...,b3:
 *   *r = r1,b0,g0,r0   *g = g2,r2,b1,g1   *b = b3,g3,r3,b2
 *
 * Lane comments list elements from most-significant to least-significant.
 * The statements form an in-place shuffle network; do not reorder them.
 */
SSP_FORCEINLINE void ssp_convert_3p_3c_epi32_SSE2( __m128i *r, __m128i *g, __m128i *b )
{
    __m128i temp;

    temp = _mm_srli_si128(*r, 8);         // 0, 0,r3,r2
    *r   = _mm_unpacklo_epi32(*r, temp);  //r3,r1,r2,r0
    temp = _mm_srli_si128(*g, 8);         // 0, 0,g3,g2
    *g   = _mm_unpacklo_epi32(*g, temp);  //g3,g1,g2,g0
    temp = _mm_srli_si128(*b, 8);         // 0, 0,b3,b2
    *b   = _mm_unpacklo_epi32(*b, temp);  //b3,b1,b2,b0

    temp = _mm_unpacklo_epi32(*r, *g);    //g2,r2,g0,r0
    *g   = _mm_unpackhi_epi32(*g, *b);    //b3,g3,b1,g1
    *r   = _mm_srli_si128(*r, 8);         // 0, 0,r3,r1
    *b   = _mm_unpacklo_epi32(*b, *r);    //r3,b2,r1,b0

    *r   = _mm_unpacklo_epi64(temp, *b);  //r1,b0,g0,r0
    *b   = _mm_unpackhi_epi64(*b, *g);    //b3,g3,r3,b2
    *g   = _mm_slli_si128(*g, 8);         //b1,g1, 0, 0
    *g   = _mm_unpackhi_epi64(*g, temp);  //g2,r2,b1,g1
}
/**
 * Definition at line 150 of file SSEPlus_convert_SSE2.h.
 *
 * Converts three planes of sixteen 8-bit samples (*r, *g, *b) into
 * interleaved 3-channel pixels, in place. Works in three stages:
 *   1. merge bytes into 16-bit two-sample pairs (RG, GB, BR),
 *   2. merge those pairs into 32-bit groups (RGBR, GBRG, BRGB),
 *   3. reorder the twelve 32-bit groups across the three registers with
 *      ssp_convert_reverse_transpose_SSE2().
 *
 * NOTE(review): which byte/word each mask constant keeps depends on the
 * argument order of SSP_CONST_SET_8I / SSP_CONST_SET_16I, which is not
 * visible here - confirm against their definitions.
 */
SSP_FORCEINLINE void ssp_convert_3p_3c_epi8_SSE2( __m128i *r, __m128i *g, __m128i *b )
{
    const static __m128i odd_8  = SSP_CONST_SET_8I(   0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0 );
    const static __m128i even_8 = SSP_CONST_SET_8I( 0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF,0,0xFF   );

    const static __m128i odd_16  = SSP_CONST_SET_16I(   0xFFFF,0,0xFFFF,0,0xFFFF,0,0xFFFF,0 );
    const static __m128i even_16 = SSP_CONST_SET_16I( 0,0xFFFF,0,0xFFFF,0,0xFFFF,0,0xFFFF   );

    ssp_m128 T, RG, GB, BR, RGBR, GBRG, BRGB;

    // Stage 1: 8-bit samples -> 16-bit pairs.
    RG.i = _mm_and_si128 (  *r, even_8 );  // Mask out the odd r bytes
    T.i  = _mm_slli_epi16(  *g, 8      );  // Move the even g bytes to the odd position
    RG.i = _mm_or_si128  ( RG.i, T.i   );  // G14 R14 ... G2 R2 G0 R0

    GB.i = _mm_srli_epi16( *g, 8       );  // Move the odd g bytes to the even position
    T.i  = _mm_and_si128 ( *b, odd_8   );  // Keep the b bytes selected by odd_8
    GB.i = _mm_or_si128  ( GB.i, T.i   );

    BR.i = _mm_and_si128 ( *b, even_8  );  // Keep the b bytes selected by even_8
    T.i  = _mm_and_si128 ( *r, odd_8   );  // Keep the r bytes selected by odd_8
    BR.i = _mm_or_si128  ( BR.i, T.i   );

    // Stage 2: 16-bit pairs -> 32-bit groups.
    RGBR.i = _mm_and_si128 ( RG.i, even_16 );
    T.i    = _mm_slli_epi32( BR.i, 16      );
    RGBR.i = _mm_or_si128  ( RGBR.i, T.i   );

    GBRG.i = _mm_and_si128 ( GB.i, even_16 );
    T.i    = _mm_and_si128 ( RG.i, odd_16  );
    GBRG.i = _mm_or_si128  ( GBRG.i, T.i   );

    BRGB.i = _mm_srli_epi32( BR.i, 16      );
    T.i    = _mm_and_si128 ( GB.i, odd_16  );
    BRGB.i = _mm_or_si128  ( BRGB.i, T.i   );

    // Stage 3: put the 32-bit groups into their final register and lane.
    ssp_convert_reverse_transpose_SSE2( &RGBR.i, &GBRG.i, &BRGB.i );

    *r = RGBR.i;
    *g = GBRG.i;
    *b = BRGB.i;
}
/**
 * Definition at line 344 of file SSEPlus_convert_SSE2.h.
 *
 * Converts eight interleaved 4-channel pixels of 16-bit samples (two
 * pixels per register, r,g,b,a order in memory) into four planes, in
 * place:
 *   *rgba1 = r7..r0, *rgba2 = g7..g0, *rgba3 = b7..b0, *rgba4 = a7..a0
 *
 * Lane comments list elements from most-significant to least-significant.
 * Three rounds of 16-bit unpacks perform the 4x8 transpose; registers are
 * overwritten as soon as they are consumed, so do not reorder statements.
 */
SSP_FORCEINLINE void ssp_convert_4c_4p_epi16_SSE2( __m128i *rgba1, __m128i *rgba2, __m128i *rgba3, __m128i *rgba4 )
{
    __m128i temp1, temp2;

    temp1  = _mm_unpacklo_epi16(*rgba1, *rgba3);   //a4,a0,b4,b0,g4,g0,r4,r0
    *rgba1 = _mm_unpackhi_epi16(*rgba1, *rgba3);   //a5,a1,b5,b1,g5,g1,r5,r1
    *rgba3 = _mm_unpacklo_epi16(*rgba2, *rgba4);   //a6,a2,b6,b2,g6,g2,r6,r2
    *rgba2 = _mm_unpackhi_epi16(*rgba2, *rgba4);   //a7,a3,b7,b3,g7,g3,r7,r3

    *rgba4 = _mm_unpackhi_epi16(*rgba1, *rgba2);   //a7,a5,a3,a1,b7,b5,b3,b1
    *rgba1 = _mm_unpacklo_epi16(*rgba1, *rgba2);   //g7,g5,g3,g1,r7,r5,r3,r1
    temp2  = _mm_unpacklo_epi16(temp1, *rgba3);    //g6,g4,g2,g0,r6,r4,r2,r0
    temp1  = _mm_unpackhi_epi16(temp1, *rgba3);    //a6,a4,a2,a0,b6,b4,b2,b0

    *rgba3 = _mm_unpacklo_epi16(temp1, *rgba4);    //b7,b6,b5,b4,b3,b2,b1,b0
    *rgba4 = _mm_unpackhi_epi16(temp1, *rgba4);    //a7,a6,a5,a4,a3,a2,a1,a0
    *rgba2 = _mm_unpackhi_epi16(temp2, *rgba1);    //g7,g6,g5,g4,g3,g2,g1,g0
    *rgba1 = _mm_unpacklo_epi16(temp2, *rgba1);    //r7,r6,r5,r4,r3,r2,r1,r0
}
/**
 * Definition at line 381 of file SSEPlus_convert_SSE2.h.
 *
 * Converts four interleaved 4-channel pixels of 32-bit samples (one pixel
 * per register) into four planes, in place - i.e. a 4x4 transpose of
 * 32-bit elements:
 *   *rgba1 = r3..r0, *rgba2 = g3..g0, *rgba3 = b3..b0, *rgba4 = a3..a0
 *
 * Lane comments list elements from most-significant to least-significant.
 * Registers are overwritten as soon as they are consumed, so do not
 * reorder statements.
 */
SSP_FORCEINLINE void ssp_convert_4c_4p_epi32_SSE2( __m128i *rgba1, __m128i *rgba2, __m128i *rgba3, __m128i *rgba4 )
{
    __m128i temp1, temp2;

    temp1  = _mm_unpacklo_epi32(*rgba1, *rgba3);   //g2,g0,r2,r0
    *rgba1 = _mm_unpackhi_epi32(*rgba1, *rgba3);   //a2,a0,b2,b0
    temp2  = _mm_unpacklo_epi32(*rgba2, *rgba4);   //g3,g1,r3,r1
    *rgba2 = _mm_unpackhi_epi32(*rgba2, *rgba4);   //a3,a1,b3,b1

    *rgba4 = _mm_unpackhi_epi32(*rgba1, *rgba2);   //a3,a2,a1,a0
    *rgba3 = _mm_unpacklo_epi32(*rgba1, *rgba2);   //b3,b2,b1,b0
    *rgba1 = _mm_unpacklo_epi32(temp1, temp2);     //r3,r2,r1,r0
    *rgba2 = _mm_unpackhi_epi32(temp1, temp2);     //g3,g2,g1,g0
}
/**
 * Definition at line 301 of file SSEPlus_convert_SSE2.h.
 *
 * Converts sixteen interleaved 4-channel pixels of 8-bit samples (four
 * pixels per register, r,g,b,a order in memory) into four planes, in
 * place. Per the lane comments below, on return
 *   *rgba1 = r15..r0, *rgba2 = g15..g0, *rgba3 = b15..b0, *rgba4 = a15..a0
 *
 * Lane comments list bytes from most-significant to least-significant.
 * Four rounds of 8-bit unpacks perform the transpose; registers are
 * overwritten as soon as they are consumed, so do not reorder statements.
 */
SSP_FORCEINLINE void ssp_convert_4c_4p_epi8_SSE2( __m128i *rgba1, __m128i *rgba2, __m128i *rgba3, __m128i *rgba4 )
{
    __m128i temp1,temp2;

    temp1  = _mm_unpacklo_epi8(*rgba1, *rgba3);   // a9, a1, b9, b1, g9, g1, r9, r1, a8, a0, b8, b0, g8, g0, r8, r0
    *rgba1 = _mm_unpackhi_epi8(*rgba1, *rgba3);   //a11, a3,b11, b3,g11, g3,r11, r3,a10, a2,b10, b2,g10, g2,r10, r2
    *rgba3 = _mm_unpacklo_epi8(*rgba2, *rgba4);   //a13, a5,b13, b5,g13, g5,r13, r5,a12, a4,b12, b4,g12, g4,r12, r4
    temp2  = _mm_unpackhi_epi8(*rgba2, *rgba4);   //a15, a7,b15, b7,g15, g7,r15, r7,a14, a6,b14, b6,g14, g6,r14, r6

    *rgba4 = _mm_unpackhi_epi8(*rgba1, temp2);    //a15,a11, a7, a3,b15,b11, b7, b3,g15,g11, g7, g3,r15,r11, r7, r3
    *rgba1 = _mm_unpacklo_epi8(*rgba1, temp2);    //a14,a10, a6, a2,b14,b10, b6, b2,g14,g10, g6, g2,r14,r10, r6, r2
    *rgba2 = _mm_unpacklo_epi8(temp1, *rgba3);    //a12, a8, a4, a0,b12, b8, b4, b0,g12, g8, g4, g0,r12, r8, r4, r0
    *rgba3 = _mm_unpackhi_epi8(temp1, *rgba3);    //a13, a9, a5, a1,b13, b9, b5, b1,g13, g9, g5, g1,r13, r9, r5, r1

    temp1  = _mm_unpacklo_epi8(*rgba3, *rgba4);   //g15,g13,g11, g9, g7, g5, g3, g1,r15,r13,r11, r9, r7, r5, r3, r1
    *rgba3 = _mm_unpackhi_epi8(*rgba3, *rgba4);   //a15,a13,a11, a9, a7, a5, a3, a1,b15,b13,b11, b9, b7, b5, b3, b1
    temp2  = _mm_unpackhi_epi8(*rgba2, *rgba1);   //a14,a12,a10, a8, a6, a4, a2, a0,b14,b12,b10, b8, b6, b4, b2, b0
    *rgba2 = _mm_unpacklo_epi8(*rgba2, *rgba1);   //g14,g12,g10, g8, g6, g4, g2, g0,r14,r12,r10, r8, r6, r4, r2, r0

    *rgba1 = _mm_unpacklo_epi8(*rgba2, temp1);    //r15,r14,r13,r12,r11,r10, r9, r8, r7, r6, r5, r4, r3, r2, r1, r0
    *rgba2 = _mm_unpackhi_epi8(*rgba2, temp1);    //g15,g14,g13,g12,g11,g10, g9, g8, g7, g6, g5, g4, g3, g2, g1, g0
    *rgba4 = _mm_unpackhi_epi8(temp2, *rgba3);    //a15,a14,a13,a12,a11,a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0
    *rgba3 = _mm_unpacklo_epi8(temp2, *rgba3);    //b15,b14,b13,b12,b11,b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0
}
/**
 * Definition at line 365 of file SSEPlus_convert_SSE2.h.
 *
 * Converts four planes of eight 16-bit samples (*r, *g, *b, *a) into
 * interleaved 4-channel pixels, in place. On return each register holds
 * two r,g,b,a pixels:
 *   *r = pixels 1,0   *g = pixels 3,2   *b = pixels 5,4   *a = pixels 7,6
 *
 * Lane comments list elements from most-significant to least-significant.
 * Registers are overwritten as soon as they are consumed, so do not
 * reorder statements.
 */
SSP_FORCEINLINE void ssp_convert_4p_4c_epi16_SSE2( __m128i *r, __m128i *g, __m128i *b, __m128i *a )
{
    __m128i temp1, temp2;

    temp1 = _mm_unpacklo_epi16(*r, *b);        //b3,r3,b2,r2,b1,r1,b0,r0
    *r    = _mm_unpackhi_epi16(*r, *b);        //b7,r7,b6,r6,b5,r5,b4,r4
    temp2 = _mm_unpacklo_epi16(*g, *a);        //a3,g3,a2,g2,a1,g1,a0,g0
    *g    = _mm_unpackhi_epi16(*g, *a);        //a7,g7,a6,g6,a5,g5,a4,g4

    *b    = _mm_unpacklo_epi16(*r, *g);        //a5,b5,g5,r5,a4,b4,g4,r4
    *a    = _mm_unpackhi_epi16(*r, *g);        //a7,b7,g7,r7,a6,b6,g6,r6
    *r    = _mm_unpacklo_epi16(temp1, temp2);  //a1,b1,g1,r1,a0,b0,g0,r0
    *g    = _mm_unpackhi_epi16(temp1, temp2);  //a3,b3,g3,r3,a2,b2,g2,r2
}
/**
 * Definition at line 397 of file SSEPlus_convert_SSE2.h.
 *
 * Converts four planes of four 32-bit samples (*r, *g, *b, *a) into
 * interleaved 4-channel pixels, in place - i.e. a 4x4 transpose of
 * 32-bit elements. On return each register holds one r,g,b,a pixel:
 *   *r = pixel 0   *g = pixel 1   *b = pixel 2   *a = pixel 3
 *
 * Lane comments list elements from most-significant to least-significant.
 * Registers are overwritten as soon as they are consumed, so do not
 * reorder statements.
 */
SSP_FORCEINLINE void ssp_convert_4p_4c_epi32_SSE2( __m128i *r, __m128i *g, __m128i *b, __m128i *a )
{
    __m128i temp1, temp2;

    temp1 = _mm_unpacklo_epi32(*r, *b);        //b1,r1,b0,r0
    *r    = _mm_unpackhi_epi32(*r, *b);        //b3,r3,b2,r2
    temp2 = _mm_unpacklo_epi32(*g, *a);        //a1,g1,a0,g0
    *g    = _mm_unpackhi_epi32(*g, *a);        //a3,g3,a2,g2

    *b    = _mm_unpacklo_epi32(*r, *g);        //a2,b2,g2,r2
    *a    = _mm_unpackhi_epi32(*r, *g);        //a3,b3,g3,r3
    *r    = _mm_unpacklo_epi32(temp1, temp2);  //a0,b0,g0,r0
    *g    = _mm_unpackhi_epi32(temp1, temp2);  //a1,b1,g1,r1
}
/**
 * Definition at line 328 of file SSEPlus_convert_SSE2.h.
 *
 * Converts four planes of sixteen 8-bit samples (*r, *g, *b, *a) into
 * interleaved 4-channel pixels, in place. On return each register holds
 * four r,g,b,a pixels:
 *   *r = pixels 3..0, *g = pixels 7..4, *b = pixels 11..8, *a = pixels 15..12
 *
 * Lane comments list bytes from most-significant to least-significant.
 * Registers are overwritten as soon as they are consumed, so do not
 * reorder statements.
 */
SSP_FORCEINLINE void ssp_convert_4p_4c_epi8_SSE2( __m128i *r, __m128i *g, __m128i *b, __m128i *a )
{
    __m128i temp1, temp2;

    temp1 = _mm_unpacklo_epi8(*r, *b);         // b7, r7, b6, r6, b5, r5, b4, r4, b3, r3, b2, r2, b1, r1, b0, r0
    *r    = _mm_unpackhi_epi8(*r, *b);         //b15,r15,b14,r14,b13,r13,b12,r12,b11,r11,b10,r10, b9, r9, b8, r8
    temp2 = _mm_unpacklo_epi8(*g, *a);         // a7, g7, a6, g6, a5, g5, a4, g4, a3, g3, a2, g2, a1, g1, a0, g0
    *g    = _mm_unpackhi_epi8(*g, *a);         //a15,g15,a14,g14,a13,g13,a12,g12,a11,g11,a10,g10, a9, g9, a8, g8

    *b    = _mm_unpacklo_epi8(*r, *g);         //a11,b11,g11,r11,a10,b10,g10,r10, a9, b9, g9, r9, a8, b8, g8, r8
    *a    = _mm_unpackhi_epi8(*r, *g);         //a15,b15,g15,r15,a14,b14,g14,r14,a13,b13,g13,r13,a12,b12,g12,r12
    *r    = _mm_unpacklo_epi8(temp1, temp2);   // a3, b3, g3, r3, a2, b2, g2, r2, a1, b1, g1, r1, a0, b0, g0, r0
    *g    = _mm_unpackhi_epi8(temp1, temp2);   // a7, b7, g7, r7, a6, b6, g6, r6, a5, b5, g5, r5, a4, b4, g4, r4
}
/**
 * Definition at line 18 of file SSEPlus_convert_SSE2.h.
 *
 * Separates the even- and odd-indexed 16-bit elements of two registers.
 *
 *   IN   a = a7,a6,a5,a4,a3,a2,a1,a0
 *        b = b7,b6,b5,b4,b3,b2,b1,b0
 *
 *   OUT  a = b6,b4,b2,b0,a6,a4,a2,a0   (even)
 *        b = b7,b5,b3,b1,a7,a5,a3,a1   (odd)
 *
 * Both inputs are read before either output is written.
 */
SSP_FORCEINLINE void ssp_convert_odd_even_epi16_SSE2( __m128i *a, __m128i *b )
{
    const __m128i in_a = *a;
    const __m128i in_b = *b;

    /* An arithmetic right shift by 16 drops each odd element into the low
       half of its 32-bit lane and sign-extends it, so the signed
       saturating pack below never actually saturates. */
    const __m128i odd_a = _mm_srai_epi32( in_a, 16 );   // sign,a7,sign,a5,sign,a3,sign,a1
    const __m128i odd_b = _mm_srai_epi32( in_b, 16 );   // sign,b7,sign,b5,sign,b3,sign,b1

    /* Move every element up one slot (2-byte whole-register shift) so the
       even elements sit where the odd ones were, then repeat the trick. */
    const __m128i even_a = _mm_srai_epi32( _mm_slli_si128( in_a, 2 ), 16 );   // sign,a6,sign,a4,sign,a2,sign,a0
    const __m128i even_b = _mm_srai_epi32( _mm_slli_si128( in_b, 2 ), 16 );   // sign,b6,sign,b4,sign,b2,sign,b0

    *a = _mm_packs_epi32( even_a, even_b );             // b6,b4,b2,b0,a6,a4,a2,a0
    *b = _mm_packs_epi32( odd_a,  odd_b  );             // b7,b5,b3,b1,a7,a5,a3,a1
}
SSP_FORCEINLINE void ssp_convert_odd_even_epi32_SSE2 | ( | __m128i * | a, | |
__m128i * | b | |||
) |
Definition at line 68 of file SSEPlus_convert_SSE2.h.
00069 { 00070 // IN 00071 // a = a3,a2,a1,a0 00072 // b = b3,b2,b1,b0 00073 00074 // OUT 00075 // a = b2,b0,a2,a0 // even 00076 // b = b3,b1,a3,a1 // odd 00077 00078 ssp_m128 A,B; 00079 A.i = *a; 00080 B.i = *b; 00081 00082 ssp_convert_odd_even_ps_SSE2( &A.f, &B.f ); 00083 00084 *a = A.i; 00085 *b = B.i; 00086 }
/**
 * Definition at line 49 of file SSEPlus_convert_SSE2.h.
 *
 * Separates the even- and odd-indexed floats of two registers.
 *
 *   IN   a = a3,a2,a1,a0
 *        b = b3,b2,b1,b0
 *
 *   OUT  a = b3,b1,a3,a1   (odd)
 *        b = b2,b0,a2,a0   (even)
 *
 * NOTE(review): the original comment claimed "a = even, b = odd", which
 * contradicts the shuffle selectors below and the ordering used by
 * ssp_convert_odd_even_epi16_SSE2(). The code is left as is because
 * callers may rely on the actual ordering - confirm before changing.
 */
SSP_FORCEINLINE void ssp_convert_odd_even_ps_SSE2( __m128 *a, __m128 *b )
{
    /* Read both inputs before writing either output. */
    const __m128 in_a = *a;
    const __m128 in_b = *b;

    *a = _mm_shuffle_ps( in_a, in_b, _MM_SHUFFLE(3,1,3,1) );   // b3,b1,a3,a1
    *b = _mm_shuffle_ps( in_a, in_b, _MM_SHUFFLE(2,0,2,0) );   // b2,b0,a2,a0
}
/**
 * Definition at line 128 of file SSEPlus_convert_SSE2.h.
 *
 * Permutes the twelve 32-bit elements held in *a, *b and *c, in place.
 * The registers are viewed as packed floats only so that _mm_shuffle_ps
 * can pick lanes from two sources; no floating-point arithmetic is done.
 *
 * The numbers in the lane comments are destination indices 0..11
 * (most-significant lane first): after the second group of shuffles *a
 * holds elements 3..0, *b holds 7..4 and *c holds 11..8.
 */
SSP_FORCEINLINE void ssp_convert_reverse_transpose_SSE2( __m128i *a, __m128i *b, __m128i *c )
{
    ssp_m128 A, B, C, T1, T2, T3;
    A.i = *a;
    B.i = *b;
    C.i = *c;

    // First pass: gather lanes from pairs of inputs.
    T1.f = _mm_shuffle_ps( C.f, A.f, _MM_SHUFFLE( 3,1,2,0) );   //  9  3  8  2
    T2.f = _mm_shuffle_ps( B.f, A.f, _MM_SHUFFLE( 2,0,2,0) );   //  6  0  7  1
    T3.f = _mm_shuffle_ps( C.f, B.f, _MM_SHUFFLE( 3,1,3,1) );   // 10  4 11  5

    // Second pass: order the gathered lanes into their final positions.
    A.f = _mm_shuffle_ps( T2.f, T1.f, _MM_SHUFFLE( 2,0,0,2 ) ); //  3  2  1  0
    B.f = _mm_shuffle_ps( T3.f, T2.f, _MM_SHUFFLE( 1,3,0,2 ) ); //  7  6  5  4
    C.f = _mm_shuffle_ps( T1.f, T3.f, _MM_SHUFFLE( 1,3,3,1 ) ); // 11 10  9  8

    *a = A.i;
    *b = B.i;
    *c = C.i;
}
/**
 * Definition at line 63 of file SSEPlus_logical_SSE2.h.
 *
 * Bitwise select: every result bit is taken from `a` where the matching
 * bit of `mask` is 1 and from `b` where it is 0, i.e.
 *   r := (a & mask) | (b & ~mask)
 */
SSP_FORCEINLINE __m128i ssp_logical_bitwise_select_SSE2( __m128i a, __m128i b, __m128i mask )
{
    const __m128i from_a = _mm_and_si128   ( mask, a );   // keep a where mask = 1
    const __m128i from_b = _mm_andnot_si128( mask, b );   // keep b where mask = 0
    return _mm_or_si128( from_a, from_b );
}
SSP_FORCEINLINE __m128d ssp_logical_invert_sd_SSE2 | ( | __m128d | a | ) |
/**
 * Definition at line 37 of file SSEPlus_logical_SSE2.h.
 *
 * Returns the bitwise complement (~a) of all 128 bits of `a`.
 */
SSP_FORCEINLINE __m128i ssp_logical_invert_si128_SSE2( __m128i a )
{
    /* Comparing a register with itself yields all ones in every lane;
       XOR with all ones flips every bit. */
    const __m128i all_ones = _mm_cmpeq_epi32( a, a );
    return _mm_xor_si128( a, all_ones );
}
SSP_FORCEINLINE __m128 ssp_logical_invert_ss_SSE2 | ( | __m128 | a | ) |
/**
 * Definition at line 17 of file SSEPlus_logical_SSE2.h.
 *
 * Invert 'mask' if 'a' and 'b' have different signs. Evaluated
 * independently for each 16-bit lane: lanes whose signs differ get every
 * bit of `mask` flipped, the other lanes are returned unchanged.
 */
SSP_FORCEINLINE __m128i ssp_logical_signinvert_16_SSE2( __m128i mask, __m128i a, __m128i b )
{
    /* a ^ b has its top bit set exactly where the signs differ; the
       arithmetic shift by 15 smears that bit across the whole lane. */
    const __m128i signs_differ = _mm_srai_epi16( _mm_xor_si128( a, b ), 15 );

    return _mm_xor_si128( mask, signs_differ );
}
/**
 * Definition at line 27 of file SSEPlus_logical_SSE2.h.
 *
 * Invert 'mask' if 'a' and 'b' have different signs. Evaluated
 * independently for each 32-bit lane: lanes whose signs differ get every
 * bit of `mask` flipped, the other lanes are returned unchanged.
 */
SSP_FORCEINLINE __m128i ssp_logical_signinvert_32_SSE2( __m128i mask, __m128i a, __m128i b )
{
    /* a ^ b has its top bit set exactly where the signs differ; the
       arithmetic shift by 31 smears that bit across the whole lane. */
    const __m128i signs_differ = _mm_srai_epi32( _mm_xor_si128( a, b ), 31 );

    return _mm_xor_si128( mask, signs_differ );
}
__m128i ssp_memory_load1_epu8_SSE2 | ( | unsigned char | a | ) |
/**
 * Definition at line 84 of file SSEPlus_logical_SSE2.h.
 *
 * Expands bits of an integer mask into a vector mask: each 32-bit lane of
 * the result is all ones or all zeros depending on one bit of `mask`.
 *
 * `mask` is broadcast to every 16-bit lane and multiplied by a per-lane
 * power of two (0x8000, 0x4000, 0x2000, 0x1000), which moves bit 0, 1, 2
 * or 3 respectively into bit 15 of that lane. The arithmetic shift then
 * fills the 32-bit lane with that bit.
 *
 * NOTE(review): which lane receives which bit depends on the argument
 * order of SSP_CONST_SET_16I, which is not visible here - presumably
 * bit 0 drives the lowest lane; confirm against the macro.
 */
SSP_FORCEINLINE __m128i ssp_movmask_imm8_to_epi32_SSE2( int mask )
{
    const static __m128i mulShiftImm = SSP_CONST_SET_16I( 0x1000, 0x0000, 0x2000, 0x0000, 0x4000, 0x0000, 0x8000, 0x0000 ); // Shift mask multiply moves all bits to left, becomes MSB

    const __m128i broadcast = _mm_set1_epi16 ( mask );                    // Load the mask into register
    const __m128i msb_set   = _mm_mullo_epi16( broadcast, mulShiftImm );  // Shift bits to MSB

    return _mm_srai_epi32( msb_set, 31 );                                 // Shift bits to obtain all F's or all 0's
}
/**
 * Definition at line 91 of file SSEPlus_arithmetic_SSE2.h.
 *
 * This function wraps ssp_round_ps_SSE2. It guarantees that numbers
 * rounding to 0 from a negative will generate a negative zero.
 *
 * The sign bit of every input lane is saved before rounding and OR-ed
 * back into the rounded result afterwards.
 *
 * `iRoundMode` is passed through to ssp_round_ps_SSE2 unchanged.
 */
SSP_FORCEINLINE __m128 ssp_round_ps_neg_zero_SSE2( __m128 a, int iRoundMode )
{
    const static __m128i SIGN_BIT = SSP_CONST_SET_32I( 0x80000000, 0x80000000, 0x80000000,0x80000000 );
    ssp_m128 value, saved_sign;

    value.f      = a;
    saved_sign.i = _mm_and_si128( value.i, SIGN_BIT );     // Store the sign bits

    value.f = ssp_round_ps_SSE2( a, iRoundMode );
    value.i = _mm_or_si128( value.i, saved_sign.i );       // Restore the sign bits (preserves -0)

    return value.f;
}
/**
 * Definition at line 98 of file SSEPlus_logical_SSE2.h.
 *
 * r_ := a_ << b; (logical left shift), applied to each of the sixteen
 * 8-bit lanes of `a` independently.
 *
 * SSE2 has no 8-bit shift, so the odd and even bytes of every 16-bit lane
 * are shifted separately with 16-bit shifts and then recombined. Shifting
 * by `b + 8` pushes the byte into the high half of the 16-bit lane, so
 * bits leaving the byte fall off the top of the lane instead of leaking
 * into the neighbouring byte.
 *
 * @param a  sixteen packed 8-bit values
 * @param b  shift count; 0..7 is the useful range, 8 or more gives all
 *           zeros (a 16-bit shift by more than 15 clears the lane)
 * @return   the shifted bytes
 *
 * Fix: the even-byte result `t2` used to be overwritten with a copy of
 * the odd-byte result (`t2 = _mm_srli_epi16( t1, 8 )`), so every even
 * byte of the output duplicated its odd neighbour. It now moves `t2`
 * itself down into the low byte.
 */
SSP_FORCEINLINE __m128i ssp_slli_epi8_SSE2( __m128i a, const int b )
{                                            //  a = VfVeVdVcVbVaV9V8V7V6V5V4V3V2V1V0
    __m128i t1 = _mm_srli_epi16( a, 8 );     // t1 =   Vf  Vd  Vb  V9  V7  V5  V3  V1
    __m128i t2 = _mm_slli_epi16( a, b + 8 ); // t2 = Re  Rc  Ra  R8  R6  R4  R2  R0   (high byte of each lane)
    t1 = _mm_slli_epi16( t1, b + 8 );        // t1 = Rf  Rd  Rb  R9  R7  R5  R3  R1   (high byte of each lane)
    t2 = _mm_srli_epi16( t2, 8 );            // t2 =   Re  Rc  Ra  R8  R6  R4  R2  R0 (low byte of each lane)
    t1 = _mm_or_si128( t1, t2 );             // t1 = RfReRdRcRbRaR9R8R7R6R5R4R3R2R1R0
    return t1;
}
/**
 * Definition at line 111 of file SSEPlus_logical_SSE2.h.
 *
 * r_ := a_ >> b; (logical right shift), applied to each of the sixteen
 * 8-bit lanes of `a` independently.
 *
 * SSE2 has no 8-bit shift, so the odd and even bytes of every 16-bit lane
 * are shifted separately with 16-bit shifts and then recombined. Shifting
 * by `b + 8` pulls the byte into the low half of the 16-bit lane, so bits
 * leaving the byte fall off the bottom of the lane instead of leaking
 * into the neighbouring byte.
 *
 * @param a  sixteen packed 8-bit values
 * @param b  shift count; 0..7 is the useful range, 8 or more gives all
 *           zeros (a 16-bit shift by more than 15 clears the lane)
 * @return   the shifted bytes
 *
 * Fix: the odd-byte result `t2` used to be overwritten with a copy of the
 * even-byte result (`t2 = _mm_slli_epi16( t1, 8 )`), so every odd byte of
 * the output duplicated its even neighbour. It now moves `t2` itself up
 * into the high byte.
 */
SSP_FORCEINLINE __m128i ssp_srli_epi8_SSE2( __m128i a, const int b )
{                                            //  a = VfVeVdVcVbVaV9V8V7V6V5V4V3V2V1V0
    __m128i t1 = _mm_slli_epi16( a, 8 );     // t1 = Ve  Vc  Va  V8  V6  V4  V2  V0
    __m128i t2 = _mm_srli_epi16( a, b + 8 ); // t2 =   Rf  Rd  Rb  R9  R7  R5  R3  R1 (low byte of each lane)
    t1 = _mm_srli_epi16( t1, b + 8 );        // t1 =   Re  Rc  Ra  R8  R6  R4  R2  R0 (low byte of each lane)
    t2 = _mm_slli_epi16( t2, 8 );            // t2 = Rf  Rd  Rb  R9  R7  R5  R3  R1   (high byte of each lane)
    t1 = _mm_or_si128( t1, t2 );             // t1 = RfReRdRcRbRaR9R8R7R6R5R4R3R2R1R0
    return t1;
}