/* evaluation of 4 sines at once, using only SSE2. The code is the exact rewriting of the
   cephes sinf function. Precision is excellent as long as x < 8192 (I did not bother to take
   into account the special handling they have for greater values -- it does not return
   garbage for arguments over 8192, though, but the extra precision is missing). Note that it
   is such that sinf((float)M_PI) = 8.74e-8, which is the surprising but correct result.
   Performance is also surprisingly good, 1.33 times faster than the macos vsinf SSE2
   function, and 1.5 times faster than the __vrs4_sinf of amd's ACML (which is only available
   in 64 bits). Not too bad for an SSE1 function (with no special tuning) ! However the latter
   libraries probably have a much better handling of NaN, Inf, denormalized and other special
   arguments.. On my core 1 duo, the execution of this function takes approximately 95 cycles.
   From what I have observed on the experiments with Intel AMath lib, switching to an SSE2
   version would improve the perf by only 10%. Since it is based on SSE intrinsics, it has to
   be compiled at -O2 to deliver full speed. */
__m128 sin_ps(__m128 x) { // any x
  typedef __m128 v4sf;
  typedef __m128i v4si;
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
  v4si emm0, emm2;
  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, constants::inv_sign_mask.ps); /* was inv_mant_mask, which clears the exponent, not the sign; the sincos_ps variant below uses inv_sign_mask for this same step */
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, constants::sign_mask.ps);
  /* scale by 4/Pi */
  y = _mm_mul_ps(x, constants::cephes_FOPI.ps);
  /* store the integer part of y in mm0 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, constants::pi32_1.pi);
  emm2 = _mm_and_si128(emm2, constants::pi32_inv1.pi);
  y = _mm_cvtepi32_ps(emm2);
  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, constants::pi32_4.pi);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask
     there is one polynom for 0 <= x <= Pi/4
     and another one for Pi/4 < x <= Pi/2
     Both branches will be computed. */
  emm2 = _mm_and_si128(emm2, constants::pi32_2.pi);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = constants::minus_cephes_DP1.ps;
  xmm2 = constants::minus_cephes_DP2.ps;
  xmm3 = constants::minus_cephes_DP3.ps;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);
  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  y = constants::coscof_p0.ps;
  v4sf z = _mm_mul_ps(x,x);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, constants::coscof_p1.ps);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, constants::coscof_p2.ps);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, constants::ps_0p5.ps);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, constants::ps_1.ps);
  /* Evaluate the second polynom (Pi/4 <= x <= 0) */
  v4sf y2 = constants::sincof_p0.ps;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, constants::sincof_p1.ps);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, constants::sincof_p2.ps);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);
  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);
  return y;
}
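/* A minimal usage sketch (not from the original source): feed four arguments to sin_ps()
   above and compare against libm sinf(). It only assumes the sin_ps definition above and its
   constants:: tables; accuracy is only claimed for |x| < 8192, as the comment explains, and
   sin_ps((float)M_PI) should come out near 8.74e-8. */
#include <cmath>
#include <cstdio>
#include <emmintrin.h>

static void sin_ps_demo() {
    const float in[4] = { 0.1f, 1.0f, 3.14159265f, 100.0f };  /* all well below 8192 */
    float out[4];
    _mm_storeu_ps(out, sin_ps(_mm_loadu_ps(in)));
    for (int i = 0; i < 4; ++i)
        printf("sin_ps(%g) = %g   (libm sinf: %g)\n", in[i], out[i], sinf(in[i]));
}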
inline GPR_t si_orhi( GPR_t RA, int64_t IMM ) { return _mm_or_ps( RA, _mm_castsi128_ps( _mm_set1_epi16( (int16_t)IMM ) ) ); }
inline GPR_t si_orc( GPR_t RA, GPR_t RB ) { const __m128 not_RB = _mm_andnot_ps( RB, _mm_castsi128_ps( _mm_set1_epi32(0xffffffff) ) ); return _mm_or_ps( RA, not_RB ); }
namespace embree { const __m128 _mm_lookupmask_ps[16] = { _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0, 0)), _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0,-1)), _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1, 0)), _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1,-1)), _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0, 0)), _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0,-1)), _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1, 0)), _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1,-1)), _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)), _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0,-1)), _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1, 0)), _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1,-1)), _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0, 0)), _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0,-1)), _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1, 0)), _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1,-1)) }; const __m128d _mm_lookupmask_pd[4] = { _mm_castsi128_pd(_mm_set_epi32( 0, 0, 0, 0)), _mm_castsi128_pd(_mm_set_epi32( 0, 0,-1,-1)), _mm_castsi128_pd(_mm_set_epi32(-1,-1, 0, 0)), _mm_castsi128_pd(_mm_set_epi32(-1,-1,-1,-1)) }; }
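/* Hedged usage sketch (not part of the Embree sources shown above): the table maps a 4-bit
   lane mask to a vector that is all-ones in the selected lanes (bit i of the index selects
   lane i, because _mm_set_epi32 lists the highest lane first). A typical use is blending the
   first n "active" lanes of one vector over another without needing SSE4.1 blendv. */
#include <xmmintrin.h>

static __m128 select_first_n_lanes(__m128 a, __m128 b, int n) {
    const __m128 mask = embree::_mm_lookupmask_ps[(1 << n) - 1];  /* low n bits set -> first n lanes */
    /* (a & mask) | (b & ~mask): take a where the mask is set, b elsewhere */
    return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
}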
namespace q { const __m128 _mm_lookupmask_ps[16] = { _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0, 0)), _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0,-1)), _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1, 0)), _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1,-1)), _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0, 0)), _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0,-1)), _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1, 0)), _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1,-1)), _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)), _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0,-1)), _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1, 0)), _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1,-1)), _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0, 0)), _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0,-1)), _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1, 0)), _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1,-1)) }; } /* namespace q */
inline GPR_t si_xorbi( GPR_t RA, int64_t IMM ) { return _mm_xor_ps( RA, _mm_castsi128_ps( _mm_set1_epi8((uint8_t)IMM) ) ); }
GSVector4 GSVector4::cast(const GSVector4i& v) { return GSVector4(_mm_castsi128_ps(v.m)); }
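/* Hedged aside (not from the GSVector sources): _mm_castsi128_ps, used by GSVector4::cast
   above and throughout this file, reinterprets the 128 bits without converting values;
   _mm_cvtepi32_ps is the value-converting counterpart. The contrast in one place: */
#include <emmintrin.h>

static void cast_vs_convert_demo() {
    const __m128i ones = _mm_set1_epi32(1);
    const __m128 bits = _mm_castsi128_ps(ones); /* bit pattern 0x00000001 reinterpreted: ~1.4e-45f */
    const __m128 vals = _mm_cvtepi32_ps(ones);  /* numeric conversion: 1.0f */
    (void)bits; (void)vals;
}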
/* Function: esl_sse_logf() * Synopsis: <r[z] = log x[z]> * Incept: SRE, Fri Dec 14 11:32:54 2007 [Janelia] * * Purpose: Given a vector <x> containing four floats, returns a * vector <r> in which each element <r[z] = logf(x[z])>. * * Valid in the domain $x_z > 0$ for normalized IEEE754 * $x_z$. * * For <x> $< 0$, including -0, returns <NaN>. For <x> $== * 0$ or subnormal <x>, returns <-inf>. For <x = inf>, * returns <inf>. For <x = NaN>, returns <NaN>. For * subnormal <x>, returns <-inf>. * * Xref: J2/71. * * Note: Derived from an SSE1 implementation by Julian * Pommier. Converted to SSE2 and added handling * of IEEE754 specials. */ __m128 esl_sse_logf(__m128 x) { static float cephes_p[9] = { 7.0376836292E-2f, -1.1514610310E-1f, 1.1676998740E-1f, -1.2420140846E-1f, 1.4249322787E-1f, -1.6668057665E-1f, 2.0000714765E-1f, -2.4999993993E-1f, 3.3333331174E-1f }; __m128 onev = _mm_set1_ps(1.0f); /* all elem = 1.0 */ __m128 v0p5 = _mm_set1_ps(0.5f); /* all elem = 0.5 */ __m128i vneg = _mm_set1_epi32(0x80000000); /* all elem have IEEE sign bit up */ __m128i vexp = _mm_set1_epi32(0x7f800000); /* all elem have IEEE exponent bits up */ __m128i ei; __m128 e; __m128 invalid_mask, zero_mask, inf_mask; /* masks used to handle special IEEE754 inputs */ __m128 mask; __m128 origx; __m128 tmp; __m128 y; __m128 z; /* first, split x apart: x = frexpf(x, &e); */ ei = _mm_srli_epi32( _mm_castps_si128(x), 23); /* shift right 23: IEEE754 floats: ei = biased exponents */ invalid_mask = _mm_castsi128_ps ( _mm_cmpeq_epi32( _mm_and_si128(_mm_castps_si128(x), vneg), vneg)); /* mask any elem that's negative; these become NaN */ zero_mask = _mm_castsi128_ps ( _mm_cmpeq_epi32(ei, _mm_setzero_si128())); /* mask any elem zero or subnormal; these become -inf */ inf_mask = _mm_castsi128_ps ( _mm_cmpeq_epi32( _mm_and_si128(_mm_castps_si128(x), vexp), vexp)); /* mask any elem inf or NaN; log(inf)=inf, log(NaN)=NaN */ origx = x; /* store original x, used for log(inf) = inf, log(NaN) = NaN */ x = _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(~0x7f800000))); /* x now the stored 23 bits of the 24-bit significand */ x = _mm_or_ps (x, v0p5); /* sets hidden bit b[0] */ ei = _mm_sub_epi32(ei, _mm_set1_epi32(126)); /* -127 (ei now signed base-2 exponent); then +1 */ e = _mm_cvtepi32_ps(ei); /* now, calculate the log */ mask = _mm_cmplt_ps(x, _mm_set1_ps(0.707106781186547524f)); /* avoid conditional branches. 
*/ tmp = _mm_and_ps(x, mask); /* tmp contains x values < 0.707, else 0 */ x = _mm_sub_ps(x, onev); e = _mm_sub_ps(e, _mm_and_ps(onev, mask)); x = _mm_add_ps(x, tmp); z = _mm_mul_ps(x,x); y = _mm_set1_ps(cephes_p[0]); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[1])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[2])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[3])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[4])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[5])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[6])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[7])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[8])); y = _mm_mul_ps(y, x); y = _mm_mul_ps(y, z); tmp = _mm_mul_ps(e, _mm_set1_ps(-2.12194440e-4f)); y = _mm_add_ps(y, tmp); tmp = _mm_mul_ps(z, v0p5); y = _mm_sub_ps(y, tmp); tmp = _mm_mul_ps(e, _mm_set1_ps(0.693359375f)); x = _mm_add_ps(x, y); x = _mm_add_ps(x, tmp); /* IEEE754 cleanup: */ x = esl_sse_select_ps(x, origx, inf_mask); /* log(inf)=inf; log(NaN) = NaN */ x = _mm_or_ps(x, invalid_mask); /* log(x<0, including -0,-inf) = NaN */ x = esl_sse_select_ps(x, _mm_set1_ps(-eslINFINITY), zero_mask); /* x zero or subnormal = -inf */ return x; }
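/* Hedged usage sketch (not from the Easel sources): exercise esl_sse_logf() on normal values
   and the documented IEEE754 specials (zero -> -inf, negative -> NaN). It assumes the same
   environment as the function itself, i.e. that eslINFINITY and esl_sse_select_ps() are
   available. */
#include <cstdio>
#include <emmintrin.h>

static void esl_sse_logf_demo() {
    const float in[4] = { 1.0f, 2.718281828f, 0.0f, -1.0f };
    float out[4];
    _mm_storeu_ps(out, esl_sse_logf(_mm_loadu_ps(in)));
    /* expected roughly: 0, 1, -inf, nan */
    for (int i = 0; i < 4; ++i) printf("logf(%g) ~ %g\n", in[i], out[i]);
}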
static void SinCos(const float rad, float &sin, float &cos)
{ // #include <emmintrin.h>, #include <xmmintrin.h>
	const __m128 _ps_fopi = _mm_set1_ps(4.0f / pi);
	const __m128 _ps_0p5 = _mm_set1_ps(0.5f);
	const __m128 _ps_1 = _mm_set1_ps(1.0f);
	const __m128 _ps_dp1 = _mm_set1_ps(-0.78515625f); /* cephes DP1; DP2/DP3 below are its extended-precision corrections */
	const __m128 _ps_dp2 = _mm_set1_ps(-2.4187564849853515625e-4f);
	const __m128 _ps_dp3 = _mm_set1_ps(-3.77489497744594108e-8f);
	const __m128 _ps_sincof_p0 = _mm_set1_ps(-1.9515295891e-4f); /* cephes sincof p0; the coscof_p0 value had been pasted here */
	const __m128 _ps_sincof_p1 = _mm_set1_ps(8.3321608736e-3f);
	const __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611e-1f);
	const __m128 _ps_coscof_p0 = _mm_set1_ps(2.443315711809948e-5f);
	const __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765e-3f);
	const __m128 _ps_coscof_p2 = _mm_set1_ps(4.166664568298827e-2f);
	const __m128i _pi32_1 = _mm_set1_epi32(1);
	const __m128i _pi32_i1 = _mm_set1_epi32(~1);
	const __m128i _pi32_2 = _mm_set1_epi32(2);
	const __m128i _pi32_4 = _mm_set1_epi32(4);
	const __m128 _mask_sign_raw = _mm_castsi128_ps(_mm_set1_epi32( 0x80000000));
	const __m128 _mask_sign_inv = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));
	__m128 mm1, mm2;
	__m128i mmi0, mmi2, mmi4;
	__m128 x, y, z;
	__m128 y1, y2;
	__m128 a = _mm_set1_ps(rad);
	x = _mm_and_ps(a, _mask_sign_inv);
	y = _mm_mul_ps(x, _ps_fopi);
	mmi2 = _mm_cvttps_epi32(y); /* truncate, as in the cephes-derived versions above */
	mmi2 = _mm_add_epi32(mmi2, _pi32_1);
	mmi2 = _mm_and_si128(mmi2, _pi32_i1);
	y = _mm_cvtepi32_ps(mmi2);
	mmi4 = mmi2;
	mmi0 = _mm_and_si128(mmi2, _pi32_4);
	mmi0 = _mm_slli_epi32(mmi0, 29);
	__m128 swap_sign_bit_sin = _mm_castsi128_ps(mmi0);
	mmi2 = _mm_and_si128(mmi2, _pi32_2);
	mmi2 = _mm_cmpeq_epi32(mmi2, _mm_setzero_si128());
	__m128 poly_mask = _mm_castsi128_ps(mmi2);
	x = _mm_add_ps(x, _mm_mul_ps(y, _ps_dp1));
	x = _mm_add_ps(x, _mm_mul_ps(y, _ps_dp2));
	x = _mm_add_ps(x, _mm_mul_ps(y, _ps_dp3));
	mmi4 = _mm_sub_epi32(mmi4, _pi32_2);
	mmi4 = _mm_andnot_si128(mmi4, _pi32_4);
	mmi4 = _mm_slli_epi32(mmi4, 29);
	__m128 sign_bit_cos = _mm_castsi128_ps(mmi4);
	__m128 sign_bit_sin = _mm_xor_ps(_mm_and_ps(a, _mask_sign_raw), swap_sign_bit_sin);
	z = _mm_mul_ps(x, x);
	y1 = _mm_mul_ps(_ps_coscof_p0, z);
	y1 = _mm_add_ps(y1, _ps_coscof_p1);
	y1 = _mm_mul_ps(y1, z);
	y1 = _mm_add_ps(y1, _ps_coscof_p2);
	y1 = _mm_mul_ps(y1, z);
	y1 = _mm_mul_ps(y1, z);
	y1 = _mm_sub_ps(y1, _mm_mul_ps(z, _ps_0p5));
	y1 = _mm_add_ps(y1, _ps_1);
	y2 = _mm_mul_ps(_ps_sincof_p0, z);
	y2 = _mm_add_ps(y2, _ps_sincof_p1);
	y2 = _mm_mul_ps(y2, z);
	y2 = _mm_add_ps(y2, _ps_sincof_p2);
	y2 = _mm_mul_ps(y2, z);
	y2 = _mm_mul_ps(y2, x);
	y2 = _mm_add_ps(y2, x);
	__m128 sin1y = _mm_andnot_ps(poly_mask, y1);
	__m128 sin2y = _mm_and_ps(poly_mask, y2);
	mm1 = _mm_add_ps(sin1y, sin2y);
	mm2 = _mm_add_ps(_mm_sub_ps(y1, sin1y), _mm_sub_ps(y2, sin2y));
	sin = _mm_cvtss_f32(_mm_xor_ps(mm1, sign_bit_sin));
	cos = _mm_cvtss_f32(_mm_xor_ps(mm2, sign_bit_cos));
}
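/* Hedged usage sketch (not part of the original source): SinCos() above broadcasts one
   scalar into all four lanes and extracts lane 0 of the sine and cosine results, so it is a
   scalar convenience wrapper around the vectorised algorithm; `pi` is whatever constant the
   surrounding code defines. A cheap smoke test is the identity sin^2 + cos^2 ~= 1. */
#include <cstdio>

static void SinCos_demo() {
    float s, c;
    SinCos(0.5f, s, c);  /* expect sin ~ 0.4794, cos ~ 0.8776 */
    printf("sin=%g cos=%g sin^2+cos^2=%g\n", s, c, s * s + c * c);
}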
int main() {
	//Transpose
	vec4 mat[4] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}, {13, 14, 15, 16}};
	__m128i xmm0 = _mm_unpacklo_epi32(_mm_castps_si128(mat[0]), _mm_castps_si128(mat[1]));
	__m128i xmm1 = _mm_unpackhi_epi32(_mm_castps_si128(mat[0]), _mm_castps_si128(mat[1]));
	__m128i xmm2 = _mm_unpacklo_epi32(_mm_castps_si128(mat[2]), _mm_castps_si128(mat[3]));
	__m128i xmm3 = _mm_unpackhi_epi32(_mm_castps_si128(mat[2]), _mm_castps_si128(mat[3]));
	vec4 trans[4];
	trans[0] = _mm_castsi128_ps(_mm_unpacklo_epi64(xmm0, xmm2));
	trans[1] = _mm_castsi128_ps(_mm_unpackhi_epi64(xmm0, xmm2));
	trans[2] = _mm_castsi128_ps(_mm_unpacklo_epi64(xmm1, xmm3));
	trans[3] = _mm_castsi128_ps(_mm_unpackhi_epi64(xmm1, xmm3));
	vec4 trans2[4];
	ml::transpose(trans2, mat);
	FILE* file = fopen("..\\..\\AppData\\VT.swf", "rb");
	fseek(file, 0, SEEK_END);
	size_t size = ftell(file);
	fseek(file, 0, SEEK_SET);
	unsigned char* fileData = (unsigned char*)malloc(size);
	fread(fileData, 1, size, file);
	fclose(file);
	MemReader data = {(const char*)fileData, (const char*)fileData+size, (const char*)fileData};
	//Read SWF header
	const u32 signatureAndVersion = data.read<u32>();
	const u32 actualSize = data.read<u32>();
	u32 signature = signatureAndVersion&0x00FFFFFF;
	u8 version = signatureAndVersion>>24;
	bool isCompressed = signature=='\0SWC';
	bool isUncompressed = signature=='\0SWF';
	//if !isCompressed && !isUncompressed return error;
	MemReader data2 = {0, 0, 0};
	char* uncompressed = 0;
	if (isCompressed) {
		uncompressed = (char*)malloc(actualSize-8);
		data2.cur = data2.start = uncompressed;
		data2.end = uncompressed+actualSize-8;
		uLongf uncompressedSize = actualSize-8;
		uncompress((Bytef*)uncompressed, &uncompressedSize, data.as<Bytef>(), size-8);
	} else if (isUncompressed) { //was "else if (isCompressed)", which left this branch unreachable
		data2.cur = data2.start = data.as<char>();
		data2.end = data2.start+actualSize-8;
	}
	u8 bits = data2.read<u8>();
	u8 numBits = bits>>3;
	u32 rectSizeMinusOne = (numBits*4+5)>>3;
	data2.move(rectSizeMinusOne);
	const u16 frameRate = data2.read<u16>();
	const u16 frameCount = data2.read<u16>();
	std::set<u32> tagsUsed;
	size_t tagCount = 0;
	while (data2.cur!=data2.end) {
		u16 tagHeader = data2.read<u16>();
		u32 tagLength = tagHeader&0x3F;
		u32 tagType = tagHeader>>6;
		tagsUsed.insert(tagType);
		if (tagLength==0x3F) tagLength = data2.read<u32>();
		data2.move(tagLength);
		parseTag(tagType);
		++tagCount;
	}
	if (uncompressed) free(uncompressed);
	printf("\nProcessed %zu tags\n\n", tagCount); //tagCount is a size_t, so %zu rather than %d
	printf(" Tags used \n");
	printf("-------------------------\n");
	std::set<u32>::iterator it = tagsUsed.begin(), end = tagsUsed.end();
	for (; it!=end; ++it) {
		parseTag(*it);
	}
	free(fileData);
}
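/* Hedged aside (not part of the original source): the unpacklo/unpackhi sequence at the top
   of main() is the integer-domain version of the standard 4x4 float transpose. Kept in the
   float domain, the stock _MM_TRANSPOSE4_PS macro from <xmmintrin.h> performs the same
   shuffle: */
#include <xmmintrin.h>

static void transpose4_demo(__m128 row0, __m128 row1, __m128 row2, __m128 row3) {
    _MM_TRANSPOSE4_PS(row0, row1, row2, row3); /* rows now hold the former columns */
    (void)row0; (void)row1; (void)row2; (void)row3;
}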
/* Function: esl_sse_expf() * Synopsis: <r[z] = exp x[z]> * Incept: SRE, Fri Dec 14 14:46:27 2007 [Janelia] * * Purpose: Given a vector <x> containing four floats, returns a * vector <r> in which each element <r[z] = expf(x[z])>. * * Valid for all IEEE754 floats $x_z$. * * Xref: J2/71 * J10/62: bugfix, minlogf/maxlogf range was too wide; * (k+127) must be >=0 and <=255, so (k+127)<<23 * is a valid IEEE754 float, without touching * the sign bit. Pommier had this right in the * first place, and I didn't understand. * * Note: Derived from an SSE1 implementation by Julian * Pommier. Converted to SSE2. * * Note on maxlogf/minlogf, which are close to but not * exactly 127.5/log2 [J10/63]. We need -127<=k<=128, so * k+127 is 0..255, a valid IEEE754 8-bit exponent * (0..255), so the bit pattern (k+127)<<23 is IEEE754 * single-precision for 2^k. If k=-127, we get IEEE754 0. * If k=128, we get IEEE754 +inf. If k<-127, k+127 is * negative and we get screwed up. If k>128, k+127 * overflows the 8-bit exponent and sets the sign bit. So * for x' (base 2) < -127.5 we must definitely return e^x ~ * 0; for x' < 126.5 we're going to calculate 0 anyway * (because k=floor(-126.5-epsilon+0.5) = -127). So any * minlogf between -126.5 log2 ... -127.5 log2 will suffice * as the cutoff. Ditto for 126.5 log2 .. 127.5log2. * That's 87.68312 .. 88.3762655. I think Pommier's * thinking is, you don't want to get to close to the * edges, lest fp roundoff error screw you (he may have * consider 1 ulp carefully, I can't tell), but otherwise * you may as well put your bounds close to the outer edge; * so * maxlogf = 127.5 log(2) - epsilon * minlogf = -127.5 log(2) + epsilon * for an epsilon that happen to be ~ 3e-6. */ __m128 esl_sse_expf(__m128 x) { static float cephes_p[6] = { 1.9875691500E-4f, 1.3981999507E-3f, 8.3334519073E-3f, 4.1665795894E-2f, 1.6666665459E-1f, 5.0000001201E-1f }; static float cephes_c[2] = { 0.693359375f, -2.12194440e-4f }; static float maxlogf = 88.3762626647949f; /* 127.5 log(2) - epsilon. above this, 0.5+x/log2 gives k>128 and breaks 2^k "float" construction, because (k+127)<<23 must be a valid IEEE754 exponent 0..255 */ static float minlogf = -88.3762626647949f; /*-127.5 log(2) + epsilon. below this, 0.5+x/log2 gives k<-127 and breaks 2^k, see above */ __m128i k; __m128 mask, tmp, fx, z, y, minmask, maxmask; /* handle out-of-range and special conditions */ maxmask = _mm_cmpgt_ps(x, _mm_set1_ps(maxlogf)); minmask = _mm_cmple_ps(x, _mm_set1_ps(minlogf)); /* range reduction: exp(x) = 2^k e^f = exp(f + k log 2); k = floorf(0.5 + x / log2): */ fx = _mm_mul_ps(x, _mm_set1_ps(eslCONST_LOG2R)); fx = _mm_add_ps(fx, _mm_set1_ps(0.5f)); /* floorf() with SSE: */ k = _mm_cvttps_epi32(fx); /* cast to int with truncation */ tmp = _mm_cvtepi32_ps(k); /* cast back to float */ mask = _mm_cmpgt_ps(tmp, fx); /* if it increased (i.e. if it was negative...) */ mask = _mm_and_ps(mask, _mm_set1_ps(1.0f)); /* ...without a conditional branch... */ fx = _mm_sub_ps(tmp, mask); /* then subtract one. */ k = _mm_cvttps_epi32(fx); /* k is now ready for the 2^k part. 
*/ /* polynomial approx for e^f for f in range [-0.5, 0.5] */ tmp = _mm_mul_ps(fx, _mm_set1_ps(cephes_c[0])); z = _mm_mul_ps(fx, _mm_set1_ps(cephes_c[1])); x = _mm_sub_ps(x, tmp); x = _mm_sub_ps(x, z); z = _mm_mul_ps(x, x); y = _mm_set1_ps(cephes_p[0]); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[1])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[2])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[3])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[4])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[5])); y = _mm_mul_ps(y, z); y = _mm_add_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(1.0f)); /* build 2^k by hand, by creating a IEEE754 float */ k = _mm_add_epi32(k, _mm_set1_epi32(127)); k = _mm_slli_epi32(k, 23); fx = _mm_castsi128_ps(k); /* put 2^k e^f together (fx = 2^k, y = e^f) and we're done */ y = _mm_mul_ps(y, fx); /* special/range cleanup */ y = esl_sse_select_ps(y, _mm_set1_ps(eslINFINITY), maxmask); /* exp(x) = inf for x > log(2^128) */ y = esl_sse_select_ps(y, _mm_set1_ps(0.0f), minmask); /* exp(x) = 0 for x < log(2^-149) */ return y; }
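/* Hedged usage sketch (not from the Easel sources): exercise esl_sse_expf(), including the
   range cleanup described above (inputs above ~88.38 return +inf, inputs below ~-88.38
   return 0). Assumes eslINFINITY, eslCONST_LOG2R and esl_sse_select_ps() are available, as
   in the function itself. */
#include <cstdio>
#include <emmintrin.h>

static void esl_sse_expf_demo() {
    const float in[4] = { 0.0f, 1.0f, 100.0f, -100.0f };  /* last two are out of range */
    float out[4];
    _mm_storeu_ps(out, esl_sse_expf(_mm_loadu_ps(in)));
    /* expected roughly: 1, 2.71828, inf, 0 */
    for (int i = 0; i < 4; ++i) printf("expf(%g) ~ %g\n", in[i], out[i]);
}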
static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; __m128i abs_p1p0; const __m128i thresh = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); const __m128i blimit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); q4p4 = _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p))); q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); q3p3 = _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p))); q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); q2p2 = _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p))); q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); q1p1 = _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p))); p1q1 = _mm_shuffle_epi32(q1p1, 78); q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); q0p0 = _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p))); p0q0 = _mm_shuffle_epi32(q0p0, 78); { __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1)); abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); fe = _mm_set1_epi8(0xfe); ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0)); abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1)); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); hev = _mm_subs_epu8(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)), _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3))); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); mask = _mm_subs_epu8(mask, limit); mask = _mm_cmpeq_epi8(mask, zero); } // lp filter { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); const __m128i t80 = _mm_set1_epi8(0x80); const __m128i t1 = _mm_set1_epi16(0x1); __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); __m128i qs0 = _mm_xor_si128(p0q0, t80); __m128i qs1 = _mm_xor_si128(p1q1, t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); work_a = _mm_subs_epi8(qs0, qs0ps0); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); /* (vpx_filter + 3 * (qs0 - ps0)) & mask */ filt = _mm_and_si128(filt, mask); filter1 = _mm_adds_epi8(filt, t4); filter2 = _mm_adds_epi8(filt, t3); filter1 = _mm_unpacklo_epi8(zero, filter1); filter1 = _mm_srai_epi16(filter1, 0xB); filter2 = 
_mm_unpacklo_epi8(zero, filter2); filter2 = _mm_srai_epi16(filter2, 0xB); /* Filter1 >> 3 */ filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); /* filt >> 1 */ filt = _mm_adds_epi16(filter1, t1); filt = _mm_srai_epi16(filt, 1); filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt); filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); // loopfilter done { __m128i work; flat = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)), _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3))); flat = _mm_max_epu8(abs_p1p0, flat); flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); q5p5 = _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p))); q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); q6p6 = _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p))); flat2 = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)), _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5))); q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); q7p7 = _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p))); work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)), _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7))); flat2 = _mm_max_epu8(work, flat2); flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); flat2 = _mm_subs_epu8(flat2, one); flat2 = _mm_cmpeq_epi8(flat2, zero); flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // flat and wide flat calculations { const __m128i eight = _mm_set1_epi16(8); const __m128i four = _mm_set1_epi16(4); __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; __m128i pixelFilter_p, pixelFilter_q; __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; p7_16 = _mm_unpacklo_epi8(q7p7, zero); p6_16 = _mm_unpacklo_epi8(q6p6, zero); p5_16 = _mm_unpacklo_epi8(q5p5, zero); p4_16 = _mm_unpacklo_epi8(q4p4, zero); p3_16 = _mm_unpacklo_epi8(q3p3, zero); p2_16 = _mm_unpacklo_epi8(q2p2, zero); p1_16 = _mm_unpacklo_epi8(q1p1, zero); p0_16 = _mm_unpacklo_epi8(q0p0, zero); q0_16 = _mm_unpackhi_epi8(q0p0, zero); q1_16 = _mm_unpackhi_epi8(q1p1, zero); q2_16 = _mm_unpackhi_epi8(q2p2, zero); q3_16 = _mm_unpackhi_epi8(q3p3, zero); q4_16 = _mm_unpackhi_epi8(q4p4, zero); q5_16 = _mm_unpackhi_epi8(q5p5, zero); q6_16 = _mm_unpackhi_epi8(q6p6, zero); q7_16 = _mm_unpackhi_epi8(q7p7, zero); pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), _mm_add_epi16(p4_16, p3_16)); pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), _mm_add_epi16(q4_16, q3_16)); pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); pixetFilter_p2p1p0 = _mm_add_epi16( four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); res_p = _mm_srli_epi16( 
_mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4); res_q = _mm_srli_epi16( _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4); flat2_q0p0 = _mm_packus_epi16(res_p, res_q); res_p = _mm_srli_epi16( _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3); res_q = _mm_srli_epi16( _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); flat_q0p0 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(p7_16, p7_16); sum_q7 = _mm_add_epi16(q7_16, q7_16); sum_p3 = _mm_add_epi16(p3_16, p3_16); sum_q3 = _mm_add_epi16(q3_16, q3_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4); res_q = _mm_srli_epi16( _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4); flat2_q1p1 = _mm_packus_epi16(res_p, res_q); pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); res_q = _mm_srli_epi16( _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); flat_q1p1 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); sum_q7 = _mm_add_epi16(sum_q7, q7_16); sum_p3 = _mm_add_epi16(sum_p3, p3_16); sum_q3 = _mm_add_epi16(sum_q3, q3_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4); res_q = _mm_srli_epi16( _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4); flat2_q2p2 = _mm_packus_epi16(res_p, res_q); pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); res_q = _mm_srli_epi16( _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); flat_q2p2 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); sum_q7 = _mm_add_epi16(sum_q7, q7_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4); res_q = _mm_srli_epi16( _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4); flat2_q3p3 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); sum_q7 = _mm_add_epi16(sum_q7, q7_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4); res_q = _mm_srli_epi16( _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4); flat2_q4p4 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); sum_q7 = _mm_add_epi16(sum_q7, q7_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4); res_q = _mm_srli_epi16( _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4); flat2_q5p5 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); sum_q7 = _mm_add_epi16(sum_q7, q7_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixelFilter_p, 
_mm_add_epi16(sum_p7, p6_16)), 4); res_q = _mm_srli_epi16( _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4); flat2_q6p6 = _mm_packus_epi16(res_p, res_q); } // wide flat // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat = _mm_shuffle_epi32(flat, 68); flat2 = _mm_shuffle_epi32(flat2, 68); q2p2 = _mm_andnot_si128(flat, q2p2); flat_q2p2 = _mm_and_si128(flat, flat_q2p2); q2p2 = _mm_or_si128(q2p2, flat_q2p2); qs1ps1 = _mm_andnot_si128(flat, qs1ps1); flat_q1p1 = _mm_and_si128(flat, flat_q1p1); q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); qs0ps0 = _mm_andnot_si128(flat, qs0ps0); flat_q0p0 = _mm_and_si128(flat, flat_q0p0); q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); q6p6 = _mm_andnot_si128(flat2, q6p6); flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); q6p6 = _mm_or_si128(q6p6, flat2_q6p6); _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); q5p5 = _mm_andnot_si128(flat2, q5p5); flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); q5p5 = _mm_or_si128(q5p5, flat2_q5p5); _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); q4p4 = _mm_andnot_si128(flat2, q4p4); flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); q4p4 = _mm_or_si128(q4p4, flat2_q4p4); _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); q3p3 = _mm_andnot_si128(flat2, q3p3); flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); q3p3 = _mm_or_si128(q3p3, flat2_q3p3); _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); q2p2 = _mm_andnot_si128(flat2, q2p2); flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); q2p2 = _mm_or_si128(q2p2, flat2_q2p2); _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); q1p1 = _mm_andnot_si128(flat2, q1p1); flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); q1p1 = _mm_or_si128(q1p1, flat2_q1p1); _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); q0p0 = _mm_andnot_si128(flat2, q0p0); flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); q0p0 = _mm_or_si128(q0p0, flat2_q0p0); _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); } }
INLINE __m128 shade(BilinearSamplePos const& bsp, const SWR_TRIANGLE_DESC & work, WideVector<BilinearSamplePos::NUM_ATTRIBUTES, __m128> const& pAttrs, BYTE* pBuffer, BYTE*, UINT*) { TextureView *pTxv = (TextureView*)work.pTextureViews[KNOB_NUMBER_OF_TEXTURE_VIEWS + 0]; Sampler *pSmp = (Sampler*)work.pSamplers[0]; TexCoord tcidx; tcidx.U = get<4>(pAttrs); tcidx.V = get<5>(pAttrs); UINT mips[] = {0,0,0,0}; WideColor color; SampleSimplePointRGBAF32(*pTxv, *pSmp, tcidx, mips, color); // modulate color.R = _mm_mul_ps(color.R, get<0>(pAttrs)); color.G = _mm_mul_ps(color.G, get<1>(pAttrs)); color.B = _mm_mul_ps(color.B, get<2>(pAttrs)); color.A = _mm_mul_ps(color.A, get<3>(pAttrs)); // convert float to unorm __m128i r = vFloatToUnorm( color.R ); __m128i g = vFloatToUnorm( color.G ); __m128i b = vFloatToUnorm( color.B ); __m128i a = vFloatToUnorm( color.A ); // pack __m128i vPixel = b; vPixel = _mm_or_si128(vPixel, _mm_slli_epi32(g, 8)); vPixel = _mm_or_si128(vPixel, _mm_slli_epi32(r, 16)); vPixel = _mm_or_si128(vPixel, _mm_slli_epi32(a, 24)); // blend with GL_ONE and GL_ONE if (bsp.sFactor == GL_ONE && bsp.dFactor == GL_ONE) { __m128i vColorBuffer = _mm_load_si128((const __m128i*)pBuffer); vPixel = _mm_adds_epu8(vPixel, vColorBuffer); } if (bsp.sFactor == GL_SRC_ALPHA && bsp.dFactor == GL_ONE_MINUS_SRC_ALPHA) { const __m128i SHUF_ALPHA = _mm_set_epi32(0x8080800f, 0x8080800b, 0x80808007, 0x80808003); const __m128i SHUF_RED = _mm_set_epi32(0x8080800e, 0x8080800a, 0x80808006, 0x80808002); const __m128i SHUF_GREEN = _mm_set_epi32(0x8080800d, 0x80808009, 0x80808005, 0x80808001); const __m128i SHUF_BLUE = _mm_set_epi32(0x8080800c, 0x80808008, 0x80808004, 0x80808000); // mul by src_alpha __m128 vSrcRedF = _mm_mul_ps(color.R, color.A); __m128 vSrcGreenF = _mm_mul_ps(color.G, color.A); __m128 vSrcBlueF = _mm_mul_ps(color.B, color.A); // convert to int __m128i vSrcRed = vFloatToUnorm(vSrcRedF); __m128i vSrcGreen = vFloatToUnorm(vSrcGreenF); __m128i vSrcBlue = vFloatToUnorm(vSrcBlueF); __m128i vSrcAlpha = vFloatToUnorm(color.A); // pack __m128i vSrcPixel = vSrcBlue; vSrcPixel = _mm_or_si128(vSrcPixel, _mm_slli_epi32(vSrcGreen, 8)); vSrcPixel = _mm_or_si128(vSrcPixel, _mm_slli_epi32(vSrcRed, 16)); vSrcPixel = _mm_or_si128(vSrcPixel, _mm_slli_epi32(vSrcAlpha, 24)); // shuffle dst R,G,B,A __m128i vColorBuffer = _mm_load_si128((const __m128i*)pBuffer); __m128i vDstAlpha = _mm_shuffle_epi8(vColorBuffer, SHUF_ALPHA); __m128i vDstRed = _mm_shuffle_epi8(vColorBuffer, SHUF_RED); __m128i vDstGreen = _mm_shuffle_epi8(vColorBuffer, SHUF_GREEN); __m128i vDstBlue = _mm_shuffle_epi8(vColorBuffer, SHUF_BLUE); // convert to float __m128 vDstAlphaF = _mm_cvtepi32_ps(vDstAlpha); __m128 vDstRedF = _mm_cvtepi32_ps(vDstRed); __m128 vDstGreenF = _mm_cvtepi32_ps(vDstGreen); __m128 vDstBlueF = _mm_cvtepi32_ps(vDstBlue); // mul by 1-src_alpha __m128 vOneMinusSrcAlphaF = _mm_sub_ps(_mm_set1_ps(1.0f), color.A); vDstAlphaF = _mm_mul_ps(vDstAlphaF, vOneMinusSrcAlphaF); vDstRedF = _mm_mul_ps(vDstRedF, vOneMinusSrcAlphaF); vDstGreenF = _mm_mul_ps(vDstGreenF, vOneMinusSrcAlphaF); vDstBlueF = _mm_mul_ps(vDstBlueF, vOneMinusSrcAlphaF); // convert to int vDstAlpha = _mm_cvtps_epi32(vDstAlphaF); vDstRed = _mm_cvtps_epi32(vDstRedF); vDstGreen = _mm_cvtps_epi32(vDstGreenF); vDstBlue = _mm_cvtps_epi32(vDstBlueF); // pack __m128i vDstPixel = vDstBlue; vDstPixel = _mm_or_si128(vDstPixel, _mm_slli_epi32(vDstGreen, 8)); vDstPixel = _mm_or_si128(vDstPixel, _mm_slli_epi32(vDstRed, 16)); vDstPixel = _mm_or_si128(vDstPixel, _mm_slli_epi32(vDstAlpha, 
24)); // final rgba = min(src + dst,255) vPixel = _mm_adds_epu8(vSrcPixel, vDstPixel); } return _mm_castsi128_ps(vPixel); }
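/* Hedged reference (not part of the original source): the GL_SRC_ALPHA /
   GL_ONE_MINUS_SRC_ALPHA path above is the usual "over" blend,
   out = src*src_a + dst*(1-src_a), done per channel in 8-bit fixed point with a saturating
   add at the end. A scalar equivalent for one channel with values in [0,1]: */
static float blend_src_alpha(float src, float dst, float src_a) {
    return src * src_a + dst * (1.0f - src_a);
}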
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them.. it is almost as fast, and gives you a free cosine with your sine */ void sincos_ps(__m128 x, __m128* s, __m128* c) { typedef __m128 v4sf; typedef __m128i v4si; v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y; v4si emm0, emm2, emm4; sign_bit_sin = x; /* take the absolute value */ x = _mm_and_ps(x, constants::inv_sign_mask.ps); /* extract the sign bit (upper one) */ sign_bit_sin = _mm_and_ps(sign_bit_sin, constants::sign_mask.ps); /* scale by 4/Pi */ y = _mm_mul_ps(x, constants::cephes_FOPI.ps); /* store the integer part of y in emm2 */ emm2 = _mm_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ emm2 = _mm_add_epi32(emm2, constants::pi32_1.pi); emm2 = _mm_and_si128(emm2, constants::pi32_inv1.pi); y = _mm_cvtepi32_ps(emm2); emm4 = emm2; /* get the swap sign flag for the sine */ emm0 = _mm_and_si128(emm2, constants::pi32_4.pi); emm0 = _mm_slli_epi32(emm0, 29); v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0); /* get the polynom selection mask for the sine*/ emm2 = _mm_and_si128(emm2, constants::pi32_2.pi); emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); v4sf poly_mask = _mm_castsi128_ps(emm2); /* The magic pass: "******" x = ((x - y * DP1) - y * DP2) - y * DP3; */ xmm1 = constants::minus_cephes_DP1.ps; xmm2 = constants::minus_cephes_DP2.ps; xmm3 = constants::minus_cephes_DP3.ps; xmm1 = _mm_mul_ps(y, xmm1); xmm2 = _mm_mul_ps(y, xmm2); xmm3 = _mm_mul_ps(y, xmm3); x = _mm_add_ps(x, xmm1); x = _mm_add_ps(x, xmm2); x = _mm_add_ps(x, xmm3); emm4 = _mm_sub_epi32(emm4, constants::pi32_2.pi); emm4 = _mm_andnot_si128(emm4, constants::pi32_4.pi); emm4 = _mm_slli_epi32(emm4, 29); v4sf sign_bit_cos = _mm_castsi128_ps(emm4); sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); /* Evaluate the first polynom (0 <= x <= Pi/4) */ v4sf z = _mm_mul_ps(x,x); y = constants::coscof_p0.ps; y = _mm_mul_ps(y, z); y = _mm_add_ps(y, constants::coscof_p1.ps); y = _mm_mul_ps(y, z); y = _mm_add_ps(y, constants::coscof_p2.ps); y = _mm_mul_ps(y, z); y = _mm_mul_ps(y, z); v4sf tmp = _mm_mul_ps(z, constants::ps_0p5.ps); y = _mm_sub_ps(y, tmp); y = _mm_add_ps(y, constants::ps_1.ps); /* Evaluate the second polynom (Pi/4 <= x <= 0) */ v4sf y2 = constants::sincof_p0.ps; y2 = _mm_mul_ps(y2, z); y2 = _mm_add_ps(y2, constants::sincof_p1.ps); y2 = _mm_mul_ps(y2, z); y2 = _mm_add_ps(y2, constants::sincof_p2.ps); y2 = _mm_mul_ps(y2, z); y2 = _mm_mul_ps(y2, x); y2 = _mm_add_ps(y2, x); /* select the correct result from the two polynoms */ xmm3 = poly_mask; v4sf ysin2 = _mm_and_ps(xmm3, y2); v4sf ysin1 = _mm_andnot_ps(xmm3, y); y2 = _mm_sub_ps(y2,ysin2); y = _mm_sub_ps(y, ysin1); xmm1 = _mm_add_ps(ysin1,ysin2); xmm2 = _mm_add_ps(y,y2); /* update the sign */ *s = _mm_xor_ps(xmm1, sign_bit_sin); *c = _mm_xor_ps(xmm2, sign_bit_cos); }
void sincos_ps(__m128 x, __m128 *s, __m128 *c) { __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y; __m128i emm0, emm2, emm4; sign_bit_sin = x; x = _mm_and_ps(x, *reinterpret_cast<const __m128*>(_pi_inv_sign_mask)); sign_bit_sin = _mm_and_ps(sign_bit_sin, *reinterpret_cast<const __m128*>(_pi_sign_mask)); y = _mm_mul_ps(x, *_ps_cephes_FOPI); emm2 = _mm_cvttps_epi32(y); emm2 = _mm_add_epi32(emm2, *_pi_1); emm2 = _mm_and_si128(emm2, *_pi_inv1); y = _mm_cvtepi32_ps(emm2); emm4 = emm2; emm0 = _mm_and_si128(emm2, *_pi_4); emm0 = _mm_slli_epi32(emm0, 29); __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0); emm2 = _mm_and_si128(emm2, *_pi_2); emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); __m128 poly_mask = _mm_castsi128_ps(emm2); xmm1 = *_ps_minus_cephes_DP1; xmm2 = *_ps_minus_cephes_DP2; xmm3 = *_ps_minus_cephes_DP3; xmm1 = _mm_mul_ps(y, xmm1); xmm2 = _mm_mul_ps(y, xmm2); xmm3 = _mm_mul_ps(y, xmm3); x = _mm_add_ps(x, xmm1); x = _mm_add_ps(x, xmm2); x = _mm_add_ps(x, xmm3); emm4 = _mm_sub_epi32(emm4, *_pi_2); emm4 = _mm_andnot_si128(emm4, *_pi_4); emm4 = _mm_slli_epi32(emm4, 29); __m128 sign_bit_cos = _mm_castsi128_ps(emm4); sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); __m128 z = _mm_mul_ps(x, x); y = *_ps_coscof_p0; y = _mm_mul_ps(y, z); y = _mm_add_ps(y, *_ps_coscof_p1); y = _mm_mul_ps(y, z); y = _mm_add_ps(y, *_ps_coscof_p2); y = _mm_mul_ps(y, z); y = _mm_mul_ps(y, z); __m128 tmp = _mm_mul_ps(z, *_ps_0p5); y = _mm_sub_ps(y, tmp); y = _mm_add_ps(y, *_ps_1); __m128 y2 = *_ps_sincof_p0; y2 = _mm_mul_ps(y2, z); y2 = _mm_add_ps(y2, *_ps_sincof_p1); y2 = _mm_mul_ps(y2, z); y2 = _mm_add_ps(y2, *_ps_sincof_p2); y2 = _mm_mul_ps(y2, z); y2 = _mm_mul_ps(y2, x); y2 = _mm_add_ps(y2, x); xmm3 = poly_mask; __m128 ysin2 = _mm_and_ps(xmm3, y2); __m128 ysin1 = _mm_andnot_ps(xmm3, y); y2 = _mm_sub_ps(y2, ysin2); y = _mm_sub_ps(y, ysin1); xmm1 = _mm_add_ps(ysin1, ysin2); xmm2 = _mm_add_ps(y, y2); *s = _mm_xor_ps(xmm1, sign_bit_sin); *c = _mm_xor_ps(xmm2, sign_bit_cos); }
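/* Hedged usage sketch (not part of the original sources): both sincos_ps variants above
   share the signature (__m128 x, __m128* s, __m128* c), so one call yields four sines and
   four cosines. Checking sin^2 + cos^2 ~= 1 per lane is a cheap sanity test. */
#include <cstdio>
#include <emmintrin.h>

static void sincos_ps_demo() {
    const float in[4] = { -1.0f, 0.0f, 1.0f, 2.0f };
    float s[4], c[4];
    __m128 vs, vc;
    sincos_ps(_mm_loadu_ps(in), &vs, &vc);
    _mm_storeu_ps(s, vs);
    _mm_storeu_ps(c, vc);
    for (int i = 0; i < 4; ++i)
        printf("x=%g sin=%g cos=%g s^2+c^2=%g\n", in[i], s[i], c[i], s[i] * s[i] + c[i] * c[i]);
}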
__m128 exp_ps(v4sfu *xPtr) { __m128 x=*((__m128 *)xPtr); __m128 tmp = _mm_setzero_ps(), fx; #ifdef USE_SSE2 __m128i emm0; #else __m64 mm0, mm1; #endif __m128 one = *(__m128*)_ps_1; x = _mm_min_ps(x, *(__m128*)_ps_exp_hi); x = _mm_max_ps(x, *(__m128*)_ps_exp_lo); /* express exp(x) as exp(g + n*log(2)) */ fx = _mm_mul_ps(x, *(__m128*)_ps_cephes_LOG2EF); fx = _mm_add_ps(fx, *(__m128*)_ps_0p5); /* how to perform a floorf with SSE: just below */ #ifndef USE_SSE2 /* step 1 : cast to int */ tmp = _mm_movehl_ps(tmp, fx); mm0 = _mm_cvttps_pi32(fx); mm1 = _mm_cvttps_pi32(tmp); /* step 2 : cast back to float */ tmp = _mm_cvtpi32x2_ps(mm0, mm1); #else emm0 = _mm_cvttps_epi32(fx); tmp = _mm_cvtepi32_ps(emm0); #endif /* if greater, substract 1 */ __m128 mask = _mm_cmpgt_ps(tmp, fx); mask = _mm_and_ps(mask, one); fx = _mm_sub_ps(tmp, mask); tmp = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C1); __m128 z = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C2); x = _mm_sub_ps(x, tmp); x = _mm_sub_ps(x, z); z = _mm_mul_ps(x,x); __m128 y = *(__m128*)_ps_cephes_exp_p0; y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p1); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p2); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p3); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p4); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p5); y = _mm_mul_ps(y, z); y = _mm_add_ps(y, x); y = _mm_add_ps(y, one); /* build 2^n */ #ifndef USE_SSE2 z = _mm_movehl_ps(z, fx); mm0 = _mm_cvttps_pi32(fx); mm1 = _mm_cvttps_pi32(z); mm0 = _mm_add_pi32(mm0, *(__m64*)_pi32_0x7f); mm1 = _mm_add_pi32(mm1, *(__m64*)_pi32_0x7f); mm0 = _mm_slli_pi32(mm0, 23); mm1 = _mm_slli_pi32(mm1, 23); __m128 pow2n; COPY_MM_TO_XMM(mm0, mm1, pow2n); _mm_empty(); #else emm0 = _mm_cvttps_epi32(fx); emm0 = _mm_add_epi32(emm0, *(__m128i*)_pi32_0x7f); emm0 = _mm_slli_epi32(emm0, 23); __m128 pow2n = _mm_castsi128_ps(emm0); #endif y = _mm_mul_ps(y, pow2n); return y; }
void spu_rotmahi( SPU_t* SPU, SPU_INSTRUCTION op ) { const int s = 0x1f&( 0 - SignExtend(op.RI7.I7, 7) ); SPU->GPR[op.RI7.RT] = _mm_castsi128_ps( _mm_srai_epi16( _mm_castps_si128( SPU->GPR[op.RI7.RA] ), s ) ); }
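/* Hedged reference sketch (not part of the original source): spu_rotmahi above appears to
   implement the SPU "rotate and mask algebraic halfword immediate" operation, i.e. an
   arithmetic right shift of each 16-bit element by (0 - I7) & 0x1f, where counts of 16..31
   fill with the sign bit -- exactly what _mm_srai_epi16 does for counts >= 16. A per-element
   scalar equivalent: */
#include <cstdint>

static int16_t rotmahi_ref(int16_t ra, int imm7_sign_extended) {
    const int s = 0x1f & (0 - imm7_sign_extended);   /* same count computation as above */
    if (s >= 16) return (int16_t)(ra < 0 ? -1 : 0);  /* saturate to all sign bits */
    return (int16_t)(ra >> s);                       /* arithmetic shift on common targets */
}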
/* almost the same as sin_ps */ __m128 cos_ps(v4sfu *xPtr) { // any x __m128 x=*((__m128 *)xPtr); __m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, y; #ifdef USE_SSE2 __m128i emm0, emm2; #else __m64 mm0, mm1, mm2, mm3; #endif /* take the absolute value */ x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask); /* scale by 4/Pi */ y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI); #ifdef USE_SSE2 /* store the integer part of y in mm0 */ emm2 = _mm_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1); emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1); y = _mm_cvtepi32_ps(emm2); emm2 = _mm_sub_epi32(emm2, *(__m128i*)_pi32_2); /* get the swap sign flag */ emm0 = _mm_andnot_si128(emm2, *(__m128i*)_pi32_4); emm0 = _mm_slli_epi32(emm0, 29); /* get the polynom selection mask */ emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2); emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); __m128 sign_bit = _mm_castsi128_ps(emm0); __m128 poly_mask = _mm_castsi128_ps(emm2); #else /* store the integer part of y in mm0:mm1 */ xmm2 = _mm_movehl_ps(xmm2, y); mm2 = _mm_cvttps_pi32(y); mm3 = _mm_cvttps_pi32(xmm2); /* j=(j+1) & (~1) (see the cephes sources) */ mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1); mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1); mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1); mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1); y = _mm_cvtpi32x2_ps(mm2, mm3); mm2 = _mm_sub_pi32(mm2, *(__m64*)_pi32_2); mm3 = _mm_sub_pi32(mm3, *(__m64*)_pi32_2); /* get the swap sign flag in mm0:mm1 and the polynom selection mask in mm2:mm3 */ mm0 = _mm_andnot_si64(mm2, *(__m64*)_pi32_4); mm1 = _mm_andnot_si64(mm3, *(__m64*)_pi32_4); mm0 = _mm_slli_pi32(mm0, 29); mm1 = _mm_slli_pi32(mm1, 29); mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2); mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2); mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64()); mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64()); __m128 sign_bit, poly_mask; COPY_MM_TO_XMM(mm0, mm1, sign_bit); COPY_MM_TO_XMM(mm2, mm3, poly_mask); _mm_empty(); /* good-bye mmx */ #endif /* The magic pass: "******" x = ((x - y * DP1) - y * DP2) - y * DP3; */ xmm1 = *(__m128*)_ps_minus_cephes_DP1; xmm2 = *(__m128*)_ps_minus_cephes_DP2; xmm3 = *(__m128*)_ps_minus_cephes_DP3; xmm1 = _mm_mul_ps(y, xmm1); xmm2 = _mm_mul_ps(y, xmm2); xmm3 = _mm_mul_ps(y, xmm3); x = _mm_add_ps(x, xmm1); x = _mm_add_ps(x, xmm2); x = _mm_add_ps(x, xmm3); /* Evaluate the first polynom (0 <= x <= Pi/4) */ y = *(__m128*)_ps_coscof_p0; __m128 z = _mm_mul_ps(x,x); y = _mm_mul_ps(y, z); y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1); y = _mm_mul_ps(y, z); y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2); y = _mm_mul_ps(y, z); y = _mm_mul_ps(y, z); __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5); y = _mm_sub_ps(y, tmp); y = _mm_add_ps(y, *(__m128*)_ps_1); /* Evaluate the second polynom (Pi/4 <= x <= 0) */ __m128 y2 = *(__m128*)_ps_sincof_p0; y2 = _mm_mul_ps(y2, z); y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1); y2 = _mm_mul_ps(y2, z); y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2); y2 = _mm_mul_ps(y2, z); y2 = _mm_mul_ps(y2, x); y2 = _mm_add_ps(y2, x); /* select the correct result from the two polynoms */ xmm3 = poly_mask; y2 = _mm_and_ps(xmm3, y2); //, xmm3); y = _mm_andnot_ps(xmm3, y); y = _mm_add_ps(y,y2); /* update the sign */ y = _mm_xor_ps(y, sign_bit); return y; }
static void cftmdl_128_SSE2(float* a) { const int l = 8; const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); int j0; __m128 wk1rv = _mm_load_ps(cftmdl_wk1r); for (j0 = 0; j0 < l; j0 += 2) { const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32), _MM_SHUFFLE(1, 0, 1, 0)); const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40), _MM_SHUFFLE(1, 0, 1, 0)); __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48), _MM_SHUFFLE(1, 0, 1, 0)); const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56), _MM_SHUFFLE(1, 0, 1, 0)); const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1))); const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped); const __m128 yy0 = _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(2, 2, 2, 2)); const __m128 yy1 = _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(3, 3, 3, 3)); const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1); const __m128 yy3 = _mm_add_ps(yy0, yy2); const __m128 yy4 = _mm_mul_ps(wk1rv, yy3); _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx0)); _mm_storel_epi64( (__m128i*)&a[j0 + 32], _mm_shuffle_epi32(_mm_castps_si128(xx0), _MM_SHUFFLE(3, 2, 3, 2))); _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx1)); _mm_storel_epi64( (__m128i*)&a[j0 + 48], _mm_shuffle_epi32(_mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 2, 3))); a[j0 + 48] = -a[j0 + 48]; _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(x1_x3_add)); _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(x1_x3_sub)); _mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_castps_si128(yy4)); _mm_storel_epi64( (__m128i*)&a[j0 + 56], _mm_shuffle_epi32(_mm_castps_si128(yy4), _MM_SHUFFLE(2, 3, 2, 3))); } { int k = 64; int k1 = 2; int k2 = 2 * k1; const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2 + 0]); const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]); const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]); const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]); const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]); wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]); for (j0 = k; j0 < l + k; j0 += 2) { const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32), 
_MM_SHUFFLE(1, 0, 1, 0)); const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40), _MM_SHUFFLE(1, 0, 1, 0)); __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48), _MM_SHUFFLE(1, 0, 1, 0)); const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56), _MM_SHUFFLE(1, 0, 1, 0)); const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); const __m128 xx2 = _mm_mul_ps(xx1, wk2rv); const __m128 xx3 = _mm_mul_ps(wk2iv, _mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 0, 1)))); const __m128 xx4 = _mm_add_ps(xx2, xx3); const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1))); const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped); const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv); const __m128 xx11 = _mm_mul_ps( wk1iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add), _MM_SHUFFLE(2, 3, 0, 1)))); const __m128 xx12 = _mm_add_ps(xx10, xx11); const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv); const __m128 xx21 = _mm_mul_ps( wk3iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub), _MM_SHUFFLE(2, 3, 0, 1)))); const __m128 xx22 = _mm_add_ps(xx20, xx21); _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx)); _mm_storel_epi64( (__m128i*)&a[j0 + 32], _mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2))); _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4)); _mm_storel_epi64( (__m128i*)&a[j0 + 48], _mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2))); _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12)); _mm_storel_epi64( (__m128i*)&a[j0 + 40], _mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2))); _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22)); _mm_storel_epi64( (__m128i*)&a[j0 + 56], _mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2))); } } }
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them.. it is almost as fast, and gives you a free cosine with your sine */ void sincos_ps(v4sfu *xptr, v4sfu *sptr, v4sfu *cptr) { __m128 x=*((__m128 *)xptr), *s=(__m128 *)sptr, *c=(__m128 *)cptr, xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y; #ifdef USE_SSE2 __m128i emm0, emm2, emm4; #else __m64 mm0, mm1, mm2, mm3, mm4, mm5; #endif sign_bit_sin = x; /* take the absolute value */ x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask); /* extract the sign bit (upper one) */ sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask); /* scale by 4/Pi */ y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI); #ifdef USE_SSE2 /* store the integer part of y in emm2 */ emm2 = _mm_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1); emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1); y = _mm_cvtepi32_ps(emm2); emm4 = emm2; /* get the swap sign flag for the sine */ emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4); emm0 = _mm_slli_epi32(emm0, 29); __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0); /* get the polynom selection mask for the sine*/ emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2); emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); __m128 poly_mask = _mm_castsi128_ps(emm2); #else /* store the integer part of y in mm2:mm3 */ xmm3 = _mm_movehl_ps(xmm3, y); mm2 = _mm_cvttps_pi32(y); mm3 = _mm_cvttps_pi32(xmm3); /* j=(j+1) & (~1) (see the cephes sources) */ mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1); mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1); mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1); mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1); y = _mm_cvtpi32x2_ps(mm2, mm3); mm4 = mm2; mm5 = mm3; /* get the swap sign flag for the sine */ mm0 = _mm_and_si64(mm2, *(__m64*)_pi32_4); mm1 = _mm_and_si64(mm3, *(__m64*)_pi32_4); mm0 = _mm_slli_pi32(mm0, 29); mm1 = _mm_slli_pi32(mm1, 29); __m128 swap_sign_bit_sin; COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin); /* get the polynom selection mask for the sine */ mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2); mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2); mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64()); mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64()); __m128 poly_mask; COPY_MM_TO_XMM(mm2, mm3, poly_mask); #endif /* The magic pass: "******" x = ((x - y * DP1) - y * DP2) - y * DP3; */ xmm1 = *(__m128*)_ps_minus_cephes_DP1; xmm2 = *(__m128*)_ps_minus_cephes_DP2; xmm3 = *(__m128*)_ps_minus_cephes_DP3; xmm1 = _mm_mul_ps(y, xmm1); xmm2 = _mm_mul_ps(y, xmm2); xmm3 = _mm_mul_ps(y, xmm3); x = _mm_add_ps(x, xmm1); x = _mm_add_ps(x, xmm2); x = _mm_add_ps(x, xmm3); #ifdef USE_SSE2 emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2); emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4); emm4 = _mm_slli_epi32(emm4, 29); __m128 sign_bit_cos = _mm_castsi128_ps(emm4); #else /* get the sign flag for the cosine */ mm4 = _mm_sub_pi32(mm4, *(__m64*)_pi32_2); mm5 = _mm_sub_pi32(mm5, *(__m64*)_pi32_2); mm4 = _mm_andnot_si64(mm4, *(__m64*)_pi32_4); mm5 = _mm_andnot_si64(mm5, *(__m64*)_pi32_4); mm4 = _mm_slli_pi32(mm4, 29); mm5 = _mm_slli_pi32(mm5, 29); __m128 sign_bit_cos; COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos); _mm_empty(); /* good-bye mmx */ #endif sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); /* Evaluate the first polynom (0 <= x <= Pi/4) */ __m128 z = _mm_mul_ps(x,x); y = *(__m128*)_ps_coscof_p0; y = _mm_mul_ps(y, z); y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1); y = _mm_mul_ps(y, z); y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2); y = _mm_mul_ps(y, z); y = 
_mm_mul_ps(y, z); __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5); y = _mm_sub_ps(y, tmp); y = _mm_add_ps(y, *(__m128*)_ps_1); /* Evaluate the second polynom (Pi/4 <= x <= 0) */ __m128 y2 = *(__m128*)_ps_sincof_p0; y2 = _mm_mul_ps(y2, z); y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1); y2 = _mm_mul_ps(y2, z); y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2); y2 = _mm_mul_ps(y2, z); y2 = _mm_mul_ps(y2, x); y2 = _mm_add_ps(y2, x); /* select the correct result from the two polynoms */ xmm3 = poly_mask; __m128 ysin2 = _mm_and_ps(xmm3, y2); __m128 ysin1 = _mm_andnot_ps(xmm3, y); y2 = _mm_sub_ps(y2,ysin2); y = _mm_sub_ps(y, ysin1); xmm1 = _mm_add_ps(ysin1,ysin2); xmm2 = _mm_add_ps(y,y2); /* update the sign */ *s = _mm_xor_ps(xmm1, sign_bit_sin); *c = _mm_xor_ps(xmm2, sign_bit_cos); }
void SWModelRenderer::RenderInner(spades::draw::SWModel *model, const client::ModelRenderParam ¶m) { auto& mat = param.matrix; auto origin = mat.GetOrigin(); auto axis1 = mat.GetAxis(0); auto axis2 = mat.GetAxis(1); auto axis3 = mat.GetAxis(2); auto *rawModel = model->GetRawModel(); auto rawModelOrigin = rawModel->GetOrigin(); rawModelOrigin += 0.1f; origin += axis1 * rawModelOrigin.x; origin += axis2 * rawModelOrigin.y; origin += axis3 * rawModelOrigin.z; int w = rawModel->GetWidth(); int h = rawModel->GetHeight(); //int d = rawModel->GetDepth(); // evaluate brightness for each normals uint8_t brights[3*3*3]; { auto lightVec = MakeVector3(0.f, -0.707f, -0.707f); float dot1 = Vector3::Dot(axis1, lightVec) * fastRSqrt(axis1.GetPoweredLength()); float dot2 = Vector3::Dot(axis2, lightVec) * fastRSqrt(axis2.GetPoweredLength()); float dot3 = Vector3::Dot(axis3, lightVec) * fastRSqrt(axis3.GetPoweredLength()); for(int x = 0; x < 3; x++){ float d; int cnt; switch(x){ case 0: d = -dot1; cnt = 1; break; case 1: d = 0.f; cnt = 0; break; case 2: d = dot1; cnt = 1; break; } for(int y = 0; y < 3; y++){ auto d2 = d; auto cnt2 = cnt; switch(y){ case 0: d2 -= dot2; cnt2++; break; case 1: break; case 2: d2 += dot2; cnt2++; break; } for(int z = 0; z < 3; z++) { auto d3 = d; auto cnt3 = cnt2; switch(y){ case 0: d3 -= dot3; cnt3++; break; case 1: break; case 2: d3 += dot3; cnt3++; break; } switch(cnt3){ case 2: d3 *= 0.707f; break; case 3: d3 *= 0.57735f; break; } d3 = 192.f + d3 * 62.f; brights[x + y * 3 + z * 9] = static_cast<uint8_t>(d3); } } } } // compute center coord. for culling { auto center = origin; auto localCenter = model->GetCenter(); center += axis1 * localCenter.x; center += axis2 * localCenter.y; center += axis3 * localCenter.z; float largestAxis = axis1.GetPoweredLength(); largestAxis = std::max(largestAxis, axis2.GetPoweredLength()); largestAxis = std::max(largestAxis, axis3.GetPoweredLength()); if(!r->SphereFrustrumCull(center, model->GetRadius() * sqrtf(largestAxis))) return; } Bitmap *fbmp = r->fb; auto *fb = fbmp->GetPixels(); int fw = fbmp->GetWidth(); int fh = fbmp->GetHeight(); auto *db = r->depthBuffer.data(); Matrix4 viewproj = r->GetProjectionViewMatrix(); Vector4 ndc2scrscale = {fw * 0.5f, -fh * 0.5f, 1.f, 1.f}; //Vector4 ndc2scroff = {fw * 0.5f, fh * 0.5f, 0.f, 0.f}; int ndc2scroffX = fw >> 1; int ndc2scroffY = fh >> 1; // render each points auto tOrigin = viewproj * MakeVector4(origin.x, origin.y, origin.z, 1.f); auto tAxis1 = viewproj * MakeVector4(axis1.x, axis1.y, axis1.z, 0.f); auto tAxis2 = viewproj * MakeVector4(axis2.x, axis2.y, axis2.z, 0.f); auto tAxis3 = viewproj * MakeVector4(axis3.x, axis3.y, axis3.z, 0.f); tOrigin *= ndc2scrscale; tAxis1 *= ndc2scrscale; tAxis2 *= ndc2scrscale; tAxis3 *= ndc2scrscale; float pointDiameter;// = largestAxis * 0.55f * fh * 0.5f; { float largestAxis = tAxis1.GetPoweredLength(); largestAxis = std::max(largestAxis, tAxis2.GetPoweredLength()); largestAxis = std::max(largestAxis, tAxis3.GetPoweredLength()); pointDiameter = sqrtf(largestAxis); } uint32_t customColor; customColor = ToFixed8(param.customColor.z) | (ToFixed8(param.customColor.y) << 8) | (ToFixed8(param.customColor.x) << 16); auto v1 = tOrigin; float zNear = r->sceneDef.zNear; for(int x = 0; x < w; x++) { auto v2 = v1; for(int y = 0; y < h; y++) { auto *mp = &model->renderData [model->renderDataAddr[x + y * w]]; while(*mp != -1) { uint32_t data = *(mp++); uint32_t normal = *(mp++); int z = static_cast<int>(data >> 24); //SPAssert(z < d); SPAssert(z >= 0); auto vv = v2 + tAxis3 * 
zvals[z]; if(vv.z < zNear) continue; // save Z value (don't divide this by W!) float zval = vv.z; // use vv.z for point radius to be divided by W vv.z = pointDiameter; // perspective division float scl = fastRcp(vv.w); vv *= scl; int ix = static_cast<int>(vv.x) + ndc2scroffX; int iy = static_cast<int>(vv.y) + ndc2scroffY; int idm = static_cast<int>(vv.z + .99f); idm = std::max(1, idm); int minX = ix - (idm >> 1); int minY = iy - (idm >> 1); if(minX >= fw || minY >= fh) continue; int maxX = ix + idm; int maxY = iy + idm; if(maxX <= 0 || maxY <= 0) continue; minX = std::max(minX, 0); minY = std::max(minY, 0); maxX = std::min(maxX, fw); maxY = std::min(maxY, fh); auto *fb2 = fb + (minX + minY * fw); auto *db2 = db + (minX + minY * fw); int w = maxX - minX; uint32_t color = data & 0xffffff; if(color == 0) color = customColor; SPAssert(normal < 27); int bright = brights[normal]; #if ENABLE_SSE2 if(lvl == SWFeatureLevel::SSE2) { auto m = _mm_setr_epi32(color, 0, 0, 0); auto f = _mm_set1_epi16(bright << 8); m = _mm_unpacklo_epi8(m, _mm_setzero_si128()); m = _mm_mulhi_epu16(m, f); m = _mm_packus_epi16(m, m); _mm_store_ss(reinterpret_cast<float*>(&color), _mm_castsi128_ps(m)); }else #endif { uint32_t c1 = color & 0xff00; uint32_t c2 = color & 0xff00ff; c1 *= bright; c2 *= bright; color = ((c1&0xff0000) | (c2&0xff00ff00)) >> 8; } for(int yy = minY; yy < maxY; yy++){ auto *fb3 = fb2; auto *db3 = db2; for(int xx = w; xx > 0; xx--) { if(zval < *db3) { *db3 = zval; *fb3 = color; } fb3++; db3++; } fb2 += fw; db2 += fw; } } v2 += tAxis2; } v1 += tAxis1; } }
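/* The SSE2 branch above and its scalar fallback implement the same per-channel scaling: each 8-bit channel of a 0x00RRGGBB color is multiplied by 'bright' and the product is shifted right by 8. The following standalone harness (not part of the renderer; the helper names are illustrative only) checks that _mm_mulhi_epu16 with a factor of bright << 8 matches the integer fallback for every brightness value. */
#include <cstdint>
#include <cstdio>
#include <initializer_list>
#include <emmintrin.h>

static uint32_t modulate_scalar(uint32_t color, uint32_t bright) {
    uint32_t c1 = color & 0xff00;    // green channel
    uint32_t c2 = color & 0xff00ff;  // red and blue channels
    c1 *= bright;
    c2 *= bright;
    return ((c1 & 0xff0000) | (c2 & 0xff00ff00)) >> 8;
}

static uint32_t modulate_sse2(uint32_t color, uint32_t bright) {
    __m128i m = _mm_setr_epi32(static_cast<int>(color), 0, 0, 0);
    __m128i f = _mm_set1_epi16(static_cast<short>(bright << 8));
    m = _mm_unpacklo_epi8(m, _mm_setzero_si128());  // widen bytes to 16-bit lanes
    m = _mm_mulhi_epu16(m, f);                      // (c * (bright << 8)) >> 16 == (c * bright) >> 8
    m = _mm_packus_epi16(m, m);                     // narrow back to bytes
    uint32_t out;
    _mm_store_ss(reinterpret_cast<float *>(&out), _mm_castsi128_ps(m));
    return out;
}

int main() {
    for (uint32_t bright = 0; bright < 256; bright++)
        for (uint32_t color : {0x102030u, 0x7f7f7fu, 0xff00ffu, 0xffffffu})
            if (modulate_scalar(color, bright) != modulate_sse2(color, bright))
                std::printf("mismatch at color %06x, bright %u\n", color, bright);
    return 0;
}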
void spu_interpreter::FRSQEST(SPUThread& CPU, spu_opcode_t op) { const auto mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); CPU.GPR[op.rt].vf = _mm_rsqrt_ps(_mm_and_ps(CPU.GPR[op.ra].vf, mask)); }
static __m128 mm_pow_ps(__m128 a, __m128 b) { // a^b = exp2(b * log2(a)) // exp2(x) and log2(x) are calculated using polynomial approximations. __m128 log2_a, b_log2_a, a_exp_b; // Calculate log2(x), x = a. { // To calculate log2(x), we decompose x like this: // x = y * 2^n // n is an integer // y is in the [1.0, 2.0) range // // log2(x) = log2(y) + n // n can be evaluated by playing with float representation. // log2(y) in a small range can be approximated; this code uses an order // five polynomial approximation. The coefficients have been // estimated with the Remez algorithm and the resulting // polynomial has a maximum relative error of 0.00086%. // Compute n. // This is done by masking the exponent, shifting it into the top bits of // the mantissa, putting eight into the biased exponent (to compensate for // the fact that the exponent has been shifted into the top/fractional // part) and finally getting rid of the implicit leading one from the // mantissa by subtracting it out. static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END = { 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000}; static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END = { 0x43800000, 0x43800000, 0x43800000, 0x43800000}; static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END = { 0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000}; static const int shift_exponent_into_top_mantissa = 8; const __m128 two_n = _mm_and_ps(a, *((__m128*)float_exponent_mask)); const __m128 n_1 = _mm_castsi128_ps(_mm_srli_epi32( _mm_castps_si128(two_n), shift_exponent_into_top_mantissa)); const __m128 n_0 = _mm_or_ps(n_1, *((__m128*)eight_biased_exponent)); const __m128 n = _mm_sub_ps(n_0, *((__m128*)implicit_leading_one)); // Compute y. static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END = { 0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF}; static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END = { 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000}; const __m128 mantissa = _mm_and_ps(a, *((__m128*)mantissa_mask)); const __m128 y = _mm_or_ps(mantissa, *((__m128*)zero_biased_exponent_is_one)); // Approximate log2(y) ~= (y - 1) * pol5(y).
// pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0 static const ALIGN16_BEG float ALIGN16_END C5[4] = { -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f}; static const ALIGN16_BEG float ALIGN16_END C4[4] = {3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f}; static const ALIGN16_BEG float ALIGN16_END C3[4] = {-1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f}; static const ALIGN16_BEG float ALIGN16_END C2[4] = {2.5988452f, 2.5988452f, 2.5988452f, 2.5988452f}; static const ALIGN16_BEG float ALIGN16_END C1[4] = {-3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f}; static const ALIGN16_BEG float ALIGN16_END C0[4] = {3.1157899f, 3.1157899f, 3.1157899f, 3.1157899f}; const __m128 pol5_y_0 = _mm_mul_ps(y, *((__m128*)C5)); const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128*)C4)); const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y); const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128*)C3)); const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y); const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128*)C2)); const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y); const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128*)C1)); const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y); const __m128 pol5_y = _mm_add_ps(pol5_y_8, *((__m128*)C0)); const __m128 y_minus_one = _mm_sub_ps(y, *((__m128*)zero_biased_exponent_is_one)); const __m128 log2_y = _mm_mul_ps(y_minus_one, pol5_y); // Combine parts. log2_a = _mm_add_ps(n, log2_y); } // b * log2(a) b_log2_a = _mm_mul_ps(b, log2_a); // Calculate exp2(x), x = b * log2(a). { // To calculate 2^x, we decompose x like this: // x = n + y // n is an integer, the value of x - 0.5 rounded down, therefore // y is in the [0.5, 1.5) range // // 2^x = 2^n * 2^y // 2^n can be evaluated by playing with float representation. // 2^y in a small range can be approximated, this code uses an order two // polynomial approximation. The coefficients have been estimated // with the Remez algorithm and the resulting polynomial has a // maximum relative error of 0.17%. // To avoid over/underflow, we reduce the range of input to ]-127, 129]. static const ALIGN16_BEG float max_input[4] ALIGN16_END = {129.f, 129.f, 129.f, 129.f}; static const ALIGN16_BEG float min_input[4] ALIGN16_END = { -126.99999f, -126.99999f, -126.99999f, -126.99999f}; const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128*)max_input)); const __m128 x_max = _mm_max_ps(x_min, *((__m128*)min_input)); // Compute n. static const ALIGN16_BEG float half[4] ALIGN16_END = {0.5f, 0.5f, 0.5f, 0.5f}; const __m128 x_minus_half = _mm_sub_ps(x_max, *((__m128*)half)); const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half); // Compute 2^n. static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END = { 127, 127, 127, 127}; static const int float_exponent_shift = 23; const __m128i two_n_exponent = _mm_add_epi32(x_minus_half_floor, *((__m128i*)float_exponent_bias)); const __m128 two_n = _mm_castsi128_ps(_mm_slli_epi32(two_n_exponent, float_exponent_shift)); // Compute y. const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor)); // Approximate 2^y ~= C2 * y^2 + C1 * y + C0. 
static const ALIGN16_BEG float C2[4] ALIGN16_END = { 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f}; static const ALIGN16_BEG float C1[4] ALIGN16_END = { 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f}; static const ALIGN16_BEG float C0[4] ALIGN16_END = {1.0017247f, 1.0017247f, 1.0017247f, 1.0017247f}; const __m128 exp2_y_0 = _mm_mul_ps(y, *((__m128*)C2)); const __m128 exp2_y_1 = _mm_add_ps(exp2_y_0, *((__m128*)C1)); const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y); const __m128 exp2_y = _mm_add_ps(exp2_y_2, *((__m128*)C0)); // Combine parts. a_exp_b = _mm_mul_ps(exp2_y, two_n); } return a_exp_b; }
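/* A small standalone check (an assumption, not part of the original source) that exercises mm_pow_ps against the libm reference. It assumes mm_pow_ps is visible in the same translation unit and only feeds it positive, normal inputs, which is the domain the exp2/log2 decomposition above is written for. */
#include <cmath>
#include <cstdio>
#include <xmmintrin.h>

int main() {
    const float bases[4]     = {0.5f, 1.0f, 2.0f, 10.0f};
    const float exponents[4] = {-2.0f, -0.5f, 0.5f, 3.0f};
    float out[4];
    _mm_storeu_ps(out, mm_pow_ps(_mm_loadu_ps(bases), _mm_loadu_ps(exponents)));
    for (int i = 0; i < 4; i++) {
        float ref = std::pow(bases[i], exponents[i]);
        std::printf("pow(%g, %g) ~= %g (libm: %g, rel. err. %g)\n",
                    bases[i], exponents[i], out[i], ref,
                    std::fabs(out[i] - ref) / ref);
    }
    return 0;
}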
inline GPR_t si_andbi( GPR_t RA, uint8_t I10 ) { return _mm_castsi128_ps( _mm_and_si128( _mm_castps_si128( RA ), _mm_set1_epi8( I10 ) ) ); }
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000), GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000), GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000), GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f), GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f), GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f), GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f), }; const GSVector4 GSVector4::m_ps0123(0.0f, 1.0f, 2.0f, 3.0f); const GSVector4 GSVector4::m_ps4567(4.0f, 5.0f, 6.0f, 7.0f); const GSVector4 GSVector4::m_half(0.5f); const GSVector4 GSVector4::m_one(1.0f); const GSVector4 GSVector4::m_two(2.0f); const GSVector4 GSVector4::m_four(4.0f); const GSVector4 GSVector4::m_x4b000000(_mm_castsi128_ps(_mm_set1_epi32(0x4b000000))); const GSVector4 GSVector4::m_x4f800000(_mm_castsi128_ps(_mm_set1_epi32(0x4f800000))); #if _M_SSE >= 0x500 const GSVector8 GSVector8::m_half(0.5f); const GSVector8 GSVector8::m_one(1.0f); const GSVector8 GSVector8::m_x7fffffff(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); const GSVector8 GSVector8::m_x80000000(_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); const GSVector8 GSVector8::m_x4b000000(_mm256_castsi256_ps(_mm256_set1_epi32(0x4b000000))); const GSVector8 GSVector8::m_x4f800000(_mm256_castsi256_ps(_mm256_set1_epi32(0x4f800000))); #endif #if _M_SSE >= 0x501
inline GPR_t si_andi( GPR_t RA, int16_t I10 ) { return _mm_castsi128_ps( _mm_and_si128( _mm_castps_si128( RA ), _mm_set1_epi32( static_cast<int32_t>(SignExtend( I10, 10 ) ) ) ) ); }
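/* si_andi above relies on a SignExtend helper that is not defined in this excerpt. A minimal sketch of such a helper, under the assumption that it takes a value holding an N-bit two's-complement field and widens its sign through the full 32-bit type: */
#include <cstdint>

inline int32_t SignExtend(int32_t value, int bits) {
    const uint32_t sign  = 1u << (bits - 1);                                  // sign bit of the N-bit field
    const uint32_t field = static_cast<uint32_t>(value) & ((sign << 1) - 1);  // keep only the low N bits
    return static_cast<int32_t>((field ^ sign) - sign);                       // replicate the field's sign upward
}
// Example: SignExtend(0x3FF, 10) == -1, SignExtend(0x1FF, 10) == 511.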
inline GPR_t si_ori( GPR_t RA, int64_t IMM ) { return _mm_or_ps( RA, _mm_castsi128_ps( _mm_set1_epi32( (int32_t)IMM ) ) ); }
inline GPR_t si_nand( GPR_t RA, GPR_t RB ) { return _mm_andnot_ps( _mm_and_ps( RA, RB ), _mm_castsi128_ps( _mm_set1_epi32( 0xffffffff ) ) ); }
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; int vdwioffset0; __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D; __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; int nvdwtype; __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m128 one_sixth = _mm_set1_ps(1.0/6.0); __m128 one_twelfth = _mm_set1_ps(1.0/12.0); __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; real rswitch_scalar,d_scalar; __m128 dummy_mask,cutoff_mask; __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) ); __m128 one = _mm_set1_ps(1.0); __m128 two = _mm_set1_ps(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; nvdwtype = fr->ntype; vdwparam = fr->nbfp;
//---------------------------------------------------------------- // Transforms the AABB vertices to screen space once every frame // Also performs a coarse depth pre-test //---------------------------------------------------------------- PreTestResult TransformedAABBoxAVX::TransformAndPreTestAABBox(__m128 xformedPos[], const __m128 cumulativeMatrix[4], const float *pDepthSummary) { // w ends up being garbage, but it doesn't matter - we ignore it anyway. __m128 vCenter = _mm_loadu_ps(&mBBCenter.x); __m128 vHalf = _mm_loadu_ps(&mBBHalf.x); __m128 vMin = _mm_sub_ps(vCenter, vHalf); __m128 vMax = _mm_add_ps(vCenter, vHalf); // transforms __m128 xRow[2], yRow[2], zRow[2]; xRow[0] = _mm_shuffle_ps(vMin, vMin, 0x00) * cumulativeMatrix[0]; xRow[1] = _mm_shuffle_ps(vMax, vMax, 0x00) * cumulativeMatrix[0]; yRow[0] = _mm_shuffle_ps(vMin, vMin, 0x55) * cumulativeMatrix[1]; yRow[1] = _mm_shuffle_ps(vMax, vMax, 0x55) * cumulativeMatrix[1]; zRow[0] = _mm_shuffle_ps(vMin, vMin, 0xaa) * cumulativeMatrix[2]; zRow[1] = _mm_shuffle_ps(vMax, vMax, 0xaa) * cumulativeMatrix[2]; __m128 zAllIn = _mm_castsi128_ps(_mm_set1_epi32(~0)); __m128 screenMin = _mm_set1_ps(FLT_MAX); __m128 screenMax = _mm_set1_ps(-FLT_MAX); for(UINT i = 0; i < AABB_VERTICES; i++) { // Transform the vertex __m128 vert = cumulativeMatrix[3]; vert += xRow[sBBxInd[i]]; vert += yRow[sBByInd[i]]; vert += zRow[sBBzInd[i]]; // We have inverted z; z is in front of near plane iff z <= w. __m128 vertZ = _mm_shuffle_ps(vert, vert, 0xaa); // vert.zzzz __m128 vertW = _mm_shuffle_ps(vert, vert, 0xff); // vert.wwww __m128 zIn = _mm_cmple_ps(vertZ, vertW); zAllIn = _mm_and_ps(zAllIn, zIn); // project xformedPos[i] = _mm_div_ps(vert, vertW); // update bounds screenMin = _mm_min_ps(screenMin, xformedPos[i]); screenMax = _mm_max_ps(screenMax, xformedPos[i]); } // if any of the verts are z-clipped, we (conservatively) say the box is in if(_mm_movemask_ps(zAllIn) != 0xf) return ePT_VISIBLE; // Clip against screen bounds screenMin = _mm_max_ps(screenMin, _mm_setr_ps(0.0f, 0.0f, 0.0f, -FLT_MAX)); screenMax = _mm_min_ps(screenMax, _mm_setr_ps((float) (SCREENW - 1), (float) (SCREENH - 1), 1.0f, FLT_MAX)); // Quick rejection test if(_mm_movemask_ps(_mm_cmplt_ps(screenMax, screenMin))) return ePT_INVISIBLE; // Prepare integer bounds __m128 minMaxXY = _mm_shuffle_ps(screenMin, screenMax, 0x44); // minX,minY,maxX,maxY __m128i minMaxXYi = _mm_cvtps_epi32(minMaxXY); __m128i minMaxXYis = _mm_srai_epi32(minMaxXYi, 3); __m128 maxZ = _mm_shuffle_ps(screenMax, screenMax, 0xaa); // Traverse all 8x8 blocks covered by 2d screen-space BBox; // if we know for sure that this box is behind the geometry we know is there, // we can stop. int rX0 = minMaxXYis.m128i_i32[0]; int rY0 = minMaxXYis.m128i_i32[1]; int rX1 = minMaxXYis.m128i_i32[2]; int rY1 = minMaxXYis.m128i_i32[3]; __m128 anyCloser = _mm_setzero_ps(); for(int by = rY0; by <= rY1; by++) { const float *srcRow = pDepthSummary + by * (SCREENW/BLOCK_SIZE); // If for any 8x8 block, maxZ is not less than (=behind) summarized // min Z, box might be visible. for(int bx = rX0; bx <= rX1; bx++) { anyCloser = _mm_or_ps(anyCloser, _mm_cmpnlt_ss(maxZ, _mm_load_ss(&srcRow[bx]))); } if(_mm_movemask_ps(anyCloser)) { return ePT_UNSURE; // okay, box might be in } } // If we get here, we know for sure that the box is fully behind the stuff in the // depth buffer. return ePT_INVISIBLE; }
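/* TransformAndPreTestAABBox reads pDepthSummary as one float per 8x8 screen block. The sketch below is an assumption about how such a summary could be produced: with the larger-is-closer depth convention implied by the _mm_cmpnlt_ss test above, the conservative value to keep per block is its minimum (farthest) depth, so a box whose nearest depth reaches at least that value might still be visible inside the block. BuildDepthSummary and kBlockSize are illustrative names, not from the original sample. */
#include <algorithm>
#include <cfloat>

static const int kBlockSize = 8;  // assumed to match BLOCK_SIZE in the excerpt above

// depth:   screenW x screenH depth buffer, row-major, larger = closer
// summary: (screenW / kBlockSize) x (screenH / kBlockSize) floats, row-major
void BuildDepthSummary(const float *depth, int screenW, int screenH, float *summary)
{
    const int blocksX = screenW / kBlockSize;
    const int blocksY = screenH / kBlockSize;
    for (int by = 0; by < blocksY; by++) {
        for (int bx = 0; bx < blocksX; bx++) {
            float blockMin = FLT_MAX;
            for (int y = 0; y < kBlockSize; y++) {
                const float *row = depth + (by * kBlockSize + y) * screenW + bx * kBlockSize;
                for (int x = 0; x < kBlockSize; x++)
                    blockMin = std::min(blockMin, row[x]);
            }
            summary[by * blocksX + bx] = blockMin;
        }
    }
}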