Example #1
/* evaluation of 4 sines at once, using only SSE2.

   The code is an exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for greater values
   -- it does not return garbage for arguments over 8192, though, but
   the extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

   Performance is also surprisingly good: 1.33 times faster than the
   macOS vsinf SSE2 function, and 1.5 times faster than the
   __vrs4_sinf of AMD's ACML (which is only available in 64 bits). Not
   too bad for a function with no special tuning!
   However, the latter libraries probably have much better handling of
   NaN, Inf, denormalized and other special arguments.

   On my Core Duo, the execution of this function takes approximately 95 cycles.

   From what I have observed in experiments with the Intel AMath lib,
   switching to an SSE2 version would improve the performance by only 10%.

   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
   deliver full speed.
*/
__m128 sin_ps(__m128 x) { // any x
    typedef __m128 v4sf;
    typedef __m128i v4si;

    v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

    v4si emm0, emm2;
    sign_bit = x;
    /* take the absolute value */
    x = _mm_and_ps(x, constants::inv_sign_mask.ps);
    /* extract the sign bit (upper one) */
    sign_bit = _mm_and_ps(sign_bit, constants::sign_mask.ps);

    /* scale by 4/Pi */
    y = _mm_mul_ps(x, constants::cephes_FOPI.ps);

    /* store the integer part of y in emm2 */
    emm2 = _mm_cvttps_epi32(y);
    /* j=(j+1) & (~1) (see the cephes sources) */
    emm2 = _mm_add_epi32(emm2, constants::pi32_1.pi);
    emm2 = _mm_and_si128(emm2, constants::pi32_inv1.pi);
    y = _mm_cvtepi32_ps(emm2);
    /* get the swap sign flag */
    emm0 = _mm_and_si128(emm2, constants::pi32_4.pi);
    emm0 = _mm_slli_epi32(emm0, 29);
    /* get the polynomial selection mask
       there is one polynomial for 0 <= x <= Pi/4
       and another one for Pi/4 < x <= Pi/2

       Both branches will be computed.
    */
    emm2 = _mm_and_si128(emm2, constants::pi32_2.pi);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

    v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
    v4sf poly_mask = _mm_castsi128_ps(emm2);
    sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);

    /* The magic pass: "Extended precision modular arithmetic"
    x = ((x - y * DP1) - y * DP2) - y * DP3; */
    xmm1 = constants::minus_cephes_DP1.ps;
    xmm2 = constants::minus_cephes_DP2.ps;
    xmm3 = constants::minus_cephes_DP3.ps;
    xmm1 = _mm_mul_ps(y, xmm1);
    xmm2 = _mm_mul_ps(y, xmm2);
    xmm3 = _mm_mul_ps(y, xmm3);
    x = _mm_add_ps(x, xmm1);
    x = _mm_add_ps(x, xmm2);
    x = _mm_add_ps(x, xmm3);

    /* Evaluate the first polynomial: the cosine approximation on the reduced range |x| <= Pi/4 */
    y = constants::coscof_p0.ps;
    v4sf z = _mm_mul_ps(x,x);

    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, constants::coscof_p1.ps);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, constants::coscof_p2.ps);
    y = _mm_mul_ps(y, z);
    y = _mm_mul_ps(y, z);
    v4sf tmp = _mm_mul_ps(z, constants::ps_0p5.ps);
    y = _mm_sub_ps(y, tmp);
    y = _mm_add_ps(y, constants::ps_1.ps);

    /* Evaluate the second polynomial: the sine approximation on the reduced range |x| <= Pi/4 */

    v4sf y2 = constants::sincof_p0.ps;
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, constants::sincof_p1.ps);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, constants::sincof_p2.ps);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_mul_ps(y2, x);
    y2 = _mm_add_ps(y2, x);

    /* select the correct result from the two polynomials */
    xmm3 = poly_mask;
    y2 = _mm_and_ps(xmm3, y2);
    y = _mm_andnot_ps(xmm3, y);
    y = _mm_add_ps(y,y2);
    /* update the sign */
    y = _mm_xor_ps(y, sign_bit);

    return y;
}
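
/* A minimal usage sketch (not part of the original source): run four angles
   through sin_ps at once and compare against libm. Assumes sin_ps and the
   constants:: tables above are in scope, plus <emmintrin.h>, <cstdio>, <cmath>. */
void sin_ps_demo() {
    const float in[4] = { 0.0f, 0.5f, 1.0f, 2.0f };
    float out[4];
    _mm_storeu_ps(out, sin_ps(_mm_loadu_ps(in)));   /* 4 sines in one call */
    for (int i = 0; i < 4; ++i)
        printf("sin(%g) = %g   (libm: %g)\n", in[i], out[i], sinf(in[i]));
}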
Example #2
inline GPR_t si_orhi( GPR_t RA, int64_t IMM )
{
	return _mm_or_ps( RA, _mm_castsi128_ps( _mm_set1_epi16( (int16_t)IMM ) ) );
}
Example #3
inline GPR_t si_orc( GPR_t RA, GPR_t RB )
{
	const __m128 not_RB = _mm_andnot_ps( RB, _mm_castsi128_ps( _mm_set1_epi32(0xffffffff) ) );
	return _mm_or_ps( RA, not_RB );
}
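
/* A note on the idiom: _mm_andnot_ps(a, b) computes (~a) & b, so the
   complement of RB is obtained by andnot against an all-ones pattern. The
   scalar equivalent, as an illustrative sketch (needs <stdint.h>): */
inline uint32_t orc_scalar( uint32_t a, uint32_t b )
{
	return a | ~b; /* ~b is what the andnot-with-all-ones above produces */
}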
Example #4
namespace embree {
  const __m128 _mm_lookupmask_ps[16] = {
    _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0, 0)),
    _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0,-1)),
    _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1, 0)),
    _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1,-1)),
    _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0, 0)),
    _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0,-1)),
    _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1, 0)),
    _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1,-1)),
    _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)),
    _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0,-1)),
    _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1, 0)),
    _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1,-1)),
    _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0, 0)),
    _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0,-1)),
    _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1, 0)),
    _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1,-1))
  };

  const __m128d _mm_lookupmask_pd[4] = {
    _mm_castsi128_pd(_mm_set_epi32( 0, 0, 0, 0)),
    _mm_castsi128_pd(_mm_set_epi32( 0, 0,-1,-1)),
    _mm_castsi128_pd(_mm_set_epi32(-1,-1, 0, 0)),
    _mm_castsi128_pd(_mm_set_epi32(-1,-1,-1,-1))
  };

}
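
/* The table maps a 4-bit lane mask to a full-width per-lane mask: entry i has
   lane j set to all-ones exactly when bit j of i is set (note that
   _mm_set_epi32 takes its arguments high lane first). A hypothetical use, as
   an illustrative sketch: */
inline __m128 mask_lanes(__m128 v, int active /* bit j selects lane j */) {
  return _mm_and_ps(v, embree::_mm_lookupmask_ps[active & 0xf]);
}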
Example #5
namespace q {
const __m128 _mm_lookupmask_ps[16] = {
  _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0, 0)),
  _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0,-1)),
  _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1, 0)),
  _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1,-1)),
  _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0, 0)),
  _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0,-1)),
  _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1, 0)),
  _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1,-1)),
  _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)),
  _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0,-1)),
  _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1, 0)),
  _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1,-1)),
  _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0, 0)),
  _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0,-1)),
  _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1, 0)),
  _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1,-1))
};
} /* namespace q */
Example #6
inline GPR_t si_xorbi( GPR_t RA, int64_t IMM )
{
	return _mm_xor_ps( RA, _mm_castsi128_ps( _mm_set1_epi8((uint8_t)IMM) ) );
}
Example #7
GSVector4 GSVector4::cast(const GSVector4i& v)
{
	return GSVector4(_mm_castsi128_ps(v.m));
}
Example #8
/* Function:  esl_sse_logf()
 * Synopsis:  <r[z] = log x[z]>
 * Incept:    SRE, Fri Dec 14 11:32:54 2007 [Janelia]
 *
 * Purpose:   Given a vector <x> containing four floats, returns a
 *            vector <r> in which each element <r[z] = logf(x[z])>.
 *            
 *            Valid in the domain $x_z > 0$ for normalized IEEE754
 *            $x_z$.
 *
 *            For <x> $< 0$, including -0, returns <NaN>. For <x> $==
 *            0$ or subnormal <x>, returns <-inf>. For <x = inf>,
 *            returns <inf>. For <x = NaN>, returns <NaN>.
 *
 * Xref:      J2/71.
 * 
 * Note:      Derived from an SSE1 implementation by Julian
 *            Pommier. Converted to SSE2 and added handling
 *            of IEEE754 specials.
 */
__m128 
esl_sse_logf(__m128 x) 
{
  static float cephes_p[9] = {  7.0376836292E-2f, -1.1514610310E-1f,  1.1676998740E-1f,
				-1.2420140846E-1f, 1.4249322787E-1f, -1.6668057665E-1f,
				2.0000714765E-1f, -2.4999993993E-1f,  3.3333331174E-1f };
  __m128  onev = _mm_set1_ps(1.0f);          /* all elem = 1.0 */
  __m128  v0p5 = _mm_set1_ps(0.5f);          /* all elem = 0.5 */
  __m128i vneg = _mm_set1_epi32(0x80000000); /* all elem have IEEE sign bit up */
  __m128i vexp = _mm_set1_epi32(0x7f800000); /* all elem have IEEE exponent bits up */
  __m128i ei;
  __m128  e;
  __m128  invalid_mask, zero_mask, inf_mask;            /* masks used to handle special IEEE754 inputs */
  __m128  mask;
  __m128  origx;
  __m128  tmp;
  __m128  y;
  __m128  z;

  /* first, split x apart: x = frexpf(x, &e); */
  ei           = _mm_srli_epi32( _mm_castps_si128(x), 23);	                                        /* shift right 23: IEEE754 floats: ei = biased exponents     */
  invalid_mask = _mm_castsi128_ps ( _mm_cmpeq_epi32( _mm_and_si128(_mm_castps_si128(x), vneg), vneg));  /* mask any elem that's negative; these become NaN           */
  zero_mask    = _mm_castsi128_ps ( _mm_cmpeq_epi32(ei, _mm_setzero_si128()));                          /* mask any elem zero or subnormal; these become -inf        */
  inf_mask     = _mm_castsi128_ps ( _mm_cmpeq_epi32( _mm_and_si128(_mm_castps_si128(x), vexp), vexp));  /* mask any elem inf or NaN; log(inf)=inf, log(NaN)=NaN      */
  origx        = x;			                                                                /* store original x, used for log(inf) = inf, log(NaN) = NaN */

  x  = _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(~0x7f800000))); /* x now the stored 23 bits of the 24-bit significand        */
  x  = _mm_or_ps (x, v0p5);                                          /* sets hidden bit b[0]                                      */

  ei = _mm_sub_epi32(ei, _mm_set1_epi32(126));                       /* -127 (ei now signed base-2 exponent); then +1             */
  e  = _mm_cvtepi32_ps(ei);

  /* now, calculate the log */
  mask = _mm_cmplt_ps(x, _mm_set1_ps(0.707106781186547524f)); /* avoid conditional branches.           */
  tmp  = _mm_and_ps(x, mask);	                              /* tmp contains x values < 0.707, else 0 */
  x    = _mm_sub_ps(x, onev);
  e    = _mm_sub_ps(e, _mm_and_ps(onev, mask));
  x    = _mm_add_ps(x, tmp);
  z    = _mm_mul_ps(x,x);

  y =               _mm_set1_ps(cephes_p[0]);    y = _mm_mul_ps(y, x); 
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[1]));   y = _mm_mul_ps(y, x);    
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[2]));   y = _mm_mul_ps(y, x);   
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[3]));   y = _mm_mul_ps(y, x);   
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[4]));   y = _mm_mul_ps(y, x);    
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[5]));   y = _mm_mul_ps(y, x);   
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[6]));   y = _mm_mul_ps(y, x); 
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[7]));   y = _mm_mul_ps(y, x);  
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[8]));   y = _mm_mul_ps(y, x);
  y = _mm_mul_ps(y, z);

  tmp = _mm_mul_ps(e, _mm_set1_ps(-2.12194440e-4f));
  y   = _mm_add_ps(y, tmp);

  tmp = _mm_mul_ps(z, v0p5);
  y   = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, _mm_set1_ps(0.693359375f));
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);

  /* IEEE754 cleanup: */
  x = esl_sse_select_ps(x, origx,                     inf_mask);  /* log(inf)=inf; log(NaN)      = NaN  */
  x = _mm_or_ps(x, invalid_mask);                                 /* log(x<0, including -0,-inf) = NaN  */
  x = esl_sse_select_ps(x, _mm_set1_ps(-eslINFINITY), zero_mask); /* x zero or subnormal         = -inf */
  return x;
}
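
/* esl_sse_select_ps() is used above but not shown in this excerpt; it is
   assumed to be the usual SSE mask blend, <r[z] = b[z]> where the mask lane
   is all-ones, else <a[z]>. A minimal sketch of that assumption: */
static inline __m128
esl_sse_select_ps(__m128 a, __m128 b, __m128 mask)
{
  b = _mm_and_ps(b, mask);     /* keep <b> wherever the mask is set   */
  a = _mm_andnot_ps(mask, a);  /* keep <a> wherever the mask is clear */
  return _mm_or_ps(a, b);
}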
Example #9
static void SinCos(const float rad, float &sin, float &cos) // #include <emmintrin.h>, #include <xmmintrin.h>
{
	const __m128 _ps_fopi = _mm_set1_ps(4.0f / pi);

	const __m128 _ps_0p5 = _mm_set1_ps(0.5f);
	const __m128 _ps_1   = _mm_set1_ps(1.0f);

	const __m128 _ps_dp1 = _mm_set1_ps(-0.78515625f);
	const __m128 _ps_dp2 = _mm_set1_ps(-2.4187564849853515625e-4f);
	const __m128 _ps_dp3 = _mm_set1_ps(-3.77489497744594108e-8f);

	const __m128 _ps_sincof_p0 = _mm_set1_ps(2.443315711809948e-5f);
	const __m128 _ps_sincof_p1 = _mm_set1_ps(8.3321608736e-3f);
	const __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611e-1f);
	const __m128 _ps_coscof_p0 = _mm_set1_ps(2.443315711809948e-5f);
	const __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765e-3f);
	const __m128 _ps_coscof_p2 = _mm_set1_ps(4.166664568298827e-2f);

	const __m128i _pi32_1  = _mm_set1_epi32(1);
	const __m128i _pi32_i1 = _mm_set1_epi32(~1);
	const __m128i _pi32_2  = _mm_set1_epi32(2);
	const __m128i _pi32_4  = _mm_set1_epi32(4);

	const __m128 _mask_sign_raw = _mm_castsi128_ps(_mm_set1_epi32( 0x80000000));
	const __m128 _mask_sign_inv = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));

	__m128  mm1,  mm2;
	__m128i mmi0, mmi2, mmi4;

	__m128 x, y, z;
	__m128 y1,  y2;

	__m128 a = _mm_set1_ps(rad);

	x = _mm_and_ps(a, _mask_sign_inv);
	y = _mm_mul_ps(x, _ps_fopi);

	mmi2 = _mm_cvtps_epi32(y);
	mmi2 = _mm_add_epi32(mmi2, _pi32_1);
	mmi2 = _mm_and_si128(mmi2, _pi32_i1);
	y    = _mm_cvtepi32_ps(mmi2);

	mmi4 = mmi2;

	mmi0 = _mm_and_si128(mmi2, _pi32_4);
	mmi0 = _mm_slli_epi32(mmi0, 29);
	__m128 swap_sign_bit_sin = _mm_castsi128_ps(mmi0);

	mmi2 = _mm_and_si128(mmi2, _pi32_2);
	mmi2 = _mm_cmpeq_epi32(mmi2, _mm_setzero_si128());
	__m128 poly_mask = _mm_castsi128_ps(mmi2);

	x = _mm_add_ps(x, _mm_mul_ps(y, _ps_dp1));
	x = _mm_add_ps(x, _mm_mul_ps(y, _ps_dp2));
	x = _mm_add_ps(x, _mm_mul_ps(y, _ps_dp3));

	mmi4 = _mm_sub_epi32(mmi4, _pi32_2);
	mmi4 = _mm_andnot_si128(mmi4, _pi32_4);
	mmi4 = _mm_slli_epi32(mmi4, 29);

	__m128 sign_bit_cos = _mm_castsi128_ps(mmi4);
	__m128 sign_bit_sin = _mm_xor_ps(_mm_and_ps(a, _mask_sign_raw), swap_sign_bit_sin);


	z  = _mm_mul_ps(x, x);

	y1 = _mm_mul_ps(_ps_coscof_p0, z);
	y1 = _mm_add_ps(y1, _ps_coscof_p1);
	y1 = _mm_mul_ps(y1, z);
	y1 = _mm_add_ps(y1, _ps_coscof_p2);
	y1 = _mm_mul_ps(y1, z);
	y1 = _mm_mul_ps(y1, z);
	y1 = _mm_sub_ps(y1, _mm_mul_ps(z, _ps_0p5));
	y1 = _mm_add_ps(y1, _ps_1);

	y2 = _mm_mul_ps(_ps_sincof_p0, z);
	y2 = _mm_add_ps(y2, _ps_sincof_p1);
	y2 = _mm_mul_ps(y2, z);
	y2 = _mm_add_ps(y2, _ps_sincof_p2);
	y2 = _mm_mul_ps(y2, z);
	y2 = _mm_mul_ps(y2, x);
	y2 = _mm_add_ps(y2, x);


	__m128 sin1y = _mm_andnot_ps(poly_mask, y1);
	__m128 sin2y = _mm_and_ps(poly_mask, y2);


	mm1 = _mm_add_ps(sin1y, sin2y);
	mm2 = _mm_add_ps(_mm_sub_ps(y1, sin1y), _mm_sub_ps(y2, sin2y));

	sin = _mm_cvtss_f32(_mm_xor_ps(mm1, sign_bit_sin));
	cos = _mm_cvtss_f32(_mm_xor_ps(mm2, sign_bit_cos));
}
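
/* Usage is scalar in, scalar out: all four lanes compute the same value and
   _mm_cvtss_f32 extracts lane 0 of each result. A minimal sketch: */
static void SinCosDemo()
{
	float s, c;
	SinCos(0.5f, s, c); /* s ~ 0.4794, c ~ 0.8776 */
}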
Example #10
int main()
{
	//Transpose
	vec4 mat[4] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}, {13, 14, 15, 16}};

	__m128i xmm0 = _mm_unpacklo_epi32(_mm_castps_si128(mat[0]), _mm_castps_si128(mat[1]));
	__m128i xmm1 = _mm_unpackhi_epi32(_mm_castps_si128(mat[0]), _mm_castps_si128(mat[1]));
	__m128i xmm2 = _mm_unpacklo_epi32(_mm_castps_si128(mat[2]), _mm_castps_si128(mat[3]));
	__m128i xmm3 = _mm_unpackhi_epi32(_mm_castps_si128(mat[2]), _mm_castps_si128(mat[3]));

	vec4 trans[4];

	trans[0] = _mm_castsi128_ps(_mm_unpacklo_epi64(xmm0, xmm2));
	trans[1] = _mm_castsi128_ps(_mm_unpackhi_epi64(xmm0, xmm2));
	trans[2] = _mm_castsi128_ps(_mm_unpacklo_epi64(xmm1, xmm3));
	trans[3] = _mm_castsi128_ps(_mm_unpackhi_epi64(xmm1, xmm3));

	vec4 trans2[4];

	ml::transpose(trans2, mat);

	FILE* file = fopen("..\\..\\AppData\\VT.swf", "rb");
	fseek(file, 0, SEEK_END);
	size_t size = ftell(file);
	fseek(file, 0, SEEK_SET);
	unsigned char* fileData = (unsigned char*)malloc(size);
	fread(fileData, 1, size, file);
	fclose(file);

	MemReader data = {(const char*)fileData, (const char*)fileData+size, (const char*)fileData};
	
	//Read SWF header
	const u32 signatureAndVersion	= data.read<u32>();
	const u32 actualSize			= data.read<u32>();

	u32 signature = signatureAndVersion&0x00FFFFFF;
	u8	version = signatureAndVersion>>24;

	bool isCompressed   = signature=='\0SWC';
	bool isUncompressed = signature=='\0SWF';

	//if !isCompressed && !isUncompressed return error;

	MemReader data2 = {0, 0, 0};

	char* uncompressed = 0;
	if (isCompressed)
	{
		 uncompressed = (char*)malloc(actualSize-8);
		 data2.cur = data2.start = uncompressed;
		 data2.end = uncompressed+actualSize-8;
		 uLongf uncompressedSize = actualSize-8;
		 uncompress((Bytef*)uncompressed, &uncompressedSize, data.as<Bytef>(), size-8);
	}
	else if (isUncompressed)
	{
		data2.cur = data2.start = data.as<char>();
		data2.end = data2.start+actualSize-8;
	}

	
	u8 bits = data2.read<u8>();
	u8 numBits = bits>>3;

	u32 rectSizeMinusOne = (numBits*4+5)>>3;
	data2.move(rectSizeMinusOne);
	
	const u16 frameRate	 = data2.read<u16>();
	const u16 frameCount = data2.read<u16>();

	std::set<u32>	tagsUsed;
	size_t tagCount = 0;

	while (data2.cur!=data2.end)
	{
		u16 tagHeader = data2.read<u16>();
		u32 tagLength = tagHeader&0x3F;
		u32 tagType = tagHeader>>6;
		tagsUsed.insert(tagType);

		if (tagLength==0x3F)
			tagLength = data2.read<u32>();
		data2.move(tagLength);

		parseTag(tagType);

		++tagCount;
	}

	if (uncompressed) free(uncompressed);

	printf("\nProcessed %d tags\n\n", tagCount);

	printf("        Tags used        \n");
	printf("-------------------------\n");

	std::set<u32>::iterator	it  = tagsUsed.begin(),
							end = tagsUsed.end();

	for (; it!=end; ++it)
	{
		parseTag(*it);
	}

	free(fileData);
}
Example #11
/* Function:  esl_sse_expf()
 * Synopsis:  <r[z] = exp x[z]>
 * Incept:    SRE, Fri Dec 14 14:46:27 2007 [Janelia]
 *
 * Purpose:   Given a vector <x> containing four floats, returns a
 *            vector <r> in which each element <r[z] = expf(x[z])>.
 *
 *            Valid for all IEEE754 floats $x_z$.
 *
 * Xref:      J2/71
 *            J10/62: bugfix, minlogf/maxlogf range was too wide;
 *                    (k+127) must be >=0 and <=255, so (k+127)<<23
 *                    is a valid IEEE754 float, without touching
 *                    the sign bit. Pommier had this right in the
 *                    first place, and I didn't understand.
 *
 * Note:      Derived from an SSE1 implementation by Julian
 *            Pommier. Converted to SSE2.
 *
 *            Note on maxlogf/minlogf, which are close to but not
 *            exactly 127.5/log2 [J10/63]. We need -127<=k<=128, so
 *            k+127 is 0..255, a valid IEEE754 8-bit exponent
 *            (0..255), so the bit pattern (k+127)<<23 is IEEE754
 *            single-precision for 2^k.  If k=-127, we get IEEE754 0.
 *            If k=128, we get IEEE754 +inf.  If k<-127, k+127 is
 *            negative and we get screwed up.  If k>128, k+127
 *            overflows the 8-bit exponent and sets the sign bit.  So
 *            for x' (base 2) < -127.5 we must definitely return e^x ~
 *            0; for x' < -126.5 we're going to calculate 0 anyway
 *            (because k=floor(-126.5-epsilon+0.5) = -127).  So any
 *            minlogf between -126.5 log2 ... -127.5 log2 will suffice
 *            as the cutoff. Ditto for 126.5 log2 .. 127.5log2.
 *            That's 87.68312 .. 88.3762655.  I think Pommier's
 *            thinking is, you don't want to get too close to the
 *            edges, lest fp roundoff error screw you (he may have
 *            considered 1 ulp carefully, I can't tell), but otherwise
 *            you may as well put your bounds close to the outer edge;
 *            so
 *              maxlogf =  127.5 log(2) - epsilon
 *              minlogf = -127.5 log(2) + epsilon
 *            for an epsilon that happens to be ~ 3e-6.
 */
__m128
esl_sse_expf(__m128 x)
{
    static float cephes_p[6] = { 1.9875691500E-4f, 1.3981999507E-3f, 8.3334519073E-3f,
                                 4.1665795894E-2f, 1.6666665459E-1f, 5.0000001201E-1f
                               };
    static float cephes_c[2] = { 0.693359375f,    -2.12194440e-4f };
    static float maxlogf     =  88.3762626647949f;  /* 127.5 log(2) - epsilon. above this, 0.5+x/log2 gives k>128 and breaks 2^k "float" construction, because (k+127)<<23 must be a valid IEEE754 exponent 0..255 */
    static float minlogf     = -88.3762626647949f;  /*-127.5 log(2) + epsilon. below this, 0.5+x/log2 gives k<-127 and breaks 2^k, see above */
    __m128i k;
    __m128  mask, tmp, fx, z, y, minmask, maxmask;

    /* handle out-of-range and special conditions */
    maxmask = _mm_cmpgt_ps(x, _mm_set1_ps(maxlogf));
    minmask = _mm_cmple_ps(x, _mm_set1_ps(minlogf));

    /* range reduction: exp(x) = 2^k e^f = exp(f + k log 2); k = floorf(0.5 + x / log2): */
    fx = _mm_mul_ps(x,  _mm_set1_ps(eslCONST_LOG2R));
    fx = _mm_add_ps(fx, _mm_set1_ps(0.5f));

    /* floorf() with SSE:  */
    k    = _mm_cvttps_epi32(fx);	              /* cast to int with truncation                  */
    tmp  = _mm_cvtepi32_ps(k);	              /* cast back to float                           */
    mask = _mm_cmpgt_ps(tmp, fx);               /* if it increased (i.e. if it was negative...) */
    mask = _mm_and_ps(mask, _mm_set1_ps(1.0f)); /* ...without a conditional branch...           */
    fx   = _mm_sub_ps(tmp, mask);	              /* then subtract one.                           */
    k    = _mm_cvttps_epi32(fx);	              /* k is now ready for the 2^k part.             */

    /* polynomial approx for e^f for f in range [-0.5, 0.5] */
    tmp = _mm_mul_ps(fx, _mm_set1_ps(cephes_c[0]));
    z   = _mm_mul_ps(fx, _mm_set1_ps(cephes_c[1]));
    x   = _mm_sub_ps(x, tmp);
    x   = _mm_sub_ps(x, z);
    z   = _mm_mul_ps(x, x);

    y =               _mm_set1_ps(cephes_p[0]);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _mm_set1_ps(cephes_p[1]));
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _mm_set1_ps(cephes_p[2]));
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _mm_set1_ps(cephes_p[3]));
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _mm_set1_ps(cephes_p[4]));
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _mm_set1_ps(cephes_p[5]));
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, x);
    y = _mm_add_ps(y, _mm_set1_ps(1.0f));

    /* build 2^k by hand, by creating a IEEE754 float */
    k  = _mm_add_epi32(k, _mm_set1_epi32(127));
    k  = _mm_slli_epi32(k, 23);
    fx = _mm_castsi128_ps(k);

    /* put 2^k e^f together (fx = 2^k,  y = e^f) and we're done */
    y = _mm_mul_ps(y, fx);

    /* special/range cleanup */
    y = esl_sse_select_ps(y, _mm_set1_ps(eslINFINITY), maxmask); /* exp(x) = inf for x > log(2^128)  */
    y = esl_sse_select_ps(y, _mm_set1_ps(0.0f),        minmask); /* exp(x) = 0   for x < log(2^-149) */
    return y;
}
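
/* The 2^k construction described in the note above is easy to check in
   scalar form; a sketch assuming IEEE754 single precision (valid for normal
   results, -126 <= k <= 127; needs <stdint.h> and <string.h>): */
static float
pow2_int(int k)
{
  uint32_t bits = (uint32_t) (k + 127) << 23;  /* biased exponent, zero mantissa            */
  float    f;
  memcpy(&f, &bits, sizeof(f));                /* same reinterpretation as _mm_castsi128_ps */
  return f;                                    /* == powf(2.0f, (float) k)                  */
}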
Example #12
static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
                                            const unsigned char *_blimit,
                                            const unsigned char *_limit,
                                            const unsigned char *_thresh) {
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
  __m128i abs_p1p0;

  const __m128i thresh =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
  const __m128i blimit =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));

  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
  q4p4 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  q3p3 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  q2p2 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  q1p1 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0p0 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

  {
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
    abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1));
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    fe = _mm_set1_epi8(0xfe);
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0));
    abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1));
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)),
        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i t1 = _mm_set1_epi16(0x1);
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, qs0ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 0xB);
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 0xB);

    /* Filter1 >> 3 */
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi16(filter1, t1);
    filt = _mm_srai_epi16(filt, 1);
    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
                            filt);
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
    // loopfilter done

    {
      __m128i work;
      flat = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)),
          _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3)));
      flat = _mm_max_epu8(abs_p1p0, flat);
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
      q5p5 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));

      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
      q6p6 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));

      flat2 = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)),
          _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5)));

      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
      q7p7 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));

      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)),
          _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7)));

      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
      __m128i pixelFilter_p, pixelFilter_q;
      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);

      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                                    _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                                    _mm_add_epi16(q4_16, q3_16));

      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
      pixelFilter_p =
          _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
      pixetFilter_p2p1p0 = _mm_add_epi16(
          four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);

      flat_q0p0 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(p7_16, p7_16);
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
      sum_q3 = _mm_add_epi16(q3_16, q3_16);

      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);

      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    flat = _mm_shuffle_epi32(flat, 68);
    flat2 = _mm_shuffle_epi32(flat2, 68);

    q2p2 = _mm_andnot_si128(flat, q2p2);
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);

    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

    q6p6 = _mm_andnot_si128(flat2, q6p6);
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));

    q5p5 = _mm_andnot_si128(flat2, q5p5);
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));

    q4p4 = _mm_andnot_si128(flat2, q4p4);
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));

    q3p3 = _mm_andnot_si128(flat2, q3p3);
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));

    q2p2 = _mm_andnot_si128(flat2, q2p2);
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));

    q1p1 = _mm_andnot_si128(flat2, q1p1);
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));

    q0p0 = _mm_andnot_si128(flat2, q0p0);
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
  }
}
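
// The recurring pattern _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a))
// computes the per-byte absolute difference: unsigned saturating subtraction
// clamps the wrong-way result to zero, so OR-ing both directions leaves
// |a - b| without widening to 16 bits. As a standalone sketch:
static inline __m128i abs_diff_epu8(__m128i a, __m128i b) {
  return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
}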
Example #13
INLINE __m128 shade(BilinearSamplePos const& bsp, const SWR_TRIANGLE_DESC & work, WideVector<BilinearSamplePos::NUM_ATTRIBUTES, __m128> const& pAttrs, BYTE* pBuffer, BYTE*, UINT*)
{
	TextureView *pTxv = (TextureView*)work.pTextureViews[KNOB_NUMBER_OF_TEXTURE_VIEWS + 0];
	Sampler *pSmp = (Sampler*)work.pSamplers[0];

	TexCoord tcidx;
	tcidx.U = get<4>(pAttrs);
	tcidx.V = get<5>(pAttrs);
	UINT mips[] = {0,0,0,0};
	WideColor color;
	SampleSimplePointRGBAF32(*pTxv, *pSmp, tcidx, mips, color);

	// modulate
	color.R = _mm_mul_ps(color.R, get<0>(pAttrs));
	color.G = _mm_mul_ps(color.G, get<1>(pAttrs));
	color.B = _mm_mul_ps(color.B, get<2>(pAttrs));
	color.A = _mm_mul_ps(color.A, get<3>(pAttrs));

	// convert float to unorm
	__m128i r = vFloatToUnorm( color.R );
	__m128i g = vFloatToUnorm( color.G );
	__m128i b = vFloatToUnorm( color.B );
	__m128i a = vFloatToUnorm( color.A );

	// pack
	__m128i vPixel = b;
	vPixel = _mm_or_si128(vPixel, _mm_slli_epi32(g, 8));
	vPixel = _mm_or_si128(vPixel, _mm_slli_epi32(r, 16));
	vPixel = _mm_or_si128(vPixel, _mm_slli_epi32(a, 24));

	// blend with GL_ONE and GL_ONE
	if (bsp.sFactor == GL_ONE && bsp.dFactor == GL_ONE)
	{ 
		__m128i vColorBuffer = _mm_load_si128((const __m128i*)pBuffer);
		vPixel = _mm_adds_epu8(vPixel, vColorBuffer);
	}
	
	if (bsp.sFactor == GL_SRC_ALPHA && bsp.dFactor == GL_ONE_MINUS_SRC_ALPHA)
	{
		const __m128i SHUF_ALPHA = _mm_set_epi32(0x8080800f, 0x8080800b, 0x80808007, 0x80808003);
		const __m128i SHUF_RED = _mm_set_epi32(0x8080800e, 0x8080800a, 0x80808006, 0x80808002);
		const __m128i SHUF_GREEN = _mm_set_epi32(0x8080800d, 0x80808009, 0x80808005, 0x80808001);
		const __m128i SHUF_BLUE = _mm_set_epi32(0x8080800c, 0x80808008, 0x80808004, 0x80808000);

		// mul by src_alpha
		__m128 vSrcRedF = _mm_mul_ps(color.R, color.A);
		__m128 vSrcGreenF = _mm_mul_ps(color.G, color.A);
		__m128 vSrcBlueF = _mm_mul_ps(color.B, color.A);

		// convert to int
		__m128i vSrcRed = vFloatToUnorm(vSrcRedF);
		__m128i vSrcGreen = vFloatToUnorm(vSrcGreenF);
		__m128i vSrcBlue = vFloatToUnorm(vSrcBlueF);
		__m128i vSrcAlpha = vFloatToUnorm(color.A);

		// pack
		__m128i vSrcPixel = vSrcBlue;
		vSrcPixel = _mm_or_si128(vSrcPixel, _mm_slli_epi32(vSrcGreen, 8));
		vSrcPixel = _mm_or_si128(vSrcPixel, _mm_slli_epi32(vSrcRed, 16));
		vSrcPixel = _mm_or_si128(vSrcPixel, _mm_slli_epi32(vSrcAlpha, 24));

		// shuffle dst R,G,B,A
		__m128i vColorBuffer = _mm_load_si128((const __m128i*)pBuffer);

		__m128i vDstAlpha = _mm_shuffle_epi8(vColorBuffer, SHUF_ALPHA);
		__m128i vDstRed = _mm_shuffle_epi8(vColorBuffer, SHUF_RED);
		__m128i vDstGreen = _mm_shuffle_epi8(vColorBuffer, SHUF_GREEN);
		__m128i vDstBlue = _mm_shuffle_epi8(vColorBuffer, SHUF_BLUE);

		// convert to float
		__m128 vDstAlphaF = _mm_cvtepi32_ps(vDstAlpha);
		__m128 vDstRedF = _mm_cvtepi32_ps(vDstRed);
		__m128 vDstGreenF = _mm_cvtepi32_ps(vDstGreen);
		__m128 vDstBlueF = _mm_cvtepi32_ps(vDstBlue);

		// mul by 1-src_alpha
		__m128 vOneMinusSrcAlphaF = _mm_sub_ps(_mm_set1_ps(1.0f), color.A);
		
		vDstAlphaF = _mm_mul_ps(vDstAlphaF, vOneMinusSrcAlphaF);
		vDstRedF = _mm_mul_ps(vDstRedF, vOneMinusSrcAlphaF);
		vDstGreenF = _mm_mul_ps(vDstGreenF, vOneMinusSrcAlphaF);
		vDstBlueF = _mm_mul_ps(vDstBlueF, vOneMinusSrcAlphaF);

		// convert to int
		vDstAlpha = _mm_cvtps_epi32(vDstAlphaF);
		vDstRed = _mm_cvtps_epi32(vDstRedF);
		vDstGreen = _mm_cvtps_epi32(vDstGreenF);
		vDstBlue = _mm_cvtps_epi32(vDstBlueF);

		// pack
		__m128i vDstPixel = vDstBlue;
		vDstPixel = _mm_or_si128(vDstPixel, _mm_slli_epi32(vDstGreen, 8));
		vDstPixel = _mm_or_si128(vDstPixel, _mm_slli_epi32(vDstRed, 16));
		vDstPixel = _mm_or_si128(vDstPixel, _mm_slli_epi32(vDstAlpha, 24));

		// final rgba = min(src + dst,255)
		vPixel = _mm_adds_epu8(vSrcPixel, vDstPixel);
	}

	return _mm_castsi128_ps(vPixel);
}
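
/* vFloatToUnorm() is not part of this excerpt. Assuming it maps [0,1] floats
   to 0..255 integer lanes, as its uses above imply, a plausible sketch: */
static inline __m128i vFloatToUnorm( __m128 v )
{
	/* hypothetical helper: clamp to [0,1], then scale to 0..255 */
	v = _mm_min_ps(_mm_max_ps(v, _mm_setzero_ps()), _mm_set1_ps(1.0f));
	return _mm_cvtps_epi32(_mm_mul_ps(v, _mm_set1_ps(255.0f)));
}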
Example #14
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them;
   it is almost as fast, and gives you a free cosine with your sine */
void sincos_ps(__m128 x, __m128* s, __m128* c) {
    typedef __m128 v4sf;
    typedef __m128i v4si;

    v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
    v4si emm0, emm2, emm4;
    sign_bit_sin = x;
    /* take the absolute value */
    x = _mm_and_ps(x, constants::inv_sign_mask.ps);
    /* extract the sign bit (upper one) */
    sign_bit_sin = _mm_and_ps(sign_bit_sin, constants::sign_mask.ps);

    /* scale by 4/Pi */
    y = _mm_mul_ps(x, constants::cephes_FOPI.ps);

    /* store the integer part of y in emm2 */
    emm2 = _mm_cvttps_epi32(y);

    /* j=(j+1) & (~1) (see the cephes sources) */
    emm2 = _mm_add_epi32(emm2, constants::pi32_1.pi);
    emm2 = _mm_and_si128(emm2, constants::pi32_inv1.pi);
    y = _mm_cvtepi32_ps(emm2);

    emm4 = emm2;

    /* get the swap sign flag for the sine */
    emm0 = _mm_and_si128(emm2, constants::pi32_4.pi);
    emm0 = _mm_slli_epi32(emm0, 29);
    v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

    /* get the polynomial selection mask for the sine */
    emm2 = _mm_and_si128(emm2, constants::pi32_2.pi);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    v4sf poly_mask = _mm_castsi128_ps(emm2);

    /* The magic pass: "Extended precision modular arithmetic"
    x = ((x - y * DP1) - y * DP2) - y * DP3; */
    xmm1 = constants::minus_cephes_DP1.ps;
    xmm2 = constants::minus_cephes_DP2.ps;
    xmm3 = constants::minus_cephes_DP3.ps;
    xmm1 = _mm_mul_ps(y, xmm1);
    xmm2 = _mm_mul_ps(y, xmm2);
    xmm3 = _mm_mul_ps(y, xmm3);
    x = _mm_add_ps(x, xmm1);
    x = _mm_add_ps(x, xmm2);
    x = _mm_add_ps(x, xmm3);

    emm4 = _mm_sub_epi32(emm4, constants::pi32_2.pi);
    emm4 = _mm_andnot_si128(emm4, constants::pi32_4.pi);
    emm4 = _mm_slli_epi32(emm4, 29);
    v4sf sign_bit_cos = _mm_castsi128_ps(emm4);

    sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);


    /* Evaluate the first polynomial: the cosine approximation on the reduced range |x| <= Pi/4 */
    v4sf z = _mm_mul_ps(x,x);
    y = constants::coscof_p0.ps;

    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, constants::coscof_p1.ps);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, constants::coscof_p2.ps);
    y = _mm_mul_ps(y, z);
    y = _mm_mul_ps(y, z);
    v4sf tmp = _mm_mul_ps(z, constants::ps_0p5.ps);
    y = _mm_sub_ps(y, tmp);
    y = _mm_add_ps(y, constants::ps_1.ps);

    /* Evaluate the second polynomial: the sine approximation on the reduced range |x| <= Pi/4 */

    v4sf y2 = constants::sincof_p0.ps;
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, constants::sincof_p1.ps);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, constants::sincof_p2.ps);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_mul_ps(y2, x);
    y2 = _mm_add_ps(y2, x);

    /* select the correct result from the two polynomials */
    xmm3 = poly_mask;
    v4sf ysin2 = _mm_and_ps(xmm3, y2);
    v4sf ysin1 = _mm_andnot_ps(xmm3, y);
    y2 = _mm_sub_ps(y2,ysin2);
    y = _mm_sub_ps(y, ysin1);

    xmm1 = _mm_add_ps(ysin1,ysin2);
    xmm2 = _mm_add_ps(y,y2);

    /* update the sign */
    *s = _mm_xor_ps(xmm1, sign_bit_sin);
    *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
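
/* A minimal usage sketch of the "free cosine" (assumes the same constants::
   tables as above): */
static void sincos_demo() {
    __m128 angles = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); /* lanes 0..3 hold 0,1,2,3 */
    __m128 s, c;
    sincos_ps(angles, &s, &c); /* s = the four sines, c = the four cosines */
}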
Example #15
void sincos_ps(__m128 x, __m128 *s, __m128 *c) {
  __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
  __m128i emm0, emm2, emm4;
  sign_bit_sin = x;
  x = _mm_and_ps(x, *reinterpret_cast<const __m128*>(_pi_inv_sign_mask));
  sign_bit_sin = _mm_and_ps(sign_bit_sin,
                            *reinterpret_cast<const __m128*>(_pi_sign_mask));
  y = _mm_mul_ps(x, *_ps_cephes_FOPI);
  emm2 = _mm_cvttps_epi32(y);
  emm2 = _mm_add_epi32(emm2, *_pi_1);
  emm2 = _mm_and_si128(emm2, *_pi_inv1);
  y = _mm_cvtepi32_ps(emm2);
  emm4 = emm2;
  emm0 = _mm_and_si128(emm2, *_pi_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
  emm2 = _mm_and_si128(emm2, *_pi_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  __m128 poly_mask = _mm_castsi128_ps(emm2);
  xmm1 = *_ps_minus_cephes_DP1;
  xmm2 = *_ps_minus_cephes_DP2;
  xmm3 = *_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);
  emm4 = _mm_sub_epi32(emm4, *_pi_2);
  emm4 = _mm_andnot_si128(emm4, *_pi_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  __m128 sign_bit_cos = _mm_castsi128_ps(emm4);
  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
  __m128 z = _mm_mul_ps(x, x);
  y = *_ps_coscof_p0;
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  __m128 tmp = _mm_mul_ps(z, *_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *_ps_1);
  __m128 y2 = *_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);
  xmm3 = poly_mask;
  __m128 ysin2 = _mm_and_ps(xmm3, y2);
  __m128 ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2, ysin2);
  y = _mm_sub_ps(y, ysin1);
  xmm1 = _mm_add_ps(ysin1, ysin2);
  xmm2 = _mm_add_ps(y, y2);
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
__m128 exp_ps(v4sfu *xPtr) {
   __m128 x=*((__m128 *)xPtr);
   __m128 tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
   __m128i emm0;
#else
   __m64 mm0, mm1;
#endif
   __m128 one = *(__m128*)_ps_1;

   x = _mm_min_ps(x, *(__m128*)_ps_exp_hi);
   x = _mm_max_ps(x, *(__m128*)_ps_exp_lo);

   /* express exp(x) as exp(g + n*log(2)) */
   fx = _mm_mul_ps(x, *(__m128*)_ps_cephes_LOG2EF);
   fx = _mm_add_ps(fx, *(__m128*)_ps_0p5);

   /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
   /* step 1 : cast to int */
   tmp = _mm_movehl_ps(tmp, fx);
   mm0 = _mm_cvttps_pi32(fx);
   mm1 = _mm_cvttps_pi32(tmp);
   /* step 2 : cast back to float */
   tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
   emm0 = _mm_cvttps_epi32(fx);
   tmp  = _mm_cvtepi32_ps(emm0);
#endif
   /* if greater, subtract 1 */
   __m128 mask = _mm_cmpgt_ps(tmp, fx);    
   mask = _mm_and_ps(mask, one);
   fx = _mm_sub_ps(tmp, mask);

   tmp = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C1);
   __m128 z = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C2);
   x = _mm_sub_ps(x, tmp);
   x = _mm_sub_ps(x, z);

   z = _mm_mul_ps(x,x);

   __m128 y = *(__m128*)_ps_cephes_exp_p0;
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p1);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p2);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p3);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p4);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p5);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, x);
   y = _mm_add_ps(y, one);

   /* build 2^n */
#ifndef USE_SSE2
   z = _mm_movehl_ps(z, fx);
   mm0 = _mm_cvttps_pi32(fx);
   mm1 = _mm_cvttps_pi32(z);
   mm0 = _mm_add_pi32(mm0, *(__m64*)_pi32_0x7f);
   mm1 = _mm_add_pi32(mm1, *(__m64*)_pi32_0x7f);
   mm0 = _mm_slli_pi32(mm0, 23); 
   mm1 = _mm_slli_pi32(mm1, 23);

   __m128 pow2n; 
   COPY_MM_TO_XMM(mm0, mm1, pow2n);
   _mm_empty();
#else
   emm0 = _mm_cvttps_epi32(fx);
   emm0 = _mm_add_epi32(emm0, *(__m128i*)_pi32_0x7f);
   emm0 = _mm_slli_epi32(emm0, 23);
   __m128 pow2n = _mm_castsi128_ps(emm0);
#endif
   y = _mm_mul_ps(y, pow2n);
   return y;
}
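
/* The floor-by-truncation used above (and in esl_sse_expf) is the same trick
   in both places: truncate toward zero, then subtract 1 wherever truncation
   rounded up, which only happens for negative inputs. A scalar sketch: */
static inline float floor_via_trunc(float x)
{
   float t = (float)(int)x;       /* round toward zero (what _mm_cvttps_epi32 does)     */
   return (t > x) ? t - 1.0f : t; /* the SSE version does this branchlessly with a mask */
}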
Example #17
void spu_rotmahi( SPU_t* SPU, SPU_INSTRUCTION op )
{
	const int s = 0x1f&( 0 - SignExtend(op.RI7.I7, 7) );
	SPU->GPR[op.RI7.RT] = _mm_castsi128_ps( _mm_srai_epi16( _mm_castps_si128( SPU->GPR[op.RI7.RA] ), s ) );
}
/* almost the same as sin_ps */
__m128 cos_ps(v4sfu *xPtr) { // any x
   __m128 x=*((__m128 *)xPtr);
   __m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
   __m128i emm0, emm2;
#else
   __m64 mm0, mm1, mm2, mm3;
#endif
   /* take the absolute value */
   x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);

   /* scale by 4/Pi */
   y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);

#ifdef USE_SSE2
   /* store the integer part of y in emm2 */
   emm2 = _mm_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
   emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
   y = _mm_cvtepi32_ps(emm2);

   emm2 = _mm_sub_epi32(emm2, *(__m128i*)_pi32_2);

   /* get the swap sign flag */
   emm0 = _mm_andnot_si128(emm2, *(__m128i*)_pi32_4);
   emm0 = _mm_slli_epi32(emm0, 29);
   /* get the polynomial selection mask */
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

   __m128 sign_bit = _mm_castsi128_ps(emm0);
   __m128 poly_mask = _mm_castsi128_ps(emm2);
#else
   /* store the integer part of y in mm2:mm3 */
   xmm2 = _mm_movehl_ps(xmm2, y);
   mm2 = _mm_cvttps_pi32(y);
   mm3 = _mm_cvttps_pi32(xmm2);

   /* j=(j+1) & (~1) (see the cephes sources) */
   mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
   mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);

   y = _mm_cvtpi32x2_ps(mm2, mm3);


   mm2 = _mm_sub_pi32(mm2, *(__m64*)_pi32_2);
   mm3 = _mm_sub_pi32(mm3, *(__m64*)_pi32_2);

   /* get the swap sign flag in mm0:mm1 and the
   polynomial selection mask in mm2:mm3 */

   mm0 = _mm_andnot_si64(mm2, *(__m64*)_pi32_4);
   mm1 = _mm_andnot_si64(mm3, *(__m64*)_pi32_4);
   mm0 = _mm_slli_pi32(mm0, 29);
   mm1 = _mm_slli_pi32(mm1, 29);

   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);

   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

   __m128 sign_bit, poly_mask;
   COPY_MM_TO_XMM(mm0, mm1, sign_bit);
   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
   _mm_empty(); /* good-bye mmx */
#endif
   /* The magic pass: "Extended precision modular arithmetic"
   x = ((x - y * DP1) - y * DP2) - y * DP3; */
   xmm1 = *(__m128*)_ps_minus_cephes_DP1;
   xmm2 = *(__m128*)_ps_minus_cephes_DP2;
   xmm3 = *(__m128*)_ps_minus_cephes_DP3;
   xmm1 = _mm_mul_ps(y, xmm1);
   xmm2 = _mm_mul_ps(y, xmm2);
   xmm3 = _mm_mul_ps(y, xmm3);
   x = _mm_add_ps(x, xmm1);
   x = _mm_add_ps(x, xmm2);
   x = _mm_add_ps(x, xmm3);

   /* Evaluate the first polynomial: the cosine approximation on the reduced range |x| <= Pi/4 */
   y = *(__m128*)_ps_coscof_p0;
   __m128 z = _mm_mul_ps(x,x);

   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
   y = _mm_mul_ps(y, z);
   y = _mm_mul_ps(y, z);
   __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);
   y = _mm_add_ps(y, *(__m128*)_ps_1);

   /* Evaluate the second polynomial: the sine approximation on the reduced range |x| <= Pi/4 */

   __m128 y2 = *(__m128*)_ps_sincof_p0;
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_mul_ps(y2, x);
   y2 = _mm_add_ps(y2, x);

   /* select the correct result from the two polynomials */
   xmm3 = poly_mask;
   y2 = _mm_and_ps(xmm3, y2);
   y = _mm_andnot_ps(xmm3, y);
   y = _mm_add_ps(y,y2);
   /* update the sign */
   y = _mm_xor_ps(y, sign_bit);

   return y;
}
Example #19
static void cftmdl_128_SSE2(float* a) {
  const int l = 8;
  const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
  int j0;

  __m128 wk1rv = _mm_load_ps(cftmdl_wk1r);
  for (j0 = 0; j0 < l; j0 += 2) {
    const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
    const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
    const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
    const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
    const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
                                          _mm_castsi128_ps(a_32),
                                          _MM_SHUFFLE(1, 0, 1, 0));
    const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
                                          _mm_castsi128_ps(a_40),
                                          _MM_SHUFFLE(1, 0, 1, 0));
    __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
    const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);

    const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
    const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
    const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
    const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
    const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
                                          _mm_castsi128_ps(a_48),
                                          _MM_SHUFFLE(1, 0, 1, 0));
    const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
                                          _mm_castsi128_ps(a_56),
                                          _MM_SHUFFLE(1, 0, 1, 0));
    const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
    const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);

    const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
    const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);

    const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
        _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
    const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
    const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
    const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);

    const __m128 yy0 =
        _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(2, 2, 2, 2));
    const __m128 yy1 =
        _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(3, 3, 3, 3));
    const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1);
    const __m128 yy3 = _mm_add_ps(yy0, yy2);
    const __m128 yy4 = _mm_mul_ps(wk1rv, yy3);

    _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx0));
    _mm_storel_epi64(
        (__m128i*)&a[j0 + 32],
        _mm_shuffle_epi32(_mm_castps_si128(xx0), _MM_SHUFFLE(3, 2, 3, 2)));

    _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx1));
    _mm_storel_epi64(
        (__m128i*)&a[j0 + 48],
        _mm_shuffle_epi32(_mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 2, 3)));
    a[j0 + 48] = -a[j0 + 48];

    _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(x1_x3_add));
    _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(x1_x3_sub));

    _mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_castps_si128(yy4));
    _mm_storel_epi64(
        (__m128i*)&a[j0 + 56],
        _mm_shuffle_epi32(_mm_castps_si128(yy4), _MM_SHUFFLE(2, 3, 2, 3)));
  }

  {
    int k = 64;
    int k1 = 2;
    int k2 = 2 * k1;
    const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2 + 0]);
    const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]);
    const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]);
    const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]);
    const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]);
    wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]);
    for (j0 = k; j0 < l + k; j0 += 2) {
      const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
      const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
      const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
      const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
      const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
                                            _mm_castsi128_ps(a_32),
                                            _MM_SHUFFLE(1, 0, 1, 0));
      const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
                                            _mm_castsi128_ps(a_40),
                                            _MM_SHUFFLE(1, 0, 1, 0));
      __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
      const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);

      const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
      const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
      const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
      const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
      const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
                                            _mm_castsi128_ps(a_48),
                                            _MM_SHUFFLE(1, 0, 1, 0));
      const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
                                            _mm_castsi128_ps(a_56),
                                            _MM_SHUFFLE(1, 0, 1, 0));
      const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
      const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);

      const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      const __m128 xx2 = _mm_mul_ps(xx1, wk2rv);
      const __m128 xx3 =
          _mm_mul_ps(wk2iv,
                     _mm_castsi128_ps(_mm_shuffle_epi32(
                         _mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 0, 1))));
      const __m128 xx4 = _mm_add_ps(xx2, xx3);

      const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
          _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
      const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
      const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
      const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);

      const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv);
      const __m128 xx11 = _mm_mul_ps(
          wk1iv,
          _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add),
                                             _MM_SHUFFLE(2, 3, 0, 1))));
      const __m128 xx12 = _mm_add_ps(xx10, xx11);

      const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv);
      const __m128 xx21 = _mm_mul_ps(
          wk3iv,
          _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub),
                                             _MM_SHUFFLE(2, 3, 0, 1))));
      const __m128 xx22 = _mm_add_ps(xx20, xx21);

      _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx));
      _mm_storel_epi64(
          (__m128i*)&a[j0 + 32],
          _mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2)));

      _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4));
      _mm_storel_epi64(
          (__m128i*)&a[j0 + 48],
          _mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2)));

      _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12));
      _mm_storel_epi64(
          (__m128i*)&a[j0 + 40],
          _mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2)));

      _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22));
      _mm_storel_epi64(
          (__m128i*)&a[j0 + 56],
          _mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2)));
    }
  }
}
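The mul/shuffle/add triples in the second block above (xx2/xx3/xx4, xx10/xx11/xx12, xx20/xx21/xx22) are vectorized complex multiplications by the twiddle factors: each __m128 holds two complex values as [re0, im0, re1, im1], the _MM_SHUFFLE(2, 3, 0, 1) swap exchanges real and imaginary lanes, and the final add folds in the cross terms. A scalar sketch of what one lane pair computes (an illustration only; it assumes the rdft_wk*i tables store the imaginary parts with alternating signs so that a plain add yields the subtraction in the real lane):

static void twiddle_mul(float* re, float* im, float wr, float wi) {
  /* (re + j*im) * (wr + j*wi) */
  const float r = *re, i = *im;
  *re = r * wr - i * wi;
  *im = r * wi + i * wr;
}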
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them;
it is almost as fast, and gives you a free cosine with your sine */
void sincos_ps(v4sfu *xptr, v4sfu *sptr, v4sfu *cptr) {
   __m128 x=*((__m128 *)xptr), *s=(__m128 *)sptr, *c=(__m128 *)cptr, xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
   __m128i emm0, emm2, emm4;
#else
   __m64 mm0, mm1, mm2, mm3, mm4, mm5;
#endif
   sign_bit_sin = x;
   /* take the absolute value */
   x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
   /* extract the sign bit (upper one) */
   sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask);

   /* scale by 4/Pi */
   y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);

#ifdef USE_SSE2
   /* store the integer part of y in emm2 */
   emm2 = _mm_cvttps_epi32(y);

   /* j=(j+1) & (~1) (see the cephes sources) */
   emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
   y = _mm_cvtepi32_ps(emm2);

   emm4 = emm2;

   /* get the swap sign flag for the sine */
   emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4);
   emm0 = _mm_slli_epi32(emm0, 29);
   __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);

   /* get the polynom selection mask for the sine*/
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
   __m128 poly_mask = _mm_castsi128_ps(emm2);
#else
   /* store the integer part of y in mm2:mm3 */
   xmm3 = _mm_movehl_ps(xmm3, y);
   mm2 = _mm_cvttps_pi32(y);
   mm3 = _mm_cvttps_pi32(xmm3);

   /* j=(j+1) & (~1) (see the cephes sources) */
   mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
   mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);

   y = _mm_cvtpi32x2_ps(mm2, mm3);

   mm4 = mm2;
   mm5 = mm3;

   /* get the swap sign flag for the sine */
   mm0 = _mm_and_si64(mm2, *(__m64*)_pi32_4);
   mm1 = _mm_and_si64(mm3, *(__m64*)_pi32_4);
   mm0 = _mm_slli_pi32(mm0, 29);
   mm1 = _mm_slli_pi32(mm1, 29);
   __m128 swap_sign_bit_sin;
   COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

   /* get the polynom selection mask for the sine */

   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);
   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
   __m128 poly_mask;
   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

   /* The magic pass: "******" 
   x = ((x - y * DP1) - y * DP2) - y * DP3; */
   xmm1 = *(__m128*)_ps_minus_cephes_DP1;
   xmm2 = *(__m128*)_ps_minus_cephes_DP2;
   xmm3 = *(__m128*)_ps_minus_cephes_DP3;
   xmm1 = _mm_mul_ps(y, xmm1);
   xmm2 = _mm_mul_ps(y, xmm2);
   xmm3 = _mm_mul_ps(y, xmm3);
   x = _mm_add_ps(x, xmm1);
   x = _mm_add_ps(x, xmm2);
   x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
   emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2);
   emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4);
   emm4 = _mm_slli_epi32(emm4, 29);
   __m128 sign_bit_cos = _mm_castsi128_ps(emm4);
#else
   /* get the sign flag for the cosine */
   mm4 = _mm_sub_pi32(mm4, *(__m64*)_pi32_2);
   mm5 = _mm_sub_pi32(mm5, *(__m64*)_pi32_2);
   mm4 = _mm_andnot_si64(mm4, *(__m64*)_pi32_4);
   mm5 = _mm_andnot_si64(mm5, *(__m64*)_pi32_4);
   mm4 = _mm_slli_pi32(mm4, 29);
   mm5 = _mm_slli_pi32(mm5, 29);
   __m128 sign_bit_cos;
   COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
   _mm_empty(); /* good-bye mmx */
#endif

   sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);


   /* Evaluate the first polynom  (0 <= x <= Pi/4) */
   __m128 z = _mm_mul_ps(x,x);
   y = *(__m128*)_ps_coscof_p0;

   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
   y = _mm_mul_ps(y, z);
   y = _mm_mul_ps(y, z);
   __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);
   y = _mm_add_ps(y, *(__m128*)_ps_1);

   /* Evaluate the second polynom  (Pi/4 <= x <= 0) */

   __m128 y2 = *(__m128*)_ps_sincof_p0;
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_mul_ps(y2, x);
   y2 = _mm_add_ps(y2, x);

   /* select the correct result from the two polynoms */  
   xmm3 = poly_mask;
   __m128 ysin2 = _mm_and_ps(xmm3, y2);
   __m128 ysin1 = _mm_andnot_ps(xmm3, y);
   y2 = _mm_sub_ps(y2,ysin2);
   y = _mm_sub_ps(y, ysin1);

   xmm1 = _mm_add_ps(ysin1,ysin2);
   xmm2 = _mm_add_ps(y,y2);

   /* update the sign */
   *s = _mm_xor_ps(xmm1, sign_bit_sin);
   *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
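A minimal usage sketch for sincos_ps (assumptions: v4sfu is the library's 16-byte-aligned four-float wrapper and its storage can be filled through a float pointer; the names here are illustrative):

v4sfu angles, sines, cosines;
float *af = (float*)&angles;
af[0] = 0.0f; af[1] = 0.5f; af[2] = 1.0f; af[3] = 3.14159265f;
sincos_ps(&angles, &sines, &cosines);
/* lane i of sines/cosines now holds the sine/cosine of af[i] */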
Example #21
		void SWModelRenderer::RenderInner(spades::draw::SWModel *model,
									 const client::ModelRenderParam &param) {
			auto& mat = param.matrix;
			auto origin = mat.GetOrigin();
			auto axis1 = mat.GetAxis(0);
			auto axis2 = mat.GetAxis(1);
			auto axis3 = mat.GetAxis(2);
			auto *rawModel = model->GetRawModel();
			auto rawModelOrigin = rawModel->GetOrigin();
			rawModelOrigin += 0.1f;
			origin += axis1 * rawModelOrigin.x;
			origin += axis2 * rawModelOrigin.y;
			origin += axis3 * rawModelOrigin.z;
			
			int w = rawModel->GetWidth();
			int h = rawModel->GetHeight();
			//int d = rawModel->GetDepth();
			
			// evaluate brightness for each normal
			uint8_t brights[3*3*3];
			{
				auto lightVec = MakeVector3(0.f, -0.707f, -0.707f);
				float dot1 = Vector3::Dot(axis1, lightVec) * fastRSqrt(axis1.GetPoweredLength());
				float dot2 = Vector3::Dot(axis2, lightVec) * fastRSqrt(axis2.GetPoweredLength());
				float dot3 = Vector3::Dot(axis3, lightVec) * fastRSqrt(axis3.GetPoweredLength());
				for(int x = 0; x < 3; x++){
					float d;
					int cnt;
					switch(x){
						case 0: d = -dot1; cnt = 1; break;
						case 1: d = 0.f; cnt = 0; break;
						case 2: d = dot1; cnt = 1; break;
					}
					for(int y = 0; y < 3; y++){
						auto d2 = d;
						auto cnt2 = cnt;
						switch(y){
							case 0: d2 -= dot2; cnt2++; break;
							case 1: break;
							case 2: d2 += dot2; cnt2++; break;
						}
						for(int z = 0; z < 3; z++) {
							auto d3 = d2;
							auto cnt3 = cnt2;
							switch(z){
								case 0: d3 -= dot3; cnt3++; break;
								case 1: break;
								case 2: d3 += dot3; cnt3++; break;
							}
							switch(cnt3){
								case 2:
									d3 *= 0.707f;
									break;
								case 3:
									d3 *= 0.57735f;
									break;
							}
							d3 = 192.f + d3 * 62.f;
							brights[x + y * 3 + z * 9]
							= static_cast<uint8_t>(d3);
						}
					}
				}
			}
				
			
			// compute center coord. for culling
			{
				auto center = origin;
				auto localCenter = model->GetCenter();
				center += axis1 * localCenter.x;
				center += axis2 * localCenter.y;
				center += axis3 * localCenter.z;
				
				float largestAxis = axis1.GetPoweredLength();
				largestAxis = std::max(largestAxis, axis2.GetPoweredLength());
				largestAxis = std::max(largestAxis, axis3.GetPoweredLength());
				
				if(!r->SphereFrustrumCull(center, model->GetRadius() * sqrtf(largestAxis)))
					return;
			}
			
			Bitmap *fbmp = r->fb;
			auto *fb = fbmp->GetPixels();
			int fw = fbmp->GetWidth();
			int fh = fbmp->GetHeight();
			auto *db = r->depthBuffer.data();
			
			Matrix4 viewproj = r->GetProjectionViewMatrix();
			Vector4 ndc2scrscale = {fw * 0.5f, -fh * 0.5f, 1.f, 1.f};
			//Vector4 ndc2scroff = {fw * 0.5f, fh * 0.5f, 0.f, 0.f};
			int ndc2scroffX = fw >> 1;
			int ndc2scroffY = fh >> 1;
			
			
			// render each point
			auto tOrigin = viewproj * MakeVector4(origin.x, origin.y, origin.z, 1.f);
			auto tAxis1 = viewproj * MakeVector4(axis1.x, axis1.y, axis1.z, 0.f);
			auto tAxis2 = viewproj * MakeVector4(axis2.x, axis2.y, axis2.z, 0.f);
			auto tAxis3 = viewproj * MakeVector4(axis3.x, axis3.y, axis3.z, 0.f);
			tOrigin *= ndc2scrscale;
			tAxis1 *= ndc2scrscale;
			tAxis2 *= ndc2scrscale;
			tAxis3 *= ndc2scrscale;
			
			float pointDiameter;// = largestAxis * 0.55f * fh * 0.5f;
			{
				float largestAxis = tAxis1.GetPoweredLength();
				largestAxis = std::max(largestAxis, tAxis2.GetPoweredLength());
				largestAxis = std::max(largestAxis, tAxis3.GetPoweredLength());
				pointDiameter = sqrtf(largestAxis);
			}
			
			uint32_t customColor;
			customColor =
			ToFixed8(param.customColor.z) |
			(ToFixed8(param.customColor.y) << 8) |
			(ToFixed8(param.customColor.x) << 16);
			
			auto v1 = tOrigin;
			float zNear = r->sceneDef.zNear;
			for(int x = 0; x < w; x++) {
				auto v2 = v1;
				for(int y = 0; y < h; y++) {
					auto *mp = &model->renderData
					[model->renderDataAddr[x + y * w]];
					while(*mp != -1) {
						uint32_t data = *(mp++);
						uint32_t normal = *(mp++);
						int z = static_cast<int>(data >> 24);
						//SPAssert(z < d);
						SPAssert(z >= 0);
						
						auto vv = v2 + tAxis3 * zvals[z];
						if(vv.z < zNear) continue;
						
						// save Z value (don't divide this by W!)
						float zval = vv.z;
						
						// use vv.z for point radius to be divided by W
						vv.z = pointDiameter;
						
						// perspective division
						float scl = fastRcp(vv.w);
						vv *= scl;
						
						int ix = static_cast<int>(vv.x) + ndc2scroffX;
						int iy = static_cast<int>(vv.y) + ndc2scroffY;
						int idm = static_cast<int>(vv.z + .99f);
						idm = std::max(1, idm);
						int minX = ix - (idm >> 1);
						int minY = iy - (idm >> 1);
						if(minX >= fw || minY >= fh) continue;
						int maxX = ix + idm;
						int maxY = iy + idm;
						if(maxX <= 0 || maxY <= 0) continue;
						
						minX = std::max(minX, 0);
						minY = std::max(minY, 0);
						maxX = std::min(maxX, fw);
						maxY = std::min(maxY, fh);
						
						auto *fb2 = fb + (minX + minY * fw);
						auto *db2 = db + (minX + minY * fw);
						int w = maxX - minX; // span width; shadows the model width w above
						
						uint32_t color = data & 0xffffff;
						if(color == 0)
							color = customColor;
						
						SPAssert(normal < 27);
						int bright = brights[normal];
#if ENABLE_SSE2
						if(lvl == SWFeatureLevel::SSE2) {
							auto m = _mm_setr_epi32(color, 0, 0, 0);
							auto f = _mm_set1_epi16(bright << 8);
							
							m = _mm_unpacklo_epi8(m, _mm_setzero_si128());
							m = _mm_mulhi_epu16(m, f);
							m = _mm_packus_epi16(m, m);
							
							_mm_store_ss(reinterpret_cast<float*>(&color),
										 _mm_castsi128_ps(m));
						}else
#endif
						{
							uint32_t c1 = color & 0xff00;
							uint32_t c2 = color & 0xff00ff;
							c1 *= bright;
							c2 *= bright;
							color = ((c1&0xff0000) | (c2&0xff00ff00)) >> 8;
						}
						
						for(int yy = minY; yy < maxY; yy++){
							auto *fb3 = fb2;
							auto *db3 = db2;
							
							for(int xx = w; xx > 0; xx--) {
								if(zval < *db3) {
									*db3 = zval;
									*fb3 = color;
								}
								fb3++; db3++;
							}
							
							fb2 += fw;
							db2 += fw;
						}
						
						
					}
					v2 += tAxis2;
				}
				v1 += tAxis1;
			}
		}
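The non-SSE2 fallback near the end of the inner loop scales all three packed 8-bit colour channels by the brightness with just two integer multiplies: green is isolated in one word, red and blue in another, so every channel has eight bits of headroom for the product. Pulled out as a standalone sketch (not part of the original source):

static uint32_t scale_rgb(uint32_t color, uint32_t bright /* 0..255 */) {
  uint32_t c1 = color & 0x00ff00u;   // ..GG..
  uint32_t c2 = color & 0xff00ffu;   // RR..BB
  c1 *= bright;                      // each product needs 16 bits per channel
  c2 *= bright;
  return ((c1 & 0x00ff0000u) | (c2 & 0xff00ff00u)) >> 8;
}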
Example #22
void spu_interpreter::FRSQEST(SPUThread& CPU, spu_opcode_t op)
{
	const auto mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
	CPU.GPR[op.rt].vf = _mm_rsqrt_ps(_mm_and_ps(CPU.GPR[op.ra].vf, mask));
}
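The 0x7fffffff mask clears the sign bit so the estimate is taken on |ra|, and _mm_rsqrt_ps itself only guarantees roughly 12 bits of precision, which matches the "estimate" semantics of the SPU opcode. Where a refined value is wanted, the usual follow-up is one Newton-Raphson step (a generic sketch, not part of the interpreter):

static __m128 rsqrt_refined(__m128 x)
{
	const __m128 est = _mm_rsqrt_ps(x);
	// One Newton-Raphson iteration: est * 0.5 * (3 - x * est * est)
	return _mm_mul_ps(_mm_mul_ps(_mm_set1_ps(0.5f), est),
	                  _mm_sub_ps(_mm_set1_ps(3.0f),
	                             _mm_mul_ps(x, _mm_mul_ps(est, est))));
}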
Example #23
static __m128 mm_pow_ps(__m128 a, __m128 b) {
  // a^b = exp2(b * log2(a))
  //   exp2(x) and log2(x) are calculated using polynomial approximations.
  __m128 log2_a, b_log2_a, a_exp_b;

  // Calculate log2(x), x = a.
  {
    // To calculate log2(x), we decompose x like this:
    //   x = y * 2^n
    //     n is an integer
    //     y is in the [1.0, 2.0) range
    //
    //   log2(x) = log2(y) + n
    //     n       can be evaluated by playing with float representation.
    //     log2(y) in a small range can be approximated, this code uses an order
    //             five polynomial approximation. The coefficients have been
    //             estimated with the Remez algorithm and the resulting
    //             polynomial has a maximum relative error of 0.00086%.

    // Compute n.
    //    This is done by masking the exponent, shifting it into the top bit of
    //    the mantissa, putting eight into the biased exponent (to shift/
    //    compensate the fact that the exponent has been shifted in the top/
    //    fractional part and finally getting rid of the implicit leading one
    //    from the mantissa by subtracting it out.
    static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END = {
        0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
    static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END = {
        0x43800000, 0x43800000, 0x43800000, 0x43800000};
    static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END = {
        0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000};
    static const int shift_exponent_into_top_mantissa = 8;
    const __m128 two_n = _mm_and_ps(a, *((__m128*)float_exponent_mask));
    const __m128 n_1 = _mm_castsi128_ps(_mm_srli_epi32(
        _mm_castps_si128(two_n), shift_exponent_into_top_mantissa));
    const __m128 n_0 = _mm_or_ps(n_1, *((__m128*)eight_biased_exponent));
    const __m128 n = _mm_sub_ps(n_0, *((__m128*)implicit_leading_one));

    // Compute y.
    static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END = {
        0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF};
    static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END = {
        0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000};
    const __m128 mantissa = _mm_and_ps(a, *((__m128*)mantissa_mask));
    const __m128 y =
        _mm_or_ps(mantissa, *((__m128*)zero_biased_exponent_is_one));

    // Approximate log2(y) ~= (y - 1) * pol5(y).
    //    pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
    static const ALIGN16_BEG float ALIGN16_END C5[4] = {
        -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f};
    static const ALIGN16_BEG float ALIGN16_END
        C4[4] = {3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f};
    static const ALIGN16_BEG float ALIGN16_END
        C3[4] = {-1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f};
    static const ALIGN16_BEG float ALIGN16_END
        C2[4] = {2.5988452f, 2.5988452f, 2.5988452f, 2.5988452f};
    static const ALIGN16_BEG float ALIGN16_END
        C1[4] = {-3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f};
    static const ALIGN16_BEG float ALIGN16_END
        C0[4] = {3.1157899f, 3.1157899f, 3.1157899f, 3.1157899f};
    const __m128 pol5_y_0 = _mm_mul_ps(y, *((__m128*)C5));
    const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128*)C4));
    const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y);
    const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128*)C3));
    const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y);
    const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128*)C2));
    const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y);
    const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128*)C1));
    const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y);
    const __m128 pol5_y = _mm_add_ps(pol5_y_8, *((__m128*)C0));
    const __m128 y_minus_one =
        _mm_sub_ps(y, *((__m128*)zero_biased_exponent_is_one));
    const __m128 log2_y = _mm_mul_ps(y_minus_one, pol5_y);

    // Combine parts.
    log2_a = _mm_add_ps(n, log2_y);
  }

  // b * log2(a)
  b_log2_a = _mm_mul_ps(b, log2_a);

  // Calculate exp2(x), x = b * log2(a).
  {
    // To calculate 2^x, we decompose x like this:
    //   x = n + y
    //     n is an integer, the value of x - 0.5 rounded down, therefore
    //     y is in the [0.5, 1.5) range
    //
    //   2^x = 2^n * 2^y
    //     2^n can be evaluated by playing with float representation.
    //     2^y in a small range can be approximated, this code uses an order two
    //         polynomial approximation. The coefficients have been estimated
    //         with the Remez algorithm and the resulting polynomial has a
    //         maximum relative error of 0.17%.

    // To avoid over/underflow, we reduce the range of input to ]-127, 129].
    static const ALIGN16_BEG float max_input[4] ALIGN16_END = {129.f, 129.f,
                                                               129.f, 129.f};
    static const ALIGN16_BEG float min_input[4] ALIGN16_END = {
        -126.99999f, -126.99999f, -126.99999f, -126.99999f};
    const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128*)max_input));
    const __m128 x_max = _mm_max_ps(x_min, *((__m128*)min_input));
    // Compute n.
    static const ALIGN16_BEG float half[4] ALIGN16_END = {0.5f, 0.5f,
                                                          0.5f, 0.5f};
    const __m128 x_minus_half = _mm_sub_ps(x_max, *((__m128*)half));
    const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half);
    // Compute 2^n.
    static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END = {
        127, 127, 127, 127};
    static const int float_exponent_shift = 23;
    const __m128i two_n_exponent =
        _mm_add_epi32(x_minus_half_floor, *((__m128i*)float_exponent_bias));
    const __m128 two_n =
        _mm_castsi128_ps(_mm_slli_epi32(two_n_exponent, float_exponent_shift));
    // Compute y.
    const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor));
    // Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
    static const ALIGN16_BEG float C2[4] ALIGN16_END = {
        3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f};
    static const ALIGN16_BEG float C1[4] ALIGN16_END = {
        6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f};
    static const ALIGN16_BEG float C0[4] ALIGN16_END = {1.0017247f, 1.0017247f,
                                                        1.0017247f, 1.0017247f};
    const __m128 exp2_y_0 = _mm_mul_ps(y, *((__m128*)C2));
    const __m128 exp2_y_1 = _mm_add_ps(exp2_y_0, *((__m128*)C1));
    const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y);
    const __m128 exp2_y = _mm_add_ps(exp2_y_2, *((__m128*)C0));

    // Combine parts.
    a_exp_b = _mm_mul_ps(exp2_y, two_n);
  }
  return a_exp_b;
}
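A scalar rendering of the same decomposition can be handy for checking the vector path lane by lane (a sketch assuming IEEE-754 floats and the C99 math library; the function above uses polynomial approximations instead of log2f/exp2f):

#include <math.h>

static float scalar_pow(float a, float b) {
  int n;                              /* a = y * 2^n, with y in [0.5, 1) */
  const float y = frexpf(a, &n);
  const float log2_a = log2f(y) + (float)n;
  return exp2f(b * log2_a);           /* a^b = exp2(b * log2(a)) */
}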
Example #24
inline GPR_t si_andbi( GPR_t RA, uint8_t I10 )
{
	return _mm_castsi128_ps( _mm_and_si128( _mm_castps_si128( RA ), _mm_set1_epi8( I10 ) ) );
}
Example #25
	GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000), 
	GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000), 
	GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000),
	GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f), 
	GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f), 
	GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f), 
	GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f),
};

const GSVector4 GSVector4::m_ps0123(0.0f, 1.0f, 2.0f, 3.0f);
const GSVector4 GSVector4::m_ps4567(4.0f, 5.0f, 6.0f, 7.0f);
const GSVector4 GSVector4::m_half(0.5f);
const GSVector4 GSVector4::m_one(1.0f);
const GSVector4 GSVector4::m_two(2.0f);
const GSVector4 GSVector4::m_four(4.0f);
const GSVector4 GSVector4::m_x4b000000(_mm_castsi128_ps(_mm_set1_epi32(0x4b000000)));
const GSVector4 GSVector4::m_x4f800000(_mm_castsi128_ps(_mm_set1_epi32(0x4f800000)));

#if _M_SSE >= 0x500

const GSVector8 GSVector8::m_half(0.5f);
const GSVector8 GSVector8::m_one(1.0f);
const GSVector8 GSVector8::m_x7fffffff(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)));
const GSVector8 GSVector8::m_x80000000(_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)));
const GSVector8 GSVector8::m_x4b000000(_mm256_castsi256_ps(_mm256_set1_epi32(0x4b000000)));
const GSVector8 GSVector8::m_x4f800000(_mm256_castsi256_ps(_mm256_set1_epi32(0x4f800000)));

#endif

#if _M_SSE >= 0x501
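A gloss on the two hexadecimal float constants above (my note, not from the source): 0x4b000000 is the bit pattern of 8388608.0f = 2^23, the classic bias added to force rounding into the mantissa, and 0x4f800000 is 4294967296.0f = 2^32, commonly used to fix up unsigned-to-float conversions built from signed ones. A quick sanity check:

#include <emmintrin.h>
#include <assert.h>

int main(void) {
  const float two23 = _mm_cvtss_f32(_mm_castsi128_ps(_mm_set1_epi32(0x4b000000)));
  const float two32 = _mm_cvtss_f32(_mm_castsi128_ps(_mm_set1_epi32(0x4f800000)));
  assert(two23 == 8388608.0f);      /* 2^23 */
  assert(two32 == 4294967296.0f);   /* 2^32 */
  return 0;
}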
Example #26
inline GPR_t si_andi( GPR_t RA, int16_t I10 )
{
	return _mm_castsi128_ps( _mm_and_si128( _mm_castps_si128( RA ), _mm_set1_epi32( static_cast<int32_t>(SignExtend( I10, 10 ) ) ) ) );
}
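si_andi relies on a SignExtend helper that is not shown in this excerpt. A plausible branch-free definition (hypothetical; the project's real helper may differ) extends the 10-bit immediate to the full integer width:

template <typename T>
inline T SignExtend( T value, int bits )
{
	const T m = T(1) << (bits - 1);  // sign bit of the narrow field
	return ( value ^ m ) - m;        // flip the sign bit, then subtract its weight
}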
Example #27
inline GPR_t si_ori( GPR_t RA, int64_t IMM )
{
	return _mm_or_ps( RA, _mm_castsi128_ps( _mm_set1_epi32( (int32_t)IMM ) ) );
}
Example #28
inline GPR_t si_nand( GPR_t RA, GPR_t RB )
{
	return _mm_andnot_ps( _mm_and_ps( RA, RB ), _mm_castsi128_ps( _mm_set1_epi32( 0xffffffff ) ) );
}
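Note the operand order: _mm_andnot_ps(a, b) computes (~a) & b, so with an all-ones second operand si_nand reduces to the plain bitwise NAND ~(RA & RB). Scalar equivalent for reference:

inline uint32_t scalar_nand( uint32_t a, uint32_t b )
{
	return ~( a & b );
}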
Example #29
    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    int              vdwioffset0;
    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    int              nvdwtype;
    __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
    __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
    __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
    real             rswitch_scalar,d_scalar;
    __m128           dummy_mask,cutoff_mask;
    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    __m128           one     = _mm_set1_ps(1.0);
    __m128           two     = _mm_set1_ps(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    nvdwtype         = fr->ntype;
    vdwparam         = fr->nbfp;
Example #30
//----------------------------------------------------------------
// Transforms the AABB vertices to screen space once every frame
// Also performs a coarse depth pre-test
//----------------------------------------------------------------
PreTestResult TransformedAABBoxAVX::TransformAndPreTestAABBox(__m128 xformedPos[], const __m128 cumulativeMatrix[4], const float *pDepthSummary)
{
	// w ends up being garbage, but it doesn't matter - we ignore it anyway.
	__m128 vCenter = _mm_loadu_ps(&mBBCenter.x);
	__m128 vHalf   = _mm_loadu_ps(&mBBHalf.x);

	__m128 vMin    = _mm_sub_ps(vCenter, vHalf);
	__m128 vMax    = _mm_add_ps(vCenter, vHalf);

	// transforms
	__m128 xRow[2], yRow[2], zRow[2];
	xRow[0] = _mm_shuffle_ps(vMin, vMin, 0x00) * cumulativeMatrix[0];
	xRow[1] = _mm_shuffle_ps(vMax, vMax, 0x00) * cumulativeMatrix[0];
	yRow[0] = _mm_shuffle_ps(vMin, vMin, 0x55) * cumulativeMatrix[1];
	yRow[1] = _mm_shuffle_ps(vMax, vMax, 0x55) * cumulativeMatrix[1];
	zRow[0] = _mm_shuffle_ps(vMin, vMin, 0xaa) * cumulativeMatrix[2];
	zRow[1] = _mm_shuffle_ps(vMax, vMax, 0xaa) * cumulativeMatrix[2];

	__m128 zAllIn = _mm_castsi128_ps(_mm_set1_epi32(~0));
	__m128 screenMin = _mm_set1_ps(FLT_MAX);
	__m128 screenMax = _mm_set1_ps(-FLT_MAX);

	for(UINT i = 0; i < AABB_VERTICES; i++)
	{
		// Transform the vertex
		__m128 vert = cumulativeMatrix[3];
		vert += xRow[sBBxInd[i]];
		vert += yRow[sBByInd[i]];
		vert += zRow[sBBzInd[i]];

		// We have inverted z; z is in front of near plane iff z <= w.
		__m128 vertZ = _mm_shuffle_ps(vert, vert, 0xaa); // vert.zzzz
		__m128 vertW = _mm_shuffle_ps(vert, vert, 0xff); // vert.wwww
		__m128 zIn = _mm_cmple_ps(vertZ, vertW);
		zAllIn = _mm_and_ps(zAllIn, zIn);

		// project
		xformedPos[i] = _mm_div_ps(vert, vertW);
		
	    // update bounds
	    screenMin = _mm_min_ps(screenMin, xformedPos[i]);
	    screenMax = _mm_max_ps(screenMax, xformedPos[i]);
	}

	// if any of the verts are z-clipped, we (conservatively) say the box is in
	if(_mm_movemask_ps(zAllIn) != 0xf)
		return ePT_VISIBLE;

	// Clip against screen bounds
	screenMin = _mm_max_ps(screenMin, _mm_setr_ps(0.0f, 0.0f, 0.0f, -FLT_MAX));
	screenMax = _mm_min_ps(screenMax, _mm_setr_ps((float) (SCREENW - 1), (float) (SCREENH - 1), 1.0f, FLT_MAX));

	// Quick rejection test
	if(_mm_movemask_ps(_mm_cmplt_ps(screenMax, screenMin)))
		return ePT_INVISIBLE;

	// Prepare integer bounds
	__m128 minMaxXY = _mm_shuffle_ps(screenMin, screenMax, 0x44); // minX,minY,maxX,maxY
	__m128i minMaxXYi = _mm_cvtps_epi32(minMaxXY);
	__m128i minMaxXYis = _mm_srai_epi32(minMaxXYi, 3);

	__m128 maxZ = _mm_shuffle_ps(screenMax, screenMax, 0xaa);

	// Traverse all 8x8 blocks covered by 2d screen-space BBox;
	// if we know for sure that this box is behind the geometry we know is there,
	// we can stop.
	int rX0 = minMaxXYis.m128i_i32[0];
	int rY0 = minMaxXYis.m128i_i32[1];
	int rX1 = minMaxXYis.m128i_i32[2];
	int rY1 = minMaxXYis.m128i_i32[3];

	__m128 anyCloser = _mm_setzero_ps();
	for(int by = rY0; by <= rY1; by++)
	{
		const float *srcRow = pDepthSummary + by * (SCREENW/BLOCK_SIZE);

		// If for any 8x8 block, maxZ is not less than (=behind) summarized
		// min Z, box might be visible.
		for(int bx = rX0; bx <= rX1; bx++)
		{
			anyCloser = _mm_or_ps(anyCloser, _mm_cmpnlt_ss(maxZ, _mm_load_ss(&srcRow[bx])));
		}

		if(_mm_movemask_ps(anyCloser))
		{
			return ePT_UNSURE; // okay, box might be in
		}
	}

	// If we get here, we know for sure that the box is fully behind the stuff in the
	// depth buffer.
	return ePT_INVISIBLE;
}
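Two portability assumptions in the example above are worth flagging: the infix * and += on __m128 rely on operator support (built into GCC/Clang's vector extensions, and typically supplied as user-defined overloads on MSVC, where __m128 is a union type), and m128i_i32 is an MSVC-only member. A compiler-neutral lane read would look like this (my sketch, not part of the sample):

#include <emmintrin.h>

inline int lane_i32(__m128i v, int i)
{
	alignas(16) int tmp[4];
	_mm_store_si128(reinterpret_cast<__m128i*>(tmp), v);
	return tmp[i];
}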