示例#1
0
文件: BBox.cpp 项目: Mak13/Fast-BVH
bool BBox::intersect(const Ray& ray, float *tnear, float *tfar) const {

	// you may already have those values hanging around somewhere
	const __m128
		plus_inf	= loadps(ps_cst_plus_inf),
		minus_inf	= loadps(ps_cst_minus_inf);

	// use whatever's apropriate to load.
	const __m128
		box_min	= loadps(&min),
		box_max	= loadps(&max),
		pos	= loadps(&ray.o),
		inv_dir	= loadps(&ray.inv_d);

	// use a div if inverted directions aren't available
	const __m128 l1 = mulps(subps(box_min, pos), inv_dir);
	const __m128 l2 = mulps(subps(box_max, pos), inv_dir);

	// the order we use for those min/max is vital to filter out
	// NaNs that happens when an inv_dir is +/- inf and
	// (box_min - pos) is 0. inf * 0 = NaN
	const __m128 filtered_l1a = minps(l1, plus_inf);
	const __m128 filtered_l2a = minps(l2, plus_inf);

	const __m128 filtered_l1b = maxps(l1, minus_inf);
	const __m128 filtered_l2b = maxps(l2, minus_inf);

	// now that we're back on our feet, test those slabs.
	__m128 lmax = maxps(filtered_l1a, filtered_l2a);
	__m128 lmin = minps(filtered_l1b, filtered_l2b);

	// unfold back. try to hide the latency of the shufps & co.
	const __m128 lmax0 = rotatelps(lmax);
	const __m128 lmin0 = rotatelps(lmin);
	lmax = minss(lmax, lmax0);
	lmin = maxss(lmin, lmin0);

	const __m128 lmax1 = muxhps(lmax,lmax);
	const __m128 lmin1 = muxhps(lmin,lmin);
	lmax = minss(lmax, lmax1);
	lmin = maxss(lmin, lmin1);

	const bool ret = _mm_comige_ss(lmax, _mm_setzero_ps()) & _mm_comige_ss(lmax,lmin);

	storess(lmin, tnear);
	storess(lmax, tfar);

	return  ret;
}
示例#2
0
inline bool ray_box_intersect(const box_t & b, const ray_t & ray, rayseg_t & rs) {
    /* you may already have those values hanging around somewhere */
    const __m128
        plus_inf = loadps(ps_cst_plus_inf), minus_inf = loadps(ps_cst_minus_inf);

    /* use whatever's apropriate to load. */
    const __m128
        box_min = loadps(&b.min), box_max = loadps(&b.max), pos =
        loadps(&ray.pos), inv_dir = loadps(&ray.inv_dir);

    /* use a div if inverted directions aren't available */
    const __m128 l1 = mulps(subps(box_min, pos), inv_dir);
    const __m128 l2 = mulps(subps(box_max, pos), inv_dir);

    /* the order we use for those min/max is vital to filter out */
    /* NaNs that happens when an inv_dir is +/- inf and */
    /* (box_min - pos) is 0. inf * 0 = NaN */
    const __m128 filtered_l1a = minps(l1, plus_inf);
    const __m128 filtered_l2a = minps(l2, plus_inf);
    const __m128 filtered_l1b = maxps(l1, minus_inf);
    const __m128 filtered_l2b = maxps(l2, minus_inf);

    /* now that we're back on our feet, test those slabs. */
    __m128 lmax = maxps(filtered_l1a, filtered_l2a);
    __m128 lmin = minps(filtered_l1b, filtered_l2b);

    /* unfold back. try to hide the latency of the shufps & co. */
    const __m128 lmax0 = rotatelps(lmax);
    const __m128 lmin0 = rotatelps(lmin);
    lmax = minss(lmax, lmax0);
    lmin = maxss(lmin, lmin0);
    const __m128 lmax1 = muxhps(lmax, lmax);
    const __m128 lmin1 = muxhps(lmin, lmin);
    lmax = minss(lmax, lmax1);
    lmin = maxss(lmin, lmin1);
    const bool ret =
        _mm_comige_ss(lmax, _mm_setzero_ps()) & _mm_comige_ss(lmax, lmin);
    storess(lmin, &rs.t_near);
    storess(lmax, &rs.t_far);
    return ret;
}
示例#3
0
文件: sphere.cpp 项目: IrmatDen/pprt
bool Sphere::hit(const Ray &ray, IntersectionInfo &ii) const
{
	// Make a vector to avoid Point -> Vector casting below
	const Point3 localRayOrigin((worldToObject * ray.origin).get128());
	const Vector3 localRayOriginAsVec(localRayOrigin);
	const Vector3 localRayDir	((worldToObject * ray.direction()).get128());

#ifdef SSE4
	const __m128 a = dotps(localRayDir.get128(), localRayDir.get128());
	const __m128 b = mulps(set1ps(2), dotps(localRayDir.get128(), localRayOriginAsVec.get128()));
	const __m128 rv = set1ps(r);
	const __m128 c = subps(dotps(localRayOriginAsVec.get128(), localRayOriginAsVec.get128()), mulps(rv, rv));
	
	const float ar = a.m128_f32[0];
	const float br = b.m128_f32[0];
	const float cr = c.m128_f32[0];
	
	// Solve quadratic
	const float d = (subps(mulps(b,b), mulps(set1ps(4), mulps(a, c)))).m128_f32[0];
#else
	const float a = dot(localRayDir, localRayDir);
	const float b = 2.f * dot(localRayDir, localRayOriginAsVec);
	const float c = dot(localRayOriginAsVec, localRayOriginAsVec) - r*r;
	
	const float ar = a;
	const float br = b;
	const float cr = c;
	
	// Solve quadratic
	const float d = b*b - 4.f * a*c;
#endif

	if (d < 0)
		return false;
	
	const float sqrtD = sqrt(d);
	float q;
	if (br < 0)
		q = -0.5f * (br - sqrtD);
	else
		q = -0.5f * (br + sqrtD);

	float t0	= q / ar;
	float t1	= cr / q;
	if (t0 > t1)
		std::swap(t0, t1);

	if (t0 > ray.maxT || t1 < 0)
		return false;

	float hit = t0;
	if (t0 < 0)
	{
		hit = t1;
		if (hit > ray.maxT)
			return false;
	}
	ray.maxT = hit;

	// Now that we have a hit fill the intersection info structure
	const Point3 localHitP(localRayOrigin + ray.maxT * localRayDir);

	ii.P  = Point3((objectToWorld * localHitP).get128());
	ii.Ng = Vector3(localHitP) * invr;
	ii.Ng = Vector3((worldToObjectN * ii.Ng).get128());
    ii.N  = ii.Ng;

    ii.Cs = color;
    ii.Os = opacity;
	
	const float invPi = 1.f / 3.141592654f;
	ii.s = ::asinf(ii.Ng.getX()) * invPi + 0.5f;
	ii.t = ::asinf(ii.Ng.getY()) * invPi + 0.5f;

	return true;
}
// Emits the primitive-setup instructions for the depth (z) and fog (f) values.
//
// Nothing is emitted when neither m_en.z nor m_en.f is set.
//
// Non-sprite path: loads a vector from [edx + 16] (dscan.p according to the
// pseudo-code comment) and stores the products with xmm3 into m_env.d4.f/z and
// the products with xmm4..xmm7 into m_env.d[0..3].f/z.
// Sprite path: loads a vector from [ecx + 16] (vertices[0].p according to the
// pseudo-code comment) and stores converted values into m_env.p.f/z.
//
// NOTE(review): xmm3 and xmm4..xmm7 are only read here, never loaded. The
// pseudo-code comments suggest they hold 4.0f and m_shift[0..3] - confirm
// against the code that emits the prologue.
void GSSetupPrimCodeGenerator::Depth()
{
	if(!m_en.z && !m_en.f)
	{
		return;
	}

	if(!m_env.sel.sprite)
	{
		// GSVector4 t = dscan.p;

		movaps(xmm0, xmmword[edx + 16]);

		if(m_en.f)
		{
			// GSVector4 df = p.wwww();

			// xmm0 is kept intact here because the z path below still needs it
			movaps(xmm1, xmm0);
			shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));

			// m_env.d4.f = GSVector4i(df * 4.0f).xxzzlh();

			movaps(xmm2, xmm1);
			mulps(xmm2, xmm3);
			cvttps2dq(xmm2, xmm2);
			pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
			pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
			movdqa(xmmword[&m_env.d4.f], xmm2);

			// unrolled at code-generation time: one instruction group per i
			for(int i = 0; i < 4; i++)
			{
				// m_env.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();

				movaps(xmm2, xmm1);
				mulps(xmm2, Xmm(4 + i));
				cvttps2dq(xmm2, xmm2);
				pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
				pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
				movdqa(xmmword[&m_env.d[i].f], xmm2);
			}
		}

		if(m_en.z)
		{
			// GSVector4 dz = p.zzzz();

			shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));

			// m_env.d4.z = dz * 4.0f;

			movaps(xmm1, xmm0);
			mulps(xmm1, xmm3);
			movdqa(xmmword[&m_env.d4.z], xmm1);

			// unrolled at code-generation time: one instruction group per i
			for(int i = 0; i < 4; i++)
			{
				// m_env.d[i].z = dz * m_shift[i];

				movaps(xmm1, xmm0);
				mulps(xmm1, Xmm(4 + i));
				movdqa(xmmword[&m_env.d[i].z], xmm1);
			}
		}
	}
	else
	{
		// GSVector4 p = vertices[0].p;

		movaps(xmm0, xmmword[ecx + 16]);

		if(m_en.f)
		{
			// m_env.p.f = GSVector4i(p).zzzzh().zzzz();

			movaps(xmm1, xmm0);
			cvttps2dq(xmm1, xmm1);
			pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
			pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
			movdqa(xmmword[&m_env.p.f], xmm1);
		}

		if(m_en.z)
		{
			// GSVector4 z = p.zzzz();

			shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));

			if(m_env.sel.zoverflow)
			{
				// m_env.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());

				// the emitted code reads this constant through its address, hence the static storage
				static const float half = 0.5f;

				movss(xmm1, dword[&half]);
				shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
				mulps(xmm1, xmm0);
				cvttps2dq(xmm1, xmm1);
				pslld(xmm1, 1);

				// all ones shifted right by 31 leaves 0x00000001 in every lane
				cvttps2dq(xmm0, xmm0);
				pcmpeqd(xmm2, xmm2);
				psrld(xmm2, 31);
				pand(xmm0, xmm2);
				
				por(xmm0, xmm1);
			}
			else
			{
				// m_env.p.z = GSVector4i(z);

				cvttps2dq(xmm0, xmm0);
			}

			movdqa(xmmword[&m_env.p.z], xmm0);
		}
	}
}
// Emits the primitive-setup instructions for the colour values.
//
// Nothing is emitted when m_en.c is not set.
//
// m_env.sel.iip set: loads a vector from [edx] (dscan.c according to the
// pseudo-code comment) and stores the packed 16-bit products into m_env.d4.c
// and m_env.d[0..3].rb / m_env.d[0..3].ga.
// m_env.sel.iip clear: loads a vector from [ecx] (vertices[0].c according to
// the pseudo-code comment) and stores broadcast values into m_env.c.rb and
// m_env.c.ga.
//
// The iip path overwrites xmm3 as a scratch register, so any emitted code that
// still reads the previous xmm3 value has to come before this function's output.
//
// NOTE(review): xmm3 (before it is reused) and xmm4..xmm7 are only read here.
// The pseudo-code comments suggest they hold 4.0f and m_shift[0..3] - confirm
// against the code that emits the prologue.
void GSSetupPrimCodeGenerator::Color()
{
	if(!m_en.c)
	{
		return;
	}

	if(m_env.sel.iip)
	{
		// GSVector4 c = dscan.c;

		movaps(xmm0, xmmword[edx]);
		movaps(xmm1, xmm0);

		// m_env.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();

		movaps(xmm2, xmm0);
		mulps(xmm2, xmm3);
		cvttps2dq(xmm2, xmm2);
		pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
		packssdw(xmm2, xmm2);
		movdqa(xmmword[&m_env.d4.c], xmm2);

		// xmm3 is not needed anymore

		// GSVector4 dr = c.xxxx();
		// GSVector4 db = c.zzzz();

		shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
		shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));

		// unrolled at code-generation time: one instruction group per i
		for(int i = 0; i < 4; i++)
		{
			// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();

			movaps(xmm2, xmm0);
			mulps(xmm2, Xmm(4 + i));
			cvttps2dq(xmm2, xmm2);
			packssdw(xmm2, xmm2);

			// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();

			movaps(xmm3, xmm1);
			mulps(xmm3, Xmm(4 + i));
			cvttps2dq(xmm3, xmm3);
			packssdw(xmm3, xmm3);

			// m_env.d[i].rb = r.upl16(b);

			punpcklwd(xmm2, xmm3);
			movdqa(xmmword[&m_env.d[i].rb], xmm2);
		}

		// GSVector4 c = dscan.c;

		movaps(xmm0, xmmword[edx]); // not enough regs, have to reload it
		movaps(xmm1, xmm0);

		// GSVector4 dg = c.yyyy();
		// GSVector4 da = c.wwww();

		shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
		shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));

		// same sequence as the red/blue loop above, now for green/alpha
		for(int i = 0; i < 4; i++)
		{
			// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();

			movaps(xmm2, xmm0);
			mulps(xmm2, Xmm(4 + i));
			cvttps2dq(xmm2, xmm2);
			packssdw(xmm2, xmm2);

			// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();

			movaps(xmm3, xmm1);
			mulps(xmm3, Xmm(4 + i));
			cvttps2dq(xmm3, xmm3);
			packssdw(xmm3, xmm3);

			// m_env.d[i].ga = g.upl16(a);

			punpcklwd(xmm2, xmm3);
			movdqa(xmmword[&m_env.d[i].ga], xmm2);
		}
	}
	else
	{
		// GSVector4i c = GSVector4i(vertices[0].c);

		movaps(xmm0, xmmword[ecx]);
		cvttps2dq(xmm0, xmm0);

		// c = c.upl16(c.zwxy());

		movdqa(xmm1, xmm0);
		pshufd(xmm1, xmm1, _MM_SHUFFLE(1, 0, 3, 2));
		punpcklwd(xmm0, xmm1);

		// if(!tme) c = c.srl16(7);

		if(m_env.sel.tfx == TFX_NONE)
		{
			psrlw(xmm0, 7);
		}

		// m_env.c.rb = c.xxxx();
		// m_env.c.ga = c.zzzz();

		movdqa(xmm1, xmm0);
		pshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
		pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
		movdqa(xmmword[&m_env.c.rb], xmm0);
		movdqa(xmmword[&m_env.c.ga], xmm1);
	}
}
// Emits the primitive-setup instructions for the texture coordinates.
//
// Nothing is emitted when m_en.t is not set.
//
// Loads a vector from [edx + 32] (dscan.t according to the pseudo-code
// comment). With m_env.sel.fst set, the products are converted to integers and
// stored in m_env.d4.st and m_env.d[0..3].si/ti (two components). Otherwise
// they stay floating point and go to m_env.d4.stq and m_env.d[0..3].s/t/q
// (three components).
//
// NOTE(review): xmm3 and xmm4..xmm7 are only read here, never loaded. The
// pseudo-code comments suggest they hold 4.0f and m_shift[0..3] - confirm
// against the code that emits the prologue.
void GSSetupPrimCodeGenerator::Texture()
{
	if(!m_en.t)
	{
		return;
	}

	// GSVector4 t = dscan.t;

	movaps(xmm0, xmmword[edx + 32]);

	movaps(xmm1, xmm0);
	mulps(xmm1, xmm3);

	if(m_env.sel.fst)
	{
		// m_env.d4.st = GSVector4i(t * 4.0f);

		cvttps2dq(xmm1, xmm1);
		movdqa(xmmword[&m_env.d4.st], xmm1);
	}
	else
	{
		// m_env.d4.stq = t * 4.0f;

		movaps(xmmword[&m_env.d4.stq], xmm1);
	}

	// j selects the component to broadcast; k is 2 with fst set, 3 otherwise
	for(int j = 0, k = m_env.sel.fst ? 2 : 3; j < k; j++)
	{
		// GSVector4 ds = t.xxxx();
		// GSVector4 dt = t.yyyy();
		// GSVector4 dq = t.zzzz();

		movaps(xmm1, xmm0);
		shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));

		// unrolled at code-generation time: one instruction group per i
		for(int i = 0; i < 4; i++)
		{
			// GSVector4 v = ds/dt * m_shift[i];

			movaps(xmm2, xmm1);
			mulps(xmm2, Xmm(4 + i));

			if(m_env.sel.fst)
			{
				// m_env.d[i].si/ti = GSVector4i(v);

				cvttps2dq(xmm2, xmm2);

				// j only takes the values 0 and 1 on this path (k == 2)
				switch(j)
				{
				case 0: movdqa(xmmword[&m_env.d[i].si], xmm2); break;
				case 1: movdqa(xmmword[&m_env.d[i].ti], xmm2); break;
				}
			}
			else
			{
				// m_env.d[i].s/t/q = v;

				switch(j)
				{
				case 0: movaps(xmmword[&m_env.d[i].s], xmm2); break;
				case 1: movaps(xmmword[&m_env.d[i].t], xmm2); break;
				case 2: movaps(xmmword[&m_env.d[i].q], xmm2); break;
				}
			}
		}
	}
}
// Emits the depth-test instructions for the scanline loop.
//
// temp1, temp2 - scratch registers; only used on the non-sprite zoverflow path.
//
// Nothing is emitted when m_sel.zb is not set. Otherwise the emitted code:
//  - computes the z address in ebp from [esi + 4] + [edi + 4],
//  - for non-sprites, converts the float z in xmm0 to integers and, when
//    m_sel.zwrite is set, saves it in m_env.temp.zs,
//  - with m_sel.ztest set, reads the stored depth into xmm1 through ReadPixel,
//    compares it with xmm0 and ORs the failing lanes into xmm7.
// xmm4 is overwritten as scratch on the ztest path.
//
// NOTE(review): alltrue() is defined elsewhere; presumably it emits an early
// exit when every lane of the test mask is set - confirm.
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
{
    if(!m_sel.zb)
    {
        return;
    }

    // int za = fza_base.y + fza_offset->y;

    mov(ebp, dword[esi + 4]);
    add(ebp, dword[edi + 4]);

    // GSVector4i zs = zi;

    if(!m_sel.sprite)
    {
        if(m_sel.zoverflow)
        {
            // zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());

            // the emitted code reads this constant through its address, hence the static storage
            static float half = 0.5f;

            movss(temp1, dword[&half]);
            shufps(temp1, temp1, _MM_SHUFFLE(0, 0, 0, 0));
            mulps(temp1, xmm0);
            cvttps2dq(temp1, temp1);
            pslld(temp1, 1);

            // all ones shifted right by 31 leaves 0x00000001 in every lane
            cvttps2dq(xmm0, xmm0);
            pcmpeqd(temp2, temp2);
            psrld(temp2, 31);
            pand(xmm0, temp2);

            por(xmm0, temp1);
        }
        else
        {
            // zs = GSVector4i(z);

            cvttps2dq(xmm0, xmm0);
        }

        if(m_sel.zwrite)
        {
            movdqa(xmmword[&m_env.temp.zs], xmm0);
        }
    }

    if(m_sel.ztest)
    {
        ReadPixel(xmm1, ebp);

        // keep the unmasked value; it is saved before the masking shifts below
        if(m_sel.zwrite && m_sel.zpsm < 2)
        {
            movdqa(xmmword[&m_env.temp.zd], xmm1);
        }

        // zd &= 0xffffffff >> m_sel.zpsm * 8;

        if(m_sel.zpsm)
        {
            pslld(xmm1, m_sel.zpsm * 8);
            psrld(xmm1, m_sel.zpsm * 8);
        }

        if(m_sel.zoverflow || m_sel.zpsm == 0)
        {
            // subtracting 0x80000000 from both sides lets the signed pcmpgtd
            // below order the values as if they were unsigned

            // GSVector4i o = GSVector4i::x80000000();

            pcmpeqd(xmm4, xmm4);
            pslld(xmm4, 31);

            // GSVector4i zso = zs - o;

            psubd(xmm0, xmm4);

            // GSVector4i zdo = zd - o;

            psubd(xmm1, xmm4);
        }

        // only these two ztst values emit a comparison; any other value emits none
        switch(m_sel.ztst)
        {
        case ZTST_GEQUAL:
            // test |= zso < zdo; // ~(zso >= zdo)
            pcmpgtd(xmm1, xmm0);
            por(xmm7, xmm1);
            break;

        case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL
            // test |= zso <= zdo; // ~(zso > zdo)
            pcmpgtd(xmm0, xmm1);
            pcmpeqd(xmm4, xmm4);
            pxor(xmm0, xmm4);
            por(xmm7, xmm0);
            break;
        }

        alltrue();
    }
}
// Emits the per-iteration advance of the scanline loop.
//
// The emitted code:
//  - subtracts 4 from the step counter in ecx and adds 8 to the offset pointer in edi,
//  - adds the m_env.d4.* increments to the running values kept in m_env.temp.*
//    (z, f, s/t or s/t/q, rb/ga), depending on the m_sel flags,
//  - leaves the current values in registers: z in xmm0, f in xmm1,
//    s and t in xmm2 and xmm3 (already multiplied by the rcpps result on the
//    non-fst path), rb and ga in xmm5 and xmm6,
//  - reloads the test mask in xmm7 from the m_test table.
// edx, xmm4 and xmm7 are overwritten.
void GSDrawScanlineCodeGenerator::Step()
{
    // steps -= 4;

    sub(ecx, 4);

    // fza_offset++;

    add(edi, 8);

    if(!m_sel.sprite)
    {
        // z += m_env.d4.z;

        if(m_sel.zb)
        {
            movaps(xmm0, xmmword[&m_env.temp.z]);
            addps(xmm0, xmmword[&m_env.d4.z]);
            movaps(xmmword[&m_env.temp.z], xmm0);
        }

        // f = f.add16(m_env.d4.f);

        if(m_sel.fwrite && m_sel.fge)
        {
            movdqa(xmm1, xmmword[&m_env.temp.f]);
            paddw(xmm1, xmmword[&m_env.d4.f]);
            movdqa(xmmword[&m_env.temp.f], xmm1);
        }
    }
    else
    {
        // sprites do not step z; the stored m_env.p.z is just reloaded into xmm0
        if(m_sel.ztest)
        {
            movdqa(xmm0, xmmword[&m_env.p.z]);
        }
    }

    if(m_sel.fb)
    {
        if(m_sel.tfx != TFX_NONE)
        {
            if(m_sel.fst)
            {
                // GSVector4i st = m_env.d4.st;

                // si += st.xxxx();
                // if(!sprite) ti += st.yyyy();

                movdqa(xmm4, xmmword[&m_env.d4.st]);

                pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
                paddd(xmm2, xmmword[&m_env.temp.s]);
                movdqa(xmmword[&m_env.temp.s], xmm2);

                if(!m_sel.sprite)
                {
                    pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
                    paddd(xmm3, xmmword[&m_env.temp.t]);
                    movdqa(xmmword[&m_env.temp.t], xmm3);
                }
                else
                {
                    // t is not stepped for sprites; reload it unchanged
                    movdqa(xmm3, xmmword[&m_env.temp.t]);
                }
            }
            else
            {
                // GSVector4 stq = m_env.d4.stq;

                // s += stq.xxxx();
                // t += stq.yyyy();
                // q += stq.zzzz();

                movaps(xmm2, xmmword[&m_env.d4.stq]);
                movaps(xmm3, xmm2);
                movaps(xmm4, xmm2);

                shufps(xmm2, xmm2, _MM_SHUFFLE(0, 0, 0, 0));
                shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1));
                shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2));

                addps(xmm2, xmmword[&m_env.temp.s]);
                addps(xmm3, xmmword[&m_env.temp.t]);
                addps(xmm4, xmmword[&m_env.temp.q]);

                movaps(xmmword[&m_env.temp.s], xmm2);
                movaps(xmmword[&m_env.temp.t], xmm3);
                movaps(xmmword[&m_env.temp.q], xmm4);

                // the values stored above are s, t and q themselves; the
                // registers then receive s and t multiplied by the approximate 1/q
                rcpps(xmm4, xmm4);
                mulps(xmm2, xmm4);
                mulps(xmm3, xmm4);
            }
        }

        if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
        {
            if(m_sel.iip)
            {
                // GSVector4i c = m_env.d4.c;

                // rb = rb.add16(c.xxxx());
                // ga = ga.add16(c.yyyy());

                movdqa(xmm7, xmmword[&m_env.d4.c]);

                pshufd(xmm5, xmm7, _MM_SHUFFLE(0, 0, 0, 0));
                pshufd(xmm6, xmm7, _MM_SHUFFLE(1, 1, 1, 1));

                paddw(xmm5, xmmword[&m_env.temp.rb]);
                paddw(xmm6, xmmword[&m_env.temp.ga]);

                movdqa(xmmword[&m_env.temp.rb], xmm5);
                movdqa(xmmword[&m_env.temp.ga], xmm6);
            }
            else
            {
                // not interpolated: reload the stored m_env.c values when no texture function is used
                if(m_sel.tfx == TFX_NONE)
                {
                    movdqa(xmm5, xmmword[&m_env.c.rb]);
                    movdqa(xmm6, xmmword[&m_env.c.ga]);
                }
            }
        }
    }

    // test = m_test[7 + (steps & (steps >> 31))];

    // steps >> 31 is all ones only for a negative counter, so the AND yields
    // either 0 or the negative step count; the shift by 4 scales it to
    // 16-byte table entries
    mov(edx, ecx);
    sar(edx, 31);
    and(edx, ecx);
    shl(edx, 4);

    movdqa(xmm7, xmmword[edx + (size_t)&m_test[7]]);
}
// Emits the scanline prologue that runs before the pixel loop.
//
// params - base stack offset of the arguments; `top` is read from
//          [esp + params + 4] and the vertex pointer from [esp + params + 8].
//
// The emitted code expects `left` in edx and `right` in ecx (per the
// pseudo-code comments) and leaves:
//  - ecx  = remaining step count (right - aligned left - 4),
//  - xmm7 = initial test mask built from two m_test entries,
//  - esi / edi = row and column address pointers from m_env.fzbr / m_env.fzbc,
//  - edx  = &m_env.d[skip] and ebx = vertex pointer, when any interpolant needs them,
//  - starting values in m_env.temp.* and in registers: z in xmm0, f in xmm1,
//    s and t in xmm2 and xmm3, rb and ga in xmm5 and xmm6.
// eax and xmm4 are overwritten as scratch.
void GSDrawScanlineCodeGenerator::Init(int params)
{
    const int _top = params + 4;
    const int _v = params + 8;

    // int skip = left & 3;

    mov(ebx, edx);
    and(edx, 3);

    // left -= skip;

    sub(ebx, edx);

    // int steps = right - left - 4;

    sub(ecx, ebx);
    sub(ecx, 4);

    // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];

    // edx becomes skip * 16, the byte offset of a 16-byte m_test entry
    shl(edx, 4);

    movdqa(xmm7, xmmword[edx + (size_t)&m_test[0]]);

    mov(eax, ecx);
    sar(eax, 31);
    and(eax, ecx);
    shl(eax, 4);

    por(xmm7, xmmword[eax + (size_t)&m_test[7]]);

    // GSVector2i* fza_base = &m_env.fzbr[top];

    mov(esi, dword[esp + _top]);
    lea(esi, ptr[esi * 8]);
    add(esi, dword[&m_env.fzbr]);

    // GSVector2i* fza_offset = &m_env.fzbc[left >> 2];

    // ebx holds the aligned left; ebx * 2 equals (left >> 2) * 8 for a multiple of 4
    lea(edi, ptr[ebx * 2]);
    add(edi, dword[&m_env.fzbc]);

    // && binds tighter than ||: (non-sprite and (fog or z)) or (fb and (edge, texture or iip))
    if(!m_sel.sprite && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip))
    {
        // edx = &m_env.d[skip]

        // edx already holds skip * 16, so this second shift makes it skip * 256
        // NOTE(review): this implies each m_env.d element is 256 bytes - confirm against the struct
        shl(edx, 4);
        lea(edx, ptr[edx + (size_t)m_env.d]);

        // ebx = &v

        mov(ebx, dword[esp + _v]);
    }

    if(!m_sel.sprite)
    {
        if(m_sel.fwrite && m_sel.fge || m_sel.zb)
        {
            movaps(xmm0, xmmword[ebx + 16]); // v.p

            if(m_sel.fwrite && m_sel.fge)
            {
                // f = GSVector4i(vp).zzzzh().zzzz().add16(m_env.d[skip].f);

                cvttps2dq(xmm1, xmm0);
                pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
                pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
                paddw(xmm1, xmmword[edx + 16 * 6]);

                movdqa(xmmword[&m_env.temp.f], xmm1);
            }

            if(m_sel.zb)
            {
                // z = vp.zzzz() + m_env.d[skip].z;

                shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
                addps(xmm0, xmmword[edx]);

                movaps(xmmword[&m_env.temp.z], xmm0);
            }
        }
    }
    else
    {
        // sprites use the stored m_env.p.z as is
        if(m_sel.ztest)
        {
            movdqa(xmm0, xmmword[&m_env.p.z]);
        }
    }

    if(m_sel.fb)
    {
        if(m_sel.edge || m_sel.tfx != TFX_NONE)
        {
            movaps(xmm4, xmmword[ebx + 32]); // v.t
        }

        if(m_sel.edge)
        {
            // builds m_env.temp.cov from the vector loaded from v.t
            pshufhw(xmm3, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
            pshufd(xmm3, xmm3, _MM_SHUFFLE(3, 3, 3, 3));
            psrlw(xmm3, 9);

            movdqa(xmmword[&m_env.temp.cov], xmm3);
        }

        if(m_sel.tfx != TFX_NONE)
        {
            if(m_sel.fst)
            {
                // GSVector4i vti(vt);

                cvttps2dq(xmm4, xmm4);

                // si = vti.xxxx() + m_env.d[skip].si;
                // ti = vti.yyyy(); if(!sprite) ti += m_env.d[skip].ti;

                pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
                pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));

                paddd(xmm2, xmmword[edx + 16 * 7]);

                if(!m_sel.sprite)
                {
                    paddd(xmm3, xmmword[edx + 16 * 8]);
                }
                else
                {
                    if(m_sel.ltf)
                    {
                        // derives m_env.temp.vf from the t coordinate; xmm3 itself stays untouched
                        movdqa(xmm4, xmm3);
                        pshuflw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
                        pshufhw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
                        psrlw(xmm4, 1);
                        movdqa(xmmword[&m_env.temp.vf], xmm4);
                    }
                }

                movdqa(xmmword[&m_env.temp.s], xmm2);
                movdqa(xmmword[&m_env.temp.t], xmm3);
            }
            else
            {
                // s = vt.xxxx() + m_env.d[skip].s;
                // t = vt.yyyy() + m_env.d[skip].t;
                // q = vt.zzzz() + m_env.d[skip].q;

                movaps(xmm2, xmm4);
                movaps(xmm3, xmm4);

                shufps(xmm2, xmm2, _MM_SHUFFLE(0, 0, 0, 0));
                shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1));
                shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2));

                addps(xmm2, xmmword[edx + 16 * 1]);
                addps(xmm3, xmmword[edx + 16 * 2]);
                addps(xmm4, xmmword[edx + 16 * 3]);

                movaps(xmmword[&m_env.temp.s], xmm2);
                movaps(xmmword[&m_env.temp.t], xmm3);
                movaps(xmmword[&m_env.temp.q], xmm4);

                // the values stored above are s, t and q themselves; the
                // registers then receive s and t multiplied by the approximate 1/q
                rcpps(xmm4, xmm4);
                mulps(xmm2, xmm4);
                mulps(xmm3, xmm4);
            }
        }

        if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
        {
            if(m_sel.iip)
            {
                // GSVector4i vc = GSVector4i(v.c);

                cvttps2dq(xmm6, xmmword[ebx]); // v.c

                // vc = vc.upl16(vc.zwxy());

                pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 0, 3, 2));
                punpcklwd(xmm6, xmm5);

                // rb = vc.xxxx().add16(m_env.d[skip].rb);
                // ga = vc.zzzz().add16(m_env.d[skip].ga);

                pshufd(xmm5, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
                pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2));

                paddw(xmm5, xmmword[edx + 16 * 4]);
                paddw(xmm6, xmmword[edx + 16 * 5]);

                movdqa(xmmword[&m_env.temp.rb], xmm5);
                movdqa(xmmword[&m_env.temp.ga], xmm6);
            }
            else
            {
                // not interpolated: load the stored m_env.c values when no texture function is used
                if(m_sel.tfx == TFX_NONE)
                {
                    movdqa(xmm5, xmmword[&m_env.c.rb]);
                    movdqa(xmm6, xmmword[&m_env.c.ga]);
                }
            }
        }
    }
}
示例#10
0
// Dot product of two float arrays using SSE.
//
// eax - first input array (no alignment required; unaligned loads are used).
// edi - second input array (no alignment required).
// ecx - number of elements to multiply; zero or a negative count yields 0.
//
// The parameter names are the registers of the assembly routine this was
// translated from. Returns the sum of eax[i] * edi[i] for i in [0, ecx).
//
// Accumulation order matches the original routine: blocks of eight go into
// two independent 4-lane accumulators, an optional block of four goes into the
// first accumulator, leftover elements are added to lane 0, and the four lanes
// are summed at the end.
float TfirFilter::vec_inner_prod_sse(const float *eax, const float *edi, int ecx)
{
  // Explicitly zeroed accumulators, instead of xor-ing uninitialized locals.
  __m128 acc0 = _mm_setzero_ps();
  __m128 acc1 = _mm_setzero_ps();

  // Main loop: eight elements per iteration, four per accumulator.
  for (; ecx >= 8; ecx -= 8, eax += 8, edi += 8)
  {
    acc0 = _mm_add_ps(acc0, _mm_mul_ps(_mm_loadu_ps(edi), _mm_loadu_ps(eax)));
    acc1 = _mm_add_ps(acc1, _mm_mul_ps(_mm_loadu_ps(edi + 4), _mm_loadu_ps(eax + 4)));
  }

  acc0 = _mm_add_ps(acc0, acc1);

  // At most one remaining block of four.
  if (ecx >= 4)
  {
    acc0 = _mm_add_ps(acc0, _mm_mul_ps(_mm_loadu_ps(edi), _mm_loadu_ps(eax)));
    eax += 4;
    edi += 4;
    ecx -= 4;
  }

  // Up to three leftover elements, accumulated into lane 0 only.
  for (; ecx > 0; --ecx, ++eax, ++edi)
  {
    acc0 = _mm_add_ss(acc0, _mm_mul_ss(_mm_load_ss(edi), _mm_load_ss(eax)));
  }

  // Horizontal sum: lanes (0,1) += lanes (2,3), then lane 0 += lane 1.
  // 0x55 broadcasts lane 1, which is the shuffle this reduction needs.
  acc0 = _mm_add_ps(acc0, _mm_movehl_ps(acc0, acc0));
  acc0 = _mm_add_ss(acc0, _mm_shuffle_ps(acc0, acc0, 0x55));

  float sum;
  _mm_store_ss(&sum, acc0);
  return sum;
}