Beispiel #1
0
static void GF_FUNC_ALIGN VS_CC
proc_16bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride,
                uint8_t *d, const uint8_t *s, edge_t *eh, uint16_t plane_max)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    stride /= 2;
    bstride /= 2;

    uint16_t* p0 = (uint16_t *)buff + 8;
    uint16_t* p1 = p0 + bstride;
    uint16_t* p2 = p1 + bstride;
    uint16_t* p3 = p2 + bstride;
    uint16_t* p4 = p3 + bstride;
    uint16_t *orig = p0, *end = p4;

    line_copy16(p0, srcp + 2 * stride, width, 2);
    line_copy16(p1, srcp + stride, width, 2);
    line_copy16(p2, srcp, width, 2);
    srcp += stride;
    line_copy16(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128 alpha = _mm_set1_ps((float)0.96043387);
    __m128 beta = _mm_set1_ps((float)0.39782473);
    __m128i pmax = _mm_set1_epi32(0xFFFF);
    __m128i min = _mm_set1_epi16((int16_t)eh->min);
    __m128i max = _mm_set1_epi16((int16_t)eh->max);

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy16(p4, srcp, width, 2);
        uint16_t* posh[] = {p2 - 2, p2 - 1, p2 + 1, p2 + 2};
        uint16_t* posv[] = {p0, p1, p3, p4};

        for (int x = 0; x < width; x += 8) {
            __m128 sumx[2] = {(__m128)zero, (__m128)zero};
            __m128 sumy[2] = {(__m128)zero, (__m128)zero};

            for (int i = 0; i < 4; i++) {
                __m128 xmul = _mm_load_ps(ar_mulxf[i]);
                __m128i xmm0 = _mm_loadu_si128((__m128i *)(posh[i] + x));
                __m128i xmm1 = _mm_unpackhi_epi16(xmm0, zero);
                xmm0 = _mm_unpacklo_epi16(xmm0, zero);
                sumx[0] = _mm_add_ps(sumx[0], _mm_mul_ps(_mm_cvtepi32_ps(xmm0), xmul));
                sumx[1] = _mm_add_ps(sumx[1], _mm_mul_ps(_mm_cvtepi32_ps(xmm1), xmul));

                xmul = _mm_load_ps(ar_mulyf[i]);
                xmm0 = _mm_load_si128((__m128i *)(posv[i] + x));
                xmm1 = _mm_unpackhi_epi16(xmm0, zero);
                xmm0 = _mm_unpacklo_epi16(xmm0, zero);
                sumy[0] = _mm_add_ps(sumy[0], _mm_mul_ps(_mm_cvtepi32_ps(xmm0), xmul));
                sumy[1] = _mm_add_ps(sumy[1], _mm_mul_ps(_mm_cvtepi32_ps(xmm1), xmul));
            }

            __m128i out[2];
            for (int i = 0; i < 2; i++) {
                sumx[i] = mm_abs_ps(sumx[i]);
                sumy[i] = mm_abs_ps(sumy[i]);
                __m128 t0 = _mm_max_ps(sumx[i], sumy[i]);
                __m128 t1 = _mm_min_ps(sumx[i], sumy[i]);
                t0 = _mm_add_ps(_mm_mul_ps(alpha, t0), _mm_mul_ps(beta, t1));
                out[i] = _mm_srli_epi32(_mm_cvtps_epi32(t0), eh->rshift);
                out[i] = mm_min_epi32(out[i], pmax);
            }
            out[0] = mm_cast_epi32(out[0], out[1]);

            out[1] = MM_MIN_EPU16(out[0], max);
            out[1] = _mm_cmpeq_epi16(out[1], max);
            out[0] = _mm_or_si128(out[1], out[0]);

            out[1] = MM_MAX_EPU16(out[0], min);
            out[1] = _mm_cmpeq_epi16(out[1], min);
            out[0] = _mm_andnot_si128(out[1], out[0]);

            _mm_store_si128((__m128i *)(dstp + x), out[0]);
        }
        dstp += stride;
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
void
transform8_otherrgb_avx(ThreadInfo* t)
{
	RS_IMAGE16 *input = t->input;
	GdkPixbuf *output = t->output;
	RS_MATRIX3 *matrix = t->matrix;
	gint x,y;
	gint width;

	float mat_ps[4*4*3] __attribute__ ((aligned (16)));
	for (x = 0; x < 4; x++ ) {
		mat_ps[x] = matrix->coeff[0][0];
		mat_ps[x+4] = matrix->coeff[0][1];
		mat_ps[x+8] = matrix->coeff[0][2];
		mat_ps[12+x] = matrix->coeff[1][0];
		mat_ps[12+x+4] = matrix->coeff[1][1];
		mat_ps[12+x+8] = matrix->coeff[1][2];
		mat_ps[24+x] = matrix->coeff[2][0];
		mat_ps[24+x+4] = matrix->coeff[2][1];
		mat_ps[24+x+8] = matrix->coeff[2][2];
	}
	
	int start_x = t->start_x;
	/* Always have aligned input and output adress */
	if (start_x & 3)
		start_x = ((start_x) / 4) * 4;
	
	int complete_w = t->end_x - start_x;
	/* If width is not multiple of 4, check if we can extend it a bit */
	if (complete_w & 3)
	{
		if ((t->end_x+4) < input->w)
			complete_w = ((complete_w+3) / 4 * 4);
	}
	__m128 gamma = _mm_set1_ps(t->output_gamma);

	for(y=t->start_y ; y<t->end_y ; y++)
	{
		gushort *i = GET_PIXEL(input, start_x, y);
		guchar *o = GET_PIXBUF_PIXEL(output, start_x, y);
		gboolean aligned_write = !((guintptr)(o)&0xf);

		width = complete_w >> 2;

		while(width--)
		{
			/* Load and convert to float */
			__m128i zero = _mm_setzero_si128();
			__m128i in = _mm_load_si128((__m128i*)i); // Load two pixels
			__m128i in2 = _mm_load_si128((__m128i*)i+1); // Load two pixels
			_mm_prefetch(i + 64, _MM_HINT_NTA);
			__m128i p1 =_mm_unpacklo_epi16(in, zero);
			__m128i p2 =_mm_unpackhi_epi16(in, zero);
			__m128i p3 =_mm_unpacklo_epi16(in2, zero);
			__m128i p4 =_mm_unpackhi_epi16(in2, zero);
			__m128 p1f  = _mm_cvtepi32_ps(p1);
			__m128 p2f  = _mm_cvtepi32_ps(p2);
			__m128 p3f  = _mm_cvtepi32_ps(p3);
			__m128 p4f  = _mm_cvtepi32_ps(p4);
			
			/* Convert to planar */
			__m128 g1g0r1r0 = _mm_unpacklo_ps(p1f, p2f);
			__m128 b1b0 = _mm_unpackhi_ps(p1f, p2f);
			__m128 g3g2r3r2 = _mm_unpacklo_ps(p3f, p4f);
			__m128 b3b2 = _mm_unpackhi_ps(p3f, p4f);
			__m128 r = _mm_movelh_ps(g1g0r1r0, g3g2r3r2);
			__m128 g = _mm_movehl_ps(g3g2r3r2, g1g0r1r0);
			__m128 b = _mm_movelh_ps(b1b0, b3b2);

			/* Apply matrix to convert to sRGB */
			__m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
			__m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
			__m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);

			/* Normalize to 0->1 and clamp */
			__m128 normalize = _mm_load_ps(_normalize);
			__m128 max_val = _mm_load_ps(_ones_ps);
			__m128 min_val = _mm_setzero_ps();
			r = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, r2)));
			g = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, g2)));
			b = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, b2)));

			/* Apply Gamma */
			__m128 upscale = _mm_load_ps(_8bit);
			r = _mm_mul_ps(upscale, _mm_fastpow_ps(r, gamma));
			g = _mm_mul_ps(upscale, _mm_fastpow_ps(g, gamma));
			b = _mm_mul_ps(upscale, _mm_fastpow_ps(b, gamma));

			/* Convert to 8 bit unsigned  and interleave*/
			__m128i r_i = _mm_cvtps_epi32(r);
			__m128i g_i = _mm_cvtps_epi32(g);
			__m128i b_i = _mm_cvtps_epi32(b);
			
			r_i = _mm_packs_epi32(r_i, r_i);
			g_i = _mm_packs_epi32(g_i, g_i);
			b_i = _mm_packs_epi32(b_i, b_i);

			/* Set alpha value to 255 and store */
			__m128i alpha_mask = _mm_load_si128((__m128i*)_alpha_mask);
			__m128i rg_i = _mm_unpacklo_epi16(r_i, g_i);
			__m128i bb_i = _mm_unpacklo_epi16(b_i, b_i);
			p1 = _mm_unpacklo_epi32(rg_i, bb_i);
			p2 = _mm_unpackhi_epi32(rg_i, bb_i);
	
			p1 = _mm_or_si128(alpha_mask, _mm_packus_epi16(p1, p2));

			if (aligned_write)
				_mm_store_si128((__m128i*)o, p1);
			else
				_mm_storeu_si128((__m128i*)o, p1);

			i += 16;
			o += 16;
		}
		/* Process remaining pixels */
		width = complete_w & 3;
		while(width--)
		{
			__m128i zero = _mm_setzero_si128();
			__m128i in = _mm_loadl_epi64((__m128i*)i); // Load two pixels
			__m128i p1 =_mm_unpacklo_epi16(in, zero);
			__m128 p1f  = _mm_cvtepi32_ps(p1);

			/* Splat r,g,b */
			__m128 r =  _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(0,0,0,0));
			__m128 g =  _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(1,1,1,1));
			__m128 b =  _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(2,2,2,2));

			__m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
			__m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
			__m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);

			r = _mm_unpacklo_ps(r2, g2);	// GG RR GG RR
			r = _mm_movelh_ps(r, b2);		// BB BB GG RR

			__m128 normalize = _mm_load_ps(_normalize);
			__m128 max_val = _mm_load_ps(_ones_ps);
			__m128 min_val = _mm_setzero_ps();
			r = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, r)));
			__m128 upscale = _mm_load_ps(_8bit);
			r = _mm_mul_ps(upscale, _mm_fastpow_ps(r, gamma));
			
			/* Convert to 8 bit unsigned */
			zero = _mm_setzero_si128();
			__m128i r_i = _mm_cvtps_epi32(r);
			/* To 16 bit signed */
			r_i = _mm_packs_epi32(r_i, zero);
			/* To 8 bit unsigned - set alpha channel*/
			__m128i alpha_mask = _mm_load_si128((__m128i*)_alpha_mask);
			r_i = _mm_or_si128(alpha_mask, _mm_packus_epi16(r_i, zero));
			*(int*)o = _mm_cvtsi128_si32(r_i);
			i+=4;
			o+=4;
		}
	}
}
static void GF_FUNC_ALIGN VS_CC
proc_16bit_sse2(convolution_t *ch, uint8_t *buff, int bstride, int width,
                int height, int stride, uint8_t *d, const uint8_t *s)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    stride /= 2;
    bstride /= 2;

    uint16_t *p0 = (uint16_t *)buff + 8;
    uint16_t *p1 = p0 + bstride;
    uint16_t *p2 = p1 + bstride;
    uint16_t *p3 = p2 + bstride;
    uint16_t *p4 = p3 + bstride;
    uint16_t *orig = p0, *end = p4;

    line_copy16(p0, srcp + 2 * stride, width, 2);
    line_copy16(p1, srcp + stride, width, 2);
    line_copy16(p2, srcp, width, 2);
    srcp += stride;
    line_copy16(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128 rdiv = _mm_set1_ps((float)ch->rdiv);
    __m128 bias = _mm_set1_ps((float)ch->bias);
    __m128i max = _mm_set1_epi32(0xFFFF);
    __m128 matrix[25];
    for (int i = 0; i < 25; i++) {
        matrix[i] = _mm_set1_ps((float)ch->m[i]);
    }

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy16(p4, srcp, width, 2);
        uint16_t *array[] = {
            p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2,
            p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2,
            p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2,
            p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2,
            p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2
        };
        
        for (int x = 0; x < width; x += 8) {
            __m128 sum[2] = {(__m128)zero, (__m128)zero};

            for (int i = 0; i < 25; i++) {
                __m128i xmm0 = _mm_loadu_si128((__m128i *)(array[i] + x));
                __m128 xmm1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(xmm0, zero));
                __m128 xmm2 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(xmm0, zero));
                xmm1 = _mm_mul_ps(xmm1, matrix[i]);
                xmm2 = _mm_mul_ps(xmm2, matrix[i]);
                sum[0] = _mm_add_ps(sum[0], xmm1);
                sum[1] = _mm_add_ps(sum[1], xmm2);
            }

            __m128i sumi[2];
            for (int i = 0; i < 2; i++) {
                sum[i]  = _mm_mul_ps(sum[i], rdiv);
                sum[i]  = _mm_add_ps(sum[i], bias);
                if (!ch->saturate) {
                    sum[i] = mm_abs_ps(sum[i]);
                }
                sumi[i] = _mm_cvtps_epi32(sum[i]);
                sumi[i] = mm_min_epi32(sumi[i], max);
                __m128i mask = _mm_cmpgt_epi32(sumi[i], zero);
                sumi[i] = _mm_and_si128(sumi[i], mask);
            }

            sumi[0] = mm_cast_epi32(sumi[0], sumi[1]);

            _mm_store_si128((__m128i *)(dstp + x), sumi[0]);
        }
        dstp += stride;
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
Beispiel #4
0
void YUV422ToRGB888(const XnUInt8* pYUVImage, XnUInt8* pRGBAImage, XnUInt32 nYUVSize, XnUInt32 nRGBSize)
{
	const XnUInt8* pYUVLast = pYUVImage + nYUVSize - 8;
	XnUInt8* pRGBLast = pRGBAImage + nRGBSize - 16;

	const __m128 minus128 = _mm_set_ps1(-128);
	const __m128 plus113983 = _mm_set_ps1(1.13983F);
	const __m128 minus039466 = _mm_set_ps1(-0.39466F);
	const __m128 minus058060 = _mm_set_ps1(-0.58060F);
	const __m128 plus203211 = _mm_set_ps1(2.03211F);
	const __m128 zero = _mm_set_ps1(0);
	const __m128 plus255 = _mm_set_ps1(255);

	// define YUV floats
	__m128 y;
	__m128 u;
	__m128 v;

	__m128 temp;

	// define RGB floats
	__m128 r;
	__m128 g;
	__m128 b;

	// define RGB integers
	__m128i iR;
	__m128i iG;
	__m128i iB;

	XnUInt32* piR = (XnUInt32*)&iR;
	XnUInt32* piG = (XnUInt32*)&iG;
	XnUInt32* piB = (XnUInt32*)&iB;

	while (pYUVImage <= pYUVLast && pRGBAImage <= pRGBLast)
	{
		// process 4 pixels at once (values should be ordered backwards)
		y = _mm_set_ps(pYUVImage[YUV422_Y2 + YUV422_BPP], pYUVImage[YUV422_Y1 + YUV422_BPP], pYUVImage[YUV422_Y2], pYUVImage[YUV422_Y1]);
		u = _mm_set_ps(pYUVImage[YUV422_U + YUV422_BPP],  pYUVImage[YUV422_U + YUV422_BPP],  pYUVImage[YUV422_U],  pYUVImage[YUV422_U]);
		v = _mm_set_ps(pYUVImage[YUV422_V + YUV422_BPP],  pYUVImage[YUV422_V + YUV422_BPP],  pYUVImage[YUV422_V],  pYUVImage[YUV422_V]);

		u = _mm_add_ps(u, minus128); // u -= 128
		v = _mm_add_ps(v, minus128); // v -= 128

		/*

		http://en.wikipedia.org/wiki/YUV

		From YUV to RGB:
		R =     Y + 1.13983 V
		G =     Y - 0.39466 U - 0.58060 V
		B =     Y + 2.03211 U

		*/ 

		temp = _mm_mul_ps(plus113983, v);
		r = _mm_add_ps(y, temp);

		temp = _mm_mul_ps(minus039466, u);
		g = _mm_add_ps(y, temp);
		temp = _mm_mul_ps(minus058060, v);
		g = _mm_add_ps(g, temp);

		temp = _mm_mul_ps(plus203211, u);
		b = _mm_add_ps(y, temp);

		// make sure no value is smaller than 0
		r = _mm_max_ps(r, zero);
		g = _mm_max_ps(g, zero);
		b = _mm_max_ps(b, zero);

		// make sure no value is bigger than 255
		r = _mm_min_ps(r, plus255);
		g = _mm_min_ps(g, plus255);
		b = _mm_min_ps(b, plus255);

		// convert floats to int16 (there is no conversion to uint8, just to int8).
		iR = _mm_cvtps_epi32(r);
		iG = _mm_cvtps_epi32(g);
		iB = _mm_cvtps_epi32(b);

		// extract the 4 pixels RGB values.
		// because we made sure values are between 0 and 255, we can just take the lower byte
		// of each INT16
		pRGBAImage[0] = piR[0];
		pRGBAImage[1] = piG[0];
		pRGBAImage[2] = piB[0];
		pRGBAImage[3] = 255;

		pRGBAImage[4] = piR[1];
		pRGBAImage[5] = piG[1];
		pRGBAImage[6] = piB[1];
		pRGBAImage[7] = 255;

		pRGBAImage[8] = piR[2];
		pRGBAImage[9] = piG[2];
		pRGBAImage[10] = piB[2];
		pRGBAImage[11] = 255;

		pRGBAImage[12] = piR[3];
		pRGBAImage[13] = piG[3];
		pRGBAImage[14] = piB[3];
		pRGBAImage[15] = 255;

		// advance the streams
		pYUVImage += 8;
		pRGBAImage += 16;
	}
}
void Permutohedral::init ( const MatrixXf & feature )
{
    // Compute the lattice coordinates for each feature [there is going to be a lot of magic here
    N_ = feature.cols();
    d_ = feature.rows();
    HashTable hash_table( d_, N_/**(d_+1)*/ );

    const int blocksize = sizeof(__m128) / sizeof(float);
    const __m128 invdplus1   = _mm_set1_ps( 1.0f / (d_+1) );
    const __m128 dplus1      = _mm_set1_ps( d_+1 );
    const __m128 Zero        = _mm_set1_ps( 0 );
    const __m128 One         = _mm_set1_ps( 1 );

    // Allocate the class memory
    offset_.resize( (d_+1)*(N_+16) );
    std::fill( offset_.begin(), offset_.end(), 0 );
    barycentric_.resize( (d_+1)*(N_+16) );
    std::fill( barycentric_.begin(), barycentric_.end(), 0 );
    rank_.resize( (d_+1)*(N_+16) );

    // Allocate the local memory
    __m128 * scale_factor = (__m128*) _mm_malloc( (d_  )*sizeof(__m128) , 16 );
    __m128 * f            = (__m128*) _mm_malloc( (d_  )*sizeof(__m128) , 16 );
    __m128 * elevated     = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128) , 16 );
    __m128 * rem0         = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128) , 16 );
    __m128 * rank         = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128), 16 );
    float * barycentric = new float[(d_+2)*blocksize];
    short * canonical = new short[(d_+1)*(d_+1)];
    short * key = new short[d_+1];

    // Compute the canonical simplex
    for( int i=0; i<=d_; i++ ){
        for( int j=0; j<=d_-i; j++ )
            canonical[i*(d_+1)+j] = i;
        for( int j=d_-i+1; j<=d_; j++ )
            canonical[i*(d_+1)+j] = i - (d_+1);
    }

    // Expected standard deviation of our filter (p.6 in [Adams etal 2010])
    float inv_std_dev = sqrt(2.0 / 3.0)*(d_+1);
    // Compute the diagonal part of E (p.5 in [Adams etal 2010])
    for( int i=0; i<d_; i++ )
        scale_factor[i] = _mm_set1_ps( 1.0 / sqrt( (i+2)*(i+1) ) * inv_std_dev );

    // Setup the SSE rounding
#ifndef __SSE4_1__
    const unsigned int old_rounding = _mm_getcsr();
    _mm_setcsr( (old_rounding&~_MM_ROUND_MASK) | _MM_ROUND_NEAREST );
#endif

    // Compute the simplex each feature lies in
    for( int k=0; k<N_; k+=blocksize ){
        // Load the feature from memory
        float * ff = (float*)f;
        for( int j=0; j<d_; j++ )
            for( int i=0; i<blocksize; i++ )
                ff[ j*blocksize + i ] = k+i < N_ ? feature(j,k+i) : 0.0;

        // Elevate the feature ( y = Ep, see p.5 in [Adams etal 2010])

        // sm contains the sum of 1..n of our faeture vector
        __m128 sm = Zero;
        for( int j=d_; j>0; j-- ){
            __m128 cf = f[j-1]*scale_factor[j-1];
            elevated[j] = sm - _mm_set1_ps(j)*cf;
            sm += cf;
        }
        elevated[0] = sm;

        // Find the closest 0-colored simplex through rounding
        __m128 sum = Zero;
        for( int i=0; i<=d_; i++ ){
            __m128 v = invdplus1 * elevated[i];
#ifdef __SSE4_1__
            v = _mm_round_ps( v, _MM_FROUND_TO_NEAREST_INT );
#else
            v = _mm_cvtepi32_ps( _mm_cvtps_epi32( v ) );
#endif
            rem0[i] = v*dplus1;
            sum += v;
        }

        // Find the simplex we are in and store it in rank (where rank describes what position coorinate i has in the sorted order of the features values)
        for( int i=0; i<=d_; i++ )
            rank[i] = Zero;
        for( int i=0; i<d_; i++ ){
            __m128 di = elevated[i] - rem0[i];
            for( int j=i+1; j<=d_; j++ ){
                __m128 dj = elevated[j] - rem0[j];
                __m128 c = _mm_and_ps( One, _mm_cmplt_ps( di, dj ) );
                rank[i] += c;
                rank[j] += One-c;
            }
        }

        // If the point doesn't lie on the plane (sum != 0) bring it back
        for( int i=0; i<=d_; i++ ){
            rank[i] += sum;
            __m128 add = _mm_and_ps( dplus1, _mm_cmplt_ps( rank[i], Zero ) );
            __m128 sub = _mm_and_ps( dplus1, _mm_cmpge_ps( rank[i], dplus1 ) );
            rank[i] += add-sub;
            rem0[i] += add-sub;
        }

        // Compute the barycentric coordinates (p.10 in [Adams etal 2010])
        for( int i=0; i<(d_+2)*blocksize; i++ )
            barycentric[ i ] = 0;
        for( int i=0; i<=d_; i++ ){
            __m128 v = (elevated[i] - rem0[i])*invdplus1;

            // Didn't figure out how to SSE this
            float * fv = (float*)&v;
            float * frank = (float*)&rank[i];
            for( int j=0; j<blocksize; j++ ){
                int p = d_-frank[j];
                barycentric[j*(d_+2)+p  ] += fv[j];
                barycentric[j*(d_+2)+p+1] -= fv[j];
            }
        }

        // The rest is not SSE'd
        for( int j=0; j<blocksize; j++ ){
            // Wrap around
            barycentric[j*(d_+2)+0]+= 1 + barycentric[j*(d_+2)+d_+1];

            float * frank = (float*)rank;
            float * frem0 = (float*)rem0;
            // Compute all vertices and their offset
            for( int remainder=0; remainder<=d_; remainder++ ){
                for( int i=0; i<d_; i++ ){
                    key[i] = frem0[i*blocksize+j] + canonical[ remainder*(d_+1) + (int)frank[i*blocksize+j] ];
                }
                offset_[ (j+k)*(d_+1)+remainder ] = hash_table.find( key, true );
                rank_[ (j+k)*(d_+1)+remainder ] = frank[remainder*blocksize+j];
                barycentric_[ (j+k)*(d_+1)+remainder ] = barycentric[ j*(d_+2)+remainder ];
            }
        }
    }
    _mm_free( scale_factor );
    _mm_free( f );
    _mm_free( elevated );
    _mm_free( rem0 );
    _mm_free( rank );
    delete [] barycentric;
    delete [] canonical;
    delete [] key;

    // Reset the SSE rounding
#ifndef __SSE4_1__
    _mm_setcsr( old_rounding );
#endif

    // This is normally fast enough so no SSE needed here
    // Find the Neighbors of each lattice point

    // Get the number of vertices in the lattice
    M_ = hash_table.size();

    // Create the neighborhood structure
    blur_neighbors_.resize( (d_+1)*M_ );

    short * n1 = new short[d_+1];
    short * n2 = new short[d_+1];

    // For each of d+1 axes,
    for( int j = 0; j <= d_; j++ ){
        for( int i=0; i<M_; i++ ){
            const short * key = hash_table.getKey( i );
            for( int k=0; k<d_; k++ ){
                n1[k] = key[k] - 1;
                n2[k] = key[k] + 1;
            }
            n1[j] = key[j] + d_;
            n2[j] = key[j] - d_;

            blur_neighbors_[j*M_+i].n1 = hash_table.find( n1 );
            blur_neighbors_[j*M_+i].n2 = hash_table.find( n2 );
        }
    }
    delete[] n1;
    delete[] n2;
}
Beispiel #6
0
static void SinCos(const float rad, float &sin, float &cos) // #include <emmintrin.h>, #include <xmmintrin.h>
{
	const __m128 _ps_fopi = _mm_set1_ps(4.0f / pi);

	const __m128 _ps_0p5 = _mm_set1_ps(0.5f);
	const __m128 _ps_1   = _mm_set1_ps(1.0f);

	const __m128 _ps_dp1 = _mm_set1_ps(-0.7851562f);
	const __m128 _ps_dp2 = _mm_set1_ps(-2.4187564849853515625e-4f);
	const __m128 _ps_dp3 = _mm_set1_ps(-3.77489497744594108e-8f);

	const __m128 _ps_sincof_p0 = _mm_set1_ps(2.443315711809948e-5f);
	const __m128 _ps_sincof_p1 = _mm_set1_ps(8.3321608736e-3f);
	const __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611e-1f);
	const __m128 _ps_coscof_p0 = _mm_set1_ps(2.443315711809948e-5f);
	const __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765e-3f);
	const __m128 _ps_coscof_p2 = _mm_set1_ps(4.166664568298827e-2f);

	const __m128i _pi32_1  = _mm_set1_epi32(1);
	const __m128i _pi32_i1 = _mm_set1_epi32(~1);
	const __m128i _pi32_2  = _mm_set1_epi32(2);
	const __m128i _pi32_4  = _mm_set1_epi32(4);

	const __m128 _mask_sign_raw = _mm_castsi128_ps(_mm_set1_epi32( 0x80000000));
	const __m128 _mask_sign_inv = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));

	__m128  mm1,  mm2;
	__m128i mmi0, mmi2, mmi4;

	__m128 x, y, z;
	__m128 y1,  y2;

	__m128 a = _mm_set1_ps(rad);

	x = _mm_and_ps(a, _mask_sign_inv);
	y = _mm_mul_ps(x, _ps_fopi);

	mmi2 = _mm_cvtps_epi32(y);
	mmi2 = _mm_add_epi32(mmi2, _pi32_1);
	mmi2 = _mm_and_si128(mmi2, _pi32_i1);
	y    = _mm_cvtepi32_ps(mmi2);

	mmi4 = mmi2;

	mmi0 = _mm_and_si128(mmi2, _pi32_4);
	mmi0 = _mm_slli_epi32(mmi0, 29);
	__m128 swap_sign_bit_sin = _mm_castsi128_ps(mmi0);

	mmi2 = _mm_and_si128(mmi2, _pi32_2);
	mmi2 = _mm_cmpeq_epi32(mmi2, _mm_setzero_si128());
	__m128 poly_mask = _mm_castsi128_ps(mmi2);

	x = _mm_add_ps(x, _mm_mul_ps(y, _ps_dp1));
	x = _mm_add_ps(x, _mm_mul_ps(y, _ps_dp2));
	x = _mm_add_ps(x, _mm_mul_ps(y, _ps_dp3));

	mmi4 = _mm_sub_epi32(mmi4, _pi32_2);
	mmi4 = _mm_andnot_si128(mmi4, _pi32_4);
	mmi4 = _mm_slli_epi32(mmi4, 29);

	__m128 sign_bit_cos = _mm_castsi128_ps(mmi4);
	__m128 sign_bit_sin = _mm_xor_ps(_mm_and_ps(a, _mask_sign_raw), swap_sign_bit_sin);


	z  = _mm_mul_ps(x, x);

	y1 = _mm_mul_ps(_ps_coscof_p0, z);
	y1 = _mm_add_ps(y1, _ps_coscof_p1);
	y1 = _mm_mul_ps(y1, z);
	y1 = _mm_add_ps(y1, _ps_coscof_p2);
	y1 = _mm_mul_ps(y1, z);
	y1 = _mm_mul_ps(y1, z);
	y1 = _mm_sub_ps(y1, _mm_mul_ps(z, _ps_0p5));
	y1 = _mm_add_ps(y1, _ps_1);

	y2 = _mm_mul_ps(_ps_sincof_p0, z);
	y2 = _mm_add_ps(y2, _ps_sincof_p1);
	y2 = _mm_mul_ps(y2, z);
	y2 = _mm_add_ps(y2, _ps_sincof_p2);
	y2 = _mm_mul_ps(y2, z);
	y2 = _mm_mul_ps(y2, x);
	y2 = _mm_add_ps(y2, x);


	__m128 sin1y = _mm_andnot_ps(poly_mask, y1);
	__m128 sin2y = _mm_and_ps(poly_mask, y2);


	mm1 = _mm_add_ps(sin1y, sin2y);
	mm2 = _mm_add_ps(_mm_sub_ps(y1, sin1y), _mm_sub_ps(y2, sin2y));

	sin = _mm_cvtss_f32(_mm_xor_ps(mm1, sign_bit_sin));
	cos = _mm_cvtss_f32(_mm_xor_ps(mm2, sign_bit_cos));
}
Beispiel #7
0
bool CResizeEngine::horizontalFilter(CDIBSection *src, uint src_height,
                                     CDIBSection *dst, uint dst_yoffset, uint dst_height,
                                     ILongTimeRunCallback *pCallback) {
	assert(src->getBitCounts() == dst->getBitCounts());
	int bitcount = src->getBitCounts();
	assert((int)src_height <= src->getHeight());
	assert(src_height >= dst_height);
	uint dst_ymax = dst_yoffset + dst_height;
	assert((int)dst_ymax <= dst->getHeight());
	uint src_width = src->getWidth();
	uint dst_width = dst->getWidth();

	if (dst_width == src_width) {

		uint8 *src_bits = src->getData();
		uint8 *dst_bits = dst->getLine(dst_yoffset);
		assert(src_bits && dst_bits);

		uint height = min(dst_height, src_height);
		memcpy(dst_bits, src_bits, height * dst->getStride());

	} else if (!m_pFilter) { // fast (COLORONCOLOR)
		double ratio_w = (double)src_width / (double)dst_width;
		uint bytespp = bitcount / 8;

		for (uint y = dst_yoffset, sy = 0; y < dst_ymax; ++ y, ++ sy) {
			uint8 *dst_data = (uint8 *)dst->getLine(y);
			uint8 *src_line = (uint8 *)src->getLine(sy);

			for (uint x = 0; x < dst_width; ++ x) {
				uint sx = (uint)(x * ratio_w + 0.5);
				if (sx >= src_width) {
					sx = src_width - 1;
				}

				uint8 *src_data = src_line + sx * bytespp;
				for (uint i = 0; i < bytespp; ++ i) {
					*dst_data ++ = *src_data ++;
				}
			}
		}

	} else { // use m_pFilter
		uint index; // pixel index
		CWeightsTable weightsTable(m_pFilter, dst_width, src_width);
#ifdef USE_SSE
		__m128i value, t;
		__m128 a, b, c, v05 = _mm_set_ps1(0.5);
#elif (defined(USE_SSE2))
		__m128i value, t;
		__m128d a, b, c, v05 = _mm_set1_pd(0.5);
#endif

		uint bytespp = src->getBitCounts() / 8;
		assert(bytespp == 3 || bytespp == 4);
		for (uint dsty = dst_yoffset, srcy = 0; dsty < dst_ymax; ++ dsty, ++ srcy) {
			// test for stop
			if (srcy % 32 == 0) {
				if (pCallback && pCallback->shouldStop()) {
					return false;
				}
			}

			uint8 *src_bits = src->getLine(srcy);
			uint8 *dst_bits = dst->getLine(dsty);

			for(uint x = 0; x < dst_width; ++ x) {
				int iLeft = weightsTable.getLeftBoundary(x);
				int iRight = weightsTable.getRightBoundary(x);
				index = iLeft * bytespp;
#ifdef USE_SSE
				__m128 v = _mm_set_ps1(0.0);
				_mm_prefetch((const char *)src_bits + index, _MM_HINT_T0);
#elif defined(USE_SSE2)
				__m128d v1 = _mm_set1_pd(0.0);
				__m128d v2 = _mm_set1_pd(0.0);
#elif defined(USE_FLOAT)
				float value[4] = {0, 0, 0, 0};
#else
				double value[4] = {0, 0, 0, 0}; // 4 = 32bpp max
#endif
				for(int i = iLeft; i <= iRight; ++ i) {
#ifdef USE_SSE
					float weight = (float)weightsTable.getWeight(x, i - iLeft);

					a = _mm_set_ps1(weight);
					if (bytespp == 3) {
						t = _mm_set_epi32(0, src_bits[index + 2], src_bits[index + 1], src_bits[index]);
					} else {
						t = _mm_set_epi32(src_bits[index + 3], src_bits[index + 2], src_bits[index + 1], src_bits[index]);
					}
					b = _mm_cvtepi32_ps(t);
					c = _mm_mul_ps(a, b);
					v = _mm_add_ps(v, c);
					index += bytespp;
#elif defined(USE_SSE2)
					double weight = weightsTable.getWeight(x, i-iLeft);

					a = _mm_set1_pd(weight);
					t = _mm_set_epi32(0, 0, src_bits[index + 1], src_bits[index]);
					b = _mm_cvtepi32_pd(t);
					c = _mm_mul_pd(a, b);
					v1 = _mm_add_pd(v1, c);

					t = _mm_set_epi32(0, 0, bytespp == 3 ? 0 : src_bits[index + 3], src_bits[index + 2]);
					b = _mm_cvtepi32_pd(t);
					c = _mm_mul_pd(a, b);
					v2 = _mm_add_pd(v2, c);
					index += bytespp;
#elif defined(USE_FLOAT)
					float weight = (float)weightsTable.getWeight(x, i-iLeft);

					for (uint j = 0; j < bytespp; ++ j) {
						value[j] += (weight * (float)src_bits[index ++]); 
					}
#else
					double weight = weightsTable.getWeight(x, i-iLeft);

					for (uint j = 0; j < bytespp; ++ j) {
						value[j] += (weight * (double)src_bits[index ++]); 
					}
#endif
				} 

#ifdef USE_SSE
				v = _mm_add_ps(v, v05);
				value = _mm_cvtps_epi32(v);
				dst_bits[0] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[0]), (int)255);
				dst_bits[1] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[1]), (int)255);
				dst_bits[2] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[2]), (int)255);
				if (bytespp == 4) {
					dst_bits[3] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[3]), (int)255);
				}
#elif defined (USE_SSE2)
				v1 = _mm_add_pd(v1, v05);
				v2 = _mm_add_pd(v2, v05);
				value = _mm_cvtpd_epi32(v1);
				dst_bits[0] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[0]), (int)255);
				dst_bits[1] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[1]), (int)255);
				value = _mm_cvtpd_epi32(v2);
				dst_bits[2] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[0]), (int)255);
				if (bytespp == 4) {
					dst_bits[3] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[1]), (int)255);
				}
#else
				for (uint j = 0; j < bytespp; ++ j) {
					dst_bits[j] = (unsigned char)MIN(MAX((int)0, (int)(value[j] + 0.5)), (int)255);
				}
#endif

				dst_bits += bytespp;
			}
		}
	}
	return true;
}
Beispiel #8
0
bool CResizeEngine::verticalFilter(CDIBSection *src, CDIBSection *dst, ILongTimeRunCallback *pCallback) {
	assert(src->getBitCounts() == dst->getBitCounts());
	int bitcount = src->getBitCounts();
	uint src_width = src->getWidth();
	uint src_height = src->getHeight();
	uint dst_width = dst->getWidth();
	uint dst_height = dst->getHeight();
	assert(src_width == dst_width);
	src_width = src_width;
	if (src_height == dst_height) {

		unsigned char *src_bits = (unsigned char *)src->getData();
		unsigned char *dst_bits = (unsigned char *)dst->getData();
		assert(src_bits && dst_bits);

		memcpy(dst_bits, src_bits, dst_height * dst->getStride());

	} else if (!m_pFilter) { // fast (COLOR ON COLOR)

		double ratio_h = (double)src_height / (double)dst_height;

		uint bytespp = bitcount / 8;
		for (uint y = 0; y < dst_height; ++ y) {
			uint sy = (uint)(y * ratio_h + 0.5);
			if (sy >= src_height) {
				sy = src_height - 1;
			}
			uint8 *dst_data = (uint8 *)dst->getLine(y);
			uint8 *src_line = (uint8 *)src->getLine(sy);

			for (uint x = 0; x < dst_width; ++ x) {
				uint8 *src_data = src_line + x * bytespp;
				for (uint i = 0; i < bytespp; ++ i) {
					*dst_data ++ = *src_data ++;
				}
			}
		}

	} else {
#ifdef USE_SSE
		__m128i value, t;
		__m128 a, b, c, v05 = _mm_set_ps1(0.5);
#elif (defined(USE_SSE2))
		__m128i value, t;
		__m128d a, b, c, v05 = _mm_set1_pd(0.5);
#endif
		uint index; // pixel index
		CWeightsTable weightsTable(m_pFilter, dst_height, src_height);

		uint bytespp = src->getBitCounts() / 8;
		assert(bytespp == 3 || bytespp == 4);

		unsigned src_pitch = src->getStride();
		unsigned dst_pitch = dst->getStride();

		for(uint x = 0; x < dst_width; ++ x) {
			// test for stop
			if (x % 16 == 0) {
				if (pCallback && pCallback->shouldStop()) {
					return false;
				}
			}
			index = x * bytespp;

			unsigned char *dst_bits = (unsigned char *)dst->getData();
			dst_bits += index;

			for(uint y = 0; y < dst_height; ++ y) {
#ifdef USE_SSE
				__m128 v = _mm_set_ps1(0.0);
#elif defined (USE_SSE2)
				__m128d v1 = _mm_set1_pd(0.0);
				__m128d v2 = _mm_set1_pd(0.0);
#elif defined (USE_FLOAT)
				float value[4] = {0, 0, 0, 0};
#else
				double value[4] = {0, 0, 0, 0}; // 4 = 32bpp max
#endif
				int iLeft = weightsTable.getLeftBoundary(y);
				int iRight = weightsTable.getRightBoundary(y);

				uint8 *src_bits = src->getLine(iLeft);
				src_bits += index;

				for(int i = iLeft; i <= iRight; ++ i) {
#ifdef USE_SSE
					float weight = (float)weightsTable.getWeight(y, i - iLeft);
					a = _mm_set_ps1(weight);
					if (bytespp == 3) {
						t = _mm_set_epi32(0, src_bits[2], src_bits[1], src_bits[0]);
					} else {
						t = _mm_set_epi32(src_bits[3], src_bits[2], src_bits[1], src_bits[0]);
					}

					b = _mm_cvtepi32_ps(t);
					c = _mm_mul_ps(a, b);
					v = _mm_add_ps(v, c);
#elif defined(USE_SSE2)
					double weight = weightsTable.getWeight(y, i - iLeft);

					a = _mm_set1_pd(weight);
					t = _mm_set_epi32(0, 0, src_bits[1], src_bits[0]);
					b = _mm_cvtepi32_pd(t);
					c = _mm_mul_pd(a, b);
					v1 = _mm_add_pd(v1, c);

					t = _mm_set_epi32(0, 0, bytespp == 3 ? 0 : src_bits[3], src_bits[2]);
					b = _mm_cvtepi32_pd(t);
					c = _mm_mul_pd(a, b);
					v2 = _mm_add_pd(v2, c);
#elif defined (USE_FLOAT)
					float weight = (float)weightsTable.getWeight(y, i - iLeft);							
					for (uint j = 0; j < bytespp; ++ j) {
						value[j] += (weight * (float)src_bits[j]);
					}
#else
					double weight = weightsTable.getWeight(y, i - iLeft);							
					for (uint j = 0; j < bytespp; ++ j) {
						value[j] += (weight * (double)src_bits[j]);
					}
#endif

					src_bits += src_pitch;
				}

				// clamp and place result in destination pixel
#ifdef USE_SSE
				v = _mm_add_ps(v, v05);
				value = _mm_cvtps_epi32(v);
// 				__m128i flag = _mm_cmpgt_epi32(value, _mm_set1_epi32(0));
// 				value = _mm_and_si128(value, flag);
// 				dst_bits[0] = (unsigned char)MIN(255, value.m128i_i32[0]);
// 				dst_bits[1] = (unsigned char)MIN(255, value.m128i_i32[1]);
// 				dst_bits[2] = (unsigned char)MIN(255, value.m128i_i32[2]);
// 				if (bytespp == 4) {
// 					dst_bits[3] = (unsigned char)MIN(255, value.m128i_i32[3]);
// 				}
				dst_bits[0] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[0]), (int)255);
				dst_bits[1] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[1]), (int)255);
				dst_bits[2] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[2]), (int)255);
				if (bytespp == 4) {
					dst_bits[3] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[3]), (int)255);
				}
#elif defined (USE_SSE2)
				v1 = _mm_add_pd(v1, v05);
				v2 = _mm_add_pd(v2, v05);
				value = _mm_cvtpd_epi32(v1);
 				dst_bits[0] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[0]), (int)255);
				dst_bits[1] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[1]), (int)255);
				value = _mm_cvtpd_epi32(v2);
				dst_bits[2] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[0]), (int)255);
				if (bytespp == 4) {
					dst_bits[3] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[1]), (int)255);
				}
#else
				for (unsigned j = 0; j < bytespp; ++ j) {
					dst_bits[j] = (unsigned char)MIN(MAX((int)0, (int)(value[j] + 0.5)), (int)255);
				}
#endif

				dst_bits += dst_pitch;
			}
		}
	}
	return true;
}
Beispiel #9
0
INLINE __m128 shade(BilinearSamplePos const& bsp, const SWR_TRIANGLE_DESC & work, WideVector<BilinearSamplePos::NUM_ATTRIBUTES, __m128> const& pAttrs, BYTE* pBuffer, BYTE*, UINT*)
{
	TextureView *pTxv = (TextureView*)work.pTextureViews[KNOB_NUMBER_OF_TEXTURE_VIEWS + 0];
	Sampler *pSmp = (Sampler*)work.pSamplers[0];

	TexCoord tcidx;
	tcidx.U = get<4>(pAttrs);
	tcidx.V = get<5>(pAttrs);
	UINT mips[] = {0,0,0,0};
	WideColor color;
	SampleSimplePointRGBAF32(*pTxv, *pSmp, tcidx, mips, color);

	// modulate
	color.R = _mm_mul_ps(color.R, get<0>(pAttrs));
	color.G = _mm_mul_ps(color.G, get<1>(pAttrs));
	color.B = _mm_mul_ps(color.B, get<2>(pAttrs));
	color.A = _mm_mul_ps(color.A, get<3>(pAttrs));

	// convert float to unorm
	__m128i r = vFloatToUnorm( color.R );
	__m128i g = vFloatToUnorm( color.G );
	__m128i b = vFloatToUnorm( color.B );
	__m128i a = vFloatToUnorm( color.A );

	// pack
	__m128i vPixel = b;
	vPixel = _mm_or_si128(vPixel, _mm_slli_epi32(g, 8));
	vPixel = _mm_or_si128(vPixel, _mm_slli_epi32(r, 16));
	vPixel = _mm_or_si128(vPixel, _mm_slli_epi32(a, 24));

	// blend with GL_ONE and GL_ONE
	if (bsp.sFactor == GL_ONE && bsp.dFactor == GL_ONE)
	{ 
		__m128i vColorBuffer = _mm_load_si128((const __m128i*)pBuffer);
		vPixel = _mm_adds_epu8(vPixel, vColorBuffer);
	}
	
	if (bsp.sFactor == GL_SRC_ALPHA && bsp.dFactor == GL_ONE_MINUS_SRC_ALPHA)
	{
		const __m128i SHUF_ALPHA = _mm_set_epi32(0x8080800f, 0x8080800b, 0x80808007, 0x80808003);
		const __m128i SHUF_RED = _mm_set_epi32(0x8080800e, 0x8080800a, 0x80808006, 0x80808002);
		const __m128i SHUF_GREEN = _mm_set_epi32(0x8080800d, 0x80808009, 0x80808005, 0x80808001);
		const __m128i SHUF_BLUE = _mm_set_epi32(0x8080800c, 0x80808008, 0x80808004, 0x80808000);

		// mul by src_alpha
		__m128 vSrcRedF = _mm_mul_ps(color.R, color.A);
		__m128 vSrcGreenF = _mm_mul_ps(color.G, color.A);
		__m128 vSrcBlueF = _mm_mul_ps(color.B, color.A);

		// convert to int
		__m128i vSrcRed = vFloatToUnorm(vSrcRedF);
		__m128i vSrcGreen = vFloatToUnorm(vSrcGreenF);
		__m128i vSrcBlue = vFloatToUnorm(vSrcBlueF);
		__m128i vSrcAlpha = vFloatToUnorm(color.A);

		// pack
		__m128i vSrcPixel = vSrcBlue;
		vSrcPixel = _mm_or_si128(vSrcPixel, _mm_slli_epi32(vSrcGreen, 8));
		vSrcPixel = _mm_or_si128(vSrcPixel, _mm_slli_epi32(vSrcRed, 16));
		vSrcPixel = _mm_or_si128(vSrcPixel, _mm_slli_epi32(vSrcAlpha, 24));

		// shuffle dst R,G,B,A
		__m128i vColorBuffer = _mm_load_si128((const __m128i*)pBuffer);

		__m128i vDstAlpha = _mm_shuffle_epi8(vColorBuffer, SHUF_ALPHA);
		__m128i vDstRed = _mm_shuffle_epi8(vColorBuffer, SHUF_RED);
		__m128i vDstGreen = _mm_shuffle_epi8(vColorBuffer, SHUF_GREEN);
		__m128i vDstBlue = _mm_shuffle_epi8(vColorBuffer, SHUF_BLUE);

		// convert to float
		__m128 vDstAlphaF = _mm_cvtepi32_ps(vDstAlpha);
		__m128 vDstRedF = _mm_cvtepi32_ps(vDstRed);
		__m128 vDstGreenF = _mm_cvtepi32_ps(vDstGreen);
		__m128 vDstBlueF = _mm_cvtepi32_ps(vDstBlue);

		// mul by 1-src_alpha
		__m128 vOneMinusSrcAlphaF = _mm_sub_ps(_mm_set1_ps(1.0f), color.A);
		
		vDstAlphaF = _mm_mul_ps(vDstAlphaF, vOneMinusSrcAlphaF);
		vDstRedF = _mm_mul_ps(vDstRedF, vOneMinusSrcAlphaF);
		vDstGreenF = _mm_mul_ps(vDstGreenF, vOneMinusSrcAlphaF);
		vDstBlueF = _mm_mul_ps(vDstBlueF, vOneMinusSrcAlphaF);

		// convert to int
		vDstAlpha = _mm_cvtps_epi32(vDstAlphaF);
		vDstRed = _mm_cvtps_epi32(vDstRedF);
		vDstGreen = _mm_cvtps_epi32(vDstGreenF);
		vDstBlue = _mm_cvtps_epi32(vDstBlueF);

		// pack
		__m128i vDstPixel = vDstBlue;
		vDstPixel = _mm_or_si128(vDstPixel, _mm_slli_epi32(vDstGreen, 8));
		vDstPixel = _mm_or_si128(vDstPixel, _mm_slli_epi32(vDstRed, 16));
		vDstPixel = _mm_or_si128(vDstPixel, _mm_slli_epi32(vDstAlpha, 24));

		// final rgba = min(src + dst,255)
		vPixel = _mm_adds_epu8(vSrcPixel, vDstPixel);
	}

	return _mm_castsi128_ps(vPixel);
}
Beispiel #10
0
// --------------------------------------------------------------
vuint32 mandelbrot_SIMD_F32(vfloat32 a, vfloat32 b, int max_iter)
// --------------------------------------------------------------
{
    // version avec test de sortie en float

    vuint32   iter  = _mm_set1_epi32(0);
    vfloat32  fiter = _mm_set_ps(0,0,0,0);

    vfloat32 x,y,t,t2,zero,un,deux,quatre;
    // COMPLETER ICI
    int test,i = 0;
    // initialisation des variables
    x      = _mm_set_ps(0,0,0,0);
    y      = _mm_set_ps(0,0,0,0);
    deux   = _mm_set_ps(2,2,2,2);
    quatre = _mm_set_ps(4,4,4,4);
    un     = _mm_set_ps(1,1,1,1);
    zero   = _mm_set_ps(0,0,0,0);

    // iteration zero
    t  = _mm_mul_ps(x, x);
    t2 = _mm_mul_ps(y, y);

    y  = _mm_mul_ps(x,y);
    y  = _mm_mul_ps(y,deux);
    y  = _mm_add_ps(y,b);

    x = _mm_sub_ps(t,t2);
    x = _mm_add_ps(x,a);

    // calcul
    while(i<max_iter && _mm_movemask_ps(t) != 15) {


        t  = _mm_mul_ps(x, x);
        t2 = _mm_mul_ps(y, y);

        y  = _mm_mul_ps(_mm_mul_ps(x,y),deux);
        y  = _mm_add_ps(y,b);

        x = _mm_sub_ps(t,t2);
        x = _mm_add_ps(x,a);

        t2 = _mm_add_ps(t,t2);

        t2 = _mm_cmple_ps(t2,quatre);

        t = _mm_blendv_ps(zero,un,t2);

        fiter = _mm_add_ps(fiter,t);

        t = _mm_cmpeq_ps(t, zero);
        //display_vfloat32(t,"%f\t","T :: ");
        //printf(" MASK::%d \n",_mm_movemask_ps(t));

        i+=1;
    }


    iter = _mm_cvtps_epi32(fiter);

    return iter;
}
//-----------------------------------------------------------------------------------------
// Rasterize the occludee AABB and depth test it against the CPU rasterized depth buffer
// If any of the rasterized AABB pixels passes the depth test exit early and mark the occludee
// as visible. If all rasterized AABB pixels are occluded then the occludee is culled
//-----------------------------------------------------------------------------------------
bool TransformedAABBoxAVX::RasterizeAndDepthTestAABBox(UINT *pRenderTargetPixels, const __m128 pXformedPos[], UINT idx)
{
	// Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster)
	// Denormal are zero (DAZ) is bit 6 and Flush to zero (FZ) is bit 15. 
	// so to enable the two to have to set bits 6 and 15 which 1000 0000 0100 0000 = 0x8040
	_mm_setcsr( _mm_getcsr() | 0x8040 );

	__m256i colOffset = _mm256_setr_epi32(0, 1, 2, 3, 0, 1, 2, 3);
	__m256i rowOffset = _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1);
	float* pDepthBuffer = (float*)pRenderTargetPixels;
	
	// Rasterize the AABB triangles 4 at a time
	for(UINT i = 0; i < AABB_TRIANGLES; i += SSE)
	{
		vFloat4 xformedPos[3];
		Gather(xformedPos, i, pXformedPos, idx);

		// use fixed-point only for X and Y.  Avoid work for Z and W.
        __m128i fxPtX[3], fxPtY[3];
		for(int m = 0; m < 3; m++)
		{
			fxPtX[m] = _mm_cvtps_epi32(xformedPos[m].X);
			fxPtY[m] = _mm_cvtps_epi32(xformedPos[m].Y);
		}

		// Fab(x, y) =     Ax       +       By     +      C              = 0
		// Fab(x, y) = (ya - yb)x   +   (xb - xa)y + (xa * yb - xb * ya) = 0
		// Compute A = (ya - yb) for the 3 line segments that make up each triangle
		__m128i A0 = _mm_sub_epi32(fxPtY[1], fxPtY[2]);
		__m128i A1 = _mm_sub_epi32(fxPtY[2], fxPtY[0]);
		__m128i A2 = _mm_sub_epi32(fxPtY[0], fxPtY[1]);

		// Compute B = (xb - xa) for the 3 line segments that make up each triangle
		__m128i B0 = _mm_sub_epi32(fxPtX[2], fxPtX[1]);
		__m128i B1 = _mm_sub_epi32(fxPtX[0], fxPtX[2]);
		__m128i B2 = _mm_sub_epi32(fxPtX[1], fxPtX[0]);

		// Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
		__m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[1], fxPtY[2]), _mm_mullo_epi32(fxPtX[2], fxPtY[1]));
		__m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[2], fxPtY[0]), _mm_mullo_epi32(fxPtX[0], fxPtY[2]));
		__m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[0], fxPtY[1]), _mm_mullo_epi32(fxPtX[1], fxPtY[0]));

		// Compute triangle area
		__m128i triArea = _mm_mullo_epi32(B2, A1);
		triArea = _mm_sub_epi32(triArea, _mm_mullo_epi32(B1, A2));
		__m128 oneOverTriArea = _mm_rcp_ps(_mm_cvtepi32_ps(triArea));

		__m128 Z[3];
		Z[0] = xformedPos[0].Z;
		Z[1] = _mm_mul_ps(_mm_sub_ps(xformedPos[1].Z, Z[0]), oneOverTriArea);
		Z[2] = _mm_mul_ps(_mm_sub_ps(xformedPos[2].Z, Z[0]), oneOverTriArea);
		
		// Use bounding box traversal strategy to determine which pixels to rasterize 
		//__m128i startX = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~1));
		__m128i startX = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~3));
		__m128i endX = HelperSSE::Min(HelperSSE::Max(HelperSSE::Max(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(SCREENW - 1));

		__m128i startY = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~1));
		__m128i endY = HelperSSE::Min(HelperSSE::Max(HelperSSE::Max(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(SCREENH - 1));

		// Now we have 4 triangles set up.  Rasterize them each individually.
        for(int lane=0; lane < SSE; lane++)
        {
			// Skip triangle if area is zero 
			if(triArea.m128i_i32[lane] <= 0)
			{
				continue;
			}

			// Extract this triangle's properties from the SIMD versions
			__m256 zz[3];
			for (int vv = 0; vv < 3; vv++)
			{
				zz[vv] = _mm256_set1_ps(Z[vv].m128_f32[lane]);
			}

			int startXx = startX.m128i_i32[lane];
			int endXx = endX.m128i_i32[lane];
			int startYy = startY.m128i_i32[lane];
			int endYy = endY.m128i_i32[lane];

			__m256i aa0 = _mm256_set1_epi32(A0.m128i_i32[lane]);
			__m256i aa1 = _mm256_set1_epi32(A1.m128i_i32[lane]);
			__m256i aa2 = _mm256_set1_epi32(A2.m128i_i32[lane]);

			__m256i bb0 = _mm256_set1_epi32(B0.m128i_i32[lane]);
			__m256i bb1 = _mm256_set1_epi32(B1.m128i_i32[lane]);
			__m256i bb2 = _mm256_set1_epi32(B2.m128i_i32[lane]);

			__m256i aa0Inc = _mm256_slli_epi32(aa0, 2);
			__m256i aa1Inc = _mm256_slli_epi32(aa1, 2);
			__m256i aa2Inc = _mm256_slli_epi32(aa2, 2);

			__m256i bb0Inc = _mm256_slli_epi32(bb0, 1);
			__m256i bb1Inc = _mm256_slli_epi32(bb1, 1);
			__m256i bb2Inc = _mm256_slli_epi32(bb2, 1);

			__m256i row, col;

			// Traverse pixels in 2x4 blocks and store 2x4 pixel quad depths contiguously in memory ==> 2*X
			// This method provides better performance
			int	rowIdx = (startYy * SCREENW + 2 * startXx);

			col = _mm256_add_epi32(colOffset, _mm256_set1_epi32(startXx));
			__m256i aa0Col = _mm256_mullo_epi32(aa0, col);
			__m256i aa1Col = _mm256_mullo_epi32(aa1, col);
			__m256i aa2Col = _mm256_mullo_epi32(aa2, col);

			row = _mm256_add_epi32(rowOffset, _mm256_set1_epi32(startYy));
			__m256i bb0Row = _mm256_add_epi32(_mm256_mullo_epi32(bb0, row), _mm256_set1_epi32(C0.m128i_i32[lane]));
			__m256i bb1Row = _mm256_add_epi32(_mm256_mullo_epi32(bb1, row), _mm256_set1_epi32(C1.m128i_i32[lane]));
			__m256i bb2Row = _mm256_add_epi32(_mm256_mullo_epi32(bb2, row), _mm256_set1_epi32(C2.m128i_i32[lane]));

			__m256i sum0Row = _mm256_add_epi32(aa0Col, bb0Row);
			__m256i sum1Row = _mm256_add_epi32(aa1Col, bb1Row);
			__m256i sum2Row = _mm256_add_epi32(aa2Col, bb2Row);

			__m256 zx = _mm256_mul_ps(_mm256_cvtepi32_ps(aa1Inc), zz[1]);
			zx = _mm256_add_ps(zx, _mm256_mul_ps(_mm256_cvtepi32_ps(aa2Inc), zz[2]));

			// Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
			for (int r = startYy; r < endYy; r += 2,
				rowIdx += 2 * SCREENW,
				sum0Row = _mm256_add_epi32(sum0Row, bb0Inc),
				sum1Row = _mm256_add_epi32(sum1Row, bb1Inc),
				sum2Row = _mm256_add_epi32(sum2Row, bb2Inc))
			{
				// Compute barycentric coordinates 
				int index = rowIdx;
				__m256i alpha = sum0Row;
				__m256i beta = sum1Row;
				__m256i gama = sum2Row;

				//Compute barycentric-interpolated depth
				__m256 depth = zz[0];
				depth = _mm256_add_ps(depth, _mm256_mul_ps(_mm256_cvtepi32_ps(beta), zz[1]));
				depth = _mm256_add_ps(depth, _mm256_mul_ps(_mm256_cvtepi32_ps(gama), zz[2]));
				__m256i anyOut = _mm256_setzero_si256();

				for (int c = startXx; c < endXx; c += 4,
					index += 8,
					alpha = _mm256_add_epi32(alpha, aa0Inc),
					beta = _mm256_add_epi32(beta, aa1Inc),
					gama = _mm256_add_epi32(gama, aa2Inc),
					depth = _mm256_add_ps(depth, zx))
				{
					//Test Pixel inside triangle
					__m256i mask = _mm256_or_si256(_mm256_or_si256(alpha, beta), gama);

					__m256 previousDepthValue = _mm256_loadu_ps(&pDepthBuffer[index]);
					__m256 depthMask = _mm256_cmp_ps(depth, previousDepthValue, 0x1D);
					__m256i finalMask = _mm256_andnot_si256(mask, _mm256_castps_si256(depthMask));
					anyOut = _mm256_or_si256(anyOut, finalMask);
				}//for each column	

				if (!_mm256_testz_si256(anyOut, _mm256_set1_epi32(0x80000000)))
				{
					return true; //early exit
				}
			}// for each row
		}// for each triangle
	}// for each set of SIMD# triangles

	return false;
}
//----------------------------------------------------------------
// Transforms the AABB vertices to screen space once every frame
// Also performs a coarse depth pre-test
//----------------------------------------------------------------
PreTestResult TransformedAABBoxAVX::TransformAndPreTestAABBox(__m128 xformedPos[], const __m128 cumulativeMatrix[4], const float *pDepthSummary)
{
	// w ends up being garbage, but it doesn't matter - we ignore it anyway.
	__m128 vCenter = _mm_loadu_ps(&mBBCenter.x);
	__m128 vHalf   = _mm_loadu_ps(&mBBHalf.x);

	__m128 vMin    = _mm_sub_ps(vCenter, vHalf);
	__m128 vMax    = _mm_add_ps(vCenter, vHalf);

	// transforms
	__m128 xRow[2], yRow[2], zRow[2];
	xRow[0] = _mm_shuffle_ps(vMin, vMin, 0x00) * cumulativeMatrix[0];
	xRow[1] = _mm_shuffle_ps(vMax, vMax, 0x00) * cumulativeMatrix[0];
	yRow[0] = _mm_shuffle_ps(vMin, vMin, 0x55) * cumulativeMatrix[1];
	yRow[1] = _mm_shuffle_ps(vMax, vMax, 0x55) * cumulativeMatrix[1];
	zRow[0] = _mm_shuffle_ps(vMin, vMin, 0xaa) * cumulativeMatrix[2];
	zRow[1] = _mm_shuffle_ps(vMax, vMax, 0xaa) * cumulativeMatrix[2];

	__m128 zAllIn = _mm_castsi128_ps(_mm_set1_epi32(~0));
	__m128 screenMin = _mm_set1_ps(FLT_MAX);
	__m128 screenMax = _mm_set1_ps(-FLT_MAX);

	for(UINT i = 0; i < AABB_VERTICES; i++)
	{
		// Transform the vertex
		__m128 vert = cumulativeMatrix[3];
		vert += xRow[sBBxInd[i]];
		vert += yRow[sBByInd[i]];
		vert += zRow[sBBzInd[i]];

		// We have inverted z; z is in front of near plane iff z <= w.
		__m128 vertZ = _mm_shuffle_ps(vert, vert, 0xaa); // vert.zzzz
		__m128 vertW = _mm_shuffle_ps(vert, vert, 0xff); // vert.wwww
		__m128 zIn = _mm_cmple_ps(vertZ, vertW);
		zAllIn = _mm_and_ps(zAllIn, zIn);

		// project
		xformedPos[i] = _mm_div_ps(vert, vertW);
		
	    // update bounds
	    screenMin = _mm_min_ps(screenMin, xformedPos[i]);
	    screenMax = _mm_max_ps(screenMax, xformedPos[i]);
	}

	// if any of the verts are z-clipped, we (conservatively) say the box is in
	if(_mm_movemask_ps(zAllIn) != 0xf)
		return ePT_VISIBLE;

	// Clip against screen bounds
	screenMin = _mm_max_ps(screenMin, _mm_setr_ps(0.0f, 0.0f, 0.0f, -FLT_MAX));
	screenMax = _mm_min_ps(screenMax, _mm_setr_ps((float) (SCREENW - 1), (float) (SCREENH - 1), 1.0f, FLT_MAX));

	// Quick rejection test
	if(_mm_movemask_ps(_mm_cmplt_ps(screenMax, screenMin)))
		return ePT_INVISIBLE;

	// Prepare integer bounds
	__m128 minMaxXY = _mm_shuffle_ps(screenMin, screenMax, 0x44); // minX,minY,maxX,maxY
	__m128i minMaxXYi = _mm_cvtps_epi32(minMaxXY);
	__m128i minMaxXYis = _mm_srai_epi32(minMaxXYi, 3);

	__m128 maxZ = _mm_shuffle_ps(screenMax, screenMax, 0xaa);

	// Traverse all 8x8 blocks covered by 2d screen-space BBox;
	// if we know for sure that this box is behind the geometry we know is there,
	// we can stop.
	int rX0 = minMaxXYis.m128i_i32[0];
	int rY0 = minMaxXYis.m128i_i32[1];
	int rX1 = minMaxXYis.m128i_i32[2];
	int rY1 = minMaxXYis.m128i_i32[3];

	__m128 anyCloser = _mm_setzero_ps();
	for(int by = rY0; by <= rY1; by++)
	{
		const float *srcRow = pDepthSummary + by * (SCREENW/BLOCK_SIZE);

		// If for any 8x8 block, maxZ is not less than (=behind) summarized
		// min Z, box might be visible.
		for(int bx = rX0; bx <= rX1; bx++)
		{
			anyCloser = _mm_or_ps(anyCloser, _mm_cmpnlt_ss(maxZ, _mm_load_ss(&srcRow[bx])));
		}

		if(_mm_movemask_ps(anyCloser))
		{
			return ePT_UNSURE; // okay, box might be in
		}
	}

	// If we get here, we know for sure that the box is fully behind the stuff in the
	// depth buffer.
	return ePT_INVISIBLE;
}
Beispiel #13
0
void sINLINE RNMarchingCubesBase<T>::func(const sVector31 &v,typename T::FieldType &pot,const funcinfo &fi)
{
  __m128 vx = _mm_load_ps1(&v.x);
  __m128 vy = _mm_load_ps1(&v.y);
  __m128 vz = _mm_load_ps1(&v.z);
  __m128 po = _mm_setzero_ps();           // p
  __m128 nx = _mm_setzero_ps();
  __m128 ny = _mm_setzero_ps();
  __m128 nz = _mm_setzero_ps();
  __m128 akkur = _mm_setzero_ps();
  __m128 akkug = _mm_setzero_ps();
  __m128 akkub = _mm_setzero_ps();
  __m128 akkua = _mm_setzero_ps();
  __m128 s255 = _mm_set_ps1(255.0f);
  
  sBool good = 0;

  for(sInt i=0;i<fi.pn4;i++)
  {
    const T::SimdType *part = fi.parts4 + i;

    __m128 dx = _mm_sub_ps(vx,part->x);
    __m128 dy = _mm_sub_ps(vy,part->y);
    __m128 dz = _mm_sub_ps(vz,part->z);
    __m128 ddx = _mm_mul_ps(dx,dx);
    __m128 ddy = _mm_mul_ps(dy,dy);
    __m128 ddz = _mm_mul_ps(dz,dz);
    __m128 pp = _mm_add_ps(_mm_add_ps(ddx,ddy),ddz);

    if(_mm_movemask_ps(_mm_cmple_ps(pp,fi.treshf4))!=0)
    {
      __m128 pp2 = _mm_sub_ps(_mm_div_ps(fi.one,pp),fi.tresh4);
      __m128 pp3 = _mm_max_ps(pp2,_mm_setzero_ps());
      po = _mm_add_ps(po,pp3);                  // p = p+pp;
      __m128 pp4 = _mm_mul_ps(pp3,pp3);         // pp*pp
      nx = _mm_add_ps(nx,_mm_mul_ps(pp4,dx));   // n += d*(pp*pp)
      ny = _mm_add_ps(ny,_mm_mul_ps(pp4,dy));
      nz = _mm_add_ps(nz,_mm_mul_ps(pp4,dz));
      if(T::Color==1)
      {
        akkur = _mm_add_ps(akkur,_mm_mul_ps(pp3,part->cr));
        akkug = _mm_add_ps(akkug,_mm_mul_ps(pp3,part->cg));
        akkub = _mm_add_ps(akkub,_mm_mul_ps(pp3,part->cb));
        good = 1;
      }
    }
  }

  sF32 p = 0;
  sVector30 n;
  
  _MM_TRANSPOSE4_PS(po,nx,ny,nz);
  __m128 r = _mm_add_ps(_mm_add_ps(_mm_add_ps(nx,ny),nz),po);
  n.x = r.m128_f32[1];
  n.y = r.m128_f32[2];
  n.z = r.m128_f32[3];
  p = r.m128_f32[0];

  if(p==0)
    n.Init(0,0,0);
  else
    n.UnitFast();
  pot.x = n.x;
  pot.y = n.y;
  pot.z = n.z;
  pot.w = p-fi.iso;
  if(T::Color)
  {
    if(good)
    {
      r = _mm_mul_ss(s255,_mm_rcp_ss(r));
  //    r = _mm_rcp_ss(r);
      _MM_TRANSPOSE4_PS(akkub,akkug,akkur,akkua);
      __m128 r2 = _mm_add_ps(_mm_add_ps(_mm_add_ps(akkur,akkug),akkub),akkua);

      r2 = _mm_mul_ps(r2,_mm_shuffle_ps(r,r,0x00));
      __m128i r3 = _mm_cvtps_epi32(r2);
      r3 = _mm_packs_epi32(r3,r3);
      __m128i r4 = _mm_packus_epi16(r3,r3);
      pot.c = r4.m128i_u32[0]|0xff000000;
    }
    else
    {
      pot.c = 0;
    }
  }
}
Beispiel #14
0
void audio_thread::operator()()
{
	thread_ctrl::set_native_priority(1);

	AudioDumper m_dump(g_cfg.audio.dump_to_file ? 2 : 0); // Init AudioDumper for 2 channels if enabled

	float buf2ch[2 * BUFFER_SIZE]{}; // intermediate buffer for 2 channels
	float buf8ch[8 * BUFFER_SIZE]{}; // intermediate buffer for 8 channels

	const u32 buf_sz = BUFFER_SIZE * (g_cfg.audio.convert_to_u16 ? 2 : 4) * (g_cfg.audio.downmix_to_2ch ? 2 : 8);

	std::unique_ptr<float[]> out_buffer[BUFFER_NUM];

	for (u32 i = 0; i < BUFFER_NUM; i++)
	{
		out_buffer[i].reset(new float[8 * BUFFER_SIZE] {});
	}

	const auto audio = Emu.GetCallbacks().get_audio();
	audio->Open(buf8ch, buf_sz);

	while (thread_ctrl::state() != thread_state::aborting && !Emu.IsStopped())
	{
		if (Emu.IsPaused())
		{
			thread_ctrl::wait_for(1000); // hack
			continue;
		}

		const u64 stamp0 = get_system_time();

		const u64 time_pos = stamp0 - start_time - Emu.GetPauseTime();

		// TODO: send beforemix event (in ~2,6 ms before mixing)

		// precise time of sleeping: 5,(3) ms (or 256/48000 sec)
		const u64 expected_time = m_counter * AUDIO_SAMPLES * 1000000 / 48000;
		if (expected_time >= time_pos)
		{
			thread_ctrl::wait_for(1000); // hack
			continue;
		}

		m_counter++;

		const u32 out_pos = m_counter % BUFFER_NUM;

		bool first_mix = true;

		// mixing:
		for (auto& port : ports)
		{
			if (port.state != audio_port_state::started) continue;

			const u32 block_size = port.channel * AUDIO_SAMPLES;
			const u32 position = port.tag % port.block; // old value
			const u32 buf_addr = port.addr.addr() + position * block_size * sizeof(float);

			auto buf = vm::_ptr<f32>(buf_addr);

			static const float k = 1.0f; // may be 1.0f
			const float& m = port.level;

			auto step_volume = [](audio_port& port) // part of cellAudioSetPortLevel functionality
			{
				const auto param = port.level_set.load();

				if (param.inc != 0.0f)
				{
					port.level += param.inc;
					const bool dec = param.inc < 0.0f;

					if ((!dec && param.value - port.level <= 0.0f) || (dec && param.value - port.level >= 0.0f))
					{
						port.level = param.value;
						port.level_set.compare_and_swap(param, { param.value, 0.0f });
					}
				}
			};

			if (port.channel == 2)
			{
				if (first_mix)
				{
					for (u32 i = 0; i < std::size(buf2ch); i += 2)
					{
						step_volume(port);

						// reverse byte order
						const float left = buf[i + 0] * m;
						const float right = buf[i + 1] * m;

						buf2ch[i + 0] = left;
						buf2ch[i + 1] = right;

						buf8ch[i * 4 + 0] = left;
						buf8ch[i * 4 + 1] = right;
						buf8ch[i * 4 + 2] = 0.0f;
						buf8ch[i * 4 + 3] = 0.0f;
						buf8ch[i * 4 + 4] = 0.0f;
						buf8ch[i * 4 + 5] = 0.0f;
						buf8ch[i * 4 + 6] = 0.0f;
						buf8ch[i * 4 + 7] = 0.0f;
					}
					first_mix = false;
				}
				else
				{
					for (u32 i = 0; i < std::size(buf2ch); i += 2)
					{
						step_volume(port);

						const float left = buf[i + 0] * m;
						const float right = buf[i + 1] * m;

						buf2ch[i + 0] += left;
						buf2ch[i + 1] += right;

						buf8ch[i * 4 + 0] += left;
						buf8ch[i * 4 + 1] += right;
					}
				}
			}
			else if (port.channel == 8)
			{
				if (first_mix)
				{
					for (u32 i = 0; i < std::size(buf2ch); i += 2)
					{
						step_volume(port);

						const float left = buf[i * 4 + 0] * m;
						const float right = buf[i * 4 + 1] * m;
						const float center = buf[i * 4 + 2] * m;
						const float low_freq = buf[i * 4 + 3] * m;
						const float rear_left = buf[i * 4 + 4] * m;
						const float rear_right = buf[i * 4 + 5] * m;
						const float side_left = buf[i * 4 + 6] * m;
						const float side_right = buf[i * 4 + 7] * m;

						const float mid = (center + low_freq) * 0.708f;
						buf2ch[i + 0] = (left + rear_left + side_left + mid) * k;
						buf2ch[i + 1] = (right + rear_right + side_right + mid) * k;

						buf8ch[i * 4 + 0] = left;
						buf8ch[i * 4 + 1] = right;
						buf8ch[i * 4 + 2] = center;
						buf8ch[i * 4 + 3] = low_freq;
						buf8ch[i * 4 + 4] = rear_left;
						buf8ch[i * 4 + 5] = rear_right;
						buf8ch[i * 4 + 6] = side_left;
						buf8ch[i * 4 + 7] = side_right;
					}
					first_mix = false;
				}
				else
				{
					for (u32 i = 0; i < std::size(buf2ch); i += 2)
					{
						step_volume(port);

						const float left = buf[i * 4 + 0] * m;
						const float right = buf[i * 4 + 1] * m;
						const float center = buf[i * 4 + 2] * m;
						const float low_freq = buf[i * 4 + 3] * m;
						const float rear_left = buf[i * 4 + 4] * m;
						const float rear_right = buf[i * 4 + 5] * m;
						const float side_left = buf[i * 4 + 6] * m;
						const float side_right = buf[i * 4 + 7] * m;

						const float mid = (center + low_freq) * 0.708f;
						buf2ch[i + 0] += (left + rear_left + side_left + mid) * k;
						buf2ch[i + 1] += (right + rear_right + side_right + mid) * k;

						buf8ch[i * 4 + 0] += left;
						buf8ch[i * 4 + 1] += right;
						buf8ch[i * 4 + 2] += center;
						buf8ch[i * 4 + 3] += low_freq;
						buf8ch[i * 4 + 4] += rear_left;
						buf8ch[i * 4 + 5] += rear_right;
						buf8ch[i * 4 + 6] += side_left;
						buf8ch[i * 4 + 7] += side_right;
					}
				}
			}
			else
			{
				fmt::throw_exception("Unknown channel count (port=%u, channel=%d)" HERE, port.number, port.channel);
			}

			memset(buf, 0, block_size * sizeof(float));
		}


		if (!first_mix)
		{
			// Copy output data (2ch or 8ch)
			if (g_cfg.audio.downmix_to_2ch)
			{
				for (u32 i = 0; i < std::size(buf2ch); i++)
				{
					out_buffer[out_pos][i] = buf2ch[i];
				}
			}
			else
			{
				for (u32 i = 0; i < std::size(buf8ch); i++)
				{
					out_buffer[out_pos][i] = buf8ch[i];
				}
			}
		}

		const u64 stamp1 = get_system_time();

		if (first_mix)
		{
			std::memset(out_buffer[out_pos].get(), 0, 8 * BUFFER_SIZE * sizeof(float));
		}

		if (g_cfg.audio.convert_to_u16)
		{
			// convert the data from float to u16 with clipping:
			// 2x MULPS
			// 2x MAXPS (optional)
			// 2x MINPS (optional)
			// 2x CVTPS2DQ (converts float to s32)
			// PACKSSDW (converts s32 to s16 with signed saturation)

			__m128i buf_u16[BUFFER_SIZE];

			for (size_t i = 0; i < 8 * BUFFER_SIZE; i += 8)
			{
				const auto scale = _mm_set1_ps(0x8000);
				buf_u16[i / 8] = _mm_packs_epi32(
					_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(out_buffer[out_pos].get() + i), scale)),
					_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(out_buffer[out_pos].get() + i + 4), scale)));
			}

			audio->AddData(buf_u16, buf_sz);
		}
		else
		{
			audio->AddData(out_buffer[out_pos].get(), buf_sz);
		}

		const u64 stamp2 = get_system_time();

		{
			// update indices:

			for (u32 i = 0; i < AUDIO_PORT_COUNT; i++)
			{
				audio_port& port = ports[i];

				if (port.state != audio_port_state::started) continue;

				u32 position = port.tag % port.block; // old value
				port.counter = m_counter;
				port.tag++; // absolute index of block that will be read
				m_indexes[i] = (position + 1) % port.block; // write new value
			}

			// send aftermix event (normal audio event)

			auto _locked = g_idm->lock<named_thread<audio_thread>>(0);

			for (u64 key : keys)
			{
				// TODO: move out of the lock scope
				if (auto queue = lv2_event_queue::find(key))
				{
					queue->send(0, 0, 0, 0); // TODO: check arguments
				}
			}
		}

		const u64 stamp3 = get_system_time();

		switch (m_dump.GetCh())
		{
		case 2: m_dump.WriteData(&buf2ch, sizeof(buf2ch)); break; // write file data (2 ch)
		case 8: m_dump.WriteData(&buf8ch, sizeof(buf8ch)); break; // write file data (8 ch)
		}

		cellAudio.trace("Audio perf: (access=%d, AddData=%d, events=%d, dump=%d)",
			stamp1 - stamp0, stamp2 - stamp1, stamp3 - stamp2, get_system_time() - stamp3);
	}
}
//-------------------------------------------------------------------------------
// For each tile go through all the bins and process all the triangles in it.
// Rasterize each triangle to the CPU depth buffer. 
//-------------------------------------------------------------------------------
void DepthBufferRasterizerSSEST::RasterizeBinnedTrianglesToDepthBuffer(UINT tileId, UINT idx)
{
	// Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster)
	_mm_setcsr( _mm_getcsr() | 0x8040 );

	__m128i colOffset = _mm_setr_epi32(0, 1, 0, 1);
	__m128i rowOffset = _mm_setr_epi32(0, 0, 1, 1);

	__m128i fxptZero = _mm_setzero_si128();
	float* pDepthBuffer = (float*)mpRenderTargetPixels[idx]; 

	// Based on TaskId determine which tile to process
	UINT screenWidthInTiles = SCREENW/TILE_WIDTH_IN_PIXELS;
    UINT tileX = tileId % screenWidthInTiles;
    UINT tileY = tileId / screenWidthInTiles;

    int tileStartX = tileX * TILE_WIDTH_IN_PIXELS;
	int tileEndX   = tileStartX + TILE_WIDTH_IN_PIXELS - 1;
	
	int tileStartY = tileY * TILE_HEIGHT_IN_PIXELS;
	int tileEndY   = tileStartY + TILE_HEIGHT_IN_PIXELS - 1;

	ClearDepthTile(tileStartX, tileStartY, tileEndX+1, tileEndY+1, idx);

	UINT bin = 0;
	UINT binIndex = 0;
	UINT offset1 = YOFFSET1_ST * tileY + XOFFSET1_ST * tileX;
	UINT offset2 = YOFFSET2_ST * tileY + XOFFSET2_ST * tileX;
	UINT numTrisInBin = mpNumTrisInBin[idx][offset1 + bin];

	__m128 gatherBuf[4][3];
	bool done = false;
	bool allBinsEmpty = true;
	mNumRasterizedTris[idx][tileId] = numTrisInBin;
	while(!done)
	{
		// Loop through all the bins and process 4 binned traingles at a time
		UINT ii;
		int numSimdTris = 0;
		for(ii = 0; ii < SSE; ii++)
		{
			while(numTrisInBin <= 0)
			{
				 // This bin is empty.  Move to next bin.
				if(++bin >= 1)
				{
					break;
				}
				numTrisInBin = mpNumTrisInBin[idx][offset1 + bin];
				mNumRasterizedTris[idx][tileId] += numTrisInBin;
				binIndex = 0;
			}
			if(!numTrisInBin)
			{
				break; // No more tris in the bins
			}
			USHORT modelId = mpBinModel[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex];
			USHORT meshId = mpBinMesh[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex];
			UINT triIdx = mpBin[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex];
			mpTransformedModels1[modelId].Gather(gatherBuf[ii], meshId, triIdx, idx);
			allBinsEmpty = false;
			numSimdTris++; 

			++binIndex;
			--numTrisInBin;
		}
		done = bin >= NUM_XFORMVERTS_TASKS;
		
		if(allBinsEmpty)
		{
			return;
		}

		// use fixed-point only for X and Y.  Avoid work for Z and W.
        __m128i fxPtX[3], fxPtY[3];
		__m128 Z[3];
		for(int i = 0; i < 3; i++)
		{
			__m128 v0 = gatherBuf[0][i];
			__m128 v1 = gatherBuf[1][i];
			__m128 v2 = gatherBuf[2][i];
			__m128 v3 = gatherBuf[3][i];

  			// transpose into SoA layout
			_MM_TRANSPOSE4_PS(v0, v1, v2, v3);
			fxPtX[i] = _mm_cvtps_epi32(v0);
			fxPtY[i] = _mm_cvtps_epi32(v1);
			Z[i] = v2;
		}

		// Fab(x, y) =     Ax       +       By     +      C              = 0
		// Fab(x, y) = (ya - yb)x   +   (xb - xa)y + (xa * yb - xb * ya) = 0
		// Compute A = (ya - yb) for the 3 line segments that make up each triangle
		__m128i A0 = _mm_sub_epi32(fxPtY[1], fxPtY[2]);
		__m128i A1 = _mm_sub_epi32(fxPtY[2], fxPtY[0]);
		__m128i A2 = _mm_sub_epi32(fxPtY[0], fxPtY[1]);

		// Compute B = (xb - xa) for the 3 line segments that make up each triangle
		__m128i B0 = _mm_sub_epi32(fxPtX[2], fxPtX[1]);
		__m128i B1 = _mm_sub_epi32(fxPtX[0], fxPtX[2]);
		__m128i B2 = _mm_sub_epi32(fxPtX[1], fxPtX[0]);

		// Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
		__m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[1], fxPtY[2]), _mm_mullo_epi32(fxPtX[2], fxPtY[1]));
		__m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[2], fxPtY[0]), _mm_mullo_epi32(fxPtX[0], fxPtY[2]));
		__m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[0], fxPtY[1]), _mm_mullo_epi32(fxPtX[1], fxPtY[0]));

		// Compute triangle area
		__m128i triArea = _mm_mullo_epi32(B2, A1);
		triArea = _mm_sub_epi32(triArea, _mm_mullo_epi32(B1, A2));
		__m128 oneOverTriArea = _mm_div_ps(_mm_set1_ps(1.0f), _mm_cvtepi32_ps(triArea));

		Z[1] = _mm_mul_ps(_mm_sub_ps(Z[1], Z[0]), oneOverTriArea);
		Z[2] = _mm_mul_ps(_mm_sub_ps(Z[2], Z[0]), oneOverTriArea);

		// Use bounding box traversal strategy to determine which pixels to rasterize 
		__m128i startX = _mm_and_si128(Max(Min(Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(tileStartX)), _mm_set1_epi32(0xFFFFFFFE));
		__m128i endX   = Min(_mm_add_epi32(Max(Max(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(1)), _mm_set1_epi32(tileEndX));

		__m128i startY = _mm_and_si128(Max(Min(Min(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(tileStartY)), _mm_set1_epi32(0xFFFFFFFE));
		__m128i endY   = Min(_mm_add_epi32(Max(Max(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(1)), _mm_set1_epi32(tileEndY));

		// Now we have 4 triangles set up.  Rasterize them each individually.
        for(int lane=0; lane < numSimdTris; lane++)
        {
			// Extract this triangle's properties from the SIMD versions
            __m128 zz[3];
			for(int vv = 0; vv < 3; vv++)
			{
				zz[vv] = _mm_set1_ps(Z[vv].m128_f32[lane]);
			}
									
			int startXx = startX.m128i_i32[lane];
			int endXx	= endX.m128i_i32[lane];
			int startYy = startY.m128i_i32[lane];
			int endYy	= endY.m128i_i32[lane];
		
			__m128i aa0 = _mm_set1_epi32(A0.m128i_i32[lane]);
			__m128i aa1 = _mm_set1_epi32(A1.m128i_i32[lane]);
			__m128i aa2 = _mm_set1_epi32(A2.m128i_i32[lane]);

			__m128i bb0 = _mm_set1_epi32(B0.m128i_i32[lane]);
			__m128i bb1 = _mm_set1_epi32(B1.m128i_i32[lane]);
			__m128i bb2 = _mm_set1_epi32(B2.m128i_i32[lane]);

			__m128i aa0Inc = _mm_slli_epi32(aa0, 1);
			__m128i aa1Inc = _mm_slli_epi32(aa1, 1);
			__m128i aa2Inc = _mm_slli_epi32(aa2, 1);

			__m128i row, col;

			// Tranverse pixels in 2x2 blocks and store 2x2 pixel quad depths contiguously in memory ==> 2*X
			// This method provides better perfromance
			int rowIdx = (startYy * SCREENW + 2 * startXx);

			col = _mm_add_epi32(colOffset, _mm_set1_epi32(startXx));
			__m128i aa0Col = _mm_mullo_epi32(aa0, col);
			__m128i aa1Col = _mm_mullo_epi32(aa1, col);
			__m128i aa2Col = _mm_mullo_epi32(aa2, col);

			row = _mm_add_epi32(rowOffset, _mm_set1_epi32(startYy));
			__m128i bb0Row = _mm_add_epi32(_mm_mullo_epi32(bb0, row), _mm_set1_epi32(C0.m128i_i32[lane]));
			__m128i bb1Row = _mm_add_epi32(_mm_mullo_epi32(bb1, row), _mm_set1_epi32(C1.m128i_i32[lane]));
			__m128i bb2Row = _mm_add_epi32(_mm_mullo_epi32(bb2, row), _mm_set1_epi32(C2.m128i_i32[lane]));

			__m128i sum0Row = _mm_add_epi32(aa0Col, bb0Row);
			__m128i sum1Row = _mm_add_epi32(aa1Col, bb1Row);
			__m128i sum2Row = _mm_add_epi32(aa2Col, bb2Row);

			__m128i bb0Inc = _mm_slli_epi32(bb0, 1);
			__m128i bb1Inc = _mm_slli_epi32(bb1, 1);
			__m128i bb2Inc = _mm_slli_epi32(bb2, 1);

			__m128 zx = _mm_mul_ps(_mm_cvtepi32_ps(aa1Inc), zz[1]);
			zx = _mm_add_ps(zx, _mm_mul_ps(_mm_cvtepi32_ps(aa2Inc), zz[2]));

			// Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
			for(int r = startYy; r < endYy; r += 2,
											rowIdx += 2 * SCREENW,
											sum0Row = _mm_add_epi32(sum0Row, bb0Inc),
											sum1Row = _mm_add_epi32(sum1Row, bb1Inc),
											sum2Row = _mm_add_epi32(sum2Row, bb2Inc))
			{
				// Compute barycentric coordinates 
				int index = rowIdx;
				__m128i alpha = sum0Row;
				__m128i beta = sum1Row;
				__m128i gama = sum2Row;

				//Compute barycentric-interpolated depth
				__m128 depth = zz[0];
				depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(beta), zz[1]));
				depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(gama), zz[2]));

				for(int c = startXx; c < endXx; c += 2,
												index += 4,
												alpha = _mm_add_epi32(alpha, aa0Inc),
												beta  = _mm_add_epi32(beta, aa1Inc),
												gama  = _mm_add_epi32(gama, aa2Inc),
												depth = _mm_add_ps(depth, zx))
				{
					//Test Pixel inside triangle
					__m128i mask = _mm_or_si128(_mm_or_si128(alpha, beta), gama);
					
					__m128 previousDepthValue = _mm_load_ps(&pDepthBuffer[index]);
					__m128 mergedDepth = _mm_max_ps(depth, previousDepthValue);
					__m128 finalDepth = _mm_blendv_ps(mergedDepth, previousDepthValue, _mm_castsi128_ps(mask));
					_mm_store_ps(&pDepthBuffer[index], finalDepth);
				}//for each column											
			}// for each row
		}// for each triangle
	}// for each set of SIMD# triangles
}