/*
 * Edge detector for one 16-bit plane, eight pixels per inner step (SSE2).
 *
 * buff     scratch memory holding a ring of five rows, bstride bytes apart
 * width    number of pixels per row
 * height   number of rows
 * stride   pitch of both s and d, in bytes
 * d, s     destination and source planes
 * eh       supplies rshift and the min/max thresholds
 * plane_max  accepted but not referenced by this implementation
 *
 * For each pixel the neighbours at x-2, x-1, x+1, x+2 of the centre row are
 * weighted with ar_mulxf[] and the pixels of rows y-2, y-1, y+1, y+2 with
 * ar_mulyf[].  The absolute values of both sums are merged as
 * 0.96043387 * larger + 0.39782473 * smaller, converted to integer, shifted
 * right by eh->rshift, limited to 0xFFFF and narrowed to 16 bits.  Values that
 * reach eh->max are forced to 0xFFFF, values not above eh->min to 0.
 */
static void GF_FUNC_ALIGN VS_CC proc_16bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *d, const uint8_t *s, edge_t *eh, uint16_t plane_max)
{
    const uint16_t *src_line = (uint16_t *)s;
    uint16_t *dst_line = (uint16_t *)d;

    /* Pitches arrive in bytes; from here on they count 16-bit samples. */
    stride /= 2;
    bstride /= 2;

    /* Five-row ring inside buff: two rows above, the centre, two below. */
    uint16_t *up2 = (uint16_t *)buff + 8;
    uint16_t *up1 = up2 + bstride;
    uint16_t *mid = up1 + bstride;
    uint16_t *dn1 = mid + bstride;
    uint16_t *dn2 = dn1 + bstride;
    uint16_t *const ring_head = up2;
    uint16_t *const ring_tail = dn2;

    /* Prime the ring: the two rows "above" row 0 are source rows 2 and 1. */
    line_copy16(up2, src_line + 2 * stride, width, 2);
    line_copy16(up1, src_line + stride, width, 2);
    line_copy16(mid, src_line, width, 2);
    src_line += stride;
    line_copy16(dn1, src_line, width, 2);

    const __m128i zero = _mm_setzero_si128();
    const __m128 alpha = _mm_set1_ps((float)0.96043387);
    const __m128 beta = _mm_set1_ps((float)0.39782473);
    const __m128i limit = _mm_set1_epi32(0xFFFF);
    const __m128i th_low = _mm_set1_epi16((int16_t)eh->min);
    const __m128i th_high = _mm_set1_epi16((int16_t)eh->max);

    for (int y = 0; y < height; y++) {
        /* Step forward through the source, backwards once the end is near. */
        if (y < height - 2) {
            src_line += stride;
        } else {
            src_line -= stride;
        }
        line_copy16(dn2, src_line, width, 2);

        uint16_t *horiz[] = { mid - 2, mid - 1, mid + 1, mid + 2 };
        uint16_t *vert[] = { up2, up1, dn1, dn2 };

        for (int x = 0; x < width; x += 8) {
            __m128 gx[2] = { _mm_setzero_ps(), _mm_setzero_ps() };
            __m128 gy[2] = { _mm_setzero_ps(), _mm_setzero_ps() };

            for (int k = 0; k < 4; k++) {
                /* Horizontal taps are at odd offsets, hence unaligned loads. */
                const __m128 wx = _mm_load_ps(ar_mulxf[k]);
                const __m128i hpix = _mm_loadu_si128((__m128i *)(horiz[k] + x));
                const __m128 h_lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(hpix, zero));
                const __m128 h_hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(hpix, zero));
                gx[0] = _mm_add_ps(gx[0], _mm_mul_ps(h_lo, wx));
                gx[1] = _mm_add_ps(gx[1], _mm_mul_ps(h_hi, wx));

                const __m128 wy = _mm_load_ps(ar_mulyf[k]);
                const __m128i vpix = _mm_load_si128((__m128i *)(vert[k] + x));
                const __m128 v_lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(vpix, zero));
                const __m128 v_hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(vpix, zero));
                gy[0] = _mm_add_ps(gy[0], _mm_mul_ps(v_lo, wy));
                gy[1] = _mm_add_ps(gy[1], _mm_mul_ps(v_hi, wy));
            }

            __m128i mag[2];
            for (int half = 0; half < 2; half++) {
                const __m128 ax = mm_abs_ps(gx[half]);
                const __m128 ay = mm_abs_ps(gy[half]);
                const __m128 greater = _mm_max_ps(ax, ay);
                const __m128 lesser = _mm_min_ps(ax, ay);
                const __m128 blend = _mm_add_ps(_mm_mul_ps(alpha, greater),
                                                _mm_mul_ps(beta, lesser));
                const __m128i scaled = _mm_srli_epi32(_mm_cvtps_epi32(blend), eh->rshift);
                mag[half] = mm_min_epi32(scaled, limit);
            }

            /* Narrow to 16 bits, then apply the high and low thresholds. */
            __m128i px = mm_cast_epi32(mag[0], mag[1]);

            __m128i hit = MM_MIN_EPU16(px, th_high);
            hit = _mm_cmpeq_epi16(hit, th_high);    /* px >= max */
            px = _mm_or_si128(hit, px);             /* -> 0xFFFF */

            hit = MM_MAX_EPU16(px, th_low);
            hit = _mm_cmpeq_epi16(hit, th_low);     /* px <= min */
            px = _mm_andnot_si128(hit, px);         /* -> 0 */

            _mm_store_si128((__m128i *)(dst_line + x), px);
        }

        dst_line += stride;

        /* Rotate the ring; the oldest row becomes the next one to fill. */
        up2 = up1;
        up1 = mid;
        mid = dn1;
        dn1 = dn2;
        dn2 = (dn2 == ring_tail) ? ring_head : dn2 + bstride;
    }
}
/*
 * Convert the 16-bit input rows [t->start_y, t->end_y) to 8-bit pixels in
 * t->output: apply the 3x3 matrix t->matrix, scale by _normalize, clamp to
 * [0, _ones_ps], raise to t->output_gamma, scale by _8bit and store four
 * bytes per pixel with _alpha_mask OR-ed in.
 *
 * Four pixels are handled per iteration of the main loop; any remaining
 * pixels of a row are handled one at a time.
 */
void transform8_otherrgb_avx(ThreadInfo* t)
{
    RS_IMAGE16 *input = t->input;
    GdkPixbuf *output = t->output;
    RS_MATRIX3 *matrix = t->matrix;
    gint lane, row, col, y;
    gint remaining;
    float mat_ps[4*4*3] __attribute__ ((aligned (16)));

    /* Each coefficient is replicated into four lanes; rows are 12 floats apart. */
    for (lane = 0; lane < 4; lane++) {
        for (row = 0; row < 3; row++) {
            for (col = 0; col < 3; col++) {
                mat_ps[12 * row + 4 * col + lane] = matrix->coeff[row][col];
            }
        }
    }

    /* Round the start column down to a multiple of four pixels. */
    int start_x = (t->start_x / 4) * 4;

    /* Round the width up to a multiple of four when the image has room for it. */
    int complete_w = t->end_x - start_x;
    if ((complete_w & 3) && (t->end_x + 4) < input->w) {
        complete_w = (complete_w + 3) / 4 * 4;
    }

    const __m128 gamma = _mm_set1_ps(t->output_gamma);
    const __m128i zero = _mm_setzero_si128();
    const __m128 normalize = _mm_load_ps(_normalize);
    const __m128 max_val = _mm_load_ps(_ones_ps);
    const __m128 min_val = _mm_setzero_ps();
    const __m128 upscale = _mm_load_ps(_8bit);
    const __m128i alpha_mask = _mm_load_si128((__m128i*)_alpha_mask);

    for (y = t->start_y; y < t->end_y; y++) {
        gushort *in_px = GET_PIXEL(input, start_x, y);
        guchar *out_px = GET_PIXBUF_PIXEL(output, start_x, y);
        gboolean aligned_write = !((guintptr)(out_px) & 0xf);

        /* Main loop: four pixels (16 input shorts, 16 output bytes) per pass. */
        for (remaining = complete_w >> 2; remaining != 0; remaining--) {
            /* Each 128-bit load carries two pixels of four 16-bit channels. */
            __m128i pair01 = _mm_load_si128((__m128i*)in_px);
            __m128i pair23 = _mm_load_si128((__m128i*)in_px + 1);
            _mm_prefetch(in_px + 64, _MM_HINT_NTA);

            /* Widen to 32 bits and convert to float, one register per pixel. */
            __m128 px0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(pair01, zero));
            __m128 px1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(pair01, zero));
            __m128 px2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(pair23, zero));
            __m128 px3 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(pair23, zero));

            /* Transpose to planar form: one register per channel. */
            __m128 rg01 = _mm_unpacklo_ps(px0, px1);
            __m128 b01 = _mm_unpackhi_ps(px0, px1);
            __m128 rg23 = _mm_unpacklo_ps(px2, px3);
            __m128 b23 = _mm_unpackhi_ps(px2, px3);
            __m128 r = _mm_movelh_ps(rg01, rg23);
            __m128 g = _mm_movehl_ps(rg23, rg01);
            __m128 b = _mm_movelh_ps(b01, b23);

            /* Colour matrix */
            __m128 r_out = sse_matrix3_mul(mat_ps, r, g, b);
            __m128 g_out = sse_matrix3_mul(&mat_ps[12], r, g, b);
            __m128 b_out = sse_matrix3_mul(&mat_ps[24], r, g, b);

            /* Normalize and clamp */
            r = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, r_out)));
            g = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, g_out)));
            b = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, b_out)));

            /* Gamma, then scale up to the 8-bit range */
            r = _mm_mul_ps(upscale, _mm_fastpow_ps(r, gamma));
            g = _mm_mul_ps(upscale, _mm_fastpow_ps(g, gamma));
            b = _mm_mul_ps(upscale, _mm_fastpow_ps(b, gamma));

            /* To integers, narrowed to 16 bits */
            __m128i r_i = _mm_cvtps_epi32(r);
            __m128i g_i = _mm_cvtps_epi32(g);
            __m128i b_i = _mm_cvtps_epi32(b);
            r_i = _mm_packs_epi32(r_i, r_i);
            g_i = _mm_packs_epi32(g_i, g_i);
            b_i = _mm_packs_epi32(b_i, b_i);

            /* Interleave back to R,G,B,B per pixel; the mask is OR-ed over the result. */
            __m128i rg_i = _mm_unpacklo_epi16(r_i, g_i);
            __m128i bb_i = _mm_unpacklo_epi16(b_i, b_i);
            __m128i lo = _mm_unpacklo_epi32(rg_i, bb_i);
            __m128i hi = _mm_unpackhi_epi32(rg_i, bb_i);
            __m128i packed = _mm_or_si128(alpha_mask, _mm_packus_epi16(lo, hi));

            if (aligned_write) {
                _mm_store_si128((__m128i*)out_px, packed);
            } else {
                _mm_storeu_si128((__m128i*)out_px, packed);
            }

            in_px += 16;
            out_px += 16;
        }

        /* Tail: one pixel per pass. */
        for (remaining = complete_w & 3; remaining != 0; remaining--) {
            /* A 64-bit load carries a single pixel (four 16-bit channels). */
            __m128i one_px = _mm_loadl_epi64((__m128i*)in_px);
            __m128 pxf = _mm_cvtepi32_ps(_mm_unpacklo_epi16(one_px, zero));

            /* Broadcast each channel so the same matrix helper can be used. */
            __m128 r = _mm_shuffle_ps(pxf, pxf, _MM_SHUFFLE(0,0,0,0));
            __m128 g = _mm_shuffle_ps(pxf, pxf, _MM_SHUFFLE(1,1,1,1));
            __m128 b = _mm_shuffle_ps(pxf, pxf, _MM_SHUFFLE(2,2,2,2));

            __m128 r_out = sse_matrix3_mul(mat_ps, r, g, b);
            __m128 g_out = sse_matrix3_mul(&mat_ps[12], r, g, b);
            __m128 b_out = sse_matrix3_mul(&mat_ps[24], r, g, b);

            /* Gather into lane order R, G, B, B. */
            __m128 rgb = _mm_unpacklo_ps(r_out, g_out);
            rgb = _mm_movelh_ps(rgb, b_out);

            rgb = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, rgb)));
            rgb = _mm_mul_ps(upscale, _mm_fastpow_ps(rgb, gamma));

            /* float -> int32 -> int16 -> uint8, then OR in the mask */
            __m128i rgb_i = _mm_cvtps_epi32(rgb);
            rgb_i = _mm_packs_epi32(rgb_i, zero);
            rgb_i = _mm_or_si128(alpha_mask, _mm_packus_epi16(rgb_i, zero));
            *(int*)out_px = _mm_cvtsi128_si32(rgb_i);

            in_px += 4;
            out_px += 4;
        }
    }
}
/*
 * 5x5 convolution of one 16-bit plane, eight pixels per inner step (SSE2).
 *
 * ch       supplies the 25 coefficients (m), rdiv, bias and the saturate flag
 * buff     scratch memory holding a ring of five rows, bstride bytes apart
 * width    number of pixels per row
 * height   number of rows
 * stride   pitch of both s and d, in bytes
 * d, s     destination and source planes
 *
 * The weighted sum is multiplied by rdiv and offset by bias.  When
 * ch->saturate is zero the absolute value is taken.  The result is converted
 * to integer, limited to 0xFFFF, non-positive values are zeroed, and the two
 * halves are narrowed to 16 bits.
 */
static void GF_FUNC_ALIGN VS_CC proc_16bit_sse2(convolution_t *ch, uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *d, const uint8_t *s)
{
    const uint16_t *src_line = (uint16_t *)s;
    uint16_t *dst_line = (uint16_t *)d;

    /* Pitches arrive in bytes; from here on they count 16-bit samples. */
    stride /= 2;
    bstride /= 2;

    /* Five-row ring inside buff. */
    uint16_t *r0 = (uint16_t *)buff + 8;
    uint16_t *r1 = r0 + bstride;
    uint16_t *r2 = r1 + bstride;
    uint16_t *r3 = r2 + bstride;
    uint16_t *r4 = r3 + bstride;
    uint16_t *const ring_head = r0;
    uint16_t *const ring_tail = r4;

    /* Prime the ring: the two rows "above" row 0 are source rows 2 and 1. */
    line_copy16(r0, src_line + 2 * stride, width, 2);
    line_copy16(r1, src_line + stride, width, 2);
    line_copy16(r2, src_line, width, 2);
    src_line += stride;
    line_copy16(r3, src_line, width, 2);

    const __m128i zero = _mm_setzero_si128();
    const __m128 rdiv = _mm_set1_ps((float)ch->rdiv);
    const __m128 bias = _mm_set1_ps((float)ch->bias);
    const __m128i limit = _mm_set1_epi32(0xFFFF);

    __m128 coef[25];
    for (int k = 0; k < 25; k++) {
        coef[k] = _mm_set1_ps((float)ch->m[k]);
    }

    for (int y = 0; y < height; y++) {
        /* Step forward through the source, backwards once the end is near. */
        if (y < height - 2) {
            src_line += stride;
        } else {
            src_line -= stride;
        }
        line_copy16(r4, src_line, width, 2);

        /* Tap k reads ring row k / 5 at horizontal offset k % 5 - 2. */
        uint16_t *rows[5] = { r0, r1, r2, r3, r4 };
        uint16_t *tap[25];
        for (int r = 0; r < 5; r++) {
            for (int c = 0; c < 5; c++) {
                tap[r * 5 + c] = rows[r] + (c - 2);
            }
        }

        for (int x = 0; x < width; x += 8) {
            __m128 acc[2] = { _mm_setzero_ps(), _mm_setzero_ps() };

            for (int k = 0; k < 25; k++) {
                const __m128i pix = _mm_loadu_si128((__m128i *)(tap[k] + x));
                const __m128 lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(pix, zero));
                const __m128 hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(pix, zero));
                acc[0] = _mm_add_ps(acc[0], _mm_mul_ps(lo, coef[k]));
                acc[1] = _mm_add_ps(acc[1], _mm_mul_ps(hi, coef[k]));
            }

            __m128i res[2];
            for (int half = 0; half < 2; half++) {
                __m128 v = _mm_add_ps(_mm_mul_ps(acc[half], rdiv), bias);
                if (!ch->saturate) {
                    v = mm_abs_ps(v);
                }
                const __m128i iv = mm_min_epi32(_mm_cvtps_epi32(v), limit);
                /* Keep only strictly positive lanes. */
                res[half] = _mm_and_si128(iv, _mm_cmpgt_epi32(iv, zero));
            }

            _mm_store_si128((__m128i *)(dst_line + x), mm_cast_epi32(res[0], res[1]));
        }

        dst_line += stride;

        /* Rotate the ring; the oldest row becomes the next one to fill. */
        r0 = r1;
        r1 = r2;
        r2 = r3;
        r3 = r4;
        r4 = (r4 == ring_tail) ? ring_head : r4 + bstride;
    }
}
/*
 * Convert packed YUV 4:2:2 to four bytes per pixel (R, G, B, 255).
 *
 * Each pass consumes 8 input bytes (two groups of YUV422_BPP bytes, two
 * pixels each) and writes 16 output bytes.  The loop stops as soon as fewer
 * than 8 input bytes or fewer than 16 output bytes remain.
 *
 *   R = Y + 1.13983 * (V - 128)
 *   G = Y - 0.39466 * (U - 128) - 0.58060 * (V - 128)
 *   B = Y + 2.03211 * (U - 128)
 *
 * Results are clamped to [0, 255] before the float -> integer conversion.
 */
void YUV422ToRGB888(const XnUInt8* pYUVImage, XnUInt8* pRGBAImage, XnUInt32 nYUVSize, XnUInt32 nRGBSize)
{
    /* Last positions from which a full group can still be read / written. */
    const XnUInt8* const pYUVLast = pYUVImage + nYUVSize - 8;
    XnUInt8* const pRGBLast = pRGBAImage + nRGBSize - 16;

    const __m128 minus128 = _mm_set_ps1(-128);
    const __m128 plus113983 = _mm_set_ps1(1.13983F);
    const __m128 minus039466 = _mm_set_ps1(-0.39466F);
    const __m128 minus058060 = _mm_set_ps1(-0.58060F);
    const __m128 plus203211 = _mm_set_ps1(2.03211F);
    const __m128 zero = _mm_set_ps1(0);
    const __m128 plus255 = _mm_set_ps1(255);

    for (; pYUVImage <= pYUVLast && pRGBAImage <= pRGBLast; pYUVImage += 8, pRGBAImage += 16)
    {
        /* Lane n holds pixel n; chroma is shared by each pair of pixels. */
        __m128 y = _mm_setr_ps(pYUVImage[YUV422_Y1],
                               pYUVImage[YUV422_Y2],
                               pYUVImage[YUV422_Y1 + YUV422_BPP],
                               pYUVImage[YUV422_Y2 + YUV422_BPP]);
        __m128 u = _mm_setr_ps(pYUVImage[YUV422_U],
                               pYUVImage[YUV422_U],
                               pYUVImage[YUV422_U + YUV422_BPP],
                               pYUVImage[YUV422_U + YUV422_BPP]);
        __m128 v = _mm_setr_ps(pYUVImage[YUV422_V],
                               pYUVImage[YUV422_V],
                               pYUVImage[YUV422_V + YUV422_BPP],
                               pYUVImage[YUV422_V + YUV422_BPP]);

        /* Centre the chroma samples around zero. */
        u = _mm_add_ps(u, minus128);
        v = _mm_add_ps(v, minus128);

        __m128 r = _mm_add_ps(y, _mm_mul_ps(plus113983, v));
        __m128 g = _mm_add_ps(y, _mm_mul_ps(minus039466, u));
        g = _mm_add_ps(g, _mm_mul_ps(minus058060, v));
        __m128 b = _mm_add_ps(y, _mm_mul_ps(plus203211, u));

        /* Clamp to [0, 255]. */
        r = _mm_min_ps(_mm_max_ps(r, zero), plus255);
        g = _mm_min_ps(_mm_max_ps(g, zero), plus255);
        b = _mm_min_ps(_mm_max_ps(b, zero), plus255);

        /* Convert to 32-bit integers and spill the lanes to memory. */
        XnUInt32 red[4];
        XnUInt32 green[4];
        XnUInt32 blue[4];
        _mm_storeu_si128((__m128i*)red, _mm_cvtps_epi32(r));
        _mm_storeu_si128((__m128i*)green, _mm_cvtps_epi32(g));
        _mm_storeu_si128((__m128i*)blue, _mm_cvtps_epi32(b));

        /* Values are already in 0..255, so narrowing to a byte loses nothing. */
        for (int px = 0; px < 4; ++px)
        {
            XnUInt8* out = pRGBAImage + 4 * px;
            out[0] = red[px];
            out[1] = green[px];
            out[2] = blue[px];
            out[3] = 255;
        }
    }
}
void Permutohedral::init ( const MatrixXf & feature ) { // Compute the lattice coordinates for each feature [there is going to be a lot of magic here N_ = feature.cols(); d_ = feature.rows(); HashTable hash_table( d_, N_/**(d_+1)*/ ); const int blocksize = sizeof(__m128) / sizeof(float); const __m128 invdplus1 = _mm_set1_ps( 1.0f / (d_+1) ); const __m128 dplus1 = _mm_set1_ps( d_+1 ); const __m128 Zero = _mm_set1_ps( 0 ); const __m128 One = _mm_set1_ps( 1 ); // Allocate the class memory offset_.resize( (d_+1)*(N_+16) ); std::fill( offset_.begin(), offset_.end(), 0 ); barycentric_.resize( (d_+1)*(N_+16) ); std::fill( barycentric_.begin(), barycentric_.end(), 0 ); rank_.resize( (d_+1)*(N_+16) ); // Allocate the local memory __m128 * scale_factor = (__m128*) _mm_malloc( (d_ )*sizeof(__m128) , 16 ); __m128 * f = (__m128*) _mm_malloc( (d_ )*sizeof(__m128) , 16 ); __m128 * elevated = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128) , 16 ); __m128 * rem0 = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128) , 16 ); __m128 * rank = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128), 16 ); float * barycentric = new float[(d_+2)*blocksize]; short * canonical = new short[(d_+1)*(d_+1)]; short * key = new short[d_+1]; // Compute the canonical simplex for( int i=0; i<=d_; i++ ){ for( int j=0; j<=d_-i; j++ ) canonical[i*(d_+1)+j] = i; for( int j=d_-i+1; j<=d_; j++ ) canonical[i*(d_+1)+j] = i - (d_+1); } // Expected standard deviation of our filter (p.6 in [Adams etal 2010]) float inv_std_dev = sqrt(2.0 / 3.0)*(d_+1); // Compute the diagonal part of E (p.5 in [Adams etal 2010]) for( int i=0; i<d_; i++ ) scale_factor[i] = _mm_set1_ps( 1.0 / sqrt( (i+2)*(i+1) ) * inv_std_dev ); // Setup the SSE rounding #ifndef __SSE4_1__ const unsigned int old_rounding = _mm_getcsr(); _mm_setcsr( (old_rounding&~_MM_ROUND_MASK) | _MM_ROUND_NEAREST ); #endif // Compute the simplex each feature lies in for( int k=0; k<N_; k+=blocksize ){ // Load the feature from memory float * ff = (float*)f; for( int j=0; j<d_; j++ ) 
for( int i=0; i<blocksize; i++ ) ff[ j*blocksize + i ] = k+i < N_ ? feature(j,k+i) : 0.0; // Elevate the feature ( y = Ep, see p.5 in [Adams etal 2010]) // sm contains the sum of 1..n of our faeture vector __m128 sm = Zero; for( int j=d_; j>0; j-- ){ __m128 cf = f[j-1]*scale_factor[j-1]; elevated[j] = sm - _mm_set1_ps(j)*cf; sm += cf; } elevated[0] = sm; // Find the closest 0-colored simplex through rounding __m128 sum = Zero; for( int i=0; i<=d_; i++ ){ __m128 v = invdplus1 * elevated[i]; #ifdef __SSE4_1__ v = _mm_round_ps( v, _MM_FROUND_TO_NEAREST_INT ); #else v = _mm_cvtepi32_ps( _mm_cvtps_epi32( v ) ); #endif rem0[i] = v*dplus1; sum += v; } // Find the simplex we are in and store it in rank (where rank describes what position coorinate i has in the sorted order of the features values) for( int i=0; i<=d_; i++ ) rank[i] = Zero; for( int i=0; i<d_; i++ ){ __m128 di = elevated[i] - rem0[i]; for( int j=i+1; j<=d_; j++ ){ __m128 dj = elevated[j] - rem0[j]; __m128 c = _mm_and_ps( One, _mm_cmplt_ps( di, dj ) ); rank[i] += c; rank[j] += One-c; } } // If the point doesn't lie on the plane (sum != 0) bring it back for( int i=0; i<=d_; i++ ){ rank[i] += sum; __m128 add = _mm_and_ps( dplus1, _mm_cmplt_ps( rank[i], Zero ) ); __m128 sub = _mm_and_ps( dplus1, _mm_cmpge_ps( rank[i], dplus1 ) ); rank[i] += add-sub; rem0[i] += add-sub; } // Compute the barycentric coordinates (p.10 in [Adams etal 2010]) for( int i=0; i<(d_+2)*blocksize; i++ ) barycentric[ i ] = 0; for( int i=0; i<=d_; i++ ){ __m128 v = (elevated[i] - rem0[i])*invdplus1; // Didn't figure out how to SSE this float * fv = (float*)&v; float * frank = (float*)&rank[i]; for( int j=0; j<blocksize; j++ ){ int p = d_-frank[j]; barycentric[j*(d_+2)+p ] += fv[j]; barycentric[j*(d_+2)+p+1] -= fv[j]; } } // The rest is not SSE'd for( int j=0; j<blocksize; j++ ){ // Wrap around barycentric[j*(d_+2)+0]+= 1 + barycentric[j*(d_+2)+d_+1]; float * frank = (float*)rank; float * frem0 = (float*)rem0; // Compute all vertices and 
their offset for( int remainder=0; remainder<=d_; remainder++ ){ for( int i=0; i<d_; i++ ){ key[i] = frem0[i*blocksize+j] + canonical[ remainder*(d_+1) + (int)frank[i*blocksize+j] ]; } offset_[ (j+k)*(d_+1)+remainder ] = hash_table.find( key, true ); rank_[ (j+k)*(d_+1)+remainder ] = frank[remainder*blocksize+j]; barycentric_[ (j+k)*(d_+1)+remainder ] = barycentric[ j*(d_+2)+remainder ]; } } } _mm_free( scale_factor ); _mm_free( f ); _mm_free( elevated ); _mm_free( rem0 ); _mm_free( rank ); delete [] barycentric; delete [] canonical; delete [] key; // Reset the SSE rounding #ifndef __SSE4_1__ _mm_setcsr( old_rounding ); #endif // This is normally fast enough so no SSE needed here // Find the Neighbors of each lattice point // Get the number of vertices in the lattice M_ = hash_table.size(); // Create the neighborhood structure blur_neighbors_.resize( (d_+1)*M_ ); short * n1 = new short[d_+1]; short * n2 = new short[d_+1]; // For each of d+1 axes, for( int j = 0; j <= d_; j++ ){ for( int i=0; i<M_; i++ ){ const short * key = hash_table.getKey( i ); for( int k=0; k<d_; k++ ){ n1[k] = key[k] - 1; n2[k] = key[k] + 1; } n1[j] = key[j] + d_; n2[j] = key[j] - d_; blur_neighbors_[j*M_+i].n1 = hash_table.find( n1 ); blur_neighbors_[j*M_+i].n2 = hash_table.find( n2 ); } } delete[] n1; delete[] n2; }
// Compute sine and cosine of `rad` (radians) at once with SSE2, following
// the Cephes single-precision algorithm: reduce |rad| to [-pi/4, pi/4] by
// subtracting an even multiple j of pi/4, evaluate both the sine and cosine
// polynomials there, then select and sign the results from the octant j.
//
//   rad  input angle in radians
//   sin  receives sin(rad)
//   cos  receives cos(rad)
//
// Requires <emmintrin.h> and <xmmintrin.h>.
//
// Corrections relative to the previous revision:
//   * the leading sine coefficient was a copy of the cosine one
//     (2.44e-5); it is now -1.9515295891e-4;
//   * DP1 was truncated to -0.7851562; it is now -0.78515625, which is
//     exactly representable as required by the split-constant reduction;
//   * the octant index is obtained by truncation (_mm_cvttps_epi32), not by
//     rounding, so the reduced argument stays inside [-pi/4, pi/4].
// 4/pi is written as a literal so the function no longer depends on an
// external `pi`; it rounds to the same float.
static void SinCos(const float rad, float &sin, float &cos)
{
    const __m128 four_over_pi = _mm_set1_ps(1.27323954473516f);
    const __m128 half         = _mm_set1_ps(0.5f);
    const __m128 one          = _mm_set1_ps(1.0f);

    // -pi/4 split into three parts of decreasing magnitude
    const __m128 dp1 = _mm_set1_ps(-0.78515625f);
    const __m128 dp2 = _mm_set1_ps(-2.4187564849853515625e-4f);
    const __m128 dp3 = _mm_set1_ps(-3.77489497744594108e-8f);

    const __m128 sin_c0 = _mm_set1_ps(-1.9515295891e-4f);
    const __m128 sin_c1 = _mm_set1_ps(8.3321608736e-3f);
    const __m128 sin_c2 = _mm_set1_ps(-1.6666654611e-1f);
    const __m128 cos_c0 = _mm_set1_ps(2.443315711809948e-5f);
    const __m128 cos_c1 = _mm_set1_ps(-1.388731625493765e-3f);
    const __m128 cos_c2 = _mm_set1_ps(4.166664568298827e-2f);

    const __m128i i_one     = _mm_set1_epi32(1);
    const __m128i i_not_one = _mm_set1_epi32(~1);
    const __m128i i_two     = _mm_set1_epi32(2);
    const __m128i i_four    = _mm_set1_epi32(4);

    const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32( 0x80000000));
    const __m128 abs_mask  = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));

    const __m128 a = _mm_set1_ps(rad);

    // Work on |rad|; the sign is re-applied to the sine at the end.
    __m128 x = _mm_and_ps(a, abs_mask);
    __m128 y = _mm_mul_ps(x, four_over_pi);

    // j = (trunc(y) + 1) & ~1 : the even octant index nearest to y
    __m128i j = _mm_cvttps_epi32(y);
    j = _mm_add_epi32(j, i_one);
    j = _mm_and_si128(j, i_not_one);
    y = _mm_cvtepi32_ps(j);

    // Bit 2 of j flips the sign of the sine.
    __m128i sin_flip = _mm_and_si128(j, i_four);
    sin_flip = _mm_slli_epi32(sin_flip, 29);
    const __m128 swap_sign_bit_sin = _mm_castsi128_ps(sin_flip);

    // Bit 1 of j selects which polynomial yields the sine:
    // all ones -> sine polynomial, zero -> cosine polynomial.
    __m128i sel = _mm_and_si128(j, i_two);
    sel = _mm_cmpeq_epi32(sel, _mm_setzero_si128());
    const __m128 poly_mask = _mm_castsi128_ps(sel);

    // x -= j * pi/4, done in three steps to keep precision
    x = _mm_add_ps(x, _mm_mul_ps(y, dp1));
    x = _mm_add_ps(x, _mm_mul_ps(y, dp2));
    x = _mm_add_ps(x, _mm_mul_ps(y, dp3));

    // Sign of the cosine: bit 2 of ~(j - 2)
    __m128i cos_flip = _mm_sub_epi32(j, i_two);
    cos_flip = _mm_andnot_si128(cos_flip, i_four);
    cos_flip = _mm_slli_epi32(cos_flip, 29);
    const __m128 sign_bit_cos = _mm_castsi128_ps(cos_flip);

    const __m128 sign_bit_sin = _mm_xor_ps(_mm_and_ps(a, sign_mask), swap_sign_bit_sin);

    const __m128 z = _mm_mul_ps(x, x);

    // Cosine polynomial: 1 - z/2 + z^2 * (c2 + z * (c1 + z * c0))
    __m128 y1 = _mm_mul_ps(cos_c0, z);
    y1 = _mm_add_ps(y1, cos_c1);
    y1 = _mm_mul_ps(y1, z);
    y1 = _mm_add_ps(y1, cos_c2);
    y1 = _mm_mul_ps(y1, z);
    y1 = _mm_mul_ps(y1, z);
    y1 = _mm_sub_ps(y1, _mm_mul_ps(z, half));
    y1 = _mm_add_ps(y1, one);

    // Sine polynomial: x + x * z * (c2 + z * (c1 + z * c0))
    __m128 y2 = _mm_mul_ps(sin_c0, z);
    y2 = _mm_add_ps(y2, sin_c1);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, sin_c2);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_mul_ps(y2, x);
    y2 = _mm_add_ps(y2, x);

    // Route each polynomial to the output it belongs to.
    const __m128 sin_from_cos_poly = _mm_andnot_ps(poly_mask, y1);
    const __m128 sin_from_sin_poly = _mm_and_ps(poly_mask, y2);
    const __m128 sin_abs = _mm_add_ps(sin_from_cos_poly, sin_from_sin_poly);
    const __m128 cos_abs = _mm_add_ps(_mm_sub_ps(y1, sin_from_cos_poly),
                                      _mm_sub_ps(y2, sin_from_sin_poly));

    sin = _mm_cvtss_f32(_mm_xor_ps(sin_abs, sign_bit_sin));
    cos = _mm_cvtss_f32(_mm_xor_ps(cos_abs, sign_bit_cos));
}
bool CResizeEngine::horizontalFilter(CDIBSection *src, uint src_height, CDIBSection *dst, uint dst_yoffset, uint dst_height, ILongTimeRunCallback *pCallback) { assert(src->getBitCounts() == dst->getBitCounts()); int bitcount = src->getBitCounts(); assert((int)src_height <= src->getHeight()); assert(src_height >= dst_height); uint dst_ymax = dst_yoffset + dst_height; assert((int)dst_ymax <= dst->getHeight()); uint src_width = src->getWidth(); uint dst_width = dst->getWidth(); if (dst_width == src_width) { uint8 *src_bits = src->getData(); uint8 *dst_bits = dst->getLine(dst_yoffset); assert(src_bits && dst_bits); uint height = min(dst_height, src_height); memcpy(dst_bits, src_bits, height * dst->getStride()); } else if (!m_pFilter) { // fast (COLORONCOLOR) double ratio_w = (double)src_width / (double)dst_width; uint bytespp = bitcount / 8; for (uint y = dst_yoffset, sy = 0; y < dst_ymax; ++ y, ++ sy) { uint8 *dst_data = (uint8 *)dst->getLine(y); uint8 *src_line = (uint8 *)src->getLine(sy); for (uint x = 0; x < dst_width; ++ x) { uint sx = (uint)(x * ratio_w + 0.5); if (sx >= src_width) { sx = src_width - 1; } uint8 *src_data = src_line + sx * bytespp; for (uint i = 0; i < bytespp; ++ i) { *dst_data ++ = *src_data ++; } } } } else { // use m_pFilter uint index; // pixel index CWeightsTable weightsTable(m_pFilter, dst_width, src_width); #ifdef USE_SSE __m128i value, t; __m128 a, b, c, v05 = _mm_set_ps1(0.5); #elif (defined(USE_SSE2)) __m128i value, t; __m128d a, b, c, v05 = _mm_set1_pd(0.5); #endif uint bytespp = src->getBitCounts() / 8; assert(bytespp == 3 || bytespp == 4); for (uint dsty = dst_yoffset, srcy = 0; dsty < dst_ymax; ++ dsty, ++ srcy) { // test for stop if (srcy % 32 == 0) { if (pCallback && pCallback->shouldStop()) { return false; } } uint8 *src_bits = src->getLine(srcy); uint8 *dst_bits = dst->getLine(dsty); for(uint x = 0; x < dst_width; ++ x) { int iLeft = weightsTable.getLeftBoundary(x); int iRight = weightsTable.getRightBoundary(x); index = iLeft * 
bytespp; #ifdef USE_SSE __m128 v = _mm_set_ps1(0.0); _mm_prefetch((const char *)src_bits + index, _MM_HINT_T0); #elif defined(USE_SSE2) __m128d v1 = _mm_set1_pd(0.0); __m128d v2 = _mm_set1_pd(0.0); #elif defined(USE_FLOAT) float value[4] = {0, 0, 0, 0}; #else double value[4] = {0, 0, 0, 0}; // 4 = 32bpp max #endif for(int i = iLeft; i <= iRight; ++ i) { #ifdef USE_SSE float weight = (float)weightsTable.getWeight(x, i - iLeft); a = _mm_set_ps1(weight); if (bytespp == 3) { t = _mm_set_epi32(0, src_bits[index + 2], src_bits[index + 1], src_bits[index]); } else { t = _mm_set_epi32(src_bits[index + 3], src_bits[index + 2], src_bits[index + 1], src_bits[index]); } b = _mm_cvtepi32_ps(t); c = _mm_mul_ps(a, b); v = _mm_add_ps(v, c); index += bytespp; #elif defined(USE_SSE2) double weight = weightsTable.getWeight(x, i-iLeft); a = _mm_set1_pd(weight); t = _mm_set_epi32(0, 0, src_bits[index + 1], src_bits[index]); b = _mm_cvtepi32_pd(t); c = _mm_mul_pd(a, b); v1 = _mm_add_pd(v1, c); t = _mm_set_epi32(0, 0, bytespp == 3 ? 
0 : src_bits[index + 3], src_bits[index + 2]); b = _mm_cvtepi32_pd(t); c = _mm_mul_pd(a, b); v2 = _mm_add_pd(v2, c); index += bytespp; #elif defined(USE_FLOAT) float weight = (float)weightsTable.getWeight(x, i-iLeft); for (uint j = 0; j < bytespp; ++ j) { value[j] += (weight * (float)src_bits[index ++]); } #else double weight = weightsTable.getWeight(x, i-iLeft); for (uint j = 0; j < bytespp; ++ j) { value[j] += (weight * (double)src_bits[index ++]); } #endif } #ifdef USE_SSE v = _mm_add_ps(v, v05); value = _mm_cvtps_epi32(v); dst_bits[0] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[0]), (int)255); dst_bits[1] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[1]), (int)255); dst_bits[2] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[2]), (int)255); if (bytespp == 4) { dst_bits[3] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[3]), (int)255); } #elif defined (USE_SSE2) v1 = _mm_add_pd(v1, v05); v2 = _mm_add_pd(v2, v05); value = _mm_cvtpd_epi32(v1); dst_bits[0] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[0]), (int)255); dst_bits[1] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[1]), (int)255); value = _mm_cvtpd_epi32(v2); dst_bits[2] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[0]), (int)255); if (bytespp == 4) { dst_bits[3] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[1]), (int)255); } #else for (uint j = 0; j < bytespp; ++ j) { dst_bits[j] = (unsigned char)MIN(MAX((int)0, (int)(value[j] + 0.5)), (int)255); } #endif dst_bits += bytespp; } } } return true; }
bool CResizeEngine::verticalFilter(CDIBSection *src, CDIBSection *dst, ILongTimeRunCallback *pCallback) { assert(src->getBitCounts() == dst->getBitCounts()); int bitcount = src->getBitCounts(); uint src_width = src->getWidth(); uint src_height = src->getHeight(); uint dst_width = dst->getWidth(); uint dst_height = dst->getHeight(); assert(src_width == dst_width); src_width = src_width; if (src_height == dst_height) { unsigned char *src_bits = (unsigned char *)src->getData(); unsigned char *dst_bits = (unsigned char *)dst->getData(); assert(src_bits && dst_bits); memcpy(dst_bits, src_bits, dst_height * dst->getStride()); } else if (!m_pFilter) { // fast (COLOR ON COLOR) double ratio_h = (double)src_height / (double)dst_height; uint bytespp = bitcount / 8; for (uint y = 0; y < dst_height; ++ y) { uint sy = (uint)(y * ratio_h + 0.5); if (sy >= src_height) { sy = src_height - 1; } uint8 *dst_data = (uint8 *)dst->getLine(y); uint8 *src_line = (uint8 *)src->getLine(sy); for (uint x = 0; x < dst_width; ++ x) { uint8 *src_data = src_line + x * bytespp; for (uint i = 0; i < bytespp; ++ i) { *dst_data ++ = *src_data ++; } } } } else { #ifdef USE_SSE __m128i value, t; __m128 a, b, c, v05 = _mm_set_ps1(0.5); #elif (defined(USE_SSE2)) __m128i value, t; __m128d a, b, c, v05 = _mm_set1_pd(0.5); #endif uint index; // pixel index CWeightsTable weightsTable(m_pFilter, dst_height, src_height); uint bytespp = src->getBitCounts() / 8; assert(bytespp == 3 || bytespp == 4); unsigned src_pitch = src->getStride(); unsigned dst_pitch = dst->getStride(); for(uint x = 0; x < dst_width; ++ x) { // test for stop if (x % 16 == 0) { if (pCallback && pCallback->shouldStop()) { return false; } } index = x * bytespp; unsigned char *dst_bits = (unsigned char *)dst->getData(); dst_bits += index; for(uint y = 0; y < dst_height; ++ y) { #ifdef USE_SSE __m128 v = _mm_set_ps1(0.0); #elif defined (USE_SSE2) __m128d v1 = _mm_set1_pd(0.0); __m128d v2 = _mm_set1_pd(0.0); #elif defined (USE_FLOAT) float 
value[4] = {0, 0, 0, 0}; #else double value[4] = {0, 0, 0, 0}; // 4 = 32bpp max #endif int iLeft = weightsTable.getLeftBoundary(y); int iRight = weightsTable.getRightBoundary(y); uint8 *src_bits = src->getLine(iLeft); src_bits += index; for(int i = iLeft; i <= iRight; ++ i) { #ifdef USE_SSE float weight = (float)weightsTable.getWeight(y, i - iLeft); a = _mm_set_ps1(weight); if (bytespp == 3) { t = _mm_set_epi32(0, src_bits[2], src_bits[1], src_bits[0]); } else { t = _mm_set_epi32(src_bits[3], src_bits[2], src_bits[1], src_bits[0]); } b = _mm_cvtepi32_ps(t); c = _mm_mul_ps(a, b); v = _mm_add_ps(v, c); #elif defined(USE_SSE2) double weight = weightsTable.getWeight(y, i - iLeft); a = _mm_set1_pd(weight); t = _mm_set_epi32(0, 0, src_bits[1], src_bits[0]); b = _mm_cvtepi32_pd(t); c = _mm_mul_pd(a, b); v1 = _mm_add_pd(v1, c); t = _mm_set_epi32(0, 0, bytespp == 3 ? 0 : src_bits[3], src_bits[2]); b = _mm_cvtepi32_pd(t); c = _mm_mul_pd(a, b); v2 = _mm_add_pd(v2, c); #elif defined (USE_FLOAT) float weight = (float)weightsTable.getWeight(y, i - iLeft); for (uint j = 0; j < bytespp; ++ j) { value[j] += (weight * (float)src_bits[j]); } #else double weight = weightsTable.getWeight(y, i - iLeft); for (uint j = 0; j < bytespp; ++ j) { value[j] += (weight * (double)src_bits[j]); } #endif src_bits += src_pitch; } // clamp and place result in destination pixel #ifdef USE_SSE v = _mm_add_ps(v, v05); value = _mm_cvtps_epi32(v); // __m128i flag = _mm_cmpgt_epi32(value, _mm_set1_epi32(0)); // value = _mm_and_si128(value, flag); // dst_bits[0] = (unsigned char)MIN(255, value.m128i_i32[0]); // dst_bits[1] = (unsigned char)MIN(255, value.m128i_i32[1]); // dst_bits[2] = (unsigned char)MIN(255, value.m128i_i32[2]); // if (bytespp == 4) { // dst_bits[3] = (unsigned char)MIN(255, value.m128i_i32[3]); // } dst_bits[0] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[0]), (int)255); dst_bits[1] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[1]), (int)255); dst_bits[2] = (unsigned 
char)MIN(MAX((int)0, value.m128i_i32[2]), (int)255); if (bytespp == 4) { dst_bits[3] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[3]), (int)255); } #elif defined (USE_SSE2) v1 = _mm_add_pd(v1, v05); v2 = _mm_add_pd(v2, v05); value = _mm_cvtpd_epi32(v1); dst_bits[0] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[0]), (int)255); dst_bits[1] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[1]), (int)255); value = _mm_cvtpd_epi32(v2); dst_bits[2] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[0]), (int)255); if (bytespp == 4) { dst_bits[3] = (unsigned char)MIN(MAX((int)0, value.m128i_i32[1]), (int)255); } #else for (unsigned j = 0; j < bytespp; ++ j) { dst_bits[j] = (unsigned char)MIN(MAX((int)0, (int)(value[j] + 0.5)), (int)255); } #endif dst_bits += dst_pitch; } } } return true; }
// Pixel shader for four pixels: sample a texture, modulate it with the
// interpolated colour, pack to 8-bit B,G,R,A (blue in the low byte) and
// optionally blend with the four pixels already stored at pBuffer.
//
//   bsp      supplies the blend factors sFactor / dFactor
//   work     supplies the texture view and sampler
//   pAttrs   attributes 0..3 are the colour R,G,B,A, 4 and 5 the U,V coords
//   pBuffer  destination pixels; read with an aligned 128-bit load
//
// Supported blends: (GL_ONE, GL_ONE) adds source and destination with
// unsigned saturation; (GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA) scales the
// source by its alpha and the destination by one minus it before the
// saturating add.  Any other combination returns the unblended pixel.
// The packed pixels are returned bit-cast to __m128.
INLINE __m128 shade(BilinearSamplePos const& bsp, const SWR_TRIANGLE_DESC & work, WideVector<BilinearSamplePos::NUM_ATTRIBUTES, __m128> const& pAttrs, BYTE* pBuffer, BYTE*, UINT*)
{
    TextureView *pView = (TextureView*)work.pTextureViews[KNOB_NUMBER_OF_TEXTURE_VIEWS + 0];
    Sampler *pSampler = (Sampler*)work.pSamplers[0];

    TexCoord uv;
    uv.U = get<4>(pAttrs);
    uv.V = get<5>(pAttrs);

    // Always sample mip level 0
    UINT mips[] = {0,0,0,0};
    WideColor texel;
    SampleSimplePointRGBAF32(*pView, *pSampler, uv, mips, texel);

    // modulate with the interpolated colour
    texel.R = _mm_mul_ps(texel.R, get<0>(pAttrs));
    texel.G = _mm_mul_ps(texel.G, get<1>(pAttrs));
    texel.B = _mm_mul_ps(texel.B, get<2>(pAttrs));
    texel.A = _mm_mul_ps(texel.A, get<3>(pAttrs));

    // float -> unorm, then pack as A<<24 | R<<16 | G<<8 | B
    const __m128i red   = vFloatToUnorm( texel.R );
    const __m128i green = vFloatToUnorm( texel.G );
    const __m128i blue  = vFloatToUnorm( texel.B );
    const __m128i alpha = vFloatToUnorm( texel.A );

    __m128i vResult = _mm_or_si128(blue, _mm_slli_epi32(green, 8));
    vResult = _mm_or_si128(vResult, _mm_slli_epi32(red, 16));
    vResult = _mm_or_si128(vResult, _mm_slli_epi32(alpha, 24));

    // Additive blend
    if (bsp.sFactor == GL_ONE && bsp.dFactor == GL_ONE)
    {
        const __m128i vDest = _mm_load_si128((const __m128i*)pBuffer);
        vResult = _mm_adds_epu8(vResult, vDest);
    }

    // Alpha blend
    if (bsp.sFactor == GL_SRC_ALPHA && bsp.dFactor == GL_ONE_MINUS_SRC_ALPHA)
    {
        // Byte shuffles that move one channel of each pixel into the low
        // byte of its 32-bit lane and zero the other three bytes.
        const __m128i SHUF_ALPHA = _mm_set_epi32(0x8080800f, 0x8080800b, 0x80808007, 0x80808003);
        const __m128i SHUF_RED   = _mm_set_epi32(0x8080800e, 0x8080800a, 0x80808006, 0x80808002);
        const __m128i SHUF_GREEN = _mm_set_epi32(0x8080800d, 0x80808009, 0x80808005, 0x80808001);
        const __m128i SHUF_BLUE  = _mm_set_epi32(0x8080800c, 0x80808008, 0x80808004, 0x80808000);

        // Source: colour * src_alpha, alpha unchanged
        const __m128i srcR = vFloatToUnorm(_mm_mul_ps(texel.R, texel.A));
        const __m128i srcG = vFloatToUnorm(_mm_mul_ps(texel.G, texel.A));
        const __m128i srcB = vFloatToUnorm(_mm_mul_ps(texel.B, texel.A));
        const __m128i srcA = vFloatToUnorm(texel.A);

        __m128i vSrc = _mm_or_si128(srcB, _mm_slli_epi32(srcG, 8));
        vSrc = _mm_or_si128(vSrc, _mm_slli_epi32(srcR, 16));
        vSrc = _mm_or_si128(vSrc, _mm_slli_epi32(srcA, 24));

        // Destination: split channels, scale by (1 - src_alpha)
        const __m128i vDest = _mm_load_si128((const __m128i*)pBuffer);
        const __m128 vInvSrcAlpha = _mm_sub_ps(_mm_set1_ps(1.0f), texel.A);

        const __m128 dstAf = _mm_mul_ps(_mm_cvtepi32_ps(_mm_shuffle_epi8(vDest, SHUF_ALPHA)), vInvSrcAlpha);
        const __m128 dstRf = _mm_mul_ps(_mm_cvtepi32_ps(_mm_shuffle_epi8(vDest, SHUF_RED)), vInvSrcAlpha);
        const __m128 dstGf = _mm_mul_ps(_mm_cvtepi32_ps(_mm_shuffle_epi8(vDest, SHUF_GREEN)), vInvSrcAlpha);
        const __m128 dstBf = _mm_mul_ps(_mm_cvtepi32_ps(_mm_shuffle_epi8(vDest, SHUF_BLUE)), vInvSrcAlpha);

        const __m128i dstA = _mm_cvtps_epi32(dstAf);
        const __m128i dstR = _mm_cvtps_epi32(dstRf);
        const __m128i dstG = _mm_cvtps_epi32(dstGf);
        const __m128i dstB = _mm_cvtps_epi32(dstBf);

        __m128i vDst = _mm_or_si128(dstB, _mm_slli_epi32(dstG, 8));
        vDst = _mm_or_si128(vDst, _mm_slli_epi32(dstR, 16));
        vDst = _mm_or_si128(vDst, _mm_slli_epi32(dstA, 24));

        // final rgba = min(src + dst, 255)
        vResult = _mm_adds_epu8(vSrc, vDst);
    }

    return _mm_castsi128_ps(vResult);
}
// -------------------------------------------------------------- vuint32 mandelbrot_SIMD_F32(vfloat32 a, vfloat32 b, int max_iter) // -------------------------------------------------------------- { // version avec test de sortie en float vuint32 iter = _mm_set1_epi32(0); vfloat32 fiter = _mm_set_ps(0,0,0,0); vfloat32 x,y,t,t2,zero,un,deux,quatre; // COMPLETER ICI int test,i = 0; // initialisation des variables x = _mm_set_ps(0,0,0,0); y = _mm_set_ps(0,0,0,0); deux = _mm_set_ps(2,2,2,2); quatre = _mm_set_ps(4,4,4,4); un = _mm_set_ps(1,1,1,1); zero = _mm_set_ps(0,0,0,0); // iteration zero t = _mm_mul_ps(x, x); t2 = _mm_mul_ps(y, y); y = _mm_mul_ps(x,y); y = _mm_mul_ps(y,deux); y = _mm_add_ps(y,b); x = _mm_sub_ps(t,t2); x = _mm_add_ps(x,a); // calcul while(i<max_iter && _mm_movemask_ps(t) != 15) { t = _mm_mul_ps(x, x); t2 = _mm_mul_ps(y, y); y = _mm_mul_ps(_mm_mul_ps(x,y),deux); y = _mm_add_ps(y,b); x = _mm_sub_ps(t,t2); x = _mm_add_ps(x,a); t2 = _mm_add_ps(t,t2); t2 = _mm_cmple_ps(t2,quatre); t = _mm_blendv_ps(zero,un,t2); fiter = _mm_add_ps(fiter,t); t = _mm_cmpeq_ps(t, zero); //display_vfloat32(t,"%f\t","T :: "); //printf(" MASK::%d \n",_mm_movemask_ps(t)); i+=1; } iter = _mm_cvtps_epi32(fiter); return iter; }
//----------------------------------------------------------------------------------------- // Rasterize the occludee AABB and depth test it against the CPU rasterized depth buffer // If any of the rasterized AABB pixels passes the depth test exit early and mark the occludee // as visible. If all rasterized AABB pixels are occluded then the occludee is culled //----------------------------------------------------------------------------------------- bool TransformedAABBoxAVX::RasterizeAndDepthTestAABBox(UINT *pRenderTargetPixels, const __m128 pXformedPos[], UINT idx) { // Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster) // Denormal are zero (DAZ) is bit 6 and Flush to zero (FZ) is bit 15. // so to enable the two to have to set bits 6 and 15 which 1000 0000 0100 0000 = 0x8040 _mm_setcsr( _mm_getcsr() | 0x8040 ); __m256i colOffset = _mm256_setr_epi32(0, 1, 2, 3, 0, 1, 2, 3); __m256i rowOffset = _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1); float* pDepthBuffer = (float*)pRenderTargetPixels; // Rasterize the AABB triangles 4 at a time for(UINT i = 0; i < AABB_TRIANGLES; i += SSE) { vFloat4 xformedPos[3]; Gather(xformedPos, i, pXformedPos, idx); // use fixed-point only for X and Y. Avoid work for Z and W. 
__m128i fxPtX[3], fxPtY[3]; for(int m = 0; m < 3; m++) { fxPtX[m] = _mm_cvtps_epi32(xformedPos[m].X); fxPtY[m] = _mm_cvtps_epi32(xformedPos[m].Y); } // Fab(x, y) = Ax + By + C = 0 // Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0 // Compute A = (ya - yb) for the 3 line segments that make up each triangle __m128i A0 = _mm_sub_epi32(fxPtY[1], fxPtY[2]); __m128i A1 = _mm_sub_epi32(fxPtY[2], fxPtY[0]); __m128i A2 = _mm_sub_epi32(fxPtY[0], fxPtY[1]); // Compute B = (xb - xa) for the 3 line segments that make up each triangle __m128i B0 = _mm_sub_epi32(fxPtX[2], fxPtX[1]); __m128i B1 = _mm_sub_epi32(fxPtX[0], fxPtX[2]); __m128i B2 = _mm_sub_epi32(fxPtX[1], fxPtX[0]); // Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle __m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[1], fxPtY[2]), _mm_mullo_epi32(fxPtX[2], fxPtY[1])); __m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[2], fxPtY[0]), _mm_mullo_epi32(fxPtX[0], fxPtY[2])); __m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[0], fxPtY[1]), _mm_mullo_epi32(fxPtX[1], fxPtY[0])); // Compute triangle area __m128i triArea = _mm_mullo_epi32(B2, A1); triArea = _mm_sub_epi32(triArea, _mm_mullo_epi32(B1, A2)); __m128 oneOverTriArea = _mm_rcp_ps(_mm_cvtepi32_ps(triArea)); __m128 Z[3]; Z[0] = xformedPos[0].Z; Z[1] = _mm_mul_ps(_mm_sub_ps(xformedPos[1].Z, Z[0]), oneOverTriArea); Z[2] = _mm_mul_ps(_mm_sub_ps(xformedPos[2].Z, Z[0]), oneOverTriArea); // Use bounding box traversal strategy to determine which pixels to rasterize //__m128i startX = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~1)); __m128i startX = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~3)); __m128i endX = HelperSSE::Min(HelperSSE::Max(HelperSSE::Max(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(SCREENW - 1)); __m128i startY = 
_mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~1)); __m128i endY = HelperSSE::Min(HelperSSE::Max(HelperSSE::Max(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(SCREENH - 1)); // Now we have 4 triangles set up. Rasterize them each individually. for(int lane=0; lane < SSE; lane++) { // Skip triangle if area is zero if(triArea.m128i_i32[lane] <= 0) { continue; } // Extract this triangle's properties from the SIMD versions __m256 zz[3]; for (int vv = 0; vv < 3; vv++) { zz[vv] = _mm256_set1_ps(Z[vv].m128_f32[lane]); } int startXx = startX.m128i_i32[lane]; int endXx = endX.m128i_i32[lane]; int startYy = startY.m128i_i32[lane]; int endYy = endY.m128i_i32[lane]; __m256i aa0 = _mm256_set1_epi32(A0.m128i_i32[lane]); __m256i aa1 = _mm256_set1_epi32(A1.m128i_i32[lane]); __m256i aa2 = _mm256_set1_epi32(A2.m128i_i32[lane]); __m256i bb0 = _mm256_set1_epi32(B0.m128i_i32[lane]); __m256i bb1 = _mm256_set1_epi32(B1.m128i_i32[lane]); __m256i bb2 = _mm256_set1_epi32(B2.m128i_i32[lane]); __m256i aa0Inc = _mm256_slli_epi32(aa0, 2); __m256i aa1Inc = _mm256_slli_epi32(aa1, 2); __m256i aa2Inc = _mm256_slli_epi32(aa2, 2); __m256i bb0Inc = _mm256_slli_epi32(bb0, 1); __m256i bb1Inc = _mm256_slli_epi32(bb1, 1); __m256i bb2Inc = _mm256_slli_epi32(bb2, 1); __m256i row, col; // Traverse pixels in 2x4 blocks and store 2x4 pixel quad depths contiguously in memory ==> 2*X // This method provides better performance int rowIdx = (startYy * SCREENW + 2 * startXx); col = _mm256_add_epi32(colOffset, _mm256_set1_epi32(startXx)); __m256i aa0Col = _mm256_mullo_epi32(aa0, col); __m256i aa1Col = _mm256_mullo_epi32(aa1, col); __m256i aa2Col = _mm256_mullo_epi32(aa2, col); row = _mm256_add_epi32(rowOffset, _mm256_set1_epi32(startYy)); __m256i bb0Row = _mm256_add_epi32(_mm256_mullo_epi32(bb0, row), _mm256_set1_epi32(C0.m128i_i32[lane])); __m256i bb1Row = _mm256_add_epi32(_mm256_mullo_epi32(bb1, row), 
_mm256_set1_epi32(C1.m128i_i32[lane])); __m256i bb2Row = _mm256_add_epi32(_mm256_mullo_epi32(bb2, row), _mm256_set1_epi32(C2.m128i_i32[lane])); __m256i sum0Row = _mm256_add_epi32(aa0Col, bb0Row); __m256i sum1Row = _mm256_add_epi32(aa1Col, bb1Row); __m256i sum2Row = _mm256_add_epi32(aa2Col, bb2Row); __m256 zx = _mm256_mul_ps(_mm256_cvtepi32_ps(aa1Inc), zz[1]); zx = _mm256_add_ps(zx, _mm256_mul_ps(_mm256_cvtepi32_ps(aa2Inc), zz[2])); // Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY) for (int r = startYy; r < endYy; r += 2, rowIdx += 2 * SCREENW, sum0Row = _mm256_add_epi32(sum0Row, bb0Inc), sum1Row = _mm256_add_epi32(sum1Row, bb1Inc), sum2Row = _mm256_add_epi32(sum2Row, bb2Inc)) { // Compute barycentric coordinates int index = rowIdx; __m256i alpha = sum0Row; __m256i beta = sum1Row; __m256i gama = sum2Row; //Compute barycentric-interpolated depth __m256 depth = zz[0]; depth = _mm256_add_ps(depth, _mm256_mul_ps(_mm256_cvtepi32_ps(beta), zz[1])); depth = _mm256_add_ps(depth, _mm256_mul_ps(_mm256_cvtepi32_ps(gama), zz[2])); __m256i anyOut = _mm256_setzero_si256(); for (int c = startXx; c < endXx; c += 4, index += 8, alpha = _mm256_add_epi32(alpha, aa0Inc), beta = _mm256_add_epi32(beta, aa1Inc), gama = _mm256_add_epi32(gama, aa2Inc), depth = _mm256_add_ps(depth, zx)) { //Test Pixel inside triangle __m256i mask = _mm256_or_si256(_mm256_or_si256(alpha, beta), gama); __m256 previousDepthValue = _mm256_loadu_ps(&pDepthBuffer[index]); __m256 depthMask = _mm256_cmp_ps(depth, previousDepthValue, 0x1D); __m256i finalMask = _mm256_andnot_si256(mask, _mm256_castps_si256(depthMask)); anyOut = _mm256_or_si256(anyOut, finalMask); }//for each column if (!_mm256_testz_si256(anyOut, _mm256_set1_epi32(0x80000000))) { return true; //early exit } }// for each row }// for each triangle }// for each set of SIMD# triangles return false; }
//---------------------------------------------------------------- // Transforms the AABB vertices to screen space once every frame // Also performs a coarse depth pre-test //---------------------------------------------------------------- PreTestResult TransformedAABBoxAVX::TransformAndPreTestAABBox(__m128 xformedPos[], const __m128 cumulativeMatrix[4], const float *pDepthSummary) { // w ends up being garbage, but it doesn't matter - we ignore it anyway. __m128 vCenter = _mm_loadu_ps(&mBBCenter.x); __m128 vHalf = _mm_loadu_ps(&mBBHalf.x); __m128 vMin = _mm_sub_ps(vCenter, vHalf); __m128 vMax = _mm_add_ps(vCenter, vHalf); // transforms __m128 xRow[2], yRow[2], zRow[2]; xRow[0] = _mm_shuffle_ps(vMin, vMin, 0x00) * cumulativeMatrix[0]; xRow[1] = _mm_shuffle_ps(vMax, vMax, 0x00) * cumulativeMatrix[0]; yRow[0] = _mm_shuffle_ps(vMin, vMin, 0x55) * cumulativeMatrix[1]; yRow[1] = _mm_shuffle_ps(vMax, vMax, 0x55) * cumulativeMatrix[1]; zRow[0] = _mm_shuffle_ps(vMin, vMin, 0xaa) * cumulativeMatrix[2]; zRow[1] = _mm_shuffle_ps(vMax, vMax, 0xaa) * cumulativeMatrix[2]; __m128 zAllIn = _mm_castsi128_ps(_mm_set1_epi32(~0)); __m128 screenMin = _mm_set1_ps(FLT_MAX); __m128 screenMax = _mm_set1_ps(-FLT_MAX); for(UINT i = 0; i < AABB_VERTICES; i++) { // Transform the vertex __m128 vert = cumulativeMatrix[3]; vert += xRow[sBBxInd[i]]; vert += yRow[sBByInd[i]]; vert += zRow[sBBzInd[i]]; // We have inverted z; z is in front of near plane iff z <= w. 
__m128 vertZ = _mm_shuffle_ps(vert, vert, 0xaa); // vert.zzzz __m128 vertW = _mm_shuffle_ps(vert, vert, 0xff); // vert.wwww __m128 zIn = _mm_cmple_ps(vertZ, vertW); zAllIn = _mm_and_ps(zAllIn, zIn); // project xformedPos[i] = _mm_div_ps(vert, vertW); // update bounds screenMin = _mm_min_ps(screenMin, xformedPos[i]); screenMax = _mm_max_ps(screenMax, xformedPos[i]); } // if any of the verts are z-clipped, we (conservatively) say the box is in if(_mm_movemask_ps(zAllIn) != 0xf) return ePT_VISIBLE; // Clip against screen bounds screenMin = _mm_max_ps(screenMin, _mm_setr_ps(0.0f, 0.0f, 0.0f, -FLT_MAX)); screenMax = _mm_min_ps(screenMax, _mm_setr_ps((float) (SCREENW - 1), (float) (SCREENH - 1), 1.0f, FLT_MAX)); // Quick rejection test if(_mm_movemask_ps(_mm_cmplt_ps(screenMax, screenMin))) return ePT_INVISIBLE; // Prepare integer bounds __m128 minMaxXY = _mm_shuffle_ps(screenMin, screenMax, 0x44); // minX,minY,maxX,maxY __m128i minMaxXYi = _mm_cvtps_epi32(minMaxXY); __m128i minMaxXYis = _mm_srai_epi32(minMaxXYi, 3); __m128 maxZ = _mm_shuffle_ps(screenMax, screenMax, 0xaa); // Traverse all 8x8 blocks covered by 2d screen-space BBox; // if we know for sure that this box is behind the geometry we know is there, // we can stop. int rX0 = minMaxXYis.m128i_i32[0]; int rY0 = minMaxXYis.m128i_i32[1]; int rX1 = minMaxXYis.m128i_i32[2]; int rY1 = minMaxXYis.m128i_i32[3]; __m128 anyCloser = _mm_setzero_ps(); for(int by = rY0; by <= rY1; by++) { const float *srcRow = pDepthSummary + by * (SCREENW/BLOCK_SIZE); // If for any 8x8 block, maxZ is not less than (=behind) summarized // min Z, box might be visible. for(int bx = rX0; bx <= rX1; bx++) { anyCloser = _mm_or_ps(anyCloser, _mm_cmpnlt_ss(maxZ, _mm_load_ss(&srcRow[bx]))); } if(_mm_movemask_ps(anyCloser)) { return ePT_UNSURE; // okay, box might be in } } // If we get here, we know for sure that the box is fully behind the stuff in the // depth buffer. return ePT_INVISIBLE; }
// Evaluates the particle field at position v.
//
// Particles are processed four at a time from fi.parts4 (fi.pn4 groups). For every
// group in which at least one particle has squared distance <= fi.treshf4, each
// particle contributes max(fi.one / dist^2 - fi.tresh4, 0) to the potential, and
// that contribution squared times the offset vector to the normal accumulator.
//
// Outputs written to pot:
//   x, y, z : accumulated normal, normalised with UnitFast() (zero if the potential is 0)
//   w       : potential minus fi.iso
//   c       : only when T::Color is non-zero - the contribution-weighted particle
//             color, divided by the potential (approximate reciprocal), scaled by
//             255 and packed into bytes with 0xff in the top byte; 0 when no group
//             contributed a color
void sINLINE RNMarchingCubesBase<T>::func(const sVector31 &v,typename T::FieldType &pot,const funcinfo &fi)
{
  // Broadcast the sample position to all four lanes
  __m128 vx = _mm_load_ps1(&v.x);
  __m128 vy = _mm_load_ps1(&v.y);
  __m128 vz = _mm_load_ps1(&v.z);

  // Per-lane accumulators: potential, normal and color
  __m128 po = _mm_setzero_ps();
  __m128 nx = _mm_setzero_ps();
  __m128 ny = _mm_setzero_ps();
  __m128 nz = _mm_setzero_ps();
  __m128 akkur = _mm_setzero_ps();
  __m128 akkug = _mm_setzero_ps();
  __m128 akkub = _mm_setzero_ps();
  __m128 akkua = _mm_setzero_ps();      // never accumulated; only pads the transpose below

  __m128 s255 = _mm_set_ps1(255.0f);
  sBool good = 0;                       // set once any color has been accumulated

  for(sInt i=0;i<fi.pn4;i++)
  {
    // SimdType depends on the template parameter, so standard C++ requires
    // 'typename' here (same as for T::FieldType in the signature).
    const typename T::SimdType *part = fi.parts4 + i;

    // Squared distance from v to each of the four particles
    __m128 dx = _mm_sub_ps(vx,part->x);
    __m128 dy = _mm_sub_ps(vy,part->y);
    __m128 dz = _mm_sub_ps(vz,part->z);
    __m128 ddx = _mm_mul_ps(dx,dx);
    __m128 ddy = _mm_mul_ps(dy,dy);
    __m128 ddz = _mm_mul_ps(dz,dz);
    __m128 pp = _mm_add_ps(_mm_add_ps(ddx,ddy),ddz);

    // Skip the group when none of its particles is within the threshold
    if(_mm_movemask_ps(_mm_cmple_ps(pp,fi.treshf4))!=0)
    {
      __m128 pp2 = _mm_sub_ps(_mm_div_ps(fi.one,pp),fi.tresh4);
      __m128 pp3 = _mm_max_ps(pp2,_mm_setzero_ps());   // clamp negative contributions to 0
      po = _mm_add_ps(po,pp3);                         // p = p+pp;
      __m128 pp4 = _mm_mul_ps(pp3,pp3);                // pp*pp
      nx = _mm_add_ps(nx,_mm_mul_ps(pp4,dx));          // n += d*(pp*pp)
      ny = _mm_add_ps(ny,_mm_mul_ps(pp4,dy));
      nz = _mm_add_ps(nz,_mm_mul_ps(pp4,dz));
      if(T::Color==1)
      {
        akkur = _mm_add_ps(akkur,_mm_mul_ps(pp3,part->cr));
        akkug = _mm_add_ps(akkug,_mm_mul_ps(pp3,part->cg));
        akkub = _mm_add_ps(akkub,_mm_mul_ps(pp3,part->cb));
        good = 1;
      }
    }
  }

  sF32 p = 0;
  sVector30 n;

  // Horizontal sums via transpose: afterwards r = (sum po, sum nx, sum ny, sum nz)
  _MM_TRANSPOSE4_PS(po,nx,ny,nz);
  __m128 r = _mm_add_ps(_mm_add_ps(_mm_add_ps(nx,ny),nz),po);
  n.x = r.m128_f32[1];
  n.y = r.m128_f32[2];
  n.z = r.m128_f32[3];
  p = r.m128_f32[0];

  if(p==0)
    n.Init(0,0,0);
  else
    n.UnitFast();

  pot.x = n.x;
  pot.y = n.y;
  pot.z = n.z;
  pot.w = p-fi.iso;

  if(T::Color)
  {
    if(good)
    {
      // Lane 0 of r becomes 255 / p (approximate reciprocal)
      r = _mm_mul_ss(s255,_mm_rcp_ss(r));

      // Horizontal sums of the color accumulators: r2 = (sum b, sum g, sum r, 0)
      _MM_TRANSPOSE4_PS(akkub,akkug,akkur,akkua);
      __m128 r2 = _mm_add_ps(_mm_add_ps(_mm_add_ps(akkur,akkug),akkub),akkua);
      r2 = _mm_mul_ps(r2,_mm_shuffle_ps(r,r,0x00));

      // Float -> int32 -> int16 -> uint8, saturating at each narrowing step
      __m128i r3 = _mm_cvtps_epi32(r2);
      r3 = _mm_packs_epi32(r3,r3);
      __m128i r4 = _mm_packus_epi16(r3,r3);
      pot.c = r4.m128i_u32[0]|0xff000000;
    }
    else
    {
      pot.c = 0;
    }
  }
}
// Audio mixing thread body.
//
// Until the thread is aborted or emulation stops, each pass that is due (based
// on m_counter and the elapsed, pause-corrected time):
//   1. mixes every started port (2 or 8 channels) into a 2-channel and an
//      8-channel intermediate buffer, applying the port level, and then clears
//      the port's source block;
//   2. copies the selected mix (or silence when no port was mixed) into one of
//      the BUFFER_NUM output buffers and passes it to the audio backend, either
//      as floats or converted to 16-bit samples with saturation;
//   3. advances the per-port read index and sends an event to every queue in `keys`;
//   4. optionally dumps the mix and traces the time spent in each stage.
// While paused or not yet due, the thread waits and retries.
void audio_thread::operator()()
{
	thread_ctrl::set_native_priority(1);

	// Dumper is set up for 2 channels when dumping is enabled, 0 otherwise
	AudioDumper dumper(g_cfg.audio.dump_to_file ? 2 : 0);

	float mix2ch[2 * BUFFER_SIZE]{}; // intermediate 2-channel mix
	float mix8ch[8 * BUFFER_SIZE]{}; // intermediate 8-channel mix

	const u32 sample_bytes = g_cfg.audio.convert_to_u16 ? 2 : 4;
	const u32 out_channels = g_cfg.audio.downmix_to_2ch ? 2 : 8;
	const u32 buf_sz = BUFFER_SIZE * sample_bytes * out_channels;

	std::unique_ptr<float[]> out_buffer[BUFFER_NUM];

	for (auto& slot : out_buffer)
	{
		slot.reset(new float[8 * BUFFER_SIZE] {});
	}

	const auto audio = Emu.GetCallbacks().get_audio();
	audio->Open(mix8ch, buf_sz);

	static const float k = 1.0f; // gain applied to the 8 -> 2 channel downmix

	// Part of the cellAudioSetPortLevel functionality: moves the port level one
	// increment towards the requested value and stops once it has been reached.
	auto step_volume = [](audio_port& port)
	{
		const auto param = port.level_set.load();

		if (param.inc == 0.0f)
		{
			return;
		}

		port.level += param.inc;

		const float remaining = param.value - port.level;
		const bool reached = param.inc < 0.0f ? remaining >= 0.0f : remaining <= 0.0f;

		if (reached)
		{
			port.level = param.value;
			port.level_set.compare_and_swap(param, { param.value, 0.0f });
		}
	};

	while (thread_ctrl::state() != thread_state::aborting && !Emu.IsStopped())
	{
		if (Emu.IsPaused())
		{
			thread_ctrl::wait_for(1000); // hack
			continue;
		}

		const u64 stamp0 = get_system_time();

		const u64 time_pos = stamp0 - start_time - Emu.GetPauseTime();

		// TODO: send beforemix event (in ~2,6 ms before mixing)

		// precise time of sleeping: 5,(3) ms (or 256/48000 sec)
		const u64 expected_time = m_counter * AUDIO_SAMPLES * 1000000 / 48000;
		if (expected_time >= time_pos)
		{
			thread_ctrl::wait_for(1000); // hack
			continue;
		}

		m_counter++;

		const u32 out_pos = m_counter % BUFFER_NUM;

		// True until the first started port has been mixed: that port overwrites
		// the intermediate buffers, every later one is added on top.
		bool first_mix = true;

		for (auto& port : ports)
		{
			if (port.state != audio_port_state::started)
				continue;

			const u32 block_size = port.channel * AUDIO_SAMPLES;
			const u32 position = port.tag % port.block; // old value
			const u32 buf_addr = port.addr.addr() + position * block_size * sizeof(float);

			auto buf = vm::_ptr<f32>(buf_addr);

			// Reference, so level changes made by step_volume() are picked up per frame
			const float& m = port.level;

			if (port.channel == 2)
			{
				for (u32 i = 0; i < std::size(mix2ch); i += 2)
				{
					step_volume(port);

					const float left = buf[i + 0] * m;
					const float right = buf[i + 1] * m;

					if (first_mix)
					{
						mix2ch[i + 0] = left;
						mix2ch[i + 1] = right;

						mix8ch[i * 4 + 0] = left;
						mix8ch[i * 4 + 1] = right;
						mix8ch[i * 4 + 2] = 0.0f;
						mix8ch[i * 4 + 3] = 0.0f;
						mix8ch[i * 4 + 4] = 0.0f;
						mix8ch[i * 4 + 5] = 0.0f;
						mix8ch[i * 4 + 6] = 0.0f;
						mix8ch[i * 4 + 7] = 0.0f;
					}
					else
					{
						mix2ch[i + 0] += left;
						mix2ch[i + 1] += right;

						mix8ch[i * 4 + 0] += left;
						mix8ch[i * 4 + 1] += right;
					}
				}
			}
			else if (port.channel == 8)
			{
				for (u32 i = 0; i < std::size(mix2ch); i += 2)
				{
					step_volume(port);

					const float left = buf[i * 4 + 0] * m;
					const float right = buf[i * 4 + 1] * m;
					const float center = buf[i * 4 + 2] * m;
					const float low_freq = buf[i * 4 + 3] * m;
					const float rear_left = buf[i * 4 + 4] * m;
					const float rear_right = buf[i * 4 + 5] * m;
					const float side_left = buf[i * 4 + 6] * m;
					const float side_right = buf[i * 4 + 7] * m;

					// Center and LFE are shared between both downmixed channels
					const float mid = (center + low_freq) * 0.708f;

					if (first_mix)
					{
						mix2ch[i + 0] = (left + rear_left + side_left + mid) * k;
						mix2ch[i + 1] = (right + rear_right + side_right + mid) * k;

						mix8ch[i * 4 + 0] = left;
						mix8ch[i * 4 + 1] = right;
						mix8ch[i * 4 + 2] = center;
						mix8ch[i * 4 + 3] = low_freq;
						mix8ch[i * 4 + 4] = rear_left;
						mix8ch[i * 4 + 5] = rear_right;
						mix8ch[i * 4 + 6] = side_left;
						mix8ch[i * 4 + 7] = side_right;
					}
					else
					{
						mix2ch[i + 0] += (left + rear_left + side_left + mid) * k;
						mix2ch[i + 1] += (right + rear_right + side_right + mid) * k;

						mix8ch[i * 4 + 0] += left;
						mix8ch[i * 4 + 1] += right;
						mix8ch[i * 4 + 2] += center;
						mix8ch[i * 4 + 3] += low_freq;
						mix8ch[i * 4 + 4] += rear_left;
						mix8ch[i * 4 + 5] += rear_right;
						mix8ch[i * 4 + 6] += side_left;
						mix8ch[i * 4 + 7] += side_right;
					}
				}
			}
			else
			{
				fmt::throw_exception("Unknown channel count (port=%u, channel=%d)" HERE, port.number, port.channel);
			}

			first_mix = false;

			// The consumed source block is cleared
			memset(buf, 0, block_size * sizeof(float));
		}

		float* const out = out_buffer[out_pos].get();

		if (!first_mix)
		{
			// Copy output data (2ch or 8ch)
			if (g_cfg.audio.downmix_to_2ch)
			{
				for (u32 i = 0; i < std::size(mix2ch); i++)
				{
					out[i] = mix2ch[i];
				}
			}
			else
			{
				for (u32 i = 0; i < std::size(mix8ch); i++)
				{
					out[i] = mix8ch[i];
				}
			}
		}

		const u64 stamp1 = get_system_time();

		if (first_mix)
		{
			// Nothing was mixed: output silence
			std::memset(out, 0, 8 * BUFFER_SIZE * sizeof(float));
		}

		if (g_cfg.audio.convert_to_u16)
		{
			// Float -> 16-bit conversion with clipping, 8 samples per step:
			// 2x MULPS
			// 2x MAXPS (optional)
			// 2x MINPS (optional)
			// 2x CVTPS2DQ (converts float to s32)
			// PACKSSDW (converts s32 to s16 with signed saturation)

			__m128i buf_u16[BUFFER_SIZE];

			const auto scale = _mm_set1_ps(0x8000);

			for (size_t block = 0; block < BUFFER_SIZE; block++)
			{
				const float* src = out + block * 8;

				buf_u16[block] = _mm_packs_epi32(
					_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), scale)),
					_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src + 4), scale)));
			}

			audio->AddData(buf_u16, buf_sz);
		}
		else
		{
			audio->AddData(out, buf_sz);
		}

		const u64 stamp2 = get_system_time();

		{
			// update indices:

			for (u32 i = 0; i < AUDIO_PORT_COUNT; i++)
			{
				audio_port& port = ports[i];

				if (port.state != audio_port_state::started)
					continue;

				const u32 position = port.tag % port.block; // old value
				port.counter = m_counter;
				port.tag++; // absolute index of block that will be read
				m_indexes[i] = (position + 1) % port.block; // write new value
			}

			// send aftermix event (normal audio event)

			auto _locked = g_idm->lock<named_thread<audio_thread>>(0);

			for (u64 key : keys)
			{
				// TODO: move out of the lock scope
				if (auto queue = lv2_event_queue::find(key))
				{
					queue->send(0, 0, 0, 0); // TODO: check arguments
				}
			}
		}

		const u64 stamp3 = get_system_time();

		switch (dumper.GetCh())
		{
		case 2: dumper.WriteData(&mix2ch, sizeof(mix2ch)); break; // write file data (2 ch)
		case 8: dumper.WriteData(&mix8ch, sizeof(mix8ch)); break; // write file data (8 ch)
		}

		cellAudio.trace("Audio perf: (access=%d, AddData=%d, events=%d, dump=%d)",
			stamp1 - stamp0, stamp2 - stamp1, stamp3 - stamp2, get_system_time() - stamp3);
	}
}
//------------------------------------------------------------------------------- // For each tile go through all the bins and process all the triangles in it. // Rasterize each triangle to the CPU depth buffer. //------------------------------------------------------------------------------- void DepthBufferRasterizerSSEST::RasterizeBinnedTrianglesToDepthBuffer(UINT tileId, UINT idx) { // Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster) _mm_setcsr( _mm_getcsr() | 0x8040 ); __m128i colOffset = _mm_setr_epi32(0, 1, 0, 1); __m128i rowOffset = _mm_setr_epi32(0, 0, 1, 1); __m128i fxptZero = _mm_setzero_si128(); float* pDepthBuffer = (float*)mpRenderTargetPixels[idx]; // Based on TaskId determine which tile to process UINT screenWidthInTiles = SCREENW/TILE_WIDTH_IN_PIXELS; UINT tileX = tileId % screenWidthInTiles; UINT tileY = tileId / screenWidthInTiles; int tileStartX = tileX * TILE_WIDTH_IN_PIXELS; int tileEndX = tileStartX + TILE_WIDTH_IN_PIXELS - 1; int tileStartY = tileY * TILE_HEIGHT_IN_PIXELS; int tileEndY = tileStartY + TILE_HEIGHT_IN_PIXELS - 1; ClearDepthTile(tileStartX, tileStartY, tileEndX+1, tileEndY+1, idx); UINT bin = 0; UINT binIndex = 0; UINT offset1 = YOFFSET1_ST * tileY + XOFFSET1_ST * tileX; UINT offset2 = YOFFSET2_ST * tileY + XOFFSET2_ST * tileX; UINT numTrisInBin = mpNumTrisInBin[idx][offset1 + bin]; __m128 gatherBuf[4][3]; bool done = false; bool allBinsEmpty = true; mNumRasterizedTris[idx][tileId] = numTrisInBin; while(!done) { // Loop through all the bins and process 4 binned traingles at a time UINT ii; int numSimdTris = 0; for(ii = 0; ii < SSE; ii++) { while(numTrisInBin <= 0) { // This bin is empty. Move to next bin. 
if(++bin >= 1) { break; } numTrisInBin = mpNumTrisInBin[idx][offset1 + bin]; mNumRasterizedTris[idx][tileId] += numTrisInBin; binIndex = 0; } if(!numTrisInBin) { break; // No more tris in the bins } USHORT modelId = mpBinModel[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex]; USHORT meshId = mpBinMesh[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex]; UINT triIdx = mpBin[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex]; mpTransformedModels1[modelId].Gather(gatherBuf[ii], meshId, triIdx, idx); allBinsEmpty = false; numSimdTris++; ++binIndex; --numTrisInBin; } done = bin >= NUM_XFORMVERTS_TASKS; if(allBinsEmpty) { return; } // use fixed-point only for X and Y. Avoid work for Z and W. __m128i fxPtX[3], fxPtY[3]; __m128 Z[3]; for(int i = 0; i < 3; i++) { __m128 v0 = gatherBuf[0][i]; __m128 v1 = gatherBuf[1][i]; __m128 v2 = gatherBuf[2][i]; __m128 v3 = gatherBuf[3][i]; // transpose into SoA layout _MM_TRANSPOSE4_PS(v0, v1, v2, v3); fxPtX[i] = _mm_cvtps_epi32(v0); fxPtY[i] = _mm_cvtps_epi32(v1); Z[i] = v2; } // Fab(x, y) = Ax + By + C = 0 // Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0 // Compute A = (ya - yb) for the 3 line segments that make up each triangle __m128i A0 = _mm_sub_epi32(fxPtY[1], fxPtY[2]); __m128i A1 = _mm_sub_epi32(fxPtY[2], fxPtY[0]); __m128i A2 = _mm_sub_epi32(fxPtY[0], fxPtY[1]); // Compute B = (xb - xa) for the 3 line segments that make up each triangle __m128i B0 = _mm_sub_epi32(fxPtX[2], fxPtX[1]); __m128i B1 = _mm_sub_epi32(fxPtX[0], fxPtX[2]); __m128i B2 = _mm_sub_epi32(fxPtX[1], fxPtX[0]); // Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle __m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[1], fxPtY[2]), _mm_mullo_epi32(fxPtX[2], fxPtY[1])); __m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[2], fxPtY[0]), _mm_mullo_epi32(fxPtX[0], fxPtY[2])); __m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[0], fxPtY[1]), _mm_mullo_epi32(fxPtX[1], fxPtY[0])); // Compute triangle area __m128i triArea = 
_mm_mullo_epi32(B2, A1); triArea = _mm_sub_epi32(triArea, _mm_mullo_epi32(B1, A2)); __m128 oneOverTriArea = _mm_div_ps(_mm_set1_ps(1.0f), _mm_cvtepi32_ps(triArea)); Z[1] = _mm_mul_ps(_mm_sub_ps(Z[1], Z[0]), oneOverTriArea); Z[2] = _mm_mul_ps(_mm_sub_ps(Z[2], Z[0]), oneOverTriArea); // Use bounding box traversal strategy to determine which pixels to rasterize __m128i startX = _mm_and_si128(Max(Min(Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(tileStartX)), _mm_set1_epi32(0xFFFFFFFE)); __m128i endX = Min(_mm_add_epi32(Max(Max(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(1)), _mm_set1_epi32(tileEndX)); __m128i startY = _mm_and_si128(Max(Min(Min(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(tileStartY)), _mm_set1_epi32(0xFFFFFFFE)); __m128i endY = Min(_mm_add_epi32(Max(Max(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(1)), _mm_set1_epi32(tileEndY)); // Now we have 4 triangles set up. Rasterize them each individually. for(int lane=0; lane < numSimdTris; lane++) { // Extract this triangle's properties from the SIMD versions __m128 zz[3]; for(int vv = 0; vv < 3; vv++) { zz[vv] = _mm_set1_ps(Z[vv].m128_f32[lane]); } int startXx = startX.m128i_i32[lane]; int endXx = endX.m128i_i32[lane]; int startYy = startY.m128i_i32[lane]; int endYy = endY.m128i_i32[lane]; __m128i aa0 = _mm_set1_epi32(A0.m128i_i32[lane]); __m128i aa1 = _mm_set1_epi32(A1.m128i_i32[lane]); __m128i aa2 = _mm_set1_epi32(A2.m128i_i32[lane]); __m128i bb0 = _mm_set1_epi32(B0.m128i_i32[lane]); __m128i bb1 = _mm_set1_epi32(B1.m128i_i32[lane]); __m128i bb2 = _mm_set1_epi32(B2.m128i_i32[lane]); __m128i aa0Inc = _mm_slli_epi32(aa0, 1); __m128i aa1Inc = _mm_slli_epi32(aa1, 1); __m128i aa2Inc = _mm_slli_epi32(aa2, 1); __m128i row, col; // Tranverse pixels in 2x2 blocks and store 2x2 pixel quad depths contiguously in memory ==> 2*X // This method provides better perfromance int rowIdx = (startYy * SCREENW + 2 * startXx); col = _mm_add_epi32(colOffset, _mm_set1_epi32(startXx)); __m128i aa0Col = 
_mm_mullo_epi32(aa0, col); __m128i aa1Col = _mm_mullo_epi32(aa1, col); __m128i aa2Col = _mm_mullo_epi32(aa2, col); row = _mm_add_epi32(rowOffset, _mm_set1_epi32(startYy)); __m128i bb0Row = _mm_add_epi32(_mm_mullo_epi32(bb0, row), _mm_set1_epi32(C0.m128i_i32[lane])); __m128i bb1Row = _mm_add_epi32(_mm_mullo_epi32(bb1, row), _mm_set1_epi32(C1.m128i_i32[lane])); __m128i bb2Row = _mm_add_epi32(_mm_mullo_epi32(bb2, row), _mm_set1_epi32(C2.m128i_i32[lane])); __m128i sum0Row = _mm_add_epi32(aa0Col, bb0Row); __m128i sum1Row = _mm_add_epi32(aa1Col, bb1Row); __m128i sum2Row = _mm_add_epi32(aa2Col, bb2Row); __m128i bb0Inc = _mm_slli_epi32(bb0, 1); __m128i bb1Inc = _mm_slli_epi32(bb1, 1); __m128i bb2Inc = _mm_slli_epi32(bb2, 1); __m128 zx = _mm_mul_ps(_mm_cvtepi32_ps(aa1Inc), zz[1]); zx = _mm_add_ps(zx, _mm_mul_ps(_mm_cvtepi32_ps(aa2Inc), zz[2])); // Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY) for(int r = startYy; r < endYy; r += 2, rowIdx += 2 * SCREENW, sum0Row = _mm_add_epi32(sum0Row, bb0Inc), sum1Row = _mm_add_epi32(sum1Row, bb1Inc), sum2Row = _mm_add_epi32(sum2Row, bb2Inc)) { // Compute barycentric coordinates int index = rowIdx; __m128i alpha = sum0Row; __m128i beta = sum1Row; __m128i gama = sum2Row; //Compute barycentric-interpolated depth __m128 depth = zz[0]; depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(beta), zz[1])); depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(gama), zz[2])); for(int c = startXx; c < endXx; c += 2, index += 4, alpha = _mm_add_epi32(alpha, aa0Inc), beta = _mm_add_epi32(beta, aa1Inc), gama = _mm_add_epi32(gama, aa2Inc), depth = _mm_add_ps(depth, zx)) { //Test Pixel inside triangle __m128i mask = _mm_or_si128(_mm_or_si128(alpha, beta), gama); __m128 previousDepthValue = _mm_load_ps(&pDepthBuffer[index]); __m128 mergedDepth = _mm_max_ps(depth, previousDepthValue); __m128 finalDepth = _mm_blendv_ps(mergedDepth, previousDepthValue, _mm_castsi128_ps(mask)); 
_mm_store_ps(&pDepthBuffer[index], finalDepth); }//for each column }// for each row }// for each triangle }// for each set of SIMD# triangles }