//------------------------------------------------------------------- // blend void effect(float *pBuffer[], const Cbmp *bmp, const float weight) { size_t width = bmp->getWidth(); size_t height = bmp->getAbsHeight(); const float fmul0 = weight; const float fmul1 = 1.0f - weight; __m256 weight0 = _mm256_broadcast_ss(&fmul0); __m256 weight1 = _mm256_broadcast_ss(&fmul1); float *pF[2]; pF[0] = pBuffer[0]; pF[1] = pBuffer[1]; for (size_t y = 0; y < height; y++) { for (size_t x = 0; x < width; x += 8, pF[0] += 8, pF[1] += 8) { __m256 p0 = _mm256_load_ps(pF[0]); p0 = _mm256_mul_ps(p0, weight0); __m256 p1 = _mm256_load_ps(pF[1]); p1 = _mm256_mul_ps(p1, weight1); __m256 r = _mm256_add_ps(p0, p1); _mm256_store_ps(pF[0], r); } } }
//-------------------------------------------------------------------------- // amp AVX static void effectAvx(float fData[], const float amp, const size_t length) { __m256 psAmp = _mm256_broadcast_ss(&); __m256 *pIn = (__m256 *)fData; for (size_t i = 0; i < length; i += 8, pIn++) { __m256 a = _mm256_mul_ps(*pIn, psAmp); _mm256_store_ps(&fData[i], a); } }
static void dct_1d_general(float* in_data, float* out_data, float lookup[64]) { __m256 current, dct_values, multiplied, sum; current = _mm256_broadcast_ss(in_data); dct_values = _mm256_load_ps(lookup); multiplied = _mm256_mul_ps(dct_values, current); sum = multiplied; // Broadcasts a single float (scalar) to every element in 'current'. current = _mm256_broadcast_ss(in_data + 1); // Loads DCT values from the lookup table. iDCT uses a transposed lookup table here. dct_values = _mm256_load_ps(lookup + 8); // Vertically multiply the scalar with the DCT values. multiplied = _mm256_mul_ps(dct_values, current); // Vertically add to the previous sum. sum = _mm256_add_ps(sum, multiplied); current = _mm256_broadcast_ss(in_data + 2); dct_values = _mm256_load_ps(lookup + 16); multiplied = _mm256_mul_ps(dct_values, current); sum = _mm256_add_ps(sum, multiplied); current = _mm256_broadcast_ss(in_data + 3); dct_values = _mm256_load_ps(lookup + 24); multiplied = _mm256_mul_ps(dct_values, current); sum = _mm256_add_ps(sum, multiplied); current = _mm256_broadcast_ss(in_data + 4); dct_values = _mm256_load_ps(lookup + 32); multiplied = _mm256_mul_ps(dct_values, current); sum = _mm256_add_ps(sum, multiplied); current = _mm256_broadcast_ss(in_data + 5); dct_values = _mm256_load_ps(lookup + 40); multiplied = _mm256_mul_ps(dct_values, current); sum = _mm256_add_ps(sum, multiplied); current = _mm256_broadcast_ss(in_data + 6); dct_values = _mm256_load_ps(lookup + 48); multiplied = _mm256_mul_ps(dct_values, current); sum = _mm256_add_ps(sum, multiplied); current = _mm256_broadcast_ss(in_data + 7); dct_values = _mm256_load_ps(lookup + 56); multiplied = _mm256_mul_ps(dct_values, current); sum = _mm256_add_ps(sum, multiplied); _mm256_store_ps(out_data, sum); }
// Clang IR-codegen regression test: verifies that _mm256_broadcast_ss is
// lowered to a scalar load plus eight insertelement ops (one per lane).
// The CHECK lines are consumed by FileCheck and must not be altered.
__m256 test_mm256_broadcast_ss(float const *__a) {
  // CHECK-LABEL: @test_mm256_broadcast_ss
  // CHECK: insertelement <8 x float> {{.*}}, i32 0
  // CHECK: insertelement <8 x float> {{.*}}, i32 1
  // CHECK: insertelement <8 x float> {{.*}}, i32 2
  // CHECK: insertelement <8 x float> {{.*}}, i32 3
  // CHECK: insertelement <8 x float> {{.*}}, i32 4
  // CHECK: insertelement <8 x float> {{.*}}, i32 5
  // CHECK: insertelement <8 x float> {{.*}}, i32 6
  // CHECK: insertelement <8 x float> {{.*}}, i32 7
  return _mm256_broadcast_ss(__a);
}
// Renders a w×h escape-time (Mandelbrot-style) plot to the curses screen,
// computing 8 columns per AVX iteration. Pixel (i, j) maps to the complex
// point (x1 + i*dx, y1 + j*dy); x2/y2 are accepted but unused here.
// `max_iter` bounds the iteration count; w must be a multiple of 8.
void plot(u32 w, u32 h, float x1, float y1, float x2, float y2,
          float dx, float dy, u32 max_iter = 4096)
{
  assert(w % 8 == 0);
  // AVX Constants
  float const constants[] { x1, y1, dx, dy, 1.0f, 4.0f };
  __m256 const vx1 = _mm256_broadcast_ss(constants);
  __m256 const vy1 = _mm256_broadcast_ss(constants + 1);
  __m256 const vdx = _mm256_broadcast_ss(constants + 2);
  __m256 const vdy = _mm256_broadcast_ss(constants + 3);
  __m256 const v1  = _mm256_broadcast_ss(constants + 4);
  __m256 const v4  = _mm256_broadcast_ss(constants + 5);
  // Start timing
  std::chrono::time_point<std::chrono::high_resolution_clock> t1, t2;
  std::chrono::duration<double> dt;
  t1 = std::chrono::high_resolution_clock::now();
  // Zero line counter
  __m256 vj = _mm256_xor_ps(v1, v1);
  for (u32 j = 0; j < h; j++) {
    for (u32 i = 0; i < w; i += 8) {
      // Fill column counter with i, i+1, ..., i+7.
      // BUG FIX: _mm256_load_ps requires 32-byte alignment, which a plain
      // stack float array does not guarantee — align it explicitly.
      alignas(32) float const vi_[8] { i+0.f, i+1.f, i+2.f, i+3.f,
                                       i+4.f, i+5.f, i+6.f, i+7.f };
      __m256 vi = _mm256_load_ps(vi_);
      // Compute start point c = (x0, y0) for each lane
      __m256 vx0 = _mm256_mul_ps(vi, vdx);
      vx0 = _mm256_add_ps(vx0, vx1);
      __m256 vy0 = _mm256_mul_ps(vj, vdy);
      vy0 = _mm256_add_ps(vy0, vy1);
      __m256 vx = vx0;
      __m256 vy = vy0;
      __m256 vcount = _mm256_xor_ps(v1, v1); // Zero iteration counter
      u32 iter = 0;
      u8 no_overflow = 0;
      do {
        // Compute products
        __m256 vxx = _mm256_mul_ps(vx, vx);
        __m256 vyy = _mm256_mul_ps(vy, vy);
        // Check termination condition: lane stays active while |z|^2 < 4
        __m256 vtmp = _mm256_add_ps(vxx, vyy);
        vtmp = _mm256_cmp_ps(vtmp, v4, _CMP_LT_OQ);
        no_overflow = _mm256_movemask_ps(vtmp) & 0xff;
        // Accumulate iteration counter (adds 1.0 only in active lanes)
        vtmp = _mm256_and_ps(vtmp, v1);
        vcount = _mm256_add_ps(vcount, vtmp);
        // Step: z = z^2 + c  (2xy + y0, x^2 - y^2 + x0)
        vtmp = _mm256_mul_ps(vx, vy);
        vtmp = _mm256_add_ps(vtmp, vtmp);
        vy = _mm256_add_ps(vtmp, vy0);
        vtmp = _mm256_sub_ps(vxx, vyy);
        vx = _mm256_add_ps(vtmp, vx0);
        ++iter;
      } while (no_overflow && (iter < max_iter));
      // BUG FIX: read lanes through an explicit store instead of
      // type-punning &vcount with (float*), which violates strict aliasing.
      alignas(32) float counts[8];
      _mm256_store_ps(counts, vcount);
      for (u32 k = 0; k < 8; k++) {
        u32 n = counts[k] + 0.5f;           // round lane count to nearest int
        if (n == max_iter) n = 0;           // interior points render blank
        char c = ' ';
        if (n > 0) {
          static char const charset[] = ".,c8M@jawrpogOQEPGJ";
          c = charset[n % (sizeof(charset) - 1)];
        }
        attron(COLOR_PAIR((n % 7) + 1));
        addch(c);
        attroff(COLOR_PAIR((n % 7) + 1));
        if (i + k + 1 == w) addch('\n');
      }
    }
    // Increment line counter
    vj = _mm256_add_ps(vj, v1);
  }
  // End timing
  t2 = std::chrono::high_resolution_clock::now();
  dt = t2 - t1;
  std::string info = std::to_string(dt.count() * 1000.0) + "ms";
  attron(COLOR_PAIR(1));
  // BUG FIX: never pass variable text as the printw format string.
  printw("%s", info.c_str());
  attroff(COLOR_PAIR(1));
}
// Triangular matrix-vector kernel over an 8-row panel stored column-major
// with leading dimension 8. Accumulates y_0 = A(:,0:kmax-1) * x, where the
// first 8 columns are masked by _mm256_blend_ps so each column j contributes
// only its top j+1 elements (upper-triangular leading block); remaining
// columns are dense. alg selects store (0), add (1) or subtract (-1) into y.
// NOTE(review): appears to be an HPMPC/BLASFEO-style kernel — confirm the
// exact contract against that library's documentation.
void kernel_strmv_u_n_8_lib8(int kmax, float *A, float *x, float *y, int alg)
	{
	if(kmax<=0)
		return;
	const int lda = 8;
	__builtin_prefetch( A + 0*lda );
	__builtin_prefetch( A + 2*lda );
	int k;
	// Four independent accumulators hide the add latency; reduced at the end.
	__m256
		zeros, ax_temp,
		a_00, a_01, a_02, a_03,
		x_0, x_1, x_2, x_3,
		y_0, y_0_b, y_0_c, y_0_d, z_0;
	zeros = _mm256_setzero_ps();
	y_0 = _mm256_setzero_ps();
	y_0_b = _mm256_setzero_ps();
	y_0_c = _mm256_setzero_ps();
	y_0_d = _mm256_setzero_ps();
	__builtin_prefetch( A + 4*lda );
	__builtin_prefetch( A + 6*lda );
	// Columns 0..3 of the triangular block: blend masks 0x01/0x03/0x07/0x0f
	// keep only the top 1..4 lanes of each column.
	a_00 = _mm256_load_ps( &A[0+lda*0] );
	x_0 = _mm256_broadcast_ss( &x[0] );
	x_0 = _mm256_blend_ps( zeros, x_0, 0x01 );
	ax_temp = _mm256_mul_ps( a_00, x_0 );
	y_0 = _mm256_add_ps( y_0, ax_temp );
	a_01 = _mm256_load_ps( &A[0+lda*1] );
	x_1 = _mm256_broadcast_ss( &x[1] );
	x_1 = _mm256_blend_ps( zeros, x_1, 0x03 );
	ax_temp = _mm256_mul_ps( a_01, x_1 );
	y_0_b = _mm256_add_ps( y_0_b, ax_temp );
	a_02 = _mm256_load_ps( &A[0+lda*2] );
	x_2 = _mm256_broadcast_ss( &x[2] );
	x_2 = _mm256_blend_ps( zeros, x_2, 0x07 );
	ax_temp = _mm256_mul_ps( a_02, x_2 );
	y_0_c = _mm256_add_ps( y_0_c, ax_temp );
	a_03 = _mm256_load_ps( &A[0+lda*3] );
	x_3 = _mm256_broadcast_ss( &x[3] );
	x_3 = _mm256_blend_ps( zeros, x_3, 0x0f );
	ax_temp = _mm256_mul_ps( a_03, x_3 );
	y_0_d = _mm256_add_ps( y_0_d, ax_temp );
	A += 4*lda;
	x += 4;
	__builtin_prefetch( A + 4*lda );
	__builtin_prefetch( A + 6*lda );
	// Columns 4..7: masks 0x1f/0x3f/0x7f keep the top 5..7 lanes;
	// column 7 is full (no blend).
	a_00 = _mm256_load_ps( &A[0+lda*0] );
	x_0 = _mm256_broadcast_ss( &x[0] );
	x_0 = _mm256_blend_ps( zeros, x_0, 0x1f );
	ax_temp = _mm256_mul_ps( a_00, x_0 );
	y_0 = _mm256_add_ps( y_0, ax_temp );
	a_01 = _mm256_load_ps( &A[0+lda*1] );
	x_1 = _mm256_broadcast_ss( &x[1] );
	x_1 = _mm256_blend_ps( zeros, x_1, 0x3f );
	ax_temp = _mm256_mul_ps( a_01, x_1 );
	y_0_b = _mm256_add_ps( y_0_b, ax_temp );
	a_02 = _mm256_load_ps( &A[0+lda*2] );
	x_2 = _mm256_broadcast_ss( &x[2] );
	x_2 = _mm256_blend_ps( zeros, x_2, 0x7f );
	ax_temp = _mm256_mul_ps( a_02, x_2 );
	y_0_c = _mm256_add_ps( y_0_c, ax_temp );
	a_03 = _mm256_load_ps( &A[0+lda*3] );
	x_3 = _mm256_broadcast_ss( &x[3] );
	ax_temp = _mm256_mul_ps( a_03, x_3 );
	y_0_d = _mm256_add_ps( y_0_d, ax_temp );
	A += 4*lda;
	x += 4;
	k=8;
	// Dense part: main loop unrolled by 8 columns (two groups of 4).
	for(; k<kmax-7; k+=8)
		{
		__builtin_prefetch( A + 4*lda );
		__builtin_prefetch( A + 6*lda );
		a_00 = _mm256_load_ps( &A[0+lda*0] );
		x_0 = _mm256_broadcast_ss( &x[0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );
		a_01 = _mm256_load_ps( &A[0+lda*1] );
		x_1 = _mm256_broadcast_ss( &x[1] );
		ax_temp = _mm256_mul_ps( a_01, x_1 );
		y_0_b = _mm256_add_ps( y_0_b, ax_temp );
		a_02 = _mm256_load_ps( &A[0+lda*2] );
		x_2 = _mm256_broadcast_ss( &x[2] );
		ax_temp = _mm256_mul_ps( a_02, x_2 );
		y_0_c = _mm256_add_ps( y_0_c, ax_temp );
		a_03 = _mm256_load_ps( &A[0+lda*3] );
		x_3 = _mm256_broadcast_ss( &x[3] );
		ax_temp = _mm256_mul_ps( a_03, x_3 );
		y_0_d = _mm256_add_ps( y_0_d, ax_temp );
		A += 4*lda;
		x += 4;
		__builtin_prefetch( A + 4*lda );
		__builtin_prefetch( A + 6*lda );
		a_00 = _mm256_load_ps( &A[0+lda*0] );
		x_0 = _mm256_broadcast_ss( &x[0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );
		a_01 = _mm256_load_ps( &A[0+lda*1] );
		x_1 = _mm256_broadcast_ss( &x[1] );
		ax_temp = _mm256_mul_ps( a_01, x_1 );
		y_0_b = _mm256_add_ps( y_0_b, ax_temp );
		a_02 = _mm256_load_ps( &A[0+lda*2] );
		x_2 = _mm256_broadcast_ss( &x[2] );
		ax_temp = _mm256_mul_ps( a_02, x_2 );
		y_0_c = _mm256_add_ps( y_0_c, ax_temp );
		a_03 = _mm256_load_ps( &A[0+lda*3] );
		x_3 = _mm256_broadcast_ss( &x[3] );
		ax_temp = _mm256_mul_ps( a_03, x_3 );
		y_0_d = _mm256_add_ps( y_0_d, ax_temp );
		A += 4*lda;
		x += 4;
		}
	// Remainder loop: 4 columns at a time.
	for(; k<kmax-3; k+=4)
		{
		__builtin_prefetch( A + 4*lda );
		__builtin_prefetch( A + 6*lda );
		a_00 = _mm256_load_ps( &A[0+lda*0] );
		x_0 = _mm256_broadcast_ss( &x[0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );
		a_01 = _mm256_load_ps( &A[0+lda*1] );
		x_1 = _mm256_broadcast_ss( &x[1] );
		ax_temp = _mm256_mul_ps( a_01, x_1 );
		y_0_b = _mm256_add_ps( y_0_b, ax_temp );
		a_02 = _mm256_load_ps( &A[0+lda*2] );
		x_2 = _mm256_broadcast_ss( &x[2] );
		ax_temp = _mm256_mul_ps( a_02, x_2 );
		y_0_c = _mm256_add_ps( y_0_c, ax_temp );
		a_03 = _mm256_load_ps( &A[0+lda*3] );
		x_3 = _mm256_broadcast_ss( &x[3] );
		ax_temp = _mm256_mul_ps( a_03, x_3 );
		y_0_d = _mm256_add_ps( y_0_d, ax_temp );
		A += 4*lda;
		x += 4;
		}
	// First reduction stage: fold the c/d accumulators into the a/b pair.
	y_0 = _mm256_add_ps( y_0 , y_0_c );
	y_0_b = _mm256_add_ps( y_0_b, y_0_d );
	// Tail: 2 leftover columns.
	if(kmax%4>=2)
		{
		a_00 = _mm256_load_ps( &A[0+lda*0] );
		x_0 = _mm256_broadcast_ss( &x[0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );
		a_01 = _mm256_load_ps( &A[0+lda*1] );
		x_1 = _mm256_broadcast_ss( &x[1] );
		ax_temp = _mm256_mul_ps( a_01, x_1 );
		y_0_b = _mm256_add_ps( y_0_b, ax_temp );
		A += 2*lda;
		x += 2;
		}
	// Final accumulator merge.
	y_0 = _mm256_add_ps( y_0 , y_0_b );
	// Tail: 1 leftover column (no pointer advance needed — last use).
	if(kmax%2==1)
		{
		a_00 = _mm256_load_ps( &A[0+lda*0] );
		x_0 = _mm256_broadcast_ss( &x[0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );
		}
	// Write-back: alg==0 overwrite, alg==1 y += result, otherwise y -= result.
	if(alg==0)
		{
		_mm256_storeu_ps(&y[0], y_0);
		}
	else if(alg==1)
		{
		z_0 = _mm256_loadu_ps( &y[0] );
		z_0 = _mm256_add_ps( z_0, y_0 );
		_mm256_storeu_ps(&y[0], z_0);
		}
	else // alg==-1
		{
		z_0 = _mm256_loadu_ps( &y[0] );
		z_0 = _mm256_sub_ps( z_0, y_0 );
		_mm256_storeu_ps(&y[0], z_0);
		}
	}
/*---------------------------------------------------------------------------*/
// 8-wide ray/triangle intersection test using precomputed plane data
// (nu/nv/np, pu/pv, e0*/e1*) projected onto the two axes != ci.
// Returns the lanes of `mask` that hit with a positive ray parameter;
// writes t and barycentric u/v (via reciprocal of det) into *result.
// NOTE(review): looks like a Wald-style projection triangle test — confirm
// against the precomputation code that fills nu/np/pu/e0u etc.
// Relies on the GCC/Clang vector extension operators (*, +, -, |, >) on
// __m256; `>` yields an all-ones/all-zeros lane mask.
__m256 TTriangle::THit::HitTest8(__m256 mask, const TPoint8& orig, const D3DXVECTOR3& d, HitResult8* result) const
{
	// ci selects the projection axis; u/v are the other two axes.
	int u, v, w;
	w = ci;
	u = w == 0 ? 1 : 0;
	v = w == 2 ? 1 : 2;
	// Broadcast the triangle's precomputed scalar constants to all 8 lanes.
	__m256 nu = _mm256_broadcast_ss(&this->nu);
	__m256 np = _mm256_broadcast_ss(&this->np);
	__m256 nv = _mm256_broadcast_ss(&this->nv);
	__m256 pu = _mm256_broadcast_ss(&this->pu);
	__m256 pv = _mm256_broadcast_ss(&this->pv);
	__m256 e0u = _mm256_broadcast_ss(&this->e0u);
	__m256 e0v = _mm256_broadcast_ss(&this->e0v);
	__m256 e1u = _mm256_broadcast_ss(&this->e1u);
	__m256 e1v = _mm256_broadcast_ss(&this->e1v);
	// Ray origins are already 8-wide; the direction is shared by all lanes.
	__m256 ou = orig[u];
	__m256 ov = orig[v];
	__m256 ow = orig[w];
	__m256 du = _mm256_broadcast_ss(&d[u]);
	__m256 dv = _mm256_broadcast_ss(&d[v]);
	__m256 dw = _mm256_broadcast_ss(&d[w]);
	// Plane-intersection determinants (Cramer's-rule style).
	__m256 dett = np -(ou*nu+ov*nv+ow);
	__m256 det = du*nu+dv*nv+dw;
	__m256 Du = du*dett - (pu-ou)*det;
	__m256 Dv = dv*dett - (pv-ov)*det;
	__m256 detu = (e1v*Du - e1u*Dv);
	__m256 detv = (e0u*Dv - e0v*Du);
	__m256 tmpdet0 = det - detu - detv;
	// Inside test via sign agreement of det, detu, detv (xor of sign bits,
	// flipped against g_one8, then compared > 0).
	__m256 detMask = _mm256_xor_ps(_mm256_xor_ps(tmpdet0, detv) | _mm256_xor_ps(detv, detu), g_one8) > _mm256_setzero_ps();
	mask = mask & detMask;
	// rcp_ps is an approximation (~12-bit) — t/u/v are correspondingly approximate.
	__m256 rdet = _mm256_rcp_ps(det);
	result->t = dett * rdet;
	result->u = detu * rdet;
	result->v = detv * rdet;
	// Only keep hits in front of the ray origin.
	return mask & (result->t > _mm256_setzero_ps());
}
float tricub_x86_f(float *src, float *abcd, float x, float y){ float *s; float x0, x1, x2, x3, y0, y1, y2, y3; float dst[4]; #if defined(__AVX2__) && defined(__x86_64__) __m256 v1, v2, v3, v4; __m256 va, vb, vc, vd; __m128 va4, vb4, vc4, vd4; __m128 v128a, v128b; __m128 vy0, vy1, vy2, vy3; #else int i, ni2, ni3, ninj2, ninj3; float va4[4], vb4[4], vc4[4], vd4[4]; ninj2 = ninj + ninj; ninj3 = ninj2 + ninj; ni2 = ni + ni; ni3 = ni2 + ni; #endif #if defined(__AVX2__) && defined(__x86_64__) // ==== interpolation along Z, vector length is 16 (2 vectors of length 8 per plane) ==== va = _mm256_broadcast_ss(abcd); // promote constants to vectors vb = _mm256_broadcast_ss(abcd+1); vc = _mm256_broadcast_ss(abcd+2); vd = _mm256_broadcast_ss(abcd+3); s = src; // rows 0 and 1, 4 planes (Z0, Z1, Z2, Z3) v128a = _mm_loadu_ps(s); // Z0 row 0 v1 = _mm256_insertf128_ps(v1,v128a,0); v128b = _mm_loadu_ps(s+ni); // Z0 row 1 v1 = _mm256_insertf128_ps(v1,v128b,1); v1 = _mm256_mul_ps(v1,va); // v1 = v1*va s += ninj; v128a = _mm_loadu_ps(s); // Z1 row 0 v2 = _mm256_insertf128_ps(v2,v128a,0); v128b = _mm_loadu_ps(s+ni); // Z1 row 1 v2 = _mm256_insertf128_ps(v2,v128b,1); v1 = _mm256_fmadd_ps(v2,vb,v1); // v1 += v2*vb s += ninj; v128a = _mm_loadu_ps(s); // Z2 row 0 v3 = _mm256_insertf128_ps(v3,v128a,0); v128b = _mm_loadu_ps(s+ni); // Z2 row 1 v3 = _mm256_insertf128_ps(v3,v128b,1); v1 = _mm256_fmadd_ps(v3,vc,v1); // v1 += v3*vc s += ninj; v128a = _mm_loadu_ps(s); // Z3 row 0 v4 = _mm256_insertf128_ps(v4,v128a,0); v128b = _mm_loadu_ps(s+ni); // Z3 row 1 v4 = _mm256_insertf128_ps(v4,v128b,1); v1 = _mm256_fmadd_ps(v4,vd,v1); // v1 += v4*vd // split vector of length 8 into 2 vectors of length 4 vy0 = _mm256_extractf128_ps(v1,0);// Y0 : row 0 (v1 low) vy1 = _mm256_extractf128_ps(v1,1);// Y1 : row 1 (v1 high) s = src + 2*ni; // rows 2 and 3, 4 planes (Z0, Z1, Z2, Z3) v128a = _mm_loadu_ps(s); // Z0 row 2 v1 = _mm256_insertf128_ps(v1,v128a,0); v128b = _mm_loadu_ps(s+ni); // Z0 row 3 v1 = 
_mm256_insertf128_ps(v1,v128b,1); v1 = _mm256_mul_ps(v1,va); // v1 = v1*va s += ninj; v128a = _mm_loadu_ps(s); // Z1 row 2 v2 = _mm256_insertf128_ps(v2,v128a,0); v128b = _mm_loadu_ps(s+ni); // Z1 row 3 v2 = _mm256_insertf128_ps(v2,v128b,1); v1 = _mm256_fmadd_ps(v2,vb,v1); // v1 += v2*vb s += ninj; v128a = _mm_loadu_ps(s); // Z2 row 2 v3 = _mm256_insertf128_ps(v3,v128a,0); v128b = _mm_loadu_ps(s+ni); // Z2 row 3 v3 = _mm256_insertf128_ps(v3,v128b,1); v1 = _mm256_fmadd_ps(v3,vc,v1); // v1 += v3*vc s += ninj; v128a = _mm_loadu_ps(s); // Z3 row 2 v4 = _mm256_insertf128_ps(v4,v128a,0); v128b = _mm_loadu_ps(s+ni); // Z3 row 3 v4 = _mm256_insertf128_ps(v4,v128b,1); v1 = _mm256_fmadd_ps(v4,vd,v1); // v1 += v4*vd // split vector of length 8 into 2 vectors of length 4 vy2 = _mm256_extractf128_ps(v1,0);// Y2 : row 2 (v1 low) vy3 = _mm256_extractf128_ps(v1,1);// Y3 : row 3 (v1 high) // ==== interpolation along Y, vector length is 4 (4 rows) ==== y0 = cm167*y*(y-one)*(y-two); y1 = cp5*(y+one)*(y-one)*(y-two); y2 = cm5*y*(y+one)*(y-two); y3 = cp167*y*(y+one)*(y-one); va4 = _mm_broadcast_ss(&y0); // promote constants to vectors vb4 = _mm_broadcast_ss(&y1); vc4 = _mm_broadcast_ss(&y2); vd4 = _mm_broadcast_ss(&y3); vy0 = _mm_mul_ps(vy0,va4); // vy0 * va4 vy0 = _mm_fmadd_ps(vy1,vb4,vy0); // += vy1 * vb4 vy0 = _mm_fmadd_ps(vy2,vc4,vy0); // += vy2 * vc4 vy0 = _mm_fmadd_ps(vy3,vd4,vy0); // += vy3 * vd4 _mm_storeu_ps(dst,vy0); // store 4 values along X #else y0 = cm167*y*(y-one)*(y-two); y1 = cp5*(y+one)*(y-one)*(y-two); y2 = cm5*y*(y+one)*(y-two); y3 = cp167*y*(y+one)*(y-one); for (i=0 ; i<4 ; i++){ va4[i] = src[i ]*abcd[0] + src[i +ninj]*abcd[1] + src[i +ninj2]*abcd[2] + src[i +ninj3]*abcd[3]; vb4[i] = src[i+ni ]*abcd[0] + src[i+ni +ninj]*abcd[1] + src[i+ni +ninj2]*abcd[2] + src[i+ni +ninj3]*abcd[3]; vc4[i] = src[i+ni2]*abcd[0] + src[i+ni2+ninj]*abcd[1] + src[i+ni2+ninj2]*abcd[2] + src[i+ni2+ninj3]*abcd[3]; vd4[i] = src[i+ni3]*abcd[0] + src[i+ni3+ninj]*abcd[1] + 
src[i+ni3+ninj2]*abcd[2] + src[i+ni3+ninj3]*abcd[3]; dst[i] = va4[i]*y0 + vb4[i]*y1 + vc4[i]*y2 + vd4[i]*y3; } #endif // ==== interpolation along x, scalar ==== x0 = cm167*x*(x-one)*(x-two); x1 = cp5*(x+one)*(x-one)*(x-two); x2 = cm5*x*(x+one)*(x-two); x3 = cp167*x*(x+one)*(x-one); return(dst[0]*x0 + dst[1]*x1 + dst[2]*x2 + dst[3]*x3); }
// Forward 8x8 block DCT with quantization over a width×height image.
// Writes quantized int32 coefficients, block-by-block (64 per block), to
// `output`. Parallelized over block rows with OpenMP; each thread keeps
// private AVX accumulators.
// NOTE(review): `0.5d` is a non-standard literal suffix, and
// `_mm256_storeu_si256` is called with an int32_t* instead of a cast
// __m256i* — both rely on lax compiler settings; confirm the build flags.
void run_dct(int width, int height, float *quant, float *input, int32_t *output)
{
  float acosvals[8][8];
  /* Calculating cosines is expensive, and there
   * are only 64 cosines that need to be calculated
   * so precompute them and cache. */
  for (int i = 0; i < 8; i++) {
    for (int j = 0; j < 8; j++) {
      if (j == 0) {
        // DC basis row gets the orthonormalization factor sqrt(1/8).
        acosvals[i][j] = sqrt(1.0 / 8.0) * cos(PI / 8.0 * (i + 0.5d) * j);
      } else {
        acosvals[i][j] = 0.5 * cos(PI / 8.0 * (i + 0.5d) * j);
      }
    }
  }
  /* Separate the parallel from the for, so each processor gets its
   * own copy of the buffers and variables. */
#pragma omp parallel
  {
    float avload[8] = {0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5};
    avload[0] = sqrt(1.0 / 8.0);
    __m256 row0, row1, row2, row3, row4, row5, row6, row7;
    __m256 loaderlow, loaderhigh;
    __m256 temp;
    __m256 minus128 = _mm256_set1_ps(-128.0);  // level shift: pixel - 128
    __m256 avxcosloader, avxcos;
    float avxcosmover;
    __m256i integer;
    /* The DCT breaks the image into 8 by 8 blocks and then
     * transforms them into color frequencies. */
#pragma omp for
    for (int brow = 0; brow < height / 8; brow++) {
      for (int bcol = 0; bcol < width / 8; bcol++) {
        int head_pointer = bcol * 8 + brow * 8 * width;  // top-left pixel of this block
        row0 = _mm256_setzero_ps();
        row1 = _mm256_setzero_ps();
        row2 = _mm256_setzero_ps();
        row3 = _mm256_setzero_ps();
        row4 = _mm256_setzero_ps();
        row5 = _mm256_setzero_ps();
        row6 = _mm256_setzero_ps();
        row7 = _mm256_setzero_ps();
        /* This pair of loops uses AVX instuctions to add the frequency
         * component from each pixel to all of the buckets at once. Allows
         * us to do the DCT on a block in 64 iterations of a loop rather
         * than 64 iterations of 64 iterations of a loop (all 64 pixels affect
         * all 64 frequencies) */
        // Rows y and 7-y are processed together: even frequency rows take
        // their sum (add/add), odd rows their difference (add/sub) — the
        // cosine basis' vertical symmetry.
        for (int x = 0; x < 8; x++) {
          for (int y = 0; y < 4; y++) {
            loaderlow = _mm256_broadcast_ss(&input[head_pointer + x + (y * width)]);
            loaderlow = _mm256_add_ps(loaderlow, minus128);
            loaderhigh = _mm256_broadcast_ss(&input[head_pointer + x + ((7 - y) * width)]);
            loaderhigh = _mm256_add_ps(loaderhigh, minus128);
            // Horizontal basis row for column x.
            avxcos = _mm256_loadu_ps(&acosvals[x][0]);
            loaderlow = _mm256_mul_ps(loaderlow, avxcos);
            loaderhigh = _mm256_mul_ps(loaderhigh, avxcos);
            // Vertical basis values for row y; each lane is peeled off
            // (GCC/Clang vector subscript extension) and re-broadcast.
            avxcosloader = _mm256_loadu_ps(&acosvals[y][0]);
            avxcosmover = avxcosloader[0];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row0 = _mm256_add_ps(row0, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row0 = _mm256_add_ps(row0, temp);
            avxcosmover = avxcosloader[1];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row1 = _mm256_add_ps(row1, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row1 = _mm256_sub_ps(row1, temp);
            avxcosmover = avxcosloader[2];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row2 = _mm256_add_ps(row2, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row2 = _mm256_add_ps(row2, temp);
            avxcosmover = avxcosloader[3];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row3 = _mm256_add_ps(row3, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row3 = _mm256_sub_ps(row3, temp);
            avxcosmover = avxcosloader[4];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row4 = _mm256_add_ps(row4, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row4 = _mm256_add_ps(row4, temp);
            avxcosmover = avxcosloader[5];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row5 = _mm256_add_ps(row5, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row5 = _mm256_sub_ps(row5, temp);
            avxcosmover = avxcosloader[6];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row6 = _mm256_add_ps(row6, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row6 = _mm256_add_ps(row6, temp);
            avxcosmover = avxcosloader[7];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row7 = _mm256_add_ps(row7, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row7 = _mm256_sub_ps(row7, temp);
          }
        }
        /* Each frequency stored as a float needs to be divided by
         * the quantization value, then rounded to the nearest integer.
         * Also changes the order of the values from pixel order to
         * each 8 by 8 block stored one after another. */
        temp = _mm256_loadu_ps(&quant[0]);
        row0 = _mm256_div_ps(row0, temp);
        row0 = _mm256_round_ps(row0, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row0);
        _mm256_storeu_si256(output + (bcol + brow * (width / 8)) * 64, integer);
        temp = _mm256_loadu_ps(&quant[8]);
        row1 = _mm256_div_ps(row1, temp);
        row1 = _mm256_round_ps(row1, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row1);
        _mm256_storeu_si256(output + 8 + (bcol + brow * (width / 8)) * 64, integer);
        temp = _mm256_loadu_ps(&quant[16]);
        row2 = _mm256_div_ps(row2, temp);
        row2 = _mm256_round_ps(row2, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row2);
        _mm256_storeu_si256(output + 16 + (bcol + brow * (width / 8)) * 64, integer);
        temp = _mm256_loadu_ps(&quant[24]);
        row3 = _mm256_div_ps(row3, temp);
        row3 = _mm256_round_ps(row3, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row3);
        _mm256_storeu_si256(output + 24 + (bcol + brow * (width / 8)) * 64, integer);
        temp = _mm256_loadu_ps(&quant[32]);
        row4 = _mm256_div_ps(row4, temp);
        row4 = _mm256_round_ps(row4, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row4);
        _mm256_storeu_si256(output + 32 + (bcol + brow * (width / 8)) * 64, integer);
        temp = _mm256_loadu_ps(&quant[40]);
        row5 = _mm256_div_ps(row5, temp);
        row5 = _mm256_round_ps(row5, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row5);
        _mm256_storeu_si256(output + 40 + (bcol + brow * (width / 8)) * 64, integer);
        temp = _mm256_loadu_ps(&quant[48]);
        row6 = _mm256_div_ps(row6, temp);
        row6 = _mm256_round_ps(row6, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row6);
        _mm256_storeu_si256(output + 48 + (bcol + brow * (width / 8)) * 64, integer);
        temp = _mm256_loadu_ps(&quant[56]);
        row7 = _mm256_div_ps(row7, temp);
        row7 = _mm256_round_ps(row7, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row7);
        _mm256_storeu_si256(output + 56 + (bcol + brow * (width / 8)) * 64, integer);
      }
    }
  }
}
// Broadcast constructor: replicates the 32-bit integer `a` into all 8 lanes
// by reinterpreting its bits as a float, using the float broadcast, and
// bit-casting the result back to an integer vector (no value conversion).
INLINE avxi( const int& a ) : m256(_mm256_castps_si256(_mm256_broadcast_ss((const float*)&a))) {}
// Broadcasts the 32-bit integer at *ptr into all 8 lanes. Goes through the
// float broadcast plus a bit-cast, so the bit pattern is preserved exactly.
INLINE const avxi broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); }
// Symmetric matrix-vector kernel over a 4-column slab of a panel-major
// matrix (panel height 8, leading dimension lda=8, panel stride sda*lda).
// Simultaneously accumulates the "normal" product into y_n (using the
// broadcast x_n[0..3]) and the "transposed" product into y_t[0..3]
// (dot products against x_t), exploiting symmetry to touch A only once.
// tri==1 handles the diagonal corner with triangular blend masks; kna is
// the initial unaligned fringe; alg==-1 negates the x_n contribution and
// subtracts the y_t result, alg==1 adds.
// NOTE(review): appears to be an HPMPC/BLASFEO-style kernel — confirm the
// exact panel layout contract against that library's documentation.
void kernel_ssymv_4_lib8(int kmax, int kna, float *A, int sda, float *x_n, float *y_n, float *x_t, float *y_t, int tri, int alg)
	{
	if(kmax<=0)
		return;
	const int lda = 8;
	__builtin_prefetch( A + 0*lda );
	__builtin_prefetch( A + 2*lda );
	int k, k_left, ii;
	float k_left_d;
	// mask_f vs (8 - k_left) drives blendv: lanes >= k_left get zeroed.
	const float mask_f[] = {7.5, 6.5, 5.5, 4.5, 3.5, 2.5, 1.5, 0.5};
	float temp_space[8] = {};
	__m256
		mask,
		zeros, temp,
		a_00, a_01, a_02, a_03,
		x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
		x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
	mask = _mm256_loadu_ps( mask_f );
	zeros = _mm256_setzero_ps();
	// Broadcast the four x_n scalars once; negate them for alg==-1.
	x_n_0 = _mm256_broadcast_ss( &x_n[0] );
	x_n_1 = _mm256_broadcast_ss( &x_n[1] );
	x_n_2 = _mm256_broadcast_ss( &x_n[2] );
	x_n_3 = _mm256_broadcast_ss( &x_n[3] );
	if(alg==-1) // TODO xor
		{
		x_n_0 = _mm256_sub_ps( zeros, x_n_0 );
		x_n_1 = _mm256_sub_ps( zeros, x_n_1 );
		x_n_2 = _mm256_sub_ps( zeros, x_n_2 );
		x_n_3 = _mm256_sub_ps( zeros, x_n_3 );
		}
	y_t_0 = _mm256_setzero_ps();
	y_t_1 = _mm256_setzero_ps();
	y_t_2 = _mm256_setzero_ps();
	y_t_3 = _mm256_setzero_ps();
	k=0;
	// corner: triangular diagonal block; blend masks exclude the diagonal
	// and above for the normal part, include them for the transposed part.
	if(tri==1)
		{
		k_left = kna-k;
		k_left_d = 8.0 - k_left;
		// Zero the x_t lanes beyond the fringe.
		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		a_00 = _mm256_loadu_ps( &A[0+lda*0] );
		a_00 = _mm256_blend_ps( a_00, zeros, 0x00 );
		y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
		a_00 = _mm256_blend_ps( a_00, zeros, 0x01 );
		temp = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );
		a_01 = _mm256_loadu_ps( &A[0+lda*1] );
		a_01 = _mm256_blend_ps( a_01, zeros, 0x01 );
		temp = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		a_01 = _mm256_blend_ps( a_01, zeros, 0x03 );
		temp = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );
		a_02 = _mm256_loadu_ps( &A[0+lda*2] );
		a_02 = _mm256_blend_ps( a_02, zeros, 0x03 );
		temp = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		a_02 = _mm256_blend_ps( a_02, zeros, 0x07 );
		temp = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );
		a_03 = _mm256_loadu_ps( &A[0+lda*3] );
		a_03 = _mm256_blend_ps( a_03, zeros, 0x07 );
		temp = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		a_03 = _mm256_blend_ps( a_03, zeros, 0x0f );
		temp = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );
		// Mask the y_n update to the fringe lanes, then read-modify-write y_n.
		y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		x_t_0 = _mm256_loadu_ps( &y_n[0] );
		y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
		_mm256_storeu_ps( &y_n[0], y_n_0 );
		A += k_left;
		y_n += k_left;
		x_t += k_left;
		k += k_left;
		}
	// Unaligned fringe before the first full panel (dense, masked to k_left lanes).
	if(k<kna) // it can be only k_left = {1, 2, 3, 4, 5, 6, 7}
		{
		k_left = kna-k;
		k_left_d = 8.0 - k_left;
		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		a_00 = _mm256_loadu_ps( &A[0+lda*0] );
		a_01 = _mm256_loadu_ps( &A[0+lda*1] );
		a_02 = _mm256_loadu_ps( &A[0+lda*2] );
		a_03 = _mm256_loadu_ps( &A[0+lda*3] );
		y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
		temp = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );
		temp = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );
		temp = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );
		temp = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );
		y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		x_t_0 = _mm256_loadu_ps( &y_n[0] );
		y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
		_mm256_storeu_ps( &y_n[0], y_n_0 );
		A += k_left;
		y_n += k_left;
		x_t += k_left;
		k += k_left;
		}
	// After the fringe, jump A to the start of the next full panel.
	if(kna>0 || tri==1)
		{
		A += (sda-1)*lda;
		}
	// Main loop: one aligned full panel (8 rows) per iteration.
	for(; k<kmax-7; k+=8)
		{
		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );
		y_n_0 = _mm256_loadu_ps( &y_n[0] );
		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		a_00 = _mm256_load_ps( &A[0+lda*0] );
		temp = _mm256_mul_ps( a_00, x_n_0 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );
		a_01 = _mm256_load_ps( &A[0+lda*1] );
		temp = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );
		a_02 = _mm256_load_ps( &A[0+lda*2] );
		temp = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );
		a_03 = _mm256_load_ps( &A[0+lda*3] );
		temp = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );
		_mm256_storeu_ps( &y_n[0], y_n_0 );
		A += sda*lda;
		y_n += 8;
		x_t += 8;
		}
	// Trailing partial panel, masked to the remaining k_left lanes.
	if(k<kmax) // it can be only k_left = {1, 2, 3, 4, 5, 6, 7}
		{
		k_left = kmax-k;
		k_left_d = 8.0 - k_left;
		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		a_00 = _mm256_load_ps( &A[0+lda*0] );
		a_01 = _mm256_load_ps( &A[0+lda*1] );
		a_02 = _mm256_load_ps( &A[0+lda*2] );
		a_03 = _mm256_load_ps( &A[0+lda*3] );
		y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
		temp = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );
		temp = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );
		temp = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );
		temp = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );
		y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		x_t_0 = _mm256_loadu_ps( &y_n[0] );
		y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
		_mm256_storeu_ps( &y_n[0], y_n_0 );
		}
	// reduction: horizontal sums of the four y_t accumulators into 4 scalars.
	__m128
		z_0, z_1;
	y_t_0 = _mm256_hadd_ps(y_t_0, y_t_1);
	y_t_2 = _mm256_hadd_ps(y_t_2, y_t_3);
	y_t_0 = _mm256_hadd_ps(y_t_0, y_t_2);
	y_t_1 = _mm256_permute2f128_ps(y_t_0, y_t_0, 0x01);
	z_0 = _mm256_castps256_ps128(y_t_0);
	z_1 = _mm256_castps256_ps128(y_t_1);
	z_1 = _mm_add_ps(z_0, z_1);
	// Write-back of the transposed result: add for alg==1, subtract otherwise.
	if(alg==1)
		{
		z_0 = _mm_loadu_ps( &y_t[0] );
		z_0 = _mm_add_ps(z_0, z_1);
		_mm_storeu_ps( &y_t[0], z_0 );
		}
	else // alg==-1
		{
		z_0 = _mm_loadu_ps( &y_t[0] );
		z_0 = _mm_sub_ps(z_0, z_1);
		_mm_storeu_ps( &y_t[0], z_0 );
		}
	}