//-------------------------------------------------------------------
// blend
void effect(float *pBuffer[], const Cbmp *bmp, const float weight)
{
    size_t width  = bmp->getWidth();
    size_t height = bmp->getAbsHeight();

    const float fmul0 = weight;
    const float fmul1 = 1.0f - weight;
    __m256 weight0 = _mm256_broadcast_ss(&fmul0);
    __m256 weight1 = _mm256_broadcast_ss(&fmul1);

    float *pF[2];
    pF[0] = pBuffer[0];
    pF[1] = pBuffer[1];

    for (size_t y = 0; y < height; y++) {
        for (size_t x = 0; x < width; x += 8, pF[0] += 8, pF[1] += 8) {
            __m256 p0 = _mm256_load_ps(pF[0]);
            p0 = _mm256_mul_ps(p0, weight0);
            __m256 p1 = _mm256_load_ps(pF[1]);
            p1 = _mm256_mul_ps(p1, weight1);
            __m256 r = _mm256_add_ps(p0, p1);
            _mm256_store_ps(pF[0], r);
        }
    }
}
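/*
 * A scalar reference for the blend above, a sketch for verification only.
 * effect_scalar is a hypothetical helper name, not part of the original code.
 * The AVX version additionally assumes 32-byte-aligned buffers and a width
 * that is a multiple of 8.
 */
static void effect_scalar(float *pBuffer[], size_t width, size_t height, float weight)
{
    float *p0 = pBuffer[0];
    float *p1 = pBuffer[1];
    for (size_t i = 0; i < width * height; ++i) {
        // each output sample becomes weight*p0 + (1-weight)*p1, written back to p0
        p0[i] = p0[i] * weight + p1[i] * (1.0f - weight);
    }
}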
/*
 * compute: s = sqrt( t**2 - x**2 - y**2 - z**2 ), with s, t, x, y, z
 * member variables of the st_coords structure arr
 */
void comp_s(st_coords arr, int L)
{
    for (int i = 0; i < L; i += 8) {
        __m256 x = _mm256_load_ps(&arr.x[i]);
        __m256 y = _mm256_load_ps(&arr.y[i]);
        __m256 z = _mm256_load_ps(&arr.z[i]);
        __m256 t = _mm256_load_ps(&arr.t[i]);
        register __m256 s0, s1; /* Temporaries */
        /*
         * _TODO_B_
         * Complete this function using intrinsics so that it computes:
         *   s[i:i+8] = sqrt(t[i:i+8]**2 - (x[i:i+8]**2 + y[i:i+8]**2 + z[i:i+8]**2))
         * where s, t, x, y, z are members of arr. You will use:
         *   _mm256_mul_ps();
         *   _mm256_add_ps();
         *   _mm256_sub_ps();
         *   _mm256_sqrt_ps();
         */
        _mm256_store_ps(&arr.s[i], s0);
    }
    return;
}
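/*
 * A possible completion of the _TODO_B_ exercise above, shown as a sketch.
 * comp_s_completed is a hypothetical name used only for this illustration;
 * the sequence mirrors the non-FMA path of the randomized variant later in
 * this section.
 */
void comp_s_completed(st_coords arr, int L)
{
    for (int i = 0; i < L; i += 8) {
        __m256 x = _mm256_load_ps(&arr.x[i]);
        __m256 y = _mm256_load_ps(&arr.y[i]);
        __m256 z = _mm256_load_ps(&arr.z[i]);
        __m256 t = _mm256_load_ps(&arr.t[i]);
        __m256 s0, s1;
        s1 = _mm256_mul_ps(x, x);        /* x^2                       */
        s0 = _mm256_mul_ps(y, y);        /* y^2                       */
        s1 = _mm256_add_ps(s0, s1);      /* x^2 + y^2                 */
        s0 = _mm256_mul_ps(z, z);        /* z^2                       */
        s1 = _mm256_add_ps(s0, s1);      /* x^2 + y^2 + z^2           */
        s0 = _mm256_mul_ps(t, t);        /* t^2                       */
        s1 = _mm256_sub_ps(s0, s1);      /* t^2 - (x^2 + y^2 + z^2)   */
        s0 = _mm256_sqrt_ps(s1);         /* sqrt(...)                 */
        _mm256_store_ps(&arr.s[i], s0);
    }
}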
//-----------------------------------------------------------------
// SOA -> AOS
//
// pBlu: b0, b1, b2, b3, b4, ...
// pGrn: g0, g1, g2, g3, g4, ...
// pRed: r0, r1, r2, r3, r4, ...
//   ->
// pBgr: b0,g0,r0, b1,g1,r1, b2,g2,r2, b3,g3,r3, b4,g4,r4, ...
//
void soa2aos(float *pBlu, float *pGrn, float *pRed, float *pBgr, const size_t length)
{
    __m128 *bgr = (__m128 *)pBgr;

    // 24 units per iteration: 8 x + 8 y + 8 z, with x, y, z = float
    for (size_t i = 0; i < length; i += 24) {
        __m256 b = _mm256_load_ps(pBlu + (i / 3));
        __m256 g = _mm256_load_ps(pGrn + (i / 3));
        __m256 r = _mm256_load_ps(pRed + (i / 3));

        __m256 bg = _mm256_shuffle_ps(b, g, _MM_SHUFFLE(2, 0, 2, 0));
        __m256 gr = _mm256_shuffle_ps(g, r, _MM_SHUFFLE(3, 1, 3, 1));
        __m256 rb = _mm256_shuffle_ps(r, b, _MM_SHUFFLE(3, 1, 2, 0));

        __m256 r03 = _mm256_shuffle_ps(bg, rb, _MM_SHUFFLE(2, 0, 2, 0));
        __m256 r14 = _mm256_shuffle_ps(gr, bg, _MM_SHUFFLE(3, 1, 2, 0));
        __m256 r25 = _mm256_shuffle_ps(rb, gr, _MM_SHUFFLE(3, 1, 3, 1));

        *bgr++ = _mm256_castps256_ps128(r03);
        *bgr++ = _mm256_castps256_ps128(r14);
        *bgr++ = _mm256_castps256_ps128(r25);
        *bgr++ = _mm256_extractf128_ps(r03, 1);
        *bgr++ = _mm256_extractf128_ps(r14, 1);
        *bgr++ = _mm256_extractf128_ps(r25, 1);
    }
}
/*
 * Compute: s = sqrt( t**2 - x**2 - y**2 - z**2 ), with s, t, x, y, z
 * member variables of the st_coords structure arr.
 *
 * Traverse elements randomly
 */
void comp_s(st_coords arr, int L)
{
    for (int j = 0; j < L; j += 8) {
        int i = (rand() % (L/8)) * 8;
        __m256 x = _mm256_load_ps(&arr.x[i]);
        __m256 y = _mm256_load_ps(&arr.y[i]);
        __m256 z = _mm256_load_ps(&arr.z[i]);
        __m256 t = _mm256_load_ps(&arr.t[i]);
#ifdef FMA
        register __m256 s0;
        s0 = _mm256_mul_ps(x, x);
        s0 = _mm256_fmadd_ps(y, y, s0);
        s0 = _mm256_fmadd_ps(z, z, s0);
        s0 = _mm256_fmsub_ps(t, t, s0);
        s0 = _mm256_sqrt_ps(s0);
#else
        register __m256 s0, s1;
        s1 = _mm256_mul_ps(x, x);
        s0 = _mm256_mul_ps(y, y);
        s1 = _mm256_add_ps(s0, s1);
        s0 = _mm256_mul_ps(z, z);
        s1 = _mm256_add_ps(s0, s1);
        s0 = _mm256_mul_ps(t, t);
        s1 = _mm256_sub_ps(s0, s1);
        s0 = _mm256_sqrt_ps(s1);
#endif
        _mm256_store_ps(&arr.s[i], s0);
    }
    return;
}
static inline void rectifier_kernel_avx5(float *a, const size_t blocks)
{
    for (size_t i = 0; i < blocks; ++i) {
        _mm256_store_ps(&a[i*8*5 + 0*8],
            _mm256_max_ps(_mm256_load_ps(&a[i*8*5 + 0*8]), _mm256_setzero_ps()));
        _mm256_store_ps(&a[i*8*5 + 1*8],
            _mm256_max_ps(_mm256_load_ps(&a[i*8*5 + 1*8]), _mm256_setzero_ps()));
        _mm256_store_ps(&a[i*8*5 + 2*8],
            _mm256_max_ps(_mm256_load_ps(&a[i*8*5 + 2*8]), _mm256_setzero_ps()));
        _mm256_store_ps(&a[i*8*5 + 3*8],
            _mm256_max_ps(_mm256_load_ps(&a[i*8*5 + 3*8]), _mm256_setzero_ps()));
        _mm256_store_ps(&a[i*8*5 + 4*8],
            _mm256_max_ps(_mm256_load_ps(&a[i*8*5 + 4*8]), _mm256_setzero_ps()));
    }
}
void MaddMemcpy(float* arg1, float* arg2, float* arg3, int size1, int size2, float* result)
{
    memcpy(arg2, arg1, size1);
    memcpy(arg3, arg1, size2);
    __m256 vec1 = _mm256_load_ps(arg1);
    __m256 vec2 = _mm256_load_ps(arg2);
    __m256 vec3 = _mm256_load_ps(arg3);
    __m256 res  = _mm256_fmadd_ps(vec1, vec2, vec3);
    _mm256_storeu_ps(result, res);
}
irreg_poly_area_func_sign(float, _avx) {
    if (__builtin_expect(is_null(cords) || cords_len == 0, 0))
        return 0;

    __m256
        values_0_3,
        values_4_7,
        values_8_11,
        values_12_15,
        values_16_19 = _mm256_load_ps((const float *)&cords[0][0]),
        accum_sum = _mm256_setzero_ps();
    float accum_sum_aux;

#define _float_cords_dot_prod(curr, next, index)                                                \
    _mm256_dp_ps(                                                                               \
        curr,                                                                                   \
        _mm256_xor_ps(                                                                          \
            _mm256_shuffle_ps(curr, _mm256_permute2f128_ps(curr, next, 0b00100001), 0b00011011),\
            _mm256_setr_ps(0, -0.0f, 0, -0.0f, 0, -0.0f, 0, -0.0f)                              \
        ),                                                                                      \
        0b11110000 | (1 << (index))                                                             \
    )

    unsigned long index;
    for (index = 0; index < (cords_len - 16); index += 16) {
        values_0_3   = values_16_19;
        values_4_7   = _mm256_load_ps((const float *)&cords[index + 4]);
        values_8_11  = _mm256_load_ps((const float *)&cords[index + 8]);
        values_12_15 = _mm256_load_ps((const float *)&cords[index + 12]);
        values_16_19 = _mm256_load_ps((const float *)&cords[index + 16]);

        accum_sum = _mm256_add_ps(
            accum_sum,
            _mm256_add_ps(
                _mm256_add_ps(
                    _float_cords_dot_prod(values_0_3, values_4_7, 0),
                    _float_cords_dot_prod(values_4_7, values_8_11, 1)
                ),
                _mm256_add_ps(
                    _float_cords_dot_prod(values_8_11, values_12_15, 2),
                    _float_cords_dot_prod(values_12_15, values_16_19, 3)
                )
            )
        );
    }

    accum_sum = _mm256_hadd_ps(accum_sum, _mm256_permute2f128_ps(accum_sum, accum_sum, 1)); // a0+a1, a2+a3, a4+a5, a6+a7, a4+a5, a6+a7, a0+a1, a2+a3
    accum_sum = _mm256_hadd_ps(accum_sum, accum_sum); // a0+a1+a2+a3, a4+a5+a6+a7, ...
    accum_sum = _mm256_hadd_ps(accum_sum, accum_sum); // a0+a1+a2+a3+a4+a5+a6+a7, ...

    for (accum_sum_aux = _mm_cvtss_f32(_mm256_castps256_ps128(accum_sum));
         index < (cords_len - 1);
         index++)
        accum_sum_aux += _calc_diff_of_adj_prods(cords, index);

    return accum_sum_aux;
//    return scalar_half(scalar_abs(accum_sum_aux));
}
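/*
 * _calc_diff_of_adj_prods is defined elsewhere in this codebase. Judging by
 * the vector macro above, which pairs each point with its successor and flips
 * the sign of every other lane, it is presumably the scalar shoelace term for
 * one polygon edge. A hypothetical definition, for readability of the scalar
 * tail loop only:
 *
 *   #define _calc_diff_of_adj_prods(cords, i) \
 *       ((cords)[i][0] * (cords)[(i) + 1][1] - (cords)[(i) + 1][0] * (cords)[i][1])
 */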
void AlignedAvxMult(float* d, float const* a, float const* b)
{
    for (int i = 0; i < gNumFloats; i += 8) {
        __m256 v1 = _mm256_load_ps(&a[i]);
        __m256 v2 = _mm256_load_ps(&b[i]);
        __m256 r  = _mm256_mul_ps(v1, v2);
        _mm256_store_ps(&d[i], r);
    }
}
static void process_sinc(rarch_sinc_resampler_t *resamp, float *out_buffer)
{
    unsigned i;
    __m256 sum_l = _mm256_setzero_ps();
    __m256 sum_r = _mm256_setzero_ps();

    const float *buffer_l = resamp->buffer_l + resamp->ptr;
    const float *buffer_r = resamp->buffer_r + resamp->ptr;

    unsigned taps  = resamp->taps;
    unsigned phase = resamp->time >> SUBPHASE_BITS;
#if SINC_COEFF_LERP
    const float *phase_table = resamp->phase_table + phase * taps * 2;
    const float *delta_table = phase_table + taps;
    __m256 delta = _mm256_set1_ps((float)(resamp->time & SUBPHASE_MASK) * SUBPHASE_MOD);
#else
    const float *phase_table = resamp->phase_table + phase * taps;
#endif

    for (i = 0; i < taps; i += 8) {
        __m256 buf_l = _mm256_loadu_ps(buffer_l + i);
        __m256 buf_r = _mm256_loadu_ps(buffer_r + i);

#if SINC_COEFF_LERP
        __m256 deltas = _mm256_load_ps(delta_table + i);
        __m256 sinc   = _mm256_add_ps(_mm256_load_ps(phase_table + i),
                                      _mm256_mul_ps(deltas, delta));
#else
        __m256 sinc = _mm256_load_ps(phase_table + i);
#endif
        sum_l = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc));
        sum_r = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc));
    }

    /* hadd on AVX is weird, and acts on low-lanes
     * and high-lanes separately. */
    __m256 res_l = _mm256_hadd_ps(sum_l, sum_l);
    __m256 res_r = _mm256_hadd_ps(sum_r, sum_r);
    res_l = _mm256_hadd_ps(res_l, res_l);
    res_r = _mm256_hadd_ps(res_r, res_r);
    res_l = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l);
    res_r = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r);

    /* This is optimized to mov %xmmN, [mem].
     * There doesn't seem to be any _mm256_store_ss intrinsic. */
    _mm_store_ss(out_buffer + 0, _mm256_extractf128_ps(res_l, 0));
    _mm_store_ss(out_buffer + 1, _mm256_extractf128_ps(res_r, 0));
}
static void quantize_block(const float *in_data, float *out_data, float *quant_tbl)
{
    int zigzag;
    __m256 result, dct_values, quant_values;
    __m256 factor = _mm256_set1_ps(0.25f);

    for (zigzag = 0; zigzag < 64; zigzag += 8) {
        // Set the dct_values for the current iteration
        dct_values = _mm256_set_ps(in_data[UV_indexes[zigzag + 7]],
                                   in_data[UV_indexes[zigzag + 6]],
                                   in_data[UV_indexes[zigzag + 5]],
                                   in_data[UV_indexes[zigzag + 4]],
                                   in_data[UV_indexes[zigzag + 3]],
                                   in_data[UV_indexes[zigzag + 2]],
                                   in_data[UV_indexes[zigzag + 1]],
                                   in_data[UV_indexes[zigzag]]);

        // Multiply with 0.25 to divide by 4.0
        result = _mm256_mul_ps(dct_values, factor);

        // Load quant-values and divide the previous product by them
        quant_values = _mm256_load_ps(quant_tbl + zigzag);
        result = _mm256_div_ps(result, quant_values);

        // Round off values and store in out_data buffer
        result = c63_mm256_roundhalfawayfromzero_ps(result);
        _mm256_store_ps(out_data + zigzag, result);
    }
}
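/*
 * c63_mm256_roundhalfawayfromzero_ps is defined elsewhere in this codebase
 * (it is also used by dequantize_block below). Judging by its name, a
 * plausible implementation -- a sketch with a hypothetical name, not the
 * actual helper -- adds +/-0.5 with the sign of each lane and then truncates
 * toward zero, which rounds halfway cases away from zero:
 */
static inline __m256 roundhalfawayfromzero_ps_sketch(__m256 v)
{
    const __m256 sign_mask = _mm256_set1_ps(-0.0f);             // 0x80000000 in every lane
    __m256 half = _mm256_or_ps(_mm256_set1_ps(0.5f),
                               _mm256_and_ps(v, sign_mask));    // +0.5 or -0.5, matching v's sign
    return _mm256_round_ps(_mm256_add_ps(v, half),
                           _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); // truncate toward zero
}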
static inline void rectifier_kernel_back_avx(float *a, const size_t blocks)
{
    assert(blocks > 0);
    size_t i = blocks;
    do {
        --i; /* decrement first, so blocks-1 .. 0 are processed rather than
                blocks .. 1, which would read one block past the end and skip block 0 */
        _mm256_store_ps(&a[i*8],
            _mm256_max_ps(_mm256_load_ps(&a[i*8]), _mm256_setzero_ps()));
    } while (i);
}
void aaaa(const int c, float * const a3, float * const a4, const float * const fck)
{
    __m256 a_fck = _mm256_load_ps(fck);
    for (int i = 0; i < c; i += 16) {
        __m256 b_i    = _mm256_load_ps(&a4[i]);
        __m256 out_i  = _mm256_mul_ps(b_i, a_fck);
        __m256 c_i    = _mm256_load_ps(&a4[i+8]);
        __m256 out_i2 = _mm256_mul_ps(c_i, a_fck);
        _mm256_store_ps(&a3[i],   out_i);
        _mm256_store_ps(&a3[i+8], out_i2);
    }
}
float nv_vector_dot(const nv_matrix_t *vec1, int m1, const nv_matrix_t *vec2, int m2)
{
    NV_ASSERT(vec1->n == vec2->n);

#if NV_ENABLE_AVX
    {
        NV_ALIGNED(float, mm[8], 32);
        __m256 x, u;
        int n;
        int pk_lp = (vec1->n & 0xfffffff8);
        float dp = 0.0f;

        u = _mm256_setzero_ps();
        for (n = 0; n < pk_lp; n += 8) {
            x = _mm256_load_ps(&NV_MAT_V(vec2, m2, n));
            u = _mm256_add_ps(u, _mm256_mul_ps(x, *(__m256 *)&NV_MAT_V(vec1, m1, n)));
        }
        _mm256_store_ps(mm, u);
        dp = mm[0] + mm[1] + mm[2] + mm[3] + mm[4] + mm[5] + mm[6] + mm[7];
        for (n = pk_lp; n < vec1->n; ++n) {
            dp += NV_MAT_V(vec1, m1, n) * NV_MAT_V(vec2, m2, n);
        }
        return dp;
    }
#elif NV_ENABLE_SSE2
    {
        NV_ALIGNED(float, mm[4], 16);
        __m128 x, u;
        int n;
        int pk_lp = (vec1->n & 0xfffffffc);
        float dp = 0.0f;

        u = _mm_setzero_ps();
        for (n = 0; n < pk_lp; n += 4) {
            x = _mm_load_ps(&NV_MAT_V(vec2, m2, n));
            u = _mm_add_ps(u, _mm_mul_ps(x, *(__m128 *)&NV_MAT_V(vec1, m1, n)));
        }
        _mm_store_ps(mm, u);
        dp = mm[0] + mm[1] + mm[2] + mm[3];
        for (n = pk_lp; n < vec1->n; ++n) {
            dp += NV_MAT_V(vec1, m1, n) * NV_MAT_V(vec2, m2, n);
        }
        return dp;
    }
#else
    {
        int n;
        float dp = 0.0f;
        for (n = 0; n < vec1->n; ++n) {
            dp += NV_MAT_V(vec1, m1, n) * NV_MAT_V(vec2, m2, n);
        }
        return dp;
    }
#endif
}
inline float DatabaseBuilder::Distance(PackedSample* x, PackedSample* y)
{
#ifdef AVX
    // Black magic
    // But it does produce the same results as the non-AVX code
    __m256 accumulator;
    __m256 x_s = _mm256_load_ps(x->Features);
    __m256 y_s = _mm256_load_ps(y->Features);
    __m256 result = _mm256_sub_ps(x_s, y_s);
    accumulator = _mm256_mul_ps(result, result);

    x_s = _mm256_load_ps(&x->Features[8]);
    y_s = _mm256_load_ps(&y->Features[8]);
    result = _mm256_sub_ps(x_s, y_s);
    result = _mm256_mul_ps(result, result);
    accumulator = _mm256_add_ps(accumulator, result);

    x_s = _mm256_load_ps(&x->Features[16]);
    y_s = _mm256_load_ps(&y->Features[16]);
    result = _mm256_sub_ps(x_s, y_s);
    result = _mm256_mul_ps(result, result);
    accumulator = _mm256_add_ps(accumulator, result);

    x_s = _mm256_load_ps(&x->Features[24]);
    y_s = _mm256_load_ps(&y->Features[24]);
    result = _mm256_sub_ps(x_s, y_s);
    result = _mm256_mul_ps(result, result);
    accumulator = _mm256_add_ps(accumulator, result);

    // We now have a vector of 8 floats
    __m256 t1 = _mm256_hadd_ps(accumulator, accumulator);
    __m256 t2 = _mm256_hadd_ps(t1, t1);
    __m128 t3 = _mm256_extractf128_ps(t2, 1);
    __m128 t4 = _mm_add_ss(_mm256_castps256_ps128(t2), t3);
    // And now we don't
    return std::sqrtf(_mm_cvtss_f32(t4));
#endif
#ifndef AVX
    // Can be autovectorized
    float accumulator[32];
    float distance = 0;
    for (int i = 0; i < 30; i++) {
        accumulator[i] = x->Features[i] - y->Features[i];
    }
    // If done properly this should be 4(8) instructions
    for (int i = 0; i < 30; i++) {
        distance += accumulator[i] * accumulator[i];
    }
    return std::sqrtf(distance);
#endif
}
// =============================================================
// ====================== RGBX2BGRX_32F ========================
// =============================================================
void _rgbx2bgrx_32f(const float* _src, float* _dest, unsigned int _width,
                    unsigned int _pitchs, unsigned int _pitchd,
                    unsigned int _start, unsigned int _stop)
{
#ifdef USE_SSE
    const unsigned int widthz = (_pitchs/8);

    // Get start positions for buffers
    const float* tsrc;
    float* tdest;

    for( unsigned int y=_start; y<=_stop; ++y ) {
        tsrc  = _src+(y*_pitchs);
        tdest = _dest+(y*_pitchd);

        for( unsigned int x=0; x<widthz; ++x ) {
#ifdef USE_AVX1
            const __m256 v0 = _mm256_load_ps(tsrc); tsrc+=8;
            __m256 r0 = _mm256_permute_ps(v0,0xc6);
            _mm256_store_ps(tdest, r0 ); tdest+=8;
#else
            // NOT TESTED
            const __m128 v0 = _mm_load_ps(tsrc); tsrc+=4;
            const __m128 v1 = _mm_load_ps(tsrc); tsrc+=4;
            //__m128 r0 = _mm_shuffle_ps(v0,0xc6);
            //__m128 r1 = _mm_shuffle_ps(v1,0xc6);
            //_mm_store_ps(tdest, r0 ); tdest+=4;
            //_mm_store_ps(tdest, r1 ); tdest+=4;
#endif
        }
    }
#else
    const float* tsrc;
    float* tdest;

    for( unsigned int y=_start; y<=_stop; ++y ) {
        tsrc  = _src+(y*_pitchs);
        tdest = _dest+(y*_pitchd);

        for( unsigned int x=0; x<_width; ++x ) {
            float t = tsrc[4*x];
            tdest[4*x]   = tsrc[4*x+2];
            tdest[4*x+2] = t;
        }
    }
#endif
}
void rectifier_avx_2(float *a, const size_t len)
{
    float *p = a;
    for (; p + 8 <= &a[len]; p += 8) {
        _mm256_store_ps(p, _mm256_max_ps(_mm256_load_ps(p), _mm256_setzero_ps()));
    }
    for (; p < &a[len]; ++p) {
        *p = *p > 0.0 ? *p : 0.0;
    }
}
static __forceinline void convert_float_to_half(uint8_t* dstp, const float* srcp, size_t count)
{
    for (size_t x = 0; x < count; x += 8) {
        __m256 s = _mm256_load_ps(srcp + x);
        __m128i d = _mm256_cvtps_ph(s, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
        _mm_stream_si128(reinterpret_cast<__m128i*>(dstp + 2 * x), d);
    }
}
int main(int argc, const char * argv[])
{
    ALIGN32 float a1[ 16 ] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
    ALIGN32 float a2[ 16 ] = { 15, 12, 4, 7, 9, 0, 3, 13, 6, 10, 1, 8, 5, 11, 2, 14 };
    ALIGN32 float aout[ 16 ];

    __m128 x1[ 4 ] = { _mm_load_ps( a1 ), _mm_load_ps( a1 + 4 ), _mm_load_ps( a1 + 8 ), _mm_load_ps( a1 + 12 ) };
    __m128 x2[ 4 ] = { _mm_load_ps( a2 ), _mm_load_ps( a2 + 4 ), _mm_load_ps( a2 + 8 ), _mm_load_ps( a2 + 12 ) };
    __m128 xout[ 4 ];

    __m256 y1[ 2 ] = { _mm256_load_ps( a1 ), _mm256_load_ps( a1 + 8 ) };
    __m256 y2[ 2 ] = { _mm256_load_ps( a2 ), _mm256_load_ps( a2 + 8 ) };
    __m256 yout[ 2 ];

    std::cout << "FPU Mult" << std::endl;
    mul( a1, a2, aout );
    trace( aout, 4 );

    std::cout << "SSE Mult" << std::endl;
    mulX4( x1, x2, xout );
    trace( xout, 4 );

    std::cout << "AVX2 Mult" << std::endl;
    mulX8( y1, y2, yout );
    trace( yout, 4 );

    std::cout << "FPU Transpose" << std::endl;
    transpose( a1, aout );
    trace( aout, 4 );

    std::cout << "SSE Transpose" << std::endl;
    transposeX4( x1, xout );
    trace( xout, 4 );

    std::cout << "AVX Transpose" << std::endl;
    transposeX8( y1, yout );
    trace( yout, 4 );

    return 0;
}
static void scale_block(float *in_data, float *out_data)
{
    __m256 in_vector, result;

    // Load the a1 values into a register
    static float a1_values[8] __attribute__((aligned(32))) =
        { ISQRT2, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };
    __m256 a1 = _mm256_load_ps(a1_values);

    // Load the a2 values into a register for the exception case
    __m256 a2 = _mm256_set1_ps(ISQRT2);

    /* First row is an exception
     * Requires two _mm256_mul_ps operations */
    in_vector = _mm256_load_ps(in_data);
    result = _mm256_mul_ps(in_vector, a1);
    result = _mm256_mul_ps(result, a2);
    _mm256_store_ps(out_data, result);

    // Remaining calculations can be done with one _mm256_mul_ps operation
    in_vector = _mm256_load_ps(in_data + 8);
    result = _mm256_mul_ps(in_vector, a1);
    _mm256_store_ps(out_data + 8, result);

    in_vector = _mm256_load_ps(in_data + 16);
    result = _mm256_mul_ps(in_vector, a1);
    _mm256_store_ps(out_data + 16, result);

    in_vector = _mm256_load_ps(in_data + 24);
    result = _mm256_mul_ps(in_vector, a1);
    _mm256_store_ps(out_data + 24, result);

    in_vector = _mm256_load_ps(in_data + 32);
    result = _mm256_mul_ps(in_vector, a1);
    _mm256_store_ps(out_data + 32, result);

    in_vector = _mm256_load_ps(in_data + 40);
    result = _mm256_mul_ps(in_vector, a1);
    _mm256_store_ps(out_data + 40, result);

    in_vector = _mm256_load_ps(in_data + 48);
    result = _mm256_mul_ps(in_vector, a1);
    _mm256_store_ps(out_data + 48, result);

    in_vector = _mm256_load_ps(in_data + 56);
    result = _mm256_mul_ps(in_vector, a1);
    _mm256_store_ps(out_data + 56, result);
}
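/*
 * ISQRT2 is defined elsewhere in this codebase; from the scaling it applies
 * (the DCT normalization factor for the first row/column) it is presumably
 * 1/sqrt(2). A hypothetical definition, only if your build is missing it:
 *
 *   #define ISQRT2 0.70710678118654f
 */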
float dot_product(const int N, const float *X, const int incX,
                  const float *Y, const int incY)
{
    __m256 accum = _mm256_setzero_ps();
    for (int i = 0; i < N; i += 8, X += 8 * incX, Y += 8 * incY) {
        __m256 xval = _mm256_load_ps(X);
        __m256 yval = _mm256_load_ps(Y);
        __m256 val  = _mm256_mul_ps(xval, yval);
        accum = _mm256_add_ps(val, accum);
    }

    // Reduce the values in accum into the smallest 32-bit subsection
    // a0 a1 a2 a3 a4 a5 a6 a7 -> b0 b1 b2 b3
    __m128 accum2 = _mm_add_ps(_mm256_castps256_ps128(accum),
                               _mm256_extractf128_ps(accum, 1));

    // b0 b1 b2 b3 -> c0 c1 b2 b3
    accum2 = _mm_add_ps(accum2,
                        _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(accum2), 8)));

    // Add the high and low halves
    __m128 final_val = _mm_add_ss(_mm_insert_ps(accum2, accum2, 0x4e), accum2);

    return final_val[0];
}
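/*
 * A scalar reference for dot_product above, a sketch for verification only;
 * dot_product_ref is a hypothetical name. It mirrors what the vector loop
 * actually does: each iteration consumes 8 contiguous floats and then skips
 * ahead by 8*inc. N is assumed to be a multiple of 8, with X and Y 32-byte
 * aligned in the AVX version.
 */
static float dot_product_ref(const int N, const float *X, const int incX,
                             const float *Y, const int incY)
{
    float sum = 0.0f;
    for (int i = 0; i < N; i += 8, X += 8 * incX, Y += 8 * incY) {
        for (int k = 0; k < 8; ++k)
            sum += X[k] * Y[k];   // same 8-element block the vector loop loads
    }
    return sum;
}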
static void dct_1d_general(float* in_data, float* out_data, float lookup[64])
{
    __m256 current, dct_values, multiplied, sum;

    current = _mm256_broadcast_ss(in_data);
    dct_values = _mm256_load_ps(lookup);
    multiplied = _mm256_mul_ps(dct_values, current);
    sum = multiplied;

    // Broadcasts a single float (scalar) to every element in 'current'.
    current = _mm256_broadcast_ss(in_data + 1);
    // Loads DCT values from the lookup table. iDCT uses a transposed lookup table here.
    dct_values = _mm256_load_ps(lookup + 8);
    // Vertically multiply the scalar with the DCT values.
    multiplied = _mm256_mul_ps(dct_values, current);
    // Vertically add to the previous sum.
    sum = _mm256_add_ps(sum, multiplied);

    current = _mm256_broadcast_ss(in_data + 2);
    dct_values = _mm256_load_ps(lookup + 16);
    multiplied = _mm256_mul_ps(dct_values, current);
    sum = _mm256_add_ps(sum, multiplied);

    current = _mm256_broadcast_ss(in_data + 3);
    dct_values = _mm256_load_ps(lookup + 24);
    multiplied = _mm256_mul_ps(dct_values, current);
    sum = _mm256_add_ps(sum, multiplied);

    current = _mm256_broadcast_ss(in_data + 4);
    dct_values = _mm256_load_ps(lookup + 32);
    multiplied = _mm256_mul_ps(dct_values, current);
    sum = _mm256_add_ps(sum, multiplied);

    current = _mm256_broadcast_ss(in_data + 5);
    dct_values = _mm256_load_ps(lookup + 40);
    multiplied = _mm256_mul_ps(dct_values, current);
    sum = _mm256_add_ps(sum, multiplied);

    current = _mm256_broadcast_ss(in_data + 6);
    dct_values = _mm256_load_ps(lookup + 48);
    multiplied = _mm256_mul_ps(dct_values, current);
    sum = _mm256_add_ps(sum, multiplied);

    current = _mm256_broadcast_ss(in_data + 7);
    dct_values = _mm256_load_ps(lookup + 56);
    multiplied = _mm256_mul_ps(dct_values, current);
    sum = _mm256_add_ps(sum, multiplied);

    _mm256_store_ps(out_data, sum);
}
void rectifier_avx_3(float *a, const size_t len)
{
    float *p = a;
    assert(len > 8);

    // Scalar prologue until p reaches 32-byte alignment
    for (; (uintptr_t)p%32 != 0; ++p) {
        *p = *p > 0.0 ? *p : 0.0;
    }
    // Aligned body with non-temporal stores
    for (; p + 8 <= &a[len]; p += 8) {
        _mm256_stream_ps(p, _mm256_max_ps(_mm256_load_ps(p), _mm256_setzero_ps()));
    }
    // Scalar epilogue for the remaining tail
    for (; p < &a[len]; ++p) {
        *p = *p > 0.0 ? *p : 0.0;
    }
}
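/*
 * A minimal driver sketch for the rectifier variants above. The buffer size
 * and the name run_rectifier_example are hypothetical; the point is the
 * 32-byte alignment the aligned loads/stores rely on, and the fence that is
 * customary after _mm256_stream_ps before other threads read the buffer.
 */
#include <immintrin.h>
#include <stddef.h>

int run_rectifier_example(void)
{
    const size_t len = 1 << 20;
    float *a = (float *)_mm_malloc(len * sizeof(float), 32);  // 32-byte aligned for AVX
    if (!a)
        return -1;

    for (size_t i = 0; i < len; ++i)
        a[i] = (i % 2) ? -1.0f : 1.0f;

    rectifier_avx_3(a, len);   // clamps negatives to zero in place
    _mm_sfence();              // order the non-temporal stores before later reads

    _mm_free(a);
    return 0;
}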
void nv_vector_add(nv_matrix_t *vec0, int m0,
                   const nv_matrix_t *vec1, int m1,
                   const nv_matrix_t *vec2, int m2)
{
    NV_ASSERT(vec1->n == vec2->n);
    NV_ASSERT(vec2->n == vec0->n);

#if NV_ENABLE_AVX
    {
        __m256 x;
        int n;
        int pk_lp = (vec1->n & 0xfffffff8);

        for (n = 0; n < pk_lp; n += 8) {
            x = _mm256_load_ps(&NV_MAT_V(vec1, m1, n));
            _mm256_store_ps(&NV_MAT_V(vec0, m0, n),
                _mm256_add_ps(x, *(const __m256 *)&NV_MAT_V(vec2, m2, n)));
        }
        for (n = pk_lp; n < vec1->n; ++n) {
            NV_MAT_V(vec0, m0, n) = NV_MAT_V(vec1, m1, n) + NV_MAT_V(vec2, m2, n);
        }
    }
#elif NV_ENABLE_SSE2
    {
        int n;
        int pk_lp = (vec1->n & 0xfffffffc);

#ifdef _OPENMP
//#pragma omp parallel for
#endif
        for (n = 0; n < pk_lp; n += 4) {
            __m128 x = _mm_load_ps(&NV_MAT_V(vec1, m1, n));
            _mm_store_ps(&NV_MAT_V(vec0, m0, n),
                _mm_add_ps(x, *(const __m128 *)&NV_MAT_V(vec2, m2, n)));
        }
        for (n = pk_lp; n < vec1->n; ++n) {
            NV_MAT_V(vec0, m0, n) = NV_MAT_V(vec1, m1, n) + NV_MAT_V(vec2, m2, n);
        }
    }
#else
    {
        int n;
        for (n = 0; n < vec1->n; ++n) {
            NV_MAT_V(vec0, m0, n) = NV_MAT_V(vec1, m1, n) + NV_MAT_V(vec2, m2, n);
        }
    }
#endif
}
static void dequantize_block(float *in_data, float *out_data, float *quant_tbl)
{
    int zigzag;

    // Temporary buffer
    float temp_buf[8] __attribute__((aligned(32)));

    __m256 result, dct_values, quant_values;
    __m256 factor = _mm256_set1_ps(0.25f);

    for (zigzag = 0; zigzag < 64; zigzag += 8) {
        // Load dct-values
        dct_values = _mm256_load_ps(in_data + zigzag);
        quant_values = _mm256_load_ps(quant_tbl + zigzag);
        result = _mm256_mul_ps(dct_values, quant_values);

        // Multiply with 0.25 to divide by 4.0
        result = _mm256_mul_ps(result, factor);

        // Round off products and store them temporarily
        result = c63_mm256_roundhalfawayfromzero_ps(result);
        _mm256_store_ps(temp_buf, result);

        // Store the results at the correct places in the out_data buffer
        out_data[UV_indexes[zigzag]]     = temp_buf[0];
        out_data[UV_indexes[zigzag + 1]] = temp_buf[1];
        out_data[UV_indexes[zigzag + 2]] = temp_buf[2];
        out_data[UV_indexes[zigzag + 3]] = temp_buf[3];
        out_data[UV_indexes[zigzag + 4]] = temp_buf[4];
        out_data[UV_indexes[zigzag + 5]] = temp_buf[5];
        out_data[UV_indexes[zigzag + 6]] = temp_buf[6];
        out_data[UV_indexes[zigzag + 7]] = temp_buf[7];
    }
}
void nv_vector_inv(nv_matrix_t *a, int am, const nv_matrix_t *x, int xm)
{
    NV_ASSERT(a->n >= x->n);

#if NV_ENABLE_AVX
    {
        __m256 xx, vv;
        int n;
        int pk_lp = (x->n & 0xfffffff8);

        vv = _mm256_set1_ps(1.0f);
        for (n = 0; n < pk_lp; n += 8) {
            xx = _mm256_load_ps(&NV_MAT_V(x, xm, n));
            xx = _mm256_div_ps(vv, xx);
            _mm256_store_ps(&NV_MAT_V(a, am, n), xx);
        }
        for (n = pk_lp; n < x->n; ++n) {
            NV_MAT_V(a, am, n) = 1.0f / NV_MAT_V(x, xm, n);
        }
    }
#elif NV_ENABLE_SSE2
    {
        __m128 xx, vv;
        int n;
        int pk_lp = (x->n & 0xfffffffc);

        vv = _mm_set1_ps(1.0f);
        for (n = 0; n < pk_lp; n += 4) {
            xx = _mm_load_ps(&NV_MAT_V(x, xm, n));
            xx = _mm_div_ps(vv, xx);
            _mm_store_ps(&NV_MAT_V(a, am, n), xx);
        }
        for (n = pk_lp; n < x->n; ++n) {
            NV_MAT_V(a, am, n) = 1.0f / NV_MAT_V(x, xm, n);
        }
    }
#else
    {
        int n;
        for (n = 0; n < x->n; ++n) {
            NV_MAT_V(a, am, n) = 1.0f / NV_MAT_V(x, xm, n);
        }
    }
#endif
}
void warmup(float *x, float *y, int size, float alpha)
{
    int i;
    __m256 m = _mm256_set_ps(1.0/alpha, 2.0, 1.0/alpha, 2.0,
                             1.0/alpha, 2.0, 1.0/alpha, 2.0);
#pragma ivdep
#pragma vector aligned
    for (i = 0; i < size; i += 4) {
        __m256 t = _mm256_load_ps(x + 2*i);
        __m256 l = _mm256_mul_ps(t, m);              // premultiply
        __m256 r = _mm256_permute2f128_ps(l, l, 1);  // swap lower and higher 128 bits
        __m256 res = _mm256_hadd_ps(l, r);
        __m128 s = _mm256_extractf128_ps(res, 0);
        _mm_store_ps(y + i, s);                      // store it
    }
}
void polynomial(float *ret, const float *const r_values, int num)
{
    // r*r*r*(10+r*(-15+r*6));
    __m256 const_6      = _mm256_set1_ps(6.0f);
    __m256 const_neg_15 = _mm256_set1_ps(-15.0f);
    __m256 const_10     = _mm256_set1_ps(10.0f); // constants

    const int loop_factor = 8;

    for (int i = 0; i < num; i += loop_factor) {
#ifdef USE_IACA
        IACA_START
#endif
        __m256 r;
        __m256 left;
        __m256 right;

        // aligned load of 256 bits r
        r = _mm256_load_ps(&r_values[i]);
        left = _mm256_mul_ps(r, r);                   // r * r
#ifndef __FMA__
        right = _mm256_mul_ps(r, const_6);            // r * 6
        left = _mm256_mul_ps(left, r);                // r * r * r
        right = _mm256_add_ps(right, const_neg_15);   // -15 + r * 6
        right = _mm256_mul_ps(right, r);              // r * (-15 + r * 6)
        right = _mm256_add_ps(right, const_10);       // 10 + (r * (-15 + r * 6))
#else
        right = _mm256_fmadd_ps(r, const_6, const_neg_15);
        left = _mm256_mul_ps(left, r);
        right = _mm256_fmadd_ps(r, right, const_10);
#endif
        right = _mm256_mul_ps(right, left);           // r*r*r * (10 + r * (-15 + r * 6))
        _mm256_store_ps(&ret[i], right);              // store 8 values to ret[i]
    }
#ifdef USE_IACA
    IACA_END
#endif
}
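/*
 * A scalar reference for polynomial above, a sketch for verification only;
 * polynomial_ref is a hypothetical name. The Horner form r*r*r*(10 + r*(-15 + r*6))
 * is the smoothstep-style quintic 6r^5 - 15r^4 + 10r^3. The AVX version
 * additionally assumes num is a multiple of 8 and r_values/ret are 32-byte aligned.
 */
static void polynomial_ref(float *ret, const float *r_values, int num)
{
    for (int i = 0; i < num; ++i) {
        float r = r_values[i];
        ret[i] = r * r * r * (10.0f + r * (-15.0f + r * 6.0f));
    }
}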
void TransLut_FindIndexAvx2 <TransLut::MapperLin>::find_index (const TransLut::FloatIntMix val_arr [8], __m256i &index, __m256 &frac)
{
    assert (val_arr != 0);

    const __m256   scale     = _mm256_set1_ps (1 << LINLUT_RES_L2);
    const __m256i  offset    = _mm256_set1_epi32 (-LINLUT_MIN_F * (1 << LINLUT_RES_L2));
    const __m256i  val_min   = _mm256_setzero_si256 ();
    const __m256i  val_max   = _mm256_set1_epi32 (LINLUT_SIZE_F - 2);

    const __m256   v         = _mm256_load_ps (reinterpret_cast <const float *> (val_arr));
    const __m256   val_scl   = _mm256_mul_ps (v, scale);
    const __m256i  index_raw = _mm256_cvtps_epi32 (val_scl);
    __m256i        index_tmp = _mm256_add_epi32 (index_raw, offset);
    index_tmp = _mm256_min_epi32 (index_tmp, val_max);
    index     = _mm256_max_epi32 (index_tmp, val_min);
    frac      = _mm256_sub_ps (val_scl, _mm256_cvtepi32_ps (index_raw));
}
bool enqueue_try_nosync(ArenaT& arena, const T* entry)
{
    const float* pSrc = (const float*)entry;
    float* pDst = (float*)&mCurBlock[mTail];

    auto lambda = [&](int32_t i) {
        __m256 vSrc = _mm256_load_ps(pSrc + i*KNOB_SIMD_WIDTH);
        _mm256_stream_ps(pDst + i*KNOB_SIMD_WIDTH, vSrc);
    };

    const uint32_t numSimdLines = sizeof(T) / (KNOB_SIMD_WIDTH*4);
    static_assert(numSimdLines * KNOB_SIMD_WIDTH * 4 == sizeof(T),
                  "FIFO element size should be multiple of SIMD width.");

    UnrollerL<0, numSimdLines, 1>::step(lambda);

    mTail++;
    if (mTail == mBlockSize) {
        if (++mCurBlockIdx < mBlocks.size()) {
            mCurBlock = mBlocks[mCurBlockIdx];
        } else {
            T* newBlock = (T*)arena.AllocAligned(sizeof(T)*mBlockSize, KNOB_SIMD_WIDTH*4);
            SWR_ASSERT(newBlock);

            mBlocks.push_back(newBlock);
            mCurBlock = newBlock;
        }

        mTail = 0;
    }

    mNumEntries++;
    return true;
}
// memory load and store operations
INLINE avxb load8b(const void* const a) {
    return _mm256_load_ps((const float*)a);
}