inline void Multiply(const PVector4df &v, PVector4df &out)
{
#ifdef __SSE_AVAIL__
    __m128 v1 = _mm_load_ps(v._Vec);
    __m128 m0 = _mm_load_ps(Row1);
    __m128 m1 = _mm_load_ps(Row2);
    __m128 m2 = _mm_load_ps(Row3);
    __m128 m3 = _mm_load_ps(Row4);

    m0 = _mm_mul_ps(m0, v1); // (e11 * v.X), (e21 * v.Y), (e31 * v.Z), (e41 * v.W)
    m1 = _mm_mul_ps(m1, v1); // (e12 * v.X), (e22 * v.Y), (e32 * v.Z), (e42 * v.W)
    m2 = _mm_mul_ps(m2, v1); // (e13 * v.X), (e23 * v.Y), (e33 * v.Z), (e43 * v.W)
    m3 = _mm_mul_ps(m3, v1); // (e14 * v.X), (e24 * v.Y), (e34 * v.Z), (e44 * v.W)

    m0 = _mm_hadd_ps(m0, m1);
    m0 = _mm_hadd_ps(m0, m0);
    m2 = _mm_hadd_ps(m2, m3);
    m2 = _mm_hadd_ps(m2, m2);

    _mm_store_ps(out._Vec, _mm_movehl_ps(m2, m0));
#else
    out.X = v.X * e11 + v.Y * e21 + v.Z * e31 + v.W * e41;
    out.Y = v.X * e12 + v.Y * e22 + v.Z * e32 + v.W * e42;
    out.Z = v.X * e13 + v.Y * e23 + v.Z * e33 + v.W * e43;
    out.W = v.X * e14 + v.Y * e24 + v.Z * e34 + v.W * e44;
#endif
}
void shz::math::matrix<shz::math::f32, 4, 4>::mul(const shz::math::f32* left, const shz::math::f32* right, shz::math::f32* target)
{
    shz::math::f32 _ALIGNED(16) left_transposed[shz::math::matrix<shz::math::f32, 4, 4>::size];
    matrix<shz::math::f32, 4, 4>::transpose(left, left_transposed);
    shz::math::f32 _ALIGNED(16) temp[4];

    for (size_t m = 0; m < 4; ++m) {
        __m128 right_operand = _mm_load_ps(right);
        shz::math::f32* transposed = left_transposed;

        // This probably can be further optimized. Needs benchmarking.
        // Idea: unroll loop and exploit _mm_hadd_ps to sum partials from 2 rows at once.
        // AVX: with the AVX instruction set this loop is trivial.
        for (size_t n = 0; n < 4; ++n) {
            __m128 left_operand = _mm_load_ps(transposed);
            __m128 mul_result = _mm_mul_ps(left_operand, right_operand);
            __m128 added = _mm_hadd_ps(mul_result, mul_result);
            added = _mm_hadd_ps(added, added);
            _mm_store_ps(temp, added);
            *target = temp[0];
            target++;
            transposed += 4;
        }
        right += 4;
    }
}
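The comment above hints at pairing rows so that each _mm_hadd_ps reduces partials from two rows at once. A minimal sketch of that unrolled inner loop, assuming the same transposed-left layout and variable names as above (this is an illustration, not the project's code):

// Hypothetical unrolled replacement for the inner loop above: the four
// partial-product vectors are reduced pairwise, so three _mm_hadd_ps calls
// produce all four dot products of this output column at once.
__m128 p0 = _mm_mul_ps(_mm_load_ps(left_transposed + 0),  right_operand);
__m128 p1 = _mm_mul_ps(_mm_load_ps(left_transposed + 4),  right_operand);
__m128 p2 = _mm_mul_ps(_mm_load_ps(left_transposed + 8),  right_operand);
__m128 p3 = _mm_mul_ps(_mm_load_ps(left_transposed + 12), right_operand);
__m128 sum01 = _mm_hadd_ps(p0, p1);                 // pair sums of p0 | pair sums of p1
__m128 sum23 = _mm_hadd_ps(p2, p3);                 // pair sums of p2 | pair sums of p3
_mm_storeu_ps(target, _mm_hadd_ps(sum01, sum23));   // unaligned store: target may not be 16-byte aligned
target += 4;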
inline PVector4df operator*(const PVector4df &v)
{
#ifdef __SSE_AVAIL__
    __m128 v1 = _mm_load_ps(v._Vec);
    __m128 m0 = _mm_load_ps(Row1);
    __m128 m1 = _mm_load_ps(Row2);
    __m128 m2 = _mm_load_ps(Row3);
    __m128 m3 = _mm_load_ps(Row4);

    m0 = _mm_mul_ps(m0, v1); // (e11 * v.X), (e21 * v.Y), (e31 * v.Z), (e41 * v.W)
    m1 = _mm_mul_ps(m1, v1); // (e12 * v.X), (e22 * v.Y), (e32 * v.Z), (e42 * v.W)
    m2 = _mm_mul_ps(m2, v1); // (e13 * v.X), (e23 * v.Y), (e33 * v.Z), (e43 * v.W)
    m3 = _mm_mul_ps(m3, v1); // (e14 * v.X), (e24 * v.Y), (e34 * v.Z), (e44 * v.W)

    m0 = _mm_hadd_ps(m0, m1);
    m0 = _mm_hadd_ps(m0, m0);
    m2 = _mm_hadd_ps(m2, m3);
    m2 = _mm_hadd_ps(m2, m2);
    m0 = _mm_movehl_ps(m2, m0);

    PVector4df val;
    _mm_store_ps(val._Vec, m0);
    return val;
#else
    return PVector4df(v.X * e11 + v.Y * e21 + v.Z * e31 + v.W * e41,
                      v.X * e12 + v.Y * e22 + v.Z * e32 + v.W * e42,
                      v.X * e13 + v.Y * e23 + v.Z * e33 + v.W * e43,
                      v.X * e14 + v.Y * e24 + v.Z * e34 + v.W * e44);
#endif
}
HW_FORCE_INLINE Vec<N> spreadDot(const Vec<N>& a, const Vec<N>& b)
{
    __m128 x = _mm_mul_ps(a.xmm, b.xmm);
    x = _mm_hadd_ps(x, x);
    x = _mm_hadd_ps(x, x);
    return Vec<N>(x);
}
static void mexsoftmax(float* y, float* shift, mwSize m, mwSize n)
{
    __m128 i1, i2;
    __m128 o1, o2;

    while (m > 0) {
        mwSize curn = n;
        float sum = 0.0f;
        declconst128(zero, 0.0f);

        // Scalar prologue until y + curn is 16-byte aligned.
        while (curn > 0 && ((unsigned long)(y + curn) & 15) != 0) {
            --curn;
            y[curn] = fastexp(y[curn] - *shift);
            sum += y[curn];
        }

        __m128 s1 = _mm_load1_ps(shift);
        __m128 sum1 = zero;

        // Vectorized body: 8 elements per iteration.
        while (curn > 7) {
            i1 = _mm_load_ps(y + curn - 4);
            i2 = _mm_load_ps(y + curn - 8);
            i1 = _mm_sub_ps(i1, s1);
            i2 = _mm_sub_ps(i2, s1);
            o1 = vfastexp(i1);
            o2 = vfastexp(i2);
            _mm_store_ps(y + curn - 4, o1);
            sum1 = _mm_add_ps(sum1, o1);
            _mm_store_ps(y + curn - 8, o2);
            sum1 = _mm_add_ps(sum1, o2);
            curn -= 8;
        }

        // Fold the four partial sums into one scalar.
        sum1 = _mm_hadd_ps(sum1, sum1);
        sum1 = _mm_hadd_ps(sum1, sum1);
        sum += _mm_cvtss_f32(sum1);

        // Scalar epilogue for the remaining elements.
        while (curn > 0) {
            --curn;
            y[curn] = fastexp(y[curn] - *shift);
            sum += y[curn];
        }

        sum = 1.0f / sum;
        ptrdiff_t n_pdt = n;
        ptrdiff_t one_pdt = 1;
        sscal(&n_pdt, &sum, y, &one_pdt);

        ++shift;
        y += n;
        --m;
    }
}
HW_FORCE_INLINE float dot(const Vec<N>& a, const Vec<N>& b)
{
    __m128 x = _mm_mul_ps(a.xmm, b.xmm);
    x = _mm_hadd_ps(x, x);
    x = _mm_hadd_ps(x, x);
    float tmp;
    _mm_store_ss(&tmp, x);
    return tmp;
}
static inline __m128 horizontal_add(const __m128 a)
{
#if 0 //!! needs SSE3
    const __m128 ftemp = _mm_hadd_ps(a, a);
    return _mm_hadd_ps(ftemp, ftemp);
#else
    const __m128 ftemp = _mm_add_ps(a, _mm_movehl_ps(a, a)); // a0+a2, a1+a3
    return _mm_add_ss(ftemp, _mm_shuffle_ps(ftemp, ftemp, _MM_SHUFFLE(1, 1, 1, 1))); // (a0+a2)+(a1+a3)
#endif
}
HW_FORCE_INLINE Vec<N> normalized(const Vec<N>& a)
{
    __m128 x = _mm_mul_ps(a.xmm, a.xmm);
    x = _mm_hadd_ps(x, x);
    x = _mm_hadd_ps(x, x);
    x = _mm_rsqrt_ps(x);
    x = _mm_mul_ps(a.xmm, x);
    return Vec<N>(x);
}
_XOINL float QuaternionSquareSum(const Quaternion& q)
{
#if defined(XO_SSE)
    __m128 square = _mm_mul_ps(q.xmm, q.xmm);
    square = _mm_hadd_ps(square, square);
    square = _mm_hadd_ps(square, square);
    return _mm_cvtss_f32(square);
#else
    return q.x * q.x + q.y * q.y + q.z * q.z + q.w * q.w;
#endif
}
float length2() const
{
    Vec3 a = *this;
    a.w = 0.0f;
    __m128 &D = a.m128;
    D = _mm_mul_ps(D, D);
    D = _mm_hadd_ps(D, D);
    D = _mm_hadd_ps(D, D);
    return a.x;
}
inline float hadd(const vector4f& rhs)
{
#if SSE_INSTR_SET >= 3  // SSE3
    __m128 tmp0 = _mm_hadd_ps(rhs, rhs);
    __m128 tmp1 = _mm_hadd_ps(tmp0, tmp0);
#else
    __m128 tmp0 = _mm_add_ps(rhs, _mm_movehl_ps(rhs, rhs));
    __m128 tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
#endif
    return _mm_cvtss_f32(tmp1);
}
// use MMX/SSE extensions
void dotprod_rrrf_execute_mmx(dotprod_rrrf _q, float * _x, float * _y)
{
    // first cut: ...
    __m128 v;   // input vector
    __m128 h;   // coefficients vector
    __m128 s;   // dot product
    __m128 sum = _mm_setzero_ps();  // load zeros into sum register

    // t = 4*(floor(_n/4))
    unsigned int t = (_q->n >> 2) << 2;

    unsigned int i;
    for (i = 0; i < t; i += 4) {
        // load inputs into register (unaligned)
        v = _mm_loadu_ps(&_x[i]);

        // load coefficients into register (aligned)
        h = _mm_load_ps(&_q->h[i]);

        // compute multiplication
        s = _mm_mul_ps(v, h);

        // parallel addition
        sum = _mm_add_ps(sum, s);
    }

    // aligned output array
    float w[4] __attribute__((aligned(16)));

#if HAVE_PMMINTRIN_H
    // fold down into single value
    __m128 z = _mm_setzero_ps();
    sum = _mm_hadd_ps(sum, z);
    sum = _mm_hadd_ps(sum, z);

    // unload single (lower value)
    _mm_store_ss(w, sum);
    float total = w[0];
#else
    // unload packed array
    _mm_store_ps(w, sum);
    float total = w[0] + w[1] + w[2] + w[3];
#endif

    // cleanup
    for (; i < _q->n; i++)
        total += _x[i] * _q->h[i];

    // set return value
    *_y = total;
}
HW_FORCE_INLINE float invLength(const Vec<N>& a)
{
    __m128 x = _mm_mul_ps(a.xmm, a.xmm);
    x = _mm_hadd_ps(x, x);
    x = _mm_hadd_ps(x, x);
    x = _mm_rsqrt_ss(x);
    float tmp;
    _mm_store_ss(&tmp, x);
    return tmp;
}
inline float dot_product(__m128 a, __m128 b)
{
#if defined(SSE4)
    __m128 m = _mm_dp_ps(a, b, 0xff);
    return m.m128_f32[0];
#elif defined(SSE3)
    __m128 m = _mm_mul_ps(a, b);
    m = _mm_hadd_ps(m, m);
    m = _mm_hadd_ps(m, m);
    return m.m128_f32[0];
#else
    __m128 m = _mm_mul_ps(a, b);
    return m.m128_f32[0] + m.m128_f32[1] + m.m128_f32[2] + m.m128_f32[3];
#endif
}
// ~~~~~~~~~~~~~~~ Task2
void mulVectorSse(MATRIX_TYPE** matrix, MATRIX_TYPE* vector, MATRIX_TYPE* result, size_t size)
{
    for (size_t i = 0; i < size; i++) {
        __m128 localSum = _mm_setzero_ps();
        for (size_t j = 0; j < size; j += 4) {
            __m128 tempMatrix = _mm_load_ps(&matrix[i][j]);
            __m128 tempVector = _mm_load_ps(&vector[j]);
            localSum = _mm_add_ps(localSum, _mm_mul_ps(tempMatrix, tempVector));
        }
        localSum = _mm_hadd_ps(localSum, localSum);
        localSum = _mm_hadd_ps(localSum, localSum);
        _mm_store_ss(&result[i], localSum);
    }
}
inline float32x4_t dot(const float32x4_t xmm1, const float32x4_t xmm2)
{
#if TWIST_ARCH & TWIST_ARCH_SSE3_BIT
    float32x4_t mul0  = _mm_mul_ps(xmm1, xmm2);
    float32x4_t hadd0 = _mm_hadd_ps(mul0, mul0);
    float32x4_t hadd1 = _mm_hadd_ps(hadd0, hadd0);
    return hadd1;
#else // SSE3
    float32x4_t mul0  = _mm_mul_ps(xmm1, xmm2);
    float32x4_t swap0 = _mm_shuffle_ps(mul0, mul0, _MM_SHUFFLE(2, 3, 0, 1));
    float32x4_t add0  = _mm_add_ps(mul0, swap0);
    float32x4_t swap1 = _mm_shuffle_ps(add0, add0, _MM_SHUFFLE(0, 1, 2, 3));
    float32x4_t add1  = _mm_add_ps(add0, swap1);
    return add1;
#endif // SSE
}
inline vector4f haddp(const vector4f* row)
{
#if SSE_INSTR_SET >= 3  // SSE3
    return _mm_hadd_ps(_mm_hadd_ps(row[0], row[1]),
                       _mm_hadd_ps(row[2], row[3]));
#else
    __m128 tmp0 = _mm_unpacklo_ps(row[0], row[1]);
    __m128 tmp1 = _mm_unpackhi_ps(row[0], row[1]);
    __m128 tmp2 = _mm_unpackhi_ps(row[2], row[3]);
    tmp0 = _mm_add_ps(tmp0, tmp1);
    tmp1 = _mm_unpacklo_ps(row[2], row[3]);
    tmp1 = _mm_add_ps(tmp1, tmp2);
    tmp2 = _mm_movehl_ps(tmp1, tmp0);
    tmp0 = _mm_movelh_ps(tmp0, tmp1);
    return _mm_add_ps(tmp0, tmp2);
#endif
}
float reduction_sum_sse(float *v, int n)
{
    int i;
    float sum;
    __m128 *v4 = (__m128 *)v;
    __m128 vsum = _mm_set1_ps(0.0f);

    for (i = 0; i < n / 4; i++)
        vsum = _mm_add_ps(vsum, v4[i]);

    vsum = _mm_hadd_ps(vsum, vsum);
    vsum = _mm_hadd_ps(vsum, vsum);
    _mm_store_ss(&sum, vsum);
    return sum;
}
core::F32_t core::Vector3::dot(core::Vector3 &a, core::Vector3 &b)
{
    ALIGNED_16 core::F32_t aVector[] = {a.x, a.y, a.z, 0};
    ALIGNED_16 core::F32_t bVector[] = {b.x, b.y, b.z, 0};

    __m128 ma;
    __m128 mb;
    //__m128 mr;

    ma = _mm_load_ps(aVector);
    mb = _mm_load_ps(bVector);

    ALIGNED_16 core::F32_t res[4];
    ma = _mm_mul_ps(ma, mb);
    ma = _mm_hadd_ps(ma, ma);
    ma = _mm_hadd_ps(ma, ma);
    _mm_store_ps(&res[0], ma);

    return res[0];
}
const Vec3 &normalize()
{
    w = 0.f;
    __m128 D = m128;
    D = _mm_mul_ps(D, D);
    D = _mm_hadd_ps(D, D);
    D = _mm_hadd_ps(D, D);

    // 1 iteration of Newton-Raphson -- Idea from Intel's Embree.
    __m128 r = _mm_rsqrt_ps(D);
    r = _mm_add_ps(
        _mm_mul_ps(_mm_set_ps(1.5f, 1.5f, 1.5f, 1.5f), r),
        _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(D, _mm_set_ps(-0.5f, -0.5f, -0.5f, -0.5f)), r),
                   _mm_mul_ps(r, r)));

    m128 = _mm_mul_ps(m128, r);
    return *this;
}
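The refinement above is the standard Newton-Raphson update for the reciprocal square root: given d and an estimate r ≈ 1/sqrt(d), the refined value is r*(1.5 - 0.5*d*r*r). A scalar sketch of the same update (illustrative only, not part of the original class):

// One Newton-Raphson refinement of an approximate reciprocal square root.
// This is what the _mm_add_ps/_mm_mul_ps sequence above evaluates per lane.
static inline float refine_rsqrt(float d, float r)
{
    return r * (1.5f - 0.5f * d * r * r);
}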
float DotProductSIMD(const float* a, const float* b, std::size_t n)
{
    std::size_t i = 0;
    __m128 sum = _mm_setzero_ps();
    for (; i < ROUND_DOWN(n, 4); i += 4) {
        __m128 x = _mm_loadu_ps(a + i);
        __m128 y = _mm_loadu_ps(b + i);  // load from b (the original loaded a twice)
        x = _mm_mul_ps(x, y);
        sum = _mm_add_ps(x, sum);
    }
    sum = _mm_hadd_ps(sum, sum);
    sum = _mm_hadd_ps(sum, sum);
    float product = _mm_cvtss_f32(sum);
    for (; i < n; i++) {
        product += a[i] * b[i];
    }
    return product;
}
int distance_scan_to_map(map_t * map, scan_t * scan, position_t position)
{
    int npoints = 0;  /* number of points where scan matches map */
    int64_t sum = 0;  /* sum of map values at those points */

    /* Pre-compute sine and cosine of angle for rotation */
    double position_theta_radians = radians(position.theta_degrees);
    double costheta = cos(position_theta_radians) * map->scale_pixels_per_mm;
    double sintheta = sin(position_theta_radians) * map->scale_pixels_per_mm;

    /* Pre-compute pixel offset for translation */
    double pos_x_pix = position.x_mm * map->scale_pixels_per_mm;
    double pos_y_pix = position.y_mm * map->scale_pixels_per_mm;

    __m128 sincos128 = _mm_set_ps(costheta, -sintheta, sintheta, costheta);
    __m128 posxy128  = _mm_set_ps(pos_x_pix, pos_y_pix, pos_x_pix, pos_y_pix);

    int i = 0;
    for (i = 0; i < scan->npoints; i++) {
        /* Consider only scan points representing obstacles */
        if (scan->value[i] == OBSTACLE) {
            /* Compute coordinate pair using SSE */
            __m128 xy128 = _mm_set_ps(scan->x_mm[i], scan->y_mm[i], scan->x_mm[i], scan->y_mm[i]);
            xy128 = _mm_mul_ps(sincos128, xy128);
            xy128 = _mm_hadd_ps(xy128, xy128);
            xy128 = _mm_add_ps(xy128, posxy128);

            cs_pos_mmx_t pos;
            pos.mmx = _mm_cvtps_pi32(xy128);

            /* Extract coordinates */
            int x = pos.pos.x;
            int y = pos.pos.y;

            /* Empty the multimedia state to avoid floating-point errors later */
            _mm_empty();

            /* Add point if in map bounds */
            if (x >= 0 && x < map->size_pixels && y >= 0 && y < map->size_pixels) {
                sum += map->pixels[y * map->size_pixels + x];
                npoints++;
            }
        }
    }

    /* Return sum scaled by number of points, or -1 if none */
    return npoints ? (int)(sum * 1024 / npoints) : -1;
}
v4f step_t::operator () (float t) const
{
    // Evaluate the polynomial f by Estrin's method. Return
    //   (0 0 0 0) if t < t0,
    //   (f f f f) if t0 <= t < t1,
    //   (1 1 1 1) if t > t1.
    v4f c4 = load4f (c);
    v4f one = { 1.0f, 1.0f, 1.0f, 1.0f };
    v4f tttt = _mm_set1_ps (t);              // t t t t
    v4f tt = _mm_unpacklo_ps (one, tttt);    // 1 t 1 t
    v4f f0 = c4 * tt;                        // c0 c1*t c2 c3*t
    v4f ha = _mm_hadd_ps (f0, f0) * tt * tt;
    v4f f = _mm_hadd_ps (ha, ha);            // f f f f
    v4f f1 = _mm_unpacklo_ps (f, one);       // f 1 f 1
    v4f tx = load4f (T);                     // t0 t1 t1 inf
    v4f lo = _mm_movelh_ps (tx, tx);         // t0 t1 t0 t1
    v4f hi = _mm_movehl_ps (tx, tx);         // t1 inf t1 inf
    v4f sel = _mm_and_ps (_mm_cmpge_ps (tttt, lo), _mm_cmplt_ps (tttt, hi));
    v4f val = _mm_and_ps (sel, f1);          // f? 1? f? 1?
    return _mm_hadd_ps (val, val);
}
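For reference, the grouping the comment calls Estrin's method evaluates the cubic as (c0 + c1*t) + (c2 + c3*t)*t², which is exactly what the multiply/hadd sequence above computes across lanes. A scalar sketch with a hypothetical helper name:

// Estrin's scheme for a cubic: pair the terms, then combine the pairs with t^2.
// f(t) = c0 + c1*t + c2*t^2 + c3*t^3 = (c0 + c1*t) + (c2 + c3*t) * t^2
static inline float estrin_cubic(const float c[4], float t)
{
    float t2 = t * t;
    return (c[0] + c[1] * t) + (c[2] + c[3] * t) * t2;
}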
// ~~~~~~~~~~~~~~~ Task3
void mulMatrix_IJKAlgSse(MATRIX_TYPE** first, MATRIX_TYPE** second, MATRIX_TYPE** result, size_t size)
{
    transpose(second, size);
    for (size_t i = 0; i < size; i++) {
        for (size_t j = 0; j < size; j++) {
            __m128 temp = _mm_setzero_ps();
            for (size_t k = 0; k < size; k += 4) {
                __m128 tempFirst = _mm_load_ps(&first[i][k]);
                __m128 tempSecond = _mm_load_ps(&second[j][k]);
                temp = _mm_add_ps(temp, _mm_mul_ps(tempFirst, tempSecond));
            }
            temp = _mm_hadd_ps(temp, temp);
            temp = _mm_hadd_ps(temp, temp);
            _mm_store_ss(&result[i][j], temp);
        }
    }
    transpose(second, size);
}
SSH float* ssh_mtx_mtx(const float* m1, const float* m2)
{
    static float flt[16];
    __m128 _m[4];
    _m[0] = _mm_set_ps(m2[0], m2[4], m2[8],  m2[12]);
    _m[1] = _mm_set_ps(m2[1], m2[5], m2[9],  m2[13]);
    _m[2] = _mm_set_ps(m2[2], m2[6], m2[10], m2[14]);
    _m[3] = _mm_set_ps(m2[3], m2[7], m2[11], m2[15]);
    for (ssh_u i = 0; i < 4; i++) {
        for (ssh_u j = 0; j < 4; j++) {
            __m128 _tmp(_mm_mul_ps(*(__m128*)&m1[i * 4], _m[j]));
            _tmp = _mm_hadd_ps(_tmp, _tmp);
            flt[i * 4 + j] = _mm_hadd_ps(_tmp, _tmp).m128_f32[0];
        }
    }
    return flt;
}
/** transform vector by rigid transform */
inline Matrix<float, 4, 1> operator * (const RigidTransform<float>& mat, const Matrix<float, 4, 1>& vec)
{
#ifdef SIMPLE_GL_USE_SSE4
    __m128 res;
    __m128 dotProd;
    res     = _mm_dp_ps(mat[0].m128, vec.m128, 0xEE);
    dotProd = _mm_dp_ps(mat[1].m128, vec.m128, 0xEE);
    res     = _mm_blend_ps( res, dotProd, _MM_SHUFFLE(0, 1, 1, 1) );
    dotProd = _mm_dp_ps(mat[2].m128, vec.m128, 0xEE);
    res     = _mm_blend_ps( res, dotProd, _MM_SHUFFLE(0, 0, 1, 1) );
    dotProd = _mm_dp_ps(mat[3].m128, vec.m128, 0xEE);
    res     = _mm_blend_ps( res, dotProd, _MM_SHUFFLE(0, 0, 0, 1) );
    return Matrix<float, 4, 1>(res);
#elif defined(SIMPLE_GL_USE_SSE3)
    __m128 res;

    __m128 dotProd0 = _mm_mul_ps(mat[0].m128, vec.m128);
    dotProd0 = _mm_hadd_ps(dotProd0, dotProd0);
    dotProd0 = _mm_hadd_ps(dotProd0, dotProd0);

    __m128 dotProd1 = _mm_mul_ps(mat[1].m128, vec.m128);
    dotProd1 = _mm_hadd_ps(dotProd1, dotProd1);
    dotProd1 = _mm_hadd_ps(dotProd1, dotProd1);

    __m128 dotProd2 = _mm_mul_ps(mat[2].m128, vec.m128);
    dotProd2 = _mm_hadd_ps(dotProd2, dotProd2);
    dotProd2 = _mm_hadd_ps(dotProd2, dotProd2);

    __m128 dotProd3 = _mm_mul_ps(mat[3].m128, vec.m128);
    dotProd3 = _mm_hadd_ps(dotProd3, dotProd3);
    dotProd3 = _mm_hadd_ps(dotProd3, dotProd3);

    __m128 vec01 = _mm_unpacklo_ps(dotProd0, dotProd1);
    __m128 vec23 = _mm_unpackhi_ps(dotProd2, dotProd3);
    res = _mm_movelh_ps(vec01, vec23);
    return Matrix<float, 4, 1>(res);
#else // SSE2
    // TODO: Think about good sse optimization
    // Note: the fallback must read from vec; the original read the uninitialized res.
    Matrix<float, 4, 1> res;
    res[0] = mat[0][0] * vec[0] + mat[0][1] * vec[1] + mat[0][2] * vec[2] + mat[0][3] * vec[3];
    res[1] = mat[1][0] * vec[0] + mat[1][1] * vec[1] + mat[1][2] * vec[2] + mat[1][3] * vec[3];
    res[2] = mat[2][0] * vec[0] + mat[2][1] * vec[1] + mat[2][2] * vec[2] + mat[2][3] * vec[3];
    res[3] = mat[3][0] * vec[0] + mat[3][1] * vec[1] + mat[3][2] * vec[2] + mat[3][3] * vec[3];
    return res;
#endif
}
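The SSE4 branch above builds the result with _mm_dp_ps and blends. A more conventional formulation (a sketch with assumed names row0..row3 and v, not the library's code) uses the low nibble of the dot-product mask to route each sum into a distinct output lane, so the four partial results can simply be OR'd together:

// Sketch: 4x4 matrix * vector with _mm_dp_ps (SSE4.1).
// The high mask nibble (0xF) multiplies all four lanes; the low nibble selects
// the single output lane, the other lanes being zeroed, so OR combines them.
__m128 r0 = _mm_dp_ps(row0, v, 0xF1); // dot product in lane 0
__m128 r1 = _mm_dp_ps(row1, v, 0xF2); // dot product in lane 1
__m128 r2 = _mm_dp_ps(row2, v, 0xF4); // dot product in lane 2
__m128 r3 = _mm_dp_ps(row3, v, 0xF8); // dot product in lane 3
__m128 res = _mm_or_ps(_mm_or_ps(r0, r1), _mm_or_ps(r2, r3));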
float vsum(const float *a, int _n)
{
    float sum;
    int n = _n - _n % 4;  // round down to a multiple of 4 (the original used _n % 3, which breaks the assert)
    __m128 vsum = _mm_set1_ps(0.0f);
    assert((n & 3) == 0);
    assert(((uintptr_t)a & 15) == 0);

    for (int i = 0; i < n; i += 4) {
        __m128 v = _mm_load_ps(&a[i]);
        vsum = _mm_add_ps(vsum, v);
    }
    vsum = _mm_hadd_ps(vsum, vsum);
    vsum = _mm_hadd_ps(vsum, vsum);
    _mm_store_ss(&sum, vsum);

    for (int i = n; i < _n; i++) {
        sum += a[i];
    }
    return sum;
}
void lx_matmul_sse3_aligned(const float *in_A, const float *in_x, float *out_y, LXInteger n)
{
    // out_y is written to, so it is not const-qualified (the original declared it const and cast that away).
    __m128 A0 = _mm_load_ps(in_A + 0);
    __m128 A1 = _mm_load_ps(in_A + 4);
    __m128 A2 = _mm_load_ps(in_A + 8);
    __m128 A3 = _mm_load_ps(in_A + 12);

    for (LXInteger i = 0; i < n; i++) {
        __m128 x = _mm_load_ps(in_x + i*4);
        __m128 m0 = _mm_mul_ps(A0, x);
        __m128 m1 = _mm_mul_ps(A1, x);
        __m128 m2 = _mm_mul_ps(A2, x);
        __m128 m3 = _mm_mul_ps(A3, x);

        __m128 sum_01 = _mm_hadd_ps(m0, m1);
        __m128 sum_23 = _mm_hadd_ps(m2, m3);
        __m128 result = _mm_hadd_ps(sum_01, sum_23);

        _mm_store_ps(out_y + i*4, result);
    }
}
static void matvec_sse()
{
    /* Assume that the data size is an even multiple of the 128 bit
     * SSE vectors (i.e. 4 floats) */
    assert(!(SIZE & 0x3));

    /* TASK: Implement your SSE version of the matrix-vector
     * multiplication here.
     */
    /* HINT: You might find at least the following instructions
     * useful:
     *  - _mm_setzero_ps
     *  - _mm_load_ps
     *  - _mm_hadd_ps
     *  - _mm_cvtss_f32
     *
     * HINT: You can create the sum of all elements in a vector
     * using two hadd instructions.
     */
    __m128 dummy = _mm_setzero_ps();
    for (int i = 0; i < SIZE; ++i) {
        __m128 temp = _mm_setzero_ps();
        for (int j = 0; j < SIZE; j += 4) {
            /* _mm_load_ps expects a float pointer; the original cast to __m128* was incorrect */
            __m128 mm_vec_b = _mm_load_ps(vec_b + j);
            __m128 mm_matr = _mm_load_ps(mat_a + MINDEX(i, j));
            __m128 out = _mm_mul_ps(mm_vec_b, mm_matr);
            temp = _mm_add_ps(temp, out);
            // vec_c[i] += _mm_cvtss_f32(_mm_dp_ps(mm_matr, mm_vec_b, 0xf1));
        }
        __m128 res = _mm_hadd_ps(_mm_hadd_ps(temp, dummy), dummy);
        vec_c[i] = _mm_cvtss_f32(res);
    }
}
#include <stdio.h>
#include <pmmintrin.h>  /* SSE3: _mm_hadd_ps */

int main()
{
    typedef union {
        __m128 m128;
        float flt[4];
    } m128f;

    __m128 x = {1.0, 2.0, 3.0, 4.0};
    __m128 y = {10.0, 20.0, 30.0, 40.0};
    m128f s, h;

    s.m128 = haddps(x, y);       /* software emulation (definition not shown here) */
    h.m128 = _mm_hadd_ps(x, y);  /* hardware SSE3 instruction */

    printf("Software hadd: %f %f %f %f\n", s.flt[0], s.flt[1], s.flt[2], s.flt[3]);
    printf("Hardware hadd: %f %f %f %f\n", h.flt[0], h.flt[1], h.flt[2], h.flt[3]);
    return 0;  /* main returns int, so the bare "return;" was invalid */
}
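The comparison above relies on a software haddps helper whose definition is not shown. A plausible emulation, as an assumption about what it might look like using only SSE shuffles (the hardware instruction computes [a0+a1, a2+a3, b0+b1, b2+b3]):

/* Hypothetical software emulation of _mm_hadd_ps. */
static __m128 haddps(__m128 a, __m128 b)
{
    __m128 lo = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); /* a0 a2 b0 b2 */
    __m128 hi = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); /* a1 a3 b1 b3 */
    return _mm_add_ps(lo, hi);                                 /* a0+a1 a2+a3 b0+b1 b2+b3 */
}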