int main(void)
{
    // testing dense matrix //

    /****************************************************/
    /* LtkInt2x2 a; */
    /* LtkInt4x1 b; */
    /* LtkInt4x4 c; */
    /* LtkInt4 v, w; */
    /* */
    /* v = LtkInt4_ltk_vector_new (); */
    /* w = LtkInt4_ltk_vector_new (); */
    /* */
    /* int i; */
    /* for (i = 0; i < LtkInt4_ltk_vector_size (); ++i) */
    /* { */
    /*     LtkInt4_ltk_vector_set (v, i, 0); */
    /*     LtkInt4_ltk_vector_set (w, i, 3); */
    /* } */
    /* */
    /* printf ("v = "); */
    /* for (i = 0; i < 4; ++i) */
    /*     printf ("%d ", LtkInt4_ltk_vector_get (v, i)); */
    /* printf ("\n"); */
    /* */
    /* printf ("w = "); */
    /* for (i = 0; i < 4; ++i) */
    /*     printf ("%d ", LtkInt4_ltk_vector_get (w, i)); */
    /* printf ("\n"); */
    /****************************************************/

    // testing intrinsics //
    printf("INSTRUCTION SET -> %d\n", INSTRSET);

    __m128 aligned_float = _mm_setzero_ps();
    float *p = (float*)&aligned_float;   // view the vector as four floats
    printf("%f,%f,%f,%f\n", p[0], p[1], p[2], p[3]);

    // adder //
    __m128 a_1, a_2;
    a_1 = _mm_set_ps(3, 3, 3, 3);
    a_2 = _mm_set_ps(1, 2, 3, 4);
    aligned_float = _mm_add_ps(a_1, a_2);
    p = (float*)&aligned_float;
    printf("%f,%f,%f,%f\n", p[0], p[1], p[2], p[3]);

    // testing Objects //
    //LTK_MATRIX_DECLARE(TypeName, type, r, c);
    // Object *ob = New(ObjectClass);
    // int el = ObjectClass->GetElement(ob);

    return 0;
}
inline IsotropicDipoleQuery(const Spectrum &_zr, const Spectrum &_zv,
        const Spectrum &_sigmaTr, Float Fdt, const Point &p)
    : Fdt(Fdt), p(p) {
    zr = _mm_set_ps(_zr[0], _zr[1], _zr[2], 0);
    zv = _mm_set_ps(_zv[0], _zv[1], _zv[2], 0);
    sigmaTr = _mm_set_ps(_sigmaTr[0], _sigmaTr[1], _sigmaTr[2], 0);
    zrSqr = _mm_mul_ps(zr, zr);
    zvSqr = _mm_mul_ps(zv, zv);
    result.ps = _mm_setzero_ps();
    count = 0;
}
static double rcp_d(double x)
{
    __m128d xd = _mm_load_sd(&x);
    // initial single-precision estimate of 1/x from the fast rcpss approximation
    double xi = _mm_cvtss_f32(_mm_rcp_ss(_mm_cvtsd_ss(_mm_setzero_ps(), xd)));
    // two Newton-Raphson iterations refine the estimate toward full precision
    xi = xi + xi * (1.0 - x * xi);
    xi = xi + xi * (1.0 - x * xi);
    return xi;
}
template <bool align> SIMD_INLINE void HogDirectionHistograms(const __m128 & dx, const __m128 & dy, Buffer & buffer, size_t col)
{
    __m128 bestDot = _mm_setzero_ps();
    __m128i bestIndex = _mm_setzero_si128();
    for(int i = 0; i < buffer.size; ++i)
    {
        __m128 dot = _mm_add_ps(_mm_mul_ps(dx, buffer.cos[i]), _mm_mul_ps(dy, buffer.sin[i]));
        __m128 mask = _mm_cmpgt_ps(dot, bestDot);
        bestDot = _mm_max_ps(dot, bestDot);
        bestIndex = Combine(_mm_castps_si128(mask), buffer.pos[i], bestIndex);

        dot = _mm_sub_ps(_mm_setzero_ps(), dot);
        mask = _mm_cmpgt_ps(dot, bestDot);
        bestDot = _mm_max_ps(dot, bestDot);
        bestIndex = Combine(_mm_castps_si128(mask), buffer.neg[i], bestIndex);
    }
    Store<align>((__m128i*)(buffer.index + col), bestIndex);
    Sse::Store<align>(buffer.value + col, _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(dx, dx), _mm_mul_ps(dy, dy))));
}
F32 Aabb::testPlane(const Plane& p) const
{
    const Aabb& aabb = *this;

#if ANKI_SIMD == ANKI_SIMD_SSE
    __m128 gezero = _mm_cmpge_ps(p.getNormal().getSimd(), _mm_setzero_ps());

    Vec4 diagMin;
    diagMin.getSimd() =
        _mm_or_ps(_mm_and_ps(gezero, aabb.getMin().getSimd()),
                  _mm_andnot_ps(gezero, aabb.getMax().getSimd()));
#else
    Vec4 diagMin(0.0), diagMax(0.0);
    // set min/max values for x,y,z direction
    for(U i = 0; i < 3; i++)
    {
        if(p.getNormal()[i] >= 0.0)
        {
            diagMin[i] = aabb.getMin()[i];
            diagMax[i] = aabb.getMax()[i];
        }
        else
        {
            diagMin[i] = aabb.getMax()[i];
            diagMax[i] = aabb.getMin()[i];
        }
    }
#endif

    // minimum on positive side of plane, box on positive side
    ANKI_ASSERT(diagMin.w() == 0.0);
    F32 test = p.test(diagMin);
    if(test > 0.0)
    {
        return test;
    }

#if ANKI_SIMD == ANKI_SIMD_SSE
    Vec4 diagMax;
    diagMax.getSimd() =
        _mm_or_ps(_mm_and_ps(gezero, aabb.getMax().getSimd()),
                  _mm_andnot_ps(gezero, aabb.getMin().getSimd()));
#endif

    ANKI_ASSERT(diagMax.w() == 0.0);
    test = p.test(diagMax);
    if(test >= 0.0)
    {
        // min on non-positive side, max on non-negative side, intersection
        return 0.0;
    }
    else
    {
        // max on negative side, box on negative side
        return test;
    }
}
// use MMX/SSE extensions (unrolled)
void dotprod_rrrf_execute_sse4u(dotprod_rrrf _q, float * _x, float * _y)
{
    __m128 v0, v1, v2, v3;
    __m128 h0, h1, h2, h3;
    __m128 s0, s1, s2, s3;
    __m128 sum = _mm_setzero_ps();  // load zeros into sum register

    // t = 4*(floor(_n/16))
    unsigned int r = (_q->n >> 4) << 2;

    //
    unsigned int i;
    for (i=0; i<r; i+=4) {
        // load inputs into register (unaligned)
        v0 = _mm_loadu_ps(&_x[4*i+ 0]);
        v1 = _mm_loadu_ps(&_x[4*i+ 4]);
        v2 = _mm_loadu_ps(&_x[4*i+ 8]);
        v3 = _mm_loadu_ps(&_x[4*i+12]);

        // load coefficients into register (aligned)
        h0 = _mm_load_ps(&_q->h[4*i+ 0]);
        h1 = _mm_load_ps(&_q->h[4*i+ 4]);
        h2 = _mm_load_ps(&_q->h[4*i+ 8]);
        h3 = _mm_load_ps(&_q->h[4*i+12]);

        // compute dot products (the mask is an 8-bit immediate: multiply all lanes, broadcast the sum)
        s0 = _mm_dp_ps(v0, h0, 0xff);
        s1 = _mm_dp_ps(v1, h1, 0xff);
        s2 = _mm_dp_ps(v2, h2, 0xff);
        s3 = _mm_dp_ps(v3, h3, 0xff);

        // parallel addition
        // FIXME: these additions are by far the limiting factor
        sum = _mm_add_ps( sum, s0 );
        sum = _mm_add_ps( sum, s1 );
        sum = _mm_add_ps( sum, s2 );
        sum = _mm_add_ps( sum, s3 );
    }

    // aligned output array
    float w[4] __attribute__((aligned(16)));

    // unload packed array
    _mm_store_ps(w, sum);
    float total = w[0];

    // cleanup
    for (i=4*r; i<_q->n; i++)
        total += _x[i] * _q->h[i];

    // set return value
    *_y = total;
}
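// The FIXME above points at the serial adds into "sum" as the bottleneck. A common
// remedy is to keep independent accumulators and combine them once at the end, so the
// additions no longer form one dependency chain. Hedged sketch on plain float arrays
// (illustrative only -- not the library's actual code; assumes n is a multiple of 16
// and h is 16-byte aligned):
static float dotprod_sse4_4acc(const float * x, const float * h, unsigned int n)
{
    __m128 sum0 = _mm_setzero_ps(), sum1 = _mm_setzero_ps();
    __m128 sum2 = _mm_setzero_ps(), sum3 = _mm_setzero_ps();
    unsigned int i;
    for (i = 0; i < n; i += 16) {
        // each accumulator owns its own partial sum, breaking the add dependency chain
        sum0 = _mm_add_ps(sum0, _mm_dp_ps(_mm_loadu_ps(&x[i+ 0]), _mm_load_ps(&h[i+ 0]), 0xff));
        sum1 = _mm_add_ps(sum1, _mm_dp_ps(_mm_loadu_ps(&x[i+ 4]), _mm_load_ps(&h[i+ 4]), 0xff));
        sum2 = _mm_add_ps(sum2, _mm_dp_ps(_mm_loadu_ps(&x[i+ 8]), _mm_load_ps(&h[i+ 8]), 0xff));
        sum3 = _mm_add_ps(sum3, _mm_dp_ps(_mm_loadu_ps(&x[i+12]), _mm_load_ps(&h[i+12]), 0xff));
    }
    // combine the four partial sums once, then extract the scalar result
    __m128 sum = _mm_add_ps(_mm_add_ps(sum0, sum1), _mm_add_ps(sum2, sum3));
    float w[4] __attribute__((aligned(16)));
    _mm_store_ps(w, sum);
    return w[0];
}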
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
    int i;
    int limit = data_len - 8;
    __m128 sum0, sum1;

    (void) lag;
    FLAC__ASSERT(lag <= 8);
    FLAC__ASSERT(lag <= data_len);

    sum0 = _mm_setzero_ps();
    sum1 = _mm_setzero_ps();

    for(i = 0; i <= limit; i++) {
        __m128 d, d0, d1;
        d0 = _mm_loadu_ps(data+i);
        d1 = _mm_loadu_ps(data+i+4);
        d = d0;
        d = _mm_shuffle_ps(d, d, 0);
        sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
        sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
    }

    {
        __m128 d0 = _mm_setzero_ps();
        __m128 d1 = _mm_setzero_ps();
        limit++;
        if(limit < 0)
            limit = 0;

        for(i = data_len-1; i >= limit; i--) {
            __m128 d;
            d = _mm_load_ss(data+i);
            d = _mm_shuffle_ps(d, d, 0);

            d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
            d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
            d1 = _mm_move_ss(d1, d0);
            d0 = _mm_move_ss(d0, d);
            sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
            sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
        }
    }

    _mm_storeu_ps(autoc,   sum0);
    _mm_storeu_ps(autoc+4, sum1);
}
// --------------------------------------------------------------
vuint32 mandelbrot_simd(vfloat32 a, vfloat32 b, size_t max_iter)
// --------------------------------------------------------------
{
    vuint32 num_iter = _mm_set1_epi32(0);
    vfloat32 zero = _mm_set1_ps(0);
    vfloat32 one  = _mm_set1_ps(1);
    vfloat32 two  = _mm_set1_ps(2);
    vfloat32 x = _mm_setzero_ps();
    vfloat32 y = _mm_setzero_ps();
    vfloat32 tmp;
    vfloat32 z;

    for(size_t i=0; i<max_iter; i++){
        tmp = x;
        // x' = x^2 - y^2 + a
        x = _mm_add_ps(a, _mm_sub_ps(_mm_mul_ps(x,x), _mm_mul_ps(y,y)));
        // y' = 2*x*y + b
        y = _mm_add_ps(b, _mm_mul_ps(_mm_mul_ps(y,tmp), two));
        // |z| = sqrt(x^2 + y^2)
        z = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(x,x), _mm_mul_ps(y,y)));
        // count one more iteration in every lane that has not escaped (|z| < 2)
        num_iter = _mm_add_epi32(num_iter, _mm_cvtps_epi32(_mm_and_ps(_mm_cmplt_ps(z,two), one)));
    }
    return num_iter;
}
int conv2D(float* in, float* out, int data_size_X, int data_size_Y, float* kernel)
{
    // the x coordinate of the kernel's center
    int kern_cent_X = (KERNX - 1)/2;
    // the y coordinate of the kernel's center
    int kern_cent_Y = (KERNY - 1)/2;

    // we want to flip the kernel first so that it doesn't happen in every single iteration of the loop.
    int length = KERNX*KERNY;
    float flipped_kernel[length];
    for (int i = 0; i < length; i++) {
        flipped_kernel[length - (i+1)] = kernel[i];
    }

    float *padded_in = calloc((data_size_Y+2)*(data_size_X+2), sizeof(float));
    int d_x = data_size_X + 2;
    int size_f = sizeof(float)*data_size_X;
    float* pad_1 = padded_in + 1;
    for (int i = 1; i <= data_size_Y; i++) {
        memcpy(pad_1 + i*d_x, in + (data_size_X*(i-1)), size_f);
    }

    //__m128 vpad;
    //__m128 vfk;
    //float final[4];
    for(int y = 1; y <= data_size_Y/4*4; y+=4) {
        for(int x = 1; x <= data_size_X/4*4; x+=4) {
            int xy_location = (x-1) + (y-1)*data_size_X;
            __m128 res = _mm_setzero_ps();
            for(int i = -kern_cent_X; i <= kern_cent_X; i++) {
                for(int j = -kern_cent_Y; j <= kern_cent_Y; j++){
                    out[xy_location] += flipped_kernel[(kern_cent_X + i) + (kern_cent_Y + j)*KERNY]
                                        * padded_in[(x+i) + (y+j) * d_x];
                    // vfk = _mm_loadu_ps(flipped_kernel + (kern_cent_X + i) + (kern_cent_Y + j) * KERNY);
                    // vpad = _mm_loadu_ps(padded_in + (x+i) + (y+j) * d_x);
                    // res = _mm_add_ps(res, _mm_mul_ps(vfk, vpad));
                }
                // _mm_storeu_ps(final, res);
                // out[xy_location] = final[0] + final[1] + final[2] + final[3];
                //
                //
                // (insert tail case here)
            }
        }
    }
    free(padded_in);
    return 1;
}
static inline void foo (__m128 *x)
{
    __m128 y = _mm_setzero_ps ();
    __m128 v = _mm_movehl_ps (y, *x);
    __m128 w = _mm_movehl_ps (*x, y);
    check (*x, 9, 1, 2, -3);
    check (v, 2, -3, 0, 0);
    check (w, 0, 0, 2, -3);
}
// use MMX/SSE extensions
void dotprod_crcf_execute_mmx(dotprod_crcf _q, float complex * _x, float complex * _y)
{
    // type cast input as floating point array
    float * x = (float*) _x;

    // double effective length
    unsigned int n = 2*_q->n;

    // first cut: ...
    __m128 v;   // input vector
    __m128 h;   // coefficients vector
    __m128 s;   // dot product
    __m128 sum = _mm_setzero_ps();  // load zeros into sum register

    // t = 4*(floor(_n/4))
    unsigned int t = (n >> 2) << 2;

    //
    unsigned int i;
    for (i=0; i<t; i+=4) {
        // load inputs into register (unaligned)
        v = _mm_loadu_ps(&x[i]);

        // load coefficients into register (aligned)
        h = _mm_load_ps(&_q->h[i]);

        // compute multiplication
        s = _mm_mul_ps(v, h);

        // accumulate
        sum = _mm_add_ps(sum, s);
    }

    // aligned output array
    float w[4] __attribute__((aligned(16)));

    // unload packed array
    _mm_store_ps(w, sum);

    // add in-phase and quadrature components
    w[0] += w[2];
    w[1] += w[3];

    // cleanup (note: n _must_ be even)
    for (; i<n; i+=2) {
        w[0] += x[i  ] * _q->h[i  ];
        w[1] += x[i+1] * _q->h[i+1];
    }

    // set return value
    *_y = w[0] + _Complex_I*w[1];
}
__m128 Expression::EvaluateVelocity(EntityContext &aContext)
{
    if (const Entity *entity = Database::entity.Get(aContext.mId))
    {
        return _mm_setr_ps(entity->GetVelocity().x, entity->GetVelocity().y, entity->GetOmega(), 0.0f);
    }
    else
    {
        return _mm_setzero_ps();
    }
}
void fDCT2x2_2pack_32f_and_thresh_and_iDCT2x2_2pack(float* src, float* dest, float thresh)
{
    __m128 ms0 = _mm_load_ps(src);
    __m128 ms1 = _mm_load_ps(src + 4);
    const __m128 mm = _mm_set1_ps(0.5f);

    __m128 a = _mm_add_ps(ms0, ms1);
    __m128 b = _mm_sub_ps(ms0, ms1);
    __m128 t1 = _mm_unpacklo_ps(a, b);
    __m128 t2 = _mm_unpackhi_ps(a, b);
    ms0 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(1, 0, 1, 0));
    ms1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3, 2, 3, 2));
    a = _mm_mul_ps(mm, _mm_add_ps(ms0, ms1));
    b = _mm_mul_ps(mm, _mm_sub_ps(ms0, ms1));

    const int __declspec(align(16)) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
    const __m128 mth = _mm_set1_ps(thresh);

    __m128 msk = _mm_cmpgt_ps(_mm_and_ps(a, *(const __m128*)v32f_absmask), mth);
    ms0 = _mm_blendv_ps(_mm_setzero_ps(), a, msk);
#ifdef _KEEP_00_COEF_
    ms0 = _mm_blend_ps(ms0, a, 1);
#endif
    msk = _mm_cmpgt_ps(_mm_and_ps(b, *(const __m128*)v32f_absmask), mth);
    ms1 = _mm_blendv_ps(_mm_setzero_ps(), b, msk);

    a = _mm_add_ps(ms0, ms1);
    b = _mm_sub_ps(ms0, ms1);
    t1 = _mm_unpacklo_ps(a, b);
    t2 = _mm_unpackhi_ps(a, b);
    ms0 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(1, 0, 1, 0));
    ms1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3, 2, 3, 2));
    a = _mm_mul_ps(mm, _mm_add_ps(ms0, ms1));
    b = _mm_mul_ps(mm, _mm_sub_ps(ms0, ms1));

    _mm_store_ps(dest, a);
    _mm_store_ps(dest + 4, b);
}
hv_size_t sSamphold_init(SignalSamphold *o) {
#if HV_SIMD_AVX
  o->s = _mm256_setzero_ps();
#elif HV_SIMD_SSE
  o->s = _mm_setzero_ps();
#elif HV_SIMD_NEON
  o->s = vdupq_n_f32(0.0f);
#else
  o->s = 0.0f;
#endif
  return 0;
}
__m128 exp_ps(__m128 x)
{
    typedef __m128  v4sf;
    typedef __m128i v4si;

    v4sf tmp = _mm_setzero_ps(), fx;
    v4si emm0;
    v4sf one = constants::ps_1.ps;

    x = _mm_min_ps(x, constants::exp_hi.ps);
    x = _mm_max_ps(x, constants::exp_lo.ps);

    /* express exp(x) as exp(g + n*log(2)) */
    fx = _mm_mul_ps(x, constants::cephes_LOG2EF.ps);
    fx = _mm_add_ps(fx, constants::ps_0p5.ps);

    /* how to perform a floorf with SSE: just below */
    emm0 = _mm_cvttps_epi32(fx);
    tmp  = _mm_cvtepi32_ps(emm0);

    /* if greater, subtract 1 */
    v4sf mask = _mm_cmpgt_ps(tmp, fx);
    mask = _mm_and_ps(mask, one);
    fx = _mm_sub_ps(tmp, mask);

    tmp = _mm_mul_ps(fx, constants::cephes_exp_C1.ps);
    v4sf z = _mm_mul_ps(fx, constants::cephes_exp_C2.ps);
    x = _mm_sub_ps(x, tmp);
    x = _mm_sub_ps(x, z);

    z = _mm_mul_ps(x, x);

    v4sf y = constants::cephes_exp_p0.ps;
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_exp_p1.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_exp_p2.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_exp_p3.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_exp_p4.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_exp_p5.ps);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, x);
    y = _mm_add_ps(y, one);

    /* build 2^n */
    emm0 = _mm_cvttps_epi32(fx);
    emm0 = _mm_add_epi32(emm0, constants::pi32_0x7f.pi);
    emm0 = _mm_slli_epi32(emm0, 23);
    v4sf pow2n = _mm_castsi128_ps(emm0);

    y = _mm_mul_ps(y, pow2n);
    return y;
}
__m128 Expression::EvaluatePosition(EntityContext &aContext)
{
    if (const Entity *entity = Database::entity.Get(aContext.mId))
    {
        const Transform2 transform = entity->GetInterpolatedTransform(sim_fraction);
        return _mm_setr_ps(transform.p.x, transform.p.y, transform.a, 1.0f);
    }
    else
    {
        return _mm_setzero_ps();
    }
}
hv_size_t sDel1_init(SignalDel1 *o) {
#if HV_SIMD_AVX
  o->x = _mm256_setzero_ps();
#elif HV_SIMD_SSE
  o->x = _mm_setzero_ps();
#elif HV_SIMD_NEON
  o->x = vdupq_n_f32(0.0f);
#else
  o->x = 0.0f;
#endif
  return 0;
}
// ~~~~~~~~~~~~~~~ Task1
void maxMatrixValueSse(MATRIX_TYPE** matrix, size_t size, MATRIX_TYPE &maxValue)
{
    // note: the zero initialisation assumes the matrix contains at least one non-negative value
    __m128 maxSse = _mm_setzero_ps();
    for (size_t i = 0; i < size; i++)
    {
        for (size_t j = 0; j < size; j += 4)
        {
            __m128 current = _mm_loadu_ps(&matrix[i][j]);
            maxSse = _mm_max_ps(maxSse, current);
        }
    }
    // horizontal max: reduce the four lanes to a single scalar
    maxSse = _mm_max_ps(maxSse, _mm_movehl_ps(maxSse, maxSse));
    maxSse = _mm_max_ss(maxSse, _mm_shuffle_ps(maxSse, maxSse, _MM_SHUFFLE(1, 1, 1, 1)));
    maxValue = _mm_cvtss_f32(maxSse);
}
void ProxyRwSse2 <SplFmt_FLOAT>::read_flt_partial (const PtrConst::Type &ptr, __m128 &src0, __m128 &src1, const __m128i &/*zero*/, int len)
{
    if (len >= 4)
    {
        src0 = _mm_loadu_ps (ptr);
        src1 = fstb::ToolsSse2::load_ps_partial (ptr + 4, len - 4);
    }
    else
    {
        src0 = fstb::ToolsSse2::load_ps_partial (ptr, len);
        src1 = _mm_setzero_ps ();
    }
}
BoundingBox BoundingBox::Transformed(const Matrix3x4& transform) const
{
#ifdef URHO3D_SSE
    const __m128 one = _mm_set_ss(1.f);
    __m128 minPt = _mm_movelh_ps(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)&min_.x_), _mm_unpacklo_ps(_mm_set_ss(min_.z_), one));
    __m128 maxPt = _mm_movelh_ps(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)&max_.x_), _mm_unpacklo_ps(_mm_set_ss(max_.z_), one));
    __m128 centerPoint = _mm_mul_ps(_mm_add_ps(minPt, maxPt), _mm_set1_ps(0.5f));
    __m128 halfSize = _mm_sub_ps(centerPoint, minPt);

    __m128 m0 = _mm_loadu_ps(&transform.m00_);
    __m128 m1 = _mm_loadu_ps(&transform.m10_);
    __m128 m2 = _mm_loadu_ps(&transform.m20_);

    __m128 r0 = _mm_mul_ps(m0, centerPoint);
    __m128 r1 = _mm_mul_ps(m1, centerPoint);
    __m128 t0 = _mm_add_ps(_mm_unpacklo_ps(r0, r1), _mm_unpackhi_ps(r0, r1));
    __m128 r2 = _mm_mul_ps(m2, centerPoint);
    const __m128 zero = _mm_setzero_ps();
    __m128 t2 = _mm_add_ps(_mm_unpacklo_ps(r2, zero), _mm_unpackhi_ps(r2, zero));
    __m128 newCenter = _mm_add_ps(_mm_movelh_ps(t0, t2), _mm_movehl_ps(t2, t0));

    const __m128 absMask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    __m128 x = _mm_and_ps(absMask, _mm_mul_ps(m0, halfSize));
    __m128 y = _mm_and_ps(absMask, _mm_mul_ps(m1, halfSize));
    __m128 z = _mm_and_ps(absMask, _mm_mul_ps(m2, halfSize));
    t0 = _mm_add_ps(_mm_unpacklo_ps(x, y), _mm_unpackhi_ps(x, y));
    t2 = _mm_add_ps(_mm_unpacklo_ps(z, zero), _mm_unpackhi_ps(z, zero));
    __m128 newDir = _mm_add_ps(_mm_movelh_ps(t0, t2), _mm_movehl_ps(t2, t0));

    return BoundingBox(_mm_sub_ps(newCenter, newDir), _mm_add_ps(newCenter, newDir));
#else
    Vector3 newCenter = transform * Center();
    Vector3 oldEdge = Size() * 0.5f;
    Vector3 newEdge = Vector3(
        Abs(transform.m00_) * oldEdge.x_ + Abs(transform.m01_) * oldEdge.y_ + Abs(transform.m02_) * oldEdge.z_,
        Abs(transform.m10_) * oldEdge.x_ + Abs(transform.m11_) * oldEdge.y_ + Abs(transform.m12_) * oldEdge.z_,
        Abs(transform.m20_) * oldEdge.x_ + Abs(transform.m21_) * oldEdge.y_ + Abs(transform.m22_) * oldEdge.z_
    );

    return BoundingBox(newCenter - newEdge, newCenter + newEdge);
#endif
}
float AudioBufferSumOfSquares_SSE(const float* aInput, uint32_t aLength)
{
    unsigned i;
    __m128 in0, in1, in2, in3,
           acc0, acc1, acc2, acc3;
    float out[4];

    ASSERT_ALIGNED16(aInput);
    ASSERT_MULTIPLE16(aLength);

    acc0 = _mm_setzero_ps();
    acc1 = _mm_setzero_ps();
    acc2 = _mm_setzero_ps();
    acc3 = _mm_setzero_ps();

    for (i = 0; i < aLength; i += 16) {
        in0 = _mm_load_ps(&aInput[i]);
        in1 = _mm_load_ps(&aInput[i + 4]);
        in2 = _mm_load_ps(&aInput[i + 8]);
        in3 = _mm_load_ps(&aInput[i + 12]);

        in0 = _mm_mul_ps(in0, in0);
        in1 = _mm_mul_ps(in1, in1);
        in2 = _mm_mul_ps(in2, in2);
        in3 = _mm_mul_ps(in3, in3);

        acc0 = _mm_add_ps(acc0, in0);
        acc1 = _mm_add_ps(acc1, in1);
        acc2 = _mm_add_ps(acc2, in2);
        acc3 = _mm_add_ps(acc3, in3);
    }

    acc0 = _mm_add_ps(acc0, acc1);
    acc0 = _mm_add_ps(acc0, acc2);
    acc0 = _mm_add_ps(acc0, acc3);

    _mm_store_ps(out, acc0);

    return out[0] + out[1] + out[2] + out[3];
}
static void matvec_sse()
{
    /* Assume that the data size is an even multiple of the 128 bit
     * SSE vectors (i.e. 4 floats) */
    assert(!(SIZE & 0x3));

    /* TASK: Implement your SSE version of the matrix-vector
     * multiplication here. */
    /* HINT: You might find at least the following instructions
     * useful:
     * - _mm_setzero_ps
     * - _mm_load_ps
     * - _mm_hadd_ps
     * - _mm_cvtss_f32
     *
     * HINT: You can create the sum of all elements in a vector
     * using two hadd instructions. */

    __m128 dummy = _mm_setzero_ps();
    for (int i = 0; i < SIZE; ++i) {
        __m128 temp = _mm_setzero_ps();
        for (int j = 0; j < SIZE; j += 4) {
            __m128 mm_vec_b = _mm_load_ps(vec_b + j);
            __m128 mm_matr  = _mm_load_ps(mat_a + MINDEX(i, j));
            __m128 out = _mm_mul_ps(mm_vec_b, mm_matr);
            temp = _mm_add_ps(temp, out);
            // vec_c[i] += _mm_cvtss_f32(_mm_dp_ps(mm_matr, mm_vec_b, 0xf1));
        }
        __m128 res = _mm_hadd_ps(_mm_hadd_ps(temp, dummy), dummy);
        vec_c[i] = _mm_cvtss_f32(res);
    }
}
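/* The hint above spells out the reduction in isolation: two hadd instructions followed
 * by a scalar extract. A minimal sketch of a helper built on it (the name hsum_ps is
 * ours, not part of the exercise code; requires SSE3 for _mm_hadd_ps): */
static inline float hsum_ps(__m128 v)
{
    v = _mm_hadd_ps(v, v);   /* (a+b, c+d, a+b, c+d) */
    v = _mm_hadd_ps(v, v);   /* every lane now holds a+b+c+d */
    return _mm_cvtss_f32(v); /* extract the lowest lane */
}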
void fconv_SSE( const Mat &A, const Mat &F, Mat &R )
{
    int RowA = A.rows, ColA = A.cols, NumFeatures = A.channels();
    int RowF = F.rows, ColF = F.cols, ChnF = F.channels();
    if( NumFeatures!=ChnF || (NumFeatures%4) )
        throw runtime_error("");
    int loop = NumFeatures / 4;

    int RowR = RowA - RowF + 1, ColR = ColA - ColF + 1;
    float *R_src0 = (float*)R.data;
    int Rstep = R.step1();
    const float *F_src = F.ptr<float>(0,0);
    const float *A_src0 = A.ptr<float>(0,0);

    __m128 a, b, c;
    for( int rr=0; rr<RowR; rr++ ){
        const float *A_src1 = A_src0 + rr*A.cols*NumFeatures;  // start addr of A.row(rr)
        float *R_scr1 = R_src0 + rr*Rstep;                     // start addr of R.row(rr)
        for( int cc=0; cc<ColR; cc++ ){
            const float *A_src = A_src1 + cc*NumFeatures;      // A.ptr<float>(rr,cc);
            float *R_src = R_scr1 + cc;
            // An acceleration trick of using SSE programming >>>
            __m128 v = _mm_setzero_ps();
            const float *B_off = F_src;
            for( int rp=0; rp<RowF; rp++ ){
                const float *A_off = A_src + rp*A.cols*NumFeatures;
                for( int cp=0; cp<ColF; cp++ ){
                    for( int l=0; l<loop; l++ ){
                        a = _mm_load_ps(A_off);
                        b = _mm_load_ps(B_off);
                        c = _mm_mul_ps(a, b);
                        v = _mm_add_ps(v, c);
                        A_off += 4;
                        B_off += 4;
                    }
                }
            }
#ifdef WIN32
            *R_src = v.m128_f32[0] + v.m128_f32[1] + v.m128_f32[2] + v.m128_f32[3];
#else
            float buf[4] __attribute__ ((aligned (16)));
            _mm_store_ps(buf, v);
            _mm_empty();
            *R_src = buf[0]+buf[1]+buf[2]+buf[3];
#endif
        }
    }
}
// Projection Matrix ------------------------------------------------------------------
static Float4x4 VFunction PerspectiveFovLH(Float fov, Float aspect, Float near, Float far)
{
    // n - rotation axis; a - theta; v - vector being rotated
    // v * cos(a) + (dot(v, n) * n * (1 - cos(a))) + (cross(n, v) * sin(a));

    // NOTE: PROJECTION MATRIX IS HARDCODED FOR Forward-Right-Up
    // THIS MEANS TRANSPOSE OF ZXY, WHICH IS YZX
    Float2 angles = SinCos(0.5f * fov);

    Float fRange = far / (far - near);
    // Note: This is recorded on the stack
    Float Height = angles.y / angles.x;
    Vector rMem = { Height / aspect, Height, fRange, -fRange * near };

    // Copy from memory to SSE register
    Vector vValues = rMem;
    Vector vTemp = _mm_setzero_ps();
    // Copy x only
    vTemp = _mm_move_ss(vTemp, vValues);

    // CosFov / SinFov,0,0,0
    Float4x4 M;
    M.y = vTemp;

    // 0,Height / AspectHByW,0,0
    vTemp = vValues;
    vTemp = _mm_and_ps(vTemp, Constant::MaskY);
    M.z = vTemp;

    // x=fRange,y=-fRange * NearZ,0,1.0f
    vTemp = _mm_setzero_ps();
    vValues = _mm_shuffle_ps(vValues, Constant::IdentityR3, _MM_SHUFFLE(3, 2, 3, 2));

    // 0,0,fRange,1.0f
    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0));
    M.x = vTemp;

    // 0,0,-fRange * NearZ,0.0f
    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0));
    M.w = vTemp;

    return M;
}
bool BBox::intersect(const Ray& ray, float *tnear, float *tfar) const
{
    // you may already have those values hanging around somewhere
    const __m128
        plus_inf  = loadps(ps_cst_plus_inf),
        minus_inf = loadps(ps_cst_minus_inf);

    // use whatever's appropriate to load.
    const __m128
        box_min = loadps(&min),
        box_max = loadps(&max),
        pos     = loadps(&ray.o),
        inv_dir = loadps(&ray.inv_d);

    // use a div if inverted directions aren't available
    const __m128 l1 = mulps(subps(box_min, pos), inv_dir);
    const __m128 l2 = mulps(subps(box_max, pos), inv_dir);

    // the order we use for those min/max is vital to filter out
    // NaNs that happen when an inv_dir is +/- inf and
    // (box_min - pos) is 0. inf * 0 = NaN
    const __m128 filtered_l1a = minps(l1, plus_inf);
    const __m128 filtered_l2a = minps(l2, plus_inf);

    const __m128 filtered_l1b = maxps(l1, minus_inf);
    const __m128 filtered_l2b = maxps(l2, minus_inf);

    // now that we're back on our feet, test those slabs.
    __m128 lmax = maxps(filtered_l1a, filtered_l2a);
    __m128 lmin = minps(filtered_l1b, filtered_l2b);

    // unfold back. try to hide the latency of the shufps & co.
    const __m128 lmax0 = rotatelps(lmax);
    const __m128 lmin0 = rotatelps(lmin);
    lmax = minss(lmax, lmax0);
    lmin = maxss(lmin, lmin0);

    const __m128 lmax1 = muxhps(lmax, lmax);
    const __m128 lmin1 = muxhps(lmin, lmin);
    lmax = minss(lmax, lmax1);
    lmin = maxss(lmin, lmin1);

    const bool ret = _mm_comige_ss(lmax, _mm_setzero_ps()) & _mm_comige_ss(lmax, lmin);

    storess(lmin, tnear);
    storess(lmax, tfar);

    return ret;
}
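// Hedged sketch: if precomputed inverse directions are unavailable, they can be derived
// with a division, as the comment above suggests. The direction field name "ray.d" is an
// assumption here; the original snippet only shows ray.o and ray.inv_d.
// const __m128 inv_dir = _mm_div_ps(_mm_set1_ps(1.0f), loadps(&ray.d));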
void conv_filter_sse(int imgHeight, int imgWidth, int imgHeightF, int imgWidthF,
                     int imgFOfssetH, int imgFOfssetW,
                     float* filter, float *imgFloatSrc, float *imgFloatDst)
{
    //1.
    const register __declspec(align(16)) auto const_0 = _mm_set_ps(0.0, 0.0, 0.0, 0.0);
    //2.
    const register __declspec(align(16)) auto const_255 = _mm_set_ps(255.0, 255.0, 255.0, 255.0);
    //3.
    __declspec(align(16)) __m128 filter_l[FILTER_SIZE];

#pragma omp parallel for
    for (auto i = 0; i < FILTER_SIZE; i++)
    {
        // loads the same value into all 4 floats
        // float -> __m128 conversion
        filter_l[i] = _mm_load_ps1(filter + i);
    }

    const auto rw_base = (imgFOfssetW + imgFOfssetH * imgWidthF) << 2;
    const auto imgWidthbyte = imgWidth << 2;
    const auto imgWidthFbyte = imgWidthF << 2;
    const auto imgLengthbyte = imgHeight * imgWidthbyte;

    //4.
    register __declspec(align(16)) __m128 a_sse;
    //8. reg
    register __declspec(align(16)) __m128 r_sse;

#pragma omp parallel for
    for (auto row = 0; row < imgLengthbyte; row += 4)
    {
        // accumulator for the RGBA components
        r_sse = _mm_setzero_ps();

        // convolution for every component
        for (auto y = 0; y < FILTER_H; y++)
        {
            r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (y * imgWidthFbyte)), filter_l[5 * y]));
            r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (4 + y * imgWidthFbyte)), filter_l[1 + 5 * y]));
            r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (8 + y * imgWidthFbyte)), filter_l[2 + 5 * y]));
            r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (12 + y * imgWidthFbyte)), filter_l[3 + 5 * y]));
            r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (16 + y * imgWidthFbyte)), filter_l[4 + 5 * y]));
        }

        a_sse = _mm_load_ps(imgFloatSrc + row + 8 + 2 * imgWidthFbyte);

        // clamp the computed value to the 0-255 range
        // write the output pixel
        _mm_store_ps(imgFloatDst + rw_base + row,
                     _mm_min_ps(const_255,
                                _mm_add_ps(a_sse,
                                           _mm_max_ps(const_0,
                                                      _mm_sub_ps(a_sse,
                                                                 _mm_min_ps(const_255,
                                                                            _mm_max_ps(const_0, r_sse)))))));
    }
}
hv_size_t sLine_init(SignalLine *o) {
#if HV_SIMD_AVX
  o->n = _mm_setzero_si128();
  o->x = _mm256_setzero_ps();
  o->m = _mm256_setzero_ps();
  o->t = _mm256_setzero_ps();
#elif HV_SIMD_SSE
  o->n = _mm_setzero_si128();
  o->x = _mm_setzero_ps();
  o->m = _mm_setzero_ps();
  o->t = _mm_setzero_ps();
#elif HV_SIMD_NEON
  o->n = vdupq_n_s32(0);
  o->x = vdupq_n_f32(0.0f);
  o->m = vdupq_n_f32(0.0f);
  o->t = vdupq_n_f32(0.0f);
#else // HV_SIMD_NONE
  o->n = 0;
  o->x = 0.0f;
  o->m = 0.0f;
  o->t = 0.0f;
#endif
  return 0;
}
void sDel1_onMessage(HvBase *_c, SignalDel1 *o, int letIn, const HvMessage *m) {
  if (letIn == 2) {
    if (msg_compareSymbol(m, 0, "clear")) {
#if HV_SIMD_AVX
      o->x = _mm256_setzero_ps();
#elif HV_SIMD_SSE
      o->x = _mm_setzero_ps();
#elif HV_SIMD_NEON
      o->x = vdupq_n_f32(0.0f);
#else
      o->x = 0.0f;
#endif
    }
  }
}
// ~~~~~~~~~~~~~~~ Task2
void mulVectorSse(MATRIX_TYPE** matrix, MATRIX_TYPE* vector, MATRIX_TYPE* result, size_t size)
{
    for (size_t i = 0; i < size; i++)
    {
        __m128 localSum = _mm_setzero_ps();
        for (size_t j = 0; j < size; j += 4)
        {
            __m128 tempMatix = _mm_load_ps(&matrix[i][j]);
            __m128 tempVector = _mm_load_ps(&vector[j]);
            localSum = _mm_add_ps(localSum, _mm_mul_ps(tempMatix, tempVector));
        }
        localSum = _mm_hadd_ps(localSum, localSum);
        localSum = _mm_hadd_ps(localSum, localSum);
        _mm_store_ss(&result[i], localSum);
    }
}
static void negative_f32_sse_unroll2 (float *dest, float *src1, int n)
{
    /* Initial operations to align the destination pointer */
    for (; ((long)dest & 15) && (n > 0); n--) {
        *dest++ = -(*src1++);
    }
    for (; n >= 8; n -= 8) {
        __m128 xmm0, xmm1;
        xmm0 = _mm_setzero_ps();
        xmm1 = _mm_loadu_ps(src1);
        xmm0 = _mm_sub_ps(xmm0, xmm1);
        _mm_store_ps(dest, xmm0);
        xmm0 = _mm_setzero_ps();
        xmm1 = _mm_loadu_ps(src1 + 4);
        xmm0 = _mm_sub_ps(xmm0, xmm1);
        _mm_store_ps(dest + 4, xmm0);
        dest += 8;
        src1 += 8;
    }
    for (; n > 0; n--) {
        *dest++ = -(*src1++);
    }
}