template <class Tout>
inline void GDALCopyWordSSE(const float fValueIn, Tout &tValueOut)
{
    float fMaxVal, fMinVal;
    GDALGetDataLimits<float, Tout>(fMaxVal, fMinVal);
    __m128 xmm = _mm_set_ss(fValueIn);
    __m128 xmm_min = _mm_set_ss(fMinVal);
    __m128 xmm_max = _mm_set_ss(fMaxVal);
    xmm = _mm_min_ss(_mm_max_ss(xmm, xmm_min), xmm_max);
#ifdef SSE_USE_SAME_ROUNDING_AS_NON_SSE
    __m128 p0d5 = _mm_set_ss(0.5f);
    if (std::numeric_limits<Tout>::is_signed)
    {
        __m128 mask = _mm_cmpge_ss(xmm, _mm_set_ss(0.f));
        __m128 m0d5 = _mm_set_ss(-0.5f);
        xmm = _mm_add_ss(xmm,
                         _mm_or_ps(_mm_and_ps(mask, p0d5), _mm_andnot_ps(mask, m0d5)));
    }
    else
    {
        xmm = _mm_add_ss(xmm, p0d5);
    }
#endif
#ifdef SSE_USE_SAME_ROUNDING_AS_NON_SSE
    tValueOut = (Tout)_mm_cvttss_si32(xmm);
#else
    tValueOut = (Tout)_mm_cvtss_si32(xmm);
#endif
}
int intr_frustum_box(const float* frustum, const float* box_min, const float* box_max)
{
    const __m128 min = _mm_load_ps(box_min);
    const __m128 max = _mm_load_ps(box_max);
    for (int i = 0; i < 6; i++) {
        const __m128 plane = _mm_load_ps(frustum + 4 * i);
        const __m128 mask = _mm_cmplt_ps(plane, _mm_setzero_ps());
        const __m128 n = _mm_or_ps(_mm_and_ps(mask, max), _mm_andnot_ps(mask, min));
        const __m128 d = _mm_mul_ps(n, plane);
        const __m128 d0 = _mm_shuffle_ps(d, d, _MM_SHUFFLE(2, 1, 0, 3));
        const __m128 d1 = _mm_shuffle_ps(d, d, _MM_SHUFFLE(1, 0, 3, 2));
        const __m128 d2 = _mm_shuffle_ps(d, d, _MM_SHUFFLE(0, 3, 2, 1));
        const __m128 dot = _mm_add_ss(_mm_add_ss(d0, d), _mm_add_ss(d1, d2));
        const __m128 ret = _mm_cmpgt_ss(dot, _mm_setzero_ps());
        float reti;
        _mm_store_ss(&reti, ret);
        if (reti != 0)
            return 0;
    }
    return 1;
}
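/*
 * A minimal usage sketch for intr_frustum_box above, not taken from the original
 * source. The aligned loads imply 16-byte-aligned inputs, and the fourth lane of
 * the per-plane dot product multiplies plane[3] by the corner's w component, so
 * the box endpoints appear to be expected as (x, y, z, 1) with each plane stored
 * as (a, b, c, d). The identifiers below (planes, bmin, bmax) are illustrative only.
 */
static int box_visible_example(void)
{
    /* Axis-aligned unit box, endpoints given as (x, y, z, 1). */
    static const float bmin[4] __attribute__((aligned(16))) = { -1.0f, -1.0f, -1.0f, 1.0f };
    static const float bmax[4] __attribute__((aligned(16))) = {  1.0f,  1.0f,  1.0f, 1.0f };
    /* Six planes as (a, b, c, d); filled elsewhere, e.g. from a view-projection matrix. */
    static float planes[6][4] __attribute__((aligned(16)));
    return intr_frustum_box(&planes[0][0], bmin, bmax);
}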
void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
                         int N, opus_val32 *xy1, opus_val32 *xy2)
{
    int i;
    __m128 xsum1, xsum2;
    xsum1 = _mm_setzero_ps();
    xsum2 = _mm_setzero_ps();
    for (i=0;i<N-3;i+=4)
    {
        __m128 xi = _mm_loadu_ps(x+i);
        __m128 y1i = _mm_loadu_ps(y01+i);
        __m128 y2i = _mm_loadu_ps(y02+i);
        xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
        xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
    }
    /* Horizontal sum */
    xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
    xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
    _mm_store_ss(xy1, xsum1);
    xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
    xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
    _mm_store_ss(xy2, xsum2);
    for (;i<N;i++)
    {
        *xy1 = MAC16_16(*xy1, x[i], y01[i]);
        *xy2 = MAC16_16(*xy2, x[i], y02[i]);
    }
}
inline __m128 SSENormalizeMultiplierSSE2(__m128 v)
{
    const __m128 sq = _mm_mul_ps(v, v);
    const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1));
    const __m128 r3 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 2));
    const __m128 res = _mm_add_ss(r3, _mm_add_ss(r2, sq));
    const __m128 rt = _mm_rsqrt_ss(res);
    return _mm_shuffle_ps(rt, rt, _MM_SHUFFLE(0, 0, 0, 0));
}
/* Combines unpack and accumulate */
void vector_accumulate_8bit(float *out, const char *in, int n)
{
#ifdef FOLD_USE_INTRINSICS
    __m128 in_, out_, tmp_;
    float ftmp;
    int ii;
    for (ii = 0 ; ii < (n & -16) ; ii += 16) {
        __builtin_prefetch(out + 64, 1, 0);
        __builtin_prefetch(in + 64, 0, 0);
        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4; out += 4;
        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4; out += 4;
        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4; out += 4;
        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4; out += 4;
    }
    for (; ii < (n & -4) ; ii += 4) {
        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4; out += 4;
    }
    for (; ii < n ; ii++) {
        // Cast these without intrinsics
        ftmp = (float)(*in);
        out_ = _mm_load_ss(out);
        in_ = _mm_load_ss(&ftmp);
        tmp_ = _mm_add_ss(out_, in_);
        _mm_store_ss(out, tmp_);
        in += 1; out += 1;
    }
    _mm_empty();
#else
    int i;
    for (i=0; i<n; i++) { out[i] += (float)in[i]; }
#endif
}
int drid_moments(float* coords, int32_t index, int32_t* partners, int32_t n_partners, double* moments)
{
    int32_t i;
    float d;
    moments_t onlinemoments;
    __m128 x, y, r, r2, s;

    moments_clear(&onlinemoments);
    x = load_float3(&coords[3 * index]);
    for (i = 0; i < n_partners; i++) {
        y = load_float3(&coords[3 * partners[i]]);
        r = _mm_sub_ps(x, y);    /* x - y */
        r2 = _mm_mul_ps(r, r);   /* (x - y)**2 */
        /* horizontal add the components of r2 with two instructions.
           note: it's critical here that the last entry of x and y was 0,
           so that r2.w = 0 */
        s = _mm_add_ps(r2, _mm_movehl_ps(r2, r2));
        s = _mm_add_ss(s, _mm_shuffle_ps(s, s, 1));
        /* store into a regular float. I tried using _mm_rsqrt_ps, but it's
           not accurate enough to pass the tests */
        _mm_store_ss(&d, s);
        moments_push(&onlinemoments, 1.0 / sqrt((double) d));
    }
    moments[0] = moments_mean(&onlinemoments);
    moments[1] = sqrt(moments_second(&onlinemoments));
    moments[2] = cbrt(moments_third(&onlinemoments));
    return 1;
}
_inline float process_folded_fir_sse2(const float *fir_kernel, const float *queue_head,
                                      const float *queue_tail, int len)
{
    __m128 acc = _mm_set_ps(0, 0, 0, 0);
    queue_tail -= 3;
    len >>= 2;
    while (len > 0) {
        __m128 head = _mm_loadu_ps(queue_head);
        __m128 tail = _mm_loadu_ps(queue_tail);
        __m128 kern = _mm_load_ps(fir_kernel);
        tail = _mm_shuffle_ps(tail, tail, 0x1b); // swap the order
        __m128 t1 = _mm_add_ps(tail, head);      // add the head
        t1 = _mm_mul_ps(t1, kern);               // mul
        acc = _mm_add_ps(acc, t1);               // add
        queue_head += 4;
        queue_tail -= 4;
        fir_kernel += 4;
        len--;
    }
    // horizontal sum
    const __m128 t = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));
    const __m128 sum = _mm_add_ss(t, _mm_shuffle_ps(t, t, 1));
    return _mm_cvtss_f32(sum);
}
__m128 COMPUTE()
{
    __m128 v0 = _mm_setzero_ps();
    __m128 v1 = _mm_set1_ps(1.0f);
    __m128 v2 = _mm_set1_ps(1.0f);
    __m128 v3 = _mm_sub_ss(v1, v2);
    return _mm_add_ss(v3, v0);
}
void vector_accumulate(float *out, const float *in, int n)
{
#ifdef FOLD_USE_INTRINSICS
    __m128 in_, out_, tmp_;
    int ii;
    for (ii = 0 ; ii < (n & -16) ; ii += 16) {
        __builtin_prefetch(out + 64, 1, 0);
        __builtin_prefetch(in + 64, 0, 0);
        in_ = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4; out += 4;
        in_ = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4; out += 4;
        in_ = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4; out += 4;
        in_ = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4; out += 4;
    }
    for (; ii < (n & -4) ; ii += 4) {
        in_ = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4; out += 4;
    }
    for (; ii < n ; ii++) {
        in_ = _mm_load_ss(in);
        out_ = _mm_load_ss(out);
        tmp_ = _mm_add_ss(out_, in_);
        _mm_store_ss(out, tmp_);
        in += 1; out += 1;
    }
    _mm_empty();
#else
    int i;
    for (i=0; i<n; i++) { out[i] += in[i]; }
#endif
}
inline float DatabaseBuilder::Distance(PackedSample* x, PackedSample* y)
{
#ifdef AVX
    //Black magic
    //But it does produce the same results as the not AVX code
    __m256 accumulator;
    __m256 x_s = _mm256_load_ps(x->Features);
    __m256 y_s = _mm256_load_ps(y->Features);
    __m256 result = _mm256_sub_ps(x_s, y_s);
    accumulator = _mm256_mul_ps(result, result);

    x_s = _mm256_load_ps(&x->Features[8]);
    y_s = _mm256_load_ps(&y->Features[8]);
    result = _mm256_sub_ps(x_s, y_s);
    result = _mm256_mul_ps(result, result);
    accumulator = _mm256_add_ps(accumulator, result);

    x_s = _mm256_load_ps(&x->Features[16]);
    y_s = _mm256_load_ps(&y->Features[16]);
    result = _mm256_sub_ps(x_s, y_s);
    result = _mm256_mul_ps(result, result);
    accumulator = _mm256_add_ps(accumulator, result);

    x_s = _mm256_load_ps(&x->Features[24]);
    y_s = _mm256_load_ps(&y->Features[24]);
    result = _mm256_sub_ps(x_s, y_s);
    result = _mm256_mul_ps(result, result);
    accumulator = _mm256_add_ps(accumulator, result);

    //We now have a vector of 8 floats
    __m256 t1 = _mm256_hadd_ps(accumulator, accumulator);
    __m256 t2 = _mm256_hadd_ps(t1, t1);
    __m128 t3 = _mm256_extractf128_ps(t2, 1);
    __m128 t4 = _mm_add_ss(_mm256_castps256_ps128(t2), t3);
    //And now we don't
    return std::sqrtf(_mm_cvtss_f32(t4));
#endif
#ifndef AVX
    //Can be autovectorized
    float accumulator[32];
    float distance = 0;
    for (int i = 0; i < 30; i++) {
        accumulator[i] = x->Features[i] - y->Features[i];
    }
    //If done properly this should be 4(8) instructions
    for (int i = 0; i < 30; i++) {
        distance += accumulator[i] * accumulator[i];
    }
    return std::sqrtf(distance);
#endif
}
static inline __m128 horizontal_add(const __m128 a)
{
#if 0 //!! needs SSE3
    const __m128 ftemp = _mm_hadd_ps(a, a);
    return _mm_hadd_ps(ftemp, ftemp);
#else
    const __m128 ftemp = _mm_add_ps(a, _mm_movehl_ps(a, a)); //a0+a2,a1+a3
    return _mm_add_ss(ftemp, _mm_shuffle_ps(ftemp, ftemp, _MM_SHUFFLE(1, 1, 1, 1))); //(a0+a2)+(a1+a3)
#endif
}
//Thanks stack overflow.
static inline float _mm256_reduce_add_ps(__m256 x)
{
    /* ( x3+x7, x2+x6, x1+x5, x0+x4 ) */
    const int imm = 1;
    const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(x, imm), _mm256_castps256_ps128(x));
    /* ( -, -, x1+x3+x5+x7, x0+x2+x4+x6 ) */
    const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
    /* ( -, -, -, x0+x1+x2+x3+x4+x5+x6+x7 ) */
    const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
    /* Conversion to float is a no-op on x86-64 */
    return _mm_cvtss_f32(x32);
}
// Update location by velocity, one time-step
void update_coords(uint32_t i, float* x, float* y, float* z, float* vx, float* vy, float* vz)
{
    __m128 vec, flo, out;

    vec = _mm_set_ss(vx[i]);
    flo = _mm_set_ss(x[i]);
    out = _mm_add_ss(vec, flo);
    _mm_store_ss(&x[i], out);

    vec = _mm_set_ss(vy[i]);
    flo = _mm_set_ss(y[i]);
    out = _mm_add_ss(vec, flo);
    _mm_store_ss(&y[i], out);

    vec = _mm_set_ss(vz[i]);
    flo = _mm_set_ss(z[i]);
    out = _mm_add_ss(vec, flo);
    _mm_store_ss(&z[i], out);
}
inline float hadd(const vector4f& rhs)
{
#if SSE_INSTR_SET >= 3 // SSE3
    __m128 tmp0 = _mm_hadd_ps(rhs, rhs);
    __m128 tmp1 = _mm_hadd_ps(tmp0, tmp0);
#else
    __m128 tmp0 = _mm_add_ps(rhs, _mm_movehl_ps(rhs, rhs));
    __m128 tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
#endif
    return _mm_cvtss_f32(tmp1);
}
/* http://stackoverflow.com/questions/13219146/how-to-sum-m256-horizontally */
static inline float horizontal_sum_avx2(__m256 x)
{
    const __m128 hi_quad = _mm256_extractf128_ps(x, 1);
    const __m128 lo_quad = _mm256_castps256_ps128(x);
    const __m128 sum_quad = _mm_add_ps(lo_quad, hi_quad);
    const __m128 lo_dual = sum_quad;
    const __m128 hi_dual = _mm_movehl_ps(sum_quad, sum_quad);
    const __m128 sum_dual = _mm_add_ps(lo_dual, hi_dual);
    const __m128 lo = sum_dual;
    const __m128 hi = _mm_shuffle_ps(sum_dual, sum_dual, 0x1);
    const __m128 sum = _mm_add_ss(lo, hi);
    return _mm_cvtss_f32(sum);
}
void FastResampler_FirFilter2_C1_SSE2(unsigned int channels, unsigned int filter_length,
                                      float* coef1, float* coef2, float frac,
                                      float* input, float* output)
{
    Q_UNUSED(channels);
    __m128 sum = _mm_setzero_ps();
    __m128 v_frac = _mm_set1_ps(frac);
    for(unsigned int i = 0; i < filter_length / 4; ++i) {
        __m128 v_coef1 = _mm_load_ps(coef1), v_coef2 = _mm_load_ps(coef2);
        coef1 += 4; coef2 += 4;
        __m128 filter_value = _mm_add_ps(v_coef1, _mm_mul_ps(_mm_sub_ps(v_coef2, v_coef1), v_frac));
        __m128 v_input = _mm_loadu_ps(input);
        input += 4;
        sum = _mm_add_ps(sum, _mm_mul_ps(v_input, filter_value));
    }
    __m128 sum2 = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, 0x0e));
    __m128 sum3 = _mm_add_ss(sum2, _mm_shuffle_ps(sum2, sum2, 0x01));
    _mm_store_ss(output, sum3);
}
// Sums an array of floats; needed in replacement of Python sum()
float sum(float* a, uint_fast32_t num_elements)
{
    float total = 0.0f;
    __m128 sumflo = _mm_set_ss(total);
    for (uint_fast32_t i = 0; i < num_elements; i++) {
        __m128 avec = _mm_set_ss(a[i]);
        sumflo = _mm_add_ss(avec, sumflo);  // keep the running total in the register
    }
    _mm_store_ss(&total, sumflo);
    return total;
}
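/*
 * A hedged sketch (not from the original source) of a 4-wide version of sum()
 * above: accumulate four partial sums per iteration, then reduce them with the
 * same movehl/shuffle horizontal-sum pattern used elsewhere in this collection.
 * Unaligned loads are used, so no alignment requirement is placed on `a`.
 */
float sum_sse_4wide(const float* a, uint_fast32_t num_elements)
{
    __m128 acc = _mm_setzero_ps();
    uint_fast32_t i = 0;
    for (; i + 4 <= num_elements; i += 4)
        acc = _mm_add_ps(acc, _mm_loadu_ps(a + i));
    acc = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));         /* a0+a2, a1+a3 */
    acc = _mm_add_ss(acc, _mm_shuffle_ps(acc, acc, 0x55));  /* (a0+a2)+(a1+a3) */
    float total = _mm_cvtss_f32(acc);
    for (; i < num_elements; i++)  /* scalar tail */
        total += a[i];
    return total;
}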
float dot_product(const int N, const float *X, const int incX, const float *Y, const int incY)
{
    __m256 accum = _mm256_setzero_ps();
    for (int i = 0; i < N; i += 8, X += 8 * incX, Y += 8 * incY) {
        __m256 xval = _mm256_load_ps(X);
        __m256 yval = _mm256_load_ps(Y);
        __m256 val = _mm256_mul_ps(xval, yval);
        accum = _mm256_add_ps(val, accum);
    }
    // Reduce the values in accum into the smallest 32-bit subsection
    // a0 a1 a2 a3 a4 a5 a6 a7 -> b0 b1 b2 b3
    __m128 accum2 = _mm_add_ps(_mm256_castps256_ps128(accum), _mm256_extractf128_ps(accum, 1));
    // b0 b1 b2 b3 -> c0 c1 b2 b3
    accum2 = _mm_add_ps(accum2, _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(accum2), 8)));
    // Add the high and low halves
    __m128 final_val = _mm_add_ss(_mm_insert_ps(accum2, accum2, 0x4e), accum2);
    return _mm_cvtss_f32(final_val);
}
static int _ccv_nnc_gemm_forw_sse2(const ccv_nnc_tensor_view_t* const a, const ccv_nnc_tensor_view_t* const w, const ccv_nnc_tensor_view_t* const bias, ccv_nnc_tensor_view_t* const b)
{
    const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
    const int* adim = (a_nd == 1) ? a->info.dim : a->info.dim + 1;
    const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
    const int* bdim = (b_nd == 1) ? b->info.dim : b->info.dim + 1;
    assert(bdim[0] == bias->info.dim[0]);
    assert(bdim[0] == w->info.dim[0]);
    assert(adim[0] == w->info.dim[1]);
    const int* ainc = CCV_IS_TENSOR_VIEW(a) ? (a_nd == 1 ? a->inc : a->inc + 1) : adim;
    const int* binc = CCV_IS_TENSOR_VIEW(b) ? (b_nd == 1 ? b->inc : b->inc + 1) : bdim;
    const int* winc = CCV_IS_TENSOR_VIEW(w) ? w->inc : w->info.dim;
    const int batch_size = a_nd == 1 ? 1 : ccv_max(1, a->info.dim[0]);
    int i;
    for (i = 0; i < batch_size; i++)
    {
        const float* const ap = a->data.f32 + i * ainc[0];
        float* const bp = b->data.f32 + i * binc[0];
        parallel_for(j, bdim[0]) {
            const float* const wp = w->data.f32 + j * winc[1];
            int k;
            __m128 v40 = _mm_set_ss(bias->data.f32[j]);
            __m128 v41 = _mm_setzero_ps();
            for (k = 0; k < adim[0] - 7; k += 8)
            {
                __m128 ap40 = _mm_load_ps(ap + k);
                __m128 ap41 = _mm_load_ps(ap + k + 4);
                __m128 w40 = _mm_load_ps(wp + k);
                __m128 w41 = _mm_load_ps(wp + k + 4);
                v40 = _mm_add_ps(_mm_mul_ps(w40, ap40), v40);
                v41 = _mm_add_ps(_mm_mul_ps(w41, ap41), v41);
            }
            v40 = _mm_add_ps(v40, v41);
            v41 = _mm_add_ps(v40, _mm_movehl_ps(v40, v40));
            v40 = _mm_add_ss(v41, _mm_shuffle_ps(v41, v41, 1));
            _mm_store_ss(bp + j, v40);
        } parallel_endfor
    }
    return CCV_NNC_EXEC_SUCCESS;
}
static REAL dotp_sse(REAL a[], REAL b[])
{
#ifdef __SSE__
    /* This is taken from speex's inner product implementation */
    int j;
    REAL sum;
    __m128 acc = _mm_setzero_ps();
    for (j=0;j<NLMS_LEN;j+=8)
    {
        acc = _mm_add_ps(acc, _mm_mul_ps(_mm_load_ps(a+j), _mm_loadu_ps(b+j)));
        acc = _mm_add_ps(acc, _mm_mul_ps(_mm_load_ps(a+j+4), _mm_loadu_ps(b+j+4)));
    }
    acc = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));
    acc = _mm_add_ss(acc, _mm_shuffle_ps(acc, acc, 0x55));
    _mm_store_ss(&sum, acc);
    return sum;
#else
    return dotp(a, b);
#endif
}
/* http://stackoverflow.com/questions/13219146/how-to-sum-m256-horizontally */
inline float sum8(__m256 x)
{
    // hiQuad = ( x7, x6, x5, x4 )
    const __m128 hiQuad = _mm256_extractf128_ps(x, 1);
    // loQuad = ( x3, x2, x1, x0 )
    const __m128 loQuad = _mm256_castps256_ps128(x);
    // sumQuad = ( x3 + x7, x2 + x6, x1 + x5, x0 + x4 )
    const __m128 sumQuad = _mm_add_ps(loQuad, hiQuad);
    // loDual = ( -, -, x1 + x5, x0 + x4 )
    const __m128 loDual = sumQuad;
    // hiDual = ( -, -, x3 + x7, x2 + x6 )
    const __m128 hiDual = _mm_movehl_ps(sumQuad, sumQuad);
    // sumDual = ( -, -, x1 + x3 + x5 + x7, x0 + x2 + x4 + x6 )
    const __m128 sumDual = _mm_add_ps(loDual, hiDual);
    // lo = ( -, -, -, x0 + x2 + x4 + x6 )
    const __m128 lo = sumDual;
    // hi = ( -, -, -, x1 + x3 + x5 + x7 )
    const __m128 hi = _mm_shuffle_ps(sumDual, sumDual, 0x1);
    // sum = ( -, -, -, x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 )
    const __m128 sum = _mm_add_ss(lo, hi);
    return _mm_cvtss_f32(sum);
}
double CMercerKernel<float>::Evaluate(float* x, float* y)
{
#ifndef __SSE4_1__
    float result = 0;
    for(size_t i=0; i<m_n; i++)
        result += x[i]*y[i];
    return static_cast<double>(result);
#else
    __m128* px = reinterpret_cast<__m128*>(x);
    __m128* py = reinterpret_cast<__m128*>(y);
    float zero = 0.0;
    __m128 sum = _mm_load1_ps(&zero);
    const int mask = 241; // 4 MSB mask input, 4 LSB mask output
    for(size_t i=0; i<m_offset/4; i++) {
        __m128 temp = _mm_dp_ps(px[i], py[i], mask);
        sum = _mm_add_ss(sum, temp); // accumulate result in first register
    }
    float result[4] = {0.0, 0.0, 0.0, 0.0};
    _mm_storeu_ps(result, sum);
    // add offset
    for(size_t i=m_offset; i<m_n; i++)
        result[0] += x[i]*y[i];
    return static_cast<double>(result[0]);
#endif
}
SPAN_DECLARE(float) vec_dot_prodf(const float x[], const float y[], int n)
{
    int i;
    float z;
    __m128 n1;
    __m128 n2;
    __m128 n3;
    __m128 n4;

    z = 0.0f;
    if ((i = n & ~3))
    {
        n4 = _mm_setzero_ps();  //sets sum to zero
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n2 = _mm_loadu_ps(y + i);
            n3 = _mm_mul_ps(n1, n2);
            n4 = _mm_add_ps(n4, n3);
        }
        n4 = _mm_add_ps(_mm_movehl_ps(n4, n4), n4);
        n4 = _mm_add_ss(_mm_shuffle_ps(n4, n4, 1), n4);
        _mm_store_ss(&z, n4);
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z += x[n - 3]*y[n - 3];
    case 2:
        z += x[n - 2]*y[n - 2];
    case 1:
        z += x[n - 1]*y[n - 1];
    }
    return z;
}
opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y, int N)
{
    int i;
    float xy;
    __m128 sum;
    sum = _mm_setzero_ps();
    /* FIXME: We should probably go 8-way and use 2 sums. */
    for (i=0;i<N-3;i+=4)
    {
        __m128 xi = _mm_loadu_ps(x+i);
        __m128 yi = _mm_loadu_ps(y+i);
        sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi));
    }
    /* Horizontal sum */
    sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
    sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
    _mm_store_ss(&xy, sum);
    for (;i<N;i++)
    {
        xy = MAC16_16(xy, x[i], y[i]);
    }
    return xy;
}
__m128 __attribute__((__target__("sse"))) mm_add_ss_wrap(__m128 a, __m128 b) { return _mm_add_ss(a, b); }
ibMtx4& ibMtx4::Invert()
{
    f32* src = &data.a[0][0];
    __m128 minor0, minor1, minor2, minor3;
    __m128 row0, row1, row2, row3;
    __m128 det, tmp1;

#if !defined NDEBUG || defined STATIC
    // Suppress RTC error for uninit vars
    f32 init = 0.f;
    row3 = row1 = tmp1 = _mm_load_ps1( &init );
#endif // NDEBUG

    tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4));
    row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12));
    row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
    row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
    tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6));
    row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14));
    row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
    row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);
    // -----------------------------------------------
    tmp1 = _mm_mul_ps(row2, row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor0 = _mm_mul_ps(row1, tmp1);
    minor1 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
    minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
    minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
    // -----------------------------------------------
    tmp1 = _mm_mul_ps(row1, row2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
    minor3 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
    minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
    minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
    // -----------------------------------------------
    tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    row2 = _mm_shuffle_ps(row2, row2, 0x4E);
    minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
    minor2 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
    minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
    minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
    // -----------------------------------------------
    tmp1 = _mm_mul_ps(row0, row1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
    minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
    // -----------------------------------------------
    tmp1 = _mm_mul_ps(row0, row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
    minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
    minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
    // -----------------------------------------------
    tmp1 = _mm_mul_ps(row0, row2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
    minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
    // -----------------------------------------------
    det = _mm_mul_ps(row0, minor0);
    det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
    det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
    tmp1 = _mm_rcp_ss(det);
    det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
    det = _mm_shuffle_ps(det, det, 0x00);
    minor0 = _mm_mul_ps(det, minor0);
    _mm_storel_pi((__m64*)(src), minor0);
    _mm_storeh_pi((__m64*)(src+2), minor0);
    minor1 = _mm_mul_ps(det, minor1);
    _mm_storel_pi((__m64*)(src+4), minor1);
    _mm_storeh_pi((__m64*)(src+6), minor1);
    minor2 = _mm_mul_ps(det, minor2);
    _mm_storel_pi((__m64*)(src+ 8), minor2);
    _mm_storeh_pi((__m64*)(src+10), minor2);
    minor3 = _mm_mul_ps(det, minor3);
    _mm_storel_pi((__m64*)(src+12), minor3);
    _mm_storeh_pi((__m64*)(src+14), minor3);
    return *this;
}
phash phash_for_pixmap(const QPixmap& pixmap)
{
    static bool cos_table_initialized = false;
    ALIGN(16, static float cos_table[8][8][32][32]);
    ALIGN(16, float intensity[32][32]);

    if(!cos_table_initialized) {
        cos_table_initialized = true;
        // 32x32 DCT, though we are only interested in the top left 8x8,
        // representing lowest frequencies in the image
        for(int u = 0; u < 8; u++) {
            for(int v = 0; v < 8; v++) {
                for(int y = 0; y < 32; y++) {
                    for(int x = 0; x < 32; x++) {
                        cos_table[v][u][y][x] = cosf(M_PI / 32.0f * (x + 0.5f) * u)
                                              * cosf(M_PI / 32.0f * (y + 0.5f) * v);
                    }
                }
            }
        }
    }

    // Scale down to 32x32
    QImage image = pixmap.scaled(32, 32, Qt::IgnoreAspectRatio, Qt::SmoothTransformation).toImage();

    float dct[64];
    int counter = 0;

    // Convert to grayscale
    const __m128 luminance = _mm_set_ps(.0f, 0.2126f, 0.7152f, 0.0722f);
    for(int y = 0; y < 32; y++) {
        for(int x = 0; x < 32; x++) {
            QRgb pixel = image.pixel(x, y);
            __m128 p = _mm_set_ps(0, qRed(pixel), qGreen(pixel), qBlue(pixel));
            __m128 v = _mm_mul_ps(luminance, p);
            __m128 t = _mm_add_ps(v, _mm_movehl_ps(v, v));
            __m128 sum = _mm_add_ss(t, _mm_shuffle_ps(t, t, 1));
            _mm_store_ss(&intensity[y][x], sum);
        }
    }

    // DCT
    for(int u = 0; u < 8; u++) {
        for(int v = 0; v < 8; v++) {
            __m128 acc = _mm_setzero_ps();
            for(int y = 0; y < 32; y++) {
                for(int x = 0; x < 32; x+=4) {
                    __m128 in = _mm_load_ps(&intensity[y][x]);
                    __m128 cos = _mm_load_ps(&cos_table[v][u][y][x]);
                    __m128 out = _mm_mul_ps(in, cos);
                    acc = _mm_add_ps(out, acc);
                }
            }
            __m128 t = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));
            __m128 sum = _mm_add_ss(t, _mm_shuffle_ps(t, t, 1));
            _mm_store_ss(&dct[counter++], sum);
        }
    }

    // Mean, skip first one
    float mean = 0.0;
    for(int i = 1; i < 64; i++) {
        mean += dct[i];
    }
    mean /= 63;

    // Calculate the final hash
    phash hash = 0;
    for(int i = 0; i < 64; i++) {
        phash val = dct[i] > mean;
        hash |= val << i;
    }
    return hash;
}
/*!
 * \brief Perform a horizontal sum of the given vector.
 * \param in The input vector type
 * \return the horizontal sum of the vector
 */
ETL_STATIC_INLINE(float) hadd(avx_simd_float in) {
    const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(in.value, 1), _mm256_castps256_ps128(in.value));
    const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
    const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
    return _mm_cvtss_f32(x32);
}
void kernel_sgemv_t_1_lib4(int kmax, int kna, float *A, int sda, float *x, float *y, int alg)
{
    if(kmax<=0)
        return;

    const int lda = 4;

    int k;
    int ka = kmax - kna; // number from aligned position

    __m128 a_00_10_20_30, x_0_1_2_3, y_0, y_1;

    y_0 = _mm_setzero_ps();

    k = 0;
    if(kna>0)
    {
        for(; k<kna; k++)
        {
            x_0_1_2_3 = _mm_load_ss( &x[0] );
            a_00_10_20_30 = _mm_load_ss( &A[0+lda*0] );
            /* y_0 += a_00_10_20_30 * x_0_1_2_3; */
            a_00_10_20_30 = _mm_mul_ss( a_00_10_20_30, x_0_1_2_3 );
            y_0 = _mm_add_ss( y_0, a_00_10_20_30 );
            x += 1;
            A += 1;
        }
        A += (sda-1)*lda;
    }
    k = 0;
    for(; k<ka-3; k+=4)
    {
        x_0_1_2_3 = _mm_loadu_ps( &x[0] );
        a_00_10_20_30 = _mm_load_ps( &A[0+lda*0] );
        /* y_0 += a_00_10_20_30 * x_0_1_2_3; */
        a_00_10_20_30 = _mm_mul_ps( a_00_10_20_30, x_0_1_2_3 );
        y_0 = _mm_add_ps( y_0, a_00_10_20_30 );
        x += 4;
        A += 4;
        A += (sda-1)*lda;
    }
    for(; k<ka; k++)
    {
        x_0_1_2_3 = _mm_load_ss( &x[0] );
        a_00_10_20_30 = _mm_load_ss( &A[0+lda*0] );
        /* y_0 += a_00_10_20_30 * x_0_1_2_3; */
        a_00_10_20_30 = _mm_mul_ss( a_00_10_20_30, x_0_1_2_3 );
        y_0 = _mm_add_ss( y_0, a_00_10_20_30 );
        x += 1;
        A += 1;
    }

    __m128 y_0_1_2_3;

    y_1 = _mm_setzero_ps();
    y_0 = _mm_hadd_ps(y_0, y_1);
    y_0 = _mm_hadd_ps(y_0, y_1);

    if(alg==0)
    {
        _mm_store_ss(&y[0], y_0);
    }
    else if(alg==1)
    {
        y_0_1_2_3 = _mm_load_ss( &y[0] );
        y_0_1_2_3 = _mm_add_ss(y_0_1_2_3, y_0);
        _mm_store_ss(&y[0], y_0_1_2_3);
    }
    else // alg==-1
    {
        y_0_1_2_3 = _mm_load_ss( &y[0] );
        y_0_1_2_3 = _mm_sub_ss(y_0_1_2_3, y_0);
        _mm_store_ss(&y[0], y_0_1_2_3);
    }
}
// Does inverse according to Cramer's Rule
// See ftp://download.intel.com/design/PentiumIII/sml/24504301.pdf
void Mat44::Cramers_Inverse_SSE(const Mat44 *out, f32 &detv) const
{
    f32 *src = (f32*)&mat;

    __m128 minor0=_mm_setzero_ps(), minor1=_mm_setzero_ps(), minor2=_mm_setzero_ps(), minor3=_mm_setzero_ps();
    __m128 row0=_mm_setzero_ps(), row1=_mm_setzero_ps(), row2=_mm_setzero_ps(), row3=_mm_setzero_ps();
    __m128 det=_mm_setzero_ps(), tmp1=_mm_setzero_ps();

    tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4));
    row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12));
    row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
    row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
    tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6));
    row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14));
    row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
    row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);

    tmp1 = _mm_mul_ps(row2, row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor0 = _mm_mul_ps(row1, tmp1);
    minor1 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
    minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
    minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);

    tmp1 = _mm_mul_ps(row1, row2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
    minor3 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
    minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
    minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);

    tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    row2 = _mm_shuffle_ps(row2, row2, 0x4E);
    minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
    minor2 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
    minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
    minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);

    tmp1 = _mm_mul_ps(row0, row1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
    minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));

    tmp1 = _mm_mul_ps(row0, row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
    minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
    minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));

    tmp1 = _mm_mul_ps(row0, row2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
    minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);

    det = _mm_mul_ps(row0, minor0);
    det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
    det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
    tmp1 = _mm_rcp_ss(det);
    det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
    det = _mm_shuffle_ps(det, det, 0x00);
    _mm_store_ss(&detv, det);

    Mat44 t;
    if(out)
    {
        src = (f32*)out->mat;
    }
    else
    {
        src = t.mat;
    }

    minor0 = _mm_mul_ps(det, minor0);
    _mm_storel_pi((__m64*)(src), minor0);
    _mm_storeh_pi((__m64*)(src+2), minor0);
    minor1 = _mm_mul_ps(det, minor1);
    _mm_storel_pi((__m64*)(src+4), minor1);
    _mm_storeh_pi((__m64*)(src+6), minor1);
    minor2 = _mm_mul_ps(det, minor2);
    _mm_storel_pi((__m64*)(src+ 8), minor2);
    _mm_storeh_pi((__m64*)(src+10), minor2);
    minor3 = _mm_mul_ps(det, minor3);
    _mm_storel_pi((__m64*)(src+12), minor3);
    _mm_storeh_pi((__m64*)(src+14), minor3);
}