Ejemplo n.º 1
0
/* Combines unpack and accumulate */
void vector_accumulate_8bit(float *out, const char *in, int n) {
#ifdef FOLD_USE_INTRINSICS
    __m128 in_, out_, tmp_;
    float ftmp;
    int ii;
    for (ii = 0 ; ii < (n & -16) ; ii += 16) {
        __builtin_prefetch(out + 64, 1, 0);
        __builtin_prefetch(in  + 64, 0, 0);

        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in  += 4;
        out += 4;

        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in  += 4;
        out += 4;

        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in  += 4;
        out += 4;

        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in  += 4;
        out += 4;
    }
    for (; ii < (n & -4) ; ii += 4) {
        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in  += 4;
        out += 4;
    }
    for (; ii < n ; ii++) {  // Cast these without intrinsics
        ftmp = (float)(*in);
        out_ = _mm_load_ss(out);
        in_ = _mm_load_ss(&ftmp);
        tmp_ = _mm_add_ss(out_, in_);
        _mm_store_ss(out, tmp_);
        in  += 1;
        out += 1;
    }
    _mm_empty();
#else
    int i;
    for (i=0; i<n; i++) { out[i] += (float)in[i]; }
#endif
}
Ejemplo n.º 2
0
static SColor4ub sse_ConvertRGBColorTo4ub(const RGBColor& src)
{
	const __m128 zero = _mm_setzero_ps();
	const __m128 _255 = _mm_set_ss(255.0f);
	__m128 r = _mm_load_ss(&src.X);
	__m128 g = _mm_load_ss(&src.Y);
	__m128 b = _mm_load_ss(&src.Z);

	// C = min(255, 255*max(C, 0)) ( == clamp(255*C, 0, 255) )
	r = _mm_max_ss(r, zero);
	g = _mm_max_ss(g, zero);
	b = _mm_max_ss(b, zero);

	r = _mm_mul_ss(r, _255);
	g = _mm_mul_ss(g, _255);
	b = _mm_mul_ss(b, _255);

	r = _mm_min_ss(r, _255);
	g = _mm_min_ss(g, _255);
	b = _mm_min_ss(b, _255);

	// convert to integer and combine channels using bit logic
	int ri = _mm_cvtss_si32(r);
	int gi = _mm_cvtss_si32(g);
	int bi = _mm_cvtss_si32(b);

	return SColor4ub(ri, gi, bi, 0xFF);
}
Ejemplo n.º 3
0
void vector_accumulate(float *out, const float *in, int n) {
#ifdef FOLD_USE_INTRINSICS
    __m128 in_, out_, tmp_;
    int ii;
    for (ii = 0 ; ii < (n & -16) ; ii += 16) {
        __builtin_prefetch(out + 64, 1, 0);
        __builtin_prefetch(in  + 64, 0, 0);

        in_  = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in  += 4;
        out += 4;

        in_  = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in  += 4;
        out += 4;

        in_  = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in  += 4;
        out += 4;

        in_  = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in  += 4;
        out += 4;
    }
    for (; ii < (n & -4) ; ii += 4) {
        in_  = _MM_LOAD_PS(in);
        out_ = _MM_LOAD_PS(out);
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in  += 4;
        out += 4;
    }
    for (; ii < n ; ii++) {
        in_  = _mm_load_ss(in);
        out_ = _mm_load_ss(out);
        tmp_ = _mm_add_ss(out_, in_);
        _mm_store_ss(out, tmp_);
        in  += 1;
        out += 1;
    }
    _mm_empty();
#else
    int i;
    for (i=0; i<n; i++) { out[i] += in[i]; }
#endif
}
Ejemplo n.º 4
0
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	int i;
	int limit = data_len - 4;
	__m128 sum0;

	(void) lag;
	FLAC__ASSERT(lag <= 4);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();

	for(i = 0; i <= limit; i++) {
		__m128 d, d0;
		d0 = _mm_loadu_ps(data+i);
		d = d0; d = _mm_shuffle_ps(d, d, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
	}

	{
		__m128 d0 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_move_ss(d0, d);
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc,   sum0);
}
Ejemplo n.º 5
0
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	__m128 xmm0, xmm2, xmm5;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 4);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm5 = _mm_setzero_ps();

	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm5 = _mm_add_ps(xmm5, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm2 = _mm_move_ss(xmm2, xmm0);
		xmm0 = _mm_mul_ps(xmm0, xmm2);
		xmm5 = _mm_add_ps(xmm5, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc, xmm5);
}
Ejemplo n.º 6
0
/*
 =================
 FloatToShort
 =================
*/
short FloatToShort (float f){

#if defined SIMD_X86

	__m128	xmm;
	int		i;

	xmm = _mm_load_ss(&f);
	xmm = _mm_max_ss(xmm, _mm_set_ss(-32768.0f));
	xmm = _mm_min_ss(xmm, _mm_set_ss(32767.0f));
	i = _mm_cvtt_ss2si(xmm);

	return i;

#else

	int		i;

	i = (int)f;

	if (i < -32768)
		return -32768;

	if (i > 32767)
		return 32767;

	return i;

#endif
}
Ejemplo n.º 7
0
static inline long
conv_yF_yHalf (const float *src, uint16_t *dst, long samples)
{
  const __v4sf *s_vec;
  uint64_t     *d_vec;

  long n = samples;

  s_vec = (const __v4sf *)src;
  d_vec = (uint64_t *)dst;

  while (n >= 4)
    {
      __m128 in_val = _mm_loadu_ps((float *)s_vec++);
      __m128i out_val = _mm_cvtps_ph(in_val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
      _mm_storel_epi64((__m128i *)d_vec++, out_val);
      n -= 4;
    }

  src = (const float *)s_vec;
  dst = (uint16_t *)d_vec;

  while (n)
    {
      __m128 in_val = _mm_load_ss(src++);
      __m128i out_val = _mm_cvtps_ph(in_val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
      *dst++ = _mm_extract_epi16(out_val, 0);
      n -= 1;
    }

  return samples;
}
Ejemplo n.º 8
0
/*
====================
idMD5Mesh::CalculateBounds
====================
*/
void idMD5Mesh::CalculateBounds( const idJointMat * entJoints, idBounds & bounds ) const {

	__m128 minX = vector_float_posInfinity;
	__m128 minY = vector_float_posInfinity;
	__m128 minZ = vector_float_posInfinity;
	__m128 maxX = vector_float_negInfinity;
	__m128 maxY = vector_float_negInfinity;
	__m128 maxZ = vector_float_negInfinity;
	for ( int i = 0; i < numMeshJoints; i++ ) {
		const idJointMat & joint = entJoints[meshJoints[i]];
		__m128 x = _mm_load_ps( joint.ToFloatPtr() + 0 * 4 );
		__m128 y = _mm_load_ps( joint.ToFloatPtr() + 1 * 4 );
		__m128 z = _mm_load_ps( joint.ToFloatPtr() + 2 * 4 );
		minX = _mm_min_ps( minX, x );
		minY = _mm_min_ps( minY, y );
		minZ = _mm_min_ps( minZ, z );
		maxX = _mm_max_ps( maxX, x );
		maxY = _mm_max_ps( maxY, y );
		maxZ = _mm_max_ps( maxZ, z );
	}
	__m128 expand = _mm_splat_ps( _mm_load_ss( & maxJointVertDist ), 0 );
	minX = _mm_sub_ps( minX, expand );
	minY = _mm_sub_ps( minY, expand );
	minZ = _mm_sub_ps( minZ, expand );
	maxX = _mm_add_ps( maxX, expand );
	maxY = _mm_add_ps( maxY, expand );
	maxZ = _mm_add_ps( maxZ, expand );
	_mm_store_ss( bounds.ToFloatPtr() + 0, _mm_splat_ps( minX, 3 ) );
	_mm_store_ss( bounds.ToFloatPtr() + 1, _mm_splat_ps( minY, 3 ) );
	_mm_store_ss( bounds.ToFloatPtr() + 2, _mm_splat_ps( minZ, 3 ) );
	_mm_store_ss( bounds.ToFloatPtr() + 3, _mm_splat_ps( maxX, 3 ) );
	_mm_store_ss( bounds.ToFloatPtr() + 4, _mm_splat_ps( maxY, 3 ) );
	_mm_store_ss( bounds.ToFloatPtr() + 5, _mm_splat_ps( maxZ, 3 ) );

}
Ejemplo n.º 9
0
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	int i;
	int limit = data_len - 16;
	__m128 sum0, sum1, sum2, sum3;

	(void) lag;
	FLAC__ASSERT(lag <= 16);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();
	sum1 = _mm_setzero_ps();
	sum2 = _mm_setzero_ps();
	sum3 = _mm_setzero_ps();

	for(i = 0; i <= limit; i++) {
		__m128 d, d0, d1, d2, d3;
		d0 = _mm_loadu_ps(data+i);
		d1 = _mm_loadu_ps(data+i+4);
		d2 = _mm_loadu_ps(data+i+8);
		d3 = _mm_loadu_ps(data+i+12);
		d = d0; d = _mm_shuffle_ps(d, d, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
		sum2 = _mm_add_ps(sum2, _mm_mul_ps(d2, d));
		sum3 = _mm_add_ps(sum3, _mm_mul_ps(d3, d));
	}

	{
		__m128 d0 = _mm_setzero_ps();
		__m128 d1 = _mm_setzero_ps();
		__m128 d2 = _mm_setzero_ps();
		__m128 d3 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d3 = _mm_shuffle_ps(d3, d3, _MM_SHUFFLE(2,1,0,3));
			d2 = _mm_shuffle_ps(d2, d2, _MM_SHUFFLE(2,1,0,3));
			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d3 = _mm_move_ss(d3, d2);
			d2 = _mm_move_ss(d2, d1);
			d1 = _mm_move_ss(d1, d0);
			d0 = _mm_move_ss(d0, d);
			sum3 = _mm_add_ps(sum3, _mm_mul_ps(d, d3));
			sum2 = _mm_add_ps(sum2, _mm_mul_ps(d, d2));
			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc,   sum0);
	_mm_storeu_ps(autoc+4, sum1);
	_mm_storeu_ps(autoc+8, sum2);
	_mm_storeu_ps(autoc+12,sum3);
}
// static
void LLViewerJointMesh::updateGeometry(LLFace *mFace, LLPolyMesh *mMesh)
{
	LLStrider<LLVector3> o_vertices;
	LLStrider<LLVector3> o_normals;

	//get vertex and normal striders
	LLVertexBuffer* buffer = mFace->getVertexBuffer();
	buffer->getVertexStrider(o_vertices,  0);
	buffer->getNormalStrider(o_normals,   0);

	F32* __restrict vert = o_vertices[0].mV;
	F32* __restrict norm = o_normals[0].mV;

	const F32* __restrict weights = mMesh->getWeights();
	const LLVector4a* __restrict coords = (LLVector4a*) mMesh->getCoords();
	const LLVector4a* __restrict normals = (LLVector4a*) mMesh->getNormals();

	U32 offset = mMesh->mFaceVertexOffset*4;
	vert += offset;
	norm += offset;

	for (U32 index = 0; index < mMesh->getNumVertices(); index++)
	{
		// equivalent to joint = floorf(weights[index]);
		S32 joint = _mm_cvtt_ss2si(_mm_load_ss(weights+index));
		F32 w = weights[index] - joint;		

		LLMatrix4a gBlendMat;

		if (w != 0.f)
		{
			// blend between matrices and apply
			gBlendMat.setLerp(gJointMatAligned[joint+0],
							  gJointMatAligned[joint+1], w);

			LLVector4a res;
			gBlendMat.affineTransform(coords[index], res);
			res.store4a(vert+index*4);
			gBlendMat.rotate(normals[index], res);
			res.store4a(norm+index*4);
		}
		else
		{  // No lerp required in this case.
			LLVector4a res;
			gJointMatAligned[joint].affineTransform(coords[index], res);
			res.store4a(vert+index*4);
			gJointMatAligned[joint].rotate(normals[index], res);
			res.store4a(norm+index*4);
		}
	}

	buffer->flush();
}
Ejemplo n.º 11
0
void FastResampler_FirFilter2_Cn_SSE2(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output) {
	Q_UNUSED(channels);
	for(unsigned int c = 0; c < channels; ++c) {
		__m128 sum = _mm_setzero_ps();
		__m128 v_frac = _mm_set1_ps(frac);
		float *input2 = input + c;
		for(unsigned int i = 0; i < filter_length / 4; ++i) {
			__m128 v_coef1 = _mm_load_ps(coef1), v_coef2 = _mm_load_ps(coef2);
			coef1 += 4; coef2 += 4;
			__m128 filter_value = _mm_add_ps(v_coef1, _mm_mul_ps(_mm_sub_ps(v_coef2, v_coef1), v_frac));
			__m128 v_input1 = _mm_load_ss(input2); input2 += channels;
			__m128 v_input2 = _mm_load_ss(input2); input2 += channels;
			__m128 v_input3 = _mm_load_ss(input2); input2 += channels;
			__m128 v_input4 = _mm_load_ss(input2); input2 += channels;
			__m128 v_input = _mm_movelh_ps(_mm_unpacklo_ps(v_input1, v_input2), _mm_unpacklo_ps(v_input3, v_input4));
			sum = _mm_add_ps(sum, _mm_mul_ps(v_input, filter_value));
		}
		__m128 sum2 = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, 0x0e));
		__m128 sum3 = _mm_add_ss(sum2, _mm_shuffle_ps(sum2, sum2, 0x01));
		_mm_store_ss(output + c, sum3);
	}
}
Ejemplo n.º 12
0
/*
=====================
R_CopyDecalSurface
=====================
*/
static void R_CopyDecalSurface( idDrawVert * verts, int numVerts, triIndex_t * indexes, int numIndexes,
									const decal_t * decal, const float fadeColor[4] ) {
	assert_16_byte_aligned( &verts[numVerts] );
	assert_16_byte_aligned( &indexes[numIndexes] );
	assert_16_byte_aligned( decal->indexes );
	assert_16_byte_aligned( decal->verts );
	assert( ( ( decal->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 );
	assert( ( ( decal->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
	assert_16_byte_aligned( fadeColor );


	const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
	const __m128i vector_short_num_verts = _mm_packs_epi32( vector_int_num_verts, vector_int_num_verts );
	const __m128 vector_fade_color = _mm_load_ps( fadeColor );
	const __m128i vector_color_mask = _mm_set_epi32( 0, -1, 0, 0 );

	// copy vertices and apply depth/time based fading
	assert_offsetof( idDrawVert, color, 6 * 4 );
	for ( int i = 0; i < decal->numVerts; i++ ) {
		const idDrawVert &srcVert = decal->verts[i];
		idDrawVert &dstVert = verts[numVerts + i];

		__m128i v0 = _mm_load_si128( (const __m128i *)( (byte *)&srcVert +  0 ) );
		__m128i v1 = _mm_load_si128( (const __m128i *)( (byte *)&srcVert + 16 ) );
		__m128 depthFade = _mm_splat_ps( _mm_load_ss( decal->vertDepthFade + i ), 0 );

		__m128 timeDepthFade = _mm_mul_ps( depthFade, vector_fade_color );
		__m128i colorInt = _mm_cvtps_epi32( timeDepthFade );
		__m128i colorShort = _mm_packs_epi32( colorInt, colorInt );
		__m128i colorByte = _mm_packus_epi16( colorShort, colorShort );
		v1 = _mm_or_si128( v1, _mm_and_si128( colorByte, vector_color_mask ) );

		_mm_stream_si128( (__m128i *)( (byte *)&dstVert +  0 ), v0 );
		_mm_stream_si128( (__m128i *)( (byte *)&dstVert + 16 ), v1 );
	}

	// copy indexes
	assert( ( decal->numIndexes & 7 ) == 0 );
	assert( sizeof( triIndex_t ) == 2 );
	for ( int i = 0; i < decal->numIndexes; i += 8 ) {
		__m128i vi = _mm_load_si128( (const __m128i *)&decal->indexes[i] );

		vi = _mm_add_epi16( vi, vector_short_num_verts );

		_mm_stream_si128( (__m128i *)&indexes[numIndexes + i], vi );
	}

	_mm_sfence();

}
Ejemplo n.º 13
0
void unpack_a8_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
	__m128i t0;
	Uint32 i;

	for (i = 0; i < (size / 4); i++)
	{
		t0 = (__m128i)_mm_load_ss((float*)&source[i * 4]);

		t0 = _mm_unpacklo_epi8(_mm_setzero_si128(), t0);
		t0 = _mm_unpacklo_epi16(_mm_setzero_si128(), t0);

		_mm_stream_si128((__m128i*)&dest[i * 16], t0);
	}
}
Ejemplo n.º 14
0
void replace_a8_rgba8_sse2(const Uint8* alpha, const Uint32 size, Uint8* source)
{
	__m128i t0;
	Uint32 i;

	for (i = 0; i < (size / 4); i++)
	{
		t0 = (__m128i)_mm_load_ss((float*)&alpha[i * 4]);

		t0 = _mm_unpacklo_epi8(_mm_setzero_si128(), t0);
		t0 = _mm_unpacklo_epi16(_mm_setzero_si128(), t0);

		_mm_maskmoveu_si128(t0, _mm_set1_epi32(0xFF000000),
			(char*)&source[i * 16]);
	}
}
Ejemplo n.º 15
0
void unpack_l8_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
	__m128i t0;
	Uint32 i;

	for (i = 0; i < (size / 4); i++)
	{
		t0 = (__m128i)_mm_load_ss((float*)&source[i * 4]);

		t0 = _mm_unpacklo_epi8(t0, t0);
		t0 = _mm_unpacklo_epi16(t0, t0);
		t0 = _mm_or_si128(t0, _mm_set1_epi32(0xFF000000));

		_mm_stream_si128((__m128i*)&dest[i * 16], t0);
	}
}
Ejemplo n.º 16
0
void blend_sse2(const Uint8* alpha, const Uint32 size, const Uint8* source0,
	const Uint8* source1, Uint8* dest)
{
	__m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
	Uint32 i;

	for (i = 0; i < (size / 4); i++)
	{
		t0 = _mm_load_si128((__m128i*)&source0[i * 16]);
		t1 = _mm_load_si128((__m128i*)&source1[i * 16]);
		t2 = (__m128i)_mm_load_ss((float*)&alpha[i * 4]);

		t2 = _mm_unpacklo_epi8(t2, t2);
		t2 = _mm_unpacklo_epi16(t2, t2);

		t3 = _mm_unpacklo_epi8(t0, t0);
		t4 = _mm_unpacklo_epi8(t1, t1);

		t5 = _mm_unpacklo_epi32(t2, t2);
		t6 = _mm_sub_epi16(_mm_set1_epi8(0xFF), t5);

		t7 = _mm_mulhi_epu16(t3, t6);
		t8 = _mm_mulhi_epu16(t4, t5);

		t9 = _mm_adds_epu16(t7, t8);
		t9 = _mm_srli_epi16(t9, 8);

		t3 = _mm_unpackhi_epi8(t0, t0);
		t4 = _mm_unpackhi_epi8(t1, t1);

		t5 = _mm_unpackhi_epi32(t2, t2);
		t6 = _mm_sub_epi16(_mm_set1_epi8(0xFF), t5);

		t7 = _mm_mulhi_epu16(t3, t6);
		t8 = _mm_mulhi_epu16(t4, t5);

		t10 = _mm_adds_epu16(t7, t8);
		t10 = _mm_srli_epi16(t10, 8);

		t10 = _mm_packus_epi16(t9, t10);

		_mm_stream_si128((__m128i*)&dest[i * 16], t10);
	}
}
Ejemplo n.º 17
0
static void
TEST (void)
{
  union128 u, s1;
  float e[4];
  int i;
   
  s1.x = _mm_set_ps (24.43, 68.346, 43.35, 546.46);
  u.x = test (s1.x); 
  
  for (i = 0; i < 4; i++)
    {
      __m128 tmp = _mm_load_ss (&s1.a[i]);
      tmp = _mm_rsqrt_ss (tmp);
      _mm_store_ss (&e[i], tmp);
    }

  if (check_union128 (u, e))
    abort ();
}
Ejemplo n.º 18
0
void Vec3f::Normalize()
{
/*

	float mod = sqrt(x*x + y*y + z*z);
	x = x / mod;
	y = y / mod;
	z = z / mod;
*/
	const float lengthSq = x * x + y * y + z * z;
	if ((*(int*)&lengthSq) != 0) 
	{
		float inv;// = InvSqrt(lengthSq);
		__m128 in = _mm_load_ss(&lengthSq);
		_mm_store_ss(&inv, _mm_rsqrt_ss(in));
		x = x*inv;
		y = y*inv;
		z = z*inv;
	}

}
void
intrin_sse_scalar_mult_add_su3_vector(su3_vectorf* aa, su3_vectorf* bb, float cc, su3_vectorf* dd)
{

	 /* XMM Variables */
	 __m128 xmm2, xmm3, xmm0, xmm1, xmm4;

	xmm4 = _mm_load_ss((float *)&((cc)) );
	xmm4 = _mm_shuffle_ps( xmm4, xmm4, 0x00 );
	xmm0 = _mm_loadu_ps((float *)&((aa)->c[0]) );
	xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&((aa)->c[2]) );
	xmm1 = _mm_shuffle_ps( xmm1, xmm1, 0x44 );
	xmm2 = _mm_loadu_ps((float *)&((bb)->c[0]) );
	xmm3 = _mm_loadl_pi(xmm3, (__m64 *)&((bb)->c[2]) );
	xmm3 = _mm_shuffle_ps( xmm3, xmm3, 0x44 );
	xmm2 = _mm_mul_ps( xmm2, xmm4 );
	xmm3 = _mm_mul_ps( xmm3, xmm4 );
	xmm0 = _mm_add_ps( xmm0, xmm2 );
	xmm1 = _mm_add_ps( xmm1, xmm3 );
	_mm_storeu_ps((float *)&((dd)->c[0]), xmm0 );
	_mm_storel_pi((__m64 *)&((dd)->c[2]), xmm1 );
}
Ejemplo n.º 20
0
/*
============
idSIMD_SSE::CmpLT

  dst[i] |= ( src0[i] < constant ) << bitNum;
============
*/
void VPCALL idSIMD_SSE2::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	int i, cnt, pre, post;
	float *aligned;
	__m128 xmm0, xmm1;
	__m128i xmm0i;
	int cnt_l;
	char *src0_p;
	char *constant_p;
	char *dst_p;
	int mask_l;
	int dst_l;
	
	/* if the float array is not aligned on a 4 byte boundary */
	if ( ((int) src0) & 3 ) {
		/* unaligned memory access */
		pre = 0;
		cnt = count >> 2;
		post = count - (cnt<<2);

	/*
		__asm	mov			edx, cnt													
		__asm	test		edx, edx													
		__asm	je			doneCmp														
	*/
		cnt_l = cnt;
		if(cnt_l != 0) {
	/*
		__asm	push		ebx															
		__asm	neg			edx															
		__asm	mov			esi, src0													
		__asm	prefetchnta	[esi+64]													
		__asm	movss		xmm1, constant												
		__asm	shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )						
		__asm	mov			edi, dst													
		__asm	mov			cl, bitNum	
	*/
			cnt_l = -cnt_l;
			src0_p = (char *) src0;
			_mm_prefetch(src0_p+64, _MM_HINT_NTA);
			constant_p = (char *) &constant;
			xmm1 = _mm_load_ss((float *)constant_p);
			xmm1 = _mm_shuffle_ps(xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ));
			dst_p = (char *)dst;
	/*
			__asm loopNA:																	
	*/
			do {
	/*
		__asm	movups		xmm0, [esi]													
		__asm	prefetchnta	[esi+128]													
		__asm	cmpltps		xmm0, xmm1													
		__asm	movmskps	eax, xmm0																												\
		__asm	mov			ah, al														
		__asm	shr			ah, 1														
		__asm	mov			bx, ax														
		__asm	shl			ebx, 14														
		__asm	mov			bx, ax														
		__asm	and			ebx, 0x01010101												
		__asm	shl			ebx, cl														
		__asm	or			ebx, dword ptr [edi]										
		__asm	mov			dword ptr [edi], ebx										
		__asm	add			esi, 16														
		__asm	add			edi, 4														
		__asm	inc			edx															
		__asm	jl			loopNA														
		__asm	pop			ebx		
	*/
				xmm0 = _mm_loadu_ps((float *) src0_p);
				_mm_prefetch(src0_p+128, _MM_HINT_NTA);
				xmm0 = _mm_cmplt_ps(xmm0, xmm1);
				// Simplify using SSE2
				xmm0i = (__m128i) xmm0;
				xmm0i = _mm_packs_epi32(xmm0i, xmm0i);
				xmm0i = _mm_packs_epi16(xmm0i, xmm0i);
				mask_l = _mm_cvtsi128_si32(xmm0i);
				// End 
				mask_l = mask_l &  0x01010101;
				mask_l = mask_l << bitNum;
				dst_l  = *((int *) dst_p);
				mask_l = mask_l | dst_l;
				*((int *) dst_p) = mask_l;
				src0_p = src0_p + 16;
				dst_p = dst_p + 4;
				cnt_l = cnt_l + 1;
			} while (cnt_l < 0);
		}													
	}																					
Ejemplo n.º 21
0
inline int ftoi_fast(float f)
{
    return _mm_cvtt_ss2si(_mm_load_ss(&f));     // SSE1 instructions for float->int
}
Ejemplo n.º 22
0
void mexFunction(int nlhs, mxArray *plhs[],
                 int nrhs, const mxArray *prhs[])
{
        const float * kf   = coeff;
        float * src = _src;
        float * dst = _dst;
        int i = 0, k, nz = length;
        
        // float delta = 0.000001f;
        __m128 d4 = _mm_setzero_ps();
        
        float * S;
        
        __m128 s0, s1, s2, s3, 
               t0, t1, t2, t3;
        __m128 f;
        
        for(i = 0; i <= width - 16; i += 16 )
        {
            s0 = d4, s1 = d4, s2 = d4, s3 = d4;

            for( k = 0; k < nz; k++ )
            {
                f = _mm_load_ss(kf + k);
                f = _mm_shuffle_ps(f, f, 0);  // (__m128 f, __m128 f, unsigned int imm8)
                S = src + i + k;

                t0 = _mm_loadu_ps(S);
                t1 = _mm_loadu_ps(S + 4);
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

                t0 = _mm_loadu_ps(S + 8);
                t1 = _mm_loadu_ps(S + 12);
                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
            }

            _mm_storeu_ps(dst + i, s0);
            _mm_storeu_ps(dst + i + 4, s1);
            _mm_storeu_ps(dst + i + 8, s2);
            _mm_storeu_ps(dst + i + 12, s3);
        }
// 
        for( ; i <= width - 4; i += 4 )
        {
            s0 = d4;

            for( k = 0; k < nz; k++ )
            {
                f = _mm_load_ss(kf + k);
                f = _mm_shuffle_ps(f, f, 0);
                t0 = _mm_loadu_ps(src + k + i);
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
            }
            _mm_storeu_ps(dst + i, s0);
        }
        
        for (; i < width; i++)
        {
            for( k = 0; k < nz; k++ )
            {
                *(dst + i) += *(src + i + k) * *(kf + k); 
            }
        }

        return;
}
Ejemplo n.º 23
0
inline int FloatToInt( float x )
{
	return _mm_cvtt_ss2si( _mm_load_ss( &x ) );
}
Ejemplo n.º 24
0
	inline void SSESqrt( float * pOut, float * pIn )
	{
	   _mm_store_ss( pOut, _mm_sqrt_ss( _mm_load_ss( pIn ) ) );
	   // compiles to movss, sqrtss, movss
	}
// This function handles the case when set_count = 2, in which we cannot
// unroll the set loop by 4 to meet the SSE requirement (4 elements).
static void InternalUnroll2Inv(
    const OMX_F32 *in,
    OMX_F32 *out,
    const OMX_F32 *twiddle,
    OMX_INT n) {
  OMX_INT i;
  OMX_INT n_by_2 = n >> 1;
  OMX_INT n_by_4 = n >> 2;
  OMX_INT n_mul_2 = n << 1;
  OMX_F32 *out0 = out;

  for (i = 0; i < n_by_2; i += 8) {
    const OMX_F32 *tw1  = twiddle + i;
    const OMX_F32 *tw2  = tw1 + i;
    const OMX_F32 *tw3  = tw2 + i;
    const OMX_F32 *tw1e = tw1 + 4;
    const OMX_F32 *tw2e = tw2 + 8;
    const OMX_F32 *tw3e = tw3 + 12;

    VC v_tw1;
    VC v_tw2;
    VC v_tw3;
    VC v_t0;
    VC v_t1;
    VC v_t2;
    VC v_t3;
    VC v_t4;
    VC v_t5;
    VC v_t6;
    VC v_t7;

    v_tw1.real = _mm_shuffle_ps(_mm_load_ss(tw1),
                                _mm_load_ss(tw1e),
                                _MM_SHUFFLE(0, 0, 0, 0));
    v_tw1.imag = _mm_shuffle_ps(_mm_load_ss(tw1 + n_mul_2),
                                _mm_load_ss(tw1e + n_mul_2),
                                _MM_SHUFFLE(0, 0, 0, 0));
    v_tw2.real = _mm_shuffle_ps(_mm_load_ss(tw2),
                                _mm_load_ss(tw2e),
                                _MM_SHUFFLE(0, 0, 0, 0));
    v_tw2.imag = _mm_shuffle_ps(_mm_load_ss(tw2 + n_mul_2),
                                _mm_load_ss(tw2e + n_mul_2),
                                _MM_SHUFFLE(0, 0, 0, 0));
    v_tw3.real = _mm_shuffle_ps(_mm_load_ss(tw3),
                                _mm_load_ss(tw3e),
                                _MM_SHUFFLE(0, 0, 0, 0));
    v_tw3.imag = _mm_shuffle_ps(_mm_load_ss(tw3 + n_mul_2),
                                _mm_load_ss(tw3e + n_mul_2),
                                _MM_SHUFFLE(0, 0, 0, 0));

    __m128 xmm0;
    __m128 xmm1;
    __m128 xmm2;
    __m128 xmm3;
    __m128 xmm4;
    __m128 xmm5;
    __m128 xmm6;
    __m128 xmm7;

    const OMX_F32 *in0 = in + (i << 1);
    xmm0 = _mm_load_ps(in0);
    xmm1 = _mm_load_ps(in0 + 4);
    xmm2 = _mm_load_ps(in0 + 8);
    xmm3 = _mm_load_ps(in0 + 12);
    v_t0.real = _mm_shuffle_ps(xmm0, xmm2, _MM_SHUFFLE(1, 0, 1, 0));
    v_t1.real = _mm_shuffle_ps(xmm0, xmm2, _MM_SHUFFLE(3, 2, 3, 2));
    v_t2.real = _mm_shuffle_ps(xmm1, xmm3, _MM_SHUFFLE(1, 0, 1, 0));
    v_t3.real = _mm_shuffle_ps(xmm1, xmm3, _MM_SHUFFLE(3, 2, 3, 2));

    xmm4 = _mm_load_ps(in0 + n);
    xmm5 = _mm_load_ps(in0 + n + 4);
    xmm6 = _mm_load_ps(in0 + n + 8);
    xmm7 = _mm_load_ps(in0 + n + 12);
    v_t0.imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0));
    v_t1.imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2));
    v_t2.imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0));
    v_t3.imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2));

    OMX_F32 *out1 = out0 + n_by_4;
    OMX_F32 *out2 = out1 + n_by_4;
    OMX_F32 *out3 = out2 + n_by_4;

    RADIX4_INV_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7,
                         &v_tw1, &v_tw2, &v_tw3,
                         &v_t0, &v_t1, &v_t2, &v_t3);

    RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3,
                               &v_t4, &v_t5, &v_t6, &v_t7, n);

    out0 += 4;
  }
}
void
intrin_sse_mult_su3_mat_vec(su3_matrixf *aa, su3_vectorf* bb, su3_vectorf* cc)
{

	 /* XMM Variables */
	 __m128 xmm2, xmm3, xmm0, xmm1, xmm6, xmm7, xmm4, xmm5;

	xmm0 = _mm_loadl_pi(xmm0, (__m64 *)&((bb)->c[0]) );
	xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&((bb)->c[1]) );
	xmm2 = _mm_loadl_pi(xmm2, (__m64 *)&((bb)->c[2]) );
	xmm0 = _mm_shuffle_ps( xmm0, xmm0, 0x44 );
	xmm1 = _mm_shuffle_ps( xmm1, xmm1, 0x44 );
	xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0x44 );
	xmm3 = _mm_load_ss((float *)&((aa)->e[0][0].real) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][0].real) );
	xmm3 = _mm_shuffle_ps( xmm3, xmm7, 0x00 );
	xmm4 = _mm_load_ss((float *)&((aa)->e[0][1].real) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][1].real) );
	xmm4 = _mm_shuffle_ps( xmm4, xmm7, 0x00 );
	xmm3 = _mm_mul_ps( xmm3, xmm0 );
	xmm4 = _mm_mul_ps( xmm4, xmm1 );
	xmm3 = _mm_add_ps( xmm3, xmm4 );
	xmm5 = _mm_load_ss((float *)&((aa)->e[0][2].real) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][2].real) );
	xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 );
	xmm5 = _mm_mul_ps( xmm5, xmm2 );
	xmm3 = _mm_add_ps( xmm3, xmm5 );
	xmm1 = _mm_shuffle_ps( xmm1, xmm0, 0x44 );
	xmm7 = _mm_load_ss((float *)&((aa)->e[2][0].real) );
	xmm6 = _mm_load_ss((float *)&((aa)->e[2][1].real) );
	xmm6 = _mm_shuffle_ps( xmm6, xmm7, 0x00 );
	xmm6 = _mm_mul_ps( xmm6, xmm1 );
	xmm0 = _mm_shuffle_ps( xmm0, xmm0, 0xB1 );
	 	 xmm0 = _mm_xor_ps( xmm0, _sse_sgn13.xmm );
	xmm1 = _mm_shuffle_ps( xmm1, xmm1, 0x11 );
	 	 xmm1 = _mm_xor_ps( xmm1, _sse_sgn13.xmm );
	xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0xB1 );
	 	 xmm2 = _mm_xor_ps( xmm2, _sse_sgn13.xmm );
	xmm4 = _mm_load_ss((float *)&((aa)->e[0][0].imag) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][0].imag) );
	xmm4 = _mm_shuffle_ps( xmm4, xmm7, 0x00 );
	xmm4 = _mm_mul_ps( xmm4, xmm0 );
	xmm3 = _mm_add_ps( xmm3, xmm4 );
	xmm5 = _mm_load_ss((float *)&((aa)->e[0][1].imag) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][1].imag) );
	xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 );
	xmm5 = _mm_mul_ps( xmm5, xmm1 );
	xmm3 = _mm_add_ps( xmm3, xmm5 );
	xmm5 = _mm_load_ss((float *)&((aa)->e[0][2].imag) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][2].imag) );
	xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 );
	xmm5 = _mm_mul_ps( xmm5, xmm2 );
	xmm3 = _mm_add_ps( xmm3, xmm5 );
	_mm_storeu_ps((float *)&((cc)->c[0]), xmm3 );
	xmm1 = _mm_shuffle_ps( xmm1, xmm0, 0x44 );
	xmm7 = _mm_load_ss((float *)&((aa)->e[2][0].imag) );
	xmm5 = _mm_load_ss((float *)&((aa)->e[2][1].imag) );
	xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 );
	xmm5 = _mm_mul_ps( xmm5, xmm1 );
	xmm6 = _mm_add_ps( xmm6, xmm5 );
	xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0xB4 );
	 	 xmm2 = _mm_xor_ps( xmm2, _sse_sgn4.xmm );
	xmm7 = _mm_loadl_pi(xmm7, (__m64 *)&((aa)->e[2][2]) );
	xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x05 );
	xmm7 = _mm_mul_ps( xmm7, xmm2 );
	xmm6 = _mm_add_ps( xmm6, xmm7 );
	xmm7 = xmm6 ; 
	xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0xEE );
	xmm6 = _mm_add_ps( xmm6, xmm7 );
	_mm_storel_pi((__m64 *)&((cc)->c[2]), xmm6 );
}
Ejemplo n.º 27
0
void mBior53::transcols(char** dest, char** sour, unsigned int w, unsigned int h) const
{
        float fz = 0.0f;

        int n;
        float s, d;
        __m128 ms, md;
        unsigned int h2 = h / 2;

        const vec1D& tH = gettH();
        const vec1D& tG = gettG();

        for (unsigned int x = 0; x < w / 4; x++) {   //x<w/4   x = 4*x
                for (unsigned int k = 0; k < h2; k++) {

                        ms = _mm_load_ss(&fz);
                        md = ms;

                        for (int m = -2; m <= 2; m++) {
                                n = 2 * k + m;
                                if (n < 0) n = 0 - n;
                                if (n >= (int)h) n -= 2 * (1 + n - h);
                                ms = _mm_add_ps(ms, _mm_mul_ps(_mm_load_ps1(tH.data(m)), _mm_cvtpi8_ps(*(__m64 *)(&sour[n][4*x]))));
                        }
                        for (int m = 0; m <= 2; m++) {
                                n = 2 * k + m;
                                if (n < 0) n = 0 - n;
                                if (n >= (int)h) n -= 2 * (1 + n - h);
                                md = _mm_add_ps(md, _mm_mul_ps(_mm_load_ps1(tG.data(m)), _mm_cvtpi8_ps(*(__m64 *)(&sour[n][4*x]))));
                        }

                        if (4*x < w / 2) {
                                if ((w / 2) - (4*x) >= 4)
                                        mmxround4(&dest[k][4*x], ms);
                                else
                                        mmxround4TH(&dest[k][4*x], ms, (w / 2) - (4*x));   //skip first from LL part 10/2-4=1 [lo] o o o o * | * * * o o [hi]
                        } else
                                mmxround4TH(&dest[k][4*x], ms);

                        mmxround4TH(&dest[k+h2][4*x], md);
                }
        }
        _mm_empty();


        //odd remainder
        for (unsigned int x = w - (w % 4); x < w; x++) {
                for (unsigned int k = 0; k < h2; k++) {
                        s = 0;
                        d = 0;
                        for (int m = -2; m <= 2; m++) {
                                n = 2 * k + m;
                                if (n < 0) n = 0 - n;
                                if (n >= (int)h) n -= 2 * (1 + n - h);
                                s += tH[m] * float(sour[n][x]);
                        }
                        for (int m = 0; m <= 2; m++) {
                                n = 2 * k + m;
                                if (n < 0) n = 0 - n;
                                if (n >= (int)h) n -= 2 * (1 + n - h);
                                d += tG[m] * float(sour[n][x]);
                        }

                        if (x < w / 2)
                                dest[k][x] = mmxround(s);
                        else
                                dest[k][x] = mmxroundTH(s); 	        //is this needed? hi band were TH'ed on transrows

                        dest[k+h2][x] = mmxroundTH(d);          //is this needed? hi band were TH'ed on transrows on x>w/2
                }
        }
}
Ejemplo n.º 28
0
void BrushToolEdit::drawInner(const QPoint &pt, float strength)
{
    float fixedStrength = params.strength;
    strength *= fixedStrength;

    auto color = params.color;
    std::array<int, 3> colorParts = Terrain::expandColor(color);
    __m128 colorMM = _mm_setr_ps(colorParts[0], colorParts[1], colorParts[2], 0);

    SseRoundingModeScope roundingModeScope(_MM_ROUND_NEAREST);
    (void) roundingModeScope;

    switch (tool->type()) {
    case BrushType::Blur:
        drawBlur(pt, std::min(strength / 5.f, 4.f));
        break;
    case BrushType::Smoothen:
        drawSmoothen(pt, std::min(strength / 5.f, 4.f));
        break;
    case BrushType::Raise:
    case BrushType::Lower:
        if (tool->type() == BrushType::Lower) {
            fixedStrength = -fixedStrength;
            strength = -strength;
        }
        switch (params.pressureMode) {
        case BrushPressureMode::AirBrush:
            strength *= 3.f;
            drawRaiseLower(pt, [=](float &current, float before, float tip) {
                (void) before;
                current -= tip * strength;
            });
            break;
        case BrushPressureMode::Constant:
            if (tool->type() == BrushType::Lower) {
                drawRaiseLower(pt, [=](float &current, float before, float tip) {
                    current = Terrain::quantizeOne(std::max(current, before - tip * fixedStrength));
                });
            } else {
                drawRaiseLower(pt, [=](float &current, float before, float tip) {
                    current = Terrain::quantizeOne(std::min(current, before - tip * fixedStrength));
                });
            }
            break;
        case BrushPressureMode::Adjustable:
            drawRaiseLower(pt, [=](float &current, float before, float tip) {
                current = Terrain::quantizeOne(before - tip * strength);
            });
            break;
        }
        break;
    case BrushType::Paint:
        switch (params.pressureMode) {
        case BrushPressureMode::AirBrush:
            strength = 1.f - std::exp2(-strength);

            drawColor(pt, [=](quint32 &current, quint32 before, float tip) {
                (void) before;

                // convert current color to FP32
                auto currentMM = _mm_castps_si128(_mm_load_ss(reinterpret_cast<float *>(&current)));
                currentMM = _mm_unpacklo_epi8(currentMM, _mm_setzero_si128());
                currentMM = _mm_unpacklo_epi16(currentMM, _mm_setzero_si128());
                auto currentMF = _mm_cvtepi32_ps(currentMM);

                auto factor = _mm_set1_ps(tip * strength);

                // blend
                auto diff = _mm_sub_ps(colorMM, currentMF);
                diff = _mm_mul_ps(diff, factor);
                currentMF = _mm_add_ps(currentMF, diff);

                // convert to RGB32
                currentMF = _mm_add_ps(currentMF, globalDitherSampler.getM128());
                currentMM = _mm_cvttps_epi32(currentMF);
                currentMM = _mm_packs_epi32(currentMM, currentMM);
                currentMM = _mm_packus_epi16(currentMM, currentMM);

                _mm_store_ss(reinterpret_cast<float *>(&current), _mm_castsi128_ps(currentMM));
            });
            break;
        case BrushPressureMode::Constant:
            fixedStrength *= 0.01f;
            drawColor(pt, [=](quint32 &current, quint32 before, float tip) {
                // convert current color to FP32
                auto currentMM = _mm_castps_si128(_mm_load_ss(reinterpret_cast<float *>(&current)));
                currentMM = _mm_unpacklo_epi8(currentMM, _mm_setzero_si128());
                currentMM = _mm_unpacklo_epi16(currentMM, _mm_setzero_si128());
                auto currentMF = _mm_cvtepi32_ps(currentMM);

                // convert before color to FP32
                auto beforeMM = _mm_setr_epi32(before, 0, 0, 0);
                beforeMM = _mm_unpacklo_epi8(beforeMM, _mm_setzero_si128());
                beforeMM = _mm_unpacklo_epi16(beforeMM, _mm_setzero_si128());
                auto beforeMF = _mm_cvtepi32_ps(beforeMM);
                // beforeMM = _mm_add_ps(beforeMM, globalDitherSampler.getM128());

                // use "before" image to which way of color change is possible, and
                // compute possible range of result color
                auto diff = _mm_sub_ps(colorMM, beforeMF);
                auto factor = _mm_set1_ps(tip * fixedStrength);
                auto adddiff = _mm_mul_ps(diff, factor);
                beforeMF = _mm_add_ps(beforeMF, adddiff);
                auto diffDir = _mm_cmpgt_ps(diff, _mm_setzero_ps());

                // compute output image
                auto out1 = _mm_max_ps(currentMF, beforeMF);
                auto out2 = _mm_min_ps(currentMF, beforeMF);
                currentMF = _mm_or_ps(_mm_and_ps(diffDir, out1), _mm_andnot_ps(diffDir, out2));

                // convert to RGB32
                currentMF = _mm_add_ps(currentMF, globalDitherSampler.getM128());
                currentMM = _mm_cvttps_epi32(currentMF);
                currentMM = _mm_packs_epi32(currentMM, currentMM);
                currentMM = _mm_packus_epi16(currentMM, currentMM);

                _mm_store_ss(reinterpret_cast<float *>(&current), _mm_castsi128_ps(currentMM));
            });
            break;
        case BrushPressureMode::Adjustable:
            strength *= 0.01f;
            drawColor(pt, [=](quint32 &current, quint32 before, float tip) {

                // convert before color to FP32
                auto beforeMM = _mm_setr_epi32(before, 0, 0, 0);
                beforeMM = _mm_unpacklo_epi8(beforeMM, _mm_setzero_si128());
                beforeMM = _mm_unpacklo_epi16(beforeMM, _mm_setzero_si128());
                auto beforeMF = _mm_cvtepi32_ps(beforeMM);

                // blend
                auto diff = _mm_sub_ps(colorMM, beforeMF);
                auto factor = _mm_set1_ps(tip * strength);
                diff = _mm_mul_ps(diff, factor);
                beforeMF = _mm_add_ps(beforeMF, diff);

                // convert to RGB32
                beforeMF = _mm_add_ps(beforeMF, globalDitherSampler.getM128());
                beforeMM = _mm_cvttps_epi32(beforeMF);
                beforeMM = _mm_packs_epi32(beforeMM, beforeMM);
                beforeMM = _mm_packus_epi16(beforeMM, beforeMM);

                _mm_store_ss(reinterpret_cast<float *>(&current), _mm_castsi128_ps(beforeMM));
            });
            break;
        }
        break;
    }

}
Ejemplo n.º 29
0
void mBior53::synthcols(char** dest, char** sour, unsigned int w, unsigned int h) const     //w,h of the LO part
{
        float fz = 0.0f;
        float mul2 = 2.0f;

        int n;
        float s2k, s2k1;
        __m128 ms2k, ms2k1;
        unsigned int w2 = 2 * w;

        const vec1D& H2m = getH2m();
        const vec1D& G2m = getG2m();
        const vec1D& H2m1 = getH2m1();
        const vec1D& G2m1 = getG2m1();

        for (unsigned int x = 0; x < w2 / 4; x++) {      //x<w2/2   x = 4*x
                for (unsigned int k = 0; k < h; k++) {

                        ms2k = _mm_load_ss(&fz);
                        ms2k1 = ms2k;

                        for (int m = 0; m <= 0; m++) {                             //s2k even H
                                n = k - m;
                                if (n < 0) n = 0 - n;
                                if (n >= (int)h) n -= 2 * (1 + n - h);
                                ms2k = _mm_add_ps(ms2k, _mm_mul_ps(_mm_load_ps1(H2m.data(m)), _mm_cvtpi8_ps(*(__m64 *)(&sour[n][4*x]))));
                        }
                        for (int m = 0; m <= 1; m++) {                             //s2k even G
                                n = k - m;
                                if (n < 0) n = 0 - n;
                                if (n >= (int)h) n -= 2 * (1 + n - h);
                                ms2k = _mm_add_ps(ms2k, _mm_mul_ps(_mm_load_ps1(G2m.data(m)), _mm_cvtpi8_ps(*(__m64 *)(&sour[n+h][4*x]))));
                        }

                        for (int m = -1; m <= 0; m++) {                           //s2k1 odd H
                                n = k - m;
                                if (n < 0) n = 0 - n;
                                if (n >= (int)h) n -= 2 * (1 + n - h);
                                ms2k1 = _mm_add_ps(ms2k1, _mm_mul_ps(_mm_load_ps1(H2m1.data(m)), _mm_cvtpi8_ps(*(__m64 *)(&sour[n][4*x]))));
                        }
                        for (int m = -1; m <= 1; m++) {                          //s2k1 odd G
                                n = k - m;
                                if (n < 0) n = 0 - n;
                                if (n >= (int)h) n -= 2 * (1 + n - h);
                                ms2k1 = _mm_add_ps(ms2k1, _mm_mul_ps(_mm_load_ps1(G2m1.data(m)), _mm_cvtpi8_ps(*(__m64 *)(&sour[n+h][4*x]))));
                        }

                        __m128 mmul2 = _mm_load_ps1(&mul2);

                        mmxround4(&dest[2*k][4*x], _mm_mul_ps(ms2k, mmul2));
                        mmxround4(&dest[2*k+1][4*x], _mm_mul_ps(ms2k1, mmul2));
                }
        }
        _mm_empty();


        //odd remainder
        for (unsigned int x = w2 - (w2 % 4); x < w2; x++) {
                for (unsigned int k = 0; k < h; k++) {

                        s2k = 0;
                        s2k1 = 0;

                        for (int m = H2m.first(); m <= H2m.last(); m++) {        //s2k even H
                                n = k - m;
                                if (n < 0) n = 0 - n;
                                if (n >= (int)h) n -= 2 * (1 + n - h);
                                s2k += H2m[m] * float(sour[n][x]);
                        }
                        for (int m = G2m.first(); m <= G2m.last(); m++) {        //s2k even G
                                n = k - m;
                                if (n < 0) n = 0 - n;
                                if (n >= (int)h) n -= 2 * (1 + n - h);
                                s2k += G2m[m] * float(sour[n+h][x]);
                        }

                        for (int m = H2m1.first(); m <= H2m1.last(); m++) {     //s2k1 odd H
                                n = k - m;
                                if (n < 0) n = 0 - n;
                                if (n >= (int)h) n -= 2 * (1 + n - h);
                                s2k1 += H2m1[m] * float(sour[n][x]);
                        }
                        for (int m = G2m1.first(); m <= G2m1.last(); m++) {      //s2k1 odd G
                                n = k - m;
                                if (n < 0) n = 0 - n;
                                if (n >= (int)h) n -= 2 * (1 + n - h);
                                s2k1 += G2m1[m] * float(sour[n+h][x]);
                        }

                        dest[2*k][x] = mmxround(2.0f * s2k);
                        dest[2*k+1][x] = mmxround(2.0f * s2k1);
                }
        }
}
Ejemplo n.º 30
0
test (float *e)
{
  return _mm_load_ss (e); 
}