#ifdef FOLD_USE_INTRINSICS
/*
 * Adds the four signed 8-bit samples in[0..3] to the four floats out[0..3].
 *
 * The MMX operand is assembled from exactly four bytes.  Dereferencing
 * `in` as an __m64 would load eight bytes and therefore read past the end
 * of the input on the last group.
 */
static inline void accumulate4_8bit(float *out, const char *in)
{
    const __m64 packed = _mm_set_pi8(0, 0, 0, 0, in[3], in[2], in[1], in[0]);
    const __m128 acc = _MM_LOAD_PS(out);
    const __m128 sum = _mm_add_ps(acc, _mm_cvtpi8_ps(packed));

    _MM_STORE_PS(out, sum);
}
#endif

/*
 * Combines unpack and accumulate: out[i] += (float)in[i] for i in [0, n).
 *
 *   out  accumulator of n floats, updated in place
 *   in   n 8-bit samples (converted as signed bytes on the SIMD path)
 *   n    number of elements; n <= 0 is a no-op
 *
 * With FOLD_USE_INTRINSICS the work is done 16, then 4, then 1 element at
 * a time.  _MM_LOAD_PS/_MM_STORE_PS are defined elsewhere in the project;
 * NOTE(review): if they map to the aligned forms, `out` must be 16-byte
 * aligned - confirm against their definition.
 */
void vector_accumulate_8bit(float *out, const char *in, int n)
{
#ifdef FOLD_USE_INTRINSICS
    __m128 in_, out_, tmp_;
    float ftmp;
    int ii;

    for (ii = 0; ii < (n & -16); ii += 16) {
        __builtin_prefetch(out + 64, 1, 0);
        __builtin_prefetch(in + 64, 0, 0);
        accumulate4_8bit(out, in);
        accumulate4_8bit(out + 4, in + 4);
        accumulate4_8bit(out + 8, in + 8);
        accumulate4_8bit(out + 12, in + 12);
        in += 16;
        out += 16;
    }
    for (; ii < (n & -4); ii += 4) {
        accumulate4_8bit(out, in);
        in += 4;
        out += 4;
    }

    /* Leave MMX state before any plain C floating-point code runs. */
    _mm_empty();

    for (; ii < n; ii++) {
        /* Cast these without intrinsics */
        ftmp = (float)(*in);
        out_ = _mm_load_ss(out);
        in_ = _mm_load_ss(&ftmp);
        tmp_ = _mm_add_ss(out_, in_);
        _mm_store_ss(out, tmp_);
        in += 1;
        out += 1;
    }
#else
    int i;

    for (i = 0; i < n; i++) {
        out[i] += (float)in[i];
    }
#endif
}
/**
 * Converts a floating-point RGB colour to 8-bit-per-channel form using
 * scalar SSE instructions.
 *
 * Each channel is computed as min(255, 255 * max(C, 0)), i.e.
 * clamp(255 * C, 0, 255), and then converted to an integer with
 * _mm_cvtss_si32 (which rounds according to the current MXCSR rounding
 * mode). The alpha channel of the result is always 0xFF.
 *
 * @param src colour whose X, Y and Z members are read as R, G and B.
 * @return the packed colour built from the three clamped channels.
 */
static SColor4ub sse_ConvertRGBColorTo4ub(const RGBColor& src)
{
	const __m128 floor0 = _mm_setzero_ps();
	const __m128 scale255 = _mm_set_ss(255.0f);

	// Red: clamp below, scale, clamp above.
	__m128 red = _mm_max_ss(_mm_load_ss(&src.X), floor0);
	red = _mm_mul_ss(red, scale255);
	red = _mm_min_ss(red, scale255);

	// Green.
	__m128 green = _mm_max_ss(_mm_load_ss(&src.Y), floor0);
	green = _mm_mul_ss(green, scale255);
	green = _mm_min_ss(green, scale255);

	// Blue.
	__m128 blue = _mm_max_ss(_mm_load_ss(&src.Z), floor0);
	blue = _mm_mul_ss(blue, scale255);
	blue = _mm_min_ss(blue, scale255);

	// Convert each channel to an integer and assemble the result.
	const int redByte = _mm_cvtss_si32(red);
	const int greenByte = _mm_cvtss_si32(green);
	const int blueByte = _mm_cvtss_si32(blue);

	return SColor4ub(redByte, greenByte, blueByte, 0xFF);
}
/*
 * Element-wise accumulate: out[i] += in[i] for i in [0, n).
 *
 *   out  accumulator of n floats, updated in place
 *   in   n floats to add
 *   n    number of elements; n <= 0 is a no-op
 *
 * With FOLD_USE_INTRINSICS the work is done in blocks of 16 (four vector
 * adds plus a prefetch), then blocks of 4, then single elements.
 * _MM_LOAD_PS/_MM_STORE_PS are defined elsewhere in the project.
 */
void vector_accumulate(float *out, const float *in, int n)
{
#ifdef FOLD_USE_INTRINSICS
    const int unrolled_end = n & -16;
    const int vector_end = n & -4;
    __m128 addend, acc, sum;
    int pos = 0;
    int step;

    /* 16 elements per pass: one prefetch pair, four 4-wide adds. */
    while (pos < unrolled_end) {
        __builtin_prefetch(out + 64, 1, 0);
        __builtin_prefetch(in + 64, 0, 0);
        for (step = 0; step < 4; step++) {
            addend = _MM_LOAD_PS(in);
            acc = _MM_LOAD_PS(out);
            sum = _mm_add_ps(acc, addend);
            _MM_STORE_PS(out, sum);
            in += 4;
            out += 4;
        }
        pos += 16;
    }

    /* Remaining whole groups of four. */
    while (pos < vector_end) {
        addend = _MM_LOAD_PS(in);
        acc = _MM_LOAD_PS(out);
        sum = _mm_add_ps(acc, addend);
        _MM_STORE_PS(out, sum);
        in += 4;
        out += 4;
        pos += 4;
    }

    /* Final one to three elements, one scalar add each. */
    while (pos < n) {
        addend = _mm_load_ss(in);
        acc = _mm_load_ss(out);
        sum = _mm_add_ss(acc, addend);
        _mm_store_ss(out, sum);
        in++;
        out++;
        pos++;
    }
    _mm_empty();
#else
    int idx = 0;

    while (idx < n) {
        out[idx] += in[idx];
        idx++;
    }
#endif
}
/*
 * Autocorrelation for lags 0..3, computed four lags at a time:
 *   autoc[k] = sum over i of data[i] * data[i+k],  k = 0..3
 *
 * `lag` is only checked by the assertions; four values are always written
 * to autoc[0..3].
 *
 * The first loop handles every position that has three valid successors
 * and reads them with one unaligned load.  The second loop walks the last
 * (up to three) samples from the end backwards, keeping a sliding window
 * whose lanes beyond the end of the data are zero.
 */
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	int pos;
	int last_full = data_len - 4;	/* last index with a full 4-sample window */
	__m128 acc;

	(void) lag;
	FLAC__ASSERT(lag <= 4);
	FLAC__ASSERT(lag <= data_len);

	acc = _mm_setzero_ps();

	for(pos = 0; pos <= last_full; pos++) {
		__m128 window, head;

		window = _mm_loadu_ps(data+pos);		/* data[pos..pos+3] */
		head = _mm_shuffle_ps(window, window, 0);	/* data[pos] in every lane */
		acc = _mm_add_ps(acc, _mm_mul_ps(window, head));
	}

	{
		__m128 history = _mm_setzero_ps();
		int tail_start = last_full + 1;

		if(tail_start < 0)
			tail_start = 0;

		for(pos = data_len-1; pos >= tail_start; pos--) {
			__m128 sample;

			sample = _mm_load_ss(data+pos);
			sample = _mm_shuffle_ps(sample, sample, 0);
			/* shift the window up one lane, then put the new sample in lane 0 */
			history = _mm_shuffle_ps(history, history, _MM_SHUFFLE(2,1,0,3));
			history = _mm_move_ss(history, sample);
			acc = _mm_add_ps(acc, _mm_mul_ps(sample, history));
		}
	}

	_mm_storeu_ps(autoc, acc);
}
/*
 * Autocorrelation for lags 0..3 using a forward sliding window:
 *   autoc[k] = sum over i of data[i] * data[i-k],  k = 0..3
 * where samples before data[0] count as zero.
 *
 * `lag` is only checked by the assertions; four values are always written
 * to autoc[0..3].  data_len must be at least 1 (asserted).
 */
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	const FLAC__real *cursor = data;
	unsigned remaining;
	__m128 sample, history, acc;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 4);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	acc = _mm_setzero_ps();

	/* first sample: the window holds data[0] in lane 0 and zeros elsewhere */
	sample = _mm_load_ss(cursor++);
	history = sample;
	sample = _mm_shuffle_ps(sample, sample, 0);
	sample = _mm_mul_ps(sample, history);
	acc = _mm_add_ps(acc, sample);

	for(remaining = data_len - 1; remaining; remaining--) {
		sample = _mm_load1_ps(cursor++);
		/* shift the window up one lane, then put the new sample in lane 0 */
		history = _mm_shuffle_ps(history, history, _MM_SHUFFLE(2,1,0,3));
		history = _mm_move_ss(history, sample);
		sample = _mm_mul_ps(sample, history);
		acc = _mm_add_ps(acc, sample);
	}

	_mm_storeu_ps(autoc, acc);
}
/*
=================
FloatToShort

Converts a float to a 16-bit integer, truncating toward zero and
saturating to [-32768, 32767].

Both branches clamp in the floating-point domain before converting, so
values far outside the int range (and NaN) never reach a float->int
conversion.  Converting first and clamping afterwards is undefined for
such inputs; on x86 it turns large positive values into -32768.
NaN maps to -32768 in both branches.
=================
*/
short FloatToShort (float f){
#if defined SIMD_X86
	__m128 xmm;

	xmm = _mm_load_ss(&f);
	/* max/min return their second operand when the first is NaN */
	xmm = _mm_max_ss(xmm, _mm_set_ss(-32768.0f));
	xmm = _mm_min_ss(xmm, _mm_set_ss(32767.0f));
	return (short)_mm_cvtt_ss2si(xmm);
#else
	/* written as !(f > lo) so that NaN takes the lower bound as well */
	if (!(f > -32768.0f)) {
		return -32768;
	}
	if (f > 32767.0f) {
		return 32767;
	}
	return (short)(int)f;
#endif
}
static inline long conv_yF_yHalf (const float *src, uint16_t *dst, long samples) { const __v4sf *s_vec; uint64_t *d_vec; long n = samples; s_vec = (const __v4sf *)src; d_vec = (uint64_t *)dst; while (n >= 4) { __m128 in_val = _mm_loadu_ps((float *)s_vec++); __m128i out_val = _mm_cvtps_ph(in_val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); _mm_storel_epi64((__m128i *)d_vec++, out_val); n -= 4; } src = (const float *)s_vec; dst = (uint16_t *)d_vec; while (n) { __m128 in_val = _mm_load_ss(src++); __m128i out_val = _mm_cvtps_ph(in_val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); *dst++ = _mm_extract_epi16(out_val, 0); n -= 1; } return samples; }
/* ==================== idMD5Mesh::CalculateBounds ==================== */ void idMD5Mesh::CalculateBounds( const idJointMat * entJoints, idBounds & bounds ) const { __m128 minX = vector_float_posInfinity; __m128 minY = vector_float_posInfinity; __m128 minZ = vector_float_posInfinity; __m128 maxX = vector_float_negInfinity; __m128 maxY = vector_float_negInfinity; __m128 maxZ = vector_float_negInfinity; for ( int i = 0; i < numMeshJoints; i++ ) { const idJointMat & joint = entJoints[meshJoints[i]]; __m128 x = _mm_load_ps( joint.ToFloatPtr() + 0 * 4 ); __m128 y = _mm_load_ps( joint.ToFloatPtr() + 1 * 4 ); __m128 z = _mm_load_ps( joint.ToFloatPtr() + 2 * 4 ); minX = _mm_min_ps( minX, x ); minY = _mm_min_ps( minY, y ); minZ = _mm_min_ps( minZ, z ); maxX = _mm_max_ps( maxX, x ); maxY = _mm_max_ps( maxY, y ); maxZ = _mm_max_ps( maxZ, z ); } __m128 expand = _mm_splat_ps( _mm_load_ss( & maxJointVertDist ), 0 ); minX = _mm_sub_ps( minX, expand ); minY = _mm_sub_ps( minY, expand ); minZ = _mm_sub_ps( minZ, expand ); maxX = _mm_add_ps( maxX, expand ); maxY = _mm_add_ps( maxY, expand ); maxZ = _mm_add_ps( maxZ, expand ); _mm_store_ss( bounds.ToFloatPtr() + 0, _mm_splat_ps( minX, 3 ) ); _mm_store_ss( bounds.ToFloatPtr() + 1, _mm_splat_ps( minY, 3 ) ); _mm_store_ss( bounds.ToFloatPtr() + 2, _mm_splat_ps( minZ, 3 ) ); _mm_store_ss( bounds.ToFloatPtr() + 3, _mm_splat_ps( maxX, 3 ) ); _mm_store_ss( bounds.ToFloatPtr() + 4, _mm_splat_ps( maxY, 3 ) ); _mm_store_ss( bounds.ToFloatPtr() + 5, _mm_splat_ps( maxZ, 3 ) ); }
/*
 * Autocorrelation for lags 0..15, computed sixteen lags at a time:
 *   autoc[k] = sum over i of data[i] * data[i+k],  k = 0..15
 *
 * `lag` is only checked by the assertions; sixteen values are always
 * written to autoc[0..15].
 *
 * The first loop handles every position that has fifteen valid successors
 * (four unaligned loads per position).  The second loop walks the last
 * (up to fifteen) samples from the end backwards through a 16-lane
 * sliding window whose lanes beyond the end of the data are zero.
 */
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	int pos;
	int last_full = data_len - 16;	/* last index with a full 16-sample window */
	__m128 acc0, acc1, acc2, acc3;

	(void) lag;
	FLAC__ASSERT(lag <= 16);
	FLAC__ASSERT(lag <= data_len);

	acc0 = _mm_setzero_ps();
	acc1 = _mm_setzero_ps();
	acc2 = _mm_setzero_ps();
	acc3 = _mm_setzero_ps();

	for(pos = 0; pos <= last_full; pos++) {
		const FLAC__real *window = data + pos;
		__m128 head, w0, w1, w2, w3;

		w0 = _mm_loadu_ps(window);
		w1 = _mm_loadu_ps(window+4);
		w2 = _mm_loadu_ps(window+8);
		w3 = _mm_loadu_ps(window+12);

		head = _mm_shuffle_ps(w0, w0, 0);	/* data[pos] in every lane */

		acc0 = _mm_add_ps(acc0, _mm_mul_ps(w0, head));
		acc1 = _mm_add_ps(acc1, _mm_mul_ps(w1, head));
		acc2 = _mm_add_ps(acc2, _mm_mul_ps(w2, head));
		acc3 = _mm_add_ps(acc3, _mm_mul_ps(w3, head));
	}

	{
		__m128 h0 = _mm_setzero_ps();
		__m128 h1 = _mm_setzero_ps();
		__m128 h2 = _mm_setzero_ps();
		__m128 h3 = _mm_setzero_ps();
		int tail_start = last_full + 1;

		if(tail_start < 0)
			tail_start = 0;

		for(pos = data_len-1; pos >= tail_start; pos--) {
			__m128 sample;

			sample = _mm_load_ss(data+pos);
			sample = _mm_shuffle_ps(sample, sample, 0);

			/* rotate every register up one lane ... */
			h3 = _mm_shuffle_ps(h3, h3, _MM_SHUFFLE(2,1,0,3));
			h2 = _mm_shuffle_ps(h2, h2, _MM_SHUFFLE(2,1,0,3));
			h1 = _mm_shuffle_ps(h1, h1, _MM_SHUFFLE(2,1,0,3));
			h0 = _mm_shuffle_ps(h0, h0, _MM_SHUFFLE(2,1,0,3));

			/* ... then carry lane 0 from the lower register (high to low) */
			h3 = _mm_move_ss(h3, h2);
			h2 = _mm_move_ss(h2, h1);
			h1 = _mm_move_ss(h1, h0);
			h0 = _mm_move_ss(h0, sample);

			acc3 = _mm_add_ps(acc3, _mm_mul_ps(sample, h3));
			acc2 = _mm_add_ps(acc2, _mm_mul_ps(sample, h2));
			acc1 = _mm_add_ps(acc1, _mm_mul_ps(sample, h1));
			acc0 = _mm_add_ps(acc0, _mm_mul_ps(sample, h0));
		}
	}

	_mm_storeu_ps(autoc, acc0);
	_mm_storeu_ps(autoc+4, acc1);
	_mm_storeu_ps(autoc+8, acc2);
	_mm_storeu_ps(autoc+12, acc3);
}
// static void LLViewerJointMesh::updateGeometry(LLFace *mFace, LLPolyMesh *mMesh) { LLStrider<LLVector3> o_vertices; LLStrider<LLVector3> o_normals; //get vertex and normal striders LLVertexBuffer* buffer = mFace->getVertexBuffer(); buffer->getVertexStrider(o_vertices, 0); buffer->getNormalStrider(o_normals, 0); F32* __restrict vert = o_vertices[0].mV; F32* __restrict norm = o_normals[0].mV; const F32* __restrict weights = mMesh->getWeights(); const LLVector4a* __restrict coords = (LLVector4a*) mMesh->getCoords(); const LLVector4a* __restrict normals = (LLVector4a*) mMesh->getNormals(); U32 offset = mMesh->mFaceVertexOffset*4; vert += offset; norm += offset; for (U32 index = 0; index < mMesh->getNumVertices(); index++) { // equivalent to joint = floorf(weights[index]); S32 joint = _mm_cvtt_ss2si(_mm_load_ss(weights+index)); F32 w = weights[index] - joint; LLMatrix4a gBlendMat; if (w != 0.f) { // blend between matrices and apply gBlendMat.setLerp(gJointMatAligned[joint+0], gJointMatAligned[joint+1], w); LLVector4a res; gBlendMat.affineTransform(coords[index], res); res.store4a(vert+index*4); gBlendMat.rotate(normals[index], res); res.store4a(norm+index*4); } else { // No lerp required in this case. LLVector4a res; gJointMatAligned[joint].affineTransform(coords[index], res); res.store4a(vert+index*4); gJointMatAligned[joint].rotate(normals[index], res); res.store4a(norm+index*4); } } buffer->flush(); }
/**
 * Applies one interpolated FIR filter to every channel of an interleaved
 * input and writes one output sample per channel.
 *
 * The effective coefficient for tap i is
 *     coef1[i] + (coef2[i] - coef1[i]) * frac
 * and output[c] = sum over i of input[i * channels + c] * coefficient(i).
 *
 * @param channels      number of interleaved channels
 * @param filter_length number of taps; processed in groups of four, so any
 *                      remainder of filter_length % 4 taps is not applied
 * @param coef1, coef2  the two coefficient sets being interpolated; loaded
 *                      with aligned loads, so both must be 16-byte aligned
 * @param frac          interpolation factor between coef1 and coef2
 * @param input         interleaved input samples
 * @param output        receives `channels` results
 *
 * The coefficients are indexed from the start of the filter for every
 * channel. Advancing the coef1/coef2 pointers across channels would make
 * every channel after the first read past the end of the filter.
 */
void FastResampler_FirFilter2_Cn_SSE2(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output) {
	const __m128 v_frac = _mm_set1_ps(frac);
	const unsigned int blocks = filter_length / 4;
	for(unsigned int c = 0; c < channels; ++c) {
		__m128 sum = _mm_setzero_ps();
		const float *input2 = input + c;
		for(unsigned int i = 0; i < blocks; ++i) {
			const __m128 v_coef1 = _mm_load_ps(coef1 + i * 4);
			const __m128 v_coef2 = _mm_load_ps(coef2 + i * 4);
			const __m128 filter_value = _mm_add_ps(v_coef1, _mm_mul_ps(_mm_sub_ps(v_coef2, v_coef1), v_frac));
			// gather four consecutive samples of this channel (stride = channels)
			const __m128 v_input1 = _mm_load_ss(input2); input2 += channels;
			const __m128 v_input2 = _mm_load_ss(input2); input2 += channels;
			const __m128 v_input3 = _mm_load_ss(input2); input2 += channels;
			const __m128 v_input4 = _mm_load_ss(input2); input2 += channels;
			const __m128 v_input = _mm_movelh_ps(_mm_unpacklo_ps(v_input1, v_input2), _mm_unpacklo_ps(v_input3, v_input4));
			sum = _mm_add_ps(sum, _mm_mul_ps(v_input, filter_value));
		}
		// horizontal add: (s0 + s2) + (s1 + s3)
		const __m128 sum2 = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, 0x0e));
		const __m128 sum3 = _mm_add_ss(sum2, _mm_shuffle_ps(sum2, sum2, 0x01));
		_mm_store_ss(output + c, sum3);
	}
}
/* ===================== R_CopyDecalSurface ===================== */ static void R_CopyDecalSurface( idDrawVert * verts, int numVerts, triIndex_t * indexes, int numIndexes, const decal_t * decal, const float fadeColor[4] ) { assert_16_byte_aligned( &verts[numVerts] ); assert_16_byte_aligned( &indexes[numIndexes] ); assert_16_byte_aligned( decal->indexes ); assert_16_byte_aligned( decal->verts ); assert( ( ( decal->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 ); assert( ( ( decal->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 ); assert_16_byte_aligned( fadeColor ); const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 ); const __m128i vector_short_num_verts = _mm_packs_epi32( vector_int_num_verts, vector_int_num_verts ); const __m128 vector_fade_color = _mm_load_ps( fadeColor ); const __m128i vector_color_mask = _mm_set_epi32( 0, -1, 0, 0 ); // copy vertices and apply depth/time based fading assert_offsetof( idDrawVert, color, 6 * 4 ); for ( int i = 0; i < decal->numVerts; i++ ) { const idDrawVert &srcVert = decal->verts[i]; idDrawVert &dstVert = verts[numVerts + i]; __m128i v0 = _mm_load_si128( (const __m128i *)( (byte *)&srcVert + 0 ) ); __m128i v1 = _mm_load_si128( (const __m128i *)( (byte *)&srcVert + 16 ) ); __m128 depthFade = _mm_splat_ps( _mm_load_ss( decal->vertDepthFade + i ), 0 ); __m128 timeDepthFade = _mm_mul_ps( depthFade, vector_fade_color ); __m128i colorInt = _mm_cvtps_epi32( timeDepthFade ); __m128i colorShort = _mm_packs_epi32( colorInt, colorInt ); __m128i colorByte = _mm_packus_epi16( colorShort, colorShort ); v1 = _mm_or_si128( v1, _mm_and_si128( colorByte, vector_color_mask ) ); _mm_stream_si128( (__m128i *)( (byte *)&dstVert + 0 ), v0 ); _mm_stream_si128( (__m128i *)( (byte *)&dstVert + 16 ), v1 ); } // copy indexes assert( ( decal->numIndexes & 7 ) == 0 ); assert( sizeof( triIndex_t ) == 2 ); for ( int i = 0; i < decal->numIndexes; i += 8 ) { __m128i vi = _mm_load_si128( (const __m128i 
*)&decal->indexes[i] ); vi = _mm_add_epi16( vi, vector_short_num_verts ); _mm_stream_si128( (__m128i *)&indexes[numIndexes + i], vi ); } _mm_sfence(); }
/*
 * Expands 8-bit alpha values into 4-byte pixels of the form (0, 0, 0, a).
 *
 * Four source bytes are processed per iteration, producing 16 destination
 * bytes written with a streaming store, so each 16-byte destination group
 * must be 16-byte aligned.  Only size / 4 groups are processed; a trailing
 * size % 4 values are left untouched.
 */
void unpack_a8_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
	const Uint32 groups = size / 4;
	const __m128i zero = _mm_setzero_si128();
	Uint32 group;

	for (group = 0; group < groups; group++)
	{
		/* four alpha bytes in the low 32 bits, the rest zero */
		__m128i pixels = _mm_castps_si128(
			_mm_load_ss((float*)&source[group * 4]));

		/* byte -> high byte of a 16-bit word -> top byte of a 32-bit word */
		pixels = _mm_unpacklo_epi8(zero, pixels);
		pixels = _mm_unpacklo_epi16(zero, pixels);

		_mm_stream_si128((__m128i*)&dest[group * 16], pixels);
	}
}
/*
 * Overwrites the fourth byte of every 4-byte pixel in `source` with the
 * corresponding value from `alpha`; the other three bytes are untouched.
 *
 * Note that `source` is the buffer being modified.  Four pixels are
 * processed per iteration using a byte-masked store; only size / 4 groups
 * are processed, so a trailing size % 4 pixels are left unchanged.
 */
void replace_a8_rgba8_sse2(const Uint8* alpha, const Uint32 size, Uint8* source)
{
	const Uint32 groups = size / 4;
	const __m128i zero = _mm_setzero_si128();
	Uint32 group;

	for (group = 0; group < groups; group++)
	{
		/* four alpha bytes in the low 32 bits, the rest zero */
		__m128i expanded = _mm_castps_si128(
			_mm_load_ss((float*)&alpha[group * 4]));

		/* move each alpha byte into the top byte of its 32-bit lane */
		expanded = _mm_unpacklo_epi8(zero, expanded);
		expanded = _mm_unpacklo_epi16(zero, expanded);

		/* only bytes whose mask byte has its high bit set are written */
		_mm_maskmoveu_si128(expanded, _mm_set1_epi32(0xFF000000),
			(char*)&source[group * 16]);
	}
}
/*
 * Expands 8-bit luminance values into 4-byte pixels of the form
 * (l, l, l, 0xFF).
 *
 * Four source bytes are processed per iteration, producing 16 destination
 * bytes written with a streaming store, so each 16-byte destination group
 * must be 16-byte aligned.  Only size / 4 groups are processed; a trailing
 * size % 4 values are left untouched.
 */
void unpack_l8_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
	const Uint32 groups = size / 4;
	const __m128i opaque = _mm_set1_epi32(0xFF000000);
	Uint32 group;

	for (group = 0; group < groups; group++)
	{
		/* four luminance bytes in the low 32 bits, the rest zero */
		__m128i pixels = _mm_castps_si128(
			_mm_load_ss((float*)&source[group * 4]));

		/* replicate every byte four times: l -> l l -> l l l l */
		pixels = _mm_unpacklo_epi8(pixels, pixels);
		pixels = _mm_unpacklo_epi16(pixels, pixels);

		/* force the fourth byte of every pixel to 0xFF */
		pixels = _mm_or_si128(pixels, opaque);

		_mm_stream_si128((__m128i*)&dest[group * 16], pixels);
	}
}
/*
 * Blends two 4-byte-per-pixel images using an 8-bit per-pixel weight.
 *
 * For every byte of a pixel, with a = alpha of that pixel:
 *   - every 8-bit value v is widened to the 16-bit value v * 257,
 *   - source0 is multiplied by (0xFFFF - a * 257) and source1 by a * 257,
 *     keeping the high 16 bits of each product,
 *   - the two are added with unsigned saturation, shifted right by 8 and
 *     packed back to bytes.
 * So alpha 0 selects source0 and alpha 255 selects source1.
 *
 * Four pixels are processed per iteration with aligned loads and a
 * streaming store, so source0, source1 and dest must be 16-byte aligned.
 * Only size / 4 groups are processed; a trailing size % 4 pixels are left
 * untouched.
 */
void blend_sse2(const Uint8* alpha, const Uint32 size, const Uint8* source0, const Uint8* source1, Uint8* dest)
{
	const Uint32 groups = size / 4;
	const __m128i all_ones = _mm_set1_epi8(0xFF);
	Uint32 group;

	for (group = 0; group < groups; group++)
	{
		__m128i pix0, pix1, weights;
		__m128i wide0, wide1, weight, inverse, part0, part1;
		__m128i low_result, high_result;

		pix0 = _mm_load_si128((__m128i*)&source0[group * 16]);
		pix1 = _mm_load_si128((__m128i*)&source1[group * 16]);

		/* replicate each alpha byte across the four bytes of its pixel */
		weights = _mm_castps_si128(_mm_load_ss((float*)&alpha[group * 4]));
		weights = _mm_unpacklo_epi8(weights, weights);
		weights = _mm_unpacklo_epi16(weights, weights);

		/* pixels 0 and 1 */
		wide0 = _mm_unpacklo_epi8(pix0, pix0);
		wide1 = _mm_unpacklo_epi8(pix1, pix1);
		weight = _mm_unpacklo_epi32(weights, weights);
		inverse = _mm_sub_epi16(all_ones, weight);
		part0 = _mm_mulhi_epu16(wide0, inverse);
		part1 = _mm_mulhi_epu16(wide1, weight);
		low_result = _mm_adds_epu16(part0, part1);
		low_result = _mm_srli_epi16(low_result, 8);

		/* pixels 2 and 3 */
		wide0 = _mm_unpackhi_epi8(pix0, pix0);
		wide1 = _mm_unpackhi_epi8(pix1, pix1);
		weight = _mm_unpackhi_epi32(weights, weights);
		inverse = _mm_sub_epi16(all_ones, weight);
		part0 = _mm_mulhi_epu16(wide0, inverse);
		part1 = _mm_mulhi_epu16(wide1, weight);
		high_result = _mm_adds_epu16(part0, part1);
		high_result = _mm_srli_epi16(high_result, 8);

		_mm_stream_si128((__m128i*)&dest[group * 16],
			_mm_packus_epi16(low_result, high_result));
	}
}
/* Checks test() against a lane-by-lane reference built with the scalar
   reciprocal-square-root intrinsic: each of the four input lanes is
   passed through _mm_rsqrt_ss on its own and the results are compared
   with the vector returned by test().  Aborts on mismatch.  */
static void
TEST (void)
{
  union128 actual, input;
  float expected[4];
  int lane;

  input.x = _mm_set_ps (24.43, 68.346, 43.35, 546.46);
  actual.x = test (input.x);

  for (lane = 0; lane < 4; lane++)
    _mm_store_ss (&expected[lane],
		  _mm_rsqrt_ss (_mm_load_ss (&input.a[lane])));

  if (check_union128 (actual, expected))
    abort ();
}
void Vec3f::Normalize() { /* float mod = sqrt(x*x + y*y + z*z); x = x / mod; y = y / mod; z = z / mod; */ const float lengthSq = x * x + y * y + z * z; if ((*(int*)&lengthSq) != 0) { float inv;// = InvSqrt(lengthSq); __m128 in = _mm_load_ss(&lengthSq); _mm_store_ss(&inv, _mm_rsqrt_ss(in)); x = x*inv; y = y*inv; z = z*inv; } }
/*
 * dd = aa + cc * bb, applied to the six floats of a 3-component vector.
 *
 * The first four floats (starting at c[0]) are handled in one register and
 * the two floats of c[2] in the low half of a second register.
 * (Assumes each c[k] is a pair of floats, real then imaginary - confirm
 * against su3_vectorf.)
 *
 * The 64-bit loads are seeded with a zeroed register.  Only the low half
 * of those registers is ever used, so the result is unchanged, but the
 * intrinsic no longer reads an uninitialized variable.
 */
void intrin_sse_scalar_mult_add_su3_vector(su3_vectorf* aa, su3_vectorf* bb, float cc, su3_vectorf* dd)
{
  /* XMM Variables */
  __m128 xmm2, xmm3, xmm0, xmm1, xmm4;

  /* broadcast the scalar to all four lanes */
  xmm4 = _mm_load_ss((float *)&((cc)) );
  xmm4 = _mm_shuffle_ps( xmm4, xmm4, 0x00 );

  /* aa: c[0], c[1] in xmm0; c[2] duplicated into both halves of xmm1 */
  xmm0 = _mm_loadu_ps((float *)&((aa)->c[0]) );
  xmm1 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)&((aa)->c[2]) );
  xmm1 = _mm_shuffle_ps( xmm1, xmm1, 0x44 );

  /* bb: same layout in xmm2 / xmm3 */
  xmm2 = _mm_loadu_ps((float *)&((bb)->c[0]) );
  xmm3 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)&((bb)->c[2]) );
  xmm3 = _mm_shuffle_ps( xmm3, xmm3, 0x44 );

  /* cc * bb */
  xmm2 = _mm_mul_ps( xmm2, xmm4 );
  xmm3 = _mm_mul_ps( xmm3, xmm4 );

  /* aa + cc * bb */
  xmm0 = _mm_add_ps( xmm0, xmm2 );
  xmm1 = _mm_add_ps( xmm1, xmm3 );

  /* four floats to c[0..1], the low two floats to c[2] */
  _mm_storeu_ps((float *)&((dd)->c[0]), xmm0 );
  _mm_storel_pi((__m64 *)&((dd)->c[2]), xmm1 );
}
/*
============
idSIMD_SSE::CmpLT

  dst[i] |= ( src0[i] < constant ) << bitNum;

Intrinsics translation of an inline-assembly routine; the original
assembly is kept in the comments next to the statements that replace it.

NOTE(review): only the branch taken when ( (int) src0 ) & 3 is non-zero is
visible here. The definition is cut off before its closing brace, so the
other branch and the handling of the last count % 4 elements ("post")
cannot be described from this view.
NOTE(review): ( (int) src0 ) truncates the pointer where pointers are wider
than int; only the low two bits are tested.
============
*/
void VPCALL idSIMD_SSE2::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	int i, cnt, pre, post;
	float *aligned;
	__m128 xmm0, xmm1;
	__m128i xmm0i;
	int cnt_l;
	char *src0_p;
	char *constant_p;
	char *dst_p;
	int mask_l;
	int dst_l;

	/* if the float array is not aligned on a 4 byte boundary */
	if ( ((int) src0) & 3 ) {
		/* unaligned memory access */
		pre = 0;
		cnt = count >> 2;			// number of whole groups of four floats
		post = count - (cnt<<2);	// elements left over after those groups

	/*
		__asm mov			edx, cnt
		__asm test		edx, edx
		__asm je			doneCmp
	*/
		cnt_l = cnt;
		if(cnt_l != 0) {
	/*
		__asm push		ebx
		__asm neg			edx
		__asm mov			esi, src0
		__asm prefetchnta	[esi+64]
		__asm movss		xmm1, constant
		__asm shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
		__asm mov			edi, dst
		__asm mov			cl, bitNum
	*/
			// the counter runs from -cnt up to 0, as in the assembly
			cnt_l = -cnt_l;
			src0_p = (char *) src0;
			_mm_prefetch(src0_p+64, _MM_HINT_NTA);
			// broadcast the comparison constant to all four lanes
			constant_p = (char *) &constant;
			xmm1 = _mm_load_ss((float *)constant_p);
			xmm1 = _mm_shuffle_ps(xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ));
			dst_p = (char *)dst;
	/*
		__asm loopNA:
	*/
			do {
	/*
		__asm movups		xmm0, [esi]
		__asm prefetchnta	[esi+128]
		__asm cmpltps		xmm0, xmm1
		__asm movmskps	eax, xmm0
		__asm mov			ah, al
		__asm shr			ah, 1
		__asm mov			bx, ax
		__asm shl			ebx, 14
		__asm mov			bx, ax
		__asm and			ebx, 0x01010101
		__asm shl			ebx, cl
		__asm or			ebx, dword ptr [edi]
		__asm mov			dword ptr [edi], ebx
		__asm add			esi, 16
		__asm add			edi, 4
		__asm inc			edx
		__asm jl			loopNA
		__asm pop			ebx
	*/
				// compare four floats; each lane becomes all ones or all zeros
				xmm0 = _mm_loadu_ps((float *) src0_p);
				_mm_prefetch(src0_p+128, _MM_HINT_NTA);
				xmm0 = _mm_cmplt_ps(xmm0, xmm1);
				// Simplify using SSE2
				// two saturating packs narrow each 32-bit lane to one byte
				// (0xFF or 0x00), giving four result bytes in the low 32 bits
				xmm0i = (__m128i) xmm0;
				xmm0i = _mm_packs_epi32(xmm0i, xmm0i);
				xmm0i = _mm_packs_epi16(xmm0i, xmm0i);
				mask_l = _mm_cvtsi128_si32(xmm0i);
				// End
				// keep one bit per byte, move it to bitNum, and OR it into
				// the four destination bytes in one 32-bit access
				mask_l = mask_l & 0x01010101;
				mask_l = mask_l << bitNum;
				dst_l  = *((int *) dst_p);
				mask_l = mask_l | dst_l;
				*((int *) dst_p) = mask_l;
				src0_p = src0_p + 16;
				dst_p = dst_p + 4;
				cnt_l = cnt_l + 1;
			} while (cnt_l < 0);
		}
	}
// Converts a float to an int by truncation toward zero, using the SSE1
// scalar instructions (load scalar, convert with truncation).
inline int ftoi_fast(float f)
{
	const __m128 scalar = _mm_load_ss(&f);
	return _mm_cvtt_ss2si(scalar);
}
/*
 * MEX entry point: 1-D correlation of a float signal with a kernel.
 *
 *   dst[i] = sum over k in [0, nz) of src[i + k] * kf[k],  i in [0, width)
 *
 * The operands are not taken from the MEX arguments visible here; the
 * function reads `coeff`, `_src`, `_dst`, `length` and `width`, which are
 * declared outside this block.  src must provide at least
 * width + nz - 1 readable elements.
 *
 * Outputs are produced 16 at a time, then 4 at a time, then one at a time.
 * Every path overwrites dst: the scalar tail uses a local accumulator that
 * starts at zero, so the last width % 4 outputs do not depend on what dst
 * held before the call.
 */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
	const float * kf = coeff;
	float * src = _src;
	float * dst = _dst;
	int i = 0, k, nz = length;

	const __m128 d4 = _mm_setzero_ps();
	float * S;
	float acc;
	__m128 s0, s1, s2, s3, t0, t1;
	__m128 f;

	/* 16 outputs per pass */
	for (i = 0; i <= width - 16; i += 16)
	{
		s0 = d4, s1 = d4, s2 = d4, s3 = d4;
		for (k = 0; k < nz; k++)
		{
			/* broadcast kf[k] to all four lanes */
			f = _mm_load_ss(kf + k);
			f = _mm_shuffle_ps(f, f, 0);

			S = src + i + k;
			t0 = _mm_loadu_ps(S);
			t1 = _mm_loadu_ps(S + 4);
			s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
			s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

			t0 = _mm_loadu_ps(S + 8);
			t1 = _mm_loadu_ps(S + 12);
			s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
			s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
		}
		_mm_storeu_ps(dst + i, s0);
		_mm_storeu_ps(dst + i + 4, s1);
		_mm_storeu_ps(dst + i + 8, s2);
		_mm_storeu_ps(dst + i + 12, s3);
	}

	/* 4 outputs per pass */
	for ( ; i <= width - 4; i += 4)
	{
		s0 = d4;
		for (k = 0; k < nz; k++)
		{
			f = _mm_load_ss(kf + k);
			f = _mm_shuffle_ps(f, f, 0);
			t0 = _mm_loadu_ps(src + k + i);
			s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
		}
		_mm_storeu_ps(dst + i, s0);
	}

	/* remaining outputs, one at a time */
	for ( ; i < width; i++)
	{
		acc = 0.0f;
		for (k = 0; k < nz; k++)
		{
			acc += *(src + i + k) * *(kf + k);
		}
		*(dst + i) = acc;
	}

	return;
}
// Converts a float to an int, truncating toward zero, with the SSE scalar
// convert-with-truncation instruction.
inline int FloatToInt( float x )
{
	const __m128 value = _mm_load_ss( &x );
	const int truncated = _mm_cvtt_ss2si( value );
	return truncated;
}
// Writes the square root of *pIn to *pOut using the SSE scalar square-root
// instruction (scalar load, sqrtss, scalar store).
inline void SSESqrt( float * pOut, float * pIn )
{
	const __m128 input = _mm_load_ss( pIn );
	const __m128 root = _mm_sqrt_ss( input );
	_mm_store_ss( pOut, root );
}
// This function handles the case when set_count = 2, in which we cannot // unroll the set loop by 4 to meet the SSE requirement (4 elements). static void InternalUnroll2Inv( const OMX_F32 *in, OMX_F32 *out, const OMX_F32 *twiddle, OMX_INT n) { OMX_INT i; OMX_INT n_by_2 = n >> 1; OMX_INT n_by_4 = n >> 2; OMX_INT n_mul_2 = n << 1; OMX_F32 *out0 = out; for (i = 0; i < n_by_2; i += 8) { const OMX_F32 *tw1 = twiddle + i; const OMX_F32 *tw2 = tw1 + i; const OMX_F32 *tw3 = tw2 + i; const OMX_F32 *tw1e = tw1 + 4; const OMX_F32 *tw2e = tw2 + 8; const OMX_F32 *tw3e = tw3 + 12; VC v_tw1; VC v_tw2; VC v_tw3; VC v_t0; VC v_t1; VC v_t2; VC v_t3; VC v_t4; VC v_t5; VC v_t6; VC v_t7; v_tw1.real = _mm_shuffle_ps(_mm_load_ss(tw1), _mm_load_ss(tw1e), _MM_SHUFFLE(0, 0, 0, 0)); v_tw1.imag = _mm_shuffle_ps(_mm_load_ss(tw1 + n_mul_2), _mm_load_ss(tw1e + n_mul_2), _MM_SHUFFLE(0, 0, 0, 0)); v_tw2.real = _mm_shuffle_ps(_mm_load_ss(tw2), _mm_load_ss(tw2e), _MM_SHUFFLE(0, 0, 0, 0)); v_tw2.imag = _mm_shuffle_ps(_mm_load_ss(tw2 + n_mul_2), _mm_load_ss(tw2e + n_mul_2), _MM_SHUFFLE(0, 0, 0, 0)); v_tw3.real = _mm_shuffle_ps(_mm_load_ss(tw3), _mm_load_ss(tw3e), _MM_SHUFFLE(0, 0, 0, 0)); v_tw3.imag = _mm_shuffle_ps(_mm_load_ss(tw3 + n_mul_2), _mm_load_ss(tw3e + n_mul_2), _MM_SHUFFLE(0, 0, 0, 0)); __m128 xmm0; __m128 xmm1; __m128 xmm2; __m128 xmm3; __m128 xmm4; __m128 xmm5; __m128 xmm6; __m128 xmm7; const OMX_F32 *in0 = in + (i << 1); xmm0 = _mm_load_ps(in0); xmm1 = _mm_load_ps(in0 + 4); xmm2 = _mm_load_ps(in0 + 8); xmm3 = _mm_load_ps(in0 + 12); v_t0.real = _mm_shuffle_ps(xmm0, xmm2, _MM_SHUFFLE(1, 0, 1, 0)); v_t1.real = _mm_shuffle_ps(xmm0, xmm2, _MM_SHUFFLE(3, 2, 3, 2)); v_t2.real = _mm_shuffle_ps(xmm1, xmm3, _MM_SHUFFLE(1, 0, 1, 0)); v_t3.real = _mm_shuffle_ps(xmm1, xmm3, _MM_SHUFFLE(3, 2, 3, 2)); xmm4 = _mm_load_ps(in0 + n); xmm5 = _mm_load_ps(in0 + n + 4); xmm6 = _mm_load_ps(in0 + n + 8); xmm7 = _mm_load_ps(in0 + n + 12); v_t0.imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0)); v_t1.imag 
= _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2)); v_t2.imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0)); v_t3.imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2)); OMX_F32 *out1 = out0 + n_by_4; OMX_F32 *out2 = out1 + n_by_4; OMX_F32 *out3 = out2 + n_by_4; RADIX4_INV_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7, &v_tw1, &v_tw2, &v_tw3, &v_t0, &v_t1, &v_t2, &v_t3); RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3, &v_t4, &v_t5, &v_t6, &v_t7, n); out0 += 4; } }
/*
 * SSE routine operating on a 3x3 matrix `aa` and 3-vectors `bb`, `cc`;
 * presumably cc = aa * bb (complex matrix times vector) going by the name.
 * The lane-level algebra was not re-derived for this comment.
 *
 * What the code visibly does:
 *   - loads each bb->c[k] as a pair of floats and duplicates the pair into
 *     both halves of a register;
 *   - accumulates in xmm3 the products for matrix rows 0 and 1 (real parts
 *     first, then imaginary parts against swapped / sign-flipped copies of
 *     the vector) and stores four floats at cc->c[0];
 *   - accumulates row 2 in xmm6, folds its upper half onto its lower half
 *     and stores two floats at cc->c[2].
 * _sse_sgn13 and _sse_sgn4 are sign masks defined elsewhere.
 *
 * The three 64-bit loads of bb are seeded with a zeroed register.  Only the
 * low half of those registers is used (the 0x44 shuffles), so results are
 * unchanged, but the intrinsic no longer reads uninitialized variables.
 * The statement order is otherwise exactly as before, because registers
 * are reused throughout.
 */
void intrin_sse_mult_su3_mat_vec(su3_matrixf *aa, su3_vectorf* bb, su3_vectorf* cc)
{
  /* XMM Variables */
  __m128 xmm2, xmm3, xmm0, xmm1, xmm6, xmm7, xmm4, xmm5;

  /* vector components, each pair duplicated into both register halves */
  xmm0 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)&((bb)->c[0]) );
  xmm1 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)&((bb)->c[1]) );
  xmm2 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)&((bb)->c[2]) );
  xmm0 = _mm_shuffle_ps( xmm0, xmm0, 0x44 );
  xmm1 = _mm_shuffle_ps( xmm1, xmm1, 0x44 );
  xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0x44 );

  /* rows 0 and 1, real parts of the matrix elements */
  xmm3 = _mm_load_ss((float *)&((aa)->e[0][0].real) );
  xmm7 = _mm_load_ss((float *)&((aa)->e[1][0].real) );
  xmm3 = _mm_shuffle_ps( xmm3, xmm7, 0x00 );
  xmm4 = _mm_load_ss((float *)&((aa)->e[0][1].real) );
  xmm7 = _mm_load_ss((float *)&((aa)->e[1][1].real) );
  xmm4 = _mm_shuffle_ps( xmm4, xmm7, 0x00 );
  xmm3 = _mm_mul_ps( xmm3, xmm0 );
  xmm4 = _mm_mul_ps( xmm4, xmm1 );
  xmm3 = _mm_add_ps( xmm3, xmm4 );
  xmm5 = _mm_load_ss((float *)&((aa)->e[0][2].real) );
  xmm7 = _mm_load_ss((float *)&((aa)->e[1][2].real) );
  xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 );
  xmm5 = _mm_mul_ps( xmm5, xmm2 );
  xmm3 = _mm_add_ps( xmm3, xmm5 );

  /* row 2, real parts of e[2][0] and e[2][1] */
  xmm1 = _mm_shuffle_ps( xmm1, xmm0, 0x44 );
  xmm7 = _mm_load_ss((float *)&((aa)->e[2][0].real) );
  xmm6 = _mm_load_ss((float *)&((aa)->e[2][1].real) );
  xmm6 = _mm_shuffle_ps( xmm6, xmm7, 0x00 );
  xmm6 = _mm_mul_ps( xmm6, xmm1 );

  /* swap the floats within each pair and flip signs with the mask */
  xmm0 = _mm_shuffle_ps( xmm0, xmm0, 0xB1 );
  xmm0 = _mm_xor_ps( xmm0, _sse_sgn13.xmm );
  xmm1 = _mm_shuffle_ps( xmm1, xmm1, 0x11 );
  xmm1 = _mm_xor_ps( xmm1, _sse_sgn13.xmm );
  xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0xB1 );
  xmm2 = _mm_xor_ps( xmm2, _sse_sgn13.xmm );

  /* rows 0 and 1, imaginary parts of the matrix elements */
  xmm4 = _mm_load_ss((float *)&((aa)->e[0][0].imag) );
  xmm7 = _mm_load_ss((float *)&((aa)->e[1][0].imag) );
  xmm4 = _mm_shuffle_ps( xmm4, xmm7, 0x00 );
  xmm4 = _mm_mul_ps( xmm4, xmm0 );
  xmm3 = _mm_add_ps( xmm3, xmm4 );
  xmm5 = _mm_load_ss((float *)&((aa)->e[0][1].imag) );
  xmm7 = _mm_load_ss((float *)&((aa)->e[1][1].imag) );
  xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 );
  xmm5 = _mm_mul_ps( xmm5, xmm1 );
  xmm3 = _mm_add_ps( xmm3, xmm5 );
  xmm5 = _mm_load_ss((float *)&((aa)->e[0][2].imag) );
  xmm7 = _mm_load_ss((float *)&((aa)->e[1][2].imag) );
  xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 );
  xmm5 = _mm_mul_ps( xmm5, xmm2 );
  xmm3 = _mm_add_ps( xmm3, xmm5 );
  _mm_storeu_ps((float *)&((cc)->c[0]), xmm3 );

  /* row 2, imaginary parts of e[2][0] and e[2][1] */
  xmm1 = _mm_shuffle_ps( xmm1, xmm0, 0x44 );
  xmm7 = _mm_load_ss((float *)&((aa)->e[2][0].imag) );
  xmm5 = _mm_load_ss((float *)&((aa)->e[2][1].imag) );
  xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 );
  xmm5 = _mm_mul_ps( xmm5, xmm1 );
  xmm6 = _mm_add_ps( xmm6, xmm5 );

  /* row 2, element e[2][2] (both parts loaded as one pair) */
  xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0xB4 );
  xmm2 = _mm_xor_ps( xmm2, _sse_sgn4.xmm );
  xmm7 = _mm_loadl_pi(xmm7, (__m64 *)&((aa)->e[2][2]) );
  xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x05 );
  xmm7 = _mm_mul_ps( xmm7, xmm2 );
  xmm6 = _mm_add_ps( xmm6, xmm7 );

  /* fold the upper half onto the lower half and store two floats */
  xmm7 = xmm6 ;
  xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0xEE );
  xmm6 = _mm_add_ps( xmm6, xmm7 );
  _mm_storel_pi((__m64 *)&((cc)->c[2]), xmm6 );
}
// Column pass of the analysis transform: filters every column of `sour`
// (h rows, w columns) vertically and writes two half-height bands to
// `dest`: the tH result to rows [0, h/2) and the tG result to rows
// [h/2, 2*(h/2)). tH and tG are presumably the low-pass and high-pass
// analysis filters - confirm against gettH()/gettG().
//
// Row indices outside [0, h) are mirrored: n < 0 becomes -n, and n >= h
// becomes 2*h - 2 - n.
//
// Columns are processed four at a time with MMX/SSE, followed by a scalar
// loop for the last w % 4 columns.
//
// NOTE(review): *(__m64 *)(&sour[n][4*x]) loads 8 bytes although
// _mm_cvtpi8_ps converts only the lower four, so for the last group of a
// row the load extends past column w - 1. Confirm the rows are padded.
void mBior53::transcols(char** dest, char** sour, unsigned int w, unsigned int h) const
{
        float fz = 0.0f;
        int n;
        float s, d;
        __m128 ms, md;
        unsigned int h2 = h / 2;

        const vec1D& tH = gettH();
        const vec1D& tG = gettG();

        for (unsigned int x = 0; x < w / 4; x++) {   //x<w/4   x = 4*x
                for (unsigned int k = 0; k < h2; k++) {

                        // both accumulators start as all zeros (scalar load of 0.0f)
                        ms = _mm_load_ss(&fz);
                        md = ms;

                        // tH taps m = -2..2 around row 2*k
                        for (int m = -2; m <= 2; m++) {
                                n = 2 * k + m;
                                if (n < 0) n = 0 - n;
                                if (n >= (int)h) n -= 2 * (1 + n - h);
                                ms = _mm_add_ps(ms, _mm_mul_ps(_mm_load_ps1(tH.data(m)), _mm_cvtpi8_ps(*(__m64 *)(&sour[n][4*x]))));
                        }
                        // tG taps m = 0..2 starting at row 2*k
                        for (int m = 0; m <= 2; m++) {
                                n = 2 * k + m;
                                if (n < 0) n = 0 - n;
                                if (n >= (int)h) n -= 2 * (1 + n - h);
                                md = _mm_add_ps(md, _mm_mul_ps(_mm_load_ps1(tG.data(m)), _mm_cvtpi8_ps(*(__m64 *)(&sour[n][4*x]))));
                        }

                        // columns left of w/2 use mmxround4; a group that straddles
                        // w/2 passes the number of columns before w/2 to mmxround4TH;
                        // columns at or beyond w/2 use mmxround4TH
                        if (4*x < w / 2) {
                                if ((w / 2) - (4*x) >= 4)
                                        mmxround4(&dest[k][4*x], ms);
                                else
                                        mmxround4TH(&dest[k][4*x], ms, (w / 2) - (4*x));    //skip first from LL part 10/2-4=1 [lo] o o o o * | * * * o o [hi]
                        } else
                                mmxround4TH(&dest[k][4*x], ms);

                        mmxround4TH(&dest[k+h2][4*x], md);
                }
        }
        // leave MMX state before the scalar floating-point code below
        _mm_empty();

        //odd remainder
        for (unsigned int x = w - (w % 4); x < w; x++) {
                for (unsigned int k = 0; k < h2; k++) {

                        s = 0;
                        d = 0;

                        for (int m = -2; m <= 2; m++) {
                                n = 2 * k + m;
                                if (n < 0) n = 0 - n;
                                if (n >= (int)h) n -= 2 * (1 + n - h);
                                s += tH[m] * float(sour[n][x]);
                        }
                        for (int m = 0; m <= 2; m++) {
                                n = 2 * k + m;
                                if (n < 0) n = 0 - n;
                                if (n >= (int)h) n -= 2 * (1 + n - h);
                                d += tG[m] * float(sour[n][x]);
                        }

                        if (x < w / 2)
                                dest[k][x] = mmxround(s);
                        else
                                dest[k][x] = mmxroundTH(s);     //is this needed? hi band were TH'ed on transrows
                        dest[k+h2][x] = mmxroundTH(d);          //is this needed? hi band were TH'ed on transrows on x>w/2
                }
        }
}
void BrushToolEdit::drawInner(const QPoint &pt, float strength) { float fixedStrength = params.strength; strength *= fixedStrength; auto color = params.color; std::array<int, 3> colorParts = Terrain::expandColor(color); __m128 colorMM = _mm_setr_ps(colorParts[0], colorParts[1], colorParts[2], 0); SseRoundingModeScope roundingModeScope(_MM_ROUND_NEAREST); (void) roundingModeScope; switch (tool->type()) { case BrushType::Blur: drawBlur(pt, std::min(strength / 5.f, 4.f)); break; case BrushType::Smoothen: drawSmoothen(pt, std::min(strength / 5.f, 4.f)); break; case BrushType::Raise: case BrushType::Lower: if (tool->type() == BrushType::Lower) { fixedStrength = -fixedStrength; strength = -strength; } switch (params.pressureMode) { case BrushPressureMode::AirBrush: strength *= 3.f; drawRaiseLower(pt, [=](float ¤t, float before, float tip) { (void) before; current -= tip * strength; }); break; case BrushPressureMode::Constant: if (tool->type() == BrushType::Lower) { drawRaiseLower(pt, [=](float ¤t, float before, float tip) { current = Terrain::quantizeOne(std::max(current, before - tip * fixedStrength)); }); } else { drawRaiseLower(pt, [=](float ¤t, float before, float tip) { current = Terrain::quantizeOne(std::min(current, before - tip * fixedStrength)); }); } break; case BrushPressureMode::Adjustable: drawRaiseLower(pt, [=](float ¤t, float before, float tip) { current = Terrain::quantizeOne(before - tip * strength); }); break; } break; case BrushType::Paint: switch (params.pressureMode) { case BrushPressureMode::AirBrush: strength = 1.f - std::exp2(-strength); drawColor(pt, [=](quint32 ¤t, quint32 before, float tip) { (void) before; // convert current color to FP32 auto currentMM = _mm_castps_si128(_mm_load_ss(reinterpret_cast<float *>(¤t))); currentMM = _mm_unpacklo_epi8(currentMM, _mm_setzero_si128()); currentMM = _mm_unpacklo_epi16(currentMM, _mm_setzero_si128()); auto currentMF = _mm_cvtepi32_ps(currentMM); auto factor = _mm_set1_ps(tip * strength); // blend auto 
diff = _mm_sub_ps(colorMM, currentMF); diff = _mm_mul_ps(diff, factor); currentMF = _mm_add_ps(currentMF, diff); // convert to RGB32 currentMF = _mm_add_ps(currentMF, globalDitherSampler.getM128()); currentMM = _mm_cvttps_epi32(currentMF); currentMM = _mm_packs_epi32(currentMM, currentMM); currentMM = _mm_packus_epi16(currentMM, currentMM); _mm_store_ss(reinterpret_cast<float *>(¤t), _mm_castsi128_ps(currentMM)); }); break; case BrushPressureMode::Constant: fixedStrength *= 0.01f; drawColor(pt, [=](quint32 ¤t, quint32 before, float tip) { // convert current color to FP32 auto currentMM = _mm_castps_si128(_mm_load_ss(reinterpret_cast<float *>(¤t))); currentMM = _mm_unpacklo_epi8(currentMM, _mm_setzero_si128()); currentMM = _mm_unpacklo_epi16(currentMM, _mm_setzero_si128()); auto currentMF = _mm_cvtepi32_ps(currentMM); // convert before color to FP32 auto beforeMM = _mm_setr_epi32(before, 0, 0, 0); beforeMM = _mm_unpacklo_epi8(beforeMM, _mm_setzero_si128()); beforeMM = _mm_unpacklo_epi16(beforeMM, _mm_setzero_si128()); auto beforeMF = _mm_cvtepi32_ps(beforeMM); // beforeMM = _mm_add_ps(beforeMM, globalDitherSampler.getM128()); // use "before" image to which way of color change is possible, and // compute possible range of result color auto diff = _mm_sub_ps(colorMM, beforeMF); auto factor = _mm_set1_ps(tip * fixedStrength); auto adddiff = _mm_mul_ps(diff, factor); beforeMF = _mm_add_ps(beforeMF, adddiff); auto diffDir = _mm_cmpgt_ps(diff, _mm_setzero_ps()); // compute output image auto out1 = _mm_max_ps(currentMF, beforeMF); auto out2 = _mm_min_ps(currentMF, beforeMF); currentMF = _mm_or_ps(_mm_and_ps(diffDir, out1), _mm_andnot_ps(diffDir, out2)); // convert to RGB32 currentMF = _mm_add_ps(currentMF, globalDitherSampler.getM128()); currentMM = _mm_cvttps_epi32(currentMF); currentMM = _mm_packs_epi32(currentMM, currentMM); currentMM = _mm_packus_epi16(currentMM, currentMM); _mm_store_ss(reinterpret_cast<float *>(¤t), _mm_castsi128_ps(currentMM)); }); break; case 
BrushPressureMode::Adjustable: strength *= 0.01f; drawColor(pt, [=](quint32 ¤t, quint32 before, float tip) { // convert before color to FP32 auto beforeMM = _mm_setr_epi32(before, 0, 0, 0); beforeMM = _mm_unpacklo_epi8(beforeMM, _mm_setzero_si128()); beforeMM = _mm_unpacklo_epi16(beforeMM, _mm_setzero_si128()); auto beforeMF = _mm_cvtepi32_ps(beforeMM); // blend auto diff = _mm_sub_ps(colorMM, beforeMF); auto factor = _mm_set1_ps(tip * strength); diff = _mm_mul_ps(diff, factor); beforeMF = _mm_add_ps(beforeMF, diff); // convert to RGB32 beforeMF = _mm_add_ps(beforeMF, globalDitherSampler.getM128()); beforeMM = _mm_cvttps_epi32(beforeMF); beforeMM = _mm_packs_epi32(beforeMM, beforeMM); beforeMM = _mm_packus_epi16(beforeMM, beforeMM); _mm_store_ss(reinterpret_cast<float *>(¤t), _mm_castsi128_ps(beforeMM)); }); break; } break; } }
// Column pass of the synthesis transform: rebuilds 2*h output rows from two
// bands stored in `sour`, rows [0, h) and rows [h, 2*h), over 2*w columns
// (w and h are the dimensions of the LO part).
//
// For every k, output row 2*k is the sum of the H2m taps applied to the
// first band and the G2m taps applied to the second band; output row 2*k+1
// uses H2m1 and G2m1 in the same way. Both are multiplied by 2 before
// rounding.
//
// Row indices outside [0, h) are mirrored: n < 0 becomes -n, and n >= h
// becomes 2*h - 2 - n.
//
// Columns are processed four at a time with MMX/SSE, followed by a scalar
// loop for the last (2*w) % 4 columns. The vector loops use fixed tap
// ranges, the scalar loops use each filter's first()..last();
// NOTE(review): confirm the two agree for these filters.
//
// NOTE(review): *(__m64 *)(&sour[..][4*x]) loads 8 bytes although
// _mm_cvtpi8_ps converts only the lower four, so for the last group of a
// row the load extends past the last column. Confirm the rows are padded.
void mBior53::synthcols(char** dest, char** sour, unsigned int w, unsigned int h) const     //w,h of the LO part
{
        float fz = 0.0f;
        float mul2 = 2.0f;
        int n;
        float s2k, s2k1;
        __m128 ms2k, ms2k1;
        unsigned int w2 = 2 * w;

        const vec1D& H2m = getH2m();
        const vec1D& G2m = getG2m();
        const vec1D& H2m1 = getH2m1();
        const vec1D& G2m1 = getG2m1();

        for (unsigned int x = 0; x < w2 / 4; x++) {   //x<w2/2   x = 4*x
                for (unsigned int k = 0; k < h; k++) {

                        // both accumulators start as all zeros (scalar load of 0.0f)
                        ms2k = _mm_load_ss(&fz);
                        ms2k1 = ms2k;

                        for (int m = 0; m <= 0; m++) {          //s2k even H
                                n = k - m;
                                if (n < 0) n = 0 - n;
                                if (n >= (int)h) n -= 2 * (1 + n - h);
                                ms2k = _mm_add_ps(ms2k, _mm_mul_ps(_mm_load_ps1(H2m.data(m)), _mm_cvtpi8_ps(*(__m64 *)(&sour[n][4*x]))));
                        }
                        for (int m = 0; m <= 1; m++) {          //s2k even G
                                n = k - m;
                                if (n < 0) n = 0 - n;
                                if (n >= (int)h) n -= 2 * (1 + n - h);
                                ms2k = _mm_add_ps(ms2k, _mm_mul_ps(_mm_load_ps1(G2m.data(m)), _mm_cvtpi8_ps(*(__m64 *)(&sour[n+h][4*x]))));
                        }
                        for (int m = -1; m <= 0; m++) {         //s2k1 odd H
                                n = k - m;
                                if (n < 0) n = 0 - n;
                                if (n >= (int)h) n -= 2 * (1 + n - h);
                                ms2k1 = _mm_add_ps(ms2k1, _mm_mul_ps(_mm_load_ps1(H2m1.data(m)), _mm_cvtpi8_ps(*(__m64 *)(&sour[n][4*x]))));
                        }
                        for (int m = -1; m <= 1; m++) {         //s2k1 odd G
                                n = k - m;
                                if (n < 0) n = 0 - n;
                                if (n >= (int)h) n -= 2 * (1 + n - h);
                                ms2k1 = _mm_add_ps(ms2k1, _mm_mul_ps(_mm_load_ps1(G2m1.data(m)), _mm_cvtpi8_ps(*(__m64 *)(&sour[n+h][4*x]))));
                        }

                        // scale by 2 and round into the even / odd output rows
                        __m128 mmul2 = _mm_load_ps1(&mul2);
                        mmxround4(&dest[2*k][4*x], _mm_mul_ps(ms2k, mmul2));
                        mmxround4(&dest[2*k+1][4*x], _mm_mul_ps(ms2k1, mmul2));
                }
        }
        // leave MMX state before the scalar floating-point code below
        _mm_empty();

        //odd remainder
        for (unsigned int x = w2 - (w2 % 4); x < w2; x++) {
                for (unsigned int k = 0; k < h; k++) {

                        s2k = 0;
                        s2k1 = 0;

                        for (int m = H2m.first(); m <= H2m.last(); m++) {       //s2k even H
                                n = k - m;
                                if (n < 0) n = 0 - n;
                                if (n >= (int)h) n -= 2 * (1 + n - h);
                                s2k += H2m[m] * float(sour[n][x]);
                        }
                        for (int m = G2m.first(); m <= G2m.last(); m++) {       //s2k even G
                                n = k - m;
                                if (n < 0) n = 0 - n;
                                if (n >= (int)h) n -= 2 * (1 + n - h);
                                s2k += G2m[m] * float(sour[n+h][x]);
                        }
                        for (int m = H2m1.first(); m <= H2m1.last(); m++) {     //s2k1 odd H
                                n = k - m;
                                if (n < 0) n = 0 - n;
                                if (n >= (int)h) n -= 2 * (1 + n - h);
                                s2k1 += H2m1[m] * float(sour[n][x]);
                        }
                        for (int m = G2m1.first(); m <= G2m1.last(); m++) {     //s2k1 odd G
                                n = k - m;
                                if (n < 0) n = 0 - n;
                                if (n >= (int)h) n -= 2 * (1 + n - h);
                                s2k1 += G2m1[m] * float(sour[n+h][x]);
                        }

                        dest[2*k][x] = mmxround(2.0f * s2k);
                        dest[2*k+1][x] = mmxround(2.0f * s2k1);
                }
        }
}
/* Returns a vector whose lowest lane is *e and whose upper three lanes
   are zero (the result of _mm_load_ss).

   The return type is spelled out: without declaration specifiers the
   function would be implicit-int, which is not valid C99 or later and
   cannot carry an __m128 result.  */
__m128
test (float *e)
{
  return _mm_load_ss (e);
}