/// Adds the rank-one term alpha * u * u^T to the stored matrix, in place.
///
/// As the stores below show, `data` is treated as six consecutive groups of
/// four floats, one group per 2x2 block of the upper block-triangle:
/// (rows 1-2 x cols 1-2), (1-2 x 3-4), (1-2 x 5-6), (3-4 x 3-4), (3-4 x 5-6),
/// (5-6 x 5-6); each block is laid out row-major. `data` is accessed with
/// aligned loads/stores and therefore has to be 16-byte aligned.
///
/// @param u      6-element vector of the update.
/// @param alpha  scale factor applied to u * u^T.
void OptimizedSelfAdjointMatrix6x6f::rankUpdate(const Eigen::Matrix<float, 6, 1>& u, const float& alpha)
{
  const __m128 s = _mm_set1_ps(alpha);

  const __m128 v1234 = _mm_loadu_ps(u.data());
  // Only two coefficients remain after the first four. A full 128-bit load
  // from u.data() + 4 would read 8 bytes past the end of the 6-float vector,
  // so fetch exactly 64 bits; the (zeroed) upper half is never used.
  const __m128 v56xx = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(u.data() + 4));

  // (a, b, a, b) column patterns for each coefficient pair.
  const __m128 v1212 = _mm_movelh_ps(v1234, v1234);
  const __m128 v3434 = _mm_movehl_ps(v1234, v1234);
  const __m128 v5656 = _mm_movelh_ps(v56xx, v56xx);

  // alpha * (a, a, b, b) row pattern for rows 1-2.
  const __m128 v1122 = _mm_mul_ps(s, _mm_unpacklo_ps(v1212, v1212));
  _mm_store_ps(data + 0, _mm_add_ps(_mm_load_ps(data + 0), _mm_mul_ps(v1122, v1212)));
  _mm_store_ps(data + 4, _mm_add_ps(_mm_load_ps(data + 4), _mm_mul_ps(v1122, v3434)));
  _mm_store_ps(data + 8, _mm_add_ps(_mm_load_ps(data + 8), _mm_mul_ps(v1122, v5656)));

  // Rows 3-4.
  const __m128 v3344 = _mm_mul_ps(s, _mm_unpacklo_ps(v3434, v3434));
  _mm_store_ps(data + 12, _mm_add_ps(_mm_load_ps(data + 12), _mm_mul_ps(v3344, v3434)));
  _mm_store_ps(data + 16, _mm_add_ps(_mm_load_ps(data + 16), _mm_mul_ps(v3344, v5656)));

  // Rows 5-6.
  const __m128 v5566 = _mm_mul_ps(s, _mm_unpacklo_ps(v5656, v5656));
  _mm_store_ps(data + 20, _mm_add_ps(_mm_load_ps(data + 20), _mm_mul_ps(v5566, v5656)));
}
/*
 * Computes ret = mat1 * mat2 for 4x4 float matrices.
 *
 * Strategy: transpose mat2 so that its columns become rows, then every
 * output element is one row-by-row product obtained from mul_asm()
 * (presumably a 4-float dot product -- it is defined elsewhere).
 *
 * NOTE: mat2 is transposed IN PLACE, i.e. the caller's mat2 is left holding
 * its transpose when this function returns.
 * NOTE: the rows of mat2 are accessed with aligned loads/stores and so must
 * be 16-byte aligned (the original author reported crashes with unaligned
 * matrices).
 */
void mulMatrix1(Matrix4x4 ret, Matrix4x4 mat1, Matrix4x4 mat2)
{
    int i, j;

    /* Pull the four rows of mat2 into registers. */
    const __m128 in0 = _mm_load_ps( mat2[0] );
    const __m128 in1 = _mm_load_ps( mat2[1] );
    const __m128 in2 = _mm_load_ps( mat2[2] );
    const __m128 in3 = _mm_load_ps( mat2[3] );

    /* Interleave pairs of rows: low halves and high halves. */
    const __m128 lo01 = _mm_unpacklo_ps( in0, in1 );
    const __m128 lo23 = _mm_unpacklo_ps( in2, in3 );
    const __m128 hi01 = _mm_unpackhi_ps( in0, in1 );
    const __m128 hi23 = _mm_unpackhi_ps( in2, in3 );

    /* Recombine the 64-bit halves into the transposed rows and write them back. */
    _mm_store_ps( mat2[0], _mm_movelh_ps( lo01, lo23 ) );
    _mm_store_ps( mat2[1], _mm_movehl_ps( lo23, lo01 ) );
    _mm_store_ps( mat2[2], _mm_movelh_ps( hi01, hi23 ) );
    _mm_store_ps( mat2[3], _mm_movehl_ps( hi23, hi01 ) );

    /* Sixteen row-by-row products, filled in row-major order. */
    for (i = 0; i < 4; i++)
    {
        for (j = 0; j < 4; j++)
        {
            ret[i][j] = mul_asm(mat1[i], mat2[j]);
        }
    }

    return;
}
/// Transform this box using the specified transform matrix. /// /// @param[in] rTransform Matrix by which to transform. void Helium::Simd::AaBox::TransformBy( const Matrix44& rTransform ) { // Expand each corner position. Register minVec = m_minimum.GetSimdVector(); Register maxVec = m_maximum.GetSimdVector(); Vector3Soa corners0; corners0.m_x = _mm_shuffle_ps( minVec, minVec, _MM_SHUFFLE( 0, 0, 0, 0 ) ); corners0.m_y = _mm_shuffle_ps( minVec, maxVec, _MM_SHUFFLE( 1, 1, 1, 1 ) ); corners0.m_z = _mm_unpackhi_ps( minVec, maxVec ); corners0.m_z = _mm_movelh_ps( corners0.m_z, corners0.m_z ); Vector3Soa corners1; corners1.m_x = _mm_shuffle_ps( maxVec, maxVec, _MM_SHUFFLE( 0, 0, 0, 0 ) ); corners1.m_y = corners0.m_y; corners1.m_z = corners0.m_z; // Transform all corners by the provided transformation matrix. Matrix44Soa transformSplat( rTransform ); transformSplat.TransformPoint( corners0, corners0 ); transformSplat.TransformPoint( corners1, corners1 ); // Compute the minimum. Register minX = Simd::MinF32( corners0.m_x, corners1.m_x ); Register minY = Simd::MinF32( corners0.m_y, corners1.m_y ); Register minXYLo = _mm_unpacklo_ps( minX, minY ); Register minXYHi = _mm_unpackhi_ps( minX, minY ); Register minXY = Simd::MinF32( minXYLo, minXYHi ); Register minZ = Simd::MinF32( corners0.m_z, corners1.m_z ); Register minZLo = _mm_unpacklo_ps( minZ, minZ ); Register minZHi = _mm_unpackhi_ps( minZ, minZ ); minZ = Simd::MinF32( minZLo, minZHi ); Register minLo = _mm_movelh_ps( minXY, minZ ); Register minHi = _mm_movehl_ps( minZ, minXY ); m_minimum.SetSimdVector( Simd::MinF32( minLo, minHi ) ); // Compute the maximum. 
Register maxX = Simd::MaxF32( corners0.m_x, corners1.m_x ); Register maxY = Simd::MaxF32( corners0.m_y, corners1.m_y ); Register maxXYLo = _mm_unpacklo_ps( maxX, maxY ); Register maxXYHi = _mm_unpackhi_ps( maxX, maxY ); Register maxXY = Simd::MaxF32( maxXYLo, maxXYHi ); Register maxZ = Simd::MaxF32( corners0.m_z, corners1.m_z ); Register maxZLo = _mm_unpacklo_ps( maxZ, maxZ ); Register maxZHi = _mm_unpackhi_ps( maxZ, maxZ ); maxZ = Simd::MaxF32( maxZLo, maxZHi ); Register maxLo = _mm_movelh_ps( maxXY, maxZ ); Register maxHi = _mm_movehl_ps( maxZ, maxXY ); m_maximum.SetSimdVector( Simd::MaxF32( maxLo, maxHi ) ); }
// Transposes the 4x4 float matrix held in v1[0..3] (one row per register)
// and writes the four resulting rows to vout[0..3].
static void NOINLINE transposeX4( const __m128 *v1, __m128 *vout )
{
    // Read every input row before anything is written.
    const __m128 row0 = v1[ 0 ];
    const __m128 row1 = v1[ 1 ];
    const __m128 row2 = v1[ 2 ];
    const __m128 row3 = v1[ 3 ];

    // First pass: interleave rows two apart.
    const __m128 lo02 = _mm_unpacklo_ps( row0, row2 );
    const __m128 lo13 = _mm_unpacklo_ps( row1, row3 );
    const __m128 hi02 = _mm_unpackhi_ps( row0, row2 );
    const __m128 hi13 = _mm_unpackhi_ps( row1, row3 );

    // Second pass: interleaving the intermediates yields the columns.
    vout[ 0 ] = _mm_unpacklo_ps( lo02, lo13 );
    vout[ 1 ] = _mm_unpackhi_ps( lo02, lo13 );
    vout[ 2 ] = _mm_unpacklo_ps( hi02, hi13 );
    vout[ 3 ] = _mm_unpackhi_ps( hi02, hi13 );
}
/*
 * Replaces every pixel that is over- or under-exposed with a marker colour.
 *
 * A pixel counts as over-exposed when ANY of its first three channels is
 * >= upper/100, and as under-exposed when ALL of its first three channels are
 * <= lower/100. The fourth lane of both thresholds is FLT_MAX so that the
 * fourth channel does not trigger "over" and does not block "under".
 * The marker colours come from dt_iop_overexposed_colors[colorscheme].
 *
 * Pixels are read/written as four packed floats with aligned accesses.
 */
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void * const ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t * const roi_out)
{
  dt_develop_t *dev = self->dev;
  const int ch = piece->colors;
  const int colorscheme = dev->overexposed.colorscheme;

  const float upper_threshold = dev->overexposed.upper / 100.0f;
  const float lower_threshold = dev->overexposed.lower / 100.0f;

  const __m128 upper = _mm_set_ps(FLT_MAX, upper_threshold, upper_threshold, upper_threshold);
  const __m128 lower = _mm_set_ps(FLT_MAX, lower_threshold, lower_threshold, lower_threshold);

  const __m128 upper_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][0]);
  const __m128 lower_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][1]);

#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(ovoid) schedule(static)
#endif
  for(int k=0; k<roi_out->height; k++)
  {
    const float *in = ((float *)ivoid) + (size_t)ch*k*roi_out->width;
    float *out = ((float *)ovoid) + (size_t)ch*k*roi_out->width;

    for (int j=0; j<roi_out->width; j++)
    {
      const __m128 pixel = _mm_load_ps(in + 4*j);

      /* per-lane "too bright", then OR-reduced so all lanes carry the verdict */
      __m128 over = _mm_cmpge_ps(pixel, upper);
      over = _mm_or_ps(_mm_unpacklo_ps(over, over), _mm_unpackhi_ps(over, over));
      over = _mm_or_ps(_mm_unpacklo_ps(over, over), _mm_unpackhi_ps(over, over));

      /* per-lane "too dark", then AND-reduced so all lanes carry the verdict */
      __m128 under = _mm_cmple_ps(pixel, lower);
      under = _mm_and_ps(_mm_unpacklo_ps(under, under), _mm_unpackhi_ps(under, under));
      under = _mm_and_ps(_mm_unpacklo_ps(under, under), _mm_unpackhi_ps(under, under));

      /* select: marker colour where flagged, original pixel otherwise;
         the under-exposure marker is applied last and therefore wins */
      const __m128 after_over = _mm_or_ps(_mm_andnot_ps(over, pixel), _mm_and_ps(over, upper_color));
      const __m128 result = _mm_or_ps(_mm_andnot_ps(under, after_over), _mm_and_ps(under, lower_color));

      _mm_stream_ps(out + 4*j, result);
    }
  }
  /* make the non-temporal stores globally visible before continuing */
  _mm_sfence();

  if(piece->pipe->mask_display)
    dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}
// Element-wise complex multiplication: aOutput[n] = aInput[n] * aScale[n].
//
// All three buffers hold interleaved (real, imag) float pairs and are accessed
// with aligned loads/stores. The loop consumes 16 floats (8 complex values)
// per iteration and runs over aSize * 2 floats in total.
void BufferComplexMultiply_SSE(const float* aInput, const float* aScale, float* aOutput, uint32_t aSize)
{
  ASSERT_ALIGNED16(aInput);
  ASSERT_ALIGNED16(aScale);
  ASSERT_ALIGNED16(aOutput);
  ASSERT_MULTIPLE16(aSize);

  const unsigned floatCount = aSize * 2;

  for (unsigned pos = 0; pos < floatCount; pos += 16) {
    // Load the input and split it into real / imaginary lanes.
    __m128 v0 = _mm_load_ps(&aInput[pos]);
    __m128 v1 = _mm_load_ps(&aInput[pos + 4]);
    __m128 v2 = _mm_load_ps(&aInput[pos + 8]);
    __m128 v3 = _mm_load_ps(&aInput[pos + 12]);

    const __m128 inReA = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(2, 0, 2, 0));
    const __m128 inImA = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(3, 1, 3, 1));
    const __m128 inReB = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(2, 0, 2, 0));
    const __m128 inImB = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(3, 1, 3, 1));

    // Same for the scale buffer.
    v0 = _mm_load_ps(&aScale[pos]);
    v1 = _mm_load_ps(&aScale[pos + 4]);
    v2 = _mm_load_ps(&aScale[pos + 8]);
    v3 = _mm_load_ps(&aScale[pos + 12]);

    const __m128 scReA = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(2, 0, 2, 0));
    const __m128 scImA = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(3, 1, 3, 1));
    const __m128 scReB = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(2, 0, 2, 0));
    const __m128 scImB = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(3, 1, 3, 1));

    // (a + bi)(c + di) = (ac - bd) + (ad + bc)i
    const __m128 prodReA = _mm_sub_ps(_mm_mul_ps(inReA, scReA), _mm_mul_ps(inImA, scImA));
    const __m128 prodImA = _mm_add_ps(_mm_mul_ps(inReA, scImA), _mm_mul_ps(inImA, scReA));
    const __m128 prodReB = _mm_sub_ps(_mm_mul_ps(inReB, scReB), _mm_mul_ps(inImB, scImB));
    const __m128 prodImB = _mm_add_ps(_mm_mul_ps(inReB, scImB), _mm_mul_ps(inImB, scReB));

    // Re-interleave to (real, imag) pairs and write out.
    _mm_store_ps(&aOutput[pos], _mm_unpacklo_ps(prodReA, prodImA));
    _mm_store_ps(&aOutput[pos + 4], _mm_unpackhi_ps(prodReA, prodImA));
    _mm_store_ps(&aOutput[pos + 8], _mm_unpacklo_ps(prodReB, prodImB));
    _mm_store_ps(&aOutput[pos + 12], _mm_unpackhi_ps(prodReB, prodImB));
  }
}
// Returns the transpose of a 4x4 matrix whose rows are a.x, a.y, a.z, a.w.
inline float16 Transpose(const float16 & a)
{
	// Pass 1: interleave rows x/z and rows y/w (low and high halves).
	const __m128 xzLo = _mm_unpacklo_ps(a.x.m128, a.z.m128);
	const __m128 ywLo = _mm_unpacklo_ps(a.y.m128, a.w.m128);
	const __m128 xzHi = _mm_unpackhi_ps(a.x.m128, a.z.m128);
	const __m128 ywHi = _mm_unpackhi_ps(a.y.m128, a.w.m128);

	// Pass 2: interleaving the intermediates produces the columns of `a`.
	float16 transposed;
	transposed.x.m128 = _mm_unpacklo_ps(xzLo, ywLo);
	transposed.y.m128 = _mm_unpackhi_ps(xzLo, ywLo);
	transposed.z.m128 = _mm_unpacklo_ps(xzHi, ywHi);
	transposed.w.m128 = _mm_unpackhi_ps(xzHi, ywHi);
	return transposed;
}
// Interleaves two float channels into 16-bit stereo samples.
//
// input[0] / input[1] are the left / right channels, `size` is the number of
// frames, and `output` receives 2 * size shorts (L, R, L, R, ...). Samples are
// scaled by 32767.
//
// The SSE2 path handles frames in groups of four and saturates to the short
// range (via _mm_packs_epi32). The scalar tail now saturates as well: a plain
// float -> short conversion of an out-of-range (or NaN) value is undefined
// behaviour. In-range values are converted exactly as before (truncation).
void SoundSSE::pack_16bit_stereo(float *input[2], int size, short *output)
{
#ifndef CL_DISABLE_SSE2
	int sse_size = (size/4)*4;

	__m128 constant1 = _mm_set1_ps(32767);
	for (int i = 0; i < sse_size; i+=4)
	{
		__m128 samples0 = _mm_loadu_ps(input[0]+i);
		__m128 samples1 = _mm_loadu_ps(input[1]+i);
		samples0 = _mm_mul_ps(samples0, constant1);
		samples1 = _mm_mul_ps(samples1, constant1);

		// Interleave left/right, convert to int32, then pack with saturation.
		__m128 tmp0, tmp1;
		tmp0 = _mm_unpacklo_ps(samples0, samples1);
		tmp1 = _mm_unpackhi_ps(samples0, samples1);
		__m128i isamples0 = _mm_cvtps_epi32(tmp0);
		__m128i isamples1 = _mm_cvtps_epi32(tmp1);
		__m128i isamples = _mm_packs_epi32(isamples0, isamples1);
		_mm_storeu_si128((__m128i*)(output+i*2), isamples);
	}
#else
	const int sse_size = 0;
#endif

	// Pack remaining
	for (int i = sse_size; i < size; i++)
	{
		for (int channel = 0; channel < 2; channel++)
		{
			const float scaled = input[channel][i]*32767;

			short packed;
			if (scaled >= 32767.0f)
				packed = 32767;
			else if (scaled > -32768.0f)
				packed = static_cast<short>(scaled);
			else
				packed = -32768;	// too small, or NaN (matches what the SSE path produces)

			output[i*2 + channel] = packed;
		}
	}
}
// Interleaves two float channels into one stereo buffer.
//
// input[0] / input[1] are the left / right channels, `size` is the number of
// frames, and `output` receives 2 * size floats (L, R, L, R, ...).
void SoundSSE::pack_float_stereo(float *input[2], int size, float *output)
{
#ifndef CL_DISABLE_SSE2
	int sse_size = (size/4)*4;

	for (int frame = 0; frame < sse_size; frame += 4)
	{
		const __m128 left = _mm_loadu_ps(input[0] + frame);
		const __m128 right = _mm_loadu_ps(input[1] + frame);

		// Four frames become eight interleaved floats.
		float *dest = output + frame*2;
		_mm_storeu_ps(dest, _mm_unpacklo_ps(left, right));
		_mm_storeu_ps(dest + 4, _mm_unpackhi_ps(left, right));
	}
#else
	const int sse_size = 0;
#endif

	// Whatever the vector loop did not cover is copied one frame at a time.
	for (int frame = sse_size; frame < size; frame++)
	{
		output[frame*2] = input[0][frame];
		output[frame*2 + 1] = input[1][frame];
	}
}
static void SSE2_FloatToStereoMix(const float *pIn1, const float *pIn2, int32 *pOut, uint32 nCount, const float _f2ic) //-------------------------------------------------------------------------------------------------------------------- { __m128 f2ic = _mm_load_ps1(&_f2ic); __m128i *out = reinterpret_cast<__m128i *>(pOut); // We may read beyond the wanted length... this works because we know that we will always work on our buffers of size MIXBUFFERSIZE nCount = (nCount + 3) / 4; do { __m128 fl = _mm_loadu_ps(pIn1); // Load four float values, LLLL __m128 fr = _mm_loadu_ps(pIn2); // Load four float values, RRRR pIn1 += 4; pIn2 += 4; fl = _mm_mul_ps(fl, f2ic); // Apply int->float factor fr = _mm_mul_ps(fr, f2ic); // Apply int->float factor __m128 f1 = _mm_unpacklo_ps(fl, fr); // LL__+RR__ => LRLR __m128 f2 = _mm_unpackhi_ps(fl, fr); // __LL+__RR => LRLR __m128i i1 =_mm_cvtps_epi32(f1); // Convert to four ints __m128i i2 =_mm_cvtps_epi32(f2); // Convert to four ints _mm_storeu_si128(out, i1); // Store four int values, LRLR _mm_storeu_si128(out + 1, i2); // Store four int values, LRLR out += 2; } while(--nCount); }
void lfModifier::ModifyCoord_Dist_PTLens_SSE (void *data, float *iocoord, int count) { // See "Note about PT-based distortion models" at the top of mod-coord.cpp. /* * If buffer is not aligned, fall back to plain code */ if((uintptr_t)(iocoord) & 0xf) { return ModifyCoord_Dist_PTLens(data, iocoord, count); } lfCoordDistCallbackData* cddata = (lfCoordDistCallbackData*) data; // Rd = Ru * (a_ * Ru^3 + b_ * Ru^2 + c_ * Ru + 1) __m128 a_ = _mm_set_ps1 (cddata->Terms [0]); __m128 b_ = _mm_set_ps1 (cddata->Terms [1]); __m128 c_ = _mm_set_ps1 (cddata->Terms [2]); __m128 cx = _mm_set_ps1 (cddata->centerX); __m128 cy = _mm_set_ps1 (cddata->centerY); __m128 cc = _mm_set_ps1 (cddata->coordinate_correction); __m128 one = _mm_set_ps1 (1.0f); // SSE Loop processes 4 pixels/loop int loop_count = count / 4; for (int i = 0; i < loop_count ; i++) { __m128 c0 = _mm_load_ps (&iocoord [8 * i]); __m128 c1 = _mm_load_ps (&iocoord [8 * i + 4]); __m128 x = _mm_shuffle_ps (c0, c1, _MM_SHUFFLE (2, 0, 2, 0)); __m128 y = _mm_shuffle_ps (c0, c1, _MM_SHUFFLE (3, 1, 3, 1)); x = _mm_sub_ps(_mm_mul_ps(x, cc), cx); y = _mm_sub_ps(_mm_mul_ps(y, cc), cy); __m128 ru2 = _mm_add_ps (_mm_mul_ps (x, x), _mm_mul_ps (y, y)); __m128 ru = _mm_rcp_ps (_mm_rsqrt_ps (ru2)); // Calculate poly3 = a_ * ru2 * ru + b_ * ru2 + c_ * ru + 1; __m128 t = _mm_mul_ps (ru2, b_); __m128 poly3 = _mm_mul_ps (_mm_mul_ps (a_, ru2), ru); t = _mm_add_ps (t, _mm_mul_ps (ru, c_)); poly3 = _mm_add_ps (t, _mm_add_ps (poly3, one)); x = _mm_add_ps(_mm_mul_ps (x, poly3), cx); y = _mm_add_ps(_mm_mul_ps (y, poly3), cy); x = _mm_div_ps (x, cc); y = _mm_div_ps (y, cc); c0 = _mm_unpacklo_ps(x, y); c1 = _mm_unpackhi_ps(x, y); _mm_store_ps (&iocoord [8 * i], c0); _mm_store_ps (&iocoord [8 * i + 4], c1); } loop_count *= 4; int remain = count - loop_count; if (remain) ModifyCoord_Dist_PTLens (data, &iocoord [loop_count * 2], remain); }
/// Compute the corners of this view frustum. /// /// A view frustum can have either four or eight corners depending on whether a far clip plane exists (eight /// corners) or whether an infinite far clip plane is used (four corners). /// /// Note that this assumes that the frustum is always properly defined, with each possible combination of /// neighboring clip planes intersecting at a valid point. /// /// @param[out] pCorners Array in which the frustum corners will be stored. This must point to a region of memory /// large enough for four points if this frustum has an infinite far clip plane, or eight /// points if this frustum has a normal far clip plane. /// /// @return Number of clip planes computed (either four or eight). size_t Helium::Simd::Frustum::ComputeCorners( Vector3* pCorners ) const { HELIUM_ASSERT( pCorners ); // Compute the corners in struct-of-arrays format. HELIUM_SIMD_ALIGN_PRE float32_t cornersX[ 8 ] HELIUM_SIMD_ALIGN_POST; HELIUM_SIMD_ALIGN_PRE float32_t cornersY[ 8 ] HELIUM_SIMD_ALIGN_POST; HELIUM_SIMD_ALIGN_PRE float32_t cornersZ[ 8 ] HELIUM_SIMD_ALIGN_POST; size_t cornerCount = ComputeCornersSoa( cornersX, cornersY, cornersZ ); HELIUM_ASSERT( cornerCount == 4 || cornerCount == 8 ); // Swizzle the results and store in the output array. 
Helium::Simd::Register cornerXVec = Helium::Simd::LoadAligned( cornersX ); Helium::Simd::Register cornerYVec = Helium::Simd::LoadAligned( cornersY ); Helium::Simd::Register cornerZVec = Helium::Simd::LoadAligned( cornersZ ); Helium::Simd::Register xy01 = _mm_unpacklo_ps( cornerXVec, cornerYVec ); Helium::Simd::Register xy23 = _mm_unpackhi_ps( cornerXVec, cornerYVec ); Helium::Simd::Register zz01 = _mm_unpacklo_ps( cornerZVec, cornerZVec ); Helium::Simd::Register zz23 = _mm_unpackhi_ps( cornerZVec, cornerZVec ); pCorners[ 0 ].SetSimdVector( _mm_movelh_ps( xy01, zz01 ) ); pCorners[ 1 ].SetSimdVector( _mm_movehl_ps( zz01, xy01 ) ); pCorners[ 2 ].SetSimdVector( _mm_movelh_ps( xy23, zz23 ) ); pCorners[ 3 ].SetSimdVector( _mm_movehl_ps( zz23, xy23 ) ); if( cornerCount == 8 ) { cornerXVec = Helium::Simd::LoadAligned( cornersX + 4 ); cornerYVec = Helium::Simd::LoadAligned( cornersY + 4 ); cornerZVec = Helium::Simd::LoadAligned( cornersZ + 4 ); xy01 = _mm_unpacklo_ps( cornerXVec, cornerYVec ); xy23 = _mm_unpackhi_ps( cornerXVec, cornerYVec ); zz01 = _mm_unpacklo_ps( cornerZVec, cornerZVec ); zz23 = _mm_unpackhi_ps( cornerZVec, cornerZVec ); pCorners[ 4 ].SetSimdVector( _mm_movelh_ps( xy01, zz01 ) ); pCorners[ 5 ].SetSimdVector( _mm_movehl_ps( zz01, xy01 ) ); pCorners[ 6 ].SetSimdVector( _mm_movelh_ps( xy23, zz23 ) ); pCorners[ 7 ].SetSimdVector( _mm_movehl_ps( zz23, xy23 ) ); } return cornerCount; }
// Transposes, in place, the 4x4 block of floats formed by the four registers
// (each register being one row).
void Shuffle16Elems(__m128 &io_data0, __m128 &io_data1, __m128 &io_data2, __m128 &io_data3)
{
    // Interleave rows 0/1 and rows 2/3, low and high halves.
    const __m128 lo01 = _mm_unpacklo_ps(io_data0, io_data1);
    const __m128 hi01 = _mm_unpackhi_ps(io_data0, io_data1);
    const __m128 lo23 = _mm_unpacklo_ps(io_data2, io_data3);
    const __m128 hi23 = _mm_unpackhi_ps(io_data2, io_data3);

    // Stitch the 64-bit halves together to form the columns.
    io_data0 = _mm_movelh_ps(lo01, lo23);
    io_data1 = _mm_movehl_ps(lo23, lo01);
    io_data2 = _mm_movelh_ps(hi01, hi23);
    io_data3 = _mm_movehl_ps(hi23, hi01);
}
// Returns the transpose of this matrix (rows x, y, z, w become columns).
matrix4 matrix4::transposed() const
{
#ifdef __SSE__
	// Interleave x/y and z/w, then stitch the 64-bit halves together.
	const __m128 xyLo = _mm_unpacklo_ps(x.v, y.v);
	const __m128 xyHi = _mm_unpackhi_ps(x.v, y.v);
	const __m128 zwLo = _mm_unpacklo_ps(z.v, w.v);
	const __m128 zwHi = _mm_unpackhi_ps(z.v, w.v);

	const __m128 col0 = _mm_movelh_ps(xyLo, zwLo);
	const __m128 col1 = _mm_movehl_ps(zwLo, xyLo);
	const __m128 col2 = _mm_movelh_ps(xyHi, zwHi);
	const __m128 col3 = _mm_movehl_ps(zwHi, xyHi);

	return matrix4(col0, col1, col2, col3);
#else
	// Scalar fallback: gather each column component by component.
	return matrix4(
		float4(x.x, y.x, z.x, w.x),
		float4(x.y, y.y, z.y, w.y),
		float4(x.z, y.z, z.z, w.z),
		float4(x.w, y.w, z.w, w.w));
#endif
}
// Horizontal add of four vectors: lane i of the result is the sum of the four
// lanes of row[i].
inline vector4f haddp(const vector4f* row)
{
#if SSE_INSTR_SET >= 3 // SSE3
    return _mm_hadd_ps(_mm_hadd_ps(row[0], row[1]),
                       _mm_hadd_ps(row[2], row[3]));
#else
    // Rows 0/1: (a0+a2, b0+b2, a1+a3, b1+b3)
    const __m128 pair01 = _mm_add_ps(_mm_unpacklo_ps(row[0], row[1]),
                                     _mm_unpackhi_ps(row[0], row[1]));
    // Rows 2/3: (c0+c2, d0+d2, c1+c3, d1+d3)
    const __m128 pair23 = _mm_add_ps(_mm_unpacklo_ps(row[2], row[3]),
                                     _mm_unpackhi_ps(row[2], row[3]));

    // Add the low halves to the high halves to finish the four sums.
    const __m128 upper = _mm_movehl_ps(pair23, pair01);
    const __m128 lower = _mm_movelh_ps(pair01, pair23);
    return _mm_add_ps(lower, upper);
#endif
}
inline __m128 CalcWeights(float x, float y) { __m128 ssx = _mm_set_ss(x); __m128 ssy = _mm_set_ss(y); __m128 psXY = _mm_unpacklo_ps(ssx, ssy); // 0 0 y x //__m128 psXYfloor = _mm_floor_ps(psXY); // use this line for if you have SSE4 __m128 psXYfloor = _mm_cvtepi32_ps(_mm_cvtps_epi32(psXY)); __m128 psXYfrac = _mm_sub_ps(psXY, psXYfloor); // = frac(psXY) __m128 psXYfrac1 = _mm_sub_ps(CONST_1111, psXYfrac); // ? ? (1-y) (1-x) __m128 w_x = _mm_unpacklo_ps(psXYfrac1, psXYfrac); // ? ? x (1-x) w_x = _mm_movelh_ps(w_x, w_x); // x (1-x) x (1-x) __m128 w_y = _mm_shuffle_ps(psXYfrac1, psXYfrac, _MM_SHUFFLE(1, 1, 1, 1)); // y y (1-y) (1-y) // complete weight vector return _mm_mul_ps(w_x, w_y); }
// Vectorised evaluation intended to compute, for every element,
//     re = powf(powf(x, a) + powf(y, a) + powf(z, a), 1.0/a)
// three elements at a time (num_elts must be a multiple of 3).
//
// Each element is loaded with one aligned 4-float load starting at its `x`
// member, so elements have to be 16-byte aligned with x, y, z contiguous.
//
// Fix: the sum used an undeclared identifier `sum1` (now `s1`).
//
// NOTE(review): the code forms exp(v * log(a)), which equals a^v, whereas
// pow(v, a) = exp(a * log(v)). Likewise the last step forms
// exp(sum * log(1/a)). Correcting this needs a vector log(), which is not
// visibly available here, so the arithmetic is left as written -- please
// confirm the intended formula.
void fast(element_t * const elements, const int num_elts, const float a)
{
    element_t * elts = elements;
    float logf_a = logf(a);
    float logf_1_a = logf(1.0/a);
    v4sf log_a = _mm_load1_ps(&logf_a);
    v4sf log_1_a = _mm_load1_ps(&logf_1_a);

    assert(num_elts % 3 == 0); // operates on 3 elements at a time

    for (int i = 0; i < num_elts; i += 3)
    {
        // transpose
        // we save one operation over _MM_TRANSPOSE4_PS by skipping the last row of output
        v4sf r0 = _mm_load_ps(&elts[0].x); // x1,y1,z1,(4th lane unused)
        v4sf r1 = _mm_load_ps(&elts[1].x); // x2,y2,z2,(4th lane unused)
        v4sf r2 = _mm_load_ps(&elts[2].x); // x3,y3,z3,(4th lane unused)
        v4sf r3 = _mm_setzero_ps();        // 0, 0, 0, 0
        v4sf t0 = _mm_unpacklo_ps(r0, r1); // x1,x2,y1,y2
        v4sf t1 = _mm_unpacklo_ps(r2, r3); // x3,0, y3,0
        v4sf t2 = _mm_unpackhi_ps(r0, r1); // z1,z2,..
        v4sf t3 = _mm_unpackhi_ps(r2, r3); // z3,0, ..
        r0 = _mm_movelh_ps(t0, t1);        // x1,x2,x3,0
        r1 = _mm_movehl_ps(t1, t0);        // y1,y2,y3,0
        r2 = _mm_movelh_ps(t2, t3);        // z1,z2,z3,0

        // exponent arguments (see NOTE(review) above)
        v4sf r0a = _mm_mul_ps(r0, log_a);  // x1*log(a), x2*log(a), x3*log(a), 0
        v4sf r1a = _mm_mul_ps(r1, log_a);  // y1*log(a), y2*log(a), y3*log(a), 0
        v4sf r2a = _mm_mul_ps(r2, log_a);  // z1*log(a), z2*log(a), z3*log(a), 0
        v4sf ex0 = exp_ps(r0a);
        v4sf ex1 = exp_ps(r1a);
        v4sf ex2 = exp_ps(r2a);

        // sum
        v4sf s1 = _mm_add_ps(ex0, ex1);
        v4sf s2 = _mm_add_ps(s1, ex2);

        // final exponentiation (see NOTE(review) above)
        v4sf ps = _mm_mul_ps(s2, log_1_a);
        v4sf es = exp_ps(ps);

        // Only the first three lanes carry results.
        ALIGN16_BEG float re[4] ALIGN16_END;
        _mm_store_ps(re, es);
        elts[0].re = re[0];
        elts[1].re = re[1];
        elts[2].re = re[2];

        elts += 3;
    }
}
/* Exercises the unpacklo_ps intrinsics in three forms each -- plain, merge-masked (_mask_, mask value 2) and zero-masked (_maskz_, mask value 2) -- first on the 256-bit operands x/y/z, then on the 128-bit operands xx/yy/zz. The operands are declared elsewhere in the file. Presumably a compiler test fixture, so the exact statements are intentionally left untouched. */
void extern avx512vl_test (void) { x = _mm256_unpacklo_ps (y, z); x = _mm256_mask_unpacklo_ps (x, 2, y, z); x = _mm256_maskz_unpacklo_ps (2, y, z); xx = _mm_unpacklo_ps (yy, zz); xx = _mm_mask_unpacklo_ps (xx, 2, yy, zz); xx = _mm_maskz_unpacklo_ps (2, yy, zz); }
// One 2x2 butterfly pass over two packed 2x2 blocks.
// On entry a/b hold the two 4-float rows; on exit they hold the transformed
// rows. The transform is its own inverse (applying it twice is the identity).
static inline void dct2x2_2pack_butterfly_sse(__m128& a, __m128& b)
{
	const __m128 half = _mm_set1_ps(0.5f);

	// vertical sums / differences
	const __m128 vsum = _mm_add_ps(a, b);
	const __m128 vdif = _mm_sub_ps(a, b);

	// regroup so that horizontally neighbouring values face each other
	const __m128 t1 = _mm_unpacklo_ps(vsum, vdif);
	const __m128 t2 = _mm_unpackhi_ps(vsum, vdif);
	const __m128 even = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(1, 0, 1, 0));
	const __m128 odd = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3, 2, 3, 2));

	// horizontal sums / differences, scaled by 1/2
	a = _mm_mul_ps(half, _mm_add_ps(even, odd));
	b = _mm_mul_ps(half, _mm_sub_ps(even, odd));
}

// Forward 2x2 transform of two packed blocks, hard threshold, inverse transform.
//
// src:    8 floats, 16-byte aligned: src[0..3] is the upper row and src[4..7]
//         the lower row of two side-by-side 2x2 blocks.
// dest:   8 floats, 16-byte aligned, same layout.
// thresh: coefficients with |c| <= thresh are set to zero. When
//         _KEEP_00_COEF_ is defined, lane 0 of the first coefficient row is
//         always kept.
//
// Uses only SSE/SSE2 operations: the mask is all-ones or all-zeros per lane,
// so AND selects exactly what the former SSE4.1 blend selected, and the
// absolute-value mask is built with intrinsics instead of an MSVC-specific
// aligned array.
void fDCT2x2_2pack_32f_and_thresh_and_iDCT2x2_2pack(float* src, float* dest, float thresh)
{
	const __m128 absmask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
	const __m128 mth = _mm_set1_ps(thresh);

	__m128 a = _mm_load_ps(src);
	__m128 b = _mm_load_ps(src + 4);

	// forward transform
	dct2x2_2pack_butterfly_sse(a, b);

	// hard threshold: keep a coefficient only where |coef| > thresh
	__m128 msk = _mm_cmpgt_ps(_mm_and_ps(a, absmask), mth);
	__m128 ms0 = _mm_and_ps(a, msk);
#ifdef _KEEP_00_COEF_
	// lane 0 is taken from the unthresholded coefficients
	ms0 = _mm_move_ss(ms0, a);
#endif
	msk = _mm_cmpgt_ps(_mm_and_ps(b, absmask), mth);
	__m128 ms1 = _mm_and_ps(b, msk);

	// inverse transform (same butterfly)
	dct2x2_2pack_butterfly_sse(ms0, ms1);

	_mm_store_ps(dest, ms0);
	_mm_store_ps(dest + 4, ms1);
}
// Evaluates the step function at t and broadcasts the result to all lanes.
//
// With c = (c0 c1 c2 c3) and T = (t0 t1 t1 inf), the cubic
// f = c0 + c1*t + c2*t^2 + c3*t^3 is evaluated by Estrin's method and the
// result is
//   (0 0 0 0)  if t <  t0,
//   (f f f f)  if t0 <= t < t1,
//   (1 1 1 1)  if t1 <= t < inf.
v4f step_t::operator () (float t) const
{
  const v4f ones = { 1.0f, 1.0f, 1.0f, 1.0f };
  const v4f t4 = _mm_set1_ps (t);                  // t t t t

  // Interval test: lanes 0/2 select "inside [t0,t1)", lanes 1/3 "at or past t1".
  const v4f bounds = load4f (T);                   // t0 t1 t1 inf
  const v4f lower = _mm_movelh_ps (bounds, bounds); // t0 t1 t0 t1
  const v4f upper = _mm_movehl_ps (bounds, bounds); // t1 inf t1 inf
  const v4f inside = _mm_and_ps (_mm_cmpge_ps (t4, lower), _mm_cmplt_ps (t4, upper));

  // Estrin: (c0 + c1*t) + (c2 + c3*t) * t^2
  const v4f coeffs = load4f (c);
  const v4f one_t = _mm_unpacklo_ps (ones, t4);    // 1 t 1 t
  const v4f terms = coeffs * one_t;                // c0 c1*t c2 c3*t
  const v4f halves = _mm_hadd_ps (terms, terms) * one_t * one_t;
  const v4f poly = _mm_hadd_ps (halves, halves);   // f f f f

  // Keep f or 1 depending on the interval, then fold the pair into every lane.
  const v4f candidates = _mm_unpacklo_ps (poly, ones); // f 1 f 1
  const v4f chosen = _mm_and_ps (inside, candidates);  // f? 1? f? 1?
  return _mm_hadd_ps (chosen, chosen);
}
// FIR filter for an arbitrary number of interleaved channels.
//
// For every channel c:
//     output[c] = sum_k input[k * channels + c] * (coef1[k] + (coef2[k] - coef1[k]) * frac)
// i.e. the taps are linearly interpolated between the two coefficient sets.
//
// channels:      number of interleaved channels in `input`.
// filter_length: number of taps; processed in groups of four
//                (filter_length / 4 groups).
// coef1, coef2:  coefficient sets, 16-byte aligned (aligned loads are used).
// frac:          interpolation factor between coef1 and coef2.
// input:         interleaved samples, at least filter_length frames.
// output:        receives one value per channel.
//
// Fix: the coefficient pointers are now rewound for every channel. They used
// to keep advancing across channels, so every channel after the first was
// filtered with memory beyond the end of the coefficient arrays.
void FastResampler_FirFilter2_Cn_SSE2(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output) {
	const __m128 v_frac = _mm_set1_ps(frac);
	for(unsigned int c = 0; c < channels; ++c) {
		const float *c1 = coef1, *c2 = coef2;
		const float *in = input + c;
		__m128 sum = _mm_setzero_ps();
		for(unsigned int i = 0; i < filter_length / 4; ++i) {
			// interpolated taps
			const __m128 v_coef1 = _mm_load_ps(c1), v_coef2 = _mm_load_ps(c2);
			c1 += 4; c2 += 4;
			const __m128 filter_value = _mm_add_ps(v_coef1, _mm_mul_ps(_mm_sub_ps(v_coef2, v_coef1), v_frac));
			// gather four consecutive frames of this channel
			const __m128 v_input1 = _mm_load_ss(in); in += channels;
			const __m128 v_input2 = _mm_load_ss(in); in += channels;
			const __m128 v_input3 = _mm_load_ss(in); in += channels;
			const __m128 v_input4 = _mm_load_ss(in); in += channels;
			const __m128 v_input = _mm_movelh_ps(_mm_unpacklo_ps(v_input1, v_input2), _mm_unpacklo_ps(v_input3, v_input4));
			sum = _mm_add_ps(sum, _mm_mul_ps(v_input, filter_value));
		}
		// horizontal sum of the four partial sums
		const __m128 sum2 = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, 0x0e));
		const __m128 sum3 = _mm_add_ss(sum2, _mm_shuffle_ps(sum2, sum2, 0x01));
		_mm_store_ss(output + c, sum3);
	}
}
// Returns the axis-aligned box enclosing this box after it has been
// transformed by a 3x4 matrix: the centre is transformed as a point and the
// half-extents are multiplied by the absolute values of the rotation/scale
// part.
BoundingBox BoundingBox::Transformed(const Matrix3x4& transform) const
{
#ifdef URHO3D_SSE
    const __m128 unitW = _mm_set_ss(1.f);
    const __m128 zeros = _mm_setzero_ps();
    const __m128 signless = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));

    // (x, y, z, 1) for both extremes; the 1 in w lets the translation column take part in the dot products.
    const __m128 lowPoint = _mm_movelh_ps(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)&min_.x_), _mm_unpacklo_ps(_mm_set_ss(min_.z_), unitW));
    const __m128 highPoint = _mm_movelh_ps(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)&max_.x_), _mm_unpacklo_ps(_mm_set_ss(max_.z_), unitW));

    const __m128 middle = _mm_mul_ps(_mm_add_ps(lowPoint, highPoint), _mm_set1_ps(0.5f));
    // w becomes 0 here, so the translation does not affect the extents.
    const __m128 extent = _mm_sub_ps(middle, lowPoint);

    const __m128 row0 = _mm_loadu_ps(&transform.m00_);
    const __m128 row1 = _mm_loadu_ps(&transform.m10_);
    const __m128 row2 = _mm_loadu_ps(&transform.m20_);

    // New centre: three dot products (row . middle), summed horizontally.
    const __m128 c0 = _mm_mul_ps(row0, middle);
    const __m128 c1 = _mm_mul_ps(row1, middle);
    const __m128 c2 = _mm_mul_ps(row2, middle);
    const __m128 c01 = _mm_add_ps(_mm_unpacklo_ps(c0, c1), _mm_unpackhi_ps(c0, c1));
    const __m128 c2z = _mm_add_ps(_mm_unpacklo_ps(c2, zeros), _mm_unpackhi_ps(c2, zeros));
    const __m128 newCenter = _mm_add_ps(_mm_movelh_ps(c01, c2z), _mm_movehl_ps(c2z, c01));

    // New half-extents: same sums over |row * extent|.
    const __m128 e0 = _mm_and_ps(signless, _mm_mul_ps(row0, extent));
    const __m128 e1 = _mm_and_ps(signless, _mm_mul_ps(row1, extent));
    const __m128 e2 = _mm_and_ps(signless, _mm_mul_ps(row2, extent));
    const __m128 e01 = _mm_add_ps(_mm_unpacklo_ps(e0, e1), _mm_unpackhi_ps(e0, e1));
    const __m128 e2z = _mm_add_ps(_mm_unpacklo_ps(e2, zeros), _mm_unpackhi_ps(e2, zeros));
    const __m128 newDir = _mm_add_ps(_mm_movelh_ps(e01, e2z), _mm_movehl_ps(e2z, e01));

    return BoundingBox(_mm_sub_ps(newCenter, newDir), _mm_add_ps(newCenter, newDir));
#else
    Vector3 newCenter = transform * Center();
    Vector3 oldEdge = Size() * 0.5f;

    // Each new half-extent is the old one pushed through |rotation/scale|.
    Vector3 newEdge = Vector3(
        Abs(transform.m00_) * oldEdge.x_ + Abs(transform.m01_) * oldEdge.y_ + Abs(transform.m02_) * oldEdge.z_,
        Abs(transform.m10_) * oldEdge.x_ + Abs(transform.m11_) * oldEdge.y_ + Abs(transform.m12_) * oldEdge.z_,
        Abs(transform.m20_) * oldEdge.x_ + Abs(transform.m21_) * oldEdge.y_ + Abs(transform.m22_) * oldEdge.z_
    );

    return BoundingBox(newCenter - newEdge, newCenter + newEdge);
#endif
}
/** transform vector by rigid transform */ inline Matrix<float, 4, 1> operator * (const RigidTransform<float>& mat, const Matrix<float, 4, 1>& vec) { #ifdef SIMPLE_GL_USE_SSE4 __m128 res; __m128 dotProd; res = _mm_dp_ps(mat[0].m128, vec.m128, 0xEE);\ dotProd = _mm_dp_ps(mat[1].m128, vec.m128, 0xEE);\ res = _mm_blend_ps( res, dotProd, _MM_SHUFFLE(0, 1, 1, 1) );\ dotProd = _mm_dp_ps(mat[2].m128, vec.m128, 0xEE);\ res = _mm_blend_ps( res, dotProd, _MM_SHUFFLE(0, 0, 1, 1) );\ dotProd = _mm_dp_ps(mat[3].m128, vec.m128, 0xEE);\ res = _mm_blend_ps( res, dotProd, _MM_SHUFFLE(0, 0, 0, 1) ); return Matrix<float, 4, 1>(res); #elif defined(SIMPLE_GL_USE_SSE3) __m128 res; __m128 dotProd0 = _mm_mul_ps(mat[0].m128, vec.m128); dotProd0 = _mm_hadd_ps(dotProd0, dotProd0); dotProd0 = _mm_hadd_ps(dotProd0, dotProd0); __m128 dotProd1 = _mm_mul_ps(mat[1].m128, vec.m128); dotProd1 = _mm_hadd_ps(dotProd1, dotProd1); dotProd1 = _mm_hadd_ps(dotProd1, dotProd1); __m128 dotProd2 = _mm_mul_ps(mat[2].m128, vec.m128); dotProd2 = _mm_hadd_ps(dotProd2, dotProd2); dotProd2 = _mm_hadd_ps(dotProd2, dotProd2); __m128 dotProd3 = _mm_mul_ps(mat[3].m128, vec.m128); dotProd3 = _mm_hadd_ps(dotProd3, dotProd3); dotProd3 = _mm_hadd_ps(dotProd3, dotProd3); __m128 vec01 = _mm_unpacklo_ps(dotProd0, dotProd1); __m128 vec23 = _mm_unpackhi_ps(dotProd2, dotProd3); res = _mm_movelh_ps(vec01, vec23); return Matrix<float, 4, 1>(res); #else // SSE2 // TODO: Think about good sse optimization Matrix<float, 4, 1> res; res[0] = mat[0][0] * res[0] + mat[0][1] * res[1] + mat[0][2] * res[2] + mat[0][3] * res[3]; res[1] = mat[1][0] * res[0] + mat[1][1] * res[1] + mat[1][2] * res[2] + mat[1][3] * res[3]; res[2] = mat[2][0] * res[0] + mat[2][1] * res[1] + mat[2][2] * res[2] + mat[2][3] * res[3]; res[3] = mat[3][0] * res[0] + mat[3][1] * res[1] + mat[3][2] * res[2] + mat[3][3] * res[3]; return res; #endif }
// FIR filter specialised for two interleaved channels.
//
// For c in {0, 1}:
//     output[c] = sum_k input[2 * k + c] * (coef1[k] + (coef2[k] - coef1[k]) * frac)
//
// `channels` is ignored. coef1/coef2 are read with aligned loads, `input`
// with unaligned loads; taps are processed in groups of four
// (filter_length / 4 groups). Exactly two floats are written to `output`.
void FastResampler_FirFilter2_C2_SSE2(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output) {
	Q_UNUSED(channels);
	const __m128 blend = _mm_set1_ps(frac);
	const unsigned int groups = filter_length / 4;
	__m128 acc = _mm_setzero_ps();
	for(unsigned int g = 0; g < groups; ++g) {
		// four interpolated taps
		const __m128 tapsA = _mm_load_ps(coef1);
		const __m128 tapsB = _mm_load_ps(coef2);
		coef1 += 4;
		coef2 += 4;
		const __m128 taps = _mm_add_ps(tapsA, _mm_mul_ps(_mm_sub_ps(tapsB, tapsA), blend));
		// four stereo frames = eight floats (L R L R | L R L R)
		const __m128 framesLo = _mm_loadu_ps(input);
		const __m128 framesHi = _mm_loadu_ps(input + 4);
		input += 8;
		// every tap is duplicated so it multiplies both channels of its frame
		acc = _mm_add_ps(acc, _mm_mul_ps(framesLo, _mm_unpacklo_ps(taps, taps)));
		acc = _mm_add_ps(acc, _mm_mul_ps(framesHi, _mm_unpackhi_ps(taps, taps)));
	}
	// fold the upper L/R pair onto the lower one and store those two floats
	const __m128 folded = _mm_add_ps(acc, _mm_shuffle_ps(acc, acc, 0xee));
	_mm_store_sd((double*) output, _mm_castps_pd(folded));
}
/*
 * Radix-2 butterfly pass over complex data, four complex values per inner
 * iteration.
 *
 * For every k < l1 and i < ido:
 *     ch[k*ido + i]          = cc[2*k*ido + i] + cc[2*k*ido + ido + i]
 *     ch[k*ido + i + l1*ido] = (cc[2*k*ido + i] - cc[2*k*ido + ido + i]) * wa[i]
 * where the last product is a complex multiplication.
 *
 * All accesses are aligned 128-bit loads/stores covering two complex values
 * each; the inner loop advances i in steps of four.
 */
static void passf2pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch, const complex_t *wa)
{
    uint16_t i, k, ah, ac;

    for (k = 0; k < l1; k++)
    {
        ah = k*ido;
        ac = 2*k*ido;

        for (i = 0; i < ido; i+=4)
        {
            /* "_a" handles complex values i, i+1; "_b" handles i+2, i+3 */
            __m128 x_a, y_a, x_b, y_b;
            __m128 tw_a, tw_b, twswap_a, twswap_b;
            __m128 sum_a, sum_b, dif_a, dif_b;
            __m128 straight_a, straight_b, crossed_a, crossed_b;
            __m128 first_a, first_b, second_a, second_b;
            __m128 imag_a, imag_b, real_a, real_b;
            __m128 imaglow_a, imaglow_b, out_a, out_b;

            x_a = _mm_load_ps(&RE(cc[ac+i]));
            y_a = _mm_load_ps(&RE(cc[ac+ido+i]));
            x_b = _mm_load_ps(&RE(cc[ac+i+2]));
            y_b = _mm_load_ps(&RE(cc[ac+ido+i+2]));
            tw_a = _mm_load_ps(&RE(wa[i]));
            tw_b = _mm_load_ps(&RE(wa[i+2]));

            sum_a = _mm_add_ps(x_a, y_a);
            sum_b = _mm_add_ps(x_b, y_b);
            dif_a = _mm_sub_ps(x_a, y_a);
            dif_b = _mm_sub_ps(x_b, y_b);

            /* first output half: plain sums */
            _mm_store_ps(&RE(ch[ah+i]), sum_a);
            _mm_store_ps(&RE(ch[ah+i+2]), sum_b);

            /* twiddles with real and imaginary parts exchanged */
            twswap_a = _mm_shuffle_ps(tw_a, tw_a, _MM_SHUFFLE(2, 3, 0, 1));
            twswap_b = _mm_shuffle_ps(tw_b, tw_b, _MM_SHUFFLE(2, 3, 0, 1));

            /* the four partial products of each complex multiplication */
            straight_a = _mm_mul_ps(dif_a, tw_a);
            straight_b = _mm_mul_ps(dif_b, tw_b);
            crossed_a = _mm_mul_ps(dif_a, twswap_a);
            crossed_b = _mm_mul_ps(dif_b, twswap_b);

            /* gather the products involving the real / imaginary input parts */
            first_a = _mm_shuffle_ps(straight_a, crossed_a, _MM_SHUFFLE(2, 0, 2, 0));
            first_b = _mm_shuffle_ps(straight_b, crossed_b, _MM_SHUFFLE(2, 0, 2, 0));
            second_a = _mm_shuffle_ps(straight_a, crossed_a, _MM_SHUFFLE(3, 1, 3, 1));
            second_b = _mm_shuffle_ps(straight_b, crossed_b, _MM_SHUFFLE(3, 1, 3, 1));

            /* upper lanes of the sum: imaginary parts; lower lanes of the difference: real parts */
            imag_a = _mm_add_ps(first_a, second_a);
            imag_b = _mm_add_ps(first_b, second_b);
            real_a = _mm_sub_ps(first_a, second_a);
            real_b = _mm_sub_ps(first_b, second_b);

            /* bring the imaginary parts down to the low lanes and re-interleave */
            imaglow_a = _mm_shuffle_ps(imag_a, imag_a, _MM_SHUFFLE(0, 0, 3, 2));
            imaglow_b = _mm_shuffle_ps(imag_b, imag_b, _MM_SHUFFLE(0, 0, 3, 2));
            out_a = _mm_unpacklo_ps(real_a, imaglow_a);
            out_b = _mm_unpacklo_ps(real_b, imaglow_b);

            /* second output half: twiddled differences */
            _mm_store_ps(&RE(ch[ah+i+l1*ido]), out_a);
            _mm_store_ps(&RE(ch[ah+i+2+l1*ido]), out_b);
        }
    }
}
// In-place post-processing step on the 128-float buffer `a`, using the
// coefficient table starting at rdft_w + 32.
//
// For each index pair (j2, k2 = 128 - j2), with j2 = 2, 4, ..., 62, and the
// matching table entries wkr = 0.5 - c[32 - j1], wki = c[j1] (j1 = j2 / 2):
//     xr = a[j2] - a[k2];          xi = a[j2 + 1] + a[k2 + 1];
//     yr = wkr * xr + wki * xi;    yi = wkr * xi - wki * xr;
//     a[j2]     = a[j2] - yr;      a[j2 + 1] = yi - a[j2 + 1];
//     a[k2]     = yr + a[k2];      a[k2 + 1] = yi - a[k2 + 1];
// In addition a[1] and a[65] are negated.
//
// The first loop handles four index pairs per iteration with SSE; the second
// loop finishes the remaining pairs with the scalar formulas above.
static void rftbsub_128_SSE2(float* a) {
  const float* c = rdft_w + 32;
  int j1, j2, k1, k2;
  float wkr, wki, xr, xi, yr, yi;

  static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f};
  const __m128 mm_half = _mm_load_ps(k_half);

  a[1] = -a[1];
  // Vectorized code (four at once).
  //    Note: the commented numbers are the indexes for the first iteration of
  //    the loop.
  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
    // Load 'wk'.
    const __m128 c_j1 = _mm_loadu_ps(&c[j1]);       //  1,  2,  3,  4,
    const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);  // 28, 29, 30, 31,
    const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);  // 28, 29, 30, 31,
    // Reverse the lanes so that wkr lines up with the ascending j1 order.
    const __m128 wkr_ = _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3));  // 31, 30, 29, 28,
    const __m128 wki_ = c_j1;                       //  1,  2,  3,  4,
    // Load and shuffle 'a'.
    //   p0 = even (first-of-pair) entries, p1 = odd (second-of-pair) entries;
    //   the k2 side is gathered in descending order to pair up with j2.
    const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]);    //   2,   3,   4,   5,
    const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]);    //   6,   7,   8,   9,
    const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]);  // 120, 121, 122, 123,
    const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]);  // 124, 125, 126, 127,
    const __m128 a_j2_p0 = _mm_shuffle_ps( a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0));  //   2,   4,   6,   8,
    const __m128 a_j2_p1 = _mm_shuffle_ps( a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1));  //   3,   5,   7,   9,
    const __m128 a_k2_p0 = _mm_shuffle_ps( a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2));  // 126, 124, 122, 120,
    const __m128 a_k2_p1 = _mm_shuffle_ps( a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3));  // 127, 125, 123, 121,
    // Calculate 'x'.
    const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);  // 2-126, 4-124, 6-122, 8-120,
    const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);  // 3+127, 5+125, 7+123, 9+121,
    // Calculate product into 'y'.
    //    yr = wkr * xr + wki * xi;
    //    yi = wkr * xi - wki * xr;
    const __m128 a_ = _mm_mul_ps(wkr_, xr_);
    const __m128 b_ = _mm_mul_ps(wki_, xi_);
    const __m128 c_ = _mm_mul_ps(wkr_, xi_);
    const __m128 d_ = _mm_mul_ps(wki_, xr_);
    const __m128 yr_ = _mm_add_ps(a_, b_);  // lanes for index pairs 2/126, 4/124, 6/122, 8/120,
    const __m128 yi_ = _mm_sub_ps(c_, d_);  // lanes for index pairs 3/127, 5/125, 7/123, 9/121,
    // Update 'a'.
    //    a[j2 + 0] = a[j2 + 0] - yr;
    //    a[j2 + 1] = yi - a[j2 + 1];
    //    a[k2 + 0] = yr + a[k2 + 0];
    //    a[k2 + 1] = yi - a[k2 + 1];
    const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_);  //   2,   4,   6,   8,
    const __m128 a_j2_p1n = _mm_sub_ps(yi_, a_j2_p1);  //   3,   5,   7,   9,
    const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_);  // 126, 124, 122, 120,
    const __m128 a_k2_p1n = _mm_sub_ps(yi_, a_k2_p1);  // 127, 125, 123, 121,
    // Shuffle in right order and store.
    const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);  //   2,   3,   4,   5,
    const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);  //   6,   7,   8,   9,
    const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);  // 122, 123, 120, 121,
    const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);  // 126, 127, 124, 125,
    const __m128 a_k2_0n = _mm_shuffle_ps( a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2));  // 120, 121, 122, 123,
    const __m128 a_k2_4n = _mm_shuffle_ps( a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2));  // 124, 125, 126, 127,
    _mm_storeu_ps(&a[0 + j2], a_j2_0n);
    _mm_storeu_ps(&a[4 + j2], a_j2_4n);
    _mm_storeu_ps(&a[122 - j2], a_k2_0n);
    _mm_storeu_ps(&a[126 - j2], a_k2_4n);
  }
  // Scalar code for the remaining items.
  for (; j2 < 64; j1 += 1, j2 += 2) {
    k2 = 128 - j2;
    k1 = 32 - j1;
    wkr = 0.5f - c[k1];
    wki = c[j1];
    xr = a[j2 + 0] - a[k2 + 0];
    xi = a[j2 + 1] + a[k2 + 1];
    yr = wkr * xr + wki * xi;
    yi = wkr * xi - wki * xr;
    a[j2 + 0] = a[j2 + 0] - yr;
    a[j2 + 1] = yi - a[j2 + 1];
    a[k2 + 0] = yr + a[k2 + 0];
    a[k2 + 1] = yi - a[k2 + 1];
  }
  a[65] = -a[65];
}
/**
 * Returns the brush tip height map, (re)generating it when necessary.
 *
 * The tip is a square Terrain of parameters_.size x parameters_.size whose
 * landform values are filled according to parameters_.tipType. It is generated
 * when it does not exist yet, and - for the Mountains tip only, whose noise is
 * sampled at origin-relative positions - when `origin` differs from
 * lastTipOrigin_. All other tip types do not depend on the origin.
 *
 * @param origin Position of the tip; an x coordinate below -500 makes the
 *               function use lastTipOrigin_ instead (presumably a "keep the
 *               previous origin" sentinel - confirm with callers).
 * @return The shared tip terrain (tip_).
 */
QSharedPointer<Terrain> BrushTool::tip(QPoint origin)
{
    bool regenerate = false;

    // Allocate the tip lazily on first use.
    if (!tip_) {
        tip_ = QSharedPointer<Terrain>::create(QSize(parameters_.size, parameters_.size));
        regenerate = true;
    }

    if (origin.x() < -500) {
        origin = lastTipOrigin_;
    }

    // Only the Mountains tip is position dependent.
    if (origin != lastTipOrigin_ && parameters_.tipType == BrushTipType::Mountains) {
        regenerate = true;
    }

    if (!regenerate) {
        return tip_;
    }

    Terrain *terrain = tip_.data();
    const auto size = parameters_.size;
    const float scale = 1.f / size;

    // Maps a pixel index to a doubled, centre-relative coordinate
    // (runs from -(size - 1) to size - 1 in steps of two).
    const auto centered = [size](int v) { return (v << 1) - size + 1; };

    // 1 at the tip centre, decreasing linearly with the distance from it.
    const auto linearFalloff = [&](int x, int y) -> float {
        const int cx = centered(x);
        const int cy = centered(y);
        return 1.f - sqrtf(cx * cx + cy * cy) * scale;
    };

    switch (parameters_.tipType) {
    case BrushTipType::Mountains: {
        // Set rounding mode (required by CoherentNoiseGenerator)
        SseRoundingModeScope roundingModeScope(_MM_ROUND_DOWN);
        (void) roundingModeScope;

        // Re-seed the generator only when the seed parameter changed.
        if (noiseGenSeed != parameters_.seed) {
            noiseGenSeed = parameters_.seed;
            noiseGen.randomize(static_cast<std::uint_fast32_t>(noiseGenSeed));
        }

        auto noise = noiseGen.sampler();
        const __m128i originVec = _mm_setr_epi32(origin.x(), origin.y(), 0, 0);
        const float noiseScale = 10.f / parameters_.scale;

        for (int y = 0; y < size; ++y) {
            for (int x = 0; x < size; ++x) {
                const float falloff = linearFalloff(x, y);
                if (falloff <= 0.f) {
                    terrain->landform(x, y) = 0.f;
                    continue;
                }

                // Sampling position: (pixel + origin) * noiseScale.
                __m128 pos = _mm_cvtepi32_ps(
                    _mm_add_epi32(_mm_setr_epi32(x, y, 0, 0), originVec));
                pos = _mm_mul_ps(pos, _mm_set1_ps(noiseScale));
                const __m128 octave1 = _mm_mul_ps(pos, _mm_set1_ps(0.1f));

                // Remaining octaves sample at (x + y, x - y).
                pos = _mm_unpacklo_ps(_mm_hadd_ps(pos, pos), _mm_hsub_ps(pos, pos));
                const __m128 octave2 = _mm_mul_ps(pos, _mm_set1_ps(0.15f));
                const __m128 octave3 = _mm_mul_ps(pos, _mm_set1_ps(0.3f));
                const __m128 octave4 = _mm_mul_ps(pos, _mm_set1_ps(0.03f));

                float noiseVal = noise.sample(octave1);
                noiseVal += noise.sample(octave2) * .3f;
                noiseVal += noise.sample(octave3) * .15f;
                noiseVal += noise.sample(octave4) * 1.5f;
                noiseVal = std::max(std::min(0.5f + noiseVal * 1.1f, 1.f), 0.f);

                // Shaped falloff, blended back towards the linear falloff by the noise.
                float shaped = falloff * (falloff * (3.f - 2.f * falloff) * 0.8f);
                shaped *= shaped;
                shaped -= 0.1f;
                shaped += (falloff - shaped) * std::abs(noiseVal);

                terrain->landform(x, y) = std::max(0.f, shaped);
            }
        }
        break;
    }

    case BrushTipType::Bell:
        for (int y = 0; y < size; ++y) {
            for (int x = 0; x < size; ++x) {
                const float f = linearFalloff(x, y);
                terrain->landform(x, y) = (f <= 0.f) ? 0.f : f * (f * (3.f - 2.f * f));
            }
        }
        break;

    case BrushTipType::Cone:
        for (int y = 0; y < size; ++y) {
            for (int x = 0; x < size; ++x) {
                const float f = linearFalloff(x, y);
                terrain->landform(x, y) = (f <= 0.f) ? 0.f : f;
            }
        }
        break;

    case BrushTipType::Sphere: {
        const float scaleSq = scale * scale;
        for (int y = 0; y < size; ++y) {
            for (int x = 0; x < size; ++x) {
                const int cx = centered(x);
                const int cy = centered(y);
                const float f = 1.f - (cx * cx + cy * cy) * scaleSq;
                terrain->landform(x, y) = (f <= 0.f) ? 0.f : std::sqrt(f);
            }
        }
        break;
    }

    case BrushTipType::Cylinder:
        for (int y = 0; y < size; ++y) {
            for (int x = 0; x < size; ++x) {
                const int cx = centered(x);
                const int cy = centered(y);
                const float inside = size * size - (cx * cx + cy * cy);
                terrain->landform(x, y) = (inside <= 0.f) ? 0.f : 1.f;
            }
        }
        break;

    case BrushTipType::Square:
        for (int y = 0; y < size; ++y) {
            for (int x = 0; x < size; ++x) {
                terrain->landform(x, y) = 1.f;
            }
        }
        break;
    }

    return tip_;
}
/**
 * Fetches the next captured buffer and queues it as a stereo float segment.
 *
 * Steps, in order:
 *   1. GetNextBuffer() supplies raw frames and their timestamp.
 *   2. Integer PCM (8/16/24/32 bit) is converted to float unless bFloat is set.
 *   3. The input is up-/down-mixed to two channels (mono is duplicated, stereo
 *      is copied, known multi-channel masks are folded down).
 *   4. The buffer is released and, if bResample is set, resampled through
 *      src_process().
 *   5. The result is wrapped in an AudioSegment and handed to AddAudioSegment()
 *      with volume curVolume * sourceVolume.
 *
 * @param curVolume Volume factor multiplied with sourceVolume for the new segment.
 * @return AudioAvailable when a buffer was processed (even if timestamp
 *         smoothing decided not to queue it), NoAudioAvailable when no buffer
 *         was available or resampling failed.
 */
UINT AudioSource::QueryAudio(float curVolume)
{
    LPVOID buffer;
    UINT numAudioFrames;
    QWORD newTimestamp;

    if(!GetNextBuffer((void**)&buffer, &numAudioFrames, &newTimestamp))
        return NoAudioAvailable;

    //------------------------------------------------------------
    // convert to float

    float *captureBuffer;

    if(!bFloat)
    {
        UINT totalSamples = numAudioFrames*inputChannels;
        if(convertBuffer.Num() < totalSamples)
            convertBuffer.SetSize(totalSamples);

        if(inputBitsPerSample == 8)
        {
            float *tempConvert = convertBuffer.Array();
            char *tempSByte = (char*)buffer;

            while(totalSamples--)
            {
                *(tempConvert++) = float(*(tempSByte++))/127.0f;
            }
        }
        else if(inputBitsPerSample == 16)
        {
            float *tempConvert = convertBuffer.Array();
            short *tempShort = (short*)buffer;

            while(totalSamples--)
            {
                *(tempConvert++) = float(*(tempShort++))/32767.0f;
            }
        }
        else if(inputBitsPerSample == 24)
        {
            float *tempConvert = convertBuffer.Array();
            BYTE *tempTriple = (BYTE*)buffer;
            TripleToLong valOut;

            while(totalSamples--)
            {
                // View the three sample bytes the cursor points AT. (Casting the cursor
                // variable itself to a reference would read the bytes of the pointer.)
                // NOTE(review): assumes TripleToLong overlays wVal/tripleVal/lastByte on
                // val, as its use here implies -- the type is declared elsewhere.
                TripleToLong &valIn = *(TripleToLong*)tempTriple;

                valOut.wVal      = valIn.wVal;
                valOut.tripleVal = valIn.tripleVal;

                // Sign-extend into the top byte. It is written for every sample so that
                // neither an uninitialized value nor the 0xFF of an earlier negative
                // sample leaks into a positive one.
                valOut.lastByte  = (valOut.tripleVal > 0x7F) ? 0xFF : 0x00;

                *(tempConvert++) = float(double(valOut.val)/8388607.0);
                tempTriple += 3;
            }
        }
        else if(inputBitsPerSample == 32)
        {
            float *tempConvert = convertBuffer.Array();
            long *tempLong = (long*)buffer;

            while(totalSamples--)
            {
                *(tempConvert++) = float(double(*(tempLong++))/2147483647.0);
            }
        }

        captureBuffer = convertBuffer.Array();
    }
    else
        captureBuffer = (float*)buffer;

    //------------------------------------------------------------
    // channel upmix/downmix (output is always two channels)

    if(tempBuffer.Num() < numAudioFrames*2)
        tempBuffer.SetSize(numAudioFrames*2);

    float *dataOutputBuffer = tempBuffer.Array();

    if(inputChannels == 1)
    {
        UINT  numFloats  = numAudioFrames;
        float *inputTemp  = (float*)captureBuffer;
        float *outputTemp = dataOutputBuffer;

        // SSE path only when both buffers are 16-byte aligned
        if((UPARAM(inputTemp) & 0xF) == 0 && (UPARAM(outputTemp) & 0xF) == 0)
        {
            UINT alignedFloats = numFloats & 0xFFFFFFFC;
            for(UINT i=0; i<alignedFloats; i += 4)
            {
                __m128 inVal   = _mm_load_ps(inputTemp+i);

                // duplicate every sample: a b c d -> a a b b | c c d d
                __m128 outVal1 = _mm_unpacklo_ps(inVal, inVal);
                __m128 outVal2 = _mm_unpackhi_ps(inVal, inVal);

                _mm_store_ps(outputTemp+(i*2),   outVal1);
                _mm_store_ps(outputTemp+(i*2)+4, outVal2);
            }

            numFloats  -= alignedFloats;
            inputTemp  += alignedFloats;
            outputTemp += alignedFloats*2;
        }

        // scalar tail (or everything, when unaligned)
        while(numFloats--)
        {
            float inputVal = *inputTemp;
            *(outputTemp++) = inputVal;
            *(outputTemp++) = inputVal;

            inputTemp++;
        }
    }
    else if(inputChannels == 2) //straight up copy
    {
        SSECopy(dataOutputBuffer, captureBuffer, numAudioFrames*2*sizeof(float));
    }
    else
    {
        //todo: downmix optimization, also support for other speaker configurations than ones I can merely "think" of.  ugh.
        // NOTE(review): a channel mask not listed below falls through every branch, so the
        // output buffer keeps whatever it contained before -- confirm whether that is intended.
        float *inputTemp  = (float*)captureBuffer;
        float *outputTemp = dataOutputBuffer;

        if(inputChannelMask == KSAUDIO_SPEAKER_QUAD)
        {
            UINT numFloats = numAudioFrames*4;
            float *endTemp = inputTemp+numFloats;

            while(inputTemp < endTemp)
            {
                float left      = inputTemp[0];
                float right     = inputTemp[1];
                float rearLeft  = inputTemp[2]*surroundMix4;
                float rearRight = inputTemp[3]*surroundMix4;

                // When in doubt, use only left and right .... and rear left and rear right :)
                // Same idea as with 5.1 downmix

                *(outputTemp++) = (left  + rearLeft)  * attn4dotX;
                *(outputTemp++) = (right + rearRight) * attn4dotX;

                inputTemp += 4;
            }
        }
        else if(inputChannelMask == KSAUDIO_SPEAKER_2POINT1)
        {
            UINT numFloats = numAudioFrames*3;
            float *endTemp = inputTemp+numFloats;

            while(inputTemp < endTemp)
            {
                float left  = inputTemp[0];
                float right = inputTemp[1];

                // Drop LFE since we don't need it
                //float lfe = inputTemp[2]*lowFreqMix;

                *(outputTemp++) = left;
                *(outputTemp++) = right;

                inputTemp += 3;
            }
        }
        else if(inputChannelMask == KSAUDIO_SPEAKER_4POINT1)
        {
            UINT numFloats = numAudioFrames*5;
            float *endTemp = inputTemp+numFloats;

            while(inputTemp < endTemp)
            {
                float left      = inputTemp[0];
                float right     = inputTemp[1];

                // Skip LFE , we don't really need it.
                //float lfe = inputTemp[2];

                float rearLeft  = inputTemp[3]*surroundMix4;
                float rearRight = inputTemp[4]*surroundMix4;

                // Same idea as with 5.1 downmix

                *(outputTemp++) = (left  + rearLeft)  * attn4dotX;
                *(outputTemp++) = (right + rearRight) * attn4dotX;

                inputTemp += 5;
            }
        }
        else if(inputChannelMask == KSAUDIO_SPEAKER_SURROUND)
        {
            UINT numFloats = numAudioFrames*4;
            float *endTemp = inputTemp+numFloats;

            while(inputTemp < endTemp)
            {
                float left  = inputTemp[0];
                float right = inputTemp[1];

                // Front center (inputTemp[2]) and rear center (inputTemp[3]) are ignored.
                // When in doubt, use only left and right :) Seriously.
                // THIS NEEDS TO BE PROPERLY IMPLEMENTED!

                *(outputTemp++) = left;
                *(outputTemp++) = right;

                inputTemp += 4;
            }
        }
        // Both speakers configs share the same format, the difference is in rear speakers position
        // See: http://msdn.microsoft.com/en-us/library/windows/hardware/ff537083(v=vs.85).aspx
        // Probably for KSAUDIO_SPEAKER_5POINT1_SURROUND we will need a different coefficient for rear left/right
        else if(inputChannelMask == KSAUDIO_SPEAKER_5POINT1 || inputChannelMask == KSAUDIO_SPEAKER_5POINT1_SURROUND)
        {
            UINT numFloats = numAudioFrames*6;
            float *endTemp = inputTemp+numFloats;

            while(inputTemp < endTemp)
            {
                float left      = inputTemp[0];
                float right     = inputTemp[1];
                float center    = inputTemp[2]*centerMix;

                //We don't need LFE channel so skip it (see below)
                //float lowFreq   = inputTemp[3]*lowFreqMix;

                float rearLeft  = inputTemp[4]*surroundMix;
                float rearRight = inputTemp[5]*surroundMix;

                // According to ITU-R  BS.775-1 recommendation, the downmix from a 3/2 source to stereo
                // is the following:
                // L = FL + k0*C + k1*RL
                // R = FR + k0*C + k1*RR
                // FL = front left
                // FR = front right
                // C  = center
                // RL = rear left
                // RR = rear right
                // k0 = centerMix   = dbMinus3 = 0.7071067811865476 [for k0 we can use dbMinus6 = 0.5 too, probably it's better]
                // k1 = surroundMix = dbMinus3 = 0.7071067811865476

                // The output (L,R) can be out of (-1,1) domain so we attenuate it [ attn5dot1 = 1/(1 + centerMix + surroundMix) ]
                // Note: this method of downmixing is far from "perfect" (pretty sure it's not the correct way) but the resulting downmix is "okayish", at least no more bleeding ears.
                // (maybe have a look at http://forum.doom9.org/archive/index.php/t-148228.html too [ 5.1 -> stereo ] the approach seems almost the same [but different coefficients])
                // http://acousticsfreq.com/blog/wp-content/uploads/2012/01/ITU-R-BS775-1.pdf
                // http://ir.lib.nctu.edu.tw/bitstream/987654321/22934/1/030104001.pdf

                *(outputTemp++) = (left  + center + rearLeft)  * attn5dot1;
                *(outputTemp++) = (right + center + rearRight) * attn5dot1;

                inputTemp += 6;
            }
        }
        // According to http://msdn.microsoft.com/en-us/library/windows/hardware/ff537083(v=vs.85).aspx
        // KSAUDIO_SPEAKER_7POINT1 is obsolete and no longer supported in Windows Vista and later versions of Windows
        // Not sure what to do about it, meh , drop front left of center/front right of center -> 5.1 -> stereo;
        else if(inputChannelMask == KSAUDIO_SPEAKER_7POINT1)
        {
            UINT numFloats = numAudioFrames*8;
            float *endTemp = inputTemp+numFloats;

            while(inputTemp < endTemp)
            {
                float left      = inputTemp[0];
                float right     = inputTemp[1];
                float center    = inputTemp[2] * centerMix;

                // Drop LFE since we don't need it
                //float lowFreq   = inputTemp[3]*lowFreqMix;

                float rearLeft  = inputTemp[4] * surroundMix;
                float rearRight = inputTemp[5] * surroundMix;

                // Drop SPEAKER_FRONT_LEFT_OF_CENTER , SPEAKER_FRONT_RIGHT_OF_CENTER
                //float centerLeft  = inputTemp[6];
                //float centerRight = inputTemp[7];

                // Downmix from 5.1 to stereo
                *(outputTemp++) = (left  + center + rearLeft)  * attn5dot1;
                *(outputTemp++) = (right + center + rearRight) * attn5dot1;

                inputTemp += 8;
            }
        }
        // Downmix to 5.1 (easy stuff) then downmix to stereo as done in KSAUDIO_SPEAKER_5POINT1
        else if(inputChannelMask == KSAUDIO_SPEAKER_7POINT1_SURROUND)
        {
            UINT numFloats = numAudioFrames*8;
            float *endTemp = inputTemp+numFloats;

            while(inputTemp < endTemp)
            {
                float left      = inputTemp[0];
                float right     = inputTemp[1];
                float center    = inputTemp[2] * centerMix;

                // Skip LFE we don't need it
                //float lowFreq   = inputTemp[3]*lowFreqMix;

                float rearLeft  = inputTemp[4];
                float rearRight = inputTemp[5];
                float sideLeft  = inputTemp[6];
                float sideRight = inputTemp[7];

                // combine the rear/side channels first , baaam! 5.1
                rearLeft  = (rearLeft  + sideLeft)  * 0.5f;
                rearRight = (rearRight + sideRight) * 0.5f;

                // downmix to stereo as in 5.1 case
                *(outputTemp++) = (left  + center + rearLeft  * surroundMix) * attn5dot1;
                *(outputTemp++) = (right + center + rearRight * surroundMix) * attn5dot1;

                inputTemp += 8;
            }
        }
    }

    // Everything needed has been copied out of the capture buffer by now.
    ReleaseBuffer();

    //------------------------------------------------------------
    // resample

    if(bResample)
    {
        // one extra frame of headroom for the resampler output
        UINT frameAdjust = UINT((double(numAudioFrames) * resampleRatio) + 1.0);
        UINT newFrameSize = frameAdjust*2;

        if(tempResampleBuffer.Num() < newFrameSize)
            tempResampleBuffer.SetSize(newFrameSize);

        SRC_DATA data;
        data.src_ratio = resampleRatio;

        data.data_in = tempBuffer.Array();
        data.input_frames = numAudioFrames;

        data.data_out = tempResampleBuffer.Array();
        data.output_frames = frameAdjust;

        data.end_of_input = 0;

        int err = src_process((SRC_STATE*)resampler, &data);
        if(err)
        {
            RUNONCE AppWarning(TEXT("AudioSource::QueryAudio: Was unable to resample audio for device '%s'"), GetDeviceName());
            return NoAudioAvailable;
        }

        if(data.input_frames_used != numAudioFrames)
        {
            RUNONCE AppWarning(TEXT("AudioSource::QueryAudio: Failed to downsample buffer completely, which shouldn't actually happen because it should be using 10ms of samples"));
            return NoAudioAvailable;
        }

        numAudioFrames = data.output_frames_gen;
    }

    //-----------------------------------------------------------------------------
    // sort all audio frames into 10 millisecond increments (done because not all devices output in 10ms increments)
    // NOTE: 0.457+ - instead of using the timestamps from windows, just compare and make sure it stays within a 100ms of their timestamps

    if(!bFirstBaseFrameReceived)
    {
        lastUsedTimestamp = newTimestamp;
        bFirstBaseFrameReceived = true;
    }

    float *newBuffer = (bResample) ? tempResampleBuffer.Array() : tempBuffer.Array();

    if(bSmoothTimestamps)
    {
        lastUsedTimestamp += 10;

        // Resynchronize to the device timestamp once the running one drifted by more than 70.
        QWORD difVal = GetQWDif(newTimestamp, lastUsedTimestamp);
        if(difVal > 70)
        {
            //OSDebugOut(TEXT("----------------------------1\r\nlastUsedTimestamp before: %llu - device: %s\r\n"), lastUsedTimestamp, GetDeviceName());
            lastUsedTimestamp = newTimestamp;
            //OSDebugOut(TEXT("lastUsedTimestamp after: %llu\r\n"), lastUsedTimestamp);
        }

        // Only queue segments that move forward in time; keep at least 10 between sent segments.
        if(lastUsedTimestamp > lastSentTimestamp)
        {
            QWORD adjustVal = (lastUsedTimestamp-lastSentTimestamp);
            if(adjustVal < 10)
                lastUsedTimestamp += 10-adjustVal;

            AudioSegment *newSegment = new AudioSegment(newBuffer, numAudioFrames*2, lastUsedTimestamp);
            AddAudioSegment(newSegment, curVolume*sourceVolume);

            lastSentTimestamp = lastUsedTimestamp;
        }
    }
    else
    {
        // OSDebugOut(TEXT("newTimestamp: %llu\r\n"), newTimestamp);
        AudioSegment *newSegment = new AudioSegment(newBuffer, numAudioFrames*2, newTimestamp);
        AddAudioSegment(newSegment, curVolume*sourceVolume);
    }

    //-----------------------------------------------------------------------------

    return AudioAvailable;
}
/**
 * Evaluates the current model p->H against the point matches with an early
 * exit: after every tested point (or group of four) the running likelihood
 * ratio `lambda` is updated and the model is rejected as soon as it exceeds
 * the threshold p->A.
 *
 * H is read as a 3x3 matrix stored with a row stride of 4 floats (elements
 * 0..2, 4..6 and 8..10). A source point (x, y) is projected through H and
 * counts as an inlier when its squared distance to the matching destination
 * point is at most p->maxD squared.
 *
 * Reads:  p->src, p->dst (interleaved x,y floats), p->H, p->N, p->maxD,
 *         p->delta, p->epsilon, p->lambdaTBL, p->A.
 * Writes: p->inl      - number of inliers among the tested points,
 *         p->N_tested - number of points tested before stopping,
 *         p->good     - 0 if the model was rejected, non-zero otherwise.
 *
 * The vector path uses aligned loads, so src and dst must be 16-byte aligned.
 */
static inline void sacEvaluateModelSPRT(PROSAC_HEST* p){
    /* Number of set bits for every 4-bit mask value. */
    /*                               0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 */
    static const unsigned bitCnt[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};

    unsigned i;
    unsigned isInlier;
    double   lambda       = 1.0;
    double   lambdaReject = ((1.0 - p->delta) / (1.0 - p->epsilon));
    double   lambdaAccept = ((   p->delta   ) / (   p->epsilon   ));
    float    distSq       = p->maxD*p->maxD;
    float*   src          = (float*)p->src;
    float*   dst          = (float*)p->dst;
    float*   H            = p->H;


    p->inl      = 0;
    p->N_tested = 0;
    p->good     = 1;


    /* VECTOR */
    const __m128 distSqV=_mm_set1_ps(distSq);

    const __m128 H00=_mm_set1_ps(H[0]);
    const __m128 H01=_mm_set1_ps(H[1]);
    const __m128 H02=_mm_set1_ps(H[2]);
    const __m128 H10=_mm_set1_ps(H[4]);
    const __m128 H11=_mm_set1_ps(H[5]);
    const __m128 H12=_mm_set1_ps(H[6]);
    const __m128 H20=_mm_set1_ps(H[8]);
    const __m128 H21=_mm_set1_ps(H[9]);
    const __m128 H22=_mm_set1_ps(H[10]);

    /* Four points per iteration. The bound is written as i+4 <= N rather than
     * i < N-3: the latter wraps around in unsigned arithmetic when N < 3 and
     * would run far past the end of the arrays. */
    for(i=0;i+4 <= p->N && p->good;i+=4){
        /* Backproject */
        __m128 x, y, X, Y, inter0, inter1, inter2, inter3;

        /* Lane contents below are listed from lane 0 upwards. */
        x=_mm_load_ps(src+2*i);        /* x0 y0 x1 y1 */
        y=_mm_load_ps(src+2*i+4);      /* x2 y2 x3 y3 */
        X=_mm_load_ps(dst+2*i);        /* X0 Y0 X1 Y1 */
        Y=_mm_load_ps(dst+2*i+4);      /* X2 Y2 X3 Y3 */

        inter0=_mm_unpacklo_ps(x,y);   /* x0 x2 y0 y2 */
        inter1=_mm_unpackhi_ps(x,y);   /* x1 x3 y1 y3 */
        inter2=_mm_unpacklo_ps(X,Y);   /* X0 X2 Y0 Y2 */
        inter3=_mm_unpackhi_ps(X,Y);   /* X1 X3 Y1 Y3 */

        /* After this step each register holds one coordinate of points
         * i, i+2, i+1, i+3 (in that lane order). */
        x=_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(inter0), _mm_castps_pd(inter1)));
        y=_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(inter0), _mm_castps_pd(inter1)));
        X=_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(inter2), _mm_castps_pd(inter3)));
        Y=_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(inter2), _mm_castps_pd(inter3)));

        __m128 reprojX = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H00, x), _mm_mul_ps(H01, y)), H02);
        __m128 reprojY = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H10, x), _mm_mul_ps(H11, y)), H12);
        __m128 reprojZ = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H20, x), _mm_mul_ps(H21, y)), H22);

        /* Approximate reciprocal instead of an exact division (the scalar path
         * below divides exactly), so borderline points may be classified
         * differently by the two paths. */
        __m128 recipZ = _mm_rcp_ps(reprojZ);
        reprojX = _mm_mul_ps(reprojX, recipZ);
        reprojY = _mm_mul_ps(reprojY, recipZ);
        //reprojX = _mm_div_ps(reprojX, reprojZ);
        //reprojY = _mm_div_ps(reprojY, reprojZ);

        /* Squared reprojection error */
        reprojX = _mm_sub_ps(reprojX, X);
        reprojY = _mm_sub_ps(reprojY, Y);

        reprojX = _mm_mul_ps(reprojX, reprojX);
        reprojY = _mm_mul_ps(reprojY, reprojY);

        __m128 reprojDistV = _mm_add_ps(reprojX, reprojY);

        /* One mask bit per lane, set when that point is an inlier. */
        __m128 cmp = _mm_cmple_ps(reprojDistV, distSqV);
        int msk = _mm_movemask_ps(cmp);

        p->inl += bitCnt[msk];


        /* SPRT */
        /* NOTE(review): the mask bits follow the permuted lane order described
         * above; this is harmless provided lambdaTBL depends only on how many
         * bits are set -- confirm where the table is built. */
        lambda *= p->lambdaTBL[msk];
        p->good = lambda <= p->A;
        /* If !p->good, the threshold A was exceeded, so we're rejecting */
    }

    /* SCALAR */
    for(;i<p->N && p->good;i++){
        /* Backproject */
        float x=src[i*2],y=src[i*2+1];
        float X=dst[i*2],Y=dst[i*2+1];

        float reprojX=H[0]*x+H[1]*y+H[2]; //  ( X_1 )     ( H_11 H_12    H_13  ) (x_1)
        float reprojY=H[4]*x+H[5]*y+H[6]; //  ( X_2 )  =  ( H_21 H_22    H_23  ) (x_2)
        float reprojZ=H[8]*x+H[9]*y+H[10];//  ( X_3 )     ( H_31 H_32 H_33=1.0 ) (x_3 = 1.0)

        //reproj is in homogeneous coordinates. To bring back to "regular" coordinates, divide by Z.
        reprojX/=reprojZ;
        reprojY/=reprojZ;

        //Compute distance
        reprojX-=X;
        reprojY-=Y;
        reprojX*=reprojX;
        reprojY*=reprojY;
        float reprojDist = reprojX+reprojY;

        isInlier    = reprojDist <= distSq;
        p->inl     += isInlier;


        /* SPRT */
        lambda *= isInlier ? lambdaAccept : lambdaReject;
        p->good = lambda <= p->A;
        /* If !p->good, the threshold A was exceeded, so we're rejecting */
    }


    /* i has already been advanced past the last tested point(s). */
    p->N_tested = i;
}
/* Interleaves the two low lanes of s1 and s2: { s1[0], s2[0], s1[1], s2[1] }. */
__m128
test (__m128 s1, __m128 s2)
{
  return _mm_unpacklo_ps (s1, s2);
}