C++ (Cpp) _mm_unpacklo_ps 예제들

예제 #1

0

파일 보기

파일: OptimizedSelfAdjointMatrix6x6f.cpp 프로젝트: aprovodi/ROSplayground

/// Rank-one update: accumulates alpha * u * u^T into the packed storage `data`.
///
/// `data` is treated as six consecutive groups of four floats. Each group holds
/// the 2x2 outer-product block of two element pairs of u, in the order
/// (12 x 12), (12 x 34), (12 x 56), (34 x 34), (34 x 56), (56 x 56).
/// `data` is accessed with aligned loads/stores and must be 16-byte aligned.
///
/// @param u      6-element vector.
/// @param alpha  Scale applied to the outer product.
void OptimizedSelfAdjointMatrix6x6f::rankUpdate(const Eigen::Matrix<float, 6, 1>& u, const float& alpha)
{
  __m128 s = _mm_set1_ps(alpha);
  __m128 v1234 = _mm_loadu_ps(u.data());
  // Only two floats remain after the first four; load exactly those two into
  // the low half (upper half zero) instead of reading past the end of u.
  __m128 v56xx = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(u.data() + 4));

  __m128 v1212 = _mm_movelh_ps(v1234, v1234);
  __m128 v3434 = _mm_movehl_ps(v1234, v1234);
  __m128 v5656 = _mm_movelh_ps(v56xx, v56xx);

  // alpha * (u1, u1, u2, u2)
  __m128 v1122 = _mm_mul_ps(s, _mm_unpacklo_ps(v1212, v1212));

  _mm_store_ps(data + 0, _mm_add_ps(_mm_load_ps(data + 0), _mm_mul_ps(v1122, v1212)));
  _mm_store_ps(data + 4, _mm_add_ps(_mm_load_ps(data + 4), _mm_mul_ps(v1122, v3434)));
  _mm_store_ps(data + 8, _mm_add_ps(_mm_load_ps(data + 8), _mm_mul_ps(v1122, v5656)));

  // alpha * (u3, u3, u4, u4)
  __m128 v3344 = _mm_mul_ps(s, _mm_unpacklo_ps(v3434, v3434));

  _mm_store_ps(data + 12, _mm_add_ps(_mm_load_ps(data + 12), _mm_mul_ps(v3344, v3434)));
  _mm_store_ps(data + 16, _mm_add_ps(_mm_load_ps(data + 16), _mm_mul_ps(v3344, v5656)));

  // alpha * (u5, u5, u6, u6)
  __m128 v5566 = _mm_mul_ps(s, _mm_unpacklo_ps(v5656, v5656));

  _mm_store_ps(data + 20, _mm_add_ps(_mm_load_ps(data + 20), _mm_mul_ps(v5566, v5656)));
}

예제 #2

0

파일 보기

파일: util.cpp 프로젝트: shewu/raytracer

/* Computes ret = mat1 * mat2 for 4x4 float matrices.
 *
 * mat2 is transposed into a local scratch matrix so that every output element
 * becomes a row-by-row product evaluated by mul_asm (defined elsewhere;
 * presumably a 4-element dot product -- confirm).  The caller's mat2 is left
 * untouched; the previous implementation transposed it in place.
 *
 * NOTE(review): mat1 rows are still handed to mul_asm directly, so whatever
 * alignment mul_asm requires continues to apply to mat1.  ret must not alias
 * mat1, because ret is written while mat1 is still being read. */
void mulMatrix1(Matrix4x4 ret, Matrix4x4 mat1, Matrix4x4 mat2)
{
    /* 1. transpose mat2 into a local, 16-byte aligned copy */
    __m128 row0, row1, row2, row3;
    __m128 tmp0, tmp1, tmp2, tmp3;
    alignas(16) float mat2T[4][4];

    /* Load 4x4 mat2 from memory into four SSE registers (unaligned loads, so
     * mat2 itself carries no alignment requirement). */
    row0 = _mm_loadu_ps( mat2[0] );
    row1 = _mm_loadu_ps( mat2[1] );
    row2 = _mm_loadu_ps( mat2[2] );
    row3 = _mm_loadu_ps( mat2[3] );

    /* Interleave the low/high element pairs of neighbouring rows. */
    tmp0 = _mm_unpacklo_ps( row0, row1 );
    tmp2 = _mm_unpacklo_ps( row2, row3 );
    tmp1 = _mm_unpackhi_ps( row0, row1 );
    tmp3 = _mm_unpackhi_ps( row2, row3 );

    /* Combine the halves so that each register holds one column of mat2. */
    _mm_store_ps( mat2T[0], _mm_movelh_ps( tmp0, tmp2 ) );
    _mm_store_ps( mat2T[1], _mm_movehl_ps( tmp2, tmp0 ) );
    _mm_store_ps( mat2T[2], _mm_movelh_ps( tmp1, tmp3 ) );
    _mm_store_ps( mat2T[3], _mm_movehl_ps( tmp3, tmp1 ) );

    /* 2. one row-by-row product per output element (16 in total) */
    for (int i = 0; i < 4; i++)
    {
        for (int j = 0; j < 4; j++)
        {
            ret[i][j] = mul_asm(mat1[i], mat2T[j]);
        }
    }
}

예제 #3

0

파일 보기

/// Transform this box using the specified transform matrix.
///
/// @param[in] rTransform  Matrix by which to transform.
void Helium::Simd::AaBox::TransformBy( const Matrix44& rTransform )
{
    // Expand each corner position.
    Register minVec = m_minimum.GetSimdVector();
    Register maxVec = m_maximum.GetSimdVector();

    Vector3Soa corners0;
    corners0.m_x = _mm_shuffle_ps( minVec, minVec, _MM_SHUFFLE( 0, 0, 0, 0 ) );
    corners0.m_y = _mm_shuffle_ps( minVec, maxVec, _MM_SHUFFLE( 1, 1, 1, 1 ) );
    corners0.m_z = _mm_unpackhi_ps( minVec, maxVec );
    corners0.m_z = _mm_movelh_ps( corners0.m_z, corners0.m_z );

    Vector3Soa corners1;
    corners1.m_x = _mm_shuffle_ps( maxVec, maxVec, _MM_SHUFFLE( 0, 0, 0, 0 ) );
    corners1.m_y = corners0.m_y;
    corners1.m_z = corners0.m_z;

    // Transform all corners by the provided transformation matrix.
    Matrix44Soa transformSplat( rTransform );
    transformSplat.TransformPoint( corners0, corners0 );
    transformSplat.TransformPoint( corners1, corners1 );

    // Compute the minimum.
    Register minX = Simd::MinF32( corners0.m_x, corners1.m_x );
    Register minY = Simd::MinF32( corners0.m_y, corners1.m_y );
    Register minXYLo = _mm_unpacklo_ps( minX, minY );
    Register minXYHi = _mm_unpackhi_ps( minX, minY );
    Register minXY = Simd::MinF32( minXYLo, minXYHi );

    Register minZ = Simd::MinF32( corners0.m_z, corners1.m_z );
    Register minZLo = _mm_unpacklo_ps( minZ, minZ );
    Register minZHi = _mm_unpackhi_ps( minZ, minZ );
    minZ = Simd::MinF32( minZLo, minZHi );

    Register minLo = _mm_movelh_ps( minXY, minZ );
    Register minHi = _mm_movehl_ps( minZ, minXY );

    m_minimum.SetSimdVector( Simd::MinF32( minLo, minHi ) );

    // Compute the maximum.
    Register maxX = Simd::MaxF32( corners0.m_x, corners1.m_x );
    Register maxY = Simd::MaxF32( corners0.m_y, corners1.m_y );
    Register maxXYLo = _mm_unpacklo_ps( maxX, maxY );
    Register maxXYHi = _mm_unpackhi_ps( maxX, maxY );
    Register maxXY = Simd::MaxF32( maxXYLo, maxXYHi );

    Register maxZ = Simd::MaxF32( corners0.m_z, corners1.m_z );
    Register maxZLo = _mm_unpacklo_ps( maxZ, maxZ );
    Register maxZHi = _mm_unpackhi_ps( maxZ, maxZ );
    maxZ = Simd::MaxF32( maxZLo, maxZHi );

    Register maxLo = _mm_movelh_ps( maxXY, maxZ );
    Register maxHi = _mm_movehl_ps( maxZ, maxXY );

    m_maximum.SetSimdVector( Simd::MaxF32( maxLo, maxHi ) );
}

예제 #4

0

파일 보기

파일: TestMatrixMult.cpp 프로젝트: shobomaru/TestMatrixMult

// Transposes the 4x4 float matrix held in v1[0..3] and writes the result to
// vout[0..3].  All inputs are read before the first output is written.
static void NOINLINE transposeX4( const __m128 *v1, __m128 *vout )
{
    const __m128 r0 = v1[ 0 ];
    const __m128 r1 = v1[ 1 ];
    const __m128 r2 = v1[ 2 ];
    const __m128 r3 = v1[ 3 ];

    // First pass: interleave rows 0/2 and rows 1/3.
    const __m128 evenLo = _mm_unpacklo_ps( r0, r2 );  // r0[0] r2[0] r0[1] r2[1]
    const __m128 oddLo  = _mm_unpacklo_ps( r1, r3 );  // r1[0] r3[0] r1[1] r3[1]
    const __m128 evenHi = _mm_unpackhi_ps( r0, r2 );  // r0[2] r2[2] r0[3] r2[3]
    const __m128 oddHi  = _mm_unpackhi_ps( r1, r3 );  // r1[2] r3[2] r1[3] r3[3]

    // Second pass: interleave the even/odd results to finish each column.
    vout[ 0 ] = _mm_unpacklo_ps( evenLo, oddLo );
    vout[ 1 ] = _mm_unpackhi_ps( evenLo, oddLo );
    vout[ 2 ] = _mm_unpacklo_ps( evenHi, oddHi );
    vout[ 3 ] = _mm_unpackhi_ps( evenHi, oddHi );
}

예제 #5

0

파일 보기

파일: overexposed.c 프로젝트: Coshibu/darktable

// Copies the input image to the output, replacing pixels that exceed the
// "upper" threshold in any of their first three channels with one marker
// color, and pixels that are at or below the "lower" threshold in all of
// their first three channels with another marker color.
//
// self     module instance; only self->dev is read here
// piece    pixelpipe piece; supplies the channel count and the mask_display flag
// ivoid    input buffer, read as float
// ovoid    output buffer, written as float
// roi_in   not used in this function
// roi_out  supplies the width and height that are iterated over
void
process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void * const ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t * const roi_out)
{
  dt_develop_t *dev = self->dev;

  const int ch = piece->colors;

  // Thresholds are divided by 100 before use. The fourth lane is FLT_MAX so
  // that it practically never triggers the ">=" test and practically always
  // passes the "<=" test, i.e. the fourth channel does not influence the result.
  const __m128 upper = _mm_set_ps(FLT_MAX,
                                  dev->overexposed.upper / 100.0f,
                                  dev->overexposed.upper / 100.0f,
                                  dev->overexposed.upper / 100.0f);
  const __m128 lower = _mm_set_ps(FLT_MAX,
                                  dev->overexposed.lower / 100.0f,
                                  dev->overexposed.lower / 100.0f,
                                  dev->overexposed.lower / 100.0f);

  // Marker colors for the selected color scheme: [0] marks pixels above the
  // upper threshold, [1] marks pixels below the lower threshold.
  const int colorscheme = dev->overexposed.colorscheme;
  const __m128 upper_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][0]);
  const __m128 lower_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][1]);

#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(ovoid) schedule(static)
#endif
  for(int k=0; k<roi_out->height; k++)
  {
    // Row start uses ch floats per pixel, while the inner loop advances by a
    // fixed 4 floats per pixel -- NOTE(review): this assumes ch == 4; confirm.
    const float *in = ((float *)ivoid) + (size_t)ch*k*roi_out->width;
    float *out = ((float *)ovoid) + (size_t)ch*k*roi_out->width;

    for (int j=0; j<roi_out->width; j++,in+=4,out+=4)
    {
      // Aligned load: the pixel address must be 16-byte aligned.
      const __m128 pixel = _mm_load_ps(in);

      // Per-lane "above upper" mask, then OR-reduced across all four lanes so
      // every lane is set if any lane was set.
      __m128 isoe = _mm_cmpge_ps(pixel, upper);
      isoe = _mm_or_ps(_mm_unpacklo_ps(isoe, isoe), _mm_unpackhi_ps(isoe, isoe));
      isoe = _mm_or_ps(_mm_unpacklo_ps(isoe, isoe), _mm_unpackhi_ps(isoe, isoe));

      // Per-lane "below lower" mask, then AND-reduced across all four lanes so
      // every lane is set only if all lanes were set.
      __m128 isue = _mm_cmple_ps(pixel, lower);
      isue = _mm_and_ps(_mm_unpacklo_ps(isue, isue), _mm_unpackhi_ps(isue, isue));
      isue = _mm_and_ps(_mm_unpacklo_ps(isue, isue), _mm_unpackhi_ps(isue, isue));

      // Bitwise select: upper_color where isoe is set, otherwise the pixel.
      __m128 result = _mm_or_ps(_mm_andnot_ps(isoe, pixel),
                                _mm_and_ps(isoe, upper_color));

      // Bitwise select: lower_color where isue is set, otherwise the previous
      // result (so the lower marker wins if both masks are set).
      result = _mm_or_ps(_mm_andnot_ps(isue, result),
                         _mm_and_ps(isue, lower_color));

      // Non-temporal store; made globally visible by the _mm_sfence() below.
      _mm_stream_ps(out, result);
    }
  }
  _mm_sfence();

  // When the pipe is displaying masks, the alpha channel is copied over from
  // the input by dt_iop_alpha_copy.
  if(piece->pipe->mask_display)
    dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}

예제 #6

0

파일 보기

파일: AudioNodeEngineSSE2.cpp 프로젝트: jasonLaster/gecko-dev

// Element-wise complex multiplication of two interleaved (real, imaginary)
// float buffers: aOutput[k] = aInput[k] * aScale[k].  The loop covers
// aSize * 2 floats and consumes 16 floats (8 complex values) per iteration.
// All three buffers are accessed with aligned loads/stores.
void BufferComplexMultiply_SSE(const float* aInput, const float* aScale,
                               float* aOutput, uint32_t aSize) {
  ASSERT_ALIGNED16(aInput);
  ASSERT_ALIGNED16(aScale);
  ASSERT_ALIGNED16(aOutput);
  ASSERT_MULTIPLE16(aSize);

  for (unsigned i = 0; i < aSize * 2; i += 16) {
    // Load the input and split it into real (even lanes) and imaginary
    // (odd lanes) parts.
    const __m128 x0 = _mm_load_ps(&aInput[i]);
    const __m128 x1 = _mm_load_ps(&aInput[i + 4]);
    const __m128 x2 = _mm_load_ps(&aInput[i + 8]);
    const __m128 x3 = _mm_load_ps(&aInput[i + 12]);

    const __m128 xReLo = _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 0, 2, 0));
    const __m128 xImLo = _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(3, 1, 3, 1));
    const __m128 xReHi = _mm_shuffle_ps(x2, x3, _MM_SHUFFLE(2, 0, 2, 0));
    const __m128 xImHi = _mm_shuffle_ps(x2, x3, _MM_SHUFFLE(3, 1, 3, 1));

    // Same split for the scale factors.
    const __m128 s0 = _mm_load_ps(&aScale[i]);
    const __m128 s1 = _mm_load_ps(&aScale[i + 4]);
    const __m128 s2 = _mm_load_ps(&aScale[i + 8]);
    const __m128 s3 = _mm_load_ps(&aScale[i + 12]);

    const __m128 sReLo = _mm_shuffle_ps(s0, s1, _MM_SHUFFLE(2, 0, 2, 0));
    const __m128 sImLo = _mm_shuffle_ps(s0, s1, _MM_SHUFFLE(3, 1, 3, 1));
    const __m128 sReHi = _mm_shuffle_ps(s2, s3, _MM_SHUFFLE(2, 0, 2, 0));
    const __m128 sImHi = _mm_shuffle_ps(s2, s3, _MM_SHUFFLE(3, 1, 3, 1));

    // (a + bi)(c + di) = (ac - bd) + (ad + bc)i
    const __m128 prodReLo = _mm_sub_ps(_mm_mul_ps(xReLo, sReLo),
                                       _mm_mul_ps(xImLo, sImLo));
    const __m128 prodImLo = _mm_add_ps(_mm_mul_ps(xReLo, sImLo),
                                       _mm_mul_ps(xImLo, sReLo));
    const __m128 prodReHi = _mm_sub_ps(_mm_mul_ps(xReHi, sReHi),
                                       _mm_mul_ps(xImHi, sImHi));
    const __m128 prodImHi = _mm_add_ps(_mm_mul_ps(xReHi, sImHi),
                                       _mm_mul_ps(xImHi, sReHi));

    // Re-interleave real and imaginary parts and write them out.
    _mm_store_ps(&aOutput[i], _mm_unpacklo_ps(prodReLo, prodImLo));
    _mm_store_ps(&aOutput[i + 4], _mm_unpackhi_ps(prodReLo, prodImLo));
    _mm_store_ps(&aOutput[i + 8], _mm_unpacklo_ps(prodReHi, prodImHi));
    _mm_store_ps(&aOutput[i + 12], _mm_unpackhi_ps(prodReHi, prodImHi));
  }
}

예제 #7

0

파일 보기

파일: Float16.hpp 프로젝트: adamsmigielski/Utilities

		// Returns the transpose of the 4x4 matrix stored in the four 128-bit
		// members x, y, z, w of a.
		inline float16 Transpose(const float16 & a)
		{
			float16 stage, out;

			// First pass: interleave x with z and y with w.
			stage.x.m128 = _mm_unpacklo_ps(a.x.m128, a.z.m128);
			stage.z.m128 = _mm_unpackhi_ps(a.x.m128, a.z.m128);
			stage.y.m128 = _mm_unpacklo_ps(a.y.m128, a.w.m128);
			stage.w.m128 = _mm_unpackhi_ps(a.y.m128, a.w.m128);

			// Second pass: interleave the intermediate pairs to finish each
			// output vector.
			out.x.m128 = _mm_unpacklo_ps(stage.x.m128, stage.y.m128);
			out.y.m128 = _mm_unpackhi_ps(stage.x.m128, stage.y.m128);
			out.z.m128 = _mm_unpacklo_ps(stage.z.m128, stage.w.m128);
			out.w.m128 = _mm_unpackhi_ps(stage.z.m128, stage.w.m128);

			return out;
		}

예제 #8

0

파일 보기

파일: sound_sse.cpp 프로젝트: punkkeks/ClanLib

// Interleaves two float channels into 16-bit stereo samples.
//
// input[0] / input[1] hold `size` samples each; every sample is scaled by
// 32767 and written to output as (input[0][i], input[1][i]) pairs, so output
// must have room for 2 * size shorts.
//
// The SSE2 path handles groups of four frames, converting with
// _mm_cvtps_epi32 and saturating to the 16-bit range with _mm_packs_epi32.
// The scalar tail truncates toward zero and clamps to [-32768, 32767], so
// out-of-range or NaN samples no longer hit an undefined float-to-short
// conversion (NaN maps to -32768, matching the SSE2 path).
void SoundSSE::pack_16bit_stereo(float *input[2], int size, short *output)
{
#ifndef CL_DISABLE_SSE2
	int sse_size = (size/4)*4;

	__m128 constant1 = _mm_set1_ps(32767);
	for (int i = 0; i < sse_size; i+=4)
	{
		__m128 samples0 = _mm_loadu_ps(input[0]+i);
		__m128 samples1 = _mm_loadu_ps(input[1]+i);
		samples0 = _mm_mul_ps(samples0, constant1);
		samples1 = _mm_mul_ps(samples1, constant1);
		// Interleave the channels: (L0 R0 L1 R1) and (L2 R2 L3 R3).
		__m128 tmp0, tmp1;
		tmp0 = _mm_unpacklo_ps(samples0, samples1);
		tmp1 = _mm_unpackhi_ps(samples0, samples1);
		__m128i isamples0 = _mm_cvtps_epi32(tmp0);
		__m128i isamples1 = _mm_cvtps_epi32(tmp1);
		// Narrow to 16 bit with signed saturation.
		__m128i isamples = _mm_packs_epi32(isamples0, isamples1);
		_mm_storeu_si128((__m128i*)(output+i*2), isamples);
	}

#else
	const int sse_size = 0;
#endif

	// Pack remaining
	for (int i = sse_size; i < size; i++)
	{
		for (int ch = 0; ch < 2; ch++)
		{
			float scaled = input[ch][i]*32767;
			// The negated comparison also catches NaN.
			if (!(scaled > -32768.0f))
				scaled = -32768.0f;
			else if (scaled > 32767.0f)
				scaled = 32767.0f;
			output[i*2 + ch] = static_cast<short>(scaled);
		}
	}
}

예제 #9

0

파일 보기

파일: sound_sse.cpp 프로젝트: punkkeks/ClanLib

// Interleaves two float channels into one stereo buffer: output receives
// (input[0][i], input[1][i]) pairs for i in [0, size), so it must have room
// for 2 * size floats.  Groups of four frames go through SSE; the remainder
// is copied one frame at a time.
void SoundSSE::pack_float_stereo(float *input[2], int size, float *output)
{
	const float *left = input[0];
	const float *right = input[1];

#ifndef CL_DISABLE_SSE2
	int vector_frames = (size/4)*4;

	for (int frame = 0; frame < vector_frames; frame += 4)
	{
		const __m128 l = _mm_loadu_ps(left + frame);
		const __m128 r = _mm_loadu_ps(right + frame);
		float *dst = output + frame*2;
		_mm_storeu_ps(dst, _mm_unpacklo_ps(l, r));      // L0 R0 L1 R1
		_mm_storeu_ps(dst + 4, _mm_unpackhi_ps(l, r));  // L2 R2 L3 R3
	}

#else
	const int vector_frames = 0;
#endif

	// Frames that did not fill a complete group of four
	for (int frame = vector_frames; frame < size; frame++)
	{
		output[frame*2] = left[frame];
		output[frame*2 + 1] = right[frame];
	}
}

예제 #10

0

파일 보기

파일: MixerLoops.cpp 프로젝트: Kinglions/modizer

static void SSE2_FloatToStereoMix(const float *pIn1, const float *pIn2, int32 *pOut, uint32 nCount, const float _f2ic)
//--------------------------------------------------------------------------------------------------------------------
{
	__m128 f2ic = _mm_load_ps1(&_f2ic);
	__m128i *out = reinterpret_cast<__m128i *>(pOut);

	// We may read beyond the wanted length... this works because we know that we will always work on our buffers of size MIXBUFFERSIZE
	nCount = (nCount + 3) / 4;
	do
	{
		__m128 fl = _mm_loadu_ps(pIn1);			// Load four float values, LLLL
		__m128 fr = _mm_loadu_ps(pIn2);			// Load four float values, RRRR
		pIn1 += 4;
		pIn2 += 4;
		fl = _mm_mul_ps(fl, f2ic);				// Apply int->float factor
		fr = _mm_mul_ps(fr, f2ic);				// Apply int->float factor
		__m128 f1 = _mm_unpacklo_ps(fl, fr);	// LL__+RR__ => LRLR
		__m128 f2 = _mm_unpackhi_ps(fl, fr);	// __LL+__RR => LRLR
		__m128i i1 =_mm_cvtps_epi32(f1);		// Convert to four ints
		__m128i i2 =_mm_cvtps_epi32(f2);		// Convert to four ints
		_mm_storeu_si128(out, i1);				// Store four int values, LRLR
		_mm_storeu_si128(out + 1, i2);			// Store four int values, LRLR
		out += 2;
	} while(--nCount);
}

예제 #11

0

파일 보기

파일: mod-coord-sse.cpp 프로젝트: lensfun/lensfun

void lfModifier::ModifyCoord_Dist_PTLens_SSE (void *data, float *iocoord, int count)
{
  // See "Note about PT-based distortion models" at the top of mod-coord.cpp.
  /*
   * If buffer is not aligned, fall back to plain code
   */
  if((uintptr_t)(iocoord) & 0xf)
  {
    return ModifyCoord_Dist_PTLens(data, iocoord, count);
  }

  lfCoordDistCallbackData* cddata = (lfCoordDistCallbackData*) data;

  // Rd = Ru * (a_ * Ru^3 + b_ * Ru^2 + c_ * Ru + 1)
  __m128 a_ = _mm_set_ps1 (cddata->Terms [0]);
  __m128 b_ = _mm_set_ps1 (cddata->Terms [1]);
  __m128 c_ = _mm_set_ps1 (cddata->Terms [2]);
  __m128 cx = _mm_set_ps1 (cddata->centerX);
  __m128 cy = _mm_set_ps1 (cddata->centerY);
  __m128 cc = _mm_set_ps1 (cddata->coordinate_correction);
  __m128 one = _mm_set_ps1 (1.0f);

  // SSE Loop processes 4 pixels/loop
  int loop_count = count / 4;
  for (int i = 0; i < loop_count ; i++)
  {
    __m128 c0 = _mm_load_ps (&iocoord [8 * i]);
    __m128 c1 = _mm_load_ps (&iocoord [8 * i + 4]);
    __m128 x = _mm_shuffle_ps (c0, c1, _MM_SHUFFLE (2, 0, 2, 0));
    __m128 y = _mm_shuffle_ps (c0, c1, _MM_SHUFFLE (3, 1, 3, 1));
    x = _mm_sub_ps(_mm_mul_ps(x, cc), cx);
    y = _mm_sub_ps(_mm_mul_ps(y, cc), cy);

    __m128 ru2 = _mm_add_ps (_mm_mul_ps (x, x), _mm_mul_ps (y, y));
    __m128 ru = _mm_rcp_ps (_mm_rsqrt_ps (ru2));

    // Calculate poly3 = a_ * ru2 * ru + b_ * ru2 + c_ * ru + 1;
    __m128 t = _mm_mul_ps (ru2, b_);
    __m128 poly3 = _mm_mul_ps (_mm_mul_ps (a_, ru2), ru);
    t = _mm_add_ps (t, _mm_mul_ps (ru, c_));
    poly3 = _mm_add_ps (t, _mm_add_ps (poly3, one));

    x = _mm_add_ps(_mm_mul_ps (x, poly3), cx);
    y = _mm_add_ps(_mm_mul_ps (y, poly3), cy);
    x = _mm_div_ps (x, cc);
    y = _mm_div_ps (y, cc);

    c0 = _mm_unpacklo_ps(x, y);
    c1 = _mm_unpackhi_ps(x, y);

    _mm_store_ps (&iocoord [8 * i], c0);
    _mm_store_ps (&iocoord [8 * i + 4], c1);
  }

  loop_count *= 4;
  int remain = count - loop_count;
  if (remain)
    ModifyCoord_Dist_PTLens (data, &iocoord [loop_count * 2], remain);
}

예제 #12

0

파일 보기

파일: SimdFrustumSse.cpp 프로젝트: euler0/Helium

/// Compute the corner points of this view frustum.
///
/// The corners are first computed in struct-of-arrays form by ComputeCornersSoa() and then swizzled into one vector
/// per corner.  The first four corners are always written; corners four through seven are written only when eight
/// corners were computed.
///
/// @param[out] pCorners  Array in which the frustum corners will be stored.  This must point to a region of memory
///                       large enough for four points if this frustum has an infinite far clip plane, or eight
///                       points if this frustum has a normal far clip plane.
///
/// @return  Number of corners computed (either four or eight).
size_t Helium::Simd::Frustum::ComputeCorners( Vector3* pCorners ) const
{
    HELIUM_ASSERT( pCorners );

    // Scratch storage for the struct-of-arrays result.
    HELIUM_SIMD_ALIGN_PRE float32_t cornersX[ 8 ] HELIUM_SIMD_ALIGN_POST;
    HELIUM_SIMD_ALIGN_PRE float32_t cornersY[ 8 ] HELIUM_SIMD_ALIGN_POST;
    HELIUM_SIMD_ALIGN_PRE float32_t cornersZ[ 8 ] HELIUM_SIMD_ALIGN_POST;

    size_t cornerCount = ComputeCornersSoa( cornersX, cornersY, cornersZ );
    HELIUM_ASSERT( cornerCount == 4 || cornerCount == 8 );

    // One group of four corners is always emitted; a second group only for the eight-corner case.
    const size_t groupCount = ( cornerCount == 8 ) ? 2 : 1;
    for( size_t groupIndex = 0; groupIndex < groupCount; ++groupIndex )
    {
        const size_t base = groupIndex * 4;

        Helium::Simd::Register xs = Helium::Simd::LoadAligned( cornersX + base );
        Helium::Simd::Register ys = Helium::Simd::LoadAligned( cornersY + base );
        Helium::Simd::Register zs = Helium::Simd::LoadAligned( cornersZ + base );

        // Interleave to (x0 y0 x1 y1), (x2 y2 x3 y3), (z0 z0 z1 z1), (z2 z2 z3 z3).
        Helium::Simd::Register xyLow = _mm_unpacklo_ps( xs, ys );
        Helium::Simd::Register xyHigh = _mm_unpackhi_ps( xs, ys );
        Helium::Simd::Register zzLow = _mm_unpacklo_ps( zs, zs );
        Helium::Simd::Register zzHigh = _mm_unpackhi_ps( zs, zs );

        // Each corner becomes (x, y, z, z).
        pCorners[ base + 0 ].SetSimdVector( _mm_movelh_ps( xyLow, zzLow ) );
        pCorners[ base + 1 ].SetSimdVector( _mm_movehl_ps( zzLow, xyLow ) );
        pCorners[ base + 2 ].SetSimdVector( _mm_movelh_ps( xyHigh, zzHigh ) );
        pCorners[ base + 3 ].SetSimdVector( _mm_movehl_ps( zzHigh, xyHigh ) );
    }

    return cornerCount;
}

예제 #13

0

파일 보기

파일: MySSEFunctions.cpp 프로젝트: e-sha/Background-subtraction

	// Transposes, in place, the 4x4 float matrix whose rows are io_data0..io_data3.
	void Shuffle16Elems(__m128 &io_data0, __m128 &io_data1, __m128 &io_data2,
		__m128 &io_data3)
	{
		// Interleave neighbouring rows: low halves give columns 0/1, high
		// halves give columns 2/3.
		const __m128 lo01 = _mm_unpacklo_ps(io_data0, io_data1);
		const __m128 lo23 = _mm_unpacklo_ps(io_data2, io_data3);
		const __m128 hi01 = _mm_unpackhi_ps(io_data0, io_data1);
		const __m128 hi23 = _mm_unpackhi_ps(io_data2, io_data3);

		// Join matching 64-bit halves to complete each column.
		io_data0 = _mm_movelh_ps(lo01, lo23);
		io_data1 = _mm_movehl_ps(lo23, lo01);
		io_data2 = _mm_movelh_ps(hi01, hi23);
		io_data3 = _mm_movehl_ps(hi23, hi01);
	}

예제 #14

0

파일 보기

파일: matrix4.cpp 프로젝트: Nate1595/OpenTomb

// Returns the transpose of this matrix (rows x, y, z, w become columns).
matrix4 matrix4::transposed() const
{
#ifdef __SSE__
    // Interleave x with y and z with w ...
    const __m128 xyLow  = _mm_unpacklo_ps(x.v, y.v);   // x0 y0 x1 y1
    const __m128 zwLow  = _mm_unpacklo_ps(z.v, w.v);   // z0 w0 z1 w1
    const __m128 xyHigh = _mm_unpackhi_ps(x.v, y.v);   // x2 y2 x3 y3
    const __m128 zwHigh = _mm_unpackhi_ps(z.v, w.v);   // z2 w2 z3 w3

    // ... then join matching halves to form each transposed row.
    const __m128 t0 = _mm_movelh_ps(xyLow, zwLow);
    const __m128 t1 = _mm_movehl_ps(zwLow, xyLow);
    const __m128 t2 = _mm_movelh_ps(xyHigh, zwHigh);
    const __m128 t3 = _mm_movehl_ps(zwHigh, xyHigh);
    return matrix4(t0, t1, t2, t3);
#else
    const float4 t0(x.x, y.x, z.x, w.x);
    const float4 t1(x.y, y.y, z.y, w.y);
    const float4 t2(x.z, y.z, z.z, w.z);
    const float4 t3(x.w, y.w, z.w, w.w);
    return matrix4(t0, t1, t2, t3);
#endif
}

예제 #15

0

파일 보기

파일: nx_sse_float.hpp 프로젝트: mywoodstock/nxsimd

    // Returns a vector whose lane i is the sum of the four lanes of row[i],
    // for i in 0..3.
    inline vector4f haddp(const vector4f* row)
    {
#if SSE_INSTR_SET >= 3  // SSE3
        const __m128 sums01 = _mm_hadd_ps(row[0], row[1]);
        const __m128 sums23 = _mm_hadd_ps(row[2], row[3]);
        return _mm_hadd_ps(sums01, sums23);
#else
        // Fold each row pair in half: lanes become
        // (a0+a2, b0+b2, a1+a3, b1+b3) for rows a, b.
        const __m128 half01 = _mm_add_ps(_mm_unpacklo_ps(row[0], row[1]),
                                         _mm_unpackhi_ps(row[0], row[1]));
        const __m128 half23 = _mm_add_ps(_mm_unpacklo_ps(row[2], row[3]),
                                         _mm_unpackhi_ps(row[2], row[3]));
        // Add the two remaining partial sums of every row.
        return _mm_add_ps(_mm_movelh_ps(half01, half23),
                          _mm_movehl_ps(half23, half01));
#endif
    }

예제 #16

0

파일 보기

파일: simd.cpp 프로젝트: hjwhang/Image_Rescale

// Computes the four bilinear interpolation weights for the sample position
// (x, y).  With fx = x - floor(x) and fy = y - floor(y), the returned lanes
// (lowest first) are:
//   (1-fx)*(1-fy), fx*(1-fy), (1-fx)*fy, fx*fy
//
// floor() is built from a truncating conversion plus a correction for
// negative non-integers.  The previous _mm_cvtps_epi32 conversion rounds
// according to MXCSR (to nearest by default), which produced negative
// fractions for inputs such as 1.7.
// Like the original, this is only valid while x and y fit in a 32-bit int.
inline __m128 CalcWeights(float x, float y)
{
 __m128 ssx = _mm_set_ss(x);
 __m128 ssy = _mm_set_ss(y);
 __m128 psXY = _mm_unpacklo_ps(ssx, ssy);      // 0 0 y x

 //__m128 psXYfloor = _mm_floor_ps(psXY); // use this line for if you have SSE4
 __m128 psXYtrunc = _mm_cvtepi32_ps(_mm_cvttps_epi32(psXY)); // truncate toward zero
 // Truncation rounds negative non-integers up; subtract 1 in exactly those lanes.
 __m128 psXYfloor = _mm_sub_ps(psXYtrunc, _mm_and_ps(_mm_cmpgt_ps(psXYtrunc, psXY), CONST_1111));
 __m128 psXYfrac = _mm_sub_ps(psXY, psXYfloor); // = frac(psXY)
 
 __m128 psXYfrac1 = _mm_sub_ps(CONST_1111, psXYfrac); // ? ? (1-y) (1-x)
 __m128 w_x = _mm_unpacklo_ps(psXYfrac1, psXYfrac);   // ? ?     x (1-x)
        w_x = _mm_movelh_ps(w_x, w_x);      // x (1-x) x (1-x)
 __m128 w_y = _mm_shuffle_ps(psXYfrac1, psXYfrac, _MM_SHUFFLE(1, 1, 1, 1)); // y y (1-y) (1-y)

 // complete weight vector
 return _mm_mul_ps(w_x, w_y);
}

예제 #17

0

파일 보기

파일: main.cpp 프로젝트: KubaO/stackoverflown

// Processes `elements` three at a time: loads the x/y/z fields of three
// elements, transposes them into per-component vectors, pushes them through
// exp_ps and stores one result per element in its `re` field.
// num_elts must be a multiple of 3, and &elts[i].x must be 16-byte aligned
// with four readable floats behind it (aligned 4-float loads are used).
//
// NOTE(review): the reference formula below asks for
//   re = (x^a + y^a + z^a)^(1/a)
// but pow(v, a) equals exp(a * log(v)), whereas this code evaluates
// exp(v * log(a)) == a^v and exp(sum * log(1/a)) == (1/a)^sum.  The arithmetic
// has been left as it was; only the undefined identifier `sum1` was corrected
// to `s1`.  Confirm the intended math before relying on the results.
void fast(element_t * const elements, const int num_elts, const float a) {
    element_t * elts = elements;
    float logf_a = logf(a);
    float logf_1_a = logf(1.0/a);
    v4sf log_a = _mm_load1_ps(&logf_a);
    v4sf log_1_a = _mm_load1_ps(&logf_1_a);
    assert(num_elts % 3 == 0); // operates on 3 elements at a time

    // elts->re = powf((powf(elts->x, a) + powf(elts->y, a) + powf(elts->z, a)), 1.0/a);
    for (int i = 0; i < num_elts; i += 3) {
        // transpose
        // we save one operation over _MM_TRANSPOSE4_PS by skipping the last row of output
        v4sf r0 = _mm_load_ps(&elts[0].x); // x1,y1,z1,(4th float of element 1)
        v4sf r1 = _mm_load_ps(&elts[1].x); // x2,y2,z2,(4th float of element 2)
        v4sf r2 = _mm_load_ps(&elts[2].x); // x3,y3,z3,(4th float of element 3)
        v4sf r3 = _mm_setzero_ps();        // 0, 0, 0, 0
        v4sf t0 = _mm_unpacklo_ps(r0, r1); //  x1,x2,y1,y2
        v4sf t1 = _mm_unpacklo_ps(r2, r3); //  x3,0, y3,0
        v4sf t2 = _mm_unpackhi_ps(r0, r1); //  z1,z2,(4th),(4th)
        v4sf t3 = _mm_unpackhi_ps(r2, r3); //  z3,0, (4th),0
        r0 = _mm_movelh_ps(t0, t1);        // x1,x2,x3,0
        r1 = _mm_movehl_ps(t1, t0);        // y1,y2,y3,0
        r2 = _mm_movelh_ps(t2, t3);        // z1,z2,z3,0
        // exp(v * log(a)) per lane; see the NOTE(review) above the function
        v4sf r0a = _mm_mul_ps(r0, log_a); // x1*log(a), x2*log(a), x3*log(a), 0
        v4sf r1a = _mm_mul_ps(r1, log_a); // y1*log(a), y2*log(a), y3*log(a), 0
        v4sf r2a = _mm_mul_ps(r2, log_a); // z1*log(a), z2*log(a), z3*log(a), 0
        v4sf ex0 = exp_ps(r0a); // exp(x_k*log(a)); the unused 4th lane is exp(0)
        v4sf ex1 = exp_ps(r1a); // exp(y_k*log(a))
        v4sf ex2 = exp_ps(r2a); // exp(z_k*log(a))
        // sum
        v4sf s1 = _mm_add_ps(ex0, ex1);
        v4sf s2 = _mm_add_ps(s1, ex2);
        // exp(sum * log(1/a)) per lane
        v4sf ps = _mm_mul_ps(s2, log_1_a);
        v4sf es = exp_ps(ps);
        // Spill to an aligned scratch array; only the first three lanes are used.
        ALIGN16_BEG float re[4] ALIGN16_END;
        _mm_store_ps(re, es);
        elts[0].re = re[0];
        elts[1].re = re[1];
        elts[2].re = re[2];
        elts += 3;
    }
}

예제 #18

0

파일 보기

파일: avx512vl-vunpcklps-1.c 프로젝트: 0day-ci/gcc

/* Calls the 256-bit and 128-bit unpacklo_ps intrinsics in their plain,
   merge-masked (_mask_) and zero-masked (_maskz_) forms, using the mask
   value 2.  The operands x, y, z, xx, yy and zz are declared outside this
   function.
   NOTE(review): presumably a compile-only test whose generated assembly is
   checked by the test harness -- confirm against the directives at the top
   of the file before changing any statement.  */
void extern
avx512vl_test (void)
{
  /* 256-bit variants.  */
  x = _mm256_unpacklo_ps (y, z);
  x = _mm256_mask_unpacklo_ps (x, 2, y, z);
  x = _mm256_maskz_unpacklo_ps (2, y, z);
  /* 128-bit variants.  */
  xx = _mm_unpacklo_ps (yy, zz);
  xx = _mm_mask_unpacklo_ps (xx, 2, yy, zz);
  xx = _mm_maskz_unpacklo_ps (2, yy, zz);
}

예제 #19

0

파일 보기

파일: dct_simd.cpp 프로젝트: ArtisticCoding/OpenCP

// Forward 2x2 DCT on two packed 2x2 blocks, hard thresholding of the
// coefficients, and inverse 2x2 DCT.
//
// src / dest each hold 8 floats and must be 16-byte aligned: floats 0..3 are
// the top rows of the two blocks, floats 4..7 the bottom rows.  Coefficients
// whose magnitude is not greater than `thresh` are set to zero before the
// inverse transform (unless _KEEP_00_COEF_ is defined, in which case lane 0
// of the first coefficient vector is always kept).
//
// Uses SSE/SSE2 only: the comparison masks are all-ones or all-zeros per
// lane, so a bitwise AND selects exactly what _mm_blendv_ps against zero
// would, and _mm_move_ss replaces the lane-0 _mm_blend_ps.  The absolute
// value mask is built with intrinsics, so no compiler-specific alignment
// attribute or pointer cast is needed.
void fDCT2x2_2pack_32f_and_thresh_and_iDCT2x2_2pack(float* src, float* dest, float thresh)
{
	__m128 ms0 = _mm_load_ps(src);
	__m128 ms1 = _mm_load_ps(src + 4);
	const __m128 mm = _mm_set1_ps(0.5f);

	// Forward transform: vertical butterfly, regroup, horizontal butterfly.
	__m128 a = _mm_add_ps(ms0, ms1);
	__m128 b = _mm_sub_ps(ms0, ms1);

	__m128 t1 = _mm_unpacklo_ps(a, b);
	__m128 t2 = _mm_unpackhi_ps(a, b);
	ms0 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(1, 0, 1, 0));
	ms1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3, 2, 3, 2));

	a = _mm_mul_ps(mm, _mm_add_ps(ms0, ms1));
	b = _mm_mul_ps(mm, _mm_sub_ps(ms0, ms1));

	// Clears the sign bit of every lane, i.e. computes |x|.
	const __m128 absmask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
	const __m128 mth = _mm_set1_ps(thresh);

	// Keep a coefficient only where |coefficient| > thresh.
	__m128 msk = _mm_cmpgt_ps(_mm_and_ps(a, absmask), mth);
	ms0 = _mm_and_ps(a, msk);
#ifdef _KEEP_00_COEF_
	ms0 = _mm_move_ss(ms0, a);
#endif
	msk = _mm_cmpgt_ps(_mm_and_ps(b, absmask), mth);
	ms1 = _mm_and_ps(b, msk);

	// Inverse transform: the same butterfly sequence applied again.
	a = _mm_add_ps(ms0, ms1);
	b = _mm_sub_ps(ms0, ms1);

	t1 = _mm_unpacklo_ps(a, b);
	t2 = _mm_unpackhi_ps(a, b);
	ms0 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(1, 0, 1, 0));
	ms1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3, 2, 3, 2));

	a = _mm_mul_ps(mm, _mm_add_ps(ms0, ms1));
	b = _mm_mul_ps(mm, _mm_sub_ps(ms0, ms1));

	_mm_store_ps(dest, a);
	_mm_store_ps(dest + 4, b);
}

예제 #20

0

파일 보기

파일: bump.cpp 프로젝트: bustercopley/polymorph

v4f step_t::operator () (float t) const
{
  // Piecewise step, broadcast to all four lanes:
  //   0     below the first bound,
  //   f(t)  from the first bound up to (not including) the second,
  //   1     from the second bound upwards (below the last entry of T),
  // where f(t) = c0 + c1*t + c2*t^2 + c3*t^3 is evaluated by Estrin's method.
  v4f ones = { 1.0f, 1.0f, 1.0f, 1.0f };
  v4f coeffs = load4f (c);
  v4f t_all = _mm_set1_ps (t);                       // t t t t
  v4f powers = _mm_unpacklo_ps (ones, t_all);        // 1 t 1 t

  // Pair up the coefficients, then scale the upper pair by t^2.
  v4f terms = coeffs * powers;                       // c0 c1*t c2 c3*t
  v4f pairs = _mm_hadd_ps (terms, terms) * powers * powers; // c0+c1*t (c2+c3*t)*t^2 ...
  v4f poly = _mm_hadd_ps (pairs, pairs);             // f f f f

  // Candidate values per lane, and the interval each one is valid for.
  v4f candidates = _mm_unpacklo_ps (poly, ones);     // f 1 f 1
  v4f bounds = load4f (T);                           // t0  t1 t1 inf
  v4f lower = _mm_movelh_ps (bounds, bounds);        // t0  t1 t0  t1
  v4f upper = _mm_movehl_ps (bounds, bounds);        // t1 inf t1 inf
  v4f in_range = _mm_and_ps (_mm_cmpge_ps (t_all, lower), _mm_cmplt_ps (t_all, upper));

  // At most one candidate of each pair survives the mask; adding the pair
  // leaves the selected value (or zero) in every lane.
  v4f chosen = _mm_and_ps (in_range, candidates);    // f? 1? f? 1?
  return _mm_hadd_ps (chosen, chosen);
}

예제 #21

0

파일 보기

파일: FastResampler_FirFilter_SSE2.cpp 프로젝트: Faik-man/ssr

// FIR filter step for an arbitrary number of interleaved channels.
//
// For every channel c, computes
//   output[c] = sum_i (coef1[i] + (coef2[i] - coef1[i]) * frac) * input[i * channels + c]
// over the first (filter_length / 4) * 4 taps.  coef1 and coef2 are read with
// aligned loads and must be 16-byte aligned.
//
// The coefficient pointers are restarted for every channel; previously they
// kept advancing across channels, so every channel after the first was
// filtered with memory located behind the coefficient arrays.
void FastResampler_FirFilter2_Cn_SSE2(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output) {
	const __m128 v_frac = _mm_set1_ps(frac);
	for(unsigned int c = 0; c < channels; ++c) {
		__m128 sum = _mm_setzero_ps();
		const float *c1 = coef1, *c2 = coef2;
		const float *input2 = input + c;
		for(unsigned int i = 0; i < filter_length / 4; ++i) {
			__m128 v_coef1 = _mm_load_ps(c1), v_coef2 = _mm_load_ps(c2);
			c1 += 4; c2 += 4;
			// Linear interpolation between the two coefficient sets.
			__m128 filter_value = _mm_add_ps(v_coef1, _mm_mul_ps(_mm_sub_ps(v_coef2, v_coef1), v_frac));
			// Gather four consecutive samples of this channel (stride = channels).
			__m128 v_input1 = _mm_load_ss(input2); input2 += channels;
			__m128 v_input2 = _mm_load_ss(input2); input2 += channels;
			__m128 v_input3 = _mm_load_ss(input2); input2 += channels;
			__m128 v_input4 = _mm_load_ss(input2); input2 += channels;
			__m128 v_input = _mm_movelh_ps(_mm_unpacklo_ps(v_input1, v_input2), _mm_unpacklo_ps(v_input3, v_input4));
			sum = _mm_add_ps(sum, _mm_mul_ps(v_input, filter_value));
		}
		// Horizontal sum of the four partial sums.
		__m128 sum2 = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, 0x0e));
		__m128 sum3 = _mm_add_ss(sum2, _mm_shuffle_ps(sum2, sum2, 0x01));
		_mm_store_ss(output + c, sum3);
	}
}

예제 #22

0

파일 보기

파일: BoundingBox.cpp 프로젝트: boberfly/Urho3D

// Returns the axis-aligned box that encloses this box after it has been
// transformed: the centre is transformed as a point, and the new half-extent
// is the absolute value of the matrix's 3x3 part applied to the old
// half-extent.
BoundingBox BoundingBox::Transformed(const Matrix3x4& transform) const
{
#ifdef URHO3D_SSE
    const __m128 unitX = _mm_set_ss(1.f);   // (1, 0, 0, 0)

    // Assemble (x, y, z, 1) for both corners.
    __m128 lowCorner = _mm_movelh_ps(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)&min_.x_), _mm_unpacklo_ps(_mm_set_ss(min_.z_), unitX));
    __m128 highCorner = _mm_movelh_ps(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)&max_.x_), _mm_unpacklo_ps(_mm_set_ss(max_.z_), unitX));

    // Centre keeps w = 1 (point); the half-extent ends up with w = 0 (direction).
    __m128 centre = _mm_mul_ps(_mm_add_ps(lowCorner, highCorner), _mm_set1_ps(0.5f));
    __m128 halfExtent = _mm_sub_ps(centre, lowCorner);

    __m128 row0 = _mm_loadu_ps(&transform.m00_);
    __m128 row1 = _mm_loadu_ps(&transform.m10_);
    __m128 row2 = _mm_loadu_ps(&transform.m20_);

    // Three dot products (row . centre), reduced two rows at a time.
    __m128 c0 = _mm_mul_ps(row0, centre);
    __m128 c1 = _mm_mul_ps(row1, centre);
    __m128 c01 = _mm_add_ps(_mm_unpacklo_ps(c0, c1), _mm_unpackhi_ps(c0, c1));
    __m128 c2 = _mm_mul_ps(row2, centre);
    const __m128 zeros = _mm_setzero_ps();
    __m128 c2z = _mm_add_ps(_mm_unpacklo_ps(c2, zeros), _mm_unpackhi_ps(c2, zeros));
    __m128 movedCentre = _mm_add_ps(_mm_movelh_ps(c01, c2z), _mm_movehl_ps(c2z, c01));

    // Same reduction with |row * halfExtent| to obtain the new half-extent.
    const __m128 clearSign = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    __m128 e0 = _mm_and_ps(clearSign, _mm_mul_ps(row0, halfExtent));
    __m128 e1 = _mm_and_ps(clearSign, _mm_mul_ps(row1, halfExtent));
    __m128 e2 = _mm_and_ps(clearSign, _mm_mul_ps(row2, halfExtent));
    __m128 e01 = _mm_add_ps(_mm_unpacklo_ps(e0, e1), _mm_unpackhi_ps(e0, e1));
    __m128 e2z = _mm_add_ps(_mm_unpacklo_ps(e2, zeros), _mm_unpackhi_ps(e2, zeros));
    __m128 movedExtent = _mm_add_ps(_mm_movelh_ps(e01, e2z), _mm_movehl_ps(e2z, e01));

    return BoundingBox(_mm_sub_ps(movedCentre, movedExtent), _mm_add_ps(movedCentre, movedExtent));
#else
    Vector3 movedCentre = transform * Center();
    Vector3 halfExtent = Size() * 0.5f;
    Vector3 movedExtent = Vector3(
        Abs(transform.m00_) * halfExtent.x_ + Abs(transform.m01_) * halfExtent.y_ + Abs(transform.m02_) * halfExtent.z_,
        Abs(transform.m10_) * halfExtent.x_ + Abs(transform.m11_) * halfExtent.y_ + Abs(transform.m12_) * halfExtent.z_,
        Abs(transform.m20_) * halfExtent.x_ + Abs(transform.m21_) * halfExtent.y_ + Abs(transform.m22_) * halfExtent.z_
    );

    return BoundingBox(movedCentre - movedExtent, movedCentre + movedExtent);
#endif
}

예제 #23

0

파일 보기

파일: RigidTransform.hpp 프로젝트: BackupTheBerlios/slon

/** Transform a 4-component vector by a rigid transform.
 *
 *  Every path computes result[i] = dot(mat[i], vec) over all four components.
 *
 *  Fixes relative to the previous version:
 *   - SSE4 path: the _mm_dp_ps masks excluded component 0 from the products
 *     and the _mm_blend_ps masks were built with _MM_SHUFFLE (a shuffle
 *     selector, not a per-lane blend mask).  Each dot product is now written
 *     straight into its own lane and the lanes are OR-ed together.
 *   - Scalar path: the products read the uninitialised result instead of the
 *     input vector.
 */
inline Matrix<float, 4, 1> operator * (const RigidTransform<float>& mat, const Matrix<float, 4, 1>& vec)
{
#ifdef SIMPLE_GL_USE_SSE4
    // _mm_dp_ps immediate: high nibble selects the components that are
    // multiplied and summed, low nibble selects the lanes that receive the
    // sum (all other lanes are zeroed).
    __m128 res;

    res = _mm_dp_ps(mat[0].m128, vec.m128, 0xF1);
    res = _mm_or_ps(res, _mm_dp_ps(mat[1].m128, vec.m128, 0xF2));
    res = _mm_or_ps(res, _mm_dp_ps(mat[2].m128, vec.m128, 0xF4));
    res = _mm_or_ps(res, _mm_dp_ps(mat[3].m128, vec.m128, 0xF8));

    return Matrix<float, 4, 1>(res);
#elif defined(SIMPLE_GL_USE_SSE3)
    __m128 res;

    // Two horizontal adds leave the full dot product in every lane.
    __m128 dotProd0 = _mm_mul_ps(mat[0].m128, vec.m128);
    dotProd0        = _mm_hadd_ps(dotProd0, dotProd0);
    dotProd0        = _mm_hadd_ps(dotProd0, dotProd0);

    __m128 dotProd1 = _mm_mul_ps(mat[1].m128, vec.m128);
    dotProd1        = _mm_hadd_ps(dotProd1, dotProd1);
    dotProd1        = _mm_hadd_ps(dotProd1, dotProd1);

    __m128 dotProd2 = _mm_mul_ps(mat[2].m128, vec.m128);
    dotProd2        = _mm_hadd_ps(dotProd2, dotProd2);
    dotProd2        = _mm_hadd_ps(dotProd2, dotProd2);

    __m128 dotProd3 = _mm_mul_ps(mat[3].m128, vec.m128);
    dotProd3        = _mm_hadd_ps(dotProd3, dotProd3);
    dotProd3        = _mm_hadd_ps(dotProd3, dotProd3);

    // Gather (d0, d1) and (d2, d3), then join them into (d0, d1, d2, d3).
    __m128 vec01    = _mm_unpacklo_ps(dotProd0, dotProd1);
    __m128 vec23    = _mm_unpackhi_ps(dotProd2, dotProd3);
    res             = _mm_movelh_ps(vec01, vec23);

    return Matrix<float, 4, 1>(res);
#else // SSE2
    // TODO: Think about good sse optimization
    Matrix<float, 4, 1> res;
    res[0] = mat[0][0] * vec[0] + mat[0][1] * vec[1] + mat[0][2] * vec[2] + mat[0][3] * vec[3];
    res[1] = mat[1][0] * vec[0] + mat[1][1] * vec[1] + mat[1][2] * vec[2] + mat[1][3] * vec[3];
    res[2] = mat[2][0] * vec[0] + mat[2][1] * vec[1] + mat[2][2] * vec[2] + mat[2][3] * vec[3];
    res[3] = mat[3][0] * vec[0] + mat[3][1] * vec[1] + mat[3][2] * vec[2] + mat[3][3] * vec[3];
    return res;
#endif
}

예제 #24

0

파일 보기

파일: FastResampler_FirFilter_SSE2.cpp 프로젝트: Faik-man/ssr

// FIR filter step over two coefficient tables blended by 'frac'.
//
// For every group of four taps the effective coefficient is
//     coef1[k] + (coef2[k] - coef1[k]) * frac
// and each coefficient is applied to two consecutive input floats
// (input[2k] and input[2k+1]). The products landing on even and odd input
// positions are accumulated separately and written to output[0] and output[1].
//
// 'channels' is unused. Only filter_length / 4 full groups are processed.
// coef1/coef2 are read with aligned loads, input with unaligned loads.
void FastResampler_FirFilter2_C2_SSE2(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output) {
	Q_UNUSED(channels);
	const __m128 blend = _mm_set1_ps(frac);
	const unsigned int groups = filter_length / 4;
	__m128 acc = _mm_setzero_ps();
	for(unsigned int g = 0; g < groups; ++g, coef1 += 4, coef2 += 4, input += 8) {
		const __m128 base = _mm_load_ps(coef1);
		const __m128 next = _mm_load_ps(coef2);
		// Linear interpolation between the two coefficient tables.
		const __m128 taps = _mm_add_ps(base, _mm_mul_ps(_mm_sub_ps(next, base), blend));
		// Duplicate every tap so it lines up with a pair of input floats.
		const __m128 taps_lo = _mm_unpacklo_ps(taps, taps);
		const __m128 taps_hi = _mm_unpackhi_ps(taps, taps);
		const __m128 in_lo = _mm_loadu_ps(input);
		const __m128 in_hi = _mm_loadu_ps(input + 4);
		acc = _mm_add_ps(acc, _mm_mul_ps(in_lo, taps_lo));
		acc = _mm_add_ps(acc, _mm_mul_ps(in_hi, taps_hi));
	}
	// Fold lanes 2,3 onto lanes 0,1 and store the low two floats as one 64-bit value.
	const __m128 folded = _mm_add_ps(acc, _mm_shuffle_ps(acc, acc, 0xee));
	_mm_store_sd((double*) output, _mm_castps_pd(folded));
}

예제 #25

0

파일 보기

파일: cfft.c 프로젝트: TravisKraatz/cinelerra

/*
 * SSE radix-2 pass: for every k in [0, l1) and every i in [0, ido), stepping
 * by 4, reads elements of cc at offsets ac+i and ac+ido+i, writes their sum
 * to ch[ah+i ...] and their difference, multiplied by the matching wa entries,
 * to ch[ah+i+l1*ido ...].
 *
 * Each _mm_load_ps/_mm_store_ps moves four floats and requires 16-byte aligned
 * addresses. Presumably complex_t is a pair of floats (re, im) and RE() selects
 * the real part, so one register holds two complex values and one inner
 * iteration handles four of them - confirm against the complex_t definition.
 * The loop does not handle a tail: ido is consumed in steps of 4.
 */
static void passf2pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc,
                              complex_t *ch, const complex_t *wa)
{
    uint16_t i, k, ah, ac;

    for (k = 0; k < l1; k++)
    {
        ah = k*ido;      /* output offset of this k  */
        ac = 2*k*ido;    /* input offset of this k   */

        for (i = 0; i < ido; i+=4)
        {
            /* m1..m14 process elements i, i+1; m15..m24 mirror them for i+2, i+3. */
            __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14;
            __m128 m15, m16, m17, m18, m19, m20, m21, m22, m23, m24;
            __m128 w1, w2, w3, w4;

            m1 = _mm_load_ps(&RE(cc[ac+i]));
            m2 = _mm_load_ps(&RE(cc[ac+ido+i]));
            m5 = _mm_load_ps(&RE(cc[ac+i+2]));
            m6 = _mm_load_ps(&RE(cc[ac+ido+i+2]));
            w1 = _mm_load_ps(&RE(wa[i]));
            w3 = _mm_load_ps(&RE(wa[i+2]));

            /* sums */
            m3 = _mm_add_ps(m1, m2);
            m15 = _mm_add_ps(m5, m6);

            /* differences d = (d0, d1, d2, d3) */
            m4 = _mm_sub_ps(m1, m2);
            m16 = _mm_sub_ps(m5, m6);

            /* first output half: the plain sums */
            _mm_store_ps(&RE(ch[ah+i]), m3);
            _mm_store_ps(&RE(ch[ah+i+2]), m15);


            /* swap neighbouring lanes of w: (w1, w0, w3, w2) */
            w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1));
            w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1));

            /* m7 = (d0*w0, d1*w1, d2*w2, d3*w3), m8 = (d0*w1, d1*w0, d2*w3, d3*w2) */
            m7 = _mm_mul_ps(m4, w1);
            m17 = _mm_mul_ps(m16, w3);
            m8 = _mm_mul_ps(m4, w2);
            m18 = _mm_mul_ps(m16, w4);

            /* m9  = (d0*w0, d2*w2, d0*w1, d2*w3): products of the even d lanes */
            m9  = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0));
            m19 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(2, 0, 2, 0));
            /* m10 = (d1*w1, d3*w3, d1*w0, d3*w2): products of the odd d lanes */
            m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1));
            m20 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(3, 1, 3, 1));

            /* only lanes 2,3 of the sum and lanes 0,1 of the difference are used below */
            m11 = _mm_add_ps(m9, m10);
            m21 = _mm_add_ps(m19, m20);
            m12 = _mm_sub_ps(m9, m10);
            m22 = _mm_sub_ps(m19, m20);

            /* move lanes 2,3 of the sum down to lanes 0,1 */
            m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2));
            m23 = _mm_shuffle_ps(m21, m21, _MM_SHUFFLE(0, 0, 3, 2));

            /*
             * Interleave into
             *   (d0*w0 - d1*w1, d0*w1 + d1*w0, d2*w2 - d3*w3, d2*w3 + d3*w2),
             * which is the complex product d*w if lanes are (re, im) pairs.
             */
            m14 = _mm_unpacklo_ps(m12, m13);
            m24 = _mm_unpacklo_ps(m22, m23);

            /* second output half: the weighted differences */
            _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14);
            _mm_store_ps(&RE(ch[ah+i+2+l1*ido]), m24);
        }
    }
}

예제 #26

0

파일 보기

파일: aec_rdft_sse2.c 프로젝트: 0x0B501E7E/webrtc

// In-place pass over the 128-float array 'a' that combines each pair
// (a[j2], a[j2 + 1]) with its mirror pair (a[k2], a[k2 + 1]), k2 = 128 - j2,
// using weights taken from rdft_w + 32 (wkr = 0.5 - c[32 - j1], wki = c[j1]).
// a[1] is negated before and a[65] after the pass.
// Presumably this is the backward real-FFT sub-step of the 128-point rdft;
// confirm against the scalar reference implementation.
//
// The SSE2 loop handles four (j2, k2) pairs per iteration (j2 = 2, 10, ..., 50);
// the scalar loop below finishes the remaining pairs (j2 = 58, 60, 62).
static void rftbsub_128_SSE2(float* a) {
  const float* c = rdft_w + 32;
  int j1, j2, k1, k2;
  float wkr, wki, xr, xi, yr, yi;

  static const ALIGN16_BEG float ALIGN16_END
      k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f};
  const __m128 mm_half = _mm_load_ps(k_half);

  a[1] = -a[1];
  // Vectorized code (four at once).
  //    Note: commented number are indexes for the first iteration of the loop.
  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
    // Load 'wk'.
    const __m128 c_j1 = _mm_loadu_ps(&c[j1]);       //  1,  2,  3,  4,
    const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);  // 28, 29, 30, 31,
    const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);  // 28, 29, 30, 31,
    // Reverse the lanes so that lane n pairs with k1 = 32 - (j1 + n).
    const __m128 wkr_ =
        _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3));  // 31, 30, 29, 28,
    const __m128 wki_ = c_j1;                                 //  1,  2,  3,  4,
    // Load and shuffle 'a'.
    const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]);    //   2,   3,   4,   5,
    const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]);    //   6,   7,   8,   9,
    const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]);  // 120, 121, 122, 123,
    const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]);  // 124, 125, 126, 127,
    // De-interleave: p0 holds the even-index elements, p1 the odd-index ones.
    const __m128 a_j2_p0 = _mm_shuffle_ps(
        a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0));  //   2,   4,   6,   8,
    const __m128 a_j2_p1 = _mm_shuffle_ps(
        a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1));  //   3,   5,   7,   9,
    // Same for the mirrored side, additionally reversed to line up with j2.
    const __m128 a_k2_p0 = _mm_shuffle_ps(
        a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2));  // 126, 124, 122, 120,
    const __m128 a_k2_p1 = _mm_shuffle_ps(
        a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3));  // 127, 125, 123, 121,
    // Calculate 'x'.
    //    xr = a[j2 + 0] - a[k2 + 0];
    //    xi = a[j2 + 1] + a[k2 + 1];
    const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
    // 2-126, 4-124, 6-122, 8-120,
    const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
    // 3+127, 5+125, 7+123, 9+121,
    // Calculate product into 'y'.
    //    yr = wkr * xr + wki * xi;
    //    yi = wkr * xi - wki * xr;
    const __m128 a_ = _mm_mul_ps(wkr_, xr_);
    const __m128 b_ = _mm_mul_ps(wki_, xi_);
    const __m128 c_ = _mm_mul_ps(wkr_, xi_);
    const __m128 d_ = _mm_mul_ps(wki_, xr_);
    const __m128 yr_ = _mm_add_ps(a_, b_);  // pairs (2,126), (4,124), (6,122), (8,120)
    const __m128 yi_ = _mm_sub_ps(c_, d_);  // pairs (3,127), (5,125), (7,123), (9,121)
    // Update 'a'.
    //    a[j2 + 0] = a[j2 + 0] - yr;
    //    a[j2 + 1] = yi - a[j2 + 1];
    //    a[k2 + 0] = yr + a[k2 + 0];
    //    a[k2 + 1] = yi - a[k2 + 1];
    const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_);  //   2,   4,   6,   8,
    const __m128 a_j2_p1n = _mm_sub_ps(yi_, a_j2_p1);  //   3,   5,   7,   9,
    const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_);  // 126, 124, 122, 120,
    const __m128 a_k2_p1n = _mm_sub_ps(yi_, a_k2_p1);  // 127, 125, 123, 121,
    // Shuffle in right order and store.
    // Re-interleave even/odd elements back into memory order.
    const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
    //   2,   3,   4,   5,
    const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
    //   6,   7,   8,   9,
    const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
    // 122, 123, 120, 121,
    const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
    // 126, 127, 124, 125,
    // Swap the 64-bit halves to undo the reversal of the mirrored side.
    const __m128 a_k2_0n = _mm_shuffle_ps(
        a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2));  // 120, 121, 122, 123,
    const __m128 a_k2_4n = _mm_shuffle_ps(
        a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2));  // 124, 125, 126, 127,
    _mm_storeu_ps(&a[0 + j2], a_j2_0n);
    _mm_storeu_ps(&a[4 + j2], a_j2_4n);
    _mm_storeu_ps(&a[122 - j2], a_k2_0n);
    _mm_storeu_ps(&a[126 - j2], a_k2_4n);
  }
  // Scalar code for the remaining items.
  // Continues from the j1/j2 values the vector loop stopped at.
  for (; j2 < 64; j1 += 1, j2 += 2) {
    k2 = 128 - j2;
    k1 = 32 - j1;
    wkr = 0.5f - c[k1];
    wki = c[j1];
    xr = a[j2 + 0] - a[k2 + 0];
    xi = a[j2 + 1] + a[k2 + 1];
    yr = wkr * xr + wki * xi;
    yi = wkr * xi - wki * xr;
    a[j2 + 0] = a[j2 + 0] - yr;
    a[j2 + 1] = yi - a[j2 + 1];
    a[k2 + 0] = yr + a[k2 + 0];
    a[k2 + 1] = yi - a[k2 + 1];
  }
  a[65] = -a[65];
}

예제 #27

0

파일 보기

파일: brushtool.cpp 프로젝트: BuildandShoot/terravox

// Returns the brush tip terrain, creating it on first use and refilling its
// landform whenever it was just created or (for the Mountains tip only) the
// requested origin differs from lastTipOrigin_.
//
// An origin whose x is below -500 is replaced by lastTipOrigin_.
// Every tip shape writes one value per cell of a size x size grid, where
// size = parameters_.size.
QSharedPointer<Terrain> BrushTool::tip(QPoint origin)
{
    bool regenerate = false;
    if (!tip_) {
        tip_ = QSharedPointer<Terrain>::create(QSize(parameters_.size, parameters_.size));
        regenerate = true;
    }

    if (origin.x() < -500) {
        origin = lastTipOrigin_;
    }

    // Only the Mountains tip depends on the origin; all other shapes are
    // position invariant.
    if (origin != lastTipOrigin_ && parameters_.tipType == BrushTipType::Mountains) {
        regenerate = true;
    }

    if (!regenerate) {
        return tip_;
    }

    Terrain *terrain = tip_.data();
    auto size = parameters_.size;

    float scale = 1.f / size;

    // Squared distance of cell (x, y) from the grid centre, measured in
    // half-cell units so that it stays an integer.
    auto centreDistSq = [&](int x, int y) -> int {
        int cx = (x << 1) - size + 1;
        int cy = (y << 1) - size + 1;
        return cx * cx + cy * cy;
    };

    // 1 minus the centre distance scaled by 'scale'; values <= 0 are treated
    // as "outside the tip" by the callers.
    auto linearFalloff = [&](int x, int y) -> float {
        return 1.f - sqrtf(centreDistSq(x, y)) * scale;
    };

    switch (parameters_.tipType) {
    case BrushTipType::Mountains:
        {
            // Set rounding mode (required by CoherentNoiseGenerator)
            SseRoundingModeScope roundingModeScope(_MM_ROUND_DOWN);
            (void) roundingModeScope;

            // Re-seed the generator only when the seed parameter changed.
            if (noiseGenSeed != parameters_.seed) {
                noiseGenSeed = parameters_.seed;
                noiseGen.randomize(static_cast<std::uint_fast32_t>(noiseGenSeed));
            }

            auto noise = noiseGen.sampler();
            __m128i originMM = _mm_setr_epi32(origin.x(), origin.y(), 0, 0);
            float noiseScale = 10.f / parameters_.scale;

            for (int y = 0; y < size; ++y) {
                for (int x = 0; x < size; ++x) {
                    float falloff = linearFalloff(x, y);
                    if (falloff <= 0.f) {
                        terrain->landform(x, y) = 0.f;
                        continue;
                    }

                    // Cell position offset by the origin, in noise space.
                    auto cellI = _mm_add_epi32(_mm_setr_epi32(x, y, 0, 0), originMM);
                    auto coord = _mm_mul_ps(_mm_cvtepi32_ps(cellI), _mm_set1_ps(noiseScale));
                    auto octave1 = _mm_mul_ps(coord, _mm_set1_ps(0.1f));
                    // Lower lanes become (x + y, x - y) for the remaining octaves.
                    coord = _mm_unpacklo_ps(_mm_hadd_ps(coord, coord), _mm_hsub_ps(coord, coord));
                    auto octave2 = _mm_mul_ps(coord, _mm_set1_ps(0.15f));
                    auto octave3 = _mm_mul_ps(coord, _mm_set1_ps(0.3f));
                    auto octave4 = _mm_mul_ps(coord, _mm_set1_ps(0.03f));

                    float noiseSum = noise.sample(octave1);
                    noiseSum += noise.sample(octave2) * .3f;
                    noiseSum += noise.sample(octave3) * .15f;
                    noiseSum += noise.sample(octave4) * 1.5f;
                    noiseSum = std::max(std::min(0.5f + noiseSum * 1.1f, 1.f), 0.f);

                    // Shape the falloff, then pull it back towards the plain
                    // linear falloff by the amount of noise.
                    float shaped = falloff;
                    shaped *= shaped * (3.f - 2.f * shaped) * 0.8f;
                    shaped *= shaped;
                    shaped -= 0.1f;
                    shaped += (falloff - shaped) * std::abs(noiseSum);

                    terrain->landform(x, y) = std::max(0.f, shaped);
                }
            }
        }
        break;
    case BrushTipType::Bell:
        for (int y = 0; y < size; ++y) {
            for (int x = 0; x < size; ++x) {
                float falloff = linearFalloff(x, y);
                float height = 0.f;
                if (!(falloff <= 0.f)) {
                    falloff *= falloff * (3.f - 2.f * falloff);
                    height = falloff;
                }
                terrain->landform(x, y) = height;
            }
        }
        break;
    case BrushTipType::Cone:
        for (int y = 0; y < size; ++y) {
            for (int x = 0; x < size; ++x) {
                const float falloff = linearFalloff(x, y);
                const float height = (falloff <= 0.f) ? 0.f : falloff;
                terrain->landform(x, y) = height;
            }
        }
        break;
    case BrushTipType::Sphere:
        // The sphere works on the squared distance, so square the scale too.
        scale *= scale;
        for (int y = 0; y < size; ++y) {
            for (int x = 0; x < size; ++x) {
                const float inside = 1.f - centreDistSq(x, y) * scale;
                const float height = (inside <= 0.f) ? 0.f : std::sqrt(inside);
                terrain->landform(x, y) = height;
            }
        }
        break;
    case BrushTipType::Cylinder:
        for (int y = 0; y < size; ++y) {
            for (int x = 0; x < size; ++x) {
                const float inside = size * size - centreDistSq(x, y);
                const float height = (inside <= 0.f) ? 0.f : 1.f;
                terrain->landform(x, y) = height;
            }
        }
        break;
    case BrushTipType::Square:
        for (int y = 0; y < size; ++y) {
            for (int x = 0; x < size; ++x) {
                terrain->landform(x, y) = 1.f;
            }
        }
        break;
    }

    return tip_;
}

예제 #28

0

파일 보기

파일: AudioSource.cpp 프로젝트: ascendedguard/OBS

// Pulls the next captured buffer (if any) and queues it as a stereo float
// audio segment.
//
// Steps, in order:
//   1. convert the captured samples to float (8/16/24/32-bit integer input),
//   2. up-/down-mix to two interleaved channels into tempBuffer,
//   3. release the capture buffer,
//   4. optionally resample through src_process,
//   5. pick a timestamp (smoothed or raw) and hand the data to AddAudioSegment.
//
// curVolume is multiplied by sourceVolume and passed on with the segment.
// Returns AudioAvailable when a buffer was processed, NoAudioAvailable when no
// buffer was available or resampling failed.
UINT AudioSource::QueryAudio(float curVolume)
{
    LPVOID buffer;
    UINT numAudioFrames;
    QWORD newTimestamp;

    if(GetNextBuffer((void**)&buffer, &numAudioFrames, &newTimestamp))
    {
        //------------------------------------------------------------
        // convert to float

        float *captureBuffer;

        if(!bFloat)
        {
            UINT totalSamples = numAudioFrames*inputChannels;
            if(convertBuffer.Num() < totalSamples)
                convertBuffer.SetSize(totalSamples);

            if(inputBitsPerSample == 8)
            {
                // NOTE(review): samples are read as signed char here; confirm the
                // capture format really delivers signed 8-bit PCM.
                float *tempConvert = convertBuffer.Array();
                char *tempSByte = (char*)buffer;

                while(totalSamples--)
                {
                    *(tempConvert++) = float(*(tempSByte++))/127.0f;
                }
            }
            else if(inputBitsPerSample == 16)
            {
                float *tempConvert = convertBuffer.Array();
                short *tempShort = (short*)buffer;

                while(totalSamples--)
                {
                    *(tempConvert++) = float(*(tempShort++))/32767.0f;
                }
            }
            else if(inputBitsPerSample == 24)
            {
                float *tempConvert = convertBuffer.Array();
                BYTE *tempTriple = (BYTE*)buffer;
                TripleToLong valOut;

                while(totalSamples--)
                {
                    // Reinterpret the sample bytes tempTriple POINTS AT. Casting the
                    // pointer variable itself would convert the bits of the address
                    // instead of the audio data.
                    TripleToLong &valIn  = *(TripleToLong*)tempTriple;

                    valOut.wVal = valIn.wVal;
                    valOut.tripleVal = valIn.tripleVal;
                    // Sign-extend into the top byte. It has to be written for
                    // non-negative samples as well: valOut is reused across
                    // iterations and starts out uninitialised.
                    valOut.lastByte = (valOut.tripleVal > 0x7F) ? 0xFF : 0x00;

                    *(tempConvert++) = float(double(valOut.val)/8388607.0);
                    tempTriple += 3;
                }
            }
            else if(inputBitsPerSample == 32)
            {
                float *tempConvert = convertBuffer.Array();
                long *tempShort = (long*)buffer;

                while(totalSamples--)
                {
                    *(tempConvert++) = float(double(*(tempShort++))/2147483647.0);
                }
            }

            captureBuffer = convertBuffer.Array();
        }
        else
            captureBuffer = (float*)buffer;

        //------------------------------------------------------------
        // channel upmix/downmix

        // Output is always two floats per frame.
        if(tempBuffer.Num() < numAudioFrames*2)
            tempBuffer.SetSize(numAudioFrames*2);

        float *dataOutputBuffer = tempBuffer.Array();

        if(inputChannels == 1)
        {
            // Mono: duplicate every sample into both output channels.
            UINT  numFloats   = numAudioFrames;
            float *inputTemp  = (float*)captureBuffer;
            float *outputTemp = dataOutputBuffer;

            // SSE path only when both buffers are 16-byte aligned.
            if((UPARAM(inputTemp) & 0xF) == 0 && (UPARAM(outputTemp) & 0xF) == 0)
            {
                UINT alignedFloats = numFloats & 0xFFFFFFFC;
                for(UINT i=0; i<alignedFloats; i += 4)
                {
                    __m128 inVal   = _mm_load_ps(inputTemp+i);

                    // (a,b,c,d) -> (a,a,b,b) and (c,c,d,d)
                    __m128 outVal1 = _mm_unpacklo_ps(inVal, inVal);
                    __m128 outVal2 = _mm_unpackhi_ps(inVal, inVal);

                    _mm_store_ps(outputTemp+(i*2),   outVal1);
                    _mm_store_ps(outputTemp+(i*2)+4, outVal2);
                }

                numFloats  -= alignedFloats;
                inputTemp  += alignedFloats;
                outputTemp += alignedFloats*2;
            }

            // Scalar path for the remainder (or for everything when unaligned).
            while(numFloats--)
            {
                float inputVal = *inputTemp;
                *(outputTemp++) = inputVal;
                *(outputTemp++) = inputVal;

                inputTemp++;
            }
        }
        else if(inputChannels == 2) //straight up copy
        {
            SSECopy(dataOutputBuffer, captureBuffer, numAudioFrames*2*sizeof(float));
        }
        else
        {
            //todo: downmix optimization, also support for other speaker configurations than ones I can merely "think" of.  ugh.
            // NOTE: a channel mask that matches none of the branches below leaves
            // the output buffer untouched.
            float *inputTemp  = (float*)captureBuffer;
            float *outputTemp = dataOutputBuffer;

            if(inputChannelMask == KSAUDIO_SPEAKER_QUAD)
            {
                UINT numFloats = numAudioFrames*4;
                float *endTemp = inputTemp+numFloats;

                while(inputTemp < endTemp)
                {
                    float left      = inputTemp[0];
                    float right     = inputTemp[1];
                    float rearLeft  = inputTemp[2]*surroundMix4;
                    float rearRight = inputTemp[3]*surroundMix4;

                    // When in doubt, use only left and right .... and rear left and rear right :) 
                    // Same idea as with 5.1 downmix

                    *(outputTemp++) = (left  + rearLeft)  * attn4dotX;
                    *(outputTemp++) = (right + rearRight) * attn4dotX;

                    inputTemp  += 4;
                }
            }
            else if(inputChannelMask == KSAUDIO_SPEAKER_2POINT1)
            {
                UINT numFloats = numAudioFrames*3;
                float *endTemp = inputTemp+numFloats;

                while(inputTemp < endTemp)
                {
                    float left      = inputTemp[0];
                    float right     = inputTemp[1];
                   
                    // Drop LFE since we don't need it
                    //float lfe       = inputTemp[2]*lowFreqMix;

                    *(outputTemp++) = left;
                    *(outputTemp++) = right;

                    inputTemp  += 3;
                }
            }
            else if(inputChannelMask == KSAUDIO_SPEAKER_4POINT1)
            {
                UINT numFloats = numAudioFrames*5;
                float *endTemp = inputTemp+numFloats;

                while(inputTemp < endTemp)
                {
                    float left      = inputTemp[0];
                    float right     = inputTemp[1];

                    // Skip LFE , we don't really need it.
                    //float lfe       = inputTemp[2];

                    float rearLeft  = inputTemp[3]*surroundMix4;
                    float rearRight = inputTemp[4]*surroundMix4;

                    // Same idea as with 5.1 downmix

                    *(outputTemp++) = (left  + rearLeft)  * attn4dotX;
                    *(outputTemp++) = (right + rearRight) * attn4dotX;

                    inputTemp  += 5;
                }
            }
            else if(inputChannelMask == KSAUDIO_SPEAKER_SURROUND)
            {
                UINT numFloats = numAudioFrames*4;
                float *endTemp = inputTemp+numFloats;

                while(inputTemp < endTemp)
                {
                    float left      = inputTemp[0];
                    float right     = inputTemp[1];

                    // Front/rear centre are currently dropped.
                    //float frontCenter    = inputTemp[2];
                    //float rearCenter     = inputTemp[3];
                    
                    // When in doubt, use only left and right :) Seriously.
                    // THIS NEEDS TO BE PROPERLY IMPLEMENTED!

                    *(outputTemp++) = left;
                    *(outputTemp++) = right;

                    inputTemp  += 4;
                }
            }
            // Both speakers configs share the same format, the difference is in rear speakers position 
            // See: http://msdn.microsoft.com/en-us/library/windows/hardware/ff537083(v=vs.85).aspx
            // Probably for KSAUDIO_SPEAKER_5POINT1_SURROUND we will need a different coefficient for rear left/right
            else if(inputChannelMask == KSAUDIO_SPEAKER_5POINT1 || inputChannelMask == KSAUDIO_SPEAKER_5POINT1_SURROUND)
            {
                UINT numFloats = numAudioFrames*6;
                float *endTemp = inputTemp+numFloats;

                while(inputTemp < endTemp)
                {
                    float left      = inputTemp[0];
                    float right     = inputTemp[1];
                    float center    = inputTemp[2]*centerMix;
                    
                    //We don't need LFE channel so skip it (see below)
                    //float lowFreq   = inputTemp[3]*lowFreqMix;
                    
                    float rearLeft  = inputTemp[4]*surroundMix;
                    float rearRight = inputTemp[5]*surroundMix;
                    
                    // According to ITU-R  BS.775-1 recommendation, the downmix from a 3/2 source to stereo
                    // is the following:
                    // L = FL + k0*C + k1*RL
                    // R = FR + k0*C + k1*RR
                    // FL = front left
                    // FR = front right
                    // C  = center
                    // RL = rear left
                    // RR = rear right
                    // k0 = centerMix   = dbMinus3 = 0.7071067811865476 [for k0 we can use dbMinus6 = 0.5 too, probably it's better]
                    // k1 = surroundMix = dbMinus3 = 0.7071067811865476

                    // The output (L,R) can be out of (-1,1) domain so we attenuate it [ attn5dot1 = 1/(1 + centerMix + surroundMix) ]
                    // Note: this method of downmixing is far from "perfect" (pretty sure it's not the correct way) but the resulting downmix is "okayish", at least no more bleeding ears.
                    // (maybe have a look at http://forum.doom9.org/archive/index.php/t-148228.html too [ 5.1 -> stereo ] the approach seems almost the same [but different coefficients])

                    
                    // http://acousticsfreq.com/blog/wp-content/uploads/2012/01/ITU-R-BS775-1.pdf
                    // http://ir.lib.nctu.edu.tw/bitstream/987654321/22934/1/030104001.pdf

                    *(outputTemp++) = (left  + center  + rearLeft) * attn5dot1;
                    *(outputTemp++) = (right + center  + rearRight) * attn5dot1;

                    inputTemp  += 6;
                }
            }

            // According to http://msdn.microsoft.com/en-us/library/windows/hardware/ff537083(v=vs.85).aspx
            // KSAUDIO_SPEAKER_7POINT1 is obsolete and no longer supported in Windows Vista and later versions of Windows
            // Not sure what to do about it, meh , drop front left of center/front right of center -> 5.1 -> stereo; 

            else if(inputChannelMask == KSAUDIO_SPEAKER_7POINT1)
            {
                UINT numFloats = numAudioFrames*8;
                float *endTemp = inputTemp+numFloats;

                while(inputTemp < endTemp)
                {
                    float left          = inputTemp[0];
                    float right         = inputTemp[1];
                    
                    float center        = inputTemp[2] * centerMix;
                    
                    // Drop LFE since we don't need it
                    //float lowFreq       = inputTemp[3]*lowFreqMix;
                    
                    float rearLeft      = inputTemp[4] * surroundMix;
                    float rearRight     = inputTemp[5] * surroundMix;

                    // Drop SPEAKER_FRONT_LEFT_OF_CENTER , SPEAKER_FRONT_RIGHT_OF_CENTER
                    //float centerLeft    = inputTemp[6];
                    //float centerRight   = inputTemp[7];
                    
                    // Downmix from 5.1 to stereo
                    *(outputTemp++) = (left  + center  + rearLeft)  * attn5dot1;
                    *(outputTemp++) = (right + center  + rearRight) * attn5dot1;

                    inputTemp  += 8;
                }
            }

            // Downmix to 5.1 (easy stuff) then downmix to stereo as done in KSAUDIO_SPEAKER_5POINT1
            else if(inputChannelMask == KSAUDIO_SPEAKER_7POINT1_SURROUND)
            {
                UINT numFloats = numAudioFrames*8;
                float *endTemp = inputTemp+numFloats;

                while(inputTemp < endTemp)
                {
                    float left      = inputTemp[0];
                    float right     = inputTemp[1];
                    float center    = inputTemp[2] * centerMix;

                    // Skip LFE we don't need it
                    //float lowFreq   = inputTemp[3]*lowFreqMix;

                    float rearLeft  = inputTemp[4];
                    float rearRight = inputTemp[5];
                    float sideLeft  = inputTemp[6];
                    float sideRight = inputTemp[7];

                    // combine the rear/side channels first , baaam! 5.1
                    rearLeft  = (rearLeft  + sideLeft)  * 0.5f;
                    rearRight = (rearRight + sideRight) * 0.5f;
                    
                    // downmix to stereo as in 5.1 case
                    *(outputTemp++) = (left  + center + rearLeft  * surroundMix) * attn5dot1;
                    *(outputTemp++) = (right + center + rearRight * surroundMix) * attn5dot1;

                    inputTemp  += 8;
                }
            }
        }

        // The captured data has been copied out; everything below works on
        // tempBuffer / tempResampleBuffer only.
        ReleaseBuffer();

        //------------------------------------------------------------
        // resample

        if(bResample)
        {
            // One extra frame of headroom for the resampler output.
            UINT frameAdjust = UINT((double(numAudioFrames) * resampleRatio) + 1.0);
            UINT newFrameSize = frameAdjust*2;

            if(tempResampleBuffer.Num() < newFrameSize)
                tempResampleBuffer.SetSize(newFrameSize);

            SRC_DATA data;
            data.src_ratio = resampleRatio;

            data.data_in = tempBuffer.Array();
            data.input_frames = numAudioFrames;

            data.data_out = tempResampleBuffer.Array();
            data.output_frames = frameAdjust;

            data.end_of_input = 0;

            int err = src_process((SRC_STATE*)resampler, &data);
            if(err)
            {
                RUNONCE AppWarning(TEXT("AudioSource::QueryAudio: Was unable to resample audio for device '%s'"), GetDeviceName());
                return NoAudioAvailable;
            }

            if(data.input_frames_used != numAudioFrames)
            {
                RUNONCE AppWarning(TEXT("AudioSource::QueryAudio: Failed to downsample buffer completely, which shouldn't actually happen because it should be using 10ms of samples"));
                return NoAudioAvailable;
            }

            numAudioFrames = data.output_frames_gen;
        }

        //-----------------------------------------------------------------------------
        // sort all audio frames into 10 millisecond increments (done because not all devices output in 10ms increments)
        // NOTE: 0.457+ - instead of using the timestamps from windows, just compare and make sure it stays within a 100ms of their timestamps

        if(!bFirstBaseFrameReceived)
        {
            lastUsedTimestamp = newTimestamp;
            bFirstBaseFrameReceived = true;
        }

        float *newBuffer = (bResample) ? tempResampleBuffer.Array() : tempBuffer.Array();

        if (bSmoothTimestamps) {
            // Advance the running timestamp by 10 and resynchronise to the device
            // timestamp when the two have drifted apart by more than 70.
            lastUsedTimestamp += 10;

            QWORD difVal = GetQWDif(newTimestamp, lastUsedTimestamp);
            if(difVal > 70)
            {
                //OSDebugOut(TEXT("----------------------------1\r\nlastUsedTimestamp before: %llu - device: %s\r\n"), lastUsedTimestamp, GetDeviceName());
                lastUsedTimestamp = newTimestamp;
                //OSDebugOut(TEXT("lastUsedTimestamp after: %llu\r\n"), lastUsedTimestamp);
            }

            // Segments are only queued with strictly increasing timestamps that
            // are at least 10 apart; otherwise this buffer is dropped.
            if(lastUsedTimestamp > lastSentTimestamp)
            {
                QWORD adjustVal = (lastUsedTimestamp-lastSentTimestamp);
                if(adjustVal < 10)
                    lastUsedTimestamp += 10-adjustVal;

                AudioSegment *newSegment = new AudioSegment(newBuffer, numAudioFrames*2, lastUsedTimestamp);
                AddAudioSegment(newSegment, curVolume*sourceVolume);

                lastSentTimestamp = lastUsedTimestamp;
            }
        } else {
           // OSDebugOut(TEXT("newTimestamp: %llu\r\n"), newTimestamp);
            AudioSegment *newSegment = new AudioSegment(newBuffer, numAudioFrames*2, newTimestamp);
            AddAudioSegment(newSegment, curVolume*sourceVolume);
        }

        //-----------------------------------------------------------------------------

        return AudioAvailable;
    }

    return NoAudioAvailable;
}

예제 #29

0

파일 보기

파일: SPaRTaSSE.c 프로젝트: asolis/detection3D

/**
 * Evaluate the current homography p->H against the point correspondences
 * p->src -> p->dst, running a sequential probability ratio test (SPRT) as it
 * goes.
 *
 * Reads:  p->src, p->dst (interleaved x,y float pairs, as indexed by the
 *         scalar loop below), p->H, p->N, p->maxD, p->delta, p->epsilon,
 *         p->lambdaTBL, p->A.
 * Writes: p->inl      - number of points whose squared reprojection error is
 *                       <= maxD^2,
 *         p->good     - 1 while the running likelihood ratio stays <= p->A,
 *                       0 as soon as it is exceeded (model rejected),
 *         p->N_tested - value of the point index when evaluation stopped.
 *
 * Points are processed four at a time with SSE while at least four remain,
 * then one at a time. Note that the vector path divides via the approximate
 * reciprocal _mm_rcp_ps, whereas the scalar path uses an exact division.
 *
 * Preconditions of the vector path: _mm_load_ps requires 16-byte aligned
 * addresses, so src and dst must be 16-byte aligned.
 */
static inline void   sacEvaluateModelSPRT(PROSAC_HEST* p){
	/* Population count of a 4-bit comparison mask. */
	static const unsigned bitCnt[16] = {0, 1, 1, 2, 1, 2, 2, 3,
	                                    1, 2, 2, 3, 2, 3, 3, 4};

	unsigned i;
	unsigned isInlier;
	double   lambda       = 1.0;
	double   lambdaReject = ((1.0 - p->delta) / (1.0 - p->epsilon));
	double   lambdaAccept = ((   p->delta   ) / (    p->epsilon  ));
	float    distSq = p->maxD*p->maxD;
	float*   src = (float*)p->src;
	float*   dst = (float*)p->dst;
	float*   H   = p->H;


	p->inl      = 0;
	p->N_tested = 0;
	p->good     = 1;


	/* VECTOR */
	const __m128 distSqV=_mm_set1_ps(distSq);

	/* Broadcast the nine coefficients used; rows are read at H[0], H[4], H[8],
	 * i.e. with a row stride of four floats. */
	const __m128 H00=_mm_set1_ps(H[0]);
	const __m128 H01=_mm_set1_ps(H[1]);
	const __m128 H02=_mm_set1_ps(H[2]);
	const __m128 H10=_mm_set1_ps(H[4]);
	const __m128 H11=_mm_set1_ps(H[5]);
	const __m128 H12=_mm_set1_ps(H[6]);
	const __m128 H20=_mm_set1_ps(H[8]);
	const __m128 H21=_mm_set1_ps(H[9]);
	const __m128 H22=_mm_set1_ps(H[10]);

	/* "i + 4 <= N" rather than "i < N - 3": the latter wraps around in
	 * unsigned arithmetic when N < 3 and would run the vector loop out of
	 * bounds. */
	for(i=0;(i+4)<=p->N && p->good;i+=4){
		/* Backproject */
		__m128 x, y, X, Y, inter0, inter1, inter2, inter3;
		x=_mm_load_ps(src+2*i);   /* lanes low->high: x0 y0 x1 y1 */
		y=_mm_load_ps(src+2*i+4); /* lanes low->high: x2 y2 x3 y3 */
		X=_mm_load_ps(dst+2*i);   /* lanes low->high: X0 Y0 X1 Y1 */
		Y=_mm_load_ps(dst+2*i+4); /* lanes low->high: X2 Y2 X3 Y3 */

		/* De-interleave. Lane contents listed low->high. */
		inter0=_mm_unpacklo_ps(x,y);/* x0 x2 y0 y2 */
		inter1=_mm_unpackhi_ps(x,y);/* x1 x3 y1 y3 */
		inter2=_mm_unpacklo_ps(X,Y);/* X0 X2 Y0 Y2 */
		inter3=_mm_unpackhi_ps(X,Y);/* X1 X3 Y1 Y3 */

		/* Resulting point order within each register is 0,2,1,3. The same
		 * order is used for x, y, X and Y, so per-point results line up. */
		x=_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(inter0), _mm_castps_pd(inter1)));
		y=_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(inter0), _mm_castps_pd(inter1)));
		X=_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(inter2), _mm_castps_pd(inter3)));
		Y=_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(inter2), _mm_castps_pd(inter3)));

		__m128 reprojX = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H00, x), _mm_mul_ps(H01, y)), H02);
		__m128 reprojY = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H10, x), _mm_mul_ps(H11, y)), H12);
		__m128 reprojZ = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H20, x), _mm_mul_ps(H21, y)), H22);

		/* Homogeneous divide using the approximate reciprocal; the exact
		 * alternative would be _mm_div_ps(reprojX, reprojZ) etc. */
		__m128 recipZ = _mm_rcp_ps(reprojZ);
		reprojX = _mm_mul_ps(reprojX, recipZ);
		reprojY = _mm_mul_ps(reprojY, recipZ);

		/* Squared distance to the destination point. */
		reprojX = _mm_sub_ps(reprojX, X);
		reprojY = _mm_sub_ps(reprojY, Y);

		reprojX = _mm_mul_ps(reprojX, reprojX);
		reprojY = _mm_mul_ps(reprojY, reprojY);

		__m128 reprojDistV = _mm_add_ps(reprojX, reprojY);

		/* One mask bit per point: set when the point is an inlier. */
		__m128 cmp = _mm_cmple_ps(reprojDistV, distSqV);
		int msk = _mm_movemask_ps(cmp);

		p->inl     += bitCnt[msk];


		/* SPRT */
		/* lambdaTBL is indexed by the 4-bit inlier mask.
		 * NOTE(review): presumably it holds the product of the four
		 * per-point accept/reject factors - confirm where it is built. */
		lambda *= p->lambdaTBL[msk];
		p->good = lambda <= p->A;
		/* If !p->good, the threshold A was exceeded, so we're rejecting */
	}

	/* SCALAR */
	for(;i<p->N && p->good;i++){
		/* Backproject */
		float x=src[i*2],y=src[i*2+1];
		float X=dst[i*2],Y=dst[i*2+1];

		float reprojX=H[0]*x+H[1]*y+H[2]; //  ( X_1 )     ( H_11 H_12    H_13  ) (x_1)
		float reprojY=H[4]*x+H[5]*y+H[6]; //  ( X_2 )  =  ( H_21 H_22    H_23  ) (x_2)
		float reprojZ=H[8]*x+H[9]*y+H[10];//  ( X_3 )     ( H_31 H_32 H_33=1.0 ) (x_3 = 1.0)

		//reproj is in homogeneous coordinates. To bring back to "regular" coordinates, divide by Z.
		reprojX/=reprojZ;
		reprojY/=reprojZ;

		//Compute distance
		reprojX-=X;
		reprojY-=Y;
		reprojX*=reprojX;
		reprojY*=reprojY;
		float reprojDist = reprojX+reprojY;

		/* Inlier test on the squared distance. */
		isInlier    = reprojDist <= distSq;
		p->inl     += isInlier;


		/* SPRT */
		lambda *= isInlier ? lambdaAccept : lambdaReject;
		p->good = lambda <= p->A;
		/* If !p->good, the threshold A was exceeded, so we're rejecting */
	}


	p->N_tested = i;
}

예제 #30

0

파일 보기

파일: sse-unpcklps-1.c 프로젝트: 0day-ci/gcc

/* Interleave the two low floats of s1 and s2: the result lanes, low to high,
   are s1[0], s2[0], s1[1], s2[1].  The return type is spelled out explicitly;
   without it the definition is ill-formed in C++ and would fall back to
   implicit int in pre-C99 C while returning a vector.  */
__m128
test (__m128 s1, __m128 s2)
{
  return _mm_unpacklo_ps (s1, s2);
}