C++ (Cpp) _mm_unpackhi_ps示例

示例#1

0

显示文件

文件： util.cpp 项目： shewu/raytracer

/* apparently this is retarded */
void mulMatrix1(Matrix4x4 ret, Matrix4x4 mat1, Matrix4x4 mat2)
{
    /* for some reason not aligning the matrix segfaults,
     * but aligning deadlocks the program */
    /* aha we can heavily sse this:
     * 1. transpose mat2
     * 2. dotproduct the rows */

    /* 1. transpose mat2 */
    __m128 row0, row1, row2, row3;
    __m128 tmp0, tmp1, tmp2, tmp3;

    /* Load 4x4 mat2 from memory into four SSE registers. */
    row0 = _mm_load_ps( mat2[0] );
    row1 = _mm_load_ps( mat2[1] );
    row2 = _mm_load_ps( mat2[2] );
    row3 = _mm_load_ps( mat2[3] );

    /* Interleave bottom/top two pixels from two SSE registers with each other
     * into a single SSE register. */
    tmp0 = _mm_unpacklo_ps( row0, row1 );
    tmp2 = _mm_unpacklo_ps( row2, row3 );
    tmp1 = _mm_unpackhi_ps( row0, row1 );
    tmp3 = _mm_unpackhi_ps( row2, row3 );

    /* Move bottom/top two pixels from two SSE registers into one SSE register. */
    row0 = _mm_movelh_ps( tmp0, tmp2 );
    row1 = _mm_movehl_ps( tmp2, tmp0 );
    row2 = _mm_movelh_ps( tmp1, tmp3 );
    row3 = _mm_movehl_ps( tmp3, tmp1 );

    /* Store 4x4 matrix from all four SSE registers into memory. */
    _mm_store_ps( mat2[0], row0 );
    _mm_store_ps( mat2[1], row1 );
    _mm_store_ps( mat2[2], row2 );
    _mm_store_ps( mat2[3], row3 );

    /* 2. dotproduct the rows */
    /* OMG 16 DOT PRODUCTS */
    ret[0][0] = mul_asm(mat1[0], mat2[0]);
    ret[0][1] = mul_asm(mat1[0], mat2[1]);
    ret[0][2] = mul_asm(mat1[0], mat2[2]);
    ret[0][3] = mul_asm(mat1[0], mat2[3]);
    ret[1][0] = mul_asm(mat1[1], mat2[0]);
    ret[1][1] = mul_asm(mat1[1], mat2[1]);
    ret[1][2] = mul_asm(mat1[1], mat2[2]);
    ret[1][3] = mul_asm(mat1[1], mat2[3]);
    ret[2][0] = mul_asm(mat1[2], mat2[0]);
    ret[2][1] = mul_asm(mat1[2], mat2[1]);
    ret[2][2] = mul_asm(mat1[2], mat2[2]);
    ret[2][3] = mul_asm(mat1[2], mat2[3]);
    ret[3][0] = mul_asm(mat1[3], mat2[0]);
    ret[3][1] = mul_asm(mat1[3], mat2[1]);
    ret[3][2] = mul_asm(mat1[3], mat2[2]);
    ret[3][3] = mul_asm(mat1[3], mat2[3]);

    return;
}

示例#2

0

显示文件

/// Transform this box using the specified transform matrix.
///
/// @param[in] rTransform  Matrix by which to transform.
void Helium::Simd::AaBox::TransformBy( const Matrix44& rTransform )
{
    // Expand each corner position.
    Register minVec = m_minimum.GetSimdVector();
    Register maxVec = m_maximum.GetSimdVector();

    Vector3Soa corners0;
    corners0.m_x = _mm_shuffle_ps( minVec, minVec, _MM_SHUFFLE( 0, 0, 0, 0 ) );
    corners0.m_y = _mm_shuffle_ps( minVec, maxVec, _MM_SHUFFLE( 1, 1, 1, 1 ) );
    corners0.m_z = _mm_unpackhi_ps( minVec, maxVec );
    corners0.m_z = _mm_movelh_ps( corners0.m_z, corners0.m_z );

    Vector3Soa corners1;
    corners1.m_x = _mm_shuffle_ps( maxVec, maxVec, _MM_SHUFFLE( 0, 0, 0, 0 ) );
    corners1.m_y = corners0.m_y;
    corners1.m_z = corners0.m_z;

    // Transform all corners by the provided transformation matrix.
    Matrix44Soa transformSplat( rTransform );
    transformSplat.TransformPoint( corners0, corners0 );
    transformSplat.TransformPoint( corners1, corners1 );

    // Compute the minimum.
    Register minX = Simd::MinF32( corners0.m_x, corners1.m_x );
    Register minY = Simd::MinF32( corners0.m_y, corners1.m_y );
    Register minXYLo = _mm_unpacklo_ps( minX, minY );
    Register minXYHi = _mm_unpackhi_ps( minX, minY );
    Register minXY = Simd::MinF32( minXYLo, minXYHi );

    Register minZ = Simd::MinF32( corners0.m_z, corners1.m_z );
    Register minZLo = _mm_unpacklo_ps( minZ, minZ );
    Register minZHi = _mm_unpackhi_ps( minZ, minZ );
    minZ = Simd::MinF32( minZLo, minZHi );

    Register minLo = _mm_movelh_ps( minXY, minZ );
    Register minHi = _mm_movehl_ps( minZ, minXY );

    m_minimum.SetSimdVector( Simd::MinF32( minLo, minHi ) );

    // Compute the maximum.
    Register maxX = Simd::MaxF32( corners0.m_x, corners1.m_x );
    Register maxY = Simd::MaxF32( corners0.m_y, corners1.m_y );
    Register maxXYLo = _mm_unpacklo_ps( maxX, maxY );
    Register maxXYHi = _mm_unpackhi_ps( maxX, maxY );
    Register maxXY = Simd::MaxF32( maxXYLo, maxXYHi );

    Register maxZ = Simd::MaxF32( corners0.m_z, corners1.m_z );
    Register maxZLo = _mm_unpacklo_ps( maxZ, maxZ );
    Register maxZHi = _mm_unpackhi_ps( maxZ, maxZ );
    maxZ = Simd::MaxF32( maxZLo, maxZHi );

    Register maxLo = _mm_movelh_ps( maxXY, maxZ );
    Register maxHi = _mm_movehl_ps( maxZ, maxXY );

    m_maximum.SetSimdVector( Simd::MaxF32( maxLo, maxHi ) );
}

示例#3

0

显示文件

文件： TestMatrixMult.cpp 项目： shobomaru/TestMatrixMult

static void NOINLINE transposeX4( const __m128 *v1, __m128 *vout )
{
    __m128 a0 = _mm_unpacklo_ps( v1[ 0 ], v1[ 2 ] );
    __m128 a1 = _mm_unpacklo_ps( v1[ 1 ], v1[ 3 ] );
    __m128 a2 = _mm_unpackhi_ps( v1[ 0 ], v1[ 2 ] );
    __m128 a3 = _mm_unpackhi_ps( v1[ 1 ], v1[ 3 ] );
    vout[ 0 ] = _mm_unpacklo_ps( a0, a1 );
    vout[ 1 ] = _mm_unpackhi_ps( a0, a1 );
    vout[ 2 ] = _mm_unpacklo_ps( a2, a3 );
    vout[ 3 ] = _mm_unpackhi_ps( a2, a3 );
}

示例#4

0

显示文件

文件： overexposed.c 项目： Coshibu/darktable

void
process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void * const ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t * const roi_out)
{
  dt_develop_t *dev = self->dev;

  const int ch = piece->colors;

  const __m128 upper = _mm_set_ps(FLT_MAX,
                                  dev->overexposed.upper / 100.0f,
                                  dev->overexposed.upper / 100.0f,
                                  dev->overexposed.upper / 100.0f);
  const __m128 lower = _mm_set_ps(FLT_MAX,
                                  dev->overexposed.lower / 100.0f,
                                  dev->overexposed.lower / 100.0f,
                                  dev->overexposed.lower / 100.0f);

  const int colorscheme = dev->overexposed.colorscheme;
  const __m128 upper_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][0]);
  const __m128 lower_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][1]);

#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(ovoid) schedule(static)
#endif
  for(int k=0; k<roi_out->height; k++)
  {
    const float *in = ((float *)ivoid) + (size_t)ch*k*roi_out->width;
    float *out = ((float *)ovoid) + (size_t)ch*k*roi_out->width;

    for (int j=0; j<roi_out->width; j++,in+=4,out+=4)
    {
      const __m128 pixel = _mm_load_ps(in);

      __m128 isoe = _mm_cmpge_ps(pixel, upper);
      isoe = _mm_or_ps(_mm_unpacklo_ps(isoe, isoe), _mm_unpackhi_ps(isoe, isoe));
      isoe = _mm_or_ps(_mm_unpacklo_ps(isoe, isoe), _mm_unpackhi_ps(isoe, isoe));

      __m128 isue = _mm_cmple_ps(pixel, lower);
      isue = _mm_and_ps(_mm_unpacklo_ps(isue, isue), _mm_unpackhi_ps(isue, isue));
      isue = _mm_and_ps(_mm_unpacklo_ps(isue, isue), _mm_unpackhi_ps(isue, isue));

      __m128 result = _mm_or_ps(_mm_andnot_ps(isoe, pixel),
                                _mm_and_ps(isoe, upper_color));

      result = _mm_or_ps(_mm_andnot_ps(isue, result),
                         _mm_and_ps(isue, lower_color));

      _mm_stream_ps(out, result);
    }
  }
  _mm_sfence();

  if(piece->pipe->mask_display)
    dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}

示例#5

0

显示文件

文件： AudioNodeEngineSSE2.cpp 项目： jasonLaster/gecko-dev

void BufferComplexMultiply_SSE(const float* aInput, const float* aScale,
                               float* aOutput, uint32_t aSize) {
  unsigned i;
  __m128 in0, in1, in2, in3, outreal0, outreal1, outreal2, outreal3, outimag0,
      outimag1, outimag2, outimag3;

  ASSERT_ALIGNED16(aInput);
  ASSERT_ALIGNED16(aScale);
  ASSERT_ALIGNED16(aOutput);
  ASSERT_MULTIPLE16(aSize);

  for (i = 0; i < aSize * 2; i += 16) {
    in0 = _mm_load_ps(&aInput[i]);
    in1 = _mm_load_ps(&aInput[i + 4]);
    in2 = _mm_load_ps(&aInput[i + 8]);
    in3 = _mm_load_ps(&aInput[i + 12]);

    outreal0 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(2, 0, 2, 0));
    outimag0 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 1, 3, 1));
    outreal2 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(2, 0, 2, 0));
    outimag2 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(3, 1, 3, 1));

    in0 = _mm_load_ps(&aScale[i]);
    in1 = _mm_load_ps(&aScale[i + 4]);
    in2 = _mm_load_ps(&aScale[i + 8]);
    in3 = _mm_load_ps(&aScale[i + 12]);

    outreal1 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(2, 0, 2, 0));
    outimag1 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 1, 3, 1));
    outreal3 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(2, 0, 2, 0));
    outimag3 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(3, 1, 3, 1));

    in0 = _mm_sub_ps(_mm_mul_ps(outreal0, outreal1),
                     _mm_mul_ps(outimag0, outimag1));
    in1 = _mm_add_ps(_mm_mul_ps(outreal0, outimag1),
                     _mm_mul_ps(outimag0, outreal1));
    in2 = _mm_sub_ps(_mm_mul_ps(outreal2, outreal3),
                     _mm_mul_ps(outimag2, outimag3));
    in3 = _mm_add_ps(_mm_mul_ps(outreal2, outimag3),
                     _mm_mul_ps(outimag2, outreal3));

    outreal0 = _mm_unpacklo_ps(in0, in1);
    outreal1 = _mm_unpackhi_ps(in0, in1);
    outreal2 = _mm_unpacklo_ps(in2, in3);
    outreal3 = _mm_unpackhi_ps(in2, in3);

    _mm_store_ps(&aOutput[i], outreal0);
    _mm_store_ps(&aOutput[i + 4], outreal1);
    _mm_store_ps(&aOutput[i + 8], outreal2);
    _mm_store_ps(&aOutput[i + 12], outreal3);
  }
}

示例#6

0

显示文件

文件： sound_sse.cpp 项目： punkkeks/ClanLib

void SoundSSE::pack_float_stereo(float *input[2], int size, float *output)
{
#ifndef CL_DISABLE_SSE2
	int sse_size = (size/4)*4;

	for (int i = 0; i < sse_size; i+=4)
	{
		__m128 samples0 = _mm_loadu_ps(input[0]+i);
		__m128 samples1 = _mm_loadu_ps(input[1]+i);
		__m128 tmp0, tmp1;
		tmp0 = _mm_unpacklo_ps(samples0, samples1);
		tmp1 = _mm_unpackhi_ps(samples0, samples1);
		_mm_storeu_ps(output+i*2, tmp0);
		_mm_storeu_ps(output+i*2+4, tmp1);
	}

#else
	const int sse_size = 0;
#endif

	// Pack remaining
	for (int i = sse_size; i < size; i++)
	{
		output[i*2] = input[0][i];
		output[i*2 + 1] = input[1][i];
	}
}

示例#7

0

显示文件

文件： MixerLoops.cpp 项目： Kinglions/modizer

static void SSE2_FloatToStereoMix(const float *pIn1, const float *pIn2, int32 *pOut, uint32 nCount, const float _f2ic)
//--------------------------------------------------------------------------------------------------------------------
{
	__m128 f2ic = _mm_load_ps1(&_f2ic);
	__m128i *out = reinterpret_cast<__m128i *>(pOut);

	// We may read beyond the wanted length... this works because we know that we will always work on our buffers of size MIXBUFFERSIZE
	nCount = (nCount + 3) / 4;
	do
	{
		__m128 fl = _mm_loadu_ps(pIn1);			// Load four float values, LLLL
		__m128 fr = _mm_loadu_ps(pIn2);			// Load four float values, RRRR
		pIn1 += 4;
		pIn2 += 4;
		fl = _mm_mul_ps(fl, f2ic);				// Apply int->float factor
		fr = _mm_mul_ps(fr, f2ic);				// Apply int->float factor
		__m128 f1 = _mm_unpacklo_ps(fl, fr);	// LL__+RR__ => LRLR
		__m128 f2 = _mm_unpackhi_ps(fl, fr);	// __LL+__RR => LRLR
		__m128i i1 =_mm_cvtps_epi32(f1);		// Convert to four ints
		__m128i i2 =_mm_cvtps_epi32(f2);		// Convert to four ints
		_mm_storeu_si128(out, i1);				// Store four int values, LRLR
		_mm_storeu_si128(out + 1, i2);			// Store four int values, LRLR
		out += 2;
	} while(--nCount);
}

示例#8

0

显示文件

文件： sound_sse.cpp 项目： punkkeks/ClanLib

void SoundSSE::pack_16bit_stereo(float *input[2], int size, short *output)
{
#ifndef CL_DISABLE_SSE2
	int sse_size = (size/4)*4;

	__m128 constant1 = _mm_set1_ps(32767);
	for (int i = 0; i < sse_size; i+=4)
	{
		__m128 samples0 = _mm_loadu_ps(input[0]+i);
		__m128 samples1 = _mm_loadu_ps(input[1]+i);
		samples0 = _mm_mul_ps(samples0, constant1);
		samples1 = _mm_mul_ps(samples1, constant1);
		__m128 tmp0, tmp1;
		tmp0 = _mm_unpacklo_ps(samples0, samples1);
		tmp1 = _mm_unpackhi_ps(samples0, samples1);
		__m128i isamples0 = _mm_cvtps_epi32(tmp0);
		__m128i isamples1 = _mm_cvtps_epi32(tmp1);
		__m128i isamples = _mm_packs_epi32(isamples0, isamples1);
		_mm_storeu_si128((__m128i*)(output+i*2), isamples);
	}

#else
	const int sse_size = 0;
#endif

	// Pack remaining
	for (int i = sse_size; i < size; i++)
	{
		output[i*2] = input[0][i]*32767;
		output[i*2 + 1] = input[1][i]*32767;
	}
}

示例#9

0

显示文件

文件： Float16.hpp 项目： adamsmigielski/Utilities

		inline float16 Transpose(const float16 & a)
		{
			float16 temp, res;

			temp.x.m128 = _mm_unpacklo_ps(a.x.m128, a.z.m128);
			temp.y.m128 = _mm_unpacklo_ps(a.y.m128, a.w.m128);
			temp.z.m128 = _mm_unpackhi_ps(a.x.m128, a.z.m128);
			temp.w.m128 = _mm_unpackhi_ps(a.y.m128, a.w.m128);

			res.x.m128 = _mm_unpacklo_ps(temp.x.m128, temp.y.m128);
			res.y.m128 = _mm_unpackhi_ps(temp.x.m128, temp.y.m128);
			res.z.m128 = _mm_unpacklo_ps(temp.z.m128, temp.w.m128);
			res.w.m128 = _mm_unpackhi_ps(temp.z.m128, temp.w.m128);

			return res;
		}

示例#10

0

显示文件

文件： Quat.cpp 项目： Garfield-Chen/tng

Quat MUST_USE_RESULT Quat::RotateFromTo(const float4 &sourceDirection, const float4 &targetDirection)
{
#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE)
	// Best: 12.289 nsecs / 33.144 ticks, Avg: 12.489 nsecs, Worst: 14.210 nsecs
	simd4f cosAngle = dot4_ps(sourceDirection.v, targetDirection.v);
	cosAngle = negate3_ps(cosAngle); // [+ - - -]
	// XYZ channels use the trigonometric formula sin(x/2) = +/-sqrt(0.5-0.5*cosx))
	// The W channel uses the trigonometric formula cos(x/2) = +/-sqrt(0.5+0.5*cosx))
	simd4f half = set1_ps(0.5f);
	simd4f cosSinHalfAngle = sqrt_ps(add_ps(half, mul_ps(half, cosAngle))); // [cos(x/2), sin(x/2), sin(x/2), sin(x/2)]
	simd4f axis = cross_ps(sourceDirection.v, targetDirection.v);
	simd4f recipLen = rsqrt_ps(dot4_ps(axis, axis));
	axis = mul_ps(axis, recipLen); // [0 z y x]
	// Set the w component to one.
	simd4f one = add_ps(half, half); // [1 1 1 1]
	simd4f highPart = _mm_unpackhi_ps(axis, one); // [_ _ 1 z]
	axis = _mm_movelh_ps(axis, highPart); // [1 z y x]
	Quat q;
	q.q = mul_ps(axis, cosSinHalfAngle);
	return q;
#else
	// Best: 19.970 nsecs / 53.632 ticks, Avg: 20.197 nsecs, Worst: 21.122 nsecs
	assume(EqualAbs(sourceDirection.w, 0.f));
	assume(EqualAbs(targetDirection.w, 0.f));
	return Quat::RotateFromTo(sourceDirection.xyz(), targetDirection.xyz());
#endif
}

示例#11

0

显示文件

文件： mod-coord-sse.cpp 项目： lensfun/lensfun

void lfModifier::ModifyCoord_Dist_PTLens_SSE (void *data, float *iocoord, int count)
{
  // See "Note about PT-based distortion models" at the top of mod-coord.cpp.
  /*
   * If buffer is not aligned, fall back to plain code
   */
  if((uintptr_t)(iocoord) & 0xf)
  {
    return ModifyCoord_Dist_PTLens(data, iocoord, count);
  }

  lfCoordDistCallbackData* cddata = (lfCoordDistCallbackData*) data;

  // Rd = Ru * (a_ * Ru^3 + b_ * Ru^2 + c_ * Ru + 1)
  __m128 a_ = _mm_set_ps1 (cddata->Terms [0]);
  __m128 b_ = _mm_set_ps1 (cddata->Terms [1]);
  __m128 c_ = _mm_set_ps1 (cddata->Terms [2]);
  __m128 cx = _mm_set_ps1 (cddata->centerX);
  __m128 cy = _mm_set_ps1 (cddata->centerY);
  __m128 cc = _mm_set_ps1 (cddata->coordinate_correction);
  __m128 one = _mm_set_ps1 (1.0f);

  // SSE Loop processes 4 pixels/loop
  int loop_count = count / 4;
  for (int i = 0; i < loop_count ; i++)
  {
    __m128 c0 = _mm_load_ps (&iocoord [8 * i]);
    __m128 c1 = _mm_load_ps (&iocoord [8 * i + 4]);
    __m128 x = _mm_shuffle_ps (c0, c1, _MM_SHUFFLE (2, 0, 2, 0));
    __m128 y = _mm_shuffle_ps (c0, c1, _MM_SHUFFLE (3, 1, 3, 1));
    x = _mm_sub_ps(_mm_mul_ps(x, cc), cx);
    y = _mm_sub_ps(_mm_mul_ps(y, cc), cy);

    __m128 ru2 = _mm_add_ps (_mm_mul_ps (x, x), _mm_mul_ps (y, y));
    __m128 ru = _mm_rcp_ps (_mm_rsqrt_ps (ru2));

    // Calculate poly3 = a_ * ru2 * ru + b_ * ru2 + c_ * ru + 1;
    __m128 t = _mm_mul_ps (ru2, b_);
    __m128 poly3 = _mm_mul_ps (_mm_mul_ps (a_, ru2), ru);
    t = _mm_add_ps (t, _mm_mul_ps (ru, c_));
    poly3 = _mm_add_ps (t, _mm_add_ps (poly3, one));

    x = _mm_add_ps(_mm_mul_ps (x, poly3), cx);
    y = _mm_add_ps(_mm_mul_ps (y, poly3), cy);
    x = _mm_div_ps (x, cc);
    y = _mm_div_ps (y, cc);

    c0 = _mm_unpacklo_ps(x, y);
    c1 = _mm_unpackhi_ps(x, y);

    _mm_store_ps (&iocoord [8 * i], c0);
    _mm_store_ps (&iocoord [8 * i + 4], c1);
  }

  loop_count *= 4;
  int remain = count - loop_count;
  if (remain)
    ModifyCoord_Dist_PTLens (data, &iocoord [loop_count * 2], remain);
}

示例#12

0

显示文件

文件： MySSEFunctions.cpp 项目： e-sha/Background-subtraction

	void Shuffle16Elems(__m128 &io_data0, __m128 &io_data1, __m128 &io_data2,
		__m128 &io_data3)
	{
		__m128 ccdd1 = _mm_unpackhi_ps(io_data0, io_data1);
		__m128 ccdd2 = _mm_unpackhi_ps(io_data2, io_data3);
		__m128 aabb1 = _mm_unpacklo_ps(io_data0, io_data1);
		__m128 aabb2 = _mm_unpacklo_ps(io_data2, io_data3);

		io_data0 = 
			_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(aabb1), _mm_castps_pd(aabb2)));
		io_data1 =
			_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(aabb1), _mm_castps_pd(aabb2)));
		io_data2 =
			_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(ccdd1), _mm_castps_pd(ccdd2)));
		io_data3 = 
			_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(ccdd1), _mm_castps_pd(ccdd2)));
	}

示例#13

0

显示文件

文件： nx_sse_float.hpp 项目： mywoodstock/nxsimd

    inline vector4f haddp(const vector4f* row)
    {
#if SSE_INSTR_SET >= 3  // SSE3
        return _mm_hadd_ps(_mm_hadd_ps(row[0], row[1]),
                           _mm_hadd_ps(row[2], row[3]));
#else
        __m128 tmp0 = _mm_unpacklo_ps(row[0], row[1]);
        __m128 tmp1 = _mm_unpackhi_ps(row[0], row[1]);
        __m128 tmp2 = _mm_unpackhi_ps(row[2], row[3]);
        tmp0 = _mm_add_ps(tmp0, tmp1);
        tmp1 = _mm_unpacklo_ps(row[2], row[3]);
        tmp1 = _mm_add_ps(tmp1, tmp2);
        tmp2 = _mm_movehl_ps(tmp1, tmp0);
        tmp0 = _mm_movelh_ps(tmp0, tmp1);
        return _mm_add_ps(tmp0, tmp2);
#endif
    }

示例#14

0

显示文件

文件： matrix4.cpp 项目： Nate1595/OpenTomb

matrix4 matrix4::transposed() const
{
#ifdef __SSE__
    __m128 tmp3, tmp2, tmp1, tmp0;
    tmp0 = _mm_unpacklo_ps(x.v, y.v);
    tmp2 = _mm_unpacklo_ps(z.v, w.v);
    tmp1 = _mm_unpackhi_ps(x.v, y.v);
    tmp3 = _mm_unpackhi_ps(z.v, w.v);

    return matrix4(_mm_movelh_ps(tmp0, tmp2), _mm_movehl_ps(tmp2, tmp0), _mm_movelh_ps(tmp1, tmp3), _mm_movehl_ps(tmp3, tmp1));
#else
    return matrix4(float4(x.x, y.x, z.x, w.x),
                   float4(x.y, y.y, z.y, w.y),
                   float4(x.z, y.z, z.z, w.z),
                   float4(x.w, y.w, z.w, w.w));
#endif
}

示例#15

0

显示文件

文件： SimdFrustumSse.cpp 项目： euler0/Helium

/// Compute the corners of this view frustum.
///
/// A view frustum can have either four or eight corners depending on whether a far clip plane exists (eight
/// corners) or whether an infinite far clip plane is used (four corners).
///
/// Note that this assumes that the frustum is always properly defined, with each possible combination of
/// neighboring clip planes intersecting at a valid point.
///
/// @param[out] pCorners  Array in which the frustum corners will be stored.  This must point to a region of memory
///                       large enough for four points if this frustum has an infinite far clip plane, or eight
///                       points if this frustum has a normal far clip plane.
///
/// @return  Number of clip planes computed (either four or eight).
size_t Helium::Simd::Frustum::ComputeCorners( Vector3* pCorners ) const
{
    HELIUM_ASSERT( pCorners );

    // Compute the corners in struct-of-arrays format.
    HELIUM_SIMD_ALIGN_PRE float32_t cornersX[ 8 ] HELIUM_SIMD_ALIGN_POST;
    HELIUM_SIMD_ALIGN_PRE float32_t cornersY[ 8 ] HELIUM_SIMD_ALIGN_POST;
    HELIUM_SIMD_ALIGN_PRE float32_t cornersZ[ 8 ] HELIUM_SIMD_ALIGN_POST;

    size_t cornerCount = ComputeCornersSoa( cornersX, cornersY, cornersZ );
    HELIUM_ASSERT( cornerCount == 4 || cornerCount == 8 );

    // Swizzle the results and store in the output array.
    Helium::Simd::Register cornerXVec = Helium::Simd::LoadAligned( cornersX );
    Helium::Simd::Register cornerYVec = Helium::Simd::LoadAligned( cornersY );
    Helium::Simd::Register cornerZVec = Helium::Simd::LoadAligned( cornersZ );

    Helium::Simd::Register xy01 = _mm_unpacklo_ps( cornerXVec, cornerYVec );
    Helium::Simd::Register xy23 = _mm_unpackhi_ps( cornerXVec, cornerYVec );
    Helium::Simd::Register zz01 = _mm_unpacklo_ps( cornerZVec, cornerZVec );
    Helium::Simd::Register zz23 = _mm_unpackhi_ps( cornerZVec, cornerZVec );

    pCorners[ 0 ].SetSimdVector( _mm_movelh_ps( xy01, zz01 ) );
    pCorners[ 1 ].SetSimdVector( _mm_movehl_ps( zz01, xy01 ) );
    pCorners[ 2 ].SetSimdVector( _mm_movelh_ps( xy23, zz23 ) );
    pCorners[ 3 ].SetSimdVector( _mm_movehl_ps( zz23, xy23 ) );

    if( cornerCount == 8 )
    {
        cornerXVec = Helium::Simd::LoadAligned( cornersX + 4 );
        cornerYVec = Helium::Simd::LoadAligned( cornersY + 4 );
        cornerZVec = Helium::Simd::LoadAligned( cornersZ + 4 );

        xy01 = _mm_unpacklo_ps( cornerXVec, cornerYVec );
        xy23 = _mm_unpackhi_ps( cornerXVec, cornerYVec );
        zz01 = _mm_unpacklo_ps( cornerZVec, cornerZVec );
        zz23 = _mm_unpackhi_ps( cornerZVec, cornerZVec );

        pCorners[ 4 ].SetSimdVector( _mm_movelh_ps( xy01, zz01 ) );
        pCorners[ 5 ].SetSimdVector( _mm_movehl_ps( zz01, xy01 ) );
        pCorners[ 6 ].SetSimdVector( _mm_movelh_ps( xy23, zz23 ) );
        pCorners[ 7 ].SetSimdVector( _mm_movehl_ps( zz23, xy23 ) );
    }

    return cornerCount;
}

示例#16

0

显示文件

文件： main.cpp 项目： KubaO/stackoverflown

void fast(element_t * const elements, const int num_elts, const float a) {
    element_t * elts = elements;
    float logf_a = logf(a);
    float logf_1_a = logf(1.0/a);
    v4sf log_a = _mm_load1_ps(&logf_a);
    v4sf log_1_a = _mm_load1_ps(&logf_1_a);
    assert(num_elts % 3 == 0); // operates on 3 elements at a time

    // elts->re = powf((powf(elts->x, a) + powf(elts->y, a) + powf(elts->z, a)), 1.0/a);
    for (int i = 0; i < num_elts; i += 3) {
        // transpose
        // we save one operation over _MM_TRANSPOSE4_PS by skipping the last row of output
        v4sf r0 = _mm_load_ps(&elts[0].x); // x1,y1,z1,0
        v4sf r1 = _mm_load_ps(&elts[1].x); // x2,y2,z2,0
        v4sf r2 = _mm_load_ps(&elts[2].x); // x3,y3,z3,0
        v4sf r3 = _mm_setzero_ps();        // 0, 0, 0, 0
        v4sf t0 = _mm_unpacklo_ps(r0, r1); //  x1,x2,y1,y2
        v4sf t1 = _mm_unpacklo_ps(r2, r3); //  x3,0, y3,0
        v4sf t2 = _mm_unpackhi_ps(r0, r1); //  z1,z2,0, 0
        v4sf t3 = _mm_unpackhi_ps(r2, r3); //  z3,0, 0, 0
        r0 = _mm_movelh_ps(t0, t1);        // x1,x2,x3,0
        r1 = _mm_movehl_ps(t1, t0);        // y1,y2,y3,0
        r2 = _mm_movelh_ps(t2, t3);        // z1,z2,z3,0
        // perform pow(x,a),.. using the fact that pow(x,a) = exp(x * log(a))
        v4sf r0a = _mm_mul_ps(r0, log_a); // x1*log(a), x2*log(a), x3*log(a), 0
        v4sf r1a = _mm_mul_ps(r1, log_a); // y1*log(a), y2*log(a), y3*log(a), 0
        v4sf r2a = _mm_mul_ps(r2, log_a); // z1*log(a), z2*log(a), z3*log(a), 0
        v4sf ex0 = exp_ps(r0a); // pow(x1, a), ..., 0
        v4sf ex1 = exp_ps(r1a); // pow(y1, a), ..., 0
        v4sf ex2 = exp_ps(r2a); // pow(z1, a), ..., 0
        // sum
        v4sf s1 = _mm_add_ps(ex0, ex1);
        v4sf s2 = _mm_add_ps(sum1, ex2);
        // pow(sum, 1/a) = exp(sum * log(1/a))
        v4sf ps = _mm_mul_ps(s2, log_1_a);
        v4sf es = exp_ps(ps);
        ALIGN16_BEG float re[4] ALIGN16_END;
        _mm_store_ps(re, es);
        elts[0].re = re[0];
        elts[1].re = re[1];
        elts[2].re = re[2];
        elts += 3;
    }
}

示例#17

0

显示文件

文件： dct_simd.cpp 项目： ArtisticCoding/OpenCP

void fDCT2x2_2pack_32f_and_thresh_and_iDCT2x2_2pack(float* src, float* dest, float thresh)
{
	__m128 ms0 = _mm_load_ps(src);
	__m128 ms1 = _mm_load_ps(src + 4);
	const __m128 mm = _mm_set1_ps(0.5f);
	__m128 a = _mm_add_ps(ms0, ms1);
	__m128 b = _mm_sub_ps(ms0, ms1);

	__m128 t1 = _mm_unpacklo_ps(a, b);
	__m128 t2 = _mm_unpackhi_ps(a, b);
	ms0 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(1, 0, 1, 0));
	ms1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3, 2, 3, 2));

	a = _mm_mul_ps(mm, _mm_add_ps(ms0, ms1));
	b = _mm_mul_ps(mm, _mm_sub_ps(ms0, ms1));

	const int __declspec(align(16)) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
	const __m128 mth = _mm_set1_ps(thresh);

	__m128 msk = _mm_cmpgt_ps(_mm_and_ps(a, *(const __m128*)v32f_absmask), mth);
	ms0 = _mm_blendv_ps(_mm_setzero_ps(), a, msk);
#ifdef _KEEP_00_COEF_
	ms0 = _mm_blend_ps(ms0, a, 1);
#endif
	msk = _mm_cmpgt_ps(_mm_and_ps(b, *(const __m128*)v32f_absmask), mth);
	ms1 = _mm_blendv_ps(_mm_setzero_ps(), b, msk);

	a = _mm_add_ps(ms0, ms1);
	b = _mm_sub_ps(ms0, ms1);

	t1 = _mm_unpacklo_ps(a, b);
	t2 = _mm_unpackhi_ps(a, b);
	ms0 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(1, 0, 1, 0));
	ms1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3, 2, 3, 2));

	a = _mm_mul_ps(mm, _mm_add_ps(ms0, ms1));
	b = _mm_mul_ps(mm, _mm_sub_ps(ms0, ms1));

	_mm_store_ps(dest, a);
	_mm_store_ps(dest + 4, b);
}

示例#18

0

显示文件

文件： BoundingBox.cpp 项目： boberfly/Urho3D

BoundingBox BoundingBox::Transformed(const Matrix3x4& transform) const
{
#ifdef URHO3D_SSE
    const __m128 one = _mm_set_ss(1.f);
    __m128 minPt = _mm_movelh_ps(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)&min_.x_), _mm_unpacklo_ps(_mm_set_ss(min_.z_), one));
    __m128 maxPt = _mm_movelh_ps(_mm_loadl_pi(_mm_setzero_ps(), (const __m64*)&max_.x_), _mm_unpacklo_ps(_mm_set_ss(max_.z_), one));
    __m128 centerPoint = _mm_mul_ps(_mm_add_ps(minPt, maxPt), _mm_set1_ps(0.5f));
    __m128 halfSize = _mm_sub_ps(centerPoint, minPt);
    __m128 m0 = _mm_loadu_ps(&transform.m00_);
    __m128 m1 = _mm_loadu_ps(&transform.m10_);
    __m128 m2 = _mm_loadu_ps(&transform.m20_);
    __m128 r0 = _mm_mul_ps(m0, centerPoint);
    __m128 r1 = _mm_mul_ps(m1, centerPoint);
    __m128 t0 = _mm_add_ps(_mm_unpacklo_ps(r0, r1), _mm_unpackhi_ps(r0, r1));
    __m128 r2 = _mm_mul_ps(m2, centerPoint);
    const __m128 zero = _mm_setzero_ps();
    __m128 t2 = _mm_add_ps(_mm_unpacklo_ps(r2, zero), _mm_unpackhi_ps(r2, zero));
    __m128 newCenter = _mm_add_ps(_mm_movelh_ps(t0, t2), _mm_movehl_ps(t2, t0));
    const __m128 absMask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    __m128 x = _mm_and_ps(absMask, _mm_mul_ps(m0, halfSize));
    __m128 y = _mm_and_ps(absMask, _mm_mul_ps(m1, halfSize));
    __m128 z = _mm_and_ps(absMask, _mm_mul_ps(m2, halfSize));
    t0 = _mm_add_ps(_mm_unpacklo_ps(x, y), _mm_unpackhi_ps(x, y));
    t2 = _mm_add_ps(_mm_unpacklo_ps(z, zero), _mm_unpackhi_ps(z, zero));
    __m128 newDir = _mm_add_ps(_mm_movelh_ps(t0, t2), _mm_movehl_ps(t2, t0));
    return BoundingBox(_mm_sub_ps(newCenter, newDir), _mm_add_ps(newCenter, newDir));
#else
    Vector3 newCenter = transform * Center();
    Vector3 oldEdge = Size() * 0.5f;
    Vector3 newEdge = Vector3(
        Abs(transform.m00_) * oldEdge.x_ + Abs(transform.m01_) * oldEdge.y_ + Abs(transform.m02_) * oldEdge.z_,
        Abs(transform.m10_) * oldEdge.x_ + Abs(transform.m11_) * oldEdge.y_ + Abs(transform.m12_) * oldEdge.z_,
        Abs(transform.m20_) * oldEdge.x_ + Abs(transform.m21_) * oldEdge.y_ + Abs(transform.m22_) * oldEdge.z_
    );

    return BoundingBox(newCenter - newEdge, newCenter + newEdge);
#endif
}

示例#19

0

显示文件

文件： RigidTransform.hpp 项目： BackupTheBerlios/slon

/** transform vector by rigid transform */
inline Matrix<float, 4, 1> operator * (const RigidTransform<float>& mat, const Matrix<float, 4, 1>& vec)
{
#ifdef SIMPLE_GL_USE_SSE4
    __m128 res;
    __m128 dotProd;

    res      = _mm_dp_ps(mat[0].m128, vec.m128, 0xEE);\
    dotProd  = _mm_dp_ps(mat[1].m128, vec.m128, 0xEE);\
    res      = _mm_blend_ps( res, dotProd, _MM_SHUFFLE(0, 1, 1, 1) );\
    dotProd  = _mm_dp_ps(mat[2].m128, vec.m128, 0xEE);\
    res      = _mm_blend_ps( res, dotProd, _MM_SHUFFLE(0, 0, 1, 1) );\
    dotProd  = _mm_dp_ps(mat[3].m128, vec.m128, 0xEE);\
    res      = _mm_blend_ps( res, dotProd, _MM_SHUFFLE(0, 0, 0, 1) );

    return Matrix<float, 4, 1>(res);
#elif defined(SIMPLE_GL_USE_SSE3)
    __m128 res;

    __m128 dotProd0 = _mm_mul_ps(mat[0].m128, vec.m128);
    dotProd0        = _mm_hadd_ps(dotProd0, dotProd0);
    dotProd0        = _mm_hadd_ps(dotProd0, dotProd0);

    __m128 dotProd1 = _mm_mul_ps(mat[1].m128, vec.m128);
    dotProd1        = _mm_hadd_ps(dotProd1, dotProd1);
    dotProd1        = _mm_hadd_ps(dotProd1, dotProd1);

    __m128 dotProd2 = _mm_mul_ps(mat[2].m128, vec.m128);
    dotProd2        = _mm_hadd_ps(dotProd2, dotProd2);
    dotProd2        = _mm_hadd_ps(dotProd2, dotProd2);

    __m128 dotProd3 = _mm_mul_ps(mat[3].m128, vec.m128);
    dotProd3        = _mm_hadd_ps(dotProd3, dotProd3);
    dotProd3        = _mm_hadd_ps(dotProd3, dotProd3);

    __m128 vec01    = _mm_unpacklo_ps(dotProd0, dotProd1);
    __m128 vec23    = _mm_unpackhi_ps(dotProd2, dotProd3);
    res             = _mm_movelh_ps(vec01, vec23);

    return Matrix<float, 4, 1>(res);
#else // SSE2
    // TODO: Think about good sse optimization
    Matrix<float, 4, 1> res;
    res[0] = mat[0][0] * res[0] + mat[0][1] * res[1] + mat[0][2] * res[2] + mat[0][3] * res[3];
    res[1] = mat[1][0] * res[0] + mat[1][1] * res[1] + mat[1][2] * res[2] + mat[1][3] * res[3];
    res[2] = mat[2][0] * res[0] + mat[2][1] * res[1] + mat[2][2] * res[2] + mat[2][3] * res[3];
    res[3] = mat[3][0] * res[0] + mat[3][1] * res[1] + mat[3][2] * res[2] + mat[3][3] * res[3];
    return res;
#endif
}

示例#20

0

显示文件

文件： FastResampler_FirFilter_SSE2.cpp 项目： Faik-man/ssr

void FastResampler_FirFilter2_C2_SSE2(unsigned int channels, unsigned int filter_length, float* coef1, float* coef2, float frac, float* input, float* output) {
	Q_UNUSED(channels);
	__m128 sum = _mm_setzero_ps();
	__m128 v_frac = _mm_set1_ps(frac);
	for(unsigned int i = 0; i < filter_length / 4; ++i) {
		__m128 v_coef1 = _mm_load_ps(coef1), v_coef2 = _mm_load_ps(coef2);
		coef1 += 4; coef2 += 4;
		__m128 filter_value = _mm_add_ps(v_coef1, _mm_mul_ps(_mm_sub_ps(v_coef2, v_coef1), v_frac));
		__m128 v_input1 = _mm_loadu_ps(input), v_input2 = _mm_loadu_ps(input + 4);
		input += 8;
		sum = _mm_add_ps(sum, _mm_mul_ps(v_input1, _mm_unpacklo_ps(filter_value, filter_value)));
		sum = _mm_add_ps(sum, _mm_mul_ps(v_input2, _mm_unpackhi_ps(filter_value, filter_value)));
	}
	__m128 sum2 = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, 0xee));
	_mm_store_sd((double*) output, _mm_castps_pd(sum2));
}

示例#21

0

显示文件

文件： Quat.cpp 项目： Garfield-Chen/tng

void Quat::ToAxisAngle(float4 &axis, float &angle) const
{
#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE)
	// Best: 35.332 nsecs / 94.328 ticks, Avg: 35.870 nsecs, Worst: 57.607 nsecs
	assume2(this->IsNormalized(), *this, this->Length());
	simd4f cosAngle = _mm_shuffle_ps(q, q, _MM_SHUFFLE(3, 3, 3, 3));
	simd4f rcpSinAngle = rsqrt_ps(sub_ps(set1_ps(1.f), mul_ps(cosAngle, cosAngle)));
	angle = Acos(s4f_x(cosAngle)) * 2.f;
	simd4f a = mul_ps(q, rcpSinAngle);

	// Set the w component to zero.
	simd4f highPart = _mm_unpackhi_ps(a, zero_ps()); // [_ _ 0 z]
	axis.v = _mm_movelh_ps(a, highPart); // [0 z y x]
#else
	// Best: 85.258 nsecs / 227.656 ticks, Avg: 85.492 nsecs, Worst: 86.410 nsecs
	ToAxisAngle(reinterpret_cast<float3&>(axis), angle);
	axis.w = 0.f;
#endif
}

示例#22

0

显示文件

文件： Quat.cpp 项目： ChunHungLiu/MathGeoLib

void Quat::SetFromAxisAngle(const float4 &axis, float angle)
{
	assume1(EqualAbs(axis.w, 0.f), axis);
	assume2(axis.IsNormalized(1e-4f), axis, axis.Length4());
	assume1(MATH_NS::IsFinite(angle), angle);

#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE2)
	// Best: 26.499 nsecs / 71.024 ticks, Avg: 26.856 nsecs, Worst: 27.651 nsecs
	simd4f halfAngle = set1_ps(0.5f*angle);
	simd4f sinAngle, cosAngle;
	sincos_ps(halfAngle, &sinAngle, &cosAngle);
	simd4f quat = mul_ps(axis, sinAngle);

	// Set the w component to cosAngle.
	simd4f highPart = _mm_unpackhi_ps(quat, cosAngle); // [_ _ 1 z]
	q = _mm_movelh_ps(quat, highPart); // [1 z y x]
#else
	// Best: 36.868 nsecs / 98.312 ticks, Avg: 36.980 nsecs, Worst: 41.477 nsecs
	SetFromAxisAngle(axis.xyz(), angle);
#endif
}

示例#23

0

显示文件

文件： SimdFrustumSse.cpp 项目： euler0/Helium

/// Test whether this frustum intersects a given axis-aligned bounding box in world space.
///
/// @param[in] rBox  Box to test.
///
/// @return  True if the box intersects this frustum, false if not.
bool Helium::Simd::Frustum::Intersects( const AaBox& rBox ) const
{
    Helium::Simd::Register boxMinVec = rBox.GetMinimum().GetSimdVector();
    Helium::Simd::Register boxMaxVec = rBox.GetMaximum().GetSimdVector();

    Helium::Simd::Register boxX0 = _mm_shuffle_ps( boxMinVec, boxMinVec, _MM_SHUFFLE( 0, 0, 0, 0 ) );
    Helium::Simd::Register boxX1 = _mm_shuffle_ps( boxMaxVec, boxMaxVec, _MM_SHUFFLE( 0, 0, 0, 0 ) );
    Helium::Simd::Register boxY = _mm_shuffle_ps( boxMinVec, boxMaxVec, _MM_SHUFFLE( 1, 1, 1, 1 ) );
    Helium::Simd::Register boxZ = _mm_unpackhi_ps( boxMinVec, boxMaxVec );
    boxZ = _mm_movelh_ps( boxZ, boxZ );

    PlaneSoa plane;
    Vector3Soa points( boxX0, boxY, boxZ );
    Helium::Simd::Register zeroVec = Helium::Simd::LoadZeros();

    size_t planeCount = ( m_bInfiniteFarClip ? PLANE_FAR : PLANE_MAX );
    for( size_t planeIndex = 0; planeIndex < planeCount; ++planeIndex )
    {
        plane.Load1Splat(
            m_planeA + planeIndex,
            m_planeB + planeIndex,
            m_planeC + planeIndex,
            m_planeD + planeIndex );

        points.m_x = boxX0;
        Helium::Simd::Mask containsPoints0 = Helium::Simd::GreaterEqualsF32( plane.GetDistance( points ), zeroVec );

        points.m_x = boxX1;
        Helium::Simd::Mask containsPoints1 = Helium::Simd::GreaterEqualsF32( plane.GetDistance( points ), zeroVec );

        int resultMask = _mm_movemask_ps( Helium::Simd::Or( containsPoints0, containsPoints1 ) );
        if( resultMask == 0 )
        {
            return false;
        }
    }

    return true;
}

示例#24

0

显示文件

文件： Quat.cpp 项目： Garfield-Chen/tng

vec Quat::Axis() const
{
	assume2(this->IsNormalized(), *this, this->Length());
#if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE)
	// Best: 6.145 nsecs / 16.88 ticks, Avg: 6.367 nsecs, Worst: 6.529 nsecs
	assume2(this->IsNormalized(), *this, this->Length());
	simd4f cosAngle = _mm_shuffle_ps(q, q, _MM_SHUFFLE(3, 3, 3, 3));
	simd4f rcpSinAngle = rsqrt_ps(sub_ps(set1_ps(1.f), mul_ps(cosAngle, cosAngle)));
	simd4f a = mul_ps(q, rcpSinAngle);

	// Set the w component to zero.
	simd4f highPart = _mm_unpackhi_ps(a, zero_ps()); // [_ _ 0 z]
	a = _mm_movelh_ps(a, highPart); // [0 z y x]
	return FLOAT4_TO_DIR(a);
#else
	// Best: 6.529 nsecs / 18.152 ticks, Avg: 6.851 nsecs, Worst: 8.065 nsecs

	// Convert cos to sin via the identity sin^2 + cos^2 = 1, and fuse reciprocal and square root to the same instruction,
	// since we are about to divide by it.
	float rcpSinAngle = RSqrt(1.f - w*w);
	return DIR_VEC(x, y, z) * rcpSinAngle;
#endif
}

示例#25

0

显示文件

文件： aec_rdft_sse2.c 项目： 0x0B501E7E/webrtc

static void rftbsub_128_SSE2(float* a) {
  const float* c = rdft_w + 32;
  int j1, j2, k1, k2;
  float wkr, wki, xr, xi, yr, yi;

  static const ALIGN16_BEG float ALIGN16_END
      k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f};
  const __m128 mm_half = _mm_load_ps(k_half);

  a[1] = -a[1];
  // Vectorized code (four at once).
  //    Note: commented number are indexes for the first iteration of the loop.
  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
    // Load 'wk'.
    const __m128 c_j1 = _mm_loadu_ps(&c[j1]);       //  1,  2,  3,  4,
    const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);  // 28, 29, 30, 31,
    const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);  // 28, 29, 30, 31,
    const __m128 wkr_ =
        _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3));  // 31, 30, 29, 28,
    const __m128 wki_ = c_j1;                                 //  1,  2,  3,  4,
    // Load and shuffle 'a'.
    const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]);    //   2,   3,   4,   5,
    const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]);    //   6,   7,   8,   9,
    const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]);  // 120, 121, 122, 123,
    const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]);  // 124, 125, 126, 127,
    const __m128 a_j2_p0 = _mm_shuffle_ps(
        a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0));  //   2,   4,   6,   8,
    const __m128 a_j2_p1 = _mm_shuffle_ps(
        a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1));  //   3,   5,   7,   9,
    const __m128 a_k2_p0 = _mm_shuffle_ps(
        a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2));  // 126, 124, 122, 120,
    const __m128 a_k2_p1 = _mm_shuffle_ps(
        a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3));  // 127, 125, 123, 121,
    // Calculate 'x'.
    const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
    // 2-126, 4-124, 6-122, 8-120,
    const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
    // 3-127, 5-125, 7-123, 9-121,
    // Calculate product into 'y'.
    //    yr = wkr * xr + wki * xi;
    //    yi = wkr * xi - wki * xr;
    const __m128 a_ = _mm_mul_ps(wkr_, xr_);
    const __m128 b_ = _mm_mul_ps(wki_, xi_);
    const __m128 c_ = _mm_mul_ps(wkr_, xi_);
    const __m128 d_ = _mm_mul_ps(wki_, xr_);
    const __m128 yr_ = _mm_add_ps(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
    const __m128 yi_ = _mm_sub_ps(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
                                            // Update 'a'.
                                            //    a[j2 + 0] = a[j2 + 0] - yr;
                                            //    a[j2 + 1] = yi - a[j2 + 1];
                                            //    a[k2 + 0] = yr + a[k2 + 0];
    //    a[k2 + 1] = yi - a[k2 + 1];
    const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_);  //   2,   4,   6,   8,
    const __m128 a_j2_p1n = _mm_sub_ps(yi_, a_j2_p1);  //   3,   5,   7,   9,
    const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_);  // 126, 124, 122, 120,
    const __m128 a_k2_p1n = _mm_sub_ps(yi_, a_k2_p1);  // 127, 125, 123, 121,
    // Shuffle in right order and store.
    const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
    //   2,   3,   4,   5,
    const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
    //   6,   7,   8,   9,
    const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
    // 122, 123, 120, 121,
    const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
    // 126, 127, 124, 125,
    const __m128 a_k2_0n = _mm_shuffle_ps(
        a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2));  // 120, 121, 122, 123,
    const __m128 a_k2_4n = _mm_shuffle_ps(
        a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2));  // 124, 125, 126, 127,
    _mm_storeu_ps(&a[0 + j2], a_j2_0n);
    _mm_storeu_ps(&a[4 + j2], a_j2_4n);
    _mm_storeu_ps(&a[122 - j2], a_k2_0n);
    _mm_storeu_ps(&a[126 - j2], a_k2_4n);
  }
  // Scalar code for the remaining items.
  for (; j2 < 64; j1 += 1, j2 += 2) {
    k2 = 128 - j2;
    k1 = 32 - j1;
    wkr = 0.5f - c[k1];
    wki = c[j1];
    xr = a[j2 + 0] - a[k2 + 0];
    xi = a[j2 + 1] + a[k2 + 1];
    yr = wkr * xr + wki * xi;
    yi = wkr * xi - wki * xr;
    a[j2 + 0] = a[j2 + 0] - yr;
    a[j2 + 1] = yi - a[j2 + 1];
    a[k2 + 0] = yr + a[k2 + 0];
    a[k2 + 1] = yi - a[k2 + 1];
  }
  a[65] = -a[65];
}

示例#26

0

显示文件

文件： AudioSource.cpp 项目： ascendedguard/OBS

UINT AudioSource::QueryAudio(float curVolume)
{
    LPVOID buffer;
    UINT numAudioFrames;
    QWORD newTimestamp;

    if(GetNextBuffer((void**)&buffer, &numAudioFrames, &newTimestamp))
    {
        //------------------------------------------------------------
        // convert to float

        float *captureBuffer;

        if(!bFloat)
        {
            UINT totalSamples = numAudioFrames*inputChannels;
            if(convertBuffer.Num() < totalSamples)
                convertBuffer.SetSize(totalSamples);

            if(inputBitsPerSample == 8)
            {
                float *tempConvert = convertBuffer.Array();
                char *tempSByte = (char*)buffer;

                while(totalSamples--)
                {
                    *(tempConvert++) = float(*(tempSByte++))/127.0f;
                }
            }
            else if(inputBitsPerSample == 16)
            {
                float *tempConvert = convertBuffer.Array();
                short *tempShort = (short*)buffer;

                while(totalSamples--)
                {
                    *(tempConvert++) = float(*(tempShort++))/32767.0f;
                }
            }
            else if(inputBitsPerSample == 24)
            {
                float *tempConvert = convertBuffer.Array();
                BYTE *tempTriple = (BYTE*)buffer;
                TripleToLong valOut;

                while(totalSamples--)
                {
                    TripleToLong &valIn  = (TripleToLong&)tempTriple;

                    valOut.wVal = valIn.wVal;
                    valOut.tripleVal = valIn.tripleVal;
                    if(valOut.tripleVal > 0x7F)
                        valOut.lastByte = 0xFF;

                    *(tempConvert++) = float(double(valOut.val)/8388607.0);
                    tempTriple += 3;
                }
            }
            else if(inputBitsPerSample == 32)
            {
                float *tempConvert = convertBuffer.Array();
                long *tempShort = (long*)buffer;

                while(totalSamples--)
                {
                    *(tempConvert++) = float(double(*(tempShort++))/2147483647.0);
                }
            }

            captureBuffer = convertBuffer.Array();
        }
        else
            captureBuffer = (float*)buffer;

        //------------------------------------------------------------
        // channel upmix/downmix

        if(tempBuffer.Num() < numAudioFrames*2)
            tempBuffer.SetSize(numAudioFrames*2);

        float *dataOutputBuffer = tempBuffer.Array();
        float *tempOut = dataOutputBuffer;

        if(inputChannels == 1)
        {
            UINT  numFloats   = numAudioFrames;
            float *inputTemp  = (float*)captureBuffer;
            float *outputTemp = dataOutputBuffer;

            if((UPARAM(inputTemp) & 0xF) == 0 && (UPARAM(outputTemp) & 0xF) == 0)
            {
                UINT alignedFloats = numFloats & 0xFFFFFFFC;
                for(UINT i=0; i<alignedFloats; i += 4)
                {
                    __m128 inVal   = _mm_load_ps(inputTemp+i);

                    __m128 outVal1 = _mm_unpacklo_ps(inVal, inVal);
                    __m128 outVal2 = _mm_unpackhi_ps(inVal, inVal);

                    _mm_store_ps(outputTemp+(i*2),   outVal1);
                    _mm_store_ps(outputTemp+(i*2)+4, outVal2);
                }

                numFloats  -= alignedFloats;
                inputTemp  += alignedFloats;
                outputTemp += alignedFloats*2;
            }

            while(numFloats--)
            {
                float inputVal = *inputTemp;
                *(outputTemp++) = inputVal;
                *(outputTemp++) = inputVal;

                inputTemp++;
            }
        }
        else if(inputChannels == 2) //straight up copy
        {
            SSECopy(dataOutputBuffer, captureBuffer, numAudioFrames*2*sizeof(float));
        }
        else
        {
            //todo: downmix optimization, also support for other speaker configurations than ones I can merely "think" of.  ugh.
            float *inputTemp  = (float*)captureBuffer;
            float *outputTemp = dataOutputBuffer;

            if(inputChannelMask == KSAUDIO_SPEAKER_QUAD)
            {
                UINT numFloats = numAudioFrames*4;
                float *endTemp = inputTemp+numFloats;

                while(inputTemp < endTemp)
                {
                    float left      = inputTemp[0];
                    float right     = inputTemp[1];
                    float rearLeft  = inputTemp[2]*surroundMix4;
                    float rearRight = inputTemp[3]*surroundMix4;

                    // When in doubt, use only left and right .... and rear left and rear right :) 
                    // Same idea as with 5.1 downmix

                    *(outputTemp++) = (left  + rearLeft)  * attn4dotX;
                    *(outputTemp++) = (right + rearRight) * attn4dotX;

                    inputTemp  += 4;
                }
            }
            else if(inputChannelMask == KSAUDIO_SPEAKER_2POINT1)
            {
                UINT numFloats = numAudioFrames*3;
                float *endTemp = inputTemp+numFloats;

                while(inputTemp < endTemp)
                {
                    float left      = inputTemp[0];
                    float right     = inputTemp[1];
                   
                    // Drop LFE since we don't need it
                    //float lfe       = inputTemp[2]*lowFreqMix;

                    *(outputTemp++) = left;
                    *(outputTemp++) = right;

                    inputTemp  += 3;
                }
            }
            else if(inputChannelMask == KSAUDIO_SPEAKER_4POINT1)
            {
                UINT numFloats = numAudioFrames*5;
                float *endTemp = inputTemp+numFloats;

                while(inputTemp < endTemp)
                {
                    float left      = inputTemp[0];
                    float right     = inputTemp[1];

                    // Skip LFE , we don't really need it.
                    //float lfe       = inputTemp[2];

                    float rearLeft  = inputTemp[3]*surroundMix4;
                    float rearRight = inputTemp[4]*surroundMix4;

                    // Same idea as with 5.1 downmix

                    *(outputTemp++) = (left  + rearLeft)  * attn4dotX;
                    *(outputTemp++) = (right + rearRight) * attn4dotX;

                    inputTemp  += 5;
                }
            }
            else if(inputChannelMask == KSAUDIO_SPEAKER_SURROUND)
            {
                UINT numFloats = numAudioFrames*4;
                float *endTemp = inputTemp+numFloats;

                while(inputTemp < endTemp)
                {
                    float left      = inputTemp[0];
                    float right     = inputTemp[1];
                    float frontCenter    = inputTemp[2];
                    float rearCenter     = inputTemp[3];
                    
                    // When in doubt, use only left and right :) Seriously.
                    // THIS NEEDS TO BE PROPERLY IMPLEMENTED!

                    *(outputTemp++) = left;
                    *(outputTemp++) = right;

                    inputTemp  += 4;
                }
            }
            // Both speakers configs share the same format, the difference is in rear speakers position 
            // See: http://msdn.microsoft.com/en-us/library/windows/hardware/ff537083(v=vs.85).aspx
            // Probably for KSAUDIO_SPEAKER_5POINT1_SURROUND we will need a different coefficient for rear left/right
            else if(inputChannelMask == KSAUDIO_SPEAKER_5POINT1 || inputChannelMask == KSAUDIO_SPEAKER_5POINT1_SURROUND)
            {
                UINT numFloats = numAudioFrames*6;
                float *endTemp = inputTemp+numFloats;

                while(inputTemp < endTemp)
                {
                    float left      = inputTemp[0];
                    float right     = inputTemp[1];
                    float center    = inputTemp[2]*centerMix;
                    
                    //We don't need LFE channel so skip it (see below)
                    //float lowFreq   = inputTemp[3]*lowFreqMix;
                    
                    float rearLeft  = inputTemp[4]*surroundMix;
                    float rearRight = inputTemp[5]*surroundMix;
                    
                    // According to ITU-R  BS.775-1 recommendation, the downmix from a 3/2 source to stereo
                    // is the following:
                    // L = FL + k0*C + k1*RL
                    // R = FR + k0*C + k1*RR
                    // FL = front left
                    // FR = front right
                    // C  = center
                    // RL = rear left
                    // RR = rear right
                    // k0 = centerMix   = dbMinus3 = 0.7071067811865476 [for k0 we can use dbMinus6 = 0.5 too, probably it's better]
                    // k1 = surroundMix = dbMinus3 = 0.7071067811865476

                    // The output (L,R) can be out of (-1,1) domain so we attenuate it [ attn5dot1 = 1/(1 + centerMix + surroundMix) ]
                    // Note: this method of downmixing is far from "perfect" (pretty sure it's not the correct way) but the resulting downmix is "okayish", at least no more bleeding ears.
                    // (maybe have a look at http://forum.doom9.org/archive/index.php/t-148228.html too [ 5.1 -> stereo ] the approach seems almost the same [but different coefficients])

                    
                    // http://acousticsfreq.com/blog/wp-content/uploads/2012/01/ITU-R-BS775-1.pdf
                    // http://ir.lib.nctu.edu.tw/bitstream/987654321/22934/1/030104001.pdf

                    *(outputTemp++) = (left  + center  + rearLeft) * attn5dot1;
                    *(outputTemp++) = (right + center  + rearRight) * attn5dot1;

                    inputTemp  += 6;
                }
            }

            // According to http://msdn.microsoft.com/en-us/library/windows/hardware/ff537083(v=vs.85).aspx
            // KSAUDIO_SPEAKER_7POINT1 is obsolete and no longer supported in Windows Vista and later versions of Windows
            // Not sure what to do about it, meh , drop front left of center/front right of center -> 5.1 -> stereo; 

            else if(inputChannelMask == KSAUDIO_SPEAKER_7POINT1)
            {
                UINT numFloats = numAudioFrames*8;
                float *endTemp = inputTemp+numFloats;

                while(inputTemp < endTemp)
                {
                    float left          = inputTemp[0];
                    float right         = inputTemp[1];
                    
                    float center        = inputTemp[2] * centerMix;
                    
                    // Drop LFE since we don't need it
                    //float lowFreq       = inputTemp[3]*lowFreqMix;
                    
                    float rearLeft      = inputTemp[4] * surroundMix;
                    float rearRight     = inputTemp[5] * surroundMix;

                    // Drop SPEAKER_FRONT_LEFT_OF_CENTER , SPEAKER_FRONT_RIGHT_OF_CENTER
                    //float centerLeft    = inputTemp[6];
                    //float centerRight   = inputTemp[7];
                    
                    // Downmix from 5.1 to stereo
                    *(outputTemp++) = (left  + center  + rearLeft)  * attn5dot1;
                    *(outputTemp++) = (right + center  + rearRight) * attn5dot1;

                    inputTemp  += 8;
                }
            }

            // Downmix to 5.1 (easy stuff) then downmix to stereo as done in KSAUDIO_SPEAKER_5POINT1
            else if(inputChannelMask == KSAUDIO_SPEAKER_7POINT1_SURROUND)
            {
                UINT numFloats = numAudioFrames*8;
                float *endTemp = inputTemp+numFloats;

                while(inputTemp < endTemp)
                {
                    float left      = inputTemp[0];
                    float right     = inputTemp[1];
                    float center    = inputTemp[2] * centerMix;

                    // Skip LFE we don't need it
                    //float lowFreq   = inputTemp[3]*lowFreqMix;

                    float rearLeft  = inputTemp[4];
                    float rearRight = inputTemp[5];
                    float sideLeft  = inputTemp[6];
                    float sideRight = inputTemp[7];

                    // combine the rear/side channels first , baaam! 5.1
                    rearLeft  = (rearLeft  + sideLeft)  * 0.5f;
                    rearRight = (rearRight + sideRight) * 0.5f;
                    
                    // downmix to stereo as in 5.1 case
                    *(outputTemp++) = (left  + center + rearLeft  * surroundMix) * attn5dot1;
                    *(outputTemp++) = (right + center + rearRight * surroundMix) * attn5dot1;

                    inputTemp  += 8;
                }
            }
        }

        ReleaseBuffer();

        //------------------------------------------------------------
        // resample

        if(bResample)
        {
            UINT frameAdjust = UINT((double(numAudioFrames) * resampleRatio) + 1.0);
            UINT newFrameSize = frameAdjust*2;

            if(tempResampleBuffer.Num() < newFrameSize)
                tempResampleBuffer.SetSize(newFrameSize);

            SRC_DATA data;
            data.src_ratio = resampleRatio;

            data.data_in = tempBuffer.Array();
            data.input_frames = numAudioFrames;

            data.data_out = tempResampleBuffer.Array();
            data.output_frames = frameAdjust;

            data.end_of_input = 0;

            int err = src_process((SRC_STATE*)resampler, &data);
            if(err)
            {
                RUNONCE AppWarning(TEXT("AudioSource::QueryAudio: Was unable to resample audio for device '%s'"), GetDeviceName());
                return NoAudioAvailable;
            }

            if(data.input_frames_used != numAudioFrames)
            {
                RUNONCE AppWarning(TEXT("AudioSource::QueryAudio: Failed to downsample buffer completely, which shouldn't actually happen because it should be using 10ms of samples"));
                return NoAudioAvailable;
            }

            numAudioFrames = data.output_frames_gen;
        }

        //-----------------------------------------------------------------------------
        // sort all audio frames into 10 millisecond increments (done because not all devices output in 10ms increments)
        // NOTE: 0.457+ - instead of using the timestamps from windows, just compare and make sure it stays within a 100ms of their timestamps

        if(!bFirstBaseFrameReceived)
        {
            lastUsedTimestamp = newTimestamp;
            bFirstBaseFrameReceived = true;
        }

        float *newBuffer = (bResample) ? tempResampleBuffer.Array() : tempBuffer.Array();

        if (bSmoothTimestamps) {
            lastUsedTimestamp += 10;

            QWORD difVal = GetQWDif(newTimestamp, lastUsedTimestamp);
            if(difVal > 70)
            {
                //OSDebugOut(TEXT("----------------------------1\r\nlastUsedTimestamp before: %llu - device: %s\r\n"), lastUsedTimestamp, GetDeviceName());
                lastUsedTimestamp = newTimestamp;
                //OSDebugOut(TEXT("lastUsedTimestamp after: %llu\r\n"), lastUsedTimestamp);
            }

            if(lastUsedTimestamp > lastSentTimestamp)
            {
                QWORD adjustVal = (lastUsedTimestamp-lastSentTimestamp);
                if(adjustVal < 10)
                    lastUsedTimestamp += 10-adjustVal;

                AudioSegment *newSegment = new AudioSegment(newBuffer, numAudioFrames*2, lastUsedTimestamp);
                AddAudioSegment(newSegment, curVolume*sourceVolume);

                lastSentTimestamp = lastUsedTimestamp;
            }
        } else {
           // OSDebugOut(TEXT("newTimestamp: %llu\r\n"), newTimestamp);
            AudioSegment *newSegment = new AudioSegment(newBuffer, numAudioFrames*2, newTimestamp);
            AddAudioSegment(newSegment, curVolume*sourceVolume);
        }

        //-----------------------------------------------------------------------------

        return AudioAvailable;
    }

    return NoAudioAvailable;
}

示例#27

0

显示文件

文件： SPaRTaSSE.c 项目： asolis/detection3D

static inline void   sacEvaluateModelSPRT(PROSAC_HEST* p){
	unsigned i;
	unsigned isInlier;
	double   lambda       = 1.0;
	double   lambdaReject = ((1.0 - p->delta) / (1.0 - p->epsilon));
	double   lambdaAccept = ((   p->delta   ) / (    p->epsilon  ));
	float    distSq = p->maxD*p->maxD;
	float*   src = (float*)p->src;
	float*   dst = (float*)p->dst;
	float*   H   = p->H;
	
	
	p->inl      = 0;
	p->N_tested = 0;
	p->good     = 1;
	
	
	/* VECTOR */
	const __m128 distSqV=_mm_set1_ps(distSq);
	
	const __m128 H00=_mm_set1_ps(H[0]);
	const __m128 H01=_mm_set1_ps(H[1]);
	const __m128 H02=_mm_set1_ps(H[2]);
	const __m128 H10=_mm_set1_ps(H[4]);
	const __m128 H11=_mm_set1_ps(H[5]);
	const __m128 H12=_mm_set1_ps(H[6]);
	const __m128 H20=_mm_set1_ps(H[8]);
	const __m128 H21=_mm_set1_ps(H[9]);
	const __m128 H22=_mm_set1_ps(H[10]);
	
	for(i=0;i<(p->N-3) && p->good;i+=4){
		/* Backproject */
		__m128 x, y, X, Y, inter0, inter1, inter2, inter3;
		x=_mm_load_ps(src+2*i);
		y=_mm_load_ps(src+2*i+4);
		X=_mm_load_ps(dst+2*i);
		Y=_mm_load_ps(dst+2*i+4);
		
		inter0=_mm_unpacklo_ps(x,y);// y1 y0 x1 x0
		inter1=_mm_unpackhi_ps(x,y);// y3 y2 x3 x2
		inter2=_mm_unpacklo_ps(X,Y);// Y1 Y0 X1 X0
		inter3=_mm_unpackhi_ps(X,Y);// Y3 Y2 X3 X2
		
		x=_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(inter0), _mm_castps_pd(inter1)));
		y=_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(inter0), _mm_castps_pd(inter1)));
		X=_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(inter2), _mm_castps_pd(inter3)));
		Y=_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(inter2), _mm_castps_pd(inter3)));
		
		__m128 reprojX = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H00, x), _mm_mul_ps(H01, y)), H02);
		__m128 reprojY = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H10, x), _mm_mul_ps(H11, y)), H12);
		__m128 reprojZ = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H20, x), _mm_mul_ps(H21, y)), H22);
		
		__m128 recipZ = _mm_rcp_ps(reprojZ);
		reprojX = _mm_mul_ps(reprojX, recipZ);
		reprojY = _mm_mul_ps(reprojY, recipZ);
		//reprojX = _mm_div_ps(reprojX, reprojZ);
		//reprojY = _mm_div_ps(reprojY, reprojZ);
		
		reprojX = _mm_sub_ps(reprojX, X);
		reprojY = _mm_sub_ps(reprojY, Y);
		
		reprojX = _mm_mul_ps(reprojX, reprojX);
		reprojY = _mm_mul_ps(reprojY, reprojY);
		
		__m128 reprojDistV = _mm_add_ps(reprojX, reprojY);
		
		__m128 cmp = _mm_cmple_ps(reprojDistV, distSqV);
		int msk = _mm_movemask_ps(cmp);
		
		/* ... */
		/*                   0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15*/
		unsigned bitCnt[] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
		p->inl     += bitCnt[msk];
		
		
		/* SPRT */
		lambda *= p->lambdaTBL[msk];
		p->good = lambda <= p->A;
		/* If !p->good, the threshold A was exceeded, so we're rejecting */
	}
	
	/* SCALAR */
	for(;i<p->N && p->good;i++){
		/* Backproject */
		float x=src[i*2],y=src[i*2+1];
		float X=dst[i*2],Y=dst[i*2+1];
		
		float reprojX=H[0]*x+H[1]*y+H[2]; //  ( X_1 )     ( H_11 H_12    H_13  ) (x_1)
		float reprojY=H[4]*x+H[5]*y+H[6]; //  ( X_2 )  =  ( H_21 H_22    H_23  ) (x_2)
		float reprojZ=H[8]*x+H[9]*y+H[10];//  ( X_3 )     ( H_31 H_32 H_33=1.0 ) (x_3 = 1.0)
		
		//reproj is in homogeneous coordinates. To bring back to "regular" coordinates, divide by Z.
		reprojX/=reprojZ;
		reprojY/=reprojZ;
		
		//Compute distance
		reprojX-=X;
		reprojY-=Y;
		reprojX*=reprojX;
		reprojY*=reprojY;
		float reprojDist = reprojX+reprojY;
		
		/* ... */
		isInlier    = reprojDist <= distSq;
		p->inl     += isInlier;
		
		
		/* SPRT */
		lambda *= isInlier ? lambdaAccept : lambdaReject;
		p->good = lambda <= p->A;
		/* If !p->good, the threshold A was exceeded, so we're rejecting */
	}
	
	
	p->N_tested = i;
}

示例#28

0

显示文件

文件： ModelDecal.cpp 项目： Deepfreeze32/taken

/*
============
R_DecalPointCullStatic
============
*/
static void R_DecalPointCullStatic( byte * cullBits, const idPlane * planes, const idDrawVert * verts, const int numVerts ) {
	assert_16_byte_aligned( cullBits );
	assert_16_byte_aligned( verts );


	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );

	const __m128 vector_float_zero	= { 0.0f, 0.0f, 0.0f, 0.0f };
	const __m128i vector_int_mask0	= _mm_set1_epi32( 1 << 0 );
	const __m128i vector_int_mask1	= _mm_set1_epi32( 1 << 1 );
	const __m128i vector_int_mask2	= _mm_set1_epi32( 1 << 2 );
	const __m128i vector_int_mask3	= _mm_set1_epi32( 1 << 3 );
	const __m128i vector_int_mask4	= _mm_set1_epi32( 1 << 4 );
	const __m128i vector_int_mask5	= _mm_set1_epi32( 1 << 5 );

	const __m128 p0 = _mm_loadu_ps( planes[0].ToFloatPtr() );
	const __m128 p1 = _mm_loadu_ps( planes[1].ToFloatPtr() );
	const __m128 p2 = _mm_loadu_ps( planes[2].ToFloatPtr() );
	const __m128 p3 = _mm_loadu_ps( planes[3].ToFloatPtr() );
	const __m128 p4 = _mm_loadu_ps( planes[4].ToFloatPtr() );
	const __m128 p5 = _mm_loadu_ps( planes[5].ToFloatPtr() );

	const __m128 p0X = _mm_splat_ps( p0, 0 );
	const __m128 p0Y = _mm_splat_ps( p0, 1 );
	const __m128 p0Z = _mm_splat_ps( p0, 2 );
	const __m128 p0W = _mm_splat_ps( p0, 3 );

	const __m128 p1X = _mm_splat_ps( p1, 0 );
	const __m128 p1Y = _mm_splat_ps( p1, 1 );
	const __m128 p1Z = _mm_splat_ps( p1, 2 );
	const __m128 p1W = _mm_splat_ps( p1, 3 );

	const __m128 p2X = _mm_splat_ps( p2, 0 );
	const __m128 p2Y = _mm_splat_ps( p2, 1 );
	const __m128 p2Z = _mm_splat_ps( p2, 2 );
	const __m128 p2W = _mm_splat_ps( p2, 3 );

	const __m128 p3X = _mm_splat_ps( p3, 0 );
	const __m128 p3Y = _mm_splat_ps( p3, 1 );
	const __m128 p3Z = _mm_splat_ps( p3, 2 );
	const __m128 p3W = _mm_splat_ps( p3, 3 );

	const __m128 p4X = _mm_splat_ps( p4, 0 );
	const __m128 p4Y = _mm_splat_ps( p4, 1 );
	const __m128 p4Z = _mm_splat_ps( p4, 2 );
	const __m128 p4W = _mm_splat_ps( p4, 3 );

	const __m128 p5X = _mm_splat_ps( p5, 0 );
	const __m128 p5Y = _mm_splat_ps( p5, 1 );
	const __m128 p5Z = _mm_splat_ps( p5, 2 );
	const __m128 p5W = _mm_splat_ps( p5, 3 );

	for ( int i = 0; i < numVerts; ) {

		const int nextNumVerts = vertsODS.FetchNextBatch() - 4;

		for ( ; i <= nextNumVerts; i += 4 ) {
			const __m128 v0 = _mm_load_ps( vertsODS[i + 0].xyz.ToFloatPtr() );
			const __m128 v1 = _mm_load_ps( vertsODS[i + 1].xyz.ToFloatPtr() );
			const __m128 v2 = _mm_load_ps( vertsODS[i + 2].xyz.ToFloatPtr() );
			const __m128 v3 = _mm_load_ps( vertsODS[i + 3].xyz.ToFloatPtr() );

			const __m128 r0 = _mm_unpacklo_ps( v0, v2 );	// v0.x, v2.x, v0.z, v2.z
			const __m128 r1 = _mm_unpackhi_ps( v0, v2 );	// v0.y, v2.y, v0.w, v2.w
			const __m128 r2 = _mm_unpacklo_ps( v1, v3 );	// v1.x, v3.x, v1.z, v3.z
			const __m128 r3 = _mm_unpackhi_ps( v1, v3 );	// v1.y, v3.y, v1.w, v3.w

			const __m128 vX = _mm_unpacklo_ps( r0, r2 );	// v0.x, v1.x, v2.x, v3.x
			const __m128 vY = _mm_unpackhi_ps( r0, r2 );	// v0.y, v1.y, v2.y, v3.y
			const __m128 vZ = _mm_unpacklo_ps( r1, r3 );	// v0.z, v1.z, v2.z, v3.z

			const __m128 d0 = _mm_madd_ps( vX, p0X, _mm_madd_ps( vY, p0Y, _mm_madd_ps( vZ, p0Z, p0W ) ) );
			const __m128 d1 = _mm_madd_ps( vX, p1X, _mm_madd_ps( vY, p1Y, _mm_madd_ps( vZ, p1Z, p1W ) ) );
			const __m128 d2 = _mm_madd_ps( vX, p2X, _mm_madd_ps( vY, p2Y, _mm_madd_ps( vZ, p2Z, p2W ) ) );
			const __m128 d3 = _mm_madd_ps( vX, p3X, _mm_madd_ps( vY, p3Y, _mm_madd_ps( vZ, p3Z, p3W ) ) );
			const __m128 d4 = _mm_madd_ps( vX, p4X, _mm_madd_ps( vY, p4Y, _mm_madd_ps( vZ, p4Z, p4W ) ) );
			const __m128 d5 = _mm_madd_ps( vX, p5X, _mm_madd_ps( vY, p5Y, _mm_madd_ps( vZ, p5Z, p5W ) ) );

			__m128i c0 = __m128c( _mm_cmpgt_ps( d0, vector_float_zero ) );
			__m128i c1 = __m128c( _mm_cmpgt_ps( d1, vector_float_zero ) );
			__m128i c2 = __m128c( _mm_cmpgt_ps( d2, vector_float_zero ) );
			__m128i c3 = __m128c( _mm_cmpgt_ps( d3, vector_float_zero ) );
			__m128i c4 = __m128c( _mm_cmpgt_ps( d4, vector_float_zero ) );
			__m128i c5 = __m128c( _mm_cmpgt_ps( d5, vector_float_zero ) );

			c0 = _mm_and_si128( c0, vector_int_mask0 );
			c1 = _mm_and_si128( c1, vector_int_mask1 );
			c2 = _mm_and_si128( c2, vector_int_mask2 );
			c3 = _mm_and_si128( c3, vector_int_mask3 );
			c4 = _mm_and_si128( c4, vector_int_mask4 );
			c5 = _mm_and_si128( c5, vector_int_mask5 );

			c0 = _mm_or_si128( c0, c1 );
			c2 = _mm_or_si128( c2, c3 );
			c4 = _mm_or_si128( c4, c5 );

			c0 = _mm_or_si128( c0, c2 );
			c0 = _mm_or_si128( c0, c4 );

			__m128i s0 = _mm_packs_epi32( c0, c0 );
			__m128i b0 = _mm_packus_epi16( s0, s0 );

			*(unsigned int *)&cullBits[i] = _mm_cvtsi128_si32( b0 );
		}
	}

}

示例#29

0

显示文件

文件： colorout.c 项目： dirkbr/darktable

void process(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid,
             const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  const dt_iop_colorout_data_t *const d = (dt_iop_colorout_data_t *)piece->data;
  const int ch = piece->colors;
  const int gamutcheck = (d->mode == DT_PROFILE_GAMUTCHECK);

  if(!isnan(d->cmatrix[0]))
  {
// fprintf(stderr,"Using cmatrix codepath\n");
// convert to rgb using matrix
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none) shared(roi_in, roi_out, ivoid, ovoid)
#endif
    for(int j = 0; j < roi_out->height; j++)
    {

      float *in = (float *)ivoid + (size_t)ch * roi_in->width * j;
      float *out = (float *)ovoid + (size_t)ch * roi_out->width * j;
      const __m128 m0 = _mm_set_ps(0.0f, d->cmatrix[6], d->cmatrix[3], d->cmatrix[0]);
      const __m128 m1 = _mm_set_ps(0.0f, d->cmatrix[7], d->cmatrix[4], d->cmatrix[1]);
      const __m128 m2 = _mm_set_ps(0.0f, d->cmatrix[8], d->cmatrix[5], d->cmatrix[2]);

      for(int i = 0; i < roi_out->width; i++, in += ch, out += ch)
      {
        const __m128 xyz = dt_Lab_to_XYZ_SSE(_mm_load_ps(in));
        const __m128 t
            = _mm_add_ps(_mm_mul_ps(m0, _mm_shuffle_ps(xyz, xyz, _MM_SHUFFLE(0, 0, 0, 0))),
                         _mm_add_ps(_mm_mul_ps(m1, _mm_shuffle_ps(xyz, xyz, _MM_SHUFFLE(1, 1, 1, 1))),
                                    _mm_mul_ps(m2, _mm_shuffle_ps(xyz, xyz, _MM_SHUFFLE(2, 2, 2, 2)))));

        _mm_stream_ps(out, t);
      }
    }
    _mm_sfence();
// apply profile
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none) shared(roi_in, roi_out, ivoid, ovoid)
#endif
    for(int j = 0; j < roi_out->height; j++)
    {

      float *in = (float *)ivoid + (size_t)ch * roi_in->width * j;
      float *out = (float *)ovoid + (size_t)ch * roi_out->width * j;

      for(int i = 0; i < roi_out->width; i++, in += ch, out += ch)
      {
        for(int i = 0; i < 3; i++)
          if(d->lut[i][0] >= 0.0f)
          {
            out[i] = (out[i] < 1.0f) ? lerp_lut(d->lut[i], out[i])
                                     : dt_iop_eval_exp(d->unbounded_coeffs[i], out[i]);
          }
      }
    }
  }
  else
  {
    // fprintf(stderr,"Using xform codepath\n");
    const __m128 outofgamutpixel = _mm_set_ps(0.0f, 1.0f, 1.0f, 0.0f);
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none) shared(ivoid, ovoid, roi_out)
#endif
    for(int k = 0; k < roi_out->height; k++)
    {
      const float *in = ((float *)ivoid) + (size_t)ch * k * roi_out->width;
      float *out = ((float *)ovoid) + (size_t)ch * k * roi_out->width;

      if(!gamutcheck)
      {
        cmsDoTransform(d->xform, in, out, roi_out->width);
      }
      else
      {
        void *rgb = dt_alloc_align(16, 4 * sizeof(float) * roi_out->width);
        cmsDoTransform(d->xform, in, rgb, roi_out->width);
        float *rgbptr = (float *)rgb;
        for(int j = 0; j < roi_out->width; j++, rgbptr += 4, out += 4)
        {
          const __m128 pixel = _mm_load_ps(rgbptr);
          __m128 ingamut = _mm_cmplt_ps(pixel, _mm_set_ps(-FLT_MAX, 0.0f, 0.0f, 0.0f));

          ingamut = _mm_or_ps(_mm_unpacklo_ps(ingamut, ingamut), _mm_unpackhi_ps(ingamut, ingamut));
          ingamut = _mm_or_ps(_mm_unpacklo_ps(ingamut, ingamut), _mm_unpackhi_ps(ingamut, ingamut));

          const __m128 result
              = _mm_or_ps(_mm_and_ps(ingamut, outofgamutpixel), _mm_andnot_ps(ingamut, pixel));
          _mm_stream_ps(out, result);
        }
        dt_free_align(rgb);
      }
    }
    _mm_sfence();
  }

  if(piece->pipe->mask_display) dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}

示例#30

0

显示文件

文件： aec_core_sse2.c 项目： AGProjects/python-sipsimple

static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1]) {
  int i, j;
  for (i = 0; i < NR_PART; i++) {
    int xPos = (i + aec->xfBufBlockPos)*(PART_LEN1);
    int pos = i * PART_LEN1;
    // Check for wrap
    if (i + aec->xfBufBlockPos >= NR_PART) {
      xPos -= NR_PART * PART_LEN1;
    }

#ifdef UNCONSTR
    for (j = 0; j < PART_LEN1; j++) {
      aec->wfBuf[pos + j][0] += MulRe(aec->xfBuf[xPos + j][0],
                                      -aec->xfBuf[xPos + j][1],
                                      ef[j][0], ef[j][1]);
      aec->wfBuf[pos + j][1] += MulIm(aec->xfBuf[xPos + j][0],
                                      -aec->xfBuf[xPos + j][1],
                                      ef[j][0], ef[j][1]);
    }
#else
    // Process the whole array...
    for (j = 0; j < PART_LEN; j+= 4) {
      // Load xfBuf and ef.
      const __m128 xfBuf_re = _mm_loadu_ps(&aec->xfBuf[0][xPos + j]);
      const __m128 xfBuf_im = _mm_loadu_ps(&aec->xfBuf[1][xPos + j]);
      const __m128 ef_re = _mm_loadu_ps(&ef[0][j]);
      const __m128 ef_im = _mm_loadu_ps(&ef[1][j]);
      // Calculate the product of conjugate(xfBuf) by ef.
      //   re(conjugate(a) * b) = aRe * bRe + aIm * bIm
      //   im(conjugate(a) * b)=  aRe * bIm - aIm * bRe
      const __m128 a = _mm_mul_ps(xfBuf_re, ef_re);
      const __m128 b = _mm_mul_ps(xfBuf_im, ef_im);
      const __m128 c = _mm_mul_ps(xfBuf_re, ef_im);
      const __m128 d = _mm_mul_ps(xfBuf_im, ef_re);
      const __m128 e = _mm_add_ps(a, b);
      const __m128 f = _mm_sub_ps(c, d);
      // Interleave real and imaginary parts.
      const __m128 g = _mm_unpacklo_ps(e, f);
      const __m128 h = _mm_unpackhi_ps(e, f);
      // Store
      _mm_storeu_ps(&fft[2*j + 0], g);
      _mm_storeu_ps(&fft[2*j + 4], h);
    }
    // ... and fixup the first imaginary entry.
    fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN],
                   -aec->xfBuf[1][xPos + PART_LEN],
                   ef[0][PART_LEN], ef[1][PART_LEN]);

    aec_rdft_inverse_128(fft);
    memset(fft + PART_LEN, 0, sizeof(float)*PART_LEN);

    // fft scaling
    {
      float scale = 2.0f / PART_LEN2;
      const __m128 scale_ps = _mm_load_ps1(&scale);
      for (j = 0; j < PART_LEN; j+=4) {
        const __m128 fft_ps = _mm_loadu_ps(&fft[j]);
        const __m128 fft_scale = _mm_mul_ps(fft_ps, scale_ps);
        _mm_storeu_ps(&fft[j], fft_scale);
      }
    }
    aec_rdft_forward_128(fft);

    {
      float wt1 = aec->wfBuf[1][pos];
      aec->wfBuf[0][pos + PART_LEN] += fft[1];
      for (j = 0; j < PART_LEN; j+= 4) {
        __m128 wtBuf_re = _mm_loadu_ps(&aec->wfBuf[0][pos + j]);
        __m128 wtBuf_im = _mm_loadu_ps(&aec->wfBuf[1][pos + j]);
        const __m128 fft0 = _mm_loadu_ps(&fft[2 * j + 0]);
        const __m128 fft4 = _mm_loadu_ps(&fft[2 * j + 4]);
        const __m128 fft_re = _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2 ,0));
        const __m128 fft_im = _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3 ,1));
        wtBuf_re = _mm_add_ps(wtBuf_re, fft_re);
        wtBuf_im = _mm_add_ps(wtBuf_im, fft_im);
        _mm_storeu_ps(&aec->wfBuf[0][pos + j], wtBuf_re);
        _mm_storeu_ps(&aec->wfBuf[1][pos + j], wtBuf_im);
      }
      aec->wfBuf[1][pos] = wt1;
    }
#endif // UNCONSTR
  }
}