static void FilterAdaptationSSE2(
    int num_partitions,
    int x_fft_buf_block_pos,
    float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
    float e_fft[2][PART_LEN1],
    float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) {
  float fft[PART_LEN2];
  int i, j;
  for (i = 0; i < num_partitions; i++) {
    int xPos = (i + x_fft_buf_block_pos) * (PART_LEN1);
    int pos = i * PART_LEN1;
    // Check for wrap
    if (i + x_fft_buf_block_pos >= num_partitions) {
      xPos -= num_partitions * PART_LEN1;
    }

    // Process the whole array...
    for (j = 0; j < PART_LEN; j += 4) {
      // Load x_fft_buf and e_fft.
      const __m128 x_fft_buf_re = _mm_loadu_ps(&x_fft_buf[0][xPos + j]);
      const __m128 x_fft_buf_im = _mm_loadu_ps(&x_fft_buf[1][xPos + j]);
      const __m128 e_fft_re = _mm_loadu_ps(&e_fft[0][j]);
      const __m128 e_fft_im = _mm_loadu_ps(&e_fft[1][j]);
      // Calculate the product of conjugate(x_fft_buf) by e_fft.
      //   re(conjugate(a) * b) = aRe * bRe + aIm * bIm
      //   im(conjugate(a) * b)=  aRe * bIm - aIm * bRe
      const __m128 a = _mm_mul_ps(x_fft_buf_re, e_fft_re);
      const __m128 b = _mm_mul_ps(x_fft_buf_im, e_fft_im);
      const __m128 c = _mm_mul_ps(x_fft_buf_re, e_fft_im);
      const __m128 d = _mm_mul_ps(x_fft_buf_im, e_fft_re);
      const __m128 e = _mm_add_ps(a, b);
      const __m128 f = _mm_sub_ps(c, d);
      // Interleave real and imaginary parts.
      const __m128 g = _mm_unpacklo_ps(e, f);
      const __m128 h = _mm_unpackhi_ps(e, f);
      // Store
      _mm_storeu_ps(&fft[2 * j + 0], g);
      _mm_storeu_ps(&fft[2 * j + 4], h);
    }
    // ... and fixup the first imaginary entry.
    fft[1] = MulRe(x_fft_buf[0][xPos + PART_LEN],
                   -x_fft_buf[1][xPos + PART_LEN],
                   e_fft[0][PART_LEN],
                   e_fft[1][PART_LEN]);

    aec_rdft_inverse_128(fft);
    memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);

    // fft scaling
    {
      float scale = 2.0f / PART_LEN2;
      const __m128 scale_ps = _mm_load_ps1(&scale);
      for (j = 0; j < PART_LEN; j += 4) {
        const __m128 fft_ps = _mm_loadu_ps(&fft[j]);
        const __m128 fft_scale = _mm_mul_ps(fft_ps, scale_ps);
        _mm_storeu_ps(&fft[j], fft_scale);
      }
    }
    aec_rdft_forward_128(fft);

    {
      float wt1 = h_fft_buf[1][pos];
      h_fft_buf[0][pos + PART_LEN] += fft[1];
      for (j = 0; j < PART_LEN; j += 4) {
        __m128 wtBuf_re = _mm_loadu_ps(&h_fft_buf[0][pos + j]);
        __m128 wtBuf_im = _mm_loadu_ps(&h_fft_buf[1][pos + j]);
        const __m128 fft0 = _mm_loadu_ps(&fft[2 * j + 0]);
        const __m128 fft4 = _mm_loadu_ps(&fft[2 * j + 4]);
        const __m128 fft_re =
            _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2, 0));
        const __m128 fft_im =
            _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3, 1));
        wtBuf_re = _mm_add_ps(wtBuf_re, fft_re);
        wtBuf_im = _mm_add_ps(wtBuf_im, fft_im);
        _mm_storeu_ps(&h_fft_buf[0][pos + j], wtBuf_re);
        _mm_storeu_ps(&h_fft_buf[1][pos + j], wtBuf_im);
      }
      h_fft_buf[1][pos] = wt1;
    }
  }
}
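
// For reference: a minimal scalar sketch (not part of the original file) of the
// conjugate-multiply step that the SSE2 loop above vectorizes. The helper name
// is invented for illustration only.
//   re(conj(x) * e) = xRe * eRe + xIm * eIm
//   im(conj(x) * e) = xRe * eIm - xIm * eRe
static void ConjugateMultiplyScalarSketch(const float* x_re, const float* x_im,
                                          const float* e_re, const float* e_im,
                                          float* fft, int n) {
  int j;
  for (j = 0; j < n; j++) {
    fft[2 * j + 0] = x_re[j] * e_re[j] + x_im[j] * e_im[j];  // real part
    fft[2 * j + 1] = x_re[j] * e_im[j] - x_im[j] * e_re[j];  // imaginary part
  }
}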
//////////////////////////////////////////////////////////////////////////////
//
// LightProjectNoClipXmm()
//
void Camera::LightProjectNoClipXmm( Plane *srcP, Vector *srcVect, Vector *srcNorm, UVPair *srcUV, U32 countV,
                                    U16 *srcI, U32 countI,
                                    VertexTL **dstV, U32 *dstVCount, U32 *dstICount) // = NULL, = NULL, = NULL
{
  if (dstV)
  {
    // in case we exit before bottom
    *dstV = NULL;
    *dstVCount = 0;
    *dstICount = 0;
  }

  ASSERT( countI <= MAXINDICES);

  U16              notCulledV[MAXVERTS];
  static VertexTL *litV;
  U16             *litI;

  // This is called before BackfaceCull only because we don't know the countI.
  // We may want to try to put litI in a temporary buffer during backface cull
  // and then allocate bucket mem after BackfaceCull.
  // This works for now.
  if (!Vid::LockIndexedPrimitiveMem( (void **)&litV, countV, &litI, countI))
  {
    return;
  }

#ifdef _DEBUG
  U32 orig_countV = countV; // used in ASSERT below
#endif

  BackfaceCull(srcP, notCulledV, countV, srcI, litI, countI);

  if (!countV)
  {
    // no forward faces
    Vid::UnlockIndexedPrimitiveMem( 0, 0, FALSE, TRUE);
    return;
  }

  // make the count a multiple of four by lighting the last vert multiple times
  notCulledV[countV+0] = notCulledV[countV+1] = notCulledV[countV+2] = notCulledV[countV+3] = notCulledV[countV-1];
  U32 countV_rem = ((4 - (countV % SIMD_WIDTH)) << 30) >> 30;
  countV += countV_rem; 

  ASSERT( countV <= orig_countV );
  ASSERT( (countV % SIMD_WIDTH) == 0 );

  Material &material = *DxLight::Manager::curMaterial;
  U32 diffa = Utils::FtoL( material.desc.diffuse.a  * 255.0f);

  // calculate the parts of the diffuse color that are the same for all output vertexes
  ColorValueXmm diffInit;

  // FIXME: do we really want to ignore material ambient values (yes for DR2)
#ifdef DOBZ2
  diffInit.Set(
    material.desc.diffuse.r * Vid::renderState.ambientR + material.desc.emissive.r,
    material.desc.diffuse.g * Vid::renderState.ambientG + material.desc.emissive.g,
    material.desc.diffuse.b * Vid::renderState.ambientB + material.desc.emissive.b,
    material.desc.diffuse.a);
#else
  diffInit.Set(
    Vid::renderState.ambientR + material.desc.emissive.r,
    Vid::renderState.ambientG + material.desc.emissive.g,
    Vid::renderState.ambientB + material.desc.emissive.b,
    material.desc.diffuse.a);
#endif

  const static __m128 zero = _mm_setzero_ps();
  const static ColorValueXmm specInit(zero, zero, zero, _mm_set_ps1(1.0f));

  for ( U32 vc = 0; vc < countV; vc += SIMD_WIDTH )
  {
    // set-up xmm vertex
    U16 i0 = notCulledV[vc+0], 
        i1 = notCulledV[vc+1], 
        i2 = notCulledV[vc+2], 
        i3 = notCulledV[vc+3];

    VertexXmm vert;
    vert.V0 = _mm_loadu_ps((const float *)&srcVect[i0]);
    vert.V1 = _mm_loadu_ps((const float *)&srcVect[i1]);
    vert.V2 = _mm_loadu_ps((const float *)&srcVect[i2]);
    vert.V3 = _mm_loadu_ps((const float *)&srcVect[i3]);

    // need to add function: TransformProjectModelXmm()

    __m128 lit[4];
    SetHomogeneousFromModelXmm(lit, &vert.V0);

    TRANSPOSE_4X4(vert.V0, vert.V1, vert.V2, vert.V3);
    vert.NV.Set(srcNorm[i0], srcNorm[i1], srcNorm[i2], srcNorm[i3]);
    vert.DIFFUSE = diffInit;
    vert.SPECULAR = specInit;

    // light four verts
    DxLight::Manager::LightModel(vert, material);

    VertexTL &out0 = litV[vc+0], 
             &out1 = litV[vc+1], 
             &out2 = litV[vc+2], 
             &out3 = litV[vc+3];

    _mm_storeu_ps((float *)&out0, lit[0]);
    _mm_storeu_ps((float *)&out1, lit[1]);
    _mm_storeu_ps((float *)&out2, lit[2]);
    _mm_storeu_ps((float *)&out3, lit[3]);

    vert.DIFFUSE.GetRGBA (out0.diffuse,  out1.diffuse,  out2.diffuse,  out3.diffuse );
    vert.SPECULAR.GetRGBA(out0.specular, out1.specular, out2.specular, out3.specular);

    out0.uv = srcUV[i0];
    out1.uv = srcUV[i1];
    out2.uv = srcUV[i2];
    out3.uv = srcUV[i3];

    ProjectFromHomogeneousXmm(out0);
    ProjectFromHomogeneousXmm(out1);
    ProjectFromHomogeneousXmm(out2);
    ProjectFromHomogeneousXmm(out3);
  }

  countV -= countV_rem;

  if (dstV)
  {
    // pass back to caller
    *dstV = litV;
    *dstVCount = countV;
    *dstICount = countI;
  }
  else
  {
    Vid::UnlockIndexedPrimitiveMem( countV, countI, FALSE, TRUE);
  }
}
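
// A standalone sketch (illustrative only; the helper name is not part of the
// original) of the pad-to-multiple-of-four arithmetic used above: on a 32-bit
// unsigned value, ((4 - (n % 4)) << 30) >> 30 is simply (4 - n % 4) & 3, i.e.
// the number of dummy vertices needed to reach the next multiple of SIMD_WIDTH
// (0 when n is already a multiple of four).
static U32 PadCountToMultipleOfFour(U32 n)
{
  U32 rem = (4 - (n % 4)) & 3; // equivalent to the shift idiom above
  return n + rem;              // padded count; subtract rem again after the loop
}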
static inline void gauss_reduce_sse2(
    const float *const input, // fine input buffer
    float *const coarse,      // coarse scale, blurred input buf
    const int wd,             // fine res
    const int ht)
{
  // blur, store only coarse res
  const int cw = (wd-1)/2+1, ch = (ht-1)/2+1;

  // this version is inspired by opencv's pyrDown_ :
  // - allocate 5 rows of ring buffer (aligned)
  // - for coarse res y
  //   - fill 5 coarse-res row buffers with 1 4 6 4 1 weights (reuse some from last time)
  //   - do vertical convolution via sse and write to coarse output buf

  const int stride = ((cw+8)&~7); // assure sse alignment of rows
  float *ringbuf = dt_alloc_align(16, sizeof(*ringbuf)*stride*5);
  float *rows[5] = {0};
  int rowj = 0; // we initialised this many rows so far

  for(int j=1;j<ch-1;j++)
  {
    // horizontal pass, convolve with 1 4 6 4 1 kernel and decimate
    for(;rowj<=2*j+2;rowj++)
    {
      float *const row = ringbuf + (rowj % 5)*stride;
      const float *const in = input + rowj*wd;
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none)
#endif
      for(int i=1;i<cw-1;i++)
        row[i] = 6*in[2*i] + 4*(in[2*i-1]+in[2*i+1]) + in[2*i-2] + in[2*i+2];
    }

    // init row pointers
    for(int k=0;k<5;k++)
      rows[k] = ringbuf + ((2*j-2+k)%5)*stride;

    // vertical pass, convolve and decimate using SIMD:
    // note that we're ignoring the (1..cw-1) buffer limit, we'll pull in
    // garbage and fix it later by border filling.
    float *const out = coarse + j*cw;
    const float *const row0 = rows[0], *const row1 = rows[1],
                *const row2 = rows[2], *const row3 = rows[3], *const row4 = rows[4];
    const __m128 four = _mm_set1_ps(4.f), scale = _mm_set1_ps(1.f/256.f);
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none)
#endif
    for(int i=0;i<=cw-8;i+=8)
    {
      __m128 r0, r1, r2, r3, r4, t0, t1;
      r0 = _mm_load_ps(row0 + i);
      r1 = _mm_load_ps(row1 + i);
      r2 = _mm_load_ps(row2 + i);
      r3 = _mm_load_ps(row3 + i);
      r4 = _mm_load_ps(row4 + i);
      r0 = _mm_add_ps(r0, r4);
      r1 = _mm_add_ps(_mm_add_ps(r1, r3), r2);
      r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2));
      t0 = _mm_add_ps(r0, _mm_mul_ps(r1, four));

      r0 = _mm_load_ps(row0 + i + 4);
      r1 = _mm_load_ps(row1 + i + 4);
      r2 = _mm_load_ps(row2 + i + 4);
      r3 = _mm_load_ps(row3 + i + 4);
      r4 = _mm_load_ps(row4 + i + 4);
      r0 = _mm_add_ps(r0, r4);
      r1 = _mm_add_ps(_mm_add_ps(r1, r3), r2);
      r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2));
      t1 = _mm_add_ps(r0, _mm_mul_ps(r1, four));

      t0 = _mm_mul_ps(t0, scale);
      t1 = _mm_mul_ps(t1, scale);

      _mm_storeu_ps(out + i, t0);
      _mm_storeu_ps(out + i + 4, t1);
    }
    // process the rest
    for(int i=cw&~7;i<cw-1;i++)
      out[i] = (6*row2[i] + 4*(row1[i] + row3[i]) + row0[i] + row4[i])*(1.0f/256.0f);
  }
  dt_free_align(ringbuf);
  ll_fill_boundary1(coarse, cw, ch);
}
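
// For reference: a scalar sketch of one coarse sample as produced above (the
// helper name is invented for illustration). The horizontal pass builds
// row[i] = [1 4 6 4 1] applied to the fine row around column 2*i, and the
// vertical pass combines five such rows with the same weights; the separable
// 2-D kernel sums to 256, hence the 1/256 scale.
static inline float gauss_reduce_sample_sketch(
    const float *row0, const float *row1, const float *row2,
    const float *row3, const float *row4, int i)
{
  return (row0[i] + 4.0f*row1[i] + 6.0f*row2[i] + 4.0f*row3[i] + row4[i])
         * (1.0f/256.0f);
}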
mlib_status
mlib_ImageColorConvert2_F32(
    const mlib_f32 *src,
    mlib_s32 slb,
    mlib_f32 *dst,
    mlib_s32 dlb,
    mlib_s32 xsize,
    mlib_s32 ysize,
    const mlib_d64 *fmat,
    const mlib_d64 *offset)
{
	/* pointers for pixel and line of source */
	mlib_f32 *sa, *sl;

	/* pointers for pixel and line of destination */
	mlib_f32 *da, *dl;

	/* indices */
	mlib_s32 i, j;

	/* intermediate */
	__m128 p0, p1, p2, t0, t1, t2, s0, s1, q;

	/* packed kernel */
	__m128 k0, k1, k2;

	/* packed offset */
	__m128 off;

	/* load transposed kernel */
	k0 = _mm_set_ps(0.0f,
			(mlib_f32)fmat[6],
			(mlib_f32)fmat[3],
			(mlib_f32)fmat[0]);
	k1 = _mm_set_ps(0.0f,
			(mlib_f32)fmat[7],
			(mlib_f32)fmat[4],
			(mlib_f32)fmat[1]);
	k2 = _mm_set_ps(0.0f,
			(mlib_f32)fmat[8],
			(mlib_f32)fmat[5],
			(mlib_f32)fmat[2]);

	/* load offset */
	off = _mm_set_ps(0.0f,
			(mlib_f32)offset[2],
			(mlib_f32)offset[1],
			(mlib_f32)offset[0]);

	sa = sl = (mlib_f32 *)src;
	da = dl = dst;

	for (j = 0; j < ysize; j++) {

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
		for (i = 0; i < (xsize - 1); i ++) {
			p0 = _mm_load1_ps(sa);
			sa ++;
			p1 = _mm_load1_ps(sa);
			sa ++;
			p2 = _mm_load1_ps(sa);
			sa ++;

			t0 = _mm_mul_ps(p0, k0);
			t1 = _mm_mul_ps(p1, k1);
			t2 = _mm_mul_ps(p2, k2);

			s0 = _mm_add_ps(t0, t1);
			s1 = _mm_add_ps(t2, off);
			q = _mm_add_ps(s0, s1);

			_mm_storeu_ps(da, q);
			da += 3;
		}

		/*
		 * process the last pixel of each row separately
		 * to avoid out of bound write
		 */
		p0 = _mm_load1_ps(sa);
		sa ++;
		p1 = _mm_load1_ps(sa);
		sa ++;
		p2 = _mm_load1_ps(sa);
		sa ++;

		t0 = _mm_mul_ps(p0, k0);
		t1 = _mm_mul_ps(p1, k1);
		t2 = _mm_mul_ps(p2, k2);

		s0 = _mm_add_ps(t0, t1);
		s1 = _mm_add_ps(t2, off);
		q = _mm_add_ps(s0, s1);

		_mm_storel_pi((__m64 *)da, q);
		da += 2;
		q = _mm_shuffle_ps(q, q, 0xaa);
		_mm_store_ss(da, q);

		/* set src pointer to next row */
		sa = sl = sl + slb;
		/* set dst pointer to next row */
		da = dl = dl + dlb;
	}

	return (MLIB_SUCCESS);
}
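
/*
 * A scalar sketch of the per-pixel arithmetic above (illustrative only; the
 * helper name is not part of mediaLib). The packed vectors k0/k1/k2 hold the
 * columns of the row-major 3x3 matrix fmat, so broadcasting each source
 * channel against a column reproduces the usual row-times-vector product:
 *   dst[c] = fmat[3*c+0]*s0 + fmat[3*c+1]*s1 + fmat[3*c+2]*s2 + offset[c]
 */
static void
color_convert_pixel_sketch(
    const mlib_f32 s[3],
    mlib_f32 d[3],
    const mlib_d64 fmat[9],
    const mlib_d64 offset[3])
{
	mlib_s32 c;

	for (c = 0; c < 3; c++)
		d[c] = (mlib_f32)(fmat[3 * c] * s[0] + fmat[3 * c + 1] * s[1] +
		    fmat[3 * c + 2] * s[2] + offset[c]);
}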
static void
thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
{
    int i, j;
    Size roi = _src.size();
    roi.width *= _src.channels();
    const float* src = (const float*)_src.data;
    float* dst = (float*)_dst.data;
    size_t src_step = _src.step/sizeof(src[0]);
    size_t dst_step = _dst.step/sizeof(dst[0]);

#if CV_SSE2
    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE);
#endif

    if( _src.isContinuous() && _dst.isContinuous() )
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

#ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::thresh_32f(_src, _dst, roi.width, roi.height, thresh, maxval, type))
        return;
#endif

#if defined(HAVE_IPP)
    IppiSize sz = { roi.width, roi.height };
    switch( type )
    {
    case THRESH_TRUNC:
        if (0 <= ippiThreshold_GT_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh))
            return;
        setIppErrorStatus();
        break;
    case THRESH_TOZERO:
        if (0 <= ippiThreshold_LTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh+FLT_EPSILON, 0))
            return;
        setIppErrorStatus();
        break;
    case THRESH_TOZERO_INV:
        if (0 <= ippiThreshold_GTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0))
            return;
        setIppErrorStatus();
        break;
    }
#endif

    switch( type )
    {
        case THRESH_BINARY:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_cmpgt_ps( v0, thresh4 );
                        v1 = _mm_cmpgt_ps( v1, thresh4 );
                        v0 = _mm_and_ps( v0, maxval4 );
                        v1 = _mm_and_ps( v1, maxval4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#endif

                for( ; j < roi.width; j++ )
                    dst[j] = src[j] > thresh ? maxval : 0;
            }
            break;

        case THRESH_BINARY_INV:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_cmple_ps( v0, thresh4 );
                        v1 = _mm_cmple_ps( v1, thresh4 );
                        v0 = _mm_and_ps( v0, maxval4 );
                        v1 = _mm_and_ps( v1, maxval4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#endif

                for( ; j < roi.width; j++ )
                    dst[j] = src[j] <= thresh ? maxval : 0;
            }
            break;

        case THRESH_TRUNC:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_min_ps( v0, thresh4 );
                        v1 = _mm_min_ps( v1, thresh4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#endif

                for( ; j < roi.width; j++ )
                    dst[j] = std::min(src[j], thresh);
            }
            break;

        case THRESH_TOZERO:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_and_ps(v0, _mm_cmpgt_ps(v0, thresh4));
                        v1 = _mm_and_ps(v1, _mm_cmpgt_ps(v1, thresh4));
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#endif

                for( ; j < roi.width; j++ )
                {
                    float v = src[j];
                    dst[j] = v > thresh ? v : 0;
                }
            }
            break;

        case THRESH_TOZERO_INV:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_and_ps(v0, _mm_cmple_ps(v0, thresh4));
                        v1 = _mm_and_ps(v1, _mm_cmple_ps(v1, thresh4));
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#endif
                for( ; j < roi.width; j++ )
                {
                    float v = src[j];
                    dst[j] = v <= thresh ? v : 0;
                }
            }
            break;
        default:
            return CV_Error( CV_StsBadArg, "" );
    }
}
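
// A minimal standalone sketch of the branchless THRESH_BINARY pattern used in
// the SSE paths above (helper name invented for illustration): _mm_cmpgt_ps
// yields an all-ones lane mask where src > thresh, and AND-ing the mask with
// maxval selects maxval or 0 per lane without branching.
#include <xmmintrin.h>

static inline void threshBinary4_sketch( const float* src, float* dst,
                                         float thresh, float maxval )
{
    __m128 v = _mm_loadu_ps( src );
    v = _mm_and_ps( _mm_cmpgt_ps( v, _mm_set1_ps(thresh) ),
                    _mm_set1_ps(maxval) );      // src > thresh ? maxval : 0
    _mm_storeu_ps( dst, v );
}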
void kernel_strmv_u_t_4_lib8(int kmax, float *A, int sda, float *x, float *y, int alg)
	{

/*	if(kmax<=0) */
/*		return;*/
	
	const int lda = 8;
/*	const int bs  = 8;*/
	
	__builtin_prefetch( A + 0*lda );
	__builtin_prefetch( A + 2*lda );

	int
		k;
	
	__m256
		zeros,
		ax_temp,
		a_00, a_01, a_02, a_03,
		x_0,
		y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7;
	
	zeros = _mm256_setzero_ps();

	y_0 = _mm256_setzero_ps();
	y_1 = _mm256_setzero_ps();
	y_2 = _mm256_setzero_ps();
	y_3 = _mm256_setzero_ps();
	y_4 = _mm256_setzero_ps();
	y_5 = _mm256_setzero_ps();
	y_6 = _mm256_setzero_ps();
	y_7 = _mm256_setzero_ps();
	
	k=0;
	for(; k<kmax-7; k+=8)
		{
		
		x_0 = _mm256_loadu_ps( &x[0] );

		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );
		a_01 = _mm256_load_ps( &A[0+lda*1] );
		ax_temp = _mm256_mul_ps( a_01, x_0 );
		y_1 = _mm256_add_ps( y_1, ax_temp );
		a_02 = _mm256_load_ps( &A[0+lda*2] );
		ax_temp = _mm256_mul_ps( a_02, x_0 );
		y_2 = _mm256_add_ps( y_2, ax_temp );
		a_03 = _mm256_load_ps( &A[0+lda*3] );
		ax_temp = _mm256_mul_ps( a_03, x_0 );
		y_3 = _mm256_add_ps( y_3, ax_temp );
		
		A += sda*lda;
		x += 8;

		}

	x_0 = _mm256_loadu_ps( &x[0] );

	a_00 = _mm256_load_ps( &A[0+lda*0] );
	a_00 = _mm256_blend_ps( zeros, a_00, 0x01 );
	ax_temp = _mm256_mul_ps( a_00, x_0 );
	y_0 = _mm256_add_ps( y_0, ax_temp );
	a_01 = _mm256_load_ps( &A[0+lda*1] );
	a_01 = _mm256_blend_ps( zeros, a_01, 0x03 );
	ax_temp = _mm256_mul_ps( a_01, x_0 );
	y_1 = _mm256_add_ps( y_1, ax_temp );
	a_02 = _mm256_load_ps( &A[0+lda*2] );
	a_02 = _mm256_blend_ps( zeros, a_02, 0x07 );
	ax_temp = _mm256_mul_ps( a_02, x_0 );
	y_2 = _mm256_add_ps( y_2, ax_temp );
	a_03 = _mm256_load_ps( &A[0+lda*3] );
	a_03 = _mm256_blend_ps( zeros, a_03, 0x0f );
	ax_temp = _mm256_mul_ps( a_03, x_0 );
	y_3 = _mm256_add_ps( y_3, ax_temp );

	// reduction
	__m128
		z_0, z_1;

	y_0 = _mm256_hadd_ps(y_0, y_1);
	y_2 = _mm256_hadd_ps(y_2, y_3);

	y_0 = _mm256_hadd_ps(y_0, y_2);

	y_1 = _mm256_permute2f128_ps(y_0, y_0, 0x01);
	
	z_0 = _mm256_castps256_ps128(y_0);
	z_1 = _mm256_castps256_ps128(y_1);
	
	z_1 = _mm_add_ps(z_0, z_1);

	if(alg==0)
		{
		_mm_storeu_ps(&y[0], z_1);
		}
	else if(alg==1)
		{
		z_0 = _mm_loadu_ps( &y[0] );

		z_0 = _mm_add_ps(z_0, z_1);

		_mm_storeu_ps(&y[0], z_0);
		}
	else // alg==-1
		{
		z_0 = _mm_loadu_ps( &y[0] );

		z_0 = _mm_sub_ps(z_0, z_1);

		_mm_storeu_ps(&y[0], z_0);
		}

	}
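
// A small sketch of the horizontal reduction used at the end of the kernel
// above, written with _mm256_extractf128_ps instead of the equivalent
// permute2f128 formulation (helper name invented for illustration). After two
// rounds of _mm256_hadd_ps the low 128-bit lane holds the partial sums of the
// low halves of y_0..y_3 and the high lane those of the high halves; adding
// the two lanes yields the four complete dot products.
#include <immintrin.h>

static inline __m128 reduce_four_accumulators_sketch(__m256 y0, __m256 y1, __m256 y2, __m256 y3)
	{
	y0 = _mm256_hadd_ps(y0, y1);
	y2 = _mm256_hadd_ps(y2, y3);
	y0 = _mm256_hadd_ps(y0, y2);
	return _mm_add_ps(_mm256_castps256_ps128(y0), _mm256_extractf128_ps(y0, 0x1));
	}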
void sgemm( int m, int n, float *A, float *C )
{
    __m128 a;
    __m128 a1;
    __m128 a2; 
    __m128 a3;
    __m128 a4;
    __m128 a5;
    
    __m128 b;
    __m128 b1;
    __m128 b2;
    __m128 b3;
    __m128 b4;
    __m128 b5;
    __m128 b6;
    __m128 b7;
    __m128 b8;
    __m128 b9;
    __m128 b10;
    __m128 b11;
    __m128 b12;
    /*
    __m128 b13;
    __m128 b14;
    __m128 b15;
    __m128 b16;
    __m128 b17;
    __m128 b18;
    __m128 b19;
    __m128 b20;
    */
    
    __m128 c;
    __m128 c1;
    __m128 c2;
    __m128 c3;
    __m128 c4;
    
    int i, j, k, l;
    int mod = m%4;
    int end = m/4 * 4;
    int total = n*m;
    float num[4];
    float* A_address;
    float* C_address;
    int m3 = 3 * m;
    int m2 = 2 * m;
    int end1 = total/m3 * m3;
#pragma omp parallel for private(a, a1, a2, a3, b, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, c, c1, c2, c3, c4, i, j, k, l)
    for( i = 0; i < end; i += 4 ){
	for( k = 0; k < end; k += 4 ) {
	    c1 = _mm_setzero_ps();
	    c2 = _mm_setzero_ps();
	    c3 = _mm_setzero_ps();
	    c4 = _mm_setzero_ps();
	    float* A_address1 = A + i;
	    float* A_address2 = A + k;
	    float* A_address21 = A + k + 1;
	    for( j = 0; j < end1; j += m3, A_address1 += m3, A_address2 += m3, A_address21 += m3){
		a1 = _mm_loadu_ps(A_address1);
		a2 = _mm_loadu_ps(A_address1 + m);
		a3 = _mm_loadu_ps(A_address1 + m2);
		
		b1 = _mm_load1_ps(A_address2);
		b2 = _mm_load1_ps(A_address2 + m);
		b3 = _mm_load1_ps(A_address2 + m2);
		/*
		b4 = _mm_load1_ps(A_address2 + m3);
		b5 = _mm_load1_ps(A_address2 + m4);
		*/
		
		b4 = _mm_load1_ps(A_address21);
		b5 = _mm_load1_ps(A_address21 + m);
		b6 = _mm_load1_ps(A_address21 + m2);
		/*
		b9 = _mm_load1_ps(A_address21 + m3);
		b10 = _mm_load1_ps(A_address21 + m4);
		*/
		b7 = _mm_load1_ps(A + k + 2 + j);
		b8 = _mm_load1_ps(A + k + 2 + j + m);
		b9 = _mm_load1_ps(A + k + 2 + j + m2);
		/*
		b14 = _mm_load1_ps(A + k + 2 + j + m3);
		b15 = _mm_load1_ps(A + k + 2 + j + m4);
		*/
		
		b10 = _mm_load1_ps(A + k + 3 + j);
		b11 = _mm_load1_ps(A + k + 3 + j + m);
		b12 = _mm_load1_ps(A + k + 3 + j + m2);
		/*
		b19 = _mm_load1_ps(A + k + 3 + j + m3);
		b20 = _mm_load1_ps(A + k + 3 + j + m4);
		*/
		
		c1 = _mm_add_ps(c1, _mm_mul_ps(a1, b1));
		c1 = _mm_add_ps(c1, _mm_mul_ps(a2, b2));
		c1 = _mm_add_ps(c1, _mm_mul_ps(a3, b3));
		/*
		c1 = _mm_add_ps(c1, _mm_mul_ps(a4, b4));
		c1 = _mm_add_ps(c1, _mm_mul_ps(a5, b5));
		*/
		c2 = _mm_add_ps(c2, _mm_mul_ps(a1, b4));
		c2 = _mm_add_ps(c2, _mm_mul_ps(a2, b5));
		c2 = _mm_add_ps(c2, _mm_mul_ps(a3, b6));
		/*
		c2 = _mm_add_ps(c2, _mm_mul_ps(a4, b9));
		c2 = _mm_add_ps(c2, _mm_mul_ps(a5, b10));
		*/
		c3 = _mm_add_ps(c3, _mm_mul_ps(a1, b7));
		c3 = _mm_add_ps(c3, _mm_mul_ps(a2, b8));
		c3 = _mm_add_ps(c3, _mm_mul_ps(a3, b9));
		/*
		c3 = _mm_add_ps(c3, _mm_mul_ps(a4, b14));
		c3 = _mm_add_ps(c3, _mm_mul_ps(a5, b15));
		*/
		
		c4 = _mm_add_ps(c4, _mm_mul_ps(a1, b10));
		c4 = _mm_add_ps(c4, _mm_mul_ps(a2, b11));
		c4 = _mm_add_ps(c4, _mm_mul_ps(a3, b12));
		/*
		c4 = _mm_add_ps(c4, _mm_mul_ps(a4, b19));
		c4 = _mm_add_ps(c4, _mm_mul_ps(a5, b20));
		*/
		
	    }
	    for( j = end1; j < total; j += m){
		a = _mm_loadu_ps(A + i + j);
		
		b1 = _mm_load1_ps(A + k + j);
		b2 = _mm_load1_ps(A + k + 1 + j);
		b3 = _mm_load1_ps(A + k + 2 + j);
		b4 = _mm_load1_ps(A + k + 3 + j);
		
		c1 = _mm_add_ps(c1, _mm_mul_ps(a, b1));
		c2 = _mm_add_ps(c2, _mm_mul_ps(a, b2));
		c3 = _mm_add_ps(c3, _mm_mul_ps(a, b3));
		c4 = _mm_add_ps(c4, _mm_mul_ps(a, b4));
	    }
	    _mm_storeu_ps(C + i + (k)*m, c1);
	    _mm_storeu_ps(C + i + (k+1)*m, c2);
	    _mm_storeu_ps(C + i + (k+2)*m, c3);
	    _mm_storeu_ps(C + i + (k+3)*m, c4);
	}
	for(k = end; k < m; k++){
	    float* A_address1 = A + i;
	    float* A_address2 = A + k;
	    c = _mm_setzero_ps();
	    for( j = 0; j < end1; j += m3, A_address1 += m3, A_address2 += m3){
		a1 = _mm_loadu_ps(A_address1);
		a2 = _mm_loadu_ps(A + i + j + m);
		a3 = _mm_loadu_ps(A + i + j + m2);
		
		b1 = _mm_load1_ps(A_address2);
		b2 = _mm_load1_ps(A + k + j + m);
		b3 = _mm_load1_ps(A + k + j + m2);
		
		c = _mm_add_ps(c, _mm_mul_ps(a1, b1));
		c = _mm_add_ps(c, _mm_mul_ps(a2, b2));
		c = _mm_add_ps(c, _mm_mul_ps(a3, b3));
	    }
	    for( j = end1; j < total; j += m){
		a = _mm_loadu_ps(A + i + j);
		
		b = _mm_load1_ps(A + k + j);
		
		c = _mm_add_ps(c, _mm_mul_ps(a, b));
	    }
	    _mm_storeu_ps(C + i + k*m, c);
	}
    }
    if (mod != 0){
	if (mod == 3){
	    for( i = end; i < m; i +=4 ){
		for( k = 0; k < m; k++ ) {
		    A_address = A + i;
		    c = _mm_setzero_ps();
		    for( j = 0; j < total; j += m ) {
			a = _mm_setr_ps(*(A_address),*(A_address + 1),*(A_address + 2), 0);
			b = _mm_load1_ps(A + k + j);
			c = _mm_add_ps(c, _mm_mul_ps(a, b));
			A_address += m;
		    }
		    _mm_storeu_ps(num, c);
		    for (l = 0; l < 3; l ++){
			*(C + i + k*m + l) = num[l];
		    }
		}
	    }
	}
	else if (mod == 2){
	    for( i = end; i < m; i +=4 ){
		for( k = 0; k < m; k++ ) {
		    A_address = A + i;
		    c = _mm_setzero_ps();
		    for( j = 0; j < total; j += m ) {
			a = _mm_setr_ps(*(A_address),*(A_address + 1),0 ,0);
			b = _mm_load1_ps(A + k + j);
			c = _mm_add_ps(c, _mm_mul_ps(a, b));
			A_address += m;
		    }
		    _mm_storeu_ps(num, c);
		    for (l = 0; l < 2; l ++){
			*(C + i + k*m + l) = num[l];
		    }
		}
	    }
	}
	else if (mod == 1){
	    for( i = end; i < m; i +=4 ){
		for( k = 0; k < m; k++ ) {
		    A_address = A + i;
		    c = _mm_setzero_ps();
		    for( j = 0; j < total; j += m ) {
			a = _mm_setr_ps(*(A_address), 0, 0, 0);
			b = _mm_load1_ps(A + k + j);
			c = _mm_add_ps(c, _mm_mul_ps(a, b));
			A_address += m;
		    }
		    _mm_storeu_ps(num, c);
		    for (l = 0; l < 1; l ++){
			*(C + i + k*m + l) = num[l];
		    }
		}
	    }
	}
    }
}	
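
// For reference: a plain scalar version of what the routine above appears to
// compute (column-major A with m rows and n columns, so this is C = A * A^T).
// Illustrative only; the name is not part of the original.
void sgemm_scalar_sketch( int m, int n, float *A, float *C )
{
    int i, k, j;
    for( k = 0; k < m; k++ ){
	for( i = 0; i < m; i++ ){
	    float acc = 0.0f;
	    for( j = 0; j < n; j++ )
		acc += A[i + j*m] * A[k + j*m];
	    C[i + k*m] = acc;
	}
    }
}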
void Float32ToNativeInt32( const float *src, int *dst, unsigned int numToConvert )
{
	const float *src0 = src;
	int *dst0 = dst;
	unsigned int count = numToConvert;
	
	if (count >= 4) {
		// vector -- requires 4+ samples
		ROUNDMODE_NEG_INF
		const __m128 vround = (const __m128) { 0.5f, 0.5f, 0.5f, 0.5f };
		const __m128 vmin = (const __m128) { -2147483648.0f, -2147483648.0f, -2147483648.0f, -2147483648.0f };
		const __m128 vmax = (const __m128) { kMaxFloat32, kMaxFloat32, kMaxFloat32, kMaxFloat32  };
		const __m128 vscale = (const __m128) { 2147483648.0f, 2147483648.0f, 2147483648.0f, 2147483648.0f  };
		__m128 vf0;
		__m128i vi0;
	
#define F32TOLE32(x) \
		vf##x = _mm_mul_ps(vf##x, vscale);			\
		vf##x = _mm_add_ps(vf##x, vround);			\
		vf##x = _mm_max_ps(vf##x, vmin);			\
		vf##x = _mm_min_ps(vf##x, vmax);			\
		vi##x = _mm_cvtps_epi32(vf##x);			\

		int falign = (uintptr_t)src & 0xF;
		int ialign = (uintptr_t)dst & 0xF;
	
		if (falign != 0 || ialign != 0) {
			// do one unaligned conversion
			vf0 = _mm_loadu_ps(src);
			F32TOLE32(0)
			_mm_storeu_si128((__m128i *)dst, vi0);
			
			// and advance such that the destination ints are aligned
			unsigned int n = (16 - ialign) / 4;
			src += n;
			dst += n;
			count -= n;

			falign = (uintptr_t)src & 0xF;
			if (falign != 0) {
				// unaligned loads, aligned stores
				while (count >= 4) {
					vf0 = _mm_loadu_ps(src);
					F32TOLE32(0)
					_mm_store_si128((__m128i *)dst, vi0);
					src += 4;
					dst += 4;
					count -= 4;
				}
				goto VectorCleanup;
			}
		}
	
		while (count >= 4) {
			vf0 = _mm_load_ps(src);
			F32TOLE32(0)
			_mm_store_si128((__m128i *)dst, vi0);
			
			src += 4;
			dst += 4;
			count -= 4;
		}
VectorCleanup:
		if (count > 0) {
			// unaligned cleanup -- just do one unaligned vector at the end
			src = src0 + numToConvert - 4;
			dst = dst0 + numToConvert - 4;
			vf0 = _mm_loadu_ps(src);
			F32TOLE32(0)
			_mm_storeu_si128((__m128i *)dst, vi0);
		}
		RESTORE_ROUNDMODE
		return;
	}
	
	// scalar for small numbers of samples
	if (count > 0) {
		double scale = 2147483648.0, round = 0.5, max32 = 2147483648.0 - 1.0 - 0.5, min32 = 0.;
		ROUNDMODE_NEG_INF
		
		while (count-- > 0) {
			double f0 = *src++;
			f0 = f0 * scale + round;
			int i0 = FloatToInt(f0, min32, max32);
			*dst++ = i0;
		}
		RESTORE_ROUNDMODE
	}
}
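
// A scalar sketch of the per-sample conversion performed by the F32TOLE32
// vector macro above: scale into the 32-bit range, add 0.5 and round toward
// negative infinity (i.e. floor, giving round-to-nearest), then clamp.
// maxFloat32 stands in for kMaxFloat32, assumed to be the largest float that
// still converts safely to a 32-bit int; the helper name is invented here.
#include <math.h>
#include <stdint.h>

static int32_t Float32ToInt32Sketch( float f, float maxFloat32 )
{
	float v = f * 2147483648.0f + 0.5f;
	if (v < -2147483648.0f) v = -2147483648.0f;
	if (v > maxFloat32)     v = maxFloat32;
	return (int32_t)floorf(v);
}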


void NativeInt32ToFloat32( const int *src, float *dst, unsigned int numToConvert )
{
	const int *src0 = src;
	float *dst0 = dst;
	unsigned int count = numToConvert;

	if (count >= 4) {
		// vector -- requires 4+ samples
#define LEI32TOF32(x) \
	vf##x = _mm_cvtepi32_ps(vi##x); \
	vf##x = _mm_mul_ps(vf##x, vscale); \
		
		const __m128 vscale = (const __m128) { 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f  };
		__m128 vf0;
		__m128i vi0;

		int ialign = (uintptr_t)src & 0xF;
		int falign = (uintptr_t)dst & 0xF;
	
		if (falign != 0 || ialign != 0) {
			// do one unaligned conversion
			vi0 = _mm_loadu_si128((__m128i const *)src);
			LEI32TOF32(0)
			_mm_storeu_ps(dst, vf0);
			
			// and advance such that the destination floats are aligned
			unsigned int n = (16 - falign) / 4;
			src += n;
			dst += n;
			count -= n;

			ialign = (uintptr_t)src & 0xF;
			if (ialign != 0) {
				// unaligned loads, aligned stores
				while (count >= 4) {
					vi0 = _mm_loadu_si128((__m128i const *)src);
					LEI32TOF32(0)
					_mm_store_ps(dst, vf0);
					src += 4;
					dst += 4;
					count -= 4;
				}
				goto VectorCleanup;
			}
		}
	
		// aligned loads, aligned stores
		while (count >= 4) {
			vi0 = _mm_load_si128((__m128i const *)src);
			LEI32TOF32(0)
			_mm_store_ps(dst, vf0);
			src += 4;
			dst += 4;
			count -= 4;
		}
		
VectorCleanup:
		if (count > 0) {
			// unaligned cleanup -- just do one unaligned vector at the end
			src = src0 + numToConvert - 4;
			dst = dst0 + numToConvert - 4;
			vi0 = _mm_loadu_si128((__m128i const *)src);
			LEI32TOF32(0)
			_mm_storeu_ps(dst, vf0);
		}
		return;
	}
	// scalar for small numbers of samples
	if (count > 0) {
		double scale = 1./2147483648.0f;
		while (count-- > 0) {
			int i = *src++;
			double f = (double)i * scale;
			*dst++ = f;
		}
	}
}

int alsa_set_hwparams(alsa_dev_t *dev, snd_pcm_t *handle, snd_pcm_hw_params_t *params, snd_pcm_access_t access)
{
  unsigned int rrate;
  snd_pcm_uframes_t size;
  int err, dir;
  
  /* choose all parameters */
  err = snd_pcm_hw_params_any(handle, params);
  if (err < 0) {
    printf("Broken configuration for playback: no configurations available: %s\n", snd_strerror(err));
    return err;
  }
  
  /* set the interleaved read/write format */
  err = snd_pcm_hw_params_set_access(handle, params, access);
  if (err < 0) {
    printf("Access type not available for playback: %s\n", snd_strerror(err));
    return err;
  }
  /* set the sample format */
  err = snd_pcm_hw_params_set_format(handle, params, dev->format);
  if (err < 0) {
    printf("Sample format not available for playback: %s\n", snd_strerror(err));
    return err;
  }
  /* set the count of channels */
  err = snd_pcm_hw_params_set_channels(handle, params, dev->channels);
  if (err < 0) {
    printf("Channels count (%d) not available for playbacks: %s\n", dev->channels, snd_strerror(err));
    return err;
  }
  /* set the stream rate */
  rrate = dev->rate;
  err = snd_pcm_hw_params_set_rate_near(handle, params, &rrate, 0);
  if (err < 0) {
    printf("Rate %d Hz not available for playback: %s\n", dev->rate, snd_strerror(err));
    return err;
  }
  if (rrate != dev->rate) {
    printf("Rate doesn't match (requested %dHz, get %dHz)\n", dev->rate, rrate);
    return -EINVAL;
  }
  
  /* set the period size */
  err = snd_pcm_hw_params_set_period_size(handle, params, dev->period_size, 0);
  if (err < 0) {
    printf("Unable to set period size %d for playback: %s\n", (int)dev->period_size, snd_strerror(err));
    return err;
  }
  
  err = snd_pcm_hw_params_get_period_size(params, &size, &dir);
  if (err < 0) {
    printf("Unable to get period size for playback: %s\n", snd_strerror(err));
    return err;
  }
  
  if (dev->period_size != size) {
    printf("Period size doesn't match (requested %d, got %d)\n", (int)dev->period_size, (int)size);
    return -EINVAL;
  }
  
    /* set the buffer size */
  err = snd_pcm_hw_params_set_buffer_size(handle, params, dev->buffer_size);
  if (err < 0) {
    printf("Unable to set buffer size %d for playback: %s\n", (int)dev->buffer_size, snd_strerror(err));
    return err;
  }
  err = snd_pcm_hw_params_get_buffer_size(params, &size);
  if (err < 0) {
    printf("Unable to get buffer size for playback: %s\n", snd_strerror(err));
    return err;
  }
  
  if (size != (snd_pcm_uframes_t)dev->buffer_size) {
    printf("Buffer size doesn't match (requested %d, got %d)\n", (int)dev->buffer_size, (int)size);
    return -EINVAL;
  }

  /* write the parameters to device */
  err = snd_pcm_hw_params(handle, params);
  if (err < 0) {
    printf("Unable to set hw params for playback: %s\n", snd_strerror(err));
    return err;
  }
  return 0;
}

int alsa_set_swparams(alsa_dev_t *dev, snd_pcm_t *handle, snd_pcm_sw_params_t *swparams)
{
  int err;
  
  /* get the current swparams */
  err = snd_pcm_sw_params_current(handle, swparams);
  if (err < 0) {
    printf("Unable to determine current swparams for playback: %s\n", snd_strerror(err));
    return err;
  }
  /* allow the transfer when at least period_size samples can be processed */
  /* or disable this mechanism when period event is enabled (aka interrupt like style processing) */
  err = snd_pcm_sw_params_set_avail_min(handle, swparams, dev->period_size);
  if (err < 0) {
    printf("Unable to set avail min for playback: %s\n", snd_strerror(err));
    return err;
  }
  /* enable period events */
  err = snd_pcm_sw_params_set_period_event(handle, swparams, 1);
  if (err < 0) {
    printf("Unable to set period event: %s\n", snd_strerror(err));
    return err;
  }

  /* write the parameters to the playback device */
  err = snd_pcm_sw_params(handle, swparams);
  if (err < 0) {
    printf("Unable to set sw params for playback: %s\n", snd_strerror(err));
    return err;
  }
  return 0;
}
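
/* A minimal usage sketch for the two helpers above (illustrative only: the
 * device name "default" and the helper name are assumptions; alsa_dev_t is
 * used exactly as declared by the original code). */
#include <alsa/asoundlib.h>

static int alsa_open_playback_sketch(alsa_dev_t *dev, snd_pcm_t **handle_out)
{
  snd_pcm_t *handle;
  snd_pcm_hw_params_t *hwparams;
  snd_pcm_sw_params_t *swparams;
  int err;

  err = snd_pcm_open(&handle, "default", SND_PCM_STREAM_PLAYBACK, 0);
  if (err < 0)
    return err;

  snd_pcm_hw_params_alloca(&hwparams);
  snd_pcm_sw_params_alloca(&swparams);

  err = alsa_set_hwparams(dev, handle, hwparams, SND_PCM_ACCESS_RW_INTERLEAVED);
  if (err == 0)
    err = alsa_set_swparams(dev, handle, swparams);

  if (err < 0) {
    snd_pcm_close(handle);
    return err;
  }

  *handle_out = handle;
  return 0;
}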
static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1]) {
  int i, j;
  for (i = 0; i < NR_PART; i++) {
    int xPos = (i + aec->xfBufBlockPos)*(PART_LEN1);
    int pos = i * PART_LEN1;
    // Check for wrap
    if (i + aec->xfBufBlockPos >= NR_PART) {
      xPos -= NR_PART * PART_LEN1;
    }

#ifdef UNCONSTR
    for (j = 0; j < PART_LEN1; j++) {
      aec->wfBuf[pos + j][0] += MulRe(aec->xfBuf[xPos + j][0],
                                      -aec->xfBuf[xPos + j][1],
                                      ef[j][0], ef[j][1]);
      aec->wfBuf[pos + j][1] += MulIm(aec->xfBuf[xPos + j][0],
                                      -aec->xfBuf[xPos + j][1],
                                      ef[j][0], ef[j][1]);
    }
#else
    // Process the whole array...
    for (j = 0; j < PART_LEN; j+= 4) {
      // Load xfBuf and ef.
      const __m128 xfBuf_re = _mm_loadu_ps(&aec->xfBuf[0][xPos + j]);
      const __m128 xfBuf_im = _mm_loadu_ps(&aec->xfBuf[1][xPos + j]);
      const __m128 ef_re = _mm_loadu_ps(&ef[0][j]);
      const __m128 ef_im = _mm_loadu_ps(&ef[1][j]);
      // Calculate the product of conjugate(xfBuf) by ef.
      //   re(conjugate(a) * b) = aRe * bRe + aIm * bIm
      //   im(conjugate(a) * b)=  aRe * bIm - aIm * bRe
      const __m128 a = _mm_mul_ps(xfBuf_re, ef_re);
      const __m128 b = _mm_mul_ps(xfBuf_im, ef_im);
      const __m128 c = _mm_mul_ps(xfBuf_re, ef_im);
      const __m128 d = _mm_mul_ps(xfBuf_im, ef_re);
      const __m128 e = _mm_add_ps(a, b);
      const __m128 f = _mm_sub_ps(c, d);
      // Interleave real and imaginary parts.
      const __m128 g = _mm_unpacklo_ps(e, f);
      const __m128 h = _mm_unpackhi_ps(e, f);
      // Store
      _mm_storeu_ps(&fft[2*j + 0], g);
      _mm_storeu_ps(&fft[2*j + 4], h);
    }
    // ... and fixup the first imaginary entry.
    fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN],
                   -aec->xfBuf[1][xPos + PART_LEN],
                   ef[0][PART_LEN], ef[1][PART_LEN]);

    aec_rdft_inverse_128(fft);
    memset(fft + PART_LEN, 0, sizeof(float)*PART_LEN);

    // fft scaling
    {
      float scale = 2.0f / PART_LEN2;
      const __m128 scale_ps = _mm_load_ps1(&scale);
      for (j = 0; j < PART_LEN; j+=4) {
        const __m128 fft_ps = _mm_loadu_ps(&fft[j]);
        const __m128 fft_scale = _mm_mul_ps(fft_ps, scale_ps);
        _mm_storeu_ps(&fft[j], fft_scale);
      }
    }
    aec_rdft_forward_128(fft);

    {
      float wt1 = aec->wfBuf[1][pos];
      aec->wfBuf[0][pos + PART_LEN] += fft[1];
      for (j = 0; j < PART_LEN; j+= 4) {
        __m128 wtBuf_re = _mm_loadu_ps(&aec->wfBuf[0][pos + j]);
        __m128 wtBuf_im = _mm_loadu_ps(&aec->wfBuf[1][pos + j]);
        const __m128 fft0 = _mm_loadu_ps(&fft[2 * j + 0]);
        const __m128 fft4 = _mm_loadu_ps(&fft[2 * j + 4]);
        const __m128 fft_re = _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2 ,0));
        const __m128 fft_im = _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3 ,1));
        wtBuf_re = _mm_add_ps(wtBuf_re, fft_re);
        wtBuf_im = _mm_add_ps(wtBuf_im, fft_im);
        _mm_storeu_ps(&aec->wfBuf[0][pos + j], wtBuf_re);
        _mm_storeu_ps(&aec->wfBuf[1][pos + j], wtBuf_im);
      }
      aec->wfBuf[1][pos] = wt1;
    }
#endif // UNCONSTR
  }
}
float tricub_x86_f(float *src, float *abcd, float x, float y){
  float *s;
  float x0, x1, x2, x3, y0, y1, y2, y3;
  float dst[4];
#if defined(__AVX2__) && defined(__x86_64__)
  __m256 v1, v2, v3, v4;
  __m256 va, vb, vc, vd;
  __m128 va4, vb4, vc4, vd4;
  __m128 v128a, v128b;
  __m128 vy0, vy1, vy2, vy3;
#else
  int i, ni2, ni3, ninj2, ninj3;
  float va4[4], vb4[4], vc4[4], vd4[4];
  ninj2 = ninj + ninj;
  ninj3 = ninj2 + ninj;
  ni2 = ni + ni;
  ni3 = ni2 + ni;
#endif

#if defined(__AVX2__) && defined(__x86_64__)

// ==== interpolation along Z, vector length is 16 (2 vectors of length 8 per plane) ====

  va = _mm256_broadcast_ss(abcd);   // promote constants to vectors
  vb = _mm256_broadcast_ss(abcd+1);
  vc = _mm256_broadcast_ss(abcd+2);
  vd = _mm256_broadcast_ss(abcd+3);

  s = src;                          // rows 0 and 1, 4 planes (Z0, Z1, Z2, Z3)
  v128a = _mm_loadu_ps(s);          // Z0 row 0
  v1 = _mm256_insertf128_ps(v1,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z0 row 1
  v1 = _mm256_insertf128_ps(v1,v128b,1);
  v1 = _mm256_mul_ps(v1,va);        // v1 = v1*va

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z1 row 0
  v2 = _mm256_insertf128_ps(v2,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z1 row 1
  v2 = _mm256_insertf128_ps(v2,v128b,1);
  v1 = _mm256_fmadd_ps(v2,vb,v1);   // v1 += v2*vb

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z2 row 0
  v3 = _mm256_insertf128_ps(v3,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z2 row 1
  v3 = _mm256_insertf128_ps(v3,v128b,1);
  v1 = _mm256_fmadd_ps(v3,vc,v1);   // v1 += v3*vc

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z3 row 0
  v4 = _mm256_insertf128_ps(v4,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z3 row 1
  v4 = _mm256_insertf128_ps(v4,v128b,1);
  v1 = _mm256_fmadd_ps(v4,vd,v1);   // v1 += v4*vd
                                    // split vector of length 8 into 2 vectors of length 4
  vy0 = _mm256_extractf128_ps(v1,0);// Y0 : row 0 (v1 low)
  vy1 = _mm256_extractf128_ps(v1,1);// Y1 : row 1 (v1 high)

  s = src + 2*ni;                   // rows 2 and 3, 4 planes (Z0, Z1, Z2, Z3)
  v128a = _mm_loadu_ps(s);          // Z0 row 2
  v1 = _mm256_insertf128_ps(v1,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z0 row 3
  v1 = _mm256_insertf128_ps(v1,v128b,1);
  v1 = _mm256_mul_ps(v1,va);        // v1 = v1*va

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z1 row 2
  v2 = _mm256_insertf128_ps(v2,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z1 row 3
  v2 = _mm256_insertf128_ps(v2,v128b,1);
  v1 = _mm256_fmadd_ps(v2,vb,v1);   // v1 += v2*vb

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z2 row 2
  v3 = _mm256_insertf128_ps(v3,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z2 row 3
  v3 = _mm256_insertf128_ps(v3,v128b,1);
  v1 = _mm256_fmadd_ps(v3,vc,v1);   // v1 += v3*vc

  s += ninj;
  v128a = _mm_loadu_ps(s);          // Z3 row 2
  v4 = _mm256_insertf128_ps(v4,v128a,0);
  v128b = _mm_loadu_ps(s+ni);       // Z3 row 3
  v4 = _mm256_insertf128_ps(v4,v128b,1);
  v1 = _mm256_fmadd_ps(v4,vd,v1);   // v1 += v4*vd
                                    // split vector of length 8 into 2 vectors of length 4
  vy2 = _mm256_extractf128_ps(v1,0);// Y2 : row 2  (v1 low)
  vy3 = _mm256_extractf128_ps(v1,1);// Y3 : row 3  (v1 high)

// ==== interpolation along Y, vector length is 4 (4 rows) ====

  y0 = cm167*y*(y-one)*(y-two);
  y1 = cp5*(y+one)*(y-one)*(y-two);
  y2 = cm5*y*(y+one)*(y-two);
  y3 = cp167*y*(y+one)*(y-one);

  va4 = _mm_broadcast_ss(&y0);      // promote constants to vectors
  vb4 = _mm_broadcast_ss(&y1);
  vc4 = _mm_broadcast_ss(&y2);
  vd4 = _mm_broadcast_ss(&y3);

  vy0 = _mm_mul_ps(vy0,va4);        //    vy0 * va4
  vy0 = _mm_fmadd_ps(vy1,vb4,vy0);  // += vy1 * vb4
  vy0 = _mm_fmadd_ps(vy2,vc4,vy0);  // += vy2 * vc4
  vy0 = _mm_fmadd_ps(vy3,vd4,vy0);  // += vy3 * vd4
  
  _mm_storeu_ps(dst,vy0);           // store 4 values along X
#else
  y0 = cm167*y*(y-one)*(y-two);
  y1 = cp5*(y+one)*(y-one)*(y-two);
  y2 = cm5*y*(y+one)*(y-two);
  y3 = cp167*y*(y+one)*(y-one);
  for (i=0 ; i<4 ; i++){
    va4[i] = src[i    ]*abcd[0] + src[i    +ninj]*abcd[1] +  src[i    +ninj2]*abcd[2] + src[i    +ninj3]*abcd[3];
    vb4[i] = src[i+ni ]*abcd[0] + src[i+ni +ninj]*abcd[1] +  src[i+ni +ninj2]*abcd[2] + src[i+ni +ninj3]*abcd[3];
    vc4[i] = src[i+ni2]*abcd[0] + src[i+ni2+ninj]*abcd[1] +  src[i+ni2+ninj2]*abcd[2] + src[i+ni2+ninj3]*abcd[3];
    vd4[i] = src[i+ni3]*abcd[0] + src[i+ni3+ninj]*abcd[1] +  src[i+ni3+ninj2]*abcd[2] + src[i+ni3+ninj3]*abcd[3];
    dst[i] = va4[i]*y0 + vb4[i]*y1 + vc4[i]*y2 + vd4[i]*y3;
  }
#endif

// ==== interpolation along x, scalar ====

  x0 = cm167*x*(x-one)*(x-two);
  x1 = cp5*(x+one)*(x-one)*(x-two);
  x2 = cm5*x*(x+one)*(x-two);
  x3 = cp167*x*(x+one)*(x-one);

  return(dst[0]*x0 + dst[1]*x1 + dst[2]*x2 + dst[3]*x3);
}
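
// For reference (illustrative only; the helper name is not part of the
// original): the weights y0..y3 above are the cubic Lagrange basis functions
// for the nodes {-1, 0, 1, 2}, assuming the constants mean cm167 = -1/6,
// cp5 = 1/2, cm5 = -1/2 and cp167 = 1/6 as their names suggest:
//   w[-1](t) = -t(t-1)(t-2)/6      w[0](t) = (t+1)(t-1)(t-2)/2
//   w[1](t)  = -t(t+1)(t-2)/2      w[2](t) = t(t+1)(t-1)/6
static void cubic_lagrange_weights_sketch(float t, float w[4]){
  w[0] = -t*(t-1.0f)*(t-2.0f)/6.0f;
  w[1] = (t+1.0f)*(t-1.0f)*(t-2.0f)/2.0f;
  w[2] = -t*(t+1.0f)*(t-2.0f)/2.0f;
  w[3] = t*(t+1.0f)*(t-1.0f)/6.0f;
}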
// Store the pixel's values into a given pixel list's position
inline void StorePixel(float * rgbaBuffer, const __m128 pix, const float outAlpha)
{
    _mm_storeu_ps(rgbaBuffer, pix);
    rgbaBuffer[3] = outAlpha;
}
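
// Usage sketch for StorePixel (illustrative only; the wrapper name is invented
// here). Note that _mm_set_ps takes its lanes in reverse order, so the call
// below writes {r, g, b, outAlpha} to rgbaBuffer; the alpha lane of the vector
// itself is ignored and overwritten by the explicit value.
inline void StoreRGBWithAlpha(float * rgbaBuffer, float r, float g, float b,
                              float outAlpha)
{
    StorePixel(rgbaBuffer, _mm_set_ps(0.0f, b, g, r), outAlpha);
}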
int main()
{	

	float m=1.0; /* initial  magnification		*/

	/* Timing variables */
	struct timeval start_time, stop_time;
	long long compute_time;

	/* 			*/
	/* Create a screen to render to */
	Screen *screen;
	screen = new Screen(HXRES, HYRES);
	gettimeofday(&start_time, NULL);
	int depth=0;
	//#pragma omp parallel
	//{
	while (depth < MAX_DEPTH) {
		if(GREGG){	
			int hx, hy;
			for (hy=0; hy<HYRES; hy++) {
				for (hx=0; hx<HXRES; hx++) {
					int iterations;

					/* 
					 * Translate pixel coordinates to complex plane coordinates centred
					 * on PX, PY
					 */
					float cx = ((((float)hx/(float)HXRES) -0.5 + (PX/(4.0/m)))*(4.0f/m));
					float cy = ((((float)hy/(float)HYRES) -0.5 + (PY/(4.0/m)))*(4.0f/m));
					//__m128 cx_m = _mm_set1_ps(cx);
					//__m128 cy_m = _mm_set1_ps(cy);
					//if (!member_speed(cx_m, cy_m, iterations)) 
					if (!member(cx, cy, iterations)) {
						/* Point is not a member, colour based on number of iterations before escape */
						int i=(iterations%40) - 1;
						int b = i*3;
						screen->putpixel(hx, hy, pal[b], pal[b+1], pal[b+2]);
					} else {
						/* Point is a member, colour it black */
						screen->putpixel(hx, hy, 0, 0, 0);
					}
				}
			}
		}
		else{
			#pragma omp parallel
			{
			float * answers = (float *)malloc(sizeof(float) * 4);
			#pragma omp for schedule(dynamic)
			for (int hy=0; hy<HYRES; hy++) {
				float cy = ((((float)hy/(float)HYRES) -0.5 + (PY/(4.0/m)))*(4.0f/m));
				__m128 cy_m = _mm_set1_ps(cy);
				//__m128 cy_m = _mm_setr_ps(hy, hy+1, hy+2, hy+3);
				__m128 four_m = _mm_set1_ps((4.0/m));
				//cy_m = _mm_div_ps(cy_m, _mm_set1_ps(HYRES));
				//cy_m = _mm_sub_ps(cy_m, _mm_set1_ps(0.5));
				//cy_m = _mm_add_ps(cy_m, _mm_mul_ps(_mm_div_ps(_mm_set1_ps(PY),four_m), four_m));
				for (int hx=0; hx<HXRES; hx+=4) {
					//float cx = ((((float)hx/(float)HXRES) -0.5 + (PX/(4.0/m)))*(4.0f/m));
					//float cy = ((((float)hy/(float)HYRES) -0.5 + (PY/(4.0/m)))*(4.0f/m));
					__m128 cx_m = _mm_setr_ps(hx, hx+1, hx+2, hx+3);
					cx_m = _mm_div_ps(cx_m, _mm_set1_ps(HXRES));
					cx_m = _mm_sub_ps(cx_m, _mm_set1_ps(0.5));
					cx_m = _mm_add_ps(cx_m, _mm_div_ps(_mm_set1_ps(PX),four_m));
					cx_m = _mm_mul_ps(cx_m, four_m);

					_mm_storeu_ps(answers, member_speed(cx_m, cy_m));
					for (int k = 0; k < 4; k++) {
						if (answers[k] != MAX_ITS) {
							int l=((int)(answers[k]) % 40) - 1;
							l = l*3;
							screen->putpixel(hx+k, hy, pal[l], pal[l+1], pal[l+2]);
						}else{
							screen->putpixel(hx+k, hy, 0, 0, 0);
						}
					}
				}
			}
			free(answers);
			}
		}
		
		screen->flip();
		/* Show the rendered image on the screen */
		std::cerr << "Render done " << depth++ << " " << m << std::endl;
		/* Zoom in */
		m *= ZOOM_FACTOR;
	}
	//}
	gettimeofday(&stop_time, NULL);
	compute_time = (stop_time.tv_sec - start_time.tv_sec) * 1000000L +
		(stop_time.tv_usec - start_time.tv_usec);
	fprintf(stderr, "Time to find Richys tour: %lld microseconds\n", compute_time);
	sleep(5);
	std::cout << "Clean Exit"<< std::endl;

}
static void ScaleErrorSignalSSE2(int extended_filter_enabled,
                                 float normal_mu,
                                 float normal_error_threshold,
                                 float x_pow[PART_LEN1],
                                 float ef[2][PART_LEN1]) {
  const __m128 k1e_10f = _mm_set1_ps(1e-10f);
  const __m128 kMu = extended_filter_enabled ? _mm_set1_ps(kExtendedMu)
      : _mm_set1_ps(normal_mu);
  const __m128 kThresh = extended_filter_enabled
                             ? _mm_set1_ps(kExtendedErrorThreshold)
                             : _mm_set1_ps(normal_error_threshold);

  int i;
  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const __m128 x_pow_local = _mm_loadu_ps(&x_pow[i]);
    const __m128 ef_re_base = _mm_loadu_ps(&ef[0][i]);
    const __m128 ef_im_base = _mm_loadu_ps(&ef[1][i]);

    const __m128 xPowPlus = _mm_add_ps(x_pow_local, k1e_10f);
    __m128 ef_re = _mm_div_ps(ef_re_base, xPowPlus);
    __m128 ef_im = _mm_div_ps(ef_im_base, xPowPlus);
    const __m128 ef_re2 = _mm_mul_ps(ef_re, ef_re);
    const __m128 ef_im2 = _mm_mul_ps(ef_im, ef_im);
    const __m128 ef_sum2 = _mm_add_ps(ef_re2, ef_im2);
    const __m128 absEf = _mm_sqrt_ps(ef_sum2);
    const __m128 bigger = _mm_cmpgt_ps(absEf, kThresh);
    __m128 absEfPlus = _mm_add_ps(absEf, k1e_10f);
    const __m128 absEfInv = _mm_div_ps(kThresh, absEfPlus);
    __m128 ef_re_if = _mm_mul_ps(ef_re, absEfInv);
    __m128 ef_im_if = _mm_mul_ps(ef_im, absEfInv);
    ef_re_if = _mm_and_ps(bigger, ef_re_if);
    ef_im_if = _mm_and_ps(bigger, ef_im_if);
    ef_re = _mm_andnot_ps(bigger, ef_re);
    ef_im = _mm_andnot_ps(bigger, ef_im);
    ef_re = _mm_or_ps(ef_re, ef_re_if);
    ef_im = _mm_or_ps(ef_im, ef_im_if);
    ef_re = _mm_mul_ps(ef_re, kMu);
    ef_im = _mm_mul_ps(ef_im, kMu);

    _mm_storeu_ps(&ef[0][i], ef_re);
    _mm_storeu_ps(&ef[1][i], ef_im);
  }
  // scalar code for the remaining items.
  {
    const float mu =
        extended_filter_enabled ? kExtendedMu : normal_mu;
    const float error_threshold = extended_filter_enabled
                                      ? kExtendedErrorThreshold
                                      : normal_error_threshold;
    for (; i < (PART_LEN1); i++) {
      float abs_ef;
      ef[0][i] /= (x_pow[i] + 1e-10f);
      ef[1][i] /= (x_pow[i] + 1e-10f);
      abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);

      if (abs_ef > error_threshold) {
        abs_ef = error_threshold / (abs_ef + 1e-10f);
        ef[0][i] *= abs_ef;
        ef[1][i] *= abs_ef;
      }

      // Stepsize factor
      ef[0][i] *= mu;
      ef[1][i] *= mu;
    }
  }
}
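// A standalone sketch of the mask / and / andnot / or idiom used above to
// implement the per-lane conditional "if (absEf > thresh) scale ef" without
// branches (helper name invented for illustration):
//   select(mask, a, b) = (mask & a) | (~mask & b)
#include <xmmintrin.h>

static __m128 SelectPsSketch(__m128 mask, __m128 a, __m128 b) {
  return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
}
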
// Updates the following smoothed Power Spectral Densities (PSD):
//  - sd  : near-end
//  - se  : residual echo
//  - sx  : far-end
//  - sde : cross-PSD of near-end and residual echo
//  - sxd : cross-PSD of near-end and far-end
//
// In addition to updating the PSDs, the filter divergence state is determined,
// upon which actions are taken.
static void SmoothedPSD(AecCore* aec,
                        float efw[2][PART_LEN1],
                        float dfw[2][PART_LEN1],
                        float xfw[2][PART_LEN1],
                        int* extreme_filter_divergence) {
  // Power estimate smoothing coefficients.
  const float* ptrGCoh = aec->extended_filter_enabled
      ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
      : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
  int i;
  float sdSum = 0, seSum = 0;
  const __m128 vec_15 =  _mm_set1_ps(WebRtcAec_kMinFarendPSD);
  const __m128 vec_GCoh0 = _mm_set1_ps(ptrGCoh[0]);
  const __m128 vec_GCoh1 = _mm_set1_ps(ptrGCoh[1]);
  __m128 vec_sdSum = _mm_set1_ps(0.0f);
  __m128 vec_seSum = _mm_set1_ps(0.0f);

  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const __m128 vec_dfw0 = _mm_loadu_ps(&dfw[0][i]);
    const __m128 vec_dfw1 = _mm_loadu_ps(&dfw[1][i]);
    const __m128 vec_efw0 = _mm_loadu_ps(&efw[0][i]);
    const __m128 vec_efw1 = _mm_loadu_ps(&efw[1][i]);
    const __m128 vec_xfw0 = _mm_loadu_ps(&xfw[0][i]);
    const __m128 vec_xfw1 = _mm_loadu_ps(&xfw[1][i]);
    __m128 vec_sd = _mm_mul_ps(_mm_loadu_ps(&aec->sd[i]), vec_GCoh0);
    __m128 vec_se = _mm_mul_ps(_mm_loadu_ps(&aec->se[i]), vec_GCoh0);
    __m128 vec_sx = _mm_mul_ps(_mm_loadu_ps(&aec->sx[i]), vec_GCoh0);
    __m128 vec_dfw_sumsq = _mm_mul_ps(vec_dfw0, vec_dfw0);
    __m128 vec_efw_sumsq = _mm_mul_ps(vec_efw0, vec_efw0);
    __m128 vec_xfw_sumsq = _mm_mul_ps(vec_xfw0, vec_xfw0);
    vec_dfw_sumsq = _mm_add_ps(vec_dfw_sumsq, _mm_mul_ps(vec_dfw1, vec_dfw1));
    vec_efw_sumsq = _mm_add_ps(vec_efw_sumsq, _mm_mul_ps(vec_efw1, vec_efw1));
    vec_xfw_sumsq = _mm_add_ps(vec_xfw_sumsq, _mm_mul_ps(vec_xfw1, vec_xfw1));
    vec_xfw_sumsq = _mm_max_ps(vec_xfw_sumsq, vec_15);
    vec_sd = _mm_add_ps(vec_sd, _mm_mul_ps(vec_dfw_sumsq, vec_GCoh1));
    vec_se = _mm_add_ps(vec_se, _mm_mul_ps(vec_efw_sumsq, vec_GCoh1));
    vec_sx = _mm_add_ps(vec_sx, _mm_mul_ps(vec_xfw_sumsq, vec_GCoh1));
    _mm_storeu_ps(&aec->sd[i], vec_sd);
    _mm_storeu_ps(&aec->se[i], vec_se);
    _mm_storeu_ps(&aec->sx[i], vec_sx);

    {
      const __m128 vec_3210 = _mm_loadu_ps(&aec->sde[i][0]);
      const __m128 vec_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]);
      __m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654,
                                    _MM_SHUFFLE(2, 0, 2, 0));
      __m128 vec_b = _mm_shuffle_ps(vec_3210, vec_7654,
                                    _MM_SHUFFLE(3, 1, 3, 1));
      __m128 vec_dfwefw0011 = _mm_mul_ps(vec_dfw0, vec_efw0);
      __m128 vec_dfwefw0110 = _mm_mul_ps(vec_dfw0, vec_efw1);
      vec_a = _mm_mul_ps(vec_a, vec_GCoh0);
      vec_b = _mm_mul_ps(vec_b, vec_GCoh0);
      vec_dfwefw0011 = _mm_add_ps(vec_dfwefw0011,
                                  _mm_mul_ps(vec_dfw1, vec_efw1));
      vec_dfwefw0110 = _mm_sub_ps(vec_dfwefw0110,
                                  _mm_mul_ps(vec_dfw1, vec_efw0));
      vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwefw0011, vec_GCoh1));
      vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwefw0110, vec_GCoh1));
      _mm_storeu_ps(&aec->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b));
      _mm_storeu_ps(&aec->sde[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b));
    }

    {
      const __m128 vec_3210 = _mm_loadu_ps(&aec->sxd[i][0]);
      const __m128 vec_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]);
      __m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654,
                                    _MM_SHUFFLE(2, 0, 2, 0));
      __m128 vec_b = _mm_shuffle_ps(vec_3210, vec_7654,
                                    _MM_SHUFFLE(3, 1, 3, 1));
      __m128 vec_dfwxfw0011 = _mm_mul_ps(vec_dfw0, vec_xfw0);
      __m128 vec_dfwxfw0110 = _mm_mul_ps(vec_dfw0, vec_xfw1);
      vec_a = _mm_mul_ps(vec_a, vec_GCoh0);
      vec_b = _mm_mul_ps(vec_b, vec_GCoh0);
      vec_dfwxfw0011 = _mm_add_ps(vec_dfwxfw0011,
                                  _mm_mul_ps(vec_dfw1, vec_xfw1));
      vec_dfwxfw0110 = _mm_sub_ps(vec_dfwxfw0110,
                                  _mm_mul_ps(vec_dfw1, vec_xfw0));
      vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwxfw0011, vec_GCoh1));
      vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwxfw0110, vec_GCoh1));
      _mm_storeu_ps(&aec->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b));
      _mm_storeu_ps(&aec->sxd[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b));
    }

    vec_sdSum = _mm_add_ps(vec_sdSum, vec_sd);
    vec_seSum = _mm_add_ps(vec_seSum, vec_se);
  }

  _mm_add_ps_4x1(vec_sdSum, &sdSum);
  _mm_add_ps_4x1(vec_seSum, &seSum);

  for (; i < PART_LEN1; i++) {
    aec->sd[i] = ptrGCoh[0] * aec->sd[i] +
                 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
    aec->se[i] = ptrGCoh[0] * aec->se[i] +
                 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
    // We threshold here to protect against the ill-effects of a zero farend.
    // The threshold is not arbitrarily chosen, but balances protection and
    // adverse interaction with the algorithm's tuning.
    // TODO(bjornv): investigate further why this is so sensitive.
    aec->sx[i] =
        ptrGCoh[0] * aec->sx[i] +
        ptrGCoh[1] * WEBRTC_SPL_MAX(
            xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
            WebRtcAec_kMinFarendPSD);

    aec->sde[i][0] =
        ptrGCoh[0] * aec->sde[i][0] +
        ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
    aec->sde[i][1] =
        ptrGCoh[0] * aec->sde[i][1] +
        ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);

    aec->sxd[i][0] =
        ptrGCoh[0] * aec->sxd[i][0] +
        ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
    aec->sxd[i][1] =
        ptrGCoh[0] * aec->sxd[i][1] +
        ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);

    sdSum += aec->sd[i];
    seSum += aec->se[i];
  }

  // Divergent filter safeguard update.
  aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum;

  // Signal extreme filter divergence if the error is significantly larger
  // than the nearend (13 dB).
  *extreme_filter_divergence = (seSum > (19.95f * sdSum));
}
void sgemm( int m, int n, int d, float *A, float *C )
{
    int n1 = n+1, nEnd = n/VERTICAL_ROLL*VERTICAL_ROLL;
    float *B = A, *D = C;
	#pragma omp parallel for
	 for (int j = 0; j < n; j++) {
		int jn1 = j*(n+1), jn = j*n; float *Cjn = D+jn;
		// for (int b = 0; b < m; b+= BLOCKSIZE) {
			for (int i = 0; i < nEnd; i+=VERTICAL_ROLL) {
			    float *Cjni = Cjn+i;
			    float *Cjni1 = Cjni + 4;
			    float *Cjni2 = Cjni + 8;
			    float *Cjni3 = Cjni + 12;
			    float *Cjni4 = Cjni + 16;
			    float *Cjni5 = Cjni + 20;
			    float *Cjni6 = Cjni + 24;
			    float *Cjni7 = Cjni + 28;

			    int i1 = i+4;
			    int i2 = i+8;
			    int i3 = i+12;
			    int i4 = i+16;
			    int i5 = i+20;
			    int i6 = i+24;
			    int i7 = i+28;

			    __m128 Cij = _mm_loadu_ps(Cjni);
			    __m128 Cij1 = _mm_loadu_ps(Cjni1);
			    __m128 Cij2 = _mm_loadu_ps(Cjni2);
			    __m128 Cij3 = _mm_loadu_ps(Cjni3);
			    __m128 Cij4 = _mm_loadu_ps(Cjni4);
			    __m128 Cij5 = _mm_loadu_ps(Cjni5);
			    __m128 Cij6 = _mm_loadu_ps(Cjni6);
			    __m128 Cij7 = _mm_loadu_ps(Cjni7);


			    // for (int k = b; k < b+BLOCKSIZE && k < m; k++) {
			    for (int k = 0; k < m; k++) {
					int k1 = k + 1; float *Akn = B+k*n;
					__m128 Ajk = _mm_load1_ps(Akn+jn1);

					__m128 Aik = _mm_loadu_ps(Akn+i);
					__m128 Ai1k = _mm_loadu_ps(Akn+i1);
					__m128 Ai2k = _mm_loadu_ps(Akn+i2);
					__m128 Ai3k = _mm_loadu_ps(Akn+i3);
					__m128 Ai4k = _mm_loadu_ps(Akn+i4);
					__m128 Ai5k = _mm_loadu_ps(Akn+i5);
					__m128 Ai6k = _mm_loadu_ps(Akn+i6);
					__m128 Ai7k = _mm_loadu_ps(Akn+i7);

					Cij = _mm_add_ps(Cij, _mm_mul_ps(Ajk, Aik));
					Cij1 = _mm_add_ps(Cij1, _mm_mul_ps(Ajk, Ai1k));
					Cij2 = _mm_add_ps(Cij2, _mm_mul_ps(Ajk, Ai2k));
					Cij3 = _mm_add_ps(Cij3, _mm_mul_ps(Ajk, Ai3k));
					Cij4 = _mm_add_ps(Cij4, _mm_mul_ps(Ajk, Ai4k));
					Cij5 = _mm_add_ps(Cij5, _mm_mul_ps(Ajk, Ai5k));
					Cij6 = _mm_add_ps(Cij6, _mm_mul_ps(Ajk, Ai6k));
					Cij7 = _mm_add_ps(Cij7, _mm_mul_ps(Ajk, Ai7k));
			    }
			    _mm_storeu_ps(Cjni, Cij);
			    _mm_storeu_ps(Cjni1, Cij1);
			    _mm_storeu_ps(Cjni2, Cij2);
			    _mm_storeu_ps(Cjni3, Cij3);
			    _mm_storeu_ps(Cjni4, Cij4);
			    _mm_storeu_ps(Cjni5, Cij5);
			    _mm_storeu_ps(Cjni6, Cij6);
			    _mm_storeu_ps(Cjni7, Cij7);
			}
		// }
    }
    if (n % VERTICAL_ROLL != 0 && (n - (nEnd) >= 4)) {
		#pragma omp parallel for
		for (int j = 0; j < n; j++) {
			for (int i = nEnd; i < n/4*4; i+=4) {
				float *addrCij = D+i+j*n;
				float *Ajn1 = B+j*n1;
				float *Ai = A+i;
				__m128 Cij = _mm_loadu_ps(addrCij);
				for (int k = 0; k < m; k++) {
				    int kn = k*n;
				    __m128 Ajk = _mm_load1_ps(Ajn1+kn);
				    __m128 Aik = _mm_loadu_ps(Ai+kn);
				    Cij = _mm_add_ps(Cij, _mm_mul_ps(Ajk, Aik));
				}
				_mm_storeu_ps(addrCij, Cij);
			}
		}
    }
    // Scalar cleanup for the last n % 4 columns. The original code used 4-wide
    // vector loads/stores here, which read past the ends of A and C for these
    // final elements; plain scalar accumulation avoids that.
    if ((n - nEnd) % 4 != 0) {
		#pragma omp parallel for
		for (int j = 0; j < n; j++) {
		    float *Ajn1 = B+j*n1;
		    for (int i = n/4*4; i < n; i++) {
				float cij = D[i+j*n];
				for (int k = 0; k < m; k++) {
				    int kn = k*n;
				    cij += Ajn1[kn] * B[i+kn];
				}
				D[i+j*n] = cij;
		    }
		}
	}
}	
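// What the unrolled kernel above computes, written as a naive scalar reference
// (a sketch derived from the function's own scalar fallback loops; the A layout
// with the (n+1) and n strides is taken from the code, not from any header):
//   C[i + j*n] += A[i + k*n] * A[j*(n+1) + k*n]   for 0 <= i,j < n, 0 <= k < m
void sgemm_reference(int m, int n, const float *A, float *C)
{
    for (int j = 0; j < n; j++)
        for (int k = 0; k < m; k++) {
            float ajk = A[j * (n + 1) + k * n];  // the value _mm_load1_ps broadcasts
            for (int i = 0; i < n; i++)
                C[i + j * n] += A[i + k * n] * ajk;
        }
}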
static void OverdriveAndSuppressSSE2(aec_t *aec, float hNl[PART_LEN1],
                                     const float hNlFb,
                                     float efw[2][PART_LEN1]) {
  int i;
  const __m128 vec_hNlFb = _mm_set1_ps(hNlFb);
  const __m128 vec_one = _mm_set1_ps(1.0f);
  const __m128 vec_minus_one = _mm_set1_ps(-1.0f);
  const __m128 vec_overDriveSm = _mm_set1_ps(aec->overDriveSm);
  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i+=4) {
    // Weight subbands
    __m128 vec_hNl = _mm_loadu_ps(&hNl[i]);
    const __m128 vec_weightCurve = _mm_loadu_ps(&WebRtcAec_weightCurve[i]);
    const __m128 bigger = _mm_cmpgt_ps(vec_hNl, vec_hNlFb);
    const __m128 vec_weightCurve_hNlFb = _mm_mul_ps(
        vec_weightCurve, vec_hNlFb);
    const __m128 vec_one_weightCurve = _mm_sub_ps(vec_one, vec_weightCurve);
    const __m128 vec_one_weightCurve_hNl = _mm_mul_ps(
        vec_one_weightCurve, vec_hNl);
    const __m128 vec_if0 = _mm_andnot_ps(bigger, vec_hNl);
    const __m128 vec_if1 = _mm_and_ps(
        bigger, _mm_add_ps(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl));
    vec_hNl = _mm_or_ps(vec_if0, vec_if1);

    {
      const __m128 vec_overDriveCurve = _mm_loadu_ps(
          &WebRtcAec_overDriveCurve[i]);
      const __m128 vec_overDriveSm_overDriveCurve = _mm_mul_ps(
          vec_overDriveSm, vec_overDriveCurve);
      vec_hNl = mm_pow_ps(vec_hNl, vec_overDriveSm_overDriveCurve);
      _mm_storeu_ps(&hNl[i], vec_hNl);
    }

    // Suppress error signal
    {
      __m128 vec_efw_re = _mm_loadu_ps(&efw[0][i]);
      __m128 vec_efw_im = _mm_loadu_ps(&efw[1][i]);
      vec_efw_re = _mm_mul_ps(vec_efw_re, vec_hNl);
      vec_efw_im = _mm_mul_ps(vec_efw_im, vec_hNl);

      // Ooura fft returns incorrect sign on imaginary component. It matters
      // here because we are making an additive change with comfort noise.
      vec_efw_im = _mm_mul_ps(vec_efw_im, vec_minus_one);
      _mm_storeu_ps(&efw[0][i], vec_efw_re);
      _mm_storeu_ps(&efw[1][i], vec_efw_im);
    }
  }
  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    // Weight subbands
    if (hNl[i] > hNlFb) {
      hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
          (1 - WebRtcAec_weightCurve[i]) * hNl[i];
    }
    hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);

    // Suppress error signal
    efw[0][i] *= hNl[i];
    efw[1][i] *= hNl[i];

    // Ooura fft returns incorrect sign on imaginary component. It matters
    // here because we are making an additive change with comfort noise.
    efw[1][i] *= -1;
  }
}
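// The andnot/and/or sequence above is the classic SSE2 branchless select:
// result = (mask & a) | (~mask & b). A minimal sketch of the same idiom in
// isolation (SSE4.1 offers _mm_blendv_ps as a one-instruction alternative,
// but the SSE2 form keeps the code usable on older CPUs):
static __m128 select_ps(__m128 mask, __m128 a, __m128 b) {
  return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
}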
void mexFunction(int nlhs, mxArray *plhs[],
                 int nrhs, const mxArray *prhs[])
{
        // NOTE: coeff, _src, _dst, length and width are not defined in this
        // snippet; they are assumed to come from the enclosing translation
        // unit. The body itself is a 1-D correlation of 'width' floats with an
        // nz-tap kernel, vectorized 16, then 4, then 1 element at a time.
        const float * kf   = coeff;
        float * src = _src;
        float * dst = _dst;
        int i = 0, k, nz = length;
        
        // float delta = 0.000001f;
        __m128 d4 = _mm_setzero_ps();
        
        float * S;
        
        __m128 s0, s1, s2, s3, 
               t0, t1, t2, t3;
        __m128 f;
        
        for(i = 0; i <= width - 16; i += 16 )
        {
            s0 = d4, s1 = d4, s2 = d4, s3 = d4;

            for( k = 0; k < nz; k++ )
            {
                f = _mm_load_ss(kf + k);
                f = _mm_shuffle_ps(f, f, 0);  // broadcast kernel tap k to all four lanes
                S = src + i + k;

                t0 = _mm_loadu_ps(S);
                t1 = _mm_loadu_ps(S + 4);
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

                t0 = _mm_loadu_ps(S + 8);
                t1 = _mm_loadu_ps(S + 12);
                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
            }

            _mm_storeu_ps(dst + i, s0);
            _mm_storeu_ps(dst + i + 4, s1);
            _mm_storeu_ps(dst + i + 8, s2);
            _mm_storeu_ps(dst + i + 12, s3);
        }
        for( ; i <= width - 4; i += 4 )
        {
            s0 = d4;

            for( k = 0; k < nz; k++ )
            {
                f = _mm_load_ss(kf + k);
                f = _mm_shuffle_ps(f, f, 0);
                t0 = _mm_loadu_ps(src + k + i);
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
            }
            _mm_storeu_ps(dst + i, s0);
        }
        
        for (; i < width; i++)
        {
            for( k = 0; k < nz; k++ )
            {
                *(dst + i) += *(src + i + k) * *(kf + k); 
            }
        }

        return;
}
void
intrin_sse_mult_su3_mat_vec(su3_matrixf *aa, su3_vectorf* bb, su3_vectorf* cc)
{

	 /* XMM Variables */
	 __m128 xmm2, xmm3, xmm0, xmm1, xmm6, xmm7, xmm4, xmm5;

	xmm0 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)&((bb)->c[0]) );  /* low half = re/im of c[0] */
	xmm1 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)&((bb)->c[1]) );
	xmm2 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 *)&((bb)->c[2]) );
	xmm0 = _mm_shuffle_ps( xmm0, xmm0, 0x44 );
	xmm1 = _mm_shuffle_ps( xmm1, xmm1, 0x44 );
	xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0x44 );
	xmm3 = _mm_load_ss((float *)&((aa)->e[0][0].real) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][0].real) );
	xmm3 = _mm_shuffle_ps( xmm3, xmm7, 0x00 );
	xmm4 = _mm_load_ss((float *)&((aa)->e[0][1].real) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][1].real) );
	xmm4 = _mm_shuffle_ps( xmm4, xmm7, 0x00 );
	xmm3 = _mm_mul_ps( xmm3, xmm0 );
	xmm4 = _mm_mul_ps( xmm4, xmm1 );
	xmm3 = _mm_add_ps( xmm3, xmm4 );
	xmm5 = _mm_load_ss((float *)&((aa)->e[0][2].real) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][2].real) );
	xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 );
	xmm5 = _mm_mul_ps( xmm5, xmm2 );
	xmm3 = _mm_add_ps( xmm3, xmm5 );
	xmm1 = _mm_shuffle_ps( xmm1, xmm0, 0x44 );
	xmm7 = _mm_load_ss((float *)&((aa)->e[2][0].real) );
	xmm6 = _mm_load_ss((float *)&((aa)->e[2][1].real) );
	xmm6 = _mm_shuffle_ps( xmm6, xmm7, 0x00 );
	xmm6 = _mm_mul_ps( xmm6, xmm1 );
	xmm0 = _mm_shuffle_ps( xmm0, xmm0, 0xB1 );
	xmm0 = _mm_xor_ps( xmm0, _sse_sgn13.xmm );
	xmm1 = _mm_shuffle_ps( xmm1, xmm1, 0x11 );
	xmm1 = _mm_xor_ps( xmm1, _sse_sgn13.xmm );
	xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0xB1 );
	xmm2 = _mm_xor_ps( xmm2, _sse_sgn13.xmm );
	xmm4 = _mm_load_ss((float *)&((aa)->e[0][0].imag) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][0].imag) );
	xmm4 = _mm_shuffle_ps( xmm4, xmm7, 0x00 );
	xmm4 = _mm_mul_ps( xmm4, xmm0 );
	xmm3 = _mm_add_ps( xmm3, xmm4 );
	xmm5 = _mm_load_ss((float *)&((aa)->e[0][1].imag) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][1].imag) );
	xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 );
	xmm5 = _mm_mul_ps( xmm5, xmm1 );
	xmm3 = _mm_add_ps( xmm3, xmm5 );
	xmm5 = _mm_load_ss((float *)&((aa)->e[0][2].imag) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][2].imag) );
	xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 );
	xmm5 = _mm_mul_ps( xmm5, xmm2 );
	xmm3 = _mm_add_ps( xmm3, xmm5 );
	_mm_storeu_ps((float *)&((cc)->c[0]), xmm3 );
	xmm1 = _mm_shuffle_ps( xmm1, xmm0, 0x44 );
	xmm7 = _mm_load_ss((float *)&((aa)->e[2][0].imag) );
	xmm5 = _mm_load_ss((float *)&((aa)->e[2][1].imag) );
	xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 );
	xmm5 = _mm_mul_ps( xmm5, xmm1 );
	xmm6 = _mm_add_ps( xmm6, xmm5 );
	xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0xB4 );
	xmm2 = _mm_xor_ps( xmm2, _sse_sgn4.xmm );
	xmm7 = _mm_loadl_pi(xmm7, (__m64 *)&((aa)->e[2][2]) );
	xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x05 );
	xmm7 = _mm_mul_ps( xmm7, xmm2 );
	xmm6 = _mm_add_ps( xmm6, xmm7 );
	xmm7 = xmm6 ; 
	xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0xEE );
	xmm6 = _mm_add_ps( xmm6, xmm7 );
	_mm_storel_pi((__m64 *)&((cc)->c[2]), xmm6 );
}
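// Scalar reference for the routine above: cc = aa * bb with a 3x3 complex
// matrix aa and complex 3-vectors bb, cc. The struct shapes are inferred from
// the member accesses in the SSE code (e[i][j].real/.imag, c[j] as an 8-byte
// real/imag pair); treat the .real/.imag names on c[] as an assumption, since
// only their layout is visible above.
static void mult_su3_mat_vec_ref(const su3_matrixf *aa, const su3_vectorf *bb,
                                 su3_vectorf *cc)
{
	for (int i = 0; i < 3; i++) {
		float re = 0.0f, im = 0.0f;
		for (int j = 0; j < 3; j++) {
			/* complex multiply-accumulate: (ar + i*ai) * (br + i*bi) */
			re += aa->e[i][j].real * bb->c[j].real - aa->e[i][j].imag * bb->c[j].imag;
			im += aa->e[i][j].real * bb->c[j].imag + aa->e[i][j].imag * bb->c[j].real;
		}
		cc->c[i].real = re;
		cc->c[i].imag = im;
	}
}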
int SymmColumnVec_32f_Symm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2)
{
    int i = 0, k;
    const float *S, *S2;
    const __m128 d4 = _mm_set1_ps(delta);
    const __m256 d8 = _mm256_set1_ps(delta);

    for( ; i <= width - 16; i += 16 )
    {
        __m256 f = _mm256_set1_ps(ky[0]);
        __m256 s0, s1;
        __m256 x0;
        S = src[0] + i;
        s0 = _mm256_loadu_ps(S);
#if CV_FMA3
        s0 = _mm256_fmadd_ps(s0, f, d8);
#else
        s0 = _mm256_add_ps(_mm256_mul_ps(s0, f), d8);
#endif
        s1 = _mm256_loadu_ps(S+8);
#if CV_FMA3
        s1 = _mm256_fmadd_ps(s1, f, d8);
#else
        s1 = _mm256_add_ps(_mm256_mul_ps(s1, f), d8);
#endif

        for( k = 1; k <= ksize2; k++ )
        {
            S = src[k] + i;
            S2 = src[-k] + i;
            f = _mm256_set1_ps(ky[k]);
            x0 = _mm256_add_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2));
#if CV_FMA3
            s0 = _mm256_fmadd_ps(x0, f, s0);
#else
            s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
#endif
            x0 = _mm256_add_ps(_mm256_loadu_ps(S+8), _mm256_loadu_ps(S2+8));
#if CV_FMA3
            s1 = _mm256_fmadd_ps(x0, f, s1);
#else
            s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f));
#endif
        }

        _mm256_storeu_ps(dst + i, s0);
        _mm256_storeu_ps(dst + i + 8, s1);
    }

    for( ; i <= width - 4; i += 4 )
    {
        __m128 f = _mm_set1_ps(ky[0]);
        __m128 x0, s0 = _mm_load_ps(src[0] + i);
        s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);

        for( k = 1; k <= ksize2; k++ )
        {
            f = _mm_set1_ps(ky[k]);
            S = src[k] + i;
            S2 = src[-k] + i;
            x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
            s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
        }

        _mm_storeu_ps(dst + i, s0);
    }

    _mm256_zeroupper();
    return i;
}
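// Scalar reference for the symmetric column pass above (a sketch; 'src' is a
// ring of row pointers centred on row 0, indexed exactly as the vector code
// indexes it):
//   dst[i] = delta + ky[0]*src[0][i] + sum_{k=1..ksize2} ky[k]*(src[k][i] + src[-k][i])
static void symm_column_ref(const float** src, const float* ky, float* dst,
                            float delta, int width, int ksize2)
{
    for (int i = 0; i < width; i++)
    {
        float s = ky[0] * src[0][i] + delta;
        for (int k = 1; k <= ksize2; k++)
            s += ky[k] * (src[k][i] + src[-k][i]);
        dst[i] = s;
    }
}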
void Print(__m128 m)
{
	float val[4];
	_mm_storeu_ps(val, m);
	fprintf(stderr, "[%g, %g, %g, %g]\n", val[3], val[2], val[1], val[0]);
}
int conv2D(float* in, float* out, int data_size_X, int data_size_Y,
                    float* kernel)
{   
    omp_set_num_threads(16);
    float t[9];
    kernel = memcpy(t, kernel, sizeof(t));  // copy the 3x3 kernel into a local buffer
    #pragma omp parallel shared(kernel, out, in)
    {
        #pragma omp for
        for (int y = 1; y < data_size_Y-1; y++){
            int lastColumn = data_size_X-1;
            float first = 0, last = 0;
            for(int rowNum = -1; rowNum <= 1; rowNum++){
                float *firsttemp =  (y+rowNum)*data_size_X+in, *lasttemp =  (y+rowNum)*data_size_X+lastColumn+in;
                int kerPositionTemp =(1-rowNum)*3;
                for(int col = -1; col <= 1; col++){
                    float *kernelPosition= (1-col)+kerPositionTemp+kernel;
                    if(y+rowNum!=data_size_Y&&y+rowNum!=-1 ){
                        if(col!=-1 && col!=data_size_X) first += *(kernelPosition) * *(firsttemp+col);                    
                        if(lastColumn+col!=-1  && lastColumn+col!=data_size_X) last += *(kernelPosition) * *(lasttemp+col);
                    }
                }
            }
            int row = y*data_size_X;
            out[row] = first;
            out[lastColumn+row] = last;
        }
    }


    #pragma omp parallel shared(kernel, out, in)
    {
        #pragma omp for
        for (int x = 0;x<data_size_X;x++){
            int  lastLineNum=data_size_Y-1;
            float first = 0, last = 0;
            for(int j = -1; j <= 1; j++){

                int firstLineTemp = j*data_size_X+x,
                    secondLineTemp = (lastLineNum+j)*data_size_X+x,
                    kerPositionTemp =(1-j)*3;

                for(int i = -1; i <= 1; i++){
                    int kernelPosition= (1-i)+kerPositionTemp;
                    if(x+i>-1 && x+i<data_size_X ){
                        if(j!=-1 && j!=data_size_Y) first += kernel[kernelPosition] * in[i + firstLineTemp];                    
                        if(lastLineNum+j!=-1 && lastLineNum+j!=data_size_Y) last  += kernel[kernelPosition] * in[i + secondLineTemp];
                    }
                }
            }
            out[x] = first;
            out[x+lastLineNum*data_size_X] = last;
        }
    }  

    float tt[9];
    kernel = memcpy(tt, kernel, sizeof(tt));

omp_set_num_threads(16);
#pragma  omp parallel 
{
    #pragma omp for 
    for (int y = 1; y < data_size_Y-1; y++){   
        int row = y*data_size_X, x = 1;
        for (; x < (data_size_X-2)/32*32; x+=32){
            __m128 vector1 = _mm_set1_ps(0.0f);
            __m128 vector2=  vector1;
            __m128 vector3 = vector1;
            __m128 vector4 = vector1;
            __m128 vector5 = vector1;
            __m128 vector6 = vector1;
            __m128 vector7 = vector1;
            __m128 vector8 = vector1;
            for (int row = -1; row < 2; row++){
                int temp = (y+row)*data_size_X;
                int kerPositionTemp = (1-row)*3;
                for (int col = -1; col < 2; col++){                 
                    __m128 kerVal = _mm_set1_ps(kernel[(1-col)+kerPositionTemp]);
                    vector1 = _mm_add_ps (vector1, _mm_mul_ps(_mm_loadu_ps(in+x+col+temp),kerVal));
                    vector2 = _mm_add_ps (vector2, _mm_mul_ps(_mm_loadu_ps(in+x+col+temp+4),kerVal));
                    vector3 = _mm_add_ps (vector3, _mm_mul_ps(_mm_loadu_ps(in+x+col+temp+8),kerVal));
                    vector4 = _mm_add_ps (vector4, _mm_mul_ps(_mm_loadu_ps(in+x+col+temp+12),kerVal));
                    vector5 = _mm_add_ps (vector5, _mm_mul_ps(_mm_loadu_ps(in+x+col+temp+16),kerVal));
                    vector6 = _mm_add_ps (vector6, _mm_mul_ps(_mm_loadu_ps(in+x+col+temp+20),kerVal));
                    vector7 = _mm_add_ps (vector7, _mm_mul_ps(_mm_loadu_ps(in+x+col+temp+24),kerVal));
                    vector8 = _mm_add_ps (vector8, _mm_mul_ps(_mm_loadu_ps(in+x+col+temp+28),kerVal));
                }

            }
            int ot = x+row;
            _mm_storeu_ps(out+ot, vector1);
            _mm_storeu_ps(out+ot+4, vector2);
            _mm_storeu_ps(out+ot+8, vector3);
            _mm_storeu_ps(out+ot+12, vector4);
            _mm_storeu_ps(out+ot+16, vector5);
            _mm_storeu_ps(out+ot+20, vector6);
            _mm_storeu_ps(out+ot+24, vector7);
            _mm_storeu_ps(out+ot+28, vector8);

        }
        // Scalar cleanup for the remaining interior columns. Accumulate into a
        // local and assign, like the vector path does, instead of assuming
        // out[] is zero-initialized.
        for (;x<data_size_X-1; x++){
                float sum = 0.0f;
                for(int i = -1; i <= 1; i++){
                    for(int j = -1; j <= 1; j++){
                        sum += kernel[(1-i)+(1-j)*KERNX] * in[(x+i) + (y+j)*data_size_X];
                    }
                }
                out[x+y*data_size_X] = sum;
            }
        }
}

   


    return 1;
}
// Scalar horizontal max across four lanes.
float hmax(__m128 m)
{
	float f[4];
	_mm_storeu_ps(f, m);
	return fmax(fmax(f[0], f[1]), fmax(f[2], f[3]));
}
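// Register-only alternative (a sketch): reduce with _mm_max_ps and shuffles
// instead of spilling to memory; both forms return the same value.
static float hmax_sse(__m128 m)
{
	__m128 t = _mm_max_ps(m, _mm_shuffle_ps(m, m, _MM_SHUFFLE(2, 3, 0, 1))); // pairwise max
	t = _mm_max_ps(t, _mm_shuffle_ps(t, t, _MM_SHUFFLE(1, 0, 3, 2)));        // max across halves
	return _mm_cvtss_f32(t);
}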
void eigen_hessian_3d(float* dst_e0, float* dst_e1, float* dst_e2, const float* src, size_t w, size_t h, size_t d, bool ref) {
    auto v = [=](size_t x, size_t y, size_t z) -> const float& {
        return src[(z*h + y)*w + x];
    };
    auto ev = [=](float* vol, size_t x, size_t y, size_t z) -> float& { return vol[(z*h + y)*w + x]; };
    for (size_t z=1; z<d-1; ++z) {
        for (size_t y=1; y<h-1; ++y) {
            auto ref_pixel = [=](size_t x) {
                const float f = 2.0f*v(x, y, z);
                const float f_xx = v(x + 1, y, z) + v(x - 1, y, z) - f;
                const float f_yy = v(x, y + 1, z) + v(x, y - 1, z) - f;
                const float f_zz = v(x, y, z + 1) + v(x, y, z - 1) - f;
                const float f_xy = v(x + 1, y + 1, z) + v(x - 1, y - 1, z) - v(x - 1, y + 1, z) - v(x + 1, y - 1, z);
                const float f_yz = v(x, y + 1, z + 1) + v(x, y - 1, z - 1) - v(x, y - 1, z + 1) - v(x, y + 1, z - 1);
                const float f_zx = v(x + 1, y, z + 1) + v(x - 1, y, z - 1) - v(x + 1, y, z - 1) - v(x - 1, y, z + 1);
                float m[9] = {
                    f_xx, f_xy, f_zx,
                    f_xy, f_yy, f_yz,
                    f_zx, f_yz, f_zz
                };
                float eigen[3];
                cubic_eigen<newton_iter>(m, eigen);
                ev(dst_e0, x, y, z) = eigen[0];
                ev(dst_e1, x, y, z) = eigen[1];
                ev(dst_e2, x, y, z) = eigen[2];
            };
            auto simd_pixel = [=](size_t x) {
                __m128 vf2 = _mm_loadu_ps((const float*)&src[(z*h + y)*w + x]);
                vf2 = _mm_add_ps(vf2, vf2);
                __m128 vfxm1 = _mm_loadu_ps((const float*)&src[(z*h + y)*w + x - 1]);
                __m128 vfxp1 = _mm_loadu_ps((const float*)&src[(z*h + y)*w + x + 1]);
                __m128 vfym1 = _mm_loadu_ps((const float*)&src[(z*h + y - 1)*w + x]);
                __m128 vfyp1 = _mm_loadu_ps((const float*)&src[(z*h + y + 1)*w + x]);
                __m128 vfzm1 = _mm_loadu_ps((const float*)&src[((z - 1)*h + y)*w + x]);
                __m128 vfzp1 = _mm_loadu_ps((const float*)&src[((z + 1)*h + y)*w + x]);
                __m128 vfxx = _mm_sub_ps(_mm_add_ps(vfxm1, vfxp1), vf2);
                __m128 vfyy = _mm_sub_ps(_mm_add_ps(vfym1, vfyp1), vf2);
                __m128 vfzz = _mm_sub_ps(_mm_add_ps(vfzm1, vfzp1), vf2);
                __m128 vfxm1ym1 = _mm_loadu_ps((const float*)&src[(z*h + y - 1)*w + x - 1]);
                __m128 vfxp1ym1 = _mm_loadu_ps((const float*)&src[(z*h + y - 1)*w + x + 1]);
                __m128 vfxm1yp1 = _mm_loadu_ps((const float*)&src[(z*h + y + 1)*w + x - 1]);
                __m128 vfxp1yp1 = _mm_loadu_ps((const float*)&src[(z*h + y + 1)*w + x + 1]);
                __m128 vfym1zm1 = _mm_loadu_ps((const float*)&src[((z - 1)*h + y - 1)*w + x]);
                __m128 vfyp1zm1 = _mm_loadu_ps((const float*)&src[((z - 1)*h + y + 1)*w + x]);
                __m128 vfym1zp1 = _mm_loadu_ps((const float*)&src[((z + 1)*h + y - 1)*w + x]);
                __m128 vfyp1zp1 = _mm_loadu_ps((const float*)&src[((z + 1)*h + y + 1)*w + x]);
                __m128 vfzm1xm1 = _mm_loadu_ps((const float*)&src[((z - 1)*h + y)*w + x - 1]);
                __m128 vfzp1xm1 = _mm_loadu_ps((const float*)&src[((z + 1)*h + y)*w + x - 1]);
                __m128 vfzm1xp1 = _mm_loadu_ps((const float*)&src[((z - 1)*h + y)*w + x + 1]);
                __m128 vfzp1xp1 = _mm_loadu_ps((const float*)&src[((z + 1)*h + y)*w + x + 1]);
                __m128 vfxy = _mm_sub_ps(_mm_add_ps(vfxp1yp1, vfxm1ym1), _mm_add_ps(vfxm1yp1, vfxp1ym1));
                __m128 vfyz = _mm_sub_ps(_mm_add_ps(vfyp1zp1, vfym1zm1), _mm_add_ps(vfym1zp1, vfyp1zm1));
                __m128 vfzx = _mm_sub_ps(_mm_add_ps(vfzp1xp1, vfzm1xm1), _mm_add_ps(vfzm1xp1, vfzp1xm1));
                __m128 ve0, ve1, ve2;
                cubic_eigen<newton_iter>(vfxx, vfxy, vfzx, vfyy, vfyz, vfzz, ve0, ve1, ve2);
                _mm_storeu_ps((float*)&dst_e0[(z*h + y)*w + x], ve0);
                _mm_storeu_ps((float*)&dst_e1[(z*h + y)*w + x], ve1);
                _mm_storeu_ps((float*)&dst_e2[(z*h + y)*w + x], ve2);
            };
            size_t x = 1;
            for (; x<std::min<size_t>(w-1, 4); ++x)
                ref_pixel(x);
            if (!ref) {
                for (; x + 5 < w; x += 4)  // same bound as x < w-1-4, but no size_t underflow for small w
                    simd_pixel(x);
            }
            for (; x<w-1; ++x)
                ref_pixel(x);
        }
    }
}
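// Hypothetical usage sketch (names and sizes are illustrative): each of the
// three eigenvalue outputs must be w*h*d floats, the one-voxel border is left
// untouched, and ref=true forces the scalar ref_pixel path everywhere (useful
// for validating the SIMD path).
void eigen_hessian_3d_example(const float* volume, size_t w, size_t h, size_t d)
{
    float* e0 = new float[w * h * d]();
    float* e1 = new float[w * h * d]();
    float* e2 = new float[w * h * d]();
    eigen_hessian_3d(e0, e1, e2, volume, w, h, d, /*ref=*/false);
    // ... consume e0/e1/e2 ...
    delete[] e0; delete[] e1; delete[] e2;
}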
mlib_status
F_NAME(
    mlib_f32 *dst,
    const mlib_f32 *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
{
	mlib_u8 *pbuff, *buff0, *buff1, *buff2, *buff3, *buffT;
	mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *dl;
	__m128 *dp0, *dp1;
	__m128 aa, bb, cc, dd, e0, e1, e2, e3, e4, ee, f0, f1, f2, f3, f4, ff,
	    r0, r1, t0, t1;
	__m128 e_mask;
	mlib_s32 i, j, wid16, tail;

	wid = (wid - KSIZE1) * SSIZE;
	wid16 = (wid + 15) & ~15;
	pbuff = __mlib_malloc(4 * wid16);
	buff0 = pbuff;
	buff1 = buff0 + wid16;
	buff2 = buff1 + wid16;
	buff3 = buff2 + wid16;

	sl = (mlib_u8 *)src;
	dl = (mlib_u8 *)dst + 2 * (dlb + SSIZE);

	tail = wid & 15;

	((mlib_d64 *)&e_mask)[0] =
		((mlib_d64 *)((__m128 *) mlib_mask128i_arr + tail))[0];
	((mlib_d64 *)&e_mask)[1] =
		((mlib_d64 *)((__m128 *) mlib_mask128i_arr + tail))[1];

	for (j = 0; j < 2; j++) {
		sp0 = buff0;
		sp1 = buff1;
		sp4 = sl;
		sp5 = sl + slb;
		sl += 2 * slb;

		for (i = 0; i < wid; i += 16) {
			e0 = _mm_loadu_ps((mlib_f32 *)sp4);
			e1 = _mm_loadu_ps((mlib_f32 *)(sp4 + SSIZE));
			e2 = _mm_loadu_ps((mlib_f32 *)(sp4 + 2 * SSIZE));
			e3 = _mm_loadu_ps((mlib_f32 *)(sp4 + 3 * SSIZE));
			e4 = _mm_loadu_ps((mlib_f32 *)(sp4 + 4 * SSIZE));
			f0 = _mm_loadu_ps((mlib_f32 *)sp5);
			f1 = _mm_loadu_ps((mlib_f32 *)(sp5 + SSIZE));
			f2 = _mm_loadu_ps((mlib_f32 *)(sp5 + 2 * SSIZE));
			f3 = _mm_loadu_ps((mlib_f32 *)(sp5 + 3 * SSIZE));
			f4 = _mm_loadu_ps((mlib_f32 *)(sp5 + 4 * SSIZE));

			ee = C_COMP(e0, e1);
			ff = C_COMP(f0, f1);
			e2 = C_COMP(e2, e3);
			f2 = C_COMP(f2, f3);
			ee = C_COMP(ee, e4);
			ff = C_COMP(ff, f4);
			ee = C_COMP(ee, e2);
			ff = C_COMP(ff, f2);

			_mm_storeu_ps((mlib_f32 *)sp0, ee);
			_mm_storeu_ps((mlib_f32 *)sp1, ff);

			sp0 += 16;
			sp1 += 16;
			sp4 += 16;
			sp5 += 16;
		}

		buffT = buff0;
		buff0 = buff2;
		buff2 = buffT;
		buffT = buff1;
		buff1 = buff3;
		buff3 = buffT;
	}

	for (j = 0; j <= (hgt - KSIZE1 - 2); j += 2) {
		dp0 = (void *)dl;
		dp1 = (void *)(dl + dlb);
		sp0 = buff0;
		sp1 = buff1;
		sp2 = buff2;
		sp3 = buff3;
		sp4 = sl;
		sp5 = sl + slb;

/*
 *    line0:        aa
 *    line1:        bb
 *    line2:        cc
 *    line3:        dd
 *    line4:  e0 e1 e2 e3 e4
 *    line5:  f0 f1 f2 f3 f4
 */

		for (i = 0; i <= wid - 16; i += 16) {
			aa = _mm_loadu_ps((mlib_f32 *)sp0);
			bb = _mm_loadu_ps((mlib_f32 *)sp1);
			cc = _mm_loadu_ps((mlib_f32 *)sp2);
			dd = _mm_loadu_ps((mlib_f32 *)sp3);
			e0 = _mm_loadu_ps((mlib_f32 *)sp4);
			e1 = _mm_loadu_ps((mlib_f32 *)(sp4 + SSIZE));
			e2 = _mm_loadu_ps((mlib_f32 *)(sp4 + 2 * SSIZE));
			e3 = _mm_loadu_ps((mlib_f32 *)(sp4 + 3 * SSIZE));
			e4 = _mm_loadu_ps((mlib_f32 *)(sp4 + 4 * SSIZE));
			f0 = _mm_loadu_ps((mlib_f32 *)sp5);
			f1 = _mm_loadu_ps((mlib_f32 *)(sp5 + SSIZE));
			f2 = _mm_loadu_ps((mlib_f32 *)(sp5 + 2 * SSIZE));
			f3 = _mm_loadu_ps((mlib_f32 *)(sp5 + 3 * SSIZE));
			f4 = _mm_loadu_ps((mlib_f32 *)(sp5 + 4 * SSIZE));

			ee = C_COMP(e0, e1);
			ff = C_COMP(f0, f1);
			e2 = C_COMP(e2, e3);
			f2 = C_COMP(f2, f3);
			ee = C_COMP(ee, e4);
			ff = C_COMP(ff, f4);
			ee = C_COMP(ee, e2);
			ff = C_COMP(ff, f2);

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			bb = C_COMP(bb, dd);

			r0 = C_COMP(aa, bb);
			r1 = C_COMP(bb, ff);

			_mm_storeu_ps((mlib_f32 *)sp0, ee);
			_mm_storeu_ps((mlib_f32 *)sp1, ff);

			_mm_storeu_ps((mlib_f32 *)dp0, r0);
			dp0++;
			_mm_storeu_ps((mlib_f32 *)dp1, r1);
			dp1++;

			sp0 += 16;
			sp1 += 16;
			sp2 += 16;
			sp3 += 16;
			sp4 += 16;
			sp5 += 16;
		}

		if (tail) {
			aa = _mm_loadu_ps((mlib_f32 *)sp0);
			bb = _mm_loadu_ps((mlib_f32 *)sp1);
			cc = _mm_loadu_ps((mlib_f32 *)sp2);
			dd = _mm_loadu_ps((mlib_f32 *)sp3);
			e0 = _mm_loadu_ps((mlib_f32 *)sp4);
			e1 = _mm_loadu_ps((mlib_f32 *)(sp4 + SSIZE));
			e2 = _mm_loadu_ps((mlib_f32 *)(sp4 + 2 * SSIZE));
			e3 = _mm_loadu_ps((mlib_f32 *)(sp4 + 3 * SSIZE));
			e4 = _mm_loadu_ps((mlib_f32 *)(sp4 + 4 * SSIZE));
			f0 = _mm_loadu_ps((mlib_f32 *)sp5);
			f1 = _mm_loadu_ps((mlib_f32 *)(sp5 + SSIZE));
			f2 = _mm_loadu_ps((mlib_f32 *)(sp5 + 2 * SSIZE));
			f3 = _mm_loadu_ps((mlib_f32 *)(sp5 + 3 * SSIZE));
			f4 = _mm_loadu_ps((mlib_f32 *)(sp5 + 4 * SSIZE));

			ee = C_COMP(e0, e1);
			ff = C_COMP(f0, f1);
			e2 = C_COMP(e2, e3);
			f2 = C_COMP(f2, f3);
			ee = C_COMP(ee, e4);
			ff = C_COMP(ff, f4);
			ee = C_COMP(ee, e2);
			ff = C_COMP(ff, f2);

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			bb = C_COMP(bb, dd);

			r0 = C_COMP(aa, bb);
			r1 = C_COMP(bb, ff);

			_mm_storeu_ps((mlib_f32 *)sp0, ee);
			_mm_storeu_ps((mlib_f32 *)sp1, ff);

			t0 = _mm_loadu_ps((mlib_f32 *)dp0);
			t1 = _mm_loadu_ps((mlib_f32 *)dp1);
			t0 =
			    _mm_or_ps(_mm_and_ps(e_mask, r0),
			    _mm_andnot_ps(e_mask, t0));
			t1 =
			    _mm_or_ps(_mm_and_ps(e_mask, r1),
			    _mm_andnot_ps(e_mask, t1));
			_mm_storeu_ps((mlib_f32 *)dp0, t0);
			_mm_storeu_ps((mlib_f32 *)dp1, t1);
		}

		buffT = buff0;
		buff0 = buff2;
		buff2 = buffT;
		buffT = buff1;
		buff1 = buff3;
		buff3 = buffT;

		sl += 2 * slb;
		dl += 2 * dlb;
	}

/* last line */

	if (j == (hgt - KSIZE1 - 1)) {
		dp0 = (void *)dl;
		dp1 = (void *)(dl + dlb);
		sp0 = buff0;
		sp1 = buff1;
		sp2 = buff2;
		sp3 = buff3;
		sp4 = sl;

		for (i = 0; i <= wid - 16; i += 16) {
			aa = _mm_loadu_ps((mlib_f32 *)sp0);
			bb = _mm_loadu_ps((mlib_f32 *)sp1);
			cc = _mm_loadu_ps((mlib_f32 *)sp2);
			dd = _mm_loadu_ps((mlib_f32 *)sp3);
			e0 = _mm_loadu_ps((mlib_f32 *)sp4);
			e1 = _mm_loadu_ps((mlib_f32 *)(sp4 + SSIZE));
			e2 = _mm_loadu_ps((mlib_f32 *)(sp4 + 2 * SSIZE));
			e3 = _mm_loadu_ps((mlib_f32 *)(sp4 + 3 * SSIZE));
			e4 = _mm_loadu_ps((mlib_f32 *)(sp4 + 4 * SSIZE));

			ee = C_COMP(e0, e1);
			e2 = C_COMP(e2, e3);
			ee = C_COMP(ee, e4);
			ee = C_COMP(ee, e2);

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			bb = C_COMP(bb, dd);

			r0 = C_COMP(aa, bb);

			_mm_storeu_ps((mlib_f32 *)dp0, r0);
			dp0++;

			sp0 += 16;
			sp1 += 16;
			sp2 += 16;
			sp3 += 16;
			sp4 += 16;
		}

		if (tail) {
			aa = _mm_loadu_ps((mlib_f32 *)sp0);
			bb = _mm_loadu_ps((mlib_f32 *)sp1);
			cc = _mm_loadu_ps((mlib_f32 *)sp2);
			dd = _mm_loadu_ps((mlib_f32 *)sp3);
			e0 = _mm_loadu_ps((mlib_f32 *)sp4);
			e1 = _mm_loadu_ps((mlib_f32 *)(sp4 + SSIZE));
			e2 = _mm_loadu_ps((mlib_f32 *)(sp4 + 2 * SSIZE));
			e3 = _mm_loadu_ps((mlib_f32 *)(sp4 + 3 * SSIZE));
			e4 = _mm_loadu_ps((mlib_f32 *)(sp4 + 4 * SSIZE));

			ee = C_COMP(e0, e1);
			e2 = C_COMP(e2, e3);
			ee = C_COMP(ee, e4);
			ee = C_COMP(ee, e2);

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			bb = C_COMP(bb, dd);

			r0 = C_COMP(aa, bb);

			t0 = _mm_loadu_ps((mlib_f32 *)dp0);
			t0 =
			    _mm_or_ps(_mm_and_ps(e_mask, r0),
			    _mm_andnot_ps(e_mask, t0));
			_mm_storeu_ps((mlib_f32 *)dp0, t0);
		}
	}

	__mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
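// What the buffered kernel above computes, assuming C_COMP(a, b) expands to an
// elementwise _mm_max_ps (dilation) or _mm_min_ps (erosion) as in mediaLib:
// each output pixel is the extreme of a 5x5 input neighbourhood (KSIZE1 + 1
// taps per axis). buff0..buff3 cache the horizontal 5-tap reductions of the
// previous four rows, so every input row is reduced only once. Scalar sketch
// of the dilation case, with the stride given in floats rather than bytes;
// (x, y) addresses the top-left corner of the window, while the SSE code
// writes the result two rows and two columns further in (the dl offset).
static mlib_f32 dilate5x5_ref(const mlib_f32 *src, mlib_s32 stride,
                              mlib_s32 x, mlib_s32 y)
{
	mlib_f32 r = src[y * stride + x];
	for (mlib_s32 dy = 0; dy < 5; dy++)
		for (mlib_s32 dx = 0; dx < 5; dx++) {
			mlib_f32 v = src[(y + dy) * stride + (x + dx)];
			if (v > r)
				r = v;
		}
	return (r);
}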
RETf STRu(float &x, const __m128 y) { _mm_storeu_ps(&x, y); return y; }
int conv2D(float* in, float* out, int data_size_X, int data_size_Y,
                    float* kernel)
{	
	// the x coordinate of the kernel's center
	int kern_cent_X = (KERNX - 1)/2;
	// the y coordinate of the kernel's center
	int kern_cent_Y = (KERNY - 1)/2;
	// the x/y size of the padded matrix
	
	int padded_X = calcExtra(data_size_X+(kern_cent_X*2));
	int padded_Y = data_size_Y+(kern_cent_X*2);
	float padded_matrix[padded_X*padded_Y];
	//memset(padded_matrix, 0, sizeof(float) * padded_X * padded_Y);
	int padded_start = kern_cent_X+kern_cent_X*(padded_X);
	float newKern[KERNX*KERNY];
	
	
	//matrix convolution
	#pragma omp parallel 
	{
		//inverting kernel
		#pragma omp for
		for (int i = 0; i < KERNX*KERNY; i++)
		{
			newKern[i] = kernel[KERNX*KERNY-1-i];
		}

		// zero-initialize the padded matrix
		#pragma omp for 
		for (int a = 0; a < padded_X*padded_Y; a++)
		{
			padded_matrix[a] = 0;
		}
		
		// copy the input rows into the interior of the padded matrix
		#pragma omp for 
		for (int m = 0; m < data_size_Y; m++)
			memcpy(padded_start+padded_matrix+padded_X*m, in+m*data_size_X, sizeof(float)*data_size_X);

		__m128 paddedrow,multipliedvalues, kernelvalue, total;
		float outSum;
		int x;

	#pragma omp for firstprivate(padded_X, data_size_X, data_size_Y, newKern) private(paddedrow, multipliedvalues, kernelvalue, total, outSum, x)
	for (int y = 0; y < data_size_Y; y++)
	{
		for (x = 0; x <= data_size_X-4; x+=4) {
			total = _mm_setzero_ps();
			for (int j = 0; j < KERNY; j++) {
				for(int i = 0; i < KERNX; i++) {
					kernelvalue = _mm_load1_ps(newKern + (i+j*KERNX));
					paddedrow = _mm_loadu_ps(padded_matrix+x+i+(y+j)*padded_X);
					multipliedvalues = _mm_mul_ps(paddedrow, kernelvalue);
					total = _mm_add_ps(total,multipliedvalues);
				}		 
			}
			_mm_storeu_ps(out+(x+y*data_size_X), total);
		}
		for (; x < data_size_X; x++)
		{
			outSum = 0;
			for (int j = 0; j < KERNY; j++) {
				for(int i = 0; i < KERNX; i++) {
					outSum += newKern[i+j*KERNX] * padded_matrix[x+i+(y+j)*padded_X];	
				}		 
			}
			out[x+y*data_size_X] = outSum;

		}
	}
	}
	return 1;
}
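// Design note (illustration only, not part of the assignment code): flipping
// the kernel turns convolution into correlation, which is why the code builds
// newKern[i] = kernel[KERNX*KERNY-1-i] and then walks the padded image and
// newKern forward in the same (i, j) direction. The 1-D identity it relies on,
// checked in scalar form for a 3-tap kernel k[0..2]:
//   sum_{i=-1..1} in[x - i] * k[i + 1]  ==  sum_{i=-1..1} in[x + i] * k[1 - i]
static float conv3_at(const float *in, const float *k, int x)
{
	float acc = 0.0f;
	for (int i = -1; i <= 1; i++)
		acc += in[x + i] * k[1 - i];  /* correlation with the flipped kernel */
	return acc;
}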
//////////////////////////////////////////////////////////////////////////////
//
// LightProjectClipXmm()
//
// works in model space with the old pipe
// assumes SetupLightsModel has been called
// if dstV and dstCount are non NULL bucket mem 
//
void Camera::LightProjectClipXmm( Plane *srcP, Vector *srcVect, Vector *srcNorm, UVPair *srcUV, U32 countV, U16 *srcI, U32 countI, VertexTL **dstV, U32 *dstVCount, U32 *dstICount) // = NULL, = NULL, = NULL)
{
  ASSERT( countI <= MAXINDICES);

  // temp pools : FIXME : eliminate temp copying
  U16              notCulledV[MAXVERTS];
  static VertexTL  litV_unaligned[MAXVERTS+1];
  static VertexTL *litV = (VertexTL*) (((U32)litV_unaligned+0x0000000f) & 0xfffffff0);
  U16              litI[MAXINDICES];

  BackfaceCull(srcP, notCulledV, countV, srcI, litI, countI);

  if (!countV)
  {
    // no forward faces
    if (dstV)
    {
      *dstV = NULL;
      *dstVCount = 0;
      *dstICount = 0;
    }
    return;
  }

  Material &material = *DxLight::Manager::curMaterial;
  Color diffa = (Color) Utils::FtoL( material.desc.diffuse.a * 255.0f) << 24;

  // calculate the parts of the diffuse color that are the same for all output vertexes
  ColorValueXmm diffInit;

  // FIXME: do we really want to ignore material ambient values (yes for DR2)
//  diffInit.r = curMaterial->desc.diffuse.r * Vid::renderState.ambientR + curMaterial->desc.emissive.r;
//  diffInit.g = curMaterial->desc.diffuse.g * Vid::renderState.ambientG + curMaterial->desc.emissive.g;
//  diffInit.b = curMaterial->desc.diffuse.b * Vid::renderState.ambientB + curMaterial->desc.emissive.b;

  diffInit.Set(
    Vid::renderState.ambientR + material.desc.emissive.r,
    Vid::renderState.ambientG + material.desc.emissive.g,
    Vid::renderState.ambientB + material.desc.emissive.b,
    material.desc.diffuse.a);

  const static __m128 zero = _mm_setzero_ps();
  const static ColorValueXmm specInit(zero, zero, zero, _mm_set_ps1(1.0f));

  // make the count a multiple of four by lighting the last vert multiple times
  notCulledV[countV+0] = notCulledV[countV+1] = notCulledV[countV+2] = notCulledV[countV+3] = notCulledV[countV-1];
  U32 countV_rem = ((4 - (countV % SIMD_WIDTH)) << 30) >> 30;
  countV += countV_rem;
  ASSERT( (countV % SIMD_WIDTH) == 0 );

  U8 clip_flags[MAXVERTS];

  for ( U32 vc = 0; vc < countV; vc += SIMD_WIDTH )
  {
    // set-up xmm vertex
    U16 i0 = notCulledV[vc+0], 
        i1 = notCulledV[vc+1], 
        i2 = notCulledV[vc+2], 
        i3 = notCulledV[vc+3];

    VertexXmm vert;
    vert.V0 = _mm_loadu_ps(&srcVect[i0]);
    vert.V1 = _mm_loadu_ps(&srcVect[i1]);
    vert.V2 = _mm_loadu_ps(&srcVect[i2]);
    vert.V3 = _mm_loadu_ps(&srcVect[i3]);

    __m128 lit[4];
    SetHomogeneousFromModelXmm(lit, &vert.V0);

    // generate clip flags
    const static U32 pos_w_mask_U32 = 0x7fffffff, 
                     neg_w_mask_U32 = 0x80000000;
    const static __m128 pos_w_mask = _mm_set_ps1(*((F32*) &pos_w_mask_U32)),
                        neg_w_mask = _mm_set_ps1(*((F32*) &neg_w_mask_U32));

    __m128 neg_w, pos_w;

#define COMPUTE_SUTHERLAND(i)                                                           \
    {                                                                                   \
      pos_w = _mm_and_ps(pos_w_mask, _mm_shuffle_ps(lit[(i)], lit[(i)], 0xff));         \
      neg_w = _mm_or_ps (neg_w_mask, pos_w);                                            \
      clip_flags[vc+(i)] = 0;                                                           \
      clip_flags[vc+(i)] |= (U8) (_mm_movemask_ps(_mm_cmplt_ps(lit[(i)], neg_w)));      \
      clip_flags[vc+(i)] <<= 4;                                                         \
      clip_flags[vc+(i)] |= (U8) (_mm_movemask_ps(_mm_cmplt_ps(pos_w, lit[(i)])));      \
      clip_flags[vc+(i)] &= 0x77;                                                       \
    }

    COMPUTE_SUTHERLAND(0);
    COMPUTE_SUTHERLAND(1);
    COMPUTE_SUTHERLAND(2);
    COMPUTE_SUTHERLAND(3);

    TRANSPOSE_4X4(vert.V0, vert.V1, vert.V2, vert.V3);
    vert.NV.Set(srcNorm[i0], srcNorm[i1], srcNorm[i2], srcNorm[i3]);
    vert.DIFFUSE = diffInit;
    vert.SPECULAR = specInit;

    // light four verts
    DxLight::Manager::LightModel(vert, material);

    VertexTL &out0 = litV[vc+0], 
             &out1 = litV[vc+1], 
             &out2 = litV[vc+2], 
             &out3 = litV[vc+3];

    _mm_storeu_ps(&out0, lit[0]);
    _mm_storeu_ps(&out1, lit[1]);
    _mm_storeu_ps(&out2, lit[2]);
    _mm_storeu_ps(&out3, lit[3]);

    vert.DIFFUSE.GetRGBA (out0.diffuse,  out1.diffuse,  out2.diffuse,  out3.diffuse );
    vert.SPECULAR.GetRGBA(out0.specular, out1.specular, out2.specular, out3.specular);

    out0.uv = srcUV[i0];
    out1.uv = srcUV[i1];
    out2.uv = srcUV[i2];
    out3.uv = srcUV[i3];
  }

  countV -= countV_rem;

  ClipToBucket( litV, countV, litI, countI, clip_flags, dstV, dstVCount, dstICount);
}
static void rftfsub_128_SSE2(float *a) {
  const float *c = rdft_w + 32;
  int j1, j2, k1, k2;
  float wkr, wki, xr, xi, yr, yi;

  static const ALIGN16_BEG float ALIGN16_END k_half[4] =
    {0.5f, 0.5f, 0.5f, 0.5f};
  const __m128 mm_half = _mm_load_ps(k_half);

  // Vectorized code (four at once).
  //    Note: commented number are indexes for the first iteration of the loop.
  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
    // Load 'wk'.
    const __m128 c_j1 = _mm_loadu_ps(&c[     j1]);         //  1,  2,  3,  4,
    const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);         // 28, 29, 30, 31,
    const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);         // 28, 29, 30, 31,
    const __m128 wkr_ =
      _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28,
    const __m128 wki_ = c_j1;                              //  1,  2,  3,  4,
    // Load and shuffle 'a'.
    const __m128 a_j2_0 = _mm_loadu_ps(&a[0   + j2]);  //   2,   3,   4,   5,
    const __m128 a_j2_4 = _mm_loadu_ps(&a[4   + j2]);  //   6,   7,   8,   9,
    const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]);  // 120, 121, 122, 123,
    const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]);  // 124, 125, 126, 127,
    const __m128 a_j2_p0 = _mm_shuffle_ps(a_j2_0, a_j2_4,
                            _MM_SHUFFLE(2, 0, 2 ,0));  //   2,   4,   6,   8,
    const __m128 a_j2_p1 = _mm_shuffle_ps(a_j2_0, a_j2_4,
                            _MM_SHUFFLE(3, 1, 3 ,1));  //   3,   5,   7,   9,
    const __m128 a_k2_p0 = _mm_shuffle_ps(a_k2_4, a_k2_0,
                            _MM_SHUFFLE(0, 2, 0 ,2));  // 126, 124, 122, 120,
    const __m128 a_k2_p1 = _mm_shuffle_ps(a_k2_4, a_k2_0,
                            _MM_SHUFFLE(1, 3, 1 ,3));  // 127, 125, 123, 121,
    // Calculate 'x'.
    const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
                                               // 2-126, 4-124, 6-122, 8-120,
    const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
                                               // 3-127, 5-125, 7-123, 9-121,
    // Calculate product into 'y'.
    //    yr = wkr * xr - wki * xi;
    //    yi = wkr * xi + wki * xr;
    const __m128 a_ = _mm_mul_ps(wkr_, xr_);
    const __m128 b_ = _mm_mul_ps(wki_, xi_);
    const __m128 c_ = _mm_mul_ps(wkr_, xi_);
    const __m128 d_ = _mm_mul_ps(wki_, xr_);
    const __m128 yr_ = _mm_sub_ps(a_, b_);     // 2-126, 4-124, 6-122, 8-120,
    const __m128 yi_ = _mm_add_ps(c_, d_);     // 3-127, 5-125, 7-123, 9-121,
    // Update 'a'.
    //    a[j2 + 0] -= yr;
    //    a[j2 + 1] -= yi;
    //    a[k2 + 0] += yr;
    //    a[k2 + 1] -= yi;
    const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_);  //   2,   4,   6,   8,
    const __m128 a_j2_p1n = _mm_sub_ps(a_j2_p1, yi_);  //   3,   5,   7,   9,
    const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_);  // 126, 124, 122, 120,
    const __m128 a_k2_p1n = _mm_sub_ps(a_k2_p1, yi_);  // 127, 125, 123, 121,
    // Shuffle in right order and store.
    const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
                                                       //   2,   3,   4,   5,
    const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
                                                       //   6,   7,   8,   9,
    const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
                                                       // 122, 123, 120, 121,
    const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
                                                       // 126, 127, 124, 125,
    const __m128 a_k2_0n = _mm_shuffle_ps(a_k2_0nt, a_k2_0nt,
                            _MM_SHUFFLE(1, 0, 3 ,2));  // 120, 121, 122, 123,
    const __m128 a_k2_4n = _mm_shuffle_ps(a_k2_4nt, a_k2_4nt,
                            _MM_SHUFFLE(1, 0, 3 ,2));  // 124, 125, 126, 127,
    _mm_storeu_ps(&a[0   + j2], a_j2_0n);
    _mm_storeu_ps(&a[4   + j2], a_j2_4n);
    _mm_storeu_ps(&a[122 - j2], a_k2_0n);
    _mm_storeu_ps(&a[126 - j2], a_k2_4n);
  }
  // Scalar code for the remaining items.
  for (; j2 < 64; j1 += 1, j2 += 2) {
    k2 = 128 - j2;
    k1 =  32 - j1;
    wkr = 0.5f - c[k1];
    wki = c[j1];
    xr = a[j2 + 0] - a[k2 + 0];
    xi = a[j2 + 1] + a[k2 + 1];
    yr = wkr * xr - wki * xi;
    yi = wkr * xi + wki * xr;
    a[j2 + 0] -= yr;
    a[j2 + 1] -= yi;
    a[k2 + 0] += yr;
    a[k2 + 1] -= yi;
  }
}
void sgemm( int m, int n, int d, float *A, float *C )
{
	#pragma omp parallel
	{
		__m128 vect, vect2, vect3, vect4, Cmatrix, ATmatrix, ATjmatrix, ATvect1, ATvect2, ATvect3, ATvect1j, ATvect2j, ATvect3j;
		#pragma omp for
		for( int j = 0; j < n/2 * 2; j+=2 ) {
			for( int k = 0; k < m/4 * 4; k+=4 ) {
				ATmatrix =  _mm_load1_ps(A + (j * (n + 1) + (k) * (n)));
				float AT = A[j*(n+1)+k*(n)];
				ATjmatrix =  _mm_load1_ps(A + ((j+1) * (n + 1) + (k) * (n)));
				float ATj = A[(j+1)*(n+1)+k*(n)];

				ATvect1 =  _mm_load1_ps(A + (j * (n + 1) + (k+1) * (n)));
				float AT1 = A[j*(n+1)+(k+1)*(n)];
				ATvect1j =  _mm_load1_ps(A + ((j + 1) * (n + 1) + (k + 1) * (n)));
				float AT1j = A[(j + 1)*(n+1)+(k + 1)*(n)];

				ATvect2 =  _mm_load1_ps(A + (j * (n + 1) + (k+2) * (n)));
				float AT2 = A[j*(n+1)+(k+2)*(n)];
				ATvect2j =  _mm_load1_ps(A + ((j + 1) * (n + 1) + (k + 2) * (n)));
				float AT2j = A[(j + 1)*(n+1)+(k + 2)*(n)];

				ATvect3 =  _mm_load1_ps(A + (j * (n + 1) + (k+3) * (n)));
				float AT3 = A[j*(n+1)+(k+3)*(n)];
				ATvect3j =  _mm_load1_ps(A + ((j + 1) * (n + 1) + (k + 3) * (n)));
				float AT3j = A[(j + 1)*(n+1)+(k + 3)*(n)];

				for( int i = 0; i < n/8 * 8; i+= 8 ) {
					float *temp = C + i + j * n;
					float *tempj = C + i + (j + 1) * n;
					float *tmp = A + i + (k)*(n);
					float *tmp1 = A + i + (k + 1)*(n);
					float *tmp2 = A + i + (k + 2)*(n);
					float *tmp3 = A + i + (k + 3)*(n);

					//i = 0
					Cmatrix = _mm_loadu_ps(temp);
				    vect = _mm_mul_ps(_mm_loadu_ps(tmp), ATmatrix);
				    vect2 = _mm_mul_ps(_mm_loadu_ps(tmp1), ATvect1);
				    vect3 = _mm_mul_ps(_mm_loadu_ps(tmp2), ATvect2);
				    vect4 = _mm_mul_ps(_mm_loadu_ps(tmp3), ATvect3);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    Cmatrix = _mm_add_ps(Cmatrix, vect2);
				    Cmatrix = _mm_add_ps(Cmatrix, vect3);
				    Cmatrix = _mm_add_ps(Cmatrix, vect4);
				    _mm_storeu_ps(temp, Cmatrix);
				    //j + 1
				    Cmatrix = _mm_loadu_ps(tempj);
				    vect = _mm_mul_ps(_mm_loadu_ps(tmp), ATjmatrix);
				    vect2 = _mm_mul_ps(_mm_loadu_ps(tmp1), ATvect1j);
				    vect3 = _mm_mul_ps(_mm_loadu_ps(tmp2), ATvect2j);
				    vect4 = _mm_mul_ps(_mm_loadu_ps(tmp3), ATvect3j);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    Cmatrix = _mm_add_ps(Cmatrix, vect2);
				    Cmatrix = _mm_add_ps(Cmatrix, vect3);
				    Cmatrix = _mm_add_ps(Cmatrix, vect4);
				    _mm_storeu_ps(tempj, Cmatrix);

				    // i = 1

				    Cmatrix = _mm_loadu_ps((temp) + 4);
				    vect = _mm_mul_ps(_mm_loadu_ps((tmp) + 4), ATmatrix);
				    vect2 = _mm_mul_ps(_mm_loadu_ps((tmp1) + 4), ATvect1);
				    vect3 = _mm_mul_ps(_mm_loadu_ps((tmp2) + 4), ATvect2);
				    vect4 = _mm_mul_ps(_mm_loadu_ps((tmp3) + 4), ATvect3);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    Cmatrix = _mm_add_ps(Cmatrix, vect2);
				    Cmatrix = _mm_add_ps(Cmatrix, vect3);
				    Cmatrix = _mm_add_ps(Cmatrix, vect4);
				    _mm_storeu_ps((temp) + 4, Cmatrix);
				    // j + 1
				    Cmatrix = _mm_loadu_ps(tempj + 4);
				    vect = _mm_mul_ps(_mm_loadu_ps(tmp + 4), ATjmatrix);
				    vect2 = _mm_mul_ps(_mm_loadu_ps(tmp1 + 4), ATvect1j);
				    vect3 = _mm_mul_ps(_mm_loadu_ps(tmp2 + 4), ATvect2j);
				    vect4 = _mm_mul_ps(_mm_loadu_ps(tmp3 + 4), ATvect3j);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    Cmatrix = _mm_add_ps(Cmatrix, vect2);
				    Cmatrix = _mm_add_ps(Cmatrix, vect3);
				    Cmatrix = _mm_add_ps(Cmatrix, vect4);
				    _mm_storeu_ps(tempj + 4, Cmatrix);

				}
				for (int i = n/8 * 8; i < n; i += 1) {
					C[i+j*n] += A[i+k*(n)] * AT + A[i+(k+1)*(n)] * AT1 + A[i+(k+2)*(n)] * AT2 + A[i+(k+3)*(n)] * AT3;
					C[i+(j + 1)*n] += A[i+k*(n)] * ATj + A[i+(k+1)*(n)] * AT1j + A[i+(k+2)*(n)] * AT2j + A[i+(k+3)*(n)] * AT3j;
				}
			}
			for (int k = m/4 * 4; k < m; k += 1) {
				ATmatrix =  _mm_load1_ps(A + (j * (n + 1) + (k) * (n)));
				float AT = A[j*(n+1)+k*(n)];

				ATjmatrix =  _mm_load1_ps(A + ((j + 1) * (n + 1) + (k) * (n)));
				float ATj = A[(j + 1)*(n+1)+k*(n)];
				for( int i = 0; i < n/12 * 12; i+= 12 ) {
					float *temp = C + i + j * n;
					float *t2 = A + i + (k)*(n);
					float *tempj = C + i + (j+1) * n;

					//i = 0
					Cmatrix = _mm_loadu_ps(temp);
				    vect = _mm_mul_ps(_mm_loadu_ps(t2), ATmatrix);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    _mm_storeu_ps(temp, Cmatrix);
				    //j + 1
				    Cmatrix = _mm_loadu_ps(tempj);
				    vect = _mm_mul_ps(_mm_loadu_ps(t2), ATjmatrix);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    _mm_storeu_ps(tempj, Cmatrix);

				    //i = 1
				    Cmatrix = _mm_loadu_ps((temp) + 4);
				    vect = _mm_mul_ps(_mm_loadu_ps((t2) + 4), ATmatrix);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    _mm_storeu_ps((temp) + 4, Cmatrix);
				    //j + 1
				    Cmatrix = _mm_loadu_ps((tempj) + 4);
				    vect = _mm_mul_ps(_mm_loadu_ps((t2) + 4), ATjmatrix);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    _mm_storeu_ps((tempj) + 4, Cmatrix);

				    //i = 2
				    Cmatrix = _mm_loadu_ps((temp) + 8);
				    vect = _mm_mul_ps(_mm_loadu_ps((t2) + 8), ATmatrix);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    _mm_storeu_ps((temp) + 8, Cmatrix);
				    //j + 1
				    Cmatrix = _mm_loadu_ps((tempj) + 8);
				    vect = _mm_mul_ps(_mm_loadu_ps((t2) + 8), ATjmatrix);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    _mm_storeu_ps((tempj) + 8, Cmatrix);
				}
				for (int i = n/12 * 12; i < n; i += 1) {
					C[i+j*n] += A[i+k*(n)] * AT;
					C[i+(j+1)*n] += A[i+k*(n)] * ATj;
				}
			}
		}
	}
	#pragma omp parallel
	{
		__m128 vect, vect2, vect3, vect4, Cmatrix, ATmatrix, ATvect1, ATvect2, ATvect3;
		#pragma omp for
		for (int j = n/2 * 2; j < n; j ++) {
			for( int k = 0; k < m/4 * 4; k+=4 ) {
				ATmatrix =  _mm_load1_ps(A + (j * (n + 1) + (k) * (n)));
				float AT = A[j*(n+1)+k*(n)];

				ATvect1 =  _mm_load1_ps(A + (j * (n + 1) + (k+1) * (n)));
				float AT1 = A[j*(n+1)+(k+1)*(n)];

				ATvect2 =  _mm_load1_ps(A + (j * (n + 1) + (k+2) * (n)));
				float AT2 = A[j*(n+1)+(k+2)*(n)];

				ATvect3 =  _mm_load1_ps(A + (j * (n + 1) + (k+3) * (n)));
				float AT3 = A[j*(n+1)+(k+3)*(n)];
				for( int i = 0; i < n/8 * 8; i+= 8 ) {
					float *temp = C + i + j * n;
					float *tmp = A + i + (k)*(n);
					float *tmp1 = A + i + (k + 1)*(n);
					float *tmp2 = A + i + (k + 2)*(n);
					float *tmp3 = A + i + (k + 3)*(n);
					//i = 0
					Cmatrix = _mm_loadu_ps(temp);
				    vect = _mm_mul_ps(_mm_loadu_ps(tmp), ATmatrix);
				    vect2 = _mm_mul_ps(_mm_loadu_ps(tmp1), ATvect1);
				    vect3 = _mm_mul_ps(_mm_loadu_ps(tmp2), ATvect2);
				    vect4 = _mm_mul_ps(_mm_loadu_ps(tmp3), ATvect3);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    Cmatrix = _mm_add_ps(Cmatrix, vect2);
				    Cmatrix = _mm_add_ps(Cmatrix, vect3);
				    Cmatrix = _mm_add_ps(Cmatrix, vect4);
				    _mm_storeu_ps(temp, Cmatrix);

				    //i = 1
				    Cmatrix = _mm_loadu_ps((temp) + 4);
				    vect = _mm_mul_ps(_mm_loadu_ps((tmp) + 4), ATmatrix);
				    vect2 = _mm_mul_ps(_mm_loadu_ps((tmp1) + 4), ATvect1);
				    vect3 = _mm_mul_ps(_mm_loadu_ps((tmp2) + 4), ATvect2);
				    vect4 = _mm_mul_ps(_mm_loadu_ps((tmp3) + 4), ATvect3);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    Cmatrix = _mm_add_ps(Cmatrix, vect2);
				    Cmatrix = _mm_add_ps(Cmatrix, vect3);
				    Cmatrix = _mm_add_ps(Cmatrix, vect4);
				    _mm_storeu_ps((temp) + 4, Cmatrix);
				    
				}
				for (int i = n/8 * 8; i < n; i += 1) {
					C[i+j*n] += A[i+k*(n)] * AT + A[i+(k+1)*(n)] * AT1 + A[i+(k+2)*(n)] * AT2 + A[i+(k+3)*(n)] * AT3;
				}
			}
			for (int k = m/4 * 4; k < m; k += 1) {
				ATmatrix =  _mm_load1_ps(A + (j * (n + 1) + (k) * (n)));
				float AT = A[j*(n+1)+k*(n)];
				for( int i = 0; i < n/8 * 8; i+= 8 ) {

					float *temp = C + i + j * n;
					//i = 0
					Cmatrix = _mm_loadu_ps(temp);
				    vect = _mm_mul_ps(_mm_loadu_ps(A + i + (k)*(n)), ATmatrix);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    _mm_storeu_ps(temp, Cmatrix);

				    //i = 1
				    Cmatrix = _mm_loadu_ps((temp) + 4);
				    vect = _mm_mul_ps(_mm_loadu_ps((A + i + (k)*(n)) + 4), ATmatrix);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    _mm_storeu_ps((temp) + 4, Cmatrix);
				}
				for (int i = n/8 * 8; i < n; i += 1) {
					C[i+j*n] += A[i+k*(n)] * AT;
				}
			}
		}
	}
}
unsigned long long ComputeMandelbrot_SSE(float *srcReal, float *srcImag, uint32_t *dst, int strideSrc, int strideDst, int x, int y, int yIncr, int w, int h, float left, float top, float incrX, float incrY, unsigned int numItersBefore, unsigned int numIters)
{
  for(int Y = y; Y < h; Y += yIncr)
  {
    float *sr = (float*)((uintptr_t)srcReal + strideSrc * Y) + x;
    float *si = (float*)((uintptr_t)srcImag + strideSrc * Y) + x;
    uint32_t *d = (uint32_t*)((uintptr_t)dst + strideDst * Y) + x;
    float imag = top + Y * incrY;
    __m128 Imag = _mm_set1_ps(imag);
    __m128 four = _mm_set1_ps(4.f);
    for(int X = 0; X < w; X += 4)
    {
      float real = left + (x + X) * incrX;
      __m128 Real = _mm_set_ps(real + 3*incrX, real + 2*incrX, real + incrX, real);
      __m128 v_real = _mm_loadu_ps(sr+X);
//      float v_real = sr[X];
//      if (v_real != INFINITY)
      {
        __m128 v_imag = _mm_loadu_ps(si+X);
//        float v_imag = si[X];

        __m128 oldColor = _mm_loadu_ps((float*)d+X);
        if (anyzero_ps(oldColor))
        //if (d[X] == 0 || d[X+1] == 0 || d[X+2] == 0 || d[X+3] == 0)
        {
          __m128 oldIterating = _mm_cmpeq_ps(oldColor, _mm_setzero_ps());
          for(unsigned int i = 0; i < numIters; ++i)
          {
            // (x+yi)^2 = x^2 - y^2 + 2xyi
            // ||x_+yi||^2 = x^2+y^2
            //float new_real = v_real*v_real - v_imag*v_imag + real;
            __m128 new_real = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(v_real, v_real), _mm_mul_ps(v_imag, v_imag)), Real);
            //v_imag = 2.f * v_real * v_imag + imag;
            __m128 v_ri = _mm_mul_ps(v_real, v_imag);
            v_imag = _mm_add_ps(_mm_add_ps(v_ri, v_ri), Imag);
            v_real = new_real;

  /*
            new_real = v_real*v_real - v_imag*v_imag + real;
            v_imag = 2.f * v_real * v_imag + imag;
            v_real = new_real;
  */
            __m128 len = _mm_add_ps(_mm_mul_ps(v_real, v_real), _mm_mul_ps(v_imag, v_imag));
            __m128 diverged = _mm_cmpgt_ps(len, four);
            __m128 divergedNow = _mm_and_ps(diverged, oldIterating);
            oldIterating = _mm_andnot_ps(divergedNow, oldIterating);
            //__m128 diverged = _mm_cmpge_ps(len, _mm_set1_ps(0)); 
            //__m128 old = _mm_loadu_ps((float*)d+X);

            if (any_ps(divergedNow))
            {
              uint32_t color = ColorMap(numItersBefore + i);
              if (xnotzero_ss(divergedNow)) d[X] = color;
              if (ynotzero_ss(divergedNow)) d[X+1] = color;
              if (znotzero_ss(divergedNow)) d[X+2] = color;
              if (wnotzero_ss(divergedNow)) d[X+3] = color;
//              _mm_storeu_ps((float*)d+X, _mm_or_ps(old, diverged));
            }
            /*
            if (v_real*v_real + v_imag*v_imag > 4.f)
            {
              d[X] = ColorMap(numItersBefore + i);
              v_real = INFINITY;
              break;
            }
            */
          }
          //sr[X] = v_real;
          //si[X] = v_imag;
          _mm_storeu_ps(sr+X, v_real);
          _mm_storeu_ps(si+X, v_imag);
        }
      }
//      real += incrX*4;
//      Real = _mm_set_ps(real + 3*incrX, real + 2*incrX, real + incrX, real);
    }
  }
  return (unsigned long long)((h-y)/yIncr)*w*numIters;
}
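// Scalar form of the per-pixel iteration vectorized above (reconstructed from
// the commented-out lines in the function; ColorMap is the same mapping the
// SSE path calls, and a pixel stops iterating once its color becomes nonzero):
static void MandelbrotPixel_ref(float *v_real, float *v_imag, uint32_t *color,
                                float real, float imag,
                                unsigned int numItersBefore, unsigned int numIters)
{
  for (unsigned int i = 0; i < numIters && *color == 0; ++i)
  {
    // (x + yi)^2 = x^2 - y^2 + 2xyi, then add the pixel's constant c = real + imag*i
    float new_real = *v_real * *v_real - *v_imag * *v_imag + real;
    *v_imag = 2.f * *v_real * *v_imag + imag;
    *v_real = new_real;
    if (*v_real * *v_real + *v_imag * *v_imag > 4.f)  // |z|^2 > 4 => diverged
      *color = ColorMap(numItersBefore + i);
  }
}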