Example #1
1
static void
thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
{
    int i, j;
    Size roi = _src.size();
    roi.width *= _src.channels();
    const float* src = (const float*)_src.data;
    float* dst = (float*)_dst.data;
    size_t src_step = _src.step/sizeof(src[0]);
    size_t dst_step = _dst.step/sizeof(dst[0]);

#if CV_SSE2
    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE);
#endif

    if( _src.isContinuous() && _dst.isContinuous() )
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

#ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::thresh_32f(_src, _dst, roi.width, roi.height, thresh, maxval, type))
        return;
#endif

#if defined(HAVE_IPP)
    IppiSize sz = { roi.width, roi.height };
    switch( type )
    {
    case THRESH_TRUNC:
        if (0 <= ippiThreshold_GT_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh))
            return;
        setIppErrorStatus();
        break;
    case THRESH_TOZERO:
        if (0 <= ippiThreshold_LTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh+FLT_EPSILON, 0))
            return;
        setIppErrorStatus();
        break;
    case THRESH_TOZERO_INV:
        if (0 <= ippiThreshold_GTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0))
            return;
        setIppErrorStatus();
        break;
    }
#endif

    switch( type )
    {
        case THRESH_BINARY:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_cmpgt_ps( v0, thresh4 );
                        v1 = _mm_cmpgt_ps( v1, thresh4 );
                        v0 = _mm_and_ps( v0, maxval4 );
                        v1 = _mm_and_ps( v1, maxval4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#endif

                for( ; j < roi.width; j++ )
                    dst[j] = src[j] > thresh ? maxval : 0;
            }
            break;

        case THRESH_BINARY_INV:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_cmple_ps( v0, thresh4 );
                        v1 = _mm_cmple_ps( v1, thresh4 );
                        v0 = _mm_and_ps( v0, maxval4 );
                        v1 = _mm_and_ps( v1, maxval4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#endif

                for( ; j < roi.width; j++ )
                    dst[j] = src[j] <= thresh ? maxval : 0;
            }
            break;

        case THRESH_TRUNC:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_min_ps( v0, thresh4 );
                        v1 = _mm_min_ps( v1, thresh4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#endif

                for( ; j < roi.width; j++ )
                    dst[j] = std::min(src[j], thresh);
            }
            break;

        case THRESH_TOZERO:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_and_ps(v0, _mm_cmpgt_ps(v0, thresh4));
                        v1 = _mm_and_ps(v1, _mm_cmpgt_ps(v1, thresh4));
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#endif

                for( ; j < roi.width; j++ )
                {
                    float v = src[j];
                    dst[j] = v > thresh ? v : 0;
                }
            }
            break;

        case THRESH_TOZERO_INV:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_and_ps(v0, _mm_cmple_ps(v0, thresh4));
                        v1 = _mm_and_ps(v1, _mm_cmple_ps(v1, thresh4));
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#endif
                for( ; j < roi.width; j++ )
                {
                    float v = src[j];
                    dst[j] = v <= thresh ? v : 0;
                }
            }
            break;
        default:
            return CV_Error( CV_StsBadArg, "" );
    }
}
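
Every SIMD branch in thresh_32f leans on the same idiom: _mm_cmpgt_ps / _mm_cmple_ps produce an all-ones bit pattern in each lane where the comparison holds, and ANDing that mask with a value selects it lane-wise. A minimal standalone sketch of the THRESH_BINARY case (a hypothetical demo, not OpenCV code):

// Sketch of the vector trick used above: the compare yields 0xFFFFFFFF per
// passing lane, so mask & maxval gives maxval where src > thresh, else 0.
#include <xmmintrin.h>
#include <stdio.h>

int main(void)
{
    float src[4] = { 1.f, 5.f, 3.f, 7.f }, dst[4];
    __m128 thresh4 = _mm_set1_ps(4.f), maxval4 = _mm_set1_ps(255.f);
    __m128 v = _mm_loadu_ps(src);
    v = _mm_and_ps(_mm_cmpgt_ps(v, thresh4), maxval4);
    _mm_storeu_ps(dst, v);
    for (int i = 0; i < 4; i++)
        printf("%g\n", dst[i]);   // prints 0 255 0 255
    return 0;
}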
Example #5
0
	static inline Simd spread(float value) {
		Simd res;
		res.reg = _mm_set1_ps(value);
		return res;
	}
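
A possible use of spread, assuming Simd is a thin wrapper holding the __m128 in a reg member as the snippet suggests (the rest of the class is not shown, so this is a sketch):

#include <xmmintrin.h>

struct Simd { __m128 reg; };        // assumed shape of the wrapper

static inline Simd spread(float value) {
	Simd res;
	res.reg = _mm_set1_ps(value);   // broadcast one float to all four lanes
	return res;
}

// e.g. scale a vector once per loop without re-broadcasting the scalar:
// __m128 scaled = _mm_mul_ps(v, spread(0.5f).reg);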
Example #6
0
#include "AL/alc.h"
#include "alMain.h"
#include "alu.h"

#include "alSource.h"
#include "alAuxEffectSlot.h"
#include "defs.h"
#include "hrtfbase.h"


template<>
const ALfloat *Resample_<BSincTag,SSETag>(const InterpState *state, const ALfloat *RESTRICT src,
    ALsizei frac, ALint increment, ALfloat *RESTRICT dst, ALsizei dstlen)
{
    const ALfloat *const filter{state->bsinc.filter};
    const __m128 sf4{_mm_set1_ps(state->bsinc.sf)};
    const ALsizei m{state->bsinc.m};

    ASSUME(m > 0);
    ASSUME(dstlen > 0);
    ASSUME(increment > 0);
    ASSUME(frac >= 0);

    src -= state->bsinc.l;
    for(ALsizei i{0};i < dstlen;i++)
    {
        // Calculate the phase index and factor.
#define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
        const ALsizei pi{frac >> FRAC_PHASE_BITDIFF};
        const ALfloat pf{(frac & ((1<<FRAC_PHASE_BITDIFF)-1)) * (1.0f/(1<<FRAC_PHASE_BITDIFF))};
#undef FRAC_PHASE_BITDIFF
Example #8
0
void process(
    struct dt_iop_module_t *self,
    dt_dev_pixelpipe_iop_t *piece,
    void *ivoid,
    void *ovoid,
    const dt_iop_roi_t *roi_in,
    const dt_iop_roi_t *roi_out)
{
  const int filters = dt_image_flipped_filter(&piece->pipe->image);
  dt_iop_highlights_data_t *data = (dt_iop_highlights_data_t *)piece->data;

  const float clip = data->clip * fminf(piece->pipe->processed_maximum[0], fminf(piece->pipe->processed_maximum[1], piece->pipe->processed_maximum[2]));
  // const int ch = piece->colors;
  if(dt_dev_pixelpipe_uses_downsampled_input(piece->pipe) || !filters)
  {
    const __m128 clipm = _mm_set1_ps(clip);
#ifdef _OPENMP
    #pragma omp parallel for schedule(dynamic) default(none) shared(ovoid, ivoid, roi_in, roi_out, data, piece)
#endif
    for(int j=0; j<roi_out->height; j++)
    {
      float *out = (float *)ovoid + (size_t)4*roi_out->width*j;
      float *in  = (float *)ivoid + (size_t)4*roi_in->width*j;
      for(int i=0; i<roi_out->width; i++)
      {
        _mm_stream_ps(out, _mm_min_ps(clipm, _mm_set_ps(in[3],in[2],in[1],in[0])));
        in += 4;
        out += 4;
      }
    }
    _mm_sfence();
    return;
  }

  switch(data->mode)
  {
    case DT_IOP_HIGHLIGHTS_INPAINT: // a1ex's (magiclantern) idea of color inpainting:
    {
      const float clips[4] = {
        0.987*data->clip * piece->pipe->processed_maximum[0],
        0.987*data->clip * piece->pipe->processed_maximum[1],
        0.987*data->clip * piece->pipe->processed_maximum[2],
        clip};
#ifdef _OPENMP
      #pragma omp parallel for schedule(dynamic) default(none) shared(ovoid, ivoid, roi_in, roi_out, data, piece)
#endif
      for(int j=0; j<roi_out->height; j++)
      {
        _interpolate_color(ivoid, ovoid, roi_out, 0, 1, j, clips, filters, 0);
        _interpolate_color(ivoid, ovoid, roi_out, 0, -1, j, clips, filters, 1);
      }

      // up/down directions
#ifdef _OPENMP
      #pragma omp parallel for schedule(dynamic) default(none) shared(ovoid, ivoid, roi_in, roi_out, data, piece)
#endif
      for(int i=0; i<roi_out->width; i++)
      {
        _interpolate_color(ivoid, ovoid, roi_out, 1, 1, i, clips, filters, 2);
        _interpolate_color(ivoid, ovoid, roi_out, 1, -1, i, clips, filters, 3);
      }
      break;
    }
    case DT_IOP_HIGHLIGHTS_LCH:
#ifdef _OPENMP
      #pragma omp parallel for schedule(dynamic) default(none) shared(ovoid, ivoid, roi_in, roi_out, data, piece)
#endif
      for(int j=0; j<roi_out->height; j++)
      {
        float *out = (float *)ovoid + (size_t)roi_out->width*j;
        float *in  = (float *)ivoid + (size_t)roi_out->width*j;
        for(int i=0; i<roi_out->width; i++)
        {
          if(i==0 || i==roi_out->width-1 || j==0 || j==roi_out->height-1)
          {
            // fast path for border
            out[0] = in[0];
          }
          else
          {
            // analyse one Bayer block to get the same number of RGGB pixels each time
            const float near_clip = 0.96f*clip;
            const float post_clip = 1.10f*clip;
            float blend = 0.0f;
            float mean = 0.0f;
            for(int jj=0; jj<=1; jj++)
            {
              for(int ii=0; ii<=1; ii++)
              {
                const float val = in[(size_t)jj*roi_out->width + ii];
                mean += val*0.25f;
                blend += (fminf(post_clip, val) - near_clip)/(post_clip-near_clip);
              }
            }
            blend = CLAMP(blend, 0.0f, 1.0f);
            if(blend > 0)
            {
              // recover:
              out[0] = blend*mean + (1.f-blend)*in[0];
            }
            else out[0] = in[0];
          }
          out ++;
          in ++;
        }
      }
      break;
    default:
    case DT_IOP_HIGHLIGHTS_CLIP:
    {
      const __m128 clipm = _mm_set1_ps(clip);
      const size_t n = (size_t)roi_out->height*roi_out->width;
      float *const out = (float *)ovoid;
      float *const in  = (float *)ivoid;
#ifdef _OPENMP
      #pragma omp parallel for schedule(static) default(none)
#endif
      for(size_t j=0; j<(n & ~(size_t)3); j+=4)
        _mm_stream_ps(out+j, _mm_min_ps(clipm, _mm_load_ps(in+j)));
      _mm_sfence();
      // let's see if there's a non-multiple-of-four remainder to process:
      if(n & 3) for(size_t j=n&~(size_t)3; j<n; j++) out[j] = MIN(clip, in[j]);
      break;
    }
  }

  if(piece->pipe->mask_display)
    dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}
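
The DT_IOP_HIGHLIGHTS_CLIP path is the standard non-temporal store pattern: _mm_stream_ps bypasses the cache (the output is written once and not re-read), _mm_sfence makes the streaming stores globally visible, and a scalar tail handles the last n % 4 floats. A minimal self-contained sketch of that pattern (hypothetical helper, not darktable API; assumes out is 16-byte aligned, which _mm_stream_ps requires):

#include <xmmintrin.h>
#include <stddef.h>

static void clip_buffer(float *out, const float *in, size_t n, float clip)
{
    const __m128 clipm = _mm_set1_ps(clip);
    size_t j = 0;
    for (; j + 4 <= n; j += 4)          // whole 4-float groups
        _mm_stream_ps(out + j, _mm_min_ps(clipm, _mm_loadu_ps(in + j)));
    _mm_sfence();                       // order the non-temporal stores
    for (; j < n; j++)                  // scalar tail
        out[j] = in[j] < clip ? in[j] : clip;
}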
Example #10
0
int main()
{
	float *arr = get_arr(); // [4, 3, 2, 1]
	float *uarr = get_uarr(); // [5, 4, 3, 2]
	float *arr2 = get_arr2(); // [4, 3, 2, 1]
	float *uarr2 = get_uarr2(); // [5, 4, 3, 2]
	__m128 a = get_a(); // [8, 6, 4, 2]
	__m128 b = get_b(); // [1, 2, 3, 4]

	// Check that test data is like expected.
	Assert(((uintptr_t)arr & 0xF) == 0); // arr must be aligned by 16.
	Assert(((uintptr_t)uarr & 0xF) != 0); // uarr must be unaligned.
	Assert(((uintptr_t)arr2 & 0xF) == 0); // arr must be aligned by 16.
	Assert(((uintptr_t)uarr2 & 0xF) != 0); // uarr must be unaligned.

	// Test that aeq itself works and does not trivially return true on everything.
	Assert(aeq_("",_mm_load_ps(arr), 4.f, 3.f, 2.f, 0.f, false) == false);
#ifdef TEST_M64
	Assert(aeq64(u64castm64(0x22446688AACCEEFFULL), 0xABABABABABABABABULL, false) == false);
#endif
	// SSE1 Load instructions:	
	aeq(_mm_load_ps(arr), 4.f, 3.f, 2.f, 1.f); // 4-wide load from aligned address.
	aeq(_mm_load_ps1(uarr), 2.f, 2.f, 2.f, 2.f); // Load scalar from unaligned address and populate 4-wide.
	aeq(_mm_load_ss(uarr), 0.f, 0.f, 0.f, 2.f); // Load scalar from unaligned address to lowest, and zero all highest.
	aeq(_mm_load1_ps(uarr), 2.f, 2.f, 2.f, 2.f); // _mm_load1_ps == _mm_load_ps1
	aeq(_mm_loadh_pi(a, (__m64*)uarr), 3.f, 2.f, 4.f, 2.f); // Load two highest addresses, preserve two lowest.
	aeq(_mm_loadl_pi(a, (__m64*)uarr), 8.f, 6.f, 3.f, 2.f); // Load two lowest addresses, preserve two highest.
	aeq(_mm_loadr_ps(arr), 1.f, 2.f, 3.f, 4.f); // 4-wide load from an aligned address, but reverse order.
	aeq(_mm_loadu_ps(uarr), 5.f, 4.f, 3.f, 2.f); // 4-wide load from an unaligned address.

	// SSE1 Set instructions:
	aeq(_mm_set_ps(uarr[3], 2.f, 3.f, 4.f), 5.f, 2.f, 3.f, 4.f); // 4-wide set by specifying four immediate or memory operands.
	aeq(_mm_set_ps1(uarr[3]), 5.f, 5.f, 5.f, 5.f); // 4-wide set by specifying one scalar that is expanded.
	aeq(_mm_set_ss(uarr[3]), 0.f, 0.f, 0.f, 5.f); // Set scalar at lowest index, zero all higher.
	aeq(_mm_set1_ps(uarr[3]), 5.f, 5.f, 5.f, 5.f); // _mm_set1_ps == _mm_set_ps1
	aeq(_mm_setr_ps(uarr[3], 2.f, 3.f, 4.f), 4.f, 3.f, 2.f, 5.f); // 4-wide set by specifying four immediate or memory operands, but reverse order.
	aeq(_mm_setzero_ps(), 0.f, 0.f, 0.f, 0.f); // Returns a new zero register.

	// SSE1 Move instructions:
	aeq(_mm_move_ss(a, b), 8.f, 6.f, 4.f, 4.f); // Copy three highest elements from a, and lowest from b.
	aeq(_mm_movehl_ps(a, b), 8.f, 6.f, 1.f, 2.f); // Copy two highest elements from a, and take two highest from b and place them to the two lowest in output.
	aeq(_mm_movelh_ps(a, b), 3.f, 4.f, 4.f, 2.f); // Copy two lowest elements from a, and take two lowest from b and place them to the two highest in output.

	// SSE1 Store instructions:
#ifdef TEST_M64
	/*M64*/*(uint64_t*)uarr = 0xCDCDCDCDCDCDCDCDULL; _mm_maskmove_si64(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xCDEEDDCDCDAA99CDULL); // _mm_maskmove_si64: Conditionally store bytes of a 64-bit value.
	/*M64*/*(uint64_t*)uarr = 0xABABABABABABABABULL;       _m_maskmovq(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xABEEDDABABAA99ABULL); // _m_maskmovq is an alias to _mm_maskmove_si64.
#endif
	_mm_store_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_store_ps: 4-wide store to aligned memory address.
	_mm_store_ps1(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store_ps1: Store lowest scalar to aligned address, duplicating the element 4 times. 
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_store_ss(uarr2, b); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 100.f, 4.f); // _mm_store_ss: Store lowest scalar to unaligned address. Don't adjust higher addresses in memory.
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_store1_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store1_ps == _mm_store_ps1
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storeh_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 8.f, 6.f); // _mm_storeh_pi: Store two highest elements to memory.
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storel_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 4.f, 2.f); // _mm_storel_pi: Store two lowest elements to memory.
	_mm_storer_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 4.f, 6.f, 8.f); // _mm_storer_ps: 4-wide store to aligned memory address, but reverse the elements on output.
	_mm_storeu_ps(uarr2, a); aeq(_mm_loadu_ps(uarr2), 8.f, 6.f, 4.f, 2.f); // _mm_storeu_ps: 4-wide store to unaligned memory address.
#ifdef TEST_M64
	/*M64*/_mm_stream_pi((__m64*)uarr, u64castm64(0x0080FF7F01FEFF40ULL)); Assert(*(uint64_t*)uarr == 0x0080FF7F01FEFF40ULL); // _mm_stream_pi: 2-wide store, but with a non-temporal memory cache hint.
#endif
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_stream_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_stream_ps: 4-wide store, but with a non-temporal memory cache hint.

	// SSE1 Arithmetic instructions:
	aeq(_mm_add_ps(a, b), 9.f, 8.f, 7.f, 6.f); // 4-wide add.
	aeq(_mm_add_ss(a, b), 8.f, 6.f, 4.f, 6.f); // Add lowest element, preserve three highest unchanged from a.
	aeq(_mm_div_ps(a, _mm_set_ps(2.f, 3.f, 8.f, 2.f)), 4.f, 2.f, 0.5f, 1.f); // 4-wide div.
	aeq(_mm_div_ss(a, _mm_set_ps(2.f, 3.f, 8.f, 8.f)), 8.f, 6.f, 4.f, 0.25f); // Div lowest element, preserve three highest unchanged from a.
	aeq(_mm_mul_ps(a, b), 8.f, 12.f, 12.f, 8.f); // 4-wide mul.
	aeq(_mm_mul_ss(a, b), 8.f, 6.f, 4.f, 8.f); // Mul lowest element, preserve three highest unchanged from a.
#ifdef TEST_M64
	__m64 m1 = get_m1();
	/*M64*/aeq64(_mm_mulhi_pu16(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // Multiply u16 channels, and store high parts.
	/*M64*/aeq64(    _m_pmulhuw(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // _m_pmulhuw is an alias to _mm_mulhi_pu16.
	__m64 m2 = get_m2();
	/*M64*/aeq64(_mm_sad_pu8(m1, m2), 0x368ULL); // Compute abs. differences of u8 channels, and sum those up to a single 16-bit scalar.
	/*M64*/aeq64(  _m_psadbw(m1, m2), 0x368ULL); // _m_psadbw is an alias to _mm_sad_pu8.
#endif
	aeq(_mm_sub_ps(a, b), 7.f, 4.f, 1.f, -2.f); // 4-wide sub.
	aeq(_mm_sub_ss(a, b), 8.f, 6.f, 4.f, -2.f); // Sub lowest element, preserve three highest unchanged from a.

	// SSE1 Elementary Math functions:
#ifndef __EMSCRIPTEN__ // TODO: Enable support for this to pass.
	aeq(_mm_rcp_ps(a), 0.124969f, 0.166626f, 0.249939f, 0.499878f); // Compute 4-wide 1/x.
	aeq(_mm_rcp_ss(a), 8.f, 6.f, 4.f, 0.499878f); // Compute 1/x of lowest element, pass higher elements unchanged.
	aeq(_mm_rsqrt_ps(a), 0.353455f, 0.408203f, 0.499878f, 0.706909f); // Compute 4-wide 1/sqrt(x).
	aeq(_mm_rsqrt_ss(a), 8.f, 6.f, 4.f, 0.706909f); // Compute 1/sqrt(x) of lowest element, pass higher elements unchanged.
#endif
	aeq(_mm_sqrt_ps(a), 2.82843f, 2.44949f, 2.f, 1.41421f); // Compute 4-wide sqrt(x).
	aeq(_mm_sqrt_ss(a), 8.f, 6.f, 4.f, 1.41421f); // Compute sqrt(x) of lowest element, pass higher elements unchanged.

	__m128 i1 = get_i1();
	__m128 i2 = get_i2();

	// SSE1 Logical instructions:
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_and_ps(i1, i2), 0x83200100, 0x0fecc988, 0x80244021, 0x13458a88); // 4-wide binary AND
	aeqi(_mm_andnot_ps(i1, i2), 0x388a9888, 0xf0021444, 0x7000289c, 0x00121046); // 4-wide binary (!i1) & i2
	aeqi(_mm_or_ps(i1, i2), 0xbfefdba9, 0xffefdfed, 0xf7656bbd, 0xffffdbef); // 4-wide binary OR
	aeqi(_mm_xor_ps(i1, i2), 0x3ccfdaa9, 0xf0031665, 0x77412b9c, 0xecba5167); // 4-wide binary XOR
#endif

	// SSE1 Compare instructions:
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeqi(_mm_cmpeq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp ==
	aeqi(_mm_cmpeq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp ==, pass three highest unchanged.
	aeqi(_mm_cmpge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp >=
	aeqi(_mm_cmpge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp >=, pass three highest unchanged.
	aeqi(_mm_cmpgt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp >
	aeqi(_mm_cmpgt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp >, pass three highest unchanged.
	aeqi(_mm_cmple_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <=
	aeqi(_mm_cmple_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <=, pass three highest unchanged.
	aeqi(_mm_cmplt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <
	aeqi(_mm_cmplt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <, pass three highest unchanged.
	aeqi(_mm_cmpneq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp !=
	aeqi(_mm_cmpneq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp !=, pass three highest unchanged.
	aeqi(_mm_cmpnge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >=
	aeqi(_mm_cmpnge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp not >=, pass three highest unchanged.
	aeqi(_mm_cmpngt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >
	aeqi(_mm_cmpngt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not >, pass three highest unchanged.
	aeqi(_mm_cmpnle_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <=
	aeqi(_mm_cmpnle_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <=, pass three highest unchanged.
	aeqi(_mm_cmpnlt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <
	aeqi(_mm_cmpnlt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <, pass three highest unchanged.

	__m128 nan1 = get_nan1(); // [NAN, 0, 0, NAN]
	__m128 nan2 = get_nan2(); // [NAN, NAN, 0, 0]
	aeqi(_mm_cmpord_ps(nan1, nan2), 0, 0, 0xFFFFFFFF, 0); // 4-wide test if both operands are not nan.
	aeqi(_mm_cmpord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0); // scalar test if both operands are not nan, pass three highest unchanged.
	// Intel Intrinsics Guide documentation is wrong on _mm_cmpunord_ps and _mm_cmpunord_ss. MSDN is right: http://msdn.microsoft.com/en-us/library/khy6fk1t(v=vs.90).aspx
	aeqi(_mm_cmpunord_ps(nan1, nan2), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide test if one of the operands is nan.
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_cmpunord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0xFFFFFFFF); // scalar test if one of the operands is nan, pass three highest unchanged.
#endif

	Assert(_mm_comieq_ss(a, b) == 0); Assert(_mm_comieq_ss(a, a) == 1); // Scalar cmp == of lowest element, return int.
	Assert(_mm_comige_ss(a, b) == 0); Assert(_mm_comige_ss(a, a) == 1); // Scalar cmp >= of lowest element, return int.
	Assert(_mm_comigt_ss(b, a) == 1); Assert(_mm_comigt_ss(a, a) == 0); // Scalar cmp > of lowest element, return int.
	Assert(_mm_comile_ss(b, a) == 0); Assert(_mm_comile_ss(a, a) == 1); // Scalar cmp <= of lowest element, return int.
	Assert(_mm_comilt_ss(a, b) == 1); Assert(_mm_comilt_ss(a, a) == 0); // Scalar cmp < of lowest element, return int.
	Assert(_mm_comineq_ss(a, b) == 1); Assert(_mm_comineq_ss(a, a) == 0); // Scalar cmp != of lowest element, return int.

	// The ucomi versions are identical to comi, except that ucomi signal a FP exception only if one of the input operands is a SNaN, whereas the comi versions signal a FP
	// exception when one of the input operands is either a QNaN or a SNaN.
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomieq_ss(a, b) == 0); Assert(_mm_ucomieq_ss(a, a) == 1); Assert(_mm_ucomieq_ss(a, nan1) == 1);
#endif
	Assert(_mm_ucomige_ss(a, b) == 0); Assert(_mm_ucomige_ss(a, a) == 1); Assert(_mm_ucomige_ss(a, nan1) == 0);
	Assert(_mm_ucomigt_ss(b, a) == 1); Assert(_mm_ucomigt_ss(a, a) == 0); Assert(_mm_ucomigt_ss(a, nan1) == 0);
	Assert(_mm_ucomile_ss(b, a) == 0); Assert(_mm_ucomile_ss(a, a) == 1); Assert(_mm_ucomile_ss(a, nan1) == 1);
	Assert(_mm_ucomilt_ss(a, b) == 1); Assert(_mm_ucomilt_ss(a, a) == 0); Assert(_mm_ucomilt_ss(a, nan1) == 1);
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomineq_ss(a, b) == 1); Assert(_mm_ucomineq_ss(a, a) == 0); Assert(_mm_ucomineq_ss(a, nan1) == 0);
#endif

	// SSE1 Convert instructions:
	__m128 c = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 e = get_e(); // [INF, -INF, 2.5, 3.5]
	__m128 f = get_f(); // [-1.5, 1.5, -2.5, -9223372036854775808]
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvt_pi2ps(a, m2), 8.f, 6.f, -19088744.f, 1985229312.f); // 2-way int32 to float conversion to two lowest channels of m128.
	/*M64*/aeq64(_mm_cvt_ps2pi(c), 0x400000004ULL); // 2-way two lowest floats from m128 to integer, return as m64.
#endif
	aeq(_mm_cvtsi32_ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // Convert int to float, store in lowest channel of m128.
	aeq( _mm_cvt_si2ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // _mm_cvt_si2ss is an alias to _mm_cvtsi32_ss.
#ifndef __EMSCRIPTEN__ // TODO: Fix banker's rounding in cvt functions.
	Assert(_mm_cvtss_si32(c) == 4); Assert(_mm_cvtss_si32(e) == 4); // Convert lowest channel of m128 from float to int.
	Assert( _mm_cvt_ss2si(c) == 4); Assert( _mm_cvt_ss2si(e) == 4); // _mm_cvt_ss2si is an alias to _mm_cvtss_si32.
#endif
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvtpi16_ps(m1), 255.f , -32767.f, 4336.f, 14207.f); // 4-way convert int16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpi32_ps(a, m1), 8.f, 6.f, 16744449.f, 284178304.f); // 2-way convert int32s to floats, return in two lowest channels of m128, pass two highest unchanged.
	/*M64*/aeq(_mm_cvtpi32x2_ps(m1, m2), -19088744.f, 1985229312.f, 16744449.f, 284178304.f); // 4-way convert int32s from two different m64s to float.
	/*M64*/aeq(_mm_cvtpi8_ps(m1), 16.f, -16.f, 55.f, 127.f); // 4-way convert int8s from lowest end of m64 to float in a m128.
	/*M64*/aeq64(_mm_cvtps_pi16(c), 0x0002000200040004ULL); // 4-way convert floats to int16s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi32(c), 0x0000000400000004ULL); // 2-way convert two lowest floats to int32s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi8(c),  0x0000000002020404ULL); // 4-way convert floats to int8s in a m64, zero higher half of the returned m64.
	/*M64*/aeq(_mm_cvtpu16_ps(m1), 255.f , 32769.f, 4336.f, 14207.f); // 4-way convert uint16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpu8_ps(m1), 16.f, 240.f, 55.f, 127.f); // 4-way convert uint8s from lowest end of m64 to float in a m128.
#endif
	aeq(_mm_cvtsi64_ss(c, -9223372036854775808ULL), 1.5f, 2.5f, 3.5f, -9223372036854775808.f); // Convert single int64 to float, store in lowest channel of m128, and pass three higher channels unchanged.
	Assert(_mm_cvtss_f32(c) == 4.5f); // Extract lowest channel of m128 to a plain old float.
	Assert(_mm_cvtss_si64(f) == -9223372036854775808ULL); // Convert lowest channel of m128 from float to int64.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvtt_ps2pi(e), 0x0000000200000003ULL); aeq64(_mm_cvtt_ps2pi(f), 0xfffffffe80000000ULL); // Truncating conversion from two lowest floats of m128 to int32s, return in a m64.
#endif
	Assert(_mm_cvttss_si32(e) == 3); // Truncating conversion from the lowest float of a m128 to int32.
	Assert( _mm_cvtt_ss2si(e) == 3); // _mm_cvtt_ss2si is an alias to _mm_cvttss_si32.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvttps_pi32(c), 0x0000000300000004ULL); // Truncating conversion from two lowest floats of m128 to m64.
#endif
	Assert(_mm_cvttss_si64(f) == -9223372036854775808ULL); // Truncating conversion from lowest channel of m128 from float to int64.

#ifndef __EMSCRIPTEN__ // TODO: Not implemented.
	// SSE1 General support:
	unsigned int mask = _MM_GET_EXCEPTION_MASK();
	_MM_SET_EXCEPTION_MASK(mask);
	unsigned int flushZeroMode = _MM_GET_FLUSH_ZERO_MODE();
	_MM_SET_FLUSH_ZERO_MODE(flushZeroMode);
	unsigned int roundingMode = _MM_GET_ROUNDING_MODE();
	_MM_SET_ROUNDING_MODE(roundingMode);
	unsigned int csr = _mm_getcsr();
	_mm_setcsr(csr);
	unsigned char dummyData[4096];
	_mm_prefetch(dummyData, _MM_HINT_T0);
	_mm_prefetch(dummyData, _MM_HINT_T1);
	_mm_prefetch(dummyData, _MM_HINT_T2);
	_mm_prefetch(dummyData, _MM_HINT_NTA);
	_mm_sfence();
#endif

	// SSE1 Misc instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_movemask_pi8(m1) == 100); // Return int with eight lowest bits set depending on the highest bits of the 8 uint8 input channels of the m64.
	/*M64*/Assert(     _m_pmovmskb(m1) == 100); // _m_pmovmskb is an alias to _mm_movemask_pi8.
#endif
	Assert(_mm_movemask_ps(_mm_set_ps(-1.f, 0.f, 1.f, NAN)) == 8); Assert(_mm_movemask_ps(_mm_set_ps(-INFINITY, -0.f, INFINITY, -INFINITY)) == 13); // Return int with four lowest bits set depending on the highest bits of the 4 m128 input channels.

	// SSE1 Probability/Statistics instructions:
#ifdef TEST_M64
	/*M64*/aeq64(_mm_avg_pu16(m1, m2), 0x7FEE9D4D43A234C8ULL); // 4-way average uint16s.
	/*M64*/aeq64(    _m_pavgw(m1, m2), 0x7FEE9D4D43A234C8ULL); // _m_pavgw is an alias to _mm_avg_pu16.
	/*M64*/aeq64(_mm_avg_pu8(m1, m2),  0x7FEE9D4D43A23548ULL); // 8-way average uint8s.
	/*M64*/aeq64(   _m_pavgb(m1, m2),  0x7FEE9D4D43A23548ULL); // _m_pavgb is an alias to _mm_avg_pu8.

	// SSE1 Special Math instructions:
	/*M64*/aeq64(_mm_max_pi16(m1, m2), 0xFFBA987654377FULL); // 4-way max of int16s.
	/*M64*/aeq64(   _m_pmaxsw(m1, m2), 0xFFBA987654377FULL); // _m_pmaxsw is an alias to _mm_max_pi16.
	/*M64*/aeq64(_mm_max_pu8(m1, m2), 0xFEFFBA9876F0377FULL); // 8-way max of uint8s.
	/*M64*/aeq64(  _m_pmaxub(m1, m2), 0xFEFFBA9876F0377FULL); // _m_pmaxub is an alias to _mm_max_pu8.
	/*M64*/aeq64(_mm_min_pi16(m1, m2), 0xFEDC800110F03210ULL); // 4-way min of int16s.
	/*M64*/aeq64(   _m_pminsw(m1, m2), 0xFEDC800110F03210ULL); // _m_pminsw is an alias to _mm_min_pi16.
	/*M64*/aeq64(_mm_min_pu8(m1, m2), 0xDC800110543210ULL); // 8-way min of uint8s.
	/*M64*/aeq64(  _m_pminub(m1, m2), 0xDC800110543210ULL); // _m_pminub is an alias to _mm_min_pu8.
#endif
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeq(_mm_max_ps(a, b), 8.f, 6.f, 4.f, 4.f); // 4-wide max.
	aeq(_mm_max_ss(a, _mm_set1_ps(100.f)), 8.f, 6.f, 4.f, 100.f); // Scalar max, pass three highest unchanged.
	aeq(_mm_min_ps(a, b), 1.f, 2.f, 3.f, 2.f); // 4-wide min.
	aeq(_mm_min_ss(a, _mm_set1_ps(-100.f)), 8.f, 6.f, 4.f, -100.f); // Scalar min, pass three highest unchanged.

	// SSE1 Swizzle instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_extract_pi16(m1, 1) == 4336); // Extract the given int16 channel from a m64.
	/*M64*/Assert(       _m_pextrw(m1, 1) == 4336); // _m_pextrw is an alias to _mm_extract_pi16.
	/*M64*/aeq64(_mm_insert_pi16(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // Insert a int16 to a specific channel of a m64.
	/*M64*/aeq64(      _m_pinsrw(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // _m_pinsrw is an alias to _mm_insert_pi16.
	/*M64*/aeq64(_mm_shuffle_pi16(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // Shuffle int16s around in the 4 channels of the m64.
	/*M64*/aeq64(       _m_pshufw(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // _m_pshufw is an alias to _mm_shuffle_pi16.
#endif
	aeq(_mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)), 3.f, 4.f, 8.f, 6.f);
	aeq(_mm_unpackhi_ps(a, b), 1.f , 8.f, 2.f, 6.f);
	aeq(_mm_unpacklo_ps(a, b), 3.f , 4.f, 4.f, 2.f);

	// Transposing a matrix via the xmmintrin.h-provided intrinsic.
	__m128 c0 = a; // [8, 6, 4, 2]
	__m128 c1 = b; // [1, 2, 3, 4]
	__m128 c2 = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 c3 = get_d(); // [8.5, 6.5, 4.5, 2.5]
	_MM_TRANSPOSE4_PS(c0, c1, c2, c3);
	aeq(c0, 2.5f, 4.5f, 4.f, 2.f);
	aeq(c1, 4.5f, 3.5f, 3.f, 4.f);
	aeq(c2, 6.5f, 2.5f, 2.f, 6.f);
	aeq(c3, 8.5f, 1.5f, 1.f, 8.f);

	// All done!
	if (numFailures == 0)
		printf("Success!\n");
	else
		printf("%d tests failed!\n", numFailures);
}
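
aeq, aeqi, aeq64, fcastu, Assert and the get_* data factories are helpers this test assumes but does not show. A minimal sketch of what aeq could look like (hypothetical; the real helper likely compares with an epsilon, and the expected values are listed highest lane first, matching the calls above):

// Hypothetical sketch of the assumed aeq helper: compare the four lanes of an
// __m128 against expected values, highest lane first.
#include <xmmintrin.h>
#include <stdio.h>

static int numFailures = 0;   // assumed global failure counter

static void aeq(__m128 v, float e3, float e2, float e1, float e0)
{
	float lanes[4];
	_mm_storeu_ps(lanes, v);  // lanes[0] is the lowest element
	if (lanes[3] != e3 || lanes[2] != e2 || lanes[1] != e1 || lanes[0] != e0)
	{
		printf("mismatch: got [%g %g %g %g]\n",
		       lanes[3], lanes[2], lanes[1], lanes[0]);
		++numFailures;
	}
}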
Example #12
0
std::complex<float>
inline CalcHzAll(const sps::element_rect_t<float>& element,
                 const sps::point_t<float>& projection, // Consider 4 points
                 const float& k,
                 const float* us,
                 const float* uweights,
                 const size_t nUs,
                 const float* vs,
                 const float* vweights,
                 const size_t nVs)
{

  std::complex<float> retval;

  const float z = projection[2];

  float s0 = fabs(projection[1]) + element.hh;
  float s2 = element.hh - fabs(projection[1]);
  float l0 = fabs(projection[0]) + element.hw;
  float l1 = element.hw - fabs(projection[0]);

  __m128 s = _mm_set_ps(s2,s2,s0,s0);
  __m128 l = _mm_set_ps(l1,l0,l1,l0);

  const __m128 vec_s = _mm_fabs_ps(s);
  const __m128 vec_l = _mm_fabs_ps(l);
  const __m128 cargz = _mm_set1_ps(cos(-k*z));
  const __m128 sargz = _mm_set1_ps(sin(-k*z));

  const __m128 vec_l_2 = _mm_mul_ps(vec_l,_m_half_ps);
  const __m128 vec_s_2 = _mm_mul_ps(vec_s,_m_half_ps);

  const __m128 z2 = _mm_set1_ps(SQUARE(z));
  const __m128 vec_l2 = _mm_square_ps(vec_l);
  const __m128 vec_s2 = _mm_square_ps(vec_s);

  __m128 real, imag;

  __m128 intWreal = _mm_setzero_ps();
  __m128 intWimag = _mm_setzero_ps();

  for (size_t iu = 0 ; iu < nUs ; iu++) {

    __m128 us1       = _mm_load1_ps((float*)&us[iu]);
    __m128 uweights1 = _mm_load1_ps((float*)&uweights[iu]);

    __m128 ls  = _mm_add_ps(_mm_mul_ps(vec_l_2,us1),vec_l_2);
    __m128 ls2 = _mm_square_ps(ls);

    __m128 argw = _mm_mul_ps(
                    _mm_set1_ps(-k),
                    _mm_sqrt_ps(
                      _mm_add_ps(
                        _mm_add_ps(
                          ls2,
                          z2),
                        vec_s2)));

    __m128 cargw, sargw;

    _mm_sin_cos_ps(argw, &sargw, &cargw);

    __m128 rcp_denom = _mm_rcp_ps(_mm_add_ps(ls2,vec_s2));
    real = _mm_mul_ps(_mm_mul_ps(uweights1,_mm_sub_ps(cargw, cargz)),rcp_denom);
    imag = _mm_mul_ps(_mm_mul_ps(uweights1,_mm_sub_ps(sargw, sargz)),rcp_denom);
    intWreal = _mm_add_ps(intWreal, real);
    intWimag = _mm_add_ps(intWimag, imag);
  }

  __m128 rcp_denom1 = _mm_rcp_ps(_mm_mul_ps(_m_2pi_ps,_mm_set1_ps(k)));

  intWreal = _mm_mul_ps(
               intWreal,
               _mm_mul_ps(
                 _mm_mul_ps(
                   vec_l_2,
                   vec_s),
                 rcp_denom1));
  intWimag = _mm_mul_ps(
               intWimag,
               _mm_mul_ps(
                 _mm_mul_ps(
                   vec_l_2,
                   vec_s),
                 rcp_denom1));

  __m128 intHreal = _mm_setzero_ps();
  __m128 intHimag = _mm_setzero_ps();

  for(size_t iv = 0 ; iv < nVs ; iv++) {

    __m128 vs1       = _mm_load1_ps((float*)&vs[iv]);
    __m128 vweights1 = _mm_load1_ps((float*)&vweights[iv]);

    __m128 ss  = _mm_add_ps(_mm_mul_ps(vec_s_2,vs1),vec_s_2);
    __m128 ss2 = _mm_square_ps(ss);

    __m128 argh = _mm_mul_ps(
                    _mm_set1_ps(-k),
                    _mm_sqrt_ps(
                      _mm_add_ps(
                        _mm_add_ps(
                          ss2,
                          z2),
                        vec_l2)));

    __m128 cargh, sargh;

    _mm_sin_cos_ps(argh, &sargh, &cargh);

    __m128 rcp_denom = _mm_rcp_ps(_mm_add_ps(ss2,vec_l2));

    real = _mm_mul_ps(_mm_mul_ps(vweights1,_mm_sub_ps(cargh, cargz)),rcp_denom);
    imag = _mm_mul_ps(_mm_mul_ps(vweights1,_mm_sub_ps(sargh, sargz)),rcp_denom);
    intHreal = _mm_add_ps(intHreal, real);
    intHimag = _mm_add_ps(intHimag, imag);
  }

  // Divide by denominator
  intHreal = _mm_mul_ps(intHreal, _mm_mul_ps(_mm_mul_ps(vec_s_2,vec_l), rcp_denom1));
  intHimag = _mm_mul_ps(intHimag, _mm_mul_ps(_mm_mul_ps(vec_s_2,vec_l), rcp_denom1));

  intHreal = _mm_add_ps(intHreal,intWreal);
  intHimag = _mm_add_ps(intHimag,intWimag);

  // Multiply by -i
  __m128 tmp = intHreal;
  intHreal = intHimag;
  intHimag = _mm_neg_ps(tmp);

  // Filter
  __m128 sign = _mm_mul_ps(s,l);
  intHreal = _mm_mulsign_ps(intHreal,sign);
  intHimag = _mm_mulsign_ps(intHimag,sign);

  // Horizontal sum
  _mm_store_ss(&(reinterpret_cast<float(&)[2]>(retval)[0]),_mm_dp_ps(_m_one_ps,intHreal,0xF1));
  _mm_store_ss(&(reinterpret_cast<float(&)[2]>(retval)[1]),_mm_dp_ps(_m_one_ps,intHimag,0xF1));

  return retval;
}
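
_mm_fabs_ps, _mm_square_ps, _mm_neg_ps, _mm_mulsign_ps and the _m_half_ps / _m_one_ps / _m_2pi_ps constants are project-local helpers, not standard SSE intrinsics (_mm_sin_cos_ps is likewise an assumed vectorized sincos from a math library). Plausible one-line definitions, shown only so the code above reads standalone (assumptions, not the project's actual source):

// Assumed shapes of the project-local helpers used above (not standard SSE):
#include <xmmintrin.h>

static const __m128 _m_half_ps = _mm_set1_ps(0.5f);
static const __m128 _m_one_ps  = _mm_set1_ps(1.0f);
static const __m128 _m_2pi_ps  = _mm_set1_ps(6.283185307179586f);

static inline __m128 _mm_square_ps(__m128 a) { return _mm_mul_ps(a, a); }
static inline __m128 _mm_fabs_ps(__m128 a)   // clear the sign bit
{ return _mm_andnot_ps(_mm_set1_ps(-0.0f), a); }
static inline __m128 _mm_neg_ps(__m128 a)    // flip the sign bit
{ return _mm_xor_ps(_mm_set1_ps(-0.0f), a); }
static inline __m128 _mm_mulsign_ps(__m128 a, __m128 s) // a * sign(s), lane-wise
{ return _mm_xor_ps(a, _mm_and_ps(s, _mm_set1_ps(-0.0f))); }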
Example #13
0
std::complex<float>
inline CalcHzVecGL(const float& s,
                   const float& l,
                   const float& z,
                   const float& k,
                   const float* us,
                   const float* uweights,
                   const size_t nUs,
                   const float* vs,
                   const float* vweights,
                   const size_t nVs)
{

  const __m128 carg = _mm_set1_ps(cos(-k*z));
  const __m128 sarg = _mm_set1_ps(sin(-k*z));

  const __m128 l_2 = _mm_mul_ps(_mm_set1_ps(l),_m_half_ps);
  const __m128 s_2 = _mm_mul_ps(_mm_set1_ps(s),_m_half_ps);

  const __m128 z2 = _mm_set1_ps(SQUARE(z));
  const __m128 l2 = _mm_set1_ps(SQUARE(l));
  const __m128 s2 = _mm_set1_ps(SQUARE(s));

  __m128 intWreal = _mm_setzero_ps();
  __m128 intWimag = _mm_setzero_ps();

  __m128 real, imag;

  for (size_t iu = 0 ; iu < 4*((nUs+3)/4) ; iu+=4) {

    __m128 ls  = _mm_add_ps(_mm_mul_ps(l_2,_mm_load_ps((float*)&us[iu])),l_2);
    __m128 ls2 = _mm_square_ps(ls);

    __m128 argw = _mm_mul_ps(_mm_set1_ps(-k),_mm_sqrt_ps(_mm_add_ps(_mm_add_ps(ls2,z2),s2)));

    __m128 cargw, sargw;

    _mm_sin_cos_ps(argw, &sargw, &cargw);

    __m128 vec_uweight = _mm_load_ps((float*)&uweights[iu]);

    __m128 denom = _mm_add_ps(ls2,s2);
    __m128 rcp_denom = _mm_rcp_ps(denom);

    real = _mm_mul_ps(_mm_mul_ps(vec_uweight,_mm_sub_ps(cargw, carg)),rcp_denom);
    imag = _mm_mul_ps(_mm_mul_ps(vec_uweight,_mm_sub_ps(sargw, sarg)),rcp_denom);

    // Update integral
    intWreal = _mm_add_ps(intWreal, real);
    intWimag = _mm_add_ps(intWimag, imag);
  }

  __m128 rcp_denom1 = _mm_rcp_ps(_mm_mul_ps(_m_2pi_ps,_mm_set1_ps(k)));

  intWreal = _mm_mul_ps(intWreal, _mm_mul_ps(_mm_mul_ps(l_2,_mm_set1_ps(s)), rcp_denom1));
  intWimag = _mm_mul_ps(intWimag, _mm_mul_ps(_mm_mul_ps(l_2,_mm_set1_ps(s)), rcp_denom1));

  // integral height
  std::complex<float> intH = std::complex<float>(float(0.0),float(0.0));

  __m128 intHreal = _mm_setzero_ps();
  __m128 intHimag = _mm_setzero_ps();

  for(size_t iv = 0 ; iv < 4*((nVs+3)/4) ; iv+=4) {

    __m128 ss  = _mm_add_ps(_mm_mul_ps(s_2,_mm_load_ps((float*)&vs[iv])),s_2);
    __m128 ss2 = _mm_square_ps(ss);

    __m128 argh = _mm_mul_ps(_mm_set1_ps(-k),_mm_sqrt_ps(_mm_add_ps(_mm_add_ps(ss2,z2),l2)));

    __m128 cargh, sargh;

    _mm_sin_cos_ps(argh, &sargh, &cargh);

    __m128 vec_vweight = _mm_load_ps((float*)&vweights[iv]);
    __m128 rcp_denom = _mm_rcp_ps(_mm_add_ps(ss2,l2));

    real = _mm_mul_ps(_mm_mul_ps(vec_vweight,_mm_sub_ps(cargh, carg)),rcp_denom);
    imag = _mm_mul_ps(_mm_mul_ps(vec_vweight,_mm_sub_ps(sargh, sarg)),rcp_denom);
    intHreal = _mm_add_ps(intHreal, real);
    intHimag = _mm_add_ps(intHimag, imag);
  }

  // Divide by denominator
  intHreal = _mm_mul_ps(intHreal, _mm_mul_ps(_mm_mul_ps(s_2,_mm_set1_ps(l)), rcp_denom1));
  intHimag = _mm_mul_ps(intHimag, _mm_mul_ps(_mm_mul_ps(s_2,_mm_set1_ps(l)), rcp_denom1));

  intHreal = _mm_add_ps(intHreal,intWreal);
  intHimag = _mm_add_ps(intHimag,intWimag);

  // Multiply by -i
  __m128 tmp = intHreal;
  intHreal = intHimag;
  intHimag = _mm_neg_ps(tmp);

  // Sum 4 partial integrals
  __m128 result = _mm_dp_ps(_m_one_ps,intHreal,0xF1);
  result = _mm_add_ps(result,_mm_dp_ps(_m_one_ps,intHimag,0xF2));

  ALIGN16_BEGIN float results[4] ALIGN16_END;

  _mm_store_ps(results,result);

  intH.real(results[0]);
  intH.imag(results[1]);

  return intH;
}
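
The final reduction here uses SSE4.1 _mm_dp_ps: with mask 0xF1 the high nibble (0xF) includes all four lanes in the dot product and the low nibble (0x1) writes the sum to lane 0 only, while 0xF2 writes the same sum to lane 1, which is how the real and imaginary parts land in results[0] and results[1] after one _mm_add_ps. A standalone sketch of that horizontal sum (hypothetical demo, requires SSE4.1):

// Horizontal sum of four floats via _mm_dp_ps against a vector of ones.
#include <smmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128 one = _mm_set1_ps(1.0f);
    __m128 v   = _mm_setr_ps(1.f, 2.f, 3.f, 4.f);
    float sum = _mm_cvtss_f32(_mm_dp_ps(one, v, 0xF1)); // 1+2+3+4 = 10
    printf("%g\n", sum);
    return 0;
}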
Example #16
0
mlib_status
__mlib_SignalLimit_F32_F32(
    mlib_f32 *dst,
    const mlib_f32 *src,
    const mlib_f32 *low,
    const mlib_f32 *high,
    mlib_s32 n)
{
	mlib_s32 i, count;
	mlib_f32 tl0;
	mlib_f32 th0;
	mlib_f32 x;
	__m128 tl0_p;
	__m128 th0_p;
	__m128 dx;
	mlib_f32 *psrc, *pdst;
	__m128 *spsrc, *spdst;

	tl0 = low[0];
	th0 = high[0];

	if ((tl0 > th0) || (n <= 0))
		return (MLIB_FAILURE);

	tl0_p = _mm_set1_ps(tl0);
	th0_p = _mm_set1_ps(th0);

	psrc = (mlib_f32 *)src;
	pdst = (mlib_f32 *)dst;

	count = (16 - ((mlib_addr)psrc & 15)) >> 2;
	if (count > n) count = n;

	for (i = 0; i < count; i++) {
		x = psrc[i];
		x = (x < tl0) ? tl0 : x;
		x = (x >= th0) ? th0 : x;
		pdst[i] = x;
	}

	n -= count;
	psrc += count;
	pdst += count;
	spsrc = (__m128 *)psrc;
	spdst = (__m128 *)pdst;

	if ((mlib_addr)spdst & 15) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
		for (i = 0; i < n >> 2; i++) {
			dx = _mm_load_ps((mlib_f32 *)(spsrc + i));
			dx = _mm_max_ps(dx, tl0_p);
			dx = _mm_min_ps(dx, th0_p);
			_mm_storeu_ps((mlib_f32 *)(spdst + i), dx);
		}
	} else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
		for (i = 0; i < n >> 2; i++) {
			dx = _mm_load_ps((mlib_f32 *)(spsrc + i));
			dx = _mm_max_ps(dx, tl0_p);
			dx = _mm_min_ps(dx, th0_p);
			_mm_store_ps((mlib_f32 *)(spdst + i), dx);
		}
	}

	i <<= 2;

	for (; i < n; i++) {
		x = psrc[i];
		x = (x < tl0) ? tl0 : x;
		x = (x >= th0) ? th0 : x;
		pdst[i] = x;
	}

	return (MLIB_SUCCESS);
}
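
The scalar prologue peels elements until psrc reaches a 16-byte boundary: (16 - (addr & 15)) >> 2 is the number of floats left before the next boundary (4 when the pointer is already aligned, so up to four elements always go through the scalar loop), after which the body can use aligned loads. A standalone sketch of that computation (hypothetical helper; assumes the float pointer is at least 4-byte aligned, so addr & 15 is a multiple of 4):

// Number of scalar floats to peel before an aligned SIMD loop can start.
#include <stdint.h>
#include <stddef.h>

static size_t floats_until_aligned(const float *p, size_t n)
{
	size_t count = (16 - ((uintptr_t)p & 15)) >> 2; // 1..4 floats
	return count > n ? n : count;                   // never beyond the buffer
}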
Example #19
0
bool QBVHAccel::Intersect(const Ray *ray, RayHit *rayHit) const {
  //------------------------------
  // Prepare the ray for intersection
  QuadRay ray4(*ray);
  __m128 invDir[3];
  invDir[0] = _mm_set1_ps(1.f / ray->d.x);
  invDir[1] = _mm_set1_ps(1.f / ray->d.y);
  invDir[2] = _mm_set1_ps(1.f / ray->d.z);

  int signs[3];
  ray->GetDirectionSigns(signs);

  //------------------------------
  // Main loop
  int todoNode = 0; // the index in the stack
  int32_t nodeStack[64];
  nodeStack[0] = 0; // first node to handle: root node

  while (todoNode >= 0) {
    // Leaves are identified by a negative index
    if (!QBVHNode::IsLeaf(nodeStack[todoNode])) {
      QBVHNode &node = nodes[nodeStack[todoNode]];
      --todoNode;

      // Oddly enough, checking for empty nodes here slows down the rendering
      const int32_t visit = node.BBoxIntersect(ray4, invDir, signs);

      switch (visit) {
      case (0x1 | 0x0 | 0x0 | 0x0):
        nodeStack[++todoNode] = node.children[0];
        break;
      case (0x0 | 0x2 | 0x0 | 0x0):
        nodeStack[++todoNode] = node.children[1];
        break;
      case (0x1 | 0x2 | 0x0 | 0x0):
        nodeStack[++todoNode] = node.children[0];
        nodeStack[++todoNode] = node.children[1];
        break;
      case (0x0 | 0x0 | 0x4 | 0x0):
        nodeStack[++todoNode] = node.children[2];
        break;
      case (0x1 | 0x0 | 0x4 | 0x0):
        nodeStack[++todoNode] = node.children[0];
        nodeStack[++todoNode] = node.children[2];
        break;
      case (0x0 | 0x2 | 0x4 | 0x0):
        nodeStack[++todoNode] = node.children[1];
        nodeStack[++todoNode] = node.children[2];
        break;
      case (0x1 | 0x2 | 0x4 | 0x0):
        nodeStack[++todoNode] = node.children[0];
        nodeStack[++todoNode] = node.children[1];
        nodeStack[++todoNode] = node.children[2];
        break;
      case (0x0 | 0x0 | 0x0 | 0x8):
        nodeStack[++todoNode] = node.children[3];
        break;
      case (0x1 | 0x0 | 0x0 | 0x8):
        nodeStack[++todoNode] = node.children[0];
        nodeStack[++todoNode] = node.children[3];
        break;
      case (0x0 | 0x2 | 0x0 | 0x8):
        nodeStack[++todoNode] = node.children[1];
        nodeStack[++todoNode] = node.children[3];
        break;
      case (0x1 | 0x2 | 0x0 | 0x8):
        nodeStack[++todoNode] = node.children[0];
        nodeStack[++todoNode] = node.children[1];
        nodeStack[++todoNode] = node.children[3];
        break;
      case (0x0 | 0x0 | 0x4 | 0x8):
        nodeStack[++todoNode] = node.children[2];
        nodeStack[++todoNode] = node.children[3];
        break;
      case (0x1 | 0x0 | 0x4 | 0x8):
        nodeStack[++todoNode] = node.children[0];
        nodeStack[++todoNode] = node.children[2];
        nodeStack[++todoNode] = node.children[3];
        break;
      case (0x0 | 0x2 | 0x4 | 0x8):
        nodeStack[++todoNode] = node.children[1];
        nodeStack[++todoNode] = node.children[2];
        nodeStack[++todoNode] = node.children[3];
        break;
      case (0x1 | 0x2 | 0x4 | 0x8):
        nodeStack[++todoNode] = node.children[0];
        nodeStack[++todoNode] = node.children[1];
        nodeStack[++todoNode] = node.children[2];
        nodeStack[++todoNode] = node.children[3];
        break;
      }
    } else {
      //----------------------
      // It is a leaf: all the information is encoded in the index
      const int32_t leafData = nodeStack[todoNode];
      --todoNode;

      if (QBVHNode::IsEmpty(leafData))
        continue;

      // Perform intersection
      const u_int nbQuadPrimitives = QBVHNode::NbQuadPrimitives(leafData);

      const u_int offset = QBVHNode::FirstQuadIndex(leafData);

      for (u_int primNumber = offset; primNumber < offset + nbQuadPrimitives; ++primNumber)
        prims[primNumber].Intersect(ray4, *ray, rayHit);
    } // end of the leaf branch
  }

  return !rayHit->Miss();
}
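The fifteen-case switch above simply enumerates every combination of the 4-bit hit mask returned by BBoxIntersect(); the unrolled form is presumably kept for speed. A compact sketch of the same traversal step, reusing the names from the example (the helper itself is illustrative, not the project's actual code):

static inline void PushVisitedChildren(const QBVHNode &node, const int32_t visit,
                                       int32_t *nodeStack, int &todoNode) {
  // Bit k of the visit mask is set when child k's bounding box was hit.
  for (int child = 0; child < 4; ++child) {
    if (visit & (1 << child))
      nodeStack[++todoNode] = node.children[child];
  }
}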
    real             scratch[4*DIM];
    __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    int              vdwioffset0;
    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    __m128i          vfitab;
    __m128i          ifour       = _mm_set1_epi32(4);
    __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
    real             *vftab;
    __m128           dummy_mask,cutoff_mask;
    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    __m128           one     = _mm_set1_ps(1.0);
    __m128           two     = _mm_set1_ps(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm_set1_ps(fr->epsfac);
    charge           = mdatoms->chargeA;
Example #21
0
void process(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid,
             const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  float *in;
  float *out;
  dt_iop_zonesystem_gui_data_t *g = NULL;
  dt_iop_zonesystem_data_t *data = (dt_iop_zonesystem_data_t *)piece->data;

  const int width = roi_out->width;
  const int height = roi_out->height;

  if(self->dev->gui_attached && piece->pipe->type == DT_DEV_PIXELPIPE_PREVIEW)
  {
    g = (dt_iop_zonesystem_gui_data_t *)self->gui_data;
    dt_pthread_mutex_lock(&g->lock);
    if(g->in_preview_buffer == NULL || g->out_preview_buffer == NULL || g->preview_width != width
       || g->preview_height != height)
    {
      g_free(g->in_preview_buffer);
      g_free(g->out_preview_buffer);
      g->in_preview_buffer = g_malloc_n((size_t)width * height, sizeof(guchar));
      g->out_preview_buffer = g_malloc_n((size_t)width * height, sizeof(guchar));
      g->preview_width = width;
      g->preview_height = height;
    }
    dt_pthread_mutex_unlock(&g->lock);
  }

  /* calculate zonemap */
  const int size = data->size;
  float zonemap[MAX_ZONE_SYSTEM_SIZE] = { -1 };
  _iop_zonesystem_calculate_zonemap(data, zonemap);
  const int ch = piece->colors;


  /* process the image */
  in = (float *)ivoid;
  out = (float *)ovoid;

  const float rzscale = (size - 1) / 100.0f;

  float zonemap_offset[MAX_ZONE_SYSTEM_SIZE] = { -1 };
  float zonemap_scale[MAX_ZONE_SYSTEM_SIZE] = { -1 };

  // precompute scale and offset
  for(int k = 0; k < size - 1; k++) zonemap_scale[k] = (zonemap[k + 1] - zonemap[k]) * (size - 1);
  for(int k = 0; k < size - 1; k++) zonemap_offset[k] = 100.0f * ((k + 1) * zonemap[k] - k * zonemap[k + 1]);

#ifdef _OPENMP
#pragma omp parallel for default(none) shared(in, out, zonemap_scale, zonemap_offset) schedule(static)
#endif
  for(int j = 0; j < height; j++)
    for(int i = 0; i < width; i++)
    {
      /* remap lightness into zonemap and apply lightness */
      const float *inp = in + ch * ((size_t)j * width + i);
      float *outp = out + ch * ((size_t)j * width + i);

      const int rz = CLAMPS(inp[0] * rzscale, 0, size - 2); // zone index

      const float zs = ((rz > 0) ? (zonemap_offset[rz] / inp[0]) : 0) + zonemap_scale[rz];

      _mm_stream_ps(outp, _mm_mul_ps(_mm_load_ps(inp), _mm_set1_ps(zs)));
    }

  _mm_sfence();

  if(piece->pipe->mask_display) dt_iop_alpha_copy(ivoid, ovoid, width, height);


  /* if the GUI is attached and the buffers exist, gaussian-blur and fill the buffers with zone indexes */
  if(self->dev->gui_attached && g && g->in_preview_buffer && g->out_preview_buffer)
  {

    float Lmax[] = { 100.0f };
    float Lmin[] = { 0.0f };

    /* setup gaussian kernel */
    const int radius = 8;
    const float sigma = 2.5 * (radius * roi_in->scale / piece->iscale);

    dt_gaussian_t *gauss = dt_gaussian_init(width, height, 1, Lmax, Lmin, sigma, DT_IOP_GAUSSIAN_ZERO);

    float *tmp = g_malloc_n((size_t)width * height, sizeof(float));

    if(gauss && tmp)
    {
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(ivoid, tmp) schedule(static)
#endif
      for(size_t k = 0; k < (size_t)width * height; k++) tmp[k] = ((float *)ivoid)[ch * k];

      dt_gaussian_blur(gauss, tmp, tmp);

      /* create zonemap preview for input */
      dt_pthread_mutex_lock(&g->lock);
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(tmp, g) schedule(static)
#endif
      for(size_t k = 0; k < (size_t)width * height; k++)
      {
        g->in_preview_buffer[k] = CLAMPS(tmp[k] * (size - 1) / 100.0f, 0, size - 2);
      }
      dt_pthread_mutex_unlock(&g->lock);


#ifdef _OPENMP
#pragma omp parallel for default(none) shared(ovoid, tmp) schedule(static)
#endif
      for(size_t k = 0; k < (size_t)width * height; k++) tmp[k] = ((float *)ovoid)[ch * k];

      dt_gaussian_blur(gauss, tmp, tmp);


      /* create zonemap preview for output */
      dt_pthread_mutex_lock(&g->lock);
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(tmp, g) schedule(static)
#endif
      for(size_t k = 0; k < (size_t)width * height; k++)
      {
        g->out_preview_buffer[k] = CLAMPS(tmp[k] * (size - 1) / 100.0f, 0, size - 2);
      }
      dt_pthread_mutex_unlock(&g->lock);
    }

    g_free(tmp);
    if(gauss) dt_gaussian_free(gauss);
  }
}
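The precomputed zonemap_scale/zonemap_offset arrays fold the per-zone linear interpolation into a single multiply per pixel. A scalar sketch of what the streamed SIMD multiply computes for the lightness channel (names follow the example; the helper itself is hypothetical, not darktable's actual code):

static inline float zone_remap_L(const float L, const float *zonemap, const int size)
{
  const int rz = CLAMPS(L * (size - 1) / 100.0f, 0, size - 2); // zone index
  const float t = L * (size - 1) / 100.0f - rz;                // position inside the zone
  // piecewise-linear interpolation between the two zone boundaries; algebraically
  // equal to the L * (zonemap_offset[rz]/L + zonemap_scale[rz]) used above
  // (the SIMD path drops the offset term when rz == 0, avoiding division by L near zero)
  return 100.0f * (zonemap[rz] + t * (zonemap[rz + 1] - zonemap[rz]));
}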
void getScale(short *mirror, short *result, int height, int width, int delay, float ix, float iy) {
	int i, j;
	float a, b;
	__m128 pm,
		pn_1, pn0, pn1, pn2;
	__m128 sum_1, sum0, sum1, sum2;
	
	int size = height*iy;
	int depth = width+2*delay;

	float sum;

#pragma omp parallel private(i, j, sum, a, b, pn_1, pn0, pn1, pn2, pm, sum_1, sum0, sum1, sum2)
{
#pragma omp for
	for (i = 0; i < size; i++) {
		for (j = 0; j < width*ix; j++) {

			/* fractional offsets within the source cell; note the loop bounds
			   scale rows by iy and columns by ix while the indexing below uses
			   i/ix and j/iy, which is only consistent when ix == iy */
			a = ((float) i)/ix  - ((int) i/ix);
			b = ((float) j)/iy  - ((int) j/iy);
			
				//Get all pn
				pn_1 = _mm_set1_ps(Pk(-1 - a));
				pn0  = _mm_set1_ps(Pk(- a));
				pn1  = _mm_set1_ps(Pk(1 - a));
				pn2  = _mm_set1_ps(Pk(2 - a));
				//get all pm
				pm = _mm_set_ps(Pk(b-2),Pk(b-1),Pk(b),Pk(b+1));
				//tmp mul pn*pm
				pn_1 = _mm_mul_ps(pm,pn_1);
				pn0  = _mm_mul_ps(pm,pn0);
				pn1  = _mm_mul_ps(pm,pn1);
				pn2  = _mm_mul_ps(pm,pn2);
				//get all mirror pos
				sum_1 = _mm_cvtepi32_ps(_mm_setr_epi32(
						mirror INDEX ((int) (i/ix+delay-1), (int) (j/iy+delay-1), depth),
						mirror INDEX ((int) (i/ix+delay-1), (int) (j/iy+delay), depth),
						mirror INDEX ((int) (i/ix+delay-1), (int) (j/iy+delay+1), depth),
						mirror INDEX ((int) (i/ix+delay-1), (int) (j/iy+delay+2), depth)));
				sum0  = _mm_cvtepi32_ps(_mm_setr_epi32(
						mirror INDEX ((int) (i/ix+delay), (int) (j/iy+delay-1), depth),
						mirror INDEX ((int) (i/ix+delay), (int) (j/iy+delay), depth),
						mirror INDEX ((int) (i/ix+delay), (int) (j/iy+delay+1), depth),
						mirror INDEX ((int) (i/ix+delay), (int) (j/iy+delay+2), depth)));
				sum1  = _mm_cvtepi32_ps(_mm_setr_epi32(
						mirror INDEX ((int) (i/ix+delay+1), (int) (j/iy+delay-1), depth),
						mirror INDEX ((int) (i/ix+delay+1), (int) (j/iy+delay), depth),
						mirror INDEX ((int) (i/ix+delay+1), (int) (j/iy+delay+1), depth),
						mirror INDEX ((int) (i/ix+delay+1), (int) (j/iy+delay+2), depth)));
				sum2  = _mm_cvtepi32_ps(_mm_setr_epi32(
						mirror INDEX ((int) (i/ix+delay+2), (int) (j/iy+delay-1), depth),
						mirror INDEX ((int) (i/ix+delay+2), (int) (j/iy+delay), depth),
						mirror INDEX ((int) (i/ix+delay+2), (int) (j/iy+delay+1), depth),
						mirror INDEX ((int) (i/ix+delay+2), (int) (j/iy+delay+2), depth)));
				//weight each row of mirror samples
				sum_1 = _mm_mul_ps(sum_1, pn_1);
				sum0  = _mm_mul_ps(sum0, pn0);
				sum1  = _mm_mul_ps(sum1, pn1);
				sum2  = _mm_mul_ps(sum2, pn2);
				//accumulate the four weighted rows (mirror * pn * pm)
				sum_1 = _mm_add_ps(sum_1, sum0);
				sum1  = _mm_add_ps(sum1, sum2);
				
				sum_1 = _mm_add_ps(sum_1, sum1);
				
				sum_1 = _mm_hadd_ps(sum_1, sum_1);
				sum_1 = _mm_hadd_ps(sum_1, sum_1);
				
				_mm_store_ss(&sum, sum_1);

			result INDEX(i, j, (int) (width*ix)) = (int) sum;
			
		}
	}
}

}
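For clarity, here is a scalar reference for one output sample of getScale() above, using the same Pk kernel and INDEX macro, with si = (int)(i/ix) and sj = (int)(j/iy). This is a sketch, not part of the original program, and it assumes ix == iy as noted in the comment inside the loop:

float cubic_sample(const short *mirror, int si, int sj, float a, float b,
                   int delay, int depth) {
	float sum = 0.0f;
	for (int n = -1; n <= 2; n++)      /* 4 support rows, weights Pk(n - a) */
		for (int m = -1; m <= 2; m++)  /* 4 support columns, weights Pk(b - m) */
			sum += Pk(n - a) * Pk(b - m)
			     * mirror INDEX (si + delay + n, sj + delay + m, depth);
	return sum;
}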
Example #23
0
	static inline Simd sub(const Simd& lhs, float rhs) {
		Simd res;
		__m128 tmp = _mm_set1_ps(rhs);
		res.reg = _mm_sub_ps(lhs.reg, tmp);
		return res;
	}
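A matching overload for the scalar-minus-vector case would broadcast the scalar the same way; a sketch assuming the same Simd struct with a public __m128 reg member:

	static inline Simd sub(float lhs, const Simd& rhs) {
		Simd res;
		__m128 tmp = _mm_set1_ps(lhs);      // broadcast the scalar to all four lanes
		res.reg = _mm_sub_ps(tmp, rhs.reg); // lhs - rhs, element-wise
		return res;
	}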
Example #24
0
        uint32_t r_size = 0;

        float h[4];
        float p[4][3];
        float ft[4][2];
        float t[4];
        uint32_t offset = 0;

        float const d[2] = { line[2] - line[0], line[3] - line[1], };
        float const length_inv = 1.0f/sqrtf(d[0]*d[0] + d[1]*d[1]);
        float const n[2] = { d[1]*length_inv, -d[0]*length_inv, };
        float const distance = line[0]*n[0] + line[1]*n[1];

        /* TODO: investigate integer registers */
        __m128 const distance_4 = _mm_set1_ps(distance);
        __m128 const n0_4 = _mm_set1_ps(n[0]);
        __m128 const n1_4 = _mm_set1_ps(n[1]);

        __m128 const mask1_4 = _mm_set1_ps(1);
        __m128 const mask2_4 = _mm_set1_ps(2);
        __m128 const shift_4 = _mm_setr_ps(1, 3, 9, 27);

        /* process cell ids */
        for (uint32_t ii = cells_offset; ii < cells_count; ++ii) {
                uint32_t const id = cells[ii];

                if (id > dims[0]*dims[1]*dims[2]) {
                        printf("big id %u\n", id);
                        assert(0);
                }
    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
    int              vdwioffset2;
    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    __m128i          ewitab;
    __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
    real             *ewtab;
    __m128           dummy_mask,cutoff_mask;
    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    __m128           one     = _mm_set1_ps(1.0);
    __m128           two     = _mm_set1_ps(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm_set1_ps(fr->epsfac);
    charge           = mdatoms->chargeA;
Example #26
0
int main() {
    __m128 u, v;
    u = _mm_set1_ps(0.0f);
    v = _mm_moveldup_ps(u); // SSE3
    return 0;
}
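This reads like a minimal configure-time probe for SSE3 support. As a standalone illustration of what the instruction computes (not part of the original probe): _mm_moveldup_ps duplicates the even-indexed lanes, mapping {a0,a1,a2,a3} to {a0,a0,a2,a2}.

#include <pmmintrin.h>
#include <stdio.h>

int main(void) {
    __m128 u = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 v = _mm_moveldup_ps(u); // SSE3: duplicate lanes 0 and 2
    float out[4];
    _mm_storeu_ps(out, v);         // unaligned store of the result
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* prints: 1 1 3 3 */
    return 0;
}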
    int              vdwioffset0;
    __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwioffset1;
    __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
    int              vdwioffset2;
    __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
    __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
    __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
    __m128           velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    __m128           dummy_mask,cutoff_mask;
    __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
    __m128           one     = _mm_set1_ps(1.0);
    __m128           two     = _mm_set1_ps(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm_set1_ps(fr->epsfac);
    charge           = mdatoms->chargeA;
    krf              = _mm_set1_ps(fr->ic->k_rf);
Example #28
0
/* A vectorized version of the Voigt function using X86 SSE instructions */
void my_voigt(const float *damping, const float *frequency_offset, float *voigt_value, int N)   
{                                                                   
    // coefficients of the rational approximation formula
    // to the complementary error function
    const __m128 A0 = _mm_set1_ps(122.607931777104326f);
    const __m128 A1 = _mm_set1_ps(214.382388694706425f);
    const __m128 A2 = _mm_set1_ps(181.928533092181549f);
    const __m128 A3 = _mm_set1_ps(93.155580458138441f);
    const __m128 A4 = _mm_set1_ps(30.180142196210589f);
    const __m128 A5 = _mm_set1_ps(5.912626209773153f);
    const __m128 A6 = _mm_set1_ps(0.564189583562615f);
    const __m128 B0 = _mm_set1_ps(122.60793177387535f);
    const __m128 B1 = _mm_set1_ps(352.730625110963558f);
    const __m128 B2 = _mm_set1_ps(457.334478783897737f);
    const __m128 B3 = _mm_set1_ps(348.703917719495792f);
    const __m128 B4 = _mm_set1_ps(170.354001821091472f);
    const __m128 B5 = _mm_set1_ps(53.992906912940207f);
    const __m128 B6 = _mm_set1_ps(10.479857114260399f);

    __m128 ivsigno;
    __m128 V;
    __m128 Z1_real;
    __m128 Z1_imag;
    __m128 Z2_real;
    __m128 Z2_imag;
    __m128 Z3_real;
    __m128 Z3_imag;
    __m128 Z4_real;
    __m128 Z4_imag;
    __m128 Z5_real;
    __m128 Z5_imag;
    __m128 Z6_real;
    __m128 Z6_imag;
    __m128 ZZ1_real;
    __m128 ZZ1_imag;
    __m128 ZZ2_real;
    __m128 ZZ2_imag;
    __m128 ZZ3_real;
    __m128 ZZ3_imag;
    __m128 ZZ4_real;
    __m128 ZZ4_imag;
    __m128 ZZ5_real;
    __m128 ZZ5_imag;
    __m128 ZZ6_real;
    __m128 ZZ6_imag;
    __m128 ZZ7_real;
    __m128 ZZ7_imag;
    __m128 division_factor;
    __m128 ZZZ_real;
    __m128 damp;
    __m128 offs;
    __m128 vval;
    __m128 one = _mm_set1_ps(1.0f); 
    __m128 zero = _mm_set1_ps(0.0f);
    __m128 mone = _mm_set1_ps(-1.0f);
    __m128 mask;

    int i;
    /* VECLEN (the number of floats per SIMD vector, 4 here) is assumed to be defined elsewhere */
    for(i=0; i<N; i+=VECLEN){
        _mm_prefetch((const char *)&damping[i+64], _MM_HINT_T0);
        _mm_prefetch((const char *)&frequency_offset[i+64], _MM_HINT_T0);
        damp = _mm_load_ps(&damping[i]);
        offs = _mm_load_ps(&frequency_offset[i]);
        mask = _mm_cmplt_ps(offs, zero);                                      /* offs < 0 ? */
        ivsigno = _mm_add_ps(_mm_and_ps(mask,mone),_mm_andnot_ps(mask,one));  /* -1 or +1 */
        V = _mm_mul_ps(ivsigno, offs);                                        /* V = |offs| */

        Z1_real = _mm_add_ps(_mm_mul_ps(A6, damp), A5);
        Z1_imag = _mm_mul_ps(A6, V);
        Z2_real = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(Z1_real, damp), _mm_mul_ps(Z1_imag, V)), A4);
        Z2_imag = _mm_add_ps(_mm_mul_ps(Z1_real, V), _mm_mul_ps(Z1_imag, damp));
        Z3_real = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(Z2_real, damp), _mm_mul_ps(Z2_imag, V)), A3);
        Z3_imag = _mm_add_ps(_mm_mul_ps(Z2_real, V), _mm_mul_ps(Z2_imag, damp));
        Z4_real = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(Z3_real, damp), _mm_mul_ps(Z3_imag, V)), A2);
        Z4_imag = _mm_add_ps(_mm_mul_ps(Z3_real, V), _mm_mul_ps(Z3_imag, damp));
        Z5_real = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(Z4_real, damp), _mm_mul_ps(Z4_imag, V)), A1);
        Z5_imag = _mm_add_ps(_mm_mul_ps(Z4_real, V), _mm_mul_ps(Z4_imag, damp));
        Z6_real = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(Z5_real, damp), _mm_mul_ps(Z5_imag, V)), A0);
        Z6_imag = _mm_add_ps(_mm_mul_ps(Z5_real, V), _mm_mul_ps(Z5_imag, damp));
        ZZ1_real = _mm_add_ps(damp, B6);          
        ZZ1_imag = V;                    
        ZZ2_real = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(ZZ1_real, damp), _mm_mul_ps(ZZ1_imag, V)), B5); 
        ZZ2_imag = _mm_add_ps(_mm_mul_ps(ZZ1_real, V), _mm_mul_ps(ZZ1_imag, damp)); 
        ZZ3_real = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(ZZ2_real, damp), _mm_mul_ps(ZZ2_imag, V)), B4); 
        ZZ3_imag = _mm_add_ps(_mm_mul_ps(ZZ2_real, V), _mm_mul_ps(ZZ2_imag, damp)); 
        ZZ4_real = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(ZZ3_real, damp), _mm_mul_ps(ZZ3_imag, V)), B3); 
        ZZ4_imag = _mm_add_ps(_mm_mul_ps(ZZ3_real, V), _mm_mul_ps(ZZ3_imag, damp)); 
        ZZ5_real = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(ZZ4_real, damp), _mm_mul_ps(ZZ4_imag, V)), B2); 
        ZZ5_imag = _mm_add_ps(_mm_mul_ps(ZZ4_real, V), _mm_mul_ps(ZZ4_imag, damp)); 
        ZZ6_real = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(ZZ5_real, damp), _mm_mul_ps(ZZ5_imag, V)), B1); 
        ZZ6_imag = _mm_add_ps(_mm_mul_ps(ZZ5_real, V), _mm_mul_ps(ZZ5_imag, damp)); 
        ZZ7_real = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(ZZ6_real, damp), _mm_mul_ps(ZZ6_imag, V)), B0); 
        ZZ7_imag = _mm_add_ps(_mm_mul_ps(ZZ6_real, V), _mm_mul_ps(ZZ6_imag, damp)); 
        division_factor = _mm_div_ps(one, _mm_add_ps(_mm_mul_ps(ZZ7_real, ZZ7_real), _mm_mul_ps(ZZ7_imag, ZZ7_imag)));
        ZZZ_real = _mm_mul_ps((_mm_add_ps(_mm_mul_ps(Z6_real, ZZ7_real), _mm_mul_ps(Z6_imag, ZZ7_imag))), division_factor); 

        _mm_stream_ps(&voigt_value[i], ZZZ_real);
    }
}
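The A/B arrays are the numerator and denominator coefficients of a degree-6 over degree-7 rational approximation to the scaled complex error function, evaluated by Horner's rule on z = damp + i*|offs|; the final step takes Re(Z6/ZZ7) via the conjugate product and the squared magnitude. A scalar sketch of the same computation using C complex arithmetic (illustrative only):

#include <complex.h>
#include <math.h>

static float voigt_scalar(float damping, float offset)
{
    static const float A[7] = { 122.607931777104326f, 214.382388694706425f,
                                181.928533092181549f, 93.155580458138441f,
                                30.180142196210589f, 5.912626209773153f,
                                0.564189583562615f };
    static const float B[7] = { 122.60793177387535f, 352.730625110963558f,
                                457.334478783897737f, 348.703917719495792f,
                                170.354001821091472f, 53.992906912940207f,
                                10.479857114260399f };
    const float complex z = damping + I * fabsf(offset);
    float complex num = A[6];     /* Horner evaluation of the degree-6 numerator */
    float complex den = z + B[6]; /* monic degree-7 denominator */
    for (int k = 5; k >= 0; --k) num = num * z + A[k];
    for (int k = 5; k >= 0; --k) den = den * z + B[k];
    return crealf(num / den);
}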
Example #29
0
void L0Smoothing(cv::Mat &im8uc3, cv::Mat& dest, const float lambda, const float kappa)
{
	// convert the image to 32-bit float format
	int row = im8uc3.rows, col = im8uc3.cols;
	int size = row*col;
	cv::Mat S;
	im8uc3.convertTo(S, CV_32FC3, 1./255.);

	cv::Mat fx(1,2,CV_32FC1);
	cv::Mat fy(2,1,CV_32FC1);
	fx.at<float>(0) = 1; fx.at<float>(1) = -1;
	fy.at<float>(0) = 1; fy.at<float>(1) = -1;

	cv::Size sizeI2D = im8uc3.size();	
	cv::Mat otfFx = psf2otf(fx, sizeI2D);
	cv::Mat otfFy = psf2otf(fy, sizeI2D);

	cv::Mat Normin1[3];
	cv::Mat single_channel[3];

	cv::split(S, single_channel);

	cv::Mat buffer(S.size(),CV_32F);

	for (int k = 0; k < 3; k++)
	{
		cv::dft(single_channel[k], Normin1[k], cv::DFT_COMPLEX_OUTPUT);
	}

	cv::Mat Denormin2(row, col, CV_32FC1);


	for (int i = 0; i < row; i++)
	{
		for (int j = 0; j < col; j++)
		{
			cv::Vec2f &c1 = otfFx.at<cv::Vec2f>(i,j), &c2 = otfFy.at<cv::Vec2f>(i,j);
			Denormin2.at<float>(i,j) = SQR(c1[0]) + SQR(c1[1]) + SQR(c2[0]) + SQR(c2[1]);
		}
	}


	// the coupling weight beta grows by a factor of kappa each pass
	float beta = 4.f*lambda;
	// iteration stops once beta reaches betamax; a smaller betamax means
	// fewer passes and a less converged (coarser) segmentation
	double betamax = 1e5;

	cv::Mat Denormin;
	cv::Mat shifted_x;
	cv::Mat shifted_y;
	cv::Mat dx[3], dy[3];

	cv::Mat FNormin2;

	while (beta < betamax)
	{	
		cv::addWeighted(cv::Mat::ones(Denormin2.size(),Denormin2.type()), 1.0, Denormin2, beta, 0.0, Denormin);

		Denormin = 1.f/Denormin;
		// h-v subproblem
		for (int k = 0; k < 3; k++)
		{
			single_channel[k].copyTo(shifted_x);
			circshift(shifted_x, 0, -1, buffer);
			dx[k] = shifted_x - single_channel[k];

			single_channel[k].copyTo(shifted_y);
			circshift(shifted_y, -1, 0, buffer);
			dy[k] = shifted_y - single_channel[k];
		}

		const float lb = lambda/beta;

		float* dx0 = dx[0].ptr<float>(0);
		float* dx1 = dx[1].ptr<float>(0);
		float* dx2 = dx[2].ptr<float>(0);
		float* dy0 = dy[0].ptr<float>(0);
		float* dy1 = dy[1].ptr<float>(0);
		float* dy2 = dy[2].ptr<float>(0);
		const __m128 mlb = _mm_set1_ps(lb);
		cv::Mat buff(4,1,CV_32F);
		float* b = (float*)buff.ptr<float>(0);
		int i=0;
		for(;i<=size-4;i+=4)
		{
			__m128 x =  _mm_load_ps(dx0+i);
			__m128 v = _mm_mul_ps(x,x);
			x =  _mm_load_ps(dx1+i);
			v = _mm_add_ps(v, _mm_mul_ps(x,x));
			x =  _mm_load_ps(dx2+i);
			v = _mm_add_ps(v, _mm_mul_ps(x,x));
			x =  _mm_load_ps(dy0+i);
			v = _mm_add_ps(v, _mm_mul_ps(x,x));
			x =  _mm_load_ps(dy1+i);
			v = _mm_add_ps(v, _mm_mul_ps(x,x));
			x =  _mm_load_ps(dy2+i);
			v = _mm_add_ps(v, _mm_mul_ps(x,x));

			_mm_store_ps(b,v);
			if(b[0]< lb)
			{
				dx0[i]=dx1[i]=dx2[i]=dy0[i]=dy1[i]=dy2[i]=0.f;
			}
			if(b[1]< lb)
			{
				dx0[i+1]=dx1[i+1]=dx2[i+1]=dy0[i+1]=dy1[i+1]=dy2[i+1]=0.f;
			}
			if(b[2]< lb)
			{
				dx0[i+2]=dx1[i+2]=dx2[i+2]=dy0[i+2]=dy1[i+2]=dy2[i+2]=0.f;
			}
			if(b[3]< lb)
			{
				dx0[i+3]=dx1[i+3]=dx2[i+3]=dy0[i+3]=dy1[i+3]=dy2[i+3]=0.f;	
			}
		}
		for(;i<size;i++)
		{
			float v =  dx0[i]*dx0[i]+dx1[i]*dx1[i]+dx2[i]*dx2[i]+dy0[i]*dy0[i]+dy1[i]*dy1[i]+dy2[i]*dy2[i];
			if(v < lb)
			{
				dx0[i]=dx1[i]=dx2[i]=dy0[i]=dy1[i]=dy2[i]=0.f;
			}
		}

		// S subproblem
		for (int k = 0; k < 3; k++)
		{
			dx[k].copyTo(shifted_x);
			circshift(shifted_x, 0, 1, buffer);
			dy[k].copyTo(shifted_y);
			circshift(shifted_y, 1, 0, buffer);

			cv::Mat Normin2 = shifted_x - dx[k] + shifted_y - dy[k];

			cv::dft(Normin2, FNormin2, cv::DFT_COMPLEX_OUTPUT);

			// the loops below compute FS = (Normin1[k] + beta*FNormin2) .* Denormin in place
			float* n1 = (float*)Normin1[k].ptr<cv::Vec2f>(0);
			float* fn2 = (float*)FNormin2.ptr<cv::Vec2f>(0);
			float* D = Denormin.ptr<float>(0);
			const __m128 mbeta = _mm_set1_ps(beta);
			int i=0;
			for(;i<=size*2-4;i+=4)
			{
				__m128 mfn2 =_mm_add_ps(_mm_loadu_ps(n1+i), _mm_mul_ps(mbeta,_mm_loadu_ps(fn2+i)));
				__m128 mn1 = _mm_loadu_ps(D+(i>>1));
				mn1 = _mm_shuffle_ps(mn1,mn1,_MM_SHUFFLE(1, 1, 0, 0));
				mfn2 = _mm_mul_ps(mn1,mfn2);
				_mm_storeu_ps(fn2+i,mfn2);
			}
			for(;i<size*2;i+=2)
			{
				const float dd = D[(i>>1)];
				fn2[i] = dd*(n1[i] + beta*fn2[i]);
				fn2[i+1] = dd*(n1[i+1] + beta*fn2[i+1]);
			}

			cv::idft(FNormin2, single_channel[k], cv::DFT_SCALE | cv::DFT_REAL_OUTPUT);
		}
		beta *= kappa;
	}

	cv::merge(single_channel, 3, S);
	S.convertTo(dest, CV_8UC3, 255.f);
}
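psf2otf and circshift are helpers assumed by the example. A minimal circshift compatible with the calls above, rotating the matrix contents by (dy, dx) with wrap-around and using buffer as scratch (a sketch; the original project's helper may differ):

static void circshift(cv::Mat &mat, int dy, int dx, cv::Mat &buffer)
{
	mat.copyTo(buffer);
	const int rows = mat.rows, cols = mat.cols;
	for (int i = 0; i < rows; i++)
	{
		const int si = ((i - dy) % rows + rows) % rows; // source row after the shift
		for (int j = 0; j < cols; j++)
		{
			const int sj = ((j - dx) % cols + cols) % cols; // source column
			mat.at<float>(i, j) = buffer.at<float>(si, sj);
		}
	}
}

With this convention, circshift(shifted_x, 0, -1, buffer) makes shifted_x(i, j) = S(i, j+1) with wrap-around, so dx[k] = shifted_x - single_channel[k] is exactly the forward difference used by the h-v subproblem.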
Example #30
0
int main()
{
	std::random_device rd;
	std::mt19937 gen(rd());
	std::uniform_real_distribution<float> dis(0, 255);

	size_t max_iter = 20;
	size_t array_size = 800;
	size_t vector_size = array_size*4;

	vfloat32 *vX1, *vX2, *vY , *vY1 , *vY2;
	std::vector<float> vec1(vector_size) , vec2(vector_size) , vecy(vector_size , 0.) , vecy1(vector_size,0.)
	, vecy2(vector_size, 0.);

	// SIMD arrays must be 16-byte aligned
	vX1 =(vfloat32*) _mm_malloc ((size_t) (array_size*sizeof(vfloat32)), 16);
	vX2 =(vfloat32*) _mm_malloc ((size_t) (array_size*sizeof(vfloat32)), 16);
	vY  =(vfloat32*) _mm_malloc ((size_t) (array_size*sizeof(vfloat32)), 16);
	vY1 =(vfloat32*) _mm_malloc ((size_t) (array_size*sizeof(vfloat32)), 16);
	vY2 =(vfloat32*) _mm_malloc ((size_t) (array_size*sizeof(vfloat32)), 16);

	vfloat32 vy = _mm_set_ps(0,0,0,0);

	int j = 0;

	// Initialize vectors and simd arrays
	for(size_t i = 0 ; i < array_size ; ++i)
	{
		float r1 = dis(gen) , r2 = dis(gen) , r3 = dis(gen) , r4 = dis(gen);
		float r5 = dis(gen) , r6 = dis(gen) , r7 = dis(gen) , r8 = dis(gen);

		vec1[j] = r1; vec1[j+1] = r2 ; vec1[j+2] = r3 ; vec1[j+3] = r4;
		vec2[j] = r5; vec2[j+1] = r6 ; vec2[j+2] = r7 ; vec2[j+3] = r8;

		vfloat32 vx1 = _mm_set_ps(r4 , r3 , r2 , r1  );
		vfloat32 vx2 = _mm_set_ps(r8 , r7 , r6 , r5  );

		_mm_store_ps((float*) &vX1[i], vx1);
		_mm_store_ps((float*) &vX2[i], vx2);
		_mm_store_ps((float*) &vY[i], vy);
		_mm_store_ps((float*) &vY1[i], vy);
		_mm_store_ps((float*) &vY2[i], vy);

		j +=4;
	}

	// test for vector addition
	{
		auto start = std::chrono::steady_clock::now();
		vectoradd_simd(vX1,vX2,vY,array_size);
		auto end = std::chrono::steady_clock::now();
		std::chrono::duration<double> diff = end-start;
		// std::cout << "vector addition time with simd: " << diff.count() << " s" << std::endl;

		start = std::chrono::steady_clock::now();
		std::transform( vec1.begin() , vec1.end() , vec2.begin() , vecy.begin() , std::plus<float>());

		end = std::chrono::steady_clock::now();
		std::chrono::duration<double> diff1 = end-start;
		// std::cout << "vector addition time without simd: " << diff1.count() << " s" << std::endl;

		j = 0;
		bool is_valid = true;
		for(size_t i = 0 ; i < array_size ; ++i)
		{
			float out[4] __attribute__((aligned(16))); // aligned store target
			_mm_store_ps(out , vY[i]);

			if ( out[0] == vecy[j] && out[1] == vecy[j+1] && out[2] == vecy[j+2] && out[3] == vecy[j+3])
				{ j += 4;}
			else
			{
				is_valid = false;
				break;
			}
		}

		if(is_valid)
		{
			std::cout << "l'addition de vecteurs en simd est correcte" << std::endl;
			std::cout << "speedup obtained for vector addition with simd : " << diff1.count() / diff.count() << std::endl;
		}
		else
		{
			std::cout << " l'addition de vecteurs end simd est incorrecte" << std::endl;
		}

		std::cout << "\n";
	}

	// test for the dot product
	{
		auto start = std::chrono::steady_clock::now();
		vfloat32 sres = vectordot_simd(vX1 , vX2 , array_size);
		auto end = std::chrono::steady_clock::now();
		std::chrono::duration<double> diff = end-start;
		// std::cout << "dot product time with simd: " << diff.count() << " s" << std::endl;

		start = std::chrono::steady_clock::now();
		float res = std::inner_product( vec1.begin() , vec1.end() , vec2.begin() , 0. );
		end = std::chrono::steady_clock::now();
		std::chrono::duration<double> diff1 = end-start;
		// std::cout << "dot product time without simd: " << diff1.count() << " s" << std::endl;

		float out[4] __attribute__((aligned(16)));
		_mm_store_ps( out , sres);

		if(  std::abs(out[0] - res ) < 0.01f )
		{
			std::cout << "le produit de vecteurs en simd est correct" << std::endl;
			std::cout << "speedup obtained for dot product with simd : " << diff1.count() / diff.count() << std::endl;
		}
		else {std::cout << "the SIMD dot product is incorrect: " << out[0] << "  " << res << std::endl;}

		std::cout << "\n";
	}

	// test for a 1D averaging filter (register rotation), without border checks
	{
		auto start = std::chrono::steady_clock::now();
		float divide = 1.f/3.f;
		for(std::size_t i = 1 ; i < vector_size-1 ; ++i)
		{
			vecy1[i] = divide * ( vec1[i-1] + vec1[i] + vec1[i+1] );
		}
		auto end = std::chrono::steady_clock::now();
		std::chrono::duration<double> diff1 = end-start;;

		start = std::chrono::steady_clock::now();
		vectoravg3_simd(vX1 , vY1 , array_size);
		end = std::chrono::steady_clock::now();
		std::chrono::duration<double> diff = end-start;

		j = 4;
		bool is_valid = true;

		for(size_t i = 1 ; i < array_size-1 ; ++i)
		{
			float out[4] __attribute__((aligned(16)));
			_mm_store_ps(out , vY1[i]);

			if ( is_valid == true && out[0] == vecy1[j] && out[1] == vecy1[j+1] && out[2] == vecy1[j+2] && out[3] == vecy1[j+3])
				{ j += 4;}
			else
			{
				is_valid = false;
				break;
			}
		}

		if(is_valid)
		{
			std::cout << "la filtre moyenneur en simd est correct" << std::endl;
			std::cout << "speedup obtained for average filter with simd : " << diff1.count() / diff.count() << std::endl;
		}
		else
		{
			std::cout << "la filtre moyenneur en simd est incorrect" << std::endl;
		}

		std::cout << "\n";
	}

	bool valid_mandel = false;
	// test for mandelbrot
	{
		std::vector<float> mandel_test(4,0);
		std::vector<float> mandel_test1(4,0);
		std::vector<size_t> indx(4,0);
		vfloat32 mdt = _mm_set1_ps(0);
		vfloat32 mdt1 = _mm_set1_ps(0);

		mandel_test[0] = -0.70;
		mandel_test[1] = -0.80;
		mandel_test[2] = -0.90;
		mandel_test[3] = -1.00;

		mandel_test1[0] = +0.10;
		mandel_test1[1] = +0.30;
		mandel_test1[2] = +0.30;
		mandel_test1[3] = +0.40;

		mdt  = _mm_setr_ps(-1.00, -0.90, -0.80, -0.70);
		mdt1 = _mm_setr_ps(+0.40, +0.30, +0.30, +0.10);

		auto start = std::chrono::steady_clock::now();
		for(std::size_t i = 0 ; i < 4 ; ++i )
		{

			indx[i] = mandelbrot_scalar(mandel_test[i] , mandel_test1[i] , max_iter );
		}
		auto end = std::chrono::steady_clock::now();
		std::chrono::duration<double> diff1 = end-start;;

		start = std::chrono::steady_clock::now();
		vuint32 res_mandel = mandelbrot_simd(mdt, mdt1 , max_iter);
		end = std::chrono::steady_clock::now();
		std::chrono::duration<double> diff = end-start;

		unsigned int out[4] __attribute__((aligned(16))) ;

		__m128i* po = (__m128i*) &out[0] ;

		_mm_store_si128(po, res_mandel);

		bool v1 = false , v2 = false;

		if( indx[0] == 20 && indx[1] == 8 && indx[2] == 10 && indx[3] == 6 )
		{
			v1 = true;
			std::cout << "la fonction mandelbrot en scalaire est correcte" << std::endl;
		}
		else
		{
			std::cout << "la fonction mandelbrot en scalaire est incorrecte" << std::endl;
			std::cout << "le bon résultat est : 20 8 10 6 \n" << "vous avez obtenu : ";
			vec_display(indx,0);
		}


		if( out[3] == 20 && out[2] == 8 && out[1] == 10 && out[0] == 6 )
		{
			v2 = true;
			std::cout << "la fonction mandelbrot en SIMD est correcte" << std::endl;
		}
		else
		{
			std::cout << "la fonction mandelbrot en SIMD est incorrecte" << std::endl;
			std::cout << "le bon résultat est 20 8 10 6 \n" << "vous avez obtenu :  ";
			simd_display_i32(res_mandel);
		}


		if ( v1 && v2 )
		{
			std::cout << "speedup obtained for mandelbrot : " << diff1.count() / diff.count() << std::endl;
			valid_mandel = true;
		}
	}

	// test for mandelbrot function

	{
		if(valid_mandel)
		{

			std::cout << "\n-----------------------------" << std::endl;
			std::cout << "------ benchmandelbrot ------" << std::endl;
			std::cout << "-----------------------------\n" << std::endl;

			size_t h = SIZE , w = SIZE ;
			std::vector<size_t> indx(h*w,0);
			vfloat32 mdt = _mm_set1_ps(0);
			vfloat32 mdt1 = _mm_set1_ps(0);

			float a0 = -1.5 , a1 = +0.5;
			float b0 = -1.0 , b1 = +1.0;

			float avg_cycles_vec = 0;
			float avg_time_vec  = 0;

			size_t num_iter = 200;

			for(size_t i =0 ; i < num_iter ; ++i)
			{
				auto start = std::chrono::steady_clock::now();
				auto cycles_s = rdtsc();
				calc_mandelbrot_scalar( indx , h , w , a0 , a1 , b0 , b1  , max_iter );
				auto cycles_e = rdtsc();
				auto end = std::chrono::steady_clock::now();
				std::chrono::duration<double> diff1 = end-start;

				avg_time_vec += diff1.count() ;

				avg_cycles_vec += cycles_e - cycles_s;
			}

			avg_time_vec /= num_iter ;
			avg_cycles_vec /= num_iter ;

			std::cout << " mandelbrot vector time : " << avg_time_vec << std::endl;
			std::cout << " mandelbrot vector cycles time : " << avg_cycles_vec << std::endl;

			vuint32 **Simd_indx = (vuint32**)_mm_malloc ((size_t)( h*sizeof(vuint32*)), 16);
			if (Simd_indx)
			{
				for (size_t i = 0; i < h; i++) // allocate one row of w SIMD cells per image row
				{
					Simd_indx[i] = (vuint32*) _mm_malloc ((size_t) (w*sizeof(vuint32)), 16);
				}
			}

			float avg_cycles_simd = 0;
			float avg_time_simd  = 0;

			for(size_t i = 0 ; i < num_iter ; ++i)
			{
				auto start = std::chrono::steady_clock::now();
				auto cycles_s = rdtsc();
				calc_mandelbrot_simd( Simd_indx , h , w , a0 , a1 , b0 , b1  , max_iter );
				auto cycles_e = rdtsc();
				auto end = std::chrono::steady_clock::now();
				std::chrono::duration<double> diff = end-start;

				avg_time_simd += diff.count() ;
				avg_cycles_simd += cycles_e - cycles_s;
			}

			avg_time_simd /= num_iter ;
			avg_cycles_simd /= num_iter ;

			std::cout << " mandelbrot SIMD time : " << avg_time_simd << std::endl;
			std::cout << " mandelbrot SIMD cycles time : " << avg_cycles_simd << std::endl;

			std::cout << "speedup obtained for mandelbrot : " << avg_time_vec / avg_time_simd << std::endl;
			std::cout << "speedup in cycles obtained for mandelbrot : " <<  avg_cycles_vec / avg_cycles_simd << std::endl;
		}

	}


	_mm_free(vX1);
	_mm_free(vX2);
	_mm_free(vY);
	_mm_free(vY1);
	_mm_free(vY2);


}
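vectoradd_simd, vectordot_simd, vectoravg3_simd, and the mandelbrot helpers are assumed to be defined elsewhere in the exercise. As one example of the expected shape, here is a sketch of vectoradd_simd consistent with how it is called above (vfloat32 taken to be a typedef for __m128; illustrative only):

void vectoradd_simd(vfloat32 *X1, vfloat32 *X2, vfloat32 *Y, size_t size)
{
	// element-wise addition of two arrays of 4-float SIMD vectors
	for (size_t i = 0; i < size; ++i)
		Y[i] = _mm_add_ps(X1[i], X2[i]);
}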