Ejemplo n.º 1
0
/* Fast remote SCI copy for systems with write-combining enabled.
   This is the version using SSE instructions to copy 128 Byte blocks,
   and flushes after 64 Byte. */
void _mpid_smi_sse64_memcpy(void *dest, const void *src, size_t size)
{
	char*	a = (char*) src;
	char*	b = (char*) dest;
	size_t	j = 0;
	__m128	xmm[8];

	/* Align the destination to a 64 Byte boundary */
	for(; (j < size) && (((size_t) &b[j]) % 64 != 0); j++)
		((char*) b)[j] = ((char*) a)[j];
	
	// Loads two cache lines of data to a location closer to the processor.
	_mm_prefetch(a+j, _MM_HINT_NTA);
	_mm_prefetch(a+j+64, _MM_HINT_NTA);

	/* copy 128 byte per loop */
	for (; (j+128) < size; j+=128) 
	{
		// Loads two cache lines of data to a location closer to the processor.
		_mm_prefetch(a+j+128, _MM_HINT_NTA);
		_mm_prefetch(a+j+192, _MM_HINT_NTA);

		/* load 128 Byte into xmm register */
		xmm[0] = _mm_load_ps((float*) &a[j]);
		xmm[1] = _mm_load_ps((float*) &a[j+16]);
		xmm[2] = _mm_load_ps((float*) &a[j+32]);
		xmm[3] = _mm_load_ps((float*) &a[j+48]);
		xmm[4] = _mm_load_ps((float*) &a[j+64]);
		xmm[5] = _mm_load_ps((float*) &a[j+80]);
		xmm[6] = _mm_load_ps((float*) &a[j+96]);
		xmm[7] = _mm_load_ps((float*) &a[j+112]);

		/* store 64 byte */
		_mm_stream_ps((float*) &b[j], xmm[0]);
		_mm_stream_ps((float*) &b[j+16], xmm[1]);
		_mm_stream_ps((float*) &b[j+32], xmm[2]);
		_mm_stream_ps((float*) &b[j+48], xmm[3]);
		
		/* flush the write-combine buffer */
		_mm_sfence(); 

		/* store 64 byte */
		_mm_stream_ps((float*) &b[j+64], xmm[4]);
		_mm_stream_ps((float*) &b[j+80], xmm[5]);
		_mm_stream_ps((float*) &b[j+96], xmm[6]);
		_mm_stream_ps((float*) &b[j+112], xmm[7]);

		/* flush the write-combine buffer */
		_mm_sfence();  
	}

	/* copy tail */
	for(; j<size; j++)
		((char*) b)[j] = ((char*) a)[j];
}
Ejemplo n.º 2
0
        void dmul(unsigned int N, const double* a, const double* b, double* y) {
            flops_counter += N ;
#ifdef GX_SSE
            if(SSE2_supported) {
                __m128d Y1, Y2, A1, A2, B1, B2 ;
                unsigned int i = 0 ;
                while(i<N) {
                    _mm_prefetch((const char*)(&a[i] + 256), _MM_HINT_NTA) ;
                    _mm_prefetch((const char*)(&b[i] + 256), _MM_HINT_NTA) ;
                    A1 = _mm_load_pd(&a[i]) ;
                    B1 = _mm_load_pd(&b[i]) ;
                    Y1 = _mm_mul_pd(A1,B1) ;
                    i += 2 ;
                    A2 = _mm_load_pd(&a[i]) ;
                    B2 = _mm_load_pd(&b[i]) ;
                    Y2 = _mm_mul_pd(A2,B2) ;
                    i += 2 ;
                    _mm_stream_pd(&y[i - 4], Y1) ;
                    _mm_stream_pd(&y[i - 2], Y2) ;
                }
                _mm_sfence() ;
                return ;
            }
#endif
            for(unsigned int i=0; i<N; i++) {
                y[i] = a[i] * b[i] ;
            }
        }
Ejemplo n.º 3
0
        void dscal(unsigned int N, double a, double* y) {
            flops_counter += N ;
#ifdef GX_SSE
            if(SSE2_supported) {
                __m128d Y1, Y2, AA ;
                SSE_ALIGNED(double temp[2]) ;
                temp[0] = a ; temp[1] = a ;
                AA = _mm_load_pd(temp) ;
                unsigned int i = 0 ;
                while(i<N) {
                    _mm_prefetch((const char*)(&y[i] + 128), _MM_HINT_NTA) ;
                    Y1 = _mm_load_pd(&y[i]) ;
                    Y1 = _mm_mul_pd(Y1, AA) ;
                    i += 2 ;
                    Y2 = _mm_load_pd(&y[i]) ;
                    Y2 = _mm_mul_pd(Y2, AA) ;
                    i += 2 ;
                    _mm_stream_pd(&y[i - 4], Y1) ;
                    _mm_stream_pd(&y[i - 2], Y2) ;
                }
                _mm_sfence() ;
                return ; 
            }
#endif
            for(unsigned int i=0; i<N; i++) {
                y[i] *= a ;
            }
        }
Ejemplo n.º 4
0
void CopyBuffer( byte * dst, const byte * src, int numBytes ) {
	assert_16_byte_aligned( dst );
	assert_16_byte_aligned( src );

	int i = 0;
	for ( ; i + 128 <= numBytes; i += 128 ) {
		__m128i d0 = _mm_load_si128( (__m128i *)&src[i + 0*16] );
		__m128i d1 = _mm_load_si128( (__m128i *)&src[i + 1*16] );
		__m128i d2 = _mm_load_si128( (__m128i *)&src[i + 2*16] );
		__m128i d3 = _mm_load_si128( (__m128i *)&src[i + 3*16] );
		__m128i d4 = _mm_load_si128( (__m128i *)&src[i + 4*16] );
		__m128i d5 = _mm_load_si128( (__m128i *)&src[i + 5*16] );
		__m128i d6 = _mm_load_si128( (__m128i *)&src[i + 6*16] );
		__m128i d7 = _mm_load_si128( (__m128i *)&src[i + 7*16] );
		_mm_stream_si128( (__m128i *)&dst[i + 0*16], d0 );
		_mm_stream_si128( (__m128i *)&dst[i + 1*16], d1 );
		_mm_stream_si128( (__m128i *)&dst[i + 2*16], d2 );
		_mm_stream_si128( (__m128i *)&dst[i + 3*16], d3 );
		_mm_stream_si128( (__m128i *)&dst[i + 4*16], d4 );
		_mm_stream_si128( (__m128i *)&dst[i + 5*16], d5 );
		_mm_stream_si128( (__m128i *)&dst[i + 6*16], d6 );
		_mm_stream_si128( (__m128i *)&dst[i + 7*16], d7 );
	}
	for ( ; i + 16 <= numBytes; i += 16 ) {
		__m128i d = _mm_load_si128( (__m128i *)&src[i] );
		_mm_stream_si128( (__m128i *)&dst[i], d );
	}
	for ( ; i + 4 <= numBytes; i += 4 ) {
		*(uint32 *)&dst[i] = *(const uint32 *)&src[i];
	}
	for ( ; i < numBytes; i++ ) {
		dst[i] = src[i];
	}
	_mm_sfence();
}
Ejemplo n.º 5
0
void laplacian(double* v1, double* v2, int dim_m, int dim_n)
{
	//
#pragma omp parallel for schedule(static)
	for (int j = 1; j < dim_n - 1; ++j )
	{
		int kstart = 1;
		while ( ((long) &v2[j*dim_m + kstart]) & 0x000000000000001F )
		{ 
			kstart++;
		}
		int i = 1;
		for (; i < kstart; ++i)
		{
			kernel_sequential(v1 + j*dim_n + i, v2 + j*dim_n + i, dim_n);	
		}
		for (; i < dim_m - 1 - (dim_m - 1)%4; i = i + 4)
		{
			kernel(v1 + j*dim_n + i, v2 + j*dim_n + i, dim_n);
		}
		//asm volatile ("mfence" ::: "memory");
		for (; i < dim_m - 1; ++i)
		{
			kernel_sequential(v1 + j*dim_n + i, v2 + j*dim_n + i, dim_n);	
		}
	}
#pragma omp parallel 
{	
	_mm_sfence();
}

}
Ejemplo n.º 6
0
Archivo: pmem.c Proyecto: jxy859/nvml
/*
 * predrain_fence_sfence -- (internal) issue the pre-drain fence instruction
 */
static void
predrain_fence_sfence(void)
{
	LOG(15, NULL);

	_mm_sfence();	/* ensure CLWB or CLFLUSHOPT completes before PCOMMIT */
}
Ejemplo n.º 7
0
void process_sse2(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid,
                  void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out)
{
  const dt_iop_zonesystem_data_t *const d = (const dt_iop_zonesystem_data_t *const)piece->data;

  process_common_setup(self, piece, ivoid, ovoid, roi_in, roi_out);

  const int ch = piece->colors;
  const int size = d->params.size;

#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static)
#endif
  for(int j = 0; j < roi_out->height; j++)
  {
    for(int i = 0; i < roi_out->width; i++)
    {
      /* remap lightness into zonemap and apply lightness */
      const float *in = (float *)ivoid + ch * ((size_t)j * roi_out->width + i);
      float *out = (float *)ovoid + ch * ((size_t)j * roi_out->width + i);

      const int rz = CLAMPS(in[0] * d->rzscale, 0, size - 2); // zone index

      const float zs = ((rz > 0) ? (d->zonemap_offset[rz] / in[0]) : 0) + d->zonemap_scale[rz];

      _mm_stream_ps(out, _mm_mul_ps(_mm_load_ps(in), _mm_set1_ps(zs)));
    }
  }

  _mm_sfence();

  process_common_cleanup(self, piece, ivoid, ovoid, roi_in, roi_out);
}
Ejemplo n.º 8
0
/** process, all real work is done here. */
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *i, void *o, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  // this is called for preview and full pipe separately, each with its own pixelpipe piece.
  assert(dt_iop_module_colorspace(self) == iop_cs_Lab);
  // get our data struct:
  dt_iop_colorcontrast_params_t *d = (dt_iop_colorcontrast_params_t *)piece->data;
  // how many colors in our buffer?
  const int ch = piece->colors;
  // iterate over all output pixels (same coordinates as input)
#ifdef _OPENMP
  // optional: parallelize it!
  #pragma omp parallel for default(none) schedule(static) shared(i,o,roi_in,roi_out,d)
#endif
  for(int j=0; j<roi_out->height; j++)
  {

    float *in  = ((float *)i) + ch*roi_in->width *j;
    float *out = ((float *)o) + ch*roi_out->width*j;

    const __m128 scale = _mm_set_ps(0.0f,d->b_steepness,d->a_steepness,1.0f);
    const __m128 offset = _mm_set_ps(0.0f,d->b_offset,d->a_offset,0.0f);
    const __m128 min = _mm_set_ps(0.0f,-128.0f,-128.0f, -INFINITY);
    const __m128 max = _mm_set_ps(0.0f, 128.0f, 128.0f,  INFINITY);


    for(int i=0; i<roi_out->width; i++)
    {
      _mm_stream_ps(out,_mm_min_ps(max,_mm_max_ps(min,_mm_add_ps(offset,_mm_mul_ps(scale,_mm_load_ps(in))))));
      in+=ch;
      out+=ch;
    }
  }
  _mm_sfence();
}
Ejemplo n.º 9
0
bool opt_copy_stream_to_stream( x42memStream_t *dest, x42memStream_t *src,
	size_t elemSize, uint numElems, x42opts_t *opts )
{
	REF_PARAM( opts );
	
	if( (opts->caps & OPT_SSE2) &&
		elemSize <= sizeof( __m128 ) &&
		stream_is_aligned( src ) && stream_pad_ok( src, elemSize ) &&
		stream_is_aligned( dest ) && stream_pad_ok( dest, elemSize ) )
	{ 
		uint i; 

		size_t is = src->stride; 
		size_t os = dest->stride; 

		const __m128i * RESTRICT pi = (__m128i*)src->pStreamZero; 
		__m128i * RESTRICT po = (__m128i*)dest->pStreamZero; 

		for( i = 0; i < numElems; i++ ) 
		{ 
			__m128i v = _mm_load_si128( pi ); 
			_mm_stream_si128( po, v ); 

			pi = (__m128i*)((byte*)pi + is); 
			po = (__m128i*)((byte*)po + os); 
		} 

		_mm_sfence(); 

		return true;
	}
Ejemplo n.º 10
0
Archivo: mu.c Proyecto: B-Rich/coreboot
//FUNC_ATTRIBUTE (noinline)
VOID
MemUDummyCLRead (
  IN       UINT32 Address
  )
{
  _mm_sfence ();
  __readfsbyte (Address);
}
Ejemplo n.º 11
0
Archivo: pmem.c Proyecto: jxy859/nvml
/*
 * drain_pcommit -- (internal) wait for PM stores to drain, pcommit version
 */
static void
drain_pcommit(void)
{
	LOG(15, NULL);

	Func_predrain_fence();
	_mm_pcommit();
	_mm_sfence();
}
Ejemplo n.º 12
0
void g() {
  (void)_mm_getcsr();
  _mm_setcsr(1);
  _mm_sfence();

  _mm_clflush((void*)0);
  _mm_lfence();
  _mm_mfence();
  _mm_pause();
}
Ejemplo n.º 13
0
void
process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void * const ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t * const roi_out)
{
  dt_develop_t *dev = self->dev;

  const int ch = piece->colors;

  const __m128 upper = _mm_set_ps(FLT_MAX,
                                  dev->overexposed.upper / 100.0f,
                                  dev->overexposed.upper / 100.0f,
                                  dev->overexposed.upper / 100.0f);
  const __m128 lower = _mm_set_ps(FLT_MAX,
                                  dev->overexposed.lower / 100.0f,
                                  dev->overexposed.lower / 100.0f,
                                  dev->overexposed.lower / 100.0f);

  const int colorscheme = dev->overexposed.colorscheme;
  const __m128 upper_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][0]);
  const __m128 lower_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][1]);

#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(ovoid) schedule(static)
#endif
  for(int k=0; k<roi_out->height; k++)
  {
    const float *in = ((float *)ivoid) + (size_t)ch*k*roi_out->width;
    float *out = ((float *)ovoid) + (size_t)ch*k*roi_out->width;

    for (int j=0; j<roi_out->width; j++,in+=4,out+=4)
    {
      const __m128 pixel = _mm_load_ps(in);

      __m128 isoe = _mm_cmpge_ps(pixel, upper);
      isoe = _mm_or_ps(_mm_unpacklo_ps(isoe, isoe), _mm_unpackhi_ps(isoe, isoe));
      isoe = _mm_or_ps(_mm_unpacklo_ps(isoe, isoe), _mm_unpackhi_ps(isoe, isoe));

      __m128 isue = _mm_cmple_ps(pixel, lower);
      isue = _mm_and_ps(_mm_unpacklo_ps(isue, isue), _mm_unpackhi_ps(isue, isue));
      isue = _mm_and_ps(_mm_unpacklo_ps(isue, isue), _mm_unpackhi_ps(isue, isue));

      __m128 result = _mm_or_ps(_mm_andnot_ps(isoe, pixel),
                                _mm_and_ps(isoe, upper_color));

      result = _mm_or_ps(_mm_andnot_ps(isue, result),
                         _mm_and_ps(isue, lower_color));

      _mm_stream_ps(out, result);
    }
  }
  _mm_sfence();

  if(piece->pipe->mask_display)
    dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}
Ejemplo n.º 14
0
void
process(
  struct dt_iop_module_t *self,
  dt_dev_pixelpipe_iop_t *piece,
  const void *const ivoid,
  void *ovoid,
  const dt_iop_roi_t *const roi_in,
  const dt_iop_roi_t *const roi_out)
{
  const  float divider  = (float)UINT16_MAX;
  const __m128 dividers = _mm_set_ps1(divider);

#ifdef _OPENMP
  #pragma omp parallel for default(none) schedule(static) shared(ovoid)
#endif
  for(int j = 0; j < roi_out->height; j++)
  {
    const uint16_t *in = ((uint16_t *)ivoid) + (size_t)j * roi_out->width;
    float *out = ((float *)ovoid) + (size_t)j * roi_out->width;

    int i = 0;
    int alignment = ((8 - (j * roi_out->width & (8 - 1))) & (8 - 1));

    // process unaligned pixels
    for ( ; i < alignment ; i++, out++, in++)
      *out = ((float)(*in)) / divider;

    // process aligned pixels with SSE
    for( ; i < roi_out->width - (8 - 1); i += 8, in += 8)
    {
      const __m128i input = _mm_load_si128((__m128i *)in);

      __m128i ilo = _mm_unpacklo_epi16(input, _mm_set1_epi16(0));
      __m128i ihi = _mm_unpackhi_epi16(input, _mm_set1_epi16(0));

      __m128 flo = _mm_cvtepi32_ps(ilo);
      __m128 fhi = _mm_cvtepi32_ps(ihi);

      flo = _mm_div_ps(flo, dividers);
      fhi = _mm_div_ps(fhi, dividers);

      _mm_stream_ps(out, flo);
      out += 4;
      _mm_stream_ps(out, fhi);
      out += 4;
    }

    // process the rest
    for( ; i < roi_out->width; i++, out++, in++)
      *out = ((float)(*in)) / divider;
  }
  _mm_sfence();
}
Ejemplo n.º 15
0
static void process_clip_sse2(dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *const ovoid,
                              const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
                              const float clip)
{
  if(piece->pipe->dsc.filters)
  { // raw mosaic
    const __m128 clipm = _mm_set1_ps(clip);
    const size_t n = (size_t)roi_out->height * roi_out->width;
    float *const out = (float *)ovoid;
    float *const in = (float *)ivoid;
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none)
#endif
    for(size_t j = 0; j < (n & ~3u); j += 4) _mm_stream_ps(out + j, _mm_min_ps(clipm, _mm_load_ps(in + j)));
    _mm_sfence();
    // lets see if there's a non-multiple of four rest to process:
    if(n & 3)
      for(size_t j = n & ~3u; j < n; j++) out[j] = MIN(clip, in[j]);
  }
  else
  {
    const __m128 clipm = _mm_set1_ps(clip);
    const int ch = piece->colors;

#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static)
#endif
    for(int j = 0; j < roi_out->height; j++)
    {
      float *out = (float *)ovoid + (size_t)ch * roi_out->width * j;
      float *in = (float *)ivoid + (size_t)ch * roi_in->width * j;
      for(int i = 0; i < roi_out->width; i++, in += ch, out += ch)
      {
        _mm_stream_ps(out, _mm_min_ps(clipm, _mm_set_ps(in[3], in[2], in[1], in[0])));
      }
    }
    _mm_sfence();
  }
}
Ejemplo n.º 16
0
/*
=====================
R_CopyDecalSurface
=====================
*/
static void R_CopyDecalSurface( idDrawVert * verts, int numVerts, triIndex_t * indexes, int numIndexes,
									const decal_t * decal, const float fadeColor[4] ) {
	assert_16_byte_aligned( &verts[numVerts] );
	assert_16_byte_aligned( &indexes[numIndexes] );
	assert_16_byte_aligned( decal->indexes );
	assert_16_byte_aligned( decal->verts );
	assert( ( ( decal->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 );
	assert( ( ( decal->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
	assert_16_byte_aligned( fadeColor );


	const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
	const __m128i vector_short_num_verts = _mm_packs_epi32( vector_int_num_verts, vector_int_num_verts );
	const __m128 vector_fade_color = _mm_load_ps( fadeColor );
	const __m128i vector_color_mask = _mm_set_epi32( 0, -1, 0, 0 );

	// copy vertices and apply depth/time based fading
	assert_offsetof( idDrawVert, color, 6 * 4 );
	for ( int i = 0; i < decal->numVerts; i++ ) {
		const idDrawVert &srcVert = decal->verts[i];
		idDrawVert &dstVert = verts[numVerts + i];

		__m128i v0 = _mm_load_si128( (const __m128i *)( (byte *)&srcVert +  0 ) );
		__m128i v1 = _mm_load_si128( (const __m128i *)( (byte *)&srcVert + 16 ) );
		__m128 depthFade = _mm_splat_ps( _mm_load_ss( decal->vertDepthFade + i ), 0 );

		__m128 timeDepthFade = _mm_mul_ps( depthFade, vector_fade_color );
		__m128i colorInt = _mm_cvtps_epi32( timeDepthFade );
		__m128i colorShort = _mm_packs_epi32( colorInt, colorInt );
		__m128i colorByte = _mm_packus_epi16( colorShort, colorShort );
		v1 = _mm_or_si128( v1, _mm_and_si128( colorByte, vector_color_mask ) );

		_mm_stream_si128( (__m128i *)( (byte *)&dstVert +  0 ), v0 );
		_mm_stream_si128( (__m128i *)( (byte *)&dstVert + 16 ), v1 );
	}

	// copy indexes
	assert( ( decal->numIndexes & 7 ) == 0 );
	assert( sizeof( triIndex_t ) == 2 );
	for ( int i = 0; i < decal->numIndexes; i += 8 ) {
		__m128i vi = _mm_load_si128( (const __m128i *)&decal->indexes[i] );

		vi = _mm_add_epi16( vi, vector_short_num_verts );

		_mm_stream_si128( (__m128i *)&indexes[numIndexes + i], vi );
	}

	_mm_sfence();

}
Ejemplo n.º 17
0
        void dzero(unsigned int N, double* y) {
#ifdef GX_SSE
            if(SSE2_supported) {
                __m128d Z = _mm_setzero_pd() ;
                for(unsigned int i=0; i<N; i+=4) {
                    _mm_stream_pd(&y[i], Z) ;
                    _mm_stream_pd(&y[i + 2], Z) ;
                }
                _mm_sfence() ;
                return ;
            }
#endif
            memset(y, 0, N*sizeof(double)) ;
        }
Ejemplo n.º 18
0
void f() {
  (void)_mm_getcsr(); // expected-warning{{implicitly declaring library function '_mm_getcsr'}} \
  // expected-note{{include the header <xmmintrin.h> or explicitly provide a declaration for '_mm_getcsr'}}
  _mm_setcsr(1); // expected-warning{{implicitly declaring library function '_mm_setcsr'}} \
  // expected-note{{include the header <xmmintrin.h> or explicitly provide a declaration for '_mm_setcsr'}}
  _mm_sfence(); // expected-warning{{implicitly declaring library function '_mm_sfence'}} \
  // expected-note{{include the header <xmmintrin.h> or explicitly provide a declaration for '_mm_sfence'}}

  _mm_clflush((void*)0); // expected-warning{{implicitly declaring library function '_mm_clflush'}} \
  // expected-note{{include the header <emmintrin.h> or explicitly provide a declaration for '_mm_clflush'}}
  _mm_lfence(); // expected-warning{{implicitly declaring library function '_mm_lfence'}} \
  // expected-note{{include the header <emmintrin.h> or explicitly provide a declaration for '_mm_lfence'}}
  _mm_mfence(); // expected-warning{{implicitly declaring library function '_mm_mfence'}} \
  // expected-note{{include the header <emmintrin.h> or explicitly provide a declaration for '_mm_mfence'}}
  _mm_pause(); // expected-warning{{implicitly declaring library function '_mm_pause'}} \
  // expected-note{{include the header <emmintrin.h> or explicitly provide a declaration for '_mm_pause'}}
}
Ejemplo n.º 19
0
inline __always_inline static void sse2_memzero128aligned(void *ptr, int n)
{
    __m128d d = (__m128d)_mm_setzero_si128 ();

    assert(((stm_word_t)ptr)%16==0);
    assert(n%128==0);
    char *p, *endptr = ((char*)ptr)+n; // = ptr;
    for(p = ptr; p < endptr; p+=128) {
	_mm_stream_pd((double*)&p[0], d);
	_mm_stream_pd((double*)&p[16], d);
	_mm_stream_pd((double*)&p[32], d);
	_mm_stream_pd((double*)&p[48], d);
	_mm_stream_pd((double*)&p[64], d);
	_mm_stream_pd((double*)&p[80], d);
	_mm_stream_pd((double*)&p[96], d);
	_mm_stream_pd((double*)&p[112], d);
    }
    _mm_sfence();
}
Ejemplo n.º 20
0
        void dcopy(unsigned int N, const double* x, double* y) {
#ifdef GX_SSE
            if(SSE2_supported) {
                __m128d X1,X2 ;
                unsigned int i = 0 ;
                while(i<N) {
                    _mm_prefetch((const char*)(&y[i] + 128), _MM_HINT_NTA) ;
                    X1 = _mm_load_pd(&x[i]) ;
                    i+=2 ;
                    X2 = _mm_load_pd(&x[i]) ;
                    i+=2 ;
                    _mm_stream_pd(&y[i - 4], X1) ;
                    _mm_stream_pd(&y[i - 2], X2) ;
                }
                _mm_sfence() ;
                return ;
            }
#endif
            memcpy(y, x, N * sizeof(double)) ;
        }
Ejemplo n.º 21
0
inline static void sse2_memset128aligned(void *ptr, int n, stm_word_t word)
{
#ifdef __LP64__
    __m128d d = (__m128d)_mm_set_epi64((__m64)word, (__m64)word);
#else    
    __m128d d = (__m128d)_mm_set_epi32(word, word, word, word);
#endif
    assert(((stm_word_t)ptr)%16==0);
    assert(n%128==0);
    
    char *p, *endptr = ((char*)ptr)+n; // = ptr;
    for(p = ptr; p < endptr; p+=128) {
	_mm_stream_pd((double*)&p[0], d);
	_mm_stream_pd((double*)&p[16], d);
	_mm_stream_pd((double*)&p[32], d);
	_mm_stream_pd((double*)&p[48], d);
	_mm_stream_pd((double*)&p[64], d);
	_mm_stream_pd((double*)&p[80], d);
	_mm_stream_pd((double*)&p[96], d);
	_mm_stream_pd((double*)&p[112], d);
    }
    _mm_sfence();
}
Ejemplo n.º 22
0
void f0() {
  signed char         tmp_c;
//  unsigned char       tmp_Uc;
  signed short        tmp_s;
#ifdef USE_ALL
  unsigned short      tmp_Us;
#endif
  signed int          tmp_i;
  unsigned int        tmp_Ui;
  signed long long    tmp_LLi;
  unsigned long long  tmp_ULLi;
  float               tmp_f;
  double              tmp_d;

  void*          tmp_vp;
  const void*    tmp_vCp;
  char*          tmp_cp; 
  const char*    tmp_cCp; 
  int*           tmp_ip;
  float*         tmp_fp;
  const float*   tmp_fCp;
  double*        tmp_dp;
  const double*  tmp_dCp;
  long long*     tmp_LLip;

#define imm_i 32
#define imm_i_0_2 0
#define imm_i_0_4 3
#define imm_i_0_8 7
#define imm_i_0_16 15
  // Check this.
#define imm_i_0_256 0

  V2i*   tmp_V2ip;
  V1LLi* tmp_V1LLip;
  V2LLi* tmp_V2LLip;

  // 64-bit
  V8c    tmp_V8c;
  V4s    tmp_V4s;
  V2i    tmp_V2i;
  V1LLi  tmp_V1LLi;
#ifdef USE_3DNOW
  V2f    tmp_V2f;
#endif

  // 128-bit
  V16c   tmp_V16c;
  V8s    tmp_V8s;
  V4i    tmp_V4i;
  V2LLi  tmp_V2LLi;
  V4f    tmp_V4f;
  V2d    tmp_V2d;
  V2d*   tmp_V2dp;
  V4f*   tmp_V4fp;
  const V2d* tmp_V2dCp;
  const V4f* tmp_V4fCp;

  // 256-bit
  V32c   tmp_V32c;
  V4d    tmp_V4d;
  V8f    tmp_V8f;
  V4LLi  tmp_V4LLi;
  V8i    tmp_V8i;
  V4LLi* tmp_V4LLip;
  V4d*   tmp_V4dp;
  V8f*   tmp_V8fp;
  const V4d* tmp_V4dCp;
  const V8f* tmp_V8fCp;

  tmp_V2LLi = __builtin_ia32_undef128();
  tmp_V4LLi = __builtin_ia32_undef256();

  tmp_i = __builtin_ia32_comieq(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_comilt(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_comile(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_comigt(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_comige(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_comineq(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_ucomieq(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_ucomilt(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_ucomile(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_ucomigt(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_ucomige(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_ucomineq(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_comisdeq(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_comisdlt(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_comisdle(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_comisdgt(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_comisdge(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_comisdneq(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_ucomisdeq(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_ucomisdlt(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_ucomisdle(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_ucomisdgt(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_ucomisdge(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_ucomisdneq(tmp_V2d, tmp_V2d);
  tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 0);
  tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 1);
  tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 2);
  tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 3);
  tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 4);
  tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 5);
  tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 6);
  tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 7);
  tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 0);
  tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 1);
  tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 2);
  tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 3);
  tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 4);
  tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 5);
  tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 6);
  tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 7);
  tmp_V4f = __builtin_ia32_minps(tmp_V4f, tmp_V4f);
  tmp_V4f = __builtin_ia32_maxps(tmp_V4f, tmp_V4f);
  tmp_V4f = __builtin_ia32_minss(tmp_V4f, tmp_V4f);
  tmp_V4f = __builtin_ia32_maxss(tmp_V4f, tmp_V4f);

  tmp_V8c = __builtin_ia32_paddsb(tmp_V8c, tmp_V8c);
  tmp_V4s = __builtin_ia32_paddsw(tmp_V4s, tmp_V4s);
  tmp_V8c = __builtin_ia32_psubsb(tmp_V8c, tmp_V8c);
  tmp_V4s = __builtin_ia32_psubsw(tmp_V4s, tmp_V4s);
  tmp_V8c = __builtin_ia32_paddusb(tmp_V8c, tmp_V8c);
  tmp_V4s = __builtin_ia32_paddusw(tmp_V4s, tmp_V4s);
  tmp_V8c = __builtin_ia32_psubusb(tmp_V8c, tmp_V8c);
  tmp_V4s = __builtin_ia32_psubusw(tmp_V4s, tmp_V4s);
  tmp_V4s = __builtin_ia32_pmulhw(tmp_V4s, tmp_V4s);
  tmp_V4s = __builtin_ia32_pmulhuw(tmp_V4s, tmp_V4s);
  tmp_V8c = __builtin_ia32_pcmpeqb(tmp_V8c, tmp_V8c);
  tmp_V4s = __builtin_ia32_pcmpeqw(tmp_V4s, tmp_V4s);
  tmp_V2i = __builtin_ia32_pcmpeqd(tmp_V2i, tmp_V2i);
  tmp_V8c = __builtin_ia32_pcmpgtb(tmp_V8c, tmp_V8c);
  tmp_V4s = __builtin_ia32_pcmpgtw(tmp_V4s, tmp_V4s);
  tmp_V2i = __builtin_ia32_pcmpgtd(tmp_V2i, tmp_V2i);
  tmp_V8c = __builtin_ia32_pmaxub(tmp_V8c, tmp_V8c);
  tmp_V4s = __builtin_ia32_pmaxsw(tmp_V4s, tmp_V4s);
  tmp_V8c = __builtin_ia32_pminub(tmp_V8c, tmp_V8c);
  tmp_V4s = __builtin_ia32_pminsw(tmp_V4s, tmp_V4s);
  tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 0);
  tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 1);
  tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 2);
  tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 3);
  tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 4);
  tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 5);
  tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 6);
  tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 7);
  tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 0);
  tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 1);
  tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 2);
  tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 3);
  tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 4);
  tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 5);
  tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 6);
  tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 7);
  tmp_V2d = __builtin_ia32_minpd(tmp_V2d, tmp_V2d);
  tmp_V2d = __builtin_ia32_maxpd(tmp_V2d, tmp_V2d);
  tmp_V2d = __builtin_ia32_minsd(tmp_V2d, tmp_V2d);
  tmp_V2d = __builtin_ia32_maxsd(tmp_V2d, tmp_V2d);
  tmp_V16c = __builtin_ia32_paddsb128(tmp_V16c, tmp_V16c);
  tmp_V8s = __builtin_ia32_paddsw128(tmp_V8s, tmp_V8s);
  tmp_V16c = __builtin_ia32_psubsb128(tmp_V16c, tmp_V16c);
  tmp_V8s = __builtin_ia32_psubsw128(tmp_V8s, tmp_V8s);
  tmp_V16c = __builtin_ia32_paddusb128(tmp_V16c, tmp_V16c);
  tmp_V8s = __builtin_ia32_paddusw128(tmp_V8s, tmp_V8s);
  tmp_V16c = __builtin_ia32_psubusb128(tmp_V16c, tmp_V16c);
  tmp_V8s = __builtin_ia32_psubusw128(tmp_V8s, tmp_V8s);
  tmp_V8s = __builtin_ia32_pmulhw128(tmp_V8s, tmp_V8s);
  tmp_V16c = __builtin_ia32_pmaxub128(tmp_V16c, tmp_V16c);
  tmp_V8s = __builtin_ia32_pmaxsw128(tmp_V8s, tmp_V8s);
  tmp_V16c = __builtin_ia32_pminub128(tmp_V16c, tmp_V16c);
  tmp_V8s = __builtin_ia32_pminsw128(tmp_V8s, tmp_V8s);
  tmp_V8s = __builtin_ia32_packsswb128(tmp_V8s, tmp_V8s);
  tmp_V4i = __builtin_ia32_packssdw128(tmp_V4i, tmp_V4i);
  tmp_V8s = __builtin_ia32_packuswb128(tmp_V8s, tmp_V8s);
  tmp_V8s = __builtin_ia32_pmulhuw128(tmp_V8s, tmp_V8s);
  tmp_V4f = __builtin_ia32_addsubps(tmp_V4f, tmp_V4f);
  tmp_V2d = __builtin_ia32_addsubpd(tmp_V2d, tmp_V2d);
  tmp_V4f = __builtin_ia32_haddps(tmp_V4f, tmp_V4f);
  tmp_V2d = __builtin_ia32_haddpd(tmp_V2d, tmp_V2d);
  tmp_V4f = __builtin_ia32_hsubps(tmp_V4f, tmp_V4f);
  tmp_V2d = __builtin_ia32_hsubpd(tmp_V2d, tmp_V2d);
  tmp_V8s = __builtin_ia32_phaddw128(tmp_V8s, tmp_V8s);
  tmp_V4s = __builtin_ia32_phaddw(tmp_V4s, tmp_V4s);
  tmp_V4i = __builtin_ia32_phaddd128(tmp_V4i, tmp_V4i);
  tmp_V2i = __builtin_ia32_phaddd(tmp_V2i, tmp_V2i);
  tmp_V8s = __builtin_ia32_phaddsw128(tmp_V8s, tmp_V8s);
  tmp_V4s = __builtin_ia32_phaddsw(tmp_V4s, tmp_V4s);
  tmp_V8s = __builtin_ia32_phsubw128(tmp_V8s, tmp_V8s);
  tmp_V4s = __builtin_ia32_phsubw(tmp_V4s, tmp_V4s);
  tmp_V4i = __builtin_ia32_phsubd128(tmp_V4i, tmp_V4i);
  tmp_V2i = __builtin_ia32_phsubd(tmp_V2i, tmp_V2i);
  tmp_V8s = __builtin_ia32_phsubsw128(tmp_V8s, tmp_V8s);
  tmp_V4s = __builtin_ia32_phsubsw(tmp_V4s, tmp_V4s);
  tmp_V16c = __builtin_ia32_pmaddubsw128(tmp_V16c, tmp_V16c);
  tmp_V8c = __builtin_ia32_pmaddubsw(tmp_V8c, tmp_V8c);
  tmp_V8s = __builtin_ia32_pmulhrsw128(tmp_V8s, tmp_V8s);
  tmp_V4s = __builtin_ia32_pmulhrsw(tmp_V4s, tmp_V4s);
  tmp_V16c = __builtin_ia32_pshufb128(tmp_V16c, tmp_V16c);
  tmp_V8c = __builtin_ia32_pshufb(tmp_V8c, tmp_V8c);
  tmp_V16c = __builtin_ia32_psignb128(tmp_V16c, tmp_V16c);
  tmp_V8c = __builtin_ia32_psignb(tmp_V8c, tmp_V8c);
  tmp_V8s = __builtin_ia32_psignw128(tmp_V8s, tmp_V8s);
  tmp_V4s = __builtin_ia32_psignw(tmp_V4s, tmp_V4s);
  tmp_V4i = __builtin_ia32_psignd128(tmp_V4i, tmp_V4i);
  tmp_V2i = __builtin_ia32_psignd(tmp_V2i, tmp_V2i);
  tmp_V16c = __builtin_ia32_pabsb128(tmp_V16c);
  tmp_V8c = __builtin_ia32_pabsb(tmp_V8c);
  tmp_V8s = __builtin_ia32_pabsw128(tmp_V8s);
  tmp_V4s = __builtin_ia32_pabsw(tmp_V4s);
  tmp_V4i = __builtin_ia32_pabsd128(tmp_V4i);
  tmp_V2i = __builtin_ia32_pabsd(tmp_V2i);
  tmp_V4s = __builtin_ia32_psllw(tmp_V4s, tmp_V1LLi);
  tmp_V2i = __builtin_ia32_pslld(tmp_V2i, tmp_V1LLi);
  tmp_V1LLi = __builtin_ia32_psllq(tmp_V1LLi, tmp_V1LLi);
  tmp_V4s = __builtin_ia32_psrlw(tmp_V4s, tmp_V1LLi);
  tmp_V2i = __builtin_ia32_psrld(tmp_V2i, tmp_V1LLi);
  tmp_V1LLi = __builtin_ia32_psrlq(tmp_V1LLi, tmp_V1LLi);
  tmp_V4s = __builtin_ia32_psraw(tmp_V4s, tmp_V1LLi);
  tmp_V2i = __builtin_ia32_psrad(tmp_V2i, tmp_V1LLi);
  tmp_V2i = __builtin_ia32_pmaddwd(tmp_V4s, tmp_V4s);
  tmp_V8c = __builtin_ia32_packsswb(tmp_V4s, tmp_V4s);
  tmp_V4s = __builtin_ia32_packssdw(tmp_V2i, tmp_V2i);
  tmp_V8c = __builtin_ia32_packuswb(tmp_V4s, tmp_V4s);
  tmp_i = __builtin_ia32_vec_ext_v2si(tmp_V2i, 0);

  __builtin_ia32_incsspd(tmp_Ui);
  __builtin_ia32_incsspq(tmp_ULLi);
  tmp_Ui = __builtin_ia32_rdsspd(tmp_Ui);
  tmp_ULLi = __builtin_ia32_rdsspq(tmp_ULLi);
  __builtin_ia32_saveprevssp();
  __builtin_ia32_rstorssp(tmp_vp);
  __builtin_ia32_wrssd(tmp_Ui, tmp_vp);
  __builtin_ia32_wrssq(tmp_ULLi, tmp_vp);
  __builtin_ia32_wrussd(tmp_Ui, tmp_vp);
  __builtin_ia32_wrussq(tmp_ULLi, tmp_vp);
  __builtin_ia32_setssbsy();
  __builtin_ia32_clrssbsy(tmp_vp);

  (void) __builtin_ia32_ldmxcsr(tmp_Ui);
  (void) _mm_setcsr(tmp_Ui);
  tmp_Ui = __builtin_ia32_stmxcsr();
  tmp_Ui = _mm_getcsr();
  (void)__builtin_ia32_fxsave(tmp_vp);
  (void)__builtin_ia32_fxsave64(tmp_vp);
  (void)__builtin_ia32_fxrstor(tmp_vp);
  (void)__builtin_ia32_fxrstor64(tmp_vp);

  (void)__builtin_ia32_xsave(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xsave64(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xrstor(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xrstor64(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xsaveopt(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xsaveopt64(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xrstors(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xrstors64(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xsavec(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xsavec64(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xsaves(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xsaves64(tmp_vp, tmp_ULLi);

  (void) __builtin_ia32_monitorx(tmp_vp, tmp_Ui, tmp_Ui);
  (void) __builtin_ia32_mwaitx(tmp_Ui, tmp_Ui, tmp_Ui);
  (void) __builtin_ia32_clzero(tmp_vp);
  (void) __builtin_ia32_cldemote(tmp_vp);

  tmp_V4f = __builtin_ia32_cvtpi2ps(tmp_V4f, tmp_V2i);
  tmp_V2i = __builtin_ia32_cvtps2pi(tmp_V4f);
  tmp_i = __builtin_ia32_cvtss2si(tmp_V4f);
  tmp_i = __builtin_ia32_cvttss2si(tmp_V4f);

  tmp_i = __builtin_ia32_rdtsc();
  tmp_i = __rdtsc();
  tmp_i = __builtin_ia32_rdtscp(&tmp_Ui);
  tmp_LLi = __builtin_ia32_rdpmc(tmp_i);
  __builtin_ia32_wbnoinvd();
#ifdef USE_64
  tmp_LLi = __builtin_ia32_cvtss2si64(tmp_V4f);
  tmp_LLi = __builtin_ia32_cvttss2si64(tmp_V4f);
#endif
  tmp_V2i = __builtin_ia32_cvttps2pi(tmp_V4f);
  (void) __builtin_ia32_maskmovq(tmp_V8c, tmp_V8c, tmp_cp);
  (void) __builtin_ia32_storehps(tmp_V2ip, tmp_V4f);
  (void) __builtin_ia32_storelps(tmp_V2ip, tmp_V4f);
  tmp_i = __builtin_ia32_movmskps(tmp_V4f);
  tmp_i = __builtin_ia32_pmovmskb(tmp_V8c);
  (void) __builtin_ia32_movntq(tmp_V1LLip, tmp_V1LLi);
  (void) __builtin_ia32_sfence();
  (void) _mm_sfence();

  tmp_V4s = __builtin_ia32_psadbw(tmp_V8c, tmp_V8c);
  tmp_V4f = __builtin_ia32_rcpps(tmp_V4f);
  tmp_V4f = __builtin_ia32_rcpss(tmp_V4f);
  tmp_V4f = __builtin_ia32_rsqrtps(tmp_V4f);
  tmp_V4f = __builtin_ia32_rsqrtss(tmp_V4f);
  tmp_V4f = __builtin_ia32_sqrtps(tmp_V4f);
  tmp_V4f = __builtin_ia32_sqrtss(tmp_V4f);
  (void) __builtin_ia32_maskmovdqu(tmp_V16c, tmp_V16c, tmp_cp);
  tmp_i = __builtin_ia32_movmskpd(tmp_V2d);
  tmp_i = __builtin_ia32_pmovmskb128(tmp_V16c);
  (void) __builtin_ia32_movnti(tmp_ip, tmp_i);
#ifdef USE_64
  (void) __builtin_ia32_movnti64(tmp_LLip, tmp_LLi);
#endif
  tmp_V2LLi = __builtin_ia32_psadbw128(tmp_V16c, tmp_V16c);
  tmp_V2d = __builtin_ia32_sqrtpd(tmp_V2d);
  tmp_V2d = __builtin_ia32_sqrtsd(tmp_V2d);
  tmp_V2LLi = __builtin_ia32_cvtpd2dq(tmp_V2d);
  tmp_V2i = __builtin_ia32_cvtpd2pi(tmp_V2d);
  tmp_V4f = __builtin_ia32_cvtpd2ps(tmp_V2d);
  tmp_V4i = __builtin_ia32_cvttpd2dq(tmp_V2d);
  tmp_V2i = __builtin_ia32_cvttpd2pi(tmp_V2d);
  tmp_V2d = __builtin_ia32_cvtpi2pd(tmp_V2i);
  tmp_i = __builtin_ia32_cvtsd2si(tmp_V2d);
  tmp_i = __builtin_ia32_cvttsd2si(tmp_V2d);
  tmp_V4f = __builtin_ia32_cvtsd2ss(tmp_V4f, tmp_V2d);
#ifdef USE_64
  tmp_LLi = __builtin_ia32_cvtsd2si64(tmp_V2d);
  tmp_LLi = __builtin_ia32_cvttsd2si64(tmp_V2d);
#endif
  tmp_V4i = __builtin_ia32_cvtps2dq(tmp_V4f);
  tmp_V4i = __builtin_ia32_cvttps2dq(tmp_V4f);
  (void) __builtin_ia32_clflush(tmp_vCp);
  (void) _mm_clflush(tmp_vCp);
  (void) __builtin_ia32_lfence();
  (void) _mm_lfence();
  (void) __builtin_ia32_mfence();
  (void) _mm_mfence();
  (void) __builtin_ia32_pause();
  (void) _mm_pause();
  tmp_V4s = __builtin_ia32_psllwi(tmp_V4s, tmp_i);
  tmp_V2i = __builtin_ia32_pslldi(tmp_V2i, tmp_i);
  tmp_V1LLi = __builtin_ia32_psllqi(tmp_V1LLi, tmp_i);
  tmp_V4s = __builtin_ia32_psrawi(tmp_V4s, tmp_i);
  tmp_V2i = __builtin_ia32_psradi(tmp_V2i, tmp_i);
  tmp_V4s = __builtin_ia32_psrlwi(tmp_V4s, tmp_i);
  tmp_V2i = __builtin_ia32_psrldi(tmp_V2i, tmp_i);
  tmp_V1LLi = __builtin_ia32_psrlqi(tmp_V1LLi, tmp_i);
  tmp_V1LLi = __builtin_ia32_pmuludq(tmp_V2i, tmp_V2i);
  tmp_V2LLi = __builtin_ia32_pmuludq128(tmp_V4i, tmp_V4i);
  tmp_V8s = __builtin_ia32_psraw128(tmp_V8s, tmp_V8s);
  tmp_V4i = __builtin_ia32_psrad128(tmp_V4i, tmp_V4i);
  tmp_V8s = __builtin_ia32_psrlw128(tmp_V8s, tmp_V8s);
  tmp_V4i = __builtin_ia32_psrld128(tmp_V4i, tmp_V4i);
  tmp_V2LLi = __builtin_ia32_psrlq128(tmp_V2LLi, tmp_V2LLi);
  tmp_V8s = __builtin_ia32_psllw128(tmp_V8s, tmp_V8s);
  tmp_V4i = __builtin_ia32_pslld128(tmp_V4i, tmp_V4i);
  tmp_V2LLi = __builtin_ia32_psllq128(tmp_V2LLi, tmp_V2LLi);
  tmp_V8s = __builtin_ia32_psllwi128(tmp_V8s, tmp_i);
  tmp_V4i = __builtin_ia32_pslldi128(tmp_V4i, tmp_i);
  tmp_V2LLi = __builtin_ia32_psllqi128(tmp_V2LLi, tmp_i);
  tmp_V8s = __builtin_ia32_psrlwi128(tmp_V8s, tmp_i);
  tmp_V4i = __builtin_ia32_psrldi128(tmp_V4i, tmp_i);
  tmp_V2LLi = __builtin_ia32_psrlqi128(tmp_V2LLi, tmp_i);
  tmp_V8s = __builtin_ia32_psrawi128(tmp_V8s, tmp_i);
  tmp_V4i = __builtin_ia32_psradi128(tmp_V4i, tmp_i);
  tmp_V8s = __builtin_ia32_pmaddwd128(tmp_V8s, tmp_V8s);
  (void) __builtin_ia32_monitor(tmp_vp, tmp_Ui, tmp_Ui);
  (void) __builtin_ia32_mwait(tmp_Ui, tmp_Ui);
  tmp_V16c = __builtin_ia32_lddqu(tmp_cCp);
  tmp_V2LLi = __builtin_ia32_palignr128(tmp_V2LLi, tmp_V2LLi, imm_i);
  tmp_V1LLi = __builtin_ia32_palignr(tmp_V1LLi, tmp_V1LLi, imm_i);
#ifdef USE_SSE4
  tmp_V16c = __builtin_ia32_pblendvb128(tmp_V16c, tmp_V16c, tmp_V16c);
  tmp_V2d = __builtin_ia32_blendvpd(tmp_V2d, tmp_V2d, tmp_V2d);
  tmp_V4f = __builtin_ia32_blendvps(tmp_V4f, tmp_V4f, tmp_V4f);
  tmp_V8s = __builtin_ia32_packusdw128(tmp_V4i, tmp_V4i);
  tmp_V16c = __builtin_ia32_pmaxsb128(tmp_V16c, tmp_V16c);
  tmp_V4i = __builtin_ia32_pmaxsd128(tmp_V4i, tmp_V4i);
  tmp_V4i = __builtin_ia32_pmaxud128(tmp_V4i, tmp_V4i);
  tmp_V8s = __builtin_ia32_pmaxuw128(tmp_V8s, tmp_V8s);
  tmp_V16c = __builtin_ia32_pminsb128(tmp_V16c, tmp_V16c);
  tmp_V4i = __builtin_ia32_pminsd128(tmp_V4i, tmp_V4i);
  tmp_V4i = __builtin_ia32_pminud128(tmp_V4i, tmp_V4i);
  tmp_V8s = __builtin_ia32_pminuw128(tmp_V8s, tmp_V8s);
  tmp_V2LLi = __builtin_ia32_pmuldq128(tmp_V4i, tmp_V4i);
  tmp_V4f = __builtin_ia32_roundps(tmp_V4f, imm_i_0_16);
  tmp_V4f = __builtin_ia32_roundss(tmp_V4f, tmp_V4f, imm_i_0_16);
  tmp_V2d = __builtin_ia32_roundsd(tmp_V2d, tmp_V2d, imm_i_0_16);
  tmp_V2d = __builtin_ia32_roundpd(tmp_V2d, imm_i_0_16);
  tmp_V4f = __builtin_ia32_insertps128(tmp_V4f, tmp_V4f, imm_i_0_256);
#endif

  tmp_V4d = __builtin_ia32_addsubpd256(tmp_V4d, tmp_V4d);
  tmp_V8f = __builtin_ia32_addsubps256(tmp_V8f, tmp_V8f);
  tmp_V4d = __builtin_ia32_haddpd256(tmp_V4d, tmp_V4d);
  tmp_V8f = __builtin_ia32_hsubps256(tmp_V8f, tmp_V8f);
  tmp_V4d = __builtin_ia32_hsubpd256(tmp_V4d, tmp_V4d);
  tmp_V8f = __builtin_ia32_haddps256(tmp_V8f, tmp_V8f);
  tmp_V4d = __builtin_ia32_maxpd256(tmp_V4d, tmp_V4d);
  tmp_V8f = __builtin_ia32_maxps256(tmp_V8f, tmp_V8f);
  tmp_V4d = __builtin_ia32_minpd256(tmp_V4d, tmp_V4d);
  tmp_V8f = __builtin_ia32_minps256(tmp_V8f, tmp_V8f);
  tmp_V2d = __builtin_ia32_vpermilvarpd(tmp_V2d, tmp_V2LLi);
  tmp_V4f = __builtin_ia32_vpermilvarps(tmp_V4f, tmp_V4i);
  tmp_V4d = __builtin_ia32_vpermilvarpd256(tmp_V4d, tmp_V4LLi);
  tmp_V8f = __builtin_ia32_vpermilvarps256(tmp_V8f, tmp_V8i);
  tmp_V4d = __builtin_ia32_blendvpd256(tmp_V4d, tmp_V4d, tmp_V4d);
  tmp_V8f = __builtin_ia32_blendvps256(tmp_V8f, tmp_V8f, tmp_V8f);
  tmp_V8f = __builtin_ia32_dpps256(tmp_V8f, tmp_V8f, 0x7);
  tmp_V4d = __builtin_ia32_cmppd256(tmp_V4d, tmp_V4d, 0);
  tmp_V8f = __builtin_ia32_cmpps256(tmp_V8f, tmp_V8f, 0);
  tmp_V4f = __builtin_ia32_cvtpd2ps256(tmp_V4d);
  tmp_V8i = __builtin_ia32_cvtps2dq256(tmp_V8f);
  tmp_V4i = __builtin_ia32_cvttpd2dq256(tmp_V4d);
  tmp_V4i = __builtin_ia32_cvtpd2dq256(tmp_V4d);
  tmp_V8i = __builtin_ia32_cvttps2dq256(tmp_V8f);
  tmp_V4d = __builtin_ia32_vperm2f128_pd256(tmp_V4d, tmp_V4d, 0x7);
  tmp_V8f = __builtin_ia32_vperm2f128_ps256(tmp_V8f, tmp_V8f, 0x7);
  tmp_V8i = __builtin_ia32_vperm2f128_si256(tmp_V8i, tmp_V8i, 0x7);
  tmp_V4d = __builtin_ia32_sqrtpd256(tmp_V4d);
  tmp_V8f = __builtin_ia32_sqrtps256(tmp_V8f);
  tmp_V8f = __builtin_ia32_rsqrtps256(tmp_V8f);
  tmp_V8f = __builtin_ia32_rcpps256(tmp_V8f);
  tmp_V4d = __builtin_ia32_roundpd256(tmp_V4d, 0x1);
  tmp_V8f = __builtin_ia32_roundps256(tmp_V8f, 0x1);
  tmp_i = __builtin_ia32_vtestzpd(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_vtestcpd(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_vtestnzcpd(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_vtestzps(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_vtestcps(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_vtestnzcps(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_vtestzpd256(tmp_V4d, tmp_V4d);
  tmp_i = __builtin_ia32_vtestcpd256(tmp_V4d, tmp_V4d);
  tmp_i = __builtin_ia32_vtestnzcpd256(tmp_V4d, tmp_V4d);
  tmp_i = __builtin_ia32_vtestzps256(tmp_V8f, tmp_V8f);
  tmp_i = __builtin_ia32_vtestcps256(tmp_V8f, tmp_V8f);
  tmp_i = __builtin_ia32_vtestnzcps256(tmp_V8f, tmp_V8f);
  tmp_i = __builtin_ia32_ptestz256(tmp_V4LLi, tmp_V4LLi);
  tmp_i = __builtin_ia32_ptestc256(tmp_V4LLi, tmp_V4LLi);
  tmp_i = __builtin_ia32_ptestnzc256(tmp_V4LLi, tmp_V4LLi);
  tmp_i = __builtin_ia32_movmskpd256(tmp_V4d);
  tmp_i = __builtin_ia32_movmskps256(tmp_V8f);
  __builtin_ia32_vzeroall();
  __builtin_ia32_vzeroupper();
  tmp_V32c = __builtin_ia32_lddqu256(tmp_cCp);
  tmp_V2d = __builtin_ia32_maskloadpd(tmp_V2dCp, tmp_V2LLi);
  tmp_V4f = __builtin_ia32_maskloadps(tmp_V4fCp, tmp_V4i);
  tmp_V4d = __builtin_ia32_maskloadpd256(tmp_V4dCp, tmp_V4LLi);
  tmp_V8f = __builtin_ia32_maskloadps256(tmp_V8fCp, tmp_V8i);
  __builtin_ia32_maskstorepd(tmp_V2dp, tmp_V2LLi, tmp_V2d);
  __builtin_ia32_maskstoreps(tmp_V4fp, tmp_V4i, tmp_V4f);
  __builtin_ia32_maskstorepd256(tmp_V4dp, tmp_V4LLi, tmp_V4d);
  __builtin_ia32_maskstoreps256(tmp_V8fp, tmp_V8i, tmp_V8f);

#ifdef USE_3DNOW
  tmp_V8c = __builtin_ia32_pavgusb(tmp_V8c, tmp_V8c);
  tmp_V2i = __builtin_ia32_pf2id(tmp_V2f);
  tmp_V2f = __builtin_ia32_pfacc(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfadd(tmp_V2f, tmp_V2f);
  tmp_V2i = __builtin_ia32_pfcmpeq(tmp_V2f, tmp_V2f);
  tmp_V2i = __builtin_ia32_pfcmpge(tmp_V2f, tmp_V2f);
  tmp_V2i = __builtin_ia32_pfcmpgt(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfmax(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfmin(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfmul(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfrcp(tmp_V2f);
  tmp_V2f = __builtin_ia32_pfrcpit1(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfrcpit2(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfrsqrt(tmp_V2f);
  tmp_V2f = __builtin_ia32_pfrsqit1(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfsub(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfsubr(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pi2fd(tmp_V2i);
  tmp_V4s = __builtin_ia32_pmulhrw(tmp_V4s, tmp_V4s);
  tmp_V2i = __builtin_ia32_pf2iw(tmp_V2f);
  tmp_V2f = __builtin_ia32_pfnacc(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfpnacc(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pi2fw(tmp_V2i);
  tmp_V2f = __builtin_ia32_pswapdsf(tmp_V2f);
  tmp_V2i = __builtin_ia32_pswapdsi(tmp_V2i);

  tmp_V4i = __builtin_ia32_sha1rnds4(tmp_V4i, tmp_V4i, imm_i_0_4);
  tmp_V4i = __builtin_ia32_sha1nexte(tmp_V4i, tmp_V4i);
  tmp_V4i = __builtin_ia32_sha1msg1(tmp_V4i, tmp_V4i);
  tmp_V4i = __builtin_ia32_sha1msg2(tmp_V4i, tmp_V4i);
  tmp_V4i = __builtin_ia32_sha256rnds2(tmp_V4i, tmp_V4i, tmp_V4i);
  tmp_V4i = __builtin_ia32_sha256msg1(tmp_V4i, tmp_V4i);
  tmp_V4i = __builtin_ia32_sha256msg2(tmp_V4i, tmp_V4i);
#endif
}
Ejemplo n.º 23
0
void main() {
	HMODULE newhsacore = ::LoadLibraryW(L"newhsacore64.dll");
	assert(newhsacore != NULL);

	HsaGetDevicesFunction HsaGetDevices = reinterpret_cast<HsaGetDevicesFunction>(::GetProcAddress(newhsacore, "HsaGetDevices"));
	assert(HsaGetDevices != NULL);
	HsaCreateUserModeQueueFunction HsaCreateUserModeQueue = reinterpret_cast<HsaCreateUserModeQueueFunction>(::GetProcAddress(newhsacore, "HsaCreateUserModeQueue"));
	assert(HsaCreateUserModeQueue != NULL);
	HsaDestroyUserModeQueueFunction HsaDestroyUserModeQueue = reinterpret_cast<HsaDestroyUserModeQueueFunction>(::GetProcAddress(newhsacore, "HsaDestroyUserModeQueue"));
	assert(HsaDestroyUserModeQueue != NULL);
	HsaSubmitAqlFunction HsaSubmitAql = reinterpret_cast<HsaSubmitAqlFunction>(::GetProcAddress(newhsacore, "HsaSubmitAql"));
	assert(HsaSubmitAql != NULL);

	HsaCreateSignalFunction HsaCreateSignal = reinterpret_cast<HsaCreateSignalFunction>(::GetProcAddress(newhsacore, "HsaCreateSignal"));
	assert(HsaCreateSignal != NULL);
	HsaDestroySignalFunction HsaDestroySignal = reinterpret_cast<HsaDestroySignalFunction>(::GetProcAddress(newhsacore, "HsaDestroySignal"));
	assert(HsaDestroySignal != NULL);
	HsaWaitOnSignalFunction HsaWaitOnSignal = reinterpret_cast<HsaWaitOnSignalFunction>(::GetProcAddress(newhsacore, "HsaWaitOnSignal"));
	assert(HsaWaitOnSignal != NULL);
	HsaQuerySignalFunction HsaQuerySignal = reinterpret_cast<HsaQuerySignalFunction>(::GetProcAddress(newhsacore, "HsaQuerySignal"));
	assert(HsaQuerySignal != NULL);

	HsaLoadBrigFunction HsaLoadBrig = reinterpret_cast<HsaLoadBrigFunction>(::GetProcAddress(newhsacore, "HsaLoadBrig"));
	assert(HsaLoadBrig != NULL);
	HsaUnloadBrigFunction HsaUnloadBrig = reinterpret_cast<HsaUnloadBrigFunction>(::GetProcAddress(newhsacore, "HsaUnloadBrig"));
	assert(HsaUnloadBrig != NULL);
	HsaFinalizeBrigFunction HsaFinalizeBrig = reinterpret_cast<HsaFinalizeBrigFunction>(::GetProcAddress(newhsacore, "HsaFinalizeBrig"));
	assert(HsaFinalizeBrig != NULL);
	HsaFreeKernelCodeFunction HsaFreeKernelCode = reinterpret_cast<HsaFreeKernelCodeFunction>(::GetProcAddress(newhsacore, "HsaFreeKernelCode"));
	assert(HsaFreeKernelCode != NULL);
	HsaFreeKernelDebugFunction HsaFreeKernelDebug = reinterpret_cast<HsaFreeKernelDebugFunction>(::GetProcAddress(newhsacore, "HsaFreeKernelDebug"));
	assert(HsaFreeKernelDebug != NULL);

	HsaRegisterSystemMemoryFunction HsaRegisterSystemMemory = reinterpret_cast<HsaRegisterSystemMemoryFunction>(::GetProcAddress(newhsacore, "HsaRegisterSystemMemory"));
	assert(HsaRegisterSystemMemory != NULL);
	HsaDeregisterSystemMemoryFunction HsaDeregisterSystemMemory = reinterpret_cast<HsaDeregisterSystemMemoryFunction>(::GetProcAddress(newhsacore, "HsaDeregisterSystemMemory"));
	assert(HsaDeregisterSystemMemory != NULL);

	HsaStatus status = kHsaStatusSuccess;
	const HsaDevice* device = NULL;
	unsigned int deviceCount = 0;

	status = HsaGetDevices(&deviceCount, &device);
	assert(status == kHsaStatusSuccess);
	assert(deviceCount == 1);

	HsaBrig brig;
	memset(&brig, 0, sizeof(brig));
	brig.code_section = hsa_code_section;
	brig.code_section_byte_size = sizeof(hsa_code_section);
	brig.directive_section = hsa_directives_section;
	brig.directive_section_byte_size = sizeof(hsa_directives_section);
	brig.operand_section = hsa_operands_section;
	brig.operand_section_byte_size = sizeof(hsa_operands_section);
	brig.string_section = hsa_strtab_section;
	brig.string_section_byte_size = sizeof(hsa_strtab_section);
	status = HsaLoadBrig(device, &brig);
	assert(status == kHsaStatusSuccess);

	HsaKernelCode *kernelCode = NULL;
	status = HsaFinalizeBrig(device, &brig, "&hsaDemo", "", &kernelCode, NULL);
	assert(status == kHsaStatusSuccess);
	assert(kernelCode != NULL);

	status = HsaUnloadBrig(&brig);
	assert(status == kHsaStatusSuccess);

	HsaQueue* queue = NULL;
	status = HsaCreateUserModeQueue(device, NULL, 0, kHsaQueueTypeCompute, kHsaQueuePriorityMaximum, kHsaQueueFractionTen, &queue);
	assert(status == kHsaStatusSuccess);

	HsaSignal signal = NULL;
	status = HsaCreateSignal(&signal);
	assert(status == kHsaStatusSuccess);

	for (size_t arraySize = 64; arraySize <= 1024 * 1024; arraySize *= 2) {
		uint32_t* xArray = (uint32_t*)::VirtualAlloc(NULL, arraySize * sizeof(uint32_t), MEM_RESERVE, PAGE_READWRITE);
		assert(xArray != NULL);
		xArray = (uint32_t*)::VirtualAlloc(xArray, arraySize * sizeof(uint32_t), MEM_COMMIT, PAGE_READWRITE);
		assert(xArray != NULL);
		memset(xArray, 0x12, arraySize * sizeof(uint32_t));

		uint32_t* yArray = (uint32_t*)::VirtualAlloc(NULL, arraySize * sizeof(uint32_t), MEM_RESERVE, PAGE_READWRITE);
		assert(yArray != NULL);
		yArray = (uint32_t*)::VirtualAlloc(yArray, arraySize * sizeof(uint32_t), MEM_COMMIT, PAGE_READWRITE);
		assert(yArray != NULL);
		memset(yArray, 0x14, arraySize * sizeof(uint32_t));

		uint32_t* zArray = (uint32_t*)::VirtualAlloc(NULL, arraySize * sizeof(uint32_t), MEM_RESERVE, PAGE_READWRITE);
		assert(zArray != NULL);
		zArray = (uint32_t*)::VirtualAlloc(zArray, arraySize * sizeof(uint32_t), MEM_COMMIT, PAGE_READWRITE);
		assert(zArray != NULL);
		memset(zArray, 0x42, arraySize * sizeof(uint32_t));

		for (size_t iteration = 1; iteration <= 5; iteration++) {
			uint64_t kernelArguments[4];
			kernelArguments[0] = reinterpret_cast<uint64_t>(xArray);
			kernelArguments[1] = reinterpret_cast<uint64_t>(yArray);
			kernelArguments[2] = reinterpret_cast<uint64_t>(zArray);
			kernelArguments[3] = static_cast<uint64_t>(arraySize);

			memset(zArray, 0x42, arraySize * sizeof(uint32_t));

			HsaAqlDispatchPacket aqlPacket;
			memset(&aqlPacket, 0, sizeof(aqlPacket));
			aqlPacket.format = kHsaAqlPacketFormatDispatch;
			aqlPacket.invalidate_instruction_cache = 1;
            /* This is very important. Without release fence the system will crash from time to time. */
			aqlPacket.release_fence_scope = 2;
			aqlPacket.dimensions = 1;
			aqlPacket.grid_size[0] = static_cast<uint32_t>(arraySize);
			aqlPacket.grid_size[1] = 1;
			aqlPacket.grid_size[2] = 1;
			aqlPacket.workgroup_size[0] = device->wave_front_size * device->number_compute_units;
			aqlPacket.workgroup_size[1] = 1;
			aqlPacket.workgroup_size[2] = 1;
			aqlPacket.completion_signal = signal;
			aqlPacket.group_segment_size_bytes = kernelCode->workgroup_group_segment_byte_size;
			aqlPacket.private_segment_size_bytes = kernelCode->workitem_private_segment_byte_size;
			aqlPacket.kernel_object_address = reinterpret_cast<uint64_t>(kernelCode);
			aqlPacket.kernel_arg_address = reinterpret_cast<uint64_t>(kernelArguments);

			const uint64_t gpuStartCycles = __rdtsc();
			status = HsaSubmitAql(queue, &aqlPacket);
			assert(status == kHsaStatusSuccess);
			const uint64_t gpuSubmitCycles = __rdtsc();

			status = HsaWaitOnSignal(signal);
			assert(status == kHsaStatusSuccess);
			const uint64_t gpuComputeCycles = __rdtsc();

			size_t countEqual = 0;
			for (size_t i = 0; i < arraySize; i++) {
				if (xArray[i] + yArray[i] == zArray[i])
					countEqual++;
				else if (i < 10)
					printf("%"PRIx32" + %"PRIx32" = %"PRIx32"\n", xArray[i], yArray[i], zArray[i]);
			}
			if (countEqual != arraySize) {
				printf("%Iu\tFAILED (%Iu)\n", arraySize, countEqual);
				break;
			}

			const uint64_t cpuStartCycles = __rdtsc();
			for (size_t i = 0; i < arraySize; i += 16) {
				_mm_stream_si128((__m128i*)&zArray[i], _mm_add_epi32(_mm_load_si128((const __m128i*)&xArray[i]), _mm_load_si128((const __m128i*)&yArray[i])));
				_mm_stream_si128((__m128i*)&zArray[i + 4], _mm_add_epi32(_mm_load_si128((const __m128i*)&xArray[i + 4]), _mm_load_si128((const __m128i*)&yArray[i + 4])));
				_mm_stream_si128((__m128i*)&zArray[i + 8], _mm_add_epi32(_mm_load_si128((const __m128i*)&xArray[i + 8]), _mm_load_si128((const __m128i*)&yArray[i + 8])));
				_mm_stream_si128((__m128i*)&zArray[i + 12], _mm_add_epi32(_mm_load_si128((const __m128i*)&xArray[i + 12]), _mm_load_si128((const __m128i*)&yArray[i + 12])));
			}
			_mm_sfence();
			const uint64_t cpuComputeCycles = __rdtsc();

			const double gpuSubmitTime = double(gpuSubmitCycles - gpuStartCycles) / 3.7e+9;
			const double gpuComputeTime = double(gpuComputeCycles - gpuStartCycles) / 3.7e+9;
			const double cpuComputeTime = double(cpuComputeCycles - cpuStartCycles) / 3.7e+9;
			const double gpuBandwidth = double(arraySize) * double(3 * sizeof(uint32_t)) / gpuComputeTime;
			const double cpuBandwidth = double(arraySize) * double(3 * sizeof(uint32_t)) / cpuComputeTime;
			printf("%Iu\t%.2lf\t%.2lf\t%.2lf\t%.3lf\t%.3lf\n", arraySize, gpuSubmitTime * 1.0e+6, gpuComputeTime * 1.0e+6, cpuComputeTime * 1.0e+6, gpuBandwidth * 1.0e-9, cpuBandwidth * 1.0e-9);
		}

		::VirtualFree(xArray, 0, MEM_DECOMMIT);
		::VirtualFree(xArray, 0, MEM_RELEASE);
		::VirtualFree(yArray, 0, MEM_DECOMMIT);
		::VirtualFree(yArray, 0, MEM_RELEASE);
		::VirtualFree(zArray, 0, MEM_DECOMMIT);
		::VirtualFree(zArray, 0, MEM_RELEASE);
	}

	status = HsaDestroySignal(signal);
	assert(status == kHsaStatusSuccess);

	status = HsaFreeKernelCode(kernelCode);
	assert(status == kHsaStatusSuccess);

	status = HsaDestroyUserModeQueue(queue);
	assert(status == kHsaStatusSuccess);

	BOOL freeHsaCore = ::FreeLibrary(newhsacore);
	assert(freeHsaCore != FALSE);
}
Ejemplo n.º 24
0
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  float *in;
  float *out;
  dt_iop_zonesystem_gui_data_t *g = NULL;
  dt_iop_zonesystem_data_t *data = (dt_iop_zonesystem_data_t*)piece->data;

  guchar *buffer = NULL;
  if( self->dev->gui_attached && piece->pipe->type == DT_DEV_PIXELPIPE_PREVIEW )
  {
    g = (dt_iop_zonesystem_gui_data_t *)self->gui_data;
    dt_pthread_mutex_lock(&g->lock);
    if(g->preview_buffer)
      g_free (g->preview_buffer);

    buffer = g->preview_buffer = g_malloc (roi_in->width*roi_in->height);
    g->preview_width=roi_out->width;
    g->preview_height=roi_out->height;
  }

  /* calculate zonemap */
  const int size = data->size;
  float zonemap[MAX_ZONE_SYSTEM_SIZE]= {-1};
  _iop_zonesystem_calculate_zonemap (data, zonemap);
  const int ch = piece->colors;

  /* if gui and have buffer lets gaussblur and fill buffer with zone indexes */
  if( self->dev->gui_attached && g && buffer)
  {
    /* setup gaussian kernel */
    const int radius = 8;
    const int rad = MIN(radius, ceilf(radius * roi_in->scale / piece->iscale));
    const int wd = 2*rad+1;
    float mat[wd*wd];
    float *m;
    const float sigma2 = (2.5*2.5)*(radius*roi_in->scale/piece->iscale)*(radius*roi_in->scale/piece->iscale);
    float weight = 0.0f;

    memset(mat, 0, wd*wd*sizeof(float));

    m = mat;
    for(int l=-rad; l<=rad; l++) for(int k=-rad; k<=rad; k++,m++)
        weight += *m = expf(- (l*l + k*k)/(2.f*sigma2));
    m = mat;
    for(int l=-rad; l<=rad; l++) for(int k=-rad; k<=rad; k++,m++)
        *m /= weight;

    /* gauss blur the L channel */
#ifdef _OPENMP
    #pragma omp parallel for default(none) private(in, out, m) shared(mat, ivoid, ovoid, roi_out, roi_in) schedule(static)
#endif
    for(int j=rad; j<roi_out->height-rad; j++)
    {
      in  = ((float *)ivoid) + ch*(j*roi_in->width  + rad);
      out = ((float *)ovoid) + ch*(j*roi_out->width + rad);
      for(int i=rad; i<roi_out->width-rad; i++)
      {
        for(int c=0; c<3; c++) out[c] = 0.0f;
        float sum = 0.0;
        m = mat;
        for(int l=-rad; l<=rad; l++)
        {
          float *inrow = in + ch*(l*roi_in->width-rad);
          for(int k=-rad; k<=rad; k++,inrow+=ch,m++)
            sum += *m * inrow[0];
        }
        out[0] = sum;
        out += ch;
        in += ch;
      }
    }

    /* create zonemap preview */
//     in  = (float *)ivoid;
    out = (float *)ovoid;
#ifdef _OPENMP
    #pragma omp parallel for default(none) shared(roi_out,out,buffer,g,zonemap) schedule(static)
#endif
    for (int k=0; k<roi_out->width*roi_out->height; k++)
    {
      buffer[k] = _iop_zonesystem_zone_index_from_lightness (out[ch*k]/100.0f, zonemap, size);
    }

    dt_pthread_mutex_unlock(&g->lock);
  }

  /* process the image */
  in  = (float *)ivoid;
  out = (float *)ovoid;

  const float rzscale = (size-1)/100.0f;

  float zonemap_offset[MAX_ZONE_SYSTEM_SIZE]= {-1};
  float zonemap_scale[MAX_ZONE_SYSTEM_SIZE]= {-1};

  // precompute scale and offset
  for (int k=0; k < size-1; k++) zonemap_scale[k]  = (zonemap[k+1]-zonemap[k])*(size-1);
  for (int k=0; k < size-1; k++) zonemap_offset[k] = 100.0f * ((k+1)*zonemap[k] - k*zonemap[k+1]) ;

#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(roi_out, in, out, zonemap_scale,zonemap_offset) schedule(static)
#endif
  for (int j=0; j<roi_out->height; j++)
    for (int i=0; i<roi_out->width; i++)
    {
      /* remap lightness into zonemap and apply lightness */
      const float *inp = in + ch*(j*roi_out->width+i);
      float *outp = out + ch*(j*roi_out->width+i);

      const int rz = CLAMPS(inp[0]*rzscale, 0, size-2);  // zone index

      const float zs = ((rz > 0) ? (zonemap_offset[rz]/inp[0]) : 0) + zonemap_scale[rz];

      _mm_stream_ps(outp,_mm_mul_ps(_mm_load_ps(inp),_mm_set1_ps(zs)));
    }

  _mm_sfence();

  if(piece->pipe->mask_display)
    dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}
// =============================================================================
//
// sse3_vChirpData
// version by: Alex Kan
//   http://tbp.berkeley.edu/~alexkan/seti/
//
int sse3_ChirpData_ak(
  sah_complex * cx_DataArray,
  sah_complex * cx_ChirpDataArray,
  int chirp_rate_ind,
  double chirp_rate,
  int  ul_NumDataPoints,
  double sample_rate
) {
  int i;

  if (chirp_rate_ind == 0) {
    memcpy(cx_ChirpDataArray, cx_DataArray,  (int)ul_NumDataPoints * sizeof(sah_complex)  );
    return 0;
  }

  int vEnd;  
  double srate = chirp_rate * 0.5 / (sample_rate * sample_rate);
  __m128d rate = _mm_set1_pd(chirp_rate * 0.5 / (sample_rate * sample_rate));
  __m128d roundVal = _mm_set1_pd(srate >= 0.0 ? TWO_TO_52 : -TWO_TO_52);

  // main vectorised loop
  vEnd = ul_NumDataPoints - (ul_NumDataPoints & 3);
  for (i = 0; i < vEnd; i += 4) {
    const float *data = (const float *) (cx_DataArray + i);
    float *chirped = (float *) (cx_ChirpDataArray + i);
    __m128d di = _mm_set1_pd(i);
    __m128d a1 = _mm_add_pd(_mm_set_pd(1.0, 0.0), di);
    __m128d a2 = _mm_add_pd(_mm_set_pd(3.0, 2.0), di);

    __m128 d1, d2;
    __m128 cd1, cd2;
    __m128 td1, td2;
    __m128 x;
    __m128 y;
    __m128 s;
    __m128 c;
    __m128 m;

    // load the signal to be chirped
    prefetchnta((const void *)( data+32 ));
    d1 = _mm_load_ps(data);
    d2 = _mm_load_ps(data+4);

    // calculate the input angle
    a1 = _mm_mul_pd(_mm_mul_pd(a1, a1), rate);
    a2 = _mm_mul_pd(_mm_mul_pd(a2, a2), rate);

    // reduce the angle to the range (-0.5, 0.5)
    a1 = _mm_sub_pd(a1, _mm_sub_pd(_mm_add_pd(a1, roundVal), roundVal));
    a2 = _mm_sub_pd(a2, _mm_sub_pd(_mm_add_pd(a2, roundVal), roundVal));

    // convert pair of packed double into packed single
    x = _mm_movelh_ps(_mm_cvtpd_ps(a1), _mm_cvtpd_ps(a2));

    // square to the range [0, 0.25)
    y = _mm_mul_ps(x, x);

    // perform the initial polynomial approximations
    s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, SS4),
                                    SS3),
                                y),
                          SS2),
                    y),
              SS1),
          x);
    c = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, CC3),
                                CC2),
                          y),
                    CC1),
              y),
          ONE);

    // perform first angle doubling
    x = _mm_sub_ps(_mm_mul_ps(c, c), _mm_mul_ps(s, s));
    y = _mm_mul_ps(_mm_mul_ps(s, c), TWO);

    // calculate scaling factor to correct the magnitude
    //      m1 = vec_nmsub(y1, y1, vec_nmsub(x1, x1, TWO));
    //      m2 = vec_nmsub(y2, y2, vec_nmsub(x2, x2, TWO));
    m = vec_recip3(_mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y)));

    // perform second angle doubling
    c = _mm_sub_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
    s = _mm_mul_ps(_mm_mul_ps(y, x), TWO);

    // correct the magnitude (final sine / cosine approximations)
    s = _mm_mul_ps(s, m);
    c = _mm_mul_ps(c, m);

    // chirp the data
    cd1 = _mm_shuffle_ps(c, c, 0x50);
    cd2 = _mm_shuffle_ps(c, c, 0xfa);
    cd1 = _mm_mul_ps(cd1, d1);
    cd2 = _mm_mul_ps(cd2, d2);
    d1 = _mm_shuffle_ps(d1, d1, 0xb1);
    d2 = _mm_shuffle_ps(d2, d2, 0xb1);
    td1 = _mm_shuffle_ps(s, s, 0x50);
    td2 = _mm_shuffle_ps(s, s, 0xfa);
    td1 = _mm_mul_ps(td1, d1);
    td2 = _mm_mul_ps(td2, d2);
    cd1 = _mm_addsub_ps(cd1, td1);
    cd2 = _mm_addsub_ps(cd2, td2);

    // store chirped values
    _mm_stream_ps(chirped, cd1);
    _mm_stream_ps(chirped+4, cd2);
  }
  _mm_sfence();

  // handle tail elements with scalar code
  for (   ; i < ul_NumDataPoints; ++i) {
    double angle = srate * i * i * 0.5;
    double s = sin(angle);
    double c = cos(angle);
    float re = cx_DataArray[i][0];
    float im = cx_DataArray[i][1];

    cx_ChirpDataArray[i][0] = re * c - im * s;
    cx_ChirpDataArray[i][1] = re * s + im * c;
  }
  analysis_state.FLOP_counter+=12.0*ul_NumDataPoints;

  return 0;
}
Ejemplo n.º 26
0
void
process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  const dt_iop_colorout_data_t *const d = (dt_iop_colorout_data_t *)piece->data;
  const int ch = piece->colors;
  const int gamutcheck = (d->softproof_enabled == DT_SOFTPROOF_GAMUTCHECK);

  if(!isnan(d->cmatrix[0]))
  {
    //fprintf(stderr,"Using cmatrix codepath\n");
    // convert to rgb using matrix
#ifdef _OPENMP
    #pragma omp parallel for schedule(static) default(none) shared(roi_in,roi_out, ivoid, ovoid)
#endif
    for(int j=0; j<roi_out->height; j++)
    {

      float *in  = (float*)ivoid + ch*roi_in->width *j;
      float *out = (float*)ovoid + ch*roi_out->width*j;
      const __m128 m0 = _mm_set_ps(0.0f,d->cmatrix[6],d->cmatrix[3],d->cmatrix[0]);
      const __m128 m1 = _mm_set_ps(0.0f,d->cmatrix[7],d->cmatrix[4],d->cmatrix[1]);
      const __m128 m2 = _mm_set_ps(0.0f,d->cmatrix[8],d->cmatrix[5],d->cmatrix[2]);

      for(int i=0; i<roi_out->width; i++, in+=ch, out+=ch )
      {
        const __m128 xyz = dt_Lab_to_XYZ_SSE(_mm_load_ps(in));
        const __m128 t = _mm_add_ps(_mm_mul_ps(m0,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(0,0,0,0))),_mm_add_ps(_mm_mul_ps(m1,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(1,1,1,1))),_mm_mul_ps(m2,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(2,2,2,2)))));

        _mm_stream_ps(out,t);
      }
    }
    _mm_sfence();
    // apply profile
#ifdef _OPENMP
    #pragma omp parallel for schedule(static) default(none) shared(roi_in,roi_out, ivoid, ovoid)
#endif
    for(int j=0; j<roi_out->height; j++)
    {

      float *in  = (float*)ivoid + ch*roi_in->width *j;
      float *out = (float*)ovoid + ch*roi_out->width*j;

      for(int i=0; i<roi_out->width; i++, in+=ch, out+=ch )
      {
        for(int i=0; i<3; i++)
          if (d->lut[i][0] >= 0.0f)
          {
            out[i] = (out[i] < 1.0f) ? lerp_lut(d->lut[i], out[i]) : dt_iop_eval_exp(d->unbounded_coeffs[i], out[i]);
          }
      }
    }
  }
  else
  {
    float *in  = (float*)ivoid;
    float *out = (float*)ovoid;
    const int rowsize=roi_out->width * 3;
    //fprintf(stderr,"Using xform codepath\n");

#ifdef _OPENMP
    #pragma omp parallel for schedule(static) default(none) shared(out, roi_out, in)
#endif
    for (int k=0; k<roi_out->height; k++)
    {
      float Lab[rowsize];
      float rgb[rowsize];

      const int m=(k*(roi_out->width*ch));
      for (int l=0; l<roi_out->width; l++)
      {
        int li=3*l,ii=ch*l;
        Lab[li+0] = in[m+ii+0];
        Lab[li+1] = in[m+ii+1];
        Lab[li+2] = in[m+ii+2];
      }

      cmsDoTransform (d->xform, Lab, rgb, roi_out->width);

      for (int l=0; l<roi_out->width; l++)
      {
        int oi=ch*l, ri=3*l;
        if(gamutcheck && (rgb[ri+0] < 0.0f || rgb[ri+1] < 0.0f || rgb[ri+2] < 0.0f))
        {
          out[m+oi+0] = 0.0f;
          out[m+oi+1] = 1.0f;
          out[m+oi+2] = 1.0f;
        }
        else
        {
          out[m+oi+0] = rgb[ri+0];
          out[m+oi+1] = rgb[ri+1];
          out[m+oi+2] = rgb[ri+2];
        }
      }
    }
  }

  if(piece->pipe->mask_display)
    dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}
Ejemplo n.º 27
0
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
    const int filters = dt_image_flipped_filter(&piece->pipe->image);
    dt_iop_temperature_data_t *d = (dt_iop_temperature_data_t *)piece->data;
    if(!dt_dev_pixelpipe_uses_downsampled_input(piece->pipe) && filters && piece->pipe->image.bpp != 4)
    {
        const float coeffsi[3] = {d->coeffs[0]/65535.0f, d->coeffs[1]/65535.0f, d->coeffs[2]/65535.0f};
#ifdef _OPENMP
        #pragma omp parallel for default(none) shared(roi_out, ivoid, ovoid, d) schedule(static)
#endif
        for(int j=0; j<roi_out->height; j++)
        {
            int i=0;
            const uint16_t *in = ((uint16_t *)ivoid) + j*roi_out->width;
            float *out = ((float*)ovoid) + j*roi_out->width;

            // process unaligned pixels
            for ( ; i < ((4-(j*roi_out->width & 3)) & 3) ; i++,out++,in++)
                *out = *in * coeffsi[FC(j+roi_out->y, i+roi_out->x, filters)];

            const __m128 coeffs = _mm_set_ps(coeffsi[FC(j+roi_out->y, roi_out->x+i+3, filters)],
                                             coeffsi[FC(j+roi_out->y, roi_out->x+i+2, filters)],
                                             coeffsi[FC(j+roi_out->y, roi_out->x+i+1, filters)],
                                             coeffsi[FC(j+roi_out->y, roi_out->x+i  , filters)]);

            // process aligned pixels with SSE
            for( ; i < roi_out->width - 3 ; i+=4,out+=4,in+=4)
            {
                _mm_stream_ps(out,_mm_mul_ps(coeffs,_mm_set_ps(in[3],in[2],in[1],in[0])));
            }

            // process the rest
            for( ; i<roi_out->width; i++,out++,in++)
                *out = *in * coeffsi[FC(j+roi_out->y, i+roi_out->x, filters)];
        }
        _mm_sfence();
    }
    else if(!dt_dev_pixelpipe_uses_downsampled_input(piece->pipe) && filters && piece->pipe->image.bpp == 4)
    {
#ifdef _OPENMP
        #pragma omp parallel for default(none) shared(roi_out, ivoid, ovoid, d) schedule(static)
#endif
        for(int j=0; j<roi_out->height; j++)
        {
            const float *in = ((float *)ivoid) + j*roi_out->width;
            float *out = ((float*)ovoid) + j*roi_out->width;
            for(int i=0; i<roi_out->width; i++,out++,in++)
                *out = *in * d->coeffs[FC(j+roi_out->x, i+roi_out->y, filters)];
        }
    }
    else
    {
        const int ch = piece->colors;
#ifdef _OPENMP
        #pragma omp parallel for default(none) shared(roi_out, ivoid, ovoid, d) schedule(static)
#endif
        for(int k=0; k<roi_out->height; k++)
        {
            const float *in = ((float*)ivoid) + ch*k*roi_out->width;
            float *out = ((float*)ovoid) + ch*k*roi_out->width;
            for (int j=0; j<roi_out->width; j++,in+=ch,out+=ch)
                for(int c=0; c<3; c++) out[c] = in[c]*d->coeffs[c];
        }
    }
    for(int k=0; k<3; k++)
        piece->pipe->processed_maximum[k] = d->coeffs[k] * piece->pipe->processed_maximum[k];
}
Ejemplo n.º 28
0
/* Insert a key-value entry into a hash table. */
int
clht_put(clht_t* h, clht_addr_t key, clht_val_t val) 
{
  clht_hashtable_t* hashtable = h->ht;
  size_t bin = clht_hash(hashtable, key);
  volatile bucket_t* bucket = hashtable->table + bin;

#if CLHT_READ_ONLY_FAIL == 1
  if (bucket_exists(bucket, key))
    {
      return false;
    }
#endif

  clht_lock_t* lock = &bucket->lock;
  while (!LOCK_ACQ(lock, hashtable))
    {
      hashtable = h->ht;
      size_t bin = clht_hash(hashtable, key);

      bucket = hashtable->table + bin;
      lock = &bucket->lock;
    }

  CLHT_GC_HT_VERSION_USED(hashtable);
  CLHT_CHECK_STATUS(h);
  clht_addr_t* empty = NULL;
  clht_val_t* empty_v = NULL;

  uint32_t j;
  do 
    {
      for (j = 0; j < ENTRIES_PER_BUCKET; j++) 
	{
	  if (bucket->key[j] == key) 
	    {
	      LOCK_RLS(lock);
	      return false;
	    }
	  else if (empty == NULL && bucket->key[j] == 0)
	    {
	      empty = (clht_addr_t*) &bucket->key[j];
	      empty_v = &bucket->val[j];
	    }
	}
        
      int resize = 0;
      if (likely(bucket->next == NULL))
	{
	  if (unlikely(empty == NULL))
	    {
	      DPP(put_num_failed_expand);

	      bucket_t* b = clht_bucket_create_stats(hashtable, &resize);
	      b->val[0] = val;
#ifdef __tile__
	      /* keep the writes in order */
	      _mm_sfence();
#endif
	      b->key[0] = key;
#ifdef __tile__
	      /* make sure they are visible */
	      _mm_sfence();
#endif
	      bucket->next = b;
	    }
	  else 
	    {
	      *empty_v = val;
#ifdef __tile__
	      /* keep the writes in order */
	      _mm_sfence();
#endif
	      *empty = key;
	    }

	  LOCK_RLS(lock);
	  if (unlikely(resize))
	    {
	      /* ht_resize_pes(h, 1); */
	      ht_status(h, 1, 0);
	    }
	  return true;
	}
      bucket = bucket->next;
    }
  while (true);
}
Ejemplo n.º 29
0
	void __cdecl VDFastMemcpyFinishMMX2() {
		_mm_empty();
		_mm_sfence();
	}
Ejemplo n.º 30
0
void process_sse2(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid,
                  void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out)
{
  const dt_iop_rawprepare_data_t *const d = (dt_iop_rawprepare_data_t *)piece->data;

  // fprintf(stderr, "roi in %d %d %d %d\n", roi_in->x, roi_in->y, roi_in->width, roi_in->height);
  // fprintf(stderr, "roi out %d %d %d %d\n", roi_out->x, roi_out->y, roi_out->width, roi_out->height);

  const float scale = roi_in->scale / piece->iscale;
  const int csx = (int)roundf((float)d->x * scale), csy = (int)roundf((float)d->y * scale);

  if(!dt_dev_pixelpipe_uses_downsampled_input(piece->pipe) && piece->pipe->filters)
  { // raw mosaic
#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static)
#endif
    for(int j = 0; j < roi_out->height; j++)
    {
      const uint16_t *in = ((uint16_t *)ivoid) + ((size_t)roi_in->width * (j + csy) + csx);
      float *out = ((float *)ovoid) + (size_t)roi_out->width * j;

      int i = 0;

      // FIXME: figure alignment!  !!! replace with for !!!
      while((!dt_is_aligned(in, 16) || !dt_is_aligned(out, 16)) && (i < roi_out->width))
      {
        const int id = BL(roi_out, d, j, i);
        *out = (((float)(*in)) - d->sub[id]) / d->div[id];
        i++;
        in++;
        out++;
      }

      const __m128 sub = _mm_set_ps(d->sub[BL(roi_out, d, j, i + 3)], d->sub[BL(roi_out, d, j, i + 2)],
                                    d->sub[BL(roi_out, d, j, i + 1)], d->sub[BL(roi_out, d, j, i)]);

      const __m128 div = _mm_set_ps(d->div[BL(roi_out, d, j, i + 3)], d->div[BL(roi_out, d, j, i + 2)],
                                    d->div[BL(roi_out, d, j, i + 1)], d->div[BL(roi_out, d, j, i)]);

      // process aligned pixels with SSE
      for(; i < roi_out->width - (8 - 1); i += 8, in += 8)
      {
        const __m128i input = _mm_load_si128((__m128i *)in);

        __m128i ilo = _mm_unpacklo_epi16(input, _mm_set1_epi16(0));
        __m128i ihi = _mm_unpackhi_epi16(input, _mm_set1_epi16(0));

        __m128 flo = _mm_cvtepi32_ps(ilo);
        __m128 fhi = _mm_cvtepi32_ps(ihi);

        flo = _mm_div_ps(_mm_sub_ps(flo, sub), div);
        fhi = _mm_div_ps(_mm_sub_ps(fhi, sub), div);

        _mm_stream_ps(out, flo);
        out += 4;
        _mm_stream_ps(out, fhi);
        out += 4;
      }

      // process the rest
      for(; i < roi_out->width; i++, in++, out++)
      {
        const int id = BL(roi_out, d, j, i);
        *out = MAX(0.0f, ((float)(*in)) - d->sub[id]) / d->div[id];
      }
    }

    piece->pipe->filters = dt_rawspeed_crop_dcraw_filters(self->dev->image_storage.filters, csx, csy);
    adjust_xtrans_filters(piece->pipe, csx, csy);
  }
  else
  { // pre-downsampled buffer that needs black/white scaling

    const __m128 sub = _mm_load_ps(d->sub), div = _mm_load_ps(d->div);

#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static)
#endif
    for(int j = 0; j < roi_out->height; j++)
    {
      const float *in = ((float *)ivoid) + (size_t)4 * (roi_in->width * (j + csy) + csx);
      float *out = ((float *)ovoid) + (size_t)4 * roi_out->width * j;

      // process aligned pixels with SSE
      for(int i = 0; i < roi_out->width; i++, in += 4, out += 4)
      {
        const __m128 input = _mm_load_ps(in);

        const __m128 scaled = _mm_div_ps(_mm_sub_ps(input, sub), div);

        _mm_stream_ps(out, scaled);
      }
    }
  }
  _mm_sfence();
}