/* Fast remote SCI copy for systems with write-combining enabled. This is the version using SSE instructions to copy 128-byte blocks, flushing after every 64 bytes. */ void _mpid_smi_sse64_memcpy(void *dest, const void *src, size_t size) { char* a = (char*) src; char* b = (char*) dest; size_t j = 0; __m128 xmm[8]; /* Align the destination to a 64-byte boundary */ for(; (j < size) && (((size_t) &b[j]) % 64 != 0); j++) ((char*) b)[j] = ((char*) a)[j]; // Loads two cache lines of data to a location closer to the processor. _mm_prefetch(a+j, _MM_HINT_NTA); _mm_prefetch(a+j+64, _MM_HINT_NTA); /* copy 128 bytes per iteration */ for (; (j+128) < size; j+=128) { // Loads two cache lines of data to a location closer to the processor. _mm_prefetch(a+j+128, _MM_HINT_NTA); _mm_prefetch(a+j+192, _MM_HINT_NTA); /* load 128 bytes into the xmm registers */ xmm[0] = _mm_load_ps((float*) &a[j]); xmm[1] = _mm_load_ps((float*) &a[j+16]); xmm[2] = _mm_load_ps((float*) &a[j+32]); xmm[3] = _mm_load_ps((float*) &a[j+48]); xmm[4] = _mm_load_ps((float*) &a[j+64]); xmm[5] = _mm_load_ps((float*) &a[j+80]); xmm[6] = _mm_load_ps((float*) &a[j+96]); xmm[7] = _mm_load_ps((float*) &a[j+112]); /* store 64 bytes */ _mm_stream_ps((float*) &b[j], xmm[0]); _mm_stream_ps((float*) &b[j+16], xmm[1]); _mm_stream_ps((float*) &b[j+32], xmm[2]); _mm_stream_ps((float*) &b[j+48], xmm[3]); /* flush the write-combine buffer */ _mm_sfence(); /* store 64 bytes */ _mm_stream_ps((float*) &b[j+64], xmm[4]); _mm_stream_ps((float*) &b[j+80], xmm[5]); _mm_stream_ps((float*) &b[j+96], xmm[6]); _mm_stream_ps((float*) &b[j+112], xmm[7]); /* flush the write-combine buffer */ _mm_sfence(); } /* copy tail */ for(; j<size; j++) ((char*) b)[j] = ((char*) a)[j]; }
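/* A minimal usage sketch, not part of the original source: it assumes the _mpid_smi_sse64_memcpy
   routine above is linked in, allocates 64-byte-aligned buffers with _mm_malloc, and verifies the
   copy. The size is an arbitrary illustrative value, deliberately not a multiple of 128 so both
   the streaming main loop and the byte-wise tail run. */
#include <stdio.h>
#include <string.h>
#include <xmmintrin.h>

void _mpid_smi_sse64_memcpy(void *dest, const void *src, size_t size);

int main(void)
{
    const size_t n = 4096 + 13;
    unsigned char *src = (unsigned char *)_mm_malloc(n, 64);
    unsigned char *dst = (unsigned char *)_mm_malloc(n, 64);
    if (!src || !dst) return 1;
    for (size_t i = 0; i < n; i++) src[i] = (unsigned char)i;     /* fill the source with a pattern */
    _mpid_smi_sse64_memcpy(dst, src, n);
    printf("copy %s\n", memcmp(dst, src, n) == 0 ? "ok" : "FAILED");
    _mm_free(src);
    _mm_free(dst);
    return 0;
}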
void dmul(unsigned int N, const double* a, const double* b, double* y) { flops_counter += N ; #ifdef GX_SSE if(SSE2_supported) { __m128d Y1, Y2, A1, A2, B1, B2 ; unsigned int i = 0 ; while(i<N) { _mm_prefetch((const char*)(&a[i] + 256), _MM_HINT_NTA) ; _mm_prefetch((const char*)(&b[i] + 256), _MM_HINT_NTA) ; A1 = _mm_load_pd(&a[i]) ; B1 = _mm_load_pd(&b[i]) ; Y1 = _mm_mul_pd(A1,B1) ; i += 2 ; A2 = _mm_load_pd(&a[i]) ; B2 = _mm_load_pd(&b[i]) ; Y2 = _mm_mul_pd(A2,B2) ; i += 2 ; _mm_stream_pd(&y[i - 4], Y1) ; _mm_stream_pd(&y[i - 2], Y2) ; } _mm_sfence() ; return ; } #endif for(unsigned int i=0; i<N; i++) { y[i] = a[i] * b[i] ; } }
void dscal(unsigned int N, double a, double* y) { flops_counter += N ; #ifdef GX_SSE if(SSE2_supported) { __m128d Y1, Y2, AA ; SSE_ALIGNED(double temp[2]) ; temp[0] = a ; temp[1] = a ; AA = _mm_load_pd(temp) ; unsigned int i = 0 ; while(i<N) { _mm_prefetch((const char*)(&y[i] + 128), _MM_HINT_NTA) ; Y1 = _mm_load_pd(&y[i]) ; Y1 = _mm_mul_pd(Y1, AA) ; i += 2 ; Y2 = _mm_load_pd(&y[i]) ; Y2 = _mm_mul_pd(Y2, AA) ; i += 2 ; _mm_stream_pd(&y[i - 4], Y1) ; _mm_stream_pd(&y[i - 2], Y2) ; } _mm_sfence() ; return ; } #endif for(unsigned int i=0; i<N; i++) { y[i] *= a ; } }
void CopyBuffer( byte * dst, const byte * src, int numBytes ) { assert_16_byte_aligned( dst ); assert_16_byte_aligned( src ); int i = 0; for ( ; i + 128 <= numBytes; i += 128 ) { __m128i d0 = _mm_load_si128( (__m128i *)&src[i + 0*16] ); __m128i d1 = _mm_load_si128( (__m128i *)&src[i + 1*16] ); __m128i d2 = _mm_load_si128( (__m128i *)&src[i + 2*16] ); __m128i d3 = _mm_load_si128( (__m128i *)&src[i + 3*16] ); __m128i d4 = _mm_load_si128( (__m128i *)&src[i + 4*16] ); __m128i d5 = _mm_load_si128( (__m128i *)&src[i + 5*16] ); __m128i d6 = _mm_load_si128( (__m128i *)&src[i + 6*16] ); __m128i d7 = _mm_load_si128( (__m128i *)&src[i + 7*16] ); _mm_stream_si128( (__m128i *)&dst[i + 0*16], d0 ); _mm_stream_si128( (__m128i *)&dst[i + 1*16], d1 ); _mm_stream_si128( (__m128i *)&dst[i + 2*16], d2 ); _mm_stream_si128( (__m128i *)&dst[i + 3*16], d3 ); _mm_stream_si128( (__m128i *)&dst[i + 4*16], d4 ); _mm_stream_si128( (__m128i *)&dst[i + 5*16], d5 ); _mm_stream_si128( (__m128i *)&dst[i + 6*16], d6 ); _mm_stream_si128( (__m128i *)&dst[i + 7*16], d7 ); } for ( ; i + 16 <= numBytes; i += 16 ) { __m128i d = _mm_load_si128( (__m128i *)&src[i] ); _mm_stream_si128( (__m128i *)&dst[i], d ); } for ( ; i + 4 <= numBytes; i += 4 ) { *(uint32 *)&dst[i] = *(const uint32 *)&src[i]; } for ( ; i < numBytes; i++ ) { dst[i] = src[i]; } _mm_sfence(); }
void laplacian(double* v1, double* v2, int dim_m, int dim_n) { // #pragma omp parallel for schedule(static) for (int j = 1; j < dim_n - 1; ++j ) { int kstart = 1; while ( ((long) &v2[j*dim_m + kstart]) & 0x000000000000001F ) { kstart++; } int i = 1; for (; i < kstart; ++i) { kernel_sequential(v1 + j*dim_n + i, v2 + j*dim_n + i, dim_n); } for (; i < dim_m - 1 - (dim_m - 1)%4; i = i + 4) { kernel(v1 + j*dim_n + i, v2 + j*dim_n + i, dim_n); } //asm volatile ("mfence" ::: "memory"); for (; i < dim_m - 1; ++i) { kernel_sequential(v1 + j*dim_n + i, v2 + j*dim_n + i, dim_n); } } #pragma omp parallel { _mm_sfence(); } }
/* * predrain_fence_sfence -- (internal) issue the pre-drain fence instruction */ static void predrain_fence_sfence(void) { LOG(15, NULL); _mm_sfence(); /* ensure CLWB or CLFLUSHOPT completes before PCOMMIT */ }
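/* Illustrative sketch, not taken from the library above: it shows how a pre-drain SFENCE is
   typically paired with cache-line write-back when persisting a memory range. The function name
   and the CLWB requirement are assumptions for this example (build with CLWB support, e.g.
   -mclwb); the point shown is only the CLWB-then-SFENCE ordering before a later drain step. */
#include <stdint.h>
#include <stddef.h>
#include <immintrin.h>

#define CACHE_LINE 64

static void flush_range_then_fence(const void *addr, size_t len)
{
    uintptr_t p = (uintptr_t)addr & ~((uintptr_t)CACHE_LINE - 1);
    for (; p < (uintptr_t)addr + len; p += CACHE_LINE)
        _mm_clwb((void *)p);   /* write each dirty line back toward the persistence domain */
    _mm_sfence();              /* order the write-backs before any subsequent drain/commit step */
}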
void process_sse2(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out) { const dt_iop_zonesystem_data_t *const d = (const dt_iop_zonesystem_data_t *const)piece->data; process_common_setup(self, piece, ivoid, ovoid, roi_in, roi_out); const int ch = piece->colors; const int size = d->params.size; #ifdef _OPENMP #pragma omp parallel for default(none) schedule(static) #endif for(int j = 0; j < roi_out->height; j++) { for(int i = 0; i < roi_out->width; i++) { /* remap lightness into zonemap and apply lightness */ const float *in = (float *)ivoid + ch * ((size_t)j * roi_out->width + i); float *out = (float *)ovoid + ch * ((size_t)j * roi_out->width + i); const int rz = CLAMPS(in[0] * d->rzscale, 0, size - 2); // zone index const float zs = ((rz > 0) ? (d->zonemap_offset[rz] / in[0]) : 0) + d->zonemap_scale[rz]; _mm_stream_ps(out, _mm_mul_ps(_mm_load_ps(in), _mm_set1_ps(zs))); } } _mm_sfence(); process_common_cleanup(self, piece, ivoid, ovoid, roi_in, roi_out); }
/** process, all real work is done here. */ void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *i, void *o, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out) { // this is called for preview and full pipe separately, each with its own pixelpipe piece. assert(dt_iop_module_colorspace(self) == iop_cs_Lab); // get our data struct: dt_iop_colorcontrast_params_t *d = (dt_iop_colorcontrast_params_t *)piece->data; // how many colors in our buffer? const int ch = piece->colors; // iterate over all output pixels (same coordinates as input) #ifdef _OPENMP // optional: parallelize it! #pragma omp parallel for default(none) schedule(static) shared(i,o,roi_in,roi_out,d) #endif for(int j=0; j<roi_out->height; j++) { float *in = ((float *)i) + ch*roi_in->width *j; float *out = ((float *)o) + ch*roi_out->width*j; const __m128 scale = _mm_set_ps(0.0f,d->b_steepness,d->a_steepness,1.0f); const __m128 offset = _mm_set_ps(0.0f,d->b_offset,d->a_offset,0.0f); const __m128 min = _mm_set_ps(0.0f,-128.0f,-128.0f, -INFINITY); const __m128 max = _mm_set_ps(0.0f, 128.0f, 128.0f, INFINITY); for(int i=0; i<roi_out->width; i++) { _mm_stream_ps(out,_mm_min_ps(max,_mm_max_ps(min,_mm_add_ps(offset,_mm_mul_ps(scale,_mm_load_ps(in)))))); in+=ch; out+=ch; } } _mm_sfence(); }
bool opt_copy_stream_to_stream( x42memStream_t *dest, x42memStream_t *src, size_t elemSize, uint numElems, x42opts_t *opts ) { REF_PARAM( opts ); if( (opts->caps & OPT_SSE2) && elemSize <= sizeof( __m128 ) && stream_is_aligned( src ) && stream_pad_ok( src, elemSize ) && stream_is_aligned( dest ) && stream_pad_ok( dest, elemSize ) ) { uint i; size_t is = src->stride; size_t os = dest->stride; const __m128i * RESTRICT pi = (__m128i*)src->pStreamZero; __m128i * RESTRICT po = (__m128i*)dest->pStreamZero; for( i = 0; i < numElems; i++ ) { __m128i v = _mm_load_si128( pi ); _mm_stream_si128( po, v ); pi = (__m128i*)((byte*)pi + is); po = (__m128i*)((byte*)po + os); } _mm_sfence(); return true; } /* fallback for unaligned or non-SSE2 streams omitted from this excerpt */ return false; }
//FUNC_ATTRIBUTE (noinline) VOID MemUDummyCLRead ( IN UINT32 Address ) { _mm_sfence (); __readfsbyte (Address); }
/* * drain_pcommit -- (internal) wait for PM stores to drain, pcommit version */ static void drain_pcommit(void) { LOG(15, NULL); Func_predrain_fence(); _mm_pcommit(); _mm_sfence(); }
void g() { (void)_mm_getcsr(); _mm_setcsr(1); _mm_sfence(); _mm_clflush((void*)0); _mm_lfence(); _mm_mfence(); _mm_pause(); }
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void * const ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t * const roi_out) { dt_develop_t *dev = self->dev; const int ch = piece->colors; const __m128 upper = _mm_set_ps(FLT_MAX, dev->overexposed.upper / 100.0f, dev->overexposed.upper / 100.0f, dev->overexposed.upper / 100.0f); const __m128 lower = _mm_set_ps(FLT_MAX, dev->overexposed.lower / 100.0f, dev->overexposed.lower / 100.0f, dev->overexposed.lower / 100.0f); const int colorscheme = dev->overexposed.colorscheme; const __m128 upper_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][0]); const __m128 lower_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][1]); #ifdef _OPENMP #pragma omp parallel for default(none) shared(ovoid) schedule(static) #endif for(int k=0; k<roi_out->height; k++) { const float *in = ((float *)ivoid) + (size_t)ch*k*roi_out->width; float *out = ((float *)ovoid) + (size_t)ch*k*roi_out->width; for (int j=0; j<roi_out->width; j++,in+=4,out+=4) { const __m128 pixel = _mm_load_ps(in); __m128 isoe = _mm_cmpge_ps(pixel, upper); isoe = _mm_or_ps(_mm_unpacklo_ps(isoe, isoe), _mm_unpackhi_ps(isoe, isoe)); isoe = _mm_or_ps(_mm_unpacklo_ps(isoe, isoe), _mm_unpackhi_ps(isoe, isoe)); __m128 isue = _mm_cmple_ps(pixel, lower); isue = _mm_and_ps(_mm_unpacklo_ps(isue, isue), _mm_unpackhi_ps(isue, isue)); isue = _mm_and_ps(_mm_unpacklo_ps(isue, isue), _mm_unpackhi_ps(isue, isue)); __m128 result = _mm_or_ps(_mm_andnot_ps(isoe, pixel), _mm_and_ps(isoe, upper_color)); result = _mm_or_ps(_mm_andnot_ps(isue, result), _mm_and_ps(isue, lower_color)); _mm_stream_ps(out, result); } } _mm_sfence(); if(piece->pipe->mask_display) dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height); }
void process( struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out) { const float divider = (float)UINT16_MAX; const __m128 dividers = _mm_set_ps1(divider); #ifdef _OPENMP #pragma omp parallel for default(none) schedule(static) shared(ovoid) #endif for(int j = 0; j < roi_out->height; j++) { const uint16_t *in = ((uint16_t *)ivoid) + (size_t)j * roi_out->width; float *out = ((float *)ovoid) + (size_t)j * roi_out->width; int i = 0; int alignment = ((8 - (j * roi_out->width & (8 - 1))) & (8 - 1)); // process unaligned pixels for ( ; i < alignment ; i++, out++, in++) *out = ((float)(*in)) / divider; // process aligned pixels with SSE for( ; i < roi_out->width - (8 - 1); i += 8, in += 8) { const __m128i input = _mm_load_si128((__m128i *)in); __m128i ilo = _mm_unpacklo_epi16(input, _mm_set1_epi16(0)); __m128i ihi = _mm_unpackhi_epi16(input, _mm_set1_epi16(0)); __m128 flo = _mm_cvtepi32_ps(ilo); __m128 fhi = _mm_cvtepi32_ps(ihi); flo = _mm_div_ps(flo, dividers); fhi = _mm_div_ps(fhi, dividers); _mm_stream_ps(out, flo); out += 4; _mm_stream_ps(out, fhi); out += 4; } // process the rest for( ; i < roi_out->width; i++, out++, in++) *out = ((float)(*in)) / divider; } _mm_sfence(); }
static void process_clip_sse2(dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out, const float clip) { if(piece->pipe->dsc.filters) { // raw mosaic const __m128 clipm = _mm_set1_ps(clip); const size_t n = (size_t)roi_out->height * roi_out->width; float *const out = (float *)ovoid; float *const in = (float *)ivoid; #ifdef _OPENMP #pragma omp parallel for schedule(static) default(none) #endif for(size_t j = 0; j < (n & ~3u); j += 4) _mm_stream_ps(out + j, _mm_min_ps(clipm, _mm_load_ps(in + j))); _mm_sfence(); // lets see if there's a non-multiple of four rest to process: if(n & 3) for(size_t j = n & ~3u; j < n; j++) out[j] = MIN(clip, in[j]); } else { const __m128 clipm = _mm_set1_ps(clip); const int ch = piece->colors; #ifdef _OPENMP #pragma omp parallel for default(none) schedule(static) #endif for(int j = 0; j < roi_out->height; j++) { float *out = (float *)ovoid + (size_t)ch * roi_out->width * j; float *in = (float *)ivoid + (size_t)ch * roi_in->width * j; for(int i = 0; i < roi_out->width; i++, in += ch, out += ch) { _mm_stream_ps(out, _mm_min_ps(clipm, _mm_set_ps(in[3], in[2], in[1], in[0]))); } } _mm_sfence(); } }
/* ===================== R_CopyDecalSurface ===================== */ static void R_CopyDecalSurface( idDrawVert * verts, int numVerts, triIndex_t * indexes, int numIndexes, const decal_t * decal, const float fadeColor[4] ) { assert_16_byte_aligned( &verts[numVerts] ); assert_16_byte_aligned( &indexes[numIndexes] ); assert_16_byte_aligned( decal->indexes ); assert_16_byte_aligned( decal->verts ); assert( ( ( decal->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 ); assert( ( ( decal->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 ); assert_16_byte_aligned( fadeColor ); const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 ); const __m128i vector_short_num_verts = _mm_packs_epi32( vector_int_num_verts, vector_int_num_verts ); const __m128 vector_fade_color = _mm_load_ps( fadeColor ); const __m128i vector_color_mask = _mm_set_epi32( 0, -1, 0, 0 ); // copy vertices and apply depth/time based fading assert_offsetof( idDrawVert, color, 6 * 4 ); for ( int i = 0; i < decal->numVerts; i++ ) { const idDrawVert &srcVert = decal->verts[i]; idDrawVert &dstVert = verts[numVerts + i]; __m128i v0 = _mm_load_si128( (const __m128i *)( (byte *)&srcVert + 0 ) ); __m128i v1 = _mm_load_si128( (const __m128i *)( (byte *)&srcVert + 16 ) ); __m128 depthFade = _mm_splat_ps( _mm_load_ss( decal->vertDepthFade + i ), 0 ); __m128 timeDepthFade = _mm_mul_ps( depthFade, vector_fade_color ); __m128i colorInt = _mm_cvtps_epi32( timeDepthFade ); __m128i colorShort = _mm_packs_epi32( colorInt, colorInt ); __m128i colorByte = _mm_packus_epi16( colorShort, colorShort ); v1 = _mm_or_si128( v1, _mm_and_si128( colorByte, vector_color_mask ) ); _mm_stream_si128( (__m128i *)( (byte *)&dstVert + 0 ), v0 ); _mm_stream_si128( (__m128i *)( (byte *)&dstVert + 16 ), v1 ); } // copy indexes assert( ( decal->numIndexes & 7 ) == 0 ); assert( sizeof( triIndex_t ) == 2 ); for ( int i = 0; i < decal->numIndexes; i += 8 ) { __m128i vi = _mm_load_si128( (const __m128i *)&decal->indexes[i] ); vi = _mm_add_epi16( vi, vector_short_num_verts ); _mm_stream_si128( (__m128i *)&indexes[numIndexes + i], vi ); } _mm_sfence(); }
void dzero(unsigned int N, double* y) { #ifdef GX_SSE if(SSE2_supported) { __m128d Z = _mm_setzero_pd() ; for(unsigned int i=0; i<N; i+=4) { _mm_stream_pd(&y[i], Z) ; _mm_stream_pd(&y[i + 2], Z) ; } _mm_sfence() ; return ; } #endif memset(y, 0, N*sizeof(double)) ; }
void f() { (void)_mm_getcsr(); // expected-warning{{implicitly declaring library function '_mm_getcsr'}} \ // expected-note{{include the header <xmmintrin.h> or explicitly provide a declaration for '_mm_getcsr'}} _mm_setcsr(1); // expected-warning{{implicitly declaring library function '_mm_setcsr'}} \ // expected-note{{include the header <xmmintrin.h> or explicitly provide a declaration for '_mm_setcsr'}} _mm_sfence(); // expected-warning{{implicitly declaring library function '_mm_sfence'}} \ // expected-note{{include the header <xmmintrin.h> or explicitly provide a declaration for '_mm_sfence'}} _mm_clflush((void*)0); // expected-warning{{implicitly declaring library function '_mm_clflush'}} \ // expected-note{{include the header <emmintrin.h> or explicitly provide a declaration for '_mm_clflush'}} _mm_lfence(); // expected-warning{{implicitly declaring library function '_mm_lfence'}} \ // expected-note{{include the header <emmintrin.h> or explicitly provide a declaration for '_mm_lfence'}} _mm_mfence(); // expected-warning{{implicitly declaring library function '_mm_mfence'}} \ // expected-note{{include the header <emmintrin.h> or explicitly provide a declaration for '_mm_mfence'}} _mm_pause(); // expected-warning{{implicitly declaring library function '_mm_pause'}} \ // expected-note{{include the header <emmintrin.h> or explicitly provide a declaration for '_mm_pause'}} }
inline __always_inline static void sse2_memzero128aligned(void *ptr, int n) { __m128d d = (__m128d)_mm_setzero_si128 (); assert(((stm_word_t)ptr)%16==0); assert(n%128==0); char *p, *endptr = ((char*)ptr)+n; // = ptr; for(p = ptr; p < endptr; p+=128) { _mm_stream_pd((double*)&p[0], d); _mm_stream_pd((double*)&p[16], d); _mm_stream_pd((double*)&p[32], d); _mm_stream_pd((double*)&p[48], d); _mm_stream_pd((double*)&p[64], d); _mm_stream_pd((double*)&p[80], d); _mm_stream_pd((double*)&p[96], d); _mm_stream_pd((double*)&p[112], d); } _mm_sfence(); }
void dcopy(unsigned int N, const double* x, double* y) { #ifdef GX_SSE if(SSE2_supported) { __m128d X1,X2 ; unsigned int i = 0 ; while(i<N) { _mm_prefetch((const char*)(&y[i] + 128), _MM_HINT_NTA) ; X1 = _mm_load_pd(&x[i]) ; i+=2 ; X2 = _mm_load_pd(&x[i]) ; i+=2 ; _mm_stream_pd(&y[i - 4], X1) ; _mm_stream_pd(&y[i - 2], X2) ; } _mm_sfence() ; return ; } #endif memcpy(y, x, N * sizeof(double)) ; }
inline static void sse2_memset128aligned(void *ptr, int n, stm_word_t word) { #ifdef __LP64__ __m128d d = (__m128d)_mm_set_epi64((__m64)word, (__m64)word); #else __m128d d = (__m128d)_mm_set_epi32(word, word, word, word); #endif assert(((stm_word_t)ptr)%16==0); assert(n%128==0); char *p, *endptr = ((char*)ptr)+n; // = ptr; for(p = ptr; p < endptr; p+=128) { _mm_stream_pd((double*)&p[0], d); _mm_stream_pd((double*)&p[16], d); _mm_stream_pd((double*)&p[32], d); _mm_stream_pd((double*)&p[48], d); _mm_stream_pd((double*)&p[64], d); _mm_stream_pd((double*)&p[80], d); _mm_stream_pd((double*)&p[96], d); _mm_stream_pd((double*)&p[112], d); } _mm_sfence(); }
void f0() { signed char tmp_c; // unsigned char tmp_Uc; signed short tmp_s; #ifdef USE_ALL unsigned short tmp_Us; #endif signed int tmp_i; unsigned int tmp_Ui; signed long long tmp_LLi; unsigned long long tmp_ULLi; float tmp_f; double tmp_d; void* tmp_vp; const void* tmp_vCp; char* tmp_cp; const char* tmp_cCp; int* tmp_ip; float* tmp_fp; const float* tmp_fCp; double* tmp_dp; const double* tmp_dCp; long long* tmp_LLip; #define imm_i 32 #define imm_i_0_2 0 #define imm_i_0_4 3 #define imm_i_0_8 7 #define imm_i_0_16 15 // Check this. #define imm_i_0_256 0 V2i* tmp_V2ip; V1LLi* tmp_V1LLip; V2LLi* tmp_V2LLip; // 64-bit V8c tmp_V8c; V4s tmp_V4s; V2i tmp_V2i; V1LLi tmp_V1LLi; #ifdef USE_3DNOW V2f tmp_V2f; #endif // 128-bit V16c tmp_V16c; V8s tmp_V8s; V4i tmp_V4i; V2LLi tmp_V2LLi; V4f tmp_V4f; V2d tmp_V2d; V2d* tmp_V2dp; V4f* tmp_V4fp; const V2d* tmp_V2dCp; const V4f* tmp_V4fCp; // 256-bit V32c tmp_V32c; V4d tmp_V4d; V8f tmp_V8f; V4LLi tmp_V4LLi; V8i tmp_V8i; V4LLi* tmp_V4LLip; V4d* tmp_V4dp; V8f* tmp_V8fp; const V4d* tmp_V4dCp; const V8f* tmp_V8fCp; tmp_V2LLi = __builtin_ia32_undef128(); tmp_V4LLi = __builtin_ia32_undef256(); tmp_i = __builtin_ia32_comieq(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_comilt(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_comile(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_comigt(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_comige(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_comineq(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_ucomieq(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_ucomilt(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_ucomile(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_ucomigt(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_ucomige(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_ucomineq(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_comisdeq(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_comisdlt(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_comisdle(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_comisdgt(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_comisdge(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_comisdneq(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_ucomisdeq(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_ucomisdlt(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_ucomisdle(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_ucomisdgt(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_ucomisdge(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_ucomisdneq(tmp_V2d, tmp_V2d); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 0); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 1); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 2); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 3); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 4); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 5); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 6); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 7); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 0); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 1); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 2); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 3); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 4); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 5); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 6); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 7); tmp_V4f = __builtin_ia32_minps(tmp_V4f, tmp_V4f); tmp_V4f = __builtin_ia32_maxps(tmp_V4f, tmp_V4f); tmp_V4f = __builtin_ia32_minss(tmp_V4f, tmp_V4f); tmp_V4f = __builtin_ia32_maxss(tmp_V4f, tmp_V4f); tmp_V8c = __builtin_ia32_paddsb(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_paddsw(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_psubsb(tmp_V8c, tmp_V8c); 
tmp_V4s = __builtin_ia32_psubsw(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_paddusb(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_paddusw(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_psubusb(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_psubusw(tmp_V4s, tmp_V4s); tmp_V4s = __builtin_ia32_pmulhw(tmp_V4s, tmp_V4s); tmp_V4s = __builtin_ia32_pmulhuw(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_pcmpeqb(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_pcmpeqw(tmp_V4s, tmp_V4s); tmp_V2i = __builtin_ia32_pcmpeqd(tmp_V2i, tmp_V2i); tmp_V8c = __builtin_ia32_pcmpgtb(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_pcmpgtw(tmp_V4s, tmp_V4s); tmp_V2i = __builtin_ia32_pcmpgtd(tmp_V2i, tmp_V2i); tmp_V8c = __builtin_ia32_pmaxub(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_pmaxsw(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_pminub(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_pminsw(tmp_V4s, tmp_V4s); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 0); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 1); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 2); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 3); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 4); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 5); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 6); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 7); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 0); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 1); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 2); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 3); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 4); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 5); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 6); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 7); tmp_V2d = __builtin_ia32_minpd(tmp_V2d, tmp_V2d); tmp_V2d = __builtin_ia32_maxpd(tmp_V2d, tmp_V2d); tmp_V2d = __builtin_ia32_minsd(tmp_V2d, tmp_V2d); tmp_V2d = __builtin_ia32_maxsd(tmp_V2d, tmp_V2d); tmp_V16c = __builtin_ia32_paddsb128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_paddsw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_psubsb128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_psubsw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_paddusb128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_paddusw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_psubusb128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_psubusw128(tmp_V8s, tmp_V8s); tmp_V8s = __builtin_ia32_pmulhw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_pmaxub128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_pmaxsw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_pminub128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_pminsw128(tmp_V8s, tmp_V8s); tmp_V8s = __builtin_ia32_packsswb128(tmp_V8s, tmp_V8s); tmp_V4i = __builtin_ia32_packssdw128(tmp_V4i, tmp_V4i); tmp_V8s = __builtin_ia32_packuswb128(tmp_V8s, tmp_V8s); tmp_V8s = __builtin_ia32_pmulhuw128(tmp_V8s, tmp_V8s); tmp_V4f = __builtin_ia32_addsubps(tmp_V4f, tmp_V4f); tmp_V2d = __builtin_ia32_addsubpd(tmp_V2d, tmp_V2d); tmp_V4f = __builtin_ia32_haddps(tmp_V4f, tmp_V4f); tmp_V2d = __builtin_ia32_haddpd(tmp_V2d, tmp_V2d); tmp_V4f = __builtin_ia32_hsubps(tmp_V4f, tmp_V4f); tmp_V2d = __builtin_ia32_hsubpd(tmp_V2d, tmp_V2d); tmp_V8s = __builtin_ia32_phaddw128(tmp_V8s, tmp_V8s); tmp_V4s = __builtin_ia32_phaddw(tmp_V4s, tmp_V4s); tmp_V4i = __builtin_ia32_phaddd128(tmp_V4i, tmp_V4i); tmp_V2i = __builtin_ia32_phaddd(tmp_V2i, tmp_V2i); tmp_V8s = __builtin_ia32_phaddsw128(tmp_V8s, tmp_V8s); tmp_V4s = __builtin_ia32_phaddsw(tmp_V4s, tmp_V4s); tmp_V8s = __builtin_ia32_phsubw128(tmp_V8s, tmp_V8s); 
tmp_V4s = __builtin_ia32_phsubw(tmp_V4s, tmp_V4s); tmp_V4i = __builtin_ia32_phsubd128(tmp_V4i, tmp_V4i); tmp_V2i = __builtin_ia32_phsubd(tmp_V2i, tmp_V2i); tmp_V8s = __builtin_ia32_phsubsw128(tmp_V8s, tmp_V8s); tmp_V4s = __builtin_ia32_phsubsw(tmp_V4s, tmp_V4s); tmp_V16c = __builtin_ia32_pmaddubsw128(tmp_V16c, tmp_V16c); tmp_V8c = __builtin_ia32_pmaddubsw(tmp_V8c, tmp_V8c); tmp_V8s = __builtin_ia32_pmulhrsw128(tmp_V8s, tmp_V8s); tmp_V4s = __builtin_ia32_pmulhrsw(tmp_V4s, tmp_V4s); tmp_V16c = __builtin_ia32_pshufb128(tmp_V16c, tmp_V16c); tmp_V8c = __builtin_ia32_pshufb(tmp_V8c, tmp_V8c); tmp_V16c = __builtin_ia32_psignb128(tmp_V16c, tmp_V16c); tmp_V8c = __builtin_ia32_psignb(tmp_V8c, tmp_V8c); tmp_V8s = __builtin_ia32_psignw128(tmp_V8s, tmp_V8s); tmp_V4s = __builtin_ia32_psignw(tmp_V4s, tmp_V4s); tmp_V4i = __builtin_ia32_psignd128(tmp_V4i, tmp_V4i); tmp_V2i = __builtin_ia32_psignd(tmp_V2i, tmp_V2i); tmp_V16c = __builtin_ia32_pabsb128(tmp_V16c); tmp_V8c = __builtin_ia32_pabsb(tmp_V8c); tmp_V8s = __builtin_ia32_pabsw128(tmp_V8s); tmp_V4s = __builtin_ia32_pabsw(tmp_V4s); tmp_V4i = __builtin_ia32_pabsd128(tmp_V4i); tmp_V2i = __builtin_ia32_pabsd(tmp_V2i); tmp_V4s = __builtin_ia32_psllw(tmp_V4s, tmp_V1LLi); tmp_V2i = __builtin_ia32_pslld(tmp_V2i, tmp_V1LLi); tmp_V1LLi = __builtin_ia32_psllq(tmp_V1LLi, tmp_V1LLi); tmp_V4s = __builtin_ia32_psrlw(tmp_V4s, tmp_V1LLi); tmp_V2i = __builtin_ia32_psrld(tmp_V2i, tmp_V1LLi); tmp_V1LLi = __builtin_ia32_psrlq(tmp_V1LLi, tmp_V1LLi); tmp_V4s = __builtin_ia32_psraw(tmp_V4s, tmp_V1LLi); tmp_V2i = __builtin_ia32_psrad(tmp_V2i, tmp_V1LLi); tmp_V2i = __builtin_ia32_pmaddwd(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_packsswb(tmp_V4s, tmp_V4s); tmp_V4s = __builtin_ia32_packssdw(tmp_V2i, tmp_V2i); tmp_V8c = __builtin_ia32_packuswb(tmp_V4s, tmp_V4s); tmp_i = __builtin_ia32_vec_ext_v2si(tmp_V2i, 0); __builtin_ia32_incsspd(tmp_Ui); __builtin_ia32_incsspq(tmp_ULLi); tmp_Ui = __builtin_ia32_rdsspd(tmp_Ui); tmp_ULLi = __builtin_ia32_rdsspq(tmp_ULLi); __builtin_ia32_saveprevssp(); __builtin_ia32_rstorssp(tmp_vp); __builtin_ia32_wrssd(tmp_Ui, tmp_vp); __builtin_ia32_wrssq(tmp_ULLi, tmp_vp); __builtin_ia32_wrussd(tmp_Ui, tmp_vp); __builtin_ia32_wrussq(tmp_ULLi, tmp_vp); __builtin_ia32_setssbsy(); __builtin_ia32_clrssbsy(tmp_vp); (void) __builtin_ia32_ldmxcsr(tmp_Ui); (void) _mm_setcsr(tmp_Ui); tmp_Ui = __builtin_ia32_stmxcsr(); tmp_Ui = _mm_getcsr(); (void)__builtin_ia32_fxsave(tmp_vp); (void)__builtin_ia32_fxsave64(tmp_vp); (void)__builtin_ia32_fxrstor(tmp_vp); (void)__builtin_ia32_fxrstor64(tmp_vp); (void)__builtin_ia32_xsave(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsave64(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xrstor(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xrstor64(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsaveopt(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsaveopt64(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xrstors(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xrstors64(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsavec(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsavec64(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsaves(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsaves64(tmp_vp, tmp_ULLi); (void) __builtin_ia32_monitorx(tmp_vp, tmp_Ui, tmp_Ui); (void) __builtin_ia32_mwaitx(tmp_Ui, tmp_Ui, tmp_Ui); (void) __builtin_ia32_clzero(tmp_vp); (void) __builtin_ia32_cldemote(tmp_vp); tmp_V4f = __builtin_ia32_cvtpi2ps(tmp_V4f, tmp_V2i); tmp_V2i = __builtin_ia32_cvtps2pi(tmp_V4f); tmp_i = __builtin_ia32_cvtss2si(tmp_V4f); tmp_i = __builtin_ia32_cvttss2si(tmp_V4f); tmp_i = __builtin_ia32_rdtsc(); tmp_i = 
__rdtsc(); tmp_i = __builtin_ia32_rdtscp(&tmp_Ui); tmp_LLi = __builtin_ia32_rdpmc(tmp_i); __builtin_ia32_wbnoinvd(); #ifdef USE_64 tmp_LLi = __builtin_ia32_cvtss2si64(tmp_V4f); tmp_LLi = __builtin_ia32_cvttss2si64(tmp_V4f); #endif tmp_V2i = __builtin_ia32_cvttps2pi(tmp_V4f); (void) __builtin_ia32_maskmovq(tmp_V8c, tmp_V8c, tmp_cp); (void) __builtin_ia32_storehps(tmp_V2ip, tmp_V4f); (void) __builtin_ia32_storelps(tmp_V2ip, tmp_V4f); tmp_i = __builtin_ia32_movmskps(tmp_V4f); tmp_i = __builtin_ia32_pmovmskb(tmp_V8c); (void) __builtin_ia32_movntq(tmp_V1LLip, tmp_V1LLi); (void) __builtin_ia32_sfence(); (void) _mm_sfence(); tmp_V4s = __builtin_ia32_psadbw(tmp_V8c, tmp_V8c); tmp_V4f = __builtin_ia32_rcpps(tmp_V4f); tmp_V4f = __builtin_ia32_rcpss(tmp_V4f); tmp_V4f = __builtin_ia32_rsqrtps(tmp_V4f); tmp_V4f = __builtin_ia32_rsqrtss(tmp_V4f); tmp_V4f = __builtin_ia32_sqrtps(tmp_V4f); tmp_V4f = __builtin_ia32_sqrtss(tmp_V4f); (void) __builtin_ia32_maskmovdqu(tmp_V16c, tmp_V16c, tmp_cp); tmp_i = __builtin_ia32_movmskpd(tmp_V2d); tmp_i = __builtin_ia32_pmovmskb128(tmp_V16c); (void) __builtin_ia32_movnti(tmp_ip, tmp_i); #ifdef USE_64 (void) __builtin_ia32_movnti64(tmp_LLip, tmp_LLi); #endif tmp_V2LLi = __builtin_ia32_psadbw128(tmp_V16c, tmp_V16c); tmp_V2d = __builtin_ia32_sqrtpd(tmp_V2d); tmp_V2d = __builtin_ia32_sqrtsd(tmp_V2d); tmp_V2LLi = __builtin_ia32_cvtpd2dq(tmp_V2d); tmp_V2i = __builtin_ia32_cvtpd2pi(tmp_V2d); tmp_V4f = __builtin_ia32_cvtpd2ps(tmp_V2d); tmp_V4i = __builtin_ia32_cvttpd2dq(tmp_V2d); tmp_V2i = __builtin_ia32_cvttpd2pi(tmp_V2d); tmp_V2d = __builtin_ia32_cvtpi2pd(tmp_V2i); tmp_i = __builtin_ia32_cvtsd2si(tmp_V2d); tmp_i = __builtin_ia32_cvttsd2si(tmp_V2d); tmp_V4f = __builtin_ia32_cvtsd2ss(tmp_V4f, tmp_V2d); #ifdef USE_64 tmp_LLi = __builtin_ia32_cvtsd2si64(tmp_V2d); tmp_LLi = __builtin_ia32_cvttsd2si64(tmp_V2d); #endif tmp_V4i = __builtin_ia32_cvtps2dq(tmp_V4f); tmp_V4i = __builtin_ia32_cvttps2dq(tmp_V4f); (void) __builtin_ia32_clflush(tmp_vCp); (void) _mm_clflush(tmp_vCp); (void) __builtin_ia32_lfence(); (void) _mm_lfence(); (void) __builtin_ia32_mfence(); (void) _mm_mfence(); (void) __builtin_ia32_pause(); (void) _mm_pause(); tmp_V4s = __builtin_ia32_psllwi(tmp_V4s, tmp_i); tmp_V2i = __builtin_ia32_pslldi(tmp_V2i, tmp_i); tmp_V1LLi = __builtin_ia32_psllqi(tmp_V1LLi, tmp_i); tmp_V4s = __builtin_ia32_psrawi(tmp_V4s, tmp_i); tmp_V2i = __builtin_ia32_psradi(tmp_V2i, tmp_i); tmp_V4s = __builtin_ia32_psrlwi(tmp_V4s, tmp_i); tmp_V2i = __builtin_ia32_psrldi(tmp_V2i, tmp_i); tmp_V1LLi = __builtin_ia32_psrlqi(tmp_V1LLi, tmp_i); tmp_V1LLi = __builtin_ia32_pmuludq(tmp_V2i, tmp_V2i); tmp_V2LLi = __builtin_ia32_pmuludq128(tmp_V4i, tmp_V4i); tmp_V8s = __builtin_ia32_psraw128(tmp_V8s, tmp_V8s); tmp_V4i = __builtin_ia32_psrad128(tmp_V4i, tmp_V4i); tmp_V8s = __builtin_ia32_psrlw128(tmp_V8s, tmp_V8s); tmp_V4i = __builtin_ia32_psrld128(tmp_V4i, tmp_V4i); tmp_V2LLi = __builtin_ia32_psrlq128(tmp_V2LLi, tmp_V2LLi); tmp_V8s = __builtin_ia32_psllw128(tmp_V8s, tmp_V8s); tmp_V4i = __builtin_ia32_pslld128(tmp_V4i, tmp_V4i); tmp_V2LLi = __builtin_ia32_psllq128(tmp_V2LLi, tmp_V2LLi); tmp_V8s = __builtin_ia32_psllwi128(tmp_V8s, tmp_i); tmp_V4i = __builtin_ia32_pslldi128(tmp_V4i, tmp_i); tmp_V2LLi = __builtin_ia32_psllqi128(tmp_V2LLi, tmp_i); tmp_V8s = __builtin_ia32_psrlwi128(tmp_V8s, tmp_i); tmp_V4i = __builtin_ia32_psrldi128(tmp_V4i, tmp_i); tmp_V2LLi = __builtin_ia32_psrlqi128(tmp_V2LLi, tmp_i); tmp_V8s = __builtin_ia32_psrawi128(tmp_V8s, tmp_i); tmp_V4i = __builtin_ia32_psradi128(tmp_V4i, tmp_i); tmp_V8s = 
__builtin_ia32_pmaddwd128(tmp_V8s, tmp_V8s); (void) __builtin_ia32_monitor(tmp_vp, tmp_Ui, tmp_Ui); (void) __builtin_ia32_mwait(tmp_Ui, tmp_Ui); tmp_V16c = __builtin_ia32_lddqu(tmp_cCp); tmp_V2LLi = __builtin_ia32_palignr128(tmp_V2LLi, tmp_V2LLi, imm_i); tmp_V1LLi = __builtin_ia32_palignr(tmp_V1LLi, tmp_V1LLi, imm_i); #ifdef USE_SSE4 tmp_V16c = __builtin_ia32_pblendvb128(tmp_V16c, tmp_V16c, tmp_V16c); tmp_V2d = __builtin_ia32_blendvpd(tmp_V2d, tmp_V2d, tmp_V2d); tmp_V4f = __builtin_ia32_blendvps(tmp_V4f, tmp_V4f, tmp_V4f); tmp_V8s = __builtin_ia32_packusdw128(tmp_V4i, tmp_V4i); tmp_V16c = __builtin_ia32_pmaxsb128(tmp_V16c, tmp_V16c); tmp_V4i = __builtin_ia32_pmaxsd128(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_pmaxud128(tmp_V4i, tmp_V4i); tmp_V8s = __builtin_ia32_pmaxuw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_pminsb128(tmp_V16c, tmp_V16c); tmp_V4i = __builtin_ia32_pminsd128(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_pminud128(tmp_V4i, tmp_V4i); tmp_V8s = __builtin_ia32_pminuw128(tmp_V8s, tmp_V8s); tmp_V2LLi = __builtin_ia32_pmuldq128(tmp_V4i, tmp_V4i); tmp_V4f = __builtin_ia32_roundps(tmp_V4f, imm_i_0_16); tmp_V4f = __builtin_ia32_roundss(tmp_V4f, tmp_V4f, imm_i_0_16); tmp_V2d = __builtin_ia32_roundsd(tmp_V2d, tmp_V2d, imm_i_0_16); tmp_V2d = __builtin_ia32_roundpd(tmp_V2d, imm_i_0_16); tmp_V4f = __builtin_ia32_insertps128(tmp_V4f, tmp_V4f, imm_i_0_256); #endif tmp_V4d = __builtin_ia32_addsubpd256(tmp_V4d, tmp_V4d); tmp_V8f = __builtin_ia32_addsubps256(tmp_V8f, tmp_V8f); tmp_V4d = __builtin_ia32_haddpd256(tmp_V4d, tmp_V4d); tmp_V8f = __builtin_ia32_hsubps256(tmp_V8f, tmp_V8f); tmp_V4d = __builtin_ia32_hsubpd256(tmp_V4d, tmp_V4d); tmp_V8f = __builtin_ia32_haddps256(tmp_V8f, tmp_V8f); tmp_V4d = __builtin_ia32_maxpd256(tmp_V4d, tmp_V4d); tmp_V8f = __builtin_ia32_maxps256(tmp_V8f, tmp_V8f); tmp_V4d = __builtin_ia32_minpd256(tmp_V4d, tmp_V4d); tmp_V8f = __builtin_ia32_minps256(tmp_V8f, tmp_V8f); tmp_V2d = __builtin_ia32_vpermilvarpd(tmp_V2d, tmp_V2LLi); tmp_V4f = __builtin_ia32_vpermilvarps(tmp_V4f, tmp_V4i); tmp_V4d = __builtin_ia32_vpermilvarpd256(tmp_V4d, tmp_V4LLi); tmp_V8f = __builtin_ia32_vpermilvarps256(tmp_V8f, tmp_V8i); tmp_V4d = __builtin_ia32_blendvpd256(tmp_V4d, tmp_V4d, tmp_V4d); tmp_V8f = __builtin_ia32_blendvps256(tmp_V8f, tmp_V8f, tmp_V8f); tmp_V8f = __builtin_ia32_dpps256(tmp_V8f, tmp_V8f, 0x7); tmp_V4d = __builtin_ia32_cmppd256(tmp_V4d, tmp_V4d, 0); tmp_V8f = __builtin_ia32_cmpps256(tmp_V8f, tmp_V8f, 0); tmp_V4f = __builtin_ia32_cvtpd2ps256(tmp_V4d); tmp_V8i = __builtin_ia32_cvtps2dq256(tmp_V8f); tmp_V4i = __builtin_ia32_cvttpd2dq256(tmp_V4d); tmp_V4i = __builtin_ia32_cvtpd2dq256(tmp_V4d); tmp_V8i = __builtin_ia32_cvttps2dq256(tmp_V8f); tmp_V4d = __builtin_ia32_vperm2f128_pd256(tmp_V4d, tmp_V4d, 0x7); tmp_V8f = __builtin_ia32_vperm2f128_ps256(tmp_V8f, tmp_V8f, 0x7); tmp_V8i = __builtin_ia32_vperm2f128_si256(tmp_V8i, tmp_V8i, 0x7); tmp_V4d = __builtin_ia32_sqrtpd256(tmp_V4d); tmp_V8f = __builtin_ia32_sqrtps256(tmp_V8f); tmp_V8f = __builtin_ia32_rsqrtps256(tmp_V8f); tmp_V8f = __builtin_ia32_rcpps256(tmp_V8f); tmp_V4d = __builtin_ia32_roundpd256(tmp_V4d, 0x1); tmp_V8f = __builtin_ia32_roundps256(tmp_V8f, 0x1); tmp_i = __builtin_ia32_vtestzpd(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_vtestcpd(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_vtestnzcpd(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_vtestzps(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_vtestcps(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_vtestnzcps(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_vtestzpd256(tmp_V4d, tmp_V4d); 
tmp_i = __builtin_ia32_vtestcpd256(tmp_V4d, tmp_V4d); tmp_i = __builtin_ia32_vtestnzcpd256(tmp_V4d, tmp_V4d); tmp_i = __builtin_ia32_vtestzps256(tmp_V8f, tmp_V8f); tmp_i = __builtin_ia32_vtestcps256(tmp_V8f, tmp_V8f); tmp_i = __builtin_ia32_vtestnzcps256(tmp_V8f, tmp_V8f); tmp_i = __builtin_ia32_ptestz256(tmp_V4LLi, tmp_V4LLi); tmp_i = __builtin_ia32_ptestc256(tmp_V4LLi, tmp_V4LLi); tmp_i = __builtin_ia32_ptestnzc256(tmp_V4LLi, tmp_V4LLi); tmp_i = __builtin_ia32_movmskpd256(tmp_V4d); tmp_i = __builtin_ia32_movmskps256(tmp_V8f); __builtin_ia32_vzeroall(); __builtin_ia32_vzeroupper(); tmp_V32c = __builtin_ia32_lddqu256(tmp_cCp); tmp_V2d = __builtin_ia32_maskloadpd(tmp_V2dCp, tmp_V2LLi); tmp_V4f = __builtin_ia32_maskloadps(tmp_V4fCp, tmp_V4i); tmp_V4d = __builtin_ia32_maskloadpd256(tmp_V4dCp, tmp_V4LLi); tmp_V8f = __builtin_ia32_maskloadps256(tmp_V8fCp, tmp_V8i); __builtin_ia32_maskstorepd(tmp_V2dp, tmp_V2LLi, tmp_V2d); __builtin_ia32_maskstoreps(tmp_V4fp, tmp_V4i, tmp_V4f); __builtin_ia32_maskstorepd256(tmp_V4dp, tmp_V4LLi, tmp_V4d); __builtin_ia32_maskstoreps256(tmp_V8fp, tmp_V8i, tmp_V8f); #ifdef USE_3DNOW tmp_V8c = __builtin_ia32_pavgusb(tmp_V8c, tmp_V8c); tmp_V2i = __builtin_ia32_pf2id(tmp_V2f); tmp_V2f = __builtin_ia32_pfacc(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfadd(tmp_V2f, tmp_V2f); tmp_V2i = __builtin_ia32_pfcmpeq(tmp_V2f, tmp_V2f); tmp_V2i = __builtin_ia32_pfcmpge(tmp_V2f, tmp_V2f); tmp_V2i = __builtin_ia32_pfcmpgt(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfmax(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfmin(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfmul(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfrcp(tmp_V2f); tmp_V2f = __builtin_ia32_pfrcpit1(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfrcpit2(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfrsqrt(tmp_V2f); tmp_V2f = __builtin_ia32_pfrsqit1(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfsub(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfsubr(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pi2fd(tmp_V2i); tmp_V4s = __builtin_ia32_pmulhrw(tmp_V4s, tmp_V4s); tmp_V2i = __builtin_ia32_pf2iw(tmp_V2f); tmp_V2f = __builtin_ia32_pfnacc(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfpnacc(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pi2fw(tmp_V2i); tmp_V2f = __builtin_ia32_pswapdsf(tmp_V2f); tmp_V2i = __builtin_ia32_pswapdsi(tmp_V2i); tmp_V4i = __builtin_ia32_sha1rnds4(tmp_V4i, tmp_V4i, imm_i_0_4); tmp_V4i = __builtin_ia32_sha1nexte(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_sha1msg1(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_sha1msg2(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_sha256rnds2(tmp_V4i, tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_sha256msg1(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_sha256msg2(tmp_V4i, tmp_V4i); #endif }
void main() { HMODULE newhsacore = ::LoadLibraryW(L"newhsacore64.dll"); assert(newhsacore != NULL); HsaGetDevicesFunction HsaGetDevices = reinterpret_cast<HsaGetDevicesFunction>(::GetProcAddress(newhsacore, "HsaGetDevices")); assert(HsaGetDevices != NULL); HsaCreateUserModeQueueFunction HsaCreateUserModeQueue = reinterpret_cast<HsaCreateUserModeQueueFunction>(::GetProcAddress(newhsacore, "HsaCreateUserModeQueue")); assert(HsaCreateUserModeQueue != NULL); HsaDestroyUserModeQueueFunction HsaDestroyUserModeQueue = reinterpret_cast<HsaDestroyUserModeQueueFunction>(::GetProcAddress(newhsacore, "HsaDestroyUserModeQueue")); assert(HsaDestroyUserModeQueue != NULL); HsaSubmitAqlFunction HsaSubmitAql = reinterpret_cast<HsaSubmitAqlFunction>(::GetProcAddress(newhsacore, "HsaSubmitAql")); assert(HsaSubmitAql != NULL); HsaCreateSignalFunction HsaCreateSignal = reinterpret_cast<HsaCreateSignalFunction>(::GetProcAddress(newhsacore, "HsaCreateSignal")); assert(HsaCreateSignal != NULL); HsaDestroySignalFunction HsaDestroySignal = reinterpret_cast<HsaDestroySignalFunction>(::GetProcAddress(newhsacore, "HsaDestroySignal")); assert(HsaDestroySignal != NULL); HsaWaitOnSignalFunction HsaWaitOnSignal = reinterpret_cast<HsaWaitOnSignalFunction>(::GetProcAddress(newhsacore, "HsaWaitOnSignal")); assert(HsaWaitOnSignal != NULL); HsaQuerySignalFunction HsaQuerySignal = reinterpret_cast<HsaQuerySignalFunction>(::GetProcAddress(newhsacore, "HsaQuerySignal")); assert(HsaQuerySignal != NULL); HsaLoadBrigFunction HsaLoadBrig = reinterpret_cast<HsaLoadBrigFunction>(::GetProcAddress(newhsacore, "HsaLoadBrig")); assert(HsaLoadBrig != NULL); HsaUnloadBrigFunction HsaUnloadBrig = reinterpret_cast<HsaUnloadBrigFunction>(::GetProcAddress(newhsacore, "HsaUnloadBrig")); assert(HsaUnloadBrig != NULL); HsaFinalizeBrigFunction HsaFinalizeBrig = reinterpret_cast<HsaFinalizeBrigFunction>(::GetProcAddress(newhsacore, "HsaFinalizeBrig")); assert(HsaFinalizeBrig != NULL); HsaFreeKernelCodeFunction HsaFreeKernelCode = reinterpret_cast<HsaFreeKernelCodeFunction>(::GetProcAddress(newhsacore, "HsaFreeKernelCode")); assert(HsaFreeKernelCode != NULL); HsaFreeKernelDebugFunction HsaFreeKernelDebug = reinterpret_cast<HsaFreeKernelDebugFunction>(::GetProcAddress(newhsacore, "HsaFreeKernelDebug")); assert(HsaFreeKernelDebug != NULL); HsaRegisterSystemMemoryFunction HsaRegisterSystemMemory = reinterpret_cast<HsaRegisterSystemMemoryFunction>(::GetProcAddress(newhsacore, "HsaRegisterSystemMemory")); assert(HsaRegisterSystemMemory != NULL); HsaDeregisterSystemMemoryFunction HsaDeregisterSystemMemory = reinterpret_cast<HsaDeregisterSystemMemoryFunction>(::GetProcAddress(newhsacore, "HsaDeregisterSystemMemory")); assert(HsaDeregisterSystemMemory != NULL); HsaStatus status = kHsaStatusSuccess; const HsaDevice* device = NULL; unsigned int deviceCount = 0; status = HsaGetDevices(&deviceCount, &device); assert(status == kHsaStatusSuccess); assert(deviceCount == 1); HsaBrig brig; memset(&brig, 0, sizeof(brig)); brig.code_section = hsa_code_section; brig.code_section_byte_size = sizeof(hsa_code_section); brig.directive_section = hsa_directives_section; brig.directive_section_byte_size = sizeof(hsa_directives_section); brig.operand_section = hsa_operands_section; brig.operand_section_byte_size = sizeof(hsa_operands_section); brig.string_section = hsa_strtab_section; brig.string_section_byte_size = sizeof(hsa_strtab_section); status = HsaLoadBrig(device, &brig); assert(status == kHsaStatusSuccess); HsaKernelCode *kernelCode = NULL; status = 
HsaFinalizeBrig(device, &brig, "&hsaDemo", "", &kernelCode, NULL); assert(status == kHsaStatusSuccess); assert(kernelCode != NULL); status = HsaUnloadBrig(&brig); assert(status == kHsaStatusSuccess); HsaQueue* queue = NULL; status = HsaCreateUserModeQueue(device, NULL, 0, kHsaQueueTypeCompute, kHsaQueuePriorityMaximum, kHsaQueueFractionTen, &queue); assert(status == kHsaStatusSuccess); HsaSignal signal = NULL; status = HsaCreateSignal(&signal); assert(status == kHsaStatusSuccess); for (size_t arraySize = 64; arraySize <= 1024 * 1024; arraySize *= 2) { uint32_t* xArray = (uint32_t*)::VirtualAlloc(NULL, arraySize * sizeof(uint32_t), MEM_RESERVE, PAGE_READWRITE); assert(xArray != NULL); xArray = (uint32_t*)::VirtualAlloc(xArray, arraySize * sizeof(uint32_t), MEM_COMMIT, PAGE_READWRITE); assert(xArray != NULL); memset(xArray, 0x12, arraySize * sizeof(uint32_t)); uint32_t* yArray = (uint32_t*)::VirtualAlloc(NULL, arraySize * sizeof(uint32_t), MEM_RESERVE, PAGE_READWRITE); assert(yArray != NULL); yArray = (uint32_t*)::VirtualAlloc(yArray, arraySize * sizeof(uint32_t), MEM_COMMIT, PAGE_READWRITE); assert(yArray != NULL); memset(yArray, 0x14, arraySize * sizeof(uint32_t)); uint32_t* zArray = (uint32_t*)::VirtualAlloc(NULL, arraySize * sizeof(uint32_t), MEM_RESERVE, PAGE_READWRITE); assert(zArray != NULL); zArray = (uint32_t*)::VirtualAlloc(zArray, arraySize * sizeof(uint32_t), MEM_COMMIT, PAGE_READWRITE); assert(zArray != NULL); memset(zArray, 0x42, arraySize * sizeof(uint32_t)); for (size_t iteration = 1; iteration <= 5; iteration++) { uint64_t kernelArguments[4]; kernelArguments[0] = reinterpret_cast<uint64_t>(xArray); kernelArguments[1] = reinterpret_cast<uint64_t>(yArray); kernelArguments[2] = reinterpret_cast<uint64_t>(zArray); kernelArguments[3] = static_cast<uint64_t>(arraySize); memset(zArray, 0x42, arraySize * sizeof(uint32_t)); HsaAqlDispatchPacket aqlPacket; memset(&aqlPacket, 0, sizeof(aqlPacket)); aqlPacket.format = kHsaAqlPacketFormatDispatch; aqlPacket.invalidate_instruction_cache = 1; /* This is very important. Without release fence the system will crash from time to time. 
*/ aqlPacket.release_fence_scope = 2; aqlPacket.dimensions = 1; aqlPacket.grid_size[0] = static_cast<uint32_t>(arraySize); aqlPacket.grid_size[1] = 1; aqlPacket.grid_size[2] = 1; aqlPacket.workgroup_size[0] = device->wave_front_size * device->number_compute_units; aqlPacket.workgroup_size[1] = 1; aqlPacket.workgroup_size[2] = 1; aqlPacket.completion_signal = signal; aqlPacket.group_segment_size_bytes = kernelCode->workgroup_group_segment_byte_size; aqlPacket.private_segment_size_bytes = kernelCode->workitem_private_segment_byte_size; aqlPacket.kernel_object_address = reinterpret_cast<uint64_t>(kernelCode); aqlPacket.kernel_arg_address = reinterpret_cast<uint64_t>(kernelArguments); const uint64_t gpuStartCycles = __rdtsc(); status = HsaSubmitAql(queue, &aqlPacket); assert(status == kHsaStatusSuccess); const uint64_t gpuSubmitCycles = __rdtsc(); status = HsaWaitOnSignal(signal); assert(status == kHsaStatusSuccess); const uint64_t gpuComputeCycles = __rdtsc(); size_t countEqual = 0; for (size_t i = 0; i < arraySize; i++) { if (xArray[i] + yArray[i] == zArray[i]) countEqual++; else if (i < 10) printf("%"PRIx32" + %"PRIx32" = %"PRIx32"\n", xArray[i], yArray[i], zArray[i]); } if (countEqual != arraySize) { printf("%Iu\tFAILED (%Iu)\n", arraySize, countEqual); break; } const uint64_t cpuStartCycles = __rdtsc(); for (size_t i = 0; i < arraySize; i += 16) { _mm_stream_si128((__m128i*)&zArray[i], _mm_add_epi32(_mm_load_si128((const __m128i*)&xArray[i]), _mm_load_si128((const __m128i*)&yArray[i]))); _mm_stream_si128((__m128i*)&zArray[i + 4], _mm_add_epi32(_mm_load_si128((const __m128i*)&xArray[i + 4]), _mm_load_si128((const __m128i*)&yArray[i + 4]))); _mm_stream_si128((__m128i*)&zArray[i + 8], _mm_add_epi32(_mm_load_si128((const __m128i*)&xArray[i + 8]), _mm_load_si128((const __m128i*)&yArray[i + 8]))); _mm_stream_si128((__m128i*)&zArray[i + 12], _mm_add_epi32(_mm_load_si128((const __m128i*)&xArray[i + 12]), _mm_load_si128((const __m128i*)&yArray[i + 12]))); } _mm_sfence(); const uint64_t cpuComputeCycles = __rdtsc(); const double gpuSubmitTime = double(gpuSubmitCycles - gpuStartCycles) / 3.7e+9; const double gpuComputeTime = double(gpuComputeCycles - gpuStartCycles) / 3.7e+9; const double cpuComputeTime = double(cpuComputeCycles - cpuStartCycles) / 3.7e+9; const double gpuBandwidth = double(arraySize) * double(3 * sizeof(uint32_t)) / gpuComputeTime; const double cpuBandwidth = double(arraySize) * double(3 * sizeof(uint32_t)) / cpuComputeTime; printf("%Iu\t%.2lf\t%.2lf\t%.2lf\t%.3lf\t%.3lf\n", arraySize, gpuSubmitTime * 1.0e+6, gpuComputeTime * 1.0e+6, cpuComputeTime * 1.0e+6, gpuBandwidth * 1.0e-9, cpuBandwidth * 1.0e-9); } ::VirtualFree(xArray, 0, MEM_DECOMMIT); ::VirtualFree(xArray, 0, MEM_RELEASE); ::VirtualFree(yArray, 0, MEM_DECOMMIT); ::VirtualFree(yArray, 0, MEM_RELEASE); ::VirtualFree(zArray, 0, MEM_DECOMMIT); ::VirtualFree(zArray, 0, MEM_RELEASE); } status = HsaDestroySignal(signal); assert(status == kHsaStatusSuccess); status = HsaFreeKernelCode(kernelCode); assert(status == kHsaStatusSuccess); status = HsaDestroyUserModeQueue(queue); assert(status == kHsaStatusSuccess); BOOL freeHsaCore = ::FreeLibrary(newhsacore); assert(freeHsaCore != FALSE); }
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out) { float *in; float *out; dt_iop_zonesystem_gui_data_t *g = NULL; dt_iop_zonesystem_data_t *data = (dt_iop_zonesystem_data_t*)piece->data; guchar *buffer = NULL; if( self->dev->gui_attached && piece->pipe->type == DT_DEV_PIXELPIPE_PREVIEW ) { g = (dt_iop_zonesystem_gui_data_t *)self->gui_data; dt_pthread_mutex_lock(&g->lock); if(g->preview_buffer) g_free (g->preview_buffer); buffer = g->preview_buffer = g_malloc (roi_in->width*roi_in->height); g->preview_width=roi_out->width; g->preview_height=roi_out->height; } /* calculate zonemap */ const int size = data->size; float zonemap[MAX_ZONE_SYSTEM_SIZE]= {-1}; _iop_zonesystem_calculate_zonemap (data, zonemap); const int ch = piece->colors; /* if gui and have buffer lets gaussblur and fill buffer with zone indexes */ if( self->dev->gui_attached && g && buffer) { /* setup gaussian kernel */ const int radius = 8; const int rad = MIN(radius, ceilf(radius * roi_in->scale / piece->iscale)); const int wd = 2*rad+1; float mat[wd*wd]; float *m; const float sigma2 = (2.5*2.5)*(radius*roi_in->scale/piece->iscale)*(radius*roi_in->scale/piece->iscale); float weight = 0.0f; memset(mat, 0, wd*wd*sizeof(float)); m = mat; for(int l=-rad; l<=rad; l++) for(int k=-rad; k<=rad; k++,m++) weight += *m = expf(- (l*l + k*k)/(2.f*sigma2)); m = mat; for(int l=-rad; l<=rad; l++) for(int k=-rad; k<=rad; k++,m++) *m /= weight; /* gauss blur the L channel */ #ifdef _OPENMP #pragma omp parallel for default(none) private(in, out, m) shared(mat, ivoid, ovoid, roi_out, roi_in) schedule(static) #endif for(int j=rad; j<roi_out->height-rad; j++) { in = ((float *)ivoid) + ch*(j*roi_in->width + rad); out = ((float *)ovoid) + ch*(j*roi_out->width + rad); for(int i=rad; i<roi_out->width-rad; i++) { for(int c=0; c<3; c++) out[c] = 0.0f; float sum = 0.0; m = mat; for(int l=-rad; l<=rad; l++) { float *inrow = in + ch*(l*roi_in->width-rad); for(int k=-rad; k<=rad; k++,inrow+=ch,m++) sum += *m * inrow[0]; } out[0] = sum; out += ch; in += ch; } } /* create zonemap preview */ // in = (float *)ivoid; out = (float *)ovoid; #ifdef _OPENMP #pragma omp parallel for default(none) shared(roi_out,out,buffer,g,zonemap) schedule(static) #endif for (int k=0; k<roi_out->width*roi_out->height; k++) { buffer[k] = _iop_zonesystem_zone_index_from_lightness (out[ch*k]/100.0f, zonemap, size); } dt_pthread_mutex_unlock(&g->lock); } /* process the image */ in = (float *)ivoid; out = (float *)ovoid; const float rzscale = (size-1)/100.0f; float zonemap_offset[MAX_ZONE_SYSTEM_SIZE]= {-1}; float zonemap_scale[MAX_ZONE_SYSTEM_SIZE]= {-1}; // precompute scale and offset for (int k=0; k < size-1; k++) zonemap_scale[k] = (zonemap[k+1]-zonemap[k])*(size-1); for (int k=0; k < size-1; k++) zonemap_offset[k] = 100.0f * ((k+1)*zonemap[k] - k*zonemap[k+1]) ; #ifdef _OPENMP #pragma omp parallel for default(none) shared(roi_out, in, out, zonemap_scale,zonemap_offset) schedule(static) #endif for (int j=0; j<roi_out->height; j++) for (int i=0; i<roi_out->width; i++) { /* remap lightness into zonemap and apply lightness */ const float *inp = in + ch*(j*roi_out->width+i); float *outp = out + ch*(j*roi_out->width+i); const int rz = CLAMPS(inp[0]*rzscale, 0, size-2); // zone index const float zs = ((rz > 0) ? 
(zonemap_offset[rz]/inp[0]) : 0) + zonemap_scale[rz]; _mm_stream_ps(outp,_mm_mul_ps(_mm_load_ps(inp),_mm_set1_ps(zs))); } _mm_sfence(); if(piece->pipe->mask_display) dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height); }
// ============================================================================= // // sse3_vChirpData // version by: Alex Kan // http://tbp.berkeley.edu/~alexkan/seti/ // int sse3_ChirpData_ak( sah_complex * cx_DataArray, sah_complex * cx_ChirpDataArray, int chirp_rate_ind, double chirp_rate, int ul_NumDataPoints, double sample_rate ) { int i; if (chirp_rate_ind == 0) { memcpy(cx_ChirpDataArray, cx_DataArray, (int)ul_NumDataPoints * sizeof(sah_complex) ); return 0; } int vEnd; double srate = chirp_rate * 0.5 / (sample_rate * sample_rate); __m128d rate = _mm_set1_pd(chirp_rate * 0.5 / (sample_rate * sample_rate)); __m128d roundVal = _mm_set1_pd(srate >= 0.0 ? TWO_TO_52 : -TWO_TO_52); // main vectorised loop vEnd = ul_NumDataPoints - (ul_NumDataPoints & 3); for (i = 0; i < vEnd; i += 4) { const float *data = (const float *) (cx_DataArray + i); float *chirped = (float *) (cx_ChirpDataArray + i); __m128d di = _mm_set1_pd(i); __m128d a1 = _mm_add_pd(_mm_set_pd(1.0, 0.0), di); __m128d a2 = _mm_add_pd(_mm_set_pd(3.0, 2.0), di); __m128 d1, d2; __m128 cd1, cd2; __m128 td1, td2; __m128 x; __m128 y; __m128 s; __m128 c; __m128 m; // load the signal to be chirped prefetchnta((const void *)( data+32 )); d1 = _mm_load_ps(data); d2 = _mm_load_ps(data+4); // calculate the input angle a1 = _mm_mul_pd(_mm_mul_pd(a1, a1), rate); a2 = _mm_mul_pd(_mm_mul_pd(a2, a2), rate); // reduce the angle to the range (-0.5, 0.5) a1 = _mm_sub_pd(a1, _mm_sub_pd(_mm_add_pd(a1, roundVal), roundVal)); a2 = _mm_sub_pd(a2, _mm_sub_pd(_mm_add_pd(a2, roundVal), roundVal)); // convert pair of packed double into packed single x = _mm_movelh_ps(_mm_cvtpd_ps(a1), _mm_cvtpd_ps(a2)); // square to the range [0, 0.25) y = _mm_mul_ps(x, x); // perform the initial polynomial approximations s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, SS4), SS3), y), SS2), y), SS1), x); c = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, CC3), CC2), y), CC1), y), ONE); // perform first angle doubling x = _mm_sub_ps(_mm_mul_ps(c, c), _mm_mul_ps(s, s)); y = _mm_mul_ps(_mm_mul_ps(s, c), TWO); // calculate scaling factor to correct the magnitude // m1 = vec_nmsub(y1, y1, vec_nmsub(x1, x1, TWO)); // m2 = vec_nmsub(y2, y2, vec_nmsub(x2, x2, TWO)); m = vec_recip3(_mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y))); // perform second angle doubling c = _mm_sub_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y)); s = _mm_mul_ps(_mm_mul_ps(y, x), TWO); // correct the magnitude (final sine / cosine approximations) s = _mm_mul_ps(s, m); c = _mm_mul_ps(c, m); // chirp the data cd1 = _mm_shuffle_ps(c, c, 0x50); cd2 = _mm_shuffle_ps(c, c, 0xfa); cd1 = _mm_mul_ps(cd1, d1); cd2 = _mm_mul_ps(cd2, d2); d1 = _mm_shuffle_ps(d1, d1, 0xb1); d2 = _mm_shuffle_ps(d2, d2, 0xb1); td1 = _mm_shuffle_ps(s, s, 0x50); td2 = _mm_shuffle_ps(s, s, 0xfa); td1 = _mm_mul_ps(td1, d1); td2 = _mm_mul_ps(td2, d2); cd1 = _mm_addsub_ps(cd1, td1); cd2 = _mm_addsub_ps(cd2, td2); // store chirped values _mm_stream_ps(chirped, cd1); _mm_stream_ps(chirped+4, cd2); } _mm_sfence(); // handle tail elements with scalar code for ( ; i < ul_NumDataPoints; ++i) { double angle = srate * i * i * 0.5; double s = sin(angle); double c = cos(angle); float re = cx_DataArray[i][0]; float im = cx_DataArray[i][1]; cx_ChirpDataArray[i][0] = re * c - im * s; cx_ChirpDataArray[i][1] = re * s + im * c; } analysis_state.FLOP_counter+=12.0*ul_NumDataPoints; return 0; }
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out) { const dt_iop_colorout_data_t *const d = (dt_iop_colorout_data_t *)piece->data; const int ch = piece->colors; const int gamutcheck = (d->softproof_enabled == DT_SOFTPROOF_GAMUTCHECK); if(!isnan(d->cmatrix[0])) { //fprintf(stderr,"Using cmatrix codepath\n"); // convert to rgb using matrix #ifdef _OPENMP #pragma omp parallel for schedule(static) default(none) shared(roi_in,roi_out, ivoid, ovoid) #endif for(int j=0; j<roi_out->height; j++) { float *in = (float*)ivoid + ch*roi_in->width *j; float *out = (float*)ovoid + ch*roi_out->width*j; const __m128 m0 = _mm_set_ps(0.0f,d->cmatrix[6],d->cmatrix[3],d->cmatrix[0]); const __m128 m1 = _mm_set_ps(0.0f,d->cmatrix[7],d->cmatrix[4],d->cmatrix[1]); const __m128 m2 = _mm_set_ps(0.0f,d->cmatrix[8],d->cmatrix[5],d->cmatrix[2]); for(int i=0; i<roi_out->width; i++, in+=ch, out+=ch ) { const __m128 xyz = dt_Lab_to_XYZ_SSE(_mm_load_ps(in)); const __m128 t = _mm_add_ps(_mm_mul_ps(m0,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(0,0,0,0))),_mm_add_ps(_mm_mul_ps(m1,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(1,1,1,1))),_mm_mul_ps(m2,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(2,2,2,2))))); _mm_stream_ps(out,t); } } _mm_sfence(); // apply profile #ifdef _OPENMP #pragma omp parallel for schedule(static) default(none) shared(roi_in,roi_out, ivoid, ovoid) #endif for(int j=0; j<roi_out->height; j++) { float *in = (float*)ivoid + ch*roi_in->width *j; float *out = (float*)ovoid + ch*roi_out->width*j; for(int i=0; i<roi_out->width; i++, in+=ch, out+=ch ) { for(int i=0; i<3; i++) if (d->lut[i][0] >= 0.0f) { out[i] = (out[i] < 1.0f) ? lerp_lut(d->lut[i], out[i]) : dt_iop_eval_exp(d->unbounded_coeffs[i], out[i]); } } } } else { float *in = (float*)ivoid; float *out = (float*)ovoid; const int rowsize=roi_out->width * 3; //fprintf(stderr,"Using xform codepath\n"); #ifdef _OPENMP #pragma omp parallel for schedule(static) default(none) shared(out, roi_out, in) #endif for (int k=0; k<roi_out->height; k++) { float Lab[rowsize]; float rgb[rowsize]; const int m=(k*(roi_out->width*ch)); for (int l=0; l<roi_out->width; l++) { int li=3*l,ii=ch*l; Lab[li+0] = in[m+ii+0]; Lab[li+1] = in[m+ii+1]; Lab[li+2] = in[m+ii+2]; } cmsDoTransform (d->xform, Lab, rgb, roi_out->width); for (int l=0; l<roi_out->width; l++) { int oi=ch*l, ri=3*l; if(gamutcheck && (rgb[ri+0] < 0.0f || rgb[ri+1] < 0.0f || rgb[ri+2] < 0.0f)) { out[m+oi+0] = 0.0f; out[m+oi+1] = 1.0f; out[m+oi+2] = 1.0f; } else { out[m+oi+0] = rgb[ri+0]; out[m+oi+1] = rgb[ri+1]; out[m+oi+2] = rgb[ri+2]; } } } } if(piece->pipe->mask_display) dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height); }
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid,
              const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  const int filters = dt_image_flipped_filter(&piece->pipe->image);
  dt_iop_temperature_data_t *d = (dt_iop_temperature_data_t *)piece->data;

  if(!dt_dev_pixelpipe_uses_downsampled_input(piece->pipe) && filters && piece->pipe->image.bpp != 4)
  {
    const float coeffsi[3] = {d->coeffs[0]/65535.0f, d->coeffs[1]/65535.0f, d->coeffs[2]/65535.0f};
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(roi_out, ivoid, ovoid, d) schedule(static)
#endif
    for(int j=0; j<roi_out->height; j++)
    {
      int i = 0;
      const uint16_t *in = ((uint16_t *)ivoid) + j*roi_out->width;
      float *out = ((float *)ovoid) + j*roi_out->width;

      // process unaligned pixels
      for( ; i < ((4-(j*roi_out->width & 3)) & 3); i++, out++, in++)
        *out = *in * coeffsi[FC(j+roi_out->y, i+roi_out->x, filters)];

      const __m128 coeffs = _mm_set_ps(coeffsi[FC(j+roi_out->y, roi_out->x+i+3, filters)],
                                       coeffsi[FC(j+roi_out->y, roi_out->x+i+2, filters)],
                                       coeffsi[FC(j+roi_out->y, roi_out->x+i+1, filters)],
                                       coeffsi[FC(j+roi_out->y, roi_out->x+i,   filters)]);

      // process aligned pixels with SSE
      for( ; i < roi_out->width - 3; i+=4, out+=4, in+=4)
      {
        _mm_stream_ps(out, _mm_mul_ps(coeffs, _mm_set_ps(in[3], in[2], in[1], in[0])));
      }

      // process the rest
      for( ; i<roi_out->width; i++, out++, in++)
        *out = *in * coeffsi[FC(j+roi_out->y, i+roi_out->x, filters)];
    }
    _mm_sfence();
  }
  else if(!dt_dev_pixelpipe_uses_downsampled_input(piece->pipe) && filters && piece->pipe->image.bpp == 4)
  {
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(roi_out, ivoid, ovoid, d) schedule(static)
#endif
    for(int j=0; j<roi_out->height; j++)
    {
      const float *in = ((float *)ivoid) + j*roi_out->width;
      float *out = ((float *)ovoid) + j*roi_out->width;
      // FC() takes (row, col): pass the y offset with j and the x offset with i,
      // matching the other branches above
      for(int i=0; i<roi_out->width; i++, out++, in++)
        *out = *in * d->coeffs[FC(j+roi_out->y, i+roi_out->x, filters)];
    }
  }
  else
  {
    const int ch = piece->colors;
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(roi_out, ivoid, ovoid, d) schedule(static)
#endif
    for(int k=0; k<roi_out->height; k++)
    {
      const float *in = ((float *)ivoid) + ch*k*roi_out->width;
      float *out = ((float *)ovoid) + ch*k*roi_out->width;
      for(int j=0; j<roi_out->width; j++, in+=ch, out+=ch)
        for(int c=0; c<3; c++) out[c] = in[c]*d->coeffs[c];
    }
  }

  for(int k=0; k<3; k++)
    piece->pipe->processed_maximum[k] = d->coeffs[k] * piece->pipe->processed_maximum[k];
}
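/* Illustrative sketch (hypothetical helper, not darktable code): the raw branch above uses
   the classic peel / SSE body / scalar tail structure.  A scalar prologue runs until the
   output pointer is 16-byte aligned so the body can use aligned streaming stores, a SIMD
   body handles four elements at a time, and a scalar tail finishes the remainder.  One
   _mm_sfence() after the loop makes the non-temporal stores globally visible.  The same
   structure applied to a plain scale-by-constant loop: */
#include <stddef.h>
#include <stdint.h>
#include <xmmintrin.h>

void scale_nt(float *out, const float *in, size_t n, float a)
{
  size_t i = 0;

  /* peel: advance scalar until `out` is 16-byte aligned */
  for(; i < n && ((uintptr_t)(out + i) & 15) != 0; i++)
    out[i] = in[i] * a;

  /* aligned SIMD body with non-temporal stores (input loaded unaligned to stay general) */
  const __m128 va = _mm_set1_ps(a);
  for(; i + 4 <= n; i += 4)
    _mm_stream_ps(out + i, _mm_mul_ps(va, _mm_loadu_ps(in + i)));

  /* scalar tail */
  for(; i < n; i++)
    out[i] = in[i] * a;

  _mm_sfence();   /* flush write-combining buffers before returning */
}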
/* Insert a key-value entry into a hash table. */
int clht_put(clht_t* h, clht_addr_t key, clht_val_t val)
{
  clht_hashtable_t* hashtable = h->ht;
  size_t bin = clht_hash(hashtable, key);
  volatile bucket_t* bucket = hashtable->table + bin;

#if CLHT_READ_ONLY_FAIL == 1
  if (bucket_exists(bucket, key))
    {
      return false;
    }
#endif

  clht_lock_t* lock = &bucket->lock;
  while (!LOCK_ACQ(lock, hashtable))
    {
      hashtable = h->ht;
      size_t bin = clht_hash(hashtable, key);
      bucket = hashtable->table + bin;
      lock = &bucket->lock;
    }

  CLHT_GC_HT_VERSION_USED(hashtable);
  CLHT_CHECK_STATUS(h);
  clht_addr_t* empty = NULL;
  clht_val_t* empty_v = NULL;

  uint32_t j;
  do
    {
      for (j = 0; j < ENTRIES_PER_BUCKET; j++)
        {
          if (bucket->key[j] == key)
            {
              LOCK_RLS(lock);
              return false;
            }
          else if (empty == NULL && bucket->key[j] == 0)
            {
              empty = (clht_addr_t*) &bucket->key[j];
              empty_v = &bucket->val[j];
            }
        }

      int resize = 0;
      if (likely(bucket->next == NULL))
        {
          if (unlikely(empty == NULL))
            {
              DPP(put_num_failed_expand);
              bucket_t* b = clht_bucket_create_stats(hashtable, &resize);
              b->val[0] = val;
#ifdef __tile__
              /* keep the writes in order */
              _mm_sfence();
#endif
              b->key[0] = key;
#ifdef __tile__
              /* make sure they are visible */
              _mm_sfence();
#endif
              bucket->next = b;
            }
          else
            {
              *empty_v = val;
#ifdef __tile__
              /* keep the writes in order */
              _mm_sfence();
#endif
              *empty = key;
            }

          LOCK_RLS(lock);
          if (unlikely(resize))
            {
              /* ht_resize_pes(h, 1); */
              ht_status(h, 1, 0);
            }
          return true;
        }
      bucket = bucket->next;
    }
  while (true);
}
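/* Illustrative sketch (hypothetical slot type, not the CLHT API): the put path above
   publishes an entry by writing the value first and the key second, with a store fence
   in between on architectures that may reorder stores (here guarded by __tile__; x86's
   TSO model already keeps stores in program order).  A reader that observes the key is
   then guaranteed to observe the matching value.  Minimal version of that pattern: */
#include <stdint.h>
#include <xmmintrin.h>

typedef struct {
    volatile uint64_t key;   /* 0 means "slot empty" */
    volatile uint64_t val;
} slot_t;

static void slot_publish(slot_t* s, uint64_t key, uint64_t val)
{
    s->val = val;       /* 1. make the payload visible first                        */
    _mm_sfence();       /* 2. order the two stores (redundant on x86, required on   */
                        /*    weakly ordered machines)                              */
    s->key = key;       /* 3. publishing the key makes the slot "live"              */
}

static int slot_lookup(const slot_t* s, uint64_t key, uint64_t* val_out)
{
    /* On weakly ordered machines the reader would also need a matching load barrier
       between these two accesses; on x86 load-load order is preserved. */
    if (s->key != key) return 0;
    *val_out = s->val;
    return 1;
}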
void __cdecl VDFastMemcpyFinishMMX2() {
	_mm_empty();
	_mm_sfence();
}
void process_sse2(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid,
                  void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out)
{
  const dt_iop_rawprepare_data_t *const d = (dt_iop_rawprepare_data_t *)piece->data;

  // fprintf(stderr, "roi in %d %d %d %d\n", roi_in->x, roi_in->y, roi_in->width, roi_in->height);
  // fprintf(stderr, "roi out %d %d %d %d\n", roi_out->x, roi_out->y, roi_out->width, roi_out->height);

  const float scale = roi_in->scale / piece->iscale;
  const int csx = (int)roundf((float)d->x * scale), csy = (int)roundf((float)d->y * scale);

  if(!dt_dev_pixelpipe_uses_downsampled_input(piece->pipe) && piece->pipe->filters)
  { // raw mosaic

#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static)
#endif
    for(int j = 0; j < roi_out->height; j++)
    {
      const uint16_t *in = ((uint16_t *)ivoid) + ((size_t)roi_in->width * (j + csy) + csx);
      float *out = ((float *)ovoid) + (size_t)roi_out->width * j;

      int i = 0;

      // FIXME: figure alignment! !!! replace with for !!!
      while((!dt_is_aligned(in, 16) || !dt_is_aligned(out, 16)) && (i < roi_out->width))
      {
        const int id = BL(roi_out, d, j, i);
        *out = (((float)(*in)) - d->sub[id]) / d->div[id];
        i++;
        in++;
        out++;
      }

      const __m128 sub = _mm_set_ps(d->sub[BL(roi_out, d, j, i + 3)], d->sub[BL(roi_out, d, j, i + 2)],
                                    d->sub[BL(roi_out, d, j, i + 1)], d->sub[BL(roi_out, d, j, i)]);

      const __m128 div = _mm_set_ps(d->div[BL(roi_out, d, j, i + 3)], d->div[BL(roi_out, d, j, i + 2)],
                                    d->div[BL(roi_out, d, j, i + 1)], d->div[BL(roi_out, d, j, i)]);

      // process aligned pixels with SSE
      for(; i < roi_out->width - (8 - 1); i += 8, in += 8)
      {
        const __m128i input = _mm_load_si128((__m128i *)in);

        __m128i ilo = _mm_unpacklo_epi16(input, _mm_set1_epi16(0));
        __m128i ihi = _mm_unpackhi_epi16(input, _mm_set1_epi16(0));

        __m128 flo = _mm_cvtepi32_ps(ilo);
        __m128 fhi = _mm_cvtepi32_ps(ihi);

        flo = _mm_div_ps(_mm_sub_ps(flo, sub), div);
        fhi = _mm_div_ps(_mm_sub_ps(fhi, sub), div);

        _mm_stream_ps(out, flo);
        out += 4;
        _mm_stream_ps(out, fhi);
        out += 4;
      }

      // process the rest
      for(; i < roi_out->width; i++, in++, out++)
      {
        const int id = BL(roi_out, d, j, i);
        *out = MAX(0.0f, ((float)(*in)) - d->sub[id]) / d->div[id];
      }
    }

    piece->pipe->filters = dt_rawspeed_crop_dcraw_filters(self->dev->image_storage.filters, csx, csy);
    adjust_xtrans_filters(piece->pipe, csx, csy);
  }
  else
  { // pre-downsampled buffer that needs black/white scaling

    const __m128 sub = _mm_load_ps(d->sub), div = _mm_load_ps(d->div);

#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static)
#endif
    for(int j = 0; j < roi_out->height; j++)
    {
      const float *in = ((float *)ivoid) + (size_t)4 * (roi_in->width * (j + csy) + csx);
      float *out = ((float *)ovoid) + (size_t)4 * roi_out->width * j;

      // process aligned pixels with SSE
      for(int i = 0; i < roi_out->width; i++, in += 4, out += 4)
      {
        const __m128 input = _mm_load_ps(in);
        const __m128 scaled = _mm_div_ps(_mm_sub_ps(input, sub), div);
        _mm_stream_ps(out, scaled);
      }
    }
  }

  _mm_sfence();
}
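/* Illustrative sketch (standalone helper, not the darktable API): the inner loop above widens
   eight unsigned 16-bit samples to two float vectors by interleaving them with zero
   (_mm_unpacklo/hi_epi16) and converting the resulting 32-bit integers with _mm_cvtepi32_ps.
   Interleaving with zero is what makes the conversion unsigned-safe: the sign bit of every
   32-bit lane stays 0.  The same idea in isolation: */
#include <emmintrin.h>
#include <stdint.h>

/* Convert 8 uint16 values (16-byte aligned) to 8 floats (16-byte aligned). */
static void u16_to_f32_x8(const uint16_t *src, float *dst)
{
  const __m128i zero = _mm_setzero_si128();
  const __m128i v    = _mm_load_si128((const __m128i *)src);

  /* zero-extend the 16-bit lanes to 32-bit lanes */
  const __m128i lo = _mm_unpacklo_epi16(v, zero);
  const __m128i hi = _mm_unpackhi_epi16(v, zero);

  _mm_store_ps(dst,     _mm_cvtepi32_ps(lo));
  _mm_store_ps(dst + 4, _mm_cvtepi32_ps(hi));
}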