namespace FPURoundMode
{
// Get the default SSE states here.
static u32 saved_sse_state = _mm_getcsr();
static const u32 default_sse_state = _mm_getcsr();

void SetRoundMode(int mode)
{
  // Convert PowerPC to native rounding mode.
  static const int rounding_mode_lut[] = {FE_TONEAREST, FE_TOWARDZERO, FE_UPWARD, FE_DOWNWARD};
  fesetround(rounding_mode_lut[mode]);
}

void SetPrecisionMode(PrecisionMode /* mode */)
{
  // x64 doesn't need this - FPU is done with SSE.
}

void SetSIMDMode(int rounding_mode, bool non_ieee_mode)
{
  // OR-mask for disabling FPU exceptions (bits 7-12 in the MXCSR register)
  const u32 EXCEPTION_MASK = 0x1F80;
  // Flush-To-Zero (non-IEEE mode: denormal outputs are set to +/- 0)
  const u32 FTZ = 0x8000;

  // Lookup table for FPSCR.RN-to-MXCSR.RC translation
  // (MXCSR.RC encoding: 0 = nearest, 1 = -inf, 2 = +inf, 3 = toward zero).
  static const u32 simd_rounding_table[] = {
      (0 << 13) | EXCEPTION_MASK,  // nearest
      (3 << 13) | EXCEPTION_MASK,  // zero
      (2 << 13) | EXCEPTION_MASK,  // +inf
      (1 << 13) | EXCEPTION_MASK,  // -inf
  };

  u32 csr = simd_rounding_table[rounding_mode];
  if (non_ieee_mode)
  {
    csr |= FTZ;
  }
  _mm_setcsr(csr);
}

void SaveSIMDState()
{
  saved_sse_state = _mm_getcsr();
}
void LoadSIMDState()
{
  _mm_setcsr(saved_sse_state);
}
void LoadDefaultSIMDState()
{
  _mm_setcsr(default_sse_state);
}
}  // namespace FPURoundMode
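// A minimal standalone check of the FPSCR.RN -> MXCSR.RC mapping used above
// (a hypothetical test harness, not part of the emulator): cvtss2si rounds
// according to MXCSR.RC, so 1.5f/-1.5f convert differently per mode.
#include <xmmintrin.h>
#include <cstdio>

int main()
{
    // Same order as simd_rounding_table: PPC RN 0..3 = nearest, zero, +inf, -inf.
    const unsigned int rc[4] = {0u << 13, 3u << 13, 2u << 13, 1u << 13};
    const char* names[4] = {"nearest", "zero", "+inf", "-inf"};
    const unsigned int saved = _mm_getcsr();
    for (int mode = 0; mode < 4; ++mode)
    {
        _mm_setcsr((saved & ~0x6000u) | rc[mode]);  // RC lives in bits 13-14
        std::printf("%-7s: 1.5f -> %d, -1.5f -> %d\n", names[mode],
                    _mm_cvtss_si32(_mm_set_ss(1.5f)),
                    _mm_cvtss_si32(_mm_set_ss(-1.5f)));
    }
    _mm_setcsr(saved);  // restore the caller's MXCSR
    return 0;
}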
FlushToZero::FlushToZero()
{
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
  // Save the current control word first (_controlfp_s writes the *resulting*
  // control word, so read with an empty mask before setting _DN_FLUSH).
  _controlfp_s(&previous_state, 0, 0);
  unsigned int unused;
  _controlfp_s(&unused, _DN_FLUSH, _MCW_DN);
#elif defined(__APPLE__)
  fegetenv(&previous_state);
  fesetenv(FE_DFL_DISABLE_SSE_DENORMS_ENV);
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
  // Save only the DAZ bit, then enable denormals-are-zero.
  previous_state = _mm_getcsr() & _MM_DENORMALS_ZERO_MASK;
  _mm_setcsr(_mm_getcsr() | _MM_DENORMALS_ZERO_ON);
#endif
}
RTCORE_API void rtcCommitThread(RTCScene hscene, unsigned int threadID, unsigned int numThreads)
{
  Scene* scene = (Scene*) hscene;
  RTCORE_CATCH_BEGIN;
  RTCORE_TRACE(rtcCommitThread);
  RTCORE_VERIFY_HANDLE(hscene);

  if (unlikely(numThreads == 0))
    throw_RTCError(RTC_INVALID_OPERATION,"invalid number of threads specified");
#if defined(__MIC__)
  if (unlikely(numThreads % 4 != 0 && numThreads != 1))
    throw_RTCError(RTC_INVALID_OPERATION,"MIC requires numThreads % 4 == 0 in rtcCommitThread");
#endif

  /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */
#if !defined(__MIC__)
  unsigned int mxcsr = _mm_getcsr();
  _mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6));
#endif

  /* perform scene build */
  scene->build(threadID,numThreads);

  /* reset MXCSR register again */
#if !defined(__MIC__)
  _mm_setcsr(mxcsr);
#endif

  RTCORE_CATCH_END(scene->device);
}
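// The manual save/set/restore bracket above is a recurring pattern; a scoped
// RAII helper (a sketch, not part of the Embree API) makes the restore
// automatic and exception-safe:
#include <xmmintrin.h>

struct ScopedFtzDaz
{
    unsigned int saved;
    ScopedFtzDaz() : saved(_mm_getcsr()) { _mm_setcsr(saved | (1u << 15) | (1u << 6)); }
    ~ScopedFtzDaz() { _mm_setcsr(saved); }  // runs even if the build throws
};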
///@todo Combine this with QueueDraw
void QueueDispatch(SWR_CONTEXT *pContext)
{
    _ReadWriteBarrier();

    pContext->DrawEnqueued++;

    if (KNOB_SINGLE_THREADED)
    {
        // flush denormals to 0
        uint32_t mxcsr = _mm_getcsr();
        _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);

        WorkOnCompute(pContext, 0, pContext->WorkerBE[0]);

        // restore csr
        _mm_setcsr(mxcsr);
    }
    else
    {
        RDTSC_START(APIDrawWakeAllThreads);
        WakeAllThreads(pContext);
        RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
    }

    // Set current draw context to NULL so that the next state call forces a new draw context to be created and populated.
    pContext->pPrevDrawContext = pContext->pCurDrawContext;
    pContext->pCurDrawContext = nullptr;
}
INNER void disable_denormals()
{
#if __SSE2__
    _mm_setcsr(_mm_getcsr() | 0x8040);
#endif
}
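// The same constant spelled with the named macros from <xmmintrin.h> (FTZ)
// and <pmmintrin.h> (DAZ); a sketch of an equivalent form, not the original:
#include <xmmintrin.h>
#include <pmmintrin.h>
static_assert((_MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON) == 0x8040,
              "0x8040 sets FTZ (bit 15) and DAZ (bit 6)");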
static void* threadStartup(ThreadStartupData* parg)
{
  _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6));
  parg->f(parg->arg);
  delete parg;
  return nullptr;
}
void MaterialRenderer::renderFrame(const Ref<Camera>& camera, const Ref<BackendScene>& scene, Ref<Film>& film)
{
  /*! flush to zero and no denormals */
  _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6));

  /*! precompute some values */
  numTilesX = ((int)film->width +TILE_SIZE_X-1)/TILE_SIZE_X;
  numTilesY = ((int)film->height+TILE_SIZE_Y-1)/TILE_SIZE_Y;
  numTiles = numTilesX * numTilesY;
  rcpWidth  = 1.0f/float(film->width);
  rcpHeight = 1.0f/float(film->height);

  /*! render frame */
  double t = getSeconds();
  this->tileID = 0;
  this->atomicNumRays = 0;
  this->camera = camera;
  this->scene = scene;
  this->film = film;
  scheduler->addTask((Task::runFunction)&run_renderThread,this,scheduler->getNumThreads());
  scheduler->go();
  this->camera = null;
  this->scene = null;
  this->film = null;
  double dt = getSeconds()-t;

  /*! print framerate */
  std::cout << "MATERIAL RENDERED : " << 1.0f/dt << " fps, " << dt*1000.0f << " ms, " << atomicNumRays/dt*1E-6 << " Mrps" << std::endl;
}
void CAllPassFilterPair::processBlock(float* data, int numSamples)
{
    jassert((((size_t) data) & 0xF) == 0);
    jassert((_mm_getcsr() & 0x8040) == 0x8040);

    __m128 coeff = _mm_load_ps(mf.getPtr(0));
    __m128 x1 = _mm_load_ps(mf.getPtr(1));
    __m128 x2 = _mm_load_ps(mf.getPtr(2));
    __m128 y1 = _mm_load_ps(mf.getPtr(3));
    __m128 y2 = _mm_load_ps(mf.getPtr(4));

    for (int i=0; i<numSamples; ++i)
    {
        __m128 x0 = _mm_load_ps(&(data[4*i]));
        __m128 tmp = _mm_sub_ps(x0, y2);
        tmp = _mm_mul_ps(tmp, coeff);
        __m128 y0 = _mm_add_ps(x2, tmp);
        _mm_store_ps(&(data[4*i]), y0);

        x2=x1; x1=x0;
        y2=y1; y1=y0;
    }

    _mm_store_ps(mf.getPtr(1), x1);
    _mm_store_ps(mf.getPtr(2), x2);
    _mm_store_ps(mf.getPtr(3), y1);
    _mm_store_ps(mf.getPtr(4), y2);
}
void SuperSpreadAudioProcessor::processBlock (AudioSampleBuffer& buffer, MidiBuffer& /*midiMessages*/)
{
    unsigned int csr = _mm_getcsr();
    _mm_setcsr(csr | 0x8040);

    AudioProcessorParameter* mixParam = parameterState->getParameter("Mix");
    const NormalisableRange<float> mixRange(parameterState->getParameterRange("Mix"));

    const float spread0 = parameterState->getParameter("Spread")->getValue();
    const float mix = mixRange.convertFrom0to1(mixParam->getValue());
    const float detuneFade = jmin(spread0/0.1f, 1.f);
    const float detunedGain = mix >= 100.f ? 1.f : mix / 100.f;
    const float dryGain = mix <= 100.f ? 1.f
                        : detuneFade < 1.f ? jmax(0.5f * (1.f - detuneFade), (200.f - mix) / 100.f)
                                           : (200.f - mix) / 100.f;
    const float spreadGain = detunedGain * detuneFade;
    const float spread = 0.5f * spread0*spread0;

    const int numChannels = buffer.getNumChannels();
    const int numSamples = buffer.getNumSamples();

    float* chL = buffer.getWritePointer(0);
    float* chR = numChannels == 2 ? buffer.getWritePointer(1) : nullptr;

    for (int i=0; i<12 / 2; ++i)
    {
        pitchBuffer.copyFrom(i, 0, chL, numSamples);
        if (chR != nullptr)
            pitchBuffer.copyFrom(6 + i, 0, chR, numSamples);
    }

    mainDelay.processBlock(chL, chR, numSamples);
    buffer.applyGain(dryGain);

    const float maxPitches[6] = {0.893f, 0.939f, 0.98f, 1.02f, 1.064f, 1.11f};

    for (int i=0; i<6; ++i)
    {
        shifter[i]->setPitch(std::pow(maxPitches[i], spread));
        shifter[i+6]->setPitch(std::pow(1.f / maxPitches[i], spread));

        float* procL = pitchBuffer.getWritePointer(i);
        float* procR = pitchBuffer.getWritePointer(i+6);

        shifter[i]->processBlock(procL, numSamples);
        buffer.addFrom(0, 0, procL, numSamples, spreadGain/* * gain*/);

        if (numChannels == 2)
        {
            shifter[i+6]->processBlock(procR, numSamples);
            buffer.addFrom(1, 0, procR, numSamples, spreadGain/* * gain*/);
        }
    }

    const float totalGain = spreadGain == 0.f ? 1.f : 1.41f / (1.f + std::sqrt(6.f) * spreadGain);
    buffer.applyGain(totalGain);

    _mm_setcsr(csr);
}
void nova_server::prepare_backend(void)
{
    /* register audio backend ports */
    const int blocksize = get_audio_blocksize();
    const int input_channels = get_input_count();
    const int output_channels = get_output_count();

    std::vector<sample*> inputs, outputs;
    for (int channel = 0; channel != input_channels; ++channel)
        inputs.push_back(sc_factory->world.mAudioBus + (blocksize * (output_channels + channel)));
    audio_backend::input_mapping(inputs.begin(), inputs.end());

    for (int channel = 0; channel != output_channels; ++channel)
        outputs.push_back(sc_factory->world.mAudioBus + blocksize * channel);
    audio_backend::output_mapping(outputs.begin(), outputs.end());

#ifdef __SSE__
    /* denormal handling */
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
    _mm_setcsr(_mm_getcsr() | 0x40);
#endif

    time_per_tick = time_tag::from_samples(blocksize, get_samplerate());
}
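// The raw "| 0x40" above sets the DAZ bit; where the SSE3 header is
// available, the same write has a named spelling (a sketch, equivalent in
// effect to the line above):
#include <pmmintrin.h>

static void set_denormals_are_zero(void)
{
    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);  // same as |= 0x40
}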
void Traverso::setup_fpu()
{
	// export TRAVERSO_RUNNING_UNDER_VALGRIND to disable assembler stuff below!
	if (getenv("TRAVERSO_RUNNING_UNDER_VALGRIND")) {
		printf("TRAVERSO_RUNNING_UNDER_VALGRIND=TRUE\n");
		// valgrind doesn't understand this assembler stuff
		// September 10th, 2007
		return;
	}

#if (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(USE_XMMINTRIN)
	int MXCSR;
	FPU fpu;

	/* XXX use real code to determine if the processor supports
	   DenormalsAreZero and FlushToZero */
	if (!fpu.has_flush_to_zero() && !fpu.has_denormals_are_zero()) {
		return;
	}

	MXCSR = _mm_getcsr();

/*	switch (Config->get_denormal_model()) {
	case DenormalNone:
		MXCSR &= ~(_MM_FLUSH_ZERO_ON|0x0040);
		break;

	case DenormalFTZ:
		if (fpu.has_flush_to_zero()) {
			MXCSR |= _MM_FLUSH_ZERO_ON;
		}
		break;

	case DenormalDAZ:*/
	MXCSR &= ~_MM_FLUSH_ZERO_ON;
	if (fpu.has_denormals_are_zero()) {
		MXCSR |= 0x0040; /* DAZ is MXCSR bit 6 (0x40); FTZ is bit 15 (0x8000) */
	}
//		break;
//
//	case DenormalFTZDAZ:
//		if (fpu.has_flush_to_zero()) {
//			if (fpu.has_denormals_are_zero()) {
//				MXCSR |= _MM_FLUSH_ZERO_ON | 0x0040;
//			} else {
//				MXCSR |= _MM_FLUSH_ZERO_ON;
//			}
//		}
//		break;
//	}

	_mm_setcsr(MXCSR);
#endif
}
int mwDisableDenormalsSSE(void)
{
    int oldMXCSR = _mm_getcsr();
    int newMXCSR = oldMXCSR | 0x8040;
    _mm_setcsr(newMXCSR);

    mw_printf("Disabled denormals\n");
    return oldMXCSR;
}
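/* Typical use of the function above: bracket a denormal-heavy region, then
   restore the caller's control word (a sketch; runHotLoop is a hypothetical
   workload, the restore is a plain _mm_setcsr): */
#include <xmmintrin.h>

static void runBracketed(void (*runHotLoop)(void))
{
    int oldMXCSR = mwDisableDenormalsSSE();
    runHotLoop();            /* work runs with FTZ/DAZ enabled */
    _mm_setcsr(oldMXCSR);    /* put FTZ/DAZ back exactly as found */
}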
void Client::thread_init ( void *arg )
{
#ifdef __SSE2_MATH__
    /* set FTZ and DAZ flags */
    _mm_setcsr(_mm_getcsr() | 0x8040);
#endif

    ((Client*)arg)->thread_init();
}
void g() {
  (void)_mm_getcsr();
  _mm_setcsr(1);
  _mm_sfence();
  _mm_clflush((void*)0);
  _mm_lfence();
  _mm_mfence();
  _mm_pause();
}
FlushToZero::~FlushToZero()
{
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
  unsigned int new_state;
  _controlfp_s(&new_state, previous_state, _MCW_DN);
#elif defined(__APPLE__)
  fesetenv(&previous_state);
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
  // Restore the DAZ bit saved in the constructor rather than clearing it
  // unconditionally, so nested scopes unwind correctly.
  _mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | previous_state);
#endif
}
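// Scoped usage of the constructor/destructor pair above (a sketch): DAZ is
// active only for the lifetime of the guard, and nesting works because each
// guard restores exactly the bit it saved.
void processWithDenormalsFlushed()
{
    FlushToZero guard;
    // ... arithmetic that would otherwise crawl through subnormals ...
}   // ~FlushToZero() restores the previous denormal mode here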
double _sin_cos_special(double x, char *name)
{
    UT64 xu;
    unsigned int is_snan;

    xu.f64 = x;

    if((xu.u64 & EXPBITS_DP64) == EXPBITS_DP64)
    {
        // x is Inf or NaN
        if((xu.u64 & MANTBITS_DP64) == 0x0)
        {
            // x is Inf
            _mm_setcsr(_mm_getcsr() | MXCSR_ES_INVALID);
#ifdef WIN64
            xu.u64 = INDEFBITPATT_DP64;
            __amd_handle_error(DOMAIN, EDOM, name, x, 0, xu.f64);
#else
            xu.u64 = QNANBITPATT_DP64;
            name = *(&name); // dummy statement to avoid warning
#endif
        }
        else
        {
            // x is NaN
            is_snan = (((xu.u64 & QNAN_MASK_64) == QNAN_MASK_64) ? 0 : 1);
            if(is_snan)
            {
                xu.u64 |= QNAN_MASK_64;
#ifndef WIN64
                _mm_setcsr(_mm_getcsr() | MXCSR_ES_INVALID);
#endif
            }
#ifdef WIN64
            __amd_handle_error(DOMAIN, EDOM, name, x, 0, xu.f64);
#endif
        }
    }

    return xu.f64;
}
DLLEXPORT uint8_t jl_zero_denormals(uint8_t isZero)
{
#ifdef __SSE2__
    // SSE2 supports both FZ and DAZ
    uint32_t flags = 0x8040;
#elif __SSE__
    // SSE supports only the FZ flag
    uint32_t flags = 0x8000;
#endif

#ifdef __SSE__
    if (isZero) {
        _mm_setcsr(_mm_getcsr() | flags);
    }
    else {
        _mm_setcsr(_mm_getcsr() & ~flags);
    }
    return 1;
#else
    return 0;
#endif
}
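/* A quick behavioral check of the toggle above (a hypothetical driver, not
   part of the runtime): with FTZ/DAZ enabled, the smallest subnormal float
   behaves as zero in arithmetic. */
#include <stdio.h>
#include <stdint.h>
#include <float.h>     /* FLT_TRUE_MIN (C11) */

uint8_t jl_zero_denormals(uint8_t isZero);  /* defined above */

int main(void)
{
    volatile float tiny = FLT_TRUE_MIN;  /* smallest subnormal float */
    jl_zero_denormals(1);
    printf("FTZ/DAZ on : tiny * 1.0f == 0 ? %d\n", tiny * 1.0f == 0.0f);
    jl_zero_denormals(0);
    printf("FTZ/DAZ off: tiny * 1.0f == 0 ? %d\n", tiny * 1.0f == 0.0f);
    return 0;
}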
static void* threadStartup(ThreadStartupData* parg)
{
  _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6));

#if !defined(__LINUX__) || defined(__MIC__)
  if (parg->affinity >= 0)
    setAffinity(parg->affinity);
#endif

  parg->f(parg->arg);
  delete parg;
  return NULL;
}
/**
 * Fetches the contents of the fpstate (mxcsr on x86) register.
 *
 * On platforms without support for it just returns 0.
 */
unsigned
util_fpstate_get(void)
{
   unsigned mxcsr = 0;

#if defined(PIPE_ARCH_SSE)
   if (util_cpu_caps.has_sse) {
      mxcsr = _mm_getcsr();
   }
#endif

   return mxcsr;
}
void
FC_FUNC_(force_ftz,FORCE_FTZ)()
{
#ifdef FORCE_FTZ
  unsigned int x;
  /* force FTZ by setting bits 11 and 15 to one */
  x = _mm_getcsr();
  x |= (1 << FTZ_BIT);
  x |= (1 << UNDERFLOW_EXCEPTION_MASK);
  _mm_setcsr(x);
#endif
}
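/* Presumed definitions of the two bit indices used above (not shown in this
   snippet; the comment "bits 11 and 15" pins them down). FTZ only takes
   effect when the underflow exception is masked, hence bit 11: */
#define FTZ_BIT                  15  /* MXCSR flush-to-zero */
#define UNDERFLOW_EXCEPTION_MASK 11  /* MXCSR underflow exception mask */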
float _sinf_cosf_special(float x, char *name)
{
    UT32 xu;
    unsigned int is_snan;

    xu.f32 = x;

    if((xu.u32 & EXPBITS_SP32) == EXPBITS_SP32)
    {
        // x is Inf or NaN
        if((xu.u32 & MANTBITS_SP32) == 0x0)
        {
            // x is Inf
            _mm_setcsr(_mm_getcsr() | MXCSR_ES_INVALID);
#ifdef WIN64
            xu.u32 = INDEFBITPATT_SP32;
            __amd_handle_errorf(DOMAIN, EDOM, name, x, 0, 0.0f, 0, xu.f32, 0);
#else
            xu.u32 = QNANBITPATT_SP32;
            name = *(&name); // dummy statement to avoid warning
#endif
        }
        else
        {
            // x is NaN
            is_snan = (((xu.u32 & QNAN_MASK_32) == QNAN_MASK_32) ? 0 : 1);
            if(is_snan)
            {
                xu.u32 |= QNAN_MASK_32;
                _mm_setcsr(_mm_getcsr() | MXCSR_ES_INVALID);
            }
#ifdef WIN64
            __amd_handle_errorf(DOMAIN, EDOM, name, x, is_snan, 0.0f, 0, xu.f32, 0);
#endif
        }
    }

    return xu.f32;
}
void SysCoreThread::ExecuteTaskInThread()
{
	Threading::EnableHiresScheduler(); // Note that *something* in SPU2-X and GSdx also sets the timer resolution to 1ms.
	m_sem_event.WaitWithoutYield();

	m_mxcsr_saved.bitmask = _mm_getcsr();

	PCSX2_PAGEFAULT_PROTECT {
		while(true) {
			StateCheckInThread();
			DoCpuExecute();
		}
	} PCSX2_PAGEFAULT_EXCEPT;
}
inline T get_smallest_value(mpl::true_ const&)
{
   //
   // numeric_limits lies about denorms being present - particularly
   // when this can be turned on or off at runtime, as is the case
   // when using the SSE2 registers in DAZ or FTZ mode.
   //
   static const T m = std::numeric_limits<T>::denorm_min();
#ifdef BOOST_MATH_CHECK_SSE2
   return (_mm_getcsr() & (_MM_FLUSH_ZERO_ON | 0x40)) ? tools::min_value<T>() : m;
#else
   return ((tools::min_value<T>() / 2) == 0) ? tools::min_value<T>() : m;
#endif
}
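// Standalone illustration of the runtime test above (a sketch, not Boost
// code): when FTZ (0x8000) or DAZ (0x0040) is set in MXCSR, denorm_min() is
// no longer a usable "smallest value", so fall back to min().
#include <xmmintrin.h>
#include <limits>
#include <cstdio>

int main()
{
    _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | 0x40);  // force FTZ+DAZ
    const bool ftz_or_daz = (_mm_getcsr() & (_MM_FLUSH_ZERO_ON | 0x40)) != 0;
    const float smallest = ftz_or_daz ? std::numeric_limits<float>::min()
                                      : std::numeric_limits<float>::denorm_min();
    std::printf("smallest usable value: %g\n", smallest);
}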
static int main( const std::vector<CL_String> &args )
{
	_mm_setcsr( _mm_getcsr() | _MM_FLUSH_ZERO_ON);

//	plane p( plane::dir_zx_p, vec3f( 0.5, 0.5, 0.5 ));
//	return 0;

	try
	{
		ortho o;
		o.start();
	}
	catch( gl_error_exception x )
	{
		std::cerr << x.what() << std::endl;
		std::cerr << "bailing out\n";
	}

	return 0;
}
void f() {
  (void)_mm_getcsr(); // expected-warning{{implicitly declaring library function '_mm_getcsr'}} \
  // expected-note{{include the header <xmmintrin.h> or explicitly provide a declaration for '_mm_getcsr'}}
  _mm_setcsr(1); // expected-warning{{implicitly declaring library function '_mm_setcsr'}} \
  // expected-note{{include the header <xmmintrin.h> or explicitly provide a declaration for '_mm_setcsr'}}
  _mm_sfence(); // expected-warning{{implicitly declaring library function '_mm_sfence'}} \
  // expected-note{{include the header <xmmintrin.h> or explicitly provide a declaration for '_mm_sfence'}}
  _mm_clflush((void*)0); // expected-warning{{implicitly declaring library function '_mm_clflush'}} \
  // expected-note{{include the header <emmintrin.h> or explicitly provide a declaration for '_mm_clflush'}}
  _mm_lfence(); // expected-warning{{implicitly declaring library function '_mm_lfence'}} \
  // expected-note{{include the header <emmintrin.h> or explicitly provide a declaration for '_mm_lfence'}}
  _mm_mfence(); // expected-warning{{implicitly declaring library function '_mm_mfence'}} \
  // expected-note{{include the header <emmintrin.h> or explicitly provide a declaration for '_mm_mfence'}}
  _mm_pause(); // expected-warning{{implicitly declaring library function '_mm_pause'}} \
  // expected-note{{include the header <emmintrin.h> or explicitly provide a declaration for '_mm_pause'}}
}
void dynamicdsp_threadprocess(t_dynamicdsp *x, void **sig_outs, void *temp_mem_ptr, t_ptr_uint temp_mem_size, long vec_size, long thread_num, long num_active_threads)
{
    long num_sig_outs = x->num_sig_outs;

    // Turn off denormals
#if defined( __i386__ ) || defined( __x86_64__ )
    int oldMXCSR = _mm_getcsr();        // read the old MXCSR setting
    _mm_setcsr(oldMXCSR | 0x8040);      // write the new MXCSR setting, setting the DAZ and FZ bits
#endif

    // Zero outputs
    for (long i = 0; i < num_sig_outs; i++)
        memset(sig_outs[i], 0, sig_size * vec_size);

    if (x->manual_threading)
    {
        for (long i = 0; i < x->slots->size(); i++)
            x->slots->processIfThreadMatches(i, temp_mem_ptr, sig_outs, temp_mem_size, thread_num, num_active_threads);
    }
    else
    {
        long size = x->slots->size();
        long index = (thread_num * (size / num_active_threads)) - 1;

        for (long i = 0; i < size; i++)
        {
            if (++index >= size)
                index -= size;

            x->slots->processIfUnprocessed(i, temp_mem_ptr, sig_outs, temp_mem_size);
        }
    }

    // Return denormals to previous state
#if defined( __i386__ ) || defined( __x86_64__ )
    _mm_setcsr(oldMXCSR);
#endif
}
int main(int argc, char **argv){
  _mm_setcsr(_mm_getcsr() | 0x8040);

  int num_frames = atoi(argv[1]); // If we hardcode the value, the compiler is likely to optimize things which it wouldn't have done normally
  //fprintf(stderr, "num_frames: %d\n",num_frames);

  float *data = (float*)malloc(sizeof(float)*num_frames);

  for(int i=0;i<num_frames;i++){
    data[i] = (float)i / (float)num_frames; // Insert some legal values.
  }

  void *resampler = RESAMPLER_create(src_callback, 1, NULL, RESAMPLER_CUBIC);

  int top = 1024*1024*8 * 64 / num_frames;

  for(int i=0 ; i < top ; i ++)
    RESAMPLER_read(resampler, scale(i,0,top,0.1,8.0), num_frames, data);

  //printf("hello\n");

  return 0;
}
void f0() { signed char tmp_c; // unsigned char tmp_Uc; signed short tmp_s; #ifdef USE_ALL unsigned short tmp_Us; #endif signed int tmp_i; unsigned int tmp_Ui; signed long long tmp_LLi; unsigned long long tmp_ULLi; float tmp_f; double tmp_d; void* tmp_vp; const void* tmp_vCp; char* tmp_cp; const char* tmp_cCp; int* tmp_ip; float* tmp_fp; const float* tmp_fCp; double* tmp_dp; const double* tmp_dCp; long long* tmp_LLip; #define imm_i 32 #define imm_i_0_2 0 #define imm_i_0_4 3 #define imm_i_0_8 7 #define imm_i_0_16 15 // Check this. #define imm_i_0_256 0 V2i* tmp_V2ip; V1LLi* tmp_V1LLip; V2LLi* tmp_V2LLip; // 64-bit V8c tmp_V8c; V4s tmp_V4s; V2i tmp_V2i; V1LLi tmp_V1LLi; #ifdef USE_3DNOW V2f tmp_V2f; #endif // 128-bit V16c tmp_V16c; V8s tmp_V8s; V4i tmp_V4i; V2LLi tmp_V2LLi; V4f tmp_V4f; V2d tmp_V2d; V2d* tmp_V2dp; V4f* tmp_V4fp; const V2d* tmp_V2dCp; const V4f* tmp_V4fCp; // 256-bit V32c tmp_V32c; V4d tmp_V4d; V8f tmp_V8f; V4LLi tmp_V4LLi; V8i tmp_V8i; V4LLi* tmp_V4LLip; V4d* tmp_V4dp; V8f* tmp_V8fp; const V4d* tmp_V4dCp; const V8f* tmp_V8fCp; tmp_V2LLi = __builtin_ia32_undef128(); tmp_V4LLi = __builtin_ia32_undef256(); tmp_i = __builtin_ia32_comieq(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_comilt(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_comile(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_comigt(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_comige(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_comineq(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_ucomieq(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_ucomilt(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_ucomile(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_ucomigt(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_ucomige(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_ucomineq(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_comisdeq(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_comisdlt(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_comisdle(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_comisdgt(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_comisdge(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_comisdneq(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_ucomisdeq(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_ucomisdlt(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_ucomisdle(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_ucomisdgt(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_ucomisdge(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_ucomisdneq(tmp_V2d, tmp_V2d); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 0); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 1); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 2); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 3); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 4); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 5); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 6); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 7); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 0); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 1); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 2); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 3); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 4); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 5); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 6); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 7); tmp_V4f = __builtin_ia32_minps(tmp_V4f, tmp_V4f); tmp_V4f = __builtin_ia32_maxps(tmp_V4f, tmp_V4f); tmp_V4f = __builtin_ia32_minss(tmp_V4f, tmp_V4f); tmp_V4f = __builtin_ia32_maxss(tmp_V4f, tmp_V4f); tmp_V8c = __builtin_ia32_paddsb(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_paddsw(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_psubsb(tmp_V8c, tmp_V8c); 
tmp_V4s = __builtin_ia32_psubsw(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_paddusb(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_paddusw(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_psubusb(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_psubusw(tmp_V4s, tmp_V4s); tmp_V4s = __builtin_ia32_pmulhw(tmp_V4s, tmp_V4s); tmp_V4s = __builtin_ia32_pmulhuw(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_pcmpeqb(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_pcmpeqw(tmp_V4s, tmp_V4s); tmp_V2i = __builtin_ia32_pcmpeqd(tmp_V2i, tmp_V2i); tmp_V8c = __builtin_ia32_pcmpgtb(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_pcmpgtw(tmp_V4s, tmp_V4s); tmp_V2i = __builtin_ia32_pcmpgtd(tmp_V2i, tmp_V2i); tmp_V8c = __builtin_ia32_pmaxub(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_pmaxsw(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_pminub(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_pminsw(tmp_V4s, tmp_V4s); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 0); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 1); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 2); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 3); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 4); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 5); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 6); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 7); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 0); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 1); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 2); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 3); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 4); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 5); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 6); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 7); tmp_V2d = __builtin_ia32_minpd(tmp_V2d, tmp_V2d); tmp_V2d = __builtin_ia32_maxpd(tmp_V2d, tmp_V2d); tmp_V2d = __builtin_ia32_minsd(tmp_V2d, tmp_V2d); tmp_V2d = __builtin_ia32_maxsd(tmp_V2d, tmp_V2d); tmp_V16c = __builtin_ia32_paddsb128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_paddsw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_psubsb128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_psubsw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_paddusb128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_paddusw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_psubusb128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_psubusw128(tmp_V8s, tmp_V8s); tmp_V8s = __builtin_ia32_pmulhw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_pmaxub128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_pmaxsw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_pminub128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_pminsw128(tmp_V8s, tmp_V8s); tmp_V8s = __builtin_ia32_packsswb128(tmp_V8s, tmp_V8s); tmp_V4i = __builtin_ia32_packssdw128(tmp_V4i, tmp_V4i); tmp_V8s = __builtin_ia32_packuswb128(tmp_V8s, tmp_V8s); tmp_V8s = __builtin_ia32_pmulhuw128(tmp_V8s, tmp_V8s); tmp_V4f = __builtin_ia32_addsubps(tmp_V4f, tmp_V4f); tmp_V2d = __builtin_ia32_addsubpd(tmp_V2d, tmp_V2d); tmp_V4f = __builtin_ia32_haddps(tmp_V4f, tmp_V4f); tmp_V2d = __builtin_ia32_haddpd(tmp_V2d, tmp_V2d); tmp_V4f = __builtin_ia32_hsubps(tmp_V4f, tmp_V4f); tmp_V2d = __builtin_ia32_hsubpd(tmp_V2d, tmp_V2d); tmp_V8s = __builtin_ia32_phaddw128(tmp_V8s, tmp_V8s); tmp_V4s = __builtin_ia32_phaddw(tmp_V4s, tmp_V4s); tmp_V4i = __builtin_ia32_phaddd128(tmp_V4i, tmp_V4i); tmp_V2i = __builtin_ia32_phaddd(tmp_V2i, tmp_V2i); tmp_V8s = __builtin_ia32_phaddsw128(tmp_V8s, tmp_V8s); tmp_V4s = __builtin_ia32_phaddsw(tmp_V4s, tmp_V4s); tmp_V8s = __builtin_ia32_phsubw128(tmp_V8s, tmp_V8s); 
tmp_V4s = __builtin_ia32_phsubw(tmp_V4s, tmp_V4s); tmp_V4i = __builtin_ia32_phsubd128(tmp_V4i, tmp_V4i); tmp_V2i = __builtin_ia32_phsubd(tmp_V2i, tmp_V2i); tmp_V8s = __builtin_ia32_phsubsw128(tmp_V8s, tmp_V8s); tmp_V4s = __builtin_ia32_phsubsw(tmp_V4s, tmp_V4s); tmp_V16c = __builtin_ia32_pmaddubsw128(tmp_V16c, tmp_V16c); tmp_V8c = __builtin_ia32_pmaddubsw(tmp_V8c, tmp_V8c); tmp_V8s = __builtin_ia32_pmulhrsw128(tmp_V8s, tmp_V8s); tmp_V4s = __builtin_ia32_pmulhrsw(tmp_V4s, tmp_V4s); tmp_V16c = __builtin_ia32_pshufb128(tmp_V16c, tmp_V16c); tmp_V8c = __builtin_ia32_pshufb(tmp_V8c, tmp_V8c); tmp_V16c = __builtin_ia32_psignb128(tmp_V16c, tmp_V16c); tmp_V8c = __builtin_ia32_psignb(tmp_V8c, tmp_V8c); tmp_V8s = __builtin_ia32_psignw128(tmp_V8s, tmp_V8s); tmp_V4s = __builtin_ia32_psignw(tmp_V4s, tmp_V4s); tmp_V4i = __builtin_ia32_psignd128(tmp_V4i, tmp_V4i); tmp_V2i = __builtin_ia32_psignd(tmp_V2i, tmp_V2i); tmp_V16c = __builtin_ia32_pabsb128(tmp_V16c); tmp_V8c = __builtin_ia32_pabsb(tmp_V8c); tmp_V8s = __builtin_ia32_pabsw128(tmp_V8s); tmp_V4s = __builtin_ia32_pabsw(tmp_V4s); tmp_V4i = __builtin_ia32_pabsd128(tmp_V4i); tmp_V2i = __builtin_ia32_pabsd(tmp_V2i); tmp_V4s = __builtin_ia32_psllw(tmp_V4s, tmp_V1LLi); tmp_V2i = __builtin_ia32_pslld(tmp_V2i, tmp_V1LLi); tmp_V1LLi = __builtin_ia32_psllq(tmp_V1LLi, tmp_V1LLi); tmp_V4s = __builtin_ia32_psrlw(tmp_V4s, tmp_V1LLi); tmp_V2i = __builtin_ia32_psrld(tmp_V2i, tmp_V1LLi); tmp_V1LLi = __builtin_ia32_psrlq(tmp_V1LLi, tmp_V1LLi); tmp_V4s = __builtin_ia32_psraw(tmp_V4s, tmp_V1LLi); tmp_V2i = __builtin_ia32_psrad(tmp_V2i, tmp_V1LLi); tmp_V2i = __builtin_ia32_pmaddwd(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_packsswb(tmp_V4s, tmp_V4s); tmp_V4s = __builtin_ia32_packssdw(tmp_V2i, tmp_V2i); tmp_V8c = __builtin_ia32_packuswb(tmp_V4s, tmp_V4s); tmp_i = __builtin_ia32_vec_ext_v2si(tmp_V2i, 0); __builtin_ia32_incsspd(tmp_Ui); __builtin_ia32_incsspq(tmp_ULLi); tmp_Ui = __builtin_ia32_rdsspd(tmp_Ui); tmp_ULLi = __builtin_ia32_rdsspq(tmp_ULLi); __builtin_ia32_saveprevssp(); __builtin_ia32_rstorssp(tmp_vp); __builtin_ia32_wrssd(tmp_Ui, tmp_vp); __builtin_ia32_wrssq(tmp_ULLi, tmp_vp); __builtin_ia32_wrussd(tmp_Ui, tmp_vp); __builtin_ia32_wrussq(tmp_ULLi, tmp_vp); __builtin_ia32_setssbsy(); __builtin_ia32_clrssbsy(tmp_vp); (void) __builtin_ia32_ldmxcsr(tmp_Ui); (void) _mm_setcsr(tmp_Ui); tmp_Ui = __builtin_ia32_stmxcsr(); tmp_Ui = _mm_getcsr(); (void)__builtin_ia32_fxsave(tmp_vp); (void)__builtin_ia32_fxsave64(tmp_vp); (void)__builtin_ia32_fxrstor(tmp_vp); (void)__builtin_ia32_fxrstor64(tmp_vp); (void)__builtin_ia32_xsave(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsave64(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xrstor(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xrstor64(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsaveopt(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsaveopt64(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xrstors(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xrstors64(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsavec(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsavec64(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsaves(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsaves64(tmp_vp, tmp_ULLi); (void) __builtin_ia32_monitorx(tmp_vp, tmp_Ui, tmp_Ui); (void) __builtin_ia32_mwaitx(tmp_Ui, tmp_Ui, tmp_Ui); (void) __builtin_ia32_clzero(tmp_vp); (void) __builtin_ia32_cldemote(tmp_vp); tmp_V4f = __builtin_ia32_cvtpi2ps(tmp_V4f, tmp_V2i); tmp_V2i = __builtin_ia32_cvtps2pi(tmp_V4f); tmp_i = __builtin_ia32_cvtss2si(tmp_V4f); tmp_i = __builtin_ia32_cvttss2si(tmp_V4f); tmp_i = __builtin_ia32_rdtsc(); tmp_i = 
__rdtsc(); tmp_i = __builtin_ia32_rdtscp(&tmp_Ui); tmp_LLi = __builtin_ia32_rdpmc(tmp_i); __builtin_ia32_wbnoinvd(); #ifdef USE_64 tmp_LLi = __builtin_ia32_cvtss2si64(tmp_V4f); tmp_LLi = __builtin_ia32_cvttss2si64(tmp_V4f); #endif tmp_V2i = __builtin_ia32_cvttps2pi(tmp_V4f); (void) __builtin_ia32_maskmovq(tmp_V8c, tmp_V8c, tmp_cp); (void) __builtin_ia32_storehps(tmp_V2ip, tmp_V4f); (void) __builtin_ia32_storelps(tmp_V2ip, tmp_V4f); tmp_i = __builtin_ia32_movmskps(tmp_V4f); tmp_i = __builtin_ia32_pmovmskb(tmp_V8c); (void) __builtin_ia32_movntq(tmp_V1LLip, tmp_V1LLi); (void) __builtin_ia32_sfence(); (void) _mm_sfence(); tmp_V4s = __builtin_ia32_psadbw(tmp_V8c, tmp_V8c); tmp_V4f = __builtin_ia32_rcpps(tmp_V4f); tmp_V4f = __builtin_ia32_rcpss(tmp_V4f); tmp_V4f = __builtin_ia32_rsqrtps(tmp_V4f); tmp_V4f = __builtin_ia32_rsqrtss(tmp_V4f); tmp_V4f = __builtin_ia32_sqrtps(tmp_V4f); tmp_V4f = __builtin_ia32_sqrtss(tmp_V4f); (void) __builtin_ia32_maskmovdqu(tmp_V16c, tmp_V16c, tmp_cp); tmp_i = __builtin_ia32_movmskpd(tmp_V2d); tmp_i = __builtin_ia32_pmovmskb128(tmp_V16c); (void) __builtin_ia32_movnti(tmp_ip, tmp_i); #ifdef USE_64 (void) __builtin_ia32_movnti64(tmp_LLip, tmp_LLi); #endif tmp_V2LLi = __builtin_ia32_psadbw128(tmp_V16c, tmp_V16c); tmp_V2d = __builtin_ia32_sqrtpd(tmp_V2d); tmp_V2d = __builtin_ia32_sqrtsd(tmp_V2d); tmp_V2LLi = __builtin_ia32_cvtpd2dq(tmp_V2d); tmp_V2i = __builtin_ia32_cvtpd2pi(tmp_V2d); tmp_V4f = __builtin_ia32_cvtpd2ps(tmp_V2d); tmp_V4i = __builtin_ia32_cvttpd2dq(tmp_V2d); tmp_V2i = __builtin_ia32_cvttpd2pi(tmp_V2d); tmp_V2d = __builtin_ia32_cvtpi2pd(tmp_V2i); tmp_i = __builtin_ia32_cvtsd2si(tmp_V2d); tmp_i = __builtin_ia32_cvttsd2si(tmp_V2d); tmp_V4f = __builtin_ia32_cvtsd2ss(tmp_V4f, tmp_V2d); #ifdef USE_64 tmp_LLi = __builtin_ia32_cvtsd2si64(tmp_V2d); tmp_LLi = __builtin_ia32_cvttsd2si64(tmp_V2d); #endif tmp_V4i = __builtin_ia32_cvtps2dq(tmp_V4f); tmp_V4i = __builtin_ia32_cvttps2dq(tmp_V4f); (void) __builtin_ia32_clflush(tmp_vCp); (void) _mm_clflush(tmp_vCp); (void) __builtin_ia32_lfence(); (void) _mm_lfence(); (void) __builtin_ia32_mfence(); (void) _mm_mfence(); (void) __builtin_ia32_pause(); (void) _mm_pause(); tmp_V4s = __builtin_ia32_psllwi(tmp_V4s, tmp_i); tmp_V2i = __builtin_ia32_pslldi(tmp_V2i, tmp_i); tmp_V1LLi = __builtin_ia32_psllqi(tmp_V1LLi, tmp_i); tmp_V4s = __builtin_ia32_psrawi(tmp_V4s, tmp_i); tmp_V2i = __builtin_ia32_psradi(tmp_V2i, tmp_i); tmp_V4s = __builtin_ia32_psrlwi(tmp_V4s, tmp_i); tmp_V2i = __builtin_ia32_psrldi(tmp_V2i, tmp_i); tmp_V1LLi = __builtin_ia32_psrlqi(tmp_V1LLi, tmp_i); tmp_V1LLi = __builtin_ia32_pmuludq(tmp_V2i, tmp_V2i); tmp_V2LLi = __builtin_ia32_pmuludq128(tmp_V4i, tmp_V4i); tmp_V8s = __builtin_ia32_psraw128(tmp_V8s, tmp_V8s); tmp_V4i = __builtin_ia32_psrad128(tmp_V4i, tmp_V4i); tmp_V8s = __builtin_ia32_psrlw128(tmp_V8s, tmp_V8s); tmp_V4i = __builtin_ia32_psrld128(tmp_V4i, tmp_V4i); tmp_V2LLi = __builtin_ia32_psrlq128(tmp_V2LLi, tmp_V2LLi); tmp_V8s = __builtin_ia32_psllw128(tmp_V8s, tmp_V8s); tmp_V4i = __builtin_ia32_pslld128(tmp_V4i, tmp_V4i); tmp_V2LLi = __builtin_ia32_psllq128(tmp_V2LLi, tmp_V2LLi); tmp_V8s = __builtin_ia32_psllwi128(tmp_V8s, tmp_i); tmp_V4i = __builtin_ia32_pslldi128(tmp_V4i, tmp_i); tmp_V2LLi = __builtin_ia32_psllqi128(tmp_V2LLi, tmp_i); tmp_V8s = __builtin_ia32_psrlwi128(tmp_V8s, tmp_i); tmp_V4i = __builtin_ia32_psrldi128(tmp_V4i, tmp_i); tmp_V2LLi = __builtin_ia32_psrlqi128(tmp_V2LLi, tmp_i); tmp_V8s = __builtin_ia32_psrawi128(tmp_V8s, tmp_i); tmp_V4i = __builtin_ia32_psradi128(tmp_V4i, tmp_i); tmp_V8s = 
__builtin_ia32_pmaddwd128(tmp_V8s, tmp_V8s); (void) __builtin_ia32_monitor(tmp_vp, tmp_Ui, tmp_Ui); (void) __builtin_ia32_mwait(tmp_Ui, tmp_Ui); tmp_V16c = __builtin_ia32_lddqu(tmp_cCp); tmp_V2LLi = __builtin_ia32_palignr128(tmp_V2LLi, tmp_V2LLi, imm_i); tmp_V1LLi = __builtin_ia32_palignr(tmp_V1LLi, tmp_V1LLi, imm_i); #ifdef USE_SSE4 tmp_V16c = __builtin_ia32_pblendvb128(tmp_V16c, tmp_V16c, tmp_V16c); tmp_V2d = __builtin_ia32_blendvpd(tmp_V2d, tmp_V2d, tmp_V2d); tmp_V4f = __builtin_ia32_blendvps(tmp_V4f, tmp_V4f, tmp_V4f); tmp_V8s = __builtin_ia32_packusdw128(tmp_V4i, tmp_V4i); tmp_V16c = __builtin_ia32_pmaxsb128(tmp_V16c, tmp_V16c); tmp_V4i = __builtin_ia32_pmaxsd128(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_pmaxud128(tmp_V4i, tmp_V4i); tmp_V8s = __builtin_ia32_pmaxuw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_pminsb128(tmp_V16c, tmp_V16c); tmp_V4i = __builtin_ia32_pminsd128(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_pminud128(tmp_V4i, tmp_V4i); tmp_V8s = __builtin_ia32_pminuw128(tmp_V8s, tmp_V8s); tmp_V2LLi = __builtin_ia32_pmuldq128(tmp_V4i, tmp_V4i); tmp_V4f = __builtin_ia32_roundps(tmp_V4f, imm_i_0_16); tmp_V4f = __builtin_ia32_roundss(tmp_V4f, tmp_V4f, imm_i_0_16); tmp_V2d = __builtin_ia32_roundsd(tmp_V2d, tmp_V2d, imm_i_0_16); tmp_V2d = __builtin_ia32_roundpd(tmp_V2d, imm_i_0_16); tmp_V4f = __builtin_ia32_insertps128(tmp_V4f, tmp_V4f, imm_i_0_256); #endif tmp_V4d = __builtin_ia32_addsubpd256(tmp_V4d, tmp_V4d); tmp_V8f = __builtin_ia32_addsubps256(tmp_V8f, tmp_V8f); tmp_V4d = __builtin_ia32_haddpd256(tmp_V4d, tmp_V4d); tmp_V8f = __builtin_ia32_hsubps256(tmp_V8f, tmp_V8f); tmp_V4d = __builtin_ia32_hsubpd256(tmp_V4d, tmp_V4d); tmp_V8f = __builtin_ia32_haddps256(tmp_V8f, tmp_V8f); tmp_V4d = __builtin_ia32_maxpd256(tmp_V4d, tmp_V4d); tmp_V8f = __builtin_ia32_maxps256(tmp_V8f, tmp_V8f); tmp_V4d = __builtin_ia32_minpd256(tmp_V4d, tmp_V4d); tmp_V8f = __builtin_ia32_minps256(tmp_V8f, tmp_V8f); tmp_V2d = __builtin_ia32_vpermilvarpd(tmp_V2d, tmp_V2LLi); tmp_V4f = __builtin_ia32_vpermilvarps(tmp_V4f, tmp_V4i); tmp_V4d = __builtin_ia32_vpermilvarpd256(tmp_V4d, tmp_V4LLi); tmp_V8f = __builtin_ia32_vpermilvarps256(tmp_V8f, tmp_V8i); tmp_V4d = __builtin_ia32_blendvpd256(tmp_V4d, tmp_V4d, tmp_V4d); tmp_V8f = __builtin_ia32_blendvps256(tmp_V8f, tmp_V8f, tmp_V8f); tmp_V8f = __builtin_ia32_dpps256(tmp_V8f, tmp_V8f, 0x7); tmp_V4d = __builtin_ia32_cmppd256(tmp_V4d, tmp_V4d, 0); tmp_V8f = __builtin_ia32_cmpps256(tmp_V8f, tmp_V8f, 0); tmp_V4f = __builtin_ia32_cvtpd2ps256(tmp_V4d); tmp_V8i = __builtin_ia32_cvtps2dq256(tmp_V8f); tmp_V4i = __builtin_ia32_cvttpd2dq256(tmp_V4d); tmp_V4i = __builtin_ia32_cvtpd2dq256(tmp_V4d); tmp_V8i = __builtin_ia32_cvttps2dq256(tmp_V8f); tmp_V4d = __builtin_ia32_vperm2f128_pd256(tmp_V4d, tmp_V4d, 0x7); tmp_V8f = __builtin_ia32_vperm2f128_ps256(tmp_V8f, tmp_V8f, 0x7); tmp_V8i = __builtin_ia32_vperm2f128_si256(tmp_V8i, tmp_V8i, 0x7); tmp_V4d = __builtin_ia32_sqrtpd256(tmp_V4d); tmp_V8f = __builtin_ia32_sqrtps256(tmp_V8f); tmp_V8f = __builtin_ia32_rsqrtps256(tmp_V8f); tmp_V8f = __builtin_ia32_rcpps256(tmp_V8f); tmp_V4d = __builtin_ia32_roundpd256(tmp_V4d, 0x1); tmp_V8f = __builtin_ia32_roundps256(tmp_V8f, 0x1); tmp_i = __builtin_ia32_vtestzpd(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_vtestcpd(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_vtestnzcpd(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_vtestzps(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_vtestcps(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_vtestnzcps(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_vtestzpd256(tmp_V4d, tmp_V4d); 
tmp_i = __builtin_ia32_vtestcpd256(tmp_V4d, tmp_V4d); tmp_i = __builtin_ia32_vtestnzcpd256(tmp_V4d, tmp_V4d); tmp_i = __builtin_ia32_vtestzps256(tmp_V8f, tmp_V8f); tmp_i = __builtin_ia32_vtestcps256(tmp_V8f, tmp_V8f); tmp_i = __builtin_ia32_vtestnzcps256(tmp_V8f, tmp_V8f); tmp_i = __builtin_ia32_ptestz256(tmp_V4LLi, tmp_V4LLi); tmp_i = __builtin_ia32_ptestc256(tmp_V4LLi, tmp_V4LLi); tmp_i = __builtin_ia32_ptestnzc256(tmp_V4LLi, tmp_V4LLi); tmp_i = __builtin_ia32_movmskpd256(tmp_V4d); tmp_i = __builtin_ia32_movmskps256(tmp_V8f); __builtin_ia32_vzeroall(); __builtin_ia32_vzeroupper(); tmp_V32c = __builtin_ia32_lddqu256(tmp_cCp); tmp_V2d = __builtin_ia32_maskloadpd(tmp_V2dCp, tmp_V2LLi); tmp_V4f = __builtin_ia32_maskloadps(tmp_V4fCp, tmp_V4i); tmp_V4d = __builtin_ia32_maskloadpd256(tmp_V4dCp, tmp_V4LLi); tmp_V8f = __builtin_ia32_maskloadps256(tmp_V8fCp, tmp_V8i); __builtin_ia32_maskstorepd(tmp_V2dp, tmp_V2LLi, tmp_V2d); __builtin_ia32_maskstoreps(tmp_V4fp, tmp_V4i, tmp_V4f); __builtin_ia32_maskstorepd256(tmp_V4dp, tmp_V4LLi, tmp_V4d); __builtin_ia32_maskstoreps256(tmp_V8fp, tmp_V8i, tmp_V8f); #ifdef USE_3DNOW tmp_V8c = __builtin_ia32_pavgusb(tmp_V8c, tmp_V8c); tmp_V2i = __builtin_ia32_pf2id(tmp_V2f); tmp_V2f = __builtin_ia32_pfacc(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfadd(tmp_V2f, tmp_V2f); tmp_V2i = __builtin_ia32_pfcmpeq(tmp_V2f, tmp_V2f); tmp_V2i = __builtin_ia32_pfcmpge(tmp_V2f, tmp_V2f); tmp_V2i = __builtin_ia32_pfcmpgt(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfmax(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfmin(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfmul(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfrcp(tmp_V2f); tmp_V2f = __builtin_ia32_pfrcpit1(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfrcpit2(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfrsqrt(tmp_V2f); tmp_V2f = __builtin_ia32_pfrsqit1(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfsub(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfsubr(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pi2fd(tmp_V2i); tmp_V4s = __builtin_ia32_pmulhrw(tmp_V4s, tmp_V4s); tmp_V2i = __builtin_ia32_pf2iw(tmp_V2f); tmp_V2f = __builtin_ia32_pfnacc(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfpnacc(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pi2fw(tmp_V2i); tmp_V2f = __builtin_ia32_pswapdsf(tmp_V2f); tmp_V2i = __builtin_ia32_pswapdsi(tmp_V2i); tmp_V4i = __builtin_ia32_sha1rnds4(tmp_V4i, tmp_V4i, imm_i_0_4); tmp_V4i = __builtin_ia32_sha1nexte(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_sha1msg1(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_sha1msg2(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_sha256rnds2(tmp_V4i, tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_sha256msg1(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_sha256msg2(tmp_V4i, tmp_V4i); #endif }
void Permutohedral::init ( const float* feature, int feature_size, int N )
{
	// Compute the lattice coordinates for each feature [there is going to be a lot of magic here]
	N_ = N;
	d_ = feature_size;
	HashTable hash_table( d_, N_/**(d_+1)*/ );

	const int blocksize = sizeof(__m128) / sizeof(float);
	const __m128 invdplus1 = _mm_set1_ps( 1.0f / (d_+1) );
	const __m128 dplus1    = _mm_set1_ps( d_+1 );
	const __m128 Zero      = _mm_set1_ps( 0 );
	const __m128 One       = _mm_set1_ps( 1 );

	// Allocate the class memory
	if (offset_) delete [] offset_;
	offset_ = new int[ (d_+1)*(N_+16) ];
	memset( offset_, 0, (d_+1)*(N_+16)*sizeof(int) );

	if (barycentric_) delete [] barycentric_;
	barycentric_ = new float[ (d_+1)*(N_+16) ];
	memset( barycentric_, 0, (d_+1)*(N_+16)*sizeof(float) );

	// Allocate the local memory
	__m128 * scale_factor = (__m128*) _mm_malloc( (d_  )*sizeof(__m128), 16 );
	__m128 * f            = (__m128*) _mm_malloc( (d_  )*sizeof(__m128), 16 );
	__m128 * elevated     = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128), 16 );
	__m128 * rem0         = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128), 16 );
	__m128 * rank         = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128), 16 );
	float * barycentric = new float[(d_+2)*blocksize];
	short * canonical = new short[(d_+1)*(d_+1)];
	short * key = new short[d_+1];

	// Compute the canonical simplex
	for( int i=0; i<=d_; i++ ){
		for( int j=0; j<=d_-i; j++ )
			canonical[i*(d_+1)+j] = i;
		for( int j=d_-i+1; j<=d_; j++ )
			canonical[i*(d_+1)+j] = i - (d_+1);
	}

	// Expected standard deviation of our filter (p.6 in [Adams et al. 2010])
	float inv_std_dev = sqrt(2.0 / 3.0)*(d_+1);
	// Compute the diagonal part of E (p.5 in [Adams et al. 2010])
	for( int i=0; i<d_; i++ )
		scale_factor[i] = _mm_set1_ps( 1.0f / sqrtf( float((i+2)*(i+1)) ) * inv_std_dev );

	// Set up the SSE rounding
#ifndef __SSE4_1__
	const unsigned int old_rounding = _mm_getcsr();
	_mm_setcsr( (old_rounding&~_MM_ROUND_MASK) | _MM_ROUND_NEAREST );
#endif

	// Compute the simplex each feature lies in
	for( int k=0; k<N_; k+=blocksize ){
		// Load the feature from memory
		float * ff = (float*)f;
		for( int j=0; j<d_; j++ )
			for( int i=0; i<blocksize; i++ )
				ff[ j*blocksize + i ] = k+i < N_ ? feature[ (k+i)*d_+j ] : 0.0;

		// Elevate the feature ( y = Ep, see p.5 in [Adams et al. 2010])
		// sm contains the sum of 1..n of our feature vector
		__m128 sm = Zero;
		for( int j=d_; j>0; j-- ){
			__m128 cf = f[j-1]*scale_factor[j-1];
			elevated[j] = sm - _mm_set1_ps(j)*cf;
			sm += cf;
		}
		elevated[0] = sm;

		// Find the closest 0-colored simplex through rounding
		__m128 sum = Zero;
		for( int i=0; i<=d_; i++ ){
			__m128 v = invdplus1 * elevated[i];
#ifdef __SSE4_1__
			v = _mm_round_ps( v, _MM_FROUND_TO_NEAREST_INT );
#else
			v = _mm_cvtepi32_ps( _mm_cvtps_epi32( v ) );
#endif
			rem0[i] = v*dplus1;
			sum += v;
		}

		// Find the simplex we are in and store it in rank (where rank describes what position coordinate i has in the sorted order of the feature values)
		for( int i=0; i<=d_; i++ )
			rank[i] = Zero;
		for( int i=0; i<d_; i++ ){
			__m128 di = elevated[i] - rem0[i];
			for( int j=i+1; j<=d_; j++ ){
				__m128 dj = elevated[j] - rem0[j];
				__m128 c = _mm_and_ps( One, _mm_cmplt_ps( di, dj ) );
				rank[i] += c;
				rank[j] += One-c;
			}
		}

		// If the point doesn't lie on the plane (sum != 0) bring it back
		for( int i=0; i<=d_; i++ ){
			rank[i] += sum;
			__m128 add = _mm_and_ps( dplus1, _mm_cmplt_ps( rank[i], Zero ) );
			__m128 sub = _mm_and_ps( dplus1, _mm_cmpge_ps( rank[i], dplus1 ) );
			rank[i] += add-sub;
			rem0[i] += add-sub;
		}

		// Compute the barycentric coordinates (p.10 in [Adams et al. 2010])
		for( int i=0; i<(d_+2)*blocksize; i++ )
			barycentric[ i ] = 0;
		for( int i=0; i<=d_; i++ ){
			__m128 v = (elevated[i] - rem0[i])*invdplus1;

			// Didn't figure out how to SSE this
			float * fv = (float*)&v;
			float * frank = (float*)&rank[i];
			for( int j=0; j<blocksize; j++ ){
				int p = d_-frank[j];
				barycentric[j*(d_+2)+p  ] += fv[j];
				barycentric[j*(d_+2)+p+1] -= fv[j];
			}
		}

		// The rest is not SSE'd
		for( int j=0; j<blocksize; j++ ){
			// Wrap around
			barycentric[j*(d_+2)+0] += 1 + barycentric[j*(d_+2)+d_+1];

			float * frank = (float*)rank;
			float * frem0 = (float*)rem0;

			// Compute all vertices and their offset
			for( int remainder=0; remainder<=d_; remainder++ ){
				for( int i=0; i<d_; i++ ){
					key[i] = frem0[i*blocksize+j] + canonical[ remainder*(d_+1) + (int)frank[i*blocksize+j] ];
				}
				offset_[ (j+k)*(d_+1)+remainder ] = hash_table.find( key, true );
				barycentric_[ (j+k)*(d_+1)+remainder ] = barycentric[ j*(d_+2)+remainder ];
			}
		}
	}

	_mm_free( scale_factor );
	_mm_free( f );
	_mm_free( elevated );
	_mm_free( rem0 );
	_mm_free( rank );
	delete [] barycentric;
	delete [] canonical;
	delete [] key;

	// Reset the SSE rounding
#ifndef __SSE4_1__
	_mm_setcsr( old_rounding );
#endif

	// This is normally fast enough so no SSE needed here
	// Find the neighbors of each lattice point

	// Get the number of vertices in the lattice
	M_ = hash_table.size();

	// Create the neighborhood structure
	if(blur_neighbors_) delete[] blur_neighbors_;
	blur_neighbors_ = new Neighbors[ (d_+1)*M_ ];

	short * n1 = new short[d_+1];
	short * n2 = new short[d_+1];

	// For each of d+1 axes,
	for( int j = 0; j <= d_; j++ ){
		for( int i=0; i<M_; i++ ){
			const short * key = hash_table.getKey( i );
			for( int k=0; k<d_; k++ ){
				n1[k] = key[k] - 1;
				n2[k] = key[k] + 1;
			}
			n1[j] = key[j] + d_;
			n2[j] = key[j] - d_;

			blur_neighbors_[j*M_+i].n1 = hash_table.find( n1 );
			blur_neighbors_[j*M_+i].n2 = hash_table.find( n2 );
		}
	}

	delete[] n1;
	delete[] n2;
}
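// The rounding-mode bracket used above, in isolation (a sketch): force
// round-to-nearest so that SSE2 _mm_cvtps_epi32 matches what SSE4.1
// _mm_round_ps would produce, then put the caller's MXCSR back.
#include <xmmintrin.h>
#include <emmintrin.h>

static __m128 round_nearest_sse2(__m128 v)
{
    const unsigned int old_csr = _mm_getcsr();
    _mm_setcsr((old_csr & ~_MM_ROUND_MASK) | _MM_ROUND_NEAREST);
    __m128 r = _mm_cvtepi32_ps(_mm_cvtps_epi32(v));  // cvt honors MXCSR.RC
    _mm_setcsr(old_csr);
    return r;
}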