예제 #1
0
// Helpers that translate PowerPC rounding / non-IEEE settings into the host's
// C floating-point environment and the SSE MXCSR register.
namespace FPURoundMode
{
	// MXCSR snapshot written by SaveSIMDState() and re-applied by LoadSIMDState().
	// Both values are seeded from the MXCSR at static-initialization time.
	static u32 saved_sse_state = _mm_getcsr();
	static const u32 default_sse_state = _mm_getcsr();

	// Selects the <cfenv> rounding mode that corresponds to a PowerPC rounding
	// selector. Only the low two bits of `mode` are used.
	void SetRoundMode(int mode)
	{
		// Convert PowerPC to native rounding mode.
		static const int rounding_mode_lut[] = {
			FE_TONEAREST,
			FE_TOWARDZERO,
			FE_UPWARD,
			FE_DOWNWARD
		};
		// The table has exactly four entries; masking keeps a stray value from
		// indexing past its end.
		fesetround(rounding_mode_lut[mode & 3]);
	}

	// Intentionally a no-op.
	void SetPrecisionMode(PrecisionMode /* mode */)
	{
		//x64 doesn't need this - fpu is done with SSE
	}

	// Writes a fresh MXCSR value: all FPU exceptions masked, rounding control
	// taken from `rounding_mode` (low two bits), and Flush-To-Zero enabled when
	// `non_ieee_mode` is true.
	void SetSIMDMode(int rounding_mode, bool non_ieee_mode)
	{
		// OR-mask for disabling FPU exceptions (bits 7-12 in the MXCSR register)
		const u32 EXCEPTION_MASK = 0x1F80;
		// Flush-To-Zero (non-IEEE mode: denormal outputs are set to +/- 0)
		const u32 FTZ = 0x8000;
		// lookup table for FPSCR.RN-to-MXCSR.RC translation
		// (MXCSR.RC lives in bits 13-14: 0 = nearest, 1 = down, 2 = up, 3 = truncate)
		static const u32 simd_rounding_table[] =
		{
			(0 << 13) | EXCEPTION_MASK, // nearest
			(3 << 13) | EXCEPTION_MASK, // toward zero
			(2 << 13) | EXCEPTION_MASK, // +inf
			(1 << 13) | EXCEPTION_MASK, // -inf
		};
		// Same four-entry bound as in SetRoundMode().
		u32 csr = simd_rounding_table[rounding_mode & 3];

		if (non_ieee_mode)
		{
			csr |= FTZ;
		}
		_mm_setcsr(csr);
	}

	// Remembers the current MXCSR so LoadSIMDState() can put it back.
	void SaveSIMDState()
	{
		saved_sse_state = _mm_getcsr();
	}
	// Re-applies the MXCSR captured by the last SaveSIMDState().
	void LoadSIMDState()
	{
		_mm_setcsr(saved_sse_state);
	}
	// Re-applies the MXCSR that was in effect at static-initialization time.
	void LoadDefaultSIMDState()
	{
		_mm_setcsr(default_sse_state);
	}
}
예제 #2
0
  /**
   * Switches the calling thread's floating-point unit to treat denormals as
   * zero and records the prior denormal setting in previous_state.
   */
  FlushToZero::FlushToZero()
  {
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
    // _controlfp_s(current, newControl, mask) reports the control word as it
    // is *after* the call, so read it first with an empty mask (no change)
    // and keep only the denormal-control bits.
    _controlfp_s(&previous_state, 0, 0);
    previous_state &= _MCW_DN;
    unsigned int updated_state;
    _controlfp_s(&updated_state, _DN_FLUSH, _MCW_DN);
#elif defined(__APPLE__)
    fegetenv(&previous_state);
    fesetenv(FE_DFL_DISABLE_SSE_DENORMS_ENV);
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
    // Only the denormals-are-zero bit is recorded and changed here.
    previous_state = _mm_getcsr() & _MM_DENORMALS_ZERO_MASK;
    _mm_setcsr(_mm_getcsr() | (_MM_DENORMALS_ZERO_ON));
#endif
  }
예제 #3
0
파일: rtcore.cpp 프로젝트: eyalsoreq/embree
  /*! Runs the scene build on the calling thread, acting as worker threadID
   *  out of numThreads. FTZ and DAZ are enabled in MXCSR for the duration of
   *  the build and the caller's MXCSR is restored afterwards, also when the
   *  build exits by throwing. */
  RTCORE_API void rtcCommitThread(RTCScene hscene, unsigned int threadID, unsigned int numThreads) 
  {
    Scene* scene = (Scene*) hscene;
    RTCORE_CATCH_BEGIN;
    RTCORE_TRACE(rtcCommitThread);
    RTCORE_VERIFY_HANDLE(hscene);

    if (unlikely(numThreads == 0)) 
      throw_RTCError(RTC_INVALID_OPERATION,"invalid number of threads specified");

#if defined(__MIC__)
    if (unlikely(numThreads % 4 != 0 && numThreads != 1)) 
      throw_RTCError(RTC_INVALID_OPERATION,"MIC requires numThreads % 4 == 0 in rtcCommitThread");
#endif

    {
#if !defined(__MIC__)
      /* writes the saved MXCSR back when this scope is left, whether normally
         or through an exception thrown by the build */
      struct MXCSRRestore {
        unsigned int saved;
        explicit MXCSRRestore(unsigned int value) : saved(value) {}
        ~MXCSRRestore() { _mm_setcsr(saved); }
      } mxcsrRestore(_mm_getcsr());

      /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */
      _mm_setcsr(mxcsrRestore.saved | /* FTZ */ (1<<15) | /* DAZ */ (1<<6));
#endif

      /* perform scene build */
      scene->build(threadID,numThreads);
    }

    RTCORE_CATCH_END(scene->device);
  }
예제 #4
0
///@todo Combine this with QueueDraw
/// Records one more enqueued draw on the context, then either runs the compute
/// work inline (single-threaded knob) or wakes the worker threads, and finally
/// retires the current draw context.
void QueueDispatch(SWR_CONTEXT *pContext)
{
    _ReadWriteBarrier();
    pContext->DrawEnqueued++;

    if (!KNOB_SINGLE_THREADED)
    {
        RDTSC_START(APIDrawWakeAllThreads);
        WakeAllThreads(pContext);
        RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
    }
    else
    {
        // Run the work with denormals flushed to 0, then put MXCSR back.
        const uint32_t savedCsr = _mm_getcsr();
        const uint32_t flushingCsr = savedCsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON;
        _mm_setcsr(flushingCsr);

        WorkOnCompute(pContext, 0, pContext->WorkerBE[0]);

        _mm_setcsr(savedCsr);
    }

    // Clearing the current draw context forces the next state call to create
    // and populate a new one; the old one is kept as the previous context.
    pContext->pPrevDrawContext = pContext->pCurDrawContext;
    pContext->pCurDrawContext = nullptr;
}
예제 #5
0
파일: util.hpp 프로젝트: notwa/crap
// Sets MXCSR bits 0x8000 (flush-to-zero) and 0x0040 (denormals-are-zero) on
// the calling thread. Compiles to nothing when __SSE2__ is not set.
INNER void
disable_denormals()
{
	#if __SSE2__
	const unsigned int flush_to_zero = 0x8000;
	const unsigned int denormals_are_zero = 0x0040;
	_mm_setcsr(_mm_getcsr() | flush_to_zero | denormals_are_zero);
	#endif
}
예제 #6
0
 /* Thread entry trampoline: enables FTZ and DAZ in MXCSR, invokes the
  * user function stored in the startup record, then frees the record. */
 static void* threadStartup(ThreadStartupData* parg)
 {
   const unsigned int ftzBit = 1u << 15;
   const unsigned int dazBit = 1u << 6;
   _mm_setcsr(_mm_getcsr() | ftzBit | dazBit);

   parg->f(parg->arg);

   delete parg;
   return nullptr;
 }
예제 #7
0
  /*! Renders one frame: enables FTZ/DAZ, derives the tile grid from the film
   *  size, publishes camera/scene/film to the render threads, runs the
   *  scheduler, and prints timing statistics. */
  void MaterialRenderer::renderFrame(const Ref<Camera>& camera, const Ref<BackendScene>& scene,  Ref<Film>& film)
  {
    /*! flush to zero and no denormals */
    _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6));

    /*! tile grid (rounded up) and reciprocal film dimensions */
    numTilesX = ((int)film->width +TILE_SIZE_X-1)/TILE_SIZE_X;
    numTilesY = ((int)film->height+TILE_SIZE_Y-1)/TILE_SIZE_Y;
    numTiles  = numTilesX * numTilesY;
    rcpWidth  = 1.0f/float(film->width);
    rcpHeight = 1.0f/float(film->height);

    /*! start of the timed region */
    const double startTime = getSeconds();

    /*! reset per-frame counters and expose the frame inputs to the workers */
    this->tileID = 0;
    this->atomicNumRays = 0;
    this->camera = camera;
    this->scene = scene;
    this->film = film;

    /*! one render task per scheduler thread; go() runs them */
    scheduler->addTask((Task::runFunction)&run_renderThread,this,scheduler->getNumThreads());
    scheduler->go();

    /*! drop the references again once rendering is done */
    this->camera = null;
    this->scene = null;
    this->film = null;

    const double elapsed = getSeconds()-startTime;

    /*! print framerate */
    std::cout << "MATERIAL RENDERED : " << 1.0f/elapsed << " fps, " << elapsed*1000.0f << " ms, " << atomicNumRays/elapsed*1E-6 << " Mrps" << std::endl;
  }
예제 #8
0
// Filters numSamples frames of four packed floats in place using
//   out[n] = in[n-2] + coeff * (in[n] - out[n-2])
// The coefficient and the two-frame input/output history are read from `mf`
// slots 0..4, and the history is written back at the end.
// `data` must be 16-byte aligned and MXCSR bits 0x8040 must already be set
// (both are asserted).
void CAllPassFilterPair::processBlock(float* data, int numSamples)
{
	jassert((((size_t) data) & 0xF) == 0);
	jassert((_mm_getcsr() & 0x8040) == 0x8040);

	const __m128 coeff = _mm_load_ps(mf.getPtr(0));
	__m128 inPrev1  = _mm_load_ps(mf.getPtr(1));
	__m128 inPrev2  = _mm_load_ps(mf.getPtr(2));
	__m128 outPrev1 = _mm_load_ps(mf.getPtr(3));
	__m128 outPrev2 = _mm_load_ps(mf.getPtr(4));

	for (int n = 0; n < numSamples; ++n)
	{
		float* const frame = &(data[4*n]);

		const __m128 in = _mm_load_ps(frame);
		const __m128 scaledDiff = _mm_mul_ps(_mm_sub_ps(in, outPrev2), coeff);
		const __m128 out = _mm_add_ps(inPrev2, scaledDiff);

		_mm_store_ps(frame, out);

		// Shift the two-frame history.
		inPrev2 = inPrev1;
		inPrev1 = in;
		outPrev2 = outPrev1;
		outPrev1 = out;
	}

	_mm_store_ps(mf.getPtr(1), inPrev1);
	_mm_store_ps(mf.getPtr(2), inPrev2);
	_mm_store_ps(mf.getPtr(3), outPrev1);
	_mm_store_ps(mf.getPtr(4), outPrev2);
}
예제 #9
0
// Processes one audio block: copies the input into twelve pitch-shifter
// lanes (six per channel), delays and scales the dry signal, adds the shifted
// lanes back in, and applies an overall gain. MXCSR bits 0x8040 are set for
// the duration of the call and the previous MXCSR is written back at the end.
void SuperSpreadAudioProcessor::processBlock (AudioSampleBuffer& buffer, MidiBuffer& /*midiMessages*/)
{
    const unsigned int savedCsr = _mm_getcsr();
    _mm_setcsr(savedCsr | 0x8040);

    AudioProcessorParameter* mixParam = parameterState->getParameter("Mix");
    const NormalisableRange<float> mixRange(parameterState->getParameterRange("Mix"));

    const float spread0 = parameterState->getParameter("Spread")->getValue();
    const float mix = mixRange.convertFrom0to1(mixParam->getValue());
    const float detuneFade = jmin(spread0/0.1f, 1.f);

    // Gain of the detuned voices: full at mix >= 100, linear below.
    float detunedGain;
    if (mix >= 100.f)
        detunedGain = 1.f;
    else
        detunedGain = mix / 100.f;

    // Gain of the dry signal: full up to mix == 100, then faded out; while
    // the detune fade is still below 1 the dry gain has a floor.
    float dryGain;
    if (mix <= 100.f)
    {
        dryGain = 1.f;
    }
    else
    {
        const float linearFade = (200.f - mix) / 100.f;
        if (detuneFade < 1.f)
            dryGain = jmax(0.5f * (1.f - detuneFade), linearFade);
        else
            dryGain = linearFade;
    }

    const float spreadGain = detunedGain * detuneFade;
    const float spread = 0.5f * spread0*spread0;

    const int numChannels = buffer.getNumChannels();
    const int numSamples = buffer.getNumSamples();
    const bool isStereo = (numChannels == 2);

    float* chL = buffer.getWritePointer(0);
    float* chR = isStereo ? buffer.getWritePointer(1) : nullptr;

    // Lanes 0..5 receive the left input, lanes 6..11 the right input.
    const int lanesPerChannel = 12 / 2;
    for (int lane = 0; lane < lanesPerChannel; ++lane)
    {
        pitchBuffer.copyFrom(lane, 0, chL, numSamples);

        if (chR != nullptr)
            pitchBuffer.copyFrom(6 + lane, 0, chR, numSamples);
    }

    mainDelay.processBlock(chL, chR, numSamples);
    buffer.applyGain(dryGain);

    const float maxPitches[6] = {0.893f, 0.939f, 0.98f, 1.02f, 1.064f, 1.11f}; 

    for (int lane = 0; lane < 6; ++lane)
    {
        const int rightLane = lane + 6;

        // The right-hand lane uses the reciprocal pitch ratio of the left one.
        shifter[lane]->setPitch(std::pow(maxPitches[lane], spread));
        shifter[rightLane]->setPitch(std::pow(1.f / maxPitches[lane], spread));

        float* procL = pitchBuffer.getWritePointer(lane);
        float* procR = pitchBuffer.getWritePointer(rightLane);

        shifter[lane]->processBlock(procL, numSamples);
        buffer.addFrom(0, 0, procL, numSamples, spreadGain/* * gain*/);

        if (isStereo)
        {
            shifter[rightLane]->processBlock(procR, numSamples);
            buffer.addFrom(1, 0, procR, numSamples, spreadGain/* * gain*/);
        }
    }

    float totalGain = 1.f;
    if (!(spreadGain == 0.f))
        totalGain = 1.41f / (1.f + std::sqrt(6.f) * spreadGain);

    buffer.applyGain(totalGain);

    _mm_setcsr(savedCsr);
}
예제 #10
0
/* Hands the audio backend one buffer pointer per input and output channel
 * (slices of the world's audio bus, outputs first, inputs after them),
 * enables denormal flushing when SSE is available, and computes the time
 * span of one block. */
void nova_server::prepare_backend(void)
{
    const int frames_per_block = get_audio_blocksize();
    const int num_inputs = get_input_count();
    const int num_outputs = get_output_count();

    std::vector<sample*> inputs, outputs;

    /* input channel c lives at bus slot (num_outputs + c) */
    for (int ch = 0; ch != num_inputs; ++ch) {
        const int bus_slot = num_outputs + ch;
        inputs.push_back(sc_factory->world.mAudioBus + (frames_per_block * bus_slot));
    }
    audio_backend::input_mapping(inputs.begin(), inputs.end());

    /* output channel c lives at bus slot c */
    for (int ch = 0; ch != num_outputs; ++ch) {
        outputs.push_back(sc_factory->world.mAudioBus + frames_per_block * ch);
    }
    audio_backend::output_mapping(outputs.begin(), outputs.end());

#ifdef __SSE__
    /* denormal handling: flush-to-zero via the macro, plus MXCSR bit 0x40 */
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
    const unsigned int mxcsr_bit_6 = 0x40;
    _mm_setcsr(_mm_getcsr() | mxcsr_bit_6);
#endif

    time_per_tick = time_tag::from_samples(frames_per_block, get_samplerate());
}
예제 #11
0
// Configures SSE denormal handling for the calling thread: Flush-To-Zero is
// cleared and, when the CPU reports support for it, Denormals-Are-Zero is set.
// Does nothing when TRAVERSO_RUNNING_UNDER_VALGRIND is present in the
// environment, or when the build lacks x86 / xmmintrin support.
void Traverso::setup_fpu()
{

	// export TRAVERSO_RUNNING_UNDER_VALGRIND to disable assembler stuff below!
	if (getenv("TRAVERSO_RUNNING_UNDER_VALGRIND")) {
		printf("TRAVERSO_RUNNING_UNDER_VALGRIND=TRUE\n");
		// valgrind doesn't understand this assembler stuff
		// September 10th, 2007
		return;
	}

#if (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(USE_XMMINTRIN)

	// MXCSR bit 6 is Denormals-Are-Zero. Bit 15 (0x8000) is Flush-To-Zero,
	// i.e. the same bit as _MM_FLUSH_ZERO_ON, so it must not be used for DAZ.
	const int MXCSR_DAZ = 0x0040;

	int MXCSR;
	FPU fpu;

	/* XXX use real code to determine if the processor supports
	DenormalsAreZero and FlushToZero
	*/
	
	if (!fpu.has_flush_to_zero() && !fpu.has_denormals_are_zero()) {
		return;
	}

	MXCSR  = _mm_getcsr();

/*	switch (Config->get_denormal_model()) {
		case DenormalNone:
			MXCSR &= ~(_MM_FLUSH_ZERO_ON|MXCSR_DAZ);
			break;

		case DenormalFTZ:
			if (fpu.has_flush_to_zero()) {
				MXCSR |= _MM_FLUSH_ZERO_ON;
			}
			break;

		case DenormalDAZ:*/
			MXCSR &= ~_MM_FLUSH_ZERO_ON;
			if (fpu.has_denormals_are_zero()) {
				MXCSR |= MXCSR_DAZ;
			}
// 			break;
// 		
// 		case DenormalFTZDAZ:
// 			if (fpu.has_flush_to_zero()) {
// 				if (fpu.has_denormals_are_zero()) {
// 					MXCSR |= _MM_FLUSH_ZERO_ON | MXCSR_DAZ;
// 				} else {
// 					MXCSR |= _MM_FLUSH_ZERO_ON;
// 				}
// 			}
// 			break;
// 	}

	_mm_setcsr (MXCSR);

#endif
}
/* Sets MXCSR bits 0x8040 on the calling thread, logs that it did so, and
 * returns the MXCSR value that was in effect before the change. */
int mwDisableDenormalsSSE(void)
{
    const int previousMXCSR = _mm_getcsr();

    _mm_setcsr(previousMXCSR | 0x8040);
    mw_printf("Disabled denormals\n");

    return previousMXCSR;
}
예제 #13
0
파일: Client.C 프로젝트: 0mk/non
    void
    Client::thread_init ( void *arg )
    {
#ifdef __SSE2_MATH__
    /* set FTZ and DAZ flags */
    _mm_setcsr(_mm_getcsr() | 0x8040);
#endif

        ((Client*)arg)->thread_init();
    }
예제 #14
0
// Calls a handful of SSE/SSE2 intrinsics with throwaway arguments.
// NOTE(review): this looks like a compile-only fixture (the results are
// discarded and _mm_clflush is handed a null pointer) - confirm it is never
// meant to be executed.
void g() {
  (void)_mm_getcsr();   // read MXCSR, result discarded
  _mm_setcsr(1);        // write the literal 1 to MXCSR
  _mm_sfence();

  _mm_clflush((void*)0);
  _mm_lfence();
  _mm_mfence();
  _mm_pause();
}
예제 #15
0
  /**
   * Puts the denormal-handling setting back to the value stored in
   * previous_state.
   */
  FlushToZero::~FlushToZero()
  {
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
    // _controlfp_s(current, newControl, mask): restrict the write to the
    // denormal-control field so no other control bits are touched.
    unsigned int new_state;
    _controlfp_s(&new_state, previous_state & _MCW_DN, _MCW_DN);
#elif defined(__APPLE__)
    fesetenv(&previous_state);
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
    // Clear the denormals-are-zero bit, then re-apply whatever the saved
    // state had there, instead of unconditionally switching it off.
    _mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (previous_state & _MM_DENORMALS_ZERO_MASK));
#endif
  }
예제 #16
0
// Special-case handler for double-precision sin/cos arguments.
//  - Finite x: returned unchanged.
//  - Infinite x: raises the MXCSR invalid flag and returns a NaN pattern
//    (INDEFBITPATT_DP64 on WIN64 after reporting the error, QNANBITPATT_DP64
//    elsewhere).
//  - NaN x: a signalling NaN is quieted; off WIN64 that also raises the
//    invalid flag, on WIN64 every NaN is reported through __amd_handle_error.
// `name` is only consumed by the WIN64 error reporting.
double _sin_cos_special(double x, char *name)
{
    UT64 xu;
    unsigned int is_snan;

    xu.f64 = x;

    // Exponent not all ones: ordinary finite value, nothing to do.
    if((xu.u64 & EXPBITS_DP64) != EXPBITS_DP64)
        return xu.f64;

    if((xu.u64 & MANTBITS_DP64) != 0x0)
    {
        // x is NaN; it is signalling when the quiet bit is clear
        is_snan = (((xu.u64 & QNAN_MASK_64) == QNAN_MASK_64) ? 0 : 1);
        if(is_snan)
        {
            xu.u64 |= QNAN_MASK_64;
#ifndef WIN64
            _mm_setcsr(_mm_getcsr() | MXCSR_ES_INVALID);
#endif
        }
#ifdef WIN64
        __amd_handle_error(DOMAIN, EDOM, name, x, 0, xu.f64);
#endif
    }
    else
    {
        // x is Inf
        _mm_setcsr(_mm_getcsr() | MXCSR_ES_INVALID);
#ifdef WIN64
        xu.u64 = INDEFBITPATT_DP64;
        __amd_handle_error(DOMAIN, EDOM, name, x, 0, xu.f64);
#else
        xu.u64 = QNANBITPATT_DP64;
        (void)name; // not used on this path; silences the unused-parameter warning
#endif
    }

    return xu.f64;
}
예제 #17
0
파일: sys.c 프로젝트: jakevdp/julia
// Turns denormal flushing on (isZero != 0) or off (isZero == 0) in MXCSR.
// Returns 1 when the build has SSE and the request was applied, 0 otherwise.
DLLEXPORT uint8_t jl_zero_denormals(uint8_t isZero)
{
#ifdef __SSE__
#ifdef __SSE2__
    // SSE2 supports both FZ and DAZ
    const uint32_t flags = 0x8040;
#else
    // SSE supports only the FZ flag
    const uint32_t flags = 0x8000;
#endif
    const uint32_t current = _mm_getcsr();

    _mm_setcsr(isZero ? (current | flags) : (current & ~flags));
    return 1;
#else
    return 0;
#endif
}
예제 #18
0
  /* Thread entry trampoline: enables FTZ and DAZ in MXCSR, optionally pins
   * the thread (non-Linux or MIC builds, when an affinity >= 0 was
   * requested), runs the user function, then frees the startup record. */
  static void* threadStartup(ThreadStartupData* parg)
  {
    const unsigned int ftzBit = 1u << 15;
    const unsigned int dazBit = 1u << 6;
    _mm_setcsr(_mm_getcsr() | ftzBit | dazBit);

#if !defined(__LINUX__) || defined(__MIC__)
    if (!(parg->affinity < 0)) {
      setAffinity(parg->affinity);
    }
#endif

    parg->f(parg->arg);

    delete parg;
    return NULL;
  }
예제 #19
0
파일: u_math.c 프로젝트: DirectFB/mesa
/**
 * Reads the floating-point state register (mxcsr on x86).
 *
 * Returns 0 when the build has no SSE support or the CPU capability table
 * reports that SSE is unavailable.
 */
unsigned
util_fpstate_get(void)
{
#if defined(PIPE_ARCH_SSE)
   if (util_cpu_caps.has_sse)
      return _mm_getcsr();
#endif

   return 0;
}
예제 #20
0
/* Fortran-callable helper. When built with FORCE_FTZ it sets the MXCSR bits
 * numbered FTZ_BIT and UNDERFLOW_EXCEPTION_MASK; otherwise it does nothing. */
void
FC_FUNC_(force_ftz,FORCE_FTZ)()
{
#ifdef FORCE_FTZ
  /* force FTZ by setting bits 11 and 15 to one */
  const unsigned int ftz_bits = (1 << FTZ_BIT) | (1 << UNDERFLOW_EXCEPTION_MASK);

  _mm_setcsr(_mm_getcsr() | ftz_bits);
#endif
}
예제 #21
0
// Special-case handler for single-precision sin/cos arguments.
//  - Finite x: returned unchanged.
//  - Infinite x: raises the MXCSR invalid flag and returns a NaN pattern
//    (INDEFBITPATT_SP32 on WIN64 after reporting the error, QNANBITPATT_SP32
//    elsewhere).
//  - NaN x: a signalling NaN is quieted and raises the invalid flag; on WIN64
//    every NaN is additionally reported through __amd_handle_errorf.
// `name` is only consumed by the WIN64 error reporting.
float _sinf_cosf_special(float x, char *name)
{
    UT32 xu;
    unsigned int is_snan;

    xu.f32 = x;

    // Exponent not all ones: ordinary finite value, nothing to do.
    if((xu.u32 & EXPBITS_SP32) != EXPBITS_SP32)
        return xu.f32;

    if((xu.u32 & MANTBITS_SP32) != 0x0)
    {
        // x is NaN; it is signalling when the quiet bit is clear
        is_snan = (((xu.u32 & QNAN_MASK_32) == QNAN_MASK_32) ? 0 : 1);
        if(is_snan)
        {
            xu.u32 |= QNAN_MASK_32;
            _mm_setcsr(_mm_getcsr() | MXCSR_ES_INVALID);
        }
#ifdef WIN64
        __amd_handle_errorf(DOMAIN, EDOM, name, x, is_snan, 0.0f, 0, xu.f32, 0);
#endif
    }
    else
    {
        // x is Inf
        _mm_setcsr(_mm_getcsr() | MXCSR_ES_INVALID);
#ifdef WIN64
        xu.u32 = INDEFBITPATT_SP32;
        __amd_handle_errorf(DOMAIN, EDOM, name, x, 0, 0.0f, 0, xu.f32, 0);
#else
        xu.u32 = QNANBITPATT_SP32;
        (void)name; // not used on this path; silences the unused-parameter warning
#endif
    }

    return xu.f32;
}
예제 #22
0
// Thread body: enables the high-resolution scheduler, blocks on the start
// semaphore, snapshots the thread's MXCSR, then alternates state checks and
// CPU execution indefinitely inside the page-fault guard.
void SysCoreThread::ExecuteTaskInThread()
{
	Threading::EnableHiresScheduler();
	m_sem_event.WaitWithoutYield();

	// Keep a copy of the MXCSR this thread started with.
	m_mxcsr_saved.bitmask = _mm_getcsr();

	PCSX2_PAGEFAULT_PROTECT {
		for (;;) {
			StateCheckInThread();
			DoCpuExecute();
		}
	} PCSX2_PAGEFAULT_EXCEPT;
}
예제 #23
0
// Thread body: enables the high-resolution scheduler, blocks on the start
// semaphore, snapshots the thread's MXCSR, then alternates state checks and
// CPU execution indefinitely inside the page-fault guard.
void SysCoreThread::ExecuteTaskInThread()
{
	// Note that *something* in SPU2-X and GSdx also set the timer resolution to 1ms.
	Threading::EnableHiresScheduler();
	m_sem_event.WaitWithoutYield();

	// Keep a copy of the MXCSR this thread started with.
	m_mxcsr_saved.bitmask = _mm_getcsr();

	PCSX2_PAGEFAULT_PROTECT {
		for (;;) {
			StateCheckInThread();
			DoCpuExecute();
		}
	} PCSX2_PAGEFAULT_EXCEPT;
}
예제 #24
0
inline T get_smallest_value(mpl::true_ const&)
{
   //
   // Returns the smallest positive value of T that is usable right now:
   // denorm_min() when denormals work, otherwise the smallest normal value.
   //
   // numeric_limits cannot be trusted on its own here, because denormal
   // support can be switched at runtime (SSE2 DAZ / FTZ modes). With
   // BOOST_MATH_CHECK_SSE2 the MXCSR flags are inspected directly; without
   // it, halving the smallest normal value and comparing against zero
   // detects whether denormals are being flushed.
   //
   static const T m = std::numeric_limits<T>::denorm_min();
#ifdef BOOST_MATH_CHECK_SSE2
   const bool denormals_disabled = (_mm_getcsr() & (_MM_FLUSH_ZERO_ON | 0x40)) != 0;
#else
   const bool denormals_disabled = ((tools::min_value<T>() / 2) == 0);
#endif
   if(denormals_disabled)
      return tools::min_value<T>();
   return m;
}
예제 #25
0
파일: ortho.cpp 프로젝트: sim82/shooter2
    // Application entry point: enables flush-to-zero in MXCSR, then builds
    // and starts the `ortho` application object. A gl_error_exception is
    // reported on stderr; the function returns 0 either way.
    static int main( const std::vector<CL_String> &args ) {
        _mm_setcsr( _mm_getcsr() | _MM_FLUSH_ZERO_ON);
//      plane p( plane::dir_zx_p, vec3f( 0.5, 0.5, 0.5 ));

//      return 0;
        try {
            ortho o;
            o.start();
        } catch( gl_error_exception &x ) {
            // Caught by reference so the thrown object is neither copied nor
            // sliced before what() is called.
            std::cerr << x.what() << std::endl;
            std::cerr << "bailing out\n";
        }

        return 0;
    }
예제 #26
0
// Calls several SSE/SSE2 intrinsics without including the headers that
// declare them. The trailing comments on each call are directives for the
// compiler's diagnostic verifier; they describe the diagnostics for that
// line and must stay attached to the statement they annotate.
void f() {
  (void)_mm_getcsr(); // expected-warning{{implicitly declaring library function '_mm_getcsr'}} \
  // expected-note{{include the header <xmmintrin.h> or explicitly provide a declaration for '_mm_getcsr'}}
  _mm_setcsr(1); // expected-warning{{implicitly declaring library function '_mm_setcsr'}} \
  // expected-note{{include the header <xmmintrin.h> or explicitly provide a declaration for '_mm_setcsr'}}
  _mm_sfence(); // expected-warning{{implicitly declaring library function '_mm_sfence'}} \
  // expected-note{{include the header <xmmintrin.h> or explicitly provide a declaration for '_mm_sfence'}}

  _mm_clflush((void*)0); // expected-warning{{implicitly declaring library function '_mm_clflush'}} \
  // expected-note{{include the header <emmintrin.h> or explicitly provide a declaration for '_mm_clflush'}}
  _mm_lfence(); // expected-warning{{implicitly declaring library function '_mm_lfence'}} \
  // expected-note{{include the header <emmintrin.h> or explicitly provide a declaration for '_mm_lfence'}}
  _mm_mfence(); // expected-warning{{implicitly declaring library function '_mm_mfence'}} \
  // expected-note{{include the header <emmintrin.h> or explicitly provide a declaration for '_mm_mfence'}}
  _mm_pause(); // expected-warning{{implicitly declaring library function '_mm_pause'}} \
  // expected-note{{include the header <emmintrin.h> or explicitly provide a declaration for '_mm_pause'}}
}
예제 #27
0
// Per-thread audio processing pass.
//   x                  - owning object (output count, threading mode, slots)
//   sig_outs           - this thread's output buffers; zeroed before processing
//   temp_mem_ptr/size  - scratch memory handed through to the slots
//   vec_size           - number of samples per output buffer
//   thread_num         - index of this thread among the active threads
//   num_active_threads - number of threads taking part in this pass
// Denormals are disabled (MXCSR bits 0x8040) for the duration of the call on
// x86 and the previous MXCSR is restored before returning.
void dynamicdsp_threadprocess(t_dynamicdsp *x, void **sig_outs, void *temp_mem_ptr, t_ptr_uint temp_mem_size, long vec_size, long thread_num, long num_active_threads)
{
    long num_sig_outs = x->num_sig_outs;
    
    // Turn off denormals
    
#if defined( __i386__ ) || defined( __x86_64__ )
    int oldMXCSR = _mm_getcsr();						// read the old MXCSR setting
    _mm_setcsr(oldMXCSR | 0x8040);						// write the new MXCSR setting setting DAZ and FZ bits
#endif
    
    // Zero outputs
    
    for (long i = 0; i < num_sig_outs; i++)
        memset(sig_outs[i], 0, sig_size * vec_size);
    
    if (x->manual_threading)
    {
        for (long i = 0; i < x->slots->size(); i++)
            x->slots->processIfThreadMatches(i, temp_mem_ptr, sig_outs, temp_mem_size, thread_num, num_active_threads);
    }
    else
    {
        long size = x->slots->size();
        
        if (size > 0)
        {
            // Each thread starts its sweep at a different slot so that the
            // threads do not all walk the slots in the same order. The start
            // offset is thread_num * (size / num_active_threads), wrapped
            // into [0, size). A non-positive thread count falls back to
            // starting at slot 0 rather than dividing by zero.
            // NOTE(review): presumably processIfUnprocessed skips slots that
            // another thread has already taken - confirm against the slots
            // class.
            long stride = (num_active_threads > 0) ? (size / num_active_threads) : 0;
            long index = (thread_num * stride) % size;
            
            if (index < 0)
                index += size;
            
            for (long i = 0; i < size; i++)
            {
                x->slots->processIfUnprocessed(index, temp_mem_ptr, sig_outs, temp_mem_size);
                
                if (++index >= size)
                    index = 0;
            }
        }
    }
    
    // return denormals to previous state 
    
#if defined( __i386__ ) || defined( __x86_64__ )	
    _mm_setcsr(oldMXCSR);	
#endif
}
예제 #28
0
// Resampler benchmark driver.
// Usage: <program> <num_frames>
// Fills a buffer of num_frames floats with values in [0, 1), then repeatedly
// reads num_frames frames from a cubic resampler while sweeping the ratio
// from 0.1 to 8.0. Returns 1 when the argument is missing or not a positive
// number, or when the buffer cannot be allocated; 0 otherwise.
int main(int argc, char **argv){
  
  _mm_setcsr(_mm_getcsr() | 0x8040);

  // The frame count must come from the command line: argv[1] is only valid
  // when at least one argument was supplied.
  if (argc < 2)
    return 1;

  int num_frames = atoi(argv[1]); // If we hardcode the value, the compiler is likely to optimize things which it wouldn't have done normally
  //fprintf(stderr, "num_frames: %d\n",num_frames);

  // num_frames is used as an allocation size and as a divisor below.
  if (num_frames <= 0)
    return 1;
  
  float *data = (float*)malloc(sizeof(float)*num_frames);
  if (data == NULL)
    return 1;
 
  for(int i=0;i<num_frames;i++){
    data[i] = (float)i / (float)num_frames; // Insert some some legal values.
  }

  void *resampler = RESAMPLER_create(src_callback, 1, NULL, RESAMPLER_CUBIC);

  int top = 1024*1024*8 * 64 / num_frames;
  
  for(int i=0 ; i < top ; i ++)
    RESAMPLER_read(resampler, scale(i,0,top,0.1,8.0), num_frames, data);
  
  free(data);

  //printf("hello\n");
  return 0;
}
예제 #29
0
void f0() {
  signed char         tmp_c;
//  unsigned char       tmp_Uc;
  signed short        tmp_s;
#ifdef USE_ALL
  unsigned short      tmp_Us;
#endif
  signed int          tmp_i;
  unsigned int        tmp_Ui;
  signed long long    tmp_LLi;
  unsigned long long  tmp_ULLi;
  float               tmp_f;
  double              tmp_d;

  void*          tmp_vp;
  const void*    tmp_vCp;
  char*          tmp_cp; 
  const char*    tmp_cCp; 
  int*           tmp_ip;
  float*         tmp_fp;
  const float*   tmp_fCp;
  double*        tmp_dp;
  const double*  tmp_dCp;
  long long*     tmp_LLip;

#define imm_i 32
#define imm_i_0_2 0
#define imm_i_0_4 3
#define imm_i_0_8 7
#define imm_i_0_16 15
  // Check this.
#define imm_i_0_256 0

  V2i*   tmp_V2ip;
  V1LLi* tmp_V1LLip;
  V2LLi* tmp_V2LLip;

  // 64-bit
  V8c    tmp_V8c;
  V4s    tmp_V4s;
  V2i    tmp_V2i;
  V1LLi  tmp_V1LLi;
#ifdef USE_3DNOW
  V2f    tmp_V2f;
#endif

  // 128-bit
  V16c   tmp_V16c;
  V8s    tmp_V8s;
  V4i    tmp_V4i;
  V2LLi  tmp_V2LLi;
  V4f    tmp_V4f;
  V2d    tmp_V2d;
  V2d*   tmp_V2dp;
  V4f*   tmp_V4fp;
  const V2d* tmp_V2dCp;
  const V4f* tmp_V4fCp;

  // 256-bit
  V32c   tmp_V32c;
  V4d    tmp_V4d;
  V8f    tmp_V8f;
  V4LLi  tmp_V4LLi;
  V8i    tmp_V8i;
  V4LLi* tmp_V4LLip;
  V4d*   tmp_V4dp;
  V8f*   tmp_V8fp;
  const V4d* tmp_V4dCp;
  const V8f* tmp_V8fCp;

  tmp_V2LLi = __builtin_ia32_undef128();
  tmp_V4LLi = __builtin_ia32_undef256();

  tmp_i = __builtin_ia32_comieq(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_comilt(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_comile(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_comigt(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_comige(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_comineq(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_ucomieq(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_ucomilt(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_ucomile(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_ucomigt(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_ucomige(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_ucomineq(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_comisdeq(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_comisdlt(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_comisdle(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_comisdgt(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_comisdge(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_comisdneq(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_ucomisdeq(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_ucomisdlt(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_ucomisdle(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_ucomisdgt(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_ucomisdge(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_ucomisdneq(tmp_V2d, tmp_V2d);
  tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 0);
  tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 1);
  tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 2);
  tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 3);
  tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 4);
  tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 5);
  tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 6);
  tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 7);
  tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 0);
  tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 1);
  tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 2);
  tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 3);
  tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 4);
  tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 5);
  tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 6);
  tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 7);
  tmp_V4f = __builtin_ia32_minps(tmp_V4f, tmp_V4f);
  tmp_V4f = __builtin_ia32_maxps(tmp_V4f, tmp_V4f);
  tmp_V4f = __builtin_ia32_minss(tmp_V4f, tmp_V4f);
  tmp_V4f = __builtin_ia32_maxss(tmp_V4f, tmp_V4f);

  tmp_V8c = __builtin_ia32_paddsb(tmp_V8c, tmp_V8c);
  tmp_V4s = __builtin_ia32_paddsw(tmp_V4s, tmp_V4s);
  tmp_V8c = __builtin_ia32_psubsb(tmp_V8c, tmp_V8c);
  tmp_V4s = __builtin_ia32_psubsw(tmp_V4s, tmp_V4s);
  tmp_V8c = __builtin_ia32_paddusb(tmp_V8c, tmp_V8c);
  tmp_V4s = __builtin_ia32_paddusw(tmp_V4s, tmp_V4s);
  tmp_V8c = __builtin_ia32_psubusb(tmp_V8c, tmp_V8c);
  tmp_V4s = __builtin_ia32_psubusw(tmp_V4s, tmp_V4s);
  tmp_V4s = __builtin_ia32_pmulhw(tmp_V4s, tmp_V4s);
  tmp_V4s = __builtin_ia32_pmulhuw(tmp_V4s, tmp_V4s);
  tmp_V8c = __builtin_ia32_pcmpeqb(tmp_V8c, tmp_V8c);
  tmp_V4s = __builtin_ia32_pcmpeqw(tmp_V4s, tmp_V4s);
  tmp_V2i = __builtin_ia32_pcmpeqd(tmp_V2i, tmp_V2i);
  tmp_V8c = __builtin_ia32_pcmpgtb(tmp_V8c, tmp_V8c);
  tmp_V4s = __builtin_ia32_pcmpgtw(tmp_V4s, tmp_V4s);
  tmp_V2i = __builtin_ia32_pcmpgtd(tmp_V2i, tmp_V2i);
  tmp_V8c = __builtin_ia32_pmaxub(tmp_V8c, tmp_V8c);
  tmp_V4s = __builtin_ia32_pmaxsw(tmp_V4s, tmp_V4s);
  tmp_V8c = __builtin_ia32_pminub(tmp_V8c, tmp_V8c);
  tmp_V4s = __builtin_ia32_pminsw(tmp_V4s, tmp_V4s);
  tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 0);
  tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 1);
  tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 2);
  tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 3);
  tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 4);
  tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 5);
  tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 6);
  tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 7);
  tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 0);
  tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 1);
  tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 2);
  tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 3);
  tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 4);
  tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 5);
  tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 6);
  tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 7);
  tmp_V2d = __builtin_ia32_minpd(tmp_V2d, tmp_V2d);
  tmp_V2d = __builtin_ia32_maxpd(tmp_V2d, tmp_V2d);
  tmp_V2d = __builtin_ia32_minsd(tmp_V2d, tmp_V2d);
  tmp_V2d = __builtin_ia32_maxsd(tmp_V2d, tmp_V2d);
  tmp_V16c = __builtin_ia32_paddsb128(tmp_V16c, tmp_V16c);
  tmp_V8s = __builtin_ia32_paddsw128(tmp_V8s, tmp_V8s);
  tmp_V16c = __builtin_ia32_psubsb128(tmp_V16c, tmp_V16c);
  tmp_V8s = __builtin_ia32_psubsw128(tmp_V8s, tmp_V8s);
  tmp_V16c = __builtin_ia32_paddusb128(tmp_V16c, tmp_V16c);
  tmp_V8s = __builtin_ia32_paddusw128(tmp_V8s, tmp_V8s);
  tmp_V16c = __builtin_ia32_psubusb128(tmp_V16c, tmp_V16c);
  tmp_V8s = __builtin_ia32_psubusw128(tmp_V8s, tmp_V8s);
  tmp_V8s = __builtin_ia32_pmulhw128(tmp_V8s, tmp_V8s);
  tmp_V16c = __builtin_ia32_pmaxub128(tmp_V16c, tmp_V16c);
  tmp_V8s = __builtin_ia32_pmaxsw128(tmp_V8s, tmp_V8s);
  tmp_V16c = __builtin_ia32_pminub128(tmp_V16c, tmp_V16c);
  tmp_V8s = __builtin_ia32_pminsw128(tmp_V8s, tmp_V8s);
  tmp_V8s = __builtin_ia32_packsswb128(tmp_V8s, tmp_V8s);
  tmp_V4i = __builtin_ia32_packssdw128(tmp_V4i, tmp_V4i);
  tmp_V8s = __builtin_ia32_packuswb128(tmp_V8s, tmp_V8s);
  tmp_V8s = __builtin_ia32_pmulhuw128(tmp_V8s, tmp_V8s);
  tmp_V4f = __builtin_ia32_addsubps(tmp_V4f, tmp_V4f);
  tmp_V2d = __builtin_ia32_addsubpd(tmp_V2d, tmp_V2d);
  tmp_V4f = __builtin_ia32_haddps(tmp_V4f, tmp_V4f);
  tmp_V2d = __builtin_ia32_haddpd(tmp_V2d, tmp_V2d);
  tmp_V4f = __builtin_ia32_hsubps(tmp_V4f, tmp_V4f);
  tmp_V2d = __builtin_ia32_hsubpd(tmp_V2d, tmp_V2d);
  tmp_V8s = __builtin_ia32_phaddw128(tmp_V8s, tmp_V8s);
  tmp_V4s = __builtin_ia32_phaddw(tmp_V4s, tmp_V4s);
  tmp_V4i = __builtin_ia32_phaddd128(tmp_V4i, tmp_V4i);
  tmp_V2i = __builtin_ia32_phaddd(tmp_V2i, tmp_V2i);
  tmp_V8s = __builtin_ia32_phaddsw128(tmp_V8s, tmp_V8s);
  tmp_V4s = __builtin_ia32_phaddsw(tmp_V4s, tmp_V4s);
  tmp_V8s = __builtin_ia32_phsubw128(tmp_V8s, tmp_V8s);
  tmp_V4s = __builtin_ia32_phsubw(tmp_V4s, tmp_V4s);
  tmp_V4i = __builtin_ia32_phsubd128(tmp_V4i, tmp_V4i);
  tmp_V2i = __builtin_ia32_phsubd(tmp_V2i, tmp_V2i);
  tmp_V8s = __builtin_ia32_phsubsw128(tmp_V8s, tmp_V8s);
  tmp_V4s = __builtin_ia32_phsubsw(tmp_V4s, tmp_V4s);
  tmp_V16c = __builtin_ia32_pmaddubsw128(tmp_V16c, tmp_V16c);
  tmp_V8c = __builtin_ia32_pmaddubsw(tmp_V8c, tmp_V8c);
  tmp_V8s = __builtin_ia32_pmulhrsw128(tmp_V8s, tmp_V8s);
  tmp_V4s = __builtin_ia32_pmulhrsw(tmp_V4s, tmp_V4s);
  tmp_V16c = __builtin_ia32_pshufb128(tmp_V16c, tmp_V16c);
  tmp_V8c = __builtin_ia32_pshufb(tmp_V8c, tmp_V8c);
  tmp_V16c = __builtin_ia32_psignb128(tmp_V16c, tmp_V16c);
  tmp_V8c = __builtin_ia32_psignb(tmp_V8c, tmp_V8c);
  tmp_V8s = __builtin_ia32_psignw128(tmp_V8s, tmp_V8s);
  tmp_V4s = __builtin_ia32_psignw(tmp_V4s, tmp_V4s);
  tmp_V4i = __builtin_ia32_psignd128(tmp_V4i, tmp_V4i);
  tmp_V2i = __builtin_ia32_psignd(tmp_V2i, tmp_V2i);
  tmp_V16c = __builtin_ia32_pabsb128(tmp_V16c);
  tmp_V8c = __builtin_ia32_pabsb(tmp_V8c);
  tmp_V8s = __builtin_ia32_pabsw128(tmp_V8s);
  tmp_V4s = __builtin_ia32_pabsw(tmp_V4s);
  tmp_V4i = __builtin_ia32_pabsd128(tmp_V4i);
  tmp_V2i = __builtin_ia32_pabsd(tmp_V2i);
  tmp_V4s = __builtin_ia32_psllw(tmp_V4s, tmp_V1LLi);
  tmp_V2i = __builtin_ia32_pslld(tmp_V2i, tmp_V1LLi);
  tmp_V1LLi = __builtin_ia32_psllq(tmp_V1LLi, tmp_V1LLi);
  tmp_V4s = __builtin_ia32_psrlw(tmp_V4s, tmp_V1LLi);
  tmp_V2i = __builtin_ia32_psrld(tmp_V2i, tmp_V1LLi);
  tmp_V1LLi = __builtin_ia32_psrlq(tmp_V1LLi, tmp_V1LLi);
  tmp_V4s = __builtin_ia32_psraw(tmp_V4s, tmp_V1LLi);
  tmp_V2i = __builtin_ia32_psrad(tmp_V2i, tmp_V1LLi);
  tmp_V2i = __builtin_ia32_pmaddwd(tmp_V4s, tmp_V4s);
  tmp_V8c = __builtin_ia32_packsswb(tmp_V4s, tmp_V4s);
  tmp_V4s = __builtin_ia32_packssdw(tmp_V2i, tmp_V2i);
  tmp_V8c = __builtin_ia32_packuswb(tmp_V4s, tmp_V4s);
  tmp_i = __builtin_ia32_vec_ext_v2si(tmp_V2i, 0);

  __builtin_ia32_incsspd(tmp_Ui);
  __builtin_ia32_incsspq(tmp_ULLi);
  tmp_Ui = __builtin_ia32_rdsspd(tmp_Ui);
  tmp_ULLi = __builtin_ia32_rdsspq(tmp_ULLi);
  __builtin_ia32_saveprevssp();
  __builtin_ia32_rstorssp(tmp_vp);
  __builtin_ia32_wrssd(tmp_Ui, tmp_vp);
  __builtin_ia32_wrssq(tmp_ULLi, tmp_vp);
  __builtin_ia32_wrussd(tmp_Ui, tmp_vp);
  __builtin_ia32_wrussq(tmp_ULLi, tmp_vp);
  __builtin_ia32_setssbsy();
  __builtin_ia32_clrssbsy(tmp_vp);

  (void) __builtin_ia32_ldmxcsr(tmp_Ui);
  (void) _mm_setcsr(tmp_Ui);
  tmp_Ui = __builtin_ia32_stmxcsr();
  tmp_Ui = _mm_getcsr();
  (void)__builtin_ia32_fxsave(tmp_vp);
  (void)__builtin_ia32_fxsave64(tmp_vp);
  (void)__builtin_ia32_fxrstor(tmp_vp);
  (void)__builtin_ia32_fxrstor64(tmp_vp);

  (void)__builtin_ia32_xsave(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xsave64(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xrstor(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xrstor64(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xsaveopt(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xsaveopt64(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xrstors(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xrstors64(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xsavec(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xsavec64(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xsaves(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xsaves64(tmp_vp, tmp_ULLi);

  (void) __builtin_ia32_monitorx(tmp_vp, tmp_Ui, tmp_Ui);
  (void) __builtin_ia32_mwaitx(tmp_Ui, tmp_Ui, tmp_Ui);
  (void) __builtin_ia32_clzero(tmp_vp);
  (void) __builtin_ia32_cldemote(tmp_vp);

  tmp_V4f = __builtin_ia32_cvtpi2ps(tmp_V4f, tmp_V2i);
  tmp_V2i = __builtin_ia32_cvtps2pi(tmp_V4f);
  tmp_i = __builtin_ia32_cvtss2si(tmp_V4f);
  tmp_i = __builtin_ia32_cvttss2si(tmp_V4f);

  tmp_i = __builtin_ia32_rdtsc();
  tmp_i = __rdtsc();
  tmp_i = __builtin_ia32_rdtscp(&tmp_Ui);
  tmp_LLi = __builtin_ia32_rdpmc(tmp_i);
  __builtin_ia32_wbnoinvd();
#ifdef USE_64
  tmp_LLi = __builtin_ia32_cvtss2si64(tmp_V4f);
  tmp_LLi = __builtin_ia32_cvttss2si64(tmp_V4f);
#endif
  tmp_V2i = __builtin_ia32_cvttps2pi(tmp_V4f);
  (void) __builtin_ia32_maskmovq(tmp_V8c, tmp_V8c, tmp_cp);
  (void) __builtin_ia32_storehps(tmp_V2ip, tmp_V4f);
  (void) __builtin_ia32_storelps(tmp_V2ip, tmp_V4f);
  tmp_i = __builtin_ia32_movmskps(tmp_V4f);
  tmp_i = __builtin_ia32_pmovmskb(tmp_V8c);
  (void) __builtin_ia32_movntq(tmp_V1LLip, tmp_V1LLi);
  (void) __builtin_ia32_sfence();
  (void) _mm_sfence();

  tmp_V4s = __builtin_ia32_psadbw(tmp_V8c, tmp_V8c);
  tmp_V4f = __builtin_ia32_rcpps(tmp_V4f);
  tmp_V4f = __builtin_ia32_rcpss(tmp_V4f);
  tmp_V4f = __builtin_ia32_rsqrtps(tmp_V4f);
  tmp_V4f = __builtin_ia32_rsqrtss(tmp_V4f);
  tmp_V4f = __builtin_ia32_sqrtps(tmp_V4f);
  tmp_V4f = __builtin_ia32_sqrtss(tmp_V4f);
  (void) __builtin_ia32_maskmovdqu(tmp_V16c, tmp_V16c, tmp_cp);
  tmp_i = __builtin_ia32_movmskpd(tmp_V2d);
  tmp_i = __builtin_ia32_pmovmskb128(tmp_V16c);
  (void) __builtin_ia32_movnti(tmp_ip, tmp_i);
#ifdef USE_64
  (void) __builtin_ia32_movnti64(tmp_LLip, tmp_LLi);
#endif
  tmp_V2LLi = __builtin_ia32_psadbw128(tmp_V16c, tmp_V16c);
  tmp_V2d = __builtin_ia32_sqrtpd(tmp_V2d);
  tmp_V2d = __builtin_ia32_sqrtsd(tmp_V2d);
  tmp_V2LLi = __builtin_ia32_cvtpd2dq(tmp_V2d);
  tmp_V2i = __builtin_ia32_cvtpd2pi(tmp_V2d);
  tmp_V4f = __builtin_ia32_cvtpd2ps(tmp_V2d);
  tmp_V4i = __builtin_ia32_cvttpd2dq(tmp_V2d);
  tmp_V2i = __builtin_ia32_cvttpd2pi(tmp_V2d);
  tmp_V2d = __builtin_ia32_cvtpi2pd(tmp_V2i);
  tmp_i = __builtin_ia32_cvtsd2si(tmp_V2d);
  tmp_i = __builtin_ia32_cvttsd2si(tmp_V2d);
  tmp_V4f = __builtin_ia32_cvtsd2ss(tmp_V4f, tmp_V2d);
#ifdef USE_64
  tmp_LLi = __builtin_ia32_cvtsd2si64(tmp_V2d);
  tmp_LLi = __builtin_ia32_cvttsd2si64(tmp_V2d);
#endif
  tmp_V4i = __builtin_ia32_cvtps2dq(tmp_V4f);
  tmp_V4i = __builtin_ia32_cvttps2dq(tmp_V4f);
  (void) __builtin_ia32_clflush(tmp_vCp);
  (void) _mm_clflush(tmp_vCp);
  (void) __builtin_ia32_lfence();
  (void) _mm_lfence();
  (void) __builtin_ia32_mfence();
  (void) _mm_mfence();
  (void) __builtin_ia32_pause();
  (void) _mm_pause();
  tmp_V4s = __builtin_ia32_psllwi(tmp_V4s, tmp_i);
  tmp_V2i = __builtin_ia32_pslldi(tmp_V2i, tmp_i);
  tmp_V1LLi = __builtin_ia32_psllqi(tmp_V1LLi, tmp_i);
  tmp_V4s = __builtin_ia32_psrawi(tmp_V4s, tmp_i);
  tmp_V2i = __builtin_ia32_psradi(tmp_V2i, tmp_i);
  tmp_V4s = __builtin_ia32_psrlwi(tmp_V4s, tmp_i);
  tmp_V2i = __builtin_ia32_psrldi(tmp_V2i, tmp_i);
  tmp_V1LLi = __builtin_ia32_psrlqi(tmp_V1LLi, tmp_i);
  tmp_V1LLi = __builtin_ia32_pmuludq(tmp_V2i, tmp_V2i);
  tmp_V2LLi = __builtin_ia32_pmuludq128(tmp_V4i, tmp_V4i);
  tmp_V8s = __builtin_ia32_psraw128(tmp_V8s, tmp_V8s);
  tmp_V4i = __builtin_ia32_psrad128(tmp_V4i, tmp_V4i);
  tmp_V8s = __builtin_ia32_psrlw128(tmp_V8s, tmp_V8s);
  tmp_V4i = __builtin_ia32_psrld128(tmp_V4i, tmp_V4i);
  tmp_V2LLi = __builtin_ia32_psrlq128(tmp_V2LLi, tmp_V2LLi);
  tmp_V8s = __builtin_ia32_psllw128(tmp_V8s, tmp_V8s);
  tmp_V4i = __builtin_ia32_pslld128(tmp_V4i, tmp_V4i);
  tmp_V2LLi = __builtin_ia32_psllq128(tmp_V2LLi, tmp_V2LLi);
  tmp_V8s = __builtin_ia32_psllwi128(tmp_V8s, tmp_i);
  tmp_V4i = __builtin_ia32_pslldi128(tmp_V4i, tmp_i);
  tmp_V2LLi = __builtin_ia32_psllqi128(tmp_V2LLi, tmp_i);
  tmp_V8s = __builtin_ia32_psrlwi128(tmp_V8s, tmp_i);
  tmp_V4i = __builtin_ia32_psrldi128(tmp_V4i, tmp_i);
  tmp_V2LLi = __builtin_ia32_psrlqi128(tmp_V2LLi, tmp_i);
  tmp_V8s = __builtin_ia32_psrawi128(tmp_V8s, tmp_i);
  tmp_V4i = __builtin_ia32_psradi128(tmp_V4i, tmp_i);
  tmp_V8s = __builtin_ia32_pmaddwd128(tmp_V8s, tmp_V8s);
  (void) __builtin_ia32_monitor(tmp_vp, tmp_Ui, tmp_Ui);
  (void) __builtin_ia32_mwait(tmp_Ui, tmp_Ui);
  tmp_V16c = __builtin_ia32_lddqu(tmp_cCp);
  tmp_V2LLi = __builtin_ia32_palignr128(tmp_V2LLi, tmp_V2LLi, imm_i);
  tmp_V1LLi = __builtin_ia32_palignr(tmp_V1LLi, tmp_V1LLi, imm_i);
#ifdef USE_SSE4
  tmp_V16c = __builtin_ia32_pblendvb128(tmp_V16c, tmp_V16c, tmp_V16c);
  tmp_V2d = __builtin_ia32_blendvpd(tmp_V2d, tmp_V2d, tmp_V2d);
  tmp_V4f = __builtin_ia32_blendvps(tmp_V4f, tmp_V4f, tmp_V4f);
  tmp_V8s = __builtin_ia32_packusdw128(tmp_V4i, tmp_V4i);
  tmp_V16c = __builtin_ia32_pmaxsb128(tmp_V16c, tmp_V16c);
  tmp_V4i = __builtin_ia32_pmaxsd128(tmp_V4i, tmp_V4i);
  tmp_V4i = __builtin_ia32_pmaxud128(tmp_V4i, tmp_V4i);
  tmp_V8s = __builtin_ia32_pmaxuw128(tmp_V8s, tmp_V8s);
  tmp_V16c = __builtin_ia32_pminsb128(tmp_V16c, tmp_V16c);
  tmp_V4i = __builtin_ia32_pminsd128(tmp_V4i, tmp_V4i);
  tmp_V4i = __builtin_ia32_pminud128(tmp_V4i, tmp_V4i);
  tmp_V8s = __builtin_ia32_pminuw128(tmp_V8s, tmp_V8s);
  tmp_V2LLi = __builtin_ia32_pmuldq128(tmp_V4i, tmp_V4i);
  tmp_V4f = __builtin_ia32_roundps(tmp_V4f, imm_i_0_16);
  tmp_V4f = __builtin_ia32_roundss(tmp_V4f, tmp_V4f, imm_i_0_16);
  tmp_V2d = __builtin_ia32_roundsd(tmp_V2d, tmp_V2d, imm_i_0_16);
  tmp_V2d = __builtin_ia32_roundpd(tmp_V2d, imm_i_0_16);
  tmp_V4f = __builtin_ia32_insertps128(tmp_V4f, tmp_V4f, imm_i_0_256);
#endif

  tmp_V4d = __builtin_ia32_addsubpd256(tmp_V4d, tmp_V4d);
  tmp_V8f = __builtin_ia32_addsubps256(tmp_V8f, tmp_V8f);
  tmp_V4d = __builtin_ia32_haddpd256(tmp_V4d, tmp_V4d);
  tmp_V8f = __builtin_ia32_hsubps256(tmp_V8f, tmp_V8f);
  tmp_V4d = __builtin_ia32_hsubpd256(tmp_V4d, tmp_V4d);
  tmp_V8f = __builtin_ia32_haddps256(tmp_V8f, tmp_V8f);
  tmp_V4d = __builtin_ia32_maxpd256(tmp_V4d, tmp_V4d);
  tmp_V8f = __builtin_ia32_maxps256(tmp_V8f, tmp_V8f);
  tmp_V4d = __builtin_ia32_minpd256(tmp_V4d, tmp_V4d);
  tmp_V8f = __builtin_ia32_minps256(tmp_V8f, tmp_V8f);
  tmp_V2d = __builtin_ia32_vpermilvarpd(tmp_V2d, tmp_V2LLi);
  tmp_V4f = __builtin_ia32_vpermilvarps(tmp_V4f, tmp_V4i);
  tmp_V4d = __builtin_ia32_vpermilvarpd256(tmp_V4d, tmp_V4LLi);
  tmp_V8f = __builtin_ia32_vpermilvarps256(tmp_V8f, tmp_V8i);
  tmp_V4d = __builtin_ia32_blendvpd256(tmp_V4d, tmp_V4d, tmp_V4d);
  tmp_V8f = __builtin_ia32_blendvps256(tmp_V8f, tmp_V8f, tmp_V8f);
  tmp_V8f = __builtin_ia32_dpps256(tmp_V8f, tmp_V8f, 0x7);
  tmp_V4d = __builtin_ia32_cmppd256(tmp_V4d, tmp_V4d, 0);
  tmp_V8f = __builtin_ia32_cmpps256(tmp_V8f, tmp_V8f, 0);
  tmp_V4f = __builtin_ia32_cvtpd2ps256(tmp_V4d);
  tmp_V8i = __builtin_ia32_cvtps2dq256(tmp_V8f);
  tmp_V4i = __builtin_ia32_cvttpd2dq256(tmp_V4d);
  tmp_V4i = __builtin_ia32_cvtpd2dq256(tmp_V4d);
  tmp_V8i = __builtin_ia32_cvttps2dq256(tmp_V8f);
  tmp_V4d = __builtin_ia32_vperm2f128_pd256(tmp_V4d, tmp_V4d, 0x7);
  tmp_V8f = __builtin_ia32_vperm2f128_ps256(tmp_V8f, tmp_V8f, 0x7);
  tmp_V8i = __builtin_ia32_vperm2f128_si256(tmp_V8i, tmp_V8i, 0x7);
  tmp_V4d = __builtin_ia32_sqrtpd256(tmp_V4d);
  tmp_V8f = __builtin_ia32_sqrtps256(tmp_V8f);
  tmp_V8f = __builtin_ia32_rsqrtps256(tmp_V8f);
  tmp_V8f = __builtin_ia32_rcpps256(tmp_V8f);
  tmp_V4d = __builtin_ia32_roundpd256(tmp_V4d, 0x1);
  tmp_V8f = __builtin_ia32_roundps256(tmp_V8f, 0x1);
  tmp_i = __builtin_ia32_vtestzpd(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_vtestcpd(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_vtestnzcpd(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_vtestzps(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_vtestcps(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_vtestnzcps(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_vtestzpd256(tmp_V4d, tmp_V4d);
  tmp_i = __builtin_ia32_vtestcpd256(tmp_V4d, tmp_V4d);
  tmp_i = __builtin_ia32_vtestnzcpd256(tmp_V4d, tmp_V4d);
  tmp_i = __builtin_ia32_vtestzps256(tmp_V8f, tmp_V8f);
  tmp_i = __builtin_ia32_vtestcps256(tmp_V8f, tmp_V8f);
  tmp_i = __builtin_ia32_vtestnzcps256(tmp_V8f, tmp_V8f);
  tmp_i = __builtin_ia32_ptestz256(tmp_V4LLi, tmp_V4LLi);
  tmp_i = __builtin_ia32_ptestc256(tmp_V4LLi, tmp_V4LLi);
  tmp_i = __builtin_ia32_ptestnzc256(tmp_V4LLi, tmp_V4LLi);
  tmp_i = __builtin_ia32_movmskpd256(tmp_V4d);
  tmp_i = __builtin_ia32_movmskps256(tmp_V8f);
  __builtin_ia32_vzeroall();
  __builtin_ia32_vzeroupper();
  tmp_V32c = __builtin_ia32_lddqu256(tmp_cCp);
  tmp_V2d = __builtin_ia32_maskloadpd(tmp_V2dCp, tmp_V2LLi);
  tmp_V4f = __builtin_ia32_maskloadps(tmp_V4fCp, tmp_V4i);
  tmp_V4d = __builtin_ia32_maskloadpd256(tmp_V4dCp, tmp_V4LLi);
  tmp_V8f = __builtin_ia32_maskloadps256(tmp_V8fCp, tmp_V8i);
  __builtin_ia32_maskstorepd(tmp_V2dp, tmp_V2LLi, tmp_V2d);
  __builtin_ia32_maskstoreps(tmp_V4fp, tmp_V4i, tmp_V4f);
  __builtin_ia32_maskstorepd256(tmp_V4dp, tmp_V4LLi, tmp_V4d);
  __builtin_ia32_maskstoreps256(tmp_V8fp, tmp_V8i, tmp_V8f);

#ifdef USE_3DNOW
  tmp_V8c = __builtin_ia32_pavgusb(tmp_V8c, tmp_V8c);
  tmp_V2i = __builtin_ia32_pf2id(tmp_V2f);
  tmp_V2f = __builtin_ia32_pfacc(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfadd(tmp_V2f, tmp_V2f);
  tmp_V2i = __builtin_ia32_pfcmpeq(tmp_V2f, tmp_V2f);
  tmp_V2i = __builtin_ia32_pfcmpge(tmp_V2f, tmp_V2f);
  tmp_V2i = __builtin_ia32_pfcmpgt(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfmax(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfmin(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfmul(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfrcp(tmp_V2f);
  tmp_V2f = __builtin_ia32_pfrcpit1(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfrcpit2(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfrsqrt(tmp_V2f);
  tmp_V2f = __builtin_ia32_pfrsqit1(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfsub(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfsubr(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pi2fd(tmp_V2i);
  tmp_V4s = __builtin_ia32_pmulhrw(tmp_V4s, tmp_V4s);
  tmp_V2i = __builtin_ia32_pf2iw(tmp_V2f);
  tmp_V2f = __builtin_ia32_pfnacc(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfpnacc(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pi2fw(tmp_V2i);
  tmp_V2f = __builtin_ia32_pswapdsf(tmp_V2f);
  tmp_V2i = __builtin_ia32_pswapdsi(tmp_V2i);

  tmp_V4i = __builtin_ia32_sha1rnds4(tmp_V4i, tmp_V4i, imm_i_0_4);
  tmp_V4i = __builtin_ia32_sha1nexte(tmp_V4i, tmp_V4i);
  tmp_V4i = __builtin_ia32_sha1msg1(tmp_V4i, tmp_V4i);
  tmp_V4i = __builtin_ia32_sha1msg2(tmp_V4i, tmp_V4i);
  tmp_V4i = __builtin_ia32_sha256rnds2(tmp_V4i, tmp_V4i, tmp_V4i);
  tmp_V4i = __builtin_ia32_sha256msg1(tmp_V4i, tmp_V4i);
  tmp_V4i = __builtin_ia32_sha256msg2(tmp_V4i, tmp_V4i);
#endif
}
예제 #30
0
// Build the permutohedral lattice for a set of feature vectors (SSE version).
//
// Parameters:
//   feature       point-major array; component j of point p is read from
//                 feature[ p*feature_size + j ]
//   feature_size  dimensionality d of every feature vector
//   N             number of feature vectors
//
// Effects (all on members of this object):
//   N_, d_           set to N and feature_size
//   offset_          for each point, the hash-table index of each of the d+1
//                    vertices of its enclosing simplex
//   barycentric_     the matching barycentric weight of each of those vertices
//   M_               number of distinct lattice vertices in the hash table
//   blur_neighbors_  for each of the d+1 axes and each vertex, the lookup
//                    results for its two neighbours along that axis
//
// Points are processed four at a time (one __m128 lane per point). Lanes past
// the end of the input are fed zero features and are still written out, which
// is why offset_ and barycentric_ are allocated with N_+16 rows.
void Permutohedral::init ( const float* feature, int feature_size, int N )
{
	// Compute the lattice coordinates for each feature [there is going to be a lot of magic here]
	N_ = N;
	d_ = feature_size;
	HashTable hash_table( d_, N_/**(d_+1)*/ );

	const int blocksize = sizeof(__m128) / sizeof(float);
	const __m128 invdplus1   = _mm_set1_ps( 1.0f / (d_+1) );
	const __m128 dplus1      = _mm_set1_ps( d_+1 );
	const __m128 Zero        = _mm_set1_ps( 0 );
	const __m128 One         = _mm_set1_ps( 1 );

	// Allocate the class memory (delete[] on a null pointer is a no-op, so no guard is needed)
	delete [] offset_;
	offset_ = new int[ (d_+1)*(N_+16) ];
	memset( offset_, 0, (d_+1)*(N_+16)*sizeof(int) );

	delete [] barycentric_;
	barycentric_ = new float[ (d_+1)*(N_+16) ];
	memset( barycentric_, 0, (d_+1)*(N_+16)*sizeof(float) );

	// Allocate the local memory (16-byte aligned so it can be accessed as __m128)
	__m128 * scale_factor = (__m128*) _mm_malloc( (d_  )*sizeof(__m128) , 16 );
	__m128 * f            = (__m128*) _mm_malloc( (d_  )*sizeof(__m128) , 16 );
	__m128 * elevated     = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128) , 16 );
	__m128 * rem0         = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128) , 16 );
	__m128 * rank         = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128), 16 );
	float * barycentric = new float[(d_+2)*blocksize];
	short * canonical = new short[(d_+1)*(d_+1)];
	short * key = new short[d_+1];

	// Compute the canonical simplex
	for( int i=0; i<=d_; i++ ){
		for( int j=0; j<=d_-i; j++ )
			canonical[i*(d_+1)+j] = i;
		for( int j=d_-i+1; j<=d_; j++ )
			canonical[i*(d_+1)+j] = i - (d_+1);
	}

	// Expected standard deviation of our filter (p.6 in [Adams etal 2010])
	float inv_std_dev = sqrt(2.0 / 3.0)*(d_+1);
	// Compute the diagonal part of E (p.5 in [Adams etal 2010]).
	// The normalisation is 1/sqrt((i+1)(i+2)) and the result is then multiplied
	// by inv_std_dev; inv_std_dev must stay outside the square root.
	for( int i=0; i<d_; i++ )
		scale_factor[i] = _mm_set1_ps( 1.0f / sqrt( float((i+2)*(i+1)) ) * inv_std_dev );

	// Setup the SSE rounding: without SSE4.1 the rounding below goes through
	// _mm_cvtps_epi32, which follows the MXCSR rounding mode, so force
	// round-to-nearest and remember the previous control word.
#ifndef __SSE4_1__
	const unsigned int old_rounding = _mm_getcsr();
	_mm_setcsr( (old_rounding&~_MM_ROUND_MASK) | _MM_ROUND_NEAREST );
#endif

	// Compute the simplex each feature lies in
	for( int k=0; k<N_; k+=blocksize ){
		// Load the feature from memory, transposing so that f[j] holds
		// component j of the four points k..k+3 (zero for lanes past N_)
		float * ff = (float*)f;
		for( int j=0; j<d_; j++ )
			for( int i=0; i<blocksize; i++ )
				ff[ j*blocksize + i ] = k+i < N_ ? feature[ (k+i)*d_+j ] : 0.0;

		// Elevate the feature ( y = Ep, see p.5 in [Adams etal 2010])

		// sm contains the sum of 1..n of our feature vector
		__m128 sm = Zero;
		for( int j=d_; j>0; j-- ){
			__m128 cf = f[j-1]*scale_factor[j-1];
			elevated[j] = sm - _mm_set1_ps(j)*cf;
			sm += cf;
		}
		elevated[0] = sm;

		// Find the closest 0-colored simplex through rounding
		__m128 sum = Zero;
		for( int i=0; i<=d_; i++ ){
			__m128 v = invdplus1 * elevated[i];
#ifdef __SSE4_1__
			v = _mm_round_ps( v, _MM_FROUND_TO_NEAREST_INT );
#else
			v = _mm_cvtepi32_ps( _mm_cvtps_epi32( v ) );
#endif
			rem0[i] = v*dplus1;
			sum += v;
		}

		// Find the simplex we are in and store it in rank (where rank describes what position coordinate i has in the sorted order of the features values)
		for( int i=0; i<=d_; i++ )
			rank[i] = Zero;
		for( int i=0; i<d_; i++ ){
			__m128 di = elevated[i] - rem0[i];
			for( int j=i+1; j<=d_; j++ ){
				__m128 dj = elevated[j] - rem0[j];
				// c is 1.0 in the lanes where di < dj and 0.0 elsewhere
				__m128 c = _mm_and_ps( One, _mm_cmplt_ps( di, dj ) );
				rank[i] += c;
				rank[j] += One-c;
			}
		}

		// If the point doesn't lie on the plane (sum != 0) bring it back
		for( int i=0; i<=d_; i++ ){
			rank[i] += sum;
			// Wrap rank back into [0, d+1) and shift rem0 by the same amount
			__m128 add = _mm_and_ps( dplus1, _mm_cmplt_ps( rank[i], Zero ) );
			__m128 sub = _mm_and_ps( dplus1, _mm_cmpge_ps( rank[i], dplus1 ) );
			rank[i] += add-sub;
			rem0[i] += add-sub;
		}

		// Compute the barycentric coordinates (p.10 in [Adams etal 2010])
		for( int i=0; i<(d_+2)*blocksize; i++ )
			barycentric[ i ] = 0;
		for( int i=0; i<=d_; i++ ){
			__m128 v = (elevated[i] - rem0[i])*invdplus1;

			// Didn't figure out how to SSE this
			float * fv = (float*)&v;
			float * frank = (float*)&rank[i];
			for( int j=0; j<blocksize; j++ ){
				int p = d_-frank[j];
				barycentric[j*(d_+2)+p  ] += fv[j];
				barycentric[j*(d_+2)+p+1] -= fv[j];
			}
		}

		// The rest is not SSE'd
		for( int j=0; j<blocksize; j++ ){
			// Wrap around
			barycentric[j*(d_+2)+0]+= 1 + barycentric[j*(d_+2)+d_+1];

			float * frank = (float*)rank;
			float * frem0 = (float*)rem0;
			// Compute all vertices and their offset
			for( int remainder=0; remainder<=d_; remainder++ ){
				// Only the first d_ coordinates are written into the key
				for( int i=0; i<d_; i++ ){
					key[i] = frem0[i*blocksize+j] + canonical[ remainder*(d_+1) + (int)frank[i*blocksize+j] ];
				}
				offset_[ (j+k)*(d_+1)+remainder ] = hash_table.find( key, true );
				barycentric_[ (j+k)*(d_+1)+remainder ] = barycentric[ j*(d_+2)+remainder ];
			}
		}
	}
	_mm_free( scale_factor );
	_mm_free( f );
	_mm_free( elevated );
	_mm_free( rem0 );
	_mm_free( rank );
	delete [] barycentric;
	delete [] canonical;
	delete [] key;

	// Reset the SSE rounding
#ifndef __SSE4_1__
	_mm_setcsr( old_rounding );
#endif

	// This is normally fast enough so no SSE needed here
	// Find the Neighbors of each lattice point

	// Get the number of vertices in the lattice
	M_ = hash_table.size();

	// Create the neighborhood structure
	delete[] blur_neighbors_;
	blur_neighbors_ = new Neighbors[ (d_+1)*M_ ];

	short * n1 = new short[d_+1];
	short * n2 = new short[d_+1];

	// For each of d+1 axes,
	for( int j = 0; j <= d_; j++ ){
		for( int i=0; i<M_; i++ ){
			const short * vertex_key = hash_table.getKey( i );
			for( int k=0; k<d_; k++ ){
				n1[k] = vertex_key[k] - 1;
				n2[k] = vertex_key[k] + 1;
			}
			// NOTE(review): for j == d_ this reads vertex_key[d_]. Keys are built
			// above from d_ coordinates only, so this presumably reads one element
			// past the key and the value is ignored by find() -- confirm against
			// HashTable before relying on it.
			n1[j] = vertex_key[j] + d_;
			n2[j] = vertex_key[j] - d_;

			blur_neighbors_[j*M_+i].n1 = hash_table.find( n1 );
			blur_neighbors_[j*M_+i].n2 = hash_table.find( n2 );
		}
	}
	delete[] n1;
	delete[] n2;
}