// specific version of the function. Compile once for each version float FUNCNAME (float * f) { Vec16f a; // vector of 16 floats a.load(f); // load array into vector return horizontal_add(a); // return sum of 16 elements }
static inline double calc_output_single (SINC_FILTER *filter, const increment_t increment, const increment_t start_filter_index) { #ifdef RESAMPLER_SSE_OPT __m128i increment4; __m128 left128,right128; float left,right; #else double left,right; #endif const coeff_t * const __restrict coeffs = filter->coeffs; const float * const __restrict buffer = filter->buffer; increment_t filter_index, max_filter_index ; int data_index, coeff_count; /* Convert input parameters into fixed point. */ max_filter_index = int_to_fp (filter->coeff_half_len) ; /* First apply the left half of the filter. */ filter_index = start_filter_index ; coeff_count = (max_filter_index - filter_index) / increment ; filter_index = filter_index + coeff_count * increment ; data_index = filter->b_current - coeff_count ; #ifdef RESAMPLER_SSE_OPT increment4 = _mm_set_epi32(increment * 3, increment * 2, increment, 0); left128 = _mm_setzero_ps(); while(filter_index >= increment * 3) { #ifdef USE_WINDOWS_CODE __m128i indx = _mm_sub_epi32(_mm_set1_epi32(filter_index), increment4); __m128i fractioni = _mm_and_si128(indx,_mm_set1_epi32(((((increment_t)1) << SHIFT_BITS) - 1))); #else Windows__m128i indx; indx.m128i = _mm_sub_epi32(_mm_set1_epi32(filter_index), increment4); __m128i fractioni = _mm_and_si128(indx.m128i,_mm_set1_epi32(((((increment_t)1) << SHIFT_BITS) - 1))); #endif __m128 icoeff0, icoeff2; // warning that these are uninitialized is okay and its intended, as both high and low 64bit-parts are set below __m128 icoeff,icoeffp1,icoeffd,fraction; #ifdef _DEBUG icoeff0 = icoeff2 = _mm_setzero_ps(); #endif #ifdef USE_WINDOWS_CODE indx = _mm_srai_epi32(indx, SHIFT_BITS); #else indx.m128i = _mm_srai_epi32(indx.m128i, SHIFT_BITS); #endif icoeff0 = _mm_loadh_pi(_mm_loadl_pi(icoeff0, (__m64*)(coeffs + indx.m128i_i32[0])), (__m64*)(coeffs + indx.m128i_i32[1])); icoeff2 = _mm_loadh_pi(_mm_loadl_pi(icoeff2, (__m64*)(coeffs + indx.m128i_i32[2])), (__m64*)(coeffs + indx.m128i_i32[3])); icoeff = _mm_shuffle_ps(icoeff0, icoeff2, _MM_SHUFFLE(2, 0, 2, 0)); icoeffp1 = _mm_shuffle_ps(icoeff0, icoeff2, _MM_SHUFFLE(3, 1, 3, 1)); icoeffd = _mm_sub_ps(icoeffp1, icoeff); fraction = _mm_mul_ps(_mm_cvtepi32_ps(fractioni), _mm_set1_ps((float)INV_FP_ONE)); icoeff = _mm_add_ps(icoeff,_mm_mul_ps(icoeffd, fraction)); left128 = _mm_add_ps(left128,_mm_mul_ps(icoeff, _mm_loadu_ps(buffer + data_index))); data_index += 4; filter_index -= increment * 4; } #endif left = 0.; while (filter_index >= MAKE_INCREMENT_T(0)) { coeff_t fraction = fp_to_float(filter_index); int indx = fp_to_int(filter_index); coeff_t icoeff = coeffs[indx] + fraction * (coeffs[indx + 1] - coeffs[indx]); left += icoeff * buffer[data_index]; filter_index -= increment; data_index++; } /* Now apply the right half of the filter. */ filter_index = increment - start_filter_index ; coeff_count = (max_filter_index - filter_index) / increment ; filter_index = filter_index + coeff_count * increment ; data_index = filter->b_current + 1 + coeff_count ; #ifdef RESAMPLER_SSE_OPT right128 = _mm_setzero_ps(); while (filter_index > increment * 3) { #ifdef USE_WINDOWS_CODE __m128i indx = _mm_sub_epi32(_mm_set1_epi32(filter_index), increment4); __m128i fractioni = _mm_and_si128(indx, _mm_set1_epi32(((((increment_t)1) << SHIFT_BITS) - 1))); #else Windows__m128i indx; indx.m128i = _mm_sub_epi32(_mm_set1_epi32(filter_index), increment4); __m128i fractioni = _mm_and_si128(indx.m128i, _mm_set1_epi32(((((increment_t)1) << SHIFT_BITS) - 1))); #endif __m128 icoeff0, icoeff2; // warning that these are uninitialized is okay and its intended, as both high and low 64bit-parts are set below __m128 icoeff,icoeffp1,icoeffd,fraction,data; #ifdef _DEBUG icoeff0 = icoeff2 = _mm_setzero_ps(); #endif #ifdef USE_WINDOWS_CODE indx = _mm_srai_epi32(indx, SHIFT_BITS); #else indx.m128i = _mm_srai_epi32(indx.m128i, SHIFT_BITS); #endif icoeff0 = _mm_loadh_pi(_mm_loadl_pi(icoeff0, (__m64*)(coeffs + indx.m128i_i32[0])), (__m64*)(coeffs + indx.m128i_i32[1])); icoeff2 = _mm_loadh_pi(_mm_loadl_pi(icoeff2, (__m64*)(coeffs + indx.m128i_i32[2])), (__m64*)(coeffs + indx.m128i_i32[3])); icoeff = _mm_shuffle_ps(icoeff0, icoeff2, _MM_SHUFFLE(2, 0, 2, 0)); icoeffp1 = _mm_shuffle_ps(icoeff0, icoeff2, _MM_SHUFFLE(3, 1, 3, 1)); icoeffd = _mm_sub_ps(icoeffp1, icoeff); fraction = _mm_mul_ps(_mm_cvtepi32_ps(fractioni), _mm_set1_ps((float)INV_FP_ONE)); icoeff = _mm_add_ps(icoeff, _mm_mul_ps(icoeffd, fraction)); data = _mm_loadu_ps(buffer + (data_index - 3)); right128 = _mm_add_ps(right128,_mm_mul_ps(icoeff, _mm_shuffle_ps(data,data,_MM_SHUFFLE(0,1,2,3)))); data_index -= 4; filter_index -= increment * 4; } #endif right = 0.; while (filter_index > MAKE_INCREMENT_T(0)) { coeff_t fraction = fp_to_float(filter_index); int indx = fp_to_int(filter_index); coeff_t icoeff = coeffs[indx] + fraction * (coeffs[indx + 1] - coeffs[indx]); right += icoeff * buffer[data_index]; filter_index -= increment; data_index--; } return ( #ifdef RESAMPLER_SSE_OPT _mm_cvtss_f32(horizontal_add(left128)) + _mm_cvtss_f32(horizontal_add(right128)) + #endif left + right) ; } /* calc_output_single */