namespace Avx2
{
    // Shuffle mask that swaps the two bytes of every 16-bit word (the pattern is
    // repeated in both 128-bit lanes, as _mm256_shuffle_epi8 operates on each
    // lane independently).
    const __m256i K8_SHUFFLE_REORDER_16 = SIMD_MM256_SETR_EPI8(
        0x1, 0x0, 0x3, 0x2, 0x5, 0x4, 0x7, 0x6, 0x9, 0x8, 0xB, 0xA, 0xD, 0xC, 0xF, 0xE,
        0x1, 0x0, 0x3, 0x2, 0x5, 0x4, 0x7, 0x6, 0x9, 0x8, 0xB, 0xA, 0xD, 0xC, 0xF, 0xE);

    template <bool align> SIMD_INLINE void Reorder16bit(const uint8_t * src, uint8_t * dst)
    {
        __m256i _src = Load<align>((__m256i*)src);
        Store<align>((__m256i*)dst, _mm256_shuffle_epi8(_src, K8_SHUFFLE_REORDER_16));
    }

    template <bool align> void Reorder16bit(const uint8_t * src, size_t size, uint8_t * dst)
    {
        assert(size >= A && size%2 == 0);

        // Process whole 32-byte blocks with AVX2, then fall back to the scalar
        // Base implementation for the remaining tail.
        size_t alignedSize = AlignLo(size, A);
        for(size_t i = 0; i < alignedSize; i += A)
            Reorder16bit<align>(src + i, dst + i);
        for(size_t i = alignedSize; i < size; i += 2)
            Base::Reorder16bit(src + i, dst + i);
    }

    void Reorder16bit(const uint8_t * src, size_t size, uint8_t * dst)
    {
        if(Aligned(src) && Aligned(dst))
            Reorder16bit<true>(src, size, dst);
        else
            Reorder16bit<false>(src, size, dst);
    }

    // Shuffle mask that reverses the byte order of every 32-bit word.
    const __m256i K8_SHUFFLE_REORDER_32 = SIMD_MM256_SETR_EPI8(
        0x3, 0x2, 0x1, 0x0, 0x7, 0x6, 0x5, 0x4, 0xB, 0xA, 0x9, 0x8, 0xF, 0xE, 0xD, 0xC,
        0x3, 0x2, 0x1, 0x0, 0x7, 0x6, 0x5, 0x4, 0xB, 0xA, 0x9, 0x8, 0xF, 0xE, 0xD, 0xC);

    template <bool align> SIMD_INLINE void Reorder32bit(const uint8_t * src, uint8_t * dst)
    {
        __m256i _src = Load<align>((__m256i*)src);
        Store<align>((__m256i*)dst, _mm256_shuffle_epi8(_src, K8_SHUFFLE_REORDER_32));
    }

    template <bool align> void Reorder32bit(const uint8_t * src, size_t size, uint8_t * dst)
    {
        assert(size >= A && size%4 == 0);

        size_t alignedSize = AlignLo(size, A);
        for(size_t i = 0; i < alignedSize; i += A)
            Reorder32bit<align>(src + i, dst + i);
        for(size_t i = alignedSize; i < size; i += 4)
            Base::Reorder32bit(src + i, dst + i);
    }

    void Reorder32bit(const uint8_t * src, size_t size, uint8_t * dst)
    {
        if(Aligned(src) && Aligned(dst))
            Reorder32bit<true>(src, size, dst);
        else
            Reorder32bit<false>(src, size, dst);
    }

    // Shuffle mask that reverses the byte order of every 64-bit word.
    const __m256i K8_SHUFFLE_REORDER_64 = SIMD_MM256_SETR_EPI8(
        0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0, 0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8,
        0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0, 0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8);

    template <bool align> SIMD_INLINE void Reorder64bit(const uint8_t * src, uint8_t * dst)
    {
        __m256i _src = Load<align>((__m256i*)src);
        Store<align>((__m256i*)dst, _mm256_shuffle_epi8(_src, K8_SHUFFLE_REORDER_64));
    }

    template <bool align> void Reorder64bit(const uint8_t * src, size_t size, uint8_t * dst)
    {
        assert(size >= A && size%8 == 0);

        size_t alignedSize = AlignLo(size, A);
        for(size_t i = 0; i < alignedSize; i += A)
            Reorder64bit<align>(src + i, dst + i);
        for(size_t i = alignedSize; i < size; i += 8)
            Base::Reorder64bit(src + i, dst + i);
    }

    void Reorder64bit(const uint8_t * src, size_t size, uint8_t * dst)
    {
        if(Aligned(src) && Aligned(dst))
            Reorder64bit<true>(src, size, dst);
        else
            Reorder64bit<false>(src, size, dst);
    }
}
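// A minimal usage sketch (illustrative only, not part of the library source):
// Reorder16bit swaps the two bytes of every 16-bit element, i.e. it converts an
// array of 16-bit values between little- and big-endian byte order; Reorder32bit
// and Reorder64bit do the same for 32- and 64-bit elements. The buffer below is
// a hypothetical example.
//
//     uint16_t data[64];                            // size >= A and size%2 == 0
//     for(size_t i = 0; i < 64; ++i)
//         data[i] = (uint16_t)i;
//     Avx2::Reorder16bit((const uint8_t*)data, sizeof(data), (uint8_t*)data);
//     // data[1] is now 0x0100: the byte order of each element is reversed.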
namespace Avx2
{
    namespace
    {
        // Single allocation that holds all temporary data for ResizeBilinear:
        // two horizontally interpolated row buffers (bx), the per-column blend
        // weights (ax) and source column indexes (ix), and the per-row source
        // indexes (iy) and blend weights (ay).
        struct Buffer
        {
            Buffer(size_t size, size_t width, size_t height)
            {
                _p = Allocate(3*size + sizeof(int)*(2*height + width));
                bx[0] = (uint8_t*)_p;
                bx[1] = bx[0] + size;
                ax = bx[1] + size;
                ix = (int*)(ax + size);
                iy = ix + width;
                ay = iy + height;
            }

            ~Buffer()
            {
                Free(_p);
            }

            uint8_t * bx[2];
            uint8_t * ax;
            int * ix;
            int * ay;
            int * iy;
        private:
            void * _p;
        };
    }

    // For every destination column compute the source column index and a pair of
    // 8-bit blend weights that sum to Base::FRACTION_RANGE; the weight pair is
    // replicated once per channel so it lines up with the gathered pixel pairs.
    template <size_t channelCount> void EstimateAlphaIndexX(size_t srcSize, size_t dstSize, int * indexes, uint8_t * alphas)
    {
        float scale = (float)srcSize/dstSize;
        for(size_t i = 0; i < dstSize; ++i)
        {
            float alpha = (float)((i + 0.5)*scale - 0.5);
            ptrdiff_t index = (ptrdiff_t)::floor(alpha);
            alpha -= index;

            if(index < 0)
            {
                index = 0;
                alpha = 0;
            }

            if(index > (ptrdiff_t)srcSize - 2)
            {
                index = srcSize - 2;
                alpha = 1;
            }

            indexes[i] = (int)index;
            alphas[1] = (uint8_t)(alpha*Base::FRACTION_RANGE + 0.5);
            alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]);
            for(size_t channel = 1; channel < channelCount; channel++)
                ((uint16_t*)alphas)[channel] = *(uint16_t*)alphas;
            alphas += 2*channelCount;
        }
    }

    // Horizontal interpolation pass: _mm256_maddubs_epi16 multiplies adjacent
    // unsigned pixel bytes by their signed 8-bit weights and sums each pair into
    // a 16-bit result. The shuffles below regroup the gathered pixel pairs so
    // that the two bytes of every pair belong to the same channel.
    template <size_t channelCount> void InterpolateX(const __m256i * alpha, __m256i * buffer);

    template <> SIMD_INLINE void InterpolateX<1>(const __m256i * alpha, __m256i * buffer)
    {
#if defined(_MSC_VER) // Workaround for Visual Studio 2012 compiler bug in release mode:
        __m256i _buffer = _mm256_or_si256(K_ZERO, _mm256_load_si256(buffer));
#else
        __m256i _buffer = _mm256_load_si256(buffer);
#endif
        _mm256_store_si256(buffer, _mm256_maddubs_epi16(_buffer, _mm256_load_si256(alpha)));
    }

    const __m256i K8_SHUFFLE_X2 = SIMD_MM256_SETR_EPI8(
        0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF,
        0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7, 0x8, 0xA, 0x9, 0xB, 0xC, 0xE, 0xD, 0xF);

    SIMD_INLINE void InterpolateX2(const __m256i * alpha, __m256i * buffer)
    {
        __m256i src = _mm256_shuffle_epi8(_mm256_load_si256(buffer), K8_SHUFFLE_X2);
        _mm256_store_si256(buffer, _mm256_maddubs_epi16(src, _mm256_load_si256(alpha)));
    }

    template <> SIMD_INLINE void InterpolateX<2>(const __m256i * alpha, __m256i * buffer)
    {
        InterpolateX2(alpha + 0, buffer + 0);
        InterpolateX2(alpha + 1, buffer + 1);
    }

    // Three-channel pixels (6-byte pairs) do not tile a 16-byte lane, so each
    // output vector is assembled from three OR-ed shuffles: bytes borrowed from
    // the previous lane, bytes from the register itself, and bytes borrowed from
    // the next lane (_mm256_permute2x128_si256 with 0x21 supplies the crossing lanes).
    const __m256i K8_SHUFFLE_X3_00 = SIMD_MM256_SETR_EPI8(
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    const __m256i K8_SHUFFLE_X3_01 = SIMD_MM256_SETR_EPI8(
        0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1,
        -1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1);
    const __m256i K8_SHUFFLE_X3_02 = SIMD_MM256_SETR_EPI8(
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1);

    const __m256i K8_SHUFFLE_X3_10 = SIMD_MM256_SETR_EPI8(
        0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    const __m256i K8_SHUFFLE_X3_11 = SIMD_MM256_SETR_EPI8(
        -1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF,
        0x0, 0x3, 0x1, 0x4, 0x2, 0x5, 0x6, 0x9, 0x7, 0xA, 0x8, 0xB, 0xC, 0xF, 0xD, -1);
    const __m256i K8_SHUFFLE_X3_12 = SIMD_MM256_SETR_EPI8(
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0);

    const __m256i K8_SHUFFLE_X3_20 = SIMD_MM256_SETR_EPI8(
        0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    const __m256i K8_SHUFFLE_X3_21 = SIMD_MM256_SETR_EPI8(
        -1, 0x1, 0x2, 0x5, 0x3, 0x6, 0x4, 0x7, 0x8, 0xB, 0x9, 0xC, 0xA, 0xD, 0xE, -1,
        -1, 0x2, 0x0, 0x3, 0x4, 0x7, 0x5, 0x8, 0x6, 0x9, 0xA, 0xD, 0xB, 0xE, 0xC, 0xF);
    const __m256i K8_SHUFFLE_X3_22 = SIMD_MM256_SETR_EPI8(
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);

    template <> SIMD_INLINE void InterpolateX<3>(const __m256i * alpha, __m256i * buffer)
    {
        __m256i src[3], shuffled;
        src[0] = _mm256_load_si256(buffer + 0);
        src[1] = _mm256_load_si256(buffer + 1);
        src[2] = _mm256_load_si256(buffer + 2);

        shuffled = _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[0], src[0], 0x21), K8_SHUFFLE_X3_00);
        shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(src[0], K8_SHUFFLE_X3_01));
        shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[0], src[1], 0x21), K8_SHUFFLE_X3_02));
        _mm256_store_si256(buffer + 0, _mm256_maddubs_epi16(shuffled, _mm256_load_si256(alpha + 0)));

        shuffled = _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[0], src[1], 0x21), K8_SHUFFLE_X3_10);
        shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(src[1], K8_SHUFFLE_X3_11));
        shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[1], src[2], 0x21), K8_SHUFFLE_X3_12));
        _mm256_store_si256(buffer + 1, _mm256_maddubs_epi16(shuffled, _mm256_load_si256(alpha + 1)));

        shuffled = _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[1], src[2], 0x21), K8_SHUFFLE_X3_20);
        shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(src[2], K8_SHUFFLE_X3_21));
        shuffled = _mm256_or_si256(shuffled, _mm256_shuffle_epi8(_mm256_permute2x128_si256(src[2], src[2], 0x21), K8_SHUFFLE_X3_22));
        _mm256_store_si256(buffer + 2, _mm256_maddubs_epi16(shuffled, _mm256_load_si256(alpha + 2)));
    }

    const __m256i K8_SHUFFLE_X4 = SIMD_MM256_SETR_EPI8(
        0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF,
        0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7, 0x8, 0xC, 0x9, 0xD, 0xA, 0xE, 0xB, 0xF);

    SIMD_INLINE void InterpolateX4(const __m256i * alpha, __m256i * buffer)
    {
        __m256i src = _mm256_shuffle_epi8(_mm256_load_si256(buffer), K8_SHUFFLE_X4);
        _mm256_store_si256(buffer, _mm256_maddubs_epi16(src, _mm256_load_si256(alpha)));
    }

    template <> SIMD_INLINE void InterpolateX<4>(const __m256i * alpha, __m256i * buffer)
    {
        InterpolateX4(alpha + 0, buffer + 0);
        InterpolateX4(alpha + 1, buffer + 1);
        InterpolateX4(alpha + 2, buffer + 2);
        InterpolateX4(alpha + 3, buffer + 3);
    }

    const __m256i K16_FRACTION_ROUND_TERM = SIMD_MM256_SET1_EPI16(Base::BILINEAR_ROUND_TERM);

    // Vertical interpolation pass: blend two horizontally interpolated 16-bit
    // rows with the row weights, add the rounding term, and shift back down to
    // the 8-bit range.
    template<bool align> SIMD_INLINE __m256i InterpolateY(const __m256i * pbx0, const __m256i * pbx1, __m256i alpha[2])
    {
        __m256i sum = _mm256_add_epi16(
            _mm256_mullo_epi16(Load<align>(pbx0), alpha[0]),
            _mm256_mullo_epi16(Load<align>(pbx1), alpha[1]));
        return _mm256_srli_epi16(_mm256_add_epi16(sum, K16_FRACTION_ROUND_TERM), Base::BILINEAR_SHIFT);
    }

    template<bool align> SIMD_INLINE void InterpolateY(const uint8_t * bx0, const uint8_t * bx1, __m256i alpha[2], uint8_t * dst)
    {
        __m256i lo = InterpolateY<align>((__m256i*)bx0 + 0, (__m256i*)bx1 + 0, alpha);
        __m256i hi = InterpolateY<align>((__m256i*)bx0 + 1, (__m256i*)bx1 + 1, alpha);
        Store<false>((__m256i*)dst, PackU16ToU8(lo, hi));
    }

    template <size_t channelCount> void ResizeBilinear(
        const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
        uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride)
    {
        assert(dstWidth >= A);
        struct One { uint8_t channels[channelCount]; };
        struct Two { uint8_t channels[channelCount*2]; };

        size_t size = 2*dstWidth*channelCount;
        size_t bufferSize = AlignHi(dstWidth, A)*channelCount*2;
        size_t alignedSize = AlignHi(size, DA) - DA;
        const size_t step = A*channelCount;

        Buffer buffer(bufferSize, dstWidth, dstHeight);

        Base::EstimateAlphaIndex(srcHeight, dstHeight, buffer.iy, buffer.ay, 1);

        EstimateAlphaIndexX<channelCount>(srcWidth, dstWidth, buffer.ix, buffer.ax);

        ptrdiff_t previous = -2;

        __m256i a[2];

        for(size_t yDst = 0; yDst < dstHeight; yDst++, dst += dstStride)
        {
            a[0] = _mm256_set1_epi16(int16_t(Base::FRACTION_RANGE - buffer.ay[yDst]));
            a[1] = _mm256_set1_epi16(int16_t(buffer.ay[yDst]));

            ptrdiff_t sy = buffer.iy[yDst];
            int k = 0;

            // Reuse rows interpolated for the previous destination row: if the
            // source row pair is unchanged, skip the horizontal pass entirely;
            // if it advanced by one row, keep the second buffer and compute only
            // the new row.
            if(sy == previous)
                k = 2;
            else if(sy == previous + 1)
            {
                Swap(buffer.bx[0], buffer.bx[1]);
                k = 1;
            }

            previous = sy;

            for(; k < 2; k++)
            {
                // Gather horizontal pixel pairs for the new source row and run
                // the horizontal interpolation pass over them in place.
                Two * pb = (Two *)buffer.bx[k];
                const One * psrc = (const One *)(src + (sy + k)*srcStride);
                for(size_t x = 0; x < dstWidth; x++)
                    pb[x] = *(Two *)(psrc + buffer.ix[x]);

                uint8_t * pbx = buffer.bx[k];
                for(size_t i = 0; i < bufferSize; i += step)
                    InterpolateX<channelCount>((__m256i*)(buffer.ax + i), (__m256i*)(pbx + i));
            }

            // Vertical pass over aligned blocks; the last (possibly unaligned)
            // block is handled by an overlapping unaligned store.
            for(size_t ib = 0, id = 0; ib < alignedSize; ib += DA, id += A)
                InterpolateY<true>(buffer.bx[0] + ib, buffer.bx[1] + ib, a, dst + id);
            size_t i = size - DA;
            InterpolateY<false>(buffer.bx[0] + i, buffer.bx[1] + i, a, dst + i/2);
        }
    }

    // Dispatch on the channel count; other channel counts fall back to the
    // scalar Base implementation.
    void ResizeBilinear(
        const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride,
        uint8_t * dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount)
    {
        switch(channelCount)
        {
        case 1: ResizeBilinear<1>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); break;
        case 2: ResizeBilinear<2>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); break;
        case 3: ResizeBilinear<3>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); break;
        case 4: ResizeBilinear<4>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); break;
        default:
            Base::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount);
        }
    }
}
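// A minimal usage sketch (illustrative only, not part of the library source):
// downscale an 8-bit grayscale image by a factor of two. Buffer names and sizes
// are hypothetical; strides are in bytes and here equal the row widths because
// the rows carry no padding.
//
//     const size_t srcW = 256, srcH = 256, dstW = 128, dstH = 128;
//     std::vector<uint8_t> src(srcW*srcH, 0), dst(dstW*dstH);
//     Avx2::ResizeBilinear(src.data(), srcW, srcH, srcW,
//         dst.data(), dstW, dstH, dstW, 1);   // dstW >= A, channelCount = 1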