C++ (Cpp) _mm_packus_epi32の例

コード例 #1

0

ファイルを表示

ファイル: av1_highbd_convolve_sse4.c プロジェクト: luke-chang/gecko-1

static void transClipPixel(uint32_t *src, int src_stride, __m128i *u, int bd) {
  __m128i v0, v1;
  __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));

  u[0] = _mm_loadu_si128((__m128i const *)src);
  u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
  u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
  u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));

  u[0] = _mm_add_epi32(u[0], rnd);
  u[1] = _mm_add_epi32(u[1], rnd);
  u[2] = _mm_add_epi32(u[2], rnd);
  u[3] = _mm_add_epi32(u[3], rnd);

  u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
  u[1] = _mm_srai_epi32(u[1], FILTER_BITS);
  u[2] = _mm_srai_epi32(u[2], FILTER_BITS);
  u[3] = _mm_srai_epi32(u[3], FILTER_BITS);

  u[0] = _mm_packus_epi32(u[0], u[1]);
  u[1] = _mm_packus_epi32(u[2], u[3]);

  highbd_clip(u, 2, bd);

  v0 = _mm_unpacklo_epi16(u[0], u[1]);
  v1 = _mm_unpackhi_epi16(u[0], u[1]);

  u[0] = _mm_unpacklo_epi16(v0, v1);
  u[2] = _mm_unpackhi_epi16(v0, v1);

  u[1] = _mm_srli_si128(u[0], 8);
  u[3] = _mm_srli_si128(u[2], 8);
}

コード例 #2

0

ファイルを表示

ファイル: haar_invtransform.hpp プロジェクト: GrokImageCompression/vc2hqdecode

template<int shift, int active_bits> void Haar_invtransform_H_final_1_sse4_2_int32_t(void *_idata,
																			   const int istride,
																			   const char *odata,
																			   const int ostride,
																			   const int iwidth,
																			   const int iheight,
																			   const int ooffset_x,
																			   const int ooffset_y,
																			   const int owidth,
																			   const int oheight) {
  int32_t *idata = (int32_t *)_idata;
  const int skip = 1;
  const __m128i ONE = _mm_set1_epi32(1);
  const __m128i OFFSET = _mm_set1_epi32(1 << (active_bits - 1));

  (void)iwidth;
  (void)iheight;

  for (int y = ooffset_y; y < ooffset_y + oheight; y+=skip) {
    for (int x = ooffset_x; x < ooffset_x + owidth; x += 8) {
      __m128i D0 = _mm_load_si128((__m128i *)&idata[y*istride + x + 0]);
      __m128i D4 = _mm_load_si128((__m128i *)&idata[y*istride + x + 4]);

      __m128i A0 = _mm_unpacklo_epi32(D0, D4);
      __m128i A2 = _mm_unpackhi_epi32(D0, D4);

      __m128i E0 = _mm_unpacklo_epi32(A0, A2);
      __m128i O1 = _mm_unpackhi_epi32(A0, A2);

      __m128i X0 = _mm_sub_epi32(E0, _mm_srai_epi32(_mm_add_epi32(O1, ONE), 1));
      __m128i X1 = _mm_add_epi32(O1, X0);

      __m128i Z0 = _mm_unpacklo_epi32(X0, X1);
      __m128i Z4 = _mm_unpackhi_epi32(X0, X1);

      if (shift != 0) {
        Z0 = _mm_add_epi32(Z0, ONE);
        Z4 = _mm_add_epi32(Z4, ONE);
        Z0 = _mm_srai_epi32(Z0, shift);
        Z4 = _mm_srai_epi32(Z4, shift);
      }

      Z0 = _mm_add_epi32(Z0, OFFSET);
      Z4 = _mm_add_epi32(Z4, OFFSET);

      Z0 = _mm_slli_epi32(Z0, (16 - active_bits));
      Z4 = _mm_slli_epi32(Z4, (16 - active_bits));

      __m128i R = _mm_packus_epi32(Z0, Z4);

      R = _mm_srli_epi16(R, (16 - active_bits));
      _mm_store_si128((__m128i *)&odata[2*((y - ooffset_y)*ostride + x - ooffset_x)], R);
    }
  }
}

コード例 #3

0

ファイルを表示

ファイル: convert_f16c.cpp プロジェクト: mysteryx93/AviSynthShader

static inline void
planar_shader_to_yuv_3(uint8_t** dstp, const uint8_t** srcp, const int dpitch,
    const int spitch, const int width, const int height, void* _buff) noexcept
{
    const __m128 coef = _mm_set1_ps(STACK16 ? 65535.0f : 255.0f);
    const __m128i mask16 = _mm_set1_epi16(0x00FF);
    float* buff = reinterpret_cast<float*>(_buff);

    for (int p = 0; p < 3; ++p) {
        const uint8_t* s = srcp[p];
        uint8_t* d = dstp[p];
        uint8_t* lsb = d + height * dpitch;

        for (int y = 0; y < height; ++y) {
            convert_half_to_float(buff, s, width);
            for (int x = 0; x < width; x += 16) {
                __m128i s0 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + x + 0)));
                __m128i s1 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + x + 4)));
                __m128i s2 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + x + 8)));
                __m128i s3 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + x + 12)));
                s0 = _mm_packus_epi32(s0, s1);
                s1 = _mm_packus_epi32(s2, s3);
                if (!STACK16) {
                    s0 = _mm_packus_epi16(s0, s1);
                    _mm_stream_si128(reinterpret_cast<__m128i*>(d + x), s0);
                } else {
                    __m128i dm = _mm_packus_epi16(_mm_srli_epi16(s0, 8), _mm_srli_epi16(s1, 8));
                    __m128i dl = _mm_packus_epi16(_mm_and_si128(s0, mask16), _mm_and_si128(s1, mask16));
                    _mm_stream_si128(reinterpret_cast<__m128i*>(d + x), dm);
                    _mm_stream_si128(reinterpret_cast<__m128i*>(lsb + x), dl);
                }
            }
            s += spitch;
            d += dpitch;
            if (STACK16) {
                lsb += dpitch;
            }
        }
    }
}

コード例 #4

0

ファイルを表示

ファイル: xsimd_sse_int32.hpp プロジェクト: jmabille/nxsimd

        inline void store_aligned_int32(__m128i src, uint16_t* dst)
        {
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION
            __m128i tmp = _mm_packus_epi32(src, src);
            _mm_storel_epi64((__m128i*)dst, tmp);
#else
            alignas(16) int32_t tmp[4];
            _mm_store_si128((__m128i*)tmp, src);
            unroller<4>([&](std::size_t i){
                dst[i] = static_cast<uint16_t>(tmp[i]);
            });
#endif
        }

コード例 #5

0

ファイルを表示

ファイル: render-cache.c プロジェクト: krh/ksim

static void
sfid_render_cache_rt_write_rep16_bgra_unorm8_xmajor(struct thread *t,
        const struct sfid_render_cache_args *args)
{
    const __m128 scale = _mm_set1_ps(255.0f);
    const __m128 half =  _mm_set1_ps(0.5f);
    struct reg src[1];

    memcpy(src, &t->grf[args->src], sizeof(src));

    if (srgb_format(args->rt.format)) {
        const __m256 inv_gamma = _mm256_set1_ps(1.0f / 2.4f);
        src[0].reg = _ZGVdN8vv_powf(src[0].reg, inv_gamma);
        /* Don't gamma correct alpha */
        src[0].f[3] = t->grf[args->src].f[3];
    }

    __m128 bgra = _mm_shuffle_ps(_mm256_castps256_ps128(src[0].reg),
                                 _mm256_castps256_ps128(src[0].reg),
                                 SWIZZLE(2, 1, 0, 3));

    bgra = _mm_mul_ps(bgra, scale);
    bgra = _mm_add_ps(bgra, half);

    __m128i bgra_i = _mm_cvtps_epi32(bgra);
    bgra_i = _mm_packus_epi32(bgra_i, bgra_i);
    bgra_i = _mm_packus_epi16(bgra_i, bgra_i);

    /* Swizzle two middle mask pairs so that dword 0-3 and 4-7
     * form linear owords of pixels. */
    __m256i mask = _mm256_permute4x64_epi64(t->mask_q1, SWIZZLE(0, 2, 1, 3));

    const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
    const int x0 = t->grf[1].uw[4];
    const int y0 = t->grf[1].uw[5] + slice_y;
    const int cpp = 4;
    void *base0 = xmajor_offset(args->rt.pixels, x0,  y0, args->rt.stride, cpp);

    _mm_maskstore_epi32(base0, _mm256_extractf128_si256(mask, 0), bgra_i);
    _mm_maskstore_epi32(base0 + 512, _mm256_extractf128_si256(mask, 1), bgra_i);

    const int x1 = t->grf[1].uw[8];
    const int y1 = t->grf[1].uw[9] + slice_y;
    void *base1 = xmajor_offset(args->rt.pixels, x1,  y1, args->rt.stride, 4);

    __m256i mask1 = _mm256_permute4x64_epi64(t->mask_q2, SWIZZLE(0, 2, 1, 3));
    _mm_maskstore_epi32(base1, _mm256_extractf128_si256(mask1, 0), bgra_i);
    _mm_maskstore_epi32(base1 + 512, _mm256_extractf128_si256(mask1, 1), bgra_i);
}

コード例 #6

0

ファイルを表示

ファイル: sse4_1-packusdw.c プロジェクト: 0day-ci/gcc

static void
TEST (void)
{
  union
    {
      __m128i x[NUM / 4];
      int i[NUM];
    } src1, src2;
  union
    {
      __m128i x[NUM / 4];
      unsigned short s[NUM * 2];
    } dst;
  int i, sign = 1;

  for (i = 0; i < NUM; i++)
    {
      src1.i[i] = i * i * sign;
      src2.i[i] = (i + 20) * sign;
      sign = -sign;
    }

  for (i = 0; i < NUM; i += 4)
    dst.x[i / 4] = _mm_packus_epi32 (src1.x [i / 4], src2.x [i / 4]);

  for (i = 0; i < NUM; i ++)
    {
      int dstIndex;
      unsigned short sVal;

      sVal = int_to_ushort (src1.i[i]);
      dstIndex = (i % 4) + (i / 4) * 8;
      if (sVal != dst.s[dstIndex])
	abort ();

      sVal = int_to_ushort (src2.i[i]);
      dstIndex += 4;
      if (sVal != dst.s[dstIndex])
	abort ();
    }
}

コード例 #7

0

ファイルを表示

ファイル: simd.cpp プロジェクト: hjwhang/Image_Rescale

inline Pixel GetPixelSSE3(const Image<Pixel>* img, float x, float y)
{
 const int stride = img->width;
 const Pixel* p0 = img->data + (int)x + (int)y * stride; // pointer to first pixel

 // Load the data (2 pixels in one load)
 __m128i p12 = _mm_loadl_epi64((const __m128i*)&p0[0 * stride]); 
 __m128i p34 = _mm_loadl_epi64((const __m128i*)&p0[1 * stride]); 

 __m128 weight = CalcWeights(x, y);

 // convert RGBA RGBA RGBA RGAB to RRRR GGGG BBBB AAAA (AoS to SoA)
 __m128i p1234 = _mm_unpacklo_epi8(p12, p34);
 __m128i p34xx = _mm_unpackhi_epi64(p1234, _mm_setzero_si128());
 __m128i p1234_8bit = _mm_unpacklo_epi8(p1234, p34xx);

 // extend to 16bit 
 __m128i pRG = _mm_unpacklo_epi8(p1234_8bit, _mm_setzero_si128());
 __m128i pBA = _mm_unpackhi_epi8(p1234_8bit, _mm_setzero_si128());
 
 // convert weights to integer
 weight = _mm_mul_ps(weight, CONST_256); 
 __m128i weighti = _mm_cvtps_epi32(weight); // w4 w3 w2 w1
         weighti = _mm_packs_epi32(weighti, weighti); // 32->2x16bit

 //outRG = [w1*R1 + w2*R2 | w3*R3 + w4*R4 | w1*G1 + w2*G2 | w3*G3 + w4*G4]
 __m128i outRG = _mm_madd_epi16(pRG, weighti);
 //outBA = [w1*B1 + w2*B2 | w3*B3 + w4*B4 | w1*A1 + w2*A2 | w3*A3 + w4*A4]
 __m128i outBA = _mm_madd_epi16(pBA, weighti);

 // horizontal add that will produce the output values (in 32bit)
 __m128i out = _mm_hadd_epi32(outRG, outBA);
 out = _mm_srli_epi32(out, 8); // divide by 256
 
 // convert 32bit->8bit
 out = _mm_packus_epi32(out, _mm_setzero_si128());
 out = _mm_packus_epi16(out, _mm_setzero_si128());

 // return
 return _mm_cvtsi128_si32(out);
}

コード例 #8

0

ファイルを表示

ファイル: render-cache.c プロジェクト: krh/ksim

static void
sfid_render_cache_rt_write_simd8_r_uint8_ymajor(struct thread *t,
        const struct sfid_render_cache_args *args)
{
    const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
    const int x = t->grf[1].uw[4];
    const int y = t->grf[1].uw[5] + slice_y;
    const int cpp = 1;

    void *base = ymajor_offset(args->rt.pixels, x, y, args->rt.stride, cpp);

    struct reg *src = &t->grf[args->src];

    __m256i r32 = _mm256_permute4x64_epi64(src[0].ireg, SWIZZLE(0, 2, 1, 3));

    __m128i lo = _mm256_extractf128_si256(r32, 0);
    __m128i hi = _mm256_extractf128_si256(r32, 1);
    __m128i r16 = _mm_packus_epi32(lo, hi);
    __m128i r8 = _mm_packus_epi16(r16, r16);

    /* FIXME: Needs masking. */
    *(uint32_t *) (base +  0) = _mm_extract_epi32(r8, 0);
    *(uint32_t *) (base + 16) = _mm_extract_epi32(r8, 1);
}

コード例 #9

0

ファイルを表示

ファイル: vp9_diamond_search_sad_avx.c プロジェクト: Crawping/chromium_extract

/*****************************************************************************
 * This function utilises 3 properties of the cost function lookup tables,   *
 * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in       *
 * vp9_encoder.c.                                                            *
 * For the joint cost:                                                       *
 *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]           *
 * For the component costs:                                                  *
 *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]                         *
 *         (Equal costs for both components)                                 *
 *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                        *
 *         (Cost function is even)                                           *
 * If these do not hold, then this function cannot be used without           *
 * modification, in which case you can revert to using the C implementation, *
 * which does not rely on these properties.                                  *
 *****************************************************************************/
int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
                               const search_site_config *cfg,
                               MV *ref_mv, MV *best_mv, int search_param,
                               int sad_per_bit, int *num00,
                               const vp9_variance_fn_ptr_t *fn_ptr,
                               const MV *center_mv) {
  const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max);
  const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int);
  const int_mv minmv = pack_int_mv(x->mv_row_min, x->mv_col_min);
  const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int);

  const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);

  const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]);
  const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]);

  // search_param determines the length of the initial step and hence the number
  // of iterations.
  // 0 = initial step (MAX_FIRST_STEP) pel
  // 1 = (MAX_FIRST_STEP/2) pel,
  // 2 = (MAX_FIRST_STEP/4) pel...
  const       MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
  const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
  const int tot_steps = cfg->total_steps - search_param;

  const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3,
                                        center_mv->col >> 3);
  const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int);

  const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
  const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);

  int_mv bmv = pack_int_mv(ref_row, ref_col);
  int_mv new_bmv = bmv;
  __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int);

  const int what_stride = x->plane[0].src.stride;
  const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
  const uint8_t *const what = x->plane[0].src.buf;
  const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf +
                                 ref_row * in_what_stride + ref_col;

  // Work out the start point for the search
  const uint8_t *best_address = in_what;
  const uint8_t *new_best_address = best_address;
#if ARCH_X86_64
  __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
  __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address);
#endif

  unsigned int best_sad;

  int i;
  int j;
  int step;

  // Check the prerequisite cost function properties that are easy to check
  // in an assert. See the function-level documentation for details on all
  // prerequisites.
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);

  // Check the starting position
  best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
  best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);

  *num00 = 0;

  for (i = 0, step = 0; step < tot_steps; step++) {
    for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
      __m128i v_sad_d;
      __m128i v_cost_d;
      __m128i v_outside_d;
      __m128i v_inside_d;
      __m128i v_diff_mv_w;
#if ARCH_X86_64
      __m128i v_blocka[2];
#else
      __m128i v_blocka[1];
#endif

      // Compute the candidate motion vectors
      const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i*)&ss_mv[i]);
      const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);
      // Clamp them to the search bounds
      __m128i v_these_mv_clamp_w = v_these_mv_w;
      v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w);
      v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w);
      // The ones that did not change are inside the search area
      v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w);

      // If none of them are inside, then move on
      if (__likely__(_mm_test_all_zeros(v_inside_d, v_inside_d))) {
        continue;
      }

      // The inverse mask indicates which of the MVs are outside
      v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff));
      // Shift right to keep the sign bit clear, we will use this later
      // to set the cost to the maximum value.
      v_outside_d = _mm_srli_epi32(v_outside_d, 1);

      // Compute the difference MV
      v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv);
      // We utilise the fact that the cost function is even, and use the
      // absolute difference. This allows us to use unsigned indexes later
      // and reduces cache pressure somewhat as only a half of the table
      // is ever referenced.
      v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w);

      // Compute the SIMD pointer offsets.
      {
#if ARCH_X86_64  //  sizeof(intptr_t) == 8
        // Load the offsets
        __m128i v_bo10_q = _mm_loadu_si128((const __m128i*)&ss_os[i+0]);
        __m128i v_bo32_q = _mm_loadu_si128((const __m128i*)&ss_os[i+2]);
        // Set the ones falling outside to zero
        v_bo10_q = _mm_and_si128(v_bo10_q,
                                 _mm_cvtepi32_epi64(v_inside_d));
        v_bo32_q = _mm_and_si128(v_bo32_q,
                                 _mm_unpackhi_epi32(v_inside_d, v_inside_d));
        // Compute the candidate addresses
        v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
        v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
#else  // ARCH_X86 //  sizeof(intptr_t) == 4
        __m128i v_bo_d = _mm_loadu_si128((const __m128i*)&ss_os[i]);
        v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
        v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
#endif
      }

      fn_ptr->sdx4df(what, what_stride,
                     (const uint8_t **)&v_blocka[0], in_what_stride,
                     (uint32_t*)&v_sad_d);

      // Look up the component cost of the residual motion vector
      {
        const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0);
        const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1);
        const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2);
        const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3);
        const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4);
        const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5);
        const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6);
        const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7);

        // Note: This is a use case for vpgather in AVX2
        const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0];
        const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1];
        const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2];
        const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];

        __m128i v_cost_10_d, v_cost_32_d;

        v_cost_10_d = _mm_cvtsi32_si128(cost0);
        v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);

        v_cost_32_d = _mm_cvtsi32_si128(cost2);
        v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);

        v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);
      }

      // Now add in the joint cost
      {
        const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w,
                                                _mm_setzero_si128());
        const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d,
                                                       v_joint_cost_0_d,
                                                       v_sel_d);
        v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);
      }

      // Multiply by sad_per_bit
      v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);
      // ROUND_POWER_OF_TWO(v_cost_d, 8)
      v_cost_d = _mm_add_epi32(v_cost_d, _mm_set1_epi32(0x80));
      v_cost_d = _mm_srai_epi32(v_cost_d, 8);
      // Add the cost to the sad
      v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);

      // Make the motion vectors outside the search area have max cost
      // by or'ing in the comparison mask, this way the minimum search won't
      // pick them.
      v_sad_d = _mm_or_si128(v_sad_d, v_outside_d);

      // Find the minimum value and index horizontally in v_sad_d
      {
        // Try speculatively on 16 bits, so we can use the minpos intrinsic
        const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
        const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);

        uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0);
        uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1);

        // If the local best value is not saturated, just use it, otherwise
        // find the horizontal minimum again the hard way on 32 bits.
        // This is executed rarely.
        if (__unlikely__(local_best_sad == 0xffff)) {
          __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;

          v_loval_d = v_sad_d;
          v_loidx_d = _mm_set_epi32(3, 2, 1, 0);
          v_hival_d = _mm_srli_si128(v_loval_d, 8);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
          v_hival_d = _mm_srli_si128(v_loval_d, 4);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);

          local_best_sad = _mm_extract_epi32(v_loval_d, 0);
          local_best_idx = _mm_extract_epi32(v_loidx_d, 0);
        }

        // Update the global minimum if the local minimum is smaller
        if (__likely__(local_best_sad < best_sad)) {
          new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
          new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];

          best_sad = local_best_sad;
        }
      }
    }

    bmv = new_bmv;
    best_address = new_best_address;

    v_bmv_w = _mm_set1_epi32(bmv.as_int);
#if ARCH_X86_64
    v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
    v_ba_d = _mm_set1_epi32((intptr_t)best_address);
#endif

    if (__unlikely__(best_address == in_what)) {
      (*num00)++;
    }
  }

  *best_mv = bmv.as_mv;
  return best_sad;
}

コード例 #10

0

ファイルを表示

ファイル: convert_rgb.c プロジェクト: shootthemoonfilms/dcraw-fast

void convert_to_rgb_fast()
{
    unsigned i,j,c;
    int row, col, k;
    ushort *img;
    float out_cam[3][4];
    double num, inverse[3][3];
    static const double xyzd50_srgb[3][3] =
    { { 0.436083, 0.385083, 0.143055 },
      { 0.222507, 0.716888, 0.060608 },
      { 0.013930, 0.097097, 0.714022 } };
    static const double rgb_rgb[3][3] =
    { { 1,0,0 }, { 0,1,0 }, { 0,0,1 } };
    static const double adobe_rgb[3][3] =
    { { 0.715146, 0.284856, 0.000000 },
      { 0.000000, 1.000000, 0.000000 },
      { 0.000000, 0.041166, 0.958839 } };
    static const double wide_rgb[3][3] =
    { { 0.593087, 0.404710, 0.002206 },
      { 0.095413, 0.843149, 0.061439 },
      { 0.011621, 0.069091, 0.919288 } };
    static const double prophoto_rgb[3][3] =
    { { 0.529317, 0.330092, 0.140588 },
      { 0.098368, 0.873465, 0.028169 },
      { 0.016879, 0.117663, 0.865457 } };
    static const double (*out_rgb[])[3] =
    { rgb_rgb, adobe_rgb, wide_rgb, prophoto_rgb, xyz_rgb };
    static const char *name[] =
    { "sRGB", "Adobe RGB (1998)", "WideGamut D65", "ProPhoto D65", "XYZ" };
    static const unsigned phead[] =
    { 1024, 0, 0x2100000, 0x6d6e7472, 0x52474220, 0x58595a20, 0, 0, 0,
      0x61637370, 0, 0, 0x6e6f6e65, 0, 0, 0, 0, 0xf6d6, 0x10000, 0xd32d };
    unsigned pbody[] =
    { 10, 0x63707274, 0, 36, /* cprt */
      0x64657363, 0, 40, /* desc */
      0x77747074, 0, 20, /* wtpt */
      0x626b7074, 0, 20, /* bkpt */
      0x72545243, 0, 14, /* rTRC */
      0x67545243, 0, 14, /* gTRC */
      0x62545243, 0, 14, /* bTRC */
      0x7258595a, 0, 20, /* rXYZ */
      0x6758595a, 0, 20, /* gXYZ */
      0x6258595a, 0, 20 }; /* bXYZ */
    static const unsigned pwhite[] = { 0xf351, 0x10000, 0x116cc };
    unsigned pcurve[] = { 0x63757276, 0, 1, 0x1000000 };

    gamma_curve (gamm[0], gamm[1], 0, 0);
    memcpy (out_cam, rgb_cam, sizeof out_cam);
    raw_color |= colors == 1 || document_mode ||
                 output_color < 1 || output_color > 5;
    if (!raw_color) {
        oprof = (unsigned *) calloc (phead[0], 1);
        merror (oprof, "convert_to_rgb()");
        memcpy (oprof, phead, sizeof phead);
        if (output_color == 5) oprof[4] = oprof[5];
        oprof[0] = 132 + 12*pbody[0];
        for (i=0; i < pbody[0]; i++) {
            oprof[oprof[0]/4] = i ? (i > 1 ? 0x58595a20 : 0x64657363) : 0x74657874;
            pbody[i*3+2] = oprof[0];
            oprof[0] += (pbody[i*3+3] + 3) & -4;
        }
        memcpy (oprof+32, pbody, sizeof pbody);
        oprof[pbody[5]/4+2] = strlen(name[output_color-1]) + 1;
        memcpy ((char *)oprof+pbody[8]+8, pwhite, sizeof pwhite);
        pcurve[3] = (short)(256/gamm[5]+0.5) << 16;
        for (i=4; i < 7; i++)
            memcpy ((char *)oprof+pbody[i*3+2], pcurve, sizeof pcurve);
        pseudoinverse ((double (*)[3])out_rgb[output_color-1], inverse, 3);
        for (i=0; i < 3; i++)
            for (j=0; j < 3; j++) {
                for (num = k=0; k < 3; k++)
                    num += xyzd50_srgb[i][k] * inverse[j][k];
                oprof[pbody[j*3+23]/4+i+2] = num * 0x10000 + 0.5;
            }
        for (i=0; i < phead[0]/4; i++)
            oprof[i] = htonl(oprof[i]);
        strcpy ((char *)oprof+pbody[2]+8, "auto-generated by dcraw");
        strcpy ((char *)oprof+pbody[5]+12, name[output_color-1]);
        for (i=0; i < 3; i++)
            for (j=0; j < colors; j++)
                for (out_cam[i][j] = k=0; k < 3; k++)
                    out_cam[i][j] += out_rgb[output_color-1][i][k] * rgb_cam[k][j];
    }
    if (verbose)
        fprintf (stderr, raw_color ? _("Building histograms...\n") :
                 _("Converting to %s colorspace...\n"), name[output_color-1]);

    memset (histogram, 0, sizeof histogram);
    if(!raw_color) {
        __m128 outcam0= {out_cam[0][0],out_cam[1][0],out_cam[2][0],0},
               outcam1= {out_cam[0][1],out_cam[1][1],out_cam[2][1],0},
               outcam2= {out_cam[0][2],out_cam[1][2],out_cam[2][2],0},
               outcam3= {out_cam[0][3],out_cam[1][3],out_cam[2][3],0};
        for (img=image[0]; img < image[width*height]; img+=4) {
            __m128 out0;
            __m128 vimg0 = {img[0],img[0],img[0],0},
                   vimg1 = {img[1],img[1],img[1],0},
                   vimg2 = {img[2],img[2],img[2],0},
                   vimg3 = {img[3],img[3],img[3],0};

//             out[0] = out_cam[0][0] * img[0]
//                     +out_cam[0][1] * img[1]
//                     +out_cam[0][2] * img[2]
//                     +out_cam[0][3] * img[3];
//             out[1] = out_cam[1][0] * img[0]
//                     +out_cam[1][1] * img[1]
//                     +out_cam[1][2] * img[2]
//                     +out_cam[1][3] * img[3];
//             out[2] = out_cam[2][0] * img[0]
//                     +out_cam[2][1] * img[1]
//                     +out_cam[2][2] * img[2]
//                     +out_cam[2][3] * img[3];
            out0 = _mm_add_ps(_mm_add_ps(
                     _mm_mul_ps(vimg0, outcam0),
                     _mm_mul_ps(vimg1, outcam1)
                   ), _mm_add_ps(
                     _mm_mul_ps(vimg2, outcam2),
                     _mm_mul_ps(vimg3, outcam3)
                   ));
            //clip
            out0 = _mm_max_ps(_mm_set1_ps(0), _mm_min_ps(_mm_set1_ps(0xffff), _mm_round_ps(out0, _MM_FROUND_TO_ZERO)));
            __m128i o = _mm_cvtps_epi32(out0);
            o = _mm_packus_epi32(o,_mm_setzero_si128());
            memcpy(img, &o, sizeof(short)*3);

            FORCC histogram[c][img[c] >> 3]++;
        }

    } else if (document_mode) {

コード例 #11

0

ファイルを表示

ファイル: av1_highbd_convolve_sse4.c プロジェクト: luke-chang/gecko-1

static void highbdRndingPacks(__m128i *u) {
  __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
  u[0] = _mm_add_epi32(u[0], rnd);
  u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
  u[0] = _mm_packus_epi32(u[0], u[0]);
}

コード例 #12

0

ファイルを表示

ファイル: sse41-builtins.c プロジェクト: ashwinma/clang_trunk

__m128i test_mm_packus_epi32(__m128i x, __m128i y) {
  // CHECK-LABEL: test_mm_packus_epi32
  // CHECK: call <8 x i16> @llvm.x86.sse41.packusdw
  // CHECK-ASM: packusdw %xmm{{.*}}, %xmm{{.*}}
  return _mm_packus_epi32(x, y);
}

コード例 #13

0

ファイルを表示

ファイル: sse41-builtins.c プロジェクト: JaredCJR/clang

__m128i test_mm_packus_epi32(__m128i x, __m128i y) {
  // CHECK-LABEL: test_mm_packus_epi32
  // CHECK: call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
  return _mm_packus_epi32(x, y);
}

コード例 #14

0

ファイルを表示

ファイル: SimdSse41Detection.cpp プロジェクト: ashh87/Simd

 SIMD_INLINE void PackResult32i(const uint32_t * src, uint8_t * dst)
 {
     __m128i lo = _mm_packus_epi32(_mm_loadu_si128((__m128i*)src + 0), _mm_loadu_si128((__m128i*)src + 1));
     __m128i hi = _mm_packus_epi32(_mm_loadu_si128((__m128i*)src + 2), _mm_loadu_si128((__m128i*)src + 3));
     _mm_storeu_si128((__m128i*)dst, _mm_packus_epi16(lo, hi));
 }

コード例 #15

0

ファイルを表示

ファイル: highbd_convolve_2d_sse4.c プロジェクト: jfiguinha/Regards

void av1_highbd_jnt_convolve_2d_copy_sse4_1(
    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *dst = conv_params->dst;
  int dst_stride = conv_params->dst_stride;
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_q4;
  (void)subpel_y_q4;

  const int bits =
      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
  const __m128i left_shift = _mm_cvtsi32_si128(bits);
  const int do_average = conv_params->do_average;
  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi32(w0);
  const __m128i wt1 = _mm_set1_epi32(w1);
  const __m128i zero = _mm_setzero_si128();
  int i, j;

  const int offset_0 =
      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  const __m128i offset_const = _mm_set1_epi32(offset);
  const __m128i offset_const_16b = _mm_set1_epi16(offset);
  const int rounding_shift =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
  const __m128i clip_pixel_to_bd =
      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));

  assert(bits <= 4);

  if (!(w % 8)) {
    for (i = 0; i < h; i += 1) {
      for (j = 0; j < w; j += 8) {
        const __m128i src_16bit =
            _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]));
        const __m128i res = _mm_sll_epi16(src_16bit, left_shift);
        if (do_average) {
          const __m128i data_0 =
              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));

          const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero);
          const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero);

          const __m128i res_32b_lo = _mm_unpacklo_epi16(res, zero);
          const __m128i res_unsigned_lo =
              _mm_add_epi32(res_32b_lo, offset_const);

          const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
              &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);

          const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
          const __m128i res_unsigned_hi =
              _mm_add_epi32(res_32b_hi, offset_const);

          const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
              &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);

          const __m128i round_result_lo = highbd_convolve_rounding_sse2(
              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
          const __m128i round_result_hi = highbd_convolve_rounding_sse2(
              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);

          const __m128i res_16b =
              _mm_packus_epi32(round_result_lo, round_result_hi);
          const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);

          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
        } else {
          const __m128i res_unsigned_16b =
              _mm_adds_epu16(res, offset_const_16b);

          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]),
                          res_unsigned_16b);
        }
      }
    }
  } else if (!(w % 4)) {
    for (i = 0; i < h; i += 2) {
      for (j = 0; j < w; j += 4) {
        const __m128i src_row_0 =
            _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j]));
        const __m128i src_row_1 =
            _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride]));
        const __m128i src_10 = _mm_unpacklo_epi64(src_row_0, src_row_1);

        const __m128i res = _mm_sll_epi16(src_10, left_shift);

        if (do_average) {
          const __m128i data_0 =
              _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
          const __m128i data_1 = _mm_loadl_epi64(
              (__m128i *)(&dst[i * dst_stride + j + dst_stride]));

          const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
          const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero);

          const __m128i res_32b = _mm_unpacklo_epi16(res, zero);
          const __m128i res_unsigned_lo = _mm_add_epi32(res_32b, offset_const);

          const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
          const __m128i res_unsigned_hi =
              _mm_add_epi32(res_32b_hi, offset_const);

          const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
              &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
          const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
              &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);

          const __m128i round_result_lo = highbd_convolve_rounding_sse2(
              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
          const __m128i round_result_hi = highbd_convolve_rounding_sse2(
              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);

          const __m128i res_16b =
              _mm_packus_epi32(round_result_lo, round_result_hi);
          const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);

          const __m128i res_1 = _mm_srli_si128(res_clip, 8);

          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
          _mm_storel_epi64(
              (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
        } else {
          const __m128i res_unsigned_16b =
              _mm_adds_epu16(res, offset_const_16b);

          const __m128i res_1 = _mm_srli_si128(res_unsigned_16b, 8);

          _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]),
                           res_unsigned_16b);
          _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                           res_1);
        }
      }
    }
  }
}

コード例 #16

0

ファイルを表示

ファイル: highbd_convolve_2d_sse4.c プロジェクト: jfiguinha/Regards

void av1_highbd_jnt_convolve_2d_sse4_1(
    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  DECLARE_ALIGNED(16, int16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
  CONV_BUF_TYPE *dst = conv_params->dst;
  int dst_stride = conv_params->dst_stride;
  int im_h = h + filter_params_y->taps - 1;
  int im_stride = MAX_SB_SIZE;
  int i, j;
  const int do_average = conv_params->do_average;
  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;

  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi32(w0);
  const __m128i wt1 = _mm_set1_epi32(w1);

  const int offset_0 =
      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  const __m128i offset_const = _mm_set1_epi32(offset);
  const int rounding_shift =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
  const __m128i clip_pixel_to_bd =
      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

  /* Horizontal filter */
  {
    const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
        filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);

    // coeffs 0 1 0 1 2 3 2 3
    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
    // coeffs 4 5 4 5 6 7 6 7
    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);

    // coeffs 0 1 0 1 0 1 0 1
    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    // coeffs 2 3 2 3 2 3 2 3
    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    // coeffs 4 5 4 5 4 5 4 5
    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    // coeffs 6 7 6 7 6 7 6 7
    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);

    const __m128i round_const = _mm_set1_epi32(
        ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);

    for (i = 0; i < im_h; ++i) {
      for (j = 0; j < w; j += 8) {
        const __m128i data =
            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
        const __m128i data2 =
            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);

        // Filter even-index pixels
        const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
        const __m128i res_2 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
        const __m128i res_4 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
        const __m128i res_6 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);

        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
                                         _mm_add_epi32(res_2, res_6));
        res_even =
            _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);

        // Filter odd-index pixels
        const __m128i res_1 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
        const __m128i res_3 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
        const __m128i res_5 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
        const __m128i res_7 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);

        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
                                        _mm_add_epi32(res_3, res_7));
        res_odd =
            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);

        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
        __m128i res = _mm_packs_epi32(res_even, res_odd);
        _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
      }
    }
  }

  /* Vertical filter */
  {
    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
        filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);

    // coeffs 0 1 0 1 2 3 2 3
    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
    // coeffs 4 5 4 5 6 7 6 7
    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);

    // coeffs 0 1 0 1 0 1 0 1
    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    // coeffs 2 3 2 3 2 3 2 3
    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    // coeffs 4 5 4 5 4 5 4 5
    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    // coeffs 6 7 6 7 6 7 6 7
    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);

    const __m128i round_const = _mm_set1_epi32(
        ((1 << conv_params->round_1) >> 1) -
        (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);

    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; j += 8) {
        // Filter even-index pixels
        const int16_t *data = &im_block[i * im_stride + j];
        const __m128i src_0 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
                               *(__m128i *)(data + 1 * im_stride));
        const __m128i src_2 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
                               *(__m128i *)(data + 3 * im_stride));
        const __m128i src_4 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
                               *(__m128i *)(data + 5 * im_stride));
        const __m128i src_6 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
                               *(__m128i *)(data + 7 * im_stride));

        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);

        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                               _mm_add_epi32(res_4, res_6));

        // Filter odd-index pixels
        const __m128i src_1 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
                               *(__m128i *)(data + 1 * im_stride));
        const __m128i src_3 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
                               *(__m128i *)(data + 3 * im_stride));
        const __m128i src_5 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
                               *(__m128i *)(data + 5 * im_stride));
        const __m128i src_7 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
                               *(__m128i *)(data + 7 * im_stride));

        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);

        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
                                              _mm_add_epi32(res_5, res_7));

        // Rearrange pixels back into the order 0 ... 7
        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);

        const __m128i res_lo_round =
            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);

        const __m128i res_unsigned_lo =
            _mm_add_epi32(res_lo_round, offset_const);

        if (w < 8) {
          if (do_average) {
            const __m128i data_0 =
                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));

            const __m128i data_ref_0 = _mm_cvtepu16_epi32(data_0);

            const __m128i comp_avg_res = highbd_comp_avg_sse4_1(
                &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);

            const __m128i round_result = highbd_convolve_rounding_sse2(
                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

            const __m128i res_16b =
                _mm_packus_epi32(round_result, round_result);
            const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);

            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
          } else {
            const __m128i res_16b =
                _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo);
            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_16b);
          }
        } else {
          const __m128i res_hi_round =
              _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);

          const __m128i res_unsigned_hi =
              _mm_add_epi32(res_hi_round, offset_const);

          if (do_average) {
            const __m128i data_lo =
                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
            const __m128i data_hi =
                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j + 4]));

            const __m128i data_ref_0_lo = _mm_cvtepu16_epi32(data_lo);
            const __m128i data_ref_0_hi = _mm_cvtepu16_epi32(data_hi);

            const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
                &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
            const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
                &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);

            const __m128i round_result_lo =
                highbd_convolve_rounding_sse2(&comp_avg_res_lo, &offset_const,
                                              &rounding_const, rounding_shift);
            const __m128i round_result_hi =
                highbd_convolve_rounding_sse2(&comp_avg_res_hi, &offset_const,
                                              &rounding_const, rounding_shift);

            const __m128i res_16b =
                _mm_packus_epi32(round_result_lo, round_result_hi);
            const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);

            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
          } else {
            const __m128i res_16b =
                _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi);
            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b);
          }
        }
      }
    }
  }
}