Beispiel #1
0
static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
                                            const uint8_t* pred,
                                            uint8_t* dst, int length) {
  v16u8 src0, pred0, dst0;
  assert(length >= 0);
  while (length >= 32) {
    v16u8 src1, pred1, dst1;
    LD_UB2(src, 16, src0, src1);
    LD_UB2(pred, 16, pred0, pred1);
    SUB2(src0, pred0, src1, pred1, dst0, dst1);
    ST_UB2(dst0, dst1, dst, 16);
    src += 32;
    pred += 32;
    dst += 32;
    length -= 32;
  }
  if (length > 0) {
    int i;
    if (length >= 16) {
      src0 = LD_UB(src);
      pred0 = LD_UB(pred);
      dst0 = src0 - pred0;
      ST_UB(dst0, dst);
      src += 16;
      pred += 16;
      dst += 16;
      length -= 16;
    }
    for (i = 0; i < length; i++) {
      dst[i] = src[i] - pred[i];
    }
  }
}
Beispiel #2
0
static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
                                            const uint8_t* ppred,
                                            uint8_t* poutput, int stride,
                                            int size) {
  int w;
  const v16i8 zero = { 0 };
  while (size >= 16) {
    v16u8 pred0, dst0;
    v8i16 a0, a1, b0, b1, c0, c1;
    const v16u8 tmp0 = LD_UB(ppred - 1);
    const v16u8 tmp1 = LD_UB(ppred - stride);
    const v16u8 tmp2 = LD_UB(ppred - stride - 1);
    const v16u8 src0 = LD_UB(pinput);
    ILVRL_B2_SH(zero, tmp0, a0, a1);
    ILVRL_B2_SH(zero, tmp1, b0, b1);
    ILVRL_B2_SH(zero, tmp2, c0, c1);
    ADD2(a0, b0, a1, b1, a0, a1);
    SUB2(a0, c0, a1, c1, a0, a1);
    CLIP_SH2_0_255(a0, a1);
    pred0 = (v16u8)__msa_pckev_b((v16i8)a1, (v16i8)a0);
    dst0 = src0 - pred0;
    ST_UB(dst0, poutput);
    ppred += 16;
    pinput += 16;
    poutput += 16;
    size -= 16;
  }
  for (w = 0; w < size; ++w) {
    const int pred = ppred[w - 1] + ppred[w - stride] - ppred[w - stride - 1];
    poutput[w] = pinput[w] - (pred < 0 ? 0 : pred > 255 ? 255 : pred);
  }
}
Beispiel #3
0
static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
                                    const uint8_t* top) {
  int DC;
  v16u8 out;
  if (top != NULL && left != NULL) {
    const v16u8 rtop = LD_UB(top);
    const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
    const v16u8 rleft = LD_UB(left);
    const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
    const v8u16 dctemp = dctop + dcleft;
    DC = HADD_UH_U32(dctemp);
    DC = (DC + 16) >> 5;
  } else if (left != NULL) {   // left but no top
Beispiel #4
0
static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
                                         int32_t dst_stride) {
  uint32_t row;
  v16u8 src1, src2;

  src1 = LD_UB(src);
  src2 = LD_UB(src + 16);

  for (row = 32; row--;) {
    ST_UB2(src1, src2, dst, 16);
    dst += dst_stride;
  }
}
static void intra_predict_dc_8x8_msa(uint8_t *src_top, uint8_t *src_left,
                                     int32_t src_stride_left,
                                     uint8_t *dst, int32_t dst_stride,
                                     uint8_t is_above, uint8_t is_left)
{
    uint32_t row, addition = 0;
    uint64_t out;
    v16u8 src_above, store;
    v8u16 sum_above;
    v4u32 sum_top;
    v2u64 sum;

    if (is_left && is_above)
    {
        src_above = LD_UB(src_top);

        sum_above = __msa_hadd_u_h(src_above, src_above);
        sum_top = __msa_hadd_u_w(sum_above, sum_above);
        sum = __msa_hadd_u_d(sum_top, sum_top);
        addition = __msa_copy_u_w((v4i32)sum, 0);

        for (row = 0; row < 8; ++row)
        {
            addition += src_left[row * src_stride_left];
        }

        addition = (addition + 8) >> 4;
        store = (v16u8)__msa_fill_b(addition);
    }
void aom_plane_add_noise_msa(uint8_t *start_ptr, char *noise,
                             char blackclamp[16], char whiteclamp[16],
                             char bothclamp[16], uint32_t width,
                             uint32_t height, int32_t pitch) {
  uint32_t i, j;

  for (i = 0; i < height / 2; ++i) {
    uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch;
    int8_t *ref0_ptr = (int8_t *)(noise + (rand() & 0xff));
    uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch;
    int8_t *ref1_ptr = (int8_t *)(noise + (rand() & 0xff));
    for (j = width / 16; j--;) {
      v16i8 temp00_s, temp01_s;
      v16u8 temp00, temp01, black_clamp, white_clamp;
      v16u8 pos0, ref0, pos1, ref1;
      v16i8 const127 = __msa_ldi_b(127);

      pos0 = LD_UB(pos0_ptr);
      ref0 = LD_UB(ref0_ptr);
      pos1 = LD_UB(pos1_ptr);
      ref1 = LD_UB(ref1_ptr);
      black_clamp = (v16u8)__msa_fill_b(blackclamp[0]);
      white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]);
      temp00 = (pos0 < black_clamp);
      pos0 = __msa_bmnz_v(pos0, black_clamp, temp00);
      temp01 = (pos1 < black_clamp);
      pos1 = __msa_bmnz_v(pos1, black_clamp, temp01);
      XORI_B2_128_UB(pos0, pos1);
      temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127);
      temp00 = (v16u8)(temp00_s < pos0);
      pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00);
      temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127);
      temp01 = (temp01_s < pos1);
      pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01);
      XORI_B2_128_UB(pos0, pos1);
      pos0 += ref0;
      ST_UB(pos0, pos0_ptr);
      pos1 += ref1;
      ST_UB(pos1, pos1_ptr);
      pos0_ptr += 16;
      pos1_ptr += 16;
      ref0_ptr += 16;
      ref1_ptr += 16;
    }
  }
}
static void intra_predict_vert_16x16_msa(uint8_t *src, uint8_t *dst,
                                         int32_t dst_stride)
{
    v16u8 out = LD_UB(src);

    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
    dst += (8 * dst_stride);
    ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
}
Beispiel #8
0
static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) {
  if (top != NULL) {
    const v16u8 out = LD_UB(top);
    STORE16x16(out, dst);
  } else {
    const v16u8 out = (v16u8)__msa_fill_b(0x7f);
    STORE16x16(out, dst);
  }
}
Beispiel #9
0
static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst,
                                         int32_t dst_stride) {
  uint32_t row;
  v16u8 src0;

  src0 = LD_UB(src);

  for (row = 16; row--;) {
    ST_UB(src0, dst);
    dst += dst_stride;
  }
}
Beispiel #10
0
static void intra_predict_dc_16x16_msa(const uint8_t *src_top,
                                       const uint8_t *src_left, uint8_t *dst,
                                       int32_t dst_stride) {
  v16u8 top, left, out;
  v8u16 sum_h, sum_top, sum_left;
  v4u32 sum_w;
  v2u64 sum_d;

  top = LD_UB(src_top);
  left = LD_UB(src_left);
  HADD_UB2_UH(top, left, sum_top, sum_left);
  sum_h = sum_top + sum_left;
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
  dst += (8 * dst_stride);
  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
}
Beispiel #11
0
static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
  const v16i8 zero = { 0 };
  const v8i16 TL = (v8i16)__msa_fill_h(top[-1]);
  const v8i16 L0 = (v8i16)__msa_fill_h(top[-2]);
  const v8i16 L1 = (v8i16)__msa_fill_h(top[-3]);
  const v8i16 L2 = (v8i16)__msa_fill_h(top[-4]);
  const v8i16 L3 = (v8i16)__msa_fill_h(top[-5]);
  const v16u8 T1 = LD_UB(top);
  const v8i16 T  = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
  const v8i16 d = T - TL;
  v8i16 r0, r1, r2, r3;
  ADD4(d, L0, d, L1, d, L2, d, L3, r0, r1, r2, r3);
  CLIP_SH4_0_255(r0, r1, r2, r3);
  PCKEV_ST4x4_UB(r0, r1, r2, r3, dst, BPS);
}
static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  int8_t *filter_horiz,
                                                  int8_t *filter_vert,
                                                  int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= (3 + 3 * src_stride);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);

  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);

  filt = LD_SH(filter_vert);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
Beispiel #13
0
static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst,
                                          int32_t dst_stride) {
  v16u8 data, out;
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  data = LD_UB(src);
  sum_h = __msa_hadd_u_h(data, data);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
  dst += (8 * dst_stride);
  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
}
Beispiel #14
0
static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
                                        const uint8_t* top) {
  if (left != NULL) {
    if (top != NULL) {
      int j;
      v8i16 d1, d2;
      const v16i8 zero = { 0 };
      const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);
      const v16u8 T = LD_UB(top);
      ILVRL_B2_SH(zero, T, d1, d2);
      SUB2(d1, TL, d2, TL, d1, d2);
      for (j = 0; j < 16; j += 4) {
        v16i8 t0, t1, t2, t3;
        v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
        const v8i16 L0 = (v8i16)__msa_fill_h(left[j + 0]);
        const v8i16 L1 = (v8i16)__msa_fill_h(left[j + 1]);
        const v8i16 L2 = (v8i16)__msa_fill_h(left[j + 2]);
        const v8i16 L3 = (v8i16)__msa_fill_h(left[j + 3]);
        ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);
        ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);
        CLIP_SH4_0_255(r0, r1, r2, r3);
        CLIP_SH4_0_255(r4, r5, r6, r7);
        PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);
        ST_SB4(t0, t1, t2, t3, dst, BPS);
        dst += 4 * BPS;
      }
    } else {
      HorizontalPred16x16(dst, left);
    }
  } else {
    if (top != NULL) {
      VerticalPred16x16(dst, top);
    } else {
      const v16u8 out = (v16u8)__msa_fill_b(0x81);
      STORE16x16(out, dst);
    }
  }
}
Beispiel #15
0
}

static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
                                    const uint8_t* top) {
  int DC;
  v16u8 out;
  if (top != NULL && left != NULL) {
    const v16u8 rtop = LD_UB(top);
    const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
    const v16u8 rleft = LD_UB(left);
    const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
    const v8u16 dctemp = dctop + dcleft;
    DC = HADD_UH_U32(dctemp);
    DC = (DC + 16) >> 5;
  } else if (left != NULL) {   // left but no top
    const v16u8 rleft = LD_UB(left);
    const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
    DC = HADD_UH_U32(dcleft);
    DC = (DC + DC + 16) >> 5;
  } else if (top != NULL) {   // top but no left
    const v16u8 rtop = LD_UB(top);
    const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
    DC = HADD_UH_U32(dctop);
    DC = (DC + DC + 16) >> 5;
  } else {   // no top, no left, nothing.
    DC = 0x80;
  }
  out = (v16u8)__msa_fill_b(DC);
  STORE16x16(out, dst);
}
Beispiel #16
0
void
yuv_abgr_convert_msa (JSAMPROW p_in_y, JSAMPROW p_in_cb, JSAMPROW p_in_cr,
                      JSAMPROW p_rgb, JDIMENSION out_width)
{
  int y, cb, cr;
  unsigned int col, num_cols_mul_16 = out_width >> 4;
  unsigned int remaining_wd = out_width & 0xF;
  v16i8 alpha = __msa_ldi_b(0xFF);
  v16i8 const_128 = __msa_ldi_b(128);
  v16u8 out0, out1, out2, out3, input_y = {0};
  v16i8 input_cb, input_cr, out_rgb0, out_rgb1, out_ab0, out_ab1;
  v8i16 y_h0, y_h1, cb_h0, cb_h1, cr_h0, cr_h1;
  v4i32 cb_w0, cb_w1, cb_w2, cb_w3, cr_w0, cr_w1, cr_w2, cr_w3, zero = {0};
  v16i8  out_r0, out_g0, out_b0;

  for (col = num_cols_mul_16; col--;) {
    input_y = LD_UB(p_in_y);
    input_cb = LD_SB(p_in_cb);
    input_cr = LD_SB(p_in_cr);

    p_in_y += 16;
    p_in_cb += 16;
    p_in_cr += 16;

    input_cb -= const_128;
    input_cr -= const_128;

    UNPCK_UB_SH(input_y, y_h0, y_h1);
    UNPCK_SB_SH(input_cb, cb_h0, cb_h1);
    UNPCK_SB_SH(input_cr, cr_h0, cr_h1);

    CALC_G4_FRM_YUV(y_h0, y_h1, cb_h0, cb_h1, cr_h0, cr_h1, out_g0);

    UNPCK_SH_SW(cr_h0, cr_w0, cr_w1);
    UNPCK_SH_SW(cr_h1, cr_w2, cr_w3);
    CALC_R4_FRM_YUV(y_h0, y_h1, cr_w0, cr_w1, cr_w2, cr_w3, out_r0);

    UNPCK_SH_SW(cb_h0, cb_w0, cb_w1);
    UNPCK_SH_SW(cb_h1, cb_w2, cb_w3);
    CALC_B4_FRM_YUV(y_h0, y_h1, cb_w0, cb_w1, cb_w2, cb_w3, out_b0);

    ILVRL_B2_SB(out_r0, out_g0, out_rgb0, out_rgb1);
    ILVRL_B2_SB(out_b0, alpha, out_ab0, out_ab1);

    ILVRL_H2_UB(out_rgb0, out_ab0, out0, out1);
    ILVRL_H2_UB(out_rgb1, out_ab1, out2, out3);

    ST_UB4(out0, out1, out2, out3, p_rgb, 16);
    p_rgb += 16 * 4;
  }

  if (remaining_wd >= 8) {
    uint64_t in_y, in_cb, in_cr;
    v16i8 input_cbcr = {0};

    in_y = LD(p_in_y);
    in_cb = LD(p_in_cb);
    in_cr = LD(p_in_cr);

    p_in_y += 8;
    p_in_cb += 8;
    p_in_cr += 8;

    input_y = (v16u8) __msa_insert_d((v2i64) input_y, 0, in_y);
    input_cbcr = (v16i8) __msa_insert_d((v2i64) input_cbcr, 0, in_cb);
    input_cbcr = (v16i8) __msa_insert_d((v2i64) input_cbcr, 1, in_cr);

    input_cbcr -= const_128;

    y_h0 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) input_y);
    UNPCK_SB_SH(input_cbcr, cb_h0, cr_h0);
    UNPCK_SH_SW(cb_h0, cb_w0, cb_w1);
    UNPCK_SH_SW(cr_h0, cr_w0, cr_w1);

    CALC_R2_FRM_YUV(y_h0, cr_w0, cr_w1, out_r0);
    CALC_G2_FRM_YUV(y_h0, cb_h0, cr_h0, out_g0);
    CALC_B2_FRM_YUV(y_h0, cb_w0, cb_w1, out_b0);

    out_rgb0 = (v16i8) __msa_ilvr_b((v16i8) out_r0, (v16i8) out_g0);
    out_ab0 = (v16i8) __msa_ilvr_b((v16i8) out_b0, alpha);
    ILVRL_H2_UB(out_rgb0, out_ab0, out0, out1);

    ST_UB2(out0, out1, p_rgb, 16);
    p_rgb += 16 * 2;

    remaining_wd -= 8;
  }

  for (col = 0; col < remaining_wd; col++) {
    y  = (int) (p_in_y[col]);
    cb = (int) (p_in_cb[col]) - 128;
    cr = (int) (p_in_cr[col]) - 128;

    p_rgb[0] = 0xFF;
    p_rgb[1] = clip_pixel(y + ROUND_POWER_OF_TWO(FIX_1_77200 * cb, 16));
    p_rgb[2] = clip_pixel(y + ROUND_POWER_OF_TWO(((-FIX_0_34414) * cb -
                                                   FIX_0_71414 * cr), 16));
    p_rgb[3] = clip_pixel(y + ROUND_POWER_OF_TWO(FIX_1_40200 * cr, 16));
    p_rgb += 4;
  }
}
Beispiel #17
0
void
yuv_bgr_convert_msa (JSAMPROW p_in_y, JSAMPROW p_in_cb, JSAMPROW p_in_cr,
                     JSAMPROW p_rgb, JDIMENSION out_width)
{
  int32_t y, cb, cr;
  uint32_t col, num_cols_mul_16 = out_width >> 4;
  uint32_t remaining_wd = out_width & 0xF;
  v16u8 mask_rgb0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10};
  v16u8 mask_rgb1 = {11, 21, 12, 13, 22, 14, 15, 23, 0, 1, 24, 2, 3, 25, 4, 5};
  v16u8 mask_rgb2 = {26, 6, 7, 27, 8, 9, 28, 10, 11, 29, 12, 13, 30, 14, 15, 31};
  v16u8 tmp0, tmp1, out0, out1, out2, input_y = {0};
  v16i8 input_cb, input_cr, out_rgb0, out_rgb1, const_128 = __msa_ldi_b(128);
  v8i16 y_h0, y_h1, cb_h0, cb_h1, cr_h0, cr_h1;
  v4i32 cb_w0, cb_w1, cb_w2, cb_w3, cr_w0, cr_w1, cr_w2, cr_w3, zero = {0};
  v16i8  out_r0, out_g0, out_b0;

  for (col = num_cols_mul_16; col--;) {
    input_y = LD_UB(p_in_y);
    input_cb = LD_SB(p_in_cb);
    input_cr = LD_SB(p_in_cr);

    p_in_y += 16;
    p_in_cb += 16;
    p_in_cr += 16;

    input_cb -= const_128;
    input_cr -= const_128;

    UNPCK_UB_SH(input_y, y_h0, y_h1);
    UNPCK_SB_SH(input_cb, cb_h0, cb_h1);
    UNPCK_SB_SH(input_cr, cr_h0, cr_h1);

    CALC_G4_FRM_YUV(y_h0, y_h1, cb_h0, cb_h1, cr_h0, cr_h1, out_g0);

    UNPCK_SH_SW(cr_h0, cr_w0, cr_w1);
    UNPCK_SH_SW(cr_h1, cr_w2, cr_w3);
    CALC_R4_FRM_YUV(y_h0, y_h1, cr_w0, cr_w1, cr_w2, cr_w3, out_r0);

    UNPCK_SH_SW(cb_h0, cb_w0, cb_w1);
    UNPCK_SH_SW(cb_h1, cb_w2, cb_w3);
    CALC_B4_FRM_YUV(y_h0, y_h1, cb_w0, cb_w1, cb_w2, cb_w3, out_b0);

    ILVRL_B2_SB(out_g0, out_b0, out_rgb0, out_rgb1);

    VSHF_B2_UB(out_rgb0, out_r0, out_rgb0, out_r0, mask_rgb0, mask_rgb1,
               out0, tmp0);
    VSHF_B2_UB(out_rgb1, out_r0, out_rgb1, out_r0, mask_rgb1, mask_rgb2,
               tmp1, out2);
    out1 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) tmp1, 8);
    out1 = (v16u8) __msa_pckev_d((v2i64) out1, (v2i64) tmp0);

    ST_UB(out0, p_rgb);
    p_rgb += 16;
    ST_UB(out1, p_rgb);
    p_rgb += 16;
    ST_UB(out2, p_rgb);
    p_rgb += 16;
  }

  if (remaining_wd >= 8) {
    uint64_t in_y, in_cb, in_cr;
    v16i8 input_cbcr = {0};

    in_y = LD(p_in_y);
    in_cb = LD(p_in_cb);
    in_cr = LD(p_in_cr);

    p_in_y += 8;
    p_in_cb += 8;
    p_in_cr += 8;

    input_y = (v16u8) __msa_insert_d((v2i64) input_y, 0, in_y);
    input_cbcr = (v16i8) __msa_insert_d((v2i64) input_cbcr, 0, in_cb);
    input_cbcr = (v16i8) __msa_insert_d((v2i64) input_cbcr, 1, in_cr);

    input_cbcr -= const_128;

    y_h0 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) input_y);
    UNPCK_SB_SH(input_cbcr, cb_h0, cr_h0);
    UNPCK_SH_SW(cb_h0, cb_w0, cb_w1);
    UNPCK_SH_SW(cr_h0, cr_w0, cr_w1);

    CALC_R2_FRM_YUV(y_h0, cr_w0, cr_w1, out_r0);
    CALC_G2_FRM_YUV(y_h0, cb_h0, cr_h0, out_g0);
    CALC_B2_FRM_YUV(y_h0, cb_w0, cb_w1, out_b0);

    out_rgb0 = (v16i8) __msa_ilvr_b((v16i8) out_g0, (v16i8) out_b0);
    VSHF_B2_UB(out_rgb0, out_r0, out_rgb0, out_r0, mask_rgb0, mask_rgb1,
               out0, out1);

    ST_UB(out0, p_rgb);
    p_rgb += 16;
    ST8x1_UB(out1, p_rgb);
    p_rgb += 8;

    remaining_wd -= 8;
  }

  for (col = 0; col < remaining_wd; col++) {
    y  = (int) (p_in_y[col]);
    cb = (int) (p_in_cb[col]) - 128;
    cr = (int) (p_in_cr[col]) - 128;

    /* Range-limiting is essential due to noise introduced by DCT losses. */
    p_rgb[0] = clip_pixel(y + ROUND_POWER_OF_TWO(FIX_1_77200 * cb, 16));
    p_rgb[1] = clip_pixel(y + ROUND_POWER_OF_TWO(((-FIX_0_34414) * cb -
                                                   FIX_0_71414 * cr), 16));
    p_rgb[2] = clip_pixel(y + ROUND_POWER_OF_TWO(FIX_1_40200 * cr, 16));
    p_rgb += 3;
  }
}