static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput, const uint8_t* ppred, uint8_t* poutput, int stride, int size) { int w; const v16i8 zero = { 0 }; while (size >= 16) { v16u8 pred0, dst0; v8i16 a0, a1, b0, b1, c0, c1; const v16u8 tmp0 = LD_UB(ppred - 1); const v16u8 tmp1 = LD_UB(ppred - stride); const v16u8 tmp2 = LD_UB(ppred - stride - 1); const v16u8 src0 = LD_UB(pinput); ILVRL_B2_SH(zero, tmp0, a0, a1); ILVRL_B2_SH(zero, tmp1, b0, b1); ILVRL_B2_SH(zero, tmp2, c0, c1); ADD2(a0, b0, a1, b1, a0, a1); SUB2(a0, c0, a1, c1, a0, a1); CLIP_SH2_0_255(a0, a1); pred0 = (v16u8)__msa_pckev_b((v16i8)a1, (v16i8)a0); dst0 = src0 - pred0; ST_UB(dst0, poutput); ppred += 16; pinput += 16; poutput += 16; size -= 16; } for (w = 0; w < size; ++w) { const int pred = ppred[w - 1] + ppred[w - stride] - ppred[w - stride - 1]; poutput[w] = pinput[w] - (pred < 0 ? 0 : pred > 255 ? 255 : pred); } }
static WEBP_INLINE void PredictLineInverse0(const uint8_t* src, const uint8_t* pred, uint8_t* dst, int length) { v16u8 src0, pred0, dst0; assert(length >= 0); while (length >= 32) { v16u8 src1, pred1, dst1; LD_UB2(src, 16, src0, src1); LD_UB2(pred, 16, pred0, pred1); SUB2(src0, pred0, src1, pred1, dst0, dst1); ST_UB2(dst0, dst1, dst, 16); src += 32; pred += 32; dst += 32; length -= 32; } if (length > 0) { int i; if (length >= 16) { src0 = LD_UB(src); pred0 = LD_UB(pred); dst0 = src0 - pred0; ST_UB(dst0, dst); src += 16; pred += 16; dst += 16; length -= 16; } for (i = 0; i < length; i++) { dst[i] = src[i] - pred[i]; } } }
void aom_plane_add_noise_msa(uint8_t *start_ptr, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], uint32_t width, uint32_t height, int32_t pitch) { uint32_t i, j; for (i = 0; i < height / 2; ++i) { uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch; int8_t *ref0_ptr = (int8_t *)(noise + (rand() & 0xff)); uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch; int8_t *ref1_ptr = (int8_t *)(noise + (rand() & 0xff)); for (j = width / 16; j--;) { v16i8 temp00_s, temp01_s; v16u8 temp00, temp01, black_clamp, white_clamp; v16u8 pos0, ref0, pos1, ref1; v16i8 const127 = __msa_ldi_b(127); pos0 = LD_UB(pos0_ptr); ref0 = LD_UB(ref0_ptr); pos1 = LD_UB(pos1_ptr); ref1 = LD_UB(ref1_ptr); black_clamp = (v16u8)__msa_fill_b(blackclamp[0]); white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]); temp00 = (pos0 < black_clamp); pos0 = __msa_bmnz_v(pos0, black_clamp, temp00); temp01 = (pos1 < black_clamp); pos1 = __msa_bmnz_v(pos1, black_clamp, temp01); XORI_B2_128_UB(pos0, pos1); temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127); temp00 = (v16u8)(temp00_s < pos0); pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00); temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127); temp01 = (temp01_s < pos1); pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01); XORI_B2_128_UB(pos0, pos1); pos0 += ref0; ST_UB(pos0, pos0_ptr); pos1 += ref1; ST_UB(pos1, pos1_ptr); pos0_ptr += 16; pos1_ptr += 16; ref0_ptr += 16; ref1_ptr += 16; } } }
static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst, int32_t dst_stride) { uint32_t row; v16u8 src0; src0 = LD_UB(src); for (row = 16; row--;) { ST_UB(src0, dst); dst += dst_stride; } }
void yuv_bgr_convert_msa (JSAMPROW p_in_y, JSAMPROW p_in_cb, JSAMPROW p_in_cr, JSAMPROW p_rgb, JDIMENSION out_width) { int32_t y, cb, cr; uint32_t col, num_cols_mul_16 = out_width >> 4; uint32_t remaining_wd = out_width & 0xF; v16u8 mask_rgb0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10}; v16u8 mask_rgb1 = {11, 21, 12, 13, 22, 14, 15, 23, 0, 1, 24, 2, 3, 25, 4, 5}; v16u8 mask_rgb2 = {26, 6, 7, 27, 8, 9, 28, 10, 11, 29, 12, 13, 30, 14, 15, 31}; v16u8 tmp0, tmp1, out0, out1, out2, input_y = {0}; v16i8 input_cb, input_cr, out_rgb0, out_rgb1, const_128 = __msa_ldi_b(128); v8i16 y_h0, y_h1, cb_h0, cb_h1, cr_h0, cr_h1; v4i32 cb_w0, cb_w1, cb_w2, cb_w3, cr_w0, cr_w1, cr_w2, cr_w3, zero = {0}; v16i8 out_r0, out_g0, out_b0; for (col = num_cols_mul_16; col--;) { input_y = LD_UB(p_in_y); input_cb = LD_SB(p_in_cb); input_cr = LD_SB(p_in_cr); p_in_y += 16; p_in_cb += 16; p_in_cr += 16; input_cb -= const_128; input_cr -= const_128; UNPCK_UB_SH(input_y, y_h0, y_h1); UNPCK_SB_SH(input_cb, cb_h0, cb_h1); UNPCK_SB_SH(input_cr, cr_h0, cr_h1); CALC_G4_FRM_YUV(y_h0, y_h1, cb_h0, cb_h1, cr_h0, cr_h1, out_g0); UNPCK_SH_SW(cr_h0, cr_w0, cr_w1); UNPCK_SH_SW(cr_h1, cr_w2, cr_w3); CALC_R4_FRM_YUV(y_h0, y_h1, cr_w0, cr_w1, cr_w2, cr_w3, out_r0); UNPCK_SH_SW(cb_h0, cb_w0, cb_w1); UNPCK_SH_SW(cb_h1, cb_w2, cb_w3); CALC_B4_FRM_YUV(y_h0, y_h1, cb_w0, cb_w1, cb_w2, cb_w3, out_b0); ILVRL_B2_SB(out_g0, out_b0, out_rgb0, out_rgb1); VSHF_B2_UB(out_rgb0, out_r0, out_rgb0, out_r0, mask_rgb0, mask_rgb1, out0, tmp0); VSHF_B2_UB(out_rgb1, out_r0, out_rgb1, out_r0, mask_rgb1, mask_rgb2, tmp1, out2); out1 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) tmp1, 8); out1 = (v16u8) __msa_pckev_d((v2i64) out1, (v2i64) tmp0); ST_UB(out0, p_rgb); p_rgb += 16; ST_UB(out1, p_rgb); p_rgb += 16; ST_UB(out2, p_rgb); p_rgb += 16; } if (remaining_wd >= 8) { uint64_t in_y, in_cb, in_cr; v16i8 input_cbcr = {0}; in_y = LD(p_in_y); in_cb = LD(p_in_cb); in_cr = LD(p_in_cr); p_in_y += 8; p_in_cb += 8; p_in_cr += 8; input_y = (v16u8) __msa_insert_d((v2i64) input_y, 0, in_y); input_cbcr = (v16i8) __msa_insert_d((v2i64) input_cbcr, 0, in_cb); input_cbcr = (v16i8) __msa_insert_d((v2i64) input_cbcr, 1, in_cr); input_cbcr -= const_128; y_h0 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) input_y); UNPCK_SB_SH(input_cbcr, cb_h0, cr_h0); UNPCK_SH_SW(cb_h0, cb_w0, cb_w1); UNPCK_SH_SW(cr_h0, cr_w0, cr_w1); CALC_R2_FRM_YUV(y_h0, cr_w0, cr_w1, out_r0); CALC_G2_FRM_YUV(y_h0, cb_h0, cr_h0, out_g0); CALC_B2_FRM_YUV(y_h0, cb_w0, cb_w1, out_b0); out_rgb0 = (v16i8) __msa_ilvr_b((v16i8) out_g0, (v16i8) out_b0); VSHF_B2_UB(out_rgb0, out_r0, out_rgb0, out_r0, mask_rgb0, mask_rgb1, out0, out1); ST_UB(out0, p_rgb); p_rgb += 16; ST8x1_UB(out1, p_rgb); p_rgb += 8; remaining_wd -= 8; } for (col = 0; col < remaining_wd; col++) { y = (int) (p_in_y[col]); cb = (int) (p_in_cb[col]) - 128; cr = (int) (p_in_cr[col]) - 128; /* Range-limiting is essential due to noise introduced by DCT losses. */ p_rgb[0] = clip_pixel(y + ROUND_POWER_OF_TWO(FIX_1_77200 * cb, 16)); p_rgb[1] = clip_pixel(y + ROUND_POWER_OF_TWO(((-FIX_0_34414) * cb - FIX_0_71414 * cr), 16)); p_rgb[2] = clip_pixel(y + ROUND_POWER_OF_TWO(FIX_1_40200 * cr, 16)); p_rgb += 3; } }