/*
 * Exercises the vmlal_n_u16 intrinsic with one operand of each type it takes:
 * a 32-bit x4 accumulator, a 16-bit x4 vector and a 16-bit scalar.
 *
 * The operands are never assigned before the call and the result is not
 * used afterwards.
 * NOTE(review): this looks like a compile-only check of the intrinsic's
 * signature and is presumably never executed -- confirm before adding
 * initializers or consuming the result.
 */
void test_vmlal_nu16 (void)
{
  uint32x4_t result;
  uint32x4_t accumulator;
  uint16x4_t multiplicand;
  uint16_t scalar;

  result = vmlal_n_u16 (accumulator, multiplicand, scalar);
}
static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R, const uint8x8_t G, const uint8x8_t B) { const uint16x8_t r = vmovl_u8(R); const uint16x8_t g = vmovl_u8(G); const uint16x8_t b = vmovl_u8(B); const uint16x4_t r_lo = vget_low_u16(r); const uint16x4_t r_hi = vget_high_u16(r); const uint16x4_t g_lo = vget_low_u16(g); const uint16x4_t g_hi = vget_high_u16(g); const uint16x4_t b_lo = vget_low_u16(b); const uint16x4_t b_hi = vget_high_u16(b); const uint32x4_t tmp0_lo = vmull_n_u16( r_lo, 16839u); const uint32x4_t tmp0_hi = vmull_n_u16( r_hi, 16839u); const uint32x4_t tmp1_lo = vmlal_n_u16(tmp0_lo, g_lo, 33059u); const uint32x4_t tmp1_hi = vmlal_n_u16(tmp0_hi, g_hi, 33059u); const uint32x4_t tmp2_lo = vmlal_n_u16(tmp1_lo, b_lo, 6420u); const uint32x4_t tmp2_hi = vmlal_n_u16(tmp1_hi, b_hi, 6420u); const uint16x8_t Y1 = vcombine_u16(vrshrn_n_u32(tmp2_lo, 16), vrshrn_n_u32(tmp2_hi, 16)); const uint16x8_t Y2 = vaddq_u16(Y1, vdupq_n_u16(16)); return vqmovn_u16(Y2); }