/* Compile-only intrinsic smoke test for vset_lane_u32 (lane 1).
   Operands are deliberately left uninitialized: only instruction
   selection is being exercised, the result is never inspected.  */
void
test_vset_laneu32 (void)
{
  uint32x2_t out_uint32x2_t;
  uint32_t arg0_uint32_t;
  uint32x2_t arg1_uint32x2_t;

  out_uint32x2_t = vset_lane_u32 (arg0_uint32_t, arg1_uint32x2_t, 1);
}
/* Codegen test: inserting a scalar into lane 1 of a 2x32-bit vector
   must lower to a single INS instruction.  FileCheck directives below
   are load-bearing -- do not edit their text.  */
uint32x2_t test_vset_lane_u32(uint32_t v1, uint32x2_t v2) {
  // CHECK: test_vset_lane_u32
  return vset_lane_u32(v1, v2, 1);
  // CHECK: ins {{v[0-9]+}}.s[1], {{w[0-9]+}}
}
/* Assembly-level test: vset_lane_u32 into lane 1 should emit exactly
   one ins.s followed by ret.  FileCheck directives are load-bearing --
   do not edit their text.  */
uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) {
  // CHECK-LABEL: test_vset_lane_u32:
  // CHECK-NEXT: ins.s v0[1], w0
  // CHECK-NEXT: ret
  return vset_lane_u32(a, b, 1);
}
/* NOTE(review): truncated fragment -- this span begins mid-function-body
   (x255_16x8 and c1 are declared/defined outside the visible text) and the
   `# ifdef COLMUL` / `# ifdef COLSAME` / `# else` conditionals are never
   closed here.  Presumably part of a color-multiply blending routine --
   TODO: locate the enclosing function before modifying.  Left byte-identical. */
uint8x8_t val2_val4_8x8; x255_16x8 = vdupq_n_u16(0xff); # ifdef COLMUL uint16x4_t x255_16x4; x255_16x4 = vget_low_u16(x255_16x8); uint16x4_t c1_16x4; # ifdef COLSAME uint16x4_t c1_val3_16x4; uint16x8_t c1_16x8; uint16x8_t c1_val3_16x8; uint32x2_t c1_32x2; uint8x8_t c1_8x8; uint8x8_t c1_val3_8x8; c1_32x2 = vset_lane_u32(c1, c1_32x2, 0); c1_8x8 = vreinterpret_u8_u32(c1_32x2); c1_16x8 = vmovl_u8(c1_8x8); c1_16x4 = vget_low_u16(c1_16x8); # else uint16x4_t c2_16x4; uint16x4_t c2_local_16x4; uint16x4_t cv_16x4; uint16x8_t c1_c2_16x8; uint16x8_t c1_val1_16x8; uint16x8_t c2_val3_16x8; uint16x8_t cv_rv_16x8; uint32x2_t c1_c2_32x2; uint8x8_t c1_c2_8x8; uint8x8_t val3_8x8; uint16x8_t val3_16x8;
/* IR-level test: vset_lane_u32 must lower to a single insertelement at
   index 1 (via the canonical bitcast round-trip).  FileCheck directives
   are load-bearing -- do not edit their text.  */
// CHECK-LABEL: define <2 x i32> @test_vset_lane_u32(i32 %a, <2 x i32> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1
// CHECK: ret <2 x i32> [[VSET_LANE]]
uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) {
  return vset_lane_u32(a, b, 1);
}
/*
 * Bilinear interpolation of pixel pairs using NEON.
 *
 * For each output pixel, reads four source pixels p0..p3 and a per-pixel
 * horizontal weight *_ax++, interpolates horizontally per 8-bit channel
 * (p + (((q - p) * ax) >> 8)), then vertically with the fixed weight _ay,
 * and stores one interpolated DATA32 per iteration into result.
 * If all four source pixels are zero, p0 (i.e. 0) is stored directly.
 *
 * Returns 0 always.
 *
 * Fixes vs. original:
 *  - temp_16x4 was read by vcombine_u16() without ever being written
 *    (uninitialized read, UB); replaced with the initialized p0_16x4 --
 *    only lane 1 (the high half) of res_32x2 is stored, so the low half
 *    is dead either way.
 *  - p0_p2_32x2 / p1_p3_32x2 are now zero-initialized before the first
 *    vset_lane_u32, which otherwise reads indeterminate lanes.
 */
int
neon_new(DATA32 *_p0, DATA32 *_p1, DATA32 *_p2, DATA32 *_p3,
         DATA32 *_ax, DATA32 _ay, DATA32 *result, int len)
{
   int ay = _ay;
   int i;
   DATA32 *pbuf = result;
   uint16x4_t ay_16x4;
   uint16x4_t p0_16x4;
   uint16x4_t p2_16x4;
   uint16x8_t ax_16x8;
   uint16x8_t p0_p2_16x8;
   uint16x8_t p1_p3_16x8;
   uint16x8_t x255_16x8;
   /* Zero-init: vset_lane_u32 on an uninitialized vector reads
      indeterminate data in the untouched lanes.  */
   uint32x2_t p0_p2_32x2 = vdup_n_u32(0);
   uint32x2_t p1_p3_32x2 = vdup_n_u32(0);
   uint32x2_t res_32x2;
   uint8x8_t p0_p2_8x8;
   uint8x8_t p1_p3_8x8;
   uint8x8_t p2_8x8;

   ay_16x4 = vdup_n_u16(ay);
   x255_16x8 = vdupq_n_u16(0xff);

   for (i = 0; i < len; i++)
     {
        DATA32 p0 = *_p0++;
        DATA32 p1 = *_p1++;
        DATA32 p2 = *_p2++;
        DATA32 p3 = *_p3++;
        int ax = *_ax++;

        if (p0 | p1 | p2 | p3)
          {
             ax_16x8 = vdupq_n_u16(ax);

             /* Pack the four pixels into two 2x32-bit vectors:
                {p0, p2} and {p1, p3}.  */
             p0_p2_32x2 = vset_lane_u32(p0, p0_p2_32x2, 0);
             p0_p2_32x2 = vset_lane_u32(p2, p0_p2_32x2, 1);
             p1_p3_32x2 = vset_lane_u32(p1, p1_p3_32x2, 0);
             p1_p3_32x2 = vset_lane_u32(p3, p1_p3_32x2, 1);
             p0_p2_8x8 = vreinterpret_u8_u32(p0_p2_32x2);
             p1_p3_8x8 = vreinterpret_u8_u32(p1_p3_32x2);

             /* Horizontal pass, per 8-bit channel widened to 16 bits:
                r = (p0 + (((p1 - p0) * ax) >> 8)) & 0xff, and likewise
                for p2/p3 in the high half.  */
             p1_p3_16x8 = vmovl_u8(p1_p3_8x8);
             p0_p2_16x8 = vmovl_u8(p0_p2_8x8);
             p1_p3_16x8 = vsubq_u16(p1_p3_16x8, p0_p2_16x8);
             p1_p3_16x8 = vmulq_u16(p1_p3_16x8, ax_16x8);
             p1_p3_16x8 = vshrq_n_u16(p1_p3_16x8, 8);
             p1_p3_16x8 = vaddq_u16(p1_p3_16x8, p0_p2_16x8);
             p1_p3_16x8 = vandq_u16(p1_p3_16x8, x255_16x8);

             /* Vertical pass between the two horizontal results.  */
             p0_16x4 = vget_low_u16(p1_p3_16x8);
             p2_16x4 = vget_high_u16(p1_p3_16x8);
             p2_16x4 = vsub_u16(p2_16x4, p0_16x4);
             p2_16x4 = vmul_u16(p2_16x4, ay_16x4);
             p2_16x4 = vshr_n_u16(p2_16x4, 8);
             p2_16x4 = vadd_u16(p2_16x4, p0_16x4);

             /* BUGFIX: was vcombine_u16(temp_16x4, p2_16x4) with
                temp_16x4 never written.  Only lane 1 (bytes 4..7, the
                narrowed p2_16x4) is stored below, so use the already
                initialized p0_16x4 for the dead low half.  */
             p1_p3_16x8 = vcombine_u16(p0_16x4, p2_16x4);
             p2_8x8 = vmovn_u16(p1_p3_16x8);
             res_32x2 = vreinterpret_u32_u8(p2_8x8);
             vst1_lane_u32(pbuf++, res_32x2, 1);
          }
        else
          *pbuf++ = p0;
     }

   return 0;
}