Beispiel #1
0
/* Calls the vset_lane_u32 intrinsic once, inserting a 32-bit scalar into
 * lane 1 of a two-lane uint32 vector, and discards the result.
 * NOTE(review): this looks like a compile-only compiler test; the function
 * takes no inputs and produces no observable output. */
void test_vset_laneu32 (void)
{
  uint32x2_t out_uint32x2_t;
  uint32_t arg0_uint32_t;
  uint32x2_t arg1_uint32x2_t;

  /* Both operands are left uninitialized; presumably only successful
   * compilation / instruction selection matters here -- confirm before
   * reusing this as runtime code. */
  out_uint32x2_t = vset_lane_u32 (arg0_uint32_t, arg1_uint32x2_t, 1);
}
Beispiel #2
0
/* Returns v2 with lane 1 replaced by the scalar v1.
 * The comments inside the body are FileCheck directives: they expect the
 * function name to appear in the output, followed by an `ins` instruction
 * that writes a w-register into element 1 of a vector's .s view. */
uint32x2_t test_vset_lane_u32(uint32_t v1, uint32x2_t v2) {
   // CHECK: test_vset_lane_u32
  return vset_lane_u32(v1, v2, 1);
  // CHECK: ins {{v[0-9]+}}.s[1], {{w[0-9]+}}
}
Beispiel #3
0
/* Returns b with lane 1 replaced by the scalar a.
 * The comments inside the body are FileCheck directives: directly after the
 * function label they expect exactly `ins.s v0[1], w0` and then `ret`. */
uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) {
  // CHECK-LABEL: test_vset_lane_u32:
  // CHECK-NEXT:  ins.s v0[1], w0
  // CHECK-NEXT:  ret
  return vset_lane_u32(a, b, 1);
}
Beispiel #4
0
   uint8x8_t val2_val4_8x8;

   x255_16x8 = vdupq_n_u16(0xff);
#   ifdef COLMUL
   uint16x4_t x255_16x4;
   x255_16x4 = vget_low_u16(x255_16x8);
   uint16x4_t c1_16x4;
#    ifdef COLSAME
   uint16x4_t c1_val3_16x4;
   uint16x8_t c1_16x8;
   uint16x8_t c1_val3_16x8;
   uint32x2_t c1_32x2;
   uint8x8_t c1_8x8;
   uint8x8_t c1_val3_8x8;

   c1_32x2 = vset_lane_u32(c1, c1_32x2, 0);
   c1_8x8 = vreinterpret_u8_u32(c1_32x2);
   c1_16x8 = vmovl_u8(c1_8x8);
   c1_16x4 = vget_low_u16(c1_16x8);
#    else
   uint16x4_t c2_16x4;
   uint16x4_t c2_local_16x4;
   uint16x4_t cv_16x4;
   uint16x8_t c1_c2_16x8;
   uint16x8_t c1_val1_16x8;
   uint16x8_t c2_val3_16x8;
   uint16x8_t cv_rv_16x8;
   uint32x2_t c1_c2_32x2;
   uint8x8_t c1_c2_8x8;
   uint8x8_t val3_8x8;
   uint16x8_t val3_16x8;
Beispiel #5
0
// Returns b with lane 1 replaced by the scalar a.
// The directives below describe the expected LLVM IR: %b is bitcast to
// <8 x i8> and back to <2 x i32>, then %a is inserted at index 1 with
// insertelement and the resulting vector is returned.
// CHECK-LABEL: define <2 x i32> @test_vset_lane_u32(i32 %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1
// CHECK:   ret <2 x i32> [[VSET_LANE]]
uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) {
  return vset_lane_u32(a, b, 1);
}
Beispiel #6
0
/*
 * Two-stage per-byte linear interpolation of four input streams using NEON.
 *
 * For every index i in [0, len):
 *   - p0, p1, p2, p3 are read from _p0[i], _p1[i], _p2[i], _p3[i] and the
 *     first-stage weight ax from _ax[i].
 *   - If all four inputs are zero, p0 (i.e. zero) is written to result[i].
 *   - Otherwise each 32-bit value is treated as four bytes and, per byte:
 *       top    = (p0 + (((p1 - p0) * ax) >> 8)) & 0xff
 *       bottom = (p2 + (((p3 - p2) * ax) >> 8)) & 0xff
 *       out    = low 8 bits of (top + (((bottom - top) * ay) >> 8))
 *     with all arithmetic done in wrapping unsigned 16-bit lanes. The four
 *     output bytes are stored as one 32-bit value in result[i].
 *
 * Parameters:
 *   _p0.._p3  input arrays, each with at least len elements
 *   _ax       per-element first-stage weights, at least len elements
 *   _ay       second-stage weight shared by all elements
 *   result    output array, at least len elements
 *   len       number of elements to process; nothing is written if len <= 0
 *
 * ax and ay are broadcast into 16-bit lanes, so only their low 16 bits are
 * used. NOTE(review): weights are presumably expected in 0..256 -- confirm
 * against the caller.
 *
 * Returns 0 unconditionally.
 */
int neon_new(DATA32* _p0, DATA32* _p1, DATA32* _p2, DATA32* _p3, DATA32* _ax, DATA32 _ay, DATA32* result, int len) {
  int ay = _ay;
  int i;
  DATA32 *pbuf = result;

  /* Loop-invariant vectors. */
  const uint16x4_t ay_16x4 = vdup_n_u16(ay);
  const uint16x8_t x255_16x8 = vdupq_n_u16(0xff);

  /* Placeholder for the low half of the vector that is narrowed below; only
   * the high half is stored, but the value must still be defined rather than
   * an uninitialized read. */
  const uint16x4_t temp_16x4 = vdup_n_u16(0);

  /* vset_lane_u32 reads its vector operand, so these need a defined initial
   * value even though both lanes are overwritten before use. */
  uint32x2_t p0_p2_32x2 = vdup_n_u32(0);
  uint32x2_t p1_p3_32x2 = vdup_n_u32(0);

  for (i = 0; i < len; i++) {
    DATA32 p0 = *_p0++;
    DATA32 p1 = *_p1++;
    DATA32 p2 = *_p2++;
    DATA32 p3 = *_p3++;
    int ax = *_ax++;

    if (p0 | p1 | p2 | p3) {
      uint16x8_t ax_16x8 = vdupq_n_u16(ax);
      uint16x8_t p0_p2_16x8;
      uint16x8_t p1_p3_16x8;
      uint16x4_t p0_16x4;
      uint16x4_t p2_16x4;
      uint8x8_t p2_8x8;
      uint32x2_t res_32x2;

      /* Pack the inputs as {p0, p2} and {p1, p3} so that one 8-lane
       * operation interpolates both pairs at once. */
      p0_p2_32x2 = vset_lane_u32(p0, p0_p2_32x2, 0);
      p0_p2_32x2 = vset_lane_u32(p2, p0_p2_32x2, 1);
      p1_p3_32x2 = vset_lane_u32(p1, p1_p3_32x2, 0);
      p1_p3_32x2 = vset_lane_u32(p3, p1_p3_32x2, 1);

      /* Widen every byte to 16 bits. */
      p0_p2_16x8 = vmovl_u8(vreinterpret_u8_u32(p0_p2_32x2));
      p1_p3_16x8 = vmovl_u8(vreinterpret_u8_u32(p1_p3_32x2));

      /* First stage: a + (((b - a) * ax) >> 8), masked to 8 bits. */
      p1_p3_16x8 = vsubq_u16(p1_p3_16x8, p0_p2_16x8);
      p1_p3_16x8 = vmulq_u16(p1_p3_16x8, ax_16x8);
      p1_p3_16x8 = vshrq_n_u16(p1_p3_16x8, 8);
      p1_p3_16x8 = vaddq_u16(p1_p3_16x8, p0_p2_16x8);
      p1_p3_16x8 = vandq_u16(p1_p3_16x8, x255_16x8);

      /* Low half holds the p0/p1 result, high half the p2/p3 result. */
      p0_16x4 = vget_low_u16(p1_p3_16x8);
      p2_16x4 = vget_high_u16(p1_p3_16x8);

      /* Second stage: blend the two first-stage results with ay. */
      p2_16x4 = vsub_u16(p2_16x4, p0_16x4);
      p2_16x4 = vmul_u16(p2_16x4, ay_16x4);
      p2_16x4 = vshr_n_u16(p2_16x4, 8);
      p2_16x4 = vadd_u16(p2_16x4, p0_16x4);

      /* Narrow back to bytes; the final value sits in the high half, which
       * becomes 32-bit lane 1 after the reinterpret. */
      p1_p3_16x8 = vcombine_u16(temp_16x4, p2_16x4);
      p2_8x8 = vmovn_u16(p1_p3_16x8);
      res_32x2 = vreinterpret_u32_u8(p2_8x8);
      vst1_lane_u32(pbuf++, res_32x2, 1);
    } else {
      /* All four inputs are zero here, so this stores zero. */
      *pbuf++ = p0;
    }
  }
  return 0;
}