示例#1
0
文件: yuv_neon.c 项目: 1vanK/Urho3D
static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
                                     uint16_t* dst, int len) {
  int i;
  const int16x8_t zero = vdupq_n_s16(0);
  const int16x8_t max = vdupq_n_s16(MAX_Y);
  uint64x2_t sum = vdupq_n_u64(0);
  uint64_t diff;

  for (i = 0; i + 8 <= len; i += 8) {
    const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
    const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
    const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
    const int16x8_t D = vsubq_s16(A, B);       // diff_y
    const int16x8_t F = vaddq_s16(C, D);       // new_y
    const uint16x8_t H =
        vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
    const int16x8_t I = vabsq_s16(D);          // abs(diff_y)
    vst1q_u16(dst + i, H);
    sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
  }
  diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
  for (; i < len; ++i) {
    const int diff_y = ref[i] - src[i];
    const int new_y = (int)(dst[i]) + diff_y;
    dst[i] = clip_y(new_y);
    diff += (uint64_t)(abs(diff_y));
  }
  return diff;
}
示例#2
0
inline int v_signmask(const v_uint32x4& a)
{
    int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
    uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(v0);
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
}
示例#3
0
inline int v_signmask(const v_uint16x8& a)
{
    int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
    uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
}
示例#4
0
inline int v_signmask(const v_uint8x16& a)
{
    int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
    uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
}
示例#5
0
int norx_aead_decrypt(
  unsigned char *m, size_t *mlen,
  const unsigned char *a, size_t alen,
  const unsigned char *c, size_t clen,
  const unsigned char *z, size_t zlen,
  const unsigned char *nonce,
  const unsigned char *key
)
{
    uint64x2_t S[8];
    uint32x4_t T[2];

    if (clen < BYTES(NORX_T)) { return -1; }

    *mlen = clen - BYTES(NORX_T);

    INITIALISE(S, nonce, key);
    ABSORB_DATA(S, a, alen, HEADER_TAG);
    DECRYPT_DATA(S, m, c, clen - BYTES(NORX_T));
    ABSORB_DATA(S, z, zlen, TRAILER_TAG);
    FINALISE(S);

    /* Verify tag */
    T[0] = vceqq_u32(U64TOU32(S[0]), U8TOU32( vld1q_u8((uint8_t *)(c + clen - BYTES(NORX_T)  )) ));
    T[1] = vceqq_u32(U64TOU32(S[1]), U8TOU32( vld1q_u8((uint8_t *)(c + clen - BYTES(NORX_T)/2)) ));
    T[0] = vandq_u32(T[0], T[1]);
    return 0xFFFFFFFFFFFFFFFFULL == (vgetq_lane_u64(U32TOU64(T[0]), 0) & vgetq_lane_u64(U32TOU64(T[0]), 1)) ? 0 : -1;
}
示例#6
0
uint64_t test_vgetq_lane_u64(uint64x2_t v1) {
  // CHECK: test_vgetq_lane_u64
  return vgetq_lane_u64(v1, 1);
  // CHECK: umov {{x[0-9]+}}, {{v[0-9]+}}.d[1]
}
示例#7
0
uint64_t test_vgetq_lane_u64(uint64x2_t a) {
  // CHECK-LABEL: test_vgetq_lane_u64:
  // CHECK-NEXT:  mov.d  x0, v0[1]
  // CHECK-NEXT:  ret
  return vgetq_lane_u64(a, 1);
}
示例#8
0
bool CPU_ProbeNEON()
{
#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
	return false;
#elif (CRYPTOPP_ARM_NEON_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
	volatile bool result = true;
	__try
	{
		uint32_t v1[4] = {1,1,1,1};
		uint32x4_t x1 = vld1q_u32(v1);
		uint64_t v2[2] = {1,1};
		uint64x2_t x2 = vld1q_u64(v2);

		uint32x4_t x3 = vdupq_n_u32(2);
		x3 = vsetq_lane_u32(vgetq_lane_u32(x1,0),x3,0);
		x3 = vsetq_lane_u32(vgetq_lane_u32(x1,3),x3,3);
		uint64x2_t x4 = vdupq_n_u64(2);
		x4 = vsetq_lane_u64(vgetq_lane_u64(x2,0),x4,0);
		x4 = vsetq_lane_u64(vgetq_lane_u64(x2,1),x4,1);

		result = !!(vgetq_lane_u32(x3,0) | vgetq_lane_u64(x4,1));
	}
	__except (EXCEPTION_EXECUTE_HANDLER)
	{
		return false;
	}
	return result;
# else

	// longjmp and clobber warnings. Volatile is required.
	// http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
	volatile bool result = true;

	volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
	if (oldHandler == SIG_ERR)
		return false;

	volatile sigset_t oldMask;
	if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
		return false;

	if (setjmp(s_jmpSIGILL))
		result = false;
	else
	{
		uint32_t v1[4] = {1,1,1,1};
		uint32x4_t x1 = vld1q_u32(v1);
		uint64_t v2[2] = {1,1};
		uint64x2_t x2 = vld1q_u64(v2);

		uint32x4_t x3 = {0,0,0,0};
		x3 = vsetq_lane_u32(vgetq_lane_u32(x1,0),x3,0);
		x3 = vsetq_lane_u32(vgetq_lane_u32(x1,3),x3,3);
		uint64x2_t x4 = {0,0};
		x4 = vsetq_lane_u64(vgetq_lane_u64(x2,0),x4,0);
		x4 = vsetq_lane_u64(vgetq_lane_u64(x2,1),x4,1);

		// Hack... GCC optimizes away the code and returns true
		result = !!(vgetq_lane_u32(x3,0) | vgetq_lane_u64(x4,1));
	}

	sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
	signal(SIGILL, oldHandler);
	return result;
# endif
#else
	return false;
#endif  // CRYPTOPP_ARM_NEON_AVAILABLE
}
示例#9
0
// CHECK-LABEL: define i64 @test_vgetq_lane_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
// CHECK:   ret i64 [[VGETQ_LANE]]
uint64_t test_vgetq_lane_u64(uint64x2_t a) {
  return vgetq_lane_u64(a, 1);
}