// Updates dst in place with the clamped luma correction (ref - src) and
// returns the accumulated absolute difference.
static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
                                     uint16_t* dst, int len) {
  int i;
  const int16x8_t zero = vdupq_n_s16(0);
  const int16x8_t max = vdupq_n_s16(MAX_Y);
  uint64x2_t sum = vdupq_n_u64(0);
  uint64_t diff;

  for (i = 0; i + 8 <= len; i += 8) {
    const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
    const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
    const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
    const int16x8_t D = vsubq_s16(A, B);       // diff_y
    const int16x8_t F = vaddq_s16(C, D);       // new_y
    const uint16x8_t H =
        vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
    const int16x8_t I = vabsq_s16(D);          // abs(diff_y)
    vst1q_u16(dst + i, H);
    sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
  }
  diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
  // Scalar tail for the remaining (len % 8) samples.
  for (; i < len; ++i) {
    const int diff_y = ref[i] - src[i];
    const int new_y = (int)(dst[i]) + diff_y;
    dst[i] = clip_y(new_y);
    diff += (uint64_t)(abs(diff_y));
  }
  return diff;
}
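/*
 * For reference, a minimal scalar sketch of the same update. It assumes the
 * surrounding libwebp-style helpers clip_y() and MAX_Y plus <stdlib.h> for
 * abs(); the function name is hypothetical. The NEON loop above is an
 * 8-lane version of exactly this.
 */
static uint64_t SharpYUVUpdateY_C_sketch(const uint16_t* ref,
                                         const uint16_t* src,
                                         uint16_t* dst, int len) {
  uint64_t diff = 0;
  int i;
  for (i = 0; i < len; ++i) {
    const int diff_y = ref[i] - src[i];      // correction term
    const int new_y = (int)dst[i] + diff_y;  // apply to current value
    dst[i] = clip_y(new_y);                  // clamp to [0, MAX_Y]
    diff += (uint64_t)abs(diff_y);           // accumulate |diff_y|
  }
  return diff;
}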
// Each overload moves every lane's sign bit down to bit 0, shifts it left by
// the lane index, then folds with pairwise add-long until each 64-bit half
// holds the mask bits for its lanes.
inline int v_signmask(const v_uint32x4& a)
{
    int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
    uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(v0);
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
}

inline int v_signmask(const v_uint16x8& a)
{
    int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
    uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
}

inline int v_signmask(const v_uint8x16& a)
{
    int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
    uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
}
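/*
 * A standalone sketch of the 8-bit case without the OpenCV v_uint8x16
 * wrapper, to make the folding explicit (the function name is illustrative).
 */
#include <arm_neon.h>

static int signmask_u8x16_sketch(uint8x16_t a)
{
    // Per-lane shift amounts 0..7, duplicated for both 8-lane halves.
    const int8x8_t sh = vcreate_s8(0x0706050403020100ULL);
    // Move each byte's MSB to bit 0, then up to bit <lane index>.
    const uint8x16_t bits = vshlq_u8(vshrq_n_u8(a, 7), vcombine_s8(sh, sh));
    // Fold 16x8 -> 8x16 -> 4x32 -> 2x64; each half now holds 8 mask bits.
    const uint64x2_t folded = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bits)));
    return (int)vgetq_lane_u64(folded, 0) +
           ((int)vgetq_lane_u64(folded, 1) << 8);
}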
int norx_aead_decrypt(
    unsigned char *m, size_t *mlen,
    const unsigned char *a, size_t alen,
    const unsigned char *c, size_t clen,
    const unsigned char *z, size_t zlen,
    const unsigned char *nonce, const unsigned char *key)
{
    uint64x2_t S[8];
    uint32x4_t T[2];

    if (clen < BYTES(NORX_T)) {
        return -1;  /* ciphertext shorter than the tag */
    }

    *mlen = clen - BYTES(NORX_T);

    INITIALISE(S, nonce, key);
    ABSORB_DATA(S, a, alen, HEADER_TAG);
    DECRYPT_DATA(S, m, c, clen - BYTES(NORX_T));
    ABSORB_DATA(S, z, zlen, TRAILER_TAG);
    FINALISE(S);

    /* Verify tag */
    T[0] = vceqq_u32(U64TOU32(S[0]),
                     U8TOU32(vld1q_u8((uint8_t *)(c + clen - BYTES(NORX_T)))));
    T[1] = vceqq_u32(U64TOU32(S[1]),
                     U8TOU32(vld1q_u8((uint8_t *)(c + clen - BYTES(NORX_T)/2))));
    T[0] = vandq_u32(T[0], T[1]);
    /* All lanes must compare equal for the tag to be accepted. */
    return 0xFFFFFFFFFFFFFFFFULL == (vgetq_lane_u64(U32TOU64(T[0]), 0) &
                                     vgetq_lane_u64(U32TOU64(T[0]), 1)) ? 0 : -1;
}
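/*
 * A hedged usage sketch: any nonzero return means the tag did not verify, so
 * the plaintext already written to m must be discarded. The wrapper name and
 * the no-header/no-trailer call are illustrative; <string.h> provides memset.
 */
static int decrypt_or_reject(unsigned char *m, size_t *mlen,
                             const unsigned char *c, size_t clen,
                             const unsigned char *nonce,
                             const unsigned char *key)
{
    size_t n = 0;
    if (norx_aead_decrypt(m, &n, NULL, 0, c, clen, NULL, 0, nonce, key) != 0) {
        memset(m, 0, n);  /* wipe any partial plaintext */
        *mlen = 0;
        return -1;
    }
    *mlen = n;
    return 0;
}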
uint64_t test_vgetq_lane_u64(uint64x2_t v1) {
  // CHECK: test_vgetq_lane_u64
  return vgetq_lane_u64(v1, 1);
  // CHECK: umov {{x[0-9]+}}, {{v[0-9]+}}.d[1]
}
uint64_t test_vgetq_lane_u64(uint64x2_t a) {
  // CHECK-LABEL: test_vgetq_lane_u64:
  // CHECK-NEXT:  mov.d x0, v0[1]
  // CHECK-NEXT:  ret
  return vgetq_lane_u64(a, 1);
}
bool CPU_ProbeNEON()
{
#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
#elif (CRYPTOPP_ARM_NEON_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
    volatile bool result = true;
    __try
    {
        uint32_t v1[4] = {1,1,1,1};
        uint32x4_t x1 = vld1q_u32(v1);
        uint64_t v2[2] = {1,1};
        uint64x2_t x2 = vld1q_u64(v2);

        uint32x4_t x3 = vdupq_n_u32(2);
        x3 = vsetq_lane_u32(vgetq_lane_u32(x1,0),x3,0);
        x3 = vsetq_lane_u32(vgetq_lane_u32(x1,3),x3,3);
        uint64x2_t x4 = vdupq_n_u64(2);
        x4 = vsetq_lane_u64(vgetq_lane_u64(x2,0),x4,0);
        x4 = vsetq_lane_u64(vgetq_lane_u64(x2,1),x4,1);

        result = !!(vgetq_lane_u32(x3,0) | vgetq_lane_u64(x4,1));
    }
    __except (EXCEPTION_EXECUTE_HANDLER)
    {
        return false;
    }
    return result;
# else
    // longjmp and clobber warnings. Volatile is required.
    // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
    volatile bool result = true;

    volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;

    volatile sigset_t oldMask;
    if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
        return false;

    if (setjmp(s_jmpSIGILL))
        result = false;
    else
    {
        uint32_t v1[4] = {1,1,1,1};
        uint32x4_t x1 = vld1q_u32(v1);
        uint64_t v2[2] = {1,1};
        uint64x2_t x2 = vld1q_u64(v2);

        uint32x4_t x3 = {0,0,0,0};
        x3 = vsetq_lane_u32(vgetq_lane_u32(x1,0),x3,0);
        x3 = vsetq_lane_u32(vgetq_lane_u32(x1,3),x3,3);
        uint64x2_t x4 = {0,0};
        x4 = vsetq_lane_u64(vgetq_lane_u64(x2,0),x4,0);
        x4 = vsetq_lane_u64(vgetq_lane_u64(x2,1),x4,1);

        // Hack... GCC optimizes away the code and returns true
        result = !!(vgetq_lane_u32(x3,0) | vgetq_lane_u64(x4,1));
    }

    sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
    signal(SIGILL, oldHandler);
    return result;
# endif
#else
    return false;
#endif  // CRYPTOPP_ARM_NEON_AVAILABLE
}
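/*
 * A hedged sketch of how such a probe is typically consumed: run it once and
 * cache the answer, since the SIGILL round trip is comparatively expensive.
 * The names below are illustrative, not Crypto++'s own API.
 */
static bool s_probed = false;
static bool s_hasNEON = false;

bool HasNEON_sketch()
{
    if (!s_probed)
    {
        s_hasNEON = CPU_ProbeNEON();
        s_probed = true;
    }
    return s_hasNEON;
}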
// CHECK-LABEL: define i64 @test_vgetq_lane_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
// CHECK:   ret i64 [[VGETQ_LANE]]
uint64_t test_vgetq_lane_u64(uint64x2_t a) {
  return vgetq_lane_u64(a, 1);
}
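/*
 * Minimal runnable demonstration of the intrinsic all of these tests
 * exercise: vgetq_lane_u64 extracts a single 64-bit lane from a uint64x2_t
 * (lane 0 is the low half, lane 1 the high half). Requires an ARM target
 * with NEON.
 */
#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
    const uint64_t in[2] = { 0x1111111111111111ULL, 0x2222222222222222ULL };
    const uint64x2_t v = vld1q_u64(in);
    printf("lane 0: %016llx\n", (unsigned long long)vgetq_lane_u64(v, 0));
    printf("lane 1: %016llx\n", (unsigned long long)vgetq_lane_u64(v, 1));
    return 0;
}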