static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *output_ptr, unsigned int src_pixels_per_line, int pixel_step, unsigned int output_height, unsigned int output_width, const uint8_t *filter) { const uint8x8_t f0 = vmov_n_u8(filter[0]); const uint8x8_t f1 = vmov_n_u8(filter[1]); unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; j += 16) { const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]); const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]); const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0); const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1); const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS); const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi)); } // Next row... src_ptr += src_pixels_per_line; output_ptr += output_width; } }
int norx_aead_decrypt( unsigned char *m, size_t *mlen, const unsigned char *a, size_t alen, const unsigned char *c, size_t clen, const unsigned char *z, size_t zlen, const unsigned char *nonce, const unsigned char *key ) { uint64x2_t S[8]; uint32x4_t T[2]; if (clen < BYTES(NORX_T)) { return -1; } *mlen = clen - BYTES(NORX_T); INITIALISE(S, nonce, key); ABSORB_DATA(S, a, alen, HEADER_TAG); DECRYPT_DATA(S, m, c, clen - BYTES(NORX_T)); ABSORB_DATA(S, z, zlen, TRAILER_TAG); FINALISE(S); /* Verify tag */ T[0] = vceqq_u32(U64TOU32(S[0]), U8TOU32( vld1q_u8((uint8_t *)(c + clen - BYTES(NORX_T) )) )); T[1] = vceqq_u32(U64TOU32(S[1]), U8TOU32( vld1q_u8((uint8_t *)(c + clen - BYTES(NORX_T)/2)) )); T[0] = vandq_u32(T[0], T[1]); return 0xFFFFFFFFFFFFFFFFULL == (vgetq_lane_u64(U32TOU64(T[0]), 0) & vgetq_lane_u64(U32TOU64(T[0]), 1)) ? 0 : -1; }
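U8TOU32, U64TOU32, U32TOU64 and BYTES are macros from the surrounding NORX code and are not shown above; a minimal sketch, assuming they are plain reinterpret casts plus a bits-to-bytes conversion for the tag size:

/* Assumed helper macros: lane-preserving reinterprets between NEON element
 * types, and the tag size converted from bits to bytes. */
#define U8TOU32(x)  vreinterpretq_u32_u8(x)
#define U64TOU32(x) vreinterpretq_u32_u64(x)
#define U32TOU64(x) vreinterpretq_u64_u32(x)
#define BYTES(x)    (((x) + 7) / 8)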
/* Note: size is given in bytes. */
static inline int compute_ham_similarity_64(unsigned short* ref, unsigned short* circ_array, int size)
{
  const uint8_t* ref_c = (uint8_t*)ref;
  const uint8_t* circ_c = (uint8_t*)circ_array;
  uint8x16_t a, b, c;
  uint16x8_t acc = vdupq_n_u16(0);        /* accumulator must start at zero */
  unsigned int i, count;
  int shift = size & 0xF;                 /* bytes left after the 16-byte blocks */

  for (i = 0; i + 16 <= (unsigned int)size; i += 16) {
    a = vld1q_u8(&ref_c[i]);
    b = vld1q_u8(&circ_c[i]);
    c = veorq_u8(a, b);
    acc = vaddq_u16(acc, vpaddlq_u8(vcntq_u8(c)));   /* per-byte popcount, widened and summed */
  }
  count = setbits(acc);

  if (shift) {
    /* Tail: popcount the XOR of the final partial block.  This still loads a
       full 16 bytes, so the buffers must have 16 readable bytes here.  The
       lane index of vgetq_lane_u8 must be a compile-time constant, so spill
       the per-byte counts to memory and sum them there. */
    uint8_t tail[16];
    a = vld1q_u8(&ref_c[i]);
    b = vld1q_u8(&circ_c[i]);
    vst1q_u8(tail, vcntq_u8(veorq_u8(a, b)));
    for (i = 0; i < (unsigned int)shift; i++) count += tail[i];
  }
  return size * 8 - count;
}
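setbits() is not defined in this snippet; a minimal sketch, assuming it horizontally sums the eight 16-bit lanes of the popcount accumulator:

/* Assumed helper: horizontal sum of a uint16x8_t accumulator. */
static inline unsigned int setbits(uint16x8_t acc)
{
  const uint32x4_t a = vpaddlq_u16(acc);   /* pairwise widen: 8x16 -> 4x32 */
  const uint64x2_t b = vpaddlq_u32(a);     /* pairwise widen: 4x32 -> 2x64 */
  return (unsigned int)(vgetq_lane_u64(b, 0) + vgetq_lane_u64(b, 1));
}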
static INLINE void idct32x32_1_add_neg_kernel(uint8_t **dest, const int stride, const uint8x16_t res) { const uint8x16_t a0 = vld1q_u8(*dest); const uint8x16_t a1 = vld1q_u8(*dest + 16); const uint8x16_t b0 = vqsubq_u8(a0, res); const uint8x16_t b1 = vqsubq_u8(a1, res); vst1q_u8(*dest, b0); vst1q_u8(*dest + 16, b1); *dest += stride; }
void aes_ofb(const unsigned char* in, unsigned char *out, int length, const char *expkey, const char* iv)
{
  uint8x16_t block, cipher;

  /* OFB mode: the keystream is produced by repeatedly encrypting the IV and
     is XORed with the input.  vld1q_u8/vst1q_u8 take uint8_t pointers, so the
     casts must be to (const) uint8_t *, not int8_t *. */
  block = vld1q_u8((const uint8_t *)iv);
  for (int i = 0; i < length; i += 16) {
    block = aes_enc(block, (const uint8x16_t *)expkey);
    cipher = veorq_u8(vld1q_u8(&in[i]), block);
    vst1q_u8(&out[i], cipher);
  }
}
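aes_enc and the layout of expkey are not shown above; a minimal sketch, assuming an AES-128 schedule of 11 consecutive round keys and the ARMv8-A Crypto Extensions (vaeseq_u8/vaesmcq_u8):

/* Hedged sketch of aes_enc: assumes expkey points at 11 consecutive 16-byte
 * round keys from an AES-128 key expansion (not shown). */
static uint8x16_t aes_enc(uint8x16_t block, const uint8x16_t *rk)
{
  int r;
  for (r = 0; r < 9; r++)
    block = vaesmcq_u8(vaeseq_u8(block, rk[r]));  /* AddRoundKey + SubBytes + ShiftRows, then MixColumns */
  block = vaeseq_u8(block, rk[9]);                /* final round omits MixColumns */
  return veorq_u8(block, rk[10]);                 /* last AddRoundKey */
}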
void ar_vmin_u8_neon(uint8_t* res, const uint8_t* a, const uint8_t* b, uint32_t n) { uint8x16_t a_loaded; uint8x16_t b_loaded; uint8x16_t res_loaded; for (uint32_t i = 0; i < n; i += 16) { a_loaded = vld1q_u8(&(a[i])); b_loaded = vld1q_u8(&(b[i])); res_loaded = vminq_u8(a_loaded, b_loaded); vst1q_u8(&(res[i]),res_loaded); } }
// 'do_above' and 'do_left' facilitate branch removal when inlined. static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int do_above, int do_left) { uint16x8_t sum_top; uint16x8_t sum_left; uint8x8_t dc0; if (do_above) { const uint8x16_t A0 = vld1q_u8(above); // top row const uint8x16_t A1 = vld1q_u8(above + 16); const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top const uint16x8_t p1 = vpaddlq_u8(A1); const uint16x8_t p2 = vaddq_u16(p0, p1); const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); const uint16x4_t p4 = vpadd_u16(p3, p3); const uint16x4_t p5 = vpadd_u16(p4, p4); sum_top = vcombine_u16(p5, p5); } if (do_left) { const uint8x16_t L0 = vld1q_u8(left); // left row const uint8x16_t L1 = vld1q_u8(left + 16); const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left const uint16x8_t p1 = vpaddlq_u8(L1); const uint16x8_t p2 = vaddq_u16(p0, p1); const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); const uint16x4_t p4 = vpadd_u16(p3, p3); const uint16x4_t p5 = vpadd_u16(p4, p4); sum_left = vcombine_u16(p5, p5); } if (do_above && do_left) { const uint16x8_t sum = vaddq_u16(sum_left, sum_top); dc0 = vrshrn_n_u16(sum, 6); } else if (do_above) { dc0 = vrshrn_n_u16(sum_top, 5); } else if (do_left) { dc0 = vrshrn_n_u16(sum_left, 5); } else { dc0 = vdup_n_u8(0x80); } { const uint8x16_t dc = vdupq_lane_u8(dc0, 0); int i; for (i = 0; i < 32; ++i) { vst1q_u8(dst + i * stride, dc); vst1q_u8(dst + i * stride + 16, dc); } } }
void vp9_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int i; uint8x16_t q0u8 = vdupq_n_u8(0); uint8x16_t q1u8 = vdupq_n_u8(0); (void)left; q0u8 = vld1q_u8(above); q1u8 = vld1q_u8(above + 16); for (i = 0; i < 32; i++, dst += stride) { vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q1u8); } }
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { const uint32_t* const end = argb_data + (num_pixels & ~3); #ifdef USE_VTBLQ const uint8x16_t shuffle = vld1q_u8(kGreenShuffle); #else const uint8x8_t shuffle = vld1_u8(kGreenShuffle); #endif for (; argb_data < end; argb_data += 4) { const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data); const uint8x16_t greens = DoGreenShuffle(argb, shuffle); vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens)); } // fallthrough and finish off with plain-C VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3); }
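kGreenShuffle and DoGreenShuffle are defined elsewhere in the WebP lossless NEON code; a minimal sketch of the non-VTBLQ path, mirroring the inline vtbl1_u8 shuffle used in AddGreenToBlueAndRed below (table values are an assumption):

/* Assumed table: pick each pixel's green byte into the blue and red lanes;
 * 0xFF indices make vtbl1_u8 write zero into the green and alpha lanes. */
static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };

/* Assumed helper matching the non-VTBLQ variant used above. */
static inline uint8x16_t DoGreenShuffle(const uint8x16_t argb,
                                        const uint8x8_t shuffle)
{
  return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
                     vtbl1_u8(vget_high_u8(argb), shuffle));
}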
/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */ static void unshuffle4_neon(uint8_t * const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements ) { size_t i, j, k; static const size_t bytesoftype = 4; uint8x16x4_t r0; for( i = 0, k = 0; i<vectorizable_elements*bytesoftype; i += 64, k++) { /* load 64 bytes to the structure r0 */ for( j = 0; j < 4; j++) { r0.val[j] = vld1q_u8(src + total_elements* j + k*16); } /* Store (with permutation) the results in the destination vector */ vst4q_u8(dest + k*64, r0); } }
int normL1_(const uchar* a, const uchar* b, int n)
{
    int j = 0, d = 0;
#if CV_SSE
    __m128i d0 = _mm_setzero_si128();

    for( ; j <= n - 16; j += 16 )
    {
        __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
        __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    }
    for( ; j <= n - 4; j += 4 )
    {
        __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
        __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    }
    d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
#elif CV_NEON
    uint32x4_t v_sum = vdupq_n_u32(0u);   /* integer zero, not 0.0f */

    for ( ; j <= n - 16; j += 16)
    {
        uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
        uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)),
                   v_high = vmovl_u8(vget_high_u8(v_dst));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
    }

    uint CV_DECL_ALIGNED(16) buf[4];
    vst1q_u32(buf, v_sum);
    d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
    {
        for( ; j <= n - 4; j += 4 )
        {
            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
                 std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
        }
    }
    for( ; j < n; j++ )
        d += std::abs(a[j] - b[j]);
    return d;
}
void png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { png_bytep rp = row; png_bytep rp_stop = row + row_info->rowbytes; uint8x16_t vtmp = vld1q_u8(rp); uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp); uint8x8x2_t vrp = *vrpt; uint8x8x4_t vdest; vdest.val[3] = vdup_n_u8(0); png_debug(1, "in png_read_filter_row_sub3_neon"); for (; rp < rp_stop;) { uint8x8_t vtmp1, vtmp2; uint32x2_t *temp_pointer; vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3); vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]); vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6); vdest.val[1] = vadd_u8(vdest.val[0], vtmp1); vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); vdest.val[2] = vadd_u8(vdest.val[1], vtmp2); vdest.val[3] = vadd_u8(vdest.val[2], vtmp1); vtmp = vld1q_u8(rp + 12); vrpt = png_ptr(uint8x8x2_t, &vtmp); vrp = *vrpt; vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); rp += 3; vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0); rp += 3; vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0); rp += 3; vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0); rp += 3; } PNG_UNUSED(prev_row) }
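png_ptr and png_ldr are libpng-internal casting macros not shown here; a hedged sketch (the real definitions also cater to compiler-specific alignment and type-punning handling):

/* Assumed casting helpers: reinterpret a pointer, or load through one. */
#define png_ptr(type,pointer) ((type *)pointer)
#define png_ldr(type,pointer) (*(type *)pointer)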
void png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { png_bytep rp = row; png_bytep rp_stop = row + row_info->rowbytes; png_const_bytep pp = prev_row; for (; rp < rp_stop; rp += 16, pp += 16) { uint8x16_t qrp, qpp; qrp = vld1q_u8(rp); qpp = vld1q_u8(pp); qrp = vaddq_u8(qrp, qpp); vst1q_u8(rp, qrp); } }
void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int i; uint8x16_t q0u8 = vdupq_n_u8(0); (void)left; q0u8 = vld1q_u8(above); for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8); }
// 'do_above' and 'do_left' facilitate branch removal when inlined. static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int do_above, int do_left) { uint16x8_t sum_top; uint16x8_t sum_left; uint8x8_t dc0; if (do_above) { const uint8x16_t A = vld1q_u8(above); // top row const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); const uint16x4_t p2 = vpadd_u16(p1, p1); const uint16x4_t p3 = vpadd_u16(p2, p2); sum_top = vcombine_u16(p3, p3); } if (do_left) { const uint8x16_t L = vld1q_u8(left); // left row const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); const uint16x4_t p2 = vpadd_u16(p1, p1); const uint16x4_t p3 = vpadd_u16(p2, p2); sum_left = vcombine_u16(p3, p3); } if (do_above && do_left) { const uint16x8_t sum = vaddq_u16(sum_left, sum_top); dc0 = vrshrn_n_u16(sum, 5); } else if (do_above) { dc0 = vrshrn_n_u16(sum_top, 4); } else if (do_left) { dc0 = vrshrn_n_u16(sum_left, 4); } else { dc0 = vdup_n_u8(0x80); } { const uint8x16_t dc = vdupq_lane_u8(dc0, 0); int i; for (i = 0; i < 16; ++i) { vst1q_u8(dst + i * stride, dc); } } }
void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, const int ref_stride, const int height) { int i; uint16x8_t vec_sum_lo = vdupq_n_u16(0); uint16x8_t vec_sum_hi = vdupq_n_u16(0); const int shift_factor = ((height >> 5) + 3) * -1; const int16x8_t vec_shift = vdupq_n_s16(shift_factor); for (i = 0; i < height; i += 8) { const uint8x16_t vec_row1 = vld1q_u8(ref); const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride); const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2); const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3); const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4); const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5); const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6); const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8)); ref += ref_stride * 8; } vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift); vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift); vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo)); hbuf += 8; vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi)); }
void vp8_copy_mem16x16_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) { int r; uint8x16_t qtmp; for (r = 0; r < 16; ++r) { qtmp = vld1q_u8(src); vst1q_u8(dst, qtmp); src += src_stride; dst += dst_stride; } }
void vp8_mbloop_filter_horizontal_edge_y_neon( unsigned char *src, int pitch, unsigned char blimit, unsigned char limit, unsigned char thresh) { uint8x16_t qblimit, qlimit, qthresh, q3, q4; uint8x16_t q5, q6, q7, q8, q9, q10; qblimit = vdupq_n_u8(blimit); qlimit = vdupq_n_u8(limit); qthresh = vdupq_n_u8(thresh); src -= (pitch << 2); q3 = vld1q_u8(src); src += pitch; q4 = vld1q_u8(src); src += pitch; q5 = vld1q_u8(src); src += pitch; q6 = vld1q_u8(src); src += pitch; q7 = vld1q_u8(src); src += pitch; q8 = vld1q_u8(src); src += pitch; q9 = vld1q_u8(src); src += pitch; q10 = vld1q_u8(src); vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9, q10, &q4, &q5, &q6, &q7, &q8, &q9); src -= (pitch * 6); vst1q_u8(src, q4); src += pitch; vst1q_u8(src, q5); src += pitch; vst1q_u8(src, q6); src += pitch; vst1q_u8(src, q7); src += pitch; vst1q_u8(src, q8); src += pitch; vst1q_u8(src, q9); return; }
/* u8x16 saturated sub */
void mw_neon_mm_qsub_u8x16(unsigned char * A, int Row, int Col, unsigned char * B, unsigned char * C)
{
  uint8x16_t neon_a, neon_b, neon_c;

  int size = Row * Col;
  int i = 0;
  int k = 0;

  for (i = 16; i <= size; i += 16)
  {
    k = i - 16;
    neon_a = vld1q_u8(A + k);
    neon_b = vld1q_u8(B + k);
    neon_c = vqsubq_u8(neon_a, neon_b);
    vst1q_u8(C + k, neon_c);
  }

  k = i - 16;
  /* Scalar tail: clamp at zero so the remainder matches the saturating
     vqsubq_u8 semantics used above. */
  for (i = 0; i < size % 16; i++)
  {
    C[k + i] = (A[k + i] > B[k + i]) ? (unsigned char)(A[k + i] - B[k + i]) : 0;
  }
}
int16_t vp9_int_pro_col_neon(uint8_t const *ref, const int width) { int i; uint16x8_t vec_sum = vdupq_n_u16(0); for (i = 0; i < width; i += 16) { const uint8x16_t vec_row = vld1q_u8(ref); vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row)); vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row)); ref += 16; } return horizontal_add_u16x8(vec_sum); }
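horizontal_add_u16x8 is defined elsewhere in libvpx; a minimal sketch, assuming it sums the eight 16-bit lanes of the accumulator:

/* Assumed helper: horizontal sum of a uint16x8_t. */
static inline int16_t horizontal_add_u16x8(const uint16x8_t v)
{
  const uint32x4_t a = vpaddlq_u16(v);  /* pairwise widen to 32 bits */
  const uint64x2_t b = vpaddlq_u32(a);  /* pairwise widen to 64 bits */
  return (int16_t)(vgetq_lane_u64(b, 0) + vgetq_lane_u64(b, 1));
}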
void vpx_lpf_horizontal_4_dual_neon( uint8_t *s, int p /* pitch */, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1; uint8x16_t qblimit, qlimit, qthresh; uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8; dblimit0 = vld1_u8(blimit0); dlimit0 = vld1_u8(limit0); dthresh0 = vld1_u8(thresh0); dblimit1 = vld1_u8(blimit1); dlimit1 = vld1_u8(limit1); dthresh1 = vld1_u8(thresh1); qblimit = vcombine_u8(dblimit0, dblimit1); qlimit = vcombine_u8(dlimit0, dlimit1); qthresh = vcombine_u8(dthresh0, dthresh1); s -= (p << 2); q3u8 = vld1q_u8(s); s += p; q4u8 = vld1q_u8(s); s += p; q5u8 = vld1q_u8(s); s += p; q6u8 = vld1q_u8(s); s += p; q7u8 = vld1q_u8(s); s += p; q8u8 = vld1q_u8(s); s += p; q9u8 = vld1q_u8(s); s += p; q10u8 = vld1q_u8(s); loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8); s -= (p * 5); vst1q_u8(s, q5u8); s += p; vst1q_u8(s, q6u8); s += p; vst1q_u8(s, q7u8); s += p; vst1q_u8(s, q8u8); return; }
static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) { const uint32_t* const end = argb_data + (num_pixels & ~3); const uint8x8_t shuffle = vld1_u8(kGreenShuffle); for (; argb_data < end; argb_data += 4) { const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data); const uint8x16_t greens = vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle), vtbl1_u8(vget_high_u8(argb), shuffle)); vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens)); } // fallthrough and finish off with plain-C VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3); }
void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t A0 = vld1q_u8(above); // top row const uint8x16_t above_right = vld1q_dup_u8(above + 15); const uint8x16_t A1 = vextq_u8(A0, above_right, 1); const uint8x16_t A2 = vextq_u8(A0, above_right, 2); const uint8x16_t avg1 = vhaddq_u8(A0, A2); uint8x16_t row = vrhaddq_u8(avg1, A1); int i; (void)left; for (i = 0; i < 15; ++i) { vst1q_u8(dst + i * stride, row); row = vextq_u8(row, above_right, 1); } vst1q_u8(dst + i * stride, row); }
void vp9_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int j, k; uint8x8_t d2u8 = vdup_n_u8(0); uint8x16_t q0u8 = vdupq_n_u8(0); uint8x16_t q1u8 = vdupq_n_u8(0); (void)above; for (k = 0; k < 2; k++, left += 16) { q1u8 = vld1q_u8(left); d2u8 = vget_low_u8(q1u8); for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { q0u8 = vdupq_lane_u8(d2u8, 0); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 1); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 2); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 3); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 4); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 5); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 6); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 7); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; } } }
int main () { /* Create custom arbitrary data. */ const uint8_t uint8_data[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 }; /* Create the vector with our data. */ uint8x16_t data; /* Load our custom data into the vector register. */ data = vld1q_u8 (uint8_data); print_uint8 (data, "data"); /* Call of the add3 function. */ add3(&data); print_uint8 (data, "data (new)"); return 0; }
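print_uint8 and add3 are not defined in this snippet; a minimal sketch of definitions consistent with how main uses them (printing all 16 lanes, and adding 3 to every lane in place):

#include <stdio.h>
#include <arm_neon.h>

/* Assumed helper: dump the 16 lanes of a uint8x16_t with a label. */
void print_uint8(uint8x16_t data, const char *name)
{
  uint8_t p[16];
  int i;

  vst1q_u8(p, data);
  printf("%s = ", name);
  for (i = 0; i < 16; i++)
    printf("%02d ", p[i]);
  printf("\n");
}

/* Assumed helper: add 3 to every lane, in place. */
void add3(uint8x16_t *data)
{
  *data = vaddq_u8(*data, vdupq_n_u8(3));
}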
static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8, uint8x16_t *q9u8, uint8x16_t *q10u8, uint8x16_t *q11u8, uint8x16_t *q12u8, uint8x16_t *q13u8, uint8x16_t *q14u8, uint8x16_t *q15u8) { *q8u8 = vld1q_u8(d); d += d_stride; *q9u8 = vld1q_u8(d); d += d_stride; *q10u8 = vld1q_u8(d); d += d_stride; *q11u8 = vld1q_u8(d); d += d_stride; *q12u8 = vld1q_u8(d); d += d_stride; *q13u8 = vld1q_u8(d); d += d_stride; *q14u8 = vld1q_u8(d); d += d_stride; *q15u8 = vld1q_u8(d); return; }
uint8_t ar_vminall_u8_neon(const uint8_t* a, uint32_t n) { uint8x16_t a_loaded; uint8x16_t overall_min = vdupq_n_u8(255); uint8_t om_array[16]; uint8_t themin = 255; for (uint32_t i = 0; i < n; i += 16) { a_loaded = vld1q_u8(&(a[i])); overall_min = vminq_u8(a_loaded, overall_min); } vst1q_u8(om_array, overall_min); for (uint32_t i = 0; i < 16; i++) { themin = ar_min_u8(themin, om_array[i]); } return themin; }
uint8_t ar_vmaxall_u8_neon(const uint8_t* a, uint32_t n) { uint8x16_t a_loaded; uint8x16_t overall_max = vdupq_n_u8(0); uint8_t om_array[16]; uint8_t themax = 0; for (uint32_t i = 0; i < n; i += 16) { a_loaded = vld1q_u8(&(a[i])); overall_max = vmaxq_u8(a_loaded, overall_max); } vst1q_u8(om_array, overall_max); for (uint32_t i = 0; i < 16; i++) { themax = ar_max_u8(themax, om_array[i]); } return themax; }
static void SubBytes(void)
{
  int i;

  vst1q_u8(lut, *state);
  for (i = 0; i < 16; i++)
    lut[i] = sbox[lut[i]];   /* byte-wise S-box lookup through a scalar table */
  *state = vld1q_u8(lut);
}
int crypto_stream_xor( unsigned char *c, const unsigned char *m,unsigned long long mlen, const unsigned char *n, const unsigned char *k ) { const uint32x4_t abab = {-1,0,-1,0}; const uint64x1_t nextblock = {1}; uint32x4_t k0k1k2k3 = (uint32x4_t) vld1q_u8((uint8_t *) k); uint32x4_t k4k5k6k7 = (uint32x4_t) vld1q_u8((uint8_t *) (k + 16)); uint32x4_t start0 = (uint32x4_t) vld1q_u8((uint8_t *) sigma); uint32x2_t n0n1 = (uint32x2_t) vld1_u8((uint8_t *) n); uint32x2_t n2n3 = {0,0}; uint32x2_t k0k1 = vget_low_u32(k0k1k2k3); uint32x2_t k2k3 = vget_high_u32(k0k1k2k3); uint32x2_t k4k5 = vget_low_u32(k4k5k6k7); uint32x2_t k6k7 = vget_high_u32(k4k5k6k7); uint32x2_t n1n0 = vext_u32(n0n1,n0n1,1); uint32x2_t n3n2; uint32x2_t n0k4 = vext_u32(n1n0,k4k5,1); uint32x2_t k5k0 = vext_u32(k4k5,k0k1,1); uint32x2_t k1n1 = vext_u32(k0k1,n1n0,1); uint32x2_t n2k6; uint32x2_t k7k2 = vext_u32(k6k7,k2k3,1); uint32x2_t k3n3; uint32x4_t start1 = vcombine_u32(k5k0,n0k4); uint32x4_t start2; uint32x4_t start3; register uint32x4_t diag0; register uint32x4_t diag1; register uint32x4_t diag2; register uint32x4_t diag3; uint32x4_t next_start2; uint32x4_t next_start3; register uint32x4_t next_diag0; register uint32x4_t next_diag1; register uint32x4_t next_diag2; register uint32x4_t next_diag3; uint32x4_t x0x5x10x15; uint32x4_t x12x1x6x11; uint32x4_t x8x13x2x7; uint32x4_t x4x9x14x3; uint32x4_t x0x1x10x11; uint32x4_t x12x13x6x7; uint32x4_t x8x9x2x3; uint32x4_t x4x5x14x15; uint32x4_t x0x1x2x3; uint32x4_t x4x5x6x7; uint32x4_t x8x9x10x11; uint32x4_t x12x13x14x15; uint32x4_t m0m1m2m3; uint32x4_t m4m5m6m7; uint32x4_t m8m9m10m11; uint32x4_t m12m13m14m15; register uint32x4_t a0; register uint32x4_t a1; register uint32x4_t a2; register uint32x4_t a3; register uint32x4_t b0; register uint32x4_t b1; register uint32x4_t b2; register uint32x4_t b3; register uint32x4_t next_a0; register uint32x4_t next_a1; register uint32x4_t next_a2; register uint32x4_t next_a3; register uint32x4_t next_b0; register uint32x4_t next_b1; register uint32x4_t next_b2; register uint32x4_t next_b3; unsigned char block[64]; unsigned char *savec; int i; int flagm = (m != 0); if (!mlen) return 0; if (mlen < 128) goto mlenatleast1; mlenatleast128: n3n2 = vext_u32(n2n3,n2n3,1); n2k6 = vext_u32(n3n2,k6k7,1); k3n3 = vext_u32(k2k3,n3n2,1); start2 = vcombine_u32(n2k6,k1n1); start3 = vcombine_u32(k3n3,k7k2); n2n3 = (uint32x2_t) vadd_u64(nextblock,(uint64x1_t) n2n3); diag0 = start0; diag1 = start1; diag2 = start2; diag3 = start3; n3n2 = vext_u32(n2n3,n2n3,1); n2k6 = vext_u32(n3n2,k6k7,1); k3n3 = vext_u32(k2k3,n3n2,1); next_start2 = vcombine_u32(n2k6,k1n1); next_start3 = vcombine_u32(k3n3,k7k2); n2n3 = (uint32x2_t) vadd_u64(nextblock,(uint64x1_t) n2n3); next_diag0 = start0; next_diag1 = start1; next_diag2 = next_start2; next_diag3 = next_start3; for (i = ROUNDS;i > 0;i -= 2) { a0 = diag1 + diag0; b0 = vshlq_n_u32(a0,7); next_a0 = next_diag1 + next_diag0; a0 = vsriq_n_u32(b0,a0,25); next_b0 = vshlq_n_u32(next_a0,7); diag3 ^= a0; next_a0 = vsriq_n_u32(next_b0,next_a0,25); a1 = diag0 + diag3; next_diag3 ^= next_a0; b1 = vshlq_n_u32(a1,9); next_a1 = next_diag0 + next_diag3; a1 = vsriq_n_u32(b1,a1,23); next_b1 = vshlq_n_u32(next_a1,9); diag2 ^= a1; next_a1 = vsriq_n_u32(next_b1,next_a1,23); a2 = diag3 + diag2; diag3 = vextq_u32(diag3,diag3,3); next_diag2 ^= next_a1; b2 = vshlq_n_u32(a2,13); next_a2 = next_diag3 + next_diag2; next_diag3 = vextq_u32(next_diag3,next_diag3,3); a2 = vsriq_n_u32(b2,a2,19); next_b2 = vshlq_n_u32(next_a2,13); diag1 ^= a2; next_a2 = 
vsriq_n_u32(next_b2,next_a2,19); a3 = diag2 + diag1; diag2 = vextq_u32(diag2,diag2,2); next_diag1 ^= next_a2; b3 = vshlq_n_u32(a3,18); diag1 = vextq_u32(diag1,diag1,1); next_a3 = next_diag2 + next_diag1; next_diag2 = vextq_u32(next_diag2,next_diag2,2); a3 = vsriq_n_u32(b3,a3,14); next_b3 = vshlq_n_u32(next_a3,18); next_diag1 = vextq_u32(next_diag1,next_diag1,1); diag0 ^= a3; next_a3 = vsriq_n_u32(next_b3,next_a3,14); a0 = diag3 + diag0; next_diag0 ^= next_a3; b0 = vshlq_n_u32(a0,7); next_a0 = next_diag3 + next_diag0; a0 = vsriq_n_u32(b0,a0,25); next_b0 = vshlq_n_u32(next_a0,7); diag1 ^= a0; next_a0 = vsriq_n_u32(next_b0,next_a0,25); a1 = diag0 + diag1; next_diag1 ^= next_a0; b1 = vshlq_n_u32(a1,9); next_a1 = next_diag0 + next_diag1; a1 = vsriq_n_u32(b1,a1,23); next_b1 = vshlq_n_u32(next_a1,9); diag2 ^= a1; next_a1 = vsriq_n_u32(next_b1,next_a1,23); a2 = diag1 + diag2; diag1 = vextq_u32(diag1,diag1,3); next_diag2 ^= next_a1; b2 = vshlq_n_u32(a2,13); next_a2 = next_diag1 + next_diag2; next_diag1 = vextq_u32(next_diag1,next_diag1,3); a2 = vsriq_n_u32(b2,a2,19); next_b2 = vshlq_n_u32(next_a2,13); diag3 ^= a2; next_a2 = vsriq_n_u32(next_b2,next_a2,19); a3 = diag2 + diag3; diag2 = vextq_u32(diag2,diag2,2); next_diag3 ^= next_a2; b3 = vshlq_n_u32(a3,18); diag3 = vextq_u32(diag3,diag3,1); next_a3 = next_diag2 + next_diag3; next_diag2 = vextq_u32(next_diag2,next_diag2,2); a3 = vsriq_n_u32(b3,a3,14); next_b3 = vshlq_n_u32(next_a3,18); next_diag3 = vextq_u32(next_diag3,next_diag3,1); diag0 ^= a3; next_a3 = vsriq_n_u32(next_b3,next_a3,14); next_diag0 ^= next_a3; } x0x5x10x15 = diag0 + start0; x12x1x6x11 = diag1 + start1; x8x13x2x7 = diag2 + start2; x4x9x14x3 = diag3 + start3; if (flagm) m0m1m2m3 = (uint32x4_t) vld1q_u8((uint8_t *) m); if (flagm) m4m5m6m7 = (uint32x4_t) vld1q_u8(16 + (uint8_t *) m); if (flagm) m8m9m10m11 = (uint32x4_t) vld1q_u8(32 + (uint8_t *) m); if (flagm) m12m13m14m15 = (uint32x4_t) vld1q_u8(48 + (uint8_t *) m); x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11); x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7); x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3); x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15); x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3)); x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7)); x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11)); x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15)); if (flagm) x0x1x2x3 ^= m0m1m2m3; if (flagm) x4x5x6x7 ^= m4m5m6m7; if (flagm) x8x9x10x11 ^= m8m9m10m11; if (flagm) x12x13x14x15 ^= m12m13m14m15; vst1q_u8((uint8_t *) c,(uint8x16_t) x0x1x2x3); vst1q_u8(16 + (uint8_t *) c,(uint8x16_t) x4x5x6x7); vst1q_u8(32 + (uint8_t *) c,(uint8x16_t) x8x9x10x11); vst1q_u8(48 + (uint8_t *) c,(uint8x16_t) x12x13x14x15); x0x5x10x15 = next_diag0 + start0; x12x1x6x11 = next_diag1 + start1; x8x13x2x7 = next_diag2 + next_start2; x4x9x14x3 = next_diag3 + next_start3; if (flagm) m0m1m2m3 = (uint32x4_t) vld1q_u8(64 + (uint8_t *) m); if (flagm) m4m5m6m7 = (uint32x4_t) vld1q_u8(80 + (uint8_t *) m); if (flagm) m8m9m10m11 = (uint32x4_t) vld1q_u8(96 + (uint8_t *) m); if (flagm) m12m13m14m15 = (uint32x4_t) vld1q_u8(112 + (uint8_t *) m); x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11); x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7); x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3); x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15); x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3)); x4x5x6x7 = 
vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7)); x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11)); x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15)); if (flagm) x0x1x2x3 ^= m0m1m2m3; if (flagm) x4x5x6x7 ^= m4m5m6m7; if (flagm) x8x9x10x11 ^= m8m9m10m11; if (flagm) x12x13x14x15 ^= m12m13m14m15; vst1q_u8(64 + (uint8_t *) c,(uint8x16_t) x0x1x2x3); vst1q_u8(80 + (uint8_t *) c,(uint8x16_t) x4x5x6x7); vst1q_u8(96 + (uint8_t *) c,(uint8x16_t) x8x9x10x11); vst1q_u8(112 + (uint8_t *) c,(uint8x16_t) x12x13x14x15); mlen -= 128; c += 128; if (flagm) m += 128; if (mlen >= 128) goto mlenatleast128; mlenatleast1: if (mlen < 64) { if (flagm) for (i = 0;i < 64;++i) block[i] = 0; if (flagm) for (i = 0;i < mlen;++i) block[i] = m[i]; savec = c; c = block; if (flagm) m = block; } n3n2 = vext_u32(n2n3,n2n3,1); n2k6 = vext_u32(n3n2,k6k7,1); k3n3 = vext_u32(k2k3,n3n2,1); start2 = vcombine_u32(n2k6,k1n1); start3 = vcombine_u32(k3n3,k7k2); diag0 = start0; diag1 = start1; diag2 = start2; diag3 = start3; for (i = ROUNDS;i > 0;i -= 2) { a0 = diag1 + diag0; b0 = vshlq_n_u32(a0,7); a0 = vsriq_n_u32(b0,a0,25); diag3 ^= a0; a1 = diag0 + diag3; b1 = vshlq_n_u32(a1,9); a1 = vsriq_n_u32(b1,a1,23); diag2 ^= a1; a2 = diag3 + diag2; diag3 = vextq_u32(diag3,diag3,3); b2 = vshlq_n_u32(a2,13); a2 = vsriq_n_u32(b2,a2,19); diag1 ^= a2; a3 = diag2 + diag1; diag2 = vextq_u32(diag2,diag2,2); b3 = vshlq_n_u32(a3,18); diag1 = vextq_u32(diag1,diag1,1); a3 = vsriq_n_u32(b3,a3,14); diag0 ^= a3; a0 = diag3 + diag0; b0 = vshlq_n_u32(a0,7); a0 = vsriq_n_u32(b0,a0,25); diag1 ^= a0; a1 = diag0 + diag1; b1 = vshlq_n_u32(a1,9); a1 = vsriq_n_u32(b1,a1,23); diag2 ^= a1; a2 = diag1 + diag2; diag1 = vextq_u32(diag1,diag1,3); b2 = vshlq_n_u32(a2,13); a2 = vsriq_n_u32(b2,a2,19); diag3 ^= a2; a3 = diag2 + diag3; diag2 = vextq_u32(diag2,diag2,2); b3 = vshlq_n_u32(a3,18); diag3 = vextq_u32(diag3,diag3,1); a3 = vsriq_n_u32(b3,a3,14); diag0 ^= a3; } x0x5x10x15 = diag0 + start0; x12x1x6x11 = diag1 + start1; x8x13x2x7 = diag2 + start2; x4x9x14x3 = diag3 + start3; if (flagm) m0m1m2m3 = (uint32x4_t) vld1q_u8((uint8_t *) m); if (flagm) m4m5m6m7 = (uint32x4_t) vld1q_u8(16 + (uint8_t *) m); if (flagm) m8m9m10m11 = (uint32x4_t) vld1q_u8(32 + (uint8_t *) m); if (flagm) m12m13m14m15 = (uint32x4_t) vld1q_u8(48 + (uint8_t *) m); x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11); x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7); x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3); x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15); x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3)); x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7)); x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11)); x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15)); if (flagm) x0x1x2x3 ^= m0m1m2m3; if (flagm) x4x5x6x7 ^= m4m5m6m7; if (flagm) x8x9x10x11 ^= m8m9m10m11; if (flagm) x12x13x14x15 ^= m12m13m14m15; vst1q_u8((uint8_t *) c,(uint8x16_t) x0x1x2x3); vst1q_u8(16 + (uint8_t *) c,(uint8x16_t) x4x5x6x7); vst1q_u8(32 + (uint8_t *) c,(uint8x16_t) x8x9x10x11); vst1q_u8(48 + (uint8_t *) c,(uint8x16_t) x12x13x14x15); if (mlen < 64) { for (i = 0;i < mlen;++i) savec[i] = c[i]; } if (mlen <= 64) return 0; n2n3 = (uint32x2_t) vadd_u64(nextblock,(uint64x1_t) n2n3); mlen -= 64; c += 64; if (flagm) m += 64; goto mlenatleast1; }
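sigma and ROUNDS are not shown above; a minimal sketch, assuming the standard Salsa20 input constant and the Salsa20/20 round count:

/* Assumed constants: the 16-byte Salsa20 constant ("expand 32-byte k") and
 * the round count (20 for Salsa20/20; reduced-round variants use 12 or 8). */
static const unsigned char sigma[16] = "expand 32-byte k";
#define ROUNDS 20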