static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
                                       uint8_t *output_ptr,
                                       unsigned int src_pixels_per_line,
                                       int pixel_step,
                                       unsigned int output_height,
                                       unsigned int output_width,
                                       const uint8_t *filter) {
  const uint8x8_t f0 = vmov_n_u8(filter[0]);
  const uint8x8_t f1 = vmov_n_u8(filter[1]);
  unsigned int i, j;
  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; j += 16) {
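      // 16 pixels at a time: out = (src_0 * f0 + src_1 * f1 + rounding) >> FILTER_BITS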
      const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
      const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
      const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
      const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
      const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
      const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
      const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
      const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
      vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
    }
    // Next row...
    src_ptr += src_pixels_per_line;
    output_ptr += output_width;
  }
}
Example #2
int norx_aead_decrypt(
  unsigned char *m, size_t *mlen,
  const unsigned char *a, size_t alen,
  const unsigned char *c, size_t clen,
  const unsigned char *z, size_t zlen,
  const unsigned char *nonce,
  const unsigned char *key
)
{
    uint64x2_t S[8];
    uint32x4_t T[2];

    if (clen < BYTES(NORX_T)) { return -1; }

    *mlen = clen - BYTES(NORX_T);

    INITIALISE(S, nonce, key);
    ABSORB_DATA(S, a, alen, HEADER_TAG);
    DECRYPT_DATA(S, m, c, clen - BYTES(NORX_T));
    ABSORB_DATA(S, z, zlen, TRAILER_TAG);
    FINALISE(S);

    /* Verify tag: compare the two 16-byte halves of the computed tag (S[0], S[1])
       against the tag stored at the tail of the ciphertext */
    T[0] = vceqq_u32(U64TOU32(S[0]), U8TOU32( vld1q_u8((uint8_t *)(c + clen - BYTES(NORX_T)  )) ));
    T[1] = vceqq_u32(U64TOU32(S[1]), U8TOU32( vld1q_u8((uint8_t *)(c + clen - BYTES(NORX_T)/2)) ));
    T[0] = vandq_u32(T[0], T[1]);
    return 0xFFFFFFFFFFFFFFFFULL == (vgetq_lane_u64(U32TOU64(T[0]), 0) & vgetq_lane_u64(U32TOU64(T[0]), 1)) ? 0 : -1;
}
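Example #3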
// Note: size is given in bytes.
static inline int compute_ham_similarity_64(unsigned short* ref, unsigned short* circ_array,
                                            int size) {
    const uint8_t* ref_c = (const uint8_t*)ref;
    const uint8_t* circ_c = (const uint8_t*)circ_array;
    uint8x16_t a, b, c;
    uint16x8_t acc = vdupq_n_u16(0);      // the accumulator must start at zero
    unsigned int i = 0, count = 0;
    int shift = size & 0xF;               // bytes left over after the 16-byte blocks

    for (i = 0; i + 16 <= (unsigned int)size; i += 16) {
        a = vld1q_u8(&ref_c[i]);
        b = vld1q_u8(&circ_c[i]);
        c = veorq_u8(a, b);                              // differing bits
        acc = vaddq_u16(acc, vpaddlq_u8(vcntq_u8(c)));   // per-byte popcount, widened and accumulated
    }
    count = setbits(acc);  // setbits() horizontally sums the accumulator (defined elsewhere)

    // Tail: popcount the remaining bytes. As in the original, a full 16 bytes
    // are loaded here, so the buffers must be padded to a 16-byte boundary.
    if (shift) {
        uint8_t tail[16];
        a = vld1q_u8(&ref_c[i]);
        b = vld1q_u8(&circ_c[i]);
        vst1q_u8(tail, vcntq_u8(veorq_u8(a, b)));
        // vgetq_lane_u8 requires a compile-time lane index, so scan a spilled copy.
        for (int t = 0; t < shift; t++) count += tail[t];
    }
    return size * 8 - count;  // similarity = total bits minus the Hamming distance
}
Example #4
static INLINE void idct32x32_1_add_neg_kernel(uint8_t **dest, const int stride,
                                              const uint8x16_t res) {
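  // Load one 32-pixel row, subtract the DC residual with unsigned saturation,
  // store it back, and advance to the next row.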
  const uint8x16_t a0 = vld1q_u8(*dest);
  const uint8x16_t a1 = vld1q_u8(*dest + 16);
  const uint8x16_t b0 = vqsubq_u8(a0, res);
  const uint8x16_t b1 = vqsubq_u8(a1, res);
  vst1q_u8(*dest, b0);
  vst1q_u8(*dest + 16, b1);
  *dest += stride;
}
Example #5
void aes_ofb(const unsigned char* in, unsigned char* out,
             int length, const char* expkey, const char* iv)
{
	uint8x16_t block, cipher;
	block = vld1q_u8((const uint8_t*)iv);
	for (int i = 0; i < length; i += 16) {
		// OFB mode: re-encrypt the previous keystream block (the IV on the first pass)...
		block = aes_enc(block, (uint8x16_t*)expkey);
		// ...then XOR the keystream into the input block.
		cipher = veorq_u8(vld1q_u8((const uint8_t*)in + i), block);
		vst1q_u8((uint8_t*)out + i, cipher);
	}
}
Example #6
File: ops.c  Project: petecoup/argon
void ar_vmin_u8_neon(uint8_t* res, const uint8_t* a, const uint8_t* b, uint32_t n)
{
   uint8x16_t a_loaded;
   uint8x16_t b_loaded;
   uint8x16_t res_loaded;

   for (uint32_t i = 0; i < n; i += 16) {
      a_loaded = vld1q_u8(&(a[i]));
      b_loaded = vld1q_u8(&(b[i]));
      res_loaded = vminq_u8(a_loaded, b_loaded);
      vst1q_u8(&(res[i]),res_loaded);
   }
}
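The loop above assumes n is a multiple of 16. A variant with a scalar tail for arbitrary n could look like the sketch below (ar_vmin_u8_neon_tail is an illustrative name, not part of the project):

#include <arm_neon.h>
#include <stdint.h>

void ar_vmin_u8_neon_tail(uint8_t* res, const uint8_t* a, const uint8_t* b, uint32_t n)
{
   uint32_t i = 0;
   /* Vector body: 16 lanes at a time. */
   for (; i + 16 <= n; i += 16) {
      vst1q_u8(&res[i], vminq_u8(vld1q_u8(&a[i]), vld1q_u8(&b[i])));
   }
   /* Scalar tail for the remaining 0-15 elements. */
   for (; i < n; i++) {
      res[i] = (a[i] < b[i]) ? a[i] : b[i];
   }
}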
Example #7
// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left,
                            int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x16_t A0 = vld1q_u8(above);  // top row
    const uint8x16_t A1 = vld1q_u8(above + 16);
    const uint16x8_t p0 = vpaddlq_u8(A0);  // cascading summation of the top
    const uint16x8_t p1 = vpaddlq_u8(A1);
    const uint16x8_t p2 = vaddq_u16(p0, p1);
    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
    const uint16x4_t p4 = vpadd_u16(p3, p3);
    const uint16x4_t p5 = vpadd_u16(p4, p4);
    sum_top = vcombine_u16(p5, p5);
  }

  if (do_left) {
    const uint8x16_t L0 = vld1q_u8(left);  // left column
    const uint8x16_t L1 = vld1q_u8(left + 16);
    const uint16x8_t p0 = vpaddlq_u8(L0);  // cascading summation of the left
    const uint16x8_t p1 = vpaddlq_u8(L1);
    const uint16x8_t p2 = vaddq_u16(p0, p1);
    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
    const uint16x4_t p4 = vpadd_u16(p3, p3);
    const uint16x4_t p5 = vpadd_u16(p4, p4);
    sum_left = vcombine_u16(p5, p5);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 6);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 5);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 5);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 32; ++i) {
      vst1q_u8(dst + i * stride, dc);
      vst1q_u8(dst + i * stride + 16, dc);
    }
  }
}
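In libvpx, thin wrappers pass compile-time constants for do_above/do_left so the compiler can drop the untaken branches. A sketch of one such wrapper (the exact names vary between versions):

void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  dc_32x32(dst, stride, above, left, 1 /* do_above */, 1 /* do_left */);
}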
Example #8
void vp9_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int i;
  uint8x16_t q0u8 = vdupq_n_u8(0);
  uint8x16_t q1u8 = vdupq_n_u8(0);
  (void)left;

  q0u8 = vld1q_u8(above);
  q1u8 = vld1q_u8(above + 16);
  for (i = 0; i < 32; i++, dst += stride) {
    vst1q_u8(dst, q0u8);
    vst1q_u8(dst + 16, q1u8);
  }
}
Example #9
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const uint32_t* const end = argb_data + (num_pixels & ~3);
#ifdef USE_VTBLQ
  const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
#else
  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
#endif
  for (; argb_data < end; argb_data += 4) {
    const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
    const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
    vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
  }
  // fallthrough and finish off with plain-C
  VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3);
}
Example #10
/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */
static void
unshuffle4_neon(uint8_t* const dest, const uint8_t* const src,
                const size_t vectorizable_elements, const size_t total_elements)
{
  size_t i, j, k;
  static const size_t bytesoftype = 4;
  uint8x16x4_t r0;

  for (i = 0, k = 0; i < vectorizable_elements * bytesoftype; i += 64, k++) {
    /* Load 64 bytes into the structure r0. */
    for (j = 0; j < 4; j++) {
      r0.val[j] = vld1q_u8(src + total_elements * j + k * 16);
    }
    /* Store (with permutation) the results in the destination vector. */
    vst4q_u8(dest + k * 64, r0);
  }
}
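The reassembly works because vst4q_u8 stores its four source registers interleaved (val[0][0], val[1][0], val[2][0], val[3][0], val[0][1], ...), which knits the four byte-planes back into whole 4-byte elements.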
Example #11
int normL1_(const uchar* a, const uchar* b, int n)
{
    int j = 0, d = 0;
#if CV_SSE
    __m128i d0 = _mm_setzero_si128();

    for( ; j <= n - 16; j += 16 )
    {
        __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
        __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));

        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    }

    for( ; j <= n - 4; j += 4 )
    {
        __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
        __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));

        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    }
    d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
#elif CV_NEON
    uint32x4_t v_sum = vdupq_n_u32(0);
    for ( ; j <= n - 16; j += 16)
    {
        uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
        uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
    }

    uint CV_DECL_ALIGNED(16) buf[4];
    vst1q_u32(buf, v_sum);
    d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
    for( ; j <= n - 4; j += 4 )
    {
        d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
             std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
    }
    for( ; j < n; j++ )
        d += std::abs(a[j] - b[j]);
    return d;
}
Example #12
void
png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp = vld1q_u8(rp);
   uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp);
   uint8x8x2_t vrp = *vrpt;

   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   png_debug(1, "in png_read_filter_row_sub3_neon");

   for (; rp < rp_stop;)
   {
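      /* Each iteration reconstructs four 3-byte pixels as a running prefix sum:
         vdest.val[k] = vdest.val[k-1] + raw pixel k, with vdest.val[3] carried
         into the next iteration. */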
      uint8x8_t vtmp1, vtmp2;
      uint32x2_t *temp_pointer;

      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
      vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6);
      vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);

      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
      vdest.val[2] = vadd_u8(vdest.val[1], vtmp2);
      vdest.val[3] = vadd_u8(vdest.val[2], vtmp1);

      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t, &vtmp);
      vrp = *vrpt;

      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }

   PNG_UNUSED(prev_row)
}
Example #13
void
png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;
   png_const_bytep pp = prev_row;

   for (; rp < rp_stop; rp += 16, pp += 16)
   {
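      /* Up filter: each byte is the raw byte plus the byte directly above it. */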
      uint8x16_t qrp, qpp;

      qrp = vld1q_u8(rp);
      qpp = vld1q_u8(pp);
      qrp = vaddq_u8(qrp, qpp);
      vst1q_u8(rp, qrp);
   }
}
Example #14
void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int i;
  uint8x16_t q0u8 = vdupq_n_u8(0);
  (void)left;

  q0u8 = vld1q_u8(above);
  for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8);
}
Example #15
// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left,
                            int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x16_t A = vld1q_u8(above);  // top row
    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_top = vcombine_u16(p3, p3);
  }

  if (do_left) {
    const uint8x16_t L = vld1q_u8(left);  // left column
    const uint16x8_t p0 = vpaddlq_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_left = vcombine_u16(p3, p3);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 5);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 4);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 4);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 16; ++i) {
      vst1q_u8(dst + i * stride, dc);
    }
  }
}
Example #16
void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
                          const int ref_stride, const int height) {
  int i;
  uint16x8_t vec_sum_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_hi = vdupq_n_u16(0);
  const int shift_factor = ((height >> 5) + 3) * -1;
  const int16x8_t vec_shift = vdupq_n_s16(shift_factor);
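  // A negative count makes vshlq_u16 shift right, scaling the column sums
  // down by 1 << ((height >> 5) + 3).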

  for (i = 0; i < height; i += 8) {
    const uint8x16_t vec_row1 = vld1q_u8(ref);
    const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride);
    const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2);
    const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3);
    const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4);
    const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5);
    const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6);
    const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7);

    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1));
    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1));

    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2));
    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2));

    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3));
    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3));

    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4));
    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4));

    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5));
    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5));

    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6));
    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6));

    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7));
    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7));

    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8));
    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8));

    ref += ref_stride * 8;
  }

  vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift);
  vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift);

  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo));
  hbuf += 8;
  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
}
Example #17
void vp8_copy_mem16x16_neon(unsigned char *src, int src_stride,
                            unsigned char *dst, int dst_stride) {
  int r;
  uint8x16_t qtmp;

  for (r = 0; r < 16; ++r) {
    qtmp = vld1q_u8(src);
    vst1q_u8(dst, qtmp);
    src += src_stride;
    dst += dst_stride;
  }
}
Example #18
void vp8_mbloop_filter_horizontal_edge_y_neon(
        unsigned char *src,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh) {
    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
    uint8x16_t q5, q6, q7, q8, q9, q10;

    qblimit = vdupq_n_u8(blimit);
    qlimit = vdupq_n_u8(limit);
    qthresh = vdupq_n_u8(thresh);

    src -= (pitch << 2);

    q3 = vld1q_u8(src);
    src += pitch;
    q4 = vld1q_u8(src);
    src += pitch;
    q5 = vld1q_u8(src);
    src += pitch;
    q6 = vld1q_u8(src);
    src += pitch;
    q7 = vld1q_u8(src);
    src += pitch;
    q8 = vld1q_u8(src);
    src += pitch;
    q9 = vld1q_u8(src);
    src += pitch;
    q10 = vld1q_u8(src);

    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                         q5, q6, q7, q8, q9, q10,
                         &q4, &q5, &q6, &q7, &q8, &q9);

    src -= (pitch * 6);
    vst1q_u8(src, q4);
    src += pitch;
    vst1q_u8(src, q5);
    src += pitch;
    vst1q_u8(src, q6);
    src += pitch;
    vst1q_u8(src, q7);
    src += pitch;
    vst1q_u8(src, q8);
    src += pitch;
    vst1q_u8(src, q9);
    return;
}
Example #19
/* u8x16 saturated sub */
void mw_neon_mm_qsub_u8x16(unsigned char * A, int Row, int Col, unsigned char * B, unsigned char * C)
{
	uint8x16_t neon_a, neon_b, neon_c;
	int size = Row * Col;
	int i = 0;
	int k = 0;

	for (i = 16; i <= size ; i+=16)
	{
		k = i - 16;
		neon_a = vld1q_u8(A + k);
		neon_b = vld1q_u8(B + k);
		neon_c = vqsubq_u8(neon_a, neon_b);
		vst1q_u8(C + k, neon_c);
	}

	k = i - 16;
	/* Scalar tail: keep the saturating semantics of the vector body. */
	for (i = 0; i < size % 16; i++)
	{
		C[k + i] = (A[k + i] > B[k + i]) ? (unsigned char)(A[k + i] - B[k + i]) : 0;
	}
}
Example #20
int16_t vp9_int_pro_col_neon(uint8_t const *ref, const int width) {
  int i;
  uint16x8_t vec_sum = vdupq_n_u16(0);

  for (i = 0; i < width; i += 16) {
    const uint8x16_t vec_row = vld1q_u8(ref);
    vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));
    vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));
    ref += 16;
  }

  return horizontal_add_u16x8(vec_sum);
}
Example #21
void vpx_lpf_horizontal_4_dual_neon(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;
  uint8x16_t qblimit, qlimit, qthresh;
  uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;

  dblimit0 = vld1_u8(blimit0);
  dlimit0 = vld1_u8(limit0);
  dthresh0 = vld1_u8(thresh0);
  dblimit1 = vld1_u8(blimit1);
  dlimit1 = vld1_u8(limit1);
  dthresh1 = vld1_u8(thresh1);
  qblimit = vcombine_u8(dblimit0, dblimit1);
  qlimit = vcombine_u8(dlimit0, dlimit1);
  qthresh = vcombine_u8(dthresh0, dthresh1);
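  // Each q register now holds two independent filter problems: the first
  // filter's parameters in the low 8 lanes, the second's in the high 8.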

  s -= (p << 2);

  q3u8 = vld1q_u8(s);
  s += p;
  q4u8 = vld1q_u8(s);
  s += p;
  q5u8 = vld1q_u8(s);
  s += p;
  q6u8 = vld1q_u8(s);
  s += p;
  q7u8 = vld1q_u8(s);
  s += p;
  q8u8 = vld1q_u8(s);
  s += p;
  q9u8 = vld1q_u8(s);
  s += p;
  q10u8 = vld1q_u8(s);

  loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8,
                      q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8);

  s -= (p * 5);
  vst1q_u8(s, q5u8);
  s += p;
  vst1q_u8(s, q6u8);
  s += p;
  vst1q_u8(s, q7u8);
  s += p;
  vst1q_u8(s, q8u8);
  return;
}
Example #22
static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const uint32_t* const end = argb_data + (num_pixels & ~3);
  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
  for (; argb_data < end; argb_data += 4) {
    const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
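    // Table-lookup broadcast: copy each pixel's green byte into its blue and
    // red lanes so a single vector add applies green to both channels.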
    const uint8x16_t greens =
        vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
                    vtbl1_u8(vget_high_u8(argb), shuffle));
    vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens));
  }
  // fallthrough and finish off with plain-C
  VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3);
}
Example #23
void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
  const uint8x16_t A0 = vld1q_u8(above);  // top row
  const uint8x16_t above_right = vld1q_dup_u8(above + 15);
  const uint8x16_t A1 = vextq_u8(A0, above_right, 1);
  const uint8x16_t A2 = vextq_u8(A0, above_right, 2);
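  // Approximate the (a + 2*b + c + 2) >> 2 smoothing filter: a halving add
  // of A0 and A2, then a rounding halving add against A1.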
  const uint8x16_t avg1 = vhaddq_u8(A0, A2);
  uint8x16_t row = vrhaddq_u8(avg1, A1);
  int i;
  (void)left;
  for (i = 0; i < 15; ++i) {
    vst1q_u8(dst + i * stride, row);
    row = vextq_u8(row, above_right, 1);
  }
  vst1q_u8(dst + i * stride, row);
}
Example #24
void vp9_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int j, k;
  uint8x8_t d2u8 = vdup_n_u8(0);
  uint8x16_t q0u8 = vdupq_n_u8(0);
  uint8x16_t q1u8 = vdupq_n_u8(0);
  (void)above;

  for (k = 0; k < 2; k++, left += 16) {
    q1u8 = vld1q_u8(left);
    d2u8 = vget_low_u8(q1u8);
    for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
      q0u8 = vdupq_lane_u8(d2u8, 0);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 1);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 2);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 3);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 4);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 5);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 6);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 7);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
    }
  }
}
Example #25
int main () {
    /* Create custom arbitrary data. */
    const uint8_t uint8_data[] = { 1, 2, 3, 4, 5, 6, 7, 8,
				   9, 10, 11, 12, 13, 14, 15, 16 };

    /* Create the vector with our data. */
    uint8x16_t data;

    /* Load our custom data into the vector register. */
    data = vld1q_u8 (uint8_data);

    print_uint8 (data, "data");
    
    /* Call the add3 function. */
    add3(&data);

    print_uint8 (data, "data (new)");
    
    return 0;
}
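print_uint8 and add3 are defined elsewhere in the tutorial this snippet comes from. Minimal sketches with the obvious semantics (both helpers below are assumptions, not the originals):

#include <arm_neon.h>
#include <stdio.h>

/* Hypothetical helper: dump the 16 lanes of a vector. */
static void print_uint8(uint8x16_t data, const char* name)
{
    uint8_t p[16];
    vst1q_u8(p, data);
    printf("%s = ", name);
    for (int i = 0; i < 16; i++) printf("%u ", p[i]);
    printf("\n");
}

/* Hypothetical helper: add 3 to every lane, in place. */
static void add3(uint8x16_t* data)
{
    *data = vaddq_u8(*data, vdupq_n_u8(3));
}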
Example #26
static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
                           uint8x16_t *q9u8, uint8x16_t *q10u8,
                           uint8x16_t *q11u8, uint8x16_t *q12u8,
                           uint8x16_t *q13u8, uint8x16_t *q14u8,
                           uint8x16_t *q15u8) {
  *q8u8 = vld1q_u8(d);
  d += d_stride;
  *q9u8 = vld1q_u8(d);
  d += d_stride;
  *q10u8 = vld1q_u8(d);
  d += d_stride;
  *q11u8 = vld1q_u8(d);
  d += d_stride;
  *q12u8 = vld1q_u8(d);
  d += d_stride;
  *q13u8 = vld1q_u8(d);
  d += d_stride;
  *q14u8 = vld1q_u8(d);
  d += d_stride;
  *q15u8 = vld1q_u8(d);
  return;
}
Example #27
File: ops.c  Project: petecoup/argon
uint8_t ar_vminall_u8_neon(const uint8_t* a, uint32_t n)
{
   uint8x16_t a_loaded;
   uint8x16_t overall_min = vdupq_n_u8(255);
   uint8_t om_array[16];
   uint8_t themin = 255;

   for (uint32_t i = 0; i < n; i += 16) {
      a_loaded = vld1q_u8(&(a[i]));
      overall_min = vminq_u8(a_loaded, overall_min);
   }

   vst1q_u8(om_array, overall_min);
   
   for (uint32_t i = 0; i < 16; i++) {
      themin = ar_min_u8(themin, om_array[i]);
   }

   return themin;
}
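On AArch64, the store-and-scan reduction at the end can be collapsed into a single horizontal-minimum instruction. A sketch (ar_vminall_u8_neon64 is an illustrative name; vminvq_u8 is AArch64-only):

#include <arm_neon.h>
#include <stdint.h>

uint8_t ar_vminall_u8_neon64(const uint8_t* a, uint32_t n)
{
   uint8x16_t overall_min = vdupq_n_u8(255);
   for (uint32_t i = 0; i < n; i += 16) {
      overall_min = vminq_u8(vld1q_u8(&a[i]), overall_min);
   }
   return vminvq_u8(overall_min);  /* horizontal min across all 16 lanes */
}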
Example #28
File: ops.c  Project: petecoup/argon
uint8_t ar_vmaxall_u8_neon(const uint8_t* a, uint32_t n)
{
   uint8x16_t a_loaded;
   uint8x16_t overall_max = vdupq_n_u8(0);
   uint8_t om_array[16];
   uint8_t themax = 0;

   for (uint32_t i = 0; i < n; i += 16) {
      a_loaded = vld1q_u8(&(a[i]));
      overall_max = vmaxq_u8(a_loaded, overall_max);
   }

   vst1q_u8(om_array, overall_max);
   
   for (uint32_t i = 0; i < 16; i++) {
      themax = ar_max_u8(themax, om_array[i]);
   }

   return themax;
}
Example #29
static void SubBytes(void)
{
	/* Spill the state to memory, run each byte through the S-box, reload. */
	vst1q_u8(lut, *state);
	for (int i = 0; i < 16; i++)
		lut[i] = sbox[lut[i]];
	*state = vld1q_u8(lut);
}
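Note that a table-driven SubBytes like this makes data-dependent memory accesses, so it is not constant-time; bitsliced S-boxes or hardware AES instructions avoid that cache-timing leak.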
Example #30
int crypto_stream_xor(
        unsigned char *c,
  const unsigned char *m,unsigned long long mlen,
  const unsigned char *n,
  const unsigned char *k
)
{
  const uint32x4_t abab = {-1,0,-1,0};
  const uint64x1_t nextblock = {1};
  uint32x4_t k0k1k2k3 = (uint32x4_t) vld1q_u8((uint8_t *) k);
  uint32x4_t k4k5k6k7 = (uint32x4_t) vld1q_u8((uint8_t *) (k + 16));
  uint32x4_t start0 = (uint32x4_t) vld1q_u8((uint8_t *) sigma);
  uint32x2_t n0n1 = (uint32x2_t) vld1_u8((uint8_t *) n);
  uint32x2_t n2n3 = {0,0};
  uint32x2_t k0k1 = vget_low_u32(k0k1k2k3);
  uint32x2_t k2k3 = vget_high_u32(k0k1k2k3);
  uint32x2_t k4k5 = vget_low_u32(k4k5k6k7);
  uint32x2_t k6k7 = vget_high_u32(k4k5k6k7);
  uint32x2_t n1n0 = vext_u32(n0n1,n0n1,1);
  uint32x2_t n3n2;
  uint32x2_t n0k4 = vext_u32(n1n0,k4k5,1);
  uint32x2_t k5k0 = vext_u32(k4k5,k0k1,1);
  uint32x2_t k1n1 = vext_u32(k0k1,n1n0,1);
  uint32x2_t n2k6;
  uint32x2_t k7k2 = vext_u32(k6k7,k2k3,1);
  uint32x2_t k3n3;
  uint32x4_t start1 = vcombine_u32(k5k0,n0k4);
  uint32x4_t start2;
  uint32x4_t start3;
  register uint32x4_t diag0;
  register uint32x4_t diag1;
  register uint32x4_t diag2;
  register uint32x4_t diag3;
  uint32x4_t next_start2;
  uint32x4_t next_start3;
  register uint32x4_t next_diag0;
  register uint32x4_t next_diag1;
  register uint32x4_t next_diag2;
  register uint32x4_t next_diag3;
  uint32x4_t x0x5x10x15;
  uint32x4_t x12x1x6x11;
  uint32x4_t x8x13x2x7;
  uint32x4_t x4x9x14x3;
  uint32x4_t x0x1x10x11;
  uint32x4_t x12x13x6x7;
  uint32x4_t x8x9x2x3;
  uint32x4_t x4x5x14x15;
  uint32x4_t x0x1x2x3;
  uint32x4_t x4x5x6x7;
  uint32x4_t x8x9x10x11;
  uint32x4_t x12x13x14x15;
  uint32x4_t m0m1m2m3;
  uint32x4_t m4m5m6m7;
  uint32x4_t m8m9m10m11;
  uint32x4_t m12m13m14m15;
  register uint32x4_t a0;
  register uint32x4_t a1;
  register uint32x4_t a2;
  register uint32x4_t a3;
  register uint32x4_t b0;
  register uint32x4_t b1;
  register uint32x4_t b2;
  register uint32x4_t b3;
  register uint32x4_t next_a0;
  register uint32x4_t next_a1;
  register uint32x4_t next_a2;
  register uint32x4_t next_a3;
  register uint32x4_t next_b0;
  register uint32x4_t next_b1;
  register uint32x4_t next_b2;
  register uint32x4_t next_b3;
  unsigned char block[64];
  unsigned char *savec;
  int i;
  int flagm = (m != 0);

  if (!mlen) return 0;
  if (mlen < 128) goto mlenatleast1;

  mlenatleast128:

  n3n2 = vext_u32(n2n3,n2n3,1);
  n2k6 = vext_u32(n3n2,k6k7,1);
  k3n3 = vext_u32(k2k3,n3n2,1);
  start2 = vcombine_u32(n2k6,k1n1);
  start3 = vcombine_u32(k3n3,k7k2);

  n2n3 = (uint32x2_t) vadd_u64(nextblock,(uint64x1_t) n2n3);

  diag0 = start0;
  diag1 = start1;
  diag2 = start2;
  diag3 = start3;

  n3n2 = vext_u32(n2n3,n2n3,1);
  n2k6 = vext_u32(n3n2,k6k7,1);
  k3n3 = vext_u32(k2k3,n3n2,1);
  next_start2 = vcombine_u32(n2k6,k1n1);
  next_start3 = vcombine_u32(k3n3,k7k2);

  n2n3 = (uint32x2_t) vadd_u64(nextblock,(uint64x1_t) n2n3);

  next_diag0 = start0;
  next_diag1 = start1;
  next_diag2 = next_start2;
  next_diag3 = next_start3;

  for (i = ROUNDS;i > 0;i -= 2) {
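    /* Two 64-byte blocks are kept in flight: the deeply indented column on the
       right is the second block's instruction stream, interleaved by hand so
       independent operations can dual-issue. */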
    a0 = diag1 + diag0;
    b0 = vshlq_n_u32(a0,7);
                                        next_a0 = next_diag1 + next_diag0;
    a0 = vsriq_n_u32(b0,a0,25);
                                        next_b0 = vshlq_n_u32(next_a0,7);
    diag3 ^= a0;
                                        next_a0 = vsriq_n_u32(next_b0,next_a0,25);
    a1 = diag0 + diag3;
                                        next_diag3 ^= next_a0;
    b1 = vshlq_n_u32(a1,9);
                                        next_a1 = next_diag0 + next_diag3;
    a1 = vsriq_n_u32(b1,a1,23);
                                        next_b1 = vshlq_n_u32(next_a1,9);
    diag2 ^= a1;
                                        next_a1 = vsriq_n_u32(next_b1,next_a1,23);
    a2 = diag3 + diag2;
      diag3 = vextq_u32(diag3,diag3,3);
                                        next_diag2 ^= next_a1;
    b2 = vshlq_n_u32(a2,13);
                                        next_a2 = next_diag3 + next_diag2;
                                          next_diag3 = vextq_u32(next_diag3,next_diag3,3);
    a2 = vsriq_n_u32(b2,a2,19);
                                        next_b2 = vshlq_n_u32(next_a2,13);
    diag1 ^= a2;
                                        next_a2 = vsriq_n_u32(next_b2,next_a2,19);
    a3 = diag2 + diag1;
      diag2 = vextq_u32(diag2,diag2,2);
                                        next_diag1 ^= next_a2;
    b3 = vshlq_n_u32(a3,18);
      diag1 = vextq_u32(diag1,diag1,1);
                                        next_a3 = next_diag2 + next_diag1;
                                          next_diag2 = vextq_u32(next_diag2,next_diag2,2);
    a3 = vsriq_n_u32(b3,a3,14);
                                        next_b3 = vshlq_n_u32(next_a3,18);
                                          next_diag1 = vextq_u32(next_diag1,next_diag1,1);
    diag0 ^= a3;
                                        next_a3 = vsriq_n_u32(next_b3,next_a3,14);
    a0 = diag3 + diag0;
                                        next_diag0 ^= next_a3;
    b0 = vshlq_n_u32(a0,7);
                                        next_a0 = next_diag3 + next_diag0;
    a0 = vsriq_n_u32(b0,a0,25);
                                        next_b0 = vshlq_n_u32(next_a0,7);
    diag1 ^= a0;
                                        next_a0 = vsriq_n_u32(next_b0,next_a0,25);
    a1 = diag0 + diag1;
                                        next_diag1 ^= next_a0;
    b1 = vshlq_n_u32(a1,9);
                                        next_a1 = next_diag0 + next_diag1;
    a1 = vsriq_n_u32(b1,a1,23);
                                        next_b1 = vshlq_n_u32(next_a1,9);
    diag2 ^= a1;
                                        next_a1 = vsriq_n_u32(next_b1,next_a1,23);
    a2 = diag1 + diag2;
      diag1 = vextq_u32(diag1,diag1,3);
                                        next_diag2 ^= next_a1;
    b2 = vshlq_n_u32(a2,13);
                                        next_a2 = next_diag1 + next_diag2;
                                          next_diag1 = vextq_u32(next_diag1,next_diag1,3);
    a2 = vsriq_n_u32(b2,a2,19);
                                        next_b2 = vshlq_n_u32(next_a2,13);
    diag3 ^= a2;
                                        next_a2 = vsriq_n_u32(next_b2,next_a2,19);
    a3 = diag2 + diag3;
      diag2 = vextq_u32(diag2,diag2,2);
                                        next_diag3 ^= next_a2;
    b3 = vshlq_n_u32(a3,18);
      diag3 = vextq_u32(diag3,diag3,1);
                                        next_a3 = next_diag2 + next_diag3;
                                          next_diag2 = vextq_u32(next_diag2,next_diag2,2);
    a3 = vsriq_n_u32(b3,a3,14);
                                        next_b3 = vshlq_n_u32(next_a3,18);
                                          next_diag3 = vextq_u32(next_diag3,next_diag3,1);
    diag0 ^= a3;
                                        next_a3 = vsriq_n_u32(next_b3,next_a3,14);
                                        next_diag0 ^= next_a3;
  }

  x0x5x10x15 = diag0 + start0;
  x12x1x6x11 = diag1 + start1;
  x8x13x2x7 = diag2 + start2;
  x4x9x14x3 = diag3 + start3;

  if (flagm) m0m1m2m3 = (uint32x4_t) vld1q_u8((uint8_t *) m);
  if (flagm) m4m5m6m7 = (uint32x4_t) vld1q_u8(16 + (uint8_t *) m);
  if (flagm) m8m9m10m11 = (uint32x4_t) vld1q_u8(32 + (uint8_t *) m);
  if (flagm) m12m13m14m15 = (uint32x4_t) vld1q_u8(48 + (uint8_t *) m);

  x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11);
  x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7);
  x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3);
  x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15);

  x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3));
  x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7));
  x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11));
  x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15));

  if (flagm) x0x1x2x3 ^= m0m1m2m3;
  if (flagm) x4x5x6x7 ^= m4m5m6m7;
  if (flagm) x8x9x10x11 ^= m8m9m10m11;
  if (flagm) x12x13x14x15 ^= m12m13m14m15;

  vst1q_u8((uint8_t *) c,(uint8x16_t) x0x1x2x3);
  vst1q_u8(16 + (uint8_t *) c,(uint8x16_t) x4x5x6x7);
  vst1q_u8(32 + (uint8_t *) c,(uint8x16_t) x8x9x10x11);
  vst1q_u8(48 + (uint8_t *) c,(uint8x16_t) x12x13x14x15);

  x0x5x10x15 = next_diag0 + start0;
  x12x1x6x11 = next_diag1 + start1;
  x8x13x2x7 = next_diag2 + next_start2;
  x4x9x14x3 = next_diag3 + next_start3;

  if (flagm) m0m1m2m3 = (uint32x4_t) vld1q_u8(64 + (uint8_t *) m);
  if (flagm) m4m5m6m7 = (uint32x4_t) vld1q_u8(80 + (uint8_t *) m);
  if (flagm) m8m9m10m11 = (uint32x4_t) vld1q_u8(96 + (uint8_t *) m);
  if (flagm) m12m13m14m15 = (uint32x4_t) vld1q_u8(112 + (uint8_t *) m);

  x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11);
  x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7);
  x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3);
  x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15);

  x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3));
  x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7));
  x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11));
  x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15));

  if (flagm) x0x1x2x3 ^= m0m1m2m3;
  if (flagm) x4x5x6x7 ^= m4m5m6m7;
  if (flagm) x8x9x10x11 ^= m8m9m10m11;
  if (flagm) x12x13x14x15 ^= m12m13m14m15;

  vst1q_u8(64 + (uint8_t *) c,(uint8x16_t) x0x1x2x3);
  vst1q_u8(80 + (uint8_t *) c,(uint8x16_t) x4x5x6x7);
  vst1q_u8(96 + (uint8_t *) c,(uint8x16_t) x8x9x10x11);
  vst1q_u8(112 + (uint8_t *) c,(uint8x16_t) x12x13x14x15);

  mlen -= 128;
  c += 128;
  if (flagm) m += 128;

  if (mlen >= 128) goto mlenatleast128;

  mlenatleast1:

  if (mlen < 64) {
    if (flagm) for (i = 0;i < 64;++i) block[i] = 0;
    if (flagm) for (i = 0;i < mlen;++i) block[i] = m[i];
    savec = c;
    c = block;
    if (flagm) m = block;
  }

  n3n2 = vext_u32(n2n3,n2n3,1);
  n2k6 = vext_u32(n3n2,k6k7,1);
  k3n3 = vext_u32(k2k3,n3n2,1);
  start2 = vcombine_u32(n2k6,k1n1);
  start3 = vcombine_u32(k3n3,k7k2);

  diag0 = start0;
  diag1 = start1;
  diag2 = start2;
  diag3 = start3;

  for (i = ROUNDS;i > 0;i -= 2) {
    a0 = diag1 + diag0;
    b0 = vshlq_n_u32(a0,7);
    a0 = vsriq_n_u32(b0,a0,25);
    diag3 ^= a0;
    a1 = diag0 + diag3;
    b1 = vshlq_n_u32(a1,9);
    a1 = vsriq_n_u32(b1,a1,23);
    diag2 ^= a1;
    a2 = diag3 + diag2;
      diag3 = vextq_u32(diag3,diag3,3);
    b2 = vshlq_n_u32(a2,13);
    a2 = vsriq_n_u32(b2,a2,19);
    diag1 ^= a2;
    a3 = diag2 + diag1;
      diag2 = vextq_u32(diag2,diag2,2);
    b3 = vshlq_n_u32(a3,18);
      diag1 = vextq_u32(diag1,diag1,1);
    a3 = vsriq_n_u32(b3,a3,14);
    diag0 ^= a3;

    a0 = diag3 + diag0;
    b0 = vshlq_n_u32(a0,7);
    a0 = vsriq_n_u32(b0,a0,25);
    diag1 ^= a0;
    a1 = diag0 + diag1;
    b1 = vshlq_n_u32(a1,9);
    a1 = vsriq_n_u32(b1,a1,23);
    diag2 ^= a1;
    a2 = diag1 + diag2;
      diag1 = vextq_u32(diag1,diag1,3);
    b2 = vshlq_n_u32(a2,13);
    a2 = vsriq_n_u32(b2,a2,19);
    diag3 ^= a2;
    a3 = diag2 + diag3;
      diag2 = vextq_u32(diag2,diag2,2);
    b3 = vshlq_n_u32(a3,18);
      diag3 = vextq_u32(diag3,diag3,1);
    a3 = vsriq_n_u32(b3,a3,14);
    diag0 ^= a3;
  }

  x0x5x10x15 = diag0 + start0;
  x12x1x6x11 = diag1 + start1;
  x8x13x2x7 = diag2 + start2;
  x4x9x14x3 = diag3 + start3;

  if (flagm) m0m1m2m3 = (uint32x4_t) vld1q_u8((uint8_t *) m);
  if (flagm) m4m5m6m7 = (uint32x4_t) vld1q_u8(16 + (uint8_t *) m);
  if (flagm) m8m9m10m11 = (uint32x4_t) vld1q_u8(32 + (uint8_t *) m);
  if (flagm) m12m13m14m15 = (uint32x4_t) vld1q_u8(48 + (uint8_t *) m);

  x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11);
  x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7);
  x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3);
  x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15);

  x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3));
  x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7));
  x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11));
  x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15));

  if (flagm) x0x1x2x3 ^= m0m1m2m3;
  if (flagm) x4x5x6x7 ^= m4m5m6m7;
  if (flagm) x8x9x10x11 ^= m8m9m10m11;
  if (flagm) x12x13x14x15 ^= m12m13m14m15;

  vst1q_u8((uint8_t *) c,(uint8x16_t) x0x1x2x3);
  vst1q_u8(16 + (uint8_t *) c,(uint8x16_t) x4x5x6x7);
  vst1q_u8(32 + (uint8_t *) c,(uint8x16_t) x8x9x10x11);
  vst1q_u8(48 + (uint8_t *) c,(uint8x16_t) x12x13x14x15);

  if (mlen < 64) {
    for (i = 0;i < mlen;++i) savec[i] = c[i];
  }
  if (mlen <= 64) return 0;

  n2n3 = (uint32x2_t) vadd_u64(nextblock,(uint64x1_t) n2n3);

  mlen -= 64;
  c += 64;
  if (flagm) m += 64;

  goto mlenatleast1;
}