/* Calls the vshlq_n_u8 intrinsic once, with an immediate shift count of 1,
   and assigns the result to a local that is never read afterwards.

   Takes no arguments and returns nothing; nothing computed here is passed
   on to any caller.
   NOTE(review): this looks like a compile-only check that the intrinsic is
   accepted by the compiler -- confirm against the test harness before
   restructuring it.  */
void test_vshlQ_nu8 (void)
{
  uint8x16_t out_uint8x16_t;
  /* Never assigned before being passed to the intrinsic below.
     NOTE(review): presumably deliberate so the operand cannot be folded to
     a constant -- confirm before "fixing" it with an initializer.  */
  uint8x16_t arg0_uint8x16_t;

  out_uint8x16_t = vshlq_n_u8 (arg0_uint8x16_t, 1);
}
Ejemplo n.º 2
0
/*
 * Double each of the 16 byte lanes of x in GF(2^8).
 *
 * Every lane is shifted left by one bit; lanes whose top bit was set
 * before the shift additionally get 0x1b XORed in, i.e. the result is
 * reduced modulo x^8 + x^4 + x^3 + x + 1.
 *
 * Returns the vector of doubled lanes; the argument is passed by value
 * and the caller's copy is not modified.
 */
static uint8x16_t xtime(uint8x16_t x)
{
	/* 0x00 or 0x01 per lane: the bit that the left shift drops. */
	const uint8x16_t carry = vshrq_n_u8(x, 7);
	const uint8x16_t doubled = vshlq_n_u8(x, 1);
	const uint8x16_t poly = vmovq_n_u8(0x1b);

	/* carry * 0x1b is 0x1b exactly in the lanes that overflowed. */
	const uint8x16_t reduction = vmulq_u8(carry, poly);

	return veorq_u8(reduction, doubled);
}
Ejemplo n.º 3
0
/*
 * Spread the bits of three input byte vectors over four output vectors,
 * six bits per output byte.
 *
 * For each lane i, with a = in.val[0][i], b = in.val[1][i] and
 * c = in.val[2][i], the outputs are:
 *
 *   out.val[0][i] =  (a >> 2)             & 0x3F
 *   out.val[1][i] = ((b >> 4) | (a << 4)) & 0x3F
 *   out.val[2][i] = ((c >> 6) | (b << 2)) & 0x3F
 *   out.val[3][i] =   c                   & 0x3F
 *
 * Every output byte therefore lies in the range 0..63.
 */
static inline uint8x16x4_t
enc_reshuffle (uint8x16x3_t in)
{
	// Keeps the low six bits of every lane, clearing the top two.
	const uint8x16_t low6 = vdupq_n_u8(0x3F);
	uint8x16x4_t out;

	out.val[0] = vandq_u8(vshrq_n_u8(in.val[0], 2), low6);

	out.val[1] = vandq_u8(
		vorrq_u8(vshrq_n_u8(in.val[1], 4), vshlq_n_u8(in.val[0], 4)),
		low6);

	out.val[2] = vandq_u8(
		vorrq_u8(vshrq_n_u8(in.val[2], 6), vshlq_n_u8(in.val[1], 2)),
		low6);

	out.val[3] = vandq_u8(in.val[2], low6);

	return out;
}
/* If we have ARM NEON support, pick off 48 bytes at a time for as long as we can: */
while (srclen >= 48)
{
	uint8x16x3_t str;
	uint8x16x4_t res;

	/* Load 48 bytes and deinterleave: */
	str = vld3q_u8((uint8_t *)c);

	/* Divide bits of three input bytes over four output bytes: */
	res.val[0] = vshrq_n_u8(str.val[0], 2);
	res.val[1] = vshrq_n_u8(str.val[1], 4) | vshlq_n_u8(str.val[0], 4);
	res.val[2] = vshrq_n_u8(str.val[2], 6) | vshlq_n_u8(str.val[1], 2);
	res.val[3] = str.val[2];

	/* Clear top two bits: */
	res.val[0] &= vdupq_n_u8(0x3F);
	res.val[1] &= vdupq_n_u8(0x3F);
	res.val[2] &= vdupq_n_u8(0x3F);
	res.val[3] &= vdupq_n_u8(0x3F);

	/* The bits have now been shifted to the right locations;
	 * translate their values 0..63 to the Base64 alphabet.
	 * Use a 64-byte table lookup: */
	res.val[0] = vqtbl4q_u8(tbl_enc, res.val[0]);
	res.val[1] = vqtbl4q_u8(tbl_enc, res.val[1]);
	res.val[2] = vqtbl4q_u8(tbl_enc, res.val[2]);
	res.val[3] = vqtbl4q_u8(tbl_enc, res.val[3]);

	/* Interleave and store result: */
	vst4q_u8((uint8_t *)o, res);