/* Intrinsic-coverage compile test: checks that vshlq_n_u8 accepts a
 * uint8x16_t operand with an immediate shift count of 1 and yields a
 * uint8x16_t.  The input is deliberately left uninitialized and the
 * result unused — only code generation is being exercised. */
void test_vshlQ_nu8 (void)
{
  uint8x16_t result_vec;
  uint8x16_t input_vec;

  result_vec = vshlq_n_u8 (input_vec, 1);
}
/* xtime: multiply every byte lane by 2 in GF(2^8) with the AES reduction
 * polynomial (0x11b), 16 lanes at once.  Doubling is a left shift; any
 * lane whose top bit was set carries out of 8 bits and is reduced by
 * XOR-ing 0x1b back in. */
static uint8x16_t xtime(uint8x16_t x)
{
	const uint8x16_t poly = vdupq_n_u8(0x1b); /* reduction constant */
	uint8x16_t doubled = vshlq_n_u8(x, 1);    /* lane-wise x * 2 (mod 256) */
	uint8x16_t carry   = vshrq_n_u8(x, 7);    /* 1 where bit 7 was set, else 0 */

	/* carry is 0 or 1 per lane; the multiply turns it into 0 or 0x1b. */
	return veorq_u8(doubled, vmulq_u8(carry, poly));
}
/* Repack three 8-bit input lanes into four 6-bit output lanes — the
 * bit-spreading step of base64 encoding, performed on 16 byte-triples
 * in parallel. */
static inline uint8x16x4_t enc_reshuffle (uint8x16x3_t in)
{
	const uint8x16_t low6 = vdupq_n_u8(0x3F);
	uint8x16x4_t out;

	/* Spread each 24-bit triple across four bytes, six bits apiece. */
	out.val[0] = vshrq_n_u8(in.val[0], 2);
	out.val[1] = vorrq_u8(vshlq_n_u8(in.val[0], 4), vshrq_n_u8(in.val[1], 4));
	out.val[2] = vorrq_u8(vshlq_n_u8(in.val[1], 2), vshrq_n_u8(in.val[2], 6));
	out.val[3] = in.val[2];

	/* Keep only the low six bits of every output lane. */
	out.val[0] = vandq_u8(out.val[0], low6);
	out.val[1] = vandq_u8(out.val[1], low6);
	out.val[2] = vandq_u8(out.val[2], low6);
	out.val[3] = vandq_u8(out.val[3], low6);

	return out;
}
/* If we have ARM NEON support, pick off 48 bytes at a time for as long as
 * we can: */
/* NOTE(review): this span is a fragment — the enclosing function, the
 * declarations of srclen/c/o/tbl_enc, the source/dest pointer advances and
 * the loop's closing brace all lie outside the visible chunk.  Presumably
 * c += 48, o += 64 and srclen -= 48 follow the store; confirm against the
 * full file. */
while (srclen >= 48)
{
	uint8x16x3_t str;
	uint8x16x4_t res;

	/* Load 48 bytes and deinterleave: */
	str = vld3q_u8((uint8_t *)c);

	/* Divide bits of three input bytes over four output bytes: */
	/* (the |, &= operators on vector types are a GCC/Clang extension) */
	res.val[0] = vshrq_n_u8(str.val[0], 2);
	res.val[1] = vshrq_n_u8(str.val[1], 4) | vshlq_n_u8(str.val[0], 4);
	res.val[2] = vshrq_n_u8(str.val[2], 6) | vshlq_n_u8(str.val[1], 2);
	res.val[3] = str.val[2];

	/* Clear top two bits: */
	res.val[0] &= vdupq_n_u8(0x3F);
	res.val[1] &= vdupq_n_u8(0x3F);
	res.val[2] &= vdupq_n_u8(0x3F);
	res.val[3] &= vdupq_n_u8(0x3F);

	/* The bits have now been shifted to the right locations;
	 * translate their values 0..63 to the Base64 alphabet.
	 * Use a 64-byte table lookup: */
	res.val[0] = vqtbl4q_u8(tbl_enc, res.val[0]);
	res.val[1] = vqtbl4q_u8(tbl_enc, res.val[1]);
	res.val[2] = vqtbl4q_u8(tbl_enc, res.val[2]);
	res.val[3] = vqtbl4q_u8(tbl_enc, res.val[3]);

	/* Interleave and store result: */
	vst4q_u8((uint8_t *)o, res);