Example No. 1
void GPUgstate::FastLoadBoneMatrix(u32 addr) {
	const u32_le *src = (const u32_le *)Memory::GetPointerUnchecked(addr);
	u32 num = boneMatrixNumber;
	// A 4x3 bone matrix is 12 words; num holds the current write position.
	u32 *dst = (u32 *)(boneMatrix + (num & 0x7F));

#ifdef _M_SSE
	__m128i row1 = _mm_slli_epi32(_mm_loadu_si128((const __m128i *)src), 8);
	__m128i row2 = _mm_slli_epi32(_mm_loadu_si128((const __m128i *)(src + 4)), 8);
	__m128i row3 = _mm_slli_epi32(_mm_loadu_si128((const __m128i *)(src + 8)), 8);
	// The destination is 16-byte aligned exactly when the matrix number is a multiple of 4.
	if ((num & 0x3) == 0) {
		_mm_store_si128((__m128i *)dst, row1);
		_mm_store_si128((__m128i *)(dst + 4), row2);
		_mm_store_si128((__m128i *)(dst + 8), row3);
	} else {
		_mm_storeu_si128((__m128i *)dst, row1);
		_mm_storeu_si128((__m128i *)(dst + 4), row2);
		_mm_storeu_si128((__m128i *)(dst + 8), row3);
	}
#elif PPSSPP_ARCH(ARM_NEON)
	const uint32x4_t row1 = vshlq_n_u32(vld1q_u32(src), 8);
	const uint32x4_t row2 = vshlq_n_u32(vld1q_u32(src + 4), 8);
	const uint32x4_t row3 = vshlq_n_u32(vld1q_u32(src + 8), 8);
	vst1q_u32(dst, row1);
	vst1q_u32(dst + 4, row2);
	vst1q_u32(dst + 8, row3);
#else
	// Scalar fallback: shift out the 8-bit command byte, keeping the 24-bit payload.
	for (int i = 0; i < 12; i++) {
		dst[i] = src[i] << 8;
	}
#endif

	num += 12;
	// Write the advanced matrix number back in GE command form (opcode in the top byte).
	gstate.boneMatrixNumber = (GE_CMD_BONEMATRIXNUMBER << 24) | (num & 0x7F);
}
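
The shift by 8 drops the 8-bit GE command byte from each of the 12 matrix words and leaves the 24-bit payload in the upper bits, and the aligned/unaligned store choice follows from the matrix number (assuming boneMatrix itself is 16-byte aligned). A minimal standalone check, not part of PPSSPP, that the NEON shift matches the scalar fallback:

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
	uint32_t src[12], neon_dst[12], scalar_dst[12];
	int i;
	for (i = 0; i < 12; i++) src[i] = 0xAB123456u + (uint32_t)i;

	// NEON path: three 4-lane shifts, as in the function above.
	for (i = 0; i < 12; i += 4)
		vst1q_u32(neon_dst + i, vshlq_n_u32(vld1q_u32(src + i), 8));

	// Scalar fallback path.
	for (i = 0; i < 12; i++) scalar_dst[i] = src[i] << 8;

	for (i = 0; i < 12; i++)
		if (neon_dst[i] != scalar_dst[i]) { puts("mismatch"); return 1; }
	puts("ok");
	return 0;
}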
Example No. 2
static void MixColumns(void)
{
	// Treat each AES state column as one 32-bit lane.
	uint32x4_t a = vreinterpretq_u32_u8(*state);
	uint32x4_t b = vreinterpretq_u32_u8(xtime(*state)); // every byte doubled in GF(2^8)

	// a3r = (a ^ b) rotated left by 8 bits. vsraq_n_u32 adds (a3 >> 24) into
	// the zeroed low bits of (a3 << 8), so no carries occur and the pair
	// acts as a rotate.
	uint32x4_t a3 = veorq_u32(a, b);
	uint32x4_t a3r = vshlq_n_u32(a3, 8);
	a3r = vsraq_n_u32(a3r, a3, 24);

	// a2 = a rotated left by 16 bits.
	uint32x4_t a2 = vshlq_n_u32(a, 16);
	a2 = vsraq_n_u32(a2, a, 16);

	// a1 = a rotated left by 24 bits.
	uint32x4_t a1 = vshlq_n_u32(a, 24);
	a1 = vsraq_n_u32(a1, a, 8);

	// Combine the rotated terms into the MixColumns result for all four columns.
	uint32x4_t out = veorq_u32(b, a1);
	out = veorq_u32(out, a2);
	out = veorq_u32(out, a3r);
	*state = vreinterpretq_u8_u32(out);
}
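
Each vshlq_n_u32/vsraq_n_u32 pair above is a lane-wise rotate in disguise: the low bits of the left shift are zero, so the shift-right-and-accumulate (which adds) can never carry and behaves like an OR. A minimal sketch of the idiom on AArch64/NEON, with a hypothetical helper name:

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

// Rotate each 32-bit lane left by 8, exactly as a3r is built above.
static uint32x4_t rotl8_u32x4(uint32x4_t x) {
	uint32x4_t r = vshlq_n_u32(x, 8);
	return vsraq_n_u32(r, x, 24);   // r += x >> 24; no carries possible
}

int main(void) {
	uint32_t out[4];
	vst1q_u32(out, rotl8_u32x4(vdupq_n_u32(0x12345678u)));
	printf("%08x\n", out[0]);       // prints 34567812
	return 0;
}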
Example No. 3
int crypto_stream_xor(
        unsigned char *c,
  const unsigned char *m,unsigned long long mlen,
  const unsigned char *n,
  const unsigned char *k
)
{
  /* sigma (the 16-byte Salsa20 constant) and ROUNDS are defined elsewhere
     in this file; abab selects lanes 0 and 2 in the vbslq_u32 merges below. */
  const uint32x4_t abab = {-1,0,-1,0};
  const uint64x1_t nextblock = {1}; /* increment for the 64-bit block counter */
  uint32x4_t k0k1k2k3 = (uint32x4_t) vld1q_u8((uint8_t *) k);
  uint32x4_t k4k5k6k7 = (uint32x4_t) vld1q_u8((uint8_t *) (k + 16));
  uint32x4_t start0 = (uint32x4_t) vld1q_u8((uint8_t *) sigma);
  uint32x2_t n0n1 = (uint32x2_t) vld1_u8((uint8_t *) n);
  uint32x2_t n2n3 = {0,0};
  uint32x2_t k0k1 = vget_low_u32(k0k1k2k3);
  uint32x2_t k2k3 = vget_high_u32(k0k1k2k3);
  uint32x2_t k4k5 = vget_low_u32(k4k5k6k7);
  uint32x2_t k6k7 = vget_high_u32(k4k5k6k7);
  uint32x2_t n1n0 = vext_u32(n0n1,n0n1,1);
  uint32x2_t n3n2;
  uint32x2_t n0k4 = vext_u32(n1n0,k4k5,1);
  uint32x2_t k5k0 = vext_u32(k4k5,k0k1,1);
  uint32x2_t k1n1 = vext_u32(k0k1,n1n0,1);
  uint32x2_t n2k6;
  uint32x2_t k7k2 = vext_u32(k6k7,k2k3,1);
  uint32x2_t k3n3;
  uint32x4_t start1 = vcombine_u32(k5k0,n0k4);
  uint32x4_t start2;
  uint32x4_t start3;
  register uint32x4_t diag0;
  register uint32x4_t diag1;
  register uint32x4_t diag2;
  register uint32x4_t diag3;
  uint32x4_t next_start2;
  uint32x4_t next_start3;
  register uint32x4_t next_diag0;
  register uint32x4_t next_diag1;
  register uint32x4_t next_diag2;
  register uint32x4_t next_diag3;
  uint32x4_t x0x5x10x15;
  uint32x4_t x12x1x6x11;
  uint32x4_t x8x13x2x7;
  uint32x4_t x4x9x14x3;
  uint32x4_t x0x1x10x11;
  uint32x4_t x12x13x6x7;
  uint32x4_t x8x9x2x3;
  uint32x4_t x4x5x14x15;
  uint32x4_t x0x1x2x3;
  uint32x4_t x4x5x6x7;
  uint32x4_t x8x9x10x11;
  uint32x4_t x12x13x14x15;
  uint32x4_t m0m1m2m3;
  uint32x4_t m4m5m6m7;
  uint32x4_t m8m9m10m11;
  uint32x4_t m12m13m14m15;
  register uint32x4_t a0;
  register uint32x4_t a1;
  register uint32x4_t a2;
  register uint32x4_t a3;
  register uint32x4_t b0;
  register uint32x4_t b1;
  register uint32x4_t b2;
  register uint32x4_t b3;
  register uint32x4_t next_a0;
  register uint32x4_t next_a1;
  register uint32x4_t next_a2;
  register uint32x4_t next_a3;
  register uint32x4_t next_b0;
  register uint32x4_t next_b1;
  register uint32x4_t next_b2;
  register uint32x4_t next_b3;
  unsigned char block[64];
  unsigned char *savec;
  int i;
  int flagm = (m != 0);

  if (!mlen) return 0;
  if (mlen < 128) goto mlenatleast1;

  mlenatleast128:

  n3n2 = vext_u32(n2n3,n2n3,1);
  n2k6 = vext_u32(n3n2,k6k7,1);
  k3n3 = vext_u32(k2k3,n3n2,1);
  start2 = vcombine_u32(n2k6,k1n1);
  start3 = vcombine_u32(k3n3,k7k2);

  n2n3 = (uint32x2_t) vadd_u64(nextblock,(uint64x1_t) n2n3); /* bump the block counter */

  diag0 = start0;
  diag1 = start1;
  diag2 = start2;
  diag3 = start3;

  n3n2 = vext_u32(n2n3,n2n3,1);
  n2k6 = vext_u32(n3n2,k6k7,1);
  k3n3 = vext_u32(k2k3,n3n2,1);
  next_start2 = vcombine_u32(n2k6,k1n1);
  next_start3 = vcombine_u32(k3n3,k7k2);

  n2n3 = (uint32x2_t) vadd_u64(nextblock,(uint64x1_t) n2n3);

  next_diag0 = start0;
  next_diag1 = start1;
  next_diag2 = next_start2;
  next_diag3 = next_start3;

  /* Two 64-byte blocks are processed per pass; the far-indented next_*
     statements are the second block's instructions, interleaved with the
     first block's to hide instruction latency. */
  for (i = ROUNDS;i > 0;i -= 2) {
    a0 = diag1 + diag0;
    b0 = vshlq_n_u32(a0,7);
                                        next_a0 = next_diag1 + next_diag0;
    a0 = vsriq_n_u32(b0,a0,25);
                                        next_b0 = vshlq_n_u32(next_a0,7);
    diag3 ^= a0;
                                        next_a0 = vsriq_n_u32(next_b0,next_a0,25);
    a1 = diag0 + diag3;
                                        next_diag3 ^= next_a0;
    b1 = vshlq_n_u32(a1,9);
                                        next_a1 = next_diag0 + next_diag3;
    a1 = vsriq_n_u32(b1,a1,23);
                                        next_b1 = vshlq_n_u32(next_a1,9);
    diag2 ^= a1;
                                        next_a1 = vsriq_n_u32(next_b1,next_a1,23);
    a2 = diag3 + diag2;
      diag3 = vextq_u32(diag3,diag3,3);
                                        next_diag2 ^= next_a1;
    b2 = vshlq_n_u32(a2,13);
                                        next_a2 = next_diag3 + next_diag2;
                                          next_diag3 = vextq_u32(next_diag3,next_diag3,3);
    a2 = vsriq_n_u32(b2,a2,19);
                                        next_b2 = vshlq_n_u32(next_a2,13);
    diag1 ^= a2;
                                        next_a2 = vsriq_n_u32(next_b2,next_a2,19);
    a3 = diag2 + diag1;
      diag2 = vextq_u32(diag2,diag2,2);
                                        next_diag1 ^= next_a2;
    b3 = vshlq_n_u32(a3,18);
      diag1 = vextq_u32(diag1,diag1,1);
                                        next_a3 = next_diag2 + next_diag1;
                                          next_diag2 = vextq_u32(next_diag2,next_diag2,2);
    a3 = vsriq_n_u32(b3,a3,14);
                                        next_b3 = vshlq_n_u32(next_a3,18);
                                          next_diag1 = vextq_u32(next_diag1,next_diag1,1);
    diag0 ^= a3;
                                        next_a3 = vsriq_n_u32(next_b3,next_a3,14);
    a0 = diag3 + diag0;
                                        next_diag0 ^= next_a3;
    b0 = vshlq_n_u32(a0,7);
                                        next_a0 = next_diag3 + next_diag0;
    a0 = vsriq_n_u32(b0,a0,25);
                                        next_b0 = vshlq_n_u32(next_a0,7);
    diag1 ^= a0;
                                        next_a0 = vsriq_n_u32(next_b0,next_a0,25);
    a1 = diag0 + diag1;
                                        next_diag1 ^= next_a0;
    b1 = vshlq_n_u32(a1,9);
                                        next_a1 = next_diag0 + next_diag1;
    a1 = vsriq_n_u32(b1,a1,23);
                                        next_b1 = vshlq_n_u32(next_a1,9);
    diag2 ^= a1;
                                        next_a1 = vsriq_n_u32(next_b1,next_a1,23);
    a2 = diag1 + diag2;
      diag1 = vextq_u32(diag1,diag1,3);
                                        next_diag2 ^= next_a1;
    b2 = vshlq_n_u32(a2,13);
                                        next_a2 = next_diag1 + next_diag2;
                                          next_diag1 = vextq_u32(next_diag1,next_diag1,3);
    a2 = vsriq_n_u32(b2,a2,19);
                                        next_b2 = vshlq_n_u32(next_a2,13);
    diag3 ^= a2;
                                        next_a2 = vsriq_n_u32(next_b2,next_a2,19);
    a3 = diag2 + diag3;
      diag2 = vextq_u32(diag2,diag2,2);
                                        next_diag3 ^= next_a2;
    b3 = vshlq_n_u32(a3,18);
      diag3 = vextq_u32(diag3,diag3,1);
                                        next_a3 = next_diag2 + next_diag3;
                                          next_diag2 = vextq_u32(next_diag2,next_diag2,2);
    a3 = vsriq_n_u32(b3,a3,14);
                                        next_b3 = vshlq_n_u32(next_a3,18);
                                          next_diag3 = vextq_u32(next_diag3,next_diag3,1);
    diag0 ^= a3;
                                        next_a3 = vsriq_n_u32(next_b3,next_a3,14);
                                        next_diag0 ^= next_a3;
  }

  x0x5x10x15 = diag0 + start0;
  x12x1x6x11 = diag1 + start1;
  x8x13x2x7 = diag2 + start2;
  x4x9x14x3 = diag3 + start3;

  if (flagm) m0m1m2m3 = (uint32x4_t) vld1q_u8((uint8_t *) m);
  if (flagm) m4m5m6m7 = (uint32x4_t) vld1q_u8(16 + (uint8_t *) m);
  if (flagm) m8m9m10m11 = (uint32x4_t) vld1q_u8(32 + (uint8_t *) m);
  if (flagm) m12m13m14m15 = (uint32x4_t) vld1q_u8(48 + (uint8_t *) m);

  x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11);
  x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7);
  x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3);
  x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15);

  x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3));
  x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7));
  x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11));
  x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15));

  if (flagm) x0x1x2x3 ^= m0m1m2m3;
  if (flagm) x4x5x6x7 ^= m4m5m6m7;
  if (flagm) x8x9x10x11 ^= m8m9m10m11;
  if (flagm) x12x13x14x15 ^= m12m13m14m15;

  vst1q_u8((uint8_t *) c,(uint8x16_t) x0x1x2x3);
  vst1q_u8(16 + (uint8_t *) c,(uint8x16_t) x4x5x6x7);
  vst1q_u8(32 + (uint8_t *) c,(uint8x16_t) x8x9x10x11);
  vst1q_u8(48 + (uint8_t *) c,(uint8x16_t) x12x13x14x15);

  x0x5x10x15 = next_diag0 + start0;
  x12x1x6x11 = next_diag1 + start1;
  x8x13x2x7 = next_diag2 + next_start2;
  x4x9x14x3 = next_diag3 + next_start3;

  if (flagm) m0m1m2m3 = (uint32x4_t) vld1q_u8(64 + (uint8_t *) m);
  if (flagm) m4m5m6m7 = (uint32x4_t) vld1q_u8(80 + (uint8_t *) m);
  if (flagm) m8m9m10m11 = (uint32x4_t) vld1q_u8(96 + (uint8_t *) m);
  if (flagm) m12m13m14m15 = (uint32x4_t) vld1q_u8(112 + (uint8_t *) m);

  x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11);
  x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7);
  x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3);
  x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15);

  x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3));
  x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7));
  x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11));
  x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15));

  if (flagm) x0x1x2x3 ^= m0m1m2m3;
  if (flagm) x4x5x6x7 ^= m4m5m6m7;
  if (flagm) x8x9x10x11 ^= m8m9m10m11;
  if (flagm) x12x13x14x15 ^= m12m13m14m15;

  vst1q_u8(64 + (uint8_t *) c,(uint8x16_t) x0x1x2x3);
  vst1q_u8(80 + (uint8_t *) c,(uint8x16_t) x4x5x6x7);
  vst1q_u8(96 + (uint8_t *) c,(uint8x16_t) x8x9x10x11);
  vst1q_u8(112 + (uint8_t *) c,(uint8x16_t) x12x13x14x15);

  mlen -= 128;
  c += 128;
  if (flagm) m += 128;

  if (mlen >= 128) goto mlenatleast128;

  mlenatleast1:

  /* For a final partial block, work through a stack buffer and copy only
     mlen bytes back out afterwards. */
  if (mlen < 64) {
    if (flagm) for (i = 0;i < 64;++i) block[i] = 0;
    if (flagm) for (i = 0;i < mlen;++i) block[i] = m[i];
    savec = c;
    c = block;
    if (flagm) m = block;
  }

  n3n2 = vext_u32(n2n3,n2n3,1);
  n2k6 = vext_u32(n3n2,k6k7,1);
  k3n3 = vext_u32(k2k3,n3n2,1);
  start2 = vcombine_u32(n2k6,k1n1);
  start3 = vcombine_u32(k3n3,k7k2);

  diag0 = start0;
  diag1 = start1;
  diag2 = start2;
  diag3 = start3;

  for (i = ROUNDS;i > 0;i -= 2) {
    a0 = diag1 + diag0;
    b0 = vshlq_n_u32(a0,7);
    a0 = vsriq_n_u32(b0,a0,25);
    diag3 ^= a0;
    a1 = diag0 + diag3;
    b1 = vshlq_n_u32(a1,9);
    a1 = vsriq_n_u32(b1,a1,23);
    diag2 ^= a1;
    a2 = diag3 + diag2;
      diag3 = vextq_u32(diag3,diag3,3);
    b2 = vshlq_n_u32(a2,13);
    a2 = vsriq_n_u32(b2,a2,19);
    diag1 ^= a2;
    a3 = diag2 + diag1;
      diag2 = vextq_u32(diag2,diag2,2);
    b3 = vshlq_n_u32(a3,18);
      diag1 = vextq_u32(diag1,diag1,1);
    a3 = vsriq_n_u32(b3,a3,14);
    diag0 ^= a3;

    a0 = diag3 + diag0;
    b0 = vshlq_n_u32(a0,7);
    a0 = vsriq_n_u32(b0,a0,25);
    diag1 ^= a0;
    a1 = diag0 + diag1;
    b1 = vshlq_n_u32(a1,9);
    a1 = vsriq_n_u32(b1,a1,23);
    diag2 ^= a1;
    a2 = diag1 + diag2;
      diag1 = vextq_u32(diag1,diag1,3);
    b2 = vshlq_n_u32(a2,13);
    a2 = vsriq_n_u32(b2,a2,19);
    diag3 ^= a2;
    a3 = diag2 + diag3;
      diag2 = vextq_u32(diag2,diag2,2);
    b3 = vshlq_n_u32(a3,18);
      diag3 = vextq_u32(diag3,diag3,1);
    a3 = vsriq_n_u32(b3,a3,14);
    diag0 ^= a3;
  }

  x0x5x10x15 = diag0 + start0;
  x12x1x6x11 = diag1 + start1;
  x8x13x2x7 = diag2 + start2;
  x4x9x14x3 = diag3 + start3;

  if (flagm) m0m1m2m3 = (uint32x4_t) vld1q_u8((uint8_t *) m);
  if (flagm) m4m5m6m7 = (uint32x4_t) vld1q_u8(16 + (uint8_t *) m);
  if (flagm) m8m9m10m11 = (uint32x4_t) vld1q_u8(32 + (uint8_t *) m);
  if (flagm) m12m13m14m15 = (uint32x4_t) vld1q_u8(48 + (uint8_t *) m);

  x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11);
  x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7);
  x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3);
  x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15);

  x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3));
  x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7));
  x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11));
  x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15));

  if (flagm) x0x1x2x3 ^= m0m1m2m3;
  if (flagm) x4x5x6x7 ^= m4m5m6m7;
  if (flagm) x8x9x10x11 ^= m8m9m10m11;
  if (flagm) x12x13x14x15 ^= m12m13m14m15;

  vst1q_u8((uint8_t *) c,(uint8x16_t) x0x1x2x3);
  vst1q_u8(16 + (uint8_t *) c,(uint8x16_t) x4x5x6x7);
  vst1q_u8(32 + (uint8_t *) c,(uint8x16_t) x8x9x10x11);
  vst1q_u8(48 + (uint8_t *) c,(uint8x16_t) x12x13x14x15);

  if (mlen < 64) {
    for (i = 0;i < mlen;++i) savec[i] = c[i];
  }
  if (mlen <= 64) return 0;

  n2n3 = (uint32x2_t) vadd_u64(nextblock,(uint64x1_t) n2n3);

  mlen -= 64;
  c += 64;
  if (flagm) m += 64;

  goto mlenatleast1;
}
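
Every rotation in the quarter-rounds above uses a vshlq_n_u32/vsriq_n_u32 pair: vsriq shifts the source right and inserts the result into the low bits of the pre-shifted destination, so a rotate left by n costs two instructions and no extra OR. A minimal standalone sketch of that rotate (AArch64/NEON, hypothetical helper name):

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

/* Rotate each 32-bit lane left by 7, as in the a0/b0 steps above. */
static uint32x4_t rotl7_u32x4(uint32x4_t a)
{
  uint32x4_t b = vshlq_n_u32(a,7);
  return vsriq_n_u32(b,a,25);     /* insert a >> 25 into the low 25 bits */
}

int main(void)
{
  uint32_t out[4];
  vst1q_u32(out,rotl7_u32x4(vdupq_n_u32(0x80000001u)));
  printf("%08x\n",out[0]);        /* prints 000000c0 */
  return 0;
}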
Example No. 4
static inline void
desc_to_olflags_v(struct i40e_rx_queue *rxq, uint64x2_t descs[4],
		  struct rte_mbuf **rx_pkts)
{
	uint32x4_t vlan0, vlan1, rss, l3_l4e;
	const uint64x2_t mbuf_init = {rxq->mbuf_initializer, 0};
	uint64x2_t rearm0, rearm1, rearm2, rearm3;

	/* Mask everything except the RSS, flow director and VLAN flags:
	 * bit 2 is the VLAN tag, bit 11 the flow director indication,
	 * and bits 13:12 the RSS indication.
	 */
	const uint32x4_t rss_vlan_msk = {
			0x1c03804, 0x1c03804, 0x1c03804, 0x1c03804};

	const uint32x4_t cksum_mask = {
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_EIP_CKSUM_BAD,
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_EIP_CKSUM_BAD,
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_EIP_CKSUM_BAD,
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_EIP_CKSUM_BAD};

	/* map the RSS and VLAN type bits to the RSS hash and VLAN flags */
	const uint8x16_t vlan_flags = {
			0, 0, 0, 0,
			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0};

	const uint8x16_t rss_flags = {
			0, PKT_RX_FDIR, 0, 0,
			0, 0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH | PKT_RX_FDIR,
			0, 0, 0, 0,
			0, 0, 0, 0};

	const uint8x16_t l3_l4e_flags = {
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
			PKT_RX_IP_CKSUM_BAD >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1,
			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD |
			 PKT_RX_L4_CKSUM_BAD) >> 1,
			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
			 PKT_RX_IP_CKSUM_BAD) >> 1,
			0, 0, 0, 0, 0, 0, 0, 0};

	vlan0 = vzipq_u32(vreinterpretq_u32_u64(descs[0]),
			  vreinterpretq_u32_u64(descs[2])).val[1];
	vlan1 = vzipq_u32(vreinterpretq_u32_u64(descs[1]),
			  vreinterpretq_u32_u64(descs[3])).val[1];
	vlan0 = vzipq_u32(vlan0, vlan1).val[0];

	vlan1 = vandq_u32(vlan0, rss_vlan_msk);
	vlan0 = vreinterpretq_u32_u8(vqtbl1q_u8(vlan_flags,
						vreinterpretq_u8_u32(vlan1)));

	rss = vshrq_n_u32(vlan1, 11);
	rss = vreinterpretq_u32_u8(vqtbl1q_u8(rss_flags,
					      vreinterpretq_u8_u32(rss)));

	l3_l4e = vshrq_n_u32(vlan1, 22);
	l3_l4e = vreinterpretq_u32_u8(vqtbl1q_u8(l3_l4e_flags,
					      vreinterpretq_u8_u32(l3_l4e)));
	/* The table entries were stored pre-shifted right by 1 so they fit in
	 * a byte; shift left by 1 to restore the real flag values.
	 */
	l3_l4e = vshlq_n_u32(l3_l4e, 1);
	/* mask out the redundant bits */
	l3_l4e = vandq_u32(l3_l4e, cksum_mask);

	vlan0 = vorrq_u32(vlan0, rss);
	vlan0 = vorrq_u32(vlan0, l3_l4e);

	rearm0 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 0), mbuf_init, 1);
	rearm1 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 1), mbuf_init, 1);
	rearm2 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 2), mbuf_init, 1);
	rearm3 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 3), mbuf_init, 1);

	vst1q_u64((uint64_t *)&rx_pkts[0]->rearm_data, rearm0);
	vst1q_u64((uint64_t *)&rx_pkts[1]->rearm_data, rearm1);
	vst1q_u64((uint64_t *)&rx_pkts[2]->rearm_data, rearm2);
	vst1q_u64((uint64_t *)&rx_pkts[3]->rearm_data, rearm3);
}
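
The flag translation is branch-free: each status field is shifted so its bits sit in a byte lane, and vqtbl1q_u8 maps those bytes through a 16-entry table; any index byte of 16 or more reads as 0, so stray high bits map to "no flag". A minimal sketch of the table-lookup idiom on AArch64, with made-up flag values rather than the real PKT_RX_* constants:

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* hypothetical table: index 4 maps to flag value 7, all else to 0 */
	const uint8x16_t table = {0, 0, 0, 0, 7, 0, 0, 0,
				  0, 0, 0, 0, 0, 0, 0, 0};
	uint8_t idx[16] = {4, 0, 4, 200};	/* remaining lanes are 0 */
	uint8_t out[16];

	vst1q_u8(out, vqtbl1q_u8(table, vld1q_u8(idx)));
	printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]);	/* 7 0 7 0 */
	return 0;
}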
Example No. 5
int crypto_core(
    unsigned char *out,
    const unsigned char *in,
    const unsigned char *k,
    const unsigned char *c
)
{
    int i;

    const uint32x4_t abab = {-1,0,-1,0}; /* selects lanes 0 and 2 in the vbslq_u32 merges below */

    uint32x4_t k0k1k2k3 = (uint32x4_t) vld1q_u8((uint8_t *) k);
    uint32x4_t k4k5k6k7 = (uint32x4_t) vld1q_u8((uint8_t *) (k + 16));
    uint32x4_t c0c1c2c3 = (uint32x4_t) vld1q_u8((uint8_t *) c);
    uint32x4_t n0n1n2n3 = (uint32x4_t) vld1q_u8((uint8_t *) in);

    uint32x4_t n1n2n3n0 = vextq_u32(n0n1n2n3,n0n1n2n3,1);
    uint32x2_t n1n2 = vget_low_u32(n1n2n3n0);
    uint32x2_t n3n0 = vget_high_u32(n1n2n3n0);
    uint32x2_t k0k1 = vget_low_u32(k0k1k2k3);
    uint32x2_t k2k3 = vget_high_u32(k0k1k2k3);
    uint32x2_t k4k5 = vget_low_u32(k4k5k6k7);
    uint32x2_t k6k7 = vget_high_u32(k4k5k6k7);

    uint32x2_t n0k0 = vext_u32(n3n0,k0k1,1);
    uint32x2_t k0n0 = vext_u32(n0k0,n0k0,1);
    uint32x4_t k4k5k0n0 = vcombine_u32(k4k5,k0n0);

    uint32x2_t k1k6 = vext_u32(k0k1,k6k7,1);
    uint32x2_t k6k1 = vext_u32(k1k6,k1k6,1);
    uint32x4_t n1n2k6k1 = vcombine_u32(n1n2,k6k1);

    uint32x2_t k7n3 = vext_u32(k6k7,n3n0,1);
    uint32x2_t n3k7 = vext_u32(k7n3,k7n3,1);
    uint32x4_t k2k3n3k7 = vcombine_u32(k2k3,n3k7);

    uint32x4_t start0 = c0c1c2c3;
    uint32x4_t start1 = vextq_u32(k4k5k0n0,k4k5k0n0,1);
    uint32x4_t start2 = vextq_u32(n1n2k6k1,n1n2k6k1,1);
    uint32x4_t start3 = vextq_u32(k2k3n3k7,k2k3n3k7,1);

    uint32x4_t diag0 = start0;
    uint32x4_t diag1 = start1;
    uint32x4_t diag2 = start2;
    uint32x4_t diag3 = start3;

    uint32x4_t x0x5x10x15;
    uint32x4_t x12x1x6x11;
    uint32x4_t x8x13x2x7;
    uint32x4_t x4x9x14x3;

    uint32x4_t x0x1x10x11;
    uint32x4_t x12x13x6x7;
    uint32x4_t x8x9x2x3;
    uint32x4_t x4x5x14x15;

    uint32x4_t x0x1x2x3;
    uint32x4_t x4x5x6x7;
    uint32x4_t x8x9x10x11;
    uint32x4_t x12x13x14x15;

    uint32x4_t a0;
    uint32x4_t a1;
    uint32x4_t a2;
    uint32x4_t a3;

    /* Each iteration is one column round plus one row round, with
       vextq_u32 rotating the diagonals between them. */
    for (i = ROUNDS; i > 0; i -= 2) {
        a0 = diag1 + diag0;
        diag3 ^= vsriq_n_u32(vshlq_n_u32(a0,7),a0,25);
        a1 = diag0 + diag3;
        diag2 ^= vsriq_n_u32(vshlq_n_u32(a1,9),a1,23);
        a2 = diag3 + diag2;
        diag1 ^= vsriq_n_u32(vshlq_n_u32(a2,13),a2,19);
        a3 = diag2 + diag1;
        diag0 ^= vsriq_n_u32(vshlq_n_u32(a3,18),a3,14);

        diag3 = vextq_u32(diag3,diag3,3);
        diag2 = vextq_u32(diag2,diag2,2);
        diag1 = vextq_u32(diag1,diag1,1);

        a0 = diag3 + diag0;
        diag1 ^= vsriq_n_u32(vshlq_n_u32(a0,7),a0,25);
        a1 = diag0 + diag1;
        diag2 ^= vsriq_n_u32(vshlq_n_u32(a1,9),a1,23);
        a2 = diag1 + diag2;
        diag3 ^= vsriq_n_u32(vshlq_n_u32(a2,13),a2,19);
        a3 = diag2 + diag3;
        diag0 ^= vsriq_n_u32(vshlq_n_u32(a3,18),a3,14);

        diag1 = vextq_u32(diag1,diag1,3);
        diag2 = vextq_u32(diag2,diag2,2);
        diag3 = vextq_u32(diag3,diag3,1);
    }

    x0x5x10x15 = diag0 + start0;
    x12x1x6x11 = diag1 + start1;
    x8x13x2x7 = diag2 + start2;
    x4x9x14x3 = diag3 + start3;

    /* Undo the diagonal ordering: vbslq_u32 with the abab mask merges
       alternating lanes, then vcombine_u32 splices low and high halves. */
    x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11);
    x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7);
    x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3);
    x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15);

    x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3));
    x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7));
    x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11));
    x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15));

    vst1q_u8((uint8_t *) out,(uint8x16_t) x0x1x2x3);
    vst1q_u8(16 + (uint8_t *) out,(uint8x16_t) x4x5x6x7);
    vst1q_u8(32 + (uint8_t *) out,(uint8x16_t) x8x9x10x11);
    vst1q_u8(48 + (uint8_t *) out,(uint8x16_t) x12x13x14x15);

    return 0;
}
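
The final untangling relies on vbslq_u32 being a per-bit select: with the abab mask, lanes 0 and 2 come from the first operand and lanes 1 and 3 from the second, which pairs up neighbouring words out of the diagonal vectors before vcombine_u32 reassembles them. A minimal standalone sketch of that lane-select step (AArch64/NEON):

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint32x4_t abab = {0xFFFFFFFFu, 0, 0xFFFFFFFFu, 0};
    const uint32x4_t x0x5x10x15 = {0, 5, 10, 15};
    const uint32x4_t x12x1x6x11 = {12, 1, 6, 11};
    uint32_t out[4];

    /* (a & mask) | (b & ~mask): lanes 0,2 from a; lanes 1,3 from b */
    vst1q_u32(out, vbslq_u32(abab, x0x5x10x15, x12x1x6x11));
    printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]); /* 0 1 10 11 */
    return 0;
}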