Example #1
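From the PPSSPP PSP emulator: FastLoadBoneMatrix copies a 4×3 bone matrix (12 words) straight from emulated memory into the GPU state, with SSE, NEON (vld1q_u32/vshlq_n_u32/vst1q_u32), and scalar paths selected at compile time.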
void GPUgstate::FastLoadBoneMatrix(u32 addr) {
	const u32_le *src = (const u32_le *)Memory::GetPointerUnchecked(addr);
	u32 num = boneMatrixNumber;
	u32 *dst = (u32 *)(boneMatrix + (num & 0x7F));

	// Each source word carries a 24-bit GE command payload; shifting left by 8
	// restores the original float bit pattern of the matrix entry.
#ifdef _M_SSE
	__m128i row1 = _mm_slli_epi32(_mm_loadu_si128((const __m128i *)src), 8);
	__m128i row2 = _mm_slli_epi32(_mm_loadu_si128((const __m128i *)(src + 4)), 8);
	__m128i row3 = _mm_slli_epi32(_mm_loadu_si128((const __m128i *)(src + 8)), 8);
	if ((num & 0x3) == 0) {
		_mm_store_si128((__m128i *)dst, row1);
		_mm_store_si128((__m128i *)(dst + 4), row2);
		_mm_store_si128((__m128i *)(dst + 8), row3);
	} else {
		_mm_storeu_si128((__m128i *)dst, row1);
		_mm_storeu_si128((__m128i *)(dst + 4), row2);
		_mm_storeu_si128((__m128i *)(dst + 8), row3);
	}
#elif PPSSPP_ARCH(ARM_NEON)
	const uint32x4_t row1 = vshlq_n_u32(vld1q_u32(src), 8);
	const uint32x4_t row2 = vshlq_n_u32(vld1q_u32(src + 4), 8);
	const uint32x4_t row3 = vshlq_n_u32(vld1q_u32(src + 8), 8);
	vst1q_u32(dst, row1);
	vst1q_u32(dst + 4, row2);
	vst1q_u32(dst + 8, row3);
#else
	for (int i = 0; i < 12; i++) {
		dst[i] = src[i] << 8;
	}
#endif

	num += 12;
	gstate.boneMatrixNumber = (GE_CMD_BONEMATRIXNUMBER << 24) | (num & 0x7F);
}
Example #2
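A strip-mined unsigned 32-bit matrix-matrix multiply: each inner step loads four column fragments of A with vld1q_u32, broadcasts four scalars of B with vdupq_n_u32, and accumulates C(i..i+3, k) with vmulq_u32/vaddq_u32.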
/* u32x4 mm mul: C = A * B for column-major A (Row x T) and B (T x Col);
   Row and T are assumed to be multiples of 4. */
void mw_neon_mm_mul_u32x4(unsigned int * A, int Row, int T, unsigned int * B, int Col, unsigned int * C)
{
	int i, k, j;

	uint32x4_t neon_b, neon_c;
	uint32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	uint32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i+=4)
	{

		for (k = 0; k < Col; k+=1)
		{
			neon_c = vmovq_n_u32(0);

			for (j = 0; j < T; j+=4)
			{
				/* Column-major: A(i, j) is A[j * Row + i] and B(j, k) is B[k * T + j]. */
				int j_T = j * Row + i;

				neon_a0 = vld1q_u32(A + j_T);
				j_T+=Row;
				neon_a1 = vld1q_u32(A + j_T);
				j_T+=Row;
				neon_a2 = vld1q_u32(A + j_T);
				j_T+=Row;
				neon_a3 = vld1q_u32(A + j_T);

				/* Broadcast the four scalars B(j..j+3, k). */
				neon_b = vld1q_u32(B + k * T + j);
				neon_b0 = vdupq_n_u32(vgetq_lane_u32(neon_b, 0));
				neon_b1 = vdupq_n_u32(vgetq_lane_u32(neon_b, 1));
				neon_b2 = vdupq_n_u32(vgetq_lane_u32(neon_b, 2));
				neon_b3 = vdupq_n_u32(vgetq_lane_u32(neon_b, 3));

				neon_c = vaddq_u32(vmulq_u32(neon_a0, neon_b0), neon_c);
				neon_c = vaddq_u32(vmulq_u32(neon_a1, neon_b1), neon_c);
				neon_c = vaddq_u32(vmulq_u32(neon_a2, neon_b2), neon_c);
				neon_c = vaddq_u32(vmulq_u32(neon_a3, neon_b3), neon_c);
			}

			/* Store C(i..i+3, k) once per (i, k) tile. */
			vst1q_u32(C + k * Row + i, neon_c);
		}
	}
}
Example #3
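The matrix-vector counterpart of the previous example: each outer iteration accumulates four rows of C = A·B, broadcasting the vector elements B[k..k+3] into full registers.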
/* u32x4 mv mul: C = A * B for a column-major Row x T matrix A and a
   T-element vector B; Row and T are assumed to be multiples of 4. */
void mw_neon_mv_mul_u32x4(unsigned int * A, int Row, int T, unsigned int * B, unsigned int * C)
{
	int i = 0;
	int k = 0;

	uint32x4_t neon_b, neon_c;
	uint32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	uint32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i+=4)
	{
		neon_c = vmovq_n_u32(0);

		for (k = 0; k < T; k+=4)
		{
			/* Column-major: A(i, k) is A[k * Row + i]. */
			int j = k * Row + i;

			neon_a0 = vld1q_u32(A + j);
			j+=Row;
			neon_a1 = vld1q_u32(A + j);
			j+=Row;
			neon_a2 = vld1q_u32(A + j);
			j+=Row;
			neon_a3 = vld1q_u32(A + j);

			/* Broadcast the vector elements B[k..k+3]. */
			neon_b = vld1q_u32(B + k);
			neon_b0 = vdupq_n_u32(vgetq_lane_u32(neon_b, 0));
			neon_b1 = vdupq_n_u32(vgetq_lane_u32(neon_b, 1));
			neon_b2 = vdupq_n_u32(vgetq_lane_u32(neon_b, 2));
			neon_b3 = vdupq_n_u32(vgetq_lane_u32(neon_b, 3));

			neon_c = vaddq_u32(vmulq_u32(neon_a0, neon_b0), neon_c);
			neon_c = vaddq_u32(vmulq_u32(neon_a1, neon_b1), neon_c);
			neon_c = vaddq_u32(vmulq_u32(neon_a2, neon_b2), neon_c);
			neon_c = vaddq_u32(vmulq_u32(neon_a3, neon_b3), neon_c);

		}

		vst1q_u32(C + i, neon_c);
	}
}
Example #4
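Element-wise saturating subtraction over a Row×Col buffer: the main loop handles four lanes at a time with vqsubq_u32, and a scalar tail covers the remaining size % 4 elements.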
/* u32x4 saturated sub: C = A - B with unsigned saturation at zero. */
void mw_neon_mm_qsub_u32x4(unsigned int * A, int Row, int Col, unsigned int * B, unsigned int * C)
{
	uint32x4_t neon_a, neon_b, neon_c;
	int size = Row * Col;
	int i = 0;
	int k = 0;

	for (i = 4; i <= size ; i+=4)
	{
		k = i - 4;
		neon_a = vld1q_u32(A + k);
		neon_b = vld1q_u32(B + k);
		neon_c = vqsubq_u32(neon_a, neon_b);
		vst1q_u32(C + k, neon_c);
	}

	/* Scalar tail for the remaining size % 4 elements, matching the
	   saturating semantics of vqsubq_u32. */
	k = i - 4;
	for (i = 0; i < size % 4; i++)
	{
		C[k + i] = (A[k + i] > B[k + i]) ? (A[k + i] - B[k + i]) : 0;
	}
}
Example #5
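From GF-Complete's NEON backend: the region-multiply kernel for GF(2^32) in SPLIT 4,32 mode. Each of the eight 4-bit nibbles of a source word indexes one of the 16-entry lookup tables in btable, partial products are combined with veorq_u8, and vtrnq transposes convert between word order and byte planes unless altmap is set; xor selects accumulate-into-dst mode. On 32-bit ARM the vqtbl1q_u8 calls are assumed to resolve to a vtbl2_u8 compatibility wrapper defined elsewhere in the file.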
static
void
neon_w32_split_4_32_multiply_region(gf_t *gf, uint32_t *src, uint32_t *dst,
                                    uint32_t *d_end, uint8_t btable[8][4][16],
                                    uint32_t val, int xor, int altmap)
{
  int i, j;
#ifdef ARCH_AARCH64
  uint8x16_t tables[8][4];
#else
  uint8x8x2_t tables[8][4];
#endif
  uint32x4_t v0, v1, v2, v3, s0, s1, s2, s3;
  uint8x16_t p0, p1, p2, p3, si, mask1;
  uint16x8x2_t r0, r1;
  uint8x16x2_t q0, q1;

  for (i = 0; i < 8; i++) {
    for (j = 0; j < 4; j++) {
#ifdef ARCH_AARCH64
      tables[i][j] = vld1q_u8(btable[i][j]);
#else
      tables[i][j].val[0] = vld1_u8(btable[i][j]);
      tables[i][j].val[1] = vld1_u8(btable[i][j] + 8);
#endif
    }
  }

  mask1 = vdupq_n_u8(0xf);   /* isolates the low nibble of each byte */

  while (dst < d_end) {

      /* Consume 16 words (64 bytes) of source per iteration. */
      v0 = vld1q_u32(src); src += 4;
      v1 = vld1q_u32(src); src += 4;
      v2 = vld1q_u32(src); src += 4;
      v3 = vld1q_u32(src); src += 4;

      /* altmap data is already in byte-plane order; otherwise transpose the
         4x4 block of words into byte planes so each lookup sees one byte. */
      if (altmap) {
          q0.val[0] = vreinterpretq_u8_u32(v0);
          q0.val[1] = vreinterpretq_u8_u32(v1);
          q1.val[0] = vreinterpretq_u8_u32(v2);
          q1.val[1] = vreinterpretq_u8_u32(v3);
      } else {
          r0 = vtrnq_u16(vreinterpretq_u16_u32(v0), vreinterpretq_u16_u32(v2));
          r1 = vtrnq_u16(vreinterpretq_u16_u32(v1), vreinterpretq_u16_u32(v3));

          q0 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[0]),
                        vreinterpretq_u8_u16(r1.val[0]));
          q1 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[1]),
                        vreinterpretq_u8_u16(r1.val[1]));
      }

      si = vandq_u8(q0.val[0], mask1);
      p0 = vqtbl1q_u8(tables[0][0], si);
      p1 = vqtbl1q_u8(tables[0][1], si);
      p2 = vqtbl1q_u8(tables[0][2], si);
      p3 = vqtbl1q_u8(tables[0][3], si);

      si = vshrq_n_u8(q0.val[0], 4);
      p0 = veorq_u8(p0, vqtbl1q_u8(tables[1][0], si));
      p1 = veorq_u8(p1, vqtbl1q_u8(tables[1][1], si));
      p2 = veorq_u8(p2, vqtbl1q_u8(tables[1][2], si));
      p3 = veorq_u8(p3, vqtbl1q_u8(tables[1][3], si));

      si = vandq_u8(q0.val[1], mask1);
      p0 = veorq_u8(p0, vqtbl1q_u8(tables[2][0], si));
      p1 = veorq_u8(p1, vqtbl1q_u8(tables[2][1], si));
      p2 = veorq_u8(p2, vqtbl1q_u8(tables[2][2], si));
      p3 = veorq_u8(p3, vqtbl1q_u8(tables[2][3], si));

      si = vshrq_n_u8(q0.val[1], 4);
      p0 = veorq_u8(p0, vqtbl1q_u8(tables[3][0], si));
      p1 = veorq_u8(p1, vqtbl1q_u8(tables[3][1], si));
      p2 = veorq_u8(p2, vqtbl1q_u8(tables[3][2], si));
      p3 = veorq_u8(p3, vqtbl1q_u8(tables[3][3], si));

      si = vandq_u8(q1.val[0], mask1);
      p0 = veorq_u8(p0, vqtbl1q_u8(tables[4][0], si));
      p1 = veorq_u8(p1, vqtbl1q_u8(tables[4][1], si));
      p2 = veorq_u8(p2, vqtbl1q_u8(tables[4][2], si));
      p3 = veorq_u8(p3, vqtbl1q_u8(tables[4][3], si));

      si = vshrq_n_u8(q1.val[0], 4);
      p0 = veorq_u8(p0, vqtbl1q_u8(tables[5][0], si));
      p1 = veorq_u8(p1, vqtbl1q_u8(tables[5][1], si));
      p2 = veorq_u8(p2, vqtbl1q_u8(tables[5][2], si));
      p3 = veorq_u8(p3, vqtbl1q_u8(tables[5][3], si));

      si = vandq_u8(q1.val[1], mask1);
      p0 = veorq_u8(p0, vqtbl1q_u8(tables[6][0], si));
      p1 = veorq_u8(p1, vqtbl1q_u8(tables[6][1], si));
      p2 = veorq_u8(p2, vqtbl1q_u8(tables[6][2], si));
      p3 = veorq_u8(p3, vqtbl1q_u8(tables[6][3], si));

      si = vshrq_n_u8(q1.val[1], 4);
      p0 = veorq_u8(p0, vqtbl1q_u8(tables[7][0], si));
      p1 = veorq_u8(p1, vqtbl1q_u8(tables[7][1], si));
      p2 = veorq_u8(p2, vqtbl1q_u8(tables[7][2], si));
      p3 = veorq_u8(p3, vqtbl1q_u8(tables[7][3], si));

      if (altmap) {
          s0 = vreinterpretq_u32_u8(p0);
          s1 = vreinterpretq_u32_u8(p1);
          s2 = vreinterpretq_u32_u8(p2);
          s3 = vreinterpretq_u32_u8(p3);
      } else {
          q0 = vtrnq_u8(p0, p1);
          q1 = vtrnq_u8(p2, p3);

          r0 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[0]),
                         vreinterpretq_u16_u8(q1.val[0]));
          r1 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[1]),
                         vreinterpretq_u16_u8(q1.val[1]));

          s0 = vreinterpretq_u32_u16(r0.val[0]);
          s1 = vreinterpretq_u32_u16(r1.val[0]);
          s2 = vreinterpretq_u32_u16(r0.val[1]);
          s3 = vreinterpretq_u32_u16(r1.val[1]);
      }

      if (xor) {
          v0 = vld1q_u32(dst);
          v1 = vld1q_u32(dst + 4);
          v2 = vld1q_u32(dst + 8);
          v3 = vld1q_u32(dst + 12);
          s0 = veorq_u32(s0, v0);
          s1 = veorq_u32(s1, v1);
          s2 = veorq_u32(s2, v2);
          s3 = veorq_u32(s3, v3);
      }

      vst1q_u32(dst,      s0);
      vst1q_u32(dst + 4,  s1);
      vst1q_u32(dst + 8,  s2);
      vst1q_u32(dst + 12, s3);

      dst += 16;
  }
}
Example #6
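A compile-only intrinsics test in the GCC testsuite style: it checks that vld1q_u32 accepts a uint32_t pointer (here a null constant) and yields a uint32x4_t; the function is never meant to be executed.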
void test_vld1Qu32 (void)
{
  uint32x4_t out_uint32x4_t;

  out_uint32x4_t = vld1q_u32 (0);
}
Example #7
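Crypto++'s runtime NEON probe: it executes a handful of NEON instructions, including vld1q_u32, under a structured-exception handler on MSVC or a SIGILL handler with setjmp/longjmp elsewhere, and reports NEON as unavailable if any of them fault.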
bool CPU_ProbeNEON()
{
#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
	return false;
#elif (CRYPTOPP_ARM_NEON_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
	volatile bool result = true;
	__try
	{
		uint32_t v1[4] = {1,1,1,1};
		uint32x4_t x1 = vld1q_u32(v1);
		uint64_t v2[2] = {1,1};
		uint64x2_t x2 = vld1q_u64(v2);

		uint32x4_t x3 = vdupq_n_u32(2);
		x3 = vsetq_lane_u32(vgetq_lane_u32(x1,0),x3,0);
		x3 = vsetq_lane_u32(vgetq_lane_u32(x1,3),x3,3);
		uint64x2_t x4 = vdupq_n_u64(2);
		x4 = vsetq_lane_u64(vgetq_lane_u64(x2,0),x4,0);
		x4 = vsetq_lane_u64(vgetq_lane_u64(x2,1),x4,1);

		result = !!(vgetq_lane_u32(x3,0) | vgetq_lane_u64(x4,1));
	}
	__except (EXCEPTION_EXECUTE_HANDLER)
	{
		return false;
	}
	return result;
# else

	// longjmp and clobber warnings. Volatile is required.
	// http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
	volatile bool result = true;

	volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
	if (oldHandler == SIG_ERR)
		return false;

	volatile sigset_t oldMask;
	if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
		return false;

	if (setjmp(s_jmpSIGILL))
		result = false;
	else
	{
		uint32_t v1[4] = {1,1,1,1};
		uint32x4_t x1 = vld1q_u32(v1);
		uint64_t v2[2] = {1,1};
		uint64x2_t x2 = vld1q_u64(v2);

		uint32x4_t x3 = {0,0,0,0};
		x3 = vsetq_lane_u32(vgetq_lane_u32(x1,0),x3,0);
		x3 = vsetq_lane_u32(vgetq_lane_u32(x1,3),x3,3);
		uint64x2_t x4 = {0,0};
		x4 = vsetq_lane_u64(vgetq_lane_u64(x2,0),x4,0);
		x4 = vsetq_lane_u64(vgetq_lane_u64(x2,1),x4,1);

		// Hack... GCC optimizes away the code and returns true
		result = !!(vgetq_lane_u32(x3,0) | vgetq_lane_u64(x4,1));
	}

	sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
	signal(SIGILL, oldHandler);
	return result;
# endif
#else
	return false;
#endif  // CRYPTOPP_ARM_NEON_AVAILABLE
}
Example #8
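A one-line overload that lets callers write vld1q(ptr) and have the pointer type select vld1q_u32, in the style of overload sets covering each NEON element type.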
inline uint32x4_t vld1q(const u32 * ptr) { return vld1q_u32(ptr); }
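A minimal usage sketch (the add4 helper and the surrounding scaffolding are hypothetical, assuming a u32 typedef for uint32_t):

#include <arm_neon.h>
#include <cstdint>

typedef uint32_t u32;  // assumption: u32 in the example above means uint32_t

inline uint32x4_t vld1q(const u32 * ptr) { return vld1q_u32(ptr); }

// Hypothetical helper: the pointer type alone picks the load overload, so the
// call site never spells out a type-suffixed intrinsic.
inline uint32x4_t add4(const u32 *a, const u32 *b)
{
	return vaddq_u32(vld1q(a), vld1q(b));
}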