void GPUgstate::FastLoadBoneMatrix(u32 addr) {
	const u32_le *src = (const u32_le *)Memory::GetPointerUnchecked(addr);
	u32 num = boneMatrixNumber;
	u32 *dst = (u32 *)(boneMatrix + (num & 0x7F));

#ifdef _M_SSE
	// Each GE command word carries a 24-bit payload; shifting left by 8
	// restores the float bit pattern (the low 8 mantissa bits become zero).
	__m128i row1 = _mm_slli_epi32(_mm_loadu_si128((const __m128i *)src), 8);
	__m128i row2 = _mm_slli_epi32(_mm_loadu_si128((const __m128i *)(src + 4)), 8);
	__m128i row3 = _mm_slli_epi32(_mm_loadu_si128((const __m128i *)(src + 8)), 8);
	// The destination is 16-byte aligned when the matrix index is a multiple
	// of 4, so aligned stores can be used for that common case.
	if ((num & 0x3) == 0) {
		_mm_store_si128((__m128i *)dst, row1);
		_mm_store_si128((__m128i *)(dst + 4), row2);
		_mm_store_si128((__m128i *)(dst + 8), row3);
	} else {
		_mm_storeu_si128((__m128i *)dst, row1);
		_mm_storeu_si128((__m128i *)(dst + 4), row2);
		_mm_storeu_si128((__m128i *)(dst + 8), row3);
	}
#elif PPSSPP_ARCH(ARM_NEON)
	const uint32x4_t row1 = vshlq_n_u32(vld1q_u32(src), 8);
	const uint32x4_t row2 = vshlq_n_u32(vld1q_u32(src + 4), 8);
	const uint32x4_t row3 = vshlq_n_u32(vld1q_u32(src + 8), 8);
	vst1q_u32(dst, row1);
	vst1q_u32(dst + 4, row2);
	vst1q_u32(dst + 8, row3);
#else
	// Scalar fallback: a 4x3 bone matrix is 12 words.
	for (int i = 0; i < 12; i++) {
		dst[i] = src[i] << 8;
	}
#endif

	num += 12;
	gstate.boneMatrixNumber = (GE_CMD_BONEMATRIXNUMBER << 24) | (num & 0x7F);
}
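/* Why "<< 8" works: the matrix words uploaded through GE commands hold the
 * upper 24 bits of an IEEE-754 float in their payload. A minimal sketch of
 * reading a stored word back as a float (illustrative only; BoneWordToFloat
 * is a hypothetical helper, not part of the original source): */
#include <cstring>

static float BoneWordToFloat(u32 stored) {  // stored == payload << 8
	float f;
	std::memcpy(&f, &stored, sizeof(f));  // reinterpret the bits as a float
	return f;
}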
int normL1_(const uchar* a, const uchar* b, int n)
{
    int j = 0, d = 0;
#if CV_SSE
    __m128i d0 = _mm_setzero_si128();

    for( ; j <= n - 16; j += 16 )
    {
        __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
        __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
        // PSADBW sums absolute byte differences into the two 64-bit halves.
        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    }

    for( ; j <= n - 4; j += 4 )
    {
        __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
        __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    }
    d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
#elif CV_NEON
    uint32x4_t v_sum = vdupq_n_u32(0u);  // was vdupq_n_u32(0.0f): a float literal for a u32 lane

    for ( ; j <= n - 16; j += 16)
    {
        uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
        // Widen the byte differences to 16 and then 32 bits before accumulating.
        uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
    }

    uint CV_DECL_ALIGNED(16) buf[4];
    vst1q_u32(buf, v_sum);
    d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
    // Scalar tail (and the entire loop when neither SIMD path is compiled in).
    for( ; j <= n - 4; j += 4 )
    {
        d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
             std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
    }
    for( ; j < n; j++ )
        d += std::abs(a[j] - b[j]);
    return d;
}
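/* Usage sketch (not from the original source; assumes OpenCV's uchar
 * typedef is in scope): normL1_ returns the L1 distance between two
 * equal-length byte buffers. */
#include <cstdio>
#include <vector>

void normL1_demo()
{
    std::vector<uchar> a(1000, 3), b(1000, 7);
    int d = normL1_(a.data(), b.data(), (int)a.size());
    std::printf("%d\n", d);  // 1000 * |3 - 7| = 4000
}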
uint64_t popcnt_neon_vcnt(const uint8_t* data, const size_t size)
{
    const size_t chunk_size = 16 * 4 * 2;  // two vld4q loads = 128 bytes per iteration

    const uint8_t* ptr = data;  // no need for the original const_cast
    const size_t n = size / chunk_size;
    const size_t k = size % chunk_size;

    uint32x4_t sum = vcombine_u32(vcreate_u32(0), vcreate_u32(0));

    for (size_t i = 0; i < n; i++, ptr += chunk_size) {
        uint8x16x4_t input0 = vld4q_u8(ptr + 0 * 16 * 4);
        uint8x16x4_t input1 = vld4q_u8(ptr + 1 * 16 * 4);

        // Per-byte popcounts: eight vectors of counts (at most 8 each) sum to
        // at most 64 per lane, so the u8 accumulator cannot overflow.
        uint8x16_t t0 = vcntq_u8(input0.val[0]);
        t0 = vaddq_u8(t0, vcntq_u8(input0.val[1]));
        t0 = vaddq_u8(t0, vcntq_u8(input0.val[2]));
        t0 = vaddq_u8(t0, vcntq_u8(input0.val[3]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[0]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[1]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[2]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[3]));

        // Widen pairwise: u8 -> u16, then accumulate u16 -> u32.
        const uint16x8_t t1 = vpaddlq_u8(t0);
        sum = vpadalq_u16(sum, t1);
    }

    uint32_t scalar = 0;
    uint32_t tmp[4];
    vst1q_u32(tmp, sum);
    for (int i = 0; i < 4; i++) {
        scalar += tmp[i];
    }

    // Tail: table lookup for the remaining k bytes.
    for (size_t j = 0; j < k; j++) {
        scalar += lookup8bit[ptr[j]];
    }

    return scalar;
}
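/* The tail loop assumes an external lookup8bit table. One possible
 * definition that satisfies it (an assumption; the original defines its
 * table elsewhere): */
#include <array>
#include <cstdint>

static const std::array<uint8_t, 256> lookup8bit = [] {
    std::array<uint8_t, 256> t{};
    for (int v = 0; v < 256; v++) {
        int c = 0;
        for (int b = v; b != 0; b >>= 1)
            c += b & 1;  // count set bits of the byte value v
        t[v] = (uint8_t)c;
    }
    return t;
}();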
/* u32x4 mv mul: C = A * B for a column-major matrix A and vector B.
 * Row and T are assumed to be multiples of 4; note that the j = k * T + i
 * start index combined with the j += Row column stride is only consistent
 * when T == Row (a square matrix). */
void mw_neon_mv_mul_u32x4(unsigned int * A, int Row, int T, unsigned int * B, unsigned int * C)
{
	int i = 0;
	int k = 0;

	uint32x4_t neon_b, neon_c;
	uint32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	uint32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i += 4)
	{
		neon_c = vmovq_n_u32(0);

		for (k = 0; k < T; k += 4)
		{
			int j = k * T + i;

			/* Rows i..i+3 of four consecutive columns. */
			neon_a0 = vld1q_u32(A + j);
			j += Row;
			neon_a1 = vld1q_u32(A + j);
			j += Row;
			neon_a2 = vld1q_u32(A + j);
			j += Row;
			neon_a3 = vld1q_u32(A + j);

			/* Broadcast the four vector elements B[k..k+3]. */
			neon_b = vld1q_u32(B + k);
			neon_b0 = vdupq_n_u32(vgetq_lane_u32(neon_b, 0));
			neon_b1 = vdupq_n_u32(vgetq_lane_u32(neon_b, 1));
			neon_b2 = vdupq_n_u32(vgetq_lane_u32(neon_b, 2));
			neon_b3 = vdupq_n_u32(vgetq_lane_u32(neon_b, 3));

			neon_c = vaddq_u32(vmulq_u32(neon_a0, neon_b0), neon_c);
			neon_c = vaddq_u32(vmulq_u32(neon_a1, neon_b1), neon_c);
			neon_c = vaddq_u32(vmulq_u32(neon_a2, neon_b2), neon_c);
			neon_c = vaddq_u32(vmulq_u32(neon_a3, neon_b3), neon_c);
		}

		vst1q_u32(C + i, neon_c);
	}
}
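/* Scalar reference of the same computation (an illustration, not part of
 * the original library), assuming a column-major square matrix (T == Row): */
void mw_scalar_mv_mul_u32(unsigned int *A, int Row, int T,
                          unsigned int *B, unsigned int *C)
{
	for (int i = 0; i < Row; i++) {
		unsigned int acc = 0;
		for (int k = 0; k < T; k++)
			acc += A[k * Row + i] * B[k];  /* column k, row i */
		C[i] = acc;
	}
}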
/* Expands a palettized row into RGBA. */
int png_do_expand_palette_neon_rgba(png_structrp png_ptr, png_row_infop row_info,
    png_const_bytep row, png_bytepp ssp, png_bytepp ddp)
{
   png_uint_32 row_width = row_info->width;
   const png_uint_32 *riffled_palette =
      (const png_uint_32 *)png_ptr->riffled_palette;
   const png_uint_32 pixels_per_chunk = 4;
   png_uint_32 i;  /* unsigned, to match row_width in the loop comparison */

   PNG_UNUSED(row)

   if (row_width < pixels_per_chunk)
      return 0;

   /* This function originally gets the last byte of the output row.
    * The NEON part writes forward from a given position, so we have
    * to seek this back by 4 pixels x 4 bytes. */
   *ddp = *ddp - ((pixels_per_chunk * sizeof(png_uint_32)) - 1);

   for (i = 0; i < row_width; i += pixels_per_chunk) {
      uint32x4_t cur;
      png_bytep sp = *ssp - i, dp = *ddp - (i << 2);

      /* Gather four 32-bit RGBA palette entries, one per source index byte. */
      cur = vld1q_dup_u32 (riffled_palette + *(sp - 3));
      cur = vld1q_lane_u32(riffled_palette + *(sp - 2), cur, 1);
      cur = vld1q_lane_u32(riffled_palette + *(sp - 1), cur, 2);
      cur = vld1q_lane_u32(riffled_palette + *(sp - 0), cur, 3);
      vst1q_u32((void *)dp, cur);
   }

   if (i != row_width) {
      /* Remove the amount that wasn't processed. */
      i -= pixels_per_chunk;
   }

   /* Decrement output pointers. */
   *ssp = *ssp - i;
   *ddp = *ddp - (i << 2);
   return i;
}
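/* Scalar equivalent of the gather above (an illustration, not libpng code):
 * each source index byte selects one pre-riffled 32-bit RGBA entry. */
#include <string.h>

static void
expand_palette_scalar(const png_uint_32 *riffled_palette, png_const_bytep sp,
    png_bytep dp, png_uint_32 n)
{
   png_uint_32 x;
   for (x = 0; x < n; x++) {
      png_uint_32 rgba = riffled_palette[sp[x]];
      memcpy(dp + 4 * x, &rgba, 4);
   }
}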
/* u32x4 saturated sub */
void mw_neon_mm_qsub_u32x4(unsigned int * A, int Row, int Col, unsigned int * B, unsigned int * C)
{
	uint32x4_t neon_a, neon_b, neon_c;

	int size = Row * Col;
	int i = 0;
	int k = 0;

	for (i = 4; i <= size; i += 4)
	{
		k = i - 4;
		neon_a = vld1q_u32(A + k);
		neon_b = vld1q_u32(B + k);
		neon_c = vqsubq_u32(neon_a, neon_b);  /* clamps at 0 instead of wrapping */
		vst1q_u32(C + k, neon_c);
	}

	/* Scalar tail; saturate like vqsubq_u32 does (the original's plain
	 * subtraction here would wrap around on underflow). */
	k = i - 4;
	for (i = 0; i < size % 4; i++)
	{
		C[k + i] = A[k + i] >= B[k + i] ? A[k + i] - B[k + i] : 0;
	}
}
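/* Usage sketch (hypothetical): saturating subtraction over a 2x4 matrix. */
void qsub_demo(void)
{
	unsigned int a[8] = {5, 5, 5, 5, 5, 5, 5, 5};
	unsigned int b[8] = {3, 9, 3, 9, 3, 9, 3, 9};
	unsigned int c[8];
	mw_neon_mm_qsub_u32x4(a, 2, 4, b, c);
	/* c == {2, 0, 2, 0, 2, 0, 2, 0}: 5 - 9 saturates to 0. */
}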
static void
neon_w32_split_4_32_multiply_region(gf_t *gf, uint32_t *src, uint32_t *dst,
                                    uint32_t *d_end, uint8_t btable[8][4][16],
                                    uint32_t val, int xor, int altmap)
{
  int i, j;
#ifdef ARCH_AARCH64
  uint8x16_t tables[8][4];
#else
  uint8x8x2_t tables[8][4];
#endif
  uint32x4_t v0, v1, v2, v3, s0, s1, s2, s3;
  uint8x16_t p0, p1, p2, p3, si, mask1;
  uint16x8x2_t r0, r1;
  uint8x16x2_t q0, q1;

  /* Load the 8 x 4 nibble lookup tables into registers. */
  for (i = 0; i < 8; i++) {
    for (j = 0; j < 4; j++) {
#ifdef ARCH_AARCH64
      tables[i][j] = vld1q_u8(btable[i][j]);
#else
      tables[i][j].val[0] = vld1_u8(btable[i][j]);
      tables[i][j].val[1] = vld1_u8(btable[i][j] + 8);
#endif
    }
  }

  mask1 = vdupq_n_u8(0xf);

  while (dst < d_end) {
    v0 = vld1q_u32(src); src += 4;
    v1 = vld1q_u32(src); src += 4;
    v2 = vld1q_u32(src); src += 4;
    v3 = vld1q_u32(src); src += 4;

    if (altmap) {
      q0.val[0] = vreinterpretq_u8_u32(v0);
      q0.val[1] = vreinterpretq_u8_u32(v1);
      q1.val[0] = vreinterpretq_u8_u32(v2);
      q1.val[1] = vreinterpretq_u8_u32(v3);
    } else {
      /* Transpose so each vector holds the same byte of 16 words. */
      r0 = vtrnq_u16(vreinterpretq_u16_u32(v0), vreinterpretq_u16_u32(v2));
      r1 = vtrnq_u16(vreinterpretq_u16_u32(v1), vreinterpretq_u16_u32(v3));
      q0 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[0]),
                    vreinterpretq_u8_u16(r1.val[0]));
      q1 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[1]),
                    vreinterpretq_u8_u16(r1.val[1]));
    }

    /* Accumulate the product nibble by nibble: 8 source nibbles, each
     * contributing to all 4 output byte positions via table lookups. */
    si = vandq_u8(q0.val[0], mask1);
    p0 = vqtbl1q_u8(tables[0][0], si);
    p1 = vqtbl1q_u8(tables[0][1], si);
    p2 = vqtbl1q_u8(tables[0][2], si);
    p3 = vqtbl1q_u8(tables[0][3], si);

    si = vshrq_n_u8(q0.val[0], 4);
    p0 = veorq_u8(p0, vqtbl1q_u8(tables[1][0], si));
    p1 = veorq_u8(p1, vqtbl1q_u8(tables[1][1], si));
    p2 = veorq_u8(p2, vqtbl1q_u8(tables[1][2], si));
    p3 = veorq_u8(p3, vqtbl1q_u8(tables[1][3], si));

    si = vandq_u8(q0.val[1], mask1);
    p0 = veorq_u8(p0, vqtbl1q_u8(tables[2][0], si));
    p1 = veorq_u8(p1, vqtbl1q_u8(tables[2][1], si));
    p2 = veorq_u8(p2, vqtbl1q_u8(tables[2][2], si));
    p3 = veorq_u8(p3, vqtbl1q_u8(tables[2][3], si));

    si = vshrq_n_u8(q0.val[1], 4);
    p0 = veorq_u8(p0, vqtbl1q_u8(tables[3][0], si));
    p1 = veorq_u8(p1, vqtbl1q_u8(tables[3][1], si));
    p2 = veorq_u8(p2, vqtbl1q_u8(tables[3][2], si));
    p3 = veorq_u8(p3, vqtbl1q_u8(tables[3][3], si));

    si = vandq_u8(q1.val[0], mask1);
    p0 = veorq_u8(p0, vqtbl1q_u8(tables[4][0], si));
    p1 = veorq_u8(p1, vqtbl1q_u8(tables[4][1], si));
    p2 = veorq_u8(p2, vqtbl1q_u8(tables[4][2], si));
    p3 = veorq_u8(p3, vqtbl1q_u8(tables[4][3], si));

    si = vshrq_n_u8(q1.val[0], 4);
    p0 = veorq_u8(p0, vqtbl1q_u8(tables[5][0], si));
    p1 = veorq_u8(p1, vqtbl1q_u8(tables[5][1], si));
    p2 = veorq_u8(p2, vqtbl1q_u8(tables[5][2], si));
    p3 = veorq_u8(p3, vqtbl1q_u8(tables[5][3], si));

    si = vandq_u8(q1.val[1], mask1);
    p0 = veorq_u8(p0, vqtbl1q_u8(tables[6][0], si));
    p1 = veorq_u8(p1, vqtbl1q_u8(tables[6][1], si));
    p2 = veorq_u8(p2, vqtbl1q_u8(tables[6][2], si));
    p3 = veorq_u8(p3, vqtbl1q_u8(tables[6][3], si));

    si = vshrq_n_u8(q1.val[1], 4);
    p0 = veorq_u8(p0, vqtbl1q_u8(tables[7][0], si));
    p1 = veorq_u8(p1, vqtbl1q_u8(tables[7][1], si));
    p2 = veorq_u8(p2, vqtbl1q_u8(tables[7][2], si));
    p3 = veorq_u8(p3, vqtbl1q_u8(tables[7][3], si));

    if (altmap) {
      s0 = vreinterpretq_u32_u8(p0);
      s1 = vreinterpretq_u32_u8(p1);
      s2 = vreinterpretq_u32_u8(p2);
      s3 = vreinterpretq_u32_u8(p3);
    } else {
      /* Transpose back to the normal word layout. */
      q0 = vtrnq_u8(p0, p1);
      q1 = vtrnq_u8(p2, p3);
      r0 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[0]),
                     vreinterpretq_u16_u8(q1.val[0]));
      r1 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[1]),
                     vreinterpretq_u16_u8(q1.val[1]));
      s0 = vreinterpretq_u32_u16(r0.val[0]);
      s1 = vreinterpretq_u32_u16(r1.val[0]);
      s2 = vreinterpretq_u32_u16(r0.val[1]);
      s3 = vreinterpretq_u32_u16(r1.val[1]);
    }

    if (xor) {
      v0 = vld1q_u32(dst);
      v1 = vld1q_u32(dst + 4);
      v2 = vld1q_u32(dst + 8);
      v3 = vld1q_u32(dst + 12);
      s0 = veorq_u32(s0, v0);
      s1 = veorq_u32(s1, v1);
      s2 = veorq_u32(s2, v2);
      s3 = veorq_u32(s3, v3);
    }

    vst1q_u32(dst, s0);
    vst1q_u32(dst + 4, s1);
    vst1q_u32(dst + 8, s2);
    vst1q_u32(dst + 12, s3);
    dst += 16;
  }
}
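/* On 32-bit ARM the tables are uint8x8x2_t and vqtbl1q_u8 is not a NEON
 * intrinsic; a compatibility shim along these lines (a sketch of what
 * gf-complete provides, defined before the function above) makes the same
 * body compile on both architectures: */
#ifndef ARCH_AARCH64
#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
                                       vtbl2_u8(tbl, vget_high_u8(v)))
#endif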
/* Type-overloaded store wrapper so generic code can call vst1q() uniformly. */
inline void vst1q(u32 * ptr, const uint32x4_t & v)
{
    return vst1q_u32(ptr, v);
}
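/* Companion overloads in the same style (illustrative; assumes matching
 * typedefs such as f32 and s32 exist in this codebase): */
inline void vst1q(f32 * ptr, const float32x4_t & v)
{
    return vst1q_f32(ptr, v);
}

inline void vst1q(s32 * ptr, const int32x4_t & v)
{
    return vst1q_s32(ptr, v);
}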