/*
 * u32x4 mm mul: C = A * B on unsigned 32-bit integers (sums wrap modulo 2^32).
 *
 * All three matrices are treated as dense and column-major:
 *   A is Row x T,   element (r, t) at A[t * Row + r]
 *   B is T   x Col, element (t, c) at B[c * T   + t]
 *   C is Row x Col, element (r, c) at C[c * Row + r]
 *
 * Parameters:
 *   A, B  input matrices (only read).
 *   Row   number of rows of A and C.
 *   T     shared inner dimension (columns of A, rows of B).
 *   Col   number of columns of B and C.
 *   C     output matrix; every element is overwritten.
 *
 * Row and T must be multiples of 4: the kernel always loads and stores whole
 * 4-lane vectors and has no scalar tail, so other sizes access memory past
 * the end of the matrices. When T <= 0 the output is filled with zeros.
 */
void mw_neon_mm_mul_u32x4(unsigned int * A, int Row, int T, unsigned int * B, int Col, unsigned int * C)
{
    int i, k, j;

    for (i = 0; i < Row; i += 4) {
        for (k = 0; k < Col; k += 1) {
            /* Column k of B holds T elements, so its base offset is k * T. */
            const unsigned int *b_col = B + k * T;
            /* Accumulator for C(i..i+3, k). */
            uint32x4_t neon_c = vmovq_n_u32(0);

            for (j = 0; j < T; j += 4) {
                /*
                 * Rows i..i+3 of columns j..j+3 of A. The column stride of A
                 * is Row, so the base of column j is j * Row (not j * T).
                 */
                const unsigned int *a_col = A + j * Row + i;
                uint32x4_t neon_a0 = vld1q_u32(a_col);
                uint32x4_t neon_a1 = vld1q_u32(a_col + Row);
                uint32x4_t neon_a2 = vld1q_u32(a_col + 2 * Row);
                uint32x4_t neon_a3 = vld1q_u32(a_col + 3 * Row);

                /* B(j..j+3, k), each element broadcast to a full vector. */
                uint32x4_t neon_b  = vld1q_u32(b_col + j);
                uint32x4_t neon_b0 = vdupq_n_u32(vgetq_lane_u32(neon_b, 0));
                uint32x4_t neon_b1 = vdupq_n_u32(vgetq_lane_u32(neon_b, 1));
                uint32x4_t neon_b2 = vdupq_n_u32(vgetq_lane_u32(neon_b, 2));
                uint32x4_t neon_b3 = vdupq_n_u32(vgetq_lane_u32(neon_b, 3));

                /* C(:, k) += A(:, j + n) * B(j + n, k) for n = 0..3. */
                neon_c = vaddq_u32(vmulq_u32(neon_a0, neon_b0), neon_c);
                neon_c = vaddq_u32(vmulq_u32(neon_a1, neon_b1), neon_c);
                neon_c = vaddq_u32(vmulq_u32(neon_a2, neon_b2), neon_c);
                neon_c = vaddq_u32(vmulq_u32(neon_a3, neon_b3), neon_c);
            }

            /* Write the finished 4-element slice of column k once. */
            vst1q_u32(C + k * Row + i, neon_c);
        }
    }
}
/*
 * Exercises the vmulq_u32 intrinsic (lane-wise multiply of two uint32x4_t
 * vectors). Both operands are left uninitialised and the product is never
 * read, so the function has no observable result.
 * NOTE(review): presumably a compile-only check that the intrinsic is
 * accepted by the compiler - confirm before relying on it at run time.
 */
void test_vmulQu32 (void)
{
  uint32x4_t lhs;
  uint32x4_t rhs;
  uint32x4_t product;

  product = vmulq_u32 (lhs, rhs);
}
/*
 * u32x4 mv mul: C = A * B on unsigned 32-bit integers (sums wrap modulo 2^32).
 *
 * A is a dense, column-major Row x T matrix: element (r, t) is A[t * Row + r].
 * B is a vector of T elements and C a vector of Row elements.
 *
 * Parameters:
 *   A    input matrix (only read).
 *   Row  number of rows of A, length of C.
 *   T    number of columns of A, length of B.
 *   B    input vector (only read).
 *   C    output vector; every element is overwritten.
 *
 * Row and T must be multiples of 4: the kernel always loads and stores whole
 * 4-lane vectors and has no scalar tail, so other sizes access memory past
 * the end of the operands.
 */
void mw_neon_mv_mul_u32x4(unsigned int * A, int Row, int T, unsigned int * B, unsigned int * C)
{
    int i, k;

    for (i = 0; i < Row; i += 4) {
        /* Accumulator for C[i..i+3]. */
        uint32x4_t neon_c = vmovq_n_u32(0);

        for (k = 0; k < T; k += 4) {
            /*
             * Rows i..i+3 of columns k..k+3 of A. The column stride of A is
             * Row, so the base of column k is k * Row (not k * T).
             */
            const unsigned int *a_col = A + k * Row + i;
            uint32x4_t neon_a0 = vld1q_u32(a_col);
            uint32x4_t neon_a1 = vld1q_u32(a_col + Row);
            uint32x4_t neon_a2 = vld1q_u32(a_col + 2 * Row);
            uint32x4_t neon_a3 = vld1q_u32(a_col + 3 * Row);

            /* B[k..k+3], each element broadcast to a full vector. */
            uint32x4_t neon_b  = vld1q_u32(B + k);
            uint32x4_t neon_b0 = vdupq_n_u32(vgetq_lane_u32(neon_b, 0));
            uint32x4_t neon_b1 = vdupq_n_u32(vgetq_lane_u32(neon_b, 1));
            uint32x4_t neon_b2 = vdupq_n_u32(vgetq_lane_u32(neon_b, 2));
            uint32x4_t neon_b3 = vdupq_n_u32(vgetq_lane_u32(neon_b, 3));

            /* C[i..i+3] += A(:, k + n) * B[k + n] for n = 0..3. */
            neon_c = vaddq_u32(vmulq_u32(neon_a0, neon_b0), neon_c);
            neon_c = vaddq_u32(vmulq_u32(neon_a1, neon_b1), neon_c);
            neon_c = vaddq_u32(vmulq_u32(neon_a2, neon_b2), neon_c);
            neon_c = vaddq_u32(vmulq_u32(neon_a3, neon_b3), neon_c);
        }

        vst1q_u32(C + i, neon_c);
    }
}
/*
 * Blends a single source value into a vertical run of 16-bit pixels.
 *
 * For every one of the `height` pixels, starting at `device` and stepping
 * `deviceRB` bytes between pixels, the scalar tail loop computes
 *     *pixel = SkCompact_rgb_16((src32 + SkExpand_rgb_16(*pixel) * scale) >> 5)
 * and the NEON loop performs the same arithmetic on eight pixels at a time.
 *
 * Parameters:
 *   device    first pixel of the run; updated in place.
 *   height    number of pixels to process; values <= 0 do nothing.
 *   deviceRB  distance in bytes from one pixel of the run to the next.
 *   scale     multiplier applied to the expanded destination pixel.
 *   src32     value added to the scaled destination before the >> 5.
 *
 * LOAD_LANE_16, STORE_LANE_16, SK_G16_MASK_IN_PLACE, SkExpand_rgb_16 and
 * SkCompact_rgb_16 are defined outside this block.
 * NOTE(review): presumably LOAD_LANE_16 reads one pixel through `device`
 * into the given lane and advances `device` by deviceRB, and STORE_LANE_16
 * does the matching write through `dst` - confirm against the macros.
 *
 * Block comments are used throughout so that the body stays valid even if
 * line breaks are lost.
 */
void SkRGB16BlitterBlitV_neon(uint16_t* device, int height, size_t deviceRB, unsigned scale, uint32_t src32)
{
    if (height >= 8) {
        uint16_t* dst = device;

        /* prepare constants */
        uint16x8_t vdev = vdupq_n_u16(0);
        uint16x8_t vmaskq_g16 = vdupq_n_u16(SK_G16_MASK_IN_PLACE);
        uint16x8_t vmaskq_ng16 = vdupq_n_u16(~SK_G16_MASK_IN_PLACE);
        uint32x4_t vsrc32 = vdupq_n_u32(src32);
        uint32x4_t vscale5 = vdupq_n_u32((uint32_t)scale);

        while (height >= 8) {
            /* Gather eight pixels of the run into vdev. */
            LOAD_LANE_16(vdev, 0)
            LOAD_LANE_16(vdev, 1)
            LOAD_LANE_16(vdev, 2)
            LOAD_LANE_16(vdev, 3)
            LOAD_LANE_16(vdev, 4)
            LOAD_LANE_16(vdev, 5)
            LOAD_LANE_16(vdev, 6)
            LOAD_LANE_16(vdev, 7)

            /*
             * Expand_rgb_16: interleave the non-green and green bits of each
             * pixel so that every pixel occupies one 32-bit lane, then scale.
             */
            uint16x8x2_t vdst = vzipq_u16((vdev & vmaskq_ng16), (vdev & vmaskq_g16));
            uint32x4_t vdst32_lo = vmulq_u32(vreinterpretq_u32_u16(vdst.val[0]), vscale5);
            uint32x4_t vdst32_hi = vmulq_u32(vreinterpretq_u32_u16(vdst.val[1]), vscale5);

            /* Compact_rgb_16: add the source, drop 5 bits, fold back to 16 bits. */
            vdst32_lo = vaddq_u32(vdst32_lo, vsrc32);
            vdst32_hi = vaddq_u32(vdst32_hi, vsrc32);
            vdst32_lo = vshrq_n_u32(vdst32_lo, 5);
            vdst32_hi = vshrq_n_u32(vdst32_hi, 5);

            /* Low half-word supplies the non-green bits, high half-word the green bits. */
            uint16x4_t vtmp_lo = vmovn_u32(vdst32_lo) & vget_low_u16(vmaskq_ng16);
            uint16x4_t vtmp_hi = vshrn_n_u32(vdst32_lo, 16) & vget_low_u16(vmaskq_g16);
            uint16x4_t vdst16_lo = vorr_u16(vtmp_lo, vtmp_hi);
            vtmp_lo = vmovn_u32(vdst32_hi) & vget_low_u16(vmaskq_ng16);
            vtmp_hi = vshrn_n_u32(vdst32_hi, 16) & vget_low_u16(vmaskq_g16);
            uint16x4_t vdst16_hi = vorr_u16(vtmp_lo, vtmp_hi);

            /* Scatter the eight blended pixels back to the run. */
            STORE_LANE_16(vdst16_lo, 0)
            STORE_LANE_16(vdst16_lo, 1)
            STORE_LANE_16(vdst16_lo, 2)
            STORE_LANE_16(vdst16_lo, 3)
            STORE_LANE_16(vdst16_hi, 0)
            STORE_LANE_16(vdst16_hi, 1)
            STORE_LANE_16(vdst16_hi, 2)
            STORE_LANE_16(vdst16_hi, 3)

            height -= 8;
        }
    }

    /*
     * Scalar tail for the remaining (< 8) pixels. The test is `> 0` so that
     * a negative height cannot walk the pointer through memory indefinitely.
     */
    while (height > 0) {
        uint32_t dst32 = SkExpand_rgb_16(*device) * scale;
        *device = SkCompact_rgb_16((src32 + dst32) >> 5);
        device = (uint16_t*)((char*)device + deviceRB);
        height--;
    }
}