/* Compile-time smoke test for the vsubq_u8 intrinsic: verifies that the
 * argument and result types (uint8x16_t) match the intrinsic's signature.
 * NOTE(review): the arguments are deliberately left uninitialized -- this
 * style of generated test is only compiled (and its assembly inspected),
 * never executed, so the uninitialized reads are intentional. */
void test_vsubQu8 (void) { uint8x16_t out_uint8x16_t; uint8x16_t arg0_uint8x16_t; uint8x16_t arg1_uint8x16_t; out_uint8x16_t = vsubq_u8 (arg0_uint8x16_t, arg1_uint8x16_t); }
// Subtract the green channel from the blue and red channels of num_pixels
// ARGB pixels.  Four pixels (16 bytes) are processed per NEON iteration;
// the remaining (num_pixels & 3) pixels are delegated to the plain-C
// implementation.  kGreenShuffle (defined elsewhere) is presumably a vtbl
// control that replicates each pixel's green byte across its four channel
// slots -- the code subtracts the shuffled result from the pixel itself.
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const uint32_t* const vec_end = argb_data + (num_pixels & ~3);
  const uint8x8_t green_sel = vld1_u8(kGreenShuffle);
  while (argb_data < vec_end) {
    const uint8x16_t pixels = vld1q_u8((uint8_t*)argb_data);
    // vtbl1 operates on 64-bit halves, so shuffle low and high separately.
    const uint8x16_t green_dup =
        vcombine_u8(vtbl1_u8(vget_low_u8(pixels), green_sel),
                    vtbl1_u8(vget_high_u8(pixels), green_sel));
    vst1q_u8((uint8_t*)argb_data, vsubq_u8(pixels, green_dup));
    argb_data += 4;
  }
  // fallthrough and finish off with plain-C
  VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3);
}
// Subtract the green channel from the blue and red channels of num_pixels
// ARGB pixels, four pixels per NEON iteration.  The shuffle control that
// selects each pixel's green byte is 128-bit when USE_VTBLQ is defined and
// 64-bit otherwise; DoGreenShuffle() (defined elsewhere) hides that
// difference.  The trailing (num_pixels & 3) pixels go to the C fallback.
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const uint32_t* const vec_end = argb_data + (num_pixels & ~3);
#ifdef USE_VTBLQ
  const uint8x16_t green_ctl = vld1q_u8(kGreenShuffle);
#else
  const uint8x8_t green_ctl = vld1_u8(kGreenShuffle);
#endif
  while (argb_data < vec_end) {
    const uint8x16_t pixels = vld1q_u8((uint8_t*)argb_data);
    const uint8x16_t green_dup = DoGreenShuffle(pixels, green_ctl);
    vst1q_u8((uint8_t*)argb_data, vsubq_u8(pixels, green_dup));
    argb_data += 4;
  }
  // fallthrough and finish off with plain-C
  VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3);
}
// Lane-wise (wrapping, mod 256) subtraction of two byte arrays:
//   res[i] = a[i] - b[i]  for i in [0, n).
// 16 bytes per NEON iteration, with a scalar loop for the remainder.
//
// BUG FIX: the original strode by 16 unconditionally, so for any n that is
// not a multiple of 16 the final vld1q_u8/vst1q_u8 read and wrote up to 15
// bytes past the end of a, b and res (undefined behavior / heap overflow).
// The vector loop now stops at the last full 16-byte block and the tail is
// handled scalar-wise; behavior for multiples of 16 is unchanged.
void ar_vsub_u8_neon(uint8_t* res, const uint8_t* a, const uint8_t* b, uint32_t n) {
  uint32_t i = 0;
  for (; i + 16 <= n; i += 16) {
    const uint8x16_t a_loaded = vld1q_u8(&a[i]);
    const uint8x16_t b_loaded = vld1q_u8(&b[i]);
    vst1q_u8(&res[i], vsubq_u8(a_loaded, b_loaded));
  }
  for (; i < n; i++) {
    res[i] = (uint8_t)(a[i] - b[i]);
  }
}
/* u8x16 sub: element-wise C = A - B (wrapping, mod 256) over a Row*Col
 * byte matrix.  Full 16-byte blocks are subtracted with NEON; the trailing
 * size % 16 elements are handled by a plain scalar loop. */
void mw_neon_mm_sub_u8x16(unsigned char * A, int Row, int Col, unsigned char * B, unsigned char * C)
{
	const int size = Row * Col;
	const int vec_end = size - (size % 16);  /* start of the scalar tail */
	int idx;

	for (idx = 0; idx < vec_end; idx += 16) {
		const uint8x16_t neon_a = vld1q_u8(A + idx);
		const uint8x16_t neon_b = vld1q_u8(B + idx);
		vst1q_u8(C + idx, vsubq_u8(neon_a, neon_b));
	}
	for (; idx < size; idx++) {
		C[idx] = A[idx] - B[idx];
	}
}
// Overload-style wrapper that dispatches to the unsigned-8-bit lane-wise
// subtraction intrinsic; lets generic code call vsubq() regardless of the
// element type.
inline uint8x16_t vsubq(const uint8x16_t & v0, const uint8x16_t & v1)
{
    const uint8x16_t diff = vsubq_u8(v0, v1);
    return diff;
}
// Translate 6-bit values 0..63 to the Base64 alphabet.  There are five sets:
// #  From      To         Abs Delta  Characters
// 0  [0..25]   [65..90]   +65  +65   ABCDEFGHIJKLMNOPQRSTUVWXYZ
// 1  [26..51]  [97..122]  +71  +6    abcdefghijklmnopqrstuvwxyz
// 2  [52..61]  [48..57]   -4   -75   0123456789
// 3  [62]      [43]       -19  -15   +
// 4  [63]      [47]       -16  +3    /
// Each comparison below produces an all-ones/all-zeros lane mask selecting
// the values in that cumulative set, and REPLACE turns the mask into the
// per-set delta to add or subtract.
static inline uint8x16x4_t enc_translate (uint8x16x4_t in)
{
	uint8x16x4_t out;

	// The four 16-byte lanes are handled identically, so loop over them
	// instead of spelling every step out four times.
	for (int i = 0; i < 4; i++) {
		const uint8x16_t v = in.val[i];

		// Every value is at least in set 0, so start by adding 'A' (65),
		// then correct the shift for each later cumulative set.
		uint8x16_t t = vaddq_u8(v, vdupq_n_u8(65));
		t = vaddq_u8(t, REPLACE(CMPGT(v, 25), 6));   // sets 1..4
		t = vsubq_u8(t, REPLACE(CMPGT(v, 51), 75));  // sets 2..4
		t = vsubq_u8(t, REPLACE(CMPGT(v, 61), 15));  // sets 3..4
		t = vaddq_u8(t, REPLACE(CMPEQ(v, 63), 3));   // set 4
		out.val[i] = t;
	}
	return out;
}