#include <stddef.h>
#include <stdint.h>
#include <arm_neon.h>

/* D135 directional intra predictor for a 4x4 block (libvpx): X is the pixel
   up-left of the block, A..D the four pixels above it, I..L the four pixels
   to its left. */
void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
  const uint32x2_t zero = vdup_n_u32(0);
  const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
  const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
  const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
  /* 3-tap smoothing of the concatenated edge L K J I X A B C:
     a truncating 2-tap average of the outer neighbours, then a rounding
     average with the centre pixel. */
  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  /* Each output row is a 4-byte window of the filtered edge, sliding by one
     byte per row. */
  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
}
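A minimal usage sketch, not part of libvpx: it assumes the predictor above is visible in the same translation unit and that the file is built for a NEON target. Note that `above` must have one readable pixel before it (the corner X) and enough padding after D for the 8-byte vld1_u8 load.

/* Hypothetical driver: fills a padded above row and a left column, then
   predicts a 4x4 block into dst with stride 4 and prints it. */
#include <stdio.h>

int main(void) {
  /* above_row[0] is X (the up-left corner); [1..4] are A..D; the remaining
     bytes are padding read by vld1_u8(above - 1). */
  uint8_t above_row[8] = { 129, 130, 131, 132, 133, 0, 0, 0 };
  uint8_t left_col[4] = { 127, 126, 125, 124 };
  uint8_t dst[4 * 4];

  vpx_d135_predictor_4x4_neon(dst, 4, above_row + 1, left_col);

  for (int r = 0; r < 4; ++r) {
    for (int c = 0; c < 4; ++c) printf("%4d", dst[r * 4 + c]);
    printf("\n");
  }
  return 0;
}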
#include <arm_neon.h>

uint64x1_t test_vshl_n_u64(uint64x1_t a) {
  return vshl_n_u64(a, 9);  /* shift the single 64-bit lane left by 9 */
}
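A small self-contained check, sketched here as an illustration, that the immediate-shift intrinsic matches a plain 64-bit left shift on the single lane:

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint64_t x = 0x0123456789abcdefULL;
  uint64x1_t v = vdup_n_u64(x);
  uint64x1_t s = vshl_n_u64(v, 9);         /* one 64-bit lane, shifted left by 9 */
  uint64_t out = vget_lane_u64(s, 0);
  printf("match: %d\n", out == (x << 9));  /* expected output: match: 1 */
  return 0;
}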
/*
 * BCH code: fold a 128-bit input down to 64 bits by XOR-ing shifted copies of
 * both halves. The types uv64 and ui64 and the helpers ui64_shiftl_xor() /
 * ui64_shiftr_xor() are defined elsewhere in the project; ui64 is presumably
 * a 64-bit NEON lane type here, since vshl_n_u64 is applied to it directly.
 */
uint64_t BCH128to64(uv64 in) {
  /* Low half: fold in b1 shifted left by each tap. Four accumulators
     (res_a..res_d) break the dependency chain. */
  register ui64 b1 = in[0];
  register ui64 res_a = b1;
  register ui64 res_b = vshl_n_u64(b1, 2);
  register ui64 res_c = vshl_n_u64(b1, 7);
  register ui64 res_d = vshl_n_u64(b1, 8);
  res_a = ui64_shiftl_xor(res_a, b1, 10);
  res_b = ui64_shiftl_xor(res_b, b1, 12);
  res_c = ui64_shiftl_xor(res_c, b1, 14);
  res_d = ui64_shiftl_xor(res_d, b1, 15);
  res_a = ui64_shiftl_xor(res_a, b1, 16);
  res_b = ui64_shiftl_xor(res_b, b1, 23);
  res_c = ui64_shiftl_xor(res_c, b1, 25);
  res_d = ui64_shiftl_xor(res_d, b1, 27);
  res_a = ui64_shiftl_xor(res_a, b1, 28);
  res_b = ui64_shiftl_xor(res_b, b1, 30);
  res_c = ui64_shiftl_xor(res_c, b1, 31);
  res_d = ui64_shiftl_xor(res_d, b1, 32);
  res_a = ui64_shiftl_xor(res_a, b1, 33);
  res_b = ui64_shiftl_xor(res_b, b1, 37);
  res_c = ui64_shiftl_xor(res_c, b1, 38);
  res_d = ui64_shiftl_xor(res_d, b1, 39);
  res_a = ui64_shiftl_xor(res_a, b1, 40);
  res_b = ui64_shiftl_xor(res_b, b1, 41);
  res_c = ui64_shiftl_xor(res_c, b1, 42);
  res_d = ui64_shiftl_xor(res_d, b1, 44);
  res_a = ui64_shiftl_xor(res_a, b1, 45);
  res_b = ui64_shiftl_xor(res_b, b1, 48);
  res_c = ui64_shiftl_xor(res_c, b1, 58);
  res_d = ui64_shiftl_xor(res_d, b1, 61);
  res_a = ui64_shiftl_xor(res_a, b1, 63);

  /* High half: fold in b2 shifted right by each tap. */
  register ui64 b2 = in[1];
  res_b = ui64_shiftr_xor(res_b, b2, 62);
  res_c = ui64_shiftr_xor(res_c, b2, 57);
  res_d = ui64_shiftr_xor(res_d, b2, 56);
  res_a = ui64_shiftr_xor(res_a, b2, 54);
  res_b = ui64_shiftr_xor(res_b, b2, 52);
  res_c = ui64_shiftr_xor(res_c, b2, 50);
  res_d = ui64_shiftr_xor(res_d, b2, 49);
  res_a = ui64_shiftr_xor(res_a, b2, 48);
  res_b = ui64_shiftr_xor(res_b, b2, 41);
  res_c = ui64_shiftr_xor(res_c, b2, 39);
  res_d = ui64_shiftr_xor(res_d, b2, 37);
  res_a = ui64_shiftr_xor(res_a, b2, 36);
  res_b = ui64_shiftr_xor(res_b, b2, 34);
  res_c = ui64_shiftr_xor(res_c, b2, 33);
  res_d = ui64_shiftr_xor(res_d, b2, 32);
  res_a = ui64_shiftr_xor(res_a, b2, 31);
  res_b = ui64_shiftr_xor(res_b, b2, 27);
  res_c = ui64_shiftr_xor(res_c, b2, 26);
  res_d = ui64_shiftr_xor(res_d, b2, 25);
  res_a = ui64_shiftr_xor(res_a, b2, 24);
  res_b = ui64_shiftr_xor(res_b, b2, 23);
  res_c = ui64_shiftr_xor(res_c, b2, 22);
  res_d = ui64_shiftr_xor(res_d, b2, 20);
  res_a = ui64_shiftr_xor(res_a, b2, 19);
  res_b = ui64_shiftr_xor(res_b, b2, 16);
  res_c = ui64_shiftr_xor(res_c, b2, 6);
  res_d = ui64_shiftr_xor(res_d, b2, 3);
  res_a = ui64_shiftr_xor(res_a, b2, 1);

  /* Combine the accumulators; -(b2 & 1) is all ones when the low bit of b2
     is set, so the result is conditionally inverted. */
  ui64 res = res_a ^ res_b ^ res_c ^ res_d;
  return (uint64_t)res ^ ((uint64_t)(-(b2 & 1)));
}
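The shift-and-XOR helpers are not shown here. A plausible scalar reading, given as an assumption (the real ui64_shiftl_xor / ui64_shiftr_xor operate on the project's ui64 type, not uint64_t), is an accumulator XOR-ed with a shifted copy of the input:

#include <stdint.h>

/* Hypothetical scalar equivalents of the project helpers used above. */
static inline uint64_t shiftl_xor_u64(uint64_t acc, uint64_t v, unsigned n) {
  return acc ^ (v << n);  /* fold in a left-shifted copy */
}
static inline uint64_t shiftr_xor_u64(uint64_t acc, uint64_t v, unsigned n) {
  return acc ^ (v >> n);  /* fold in a right-shifted copy */
}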