static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,
                                 int src_width, int do_store) {
  int i;
  for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) {
    const uint8x16x4_t RGB = vld4q_u8((const uint8_t*)&argb[i]);
    const uint16x8_t R = vpaddlq_u8(RGB.val[2]);  // pair-wise adds
    const uint16x8_t G = vpaddlq_u8(RGB.val[1]);
    const uint16x8_t B = vpaddlq_u8(RGB.val[0]);
    int16x8_t U_tmp, V_tmp;
    CONVERT_RGB_TO_UV(R, G, B, 1, U_tmp, V_tmp);
    {
      const uint8x8_t U = vqrshrun_n_s16(U_tmp, 1);
      const uint8x8_t V = vqrshrun_n_s16(V_tmp, 1);
      if (do_store) {
        vst1_u8(u, U);
        vst1_u8(v, V);
      } else {
        const uint8x8_t prev_u = vld1_u8(u);
        const uint8x8_t prev_v = vld1_u8(v);
        vst1_u8(u, vrhadd_u8(U, prev_u));
        vst1_u8(v, vrhadd_u8(V, prev_v));
      }
    }
  }
  if (i < src_width) {  // left-over
    WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
  }
}
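The core trick above is vpaddlq_u8, which adds adjacent u8 lanes into widened u16 lanes, so 16 pixels collapse into 8 horizontally summed samples before the RGB-to-UV math. A minimal, self-contained sketch of just that step (the function name is ours, not libwebp's):

#include <arm_neon.h>
#include <stdint.h>

/* Sketch of the pair-wise widening step: average each horizontal pair of
 * bytes. vpaddlq_u8 adds adjacent u8 lanes into u16 lanes without overflow;
 * vrshrn_n_u16 divides by 2 with rounding and narrows back to u8.
 * Processes exactly 16 input bytes into 8 output bytes. */
static void avg_pairs_u8(const uint8_t* src, uint8_t* dst) {
  const uint8x16_t in = vld1q_u8(src);
  const uint16x8_t sums = vpaddlq_u8(in);       /* a0+a1, a2+a3, ... */
  const uint8x8_t avg = vrshrn_n_u16(sums, 1);  /* (sum + 1) >> 1 */
  vst1_u8(dst, avg);
}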
void dmz_deinterleave_RGBA_to_R(uint8_t *source, uint8_t *dest, int size) {
#if DMZ_HAS_NEON_COMPILETIME
  if (dmz_has_neon_runtime()) {
    assert(size >= 16);  // required for the vectorized handling of leftover_bytes; also, a reasonable expectation!
    for (int offset = 0; offset + 15 < size; offset += 16) {
      uint8x16x4_t r1 = vld4q_u8(&source[offset * 4]);
      vst1q_u8(&dest[offset], r1.val[0]);
    }
    // use "overlapping" to process the remaining bytes
    // See http://community.arm.com/groups/processors/blog/2010/05/10/coding-for-neon--part-2-dealing-with-leftovers
    if (size % 16 > 0) {
      int offset = size - 16;
      uint8x16x4_t r1 = vld4q_u8(&source[offset * 4]);
      vst1q_u8(&dest[offset], r1.val[0]);
    }
  } else
#endif
  {
    for (int offset = 0; offset + 7 < size; offset += 8) {
      int bufferOffset = offset * 4;
      dest[offset] = source[bufferOffset];
      dest[offset + 1] = source[bufferOffset + (1 * 4)];
      dest[offset + 2] = source[bufferOffset + (2 * 4)];
      dest[offset + 3] = source[bufferOffset + (3 * 4)];
      dest[offset + 4] = source[bufferOffset + (4 * 4)];
      dest[offset + 5] = source[bufferOffset + (5 * 4)];
      dest[offset + 6] = source[bufferOffset + (6 * 4)];
      dest[offset + 7] = source[bufferOffset + (7 * 4)];
    }
    int leftover_bytes = size % 8;  // each RGBA pixel is 4 bytes, so we can assume size % 4 == 0
    if (leftover_bytes > 0) {
      for (int offset = size - leftover_bytes; offset < size; offset += 4) {
        int bufferOffset = offset * 4;
        dest[offset] = source[bufferOffset];
        dest[offset + 1] = source[bufferOffset + (1 * 4)];
        dest[offset + 2] = source[bufferOffset + (2 * 4)];
        dest[offset + 3] = source[bufferOffset + (3 * 4)];
      }
    }
  }
}
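The "overlapping" tail handling above reprocesses the final 16 elements with the full-width kernel instead of falling back to scalar code. It is valid whenever each output depends only on the inputs at the same index and the destination does not alias the source, so rewriting a few already-written bytes just produces identical values. A sketch of the same strategy on a simpler kernel (the names are ours, not card.io's):

#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

/* Byte-wise saturating add with overlapped tail handling. Requires
 * size >= 16 and out not aliasing a or b: the last partial block is
 * covered by re-running the 16-byte kernel on the final 16 bytes,
 * which recomputes (identically) a few outputs already written. */
static void qadd_u8(const uint8_t* a, const uint8_t* b, uint8_t* out, int size) {
  assert(size >= 16);
  int offset;
  for (offset = 0; offset + 16 <= size; offset += 16) {
    vst1q_u8(&out[offset], vqaddq_u8(vld1q_u8(&a[offset]), vld1q_u8(&b[offset])));
  }
  if (size % 16 != 0) {  /* overlap: redo the last full vector */
    offset = size - 16;
    vst1q_u8(&out[offset], vqaddq_u8(vld1q_u8(&a[offset]), vld1q_u8(&b[offset])));
  }
}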
uint64_t popcnt_neon_vcnt(const uint8_t* data, const size_t size) {
    const size_t chunk_size = 16 * 4 * 2;

    uint8_t* ptr = const_cast<uint8_t*>(data);

    const size_t n = size / chunk_size;
    const size_t k = size % chunk_size;

    uint32x4_t sum = vcombine_u32(vcreate_u32(0), vcreate_u32(0));

    for (size_t i = 0; i < n; i++, ptr += chunk_size) {
        uint8x16x4_t input0 = vld4q_u8(ptr + 0 * 16 * 4);
        uint8x16x4_t input1 = vld4q_u8(ptr + 1 * 16 * 4);

        // Each vcntq_u8 lane is at most 8, so summing eight of them stays
        // <= 64 and cannot overflow a u8 lane.
        uint8x16_t t0 = vcntq_u8(input0.val[0]);
        t0 = vaddq_u8(t0, vcntq_u8(input0.val[1]));
        t0 = vaddq_u8(t0, vcntq_u8(input0.val[2]));
        t0 = vaddq_u8(t0, vcntq_u8(input0.val[3]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[0]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[1]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[2]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[3]));

        // Widen pair-wise to u16, then accumulate into the u32 running sum.
        const uint16x8_t t1 = vpaddlq_u8(t0);
        sum = vpadalq_u16(sum, t1);
    }

    uint32_t scalar = 0;
    uint32_t tmp[4];
    vst1q_u32(tmp, sum);
    for (int i = 0; i < 4; i++) {
        scalar += tmp[i];
    }

    for (size_t j = 0; j < k; j++) {
        scalar += lookup8bit[ptr[j]];
    }

    return scalar;
}
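The scalar tail indexes an external lookup8bit table that the snippet does not define. A plausible equivalent (ours; the original project ships a precomputed table) is simply the popcount of every possible byte value:

#include <stdint.h>

/* Hypothetical stand-in for the external lookup8bit table used above:
 * lookup8bit[b] holds the number of set bits in byte b. Fill once at
 * startup before calling popcnt_neon_vcnt. */
static uint8_t lookup8bit[256];

static void init_lookup8bit(void) {
  for (int i = 0; i < 256; i++) {
    uint8_t n = 0;
    for (int b = i; b != 0; b >>= 1) n += (uint8_t)(b & 1);
    lookup8bit[i] = n;
  }
}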
static void ConvertBGRAToRGB(const uint32_t* src,
                             int num_pixels, uint8_t* dst) {
  const uint32_t* const end = src + (num_pixels & ~15);
  for (; src < end; src += 16) {
    const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
    const uint8x16x3_t tmp = { { pixel.val[2], pixel.val[1], pixel.val[0] } };
    vst3q_u8(dst, tmp);
    dst += 48;
  }
  VP8LConvertBGRAToRGB_C(src, num_pixels & 15, dst);  // left-overs
}
// gcc 4.6.0 had some trouble (NDK-r9) with this code. We only use it for
// gcc-4.8.x at least.
static void ConvertBGRAToRGBA(const uint32_t* src,
                              int num_pixels, uint8_t* dst) {
  const uint32_t* const end = src + (num_pixels & ~15);
  for (; src < end; src += 16) {
    uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
    // swap B and R. (VSWP d0,d2 has no intrinsics equivalent!)
    const uint8x16_t tmp = pixel.val[0];
    pixel.val[0] = pixel.val[2];
    pixel.val[2] = tmp;
    vst4q_u8(dst, pixel);
    dst += 64;
  }
  VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst);  // left-overs
}
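Both libwebp routines above follow the same pattern: vld4q_u8 deinterleaves 16 pixels into one register per channel, the channel registers are permuted, and vst4q_u8 (or vst3q_u8 to drop a channel) reinterleaves them. A standalone sketch of that pattern for a single 16-pixel block (the function name is ours):

#include <arm_neon.h>
#include <stdint.h>

/* Convert exactly 16 BGRA pixels (64 bytes) to RGBA in place. Because
 * vld4q_u8 splits the stream into four 16-lane channel registers,
 * swapping B and R is just exchanging two struct members before
 * vst4q_u8 reinterleaves. */
static void bgra_to_rgba_16px(uint8_t* pixels) {
  uint8x16x4_t px = vld4q_u8(pixels);
  const uint8x16_t tmp = px.val[0];  /* save B */
  px.val[0] = px.val[2];             /* B <- R */
  px.val[2] = tmp;                   /* R <- B */
  vst4q_u8(pixels, px);
}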
void FilterEffect::forceValidPreMultipliedPixels()
{
    // Must operate on pre-multiplied results; other formats cannot have invalid pixels.
    if (!m_premultipliedImageResult)
        return;

    Uint8ClampedArray* imageArray = m_premultipliedImageResult.get();
    unsigned char* pixelData = imageArray->data();
    int pixelArrayLength = imageArray->length();

    // We must have four bytes per pixel, and complete pixels
    ASSERT(!(pixelArrayLength % 4));

#if HAVE(ARM_NEON_INTRINSICS)
    if (pixelArrayLength >= 64) {
        unsigned char* lastPixel = pixelData + (pixelArrayLength & ~0x3f);
        do {
            // Increments pixelData by 64.
            uint8x16x4_t sixteenPixels = vld4q_u8(pixelData);
            sixteenPixels.val[0] = vminq_u8(sixteenPixels.val[0], sixteenPixels.val[3]);
            sixteenPixels.val[1] = vminq_u8(sixteenPixels.val[1], sixteenPixels.val[3]);
            sixteenPixels.val[2] = vminq_u8(sixteenPixels.val[2], sixteenPixels.val[3]);
            vst4q_u8(pixelData, sixteenPixels);
            pixelData += 64;
        } while (pixelData < lastPixel);

        pixelArrayLength &= 0x3f;
        if (!pixelArrayLength)
            return;
    }
#endif

    int numPixels = pixelArrayLength / 4;

    // Iterate over each pixel, checking alpha and adjusting color components if necessary
    while (--numPixels >= 0) {
        // Alpha is the 4th byte in a pixel
        unsigned char a = *(pixelData + 3);
        // Clamp each component to alpha, and increment the pixel location
        for (int i = 0; i < 3; ++i) {
            if (*pixelData > a)
                *pixelData = a;
            ++pixelData;
        }
        // Increment for alpha
        ++pixelData;
    }
}
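The clamp exists because premultiplied storage keeps c' = c * a / 255, so every valid pixel satisfies r, g, b <= a; anything larger is the kind of invalid pixel the routine repairs. A one-pixel predicate stating that invariant (ours, for illustration):

#include <stdbool.h>
#include <stdint.h>

/* A premultiplied RGBA pixel is valid only if each color channel is at
 * most alpha; this predicate is the scalar form of what the routine
 * above enforces in bulk with vminq_u8. */
static bool is_valid_premultiplied(const uint8_t px[4]) {
  return px[0] <= px[3] && px[1] <= px[3] && px[2] <= px[3];
}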
/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
static void shuffle4_neon(uint8_t* const dest, const uint8_t* const src,
                          const size_t vectorizable_elements, const size_t total_elements) {
  size_t i, j, k;
  static const size_t bytesoftype = 4;
  uint8x16x4_t r0;

  for (i = 0, k = 0; i < vectorizable_elements * bytesoftype; i += 64, k++) {
    /* Load (and permute) 64 bytes to the structure r0 */
    r0 = vld4q_u8(src + i);
    /* Store the results in the destination vector */
    for (j = 0; j < 4; j++) {
      vst1q_u8(dest + total_elements * j + k * 16, r0.val[j]);
    }
  }
}
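For reference, shuffle4_neon performs an AoS-to-SoA byte transpose: byte j of element i lands at dest[total_elements * j + i]. A scalar equivalent (ours, covering what a tail loop over the non-vectorizable remainder would also have to do):

#include <stddef.h>
#include <stdint.h>

/* Scalar reference for shuffle4_neon's output layout: byte j of
 * 4-byte element i is written to dest[elements * j + i]. */
static void shuffle4_scalar(uint8_t* dest, const uint8_t* src, size_t elements) {
  for (size_t i = 0; i < elements; i++) {
    for (size_t j = 0; j < 4; j++) {
      dest[elements * j + i] = src[i * 4 + j];
    }
  }
}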
/* Type-overloaded wrapper: lets generic code call vld4q(ptr) and resolve to the u8 variant by argument type. */
inline uint8x16x4_t vld4q(const u8 * ptr) { return vld4q_u8(ptr); }
/* Compile-only intrinsics test: checks that vld4q_u8 accepts the argument and yields a uint8x16x4_t; it is never executed. */
void test_vld4Qu8 (void)
{
  uint8x16x4_t out_uint8x16x4_t;

  out_uint8x16x4_t = vld4q_u8 (0);
}