/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */
static void
unshuffle4_neon(uint8_t* const dest, const uint8_t* const src,
                const size_t vectorizable_elements, const size_t total_elements) {
  size_t i, j, k;
  static const size_t bytesoftype = 4;
  uint8x16x4_t r0;

  for (i = 0, k = 0; i < vectorizable_elements * bytesoftype; i += 64, k++) {
    /* Load 64 bytes to the structure r0. */
    for (j = 0; j < 4; j++) {
      r0.val[j] = vld1q_u8(src + total_elements * j + k * 16);
    }
    /* Store (with permutation) the results in the destination vector. */
    vst4q_u8(dest + k * 64, r0);
  }
}
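For reference, here is a scalar sketch of what the vectorized routine computes, under our own naming (it is not part of the original source): byte j of output element i is gathered from byte plane j of the shuffled buffer, which is exactly the addressing the vld1q_u8/vst4q_u8 pair implements 16 elements at a time.

/* Scalar sketch (hypothetical helper, not in the original source):
 * reconstruct element i by gathering its j-th byte from plane j. */
static void unshuffle4_scalar(uint8_t* const dest, const uint8_t* const src,
                              const size_t elements, const size_t total_elements) {
  size_t i, j;
  for (i = 0; i < elements; i++) {
    for (j = 0; j < 4; j++) {
      dest[i * 4 + j] = src[total_elements * j + i];
    }
  }
}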
// gcc 4.6.0 had some trouble (NDK-r9) with this code. We only use it for
// gcc-4.8.x at least.
static void ConvertBGRAToRGBA(const uint32_t* src,
                              int num_pixels, uint8_t* dst) {
  const uint32_t* const end = src + (num_pixels & ~15);
  for (; src < end; src += 16) {
    uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
    // Swap B and R. (VSWP d0,d2 has no intrinsics equivalent!)
    const uint8x16_t tmp = pixel.val[0];
    pixel.val[0] = pixel.val[2];
    pixel.val[2] = tmp;
    vst4q_u8(dst, pixel);
    dst += 64;
  }
  VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst);  // left-overs
}
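The loop above deinterleaves 16 BGRA pixels into four planar registers, swaps the B and R planes, and re-interleaves on store. A per-pixel scalar sketch of the same transform, with our own (hypothetical) naming:

/* Scalar sketch of the same B<->R swap (hypothetical helper, not libwebp API). */
static void ConvertBGRAToRGBA_Scalar(const uint32_t* src, int num_pixels,
                                     uint8_t* dst) {
  int i;
  for (i = 0; i < num_pixels; ++i) {
    const uint8_t* p = (const uint8_t*)(src + i);
    dst[4 * i + 0] = p[2];  /* R (byte 2 of BGRA) */
    dst[4 * i + 1] = p[1];  /* G */
    dst[4 * i + 2] = p[0];  /* B (byte 0 of BGRA) */
    dst[4 * i + 3] = p[3];  /* A */
  }
}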
void FilterEffect::forceValidPreMultipliedPixels()
{
    // Must operate on pre-multiplied results; other formats cannot have invalid pixels.
    if (!m_premultipliedImageResult)
        return;

    Uint8ClampedArray* imageArray = m_premultipliedImageResult.get();
    unsigned char* pixelData = imageArray->data();
    int pixelArrayLength = imageArray->length();

    // We must have four bytes per pixel, and complete pixels.
    ASSERT(!(pixelArrayLength % 4));

#if HAVE(ARM_NEON_INTRINSICS)
    if (pixelArrayLength >= 64) {
        unsigned char* lastPixel = pixelData + (pixelArrayLength & ~0x3f);
        do {
            // Increments pixelData by 64.
            uint8x16x4_t sixteenPixels = vld4q_u8(pixelData);
            sixteenPixels.val[0] = vminq_u8(sixteenPixels.val[0], sixteenPixels.val[3]);
            sixteenPixels.val[1] = vminq_u8(sixteenPixels.val[1], sixteenPixels.val[3]);
            sixteenPixels.val[2] = vminq_u8(sixteenPixels.val[2], sixteenPixels.val[3]);
            vst4q_u8(pixelData, sixteenPixels);
            pixelData += 64;
        } while (pixelData < lastPixel);

        pixelArrayLength &= 0x3f;
        if (!pixelArrayLength)
            return;
    }
#endif

    int numPixels = pixelArrayLength / 4;

    // Iterate over each pixel, checking alpha and adjusting color components if necessary.
    while (--numPixels >= 0) {
        // Alpha is the 4th byte in a pixel.
        unsigned char a = *(pixelData + 3);
        // Clamp each component to alpha, and increment the pixel location.
        for (int i = 0; i < 3; ++i) {
            if (*pixelData > a)
                *pixelData = a;
            ++pixelData;
        }
        // Increment for alpha.
        ++pixelData;
    }
}
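The invariant being restored is that in premultiplied alpha every colour channel satisfies c <= a; vminq_u8 clamps the R, G, and B planes of 16 pixels against the A plane in three instructions. A small checker for that invariant (a hypothetical helper, not WebKit API):

// Hypothetical checker (not WebKit API): true if every colour channel
// of a premultiplied RGBA buffer is <= its alpha.
static bool hasValidPremultipliedPixels(const unsigned char* data, size_t bytes)
{
    for (size_t i = 0; i + 4 <= bytes; i += 4) {
        const unsigned char a = data[i + 3];
        if (data[i] > a || data[i + 1] > a || data[i + 2] > a)
            return false;
    }
    return true;
}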
/* Build an RGBA palette from the RGB and separate alpha palettes. */
void
png_riffle_palette_rgba(png_structrp png_ptr, png_row_infop row_info)
{
   png_const_colorp palette = png_ptr->palette;
   png_bytep riffled_palette = png_ptr->riffled_palette;
   png_const_bytep trans_alpha = png_ptr->trans_alpha;
   int num_trans = png_ptr->num_trans;
   int i;

   /* Initially black, opaque. */
   uint8x16x4_t w = {{
      vdupq_n_u8(0x00),
      vdupq_n_u8(0x00),
      vdupq_n_u8(0x00),
      vdupq_n_u8(0xff),
   }};

   if (row_info->bit_depth != 8)
   {
      png_error(png_ptr, "bit_depth must be 8 for png_riffle_palette_rgba");
      return;
   }

   /* First, riffle the RGB colours into an RGBA palette; the A value is
    * set to opaque for now.
    */
   for (i = 0; i < (1 << row_info->bit_depth); i += 16)
   {
      uint8x16x3_t v = vld3q_u8((png_const_bytep)(palette + i));
      w.val[0] = v.val[0];
      w.val[1] = v.val[1];
      w.val[2] = v.val[2];
      vst4q_u8(riffled_palette + (i << 2), w);
   }

   /* Fix up the missing transparency values. */
   for (i = 0; i < num_trans; i++)
      riffled_palette[(i << 2) + 3] = trans_alpha[i];
}
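Riffling pays off because each palette entry then occupies exactly 4 contiguous bytes, so palette expansion becomes a fixed-size copy per index. An illustrative consumer, under our own naming (libpng's actual expansion code differs):

#include <string.h>

/* Hypothetical consumer of the riffled palette (not libpng API): expand a
 * row of 8-bit palette indices to RGBA with one 4-byte copy per pixel. */
static void expand_row_rgba(uint8_t* out, const uint8_t* row, size_t row_width,
                            const uint8_t* riffled_palette) {
  size_t i;
  for (i = 0; i < row_width; i++)
      memcpy(out + 4 * i, riffled_palette + 4 * row[i], 4);
}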
inline void vst4q(u8 * ptr, const uint8x16x4_t & v) { return vst4q_u8(ptr, v); }
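This wrapper lets templated code call a single overloaded name regardless of element type. Sibling overloads follow the same pattern; Carotene defines a similar family, but these exact lines are a sketch (the underlying vst4q_u16/vst4q_s16/vst4q_f32 intrinsics are standard arm_neon.h):

// Illustrative sibling overloads (a sketch; Carotene's own set may differ).
inline void vst4q(u16 * ptr, const uint16x8x4_t & v) { return vst4q_u16(ptr, v); }
inline void vst4q(s16 * ptr, const int16x8x4_t & v) { return vst4q_s16(ptr, v); }
inline void vst4q(f32 * ptr, const float32x4x4_t & v) { return vst4q_f32(ptr, v); }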
void combineUYVY(const Size2D &size,
                 const u8 * srcyBase, ptrdiff_t srcyStride,
                 const u8 * srcuBase, ptrdiff_t srcuStride,
                 const u8 * srcvBase, ptrdiff_t srcvStride,
                 u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
#ifndef __ANDROID__
    size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
#endif
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0u; i < size.height; ++i)
    {
        const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i);
        const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i);
        const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t syj = 0u, sj = 0u, dj = 0u;

#ifndef __ANDROID__
        for (; sj < roiw32; sj += 32, syj += 64, dj += 128)
        {
            internal::prefetch(srcy + syj);
            internal::prefetch(srcu + sj);
            internal::prefetch(srcv + sj);

            uint8x16x2_t v_y = vld2q_u8(srcy + syj);
            uint8x16x4_t v_dst;
            v_dst.val[0] = vld1q_u8(srcu + sj);
            v_dst.val[1] = v_y.val[0];
            v_dst.val[2] = vld1q_u8(srcv + sj);
            v_dst.val[3] = v_y.val[1];
            vst4q_u8(dst + dj, v_dst);

            v_y = vld2q_u8(srcy + syj + 32);
            v_dst.val[0] = vld1q_u8(srcu + sj + 16);
            v_dst.val[1] = v_y.val[0];
            v_dst.val[2] = vld1q_u8(srcv + sj + 16);
            v_dst.val[3] = v_y.val[1];
            vst4q_u8(dst + dj + 64, v_dst);
        }
#endif

        for (; sj < roiw8; sj += 8, syj += 16, dj += 32)
        {
            uint8x8x2_t v_y = vld2_u8(srcy + syj);
            uint8x8x4_t v_dst;
            v_dst.val[0] = vld1_u8(srcu + sj);
            v_dst.val[1] = v_y.val[0];
            v_dst.val[2] = vld1_u8(srcv + sj);
            v_dst.val[3] = v_y.val[1];
            vst4_u8(dst + dj, v_dst);
        }

        for (; sj < size.width; ++sj, syj += 2, dj += 4)
        {
            dst[dj] = srcu[sj];
            dst[dj + 1] = srcy[syj];
            dst[dj + 2] = srcv[sj];
            dst[dj + 3] = srcy[syj + 1];
        }
    }
#else
    (void)size;
    (void)srcyBase;
    (void)srcyStride;
    (void)srcuBase;
    (void)srcuStride;
    (void)srcvBase;
    (void)srcvStride;
    (void)dstBase;
    (void)dstStride;
#endif
}
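As the scalar tail shows, each width step emits one UYVY macropixel (U0 Y0 V0 Y1) covering two image pixels; the vector paths do the same for 8 or 32 macropixels at a time. A standalone scalar reference for one row, under our own (hypothetical) naming:

// Scalar sketch of one row of UYVY packing (hypothetical helper, not
// Carotene API): 'macropixels' counts 2-pixel groups, matching size.width.
static void packUYVYRow(uint8_t* dst, const uint8_t* srcy,
                        const uint8_t* srcu, const uint8_t* srcv,
                        size_t macropixels)
{
    for (size_t j = 0; j < macropixels; ++j) {
        dst[4 * j + 0] = srcu[j];
        dst[4 * j + 1] = srcy[2 * j];
        dst[4 * j + 2] = srcv[j];
        dst[4 * j + 3] = srcy[2 * j + 1];
    }
}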