// Process a block which is a mutiple of 16 wide and any height. static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *output_ptr, unsigned int src_pixels_per_line, int pixel_step, unsigned int output_height, unsigned int output_width, const uint8_t *filter) { const uint8x8_t f0 = vdup_n_u8(filter[0]); const uint8x8_t f1 = vdup_n_u8(filter[1]); unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; j += 16) { const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]); const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]); const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0); const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1); const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS); const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); vst1q_u8(output_ptr + j, vcombine_u8(out_lo, out_hi)); } src_ptr += src_pixels_per_line; output_ptr += output_width; } }
int main(void) { uint8_t v1_init[8] = {1, 1, 1, 1, 1, 1, 1, 1}; uint8_t v2_init[8] = {2, 2, 2, 2, 2, 2, 2, 2}; uint8x8_t v1 = vld1_u8 (v1_init); uint8x8_t v2 = vld1_u8 (v2_init); uint8x8x2_t vd1, vd2; union {uint8x8_t v; uint8_t buf[8];} d1, d2, d3, d4; int i; uint8_t odd, even; vd1 = vzip_u8(v1, vdup_n_u8(0)); vd2 = vzip_u8(v2, vdup_n_u8(0)); vst1_u8(d1.buf, vd1.val[0]); vst1_u8(d2.buf, vd1.val[1]); vst1_u8(d3.buf, vd2.val[0]); vst1_u8(d4.buf, vd2.val[1]); #ifdef __ARMEL__ odd = 1; even = 0; #else odd = 0; even = 1; #endif for (i = 0; i < 8; i++) if ((i % 2 == even && d4.buf[i] != 2) || (i % 2 == odd && d4.buf[i] != 0)) abort (); return 0; }
void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[], SkColor color, int width, SkPMColor) { int colA = SkColorGetA(color); int colR = SkColorGetR(color); int colG = SkColorGetG(color); int colB = SkColorGetB(color); colA = SkAlpha255To256(colA); uint8x8_t vcolR, vcolG, vcolB; uint16x8_t vcolA; if (width >= 8) { vcolA = vdupq_n_u16(colA); vcolR = vdup_n_u8(colR); vcolG = vdup_n_u8(colG); vcolB = vdup_n_u8(colB); } while (width >= 8) { uint8x8x4_t vdst; uint16x8_t vmask; uint16x8_t vmaskR, vmaskG, vmaskB; vdst = vld4_u8((uint8_t*)dst); vmask = vld1q_u16(src); // Get all the color masks on 5 bits vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT); vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS), SK_B16_BITS + SK_R16_BITS + 1); vmaskB = vmask & vdupq_n_u16(SK_B16_MASK); // Upscale to 0..32 vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4); vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4); vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4); vmaskR = vshrq_n_u16(vmaskR * vcolA, 8); vmaskG = vshrq_n_u16(vmaskG * vcolA, 8); vmaskB = vshrq_n_u16(vmaskB * vcolA, 8); vdst.val[NEON_A] = vdup_n_u8(0xFF); vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR); vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG); vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB); vst4_u8((uint8_t*)dst, vdst); dst += 8; src += 8; width -= 8; } for (int i = 0; i < width; i++) { dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]); } }
void ComputesRGBLuminanceMask_NEON(const uint8_t *aSourceData, int32_t aSourceStride, uint8_t *aDestData, int32_t aDestStride, const IntSize &aSize, float aOpacity) { int32_t redFactor = 55 * aOpacity; // 255 * 0.2125 * opacity int32_t greenFactor = 183 * aOpacity; // 255 * 0.7154 * opacity int32_t blueFactor = 18 * aOpacity; // 255 * 0.0721 const uint8_t *sourcePixel = aSourceData; int32_t sourceOffset = aSourceStride - 4 * aSize.width; uint8_t *destPixel = aDestData; int32_t destOffset = aDestStride - aSize.width; sourcePixel = aSourceData; int32_t remainderWidth = aSize.width % 8; int32_t roundedWidth = aSize.width - remainderWidth; uint16x8_t temp; uint8x8_t gray; uint8x8_t redVector = vdup_n_u8(redFactor); uint8x8_t greenVector = vdup_n_u8(greenFactor); uint8x8_t blueVector = vdup_n_u8(blueFactor); uint8x8_t fullBitVector = vdup_n_u8(255); uint8x8_t oneVector = vdup_n_u8(1); for (int32_t y = 0; y < aSize.height; y++) { // Calculate luminance by neon with 8 pixels per loop for (int32_t x = 0; x < roundedWidth; x += 8) { uint8x8x4_t argb = vld4_u8(sourcePixel); temp = vmull_u8(argb.val[GFX_ARGB32_OFFSET_R], redVector); // temp = red * redFactor temp = vmlal_u8(temp, argb.val[GFX_ARGB32_OFFSET_G], greenVector); // temp += green * greenFactor temp = vmlal_u8(temp, argb.val[GFX_ARGB32_OFFSET_B], blueVector); // temp += blue * blueFactor gray = vshrn_n_u16(temp, 8); // gray = temp >> 8 // Check alpha value uint8x8_t alphaVector = vtst_u8(argb.val[GFX_ARGB32_OFFSET_A], fullBitVector); gray = vmul_u8(gray, vand_u8(alphaVector, oneVector)); // Put the result to the 8 pixels vst1_u8(destPixel, gray); sourcePixel += 8 * 4; destPixel += 8; } // Calculate the rest pixels of the line by cpu for (int32_t x = 0; x < remainderWidth; x++) { if (sourcePixel[GFX_ARGB32_OFFSET_A] > 0) { *destPixel = (redFactor * sourcePixel[GFX_ARGB32_OFFSET_R]+ greenFactor * sourcePixel[GFX_ARGB32_OFFSET_G] + blueFactor * sourcePixel[GFX_ARGB32_OFFSET_B]) >> 8; } else { *destPixel = 0; } sourcePixel += 4; destPixel++; }
void png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { png_bytep rp = row; png_bytep rp_stop = row + row_info->rowbytes; uint8x8x4_t vdest; vdest.val[3] = vdup_n_u8(0); png_debug(1, "in png_read_filter_row_sub4_neon"); for (; rp < rp_stop; rp += 16) { uint32x2x4_t vtmp = vld4_u32(png_ptr(uint32_t,rp)); uint8x8x4_t *vrpt = png_ptr(uint8x8x4_t,&vtmp); uint8x8x4_t vrp = *vrpt; uint32x2x4_t *temp_pointer; vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]); vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]); vdest.val[2] = vadd_u8(vdest.val[1], vrp.val[2]); vdest.val[3] = vadd_u8(vdest.val[2], vrp.val[3]); vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0); } PNG_UNUSED(prev_row) }
void vp9_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { uint8x8_t d0u8 = vdup_n_u8(0); uint64x1_t d1u64 = vdup_n_u64(0); (void)above; d1u64 = vld1_u64((const uint64_t *)left); d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0); vst1_u8(dst, d0u8); dst += stride; d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1); vst1_u8(dst, d0u8); dst += stride; d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2); vst1_u8(dst, d0u8); dst += stride; d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3); vst1_u8(dst, d0u8); dst += stride; d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4); vst1_u8(dst, d0u8); dst += stride; d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5); vst1_u8(dst, d0u8); dst += stride; d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6); vst1_u8(dst, d0u8); dst += stride; d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7); vst1_u8(dst, d0u8); }
void test_vdup_nu8 (void) { uint8x8_t out_uint8x8_t; uint8_t arg0_uint8_t; out_uint8x8_t = vdup_n_u8 (arg0_uint8_t); }
/** * @brief Grayscale 변환을 수행하는 thread 함수. NEON 명령어를 사용한다. * * @param arg 계산해야할 정보가 담겨있는 _thread_data 형식의 구조체 * * @return NULL */ static void *thread_calc(void *arg) { struct _thread_data *data = (struct _thread_data *)arg; uint8x8_t rfac = vdup_n_u8 (76); uint8x8_t gfac = vdup_n_u8 (151); uint8x8_t bfac = vdup_n_u8 (29); int n = data->size / 8; int m = data->size % 8; int iTemp; unsigned char szTemp[8]; unsigned int *data_in = data->data_in; unsigned int *data_out = data->data_out; // 한 루프당 8픽셀씩 변환 (32 bytes) while (n--) { uint16x8_t temp; uint8x8x4_t rgb = vld4_u8 ((unsigned char *)data_in); uint8x8_t result; temp = vmull_u8(rgb.val[0], bfac); temp = vmlal_u8(temp, rgb.val[1], gfac); temp = vmlal_u8(temp, rgb.val[2], rfac); result = vshrn_n_u16 (temp, 8); vst1_u8(szTemp, result); iTemp = szTemp[0]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++; iTemp = szTemp[1]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++; iTemp = szTemp[2]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++; iTemp = szTemp[3]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++; iTemp = szTemp[4]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++; iTemp = szTemp[5]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++; iTemp = szTemp[6]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++; iTemp = szTemp[7]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++; } if (m) { argb8888_to_gray(data_in, data_out, m); } return NULL; }
void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int i; uint8x8_t d0u8 = vdup_n_u8(0); (void)left; d0u8 = vld1_u8(above); for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8); }
void png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { png_bytep rp = row; png_bytep rp_stop = row + row_info->rowbytes; png_const_bytep pp = prev_row; uint8x8_t vlast = vdup_n_u8(0); uint8x8x4_t vdest; vdest.val[3] = vdup_n_u8(0); png_debug(1, "in png_read_filter_row_paeth4_neon"); for (; rp < rp_stop; rp += 16, pp += 16) { uint32x2x4_t vtmp; uint8x8x4_t *vrpt, *vppt; uint8x8x4_t vrp, vpp; uint32x2x4_t *temp_pointer; vtmp = vld4_u32(png_ptr(uint32_t,rp)); vrpt = png_ptr(uint8x8x4_t,&vtmp); vrp = *vrpt; vtmp = vld4_u32(png_ptrc(uint32_t,pp)); vppt = png_ptr(uint8x8x4_t,&vtmp); vpp = *vppt; vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast); vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]); vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]); vdest.val[2] = paeth(vdest.val[1], vpp.val[2], vpp.val[1]); vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]); vdest.val[3] = paeth(vdest.val[2], vpp.val[3], vpp.val[2]); vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]); vlast = vpp.val[3]; vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0); } }
// Process a block exactly 8 wide and any height. static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *output_ptr, unsigned int src_pixels_per_line, int pixel_step, unsigned int output_height, const uint8_t *filter) { const uint8x8_t f0 = vdup_n_u8(filter[0]); const uint8x8_t f1 = vdup_n_u8(filter[1]); unsigned int i; for (i = 0; i < output_height; ++i) { const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]); const uint16x8_t a = vmull_u8(src_0, f0); const uint16x8_t b = vmlal_u8(a, src_1, f1); const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); vst1_u8(output_ptr, out); src_ptr += src_pixels_per_line; output_ptr += 8; } }
// 'do_above' and 'do_left' facilitate branch removal when inlined. static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int do_above, int do_left) { uint16x8_t sum_top; uint16x8_t sum_left; uint8x8_t dc0; if (do_above) { const uint8x16_t A0 = vld1q_u8(above); // top row const uint8x16_t A1 = vld1q_u8(above + 16); const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top const uint16x8_t p1 = vpaddlq_u8(A1); const uint16x8_t p2 = vaddq_u16(p0, p1); const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); const uint16x4_t p4 = vpadd_u16(p3, p3); const uint16x4_t p5 = vpadd_u16(p4, p4); sum_top = vcombine_u16(p5, p5); } if (do_left) { const uint8x16_t L0 = vld1q_u8(left); // left row const uint8x16_t L1 = vld1q_u8(left + 16); const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left const uint16x8_t p1 = vpaddlq_u8(L1); const uint16x8_t p2 = vaddq_u16(p0, p1); const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); const uint16x4_t p4 = vpadd_u16(p3, p3); const uint16x4_t p5 = vpadd_u16(p4, p4); sum_left = vcombine_u16(p5, p5); } if (do_above && do_left) { const uint16x8_t sum = vaddq_u16(sum_left, sum_top); dc0 = vrshrn_n_u16(sum, 6); } else if (do_above) { dc0 = vrshrn_n_u16(sum_top, 5); } else if (do_left) { dc0 = vrshrn_n_u16(sum_left, 5); } else { dc0 = vdup_n_u8(0x80); } { const uint8x16_t dc = vdupq_lane_u8(dc0, 0); int i; for (i = 0; i < 32; ++i) { vst1q_u8(dst + i * stride, dc); vst1q_u8(dst + i * stride + 16, dc); } } }
void vp9_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int j, k; uint8x8_t d2u8 = vdup_n_u8(0); uint8x16_t q0u8 = vdupq_n_u8(0); uint8x16_t q1u8 = vdupq_n_u8(0); (void)above; for (k = 0; k < 2; k++, left += 16) { q1u8 = vld1q_u8(left); d2u8 = vget_low_u8(q1u8); for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { q0u8 = vdupq_lane_u8(d2u8, 0); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 1); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 2); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 3); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 4); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 5); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 6); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; q0u8 = vdupq_lane_u8(d2u8, 7); vst1q_u8(dst, q0u8); vst1q_u8(dst + 16, q0u8); dst += stride; } } }
void png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { png_bytep rp = row; png_bytep rp_stop = row + row_info->rowbytes; uint8x16_t vtmp = vld1q_u8(rp); uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp); uint8x8x2_t vrp = *vrpt; uint8x8x4_t vdest; vdest.val[3] = vdup_n_u8(0); png_debug(1, "in png_read_filter_row_sub3_neon"); for (; rp < rp_stop;) { uint8x8_t vtmp1, vtmp2; uint32x2_t *temp_pointer; vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3); vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]); vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6); vdest.val[1] = vadd_u8(vdest.val[0], vtmp1); vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); vdest.val[2] = vadd_u8(vdest.val[1], vtmp2); vdest.val[3] = vadd_u8(vdest.val[2], vtmp1); vtmp = vld1q_u8(rp + 12); vrpt = png_ptr(uint8x8x2_t, &vtmp); vrp = *vrpt; vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); rp += 3; vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0); rp += 3; vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0); rp += 3; vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0); rp += 3; } PNG_UNUSED(prev_row) }
// 'do_above' and 'do_left' facilitate branch removal when inlined. static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int do_above, int do_left) { uint16x8_t sum_top; uint16x8_t sum_left; uint8x8_t dc0; if (do_above) { const uint8x16_t A = vld1q_u8(above); // top row const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); const uint16x4_t p2 = vpadd_u16(p1, p1); const uint16x4_t p3 = vpadd_u16(p2, p2); sum_top = vcombine_u16(p3, p3); } if (do_left) { const uint8x16_t L = vld1q_u8(left); // left row const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); const uint16x4_t p2 = vpadd_u16(p1, p1); const uint16x4_t p3 = vpadd_u16(p2, p2); sum_left = vcombine_u16(p3, p3); } if (do_above && do_left) { const uint16x8_t sum = vaddq_u16(sum_left, sum_top); dc0 = vrshrn_n_u16(sum, 5); } else if (do_above) { dc0 = vrshrn_n_u16(sum_top, 4); } else if (do_left) { dc0 = vrshrn_n_u16(sum_left, 4); } else { dc0 = vdup_n_u8(0x80); } { const uint8x16_t dc = vdupq_lane_u8(dc0, 0); int i; for (i = 0; i < 16; ++i) { vst1q_u8(dst + i * stride, dc); } } }
void vp9_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int i; uint16x8_t q1u16, q3u16; int16x8_t q1s16; uint8x8_t d0u8 = vdup_n_u8(0); uint32x2_t d2u32 = vdup_n_u32(0); d0u8 = vld1_dup_u8(above - 1); d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0); q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); for (i = 0; i < 4; i++, dst += stride) { q1u16 = vdupq_n_u16((uint16_t)left[i]); q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16)); d0u8 = vqmovun_s16(q1s16); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); } }
// 'do_above' and 'do_left' facilitate branch removal when inlined. static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int do_above, int do_left) { uint16x8_t sum_top; uint16x8_t sum_left; uint8x8_t dc0; if (do_above) { const uint8x8_t A = vld1_u8(above); // top row const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top const uint16x4_t p1 = vpadd_u16(p0, p0); const uint16x4_t p2 = vpadd_u16(p1, p1); sum_top = vcombine_u16(p2, p2); } if (do_left) { const uint8x8_t L = vld1_u8(left); // left border const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left const uint16x4_t p1 = vpadd_u16(p0, p0); const uint16x4_t p2 = vpadd_u16(p1, p1); sum_left = vcombine_u16(p2, p2); } if (do_above && do_left) { const uint16x8_t sum = vaddq_u16(sum_left, sum_top); dc0 = vrshrn_n_u16(sum, 4); } else if (do_above) { dc0 = vrshrn_n_u16(sum_top, 3); } else if (do_left) { dc0 = vrshrn_n_u16(sum_left, 3); } else { dc0 = vdup_n_u8(0x80); } { const uint8x8_t dc = vdup_lane_u8(dc0, 0); int i; for (i = 0; i < 8; ++i) { vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc)); } } }
void vp9_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { uint8x8_t d0u8 = vdup_n_u8(0); uint32x2_t d1u32 = vdup_n_u32(0); (void)above; d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0); d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); dst += stride; d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); dst += stride; d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); dst += stride; d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); }
static INLINE uint8x8x4_t read_4x8(unsigned char *src, int pitch) { uint8x8x4_t x; x.val[0] = x.val[1] = x.val[2] = x.val[3] = vdup_n_u8(0); x = vld4_lane_u8(src, x, 0); src += pitch; x = vld4_lane_u8(src, x, 1); src += pitch; x = vld4_lane_u8(src, x, 2); src += pitch; x = vld4_lane_u8(src, x, 3); src += pitch; x = vld4_lane_u8(src, x, 4); src += pitch; x = vld4_lane_u8(src, x, 5); src += pitch; x = vld4_lane_u8(src, x, 6); src += pitch; x = vld4_lane_u8(src, x, 7); return x; }
unsigned int vp8_sub_pixel_variance16x16_neon_func( const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { int i; DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 528); unsigned char *tmpp; unsigned char *tmpp2; uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8; uint8x8_t d19u8, d20u8, d21u8; int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; uint32x2_t d0u32, d10u32; int64x1_t d0s64, d1s64, d2s64, d3s64; uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8; uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16; uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16; int32x4_t q8s32, q9s32, q10s32; int64x2_t q0s64, q1s64, q5s64; tmpp2 = tmp + 272; tmpp = tmp; if (xoffset == 0) { // secondpass_bfilter16x16_only d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]); d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]); q11u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; for (i = 4; i > 0; i--) { q12u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; q13u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; q14u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; q15u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; __builtin_prefetch(src_ptr); __builtin_prefetch(src_ptr + src_pixels_per_line); __builtin_prefetch(src_ptr + src_pixels_per_line * 2); q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); d2u8 = vqrshrn_n_u16(q1u16, 7); d3u8 = vqrshrn_n_u16(q2u16, 7); d4u8 = vqrshrn_n_u16(q3u16, 7); d5u8 = vqrshrn_n_u16(q4u16, 7); d6u8 = vqrshrn_n_u16(q5u16, 7); d7u8 = vqrshrn_n_u16(q6u16, 7); d8u8 = vqrshrn_n_u16(q7u16, 7); d9u8 = vqrshrn_n_u16(q8u16, 7); q1u8 = vcombine_u8(d2u8, d3u8); q2u8 = vcombine_u8(d4u8, d5u8); q3u8 = vcombine_u8(d6u8, d7u8); q4u8 = vcombine_u8(d8u8, d9u8); q11u8 = q15u8; vst1q_u8((uint8_t *)tmpp2, q1u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q2u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q3u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q4u8); tmpp2 += 16; } } else if (yoffset == 0) { // firstpass_bfilter16x16_only d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]); d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]); for (i = 4; i > 0 ; i--) { d2u8 = vld1_u8(src_ptr); d3u8 = vld1_u8(src_ptr + 8); d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d5u8 = vld1_u8(src_ptr); d6u8 = vld1_u8(src_ptr + 8); d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d8u8 = vld1_u8(src_ptr); d9u8 = vld1_u8(src_ptr + 8); d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d11u8 = vld1_u8(src_ptr); d12u8 = vld1_u8(src_ptr + 8); d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; __builtin_prefetch(src_ptr); __builtin_prefetch(src_ptr + src_pixels_per_line); __builtin_prefetch(src_ptr + src_pixels_per_line * 2); q7u16 = vmull_u8(d2u8, d0u8); q8u16 = vmull_u8(d3u8, d0u8); q9u16 = vmull_u8(d5u8, d0u8); q10u16 = vmull_u8(d6u8, d0u8); q11u16 = vmull_u8(d8u8, d0u8); q12u16 = vmull_u8(d9u8, d0u8); q13u16 = vmull_u8(d11u8, d0u8); q14u16 = vmull_u8(d12u8, d0u8); d2u8 = vext_u8(d2u8, d3u8, 1); d5u8 = vext_u8(d5u8, d6u8, 1); d8u8 = vext_u8(d8u8, d9u8, 1); d11u8 = vext_u8(d11u8, d12u8, 1); q7u16 = vmlal_u8(q7u16, d2u8, d1u8); q9u16 = vmlal_u8(q9u16, d5u8, d1u8); q11u16 = vmlal_u8(q11u16, d8u8, d1u8); q13u16 = vmlal_u8(q13u16, d11u8, d1u8); d3u8 = vext_u8(d3u8, d4u8, 1); d6u8 = vext_u8(d6u8, d7u8, 1); d9u8 = vext_u8(d9u8, d10u8, 1); d12u8 = vext_u8(d12u8, d13u8, 1); q8u16 = vmlal_u8(q8u16, d3u8, d1u8); q10u16 = vmlal_u8(q10u16, d6u8, d1u8); q12u16 = vmlal_u8(q12u16, d9u8, d1u8); q14u16 = vmlal_u8(q14u16, d12u8, d1u8); d14u8 = vqrshrn_n_u16(q7u16, 7); d15u8 = vqrshrn_n_u16(q8u16, 7); d16u8 = vqrshrn_n_u16(q9u16, 7); d17u8 = vqrshrn_n_u16(q10u16, 7); d18u8 = vqrshrn_n_u16(q11u16, 7); d19u8 = vqrshrn_n_u16(q12u16, 7); d20u8 = vqrshrn_n_u16(q13u16, 7); d21u8 = vqrshrn_n_u16(q14u16, 7); q7u8 = vcombine_u8(d14u8, d15u8); q8u8 = vcombine_u8(d16u8, d17u8); q9u8 = vcombine_u8(d18u8, d19u8); q10u8 = vcombine_u8(d20u8, d21u8); vst1q_u8((uint8_t *)tmpp2, q7u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q8u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q9u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q10u8); tmpp2 += 16; } } else { d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]); d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]); d2u8 = vld1_u8(src_ptr); d3u8 = vld1_u8(src_ptr + 8); d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d5u8 = vld1_u8(src_ptr); d6u8 = vld1_u8(src_ptr + 8); d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d8u8 = vld1_u8(src_ptr); d9u8 = vld1_u8(src_ptr + 8); d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d11u8 = vld1_u8(src_ptr); d12u8 = vld1_u8(src_ptr + 8); d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; // First Pass: output_height lines x output_width columns (17x16) for (i = 3; i > 0; i--) { q7u16 = vmull_u8(d2u8, d0u8); q8u16 = vmull_u8(d3u8, d0u8); q9u16 = vmull_u8(d5u8, d0u8); q10u16 = vmull_u8(d6u8, d0u8); q11u16 = vmull_u8(d8u8, d0u8); q12u16 = vmull_u8(d9u8, d0u8); q13u16 = vmull_u8(d11u8, d0u8); q14u16 = vmull_u8(d12u8, d0u8); d2u8 = vext_u8(d2u8, d3u8, 1); d5u8 = vext_u8(d5u8, d6u8, 1); d8u8 = vext_u8(d8u8, d9u8, 1); d11u8 = vext_u8(d11u8, d12u8, 1); q7u16 = vmlal_u8(q7u16, d2u8, d1u8); q9u16 = vmlal_u8(q9u16, d5u8, d1u8); q11u16 = vmlal_u8(q11u16, d8u8, d1u8); q13u16 = vmlal_u8(q13u16, d11u8, d1u8); d3u8 = vext_u8(d3u8, d4u8, 1); d6u8 = vext_u8(d6u8, d7u8, 1); d9u8 = vext_u8(d9u8, d10u8, 1); d12u8 = vext_u8(d12u8, d13u8, 1); q8u16 = vmlal_u8(q8u16, d3u8, d1u8); q10u16 = vmlal_u8(q10u16, d6u8, d1u8); q12u16 = vmlal_u8(q12u16, d9u8, d1u8); q14u16 = vmlal_u8(q14u16, d12u8, d1u8); d14u8 = vqrshrn_n_u16(q7u16, 7); d15u8 = vqrshrn_n_u16(q8u16, 7); d16u8 = vqrshrn_n_u16(q9u16, 7); d17u8 = vqrshrn_n_u16(q10u16, 7); d18u8 = vqrshrn_n_u16(q11u16, 7); d19u8 = vqrshrn_n_u16(q12u16, 7); d20u8 = vqrshrn_n_u16(q13u16, 7); d21u8 = vqrshrn_n_u16(q14u16, 7); d2u8 = vld1_u8(src_ptr); d3u8 = vld1_u8(src_ptr + 8); d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d5u8 = vld1_u8(src_ptr); d6u8 = vld1_u8(src_ptr + 8); d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d8u8 = vld1_u8(src_ptr); d9u8 = vld1_u8(src_ptr + 8); d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; d11u8 = vld1_u8(src_ptr); d12u8 = vld1_u8(src_ptr + 8); d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; q7u8 = vcombine_u8(d14u8, d15u8); q8u8 = vcombine_u8(d16u8, d17u8); q9u8 = vcombine_u8(d18u8, d19u8); q10u8 = vcombine_u8(d20u8, d21u8); vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q9u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q10u8); tmpp += 16; } // First-pass filtering for rest 5 lines d14u8 = vld1_u8(src_ptr); d15u8 = vld1_u8(src_ptr + 8); d16u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; q9u16 = vmull_u8(d2u8, d0u8); q10u16 = vmull_u8(d3u8, d0u8); q11u16 = vmull_u8(d5u8, d0u8); q12u16 = vmull_u8(d6u8, d0u8); q13u16 = vmull_u8(d8u8, d0u8); q14u16 = vmull_u8(d9u8, d0u8); d2u8 = vext_u8(d2u8, d3u8, 1); d5u8 = vext_u8(d5u8, d6u8, 1); d8u8 = vext_u8(d8u8, d9u8, 1); q9u16 = vmlal_u8(q9u16, d2u8, d1u8); q11u16 = vmlal_u8(q11u16, d5u8, d1u8); q13u16 = vmlal_u8(q13u16, d8u8, d1u8); d3u8 = vext_u8(d3u8, d4u8, 1); d6u8 = vext_u8(d6u8, d7u8, 1); d9u8 = vext_u8(d9u8, d10u8, 1); q10u16 = vmlal_u8(q10u16, d3u8, d1u8); q12u16 = vmlal_u8(q12u16, d6u8, d1u8); q14u16 = vmlal_u8(q14u16, d9u8, d1u8); q1u16 = vmull_u8(d11u8, d0u8); q2u16 = vmull_u8(d12u8, d0u8); q3u16 = vmull_u8(d14u8, d0u8); q4u16 = vmull_u8(d15u8, d0u8); d11u8 = vext_u8(d11u8, d12u8, 1); d14u8 = vext_u8(d14u8, d15u8, 1); q1u16 = vmlal_u8(q1u16, d11u8, d1u8); q3u16 = vmlal_u8(q3u16, d14u8, d1u8); d12u8 = vext_u8(d12u8, d13u8, 1); d15u8 = vext_u8(d15u8, d16u8, 1); q2u16 = vmlal_u8(q2u16, d12u8, d1u8); q4u16 = vmlal_u8(q4u16, d15u8, d1u8); d10u8 = vqrshrn_n_u16(q9u16, 7); d11u8 = vqrshrn_n_u16(q10u16, 7); d12u8 = vqrshrn_n_u16(q11u16, 7); d13u8 = vqrshrn_n_u16(q12u16, 7); d14u8 = vqrshrn_n_u16(q13u16, 7); d15u8 = vqrshrn_n_u16(q14u16, 7); d16u8 = vqrshrn_n_u16(q1u16, 7); d17u8 = vqrshrn_n_u16(q2u16, 7); d18u8 = vqrshrn_n_u16(q3u16, 7); d19u8 = vqrshrn_n_u16(q4u16, 7); q5u8 = vcombine_u8(d10u8, d11u8); q6u8 = vcombine_u8(d12u8, d13u8); q7u8 = vcombine_u8(d14u8, d15u8); q8u8 = vcombine_u8(d16u8, d17u8); q9u8 = vcombine_u8(d18u8, d19u8); vst1q_u8((uint8_t *)tmpp, q5u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q6u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16; vst1q_u8((uint8_t *)tmpp, q9u8); // secondpass_filter d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]); d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]); tmpp = tmp; tmpp2 = tmpp + 272; q11u8 = vld1q_u8(tmpp); tmpp += 16; for (i = 4; i > 0; i--) { q12u8 = vld1q_u8(tmpp); tmpp += 16; q13u8 = vld1q_u8(tmpp); tmpp += 16; q14u8 = vld1q_u8(tmpp); tmpp += 16; q15u8 = vld1q_u8(tmpp); tmpp += 16; q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); d2u8 = vqrshrn_n_u16(q1u16, 7); d3u8 = vqrshrn_n_u16(q2u16, 7); d4u8 = vqrshrn_n_u16(q3u16, 7); d5u8 = vqrshrn_n_u16(q4u16, 7); d6u8 = vqrshrn_n_u16(q5u16, 7); d7u8 = vqrshrn_n_u16(q6u16, 7); d8u8 = vqrshrn_n_u16(q7u16, 7); d9u8 = vqrshrn_n_u16(q8u16, 7); q1u8 = vcombine_u8(d2u8, d3u8); q2u8 = vcombine_u8(d4u8, d5u8); q3u8 = vcombine_u8(d6u8, d7u8); q4u8 = vcombine_u8(d8u8, d9u8); q11u8 = q15u8; vst1q_u8((uint8_t *)tmpp2, q1u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q2u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q3u8); tmpp2 += 16; vst1q_u8((uint8_t *)tmpp2, q4u8); tmpp2 += 16; } } // sub_pixel_variance16x16_neon q8s32 = vdupq_n_s32(0); q9s32 = vdupq_n_s32(0); q10s32 = vdupq_n_s32(0); tmpp = tmp + 272; for (i = 0; i < 8; i++) { // sub_pixel_variance16x16_neon_loop q0u8 = vld1q_u8(tmpp); tmpp += 16; q1u8 = vld1q_u8(tmpp); tmpp += 16; q2u8 = vld1q_u8(dst_ptr); dst_ptr += dst_pixels_per_line; q3u8 = vld1q_u8(dst_ptr); dst_ptr += dst_pixels_per_line; d0u8 = vget_low_u8(q0u8); d1u8 = vget_high_u8(q0u8); d2u8 = vget_low_u8(q1u8); d3u8 = vget_high_u8(q1u8); q11u16 = vsubl_u8(d0u8, vget_low_u8(q2u8)); q12u16 = vsubl_u8(d1u8, vget_high_u8(q2u8)); q13u16 = vsubl_u8(d2u8, vget_low_u8(q3u8)); q14u16 = vsubl_u8(d3u8, vget_high_u8(q3u8)); d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); q9s32 = vmlal_s16(q9s32, d22s16, d22s16); q10s32 = vmlal_s16(q10s32, d23s16, d23s16); d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); q9s32 = vmlal_s16(q9s32, d24s16, d24s16); q10s32 = vmlal_s16(q10s32, d25s16, d25s16); d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); q9s32 = vmlal_s16(q9s32, d26s16, d26s16); q10s32 = vmlal_s16(q10s32, d27s16, d27s16); d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); q9s32 = vmlal_s16(q9s32, d28s16, d28s16); q10s32 = vmlal_s16(q10s32, d29s16, d29s16); } q10s32 = vaddq_s32(q10s32, q9s32); q0s64 = vpaddlq_s32(q8s32); q1s64 = vpaddlq_s32(q10s32); d0s64 = vget_low_s64(q0s64); d1s64 = vget_high_s64(q0s64); d2s64 = vget_low_s64(q1s64); d3s64 = vget_high_s64(q1s64); d0s64 = vadd_s64(d0s64, d1s64); d1s64 = vadd_s64(d2s64, d3s64); q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); return vget_lane_u32(d0u32, 0); }
void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[], SkColor color, int width, SkPMColor opaqueDst) { int colR = SkColorGetR(color); int colG = SkColorGetG(color); int colB = SkColorGetB(color); uint8x8_t vcolR, vcolG, vcolB; uint8x8_t vopqDstA, vopqDstR, vopqDstG, vopqDstB; if (width >= 8) { vcolR = vdup_n_u8(colR); vcolG = vdup_n_u8(colG); vcolB = vdup_n_u8(colB); vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst)); vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst)); vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst)); vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst)); } while (width >= 8) { uint8x8x4_t vdst; uint16x8_t vmask; uint16x8_t vmaskR, vmaskG, vmaskB; uint8x8_t vsel_trans, vsel_opq; vdst = vld4_u8((uint8_t*)dst); vmask = vld1q_u16(src); // Prepare compare masks vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0))); vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF))); // Get all the color masks on 5 bits vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT); vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS), SK_B16_BITS + SK_R16_BITS + 1); vmaskB = vmask & vdupq_n_u16(SK_B16_MASK); // Upscale to 0..32 vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4); vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4); vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4); vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF)); vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]); vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR); vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG); vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB); vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]); vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]); vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]); vst4_u8((uint8_t*)dst, vdst); dst += 8; src += 8; width -= 8; } // Leftovers for (int i = 0; i < width; i++) { dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i], opaqueDst); } }
uint8x8_t test_vdup_n_u8(uint8_t v1) { // CHECK: test_vdup_n_u8 return vdup_n_u8(v1); // CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}} }
static INLINE void mbloop_filter_neon(uint8x8_t dblimit, // mblimit uint8x8_t dlimit, // limit uint8x8_t dthresh, // thresh uint8x8_t d3u8, // p2 uint8x8_t d4u8, // p2 uint8x8_t d5u8, // p1 uint8x8_t d6u8, // p0 uint8x8_t d7u8, // q0 uint8x8_t d16u8, // q1 uint8x8_t d17u8, // q2 uint8x8_t d18u8, // q3 uint8x8_t *d0ru8, // p1 uint8x8_t *d1ru8, // p1 uint8x8_t *d2ru8, // p0 uint8x8_t *d3ru8, // q0 uint8x8_t *d4ru8, // q1 uint8x8_t *d5ru8) { // q1 uint32_t flat; uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8; uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; int16x8_t q15s16; uint16x8_t q10u16, q14u16; int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8; d19u8 = vabd_u8(d3u8, d4u8); d20u8 = vabd_u8(d4u8, d5u8); d21u8 = vabd_u8(d5u8, d6u8); d22u8 = vabd_u8(d16u8, d7u8); d23u8 = vabd_u8(d17u8, d16u8); d24u8 = vabd_u8(d18u8, d17u8); d19u8 = vmax_u8(d19u8, d20u8); d20u8 = vmax_u8(d21u8, d22u8); d25u8 = vabd_u8(d6u8, d4u8); d23u8 = vmax_u8(d23u8, d24u8); d26u8 = vabd_u8(d7u8, d17u8); d19u8 = vmax_u8(d19u8, d20u8); d24u8 = vabd_u8(d6u8, d7u8); d27u8 = vabd_u8(d3u8, d6u8); d28u8 = vabd_u8(d18u8, d7u8); d19u8 = vmax_u8(d19u8, d23u8); d23u8 = vabd_u8(d5u8, d16u8); d24u8 = vqadd_u8(d24u8, d24u8); d19u8 = vcge_u8(dlimit, d19u8); d25u8 = vmax_u8(d25u8, d26u8); d26u8 = vmax_u8(d27u8, d28u8); d23u8 = vshr_n_u8(d23u8, 1); d25u8 = vmax_u8(d25u8, d26u8); d24u8 = vqadd_u8(d24u8, d23u8); d20u8 = vmax_u8(d20u8, d25u8); d23u8 = vdup_n_u8(1); d24u8 = vcge_u8(dblimit, d24u8); d21u8 = vcgt_u8(d21u8, dthresh); d20u8 = vcge_u8(d23u8, d20u8); d19u8 = vand_u8(d19u8, d24u8); d23u8 = vcgt_u8(d22u8, dthresh); d20u8 = vand_u8(d20u8, d19u8); d22u8 = vdup_n_u8(0x80); d23u8 = vorr_u8(d21u8, d23u8); q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8)); d30u8 = vshrn_n_u16(q10u16, 4); flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0); if (flat == 0xffffffff) { // Check for all 1's, power_branch_only d27u8 = vdup_n_u8(3); d21u8 = vdup_n_u8(2); q14u16 = vaddl_u8(d6u8, d7u8); q14u16 = vmlal_u8(q14u16, d3u8, d27u8); q14u16 = vmlal_u8(q14u16, d4u8, d21u8); q14u16 = vaddw_u8(q14u16, d5u8); *d0ru8 = vqrshrn_n_u16(q14u16, 3); q14u16 = vsubw_u8(q14u16, d3u8); q14u16 = vsubw_u8(q14u16, d4u8); q14u16 = vaddw_u8(q14u16, d5u8); q14u16 = vaddw_u8(q14u16, d16u8); *d1ru8 = vqrshrn_n_u16(q14u16, 3); q14u16 = vsubw_u8(q14u16, d3u8); q14u16 = vsubw_u8(q14u16, d5u8); q14u16 = vaddw_u8(q14u16, d6u8); q14u16 = vaddw_u8(q14u16, d17u8); *d2ru8 = vqrshrn_n_u16(q14u16, 3); q14u16 = vsubw_u8(q14u16, d3u8); q14u16 = vsubw_u8(q14u16, d6u8); q14u16 = vaddw_u8(q14u16, d7u8); q14u16 = vaddw_u8(q14u16, d18u8); *d3ru8 = vqrshrn_n_u16(q14u16, 3); q14u16 = vsubw_u8(q14u16, d4u8); q14u16 = vsubw_u8(q14u16, d7u8); q14u16 = vaddw_u8(q14u16, d16u8); q14u16 = vaddw_u8(q14u16, d18u8); *d4ru8 = vqrshrn_n_u16(q14u16, 3); q14u16 = vsubw_u8(q14u16, d5u8); q14u16 = vsubw_u8(q14u16, d16u8); q14u16 = vaddw_u8(q14u16, d17u8); q14u16 = vaddw_u8(q14u16, d18u8); *d5ru8 = vqrshrn_n_u16(q14u16, 3); } else { d21u8 = veor_u8(d7u8, d22u8); d24u8 = veor_u8(d6u8, d22u8); d25u8 = veor_u8(d5u8, d22u8); d26u8 = veor_u8(d16u8, d22u8); d27u8 = vdup_n_u8(3); d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8)); d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8)); q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8)); d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8)); q15s16 = vaddw_s8(q15s16, d29s8); d29u8 = vdup_n_u8(4); d28s8 = vqmovn_s16(q15s16); d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8)); d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8)); d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8)); d30s8 = vshr_n_s8(d30s8, 3); d29s8 = vshr_n_s8(d29s8, 3); d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8); d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8); d29s8 = vrshr_n_s8(d29s8, 1); d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8)); d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8); d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8); if (flat == 0) { // filter_branch_only *d0ru8 = d4u8; *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); *d5ru8 = d17u8; return; } d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); d23u8 = vdup_n_u8(2); q14u16 = vaddl_u8(d6u8, d7u8); q14u16 = vmlal_u8(q14u16, d3u8, d27u8); q14u16 = vmlal_u8(q14u16, d4u8, d23u8); d0u8 = vbsl_u8(d20u8, dblimit, d4u8); q14u16 = vaddw_u8(q14u16, d5u8); d1u8 = vbsl_u8(d20u8, dlimit, d25u8); d30u8 = vqrshrn_n_u16(q14u16, 3); q14u16 = vsubw_u8(q14u16, d3u8); q14u16 = vsubw_u8(q14u16, d4u8); q14u16 = vaddw_u8(q14u16, d5u8); q14u16 = vaddw_u8(q14u16, d16u8); d2u8 = vbsl_u8(d20u8, dthresh, d24u8); d31u8 = vqrshrn_n_u16(q14u16, 3); q14u16 = vsubw_u8(q14u16, d3u8); q14u16 = vsubw_u8(q14u16, d5u8); q14u16 = vaddw_u8(q14u16, d6u8); q14u16 = vaddw_u8(q14u16, d17u8); *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8); d23u8 = vqrshrn_n_u16(q14u16, 3); q14u16 = vsubw_u8(q14u16, d3u8); q14u16 = vsubw_u8(q14u16, d6u8); q14u16 = vaddw_u8(q14u16, d7u8); *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8); q14u16 = vaddw_u8(q14u16, d18u8); *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8); d22u8 = vqrshrn_n_u16(q14u16, 3); q14u16 = vsubw_u8(q14u16, d4u8); q14u16 = vsubw_u8(q14u16, d7u8); q14u16 = vaddw_u8(q14u16, d16u8); d3u8 = vbsl_u8(d20u8, d3u8, d21u8); q14u16 = vaddw_u8(q14u16, d18u8); d4u8 = vbsl_u8(d20u8, d4u8, d26u8); d6u8 = vqrshrn_n_u16(q14u16, 3); q14u16 = vsubw_u8(q14u16, d5u8); q14u16 = vsubw_u8(q14u16, d16u8); q14u16 = vaddw_u8(q14u16, d17u8); q14u16 = vaddw_u8(q14u16, d18u8); d5u8 = vbsl_u8(d20u8, d5u8, d17u8); d7u8 = vqrshrn_n_u16(q14u16, 3); *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8); *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8); *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8); } return; }
void png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { png_bytep rp = row; png_const_bytep pp = prev_row; png_bytep rp_stop = row + row_info->rowbytes; uint8x16_t vtmp; uint8x8x2_t *vrpt; uint8x8x2_t vrp; uint8x8_t vlast = vdup_n_u8(0); uint8x8x4_t vdest; vdest.val[3] = vdup_n_u8(0); vtmp = vld1q_u8(rp); vrpt = png_ptr(uint8x8x2_t,&vtmp); vrp = *vrpt; png_debug(1, "in png_read_filter_row_paeth3_neon"); for (; rp < rp_stop; pp += 12) { uint8x8x2_t *vppt; uint8x8x2_t vpp; uint8x8_t vtmp1, vtmp2, vtmp3; uint32x2_t *temp_pointer; vtmp = vld1q_u8(pp); vppt = png_ptr(uint8x8x2_t,&vtmp); vpp = *vppt; vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast); vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3); vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3); vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]); vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6); vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6); vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2); vdest.val[2] = vadd_u8(vdest.val[2], vtmp1); vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1); vtmp = vld1q_u8(rp + 12); vrpt = png_ptr(uint8x8x2_t,&vtmp); vrp = *vrpt; vdest.val[3] = paeth(vdest.val[2], vtmp2, vtmp3); vdest.val[3] = vadd_u8(vdest.val[3], vtmp1); vlast = vtmp2; vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0); rp += 3; vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0); rp += 3; vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0); rp += 3; vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0); rp += 3; } }
static INLINE void vp9_loop_filter_neon( uint8x8_t dblimit, // flimit uint8x8_t dlimit, // limit uint8x8_t dthresh, // thresh uint8x8_t d3u8, // p3 uint8x8_t d4u8, // p2 uint8x8_t d5u8, // p1 uint8x8_t d6u8, // p0 uint8x8_t d7u8, // q0 uint8x8_t d16u8, // q1 uint8x8_t d17u8, // q2 uint8x8_t d18u8, // q3 uint8x8_t *d4ru8, // p1 uint8x8_t *d5ru8, // p0 uint8x8_t *d6ru8, // q0 uint8x8_t *d7ru8) { // q1 uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8; int16x8_t q12s16; int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8; d19u8 = vabd_u8(d3u8, d4u8); d20u8 = vabd_u8(d4u8, d5u8); d21u8 = vabd_u8(d5u8, d6u8); d22u8 = vabd_u8(d16u8, d7u8); d3u8 = vabd_u8(d17u8, d16u8); d4u8 = vabd_u8(d18u8, d17u8); d19u8 = vmax_u8(d19u8, d20u8); d20u8 = vmax_u8(d21u8, d22u8); d3u8 = vmax_u8(d3u8, d4u8); d23u8 = vmax_u8(d19u8, d20u8); d17u8 = vabd_u8(d6u8, d7u8); d21u8 = vcgt_u8(d21u8, dthresh); d22u8 = vcgt_u8(d22u8, dthresh); d23u8 = vmax_u8(d23u8, d3u8); d28u8 = vabd_u8(d5u8, d16u8); d17u8 = vqadd_u8(d17u8, d17u8); d23u8 = vcge_u8(dlimit, d23u8); d18u8 = vdup_n_u8(0x80); d5u8 = veor_u8(d5u8, d18u8); d6u8 = veor_u8(d6u8, d18u8); d7u8 = veor_u8(d7u8, d18u8); d16u8 = veor_u8(d16u8, d18u8); d28u8 = vshr_n_u8(d28u8, 1); d17u8 = vqadd_u8(d17u8, d28u8); d19u8 = vdup_n_u8(3); d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), vreinterpret_s8_u8(d6u8)); d17u8 = vcge_u8(dblimit, d17u8); d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), vreinterpret_s8_u8(d16u8)); d22u8 = vorr_u8(d21u8, d22u8); q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8)); d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8); d23u8 = vand_u8(d23u8, d17u8); q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8)); d17u8 = vdup_n_u8(4); d27s8 = vqmovn_s16(q12s16); d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8); d27s8 = vreinterpret_s8_u8(d27u8); d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8)); d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8)); d28s8 = vshr_n_s8(d28s8, 3); d27s8 = vshr_n_s8(d27s8, 3); d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8); d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8); d27s8 = vrshr_n_s8(d27s8, 1); d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8)); d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8); d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8); *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8); *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8); *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8); *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8); return; }
bool decode_yuv_neon(unsigned char* out, unsigned char const* y, unsigned char const* uv, int width, int height, unsigned char fill_alpha=0xff) { // pre-condition : width, height must be even if (0!=(width&1) || width<2 || 0!=(height&1) || height<2 || !out || !y || !uv) return false; // in & out pointers unsigned char* dst = out; // constants int const stride = width*trait::bytes_per_pixel; int const itHeight = height>>1; int const itWidth = width>>3; uint8x8_t const Yshift = vdup_n_u8(16); int16x8_t const half = vdupq_n_u16(128); int32x4_t const rounding = vdupq_n_s32(128); // tmp variable uint16x8_t t; // pixel block to temporary store 8 pixels typename trait::PixelBlock pblock = trait::init_pixelblock(fill_alpha); for (int j=0; j<itHeight; ++j, y+=width, dst+=stride) { for (int i=0; i<itWidth; ++i, y+=8, uv+=8, dst+=(8*trait::bytes_per_pixel)) { t = vmovl_u8(vqsub_u8(vld1_u8(y), Yshift)); int32x4_t const Y00 = vmulq_n_u32(vmovl_u16(vget_low_u16(t)), 298); int32x4_t const Y01 = vmulq_n_u32(vmovl_u16(vget_high_u16(t)), 298); t = vmovl_u8(vqsub_u8(vld1_u8(y+width), Yshift)); int32x4_t const Y10 = vmulq_n_u32(vmovl_u16(vget_low_u16(t)), 298); int32x4_t const Y11 = vmulq_n_u32(vmovl_u16(vget_high_u16(t)), 298); // trait::loadvu pack 4 sets of uv into a uint8x8_t, layout : { v0,u0, v1,u1, v2,u2, v3,u3 } t = vsubq_s16((int16x8_t)vmovl_u8(trait::loadvu(uv)), half); // UV.val[0] : v0, v1, v2, v3 // UV.val[1] : u0, u1, u2, u3 int16x4x2_t const UV = vuzp_s16(vget_low_s16(t), vget_high_s16(t)); // tR : 128+409V // tG : 128-100U-208V // tB : 128+516U int32x4_t const tR = vmlal_n_s16(rounding, UV.val[0], 409); int32x4_t const tG = vmlal_n_s16(vmlal_n_s16(rounding, UV.val[0], -208), UV.val[1], -100); int32x4_t const tB = vmlal_n_s16(rounding, UV.val[1], 516); int32x4x2_t const R = vzipq_s32(tR, tR); // [tR0, tR0, tR1, tR1] [ tR2, tR2, tR3, tR3] int32x4x2_t const G = vzipq_s32(tG, tG); // [tG0, tG0, tG1, tG1] [ tG2, tG2, tG3, tG3] int32x4x2_t const B = vzipq_s32(tB, tB); // [tB0, tB0, tB1, tB1] [ tB2, tB2, tB3, tB3] // upper 8 pixels trait::store_pixel_block(dst, pblock, vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(R.val[0], Y00)), vqmovun_s32(vaddq_s32(R.val[1], Y01))), 8), vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(G.val[0], Y00)), vqmovun_s32(vaddq_s32(G.val[1], Y01))), 8), vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(B.val[0], Y00)), vqmovun_s32(vaddq_s32(B.val[1], Y01))), 8)); // lower 8 pixels trait::store_pixel_block(dst+stride, pblock, vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(R.val[0], Y10)), vqmovun_s32(vaddq_s32(R.val[1], Y11))), 8), vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(G.val[0], Y10)), vqmovun_s32(vaddq_s32(G.val[1], Y11))), 8), vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(B.val[0], Y10)), vqmovun_s32(vaddq_s32(B.val[1], Y11))), 8)); } } return true; }
static PixelBlock const init_pixelblock(unsigned char fill_alpha) { PixelBlock block; block.val[3] = vdup_n_u8(fill_alpha); // alpha channel in the last return block; }
inline uint8x8_t vdup_n(const u8 & val) { return vdup_n_u8(val); }