// Blend a row of LCD16 subpixel coverage (one RGB565 mask value per pixel)
// of `color` over 32-bit premultiplied `dst` pixels, 8 pixels per NEON
// iteration, with a scalar tail loop for the remainder.
// The final unnamed SkPMColor parameter (the opaque-dst used by the
// opaque variant) is deliberately ignored here.
void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[], SkColor color,
                         int width, SkPMColor) {
    int colA = SkColorGetA(color);
    int colR = SkColorGetR(color);
    int colG = SkColorGetG(color);
    int colB = SkColorGetB(color);

    // Map alpha 0..255 to 0..256 so a ">> 8" replaces a divide by 255.
    colA = SkAlpha255To256(colA);

    uint8x8_t vcolR, vcolG, vcolB;
    uint16x8_t vcolA;
    if (width >= 8) {
        // Broadcast the source color channels once, outside the loop.
        vcolA = vdupq_n_u16(colA);
        vcolR = vdup_n_u8(colR);
        vcolG = vdup_n_u8(colG);
        vcolB = vdup_n_u8(colB);
    }
    while (width >= 8) {
        uint8x8x4_t vdst;
        uint16x8_t vmask;
        uint16x8_t vmaskR, vmaskG, vmaskB;

        // De-interleaved load: val[0..3] hold 8 bytes each of one ARGB channel.
        vdst = vld4_u8((uint8_t*)dst);
        vmask = vld1q_u16(src);

        // Get all the color masks on 5 bits
        vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
        // Shift left to drop the R field, then right to isolate G;
        // the "+ 1" drops G's extra bit so all three masks are 5-bit.
        vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
                             SK_B16_BITS + SK_R16_BITS + 1);
        vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);

        // Upscale to 0..32 (m + (m >> 4) maps 31 -> 32)
        vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
        vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
        vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);

        // Scale each coverage mask by the color's alpha (0..256), back to 0..32.
        vmaskR = vshrq_n_u16(vmaskR * vcolA, 8);
        vmaskG = vshrq_n_u16(vmaskG * vcolA, 8);
        vmaskB = vshrq_n_u16(vmaskB * vcolA, 8);

        // Result alpha is forced fully opaque.
        vdst.val[NEON_A] = vdup_n_u8(0xFF);
        vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
        vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
        vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);

        vst4_u8((uint8_t*)dst, vdst);

        dst += 8;
        src += 8;
        width -= 8;
    }

    // Scalar tail for the last (width % 8) pixels.
    for (int i = 0; i < width; i++) {
        dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]);
    }
}
// Integral projection of a 16-wide column of `ref` rows: sums each of the 16
// pixel columns over `height` rows, normalizes, and writes the 16 results to
// hbuf[16]. Processes 8 rows per loop iteration, so `height` is assumed to be
// a multiple of 8.
void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
                          const int ref_stride, const int height) {
    int i;
    uint16x8_t vec_sum_lo = vdupq_n_u16(0);
    uint16x8_t vec_sum_hi = vdupq_n_u16(0);
    // Negative shift count: vshlq_u16 with a negative amount shifts right,
    // dividing the column sums by 2^((height >> 5) + 3).
    const int shift_factor = ((height >> 5) + 3) * -1;
    const int16x8_t vec_shift = vdupq_n_s16(shift_factor);

    for (i = 0; i < height; i += 8) {
        const uint8x16_t vec_row1 = vld1q_u8(ref);
        const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride);
        const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2);
        const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3);
        const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4);
        const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5);
        const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6);
        const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7);

        // Widen each 8-byte half into the 16-bit accumulators.
        vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1));
        vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1));
        vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2));
        vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2));
        vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3));
        vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3));
        vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4));
        vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4));
        vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5));
        vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5));
        vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6));
        vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6));
        vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7));
        vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7));
        vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8));
        vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8));

        ref += ref_stride * 8;
    }

    // Normalize (right shift by |shift_factor|) and store 8 + 8 results.
    vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift);
    vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift);

    vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo));
    hbuf += 8;
    vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
}
/* Compile-only intrinsics smoke test (gcc-testsuite style): checks that
   vdupq_n_u16 accepts a uint16_t scalar and yields a uint16x8_t.
   arg0_uint16_t is deliberately left uninitialized — the value is never
   inspected; only the generated instruction/typing matters. */
void test_vdupQ_nu16 (void)
{
    uint16x8_t out_uint16x8_t;
    uint16_t arg0_uint16_t;

    out_uint16x8_t = vdupq_n_u16 (arg0_uint16_t);
}
/* u16x8 mv mul */
/* Matrix-vector multiply C = A * B for unsigned 16-bit elements, 8 lanes at
   a time. A appears to be stored column-major with `Row` rows and `T`
   columns, B has T entries and C has Row entries — NOTE(review): the
   indexing `j = k * T + i; j += Row` mixes T and Row as strides, which is
   only consistent when Row == T (square matrix); confirm with callers.
   Row and T are assumed to be multiples of 8. Arithmetic wraps modulo 2^16. */
void mw_neon_mv_mul_u16x8(unsigned short * A, int Row, int T,
                          unsigned short * B, unsigned short * C)
{
    int i = 0;
    int k = 0;

    uint16x8_t neon_b, neon_c;
    uint16x8_t neon_a0, neon_a1, neon_a2, neon_a3, neon_a4, neon_a5, neon_a6, neon_a7;
    uint16x8_t neon_b0, neon_b1, neon_b2, neon_b3, neon_b4, neon_b5, neon_b6, neon_b7;

    for (i = 0; i < Row; i+=8)
    {
        /* Accumulator for 8 output rows. */
        neon_c = vmovq_n_u16(0);

        for (k = 0; k < T; k+=8)
        {
            int j = k * T + i;

            /* Load 8 consecutive columns of an 8-row strip of A. */
            neon_a0 = vld1q_u16(A + j);
            j+=Row;
            neon_a1 = vld1q_u16(A + j);
            j+=Row;
            neon_a2 = vld1q_u16(A + j);
            j+=Row;
            neon_a3 = vld1q_u16(A + j);
            j+=Row;
            neon_a4 = vld1q_u16(A + j);
            j+=Row;
            neon_a5 = vld1q_u16(A + j);
            j+=Row;
            neon_a6 = vld1q_u16(A + j);
            j+=Row;
            neon_a7 = vld1q_u16(A + j);

            /* Broadcast each of the 8 vector entries of B. */
            neon_b = vld1q_u16(B + k);
            neon_b0 = vdupq_n_u16(vgetq_lane_u16(neon_b, 0));
            neon_b1 = vdupq_n_u16(vgetq_lane_u16(neon_b, 1));
            neon_b2 = vdupq_n_u16(vgetq_lane_u16(neon_b, 2));
            neon_b3 = vdupq_n_u16(vgetq_lane_u16(neon_b, 3));
            neon_b4 = vdupq_n_u16(vgetq_lane_u16(neon_b, 4));
            neon_b5 = vdupq_n_u16(vgetq_lane_u16(neon_b, 5));
            neon_b6 = vdupq_n_u16(vgetq_lane_u16(neon_b, 6));
            neon_b7 = vdupq_n_u16(vgetq_lane_u16(neon_b, 7));

            /* Multiply-accumulate column-by-scalar products. */
            neon_c = vaddq_u16(vmulq_u16(neon_a0, neon_b0), neon_c);
            neon_c = vaddq_u16(vmulq_u16(neon_a1, neon_b1), neon_c);
            neon_c = vaddq_u16(vmulq_u16(neon_a2, neon_b2), neon_c);
            neon_c = vaddq_u16(vmulq_u16(neon_a3, neon_b3), neon_c);
            neon_c = vaddq_u16(vmulq_u16(neon_a4, neon_b4), neon_c);
            neon_c = vaddq_u16(vmulq_u16(neon_a5, neon_b5), neon_c);
            neon_c = vaddq_u16(vmulq_u16(neon_a6, neon_b6), neon_c);
            neon_c = vaddq_u16(vmulq_u16(neon_a7, neon_b7), neon_c);
        }

        vst1q_u16(C + i, neon_c);
    }
}
// Sum `width` consecutive bytes of `ref` (one image row) and return the
// total. Processes 16 bytes per iteration, so `width` is assumed to be a
// multiple of 16. Relies on the sibling helper horizontal_add_u16x8 for the
// final lane reduction.
int16_t vp9_int_pro_col_neon(uint8_t const *ref, const int width) {
    int i;
    uint16x8_t vec_sum = vdupq_n_u16(0);

    for (i = 0; i < width; i += 16) {
        const uint8x16_t vec_row = vld1q_u8(ref);
        // Widen each half to 16 bits while accumulating.
        vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));
        vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));
        ref += 16;
    }

    return horizontal_add_u16x8(vec_sum);
}
// TM (TrueMotion) intra prediction for a 4x4 block:
// pred[r][c] = clamp(left[r] + above[c] - top_left), computed with widened
// 16-bit arithmetic and saturating narrow on store.
void vp9_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
    int i;
    uint16x8_t q1u16, q3u16;
    int16x8_t q1s16;
    uint8x8_t d0u8 = vdup_n_u8(0);
    uint32x2_t d2u32 = vdup_n_u32(0);

    // d0u8 = top-left pixel broadcast; d2u32 lane 0 = the 4 `above` pixels.
    d0u8 = vld1_dup_u8(above - 1);
    d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);

    // q3u16 = above[c] - top_left, widened to 16 bits (may be negative when
    // reinterpreted as signed below).
    q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
    for (i = 0; i < 4; i++, dst += stride) {
        q1u16 = vdupq_n_u16((uint16_t)left[i]);
        // left[i] + (above[c] - top_left), as signed 16-bit.
        q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
                          vreinterpretq_s16_u16(q3u16));
        // Saturating narrow clamps the result to 0..255.
        d0u8 = vqmovun_s16(q1s16);
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
    }
}
// Convert 8 RGB pixels to luma Y using 16.16 fixed-point coefficients:
// Y = round((16839*R + 33059*G + 6420*B) >> 16) + 16, saturated to 8 bits.
static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R, const uint8x8_t G,
                                    const uint8x8_t B) {
    // Widen each channel to 16 bits.
    const uint16x8_t r = vmovl_u8(R);
    const uint16x8_t g = vmovl_u8(G);
    const uint16x8_t b = vmovl_u8(B);
    const uint16x4_t r_lo = vget_low_u16(r);
    const uint16x4_t r_hi = vget_high_u16(r);
    const uint16x4_t g_lo = vget_low_u16(g);
    const uint16x4_t g_hi = vget_high_u16(g);
    const uint16x4_t b_lo = vget_low_u16(b);
    const uint16x4_t b_hi = vget_high_u16(b);
    // 32-bit multiply-accumulate of the three weighted channels.
    const uint32x4_t tmp0_lo = vmull_n_u16(         r_lo, 16839u);
    const uint32x4_t tmp0_hi = vmull_n_u16(         r_hi, 16839u);
    const uint32x4_t tmp1_lo = vmlal_n_u16(tmp0_lo, g_lo, 33059u);
    const uint32x4_t tmp1_hi = vmlal_n_u16(tmp0_hi, g_hi, 33059u);
    const uint32x4_t tmp2_lo = vmlal_n_u16(tmp1_lo, b_lo, 6420u);
    const uint32x4_t tmp2_hi = vmlal_n_u16(tmp1_hi, b_hi, 6420u);
    // Rounding shift right by 16 drops the fixed-point fraction.
    const uint16x8_t Y1 = vcombine_u16(vrshrn_n_u32(tmp2_lo, 16),
                                       vrshrn_n_u32(tmp2_hi, 16));
    // Add the standard +16 luma offset, then saturate-narrow to bytes.
    const uint16x8_t Y2 = vaddq_u16(Y1, vdupq_n_u16(16));
    return vqmovn_u16(Y2);
}
// Vertical-strip blit over RGB565 pixels: dst = compact((src32 + expand(dst)
// * scale) >> 5), walking down a column with row stride `deviceRB`.
// NOTE(review): LOAD_LANE_16 / STORE_LANE_16 are macros defined elsewhere in
// this file; they appear to load/store one 16-bit lane and advance `device`
// by deviceRB — confirm against their definitions.
void SkRGB16BlitterBlitV_neon(uint16_t* device,
                              int height,
                              size_t deviceRB,
                              unsigned scale,
                              uint32_t src32) {
    if (height >= 8) {
        uint16_t* dst = device;

        // prepare constants
        uint16x8_t vdev = vdupq_n_u16(0);
        uint16x8_t vmaskq_g16 = vdupq_n_u16(SK_G16_MASK_IN_PLACE);
        uint16x8_t vmaskq_ng16 = vdupq_n_u16(~SK_G16_MASK_IN_PLACE);
        uint32x4_t vsrc32 = vdupq_n_u32(src32);
        uint32x4_t vscale5 = vdupq_n_u32((uint32_t)scale);

        while (height >= 8){
            // Gather 8 column pixels, one lane at a time (rows aren't contiguous).
            LOAD_LANE_16(vdev, 0)
            LOAD_LANE_16(vdev, 1)
            LOAD_LANE_16(vdev, 2)
            LOAD_LANE_16(vdev, 3)
            LOAD_LANE_16(vdev, 4)
            LOAD_LANE_16(vdev, 5)
            LOAD_LANE_16(vdev, 6)
            LOAD_LANE_16(vdev, 7)

            // Expand_rgb_16: interleave (R|B) and G halves so each pixel
            // occupies 32 bits, leaving headroom for the multiply.
            uint16x8x2_t vdst = vzipq_u16((vdev & vmaskq_ng16),
                                          (vdev & vmaskq_g16));
            uint32x4_t vdst32_lo = vmulq_u32(vreinterpretq_u32_u16(vdst.val[0]),
                                             vscale5);
            uint32x4_t vdst32_hi = vmulq_u32(vreinterpretq_u32_u16(vdst.val[1]),
                                             vscale5);

            // Compact_rgb_16: add the source, renormalize by >> 5, and fold
            // the G field back between R and B.
            vdst32_lo = vaddq_u32(vdst32_lo, vsrc32);
            vdst32_hi = vaddq_u32(vdst32_hi, vsrc32);
            vdst32_lo = vshrq_n_u32(vdst32_lo, 5);
            vdst32_hi = vshrq_n_u32(vdst32_hi, 5);
            uint16x4_t vtmp_lo = vmovn_u32(vdst32_lo) & vget_low_u16(vmaskq_ng16);
            uint16x4_t vtmp_hi = vshrn_n_u32(vdst32_lo, 16) & vget_low_u16(vmaskq_g16);
            uint16x4_t vdst16_lo = vorr_u16(vtmp_lo, vtmp_hi);
            vtmp_lo = vmovn_u32(vdst32_hi) & vget_low_u16(vmaskq_ng16);
            vtmp_hi = vshrn_n_u32(vdst32_hi, 16) & vget_low_u16(vmaskq_g16);
            uint16x4_t vdst16_hi = vorr_u16(vtmp_lo, vtmp_hi);

            // Scatter the 8 results back down the column.
            STORE_LANE_16(vdst16_lo, 0)
            STORE_LANE_16(vdst16_lo, 1)
            STORE_LANE_16(vdst16_lo, 2)
            STORE_LANE_16(vdst16_lo, 3)
            STORE_LANE_16(vdst16_hi, 0)
            STORE_LANE_16(vdst16_hi, 1)
            STORE_LANE_16(vdst16_hi, 2)
            STORE_LANE_16(vdst16_hi, 3)
            height -= 8;
        }
    }

    // Scalar tail for the remaining rows.
    while (height != 0){
        uint32_t dst32 = SkExpand_rgb_16(*device) * scale;
        *device = SkCompact_rgb_16((src32 + dst32) >> 5);
        device = (uint16_t*)((char*)device + deviceRB);
        height--;
    }
}
// Opaque variant of the LCD16 row blit: like SkBlitLCD16Row_neon but with
// color alpha == 255, plus fast paths selected per pixel via vbsl — a mask of
// 0x0000 keeps the destination untouched, a mask of 0xFFFF writes `opaqueDst`
// wholesale.
void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
                               SkColor color, int width,
                               SkPMColor opaqueDst) {
    int colR = SkColorGetR(color);
    int colG = SkColorGetG(color);
    int colB = SkColorGetB(color);

    uint8x8_t vcolR, vcolG, vcolB;
    uint8x8_t vopqDstA, vopqDstR, vopqDstG, vopqDstB;

    if (width >= 8) {
        // Broadcast the blend color and the fully-opaque result once.
        vcolR = vdup_n_u8(colR);
        vcolG = vdup_n_u8(colG);
        vcolB = vdup_n_u8(colB);
        vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst));
        vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst));
        vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst));
        vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst));
    }

    while (width >= 8) {
        uint8x8x4_t vdst;
        uint16x8_t vmask;
        uint16x8_t vmaskR, vmaskG, vmaskB;
        uint8x8_t vsel_trans, vsel_opq;

        vdst = vld4_u8((uint8_t*)dst);
        vmask = vld1q_u16(src);

        // Prepare compare masks: all-zero coverage and all-ones coverage.
        vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0)));
        vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF)));

        // Get all the color masks on 5 bits
        vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
        vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
                             SK_B16_BITS + SK_R16_BITS + 1);
        vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);

        // Upscale to 0..32
        vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
        vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
        vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);

        // Alpha: keep dst alpha where fully transparent, 0xFF otherwise,
        // then override with opaqueDst's alpha where fully covered.
        vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF));
        vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]);

        vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
        vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
        vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);

        // Fully covered pixels take the precomputed opaque destination.
        vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]);
        vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]);
        vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]);

        vst4_u8((uint8_t*)dst, vdst);

        dst += 8;
        src += 8;
        width -= 8;
    }

    // Leftovers
    for (int i = 0; i < width; i++) {
        dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i],
                                    opaqueDst);
    }
}
/* Clang/FileCheck codegen test: verifies vdupq_n_u16 lowers to a single
   AArch64 DUP of a general register into all 8 halfword lanes.
   The "// CHECK:" comments are FileCheck directives — do not edit them. */
uint16x8_t test_vdupq_n_u16(uint16_t v1) {
  // CHECK: test_vdupq_n_u16
  return vdupq_n_u16(v1);
  // CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}}
}
/**
 * Get the sum of absolute differences for a specific pixel location and disparity
 *
 * @param leftImage left image
 * @param rightImage right image
 * @param laplacianL laplacian-fitlered left image
 * @param laplacianR laplacian-filtered right image
 * @param pxX row pixel location
 * @param pxY column pixel location
 * @param state state structure that includes a number of parameters
 * @param left_interest optional parameter that will be filled with the value for the left interest operation
 * @param right_interest same as above, for the right image
 *
 * @retval scaled sum of absolute differences for this block --
 *      the value is the sum/numberOfPixels
 */
int PushbroomStereo::GetSAD(Mat leftImage, Mat rightImage, Mat laplacianL,
                            Mat laplacianR, int pxX, int pxY,
                            int *left_interest, int *right_interest,
                            int *raw_sad)
{
    // top left corner of the SAD box
    int startX = pxX;
    int startY = pxY;

    // bottom right corner of the SAD box
    // (endX is only needed by the scalar fallback path)
    #ifndef USE_NEON
        int endX = pxX + m_iBlockSize - 1;
    #endif

    int endY = pxY + m_iBlockSize - 1;

    // NOTE(review): this block references a plain `disparity` while the rest
    // of the function uses the member m_iDisparity; it only compiles when
    // USE_SAFTEY_CHECKS (sic) is enabled and `disparity` is in scope — verify
    // before enabling.
    #if USE_SAFTEY_CHECKS
        int flag = false;
        if (startX < 0)
        {
            printf("Warning: startX < 0\n");
            flag = true;
        }

        if (endX > rightImage.cols)
        {
            printf("Warning: endX > leftImage.cols\n");
            flag = true;
        }

        if (startX + disparity < 0)
        {
            printf("Warning: startX + disparity < 0\n");
            flag = true;
        }

        if (endX + disparity > rightImage.cols)
        {
            printf("Warning: endX + disparity > leftImage.cols\n");
            flag = true;
        }

        if (endX + disparity > rightImage.cols)
        {
            printf("Warning: endX + disparity > rightImage.cols\n");
            endX = rightImage.cols - disparity;
            flag = true;
        }

        if (startY < 0) {
            printf("Warning: startY < 0\n");
            flag = true;
        }

        if (endY > rightImage.rows)
        {
            printf("Warning: endY > rightImage.rows\n");
            flag = true;
        }

        // disparity might be negative as well
        if (disparity < 0 && startX + disparity < 0)
        {
            printf("Warning: disparity < 0 && startX + disparity < 0\n");
            startX = -disparity;
            flag = true;
        }

        if (flag == true)
        {
            printf("startX = %d, endX = %d, disparity = %d, startY = %d, endY = %d\n", startX, endX, disparity, startY, endY);
        }

        startX = max(0, startX);
        startY = max(0, startY);

        endX = min(leftImage.cols - disparity, endX);
        endY = min(leftImage.rows, endY);
    #endif

    int leftVal = 0, rightVal = 0;

    int sad = 0;

    #ifdef USE_NEON
        uint16x8_t interest_op_sum_8x_L, interest_op_sum_8x_R, sad_sum_8x;

        // load zeros into everything
        interest_op_sum_8x_L = vdupq_n_u16(0);
        interest_op_sum_8x_R = vdupq_n_u16(0);
        sad_sum_8x = vdupq_n_u16(0);
    #endif

    for (int i=startY;i<=endY;i++) {
        if(i>=leftImage.rows-1)
            continue;
        //Get a pointer for this row
        uchar *this_rowL = leftImage.ptr<uchar>(i);
        uchar *this_rowR = rightImage.ptr<uchar>(i);

        uchar *this_row_laplacianL = laplacianL.ptr<uchar>(i);
        uchar *this_row_laplacianR = laplacianR.ptr<uchar>(i);

        #ifdef USE_NEON
            // load this row into memory
            uint8x8_t this_row_8x8_L = vld1_u8(this_rowL + startX);
            uint8x8_t this_row_8x8_R = vld1_u8(this_rowR + startX + disparity);

            uint8x8_t interest_op_8x8_L = vld1_u8(this_row_laplacianL + startX);
            uint8x8_t interest_op_8x8_R = vld1_u8(this_row_laplacianR + startX + disparity);

            // do absolute differencing for the entire row in one operation!
            uint8x8_t sad_8x = vabd_u8(this_row_8x8_L, this_row_8x8_R);

            // sum up
            sad_sum_8x = vaddw_u8(sad_sum_8x, sad_8x);

            // sum laplacian values
            interest_op_sum_8x_L = vaddw_u8(interest_op_sum_8x_L, interest_op_8x8_L);
            interest_op_sum_8x_R = vaddw_u8(interest_op_sum_8x_R, interest_op_8x8_R);

        #else // USE_NEON
            for (int j=startX;j<=endX;j++) {
                // we are now looking at a single pixel value
                /*uchar pxL = leftImage.at<uchar>(i,j);
                uchar pxR = rightImage.at<uchar>(i,j + disparity);

                uchar sL = laplacianL.at<uchar>(i,j);
                uchar sR = laplacianR.at<uchar>(i,j + disparity);
                */

                uchar sL = this_row_laplacianL[j];//laplacianL.at<uchar>(i,j);
                uchar sR = this_row_laplacianR[j + m_iDisparity]; //laplacianR.at<uchar>(i,j + disparity);

                leftVal += sL;
                rightVal += sR;

                uchar pxL = this_rowL[j];
                uchar pxR = this_rowR[j + m_iDisparity];

                sad += abs(pxL - pxR);
            }
        #endif // USE_NEON
    }

    #ifdef USE_NEON
        // sum up
        // NOTE(review): only lanes 0..4 are summed; lanes 5..7 were
        // deliberately commented out in the original — confirm the block
        // width actually covers just 5 useful lanes before "fixing" this.
        sad = vgetq_lane_u16(sad_sum_8x, 0) + vgetq_lane_u16(sad_sum_8x, 1)
            + vgetq_lane_u16(sad_sum_8x, 2) + vgetq_lane_u16(sad_sum_8x, 3)
            + vgetq_lane_u16(sad_sum_8x, 4);// + vgetq_lane_u16(sad_sum_8x, 5)
            // + vgetq_lane_u16(sad_sum_8x, 6) + vgetq_lane_u16(sad_sum_8x, 7);

        leftVal = vgetq_lane_u16(interest_op_sum_8x_L, 0)
                + vgetq_lane_u16(interest_op_sum_8x_L, 1)
                + vgetq_lane_u16(interest_op_sum_8x_L, 2)
                + vgetq_lane_u16(interest_op_sum_8x_L, 3)
                + vgetq_lane_u16(interest_op_sum_8x_L, 4);

        rightVal = vgetq_lane_u16(interest_op_sum_8x_R, 0)
                 + vgetq_lane_u16(interest_op_sum_8x_R, 1)
                 + vgetq_lane_u16(interest_op_sum_8x_R, 2)
                 + vgetq_lane_u16(interest_op_sum_8x_R, 3)
                 + vgetq_lane_u16(interest_op_sum_8x_R, 4);
    #endif

    //cout << "(" << leftVal << ", " << rightVal << ") vs. (" << leftVal2 << ", " << rightVal2 << ")" << endl;

    int laplacian_value = leftVal + rightVal;

    // Penalize blocks with weak texture on either side.
    int fThresh = 200;
    if((leftVal<fThresh)||(rightVal<fThresh))
        laplacian_value /= 10;

    //cout << "sad with neon: " << sad << " without neon: " << sad2 << endl;

    if (left_interest != NULL)
        *left_interest = leftVal;

    if (right_interest != NULL)
        *right_interest = rightVal;

    // percentage of total interest value that is different
    //float diff_score = 100*(float)abs(leftVal - rightVal)/(float)laplacian_value;

    if (raw_sad != NULL)
        *raw_sad = sad;

    if (leftVal < m_iSobelLimit || rightVal < m_iSobelLimit)// || diff_score > state.interest_diff_limit)
        return -1;

    // weight laplacian_value into the score

    //return sobel;
    return NUMERIC_CONST*(float)sad/(float)laplacian_value;
}
// Bilinear interpolation of 32-bit ARGB pixels: for each output pixel,
// interpolate horizontally between (p0,p1) and (p2,p3) by `ax`, then
// vertically between the two results by `ay` (both 8-bit fractions).
// Pixels that are entirely zero are copied through unblended.
int neon_new(DATA32* _p0, DATA32* _p1, DATA32* _p2, DATA32* _p3,
             DATA32* _ax, DATA32 _ay, DATA32* result, int len)
{
    int ay = _ay;
    int i;
    DATA32* pbuf = result;
    uint16x4_t ay_16x4;
    uint16x4_t p0_16x4;
    uint16x4_t p2_16x4;
    uint16x8_t ax_16x8;
    uint16x8_t p0_p2_16x8;
    uint16x8_t p1_p3_16x8;
    uint16x8_t x255_16x8;
    uint32x2_t p0_p2_32x2;
    uint32x2_t p1_p3_32x2;
    uint32x2_t res_32x2;
    uint8x8_t p0_p2_8x8;
    uint8x8_t p1_p3_8x8;
    uint8x8_t p2_8x8;
    uint16x4_t temp_16x4;

    ay_16x4 = vdup_n_u16(ay);
    x255_16x8 = vdupq_n_u16(0xff);
    for(i = 0; i < len; i++) {
        DATA32 p0 = *_p0++;
        DATA32 p1 = *_p1++;
        DATA32 p2 = *_p2++;
        DATA32 p3 = *_p3++;
        int ax = *_ax++;
        if (p0 | p1 | p2 | p3) {
            ax_16x8 = vdupq_n_u16(ax);

            // Pack the four source pixels into two 64-bit vectors.
            // NOTE(review): lanes of p0_p2_32x2 / p1_p3_32x2 are set before
            // any full initialization — relies on vset_lane semantics; the
            // unset lanes are fully overwritten before use.
            p0_p2_32x2 = vset_lane_u32(p0, p0_p2_32x2, 0);
            p0_p2_32x2 = vset_lane_u32(p2, p0_p2_32x2, 1);
            p1_p3_32x2 = vset_lane_u32(p1, p1_p3_32x2, 0);
            p1_p3_32x2 = vset_lane_u32(p3, p1_p3_32x2, 1);

            p0_p2_8x8 = vreinterpret_u8_u32(p0_p2_32x2);
            p1_p3_8x8 = vreinterpret_u8_u32(p1_p3_32x2);

            // Horizontal lerp per channel: p0 + ((p1 - p0) * ax >> 8),
            // done for (p0,p1) and (p2,p3) simultaneously.
            p1_p3_16x8 = vmovl_u8(p1_p3_8x8);
            p0_p2_16x8 = vmovl_u8(p0_p2_8x8);
            p1_p3_16x8 = vsubq_u16(p1_p3_16x8, p0_p2_16x8);
            p1_p3_16x8 = vmulq_u16(p1_p3_16x8, ax_16x8);
            p1_p3_16x8 = vshrq_n_u16(p1_p3_16x8, 8);
            p1_p3_16x8 = vaddq_u16(p1_p3_16x8, p0_p2_16x8);
            p1_p3_16x8 = vandq_u16(p1_p3_16x8, x255_16x8);

            // Vertical lerp between the two horizontal results by ay.
            p0_16x4 = vget_low_u16(p1_p3_16x8);
            p2_16x4 = vget_high_u16(p1_p3_16x8);
            p2_16x4 = vsub_u16(p2_16x4, p0_16x4);
            p2_16x4 = vmul_u16(p2_16x4, ay_16x4);
            p2_16x4 = vshr_n_u16(p2_16x4, 8);
            p2_16x4 = vadd_u16(p2_16x4, p0_16x4);

            // NOTE(review): temp_16x4 is never written — its (indeterminate)
            // contents land in the low half of the vector, but only lane 1
            // (the high half, p2_16x4) is stored below, so the output is
            // unaffected. Still worth initializing to silence UB tooling.
            p1_p3_16x8 = vcombine_u16(temp_16x4, p2_16x4);
            p2_8x8 = vmovn_u16(p1_p3_16x8);
            res_32x2 = vreinterpret_u32_u8(p2_8x8);
            vst1_lane_u32(pbuf++, res_32x2, 1);
        }
        else
            *pbuf++ = p0;
    }
    return 0;
}
# ifndef COLBLACK uint16x4_t temp_16x4; uint16x4_t rv_16x4; uint16x4_t val1_16x4; uint16x4_t val3_16x4; uint16x8_t ru_16x8; uint16x8_t val1_val3_16x8; uint16x8_t val2_val4_16x8; uint16x8_t x255_16x8; uint32x2_t res_32x2; uint32x2_t val1_val3_32x2; uint32x2_t val2_val4_32x2; uint8x8_t val1_val3_8x8; uint8x8_t val2_val4_8x8; x255_16x8 = vdupq_n_u16(0xff); # ifdef COLMUL uint16x4_t x255_16x4; x255_16x4 = vget_low_u16(x255_16x8); uint16x4_t c1_16x4; # ifdef COLSAME uint16x4_t c1_val3_16x4; uint16x8_t c1_16x8; uint16x8_t c1_val3_16x8; uint32x2_t c1_32x2; uint8x8_t c1_8x8; uint8x8_t c1_val3_8x8; c1_32x2 = vset_lane_u32(c1, c1_32x2, 0); c1_8x8 = vreinterpret_u8_u32(c1_32x2); c1_16x8 = vmovl_u8(c1_8x8);
/* u16x8 mm mul */
/* Matrix-matrix multiply C = A * B for unsigned 16-bit elements, computing
   an 8-row strip of one output column per inner pass. Matrices appear to be
   stored column-major (column stride Row). NOTE(review): the inner index
   `j_T = j * T + i` uses T as a stride while subsequent steps use Row — as
   in the mv variant this is only consistent for square matrices; confirm
   with callers. Row and T are assumed multiples of 8; arithmetic wraps
   modulo 2^16. */
void mw_neon_mm_mul_u16x8(unsigned short * A, int Row, int T,
                          unsigned short * B, int Col, unsigned short * C)
{
    int i, k, j;

    uint16x8_t neon_b, neon_c;
    uint16x8_t neon_a0, neon_a1, neon_a2, neon_a3, neon_a4, neon_a5, neon_a6, neon_a7;
    uint16x8_t neon_b0, neon_b1, neon_b2, neon_b3, neon_b4, neon_b5, neon_b6, neon_b7;

    for (i = 0; i < Row; i+=8)
    {
        for (k = 0; k < Col; k+=1)
        {
            /* Accumulator for C[i..i+7][k]. */
            neon_c = vmovq_n_u16(0);

            for (j = 0; j < T; j+=8)
            {
                int j_T = j * T + i;
                int k_Row = k * Row;

                /* 8 columns of an 8-row strip of A. */
                neon_a0 = vld1q_u16(A + j_T);
                j_T+=Row;
                neon_a1 = vld1q_u16(A + j_T);
                j_T+=Row;
                neon_a2 = vld1q_u16(A + j_T);
                j_T+=Row;
                neon_a3 = vld1q_u16(A + j_T);
                j_T+=Row;
                neon_a4 = vld1q_u16(A + j_T);
                j_T+=Row;
                neon_a5 = vld1q_u16(A + j_T);
                j_T+=Row;
                neon_a6 = vld1q_u16(A + j_T);
                j_T+=Row;
                neon_a7 = vld1q_u16(A + j_T);

                /* 8 consecutive entries of B's column k, each broadcast. */
                neon_b = vld1q_u16(B + k_Row + j);
                neon_b0 = vdupq_n_u16(vgetq_lane_u16(neon_b, 0));
                neon_b1 = vdupq_n_u16(vgetq_lane_u16(neon_b, 1));
                neon_b2 = vdupq_n_u16(vgetq_lane_u16(neon_b, 2));
                neon_b3 = vdupq_n_u16(vgetq_lane_u16(neon_b, 3));
                neon_b4 = vdupq_n_u16(vgetq_lane_u16(neon_b, 4));
                neon_b5 = vdupq_n_u16(vgetq_lane_u16(neon_b, 5));
                neon_b6 = vdupq_n_u16(vgetq_lane_u16(neon_b, 6));
                neon_b7 = vdupq_n_u16(vgetq_lane_u16(neon_b, 7));

                neon_c = vaddq_u16(vmulq_u16(neon_a0, neon_b0), neon_c);
                neon_c = vaddq_u16(vmulq_u16(neon_a1, neon_b1), neon_c);
                neon_c = vaddq_u16(vmulq_u16(neon_a2, neon_b2), neon_c);
                neon_c = vaddq_u16(vmulq_u16(neon_a3, neon_b3), neon_c);
                neon_c = vaddq_u16(vmulq_u16(neon_a4, neon_b4), neon_c);
                neon_c = vaddq_u16(vmulq_u16(neon_a5, neon_b5), neon_c);
                neon_c = vaddq_u16(vmulq_u16(neon_a6, neon_b6), neon_c);
                neon_c = vaddq_u16(vmulq_u16(neon_a7, neon_b7), neon_c);

                /* NOTE(review): stores happen inside the j-loop, re-writing
                   partial sums each pass; the final pass leaves the correct
                   totals. Lane-by-lane stores instead of vst1q — presumably
                   for alignment; confirm. */
                vst1q_lane_u16(C + k_Row + i, neon_c, 0);
                vst1q_lane_u16(C + k_Row + i + 1, neon_c, 1);
                vst1q_lane_u16(C + k_Row + i + 2, neon_c, 2);
                vst1q_lane_u16(C + k_Row + i + 3, neon_c, 3);
                vst1q_lane_u16(C + k_Row + i + 4, neon_c, 4);
                vst1q_lane_u16(C + k_Row + i + 5, neon_c, 5);
                vst1q_lane_u16(C + k_Row + i + 6, neon_c, 6);
                vst1q_lane_u16(C + k_Row + i + 7, neon_c, 7);
            }
        }
    }
}
// Overload-dispatched broadcast: replicate a single u16 value into every
// lane of a 128-bit vector. Thin wrapper so generic code can call vdupq_n()
// without spelling out the element type.
inline uint16x8_t vdupq_n(const u16 & val)
{
    return vdupq_n_u16(val);
}
/* Compile-only intrinsics check (gcc-testsuite style): vdupq_n_u16 with an
   immediate-expression argument. out_uint16x8_t is a file-scope variable
   defined elsewhere in the test harness. */
void test_vdupq_nu16 (void)
{
    out_uint16x8_t = vdupq_n_u16 (~0x1200);
}
// 16-wide (dual-block) 4-tap loop filter. Registers are aggressively reused
// (q3/q4/q9/q10 are clobbered as scratch after their pixel values are
// consumed), so statement order is significant throughout.
static INLINE void loop_filter_neon_16(uint8x16_t qblimit,  // blimit
                                       uint8x16_t qlimit,   // limit
                                       uint8x16_t qthresh,  // thresh
                                       uint8x16_t q3,       // p3
                                       uint8x16_t q4,       // p2
                                       uint8x16_t q5,       // p1
                                       uint8x16_t q6,       // p0
                                       uint8x16_t q7,       // q0
                                       uint8x16_t q8,       // q1
                                       uint8x16_t q9,       // q2
                                       uint8x16_t q10,      // q3
                                       uint8x16_t *q5r,     // p1
                                       uint8x16_t *q6r,     // p0
                                       uint8x16_t *q7r,     // q0
                                       uint8x16_t *q8r) {   // q1
    uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
    int16x8_t q2s16, q11s16;
    uint16x8_t q4u16;
    int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8;
    int8x8_t d2s8, d3s8;

    // Build the filter mask: max of neighboring-pixel differences vs limit.
    q11u8 = vabdq_u8(q3, q4);
    q12u8 = vabdq_u8(q4, q5);
    q13u8 = vabdq_u8(q5, q6);
    q14u8 = vabdq_u8(q8, q7);
    q3 = vabdq_u8(q9, q8);
    q4 = vabdq_u8(q10, q9);

    q11u8 = vmaxq_u8(q11u8, q12u8);
    q12u8 = vmaxq_u8(q13u8, q14u8);
    q3 = vmaxq_u8(q3, q4);
    q15u8 = vmaxq_u8(q11u8, q12u8);

    q9 = vabdq_u8(q6, q7);

    // vp8_hevmask: high-edge-variance selector from |p1-p0|, |q1-q0|.
    q13u8 = vcgtq_u8(q13u8, qthresh);
    q14u8 = vcgtq_u8(q14u8, qthresh);
    q15u8 = vmaxq_u8(q15u8, q3);

    q2u8 = vabdq_u8(q5, q8);
    q9 = vqaddq_u8(q9, q9);

    q15u8 = vcgeq_u8(qlimit, q15u8);

    // vp8_filter() function
    // convert to signed (bias by 0x80 so u8 pixels become s8)
    q10 = vdupq_n_u8(0x80);
    q8 = veorq_u8(q8, q10);
    q7 = veorq_u8(q7, q10);
    q6 = veorq_u8(q6, q10);
    q5 = veorq_u8(q5, q10);

    // blimit check: 2*|q0-p0| + |q1-p1|/2 <= blimit.
    q2u8 = vshrq_n_u8(q2u8, 1);
    q9 = vqaddq_u8(q9, q2u8);

    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
                     vget_low_s8(vreinterpretq_s8_u8(q6)));
    q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
                      vget_high_s8(vreinterpretq_s8_u8(q6)));

    q9 = vcgeq_u8(qblimit, q9);

    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
                     vreinterpretq_s8_u8(q8));

    q14u8 = vorrq_u8(q13u8, q14u8);

    q4u16 = vdupq_n_u16(3);
    q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
    q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));

    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
    q15u8 = vandq_u8(q15u8, q9);

    q1s8 = vreinterpretq_s8_u8(q1u8);
    q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
    q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));

    q4 = vdupq_n_u8(3);
    q9 = vdupq_n_u8(4);

    // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
    d2s8 = vqmovn_s16(q2s16);
    d3s8 = vqmovn_s16(q11s16);
    q1s8 = vcombine_s8(d2s8, d3s8);

    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
    q1s8 = vreinterpretq_s8_u8(q1u8);

    // Filter1 = (filter + 4) >> 3, Filter2 = (filter + 3) >> 3.
    q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4));
    q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
    q2s8 = vshrq_n_s8(q2s8, 3);
    q1s8 = vshrq_n_s8(q1s8, 3);

    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
    q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);

    // Outer taps (p1/q1) only adjusted where hev mask is clear.
    q1s8 = vrshrq_n_s8(q1s8, 1);
    q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));

    q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
    q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);

    // Undo the 0x80 bias on the outputs.
    *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10);
    *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10);
    *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10);
    *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10);
    return;
}
// Decode an NV12/NV21-style YUV 4:2:0 frame to interleaved pixels (layout
// decided by the `trait` policy class), two rows x eight columns per inner
// iteration. Uses the fixed-point ITU-R BT.601 conversion:
//   C = Y - 16, D = U - 128, E = V - 128
//   R = (298C + 409E + 128) >> 8, G = (298C - 100D - 208E + 128) >> 8,
//   B = (298C + 516D + 128) >> 8
// Returns false on invalid (odd or too-small) dimensions or null pointers.
// NOTE(review): several statements assign between signed/unsigned vector
// types (e.g. vdupq_n_u16 into int16x8_t) — this relies on lax vector
// conversions (GCC-style); confirm the build flags permit it.
bool decode_yuv_neon(unsigned char* out, unsigned char const* y,
                     unsigned char const* uv, int width, int height,
                     unsigned char fill_alpha=0xff)
{
    // pre-condition : width, height must be even
    if (0!=(width&1) || width<2 || 0!=(height&1) || height<2 || !out || !y || !uv)
        return false;

    // in & out pointers
    unsigned char* dst = out;

    // constants
    int const stride = width*trait::bytes_per_pixel;
    int const itHeight = height>>1;   // process two rows per iteration
    int const itWidth = width>>3;     // eight pixels per iteration

    uint8x8_t const Yshift = vdup_n_u8(16);
    int16x8_t const half = vdupq_n_u16(128);
    int32x4_t const rounding = vdupq_n_s32(128);

    // tmp variable
    uint16x8_t t;

    // pixel block to temporary store 8 pixels
    typename trait::PixelBlock pblock = trait::init_pixelblock(fill_alpha);

    // `y` advances by width in the inner loop plus width here: 2 rows/pass.
    for (int j=0; j<itHeight; ++j, y+=width, dst+=stride) {
        for (int i=0; i<itWidth; ++i, y+=8, uv+=8, dst+=(8*trait::bytes_per_pixel)) {
            // 298 * max(Y - 16, 0) for the upper row...
            t = vmovl_u8(vqsub_u8(vld1_u8(y), Yshift));
            int32x4_t const Y00 = vmulq_n_u32(vmovl_u16(vget_low_u16(t)), 298);
            int32x4_t const Y01 = vmulq_n_u32(vmovl_u16(vget_high_u16(t)), 298);

            // ...and for the lower row.
            t = vmovl_u8(vqsub_u8(vld1_u8(y+width), Yshift));
            int32x4_t const Y10 = vmulq_n_u32(vmovl_u16(vget_low_u16(t)), 298);
            int32x4_t const Y11 = vmulq_n_u32(vmovl_u16(vget_high_u16(t)), 298);

            // trait::loadvu pack 4 sets of uv into a uint8x8_t, layout : { v0,u0, v1,u1, v2,u2, v3,u3 }
            t = vsubq_s16((int16x8_t)vmovl_u8(trait::loadvu(uv)), half);

            // UV.val[0] : v0, v1, v2, v3
            // UV.val[1] : u0, u1, u2, u3
            int16x4x2_t const UV = vuzp_s16(vget_low_s16(t), vget_high_s16(t));

            // tR : 128+409V
            // tG : 128-100U-208V
            // tB : 128+516U
            int32x4_t const tR = vmlal_n_s16(rounding, UV.val[0], 409);
            int32x4_t const tG = vmlal_n_s16(vmlal_n_s16(rounding, UV.val[0], -208),
                                             UV.val[1], -100);
            int32x4_t const tB = vmlal_n_s16(rounding, UV.val[1], 516);

            // Duplicate each chroma term so it pairs with two luma samples.
            int32x4x2_t const R = vzipq_s32(tR, tR); // [tR0, tR0, tR1, tR1] [ tR2, tR2, tR3, tR3]
            int32x4x2_t const G = vzipq_s32(tG, tG); // [tG0, tG0, tG1, tG1] [ tG2, tG2, tG3, tG3]
            int32x4x2_t const B = vzipq_s32(tB, tB); // [tB0, tB0, tB1, tB1] [ tB2, tB2, tB3, tB3]

            // upper 8 pixels: saturate to u16, then >> 8 and narrow to bytes.
            trait::store_pixel_block(dst, pblock,
                vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(R.val[0], Y00)),
                                         vqmovun_s32(vaddq_s32(R.val[1], Y01))), 8),
                vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(G.val[0], Y00)),
                                         vqmovun_s32(vaddq_s32(G.val[1], Y01))), 8),
                vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(B.val[0], Y00)),
                                         vqmovun_s32(vaddq_s32(B.val[1], Y01))), 8));

            // lower 8 pixels share the same chroma terms.
            trait::store_pixel_block(dst+stride, pblock,
                vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(R.val[0], Y10)),
                                         vqmovun_s32(vaddq_s32(R.val[1], Y11))), 8),
                vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(G.val[0], Y10)),
                                         vqmovun_s32(vaddq_s32(G.val[1], Y11))), 8),
                vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(B.val[0], Y10)),
                                         vqmovun_s32(vaddq_s32(B.val[1], Y11))), 8));
        }
    }
    return true;
}
// 16-wide VP8 macroblock loop filter: computes filter/hev masks and applies
// the 3-tap refinement (27/18/9 over 64 weights) to p2..q2 across the edge.
// Heavy register reuse — statement order is significant.
static INLINE void vp8_mbloop_filter_neon(
        uint8x16_t qblimit,  // mblimit
        uint8x16_t qlimit,   // limit
        uint8x16_t qthresh,  // thresh
        uint8x16_t q3,       // p2   (NOTE(review): original comment says p2;
                             //       by position this is p3 — verify)
        uint8x16_t q4,       // p2
        uint8x16_t q5,       // p1
        uint8x16_t q6,       // p0
        uint8x16_t q7,       // q0
        uint8x16_t q8,       // q1
        uint8x16_t q9,       // q2
        uint8x16_t q10,      // q3
        uint8x16_t *q4r,     // p1
        uint8x16_t *q5r,     // p1
        uint8x16_t *q6r,     // p0
        uint8x16_t *q7r,     // q0
        uint8x16_t *q8r,     // q1
        uint8x16_t *q9r) {   // q1
    uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8;
    int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16;
    int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8;
    uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16;
    int8x16_t q0s8, q12s8, q14s8, q15s8;
    int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29;

    // Filter mask: max neighboring-difference vs limit.
    q11u8 = vabdq_u8(q3, q4);
    q12u8 = vabdq_u8(q4, q5);
    q13u8 = vabdq_u8(q5, q6);
    q14u8 = vabdq_u8(q8, q7);
    q1u8 = vabdq_u8(q9, q8);
    q0u8 = vabdq_u8(q10, q9);

    q11u8 = vmaxq_u8(q11u8, q12u8);
    q12u8 = vmaxq_u8(q13u8, q14u8);
    q1u8 = vmaxq_u8(q1u8, q0u8);
    q15u8 = vmaxq_u8(q11u8, q12u8);

    q12u8 = vabdq_u8(q6, q7);

    // vp8_hevmask
    q13u8 = vcgtq_u8(q13u8, qthresh);
    q14u8 = vcgtq_u8(q14u8, qthresh);
    q15u8 = vmaxq_u8(q15u8, q1u8);

    q15u8 = vcgeq_u8(qlimit, q15u8);

    q1u8 = vabdq_u8(q5, q8);
    q12u8 = vqaddq_u8(q12u8, q12u8);

    // vp8_filter() function
    // convert to signed (bias by 0x80)
    q0u8 = vdupq_n_u8(0x80);
    q9 = veorq_u8(q9, q0u8);
    q8 = veorq_u8(q8, q0u8);
    q7 = veorq_u8(q7, q0u8);
    q6 = veorq_u8(q6, q0u8);
    q5 = veorq_u8(q5, q0u8);
    q4 = veorq_u8(q4, q0u8);

    // mblimit check: 2*|q0-p0| + |q1-p1|/2 <= mblimit.
    q1u8 = vshrq_n_u8(q1u8, 1);
    q12u8 = vqaddq_u8(q12u8, q1u8);

    q14u8 = vorrq_u8(q13u8, q14u8);
    q12u8 = vcgeq_u8(qblimit, q12u8);

    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
                     vget_low_s8(vreinterpretq_s8_u8(q6)));
    q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
                      vget_high_s8(vreinterpretq_s8_u8(q6)));

    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
                     vreinterpretq_s8_u8(q8));
    q11s16 = vdupq_n_s16(3);
    q2s16 = vmulq_s16(q2s16, q11s16);
    q13s16 = vmulq_s16(q13s16, q11s16);

    q15u8 = vandq_u8(q15u8, q12u8);

    q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
    q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8));

    q12u8 = vdupq_n_u8(3);
    q11u8 = vdupq_n_u8(4);

    // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
    d2 = vqmovn_s16(q2s16);
    d3 = vqmovn_s16(q13s16);
    q1s8 = vcombine_s8(d2, d3);
    q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8));

    // hev-selected part: standard Filter1/Filter2 adjust of p0/q0.
    q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
    q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8));
    q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8));
    q2s8 = vshrq_n_s8(q2s8, 3);
    q13s8 = vshrq_n_s8(q13s8, 3);
    q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8);
    q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8);

    // Non-hev part: wide filter with rounded weights 27/64, 18/64, 9/64
    // (bias 63, vmlal by 9/18/27, then >> 7 with saturation).
    q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));

    q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63);
    d5 = vdup_n_s8(9);
    d4 = vdup_n_s8(18);

    q0s16 = vmlal_s8(vreinterpretq_s16_u16(q0u16), vget_low_s8(q1s8), d5);
    q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5);
    d5 = vdup_n_s8(27);
    q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8), d4);
    q13s16 = vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4);
    q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8), d5);
    q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), vget_high_s8(q1s8), d5);

    d0 = vqshrn_n_s16(q0s16 , 7);
    d1 = vqshrn_n_s16(q11s16, 7);
    d24 = vqshrn_n_s16(q12s16, 7);
    d25 = vqshrn_n_s16(q13s16, 7);
    d28 = vqshrn_n_s16(q14s16, 7);
    d29 = vqshrn_n_s16(q15s16, 7);

    q0s8 = vcombine_s8(d0, d1);
    q12s8 = vcombine_s8(d24, d25);
    q14s8 = vcombine_s8(d28, d29);

    // Apply: +/- u to (p2,q2), (p1,q1), (p0,q0) with saturation.
    q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8);
    q0s8 = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8);
    q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8);
    q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8);
    q15s8 = vqsubq_s8((q7s8), q14s8);
    q14s8 = vqaddq_s8((q6s8), q14s8);

    // Undo the 0x80 bias on all six outputs.
    q1u8 = vdupq_n_u8(0x80);
    *q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8);
    *q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8);
    *q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8);
    *q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8);
    *q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8);
    *q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8);
    return;
}