Exemplo n.º 1
0
// Blends one row of LCD16 (RGB565-packed subpixel coverage) mask values over
// 32-bit premultiplied destination pixels with a solid source color,
// 8 pixels per NEON iteration, finishing the tail with the scalar helper.
//
// dst   - destination pixels, updated in place
// src   - one 565-packed coverage value per pixel
// color - source color; its alpha scales the per-channel coverage
// width - number of pixels in the row
// The trailing unnamed SkPMColor parameter is unused here (kept so the
// function matches the common row-blit proc signature).
void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[],
                                   SkColor color, int width, SkPMColor) {
    int colA = SkColorGetA(color);
    int colR = SkColorGetR(color);
    int colG = SkColorGetG(color);
    int colB = SkColorGetB(color);

    // Widen alpha from 0..255 to 0..256 so a full-alpha scale is exact.
    colA = SkAlpha255To256(colA);

    uint8x8_t vcolR, vcolG, vcolB;
    uint16x8_t vcolA;

    // Broadcast the color once; only needed when the vector loop will run.
    if (width >= 8) {
        vcolA = vdupq_n_u16(colA);
        vcolR = vdup_n_u8(colR);
        vcolG = vdup_n_u8(colG);
        vcolB = vdup_n_u8(colB);
    }

    while (width >= 8) {
        uint8x8x4_t vdst;
        uint16x8_t vmask;
        uint16x8_t vmaskR, vmaskG, vmaskB;

        // De-interleave 8 ARGB pixels into per-channel 8-lane vectors.
        vdst = vld4_u8((uint8_t*)dst);
        vmask = vld1q_u16(src);

        // Get all the color masks on 5 bits
        vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
        vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
                             SK_B16_BITS + SK_R16_BITS + 1);
        vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);

        // Upscale to 0..32
        vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
        vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
        vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);

        // Scale each channel's coverage by the source alpha (0..256),
        // keeping the result in the 0..32 blend range.
        vmaskR = vshrq_n_u16(vmaskR * vcolA, 8);
        vmaskG = vshrq_n_u16(vmaskG * vcolA, 8);
        vmaskB = vshrq_n_u16(vmaskB * vcolA, 8);

        // Result alpha is forced opaque; channels are blended 0..32.
        vdst.val[NEON_A] = vdup_n_u8(0xFF);
        vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
        vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
        vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);

        vst4_u8((uint8_t*)dst, vdst);

        dst += 8;
        src += 8;
        width -= 8;
    }

    // Scalar fallback for the remaining (width % 8) pixels.
    for (int i = 0; i < width; i++) {
        dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]);
    }
}
Exemplo n.º 2
0
// For each of 16 horizontal positions, sums the pixel values down `height`
// rows of `ref`, applies a height-dependent normalizing right shift, and
// stores the 16 results into hbuf as int16.
// Processes 8 rows per iteration, so `height` is assumed to be a multiple
// of 8 (and at least 8) -- TODO confirm against callers.
void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
                          const int ref_stride, const int height) {
  int i;
  // Column accumulators: lanes 0-7 (lo) and 8-15 (hi).
  uint16x8_t vec_sum_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_hi = vdupq_n_u16(0);
  // vshlq_u16 with a negative per-lane count shifts right, so this divides
  // the column sums by 2^((height >> 5) + 3).
  const int shift_factor = ((height >> 5) + 3) * -1;
  const int16x8_t vec_shift = vdupq_n_s16(shift_factor);

  for (i = 0; i < height; i += 8) {
    // Load 8 rows of 16 pixels each.
    const uint8x16_t vec_row1 = vld1q_u8(ref);
    const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride);
    const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2);
    const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3);
    const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4);
    const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5);
    const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6);
    const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7);

    // Widen each byte to 16 bits and accumulate per column.
    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1));
    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1));

    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2));
    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2));

    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3));
    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3));

    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4));
    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4));

    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5));
    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5));

    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6));
    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6));

    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7));
    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7));

    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8));
    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8));

    ref += ref_stride * 8;
  }

  // Normalize the sums (negative shift count => right shift).
  vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift);
  vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift);

  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo));
  hbuf += 8;
  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
}
Exemplo n.º 3
0
/* Type-check vdupq_n_u16: broadcast a uint16_t scalar into all 8 lanes of a
 * uint16x8_t.  The argument is now initialized before use -- the original
 * read an uninitialized automatic variable, which is undefined behavior. */
void test_vdupQ_nu16 (void)
{
  uint16x8_t out_uint16x8_t;
  uint16_t arg0_uint16_t = 0;

  out_uint16x8_t = vdupq_n_u16 (arg0_uint16_t);
  (void) out_uint16x8_t;  /* result only exists to exercise the intrinsic */
}
Exemplo n.º 4
0
/* u16x8 mv mul */
/*
 * Matrix-times-vector multiply on unsigned shorts: C = A * B, where A is a
 * Row x T matrix and B a T-element vector, processed 8 rows by 8 columns per
 * step.  Results accumulate mod 2^16 (uint16 wraparound).
 *
 * Assumes Row and T are multiples of 8 -- TODO confirm against callers.
 *
 * NOTE(review): the initial A index uses `k * T + i` but the per-column
 * advance uses `j += Row`.  For a column-major layout with leading dimension
 * Row the start offset would normally be `k * Row + i`; as written the two
 * agree only when Row == T.  Confirm the intended storage layout (the mm
 * variant in this file has the same inconsistency).
 */
void mw_neon_mv_mul_u16x8(unsigned short * A, int Row, int T, unsigned short * B, unsigned short * C)
{
	int i = 0;
	int k = 0;

	uint16x8_t neon_b, neon_c;
	uint16x8_t neon_a0, neon_a1, neon_a2, neon_a3, neon_a4, neon_a5, neon_a6, neon_a7;
	uint16x8_t neon_b0, neon_b1, neon_b2, neon_b3, neon_b4, neon_b5, neon_b6, neon_b7;

	for (i = 0; i < Row; i+=8)
	{
		/* accumulator for output elements C[i..i+7] */
		neon_c = vmovq_n_u16(0);

		for (k = 0; k < T; k+=8)
		{
			int j = k * T + i;

			/* load 8 consecutive columns of A for rows i..i+7 */
			neon_a0 = vld1q_u16(A + j);
			j+=Row;
			neon_a1 = vld1q_u16(A + j);
			j+=Row;
			neon_a2 = vld1q_u16(A + j);
			j+=Row;
			neon_a3 = vld1q_u16(A + j);
			j+=Row;
			neon_a4 = vld1q_u16(A + j);
			j+=Row;
			neon_a5 = vld1q_u16(A + j);
			j+=Row;
			neon_a6 = vld1q_u16(A + j);
			j+=Row;
			neon_a7 = vld1q_u16(A + j);

			/* broadcast each of the 8 vector elements B[k..k+7] */
			neon_b = vld1q_u16(B + k);
			neon_b0 = vdupq_n_u16(vgetq_lane_u16(neon_b, 0));
			neon_b1 = vdupq_n_u16(vgetq_lane_u16(neon_b, 1));
			neon_b2 = vdupq_n_u16(vgetq_lane_u16(neon_b, 2));
			neon_b3 = vdupq_n_u16(vgetq_lane_u16(neon_b, 3));
			neon_b4 = vdupq_n_u16(vgetq_lane_u16(neon_b, 4));
			neon_b5 = vdupq_n_u16(vgetq_lane_u16(neon_b, 5));
			neon_b6 = vdupq_n_u16(vgetq_lane_u16(neon_b, 6));
			neon_b7 = vdupq_n_u16(vgetq_lane_u16(neon_b, 7));

			/* multiply-accumulate each column against its B element */
			neon_c = vaddq_u16(vmulq_u16(neon_a0, neon_b0), neon_c);
			neon_c = vaddq_u16(vmulq_u16(neon_a1, neon_b1), neon_c);
			neon_c = vaddq_u16(vmulq_u16(neon_a2, neon_b2), neon_c);
			neon_c = vaddq_u16(vmulq_u16(neon_a3, neon_b3), neon_c);
			neon_c = vaddq_u16(vmulq_u16(neon_a4, neon_b4), neon_c);
			neon_c = vaddq_u16(vmulq_u16(neon_a5, neon_b5), neon_c);
			neon_c = vaddq_u16(vmulq_u16(neon_a6, neon_b6), neon_c);
			neon_c = vaddq_u16(vmulq_u16(neon_a7, neon_b7), neon_c);

		}

		vst1q_u16(C + i, neon_c);
	}
}
Exemplo n.º 5
0
// Sums all `width` bytes of the row at `ref` and returns the scalar total
// (folded down by horizontal_add_u16x8, defined elsewhere).  Bytes are
// processed 16 at a time, so `width` is assumed to be a multiple of 16.
// The uint16 accumulator implies width * 255 must fit in 16 bits per lane
// -- presumably guaranteed by callers; TODO confirm.
int16_t vp9_int_pro_col_neon(uint8_t const *ref, const int width) {
  int i;
  uint16x8_t vec_sum = vdupq_n_u16(0);

  for (i = 0; i < width; i += 16) {
    const uint8x16_t vec_row = vld1q_u8(ref);
    // Widen-and-accumulate both halves into the same 8-lane accumulator.
    vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));
    vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));
    ref += 16;
  }

  return horizontal_add_u16x8(vec_sum);
}
Exemplo n.º 6
0
// TM (true-motion) 4x4 intra predictor:
//   dst[r][c] = clamp8(left[r] + above[c] - above[-1])
// q3u16 holds (above[c] - above[-1]) widened to 16 bits; each row adds
// left[r] and saturating-narrows back to bytes.
void vp9_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  int i;
  uint16x8_t q1u16, q3u16;
  int16x8_t q1s16;
  uint8x8_t d0u8 = vdup_n_u8(0);
  uint32x2_t d2u32 = vdup_n_u32(0);

  // top-left pixel broadcast, and the 4 above pixels in one 32-bit lane
  d0u8 = vld1_dup_u8(above - 1);
  d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
  // widening subtract: above[c] - above[-1] (may be negative; reinterpreted
  // as signed below)
  q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
  for (i = 0; i < 4; i++, dst += stride) {
    q1u16 = vdupq_n_u16((uint16_t)left[i]);
    q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
                      vreinterpretq_s16_u16(q3u16));
    // saturating narrow clamps the sum to 0..255
    d0u8 = vqmovun_s16(q1s16);
    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  }
}
Exemplo n.º 7
0
// Converts 8 RGB pixels to luma in fixed point:
//   Y = round((16839*R + 33059*G + 6420*B) / 65536) + 16
// The coefficients are approximately 0.257/0.504/0.098 scaled by 2^16,
// i.e. BT.601 "studio swing" weights with a +16 offset; the final
// saturating narrow clamps to 0..255.
static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R,
                                    const uint8x8_t G,
                                    const uint8x8_t B) {
  // widen each channel to 16 bits, then split for 32-bit accumulation
  const uint16x8_t r = vmovl_u8(R);
  const uint16x8_t g = vmovl_u8(G);
  const uint16x8_t b = vmovl_u8(B);
  const uint16x4_t r_lo = vget_low_u16(r);
  const uint16x4_t r_hi = vget_high_u16(r);
  const uint16x4_t g_lo = vget_low_u16(g);
  const uint16x4_t g_hi = vget_high_u16(g);
  const uint16x4_t b_lo = vget_low_u16(b);
  const uint16x4_t b_hi = vget_high_u16(b);
  // multiply-accumulate the three weighted channels into 32-bit lanes
  const uint32x4_t tmp0_lo = vmull_n_u16(         r_lo, 16839u);
  const uint32x4_t tmp0_hi = vmull_n_u16(         r_hi, 16839u);
  const uint32x4_t tmp1_lo = vmlal_n_u16(tmp0_lo, g_lo, 33059u);
  const uint32x4_t tmp1_hi = vmlal_n_u16(tmp0_hi, g_hi, 33059u);
  const uint32x4_t tmp2_lo = vmlal_n_u16(tmp1_lo, b_lo, 6420u);
  const uint32x4_t tmp2_hi = vmlal_n_u16(tmp1_hi, b_hi, 6420u);
  // rounding narrow by 16 == divide by 65536 with rounding
  const uint16x8_t Y1 = vcombine_u16(vrshrn_n_u32(tmp2_lo, 16),
                                     vrshrn_n_u32(tmp2_hi, 16));
  const uint16x8_t Y2 = vaddq_u16(Y1, vdupq_n_u16(16));
  return vqmovn_u16(Y2);
}
Exemplo n.º 8
0
// Blends a pre-expanded solid color into a vertical run of `height` RGB565
// pixels spaced `deviceRB` bytes apart:
//   *device = Compact((src32 + Expand(*device) * scale) >> 5)
// where Expand/Compact are Skia's 565 <-> spread-32-bit helpers.  The NEON
// path handles 8 pixels per iteration via the LOAD_LANE_16 / STORE_LANE_16
// macros defined elsewhere.
// NOTE(review): those macros presumably advance `device` by deviceRB per
// lane -- the scalar tail below reads `device` directly, so confirm the
// macros keep it in sync when height is not a multiple of 8.
void SkRGB16BlitterBlitV_neon(uint16_t* device,
                              int height,
                              size_t deviceRB,
                              unsigned scale,
                              uint32_t src32) {
    if (height >= 8)
    {
        uint16_t* dst = device;

        // prepare constants
        uint16x8_t vdev = vdupq_n_u16(0);
        uint16x8_t vmaskq_g16 = vdupq_n_u16(SK_G16_MASK_IN_PLACE);
        uint16x8_t vmaskq_ng16 = vdupq_n_u16(~SK_G16_MASK_IN_PLACE);
        uint32x4_t vsrc32 = vdupq_n_u32(src32);
        uint32x4_t vscale5 = vdupq_n_u32((uint32_t)scale);

        while (height >= 8){
            // gather 8 strided pixels into vdev, one lane at a time
            LOAD_LANE_16(vdev, 0)
            LOAD_LANE_16(vdev, 1)
            LOAD_LANE_16(vdev, 2)
            LOAD_LANE_16(vdev, 3)
            LOAD_LANE_16(vdev, 4)
            LOAD_LANE_16(vdev, 5)
            LOAD_LANE_16(vdev, 6)
            LOAD_LANE_16(vdev, 7)

            // Expand_rgb_16
            uint16x8x2_t vdst = vzipq_u16((vdev & vmaskq_ng16), (vdev & vmaskq_g16));
            uint32x4_t vdst32_lo = vmulq_u32(vreinterpretq_u32_u16(vdst.val[0]), vscale5);
            uint32x4_t vdst32_hi = vmulq_u32(vreinterpretq_u32_u16(vdst.val[1]), vscale5);

            // Compact_rgb_16
            vdst32_lo = vaddq_u32(vdst32_lo, vsrc32);
            vdst32_hi = vaddq_u32(vdst32_hi, vsrc32);
            vdst32_lo = vshrq_n_u32(vdst32_lo, 5);
            vdst32_hi = vshrq_n_u32(vdst32_hi, 5);

            // re-pack: low 16 bits hold R|B, high 16 bits hold G
            uint16x4_t vtmp_lo = vmovn_u32(vdst32_lo) & vget_low_u16(vmaskq_ng16);
            uint16x4_t vtmp_hi = vshrn_n_u32(vdst32_lo, 16) & vget_low_u16(vmaskq_g16);
            uint16x4_t vdst16_lo = vorr_u16(vtmp_lo, vtmp_hi);
            vtmp_lo = vmovn_u32(vdst32_hi) & vget_low_u16(vmaskq_ng16);
            vtmp_hi = vshrn_n_u32(vdst32_hi, 16) & vget_low_u16(vmaskq_g16);
            uint16x4_t vdst16_hi = vorr_u16(vtmp_lo, vtmp_hi);

            // scatter the 8 results back out with the device stride
            STORE_LANE_16(vdst16_lo, 0)
            STORE_LANE_16(vdst16_lo, 1)
            STORE_LANE_16(vdst16_lo, 2)
            STORE_LANE_16(vdst16_lo, 3)
            STORE_LANE_16(vdst16_hi, 0)
            STORE_LANE_16(vdst16_hi, 1)
            STORE_LANE_16(vdst16_hi, 2)
            STORE_LANE_16(vdst16_hi, 3)
            height -= 8;
        }
    }
    // scalar tail (and the whole loop when height < 8)
    while (height != 0){
        uint32_t dst32 = SkExpand_rgb_16(*device) * scale;
        *device = SkCompact_rgb_16((src32 + dst32) >> 5);
        device = (uint16_t*)((char*)device + deviceRB);
        height--;
    }
}
Exemplo n.º 9
0
// Opaque-color variant of the LCD16 row blit: blends a solid color over
// 32-bit pixels using 565-packed subpixel coverage, with two fast paths
// selected per pixel by compare masks:
//   mask == 0x0000 -> destination (and its alpha) left untouched
//   mask == 0xFFFF -> the precomputed fully-covered pixel `opaqueDst`
// Everything else is an 0..32 per-channel blend with forced opaque alpha.
void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
                                        SkColor color, int width,
                                        SkPMColor opaqueDst) {
    int colR = SkColorGetR(color);
    int colG = SkColorGetG(color);
    int colB = SkColorGetB(color);

    uint8x8_t vcolR, vcolG, vcolB;
    uint8x8_t vopqDstA, vopqDstR, vopqDstG, vopqDstB;

    // Broadcast constants once; only needed when the vector loop will run.
    if (width >= 8) {
        vcolR = vdup_n_u8(colR);
        vcolG = vdup_n_u8(colG);
        vcolB = vdup_n_u8(colB);
        vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst));
        vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst));
        vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst));
        vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst));
    }

    while (width >= 8) {
        uint8x8x4_t vdst;
        uint16x8_t vmask;
        uint16x8_t vmaskR, vmaskG, vmaskB;
        uint8x8_t vsel_trans, vsel_opq;

        vdst = vld4_u8((uint8_t*)dst);
        vmask = vld1q_u16(src);

        // Prepare compare masks
        vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0)));
        vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF)));

        // Get all the color masks on 5 bits
        vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
        vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
                             SK_B16_BITS + SK_R16_BITS + 1);
        vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);

        // Upscale to 0..32
        vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
        vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
        vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);

        // Alpha: keep dst alpha for transparent lanes, opaqueDst alpha for
        // fully-covered lanes, 0xFF otherwise.
        vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF));
        vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]);

        vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
        vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
        vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);

        // Fully-covered lanes take the precomputed opaque pixel verbatim.
        vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]);
        vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]);
        vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]);

        vst4_u8((uint8_t*)dst, vdst);

        dst += 8;
        src += 8;
        width -= 8;
    }

    // Leftovers
    for (int i = 0; i < width; i++) {
        dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i],
                                    opaqueDst);
    }
}
Exemplo n.º 10
0
// Codegen test: vdupq_n_u16 should lower to a single DUP of the scalar into
// all eight 16-bit lanes.  The "CHECK" comments are FileCheck patterns and
// must not be altered.
uint16x8_t test_vdupq_n_u16(uint16_t v1) {
  // CHECK: test_vdupq_n_u16
  return vdupq_n_u16(v1);
  // CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}}
}
/**
 * Get the sum of absolute differences for a specific pixel location and disparity
 *
 * @param leftImage left image
 * @param rightImage right image
 * @param laplacianL laplacian-filtered left image
 * @param laplacianR laplacian-filtered right image
 * @param pxX x (column) coordinate of the block's top-left corner
 * @param pxY y (row) coordinate of the block's top-left corner
 * @param left_interest optional output: summed laplacian (interest) value
 *      for the left block
 * @param right_interest same as above, for the right image
 * @param raw_sad optional output: the unscaled sum of absolute differences
 *
 * @retval scaled sum of absolute differences for this block --
 *      the value is the sum/numberOfPixels, or -1 when either interest
 *      value is below m_iSobelLimit
 */
int PushbroomStereo::GetSAD(Mat leftImage, Mat rightImage, Mat laplacianL, Mat laplacianR, int pxX, int pxY, int *left_interest, int *right_interest, int *raw_sad)
{
    // top left corner of the SAD box
    int startX = pxX;
    int startY = pxY;

    // bottom right corner of the SAD box
    // (the NEON path always processes a fixed 8-wide row, so endX is only
    // needed by the scalar fallback)
    #ifndef USE_NEON
        int endX = pxX + m_iBlockSize - 1;
    #endif

    int endY = pxY + m_iBlockSize - 1;

    #if USE_SAFTEY_CHECKS
        int flag = false;
        if (startX < 0)
        {
            printf("Warning: startX < 0\n");
            flag = true;
        }

        if (endX > rightImage.cols)
        {
            printf("Warning: endX > leftImage.cols\n");
            flag = true;
        }

        if (startX + disparity < 0)
        {
            printf("Warning: startX + disparity < 0\n");
            flag = true;
        }

        if (endX + disparity > rightImage.cols)
        {
            printf("Warning: endX + disparity > leftImage.cols\n");
            flag = true;
        }

        if (endX + disparity > rightImage.cols)
        {
            printf("Warning: endX + disparity > rightImage.cols\n");
            endX = rightImage.cols - disparity;
            flag = true;
        }

        if (startY < 0) {
            printf("Warning: startY < 0\n");
            flag = true;
        }

        if (endY > rightImage.rows) {
            printf("Warning: endY > rightImage.rows\n");
            flag = true;
        }

        // disparity might be negative as well
        if (disparity < 0 && startX + disparity < 0)
        {
            printf("Warning: disparity < 0 && startX + disparity < 0\n");
            startX = -disparity;
            flag = true;
        }

        if (flag == true)
        {
            printf("startX = %d, endX = %d, disparity = %d, startY = %d, endY = %d\n", startX, endX, disparity, startY, endY);
        }



        startX = max(0, startX);
        startY = max(0, startY);

        endX = min(leftImage.cols - disparity, endX);
        endY = min(leftImage.rows, endY);
    #endif

    int leftVal = 0, rightVal = 0;

    int sad = 0;

    #ifdef USE_NEON
        uint16x8_t interest_op_sum_8x_L, interest_op_sum_8x_R, sad_sum_8x;

        // load zeros into everything
        interest_op_sum_8x_L = vdupq_n_u16(0);
        interest_op_sum_8x_R = vdupq_n_u16(0);
        sad_sum_8x = vdupq_n_u16(0);

    #endif
    for (int i=startY;i<=endY;i++) 
	{
		// bounds guard: skip rows past the bottom of the image
		if(i>=leftImage.rows-1)
			continue;
        //Get a pointer for this row
        uchar *this_rowL = leftImage.ptr<uchar>(i);
        uchar *this_rowR = rightImage.ptr<uchar>(i);

        uchar *this_row_laplacianL = laplacianL.ptr<uchar>(i);
        uchar *this_row_laplacianR = laplacianR.ptr<uchar>(i);

        #ifdef USE_NEON
            // load this row into memory
            // NOTE(review): the NEON path uses `disparity` while the scalar
            // path uses m_iDisparity -- presumably the same value; confirm.
            uint8x8_t this_row_8x8_L = vld1_u8(this_rowL + startX);
            uint8x8_t this_row_8x8_R = vld1_u8(this_rowR + startX + disparity);

            uint8x8_t interest_op_8x8_L = vld1_u8(this_row_laplacianL + startX);
            uint8x8_t interest_op_8x8_R = vld1_u8(this_row_laplacianR + startX + disparity);

            // do absolute differencing for the entire row in one operation!
            uint8x8_t sad_8x = vabd_u8(this_row_8x8_L, this_row_8x8_R);

            // sum up
            sad_sum_8x = vaddw_u8(sad_sum_8x, sad_8x);

            // sum laplacian values
            interest_op_sum_8x_L = vaddw_u8(interest_op_sum_8x_L, interest_op_8x8_L);
            interest_op_sum_8x_R = vaddw_u8(interest_op_sum_8x_R, interest_op_8x8_R);

        #else // USE_NEON
            for (int j=startX;j<=endX;j++) {
                // we are now looking at a single pixel value
                /*uchar pxL = leftImage.at<uchar>(i,j);
                uchar pxR = rightImage.at<uchar>(i,j + disparity);

                uchar sL = laplacianL.at<uchar>(i,j);
                uchar sR = laplacianR.at<uchar>(i,j + disparity);
                */


                uchar sL = this_row_laplacianL[j];//laplacianL.at<uchar>(i,j);
                uchar sR = this_row_laplacianR[j + m_iDisparity]; //laplacianR.at<uchar>(i,j + disparity);

                leftVal += sL;
                rightVal += sR;

                uchar pxL = this_rowL[j];
                uchar pxR = this_rowR[j + m_iDisparity];

                sad += abs(pxL - pxR);
            }
        #endif // USE_NEON
    }

    #ifdef USE_NEON
        // sum up
        // NOTE(review): only lanes 0-4 are folded in (lanes 5-7 are
        // deliberately commented out), i.e. an effective 5-pixel-wide
        // window -- confirm this matches the intended block width.
        sad = vgetq_lane_u16(sad_sum_8x, 0) + vgetq_lane_u16(sad_sum_8x, 1)
           + vgetq_lane_u16(sad_sum_8x, 2) + vgetq_lane_u16(sad_sum_8x, 3)
           + vgetq_lane_u16(sad_sum_8x, 4);// + vgetq_lane_u16(sad_sum_8x, 5)
    //           + vgetq_lane_u16(sad_sum_8x, 6) + vgetq_lane_u16(sad_sum_8x, 7);

        leftVal = vgetq_lane_u16(interest_op_sum_8x_L, 0)
                + vgetq_lane_u16(interest_op_sum_8x_L, 1)
                + vgetq_lane_u16(interest_op_sum_8x_L, 2)
                + vgetq_lane_u16(interest_op_sum_8x_L, 3)
                + vgetq_lane_u16(interest_op_sum_8x_L, 4);


        rightVal = vgetq_lane_u16(interest_op_sum_8x_R, 0)
                 + vgetq_lane_u16(interest_op_sum_8x_R, 1)
                 + vgetq_lane_u16(interest_op_sum_8x_R, 2)
                 + vgetq_lane_u16(interest_op_sum_8x_R, 3)
                 + vgetq_lane_u16(interest_op_sum_8x_R, 4);
    #endif

    //cout << "(" << leftVal << ", " << rightVal << ") vs. (" << leftVal2 << ", " << rightVal2 << ")" << endl;

    int laplacian_value = leftVal + rightVal;

	// penalize low-texture blocks: below this interest threshold the
	// combined laplacian weight is cut by 10x
	int fThresh = 200;
	if((leftVal<fThresh)||(rightVal<fThresh))
		laplacian_value /= 10;
    //cout << "sad with neon: " << sad << " without neon: " << sad2 << endl;
    if (left_interest != NULL)         *left_interest = leftVal; 
    if (right_interest != NULL)        *right_interest = rightVal;

    // percentage of total interest value that is different
    //float diff_score = 100*(float)abs(leftVal - rightVal)/(float)laplacian_value;

    if (raw_sad != NULL)        *raw_sad = sad;
    if (leftVal < m_iSobelLimit || rightVal < m_iSobelLimit)// || diff_score > state.interest_diff_limit)
        return -1;

    // weight laplacian_value into the score

    //return sobel;
    return NUMERIC_CONST*(float)sad/(float)laplacian_value;
}
Exemplo n.º 12
0
/*
 * Per-pixel bilinear blend of ARGB32 pixels with NEON.
 * For each output pixel, blends the four source pixels *_p0.._p3 using the
 * per-pixel horizontal weight *_ax and the shared vertical weight _ay
 * (both 8-bit fixed point): first p0->p1 and p2->p3 horizontally, then the
 * two intermediate results vertically.  Channel math is done in 16-bit
 * lanes and masked back to 8 bits.  When all four inputs are zero the
 * pixel is copied through unchanged.
 *
 * Fixes vs. the original:
 *  - the low half of the final vcombine_u16 was an uninitialized variable
 *    (temp_16x4) -- undefined behavior even though only the high 32-bit
 *    lane was stored; the combine now duplicates the computed half.
 *  - the lane-set source vectors are zero-initialized before the first
 *    vset_lane_u32, which otherwise reads an indeterminate vector.
 *
 * Returns 0 always.
 */
int neon_new(DATA32* _p0, DATA32* _p1, DATA32* _p2, DATA32* _p3, DATA32* _ax, DATA32 _ay, DATA32* result, int len) {
  int ay = _ay;
  int i;
  DATA32* pbuf = result;

  uint16x4_t ay_16x4 = vdup_n_u16(ay);       /* vertical weight, broadcast */
  uint16x8_t x255_16x8 = vdupq_n_u16(0xff);  /* per-channel byte mask */
  uint16x8_t ax_16x8;
  uint16x8_t p0_p2_16x8;
  uint16x8_t p1_p3_16x8;
  uint32x2_t p0_p2_32x2 = vdup_n_u32(0);
  uint32x2_t p1_p3_32x2 = vdup_n_u32(0);
  uint32x2_t res_32x2;
  uint8x8_t p0_p2_8x8;
  uint8x8_t p1_p3_8x8;
  uint8x8_t p2_8x8;
  uint16x4_t p0_16x4;
  uint16x4_t p2_16x4;

  for (i = 0; i < len; i++) {
    DATA32 p0 = *_p0++;
    DATA32 p1 = *_p1++;
    DATA32 p2 = *_p2++;
    DATA32 p3 = *_p3++;
    int ax = *_ax++;

    if (p0 | p1 | p2 | p3) {
      ax_16x8 = vdupq_n_u16(ax);

      /* pack p0/p2 and p1/p3 pairwise, then widen each byte to 16 bits */
      p0_p2_32x2 = vset_lane_u32(p0, p0_p2_32x2, 0);
      p0_p2_32x2 = vset_lane_u32(p2, p0_p2_32x2, 1);
      p1_p3_32x2 = vset_lane_u32(p1, p1_p3_32x2, 0);
      p1_p3_32x2 = vset_lane_u32(p3, p1_p3_32x2, 1);

      p0_p2_8x8 = vreinterpret_u8_u32(p0_p2_32x2);
      p1_p3_8x8 = vreinterpret_u8_u32(p1_p3_32x2);
      p1_p3_16x8 = vmovl_u8(p1_p3_8x8);
      p0_p2_16x8 = vmovl_u8(p0_p2_8x8);

      /* horizontal lerp: p0 + (((p1 - p0) * ax) >> 8); the mod-2^16
       * wraparound of the unsigned subtract is intentional and is
       * cancelled by the final & 0xff */
      p1_p3_16x8 = vsubq_u16(p1_p3_16x8, p0_p2_16x8);
      p1_p3_16x8 = vmulq_u16(p1_p3_16x8, ax_16x8);
      p1_p3_16x8 = vshrq_n_u16(p1_p3_16x8, 8);
      p1_p3_16x8 = vaddq_u16(p1_p3_16x8, p0_p2_16x8);
      p1_p3_16x8 = vandq_u16(p1_p3_16x8, x255_16x8);

      /* vertical lerp between the two horizontal results */
      p0_16x4 = vget_low_u16(p1_p3_16x8);
      p2_16x4 = vget_high_u16(p1_p3_16x8);

      p2_16x4 = vsub_u16(p2_16x4, p0_16x4);
      p2_16x4 = vmul_u16(p2_16x4, ay_16x4);
      p2_16x4 = vshr_n_u16(p2_16x4, 8);
      p2_16x4 = vadd_u16(p2_16x4, p0_16x4);

      /* narrow to bytes; the stored lane 1 corresponds to the computed
       * half (same lane the original stored) */
      p1_p3_16x8 = vcombine_u16(p2_16x4, p2_16x4);
      p2_8x8 = vmovn_u16(p1_p3_16x8);
      res_32x2 = vreinterpret_u32_u8(p2_8x8);
      vst1_lane_u32(pbuf++, res_32x2, 1);
    } else {
      *pbuf++ = p0;
    }
  }
  return 0;
}
Exemplo n.º 13
0
#  ifndef COLBLACK
   uint16x4_t temp_16x4;
   uint16x4_t rv_16x4;
   uint16x4_t val1_16x4;
   uint16x4_t val3_16x4;
   uint16x8_t ru_16x8;
   uint16x8_t val1_val3_16x8;
   uint16x8_t val2_val4_16x8;
   uint16x8_t x255_16x8;
   uint32x2_t res_32x2;
   uint32x2_t val1_val3_32x2;
   uint32x2_t val2_val4_32x2;
   uint8x8_t val1_val3_8x8;
   uint8x8_t val2_val4_8x8;

   x255_16x8 = vdupq_n_u16(0xff);
#   ifdef COLMUL
   uint16x4_t x255_16x4;
   x255_16x4 = vget_low_u16(x255_16x8);
   uint16x4_t c1_16x4;
#    ifdef COLSAME
   uint16x4_t c1_val3_16x4;
   uint16x8_t c1_16x8;
   uint16x8_t c1_val3_16x8;
   uint32x2_t c1_32x2;
   uint8x8_t c1_8x8;
   uint8x8_t c1_val3_8x8;

   c1_32x2 = vset_lane_u32(c1, c1_32x2, 0);
   c1_8x8 = vreinterpret_u8_u32(c1_32x2);
   c1_16x8 = vmovl_u8(c1_8x8);
Exemplo n.º 14
0
/* u16x8 mm mul */
/*
 * Matrix-matrix multiply on unsigned shorts: C = A * B, where A is Row x T
 * and B is T x Col, producing 8 output rows per outer step and one output
 * column per middle step.  Results accumulate mod 2^16 (uint16 wraparound).
 *
 * Assumes Row, T (and the per-column inner trip) are multiples of 8 --
 * TODO confirm against callers.
 *
 * NOTE(review): the initial A index uses `j * T + i` but the per-column
 * advance uses `j_T += Row`; these agree only when Row == T (same
 * inconsistency as the mv variant above).  Also, the eight vst1q_lane_u16
 * stores (equivalent to a single vst1q_u16) sit *inside* the j loop, so
 * partial sums are written repeatedly; only the final iteration's store is
 * meaningful.  Harmless for the result, but worth confirming/hoisting.
 */
void mw_neon_mm_mul_u16x8(unsigned short * A, int Row, int T, unsigned short * B, int Col, unsigned short * C)
{
	int i, k, j;

	uint16x8_t neon_b, neon_c;
	uint16x8_t neon_a0, neon_a1, neon_a2, neon_a3, neon_a4, neon_a5, neon_a6, neon_a7;
	uint16x8_t neon_b0, neon_b1, neon_b2, neon_b3, neon_b4, neon_b5, neon_b6, neon_b7;

	for (i = 0; i < Row; i+=8)
	{

		for (k = 0; k < Col; k+=1)
		{
			/* accumulator for C[i..i+7, k] */
			neon_c = vmovq_n_u16(0);

			for (j = 0; j < T; j+=8)
			{

				int j_T = j * T + i;
				int k_Row = k * Row;

				/* load 8 consecutive columns of A for rows i..i+7 */
				neon_a0 = vld1q_u16(A + j_T);
				j_T+=Row;
				neon_a1 = vld1q_u16(A + j_T);
				j_T+=Row;
				neon_a2 = vld1q_u16(A + j_T);
				j_T+=Row;
				neon_a3 = vld1q_u16(A + j_T);
				j_T+=Row;
				neon_a4 = vld1q_u16(A + j_T);
				j_T+=Row;
				neon_a5 = vld1q_u16(A + j_T);
				j_T+=Row;
				neon_a6 = vld1q_u16(A + j_T);
				j_T+=Row;
				neon_a7 = vld1q_u16(A + j_T);

				/* broadcast B[j..j+7, k] one element at a time */
				neon_b = vld1q_u16(B + k_Row + j);
				neon_b0 = vdupq_n_u16(vgetq_lane_u16(neon_b, 0));
				neon_b1 = vdupq_n_u16(vgetq_lane_u16(neon_b, 1));
				neon_b2 = vdupq_n_u16(vgetq_lane_u16(neon_b, 2));
				neon_b3 = vdupq_n_u16(vgetq_lane_u16(neon_b, 3));
				neon_b4 = vdupq_n_u16(vgetq_lane_u16(neon_b, 4));
				neon_b5 = vdupq_n_u16(vgetq_lane_u16(neon_b, 5));
				neon_b6 = vdupq_n_u16(vgetq_lane_u16(neon_b, 6));
				neon_b7 = vdupq_n_u16(vgetq_lane_u16(neon_b, 7));

				/* multiply-accumulate each A column against its B element */
				neon_c = vaddq_u16(vmulq_u16(neon_a0, neon_b0), neon_c);
				neon_c = vaddq_u16(vmulq_u16(neon_a1, neon_b1), neon_c);
				neon_c = vaddq_u16(vmulq_u16(neon_a2, neon_b2), neon_c);
				neon_c = vaddq_u16(vmulq_u16(neon_a3, neon_b3), neon_c);
				neon_c = vaddq_u16(vmulq_u16(neon_a4, neon_b4), neon_c);
				neon_c = vaddq_u16(vmulq_u16(neon_a5, neon_b5), neon_c);
				neon_c = vaddq_u16(vmulq_u16(neon_a6, neon_b6), neon_c);
				neon_c = vaddq_u16(vmulq_u16(neon_a7, neon_b7), neon_c);

				vst1q_lane_u16(C + k_Row + i, neon_c, 0);
				vst1q_lane_u16(C + k_Row + i + 1, neon_c, 1);
				vst1q_lane_u16(C + k_Row + i + 2, neon_c, 2);
				vst1q_lane_u16(C + k_Row + i + 3, neon_c, 3);
				vst1q_lane_u16(C + k_Row + i + 4, neon_c, 4);
				vst1q_lane_u16(C + k_Row + i + 5, neon_c, 5);
				vst1q_lane_u16(C + k_Row + i + 6, neon_c, 6);
				vst1q_lane_u16(C + k_Row + i + 7, neon_c, 7);

			}
		}
	}
}
Exemplo n.º 15
0
inline  uint16x8_t vdupq_n(const u16 & val) { return vdupq_n_u16(val); }
Exemplo n.º 16
0
/* Type-check vdupq_n_u16 with a constant operand (~0x1200);
 * out_uint16x8_t is a file-scope variable declared elsewhere. */
void test_vdupq_nu16 (void)
{
  out_uint16x8_t = vdupq_n_u16 (~0x1200);
}
Exemplo n.º 17
0
// Core 4-tap loop filter applied to 16 pixel columns at once (two 8-wide
// halves packed per q register).  Builds the filter mask from the
// p3..q3 neighborhood against blimit/limit and the high-edge-variance
// (hev) mask against thresh, then applies the clamped filter and writes
// the adjusted p1/p0/q0/q1 through q5r..q8r.  The input vectors
// q3/q4/q9/q10 are reused as scratch and clobbered.
static INLINE void loop_filter_neon_16(uint8x16_t qblimit,  // blimit
                                       uint8x16_t qlimit,   // limit
                                       uint8x16_t qthresh,  // thresh
                                       uint8x16_t q3,       // p3
                                       uint8x16_t q4,       // p2
                                       uint8x16_t q5,       // p1
                                       uint8x16_t q6,       // p0
                                       uint8x16_t q7,       // q0
                                       uint8x16_t q8,       // q1
                                       uint8x16_t q9,       // q2
                                       uint8x16_t q10,      // q3
                                       uint8x16_t *q5r,     // p1
                                       uint8x16_t *q6r,     // p0
                                       uint8x16_t *q7r,     // q0
                                       uint8x16_t *q8r) {   // q1
  uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
  int16x8_t q2s16, q11s16;
  uint16x8_t q4u16;
  int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8;
  int8x8_t d2s8, d3s8;

  // neighbor deltas for the filter mask: |p3-p2|, |p2-p1|, |p1-p0|,
  // |q1-q0|, |q2-q1|, |q3-q2|
  q11u8 = vabdq_u8(q3, q4);
  q12u8 = vabdq_u8(q4, q5);
  q13u8 = vabdq_u8(q5, q6);
  q14u8 = vabdq_u8(q8, q7);
  q3 = vabdq_u8(q9, q8);
  q4 = vabdq_u8(q10, q9);

  q11u8 = vmaxq_u8(q11u8, q12u8);
  q12u8 = vmaxq_u8(q13u8, q14u8);
  q3 = vmaxq_u8(q3, q4);
  q15u8 = vmaxq_u8(q11u8, q12u8);

  q9 = vabdq_u8(q6, q7);

  // vp8_hevmask
  q13u8 = vcgtq_u8(q13u8, qthresh);
  q14u8 = vcgtq_u8(q14u8, qthresh);
  q15u8 = vmaxq_u8(q15u8, q3);

  q2u8 = vabdq_u8(q5, q8);
  q9 = vqaddq_u8(q9, q9);

  // mask: all neighbor deltas within `limit`
  q15u8 = vcgeq_u8(qlimit, q15u8);

  // vp8_filter() function
  // convert to signed
  q10 = vdupq_n_u8(0x80);
  q8 = veorq_u8(q8, q10);
  q7 = veorq_u8(q7, q10);
  q6 = veorq_u8(q6, q10);
  q5 = veorq_u8(q5, q10);

  // blimit check uses |p0-q0|*2 + |p1-q1|/2
  q2u8 = vshrq_n_u8(q2u8, 1);
  q9 = vqaddq_u8(q9, q2u8);

  q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
                   vget_low_s8(vreinterpretq_s8_u8(q6)));
  q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
                    vget_high_s8(vreinterpretq_s8_u8(q6)));

  q9 = vcgeq_u8(qblimit, q9);

  q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8));

  q14u8 = vorrq_u8(q13u8, q14u8);

  q4u16 = vdupq_n_u16(3);
  q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
  q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));

  // (p1 - q1) contribution only where hev; combine mask with blimit check
  q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
  q15u8 = vandq_u8(q15u8, q9);

  q1s8 = vreinterpretq_s8_u8(q1u8);
  q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
  q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));

  q4 = vdupq_n_u8(3);
  q9 = vdupq_n_u8(4);
  // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
  d2s8 = vqmovn_s16(q2s16);
  d3s8 = vqmovn_s16(q11s16);
  q1s8 = vcombine_s8(d2s8, d3s8);
  q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
  q1s8 = vreinterpretq_s8_u8(q1u8);

  // Filter1 = clamp(f+4)>>3, Filter2 = clamp(f+3)>>3
  q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4));
  q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
  q2s8 = vshrq_n_s8(q2s8, 3);
  q1s8 = vshrq_n_s8(q1s8, 3);

  q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
  q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);

  // outer taps adjusted by (Filter1+1)>>1, suppressed where hev
  q1s8 = vrshrq_n_s8(q1s8, 1);
  q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));

  q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
  q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);

  // convert back to unsigned (undo the 0x80 bias) and write out
  *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10);
  *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10);
  *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10);
  *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10);
  return;
}
Exemplo n.º 18
0
// Decodes an NV21/NV12-style YUV 4:2:0 frame (planar Y plus interleaved UV
// at half resolution) to packed pixels via `trait`, 2 rows x 8 columns per
// iteration, using fixed-point BT.601-style coefficients scaled by 256
// (298, 409, -100/-208, 516).  Returns false on invalid arguments.
//
// NOTE(review): several vector assignments mix signed/unsigned element
// types (e.g. int16x8_t half = vdupq_n_u16(128); int32x4_t Y00 =
// vmulq_n_u32(...)).  This presumably compiles only with
// -flax-vector-conversions or similar; consider vdupq_n_s16 /
// vreinterpretq casts -- confirm the build flags.
bool decode_yuv_neon(unsigned char* out, unsigned char const* y, unsigned char const* uv, int width, int height, unsigned char fill_alpha=0xff)
{
    // pre-condition : width, height must be even
    if (0!=(width&1) || width<2 || 0!=(height&1) || height<2 || !out || !y || !uv)
        return false;

    // in & out pointers
    unsigned char* dst = out;

    // constants
    int const stride = width*trait::bytes_per_pixel;
    int const itHeight = height>>1;   // two image rows per iteration
    int const itWidth = width>>3;     // eight pixels per iteration
    // -- so width must also be a multiple of 8; TODO confirm callers.

    uint8x8_t const Yshift = vdup_n_u8(16);      // luma black-level offset
    int16x8_t const half = vdupq_n_u16(128);     // chroma bias
    int32x4_t const rounding = vdupq_n_s32(128); // +0.5 in 8.8 fixed point

    // tmp variable
    uint16x8_t t;

    // pixel block to temporary store 8 pixels
    typename trait::PixelBlock pblock = trait::init_pixelblock(fill_alpha);    

    for (int j=0; j<itHeight; ++j, y+=width, dst+=stride) {
        for (int i=0; i<itWidth; ++i, y+=8, uv+=8, dst+=(8*trait::bytes_per_pixel)) {
            // 298 * max(Y - 16, 0) for the upper row of 8 pixels
            t = vmovl_u8(vqsub_u8(vld1_u8(y), Yshift));
            int32x4_t const Y00 = vmulq_n_u32(vmovl_u16(vget_low_u16(t)), 298);
            int32x4_t const Y01 = vmulq_n_u32(vmovl_u16(vget_high_u16(t)), 298);

            // same for the lower row
            t = vmovl_u8(vqsub_u8(vld1_u8(y+width), Yshift));
            int32x4_t const Y10 = vmulq_n_u32(vmovl_u16(vget_low_u16(t)), 298);
            int32x4_t const Y11 = vmulq_n_u32(vmovl_u16(vget_high_u16(t)), 298);

            // trait::loadvu pack 4 sets of uv into a uint8x8_t, layout : { v0,u0, v1,u1, v2,u2, v3,u3 }
            t = vsubq_s16((int16x8_t)vmovl_u8(trait::loadvu(uv)), half);

            // UV.val[0] : v0, v1, v2, v3
            // UV.val[1] : u0, u1, u2, u3
            int16x4x2_t const UV = vuzp_s16(vget_low_s16(t), vget_high_s16(t));

            // tR : 128+409V
            // tG : 128-100U-208V
            // tB : 128+516U
            int32x4_t const tR = vmlal_n_s16(rounding, UV.val[0], 409);
            int32x4_t const tG = vmlal_n_s16(vmlal_n_s16(rounding, UV.val[0], -208), UV.val[1], -100);
            int32x4_t const tB = vmlal_n_s16(rounding, UV.val[1], 516);

            // duplicate each chroma term so it pairs with both luma samples
            int32x4x2_t const R = vzipq_s32(tR, tR); // [tR0, tR0, tR1, tR1] [ tR2, tR2, tR3, tR3]
            int32x4x2_t const G = vzipq_s32(tG, tG); // [tG0, tG0, tG1, tG1] [ tG2, tG2, tG3, tG3]
            int32x4x2_t const B = vzipq_s32(tB, tB); // [tB0, tB0, tB1, tB1] [ tB2, tB2, tB3, tB3]

            // upper 8 pixels
            trait::store_pixel_block(dst, pblock,
                    vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(R.val[0], Y00)), vqmovun_s32(vaddq_s32(R.val[1], Y01))), 8),
                    vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(G.val[0], Y00)), vqmovun_s32(vaddq_s32(G.val[1], Y01))), 8),
                    vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(B.val[0], Y00)), vqmovun_s32(vaddq_s32(B.val[1], Y01))), 8));

            // lower 8 pixels
            trait::store_pixel_block(dst+stride, pblock,
                    vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(R.val[0], Y10)), vqmovun_s32(vaddq_s32(R.val[1], Y11))), 8),
                    vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(G.val[0], Y10)), vqmovun_s32(vaddq_s32(G.val[1], Y11))), 8),
                    vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(B.val[0], Y10)), vqmovun_s32(vaddq_s32(B.val[1], Y11))), 8));
        }
    }
    return true;
}
Exemplo n.º 19
0
// Core of the VP8 macroblock loop filter, processing 16 pixels in
// parallel (each uint8x16_t holds 16 samples of one of the eight pixel
// positions p3..q3 straddling the edge).
//
// Inputs:
//   qblimit - macroblock edge limit ("mblimit")
//   qlimit  - interior difference limit
//   qthresh - high-edge-variance (hev) threshold
//   q3..q10 - unfiltered pixels p3, p2, p1, p0, q0, q1, q2, q3
// Outputs (filtered pixels):
//   *q4r..*q9r - p2, p1, p0, q0, q1, q2
static INLINE void vp8_mbloop_filter_neon(
        uint8x16_t qblimit,  // mblimit
        uint8x16_t qlimit,   // limit
        uint8x16_t qthresh,  // thresh
        uint8x16_t q3,       // p3
        uint8x16_t q4,       // p2
        uint8x16_t q5,       // p1
        uint8x16_t q6,       // p0
        uint8x16_t q7,       // q0
        uint8x16_t q8,       // q1
        uint8x16_t q9,       // q2
        uint8x16_t q10,      // q3
        uint8x16_t *q4r,     // p2
        uint8x16_t *q5r,     // p1
        uint8x16_t *q6r,     // p0
        uint8x16_t *q7r,     // q0
        uint8x16_t *q8r,     // q1
        uint8x16_t *q9r) {   // q2
    uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8;
    int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16;
    int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8;
    uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16;
    int8x16_t q0s8, q12s8, q14s8, q15s8;
    int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29;

    // Absolute differences of neighboring pixels; the max over all of
    // them is later compared against qlimit to build the filter mask.
    q11u8 = vabdq_u8(q3, q4);   // |p3 - p2|
    q12u8 = vabdq_u8(q4, q5);   // |p2 - p1|
    q13u8 = vabdq_u8(q5, q6);   // |p1 - p0|
    q14u8 = vabdq_u8(q8, q7);   // |q1 - q0|
    q1u8  = vabdq_u8(q9, q8);   // |q2 - q1|
    q0u8  = vabdq_u8(q10, q9);  // |q3 - q2|

    q11u8 = vmaxq_u8(q11u8, q12u8);
    q12u8 = vmaxq_u8(q13u8, q14u8);
    q1u8  = vmaxq_u8(q1u8, q0u8);
    q15u8 = vmaxq_u8(q11u8, q12u8);

    q12u8 = vabdq_u8(q6, q7);   // |p0 - q0| (for the blimit test below)

    // vp8_hevmask: per-lane "high edge variance" flags where
    // |p1 - p0| > thresh or |q1 - q0| > thresh.
    q13u8 = vcgtq_u8(q13u8, qthresh);
    q14u8 = vcgtq_u8(q14u8, qthresh);
    q15u8 = vmaxq_u8(q15u8, q1u8);

    // Filter mask part 1: max interior difference <= limit.
    q15u8 = vcgeq_u8(qlimit, q15u8);

    q1u8 = vabdq_u8(q5, q8);        // |p1 - q1|
    q12u8 = vqaddq_u8(q12u8, q12u8); // 2 * |p0 - q0| (saturating)

    // vp8_filter() function
    // convert to signed (bias by 0x80 so u8 pixels become s8)
    q0u8 = vdupq_n_u8(0x80);
    q9 = veorq_u8(q9, q0u8);
    q8 = veorq_u8(q8, q0u8);
    q7 = veorq_u8(q7, q0u8);
    q6 = veorq_u8(q6, q0u8);
    q5 = veorq_u8(q5, q0u8);
    q4 = veorq_u8(q4, q0u8);

    // Filter mask part 2: 2*|p0-q0| + |p1-q1|/2 <= mblimit.
    q1u8 = vshrq_n_u8(q1u8, 1);
    q12u8 = vqaddq_u8(q12u8, q1u8);

    q14u8 = vorrq_u8(q13u8, q14u8);   // combined hev mask
    q12u8 = vcgeq_u8(qblimit, q12u8);

    // q0 - p0, widened to 16 bits (low/high halves).
    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
                     vget_low_s8(vreinterpretq_s8_u8(q6)));
    q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
                      vget_high_s8(vreinterpretq_s8_u8(q6)));

    // p1 - q1 (saturating).
    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
                     vreinterpretq_s8_u8(q8));

    q11s16 = vdupq_n_s16(3);
    q2s16  = vmulq_s16(q2s16, q11s16);   // 3 * (q0 - p0)
    q13s16 = vmulq_s16(q13s16, q11s16);

    q15u8 = vandq_u8(q15u8, q12u8);   // final filter mask

    // 3 * (q0 - p0) + (p1 - q1)
    q2s16  = vaddw_s8(q2s16, vget_low_s8(q1s8));
    q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8));

    q12u8 = vdupq_n_u8(3);
    q11u8 = vdupq_n_u8(4);
    // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
    d2 = vqmovn_s16(q2s16);     // narrow back with saturation
    d3 = vqmovn_s16(q13s16);
    q1s8 = vcombine_s8(d2, d3);
    q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8));   // apply filter mask
    q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8));  // hev lanes only

    // Filter1 = clamp(filter + 4) >> 3, Filter2 = clamp(filter + 3) >> 3.
    q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8));
    q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8));
    q2s8 = vshrq_n_s8(q2s8, 3);
    q13s8 = vshrq_n_s8(q13s8, 3);

    q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8);   // q0 -= Filter1
    q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8);  // p0 += Filter2

    // For non-hev lanes, apply the weaker 9/18/27 tap adjustment below.
    q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));

    // 63 is the rounding bias for the >>7 that follows
    // (i.e. round-to-nearest of filter * tap / 128).
    q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63);
    d5 = vdup_n_s8(9);
    d4 = vdup_n_s8(18);

    // (63 + filter * 9)  -> outer pixels p2/q2
    q0s16  = vmlal_s8(vreinterpretq_s16_u16(q0u16),  vget_low_s8(q1s8),  d5);
    q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5);
    d5 = vdup_n_s8(27);
    // (63 + filter * 18) -> middle pixels p1/q1
    q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8),  d4);
    q13s16 = vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4);
    // (63 + filter * 27) -> inner pixels p0/q0
    q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8),  d5);
    q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), vget_high_s8(q1s8), d5);

    // Saturating narrow with >>7 completes the rounded division by 128.
    d0  = vqshrn_n_s16(q0s16 , 7);
    d1  = vqshrn_n_s16(q11s16, 7);
    d24 = vqshrn_n_s16(q12s16, 7);
    d25 = vqshrn_n_s16(q13s16, 7);
    d28 = vqshrn_n_s16(q14s16, 7);
    d29 = vqshrn_n_s16(q15s16, 7);

    q0s8  = vcombine_s8(d0, d1);    // u (9/128 tap)
    q12s8 = vcombine_s8(d24, d25);  // u (18/128 tap)
    q14s8 = vcombine_s8(d28, d29);  // u (27/128 tap)

    // Apply the adjustments symmetrically around the edge.
    q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8);   // q2 - u
    q0s8  = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8);   // p2 + u
    q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8);  // q1 - u
    q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8);  // p1 + u
    q15s8 = vqsubq_s8((q7s8), q14s8);                   // q0 - u
    q14s8 = vqaddq_s8((q6s8), q14s8);                   // p0 + u

    // Convert back to unsigned (undo the 0x80 bias) and store outputs.
    q1u8 = vdupq_n_u8(0x80);
    *q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8);  // q2
    *q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8);  // q1
    *q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8);  // q0
    *q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8);  // p0
    *q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8);  // p1
    *q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8);   // p2
    return;
}