static uint8x8_t
paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c)
{
   uint8x8_t d, e;
   uint16x8_t p1, pa, pb, pc;

   p1 = vaddl_u8(a, b); /* a + b */
   pc = vaddl_u8(c, c); /* c * 2 */
   pa = vabdl_u8(b, c); /* pa */
   pb = vabdl_u8(a, c); /* pb */
   pc = vabdq_u16(p1, pc); /* pc */

   p1 = vcleq_u16(pa, pb); /* pa <= pb */
   pa = vcleq_u16(pa, pc); /* pa <= pc */
   pb = vcleq_u16(pb, pc); /* pb <= pc */

   p1 = vandq_u16(p1, pa); /* pa <= pb && pa <= pc */

   d = vmovn_u16(pb);
   e = vmovn_u16(p1);

   d = vbsl_u8(d, b, c);
   e = vbsl_u8(e, a, d);

   return e;
}
Example #2
0
File: vbslu8.c Project: pjump/gcc
void test_vbslu8 (void)
{
    uint8x8_t out_uint8x8_t;
    uint8x8_t arg0_uint8x8_t;
    uint8x8_t arg1_uint8x8_t;
    uint8x8_t arg2_uint8x8_t;

    out_uint8x8_t = vbsl_u8 (arg0_uint8x8_t, arg1_uint8x8_t, arg2_uint8x8_t);
}
Example #3
0
void ne10_img_vresize_linear_neon (const int** src, unsigned char* dst, const short* beta, int width)
{
    const int *S0 = src[0], *S1 = src[1];

    int32x4_t qS0_0123, qS0_4567, qS1_0123, qS1_4567;
    int32x4_t qT_0123, qT_4567;
    int16x4_t dT_0123, dT_4567;
    uint16x8_t qT_01234567;
    uint8x8_t dT_01234567, dDst_01234567;

    int32x2_t dBeta;
    dBeta = vset_lane_s32 ( (int) (beta[0]), dBeta, 0);
    dBeta = vset_lane_s32 ( (int) (beta[1]), dBeta, 1);

    int32x4_t qDelta, qMin, qMax;
    qDelta = vdupq_n_s32 (DELTA);
    qMin = vdupq_n_s32 (0);
    qMax = vdupq_n_s32 (255);

    int x = 0;
    for (; x <= width - 8; x += 8)
    {
        qS0_0123 = vld1q_s32 (&S0[x]);
        qS0_4567 = vld1q_s32 (&S0[x + 4]);
        qS1_0123 = vld1q_s32 (&S1[x]);
        qS1_4567 = vld1q_s32 (&S1[x + 4]);

        qT_0123 = vmulq_lane_s32 (qS0_0123, dBeta, 0);
        qT_4567 = vmulq_lane_s32 (qS0_4567, dBeta, 0);
        qT_0123 = vmlaq_lane_s32 (qT_0123, qS1_0123, dBeta, 1);
        qT_4567 = vmlaq_lane_s32 (qT_4567, qS1_4567, dBeta, 1);

        qT_0123 = vaddq_s32 (qT_0123, qDelta);
        qT_4567 = vaddq_s32 (qT_4567, qDelta);

        qT_0123 = vshrq_n_s32 (qT_0123, BITS);
        qT_4567 = vshrq_n_s32 (qT_4567, BITS);

        qT_0123 = vmaxq_s32 (qT_0123, qMin);
        qT_4567 = vmaxq_s32 (qT_4567, qMin);
        qT_0123 = vminq_s32 (qT_0123, qMax);
        qT_4567 = vminq_s32 (qT_4567, qMax);

        dT_0123 = vmovn_s32 (qT_0123);
        dT_4567 = vmovn_s32 (qT_4567);
        qT_01234567 = vreinterpretq_u16_s16 (vcombine_s16 (dT_0123, dT_4567));
        dT_01234567 = vmovn_u16 (qT_01234567);

        vst1_u8 (&dst[x], dT_01234567);
    }

    if (x < width)
    {
        uint8x8_t dMask;
        dMask = vld1_u8 ( (uint8_t *) (&ne10_img_vresize_linear_mask_residual_table[ (width - x - 1)]));
        dDst_01234567 = vld1_u8 (&dst[x]);

        qS0_0123 = vld1q_s32 (&S0[x]);
        qS0_4567 = vld1q_s32 (&S0[x + 4]);
        qS1_0123 = vld1q_s32 (&S1[x]);
        qS1_4567 = vld1q_s32 (&S1[x + 4]);

        qT_0123 = vmulq_lane_s32 (qS0_0123, dBeta, 0);
        qT_4567 = vmulq_lane_s32 (qS0_4567, dBeta, 0);
        qT_0123 = vmlaq_lane_s32 (qT_0123, qS1_0123, dBeta, 1);
        qT_4567 = vmlaq_lane_s32 (qT_4567, qS1_4567, dBeta, 1);

        qT_0123 = vaddq_s32 (qT_0123, qDelta);
        qT_4567 = vaddq_s32 (qT_4567, qDelta);

        qT_0123 = vshrq_n_s32 (qT_0123, BITS);
        qT_4567 = vshrq_n_s32 (qT_4567, BITS);

        qT_0123 = vmaxq_s32 (qT_0123, qMin);
        qT_4567 = vmaxq_s32 (qT_4567, qMin);
        qT_0123 = vminq_s32 (qT_0123, qMax);
        qT_4567 = vminq_s32 (qT_4567, qMax);

        dT_0123 = vmovn_s32 (qT_0123);
        dT_4567 = vmovn_s32 (qT_4567);
        qT_01234567 = vreinterpretq_u16_s16 (vcombine_s16 (dT_0123, dT_4567));
        dT_01234567 = vmovn_u16 (qT_01234567);

        dMask = vbsl_u8 (dMask, dT_01234567, dDst_01234567);
        vst1_u8 (&dst[x], dMask);
    }
}
void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
                                        SkColor color, int width,
                                        SkPMColor opaqueDst) {
    int colR = SkColorGetR(color);
    int colG = SkColorGetG(color);
    int colB = SkColorGetB(color);

    uint8x8_t vcolR, vcolG, vcolB;
    uint8x8_t vopqDstA, vopqDstR, vopqDstG, vopqDstB;

    if (width >= 8) {
        vcolR = vdup_n_u8(colR);
        vcolG = vdup_n_u8(colG);
        vcolB = vdup_n_u8(colB);
        vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst));
        vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst));
        vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst));
        vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst));
    }

    while (width >= 8) {
        uint8x8x4_t vdst;
        uint16x8_t vmask;
        uint16x8_t vmaskR, vmaskG, vmaskB;
        uint8x8_t vsel_trans, vsel_opq;

        vdst = vld4_u8((uint8_t*)dst);
        vmask = vld1q_u16(src);

        // Prepare compare masks
        vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0)));
        vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF)));

        // Get all the color masks on 5 bits
        vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
        vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
                             SK_B16_BITS + SK_R16_BITS + 1);
        vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);

        // Upscale to 0..32
        vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
        vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
        vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);

        vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF));
        vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]);

        vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
        vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
        vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);

        vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]);
        vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]);
        vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]);

        vst4_u8((uint8_t*)dst, vdst);

        dst += 8;
        src += 8;
        width -= 8;
    }

    // Leftovers
    for (int i = 0; i < width; i++) {
        dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i],
                                    opaqueDst);
    }
}
static INLINE void mbloop_filter_neon(uint8x8_t dblimit,   // mblimit
                                      uint8x8_t dlimit,    // limit
                                      uint8x8_t dthresh,   // thresh
                                      uint8x8_t d3u8,      // p2
                                      uint8x8_t d4u8,      // p2
                                      uint8x8_t d5u8,      // p1
                                      uint8x8_t d6u8,      // p0
                                      uint8x8_t d7u8,      // q0
                                      uint8x8_t d16u8,     // q1
                                      uint8x8_t d17u8,     // q2
                                      uint8x8_t d18u8,     // q3
                                      uint8x8_t *d0ru8,    // p1
                                      uint8x8_t *d1ru8,    // p1
                                      uint8x8_t *d2ru8,    // p0
                                      uint8x8_t *d3ru8,    // q0
                                      uint8x8_t *d4ru8,    // q1
                                      uint8x8_t *d5ru8) {  // q1
  uint32_t flat;
  uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
  uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
  int16x8_t q15s16;
  uint16x8_t q10u16, q14u16;
  int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;

  d19u8 = vabd_u8(d3u8, d4u8);
  d20u8 = vabd_u8(d4u8, d5u8);
  d21u8 = vabd_u8(d5u8, d6u8);
  d22u8 = vabd_u8(d16u8, d7u8);
  d23u8 = vabd_u8(d17u8, d16u8);
  d24u8 = vabd_u8(d18u8, d17u8);

  d19u8 = vmax_u8(d19u8, d20u8);
  d20u8 = vmax_u8(d21u8, d22u8);

  d25u8 = vabd_u8(d6u8, d4u8);

  d23u8 = vmax_u8(d23u8, d24u8);

  d26u8 = vabd_u8(d7u8, d17u8);

  d19u8 = vmax_u8(d19u8, d20u8);

  d24u8 = vabd_u8(d6u8, d7u8);
  d27u8 = vabd_u8(d3u8, d6u8);
  d28u8 = vabd_u8(d18u8, d7u8);

  d19u8 = vmax_u8(d19u8, d23u8);

  d23u8 = vabd_u8(d5u8, d16u8);
  d24u8 = vqadd_u8(d24u8, d24u8);

  d19u8 = vcge_u8(dlimit, d19u8);

  d25u8 = vmax_u8(d25u8, d26u8);
  d26u8 = vmax_u8(d27u8, d28u8);

  d23u8 = vshr_n_u8(d23u8, 1);

  d25u8 = vmax_u8(d25u8, d26u8);

  d24u8 = vqadd_u8(d24u8, d23u8);

  d20u8 = vmax_u8(d20u8, d25u8);

  d23u8 = vdup_n_u8(1);
  d24u8 = vcge_u8(dblimit, d24u8);

  d21u8 = vcgt_u8(d21u8, dthresh);

  d20u8 = vcge_u8(d23u8, d20u8);

  d19u8 = vand_u8(d19u8, d24u8);

  d23u8 = vcgt_u8(d22u8, dthresh);

  d20u8 = vand_u8(d20u8, d19u8);

  d22u8 = vdup_n_u8(0x80);

  d23u8 = vorr_u8(d21u8, d23u8);

  q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8));

  d30u8 = vshrn_n_u16(q10u16, 4);
  flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);

  if (flat == 0xffffffff) {  // Check for all 1's, power_branch_only
    d27u8 = vdup_n_u8(3);
    d21u8 = vdup_n_u8(2);
    q14u16 = vaddl_u8(d6u8, d7u8);
    q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
    q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
    q14u16 = vaddw_u8(q14u16, d5u8);
    *d0ru8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d4u8);
    q14u16 = vaddw_u8(q14u16, d5u8);
    q14u16 = vaddw_u8(q14u16, d16u8);
    *d1ru8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d5u8);
    q14u16 = vaddw_u8(q14u16, d6u8);
    q14u16 = vaddw_u8(q14u16, d17u8);
    *d2ru8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d6u8);
    q14u16 = vaddw_u8(q14u16, d7u8);
    q14u16 = vaddw_u8(q14u16, d18u8);
    *d3ru8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d4u8);
    q14u16 = vsubw_u8(q14u16, d7u8);
    q14u16 = vaddw_u8(q14u16, d16u8);
    q14u16 = vaddw_u8(q14u16, d18u8);
    *d4ru8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d5u8);
    q14u16 = vsubw_u8(q14u16, d16u8);
    q14u16 = vaddw_u8(q14u16, d17u8);
    q14u16 = vaddw_u8(q14u16, d18u8);
    *d5ru8 = vqrshrn_n_u16(q14u16, 3);
  } else {
    d21u8 = veor_u8(d7u8, d22u8);
    d24u8 = veor_u8(d6u8, d22u8);
    d25u8 = veor_u8(d5u8, d22u8);
    d26u8 = veor_u8(d16u8, d22u8);

    d27u8 = vdup_n_u8(3);

    d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
    d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));

    q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));

    d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));

    q15s16 = vaddw_s8(q15s16, d29s8);

    d29u8 = vdup_n_u8(4);

    d28s8 = vqmovn_s16(q15s16);

    d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));

    d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
    d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
    d30s8 = vshr_n_s8(d30s8, 3);
    d29s8 = vshr_n_s8(d29s8, 3);

    d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
    d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);

    d29s8 = vrshr_n_s8(d29s8, 1);
    d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));

    d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
    d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);

    if (flat == 0) {  // filter_branch_only
      *d0ru8 = d4u8;
      *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
      *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
      *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
      *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
      *d5ru8 = d17u8;
      return;
    }

    d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
    d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
    d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
    d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);

    d23u8 = vdup_n_u8(2);
    q14u16 = vaddl_u8(d6u8, d7u8);
    q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
    q14u16 = vmlal_u8(q14u16, d4u8, d23u8);

    d0u8 = vbsl_u8(d20u8, dblimit, d4u8);

    q14u16 = vaddw_u8(q14u16, d5u8);

    d1u8 = vbsl_u8(d20u8, dlimit, d25u8);

    d30u8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d4u8);
    q14u16 = vaddw_u8(q14u16, d5u8);
    q14u16 = vaddw_u8(q14u16, d16u8);

    d2u8 = vbsl_u8(d20u8, dthresh, d24u8);

    d31u8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d5u8);
    q14u16 = vaddw_u8(q14u16, d6u8);
    q14u16 = vaddw_u8(q14u16, d17u8);

    *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);

    d23u8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d6u8);
    q14u16 = vaddw_u8(q14u16, d7u8);

    *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);

    q14u16 = vaddw_u8(q14u16, d18u8);

    *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);

    d22u8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d4u8);
    q14u16 = vsubw_u8(q14u16, d7u8);
    q14u16 = vaddw_u8(q14u16, d16u8);

    d3u8 = vbsl_u8(d20u8, d3u8, d21u8);

    q14u16 = vaddw_u8(q14u16, d18u8);

    d4u8 = vbsl_u8(d20u8, d4u8, d26u8);

    d6u8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d5u8);
    q14u16 = vsubw_u8(q14u16, d16u8);
    q14u16 = vaddw_u8(q14u16, d17u8);
    q14u16 = vaddw_u8(q14u16, d18u8);

    d5u8 = vbsl_u8(d20u8, d5u8, d17u8);

    d7u8 = vqrshrn_n_u16(q14u16, 3);

    *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
    *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
    *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
  }
  return;
}
void  yuv422rgb_neon_int(const unsigned char * sourcep, int source_byte_count,
			 unsigned char * destp)
{
  const unsigned char *source_endp;
  const unsigned char *vector_endp;
  int remainder;
  const int16x8_t u_coeff = {0, -22, 113, 0, 0, -22, 113, 0};
  const int16x8_t v_coeff = {90, -46, 0,  0, 90, -46, 0,  0};
  const uint8x8_t zeroalpha = {0x0, 0x0, 0x0, 0xFF, 0x0, 0x0, 0x0, 0xFF};
  const int16x8_t uvbias = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; 
  int16x8_t mp0_rgba;  /* macropixel 0's resulting RGBA RGBA pixels  */
  int16x8_t mp1_rgba; /* macropixel 1's resulting RGBA RGBA pixels  */
  uint8x8_t rawpixels; /* source pixels as {[YUYV]0 [YUYV]1}   */
  uint8x8_t rgba0, rgba1; /* rgba values as bytes  */
  uint8x16_t bothrgba;
  uint8_t * destinationp; /* pointer into output buffer destp  */
  int16x8_t widerpixels; /*  rawpixels promoted to shorts per component */
  const uint8x8_t yselect = {0xff, 0xff, 0xff, 0xff,
			     0x00, 0x00, 0x00, 0x00};
  
  
  /* we're working with things in 4-byte macropixels  */
  remainder = source_byte_count % 4;

  source_endp = sourcep + source_byte_count;
  vector_endp = source_endp - remainder;
  destinationp = (uint8_t *)destp;

  while (sourcep < vector_endp)
    {
     /* pull YUYV from 2 four byte macropixels starting at sourcep. */
      /* we'll increment sourcep as we go to save the array dereference */
      /* and separate increment instruction at the end of the loop  */

      /* load rawpixels with {[YUYV]0 [YUYV]1 } with byte components */
      rawpixels = vld1_u8(sourcep);
      sourcep += sizeof(rawpixels);

      widerpixels = vreinterpretq_s16_u16(vmovl_u8(rawpixels));
 


      
      /* ---------- process macropixel 0 --------------- */
      /* take macropixel zero ([YUYV]0) from rawpixels and   */
      /* compute the two RGBA pixels that come from it. store  */
      /* those two pixels in mp0_rgba  */
      {
	int16x8_t wider_yalpha;
	int16x8_t u_vec, v_vec, uv_vec;
	uint8x8_t narrow_yalpha;
	uint8x8_t y0_vec, y1_vec;
	int16x4_t yuyv;

	/* narrow_yalpha is drawn from [YUYV]0 and formed into */
	/* {Y0, Y0, Y0, alpha, Y1, Y1, Y1, alpha}   */
	/* this would have been a nice place for vtbx1_u8, but i  */
	/* can't get it to work. so i'll have to use vbsl_u8 instead.  */

	y0_vec = vdup_lane_u8(rawpixels, MP0_Y0);
	y1_vec = vdup_lane_u8(rawpixels, MP0_Y1);
	narrow_yalpha = vbsl_u8(yselect, y0_vec, y1_vec);

	/* store ALPHA in elements 3 and 7 (after the RGB components)  */
	narrow_yalpha =  vset_lane_u8(ALPHA, narrow_yalpha, 3);
	narrow_yalpha =  vset_lane_u8(ALPHA, narrow_yalpha, 7);

	/* use vmovl_u8 to go from being unsigned 8-bit to  */
	/* unsigned 16-bit, the use vreinterpretq_s16_u16 to  */
	/* change interpretation from unsigned 16-bit to signed  */
	/* 16-bit.   */
	wider_yalpha = vreinterpretq_s16_u16(vmovl_u8(narrow_yalpha));

	yuyv = vget_low_s16(widerpixels);
	
	/* form a vector of the U component from MP0  */
	u_vec = vdupq_lane_s16(yuyv, MP0_U);
	
	/* subtract uvbias from u_vec */
	u_vec = vsubq_s16(u_vec, uvbias);

	/* form a vector of the V component from MP0  */
	v_vec = vdupq_lane_s16(yuyv, MP0_V);
	
	/* subtract uvbias from v_vec */
	v_vec = vsubq_s16(v_vec, uvbias);

		
	/* Multiply eight 16-bit values in u_vec by eight 16-bit */
	/* values in u_coeff and store the results in u_vec.  */


	u_vec = vmulq_s16(u_vec, u_coeff);

	/* likewise multiply eight 16-bit values in v_vec by   */
	/* v_coeff and store the results in  v_vec */
	
	v_vec = vmulq_s16(v_vec, v_coeff);

	/* form uv_vec as the sum of u_vec & v_vec, then shift 6 places   */
	/* (dividing by 64)  */
	uv_vec = vaddq_s16(u_vec, v_vec);
	  
	uv_vec = vshrq_n_s16(uv_vec, 6);

	/* now mp0_rgba = y_vec + u_vec + v_vec  */
	mp0_rgba = vaddq_s16(wider_yalpha, uv_vec);

      }

      /* ---------- process macropixel 1 --------------- */
      /* take macropixel one ([YUYV]1) from rawpixels and   */
      /* compute the two RGBA pixels that come from it. store  */
      /* those two pixels in mp1_rgba  */      
      {
	int16x8_t wider_yalpha;
	int16x8_t u_vec, v_vec, uv_vec;
	uint8x8_t narrow_yalpha;
	uint8x8_t y0_vec, y1_vec;
	int16x4_t yuyv;

	/* narrow_yalpha is drawn from [YUYV]1 and formed into */
	/* {Y0, Y0, Y0, alpha, Y1, Y1, Y1, alpha}   */
	/* this would have been a nice place for vtbx1_u8, but i  */
	/* can't get it to work. so i'll have to use vbsl_u8 instead.  */

	y0_vec = vdup_lane_u8(rawpixels, MP1_Y0);
	y1_vec = vdup_lane_u8(rawpixels, MP1_Y1);
	narrow_yalpha = vbsl_u8(yselect, y0_vec, y1_vec);
	  
	narrow_yalpha =  vset_lane_u8(ALPHA, narrow_yalpha, 3);
	narrow_yalpha =  vset_lane_u8(ALPHA, narrow_yalpha, 7);

	/* use vmovl_u8 to go from being unsigned 8-bit to  */
	/* unsigned 16-bit, the use vreinterpretq_s16_u16 to  */


	wider_yalpha = vreinterpretq_s16_u16(vmovl_u8(narrow_yalpha));

	yuyv = vget_high_s16(widerpixels);
	u_vec = vdupq_lane_s16(yuyv, 1);
	u_vec = vsubq_s16(u_vec, uvbias);
	
	v_vec = vdupq_lane_s16(yuyv, 3);
	v_vec = vsubq_s16(v_vec, uvbias);

		
	/* Multiply eight 16-bit values in u_vec by eight 16-bit */
	/* values in u_coeff and store the results in u_vec.  */


	u_vec = vmulq_s16(u_vec, u_coeff);

	/* likewise multiply eight 16-bit values in v_vec by   */
	/* v_coeff and store the results in  v_vec */
	
	v_vec = vmulq_s16(v_vec, v_coeff);
     
	/* form uv_vec as the sum of u_vec & v_vec, then shift 6 places   */
	/* (dividing by 64)  */
	uv_vec  = vaddq_s16(u_vec, v_vec);
	uv_vec = vshrq_n_s16(uv_vec, 6);


	/* now mp1_rgba = y_vec + u_vec + v_vec  */
	mp1_rgba = vaddq_s16(wider_yalpha, uv_vec);
      }
      

      /* turn mp0_rgba from a vector of shorts to a vector of  */
      /* unsigned unsigned chars. this will saturate: clipping  */
      /* the values between 0 and 255.   */
      
      rgba0 = vqmovun_s16(mp0_rgba);
      rgba1 = vqmovun_s16(mp1_rgba);

      /* make it faster to copy these back out of vector registers into  */
      /* memory by combining rgba0 and rgba1 into the larger bothrgba.   */
      /* then store that back into memory at destinationp.               */

      bothrgba = vcombine_u8(rgba0, rgba1);
      
      vst1q_u8(destinationp, bothrgba);
      destinationp += 16;
      
      
    }
}