Ejemplo n.º 1
0
/* Convert an array of floats to saturated int16_t using AltiVec.
 * Processes 8 samples per iteration; any tail of len % 8 samples is NOT
 * handled here -- presumably the caller guarantees len % 8 == 0 (TODO confirm).
 * src is loaded with vec_ld and is therefore assumed 16-byte aligned --
 * TODO confirm against callers. */
void float_to_int16_altivec(int16_t *dst, const float *src, int len)
{
    int i;
    vector float s0, s1;
    vector signed int t0, t1;
    vector signed short d0, d1, d;
    vector unsigned char align;
    /* Unaligned-destination path: merge the converted samples into the two
     * 16-byte lines straddled by dst+i so surrounding bytes are preserved
     * (classic lvsl/lvsr read-modify-write store). Note the if/else each
     * govern a whole for loop without braces. */
    if(((long)dst)&15) //FIXME
    for(i=0; i<len-7; i+=8) {
        s0 = vec_ld(0, src+i);
        s1 = vec_ld(16, src+i);
        t0 = vec_cts(s0, 0);            /* float -> s32, no fractional scaling */
        d0 = vec_ld(0, dst+i);          /* first 16-byte line covering dst+i   */
        t1 = vec_cts(s1, 0);
        d1 = vec_ld(15, dst+i);         /* last byte of the 16 dst bytes -> second line */
        d = vec_packs(t0,t1);           /* saturating pack s32 -> s16 */
        /* Recombine the old dst bytes so they can be re-split below. */
        d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
        align = vec_lvsr(0, dst+i);
        d0 = vec_perm(d1, d, align);    /* new data shifted into first line  */
        d1 = vec_perm(d, d1, align);    /* and into the second line          */
        vec_st(d0, 0, dst+i);
        vec_st(d1,15, dst+i);
    }
    else
    /* Aligned-destination fast path: plain load/convert/pack/store. */
    for(i=0; i<len-7; i+=8) {
        s0 = vec_ld(0, src+i);
        s1 = vec_ld(16, src+i);
        t0 = vec_cts(s0, 0);
        t1 = vec_cts(s1, 0);
        d = vec_packs(t0,t1);
        vec_st(d, 0, dst+i);
    }
}
Ejemplo n.º 2
0
/* HEVC 4x4 inverse transform, AltiVec version.
 * Two 1-D transform passes with a transpose between them:
 *   pass 1 rounds/shifts by 7, pass 2 by (20 - BIT_DEPTH).
 * coeffs holds 16 int16_t and is read and written in place; it must be
 * 16-byte aligned (vec_ld/vec_st). col_limit is accepted for prototype
 * compatibility but unused here. */
static void FUNC(ff_hevc_idct_4x4, BIT_DEPTH)(int16_t *coeffs, int col_limit)
{
    const int shift = 7;
    const int shift2 = 20 - BIT_DEPTH;
    vec_s16 src_01, src_23;
    vec_s32 res[4];
    vec_s16 res_packed[2];

    src_01 = vec_ld(0, coeffs);
    src_23 = vec_ld(16, coeffs);

    /* First pass. NOTE(review): the original also packed res[0..3] into
     * src_01/src_23 right here, but those values were overwritten by the
     * transpose below before any use -- the dead stores were removed. */
    transform4x4(src_01, src_23, res, shift, coeffs);
    scale(res, res_packed, shift);
    // transpose
    src_01 = vec_perm(res_packed[0], res_packed[1], mask[0]);
    src_23 = vec_perm(res_packed[0], res_packed[1], mask[1]);

    /* Second pass on the transposed data. */
    transform4x4(src_01, src_23, res, shift2, coeffs);
    scale(res, res_packed, shift2);
    // transpose
    src_01 = vec_perm(res_packed[0], res_packed[1], mask[0]);
    src_23 = vec_perm(res_packed[0], res_packed[1], mask[1]);

    vec_st(src_01, 0, coeffs);
    vec_st(src_23, 16, coeffs);
}
Ejemplo n.º 3
0
/* Compute the 2x2 DC transform of the four 4x4 sub-blocks of an 8x8
 * residual (pix1 - pix2), AltiVec/VSX version.
 * dct receives the four transformed DC values.
 * pix1 walks FENC_STRIDE, pix2 walks FDEC_STRIDE (encode vs. decode planes). */
void x264_sub8x8_dct_dc_altivec( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
{
    vec_s16_t diff[2];
    vec_s32_t sum[2];
    vec_s32_t zero32 = vec_splat_s32(0);
    /* Selects the low 8 bytes of the first operand and the high 8 bytes of
     * the second, used to splice the 4 DC sums into the loaded dct vector. */
    vec_u8_t mask = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                      0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F };

    /* diff[0] = top 4 rows of pix1-pix2, diff[1] = bottom 4 rows
     * (presumably pix_diff packs 8 bytes per row into s16 lanes -- see its
     * definition elsewhere in this file). */
    pix_diff( &pix1[0], &pix2[0], diff, 0 );
    pix_diff( &pix1[4*FENC_STRIDE], &pix2[4*FDEC_STRIDE], diff, 1 );

    /* Two rounds of vec_sum4s + pack reduce each 4x4 block to one DC value. */
    sum[0] = vec_sum4s( diff[0], zero32 );
    sum[1] = vec_sum4s( diff[1], zero32 );
    diff[0] = vec_packs( sum[0], sum[1] );
    sum[0] = vec_sum4s( diff[0], zero32 );
    diff[0] = vec_packs( sum[0], zero32 );

    /* Write only dct[0..3]; the rest of the 16-byte line is preserved. */
    diff[1] = vec_vsx_ld( 0, dct );
    diff[0] = vec_perm( diff[0], diff[1], mask );

    vec_vsx_st( diff[0], 0, dct );

    /* 2x2 DC transform */
    int d0 = dct[0] + dct[1];
    int d1 = dct[2] + dct[3];
    int d2 = dct[0] - dct[1];
    int d3 = dct[2] - dct[3];
    dct[0] = d0 + d1;
    dct[1] = d0 - d1;
    dct[2] = d2 + d3;
    dct[3] = d2 - d3;
}
Ejemplo n.º 4
0
/* Convert 8 floats (two aligned vectors at src) to one vector of
 * saturated int16_t. No fractional scaling is applied (vec_cts scale 0). */
static vector signed short float_to_int16_one_altivec(const float *src)
{
    const vector float lo = vec_ld(0, src);
    const vector float hi = vec_ld(16, src);
    return vec_packs(vec_cts(lo, 0), vec_cts(hi, 0));
}
/* Subtract the file-scope 'magic' bias from both 32-bit halves, then
 * saturating-pack them into a single vector of int16_t. */
static inline vector signed short convert16_altivec(vector signed int v1, vector signed int v2)
{
  const vector signed int lo = vec_subs(v1, magic);
  const vector signed int hi = vec_subs(v2, magic);

  return vec_packs(lo, hi);
}
Ejemplo n.º 6
0
/* Convert float samples to signed 16-bit, AltiVec fast path.
 * The vector path requires both pointers to be 16-byte aligned and handles
 * 8 samples per iteration; any remainder (or a misaligned input) falls back
 * to the scalar C converter. vec_cts with scale 15 multiplies by 2^15,
 * mapping [-1.0, 1.0) floats onto the s16 range. */
void audio_convert_float_to_s16_altivec(int16_t *out,
      const float *in, size_t samples)
{
   // Unaligned loads/store is a bit expensive, so we optimize for the good path (very likely).
   if (((uintptr_t)out & 15) + ((uintptr_t)in & 15) == 0)
   {
      size_t done = 0;
      while (done + 8 <= samples)
      {
         const vector float f0 = vec_ld( 0, in);
         const vector float f1 = vec_ld(16, in);
         const vector signed int s0 = vec_cts(f0, 15);
         const vector signed int s1 = vec_cts(f1, 15);

         vec_st(vec_packs(s0, s1), 0, out);

         in   += 8;
         out  += 8;
         done += 8;
      }

      /* Scalar tail for the remaining samples (pointers already advanced). */
      audio_convert_float_to_s16_C(out, in, samples - done);
   }
   else
      audio_convert_float_to_s16_C(out, in, samples);
}
Ejemplo n.º 7
0
/* H.264 16x16 quarter-pel horizontal+vertical (hv) 6-tap lowpass filter,
 * AltiVec version. Pass 1 filters horizontally into the int16_t scratch
 * buffer tmp (21 rows: 16 output rows + 5 extra for the vertical taps);
 * pass 2 filters tmp vertically with rounding (+512, >>10) and writes the
 * result through OP_U8_ALTIVEC (put or avg, chosen by the PREFIX macros).
 * dst must be 16-byte aligned (asserted below); tmp must be aligned and
 * tmpStride must be 16 (noted inline). */
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    register int i;
    LOAD_ZERO;
    /* Permute vectors for extracting the six horizontal taps src-2..src+3
     * from two (or three) aligned 16-byte loads. */
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    /* Filter constants: 20, 10, 5, 1; 512 is the rounding bias for >>10,
     * 16 is the shift count used to emulate a *1 "mule" via vec_sra. */
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    /* Byte offset of src-2 within its 16-byte line; drives the switch below. */
    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    /* Re-interleaves the separately packed even/odd result lanes. */
    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    /* Keeps the start of tmp; tmp itself is advanced by the first loop. */
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
              tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
              tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
              pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
              pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
              ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;

    /* Start two rows above the block for the vertical filter's M2/M1 taps. */
    src -= (2 * srcStride);
    /* Pass 1: horizontal 6-tap filter, one 16-pixel row per iteration. */
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        /* For align >= 11 some taps fall entirely in srcR2 (or need a third
         * load); special-cased to avoid reading past the needed bytes. */
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        /* Zero-extend bytes to s16; A = low 8 pixels, B = high 8 pixels. */
        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        /* 6-tap kernel: 20*(p0+p1) - 5*(m1+p2) + (m2+p3). */
        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    /* Pass 2 setup: preload the first five rows of the scratch buffer. */
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    /* Pass 2: vertical 6-tap filter over tmp, 16 output rows. */
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        /* Slide the 6-row window down by one. */
        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        /* 32-bit arithmetic from here on: even/odd lanes kept separate. */
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        /* *1 for the even lanes done as an arithmetic shift right by 16,
         * which sign-extends each even s16 lane into a full s32. */
        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        /* Rounding bias before the final >>10. */
        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        /* Pack to s16, then to saturated u8, then restore lane order. */
        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        /* This variant requires an aligned destination (no RMW store). */
        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
Ejemplo n.º 8
0
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
/* H.264 16x16 quarter-pel hv 6-tap lowpass filter, older AltiVec variant
 * that also supports an UNALIGNED destination: the final store merges the
 * result into the existing dst bytes via lvsr/vec_sel.
 * Pass 1 filters horizontally into tmp (21 rows = 16 + 5 extra vertical
 * taps); pass 2 filters tmp vertically with rounding (+512, >>10). */
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
  register int i;
  const vector signed int vzero = vec_splat_s32(0);
  /* Permutes for extracting the six horizontal taps src-2..src+3. */
  const vector unsigned char permM2 = vec_lvsl(-2, src);
  const vector unsigned char permM1 = vec_lvsl(-1, src);
  const vector unsigned char permP0 = vec_lvsl(+0, src);
  const vector unsigned char permP1 = vec_lvsl(+1, src);
  const vector unsigned char permP2 = vec_lvsl(+2, src);
  const vector unsigned char permP3 = vec_lvsl(+3, src);
  /* Filter constants 20/10/5/1, rounding bias 512, shift count 16. */
  const vector signed short v20ss = (const vector signed short)AVV(20);
  const vector unsigned int v10ui = vec_splat_u32(10);
  const vector signed short v5ss = vec_splat_s16(5);
  const vector signed short v1ss = vec_splat_s16(1);
  const vector signed int v512si = (const vector signed int)AVV(512);
  const vector unsigned int v16ui = (const vector unsigned int)AVV(16);

  /* Byte offset of src-2 within its 16-byte line; drives the switch below. */
  register int align = ((((unsigned long)src) - 2) % 16);

  /* Start two rows above the block for the vertical M2/M1 taps. */
  src -= (2 * srcStride);

  /* Pass 1: horizontal 6-tap filter, one 16-pixel row per iteration. */
  for (i = 0 ; i < 21 ; i ++) {
    vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
    vector unsigned char srcR1 = vec_ld(-2, src);
    vector unsigned char srcR2 = vec_ld(14, src);

    /* For align >= 11 some taps lie entirely in srcR2 (or need a third
     * load); special-cased so no byte beyond the needed span is permuted. */
    switch (align) {
    default: {
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = vec_perm(srcR1, srcR2, permP2);
      srcP3 = vec_perm(srcR1, srcR2, permP3);
    } break;
    case 11: {
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = vec_perm(srcR1, srcR2, permP2);
      srcP3 = srcR2;
    } break;
    case 12: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = srcR2;
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 13: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = srcR2;
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 14: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = srcR2;
      srcP1 = vec_perm(srcR2, srcR3, permP1);
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 15: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = srcR2;
      srcP0 = vec_perm(srcR2, srcR3, permP0);
      srcP1 = vec_perm(srcR2, srcR3, permP1);
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    }

    /* Zero-extend bytes to s16; A = low 8 pixels, B = high 8 pixels. */
    const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
    const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
    const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
    const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);

    const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
    const vector signed short srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);
    const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
    const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);

    const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
    const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
    const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
    const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);

    /* 6-tap kernel: 20*(p0+p1) - 5*(m1+p2) + (m2+p3). */
    const vector signed short sum1A = vec_adds(srcP0A, srcP1A);
    const vector signed short sum1B = vec_adds(srcP0B, srcP1B);
    const vector signed short sum2A = vec_adds(srcM1A, srcP2A);
    const vector signed short sum2B = vec_adds(srcM1B, srcP2B);
    const vector signed short sum3A = vec_adds(srcM2A, srcP3A);
    const vector signed short sum3B = vec_adds(srcM2B, srcP3B);
    
    const vector signed short pp1A = vec_mladd(sum1A, v20ss, sum3A);
    const vector signed short pp1B = vec_mladd(sum1B, v20ss, sum3B);

    const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
    const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);

    const vector signed short psumA = vec_sub(pp1A, pp2A);
    const vector signed short psumB = vec_sub(pp1B, pp2B);

    vec_st(psumA, 0, tmp);
    vec_st(psumB, 16, tmp);
    
    src += srcStride;
    tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
  }
  
  /* Unaligned-store machinery: dstperm/dstmask let the final result be
   * merged (vec_sel) into the two 16-byte lines covering dst. */
  const vector unsigned char dstperm = vec_lvsr(0, dst);
  const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
  const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
  /* Re-interleaves the separately packed even/odd result lanes. */
  const vector unsigned char mperm = (const vector unsigned char)
    AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
        0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
  
  /* Rewind to the start of the scratch buffer (tmp advanced 21 rows). */
  int16_t *tmpbis = tmp - (tmpStride * 21);

  /* Preload the first five rows for the vertical filter window. */
  vector signed short tmpM2ssA = vec_ld(0, tmpbis);
  vector signed short tmpM2ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  vector signed short tmpM1ssA = vec_ld(0, tmpbis);
  vector signed short tmpM1ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  vector signed short tmpP0ssA = vec_ld(0, tmpbis);
  vector signed short tmpP0ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  vector signed short tmpP1ssA = vec_ld(0, tmpbis);
  vector signed short tmpP1ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  vector signed short tmpP2ssA = vec_ld(0, tmpbis);
  vector signed short tmpP2ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;

  /* Pass 2: vertical 6-tap filter over tmp, 16 output rows. */
  for (i = 0 ; i < 16 ; i++) {
    const vector signed short tmpP3ssA = vec_ld(0, tmpbis);
    const vector signed short tmpP3ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
    const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
    const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
    const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
    const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
    const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

    /* Slide the 6-row window down by one. */
    tmpM2ssA = tmpM1ssA;
    tmpM2ssB = tmpM1ssB;
    tmpM1ssA = tmpP0ssA;
    tmpM1ssB = tmpP0ssB;
    tmpP0ssA = tmpP1ssA;
    tmpP0ssB = tmpP1ssB;
    tmpP1ssA = tmpP2ssA;
    tmpP1ssB = tmpP2ssB;
    tmpP2ssA = tmpP3ssA;
    tmpP2ssB = tmpP3ssB;

    /* 32-bit arithmetic: even/odd lanes handled separately. */
    const vector signed int pp1Ae = vec_mule(sum1A, v20ss);
    const vector signed int pp1Ao = vec_mulo(sum1A, v20ss);
    const vector signed int pp1Be = vec_mule(sum1B, v20ss);
    const vector signed int pp1Bo = vec_mulo(sum1B, v20ss);

    const vector signed int pp2Ae = vec_mule(sum2A, v5ss);
    const vector signed int pp2Ao = vec_mulo(sum2A, v5ss);
    const vector signed int pp2Be = vec_mule(sum2B, v5ss);
    const vector signed int pp2Bo = vec_mulo(sum2B, v5ss);

    /* *1 for even lanes via arithmetic >>16 (sign-extends each even s16). */
    const vector signed int pp3Ae = vec_sra((vector signed int)sum3A, v16ui);
    const vector signed int pp3Ao = vec_mulo(sum3A, v1ss);
    const vector signed int pp3Be = vec_sra((vector signed int)sum3B, v16ui);
    const vector signed int pp3Bo = vec_mulo(sum3B, v1ss);

    /* Rounding bias before the final >>10. */
    const vector signed int pp1cAe = vec_add(pp1Ae, v512si);
    const vector signed int pp1cAo = vec_add(pp1Ao, v512si);
    const vector signed int pp1cBe = vec_add(pp1Be, v512si);
    const vector signed int pp1cBo = vec_add(pp1Bo, v512si);

    const vector signed int pp32Ae = vec_sub(pp3Ae, pp2Ae);
    const vector signed int pp32Ao = vec_sub(pp3Ao, pp2Ao);
    const vector signed int pp32Be = vec_sub(pp3Be, pp2Be);
    const vector signed int pp32Bo = vec_sub(pp3Bo, pp2Bo);

    const vector signed int sumAe = vec_add(pp1cAe, pp32Ae);
    const vector signed int sumAo = vec_add(pp1cAo, pp32Ao);
    const vector signed int sumBe = vec_add(pp1cBe, pp32Be);
    const vector signed int sumBo = vec_add(pp1cBo, pp32Bo);
    
    const vector signed int ssumAe = vec_sra(sumAe, v10ui);
    const vector signed int ssumAo = vec_sra(sumAo, v10ui);
    const vector signed int ssumBe = vec_sra(sumBe, v10ui);
    const vector signed int ssumBo = vec_sra(sumBo, v10ui);

    /* Pack to s16, then saturated u8, then restore lane order. */
    const vector signed short ssume = vec_packs(ssumAe, ssumBe);
    const vector signed short ssumo = vec_packs(ssumAo, ssumBo);

    const vector unsigned char sumv = vec_packsu(ssume, ssumo);
    const vector unsigned char sum = vec_perm(sumv, sumv, mperm);

    /* Read-modify-write store for a possibly unaligned dst. */
    const vector unsigned char dst1 = vec_ld(0, dst);
    const vector unsigned char dst2 = vec_ld(16, dst);
    const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

    vector unsigned char fsum;
    OP_U8_ALTIVEC(fsum, sum, vdst);

    const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
    const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
    const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);

    vec_st(fdst1, 0, dst);
    vec_st(fdst2, 16, dst);

    dst += dstStride;
  }
  POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}
Ejemplo n.º 9
0
void
gimp_composite_dodge_rgba8_rgba8_rgba8_altivec (GimpCompositeContext *ctx)
{
  const guchar *A = ctx->A;
  const guchar *B = ctx->B;
  guchar *D = ctx->D;
  guint length = ctx->n_pixels;
  vector unsigned char a,b,d;
  vector unsigned char alpha_a,alpha_b,alpha;
  vector signed short ox0001=vec_splat_s16(1);
  union
    {
      vector signed short v;
      vector unsigned short vu;
      gushort u16[8];
    } ah,al,bh,bl;

  while (length >= 4)
    {
      a=LoadUnaligned(A);
      b=LoadUnaligned(B);

      alpha_a=vec_and(a, alphamask);
      alpha_b=vec_and(b, alphamask);
      alpha=vec_min(alpha_a, alpha_b);

      ah.v=vec_unpackh((vector signed char)a);
      ah.v=vec_sl(ah.v,ox0008);
      al.v=vec_unpackl((vector signed char)a);
      al.v=vec_sl(al.v,ox0008);

      b=vec_nor(b,b);
      bh.v=vec_unpackh((vector signed char)b);
      bh.v=vec_and(bh.v,ox00ff);
      bh.v=vec_add(bh.v,ox0001);
      bl.v=vec_unpackl((vector signed char)b);
      bl.v=vec_and(bl.v,ox00ff);
      bl.v=vec_add(bl.v,ox0001);

      ah.u16[0]=ah.u16[0]/bh.u16[0];
      ah.u16[1]=ah.u16[1]/bh.u16[1];
      ah.u16[2]=ah.u16[2]/bh.u16[2];
      ah.u16[4]=ah.u16[4]/bh.u16[4];
      ah.u16[5]=ah.u16[5]/bh.u16[5];
      ah.u16[6]=ah.u16[6]/bh.u16[6];

      al.u16[0]=al.u16[0]/bl.u16[0];
      al.u16[1]=al.u16[1]/bl.u16[1];
      al.u16[2]=al.u16[2]/bl.u16[2];
      al.u16[4]=al.u16[4]/bl.u16[4];
      al.u16[5]=al.u16[5]/bl.u16[5];
      al.u16[6]=al.u16[6]/bl.u16[6];

      d=vec_packs(ah.vu,al.vu);

      d=vec_andc(d, alphamask);
      d=vec_or(d, alpha);

      StoreUnaligned(d, D);
      A+=16;
      B+=16;
      D+=16;
      length-=4;
    }
  length = length*4;
  a=LoadUnalignedLess(A, length);
  b=LoadUnalignedLess(B, length);

  alpha_a=vec_and(a, alphamask);
  alpha_b=vec_and(b, alphamask);
  alpha=vec_min(alpha_a, alpha_b);

  ah.v=vec_unpackh((vector signed char)a);
  ah.v=vec_sl(ah.v,ox0008);
  al.v=vec_unpackl((vector signed char)a);
  al.v=vec_sl(al.v,ox0008);

  b=vec_nor(b,b);
  bh.v=vec_unpackh((vector signed char)b);
  bh.v=vec_and(bh.v,ox00ff);
  bh.v=vec_add(bh.v,ox0001);
  bl.v=vec_unpackl((vector signed char)b);
  bl.v=vec_and(bl.v,ox00ff);
  bl.v=vec_add(bl.v,ox0001);

  ah.u16[0]=ah.u16[0]/bh.u16[0];
  ah.u16[1]=ah.u16[1]/bh.u16[1];
  ah.u16[2]=ah.u16[2]/bh.u16[2];
  ah.u16[4]=ah.u16[4]/bh.u16[4];
  ah.u16[5]=ah.u16[5]/bh.u16[5];
  ah.u16[6]=ah.u16[6]/bh.u16[6];

  al.u16[0]=al.u16[0]/bl.u16[0];
  al.u16[1]=al.u16[1]/bl.u16[1];
  al.u16[2]=al.u16[2]/bl.u16[2];
  al.u16[4]=al.u16[4]/bl.u16[4];
  al.u16[5]=al.u16[5]/bl.u16[5];
  al.u16[6]=al.u16[6]/bl.u16[6];

  d=vec_packs(ah.vu,al.vu);

  d=vec_andc(d, alphamask);
  d=vec_or(d, alpha);

  StoreUnalignedLess(d, D, length);
}
Ejemplo n.º 10
0
/* Testsuite driver for the AltiVec pack family:
 * vec_pack (truncating), vec_packpx (pixel pack), vec_packs (saturating)
 * and vec_packsu (saturating to unsigned). Each result is compared
 * element-wise against an expected vector chosen per byte order, since
 * pack interleaves its operands differently on LE vs BE. */
static void test()
{
  /* Input vectors.  */
  vector unsigned short vusa = {0,1,2,3,4,5,6,7};
  vector unsigned short vusb = {8,9,10,11,12,13,14,15};
  vector signed short vssa = {-8,-7,-6,-5,-4,-3,-2,-1};
  vector signed short vssb = {0,1,2,3,4,5,6,7};
  vector bool short vbsa = {0,65535,65535,0,0,0,65535,0};
  vector bool short vbsb = {65535,0,0,65535,65535,65535,0,65535};
  vector unsigned int vuia = {0,1,2,3};
  vector unsigned int vuib = {4,5,6,7};
  vector signed int vsia = {-4,-3,-2,-1};
  vector signed int vsib = {0,1,2,3};
  /* BIG is presumably an all-ones 32-bit value defined by the harness. */
  vector bool int vbia = {0,BIG,BIG,BIG};
  vector bool int vbib = {BIG,0,0,0};
  /* 32-bit "pixel" inputs: 1|8|8|8 fields placed so vec_packpx keeps the
   * top bit of each byte field (hence the <<24/<<19/<<11/<<3 offsets). */
  vector unsigned int vipa = {(0<<24) + (2<<19) + (3<<11) + (4<<3),
			      (1<<24) + (5<<19) + (6<<11) + (7<<3),
			      (0<<24) + (8<<19) + (9<<11) + (10<<3),
			      (1<<24) + (11<<19) + (12<<11) + (13<<3)};
  vector unsigned int vipb = {(1<<24) + (14<<19) + (15<<11) + (16<<3),
			      (0<<24) + (17<<19) + (18<<11) + (19<<3),
			      (1<<24) + (20<<19) + (21<<11) + (22<<3),
			      (0<<24) + (23<<19) + (24<<11) + (25<<3)};
  /* Values chosen to straddle the saturation thresholds of each pack. */
  vector unsigned short vusc = {0,256,1,257,2,258,3,259};
  vector unsigned short vusd = {4,260,5,261,6,262,7,263};
  vector signed short vssc = {-1,-128,0,127,-2,-129,1,128};
  vector signed short vssd = {-3,-130,2,129,-4,-131,3,130};
  vector unsigned int vuic = {0,65536,1,65537};
  vector unsigned int vuid = {2,65538,3,65539};
  vector signed int vsic = {-1,-32768,0,32767};
  vector signed int vsid = {-2,-32769,1,32768};

  /* Result vectors.  */
  vector unsigned char vucr;
  vector signed char vscr;
  vector bool char vbcr;
  vector unsigned short vusr;
  vector signed short vssr;
  vector bool short vbsr;
  vector pixel vpr;
  vector unsigned char vucsr;
  vector signed char vscsr;
  vector unsigned short vussr;
  vector signed short vsssr;
  vector unsigned char vucsur1, vucsur2;
  vector unsigned short vussur1, vussur2;

  /* Expected result vectors.  */
  /* On little-endian the two pack halves land in swapped order, so each
   * expected vector exists in an LE and a BE flavor. */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  vector unsigned char vucer = {8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7};
  vector signed char vscer = {0,1,2,3,4,5,6,7,-8,-7,-6,-5,-4,-3,-2,-1};
  vector bool char vbcer = {255,0,0,255,255,255,0,255,0,255,255,0,0,0,255,0};
  vector unsigned short vuser = {4,5,6,7,0,1,2,3};
  vector signed short vsser = {0,1,2,3,-4,-3,-2,-1};
  vector bool short vbser = {65535,0,0,0,0,65535,65535,65535};
  vector pixel vper = {(1<<15) + (14<<10) + (15<<5) + 16,
		       (0<<15) + (17<<10) + (18<<5) + 19,
		       (1<<15) + (20<<10) + (21<<5) + 22,
		       (0<<15) + (23<<10) + (24<<5) + 25,
		       (0<<15) + (2<<10) + (3<<5) + 4,
		       (1<<15) + (5<<10) + (6<<5) + 7,
		       (0<<15) + (8<<10) + (9<<5) + 10,
		       (1<<15) + (11<<10) + (12<<5) + 13};
  vector unsigned char vucser = {4,255,5,255,6,255,7,255,0,255,1,255,2,255,3,255};
  vector signed char vscser = {-3,-128,2,127,-4,-128,3,127,
			       -1,-128,0,127,-2,-128,1,127};
  vector unsigned short vusser = {2,65535,3,65535,0,65535,1,65535};
  vector signed short vssser = {-2,-32768,1,32767,-1,-32768,0,32767};
  vector unsigned char vucsuer1 = {4,255,5,255,6,255,7,255,0,255,1,255,2,255,3,255};
  vector unsigned char vucsuer2 = {0,0,2,129,0,0,3,130,0,0,0,127,0,0,1,128};
  vector unsigned short vussuer1 = {2,65535,3,65535,0,65535,1,65535};
  vector unsigned short vussuer2 = {0,0,1,32768,0,0,0,32767};
#else
  vector unsigned char vucer = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
  vector signed char vscer = {-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7};
  vector bool char vbcer = {0,255,255,0,0,0,255,0,255,0,0,255,255,255,0,255};
  vector unsigned short vuser = {0,1,2,3,4,5,6,7};
  vector signed short vsser = {-4,-3,-2,-1,0,1,2,3};
  vector bool short vbser = {0,65535,65535,65535,65535,0,0,0};
  vector pixel vper = {(0<<15) + (2<<10) + (3<<5) + 4,
		       (1<<15) + (5<<10) + (6<<5) + 7,
		       (0<<15) + (8<<10) + (9<<5) + 10,
		       (1<<15) + (11<<10) + (12<<5) + 13,
		       (1<<15) + (14<<10) + (15<<5) + 16,
		       (0<<15) + (17<<10) + (18<<5) + 19,
		       (1<<15) + (20<<10) + (21<<5) + 22,
		       (0<<15) + (23<<10) + (24<<5) + 25};
  vector unsigned char vucser = {0,255,1,255,2,255,3,255,4,255,5,255,6,255,7,255};
  vector signed char vscser = {-1,-128,0,127,-2,-128,1,127,
			       -3,-128,2,127,-4,-128,3,127};
  vector unsigned short vusser = {0,65535,1,65535,2,65535,3,65535};
  vector signed short vssser = {-1,-32768,0,32767,-2,-32768,1,32767};
  vector unsigned char vucsuer1 = {0,255,1,255,2,255,3,255,4,255,5,255,6,255,7,255};
  vector unsigned char vucsuer2 = {0,0,0,127,0,0,1,128,0,0,2,129,0,0,3,130};
  vector unsigned short vussuer1 = {0,65535,1,65535,2,65535,3,65535};
  vector unsigned short vussuer2 = {0,0,0,32767,0,0,1,32768};
#endif

  /* Exercise every pack overload once. */
  vucr = vec_pack (vusa, vusb);
  vscr = vec_pack (vssa, vssb);
  vbcr = vec_pack (vbsa, vbsb);
  vusr = vec_pack (vuia, vuib);
  vssr = vec_pack (vsia, vsib);
  vbsr = vec_pack (vbia, vbib);
  vpr  = vec_packpx (vipa, vipb);
  vucsr = vec_packs (vusc, vusd);
  vscsr = vec_packs (vssc, vssd);
  vussr = vec_packs (vuic, vuid);
  vsssr = vec_packs (vsic, vsid);
  vucsur1 = vec_packsu (vusc, vusd);
  vucsur2 = vec_packsu (vssc, vssd);
  vussur1 = vec_packsu (vuic, vuid);
  vussur2 = vec_packsu (vsic, vsid);

  /* check() is provided by the test harness elsewhere in this file. */
  check (vec_all_eq (vucr, vucer), "vucr");
  check (vec_all_eq (vscr, vscer), "vscr");
  check (vec_all_eq (vbcr, vbcer), "vbcr");
  check (vec_all_eq (vusr, vuser), "vusr");
  check (vec_all_eq (vssr, vsser), "vssr");
  check (vec_all_eq (vbsr, vbser), "vbsr");
  check (vec_all_eq (vpr,  vper ), "vpr" );
  check (vec_all_eq (vucsr, vucser), "vucsr");
  check (vec_all_eq (vscsr, vscser), "vscsr");
  check (vec_all_eq (vussr, vusser), "vussr");
  check (vec_all_eq (vsssr, vssser), "vsssr");
  check (vec_all_eq (vucsur1, vucsuer1), "vucsur1");
  check (vec_all_eq (vucsur2, vucsuer2), "vucsur2");
  check (vec_all_eq (vussur1, vussuer1), "vussur1");
  check (vec_all_eq (vussur2, vussuer2), "vussur2");
}
Ejemplo n.º 11
0
/*
 * AltiVec implementation of the accurate ("islow") 8x8 inverse DCT used by
 * JPEG decompression.
 *
 * dct_table_  : per-component dequantization multipliers (16-bit words),
 *               laid out as eight 8-short vectors matching the coefficient
 *               rows.
 * coef_block  : 64 quantized DCT coefficients (eight 8-short vectors).
 * output_buf / output_col : destination sample rows; 8 output bytes are
 *               written per row starting at output_col.
 *
 * NOTE(review): DO_IDCT, TRANSPOSE, the F_* fixed-point constants and the
 * __4X/__4X2/__8X/__16X initializer macros are defined elsewhere in this
 * file; the comments below describe only what is visible here -- confirm
 * the arithmetic against those macro definitions.
 */
void
jsimd_idct_islow_altivec (void * dct_table_, JCOEFPTR coef_block,
                          JSAMPARRAY output_buf, JDIMENSION output_col)
{
    short *dct_table = (short *)dct_table_;
    int *outptr;

    __vector short row0, row1, row2, row3, row4, row5, row6, row7,
             col0, col1, col2, col3, col4, col5, col6, col7,
             quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
             tmp0, tmp1, tmp2, tmp3, z3, z4,
             z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
             row71l, row71h, row26l, row26h, row53l, row53h,
             out0, out1, out2, out3, out4, out5, out6, out7;
    __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
             tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
             z3l, z3h, z4l, z4h,
             out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
             out5l, out5h, out6l, out6h, out7l, out7h;
    __vector signed char outb;

    /* Constants (fixed-point rotation factors and descale/rounding terms) */
    __vector short pw_zero = { __8X(0) },
                   pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
                   pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
                   pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
                   pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
                   pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
                   pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
                   pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
                   pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) };
    __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
    __vector int pd_zero = { __4X(0) },
                 pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
                 pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
    __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
                          descale_p2 = { __4X(DESCALE_P2) },
                          const_bits = { __4X(CONST_BITS) };
    __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };

    /* Pass 1: process columns */

    col0 = vec_ld(0, coef_block);
    col1 = vec_ld(16, coef_block);
    col2 = vec_ld(32, coef_block);
    col3 = vec_ld(48, coef_block);
    col4 = vec_ld(64, coef_block);
    col5 = vec_ld(80, coef_block);
    col6 = vec_ld(96, coef_block);
    col7 = vec_ld(112, coef_block);

    /* OR rows 1..7 together: tmp1 is all-zero iff every AC coefficient
     * is zero, enabling the DC-only shortcut below. */
    tmp1 = vec_or(col1, col2);
    tmp2 = vec_or(col3, col4);
    tmp1 = vec_or(tmp1, tmp2);
    tmp3 = vec_or(col5, col6);
    tmp3 = vec_or(tmp3, col7);
    tmp1 = vec_or(tmp1, tmp3);

    /* Dequantize the DC row (element-wise multiply by the quant table). */
    quant0 = vec_ld(0, dct_table);
    col0 = vec_mladd(col0, quant0, pw_zero);

    if (vec_all_eq(tmp1, pw_zero)) {
        /* AC terms all zero: the IDCT of a DC-only block is flat, so just
         * scale the DC values and replicate each one across its row. */

        col0 = vec_sl(col0, pass1_bits);

        row0 = vec_splat(col0, 0);
        row1 = vec_splat(col0, 1);
        row2 = vec_splat(col0, 2);
        row3 = vec_splat(col0, 3);
        row4 = vec_splat(col0, 4);
        row5 = vec_splat(col0, 5);
        row6 = vec_splat(col0, 6);
        row7 = vec_splat(col0, 7);

    } else {
        /* General case: dequantize all AC rows, run the 1-D IDCT on the
         * columns, then transpose so pass 2 can work on rows. */

        quant1 = vec_ld(16, dct_table);
        quant2 = vec_ld(32, dct_table);
        quant3 = vec_ld(48, dct_table);
        quant4 = vec_ld(64, dct_table);
        quant5 = vec_ld(80, dct_table);
        quant6 = vec_ld(96, dct_table);
        quant7 = vec_ld(112, dct_table);

        col1 = vec_mladd(col1, quant1, pw_zero);
        col2 = vec_mladd(col2, quant2, pw_zero);
        col3 = vec_mladd(col3, quant3, pw_zero);
        col4 = vec_mladd(col4, quant4, pw_zero);
        col5 = vec_mladd(col5, quant5, pw_zero);
        col6 = vec_mladd(col6, quant6, pw_zero);
        col7 = vec_mladd(col7, quant7, pw_zero);

        DO_IDCT(col, 1);

        TRANSPOSE(out, row);
    }

    /* Pass 2: process rows */

    DO_IDCT(row, 2);

    TRANSPOSE(out, col);

    /* For each of the 8 rows: saturate-pack to signed bytes, recenter
     * around CENTERJSAMPLE, and store the 8 sample bytes as two 4-byte
     * element stores.
     * NOTE(review): vec_ste writes to a naturally aligned address -- this
     * presumably relies on output_buf[i] + output_col being 4-byte
     * aligned; confirm against the caller. */
    outb = vec_packs(col0, col0);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[0] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col1, col1);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[1] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col2, col2);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[2] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col3, col3);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[3] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col4, col4);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[4] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col5, col5);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[5] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col6, col6);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[6] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col7, col7);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[7] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);
}
Ejemplo n.º 12
0
/* Temporal motion blur for interleaved YUV images, AltiVec-accelerated.
 *
 * Each output byte is (imageGain * current + rightGain * saved) >> 8,
 * computed on the chroma (U/V, even) and luma (Y, odd) byte lanes
 * separately; the blended frame is written back both to the live image
 * and to m_savedImage, which feeds the next frame's blend.
 *
 * NOTE(review): the byte layout is described below as "U Y V Y"; only the
 * even lanes get the +/-128 chroma recentering -- confirm against the
 * imageStruct format. */
void pix_motionblur :: processYUVAltivec(imageStruct &image)
{
  int h,w,width;
  signed short rightGain,imageGain;
  unsigned char *saved = m_savedImage.data;

  /* Keep the saved frame the same geometry/format as the input; if
   * reallocate() moved the buffer, start from a black history frame. */
  m_savedImage.xsize=image.xsize;
  m_savedImage.ysize=image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();
  if(saved!=m_savedImage.data) {
    m_savedImage.setBlack();
  }
  saved=m_savedImage.data;

  /* 8 pixels (16 bytes, one vector) are processed per inner iteration. */
  width = image.xsize/8;
  /*
  // hmm: why does it read 235 ?
  rightGain = (signed short)(235. * m_motionblur);
  imageGain = (signed short) (255. - (235. * m_motionblur));
  */
  rightGain = m_blur1;
  imageGain = m_blur0;

  /* Scalar <-> vector transfer buffers. */
  union {
    signed short        elements[8];
    vector      signed short v;
  } shortBuffer;

  union {
    unsigned int        elements[4];
    vector      unsigned int v;
  } bitBuffer;

  register vector signed short gainAdd, hiImage, loImage,hiRight,loRight,
           YImage, UVImage;
  // register vector signed short loadhiImage, loadloImage,loadhiRight,loadloRight;
  register vector unsigned char loadImage, loadRight;
  register vector unsigned char zero = vec_splat_u8(0);
  register vector signed int UVhi,UVlo,Yhi,Ylo;
  register vector signed int UVhiR,UVloR,YhiR,YloR;
  register vector signed short gainSub,gain,gainR;//,d;
  register vector unsigned int bitshift;
  vector unsigned char *inData = (vector unsigned char*) image.data;
  vector unsigned char *rightData = (vector unsigned char*) saved;


  /* gainSub = {128,0,128,0,...}: subtracts the chroma bias from the even
   * (U/V) lanes only, leaving the odd (Y) lanes untouched. */
  shortBuffer.elements[0] = 128;
  shortBuffer.elements[1] = 0;
  shortBuffer.elements[2] = 128;
  shortBuffer.elements[3] = 0;
  shortBuffer.elements[4] = 128;
  shortBuffer.elements[5] = 0;
  shortBuffer.elements[6] = 128;
  shortBuffer.elements[7] = 0;

  gainSub = shortBuffer.v;

  /* Broadcast the two blend gains across all 8 short lanes. */
  shortBuffer.elements[0] = imageGain;
  gain = shortBuffer.v;
  gain =  vec_splat(gain, 0 );

  shortBuffer.elements[0] = rightGain;
  gainR = shortBuffer.v;
  gainR =  vec_splat(gainR, 0 );

  /* Shift count of 8: descales the gain-weighted 16.8 fixed-point sums. */
  bitBuffer.elements[0] = 8;

  //Load it into the vector unit
  bitshift = bitBuffer.v;
  bitshift = vec_splat(bitshift,0);

  shortBuffer.elements[0] = 128;

  //Load it into the vector unit
  /* NOTE(review): gainAdd is initialized here but never used in the loop
   * below (the +128 re-bias is done with gainSub via vec_adds instead). */
  gainAdd = shortBuffer.v;
  gainAdd = (vector signed short)vec_splat((vector signed short)gainAdd,0);

# ifndef PPC970
  /* Software-directed cache streams for both input buffers (not used on
   * the PPC970, which prefetches well in hardware). */
  UInt32                        prefetchSize = GetPrefetchConstant( 16, 1,
      256 );
  vec_dst( inData, prefetchSize, 0 );
  vec_dst( rightData, prefetchSize, 1 );
  vec_dst( inData+32, prefetchSize, 2 );
  vec_dst( rightData+32, prefetchSize, 3 );
# endif

  /* Preload the first vector of each buffer; inside the loop the next
   * loads are hoisted above the arithmetic to hide load latency.
   * NOTE(review): on the final iteration the hoisted inData[1]/rightData[1]
   * loads read one vector past the processed area -- verify the image
   * buffers have at least 16 bytes of slack. */
  loadImage = inData[0];
  loadRight = rightData[0];

  for ( h=0; h<image.ysize; h++) {
    for (w=0; w<width; w++) {
# ifndef PPC970
      vec_dst( inData, prefetchSize, 0 );
      vec_dst( rightData, prefetchSize, 1 );
      vec_dst( inData+32, prefetchSize, 2 );
      vec_dst( rightData+32, prefetchSize, 3 );
# endif
      //interleaved U Y V Y chars

      /* Zero-extend the 16 bytes to 2x8 signed shorts. */
      hiImage = (vector signed short) vec_mergeh( zero, loadImage );
      loImage = (vector signed short) vec_mergel( zero, loadImage );

      hiRight = (vector signed short) vec_mergeh( zero, loadRight );
      loRight = (vector signed short) vec_mergel( zero, loadRight );

      //hoist that load!!
      loadImage = inData[1];
      loadRight = rightData[1];

      //subtract 128 from UV

      hiImage = vec_subs(hiImage,gainSub);
      loImage = vec_subs(loImage,gainSub);

      hiRight = vec_subs(hiRight,gainSub);
      loRight = vec_subs(loRight,gainSub);

      //now vec_mule the UV into two vector ints
      //change sone to gain
      /* Even-lane (UV) products with each buffer's gain. */
      UVhi = vec_mule(gain,hiImage);
      UVlo = vec_mule(gain,loImage);

      UVhiR = vec_mule(gainR,hiRight);
      UVloR = vec_mule(gainR,loRight);

      //now vec_mulo the Y into two vector ints
      /* Odd-lane (Y) products. */
      Yhi = vec_mulo(gain,hiImage);
      Ylo = vec_mulo(gain,loImage);

      YhiR = vec_mulo(gainR,hiRight);
      YloR = vec_mulo(gainR,loRight);


      //this is where to do the add and bitshift due to the resolution
      //add UV
      UVhi = vec_adds(UVhi,UVhiR);
      UVlo = vec_adds(UVlo,UVloR);

      Yhi = vec_adds(Yhi,YhiR);
      Ylo = vec_adds(Ylo,YloR);

      //bitshift UV
      /* Arithmetic >>8 removes the 8-bit gain scale. */
      UVhi = vec_sra(UVhi,bitshift);
      UVlo = vec_sra(UVlo,bitshift);

      Yhi = vec_sra(Yhi,bitshift);
      Ylo = vec_sra(Ylo,bitshift);

      //pack the UV into a single short vector
      UVImage =  vec_packs(UVhi,UVlo);

      //pack the Y into a single short vector
      YImage =  vec_packs(Yhi,Ylo);

      //vec_mergel + vec_mergeh Y and UV
      /* Re-interleave chroma and luma back into U Y V Y order. */
      hiImage =  vec_mergeh(UVImage,YImage);
      loImage =  vec_mergel(UVImage,YImage);

      //add 128 offset back
      hiImage = vec_adds(hiImage,gainSub);
      loImage = vec_adds(loImage,gainSub);

      /* Saturate-pack to unsigned bytes and write the blended vector to
       * BOTH the saved history frame and the live image. */
      rightData[0] = (vector unsigned char)vec_packsu(hiImage, loImage);
      inData[0] = (vector unsigned char)vec_packsu(hiImage, loImage);

      inData++;
      rightData++;
    }
  }
# ifndef PPC970
  //stop the cache streams
  vec_dss( 0 );
  vec_dss( 1 );
  vec_dss( 2 );
  vec_dss( 3 );
# endif


}/* end of working altivec function */
Ejemplo n.º 13
0
/*
 * Interleave a CAVLC-ordered 8x8 coefficient block into four 4x4 blocks
 * using AltiVec merges/permutes, and refresh the nnz[] non-zero flags.
 *
 * dst : receives the 64 interleaved int16 coefficients (eight aligned
 *       16-byte stores).
 * src : 64 int16 source coefficients (16-byte aligned).
 * nnz : flag bytes; the lanes selected by mask[2] are overwritten with 1
 *       when the corresponding block contains any non-zero coefficient,
 *       0 otherwise.  All other nnz bytes are preserved.
 */
void x264_zigzag_interleave_8x8_cavlc_altivec( int16_t *dst, int16_t *src, uint8_t *nnz )
{
    vec_s16_t tmpv[8];
    vec_s16_t merge[2];
    /* FIX: three permute results are live at once -- permv[1] and permv[2]
     * produced from one merge group are stored only after the next group
     * has been computed -- so the array needs 3 elements.  It was declared
     * with only 2, making every permv[2] access an out-of-bounds write
     * (undefined behavior, likely clobbering orv[0]). */
    vec_s16_t permv[3];
    vec_s16_t orv[4];
    vec_s16_t src0v = vec_ld( 0*16, src );
    vec_s16_t src1v = vec_ld( 1*16, src );
    vec_s16_t src2v = vec_ld( 2*16, src );
    vec_s16_t src3v = vec_ld( 3*16, src );
    vec_s16_t src4v = vec_ld( 4*16, src );
    vec_s16_t src5v = vec_ld( 5*16, src );
    vec_s16_t src6v = vec_ld( 6*16, src );
    vec_s16_t src7v = vec_ld( 7*16, src );
    vec_u8_t pack;
    vec_u8_t nnzv = vec_vsx_ld( 0, nnz );
    vec_u8_t shift = vec_splat_u8( 7 );
    LOAD_ZERO;

    const vec_u8_t mask[3] = {
        { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 },
        { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F },
        { 0x10, 0x11, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x12, 0x13, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F }
    };

    /* Stage 1: interleave adjacent source rows element-wise. */
    tmpv[0] = vec_mergeh( src0v, src1v );
    tmpv[1] = vec_mergel( src0v, src1v );

    tmpv[2] = vec_mergeh( src2v, src3v );
    tmpv[3] = vec_mergel( src2v, src3v );

    tmpv[4] = vec_mergeh( src4v, src5v );
    tmpv[5] = vec_mergel( src4v, src5v );

    tmpv[6] = vec_mergeh( src6v, src7v );
    tmpv[7] = vec_mergel( src6v, src7v );

    /* Stage 2: merge pairs again and permute the halves into their final
     * interleaved positions; permv[1]/permv[2] stores are deferred so the
     * destination layout comes out in block order. */
    merge[0] = vec_mergeh( tmpv[0], tmpv[1] );
    merge[1] = vec_mergeh( tmpv[2], tmpv[3] );
    permv[0] = vec_perm( merge[0], merge[1], mask[0] );
    permv[1] = vec_perm( merge[0], merge[1], mask[1] );
    vec_st( permv[0], 0*16, dst );

    merge[0] = vec_mergeh( tmpv[4], tmpv[5] );
    merge[1] = vec_mergeh( tmpv[6], tmpv[7] );
    permv[0] = vec_perm( merge[0], merge[1], mask[0] );
    permv[2] = vec_perm( merge[0], merge[1], mask[1] );
    vec_st( permv[0], 1*16, dst );
    vec_st( permv[1], 2*16, dst );
    vec_st( permv[2], 3*16, dst );

    merge[0] = vec_mergel( tmpv[0], tmpv[1] );
    merge[1] = vec_mergel( tmpv[2], tmpv[3] );
    permv[0] = vec_perm( merge[0], merge[1], mask[0] );
    permv[1] = vec_perm( merge[0], merge[1], mask[1] );
    vec_st( permv[0], 4*16, dst );

    merge[0] = vec_mergel( tmpv[4], tmpv[5] );
    merge[1] = vec_mergel( tmpv[6], tmpv[7] );
    permv[0] = vec_perm( merge[0], merge[1], mask[0] );
    permv[2] = vec_perm( merge[0], merge[1], mask[1] );
    vec_st( permv[0], 5*16, dst );
    vec_st( permv[1], 6*16, dst );
    vec_st( permv[2], 7*16, dst );

    /* Non-zero detection: OR all rows together, fold the halves, then
     * reduce each lane to a 0/1 byte flag. */
    orv[0] = vec_or( src0v, src1v );
    orv[1] = vec_or( src2v, src3v );
    orv[2] = vec_or( src4v, src5v );
    orv[3] = vec_or( src6v, src7v );

    permv[0] = vec_or( orv[0], orv[1] );
    permv[1] = vec_or( orv[2], orv[3] );
    permv[0] = vec_or( permv[0], permv[1] );

    permv[1] = vec_perm( permv[0], permv[0], mask[1] );
    permv[0] = vec_or( permv[0], permv[1] );

    /* pack -> bytes; cmpeq/nor turns non-zero into 0xFF; >>7 yields 0/1. */
    pack = (vec_u8_t)vec_packs( permv[0], permv[0] );
    pack = (vec_u8_t)vec_cmpeq( pack, zerov );
    pack = vec_nor( pack, zerov );
    pack = vec_sr( pack, shift );
    nnzv = vec_perm( nnzv, pack, mask[2] );
    vec_st( nnzv, 0, nnz );
}
Ejemplo n.º 14
0
// CHECK-LABEL: define void @test1
void test1() {

  /* vec_cmpeq */
  res_vbll = vec_cmpeq(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpequd
// CHECK-LE: @llvm.ppc.altivec.vcmpequd
// CHECK-PPC: error: call to 'vec_cmpeq' is ambiguous

  res_vbll = vec_cmpeq(vull, vull);
// CHECK: @llvm.ppc.altivec.vcmpequd
// CHECK-LE: @llvm.ppc.altivec.vcmpequd
// CHECK-PPC: error: call to 'vec_cmpeq' is ambiguous

  /* vec_cmpgt */
  res_vbll = vec_cmpgt(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpgtsd
// CHECK-LE: @llvm.ppc.altivec.vcmpgtsd
// CHECK-PPC: error: call to 'vec_cmpgt' is ambiguous

  res_vbll = vec_cmpgt(vull, vull);
// CHECK: @llvm.ppc.altivec.vcmpgtud
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud
// CHECK-PPC: error: call to 'vec_cmpgt' is ambiguous

  /* ----------------------- predicates --------------------------- */
  /* vec_all_eq */
  res_i = vec_all_eq(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_all_eq' is ambiguous

  res_i = vec_all_eq(vsll, vbll);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_all_eq' is ambiguous

  res_i = vec_all_eq(vull, vull);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_all_eq' is ambiguous

  res_i = vec_all_eq(vull, vbll);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_all_eq' is ambiguous

  res_i = vec_all_eq(vbll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_all_eq' is ambiguous

  res_i = vec_all_eq(vbll, vull);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_all_eq' is ambiguous

  res_i = vec_all_eq(vbll, vbll);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_all_eq' is ambiguous

  /* vec_all_ne */
  res_i = vec_all_ne(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_all_ne' is ambiguous

  res_i = vec_all_ne(vsll, vbll);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_all_ne' is ambiguous

  res_i = vec_all_ne(vull, vull);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_all_ne' is ambiguous

  res_i = vec_all_ne(vull, vbll);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_all_ne' is ambiguous

  res_i = vec_all_ne(vbll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_all_ne' is ambiguous

  res_i = vec_all_ne(vbll, vull);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_all_ne' is ambiguous

  res_i = vec_all_ne(vbll, vbll);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_all_ne' is ambiguous

  /* vec_any_eq */
  res_i = vec_any_eq(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_any_eq' is ambiguous

  res_i = vec_any_eq(vsll, vbll);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_any_eq' is ambiguous

  res_i = vec_any_eq(vull, vull);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_any_eq' is ambiguous

  res_i = vec_any_eq(vull, vbll);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_any_eq' is ambiguous

  res_i = vec_any_eq(vbll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_any_eq' is ambiguous

  res_i = vec_any_eq(vbll, vull);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_any_eq' is ambiguous

  res_i = vec_any_eq(vbll, vbll);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_any_eq' is ambiguous

  /* vec_any_ne */
  res_i = vec_any_ne(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_any_ne' is ambiguous

  res_i = vec_any_ne(vsll, vbll);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_any_ne' is ambiguous

  res_i = vec_any_ne(vull, vull);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_any_ne' is ambiguous

  res_i = vec_any_ne(vull, vbll);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_any_ne' is ambiguous

  res_i = vec_any_ne(vbll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_any_ne' is ambiguous

  res_i = vec_any_ne(vbll, vull);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_any_ne' is ambiguous

  res_i = vec_any_ne(vbll, vbll);
// CHECK: @llvm.ppc.altivec.vcmpequd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpequd.p
// CHECK-PPC: error: call to 'vec_any_ne' is ambiguous

  /* vec_all_ge */
  res_i = vec_all_ge(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-PPC: error: call to 'vec_all_ge' is ambiguous

  res_i = vec_all_ge(vsll, vbll);
// CHECK: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-PPC: error: call to 'vec_all_ge' is ambiguous

  res_i = vec_all_ge(vull, vull);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_all_ge' is ambiguous

  res_i = vec_all_ge(vull, vbll);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_all_ge' is ambiguous

  res_i = vec_all_ge(vbll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_all_ge' is ambiguous

  res_i = vec_all_ge(vbll, vull);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_all_ge' is ambiguous

  res_i = vec_all_ge(vbll, vbll);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_all_ge' is ambiguous

  /* vec_all_gt */
  res_i = vec_all_gt(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-PPC: error: call to 'vec_all_gt' is ambiguous

  res_i = vec_all_gt(vsll, vbll);
// CHECK: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-PPC: error: call to 'vec_all_gt' is ambiguous

  res_i = vec_all_gt(vull, vull);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_all_gt' is ambiguous

  res_i = vec_all_gt(vull, vbll);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_all_gt' is ambiguous

  res_i = vec_all_gt(vbll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_all_gt' is ambiguous

  res_i = vec_all_gt(vbll, vull);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_all_gt' is ambiguous

  res_i = vec_all_gt(vbll, vbll);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_all_gt' is ambiguous

  /* vec_all_le */
  res_i = vec_all_le(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-PPC: error: call to 'vec_all_le' is ambiguous

  res_i = vec_all_le(vsll, vbll);
// CHECK: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-PPC: error: call to 'vec_all_le' is ambiguous

  res_i = vec_all_le(vull, vull);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_all_le' is ambiguous

  res_i = vec_all_le(vull, vbll);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_all_le' is ambiguous

  res_i = vec_all_le(vbll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_all_le' is ambiguous

  res_i = vec_all_le(vbll, vull);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_all_le' is ambiguous

  res_i = vec_all_le(vbll, vbll);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_all_le' is ambiguous

  /* vec_all_lt */
  res_i = vec_all_lt(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-PPC: error: call to 'vec_all_lt' is ambiguous

  res_i = vec_all_lt(vsll, vbll);
// CHECK: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-PPC: error: call to 'vec_all_lt' is ambiguous

  res_i = vec_all_lt(vull, vull);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_all_lt' is ambiguous

  res_i = vec_all_lt(vull, vbll);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_all_lt' is ambiguous

  res_i = vec_all_lt(vbll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_all_lt' is ambiguous

  res_i = vec_all_lt(vbll, vull);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_all_lt' is ambiguous

  res_i = vec_all_lt(vbll, vbll);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_all_lt' is ambiguous

  /* vec_any_ge */
  res_i = vec_any_ge(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-PPC: error: call to 'vec_any_ge' is ambiguous

  res_i = vec_any_ge(vsll, vbll);
// CHECK: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-PPC: error: call to 'vec_any_ge' is ambiguous

  res_i = vec_any_ge(vull, vull);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_any_ge' is ambiguous

  res_i = vec_any_ge(vull, vbll);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_any_ge' is ambiguous

  res_i = vec_any_ge(vbll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_any_ge' is ambiguous

  res_i = vec_any_ge(vbll, vull);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_any_ge' is ambiguous

  res_i = vec_any_ge(vbll, vbll);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_any_ge' is ambiguous

  /* vec_any_gt */
  res_i = vec_any_gt(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-PPC: error: call to 'vec_any_gt' is ambiguous

  res_i = vec_any_gt(vsll, vbll);
// CHECK: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-PPC: error: call to 'vec_any_gt' is ambiguous

  res_i = vec_any_gt(vull, vull);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_any_gt' is ambiguous

  res_i = vec_any_gt(vull, vbll);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_any_gt' is ambiguous

  res_i = vec_any_gt(vbll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_any_gt' is ambiguous

  res_i = vec_any_gt(vbll, vull);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_any_gt' is ambiguous

  res_i = vec_any_gt(vbll, vbll);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_any_gt' is ambiguous

  /* vec_any_le */
  res_i = vec_any_le(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-PPC: error: call to 'vec_any_le' is ambiguous

  res_i = vec_any_le(vsll, vbll);
// CHECK: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-PPC: error: call to 'vec_any_le' is ambiguous

  res_i = vec_any_le(vull, vull);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_any_le' is ambiguous

  res_i = vec_any_le(vull, vbll);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_any_le' is ambiguous

  res_i = vec_any_le(vbll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_any_le' is ambiguous

  res_i = vec_any_le(vbll, vull);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_any_le' is ambiguous

  res_i = vec_any_le(vbll, vbll);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_any_le' is ambiguous

  /* vec_any_lt */
  res_i = vec_any_lt(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-PPC: error: call to 'vec_any_lt' is ambiguous

  res_i = vec_any_lt(vsll, vbll);
// CHECK: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p
// CHECK-PPC: error: call to 'vec_any_lt' is ambiguous

  res_i = vec_any_lt(vull, vull);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_any_lt' is ambiguous

  res_i = vec_any_lt(vull, vbll);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_any_lt' is ambiguous

  res_i = vec_any_lt(vbll, vsll);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_any_lt' is ambiguous

  res_i = vec_any_lt(vbll, vull);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_any_lt' is ambiguous

  res_i = vec_any_lt(vbll, vbll);
// CHECK: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p
// CHECK-PPC: error: call to 'vec_any_lt' is ambiguous

  /* vec_max */
  res_vsll = vec_max(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vmaxsd
// CHECK-LE: @llvm.ppc.altivec.vmaxsd
// CHECK-PPC: error: call to 'vec_max' is ambiguous

  res_vsll = vec_max(vbll, vsll);
// CHECK: @llvm.ppc.altivec.vmaxsd
// CHECK-LE: @llvm.ppc.altivec.vmaxsd
// CHECK-PPC: error: call to 'vec_max' is ambiguous

  res_vsll = vec_max(vsll, vbll);
// CHECK: @llvm.ppc.altivec.vmaxsd
// CHECK-LE: @llvm.ppc.altivec.vmaxsd
// CHECK-PPC: error: call to 'vec_max' is ambiguous

  res_vull = vec_max(vull, vull);
// CHECK: @llvm.ppc.altivec.vmaxud
// CHECK-LE: @llvm.ppc.altivec.vmaxud
// CHECK-PPC: error: call to 'vec_max' is ambiguous

  res_vull = vec_max(vbll, vull);
// CHECK: @llvm.ppc.altivec.vmaxud
// CHECK-LE: @llvm.ppc.altivec.vmaxud
// CHECK-PPC: error: call to 'vec_max' is ambiguous

  res_vull = vec_max(vull, vbll);
// CHECK: @llvm.ppc.altivec.vmaxud
// CHECK-LE: @llvm.ppc.altivec.vmaxud
// CHECK-PPC: error: call to 'vec_max' is ambiguous

  /* vec_min */
  res_vsll = vec_min(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vminsd
// CHECK-LE: @llvm.ppc.altivec.vminsd
// CHECK-PPC: error: call to 'vec_min' is ambiguous

  res_vsll = vec_min(vbll, vsll);
// CHECK: @llvm.ppc.altivec.vminsd
// CHECK-LE: @llvm.ppc.altivec.vminsd
// CHECK-PPC: error: call to 'vec_min' is ambiguous

  res_vsll = vec_min(vsll, vbll);
// CHECK: @llvm.ppc.altivec.vminsd
// CHECK-LE: @llvm.ppc.altivec.vminsd
// CHECK-PPC: error: call to 'vec_min' is ambiguous

  res_vull = vec_min(vull, vull);
// CHECK: @llvm.ppc.altivec.vminud
// CHECK-LE: @llvm.ppc.altivec.vminud
// CHECK-PPC: error: call to 'vec_min' is ambiguous

  res_vull = vec_min(vbll, vull);
// CHECK: @llvm.ppc.altivec.vminud
// CHECK-LE: @llvm.ppc.altivec.vminud
// CHECK-PPC: error: call to 'vec_min' is ambiguous

  res_vull = vec_min(vull, vbll);
// CHECK: @llvm.ppc.altivec.vminud
// CHECK-LE: @llvm.ppc.altivec.vminud
// CHECK-PPC: error: call to 'vec_min' is ambiguous

  /* vec_mule */
  res_vsll = vec_mule(vi, vi);
// CHECK: @llvm.ppc.altivec.vmulesw
// CHECK-LE: @llvm.ppc.altivec.vmulosw
// CHECK-PPC: error: call to 'vec_mule' is ambiguous

  res_vull = vec_mule(vui , vui);
// CHECK: @llvm.ppc.altivec.vmuleuw
// CHECK-LE: @llvm.ppc.altivec.vmulouw
// CHECK-PPC: error: call to 'vec_mule' is ambiguous

  /* vec_mulo */
  res_vsll = vec_mulo(vi, vi);
// CHECK: @llvm.ppc.altivec.vmulosw
// CHECK-LE: @llvm.ppc.altivec.vmulesw
// CHECK-PPC: error: call to 'vec_mulo' is ambiguous

  res_vull = vec_mulo(vui, vui);
// CHECK: @llvm.ppc.altivec.vmulouw
// CHECK-LE: @llvm.ppc.altivec.vmuleuw
// CHECK-PPC: error: call to 'vec_mulo' is ambiguous

  /* vec_packs */
  res_vi = vec_packs(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vpksdss
// CHECK-LE: @llvm.ppc.altivec.vpksdss
// CHECK-PPC: error: call to 'vec_packs' is ambiguous

  res_vui = vec_packs(vull, vull);
// CHECK: @llvm.ppc.altivec.vpkudus
// CHECK-LE: @llvm.ppc.altivec.vpkudus
// CHECK-PPC: error: call to 'vec_packs' is ambiguous

  /* vec_packsu */
  res_vui = vec_packsu(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vpksdus
// CHECK-LE: @llvm.ppc.altivec.vpksdus
// CHECK-PPC: error: call to 'vec_packsu' is ambiguous

  res_vui = vec_packsu(vull, vull);
// CHECK: @llvm.ppc.altivec.vpkudus
// CHECK-LE: @llvm.ppc.altivec.vpkudus
// CHECK-PPC: error: call to 'vec_packsu' is ambiguous

  /* vec_rl */
  res_vsll = vec_rl(vsll, vull);
// CHECK: @llvm.ppc.altivec.vrld
// CHECK-LE: @llvm.ppc.altivec.vrld
// CHECK-PPC: error: call to 'vec_rl' is ambiguous

  res_vull = vec_rl(vull, vull);
// CHECK: @llvm.ppc.altivec.vrld
// CHECK-LE: @llvm.ppc.altivec.vrld
// CHECK-PPC: error: call to 'vec_rl' is ambiguous

  /* vec_sl */
  res_vsll = vec_sl(vsll, vull);
// CHECK: shl <2 x i64>
// CHECK-LE: shl <2 x i64>
// CHECK-PPC: error: call to 'vec_sl' is ambiguous

  res_vull = vec_sl(vull, vull);
// CHECK: shl <2 x i64>
// CHECK-LE: shl <2 x i64>
// CHECK-PPC: error: call to 'vec_sl' is ambiguous

  /* vec_sr */
  res_vsll = vec_sr(vsll, vull);
// CHECK: ashr <2 x i64>
// CHECK-LE: ashr <2 x i64>
// CHECK-PPC: error: call to 'vec_sr' is ambiguous

  res_vull = vec_sr(vull, vull);
// CHECK: lshr <2 x i64>
// CHECK-LE: lshr <2 x i64>
// CHECK-PPC: error: call to 'vec_sr' is ambiguous

  /* vec_sra */
  res_vsll = vec_sra(vsll, vull);
// CHECK: ashr <2 x i64>
// CHECK-LE: ashr <2 x i64>
// CHECK-PPC: error: call to 'vec_sra' is ambiguous

  res_vull = vec_sra(vull, vull);
// CHECK: ashr <2 x i64>
// CHECK-LE: ashr <2 x i64>
// CHECK-PPC: error: call to 'vec_sra' is ambiguous

  /* vec_unpackh */
  res_vsll = vec_unpackh(vi);
// CHECK: llvm.ppc.altivec.vupkhsw
// CHECK-LE: llvm.ppc.altivec.vupklsw
// CHECK-PPC: error: call to 'vec_unpackh' is ambiguous

  res_vbll = vec_unpackh(vbi);
// CHECK: llvm.ppc.altivec.vupkhsw
// CHECK-LE: llvm.ppc.altivec.vupklsw
// CHECK-PPC: error: call to 'vec_unpackh' is ambiguous

  /* vec_unpackl */
  res_vsll = vec_unpackl(vi);
// CHECK: llvm.ppc.altivec.vupklsw
// CHECK-LE: llvm.ppc.altivec.vupkhsw
// CHECK-PPC: error: call to 'vec_unpackl' is ambiguous

  res_vbll = vec_unpackl(vbi);
// CHECK: llvm.ppc.altivec.vupklsw
// CHECK-LE: llvm.ppc.altivec.vupkhsw
// CHECK-PPC: error: call to 'vec_unpackl' is ambiguous

  /* vec_vpksdss */
  res_vi = vec_vpksdss(vsll, vsll);
// CHECK: llvm.ppc.altivec.vpksdss
// CHECK-LE: llvm.ppc.altivec.vpksdss
// CHECK-PPC: warning: implicit declaration of function 'vec_vpksdss'

  /* vec_vpksdus */
  res_vui = vec_vpksdus(vsll, vsll);
// CHECK: llvm.ppc.altivec.vpksdus
// CHECK-LE: llvm.ppc.altivec.vpksdus
// CHECK-PPC: warning: implicit declaration of function 'vec_vpksdus'

  /* vec_vpkudum */
  res_vi = vec_vpkudum(vsll, vsll);
// CHECK: vperm
// CHECK-LE: vperm
// CHECK-PPC: warning: implicit declaration of function 'vec_vpkudum'

  res_vui = vec_vpkudum(vull, vull);
// CHECK: vperm
// CHECK-LE: vperm

  res_vui = vec_vpkudus(vull, vull);
// CHECK: llvm.ppc.altivec.vpkudus
// CHECK-LE: llvm.ppc.altivec.vpkudus
// CHECK-PPC: warning: implicit declaration of function 'vec_vpkudus'

  /* vec_vupkhsw */
  res_vsll = vec_vupkhsw(vi);
// CHECK: llvm.ppc.altivec.vupkhsw
// CHECK-LE: llvm.ppc.altivec.vupklsw
// CHECK-PPC: warning: implicit declaration of function 'vec_vupkhsw'

  res_vbll = vec_vupkhsw(vbi);
// CHECK: llvm.ppc.altivec.vupkhsw
// CHECK-LE: llvm.ppc.altivec.vupklsw

  /* vec_vupklsw */
  res_vsll = vec_vupklsw(vi);
// CHECK: llvm.ppc.altivec.vupklsw
// CHECK-LE: llvm.ppc.altivec.vupkhsw
// CHECK-PPC: warning: implicit declaration of function 'vec_vupklsw'

  res_vbll = vec_vupklsw(vbi);
// CHECK: llvm.ppc.altivec.vupklsw
// CHECK-LE: llvm.ppc.altivec.vupkhsw

  /* vec_max */
  res_vsll = vec_max(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vmaxsd
// CHECK-LE: @llvm.ppc.altivec.vmaxsd

  res_vsll = vec_max(vbll, vsll);
// CHECK: @llvm.ppc.altivec.vmaxsd
// CHECK-LE: @llvm.ppc.altivec.vmaxsd

  res_vsll = vec_max(vsll, vbll);
// CHECK: @llvm.ppc.altivec.vmaxsd
// CHECK-LE: @llvm.ppc.altivec.vmaxsd

  res_vull = vec_max(vull, vull);
// CHECK: @llvm.ppc.altivec.vmaxud
// CHECK-LE: @llvm.ppc.altivec.vmaxud

  res_vull = vec_max(vbll, vull);
// CHECK: @llvm.ppc.altivec.vmaxud
// CHECK-LE: @llvm.ppc.altivec.vmaxud

  /* vec_min */
  res_vsll = vec_min(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vminsd
// CHECK-LE: @llvm.ppc.altivec.vminsd

  res_vsll = vec_min(vbll, vsll);
// CHECK: @llvm.ppc.altivec.vminsd
// CHECK-LE: @llvm.ppc.altivec.vminsd

  res_vsll = vec_min(vsll, vbll);
// CHECK: @llvm.ppc.altivec.vminsd
// CHECK-LE: @llvm.ppc.altivec.vminsd

  res_vull = vec_min(vull, vull);
// CHECK: @llvm.ppc.altivec.vminud
// CHECK-LE: @llvm.ppc.altivec.vminud

  res_vull = vec_min(vbll, vull);
// CHECK: @llvm.ppc.altivec.vminud
// CHECK-LE: @llvm.ppc.altivec.vminud

}
Ejemplo n.º 15
0
/*
 * Convert packed UYVY 4:2:2 video (2 bytes per pixel: U0 Y0 V0 Y1 ...)
 * to packed 32-bit output using AltiVec, processing 8 pixels per loop
 * iteration (one 16-byte UYVY input vector -> two 16-byte output vectors).
 *
 * yuvdata  - input UYVY buffer; assumed 16-byte aligned (vec_ld) -- TODO confirm
 * pixelnum - number of pixels; assumed a multiple of 16 (loop divides by
 *            sizeof(vector unsigned char) and has no scalar tail) -- TODO confirm
 * output   - destination buffer; assumed 16-byte aligned -- TODO confirm
 *
 * NOTE(review): the static_cast<vector ...>( a, b, ... ) "constructors"
 * below are only meaningful under the AltiVec parenthesized vector-literal
 * cast extension; in standard C++ the parenthesized list is a comma
 * expression and this will not compile. Verify the intended compiler
 * dialect (same caveat applies to the vector-type static_casts throughout).
 */
void YUV422_to_BGRA_altivec(const unsigned char *yuvdata,
			    size_t pixelnum, unsigned char *output)
{
  const vector unsigned char *UYVY_ptr=reinterpret_cast<const vector unsigned char *>(yuvdata);
  vector unsigned char *BGRA_ptr=reinterpret_cast<vector unsigned char *>(output);

  vector unsigned int vShift;
  vector signed short tempU, tempV, tempY, tempUV, out1, out2, out3, out4;

  vector signed short  v16, v128, a255, szero, one;
  vector unsigned char zero;
  vector signed short t0, t1, t2, tempGB1, tempGB2, tempRA1, tempRA2;
  vector signed short vU_G, vV_G, vU_B, vU_R, y0, hiImage, loImage;
  vector unsigned int   uv_rEven, uv_rOdd, uv_rHi, uv_rLo,
					  uv_gUEven, uv_gVEven, uv_gUOdd, uv_gVOdd, uv_gHi, uv_gLo,
					  uv_bEven, uv_bOdd;
  vector signed int	tempUhi, tempUlo, tempVhi, tempVlo;
  vector signed int yEven, yOdd;

  vector unsigned int t0Even, t0Odd, t1Even, t1Odd, t2Even, t2Odd;

  /* Load the equation constants.
     One vector holds all scalar coefficients of the ITU-R BT.601-style
     YUV->RGB conversion; individual lanes are broadcast below via
     vec_splat. Lane layout: 298, 519, 409, 16, 128, 255, -100, -210. */
  vector signed short vConst =
    static_cast<vector signed short>(298, 519, 409, 16, 128, 255, -100, -210 );

  /* Byte-permute masks used to interleave/deinterleave 16-bit lanes.
     NOTE(review): same AltiVec-literal caveat as vConst above. */
  vector unsigned char vPerm1 =
    static_cast<vector unsigned char>( 0, 1, 16, 17,  8,  9, 24, 25,
                            2, 3, 18, 19, 10, 11, 26, 27 );
  vector unsigned char vPerm2 =
    static_cast<vector unsigned char>( 4, 5, 20, 21, 12, 13, 28, 29,
							6, 7, 22, 23, 14, 15, 30, 31 );

  vector unsigned char vPermY =
    static_cast<vector unsigned char>(  2,  3,  6,  7, 10, 11, 14, 15,
	                        18, 19, 22, 23, 26, 27, 30, 31 );
  vector unsigned char vPermU =
    static_cast<vector unsigned char>(  0,  1, 16, 17,  4,  5, 20, 21,
	                         8,  9, 24, 25, 12, 13, 28, 29 );
  vector unsigned char vPermV =
    static_cast<vector unsigned char>(  2,  3, 18, 19,  6,  7, 22, 23,
							10, 11, 26, 27, 14, 15, 30, 31 );
  vector unsigned char vOutPerm1 =
    static_cast<vector unsigned char>(  0,  1,  2,  3, 16, 17, 18, 19,
	                         4,  5,  6,  7, 20, 21, 22, 23 );
  vector unsigned char vOutPerm2 =
    static_cast<vector unsigned char>(  8,  9, 10, 11, 24, 25, 26, 27,
	                        12, 13, 14, 15, 28, 29, 30, 31 );
  vector unsigned char uvPerm =
    static_cast<vector unsigned char>(  0,  1,  4,  5,  8,  9, 12, 13,
	                        16, 17, 20, 21, 24, 25, 28, 29 );

  /* Broadcast each coefficient lane of vConst into its own vector. */
  zero   = vec_splat_u8(0);
  szero  = vec_splat_s16(0);
  one    = vec_splat_s16(1);
  vShift = vec_splat_u32(8);      // final >>8 after fixed-point math
  a255   = vec_splat( vConst, 5 ); // alpha channel = 255
  vU_G   = vec_splat( vConst, 6 ); // -100
  vV_G   = vec_splat( vConst, 7 ); // -210
  vU_B   = vec_splat( vConst, 1 ); // 519
  vU_R   = vec_splat( vConst, 2 ); // 409
  y0     = vec_splat( vConst, 0 ); // 298
  v16    = vec_splat( vConst, 3 ); //  16
  v128   = vec_splat( vConst, 4 ); // 128

  /* 16 bytes of UYVY per iteration = 8 pixels. */
  for ( unsigned int i = 0; i < (pixelnum/sizeof(vector unsigned char)); i++ ) {

    // Load UYUV input vector
	const vector unsigned char *vec1 = UYVY_ptr++;

	//expand the UInt8's to short's (zero-extend by merging with zero bytes)
	hiImage = static_cast<vector signed short>(vec_mergeh( zero, *vec1 ));
	loImage = static_cast<vector signed short>(vec_mergel( zero, *vec1 ));

	// split the interleaved shorts into a UV vector and a Y vector
	tempUV = static_cast<vector signed short>(vec_perm( hiImage, loImage, uvPerm ));
	tempY  = static_cast<vector signed short>(vec_perm( hiImage, loImage, vPermY ));

	// subtract UV_OFFSET from UV's  (should this be saturated?)
	tempUV = static_cast<vector signed short>(vec_sub( tempUV, v128 ));
	// subtract Y-OFFSET from Y's    (should this be saturated?)
	tempY  = static_cast<vector signed short>(vec_sub( tempY, v16 ));

	// expand to UUUU UUUU and VVVV VVVV
	tempU = vec_perm(tempUV, tempUV, vPermU);
	tempV = vec_perm(tempUV, tempUV, vPermV);
	//below:
	//
	//error: cannot convert `vector int' to `vector unsigned int' in assignment
	tempUhi = vec_mule( tempU, one );
	// unsigned int = vec_mule( signed short, signed short )
	// should be
	// signed int = vec_mule( signed short, signed short )
	tempUlo = vec_mulo( tempU, one );
	tempVhi = vec_mule( tempV, one );
	tempVlo = vec_mulo( tempV, one );

	// uv_r = YUV2RGB_12*u + YUV2RGB_13*v
	// uv_r = (-1)*u + 409*v (or "409*V - U")
	// NOTE(review): uv_r* are declared unsigned int but vec_mule/vec_mulo on
	// signed shorts yield vector signed int — pre-existing type mismatch,
	// only valid under lax compiler settings; confirm before reuse.
	uv_rEven = vec_mule( tempV, vU_R );
	uv_rOdd  = vec_mulo( tempV, vU_R );
	uv_rHi = vec_sub( uv_rEven, tempUhi );
	uv_rLo = vec_sub( uv_rOdd, tempUlo );

	// uv_g = YUV2RGB_22*u + YUV2RGB_23*v
	// uv_g = -100*u + (-210)*v
	// multiply U by -100
	uv_gUEven = vec_mule( tempU, vU_G );
	uv_gUOdd  = vec_mulo( tempU, vU_G );
	// multiply V by -210
	uv_gVEven = vec_mule( tempV, vV_G );
	uv_gVOdd  = vec_mulo( tempV, vV_G );
	// add U & V products
	uv_gHi   = vec_add( uv_gUEven, uv_gVEven );
	uv_gLo   = vec_add( uv_gUOdd, uv_gVOdd );

	// uv_b = YUV2RGB_32*u + YUV2RGB_33*v
	// uv_b = 519*u + 0*v
	uv_bEven = vec_mule( tempU, vU_B );
	uv_bOdd  = vec_mulo( tempU, vU_B );

	// y = YUV2RGB_11 * tempY
	// y = 298* (tempY - 16)
	yEven = vec_mule( tempY, y0 );
	yOdd  = vec_mulo( tempY, y0 );

	// add while int's (combine luma with each chroma contribution)
	t0Even = vec_add( yEven, uv_bEven );
	t0Odd  = vec_add( yOdd, uv_bOdd );
	t1Even = vec_add( yEven, uv_gHi );
	t1Odd  = vec_add( yOdd, uv_gLo );
	t2Even = vec_add( yEven, uv_rHi );
	t2Odd  = vec_add( yOdd, uv_rLo );

	// shift while int's (undo the 8-bit fixed-point scaling)
	t0Even = vec_sra( t0Even, vShift );
	t0Odd  = vec_sra( t0Odd,  vShift );
	t1Even = vec_sra( t1Even, vShift );
	t1Odd  = vec_sra( t1Odd,  vShift );
	t2Even = vec_sra( t2Even, vShift );
	t2Odd  = vec_sra( t2Odd,  vShift );

	// pack down to shorts (saturating)
	t0 = vec_packs( t0Even, t0Odd );
	t1 = vec_packs( t1Even, t1Odd );
	t2 = vec_packs( t2Even, t2Odd );

	// Permute to GBGBGBGB GBGBGBGB + re-interleave even & odd
	tempGB1 = vec_perm( t1,   t0, vPerm1 );
	tempGB2 = vec_perm( t1,   t0, vPerm2 );
	// Permute to ARARARAR ARARARAR + re-interleave even & odd
	tempRA1 = vec_perm( a255, t2, vPerm1 );
	tempRA2 = vec_perm( a255, t2, vPerm2 );

	// Permute to ARGB's
	out1 = vec_perm( tempRA1, tempGB1, vOutPerm1 );
	out2 = vec_perm( tempRA1, tempGB1, vOutPerm2 );
	out3 = vec_perm( tempRA2, tempGB2, vOutPerm1 );
	out4 = vec_perm( tempRA2, tempGB2, vOutPerm2 );

	// pack down to char's (unsigned saturation clamps to [0,255])
	*BGRA_ptr = vec_packsu( out1, out2 );
	BGRA_ptr++;
	*BGRA_ptr = vec_packsu( out3, out4 );
	BGRA_ptr++;
  }
}
 /* Bilinear-resize tail: interpolate the two halves of a 2x2 block of
    16-bit source vectors with the two-argument Interpolate overload
    (defined elsewhere in this file), then saturate-pack both 16-bit
    results into a single vector of 8-bit pixels.
    NOTE(review): assumes the inner Interpolate returns a vector type
    accepted by vec_packs whose pack yields v128_u8 — confirm against
    the overload's declaration. */
 SIMD_INLINE v128_u8 Interpolate(v128_u16 s[2][2], v128_u16 k[2])
 {
     return vec_packs(Interpolate(s[0], k), Interpolate(s[1], k));
 }