Example 1
unsigned long
lightsource_diffuse (const float *vert, const float *norm,
		     const colour *ambient_col, const colour *pigment_col,
		     const float *light_pos)
{
  float light_to_vertex[3];
  float out;
  int r, g, b;

  vec_sub (light_to_vertex, light_pos, vert);
  vec_normalize (light_to_vertex, light_to_vertex);

  out = vec_dot (light_to_vertex, norm);

  if (out < 0)
    out = 0;

  r = ambient_col->r + out * (pigment_col->r - ambient_col->r);
  g = ambient_col->g + out * (pigment_col->g - ambient_col->g);
  b = ambient_col->b + out * (pigment_col->b - ambient_col->b);

  return (255 << 24) | (r << 16) | (g << 8) | b;
}
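This is plain Lambertian shading: the clamped dot product between the unit surface normal and the unit vertex-to-light vector linearly interpolates each channel between the ambient and pigment colours, and the result is packed as 0xAARRGGBB with alpha forced to 255. A minimal usage sketch, assuming a colour struct with r/g/b fields and the vec_* helpers declared elsewhere in this source; all values are illustrative:

colour ambient = { 32, 32, 48 };
colour pigment = { 200, 80, 40 };
float vert[3]  = { 0.0f, 0.0f, 0.0f };
float norm[3]  = { 0.0f, 1.0f, 0.0f };   /* must already be unit length */
float light[3] = { 0.0f, 4.0f, 2.0f };

/* pixel is 0xFFRRGGBB; the light sits above the surface, so each
   channel lands between the ambient and pigment values */
unsigned long pixel = lightsource_diffuse (vert, norm, &ambient,
                                           &pigment, &light);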
Example 2
int			init_env(t_env *e, t_obj *obj)
{
	if ((e->mlx = mlx_init()) == NULL)
		return (m_error("mlx_init(): fail"));
	if ((e->win = mlx_new_window(e->mlx, WIN_X, WIN_Y, "rtv1")) == NULL)
		return (m_error("mlx_new_window(): fail"));
	if ((e->img = mlx_new_image(e->mlx, WIN_X, WIN_Y)) == NULL)
		return (m_error("mlx_new_image(): fail"));
	if ((e->addr = mlx_get_data_addr(e->img, &(e->bpp), &(e->size_line),
		&(e->endian))) == NULL)
		return (m_error("mlx_get_data_addr(): fail"));
	if ((e->rgb_tab = (t_rgb *)malloc(sizeof(t_rgb) * (WIN_X * WIN_Y))) == NULL)
		return (m_error("rgb_tab_init(): fail"));
	e->eye_pos = vec_new(0, 0, 0);
	e->eye_dir = vec_new(0, 0, 1);
	e->right_vec = vec_new(1, 0, 0);
	e->up_vec = vec_new(0, -1, 0);
	e->view_plane_ori = vec_add(vec_add(e->eye_pos, vec_numb(e->eye_dir,
		VIEW_PLANE_DIST)), vec_sub(vec_numb(e->up_vec, VIEW_PLANE_HEIGHT / 2.0)
		, vec_numb(e->right_vec, VIEW_PLANE_WIDTH / 2.0)));
	init_obj(obj);
	return (0);
}
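The view_plane_ori expression places one corner of the view plane at eye_pos + eye_dir * VIEW_PLANE_DIST + up_vec * (VIEW_PLANE_HEIGHT / 2) - right_vec * (VIEW_PLANE_WIDTH / 2). A hypothetical sketch of how a per-pixel primary ray direction could be derived from that basis (vec_normalize and the exact t_vec API are assumptions, and the step signs depend on the screen-space convention):

t_vec	pixel_ray_dir(t_env *e, int px, int py)
{
	t_vec	point;

	point = vec_add(e->view_plane_ori,
		vec_sub(vec_numb(e->right_vec,
			px * (VIEW_PLANE_WIDTH / (double)WIN_X)),
			vec_numb(e->up_vec,
			py * (VIEW_PLANE_HEIGHT / (double)WIN_Y))));
	return (vec_normalize(vec_sub(point, e->eye_pos)));
}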
Example 3
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
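Each pass of the loop applies the H.264 six-tap half-pel filter (1, -5, 20, 20, -5, 1) to sixteen pixels at once: pp1 holds the 20-weighted centre taps plus the rounding constant 16, pp2 the 5-weighted taps, and vec_sra/vec_packsu perform the final arithmetic shift by 5 with unsigned saturation. The same computation in scalar form, for reference (illustrative only):

static uint8_t lowpass6 (const uint8_t *s)
{
    /* s points at P0; offsets -2..+3 correspond to srcM2..srcP3 above */
    int v = 20 * (s[0]  + s[1])    /* sum1 scaled by v20ss       */
          -  5 * (s[-1] + s[2])    /* sum2 scaled by v5ss        */
          +      (s[-2] + s[3])    /* sum3                       */
          + 16;                    /* rounding bias, v16ss       */

    v >>= 5;                       /* vec_sra by v5us            */
    if (v < 0)                     /* vec_packsu saturates the   */
        v = 0;                     /* result to [0, 255]         */
    if (v > 255)
        v = 255;
    return (uint8_t) v;
}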
Example 4
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB,
              srcP3ssA, srcP3ssB,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3a, srcP3b, srcP3;

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
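The vertical pass uses the same six-tap filter, but since the taps now come from consecutive rows it keeps a sliding window of zero-extended rows in registers: the block of assignments after sum3A/sum3B rotates srcM2..srcP2 down by one row, so each iteration only has to load and merge the new srcP3 row instead of reloading all six.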
Example 5
File: fdctdsp.c  Project: 63n/FFmpeg
/* two dimensional discrete cosine transform */
void ff_fdct_altivec(int16_t *block)
{
    vector signed short *bp;
    const vector float *cp = fdctconsts;
    vector float b00, b10, b20, b30, b40, b50, b60, b70;
    vector float b01, b11, b21, b31, b41, b51, b61, b71;
    vector float mzero, cnst, cnsts0, cnsts1, cnsts2;
    vector float x0, x1, x2, x3, x4, x5, x6, x7, x8;

    /* setup constants {{{ */
    /* mzero = -0.0 */
    mzero  = ((vector float) vec_splat_u32(-1));
    mzero  = ((vector float) vec_sl(vu32(mzero), vu32(mzero)));
    cnsts0 = vec_ld(0, cp);
    cp++;
    cnsts1 = vec_ld(0, cp);
    cp++;
    cnsts2 = vec_ld(0, cp);
    /* }}} */

    /* 8x8 matrix transpose (vector short[8]) {{{ */
#define MERGE_S16(hl, a, b) vec_merge ## hl(vs16(a), vs16(b))

    bp  = (vector signed short *) block;
    b00 = ((vector float) vec_ld(0,      bp));
    b40 = ((vector float) vec_ld(16 * 4, bp));
    b01 = ((vector float) MERGE_S16(h, b00, b40));
    b11 = ((vector float) MERGE_S16(l, b00, b40));
    bp++;
    b10 = ((vector float) vec_ld(0,      bp));
    b50 = ((vector float) vec_ld(16 * 4, bp));
    b21 = ((vector float) MERGE_S16(h, b10, b50));
    b31 = ((vector float) MERGE_S16(l, b10, b50));
    bp++;
    b20 = ((vector float) vec_ld(0,      bp));
    b60 = ((vector float) vec_ld(16 * 4, bp));
    b41 = ((vector float) MERGE_S16(h, b20, b60));
    b51 = ((vector float) MERGE_S16(l, b20, b60));
    bp++;
    b30 = ((vector float) vec_ld(0,      bp));
    b70 = ((vector float) vec_ld(16 * 4, bp));
    b61 = ((vector float) MERGE_S16(h, b30, b70));
    b71 = ((vector float) MERGE_S16(l, b30, b70));

    x0 = ((vector float) MERGE_S16(h, b01, b41));
    x1 = ((vector float) MERGE_S16(l, b01, b41));
    x2 = ((vector float) MERGE_S16(h, b11, b51));
    x3 = ((vector float) MERGE_S16(l, b11, b51));
    x4 = ((vector float) MERGE_S16(h, b21, b61));
    x5 = ((vector float) MERGE_S16(l, b21, b61));
    x6 = ((vector float) MERGE_S16(h, b31, b71));
    x7 = ((vector float) MERGE_S16(l, b31, b71));

    b00 = ((vector float) MERGE_S16(h, x0, x4));
    b10 = ((vector float) MERGE_S16(l, x0, x4));
    b20 = ((vector float) MERGE_S16(h, x1, x5));
    b30 = ((vector float) MERGE_S16(l, x1, x5));
    b40 = ((vector float) MERGE_S16(h, x2, x6));
    b50 = ((vector float) MERGE_S16(l, x2, x6));
    b60 = ((vector float) MERGE_S16(h, x3, x7));
    b70 = ((vector float) MERGE_S16(l, x3, x7));

#undef MERGE_S16
    /* }}} */

    /* Some of the initial calculations can be done as vector short
     * before conversion to vector float.  The following code section
     * takes advantage of this. */

    /* fdct rows {{{ */
    x0 = ((vector float) vec_add(vs16(b00), vs16(b70)));
    x7 = ((vector float) vec_sub(vs16(b00), vs16(b70)));
    x1 = ((vector float) vec_add(vs16(b10), vs16(b60)));
    x6 = ((vector float) vec_sub(vs16(b10), vs16(b60)));
    x2 = ((vector float) vec_add(vs16(b20), vs16(b50)));
    x5 = ((vector float) vec_sub(vs16(b20), vs16(b50)));
    x3 = ((vector float) vec_add(vs16(b30), vs16(b40)));
    x4 = ((vector float) vec_sub(vs16(b30), vs16(b40)));

    b70 = ((vector float) vec_add(vs16(x0), vs16(x3)));
    b10 = ((vector float) vec_add(vs16(x1), vs16(x2)));

    b00 = ((vector float) vec_add(vs16(b70), vs16(b10)));
    b40 = ((vector float) vec_sub(vs16(b70), vs16(b10)));

#define CTF0(n)                                                    \
    b ## n ## 1 = ((vector float) vec_unpackl(vs16(b ## n ## 0))); \
    b ## n ## 0 = ((vector float) vec_unpackh(vs16(b ## n ## 0))); \
    b ## n ## 1 = vec_ctf(vs32(b ## n ## 1), 0);                   \
    b ## n ## 0 = vec_ctf(vs32(b ## n ## 0), 0)

    CTF0(0);
    CTF0(4);

    b20 = ((vector float) vec_sub(vs16(x0), vs16(x3)));
    b60 = ((vector float) vec_sub(vs16(x1), vs16(x2)));

    CTF0(2);
    CTF0(6);

#undef CTF0

    x0 = vec_add(b60, b20);
    x1 = vec_add(b61, b21);

    cnst = LD_W2;
    x0   = vec_madd(cnst, x0, mzero);
    x1   = vec_madd(cnst, x1, mzero);
    cnst = LD_W1;
    b20  = vec_madd(cnst, b20, x0);
    b21  = vec_madd(cnst, b21, x1);
    cnst = LD_W0;
    b60  = vec_madd(cnst, b60, x0);
    b61  = vec_madd(cnst, b61, x1);

#define CTFX(x, b)                                  \
    b ## 0 = ((vector float) vec_unpackh(vs16(x))); \
    b ## 1 = ((vector float) vec_unpackl(vs16(x))); \
    b ## 0 = vec_ctf(vs32(b ## 0), 0);              \
    b ## 1 = vec_ctf(vs32(b ## 1), 0)

    CTFX(x4, b7);
    CTFX(x5, b5);
    CTFX(x6, b3);
    CTFX(x7, b1);

#undef CTFX

    x0   = vec_add(b70, b10);
    x1   = vec_add(b50, b30);
    x2   = vec_add(b70, b30);
    x3   = vec_add(b50, b10);
    x8   = vec_add(x2, x3);
    cnst = LD_W3;
    x8   = vec_madd(cnst, x8, mzero);

    cnst = LD_W8;
    x0   = vec_madd(cnst, x0, mzero);
    cnst = LD_W9;
    x1   = vec_madd(cnst, x1, mzero);
    cnst = LD_WA;
    x2   = vec_madd(cnst, x2, x8);
    cnst = LD_WB;
    x3   = vec_madd(cnst, x3, x8);

    cnst = LD_W4;
    b70  = vec_madd(cnst, b70, x0);
    cnst = LD_W5;
    b50  = vec_madd(cnst, b50, x1);
    cnst = LD_W6;
    b30  = vec_madd(cnst, b30, x1);
    cnst = LD_W7;
    b10  = vec_madd(cnst, b10, x0);

    b70 = vec_add(b70, x2);
    b50 = vec_add(b50, x3);
    b30 = vec_add(b30, x2);
    b10 = vec_add(b10, x3);

    x0   = vec_add(b71, b11);
    x1   = vec_add(b51, b31);
    x2   = vec_add(b71, b31);
    x3   = vec_add(b51, b11);
    x8   = vec_add(x2, x3);
    cnst = LD_W3;
    x8   = vec_madd(cnst, x8, mzero);

    cnst = LD_W8;
    x0   = vec_madd(cnst, x0, mzero);
    cnst = LD_W9;
    x1   = vec_madd(cnst, x1, mzero);
    cnst = LD_WA;
    x2   = vec_madd(cnst, x2, x8);
    cnst = LD_WB;
    x3   = vec_madd(cnst, x3, x8);

    cnst = LD_W4;
    b71  = vec_madd(cnst, b71, x0);
    cnst = LD_W5;
    b51  = vec_madd(cnst, b51, x1);
    cnst = LD_W6;
    b31  = vec_madd(cnst, b31, x1);
    cnst = LD_W7;
    b11  = vec_madd(cnst, b11, x0);

    b71 = vec_add(b71, x2);
    b51 = vec_add(b51, x3);
    b31 = vec_add(b31, x2);
    b11 = vec_add(b11, x3);
    /* }}} */

    /* 8x8 matrix transpose (vector float[8][2]) {{{ */
    x0 = VEC_FMERGEL(b00, b20);
    x1 = VEC_FMERGEH(b00, b20);
    x2 = VEC_FMERGEL(b10, b30);
    x3 = VEC_FMERGEH(b10, b30);

    b00 = VEC_FMERGEH(x1, x3);
    b10 = VEC_FMERGEL(x1, x3);
    b20 = VEC_FMERGEH(x0, x2);
    b30 = VEC_FMERGEL(x0, x2);

    x4 = VEC_FMERGEL(b41, b61);
    x5 = VEC_FMERGEH(b41, b61);
    x6 = VEC_FMERGEL(b51, b71);
    x7 = VEC_FMERGEH(b51, b71);

    b41 = VEC_FMERGEH(x5, x7);
    b51 = VEC_FMERGEL(x5, x7);
    b61 = VEC_FMERGEH(x4, x6);
    b71 = VEC_FMERGEL(x4, x6);

    x0 = VEC_FMERGEL(b01, b21);
    x1 = VEC_FMERGEH(b01, b21);
    x2 = VEC_FMERGEL(b11, b31);
    x3 = VEC_FMERGEH(b11, b31);

    x4 = VEC_FMERGEL(b40, b60);
    x5 = VEC_FMERGEH(b40, b60);
    x6 = VEC_FMERGEL(b50, b70);
    x7 = VEC_FMERGEH(b50, b70);

    b40 = VEC_FMERGEH(x1, x3);
    b50 = VEC_FMERGEL(x1, x3);
    b60 = VEC_FMERGEH(x0, x2);
    b70 = VEC_FMERGEL(x0, x2);

    b01 = VEC_FMERGEH(x5, x7);
    b11 = VEC_FMERGEL(x5, x7);
    b21 = VEC_FMERGEH(x4, x6);
    b31 = VEC_FMERGEL(x4, x6);
    /* }}} */

    FDCTCOL(b00, b10, b20, b30, b40, b50, b60, b70);
    FDCTCOL(b01, b11, b21, b31, b41, b51, b61, b71);

    /* round, convert back to short {{{ */
#define CTS(n)                                                  \
    b ## n ## 0 = vec_round(b ## n ## 0);                       \
    b ## n ## 1 = vec_round(b ## n ## 1);                       \
    b ## n ## 0 = ((vector float) vec_cts(b ## n ## 0, 0));     \
    b ## n ## 1 = ((vector float) vec_cts(b ## n ## 1, 0));     \
    b ## n ## 0 = ((vector float) vec_pack(vs32(b ## n ## 0),   \
                                           vs32(b ## n ## 1))); \
    vec_st(vs16(b ## n ## 0), 0, bp)

    bp = (vector signed short *) block;
    CTS(0);
    bp++;
    CTS(1);
    bp++;
    CTS(2);
    bp++;
    CTS(3);
    bp++;
    CTS(4);
    bp++;
    CTS(5);
    bp++;
    CTS(6);
    bp++;
    CTS(7);

#undef CTS
    /* }}} */
}
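The overall structure: transpose the 8x8 int16 block, run the row pass (the first butterfly stages stay in 16-bit integer arithmetic and are only converted to float with vec_ctf where the multiplies by the W0..WB constants require it), transpose again as float, run the column pass through FDCTCOL, then round and pack back to int16. The CTF0/CTFX/CTS macros are the unpack-convert and round-pack idioms that move data between the integer and float domains.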
Example 6
static void
parabolic_texcoords (float *texc, float vertex[3], float normal[3], int front)
{
  //float light[3] = { 0.2357, 0.2357, 0.9428 };
  float vertex4[4], x_vertex[4], x_normal[4];
  float eye_to_vertex[3], reflection[4], tmp[3];
  float temp[4], along;
  float eye_norm_dot;
  float incidence, normalized_normal[4];
  int r, g, b;
  GLfloat unrot[4], x, y, z;
  float x_f, y_f, z_f;
  float x_b, y_b, z_b;
  float c_eyepos[4];

  vec_normalize (normalized_normal, normal);

  /* Find the normal in eye space.  */
  normalized_normal[3] = 1.0;
  vec_transform_fipr (x_normal, (float *) &rotate[0][0], normalized_normal);

 /* incidence = vec_dot (&light[0], &x_normal[0]);
  
  if (incidence < 0)
    incidence = 0;*/
  
  /*r = 64 + 191 * incidence;
  g = 64 + 191 * incidence;
  b = 64 + 191 * incidence;
  
  glColor4ub (r, g, b, 0);
  glColor4ub (255, 255, 255, 0);*/
  
  /* We need the vertex in eye space.  This duplicates work!  */
  memcpy (vertex4, vertex, sizeof (float) * 3);
  vertex4[3] = 1.0;
  vec_transform_fipr (x_vertex, (float *) &transform[0][0], vertex4);

  vec_transform_fipr (&c_eyepos[0], &camera[0][0], &eye_pos[0]);
  vec_sub (eye_to_vertex, &c_eyepos[0], &x_vertex[0]);
  vec_normalize (eye_to_vertex, eye_to_vertex);
  
  eye_norm_dot = vec_dot (eye_to_vertex, &x_normal[0]);
  vec_scale (tmp, x_normal, 2.0 * eye_norm_dot);

  vec_sub (reflection, tmp, eye_to_vertex);
  
  //dbgio_printf ("ref length: %f\n", (double) vec_length (reflection));

#if 0
  if (draw_vectors > 0)
    {
     /* dbgio_printf ("%f %f %f\n", (double) eye_to_vertex[0],
				  (double) eye_to_vertex[1],
				  (double) eye_to_vertex[2]); */

      dbgio_printf ("e2v . norm = %f  norm . refl = %f\n",
        (double) eye_norm_dot, (double) vec_dot (x_normal, reflection));

      for (along = 0.0; along < 0.25; along += 0.005)
	{
	  int colour = 0x001f | ((int) (along * 255) << 5);
	  vec_scale (temp, x_normal, along);
	  vec_add (&temp[0], x_vertex, &temp[0]);
	  box (temp, colour);
	}

      for (along = 0.0; along < 0.25; along += 0.005)
	{
	  int colour = 0xf800 | ((int) (along * 255) << 5);

	  /*vec_scale (temp, eye_to_vertex, along);
	  vec_add (&temp[0], x_vertex, &temp[0]);
	  box (temp, colour);*/

	  colour &= ~0xf800;

	  vec_scale (temp, reflection, along);
	  vec_add (&temp[0], x_vertex, &temp[0]);
	  box (temp, colour);
	}
      draw_vectors--;
    }
#endif

  /*x = reflection[0];
  y = reflection[1];
  z = reflection[2];
  w = 1.0;
  mat_load ((matrix_t *) &unrotate[0][0]);
  mat_trans_nodiv (x, y, z, w);
  glKosMatrixDirty ();*/
  reflection[3] = 1.0;
  vec_transform_fipr (unrot, (float *) &invcamera[0][0], reflection);
  x = unrot[0];
  y = unrot[1];
  z = unrot[2];

  //glColor4ub (128 + 127 * x, 128 + 127 * y, 128 + 127 * z, 0);

  /*dbgio_printf ("x_norm len: %f  ref len: %f  unrot len: %f (%f, %f, %f)\n",
                (double) vec_length (x_normal),
                (double) vec_length (reflection),
		(double) vec_length (unrot),
		(double) unrot[0],
		(double) unrot[1],
		(double) unrot[2]);*/

  /* (x, y, z, w) should now be the reflection vector in object space (r_o).
    
    for the front face:
    
    d_o - r_o = [ k.x  k.y  k ]
    
    for the back face:
    
    -d_o - r_o = [ -k.x  -k.y  k ]
    
    for both, calculate lhs, then divide x, y by z.  */
    
  if (front)
    {
      x_f = x;
      y_f = -y;
      z_f = -1.0 - z;

      /*if (z_f >= 0.0)
        z_f = -0.0001;*/

      x_f = 0.5 + 0.5 * (x_f / z_f);
      y_f = 0.5 - 0.5 * (y_f / z_f);

      texc[0] = x_f;
      texc[1] = y_f;
      texc[2] = z;
    }
  else
    {
      x_b = x;
      y_b = y;
      z_b = 1.0 - z;

      /*if (z_b <= 0.0)
        z_b = 0.0001;*/

      x_b = 0.5 + 0.5 * (x_b / z_b);
      y_b = 0.5 - 0.5 * (y_b / z_b);

      texc[0] = x_b;
      texc[1] = y_b;
      texc[2] = z;
    }
}
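The texture coordinate is a dual-paraboloid environment-map lookup: the eye-space reflection vector r = 2(n . e)n - e is rotated back into object space through invcamera, then x and y are divided by z_b = 1 - z (back face) or z_f = -1 - z (front face) and remapped into [0, 1]. A scalar sketch of the reflection step (hypothetical helper, not part of the original source):

static void
reflect3 (float r[3], const float n[3], const float e[3])
{
  /* r = 2 * (n . e) * n - e, with n and e both unit length */
  float d = 2.0f * (n[0] * e[0] + n[1] * e[1] + n[2] * e[2]);

  r[0] = d * n[0] - e[0];
  r[1] = d * n[1] - e[1];
  r[2] = d * n[2] - e[2];
}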
Example 7
 template<> SIMD_INLINE v128_s16 InterferenceChange<false>(v128_s16 statistic, v128_s16 value, v128_s16 saturation)
 {
     return vec_max(vec_sub(statistic, value), saturation);
 }
void 
nb_kernel010nf_ppc_altivec(int *             p_nri,
                       int               iinr[],
                       int               jindex[],
                       int               jjnr[],
                       int               shift[],
                       float             shiftvec[],
                       float             fshift[],
                       int               gid[],
                       float             pos[],
                       float             faction[],
                       float             charge[],
                       float *           p_facel,
                       float *           p_krf,
                       float *           p_crf,
                       float             Vc[],
                       int               type[],
                       int *             p_ntype,
                       float             vdwparam[],
                       float             Vvdw[],
                       float *           p_tabscale,
                       float             VFtab[],
                       float             invsqrta[],
                       float             dvda[],
                       float *           p_gbtabscale,
                       float             GBtab[],
                       int *             p_nthreads,
                       int *             count,
                       void *            mtx,
                       int *             outeriter,
                       int *             inneriter,
					   float *           work)
{
	vector float ix,iy,iz,shvec;
	vector float nul;
	vector float dx,dy,dz;
	vector float Vvdwtot,c6,c12;
	vector float rinvsq,rsq,rinvsix;
  
	int n,k,ii,is3,ii3,nj0,nj1;
	int jnra,jnrb,jnrc,jnrd;
	int j3a,j3b,j3c,j3d;
	int nri, ntype, nouter, ninner;
	int ntiA,tja,tjb,tjc,tjd;
#ifdef GMX_THREAD_SHM_FDECOMP
	int nn0, nn1, nthreads;
#endif
  
    nouter   = 0;
    ninner   = 0;
    nri      = *p_nri;
    ntype    = *p_ntype;
	nul=vec_zero();

#ifdef GMX_THREAD_SHM_FDECOMP
    nthreads = *p_nthreads;
	do {
		tMPI_Thread_mutex_lock((tMPI_Thread_mutex_t *)mtx);
		nn0              = *count;
		nn1              = nn0+(nri-nn0)/(2*nthreads)+3;
		*count           = nn1;
		tMPI_Thread_mutex_unlock((tMPI_Thread_mutex_t *)mtx);
		if(nn1>nri) nn1=nri;
		for(n=nn0; (n<nn1); n++) {
#if 0
		} /* maintain correct indentation even with conditional left braces */
#endif
#else /* without tMPI_Threads */
		for(n=0;n<nri;n++) {
#endif
			is3        = 3*shift[n];
			shvec      = load_xyz(shiftvec+is3);
			ii         = iinr[n];
			ii3        = 3*ii;
			ix         = load_xyz(pos+ii3);
			Vvdwtot     = nul;
			ix         = vec_add(ix,shvec);    
			nj0        = jindex[n];
			nj1        = jindex[n+1];
			splat_xyz_to_vectors(ix,&ix,&iy,&iz);
			ntiA       = 2*ntype*type[ii];
			for(k=nj0; k<(nj1-3); k+=4) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				jnrc            = jjnr[k+2];
				jnrd            = jjnr[k+3];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				j3c             = 3*jnrc;
				j3d             = 3*jnrd;
				transpose_4_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),
								 load_xyz(pos+j3c),
								 load_xyz(pos+j3d),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				rinvsq          = do_recip(rsq);
				rinvsix         = vec_madd(rinvsq,rinvsq,nul);
				rinvsix         = vec_madd(rinvsix,rinvsq,nul);
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				tjc             = ntiA+2*type[jnrc];
				tjd             = ntiA+2*type[jnrd];
				load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12);
				Vvdwtot          = vec_nmsub(c6,rinvsix,Vvdwtot);
				Vvdwtot          = vec_madd(c12,
										   vec_madd(rinvsix,rinvsix,nul),
										   Vvdwtot);
			}
			if(k<(nj1-1)) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				transpose_2_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				rinvsq          = do_recip(rsq);
				zero_highest_2_elements_in_vector(&rinvsq);
				rinvsix         = vec_madd(rinvsq,rinvsq,nul);
				rinvsix         = vec_madd(rinvsix,rinvsq,nul);
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12);
				Vvdwtot          = vec_nmsub(c6,rinvsix,Vvdwtot);
				Vvdwtot          = vec_madd(c12,
										   vec_madd(rinvsix,rinvsix,nul),
										   Vvdwtot);
				k              += 2;
			}
			if((nj1-nj0) & 0x1) {
				jnra            = jjnr[k];
				j3a             = 3*jnra;
				transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				rinvsq          = do_recip(rsq);
				zero_highest_3_elements_in_vector(&rinvsq);
				rinvsix         = vec_madd(rinvsq,rinvsq,nul);
				rinvsix         = vec_madd(rinvsix,rinvsq,nul);
				tja             = ntiA+2*type[jnra];
				load_1_pair(vdwparam+tja,&c6,&c12);
				Vvdwtot          = vec_nmsub(c6,rinvsix,Vvdwtot);
				Vvdwtot          = vec_madd(c12,
										   vec_madd(rinvsix,rinvsix,nul),
										   Vvdwtot);
			}
			/* update outer data */
			add_vector_to_float(Vvdw+gid[n],Vvdwtot);
			ninner += nj1 - nj0;
		}
#ifdef GMX_THREAD_SHM_FDECOMP
		nouter += nn1 - nn0;
	} while (nn1<nri);
#else
	nouter = nri;
#endif
	*outeriter = nouter;
	*inneriter = ninner;
}
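This kernel accumulates only the Lennard-Jones energy (no Coulomb term, no forces), processing neighbours four at a time and masking the 2- and 1-atom remainders with the zero_highest_* helpers. Each pair's contribution in scalar form (illustrative only):

double
lj_pair (double rsq, double c6, double c12)
{
	double rinvsq  = 1.0 / rsq;                    /* do_recip            */
	double rinvsix = rinvsq * rinvsq * rinvsq;     /* the two vec_madds   */

	/* V = c12 / r^12 - c6 / r^6 */
	return c12 * rinvsix * rinvsix - c6 * rinvsix;
}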
void 
nb_kernel133nf_ppc_altivec(int *             p_nri,
                       int               iinr[],
                       int               jindex[],
                       int               jjnr[],
                       int               shift[],
                       float             shiftvec[],
                       float             fshift[],
                       int               gid[],
                       float             pos[],
                       float             faction[],
                       float             charge[],
                       float *           p_facel,
                       float *           p_krf,
                       float *           p_crf,
                       float             Vc[],
                       int               type[],
                       int *             p_ntype,
                       float             vdwparam[],
                       float             Vvdw[],
                       float *           p_tabscale,
                       float             VFtab[],
                       float             invsqrta[],
                       float             dvda[],
                       float *           p_gbtabscale,
                       float             GBtab[],
                       int *             p_nthreads,
                       int *             count,
                       void *            mtx,
                       int *             outeriter,
                       int *             inneriter,
					   float *           work)
{
	vector float iOx,iOy,iOz,iH1x,iH1y,iH1z,iH2x,iH2y,iH2z,iMx,iMy,iMz;
	vector float dOx,dOy,dOz,dH1x,dH1y,dH1z,dH2x,dH2y,dH2z,dMx,dMy,dMz;
	vector float Vvdwtot,c6,c12,VVd,VVr,tsc,r;
	vector float vfacel,nul;
	vector float vctot,qqM,qqH,iqM,iqH,jq;
	vector float rinvO,rinvH1,rinvH2,rinvM,rsqO,rsqH1,rsqH2,rsqM;  

	int n,k,ii,is3,ii3,ntiA,nj0,nj1;
	int jnra,jnrb,jnrc,jnrd;
	int j3a,j3b,j3c,j3d;
	int nri, ntype, nouter, ninner;
	int tja,tjb,tjc,tjd;
#ifdef GMX_THREADS
	int nn0, nn1, nthreads;
#endif
 
    nouter   = 0;
    ninner   = 0;
    nri      = *p_nri;
    ntype    = *p_ntype;
	nul=vec_zero();
	tsc=load_float_and_splat(p_tabscale);
	vfacel=load_float_and_splat(p_facel);
	ii         = iinr[0];
	iqH        = vec_madd(load_float_and_splat(charge+ii+1),vfacel,nul);
	iqM        = vec_madd(load_float_and_splat(charge+ii+3),vfacel,nul);
	ntiA       = 2*ntype*type[ii];
  
#ifdef GMX_THREADS
    nthreads = *p_nthreads;
	do {
		gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx);
		nn0              = *count;
		nn1              = nn0+(nri-nn0)/(2*nthreads)+3;
		*count           = nn1;
		gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx);
		if(nn1>nri) nn1=nri;
		for(n=nn0; (n<nn1); n++) {
#if 0
		} /* maintain correct indentation even with conditional left braces */
#endif
#else /* without gmx_threads */
		for(n=0;n<nri;n++) {
#endif  
			is3        = 3*shift[n];
			ii         = iinr[n];
			ii3        = 3*ii;
			load_1_4atoms_shift_and_splat(pos+ii3,shiftvec+is3,&iOx,&iOy,&iOz,
										  &iH1x,&iH1y,&iH1z,&iH2x,&iH2y,&iH2z,
										  &iMx,&iMy,&iMz);
			vctot      = nul;
			Vvdwtot     = nul;
			nj0        = jindex[n];
			nj1        = jindex[n+1];
    
			for(k=nj0; k<(nj1-3); k+=4) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				jnrc            = jjnr[k+2];
				jnrd            = jjnr[k+3];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				j3c             = 3*jnrc;
				j3d             = 3*jnrd;
				transpose_4_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),
								 load_xyz(pos+j3c),
								 load_xyz(pos+j3d),&dMx,&dMy,&dMz);
				dOx             = vec_sub(iOx,dMx);
				dOy             = vec_sub(iOy,dMy);
				dOz             = vec_sub(iOz,dMz);
				dH1x            = vec_sub(iH1x,dMx);
				dH1y            = vec_sub(iH1y,dMy);
				dH1z            = vec_sub(iH1z,dMz);
				dH2x            = vec_sub(iH2x,dMx);
				dH2y            = vec_sub(iH2y,dMy);
				dH2z            = vec_sub(iH2z,dMz);
				dMx             = vec_sub(iMx,dMx);
				dMy             = vec_sub(iMy,dMy);
				dMz             = vec_sub(iMz,dMz);

				rsqO            = vec_madd(dOx,dOx,nul);
				rsqH1           = vec_madd(dH1x,dH1x,nul);
				rsqH2           = vec_madd(dH2x,dH2x,nul);
				rsqM            = vec_madd(dMx,dMx,nul);
				rsqO            = vec_madd(dOy,dOy,rsqO);
				rsqH1           = vec_madd(dH1y,dH1y,rsqH1);
				rsqH2           = vec_madd(dH2y,dH2y,rsqH2);
				rsqM            = vec_madd(dMy,dMy,rsqM);
				rsqO            = vec_madd(dOz,dOz,rsqO);
				rsqH1           = vec_madd(dH1z,dH1z,rsqH1);
				rsqH2           = vec_madd(dH2z,dH2z,rsqH2);
				rsqM            = vec_madd(dMz,dMz,rsqM);

				rinvO           = do_invsqrt(rsqO);
				do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2);
				r               = vec_madd(rsqO,rinvO,nul);
				
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				tjc             = ntiA+2*type[jnrc];
				tjd             = ntiA+2*type[jnrd];
				/* load 4 j charges and multiply by iq */
				jq=load_4_float(charge+jnra,charge+jnrb,
								charge+jnrc,charge+jnrd);
				load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12);
				do_vonly_4_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr);
				Vvdwtot         = vec_madd(c6,VVd,Vvdwtot);
				Vvdwtot         = vec_madd(c12,VVr,Vvdwtot);

				qqH             = vec_madd(iqH,jq,nul);
				qqM             = vec_madd(iqM,jq,nul);
				vctot           = vec_madd(qqM,rinvM,vctot);
				vctot           = vec_madd(qqH,rinvH1,vctot);
				vctot           = vec_madd(qqH,rinvH2,vctot);
			} 
			if(k<(nj1-2)) 
            {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				jnrc            = jjnr[k+2];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				j3c             = 3*jnrc;
				transpose_4_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),
								 load_xyz(pos+j3c),nul,&dMx,&dMy,&dMz);
				dOx             = vec_sub(iOx,dMx);
				dOy             = vec_sub(iOy,dMy);
				dOz             = vec_sub(iOz,dMz);
				dH1x            = vec_sub(iH1x,dMx);
				dH1y            = vec_sub(iH1y,dMy);
				dH1z            = vec_sub(iH1z,dMz);
				dH2x            = vec_sub(iH2x,dMx);
				dH2y            = vec_sub(iH2y,dMy);
				dH2z            = vec_sub(iH2z,dMz);
				dMx             = vec_sub(iMx,dMx);
				dMy             = vec_sub(iMy,dMy);
				dMz             = vec_sub(iMz,dMz);

				rsqO            = vec_madd(dOx,dOx,nul);
				rsqH1           = vec_madd(dH1x,dH1x,nul);
				rsqH2           = vec_madd(dH2x,dH2x,nul);
				rsqM            = vec_madd(dMx,dMx,nul);
				rsqO            = vec_madd(dOy,dOy,rsqO);
				rsqH1           = vec_madd(dH1y,dH1y,rsqH1);
				rsqH2           = vec_madd(dH2y,dH2y,rsqH2);
				rsqM            = vec_madd(dMy,dMy,rsqM);
				rsqO            = vec_madd(dOz,dOz,rsqO);
				rsqH1           = vec_madd(dH1z,dH1z,rsqH1);
				rsqH2           = vec_madd(dH2z,dH2z,rsqH2);
				rsqM            = vec_madd(dMz,dMz,rsqM);
				rinvO           = do_invsqrt(rsqO);
				do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2);
				zero_highest_element_in_vector(&rinvO);
				zero_highest_element_in_vector(&rsqO);				
				zero_highest_element_in_3_vectors(&rinvH1,&rinvH2,&rinvM);

				r               = vec_madd(rsqO,rinvO,nul);
				
				jq=load_3_float(charge+jnra,charge+jnrb,charge+jnrc);
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				tjc             = ntiA+2*type[jnrc];
				/* load 3 j charges and multiply by iq */
				load_3_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,&c6,&c12);
				do_vonly_3_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr);
				Vvdwtot         = vec_madd(c6,VVd,Vvdwtot);
				Vvdwtot         = vec_madd(c12,VVr,Vvdwtot);

				qqH             = vec_madd(iqH,jq,nul);
				qqM             = vec_madd(iqM,jq,nul);
				vctot           = vec_madd(qqM,rinvM,vctot);
				vctot           = vec_madd(qqH,rinvH1,vctot);
				vctot           = vec_madd(qqH,rinvH2,vctot);
			}
            else if(k<(nj1-1))
            {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				transpose_2_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),&dMx,&dMy,&dMz);
				dOx             = vec_sub(iOx,dMx);
				dOy             = vec_sub(iOy,dMy);
				dOz             = vec_sub(iOz,dMz);
				dH1x            = vec_sub(iH1x,dMx);
				dH1y            = vec_sub(iH1y,dMy);
				dH1z            = vec_sub(iH1z,dMz);
				dH2x            = vec_sub(iH2x,dMx);
				dH2y            = vec_sub(iH2y,dMy);
				dH2z            = vec_sub(iH2z,dMz);
				dMx             = vec_sub(iMx,dMx);
				dMy             = vec_sub(iMy,dMy);
				dMz             = vec_sub(iMz,dMz);

				rsqO            = vec_madd(dOx,dOx,nul);
				rsqH1           = vec_madd(dH1x,dH1x,nul);
				rsqH2           = vec_madd(dH2x,dH2x,nul);
				rsqM            = vec_madd(dMx,dMx,nul);
				rsqO            = vec_madd(dOy,dOy,rsqO);
				rsqH1           = vec_madd(dH1y,dH1y,rsqH1);
				rsqH2           = vec_madd(dH2y,dH2y,rsqH2);
				rsqM            = vec_madd(dMy,dMy,rsqM);
				rsqO            = vec_madd(dOz,dOz,rsqO);
				rsqH1           = vec_madd(dH1z,dH1z,rsqH1);
				rsqH2           = vec_madd(dH2z,dH2z,rsqH2);
				rsqM            = vec_madd(dMz,dMz,rsqM);

				rinvO           = do_invsqrt(rsqO);
				do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2);

				zero_highest_2_elements_in_vector(&rinvO);
				zero_highest_2_elements_in_vector(&rsqO);				
				zero_highest_2_elements_in_3_vectors(&rinvM,&rinvH1,&rinvH2);

				r               = vec_madd(rsqO,rinvO,nul);
				
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				/* load 2 j charges and multiply by iq */
				jq=load_2_float(charge+jnra,charge+jnrb);
				load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12);
				do_vonly_2_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr);
				Vvdwtot         = vec_madd(c6,VVd,Vvdwtot);
				Vvdwtot         = vec_madd(c12,VVr,Vvdwtot);

				qqH             = vec_madd(iqH,jq,nul);
				qqM             = vec_madd(iqM,jq,nul);
				vctot           = vec_madd(qqM,rinvM,vctot);
				vctot           = vec_madd(qqH,rinvH1,vctot);
				vctot           = vec_madd(qqH,rinvH2,vctot);
			} 
            else if(k<nj1) 
            {
				jnra            = jjnr[k];
				j3a             = 3*jnra;
				transpose_1_to_3(load_xyz(pos+j3a),&dMx,&dMy,&dMz);
				dOx             = vec_sub(iOx,dMx);
				dOy             = vec_sub(iOy,dMy);
				dOz             = vec_sub(iOz,dMz);
				dH1x            = vec_sub(iH1x,dMx);
				dH1y            = vec_sub(iH1y,dMy);
				dH1z            = vec_sub(iH1z,dMz);
				dH2x            = vec_sub(iH2x,dMx);
				dH2y            = vec_sub(iH2y,dMy);
				dH2z            = vec_sub(iH2z,dMz);
				dMx             = vec_sub(iMx,dMx);
				dMy             = vec_sub(iMy,dMy);
				dMz             = vec_sub(iMz,dMz);

				rsqO            = vec_madd(dOx,dOx,nul);
				rsqH1           = vec_madd(dH1x,dH1x,nul);
				rsqH2           = vec_madd(dH2x,dH2x,nul);
				rsqM            = vec_madd(dMx,dMx,nul);
				rsqO            = vec_madd(dOy,dOy,rsqO);
				rsqH1           = vec_madd(dH1y,dH1y,rsqH1);
				rsqH2           = vec_madd(dH2y,dH2y,rsqH2);
				rsqM            = vec_madd(dMy,dMy,rsqM);
				rsqO            = vec_madd(dOz,dOz,rsqO);
				rsqH1           = vec_madd(dH1z,dH1z,rsqH1);
				rsqH2           = vec_madd(dH2z,dH2z,rsqH2);
				rsqM            = vec_madd(dMz,dMz,rsqM);
				rinvO           = do_invsqrt(rsqO);
				do_3_invsqrt(rsqM,rsqH1,rsqH2,&rinvM,&rinvH1,&rinvH2);
				zero_highest_3_elements_in_vector(&rinvO);
				zero_highest_3_elements_in_vector(&rsqO);				
				zero_highest_3_elements_in_3_vectors(&rinvH1,&rinvH2,&rinvM);

				r               = vec_madd(rsqO,rinvO,nul);
				
				jq=load_1_float(charge+jnra);
				tja             = ntiA+2*type[jnra];
				/* load 1 j charge and multiply by iq */
				load_1_pair(vdwparam+tja,&c6,&c12);
				do_vonly_1_ljtable_lj(VFtab,vec_madd(r,tsc,nul),&VVd,&VVr);
				Vvdwtot         = vec_madd(c6,VVd,Vvdwtot);
				Vvdwtot         = vec_madd(c12,VVr,Vvdwtot);
				qqH             = vec_madd(iqH,jq,nul);
				qqM             = vec_madd(iqM,jq,nul);
				vctot           = vec_madd(qqM,rinvM,vctot);
				vctot           = vec_madd(qqH,rinvH1,vctot);
				vctot           = vec_madd(qqH,rinvH2,vctot);
			}
			/* update outer data */
			add_vector_to_float(Vc+gid[n],vctot);
			add_vector_to_float(Vvdw+gid[n],Vvdwtot);
			ninner += nj1 - nj0;
		}
#ifdef GMX_THREADS
		nouter += nn1 - nn0;
	} while (nn1<nri);
#else
	nouter = nri;
#endif
	*outeriter = nouter;
	*inneriter = ninner;
}
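nb_kernel133nf is the four-site water variant of the same pattern: each outer particle is an O/H1/H2/M group, so twelve difference coordinates are formed per neighbour batch. The Coulomb energy uses the M and H charges against rinvM, rinvH1 and rinvH2, while the oxygen's Lennard-Jones term is taken from the VFtab lookup table (indexed by r * tsc) rather than computed analytically as in the kernel above.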
Example 10
void qp(double *x_out, int *iter, double *gt, double *bt) {

	/* define data */
	double q[300] = {0};
	double q1[180] = {0};
	double q2[120] = {0};
	double l[180] = {0};
	double u[180] = {0};
	double tmp_var_p[180] = {0};
	double tmp_var_p2[180] = {0};
	double arg_prox_h[180] = {0};
	double lambda[180] = {0};
	double y[180] = {0};
	double x[180] = {0};
	double lambda_old[180] = {0};
	double v[180] = {0};
	double v_old[180] = {0};
	double tmp_var_n[180] = {0};
	double tmp_var_n2[180] = {0};
	double tmp_var_nm[300] = {0};
	double tmp_var_nm2[300] = {0};
	double rhs[300] = {0};

	int jj = 0;
	double cond = -1;
	double theta = 1;
	double theta_old = 1;

	/* build the linear term from the current gradient and bias inputs */
	mat_vec_mult_sparse(&G, gt, q1);
	mat_vec_mult_sparse(&B, bt, q2);
	copy_vec_part((double *) &Lb, l, 180);
	copy_vec_part((double *) &Ub, u, 180);

	while ((jj < 2000) && (cond < 0)) {
		jj++;

		/* assemble the right-hand side and solve the factorized
		 * (permuted LDL') system for the primal iterate x */
		copy_vec_part_negate(v, tmp_var_p, 180);
		mat_vec_mult_diag(&CT, tmp_var_p, tmp_var_n);
		vec_sub(tmp_var_n, q1, tmp_var_n, 180);
		stack_vec(tmp_var_n, q2, rhs, 180, 120);
		perm_fwdsolve(&L, p, rhs, tmp_var_nm);
		mat_vec_mult_sparse(&Dinv, tmp_var_nm, tmp_var_nm2);
		backsolve_perm(&LT, p, tmp_var_nm2, tmp_var_nm);
		copy_vec_part(tmp_var_nm, x, 180);

		/* project C*x + v onto the box [l, u] */
		mat_vec_mult_diag(&C, x, tmp_var_p);
		vec_add(v, tmp_var_p, tmp_var_p, 180);
		copy_vec_part(tmp_var_p, arg_prox_h, 180);
		clip(tmp_var_p, l, u, 180);
		mat_vec_mult_diag(&Einv, tmp_var_p, y);

		/* dual update with fast-gradient (Nesterov) acceleration */
		copy_vec_part(lambda, lambda_old, 180);
		vec_sub(arg_prox_h, tmp_var_p, lambda, 180);
		vec_sub(lambda, lambda_old, tmp_var_p, 180);
		theta_old = theta;
		theta = (1 + sqrt(1 + 4 * pow(theta_old, 2))) / 2;
		scalar_mult((theta_old - 1) / theta, tmp_var_p, 180);
		copy_vec_part(v, v_old, 180);
		vec_add(tmp_var_p, lambda, v, 180);

		/* check the stopping condition every 10 iterations */
		if (mod(jj, 10) == 0) {
			cond = check_stop_cond_FGM(&Einv, lambda, lambda_old,
					tmp_var_p, tmp_var_p2, 180, 0.001);
		}

		restart(lambda, lambda_old, v, v_old, tmp_var_p, tmp_var_p2, 180);
	}

	copy_vec_part(x, x_out, 180);
	*iter = jj;
}
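The loop is a projected fast gradient method on the dual: lambda is the prox residual from the clip step, and v is the extrapolated point built with the standard momentum rule, in scalar form

	theta = (1 + sqrt(1 + 4 * theta_old^2)) / 2;
	v     = lambda + ((theta_old - 1) / theta) * (lambda - lambda_old);

(this restates the code above, it is not additional source). The restart() call on every pass suggests an adaptive restart scheme that resets the momentum when progress reverses, a common safeguard for accelerated methods.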
Example 11
GridSearchPairlist *vmd_gridsearch_bonds(const float *pos, const float *radii,
                                   int natoms, float pairdist, int maxpairs) {
  float min[3], max[3];
  int i, xb, yb, zb, xytotb, totb;
  int **boxatom, *numinbox, *maxinbox, **nbrlist;
  float sidelen[3], volume;
  int paircount = 0;

  // find bounding box for selected atoms, and number of atoms in selection.
#if 1
  minmax_3fv_aligned(pos, natoms, min, max);
#else
  find_minmax_all(pos, natoms, min, max);
#endif

  // check for NaN coordinates propagating to the bounding box result
  if (!(max[0] >= min[0] && max[1] >= min[1] && max[2] >= min[2])) {
    msgErr << "vmd_gridsearch_bonds: NaN coordinates in bounds, aborting!" << sendmsg;
    return NULL;
  }

  // do sanity checks and complain if we've got bogus atom coordinates,
  // we shouldn't ever have density higher than 0.1 atom/A^3, but we'll
  // be generous and allow much higher densities.  
  if (maxpairs != -1) {
    vec_sub(sidelen, max, min);
    // include estimate for atom radius (1 Angstrom) in volume determination
    volume = fabsf((sidelen[0] + 2.0f) * (sidelen[1] + 2.0f) * (sidelen[2] + 2.0f));
    if ((natoms / volume) > 1.0) {
      msgWarn << "vmd_gridsearch_bonds: insane atom density" << sendmsg;
    }
  }

  // I don't want the grid to get too large, otherwise I could run out
  // of memory.  Octrees would be cool, but I'll just limit the grid size
  // and let the performance degrade a little for pathological systems.
  // Note that pairdist^2 is what gets used for the actual distance checks;
  // from here on out pairdist is only used to set the grid size, so we 
  // can set it to anything larger than the original pairdist.
  const int MAXBOXES = 4000000;
  totb = MAXBOXES + 1;

  float newpairdist = pairdist;
  float xrange = max[0]-min[0];
  float yrange = max[1]-min[1];
  float zrange = max[2]-min[2];
  do {
    pairdist = newpairdist;
    const float invpairdist = 1.0f / pairdist; 
    xb = ((int)(xrange*invpairdist))+1;
    yb = ((int)(yrange*invpairdist))+1;
    zb = ((int)(zrange*invpairdist))+1;
    xytotb = yb * xb;
    totb = xytotb * zb;
    newpairdist = pairdist * 1.26f; // cbrt(2) is about 1.26
  } while (totb > MAXBOXES || totb < 1); // check for integer wraparound too
 
  // 2. Sort each atom into appropriate bins
  boxatom = (int **) calloc(1, totb*sizeof(int *));
  numinbox = (int *) calloc(1, totb*sizeof(int));
  maxinbox = (int *) calloc(1, totb*sizeof(int));
  if (boxatom == NULL || numinbox == NULL || maxinbox == NULL) {
    if (boxatom != NULL)
      free(boxatom);
    if (numinbox != NULL)
      free(numinbox);
    if (maxinbox != NULL)
      free(maxinbox);
    msgErr << "Bondsearch memory allocation failed, bailing out" << sendmsg;
    return NULL; // ran out of memory, bail out!
  }

  const float invpairdist = 1.0f / pairdist; 
  for (i=0; i<natoms; i++) {
    int axb, ayb, azb, aindex, num;

    // compute box index for new atom
    const float *loc = pos + 3L*i;
    axb = (int)((loc[0] - min[0])*invpairdist);
    ayb = (int)((loc[1] - min[1])*invpairdist);
    azb = (int)((loc[2] - min[2])*invpairdist);

    // clamp box indices to valid range in case of FP error
    if (axb >= xb) axb = xb-1;
    if (ayb >= yb) ayb = yb-1;
    if (azb >= zb) azb = zb-1;

    aindex = azb * xytotb + ayb * xb + axb;

    // grow box if necessary 
    if ((num = numinbox[aindex]) == maxinbox[aindex]) {
      boxatom[aindex] = (int *) realloc(boxatom[aindex], (num+4)*sizeof(int));
      maxinbox[aindex] += 4;
    }

    // store atom index in box
    boxatom[aindex][num] = i;
    numinbox[aindex]++;
  }
  free(maxinbox);
 
  nbrlist = (int **) calloc(1, totb*sizeof(int *));
  if (make_neighborlist(nbrlist, xb, yb, zb)) {
    if (boxatom != NULL) {
      for (i=0; i<totb; i++) {
        if (boxatom[i] != NULL) free(boxatom[i]);
      }
      free(boxatom);
    }
    if (nbrlist != NULL) {
      for (i=0; i<totb; i++) {
        if (nbrlist[i] != NULL) free(nbrlist[i]);
      }
      free(nbrlist);
    }
    free(numinbox);
    msgErr << "Bondsearch memory allocation failed, bailing out" << sendmsg;
    return NULL; // ran out of memory, bail out!
  }

  // if maxpairs is "unlimited", set it to the biggest positive int
  if (maxpairs < 0) {
    maxpairs = 2147483647;
  }

  // setup head of pairlist
  GridSearchPairlist *head, *cur;
  head = (GridSearchPairlist *) malloc(sizeof(GridSearchPairlist));
  head->next = NULL;
  paircount = vmd_bondsearch_thr(pos, radii, head, totb, 
                                 boxatom, numinbox, nbrlist, 
                                 maxpairs, pairdist);

  for (i=0; i<totb; i++) {
    free(boxatom[i]);
    free(nbrlist[i]);
  }
  free(boxatom);
  free(nbrlist);
  free(numinbox);

  cur = head->next;
  free(head);

  if (paircount > maxpairs) 
    msgErr << "vmdgridsearch_bonds: exceeded pairlist sanity check, aborted" << sendmsg;

  return cur;
}
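Two details worth noting: the do/while loop sizes the grid by repeatedly inflating pairdist by cbrt(2), which roughly halves the total box count per retry until it fits under MAXBOXES (the inflated value is only used for binning, so distance checks stay exact), and the per-box atom lists grow four slots at a time through realloc, trading slight overallocation for fewer allocator calls.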
Example 12
EFP_EXPORT enum efp_result
efp_get_electric_field(struct efp *efp, size_t frag_idx, const double *xyz, double *field)
{
	assert(efp);
	assert(frag_idx < efp->n_frag);
	assert(xyz);
	assert(field);

	const struct frag *frag = efp->frags + frag_idx;
	vec_t elec_field = vec_zero;

	for (size_t i = 0; i < efp->n_frag; i++) {
		if (i == frag_idx || efp_skip_frag_pair(efp, i, frag_idx))
			continue;

		const struct frag *fr_i = efp->frags + i;
		struct swf swf = efp_make_swf(efp, fr_i, frag);

		/* field due to nuclei */
		for (size_t j = 0; j < fr_i->n_atoms; j++) {
			const struct efp_atom *at = fr_i->atoms + j;

			vec_t dr = {
				xyz[0] - at->x - swf.cell.x,
				xyz[1] - at->y - swf.cell.y,
				xyz[2] - at->z - swf.cell.z
			};

			double r = vec_len(&dr);
			double r3 = r * r * r;

			elec_field.x += swf.swf * at->znuc * dr.x / r3;
			elec_field.y += swf.swf * at->znuc * dr.y / r3;
			elec_field.z += swf.swf * at->znuc * dr.z / r3;
		}

		/* field due to multipoles */
		for (size_t j = 0; j < fr_i->n_multipole_pts; j++) {
			const struct multipole_pt *mpt = fr_i->multipole_pts + j;
			vec_t mult_field = get_multipole_field((const vec_t *)xyz, mpt, &swf);

			elec_field.x += mult_field.x;
			elec_field.y += mult_field.y;
			elec_field.z += mult_field.z;
		}

		/* field due to induced dipoles */
		for (size_t j = 0; j < fr_i->n_polarizable_pts; j++) {
			struct polarizable_pt *pt_i = fr_i->polarizable_pts + j;
			size_t idx = fr_i->polarizable_offset + j;

			vec_t dr = {
				xyz[0] - pt_i->x - swf.cell.x,
				xyz[1] - pt_i->y - swf.cell.y,
				xyz[2] - pt_i->z - swf.cell.z
			};

			double r = vec_len(&dr);
			double r3 = r * r * r;
			double r5 = r3 * r * r;
			double t1 = vec_dot(&efp->indip[idx], &dr);

			elec_field.x -= swf.swf * (efp->indip[idx].x / r3 -
						3.0 * t1 * dr.x / r5);
			elec_field.y -= swf.swf * (efp->indip[idx].y / r3 -
						3.0 * t1 * dr.y / r5);
			elec_field.z -= swf.swf * (efp->indip[idx].z / r3 -
						3.0 * t1 * dr.z / r5);
		}
	}

	if (efp->opts.terms & EFP_TERM_AI_POL) {
		/* field due to nuclei from ab initio subsystem */
		for (size_t i = 0; i < efp->n_ptc; i++) {
			vec_t dr = vec_sub((const vec_t *)xyz, efp->ptc_xyz + i);

			double r = vec_len(&dr);
			double r3 = r * r * r;

			elec_field.x += efp->ptc[i] * dr.x / r3;
			elec_field.y += efp->ptc[i] * dr.y / r3;
			elec_field.z += efp->ptc[i] * dr.z / r3;
		}
	}

	*((vec_t *)field) = elec_field;
	return (EFP_RESULT_SUCCESS);
}
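Every contribution above is a Coulomb-type field evaluated at xyz: the nuclear and point-charge terms are E += q * dr / |dr|^3, and the induced-dipole term is the standard dipole field -(p / r^3 - 3 (p . dr) dr / r^5). A minimal sketch of the point-charge piece, assuming the vec_t and vec_len helpers used above:

static void
add_point_charge_field (vec_t *field, double q, const vec_t *dr)
{
	double r  = vec_len (dr);
	double r3 = r * r * r;

	field->x += q * dr->x / r3;
	field->y += q * dr->y / r3;
	field->z += q * dr->z / r3;
}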
Example 13
static void
compute_grad_point(struct efp *efp, size_t frag_idx, size_t pt_idx)
{
	const struct frag *fr_i = efp->frags + frag_idx;
	const struct polarizable_pt *pt_i = fr_i->polarizable_pts + pt_idx;
	size_t idx_i = fr_i->polarizable_offset + pt_idx;

	vec_t dipole_i = {
		0.5 * (efp->indip[idx_i].x + efp->indipconj[idx_i].x),
		0.5 * (efp->indip[idx_i].y + efp->indipconj[idx_i].y),
		0.5 * (efp->indip[idx_i].z + efp->indipconj[idx_i].z)
	};

	for (size_t j = 0; j < efp->n_frag; j++) {
		if (j == frag_idx || efp_skip_frag_pair(efp, frag_idx, j))
			continue;

		struct frag *fr_j = efp->frags + j;
		struct swf swf = efp_make_swf(efp, fr_i, fr_j);

		/* energy without switching applied */
		double energy = 0.0;

		/* induced dipole - nuclei */
		for (size_t k = 0; k < fr_j->n_atoms; k++) {
			struct efp_atom *at_j = fr_j->atoms + k;

			vec_t dr = {
				at_j->x - pt_i->x - swf.cell.x,
				at_j->y - pt_i->y - swf.cell.y,
				at_j->z - pt_i->z - swf.cell.z
			};

			double p1 = 1.0, p2 = 0.0;

			if (efp->opts.pol_damp == EFP_POL_DAMP_TT) {
				double r = vec_len(&dr);

				p1 = efp_get_pol_damp_tt(r, fr_i->pol_damp,
						fr_j->pol_damp);
				p2 = efp_get_pol_damp_tt_grad(r, fr_i->pol_damp,
						fr_j->pol_damp);
			}

			vec_t force, add_i, add_j;

			double e = -efp_charge_dipole_energy(at_j->znuc, &dipole_i, &dr);

			efp_charge_dipole_grad(at_j->znuc, &dipole_i, &dr,
					       &force, &add_j, &add_i);
			vec_negate(&force);

			vec_scale(&force, p1);
			vec_scale(&add_i, p1);
			vec_scale(&add_j, p1);

			force.x += p2 * e * dr.x;
			force.y += p2 * e * dr.y;
			force.z += p2 * e * dr.z;

			vec_scale(&force, swf.swf);
			vec_scale(&add_i, swf.swf);
			vec_scale(&add_j, swf.swf);

			efp_add_force(efp->grad + frag_idx, CVEC(fr_i->x),
					CVEC(pt_i->x), &force, &add_i);
			efp_sub_force(efp->grad + j, CVEC(fr_j->x),
					CVEC(at_j->x), &force, &add_j);
			efp_add_stress(&swf.dr, &force, &efp->stress);

			energy += p1 * e;
		}

		/* induced dipole - multipoles */
		for (size_t k = 0; k < fr_j->n_multipole_pts; k++) {
			struct multipole_pt *pt_j = fr_j->multipole_pts + k;

			vec_t dr = {
				pt_j->x - pt_i->x - swf.cell.x,
				pt_j->y - pt_i->y - swf.cell.y,
				pt_j->z - pt_i->z - swf.cell.z
			};

			double p1 = 1.0, p2 = 0.0;

			if (efp->opts.pol_damp == EFP_POL_DAMP_TT) {
				double r = vec_len(&dr);

				p1 = efp_get_pol_damp_tt(r, fr_i->pol_damp,
						fr_j->pol_damp);
				p2 = efp_get_pol_damp_tt_grad(r, fr_i->pol_damp,
						fr_j->pol_damp);
			}

			double e = 0.0;

			vec_t force_, add_i_, add_j_;
			vec_t force = vec_zero, add_i = vec_zero, add_j = vec_zero;

			/* induced dipole - charge */
			e -= efp_charge_dipole_energy(pt_j->monopole, &dipole_i, &dr);

			efp_charge_dipole_grad(pt_j->monopole, &dipole_i, &dr,
					       &force_, &add_j_, &add_i_);
			vec_negate(&force_);
			add_3(&force, &force_, &add_i, &add_i_, &add_j, &add_j_);

			/* induced dipole - dipole */
			e += efp_dipole_dipole_energy(&dipole_i, &pt_j->dipole, &dr);

			efp_dipole_dipole_grad(&dipole_i, &pt_j->dipole, &dr,
					       &force_, &add_i_, &add_j_);
			vec_negate(&add_j_);
			add_3(&force, &force_, &add_i, &add_i_, &add_j, &add_j_);

			/* induced dipole - quadrupole */
			e += efp_dipole_quadrupole_energy(&dipole_i, pt_j->quadrupole, &dr);

			efp_dipole_quadrupole_grad(&dipole_i, pt_j->quadrupole, &dr,
						   &force_, &add_i_, &add_j_);
			add_3(&force, &force_, &add_i, &add_i_, &add_j, &add_j_);

			/* induced dipole - octupole interactions are ignored */

			vec_scale(&force, p1);
			vec_scale(&add_i, p1);
			vec_scale(&add_j, p1);

			force.x += p2 * e * dr.x;
			force.y += p2 * e * dr.y;
			force.z += p2 * e * dr.z;

			vec_scale(&force, swf.swf);
			vec_scale(&add_i, swf.swf);
			vec_scale(&add_j, swf.swf);

			efp_add_force(efp->grad + frag_idx, CVEC(fr_i->x),
					CVEC(pt_i->x), &force, &add_i);
			efp_sub_force(efp->grad + j, CVEC(fr_j->x),
					CVEC(pt_j->x), &force, &add_j);
			efp_add_stress(&swf.dr, &force, &efp->stress);

			energy += p1 * e;
		}

		/* induced dipole - induced dipoles */
		for (size_t jj = 0; jj < fr_j->n_polarizable_pts; jj++) {
			struct polarizable_pt *pt_j = fr_j->polarizable_pts + jj;
			size_t idx_j = fr_j->polarizable_offset + jj;

			vec_t dr = {
				pt_j->x - pt_i->x - swf.cell.x,
				pt_j->y - pt_i->y - swf.cell.y,
				pt_j->z - pt_i->z - swf.cell.z
			};

			vec_t half_dipole_i = {
				0.5 * efp->indip[idx_i].x,
				0.5 * efp->indip[idx_i].y,
				0.5 * efp->indip[idx_i].z
			};

			double p1 = 1.0, p2 = 0.0;

			if (efp->opts.pol_damp == EFP_POL_DAMP_TT) {
				double r = vec_len(&dr);

				p1 = efp_get_pol_damp_tt(r, fr_i->pol_damp,
						fr_j->pol_damp);
				p2 = efp_get_pol_damp_tt_grad(r, fr_i->pol_damp,
						fr_j->pol_damp);
			}

			vec_t force, add_i, add_j;

			double e = efp_dipole_dipole_energy(&half_dipole_i,
						&efp->indipconj[idx_j], &dr);

			efp_dipole_dipole_grad(&half_dipole_i, &efp->indipconj[idx_j],
						&dr, &force, &add_i, &add_j);
			vec_negate(&add_j);

			vec_scale(&force, p1);
			vec_scale(&add_i, p1);
			vec_scale(&add_j, p1);

			force.x += p2 * e * dr.x;
			force.y += p2 * e * dr.y;
			force.z += p2 * e * dr.z;

			vec_scale(&force, swf.swf);
			vec_scale(&add_i, swf.swf);
			vec_scale(&add_j, swf.swf);

			efp_add_force(efp->grad + frag_idx, CVEC(fr_i->x),
					CVEC(pt_i->x), &force, &add_i);
			efp_sub_force(efp->grad + j, CVEC(fr_j->x),
					CVEC(pt_j->x), &force, &add_j);
			efp_add_stress(&swf.dr, &force, &efp->stress);

			energy += p1 * e;
		}

		vec_t force = {
			swf.dswf.x * energy,
			swf.dswf.y * energy,
			swf.dswf.z * energy
		};

		six_atomic_add_xyz(efp->grad + frag_idx, &force);
		six_atomic_sub_xyz(efp->grad + j, &force);
		efp_add_stress(&swf.dr, &force, &efp->stress);
	}

	/* induced dipole - ab initio nuclei */
	if (efp->opts.terms & EFP_TERM_AI_POL) {
		for (size_t j = 0; j < efp->n_ptc; j++) {
			vec_t dr = vec_sub(efp->ptc_xyz + j, CVEC(pt_i->x));
			vec_t force, add_i, add_j;

			efp_charge_dipole_grad(efp->ptc[j], &dipole_i, &dr,
					       &force, &add_j, &add_i);
			vec_negate(&add_i);

			vec_atomic_add(efp->ptc_grad + j, &force);
			efp_sub_force(efp->grad + frag_idx, CVEC(fr_i->x),
					CVEC(pt_i->x), &force, &add_i);
		}
	}
}
Example 14
static vec_t
get_elec_field(const struct efp *efp, size_t frag_idx, size_t pt_idx)
{
	const struct frag *fr_j = efp->frags + frag_idx;
	const struct polarizable_pt *pt = fr_j->polarizable_pts + pt_idx;
	vec_t elec_field = vec_zero;

	for (size_t i = 0; i < efp->n_frag; i++) {
		if (i == frag_idx || efp_skip_frag_pair(efp, i, frag_idx))
			continue;

		const struct frag *fr_i = efp->frags + i;
		struct swf swf = efp_make_swf(efp, fr_i, fr_j);

		/* field due to nuclei */
		for (size_t j = 0; j < fr_i->n_atoms; j++) {
			const struct efp_atom *at = fr_i->atoms + j;

			vec_t dr = {
				pt->x - at->x - swf.cell.x,
				pt->y - at->y - swf.cell.y,
				pt->z - at->z - swf.cell.z
			};

			double r = vec_len(&dr);
			double r3 = r * r * r;
			double p1 = 1.0;

			if (efp->opts.pol_damp == EFP_POL_DAMP_TT)
				p1 = efp_get_pol_damp_tt(r, fr_i->pol_damp, fr_j->pol_damp);

			elec_field.x += swf.swf * at->znuc * dr.x / r3 * p1;
			elec_field.y += swf.swf * at->znuc * dr.y / r3 * p1;
			elec_field.z += swf.swf * at->znuc * dr.z / r3 * p1;
		}

		/* field due to multipoles */
		for (size_t j = 0; j < fr_i->n_multipole_pts; j++) {
			const struct multipole_pt *mult_pt = fr_i->multipole_pts + j;
			vec_t mult_field = get_multipole_field(CVEC(pt->x), mult_pt, &swf);

			vec_t dr = {
				pt->x - mult_pt->x - swf.cell.x,
				pt->y - mult_pt->y - swf.cell.y,
				pt->z - mult_pt->z - swf.cell.z
			};

			double r = vec_len(&dr);
			double p1 = 1.0;

			if (efp->opts.pol_damp == EFP_POL_DAMP_TT)
				p1 = efp_get_pol_damp_tt(r, fr_i->pol_damp, fr_j->pol_damp);

			elec_field.x += mult_field.x * p1;
			elec_field.y += mult_field.y * p1;
			elec_field.z += mult_field.z * p1;
		}
	}

	if (efp->opts.terms & EFP_TERM_AI_POL) {
		/* field due to nuclei from ab initio subsystem */
		for (size_t i = 0; i < efp->n_ptc; i++) {
			vec_t dr = vec_sub(CVEC(pt->x), efp->ptc_xyz + i);

			double r = vec_len(&dr);
			double r3 = r * r * r;

			elec_field.x += efp->ptc[i] * dr.x / r3;
			elec_field.y += efp->ptc[i] * dr.y / r3;
			elec_field.z += efp->ptc[i] * dr.z / r3;
		}
	}

	return (elec_field);
}
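Both nuclear loops in get_elec_field evaluate the bare Coulomb field of a point charge, E = q * dr / r^3, optionally scaled by the damping factor and the switching function. A standalone sketch of that kernel (illustrative names, not libefp API):
#include <math.h>

typedef struct { double x, y, z; } vec3;

static vec3
point_charge_field(double q, vec3 dr, double damp)
{
	double r = sqrt(dr.x * dr.x + dr.y * dr.y + dr.z * dr.z);
	double r3 = r * r * r;
	vec3 field = {
		damp * q * dr.x / r3,
		damp * q * dr.y / r3,
		damp * q * dr.z / r3
	};

	return field;
}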
Example n. 15
void iquant_intra_m1_altivec(IQUANT_INTRA_PDECL)
{
    int i;
    vector signed short vsrc;
    uint16_t *qmat;
    vector unsigned short vqmat;
    vector unsigned short vmquant;
    vector bool short eqzero, ltzero;
    vector signed short val, t0;
    vector signed short zero, one;
    vector unsigned int four;
    vector signed short min, max;
    int offset, offset2;
    int16_t dst0;
    union {
	vector unsigned short vu16;
	unsigned short mquant;
	vector signed int vs32;
	struct {
	    signed int pad[3];
	    signed int sum;
	} s;
    } vu;
#ifdef ALTIVEC_DST
    DataStreamControl dsc;
#endif

#ifdef ALTIVEC_VERIFY /* {{{ */
    if (NOT_VECTOR_ALIGNED(wsp->intra_q_mat))
	mjpeg_error_exit1("iquant_intra_m1: wsp->intra_q_mat %% 16 != 0, (%d)",
	    wsp->intra_q_mat);

    if (NOT_VECTOR_ALIGNED(src))
	mjpeg_error_exit1("iquant_intra_m1: src %% 16 != 0, (%d)", src);

    if (NOT_VECTOR_ALIGNED(dst))
	mjpeg_error_exit1("iquant_intra_m1: dst %% 16 != 0, (%d)", dst);

    for (i = 0; i < 64; i++)
	if (src[i] < -256 || src[i] > 255)
	    mjpeg_error_exit1("iquant_intra_m2: -256 > src[%i] > 255, (%d)",
		i, src[i]);
#endif /* }}} */

    AMBER_START;

    dst0 = src[0] << (3 - dc_prec);

    qmat = (uint16_t*)wsp->intra_q_mat;

#ifdef ALTIVEC_DST
    dsc.control = DATA_STREAM_CONTROL(64/8,1,0);
    vec_dst(src, dsc.control, 0);
    vec_dst(qmat, dsc.control, 1);
#endif

    /* vmquant = (vector unsigned short)(mquant); */
    vu.mquant = (unsigned short)mquant;
    vmquant = vec_splat(vu.vu16, 0);

    zero = vec_splat_s16(0);
    one = vec_splat_s16(1);
    four = vec_splat_u32(4);
    /* max = (2047); min = (-2048); {{{ */
    vu8(max) = vec_splat_u8(0x7);
    t0 = vec_splat_s16(-1); /* 0xffff */
    vu8(max) = vec_mergeh(vu8(max), vu8(t0)); /* 0x07ff == 2047 */
    min = vec_sub(t0, max);
    /* }}} */
    offset = 0;

#if 1
    vsrc = vec_ld(offset, (signed short*)src);
    vqmat = vec_ld(offset, (unsigned short*)qmat);
    i = (64/8) - 1;
    do {
	/* intra_q[i] * mquant */
	vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant));

	/* save sign */
	ltzero = vec_cmplt(vsrc, zero);
	eqzero = vec_cmpeq(vsrc, zero);

	/* val = abs(src) */
	t0 = vec_sub(zero, vsrc);
	val = vec_max(t0, vsrc);

	/* val = (src * quant) >> 4 */
	vs32(t0) = vec_mule(val, vs16(vqmat));
	vs32(val) = vec_mulo(val, vs16(vqmat));
	vs32(t0) = vec_sra(vs32(t0), four);
	vs16(t0) = vec_pack(vs32(t0), vs32(t0));
	vs32(val) = vec_sra(vs32(val), four);
	vs16(val) = vec_pack(vs32(val), vs32(val));
	val = vec_mergeh(vs16(t0), vs16(val));

	offset2 = offset;
	offset += 8*sizeof(int16_t);
	vsrc = vec_ld(offset, (signed short*)src);
	vqmat = vec_ld(offset, (unsigned short*)qmat);

	/* val = val - 1&~(val|val==0) */
	t0 = vec_or(val, eqzero);
	t0 = vec_andc(one, t0);
	val = vec_sub(val, t0);

	/* restore sign */
	t0 = vec_sub(zero, val);
	val = vec_sel(val, t0, ltzero);

	/* val = (val > 2047) ? 2047 : ((val < -2048) ? -2048 : val); */
	val = vec_min(val, max);
	val = vec_max(val, min);

	vec_st(val, offset2, dst);
    } while (--i);
    /* intra_q[i] * mquant */
    vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant));

    /* save sign */
    ltzero = vec_cmplt(vsrc, zero);
    eqzero = vec_cmpeq(vsrc, zero);

    /* val = abs(src) */
    t0 = vec_sub(zero, vsrc);
    val = vec_max(t0, vsrc);

    /* val = (src * quant) >> 4 */
    vs32(t0) = vec_mule(val, vs16(vqmat));
    vs32(val) = vec_mulo(val, vs16(vqmat));
    vs32(t0) = vec_sra(vs32(t0), four);
    vs16(t0) = vec_pack(vs32(t0), vs32(t0));
    vs32(val) = vec_sra(vs32(val), four);
    vs16(val) = vec_pack(vs32(val), vs32(val));
    val = vec_mergeh(vs16(t0), vs16(val));

    /* val = val - 1&~(val|val==0) */
    t0 = vec_or(val, eqzero);
    t0 = vec_andc(one, t0);
    val = vec_sub(val, t0);

    /* restore sign */
    t0 = vec_sub(zero, val);
    val = vec_sel(val, t0, ltzero);

    /* val = (val > 2047) ? 2047 : ((val < -2048) ? -2048 : val); */
    val = vec_min(val, max);
    val = vec_max(val, min);

    vec_st(val, offset, dst);
#else
    /* {{{ */
    i = (64/8);
    do {
	vsrc = vec_ld(offset, (signed short*)src);
	vqmat = vec_ld(offset, (unsigned short*)qmat);

	/* intra_q[i] * mquant */
	vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant));

	/* save sign */
	ltzero = vec_cmplt(vsrc, zero);
	eqzero = vec_cmpeq(vsrc, zero);

	/* val = abs(src) */
	t0 = vec_sub(zero, vsrc);
	val = vec_max(t0, vsrc);

	/* val = (src * quant) >> 4 */
	vs32(t0) = vec_mule(val, vs16(vqmat));
	vs32(val) = vec_mulo(val, vs16(vqmat));
	vs32(t0) = vec_sra(vs32(t0), four);
	vs16(t0) = vec_pack(vs32(t0), vs32(t0));
	vs32(val) = vec_sra(vs32(val), four);
	vs16(val) = vec_pack(vs32(val), vs32(val));
	val = vec_mergeh(vs16(t0), vs16(val));

	/* val = val - 1&~(val|val==0) */
	t0 = vec_or(val, eqzero);
	t0 = vec_andc(one, t0);
	val = vec_sub(val, t0);

	/* restore sign */
	t0 = vec_sub(zero, val);
	val = vec_sel(val, t0, ltzero);

	/* val = (val > 2047) ? 2047 : ((val < -2048) ? -2048 : val); */
	val = vec_min(val, max);
	val = vec_max(val, min);

	vec_st(val, offset, dst);

	offset += 8*sizeof(int16_t);
    } while (--i);
    /* }}} */
#endif

    dst[0] = dst0;

    AMBER_STOP;
}
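For reference, here is what the vectorized loop computes, written as plain C (a sketch, not mjpegtools code): multiply by the quantizer matrix and mquant, shift down by 4, force nonzero even values odd (MPEG-1 mismatch control), saturate to the 12-bit signed range, and rescale the DC coefficient separately.
#include <stdint.h>
#include <stdlib.h>

/* Scalar model of iquant_intra_m1_altivec (illustrative). */
static void iquant_intra_m1_ref(const int16_t *src, int16_t *dst,
                                const uint16_t *qmat, unsigned mquant,
                                int dc_prec)
{
    int i;

    for (i = 0; i < 64; i++) {
        int v = (abs(src[i]) * (int)(qmat[i] * mquant)) >> 4;

        if (v != 0 && (v & 1) == 0)
            v -= 1;                      /* mismatch control: force odd */
        if (src[i] < 0)
            v = -v;                      /* restore sign */
        if (v > 2047) v = 2047;          /* saturate to [-2048, 2047] */
        if (v < -2048) v = -2048;
        dst[i] = (int16_t)v;
    }
    dst[0] = src[0] << (3 - dc_prec);    /* DC is rescaled, not quantized */
}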
Example n. 16
void
jsimd_fdct_islow_altivec (DCTELEM *data)
{
  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
    tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
    z3, z4, z34l, z34h,
    out0, out1, out2, out3, out4, out5, out6, out7;
  __vector int z3l, z3h, z4l, z4h,
    out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
    out7l, out7h;

  /* Constants */
  __vector short
    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
    pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
  __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
    descale_p2 = { __4X(DESCALE_P2) };

  /* Pass 1: process rows */

  row0 = vec_ld(0, data);
  row1 = vec_ld(16, data);
  row2 = vec_ld(32, data);
  row3 = vec_ld(48, data);
  row4 = vec_ld(64, data);
  row5 = vec_ld(80, data);
  row6 = vec_ld(96, data);
  row7 = vec_ld(112, data);

  TRANSPOSE(row, col);

  tmp0 = vec_add(col0, col7);
  tmp7 = vec_sub(col0, col7);
  tmp1 = vec_add(col1, col6);
  tmp6 = vec_sub(col1, col6);
  tmp2 = vec_add(col2, col5);
  tmp5 = vec_sub(col2, col5);
  tmp3 = vec_add(col3, col4);
  tmp4 = vec_sub(col3, col4);

  DO_FDCT_PASS1();

  /* Pass 2: process columns */

  TRANSPOSE(out, row);

  tmp0 = vec_add(row0, row7);
  tmp7 = vec_sub(row0, row7);
  tmp1 = vec_add(row1, row6);
  tmp6 = vec_sub(row1, row6);
  tmp2 = vec_add(row2, row5);
  tmp5 = vec_sub(row2, row5);
  tmp3 = vec_add(row3, row4);
  tmp4 = vec_sub(row3, row4);

  DO_FDCT_PASS2();

  vec_st(out0, 0, data);
  vec_st(out1, 16, data);
  vec_st(out2, 32, data);
  vec_st(out3, 48, data);
  vec_st(out4, 64, data);
  vec_st(out5, 80, data);
  vec_st(out6, 96, data);
  vec_st(out7, 112, data);
}
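jsimd_fdct_islow_altivec follows the standard two-pass 2-D DCT: transpose, eight 1-D transforms over the rows, transpose again, eight over the columns, with different descaling per pass. Each pass opens with the symmetric/antisymmetric butterfly coded above with vec_add/vec_sub; in scalar form (a sketch):
/* Scalar sketch of the butterfly that opens each pass: x is one row
 * (pass 1) or column (pass 2) of eight coefficients. */
static void fdct_butterfly (const short x[8], short tmp[8])
{
  int k;

  for (k = 0; k < 4; k++) {
    tmp[k]     = x[k] + x[7 - k];   /* tmp0..tmp3: symmetric sums */
    tmp[7 - k] = x[k] - x[7 - k];   /* tmp4..tmp7: antisymmetric differences */
  }
}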
Example n. 17
void DispCmdCylinder::putdata(const float *pos1, const float *pos2, float rad, 
                      int res, int filled, VMDDisplayList *dobj) {

  float lenaxis[3];
  vec_sub(lenaxis, pos1, pos2);  // check that it's valid
  if (dot_prod(lenaxis,lenaxis) == 0.0 || res <= 0) return;

  if (lastres != res ) {
    rot[0] = cosf( (float) VMD_TWOPI / (float) res);
    rot[1] = sinf( (float) VMD_TWOPI / (float) res);
  }
  lastres = res;
  size_t size = (9 + res*3*3)*sizeof(float);

  float *pos = (float *)(dobj->append(DCYLINDER, size));
  if (pos == NULL) 
    return;

  memcpy(pos,pos1,3*sizeof(float));
  memcpy(pos+3,pos2,3*sizeof(float));
  pos[6] = rad;
  pos[7] = (float)res;
  pos[8] = (float)filled;

  float axis[3];
  vec_sub(axis, pos1, pos2);
  vec_normalize(axis);
  int i;  // find an axis not aligned with the cylinder
  if (fabs(axis[0]) < fabs(axis[1]) &&
      fabs(axis[0]) < fabs(axis[2])) {
     i = 0;
  } else if (fabs(axis[1]) < fabs(axis[2])) {
     i = 1;
  } else {
     i = 2;
  }
  float perp[3];
  perp[i] = 0;                    // this is not aligned with the cylinder
  perp[(i+1)%3] = axis[(i+2)%3];
  perp[(i+2)%3] = -axis[(i+1)%3];
  vec_normalize(perp);
  float perp2[3];
  cross_prod(perp2, axis, perp); // find a normal to the cylinder

  float *posptr = pos+9;
  float m = rot[0], n = rot[1];
  for (int h=0; h<res; h++) {
    float tmp0, tmp1, tmp2;
    
    tmp0 = m*perp[0] + n*perp2[0]; // rotated radial direction for this edge
    tmp1 = m*perp[1] + n*perp2[1];
    tmp2 = m*perp[2] + n*perp2[2];

    posptr[0] = tmp0; // store the normal
    posptr[1] = tmp1;
    posptr[2] = tmp2;

    posptr[3] = pos2[0] + rad * tmp0; // start
    posptr[4] = pos2[1] + rad * tmp1;
    posptr[5] = pos2[2] + rad * tmp2;

    posptr[6] = posptr[3] + lenaxis[0];  // and end of the edge
    posptr[7] = posptr[4] + lenaxis[1];
    posptr[8] = posptr[5] + lenaxis[2];
    posptr += 9;
    // use angle addition formulae:
    // cos(A+B) = cos A cos B - sin A sin B
    // sin(A+B) = cos A sin B + sin A cos B
    float mtmp = rot[0]*m - rot[1]*n;
    float ntmp = rot[0]*n + rot[1]*m; 
    m = mtmp;
    n = ntmp;
  }
}
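The loop above never calls cosf/sinf per vertex: it keeps (m, n) = (cos k*theta, sin k*theta) and advances them with the angle-addition formulas quoted in the comment. The same trick in isolation (a sketch; names are illustrative, not VMD's):
#include <math.h>

/* Generate res points on the unit circle from one sinf/cosf pair,
 * using the angle-addition recurrence exactly as putdata() does. */
static void circle_points(int res, float *xy /* 2*res floats */)
{
  const float theta = 6.28318530718f / (float)res;  /* 2*pi / res */
  const float c = cosf(theta), s = sinf(theta);
  float m = c, n = s;               /* (cos theta, sin theta), like rot[] */
  int h;

  for (h = 0; h < res; h++) {
    float mt, nt;
    xy[2*h]   = m;                  /* cos((h+1)*theta) */
    xy[2*h+1] = n;                  /* sin((h+1)*theta) */
    mt = c*m - s*n;                 /* cos(A + theta) */
    nt = c*n + s*m;                 /* sin(A + theta) */
    m = mt; n = nt;
  }
}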
Example n. 18
/* this code assumes stride % 16 == 0 */
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
  register int i;
  
  const vector signed int vzero = vec_splat_s32(0);
  const vector unsigned char permM2 = vec_lvsl(-2, src);
  const vector unsigned char permM1 = vec_lvsl(-1, src);
  const vector unsigned char permP0 = vec_lvsl(+0, src);
  const vector unsigned char permP1 = vec_lvsl(+1, src);
  const vector unsigned char permP2 = vec_lvsl(+2, src);
  const vector unsigned char permP3 = vec_lvsl(+3, src);
  const vector signed short v20ss = (const vector signed short)AVV(20);
  const vector unsigned short v5us = vec_splat_u16(5);
  const vector signed short v5ss = vec_splat_s16(5);
  const vector signed short v16ss = (const vector signed short)AVV(16);
  const vector unsigned char dstperm = vec_lvsr(0, dst);
  const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
  const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);

  register int align = ((((unsigned long)src) - 2) % 16);

  for (i = 0 ; i < 16 ; i ++) {
    vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
    vector unsigned char srcR1 = vec_ld(-2, src);
    vector unsigned char srcR2 = vec_ld(14, src);

    switch (align) {
    default: {
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = vec_perm(srcR1, srcR2, permP2);
      srcP3 = vec_perm(srcR1, srcR2, permP3);
    } break;
    case 11: {
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = vec_perm(srcR1, srcR2, permP2);
      srcP3 = srcR2;
    } break;
    case 12: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = srcR2;
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 13: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = srcR2;
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 14: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = srcR2;
      srcP1 = vec_perm(srcR2, srcR3, permP1);
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 15: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = srcR2;
      srcP0 = vec_perm(srcR2, srcR3, permP0);
      srcP1 = vec_perm(srcR2, srcR3, permP1);
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    }

    const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
    const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
    const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
    const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);

    const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
    const vector signed short srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);
    const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
    const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);

    const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
    const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
    const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
    const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);

    const vector signed short sum1A = vec_adds(srcP0A, srcP1A);
    const vector signed short sum1B = vec_adds(srcP0B, srcP1B);
    const vector signed short sum2A = vec_adds(srcM1A, srcP2A);
    const vector signed short sum2B = vec_adds(srcM1B, srcP2B);
    const vector signed short sum3A = vec_adds(srcM2A, srcP3A);
    const vector signed short sum3B = vec_adds(srcM2B, srcP3B);
    
    const vector signed short pp1A = vec_mladd(sum1A, v20ss, v16ss);
    const vector signed short pp1B = vec_mladd(sum1B, v20ss, v16ss);

    const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
    const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
    
    const vector signed short pp3A = vec_add(sum3A, pp1A);
    const vector signed short pp3B = vec_add(sum3B, pp1B);

    const vector signed short psumA = vec_sub(pp3A, pp2A);
    const vector signed short psumB = vec_sub(pp3B, pp2B);

    const vector signed short sumA = vec_sra(psumA, v5us);
    const vector signed short sumB = vec_sra(psumB, v5us);

    const vector unsigned char sum = vec_packsu(sumA, sumB);

    const vector unsigned char dst1 = vec_ld(0, dst);
    const vector unsigned char dst2 = vec_ld(16, dst);
    const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

    vector unsigned char fsum;
    OP_U8_ALTIVEC(fsum, sum, vdst);

    const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
    const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
    const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);

    vec_st(fdst1, 0, dst);
    vec_st(fdst2, 16, dst);

    src += srcStride;
    dst += dstStride;
  }
POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}
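The permute and merge machinery above implements H.264's six-tap (1, -5, 20, 20, -5, 1) half-pel filter with rounding and unsigned saturation (vec_packsu); OP_U8_ALTIVEC then either stores or averages with dst, depending on the put/avg variant. One output row in scalar form (a sketch; assumes src has two readable pixels to the left and three to the right):
#include <stdint.h>

static inline uint8_t clip_uint8(int v)
{
  return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v);
}

static void h264_h_lowpass_row_ref(uint8_t *dst, const uint8_t *src, int width)
{
  int x;

  for (x = 0; x < width; x++) {
    int sum = 20 * (src[x] + src[x + 1])
            -  5 * (src[x - 1] + src[x + 2])
            +      (src[x - 2] + src[x + 3]);
    dst[x] = clip_uint8((sum + 16) >> 5);
  }
}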
Example n. 19
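The body of test1 below is checked twice by FileCheck: the plain CHECK prefix covers big-endian lowering and CHECK-LE the little-endian one, where vec_perm-based operations gain an extra xor that flips the permute mask. The file's RUN header is not part of this excerpt; a plausible reconstruction (hypothetical, for illustration only) looks like:
// RUN: %clang_cc1 -triple powerpc64-unknown-unknown -target-feature +vsx \
// RUN:   -emit-llvm %s -o - | FileCheck %s
// RUN: %clang_cc1 -triple powerpc64le-unknown-unknown -target-feature +vsx \
// RUN:   -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-LE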
void test1() {
// CHECK-LABEL: define void @test1
// CHECK-LE-LABEL: define void @test1

  res_vf = vec_abs(vf);
// CHECK: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}})
// CHECK-LE: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}})

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_add(vd, vd);
// CHECK: fadd <2 x double>
// CHECK-LE: fadd <2 x double>

  res_vd = vec_and(vbll, vd);
// CHECK: and <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>
// CHECK-LE: and <2 x i64>
// CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>

  res_vd = vec_and(vd, vbll);
// CHECK: and <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>
// CHECK-LE: and <2 x i64>
// CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>

  res_vd = vec_and(vd, vd);
// CHECK: and <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>
// CHECK-LE: and <2 x i64>
// CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_andc(vbll, vd);
// CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64>
// CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1>
// CHECK: and <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>
// CHECK-LE: bitcast <2 x double> %{{[0-9]*}} to <2 x i64>
// CHECK-LE: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1>
// CHECK-LE: and <2 x i64>
// CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_andc(vd, vbll);
// CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64>
// CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1>
// CHECK: and <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>
// CHECK-LE: bitcast <2 x double> %{{[0-9]*}} to <2 x i64>
// CHECK-LE: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1>
// CHECK-LE: and <2 x i64>
// CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_andc(vd, vd);
// CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64>
// CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1>
// CHECK: and <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>
// CHECK-LE: bitcast <2 x double> %{{[0-9]*}} to <2 x i64>
// CHECK-LE: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1>
// CHECK-LE: and <2 x i64>
// CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_ceil(vd);
// CHECK: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}})
// CHECK-LE: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}})

  res_vf = vec_ceil(vf);
// CHECK: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}})
// CHECK-LE: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}})

  res_vbll = vec_cmpeq(vd, vd);
// CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})
// CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})

  res_vbi = vec_cmpeq(vf, vf);
// CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})
// CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})

  res_vbll = vec_cmpge(vd, vd);
// CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})
// CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})

  res_vbi = vec_cmpge(vf, vf);
// CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})
// CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})

  res_vbll = vec_cmpgt(vd, vd);
// CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})
// CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})

  res_vbi = vec_cmpgt(vf, vf);
// CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})
// CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})

  res_vbll = vec_cmple(vd, vd);
// CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})
// CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})

  res_vbi = vec_cmple(vf, vf);
// CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})
// CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})

  res_vbll = vec_cmplt(vd, vd);
// CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})
// CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})

  res_vbi = vec_cmplt(vf, vf);
// CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})
// CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})

  /* vec_cpsgn */
  res_vf = vec_cpsgn(vf, vf);
// CHECK: call <4 x float> @llvm.copysign.v4f32(<4 x float> %{{.+}}, <4 x float> %{{.+}})
// CHECK-LE: call <4 x float> @llvm.copysign.v4f32(<4 x float> %{{.+}}, <4 x float> %{{.+}})

  res_vd = vec_cpsgn(vd, vd);
// CHECK: call <2 x double> @llvm.copysign.v2f64(<2 x double> %{{.+}}, <2 x double> %{{.+}})
// CHECK-LE: call <2 x double> @llvm.copysign.v2f64(<2 x double> %{{.+}}, <2 x double> %{{.+}})

  /* vec_div */
  res_vsll = vec_div(vsll, vsll);
// CHECK: sdiv <2 x i64>
// CHECK-LE: sdiv <2 x i64>

  res_vull = vec_div(vull, vull);
// CHECK: udiv <2 x i64>
// CHECK-LE: udiv <2 x i64>

  res_vf = vec_div(vf, vf);
// CHECK: fdiv <4 x float>
// CHECK-LE: fdiv <4 x float>

  res_vd = vec_div(vd, vd);
// CHECK: fdiv <2 x double>
// CHECK-LE: fdiv <2 x double>

  /* vec_max */
  res_vf = vec_max(vf, vf);
// CHECK: @llvm.ppc.vsx.xvmaxsp
// CHECK-LE: @llvm.ppc.vsx.xvmaxsp

  res_vd = vec_max(vd, vd);
// CHECK: @llvm.ppc.vsx.xvmaxdp
// CHECK-LE: @llvm.ppc.vsx.xvmaxdp

  res_vf = vec_vmaxfp(vf, vf);
// CHECK: @llvm.ppc.vsx.xvmaxsp
// CHECK-LE: @llvm.ppc.vsx.xvmaxsp

  /* vec_min */
  res_vf = vec_min(vf, vf);
// CHECK: @llvm.ppc.vsx.xvminsp
// CHECK-LE: @llvm.ppc.vsx.xvminsp

  res_vd = vec_min(vd, vd);
// CHECK: @llvm.ppc.vsx.xvmindp
// CHECK-LE: @llvm.ppc.vsx.xvmindp

  res_vf = vec_vminfp(vf, vf);
// CHECK: @llvm.ppc.vsx.xvminsp
// CHECK-LE: @llvm.ppc.vsx.xvminsp

  res_d = __builtin_vsx_xsmaxdp(d, d);
// CHECK: @llvm.ppc.vsx.xsmaxdp
// CHECK-LE: @llvm.ppc.vsx.xsmaxdp

  res_d = __builtin_vsx_xsmindp(d, d);
// CHECK: @llvm.ppc.vsx.xsmindp
// CHECK-LE: @llvm.ppc.vsx.xsmindp

  /* vec_perm */
  res_vsll = vec_perm(vsll, vsll, vuc);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vull = vec_perm(vull, vull, vuc);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vbll = vec_perm(vbll, vbll, vuc);
// CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>
// CHECK-LE: xor <16 x i8>
// CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>

  res_vf = vec_round(vf);
// CHECK: call <4 x float> @llvm.round.v4f32(<4 x float>
// CHECK-LE: call <4 x float> @llvm.round.v4f32(<4 x float>

  res_vd = vec_round(vd);
// CHECK: call <2 x double> @llvm.round.v2f64(<2 x double>
// CHECK-LE: call <2 x double> @llvm.round.v2f64(<2 x double>

  res_vd = vec_perm(vd, vd, vuc);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vd = vec_splat(vd, 1);
// CHECK: [[T1:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32>
// CHECK: [[T2:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32>
// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>
// CHECK-LE: xor <16 x i8>
// CHECK-LE: [[T1:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32>
// CHECK-LE: [[T2:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32>
// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>

  res_vbll = vec_splat(vbll, 1);
// CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>
// CHECK-LE: xor <16 x i8>
// CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>

  res_vsll =  vec_splat(vsll, 1);
// CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>
// CHECK-LE: xor <16 x i8>
// CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>

  res_vull =  vec_splat(vull, 1);
// CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>
// CHECK-LE: xor <16 x i8>
// CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>

  res_vsi = vec_pack(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vui = vec_pack(vull, vull);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vbi = vec_pack(vbll, vbll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vsll = vec_vperm(vsll, vsll, vuc);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vull = vec_vperm(vull, vull, vuc);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vd = vec_vperm(vd, vd, vuc);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  /* vec_vsx_ld */

  res_vsi = vec_vsx_ld(0, &vsi);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vui = vec_vsx_ld(0, &vui);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vf = vec_vsx_ld (0, &vf);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vsll = vec_vsx_ld(0, &vsll);
// CHECK: @llvm.ppc.vsx.lxvd2x
// CHECK-LE: @llvm.ppc.vsx.lxvd2x

  res_vull = vec_vsx_ld(0, &vull);
// CHECK: @llvm.ppc.vsx.lxvd2x
// CHECK-LE: @llvm.ppc.vsx.lxvd2x

  res_vd = vec_vsx_ld(0, &vd);
// CHECK: @llvm.ppc.vsx.lxvd2x
// CHECK-LE: @llvm.ppc.vsx.lxvd2x

  res_vull = vec_vsx_ld(0, &vull);
// CHECK: @llvm.ppc.vsx.lxvd2x
// CHECK-LE: @llvm.ppc.vsx.lxvd2x

  res_vd = vec_vsx_ld(0, &vd);
// CHECK: @llvm.ppc.vsx.lxvd2x
// CHECK-LE: @llvm.ppc.vsx.lxvd2x

  res_vss = vec_vsx_ld(0, &vss);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vss = vec_vsx_ld(0, &ss);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vus = vec_vsx_ld(0, &vus);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vus = vec_vsx_ld(0, &us);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vbc = vec_vsx_ld(0, &vbc);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vsc = vec_vsx_ld(0, &vsc);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vuc = vec_vsx_ld(0, &vuc);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vsc = vec_vsx_ld(0, &sc);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vuc = vec_vsx_ld(0, &uc);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  /* vec_vsx_st */

  vec_vsx_st(vsi, 0, &res_vsi);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vsi, 0, &res_si);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vui, 0, &res_vui);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vui, 0, &res_ui);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vf, 0, &res_vf);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vsll, 0, &res_vsll);
// CHECK: @llvm.ppc.vsx.stxvd2x
// CHECK-LE: @llvm.ppc.vsx.stxvd2x

  vec_vsx_st(vull, 0, &res_vull);
// CHECK: @llvm.ppc.vsx.stxvd2x
// CHECK-LE: @llvm.ppc.vsx.stxvd2x

  vec_vsx_st(vd, 0, &res_vd);
// CHECK: @llvm.ppc.vsx.stxvd2x
// CHECK-LE: @llvm.ppc.vsx.stxvd2x

  vec_vsx_st(vss, 0, &res_vss);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vss, 0, &res_ss);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vus, 0, &res_vus);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vus, 0, &res_us);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vsc, 0, &res_vsc);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vsc, 0, &res_sc);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vuc, 0, &res_vuc);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vuc, 0, &res_uc);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vbc, 0, &res_vbc);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vbc, 0, &res_sc);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vbc, 0, &res_uc);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  /* vec_and */
  res_vsll = vec_and(vsll, vsll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vsll = vec_and(vbll, vsll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vsll = vec_and(vsll, vbll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_and(vull, vull);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_and(vbll, vull);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_and(vull, vbll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vbll = vec_and(vbll, vbll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  /* vec_vand */
  res_vsll = vec_vand(vsll, vsll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vsll = vec_vand(vbll, vsll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vsll = vec_vand(vsll, vbll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_vand(vull, vull);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_vand(vbll, vull);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_vand(vull, vbll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vbll = vec_vand(vbll, vbll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  /* vec_andc */
  res_vsll = vec_andc(vsll, vsll);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>
// CHECK-LE: xor <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vsll = vec_andc(vbll, vsll);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>
// CHECK-LE: xor <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vsll = vec_andc(vsll, vbll);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>
// CHECK-LE: xor <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_andc(vull, vull);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>
// CHECK-LE: xor <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_andc(vbll, vull);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>
// CHECK-LE: xor <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_andc(vull, vbll);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>
// CHECK-LE: xor <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vbll = vec_andc(vbll, vbll);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>
// CHECK-LE: xor <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vf = vec_floor(vf);
// CHECK: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}})
// CHECK-LE: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}})

  res_vd = vec_floor(vd);
// CHECK: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}})
// CHECK-LE: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}})

  res_vf = vec_madd(vf, vf, vf);
// CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}})
// CHECK-LE: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}})

  res_vd = vec_madd(vd, vd, vd);
// CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}})
// CHECK-LE: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}})

  /* vec_mergeh */
  res_vsll = vec_mergeh(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vsll = vec_mergeh(vsll, vbll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vsll = vec_mergeh(vbll, vsll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vull = vec_mergeh(vull, vull);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vull = vec_mergeh(vull, vbll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vull = vec_mergeh(vbll, vull);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  /* vec_mergel */
  res_vsll = vec_mergel(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vsll = vec_mergel(vsll, vbll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vsll = vec_mergel(vbll, vsll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vull = vec_mergel(vull, vull);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vull = vec_mergel(vull, vbll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vull = vec_mergel(vbll, vull);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  /* vec_msub */
  res_vf = vec_msub(vf, vf, vf);
// CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}}
// CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float>
// CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}}
// CHECK-LE-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float>

  res_vd = vec_msub(vd, vd, vd);
// CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}}
// CHECK-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double>
// CHECK-LE: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}}
// CHECK-LE-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double>

  res_vsll = vec_mul(vsll, vsll);
// CHECK: mul <2 x i64>
// CHECK-LE: mul <2 x i64>

  res_vull = vec_mul(vull, vull);
// CHECK: mul <2 x i64>
// CHECK-LE: mul <2 x i64>

  res_vf = vec_mul(vf, vf);
// CHECK: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}}

  res_vd = vec_mul(vd, vd);
// CHECK: fmul <2 x double> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: fmul <2 x double> %{{[0-9]+}}, %{{[0-9]+}}

  res_vf = vec_nearbyint(vf);
// CHECK: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}})
// CHECK-LE: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}})

  res_vd = vec_nearbyint(vd);
// CHECK: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}})
// CHECK-LE: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}})

  res_vf = vec_nmadd(vf, vf, vf);
// CHECK: [[FM:[0-9]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}})
// CHECK-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]]
// CHECK-LE: [[FM:[0-9]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}})
// CHECK-LE-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]]

  res_vd = vec_nmadd(vd, vd, vd);
// CHECK: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}})
// CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]]
// CHECK-LE: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}})
// CHECK-LE-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]]

  res_vf = vec_nmsub(vf, vf, vf);
// CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}}
// CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float>
// CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}}
// CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}}
// CHECK-LE-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float>
// CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}}

  res_vd = vec_nmsub(vd, vd, vd);
// CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}}
// CHECK-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double>
// CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]]
// CHECK-LE: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}}
// CHECK-LE-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double>
// CHECK-LE-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]]

  /* vec_nor */
  res_vsll = vec_nor(vsll, vsll);
// CHECK: or <2 x i64>
// CHECK: xor <2 x i64>
// CHECK-LE: or <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vull = vec_nor(vull, vull);
// CHECK: or <2 x i64>
// CHECK: xor <2 x i64>
// CHECK-LE: or <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vull = vec_nor(vbll, vbll);
// CHECK: or <2 x i64>
// CHECK: xor <2 x i64>
// CHECK-LE: or <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vd = vec_nor(vd, vd);
// CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1>
// CHECK-LE: bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK-LE: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1>

  /* vec_or */
  res_vsll = vec_or(vsll, vsll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vsll = vec_or(vbll, vsll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vsll = vec_or(vsll, vbll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vull = vec_or(vull, vull);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vull = vec_or(vbll, vull);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vull = vec_or(vull, vbll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vbll = vec_or(vbll, vbll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vd = vec_or(vd, vd);
// CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK-LE: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}

  res_vd = vec_or(vbll, vd);
// CHECK: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK: [[T2:%.+]] = or <2 x i64> %{{[0-9]+}}, [[T1]]
// CHECK: bitcast <2 x i64> [[T2]] to <2 x double>
// CHECK-LE: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK-LE: [[T2:%.+]] = or <2 x i64> %{{[0-9]+}}, [[T1]]
// CHECK-LE: bitcast <2 x i64> [[T2]] to <2 x double>

  res_vd = vec_or(vd, vbll);
// CHECK: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK: [[T2:%.+]] = or <2 x i64> [[T1]], %{{[0-9]+}}
// CHECK: bitcast <2 x i64> [[T2]] to <2 x double>
// CHECK-LE: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK-LE: [[T2:%.+]] = or <2 x i64> [[T1]], %{{[0-9]+}}
// CHECK-LE: bitcast <2 x i64> [[T2]] to <2 x double>

  res_vf = vec_re(vf);
// CHECK: call <4 x float> @llvm.ppc.vsx.xvresp(<4 x float>
// CHECK-LE: call <4 x float> @llvm.ppc.vsx.xvresp(<4 x float>

  res_vd = vec_re(vd);
// CHECK: call <2 x double> @llvm.ppc.vsx.xvredp(<2 x double>
// CHECK-LE: call <2 x double> @llvm.ppc.vsx.xvredp(<2 x double>

  res_vf = vec_rint(vf);
// CHECK: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}})
// CHECK-LE: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}})

  res_vd = vec_rint(vd);
// CHECK: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}})
// CHECK-LE: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}})

  res_vf = vec_rsqrte(vf);
// CHECK: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}})
// CHECK-LE: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}})

  res_vd = vec_rsqrte(vd);
// CHECK: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}})
// CHECK-LE: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}})

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vf = vec_sel(vd, vd, vbll);
// CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1>
// CHECK: and <2 x i64> %{{[0-9]+}},
// CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK: or <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double>
// CHECK-LE: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1>
// CHECK-LE: and <2 x i64> %{{[0-9]+}},
// CHECK-LE: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: or <2 x i64>
// CHECK-LE: bitcast <2 x i64> %{{[0-9]+}} to <2 x double>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_sel(vd, vd, vull);
// CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1>
// CHECK: and <2 x i64> %{{[0-9]+}},
// CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK: or <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double>
// CHECK-LE: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1>
// CHECK-LE: and <2 x i64> %{{[0-9]+}},
// CHECK-LE: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: or <2 x i64>
// CHECK-LE: bitcast <2 x i64> %{{[0-9]+}} to <2 x double>

  res_vf = vec_sqrt(vf);
// CHECK: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}})
// CHECK-LE: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}})

  res_vd = vec_sqrt(vd);
// CHECK: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}})
// CHECK-LE: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}})

  res_vd = vec_sub(vd, vd);
// CHECK: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}}

  res_vf = vec_trunc(vf);
// CHECK: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}})
// CHECK-LE: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}})

  res_vd = vec_trunc(vd);
// CHECK: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}})
// CHECK-LE: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}})

  /* vec_vor */
  res_vsll = vec_vor(vsll, vsll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vsll = vec_vor(vbll, vsll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vsll = vec_vor(vsll, vbll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vull = vec_vor(vull, vull);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vull = vec_vor(vbll, vull);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vull = vec_vor(vull, vbll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vbll = vec_vor(vbll, vbll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  /* vec_xor */
  res_vsll = vec_xor(vsll, vsll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vsll = vec_xor(vbll, vsll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vsll = vec_xor(vsll, vbll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vull = vec_xor(vull, vull);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vull = vec_xor(vbll, vull);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vull = vec_xor(vull, vbll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vbll = vec_xor(vbll, vbll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_xor(vd, vd);
// CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK: bitcast <2 x i64> [[X1]] to <2 x double>
// CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_xor(vd, vbll);
// CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK: bitcast <2 x i64> [[X1]] to <2 x double>
// CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_xor(vbll, vd);
// CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK: bitcast <2 x i64> [[X1]] to <2 x double>
// CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double>

  /* vec_vxor */
  res_vsll = vec_vxor(vsll, vsll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vsll = vec_vxor(vbll, vsll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vsll = vec_vxor(vsll, vbll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vull = vec_vxor(vull, vull);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vull = vec_vxor(vbll, vull);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vull = vec_vxor(vull, vbll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vbll = vec_vxor(vbll, vbll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vsll = vec_cts(vd, 0);
// CHECK: fmul <2 x double>
// CHECK: fptosi <2 x double> %{{.*}} to <2 x i64>
// CHECK-LE: fmul <2 x double>
// CHECK-LE: fptosi <2 x double> %{{.*}} to <2 x i64>

  res_vsll = vec_cts(vd, 31);
// CHECK: fmul <2 x double>
// CHECK: fptosi <2 x double> %{{.*}} to <2 x i64>
// CHECK-LE: fmul <2 x double>
// CHECK-LE: fptosi <2 x double> %{{.*}} to <2 x i64>

  res_vsll = vec_ctu(vd, 0);
// CHECK: fmul <2 x double>
// CHECK: fptoui <2 x double> %{{.*}} to <2 x i64>
// CHECK-LE: fmul <2 x double>
// CHECK-LE: fptoui <2 x double> %{{.*}} to <2 x i64>

  res_vsll = vec_ctu(vd, 31);
// CHECK: fmul <2 x double>
// CHECK: fptoui <2 x double> %{{.*}} to <2 x i64>
// CHECK-LE: fmul <2 x double>
// CHECK-LE: fptoui <2 x double> %{{.*}} to <2 x i64>

  res_vd = vec_ctf(vsll, 0);
// CHECK: sitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK: fmul <2 x double>
// CHECK-LE: sitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK-LE: fmul <2 x double>

  res_vd = vec_ctf(vsll, 31);
// CHECK: sitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK: fmul <2 x double>
// CHECK-LE: sitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK-LE: fmul <2 x double>

  res_vd = vec_ctf(vull, 0);
// CHECK: uitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK: fmul <2 x double>
// CHECK-LE: uitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK-LE: fmul <2 x double>

  res_vd = vec_ctf(vull, 31);
// CHECK: uitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK: fmul <2 x double>
// CHECK-LE: uitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK-LE: fmul <2 x double>
}
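The vec_cts/vec_ctu/vec_ctf checks at the end pair every conversion with an fmul because the second argument is a binary scale factor: to-integer conversions multiply by 2^b before truncating, to-float conversions divide by 2^b afterwards. A scalar model of that semantics (a sketch, b in [0, 31]):
#include <stdint.h>

static int64_t cts_model(double d, unsigned b)
{
  return (int64_t)(d * (double)(1ULL << b));   /* fmul, then fptosi */
}

static double ctf_model(int64_t v, unsigned b)
{
  return (double)v / (double)(1ULL << b);      /* sitofp, then fmul */
}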
Example n. 20
/* this code assumes stride % 16 == 0 */
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);
  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
  
  register int i;

  const vector signed int vzero = vec_splat_s32(0);
  const vector unsigned char perm = vec_lvsl(0, src);
  const vector signed short v20ss = (const vector signed short)AVV(20);
  const vector unsigned short v5us = vec_splat_u16(5);
  const vector signed short v5ss = vec_splat_s16(5);
  const vector signed short v16ss = (const vector signed short)AVV(16);
  const vector unsigned char dstperm = vec_lvsr(0, dst);
  const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
  const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
  
  uint8_t *srcbis = src - (srcStride * 2);

  const vector unsigned char srcM2a = vec_ld(0, srcbis);
  const vector unsigned char srcM2b = vec_ld(16, srcbis);
  const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm);
  srcbis += srcStride;
  const vector unsigned char srcM1a = vec_ld(0, srcbis);
  const vector unsigned char srcM1b = vec_ld(16, srcbis);
  const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm);
  srcbis += srcStride;
  const vector unsigned char srcP0a = vec_ld(0, srcbis);
  const vector unsigned char srcP0b = vec_ld(16, srcbis);
  const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm);
  srcbis += srcStride;
  const vector unsigned char srcP1a = vec_ld(0, srcbis);
  const vector unsigned char srcP1b = vec_ld(16, srcbis);
  const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm);
  srcbis += srcStride;
  const vector unsigned char srcP2a = vec_ld(0, srcbis);
  const vector unsigned char srcP2b = vec_ld(16, srcbis);
  const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm);
  srcbis += srcStride;

  vector signed short srcM2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
  vector signed short srcM2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);
  vector signed short srcM1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
  vector signed short srcM1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
  vector signed short srcP0ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
  vector signed short srcP0ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
  vector signed short srcP1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
  vector signed short srcP1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
  vector signed short srcP2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
  vector signed short srcP2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);

  for (i = 0 ; i < 16 ; i++) {
    const vector unsigned char srcP3a = vec_ld(0, srcbis);
    const vector unsigned char srcP3b = vec_ld(16, srcbis);
    const vector unsigned char srcP3 = vec_perm(srcP3a, srcP3b, perm);
    const vector signed short srcP3ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
    const vector signed short srcP3ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
    srcbis += srcStride;

    const vector signed short sum1A = vec_adds(srcP0ssA, srcP1ssA);
    const vector signed short sum1B = vec_adds(srcP0ssB, srcP1ssB);
    const vector signed short sum2A = vec_adds(srcM1ssA, srcP2ssA);
    const vector signed short sum2B = vec_adds(srcM1ssB, srcP2ssB);
    const vector signed short sum3A = vec_adds(srcM2ssA, srcP3ssA);
    const vector signed short sum3B = vec_adds(srcM2ssB, srcP3ssB);

    srcM2ssA = srcM1ssA;
    srcM2ssB = srcM1ssB;
    srcM1ssA = srcP0ssA;
    srcM1ssB = srcP0ssB;
    srcP0ssA = srcP1ssA;
    srcP0ssB = srcP1ssB;
    srcP1ssA = srcP2ssA;
    srcP1ssB = srcP2ssB;
    srcP2ssA = srcP3ssA;
    srcP2ssB = srcP3ssB;
    
    const vector signed short pp1A = vec_mladd(sum1A, v20ss, v16ss);
    const vector signed short pp1B = vec_mladd(sum1B, v20ss, v16ss);

    const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
    const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
    
    const vector signed short pp3A = vec_add(sum3A, pp1A);
    const vector signed short pp3B = vec_add(sum3B, pp1B);

    const vector signed short psumA = vec_sub(pp3A, pp2A);
    const vector signed short psumB = vec_sub(pp3B, pp2B);

    const vector signed short sumA = vec_sra(psumA, v5us);
    const vector signed short sumB = vec_sra(psumB, v5us);

    const vector unsigned char sum = vec_packsu(sumA, sumB);

    const vector unsigned char dst1 = vec_ld(0, dst);
    const vector unsigned char dst2 = vec_ld(16, dst);
    const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

    vector unsigned char fsum;
    OP_U8_ALTIVEC(fsum, sum, vdst);

    const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
    const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
    const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);

    vec_st(fdst1, 0, dst);
    vec_st(fdst2, 16, dst);

    dst += dstStride;
  }
  POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}
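Unlike the horizontal filter, the vertical one touches each source row only once: five rows stay live in registers (srcM2ss..srcP2ss, split into A/B halves), a sixth (srcP3) is loaded per iteration, and the block of assignments mid-loop slides the window down one row. A scalar sketch of one step:
/* w[0]=row-2 .. w[4]=row+2 stay live; p3 is the freshly loaded row+3. */
static int h264_v_lowpass_step(int w[5], int p3)
{
  int out = (20 * (w[2] + w[3]) - 5 * (w[1] + w[4]) + w[0] + p3 + 16) >> 5;
  int k;

  for (k = 0; k < 4; k++)
    w[k] = w[k + 1];       /* slide the six-tap window down one row */
  w[4] = p3;
  return out;              /* caller still clamps to [0, 255] */
}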
Example n. 21
void
nb_kernel310_ppc_altivec  (int *             p_nri,
                       int               iinr[],
                       int               jindex[],
                       int               jjnr[],
                       int               shift[],
                       float             shiftvec[],
                       float             fshift[],
                       int               gid[],
                       float             pos[],
                       float             faction[],
                       float             charge[],
                       float *           p_facel,
                       float *           p_krf,
                       float *           p_crf,
                       float             Vc[],
                       int               type[],
                       int *             p_ntype,
                       float             vdwparam[],
                       float             Vvdw[],
                       float *           p_tabscale,
                       float             VFtab[],
                       float             invsqrta[],
                       float             dvda[],
                       float *           p_gbtabscale,
                       float             GBtab[],
                       int *             p_nthreads,
                       int *             count,
                       void *            mtx,
                       int *             outeriter,
                       int *             inneriter,
                       float *           work)
{
	vector float ix,iy,iz,shvec;
	vector float vfacel,tsc,fs,fs2,nul;
	vector float dx,dy,dz;
	vector float Vvdwtot,vctot,qq,iq,c6,c12,VVc,FFc;
	vector float fix,fiy,fiz;
	vector float tmp1,tmp2,tmp3,tmp4;
	vector float rinv,r,rinvsq,rsq,rinvsix,Vvdw6,Vvdw12;

	int n,k,ii,is3,ii3,ntiA,nj0,nj1;
	int jnra,jnrb,jnrc,jnrd;
	int j3a,j3b,j3c,j3d;
	int nri, ntype, nouter, ninner;
	int tja,tjb,tjc,tjd;
#ifdef GMX_THREADS
	int nthreads,nn0,nn1;
#endif

    nouter   = 0;
    ninner   = 0;
    nri      = *p_nri;
    ntype    = *p_ntype;
	nul=vec_zero();
	vfacel=load_float_and_splat(p_facel);
	tsc=load_float_and_splat(p_tabscale);

#ifdef GMX_THREADS
    nthreads = *p_nthreads;
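	/* dynamic load balancing: each thread repeatedly grabs a shrinking
	   chunk [nn0,nn1) of the nri outer-loop entries under the mutex,
	   until the matching while(nn1<nri) below sees the list exhausted */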
	do {
		gmx_thread_mutex_lock((gmx_thread_mutex_t *)mtx);
		nn0              = *count;
		nn1              = nn0+(nri-nn0)/(2*nthreads)+3;
		*count           = nn1;
		gmx_thread_mutex_unlock((gmx_thread_mutex_t *)mtx);
		if(nn1>nri) nn1=nri;
		for(n=nn0; (n<nn1); n++) {
#if 0
		} /* maintain correct indentation even with conditional left braces */
#endif
#else /* without gmx_threads */
		for(n=0;n<nri;n++) {
#endif  
			is3        = 3*shift[n];
			shvec      = load_xyz(shiftvec+is3);
			ii         = iinr[n];
			ii3        = 3*ii;
			ix         = load_xyz(pos+ii3);
			Vvdwtot     = nul;
			vctot      = nul;
			fix        = nul;
			fiy        = nul;
			fiz        = nul;
			ix         = vec_add(ix,shvec);
			nj0        = jindex[n];
			nj1        = jindex[n+1];
			splat_xyz_to_vectors(ix,&ix,&iy,&iz);
			ntiA       = 2*ntype*type[ii];
			iq        = vec_madd(load_float_and_splat(charge+ii),vfacel,nul);

			for(k=nj0; k<(nj1-3); k+=4) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				jnrc            = jjnr[k+2];
				jnrd            = jjnr[k+3];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				j3c             = 3*jnrc;
				j3d             = 3*jnrd;
				transpose_4_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),
								 load_xyz(pos+j3c),
								 load_xyz(pos+j3d),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				rinv            = do_invsqrt(rsq);
				rinvsq          = vec_madd(rinv,rinv,nul);
				r               = vec_madd(rinv,rsq,nul);
				rinvsix         = vec_madd(rinvsq,rinvsq,nul);
				rinvsix         = vec_madd(rinvsix,rinvsq,nul);
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				tjc             = ntiA+2*type[jnrc];
				tjd             = ntiA+2*type[jnrd];
				qq = vec_madd(load_4_float(charge+jnra,charge+jnrb,
										   charge+jnrc,charge+jnrd),iq,nul);
				load_4_pair(vdwparam+tja,vdwparam+tjb,vdwparam+tjc,vdwparam+tjd,&c6,&c12);
				do_4_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc);
				fs2             = vec_madd(qq,FFc,nul);   /* fijC */
				vctot           = vec_madd(qq,VVc,vctot);
				Vvdw6            = vec_madd(c6,rinvsix,nul);
				Vvdw12           = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),
										   nul);
				fs              = vec_madd(vec_twelve(),Vvdw12,nul);
				fs              = vec_nmsub(vec_six(),Vvdw6,fs);
				fs              = vec_madd(fs,rinv,nul);
				Vvdwtot          = vec_add(Vvdwtot,Vvdw12);
				fs              = vec_nmsub(fs2,tsc,fs);
				fs              = vec_madd(fs,rinv,nul);
				Vvdwtot          = vec_sub(Vvdwtot,Vvdw6);
				fix             = vec_madd(fs,dx,fix); /* +=fx */
				fiy             = vec_madd(fs,dy,fiy); /* +=fy */
				fiz             = vec_madd(fs,dz,fiz); /* +=fz */
				dx              = vec_nmsub(dx,fs,nul); /* -fx */
				dy              = vec_nmsub(dy,fs,nul); /* -fy */
				dz              = vec_nmsub(dz,fs,nul); /* -fz */
				transpose_3_to_4(dx,dy,dz,&tmp1,&tmp2,&tmp3,&tmp4);
				add_xyz_to_mem(faction+j3a,tmp1);
				add_xyz_to_mem(faction+j3b,tmp2);
				add_xyz_to_mem(faction+j3c,tmp3);
				add_xyz_to_mem(faction+j3d,tmp4);
			}
			if(k<(nj1-1)) {
				jnra            = jjnr[k];
				jnrb            = jjnr[k+1];
				j3a             = 3*jnra;
				j3b             = 3*jnrb;
				transpose_2_to_3(load_xyz(pos+j3a),
								 load_xyz(pos+j3b),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				zero_highest_2_elements_in_vector(&rsq);
				rinv            = do_invsqrt(rsq);
				zero_highest_2_elements_in_vector(&rinv);
				rinvsq          = vec_madd(rinv,rinv,nul);
				r               = vec_madd(rinv,rsq,nul);
				rinvsix         = vec_madd(rinvsq,rinvsq,nul);
				rinvsix         = vec_madd(rinvsix,rinvsq,nul);
				tja             = ntiA+2*type[jnra];
				tjb             = ntiA+2*type[jnrb];
				qq = vec_madd(load_2_float(charge+jnra,charge+jnrb),iq,nul);
				load_2_pair(vdwparam+tja,vdwparam+tjb,&c6,&c12);
				do_2_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc);
				fs2             = vec_madd(qq,FFc,nul);   /* fijC */
				vctot           = vec_madd(qq,VVc,vctot);
				Vvdw6            = vec_madd(c6,rinvsix,nul);
				Vvdw12           = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),
										   nul);
				fs              = vec_madd(vec_twelve(),Vvdw12,nul);
				fs              = vec_nmsub(vec_six(),Vvdw6,fs);
				Vvdwtot          = vec_add(Vvdwtot,Vvdw12);
				fs              = vec_madd(fs,rinv,nul);
				fs              = vec_nmsub(fs2,tsc,fs);
				fs              = vec_madd(fs,rinv,nul);
				Vvdwtot          = vec_sub(Vvdwtot,Vvdw6);
				fix             = vec_madd(fs,dx,fix); /* +=fx */
				fiy             = vec_madd(fs,dy,fiy); /* +=fy */
				fiz             = vec_madd(fs,dz,fiz); /* +=fz */
				dx              = vec_nmsub(dx,fs,nul); /* -fx */
				dy              = vec_nmsub(dy,fs,nul); /* -fy */
				dz              = vec_nmsub(dz,fs,nul); /* -fz */
				transpose_3_to_2(dx,dy,dz,&tmp1,&tmp2);
				add_xyz_to_mem(faction+j3a,tmp1);
				add_xyz_to_mem(faction+j3b,tmp2);
				k              += 2;
			}
			if((nj1-nj0) & 0x1) {
				jnra            = jjnr[k];
				j3a             = 3*jnra;
				transpose_1_to_3(load_xyz(pos+j3a),&dx,&dy,&dz);
				dx              = vec_sub(ix,dx);
				dy              = vec_sub(iy,dy);
				dz              = vec_sub(iz,dz);
				rsq             = vec_madd(dx,dx,nul);
				rsq             = vec_madd(dy,dy,rsq);
				rsq             = vec_madd(dz,dz,rsq);
				zero_highest_3_elements_in_vector(&rsq);
				rinv            = do_invsqrt(rsq);
				zero_highest_3_elements_in_vector(&rinv);
				rinvsq          = vec_madd(rinv,rinv,nul);
				r               = vec_madd(rinv,rsq,nul);
				rinvsix         = vec_madd(rinvsq,rinvsq,nul);
				rinvsix         = vec_madd(rinvsix,rinvsq,nul);
				tja             = ntiA+2*type[jnra];
				qq = vec_madd(load_1_float(charge+jnra),iq,nul);
				load_1_pair(vdwparam+tja,&c6,&c12);
				do_1_ctable_coul(VFtab,vec_madd(r,tsc,nul),&VVc,&FFc);
				fs2             = vec_madd(qq,FFc,nul);   /* fijC */
				vctot           = vec_madd(qq,VVc,vctot);
				Vvdw6            = vec_madd(c6,rinvsix,nul);
				Vvdw12           = vec_madd(c12,vec_madd(rinvsix,rinvsix,nul),
										   nul);
				fs              = vec_madd(vec_twelve(),Vvdw12,nul);
				fs              = vec_nmsub(vec_six(),Vvdw6,fs);
				fs              = vec_madd(fs,rinv,nul);
				Vvdwtot          = vec_add(Vvdwtot,Vvdw12);
				fs              = vec_nmsub(fs2,tsc,fs);
				fs              = vec_madd(fs,rinv,nul);
				Vvdwtot          = vec_sub(Vvdwtot,Vvdw6);
				fix             = vec_madd(fs,dx,fix); /* +=fx */
				fiy             = vec_madd(fs,dy,fiy); /* +=fy */
				fiz             = vec_madd(fs,dz,fiz); /* +=fz */
				dx              = vec_nmsub(dx,fs,nul); /* -fx */
				dy              = vec_nmsub(dy,fs,nul); /* -fy */
				dz              = vec_nmsub(dz,fs,nul); /* -fz */
				transpose_3_to_1(dx,dy,dz,&tmp1);
				add_xyz_to_mem(faction+j3a,tmp1);
			}
			/* update outer data */
			transpose_3_to_4(fix,fiy,fiz,&tmp1,&tmp2,&tmp3,&tmp4);
			tmp1 = vec_add(tmp1,tmp3);
			tmp2 = vec_add(tmp2,tmp4);
			tmp1 = vec_add(tmp1,tmp2);

			add_xyz_to_mem(faction+ii3,tmp1);
			add_xyz_to_mem(fshift+is3,tmp1);

			add_vector_to_float(Vc+gid[n],vctot);
			add_vector_to_float(Vvdw+gid[n],Vvdwtot);
			ninner += nj1 - nj0;
		}
#ifdef GMX_THREADS
		nouter += nn1 - nn0;
	} while (nn1<nri);
#else
	nouter = nri;
#endif
	*outeriter = nouter;
	*inneriter = ninner;
}
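/* A scalar sketch (an assumption, not taken from the original) of what the
   vectorized inner loop above computes for one i/j pair: tabulated Coulomb
   plus analytic Lennard-Jones 12-6.  coul_table_lookup() stands in for the
   do_*_ctable_coul() helpers and is hypothetical. */
#include <math.h>

void coul_table_lookup(const float *VFtab, float rt, float *VV, float *FF);

static void nb310_pair(const float *xi, const float *xj,
                       float qq, float c6, float c12,
                       float tabscale, const float *VFtab,
                       float fi[3], float fj[3],
                       float *vctot, float *Vvdwtot)
{
    float dx = xi[0] - xj[0], dy = xi[1] - xj[1], dz = xi[2] - xj[2];
    float rsq    = dx*dx + dy*dy + dz*dz;
    float rinv   = 1.0f / sqrtf(rsq);
    float rinvsq = rinv * rinv;
    float rinv6  = rinvsq * rinvsq * rinvsq;
    float VVc, FFc, Vvdw6, Vvdw12, fs;

    coul_table_lookup(VFtab, rsq * rinv * tabscale, &VVc, &FFc);
    Vvdw6  = c6  * rinv6;
    Vvdw12 = c12 * rinv6 * rinv6;
    *vctot   += qq * VVc;
    *Vvdwtot += Vvdw12 - Vvdw6;
    /* same force magnitude as the vector code above:
       fs = (12*Vvdw12 - 6*Vvdw6)*rinv^2 - qq*FFc*tabscale*rinv */
    fs = (12.0f * Vvdw12 - 6.0f * Vvdw6) * rinvsq
       - qq * FFc * tabscale * rinv;
    fi[0] += fs * dx;  fj[0] -= fs * dx;
    fi[1] += fs * dy;  fj[1] -= fs * dy;
    fi[2] += fs * dz;  fj[2] -= fs * dz;
}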
/* this code assumes stride % 16 == 0 *and* that tmp is properly aligned */
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
  register int i;
  const vector signed int vzero = vec_splat_s32(0);
  const vector unsigned char permM2 = vec_lvsl(-2, src);
  const vector unsigned char permM1 = vec_lvsl(-1, src);
  const vector unsigned char permP0 = vec_lvsl(+0, src);
  const vector unsigned char permP1 = vec_lvsl(+1, src);
  const vector unsigned char permP2 = vec_lvsl(+2, src);
  const vector unsigned char permP3 = vec_lvsl(+3, src);
  const vector signed short v20ss = (const vector signed short)AVV(20);
  const vector unsigned int v10ui = vec_splat_u32(10);
  const vector signed short v5ss = vec_splat_s16(5);
  const vector signed short v1ss = vec_splat_s16(1);
  const vector signed int v512si = (const vector signed int)AVV(512);
  const vector unsigned int v16ui = (const vector unsigned int)AVV(16);

  register int align = ((((unsigned long)src) - 2) % 16);

  src -= (2 * srcStride);

  for (i = 0 ; i < 21 ; i ++) {
    vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
    vector unsigned char srcR1 = vec_ld(-2, src);
    vector unsigned char srcR2 = vec_ld(14, src);

    switch (align) {
    default: {
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = vec_perm(srcR1, srcR2, permP2);
      srcP3 = vec_perm(srcR1, srcR2, permP3);
    } break;
    case 11: {
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = vec_perm(srcR1, srcR2, permP2);
      srcP3 = srcR2;
    } break;
    case 12: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = srcR2;
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 13: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = srcR2;
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 14: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = srcR2;
      srcP1 = vec_perm(srcR2, srcR3, permP1);
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 15: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = srcR2;
      srcP0 = vec_perm(srcR2, srcR3, permP0);
      srcP1 = vec_perm(srcR2, srcR3, permP1);
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    }

    const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
    const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
    const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
    const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);

    const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
    const vector signed short srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);
    const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
    const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);

    const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
    const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
    const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
    const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);

    const vector signed short sum1A = vec_adds(srcP0A, srcP1A);
    const vector signed short sum1B = vec_adds(srcP0B, srcP1B);
    const vector signed short sum2A = vec_adds(srcM1A, srcP2A);
    const vector signed short sum2B = vec_adds(srcM1B, srcP2B);
    const vector signed short sum3A = vec_adds(srcM2A, srcP3A);
    const vector signed short sum3B = vec_adds(srcM2B, srcP3B);
    
    const vector signed short pp1A = vec_mladd(sum1A, v20ss, sum3A);
    const vector signed short pp1B = vec_mladd(sum1B, v20ss, sum3B);

    const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
    const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);

    const vector signed short psumA = vec_sub(pp1A, pp2A);
    const vector signed short psumB = vec_sub(pp1B, pp2B);

    vec_st(psumA, 0, tmp);
    vec_st(psumB, 16, tmp);
    
    src += srcStride;
    tmp += tmpStride; /* tmp is an int16_t* and tmpStride is 16 elements (32 bytes), matching the two 16-byte stores above */
  }
  
  const vector unsigned char dstperm = vec_lvsr(0, dst);
  const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
  const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
  const vector unsigned char mperm = (const vector unsigned char)
    AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
        0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
  
  int16_t *tmpbis = tmp - (tmpStride * 21);

  vector signed short tmpM2ssA = vec_ld(0, tmpbis);
  vector signed short tmpM2ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  vector signed short tmpM1ssA = vec_ld(0, tmpbis);
  vector signed short tmpM1ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  vector signed short tmpP0ssA = vec_ld(0, tmpbis);
  vector signed short tmpP0ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  vector signed short tmpP1ssA = vec_ld(0, tmpbis);
  vector signed short tmpP1ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  vector signed short tmpP2ssA = vec_ld(0, tmpbis);
  vector signed short tmpP2ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;

  for (i = 0 ; i < 16 ; i++) {
    const vector signed short tmpP3ssA = vec_ld(0, tmpbis);
    const vector signed short tmpP3ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
    const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
    const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
    const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
    const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
    const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

    tmpM2ssA = tmpM1ssA;
    tmpM2ssB = tmpM1ssB;
    tmpM1ssA = tmpP0ssA;
    tmpM1ssB = tmpP0ssB;
    tmpP0ssA = tmpP1ssA;
    tmpP0ssB = tmpP1ssB;
    tmpP1ssA = tmpP2ssA;
    tmpP1ssB = tmpP2ssB;
    tmpP2ssA = tmpP3ssA;
    tmpP2ssB = tmpP3ssB;

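    /* The second pass needs 32-bit precision: vec_mule/vec_mulo form the
       even/odd 16x16->32 products, sum3 is widened by an arithmetic shift
       (even lanes) and a multiply by one (odd lanes), and the even/odd
       lanes are re-interleaved at the end via mperm. */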
    const vector signed int pp1Ae = vec_mule(sum1A, v20ss);
    const vector signed int pp1Ao = vec_mulo(sum1A, v20ss);
    const vector signed int pp1Be = vec_mule(sum1B, v20ss);
    const vector signed int pp1Bo = vec_mulo(sum1B, v20ss);

    const vector signed int pp2Ae = vec_mule(sum2A, v5ss);
    const vector signed int pp2Ao = vec_mulo(sum2A, v5ss);
    const vector signed int pp2Be = vec_mule(sum2B, v5ss);
    const vector signed int pp2Bo = vec_mulo(sum2B, v5ss);

    const vector signed int pp3Ae = vec_sra((vector signed int)sum3A, v16ui);
    const vector signed int pp3Ao = vec_mulo(sum3A, v1ss);
    const vector signed int pp3Be = vec_sra((vector signed int)sum3B, v16ui);
    const vector signed int pp3Bo = vec_mulo(sum3B, v1ss);

    const vector signed int pp1cAe = vec_add(pp1Ae, v512si);
    const vector signed int pp1cAo = vec_add(pp1Ao, v512si);
    const vector signed int pp1cBe = vec_add(pp1Be, v512si);
    const vector signed int pp1cBo = vec_add(pp1Bo, v512si);

    const vector signed int pp32Ae = vec_sub(pp3Ae, pp2Ae);
    const vector signed int pp32Ao = vec_sub(pp3Ao, pp2Ao);
    const vector signed int pp32Be = vec_sub(pp3Be, pp2Be);
    const vector signed int pp32Bo = vec_sub(pp3Bo, pp2Bo);

    const vector signed int sumAe = vec_add(pp1cAe, pp32Ae);
    const vector signed int sumAo = vec_add(pp1cAo, pp32Ao);
    const vector signed int sumBe = vec_add(pp1cBe, pp32Be);
    const vector signed int sumBo = vec_add(pp1cBo, pp32Bo);
    
    const vector signed int ssumAe = vec_sra(sumAe, v10ui);
    const vector signed int ssumAo = vec_sra(sumAo, v10ui);
    const vector signed int ssumBe = vec_sra(sumBe, v10ui);
    const vector signed int ssumBo = vec_sra(sumBo, v10ui);

    const vector signed short ssume = vec_packs(ssumAe, ssumBe);
    const vector signed short ssumo = vec_packs(ssumAo, ssumBo);

    const vector unsigned char sumv = vec_packsu(ssume, ssumo);
    const vector unsigned char sum = vec_perm(sumv, sumv, mperm);

    const vector unsigned char dst1 = vec_ld(0, dst);
    const vector unsigned char dst2 = vec_ld(16, dst);
    const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

    vector unsigned char fsum;
    OP_U8_ALTIVEC(fsum, sum, vdst);

    const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
    const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
    const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);

    vec_st(fdst1, 0, dst);
    vec_st(fdst2, 16, dst);

    dst += dstStride;
  }
  POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}
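/* Scalar sketch (an assumption, not the original) of the two-pass filter
   above: pass 1 stores the raw horizontal 6-tap sums in tmp without scaling,
   pass 2 applies the same 6-tap filter vertically to tmp and normalizes once
   with (sum + 512) >> 10, since each pass contributes a factor of 32.
   The put/avg OP_U8_ALTIVEC step is again omitted. */
#include <stdint.h>

static void qpel16_hv_lowpass_scalar(uint8_t *dst, int16_t *tmp,
                                     const uint8_t *src, int dstStride,
                                     int tmpStride, int srcStride) {
    int i, j;
    src -= 2 * srcStride;                /* five extra rows for the taps */
    for (j = 0; j < 21; j++) {
        for (i = 0; i < 16; i++) {
            const uint8_t *s = src + i;
            tmp[i] = (int16_t)(20 * (s[0] + s[1]) - 5 * (s[-1] + s[2])
                               + (s[-2] + s[3]));
        }
        src += srcStride;
        tmp += tmpStride;
    }
    tmp -= 21 * tmpStride;
    for (j = 0; j < 16; j++) {
        for (i = 0; i < 16; i++) {
            const int16_t *t = tmp + i + (j + 2) * tmpStride;
            int v = (20 * (t[0] + t[tmpStride])
                    - 5 * (t[-tmpStride] + t[2 * tmpStride])
                    +     (t[-2 * tmpStride] + t[3 * tmpStride]) + 512) >> 10;
            dst[i + j * dstStride] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
        }
    }
}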
Esempio n. 23
0
void cfft2(unsigned int n,float x[][2],float y[][2],float w[][2], float sign)
{

    /*
       altivec version of cfft2 from Petersen and Arbenz book, "Intro.
       to Parallel Computing", Oxford Univ. Press, 2003, Section 3.6
                                            wpp 14. Dec. 2003
    */

    int jb,jc,jd,jw,k,k2,k4,lj,m,j,mj,mj2,pass,tgle;
    float rp,up,wr[4] __attribute((aligned(16)));
    float wu[4] __attribute((aligned(16)));
    float *a,*b,*c,*d;
    const vector float vminus = (vector float) {
        -0.,0.,-0.,0.
    };
    const vector float vzero  = (vector float) {
        0.,0.,0.,0.
    };
    const vector unsigned char pv3201 =
    (vector unsigned char) {
        4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11
    };
    vector float V0,V1,V2,V3,V4,V5,V6,V7;
    vector float V8,V9,V10,V11,V12,V13,V14,V15;

    if(n<=1) {
        y[0][0] = x[0][0];
        y[0][1] = x[0][1];
        return;
    }
    m    = (int) (log((float) n)/log(1.99)); /* number of passes = log2(n); 1.99 guards against round-off when n is an exact power of two */
    mj   = 1;
    mj2  = 2;
    lj   = n/2;
    /* first pass thru data: x -> y */
    for(j=0; j<lj; j++) {
        jb = n/2+j;
        jc  = j*mj2;
        jd = jc + 1;
        rp = w[j][0];
        up = w[j][1];
        if(sign<0.0) up = -up;
        y[jd][0] = rp*(x[j][0] - x[jb][0]) - up*(x[j][1] - x[jb][1]);
        y[jd][1] = up*(x[j][0] - x[jb][0]) + rp*(x[j][1] - x[jb][1]);
        y[jc][0] = x[j][0] + x[jb][0];
        y[jc][1] = x[j][1] + x[jb][1];
    }
    if(n==2) return;
    /* next pass is mj = 2 */
    mj  = 2;
    mj2 = 4;
    lj  = n/4;
    a = (float *)&y[0][0];
    b = (float *)&y[n/2][0];
    c = (float *)&x[0][0];
    d = (float *)&x[mj][0];
    if(n==4) {
        c = (float *)&y[0][0];
        d = (float *)&y[mj][0];
    }
    for(j=0; j<lj; j++) {
        jw = j*mj;
        jc = j*mj2;
        jd = 2*jc;
        rp = w[jw][0];
        up = w[jw][1];
        if(sign<0.0) up = -up;
        wr[0] = rp;
        wr[1] = rp;
        wr[2] = rp;
        wr[3] = rp;
        wu[0] = up;
        wu[1] = up;
        wu[2] = up;
        wu[3] = up;
        V6 = vec_ld(0,wr);
        V7 = vec_ld(0,wu);
        V7 = vec_xor(V7,vminus);
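        /* complex multiply by w = wr + i*up done with one swap and one xor:
           vminus flips the sign of lanes 0 and 2 so V7 = {-up,up,-up,up},
           pv3201 swaps re/im within each pair, and V6*V3 + V7*perm(V3)
           yields w*(a-b) for both complex pairs at once */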
        V0 = vec_ld(0,(vector float *) (a+jc));
        V1 = vec_ld(0,(vector float *) (b+jc));
        V2 = vec_add(V0,V1);                         /* a + b */
        vec_st(V2,0,(vector float *) (c+jd));     /* store c */
        V3 = vec_sub(V0,V1);                         /* a - b */
        V4 = vec_perm(V3,V3,pv3201);
        V0 = vec_madd(V6,V3,vzero);
        V1 = vec_madd(V7,V4,vzero);
        V2 = vec_add(V0,V1);                         /* w*(a - b) */
        vec_st(V2,0,(vector float*) (d+jd));         /* store d */
    }
    if(n==4) return;
    mj  *= 2;
    mj2  = 2*mj;
    lj   = n/mj2;
    tgle = 0;
    for(pass=2; pass<m-1; pass++) {
        if(tgle) {
            a = (float *)&y[0][0];
            b = (float *)&y[n/2][0];
            c = (float *)&x[0][0];
            d = (float *)&x[mj][0];
            tgle = 0;
        } else {
            a = (float *)&x[0][0];
            b = (float *)&x[n/2][0];
            c = (float *)&y[0][0];
            d = (float *)&y[mj][0];
            tgle = 1;
        }
        for(j=0; j<lj; j++) {
            jw = j*mj;
            jc = j*mj2;
            jd = 2*jc;
            rp = w[jw][0];
            up = w[jw][1];
            if(sign<0.0) up = -up;
            wr[0] = rp;
            wr[1] = rp;
            wr[2] = rp;
            wr[3] = rp;
            wu[0] = up;
            wu[1] = up;
            wu[2] = up;
            wu[3] = up;
            V6 = vec_ld(0,wr);
            V7 = vec_ld(0,wu);
            V7 = vec_xor(V7,vminus);
            for(k=0; k<mj; k+=4) {
                k2   = 2*k;
                k4 = k2+4;
                V0 = vec_ld(0,(vector float *) (a+jc+k2));
                V1 = vec_ld(0,(vector float *) (b+jc+k2));
                V2 = vec_add(V0,V1);                        /* a + b */
                vec_st(V2,0,(vector float*) (c+jd+k2));   /* store c */
                V3 = vec_sub(V0,V1);                        /* a - b */
                V4 = vec_perm(V3,V3,pv3201);
                V0 = vec_madd(V6,V3,vzero);
                V1 = vec_madd(V7,V4,vzero);
                V2 = vec_add(V0,V1);                        /* w*(a - b) */
                vec_st(V2,0,(vector float *) (d+jd+k2));    /* store d */
                V8 = vec_ld(0,(vector float *) (a+jc+k4));
                V9 = vec_ld(0,(vector float *) (b+jc+k4));
                V10 = vec_add(V8,V9);		        /* a + b */
                vec_st(V10,0,(vector float *) (c+jd+k4));   /* store c */
                V11 = vec_sub(V8,V9);		        /* a - b */
                V12 = vec_perm(V11,V11,pv3201);
                V8  = vec_madd(V6,V11,vzero);
                V9  = vec_madd(V7,V12,vzero);
                V10 = vec_add(V8,V9);		        /* w*(a - b) */
                vec_st(V10,0,(vector float *) (d+jd+k4));   /* store d */
            }
        }
        mj  *= 2;
        mj2  = 2*mj;
        lj   = n/mj2;
    }
    /* last pass thru data: in-place if previous in y */
    c = (float *)&y[0][0];
    d = (float *)&y[n/2][0];
    if(tgle) {
        a = (float *)&y[0][0];
        b = (float *)&y[n/2][0];
    } else {
        a = (float *)&x[0][0];
        b = (float *)&x[n/2][0];
    }
    for(k=0; k<(n/2); k+=4) {
        k2 = 2*k;
        k4 = k2+4;
        V0 = vec_ld(0,(vector float *) (a+k2));
        V1 = vec_ld(0,(vector float *) (b+k2));
        V2 = vec_add(V0,V1);                      /* a + b */
        vec_st(V2,0,(vector float*) (c+k2));      /* store c */
        V3 = vec_sub(V0,V1);                      /* a - b */
        vec_st(V3,0,(vector float *) (d+k2));     /* store d */
        V4 = vec_ld(0,(vector float *) (a+k4));
        V5 = vec_ld(0,(vector float *) (b+k4));
        V6 = vec_add(V4,V5);                      /* a + b */
        vec_st(V6,0,(vector float *) (c+k4));     /* store c */
        V7 = vec_sub(V4,V5);                      /* a - b */
        vec_st(V7,0,(vector float *) (d+k4));     /* store d */
    }
}

// LLVM LOCAL begin
// Implementations of sin() and cos() may vary slightly in the accuracy of
// their results, typically only in the least significant bit.  Round to make
// the results consistent across platforms.
typedef union {
    double d;
    unsigned long long ll;
} dbl_ll_union;
static double LLVMsin(double d) {
    dbl_ll_union u;
    u.d = sin(d);
    u.ll = (u.ll + 1) & ~1ULL;
    return u.d;
}
static double LLVMcos(double d) {
    dbl_ll_union u;
    u.d = cos(d);
    u.ll = (u.ll + 1) & ~1ULL;
    return u.d;
}
// LLVM LOCAL end

void cffti(int n, float w[][2])
{

    /* initialization routine for cfft2: computes the "twiddle factors"
            w[k] = (cos(twopi*k/n), sin(twopi*k/n)) for k=0..n/2-1
       for a binary radix FFT */

    int i,n2;
    float aw,arg,pi;
    pi = 3.141592653589793;
    n2 = n/2;
    aw = 2.0*pi/((float)n);
    for(i=0; i<n2; i++) {
        arg   = aw*((float)i);
        w[i][0] = LLVMcos(arg);
        w[i][1] = LLVMsin(arg);
    }
}
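/* Usage sketch (an assumption about the intended calling convention, not
   part of the original): cffti() fills w with the n/2 twiddle factors, and
   cfft2() ping-pongs between its two array arguments, leaving the result in
   the second and using the first as scratch.  The transform is unnormalized,
   so applying one sign and then the other returns n times the input.  Note:
   the vec_ld/vec_st paths assume 16-byte aligned buffers; plain malloc() is
   used here only for brevity. */
#include <stdlib.h>

void cfft2_roundtrip(unsigned int n)    /* n must be a power of two, n >= 4 */
{
    float (*x)[2] = malloc(n * sizeof *x);
    float (*y)[2] = malloc(n * sizeof *y);
    float (*w)[2] = malloc((n / 2) * sizeof *w);
    unsigned int i;
    cffti((int) n, w);
    for (i = 0; i < n; i++) { x[i][0] = (float) i; x[i][1] = 0.0f; }
    cfft2(n, x, y, w,  1.0f);   /* one direction: result in y (x clobbered) */
    cfft2(n, y, x, w, -1.0f);   /* other direction: x now holds n * input   */
    free(x); free(y); free(w);
}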
Esempio n. 24
0
float hill_reilly_ring_pucker(SmallRing &ring, float *framepos) {
  int N = ring.num(); // the number of atoms in the current ring

#if 0
  // return the default color if this isn't a 5 or 6 ring atom
  if (N != 5 && N != 6)
    return 0.0;
    //MK added
    if (N==6) {
      //MK do Hill-Reilly for 6-membered rings
      int NP = N-3; // number of puckering parameters
      float *X = new float[N*3]; // atom co-ordinates
      float *r = new float[N*3]; // bond vectors
      float *a = new float[NP*3]; // puckering axes
      float *q = new float[NP*3]; // normalized puckering vectors
      float *n = new float[3]; // normal to reference plane
      float *p = new float[3]; // a flap normal
      float *theta = new float[NP]; // puckering parameters
      float pucker_sum;
      float max_pucker_sum;
      float *atompos;
      int curatomid, i, j, k, l;
    
      // load ring co-ordinates
      for (i=0; i<N; i++) {
        curatomid = ring[i];
        atompos = framepos + 3*curatomid;
        X[3*i] = atompos[0];
        X[3*i+1] = atompos[1];
        X[3*i+2] = atompos[2];
      }     
    
      // calculate bond vectors
      for (i=0; i<N; i++) {
        j = (i+1) % N;
        vec_sub(r+3*i,X+3*j,X+3*i);
      }
    
      // calculate puckering axes, flap normals and puckering vectors
      for (i=0; i<NP; i++) {
        k = (2*(i+1)) % N;
        j = (2*i) % N;
        l = (2*i+1) % N;
        vec_sub(a+3*i,X+3*k,X+3*j);
        cross_prod(p,r+3*j,r+3*l);
        cross_prod(q+3*i,a+3*i,p);
        vec_normalize(q+3*i);
      }
    
      // reference normal
      cross_prod(n,a+3*0,a+3*1);
      vec_normalize(n);
    
      // calculate the puckering parameters
      pucker_sum = 0.0;
    
      for (i=0; i<NP; i++) {
        theta[i] = (float(VMD_PI)/2.0f) - acosf(dot_prod(q+3*i, n));
        pucker_sum += theta[i];
      }
    
    
      // 0.6154 radians (35.26 degrees) has significance for perfect tetrahedral bond geometry (see Hill paper)
      max_pucker_sum = NP * 0.6154f;
      float pucker_scaled = pucker_sum/max_pucker_sum;
      pucker_sum = fabsf((pucker_scaled < 1.0f) ? pucker_scaled : 1.0f);
      pucker_sum = (pucker_sum < 1.0f) ? pucker_sum : 1.0f;
    
      delete [] X;
      delete [] r;
      delete [] a;
      delete [] q;
      delete [] n;
      delete [] p;
      delete [] theta;
      return pucker_sum;
    }  //end MK if N==6
    else {  //N==5 
#endif
    float *xring = new float[N];
    float *yring = new float[N];
    float *zring = new float[N];
    float *displ = new float[N];
    float *q = new float[N];
    float *phi = new float[N];
    float Q;
    int m;
    float *atompos;
    int curatomid;
    
    for (int i=0; i<N; i++) {
      curatomid = ring[i];
      atompos = framepos + 3*curatomid; // pointer arithmetic is evil :)
      xring[i] = atompos[0];
      yring[i] = atompos[1];
      zring[i] = atompos[2];
    }     
    
    atom_displ_from_mean_plane(xring, yring, zring, displ, N);

    delete [] xring;
    delete [] yring;
    delete [] zring;

    if (cremer_pople_params(N, displ, q, phi, m, Q)) {
      // Q is the puckering amplitude - i.e. the intensity of the pucker.
      Q = (Q < 2.0f) ? Q : 2.0f;  //truncate amplitude at 2
      delete [] displ;
      delete [] q;
      delete [] phi;
      return Q;
    } else {
      delete [] displ;
      delete [] q;
      delete [] phi;
      return 0.0;
    }    
#if 0
  }
#endif
}
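/* A sketch (an assumption, not the VMD implementation) of the Cremer-Pople
   total puckering amplitude returned above: given each ring atom's
   out-of-plane displacement z[i] from the mean plane, Q = sqrt(sum z[i]^2).
   The real cremer_pople_params() also derives the phase angles phi[]. */
#include <math.h>

static float cremer_pople_amplitude(const float *z, int N) {
    float sum = 0.0f;
    int i;
    for (i = 0; i < N; i++)
        sum += z[i] * z[i];
    return sqrtf(sum);
}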
Esempio n. 25
0
/* this code assumes that stride % 16 == 0 */
void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    signed int ABCD[4] __attribute__((aligned(16))) =
                        {((8 - x) * (8 - y)),
                          ((x) * (8 - y)),
                          ((8 - x) * (y)),
                          ((x) * (y))};
    register int i;
    vector unsigned char fperm;
    const vector signed int vABCD = vec_ld(0, ABCD);
    const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
    const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
    const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
    const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
    const vector signed int vzero = vec_splat_s32(0);
    const vector signed short v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vector unsigned short v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
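    /* Each row reads 9 source bytes.  loadSecond is set when they straddle a
       16-byte boundary, so a second vector load is needed.  reallyBadAlign
       covers src % 16 == 15, where vec_lvsl(1, src) wraps to {0..15} and
       would permute from the wrong input, so vsrcBuc is used directly. */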

    vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
    vector unsigned char vsrc0uc, vsrc1uc;
    vector signed short vsrc0ssH, vsrc1ssH;
    vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
    vector signed short vsrc2ssH, vsrc3ssH, psum;
    vector unsigned char vdst, ppsum, fsum;

    if (((unsigned long)dst) % 16 == 0) {
      fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
                                        0x14, 0x15, 0x16, 0x17,
                                        0x08, 0x09, 0x0A, 0x0B,
                                        0x0C, 0x0D, 0x0E, 0x0F);
    } else {
      fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
                                        0x04, 0x05, 0x06, 0x07,
                                        0x18, 0x19, 0x1A, 0x1B,
                                        0x1C, 0x1D, 0x1E, 0x1F);
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
      vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
      vsrc1uc = vsrcBuc;
    else
      vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                               (vector unsigned char)vsrc0uc);
    vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                               (vector unsigned char)vsrc1uc);

    if (!loadSecond) { // !loadSecond implies !reallyBadAlign
      for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(stride + 0, src);

        vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
        vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

        vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                (vector unsigned char)vsrc2uc);
        vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                (vector unsigned char)vsrc3uc);

        psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
        psum = vec_mladd(vB, vsrc1ssH, psum);
        psum = vec_mladd(vC, vsrc2ssH, psum);
        psum = vec_mladd(vD, vsrc3ssH, psum);
        psum = vec_add(v28ss, psum);
        psum = vec_sra(psum, v6us);

        vdst = vec_ld(0, dst);
        ppsum = (vector unsigned char)vec_packsu(psum, psum);
        fsum = vec_perm(vdst, ppsum, fperm);

        vec_st(fsum, 0, dst);

        vsrc0ssH = vsrc2ssH;
        vsrc1ssH = vsrc3ssH;

        dst += stride;
        src += stride;
      }
    } else {
      vector unsigned char vsrcDuc;
      for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(stride + 0, src);
        vsrcDuc = vec_ld(stride + 16, src);

        vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
        if (reallyBadAlign)
          vsrc3uc = vsrcDuc;
        else
          vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

        vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                (vector unsigned char)vsrc2uc);
        vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                (vector unsigned char)vsrc3uc);

        psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
        psum = vec_mladd(vB, vsrc1ssH, psum);
        psum = vec_mladd(vC, vsrc2ssH, psum);
        psum = vec_mladd(vD, vsrc3ssH, psum);
        psum = vec_add(v28ss, psum);
        psum = vec_sr(psum, v6us);

        vdst = vec_ld(0, dst);
        ppsum = (vector unsigned char)vec_pack(psum, psum);
        fsum = vec_perm(vdst, ppsum, fperm);

        vec_st(fsum, 0, dst);

        vsrc0ssH = vsrc2ssH;
        vsrc1ssH = vsrc3ssH;

        dst += stride;
        src += stride;
      }
    }
}
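/* Scalar reference (a sketch, not from the original) for the bilinear chroma
   interpolation implemented above: with A=(8-x)(8-y), B=x(8-y), C=(8-x)y,
   D=xy (A+B+C+D = 64), each output pixel is a weighted average of a 2x2
   source neighborhood; the "no rounding" variant biases by 28 instead of 32
   before the >>6, matching v28ss above.  No clamp is needed since the
   result cannot exceed 255. */
#include <stdint.h>

static void chroma_mc8_no_rnd_scalar(uint8_t *dst, const uint8_t *src,
                                     int stride, int h, int x, int y) {
    const int A = (8 - x) * (8 - y), B = x * (8 - y),
              C = (8 - x) * y,       D = x * y;
    int i, j;
    for (j = 0; j < h; j++) {
        for (i = 0; i < 8; i++)
            dst[i] = (uint8_t)((A * src[i]          + B * src[i + 1] +
                                C * src[i + stride] + D * src[i + stride + 1]
                                + 28) >> 6);
        dst += stride;
        src += stride;
    }
}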
Esempio n. 26
0
int32_t main(int32_t argc, char *argv[]) {
    if( init_sdl2() ) {
        return 1;
    }

    int width = 1280;
    int height = 720;

    SDL_Window* window;
    sdl2_window("cute3d: " __FILE__, SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, width, height, &window);

    SDL_GLContext* context;
    sdl2_glcontext(3, 2, window, &context);

    if( init_shader() ) {
        return 1;
    }

    if( init_canvas(width, height) ) {
        return 1;
    }
    canvas_create("global_dynamic_canvas", &global_dynamic_canvas);
    canvas_create("global_static_canvas", &global_static_canvas);

    struct Vbo vbo = {0};
    vbo_create(&vbo);
    vbo_add_buffer(&vbo, SHADER_ATTRIBUTE_VERTEX, 3, GL_FLOAT, GL_STATIC_DRAW);
    vbo_add_buffer(&vbo, SHADER_ATTRIBUTE_VERTEX_NORMAL, 3, GL_FLOAT, GL_STATIC_DRAW);
    vbo_add_buffer(&vbo, SHADER_ATTRIBUTE_VERTEX_COLOR, 4, GL_UNSIGNED_BYTE, GL_STATIC_DRAW);
    vbo_add_buffer(&vbo, SHADER_ATTRIBUTE_VERTEX_NORMAL, NORMAL_SIZE, GL_FLOAT, GL_STATIC_DRAW);

    struct Ibo ibo = {0};
    ibo_create(GL_TRIANGLES, GL_UNSIGNED_INT, GL_STATIC_DRAW, &ibo);

    struct SolidTetrahedron hard_tetrahedron = {0};
    struct SolidBox hard_cube = {0};
    struct SolidSphere16 hard_sphere16 = {0};
    struct SolidSphere32 hard_sphere32 = {0};
    solid_tetrahedron_create(1.0, (Color){255, 0, 0, 255}, &hard_tetrahedron);
    solid_cube_create(0.5, (Color){0, 255, 0, 255}, &hard_cube);
    solid_sphere16_create(16, 8, 0.75, (Color){0, 255, 255, 255}, &hard_sphere16);
    solid_sphere32_create(32, 16, 0.75, (Color){255, 255, 0, 255}, &hard_sphere32);

    solid_optimize((struct Solid*)&hard_tetrahedron);
    solid_optimize((struct Solid*)&hard_cube);
    solid_optimize((struct Solid*)&hard_sphere16);
    solid_optimize((struct Solid*)&hard_sphere32);

    struct VboMesh hard_tetrahedron_mesh, hard_box_mesh, hard_cube_mesh, hard_sphere16_mesh, hard_sphere32_mesh;
    vbo_mesh_create_from_solid((struct Solid*)&hard_tetrahedron, &vbo, &ibo, &hard_tetrahedron_mesh);
    vbo_mesh_create_from_solid((struct Solid*)&hard_cube, &vbo, &ibo, &hard_cube_mesh);
    vbo_mesh_create_from_solid((struct Solid*)&hard_sphere16, &vbo, &ibo, &hard_sphere16_mesh);
    vbo_mesh_create_from_solid((struct Solid*)&hard_sphere32, &vbo, &ibo, &hard_sphere32_mesh);

    struct SolidTetrahedron smooth_tetrahedron = {0};
    struct SolidBox smooth_cube = {0};
    struct SolidSphere16 smooth_sphere16 = {0};
    struct SolidSphere32 smooth_sphere32 = {0};
    solid_tetrahedron_create(1.0, (Color){255, 0, 0, 255}, &smooth_tetrahedron);
    solid_cube_create(0.5, (Color){0, 255, 0, 255}, &smooth_cube);
    solid_sphere16_create(16, 8, 0.75, (Color){0, 255, 255, 255}, &smooth_sphere16);
    solid_sphere32_create(32, 16, 0.75, (Color){255, 255, 0, 255}, &smooth_sphere32);

    solid_optimize((struct Solid*)&smooth_tetrahedron);
    solid_optimize((struct Solid*)&smooth_cube);
    solid_optimize((struct Solid*)&smooth_sphere16);
    solid_optimize((struct Solid*)&smooth_sphere32);
    solid_smooth_normals((struct Solid*)&smooth_tetrahedron, smooth_tetrahedron.normals, smooth_tetrahedron.normals);
    solid_smooth_normals((struct Solid*)&smooth_cube, smooth_cube.normals, smooth_cube.normals);
    solid_smooth_normals((struct Solid*)&smooth_sphere16, smooth_sphere16.normals, smooth_sphere16.normals);
    solid_smooth_normals((struct Solid*)&smooth_sphere32, smooth_sphere32.normals, smooth_sphere32.normals);

    struct VboMesh smooth_tetrahedron_mesh, smooth_box_mesh, smooth_cube_mesh, smooth_sphere16_mesh, smooth_sphere32_mesh;
    vbo_mesh_create_from_solid((struct Solid*)&smooth_tetrahedron, &vbo, &ibo, &smooth_tetrahedron_mesh);
    vbo_mesh_create_from_solid((struct Solid*)&smooth_cube, &vbo, &ibo, &smooth_cube_mesh);
    vbo_mesh_create_from_solid((struct Solid*)&smooth_sphere16, &vbo, &ibo, &smooth_sphere16_mesh);
    vbo_mesh_create_from_solid((struct Solid*)&smooth_sphere32, &vbo, &ibo, &smooth_sphere32_mesh);

    struct Arcball arcball = {0};
    arcball_create(width, height, (Vec4f){2.5,17.0,17.0,1.0}, (Vec4f){2.5,0.0,0.0,1.0}, 0.1, 100.0, &arcball);

    float circular_motion_angle = 0.0f;
    float circular_motion_speed = (2.0f*PI)/30;
    float circular_motion_radius = 12.0f;

    Vec3f light_position = { circular_motion_radius, 10.0, circular_motion_radius };
    Vec3f light_direction = {0};
    vec_sub((Vec3f){0.0f, 0.0f, 0.0f}, light_position, light_direction);
    vec_normalize(light_direction, light_direction);

    Vec3f eye_position = {0};
    vec_copy3f(arcball.camera.pivot.position, eye_position);

    Color ambiance = {50, 25, 150, 255};
    Color specular = {255, 255, 255, 255};
    float material_shininess = 1.0;
    Vec4f material_coefficients = { 0.8, 0.2, 0.0, 0.0 };

    // flat
    struct Shader flat_shader = {0};
    shader_create(&flat_shader);
    shader_attach(&flat_shader, GL_VERTEX_SHADER, "prefix.vert", 1, "flat_shading.vert");
    shader_attach(&flat_shader, GL_FRAGMENT_SHADER, "prefix.frag", 1, "flat_shading.frag");
    shader_make_program(&flat_shader, SHADER_DEFAULT_NAMES, "flat_shader");

    shader_set_uniform_3f(&flat_shader, flat_shader.program, SHADER_UNIFORM_LIGHT_DIRECTION, 3, GL_FLOAT, light_direction);
    shader_set_uniform_4f(&flat_shader, flat_shader.program, SHADER_UNIFORM_AMBIENT_LIGHT, 4, GL_UNSIGNED_BYTE, ambiance);

    // gouraud
    struct Shader gouraud_shader = {0};
    shader_create(&gouraud_shader);
    shader_attach(&gouraud_shader, GL_VERTEX_SHADER, "prefix.vert", 1, "gouraud_shading.vert");
    shader_attach(&gouraud_shader, GL_FRAGMENT_SHADER, "prefix.frag", 1, "gouraud_shading.frag");
    shader_make_program(&gouraud_shader, SHADER_DEFAULT_NAMES, "gouraud_shader");

    shader_set_uniform_3f(&gouraud_shader, gouraud_shader.program, SHADER_UNIFORM_LIGHT_POSITION, 3, GL_FLOAT, light_position);
    shader_set_uniform_3f(&gouraud_shader, gouraud_shader.program, SHADER_UNIFORM_EYE_POSITION, 3, GL_FLOAT, eye_position);
    shader_set_uniform_4f(&gouraud_shader, gouraud_shader.program, SHADER_UNIFORM_AMBIENT_LIGHT, 4, GL_UNSIGNED_BYTE, ambiance);
    shader_set_uniform_4f(&gouraud_shader, gouraud_shader.program, SHADER_UNIFORM_SPECULAR_LIGHT, 4, GL_UNSIGNED_BYTE, specular);
    shader_set_uniform_1f(&gouraud_shader, gouraud_shader.program, SHADER_UNIFORM_MATERIAL_SHININESS, 1, GL_FLOAT, &material_shininess);
    shader_set_uniform_4f(&gouraud_shader, gouraud_shader.program, SHADER_UNIFORM_MATERIAL_COEFFICIENTS, 4, GL_FLOAT, material_coefficients);
    shader_set_uniform_3f(&gouraud_shader, gouraud_shader.program, SHADER_UNIFORM_EYE_POSITION, 3, GL_FLOAT, &arcball.camera.pivot.position);

    Mat identity = IDENTITY_MAT;
    draw_grid(&global_static_canvas, 0, identity, (Color){127, 127, 127, 255}, 0.03f, 12.0f, 12.0f, 12);

    Mat shading_label_transform = IDENTITY_MAT;
    Quat text_rotation = IDENTITY_QUAT;
    quat_from_axis_angle((Vec3f)X_AXIS, -PI/2.0f, text_rotation);
    mat_rotate(shading_label_transform, text_rotation, shading_label_transform);
    mat_translate(shading_label_transform, (float[4]){ 6.5, 0.0, -2.5, 1.0 }, shading_label_transform);
Esempio n. 27
0
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
              tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
              tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
              pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
              pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
              ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;

    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
Esempio n. 28
0
/* AltiVec version of dct_unquantize_h263
   this code assumes `block' is 16 bytes-aligned */
static void dct_unquantize_h263_altivec(MpegEncContext *s,
                                 DCTELEM *block, int n, int qscale)
{
    int i, level, qmul, qadd;
    int nCoeffs;

    assert(s->block_last_index[n]>=0);

    qadd = (qscale - 1) | 1;
    qmul = qscale << 1;

    if (s->mb_intra) {
        if (!s->h263_aic) {
            if (n < 4)
                block[0] = block[0] * s->y_dc_scale;
            else
                block[0] = block[0] * s->c_dc_scale;
        }else
            qadd = 0;
        i = 1;
        nCoeffs= 63; //does not always use zigzag table
    } else {
        i = 0;
        nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
    }

    {
        register const vector signed short vczero = (const vector signed short)vec_splat_s16(0);
        DECLARE_ALIGNED(16, short, qmul8) = qmul;
        DECLARE_ALIGNED(16, short, qadd8) = qadd;
        register vector signed short blockv, qmulv, qaddv, nqaddv, temp1;
        register vector bool short blockv_null, blockv_neg;
        register short backup_0 = block[0];
        register int j = 0;

        qmulv = vec_splat((vec_s16)vec_lde(0, &qmul8), 0);
        qaddv = vec_splat((vec_s16)vec_lde(0, &qadd8), 0);
        nqaddv = vec_sub(vczero, qaddv);

#if 0   // block *is* 16 bytes-aligned, it seems.
        // first make sure block[j] is 16 bytes-aligned
        for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) {
            level = block[j];
            if (level) {
                if (level < 0) {
                    level = level * qmul - qadd;
                } else {
                    level = level * qmul + qadd;
                }
                block[j] = level;
            }
        }
#endif

        // vectorize all the 16 bytes-aligned blocks
        // of 8 elements
        for(; (j + 7) <= nCoeffs ; j+=8) {
            blockv = vec_ld(j << 1, block);
            blockv_neg = vec_cmplt(blockv, vczero);
            blockv_null = vec_cmpeq(blockv, vczero);
            // choose between +qadd or -qadd as the third operand
            temp1 = vec_sel(qaddv, nqaddv, blockv_neg);
            // multiply & add (block{i,i+7} * qmul [+-] qadd)
            temp1 = vec_mladd(blockv, qmulv, temp1);
            // put 0 where block[{i,i+7} used to have 0
            blockv = vec_sel(temp1, blockv, blockv_null);
            vec_st(blockv, j << 1, block);
        }

        // if nCoeffs isn't a multiple of 8, finish the job
        // using good old scalar units.
        // (we could do it using a truncated vector,
        // but I'm not sure it's worth the hassle)
        for(; j <= nCoeffs ; j++) {
            level = block[j];
            if (level) {
                if (level < 0) {
                    level = level * qmul - qadd;
                } else {
                    level = level * qmul + qadd;
                }
                block[j] = level;
            }
        }

        if (i == 1) {
            // cheat: this avoids special-casing the first iteration
            block[0] = backup_0;
        }
    }
}
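/* Scalar equivalent (a sketch of what the vector loop above computes, and a
   mirror of its scalar tail): for each nonzero coefficient,
   level' = level*qmul + sign(level)*qadd; zero coefficients stay zero.
   The intra DC term is rescaled separately, as in the function above. */
#include <stdint.h>

static void dct_unquantize_h263_scalar(int16_t *block, int first,
                                       int nCoeffs, int qmul, int qadd) {
    int j;
    for (j = first; j <= nCoeffs; j++) {
        int level = block[j];
        if (level) {
            level = (level < 0) ? level * qmul - qadd
                                : level * qmul + qadd;
            block[j] = (int16_t)level;
        }
    }
}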
Esempio n. 29
0
int DisplayDevice::pick(int dim, const float *pos, const VMDDisplayList *cmdList,
			float &eyedist, int *unitcell, float window_size) {
  char *cmdptr = NULL;
  int tok;
  float newEyeDist, currEyeDist = eyedist;
  int tag = (-1), inRegion, currTag;
  float minX=0.0f, minY=0.0f, maxX=0.0f, maxY=0.0f;
  float fminX=0.0f, fminY=0.0f, fminZ=0.0f, fmaxX=0.0f, fmaxY=0.0f, fmaxZ=0.0f;
  float wpntpos[3], pntpos[3], cpos[3];

  if(!cmdList)
    return (-1);

  // initialize picking: find screen region to look for object
  if (dim == 2) {
    fminX = pos[0] - window_size;
    fmaxX = pos[0] + window_size;
    fminY = pos[1] - window_size;
    fmaxY = pos[1] + window_size;
    abs_screen_pos(fminX, fminY);
    abs_screen_pos(fmaxX, fmaxY);
#if defined(_MSC_VER)
    // Win32 lacks the C99 round() routine.
    // Add +/- 0.5, then round towards zero.
#if 1
    minX = (int) ((float) (fminX + (fminX >= 0.0f ?  0.5f : -0.5f)));
    maxX = (int) ((float) (fmaxX + (fmaxX >= 0.0f ?  0.5f : -0.5f)));
    minY = (int) ((float) (fminY + (fminY >= 0.0f ?  0.5f : -0.5f)));
    maxY = (int) ((float) (fmaxY + (fmaxY >= 0.0f ?  0.5f : -0.5f)));
#else
    minX = truncf(fminX + (fminX >= 0.0f ?  0.5f : -0.5f));
    maxX = truncf(fmaxX + (fmaxX >= 0.0f ?  0.5f : -0.5f));
    minY = truncf(fminY + (fminY >= 0.0f ?  0.5f : -0.5f));
    maxY = truncf(fmaxY + (fmaxY >= 0.0f ?  0.5f : -0.5f));
//    minX = floor(fminX + 0.5);
//    maxX = floor(fmaxX + 0.5);
//    minY = floor(fminY + 0.5);
//    maxY = floor(fmaxY + 0.5);
#endif
#else
    minX = round(fminX);
    maxX = round(fmaxX);
    minY = round(fminY);
    maxY = round(fmaxY);
#endif

  } else {
    fminX = pos[0] - window_size;
    fmaxX = pos[0] + window_size;
    fminY = pos[1] - window_size;
    fmaxY = pos[1] + window_size;
    fminZ = pos[2] - window_size;
    fmaxZ = pos[2] + window_size;
  }

  // make sure we do not disturb the regular transformation matrix
  transMat.dup();
  (transMat.top()).multmatrix(cmdList->mat);

  // Transform the current pick point for each periodic image 
  ResizeArray<Matrix4> pbcImages;
  ResizeArray<int> pbcCells;
  find_pbc_images(cmdList, pbcImages);
  find_pbc_cells(cmdList, pbcCells);
  int pbcimg;
  for (pbcimg=0; pbcimg<pbcImages.num(); pbcimg++) {
    transMat.dup();
    (transMat.top()).multmatrix(pbcImages[pbcimg]);

    // scan through the list, getting each command and executing it, until
    // the end of commands token is found
    VMDDisplayList::VMDLinkIter cmditer;
    cmdList->first(&cmditer);
    while((tok = cmdList->next(&cmditer, cmdptr)) != DLASTCOMMAND) {
      switch (tok) {
        case DPICKPOINT:
          // calculate the transformed position of the point
          {
            DispCmdPickPoint *cmd = (DispCmdPickPoint *)cmdptr;
            vec_copy(wpntpos, cmd->postag);
            currTag = cmd->tag;
          }
          (transMat.top()).multpoint3d(wpntpos, pntpos);

          // check if in picking region ... different for 2D and 3D
          if (dim == 2) {
            // convert the 3D world coordinate to 2D (XY) absolute screen 
            // coordinate, and a normalized Z coordinate.
            abs_screen_loc_3D(pntpos, cpos);
      
            // check to see if the projected picking position falls within the 
            // view frustum, with the XY coords falling within the displayed 
            // window, and the Z coordinate falling within the view volume
            // between the front and rear clipping planes.
            inRegion = (cpos[0] >= minX && cpos[0] <= maxX &&
                        cpos[1] >= minY && cpos[1] <= maxY &&
                        cpos[2] >= 0.0  && cpos[2] <= 1.0);
          } else {
            // just check to see if the position is in a box centered on our
            // pointer.  The pointer position should already be transformed.
            inRegion = (pntpos[0] >= fminX && pntpos[0] <= fmaxX &&	
                        pntpos[1] >= fminY && pntpos[1] <= fmaxY &&
                        pntpos[2] >= fminZ && pntpos[2] <= fmaxZ);
          }

          // Clip still-viable pick points against all active clipping planes
          if (inRegion) {
            // We must perform a check against all of the active
            // user-defined clipping planes to ensure that only pick points
            // associated with visible geometry can be selected.
            int cp;
            for (cp=0; cp < VMD_MAX_CLIP_PLANE; cp++) {
              // The final result is the intersection of all of the
              // individual clipping plane tests...
              if (cmdList->clipplanes[cp].mode) {
                float cpdist[3];
                vec_sub(cpdist, wpntpos, cmdList->clipplanes[cp].center);
                inRegion &= (dot_prod(cpdist, 
                                      cmdList->clipplanes[cp].normal) > 0.0f);
              }
            }
          }
      
          // has a hit occurred?
          if (inRegion) {
            // yes, see if it is closer to the eye position than earlier objects
            if(dim==2) 
              newEyeDist = DTOEYE(pntpos[0], pntpos[1], pntpos[2]);
            else 
              newEyeDist = DTOPOINT(pntpos[0],pntpos[1],pntpos[2]);

            if(currEyeDist < 0.0 || newEyeDist < currEyeDist) {
              currEyeDist = newEyeDist;
              tag = currTag;
              if (unitcell) {
                unitcell[0] = pbcCells[3*pbcimg  ];
                unitcell[1] = pbcCells[3*pbcimg+1];
                unitcell[2] = pbcCells[3*pbcimg+2];
              }
            }
          }
          break;

        case DPICKPOINT_ARRAY: {
          // loop over all of the pick points in the pick point index array
          DispCmdPickPointArray *cmd = (DispCmdPickPointArray *)cmdptr;
          float *pickpos=NULL;
          float *crds=NULL;
          int *indices=NULL;
          cmd->getpointers(crds, indices); 

          int i;
          for (i=0; i<cmd->numpicks; i++) {
            pickpos = crds + i*3;
            if (cmd->allselected) {
              currTag = i + cmd->firstindex;
            } else {
              currTag = indices[i];
            }
            vec_copy(wpntpos, pickpos);

            (transMat.top()).multpoint3d(pickpos, pntpos);

            // check if in picking region ... different for 2D and 3D
            if (dim == 2) {
              // convert the 3D world coordinate to 2D absolute screen coord
              abs_screen_loc_3D(pntpos, cpos);

              // check to see if the position falls in our picking region
              // including the clipping region (cpos[2])
              inRegion = (cpos[0] >= minX && cpos[0] <= maxX &&
                          cpos[1] >= minY && cpos[1] <= maxY &&
                          cpos[2] >= 0.0  && cpos[2] <= 1.0);
            } else {
              // just check to see if the position is in a box centered on our
              // pointer.  The pointer position should already be transformed.
              inRegion = (pntpos[0] >= fminX && pntpos[0] <= fmaxX &&
                          pntpos[1] >= fminY && pntpos[1] <= fmaxY &&
                          pntpos[2] >= fminZ && pntpos[2] <= fmaxZ);
            }

            // Clip still-viable pick points against all active clipping planes
            if (inRegion) {
              // We must perform a check against all of the active
              // user-defined clipping planes to ensure that only pick points
              // associated with visible geometry can be selected.
              int cp;
              for (cp=0; cp < VMD_MAX_CLIP_PLANE; cp++) {
                // The final result is the intersection of all of the
                // individual clipping plane tests...
                if (cmdList->clipplanes[cp].mode) {
                  float cpdist[3];
                  vec_sub(cpdist, wpntpos, cmdList->clipplanes[cp].center);
                  inRegion &= (dot_prod(cpdist, 
                                        cmdList->clipplanes[cp].normal) > 0.0f);
                }
              }
            }

            // has a hit occurred?
            if (inRegion) {
              // yes, see if it is closer to the eye than earlier hits
              if (dim==2)
                newEyeDist = DTOEYE(pntpos[0], pntpos[1], pntpos[2]);
              else
                newEyeDist = DTOPOINT(pntpos[0],pntpos[1],pntpos[2]);

              if (currEyeDist < 0.0 || newEyeDist < currEyeDist) {
                currEyeDist = newEyeDist;
                tag = currTag;
                if (unitcell) {
                  unitcell[0] = pbcCells[3*pbcimg  ];
                  unitcell[1] = pbcCells[3*pbcimg+1];
                  unitcell[2] = pbcCells[3*pbcimg+2];
                }
              }
            }
          }
          break;
        }
      }
    }

    // Pop the PBC image transform
    transMat.pop();
  } // end of loop over PBC images

  // restore the regular transformation matrix
  transMat.pop();

  // return result; if tag >= 0, we found something
  eyedist = currEyeDist;
  return tag;
}
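The clipping test that appears twice in pick() is a plain half-space
intersection: a candidate point survives only if it lies on the positive
side of every active clipping plane.  A standalone scalar sketch, with a
hypothetical clip_plane struct mirroring the mode/center/normal fields
used above:

typedef struct {
  int   mode;        /* nonzero when the plane is active */
  float center[3];   /* a point on the plane */
  float normal[3];   /* normal pointing into the visible half-space */
} clip_plane;

static int point_visible(const float *p, const clip_plane *planes, int n)
{
  float d[3];
  int i;

  for (i = 0; i < n; i++) {
    if (!planes[i].mode)
      continue;                       /* inactive planes never clip */
    d[0] = p[0] - planes[i].center[0];
    d[1] = p[1] - planes[i].center[1];
    d[2] = p[2] - planes[i].center[2];
    /* a positive dot product keeps the point; anything else clips it */
    if (d[0] * planes[i].normal[0] +
        d[1] * planes[i].normal[1] +
        d[2] * planes[i].normal[2] <= 0.0f)
      return 0;
  }
  return 1;
}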
Example n. 30
static int dct_quantize_altivec(MpegEncContext* s,
                         DCTELEM* data, int n,
                         int qscale, int* overflow)
{
    int lastNonZero;
    vector float row0, row1, row2, row3, row4, row5, row6, row7;
    vector float alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7;
    const vector float zero = (const vector float)FOUROF(0.);
    // used after quantize step
    int oldBaseValue = 0;

    // Load the data into the row/alt vectors
    {
        vector signed short data0, data1, data2, data3, data4, data5, data6, data7;

        data0 = vec_ld(0, data);
        data1 = vec_ld(16, data);
        data2 = vec_ld(32, data);
        data3 = vec_ld(48, data);
        data4 = vec_ld(64, data);
        data5 = vec_ld(80, data);
        data6 = vec_ld(96, data);
        data7 = vec_ld(112, data);

        // Transpose the data before we start
        TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7);
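        // (Each 1-D pass below combines the eight vectors elementwise,
        // which transforms along the vector index; transposing here
        // makes the first pass operate on the original rows.)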

        // load the data into floating point vectors.  We load
        // the high half of each row into the main row vectors
        // and the low half into the alt vectors.
        row0 = vec_ctf(vec_unpackh(data0), 0);
        alt0 = vec_ctf(vec_unpackl(data0), 0);
        row1 = vec_ctf(vec_unpackh(data1), 0);
        alt1 = vec_ctf(vec_unpackl(data1), 0);
        row2 = vec_ctf(vec_unpackh(data2), 0);
        alt2 = vec_ctf(vec_unpackl(data2), 0);
        row3 = vec_ctf(vec_unpackh(data3), 0);
        alt3 = vec_ctf(vec_unpackl(data3), 0);
        row4 = vec_ctf(vec_unpackh(data4), 0);
        alt4 = vec_ctf(vec_unpackl(data4), 0);
        row5 = vec_ctf(vec_unpackh(data5), 0);
        alt5 = vec_ctf(vec_unpackl(data5), 0);
        row6 = vec_ctf(vec_unpackh(data6), 0);
        alt6 = vec_ctf(vec_unpackl(data6), 0);
        row7 = vec_ctf(vec_unpackh(data7), 0);
        alt7 = vec_ctf(vec_unpackl(data7), 0);
    }

    // The following block could exist as a separate AltiVec DCT
    // function.  However, if we put it inline, the DCT data can remain
    // in the vector local variables, as floats, which we'll use during
    // the quantize step...
    {
        const vector float vec_0_298631336 = (vector float)FOUROF(0.298631336f);
        const vector float vec_0_390180644 = (vector float)FOUROF(-0.390180644f);
        const vector float vec_0_541196100 = (vector float)FOUROF(0.541196100f);
        const vector float vec_0_765366865 = (vector float)FOUROF(0.765366865f);
        const vector float vec_0_899976223 = (vector float)FOUROF(-0.899976223f);
        const vector float vec_1_175875602 = (vector float)FOUROF(1.175875602f);
        const vector float vec_1_501321110 = (vector float)FOUROF(1.501321110f);
        const vector float vec_1_847759065 = (vector float)FOUROF(-1.847759065f);
        const vector float vec_1_961570560 = (vector float)FOUROF(-1.961570560f);
        const vector float vec_2_053119869 = (vector float)FOUROF(2.053119869f);
        const vector float vec_2_562915447 = (vector float)FOUROF(-2.562915447f);
        const vector float vec_3_072711026 = (vector float)FOUROF(3.072711026f);


        int whichPass, whichHalf;

        for(whichPass = 1; whichPass<=2; whichPass++) {
            for(whichHalf = 1; whichHalf<=2; whichHalf++) {
                vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
                vector float tmp10, tmp11, tmp12, tmp13;
                vector float z1, z2, z3, z4, z5;

                tmp0 = vec_add(row0, row7); // tmp0 = dataptr[0] + dataptr[7];
                tmp7 = vec_sub(row0, row7); // tmp7 = dataptr[0] - dataptr[7];
                tmp3 = vec_add(row3, row4); // tmp3 = dataptr[3] + dataptr[4];
                tmp4 = vec_sub(row3, row4); // tmp4 = dataptr[3] - dataptr[4];
                tmp1 = vec_add(row1, row6); // tmp1 = dataptr[1] + dataptr[6];
                tmp6 = vec_sub(row1, row6); // tmp6 = dataptr[1] - dataptr[6];
                tmp2 = vec_add(row2, row5); // tmp2 = dataptr[2] + dataptr[5];
                tmp5 = vec_sub(row2, row5); // tmp5 = dataptr[2] - dataptr[5];

                tmp10 = vec_add(tmp0, tmp3); // tmp10 = tmp0 + tmp3;
                tmp13 = vec_sub(tmp0, tmp3); // tmp13 = tmp0 - tmp3;
                tmp11 = vec_add(tmp1, tmp2); // tmp11 = tmp1 + tmp2;
                tmp12 = vec_sub(tmp1, tmp2); // tmp12 = tmp1 - tmp2;


                // dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);
                row0 = vec_add(tmp10, tmp11);

                // dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
                row4 = vec_sub(tmp10, tmp11);


                // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
                z1 = vec_madd(vec_add(tmp12, tmp13), vec_0_541196100, (vector float)zero);

                // dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
                //                                CONST_BITS-PASS1_BITS);
                row2 = vec_madd(tmp13, vec_0_765366865, z1);

                // dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
                //                                CONST_BITS-PASS1_BITS);
                row6 = vec_madd(tmp12, vec_1_847759065, z1);

                z1 = vec_add(tmp4, tmp7); // z1 = tmp4 + tmp7;
                z2 = vec_add(tmp5, tmp6); // z2 = tmp5 + tmp6;
                z3 = vec_add(tmp4, tmp6); // z3 = tmp4 + tmp6;
                z4 = vec_add(tmp5, tmp7); // z4 = tmp5 + tmp7;

                // z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
                z5 = vec_madd(vec_add(z3, z4), vec_1_175875602, (vector float)zero);

                // z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
                z3 = vec_madd(z3, vec_1_961570560, z5);

                // z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
                z4 = vec_madd(z4, vec_0_390180644, z5);

                // The following adds are rolled into the multiplies above
                // z3 = vec_add(z3, z5);  // z3 += z5;
                // z4 = vec_add(z4, z5);  // z4 += z5;

                // z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
                // Wow!  It's actually more efficient to roll this multiply
                // into the adds below, even though the multiply gets done twice!
                // z2 = vec_madd(z2, vec_2_562915447, (vector float)zero);

                // z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
                // Same with this one...
                // z1 = vec_madd(z1, vec_0_899976223, (vector float)zero);

                // tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
                // dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
                row7 = vec_madd(tmp4, vec_0_298631336, vec_madd(z1, vec_0_899976223, z3));

                // tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
                // dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
                row5 = vec_madd(tmp5, vec_2_053119869, vec_madd(z2, vec_2_562915447, z4));

                // tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
                // dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
                row3 = vec_madd(tmp6, vec_3_072711026, vec_madd(z2, vec_2_562915447, z3));

                // tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
                // dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
                row1 = vec_madd(z1, vec_0_899976223, vec_madd(tmp7, vec_1_501321110, z4));

                // Swap the row values with the alts.  If this is the first half,
                // this sets up the low values to be acted on in the second half.
                // If this is the second half, it puts the high values back in
                // the row values where they are expected to be when we're done.
                SWAP(row0, alt0);
                SWAP(row1, alt1);
                SWAP(row2, alt2);
                SWAP(row3, alt3);
                SWAP(row4, alt4);
                SWAP(row5, alt5);
                SWAP(row6, alt6);
                SWAP(row7, alt7);
            }

            if (whichPass == 1) {
                // transpose the data for the second pass

                // First, block transpose the upper right with lower left.
                SWAP(row4, alt0);
                SWAP(row5, alt1);
                SWAP(row6, alt2);
                SWAP(row7, alt3);

                // Now, transpose each block of four
                TRANSPOSE4(row0, row1, row2, row3);
                TRANSPOSE4(row4, row5, row6, row7);
                TRANSPOSE4(alt0, alt1, alt2, alt3);
                TRANSPOSE4(alt4, alt5, alt6, alt7);
            }
        }
    }
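    // At this point the row/alt vectors hold the full 2-D DCT: each pass
    // ran the 1-D transform twice (once per four-column half, with the
    // SWAPs shuttling halves in and out of the row registers), and the
    // transpose between the passes turned the second pass into a
    // transform of the other dimension.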

    // perform the quantize step, using the floating point data
    // still in the row/alt registers
    {
        const int* biasAddr;
        const vector signed int* qmat;
        vector float bias, negBias;

        if (s->mb_intra) {
            vector signed int baseVector;

            // We must cache element 0 in the intra case
            // (it needs special handling).
            baseVector = vec_cts(vec_splat(row0, 0), 0);
            vec_ste(baseVector, 0, &oldBaseValue);

            qmat = (vector signed int*)s->q_intra_matrix[qscale];
            biasAddr = &(s->intra_quant_bias);
        } else {
            qmat = (vector signed int*)s->q_inter_matrix[qscale];
            biasAddr = &(s->inter_quant_bias);
        }

        // Load the bias vector (we add 0.5 to the bias so that we're
        // rounding when we convert to int, instead of flooring.)
        {
            vector signed int biasInt;
            const vector float negOneFloat = (vector float)FOUROF(-1.0f);
            LOAD4(biasInt, biasAddr);
            bias = vec_ctf(biasInt, QUANT_BIAS_SHIFT);
            negBias = vec_madd(bias, negOneFloat, zero);
        }

        {
            vector float q0, q1, q2, q3, q4, q5, q6, q7;

            q0 = vec_ctf(qmat[0], QMAT_SHIFT);
            q1 = vec_ctf(qmat[2], QMAT_SHIFT);
            q2 = vec_ctf(qmat[4], QMAT_SHIFT);
            q3 = vec_ctf(qmat[6], QMAT_SHIFT);
            q4 = vec_ctf(qmat[8], QMAT_SHIFT);
            q5 = vec_ctf(qmat[10], QMAT_SHIFT);
            q6 = vec_ctf(qmat[12], QMAT_SHIFT);
            q7 = vec_ctf(qmat[14], QMAT_SHIFT);

            row0 = vec_sel(vec_madd(row0, q0, negBias), vec_madd(row0, q0, bias),
                    vec_cmpgt(row0, zero));
            row1 = vec_sel(vec_madd(row1, q1, negBias), vec_madd(row1, q1, bias),
                    vec_cmpgt(row1, zero));
            row2 = vec_sel(vec_madd(row2, q2, negBias), vec_madd(row2, q2, bias),
                    vec_cmpgt(row2, zero));
            row3 = vec_sel(vec_madd(row3, q3, negBias), vec_madd(row3, q3, bias),
                    vec_cmpgt(row3, zero));
            row4 = vec_sel(vec_madd(row4, q4, negBias), vec_madd(row4, q4, bias),
                    vec_cmpgt(row4, zero));
            row5 = vec_sel(vec_madd(row5, q5, negBias), vec_madd(row5, q5, bias),
                    vec_cmpgt(row5, zero));
            row6 = vec_sel(vec_madd(row6, q6, negBias), vec_madd(row6, q6, bias),
                    vec_cmpgt(row6, zero));
            row7 = vec_sel(vec_madd(row7, q7, negBias), vec_madd(row7, q7, bias),
                    vec_cmpgt(row7, zero));

            q0 = vec_ctf(qmat[1], QMAT_SHIFT);
            q1 = vec_ctf(qmat[3], QMAT_SHIFT);
            q2 = vec_ctf(qmat[5], QMAT_SHIFT);
            q3 = vec_ctf(qmat[7], QMAT_SHIFT);
            q4 = vec_ctf(qmat[9], QMAT_SHIFT);
            q5 = vec_ctf(qmat[11], QMAT_SHIFT);
            q6 = vec_ctf(qmat[13], QMAT_SHIFT);
            q7 = vec_ctf(qmat[15], QMAT_SHIFT);

            alt0 = vec_sel(vec_madd(alt0, q0, negBias), vec_madd(alt0, q0, bias),
                    vec_cmpgt(alt0, zero));
            alt1 = vec_sel(vec_madd(alt1, q1, negBias), vec_madd(alt1, q1, bias),
                    vec_cmpgt(alt1, zero));
            alt2 = vec_sel(vec_madd(alt2, q2, negBias), vec_madd(alt2, q2, bias),
                    vec_cmpgt(alt2, zero));
            alt3 = vec_sel(vec_madd(alt3, q3, negBias), vec_madd(alt3, q3, bias),
                    vec_cmpgt(alt3, zero));
            alt4 = vec_sel(vec_madd(alt4, q4, negBias), vec_madd(alt4, q4, bias),
                    vec_cmpgt(alt4, zero));
            alt5 = vec_sel(vec_madd(alt5, q5, negBias), vec_madd(alt5, q5, bias),
                    vec_cmpgt(alt5, zero));
            alt6 = vec_sel(vec_madd(alt6, q6, negBias), vec_madd(alt6, q6, bias),
                    vec_cmpgt(alt6, zero));
            alt7 = vec_sel(vec_madd(alt7, q7, negBias), vec_madd(alt7, q7, bias),
                    vec_cmpgt(alt7, zero));
        }


    }

    // Store the data back into the original block
    {
        vector signed short data0, data1, data2, data3, data4, data5, data6, data7;

        data0 = vec_pack(vec_cts(row0, 0), vec_cts(alt0, 0));
        data1 = vec_pack(vec_cts(row1, 0), vec_cts(alt1, 0));
        data2 = vec_pack(vec_cts(row2, 0), vec_cts(alt2, 0));
        data3 = vec_pack(vec_cts(row3, 0), vec_cts(alt3, 0));
        data4 = vec_pack(vec_cts(row4, 0), vec_cts(alt4, 0));
        data5 = vec_pack(vec_cts(row5, 0), vec_cts(alt5, 0));
        data6 = vec_pack(vec_cts(row6, 0), vec_cts(alt6, 0));
        data7 = vec_pack(vec_cts(row7, 0), vec_cts(alt7, 0));

        {
            // Clamp for overflow
            vector signed int max_q_int, min_q_int;
            vector signed short max_q, min_q;

            LOAD4(max_q_int, &(s->max_qcoeff));
            LOAD4(min_q_int, &(s->min_qcoeff));

            max_q = vec_pack(max_q_int, max_q_int);
            min_q = vec_pack(min_q_int, min_q_int);

            data0 = vec_max(vec_min(data0, max_q), min_q);
            data1 = vec_max(vec_min(data1, max_q), min_q);
            data2 = vec_max(vec_min(data2, max_q), min_q);
            data3 = vec_max(vec_min(data3, max_q), min_q);
            data4 = vec_max(vec_min(data4, max_q), min_q);
            data5 = vec_max(vec_min(data5, max_q), min_q);
            data6 = vec_max(vec_min(data6, max_q), min_q);
            data7 = vec_max(vec_min(data7, max_q), min_q);
        }

        {
        vector bool char zero_01, zero_23, zero_45, zero_67;
        vector signed char scanIndexes_01, scanIndexes_23, scanIndexes_45, scanIndexes_67;
        vector signed char negOne = vec_splat_s8(-1);
        vector signed char* scanPtr =
                (vector signed char*)(s->intra_scantable.inverse);
        signed char lastNonZeroChar;

        // Determine the largest non-zero index.
        zero_01 = vec_pack(vec_cmpeq(data0, (vector signed short)zero),
                vec_cmpeq(data1, (vector signed short)zero));
        zero_23 = vec_pack(vec_cmpeq(data2, (vector signed short)zero),
                vec_cmpeq(data3, (vector signed short)zero));
        zero_45 = vec_pack(vec_cmpeq(data4, (vector signed short)zero),
                vec_cmpeq(data5, (vector signed short)zero));
        zero_67 = vec_pack(vec_cmpeq(data6, (vector signed short)zero),
                vec_cmpeq(data7, (vector signed short)zero));

        // 64 candidate indexes: scan positions of zeroed coefficients become -1
        scanIndexes_01 = vec_sel(scanPtr[0], negOne, zero_01);
        scanIndexes_23 = vec_sel(scanPtr[1], negOne, zero_23);
        scanIndexes_45 = vec_sel(scanPtr[2], negOne, zero_45);
        scanIndexes_67 = vec_sel(scanPtr[3], negOne, zero_67);

        // 32 largest values
        scanIndexes_01 = vec_max(scanIndexes_01, scanIndexes_23);
        scanIndexes_45 = vec_max(scanIndexes_45, scanIndexes_67);

        // 16 largest values
        scanIndexes_01 = vec_max(scanIndexes_01, scanIndexes_45);

        // 8 largest values
        scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne),
                vec_mergel(scanIndexes_01, negOne));

        // 4 largest values
        scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne),
                vec_mergel(scanIndexes_01, negOne));

        // 2 largest values
        scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne),
                vec_mergel(scanIndexes_01, negOne));

        // largest value
        scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne),
                vec_mergel(scanIndexes_01, negOne));

        scanIndexes_01 = vec_splat(scanIndexes_01, 0);


        vec_ste(scanIndexes_01, 0, &lastNonZeroChar);

        lastNonZero = lastNonZeroChar;

        // While the data is still in vectors we check for the transpose IDCT permute
        // and handle it using the vector unit if we can.  This is the permute used
        // by the altivec idct, so it is common when using the altivec dct.

        if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM)) {
            TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7);
        }

        vec_st(data0, 0, data);
        vec_st(data1, 16, data);
        vec_st(data2, 32, data);
        vec_st(data3, 48, data);
        vec_st(data4, 64, data);
        vec_st(data5, 80, data);
        vec_st(data6, 96, data);
        vec_st(data7, 112, data);
        }
    }

    // special handling of block[0]
    if (s->mb_intra) {
        if (!s->h263_aic) {
            if (n < 4)
                oldBaseValue /= s->y_dc_scale;
            else
                oldBaseValue /= s->c_dc_scale;
        }

        // Divide by 8, rounding the result
        data[0] = (oldBaseValue + 4) >> 3;
    }

    // We handled the transpose permutation above, and the
    // "no permutation" case needs no reordering.
    if ((lastNonZero > 0) &&
        (s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) &&
        (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)) {
        ff_block_permute(data, s->dsp.idct_permutation,
                s->intra_scantable.scantable, lastNonZero);
    }

    return lastNonZero;
}
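The quantize step above maps, per coefficient, to a simple scalar rule:
multiply by the fixed-point quantizer entry and add a bias whose sign
matches the coefficient, so that truncation during the float-to-int
conversion rounds the magnitude instead of flooring it.  A hedged scalar
sketch of that vec_sel(negBias, bias, cmpgt) pattern, with q and bias
assumed to be the already-converted floating-point values:

static short quantize_coeff_sketch(float coef, float q, float bias)
{
    /* coef > 0 selects +bias, otherwise -bias, exactly as vec_sel()
       does with the vec_cmpgt(row, zero) mask above */
    float level = coef * q + (coef > 0.0f ? bias : -bias);

    /* C's float-to-int conversion truncates toward zero, matching
       vec_cts(); the bias turns that truncation into rounding */
    return (short) level;
}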