Example #1
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    LOAD_ZERO;
    vec_s16 *pv1 = (vec_s16*)v1;
    vec_s16 *pv2 = (vec_s16*)v2;
    vec_s16 *pv3 = (vec_s16*)v3;
    register vec_s16 muls = {mul,mul,mul,mul,mul,mul,mul,mul};
    register vec_s16 t0, t1, i0, i1;
    register vec_s16 i2 = pv2[0], i3 = pv3[0];
    register vec_s32 res = zero_s32v;
    register vec_u8 align = vec_lvsl(0, v2);
    int32_t ires;
    order >>= 4;
    do {
        t0 = vec_perm(i2, pv2[1], align);
        i2 = pv2[2];
        t1 = vec_perm(pv2[1], i2, align);
        i0 = pv1[0];
        i1 = pv1[1];
        res = vec_msum(t0, i0, res);
        res = vec_msum(t1, i1, res);
        t0 = vec_perm(i3, pv3[1], align);
        i3 = pv3[2];
        t1 = vec_perm(pv3[1], i3, align);
        pv1[0] = vec_mladd(t0, muls, i0);
        pv1[1] = vec_mladd(t1, muls, i1);
        pv1 += 2;
        pv2 += 2;
        pv3 += 2;
    } while(--order);
    res = vec_splat(vec_sums(res, zero_s32v), 3);
    vec_ste(res, 0, &ires);
    return ires;
}
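The last three statements above show the scalar-reduction idiom that recurs in almost every example on this page: fold the per-lane partial sums with vec_sums, broadcast the grand total with vec_splat, and copy that single 32-bit element back to ordinary memory with vec_ste. Below is a minimal self-contained sketch of the same idiom, not taken from FFmpeg, summing the bytes of a 16-byte-aligned buffer on a big-endian AltiVec target (the helper name and byte-summing kernel are purely illustrative):

#include <altivec.h>
#include <stdint.h>

/* Hypothetical helper: sum all bytes of a 16-byte-aligned buffer whose
   length is a multiple of 16, returning the total as a scalar. */
static int32_t sum_bytes_altivec(const uint8_t *buf, int len)
{
    vector unsigned int acc = vec_splat_u32(0);
    vector signed int total;
    int32_t result;
    int i;

    for (i = 0; i < len; i += 16) {
        vector unsigned char v = vec_ld(i, buf);  /* aligned 16-byte load     */
        acc = vec_sum4s(v, acc);                  /* four 32-bit partial sums */
    }
    total = vec_sums((vector signed int) acc, vec_splat_s32(0)); /* grand total in element 3 */
    total = vec_splat(total, 3);                  /* broadcast element 3       */
    vec_ste(total, 0, (signed int *) &result);    /* store one element to memory */
    return result;
}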
Example #2
int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;
    
    sv = (vector unsigned int)vec_splat_u32(0);
    
    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
Example #3
static int pix_sum_altivec(uint8_t * pix, int line_size)
{
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        vector unsigned char pixl = vec_ld( 0, pix);
        vector unsigned char pixr = vec_ld(15, pix);
        t1 = vec_perm(pixl, pixr, perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
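The vec_ld(0, pix) / vec_ld(15, pix) pair combined with a vec_lvsl permute mask is the classic AltiVec idiom for loading 16 unaligned bytes: between them, the two aligned loads are guaranteed to cover pix[0]..pix[15]. A stand-alone sketch of just that idiom, assuming a big-endian target (not taken from the FFmpeg code above; the helper name is illustrative):

#include <altivec.h>
#include <stdint.h>

/* Hypothetical helper: load 16 bytes from an arbitrarily aligned address by
   combining the two aligned blocks that straddle it. */
static vector unsigned char load_unaligned_16(const uint8_t *p)
{
    vector unsigned char left  = vec_ld(0,  p);   /* aligned block containing p[0]    */
    vector unsigned char right = vec_ld(15, p);   /* aligned block containing p[15]   */
    vector unsigned char perm  = vec_lvsl(0, p);  /* permute mask derived from p & 15 */
    return vec_perm(left, right, perm);           /* bytes p[0]..p[15]                */
}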
Example #4
static int32_t scalarproduct_int16_altivec(const int16_t * v1, const int16_t * v2, int order, const int shift)
{
    int i;
    LOAD_ZERO;
    register vec_s16 vec1, *pv;
    register vec_s32 res = vec_splat_s32(0), t;
    register vec_u32 shifts;
    int32_t ires;

    shifts = zero_u32v;
    if(shift & 0x10) shifts = vec_add(shifts, vec_sl(vec_splat_u32(0x08), vec_splat_u32(0x1)));
    if(shift & 0x08) shifts = vec_add(shifts, vec_splat_u32(0x08));
    if(shift & 0x04) shifts = vec_add(shifts, vec_splat_u32(0x04));
    if(shift & 0x02) shifts = vec_add(shifts, vec_splat_u32(0x02));
    if(shift & 0x01) shifts = vec_add(shifts, vec_splat_u32(0x01));

    for(i = 0; i < order; i += 8){
        pv = (vec_s16*)v1;
        vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1));
        t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        t = vec_sr(t, shifts);
        res = vec_sums(t, res);
        v1 += 8;
        v2 += 8;
    }
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);
    return ires;
}
Example #5
uint32_t
sad8_altivec_c(const uint8_t * cur,
	   const uint8_t *ref,
	   const uint32_t stride)
{
	uint32_t result = 0;
	
	register vector unsigned int sad;
	register vector unsigned char c;
	register vector unsigned char r;
	
	/* initialize */
	sad = vec_splat_u32(0);
	
	/* Perform sad operations */
	SAD8();
	SAD8();
	SAD8();
	SAD8();
	
	SAD8();
	SAD8();
	SAD8();
	SAD8();
	
	/* finish addition, add the first 2 together */
	sad = vec_and(sad, (vector unsigned int)vec_pack(vec_splat_u16(-1),vec_splat_u16(0)));
	sad = (vector unsigned int)vec_sums((vector signed int)sad, vec_splat_s32(0));
	sad = vec_splat(sad,3);
	vec_ste(sad, 0, &result);
		
	return result;
}
Example #6
uint32_t
sse8_16bit_altivec_c(const int16_t * b1,
			 const int16_t * b2,
			 const uint32_t stride)
{
    register vector signed short b1_vec;
    register vector signed short b2_vec;
    register vector signed short diff;
    register vector signed int sum;
    uint32_t result;
    
    /* initialize */
    sum = vec_splat_s32(0);
    
    SSE8_16BIT();
    SSE8_16BIT();
    SSE8_16BIT();
    SSE8_16BIT();
    
    SSE8_16BIT();
    SSE8_16BIT();
    SSE8_16BIT();
    SSE8_16BIT();
        
    /* sum the vector */
    sum = vec_sums(sum, vec_splat_s32(0));
    sum = vec_splat(sum,3);
    
    vec_ste(sum,0,(int*)&result);
    
    /* and return */
    return result;
}
Example #7
static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i, s = 0;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned int sv = (vector unsigned int) vec_splat_u32(0);
    vector signed int sum;

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned pixels. */
        vector unsigned char pixl = vec_ld(0,  pix);
        vector unsigned char pixr = vec_ld(15, pix);
        vector unsigned char pixv = vec_perm(pixl, pixr, perm);

        /* Square the values, and add them to our sum. */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
Example #8
static av_always_inline
void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
                                 uint8_t *src, ptrdiff_t src_stride,
                                 int h, int mx, int w, int is6tap)
{
    LOAD_H_SUBPEL_FILTER(mx-1);
    vec_u8 align_vec0, align_vec8, permh0, permh8, filt;
    vec_u8 perm_6tap0, perm_6tap8, perml0, perml8;
    vec_u8 a, b, pixh, pixl, outer;
    vec_s16 f16h, f16l;
    vec_s32 filth, filtl;

    vec_u8 perm_inner6 = { 1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7 };
    vec_u8 perm_inner4 = { 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6 };
    vec_u8 perm_inner  = is6tap ? perm_inner6 : perm_inner4;
    vec_u8 perm_outer = { 4,9, 0,5, 5,10, 1,6, 6,11, 2,7, 7,12, 3,8 };
    vec_s32 c64 = vec_sl(vec_splat_s32(1), vec_splat_u32(6));
    vec_u16 c7  = vec_splat_u16(7);

    align_vec0 = vec_lvsl( -is6tap-1, src);
    align_vec8 = vec_lvsl(8-is6tap-1, src);

    permh0     = vec_perm(align_vec0, align_vec0, perm_inner);
    permh8     = vec_perm(align_vec8, align_vec8, perm_inner);
    perm_inner = vec_add(perm_inner, vec_splat_u8(4));
    perml0     = vec_perm(align_vec0, align_vec0, perm_inner);
    perml8     = vec_perm(align_vec8, align_vec8, perm_inner);
    perm_6tap0 = vec_perm(align_vec0, align_vec0, perm_outer);
    perm_6tap8 = vec_perm(align_vec8, align_vec8, perm_outer);

    while (h --> 0) {
        FILTER_H(f16h, 0);

        if (w == 16) {
            FILTER_H(f16l, 8);
            filt = vec_packsu(f16h, f16l);
            vec_st(filt, 0, dst);
        } else {
            filt = vec_packsu(f16h, f16h);
            vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
            if (w == 8)
                vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
        }
        src += src_stride;
        dst += dst_stride;
    }
}
Example #9
int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Because pix3 = pix2 + line_size, the pix3 of one iteration becomes
       pix2 in the next iteration, which lets us avoid one potentially
       expensive unaligned read per trip around the loop.
       Read the unaligned pixels into our vector:
       pix2v: pix2[0]-pix2[15]
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
    
    for(i=0;i<16;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);
        
        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
        
    }
    
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;    
}
Example #10
void
b()
{
  z = vec_add (x, y);

  /* Make sure the predicates accept correct argument types.  */

  int1 = vec_all_in (f, g);
  int1 = vec_all_ge (f, g);
  int1 = vec_all_eq (c, d);
  int1 = vec_all_ne (s, t);
  int1 = vec_any_eq (i, j);
  int1 = vec_any_ge (f, g);
  int1 = vec_all_ngt (f, g);
  int1 = vec_any_ge (c, d);
  int1 = vec_any_ge (s, t);
  int1 = vec_any_ge (i, j);
  int1 = vec_any_ge (c, d);
  int1 = vec_any_ge (s, t);
  int1 = vec_any_ge (i, j);

  vec_mtvscr (i);
  vec_dssall ();
  s = (vector signed short) vec_mfvscr ();
  vec_dss (3);

  vec_dst (pi, int1 + int2, 3);
  vec_dstst (pi, int1 + int2, 3);
  vec_dststt (pi, int1 + int2, 3);
  vec_dstt (pi, int1 + int2, 3);

  uc = (vector unsigned char) vec_lvsl (int1 + 69, (signed int *) pi);
  uc = (vector unsigned char) vec_lvsr (int1 + 69, (signed int *) pi);

  c = vec_lde (int1, (signed char *) pi);
  s = vec_lde (int1, (signed short *) pi);
  i = vec_lde (int1, (signed int *) pi);
  i = vec_ldl (int1, pi);
  i = vec_ld (int1, pi);

  vec_st (i, int2, pi);
  vec_ste (c, int2, (signed char *) pi);
  vec_ste (s, int2, (signed short *) pi);
  vec_ste (i, int2, (signed int *) pi);
  vec_stl (i, int2, pi);
}
Example #11
static void test ()
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    vector unsigned char vuc = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0};
    vector signed char vsc = {7,6,5,4,3,2,1,0,-1,-2,-3,-4,-5,-6,-7,-8};
    vector unsigned short vus = {7,6,5,4,3,2,1,0};
    vector signed short vss = {3,2,1,0,-1,-2,-3,-4};
    vector unsigned int vui = {3,2,1,0};
    vector signed int vsi = {1,0,-1,-2};
    vector float vf = {3.0,2.0,1.0,0.0};
#else
    vector unsigned char vuc = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
    vector signed char vsc = {-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7};
    vector unsigned short vus = {0,1,2,3,4,5,6,7};
    vector signed short vss = {-4,-3,-2,-1,0,1,2,3};
    vector unsigned int vui = {0,1,2,3};
    vector signed int vsi = {-2,-1,0,1};
    vector float vf = {0.0,1.0,2.0,3.0};
#endif

    vec_ste (vuc, 9*1, (unsigned char *)svuc);
    vec_ste (vsc, 14*1, (signed char *)svsc);
    vec_ste (vus, 7*2, (unsigned short *)svus);
    vec_ste (vss, 1*2, (signed short *)svss);
    vec_ste (vui, 3*4, (unsigned int *)svui);
    vec_ste (vsi, 2*4, (signed int *)svsi);
    vec_ste (vf,  0*4, (float *)svf);

    check_arrays ();
}
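vec_ste uses the low bits of the effective address (pointer plus byte offset) to decide which element of the vector is stored, which is why the offsets above are written as element index times element size. A hypothetical illustration (not part of the test above), assuming a 16-byte-aligned destination and a big-endian target:

#include <altivec.h>

static unsigned int dst[4] __attribute__ ((aligned (16)));

static void store_element_three (void)
{
    /* With dst 16-byte aligned, the effective address dst + 3*4 selects
       element 3 of the vector, so only dst[3] is written. */
    vector unsigned int v = {10, 20, 30, 40};
    vec_ste (v, 3*4, dst);   /* afterwards dst[3] == 40; dst[0..2] untouched */
}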
Example #12
void r_dimpatchD_ALTIVEC(const DCanvas *const cvs, argb_t color, int alpha, int x1, int y1, int w, int h)
{
	int x, y, i;
	argb_t *line;
	int invAlpha = 256 - alpha;

	int dpitch = cvs->pitch / sizeof(argb_t);
	line = (argb_t *)cvs->buffer + y1 * dpitch;

	int batches = w / 4;
	int remainder = w & 3;

	// AltiVec temporaries:
	const vu16 zero = {0, 0, 0, 0, 0, 0, 0, 0};
	const vu16 upper8mask = {0, 0xff, 0xff, 0xff, 0, 0xff, 0xff, 0xff};
	const vu16 blendAlpha = {0, alpha, alpha, alpha, 0, alpha, alpha, alpha};
	const vu16 blendInvAlpha = {0, invAlpha, invAlpha, invAlpha, 0, invAlpha, invAlpha, invAlpha};
	const vu16 blendColor = {0, RPART(color), GPART(color), BPART(color), 0, RPART(color), GPART(color), BPART(color)};
	const vu16 blendMult = vec_mladd(blendColor, blendAlpha, zero);

	for (y = y1; y < y1 + h; y++)
	{
		// AltiVec optimize the bulk in batches of 4 colors:
		for (i = 0, x = x1; i < batches; ++i, x += 4)
		{
			const vu32 input = {line[x + 0], line[x + 1], line[x + 2], line[x + 3]};
			const vu32 output = (vu32)blend4vs1_altivec(input, blendMult, blendInvAlpha, upper8mask);
			vec_ste(output, 0, &line[x]);
			vec_ste(output, 4, &line[x]);
			vec_ste(output, 8, &line[x]);
			vec_ste(output, 12, &line[x]);
		}

		if (remainder)
		{
			// Pick up the remainder:
			for (; x < x1 + w; x++)
			{
				line[x] = alphablend1a(line[x], color, alpha);
			}
		}

		line += dpitch;
	}
}
Example #13
uint32_t
sad16_altivec_c(vector unsigned char *cur,
			  vector unsigned char *ref,
			  uint32_t stride,
			  const uint32_t best_sad)
{
	vector unsigned char perm;
	vector unsigned char t1, t2;
	vector unsigned int sad;
	vector unsigned int sumdiffs;
	vector unsigned int best_vec;
	uint32_t result;

        
#ifdef DEBUG
        /* print alignment errors if DEBUG is on */
	if (((unsigned long) cur) & 0xf)
		fprintf(stderr, "sad16_altivec:incorrect align, cur: %lx\n", (long)cur);
	if (stride & 0xf)
		fprintf(stderr, "sad16_altivec:incorrect align, stride: %lu\n", stride);
#endif
	/* initialization */
	sad = vec_splat_u32(0);
	sumdiffs = sad;
	stride >>= 4;
	perm = vec_lvsl(0, (unsigned char *) ref);
	*((uint32_t*)&best_vec) = best_sad;
	best_vec = vec_splat(best_vec, 0);

	/* perform sum of differences between current and previous */
	SAD16();
	SAD16();
	SAD16();
	SAD16();

	SAD16();
	SAD16();
	SAD16();
	SAD16();

	SAD16();
	SAD16();
	SAD16();
	SAD16();
        
	SAD16();
	SAD16();
	SAD16();
	SAD16();

  bail:
	/* copy vector sum into unaligned result */
	sumdiffs = vec_splat(sumdiffs, 3);
	vec_ste(sumdiffs, 0, (uint32_t*) &result);
	return result;
}
Example #14
uint32_t
quant_h263_inter_altivec_c(int16_t *coeff,
                            int16_t *data,
                            const uint32_t quant,
                            const uint16_t *mpeg_quant_matrices)
{
    vector unsigned char zerovec;
    vector unsigned short mult;
    vector unsigned short quant_m_2;
    vector unsigned short quant_d_2;
    vector unsigned short sum_short;
    vector signed short acLevel;
    
    vector unsigned int even;
    vector unsigned int odd;
    
    vector bool short m2_mask;
    vector bool short zero_mask;
    
    uint32_t result;

#ifdef DEBUG
    if(((unsigned)coeff) & 0xf)
        fprintf(stderr, "quant_h263_inter_altivec_c:incorrect align, coeff: %lx\n", (long)coeff);
#endif
    
    /* initialisation stuff */
    zerovec = vec_splat_u8(0);
    *((unsigned short*)&mult) = (unsigned short)multipliers[quant];
    mult = vec_splat(mult, 0);
    *((unsigned short*)&quant_m_2) = (unsigned short)quant;
    quant_m_2 = vec_splat(quant_m_2, 0);
    quant_m_2 = vec_sl(quant_m_2, vec_splat_u16(1));
    *((unsigned short*)&quant_d_2) = (unsigned short)quant;
    quant_d_2 = vec_splat(quant_d_2, 0);
    quant_d_2 = vec_sr(quant_d_2, vec_splat_u16(1));
    sum_short = (vector unsigned short)zerovec;
    
    /* Quantize */
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
    QUANT_H263_INTER_ALTIVEC();
        
    /* Calculate the return value */
    even = (vector unsigned int)vec_sum4s((vector signed short)sum_short, (vector signed int)zerovec);
    even = (vector unsigned int)vec_sums((vector signed int)even, (vector signed int)zerovec);
    even = vec_splat(even, 3);
    vec_ste(even, 0, &result);
    return result;
}
Example #15
/* Store less than a vector to an unaligned location in memory */
static inline void
StoreUnalignedLess (vector unsigned char v,
                    const guchar *where,
                    int n)
{
  int i;
  vector unsigned char permuteVector = vec_lvsr(0, where);
  v = vec_perm(v, v, permuteVector);
  for (i=0; i<n; i++)
    vec_ste(v, i, CONST_BUFFER(where));
}
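A hypothetical call site for the helper above; guchar is GLib's unsigned char typedef, and the sketch assumes the helper and its headers are already in scope:

static void
store_first_bytes (guchar *dest)
{
  /* Write only the first 12 bytes of a vector to a possibly unaligned
     destination; the remaining bytes of dest are left untouched. */
  vector unsigned char v = vec_splat_u8 (7);

  StoreUnalignedLess (v, dest, 12);
}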
Example #16
uint32_t
sad16bi_altivec_c(vector unsigned char *cur,
                        vector unsigned char *ref1,
                        vector unsigned char *ref2,
                        uint32_t stride)
{
    vector unsigned char t1, t2;
    vector unsigned char mask1, mask2;
    vector unsigned char sad;
    vector unsigned int sum;
    uint32_t result;
    
#ifdef DEBUG
    /* print alignment errors if this is on */
    if((long)cur & 0xf)
        fprintf(stderr, "sad16bi_altivec:incorrect align, cur: %lx\n", (long)cur);
    if(stride & 0xf)
        fprintf(stderr, "sad16bi_altivec:incorrect align, cur: %lu\n", stride);
#endif
    
    /* Initialisation stuff */
    stride >>= 4;
    mask1 = vec_lvsl(0, (unsigned char*)ref1);
    mask2 = vec_lvsl(0, (unsigned char*)ref2);
    sad = vec_splat_u8(0);
    sum = (vector unsigned int)sad;
    
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    
    SAD16BI();
    SAD16BI();
    SAD16BI();
    SAD16BI();
    
    sum = (vector unsigned int)vec_sums((vector signed int)sum, vec_splat_s32(0));
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, (uint32_t*)&result);
    
    return result;
}
Example #17
/**
 * Sum of Squared Errors for a 8x8 block.
 * AltiVec-enhanced.
 * It's the pix_abs8x8_altivec code above w/ squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;
    
    sum = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    
    for(i=0;i<8;i++) {
	/* Read potentially unaligned pixels into t1 and t2
	   Since we're reading 16 pixels, and actually only want 8,
	   mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /*
          Since we want to use unsigned chars, we can take advantage
          of the fact that abs(a-b)^2 = (a-b)^2.
        */
        
	/* Calculate abs differences vector */ 
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);
        
        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);
        
        pix1 += line_size;
        pix2 += line_size;
    }
    
    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);
    
    return s;
}
Example #18
int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for(i=0;i<16;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]	pix2iv: pix2[1]-pix2[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
        
        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);
        
        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
Example #19
/*
 * This function assumes that cur is 8-byte aligned, stride is a
 * multiple of 16 bytes, and ref may be unaligned.
 */
unsigned long
sad8_altivec(const vector unsigned char *cur,
			 const vector unsigned char *ref,
			 unsigned long stride)
{
	vector unsigned char t1, t2, t3, t4, t5, tp;
	vector unsigned int sad;
	vector signed int sumdiffs;
	vector unsigned char perm_cur;
	vector unsigned char perm_ref1, perm_ref2;
	unsigned long result;

	ZERODEF;

#ifdef DEBUG
	if (((unsigned long) cur) & 0x7)
		fprintf(stderr, "sad8_altivec:incorrect align, cur: %x\n", cur);
//  if (((unsigned long)ref) & 0x7)
//      fprintf(stderr, "sad8_altivec:incorrect align, ref: %x\n", ref);
	if (stride & 0xf)
		fprintf(stderr, "sad8_altivec:incorrect align, stride: %x\n", stride);
#endif

	perm_cur = get_perm((((unsigned long) cur) >> 3) & 0x01);
	perm_ref1 = vec_lvsl(0, (unsigned char *) ref);
	perm_ref2 = get_perm(0);

	/* initialization */
	sad = (vector unsigned int) (ZEROVEC);
	stride >>= 4;

	/* perform sum of differences between current and previous */
	SAD8();
	SAD8();
	SAD8();
	SAD8();

	/* sum all parts of difference into one 32 bit quantity */
	sumdiffs = vec_sums((vector signed int) sad, (vector signed int) ZEROVEC);

	/* copy vector sum into unaligned result */
	sumdiffs = vec_splat(sumdiffs, 3);
	vec_ste(sumdiffs, 0, (int *) &result);
	return (result);
}
Example #20
int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for(i=0;i<8;i++) {
	/* Read potentially unaligned pixels into t1 and t2
	   Since we're reading 16 pixels, and actually only want 8,
	   mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

	/* Calculate a sum of abs differences vector */ 
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

	/* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
Example #21
static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char perm1 = vec_lvsl(0, pix2);
    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
    vector unsigned char pix2l, pix2r;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16] */
        pix1v  = vec_ld( 0, pix1);
        pix2l  = vec_ld( 0, pix2);
        pix2r  = vec_ld(16, pix2);
        pix2v  = vec_perm(pix2l, pix2r, perm1);
        pix2iv = vec_perm(pix2l, pix2r, perm2);

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
Example #22
static unsigned reg_sad_altivec(const kvz_pixel * const data1, const kvz_pixel * const data2,
                        const int width, const int height, const unsigned stride1, const unsigned stride2)
{
  vector unsigned int vsad = {0,0,0,0}, vzero = {0,0,0,0}; 
  vector signed int sumdiffs;
  int tmpsad, sad = 0;
  
  int y, x;
  
  for (y = 0; y < height; ++y) {
    vector unsigned char perm1, perm2;
    
    perm1 = vec_lvsl(0, &data1[y * stride1]);
    perm2 = vec_lvsl(0, &data2[y * stride2]);
    
    for (x = 0; x <= width-16; x+=16) {
      vector unsigned char t1, t2, t3, t4, t5;
      vector unsigned char *current, *previous;
      
      current = (vector unsigned char *) &data1[y * stride1 + x];
      previous = (vector unsigned char *) &data2[y * stride2 + x];
      
      t1  = vec_perm(current[0], current[1], perm1 );  /* align current vector  */ 
      t2  = vec_perm(previous[0], previous[1], perm2 );/* align previous vector */ 
      t3  = vec_max(t1, t2 );      /* find largest of two           */ 
      t4  = vec_min(t1, t2 );      /* find smaller of two           */ 
      t5  = vec_sub(t3, t4);       /* find absolute difference      */ 
      vsad = vec_sum4s(t5, vsad);    /* accumulate sum of differences */
    }

    for (; x < width; ++x) {
      sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
    }
  }
  
  sumdiffs = vec_sums((vector signed int) vsad, (vector signed int) vzero);
  /* copy vector sum into unaligned result */
  sumdiffs = vec_splat( sumdiffs, 3);
  vec_ste( sumdiffs, 0, &tmpsad );
  sad += tmpsad;
  
  return sad;
}
Example #23
static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                           int order)
{
    int i;
    LOAD_ZERO;
    register vec_s16 vec1;
    register vec_s32 res = vec_splat_s32(0), t;
    int32_t ires;

    for(i = 0; i < order; i += 8){
        vec1 = vec_unaligned_load(v1);
        t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        res = vec_sums(t, res);
        v1 += 8;
        v2 += 8;
    }
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);
    return ires;
}
Example #24
int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    
    sad = (vector unsigned int)vec_splat_u32(0);


    for(i=0;i<16;i++) {
	/* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);
       
	/* Calculate a sum of abs differences vector */ 
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);
	
	/* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    
    return s;
}
Example #25
signed int
scalarproduct_int16_vsx (const signed short *v1, const signed short *v2,
			 int order)
{
  int i;
  LOAD_ZERO;
  register vec_s16 vec1;
  register vec_s32 res = vec_splat_s32 (0), t;
  signed int ires;

  for (i = 0; i < order; i += 8) {
    vec1 = vec_vsx_ld (0, v1);
    t    = vec_msum (vec1, vec_vsx_ld (0, v2), zero_s32v);
    res  = vec_sums (t, res);
    v1  += 8;
    v2  += 8;
  }
  res = vec_splat (res, 3);
  vec_ste (res, 0, &ires);

  return ires;
}
Example #26
static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s;
    __vector zero = __vzero();
/*
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;	
*/

	__vector *tv;
    __vector pixv;
    __vector sv;
    __vector sum;

    sv = __vzero();

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        //tv = (vector unsigned char *) pix;
		tv = (__vector*) pix;
        //pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));
		pixv = __vperm(tv[0], tv[1], __lvsl(pix,0));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    
	vec_ste(sum, 0, &s);

    return s;
}
Example #27
static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix2);
    vector unsigned char t1, t2, t3,t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);


    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        vector unsigned char pix2l = vec_ld( 0, pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_ld(0, pix1);
        t2 = vec_perm(pix2l, pix2r, perm);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
Example #28
static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                           int order)
{
    int i;
    LOAD_ZERO;
    const vec_s16 *pv;
    register vec_s16 vec1;
    register vec_s32 res = vec_splat_s32(0), t;
    int32_t ires;

    for(i = 0; i < order; i += 8){
        pv = (const vec_s16*)v1;
        vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1));
        t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        res = vec_sums(t, res);
        v1 += 8;
        v2 += 8;
    }
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);
    return ires;
}
Example #29
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
                                                    const int16_t *v2,
                                                    const int16_t *v3,
                                                    int order, int mul)
{
    LOAD_ZERO;
    vec_s16 *pv1 = (vec_s16 *) v1;
    register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
    register vec_s16 t0, t1, i0, i1, i4, i2, i3;
    register vec_s32 res = zero_s32v;
#if HAVE_BIGENDIAN
    register vec_u8 align = vec_lvsl(0, v2);
    i2 = vec_ld(0, v2);
    i3 = vec_ld(0, v3);
#endif
    int32_t ires;

    order >>= 4;
    do {
        GET_T(t0,t1,v2,i1,i2);
        i0     = pv1[0];
        i1     = pv1[1];
        res    = vec_msum(t0, i0, res);
        res    = vec_msum(t1, i1, res);
        GET_T(t0,t1,v3,i4,i3);
        pv1[0] = vec_mladd(t0, muls, i0);
        pv1[1] = vec_mladd(t1, muls, i1);
        pv1   += 2;
        v2    += 16;
        v3    += 16;
    } while (--order);
    res = vec_splat(vec_sums(res, zero_s32v), 3);
    vec_ste(res, 0, &ires);

    return ires;
}
Example #30
static void ProjectDlightTexture_altivec( void ) {
	int		i, l;
	vec_t	origin0, origin1, origin2;
	float   texCoords0, texCoords1;
	vector float floatColorVec0, floatColorVec1;
	vector float modulateVec, colorVec, zero;
	vector short colorShort;
	vector signed int colorInt;
	vector unsigned char floatColorVecPerm, modulatePerm, colorChar;
	vector unsigned char vSel = VECCONST_UINT8(0x00, 0x00, 0x00, 0xff,
                                               0x00, 0x00, 0x00, 0xff,
                                               0x00, 0x00, 0x00, 0xff,
                                               0x00, 0x00, 0x00, 0xff);
	float	*texCoords;
	byte	*colors;
	byte	clipBits[SHADER_MAX_VERTEXES];
	float	texCoordsArray[SHADER_MAX_VERTEXES][2];
	byte	colorArray[SHADER_MAX_VERTEXES][4];
	unsigned	hitIndexes[SHADER_MAX_INDEXES];
	int		numIndexes;
	float	scale;
	float	radius;
	vec3_t	floatColor;
	float	modulate = 0.0f;

	if ( !backEnd.refdef.num_dlights ) {
		return;
	}

	// There has to be a better way to do this so that floatColor
	// and/or modulate are already 16-byte aligned.
	floatColorVecPerm = vec_lvsl(0,(float *)floatColor);
	modulatePerm = vec_lvsl(0,(float *)&modulate);
	modulatePerm = (vector unsigned char)vec_splat((vector unsigned int)modulatePerm,0);
	zero = (vector float)vec_splat_s8(0);

	for ( l = 0 ; l < backEnd.refdef.num_dlights ; l++ ) {
		dlight_t	*dl;

		if ( !( tess.dlightBits & ( 1 << l ) ) ) {
			continue;	// this surface definitely doesn't have any of this light
		}
		texCoords = texCoordsArray[0];
		colors = colorArray[0];

		dl = &backEnd.refdef.dlights[l];
		origin0 = dl->transformed[0];
		origin1 = dl->transformed[1];
		origin2 = dl->transformed[2];
		radius = dl->radius;
		scale = 1.0f / radius;

		if(r_greyscale->integer)
		{
			float luminance;
			
			luminance = LUMA(dl->color[0], dl->color[1], dl->color[2]) * 255.0f;
			floatColor[0] = floatColor[1] = floatColor[2] = luminance;
		}
		else if(r_greyscale->value)
		{
			float luminance;
			
			luminance = LUMA(dl->color[0], dl->color[1], dl->color[2]) * 255.0f;
			floatColor[0] = LERP(dl->color[0] * 255.0f, luminance, r_greyscale->value);
			floatColor[1] = LERP(dl->color[1] * 255.0f, luminance, r_greyscale->value);
			floatColor[2] = LERP(dl->color[2] * 255.0f, luminance, r_greyscale->value);
		}
		else
		{
			floatColor[0] = dl->color[0] * 255.0f;
			floatColor[1] = dl->color[1] * 255.0f;
			floatColor[2] = dl->color[2] * 255.0f;
		}
		floatColorVec0 = vec_ld(0, floatColor);
		floatColorVec1 = vec_ld(11, floatColor);
		floatColorVec0 = vec_perm(floatColorVec0,floatColorVec0,floatColorVecPerm);
		for ( i = 0 ; i < tess.numVertexes ; i++, texCoords += 2, colors += 4 ) {
			int		clip = 0;
			vec_t dist0, dist1, dist2;
			
			dist0 = origin0 - tess.xyz[i][0];
			dist1 = origin1 - tess.xyz[i][1];
			dist2 = origin2 - tess.xyz[i][2];

			backEnd.pc.c_dlightVertexes++;

			texCoords0 = 0.5f + dist0 * scale;
			texCoords1 = 0.5f + dist1 * scale;

			if( !r_dlightBacks->integer &&
					// dist . tess.normal[i]
					( dist0 * tess.normal[i][0] +
					dist1 * tess.normal[i][1] +
					dist2 * tess.normal[i][2] ) < 0.0f ) {
				clip = 63;
			} else {
				if ( texCoords0 < 0.0f ) {
					clip |= 1;
				} else if ( texCoords0 > 1.0f ) {
					clip |= 2;
				}
				if ( texCoords1 < 0.0f ) {
					clip |= 4;
				} else if ( texCoords1 > 1.0f ) {
					clip |= 8;
				}
				texCoords[0] = texCoords0;
				texCoords[1] = texCoords1;

				// modulate the strength based on the height and color
				if ( dist2 > radius ) {
					clip |= 16;
					modulate = 0.0f;
				} else if ( dist2 < -radius ) {
					clip |= 32;
					modulate = 0.0f;
				} else {
					dist2 = Q_fabs(dist2);
					if ( dist2 < radius * 0.5f ) {
						modulate = 1.0f;
					} else {
						modulate = 2.0f * (radius - dist2) * scale;
					}
				}
			}
			clipBits[i] = clip;

			modulateVec = vec_ld(0,(float *)&modulate);
			modulateVec = vec_perm(modulateVec,modulateVec,modulatePerm);
			colorVec = vec_madd(floatColorVec0,modulateVec,zero);
			colorInt = vec_cts(colorVec,0);	// RGBx
			colorShort = vec_pack(colorInt,colorInt);		// RGBxRGBx
			colorChar = vec_packsu(colorShort,colorShort);	// RGBxRGBxRGBxRGBx
			colorChar = vec_sel(colorChar,vSel,vSel);		// RGBARGBARGBARGBA replace alpha with 255
			vec_ste((vector unsigned int)colorChar,0,(unsigned int *)colors);	// store color
		}

		// build a list of triangles that need light
		numIndexes = 0;
		for ( i = 0 ; i < tess.numIndexes ; i += 3 ) {
			int		a, b, c;

			a = tess.indexes[i];
			b = tess.indexes[i+1];
			c = tess.indexes[i+2];
			if ( clipBits[a] & clipBits[b] & clipBits[c] ) {
				continue;	// not lighted
			}
			hitIndexes[numIndexes] = a;
			hitIndexes[numIndexes+1] = b;
			hitIndexes[numIndexes+2] = c;
			numIndexes += 3;
		}

		if ( !numIndexes ) {
			continue;
		}

		qglEnableClientState( GL_TEXTURE_COORD_ARRAY );
		qglTexCoordPointer( 2, GL_FLOAT, 0, texCoordsArray[0] );

		qglEnableClientState( GL_COLOR_ARRAY );
		qglColorPointer( 4, GL_UNSIGNED_BYTE, 0, colorArray );

		GL_Bind( tr.dlightImage );
		// include GLS_DEPTHFUNC_EQUAL so alpha tested surfaces don't add light
		// where they aren't rendered
		if ( dl->additive ) {
			GL_State( GLS_SRCBLEND_ONE | GLS_DSTBLEND_ONE | GLS_DEPTHFUNC_EQUAL );
		}
		else {
			GL_State( GLS_SRCBLEND_DST_COLOR | GLS_DSTBLEND_ONE | GLS_DEPTHFUNC_EQUAL );
		}
		R_DrawElements( numIndexes, hitIndexes );
		backEnd.pc.c_totalIndexes += numIndexes;
		backEnd.pc.c_dlightIndexes += numIndexes;
	}
}