void
gimp_composite_darken_rgba8_rgba8_rgba8_altivec (GimpCompositeContext *ctx)
{
  const guchar *A = ctx->A;
  const guchar *B = ctx->B;
  guchar       *D = ctx->D;
  guint         length = ctx->n_pixels;
  vector unsigned char a, b, d;

  while (length >= 4)
    {
      a = LoadUnaligned(A);
      b = LoadUnaligned(B);

      d = vec_min(a, b);

      StoreUnaligned(d, D);

      A += 16;
      B += 16;
      D += 16;
      length -= 4;
    }

  /* process last pixels */
  length = length * 4;
  a = LoadUnalignedLess(A, length);
  b = LoadUnalignedLess(B, length);

  d = vec_min(a, b);

  StoreUnalignedLess(d, D, length);
}
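For reference, the scalar per-pixel operation behind the darken compositor above is a per-channel minimum, alpha included, which is why the whole loop body collapses to one vec_min over four RGBA8 pixels at a time. A minimal scalar sketch (illustrative only, not part of GIMP):

#include <glib.h>   /* guchar, gint, MIN; plain unsigned char works the same */

static void
darken_rgba8_ref (const guchar a[4], const guchar b[4], guchar d[4])
{
  gint c;

  /* every channel, including alpha, is the minimum of the two layers */
  for (c = 0; c < 4; c++)
    d[c] = MIN (a[c], b[c]);
}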
void gimp_composite_multiply_rgba8_rgba8_rgba8_altivec (GimpCompositeContext *ctx) { const guchar *A = ctx->A; const guchar *B = ctx->B; guchar *D = ctx->D; guint length = ctx->n_pixels; vector unsigned char a,b,d,alpha_a,alpha_b,alpha; vector unsigned short al,ah; while (length >= 4) { a=LoadUnaligned(A); b=LoadUnaligned(B); al=vec_mule(a,b); al=vec_add(al,ox0080); ah=vec_mulo(a,b); ah=vec_add(ah,ox0080); al=vec_add(al,vec_sr(al,ox0008)); ah=vec_add(ah,vec_sr(ah,ox0008)); d=vec_perm((vector unsigned char)al,(vector unsigned char)ah,combine_high_bytes); alpha_a=vec_and(a, alphamask); alpha_b=vec_and(b, alphamask); alpha=vec_min(alpha_a, alpha_b); d=vec_andc(d, alphamask); d=vec_or(d, alpha); StoreUnaligned(d, D); A+=16; B+=16; D+=16; length-=4; } /* process last pixels */ length = length*4; a=LoadUnalignedLess(A, length); b=LoadUnalignedLess(B, length); al=vec_mule(a,b); al=vec_add(al,ox0080); ah=vec_mulo(a,b); ah=vec_add(ah,ox0080); al=vec_add(al,vec_sr(al,ox0008)); ah=vec_add(ah,vec_sr(ah,ox0008)); d=vec_perm((vector unsigned char)al,(vector unsigned char)ah,combine_high_bytes); alpha_a=vec_and(a, alphamask); alpha_b=vec_and(b, alphamask); alpha=vec_min(alpha_a, alpha_b); d=vec_andc(d, alphamask); d=vec_or(d, alpha); StoreUnalignedLess(d, D, length); }
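The multiply compositor relies on a fixed-point identity: for 8-bit a and b, ((a*b + 128) + ((a*b + 128) >> 8)) >> 8 equals a*b divided by 255 and rounded. That is what the vec_mule/vec_mulo 16-bit products, the ox0080 bias, the shift-and-add, and the final vec_perm (which picks the high byte of each 16-bit sum; combine_high_bytes is a permute constant defined elsewhere in the original file) implement per lane. A scalar check of the identity, added here as an illustration only:

#include <assert.h>
#include <stdint.h>

static uint8_t
mul8_round_div255 (uint8_t a, uint8_t b)
{
  uint16_t t = (uint16_t) (a * b) + 128;    /* 16-bit product plus rounding bias (ox0080) */

  return (uint8_t) ((t + (t >> 8)) >> 8);   /* divide by 255 with rounding, no division   */
}

int
main (void)
{
  int a, b;

  for (a = 0; a < 256; a++)
    for (b = 0; b < 256; b++)
      assert (mul8_round_div255 (a, b) == (a * b + 127) / 255);

  return 0;
}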
SIMD_INLINE void PartialSort5(v128_u8 a[5])
{
    SortU8(a[2], a[3]);
    SortU8(a[1], a[2]);
    SortU8(a[2], a[3]);
    a[4] = vec_max(a[1], a[4]);
    a[0] = vec_min(a[0], a[3]);
    SortU8(a[2], a[0]);
    a[2] = vec_max(a[4], a[2]);
    a[2] = vec_min(a[2], a[0]);
}
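PartialSort5 above and PartialSort9 below are median-of-5 / median-of-9 selection networks built from a SortU8 compare-exchange that is defined elsewhere in the library. A plausible definition is sketched here as an assumption, not copied from the original source: after SortU8(a, b), a holds the element-wise minimum and b the element-wise maximum, so a short sequence of exchanges is enough to move the median into a[2] (for 5 inputs) or a[4] (for 9 inputs).

#include <altivec.h>

typedef vector unsigned char v128_u8;   /* assumed to match the library's typedef */

/* compare-exchange: a <- min(a, b), b <- max(a, b), per byte lane */
#define SortU8(a, b)                  \
    do {                              \
        v128_u8 _t = vec_min(a, b);   \
        b = vec_max(a, b);            \
        a = _t;                       \
    } while (0)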
void gimp_composite_difference_rgba8_rgba8_rgba8_altivec (GimpCompositeContext *ctx) { const guchar *A = ctx->A; const guchar *B = ctx->B; guchar *D = ctx->D; guint length = ctx->n_pixels; vector unsigned char a,b,d,e,alpha_a,alpha_b; while (length >= 4) { a=LoadUnaligned(A); b=LoadUnaligned(B); alpha_a=vec_and(a, alphamask); alpha_b=vec_and(b, alphamask); d=vec_min(alpha_a, alpha_b); a=vec_andc(a, alphamask); a=vec_adds(a, d); b=vec_andc(b, alphamask); d=vec_subs(a, b); e=vec_subs(b, a); d=vec_add(d,e); StoreUnaligned(d, D); A+=16; B+=16; D+=16; length-=4; } /* process last pixels */ length = length*4; a=LoadUnalignedLess(A, length); b=LoadUnalignedLess(B, length); alpha_a=vec_and(a,alphamask); alpha_b=vec_and(b,alphamask); d=vec_min(alpha_a,alpha_b); a=vec_andc(a,alphamask); a=vec_adds(a,d); b=vec_andc(b,alphamask); d=vec_subs(a,b); e=vec_subs(b, a); d=vec_add(d,e); StoreUnalignedLess(d, D, length); }
SIMD_INLINE void PartialSort9(v128_u8 a[9])
{
    SortU8(a[1], a[2]); SortU8(a[4], a[5]); SortU8(a[7], a[8]);
    SortU8(a[0], a[1]); SortU8(a[3], a[4]); SortU8(a[6], a[7]);
    SortU8(a[1], a[2]); SortU8(a[4], a[5]); SortU8(a[7], a[8]);
    a[3] = vec_max(a[0], a[3]);
    a[5] = vec_min(a[5], a[8]);
    SortU8(a[4], a[7]);
    a[6] = vec_max(a[3], a[6]);
    a[4] = vec_max(a[1], a[4]);
    a[2] = vec_min(a[2], a[5]);
    a[4] = vec_min(a[4], a[7]);
    SortU8(a[4], a[2]);
    a[4] = vec_max(a[6], a[4]);
    a[4] = vec_min(a[4], a[2]);
}
// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0, register vec_u8_t p1,
                                       register vec_u8_t p2, register vec_u8_t q0,
                                       register vec_u8_t tc0)
{
    register vec_u8_t average = vec_avg(p0, q0);
    register vec_u8_t temp;
    register vec_u8_t uncliped;
    register vec_u8_t ones;
    register vec_u8_t max;
    register vec_u8_t min;
    register vec_u8_t newp1;

    temp = vec_xor(average, p2);
    average = vec_avg(average, p2);     /* avg(p2, avg(p0, q0))    */
    ones = vec_splat_u8(1);
    temp = vec_and(temp, ones);         /* (p2^avg(p0, q0)) & 1    */
    uncliped = vec_subs(average, temp); /* (p2+((p0+q0+1)>>1))>>1  */
    max = vec_adds(p1, tc0);
    min = vec_subs(p1, tc0);
    newp1 = vec_max(min, uncliped);
    newp1 = vec_min(max, newp1);
    return newp1;
}
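A scalar restatement of what this routine computes per byte: vec_avg rounds up, so avg(avg(p0, q0), p2) can be one too large; the xor-and-mask detects exactly the case where the two operands have different parity, and the subtraction corrects the result back to the floor-based (p2 + ((p0 + q0 + 1) >> 1)) >> 1, which is then clipped to [p1 - tc0, p1 + tc0] with saturating adds/subs. The reference function below is an illustration, not FFmpeg code:

#include <stdint.h>

static uint8_t deblock_q1_ref(uint8_t p0, uint8_t p1, uint8_t p2,
                              uint8_t q0, uint8_t tc0)
{
    int lo = p1 - tc0;                                 /* vec_subs: saturates at 0   */
    int hi = p1 + tc0;                                 /* vec_adds: saturates at 255 */
    int unclipped = (p2 + ((p0 + q0 + 1) >> 1)) >> 1;  /* the value before clipping  */

    if (lo < 0)   lo = 0;
    if (hi > 255) hi = 255;
    if (unclipped < lo) return (uint8_t) lo;
    if (unclipped > hi) return (uint8_t) hi;
    return (uint8_t) unclipped;
}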
static void yuv2plane1_nbps_vsx(const int16_t *src, uint16_t *dest, int dstW, int big_endian, int output_bits) { const int dst_u = -(uintptr_t)dest & 7; const int shift = 15 - output_bits; const int add = (1 << (shift - 1)); const int clip = (1 << output_bits) - 1; const vector uint16_t vadd = (vector uint16_t) {add, add, add, add, add, add, add, add}; const vector uint16_t vswap = (vector uint16_t) vec_splat_u16(big_endian ? 8 : 0); const vector uint16_t vshift = (vector uint16_t) vec_splat_u16(shift); const vector uint16_t vlargest = (vector uint16_t) {clip, clip, clip, clip, clip, clip, clip, clip}; vector uint16_t v; int i; yuv2plane1_nbps_u(src, dest, dst_u, big_endian, output_bits, 0); for (i = dst_u; i < dstW - 7; i += 8) { v = vec_vsx_ld(0, (const uint16_t *) &src[i]); v = vec_add(v, vadd); v = vec_sr(v, vshift); v = vec_min(v, vlargest); v = vec_rl(v, vswap); vec_st(v, 0, &dest[i]); } yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i); }
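Per sample, the vector loop above does the following (a scalar sketch mirroring the intrinsics; the unaligned head and tail are handled by the scalar yuv2plane1_nbps_u helper): add the rounding constant, shift down to the output bit depth, clamp to the largest representable value, and byte-swap each 16-bit lane when the output is big-endian, which is what the rotate-left by 8 via vswap does.

#include <stdint.h>

static uint16_t yuv2plane1_nbps_one(int16_t s, int shift, int clip, int big_endian)
{
    uint16_t v = (uint16_t) (s + (1 << (shift - 1)));  /* vec_add(v, vadd)     */

    v = (uint16_t) (v >> shift);                       /* vec_sr(v, vshift)    */
    if (v > clip)
        v = (uint16_t) clip;                           /* vec_min(v, vlargest) */
    if (big_endian)
        v = (uint16_t) ((v << 8) | (v >> 8));          /* vec_rl(v, vswap)     */
    return v;
}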
// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) static inline vector unsigned char h264_deblock_q1(register vector unsigned char p0, register vector unsigned char p1, register vector unsigned char p2, register vector unsigned char q0, register vector unsigned char tc0) { register vector unsigned char average = vec_avg(p0, q0); register vector unsigned char temp; register vector unsigned char uncliped; register vector unsigned char ones; register vector unsigned char max; register vector unsigned char min; register vector unsigned char newp1; temp = vec_xor(average, p2); average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ ones = vec_splat_u8(1); temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */ uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */ max = vec_adds(p1, tc0); min = vec_subs(p1, tc0); newp1 = vec_max(min, uncliped); newp1 = vec_min(max, newp1); return newp1; }
void signal_task_start(task_t *task, const int nproc, proc_t *procs, req_list_t *rlist) { int *buf, i; proc_t *ptr; const int tskid = task->tskid; int leader, reqpos; /* buf = (int *)malloc((2+nproc)*sizeof(int)); */ buf = (int *)alloca((2+nproc)*sizeof(int)); assert(buf != NULL); ptr = procs; buf[0] = TASK_START; buf[1] = tskid; for(i=0; i<nproc; i++) { buf[2+i] = ptr->procid; ptr = ptr->next; } leader = vec_min(nproc, &buf[2]); #if LEADER_BCAST MPI_Send(buf, 2+nproc, MPI_INT, leader,SIGNAL_TAG, MPI_COMM_WORLD); #else for(i=0; i<nproc; i++) { MPI_Send(buf, 2+nproc, MPI_INT, buf[2+i],SIGNAL_TAG, MPI_COMM_WORLD); } #endif /* free(buf); */ reqpos = req_list_get_for_running(rlist, task); MPI_Irecv(task->v, 2, MPI_INT, leader, SIGNAL_TAG, MPI_COMM_WORLD, &rlist->reqs[reqpos]); }
int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for (i = 0; i < 16; i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
void MotionCooccurrenceHistogram::NormalizeHistogram(vector<vector<double> > &vHist, vector<vector<double> > &vHistNor) { vector<vector<double> > vHistNorTemp; double histSum = 0; for (unsigned int i = 0; i<vHist.size(); i++) { double cLower, cUpper, cScale; vector<double> vHistTemp = vHist[i]; // Min max norm cLower = *min_element(vHistTemp.begin(), vHistTemp.end()); cUpper = *max_element(vHistTemp.begin(), vHistTemp.end()); cScale = cUpper - cLower; vector<double> vec_min(vHistTemp.size(), cLower); vector<double> vec_scale(vHistTemp.size(), cScale); if (cScale != 0) { transform(vHistTemp.begin(), vHistTemp.end(), vec_min.begin(), vHistTemp.begin(), std::minus<double>()); transform(vHistTemp.begin(), vHistTemp.end(), vec_scale.begin(), vHistTemp.begin(), std::divides<double>()); } /* // L-2 Norm UStatistics stats( vHistTemp ); double histSquareSum = stats.SquareSum(); if( histSquareSum != 0 ) { for( unsigned int j=0; j<vHistTemp.size(); j++ ) { vHistTemp[j] = sqrt( vHistTemp[j]/histSquareSum ); } } */ /* // Divide max normalization double nHistMaxValue; UExtrema<double> extremaHist( vHistTemp ); nHistMaxValue = extremaHist.Max(); if( nHistMaxValue != 0 ) { vHistTemp[j] = vHistTemp[j]/nHistMaxValue; } */ vHistNorTemp.push_back(vHistTemp); } vHistNor = vHistNorTemp; }
__SIMDd _SIMD_min_pd(__SIMDd a, __SIMDd b)
{
#ifdef USE_SSE
  return _mm_min_pd(a, b);
#elif defined USE_AVX
  return _mm256_min_pd(a, b);
#elif defined USE_IBM
  return vec_min(a, b);
#endif
}
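Note that this wrapper falls off the end of the function when none of USE_SSE, USE_AVX, or USE_IBM is defined. A defensive variant, sketched here as a suggestion (the _checked name is hypothetical, and __SIMDd plus the intrinsics come from the same library headers as the snippet above), turns that situation into a compile-time error:

__SIMDd _SIMD_min_pd_checked(__SIMDd a, __SIMDd b)
{
#ifdef USE_SSE
  return _mm_min_pd(a, b);
#elif defined USE_AVX
  return _mm256_min_pd(a, b);
#elif defined USE_IBM
  return vec_min(a, b);
#else
#error "no SIMD backend selected for _SIMD_min_pd"
#endif
}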
folgen_vektor_p folgen_vektor_faltung(folgen_vektor_p f, folgen_matrix_p Gamma) { folgen_vektor_p back; vec_p vec_1, n_1, n_2, grad_1, grad_2, r, s; int k, j, a, b; int size_1, size_2, dim; folge_p temp, temp1, sum; ASSERT( f->grad->dim == Gamma->grad1->dim ); dim = f->grad->dim; vec_1 = vec_one(dim); grad_2 = vec_min( f->grad, Gamma->grad2 ); n_2 = vec_add( grad_2, vec_1 ); size_2 = vec_size( n_2 ); grad_1 = vec_copy( Gamma->grad1 ); n_1 = vec_add( grad_1, vec_1 ); size_1 = vec_size( n_1 ); back = folgen_vektor_new( grad_1 ); for(k=0;k<size_1;k++) { sum = folge_new( vec_new(dim), vec_new(dim) ); for(j=0;j<size_2;j++) { r = entry_one2d( j, n_2 ); s = vec_add( f->grad, vec_1 ); a = entry_d2one( r, s ); vec_del( s ); s = vec_add( Gamma->grad2, vec_1 ); b = entry_d2one( r, s ); vec_del( s ); vec_del( r ); temp = folge_faltung( f->vektor[a], Gamma->matrix[k][b] ); temp1 = folge_add( sum, temp ); folge_del( temp ); folge_del( sum ); sum = temp1; } folge_del( back->vektor[k] ); back->vektor[k] = sum; } vec_del( n_1 ); vec_del( n_2 ); vec_del( grad_2 ); vec_del( vec_1 ); return back; }
/** * Sum of Squared Errors for a 8x8 block. * AltiVec-enhanced. * It's the pix_abs8x8_altivec code above w/ squaring added. */ int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; vector unsigned char t1, t2, t3,t4, t5; vector unsigned int sum; vector signed int sumsqr; sum = (vector unsigned int)vec_splat_u32(0); permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); for(i=0;i<8;i++) { /* Read potentially unaligned pixels into t1 and t2 Since we're reading 16 pixels, and actually only want 8, mask out the last 8 pixels. The 0s don't change the sum. */ perm1 = vec_lvsl(0, pix1); pix1v = (vector unsigned char *) pix1; perm2 = vec_lvsl(0, pix2); pix2v = (vector unsigned char *) pix2; t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); /* Since we want to use unsigned chars, we can take advantage of the fact that abs(a-b)^2 = (a-b)^2. */ /* Calculate abs differences vector */ t3 = vec_max(t1, t2); t4 = vec_min(t1, t2); t5 = vec_sub(t3, t4); /* Square the values and add them to our sum */ sum = vec_msum(t5, t5, sum); pix1 += line_size; pix2 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); sumsqr = vec_splat(sumsqr, 3); vec_ste(sumsqr, 0, &s); return s; }
int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); vector unsigned char *tv; vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; vector unsigned int sad; vector signed int sumdiffs; s = 0; sad = (vector unsigned int)vec_splat_u32(0); for(i=0;i<16;i++) { /* Read unaligned pixels into our vectors. The vectors are as follows: pix1v: pix1[0]-pix1[15] pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] */ tv = (vector unsigned char *) pix1; pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); tv = (vector unsigned char *) &pix2[0]; pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); tv = (vector unsigned char *) &pix2[1]; pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); /* Calculate the average vector */ avgv = vec_avg(pix2v, pix2iv); /* Calculate a sum of abs differences vector */ t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix2 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; vector unsigned char t1, t2, t3,t4, t5; vector unsigned int sad; vector signed int sumdiffs; sad = (vector unsigned int)vec_splat_u32(0); permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); for(i=0;i<8;i++) { /* Read potentially unaligned pixels into t1 and t2 Since we're reading 16 pixels, and actually only want 8, mask out the last 8 pixels. The 0s don't change the sum. */ perm1 = vec_lvsl(0, pix1); pix1v = (vector unsigned char *) pix1; perm2 = vec_lvsl(0, pix2); pix2v = (vector unsigned char *) pix2; t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); /* Calculate a sum of abs differences vector */ t3 = vec_max(t1, t2); t4 = vec_min(t1, t2); t5 = vec_sub(t3, t4); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix2 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
folge_p folge_add(folge_p f, folge_p g) { folge_p back; int size, k, size_g, size_f; fepc_real_t x, y; vec_p temp1, temp2, max, lang, min, r; size_f = vec_size( f->lang ); size_g = vec_size( g->lang ); if(size_f == 0) { back = folge_copy( g ); return back; } if(size_g == 0) { back = folge_copy( f ); return back; } if( (size_g!=0) && (size_f!=0) ) { min = vec_min( f->start, g->start ); temp1 = vec_add( f->start, f->lang ); temp2 = vec_add( g->start, g->lang ); max = vec_max( temp1, temp2 ); vec_del( temp1 ); vec_del( temp2 ); lang = vec_op( 1, max, -1, min); vec_del( max ); back = folge_new( min, lang ); size = vec_size( lang ); for(k=0;k<size;k++) { temp1 = entry_one2d( k, lang ); r = vec_add( min, temp1 ); vec_del( temp1 ); x = folge_glied( r, f ); y = folge_glied( r, g ); vec_del( r ); back->glied[k] = x + y; } return back; } }
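The index arithmetic in folge_add (the same pattern appears in folge_norm further down) is a bounding-box union: the result starts at the component-wise minimum of the two starts and extends to the component-wise maximum of the two ends. Shown for one dimension, as an illustrative sketch rather than library code:

#include <stdio.h>

int main(void)
{
    int f_start = 2, f_lang = 3;                        /* f covers [2, 5) */
    int g_start = 4, g_lang = 4;                        /* g covers [4, 8) */

    int start = f_start < g_start ? f_start : g_start;  /* vec_min(f->start, g->start) */
    int end_f = f_start + f_lang;                       /* vec_add(f->start, f->lang)  */
    int end_g = g_start + g_lang;                       /* vec_add(g->start, g->lang)  */
    int end   = end_f > end_g ? end_f : end_g;          /* vec_max(temp1, temp2)       */
    int lang  = end - start;                            /* vec_op(1, max, -1, min)     */

    printf("result covers [%d, %d), lang = %d\n", start, start + lang, lang);
    return 0;
}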
static unsigned reg_sad_altivec(const kvz_pixel * const data1, const kvz_pixel * const data2, const int width, const int height, const unsigned stride1, const unsigned stride2) { vector unsigned int vsad = {0,0,0,0}, vzero = {0,0,0,0}; vector signed int sumdiffs; int tmpsad, sad = 0; int y, x; for (y = 0; y < height; ++y) { vector unsigned char perm1, perm2; perm1 = vec_lvsl(0, &data1[y * stride1]); perm2 = vec_lvsl(0, &data2[y * stride2]); for (x = 0; x <= width-16; x+=16) { vector unsigned char t1, t2, t3, t4, t5; vector unsigned char *current, *previous; current = (vector unsigned char *) &data1[y * stride1 + x]; previous = (vector unsigned char *) &data2[y * stride2 + x]; t1 = vec_perm(current[0], current[1], perm1 ); /* align current vector */ t2 = vec_perm(previous[0], previous[1], perm2 );/* align previous vector */ t3 = vec_max(t1, t2 ); /* find largest of two */ t4 = vec_min(t1, t2 ); /* find smaller of two */ t5 = vec_sub(t3, t4); /* find absolute difference */ vsad = vec_sum4s(t5, vsad); /* accumulate sum of differences */ } for (; x < width; ++x) { sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]); } } sumdiffs = vec_sums((vector signed int) vsad, (vector signed int) vzero); /* copy vector sum into unaligned result */ sumdiffs = vec_splat( sumdiffs, 3); vec_ste( sumdiffs, 0, &tmpsad ); sad += tmpsad; return sad; }
static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) { int i; int s; const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); vector unsigned char perm1 = vec_lvsl(0, pix2); vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1)); vector unsigned char pix2l, pix2r; vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; vector unsigned int sad; vector signed int sumdiffs; s = 0; sad = (vector unsigned int)vec_splat_u32(0); for (i = 0; i < h; i++) { /* Read unaligned pixels into our vectors. The vectors are as follows: pix1v: pix1[0]-pix1[15] pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] */ pix1v = vec_ld( 0, pix1); pix2l = vec_ld( 0, pix2); pix2r = vec_ld(16, pix2); pix2v = vec_perm(pix2l, pix2r, perm1); pix2iv = vec_perm(pix2l, pix2r, perm2); /* Calculate the average vector */ avgv = vec_avg(pix2v, pix2iv); /* Calculate a sum of abs differences vector */ t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix2 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm1, perm2, *pix1v, *pix2v; vector unsigned char t1, t2, t3,t4, t5; vector unsigned int sad; vector signed int sumdiffs; sad = (vector unsigned int)vec_splat_u32(0); for(i=0;i<16;i++) { /* Read potentially unaligned pixels into t1 and t2 */ perm1 = vec_lvsl(0, pix1); pix1v = (vector unsigned char *) pix1; perm2 = vec_lvsl(0, pix2); pix2v = (vector unsigned char *) pix2; t1 = vec_perm(pix1v[0], pix1v[1], perm1); t2 = vec_perm(pix2v[0], pix2v[1], perm2); /* Calculate a sum of abs differences vector */ t3 = vec_max(t1, t2); t4 = vec_min(t1, t2); t5 = vec_sub(t3, t4); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix2 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) { int i; int s; const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm = vec_lvsl(0, pix2); vector unsigned char t1, t2, t3,t4, t5; vector unsigned int sad; vector signed int sumdiffs; sad = (vector unsigned int)vec_splat_u32(0); for (i = 0; i < h; i++) { /* Read potentially unaligned pixels into t1 and t2 */ vector unsigned char pix2l = vec_ld( 0, pix2); vector unsigned char pix2r = vec_ld(15, pix2); t1 = vec_ld(0, pix1); t2 = vec_perm(pix2l, pix2r, perm); /* Calculate a sum of abs differences vector */ t3 = vec_max(t1, t2); t4 = vec_min(t1, t2); t5 = vec_sub(t3, t4); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix2 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
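All of the SAD kernels above lean on the same unsigned absolute-difference idiom: for unsigned bytes, |a - b| equals max(a, b) - min(a, b), so vec_max / vec_min / vec_sub give the per-byte absolute difference with no risk of wrap-around, and vec_sum4s / vec_sums reduce it to a scalar. A scalar check of the identity, for illustration only:

#include <assert.h>
#include <stdint.h>

int main(void)
{
    int a, b;

    for (a = 0; a < 256; a++)
        for (b = 0; b < 256; b++) {
            uint8_t hi = (uint8_t) (a > b ? a : b);   /* vec_max */
            uint8_t lo = (uint8_t) (a < b ? a : b);   /* vec_min */

            assert(hi - lo == (a > b ? a - b : b - a));
        }
    return 0;
}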
fepc_real_t folge_norm(folge_p f, folge_p g) { vec_p temp, temp1, temp2, min, max, lang, vec_1; int k, dim, size; fepc_real_t norm, diff; ASSERT(f->start->dim == g->start->dim); dim = f->start->dim; vec_1 = vec_one( dim ); min = vec_min( f->start, g->start ); temp1 = vec_add( f->start, f->lang ); temp2 = vec_add( g->start, g->lang ); temp = vec_max( temp1, temp2 ); vec_del( temp1 ); vec_del( temp2 ); lang = vec_op( 1, temp, -1, min ); vec_del( temp ); size = vec_size( lang ); norm = 0.0; for(k=0;k<size;k++) { temp = entry_one2d( k, lang ); temp1 = vec_add( temp, min ); diff = folge_glied( temp1, f ) - folge_glied( temp1, g ); norm = norm + ( diff * diff ); vec_del( temp ); vec_del( temp1 ); } vec_del( vec_1 ); vec_del( min ); vec_del( lang ); norm = sqrt(norm); return norm; }
// Called by validator when canonical result has been selected. // Compute credit for valid instances. // This is called exactly once for each valid result. // int assign_credit_set( WORKUNIT& wu, vector<RESULT>& results, DB_APP& app, vector<DB_APP_VERSION>& app_versions, vector<DB_HOST_APP_VERSION>& host_app_versions, double max_granted_credit, double& credit ) { unsigned int i; int mode, retval; double pfc; vector<double> normal; vector<double> approx; for (i=0; i<results.size(); i++) { RESULT& r = results[i]; if (r.validate_state != VALIDATE_STATE_VALID) continue; DB_HOST_APP_VERSION& hav = host_app_versions[i]; retval = get_pfc(r, wu, app, app_versions, hav, pfc, mode); if (retval) { log_messages.printf(MSG_CRITICAL, "get_pfc() error: %s\n", boincerror(retval) ); continue; } else { if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [RESULT#%d] get_pfc() returns credit %g mode %s\n", r.id, pfc*COBBLESTONE_SCALE, (mode==PFC_MODE_NORMAL)?"normal":"approx" ); } } if (pfc > wu.rsc_fpops_bound) { log_messages.printf(MSG_NORMAL, "[credit] PFC too high: %f\n", pfc*COBBLESTONE_SCALE ); pfc = wu_estimated_pfc(wu, app); } // max_granted_credit trumps rsc_fpops_bound; // the latter may be set absurdly high // if (max_granted_credit && pfc*COBBLESTONE_SCALE > max_granted_credit) { log_messages.printf(MSG_NORMAL, "[credit] Credit too high: %f\n", pfc*COBBLESTONE_SCALE ); pfc = max_granted_credit/COBBLESTONE_SCALE; } if (mode == PFC_MODE_NORMAL) { normal.push_back(pfc); } else { approx.push_back(pfc); } } // averaging policy: if there is least one normal result, // use the "low average" of normal results. // Otherwise use the min of all results // double x; if (normal.size()) { x = low_average(normal); } else if (approx.size()) { x = vec_min(approx); } else { x = 0; } x *= COBBLESTONE_SCALE; if (config.debug_credit) { log_messages.printf(MSG_NORMAL, "[credit] [WU#%d] assign_credit_set: credit %g\n", wu.id, x ); } credit = x; return 0; }
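In this snippet vec_min is not the AltiVec intrinsic used elsewhere on this page but a scalar helper over a list of doubles; it is assumed here to return the smallest element, used as the fallback credit when there are no normal-mode PFC samples. A minimal sketch of that assumed behaviour:

#include <stddef.h>

/* assumed behaviour of the helper: smallest element of a non-empty list */
static double vec_min_ref(const double *v, size_t n)
{
    double m = v[0];
    size_t i;

    for (i = 1; i < n; i++)
        if (v[i] < m)
            m = v[i];
    return m;
}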
template<> SIMD_INLINE v128_s16 InterferenceChange<true>(v128_s16 statistic, v128_s16 value, v128_s16 saturation)
{
    return vec_min(vec_add(statistic, value), saturation);
}
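Scalar equivalent of the specialization above, as an illustration (note that vec_add on 16-bit lanes wraps on overflow, whereas the plain int addition here does not): raise the statistic by value, but never past the saturation ceiling enforced by vec_min.

#include <stdint.h>

static int16_t interference_increase_ref(int16_t statistic, int16_t value,
                                          int16_t saturation)
{
    int s = statistic + value;                            /* vec_add */

    return (int16_t) (s < saturation ? s : saturation);   /* vec_min */
}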
void test1() { // CHECK-LABEL: define void @test1 // CHECK-LE-LABEL: define void @test1 res_vf = vec_abs(vf); // CHECK: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}}) dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_add(vd, vd); // CHECK: fadd <2 x double> // CHECK-LE: fadd <2 x double> res_vd = vec_and(vbll, vd); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> res_vd = vec_and(vd, vbll); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> res_vd = vec_and(vd, vd); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_andc(vbll, vd); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK-LE: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_andc(vd, vbll); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK-LE: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_andc(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_ceil(vd); // CHECK: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}}) res_vf = vec_ceil(vf); // CHECK: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}}) res_vbll = vec_cmpeq(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpeq(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmpge(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpge(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) 
res_vbll = vec_cmpgt(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpgt(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmple(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmple(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmplt(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmplt(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) /* vec_cpsgn */ res_vf = vec_cpsgn(vf, vf); // CHECK: call <4 x float> @llvm.copysign.v4f32(<4 x float> %{{.+}}, <4 x float> %{{.+}}) // CHECK-LE: call <4 x float> @llvm.copysign.v4f32(<4 x float> %{{.+}}, <4 x float> %{{.+}}) res_vd = vec_cpsgn(vd, vd); // CHECK: call <2 x double> @llvm.copysign.v2f64(<2 x double> %{{.+}}, <2 x double> %{{.+}}) // CHECK-LE: call <2 x double> @llvm.copysign.v2f64(<2 x double> %{{.+}}, <2 x double> %{{.+}}) /* vec_div */ res_vsll = vec_div(vsll, vsll); // CHECK: sdiv <2 x i64> // CHECK-LE: sdiv <2 x i64> res_vull = vec_div(vull, vull); // CHECK: udiv <2 x i64> // CHECK-LE: udiv <2 x i64> res_vf = vec_div(vf, vf); // CHECK: fdiv <4 x float> // CHECK-LE: fdiv <4 x float> res_vd = vec_div(vd, vd); // CHECK: fdiv <2 x double> // CHECK-LE: fdiv <2 x double> /* vec_max */ res_vf = vec_max(vf, vf); // CHECK: @llvm.ppc.vsx.xvmaxsp // CHECK-LE: @llvm.ppc.vsx.xvmaxsp res_vd = vec_max(vd, vd); // CHECK: @llvm.ppc.vsx.xvmaxdp // CHECK-LE: @llvm.ppc.vsx.xvmaxdp res_vf = vec_vmaxfp(vf, vf); // CHECK: @llvm.ppc.vsx.xvmaxsp // CHECK-LE: @llvm.ppc.vsx.xvmaxsp /* vec_min */ res_vf = vec_min(vf, vf); // CHECK: @llvm.ppc.vsx.xvminsp // CHECK-LE: @llvm.ppc.vsx.xvminsp res_vd = vec_min(vd, vd); // CHECK: @llvm.ppc.vsx.xvmindp // CHECK-LE: @llvm.ppc.vsx.xvmindp res_vf = vec_vminfp(vf, vf); // CHECK: @llvm.ppc.vsx.xvminsp // CHECK-LE: @llvm.ppc.vsx.xvminsp res_d = __builtin_vsx_xsmaxdp(d, d); // CHECK: @llvm.ppc.vsx.xsmaxdp // CHECK-LE: @llvm.ppc.vsx.xsmaxdp res_d = __builtin_vsx_xsmindp(d, d); // CHECK: @llvm.ppc.vsx.xsmindp // CHECK-LE: @llvm.ppc.vsx.xsmindp /* vec_perm */ res_vsll = vec_perm(vsll, vsll, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_perm(vull, vull, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vbll = vec_perm(vbll, vbll, vuc); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> 
{{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vf = vec_round(vf); // CHECK: call <4 x float> @llvm.round.v4f32(<4 x float> // CHECK-LE: call <4 x float> @llvm.round.v4f32(<4 x float> res_vd = vec_round(vd); // CHECK: call <2 x double> @llvm.round.v2f64(<2 x double> // CHECK-LE: call <2 x double> @llvm.round.v2f64(<2 x double> res_vd = vec_perm(vd, vd, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vd = vec_splat(vd, 1); // CHECK: [[T1:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vbll = vec_splat(vbll, 1); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vsll = vec_splat(vsll, 1); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vull = vec_splat(vull, 1); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vsi = vec_pack(vsll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vui = vec_pack(vull, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vbi = vec_pack(vbll, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_vperm(vsll, vsll, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_vperm(vull, vull, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vd = vec_vperm(vd, vd, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm /* vec_vsx_ld */ res_vsi = vec_vsx_ld(0, &vsi); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vui = vec_vsx_ld(0, &vui); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vf = vec_vsx_ld (0, &vf); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vsll = vec_vsx_ld(0, &vsll); 
// CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vull = vec_vsx_ld(0, &vull); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vd = vec_vsx_ld(0, &vd); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vull = vec_vsx_ld(0, &vull); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vd = vec_vsx_ld(0, &vd); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vss = vec_vsx_ld(0, &vss); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vss = vec_vsx_ld(0, &ss); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vus = vec_vsx_ld(0, &vus); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vus = vec_vsx_ld(0, &us); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vbc = vec_vsx_ld(0, &vbc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vsc = vec_vsx_ld(0, &vsc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vuc = vec_vsx_ld(0, &vuc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vsc = vec_vsx_ld(0, &sc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vuc = vec_vsx_ld(0, &uc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x /* vec_vsx_st */ vec_vsx_st(vsi, 0, &res_vsi); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsi, 0, &res_si); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vui, 0, &res_vui); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vui, 0, &res_ui); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vf, 0, &res_vf); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsll, 0, &res_vsll); // CHECK: @llvm.ppc.vsx.stxvd2x // CHECK-LE: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vull, 0, &res_vull); // CHECK: @llvm.ppc.vsx.stxvd2x // CHECK-LE: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vd, 0, &res_vd); // CHECK: @llvm.ppc.vsx.stxvd2x // CHECK-LE: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vss, 0, &res_vss); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vss, 0, &res_ss); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vus, 0, &res_vus); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vus, 0, &res_us); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsc, 0, &res_vsc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsc, 0, &res_sc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vuc, 0, &res_vuc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vuc, 0, &res_uc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vbc, 0, &res_vbc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vbc, 0, &res_sc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vbc, 0, &res_uc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x /* vec_and */ res_vsll = vec_and(vsll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_and(vbll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_and(vsll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_and(vull, vull); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_and(vbll, vull); // CHECK: and 
<2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_and(vull, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vbll = vec_and(vbll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> /* vec_vand */ res_vsll = vec_vand(vsll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_vand(vbll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_vand(vsll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_vand(vull, vull); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_vand(vbll, vull); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_vand(vull, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vbll = vec_vand(vbll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> /* vec_andc */ res_vsll = vec_andc(vsll, vsll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_andc(vbll, vsll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_andc(vsll, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_andc(vull, vull); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_andc(vbll, vull); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_andc(vull, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vbll = vec_andc(vbll, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vf = vec_floor(vf); // CHECK: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_floor(vd); // CHECK: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_madd(vf, vf, vf); // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) res_vd = vec_madd(vd, vd, vd); // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) /* vec_mergeh */ res_vsll = vec_mergeh(vsll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_mergeh(vsll, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_mergeh(vbll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergeh(vull, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergeh(vull, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergeh(vbll, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm /* vec_mergel */ res_vsll = vec_mergel(vsll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_mergel(vsll, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = 
vec_mergel(vbll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergel(vull, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergel(vull, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergel(vbll, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm /* vec_msub */ res_vf = vec_msub(vf, vf, vf); // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> res_vd = vec_msub(vd, vd, vd); // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // CHECK-LE: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> res_vsll = vec_mul(vsll, vsll); // CHECK: mul <2 x i64> // CHECK-LE: mul <2 x i64> res_vull = vec_mul(vull, vull); // CHECK: mul <2 x i64> // CHECK-LE: mul <2 x i64> res_vf = vec_mul(vf, vf); // CHECK: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}} res_vd = vec_mul(vd, vd); // CHECK: fmul <2 x double> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: fmul <2 x double> %{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_nearbyint(vf); // CHECK: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_nearbyint(vd); // CHECK: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_nmadd(vf, vf, vf); // CHECK: [[FM:[0-9]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]] // CHECK-LE: [[FM:[0-9]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-LE-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]] res_vd = vec_nmadd(vd, vd, vd); // CHECK: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] // CHECK-LE: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-LE-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] res_vf = vec_nmsub(vf, vf, vf); // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float 
-0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} res_vd = vec_nmsub(vd, vd, vd); // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] // CHECK-LE: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // CHECK-LE-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] /* vec_nor */ res_vsll = vec_nor(vsll, vsll); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> // CHECK-LE: or <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_nor(vull, vull); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> // CHECK-LE: or <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_nor(vbll, vbll); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> // CHECK-LE: or <2 x i64> // CHECK-LE: xor <2 x i64> res_vd = vec_nor(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1> // CHECK-LE: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1> /* vec_or */ res_vsll = vec_or(vsll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_or(vbll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_or(vsll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_or(vull, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_or(vbll, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_or(vull, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vbll = vec_or(vbll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vd = vec_or(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} res_vd = vec_or(vbll, vd); // CHECK: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[T2:%.+]] = or <2 x i64> %{{[0-9]+}}, [[T1]] // CHECK: bitcast <2 x i64> [[T2]] to <2 x double> // CHECK-LE: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: [[T2:%.+]] = or <2 x i64> %{{[0-9]+}}, [[T1]] // CHECK-LE: bitcast <2 x i64> [[T2]] to <2 x double> res_vd = vec_or(vd, vbll); // CHECK: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[T2:%.+]] = or <2 x i64> [[T1]], %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[T2]] to <2 x double> // CHECK-LE: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: [[T2:%.+]] = or <2 x i64> [[T1]], %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[T2]] to <2 x double> res_vf = vec_re(vf); // CHECK: call <4 x float> @llvm.ppc.vsx.xvresp(<4 x float> // CHECK-LE: call <4 x float> 
@llvm.ppc.vsx.xvresp(<4 x float> res_vd = vec_re(vd); // CHECK: call <2 x double> @llvm.ppc.vsx.xvredp(<2 x double> // CHECK-LE: call <2 x double> @llvm.ppc.vsx.xvredp(<2 x double> res_vf = vec_rint(vf); // CHECK: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_rint(vd); // CHECK: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_rsqrte(vf); // CHECK: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}}) res_vd = vec_rsqrte(vd); // CHECK: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}}) dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vf = vec_sel(vd, vd, vbll); // CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> %{{[0-9]+}}, // CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: or <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> // CHECK-LE: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> %{{[0-9]+}}, // CHECK-LE: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: or <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_sel(vd, vd, vull); // CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> %{{[0-9]+}}, // CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: or <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> // CHECK-LE: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> %{{[0-9]+}}, // CHECK-LE: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: or <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> res_vf = vec_sqrt(vf); // CHECK: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_sqrt(vd); // CHECK: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}}) res_vd = vec_sub(vd, vd); // CHECK: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_trunc(vf); // CHECK: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_trunc(vd); // CHECK: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}}) /* vec_vor */ res_vsll = vec_vor(vsll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_vor(vbll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_vor(vsll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_vor(vull, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_vor(vbll, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_vor(vull, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vbll = vec_vor(vbll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> /* vec_xor */ res_vsll = vec_xor(vsll, vsll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_xor(vbll, vsll); // CHECK: xor <2 x i64> // 
CHECK-LE: xor <2 x i64> res_vsll = vec_xor(vsll, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_xor(vull, vull); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_xor(vbll, vull); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_xor(vull, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vbll = vec_xor(vbll, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_xor(vd, vd); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> // CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_xor(vd, vbll); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> // CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_xor(vbll, vd); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> // CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double> /* vec_vxor */ res_vsll = vec_vxor(vsll, vsll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_vxor(vbll, vsll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_vxor(vsll, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_vxor(vull, vull); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_vxor(vbll, vull); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_vxor(vull, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vbll = vec_vxor(vbll, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_cts(vd, 0); // CHECK: fmul <2 x double> // CHECK: fptosi <2 x double> %{{.*}} to <2 x i64> // CHECK-LE: fmul <2 x double> // CHECK-LE: fptosi <2 x double> %{{.*}} to <2 x i64> res_vsll = vec_cts(vd, 31); // CHECK: fmul <2 x double> // CHECK: fptosi <2 x double> %{{.*}} to <2 x i64> // CHECK-LE: fmul <2 x double> // CHECK-LE: fptosi <2 x double> %{{.*}} to <2 x i64> res_vsll = vec_ctu(vd, 0); // CHECK: fmul <2 x double> // CHECK: fptoui <2 x double> %{{.*}} to <2 x i64> // CHECK-LE: fmul <2 x double> // CHECK-LE: fptoui <2 x double> %{{.*}} to <2 x i64> res_vsll = vec_ctu(vd, 31); // CHECK: fmul <2 x double> // CHECK: fptoui <2 x double> %{{.*}} to <2 x i64> // CHECK-LE: fmul <2 x double> // CHECK-LE: fptoui <2 x double> %{{.*}} to <2 x i64> res_vd = vec_ctf(vsll, 0); // CHECK: sitofp <2 x i64> %{{.*}} to <2 x double> // CHECK: fmul <2 x double> // CHECK-LE: sitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 x double> res_vd = vec_ctf(vsll, 31); // CHECK: sitofp <2 x i64> %{{.*}} to <2 x double> // CHECK: fmul <2 x double> // CHECK-LE: sitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 x double> res_vd = vec_ctf(vull, 0); // CHECK: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK: fmul <2 x double> // CHECK-LE: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 x double> res_vd = vec_ctf(vull, 31); // CHECK: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK: fmul <2 x double> // CHECK-LE: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 x 
double> }
static int dct_quantize_altivec(MpegEncContext* s, DCTELEM* data, int n, int qscale, int* overflow) { int lastNonZero; vector float row0, row1, row2, row3, row4, row5, row6, row7; vector float alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7; const vector float zero = (const vector float)FOUROF(0.); // used after quantize step int oldBaseValue = 0; // Load the data into the row/alt vectors { vector signed short data0, data1, data2, data3, data4, data5, data6, data7; data0 = vec_ld(0, data); data1 = vec_ld(16, data); data2 = vec_ld(32, data); data3 = vec_ld(48, data); data4 = vec_ld(64, data); data5 = vec_ld(80, data); data6 = vec_ld(96, data); data7 = vec_ld(112, data); // Transpose the data before we start TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7); // load the data into floating point vectors. We load // the high half of each row into the main row vectors // and the low half into the alt vectors. row0 = vec_ctf(vec_unpackh(data0), 0); alt0 = vec_ctf(vec_unpackl(data0), 0); row1 = vec_ctf(vec_unpackh(data1), 0); alt1 = vec_ctf(vec_unpackl(data1), 0); row2 = vec_ctf(vec_unpackh(data2), 0); alt2 = vec_ctf(vec_unpackl(data2), 0); row3 = vec_ctf(vec_unpackh(data3), 0); alt3 = vec_ctf(vec_unpackl(data3), 0); row4 = vec_ctf(vec_unpackh(data4), 0); alt4 = vec_ctf(vec_unpackl(data4), 0); row5 = vec_ctf(vec_unpackh(data5), 0); alt5 = vec_ctf(vec_unpackl(data5), 0); row6 = vec_ctf(vec_unpackh(data6), 0); alt6 = vec_ctf(vec_unpackl(data6), 0); row7 = vec_ctf(vec_unpackh(data7), 0); alt7 = vec_ctf(vec_unpackl(data7), 0); } // The following block could exist as a separate an altivec dct // function. However, if we put it inline, the DCT data can remain // in the vector local variables, as floats, which we'll use during the // quantize step... 
{ const vector float vec_0_298631336 = (vector float)FOUROF(0.298631336f); const vector float vec_0_390180644 = (vector float)FOUROF(-0.390180644f); const vector float vec_0_541196100 = (vector float)FOUROF(0.541196100f); const vector float vec_0_765366865 = (vector float)FOUROF(0.765366865f); const vector float vec_0_899976223 = (vector float)FOUROF(-0.899976223f); const vector float vec_1_175875602 = (vector float)FOUROF(1.175875602f); const vector float vec_1_501321110 = (vector float)FOUROF(1.501321110f); const vector float vec_1_847759065 = (vector float)FOUROF(-1.847759065f); const vector float vec_1_961570560 = (vector float)FOUROF(-1.961570560f); const vector float vec_2_053119869 = (vector float)FOUROF(2.053119869f); const vector float vec_2_562915447 = (vector float)FOUROF(-2.562915447f); const vector float vec_3_072711026 = (vector float)FOUROF(3.072711026f); int whichPass, whichHalf; for(whichPass = 1; whichPass<=2; whichPass++) { for(whichHalf = 1; whichHalf<=2; whichHalf++) { vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; vector float tmp10, tmp11, tmp12, tmp13; vector float z1, z2, z3, z4, z5; tmp0 = vec_add(row0, row7); // tmp0 = dataptr[0] + dataptr[7]; tmp7 = vec_sub(row0, row7); // tmp7 = dataptr[0] - dataptr[7]; tmp3 = vec_add(row3, row4); // tmp3 = dataptr[3] + dataptr[4]; tmp4 = vec_sub(row3, row4); // tmp4 = dataptr[3] - dataptr[4]; tmp1 = vec_add(row1, row6); // tmp1 = dataptr[1] + dataptr[6]; tmp6 = vec_sub(row1, row6); // tmp6 = dataptr[1] - dataptr[6]; tmp2 = vec_add(row2, row5); // tmp2 = dataptr[2] + dataptr[5]; tmp5 = vec_sub(row2, row5); // tmp5 = dataptr[2] - dataptr[5]; tmp10 = vec_add(tmp0, tmp3); // tmp10 = tmp0 + tmp3; tmp13 = vec_sub(tmp0, tmp3); // tmp13 = tmp0 - tmp3; tmp11 = vec_add(tmp1, tmp2); // tmp11 = tmp1 + tmp2; tmp12 = vec_sub(tmp1, tmp2); // tmp12 = tmp1 - tmp2; // dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS); row0 = vec_add(tmp10, tmp11); // dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS); row4 = vec_sub(tmp10, tmp11); // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); z1 = vec_madd(vec_add(tmp12, tmp13), vec_0_541196100, (vector float)zero); // dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865), // CONST_BITS-PASS1_BITS); row2 = vec_madd(tmp13, vec_0_765366865, z1); // dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065), // CONST_BITS-PASS1_BITS); row6 = vec_madd(tmp12, vec_1_847759065, z1); z1 = vec_add(tmp4, tmp7); // z1 = tmp4 + tmp7; z2 = vec_add(tmp5, tmp6); // z2 = tmp5 + tmp6; z3 = vec_add(tmp4, tmp6); // z3 = tmp4 + tmp6; z4 = vec_add(tmp5, tmp7); // z4 = tmp5 + tmp7; // z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ z5 = vec_madd(vec_add(z3, z4), vec_1_175875602, (vector float)zero); // z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ z3 = vec_madd(z3, vec_1_961570560, z5); // z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ z4 = vec_madd(z4, vec_0_390180644, z5); // The following adds are rolled into the multiplies above // z3 = vec_add(z3, z5); // z3 += z5; // z4 = vec_add(z4, z5); // z4 += z5; // z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ // Wow! It's actually more efficient to roll this multiply // into the adds below, even thought the multiply gets done twice! // z2 = vec_madd(z2, vec_2_562915447, (vector float)zero); // z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ // Same with this one... 
// z1 = vec_madd(z1, vec_0_899976223, (vector float)zero); // tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ // dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); row7 = vec_madd(tmp4, vec_0_298631336, vec_madd(z1, vec_0_899976223, z3)); // tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ // dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); row5 = vec_madd(tmp5, vec_2_053119869, vec_madd(z2, vec_2_562915447, z4)); // tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ // dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); row3 = vec_madd(tmp6, vec_3_072711026, vec_madd(z2, vec_2_562915447, z3)); // tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ // dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); row1 = vec_madd(z1, vec_0_899976223, vec_madd(tmp7, vec_1_501321110, z4)); // Swap the row values with the alts. If this is the first half, // this sets up the low values to be acted on in the second half. // If this is the second half, it puts the high values back in // the row values where they are expected to be when we're done. SWAP(row0, alt0); SWAP(row1, alt1); SWAP(row2, alt2); SWAP(row3, alt3); SWAP(row4, alt4); SWAP(row5, alt5); SWAP(row6, alt6); SWAP(row7, alt7); } if (whichPass == 1) { // transpose the data for the second pass // First, block transpose the upper right with lower left. SWAP(row4, alt0); SWAP(row5, alt1); SWAP(row6, alt2); SWAP(row7, alt3); // Now, transpose each block of four TRANSPOSE4(row0, row1, row2, row3); TRANSPOSE4(row4, row5, row6, row7); TRANSPOSE4(alt0, alt1, alt2, alt3); TRANSPOSE4(alt4, alt5, alt6, alt7); } } } // perform the quantize step, using the floating point data // still in the row/alt registers { const int* biasAddr; const vector signed int* qmat; vector float bias, negBias; if (s->mb_intra) { vector signed int baseVector; // We must cache element 0 in the intra case // (it needs special handling). baseVector = vec_cts(vec_splat(row0, 0), 0); vec_ste(baseVector, 0, &oldBaseValue); qmat = (vector signed int*)s->q_intra_matrix[qscale]; biasAddr = &(s->intra_quant_bias); } else { qmat = (vector signed int*)s->q_inter_matrix[qscale]; biasAddr = &(s->inter_quant_bias); } // Load the bias vector (We add 0.5 to the bias so that we're // rounding when we convert to int, instead of flooring.) 
{ vector signed int biasInt; const vector float negOneFloat = (vector float)FOUROF(-1.0f); LOAD4(biasInt, biasAddr); bias = vec_ctf(biasInt, QUANT_BIAS_SHIFT); negBias = vec_madd(bias, negOneFloat, zero); } { vector float q0, q1, q2, q3, q4, q5, q6, q7; q0 = vec_ctf(qmat[0], QMAT_SHIFT); q1 = vec_ctf(qmat[2], QMAT_SHIFT); q2 = vec_ctf(qmat[4], QMAT_SHIFT); q3 = vec_ctf(qmat[6], QMAT_SHIFT); q4 = vec_ctf(qmat[8], QMAT_SHIFT); q5 = vec_ctf(qmat[10], QMAT_SHIFT); q6 = vec_ctf(qmat[12], QMAT_SHIFT); q7 = vec_ctf(qmat[14], QMAT_SHIFT); row0 = vec_sel(vec_madd(row0, q0, negBias), vec_madd(row0, q0, bias), vec_cmpgt(row0, zero)); row1 = vec_sel(vec_madd(row1, q1, negBias), vec_madd(row1, q1, bias), vec_cmpgt(row1, zero)); row2 = vec_sel(vec_madd(row2, q2, negBias), vec_madd(row2, q2, bias), vec_cmpgt(row2, zero)); row3 = vec_sel(vec_madd(row3, q3, negBias), vec_madd(row3, q3, bias), vec_cmpgt(row3, zero)); row4 = vec_sel(vec_madd(row4, q4, negBias), vec_madd(row4, q4, bias), vec_cmpgt(row4, zero)); row5 = vec_sel(vec_madd(row5, q5, negBias), vec_madd(row5, q5, bias), vec_cmpgt(row5, zero)); row6 = vec_sel(vec_madd(row6, q6, negBias), vec_madd(row6, q6, bias), vec_cmpgt(row6, zero)); row7 = vec_sel(vec_madd(row7, q7, negBias), vec_madd(row7, q7, bias), vec_cmpgt(row7, zero)); q0 = vec_ctf(qmat[1], QMAT_SHIFT); q1 = vec_ctf(qmat[3], QMAT_SHIFT); q2 = vec_ctf(qmat[5], QMAT_SHIFT); q3 = vec_ctf(qmat[7], QMAT_SHIFT); q4 = vec_ctf(qmat[9], QMAT_SHIFT); q5 = vec_ctf(qmat[11], QMAT_SHIFT); q6 = vec_ctf(qmat[13], QMAT_SHIFT); q7 = vec_ctf(qmat[15], QMAT_SHIFT); alt0 = vec_sel(vec_madd(alt0, q0, negBias), vec_madd(alt0, q0, bias), vec_cmpgt(alt0, zero)); alt1 = vec_sel(vec_madd(alt1, q1, negBias), vec_madd(alt1, q1, bias), vec_cmpgt(alt1, zero)); alt2 = vec_sel(vec_madd(alt2, q2, negBias), vec_madd(alt2, q2, bias), vec_cmpgt(alt2, zero)); alt3 = vec_sel(vec_madd(alt3, q3, negBias), vec_madd(alt3, q3, bias), vec_cmpgt(alt3, zero)); alt4 = vec_sel(vec_madd(alt4, q4, negBias), vec_madd(alt4, q4, bias), vec_cmpgt(alt4, zero)); alt5 = vec_sel(vec_madd(alt5, q5, negBias), vec_madd(alt5, q5, bias), vec_cmpgt(alt5, zero)); alt6 = vec_sel(vec_madd(alt6, q6, negBias), vec_madd(alt6, q6, bias), vec_cmpgt(alt6, zero)); alt7 = vec_sel(vec_madd(alt7, q7, negBias), vec_madd(alt7, q7, bias), vec_cmpgt(alt7, zero)); } } // Store the data back into the original block { vector signed short data0, data1, data2, data3, data4, data5, data6, data7; data0 = vec_pack(vec_cts(row0, 0), vec_cts(alt0, 0)); data1 = vec_pack(vec_cts(row1, 0), vec_cts(alt1, 0)); data2 = vec_pack(vec_cts(row2, 0), vec_cts(alt2, 0)); data3 = vec_pack(vec_cts(row3, 0), vec_cts(alt3, 0)); data4 = vec_pack(vec_cts(row4, 0), vec_cts(alt4, 0)); data5 = vec_pack(vec_cts(row5, 0), vec_cts(alt5, 0)); data6 = vec_pack(vec_cts(row6, 0), vec_cts(alt6, 0)); data7 = vec_pack(vec_cts(row7, 0), vec_cts(alt7, 0)); { // Clamp for overflow vector signed int max_q_int, min_q_int; vector signed short max_q, min_q; LOAD4(max_q_int, &(s->max_qcoeff)); LOAD4(min_q_int, &(s->min_qcoeff)); max_q = vec_pack(max_q_int, max_q_int); min_q = vec_pack(min_q_int, min_q_int); data0 = vec_max(vec_min(data0, max_q), min_q); data1 = vec_max(vec_min(data1, max_q), min_q); data2 = vec_max(vec_min(data2, max_q), min_q); data4 = vec_max(vec_min(data4, max_q), min_q); data5 = vec_max(vec_min(data5, max_q), min_q); data6 = vec_max(vec_min(data6, max_q), min_q); data7 = vec_max(vec_min(data7, max_q), min_q); } { vector bool char zero_01, zero_23, zero_45, zero_67; vector signed char 
scanIndexes_01, scanIndexes_23, scanIndexes_45, scanIndexes_67; vector signed char negOne = vec_splat_s8(-1); vector signed char* scanPtr = (vector signed char*)(s->intra_scantable.inverse); signed char lastNonZeroChar; // Determine the largest non-zero index. zero_01 = vec_pack(vec_cmpeq(data0, (vector signed short)zero), vec_cmpeq(data1, (vector signed short)zero)); zero_23 = vec_pack(vec_cmpeq(data2, (vector signed short)zero), vec_cmpeq(data3, (vector signed short)zero)); zero_45 = vec_pack(vec_cmpeq(data4, (vector signed short)zero), vec_cmpeq(data5, (vector signed short)zero)); zero_67 = vec_pack(vec_cmpeq(data6, (vector signed short)zero), vec_cmpeq(data7, (vector signed short)zero)); // 64 biggest values scanIndexes_01 = vec_sel(scanPtr[0], negOne, zero_01); scanIndexes_23 = vec_sel(scanPtr[1], negOne, zero_23); scanIndexes_45 = vec_sel(scanPtr[2], negOne, zero_45); scanIndexes_67 = vec_sel(scanPtr[3], negOne, zero_67); // 32 largest values scanIndexes_01 = vec_max(scanIndexes_01, scanIndexes_23); scanIndexes_45 = vec_max(scanIndexes_45, scanIndexes_67); // 16 largest values scanIndexes_01 = vec_max(scanIndexes_01, scanIndexes_45); // 8 largest values scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne), vec_mergel(scanIndexes_01, negOne)); // 4 largest values scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne), vec_mergel(scanIndexes_01, negOne)); // 2 largest values scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne), vec_mergel(scanIndexes_01, negOne)); // largest value scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne), vec_mergel(scanIndexes_01, negOne)); scanIndexes_01 = vec_splat(scanIndexes_01, 0); vec_ste(scanIndexes_01, 0, &lastNonZeroChar); lastNonZero = lastNonZeroChar; // While the data is still in vectors we check for the transpose IDCT permute // and handle it using the vector unit if we can. This is the permute used // by the altivec idct, so it is common when using the altivec dct. if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM)) { TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7); } vec_st(data0, 0, data); vec_st(data1, 16, data); vec_st(data2, 32, data); vec_st(data3, 48, data); vec_st(data4, 64, data); vec_st(data5, 80, data); vec_st(data6, 96, data); vec_st(data7, 112, data); } } // special handling of block[0] if (s->mb_intra) { if (!s->h263_aic) { if (n < 4) oldBaseValue /= s->y_dc_scale; else oldBaseValue /= s->c_dc_scale; } // Divide by 8, rounding the result data[0] = (oldBaseValue + 4) >> 3; } // We handled the transpose permutation above and we don't // need to permute the "no" permutation case. if ((lastNonZero > 0) && (s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) && (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)) { ff_block_permute(data, s->dsp.idct_permutation, s->intra_scantable.scantable, lastNonZero); } return lastNonZero; }
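dct_quantize_altivec() finds lastNonZero with a log2-step horizontal maximum: it repeatedly interleaves the scan-index vector with a floor value of -1 and takes vec_max(), halving the number of live candidates each round until element 0 holds the overall maximum. A minimal standalone sketch of that idiom follows (the helper hmax_s8 and its variable names are hypothetical, not part of the function above):

#include <altivec.h>

/* Horizontal max of 16 signed chars via four mergeh/mergel + vec_max rounds.
 * The -1 floor is only correct when the true maximum is >= -1, which holds
 * for the scan indexes above (-1 for "zero coefficient", 0..63 otherwise). */
signed char hmax_s8(vector signed char v)
{
    vector signed char floor_v = vec_splat_s8(-1);
    signed char out;

    v = vec_max(vec_mergeh(v, floor_v), vec_mergel(v, floor_v)); /* 16 -> 8 candidates */
    v = vec_max(vec_mergeh(v, floor_v), vec_mergel(v, floor_v)); /*  8 -> 4 */
    v = vec_max(vec_mergeh(v, floor_v), vec_mergel(v, floor_v)); /*  4 -> 2 */
    v = vec_max(vec_mergeh(v, floor_v), vec_mergel(v, floor_v)); /*  2 -> 1, winner in element 0 */

    v = vec_splat(v, 0);   /* broadcast so vec_ste() stores the winner regardless of alignment */
    vec_ste(v, 0, &out);
    return out;
}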
/* vec_min() overload mixing vector unsigned int with vector bool int; the result keeps the unsigned element type. */ vector unsigned int test8_min (vector unsigned int x, vector bool int y) { return vec_min (x, y); }
/* Plain signed-int overload of vec_min(). */ vector signed int test3_min (vector signed int x, vector signed int y) { return vec_min (x, y); }
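One way the unsigned/bool mix in test8_min gets used: a vector bool int lane is either 0 or 0xFFFFFFFF, so vec_min(x, mask) keeps x where the mask is true and zeroes it where it is false, the same effect as vec_and(x, mask). A small sketch of that trick (the function name keep_if_greater is hypothetical):

#include <altivec.h>

vector unsigned int keep_if_greater(vector unsigned int x,
                                    vector unsigned int threshold)
{
    /* all-ones where x > threshold, zero elsewhere */
    vector bool int gt = vec_cmpgt(x, threshold);

    /* min(x, 0xFFFFFFFF) == x, min(x, 0) == 0 */
    return vec_min(x, gt);
}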
void iquant_intra_m1_altivec(IQUANT_INTRA_PDECL) { int i; vector signed short vsrc; uint16_t *qmat; vector unsigned short vqmat; vector unsigned short vmquant; vector bool short eqzero, ltzero; vector signed short val, t0; vector signed short zero, one; vector unsigned int four; vector signed short min, max; int offset, offset2; int16_t dst0; union { vector unsigned short vu16; unsigned short mquant; vector signed int vs32; struct { signed int pad[3]; signed int sum; } s; } vu; #ifdef ALTIVEC_DST DataStreamControl dsc; #endif #ifdef ALTIVEC_VERIFY /* {{{ */ if (NOT_VECTOR_ALIGNED(wsp->intra_q_mat)) mjpeg_error_exit1("iquant_intra_m1: wsp->intra_q_mat %% 16 != 0, (%d)", wsp->intra_q_mat); if (NOT_VECTOR_ALIGNED(src)) mjpeg_error_exit1("iquant_intra_m1: src %% 16 != 0, (%d)", src); if (NOT_VECTOR_ALIGNED(dst)) mjpeg_error_exit1("iquant_intra_m1: dst %% 16 != 0, (%d)", dst); for (i = 0; i < 64; i++) if (src[i] < -256 || src[i] > 255) mjpeg_error_exit1("iquant_intra_m2: -256 > src[%i] > 255, (%d)", i, src[i]); #endif /* }}} */ AMBER_START; dst0 = src[0] << (3 - dc_prec); qmat = (uint16_t*)wsp->intra_q_mat; #ifdef ALTIVEC_DST dsc.control = DATA_STREAM_CONTROL(64/8,1,0); vec_dst(src, dsc.control, 0); vec_dst(qmat, dsc.control, 1); #endif /* vmquant = (vector unsigned short)(mquant); */ vu.mquant = (unsigned short)mquant; vmquant = vec_splat(vu.vu16, 0); zero = vec_splat_s16(0); one = vec_splat_s16(1); four = vec_splat_u32(4); /* max = (2047); min = (-2048); {{{ */ vu8(max) = vec_splat_u8(0x7); t0 = vec_splat_s16(-1); /* 0xffff */ vu8(max) = vec_mergeh(vu8(max), vu8(t0)); /* 0x07ff == 2047 */ min = vec_sub(t0, max); /* }}} */ offset = 0; #if 1 vsrc = vec_ld(offset, (signed short*)src); vqmat = vec_ld(offset, (unsigned short*)qmat); i = (64/8) - 1; do { /* intra_q[i] * mquant */ vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant)); /* save sign */ ltzero = vec_cmplt(vsrc, zero); eqzero = vec_cmpeq(vsrc, zero); /* val = abs(src) */ t0 = vec_sub(zero, vsrc); val = vec_max(t0, vsrc); /* val = (src * quant) >> 4 */ vs32(t0) = vec_mule(val, vs16(vqmat)); vs32(val) = vec_mulo(val, vs16(vqmat)); vs32(t0) = vec_sra(vs32(t0), four); vs16(t0) = vec_pack(vs32(t0), vs32(t0)); vs32(val) = vec_sra(vs32(val), four); vs16(val) = vec_pack(vs32(val), vs32(val)); val = vec_mergeh(vs16(t0), vs16(val)); offset2 = offset; offset += 8*sizeof(int16_t); vsrc = vec_ld(offset, (signed short*)src); vqmat = vec_ld(offset, (unsigned short*)qmat); /* val = val - 1&~(val|val==0) */ t0 = vec_or(val, eqzero); t0 = vec_andc(one, t0); val = vec_sub(val, t0); /* restore sign */ t0 = vec_sub(zero, val); val = vec_sel(val, t0, ltzero); /* val = (val > 2047) ? ((val < -2048) ? -2048 : val); */ val = vec_min(val, max); val = vec_max(val, min); vec_st(val, offset2, dst); } while (--i); /* intra_q[i] * mquant */ vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant)); /* save sign */ ltzero = vec_cmplt(vsrc, zero); eqzero = vec_cmpeq(vsrc, zero); /* val = abs(src) */ t0 = vec_sub(zero, vsrc); val = vec_max(t0, vsrc); /* val = (src * quant) >> 4 */ vs32(t0) = vec_mule(val, vs16(vqmat)); vs32(val) = vec_mulo(val, vs16(vqmat)); vs32(t0) = vec_sra(vs32(t0), four); vs16(t0) = vec_pack(vs32(t0), vs32(t0)); vs32(val) = vec_sra(vs32(val), four); vs16(val) = vec_pack(vs32(val), vs32(val)); val = vec_mergeh(vs16(t0), vs16(val)); /* val = val - 1&~(val|val==0) */ t0 = vec_or(val, eqzero); t0 = vec_andc(one, t0); val = vec_sub(val, t0); /* restore sign */ t0 = vec_sub(zero, val); val = vec_sel(val, t0, ltzero); /* val = (val > 2047) ? ((val < -2048) ? 
-2048 : val); */ val = vec_min(val, max); val = vec_max(val, min); vec_st(val, offset, dst); #else /* {{{ */ i = (64/8); do { vsrc = vec_ld(offset, (signed short*)src); vqmat = vec_ld(offset, (unsigned short*)qmat); /* intra_q[i] * mquant */ vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant)); /* save sign */ ltzero = vec_cmplt(vsrc, zero); eqzero = vec_cmpeq(vsrc, zero); /* val = abs(src) */ t0 = vec_sub(zero, vsrc); val = vec_max(t0, vsrc); /* val = (src * quant) >> 4 */ vs32(t0) = vec_mule(val, vs16(vqmat)); vs32(val) = vec_mulo(val, vs16(vqmat)); vs32(t0) = vec_sra(vs32(t0), four); vs16(t0) = vec_pack(vs32(t0), vs32(t0)); vs32(val) = vec_sra(vs32(val), four); vs16(val) = vec_pack(vs32(val), vs32(val)); val = vec_mergeh(vs16(t0), vs16(val)); /* val = val - 1&~(val|val==0) */ t0 = vec_or(val, eqzero); t0 = vec_andc(one, t0); val = vec_sub(val, t0); /* restore sign */ t0 = vec_sub(zero, val); val = vec_sel(val, t0, ltzero); /* val = (val > 2047) ? ((val < -2048) ? -2048 : val); */ val = vec_min(val, max); val = vec_max(val, min); vec_st(val, offset, dst); offset += 8*sizeof(int16_t); } while (--i); /* }}} */ #endif dst[0] = dst0; AMBER_STOP; }
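The commented C expression in iquant_intra_m1_altivec() is the usual MPEG coefficient saturation to [-2048, 2047], implemented branchlessly with one vec_min() and one vec_max(). A minimal sketch of just that step, with the constants built via vec_splats() instead of the splat/merge trick the function uses (clamp_coeff is a hypothetical helper; both constant constructions yield the same vectors):

#include <altivec.h>

vector signed short clamp_coeff(vector signed short val)
{
    const vector signed short vmax = vec_splats((signed short) 2047);
    const vector signed short vmin = vec_splats((signed short)-2048);

    val = vec_min(val, vmax);  /* val = (val >  2047) ?  2047 : val */
    val = vec_max(val, vmin);  /* val = (val < -2048) ? -2048 : val */
    return val;
}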
void test1() { // CHECK-LABEL: define void @test1 res_vd = vec_add(vd, vd); // CHECK: fadd <2 x double> res_vd = vec_and(vbll, vd); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> res_vd = vec_and(vd, vbll); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> res_vd = vec_and(vd, vd); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_andc(vbll, vd); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_andc(vd, vbll); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_andc(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_ceil(vd); // CHECK: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}}) res_vf = vec_ceil(vf); // CHECK: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}}) res_vbll = vec_cmpeq(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpeq(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmpge(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpge(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmpgt(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpgt(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmple(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmple(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmplt(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmplt(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) /* vec_div */ res_vf = vec_div(vf, vf); // CHECK: @llvm.ppc.vsx.xvdivsp res_vd = vec_div(vd, vd); // CHECK: @llvm.ppc.vsx.xvdivdp /* vec_max */ res_vf = vec_max(vf, vf); // CHECK: @llvm.ppc.vsx.xvmaxsp res_vd = vec_max(vd, vd); // CHECK: @llvm.ppc.vsx.xvmaxdp res_vf = vec_vmaxfp(vf, vf); // CHECK: @llvm.ppc.vsx.xvmaxsp /* vec_min */ res_vf = vec_min(vf, vf); // CHECK: @llvm.ppc.vsx.xvminsp res_vd = vec_min(vd, vd); // CHECK: @llvm.ppc.vsx.xvmindp res_vf = vec_vminfp(vf, vf); // CHECK: @llvm.ppc.vsx.xvminsp res_d = __builtin_vsx_xsmaxdp(d, d); // CHECK: @llvm.ppc.vsx.xsmaxdp res_d = __builtin_vsx_xsmindp(d, d); // CHECK: @llvm.ppc.vsx.xsmindp /* vec_perm */ res_vsll = vec_perm(vsll, vsll, vuc); // CHECK: @llvm.ppc.altivec.vperm res_vull = vec_perm(vull, vull, vuc); // CHECK: @llvm.ppc.altivec.vperm res_vd = vec_perm(vd, vd, vuc); 
// CHECK: @llvm.ppc.altivec.vperm res_vsll = vec_vperm(vsll, vsll, vuc); // CHECK: @llvm.ppc.altivec.vperm res_vull = vec_vperm(vull, vull, vuc); // CHECK: @llvm.ppc.altivec.vperm res_vd = vec_vperm(vd, vd, vuc); // CHECK: @llvm.ppc.altivec.vperm /* vec_vsx_ld */ res_vsi = vec_vsx_ld(0, &vsi); // CHECK: @llvm.ppc.vsx.lxvw4x res_vui = vec_vsx_ld(0, &vui); // CHECK: @llvm.ppc.vsx.lxvw4x res_vf = vec_vsx_ld (0, &vf); // CHECK: @llvm.ppc.vsx.lxvw4x res_vsll = vec_vsx_ld(0, &vsll); // CHECK: @llvm.ppc.vsx.lxvd2x res_vull = vec_vsx_ld(0, &vull); // CHECK: @llvm.ppc.vsx.lxvd2x res_vd = vec_vsx_ld(0, &vd); // CHECK: @llvm.ppc.vsx.lxvd2x /* vec_vsx_st */ vec_vsx_st(vsi, 0, &res_vsi); // CHECK: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vui, 0, &res_vui); // CHECK: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vf, 0, &res_vf); // CHECK: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsll, 0, &res_vsll); // CHECK: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vull, 0, &res_vull); // CHECK: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vd, 0, &res_vd); // CHECK: @llvm.ppc.vsx.stxvd2x /* vec_and */ res_vsll = vec_and(vsll, vsll); // CHECK: and <2 x i64> res_vsll = vec_and(vbll, vsll); // CHECK: and <2 x i64> res_vsll = vec_and(vsll, vbll); // CHECK: and <2 x i64> res_vull = vec_and(vull, vull); // CHECK: and <2 x i64> res_vull = vec_and(vbll, vull); // CHECK: and <2 x i64> res_vull = vec_and(vull, vbll); // CHECK: and <2 x i64> res_vbll = vec_and(vbll, vbll); // CHECK: and <2 x i64> /* vec_vand */ res_vsll = vec_vand(vsll, vsll); // CHECK: and <2 x i64> res_vsll = vec_vand(vbll, vsll); // CHECK: and <2 x i64> res_vsll = vec_vand(vsll, vbll); // CHECK: and <2 x i64> res_vull = vec_vand(vull, vull); // CHECK: and <2 x i64> res_vull = vec_vand(vbll, vull); // CHECK: and <2 x i64> res_vull = vec_vand(vull, vbll); // CHECK: and <2 x i64> res_vbll = vec_vand(vbll, vbll); // CHECK: and <2 x i64> /* vec_andc */ res_vsll = vec_andc(vsll, vsll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vsll = vec_andc(vbll, vsll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vsll = vec_andc(vsll, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vull = vec_andc(vull, vull); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vull = vec_andc(vbll, vull); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vull = vec_andc(vull, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vbll = vec_andc(vbll, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vf = vec_floor(vf); // CHECK: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_floor(vd); // CHECK: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_madd(vf, vf, vf); // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) res_vd = vec_madd(vd, vd, vd); // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) res_vf = vec_msub(vf, vf, vf); // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> res_vd = vec_msub(vd, vd, vd); // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> res_vf = vec_mul(vf, vf); // CHECK: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}} res_vd = vec_mul(vd, vd); // CHECK: fmul <2 x double> 
%{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_nearbyint(vf); // CHECK: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_nearbyint(vd); // CHECK: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_nmadd(vf, vf, vf); // CHECK: [[FM:[0-9]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]] res_vd = vec_nmadd(vd, vd, vd); // CHECK: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] res_vf = vec_nmsub(vf, vf, vf); // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} res_vd = vec_nmsub(vd, vd, vd); // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] /* vec_nor */ res_vsll = vec_nor(vsll, vsll); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> res_vull = vec_nor(vull, vull); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> res_vull = vec_nor(vbll, vbll); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> res_vd = vec_nor(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1> /* vec_or */ res_vsll = vec_or(vsll, vsll); // CHECK: or <2 x i64> res_vsll = vec_or(vbll, vsll); // CHECK: or <2 x i64> res_vsll = vec_or(vsll, vbll); // CHECK: or <2 x i64> res_vull = vec_or(vull, vull); // CHECK: or <2 x i64> res_vull = vec_or(vbll, vull); // CHECK: or <2 x i64> res_vull = vec_or(vull, vbll); // CHECK: or <2 x i64> res_vbll = vec_or(vbll, vbll); // CHECK: or <2 x i64> res_vd = vec_or(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_rint(vf); // CHECK: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_rint(vd); // CHECK: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_rsqrte(vf); // CHECK: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}}) res_vd = vec_rsqrte(vd); // CHECK: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}}) dummy(); // CHECK: call void @dummy() res_vf = vec_sel(vd, vd, vbll); // CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> %{{[0-9]+}}, // CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: or <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_sel(vd, vd, vull); // CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> %{{[0-9]+}}, // CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: or <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> res_vf = vec_sqrt(vf); // CHECK: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}}) res_vd = 
vec_sqrt(vd); // CHECK: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}}) res_vd = vec_sub(vd, vd); // CHECK: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_trunc(vf); // CHECK: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_trunc(vd); // CHECK: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}}) /* vec_vor */ res_vsll = vec_vor(vsll, vsll); // CHECK: or <2 x i64> res_vsll = vec_vor(vbll, vsll); // CHECK: or <2 x i64> res_vsll = vec_vor(vsll, vbll); // CHECK: or <2 x i64> res_vull = vec_vor(vull, vull); // CHECK: or <2 x i64> res_vull = vec_vor(vbll, vull); // CHECK: or <2 x i64> res_vull = vec_vor(vull, vbll); // CHECK: or <2 x i64> res_vbll = vec_vor(vbll, vbll); // CHECK: or <2 x i64> /* vec_xor */ res_vsll = vec_xor(vsll, vsll); // CHECK: xor <2 x i64> res_vsll = vec_xor(vbll, vsll); // CHECK: xor <2 x i64> res_vsll = vec_xor(vsll, vbll); // CHECK: xor <2 x i64> res_vull = vec_xor(vull, vull); // CHECK: xor <2 x i64> res_vull = vec_xor(vbll, vull); // CHECK: xor <2 x i64> res_vull = vec_xor(vull, vbll); // CHECK: xor <2 x i64> res_vbll = vec_xor(vbll, vbll); // CHECK: xor <2 x i64> dummy(); // CHECK: call void @dummy() res_vd = vec_xor(vd, vd); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_xor(vd, vbll); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_xor(vbll, vd); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> /* vec_vxor */ res_vsll = vec_vxor(vsll, vsll); // CHECK: xor <2 x i64> res_vsll = vec_vxor(vbll, vsll); // CHECK: xor <2 x i64> res_vsll = vec_vxor(vsll, vbll); // CHECK: xor <2 x i64> res_vull = vec_vxor(vull, vull); // CHECK: xor <2 x i64> res_vull = vec_vxor(vbll, vull); // CHECK: xor <2 x i64> res_vull = vec_vxor(vull, vbll); // CHECK: xor <2 x i64> res_vbll = vec_vxor(vbll, vbll); // CHECK: xor <2 x i64> }
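The CHECK lines for the logical ops on vector double in test1 all go through a bitcast to <2 x i64>, the integer operation, and a bitcast back. A small sketch of that pattern in use (vabs_f64 is a hypothetical helper, assuming VSX so that vec_splats() and the long long vector types are available): clearing the sign bit with vec_andc() gives a branchless absolute value; where a dedicated vec_abs() overload for vector double is available it compiles to xvabsdp directly.

#include <altivec.h>

vector double vabs_f64(vector double x)
{
    /* Only the sign bit set in each 64-bit lane. */
    const vector unsigned long long signbit =
        vec_splats(0x8000000000000000ULL);

    /* x & ~signbit, emitted as the bitcast/xor/and/bitcast sequence
     * the CHECK lines above verify for vec_andc() on vector double. */
    return vec_andc(x, (vector bool long long) signbit);
}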