static inline vec_uint4 GENBX(vec_uint4 a, vec_uint4 b, vec_uint4 c) { return vec_and(vec_or(vec_cmpgt(a, b), vec_and(vec_cmpeq(a, b), c)), vec_splat_u32(1)); }
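/* A minimal scalar reference for one 32-bit lane of GENBX above (a
   hypothetical helper, not part of the original source): the result is 1
   when a > b, or when a == b and the incoming borrow/carry bit in c is set;
   otherwise 0. The vector version builds the same {0,1} lanes by OR-ing the
   two compare masks and AND-ing with vec_splat_u32(1). */
static inline unsigned int genbx_scalar_lane(unsigned int a, unsigned int b, unsigned int c)
{
    return ((a > b) || ((a == b) && (c & 1u))) ? 1u : 0u;
}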
// a > b void _SIMD_cmpgt_ps(__SIMD a, __SIMD b, void** resultPtr) { __SIMD* result = (__SIMD*)malloc(sizeof(__SIMD)); *resultPtr = result; #ifdef USE_SSE *result = _mm_cmpgt_ps(a,b); #elif defined USE_AVX *result = _mm256_cmp_ps(a,b,30); #elif defined USE_IBM *result = vec_cmpgt(a,b); #endif }
// a > b void _SIMD_cmpgt_pd(__SIMDd a, __SIMDd b, void** resultPtr) { __SIMDd* result = (__SIMDd*)malloc(sizeof(__SIMDd)); *resultPtr = result; #ifdef USE_SSE *result = _mm_cmpgt_pd(a,b); #elif defined USE_AVX *result = _mm256_cmp_pd(a,b,30); /* 30 == _CMP_GT_OQ */ #elif defined USE_IBM *result = vec_cmpgt(a,b); #endif }
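/* Hypothetical call site for the wrappers above, assuming __SIMD is the
   4-lane float vector type selected by the build flags (e.g. __m128 under
   USE_SSE). The wrapper heap-allocates the result, so the caller owns the
   returned mask and must free it. Sketch for illustration only. */
void example_cmpgt_usage(__SIMD a, __SIMD b)
{
    void *raw = NULL;
    _SIMD_cmpgt_ps(a, b, &raw);     /* *raw now holds the per-lane a > b mask */
    __SIMD *mask = (__SIMD *)raw;
    /* ... consume *mask: all-ones lanes where a > b, all-zeros elsewhere ... */
    free(mask);
}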
int main(int argc, char **argv) { int i; __vector float *vin = (__vector float *) in; __vector float *vout = (__vector float *) out; __vector float vin_negative; __vector unsigned int vpat; __vector float vzero = (__vector float) { 0.0f, 0.0f, 0.0f, 0.0f }; __vector float vminus = (__vector float) { -1.0f, -1.0f, -1.0f, -1.0f }; for (i = 0; i < SIZE/4; i++) { vpat = vec_cmpgt(vin[i], vzero); vin_negative = vec_madd(vin[i], vminus, vzero); vout[i] = vec_sel(vin_negative, vin[i], vpat); } for (i = 0; i < SIZE; i++) { printf("out[%02d]=%0.0f\n", i, out[i]); } return 0; }
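/* Scalar equivalent of the vectorized loop above (illustration only):
   vec_cmpgt(vin[i], vzero) builds the per-lane selection mask and vec_sel
   keeps the original value where it is positive, the negated value
   otherwise -- a branchless fabsf(). Hypothetical helper. */
static void abs_scalar_reference(const float *in, float *out, int n)
{
    int i;
    for (i = 0; i < n; i++)
        out[i] = (in[i] > 0.0f) ? in[i] : -in[i];
}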
static int dct_quantize_altivec(MpegEncContext* s, DCTELEM* data, int n, int qscale, int* overflow) { int lastNonZero; vector float row0, row1, row2, row3, row4, row5, row6, row7; vector float alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7; const vector float zero = (const vector float)FOUROF(0.); // used after quantize step int oldBaseValue = 0; // Load the data into the row/alt vectors { vector signed short data0, data1, data2, data3, data4, data5, data6, data7; data0 = vec_ld(0, data); data1 = vec_ld(16, data); data2 = vec_ld(32, data); data3 = vec_ld(48, data); data4 = vec_ld(64, data); data5 = vec_ld(80, data); data6 = vec_ld(96, data); data7 = vec_ld(112, data); // Transpose the data before we start TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7); // load the data into floating point vectors. We load // the high half of each row into the main row vectors // and the low half into the alt vectors. row0 = vec_ctf(vec_unpackh(data0), 0); alt0 = vec_ctf(vec_unpackl(data0), 0); row1 = vec_ctf(vec_unpackh(data1), 0); alt1 = vec_ctf(vec_unpackl(data1), 0); row2 = vec_ctf(vec_unpackh(data2), 0); alt2 = vec_ctf(vec_unpackl(data2), 0); row3 = vec_ctf(vec_unpackh(data3), 0); alt3 = vec_ctf(vec_unpackl(data3), 0); row4 = vec_ctf(vec_unpackh(data4), 0); alt4 = vec_ctf(vec_unpackl(data4), 0); row5 = vec_ctf(vec_unpackh(data5), 0); alt5 = vec_ctf(vec_unpackl(data5), 0); row6 = vec_ctf(vec_unpackh(data6), 0); alt6 = vec_ctf(vec_unpackl(data6), 0); row7 = vec_ctf(vec_unpackh(data7), 0); alt7 = vec_ctf(vec_unpackl(data7), 0); } // The following block could exist as a separate altivec dct // function. However, if we put it inline, the DCT data can remain // in the vector local variables, as floats, which we'll use during the // quantize step... 
{ const vector float vec_0_298631336 = (vector float)FOUROF(0.298631336f); const vector float vec_0_390180644 = (vector float)FOUROF(-0.390180644f); const vector float vec_0_541196100 = (vector float)FOUROF(0.541196100f); const vector float vec_0_765366865 = (vector float)FOUROF(0.765366865f); const vector float vec_0_899976223 = (vector float)FOUROF(-0.899976223f); const vector float vec_1_175875602 = (vector float)FOUROF(1.175875602f); const vector float vec_1_501321110 = (vector float)FOUROF(1.501321110f); const vector float vec_1_847759065 = (vector float)FOUROF(-1.847759065f); const vector float vec_1_961570560 = (vector float)FOUROF(-1.961570560f); const vector float vec_2_053119869 = (vector float)FOUROF(2.053119869f); const vector float vec_2_562915447 = (vector float)FOUROF(-2.562915447f); const vector float vec_3_072711026 = (vector float)FOUROF(3.072711026f); int whichPass, whichHalf; for(whichPass = 1; whichPass<=2; whichPass++) { for(whichHalf = 1; whichHalf<=2; whichHalf++) { vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; vector float tmp10, tmp11, tmp12, tmp13; vector float z1, z2, z3, z4, z5; tmp0 = vec_add(row0, row7); // tmp0 = dataptr[0] + dataptr[7]; tmp7 = vec_sub(row0, row7); // tmp7 = dataptr[0] - dataptr[7]; tmp3 = vec_add(row3, row4); // tmp3 = dataptr[3] + dataptr[4]; tmp4 = vec_sub(row3, row4); // tmp4 = dataptr[3] - dataptr[4]; tmp1 = vec_add(row1, row6); // tmp1 = dataptr[1] + dataptr[6]; tmp6 = vec_sub(row1, row6); // tmp6 = dataptr[1] - dataptr[6]; tmp2 = vec_add(row2, row5); // tmp2 = dataptr[2] + dataptr[5]; tmp5 = vec_sub(row2, row5); // tmp5 = dataptr[2] - dataptr[5]; tmp10 = vec_add(tmp0, tmp3); // tmp10 = tmp0 + tmp3; tmp13 = vec_sub(tmp0, tmp3); // tmp13 = tmp0 - tmp3; tmp11 = vec_add(tmp1, tmp2); // tmp11 = tmp1 + tmp2; tmp12 = vec_sub(tmp1, tmp2); // tmp12 = tmp1 - tmp2; // dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS); row0 = vec_add(tmp10, tmp11); // dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS); row4 = vec_sub(tmp10, tmp11); // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); z1 = vec_madd(vec_add(tmp12, tmp13), vec_0_541196100, (vector float)zero); // dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865), // CONST_BITS-PASS1_BITS); row2 = vec_madd(tmp13, vec_0_765366865, z1); // dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065), // CONST_BITS-PASS1_BITS); row6 = vec_madd(tmp12, vec_1_847759065, z1); z1 = vec_add(tmp4, tmp7); // z1 = tmp4 + tmp7; z2 = vec_add(tmp5, tmp6); // z2 = tmp5 + tmp6; z3 = vec_add(tmp4, tmp6); // z3 = tmp4 + tmp6; z4 = vec_add(tmp5, tmp7); // z4 = tmp5 + tmp7; // z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ z5 = vec_madd(vec_add(z3, z4), vec_1_175875602, (vector float)zero); // z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ z3 = vec_madd(z3, vec_1_961570560, z5); // z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ z4 = vec_madd(z4, vec_0_390180644, z5); // The following adds are rolled into the multiplies above // z3 = vec_add(z3, z5); // z3 += z5; // z4 = vec_add(z4, z5); // z4 += z5; // z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ // Wow! It's actually more efficient to roll this multiply // into the adds below, even though the multiply gets done twice! // z2 = vec_madd(z2, vec_2_562915447, (vector float)zero); // z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ // Same with this one... 
// z1 = vec_madd(z1, vec_0_899976223, (vector float)zero); // tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ // dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); row7 = vec_madd(tmp4, vec_0_298631336, vec_madd(z1, vec_0_899976223, z3)); // tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ // dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); row5 = vec_madd(tmp5, vec_2_053119869, vec_madd(z2, vec_2_562915447, z4)); // tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ // dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); row3 = vec_madd(tmp6, vec_3_072711026, vec_madd(z2, vec_2_562915447, z3)); // tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ // dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); row1 = vec_madd(z1, vec_0_899976223, vec_madd(tmp7, vec_1_501321110, z4)); // Swap the row values with the alts. If this is the first half, // this sets up the low values to be acted on in the second half. // If this is the second half, it puts the high values back in // the row values where they are expected to be when we're done. SWAP(row0, alt0); SWAP(row1, alt1); SWAP(row2, alt2); SWAP(row3, alt3); SWAP(row4, alt4); SWAP(row5, alt5); SWAP(row6, alt6); SWAP(row7, alt7); } if (whichPass == 1) { // transpose the data for the second pass // First, block transpose the upper right with lower left. SWAP(row4, alt0); SWAP(row5, alt1); SWAP(row6, alt2); SWAP(row7, alt3); // Now, transpose each block of four TRANSPOSE4(row0, row1, row2, row3); TRANSPOSE4(row4, row5, row6, row7); TRANSPOSE4(alt0, alt1, alt2, alt3); TRANSPOSE4(alt4, alt5, alt6, alt7); } } } // perform the quantize step, using the floating point data // still in the row/alt registers { const int* biasAddr; const vector signed int* qmat; vector float bias, negBias; if (s->mb_intra) { vector signed int baseVector; // We must cache element 0 in the intra case // (it needs special handling). baseVector = vec_cts(vec_splat(row0, 0), 0); vec_ste(baseVector, 0, &oldBaseValue); qmat = (vector signed int*)s->q_intra_matrix[qscale]; biasAddr = &(s->intra_quant_bias); } else { qmat = (vector signed int*)s->q_inter_matrix[qscale]; biasAddr = &(s->inter_quant_bias); } // Load the bias vector (We add 0.5 to the bias so that we're // rounding when we convert to int, instead of flooring.) 
{ vector signed int biasInt; const vector float negOneFloat = (vector float)FOUROF(-1.0f); LOAD4(biasInt, biasAddr); bias = vec_ctf(biasInt, QUANT_BIAS_SHIFT); negBias = vec_madd(bias, negOneFloat, zero); } { vector float q0, q1, q2, q3, q4, q5, q6, q7; q0 = vec_ctf(qmat[0], QMAT_SHIFT); q1 = vec_ctf(qmat[2], QMAT_SHIFT); q2 = vec_ctf(qmat[4], QMAT_SHIFT); q3 = vec_ctf(qmat[6], QMAT_SHIFT); q4 = vec_ctf(qmat[8], QMAT_SHIFT); q5 = vec_ctf(qmat[10], QMAT_SHIFT); q6 = vec_ctf(qmat[12], QMAT_SHIFT); q7 = vec_ctf(qmat[14], QMAT_SHIFT); row0 = vec_sel(vec_madd(row0, q0, negBias), vec_madd(row0, q0, bias), vec_cmpgt(row0, zero)); row1 = vec_sel(vec_madd(row1, q1, negBias), vec_madd(row1, q1, bias), vec_cmpgt(row1, zero)); row2 = vec_sel(vec_madd(row2, q2, negBias), vec_madd(row2, q2, bias), vec_cmpgt(row2, zero)); row3 = vec_sel(vec_madd(row3, q3, negBias), vec_madd(row3, q3, bias), vec_cmpgt(row3, zero)); row4 = vec_sel(vec_madd(row4, q4, negBias), vec_madd(row4, q4, bias), vec_cmpgt(row4, zero)); row5 = vec_sel(vec_madd(row5, q5, negBias), vec_madd(row5, q5, bias), vec_cmpgt(row5, zero)); row6 = vec_sel(vec_madd(row6, q6, negBias), vec_madd(row6, q6, bias), vec_cmpgt(row6, zero)); row7 = vec_sel(vec_madd(row7, q7, negBias), vec_madd(row7, q7, bias), vec_cmpgt(row7, zero)); q0 = vec_ctf(qmat[1], QMAT_SHIFT); q1 = vec_ctf(qmat[3], QMAT_SHIFT); q2 = vec_ctf(qmat[5], QMAT_SHIFT); q3 = vec_ctf(qmat[7], QMAT_SHIFT); q4 = vec_ctf(qmat[9], QMAT_SHIFT); q5 = vec_ctf(qmat[11], QMAT_SHIFT); q6 = vec_ctf(qmat[13], QMAT_SHIFT); q7 = vec_ctf(qmat[15], QMAT_SHIFT); alt0 = vec_sel(vec_madd(alt0, q0, negBias), vec_madd(alt0, q0, bias), vec_cmpgt(alt0, zero)); alt1 = vec_sel(vec_madd(alt1, q1, negBias), vec_madd(alt1, q1, bias), vec_cmpgt(alt1, zero)); alt2 = vec_sel(vec_madd(alt2, q2, negBias), vec_madd(alt2, q2, bias), vec_cmpgt(alt2, zero)); alt3 = vec_sel(vec_madd(alt3, q3, negBias), vec_madd(alt3, q3, bias), vec_cmpgt(alt3, zero)); alt4 = vec_sel(vec_madd(alt4, q4, negBias), vec_madd(alt4, q4, bias), vec_cmpgt(alt4, zero)); alt5 = vec_sel(vec_madd(alt5, q5, negBias), vec_madd(alt5, q5, bias), vec_cmpgt(alt5, zero)); alt6 = vec_sel(vec_madd(alt6, q6, negBias), vec_madd(alt6, q6, bias), vec_cmpgt(alt6, zero)); alt7 = vec_sel(vec_madd(alt7, q7, negBias), vec_madd(alt7, q7, bias), vec_cmpgt(alt7, zero)); } } // Store the data back into the original block { vector signed short data0, data1, data2, data3, data4, data5, data6, data7; data0 = vec_pack(vec_cts(row0, 0), vec_cts(alt0, 0)); data1 = vec_pack(vec_cts(row1, 0), vec_cts(alt1, 0)); data2 = vec_pack(vec_cts(row2, 0), vec_cts(alt2, 0)); data3 = vec_pack(vec_cts(row3, 0), vec_cts(alt3, 0)); data4 = vec_pack(vec_cts(row4, 0), vec_cts(alt4, 0)); data5 = vec_pack(vec_cts(row5, 0), vec_cts(alt5, 0)); data6 = vec_pack(vec_cts(row6, 0), vec_cts(alt6, 0)); data7 = vec_pack(vec_cts(row7, 0), vec_cts(alt7, 0)); { // Clamp for overflow vector signed int max_q_int, min_q_int; vector signed short max_q, min_q; LOAD4(max_q_int, &(s->max_qcoeff)); LOAD4(min_q_int, &(s->min_qcoeff)); max_q = vec_pack(max_q_int, max_q_int); min_q = vec_pack(min_q_int, min_q_int); data0 = vec_max(vec_min(data0, max_q), min_q); data1 = vec_max(vec_min(data1, max_q), min_q); data2 = vec_max(vec_min(data2, max_q), min_q); data3 = vec_max(vec_min(data3, max_q), min_q); data4 = vec_max(vec_min(data4, max_q), min_q); data5 = vec_max(vec_min(data5, max_q), min_q); data6 = vec_max(vec_min(data6, max_q), min_q); data7 = vec_max(vec_min(data7, max_q), min_q); } { vector bool char zero_01, zero_23, zero_45, zero_67; vector signed char 
scanIndexes_01, scanIndexes_23, scanIndexes_45, scanIndexes_67; vector signed char negOne = vec_splat_s8(-1); vector signed char* scanPtr = (vector signed char*)(s->intra_scantable.inverse); signed char lastNonZeroChar; // Determine the largest non-zero index. zero_01 = vec_pack(vec_cmpeq(data0, (vector signed short)zero), vec_cmpeq(data1, (vector signed short)zero)); zero_23 = vec_pack(vec_cmpeq(data2, (vector signed short)zero), vec_cmpeq(data3, (vector signed short)zero)); zero_45 = vec_pack(vec_cmpeq(data4, (vector signed short)zero), vec_cmpeq(data5, (vector signed short)zero)); zero_67 = vec_pack(vec_cmpeq(data6, (vector signed short)zero), vec_cmpeq(data7, (vector signed short)zero)); // 64 biggest values scanIndexes_01 = vec_sel(scanPtr[0], negOne, zero_01); scanIndexes_23 = vec_sel(scanPtr[1], negOne, zero_23); scanIndexes_45 = vec_sel(scanPtr[2], negOne, zero_45); scanIndexes_67 = vec_sel(scanPtr[3], negOne, zero_67); // 32 largest values scanIndexes_01 = vec_max(scanIndexes_01, scanIndexes_23); scanIndexes_45 = vec_max(scanIndexes_45, scanIndexes_67); // 16 largest values scanIndexes_01 = vec_max(scanIndexes_01, scanIndexes_45); // 8 largest values scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne), vec_mergel(scanIndexes_01, negOne)); // 4 largest values scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne), vec_mergel(scanIndexes_01, negOne)); // 2 largest values scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne), vec_mergel(scanIndexes_01, negOne)); // largest value scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne), vec_mergel(scanIndexes_01, negOne)); scanIndexes_01 = vec_splat(scanIndexes_01, 0); vec_ste(scanIndexes_01, 0, &lastNonZeroChar); lastNonZero = lastNonZeroChar; // While the data is still in vectors we check for the transpose IDCT permute // and handle it using the vector unit if we can. This is the permute used // by the altivec idct, so it is common when using the altivec dct. if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM)) { TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7); } vec_st(data0, 0, data); vec_st(data1, 16, data); vec_st(data2, 32, data); vec_st(data3, 48, data); vec_st(data4, 64, data); vec_st(data5, 80, data); vec_st(data6, 96, data); vec_st(data7, 112, data); } } // special handling of block[0] if (s->mb_intra) { if (!s->h263_aic) { if (n < 4) oldBaseValue /= s->y_dc_scale; else oldBaseValue /= s->c_dc_scale; } // Divide by 8, rounding the result data[0] = (oldBaseValue + 4) >> 3; } // We handled the transpose permutation above and we don't // need to permute the "no" permutation case. if ((lastNonZero > 0) && (s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) && (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)) { ff_block_permute(data, s->dsp.idct_permutation, s->intra_scantable.scantable, lastNonZero); } return lastNonZero; }
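/* Scalar sketch of the sign-aware bias select performed in the quantize
   step above with vec_sel(vec_madd(row, q, negBias), vec_madd(row, q, bias),
   vec_cmpgt(row, zero)): coefficients greater than zero get +bias, the rest
   get -bias, so the later float->int conversion rounds magnitudes
   consistently instead of flooring. Hypothetical helper, illustration only. */
static inline float quantize_one_coeff(float coeff, float q, float bias)
{
    return (coeff > 0.0f) ? (coeff * q + bias) : (coeff * q - bias);
}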
void test1() { // CHECK-LABEL: define void @test1 res_vd = vec_add(vd, vd); // CHECK: fadd <2 x double> res_vd = vec_and(vbll, vd); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> res_vd = vec_and(vd, vbll); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> res_vd = vec_and(vd, vd); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_andc(vbll, vd); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_andc(vd, vbll); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_andc(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_ceil(vd); // CHECK: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}}) res_vf = vec_ceil(vf); // CHECK: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}}) res_vbll = vec_cmpeq(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpeq(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmpge(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpge(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmpgt(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpgt(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmple(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmple(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmplt(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmplt(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) /* vec_div */ res_vf = vec_div(vf, vf); // CHECK: @llvm.ppc.vsx.xvdivsp res_vd = vec_div(vd, vd); // CHECK: @llvm.ppc.vsx.xvdivdp /* vec_max */ res_vf = vec_max(vf, vf); // CHECK: @llvm.ppc.vsx.xvmaxsp res_vd = vec_max(vd, vd); // CHECK: @llvm.ppc.vsx.xvmaxdp res_vf = vec_vmaxfp(vf, vf); // CHECK: @llvm.ppc.vsx.xvmaxsp /* vec_min */ res_vf = vec_min(vf, vf); // CHECK: @llvm.ppc.vsx.xvminsp res_vd = vec_min(vd, vd); // CHECK: @llvm.ppc.vsx.xvmindp res_vf = vec_vminfp(vf, vf); // CHECK: @llvm.ppc.vsx.xvminsp res_d = __builtin_vsx_xsmaxdp(d, d); // CHECK: @llvm.ppc.vsx.xsmaxdp res_d = __builtin_vsx_xsmindp(d, d); // CHECK: @llvm.ppc.vsx.xsmindp /* vec_perm */ res_vsll = vec_perm(vsll, vsll, vuc); // CHECK: @llvm.ppc.altivec.vperm res_vull = vec_perm(vull, vull, vuc); // CHECK: @llvm.ppc.altivec.vperm res_vd = vec_perm(vd, vd, vuc); 
// CHECK: @llvm.ppc.altivec.vperm res_vsll = vec_vperm(vsll, vsll, vuc); // CHECK: @llvm.ppc.altivec.vperm res_vull = vec_vperm(vull, vull, vuc); // CHECK: @llvm.ppc.altivec.vperm res_vd = vec_vperm(vd, vd, vuc); // CHECK: @llvm.ppc.altivec.vperm /* vec_vsx_ld */ res_vsi = vec_vsx_ld(0, &vsi); // CHECK: @llvm.ppc.vsx.lxvw4x res_vui = vec_vsx_ld(0, &vui); // CHECK: @llvm.ppc.vsx.lxvw4x res_vf = vec_vsx_ld (0, &vf); // CHECK: @llvm.ppc.vsx.lxvw4x res_vsll = vec_vsx_ld(0, &vsll); // CHECK: @llvm.ppc.vsx.lxvd2x res_vull = vec_vsx_ld(0, &vull); // CHECK: @llvm.ppc.vsx.lxvd2x res_vd = vec_vsx_ld(0, &vd); // CHECK: @llvm.ppc.vsx.lxvd2x /* vec_vsx_st */ vec_vsx_st(vsi, 0, &res_vsi); // CHECK: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vui, 0, &res_vui); // CHECK: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vf, 0, &res_vf); // CHECK: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsll, 0, &res_vsll); // CHECK: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vull, 0, &res_vull); // CHECK: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vd, 0, &res_vd); // CHECK: @llvm.ppc.vsx.stxvd2x /* vec_and */ res_vsll = vec_and(vsll, vsll); // CHECK: and <2 x i64> res_vsll = vec_and(vbll, vsll); // CHECK: and <2 x i64> res_vsll = vec_and(vsll, vbll); // CHECK: and <2 x i64> res_vull = vec_and(vull, vull); // CHECK: and <2 x i64> res_vull = vec_and(vbll, vull); // CHECK: and <2 x i64> res_vull = vec_and(vull, vbll); // CHECK: and <2 x i64> res_vbll = vec_and(vbll, vbll); // CHECK: and <2 x i64> /* vec_vand */ res_vsll = vec_vand(vsll, vsll); // CHECK: and <2 x i64> res_vsll = vec_vand(vbll, vsll); // CHECK: and <2 x i64> res_vsll = vec_vand(vsll, vbll); // CHECK: and <2 x i64> res_vull = vec_vand(vull, vull); // CHECK: and <2 x i64> res_vull = vec_vand(vbll, vull); // CHECK: and <2 x i64> res_vull = vec_vand(vull, vbll); // CHECK: and <2 x i64> res_vbll = vec_vand(vbll, vbll); // CHECK: and <2 x i64> /* vec_andc */ res_vsll = vec_andc(vsll, vsll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vsll = vec_andc(vbll, vsll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vsll = vec_andc(vsll, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vull = vec_andc(vull, vull); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vull = vec_andc(vbll, vull); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vull = vec_andc(vull, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vbll = vec_andc(vbll, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vf = vec_floor(vf); // CHECK: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_floor(vd); // CHECK: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_madd(vf, vf, vf); // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) res_vd = vec_madd(vd, vd, vd); // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) res_vf = vec_msub(vf, vf, vf); // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> res_vd = vec_msub(vd, vd, vd); // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> res_vf = vec_mul(vf, vf); // CHECK: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}} res_vd = vec_mul(vd, vd); // CHECK: fmul <2 x double> 
%{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_nearbyint(vf); // CHECK: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_nearbyint(vd); // CHECK: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_nmadd(vf, vf, vf); // CHECK: [[FM:[0-9]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]] res_vd = vec_nmadd(vd, vd, vd); // CHECK: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] res_vf = vec_nmsub(vf, vf, vf); // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} res_vd = vec_nmsub(vd, vd, vd); // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] /* vec_nor */ res_vsll = vec_nor(vsll, vsll); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> res_vull = vec_nor(vull, vull); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> res_vull = vec_nor(vbll, vbll); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> res_vd = vec_nor(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1> /* vec_or */ res_vsll = vec_or(vsll, vsll); // CHECK: or <2 x i64> res_vsll = vec_or(vbll, vsll); // CHECK: or <2 x i64> res_vsll = vec_or(vsll, vbll); // CHECK: or <2 x i64> res_vull = vec_or(vull, vull); // CHECK: or <2 x i64> res_vull = vec_or(vbll, vull); // CHECK: or <2 x i64> res_vull = vec_or(vull, vbll); // CHECK: or <2 x i64> res_vbll = vec_or(vbll, vbll); // CHECK: or <2 x i64> res_vd = vec_or(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_rint(vf); // CHECK: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_rint(vd); // CHECK: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_rsqrte(vf); // CHECK: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}}) res_vd = vec_rsqrte(vd); // CHECK: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}}) dummy(); // CHECK: call void @dummy() res_vf = vec_sel(vd, vd, vbll); // CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> %{{[0-9]+}}, // CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: or <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_sel(vd, vd, vull); // CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> %{{[0-9]+}}, // CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: or <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> res_vf = vec_sqrt(vf); // CHECK: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}}) res_vd = 
vec_sqrt(vd); // CHECK: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}}) res_vd = vec_sub(vd, vd); // CHECK: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_trunc(vf); // CHECK: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_trunc(vd); // CHECK: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}}) /* vec_vor */ res_vsll = vec_vor(vsll, vsll); // CHECK: or <2 x i64> res_vsll = vec_vor(vbll, vsll); // CHECK: or <2 x i64> res_vsll = vec_vor(vsll, vbll); // CHECK: or <2 x i64> res_vull = vec_vor(vull, vull); // CHECK: or <2 x i64> res_vull = vec_vor(vbll, vull); // CHECK: or <2 x i64> res_vull = vec_vor(vull, vbll); // CHECK: or <2 x i64> res_vbll = vec_vor(vbll, vbll); // CHECK: or <2 x i64> /* vec_xor */ res_vsll = vec_xor(vsll, vsll); // CHECK: xor <2 x i64> res_vsll = vec_xor(vbll, vsll); // CHECK: xor <2 x i64> res_vsll = vec_xor(vsll, vbll); // CHECK: xor <2 x i64> res_vull = vec_xor(vull, vull); // CHECK: xor <2 x i64> res_vull = vec_xor(vbll, vull); // CHECK: xor <2 x i64> res_vull = vec_xor(vull, vbll); // CHECK: xor <2 x i64> res_vbll = vec_xor(vbll, vbll); // CHECK: xor <2 x i64> dummy(); // CHECK: call void @dummy() res_vd = vec_xor(vd, vd); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_xor(vd, vbll); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_xor(vbll, vd); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> /* vec_vxor */ res_vsll = vec_vxor(vsll, vsll); // CHECK: xor <2 x i64> res_vsll = vec_vxor(vbll, vsll); // CHECK: xor <2 x i64> res_vsll = vec_vxor(vsll, vbll); // CHECK: xor <2 x i64> res_vull = vec_vxor(vull, vull); // CHECK: xor <2 x i64> res_vull = vec_vxor(vbll, vull); // CHECK: xor <2 x i64> res_vull = vec_vxor(vull, vbll); // CHECK: xor <2 x i64> res_vbll = vec_vxor(vbll, vbll); // CHECK: xor <2 x i64> }
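/* Scalar sketch of the bitwise select that the vec_sel CHECK lines above
   expand to: per 64-bit lane, result = (a & ~mask) | (b & mask), followed by
   a bitcast back to the floating-point type. Hypothetical helper shown for
   illustration only. */
static inline unsigned long long sel64(unsigned long long a,
                                       unsigned long long b,
                                       unsigned long long mask)
{
    return (a & ~mask) | (b & mask);   /* xor-with-(-1), two ands, one or */
}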
/* Function: p7_OptimalAccuracy() * Synopsis: DP fill of an optimal accuracy alignment calculation. * Incept: SRE, Mon Aug 18 11:04:48 2008 [Janelia] * * Purpose: Calculates the fill step of the optimal accuracy decoding * algorithm \citep{Kall05}. * * Caller provides the posterior decoding matrix <pp>, * which was calculated by Forward/Backward on a target sequence * of length <pp->L> using the query model <om>. * * Caller also provides a DP matrix <ox>, allocated for a full * <om->M> by <L> comparison. The routine fills this in * with OA scores. * * Args: om - query profile * pp - posterior decoding matrix created by <p7_GPosteriorDecoding()> * ox - RESULT: caller provided DP matrix for <om->M> by <L> * ret_e - RETURN: expected number of correctly decoded positions * * Returns: <eslOK> on success, and <*ret_e> contains the final OA * score, which is the expected number of correctly decoded * positions in the target sequence (up to <L>). * * Throws: (no abnormal error conditions) */ int p7_OptimalAccuracy(const P7_OPROFILE *om, const P7_OMX *pp, P7_OMX *ox, float *ret_e) { vector float mpv, dpv, ipv; /* previous row values */ vector float sv; /* temp storage of 1 curr row value in progress */ vector float xEv; /* E state: keeps max for Mk->E as we go */ vector float xBv; /* B state: splatted vector of B[i-1] for B->Mk calculations */ vector float dcv; float *xmx = ox->xmx; vector float *dpc = ox->dpf[0]; /* current row, for use in {MDI}MO(dpp,q) access macro */ vector float *dpp; /* previous row, for use in {MDI}MO(dpp,q) access macro */ vector float *ppp; /* quads in the <pp> posterior probability matrix */ vector float *tp; /* quads in the <om->tfv> transition scores */ vector float zerov; vector float infv; int M = om->M; int Q = p7O_NQF(M); int q; int j; int i; float t1, t2; zerov = (vector float) vec_splat_u32(0); infv = esl_vmx_set_float(-eslINFINITY); ox->M = om->M; ox->L = pp->L; for (q = 0; q < Q; q++) MMO(dpc, q) = IMO(dpc,q) = DMO(dpc,q) = infv; XMXo(0, p7X_E) = -eslINFINITY; XMXo(0, p7X_N) = 0.; XMXo(0, p7X_J) = -eslINFINITY; XMXo(0, p7X_B) = 0.; XMXo(0, p7X_C) = -eslINFINITY; for (i = 1; i <= pp->L; i++) { dpp = dpc; /* previous DP row in OA matrix */ dpc = ox->dpf[i]; /* current DP row in OA matrix */ ppp = pp->dpf[i]; /* current row in the posterior probabilities per position */ tp = om->tfv; /* transition probabilities */ dcv = infv; xEv = infv; xBv = esl_vmx_set_float(XMXo(i-1, p7X_B)); mpv = vec_sld(infv, MMO(dpp,Q-1), 12); /* Right shifts by 4 bytes. 4,8,12,x becomes x,4,8,12. 
*/ dpv = vec_sld(infv, DMO(dpp,Q-1), 12); ipv = vec_sld(infv, IMO(dpp,Q-1), 12); for (q = 0; q < Q; q++) { sv = vec_and(vec_cmpgt(*tp, zerov), xBv); tp++; sv = vec_max(sv, vec_and(vec_cmpgt(*tp, zerov), mpv)); tp++; sv = vec_max(sv, vec_and(vec_cmpgt(*tp, zerov), ipv)); tp++; sv = vec_max(sv, vec_and(vec_cmpgt(*tp, zerov), dpv)); tp++; sv = vec_add(sv, *ppp); ppp += 2; xEv = vec_max(xEv, sv); mpv = MMO(dpp,q); dpv = DMO(dpp,q); ipv = IMO(dpp,q); MMO(dpc,q) = sv; DMO(dpc,q) = dcv; dcv = vec_and(vec_cmpgt(*tp, zerov), sv); tp++; sv = vec_and(vec_cmpgt(*tp, zerov), mpv); tp++; sv = vec_max(sv, vec_and(vec_cmpgt(*tp, zerov), ipv)); tp++; IMO(dpc,q) = vec_add(sv, *ppp); ppp++; } /* dcv has carried through from end of q loop above; store it * in first pass, we add M->D and D->D path into DMX */ dcv = vec_sld(infv, dcv, 12); tp = om->tfv + 7*Q; /* set tp to start of the DD's */ for (q = 0; q < Q; q++) { DMO(dpc, q) = vec_max(dcv, DMO(dpc, q)); dcv = vec_and(vec_cmpgt(*tp, zerov), DMO(dpc,q)); tp++; } /* fully serialized D->D; can optimize later */ for (j = 1; j < 4; j++) { dcv = vec_sld(infv, dcv, 12); tp = om->tfv + 7*Q; for (q = 0; q < Q; q++) { DMO(dpc, q) = vec_max(dcv, DMO(dpc, q)); dcv = vec_and(vec_cmpgt(*tp, zerov), dcv); tp++; } } /* D->E paths */ for (q = 0; q < Q; q++) xEv = vec_max(xEv, DMO(dpc,q)); /* Specials */ XMXo(i,p7X_E) = esl_vmx_hmax_float(xEv); t1 = ( (om->xf[p7O_J][p7O_LOOP] == 0.0) ? 0.0 : ox->xmx[(i-1)*p7X_NXCELLS+p7X_J] + pp->xmx[i*p7X_NXCELLS+p7X_J]); t2 = ( (om->xf[p7O_E][p7O_LOOP] == 0.0) ? 0.0 : ox->xmx[ i *p7X_NXCELLS+p7X_E]); ox->xmx[i*p7X_NXCELLS+p7X_J] = ESL_MAX(t1, t2); t1 = ( (om->xf[p7O_C][p7O_LOOP] == 0.0) ? 0.0 : ox->xmx[(i-1)*p7X_NXCELLS+p7X_C] + pp->xmx[i*p7X_NXCELLS+p7X_C]); t2 = ( (om->xf[p7O_E][p7O_MOVE] == 0.0) ? 0.0 : ox->xmx[ i *p7X_NXCELLS+p7X_E]); ox->xmx[i*p7X_NXCELLS+p7X_C] = ESL_MAX(t1, t2); ox->xmx[i*p7X_NXCELLS+p7X_N] = ((om->xf[p7O_N][p7O_LOOP] == 0.0) ? 0.0 : ox->xmx[(i-1)*p7X_NXCELLS+p7X_N] + pp->xmx[i*p7X_NXCELLS+p7X_N]); t1 = ( (om->xf[p7O_N][p7O_MOVE] == 0.0) ? 0.0 : ox->xmx[i*p7X_NXCELLS+p7X_N]); t2 = ( (om->xf[p7O_J][p7O_MOVE] == 0.0) ? 0.0 : ox->xmx[i*p7X_NXCELLS+p7X_J]); ox->xmx[i*p7X_NXCELLS+p7X_B] = ESL_MAX(t1, t2); } *ret_e = ox->xmx[pp->L*p7X_NXCELLS+p7X_C]; return eslOK; }
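/* Scalar sketch of the masking idiom used throughout the fill above:
   vec_and(vec_cmpgt(*tp, zerov), v) keeps v in lanes whose transition
   probability is greater than zero and forces 0.0 in the rest, so
   impossible transitions can never win the subsequent vec_max. Hypothetical
   helper, illustration only. */
static inline float mask_if_transition_possible(float t, float v)
{
    return (t > 0.0f) ? v : 0.0f;
}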
void pix_movement :: processYUVAltivec(imageStruct &image) { if (image.xsize*image.ysize != buffer.xsize*buffer.ysize){ buffer.xsize = image.xsize; buffer.ysize = image.ysize; buffer.reallocate(buffer.xsize*buffer.ysize*2); } int pixsize = image.ysize * image.xsize/8; union{ signed short c[8]; vector signed short v; }shortBuffer; union{ unsigned short c[8]; vector unsigned short v; }ushortBuffer; int i; vector signed short thresh; shortBuffer.c[0] = threshold; thresh = shortBuffer.v; thresh = (vector signed short)vec_splat(thresh,0); vector unsigned char *rp = (vector unsigned char *) image.data; // read pointer vector unsigned char *wp = (vector unsigned char *) buffer.data; // write pointer to the copy vector unsigned char grey0,grey1; vector unsigned char one = vec_splat_u8(1); vector unsigned short Y0,Ywp0,hiImage0,loImage0; vector unsigned short Y1,Ywp1,hiImage1,loImage1; vector unsigned short UVwp0,UVwp1; vector signed short temp0,temp1; ushortBuffer.c[0]=127; vector unsigned short UV0= (vector unsigned short)vec_splat(ushortBuffer.v, 0); vector unsigned short UV1= (vector unsigned short)vec_splat(ushortBuffer.v, 0); #ifndef PPC970 //setup the cache prefetch -- A MUST!!! UInt32 prefetchSize = GetPrefetchConstant( 16, 0, 256 ); vec_dst( rp, prefetchSize, 0 ); vec_dst( wp, prefetchSize, 1 ); #endif int j = 16; pixsize/=2; for (i=0; i < pixsize; i++) { # ifndef PPC970 //setup the cache prefetch -- A MUST!!! UInt32 prefetchSize = GetPrefetchConstant( j, 0, j * 16 ); vec_dst( rp, prefetchSize, 0 ); vec_dst( wp, prefetchSize, 1 ); vec_dst( rp+16, prefetchSize, 2 ); vec_dst( wp+16, prefetchSize, 3 ); # endif grey0 = rp[0]; grey1 = rp[1]; // rp[Y0]=255*(abs(grey0-*wp)>thresh); // UV0= (vector unsigned short)vec_mule(grey0,one); Y0 = (vector unsigned short)vec_mulo(grey0,one); // UV1= (vector unsigned short)vec_mule(grey1,one); Y1 = (vector unsigned short)vec_mulo(grey1,one); //wp is actually 1/2 the size of the image because it is only Y?? //here the full U Y V Y is stored // UVwp0= (vector unsigned short)vec_mule(wp[0],one); Ywp0 = (vector unsigned short)vec_mulo(wp[0],one); // UVwp1= (vector unsigned short)vec_mule(wp[1],one); Ywp1 = (vector unsigned short)vec_mulo(wp[1],one); //store the current pixels as the history for next time wp[0]=grey0; wp++; wp[0]=grey1; wp++; temp0 = vec_abs(vec_sub((vector signed short)Y0,(vector signed short)Ywp0)); Y0 = (vector unsigned short)vec_cmpgt(temp0,thresh); temp1 = vec_abs(vec_sub((vector signed short)Y1,(vector signed short)Ywp1)); Y1 = (vector unsigned short)vec_cmpgt(temp1,thresh); hiImage0 = vec_mergeh(UV0,Y0); loImage0 = vec_mergel(UV0,Y0); hiImage1 = vec_mergeh(UV1,Y1); loImage1 = vec_mergel(UV1,Y1); grey0 = vec_packsu(hiImage0,loImage0); grey1 = vec_packsu(hiImage1,loImage1); rp[0]=grey0; rp++; rp[0]=grey1; rp++; // grey = rp[0]; // rp[Y1]=255*(abs(grey-*wp)>thresh); // *wp++=grey; // rp+=4; // rp++; } # ifndef PPC970 vec_dss(0); vec_dss(1); vec_dss(2); vec_dss(3); # endif }
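/* Scalar sketch of the per-pixel motion test vectorized above: a pixel is
   flagged (an all-ones compare lane, which packs to white) when the
   absolute luma difference against the stored history frame exceeds the
   threshold. Hypothetical helper, illustration only. */
static inline unsigned char luma_moved(unsigned char y_cur, unsigned char y_prev, short thresh)
{
    short diff = (short)y_cur - (short)y_prev;
    if (diff < 0) diff = -diff;              /* vec_abs(vec_sub(...)) */
    return (diff > thresh) ? 0xFF : 0x00;    /* vec_cmpgt mask lane */
}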
, ((simd_<arithmetic_<A0>,tag::altivec_>)) ((simd_<arithmetic_<A0>,tag::altivec_>)) ); //////////////////////////////////////////////////////////////////////////////// // Overloads implementation //////////////////////////////////////////////////////////////////////////////// namespace nt2 { namespace ext { template<class Dummy> struct call< tag::is_greater_( tag::simd_<tag::arithmetic_,tag::altivec_> , tag::simd_<tag::arithmetic_,tag::altivec_> ) , tag::cpu_, Dummy > : callable { template<class Sig> struct result; template<class This,class A> struct result<This(A,A)> : meta::strip<A> {}; NT2_FUNCTOR_CALL(2) { A0 that = { simd::native_cast<A0>(vec_cmpgt(a0(),a1())) }; return that; } }; } } #endif
void pix_compare :: processYUV_Altivec(imageStruct &image, imageStruct &right) { register int h,w,i,j,width; h = image.ysize; w = image.xsize/8; width = image.xsize/8; //check to see if the buffer isn't 16byte aligned (highly unlikely) if (image.ysize*image.xsize % 16 != 0){ error("image not properly aligned for Altivec"); return; } register vector unsigned short UVres1, Yres1, UVres2, Yres2;//interleave; register vector unsigned short hiImage, loImage; register vector bool short Ymask1; register vector unsigned char one = vec_splat_u8(1); vector unsigned char *inData = (vector unsigned char*) image.data; vector unsigned char *rightData = (vector unsigned char*) right.data; #ifndef PPC970 //setup the cache prefetch -- A MUST!!! UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 ); vec_dst( inData, prefetchSize, 0 ); vec_dst( rightData, prefetchSize, 1 ); #endif if (m_direction) { for ( i=0; i<h; i++){ for (j=0; j<w; j++) { #ifndef PPC970 //this function is probably memory bound on most G4's -- what else is new? vec_dst( inData, prefetchSize, 0 ); vec_dst( rightData, prefetchSize, 1 ); #endif //separate the U and V from Y UVres1 = (vector unsigned short)vec_mule(one,inData[0]); UVres2 = (vector unsigned short)vec_mule(one,rightData[0]); //vec_mulo Y * 1 to short vector Y Y Y Y shorts Yres1 = (vector unsigned short)vec_mulo(one,inData[0]); Yres2 = (vector unsigned short)vec_mulo(one,rightData[0]); //compare the Y values Ymask1 = vec_cmpgt(Yres1,Yres2); //bitwise comparison and move using the result of the comparison as a mask Yres1 = vec_sel(Yres2,Yres1,Ymask1); UVres1 = vec_sel(UVres2,UVres1,Ymask1); //merge the Y and UV back together hiImage = vec_mergeh(UVres1,Yres1); loImage = vec_mergel(UVres1,Yres1); //pack it back down to unsigned char to store inData[0] = vec_packsu(hiImage,loImage); inData++; rightData++; } #ifndef PPC970 vec_dss(1); vec_dss(0); #endif } }else{ for ( i=0; i<h; i++){ for (j=0; j<w; j++) { #ifndef PPC970 vec_dst( inData, prefetchSize, 0 ); vec_dst( rightData, prefetchSize, 1 ); #endif UVres1 = (vector unsigned short)vec_mule(one,inData[0]); UVres2 = (vector unsigned short)vec_mule(one,rightData[0]); //vec_mulo Y * 1 to short vector Y Y Y Y shorts Yres1 = (vector unsigned short)vec_mulo(one,inData[0]); Yres2 = (vector unsigned short)vec_mulo(one,rightData[0]); Ymask1 = vec_cmplt(Yres1,Yres2); Yres1 = vec_sel(Yres2,Yres1,Ymask1); UVres1 = vec_sel(UVres2,UVres1,Ymask1); hiImage = vec_mergeh(UVres1,Yres1); loImage = vec_mergel(UVres1,Yres1); inData[0] = vec_packsu(hiImage,loImage); inData++; rightData++; } #ifndef PPC970 vec_dss(1); vec_dss(0); #endif } } }
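/* Scalar sketch of the compare-and-select above: the luma values of the two
   images are compared, and the whole winning pixel (Y plus its interleaved
   chroma) is carried through, which is why the one mask from vec_cmpgt /
   vec_cmplt on Y drives both the Y and the UV vec_sel. Hypothetical helper,
   illustration only. */
static void pick_pixel_by_luma(unsigned char yL, unsigned char uvL,
                               unsigned char yR, unsigned char uvR,
                               int keep_greater,
                               unsigned char *yOut, unsigned char *uvOut)
{
    int takeLeft = keep_greater ? (yL > yR) : (yL < yR);  /* Ymask1 */
    *yOut  = takeLeft ? yL : yR;    /* vec_sel(Yres2,  Yres1,  Ymask1) */
    *uvOut = takeLeft ? uvL : uvR;  /* vec_sel(UVres2, UVres1, Ymask1) */
}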
int main(int argc, char **argv) { // setup, assign particles initial positions and masses // this is done in scalar fashion, NOT SIMD // insignificant to performance since it's only done once //time_t startTime = time(NULL); //seed random generator srand( time(NULL) ); printf("\n\n\n~~~~~~~~Printing out particles and their randomly assigned positions: \n\n"); int pC = 0; for(pC = 0; pC < PARTICLES_MAXCOUNT; ++pC) { int grideSize = GRID_SIZE; // printf("\n grideSize/2: %d", grideSize/2); float xPos = (float)( rand() % grideSize - grideSize/2); float yPos = (float)( rand() % grideSize - grideSize/2); float zPos = (float)( rand() % grideSize - grideSize/2); particle_Array_PPU[pC].position[0] = xPos; particle_Array_PPU[pC].position[1] = yPos; particle_Array_PPU[pC].position[2] = zPos; particle_Array_PPU[pC].velocity[3] = PARTICLES_DEFAULTMASS; if(pC == 0) { // center, high mass particle_Array_PPU[pC].position = zeroVector; particle_Array_PPU[pC].velocity = zeroVector; //initialVelocityVector_Y_minus; printf("Earth mass: %f\n", earthMass ); particle_Array_PPU[pC].velocity[3] = earthMass; // PARTICLES_DEFAULTMASS * 500.0f; } if(pC == 1) { particle_Array_PPU[pC].position = issPosition; //initPositionVector; particle_Array_PPU[pC].velocity = issVelocity; //initialVelocityVector_Y; particle_Array_PPU[pC].velocity[3] = issMass; //PARTICLES_DEFAULTMASS * 500.0f; } if(pC == 2) { particle_Array_PPU[pC].position = sat1Position; //initPositionVector; particle_Array_PPU[pC].velocity = sat1Velocity; //initialVelocityVector_Y; particle_Array_PPU[pC].velocity[3] = satMass; } if(pC == 3) { particle_Array_PPU[pC].position = sat2Position; //initPositionVector; particle_Array_PPU[pC].velocity = sat2Velocity; //initialVelocityVector_Y; particle_Array_PPU[pC].velocity[3] = satMass; } if(pC == 4) { particle_Array_PPU[pC].position = sat3Position; //initPositionVector; particle_Array_PPU[pC].velocity = sat3Velocity; //initialVelocityVector_Y; particle_Array_PPU[pC].velocity[3] = satMass; } if(pC == 5) { particle_Array_PPU[pC].position = sat4Position; //initPositionVector; particle_Array_PPU[pC].velocity = sat4Velocity; //initialVelocityVector_Y; particle_Array_PPU[pC].velocity[3] = satMass; } if(pC == 6) { particle_Array_PPU[pC].position = moonPosition; //initPositionVector; particle_Array_PPU[pC].velocity = moonVelocity; //initialVelocityVector_Y; particle_Array_PPU[pC].velocity[3] = moonMass; } else { } //particle_Array_PPU[pC].position = vec_splat(particle_Array_PPU[pC].position, 1); //particle_Array_PPU[pC].position = vec_splats((float)GRAVITATIONALCONSTANT); --> use splats, seems faster printf("Particle %d: ", pC ); printf("x= %f, y=%f, z=%f , mass:%f", particle_Array_PPU[pC].position[0], particle_Array_PPU[pC].position[1], particle_Array_PPU[pC].position[2], particle_Array_PPU[pC].velocity[3]); printf("\n"); } // copy arrays into spe ones pC = 0; for(pC = 0; pC < PARTICLES_MAXCOUNT; ++pC) { spe1_Data[pC] = particle_Array_PPU[pC]; spe2_Data[pC] = particle_Array_PPU[pC]; spe3_Data[pC] = particle_Array_PPU[pC]; spe4_Data[pC] = particle_Array_PPU[pC]; spe5_Data[pC] = particle_Array_PPU[pC]; spe6_Data[pC] = particle_Array_PPU[pC]; } for(i = 0; i<PARTICLES_MAXCOUNT; ++i) { /////// INSERT QUADRANT CODE HERE , actually octant --> 8 equal sub cubes // compare with zero vector to get on which side of each axis the particle is // 0 is negative, 1 is positive side of the axis __vector bool int axisDirection = vec_cmpgt(particle_Array_PPU[i].position, zeroVector); // need to manually set, can't cast due to size difference 
error __vector unsigned int shiftedAxis = { (unsigned int)axisDirection[0], (unsigned int)axisDirection[1], (unsigned int)axisDirection[2], 0}; // need to do this to revert 1s into NON 2s complement form --> vec_cmpgt doc LIES shiftedAxis = vec_andc(oneVector, shiftedAxis); /* printf("Particle %d axis sign: ", i ); printf("x= %x, y=%x, z=%x", shiftedAxis[0], shiftedAxis[1], shiftedAxis[2]); printf("\n"); */ // shift 3 axes simultaneously (actually only 2, 1 stays in original position //, with intent to OR them later shiftedAxis = vec_sl(shiftedAxis, axisBitShiftMask); // will also use as x vector __vector unsigned int axis_Y = vec_splats(shiftedAxis[1]); __vector unsigned int axis_Z = vec_splats(shiftedAxis[2]); // merge shifted x y z values by OR-ing // this gives the octant id, range from 0-7 (000 to 111 in binary) shiftedAxis = vec_or(shiftedAxis, axis_Y); shiftedAxis = vec_or(shiftedAxis, axis_Z); // insert octant value into last slot of position vector of particle particle_Array_PPU[i].position[3] = (float)shiftedAxis[0]; //printf("Oct ID: %d \n", shiftedAxis[0]); /////// Update octant vector by incrementing octant that the particle is in // The only possible non SIMD line in the entire program, //irrelevant since quadrant counting should occur on PPU anyways octantCount[shiftedAxis[0]] ++ ; } i=0; printf("\n"); printf("Particle distribution across the octants: \n"); printf("O0: %d O1: %d O2: %d O3: %d O4: %d O5: %d O6: %d O7: %d\n", octantCount[0], octantCount[1], octantCount[2], octantCount[3], octantCount[4], octantCount[5], octantCount[6], octantCount[7]); printf("\n"); int speCount = spe_cpu_info_get(SPE_COUNT_PHYSICAL_SPES,-1); /* printf("\n"); printf("%d", speCount); printf("\n"); printf("\n"); printf("--------------\n"); printf("Starting spe1 part\n"); */ /* // wait for user input, gives time to start graphics printf("Press Enter to continue\n"); getchar(); */ struct timeval start; gettimeofday(&start,NULL); int iterCount = 0; for (iterCount = 0; iterCount< ITERATION_COUNT; iterCount++) { //printf("++++++++++++++ START of ITERATION # %d of %d +++++++++++++++\n", i, ITERATION_COUNT ); int retval; pthread_t spe1_Thread; pthread_t spe2_Thread; pthread_t spe3_Thread; pthread_t spe4_Thread; pthread_t spe5_Thread; pthread_t spe6_Thread; //speData = spe1_Data; speNumber = 0; /* Create Thread */ // printf("spe1_Data value: %d\n", (int)spe1_Data ); retval = pthread_create(&spe1_Thread, // Thread object NULL, // Thread attributes spe_code_launch_1, // Thread function NULL // Thread argument ); // printf("spe2_Data value: %d\n", (int)spe2_Data ); retval = pthread_create(&spe2_Thread, // Thread object NULL, // Thread attributes spe_code_launch_2, // Thread function NULL // Thread argument ); retval = pthread_create(&spe3_Thread, // Thread object NULL, // Thread attributes spe_code_launch_3, // Thread function NULL // Thread argument ); retval = pthread_create(&spe4_Thread, // Thread object NULL, // Thread attributes spe_code_launch_4, // Thread function NULL // Thread argument ); retval = pthread_create(&spe5_Thread, // Thread object NULL, // Thread attributes spe_code_launch_5, // Thread function NULL // Thread argument ); retval = pthread_create(&spe6_Thread, // Thread object NULL, // Thread attributes spe_code_launch_6, // Thread function NULL // Thread argument ); //Wait for Thread Completion retval = pthread_join(spe1_Thread, NULL); retval = pthread_join(spe2_Thread, NULL); retval = pthread_join(spe3_Thread, NULL); retval = pthread_join(spe4_Thread, NULL); retval = 
pthread_join(spe5_Thread, NULL); retval = pthread_join(spe6_Thread, NULL); speNumber = 1; for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i) { particle_Array_PPU[i] = spe1_Data[i]; } speNumber = 2; for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i) { particle_Array_PPU[i] = spe2_Data[i]; } speNumber = 3; for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i) { particle_Array_PPU[i] = spe3_Data[i]; } speNumber = 4; for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i) { particle_Array_PPU[i] = spe4_Data[i]; } speNumber = 5; for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i) { particle_Array_PPU[i] = spe5_Data[i]; } speNumber = 6; for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<PARTICLES_MAXCOUNT; ++i) { particle_Array_PPU[i] = spe6_Data[i]; } // reset spe counter speNumber = 0; // copy arrays into spe ones pC = 0; for(pC = 0; pC < PARTICLES_MAXCOUNT; ++pC) { spe1_Data[pC] = particle_Array_PPU[pC]; spe2_Data[pC] = particle_Array_PPU[pC]; spe3_Data[pC] = particle_Array_PPU[pC]; spe4_Data[pC] = particle_Array_PPU[pC]; spe5_Data[pC] = particle_Array_PPU[pC]; spe6_Data[pC] = particle_Array_PPU[pC]; // update values for shared array (graphics) /* particle_Array_Shared[pC].position[0] = particle_Array_PPU[pC].position[0]; particle_Array_Shared[pC].position[1] = particle_Array_PPU[pC].position[1]; particle_Array_Shared[pC].position[2] = particle_Array_PPU[pC].position[2]; particle_Array_Shared[pC].position[3] = particle_Array_PPU[pC].position[3]; */ /* printf("Particle %d positions: ", pC ); printf("x= %f, y=%f, z=%f , mass:%f", particle_Array_PPU[pC].position[0], particle_Array_PPU[pC].position[1], particle_Array_PPU[pC].position[2], particle_Array_PPU[pC].velocity[3]); printf("\n"); */ fullSimilationData[iterCount].particleArray[pC]= particle_Array_PPU[pC]; } // printf("++++++++++++++ END of ITERATION # %d of %d +++++++++++++++\n", iterCount, ITERATION_COUNT ); } struct timeval end; gettimeofday(&end,NULL); float deltaTime = ((end.tv_sec - start.tv_sec)*1000.0f + (end.tv_usec -start.tv_usec)/1000.0f); printf("print out values from post spe calculations\n"); i = 0; for(i = 0; i<PARTICLES_MAXCOUNT; ++i) { printf("Particle %d positions: ", i ); printf("x= %f, y=%f, z=%f , mass:%f", particle_Array_PPU[i].position[0], particle_Array_PPU[i].position[1], particle_Array_PPU[i].position[2], particle_Array_PPU[i].velocity[3]); printf("\n"); } //cleaning the array octantCount = resetOctantCount; for(i = 0; i<PARTICLES_MAXCOUNT; ++i) { /////// INSERT QUADRANT CODE HERE , actually octant --> 8 equal sub cubes // compare with zero vector to get on which side of each axis the particle is // 0 is negative, 1 is positive side of the axis __vector bool int axisDirection = vec_cmpgt(particle_Array_PPU[i].position, zeroVector); // need to manually set, can't cast due to size difference error __vector unsigned int shiftedAxis = { (unsigned int)axisDirection[0], (unsigned int)axisDirection[1], (unsigned int)axisDirection[2], 0}; // need to do this to revert 1s into NON 2s complement form --> vec_cmpgt doc LIES shiftedAxis = vec_andc(oneVector, shiftedAxis); /* printf("Particle %d axis sign: ", i ); printf("x= %x, y=%x, z=%x", shiftedAxis[0], shiftedAxis[1], shiftedAxis[2]); printf("\n"); */ // shift 3 axes simultaneously (actually only 2, 1 stays in original position //, with intent to OR them 
later shiftedAxis = vec_sl(shiftedAxis, axisBitShiftMask); // will also use as x vector __vector unsigned int axis_Y = vec_splats(shiftedAxis[1]); __vector unsigned int axis_Z = vec_splats(shiftedAxis[2]); // merge shifted x y z values by OR-ing // this gives the octant id, range from 0-7 (000 to 111 in binary) shiftedAxis = vec_or(shiftedAxis, axis_Y); shiftedAxis = vec_or(shiftedAxis, axis_Z); // insert octant value into last slot of position vector of particle particle_Array_PPU[i].position[3] = (float)shiftedAxis[0]; //printf("Oct ID: %d \n", shiftedAxis[0]); /////// Update octant vector by incrementing octant that the particle is in // The only possible non SIMD line in the entire program, //irrelevant since quadrant counting should occur on PPU anyways octantCount[shiftedAxis[0]] ++ ; } i=0; printf("\n"); printf("Particle distribution across the octants: \n"); printf("O0: %d O1: %d O2: %d O3: %d O4: %d O5: %d O6: %d O7: %d\n", octantCount[0], octantCount[1], octantCount[2], octantCount[3], octantCount[4], octantCount[5], octantCount[6], octantCount[7]); printf("\n"); /* time_t endTime = time(NULL); int deltaTime = endTime - startTime; */ // need to look into http://www.xmlsoft.org/ printf("Execution time: %f\n",deltaTime); FILE *filePointer; filePointer = fopen("fileLog1.txt","w"); //fprintf(filePointer, "<SimulationData>\n"); iterCount = 0; for (iterCount = 0; iterCount< ITERATION_COUNT; iterCount++) { //printf("Iteration: %d\n", iterCount); //fprintf(filePointer,"<Iter>\n"); fprintf(filePointer,"\n"); pC = 0; for(pC = 0; pC < PARTICLES_MAXCOUNT; ++pC) { //printf("Particle %d positions: ", pC ); // fprintf(filePointer,"<Obj>\n"); //printf("x= %f, y=%f, z=%f", fullSimilationData[iterCount].particleArray[pC].position[0], fullSimilationData[iterCount].particleArray[pC].position[1], fullSimilationData[iterCount].particleArray[pC].position[2]); //printf("\n"); /* fprintf(filePointer,"<PX>%f</PX>\n",fullSimilationData[iterCount].particleArray[pC].position[0]); fprintf(filePointer,"<PY>%f</PY>\n",fullSimilationData[iterCount].particleArray[pC].position[1]); fprintf(filePointer,"<PZ>%f</PZ>\n",fullSimilationData[iterCount].particleArray[pC].position[2]); */ fprintf(filePointer,"%f,",fullSimilationData[iterCount].particleArray[pC].position[0]); fprintf(filePointer,"%f,",fullSimilationData[iterCount].particleArray[pC].position[1]); fprintf(filePointer,"%f",fullSimilationData[iterCount].particleArray[pC].position[2]); fprintf(filePointer,"|"); //fprintf(filePointer,"</Obj>\n"); //fullSimilationData[fullDataCounter].particleArray[pC]= particle_Array_PPU[pC]; } //fprintf(filePointer,"</Iter>\n"); } //fprintf(filePointer, "</SimulationData>\n"); fclose(filePointer); return 0; }
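/* Scalar sketch of the octant classification performed twice above with
   vec_cmpgt against zeroVector: one sign test per axis, packed into a 3-bit
   id in the 0-7 range. The exact bit positions come from axisBitShiftMask,
   which is defined elsewhere, so the layout below is an assumption made for
   illustration only. */
static inline unsigned int octant_id(float x, float y, float z)
{
    unsigned int bx = (x > 0.0f) ? 1u : 0u;
    unsigned int by = (y > 0.0f) ? 1u : 0u;
    unsigned int bz = (z > 0.0f) ? 1u : 0u;
    return (bx << 2) | (by << 1) | bz;   /* assumed bit layout */
}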
// CHECK-LABEL: define void @test1 void test1() { /* vec_cmpeq */ res_vbll = vec_cmpeq(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpequd // CHECK-LE: @llvm.ppc.altivec.vcmpequd // CHECK-PPC: error: call to 'vec_cmpeq' is ambiguous res_vbll = vec_cmpeq(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpequd // CHECK-LE: @llvm.ppc.altivec.vcmpequd // CHECK-PPC: error: call to 'vec_cmpeq' is ambiguous /* vec_cmpgt */ res_vbll = vec_cmpgt(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtsd // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd // CHECK-PPC: error: call to 'vec_cmpgt' is ambiguous res_vbll = vec_cmpgt(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud // CHECK-LE: @llvm.ppc.altivec.vcmpgtud // CHECK-PPC: error: call to 'vec_cmpgt' is ambiguous /* ----------------------- predicates --------------------------- */ /* vec_all_eq */ res_i = vec_all_eq(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_eq' is ambiguous res_i = vec_all_eq(vsll, vbll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_eq' is ambiguous res_i = vec_all_eq(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_eq' is ambiguous res_i = vec_all_eq(vull, vbll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_eq' is ambiguous res_i = vec_all_eq(vbll, vsll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_eq' is ambiguous res_i = vec_all_eq(vbll, vull); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_eq' is ambiguous res_i = vec_all_eq(vbll, vbll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_eq' is ambiguous /* vec_all_ne */ res_i = vec_all_ne(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_ne' is ambiguous res_i = vec_all_ne(vsll, vbll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_ne' is ambiguous res_i = vec_all_ne(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_ne' is ambiguous res_i = vec_all_ne(vull, vbll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_ne' is ambiguous res_i = vec_all_ne(vbll, vsll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_ne' is ambiguous res_i = vec_all_ne(vbll, vull); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_ne' is ambiguous res_i = vec_all_ne(vbll, vbll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_all_ne' is ambiguous /* vec_any_eq */ res_i = vec_any_eq(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_eq' is ambiguous res_i = vec_any_eq(vsll, vbll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_eq' is ambiguous 
res_i = vec_any_eq(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_eq' is ambiguous res_i = vec_any_eq(vull, vbll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_eq' is ambiguous res_i = vec_any_eq(vbll, vsll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_eq' is ambiguous res_i = vec_any_eq(vbll, vull); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_eq' is ambiguous res_i = vec_any_eq(vbll, vbll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_eq' is ambiguous /* vec_any_ne */ res_i = vec_any_ne(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_ne' is ambiguous res_i = vec_any_ne(vsll, vbll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_ne' is ambiguous res_i = vec_any_ne(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_ne' is ambiguous res_i = vec_any_ne(vull, vbll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_ne' is ambiguous res_i = vec_any_ne(vbll, vsll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_ne' is ambiguous res_i = vec_any_ne(vbll, vull); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_ne' is ambiguous res_i = vec_any_ne(vbll, vbll); // CHECK: @llvm.ppc.altivec.vcmpequd.p // CHECK-LE: @llvm.ppc.altivec.vcmpequd.p // CHECK-PPC: error: call to 'vec_any_ne' is ambiguous /* vec_all_ge */ res_i = vec_all_ge(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_all_ge' is ambiguous res_i = vec_all_ge(vsll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_all_ge' is ambiguous res_i = vec_all_ge(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_ge' is ambiguous res_i = vec_all_ge(vull, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_ge' is ambiguous res_i = vec_all_ge(vbll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_ge' is ambiguous res_i = vec_all_ge(vbll, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_ge' is ambiguous res_i = vec_all_ge(vbll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_ge' is ambiguous /* vec_all_gt */ res_i = vec_all_gt(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_all_gt' is ambiguous res_i = vec_all_gt(vsll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 
'vec_all_gt' is ambiguous res_i = vec_all_gt(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_gt' is ambiguous res_i = vec_all_gt(vull, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_gt' is ambiguous res_i = vec_all_gt(vbll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_gt' is ambiguous res_i = vec_all_gt(vbll, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_gt' is ambiguous res_i = vec_all_gt(vbll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_gt' is ambiguous /* vec_all_le */ res_i = vec_all_le(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_all_le' is ambiguous res_i = vec_all_le(vsll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_all_le' is ambiguous res_i = vec_all_le(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_le' is ambiguous res_i = vec_all_le(vull, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_le' is ambiguous res_i = vec_all_le(vbll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_le' is ambiguous res_i = vec_all_le(vbll, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_le' is ambiguous res_i = vec_all_le(vbll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_le' is ambiguous /* vec_all_lt */ res_i = vec_all_lt(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_all_lt' is ambiguous res_i = vec_all_lt(vsll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_all_lt' is ambiguous res_i = vec_all_lt(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_lt' is ambiguous res_i = vec_all_lt(vull, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_lt' is ambiguous res_i = vec_all_lt(vbll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_lt' is ambiguous res_i = vec_all_lt(vbll, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_lt' is ambiguous res_i = vec_all_lt(vbll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_all_lt' is ambiguous /* vec_any_ge */ res_i = vec_any_ge(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_any_ge' is ambiguous res_i = vec_any_ge(vsll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // 
CHECK-PPC: error: call to 'vec_any_ge' is ambiguous res_i = vec_any_ge(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_ge' is ambiguous res_i = vec_any_ge(vull, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_ge' is ambiguous res_i = vec_any_ge(vbll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_ge' is ambiguous res_i = vec_any_ge(vbll, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_ge' is ambiguous res_i = vec_any_ge(vbll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_ge' is ambiguous /* vec_any_gt */ res_i = vec_any_gt(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_any_gt' is ambiguous res_i = vec_any_gt(vsll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_any_gt' is ambiguous res_i = vec_any_gt(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_gt' is ambiguous res_i = vec_any_gt(vull, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_gt' is ambiguous res_i = vec_any_gt(vbll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_gt' is ambiguous res_i = vec_any_gt(vbll, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_gt' is ambiguous res_i = vec_any_gt(vbll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_gt' is ambiguous /* vec_any_le */ res_i = vec_any_le(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_any_le' is ambiguous res_i = vec_any_le(vsll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_any_le' is ambiguous res_i = vec_any_le(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_le' is ambiguous res_i = vec_any_le(vull, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_le' is ambiguous res_i = vec_any_le(vbll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_le' is ambiguous res_i = vec_any_le(vbll, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_le' is ambiguous res_i = vec_any_le(vbll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_le' is ambiguous /* vec_any_lt */ res_i = vec_any_lt(vsll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_any_lt' is ambiguous res_i = vec_any_lt(vsll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtsd.p // CHECK-LE: 
@llvm.ppc.altivec.vcmpgtsd.p // CHECK-PPC: error: call to 'vec_any_lt' is ambiguous res_i = vec_any_lt(vull, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_lt' is ambiguous res_i = vec_any_lt(vull, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_lt' is ambiguous res_i = vec_any_lt(vbll, vsll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_lt' is ambiguous res_i = vec_any_lt(vbll, vull); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_lt' is ambiguous res_i = vec_any_lt(vbll, vbll); // CHECK: @llvm.ppc.altivec.vcmpgtud.p // CHECK-LE: @llvm.ppc.altivec.vcmpgtud.p // CHECK-PPC: error: call to 'vec_any_lt' is ambiguous /* vec_max */ res_vsll = vec_max(vsll, vsll); // CHECK: @llvm.ppc.altivec.vmaxsd // CHECK-LE: @llvm.ppc.altivec.vmaxsd // CHECK-PPC: error: call to 'vec_max' is ambiguous res_vsll = vec_max(vbll, vsll); // CHECK: @llvm.ppc.altivec.vmaxsd // CHECK-LE: @llvm.ppc.altivec.vmaxsd // CHECK-PPC: error: call to 'vec_max' is ambiguous res_vsll = vec_max(vsll, vbll); // CHECK: @llvm.ppc.altivec.vmaxsd // CHECK-LE: @llvm.ppc.altivec.vmaxsd // CHECK-PPC: error: call to 'vec_max' is ambiguous res_vull = vec_max(vull, vull); // CHECK: @llvm.ppc.altivec.vmaxud // CHECK-LE: @llvm.ppc.altivec.vmaxud // CHECK-PPC: error: call to 'vec_max' is ambiguous res_vull = vec_max(vbll, vull); // CHECK: @llvm.ppc.altivec.vmaxud // CHECK-LE: @llvm.ppc.altivec.vmaxud // CHECK-PPC: error: call to 'vec_max' is ambiguous res_vull = vec_max(vull, vbll); // CHECK: @llvm.ppc.altivec.vmaxud // CHECK-LE: @llvm.ppc.altivec.vmaxud // CHECK-PPC: error: call to 'vec_max' is ambiguous /* vec_min */ res_vsll = vec_min(vsll, vsll); // CHECK: @llvm.ppc.altivec.vminsd // CHECK-LE: @llvm.ppc.altivec.vminsd // CHECK-PPC: error: call to 'vec_min' is ambiguous res_vsll = vec_min(vbll, vsll); // CHECK: @llvm.ppc.altivec.vminsd // CHECK-LE: @llvm.ppc.altivec.vminsd // CHECK-PPC: error: call to 'vec_min' is ambiguous res_vsll = vec_min(vsll, vbll); // CHECK: @llvm.ppc.altivec.vminsd // CHECK-LE: @llvm.ppc.altivec.vminsd // CHECK-PPC: error: call to 'vec_min' is ambiguous res_vull = vec_min(vull, vull); // CHECK: @llvm.ppc.altivec.vminud // CHECK-LE: @llvm.ppc.altivec.vminud // CHECK-PPC: error: call to 'vec_min' is ambiguous res_vull = vec_min(vbll, vull); // CHECK: @llvm.ppc.altivec.vminud // CHECK-LE: @llvm.ppc.altivec.vminud // CHECK-PPC: error: call to 'vec_min' is ambiguous res_vull = vec_min(vull, vbll); // CHECK: @llvm.ppc.altivec.vminud // CHECK-LE: @llvm.ppc.altivec.vminud // CHECK-PPC: error: call to 'vec_min' is ambiguous /* vec_mule */ res_vsll = vec_mule(vi, vi); // CHECK: @llvm.ppc.altivec.vmulesw // CHECK-LE: @llvm.ppc.altivec.vmulosw // CHECK-PPC: error: call to 'vec_mule' is ambiguous res_vull = vec_mule(vui , vui); // CHECK: @llvm.ppc.altivec.vmuleuw // CHECK-LE: @llvm.ppc.altivec.vmulouw // CHECK-PPC: error: call to 'vec_mule' is ambiguous /* vec_mulo */ res_vsll = vec_mulo(vi, vi); // CHECK: @llvm.ppc.altivec.vmulosw // CHECK-LE: @llvm.ppc.altivec.vmulesw // CHECK-PPC: error: call to 'vec_mulo' is ambiguous res_vull = vec_mulo(vui, vui); // CHECK: @llvm.ppc.altivec.vmulouw // CHECK-LE: @llvm.ppc.altivec.vmuleuw // CHECK-PPC: error: call to 'vec_mulo' is ambiguous /* vec_packs */ res_vi = vec_packs(vsll, vsll); // 
CHECK: @llvm.ppc.altivec.vpksdss // CHECK-LE: @llvm.ppc.altivec.vpksdss // CHECK-PPC: error: call to 'vec_packs' is ambiguous res_vui = vec_packs(vull, vull); // CHECK: @llvm.ppc.altivec.vpkudus // CHECK-LE: @llvm.ppc.altivec.vpkudus // CHECK-PPC: error: call to 'vec_packs' is ambiguous /* vec_packsu */ res_vui = vec_packsu(vsll, vsll); // CHECK: @llvm.ppc.altivec.vpksdus // CHECK-LE: @llvm.ppc.altivec.vpksdus // CHECK-PPC: error: call to 'vec_packsu' is ambiguous res_vui = vec_packsu(vull, vull); // CHECK: @llvm.ppc.altivec.vpkudus // CHECK-LE: @llvm.ppc.altivec.vpkudus // CHECK-PPC: error: call to 'vec_packsu' is ambiguous /* vec_rl */ res_vsll = vec_rl(vsll, vull); // CHECK: @llvm.ppc.altivec.vrld // CHECK-LE: @llvm.ppc.altivec.vrld // CHECK-PPC: error: call to 'vec_rl' is ambiguous res_vull = vec_rl(vull, vull); // CHECK: @llvm.ppc.altivec.vrld // CHECK-LE: @llvm.ppc.altivec.vrld // CHECK-PPC: error: call to 'vec_rl' is ambiguous /* vec_sl */ res_vsll = vec_sl(vsll, vull); // CHECK: shl <2 x i64> // CHECK-LE: shl <2 x i64> // CHECK-PPC: error: call to 'vec_sl' is ambiguous res_vull = vec_sl(vull, vull); // CHECK: shl <2 x i64> // CHECK-LE: shl <2 x i64> // CHECK-PPC: error: call to 'vec_sl' is ambiguous /* vec_sr */ res_vsll = vec_sr(vsll, vull); // CHECK: ashr <2 x i64> // CHECK-LE: ashr <2 x i64> // CHECK-PPC: error: call to 'vec_sr' is ambiguous res_vull = vec_sr(vull, vull); // CHECK: lshr <2 x i64> // CHECK-LE: lshr <2 x i64> // CHECK-PPC: error: call to 'vec_sr' is ambiguous /* vec_sra */ res_vsll = vec_sra(vsll, vull); // CHECK: ashr <2 x i64> // CHECK-LE: ashr <2 x i64> // CHECK-PPC: error: call to 'vec_sra' is ambiguous res_vull = vec_sra(vull, vull); // CHECK: ashr <2 x i64> // CHECK-LE: ashr <2 x i64> // CHECK-PPC: error: call to 'vec_sra' is ambiguous /* vec_unpackh */ res_vsll = vec_unpackh(vi); // CHECK: llvm.ppc.altivec.vupkhsw // CHECK-LE: llvm.ppc.altivec.vupklsw // CHECK-PPC: error: call to 'vec_unpackh' is ambiguous res_vbll = vec_unpackh(vbi); // CHECK: llvm.ppc.altivec.vupkhsw // CHECK-LE: llvm.ppc.altivec.vupklsw // CHECK-PPC: error: call to 'vec_unpackh' is ambiguous /* vec_unpackl */ res_vsll = vec_unpackl(vi); // CHECK: llvm.ppc.altivec.vupklsw // CHECK-LE: llvm.ppc.altivec.vupkhsw // CHECK-PPC: error: call to 'vec_unpackl' is ambiguous res_vbll = vec_unpackl(vbi); // CHECK: llvm.ppc.altivec.vupklsw // CHECK-LE: llvm.ppc.altivec.vupkhsw // CHECK-PPC: error: call to 'vec_unpackl' is ambiguous /* vec_vpksdss */ res_vi = vec_vpksdss(vsll, vsll); // CHECK: llvm.ppc.altivec.vpksdss // CHECK-LE: llvm.ppc.altivec.vpksdss // CHECK-PPC: warning: implicit declaration of function 'vec_vpksdss' /* vec_vpksdus */ res_vui = vec_vpksdus(vsll, vsll); // CHECK: llvm.ppc.altivec.vpksdus // CHECK-LE: llvm.ppc.altivec.vpksdus // CHECK-PPC: warning: implicit declaration of function 'vec_vpksdus' /* vec_vpkudum */ res_vi = vec_vpkudum(vsll, vsll); // CHECK: vperm // CHECK-LE: vperm // CHECK-PPC: warning: implicit declaration of function 'vec_vpkudum' res_vui = vec_vpkudum(vull, vull); // CHECK: vperm // CHECK-LE: vperm res_vui = vec_vpkudus(vull, vull); // CHECK: llvm.ppc.altivec.vpkudus // CHECK-LE: llvm.ppc.altivec.vpkudus // CHECK-PPC: warning: implicit declaration of function 'vec_vpkudus' /* vec_vupkhsw */ res_vsll = vec_vupkhsw(vi); // CHECK: llvm.ppc.altivec.vupkhsw // CHECK-LE: llvm.ppc.altivec.vupklsw // CHECK-PPC: warning: implicit declaration of function 'vec_vupkhsw' res_vbll = vec_vupkhsw(vbi); // CHECK: llvm.ppc.altivec.vupkhsw // CHECK-LE: llvm.ppc.altivec.vupklsw /* 
vec_vupklsw */ res_vsll = vec_vupklsw(vi); // CHECK: llvm.ppc.altivec.vupklsw // CHECK-LE: llvm.ppc.altivec.vupkhsw // CHECK-PPC: warning: implicit declaration of function 'vec_vupklsw' res_vbll = vec_vupklsw(vbi); // CHECK: llvm.ppc.altivec.vupklsw // CHECK-LE: llvm.ppc.altivec.vupkhsw /* vec_max */ res_vsll = vec_max(vsll, vsll); // CHECK: @llvm.ppc.altivec.vmaxsd // CHECK-LE: @llvm.ppc.altivec.vmaxsd res_vsll = vec_max(vbll, vsll); // CHECK: @llvm.ppc.altivec.vmaxsd // CHECK-LE: @llvm.ppc.altivec.vmaxsd res_vsll = vec_max(vsll, vbll); // CHECK: @llvm.ppc.altivec.vmaxsd // CHECK-LE: @llvm.ppc.altivec.vmaxsd res_vull = vec_max(vull, vull); // CHECK: @llvm.ppc.altivec.vmaxud // CHECK-LE: @llvm.ppc.altivec.vmaxud res_vull = vec_max(vbll, vull); // CHECK: @llvm.ppc.altivec.vmaxud // CHECK-LE: @llvm.ppc.altivec.vmaxud /* vec_min */ res_vsll = vec_min(vsll, vsll); // CHECK: @llvm.ppc.altivec.vminsd // CHECK-LE: @llvm.ppc.altivec.vminsd res_vsll = vec_min(vbll, vsll); // CHECK: @llvm.ppc.altivec.vminsd // CHECK-LE: @llvm.ppc.altivec.vminsd res_vsll = vec_min(vsll, vbll); // CHECK: @llvm.ppc.altivec.vminsd // CHECK-LE: @llvm.ppc.altivec.vminsd res_vull = vec_min(vull, vull); // CHECK: @llvm.ppc.altivec.vminud // CHECK-LE: @llvm.ppc.altivec.vminud res_vull = vec_min(vbll, vull); // CHECK: @llvm.ppc.altivec.vminud // CHECK-LE: @llvm.ppc.altivec.vminud }
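The fragment above is a Clang FileCheck codegen test: each intrinsic call is followed by the IR the big-endian build should emit (CHECK), the IR for the little-endian build (CHECK-LE), and the diagnostic a pre-POWER8 target should raise (CHECK-PPC). For context, here is a minimal sketch of the scaffolding such a test needs to run under llvm-lit; the RUN lines and global declarations below are assumptions modeled on the conventions of Clang's own PowerPC builtin tests, not part of the original fragment.

// Hypothetical RUN lines (an assumption, in the style of Clang's PowerPC builtin tests):
// RUN: %clang_cc1 -faltivec -target-feature +power8-vector -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
// RUN: %clang_cc1 -faltivec -target-feature +power8-vector -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-LE
// RUN: not %clang_cc1 -faltivec -triple powerpc-unknown-unknown -emit-llvm %s -o - 2>&1 | FileCheck %s -check-prefix=CHECK-PPC

#include <altivec.h>

// Globals the fragment reads and writes (assumed declarations, named after the test's own conventions):
vector signed long long vsll, res_vsll;
vector unsigned long long vull, res_vull;
vector bool long long vbll, res_vbll;
vector signed int vi, res_vi;
vector unsigned int vui, res_vui;
vector bool int vbi;
int res_i;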
int main(int argc, char **argv) { time_t startTime = time(NULL); // setup, assign particles initial positions and masses // this is done in scalar fashion, NOT SIMD // insignificant to performance since it's only done once struct timeval start; gettimeofday(&start,NULL); // seed random generator srand( time(NULL) ); printf("\n\n\n~~~~~~~~Printing out particles and their randomly assigned positions: \n\n"); int pC = 0; for(pC = 0; pC < PARTICLES_MAXCOUNT; ++pC) { int grideSize = GRID_SIZE; // printf("\n grideSize/2: %d", grideSize/2); float xPos = (float)( rand() % grideSize - grideSize/2); float yPos = (float)( rand() % grideSize - grideSize/2); float zPos = (float)( rand() % grideSize - grideSize/2); particle_Array[pC].position[0] = xPos; particle_Array[pC].position[1] = yPos; particle_Array[pC].position[2] = zPos; particle_Array[pC].velocity[3] = PARTICLES_DEFAULTMASS; //particle_Array[pC].position = vec_splat(particle_Array[pC].position, 1); //particle_Array[pC].position = vec_splats((float)GRAVITATIONALCONSTANT); --> use splats, seems faster printf("Particle %d: ", pC ); printf("x= %f, y=%f, z=%f", particle_Array[pC].position[0], particle_Array[pC].position[1], particle_Array[pC].position[2]); printf("\n"); } ///main loop // temp particle data structs used for calculations, not pointers, purposefully passed by value particle_Data pDi; particle_Data pDj; //temp vectors used for calculations in loop __vector float tempAcceleration = {0,0,0,0}; __vector float tempVelocity = {0,0,0,0}; __vector float tempDistance = {0,0,0,0}; //--> use 4th element to store radius __vector float tempDistanceRL1 = {0,0,0,0}; __vector float tempDistanceRL2 = {0,0,0,0}; __vector float tempNumerator = {0,0,0,0}; __vector float tempMassSplat = {0,0,0,0}; __vector float tempGConstant = {GRAVITATIONALCONSTANT,GRAVITATIONALCONSTANT,GRAVITATIONALCONSTANT,GRAVITATIONALCONSTANT }; __vector float tempDELATTIME = {DELTA_TIME, DELTA_TIME, DELTA_TIME, DELTA_TIME}; __vector float tempEPS = {EPS, EPS, EPS, EPS}; __vector float zeroVector = {0,0,0,0}; __vector unsigned int oneVector = {1,1,1,1}; __vector unsigned int axisBitShiftMask = {0,1,2,0}; __vector unsigned char yzxwMask = { 4,5,6,7, 8,9,10,11, 0,1,2,3, 12,13,14,15}; __vector unsigned char zxywMask = { 8,9,10,11, 0,1,2,3, 4,5,6,7, 12,13,14,15}; __vector unsigned short resetOctantCount = {0,0,0,0,0,0,0,0}; __vector unsigned short increment = {1,1,1,1,1,1,1,1}; __vector float tempUnitVector = {0,0,0,0}; __vector float distanceVector = {0,0,0,0}; // C99 requires the indices to be declared before the for loops int i = 0; int j = 0; int it_counter = 0; printf("\n^^^^^^^ Now starting main loop\n\n\n"); for(it_counter = 0; it_counter < ITERATION_COUNT; ++it_counter) { octantCount = resetOctantCount; // printf("\nIteration: %d\n",it_counter ); // this first loop is to calculate the forces/accelerations // NOTE ---> NO FORCES ARE APPLIED IN THIS LOOP, NO POSITIONS WILL BE CHANGED. // The calculated accelerations will be used to increment the particle's velocity vector, NOT POSITION for(i = 0; i<PARTICLES_MAXCOUNT; ++i) { //cache the particle data struct to the temp declared outside the loops pDi = particle_Array[i]; for(j = 0; j<PARTICLES_MAXCOUNT; ++j) { //for every particle i, calculate for all j's // get resultant total velocity, don't apply it in these loops, // apply velocities for all bodies at the same time, in a separate loop at the end.
//cache the particle data struct to the temp declared outside the loops pDj = particle_Array[j]; // Formula being used --> a = (G * m)/(r^2) tempDistance = vec_sub(pDj.position,pDi.position); //actual distance vector between objects i and j // save value for unit vector calculation later distanceVector = tempDistance; /* //Print distances between particles printf("Particle %d: ", i ); printf("x= %f, y=%f, z=%f", tempDistance[0], tempDistance[1], tempDistance[2]); printf("\n"); */ //use the distance vector right now for the numerator, before we overwrite it later in the code // use the mass of the attracting particle j (a_i = G*m_j/r^2) tempMassSplat = vec_splats((float)pDj.velocity[3]); //mass is stored in the last element (3) of velocity vector tempNumerator = vec_madd(tempMassSplat, tempGConstant, zeroVector); /* //Print numerator printf("Numerator %d: ", i ); printf("x= %f, y=%f, z=%f", tempNumerator[0], tempNumerator[1], tempNumerator[2]); printf("\n"); */ //Assembly for vector rotate //__asm__("addi 4,4,1;"); // denominator part // square each component, x,y,z beforehand tempDistance = vec_madd(tempDistance, tempDistance, zeroVector); //using perm instead of rotate, bleurg tempDistanceRL1 = vec_perm(tempDistance, zeroVector, yzxwMask); // imitates 1xfloat left rotate tempDistanceRL2 = vec_perm(tempDistance, zeroVector, zxywMask); // imitates 2xfloat left rotate //add both tempDistanceRL1 = vec_add(tempDistanceRL1, tempDistanceRL2); //add to original to get total ---> x+y+z tempDistance = vec_add(tempDistance, tempDistanceRL1); //tempDistance is now total distance squared // add EPS to avoid singularity tempDistance = vec_add(tempDistance, tempEPS); //this is now the denominator value //save inverse magnitude for unit vector later tempUnitVector = vec_rsqrte(tempDistance); // invert vector to avoid division later tempDistance = vec_re(tempDistance); // this is final denominator (already inverted), only need to multiply // tempDistance is now equivalent to 1/r^2 /* //Print denominator printf("Denominator %d: ", i ); printf("x= %f, y=%f, z=%f", tempDistance[0], tempDistance[1], tempDistance[2]); printf("\n"); */ //total acceleration applied to particle i, by particle j tempAcceleration = vec_madd(tempDistance, tempNumerator, zeroVector); // create unit vector tempUnitVector = vec_madd(distanceVector, tempUnitVector, zeroVector); // apply unit vector to acceleration tempAcceleration = vec_madd(tempUnitVector, tempAcceleration, zeroVector); //increment velocity value of particle with a*dt // need to explicitly index the array, since pDi is only a temp passed by value, so changing it wouldn't change the particle particle_Array[i].velocity = vec_madd(tempAcceleration, tempDELATTIME, particle_Array[i].velocity); /* //Print velocity printf("Velocity %d: ", i ); printf("x= %f, y=%f, z=%f", pDi.velocity[0], pDi.velocity[1], pDi.velocity[2]); printf("\n"); */ /* printf("Particle %d: ", i ); printf("x= %f, y=%f, z=%f", pDi.velocity[0], pDi.velocity[1], pDi.velocity[2]); printf("\n"); */ //end of this loop } //printf("\n"); } //now that all the accelerations for all particles are calculated, //apply the velocities and update positions for(i = 0; i<PARTICLES_MAXCOUNT; ++i) { //incrementing position with v*dt // vec_madd is awesome, it all gets done in one line!
emulates the += operator, kinda, but more flexible particle_Array[i].position = vec_madd(particle_Array[i].velocity, tempDELATTIME, particle_Array[i].position); /* printf("Particle %d positions: ", i ); printf("x= %f, y=%f, z=%f", particle_Array[i].position[0], particle_Array[i].position[1], particle_Array[i].position[2]); printf("\n"); */ ///// ALL CODE BELOW THIS SHOULD ONLY BE RUN ON PPU \\\\\\\\\\\\\\\\\\ /////////// INSERT QUADRANT CODE HERE , actually octant --> 8 equal sub cubes // compare with zero vector to get on which side of each axis the particle is // 0 is negative, 1 is positive side of the axis __vector bool int axisDirection = vec_cmpgt(particle_Array[i].position, zeroVector); // need to manually set, can't cast due to size difference error __vector unsigned int shiftedAxis = { (unsigned int)axisDirection[0], (unsigned int)axisDirection[1], (unsigned int)axisDirection[2], 0}; // vec_cmpgt returns all-ones (two's complement -1) for true; AND with 1 to get a clean 0/1 flag per axis shiftedAxis = vec_and(oneVector, shiftedAxis); /* printf("Particle %d axis sign: ", i ); printf("x= %x, y=%x, z=%x", shiftedAxis[0], shiftedAxis[1], shiftedAxis[2]); printf("\n"); */ // shift the 3 axes simultaneously (actually only 2, one stays in its original position) //, with intent to OR them later shiftedAxis = vec_sl(shiftedAxis, axisBitShiftMask); // will also use as x vector __vector unsigned int axis_Y = vec_splats(shiftedAxis[1]); __vector unsigned int axis_Z = vec_splats(shiftedAxis[2]); // merge shifted x y z values by OR-ing // this gives the octant id, range from 0-7 (000 to 111 in binary) shiftedAxis = vec_or(shiftedAxis, axis_Y); shiftedAxis = vec_or(shiftedAxis, axis_Z); // insert octant value into last slot of position vector of particle particle_Array[i].position[3] = (float)shiftedAxis[0]; //printf("Oct ID: %d \n", shiftedAxis[0]); /////// Update octant vector by incrementing the octant that the particle is in // The only possible non SIMD line in the entire program, //irrelevant since quadrant counting should occur on PPU anyways octantCount[shiftedAxis[0]]++; } //end of position/octant loop /* printf("End of iteration %d ---> ",it_counter ); printf("Particle distribution across the octants: \n"); printf("O0: %d O1: %d O2: %d O3: %d O4: %d O5: %d O6: %d O7: %d\n", octantCount[0], octantCount[1], octantCount[2], octantCount[3], octantCount[4], octantCount[5], octantCount[6], octantCount[7]); printf("\n"); */ } /* printf("\n"); for(i = 0; i<PARTICLES_MAXCOUNT; ++i) { printf("Particle %d final position: ", i ); printf("x= %f, y=%f, z=%f", particle_Array[i].position[0], particle_Array[i].position[1], particle_Array[i].position[2]); printf("\n"); printf("End of iteration %d ---> ",it_counter ); printf("Particle distribution across the octants: \n"); printf("O0: %d O1: %d O2: %d O3: %d O4: %d O5: %d O6: %d O7: %d\n", octantCount[0], octantCount[1], octantCount[2], octantCount[3], octantCount[4], octantCount[5], octantCount[6], octantCount[7]); printf("\n"); } */ printf("Particle distribution across the octants: \n"); printf("O0: %d O1: %d O2: %d O3: %d O4: %d O5: %d O6: %d O7: %d\n", octantCount[0], octantCount[1], octantCount[2], octantCount[3], octantCount[4], octantCount[5], octantCount[6], octantCount[7]); printf("\n"); struct timeval end; gettimeofday(&end,NULL); float deltaTime = ((end.tv_sec - start.tv_sec)*1000.0f + (end.tv_usec -start.tv_usec)/1000.0f); printf("Execution time: %f ms\n",deltaTime); return 0; }
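The octant classification at the end of the loop above packs one sign bit per axis into a 3-bit ID. A scalar equivalent may make the bit layout easier to see; this is a standalone sketch with a hypothetical helper name, not code from the simulation itself:

#include <stdio.h>

/* Scalar equivalent of the SIMD octant ID above: the sign of x lands in
   bit 0, y in bit 1, z in bit 2 (matching axisBitShiftMask = {0,1,2,0}),
   so the ID runs 0..7 (binary 000..111). */
static int octant_id(float x, float y, float z) {
    return (x > 0.0f ? 1 : 0)
         | ((y > 0.0f ? 1 : 0) << 1)
         | ((z > 0.0f ? 1 : 0) << 2);
}

int main(void) {
    /* x>0 sets bit 0, z>0 sets bit 2: 1 + 4 = octant 5 */
    printf("%d\n", octant_id(1.0f, -2.0f, 3.0f));
    return 0;
}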
void test1() { // CHECK-LABEL: define void @test1 // CHECK-LE-LABEL: define void @test1 res_vf = vec_abs(vf); // CHECK: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}}) dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_add(vd, vd); // CHECK: fadd <2 x double> // CHECK-LE: fadd <2 x double> res_vd = vec_and(vbll, vd); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> res_vd = vec_and(vd, vbll); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> res_vd = vec_and(vd, vd); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_andc(vbll, vd); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK-LE: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_andc(vd, vbll); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK-LE: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_andc(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_ceil(vd); // CHECK: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}}) res_vf = vec_ceil(vf); // CHECK: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}}) res_vbll = vec_cmpeq(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpeq(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmpge(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpge(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) 
res_vbll = vec_cmpgt(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpgt(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmple(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmple(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmplt(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmplt(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) /* vec_cpsgn */ res_vf = vec_cpsgn(vf, vf); // CHECK: call <4 x float> @llvm.copysign.v4f32(<4 x float> %{{.+}}, <4 x float> %{{.+}}) // CHECK-LE: call <4 x float> @llvm.copysign.v4f32(<4 x float> %{{.+}}, <4 x float> %{{.+}}) res_vd = vec_cpsgn(vd, vd); // CHECK: call <2 x double> @llvm.copysign.v2f64(<2 x double> %{{.+}}, <2 x double> %{{.+}}) // CHECK-LE: call <2 x double> @llvm.copysign.v2f64(<2 x double> %{{.+}}, <2 x double> %{{.+}}) /* vec_div */ res_vsll = vec_div(vsll, vsll); // CHECK: sdiv <2 x i64> // CHECK-LE: sdiv <2 x i64> res_vull = vec_div(vull, vull); // CHECK: udiv <2 x i64> // CHECK-LE: udiv <2 x i64> res_vf = vec_div(vf, vf); // CHECK: fdiv <4 x float> // CHECK-LE: fdiv <4 x float> res_vd = vec_div(vd, vd); // CHECK: fdiv <2 x double> // CHECK-LE: fdiv <2 x double> /* vec_max */ res_vf = vec_max(vf, vf); // CHECK: @llvm.ppc.vsx.xvmaxsp // CHECK-LE: @llvm.ppc.vsx.xvmaxsp res_vd = vec_max(vd, vd); // CHECK: @llvm.ppc.vsx.xvmaxdp // CHECK-LE: @llvm.ppc.vsx.xvmaxdp res_vf = vec_vmaxfp(vf, vf); // CHECK: @llvm.ppc.vsx.xvmaxsp // CHECK-LE: @llvm.ppc.vsx.xvmaxsp /* vec_min */ res_vf = vec_min(vf, vf); // CHECK: @llvm.ppc.vsx.xvminsp // CHECK-LE: @llvm.ppc.vsx.xvminsp res_vd = vec_min(vd, vd); // CHECK: @llvm.ppc.vsx.xvmindp // CHECK-LE: @llvm.ppc.vsx.xvmindp res_vf = vec_vminfp(vf, vf); // CHECK: @llvm.ppc.vsx.xvminsp // CHECK-LE: @llvm.ppc.vsx.xvminsp res_d = __builtin_vsx_xsmaxdp(d, d); // CHECK: @llvm.ppc.vsx.xsmaxdp // CHECK-LE: @llvm.ppc.vsx.xsmaxdp res_d = __builtin_vsx_xsmindp(d, d); // CHECK: @llvm.ppc.vsx.xsmindp // CHECK-LE: @llvm.ppc.vsx.xsmindp /* vec_perm */ res_vsll = vec_perm(vsll, vsll, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_perm(vull, vull, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vbll = vec_perm(vbll, vbll, vuc); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> 
{{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vf = vec_round(vf); // CHECK: call <4 x float> @llvm.round.v4f32(<4 x float> // CHECK-LE: call <4 x float> @llvm.round.v4f32(<4 x float> res_vd = vec_round(vd); // CHECK: call <2 x double> @llvm.round.v2f64(<2 x double> // CHECK-LE: call <2 x double> @llvm.round.v2f64(<2 x double> res_vd = vec_perm(vd, vd, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vd = vec_splat(vd, 1); // CHECK: [[T1:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vbll = vec_splat(vbll, 1); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vsll = vec_splat(vsll, 1); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vull = vec_splat(vull, 1); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vsi = vec_pack(vsll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vui = vec_pack(vull, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vbi = vec_pack(vbll, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_vperm(vsll, vsll, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_vperm(vull, vull, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vd = vec_vperm(vd, vd, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm /* vec_vsx_ld */ res_vsi = vec_vsx_ld(0, &vsi); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vui = vec_vsx_ld(0, &vui); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vf = vec_vsx_ld (0, &vf); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vsll = vec_vsx_ld(0, &vsll); 
// CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vull = vec_vsx_ld(0, &vull); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vd = vec_vsx_ld(0, &vd); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vull = vec_vsx_ld(0, &vull); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vd = vec_vsx_ld(0, &vd); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vss = vec_vsx_ld(0, &vss); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vss = vec_vsx_ld(0, &ss); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vus = vec_vsx_ld(0, &vus); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vus = vec_vsx_ld(0, &us); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vbc = vec_vsx_ld(0, &vbc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vsc = vec_vsx_ld(0, &vsc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vuc = vec_vsx_ld(0, &vuc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vsc = vec_vsx_ld(0, &sc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vuc = vec_vsx_ld(0, &uc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x /* vec_vsx_st */ vec_vsx_st(vsi, 0, &res_vsi); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsi, 0, &res_si); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vui, 0, &res_vui); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vui, 0, &res_ui); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vf, 0, &res_vf); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsll, 0, &res_vsll); // CHECK: @llvm.ppc.vsx.stxvd2x // CHECK-LE: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vull, 0, &res_vull); // CHECK: @llvm.ppc.vsx.stxvd2x // CHECK-LE: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vd, 0, &res_vd); // CHECK: @llvm.ppc.vsx.stxvd2x // CHECK-LE: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vss, 0, &res_vss); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vss, 0, &res_ss); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vus, 0, &res_vus); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vus, 0, &res_us); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsc, 0, &res_vsc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsc, 0, &res_sc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vuc, 0, &res_vuc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vuc, 0, &res_uc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vbc, 0, &res_vbc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vbc, 0, &res_sc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vbc, 0, &res_uc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x /* vec_and */ res_vsll = vec_and(vsll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_and(vbll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_and(vsll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_and(vull, vull); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_and(vbll, vull); // CHECK: and 
<2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_and(vull, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vbll = vec_and(vbll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> /* vec_vand */ res_vsll = vec_vand(vsll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_vand(vbll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_vand(vsll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_vand(vull, vull); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_vand(vbll, vull); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_vand(vull, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vbll = vec_vand(vbll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> /* vec_andc */ res_vsll = vec_andc(vsll, vsll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_andc(vbll, vsll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_andc(vsll, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_andc(vull, vull); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_andc(vbll, vull); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_andc(vull, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vbll = vec_andc(vbll, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vf = vec_floor(vf); // CHECK: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_floor(vd); // CHECK: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_madd(vf, vf, vf); // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) res_vd = vec_madd(vd, vd, vd); // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) /* vec_mergeh */ res_vsll = vec_mergeh(vsll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_mergeh(vsll, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_mergeh(vbll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergeh(vull, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergeh(vull, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergeh(vbll, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm /* vec_mergel */ res_vsll = vec_mergel(vsll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_mergel(vsll, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = 
vec_mergel(vbll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergel(vull, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergel(vull, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergel(vbll, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm /* vec_msub */ res_vf = vec_msub(vf, vf, vf); // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> res_vd = vec_msub(vd, vd, vd); // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // CHECK-LE: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> res_vsll = vec_mul(vsll, vsll); // CHECK: mul <2 x i64> // CHECK-LE: mul <2 x i64> res_vull = vec_mul(vull, vull); // CHECK: mul <2 x i64> // CHECK-LE: mul <2 x i64> res_vf = vec_mul(vf, vf); // CHECK: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}} res_vd = vec_mul(vd, vd); // CHECK: fmul <2 x double> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: fmul <2 x double> %{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_nearbyint(vf); // CHECK: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_nearbyint(vd); // CHECK: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_nmadd(vf, vf, vf); // CHECK: [[FM:[0-9]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]] // CHECK-LE: [[FM:[0-9]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-LE-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]] res_vd = vec_nmadd(vd, vd, vd); // CHECK: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] // CHECK-LE: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-LE-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] res_vf = vec_nmsub(vf, vf, vf); // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float 
-0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} res_vd = vec_nmsub(vd, vd, vd); // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] // CHECK-LE: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // CHECK-LE-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] /* vec_nor */ res_vsll = vec_nor(vsll, vsll); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> // CHECK-LE: or <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_nor(vull, vull); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> // CHECK-LE: or <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_nor(vbll, vbll); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> // CHECK-LE: or <2 x i64> // CHECK-LE: xor <2 x i64> res_vd = vec_nor(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1> // CHECK-LE: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1> /* vec_or */ res_vsll = vec_or(vsll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_or(vbll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_or(vsll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_or(vull, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_or(vbll, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_or(vull, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vbll = vec_or(vbll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vd = vec_or(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} res_vd = vec_or(vbll, vd); // CHECK: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[T2:%.+]] = or <2 x i64> %{{[0-9]+}}, [[T1]] // CHECK: bitcast <2 x i64> [[T2]] to <2 x double> // CHECK-LE: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: [[T2:%.+]] = or <2 x i64> %{{[0-9]+}}, [[T1]] // CHECK-LE: bitcast <2 x i64> [[T2]] to <2 x double> res_vd = vec_or(vd, vbll); // CHECK: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[T2:%.+]] = or <2 x i64> [[T1]], %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[T2]] to <2 x double> // CHECK-LE: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: [[T2:%.+]] = or <2 x i64> [[T1]], %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[T2]] to <2 x double> res_vf = vec_re(vf); // CHECK: call <4 x float> @llvm.ppc.vsx.xvresp(<4 x float> // CHECK-LE: call <4 x float> 
@llvm.ppc.vsx.xvresp(<4 x float> res_vd = vec_re(vd); // CHECK: call <2 x double> @llvm.ppc.vsx.xvredp(<2 x double> // CHECK-LE: call <2 x double> @llvm.ppc.vsx.xvredp(<2 x double> res_vf = vec_rint(vf); // CHECK: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_rint(vd); // CHECK: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_rsqrte(vf); // CHECK: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}}) res_vd = vec_rsqrte(vd); // CHECK: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}}) dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vf = vec_sel(vd, vd, vbll); // CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> %{{[0-9]+}}, // CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: or <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> // CHECK-LE: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> %{{[0-9]+}}, // CHECK-LE: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: or <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_sel(vd, vd, vull); // CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> %{{[0-9]+}}, // CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: or <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> // CHECK-LE: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> %{{[0-9]+}}, // CHECK-LE: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: or <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> res_vf = vec_sqrt(vf); // CHECK: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_sqrt(vd); // CHECK: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}}) res_vd = vec_sub(vd, vd); // CHECK: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_trunc(vf); // CHECK: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_trunc(vd); // CHECK: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}}) /* vec_vor */ res_vsll = vec_vor(vsll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_vor(vbll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_vor(vsll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_vor(vull, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_vor(vbll, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_vor(vull, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vbll = vec_vor(vbll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> /* vec_xor */ res_vsll = vec_xor(vsll, vsll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_xor(vbll, vsll); // CHECK: xor <2 x i64> // 
// CHECK-LE: xor <2 x i64>
  res_vsll = vec_xor(vsll, vbll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>
  res_vull = vec_xor(vull, vull);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>
  res_vull = vec_xor(vbll, vull);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>
  res_vull = vec_xor(vull, vbll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>
  res_vbll = vec_xor(vbll, vbll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_xor(vd, vd);
// CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK: bitcast <2 x i64> [[X1]] to <2 x double>
// CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_xor(vd, vbll);
// CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK: bitcast <2 x i64> [[X1]] to <2 x double>
// CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_xor(vbll, vd);
// CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK: bitcast <2 x i64> [[X1]] to <2 x double>
// CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double>

  /* vec_vxor */
  res_vsll = vec_vxor(vsll, vsll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>
  res_vsll = vec_vxor(vbll, vsll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>
  res_vsll = vec_vxor(vsll, vbll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>
  res_vull = vec_vxor(vull, vull);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>
  res_vull = vec_vxor(vbll, vull);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>
  res_vull = vec_vxor(vull, vbll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>
  res_vbll = vec_vxor(vbll, vbll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vsll = vec_cts(vd, 0);
// CHECK: fmul <2 x double>
// CHECK: fptosi <2 x double> %{{.*}} to <2 x i64>
// CHECK-LE: fmul <2 x double>
// CHECK-LE: fptosi <2 x double> %{{.*}} to <2 x i64>
  res_vsll = vec_cts(vd, 31);
// CHECK: fmul <2 x double>
// CHECK: fptosi <2 x double> %{{.*}} to <2 x i64>
// CHECK-LE: fmul <2 x double>
// CHECK-LE: fptosi <2 x double> %{{.*}} to <2 x i64>
  res_vull = vec_ctu(vd, 0);
// CHECK: fmul <2 x double>
// CHECK: fptoui <2 x double> %{{.*}} to <2 x i64>
// CHECK-LE: fmul <2 x double>
// CHECK-LE: fptoui <2 x double> %{{.*}} to <2 x i64>
  res_vull = vec_ctu(vd, 31);
// CHECK: fmul <2 x double>
// CHECK: fptoui <2 x double> %{{.*}} to <2 x i64>
// CHECK-LE: fmul <2 x double>
// CHECK-LE: fptoui <2 x double> %{{.*}} to <2 x i64>

  res_vd = vec_ctf(vsll, 0);
// CHECK: sitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK: fmul <2 x double>
// CHECK-LE: sitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK-LE: fmul <2 x double>
  res_vd = vec_ctf(vsll, 31);
// CHECK: sitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK: fmul <2 x double>
// CHECK-LE: sitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK-LE: fmul <2 x double>
  res_vd = vec_ctf(vull, 0);
// CHECK: uitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK: fmul <2 x double>
// CHECK-LE: uitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK-LE: fmul <2 x double>
  res_vd = vec_ctf(vull, 31);
// CHECK: uitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK: fmul <2 x double>
// CHECK-LE: uitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK-LE: fmul <2 x double>
}
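/* A minimal scalar sketch (not part of the test above) of what the
 * res_vd = vec_xor(vd, vd) CHECK lines verify: LLVM IR has no floating-point
 * xor, so clang bitcasts the <2 x double> operands to <2 x i64>, xors them as
 * integers, and bitcasts the result back. The helper name below is ours, and
 * the code is portable C99 rather than VSX intrinsics:
 */
#include <stdint.h>
#include <string.h>

static double xor_double(double a, double b)
{
    uint64_t ia, ib, ir;
    double   r;
    memcpy(&ia, &a, sizeof ia);   /* "bitcast" double -> i64 */
    memcpy(&ib, &b, sizeof ib);
    ir = ia ^ ib;                 /* the integer xor the IR performs */
    memcpy(&r, &ir, sizeof r);    /* "bitcast" i64 -> double */
    return r;                     /* xor_double(x, x) yields +0.0 */
}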
void init_particles(particle* system){
  /*
   * particles will be placed randomly in 4 quadrants of 3d space
   * q1: x>0, y>0
   * q2: x<0, y>0
   * q3: x<0, y<0
   * q4: x>0, y<0
   */
  int i = 0;
  srand(time(NULL));

  // q1 x>0, y>0
  // note: the modulus for z is parenthesized so z covers the full
  // [-box, box) range rather than only even values
  for(; i < NO_OF_PARTICLES/4; i++){
    system[i].position = (__vector float){ rand()%INIT_BOUNDING_BOX,
                                           rand()%INIT_BOUNDING_BOX,
                                           rand()%(INIT_BOUNDING_BOX*2) - INIT_BOUNDING_BOX, 0 };
    system[i].velocity = VECZERO;
    system[i].acceleration = VECZERO;
  }
  // q2 x<0, y>0
  for(; i < NO_OF_PARTICLES/2; i++){
    system[i].position = (__vector float){ -(rand()%INIT_BOUNDING_BOX),
                                           rand()%INIT_BOUNDING_BOX,
                                           rand()%(INIT_BOUNDING_BOX*2) - INIT_BOUNDING_BOX, 0 };
    system[i].velocity = VECZERO;
    system[i].acceleration = VECZERO;
  }
  // q3 x<0, y<0
  for(; i < 3*NO_OF_PARTICLES/4; i++){
    system[i].position = (__vector float){ -(rand()%INIT_BOUNDING_BOX),
                                           -(rand()%INIT_BOUNDING_BOX),
                                           rand()%(INIT_BOUNDING_BOX*2) - INIT_BOUNDING_BOX, 0 };
    system[i].velocity = VECZERO;
    system[i].acceleration = VECZERO;
  }
  // q4 x>0, y<0
  for(; i < NO_OF_PARTICLES; i++){
    system[i].position = (__vector float){ rand()%INIT_BOUNDING_BOX,
                                           -(rand()%INIT_BOUNDING_BOX),
                                           rand()%(INIT_BOUNDING_BOX*2) - INIT_BOUNDING_BOX, 0 };
    system[i].velocity = VECZERO;
    system[i].acceleration = VECZERO;
  }
}

void compute_interaction(particle* i, particle* j){
  __vector float radius, radius_sqr, s_vector, displ, accel,
                 distSqr, distSixth, invDistCube;

  /* compute acceleration of particle i */
  radius      = vec_sub(j->position, i->position);
  radius_sqr  = vec_madd(radius, radius, VECZERO);
  distSqr     = vec_add(vec_splat(radius_sqr,0), vec_splat(radius_sqr,1));
  distSqr     = vec_add(vec_splat(radius_sqr,2), distSqr);
  distSqr     = vec_add(EPS2_VECTOR, distSqr);          // softening term keeps distSqr > 0
  distSixth   = vec_madd(distSqr, distSqr, VECZERO);
  distSixth   = vec_madd(distSixth, distSqr, VECZERO);  // distSqr^3
  invDistCube = vec_rsqrte(distSixth);                  // 1/sqrt(d^6) = 1/d^3
  s_vector    = vec_madd(MASS_VECTOR, invDistCube, VECZERO);
  i->acceleration = vec_madd(radius, s_vector, i->acceleration);

  /* compute new position & velocity of particle i */
  displ = vec_madd(i->velocity, TIME_STEP_VECTOR, i->position);
  accel = vec_madd(VECHALF, i->acceleration, VECZERO);
  i->position = vec_madd(accel, TIME_SQUARED, displ);
  i->velocity = vec_madd(i->acceleration, TIME_STEP_VECTOR, i->velocity);
}

void update_particles(particle* system){
  int i, j;
  // Thread 1 ? (a pthreads version is sketched after main())
  for(i = 0; i < NO_OF_PARTICLES/4; i++){
    for(j = 0; j < NO_OF_PARTICLES; j++){
      compute_interaction(&system[i], &system[j]);
    }
  }
  // Thread 2 ?
  for(i = NO_OF_PARTICLES/4; i < NO_OF_PARTICLES/2; i++){
    for(j = 0; j < NO_OF_PARTICLES; j++){
      compute_interaction(&system[i], &system[j]);
    }
  }
  // Thread 3 ?
  for(i = NO_OF_PARTICLES/2; i < 3*NO_OF_PARTICLES/4; i++){
    for(j = 0; j < NO_OF_PARTICLES; j++){
      compute_interaction(&system[i], &system[j]);
    }
  }
  // Thread 4 ?
  for(i = 3*NO_OF_PARTICLES/4; i < NO_OF_PARTICLES; i++){
    for(j = 0; j < NO_OF_PARTICLES; j++){
      compute_interaction(&system[i], &system[j]);
    }
  }
}

__vector int get_quadrant_count(particle* system){
  __vector int quad_count = VECINTZERO;
  __vector int quad_mask  = VECINTZERO;
  int i;
  for(i = 0; i < NO_OF_PARTICLES; i++){
    __vector int top2    = (__vector int){1,1,0,0};
    __vector int bottom2 = (__vector int){0,0,1,1};
    __vector int left2   = (__vector int){0,1,0,1};
    __vector int right2  = (__vector int){1,0,1,0};
    __vector int mask1, mask2;
    __vector float vx = vec_splat(system[i].position,0);
    __vector float vy = vec_splat(system[i].position,1);
    mask1 = vec_sel(right2, left2, vec_cmpgt(vx,VECZERO));
    mask2 = vec_sel(top2, bottom2, vec_cmpgt(vy,VECZERO));
    quad_mask  = vec_and(mask1,mask2);
    quad_count = vec_add(quad_count,quad_mask);
  }
  return quad_count;
}

void render(particle* system){
  int i = 0;
  for(; i < NO_OF_PARTICLES; i++){
    float *pos = (float*) &system[i].position;
    float *vel = (float*) &system[i].velocity;
    float *acc = (float*) &system[i].acceleration;
    printf("position : %f %f %f ", pos[0], pos[1], pos[2]);
    printf("velocity : %f %f %f ", vel[0], vel[1], vel[2]);
    printf("acceleration : %f %f %f \n", acc[0], acc[1], acc[2]);
  }
}

int main () {
  /* create particle system --> array of particles */
  particle particle_system[NO_OF_PARTICLES] __attribute__((aligned(64)));

  /* vector that tracks the no. of particles in each quadrant */
  __vector int quad_count;
  int *qc;

  /* place particles in 4 quadrants */
  init_particles(particle_system);

  /* run simulation */
  float simulationTime = 0.0;
  int iterations = COMPUTE_ITERATIONS;
  printf("----------------------------------------------");
  printf("----------------------------------------------\n");
  printf("Running simulation with %d particles, %d iterations, and a time step of %f seconds\n",
         NO_OF_PARTICLES, COMPUTE_ITERATIONS, TIME_STEP);
  printf("----------------------------------------------");
  printf("----------------------------------------------\n");

  while(iterations > 0){
    /* Compute */
    update_particles(particle_system);
    /* Display */
    //render(particle_system);
    /* Update Time */
    simulationTime = simulationTime + TIME_STEP;
    printf("----------------------------------");
    printf("Simulation Time: %f |", simulationTime);
    quad_count = get_quadrant_count(particle_system);
    qc = (int*)&quad_count;
    printf(" q1:%d q2:%d q3:%d q4:%d", qc[0], qc[1], qc[2], qc[3]);
    printf("----------------------------------\n");
    iterations--;
  }
  return 0;
}
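/* The "Thread 1..4 ?" comments in update_particles() suggest splitting the
 * outer i-loop across four threads. A minimal pthreads sketch of that split;
 * range_t and update_range are hypothetical names, not part of the original.
 * It inherits the serial version's caveat: compute_interaction() updates
 * particle i's position in place, so other threads may read a mix of old and
 * new positions. A stricter version would double-buffer positions.
 */
#include <pthread.h>

typedef struct { particle *system; int lo, hi; } range_t;

static void* update_range(void *arg)
{
    range_t *r = (range_t*)arg;
    int i, j;
    for (i = r->lo; i < r->hi; i++)
        for (j = 0; j < NO_OF_PARTICLES; j++)
            compute_interaction(&r->system[i], &r->system[j]);
    return NULL;
}

void update_particles_mt(particle *system)
{
    pthread_t tid[4];
    range_t   rng[4];
    int t;
    for (t = 0; t < 4; t++) {
        rng[t].system = system;
        rng[t].lo     =  t      * NO_OF_PARTICLES / 4;   /* same quarters as the serial loops */
        rng[t].hi     = (t + 1) * NO_OF_PARTICLES / 4;
        pthread_create(&tid[t], NULL, update_range, &rng[t]);
    }
    for (t = 0; t < 4; t++)
        pthread_join(tid[t], NULL);
}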
void pix_background :: processYUVAltivec(imageStruct &image)
{
  register int h,w,i,j,width;
  int pixsize = image.xsize * image.ysize * image.csize;
  h = image.ysize;
  w = image.xsize/8;
  width = image.xsize/8;

  // check to see if the buffer isn't 16-byte aligned (highly unlikely)
  if (image.ysize*image.xsize % 16 != 0){
    error("image not properly aligned for Altivec - try something SD or HD maybe?");
    return;
  }

  union{
    unsigned short s[8];
    vector unsigned short v;
  }shortBuffer;

  if(m_savedImage.xsize!=image.xsize ||
     m_savedImage.ysize!=image.ysize ||
     m_savedImage.format!=image.format) m_reset=1;

  m_savedImage.xsize=image.xsize;
  m_savedImage.ysize=image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();

  if (m_reset){
    memcpy(m_savedImage.data,image.data,pixsize);
    m_reset = 0;
  }

  register vector unsigned short UVres1, Yres1, UVres2, Yres2; //interleave;
  register vector unsigned short hiImage, loImage;
  register vector unsigned short Yrange, UVrange, Yblank, UVblank, blank;
  register vector bool short Ymasklo, Ymaskhi, UVmaskhi;
  register vector unsigned short Yhi, Ylo, UVhi, UVlo;
  register vector unsigned char one = vec_splat_u8(1);
  register vector unsigned short sone = vec_splat_u16(1);
  register vector unsigned int Uhi, Ulo, Vhi, Vlo, Ures, Vres;
  register vector bool int Umasklo, Umaskhi, Vmaskhi, Vmasklo;

  vector unsigned char *inData    = (vector unsigned char*) image.data;
  vector unsigned char *rightData = (vector unsigned char*) m_savedImage.data;

  shortBuffer.s[0] = m_Yrange;
  Yrange = shortBuffer.v;
  Yrange = vec_splat(Yrange,0);

  shortBuffer.s[0] = 128; shortBuffer.s[1] = 0;
  shortBuffer.s[2] = 128; shortBuffer.s[3] = 0;
  shortBuffer.s[4] = 128; shortBuffer.s[5] = 0;
  shortBuffer.s[6] = 128; shortBuffer.s[7] = 0;
  blank = shortBuffer.v;

  shortBuffer.s[0] = 0;
  Yblank = shortBuffer.v;
  Yblank = vec_splat(Yblank,0);

  shortBuffer.s[0] = 128;
  UVblank = shortBuffer.v;
  UVblank = vec_splat(UVblank,0);

  shortBuffer.s[0] = m_Urange; shortBuffer.s[1] = m_Vrange;
  shortBuffer.s[2] = m_Urange; shortBuffer.s[3] = m_Vrange;
  shortBuffer.s[4] = m_Urange; shortBuffer.s[5] = m_Vrange;
  shortBuffer.s[6] = m_Urange; shortBuffer.s[7] = m_Vrange;
  UVrange = shortBuffer.v;

  // setup the cache prefetch -- A MUST!!!
  UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
#ifndef PPC970
  vec_dst( inData, prefetchSize, 0 );
  vec_dst( rightData, prefetchSize, 1 );
  vec_dst( inData+32, prefetchSize, 2 );
  vec_dst( rightData+32, prefetchSize, 3 );
#endif //PPC970

  for ( i=0; i<h; i++){
    for (j=0; j<w; j++)
      {
#ifndef PPC970
        // this function is probably memory bound on most G4's -- what else is new?
        vec_dst( inData, prefetchSize, 0 );
        vec_dst( rightData, prefetchSize, 1 );
        vec_dst( inData+32, prefetchSize, 2 );
        vec_dst( rightData+32, prefetchSize, 3 );
#endif
        // separate the U and V from Y
        UVres1 = (vector unsigned short)vec_mule(one,inData[0]);
        UVres2 = (vector unsigned short)vec_mule(one,rightData[0]);

        // vec_mulo Y * 1 to short vector: Y Y Y Y shorts
        Yres1 = (vector unsigned short)vec_mulo(one,inData[0]);
        Yres2 = (vector unsigned short)vec_mulo(one,rightData[0]);

        Yhi = vec_adds(Yres2,Yrange);
        Ylo = vec_subs(Yres2,Yrange);

        // go to ints for comparison
        UVhi = vec_adds(UVres2,UVrange);
        UVlo = vec_subs(UVres2,UVrange);

        Uhi = vec_mule(sone,UVhi);
        Ulo = vec_mule(sone,UVlo);
        Vhi = vec_mulo(sone,UVhi);
        Vlo = vec_mulo(sone,UVlo);

        Ures = vec_mule(sone,UVres1);
        Vres = vec_mulo(sone,UVres1);

        Umasklo = vec_cmpgt(Ures,Ulo);
        Umaskhi = vec_cmplt(Ures,Uhi);
        Vmasklo = vec_cmpgt(Vres,Vlo);
        Vmaskhi = vec_cmplt(Vres,Vhi);

        Umaskhi = vec_and(Umaskhi,Umasklo);
        Vmaskhi = vec_and(Vmaskhi,Vmasklo);

        Umasklo = vec_and(Umaskhi,Vmaskhi);
        Vmasklo = vec_and(Umaskhi,Vmaskhi);

        hiImage = (vector unsigned short)vec_mergeh(Umasklo,Vmasklo);
        loImage = (vector unsigned short)vec_mergel(Umasklo,Vmasklo);

        // pack it back down to bool short
        UVmaskhi = (vector bool short)vec_packsu(hiImage,loImage);

        Ymasklo = vec_cmpgt(Yres1,Ylo);
        Ymaskhi = vec_cmplt(Yres1,Yhi);

        Ymaskhi = vec_and(Ymaskhi,Ymasklo);

        Ymaskhi  = vec_and(Ymaskhi,UVmaskhi);
        UVmaskhi = vec_and(Ymaskhi,UVmaskhi);

        // bitwise comparison and move using the result of the comparison as a mask
        Yres1 = vec_sel(Yres1,Yblank,Ymaskhi);

        //UVres1 = vec_sel(UVres1,UVres2,UVmaskhi);
        UVres1 = vec_sel(UVres1,UVblank,UVmaskhi);

        // merge the Y and UV back together
        hiImage = vec_mergeh(UVres1,Yres1);
        loImage = vec_mergel(UVres1,Yres1);

        // pack it back down to unsigned char to store
        inData[0] = vec_packsu(hiImage,loImage);

        inData++;
        rightData++;
      }
#ifndef PPC970
    vec_dss(0);
    vec_dss(1);
    vec_dss(2);
    vec_dss(3);
#endif
  }
}
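/* A scalar sketch of the per-sample decision the vector code above makes
 * eight pixels at a time (function and parameter names are ours, not Gem's):
 * a pixel counts as background when its Y, U and V each lie within +/-range
 * of the saved background frame's values. Matching pixels are then blanked
 * to Y=0, U=V=128 (black in YUV), which is what the vec_sel() against
 * Yblank/UVblank implements.
 */
static inline int is_background(int y,  int u,  int v,        /* current pixel    */
                                int by, int bu, int bv,       /* saved background */
                                int yrange, int urange, int vrange)
{
    return (y > by - yrange) && (y < by + yrange) &&
           (u > bu - urange) && (u < bu + urange) &&
           (v > bv - vrange) && (v < bv + vrange);
}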
static int
forward_engine(int do_full, const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *opt_sc)
{
  vector float mpv, dpv, ipv;   /* previous row values                                       */
  vector float sv;              /* temp storage of 1 curr row value in progress              */
  vector float dcv;             /* delayed storage of D(i,q+1)                               */
  vector float xEv;             /* E state: keeps max for Mk->E as we go                     */
  vector float xBv;             /* B state: splatted vector of B[i-1] for B->Mk calculations */
  vector float zerov;           /* splatted 0.0's in a vector                                */
  float    xN, xE, xB, xC, xJ;  /* special states' scores                                    */
  int i;                        /* counter over sequence positions 1..L                      */
  int q;                        /* counter over quads 0..nq-1                                */
  int j;                        /* counter over DD iterations (4 is full serialization)      */
  int Q = p7O_NQF(om->M);       /* segment length: # of vectors                              */
  vector float *dpc = ox->dpf[0]; /* current row, for use in {MDI}MO(dpp,q) access macro     */
  vector float *dpp;            /* previous row, for use in {MDI}MO(dpp,q) access macro      */
  vector float *rp;             /* will point at om->rfv[x] for residue x[i]                 */
  vector float *tp;             /* will point into (and step thru) om->tfv                   */

  /* Initialization. */
  ox->M = om->M;
  ox->L = L;
  ox->has_own_scales = TRUE;    /* all forward matrices control their own scalefactors */
  zerov = (vector float) vec_splat_u32(0);

  for (q = 0; q < Q; q++)
    MMO(dpc,q) = IMO(dpc,q) = DMO(dpc,q) = zerov;
  xE = ox->xmx[p7X_E] = 0.;
  xN = ox->xmx[p7X_N] = 1.;
  xJ = ox->xmx[p7X_J] = 0.;
  xB = ox->xmx[p7X_B] = om->xf[p7O_N][p7O_MOVE];
  xC = ox->xmx[p7X_C] = 0.;

  ox->xmx[p7X_SCALE] = 1.0;
  ox->totscale       = 0.0;

#if p7_DEBUGGING
  if (ox->debugging) p7_omx_DumpFBRow(ox, TRUE, 0, 9, 5, xE, xN, xJ, xB, xC); /* logify=TRUE, <rowi>=0, width=9, precision=5 */
#endif

  for (i = 1; i <= L; i++)
    {
      dpp = dpc;
      dpc = ox->dpf[do_full * i];  /* avoid conditional, use do_full as kronecker delta */
      rp  = om->rfv[dsq[i]];
      tp  = om->tfv;
      dcv = (vector float) vec_splat_u32(0);
      xEv = (vector float) vec_splat_u32(0);
      xBv = esl_vmx_set_float(xB);

      /* Right shifts by 4 bytes. 4,8,12,x becomes x,4,8,12. Shift zeros on. */
      mpv = vec_sld(zerov, MMO(dpp,Q-1), 12);
      dpv = vec_sld(zerov, DMO(dpp,Q-1), 12);
      ipv = vec_sld(zerov, IMO(dpp,Q-1), 12);

      for (q = 0; q < Q; q++)
        {
          /* Calculate new MMO(i,q); don't store it yet, hold it in sv. */
          sv  = (vector float) vec_splat_u32(0);
          sv  = vec_madd(xBv, *tp, sv);    tp++;
          sv  = vec_madd(mpv, *tp, sv);    tp++;
          sv  = vec_madd(ipv, *tp, sv);    tp++;
          sv  = vec_madd(dpv, *tp, sv);    tp++;
          sv  = vec_madd(sv,  *rp, zerov); rp++;
          xEv = vec_add(xEv, sv);

          /* Load {MDI}(i-1,q) into mpv, dpv, ipv;
           * {MDI}MX(q) is then the current, not the prev row
           */
          mpv = MMO(dpp,q);
          dpv = DMO(dpp,q);
          ipv = IMO(dpp,q);

          /* Do the delayed stores of {MD}(i,q) now that memory is usable */
          MMO(dpc,q) = sv;
          DMO(dpc,q) = dcv;

          /* Calculate the next D(i,q+1) partially: M->D only;
           * delay storage, holding it in dcv
           */
          dcv = vec_madd(sv, *tp, zerov); tp++;

          /* Calculate and store I(i,q); assumes odds ratio for emission is 1.0 */
          sv         = vec_madd(mpv, *tp, zerov); tp++;
          IMO(dpc,q) = vec_madd(ipv, *tp, sv);    tp++;
        }

      /* Now the DD paths. We would rather not serialize them, but
       * in an accurate Forward calculation we have few options.
       * dcv has carried through from the end of the q loop above; in the
       * first pass we add the M->D and D->D paths into DMX.
       * We're almost certainly obligated to do at least one complete
       * DD path to be sure:
       */
      dcv        = vec_sld(zerov, dcv, 12);
      DMO(dpc,0) = (vector float) vec_splat_u32(0);
      tp         = om->tfv + 7*Q;     /* set tp to start of the DD's */
      for (q = 0; q < Q; q++)
        {
          DMO(dpc,q) = vec_add(dcv, DMO(dpc,q));
          dcv        = vec_madd(DMO(dpc,q), *tp, zerov); tp++; /* extend DMO(q), so we include M->D and D->D paths */
        }

      /* Now, on small models, it seems best (empirically) to just go
       * ahead and serialize. On large models, we can do a bit better,
       * by testing for when dcv (DD path) accrued to DMO(q) is below
       * machine epsilon for all q, in which case we know DMO(q) are all
       * at their final values. The tradeoff point is (empirically) somewhere
       * around M=100, at least on my desktop. We don't worry about the
       * conditional here; it's outside any inner loops.
       */
      if (om->M < 100)
        { /* Fully serialized version */
          for (j = 1; j < 4; j++)
            {
              dcv = vec_sld(zerov, dcv, 12);
              tp  = om->tfv + 7*Q;    /* set tp to start of the DD's */
              for (q = 0; q < Q; q++)
                { /* note, extend dcv, not DMO(q); only adding DD paths now */
                  DMO(dpc,q) = vec_add(dcv, DMO(dpc,q));
                  dcv        = vec_madd(dcv, *tp, zerov); tp++;
                }
            }
        }
      else
        { /* Slightly parallelized version, but which incurs some overhead */
          for (j = 1; j < 4; j++)
            {
              vector bool int cv;     /* keeps track of whether any DD's change DMO(q) */

              dcv = vec_sld(zerov, dcv, 12);
              tp  = om->tfv + 7*Q;    /* set tp to start of the DD's */
              cv  = (vector bool int) vec_splat_u32(0);
              for (q = 0; q < Q; q++)
                { /* using cmpgt below tests if DD changed any DMO(q) *without* a conditional branch */
                  sv         = vec_add(dcv, DMO(dpc,q));
                  cv         = vec_or(cv, vec_cmpgt(sv, DMO(dpc,q)));
                  DMO(dpc,q) = sv;                               /* store new DMO(q) */
                  dcv        = vec_madd(dcv, *tp, zerov); tp++;  /* note, extend dcv, not DMO(q) */
                }
              /* DD's didn't change any DMO(q)? Then done, break out. */
              if (vec_all_eq(cv, (vector bool int)zerov)) break;
            }
        }

      /* Add D's to xEv */
      for (q = 0; q < Q; q++) xEv = vec_add(DMO(dpc,q), xEv);

      /* Finally the "special" states, which start from Mk->E (->C, ->J->B).
       * The following incantation is a horizontal sum of xEv's elements.
       * These must follow the DD calculations, because D's contribute to E
       * in Forward (as opposed to Viterbi).
       */
      xE = esl_vmx_hsum_float(xEv);

      xN =  xN * om->xf[p7O_N][p7O_LOOP];
      xC = (xC * om->xf[p7O_C][p7O_LOOP]) + (xE * om->xf[p7O_E][p7O_MOVE]);
      xJ = (xJ * om->xf[p7O_J][p7O_LOOP]) + (xE * om->xf[p7O_E][p7O_LOOP]);
      xB = (xJ * om->xf[p7O_J][p7O_MOVE]) + (xN * om->xf[p7O_N][p7O_MOVE]);
      /* and now xB will carry over into next i, and xC carries over after i=L */

      /* Sparse rescaling. xE above threshold? trigger a rescaling event. */
      if (xE > 1.0e4)   /* that's a little less than e^10, ~10% of our dynamic range */
        {
          xN = xN / xE;
          xC = xC / xE;
          xJ = xJ / xE;
          xB = xB / xE;
          xEv = esl_vmx_set_float(1.0 / xE);
          for (q = 0; q < Q; q++)
            {
              MMO(dpc,q) = vec_madd(MMO(dpc,q), xEv, zerov);
              DMO(dpc,q) = vec_madd(DMO(dpc,q), xEv, zerov);
              IMO(dpc,q) = vec_madd(IMO(dpc,q), xEv, zerov);
            }
          ox->xmx[i*p7X_NXCELLS+p7X_SCALE] = xE;
          ox->totscale += log(xE);
          xE = 1.0;
        }
      else ox->xmx[i*p7X_NXCELLS+p7X_SCALE] = 1.0;

      /* Storage of the specials. We could've stored these already,
       * but using the xE, etc. variables makes it easy to convert this
       * code to O(M) memory versions just by deleting storage steps.
       */
      ox->xmx[i*p7X_NXCELLS+p7X_E] = xE;
      ox->xmx[i*p7X_NXCELLS+p7X_N] = xN;
      ox->xmx[i*p7X_NXCELLS+p7X_J] = xJ;
      ox->xmx[i*p7X_NXCELLS+p7X_B] = xB;
      ox->xmx[i*p7X_NXCELLS+p7X_C] = xC;

#if p7_DEBUGGING
      if (ox->debugging) p7_omx_DumpFBRow(ox, TRUE, i, 9, 5, xE, xN, xJ, xB, xC); /* logify=TRUE, <rowi>=i, width=9, precision=5 */
#endif
    } /* end loop over sequence residues 1..L */

  /* finally C->T, and flip total score back to log space (nats) */
  /* On overflow, xC is inf or nan (nan arises because inf*0 = nan). */
  /* On an underflow (which shouldn't happen), we counterintuitively return infinity:
   * the effect of this is to force the caller to rescore us with full range.
   */
  if      (isnan(xC))           ESL_EXCEPTION(eslERANGE, "forward score is NaN");
  else if (L > 0 && xC == 0.0)  ESL_EXCEPTION(eslERANGE, "forward score underflow (is 0.0)"); /* [J5/118] */
  else if (isinf(xC) == 1)      ESL_EXCEPTION(eslERANGE, "forward score overflow (is infinity)");

  if (opt_sc != NULL) *opt_sc = ox->totscale + log(xC * om->xf[p7O_C][p7O_MOVE]);
  return eslOK;
}
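/* The "incantation" esl_vmx_hsum_float() referenced above is a horizontal
 * sum: it reduces the four float lanes of xEv to a single scalar. A typical
 * AltiVec formulation (a sketch; Easel's actual helper may differ in detail)
 * rotates the vector with vec_sld and accumulates, so every lane ends up
 * holding the total:
 */
static inline float
hsum_float_sketch(vector float v)
{
    union { vector float v; float f[4]; } u;
    v   = vec_add(v, vec_sld(v, v, 8));  /* lanes become a+c, b+d, a+c, b+d */
    v   = vec_add(v, vec_sld(v, v, 4));  /* every lane is now a+b+c+d       */
    u.v = v;
    return u.f[0];
}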