/*
 * Computes Y *= X in Taylor arithmetic, in place on y.
 *
 * Memory layout, as implied by the pointer arithmetic in this function:
 *   - one coefficient block occupies ldx*N (resp. ldy*N) doubles,
 *   - the base point (order 0) is the first block of x / y and is shared by
 *     all P directions,
 *   - direction p stores its D-1 higher-order blocks contiguously, so the
 *     order-d block (d >= 1) of direction p lives at
 *         base + p*(D-1)*dstride + d*dstride.
 *
 * For every direction p and every order d = D-1, ..., 1 this evaluates
 *     y_d = x_0*y_d + sum_{k=1}^{d-1} x_k*y_{d-k} + x_d*y_0
 * and finally y_0 = x_0*y_0. Orders are processed from high to low so that
 * each y_{d-k} read on the right-hand side still holds its input value.
 *
 * imul() and amul() are defined elsewhere. Judging from how they are used
 * here, imul(M, N, y, ldy, x, ldx) is assumed to perform y *= x and
 * amul(M, N, x, ldx, y, ldy, z, ldz) is assumed to perform z += x*y on
 * M x N blocks -- TODO confirm against their definitions.
 *
 * Always returns 0.
 */
int utpm_imul(int P, int D, int M, int N, double *y, int ldy, double *x, int ldx){
    int k, d, p;
    double *xd, *yd, *zd;
    double *xp, *yp;
    int pstridex, pstridey;
    int dstridex, dstridey;

    /* distance between two consecutive coefficient blocks */
    dstridex = ldx*N;
    dstridey = ldy*N;
    /* distance between two consecutive directions */
    pstridex = (D-1)*dstridex;
    pstridey = (D-1)*dstridey;

    /* d > 0: higher order coefficients */
    for(p = 0; p < P; ++p){
        xp = x + p*pstridex;
        yp = y + p*pstridey;

        for(d = D-1; 0 < d; --d){
            /* y_d *= x_0 (the base point x_0 is shared by all directions) */
            zd = yp + d*dstridey;
            imul(M, N, zd, ldy, x, ldx);

            /* y_d += sum_{k=1}^{d-1} x_k y_{d-k}
             * Both operands are addressed in whole coefficient blocks; the
             * previous element-wise pointer stepping was only correct when
             * a block consisted of a single double. */
            for(k = 1; k < d; ++k){
                xd = xp + k*dstridex;
                yd = yp + (d-k)*dstridey;
                amul(M, N, xd, ldx, yd, ldy, zd, ldy);
            }

            /* y_d += x_d y_0 (y_0 is still the untouched input base point) */
            xd = xp + d*dstridex;
            amul(M, N, xd, ldx, y, ldy, zd, ldy);
        }
    }

    /* d = 0: base point, updated last because every y_d above needs the
     * original y_0 */
    imul(M, N, y, ldy, x, ldx);

    return 0;
}
Code() { Xbyak::Label label; cmpss(xmm0, ptr[rip + label], 0); test(dword[rip + label], 33); bt(dword[rip + label ], 3); vblendpd(xmm0, dword[rip + label], 3); vpalignr(xmm0, qword[rip + label], 4); vextractf128(dword[rip + label], ymm3, 12); vperm2i128(ymm0, ymm1, qword[rip + label], 13); vcvtps2ph(ptr[rip + label], xmm2, 44); mov(dword[rip + label], 0x1234); shl(dword[rip + label], 3); shr(dword[rip + label], 1); shld(qword[rip + label], rax, 3); imul(rax, qword[rip + label], 21); rorx(rax, qword[rip + label], 21); test(dword[rip + label], 5); pextrq(ptr[rip + label], xmm0, 3); pinsrq(xmm2, ptr[rip + label], 5); pextrw(ptr[rip + label], xmm1, 4); adc(dword[rip + label], 0x12345); bt(byte[rip + label], 0x34); btc(word[rip + label], 0x34); btr(dword[rip + label], 0x34); rcl(dword[rip + label], 4); shld(qword[rip + label], rax, 4); palignr(mm0, ptr[rip + label], 4); aeskeygenassist(xmm3, ptr[rip + label], 4); vpcmpestrm(xmm2, ptr[rip + label], 7); ret(); L(label); dq(0x123456789abcdef0ull); };
/*
 * Reads n and k from standard input, evaluates a two-term recurrence on the
 * big-number buffers b1/b2 (declared elsewhere in this file) and prints b1.
 *
 * Recurrence as written in the code:
 *     b2 = k - 1
 *     b1 = k * (k - 1)            (replaced by k - 1 when n == 1)
 *     for i = 3 .. n:  (b1, b2) = ((b1 + b2) * (k - 1), b1)
 *
 * Element 0 of a buffer is used as the number of entries to print; entries
 * b1[b1[0]] down to b1[1] are printed in that order.
 * NOTE(review): add(a, b) is presumably a += b and imul(a, v) presumably
 * a *= v on these buffers -- confirm against their definitions.
 *
 * Returns 0 on success and 1 when the two integers cannot be read.
 */
int main() {
    int step;
    int pos;

    /* Refuse to compute on values that were never read. */
    if (scanf("%d %d", &n, &k) != 2)
        return 1;

    b2[0] = 1;
    b2[1] = k - 1;

    b1[0] = 1;
    b1[1] = 1;
    imul(b1, k * (k - 1));

    /* n == 1 has its own starting value and skips the loop below. */
    if (n == 1) {
        b1[0] = 1;
        b1[1] = k - 1;
    }

    for (step = 3; step <= n; step++) {
        memcpy(b, b1, sizeof(b1));   /* keep the previous b1 */
        add(b1, b2);
        imul(b1, k - 1);
        memcpy(b2, b, sizeof(b));    /* previous b1 becomes the new b2 */
    }

    /* Print from the highest stored index down to index 1. */
    for (pos = b1[0]; pos; pos--)
        printf("%d", b1[pos]);

    return 0;
}
/*
 * Micro-benchmark driver: runs each timing routine once, in a fixed order,
 * and prints the elapsed user time for it in milliseconds.
 *
 * Each benchmark receives an operand, an iteration count n and the address
 * of a message pointer that it fills in with a description of the operation.
 * The program exits with status 0 after the last benchmark, or returns 1 if
 * the scratch buffer for the memory benchmarks cannot be allocated.
 *
 * argc and argv are unused.
 */
int main(int argc, const char *argv[])
{
    int i;
    int *mem = (int *)malloc(1100000);

    capture_stdio();

    /* mfast/mslow receive this buffer; do not hand them a null pointer. */
    if (mem == NULL) {
        fprintf(stderr, "Unable to allocate benchmark buffer\n");
        return 1;
    }

    for (i = 0;; ++i) {
        int32_t t0[2], t1[2];
        char *msg;
        int n;

        gp_get_usertime(t0);
        switch (i) {
            case 0:
                iadd(0, n = 10000000, &msg);
                break;
            case 1:
                imul(1, n = 1000000, &msg);
                break;
            case 2:
                idiv(1, n = 1000000, &msg);
                break;
            case 3:
                fadd(3.14, n = 10000000, &msg);
                break;
            case 4:
                fmul(1.0000001, n = 10000000, &msg);
                break;
            case 5:
                fdiv(1.0000001, n = 1000000, &msg);
                break;
            case 6:
                fconv(12345, n = 10000000, &msg);
                break;
            case 7:
                mfast(mem, n = 10000000, &msg);
                break;
            case 8:
                mslow(mem, n = 1000000, &msg);
                break;
            default:
                /* all benchmarks done */
                free(mem);
                exit(0);
        }
        gp_get_usertime(t1);

        /* The formula treats element 0 as seconds and element 1 as
         * nanoseconds -- TODO confirm against gp_get_usertime. */
        fprintf(stdout, "Time for %9d %s = %g ms\n", n, msg,
                (t1[0] - t0[0]) * 1000.0 + (t1[1] - t0[1]) / 1000000.0);
        fflush(stdout);
    }
}
ARFloat Tracker::arModifyMatrix(ARFloat rot[3][3], ARFloat trans[3], ARFloat cpara[3][4], ARFloat vertex[][3], ARFloat pos2d[][2], int num) { ARFloat a, b, c; ARFloat a2, b2, c2; ARFloat ma, mb, mc; int t1, t2, t3; int s1, s2, s3; int i, j, k; ARFloat minerr; I32 _hx, _hy, _h; I32 _err, _minerr; U32 _ures; I32 _a1,_b1,_c1; I32 _a2,_b2,_c2; I32 _ma, _mb, _mc; // PROFILE_BEGINSEC(profiler, MODIFYMATRIX) FIXED_VEC3D *_vertex = (FIXED_VEC3D*)malloc(num*sizeof(FIXED_VEC3D)), *_pos2d = (FIXED_VEC3D*)malloc(num*sizeof(FIXED_VEC3D)), _combo[3], _vec1, _vec2, _trans; I32 _combo3[3]; FIXED_VEC3D _cpara[3]; I32 _cpara3[3]; _vec1.z = 0; _trans.x = FIXED_Float_To_Fixed_n(trans[0], 12); _trans.y = FIXED_Float_To_Fixed_n(trans[1], 12); _trans.z = FIXED_Float_To_Fixed_n(trans[2], 12); for(j=0; j<3; j++) { _cpara[j].x = FIXED_Float_To_Fixed_n(cpara[j][0], 12); _cpara[j].y = FIXED_Float_To_Fixed_n(cpara[j][1], 12); _cpara[j].z = FIXED_Float_To_Fixed_n(cpara[j][2], 12); _cpara3[j] = FIXED_Float_To_Fixed_n(cpara[j][3], 12); } for(j=0; j<num; j++) { _vertex[j].x = FIXED_Float_To_Fixed_n(vertex[j][0], BITS); _vertex[j].y = FIXED_Float_To_Fixed_n(vertex[j][1], BITS); _vertex[j].z = FIXED_Float_To_Fixed_n(vertex[j][2], BITS); _pos2d[j].x = FIXED_Float_To_Fixed_n(pos2d[j][0], BITS); _pos2d[j].y = FIXED_Float_To_Fixed_n(pos2d[j][1], BITS); _pos2d[j].z = 0; } arGetAngle( rot, &a, &b, &c ); // _cpara and _trans are constant, which allows us to calcualte _combo3 beforehand... 
// for(j=0; j<3; j++) { FIXED_VEC3_DOT(_cpara+j, &_trans, _combo3+j, 12); _combo3[j] += _cpara3[j]; _combo3[j] >>= 4; } // PROFILE_BEGINSEC(profiler, MODIFYMATRIX_LOOP) a2 = a; b2 = b; c2 = c; //factor = (ARFloat)(10.0*M_PI/180.0); //I32 fix_a2, fix_b2, fix_c2; I32 fix_factor = FIXED_Float_To_Fixed_n((10.0*M_PI/180.0), 12); I32 fix_a[3], fix_b[3], fix_c[3]; _a2 = FIXED_Float_To_Fixed_n(a2, 12); _b2 = FIXED_Float_To_Fixed_n(b2, 12); _c2 = FIXED_Float_To_Fixed_n(c2, 12); for( j = 0; j < 10; j++ ) { _minerr = 0x40000000; // value fix_a[0] = _a2 - fix_factor; fix_a[1] = _a2; fix_a[2] = _a2 + fix_factor; fix_b[0] = _b2 - fix_factor; fix_b[1] = _b2; fix_b[2] = _b2 + fix_factor; fix_c[0] = _c2 - fix_factor; fix_c[1] = _c2; fix_c[2] = _c2 + fix_factor; for(t1=-1;t1<=1;t1++) { for(t2=-1;t2<=1;t2++) { for(t3=-1;t3<=1;t3++) { _a1 = fix_a[t1+1]; _b1 = fix_b[t2+1]; _c1 = fix_c[t3+1]; //PROFILE_BEGINSEC(profiler, GETNEWMATRIX) arGetNewMatrix12(_a1, _b1, _c1, _trans, NULL, _cpara, _cpara3, _combo, _combo3, profiler); //PROFILE_ENDSEC(profiler, GETNEWMATRIX) for(k=0; k<3; k++) { _combo[k].x >>= 4; _combo[k].y >>= 4; _combo[k].z >>= 4; //_combo3[k] >>= 4; } _err = 0; for( i = 0; i < num; i++ ) { FIXED_VEC3_DOT(_combo+0, _vertex+i, &_hx, BITS); _hx += _combo3[0]; FIXED_VEC3_DOT(_combo+1, _vertex+i, &_hy, BITS); _hy += _combo3[1]; FIXED_VEC3_DOT(_combo+2, _vertex+i, &_h, BITS); _h += _combo3[2]; // old method of doing two divides // there is no reason at all to use this anymore // //FIXED_DIV2(_hx, _h, _vec1.x, BITS); //FIXED_DIV2(_hy, _h, _vec1.y, BITS); #ifndef _USE_DIV_TABLE_ // new method of doing one correct division // and two multiplications. 
as accurate as two divisions but faster // I64 _rev = ((I64)1<<(BITS+32))/_h; _vec1.x = (I32)((_hx*_rev)>>32); _vec1.y = (I32)((_hy*_rev)>>32); #endif #ifdef _USE_DIV_TABLE_ // latest method of using a lookup table for division // extremely fast (+30%) but less accurate if((_h>>8)>=DIV_TABLE_MIN && (_h>>8)<DIV_TABLE_MAX) { I32 _revT4 = DIV_TABLE_FIXEDx4_FROM_FIXED(_h<<8); I32 __x = imul(_hx, _revT4, 18); I32 __y = imul(_hy, _revT4, 18); #ifdef DEBUG_DIV_RANGE int dx = __x-_vec1.x>0 ? __x-_vec1.x : _vec1.x-__x; int dy = __y-_vec1.y>0 ? __y-_vec1.y : _vec1.y-__y; if(dx>dbgInfo.dxMax) dbgInfo.dxMax = dx; if(dy>dbgInfo.dyMax) dbgInfo.dyMax = dy; if(dx/256.0f > 1.0f) dx = dx; if(dy/256.0f > 1.0f) dy = dy; #endif _vec1.x = __x; _vec1.y = __y; } else { I64 _rev = ((I64)1<<(BITS+32))/_h; _vec1.x = (I32)((_hx*_rev)>>32); _vec1.y = (I32)((_hy*_rev)>>32); } #endif //_USE_DIV_TABLE_ #ifdef DEBUG_DIV_RANGE if(_h<dbgInfo.hMin) dbgInfo.hMin = _h; if(_h>dbgInfo.hMax) dbgInfo.hMax = _h; if(_hx<dbgInfo.hxMin) dbgInfo.hxMin = _hx; if(_hx>dbgInfo.hxMax) dbgInfo.hxMax = _hx; if(_hy<dbgInfo.hyMin) dbgInfo.hyMin = _hy; if(_hy>dbgInfo.hyMax) dbgInfo.hyMax = _hy; if(_hy>dbgInfo.hyMax) dbgInfo.hyMax = _hy; #endif FIXED_VEC2_SUB(_pos2d+i, &_vec1, &_vec2); FIXED_VEC2_LENGTH_SQ(&_vec2, &_ures, BITS); _err += _ures; } if( _err < _minerr ) { _minerr = _err; _ma = _a1; _mb = _b1; _mc = _c1; s1 = t1; s2 = t2; s3 = t3; } } } } if(s1 == 0 && s2 == 0 && s3 == 0) fix_factor >>= 1; _a2 = _ma; _b2 = _mb; _c2 = _mc; }