Beispiel #1
0
/*
 * Computes Y *= X in Taylor arithmetic, overwriting y in place.
 *
 * Storage layout, as addressed by this function:
 *   - coefficient 0 (the base point) lives at x / y and is shared by all p;
 *   - coefficient d >= 1 of group p lives at
 *         x + p*(D-1)*dstridex + d*dstridex   (same for y with dstridey),
 *     where dstridex = ldx*N and dstridey = ldy*N.
 *
 * For every p and every d >= 1 the update is
 *     y_d <- x_0*y_d + sum_{k=1}^{d-1} x_k*y_{d-k} + x_d*y_0
 * and finally y_0 <- x_0*y_0.
 *
 * P        number of coefficient groups walked by the outer loop
 * D        number of coefficients per group, counting the shared base point
 * M, N     forwarded to imul/amul; N also scales the per-coefficient stride
 *          (presumably an M x N matrix with leading dimension ld -- confirm)
 * y, ldy   in/out operand and its leading dimension
 * x, ldx   input operand and its leading dimension
 *
 * Relies on imul(M, N, a, lda, b, ldb) and amul(M, N, a, lda, b, ldb, c, ldc),
 * which are defined outside this block; from their use here they are assumed
 * to compute a *= b and c += a*b respectively -- confirm against their
 * definitions.
 *
 * Always returns 0.
 */
int utpm_imul(int P, int D, int M, int N, double *y, int ldy, double *x, int ldx){
    int k,d,p;
    double *xd, *yd, *zd;
    double *xp, *yp;

    int pstridex, pstridey;
    int dstridex, dstridey;

    /* distance between two consecutive coefficients */
    dstridex = ldx*N;
    dstridey = ldy*N;

    /* distance between two consecutive groups (D-1 non-base coefficients) */
    pstridex = (D-1)*dstridex;
    pstridey = (D-1)*dstridey;

    /* d > 0: higher order coefficients */
    for(p = 0; p < P; ++p){
        xp = x + p*pstridex;
        yp = y + p*pstridey;

        /* Go from the highest order down: y_d only needs lower-order y
           coefficients, which are still untouched at that point. */
        for(d = D-1; 0 < d; --d){

            /* compute y_d *= x_0 */
            xd = x;
            yd = yp + d*dstridey;
            zd = yd;
            imul(M, N, yd, ldy, xd, ldx);

            /* compute y_d += sum_{k=1}^{d-1} x_k y_{d-k} */
            xd = xp + dstridex;
            yd = yp + (d-1)*dstridey;

            for(k = 1; k < d; ++k){
                amul(M, N, xd, ldx, yd, ldy, zd, ldy);
                /* advance to x_{k+1} and y_{d-k-1}: one whole coefficient
                   forward in x and one whole coefficient backward in y */
                xd += dstridex;
                yd -= dstridey;
            }

            /* compute y_d += x_d y_0 */
            yd = y;
            xd = xp + d*dstridex;
            amul(M, N, xd, ldx, yd, ldy, zd, ldy);
        }
    }

    /* d = 0: base point y_0 *= x_0, done last because every y_d above
       still needed the original y_0 */
    yd = y;
    xd = x;
    imul(M, N, yd, ldy, xd, ldx);

    return 0;
}
Beispiel #2
0
	// Emits a fixed sequence of instructions in which every call takes a
	// memory operand addressed as `rip + label` together with a trailing
	// integer constant, followed by ret() and an 8-byte data constant.
	// NOTE(review): presumably this exists to check how RIP-relative
	// displacements are encoded when an immediate follows the memory
	// operand -- confirm against the surrounding test harness.
	Code()
	{
		// Declared here and only placed after ret(), so every use below is a
		// forward reference to the same label.
		Xbyak::Label label;
		cmpss(xmm0, ptr[rip + label], 0);
		test(dword[rip + label], 33);
		bt(dword[rip + label ], 3);
		vblendpd(xmm0, dword[rip + label], 3);
		vpalignr(xmm0, qword[rip + label], 4);
		vextractf128(dword[rip + label], ymm3, 12);
		vperm2i128(ymm0, ymm1, qword[rip + label], 13);
		vcvtps2ph(ptr[rip + label], xmm2, 44);
		mov(dword[rip + label], 0x1234);
		shl(dword[rip + label], 3);
		shr(dword[rip + label], 1);
		shld(qword[rip + label], rax, 3);
		imul(rax, qword[rip + label], 21);
		rorx(rax, qword[rip + label], 21);
		test(dword[rip + label], 5);
		pextrq(ptr[rip + label], xmm0, 3);
		pinsrq(xmm2, ptr[rip + label], 5);
		pextrw(ptr[rip + label], xmm1, 4);
		adc(dword[rip + label], 0x12345);
		bt(byte[rip + label], 0x34);
		btc(word[rip + label], 0x34);
		btr(dword[rip + label], 0x34);
		rcl(dword[rip + label], 4);
		shld(qword[rip + label], rax, 4);
		palignr(mm0, ptr[rip + label], 4);
		aeskeygenassist(xmm3, ptr[rip + label], 4);
		vpcmpestrm(xmm2, ptr[rip + label], 7);
		ret();
	// Place the label after the code so it refers to the 64-bit constant
	// written by dq() below.
	L(label);
		dq(0x123456789abcdef0ull);
	};
Beispiel #3
0
/*
 * Reads n and k, then evaluates a two-term recurrence on the array-based
 * numbers b1 and b2 and prints b1.
 *
 * Number format, as used here: element [0] holds how many elements are in
 * use, and elements [1]..[count] hold the value; printing walks from the
 * highest used index down to 1.
 *
 * n, k, b, b1, b2, imul() and add() are defined outside this block.
 * NOTE(review): imul(a, m) is assumed to multiply a by m in place and
 * add(a, c) to add c into a -- confirm against their definitions.
 */
int main()
{
	int idx;

	scanf( "%d %d", &n, &k);

	/* b2 starts as the one-element number k - 1. */
	b2[0] = 1;
	b2[1] = k - 1;

	/* b1 starts as the one-element number 1 and is scaled by k * (k - 1). */
	b1[1] = 1;
	b1[0] = 1;
	imul(b1, k * (k - 1));

	/* For n == 1 the result is reset to the one-element number k - 1. */
	if (n == 1)
	{
		b1[0] = 1;
		b1[1] = k - 1;
	}

	/* Steps 3..n: keep the current b1 in b, fold b2 into b1, scale by
	   k - 1, then let the saved value become the new b2. */
	idx = 3;
	while (idx <= n)
	{
		memcpy(b, b1, sizeof(b1));
		add(b1, b2);
		imul(b1, k - 1);
		memcpy(b2, b, sizeof(b));
		idx++;
	}

	/* Print the used elements from the highest index down to 1, with no
	   separator and no trailing newline. */
	for (idx = b1[0]; idx; idx--)
		printf("%d", b1[idx]);
}
Beispiel #4
0
/*
 * Timing driver: runs a fixed list of micro-benchmarks one after another
 * and prints the elapsed user time of each to stdout.
 *
 * Each pass of the loop reads the time with gp_get_usertime(), runs the
 * benchmark selected by i with its iteration count n, reads the time again
 * and prints one line. When i runs past the last benchmark the buffer is
 * freed and the process exits with status 0.
 *
 * argc and argv are not used.
 *
 * NOTE(review): the benchmark helpers are defined outside this block; each
 * is assumed to store a description string through its last argument --
 * confirm against their definitions.
 */
int
main(int argc, const char *argv[])
{
    int i;
    int *mem = malloc(1100000);

    capture_stdio();
    /* mem is handed to mfast/mslow below; stop here rather than pass them
       a null pointer if the allocation failed. */
    if (mem == NULL) {
	fprintf(stderr, "Unable to allocate benchmark buffer\n");
	exit(1);
    }
    for (i = 0;; ++i) {
	int32_t t0[2], t1[2];
	char *msg;
	int n;

	gp_get_usertime(t0);
	switch (i) {
	    case 0:
		iadd(0, n = 10000000, &msg);
		break;
	    case 1:
		imul(1, n = 1000000, &msg);
		break;
	    case 2:
		idiv(1, n = 1000000, &msg);
		break;
	    case 3:
		fadd(3.14, n = 10000000, &msg);
		break;
	    case 4:
		fmul(1.0000001, n = 10000000, &msg);
		break;
	    case 5:
		fdiv(1.0000001, n = 1000000, &msg);
		break;
	    case 6:
		fconv(12345, n = 10000000, &msg);
		break;
	    case 7:
		mfast(mem, n = 10000000, &msg);
		break;
	    case 8:
		mslow(mem, n = 1000000, &msg);
		break;
	    default:
		/* no more benchmarks: this is the only way out of the loop */
		free(mem);
		exit(0);
	}
	gp_get_usertime(t1);
	/* Element 0 is scaled by 1000 and element 1 by 1e-6 to give the
	   milliseconds printed (presumably seconds and nanoseconds --
	   confirm against gp_get_usertime). */
	fprintf(stdout, "Time for %9d %s = %g ms\n", n, msg,
		(t1[0] - t0[0]) * 1000.0 + (t1[1] - t0[1]) / 1000000.0);
	fflush(stdout);
    }
}
Beispiel #5
0
ARFloat
Tracker::arModifyMatrix(ARFloat rot[3][3], ARFloat trans[3], ARFloat cpara[3][4],
        ARFloat vertex[][3], ARFloat pos2d[][2], int num)
{
    ARFloat a, b, c;
    ARFloat a2, b2, c2;
    ARFloat ma, mb, mc;
    int t1, t2, t3;
    int s1, s2, s3;
    int i, j, k;
    ARFloat minerr;

    I32 _hx, _hy, _h;
    I32 _err, _minerr;
    U32 _ures;
    I32 _a1,_b1,_c1;
    I32 _a2,_b2,_c2;
    I32 _ma, _mb, _mc;

    //	PROFILE_BEGINSEC(profiler, MODIFYMATRIX)

    FIXED_VEC3D *_vertex = (FIXED_VEC3D*)malloc(num*sizeof(FIXED_VEC3D)),
    *_pos2d = (FIXED_VEC3D*)malloc(num*sizeof(FIXED_VEC3D)),
    _combo[3], _vec1, _vec2, _trans;
    I32 _combo3[3];

    FIXED_VEC3D _cpara[3];
    I32 _cpara3[3];

    _vec1.z = 0;

    _trans.x = FIXED_Float_To_Fixed_n(trans[0], 12);
    _trans.y = FIXED_Float_To_Fixed_n(trans[1], 12);
    _trans.z = FIXED_Float_To_Fixed_n(trans[2], 12);

    for(j=0; j<3; j++)
    {
        _cpara[j].x = FIXED_Float_To_Fixed_n(cpara[j][0], 12);
        _cpara[j].y = FIXED_Float_To_Fixed_n(cpara[j][1], 12);
        _cpara[j].z = FIXED_Float_To_Fixed_n(cpara[j][2], 12);
        _cpara3[j] = FIXED_Float_To_Fixed_n(cpara[j][3], 12);
    }

    for(j=0; j<num; j++)
    {
        _vertex[j].x = FIXED_Float_To_Fixed_n(vertex[j][0], BITS);
        _vertex[j].y = FIXED_Float_To_Fixed_n(vertex[j][1], BITS);
        _vertex[j].z = FIXED_Float_To_Fixed_n(vertex[j][2], BITS);

        _pos2d[j].x = FIXED_Float_To_Fixed_n(pos2d[j][0], BITS);
        _pos2d[j].y = FIXED_Float_To_Fixed_n(pos2d[j][1], BITS);
        _pos2d[j].z = 0;
    }

    arGetAngle( rot, &a, &b, &c );

    // _cpara and _trans are constant, which allows us to calcualte _combo3 beforehand...
    //
    for(j=0; j<3; j++)
    {
        FIXED_VEC3_DOT(_cpara+j, &_trans, _combo3+j, 12);
        _combo3[j] += _cpara3[j];
        _combo3[j] >>= 4;
    }

    //	PROFILE_BEGINSEC(profiler, MODIFYMATRIX_LOOP)

    a2 = a;
    b2 = b;
    c2 = c;
    //factor = (ARFloat)(10.0*M_PI/180.0);

    //I32 fix_a2, fix_b2, fix_c2;
    I32 fix_factor = FIXED_Float_To_Fixed_n((10.0*M_PI/180.0), 12);
    I32 fix_a[3], fix_b[3], fix_c[3];

    _a2 = FIXED_Float_To_Fixed_n(a2, 12);
    _b2 = FIXED_Float_To_Fixed_n(b2, 12);
    _c2 = FIXED_Float_To_Fixed_n(c2, 12);

    for( j = 0; j < 10; j++ ) {
        _minerr = 0x40000000; // value

        fix_a[0] = _a2 - fix_factor; fix_a[1] = _a2; fix_a[2] = _a2 + fix_factor;
        fix_b[0] = _b2 - fix_factor; fix_b[1] = _b2; fix_b[2] = _b2 + fix_factor;
        fix_c[0] = _c2 - fix_factor; fix_c[1] = _c2; fix_c[2] = _c2 + fix_factor;

        for(t1=-1;t1<=1;t1++) {
            for(t2=-1;t2<=1;t2++) {
                for(t3=-1;t3<=1;t3++) {

                    _a1 = fix_a[t1+1];
                    _b1 = fix_b[t2+1];
                    _c1 = fix_c[t3+1];

                    //PROFILE_BEGINSEC(profiler, GETNEWMATRIX)
                    arGetNewMatrix12(_a1, _b1, _c1, _trans, NULL, _cpara, _cpara3, _combo, _combo3, profiler);
                    //PROFILE_ENDSEC(profiler, GETNEWMATRIX)

                    for(k=0; k<3; k++)
                    {
                        _combo[k].x >>= 4;
                        _combo[k].y >>= 4;
                        _combo[k].z >>= 4;
                        //_combo3[k] >>= 4;
                    }

                    _err = 0;
                    for( i = 0; i < num; i++ ) {
                        FIXED_VEC3_DOT(_combo+0, _vertex+i, &_hx, BITS);
                        _hx += _combo3[0];

                        FIXED_VEC3_DOT(_combo+1, _vertex+i, &_hy, BITS);
                        _hy += _combo3[1];

                        FIXED_VEC3_DOT(_combo+2, _vertex+i, &_h, BITS);
                        _h += _combo3[2];

                        // old method of doing two divides
                        // there is no reason at all to use this anymore
                        //
                        //FIXED_DIV2(_hx, _h, _vec1.x, BITS);
                        //FIXED_DIV2(_hy, _h, _vec1.y, BITS);


#ifndef _USE_DIV_TABLE_
                        // new method of doing one correct division
                        // and two multiplications. as accurate as two divisions but faster
                        //
                        I64 _rev = ((I64)1<<(BITS+32))/_h;
                        _vec1.x = (I32)((_hx*_rev)>>32);
                        _vec1.y = (I32)((_hy*_rev)>>32);
#endif

#ifdef _USE_DIV_TABLE_
                        // latest method of using a lookup table for division
                        // extremely fast (+30%) but less accurate
                        if((_h>>8)>=DIV_TABLE_MIN && (_h>>8)<DIV_TABLE_MAX)
                        {
                            I32 _revT4 = DIV_TABLE_FIXEDx4_FROM_FIXED(_h<<8);
                            I32 __x = imul(_hx, _revT4, 18);
                            I32 __y = imul(_hy, _revT4, 18);

#ifdef DEBUG_DIV_RANGE
                            int dx = __x-_vec1.x>0 ? __x-_vec1.x : _vec1.x-__x;
                            int dy = __y-_vec1.y>0 ? __y-_vec1.y : _vec1.y-__y;
                            if(dx>dbgInfo.dxMax) dbgInfo.dxMax = dx;
                            if(dy>dbgInfo.dyMax) dbgInfo.dyMax = dy;

                            if(dx/256.0f > 1.0f)
                            dx = dx;
                            if(dy/256.0f > 1.0f)
                            dy = dy;
#endif
                            _vec1.x = __x;
                            _vec1.y = __y;
                        }
                        else
                        {
                            I64 _rev = ((I64)1<<(BITS+32))/_h;
                            _vec1.x = (I32)((_hx*_rev)>>32);
                            _vec1.y = (I32)((_hy*_rev)>>32);
                        }
#endif //_USE_DIV_TABLE_
#ifdef DEBUG_DIV_RANGE
                        if(_h<dbgInfo.hMin) dbgInfo.hMin = _h;
                        if(_h>dbgInfo.hMax) dbgInfo.hMax = _h;
                        if(_hx<dbgInfo.hxMin) dbgInfo.hxMin = _hx;
                        if(_hx>dbgInfo.hxMax) dbgInfo.hxMax = _hx;
                        if(_hy<dbgInfo.hyMin) dbgInfo.hyMin = _hy;
                        if(_hy>dbgInfo.hyMax) dbgInfo.hyMax = _hy;
                        if(_hy>dbgInfo.hyMax) dbgInfo.hyMax = _hy;
#endif

                        FIXED_VEC2_SUB(_pos2d+i, &_vec1, &_vec2);
                        FIXED_VEC2_LENGTH_SQ(&_vec2, &_ures, BITS);
                        _err += _ures;
                    }

                    if( _err < _minerr ) {
                        _minerr = _err;
                        _ma = _a1;
                        _mb = _b1;
                        _mc = _c1;
                        s1 = t1; s2 = t2; s3 = t3;
                    }
                }
            }
        }

        if(s1 == 0 && s2 == 0 && s3 == 0)
        fix_factor >>= 1;

        _a2 = _ma;
        _b2 = _mb;
        _c2 = _mc;
    }