Beispiel #1
0
int main(int argc, const char * argv[]) {
    add(1,2);
    add(1,2);
    PRINTMAX(12, 13);
    PRINTMAX(12, 13);
    printf("%d\n",MAXOFNUMBER(100, 200));
    
    printf("*******************\n");
    double sum = ADD(1.1, 2);//预处理 阶段 就会换成 1+2
    printf("sum = %f\n",sum);
    
    printf("%d\n",ADD(1, 2)*ADD(2, 3));//8 //1+2*2+3
    printf("%d\n",ADD2(1, 2)*ADD2(2, 3));//(1+2)*(2+3)
    
    printf("%d\n",MUL(3-1, 5-2));//(3-1*5-2)
    printf("%d\n",MUL2(3-1, 5-2));//((3-1)*(5-2))
    
    
    printf("*******************\n");
    printf(kPath);
    double r = 2.0;
    double s = PI*r*r;
    double c = 2*PI*r;
    printf("s = %f c= %f\n",s,c);
    return 0;
}
Beispiel #2
0
static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
                                            const uint8_t* ppred,
                                            uint8_t* poutput, int stride,
                                            int size) {
  int w;
  const v16i8 zero = { 0 };
  while (size >= 16) {
    v16u8 pred0, dst0;
    v8i16 a0, a1, b0, b1, c0, c1;
    const v16u8 tmp0 = LD_UB(ppred - 1);
    const v16u8 tmp1 = LD_UB(ppred - stride);
    const v16u8 tmp2 = LD_UB(ppred - stride - 1);
    const v16u8 src0 = LD_UB(pinput);
    ILVRL_B2_SH(zero, tmp0, a0, a1);
    ILVRL_B2_SH(zero, tmp1, b0, b1);
    ILVRL_B2_SH(zero, tmp2, c0, c1);
    ADD2(a0, b0, a1, b1, a0, a1);
    SUB2(a0, c0, a1, c1, a0, a1);
    CLIP_SH2_0_255(a0, a1);
    pred0 = (v16u8)__msa_pckev_b((v16i8)a1, (v16i8)a0);
    dst0 = src0 - pred0;
    ST_UB(dst0, poutput);
    ppred += 16;
    pinput += 16;
    poutput += 16;
    size -= 16;
  }
  for (w = 0; w < size; ++w) {
    const int pred = ppred[w - 1] + ppred[w - stride] - ppred[w - stride - 1];
    poutput[w] = pinput[w] - (pred < 0 ? 0 : pred > 255 ? 255 : pred);
  }
}
Beispiel #3
0
MMath::MMath(): Methoded("math") {
	// ^FUNC(expr)	
#define ADDX(name, X) \
	add_native_method(#name, Method::CT_STATIC, _##name, X, X)
#define ADD0(name) ADDX(name, 0)
#define ADD1(name) ADDX(name, 1)
#define ADD2(name) ADDX(name, 2)

	ADD1(round);	ADD1(floor);	ADD1(ceiling);
	ADD1(trunc);	ADD1(frac);
	ADD1(abs);	ADD1(sign);
	ADD1(exp);
	ADD1(log);	ADD1(log10);
	ADD1(sin);	ADD1(asin);	
	ADD1(cos);	ADD1(acos);	
	ADD1(tan);	ADD1(atan);
	ADD1(degrees);	ADD1(radians);
	ADD1(sqrt);
	ADD1(random);

	// ^math:pow(x;y)
	ADD2(pow);

	// ^math:crypt[password;salt]
	ADD2(crypt);

	// ^math:md5[string]
	ADD1(md5);

	// ^math:sha1[string]
	ADD1(sha1);
	
	// ^math:digest[method;string|file;options]
	add_native_method("digest", Method::CT_STATIC, _digest, 2, 3);
	
	// ^math:crc32[string]
	ADD1(crc32);

	// ^math:uuid[]
	ADD0(uuid);

	// ^math:uid64[]
	ADD0(uid64);

	// ^math:convert[number](base-from;base-to)
	add_native_method("convert", Method::CT_STATIC, _convert, 3, 3);
}
Beispiel #4
0
void C_RLibrary::initbuiltinape(void)
{
#define ADD(sym) extern C_RBASE * sym(char *desc); _add_dll(0,sym,"Builtin_" #sym, 0)  
#define ADD2(sym,name) extern C_RBASE * sym(char *desc); _add_dll(0,sym,name, 0)  
#ifdef LASER
  ADD(RLASER_Cone);
  ADD(RLASER_BeatHold);
  ADD(RLASER_Line);
  ADD(RLASER_Bren); // not including it for now
  ADD(RLASER_Transform);
#else
  ADD2(R_ChannelShift,"Channel Shift");
  ADD2(R_ColorReduction,"Color Reduction");
  ADD2(R_Multiplier,"Multiplier");
  ADD2(R_VideoDelay,"Holden04: Video Delay");
  ADD2(R_MultiDelay,"Holden05: Multi Delay");
#endif
#undef ADD
#undef ADD2
}
Beispiel #5
0
double tan(double x) {
#include "utan.h"
#include "utan.tbl"

    int ux,i,n;
    double a,da,a2,b,db,c,dc,c1,cc1,c2,cc2,c3,cc3,fi,ffi,gi,pz,s,sy,
           t,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,w,x2,xn,xx2,y,ya,yya,z0,z,zz,z2,zz2;
    int p;
    number num,v;
    mp_no mpa,mpt1,mpt2;
#if 0
    mp_no mpy;
#endif

    int __branred(double, double *, double *);
    int __mpranred(double, mp_no *, int);

    /* x=+-INF, x=NaN */
    num.d = x;
    ux = num.i[HIGH_HALF];
    if ((ux&0x7ff00000)==0x7ff00000) return x-x;

    w=(x<ZERO) ? -x : x;

    /* (I) The case abs(x) <= 1.259e-8 */
    if (w<=g1.d)  return x;

    /* (II) The case 1.259e-8 < abs(x) <= 0.0608 */
    if (w<=g2.d) {

        /* First stage */
        x2 = x*x;
        t2 = x*x2*(d3.d+x2*(d5.d+x2*(d7.d+x2*(d9.d+x2*d11.d))));
        if ((y=x+(t2-u1.d*t2)) == x+(t2+u1.d*t2))  return y;

        /* Second stage */
        c1 = x2*(a15.d+x2*(a17.d+x2*(a19.d+x2*(a21.d+x2*(a23.d+x2*(a25.d+
                                               x2*a27.d))))));
        EMULV(x,x,x2,xx2,t1,t2,t3,t4,t5)
        ADD2(a13.d,aa13.d,c1,zero.d,c2,cc2,t1,t2)
        MUL2(x2,xx2,c2,cc2,c1,cc1,t1,t2,t3,t4,t5,t6,t7,t8)
        ADD2(a11.d,aa11.d,c1,cc1,c2,cc2,t1,t2)
        MUL2(x2,xx2,c2,cc2,c1,cc1,t1,t2,t3,t4,t5,t6,t7,t8)
        ADD2(a9.d ,aa9.d ,c1,cc1,c2,cc2,t1,t2)
        MUL2(x2,xx2,c2,cc2,c1,cc1,t1,t2,t3,t4,t5,t6,t7,t8)
        ADD2(a7.d ,aa7.d ,c1,cc1,c2,cc2,t1,t2)
        MUL2(x2,xx2,c2,cc2,c1,cc1,t1,t2,t3,t4,t5,t6,t7,t8)
        ADD2(a5.d ,aa5.d ,c1,cc1,c2,cc2,t1,t2)
        MUL2(x2,xx2,c2,cc2,c1,cc1,t1,t2,t3,t4,t5,t6,t7,t8)
        ADD2(a3.d ,aa3.d ,c1,cc1,c2,cc2,t1,t2)
        MUL2(x2,xx2,c2,cc2,c1,cc1,t1,t2,t3,t4,t5,t6,t7,t8)
        MUL2(x ,zero.d,c1,cc1,c2,cc2,t1,t2,t3,t4,t5,t6,t7,t8)
        ADD2(x    ,zero.d,c2,cc2,c1,cc1,t1,t2)
        if ((y=c1+(cc1-u2.d*c1)) == c1+(cc1+u2.d*c1))  return y;
        return tanMp(x);
    }
Beispiel #6
0
void
SECTION
__doasin(double x, double dx, double v[]) {

#include "doasin.h"

  static const double
    d5 =  0.22372159090911789889975459505194491E-01,
    d6 =  0.17352764422456822913014975683014622E-01,
    d7 =  0.13964843843786693521653681033981614E-01,
    d8 =  0.11551791438485242609036067259086589E-01,
    d9 =  0.97622386568166960207425666787248914E-02,
    d10 = 0.83638737193775788576092749009744976E-02,
    d11 = 0.79470250400727425881446981833568758E-02;

  double xx,p,pp,u,uu,r,s;
  double tc,tcc;
#ifndef DLA_FMS
  double hx,tx,hy,ty,tp,tq;
#endif


/* Taylor series for arcsin for Double-Length numbers         */
  xx = x*x+2.0*x*dx;
  p = ((((((d11*xx+d10)*xx+d9)*xx+d8)*xx+d7)*xx+d6)*xx+d5)*xx;
  pp = 0;

  MUL2(x,dx,x,dx,u,uu,tp,hx,tx,hy,ty,tq,tc,tcc);
  ADD2(p,pp,c4.x,cc4.x,p,pp,r,s);
  MUL2(p,pp,u,uu,p,pp,tp,hx,tx,hy,ty,tq,tc,tcc);
  ADD2(p,pp,c3.x,cc3.x,p,pp,r,s);
  MUL2(p,pp,u,uu,p,pp,tp,hx,tx,hy,ty,tq,tc,tcc);
  ADD2(p,pp,c2.x,cc2.x,p,pp,r,s);
  MUL2(p,pp,u,uu,p,pp,tp,hx,tx,hy,ty,tq,tc,tcc);
  ADD2(p,pp,c1.x,cc1.x,p,pp,r,s);
  MUL2(p,pp,u,uu,p,pp,tp,hx,tx,hy,ty,tq,tc,tcc);
  MUL2(p,pp,x,dx,p,pp,tp,hx,tx,hy,ty,tq,tc,tcc);
  ADD2(p,pp,x,dx,p,pp,r,s);
  v[0]=p;
  v[1]=pp; /* arcsin(x+dx)=v[0]+v[1] */
}
Beispiel #7
0
void vpx_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v4i32 vec_w;

  LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7);
  ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6);
  ADD2(in0, in2, in4, in6, in0, in4);
  vec_w = __msa_hadd_s_w(in0, in0);
  vec_w += __msa_hadd_s_w(in4, in4);
  out[0] = HADD_SW_S32(vec_w);
  out[1] = 0;
}
Beispiel #8
0
static void shDrawPaintMesh(VGContext *c, SHVector2 *min, SHVector2 *max,
                            VGPaintMode mode, GLenum texUnit)
{
  SHPaint *p;
  SHVector2 pmin, pmax;
  SHfloat K = 1.0f;
  
  /* Pick the right paint */
  if (mode == VG_FILL_PATH) {
    p = (c->fillPaint ? c->fillPaint : &c->defaultPaint);
  }else if (mode == VG_STROKE_PATH) {
    p = (c->strokePaint ? c->strokePaint : &c->defaultPaint);
    K = SH_CEIL(c->strokeMiterLimit * c->strokeLineWidth) + 1.0f;
  }
  
  /* We want to be sure to cover every pixel of this path so better
     take a pixel more than leave some out (multisampling is tricky). */
  SET2V(pmin, (*min)); SUB2(pmin, K,K);
  SET2V(pmax, (*max)); ADD2(pmax, K,K);

  /* Construct appropriate OpenGL primitives so as
     to fill the stencil mask with select paint */

  switch (p->type) {
  case VG_PAINT_TYPE_LINEAR_GRADIENT:
    shDrawLinearGradientMesh(p, min, max, mode, texUnit);
    break;

  case VG_PAINT_TYPE_RADIAL_GRADIENT:
    shDrawRadialGradientMesh(p, min, max, mode, texUnit);
    break;
    
  case VG_PAINT_TYPE_PATTERN:
    if (p->pattern != VG_INVALID_HANDLE) {
      shDrawPatternMesh(p, min, max, mode, texUnit);
      break;
    }/* else behave as a color paint */
  
  case VG_PAINT_TYPE_COLOR:
    glColor4fv((GLfloat*)&p->color);
    glBegin(GL_QUADS);
    glVertex2f(pmin.x, pmin.y);
    glVertex2f(pmax.x, pmin.y);
    glVertex2f(pmax.x, pmax.y);
    glVertex2f(pmin.x, pmax.y);
    glEnd();
    break;
  }
}
Beispiel #9
0
static void hevc_addblk_4x4_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
{
    uint32_t dst0, dst1, dst2, dst3;
    v8i16 dst_r0, dst_l0, in0, in1;
    v4i32 dst_vec = { 0 };
    v16u8 zeros = { 0 };

    LD_SH2(coeffs, 8, in0, in1);
    LW4(dst, stride, dst0, dst1, dst2, dst3);
    INSERT_W4_SW(dst0, dst1, dst2, dst3, dst_vec);
    ILVRL_B2_SH(zeros, dst_vec, dst_r0, dst_l0);
    ADD2(dst_r0, in0, dst_l0, in1, dst_r0, dst_l0);
    CLIP_SH2_0_255(dst_r0, dst_l0);
    dst_vec = (v4i32) __msa_pckev_b((v16i8) dst_l0, (v16i8) dst_r0);
    ST4x4_UB(dst_vec, dst_vec, 0, 1, 2, 3, dst, stride);
}
Beispiel #10
0
void
SECTION
__dubsin (double x, double dx, double v[])
{
  double r, s, c, cc, d, dd, d2, dd2, e, ee,
	 sn, ssn, cs, ccs, ds, dss, dc, dcc;
#ifndef DLA_FMS
  double p, hx, tx, hy, ty, q;
#endif
  mynumber u;
  int4 k;

  u.x = x + big.x;
  k = u.i[LOW_HALF] << 2;
  x = x - (u.x - big.x);
  d = x + dx;
  dd = (x - d) + dx;
  /* sin(x+dx)=sin(Xi+t)=sin(Xi)*cos(t) + cos(Xi)sin(t) where t ->0 */
  MUL2 (d, dd, d, dd, d2, dd2, p, hx, tx, hy, ty, q, c, cc);
  sn = __sincostab.x[k];       /*                                  */
  ssn = __sincostab.x[k + 1];  /*      sin(Xi) and cos(Xi)         */
  cs = __sincostab.x[k + 2];   /*                                  */
  ccs = __sincostab.x[k + 3];  /*                                  */
  /* Taylor series for sin ds=sin(t) */
  MUL2 (d2, dd2, s7.x, ss7.x, ds, dss, p, hx, tx, hy, ty, q, c, cc);
  ADD2 (ds, dss, s5.x, ss5.x, ds, dss, r, s);
  MUL2 (d2, dd2, ds, dss, ds, dss, p, hx, tx, hy, ty, q, c, cc);
  ADD2 (ds, dss, s3.x, ss3.x, ds, dss, r, s);
  MUL2 (d2, dd2, ds, dss, ds, dss, p, hx, tx, hy, ty, q, c, cc);
  MUL2 (d, dd, ds, dss, ds, dss, p, hx, tx, hy, ty, q, c, cc);
  ADD2 (ds, dss, d, dd, ds, dss, r, s);

  /* Taylor series for cos dc=cos(t) */
  MUL2 (d2, dd2, c8.x, cc8.x, dc, dcc, p, hx, tx, hy, ty, q, c, cc);
  ADD2 (dc, dcc, c6.x, cc6.x, dc, dcc, r, s);
  MUL2 (d2, dd2, dc, dcc, dc, dcc, p, hx, tx, hy, ty, q, c, cc);
  ADD2 (dc, dcc, c4.x, cc4.x, dc, dcc, r, s);
  MUL2 (d2, dd2, dc, dcc, dc, dcc, p, hx, tx, hy, ty, q, c, cc);
  ADD2 (dc, dcc, c2.x, cc2.x, dc, dcc, r, s);
  MUL2 (d2, dd2, dc, dcc, dc, dcc, p, hx, tx, hy, ty, q, c, cc);

  MUL2 (cs, ccs, ds, dss, e, ee, p, hx, tx, hy, ty, q, c, cc);
  MUL2 (dc, dcc, sn, ssn, dc, dcc, p, hx, tx, hy, ty, q, c, cc);
  SUB2 (e, ee, dc, dcc, e, ee, r, s);
  ADD2 (e, ee, sn, ssn, e, ee, r, s);                    /* e+ee=sin(x+dx) */

  v[0] = e;
  v[1] = ee;
}
Beispiel #11
0
void __dubsin(Double x, Double dx, Double v[]) {
  Double r,s,p,hx,tx,hy,ty,q,c,cc,d,dd,d2,dd2,e,ee,
    sn,ssn,cs,ccs,ds,dss,dc,dcc;
#if 0
  Double xx,y,yy,z,zz;
#endif
  mynumber u;
  int4 k;

  u.x()=x+big.x();
  k = u.i[LOW_HALF]<<2;
  x=x-(u.x()-big.x());
  d=x+dx;
  dd=(x-d)+dx;
         /* sin(x+dx)=sin(Xi+t)=sin(Xi)*cos(t) + cos(Xi)sin(t) where t ->0 */
  MUL2(d,dd,d,dd,d2,dd2,p,hx,tx,hy,ty,q,c,cc);
  sn=sincos.x(k);     /*                                  */
  ssn=sincos.x(k+1);  /*      sin(Xi) and cos(Xi)         */
  cs=sincos.x(k+2);   /*                                  */
  ccs=sincos.x(k+3);  /*                                  */
  MUL2(d2,dd2,s7.x(),ss7.x(),ds,dss,p,hx,tx,hy,ty,q,c,cc);  /* Taylor    */
  ADD2(ds,dss,s5.x(),ss5.x(),ds,dss,r,s);
  MUL2(d2,dd2,ds,dss,ds,dss,p,hx,tx,hy,ty,q,c,cc);      /* series    */
  ADD2(ds,dss,s3.x(),ss3.x(),ds,dss,r,s);
  MUL2(d2,dd2,ds,dss,ds,dss,p,hx,tx,hy,ty,q,c,cc);      /* for sin   */
  MUL2(d,dd,ds,dss,ds,dss,p,hx,tx,hy,ty,q,c,cc);
  ADD2(ds,dss,d,dd,ds,dss,r,s);                         /* ds=sin(t) */

  MUL2(d2,dd2,c8.x(),cc8.x(),dc,dcc,p,hx,tx,hy,ty,q,c,cc); ;/* Taylor    */
  ADD2(dc,dcc,c6.x(),cc6.x(),dc,dcc,r,s);
  MUL2(d2,dd2,dc,dcc,dc,dcc,p,hx,tx,hy,ty,q,c,cc);      /* series    */
  ADD2(dc,dcc,c4.x(),cc4.x(),dc,dcc,r,s);
  MUL2(d2,dd2,dc,dcc,dc,dcc,p,hx,tx,hy,ty,q,c,cc);      /* for cos   */
  ADD2(dc,dcc,c2.x(),cc2.x(),dc,dcc,r,s);
  MUL2(d2,dd2,dc,dcc,dc,dcc,p,hx,tx,hy,ty,q,c,cc);      /* dc=cos(t) */

  MUL2(cs,ccs,ds,dss,e,ee,p,hx,tx,hy,ty,q,c,cc);
  MUL2(dc,dcc,sn,ssn,dc,dcc,p,hx,tx,hy,ty,q,c,cc);
  SUB2(e,ee,dc,dcc,e,ee,r,s);
  ADD2(e,ee,sn,ssn,e,ee,r,s);                    /* e+ee=sin(x+dx) */

  v[0]=e;
  v[1]=ee;
}
Beispiel #12
0
uint32_t vp10_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
  uint32_t sum_out;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
  v4u32 sum = { 0 };

  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3);
  HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7);
  ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6);
  ADD2(sum0, sum2, sum4, sum6, sum0, sum4);
  sum0 += sum4;

  sum = __msa_hadd_u_w(sum0, sum0);
  sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum);
  sum = __msa_hadd_u_w(sum0, sum0);
  sum = (v4u32)__msa_srari_w((v4i32)sum, 6);
  sum_out = __msa_copy_u_w((v4i32)sum, 0);

  return sum_out;
}
Beispiel #13
0
void vpx_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
  int sum, i;
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v4i32 vec_w = { 0 };

  for (i = 0; i < 4; ++i) {
    LD_SH2(input, 8, in0, in1);
    input += stride;
    LD_SH2(input, 8, in2, in3);
    input += stride;
    LD_SH2(input, 8, in4, in5);
    input += stride;
    LD_SH2(input, 8, in6, in7);
    input += stride;
    ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6);
    ADD2(in0, in2, in4, in6, in0, in4);
    vec_w += __msa_hadd_s_w(in0, in0);
    vec_w += __msa_hadd_s_w(in4, in4);
  }

  sum = HADD_SW_S32(vec_w);
  out[0] = (int16_t)(sum >> 1);
}
Beispiel #14
0
static void render_world_transform (Render* render, Ship* player)
{
	COPY(render->cam, player->pos);

	if (render->shaking) {
		float hit = ship_hit(player);
		Vec3 shake = {
			shake_hit    * RAND * player->radius * hit + 
			shake_vel    * RAND * player->radius * player->vel[0] / MAX_VEL_X +
			shake_thrust * RAND * player->radius * player->lefton + 
			shake_thrust * RAND * player->radius * player->righton +
			shake_velZ   * RAND * player->radius * player->vel[2] / MAX_VEL_Z,
					 
			shake_hit    * RAND * player->radius * hit + 
			shake_vel    * RAND * player->radius * player->vel[1] / MAX_VEL_Y +
			shake_velZ   * RAND * player->radius * player->vel[2] / MAX_VEL_Z,
					 
			shake_hit    * RAND * player->radius * hit +
			shake_vel    * RAND * player->radius * player->vel[2] / MAX_VEL_Z
		};
		ADD(render->cam, shake);
		ADDSCALE(render->cam, player->repulsion, hit * player->radius * 2);
	}

	ADD2(render->target, player->pos, player->lookAt);
	//render->target[1]=render->target[1]*.5+player->pos[1]*.5;
	//render->target[2]+=10;

	GLfloat lightpos[] = {render->cam[0], render->cam[1], render->cam[2] + 1, 1.0f};
	glLightfv(GL_LIGHT1, GL_POSITION, lightpos);

	gluLookAt(
		render->cam[0], render->cam[1], render->cam[2],
		render->target[0], render->target[1], render->target[2],
			sin(player->roll), cos(player->roll), 0
	);
}
Beispiel #15
0
double
SECTION
__ieee754_atan2 (double y, double x)
{
  int i, de, ux, dx, uy, dy;
  static const int pr[MM] = { 6, 8, 10, 20, 32 };
  double ax, ay, u, du, u9, ua, v, vv, dv, t1, t2, t3, t7, t8,
    z, zz, cor, s1, ss1, s2, ss2;
#ifndef DLA_FMS
  double t4, t5, t6;
#endif
  number num;

  static const int ep = 59768832,	/*  57*16**5   */
    em = -59768832;		/* -57*16**5   */

  /* x=NaN or y=NaN */
  num.d = x;
  ux = num.i[HIGH_HALF];
  dx = num.i[LOW_HALF];
  if ((ux & 0x7ff00000) == 0x7ff00000)
    {
      if (((ux & 0x000fffff) | dx) != 0x00000000)
	return x + x;
    }
  num.d = y;
  uy = num.i[HIGH_HALF];
  dy = num.i[LOW_HALF];
  if ((uy & 0x7ff00000) == 0x7ff00000)
    {
      if (((uy & 0x000fffff) | dy) != 0x00000000)
	return y + y;
    }

  /* y=+-0 */
  if (uy == 0x00000000)
    {
      if (dy == 0x00000000)
	{
	  if ((ux & 0x80000000) == 0x00000000)
	    return 0;
	  else
	    return opi.d;
	}
    }
  else if (uy == 0x80000000)
    {
      if (dy == 0x00000000)
	{
	  if ((ux & 0x80000000) == 0x00000000)
	    return -0.0;
	  else
	    return mopi.d;
	}
    }

  /* x=+-0 */
  if (x == 0)
    {
      if ((uy & 0x80000000) == 0x00000000)
	return hpi.d;
      else
	return mhpi.d;
    }

  /* x=+-INF */
  if (ux == 0x7ff00000)
    {
      if (dx == 0x00000000)
	{
	  if (uy == 0x7ff00000)
	    {
	      if (dy == 0x00000000)
		return qpi.d;
	    }
	  else if (uy == 0xfff00000)
	    {
	      if (dy == 0x00000000)
		return mqpi.d;
	    }
	  else
	    {
	      if ((uy & 0x80000000) == 0x00000000)
		return 0;
	      else
		return -0.0;
	    }
	}
    }
  else if (ux == 0xfff00000)
    {
      if (dx == 0x00000000)
	{
	  if (uy == 0x7ff00000)
	    {
	      if (dy == 0x00000000)
		return tqpi.d;
	    }
	  else if (uy == 0xfff00000)
	    {
	      if (dy == 0x00000000)
		return mtqpi.d;
	    }
	  else
	    {
	      if ((uy & 0x80000000) == 0x00000000)
		return opi.d;
	      else
		return mopi.d;
	    }
	}
    }

  /* y=+-INF */
  if (uy == 0x7ff00000)
    {
      if (dy == 0x00000000)
	return hpi.d;
    }
  else if (uy == 0xfff00000)
    {
      if (dy == 0x00000000)
	return mhpi.d;
    }

  /* either x/y or y/x is very close to zero */
  ax = (x < 0) ? -x : x;
  ay = (y < 0) ? -y : y;
  de = (uy & 0x7ff00000) - (ux & 0x7ff00000);
  if (de >= ep)
    {
      return ((y > 0) ? hpi.d : mhpi.d);
    }
  else if (de <= em)
    {
      if (x > 0)
	{
	  if ((z = ay / ax) < TWOM1022)
	    return normalized (ax, ay, y, z);
	  else
	    return signArctan2 (y, z);
	}
      else
	{
	  return ((y > 0) ? opi.d : mopi.d);
	}
    }

  /* if either x or y is extremely close to zero, scale abs(x), abs(y). */
  if (ax < twom500.d || ay < twom500.d)
    {
      ax *= two500.d;
      ay *= two500.d;
    }

  /* Likewise for large x and y.  */
  if (ax > two500.d || ay > two500.d)
    {
      ax *= twom500.d;
      ay *= twom500.d;
    }

  /* x,y which are neither special nor extreme */
  if (ay < ax)
    {
      u = ay / ax;
      EMULV (ax, u, v, vv, t1, t2, t3, t4, t5);
      du = ((ay - v) - vv) / ax;
    }
  else
    {
      u = ax / ay;
      EMULV (ay, u, v, vv, t1, t2, t3, t4, t5);
      du = ((ax - v) - vv) / ay;
    }

  if (x > 0)
    {
      /* (i)   x>0, abs(y)< abs(x):  atan(ay/ax) */
      if (ay < ax)
	{
	  if (u < inv16.d)
	    {
	      v = u * u;

	      zz = du + u * v * (d3.d
				 + v * (d5.d
					+ v * (d7.d
					       + v * (d9.d
						      + v * (d11.d
							     + v * d13.d)))));

	      if ((z = u + (zz - u1.d * u)) == u + (zz + u1.d * u))
		return signArctan2 (y, z);

	      MUL2 (u, du, u, du, v, vv, t1, t2, t3, t4, t5, t6, t7, t8);
	      s1 = v * (f11.d + v * (f13.d
				     + v * (f15.d + v * (f17.d + v * f19.d))));
	      ADD2 (f9.d, ff9.d, s1, 0, s2, ss2, t1, t2);
	      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	      ADD2 (f7.d, ff7.d, s1, ss1, s2, ss2, t1, t2);
	      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	      ADD2 (f5.d, ff5.d, s1, ss1, s2, ss2, t1, t2);
	      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	      ADD2 (f3.d, ff3.d, s1, ss1, s2, ss2, t1, t2);
	      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	      MUL2 (u, du, s1, ss1, s2, ss2, t1, t2, t3, t4, t5, t6, t7, t8);
	      ADD2 (u, du, s2, ss2, s1, ss1, t1, t2);

	      if ((z = s1 + (ss1 - u5.d * s1)) == s1 + (ss1 + u5.d * s1))
		return signArctan2 (y, z);

	      return atan2Mp (x, y, pr);
	    }

	  i = (TWO52 + TWO8 * u) - TWO52;
	  i -= 16;
	  t3 = u - cij[i][0].d;
	  EADD (t3, du, v, dv);
	  t1 = cij[i][1].d;
	  t2 = cij[i][2].d;
	  zz = v * t2 + (dv * t2
			 + v * v * (cij[i][3].d
				    + v * (cij[i][4].d
					   + v * (cij[i][5].d
						  + v * cij[i][6].d))));
	  if (i < 112)
	    {
	      if (i < 48)
		u9 = u91.d;	/* u < 1/4	*/
	      else
		u9 = u92.d;
	    }		/* 1/4 <= u < 1/2 */
	  else
	    {
	      if (i < 176)
		u9 = u93.d;	/* 1/2 <= u < 3/4 */
	      else
		u9 = u94.d;
	    }		/* 3/4 <= u <= 1  */
	  if ((z = t1 + (zz - u9 * t1)) == t1 + (zz + u9 * t1))
	    return signArctan2 (y, z);

	  t1 = u - hij[i][0].d;
	  EADD (t1, du, v, vv);
	  s1 = v * (hij[i][11].d
		    + v * (hij[i][12].d
			   +  v * (hij[i][13].d
				   + v * (hij[i][14].d
					  + v * hij[i][15].d))));
	  ADD2 (hij[i][9].d, hij[i][10].d, s1, 0, s2, ss2, t1, t2);
	  MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	  ADD2 (hij[i][7].d, hij[i][8].d, s1, ss1, s2, ss2, t1, t2);
	  MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	  ADD2 (hij[i][5].d, hij[i][6].d, s1, ss1, s2, ss2, t1, t2);
	  MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	  ADD2 (hij[i][3].d, hij[i][4].d, s1, ss1, s2, ss2, t1, t2);
	  MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	  ADD2 (hij[i][1].d, hij[i][2].d, s1, ss1, s2, ss2, t1, t2);

	  if ((z = s2 + (ss2 - ub.d * s2)) == s2 + (ss2 + ub.d * s2))
	    return signArctan2 (y, z);
	  return atan2Mp (x, y, pr);
	}

      /* (ii)  x>0, abs(x)<=abs(y):  pi/2-atan(ax/ay) */
      if (u < inv16.d)
	{
	  v = u * u;
	  zz = u * v * (d3.d
			+ v * (d5.d
			       + v * (d7.d
				      + v * (d9.d
					     + v * (d11.d
						    + v * d13.d)))));
	  ESUB (hpi.d, u, t2, cor);
	  t3 = ((hpi1.d + cor) - du) - zz;
	  if ((z = t2 + (t3 - u2.d)) == t2 + (t3 + u2.d))
	    return signArctan2 (y, z);

	  MUL2 (u, du, u, du, v, vv, t1, t2, t3, t4, t5, t6, t7, t8);
	  s1 = v * (f11.d
		    + v * (f13.d
			   + v * (f15.d + v * (f17.d + v * f19.d))));
	  ADD2 (f9.d, ff9.d, s1, 0, s2, ss2, t1, t2);
	  MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	  ADD2 (f7.d, ff7.d, s1, ss1, s2, ss2, t1, t2);
	  MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	  ADD2 (f5.d, ff5.d, s1, ss1, s2, ss2, t1, t2);
	  MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	  ADD2 (f3.d, ff3.d, s1, ss1, s2, ss2, t1, t2);
	  MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	  MUL2 (u, du, s1, ss1, s2, ss2, t1, t2, t3, t4, t5, t6, t7, t8);
	  ADD2 (u, du, s2, ss2, s1, ss1, t1, t2);
	  SUB2 (hpi.d, hpi1.d, s1, ss1, s2, ss2, t1, t2);

	  if ((z = s2 + (ss2 - u6.d)) == s2 + (ss2 + u6.d))
	    return signArctan2 (y, z);
	  return atan2Mp (x, y, pr);
	}

      i = (TWO52 + TWO8 * u) - TWO52;
      i -= 16;
      v = (u - cij[i][0].d) + du;

      zz = hpi1.d - v * (cij[i][2].d
			 + v * (cij[i][3].d
				+ v * (cij[i][4].d
				       + v * (cij[i][5].d
					      + v * cij[i][6].d))));
      t1 = hpi.d - cij[i][1].d;
      if (i < 112)
	ua = ua1.d;	/* w <  1/2 */
      else
	ua = ua2.d;	/* w >= 1/2 */
      if ((z = t1 + (zz - ua)) == t1 + (zz + ua))
	return signArctan2 (y, z);

      t1 = u - hij[i][0].d;
      EADD (t1, du, v, vv);

      s1 = v * (hij[i][11].d
		+ v * (hij[i][12].d
		       + v * (hij[i][13].d
			      + v * (hij[i][14].d
				     + v * hij[i][15].d))));

      ADD2 (hij[i][9].d, hij[i][10].d, s1, 0, s2, ss2, t1, t2);
      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
      ADD2 (hij[i][7].d, hij[i][8].d, s1, ss1, s2, ss2, t1, t2);
      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
      ADD2 (hij[i][5].d, hij[i][6].d, s1, ss1, s2, ss2, t1, t2);
      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
      ADD2 (hij[i][3].d, hij[i][4].d, s1, ss1, s2, ss2, t1, t2);
      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
      ADD2 (hij[i][1].d, hij[i][2].d, s1, ss1, s2, ss2, t1, t2);
      SUB2 (hpi.d, hpi1.d, s2, ss2, s1, ss1, t1, t2);

      if ((z = s1 + (ss1 - uc.d)) == s1 + (ss1 + uc.d))
	return signArctan2 (y, z);
      return atan2Mp (x, y, pr);
    }

  /* (iii) x<0, abs(x)< abs(y):  pi/2+atan(ax/ay) */
  if (ax < ay)
    {
      if (u < inv16.d)
	{
	  v = u * u;
	  zz = u * v * (d3.d
			+ v * (d5.d
			       + v * (d7.d
				      + v * (d9.d
					     + v * (d11.d + v * d13.d)))));
	  EADD (hpi.d, u, t2, cor);
	  t3 = ((hpi1.d + cor) + du) + zz;
	  if ((z = t2 + (t3 - u3.d)) == t2 + (t3 + u3.d))
	    return signArctan2 (y, z);

	  MUL2 (u, du, u, du, v, vv, t1, t2, t3, t4, t5, t6, t7, t8);
	  s1 = v * (f11.d
		    + v * (f13.d + v * (f15.d + v * (f17.d + v * f19.d))));
	  ADD2 (f9.d, ff9.d, s1, 0, s2, ss2, t1, t2);
	  MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	  ADD2 (f7.d, ff7.d, s1, ss1, s2, ss2, t1, t2);
	  MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	  ADD2 (f5.d, ff5.d, s1, ss1, s2, ss2, t1, t2);
	  MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	  ADD2 (f3.d, ff3.d, s1, ss1, s2, ss2, t1, t2);
	  MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	  MUL2 (u, du, s1, ss1, s2, ss2, t1, t2, t3, t4, t5, t6, t7, t8);
	  ADD2 (u, du, s2, ss2, s1, ss1, t1, t2);
	  ADD2 (hpi.d, hpi1.d, s1, ss1, s2, ss2, t1, t2);

	  if ((z = s2 + (ss2 - u7.d)) == s2 + (ss2 + u7.d))
	    return signArctan2 (y, z);
	  return atan2Mp (x, y, pr);
	}

      i = (TWO52 + TWO8 * u) - TWO52;
      i -= 16;
      v = (u - cij[i][0].d) + du;
      zz = hpi1.d + v * (cij[i][2].d
			 + v * (cij[i][3].d
				+ v * (cij[i][4].d
				       + v * (cij[i][5].d
					      + v * cij[i][6].d))));
      t1 = hpi.d + cij[i][1].d;
      if (i < 112)
	ua = ua1.d;	/* w <  1/2 */
      else
	ua = ua2.d;	/* w >= 1/2 */
      if ((z = t1 + (zz - ua)) == t1 + (zz + ua))
	return signArctan2 (y, z);

      t1 = u - hij[i][0].d;
      EADD (t1, du, v, vv);
      s1 = v * (hij[i][11].d
		+ v * (hij[i][12].d
		       + v * (hij[i][13].d
			      + v * (hij[i][14].d
				     + v * hij[i][15].d))));
      ADD2 (hij[i][9].d, hij[i][10].d, s1, 0, s2, ss2, t1, t2);
      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
      ADD2 (hij[i][7].d, hij[i][8].d, s1, ss1, s2, ss2, t1, t2);
      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
      ADD2 (hij[i][5].d, hij[i][6].d, s1, ss1, s2, ss2, t1, t2);
      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
      ADD2 (hij[i][3].d, hij[i][4].d, s1, ss1, s2, ss2, t1, t2);
      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
      ADD2 (hij[i][1].d, hij[i][2].d, s1, ss1, s2, ss2, t1, t2);
      ADD2 (hpi.d, hpi1.d, s2, ss2, s1, ss1, t1, t2);

      if ((z = s1 + (ss1 - uc.d)) == s1 + (ss1 + uc.d))
	return signArctan2 (y, z);
      return atan2Mp (x, y, pr);
    }

  /* (iv)  x<0, abs(y)<=abs(x):  pi-atan(ax/ay) */
  if (u < inv16.d)
    {
      v = u * u;
      zz = u * v * (d3.d
		    + v * (d5.d
			   + v * (d7.d
				  + v * (d9.d + v * (d11.d + v * d13.d)))));
      ESUB (opi.d, u, t2, cor);
      t3 = ((opi1.d + cor) - du) - zz;
      if ((z = t2 + (t3 - u4.d)) == t2 + (t3 + u4.d))
	return signArctan2 (y, z);

      MUL2 (u, du, u, du, v, vv, t1, t2, t3, t4, t5, t6, t7, t8);
      s1 = v * (f11.d + v * (f13.d + v * (f15.d + v * (f17.d + v * f19.d))));
      ADD2 (f9.d, ff9.d, s1, 0, s2, ss2, t1, t2);
      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
      ADD2 (f7.d, ff7.d, s1, ss1, s2, ss2, t1, t2);
      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
      ADD2 (f5.d, ff5.d, s1, ss1, s2, ss2, t1, t2);
      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
      ADD2 (f3.d, ff3.d, s1, ss1, s2, ss2, t1, t2);
      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
      MUL2 (u, du, s1, ss1, s2, ss2, t1, t2, t3, t4, t5, t6, t7, t8);
      ADD2 (u, du, s2, ss2, s1, ss1, t1, t2);
      SUB2 (opi.d, opi1.d, s1, ss1, s2, ss2, t1, t2);

      if ((z = s2 + (ss2 - u8.d)) == s2 + (ss2 + u8.d))
	return signArctan2 (y, z);
      return atan2Mp (x, y, pr);
    }

  i = (TWO52 + TWO8 * u) - TWO52;
  i -= 16;
  v = (u - cij[i][0].d) + du;
  zz = opi1.d - v * (cij[i][2].d
		     + v * (cij[i][3].d
			    + v * (cij[i][4].d
				   + v * (cij[i][5].d + v * cij[i][6].d))));
  t1 = opi.d - cij[i][1].d;
  if (i < 112)
    ua = ua1.d;	/* w <  1/2 */
  else
    ua = ua2.d;	/* w >= 1/2 */
  if ((z = t1 + (zz - ua)) == t1 + (zz + ua))
    return signArctan2 (y, z);

  t1 = u - hij[i][0].d;

  EADD (t1, du, v, vv);

  s1 = v * (hij[i][11].d
	    + v * (hij[i][12].d
		   + v * (hij[i][13].d
			  + v * (hij[i][14].d + v * hij[i][15].d))));

  ADD2 (hij[i][9].d, hij[i][10].d, s1, 0, s2, ss2, t1, t2);
  MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
  ADD2 (hij[i][7].d, hij[i][8].d, s1, ss1, s2, ss2, t1, t2);
  MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
  ADD2 (hij[i][5].d, hij[i][6].d, s1, ss1, s2, ss2, t1, t2);
  MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
  ADD2 (hij[i][3].d, hij[i][4].d, s1, ss1, s2, ss2, t1, t2);
  MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
  ADD2 (hij[i][1].d, hij[i][2].d, s1, ss1, s2, ss2, t1, t2);
  SUB2 (opi.d, opi1.d, s2, ss2, s1, ss1, t1, t2);

  if ((z = s1 + (ss1 - uc.d)) == s1 + (ss1 + uc.d))
    return signArctan2 (y, z);
  return atan2Mp (x, y, pr);
}
Beispiel #16
0
int32_t vp8_mbuverror_msa(MACROBLOCK *mb)
{
    BLOCK *be;
    BLOCKD *bd;
    int16_t *coeff_ptr, *dq_coeff_ptr;
    int32_t err = 0;
    uint32_t loop_cnt;
    v8i16 coeff, coeff0, coeff1, coeff2, coeff3, coeff4;
    v8i16 dq_coeff, dq_coeff2, dq_coeff3, dq_coeff4;
    v4i32 diff0, diff1;
    v2i64 err0, err1, err_dup0, err_dup1;

    for (loop_cnt = 16; loop_cnt < 24; loop_cnt += 2)
    {
        be = &mb->block[loop_cnt];
        bd = &mb->e_mbd.block[loop_cnt];
        coeff_ptr = be->coeff;
        dq_coeff_ptr = bd->dqcoeff;
        coeff = LD_SH(coeff_ptr);
        dq_coeff = LD_SH(dq_coeff_ptr);
        coeff_ptr += 8;
        dq_coeff_ptr += 8;
        coeff2 = LD_SH(coeff_ptr);
        dq_coeff2 = LD_SH(dq_coeff_ptr);
        be = &mb->block[loop_cnt + 1];
        bd = &mb->e_mbd.block[loop_cnt + 1];
        coeff_ptr = be->coeff;
        dq_coeff_ptr = bd->dqcoeff;
        coeff3 = LD_SH(coeff_ptr);
        dq_coeff3 = LD_SH(dq_coeff_ptr);
        coeff_ptr += 8;
        dq_coeff_ptr += 8;
        coeff4 = LD_SH(coeff_ptr);
        dq_coeff4 = LD_SH(dq_coeff_ptr);

        ILVRL_H2_SH(coeff, dq_coeff, coeff0, coeff1);
        HSUB_UH2_SW(coeff0, coeff1, diff0, diff1);
        DOTP_SW2_SD(diff0, diff1, diff0, diff1, err0, err1);

        ILVRL_H2_SH(coeff2, dq_coeff2, coeff0, coeff1);
        HSUB_UH2_SW(coeff0, coeff1, diff0, diff1);
        DPADD_SD2_SD(diff0, diff1, err0, err1);
        err_dup0 = __msa_splati_d(err0, 1);
        err_dup1 = __msa_splati_d(err1, 1);
        ADD2(err0, err_dup0, err1, err_dup1, err0, err1);
        err += __msa_copy_s_d(err0, 0);
        err += __msa_copy_s_d(err1, 0);

        ILVRL_H2_SH(coeff3, dq_coeff3, coeff0, coeff1);
        HSUB_UH2_SW(coeff0, coeff1, diff0, diff1);
        DOTP_SW2_SD(diff0, diff1, diff0, diff1, err0, err1);
        ILVRL_H2_SH(coeff4, dq_coeff4, coeff0, coeff1);
        HSUB_UH2_SW(coeff0, coeff1, diff0, diff1);
        DPADD_SD2_SD(diff0, diff1, err0, err1);
        err_dup0 = __msa_splati_d(err0, 1);
        err_dup1 = __msa_splati_d(err1, 1);
        ADD2(err0, err_dup0, err1, err_dup1, err0, err1);
        err += __msa_copy_s_d(err0, 0);
        err += __msa_copy_s_d(err1, 0);
    }

    return err;
}
/* routine computes the correctly rounded (to nearest) value of atan(x). */
double atan(double x) {


  double cor,s1,ss1,s2,ss2,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,u,u2,u3,
         v,vv,w,ww,y,yy,z,zz;
#if 0
  double y1,y2;
#endif
  int i,ux,dx;
#if 0
  int p;
#endif
  static const int pr[M]={6,8,10,32};
  number num;
#if 0
  mp_no mpt1,mpx,mpy,mpy1,mpy2,mperr;
#endif

  num.d = x;  ux = num.i[HIGH_HALF];  dx = num.i[LOW_HALF];

  /* x=NaN */
  if (((ux&0x7ff00000)==0x7ff00000) && (((ux&0x000fffff)|dx)!=0x00000000))
    return x+x;

  /* Regular values of x, including denormals +-0 and +-INF */
  u = (x<ZERO) ? -x : x;
  if (u<C) {
    if (u<B) {
      if (u<A) {                                           /* u < A */
         return x; }
      else {                                               /* A <= u < B */
        v=x*x;  yy=x*v*(d3.d+v*(d5.d+v*(d7.d+v*(d9.d+v*(d11.d+v*d13.d)))));
        if ((y=x+(yy-U1*x)) == x+(yy+U1*x))  return y;

        EMULV(x,x,v,vv,t1,t2,t3,t4,t5)                       /* v+vv=x^2 */
        s1=v*(f11.d+v*(f13.d+v*(f15.d+v*(f17.d+v*f19.d))));
        ADD2(f9.d,ff9.d,s1,ZERO,s2,ss2,t1,t2)
        MUL2(v,vv,s2,ss2,s1,ss1,t1,t2,t3,t4,t5,t6,t7,t8)
        ADD2(f7.d,ff7.d,s1,ss1,s2,ss2,t1,t2)
        MUL2(v,vv,s2,ss2,s1,ss1,t1,t2,t3,t4,t5,t6,t7,t8)
        ADD2(f5.d,ff5.d,s1,ss1,s2,ss2,t1,t2)
        MUL2(v,vv,s2,ss2,s1,ss1,t1,t2,t3,t4,t5,t6,t7,t8)
        ADD2(f3.d,ff3.d,s1,ss1,s2,ss2,t1,t2)
        MUL2(v,vv,s2,ss2,s1,ss1,t1,t2,t3,t4,t5,t6,t7,t8)
        MUL2(x,ZERO,s1,ss1,s2,ss2,t1,t2,t3,t4,t5,t6,t7,t8)
        ADD2(x,ZERO,s2,ss2,s1,ss1,t1,t2)
        if ((y=s1+(ss1-U5*s1)) == s1+(ss1+U5*s1))  return y;

        return atanMp(x,pr);
      } }
    else {  /* B <= u < C */
      i=(TWO52+TWO8*u)-TWO52;  i-=16;
      z=u-cij[i][0].d;
      yy=z*(cij[i][2].d+z*(cij[i][3].d+z*(cij[i][4].d+
                        z*(cij[i][5].d+z* cij[i][6].d))));
      t1=cij[i][1].d;
      if (i<112) {
        if (i<48)  u2=U21;    /* u < 1/4        */
        else       u2=U22; }  /* 1/4 <= u < 1/2 */
      else {
        if (i<176) u2=U23;    /* 1/2 <= u < 3/4 */
        else       u2=U24; }  /* 3/4 <= u <= 1  */
      if ((y=t1+(yy-u2*t1)) == t1+(yy+u2*t1))  return __signArctan(x,y);

      z=u-hij[i][0].d;
      s1=z*(hij[i][11].d+z*(hij[i][12].d+z*(hij[i][13].d+
         z*(hij[i][14].d+z* hij[i][15].d))));
      ADD2(hij[i][9].d,hij[i][10].d,s1,ZERO,s2,ss2,t1,t2)
      MUL2(z,ZERO,s2,ss2,s1,ss1,t1,t2,t3,t4,t5,t6,t7,t8)
      ADD2(hij[i][7].d,hij[i][8].d,s1,ss1,s2,ss2,t1,t2)
      MUL2(z,ZERO,s2,ss2,s1,ss1,t1,t2,t3,t4,t5,t6,t7,t8)
      ADD2(hij[i][5].d,hij[i][6].d,s1,ss1,s2,ss2,t1,t2)
      MUL2(z,ZERO,s2,ss2,s1,ss1,t1,t2,t3,t4,t5,t6,t7,t8)
      ADD2(hij[i][3].d,hij[i][4].d,s1,ss1,s2,ss2,t1,t2)
      MUL2(z,ZERO,s2,ss2,s1,ss1,t1,t2,t3,t4,t5,t6,t7,t8)
      ADD2(hij[i][1].d,hij[i][2].d,s1,ss1,s2,ss2,t1,t2)
      if ((y=s2+(ss2-U6*s2)) == s2+(ss2+U6*s2))  return __signArctan(x,y);

      return atanMp(x,pr);
    }
  }
Beispiel #18
0
int shDrawRadialGradientMesh(SHPaint *p, SHVector2 *min, SHVector2 *max,
                             VGPaintMode mode, GLenum texUnit)
{
  SHint i, j;
  float a, n;
  
  SHfloat cx = p->radialGradient[0];
  SHfloat cy = p->radialGradient[1];
  SHfloat fx = p->radialGradient[2];
  SHfloat fy = p->radialGradient[3];
  float r = p->radialGradient[4];
  float fcx, fcy, rr, C;
  
  SHVector2 ux;
  SHVector2 uy;
  SHVector2 c, f;
  SHVector2 cf;

  SHMatrix3x3 *m;
  SHMatrix3x3 mi;
  SHint invertible;
  SHVector2 corners[4];
  SHVector2 fcorners[4];
  SHfloat minOffset=0.0f;
  SHfloat maxOffset=0.0f;
  
  SHint maxI=0, maxJ=0;
  SHfloat maxA=0.0f;
  SHfloat startA=0.0f;
  
  int numsteps = 100;
  float step = 2*PI/numsteps;
  SHVector2 tmin, tmax;
  SHVector2 min1, max1, min2, max2;
  
  /* Pick paint transform matrix */
  SH_GETCONTEXT(0);
  if (mode == VG_FILL_PATH)
    m = &context->fillTransform;
  else if (mode == VG_STROKE_PATH)
    m = &context->strokeTransform;
  
  /* Move focus into circle if outside */
  SET2(cf, fx,fy);
  SUB2(cf, cx,cy);
  n = NORM2(cf);
  if (n > r) {
    DIV2(cf, n);
    fx = cx + 0.995f * r * cf.x;
    fy = cy + 0.995f * r * cf.y;
  }
  
  /* Precalculations */
  rr = r*r;
  fcx = fx - cx;
  fcy = fy - cy;
  C = fcx*fcx + fcy*fcy - rr;
  
  /* Apply paint-to-user transformation
     to focus and unit vectors */
  SET2(f, fx, fy);
  SET2(c, cx, cy);
  SET2(ux, 1, 0);
  SET2(uy, 0, 1);
  ADD2(ux, cx, cy);
  ADD2(uy, cx, cy);
  TRANSFORM2(f, (*m));
  TRANSFORM2(c, (*m));
  TRANSFORM2(ux, (*m));
  TRANSFORM2(uy, (*m));
  SUB2V(ux, c); SUB2V(uy, c);
  
  /* Boundbox corners */
  SET2(corners[0], min->x, min->y);
  SET2(corners[1], max->x, min->y);
  SET2(corners[2], max->x, max->y);
  SET2(corners[3], min->x, max->y);
  
  /* Find inverse transformation (back to paint space) */
  invertible = shInvertMatrix(m, &mi);
  if (!invertible || r <= 0.0f) {
    
    /* Fill boundbox with color at offset 1 */
    SHColor *c = &p->stops.items[p->stops.size-1].color;
    glColor4fv((GLfloat*)c); glBegin(GL_QUADS);
    for (i=0; i<4; ++i) glVertex2fv((GLfloat*)&corners[i]);
    glEnd();
    return 1;
  }
  
  /*--------------------------------------------------------*/
  
  /* Find min/max offset */
  for (i=0; i<4; ++i) {
    
    /* Transform to paint space */
    SHfloat ax,ay, A,B,D,t, off;
    TRANSFORM2TO(corners[i], mi, fcorners[i]);
    SUB2(fcorners[i], fx, fy);
    n = NORM2(fcorners[i]);
    if (n == 0.0f) {
      
      /* Avoid zero-length vectors */
      off = 0.0f;
      
    }else{
      
      /* Distance from focus to circle at corner angle */
      DIV2(fcorners[i], n);
      ax = fcorners[i].x;
      ay = fcorners[i].y;
      A = ax*ax + ay*ay;
      B = 2 * (fcx*ax + fcy*ay);
      D = B*B - 4*A*C;
      t = (-B + SH_SQRT(D)) / (2*A);
      
      /* Relative offset of boundbox corner */
      if (D <= 0.0f) off = 1.0f;
      else off = n / t;
    }
    
    /* Find smallest and largest offset */
    if (off < minOffset || i==0) minOffset = off;
    if (off > maxOffset || i==0) maxOffset = off;
  }
  
  /* Is transformed focus inside original boundbox? */
  if (f.x >= min->x && f.x <= max->x &&
      f.y >= min->y && f.y <= max->y) {
    
    /* Draw whole circle */
    minOffset = 0.0f;
    startA = 0.0f;
    maxA = 2*PI;
    
  }else{
    
    /* Find most distant corner pair */
    for (i=0; i<3; ++i) {
      if (ISZERO2(fcorners[i])) continue;
      for (j=i+1; j<4; ++j) {
        if (ISZERO2(fcorners[j])) continue;
        a = ANGLE2N(fcorners[i], fcorners[j]);
        if (a > maxA || maxA == 0.0f)
          {maxA=a; maxI=i; maxJ=j;}
      }}
    
    /* Pick starting angle */
    if (CROSS2(fcorners[maxI],fcorners[maxJ]) > 0.0f)
      startA = shVectorOrientation(&fcorners[maxI]);
    else startA = shVectorOrientation(&fcorners[maxJ]);
  }
  
  /*---------------------------------------------------------*/
  
  /* TODO: for minOffset we'd actually need to find minimum
     of the gradient function when X and Y are substitued
     with a line equation for each bound-box edge. As a
     workaround we use 0.0f for now. */
  minOffset = 0.0f;
  step = PI/50;
  numsteps = (SHint)SH_CEIL(maxA / step) + 1;
  
  glActiveTexture(texUnit);
  shSetGradientTexGLState(p);
  
  glEnable(GL_TEXTURE_1D);
  glBegin(GL_QUADS);
  
  /* Walk the steps and draw gradient mesh */
  for (i=0, a=startA; i<numsteps; ++i, a+=step) {
    
    /* Distance from focus to circle border
         at current angle (gradient space) */
    float ax = SH_COS(a);
    float ay = SH_SIN(a);
    float A = ax*ax + ay*ay;
    float B = 2 * (fcx*ax + fcy*ay);
    float D = B*B - 4*A*C;
    float t = (-B + SH_SQRT(D)) / (2*A);
    if (D <= 0.0f) t = 0.0f;
    
    /* Vectors pointing towards minimum and maximum
         offset at current angle (gradient space) */
    tmin.x = ax * t * minOffset;
    tmin.y = ay * t * minOffset;
    tmax.x = ax * t * maxOffset;
    tmax.y = ay * t * maxOffset;
    
    /* Transform back to user space */
    min2.x = f.x + tmin.x * ux.x + tmin.y * uy.x;
    min2.y = f.y + tmin.x * ux.y + tmin.y * uy.y;
    max2.x = f.x + tmax.x * ux.x + tmax.y * uy.x;
    max2.y = f.y + tmax.x * ux.y + tmax.y * uy.y;
    
    /* Draw quad */
    if (i!=0) {
      glMultiTexCoord1f(texUnit, minOffset);
      glVertex2fv((GLfloat*)&min1);
      glVertex2fv((GLfloat*)&min2);
      glMultiTexCoord1f(texUnit, maxOffset);
      glVertex2fv((GLfloat*)&max2);
      glVertex2fv((GLfloat*)&max1);
    }
    
    /* Save prev points */
    min1 = min2;
    max1 = max2;
  }
  
  glEnd();
  glDisable(GL_TEXTURE_1D);

  return 1;
}
static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
                                    int16_t *out) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l;
  v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r;
  v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w;

  /* fdct32 even */
  /* stage 2 */
  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);

  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
               in8, in9, in10, in11, in12, in13, in14, in15,
               vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
               in8, in9, in10, in11, in12, in13, in14, in15);
  ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8);

  /* Stage 3 */
  UNPCK_SH_SW(vec0, vec0_l, vec0_r);
  UNPCK_SH_SW(vec1, vec1_l, vec1_r);
  UNPCK_SH_SW(vec2, vec2_l, vec2_r);
  UNPCK_SH_SW(vec3, vec3_l, vec3_r);
  UNPCK_SH_SW(vec4, vec4_l, vec4_r);
  UNPCK_SH_SW(vec5, vec5_l, vec5_r);
  UNPCK_SH_SW(vec6, vec6_l, vec6_r);
  UNPCK_SH_SW(vec7, vec7_l, vec7_r);
  ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r,
       tmp0_w, tmp1_w, tmp2_w, tmp3_w);
  BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r);
  ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l,
       vec0_r, vec1_r, vec2_r, vec3_r);

  tmp3_w = vec0_r + vec3_r;
  vec0_r = vec0_r - vec3_r;
  vec3_r = vec1_r + vec2_r;
  vec1_r = vec1_r - vec2_r;

  DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64,
                    cospi_16_64, vec4_r, tmp3_w, vec6_r, vec3_r);
  FDCT32_POSTPROC_NEG_W(vec4_r);
  FDCT32_POSTPROC_NEG_W(tmp3_w);
  FDCT32_POSTPROC_NEG_W(vec6_r);
  FDCT32_POSTPROC_NEG_W(vec3_r);
  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
  ST_SH2(vec5, vec4, out, 8);

  DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64,
                    cospi_8_64, vec4_r, tmp3_w, vec6_r, vec3_r);
  FDCT32_POSTPROC_NEG_W(vec4_r);
  FDCT32_POSTPROC_NEG_W(tmp3_w);
  FDCT32_POSTPROC_NEG_W(vec6_r);
  FDCT32_POSTPROC_NEG_W(vec3_r);
  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
  ST_SH2(vec5, vec4, out + 16, 8);

  LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 32);
  ST_SH(in5, out + 56);

  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 40);
  ST_SH(in5, out + 48);

  LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 64);
  ST_SH(in5, out + 120);

  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 72);
  ST_SH(in5, out + 112);

  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 80);
  ST_SH(in5, out + 104);

  ADD2(in3, in2, in0, in1, vec3, vec4);
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 96);
  ST_SH(in5, out + 88);
}
Beispiel #20
0
void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
                        int32_t src_stride) {
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30;
  v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
  v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64,  cospi_24_64,
                  -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 };
  v8i16 coeff1 = { cospi_2_64,  cospi_30_64, cospi_14_64, cospi_18_64,
                   cospi_10_64, cospi_22_64, cospi_6_64,  cospi_26_64 };
  v8i16 coeff2 = {
    -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0
  };

  LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9,
          in10, in11, in12, in13, in14, in15);
  SLLI_4V(in0, in1, in2, in3, 2);
  SLLI_4V(in4, in5, in6, in7, 2);
  SLLI_4V(in8, in9, in10, in11, 2);
  SLLI_4V(in12, in13, in14, in15, 2);
  ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3);
  ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7);
  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32);
  SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12);
  SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8);

  tmp_ptr += 16;

  /* stp 1 */
  ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4);
  ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5);

  cnst4 = __msa_splati_h(coeff, 0);
  stp25 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4);

  cnst5 = __msa_splati_h(coeff, 1);
  cnst5 = __msa_ilvev_h(cnst5, cnst4);
  stp22 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5);
  stp24 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4);
  stp23 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5);

  /* stp2 */
  BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33);
  BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34);
  ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4);
  ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5);
  SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  stp26 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0);

  cnst0 = __msa_splati_h(coeff, 4);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);
  stp21 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1);

  BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9);
  ILVRL_H2_SH(in15, in8, vec1, vec0);
  SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);

  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr);

  cnst0 = __msa_splati_h(coeff2, 0);
  cnst0 = __msa_ilvev_h(cnst1, cnst0);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 224);

  ILVRL_H2_SH(in14, in9, vec1, vec0);
  SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);

  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
  ST_SH(in8, tmp_ptr + 128);

  cnst1 = __msa_splati_h(coeff2, 2);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 96);

  SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);

  stp25 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);

  cnst1 = __msa_splati_h(coeff, 3);
  cnst1 = __msa_ilvev_h(cnst0, cnst1);
  stp22 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);

  /* stp4 */
  ADD2(stp34, stp25, stp33, stp22, in13, in10);

  ILVRL_H2_SH(in13, in10, vec1, vec0);
  SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 64);

  cnst0 = __msa_splati_h(coeff2, 1);
  cnst0 = __msa_ilvev_h(cnst1, cnst0);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 160);

  SUB2(stp34, stp25, stp33, stp22, in12, in11);
  ILVRL_H2_SH(in12, in11, vec1, vec0);
  SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);

  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
  ST_SH(in8, tmp_ptr + 192);

  cnst1 = __msa_splati_h(coeff2, 3);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 32);
}
Beispiel #21
0
double
SECTION
__ieee754_log(double x) {
#define M 4
  static const int pr[M]={8,10,18,32};
  int i,j,n,ux,dx,p;
#if 0
  int k;
#endif
  double dbl_n,u,p0,q,r0,w,nln2a,luai,lubi,lvaj,lvbj,
	 sij,ssij,ttij,A,B,B0,y,y1,y2,polI,polII,sa,sb,
	 t1,t2,t7,t8,t,ra,rb,ww,
	 a0,aa0,s1,s2,ss2,s3,ss3,a1,aa1,a,aa,b,bb,c;
#ifndef DLA_FMS
  double t3,t4,t5,t6;
#endif
  number num;
  mp_no mpx,mpy,mpy1,mpy2,mperr;

#include "ulog.tbl"
#include "ulog.h"

  /* Treating special values of x ( x<=0, x=INF, x=NaN etc.). */

  num.d = x;  ux = num.i[HIGH_HALF];  dx = num.i[LOW_HALF];
  n=0;
  if (__builtin_expect(ux < 0x00100000, 0)) {
    if (__builtin_expect(((ux & 0x7fffffff) | dx) == 0, 0))
      return MHALF/ZERO; /* return -INF */
    if (__builtin_expect(ux < 0, 0))
      return (x-x)/ZERO;                         /* return NaN  */
    n -= 54;    x *= two54.d;                              /* scale x     */
    num.d = x;
  }
  if (__builtin_expect(ux >= 0x7ff00000, 0))
    return x+x;                        /* INF or NaN  */

  /* Regular values of x */

  w = x-ONE;
  if (__builtin_expect(ABS(w) > U03, 1)) { goto case_03; }


  /*--- Stage I, the case abs(x-1) < 0.03 */

  t8 = MHALF*w;
  EMULV(t8,w,a,aa,t1,t2,t3,t4,t5)
  EADD(w,a,b,bb)

  /* Evaluate polynomial II */
  polII = (b0.d+w*(b1.d+w*(b2.d+w*(b3.d+w*(b4.d+
	  w*(b5.d+w*(b6.d+w*(b7.d+w*b8.d))))))))*w*w*w;
  c = (aa+bb)+polII;

  /* End stage I, case abs(x-1) < 0.03 */
  if ((y=b+(c+b*E2)) == b+(c-b*E2))  return y;

  /*--- Stage II, the case abs(x-1) < 0.03 */

  a = d11.d+w*(d12.d+w*(d13.d+w*(d14.d+w*(d15.d+w*(d16.d+
	    w*(d17.d+w*(d18.d+w*(d19.d+w*d20.d))))))));
  EMULV(w,a,s2,ss2,t1,t2,t3,t4,t5)
  ADD2(d10.d,dd10.d,s2,ss2,s3,ss3,t1,t2)
  MUL2(w,ZERO,s3,ss3,s2,ss2,t1,t2,t3,t4,t5,t6,t7,t8)
  ADD2(d9.d,dd9.d,s2,ss2,s3,ss3,t1,t2)
  MUL2(w,ZERO,s3,ss3,s2,ss2,t1,t2,t3,t4,t5,t6,t7,t8)
  ADD2(d8.d,dd8.d,s2,ss2,s3,ss3,t1,t2)
  MUL2(w,ZERO,s3,ss3,s2,ss2,t1,t2,t3,t4,t5,t6,t7,t8)
  ADD2(d7.d,dd7.d,s2,ss2,s3,ss3,t1,t2)
  MUL2(w,ZERO,s3,ss3,s2,ss2,t1,t2,t3,t4,t5,t6,t7,t8)
  ADD2(d6.d,dd6.d,s2,ss2,s3,ss3,t1,t2)
  MUL2(w,ZERO,s3,ss3,s2,ss2,t1,t2,t3,t4,t5,t6,t7,t8)
  ADD2(d5.d,dd5.d,s2,ss2,s3,ss3,t1,t2)
  MUL2(w,ZERO,s3,ss3,s2,ss2,t1,t2,t3,t4,t5,t6,t7,t8)
  ADD2(d4.d,dd4.d,s2,ss2,s3,ss3,t1,t2)
  MUL2(w,ZERO,s3,ss3,s2,ss2,t1,t2,t3,t4,t5,t6,t7,t8)
  ADD2(d3.d,dd3.d,s2,ss2,s3,ss3,t1,t2)
  MUL2(w,ZERO,s3,ss3,s2,ss2,t1,t2,t3,t4,t5,t6,t7,t8)
  ADD2(d2.d,dd2.d,s2,ss2,s3,ss3,t1,t2)
  MUL2(w,ZERO,s3,ss3,s2,ss2,t1,t2,t3,t4,t5,t6,t7,t8)
  MUL2(w,ZERO,s2,ss2,s3,ss3,t1,t2,t3,t4,t5,t6,t7,t8)
  ADD2(w,ZERO,    s3,ss3, b, bb,t1,t2)

  /* End stage II, case abs(x-1) < 0.03 */
  if ((y=b+(bb+b*E4)) == b+(bb-b*E4))  return y;
  goto stage_n;

  /*--- Stage I, the case abs(x-1) > 0.03 */
  case_03:

  /* Find n,u such that x = u*2**n,   1/sqrt(2) < u < sqrt(2)  */
  n += (num.i[HIGH_HALF] >> 20) - 1023;
  num.i[HIGH_HALF] = (num.i[HIGH_HALF] & 0x000fffff) | 0x3ff00000;
  if (num.d > SQRT_2) { num.d *= HALF;  n++; }
  u = num.d;  dbl_n = (double) n;

  /* Find i such that ui=1+(i-75)/2**8 is closest to u (i= 0,1,2,...,181) */
  num.d += h1.d;
  i = (num.i[HIGH_HALF] & 0x000fffff) >> 12;

  /* Find j such that vj=1+(j-180)/2**16 is closest to v=u/ui (j= 0,...,361) */
  num.d = u*Iu[i].d + h2.d;
  j = (num.i[HIGH_HALF] & 0x000fffff) >> 4;

  /* Compute w=(u-ui*vj)/(ui*vj) */
  p0=(ONE+(i-75)*DEL_U)*(ONE+(j-180)*DEL_V);
  q=u-p0;   r0=Iu[i].d*Iv[j].d;   w=q*r0;

  /* Evaluate polynomial I */
  polI = w+(a2.d+a3.d*w)*w*w;

  /* Add up everything */
  nln2a = dbl_n*LN2A;
  luai  = Lu[i][0].d;   lubi  = Lu[i][1].d;
  lvaj  = Lv[j][0].d;   lvbj  = Lv[j][1].d;
  EADD(luai,lvaj,sij,ssij)
  EADD(nln2a,sij,A  ,ttij)
  B0 = (((lubi+lvbj)+ssij)+ttij)+dbl_n*LN2B;
  B  = polI+B0;

  /* End stage I, case abs(x-1) >= 0.03 */
  if ((y=A+(B+E1)) == A+(B-E1))  return y;


  /*--- Stage II, the case abs(x-1) > 0.03 */

  /* Improve the accuracy of r0 */
  EMULV(p0,r0,sa,sb,t1,t2,t3,t4,t5)
  t=r0*((ONE-sa)-sb);
  EADD(r0,t,ra,rb)

  /* Compute w */
  MUL2(q,ZERO,ra,rb,w,ww,t1,t2,t3,t4,t5,t6,t7,t8)

  EADD(A,B0,a0,aa0)

  /* Evaluate polynomial III */
  s1 = (c3.d+(c4.d+c5.d*w)*w)*w;
  EADD(c2.d,s1,s2,ss2)
  MUL2(s2,ss2,w,ww,s3,ss3,t1,t2,t3,t4,t5,t6,t7,t8)
  MUL2(s3,ss3,w,ww,s2,ss2,t1,t2,t3,t4,t5,t6,t7,t8)
  ADD2(s2,ss2,w,ww,s3,ss3,t1,t2)
  ADD2(s3,ss3,a0,aa0,a1,aa1,t1,t2)

  /* End stage II, case abs(x-1) >= 0.03 */
  if ((y=a1+(aa1+E3)) == a1+(aa1-E3)) return y;


  /* Final stages. Use multi-precision arithmetic. */
  stage_n:

  for (i=0; i<M; i++) {
    p = pr[i];
    __dbl_mp(x,&mpx,p);  __dbl_mp(y,&mpy,p);
    __mplog(&mpx,&mpy,p);
    __dbl_mp(e[i].d,&mperr,p);
    __add(&mpy,&mperr,&mpy1,p);  __sub(&mpy,&mperr,&mpy2,p);
    __mp_dbl(&mpy1,&y1,p);       __mp_dbl(&mpy2,&y2,p);
    if (y1==y2)   return y1;
  }
  return y1;
}
static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr,
                                   int16_t *out) {
  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
  v8i16 in24, in25, in26, in27, in28, in29, in30, in31;
  v8i16 vec4, vec5;

  in20 = LD_SH(temp + 32);
  in21 = LD_SH(temp + 40);
  in26 = LD_SH(temp + 80);
  in27 = LD_SH(temp + 88);

  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);

  FDCT_POSTPROC_2V_NEG_H(in20, in21);
  FDCT_POSTPROC_2V_NEG_H(in26, in27);

  in18 = LD_SH(temp + 16);
  in19 = LD_SH(temp + 24);
  in28 = LD_SH(temp + 96);
  in29 = LD_SH(temp + 104);

  FDCT_POSTPROC_2V_NEG_H(in18, in19);
  FDCT_POSTPROC_2V_NEG_H(in28, in29);

  vec4 = in19 - in20;
  ST_SH(vec4, interm_ptr + 32);
  vec4 = in18 - in21;
  ST_SH(vec4, interm_ptr + 88);
  vec4 = in29 - in26;
  ST_SH(vec4, interm_ptr + 64);
  vec4 = in28 - in27;
  ST_SH(vec4, interm_ptr + 56);

  ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);

  in22 = LD_SH(temp + 48);
  in23 = LD_SH(temp + 56);
  in24 = LD_SH(temp + 64);
  in25 = LD_SH(temp + 72);

  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
  FDCT_POSTPROC_2V_NEG_H(in22, in23);
  FDCT_POSTPROC_2V_NEG_H(in24, in25);

  in16 = LD_SH(temp);
  in17 = LD_SH(temp + 8);
  in30 = LD_SH(temp + 112);
  in31 = LD_SH(temp + 120);

  FDCT_POSTPROC_2V_NEG_H(in16, in17);
  FDCT_POSTPROC_2V_NEG_H(in30, in31);

  vec4 = in17 - in22;
  ST_SH(vec4, interm_ptr + 40);
  vec4 = in30 - in25;
  ST_SH(vec4, interm_ptr + 48);
  vec4 = in31 - in24;
  ST_SH(vec4, interm_ptr + 72);
  vec4 = in16 - in23;
  ST_SH(vec4, interm_ptr + 80);

  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
  ADD2(in27, in26, in25, in24, in23, in20);
  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
  ST_SH(vec5, out);
  ST_SH(vec4, out + 120);

  SUB2(in27, in26, in25, in24, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
  ST_SH(vec5, out + 112);
  ST_SH(vec4, out + 8);

  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
  SUB2(in26, in27, in24, in25, in23, in20);
  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
  ST_SH(vec4, out + 16);
  ST_SH(vec5, out + 104);

  ADD2(in26, in27, in24, in25, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
  ST_SH(vec4, out + 24);
  ST_SH(vec5, out + 96);

  in20 = LD_SH(interm_ptr + 32);
  in21 = LD_SH(interm_ptr + 88);
  in27 = LD_SH(interm_ptr + 56);
  in26 = LD_SH(interm_ptr + 64);

  in16 = in20;
  in17 = in21;
  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);

  in22 = LD_SH(interm_ptr + 40);
  in25 = LD_SH(interm_ptr + 48);
  in24 = LD_SH(interm_ptr + 72);
  in23 = LD_SH(interm_ptr + 80);

  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
  in16 = in28 + in29;
  in19 = in31 + in30;
  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
  ST_SH(vec5, out + 32);
  ST_SH(vec4, out + 88);

  SUB2(in28, in29, in31, in30, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
  ST_SH(vec5, out + 40);
  ST_SH(vec4, out + 80);

  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
  SUB2(in29, in28, in30, in31, in16, in19);
  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
  ST_SH(vec5, out + 72);
  ST_SH(vec4, out + 48);

  ADD2(in29, in28, in30, in31, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
  ST_SH(vec4, out + 56);
  ST_SH(vec5, out + 64);
}
static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;

  /* fdct32 even */
  /* stage 2 */
  LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);

  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
               in8, in9, in10, in11, in12, in13, in14, in15,
               vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
               in8, in9, in10, in11, in12, in13, in14, in15);
  FDCT_POSTPROC_2V_NEG_H(vec0, vec1);
  FDCT_POSTPROC_2V_NEG_H(vec2, vec3);
  FDCT_POSTPROC_2V_NEG_H(vec4, vec5);
  FDCT_POSTPROC_2V_NEG_H(vec6, vec7);
  FDCT_POSTPROC_2V_NEG_H(in8, in9);
  FDCT_POSTPROC_2V_NEG_H(in10, in11);
  FDCT_POSTPROC_2V_NEG_H(in12, in13);
  FDCT_POSTPROC_2V_NEG_H(in14, in15);

  /* Stage 3 */
  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);

  temp0 = in0 + in3;
  in0 = in0 - in3;
  in3 = in1 + in2;
  in1 = in1 - in2;

  DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0);
  ST_SH(temp0, out);
  ST_SH(temp1, out + 8);

  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
  ST_SH(temp0, out + 16);
  ST_SH(temp1, out + 24);

  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
  ST_SH(temp0, out + 32);
  ST_SH(temp1, out + 56);

  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
  ST_SH(temp0, out + 40);
  ST_SH(temp1, out + 48);

  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
  ST_SH(temp0, out + 64);
  ST_SH(temp1, out + 120);

  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
  ST_SH(temp0, out + 72);
  ST_SH(temp1, out + 112);

  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
  ST_SH(temp0, out + 80);
  ST_SH(temp1, out + 104);

  ADD2(in3, in2, in0, in1, vec3, vec4);
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
  ST_SH(temp0, out + 96);
  ST_SH(temp1, out + 88);
}
static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 temp0, temp1;

  /* fdct even */
  LD_SH4(input, 8, in0, in1, in2, in3);
  LD_SH4(input + 96, 8, in12, in13, in14, in15);
  BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15,
              vec0, vec1, vec2, vec3, in12, in13, in14, in15);
  LD_SH4(input + 32, 8, in4, in5, in6, in7);
  LD_SH4(input + 64, 8, in8, in9, in10, in11);
  BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11,
              vec4, vec5, vec6, vec7, in8, in9, in10, in11);

  /* Stage 3 */
  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
  BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
  DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp);
  ST_SH(temp1, temp + 512);

  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 256);
  ST_SH(temp1, temp + 768);

  SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 128);
  ST_SH(temp1, temp + 896);

  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 640);
  ST_SH(temp1, temp + 384);

  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 64);
  ST_SH(temp1, temp + 960);

  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 576);
  ST_SH(temp1, temp + 448);

  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 320);
  ST_SH(temp1, temp + 704);

  ADD2(in3, in2, in0, in1, vec3, vec4);
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 192);
  ST_SH(temp1, temp + 832);
}
Beispiel #25
0
void __dubcos(double x, double dx, double v[]) {
  double r,s,p,hx,tx,hy,ty,q,c,cc,d,dd,d2,dd2,e,ee,
    sn,ssn,cs,ccs,ds,dss,dc,dcc;
#if 0
  double xx,y,yy,z,zz;
#endif
  mynumber u;
  int4 k;
  u.x=x+big.x;
  k = u.i[LOW_HALF]<<2;
  x=x-(u.x-big.x);
  d=x+dx;
  dd=(x-d)+dx;  /* cos(x+dx)=cos(Xi+t)=cos(Xi)cos(t) - sin(Xi)sin(t) */
  MUL2(d,dd,d,dd,d2,dd2,p,hx,tx,hy,ty,q,c,cc);
  sn=sincos.x[k];     /*                                  */
  ssn=sincos.x[k+1];  /*      sin(Xi) and cos(Xi)         */
  cs=sincos.x[k+2];   /*                                  */
  ccs=sincos.x[k+3];  /*                                  */
  MUL2(d2,dd2,s7.x,ss7.x,ds,dss,p,hx,tx,hy,ty,q,c,cc);
  ADD2(ds,dss,s5.x,ss5.x,ds,dss,r,s);
  MUL2(d2,dd2,ds,dss,ds,dss,p,hx,tx,hy,ty,q,c,cc);
  ADD2(ds,dss,s3.x,ss3.x,ds,dss,r,s);
  MUL2(d2,dd2,ds,dss,ds,dss,p,hx,tx,hy,ty,q,c,cc);
  MUL2(d,dd,ds,dss,ds,dss,p,hx,tx,hy,ty,q,c,cc);
  ADD2(ds,dss,d,dd,ds,dss,r,s);

  MUL2(d2,dd2,c8.x,cc8.x,dc,dcc,p,hx,tx,hy,ty,q,c,cc);
  ADD2(dc,dcc,c6.x,cc6.x,dc,dcc,r,s);
  MUL2(d2,dd2,dc,dcc,dc,dcc,p,hx,tx,hy,ty,q,c,cc);
  ADD2(dc,dcc,c4.x,cc4.x,dc,dcc,r,s);
  MUL2(d2,dd2,dc,dcc,dc,dcc,p,hx,tx,hy,ty,q,c,cc);
  ADD2(dc,dcc,c2.x,cc2.x,dc,dcc,r,s);
  MUL2(d2,dd2,dc,dcc,dc,dcc,p,hx,tx,hy,ty,q,c,cc);

  MUL2(cs,ccs,ds,dss,e,ee,p,hx,tx,hy,ty,q,c,cc);
  MUL2(dc,dcc,sn,ssn,dc,dcc,p,hx,tx,hy,ty,q,c,cc);

  MUL2(d2,dd2,s7.x,ss7.x,ds,dss,p,hx,tx,hy,ty,q,c,cc);
  ADD2(ds,dss,s5.x,ss5.x,ds,dss,r,s);
  MUL2(d2,dd2,ds,dss,ds,dss,p,hx,tx,hy,ty,q,c,cc);
  ADD2(ds,dss,s3.x,ss3.x,ds,dss,r,s);
  MUL2(d2,dd2,ds,dss,ds,dss,p,hx,tx,hy,ty,q,c,cc);
  MUL2(d,dd,ds,dss,ds,dss,p,hx,tx,hy,ty,q,c,cc);
  ADD2(ds,dss,d,dd,ds,dss,r,s);
  MUL2(d2,dd2,c8.x,cc8.x,dc,dcc,p,hx,tx,hy,ty,q,c,cc);
  ADD2(dc,dcc,c6.x,cc6.x,dc,dcc,r,s);
  MUL2(d2,dd2,dc,dcc,dc,dcc,p,hx,tx,hy,ty,q,c,cc);
  ADD2(dc,dcc,c4.x,cc4.x,dc,dcc,r,s);
  MUL2(d2,dd2,dc,dcc,dc,dcc,p,hx,tx,hy,ty,q,c,cc);
  ADD2(dc,dcc,c2.x,cc2.x,dc,dcc,r,s);
  MUL2(d2,dd2,dc,dcc,dc,dcc,p,hx,tx,hy,ty,q,c,cc);
  MUL2(sn,ssn,ds,dss,e,ee,p,hx,tx,hy,ty,q,c,cc);
  MUL2(dc,dcc,cs,ccs,dc,dcc,p,hx,tx,hy,ty,q,c,cc);
  ADD2(e,ee,dc,dcc,e,ee,r,s);
  SUB2(cs,ccs,e,ee,e,ee,r,s);

  v[0]=e;
  v[1]=ee;
}
Beispiel #26
0
static void shDrawPaintMesh(VGContext *c, SHVector2 *min, SHVector2 *max,
                            VGPaintMode mode, GLenum texUnit)
{
  SHPaint *p;
  SHVector2 pmin, pmax;
  SHfloat K = 1.0f;
#ifdef ANDROIDVG
	SHColor *color;
	GLfloat v[6][2];
#endif
  
  /* Pick the right paint */
  if (mode == VG_FILL_PATH) {
    p = (c->fillPaint ? c->fillPaint : &c->defaultPaint);
  }else if (mode == VG_STROKE_PATH) {
    p = (c->strokePaint ? c->strokePaint : &c->defaultPaint);
    K = SH_CEIL(c->strokeMiterLimit * c->strokeLineWidth) + 1.0f;
  }
  
  /* We want to be sure to cover every pixel of this path so better
     take a pixel more than leave some out (multisampling is tricky). */
  SET2V(pmin, (*min)); SUB2(pmin, K,K);
  SET2V(pmax, (*max)); ADD2(pmax, K,K);

  /* Construct appropriate OpenGL primitives so as
     to fill the stencil mask with select paint */

  switch (p->type) {
  case VG_PAINT_TYPE_LINEAR_GRADIENT:
    shDrawLinearGradientMesh(p, min, max, mode, texUnit);
    break;

  case VG_PAINT_TYPE_RADIAL_GRADIENT:
    shDrawRadialGradientMesh(p, min, max, mode, texUnit);
    break;
    
  case VG_PAINT_TYPE_PATTERN:
    if (p->pattern != VG_INVALID_HANDLE) {
      shDrawPatternMesh(p, min, max, mode, texUnit);
      break;
    }/* else behave as a color paint */
  
  case VG_PAINT_TYPE_COLOR:
#ifdef ANDROIDVG
	v[0][0] = pmin.x; v[0][1] = pmin.y;
	v[1][0] = pmax.x; v[1][1] = pmin.y;
	v[2][0] = pmax.x; v[2][1] = pmax.y;
	v[3][0] = pmin.x; v[3][1] = pmin.y;
	v[4][0] = pmax.x; v[4][1] = pmax.y;
	v[5][0] = pmin.x; v[5][1] = pmax.y;
	color = &p->color;
	glColor4f(color->r, color->g, color->b, color->a);
	glEnableClientState(GL_VERTEX_ARRAY);
	glVertexPointer(2, GL_FLOAT, 0, v); 
	glDrawArrays(GL_TRIANGLES, 0, 6); 
	glDisableClientState(GL_VERTEX_ARRAY);
#else
	glColor4fv((GLfloat*)&p->color);
	glBegin(GL_QUADS);
	glVertex2f(pmin.x, pmin.y);
	glVertex2f(pmax.x, pmin.y);
	glVertex2f(pmax.x, pmax.y);
	glVertex2f(pmin.x, pmax.y);
	glEnd();
#endif
    break;
  }
}
static void temporal_filter_apply_8size_msa(uint8_t *frame1_ptr,
                                            uint32_t stride,
                                            uint8_t *frame2_ptr,
                                            int32_t strength_in,
                                            int32_t filter_wt_in,
                                            uint32_t *acc, uint16_t *cnt)
{
    uint32_t row;
    uint64_t f0, f1, f2, f3, f4, f5, f6, f7;
    v16i8 frame1 = { 0 };
    v16i8 frame2 = { 0 };
    v16i8 frame3 = { 0 };
    v16i8 frame4 = { 0 };
    v16u8 frame_l, frame_h;
    v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
    v8i16 diff0, diff1, cnt0, cnt1;
    v4i32 const3, const16;
    v4i32 filter_wt, strength;
    v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
    v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
    v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
    v4i32 acc0, acc1, acc2, acc3;

    filter_wt = __msa_fill_w(filter_wt_in);
    strength = __msa_fill_w(strength_in);
    const3 = __msa_ldi_w(3);
    const16 = __msa_ldi_w(16);

    for (row = 2; row--;)
    {
        LD2(frame1_ptr, stride, f0, f1);
        frame1_ptr += (2 * stride);
        LD2(frame2_ptr, 8, f2, f3);
        frame2_ptr += 16;
        LD2(frame1_ptr, stride, f4, f5);
        frame1_ptr += (2 * stride);
        LD2(frame2_ptr, 8, f6, f7);
        frame2_ptr += 16;

        LD_SW2(acc, 4, acc0, acc1);
        LD_SW2(acc + 8, 4, acc2, acc3);
        LD_SH2(cnt, 8, cnt0, cnt1);
        INSERT_D2_SB(f0, f1, frame1);
        INSERT_D2_SB(f2, f3, frame2);
        INSERT_D2_SB(f4, f5, frame3);
        INSERT_D2_SB(f6, f7, frame4);
        ILVRL_B2_UB(frame1, frame2, frame_l, frame_h);
        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
        diff0_r = (mod0_w < const16);
        diff0_l = (mod1_w < const16);
        diff1_r = (mod2_w < const16);
        diff1_l = (mod3_w < const16);
        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
             mod0_w, mod1_w, mod2_w, mod3_w);
        mod0_w = diff0_r & mod0_w;
        mod1_w = diff0_l & mod1_w;
        mod2_w = diff1_r & mod2_w;
        mod3_w = diff1_l & mod3_w;
        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
        ST_SH2(mod0_h, mod1_h, cnt, 8);
        cnt += 16;

        UNPCK_UB_SH(frame2, frame2_0_h, frame2_1_h);
        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        ST_SW2(mod0_w, mod1_w, acc, 4);
        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
        acc += 16;

        LD_SW2(acc, 4, acc0, acc1);
        LD_SW2(acc + 8, 4, acc2, acc3);
        LD_SH2(cnt, 8, cnt0, cnt1);
        ILVRL_B2_UB(frame3, frame4, frame_l, frame_h);
        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
        diff0_r = (mod0_w < const16);
        diff0_l = (mod1_w < const16);
        diff1_r = (mod2_w < const16);
        diff1_l = (mod3_w < const16);
        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
             mod0_w, mod1_w, mod2_w, mod3_w);
        mod0_w = diff0_r & mod0_w;
        mod1_w = diff0_l & mod1_w;
        mod2_w = diff1_r & mod2_w;
        mod3_w = diff1_l & mod3_w;
        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
        ST_SH2(mod0_h, mod1_h, cnt, 8);
        cnt += 16;

        UNPCK_UB_SH(frame4, frame2_0_h, frame2_1_h);
        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        ST_SW2(mod0_w, mod1_w, acc, 4);
        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
        acc += 16;
    }
}
static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) {
  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
  v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;

  in20 = LD_SH(input + 32);
  in21 = LD_SH(input + 40);
  in26 = LD_SH(input + 80);
  in27 = LD_SH(input + 88);

  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);

  in18 = LD_SH(input + 16);
  in19 = LD_SH(input + 24);
  in28 = LD_SH(input + 96);
  in29 = LD_SH(input + 104);

  vec4 = in19 - in20;
  ST_SH(vec4, input + 32);
  vec4 = in18 - in21;
  ST_SH(vec4, input + 40);
  vec4 = in29 - in26;
  ST_SH(vec4, input + 80);
  vec4 = in28 - in27;
  ST_SH(vec4, input + 88);

  in21 = in18 + in21;
  in20 = in19 + in20;
  in27 = in28 + in27;
  in26 = in29 + in26;

  LD_SH4(input + 48, 8, in22, in23, in24, in25);
  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);

  in16 = LD_SH(input);
  in17 = LD_SH(input + 8);
  in30 = LD_SH(input + 112);
  in31 = LD_SH(input + 120);

  vec4 = in17 - in22;
  ST_SH(vec4, input + 16);
  vec4 = in16 - in23;
  ST_SH(vec4, input + 24);
  vec4 = in31 - in24;
  ST_SH(vec4, input + 96);
  vec4 = in30 - in25;
  ST_SH(vec4, input + 104);

  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
  ADD2(in27, in26, in25, in24, in23, in20);
  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr);
  ST_SH(vec4, temp_ptr + 960);

  SUB2(in27, in26, in25, in24, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 448);
  ST_SH(vec4, temp_ptr + 512);

  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
  SUB2(in26, in27, in24, in25, in23, in20);
  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec4, temp_ptr + 704);
  ST_SH(vec5, temp_ptr + 256);

  ADD2(in26, in27, in24, in25, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec4, temp_ptr + 192);
  ST_SH(vec5, temp_ptr + 768);

  LD_SH4(input + 16, 8, in22, in23, in20, in21);
  LD_SH4(input + 80, 8, in26, in27, in24, in25);
  in16 = in20;
  in17 = in21;
  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
  ADD2(in28, in29, in31, in30, in16, in19);
  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 832);
  ST_SH(vec4, temp_ptr + 128);

  SUB2(in28, in29, in31, in30, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 320);
  ST_SH(vec4, temp_ptr + 640);
  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
  SUB2(in29, in28, in30, in31, in16, in19);
  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 576);
  ST_SH(vec4, temp_ptr + 384);

  ADD2(in29, in28, in30, in31, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 64);
  ST_SH(vec4, temp_ptr + 896);
}
static void temporal_filter_apply_16size_msa(uint8_t *frame1_ptr,
                                             uint32_t stride,
                                             uint8_t *frame2_ptr,
                                             int32_t strength_in,
                                             int32_t filter_wt_in,
                                             uint32_t *acc, uint16_t *cnt)
{
    uint32_t row;
    v16i8 frame1_0_b, frame1_1_b, frame2_0_b, frame2_1_b;
    v16u8 frame_l, frame_h;
    v16i8 zero = { 0 };
    v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
    v8i16 diff0, diff1, cnt0, cnt1;
    v4i32 const3, const16, filter_wt, strength;
    v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
    v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
    v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
    v4i32 acc0, acc1, acc2, acc3;

    filter_wt = __msa_fill_w(filter_wt_in);
    strength = __msa_fill_w(strength_in);
    const3 = __msa_ldi_w(3);
    const16 = __msa_ldi_w(16);

    for (row = 8; row--;)
    {
        frame1_0_b = LD_SB(frame1_ptr);
        frame2_0_b = LD_SB(frame2_ptr);
        frame1_ptr += stride;
        frame2_ptr += 16;
        frame1_1_b = LD_SB(frame1_ptr);
        frame2_1_b = LD_SB(frame2_ptr);
        LD_SW2(acc, 4, acc0, acc1);
        LD_SW2(acc + 8, 4, acc2, acc3);
        LD_SH2(cnt, 8, cnt0, cnt1);
        ILVRL_B2_UB(frame1_0_b, frame2_0_b, frame_l, frame_h);
        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
        diff0_r = (mod0_w < const16);
        diff0_l = (mod1_w < const16);
        diff1_r = (mod2_w < const16);
        diff1_l = (mod3_w < const16);
        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
             mod0_w, mod1_w, mod2_w, mod3_w);
        mod0_w = diff0_r & mod0_w;
        mod1_w = diff0_l & mod1_w;
        mod2_w = diff1_r & mod2_w;
        mod3_w = diff1_l & mod3_w;
        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h)
        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
        ST_SH2(mod0_h, mod1_h, cnt, 8);
        cnt += 16;
        ILVRL_B2_SH(zero, frame2_0_b, frame2_0_h, frame2_1_h);
        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        ST_SW2(mod0_w, mod1_w, acc, 4);
        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
        acc += 16;
        LD_SW2(acc, 4, acc0, acc1);
        LD_SW2(acc + 8, 4, acc2, acc3);
        LD_SH2(cnt, 8, cnt0, cnt1);
        ILVRL_B2_UB(frame1_1_b, frame2_1_b, frame_l, frame_h);
        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
        diff0_r = (mod0_w < const16);
        diff0_l = (mod1_w < const16);
        diff1_r = (mod2_w < const16);
        diff1_l = (mod3_w < const16);
        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
             mod0_w, mod1_w, mod2_w, mod3_w);
        mod0_w = diff0_r & mod0_w;
        mod1_w = diff0_l & mod1_w;
        mod2_w = diff1_r & mod2_w;
        mod3_w = diff1_l & mod3_w;
        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
        ST_SH2(mod0_h, mod1_h, cnt, 8);
        cnt += 16;

        UNPCK_UB_SH(frame2_1_b, frame2_0_h, frame2_1_h);
        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        ST_SW2(mod0_w, mod1_w, acc, 4);
        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
        acc += 16;
        frame1_ptr += stride;
        frame2_ptr += 16;
    }
}
Beispiel #30
0
/* routine computes the correctly rounded (to nearest) value of atan(x). */
double
atan (double x)
{
  double cor, s1, ss1, s2, ss2, t1, t2, t3, t7, t8, t9, t10, u, u2, u3,
	 v, vv, w, ww, y, yy, z, zz;
#ifndef DLA_FMS
  double t4, t5, t6;
#endif
  int i, ux, dx;
  static const int pr[M] = { 6, 8, 10, 32 };
  number num;

  num.d = x;
  ux = num.i[HIGH_HALF];
  dx = num.i[LOW_HALF];

  /* x=NaN */
  if (((ux & 0x7ff00000) == 0x7ff00000)
      && (((ux & 0x000fffff) | dx) != 0x00000000))
    return x + x;

  /* Regular values of x, including denormals +-0 and +-INF */
  SET_RESTORE_ROUND (FE_TONEAREST);
  u = (x < 0) ? -x : x;
  if (u < C)
    {
      if (u < B)
	{
	  if (u < A)
	    {
	      math_check_force_underflow_nonneg (u);
	      return x;
	    }
	  else
	    {			/* A <= u < B */
	      v = x * x;
	      yy = d11.d + v * d13.d;
	      yy = d9.d + v * yy;
	      yy = d7.d + v * yy;
	      yy = d5.d + v * yy;
	      yy = d3.d + v * yy;
	      yy *= x * v;

	      if ((y = x + (yy - U1 * x)) == x + (yy + U1 * x))
		return y;

	      EMULV (x, x, v, vv, t1, t2, t3, t4, t5);	/* v+vv=x^2 */

	      s1 = f17.d + v * f19.d;
	      s1 = f15.d + v * s1;
	      s1 = f13.d + v * s1;
	      s1 = f11.d + v * s1;
	      s1 *= v;

	      ADD2 (f9.d, ff9.d, s1, 0, s2, ss2, t1, t2);
	      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	      ADD2 (f7.d, ff7.d, s1, ss1, s2, ss2, t1, t2);
	      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	      ADD2 (f5.d, ff5.d, s1, ss1, s2, ss2, t1, t2);
	      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	      ADD2 (f3.d, ff3.d, s1, ss1, s2, ss2, t1, t2);
	      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	      MUL2 (x, 0, s1, ss1, s2, ss2, t1, t2, t3, t4, t5, t6, t7,
		    t8);
	      ADD2 (x, 0, s2, ss2, s1, ss1, t1, t2);
	      if ((y = s1 + (ss1 - U5 * s1)) == s1 + (ss1 + U5 * s1))
		return y;

	      return atanMp (x, pr);
	    }
	}
      else
	{			/* B <= u < C */
	  i = (TWO52 + TWO8 * u) - TWO52;
	  i -= 16;
	  z = u - cij[i][0].d;
	  yy = cij[i][5].d + z * cij[i][6].d;
	  yy = cij[i][4].d + z * yy;
	  yy = cij[i][3].d + z * yy;
	  yy = cij[i][2].d + z * yy;
	  yy *= z;

	  t1 = cij[i][1].d;
	  if (i < 112)
	    {
	      if (i < 48)
		u2 = U21;	/* u < 1/4        */
	      else
		u2 = U22;
	    }			/* 1/4 <= u < 1/2 */
	  else
	    {
	      if (i < 176)
		u2 = U23;	/* 1/2 <= u < 3/4 */
	      else
		u2 = U24;
	    }			/* 3/4 <= u <= 1  */
	  if ((y = t1 + (yy - u2 * t1)) == t1 + (yy + u2 * t1))
	    return __signArctan (x, y);

	  z = u - hij[i][0].d;

	  s1 = hij[i][14].d + z * hij[i][15].d;
	  s1 = hij[i][13].d + z * s1;
	  s1 = hij[i][12].d + z * s1;
	  s1 = hij[i][11].d + z * s1;
	  s1 *= z;

	  ADD2 (hij[i][9].d, hij[i][10].d, s1, 0, s2, ss2, t1, t2);
	  MUL2 (z, 0, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	  ADD2 (hij[i][7].d, hij[i][8].d, s1, ss1, s2, ss2, t1, t2);
	  MUL2 (z, 0, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	  ADD2 (hij[i][5].d, hij[i][6].d, s1, ss1, s2, ss2, t1, t2);
	  MUL2 (z, 0, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	  ADD2 (hij[i][3].d, hij[i][4].d, s1, ss1, s2, ss2, t1, t2);
	  MUL2 (z, 0, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	  ADD2 (hij[i][1].d, hij[i][2].d, s1, ss1, s2, ss2, t1, t2);
	  if ((y = s2 + (ss2 - U6 * s2)) == s2 + (ss2 + U6 * s2))
	    return __signArctan (x, y);

	  return atanMp (x, pr);
	}
    }
  else
    {
      if (u < D)
	{			/* C <= u < D */
	  w = 1 / u;
	  EMULV (w, u, t1, t2, t3, t4, t5, t6, t7);
	  ww = w * ((1 - t1) - t2);
	  i = (TWO52 + TWO8 * w) - TWO52;
	  i -= 16;
	  z = (w - cij[i][0].d) + ww;

	  yy = cij[i][5].d + z * cij[i][6].d;
	  yy = cij[i][4].d + z * yy;
	  yy = cij[i][3].d + z * yy;
	  yy = cij[i][2].d + z * yy;
	  yy = HPI1 - z * yy;

	  t1 = HPI - cij[i][1].d;
	  if (i < 112)
	    u3 = U31;           /* w <  1/2 */
	  else
	    u3 = U32;           /* w >= 1/2 */
	  if ((y = t1 + (yy - u3)) == t1 + (yy + u3))
	    return __signArctan (x, y);

	  DIV2 (1, 0, u, 0, w, ww, t1, t2, t3, t4, t5, t6, t7, t8, t9,
		t10);
	  t1 = w - hij[i][0].d;
	  EADD (t1, ww, z, zz);

	  s1 = hij[i][14].d + z * hij[i][15].d;
	  s1 = hij[i][13].d + z * s1;
	  s1 = hij[i][12].d + z * s1;
	  s1 = hij[i][11].d + z * s1;
	  s1 *= z;

	  ADD2 (hij[i][9].d, hij[i][10].d, s1, 0, s2, ss2, t1, t2);
	  MUL2 (z, zz, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	  ADD2 (hij[i][7].d, hij[i][8].d, s1, ss1, s2, ss2, t1, t2);
	  MUL2 (z, zz, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	  ADD2 (hij[i][5].d, hij[i][6].d, s1, ss1, s2, ss2, t1, t2);
	  MUL2 (z, zz, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	  ADD2 (hij[i][3].d, hij[i][4].d, s1, ss1, s2, ss2, t1, t2);
	  MUL2 (z, zz, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	  ADD2 (hij[i][1].d, hij[i][2].d, s1, ss1, s2, ss2, t1, t2);
	  SUB2 (HPI, HPI1, s2, ss2, s1, ss1, t1, t2);
	  if ((y = s1 + (ss1 - U7)) == s1 + (ss1 + U7))
	    return __signArctan (x, y);

	  return atanMp (x, pr);
	}
      else
	{
	  if (u < E)
	    {                   /* D <= u < E */
	      w = 1 / u;
	      v = w * w;
	      EMULV (w, u, t1, t2, t3, t4, t5, t6, t7);

	      yy = d11.d + v * d13.d;
	      yy = d9.d + v * yy;
	      yy = d7.d + v * yy;
	      yy = d5.d + v * yy;
	      yy = d3.d + v * yy;
	      yy *= w * v;

	      ww = w * ((1 - t1) - t2);
	      ESUB (HPI, w, t3, cor);
	      yy = ((HPI1 + cor) - ww) - yy;
	      if ((y = t3 + (yy - U4)) == t3 + (yy + U4))
		return __signArctan (x, y);

	      DIV2 (1, 0, u, 0, w, ww, t1, t2, t3, t4, t5, t6, t7, t8,
		    t9, t10);
	      MUL2 (w, ww, w, ww, v, vv, t1, t2, t3, t4, t5, t6, t7, t8);

	      s1 = f17.d + v * f19.d;
	      s1 = f15.d + v * s1;
	      s1 = f13.d + v * s1;
	      s1 = f11.d + v * s1;
	      s1 *= v;

	      ADD2 (f9.d, ff9.d, s1, 0, s2, ss2, t1, t2);
	      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	      ADD2 (f7.d, ff7.d, s1, ss1, s2, ss2, t1, t2);
	      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	      ADD2 (f5.d, ff5.d, s1, ss1, s2, ss2, t1, t2);
	      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	      ADD2 (f3.d, ff3.d, s1, ss1, s2, ss2, t1, t2);
	      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
	      MUL2 (w, ww, s1, ss1, s2, ss2, t1, t2, t3, t4, t5, t6, t7, t8);
	      ADD2 (w, ww, s2, ss2, s1, ss1, t1, t2);
	      SUB2 (HPI, HPI1, s1, ss1, s2, ss2, t1, t2);

	      if ((y = s2 + (ss2 - U8)) == s2 + (ss2 + U8))
		return __signArctan (x, y);

	      return atanMp (x, pr);
	    }
	  else
	    {
	      /* u >= E */
	      if (x > 0)
		return HPI;
	      else
		return MHPI;
	    }
	}
    }
}