/* Forward 4x4 Walsh-Hadamard transform, MSA-accelerated.
   Reads a 4x4 block of int16 residuals (rows src_stride apart) and writes
   16 transformed coefficients to 'output', scaled by 4 (the SLLI by 2). */
void vp9_fwht4x4_msa(const int16_t *input, int16_t *output, int32_t src_stride) {
  v8i16 in0, in1, in2, in3, in4;

  LD_SH4(input, src_stride, in0, in1, in2, in3);
  /* First butterfly pass (columns). */
  in0 += in1;
  in3 -= in2;
  in4 = (in0 - in3) >> 1;
  SUB2(in4, in1, in4, in2, in1, in2);
  in0 -= in2;
  in3 += in1;
  TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1);
  /* Second butterfly pass (rows, on the transposed data). */
  in0 += in2;
  in1 -= in3;
  in4 = (in0 - in1) >> 1;
  SUB2(in4, in2, in4, in3, in2, in3);
  in0 -= in3;
  in1 += in2;
  /* Scale all outputs by 4 (WHT output convention). */
  SLLI_4V(in0, in1, in2, in3, 2);
  TRANSPOSE4x4_SH_SH(in0, in3, in1, in2, in0, in3, in1, in2);
  ST4x2_UB(in0, output, 4);
  ST4x2_UB(in3, output + 4, 4);
  ST4x2_UB(in1, output + 8, 4);
  ST4x2_UB(in2, output + 12, 4);
}
// Computes the residual dst[i] = src[i] - pred[i] over 'length' bytes,
// using MSA vectors for the bulk of the row and scalar code for the tail.
static WEBP_INLINE void PredictLineInverse0(const uint8_t* src, const uint8_t* pred, uint8_t* dst, int length) {
  v16u8 src0, pred0, dst0;
  assert(length >= 0);
  // Main loop: two 16-byte vectors (32 bytes) per iteration.
  for (; length >= 32; length -= 32) {
    v16u8 src1, pred1, dst1;
    LD_UB2(src, 16, src0, src1);
    LD_UB2(pred, 16, pred0, pred1);
    SUB2(src0, pred0, src1, pred1, dst0, dst1);
    ST_UB2(dst0, dst1, dst, 16);
    src += 32;
    pred += 32;
    dst += 32;
  }
  // One extra 16-byte vector if at least 16 bytes remain.
  if (length >= 16) {
    src0 = LD_UB(src);
    pred0 = LD_UB(pred);
    dst0 = src0 - pred0;
    ST_UB(dst0, dst);
    src += 16;
    pred += 16;
    dst += 16;
    length -= 16;
  }
  // Scalar tail for the last (length < 16) bytes.
  {
    int i;
    for (i = 0; i < length; ++i) {
      dst[i] = src[i] - pred[i];
    }
  }
}
// Computes the gradient-predictor residual for one row of 'size' pixels:
//   poutput[w] = pinput[w] - clip(ppred[w-1] + ppred[w-stride] - ppred[w-stride-1])
// where clip() clamps the prediction to [0, 255].  The bulk of the row is
// processed 16 pixels at a time with MSA; the remainder uses scalar code.
// NOTE(review): assumes ppred has valid pixels at w-1 and in the previous
// row (w-stride, w-stride-1) — i.e. not the first row/column; confirm at callers.
static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput, const uint8_t* ppred, uint8_t* poutput, int stride, int size) {
  int w;
  const v16i8 zero = { 0 };
  while (size >= 16) {
    v16u8 pred0, dst0;
    v8i16 a0, a1, b0, b1, c0, c1;
    const v16u8 tmp0 = LD_UB(ppred - 1);           // left neighbors
    const v16u8 tmp1 = LD_UB(ppred - stride);      // top neighbors
    const v16u8 tmp2 = LD_UB(ppred - stride - 1);  // top-left neighbors
    const v16u8 src0 = LD_UB(pinput);
    // Widen bytes to 16-bit lanes so left + top - topleft cannot wrap.
    ILVRL_B2_SH(zero, tmp0, a0, a1);
    ILVRL_B2_SH(zero, tmp1, b0, b1);
    ILVRL_B2_SH(zero, tmp2, c0, c1);
    ADD2(a0, b0, a1, b1, a0, a1);
    SUB2(a0, c0, a1, c1, a0, a1);
    CLIP_SH2_0_255(a0, a1);
    // Narrow the clamped prediction back to 16 bytes.
    pred0 = (v16u8)__msa_pckev_b((v16i8)a1, (v16i8)a0);
    dst0 = src0 - pred0;
    ST_UB(dst0, poutput);
    ppred += 16;
    pinput += 16;
    poutput += 16;
    size -= 16;
  }
  // Scalar tail (size < 16), same predictor with explicit clamping.
  for (w = 0; w < size; ++w) {
    const int pred = ppred[w - 1] + ppred[w - stride] - ppred[w - stride - 1];
    poutput[w] = pinput[w] - (pred < 0 ? 0 : pred > 255 ? 255 : pred);
  }
}
void __dubcos(double x, double dx, double v[]) { double r,s,p,hx,tx,hy,ty,q,c,cc,d,dd,d2,dd2,e,ee, sn,ssn,cs,ccs,ds,dss,dc,dcc; #if 0 double xx,y,yy,z,zz; #endif mynumber u; int4 k; u.x=x+big.x; k = u.i[LOW_HALF]<<2; x=x-(u.x-big.x); d=x+dx; dd=(x-d)+dx; /* cos(x+dx)=cos(Xi+t)=cos(Xi)cos(t) - sin(Xi)sin(t) */ MUL2(d,dd,d,dd,d2,dd2,p,hx,tx,hy,ty,q,c,cc); sn=sincos.x[k]; /* */ ssn=sincos.x[k+1]; /* sin(Xi) and cos(Xi) */ cs=sincos.x[k+2]; /* */ ccs=sincos.x[k+3]; /* */ MUL2(d2,dd2,s7.x,ss7.x,ds,dss,p,hx,tx,hy,ty,q,c,cc); ADD2(ds,dss,s5.x,ss5.x,ds,dss,r,s); MUL2(d2,dd2,ds,dss,ds,dss,p,hx,tx,hy,ty,q,c,cc); ADD2(ds,dss,s3.x,ss3.x,ds,dss,r,s); MUL2(d2,dd2,ds,dss,ds,dss,p,hx,tx,hy,ty,q,c,cc); MUL2(d,dd,ds,dss,ds,dss,p,hx,tx,hy,ty,q,c,cc); ADD2(ds,dss,d,dd,ds,dss,r,s); MUL2(d2,dd2,c8.x,cc8.x,dc,dcc,p,hx,tx,hy,ty,q,c,cc); ADD2(dc,dcc,c6.x,cc6.x,dc,dcc,r,s); MUL2(d2,dd2,dc,dcc,dc,dcc,p,hx,tx,hy,ty,q,c,cc); ADD2(dc,dcc,c4.x,cc4.x,dc,dcc,r,s); MUL2(d2,dd2,dc,dcc,dc,dcc,p,hx,tx,hy,ty,q,c,cc); ADD2(dc,dcc,c2.x,cc2.x,dc,dcc,r,s); MUL2(d2,dd2,dc,dcc,dc,dcc,p,hx,tx,hy,ty,q,c,cc); MUL2(cs,ccs,ds,dss,e,ee,p,hx,tx,hy,ty,q,c,cc); MUL2(dc,dcc,sn,ssn,dc,dcc,p,hx,tx,hy,ty,q,c,cc); MUL2(d2,dd2,s7.x,ss7.x,ds,dss,p,hx,tx,hy,ty,q,c,cc); ADD2(ds,dss,s5.x,ss5.x,ds,dss,r,s); MUL2(d2,dd2,ds,dss,ds,dss,p,hx,tx,hy,ty,q,c,cc); ADD2(ds,dss,s3.x,ss3.x,ds,dss,r,s); MUL2(d2,dd2,ds,dss,ds,dss,p,hx,tx,hy,ty,q,c,cc); MUL2(d,dd,ds,dss,ds,dss,p,hx,tx,hy,ty,q,c,cc); ADD2(ds,dss,d,dd,ds,dss,r,s); MUL2(d2,dd2,c8.x,cc8.x,dc,dcc,p,hx,tx,hy,ty,q,c,cc); ADD2(dc,dcc,c6.x,cc6.x,dc,dcc,r,s); MUL2(d2,dd2,dc,dcc,dc,dcc,p,hx,tx,hy,ty,q,c,cc); ADD2(dc,dcc,c4.x,cc4.x,dc,dcc,r,s); MUL2(d2,dd2,dc,dcc,dc,dcc,p,hx,tx,hy,ty,q,c,cc); ADD2(dc,dcc,c2.x,cc2.x,dc,dcc,r,s); MUL2(d2,dd2,dc,dcc,dc,dcc,p,hx,tx,hy,ty,q,c,cc); MUL2(sn,ssn,ds,dss,e,ee,p,hx,tx,hy,ty,q,c,cc); MUL2(dc,dcc,cs,ccs,dc,dcc,p,hx,tx,hy,ty,q,c,cc); ADD2(e,ee,dc,dcc,e,ee,r,s); SUB2(cs,ccs,e,ee,e,ee,r,s); v[0]=e; v[1]=ee; }
/* Fills the current bound-box area with the given paint, emitting OpenGL
   primitives (to be masked by the stencil).  Gradient and pattern paints
   are delegated to the specialized mesh drawers; plain color paints draw
   one quad over the (slightly inflated) bound box. */
static void shDrawPaintMesh(VGContext *c, SHVector2 *min, SHVector2 *max,
                            VGPaintMode mode, GLenum texUnit)
{
  SHPaint *p = NULL;
  SHVector2 pmin, pmax;
  SHfloat K = 1.0f;

  /* Pick the right paint */
  if (mode == VG_FILL_PATH) {
    p = (c->fillPaint ? c->fillPaint : &c->defaultPaint);
  } else if (mode == VG_STROKE_PATH) {
    p = (c->strokePaint ? c->strokePaint : &c->defaultPaint);
    /* Strokes may overshoot the path bounds by up to miterLimit*lineWidth. */
    K = SH_CEIL(c->strokeMiterLimit * c->strokeLineWidth) + 1.0f;
  }
  /* Fix: 'p' was previously read uninitialized (undefined behavior) when
     an unexpected paint mode was passed; bail out instead. */
  if (p == NULL) return;

  /* We want to be sure to cover every pixel of this path so better
     take a pixel more than leave some out (multisampling is tricky). */
  SET2V(pmin, (*min)); SUB2(pmin, K,K);
  SET2V(pmax, (*max)); ADD2(pmax, K,K);

  /* Construct appropriate OpenGL primitives so as to fill the
     stencil mask with select paint */
  switch (p->type) {
  case VG_PAINT_TYPE_LINEAR_GRADIENT:
    shDrawLinearGradientMesh(p, min, max, mode, texUnit);
    break;

  case VG_PAINT_TYPE_RADIAL_GRADIENT:
    shDrawRadialGradientMesh(p, min, max, mode, texUnit);
    break;

  case VG_PAINT_TYPE_PATTERN:
    if (p->pattern != VG_INVALID_HANDLE) {
      shDrawPatternMesh(p, min, max, mode, texUnit);
      break;
    }/* else behave as a color paint */
    /* fallthrough */

  case VG_PAINT_TYPE_COLOR:
    glColor4fv((GLfloat*)&p->color);
    glBegin(GL_QUADS);
    glVertex2f(pmin.x, pmin.y);
    glVertex2f(pmax.x, pmin.y);
    glVertex2f(pmax.x, pmax.y);
    glVertex2f(pmin.x, pmax.y);
    glEnd();
    break;
  }
}
/* Compute sin(x+dx) to extra precision as a double-double.  On entry x has
   been range-reduced so |x| is small; __sincostab holds sin/cos at the grid
   point Xi nearest x.  On return v[0] + v[1] == sin(x+dx).  */
void SECTION __dubsin (double x, double dx, double v[]) {
  double r, s, c, cc, d, dd, d2, dd2, e, ee,
         sn, ssn, cs, ccs, ds, dss, dc, dcc;
#ifndef DLA_FMS
  /* Scratch variables needed only by the non-FMA MUL2 implementation. */
  double p, hx, tx, hy, ty, q;
#endif
  mynumber u;
  int4 k;

  /* Round x to the nearest table grid point; k indexes __sincostab. */
  u.x = x + big.x;
  k = u.i[LOW_HALF] << 2;
  x = x - (u.x - big.x);
  d = x + dx;
  dd = (x - d) + dx;   /* d + dd == x + dx exactly */
  /* sin(x+dx)=sin(Xi+t)=sin(Xi)*cos(t) + cos(Xi)sin(t) where t ->0 */
  MUL2 (d, dd, d, dd, d2, dd2, p, hx, tx, hy, ty, q, c, cc);
  sn = __sincostab.x[k];       /*                      */
  ssn = __sincostab.x[k + 1];  /* sin(Xi) and cos(Xi)  */
  cs = __sincostab.x[k + 2];   /*                      */
  ccs = __sincostab.x[k + 3];  /*                      */
  /* Taylor series for sin ds=sin(t) */
  MUL2 (d2, dd2, s7.x, ss7.x, ds, dss, p, hx, tx, hy, ty, q, c, cc);
  ADD2 (ds, dss, s5.x, ss5.x, ds, dss, r, s);
  MUL2 (d2, dd2, ds, dss, ds, dss, p, hx, tx, hy, ty, q, c, cc);
  ADD2 (ds, dss, s3.x, ss3.x, ds, dss, r, s);
  MUL2 (d2, dd2, ds, dss, ds, dss, p, hx, tx, hy, ty, q, c, cc);
  MUL2 (d, dd, ds, dss, ds, dss, p, hx, tx, hy, ty, q, c, cc);
  ADD2 (ds, dss, d, dd, ds, dss, r, s);
  /* Taylor series for cos dc=cos(t) */
  MUL2 (d2, dd2, c8.x, cc8.x, dc, dcc, p, hx, tx, hy, ty, q, c, cc);
  ADD2 (dc, dcc, c6.x, cc6.x, dc, dcc, r, s);
  MUL2 (d2, dd2, dc, dcc, dc, dcc, p, hx, tx, hy, ty, q, c, cc);
  ADD2 (dc, dcc, c4.x, cc4.x, dc, dcc, r, s);
  MUL2 (d2, dd2, dc, dcc, dc, dcc, p, hx, tx, hy, ty, q, c, cc);
  ADD2 (dc, dcc, c2.x, cc2.x, dc, dcc, r, s);
  MUL2 (d2, dd2, dc, dcc, dc, dcc, p, hx, tx, hy, ty, q, c, cc);
  /* Combine the angle-addition terms and fold in sin(Xi). */
  MUL2 (cs, ccs, ds, dss, e, ee, p, hx, tx, hy, ty, q, c, cc);
  MUL2 (dc, dcc, sn, ssn, dc, dcc, p, hx, tx, hy, ty, q, c, cc);
  SUB2 (e, ee, dc, dcc, e, ee, r, s);
  ADD2 (e, ee, sn, ssn, e, ee, r, s);   /* e+ee=sin(x+dx) */
  v[0] = e;
  v[1] = ee;
}
/* C++ flavour of __dubsin: compute sin(x+dx) to extra precision as a
   double-double pair, using the 'Double' wrapper class and its x()/i
   accessors instead of raw doubles.  On return v[0] + v[1] == sin(x+dx). */
void __dubsin(Double x, Double dx, Double v[]) {
  Double r,s,p,hx,tx,hy,ty,q,c,cc,d,dd,d2,dd2,e,ee,
         sn,ssn,cs,ccs,ds,dss,dc,dcc;
#if 0
  Double xx,y,yy,z,zz;
#endif
  mynumber u;
  int4 k;

  /* Round x to the nearest table grid point; k indexes the sincos table. */
  u.x()=x+big.x();
  k = u.i[LOW_HALF]<<2;
  x=x-(u.x()-big.x());
  d=x+dx;
  dd=(x-d)+dx;   /* d + dd == x + dx exactly */
  /* sin(x+dx)=sin(Xi+t)=sin(Xi)*cos(t) + cos(Xi)sin(t) where t ->0 */
  MUL2(d,dd,d,dd,d2,dd2,p,hx,tx,hy,ty,q,c,cc);
  sn=sincos.x(k);      /*                      */
  ssn=sincos.x(k+1);   /* sin(Xi) and cos(Xi)  */
  cs=sincos.x(k+2);    /*                      */
  ccs=sincos.x(k+3);   /*                      */
  MUL2(d2,dd2,s7.x(),ss7.x(),ds,dss,p,hx,tx,hy,ty,q,c,cc);   /* Taylor */
  ADD2(ds,dss,s5.x(),ss5.x(),ds,dss,r,s);
  MUL2(d2,dd2,ds,dss,ds,dss,p,hx,tx,hy,ty,q,c,cc);           /* series */
  ADD2(ds,dss,s3.x(),ss3.x(),ds,dss,r,s);
  MUL2(d2,dd2,ds,dss,ds,dss,p,hx,tx,hy,ty,q,c,cc);           /* for sin */
  MUL2(d,dd,ds,dss,ds,dss,p,hx,tx,hy,ty,q,c,cc);
  ADD2(ds,dss,d,dd,ds,dss,r,s);                              /* ds=sin(t) */
  MUL2(d2,dd2,c8.x(),cc8.x(),dc,dcc,p,hx,tx,hy,ty,q,c,cc); ;/* Taylor */
  ADD2(dc,dcc,c6.x(),cc6.x(),dc,dcc,r,s);
  MUL2(d2,dd2,dc,dcc,dc,dcc,p,hx,tx,hy,ty,q,c,cc);           /* series */
  ADD2(dc,dcc,c4.x(),cc4.x(),dc,dcc,r,s);
  MUL2(d2,dd2,dc,dcc,dc,dcc,p,hx,tx,hy,ty,q,c,cc);           /* for cos */
  ADD2(dc,dcc,c2.x(),cc2.x(),dc,dcc,r,s);
  MUL2(d2,dd2,dc,dcc,dc,dcc,p,hx,tx,hy,ty,q,c,cc);           /* dc=cos(t) */
  /* Combine the angle-addition terms and fold in sin(Xi). */
  MUL2(cs,ccs,ds,dss,e,ee,p,hx,tx,hy,ty,q,c,cc);
  MUL2(dc,dcc,sn,ssn,dc,dcc,p,hx,tx,hy,ty,q,c,cc);
  SUB2(e,ee,dc,dcc,e,ee,r,s);
  ADD2(e,ee,sn,ssn,e,ee,r,s);   /* e+ee=sin(x+dx) */
  v[0]=e;
  v[1]=ee;
}
// TrueMotion (TM) 16x16 intra predictor with MSA:
//   dst[y][x] = clip(left[y] + top[x] - top_left)
// Falls back to horizontal / vertical prediction when only one neighbor
// row/column is available, and to a constant 0x81 fill when neither is.
static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left, const uint8_t* top) {
  if (left != NULL) {
    if (top != NULL) {
      int j;
      v8i16 d1, d2;
      const v16i8 zero = { 0 };
      const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);  // top-left corner pixel
      const v16u8 T = LD_UB(top);
      // d1/d2 = top row widened to 16-bit lanes, minus the top-left pixel.
      ILVRL_B2_SH(zero, T, d1, d2);
      SUB2(d1, TL, d2, TL, d1, d2);
      // Produce four output rows per iteration: add each row's left pixel,
      // clamp to [0,255], and pack back to bytes.
      for (j = 0; j < 16; j += 4) {
        v16i8 t0, t1, t2, t3;
        v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
        const v8i16 L0 = (v8i16)__msa_fill_h(left[j + 0]);
        const v8i16 L1 = (v8i16)__msa_fill_h(left[j + 1]);
        const v8i16 L2 = (v8i16)__msa_fill_h(left[j + 2]);
        const v8i16 L3 = (v8i16)__msa_fill_h(left[j + 3]);
        ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);
        ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);
        CLIP_SH4_0_255(r0, r1, r2, r3);
        CLIP_SH4_0_255(r4, r5, r6, r7);
        PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);
        ST_SB4(t0, t1, t2, t3, dst, BPS);
        dst += 4 * BPS;
      }
    } else {
      // No top row: fall back to horizontal prediction.
      HorizontalPred16x16(dst, left);
    }
  } else {
    if (top != NULL) {
      // No left column: fall back to vertical prediction.
      VerticalPred16x16(dst, top);
    } else {
      // No neighbors at all: fill the block with the constant 0x81 (129).
      const v16u8 out = (v16u8)__msa_fill_b(0x81);
      STORE16x16(out, dst);
    }
  }
}
/* Draws a radial gradient paint over the given bound box as a fan of quads
   sweeping the angular range that covers the box.  For each angle step a
   quad spans from minOffset to maxOffset along the gradient; the 1D gradient
   texture supplies the colors.  Returns 1 on completion (also when the
   degenerate fill-with-last-stop path is taken). */
int shDrawRadialGradientMesh(SHPaint *p, SHVector2 *min, SHVector2 *max,
                             VGPaintMode mode, GLenum texUnit)
{
  SHint i, j;
  float a, n;
  SHfloat cx = p->radialGradient[0];   /* gradient circle center */
  SHfloat cy = p->radialGradient[1];
  SHfloat fx = p->radialGradient[2];   /* gradient focus */
  SHfloat fy = p->radialGradient[3];
  float r = p->radialGradient[4];      /* gradient radius */
  float fcx, fcy, rr, C;
  SHVector2 ux;
  SHVector2 uy;
  SHVector2 c, f;
  SHVector2 cf;
  SHMatrix3x3 *m;
  SHMatrix3x3 mi;
  SHint invertible;
  SHVector2 corners[4];
  SHVector2 fcorners[4];
  SHfloat minOffset=0.0f;
  SHfloat maxOffset=0.0f;
  SHint maxI=0, maxJ=0;
  SHfloat maxA=0.0f;
  SHfloat startA=0.0f;
  int numsteps = 100;
  float step = 2*PI/numsteps;
  SHVector2 tmin, tmax;
  SHVector2 min1, max1, min2, max2;

  /* Pick paint transform matrix */
  SH_GETCONTEXT(0);
  if (mode == VG_FILL_PATH)
    m = &context->fillTransform;
  else if (mode == VG_STROKE_PATH)
    m = &context->strokeTransform;
  /* NOTE(review): 'm' is left uninitialized for any other mode — verify
     that callers only ever pass FILL or STROKE. */

  /* Move focus into circle if outside */
  SET2(cf, fx,fy);
  SUB2(cf, cx,cy);
  n = NORM2(cf);
  if (n > r) {
    DIV2(cf, n);
    fx = cx + 0.995f * r * cf.x;
    fy = cy + 0.995f * r * cf.y;
  }

  /* Precalculations for the quadratic in the focus-to-border distance */
  rr = r*r;
  fcx = fx - cx;
  fcy = fy - cy;
  C = fcx*fcx + fcy*fcy - rr;

  /* Apply paint-to-user transformation to focus and unit vectors */
  SET2(f, fx, fy);
  SET2(c, cx, cy);
  SET2(ux, 1, 0);
  SET2(uy, 0, 1);
  ADD2(ux, cx, cy);
  ADD2(uy, cx, cy);
  TRANSFORM2(f, (*m));
  TRANSFORM2(c, (*m));
  TRANSFORM2(ux, (*m));
  TRANSFORM2(uy, (*m));
  SUB2V(ux, c);
  SUB2V(uy, c);

  /* Boundbox corners */
  SET2(corners[0], min->x, min->y);
  SET2(corners[1], max->x, min->y);
  SET2(corners[2], max->x, max->y);
  SET2(corners[3], min->x, max->y);

  /* Find inverse transformation (back to paint space) */
  invertible = shInvertMatrix(m, &mi);
  if (!invertible || r <= 0.0f) {
    /* Degenerate gradient: fill boundbox with color at offset 1 */
    SHColor *c = &p->stops.items[p->stops.size-1].color;
    glColor4fv((GLfloat*)c);
    glBegin(GL_QUADS);
    for (i=0; i<4; ++i)
      glVertex2fv((GLfloat*)&corners[i]);
    glEnd();
    return 1;
  }

  /*--------------------------------------------------------*/

  /* Find min/max gradient offset over the boundbox corners */
  for (i=0; i<4; ++i) {
    /* Transform to paint space */
    SHfloat ax,ay, A,B,D,t, off;
    TRANSFORM2TO(corners[i], mi, fcorners[i]);
    SUB2(fcorners[i], fx, fy);
    n = NORM2(fcorners[i]);
    if (n == 0.0f) {
      /* Avoid zero-length vectors */
      off = 0.0f;
    }else{
      /* Distance from focus to circle at corner angle */
      DIV2(fcorners[i], n);
      ax = fcorners[i].x;
      ay = fcorners[i].y;
      A = ax*ax + ay*ay;
      B = 2 * (fcx*ax + fcy*ay);
      D = B*B - 4*A*C;
      t = (-B + SH_SQRT(D)) / (2*A);
      /* Relative offset of boundbox corner */
      if (D <= 0.0f) off = 1.0f;
      else off = n / t;
    }
    /* Find smallest and largest offset */
    if (off < minOffset || i==0) minOffset = off;
    if (off > maxOffset || i==0) maxOffset = off;
  }

  /* Is transformed focus inside original boundbox? */
  if (f.x >= min->x && f.x <= max->x &&
      f.y >= min->y && f.y <= max->y) {
    /* Draw whole circle */
    minOffset = 0.0f;
    startA = 0.0f;
    maxA = 2*PI;
  }else{
    /* Find most distant corner pair (widest angular span to sweep) */
    for (i=0; i<3; ++i) {
      if (ISZERO2(fcorners[i])) continue;
      for (j=i+1; j<4; ++j) {
        if (ISZERO2(fcorners[j])) continue;
        a = ANGLE2N(fcorners[i], fcorners[j]);
        if (a > maxA || maxA == 0.0f) {maxA=a; maxI=i; maxJ=j;}
      }}
    /* Pick starting angle */
    if (CROSS2(fcorners[maxI],fcorners[maxJ]) > 0.0f)
      startA = shVectorOrientation(&fcorners[maxI]);
    else
      startA = shVectorOrientation(&fcorners[maxJ]);
  }

  /*---------------------------------------------------------*/

  /* TODO: for minOffset we'd actually need to find the minimum of the
     gradient function when X and Y are substituted with a line equation
     for each bound-box edge.  As a workaround we use 0.0f for now. */
  minOffset = 0.0f;
  step = PI/50;
  numsteps = (SHint)SH_CEIL(maxA / step) + 1;

  glActiveTexture(texUnit);
  shSetGradientTexGLState(p);
  glEnable(GL_TEXTURE_1D);
  glBegin(GL_QUADS);

  /* Walk the steps and draw gradient mesh */
  for (i=0, a=startA; i<numsteps; ++i, a+=step) {

    /* Distance from focus to circle border
       at current angle (gradient space) */
    float ax = SH_COS(a);
    float ay = SH_SIN(a);
    float A = ax*ax + ay*ay;
    float B = 2 * (fcx*ax + fcy*ay);
    float D = B*B - 4*A*C;
    float t = (-B + SH_SQRT(D)) / (2*A);
    if (D <= 0.0f) t = 0.0f;

    /* Vectors pointing towards minimum and maximum
       offset at current angle (gradient space) */
    tmin.x = ax * t * minOffset;
    tmin.y = ay * t * minOffset;
    tmax.x = ax * t * maxOffset;
    tmax.y = ay * t * maxOffset;

    /* Transform back to user space */
    min2.x = f.x + tmin.x * ux.x + tmin.y * uy.x;
    min2.y = f.y + tmin.x * ux.y + tmin.y * uy.y;
    max2.x = f.x + tmax.x * ux.x + tmax.y * uy.x;
    max2.y = f.y + tmax.x * ux.y + tmax.y * uy.y;

    /* Draw quad between this step's edge and the previous one */
    if (i!=0) {
      glMultiTexCoord1f(texUnit, minOffset);
      glVertex2fv((GLfloat*)&min1);
      glVertex2fv((GLfloat*)&min2);
      glMultiTexCoord1f(texUnit, maxOffset);
      glVertex2fv((GLfloat*)&max2);
      glVertex2fv((GLfloat*)&max1);
    }

    /* Save prev points */
    min1 = min2;
    max1 = max2;
  }

  glEnd();
  glDisable(GL_TEXTURE_1D);
  return 1;
}
/* Odd-index half of one 1-D row pass of the 8x32 forward DCT (rounded
   variant).  Reads 16 vectors of partially transformed data from 'temp',
   spills intermediate butterflies into 'interm_ptr', and stores the odd
   output coefficients at their interleaved positions in 'out'. */
static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr, int16_t *out) {
  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
  v8i16 in24, in25, in26, in27, in28, in29, in30, in31;
  v8i16 vec4, vec5;

  /* Rotate the middle pairs by cospi_16 and apply the rounding postproc. */
  in20 = LD_SH(temp + 32);
  in21 = LD_SH(temp + 40);
  in26 = LD_SH(temp + 80);
  in27 = LD_SH(temp + 88);
  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
  FDCT_POSTPROC_2V_NEG_H(in20, in21);
  FDCT_POSTPROC_2V_NEG_H(in26, in27);

  in18 = LD_SH(temp + 16);
  in19 = LD_SH(temp + 24);
  in28 = LD_SH(temp + 96);
  in29 = LD_SH(temp + 104);
  FDCT_POSTPROC_2V_NEG_H(in18, in19);
  FDCT_POSTPROC_2V_NEG_H(in28, in29);

  /* Spill the difference terms of the butterflies for the second phase. */
  vec4 = in19 - in20;
  ST_SH(vec4, interm_ptr + 32);
  vec4 = in18 - in21;
  ST_SH(vec4, interm_ptr + 88);
  vec4 = in29 - in26;
  ST_SH(vec4, interm_ptr + 64);
  vec4 = in28 - in27;
  ST_SH(vec4, interm_ptr + 56);

  ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);

  in22 = LD_SH(temp + 48);
  in23 = LD_SH(temp + 56);
  in24 = LD_SH(temp + 64);
  in25 = LD_SH(temp + 72);
  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
  FDCT_POSTPROC_2V_NEG_H(in22, in23);
  FDCT_POSTPROC_2V_NEG_H(in24, in25);

  in16 = LD_SH(temp);
  in17 = LD_SH(temp + 8);
  in30 = LD_SH(temp + 112);
  in31 = LD_SH(temp + 120);
  FDCT_POSTPROC_2V_NEG_H(in16, in17);
  FDCT_POSTPROC_2V_NEG_H(in30, in31);

  vec4 = in17 - in22;
  ST_SH(vec4, interm_ptr + 40);
  vec4 = in30 - in25;
  ST_SH(vec4, interm_ptr + 48);
  vec4 = in31 - in24;
  ST_SH(vec4, interm_ptr + 72);
  vec4 = in16 - in23;
  ST_SH(vec4, interm_ptr + 80);

  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
  /* First phase: rotate the sum terms and emit eight odd coefficients. */
  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
  ADD2(in27, in26, in25, in24, in23, in20);
  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
  ST_SH(vec5, out);
  ST_SH(vec4, out + 120);

  SUB2(in27, in26, in25, in24, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
  ST_SH(vec5, out + 112);
  ST_SH(vec4, out + 8);

  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
  SUB2(in26, in27, in24, in25, in23, in20);
  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
  ST_SH(vec4, out + 16);
  ST_SH(vec5, out + 104);

  ADD2(in26, in27, in24, in25, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
  ST_SH(vec4, out + 24);
  ST_SH(vec5, out + 96);

  /* Second phase: reload the spilled differences and emit the rest. */
  in20 = LD_SH(interm_ptr + 32);
  in21 = LD_SH(interm_ptr + 88);
  in27 = LD_SH(interm_ptr + 56);
  in26 = LD_SH(interm_ptr + 64);

  in16 = in20;
  in17 = in21;
  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);

  in22 = LD_SH(interm_ptr + 40);
  in25 = LD_SH(interm_ptr + 48);
  in24 = LD_SH(interm_ptr + 72);
  in23 = LD_SH(interm_ptr + 80);

  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
  in16 = in28 + in29;
  in19 = in31 + in30;
  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
  ST_SH(vec5, out + 32);
  ST_SH(vec4, out + 88);

  SUB2(in28, in29, in31, in30, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
  ST_SH(vec5, out + 40);
  ST_SH(vec4, out + 80);

  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
  SUB2(in29, in28, in30, in31, in16, in19);
  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
  ST_SH(vec5, out + 72);
  ST_SH(vec4, out + 48);

  ADD2(in29, in28, in30, in31, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
  ST_SH(vec4, out + 56);
  ST_SH(vec5, out + 64);
}
/* Even-index half of one 1-D row pass of the 8x32 forward DCT (rounded
   variant).  Reads 16 vectors from 'temp' and stores the even output
   coefficients at their interleaved positions in 'out'. */
static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;

  /* fdct32 even */
  /* stage 2 */
  LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5,
               vec6, vec7, in8, in9, in10, in11, in12, in13, in14, in15);
  /* Rounding post-processing on all 16 butterfly outputs. */
  FDCT_POSTPROC_2V_NEG_H(vec0, vec1);
  FDCT_POSTPROC_2V_NEG_H(vec2, vec3);
  FDCT_POSTPROC_2V_NEG_H(vec4, vec5);
  FDCT_POSTPROC_2V_NEG_H(vec6, vec7);
  FDCT_POSTPROC_2V_NEG_H(in8, in9);
  FDCT_POSTPROC_2V_NEG_H(in10, in11);
  FDCT_POSTPROC_2V_NEG_H(in12, in13);
  FDCT_POSTPROC_2V_NEG_H(in14, in15);

  /* Stage 3 */
  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);

  temp0 = in0 + in3;
  in0 = in0 - in3;
  in3 = in1 + in2;
  in1 = in1 - in2;

  DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0);
  ST_SH(temp0, out);
  ST_SH(temp1, out + 8);

  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
  ST_SH(temp0, out + 16);
  ST_SH(temp1, out + 24);

  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
  ST_SH(temp0, out + 32);
  ST_SH(temp1, out + 56);

  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
  ST_SH(temp0, out + 40);
  ST_SH(temp1, out + 48);

  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
  ST_SH(temp0, out + 64);
  ST_SH(temp1, out + 120);

  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
  ST_SH(temp0, out + 72);
  ST_SH(temp1, out + 112);

  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
  ST_SH(temp0, out + 80);
  ST_SH(temp1, out + 104);

  ADD2(in3, in2, in0, in1, vec3, vec4);
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
  ST_SH(temp0, out + 96);
  ST_SH(temp1, out + 88);
}
/* Odd-index half of one 1-D column pass of the 8x32 forward DCT.  Reads
   16 vectors from 'input' (also reusing it as scratch for the spilled
   butterfly differences), and stores the odd output coefficients to
   'temp_ptr' at strides of 64 int16 (one 32-wide row of 8 columns each). */
static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) {
  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
  v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;

  /* Rotate the middle pairs by cospi_16. */
  in20 = LD_SH(input + 32);
  in21 = LD_SH(input + 40);
  in26 = LD_SH(input + 80);
  in27 = LD_SH(input + 88);
  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);

  in18 = LD_SH(input + 16);
  in19 = LD_SH(input + 24);
  in28 = LD_SH(input + 96);
  in29 = LD_SH(input + 104);

  /* Spill butterfly differences back into 'input' for the second phase. */
  vec4 = in19 - in20;
  ST_SH(vec4, input + 32);
  vec4 = in18 - in21;
  ST_SH(vec4, input + 40);
  vec4 = in29 - in26;
  ST_SH(vec4, input + 80);
  vec4 = in28 - in27;
  ST_SH(vec4, input + 88);

  in21 = in18 + in21;
  in20 = in19 + in20;
  in27 = in28 + in27;
  in26 = in29 + in26;

  LD_SH4(input + 48, 8, in22, in23, in24, in25);
  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);

  in16 = LD_SH(input);
  in17 = LD_SH(input + 8);
  in30 = LD_SH(input + 112);
  in31 = LD_SH(input + 120);

  vec4 = in17 - in22;
  ST_SH(vec4, input + 16);
  vec4 = in16 - in23;
  ST_SH(vec4, input + 24);
  vec4 = in31 - in24;
  ST_SH(vec4, input + 96);
  vec4 = in30 - in25;
  ST_SH(vec4, input + 104);

  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
  /* First phase: rotate the sums and store eight odd coefficients. */
  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
  ADD2(in27, in26, in25, in24, in23, in20);
  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr);
  ST_SH(vec4, temp_ptr + 960);

  SUB2(in27, in26, in25, in24, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 448);
  ST_SH(vec4, temp_ptr + 512);

  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
  SUB2(in26, in27, in24, in25, in23, in20);
  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec4, temp_ptr + 704);
  ST_SH(vec5, temp_ptr + 256);

  ADD2(in26, in27, in24, in25, in22, in21);
  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec4, temp_ptr + 192);
  ST_SH(vec5, temp_ptr + 768);

  /* Second phase: reload the spilled differences and store the rest. */
  LD_SH4(input + 16, 8, in22, in23, in20, in21);
  LD_SH4(input + 80, 8, in26, in27, in24, in25);
  in16 = in20;
  in17 = in21;
  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);

  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
  ADD2(in28, in29, in31, in30, in16, in19);
  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 832);
  ST_SH(vec4, temp_ptr + 128);

  SUB2(in28, in29, in31, in30, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 320);
  ST_SH(vec4, temp_ptr + 640);

  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
  SUB2(in29, in28, in30, in31, in16, in19);
  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 576);
  ST_SH(vec4, temp_ptr + 384);

  ADD2(in29, in28, in30, in31, in17, in18);
  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
  ST_SH(vec5, temp_ptr + 64);
  ST_SH(vec4, temp_ptr + 896);
}
/* Fills the current bound-box area with the given paint (ANDROIDVG builds
   use vertex arrays, desktop builds use immediate mode).  Gradient and
   pattern paints are delegated to the specialized mesh drawers; plain
   color paints draw the (slightly inflated) bound box directly. */
static void shDrawPaintMesh(VGContext *c, SHVector2 *min, SHVector2 *max,
                            VGPaintMode mode, GLenum texUnit)
{
  SHPaint *p = NULL;
  SHVector2 pmin, pmax;
  SHfloat K = 1.0f;
#ifdef ANDROIDVG
  SHColor *color;
  GLfloat v[6][2];   /* two triangles covering the quad */
#endif

  /* Pick the right paint */
  if (mode == VG_FILL_PATH) {
    p = (c->fillPaint ? c->fillPaint : &c->defaultPaint);
  } else if (mode == VG_STROKE_PATH) {
    p = (c->strokePaint ? c->strokePaint : &c->defaultPaint);
    /* Strokes may overshoot the path bounds by up to miterLimit*lineWidth. */
    K = SH_CEIL(c->strokeMiterLimit * c->strokeLineWidth) + 1.0f;
  }
  /* Fix: 'p' was previously read uninitialized (undefined behavior) when
     an unexpected paint mode was passed; bail out instead. */
  if (p == NULL) return;

  /* We want to be sure to cover every pixel of this path so better
     take a pixel more than leave some out (multisampling is tricky). */
  SET2V(pmin, (*min)); SUB2(pmin, K,K);
  SET2V(pmax, (*max)); ADD2(pmax, K,K);

  /* Construct appropriate OpenGL primitives so as to fill the
     stencil mask with select paint */
  switch (p->type) {
  case VG_PAINT_TYPE_LINEAR_GRADIENT:
    shDrawLinearGradientMesh(p, min, max, mode, texUnit);
    break;

  case VG_PAINT_TYPE_RADIAL_GRADIENT:
    shDrawRadialGradientMesh(p, min, max, mode, texUnit);
    break;

  case VG_PAINT_TYPE_PATTERN:
    if (p->pattern != VG_INVALID_HANDLE) {
      shDrawPatternMesh(p, min, max, mode, texUnit);
      break;
    }/* else behave as a color paint */
    /* fallthrough */

  case VG_PAINT_TYPE_COLOR:
#ifdef ANDROIDVG
    /* Triangle 1: (pmin,pmin) (pmax,pmin) (pmax,pmax) */
    v[0][0] = pmin.x; v[0][1] = pmin.y;
    v[1][0] = pmax.x; v[1][1] = pmin.y;
    v[2][0] = pmax.x; v[2][1] = pmax.y;
    /* Triangle 2: (pmin,pmin) (pmax,pmax) (pmin,pmax) */
    v[3][0] = pmin.x; v[3][1] = pmin.y;
    v[4][0] = pmax.x; v[4][1] = pmax.y;
    v[5][0] = pmin.x; v[5][1] = pmax.y;
    color = &p->color;
    glColor4f(color->r, color->g, color->b, color->a);
    glEnableClientState(GL_VERTEX_ARRAY);
    glVertexPointer(2, GL_FLOAT, 0, v);
    glDrawArrays(GL_TRIANGLES, 0, 6);
    glDisableClientState(GL_VERTEX_ARRAY);
#else
    glColor4fv((GLfloat*)&p->color);
    glBegin(GL_QUADS);
    glVertex2f(pmin.x, pmin.y);
    glVertex2f(pmax.x, pmin.y);
    glVertex2f(pmax.x, pmax.y);
    glVertex2f(pmin.x, pmax.y);
    glEnd();
#endif
    break;
  }
}
// Renders the cave tube for the given display mode.  Each segment's
// geometry is compiled into a GL display list on first use (and rebuilt
// when marked dirty); per-frame work is mostly glCallList plus mode-
// dependent GL state and, for the minimap, a distance-faded color.
static void cave_model (Render* render, Cave* cave, int mode, float minimapoffset) {
  for (int i = 0; i < SEGMENT_COUNT-1; ++i) {

    // aid bread-crumb track: draw a small X at every other segment center
    if (render->aidtrack && mode == DISPLAYMODE_NORMAL && !(i&1)) {
      glDisable(GL_LIGHTING);
      glColor4f(0.5,0.5,1,1);
      glBegin(GL_LINE_STRIP);
#define CRUMB_SIZE 0.1
      glVertex3f(cave->centers[i][0]-CRUMB_SIZE,cave->centers[i][1]-CRUMB_SIZE,cave->centers[i][2]);
      glVertex3f(cave->centers[i][0]+CRUMB_SIZE,cave->centers[i][1]+CRUMB_SIZE,cave->centers[i][2]);
      glVertex3fv(cave->centers[i]);
      glVertex3f(cave->centers[i][0]+CRUMB_SIZE,cave->centers[i][1]-CRUMB_SIZE,cave->centers[i][2]);
      glVertex3f(cave->centers[i][0]-CRUMB_SIZE,cave->centers[i][1]+CRUMB_SIZE,cave->centers[i][2]);
      glEnd();
    }

    if (render->lighting)
      glEnable(GL_LIGHTING);

    // Ring buffer: i0 is the absolute segment index for relative index i.
    int i0 = (cave->i + i)%SEGMENT_COUNT;

    // Dirty segment: drop the compiled lists for ALL display modes.
    // (The inner 'mode' deliberately shadows the parameter here.)
    if (cave->dirty[i0]) {
      for (int mode = 0; mode < DISPLAYMODE_COUNT; ++mode) {
        if (glIsList (render->gl_list[mode][i0]))
          glDeleteLists (render->gl_list[mode][i0], 1);
        render->gl_list[mode][i0] = 0;
      }
      cave->dirty[i0] = false;
    }

    // Compile the display list for this segment/mode if missing.
    if (render->gl_list[mode][i0] == 0) {
      int id = render->gl_list[mode][i0] = i0 + render->list_start[mode];
      glNewList (id, GL_COMPILE);
      int i1 = (i0 + 1)%SEGMENT_COUNT;
      if (mode == DISPLAYMODE_NORMAL) {
        glBindTexture (GL_TEXTURE_2D,
#ifdef OUTSIDE_TEXTURE_FILE
          cave->segs[0][0][2] < ROOM_LEN/2 ?
          render->outside_texture_id :
#endif
          render->wall_texture_id
        );
        glColor4f (1, 1, 1, 0.5);
      }
      // One quad strip around the tube cross-section, connecting ring i0
      // to ring i1; normals point inward (center minus wall vertex).
      glBegin (GL_QUAD_STRIP);
      for (int k = 0; k <= SECTOR_COUNT; ++k) {
        int k0 = k%SECTOR_COUNT;
        if (mode == DISPLAYMODE_NORMAL) {
          glTexCoord2f( cave->segs[i0][k0][2]/SEGMENT_LEN/SEGMENT_COUNT, (float)k/SECTOR_COUNT);
        }
        GLfloat thenormal[] = {0, 0, 0};
        SUB2(thenormal, cave->centers[i0], cave->segs[i0][k0]);
        NORM(thenormal);
        glNormal3fv(thenormal);
        glVertex3fv(cave->segs[i0][k0]);
        if (mode == DISPLAYMODE_NORMAL) {
          glTexCoord2f( cave->segs[i1][k0][2]/SEGMENT_LEN/SEGMENT_COUNT, (float)k/SECTOR_COUNT);
        }
        SUB2(thenormal, cave->centers[i1], cave->segs[i1][k0]);
        NORM(thenormal);
        glNormal3fv(thenormal);
        glVertex3fv(cave->segs[i1][k0]);
      }
      glEnd();
      glEndList();
    }

    // Per-mode GL state before replaying the list.
    if (mode == DISPLAYMODE_NORMAL) {
      glEnable (GL_DEPTH_TEST);
      glDisable (GL_BLEND);
      glEnable (GL_TEXTURE_2D);
    } else {
      glDisable (GL_DEPTH_TEST);
      glEnable (GL_BLEND);
      glDisable (GL_TEXTURE_2D);
      glDisable(GL_LIGHTING);
    }

    if (mode == DISPLAYMODE_MINIMAP) {
      // apparently pow is a lot more complicated than what we need here.
#define FASTPOW6(a) a * a * a * a * a * a
      float ioffset = minimapoffset - (int)(minimapoffset);
      // i wonder if this works at all?
      float alpha = .12f - .12f * FASTPOW6(((SEGMENT_COUNT / 2.0f - i + ioffset) / SEGMENT_COUNT * 2.0f));
      if(i > render->gauge * SEGMENT_COUNT)
        glColor4f (1, 1, 1, alpha);
      else
        glColor4f ( huemap[i][0], huemap[i][1], huemap[i][2], alpha);
    }

    glCallList (render->gl_list[mode][i0]);
  }
}
/* Correctly-rounded atan2(y, x) in double precision (glibc dbl-64 style).
   Special cases (NaN, signed zeros, infinities) are peeled off first by
   inspecting the raw IEEE halves; the general case evaluates atan of the
   smaller-over-larger ratio with double-length (hi+lo) arithmetic and a
   rounding test — when the test is inconclusive it falls back to the
   multiprecision routine atan2Mp.  Helper macros (EMULV, MUL2, ADD2, EADD,
   ESUB) and tables (cij, hij, d*, f*, u*) are defined elsewhere.
   NOTE(review): statement order is significant for correct rounding; do
   not reorder FP operations.  */
double
SECTION
__ieee754_atan2 (double y, double x)
{
  int i, de, ux, dx, uy, dy;
  static const int pr[MM] = { 6, 8, 10, 20, 32 };
  double ax, ay, u, du, u9, ua, v, vv, dv, t1, t2, t3, t7, t8, z, zz, cor,
         s1, ss1, s2, ss2;
#ifndef DLA_FMS
  double t4, t5, t6;
#endif
  number num;

  static const int ep = 59768832,   /*  57*16**5  */
                   em = -59768832;  /* -57*16**5  */

  /* x=NaN or y=NaN */
  num.d = x;
  ux = num.i[HIGH_HALF];
  dx = num.i[LOW_HALF];
  if ((ux & 0x7ff00000) == 0x7ff00000)
    {
      if (((ux & 0x000fffff) | dx) != 0x00000000)
        return x + x;
    }
  num.d = y;
  uy = num.i[HIGH_HALF];
  dy = num.i[LOW_HALF];
  if ((uy & 0x7ff00000) == 0x7ff00000)
    {
      if (((uy & 0x000fffff) | dy) != 0x00000000)
        return y + y;
    }

  /* y=+-0 */
  if (uy == 0x00000000)
    {
      if (dy == 0x00000000)
        {
          if ((ux & 0x80000000) == 0x00000000)
            return 0;
          else
            return opi.d;
        }
    }
  else if (uy == 0x80000000)
    {
      if (dy == 0x00000000)
        {
          if ((ux & 0x80000000) == 0x00000000)
            return -0.0;
          else
            return mopi.d;
        }
    }

  /* x=+-0 */
  if (x == 0)
    {
      if ((uy & 0x80000000) == 0x00000000)
        return hpi.d;
      else
        return mhpi.d;
    }

  /* x=+-INF */
  if (ux == 0x7ff00000)
    {
      if (dx == 0x00000000)
        {
          if (uy == 0x7ff00000)
            {
              if (dy == 0x00000000)
                return qpi.d;
            }
          else if (uy == 0xfff00000)
            {
              if (dy == 0x00000000)
                return mqpi.d;
            }
          else
            {
              if ((uy & 0x80000000) == 0x00000000)
                return 0;
              else
                return -0.0;
            }
        }
    }
  else if (ux == 0xfff00000)
    {
      if (dx == 0x00000000)
        {
          if (uy == 0x7ff00000)
            {
              if (dy == 0x00000000)
                return tqpi.d;
            }
          else if (uy == 0xfff00000)
            {
              if (dy == 0x00000000)
                return mtqpi.d;
            }
          else
            {
              if ((uy & 0x80000000) == 0x00000000)
                return opi.d;
              else
                return mopi.d;
            }
        }
    }

  /* y=+-INF */
  if (uy == 0x7ff00000)
    {
      if (dy == 0x00000000)
        return hpi.d;
    }
  else if (uy == 0xfff00000)
    {
      if (dy == 0x00000000)
        return mhpi.d;
    }

  /* either x/y or y/x is very close to zero */
  ax = (x < 0) ? -x : x;
  ay = (y < 0) ? -y : y;
  de = (uy & 0x7ff00000) - (ux & 0x7ff00000);  /* exponent difference */
  if (de >= ep)
    {
      return ((y > 0) ? hpi.d : mhpi.d);
    }
  else if (de <= em)
    {
      if (x > 0)
        {
          if ((z = ay / ax) < TWOM1022)
            return normalized (ax, ay, y, z);
          else
            return signArctan2 (y, z);
        }
      else
        {
          return ((y > 0) ? opi.d : mopi.d);
        }
    }

  /* if either x or y is extremely close to zero, scale abs(x), abs(y). */
  if (ax < twom500.d || ay < twom500.d)
    {
      ax *= two500.d;
      ay *= two500.d;
    }

  /* Likewise for large x and y. */
  if (ax > two500.d || ay > two500.d)
    {
      ax *= twom500.d;
      ay *= twom500.d;
    }

  /* x,y which are neither special nor extreme: compute the ratio of the
     smaller magnitude over the larger as u + du (double length). */
  if (ay < ax)
    {
      u = ay / ax;
      EMULV (ax, u, v, vv, t1, t2, t3, t4, t5);
      du = ((ay - v) - vv) / ax;
    }
  else
    {
      u = ax / ay;
      EMULV (ay, u, v, vv, t1, t2, t3, t4, t5);
      du = ((ax - v) - vv) / ay;
    }

  if (x > 0)
    {
      /* (i) x>0, abs(y)< abs(x): atan(ay/ax) */
      if (ay < ax)
        {
          if (u < inv16.d)
            {
              /* small-argument polynomial, then rounding test */
              v = u * u;
              zz = du + u * v * (d3.d
                                 + v * (d5.d
                                        + v * (d7.d
                                               + v * (d9.d
                                                      + v * (d11.d
                                                             + v * d13.d)))));
              if ((z = u + (zz - u1.d * u)) == u + (zz + u1.d * u))
                return signArctan2 (y, z);

              /* double-length polynomial, second rounding test */
              MUL2 (u, du, u, du, v, vv, t1, t2, t3, t4, t5, t6, t7, t8);
              s1 = v * (f11.d + v * (f13.d
                                     + v * (f15.d + v * (f17.d + v * f19.d))));
              ADD2 (f9.d, ff9.d, s1, 0, s2, ss2, t1, t2);
              MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
              ADD2 (f7.d, ff7.d, s1, ss1, s2, ss2, t1, t2);
              MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
              ADD2 (f5.d, ff5.d, s1, ss1, s2, ss2, t1, t2);
              MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
              ADD2 (f3.d, ff3.d, s1, ss1, s2, ss2, t1, t2);
              MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
              MUL2 (u, du, s1, ss1, s2, ss2, t1, t2, t3, t4, t5, t6, t7, t8);
              ADD2 (u, du, s2, ss2, s1, ss1, t1, t2);
              if ((z = s1 + (ss1 - u5.d * s1)) == s1 + (ss1 + u5.d * s1))
                return signArctan2 (y, z);
              return atan2Mp (x, y, pr);
            }

          /* table-based evaluation around cij[i][0] */
          i = (TWO52 + TWO8 * u) - TWO52;
          i -= 16;
          t3 = u - cij[i][0].d;
          EADD (t3, du, v, dv);
          t1 = cij[i][1].d;
          t2 = cij[i][2].d;
          zz = v * t2 + (dv * t2
                         + v * v * (cij[i][3].d
                                    + v * (cij[i][4].d
                                           + v * (cij[i][5].d
                                                  + v * cij[i][6].d))));
          if (i < 112)
            {
              if (i < 48)
                u9 = u91.d;     /* u < 1/4 */
              else
                u9 = u92.d;
            }                   /* 1/4 <= u < 1/2 */
          else
            {
              if (i < 176)
                u9 = u93.d;     /* 1/2 <= u < 3/4 */
              else
                u9 = u94.d;
            }                   /* 3/4 <= u <= 1 */
          if ((z = t1 + (zz - u9 * t1)) == t1 + (zz + u9 * t1))
            return signArctan2 (y, z);

          /* higher precision table (hij) before multiprecision fallback */
          t1 = u - hij[i][0].d;
          EADD (t1, du, v, vv);
          s1 = v * (hij[i][11].d
                    + v * (hij[i][12].d
                           + v * (hij[i][13].d
                                  + v * (hij[i][14].d
                                         + v * hij[i][15].d))));
          ADD2 (hij[i][9].d, hij[i][10].d, s1, 0, s2, ss2, t1, t2);
          MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
          ADD2 (hij[i][7].d, hij[i][8].d, s1, ss1, s2, ss2, t1, t2);
          MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
          ADD2 (hij[i][5].d, hij[i][6].d, s1, ss1, s2, ss2, t1, t2);
          MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
          ADD2 (hij[i][3].d, hij[i][4].d, s1, ss1, s2, ss2, t1, t2);
          MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
          ADD2 (hij[i][1].d, hij[i][2].d, s1, ss1, s2, ss2, t1, t2);
          if ((z = s2 + (ss2 - ub.d * s2)) == s2 + (ss2 + ub.d * s2))
            return signArctan2 (y, z);
          return atan2Mp (x, y, pr);
        }

      /* (ii) x>0, abs(x)<=abs(y): pi/2-atan(ax/ay) */
      if (u < inv16.d)
        {
          v = u * u;
          zz = u * v * (d3.d
                        + v * (d5.d
                               + v * (d7.d
                                      + v * (d9.d
                                             + v * (d11.d + v * d13.d)))));
          ESUB (hpi.d, u, t2, cor);
          t3 = ((hpi1.d + cor) - du) - zz;
          if ((z = t2 + (t3 - u2.d)) == t2 + (t3 + u2.d))
            return signArctan2 (y, z);

          MUL2 (u, du, u, du, v, vv, t1, t2, t3, t4, t5, t6, t7, t8);
          s1 = v * (f11.d + v * (f13.d
                                 + v * (f15.d + v * (f17.d + v * f19.d))));
          ADD2 (f9.d, ff9.d, s1, 0, s2, ss2, t1, t2);
          MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
          ADD2 (f7.d, ff7.d, s1, ss1, s2, ss2, t1, t2);
          MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
          ADD2 (f5.d, ff5.d, s1, ss1, s2, ss2, t1, t2);
          MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
          ADD2 (f3.d, ff3.d, s1, ss1, s2, ss2, t1, t2);
          MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
          MUL2 (u, du, s1, ss1, s2, ss2, t1, t2, t3, t4, t5, t6, t7, t8);
          ADD2 (u, du, s2, ss2, s1, ss1, t1, t2);
          SUB2 (hpi.d, hpi1.d, s1, ss1, s2, ss2, t1, t2);
          if ((z = s2 + (ss2 - u6.d)) == s2 + (ss2 + u6.d))
            return signArctan2 (y, z);
          return atan2Mp (x, y, pr);
        }

      i = (TWO52 + TWO8 * u) - TWO52;
      i -= 16;
      v = (u - cij[i][0].d) + du;
      zz = hpi1.d - v * (cij[i][2].d
                         + v * (cij[i][3].d
                                + v * (cij[i][4].d
                                       + v * (cij[i][5].d
                                              + v * cij[i][6].d))));
      t1 = hpi.d - cij[i][1].d;
      if (i < 112)
        ua = ua1.d;             /* w < 1/2 */
      else
        ua = ua2.d;             /* w >= 1/2 */
      if ((z = t1 + (zz - ua)) == t1 + (zz + ua))
        return signArctan2 (y, z);

      t1 = u - hij[i][0].d;
      EADD (t1, du, v, vv);
      s1 = v * (hij[i][11].d
                + v * (hij[i][12].d
                       + v * (hij[i][13].d
                              + v * (hij[i][14].d
                                     + v * hij[i][15].d))));
      ADD2 (hij[i][9].d, hij[i][10].d, s1, 0, s2, ss2, t1, t2);
      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
      ADD2 (hij[i][7].d, hij[i][8].d, s1, ss1, s2, ss2, t1, t2);
      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
      ADD2 (hij[i][5].d, hij[i][6].d, s1, ss1, s2, ss2, t1, t2);
      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
      ADD2 (hij[i][3].d, hij[i][4].d, s1, ss1, s2, ss2, t1, t2);
      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
      ADD2 (hij[i][1].d, hij[i][2].d, s1, ss1, s2, ss2, t1, t2);
      SUB2 (hpi.d, hpi1.d, s2, ss2, s1, ss1, t1, t2);
      if ((z = s1 + (ss1 - uc.d)) == s1 + (ss1 + uc.d))
        return signArctan2 (y, z);
      return atan2Mp (x, y, pr);
    }

  /* (iii) x<0, abs(x)< abs(y): pi/2+atan(ax/ay) */
  if (ax < ay)
    {
      if (u < inv16.d)
        {
          v = u * u;
          zz = u * v * (d3.d
                        + v * (d5.d
                               + v * (d7.d
                                      + v * (d9.d
                                             + v * (d11.d + v * d13.d)))));
          EADD (hpi.d, u, t2, cor);
          t3 = ((hpi1.d + cor) + du) + zz;
          if ((z = t2 + (t3 - u3.d)) == t2 + (t3 + u3.d))
            return signArctan2 (y, z);

          MUL2 (u, du, u, du, v, vv, t1, t2, t3, t4, t5, t6, t7, t8);
          s1 = v * (f11.d + v * (f13.d
                                 + v * (f15.d + v * (f17.d + v * f19.d))));
          ADD2 (f9.d, ff9.d, s1, 0, s2, ss2, t1, t2);
          MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
          ADD2 (f7.d, ff7.d, s1, ss1, s2, ss2, t1, t2);
          MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
          ADD2 (f5.d, ff5.d, s1, ss1, s2, ss2, t1, t2);
          MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
          ADD2 (f3.d, ff3.d, s1, ss1, s2, ss2, t1, t2);
          MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
          MUL2 (u, du, s1, ss1, s2, ss2, t1, t2, t3, t4, t5, t6, t7, t8);
          ADD2 (u, du, s2, ss2, s1, ss1, t1, t2);
          ADD2 (hpi.d, hpi1.d, s1, ss1, s2, ss2, t1, t2);
          if ((z = s2 + (ss2 - u7.d)) == s2 + (ss2 + u7.d))
            return signArctan2 (y, z);
          return atan2Mp (x, y, pr);
        }

      i = (TWO52 + TWO8 * u) - TWO52;
      i -= 16;
      v = (u - cij[i][0].d) + du;
      zz = hpi1.d + v * (cij[i][2].d
                         + v * (cij[i][3].d
                                + v * (cij[i][4].d
                                       + v * (cij[i][5].d
                                              + v * cij[i][6].d))));
      t1 = hpi.d + cij[i][1].d;
      if (i < 112)
        ua = ua1.d;             /* w < 1/2 */
      else
        ua = ua2.d;             /* w >= 1/2 */
      if ((z = t1 + (zz - ua)) == t1 + (zz + ua))
        return signArctan2 (y, z);

      t1 = u - hij[i][0].d;
      EADD (t1, du, v, vv);
      s1 = v * (hij[i][11].d
                + v * (hij[i][12].d
                       + v * (hij[i][13].d
                              + v * (hij[i][14].d
                                     + v * hij[i][15].d))));
      ADD2 (hij[i][9].d, hij[i][10].d, s1, 0, s2, ss2, t1, t2);
      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
      ADD2 (hij[i][7].d, hij[i][8].d, s1, ss1, s2, ss2, t1, t2);
      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
      ADD2 (hij[i][5].d, hij[i][6].d, s1, ss1, s2, ss2, t1, t2);
      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
      ADD2 (hij[i][3].d, hij[i][4].d, s1, ss1, s2, ss2, t1, t2);
      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
      ADD2 (hij[i][1].d, hij[i][2].d, s1, ss1, s2, ss2, t1, t2);
      ADD2 (hpi.d, hpi1.d, s2, ss2, s1, ss1, t1, t2);
      if ((z = s1 + (ss1 - uc.d)) == s1 + (ss1 + uc.d))
        return signArctan2 (y, z);
      return atan2Mp (x, y, pr);
    }

  /* (iv) x<0, abs(y)<=abs(x): pi-atan(ax/ay) */
  if (u < inv16.d)
    {
      v = u * u;
      zz = u * v * (d3.d
                    + v * (d5.d
                           + v * (d7.d
                                  + v * (d9.d
                                         + v * (d11.d + v * d13.d)))));
      ESUB (opi.d, u, t2, cor);
      t3 = ((opi1.d + cor) - du) - zz;
      if ((z = t2 + (t3 - u4.d)) == t2 + (t3 + u4.d))
        return signArctan2 (y, z);

      MUL2 (u, du, u, du, v, vv, t1, t2, t3, t4, t5, t6, t7, t8);
      s1 = v * (f11.d + v * (f13.d + v * (f15.d + v * (f17.d + v * f19.d))));
      ADD2 (f9.d, ff9.d, s1, 0, s2, ss2, t1, t2);
      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
      ADD2 (f7.d, ff7.d, s1, ss1, s2, ss2, t1, t2);
      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
      ADD2 (f5.d, ff5.d, s1, ss1, s2, ss2, t1, t2);
      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
      ADD2 (f3.d, ff3.d, s1, ss1, s2, ss2, t1, t2);
      MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
      MUL2 (u, du, s1, ss1, s2, ss2, t1, t2, t3, t4, t5, t6, t7, t8);
      ADD2 (u, du, s2, ss2, s1, ss1, t1, t2);
      SUB2 (opi.d, opi1.d, s1, ss1, s2, ss2, t1, t2);
      if ((z = s2 + (ss2 - u8.d)) == s2 + (ss2 + u8.d))
        return signArctan2 (y, z);
      return atan2Mp (x, y, pr);
    }

  i = (TWO52 + TWO8 * u) - TWO52;
  i -= 16;
  v = (u - cij[i][0].d) + du;
  zz = opi1.d - v * (cij[i][2].d
                     + v * (cij[i][3].d
                            + v * (cij[i][4].d
                                   + v * (cij[i][5].d
                                          + v * cij[i][6].d))));
  t1 = opi.d - cij[i][1].d;
  if (i < 112)
    ua = ua1.d;                 /* w < 1/2 */
  else
    ua = ua2.d;                 /* w >= 1/2 */
  if ((z = t1 + (zz - ua)) == t1 + (zz + ua))
    return signArctan2 (y, z);

  t1 = u - hij[i][0].d;
  EADD (t1, du, v, vv);
  s1 = v * (hij[i][11].d
            + v * (hij[i][12].d
                   + v * (hij[i][13].d
                          + v * (hij[i][14].d
                                 + v * hij[i][15].d))));
  ADD2 (hij[i][9].d, hij[i][10].d, s1, 0, s2, ss2, t1, t2);
  MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
  ADD2 (hij[i][7].d, hij[i][8].d, s1, ss1, s2, ss2, t1, t2);
  MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
  ADD2 (hij[i][5].d, hij[i][6].d, s1, ss1, s2, ss2, t1, t2);
  MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
  ADD2 (hij[i][3].d, hij[i][4].d, s1, ss1, s2, ss2, t1, t2);
  MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
  ADD2 (hij[i][1].d, hij[i][2].d, s1, ss1, s2, ss2, t1, t2);
  SUB2 (opi.d, opi1.d, s2, ss2, s1, ss1, t1, t2);
  if ((z = s1 + (ss1 - uc.d)) == s1 + (ss1 + uc.d))
    return signArctan2 (y, z);
  return atan2Mp (x, y, pr);
}
/* Even half of the 1-D 32-point forward DCT row pass, using MIPS MSA-style
   vector macros (v8i16/v4i32).  Reads 16 vectors of 8 int16 coefficients
   from `input`, stages intermediate butterflies through `interm_ptr`, and
   writes the even-index DCT outputs to `out` at fixed offsets.
   NOTE(review): registers are deliberately reused across stages (e.g. in0..
   in15, vec*_r/_l); the statement order is part of the dataflow — do not
   reorder.  */
static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
                                    int16_t *out) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l;
  v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r;
  v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w;

  /* fdct32 even */
  /* stage 2 */
  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
  /* 16-way butterfly: sums in vec0..vec7, differences back into in8..in15 */
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
               in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5,
               vec6, vec7, in8, in9, in10, in11, in12, in13, in14, in15);
  /* park both halves in scratch memory; reloaded below */
  ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8);

  /* Stage 3: widen sums to 32-bit lanes for the high-precision butterflies */
  UNPCK_SH_SW(vec0, vec0_l, vec0_r);
  UNPCK_SH_SW(vec1, vec1_l, vec1_r);
  UNPCK_SH_SW(vec2, vec2_l, vec2_r);
  UNPCK_SH_SW(vec3, vec3_l, vec3_r);
  UNPCK_SH_SW(vec4, vec4_l, vec4_r);
  UNPCK_SH_SW(vec5, vec5_l, vec5_r);
  UNPCK_SH_SW(vec6, vec6_l, vec6_r);
  UNPCK_SH_SW(vec7, vec7_l, vec7_r);
  ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r,
       tmp0_w, tmp1_w, tmp2_w, tmp3_w);
  BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r);
  ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l,
       vec0_r, vec1_r, vec2_r, vec3_r);

  tmp3_w = vec0_r + vec3_r;
  vec0_r = vec0_r - vec3_r;
  vec3_r = vec1_r + vec2_r;
  vec1_r = vec1_r - vec2_r;

  /* rotate by cos(pi/4), round/shift, and narrow back to int16 */
  DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64,
                    vec4_r, tmp3_w, vec6_r, vec3_r);
  FDCT32_POSTPROC_NEG_W(vec4_r);
  FDCT32_POSTPROC_NEG_W(tmp3_w);
  FDCT32_POSTPROC_NEG_W(vec6_r);
  FDCT32_POSTPROC_NEG_W(vec3_r);
  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
  ST_SH2(vec5, vec4, out, 8);

  DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64,
                    vec4_r, tmp3_w, vec6_r, vec3_r);
  FDCT32_POSTPROC_NEG_W(vec4_r);
  FDCT32_POSTPROC_NEG_W(tmp3_w);
  FDCT32_POSTPROC_NEG_W(vec6_r);
  FDCT32_POSTPROC_NEG_W(vec3_r);
  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
  ST_SH2(vec5, vec4, out + 16, 8);

  /* reload stage-2 sums and process the odd-rotation pairs */
  LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 32);
  ST_SH(in5, out + 56);

  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 40);
  ST_SH(in5, out + 48);

  /* reload stage-2 differences */
  LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 64);
  ST_SH(in5, out + 120);

  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 72);
  ST_SH(in5, out + 112);

  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 80);
  ST_SH(in5, out + 104);

  ADD2(in3, in2, in0, in1, vec3, vec4);
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 96);
  ST_SH(in5, out + 88);
}
/* Even half of the 1-D 32-point forward DCT column pass (MSA vectors).
   Reads 16 vectors of 8 int16 coefficients from `input` (stride 8 shorts)
   and stores each even-index DCT output row at a fixed offset into `temp`
   (offsets are multiples of 64 shorts).  NOTE(review): register reuse and
   statement order form the butterfly dataflow — do not reorder.  */
static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 temp0, temp1;

  /* fdct even */
  LD_SH4(input, 8, in0, in1, in2, in3);
  LD_SH4(input + 96, 8, in12, in13, in14, in15);
  /* butterfly rows 0..3 against rows 12..15 */
  BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, vec2,
              vec3, in12, in13, in14, in15);
  LD_SH4(input + 32, 8, in4, in5, in6, in7);
  LD_SH4(input + 64, 8, in8, in9, in10, in11);
  /* butterfly rows 4..7 against rows 8..11 */
  BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6,
              vec7, in8, in9, in10, in11);

  /* Stage 3 */
  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
  BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
  DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp);
  ST_SH(temp1, temp + 512);

  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 256);
  ST_SH(temp1, temp + 768);

  SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 128);
  ST_SH(temp1, temp + 896);

  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 640);
  ST_SH(temp1, temp + 384);

  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 64);
  ST_SH(temp1, temp + 960);

  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 576);
  ST_SH(temp1, temp + 448);

  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 320);
  ST_SH(temp1, temp + 704);

  ADD2(in3, in2, in0, in1, vec3, vec4);
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
  ST_SH(temp0, temp + 192);
  ST_SH(temp1, temp + 832);
}
/* 1-D 16-point forward DCT over 8 columns (MSA vectors).  Loads 16 rows of
   8 int16 samples from `input` with row stride `src_stride` (in shorts),
   pre-scales by <<2, runs the even half through FDCT8x16_EVEN, and computes
   the odd half inline via ilvev/splati-paired cosine constants.  Outputs go
   to `tmp_ptr` at fixed offsets (multiples of 32 shorts).
   NOTE(review): cnst0/cnst1 are repeatedly repacked in place; the exact
   sequence of splati/ilvev calls is load-bearing — do not reorder.  */
void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
                        int32_t src_stride) {
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30;
  v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
  /* cosine constants, laid out for splati/ilvev pairing below */
  v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64,  cospi_24_64,
                  -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 };
  v8i16 coeff1 = { cospi_2_64,  cospi_30_64, cospi_14_64, cospi_18_64,
                   cospi_10_64, cospi_22_64, cospi_6_64,  cospi_26_64 };
  v8i16 coeff2 = { -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64,
                   0, 0, 0, 0 };

  LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9,
          in10, in11, in12, in13, in14, in15);
  /* pre-scale samples by 4 (<<2) */
  SLLI_4V(in0, in1, in2, in3, 2);
  SLLI_4V(in4, in5, in6, in7, 2);
  SLLI_4V(in8, in9, in10, in11, 2);
  SLLI_4V(in12, in13, in14, in15, 2);

  /* even half: sums of mirrored rows */
  ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3);
  ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7);
  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32);

  /* odd half: differences of mirrored rows */
  SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12);
  SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8);

  tmp_ptr += 16;

  /* stp 1 */
  ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4);
  ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5);

  cnst4 = __msa_splati_h(coeff, 0);
  stp25 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4);

  cnst5 = __msa_splati_h(coeff, 1);
  cnst5 = __msa_ilvev_h(cnst5, cnst4);
  stp22 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5);
  stp24 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4);
  stp23 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5);

  /* stp2 */
  BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33);
  BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34);

  ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4);
  ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5);

  SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  stp26 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0);

  cnst0 = __msa_splati_h(coeff, 4);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);
  stp21 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1);

  BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9);

  ILVRL_H2_SH(in15, in8, vec1, vec0);
  SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);

  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr);

  cnst0 = __msa_splati_h(coeff2, 0);
  cnst0 = __msa_ilvev_h(cnst1, cnst0);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 224);

  ILVRL_H2_SH(in14, in9, vec1, vec0);
  SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);

  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
  ST_SH(in8, tmp_ptr + 128);

  cnst1 = __msa_splati_h(coeff2, 2);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 96);

  SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);

  stp25 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);

  cnst1 = __msa_splati_h(coeff, 3);
  cnst1 = __msa_ilvev_h(cnst0, cnst1);
  stp22 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);

  /* stp4 */
  ADD2(stp34, stp25, stp33, stp22, in13, in10);

  ILVRL_H2_SH(in13, in10, vec1, vec0);
  SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 64);

  cnst0 = __msa_splati_h(coeff2, 1);
  cnst0 = __msa_ilvev_h(cnst1, cnst0);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 160);

  SUB2(stp34, stp25, stp33, stp22, in12, in11);
  ILVRL_H2_SH(in12, in11, vec1, vec0);
  SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1);
  cnst1 = __msa_ilvev_h(cnst1, cnst0);

  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
  ST_SH(in8, tmp_ptr + 192);

  cnst1 = __msa_splati_h(coeff2, 3);
  cnst0 = __msa_ilvev_h(cnst0, cnst1);
  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
  ST_SH(in8, tmp_ptr + 32);
}
/* routine computes the correctly rounded (to nearest) value of atan(x). */
/* Strategy (glibc dbl-64 style): |x| is classified into bands
   [0,A) [A,B) [B,C) [C,D) [D,E) [E,inf); each band is handled first by a
   fast polynomial or table evaluation with a rounding test, then — when the
   test is inconclusive — by a double-length re-evaluation, and finally by
   the multiprecision fallback atanMp.  Helper macros (EMULV, MUL2, ADD2,
   SUB2, DIV2, EADD, ESUB) and the cij/hij/d*/f* tables are defined
   elsewhere.  NOTE(review): FP statement order is load-bearing for correct
   rounding; do not reorder.  */
double
atan (double x)
{
  double cor, s1, ss1, s2, ss2, t1, t2, t3, t7, t8, t9, t10, u, u2, u3,
         v, vv, w, ww, y, yy, z, zz;
#ifndef DLA_FMS
  double t4, t5, t6;
#endif
  int i, ux, dx;
  static const int pr[M] = { 6, 8, 10, 32 };
  number num;

  num.d = x;
  ux = num.i[HIGH_HALF];
  dx = num.i[LOW_HALF];

  /* x=NaN */
  if (((ux & 0x7ff00000) == 0x7ff00000)
      && (((ux & 0x000fffff) | dx) != 0x00000000))
    return x + x;

  /* Regular values of x, including denormals +-0 and +-INF */
  SET_RESTORE_ROUND (FE_TONEAREST);
  u = (x < 0) ? -x : x;
  if (u < C)
    {
      if (u < B)
        {
          if (u < A)
            {
              /* tiny argument: atan(x) rounds to x */
              math_check_force_underflow_nonneg (u);
              return x;
            }
          else
            {                   /* A <= u < B */
              v = x * x;
              yy = d11.d + v * d13.d;
              yy = d9.d + v * yy;
              yy = d7.d + v * yy;
              yy = d5.d + v * yy;
              yy = d3.d + v * yy;
              yy *= x * v;

              if ((y = x + (yy - U1 * x)) == x + (yy + U1 * x))
                return y;

              /* double-length polynomial, second rounding test */
              EMULV (x, x, v, vv, t1, t2, t3, t4, t5); /* v+vv=x^2 */

              s1 = f17.d + v * f19.d;
              s1 = f15.d + v * s1;
              s1 = f13.d + v * s1;
              s1 = f11.d + v * s1;
              s1 *= v;

              ADD2 (f9.d, ff9.d, s1, 0, s2, ss2, t1, t2);
              MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
              ADD2 (f7.d, ff7.d, s1, ss1, s2, ss2, t1, t2);
              MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
              ADD2 (f5.d, ff5.d, s1, ss1, s2, ss2, t1, t2);
              MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
              ADD2 (f3.d, ff3.d, s1, ss1, s2, ss2, t1, t2);
              MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
              MUL2 (x, 0, s1, ss1, s2, ss2, t1, t2, t3, t4, t5, t6, t7, t8);
              ADD2 (x, 0, s2, ss2, s1, ss1, t1, t2);
              if ((y = s1 + (ss1 - U5 * s1)) == s1 + (ss1 + U5 * s1))
                return y;

              return atanMp (x, pr);
            }
        }
      else
        {                       /* B <= u < C */
          i = (TWO52 + TWO8 * u) - TWO52;
          i -= 16;
          z = u - cij[i][0].d;
          yy = cij[i][5].d + z * cij[i][6].d;
          yy = cij[i][4].d + z * yy;
          yy = cij[i][3].d + z * yy;
          yy = cij[i][2].d + z * yy;
          yy *= z;

          t1 = cij[i][1].d;
          if (i < 112)
            {
              if (i < 48)
                u2 = U21;       /* u < 1/4 */
              else
                u2 = U22;
            }                   /* 1/4 <= u < 1/2 */
          else
            {
              if (i < 176)
                u2 = U23;       /* 1/2 <= u < 3/4 */
              else
                u2 = U24;
            }                   /* 3/4 <= u <= 1 */
          if ((y = t1 + (yy - u2 * t1)) == t1 + (yy + u2 * t1))
            return __signArctan (x, y);

          /* higher-precision table before multiprecision fallback */
          z = u - hij[i][0].d;

          s1 = hij[i][14].d + z * hij[i][15].d;
          s1 = hij[i][13].d + z * s1;
          s1 = hij[i][12].d + z * s1;
          s1 = hij[i][11].d + z * s1;
          s1 *= z;

          ADD2 (hij[i][9].d, hij[i][10].d, s1, 0, s2, ss2, t1, t2);
          MUL2 (z, 0, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
          ADD2 (hij[i][7].d, hij[i][8].d, s1, ss1, s2, ss2, t1, t2);
          MUL2 (z, 0, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
          ADD2 (hij[i][5].d, hij[i][6].d, s1, ss1, s2, ss2, t1, t2);
          MUL2 (z, 0, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
          ADD2 (hij[i][3].d, hij[i][4].d, s1, ss1, s2, ss2, t1, t2);
          MUL2 (z, 0, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
          ADD2 (hij[i][1].d, hij[i][2].d, s1, ss1, s2, ss2, t1, t2);
          if ((y = s2 + (ss2 - U6 * s2)) == s2 + (ss2 + U6 * s2))
            return __signArctan (x, y);

          return atanMp (x, pr);
        }
    }
  else
    {
      if (u < D)
        {                       /* C <= u < D: atan(u) = pi/2 - atan(1/u) */
          w = 1 / u;
          EMULV (w, u, t1, t2, t3, t4, t5, t6, t7);
          ww = w * ((1 - t1) - t2);
          i = (TWO52 + TWO8 * w) - TWO52;
          i -= 16;
          z = (w - cij[i][0].d) + ww;

          yy = cij[i][5].d + z * cij[i][6].d;
          yy = cij[i][4].d + z * yy;
          yy = cij[i][3].d + z * yy;
          yy = cij[i][2].d + z * yy;
          yy = HPI1 - z * yy;

          t1 = HPI - cij[i][1].d;
          if (i < 112)
            u3 = U31;           /* w < 1/2 */
          else
            u3 = U32;           /* w >= 1/2 */
          if ((y = t1 + (yy - u3)) == t1 + (yy + u3))
            return __signArctan (x, y);

          /* double-length 1/u, then high-precision table */
          DIV2 (1, 0, u, 0, w, ww, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10);
          t1 = w - hij[i][0].d;
          EADD (t1, ww, z, zz);

          s1 = hij[i][14].d + z * hij[i][15].d;
          s1 = hij[i][13].d + z * s1;
          s1 = hij[i][12].d + z * s1;
          s1 = hij[i][11].d + z * s1;
          s1 *= z;

          ADD2 (hij[i][9].d, hij[i][10].d, s1, 0, s2, ss2, t1, t2);
          MUL2 (z, zz, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
          ADD2 (hij[i][7].d, hij[i][8].d, s1, ss1, s2, ss2, t1, t2);
          MUL2 (z, zz, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
          ADD2 (hij[i][5].d, hij[i][6].d, s1, ss1, s2, ss2, t1, t2);
          MUL2 (z, zz, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
          ADD2 (hij[i][3].d, hij[i][4].d, s1, ss1, s2, ss2, t1, t2);
          MUL2 (z, zz, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
          ADD2 (hij[i][1].d, hij[i][2].d, s1, ss1, s2, ss2, t1, t2);
          SUB2 (HPI, HPI1, s2, ss2, s1, ss1, t1, t2);
          if ((y = s1 + (ss1 - U7)) == s1 + (ss1 + U7))
            return __signArctan (x, y);

          return atanMp (x, pr);
        }
      else
        {
          if (u < E)
            {                   /* D <= u < E: atan(u) = pi/2 - atan(1/u) */
              w = 1 / u;
              v = w * w;
              EMULV (w, u, t1, t2, t3, t4, t5, t6, t7);

              yy = d11.d + v * d13.d;
              yy = d9.d + v * yy;
              yy = d7.d + v * yy;
              yy = d5.d + v * yy;
              yy = d3.d + v * yy;
              yy *= w * v;

              ww = w * ((1 - t1) - t2);
              ESUB (HPI, w, t3, cor);
              yy = ((HPI1 + cor) - ww) - yy;
              if ((y = t3 + (yy - U4)) == t3 + (yy + U4))
                return __signArctan (x, y);

              DIV2 (1, 0, u, 0, w, ww, t1, t2, t3, t4, t5, t6, t7, t8, t9,
                    t10);
              MUL2 (w, ww, w, ww, v, vv, t1, t2, t3, t4, t5, t6, t7, t8);

              s1 = f17.d + v * f19.d;
              s1 = f15.d + v * s1;
              s1 = f13.d + v * s1;
              s1 = f11.d + v * s1;
              s1 *= v;

              ADD2 (f9.d, ff9.d, s1, 0, s2, ss2, t1, t2);
              MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
              ADD2 (f7.d, ff7.d, s1, ss1, s2, ss2, t1, t2);
              MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
              ADD2 (f5.d, ff5.d, s1, ss1, s2, ss2, t1, t2);
              MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
              ADD2 (f3.d, ff3.d, s1, ss1, s2, ss2, t1, t2);
              MUL2 (v, vv, s2, ss2, s1, ss1, t1, t2, t3, t4, t5, t6, t7, t8);
              MUL2 (w, ww, s1, ss1, s2, ss2, t1, t2, t3, t4, t5, t6, t7, t8);
              ADD2 (w, ww, s2, ss2, s1, ss1, t1, t2);
              SUB2 (HPI, HPI1, s1, ss1, s2, ss2, t1, t2);

              if ((y = s2 + (ss2 - U8)) == s2 + (ss2 + U8))
                return __signArctan (x, y);
              return atanMp (x, pr);
            }
          else
            {
              /* u >= E: atan saturates to +-pi/2 */
              if (x > 0)
                return HPI;
              else
                return MHPI;
            }
        }
    }
}