fast_idct_8 (short *in, int stride) { INT32 tmp10, tmp11, tmp12, tmp13; INT32 tmp20, tmp21, tmp22, tmp23; INT32 tmp30, tmp31; INT32 tmp40, tmp41, tmp42, tmp43; INT32 tmp50, tmp51, tmp52, tmp53; INT32 in0, in1, in2, in3, in4, in5, in6, in7; int i, j; in0 = in[ 0]; in1 = in[stride ]; in2 = in[stride*2]; in3 = in[stride*3]; in4 = in[stride*4]; in5 = in[stride*5]; in6 = in[stride*6]; in7 = in[stride*7]; tmp10 = (in0 + in4) * COS_1_4; tmp11 = (in0 - in4) * COS_1_4; tmp12 = in2 * SIN_1_8 - in6 * COS_1_8; tmp13 = in6 * SIN_1_8 + in2 * COS_1_8; tmp20 = tmp10 + tmp13; tmp21 = tmp11 + tmp12; tmp22 = tmp11 - tmp12; tmp23 = tmp10 - tmp13; tmp30 = UNFIXO((in3 + in5) * COS_1_4); tmp31 = UNFIXO((in3 - in5) * COS_1_4); tmp40 = OVERSH(in1) + tmp30; tmp41 = OVERSH(in7) + tmp31; tmp42 = OVERSH(in1) - tmp30; tmp43 = OVERSH(in7) - tmp31; tmp50 = tmp40 * OCOS_1_16 + tmp41 * OSIN_1_16; tmp51 = tmp40 * OSIN_1_16 - tmp41 * OCOS_1_16; tmp52 = tmp42 * OCOS_5_16 + tmp43 * OSIN_5_16; tmp53 = tmp42 * OSIN_5_16 - tmp43 * OCOS_5_16; in[ 0] = UNFIXH(tmp20 + tmp50); in[stride ] = UNFIXH(tmp21 + tmp53); in[stride*2] = UNFIXH(tmp22 + tmp52); in[stride*3] = UNFIXH(tmp23 + tmp51); in[stride*4] = UNFIXH(tmp23 - tmp51); in[stride*5] = UNFIXH(tmp22 - tmp52); in[stride*6] = UNFIXH(tmp21 - tmp53); in[stride*7] = UNFIXH(tmp20 - tmp50); }
/*
 * mp_fwd_dct_fast --
 *
 * Two-pass forward DCT over a DCTSIZE x DCTSIZE block.
 *
 * Each pass runs an 8-point 1-D transform on every row of its source
 * and scatters the eight results into one column of its destination:
 *
 *   pass 0: reads rows of data2d, writes columns of a local workspace
 *           (element k of line r goes to workspace[k * DCTSIZE + r]);
 *   pass 1: reads rows of the workspace, writes into dest2d at the
 *           positions given by zigzag[k][r].
 *
 * data2d is only read; dest2d receives the result.
 *
 * NOTE(review): both Block arguments are accessed through int32
 * pointers, so Block is assumed to be a contiguous array of int32 --
 * confirm against its declaration.  zigzag[][] is defined outside this
 * block and is presumably a coefficient reordering table -- confirm.
 */
__inline__ void
mp_fwd_dct_fast(Block data2d, Block dest2d)
{
    int32 *samples = (int32 *) data2d;      /* flat 1-D view of the input  */
    int32 *result = (int32 *) dest2d;       /* flat 1-D view of the output */
    int32 transposed[DCTSIZE_SQ];           /* pass-0 output, pass-1 input */
    int32 *src, *dst;
    int slot[DCTSIZE];                      /* output index for each term  */
    int pass, line, k;
    int32 sum0, sum1, sum2, sum3;
    int32 dif0, dif1, dif2, dif3;
    int32 outer_sum, inner_sum, inner_dif, outer_dif;
    int32 mid_sum, mid_dif;
    int32 odd_a, odd_b, odd_c, odd_d;
    SHIFT_TEMPS;

    for (pass = 0; pass < 2; pass++) {
        if (pass == 0) {
            src = samples;
            dst = transposed;
        } else {
            src = transposed;
            dst = result;
        }

        for (line = 0; line < DCTSIZE; line++, src += DCTSIZE) {
            /* Work out where each of the eight results of this line goes. */
            for (k = 0; k < DCTSIZE; k++) {
                if (pass == 0) {
                    slot[k] = DCTSIZE * k + line;
                } else {
                    slot[k] = zigzag[k][line];
                }
            }

            /*
             * First butterfly stage: mirror-image sums and differences.
             * The original notes give the range of these values for
             * sample input as -512..+512 (-256..+256 for I-blocks).
             */
            sum0 = src[0] + src[7];
            sum1 = src[1] + src[6];
            sum2 = src[2] + src[5];
            sum3 = src[3] + src[4];
            dif3 = src[3] - src[4];
            dif2 = src[2] - src[5];
            dif1 = src[1] - src[6];
            dif0 = src[0] - src[7];

            /* Even-indexed outputs are built from the sums only. */
            outer_sum = sum0 + sum3;
            inner_sum = sum1 + sum2;
            inner_dif = sum1 - sum2;
            outer_dif = sum0 - sum3;

            dst[slot[0]] = (int32) UNFIXH((outer_sum + inner_sum) * SIN_1_4);
            dst[slot[4]] = (int32) UNFIXH((outer_sum - inner_sum) * COS_1_4);
            dst[slot[2]] = (int32) UNFIXH(outer_dif * COS_1_8 + inner_dif * SIN_1_8);
            dst[slot[6]] = (int32) UNFIXH(outer_dif * SIN_1_8 - inner_dif * COS_1_8);

            /* Odd-indexed outputs are built from the differences. */
            mid_sum = UNFIXO((dif1 + dif2) * SIN_1_4);
            mid_dif = UNFIXO((dif1 - dif2) * COS_1_4);

            /*
             * Per the original notes, after these two OVERSHIFTs dif3,
             * dif0, mid_dif and mid_sum are all overscaled by OVERSCALE.
             */
            OVERSHIFT(dif3);
            OVERSHIFT(dif0);

            odd_a = dif3 + mid_dif;
            odd_b = dif3 - mid_dif;
            odd_c = dif0 - mid_sum;
            odd_d = dif0 + mid_sum;

            dst[slot[1]] = (int32) UNFIXH(odd_d * OCOS_1_16 + odd_a * OSIN_1_16);
            dst[slot[7]] = (int32) UNFIXH(odd_d * OCOS_7_16 - odd_a * OSIN_7_16);
            dst[slot[5]] = (int32) UNFIXH(odd_c * OCOS_5_16 + odd_b * OSIN_5_16);
            dst[slot[3]] = (int32) UNFIXH(odd_c * OCOS_3_16 - odd_b * OSIN_3_16);
        }
    }
}