/*
 * __mlib_VideoIDCT8x8_S16_S16_Q1
 *
 * Two-step inverse DCT driver: IDCT1 is applied to `coeffs` with a local
 * 64-entry 64-bit workspace, then IDCT2 is applied to that workspace with
 * `block`.
 *
 *   block   - passed (via blockPtr) as the second argument of IDCT2;
 *             presumably the output samples - confirm against IDCT2.
 *   coeffs  - passed (via coeffPtr) as the first argument of IDCT1;
 *             presumably the input coefficients - confirm against IDCT1.
 *
 * Always returns MLIB_SUCCESS.
 *
 * NOTE(review): the "_Q1" suffix presumably means that only the first
 * quarter of the coefficient block is expected to be nonzero - confirm
 * against the library documentation.
 */
mlib_status
__mlib_VideoIDCT8x8_S16_S16_Q1(
    mlib_s16 *block,
    const mlib_s16 *coeffs)
{
    /* Intermediate results handed from IDCT1 to IDCT2. */
    mlib_s64 workspace[64];

    const mlib_s16 *coeffPtr = coeffs;
    mlib_s16 *blockPtr = block;
    mlib_s64 *workPtr = &workspace[0];

    /*
     * The locals below are never referenced directly in this function.
     * NOTE(review): presumably IDCT1/IDCT2 are macros that use them by
     * name - confirm before renaming or removing any of them.
     */
    mlib_s64 x0;
    mlib_s64 x1;
    mlib_s64 x2;
    mlib_s64 x3;
    mlib_s64 x4;
    mlib_s64 x5;
    mlib_s64 x6;
    mlib_s64 x7;
    mlib_s64 x8;
    mlib_s32 i;
    mlib_s32 str = 8;

    /* Step 1: coefficients -> workspace. */
    IDCT1(coeffPtr, workPtr, 16384);

    /* Point back at the start of the workspace before the second step. */
    workPtr = &workspace[0];

    /* Step 2: workspace -> block. */
    IDCT2(workPtr, blockPtr, 2048);

    return MLIB_SUCCESS;
}
/*
 * __mlib_VideoIDCT8x8_U8_S16_Q1
 *
 * Inverse DCT producing 8-bit output.  IDCT1 is applied to `coeffs` with a
 * local 64-entry 64-bit workspace; the second pass is open-coded below and
 * walks the workspace one column at a time, emitting one output row of
 * eight samples per column.
 *
 *   block   - destination; eight samples are stored per iteration and the
 *             destination pointer then advances by `stride` elements.
 *   coeffs  - passed (via coeffPtr) as the first argument of IDCT1.
 *   stride  - distance, in mlib_u8 elements, between consecutive output
 *             rows.
 *
 * Always returns MLIB_SUCCESS.
 *
 * The second pass reads workspace rows 0..3 only (inPtr[8*0] .. inPtr[8*3]).
 * NOTE(review): presumably this is what the "_Q1" suffix stands for (rows
 * 4..7 assumed zero) - confirm against the library documentation.
 */
mlib_status
__mlib_VideoIDCT8x8_U8_S16_Q1(
    mlib_u8 *block,
    const mlib_s16 *coeffs,
    mlib_s32 stride)
{
    mlib_s64 workspace[64];

    const mlib_s16 *coeffPtr = coeffs;
    mlib_s64 *workPtr = workspace;
    mlib_s64 *inPtr;
    mlib_u8 *blockPtr = block;

    mlib_s64 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    mlib_s32 i;
    /*
     * Not referenced directly in this function.
     * NOTE(review): presumably used by name inside IDCT1 - confirm before
     * removing.
     */
    mlib_s32 str = 8;

    /* First pass: coefficients -> workspace. */
    IDCT1(coeffPtr, workPtr, (-128 * 8));

    /* Second pass: one workspace column in, one output row out. */
    for (i = 0, inPtr = workspace; i < 8; i++, inPtr++, blockPtr += stride) {
        /* The only four inputs this pass consumes. */
        const mlib_s64 in0 = inPtr[8 * 0];
        const mlib_s64 in1 = inPtr[8 * 1];
        const mlib_s64 in2 = inPtr[8 * 2];
        const mlib_s64 in3 = inPtr[8 * 3];

        /* Scale the odd-indexed inputs. */
        x4 = RCOS_1_16 * in1;
        x5 = RCOS_7_16 * in1;
        x6 = RCOS_3_16 * in3;
        x7 = RCOS_5_16 * in3;

        /* Scale the even-indexed inputs; x8 and x0 start out equal. */
        x8 = RTWOSQRT2 * in0;
        x0 = RTWOSQRT2 * in0;
        x2 = RCOS_6_16 * in2;
        x3 = RCOS_2_16 * in2;

        /* Combine the odd terms. */
        x1 = x4 + x6;
        x4 = ROUND(COS_4_16 * (x4 - x6));
        x6 = x5 - x7;
        x5 = ROUND(COS_4_16 * (x5 + x7));

        /* Butterflies on the even terms and on the rotated odd terms. */
        x7 = x8 + x3;
        x8 = x8 - x3;
        x3 = x0 + x2;
        x0 = x0 - x2;
        x2 = x4 + x5;
        x4 = x4 - x5;

        /* Final sums/differences, stored through SATURATE. */
        SATURATE(x7 + x1, blockPtr[0]);
        SATURATE(x3 + x2, blockPtr[1]);
        SATURATE(x0 + x4, blockPtr[2]);
        SATURATE(x8 + x6, blockPtr[3]);
        SATURATE(x8 - x6, blockPtr[4]);
        SATURATE(x0 - x4, blockPtr[5]);
        SATURATE(x3 - x2, blockPtr[6]);
        SATURATE(x7 - x1, blockPtr[7]);
    }

    return MLIB_SUCCESS;
}
/*
 * IDCT - in-place, two-pass inverse DCT over `block`.
 *
 *   block  - DCTSIZE x DCTSIZE coefficients stored row by row; overwritten
 *            with the result.
 *   k      - when equal to 1 the work is handed to IDCT1(block) and this
 *            function returns immediately; any other value runs the two
 *            passes below.
 *
 * Pass 1 transforms every column (element spacing DCTSIZE) and stores the
 * unscaled results back into `block`.  Pass 2 transforms every row and
 * stores RANGE(DESCALE(value, PASS1_BITS+3)) for each output.
 *
 * The element offsets 1..7 are hard-coded, so the code assumes DCTSIZE == 8.
 */
void IDCT(BLOCK *block, int k)
{
    int even0, even1, even2, even3;     /* even-part butterfly outputs */
    int odd4, odd5, odd6, odd7;         /* odd-part butterfly outputs */
    int sum04, diff04, sum26, rot26;    /* even-part intermediates */
    int sum17, diff17, sum35, diff35;   /* odd-part intermediates */
    int rot;                            /* shared odd-part rotation term */
    BLOCK *col;
    BLOCK *row;
    BLOCK dc;
    int n;

    if (k == 1) {
        IDCT1(block);
        return;
    }

    /* Pass 1: process columns in place. */
    col = block;
    for (n = 0; n < DCTSIZE; n++, col++) {
        /*
         * Quantization usually leaves many coefficients at zero, the AC
         * terms in particular.  When every AC term of a column is zero,
         * each output of the 1-D transform equals the DC term, so the
         * butterfly can be skipped.  With typical images and quantization
         * tables this applies to half or more of the columns.
         */
        if ((col[DCTSIZE*1] | col[DCTSIZE*2] | col[DCTSIZE*3] |
             col[DCTSIZE*4] | col[DCTSIZE*5] | col[DCTSIZE*6] |
             col[DCTSIZE*7]) == 0) {
            dc = col[DCTSIZE*0];
            col[DCTSIZE*1] = dc;
            col[DCTSIZE*2] = dc;
            col[DCTSIZE*3] = dc;
            col[DCTSIZE*4] = dc;
            col[DCTSIZE*5] = dc;
            col[DCTSIZE*6] = dc;
            col[DCTSIZE*7] = dc;
            continue;
        }

        /* Even part: inputs 0, 2, 4, 6. */
        sum04  = col[DCTSIZE*0] + col[DCTSIZE*4];
        diff04 = col[DCTSIZE*0] - col[DCTSIZE*4];
        sum26  = col[DCTSIZE*2] + col[DCTSIZE*6];
        rot26  = MULTIPLY(col[DCTSIZE*2] - col[DCTSIZE*6], FIX_1_414213562)
                 - sum26;

        even0 = sum04 + sum26;
        even3 = sum04 - sum26;
        even1 = diff04 + rot26;
        even2 = diff04 - rot26;

        /* Odd part: inputs 1, 3, 5, 7. */
        sum35  = col[DCTSIZE*3] + col[DCTSIZE*5];
        diff35 = col[DCTSIZE*3] - col[DCTSIZE*5];
        sum17  = col[DCTSIZE*1] + col[DCTSIZE*7];
        diff17 = col[DCTSIZE*1] - col[DCTSIZE*7];

        rot  = MULTIPLY(diff17 - diff35, FIX_1_847759065);
        odd7 = sum17 + sum35;
        odd6 = MULTIPLY(diff35, FIX_2_613125930) + rot - odd7;
        odd5 = MULTIPLY(sum17 - sum35, FIX_1_414213562) - odd6;
        odd4 = MULTIPLY(diff17, FIX_1_082392200) - rot + odd5;

        /* All inputs have been read; overwrite the column. */
        col[DCTSIZE*0] = (even0 + odd7);
        col[DCTSIZE*1] = (even1 + odd6);
        col[DCTSIZE*2] = (even2 + odd5);
        col[DCTSIZE*3] = (even3 - odd4);
        col[DCTSIZE*4] = (even3 + odd4);
        col[DCTSIZE*5] = (even2 - odd5);
        col[DCTSIZE*6] = (even1 - odd6);
        col[DCTSIZE*7] = (even0 - odd7);
    }

    /*
     * Pass 2: process rows in place.  Each result is descaled by
     * PASS1_BITS+3 bits (the factor of 8 == 2**3 plus the PASS1_BITS
     * scaling) and range-limited through RANGE.
     */
    row = block;
    for (n = 0; n < DCTSIZE; n++, row += DCTSIZE) {
        /*
         * The all-zero-AC shortcut works for rows as well, but pass 1 has
         * produced many nonzero AC terms, so it fires less often
         * (typically 5% to 10% of the time).  On machines with very fast
         * multiplication the test may cost more than it saves; define
         * NO_ZERO_ROW_TEST to compile it out.
         */
#ifndef NO_ZERO_ROW_TEST
        if ((row[1] | row[2] | row[3] | row[4] |
             row[5] | row[6] | row[7]) == 0) {
            dc = RANGE(DESCALE(row[0], PASS1_BITS+3));
            row[0] = dc;
            row[1] = dc;
            row[2] = dc;
            row[3] = dc;
            row[4] = dc;
            row[5] = dc;
            row[6] = dc;
            row[7] = dc;
            continue;
        }
#endif

        /* Even part: inputs 0, 2, 4, 6. */
        sum04  = row[0] + row[4];
        diff04 = row[0] - row[4];
        sum26  = row[2] + row[6];
        rot26  = MULTIPLY(row[2] - row[6], FIX_1_414213562) - sum26;

        even0 = sum04 + sum26;
        even3 = sum04 - sum26;
        even1 = diff04 + rot26;
        even2 = diff04 - rot26;

        /* Odd part: inputs 1, 3, 5, 7. */
        sum35  = row[3] + row[5];
        diff35 = row[3] - row[5];
        sum17  = row[1] + row[7];
        diff17 = row[1] - row[7];

        rot  = MULTIPLY(diff17 - diff35, FIX_1_847759065);
        odd7 = sum17 + sum35;
        odd6 = MULTIPLY(diff35, FIX_2_613125930) + rot - odd7;
        odd5 = MULTIPLY(sum17 - sum35, FIX_1_414213562) - odd6;
        odd4 = MULTIPLY(diff17, FIX_1_082392200) - rot + odd5;

        /* Final output stage: descale and range-limit. */
        row[0] = RANGE(DESCALE(even0 + odd7, PASS1_BITS+3));
        row[1] = RANGE(DESCALE(even1 + odd6, PASS1_BITS+3));
        row[2] = RANGE(DESCALE(even2 + odd5, PASS1_BITS+3));
        row[3] = RANGE(DESCALE(even3 - odd4, PASS1_BITS+3));
        row[4] = RANGE(DESCALE(even3 + odd4, PASS1_BITS+3));
        row[5] = RANGE(DESCALE(even2 - odd5, PASS1_BITS+3));
        row[6] = RANGE(DESCALE(even1 - odd6, PASS1_BITS+3));
        row[7] = RANGE(DESCALE(even0 - odd7, PASS1_BITS+3));
    }
}