/*
 * Transpose an 8x8 block of bytes.  Each of the eight source rows
 * (base src, row pitch src_stride) is fetched with a 64-bit load,
 * the block is transposed in registers, and the eight result rows
 * are written out with 64-bit stores (base dst, row pitch dst_stride).
 */
static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride) {
  __m128i row[8];
  int i;

  /* Gather the eight 8-byte source rows into the low halves. */
  for (i = 0; i < 8; i++)
    row[i] = _mm_loadl_epi64((const __m128i *)(src + i * src_stride));

  /* In-register transpose; the macro supports in-place operands
   * (the original code passed the same registers for in and out). */
  TRANSPOSE_8X8(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7],
                row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]);

  /* Scatter the transposed rows to the destination. */
  for (i = 0; i < 8; i++)
    _mm_storel_epi64((__m128i *)(dst + i * dst_stride), row[i]);
}
/*
 * Forward 8x8 DCT + quantization, u8 source -> s16 coefficients.
 * "_NA" (non-aligned) variant: accepts a coeffs/block/stride combination
 * that is not 8-byte aligned and handles the output with VIS shifted and
 * partial stores; delegates to the aligned routine when everything is
 * already 8-byte aligned.
 *
 * coeffs  - output array of 64 quantized DCT coefficients
 * block   - input 8x8 block of unsigned bytes
 * qtable  - per-coefficient quantization factors (VIS double layout)
 * stride  - row pitch of the input block, in bytes
 *
 * Returns MLIB_FAILURE for NULL pointers or stride <= 0, MLIB_SUCCESS
 * otherwise.  The DCT arithmetic itself lives in project macros
 * (LOADCONSTS5, PREPARE_DATA_*, COMPUTING_DATA, ENDSCALE, Quant_ST_NA);
 * this function sequences them and performs the misaligned store.
 */
mlib_status
__mlib_VideoDCT8x8Quantize_S16_U8_NA(
	mlib_s16 coeffs[64],
	const mlib_u8 *block,
	const mlib_d64 qtable[64],
	mlib_s32 stride)
{
	/* byte source pointer consumed by the LOAD_DATA_GE_INTRA macro */
	mlib_u8 *sp = (void *)block;
	/* d*, t*, r*: register workspace threaded through the DCT macros */
	mlib_d64 d00, d10, d20, d30, d40, d50, d60, d70;
	mlib_d64 d01, d11, d21, d31, d41, d51, d61, d71;
	mlib_d64 t00, t10, t20, t30, t40, t50, t60, t70, t80, t90;
	mlib_d64 t01, t11, t21, t31, t41, t51, t61, t71, t81, t91;
	mlib_d64 r00, r10, r20, r30, r40, r50, r60, r70;
	mlib_d64 r01, r11, r21, r31, r41, r51, r61, r71;
	mlib_d64 *dp;		/* coeffs rounded down to 8-byte alignment */
	/* 0x4000 replicated; presumably the rounding constant used by the
	 * quantization macros -- TODO confirm against Quant_ST_NA */
	mlib_d64 w_const = vis_to_double_dup(0x4000);
	mlib_s32 mask;		/* byte mask for the edge partial stores */
	/* cosine/scale constants filled in by LOADCONSTS5 */
	mlib_f32 FCOS, FONE, c17, c26, c35;

	/* The NA entry point is the one that validates its arguments. */
	if (block == NULL || coeffs == NULL || stride <= 0)
		return (MLIB_FAILURE);

	/* Fully 8-byte aligned?  Take the faster aligned path instead. */
	if (!(((mlib_addr)block | (mlib_addr)coeffs | stride) & 7)) {
		return (__mlib_VideoDCT8x8Quantize_S16_U8(coeffs, block,
		    qtable, stride));
	}

	/* Program the GSR scale field for the VIS fixed-point ops below. */
	vis_write_gsr(1 << 3);

/*
 * first stage
 */
	LOADCONSTS5;
	LOAD_DATA_GE_INTRA;

	TRANSPOSE_8X8(d00, d10, d20, d30, d40, d50, d60, d70,
	    d01, d11, d21, d31, d41, d51, d61, d71);

	PREPARE_DATA_INTRA(hi, 0);
	COMPUTING_DATA(0);

	PREPARE_DATA_INTRA(lo, 1);
	TRANSPOSE(d40, d50, d60, d70, r01, r11, r21, r31);
	TRANSPOSE(d00, d10, d20, d30, r00, r10, r20, r30);
	COMPUTING_DATA(1);

	TRANSPOSE(d01, d11, d21, d31, r40, r50, r60, r70);

/*
 * second stage
 */
	PREPARE_DATA_INTER(0);
	COMPUTING_DATA(0);
	ENDSCALE(0);

	TRANSPOSE(d41, d51, d61, d71, r41, r51, r61, r71);

	/*
	 * Misaligned-output setup: dp is coeffs rounded down to the
	 * previous doubleword, mask selects the bytes of the first/last
	 * doubleword that actually belong to coeffs, and alignaddrl
	 * primes the GSR align offset for the faligndata merges below.
	 */
	dp = (mlib_d64 *)vis_alignaddr(coeffs, -1);
	mask = 0xFF >> ((mlib_addr)coeffs - (mlib_addr)dp);
	vis_alignaddrl((void *)coeffs, 0);

	PREPARE_DATA_INTER(1);
	COMPUTING_DATA(1);
	ENDSCALE(1);

	/* Quantize each 4-coefficient group against its qtable entry. */
	Quant_ST_NA(d00, d00, qtable[0]);
	Quant_ST_NA(d01, d01, qtable[1]);
	Quant_ST_NA(d10, d10, qtable[2]);
	Quant_ST_NA(d11, d11, qtable[3]);
	Quant_ST_NA(d20, d20, qtable[4]);
	Quant_ST_NA(d21, d21, qtable[5]);
	Quant_ST_NA(d30, d30, qtable[6]);
	Quant_ST_NA(d31, d31, qtable[7]);
	Quant_ST_NA(d40, d40, qtable[8]);
	Quant_ST_NA(d41, d41, qtable[9]);
	Quant_ST_NA(d50, d50, qtable[10]);
	Quant_ST_NA(d51, d51, qtable[11]);
	Quant_ST_NA(d60, d60, qtable[12]);
	Quant_ST_NA(d61, d61, qtable[13]);
	Quant_ST_NA(d70, d70, qtable[14]);
	Quant_ST_NA(d71, d71, qtable[15]);

	/*
	 * Shifted stores: each doubleword written is a faligndata merge
	 * of two neighbouring results, realigning the 128-byte output
	 * across the destination's misalignment.
	 */
	dp[1] = vis_faligndata(d00, d01);
	dp[2] = vis_faligndata(d01, d10);
	dp[3] = vis_faligndata(d10, d11);
	dp[4] = vis_faligndata(d11, d20);
	dp[5] = vis_faligndata(d20, d21);
	dp[6] = vis_faligndata(d21, d30);
	dp[7] = vis_faligndata(d30, d31);
	dp[8] = vis_faligndata(d31, d40);
	dp[9] = vis_faligndata(d40, d41);
	dp[10] = vis_faligndata(d41, d50);
	dp[11] = vis_faligndata(d50, d51);
	dp[12] = vis_faligndata(d51, d60);
	dp[13] = vis_faligndata(d60, d61);
	dp[14] = vis_faligndata(d61, d70);
	dp[15] = vis_faligndata(d70, d71);

	/* Trailing partial doubleword: only the bytes outside `mask`. */
	vis_pst_8(vis_faligndata(d71, d71), dp + 16, ~mask);

	/* Leading partial doubleword, only when coeffs was misaligned. */
	if ((mlib_addr)coeffs & 7)
		vis_pst_8(vis_faligndata(d00, d00), dp, mask);

	return (MLIB_SUCCESS);
}
/*
 * Forward 8x8 DCT + quantization, u8 source -> s16 coefficients.
 * Aligned variant: coeffs, block and stride must all be 8-byte aligned
 * (the _NA wrapper checks this before dispatching here), so the 64
 * results can be written with plain doubleword stores via Quant_ST.
 *
 * coeffs  - output array of 64 quantized DCT coefficients
 * block   - input 8x8 block of unsigned bytes (8-byte aligned)
 * qtable  - per-coefficient quantization factors (VIS double layout)
 * stride  - row pitch of the input block, in bytes (multiple of 8)
 *
 * No argument validation here; callers are expected to have checked
 * (see __mlib_VideoDCT8x8Quantize_S16_U8_NA).  Always returns
 * MLIB_SUCCESS.  The arithmetic lives in the project DCT macros.
 */
mlib_status
__mlib_VideoDCT8x8Quantize_S16_U8(
	mlib_s16 coeffs[64],
	const mlib_u8 *block,
	const mlib_d64 qtable[64],
	mlib_s32 stride)
{
	/* byte source pointer consumed by the LOAD_DATA_AA_INTRA macro */
	mlib_u8 *sp = (void *)block;
	/* d*, t*, r*: register workspace threaded through the DCT macros */
	mlib_d64 d00, d10, d20, d30, d40, d50, d60, d70;
	mlib_d64 d01, d11, d21, d31, d41, d51, d61, d71;
	mlib_d64 t00, t10, t20, t30, t40, t50, t60, t70, t80, t90;
	mlib_d64 t01, t11, t21, t31, t41, t51, t61, t71, t81, t91;
	mlib_d64 r00, r10, r20, r30, r40, r50, r60, r70;
	mlib_d64 r01, r11, r21, r31, r41, r51, r61, r71;
	/* aligned store base used by the Quant_ST(index, ...) macro */
	mlib_d64 *dp = (mlib_d64 *)coeffs;
	/* cosine/scale constants filled in by LOADCONSTS5 */
	mlib_f32 FCOS, FONE, c17, c26, c35;

	/* Program the GSR scale field for the VIS fixed-point ops below. */
	vis_write_gsr(1 << 3);

/*
 * first stage
 */
	/* NOTE: these macros are invoked without trailing semicolons; the
	 * macro bodies evidently supply their own. */
	LOAD_DATA_AA_INTRA
	TRANSPOSE_8X8(
	    d00, d10, d20, d30, d40, d50, d60, d70,
	    d01, d11, d21, d31, d41, d51, d61, d71);

	LOADCONSTS5
	PREPARE_DATA_INTRA(
	    hi, 0);
	COMPUTING_DATA(0);

	PREPARE_DATA_INTRA(lo, 1);
	TRANSPOSE(d40, d50, d60, d70, r01, r11, r21, r31);
	TRANSPOSE(d00, d10, d20, d30, r00, r10, r20, r30);
	COMPUTING_DATA(1);

	TRANSPOSE(d01, d11, d21, d31, r40, r50, r60, r70);

/*
 * second stage
 */
	PREPARE_DATA_INTER(0);
	COMPUTING_DATA(0);

	TRANSPOSE(d41, d51, d61, d71, r41, r51, r61, r71);
	ENDSCALE(0);

	/* Quantize and store the even doublewords of the output. */
	Quant_ST(0, d00, qtable[0]);
	Quant_ST(2, d10, qtable[2]);
	Quant_ST(4, d20, qtable[4]);
	Quant_ST(6, d30, qtable[6]);
	Quant_ST(8, d40, qtable[8]);
	Quant_ST(10, d50, qtable[10]);
	Quant_ST(12, d60, qtable[12]);
	Quant_ST(14, d70, qtable[14]);

	PREPARE_DATA_INTER(1);
	COMPUTING_DATA(1);
	ENDSCALE(1);

	/* Quantize and store the odd doublewords of the output. */
	Quant_ST(1, d01, qtable[1]);
	Quant_ST(3, d11, qtable[3]);
	Quant_ST(5, d21, qtable[5]);
	Quant_ST(7, d31, qtable[7]);
	Quant_ST(9, d41, qtable[9]);
	Quant_ST(11, d51, qtable[11]);
	Quant_ST(13, d61, qtable[13]);
	Quant_ST(15, d71, qtable[15]);

	return (MLIB_SUCCESS);
}