mlib_status
__mlib_VideoAddBlock_U8_S16(
	mlib_u8 *curr_block,
	const mlib_s16 *mc_block,
	mlib_s32 stride)
{
	mlib_s32 row;
	mlib_d64 *dst, *src;
	mlib_d64 mhi, mlo, chi, clo, cur;
	mlib_f32 zeros = vis_fzeros();
	/* 0x100 bit pattern used as the fmul8x16al scale factor */
	mlib_f32 mult = vis_to_float(0x100);

	/* GSR scale = 7: vis_fpack16 saturates the 16-bit sums to u8 */
	vis_write_gsr(7 << 3);

	dst = (mlib_d64 *)curr_block;
	src = (mlib_d64 *)mc_block;

	/* 8 rows of 8 pixels: add the s16 motion-comp. block to the u8 block */
#pragma pipeloop(0)
	for (row = 0; row < 8; row++) {
		cur = *dst;
		mhi = (*src++);
		mlo = (*src++);
		/* widen the 8 current u8 pixels into two 4x16 vectors */
		chi = vis_fpmerge(zeros, vis_read_hi(cur));
		clo = vis_fmul8x16al(vis_read_lo(cur), mult);
		mhi = vis_fpadd16(mhi, chi);
		mlo = vis_fpadd16(mlo, clo);
		/* pack back with saturation, then advance one output row */
		*dst = vis_fpack16_pair(mhi, mlo);
		dst = (mlib_d64 *)((mlib_u8 *)dst + stride);
	}
	return (MLIB_SUCCESS);
}
mlib_status
__mlib_VideoDownSample422(
	mlib_u8 *dst,
	const mlib_u8 *src,
	mlib_s32 n)
{
	mlib_d64 *src64 = (mlib_d64 *)src;
	mlib_d64 *dst64 = (mlib_d64 *)dst;
	mlib_d64 ld;
	mlib_d64 shuf, pk0, pk1;
	mlib_d64 sum_hi, sum_lo;
	mlib_d64 rnd = vis_to_double_dup(0x1);
	mlib_f32 fone = vis_to_float(0x1000000);
	mlib_s32 i, emask;

	if (n <= 0)
		return (MLIB_FAILURE);

	/* GSR scale = 6: fpack16 halves the (a + b + 1) sums with rounding */
	vis_write_gsr(6 << 3);
	/* gather even source bytes into the high half, odd into the low half */
	vis_write_bmask(0x02461357, 0);

	/* main loop: 16 source pixels -> 8 averaged output pixels */
#pragma pipeloop(0)
	for (i = 0; i <= n - 16; i += 16) {
		ld = (*src64++);
		shuf = vis_bshuffle(ld, ld);
		sum_hi = vis_fmul8x16au(vis_read_hi(shuf), fone);
		sum_lo = vis_fmul8x16au(vis_read_lo(shuf), fone);
		sum_hi = vis_fpadd16(sum_hi, sum_lo);
		pk0 = vis_fpadd16(sum_hi, rnd);

		ld = (*src64++);
		shuf = vis_bshuffle(ld, ld);
		sum_hi = vis_fmul8x16au(vis_read_hi(shuf), fone);
		sum_lo = vis_fmul8x16au(vis_read_lo(shuf), fone);
		sum_hi = vis_fpadd16(sum_hi, sum_lo);
		pk1 = vis_fpadd16(sum_hi, rnd);

		(*dst64++) = vis_fpack16_pair(pk0, pk1);
	}

	if (i < n) {
		/* tail: same pipeline, partial store under an edge mask */
		ld = (*src64++);
		shuf = vis_bshuffle(ld, ld);
		sum_hi = vis_fmul8x16au(vis_read_hi(shuf), fone);
		sum_lo = vis_fmul8x16au(vis_read_lo(shuf), fone);
		sum_hi = vis_fpadd16(sum_hi, sum_lo);
		pk0 = vis_fpadd16(sum_hi, rnd);

		/* non-faulting load: may touch past the caller's buffer */
		ld = vis_ld_d64_nf(src64);
		shuf = vis_bshuffle(ld, ld);
		sum_hi = vis_fmul8x16au(vis_read_hi(shuf), fone);
		sum_lo = vis_fmul8x16au(vis_read_lo(shuf), fone);
		sum_hi = vis_fpadd16(sum_hi, sum_lo);
		pk1 = vis_fpadd16(sum_hi, rnd);

		emask = vis_edge8(dst64, (dst + (n / 2) - 1));
		vis_pst_8(vis_fpack16_pair(pk0, pk1), dst64, emask);
	}
	return (MLIB_SUCCESS);
}
/*
 * JFIF YCbCr (4:4:4) to interleaved RGB conversion.
 * Processes the input in chunks of at most 2 * BUFF_SIZE pixels;
 * 4 pixels (12 output bytes) are produced per inner iteration.
 */
mlib_status
__mlib_VideoColorJFIFYCC2RGB444(
	mlib_u8 *rgb,
	const mlib_u8 *y,
	const mlib_u8 *cb,
	const mlib_u8 *cr,
	mlib_s32 size)
{
	mlib_u8 *dend;
	mlib_f32 *sf0, *sf1, *sf2, *pfd;
	mlib_f32 fzero = vis_fzeros();
	mlib_s32 i, n, m, emask;
	/* staging area for the partially-masked last stores */
	mlib_d64 tmp_arr64[2];
	/* fixed-point conversion coefficients, replicated per 16-bit lane */
	mlib_d64 k01 = vis_to_double_dup(0x0000f4fd);
	mlib_d64 k02 = vis_to_double_dup(0x2cdde926);
	mlib_d64 k11 = vis_to_double_dup(0xf4fd38b4);
	mlib_d64 k12 = vis_to_double_dup(0xe9260000);
	mlib_d64 k21 = vis_to_double_dup(0x38b40000);
	mlib_d64 k22 = vis_to_double_dup(0x00002cdd);
	/* per-channel additive offsets */
	mlib_d64 c_0 = vis_to_double_dup(0xe9a110ff);
	mlib_d64 c_1 = vis_to_double_dup(0x10ffe3b6);
	mlib_d64 c_2 = vis_to_double_dup(0xe3b6e9a1);
	/* coefficient applied to the luma samples */
	mlib_d64 k_0 = vis_to_double_dup(0x20002000);

	if (size <= 0)
		return (MLIB_FAILURE);

	vis_write_gsr((2 << 3) + 2);
	/* byte-shuffle pattern used to interleave the packed channels */
	vis_write_bmask(0x0489AB37, 0);

	do {
/* loop on buffer size */

		if (size > 2 * BUFF_SIZE) {
			n = 2 * BUFF_SIZE;
		} else {
			n = size;
		}

		m = (n - 1) >> 2;
		sf0 = (mlib_f32 *)y;
		sf1 = (mlib_f32 *)cb;
		sf2 = (mlib_f32 *)cr;
		dend = rgb + 3 * n - 1;
		pfd = (mlib_f32 *)rgb;

#pragma pipeloop(0)
#pragma unroll(4)
		for (i = 0; i < m; i++) {
			mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21,
			    s22, s_0;
			mlib_d64 d_0235, d_0145;
			mlib_f32 x0, x1, x2;

			x0 = (*sf0++);
			x1 = (*sf1++);
			x2 = (*sf2++);

			/* matrix products per output channel */
			s_0 = vis_fmul8x16(x0, k_0);
			s01 = vis_fmul8x16(x1, k01);
			s11 = vis_fmul8x16(x1, k11);
			s21 = vis_fmul8x16(x1, k21);
			s02 = vis_fmul8x16(x2, k02);
			s12 = vis_fmul8x16(x2, k12);
			s22 = vis_fmul8x16(x2, k22);
			s00 = vis_fpadd16(s_0, s01);
			s10 = vis_fpadd16(s_0, s11);
			s20 = vis_fpadd16(s_0, s21);
			s02 = vis_fpadd16(s02, c_0);
			s12 = vis_fpadd16(s12, c_1);
			s22 = vis_fpadd16(s22, c_2);
			s00 = vis_fpadd16(s00, s02);
			s10 = vis_fpadd16(s10, s12);
			s20 = vis_fpadd16(s20, s22);

			/* pack and permute into interleaved RGB byte order */
			d_0235 = vis_fpack16_pair(s00, s10);
			s20 = vis_freg_pair(vis_fpack16(s20), fzero);
			d_0145 = vis_bshuffle(d_0235, s20);
			d_0235 = vis_fpack32(d_0235, d_0235);
			d_0235 =
			    vis_fpmerge(vis_read_hi(d_0235),
			    vis_read_lo(d_0235));

			/* 12 output bytes per iteration */
			pfd[0] = vis_read_hi(d_0145);
			pfd[1] = vis_read_hi(d_0235);
			pfd[2] = vis_read_lo(d_0145);
			pfd += 3;
		}

/*
 * last pixels
 */

		if ((mlib_u8 *)pfd <= dend) {
			mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21,
			    s22, s_0;
			mlib_d64 d_0235, d_xx14, d_0145;
			mlib_f32 x0, x1, x2;
			mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64;

			x0 = *sf0;
			x1 = *sf1;
			x2 = *sf2;

			/* same arithmetic as the main loop */
			s_0 = vis_fmul8x16(x0, k_0);
			s01 = vis_fmul8x16(x1, k01);
			s11 = vis_fmul8x16(x1, k11);
			s21 = vis_fmul8x16(x1, k21);
			s02 = vis_fmul8x16(x2, k02);
			s12 = vis_fmul8x16(x2, k12);
			s22 = vis_fmul8x16(x2, k22);
			s00 = vis_fpadd16(s_0, s01);
			s10 = vis_fpadd16(s_0, s11);
			s20 = vis_fpadd16(s_0, s21);
			s02 = vis_fpadd16(s02, c_0);
			s12 = vis_fpadd16(s12, c_1);
			s22 = vis_fpadd16(s22, c_2);
			s00 = vis_fpadd16(s00, s02);
			s10 = vis_fpadd16(s10, s12);
			s20 = vis_fpadd16(s20, s22);
			d_0235 = vis_fpack16_pair(s00, s10);
			d_xx14 = vis_freg_pair(vis_fpack16(s20), fzero);
			d_0145 = vis_bshuffle(d_0235, d_xx14);
			d_0235 = vis_fpack32(d_0235, d_0235);
			d_0235 =
			    vis_fpmerge(vis_read_hi(d_0235),
			    vis_read_lo(d_0235));

			emask = vis_edge8(pfd, dend);

			/* align the staging copy with the 8-byte store slot */
			if ((mlib_addr)pfd & 7) {
				pfd--;
				tmp_arr32++;
			}

			tmp_arr32[0] = vis_read_hi(d_0145);
			tmp_arr32[1] = vis_read_hi(d_0235);
			tmp_arr32[2] = vis_read_lo(d_0145);

			/* masked stores so we never write past dend */
			vis_pst_8(tmp_arr64[0], pfd, emask);

			pfd += 2;
			emask = vis_edge8(pfd, dend);

			if ((mlib_u8 *)pfd <= dend)
				vis_pst_8(tmp_arr64[1], pfd, emask);
		}

		y += n;
		cb += n;
		cr += n;
		rgb += 3 * n;
		size -= n;

	} while (size);

	return (MLIB_SUCCESS);
}
static mlib_status mlib_v_VideoColorYUV2RGB411_nonalign( mlib_u8 *rgb, const mlib_u8 *y, const mlib_u8 *u, const mlib_u8 *v, mlib_s32 width, mlib_s32 height, mlib_s32 rgb_stride, mlib_s32 y_stride, mlib_s32 uv_stride) { /* pointers to src address */ mlib_u8 *sp1, *sp2, *sp3, *sl1, *sl2, *sl3; /* pointers to dst address */ mlib_u8 *dp, *dl; /* all. pointer to y */ mlib_d64 *spy; /* all. pointers to u, v */ mlib_d64 *dfu, *dfv; /* u, v data */ mlib_f32 fu, fv; /* y data */ mlib_d64 dy0, dy1, dy2, dy3; mlib_d64 ddy1, ddy2, ddy3, ddy4; mlib_d64 du0, du1, fu0, fu1; mlib_d64 dv1, dv2, fv0, fv1; mlib_d64 dr, dr1, dr2, dr3, dr4; mlib_d64 dg, dg1, dg2, dg3, dg4; mlib_d64 db, db1, db2, db3, db4; mlib_d64 dtmp; /* 1.1644 * 4096 */ mlib_f32 f0 = vis_to_float(0x12a1); /* 2.0184 * 8192 */ mlib_f32 f1 = vis_to_float(0x4097); /* -0.3920 * 8192 */ mlib_f32 f4 = vis_to_float(0xf375); /* -0.8132 * 8192 */ mlib_f32 f5 = vis_to_float(0xe5fa); /* 1.5966 * 8192 */ mlib_f32 f8 = vis_to_float(0x3317); /* -276.9856 * 32 */ mlib_d64 doff0 = vis_to_double_dup(0xdd60dd60); /* 135.6352 * 32 */ mlib_d64 doff1 = vis_to_double_dup(0x10f410f4); /* -222.9952 * 32 */ mlib_d64 doff2 = vis_to_double_dup(0xe420e420); mlib_f32 fscale = vis_to_float(0x80808080); /* loop variable */ mlib_s32 i, j; mlib_d64 *buf, BUFF[16 * 1024]; mlib_d64 *ddp, dd01, dd11, dd21, dd02, dd12, dd22; mlib_u8 *tmp; if (width * 3 > 16 * 1024) { tmp = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7); buf = (mlib_d64 *)((mlib_addr)(tmp + 7) & ~7); } else { buf = (mlib_d64 *)BUFF; } /* * initialize GSR scale factor */ vis_write_gsr(3 << 3); sp1 = sl1 = (mlib_u8 *)y; sp2 = sl2 = (mlib_u8 *)u; sp3 = sl3 = (mlib_u8 *)v; dp = (mlib_u8 *)buf; dl = rgb; ddp = (mlib_d64 *)dp; /* * row loop */ for (j = 0; j < height; j++) { spy = (mlib_d64 *)vis_alignaddr(sp1, 0); dfu = (mlib_d64 *)vis_alignaddr(sp2, 0); fu0 = (*dfu++); fu1 = vis_ld_d64_nf(dfu); dfu++; fu = vis_read_hi(vis_faligndata(fu0, fu1)); sp2 += 4; dfv = (mlib_d64 *)vis_alignaddr(sp3, 
0); fv0 = (*dfv++); fv1 = vis_ld_d64_nf(dfv); dfv++; fv = vis_read_hi(vis_faligndata(fv0, fv1)); sp3 += 4; dy0 = (*spy++); dy3 = vis_ld_d64_nf(spy); spy++; vis_alignaddr(sp1, 0); dy1 = vis_faligndata(dy0, dy3); dy0 = vis_ld_d64_nf(spy); spy++; dy2 = vis_faligndata(dy3, dy0); du0 = vis_fmul8x16al(fu, f1); db = vis_fpadd16(du0, doff0); du1 = vis_fmul8x16al(fu, f4); dv1 = vis_fmul8x16al(fv, f5); dtmp = vis_fpadd16(du1, dv1); dg = vis_fpadd16(dtmp, doff1); dv2 = vis_fmul8x16al(fv, f8); dr = vis_fpadd16(dv2, doff2); ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0); ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0); ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0); ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0); db1 = vis_fmul8x16au(fscale, vis_read_hi(db)); db1 = vis_fpadd16(ddy1, db1); db2 = vis_fmul8x16al(fscale, vis_read_hi(db)); db2 = vis_fpadd16(ddy2, db2); db3 = vis_fmul8x16au(fscale, vis_read_lo(db)); db3 = vis_fpadd16(ddy3, db3); db4 = vis_fmul8x16al(fscale, vis_read_lo(db)); db4 = vis_fpadd16(ddy4, db4); dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg)); dg1 = vis_fpadd16(ddy1, dg1); dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg)); dg2 = vis_fpadd16(ddy2, dg2); dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg)); dg3 = vis_fpadd16(ddy3, dg3); dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg)); dg4 = vis_fpadd16(ddy4, dg4); dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr)); dr1 = vis_fpadd16(ddy1, dr1); dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr)); dr2 = vis_fpadd16(ddy2, dr2); dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr)); dr3 = vis_fpadd16(ddy3, dr3); dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr)); dr4 = vis_fpadd16(ddy4, dr4); db = vis_fpack16_pair(db1, db2); db1 = vis_fpack16_pair(db3, db4); dr = vis_fpack16_pair(dr1, dr2); dr1 = vis_fpack16_pair(dr3, dr4); dg = vis_fpack16_pair(dg1, dg2); dg1 = vis_fpack16_pair(dg3, dg4); dfu = (mlib_d64 *)vis_alignaddr(sp2, 0); fu0 = vis_ld_d64_nf(dfu); dfu++; fu1 = vis_ld_d64_nf(dfu); dfu++; fu = vis_read_hi(vis_faligndata(fu0, fu1)); sp2 += 4; dfv = (mlib_d64 
*)vis_alignaddr(sp3, 0); fv0 = vis_ld_d64_nf(dfv); dfv++; fv1 = vis_ld_d64_nf(dfv); dfv++; fv = vis_read_hi(vis_faligndata(fv0, fv1)); sp3 += 4; /* * 16-pixel column loop */ #pragma pipeloop(0) for (i = 0; i <= width - 16; i += 16) { vis_write_bmask(0x0801902A, 0); dd01 = vis_bshuffle(dr, dg); dd02 = vis_bshuffle(dr1, dg1); vis_write_bmask(0x03B04C05, 0); dd11 = vis_bshuffle(dr, dg); dd12 = vis_bshuffle(dr1, dg1); vis_write_bmask(0xD06E07F0, 0); dd21 = vis_bshuffle(dr, dg); dd22 = vis_bshuffle(dr1, dg1); vis_write_bmask(0x01834967, 0); ddp[0] = vis_bshuffle(dd01, db); ddp[3] = vis_bshuffle(dd02, db1); vis_write_bmask(0xA12B45C7, 0); ddp[1] = vis_bshuffle(dd11, db); ddp[4] = vis_bshuffle(dd12, db1); vis_write_bmask(0x0D23E56F, 0); ddp[2] = vis_bshuffle(dd21, db); ddp[5] = vis_bshuffle(dd22, db1); dy3 = vis_ld_d64_nf(spy); spy++; vis_alignaddr(sp1, 0); dy1 = vis_faligndata(dy0, dy3); dy0 = vis_ld_d64_nf(spy); spy++; dy2 = vis_faligndata(dy3, dy0); du0 = vis_fmul8x16al(fu, f1); db = vis_fpadd16(du0, doff0); du1 = vis_fmul8x16al(fu, f4); dv1 = vis_fmul8x16al(fv, f5); dtmp = vis_fpadd16(du1, dv1); dg = vis_fpadd16(dtmp, doff1); dv2 = vis_fmul8x16al(fv, f8); dr = vis_fpadd16(dv2, doff2); ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0); ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0); ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0); ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0); db1 = vis_fmul8x16au(fscale, vis_read_hi(db)); db1 = vis_fpadd16(ddy1, db1); db2 = vis_fmul8x16al(fscale, vis_read_hi(db)); db2 = vis_fpadd16(ddy2, db2); db3 = vis_fmul8x16au(fscale, vis_read_lo(db)); db3 = vis_fpadd16(ddy3, db3); db4 = vis_fmul8x16al(fscale, vis_read_lo(db)); db4 = vis_fpadd16(ddy4, db4); dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg)); dg1 = vis_fpadd16(ddy1, dg1); dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg)); dg2 = vis_fpadd16(ddy2, dg2); dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg)); dg3 = vis_fpadd16(ddy3, dg3); dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg)); dg4 = vis_fpadd16(ddy4, dg4); 
dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr)); dr1 = vis_fpadd16(ddy1, dr1); dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr)); dr2 = vis_fpadd16(ddy2, dr2); dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr)); dr3 = vis_fpadd16(ddy3, dr3); dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr)); dr4 = vis_fpadd16(ddy4, dr4); db = vis_fpack16_pair(db1, db2); db1 = vis_fpack16_pair(db3, db4); dr = vis_fpack16_pair(dr1, dr2); dr1 = vis_fpack16_pair(dr3, dr4); dg = vis_fpack16_pair(dg1, dg2); dg1 = vis_fpack16_pair(dg3, dg4); dfu = (mlib_d64 *)vis_alignaddr(sp2, 0); fu0 = vis_ld_d64_nf(dfu); dfu++; fu1 = vis_ld_d64_nf(dfu); dfu++; fu = vis_read_hi(vis_faligndata(fu0, fu1)); sp2 += 4; dfv = (mlib_d64 *)vis_alignaddr(sp3, 0); fv0 = vis_ld_d64_nf(dfv); dfv++; fv1 = vis_ld_d64_nf(dfv); dfv++; fv = vis_read_hi(vis_faligndata(fv0, fv1)); sp3 += 4; ddp += 6; } if (i <= width - 8) { vis_write_bmask(0x0801902A, 0); dd01 = vis_bshuffle(dr, dg); vis_write_bmask(0x03B04C05, 0); dd11 = vis_bshuffle(dr, dg); vis_write_bmask(0xD06E07F0, 0); dd21 = vis_bshuffle(dr, dg); vis_write_bmask(0x01834967, 0); ddp[0] = vis_bshuffle(dd01, db); vis_write_bmask(0xA12B45C7, 0); ddp[1] = vis_bshuffle(dd11, db); vis_write_bmask(0x0D23E56F, 0); ddp[2] = vis_bshuffle(dd21, db); db = db1; dr = dr1; dg = dg1; ddp += 3; i += 8; } dp = (mlib_u8 *)ddp; vis_alignaddr((void *)(width - i), 0); db = vis_faligndata(db, db); dg = vis_faligndata(dg, dg); dr = vis_faligndata(dr, dr); dp += ((width - i - 1) * 3); vis_alignaddr((void *)7, 0); for (; i < width; i++) { STORE_PIXEL(0, 1, 2); dp -= 3; } sp1 = sl1 = sl1 + y_stride; sp2 = sl2 = sl2 + uv_stride; sp3 = sl3 = sl3 + uv_stride; __mlib_VectorCopy_U8(dl, (mlib_u8 *)buf, width * 3); dl = dp = dl + rgb_stride; dp = (mlib_u8 *)buf; ddp = (mlib_d64 *)dp; } if (width * 3 > 16 * 1024) __mlib_free(tmp); return (MLIB_SUCCESS); }
/*
 * 3x3 convolution for 4-channel u8 images, no-write-to-edge variant.
 * Rows are staged through intermediate buffers (set up by the
 * PREPARE_* / LOAD_LINE_* macros, defined elsewhere) and the result is
 * stored through channel mask `cmask`.
 * NOTE(review): several locals (da, dend, dbuf, s1..s3) are presumably
 * bound by GET_SRC_DST_PARAMETERS/PREPARE_INTERM_BUFFERS/LOOP_INI --
 * confirm against the macro definitions.
 */
mlib_status
mlib_v_conv3x3_8nw_4(
	mlib_image *dst,
	const mlib_image *src,
	const mlib_s32 *kernel,
	mlib_s32 scalef_expon,
	mlib_s32 cmask)
{
/* pointers to dst row */
	mlib_u8 *da, *d_a;

/* pointers to src, dst data */
	mlib_u8 *adr_dst, *adr_src, *dend;

/* pointers to src rows */
	mlib_u8 *sa, *sa1, *sa2;

/* pointers to rows in interm. src buf */
	mlib_d64 *buff_src, *sbuf1, *sbuf2, *prow;

/* pointers to rows in interm. src buf */
	mlib_d64 *sbuf3;

/* pointer to row in interm. dst buf */
	mlib_d64 *dbuf;

/* mlib_d64 pointers to rows in interm. src buf */
	mlib_d64 *s1, *s2, *s3;

/* mlib_d64 pointer to row in interm. dst buf */
	mlib_d64 *ddst;

/* data */
	mlib_d64 d1, d2, d_1, d_2, d21, d22;

/* data */
	mlib_d64 d3, d_3, d23;
	mlib_f32 k1k2, k3k4, k5k6, k7k8, k9k9;

/* src, dst and interm. buf. strides */
	mlib_s32 dlb, slb, buf_slb;
	mlib_s32 dh, dw;
	mlib_d64 out0, out1;
	mlib_d64 tmp0, tmp1, rnd;
	mlib_d64 *dsa, *dp;
	mlib_d64 sd0, sd1, sd00;
	mlib_s32 emask, cmask1;
	mlib_s32 rval, gsr_scale, i, j;

	/* GSR scale and rounding constant for the fixed-point kernel */
	gsr_scale = 31 - scalef_expon;
	vis_write_gsr((gsr_scale << 3));
	rval = mlib_round_8[gsr_scale];
	rnd = vis_freg_pair(vis_to_float(rval), vis_to_float(rval));

	/* replicate the 4-bit channel mask to cover 16 output bytes */
	cmask = ((cmask & 0xf) << 4) + (cmask & 0xf);
	cmask = (cmask << 8) + (cmask);

	GET_SRC_DST_PARAMETERS();
	LOAD_KERNEL_INTO_FLOAT();

	buf_slb = (4 * dw + 24) >> 3;
	PREPARE_INTERM_BUFFERS();

	/* shrink to the interior (edges are not written), 4 bytes/pixel */
	dw -= 2;
	dw *= 4;
	dh -= 2;

	sa = adr_src;
	sa1 = sa + slb;
	sa2 = sa1 + slb;
	d_a = adr_dst + dlb + 4;

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf2, sa);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(8);

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf3, sa1);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(8);

#pragma pipeloop(0)
	for (j = 0; j < dh; j++) {
		LOOP_INI();

		/* stage the next source row while convolving */
		PREPARE_TO_LOAD_LINE(sbuf3, sa2);
#pragma pipeloop(0)
		LOAD_LINE_INTO_BUFFER(8);

		vis_alignaddr(s1, 4);
		d1 = *s1;
		d2 = *s2;
		d3 = *s3;

		/* accumulate the 9 taps for 8 output bytes per iteration */
#pragma pipeloop(0)
		for (i = 0; i < dw; i += 8) {
			d_1 = *(s1 + 1);
			d_2 = *(s2 + 1);
			d_3 = *(s3 + 1);
			out0 = out1 = rnd;
			CONV_AU(d1, k1k2);
			CONV_AL(d2, k3k4);
			CONV_AU(d3, k7k8);
			d21 = vis_faligndata(d1, d_1);
			d22 = vis_faligndata(d2, d_2);
			d23 = vis_faligndata(d3, d_3);
			CONV_AL(d21, k1k2);
			CONV_AU(d22, k5k6);
			CONV_AL(d23, k7k8);
			CONV_AU(d_1, k3k4);
			CONV_AL(d_2, k5k6);
			CONV_AU(d_3, k9k9);
			(*ddst++) = vis_fpack16_pair(out0, out1);
			d1 = d_1;
			d2 = d_2;
			d3 = d_3;
			s1++;
			s2++;
			s3++;
		}

		ddst = dbuf;

/* prepare the destination addresses */
		dp = (mlib_d64 *)((mlib_addr)da & (~7));
		i = (mlib_addr)dp - (mlib_addr)da;
		/* i <= 0 here, so -i is a non-negative shift count */
		cmask1 = cmask >> (-i);
		ddst = vis_alignaddr(ddst, i);

/* generate edge mask for the start point */
		emask = vis_edge8(da, dend);

		sd1 = ddst[0];

		if (emask != 0xff) {
			sd0 = sd1;
			sd1 = ddst[1];
			sd0 = vis_faligndata(sd0, sd1);
			vis_pst_8(sd0, dp++, emask & cmask1);
			ddst++;
			i += 8;
		}

		/* aligned masked stores of the convolved row */
#pragma pipeloop(0)
		for (; i <= (dw - 8); i += 8) {
			sd0 = sd1;
			sd1 = ddst[1];
			sd00 = vis_faligndata(sd0, sd1);
			vis_pst_8(sd00, dp++, cmask1);
			ddst++;
		}

		if (i < dw) {
			sd0 = vis_faligndata(sd1, ddst[1]);
			emask = vis_edge8(dp, dend);
			vis_pst_8(sd0, dp, emask & cmask1);
		}

		sa2 = sa2 + slb;
		d_a += dlb;
	}

	__mlib_free(buff_src);
	return (MLIB_SUCCESS);
}
/*
 * YUV 4:1:1 + alpha array to ABGR (or ARGB when `isrgb` is set)
 * conversion; 16 pixels per iteration, alpha merged from `a_array`.
 * Two store paths: 8-byte stores when `abgr` is 8-byte aligned,
 * 4-byte stores otherwise; `left` trailing pixels are staged through
 * a local buffer.  The loops are software-pipelined: each iteration
 * stores the current group while loading/converting the next one.
 */
static void
mlib_v_VideoYUV2ABGR_aarray_411(
	mlib_u32 *abgr,
	const mlib_d64 *y,
	const mlib_f32 *u,
	const mlib_f32 *v,
	const mlib_d64 *a_array,
	mlib_s32 count,
	mlib_s32 left,
	mlib_s32 isrgb)
{
/* all. pointer to dst */
	mlib_d64 *dpp = (mlib_d64 *)abgr;

/* u, v data */
	mlib_f32 fu, fv;

/* y data */
	mlib_d64 dy1, dy2;
	mlib_d64 ddy1, ddy2, ddy3, ddy4;
	mlib_d64 du0, du1;
	mlib_d64 dv1, dv2;
	mlib_d64 dr, dr1, dr2, dr3, dr4;
	mlib_d64 dg, dg1, dg2, dg3, dg4;
	mlib_d64 db, db1, db2, db3, db4;
	mlib_d64 *dpa, da0, da1, da2, da3, da4;
	mlib_d64 dtmp;

/* 1.1644 * 4096 */
	mlib_f32 f0 = vis_to_float(0x12a1);

/* 2.0184 * 8192 */
	mlib_f32 f1 = vis_to_float(0x4097);

/* -0.3920 * 8192 */
	mlib_f32 f4 = vis_to_float(0xf375);

/* -0.8132 * 8192 */
	mlib_f32 f5 = vis_to_float(0xe5fa);

/* 1.5966 * 8192 */
	mlib_f32 f8 = vis_to_float(0x3317);

/* -276.9856 * 32 */
	mlib_d64 doff0 = vis_to_double_dup(0xdd60dd60);

/* 135.6352 * 32 */
	mlib_d64 doff1 = vis_to_double_dup(0x10f410f4);

/* -222.9952 * 32 */
	mlib_d64 doff2 = vis_to_double_dup(0xe420e420);
	mlib_f32 fscale = vis_to_float(0x80808080);

/* loop variables */
	mlib_s32 i;

	/* RGB output: swap the R/B coefficient and offset roles */
	if (isrgb) {
		f0 = vis_to_float(0x12a1);
		f1 = vis_to_float(0x3317);
		f4 = vis_to_float(0xe5fa);
		f5 = vis_to_float(0xf375);
		f8 = vis_to_float(0x4097);
		doff0 = vis_to_double_dup(0xe420e420);
		doff1 = vis_to_double_dup(0x10f410f4);
		doff2 = vis_to_double_dup(0xdd60dd60);
	}

	/* pipeline prologue: first group of y/u/v/alpha */
	dpa = vis_alignaddr((void *)a_array, 0);
	dy1 = (*y++);
	dy2 = vis_ld_d64_nf((mlib_d64 *)y);
	y++;
	fu = (*u++);
	fv = (*v++);
	da2 = (*dpa++);
	da3 = vis_ld_d64_nf(dpa);
	dpa++;
	da4 = vis_ld_d64_nf(dpa);
	dpa++;
	du0 = vis_fmul8x16al(fu, f1);
	du1 = vis_fmul8x16al(fu, f4);
	dv1 = vis_fmul8x16al(fv, f5);
	dv2 = vis_fmul8x16al(fv, f8);

	if (!((mlib_addr)abgr & 7)) {
		/* aligned destination: 8-byte stores */
#pragma pipeloop(0)
		for (i = 0; i < count; i++) {
			da0 = vis_faligndata(da2, da3);
			da1 = vis_faligndata(da3, da4);

			/* luma scaled for all 16 pixels */
			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
			ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);
			ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
			ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

			/* chroma terms plus offsets */
			db = vis_fpadd16(du0, doff0);
			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);
			dr = vis_fpadd16(dv2, doff2);

			/* spread each chroma value over its 4 luma samples */
			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);
			db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
			db2 = vis_fpadd16(ddy2, db2);
			db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
			db3 = vis_fpadd16(ddy3, db3);
			db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
			db4 = vis_fpadd16(ddy4, db4);

			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);
			dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
			dg2 = vis_fpadd16(ddy2, dg2);
			dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
			dg3 = vis_fpadd16(ddy3, dg3);
			dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
			dg4 = vis_fpadd16(ddy4, dg4);

			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);
			dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
			dr2 = vis_fpadd16(ddy2, dr2);
			dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
			dr3 = vis_fpadd16(ddy3, dr3);
			dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
			dr4 = vis_fpadd16(ddy4, dr4);

			dr = vis_fpack16_pair(dr1, dr2);
			dr1 = vis_fpack16_pair(dr3, dr4);
			dg = vis_fpack16_pair(dg1, dg2);
			dg1 = vis_fpack16_pair(dg3, dg4);
			db = vis_fpack16_pair(db1, db2);
			db1 = vis_fpack16_pair(db3, db4);

			dg2 = vis_fpmerge(vis_read_hi(da0), vis_read_hi(dg));
			dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));

			/* prefetch the next group before the stores */
			dy1 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i);
			dy2 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i + 1);
			fu = vis_ld_f32_nf((mlib_f32 *)u + i);
			fv = vis_ld_f32_nf((mlib_f32 *)v + i);
			da2 = da4;
			da3 = vis_ld_d64_nf(dpa + 2 * i);
			da4 = vis_ld_d64_nf(dpa + 2 * i + 1);

			/* interleave A/B/G/R bytes and store 16 pixels */
			dpp[8 * i] =
			    vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dpp[8 * i + 1] =
			    vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dg2 = vis_fpmerge(vis_read_lo(da0), vis_read_lo(dg));
			dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));
			dpp[8 * i + 2] =
			    vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dpp[8 * i + 3] =
			    vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dg2 = vis_fpmerge(vis_read_hi(da1), vis_read_hi(dg1));
			dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));
			dpp[8 * i + 4] =
			    vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dpp[8 * i + 5] =
			    vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dg2 = vis_fpmerge(vis_read_lo(da1), vis_read_lo(dg1));
			dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));
			dpp[8 * i + 6] =
			    vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dpp[8 * i + 7] =
			    vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

			/* chroma products for the next iteration */
			du0 = vis_fmul8x16al(fu, f1);
			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dv2 = vis_fmul8x16al(fv, f8);
		}
	} else {
		/* unaligned destination: 4-byte stores via f32 halves */
		mlib_d64 dd;

#pragma pipeloop(0)
		for (i = 0; i < count; i++) {
			da0 = vis_faligndata(da2, da3);
			da1 = vis_faligndata(da3, da4);

			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
			ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);
			ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
			ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

			db = vis_fpadd16(du0, doff0);
			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);
			dr = vis_fpadd16(dv2, doff2);

			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);
			db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
			db2 = vis_fpadd16(ddy2, db2);
			db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
			db3 = vis_fpadd16(ddy3, db3);
			db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
			db4 = vis_fpadd16(ddy4, db4);

			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);
			dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
			dg2 = vis_fpadd16(ddy2, dg2);
			dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
			dg3 = vis_fpadd16(ddy3, dg3);
			dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
			dg4 = vis_fpadd16(ddy4, dg4);

			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);
			dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
			dr2 = vis_fpadd16(ddy2, dr2);
			dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
			dr3 = vis_fpadd16(ddy3, dr3);
			dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
			dr4 = vis_fpadd16(ddy4, dr4);

			dr = vis_fpack16_pair(dr1, dr2);
			dr1 = vis_fpack16_pair(dr3, dr4);
			dg = vis_fpack16_pair(dg1, dg2);
			dg1 = vis_fpack16_pair(dg3, dg4);
			db = vis_fpack16_pair(db1, db2);
			db1 = vis_fpack16_pair(db3, db4);

			dg2 = vis_fpmerge(vis_read_hi(da0), vis_read_hi(dg));
			dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));

			dy1 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i);
			dy2 = vis_ld_d64_nf((mlib_d64 *)y + 2 * i + 1);
			fu = vis_ld_f32_nf((mlib_f32 *)u + i);
			fv = vis_ld_f32_nf((mlib_f32 *)v + i);
			da2 = da4;
			da3 = vis_ld_d64_nf(dpa + 2 * i);
			da4 = vis_ld_d64_nf(dpa + 2 * i + 1);

			dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			((mlib_f32 *)dpp)[16 * i] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 1] = vis_read_lo(dd);
			dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			((mlib_f32 *)dpp)[16 * i + 2] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 3] = vis_read_lo(dd);
			dg2 = vis_fpmerge(vis_read_lo(da0), vis_read_lo(dg));
			dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));
			dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			((mlib_f32 *)dpp)[16 * i + 4] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 5] = vis_read_lo(dd);
			dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			((mlib_f32 *)dpp)[16 * i + 6] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 7] = vis_read_lo(dd);
			dg2 = vis_fpmerge(vis_read_hi(da1), vis_read_hi(dg1));
			dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));
			dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			((mlib_f32 *)dpp)[16 * i + 8] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 9] = vis_read_lo(dd);
			dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			((mlib_f32 *)dpp)[16 * i + 10] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 11] = vis_read_lo(dd);
			dg2 = vis_fpmerge(vis_read_lo(da1), vis_read_lo(dg1));
			dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));
			dd = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			((mlib_f32 *)dpp)[16 * i + 12] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 13] = vis_read_lo(dd);
			dd = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			((mlib_f32 *)dpp)[16 * i + 14] = vis_read_hi(dd);
			((mlib_f32 *)dpp)[16 * i + 15] = vis_read_lo(dd);

			du0 = vis_fmul8x16al(fu, f1);
			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dv2 = vis_fmul8x16al(fv, f8);
		}
	}

	/* trailing pixels: convert a full group into res_buf, copy `left` */
	if (left) {
		mlib_d64 res_buf[8];

		da0 = vis_faligndata(da2, da3);
		da1 = vis_faligndata(da3, da4);

		ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
		ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);
		ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
		ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);

		db = vis_fpadd16(du0, doff0);
		dtmp = vis_fpadd16(du1, dv1);
		dg = vis_fpadd16(dtmp, doff1);
		dr = vis_fpadd16(dv2, doff2);

		db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
		db1 = vis_fpadd16(ddy1, db1);
		db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
		db2 = vis_fpadd16(ddy2, db2);
		db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
		db3 = vis_fpadd16(ddy3, db3);
		db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
		db4 = vis_fpadd16(ddy4, db4);

		dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
		dg1 = vis_fpadd16(ddy1, dg1);
		dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
		dg2 = vis_fpadd16(ddy2, dg2);
		dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
		dg3 = vis_fpadd16(ddy3, dg3);
		dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
		dg4 = vis_fpadd16(ddy4, dg4);

		dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
		dr1 = vis_fpadd16(ddy1, dr1);
		dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
		dr2 = vis_fpadd16(ddy2, dr2);
		dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
		dr3 = vis_fpadd16(ddy3, dr3);
		dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
		dr4 = vis_fpadd16(ddy4, dr4);

		dr = vis_fpack16_pair(dr1, dr2);
		dr1 = vis_fpack16_pair(dr3, dr4);
		dg = vis_fpack16_pair(dg1, dg2);
		dg1 = vis_fpack16_pair(dg3, dg4);
		db = vis_fpack16_pair(db1, db2);
		db1 = vis_fpack16_pair(db3, db4);

		dg2 = vis_fpmerge(vis_read_hi(da0), vis_read_hi(dg));
		dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));
		res_buf[0] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
		res_buf[1] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
		dg2 = vis_fpmerge(vis_read_lo(da0), vis_read_lo(dg));
		dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));
		res_buf[2] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
		res_buf[3] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
		dg2 = vis_fpmerge(vis_read_hi(da1), vis_read_hi(dg1));
		dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));
		res_buf[4] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
		res_buf[5] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
		dg2 = vis_fpmerge(vis_read_lo(da1), vis_read_lo(dg1));
		dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));
		res_buf[6] = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
		res_buf[7] = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));

		for (i = 0; i < left; i++)
			((mlib_f32 *)dpp)[16 * count + i] =
			    ((mlib_f32 *)res_buf)[i];
	}
}
/*
 * Convert an s16 vector to u8 with saturation to [0, MLIB_U8_MAX].
 * Scalar head/tail align the destination; the aligned middle is packed
 * 8 elements at a time with vis_fpack16_pair.
 * NOTE(review): PACK_S_U_DF is defined elsewhere and presumably handles
 * the short-vector case and returns -- confirm against its definition.
 */
mlib_status
__mlib_VectorConvert_U8_S16_Sat(
	mlib_u8 *z,
	const mlib_s16 *x,
	mlib_s32 n)
{
	/* cast through void * drops const for the local working pointer */
	mlib_s16 *src = (void *)x;
	mlib_u8 *dst = z;
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, d5, d6, d7;
	mlib_s32 len_64, even_length, rest_64, length = n, i;
	mlib_s16 c;

	if (n < 16) {
		PACK_S_U_DF(mlib_s16, mlib_u8, MLIB_U8_MAX, 0);
	}

/*
 * First try to align destination address for 8 bytes .
 */

	while ((mlib_addr)dst & 7) {
		(*dst++) =
		    (c =
		    (*src++)) < 0 ? 0 : (c > MLIB_U8_MAX ? MLIB_U8_MAX : c);
		length--;
	}

	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	ddst = (mlib_d64 *)dst;
	/* GSR scale = 7: fpack16 clamps s16 lanes straight to u8 */
	vis_write_gsr(7 << 3);

/*
 * Now analyze source address alignment.
 */

	if (((mlib_addr)src & 7) == 0) {

/*
 * Source address is also 8-byte aligned.
 */

		dsrc = (mlib_d64 *)src;

/*
 * Peeling the 1st iteration.
 */

		/* intentional assignment-in-condition: peel if len_64 is odd */
		if (i = (len_64 & 1)) {
			d4 = (*dsrc++);
			d5 = (*dsrc++);
			d3 = vis_fpack16_pair(d4, d5);
			(*ddst++) = d3;
		}

/*
 * Then loop with step==2. Unroll for 2 iterations.
 */

#pragma pipeloop(0)
#pragma unroll(2)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d2 = (*dsrc++);
			d5 = (*dsrc++);
			d6 = (*dsrc++);
			d3 = vis_fpack16_pair(d1, d2);
			d7 = vis_fpack16_pair(d5, d6);
			(*ddst++) = d3;
			(*ddst++) = d7;
		}
	} else {

/*
 * Source address is 2-byte aligned. Use vis_alignaddr() and
 * vis_faligndata() functions.
 */

		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		d2 = (*dsrc++);

/*
 * Peeling of 1 iteration.
 */

		if (i = (len_64 & 1)) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d3 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d4 = vis_faligndata(d1, d2);
			d3 = vis_fpack16_pair(d3, d4);
			(*ddst++) = d3;
		}

/*
 * Then loop with step==2.
 */

#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = d2;
			d2 = (*dsrc++);
			d3 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = (*dsrc++);
			d4 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = (*dsrc++);
			d5 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = (*dsrc++);
			d6 = vis_faligndata(d1, d2);
			d3 = vis_fpack16_pair(d3, d4);
			d5 = vis_fpack16_pair(d5, d6);
			(*ddst++) = d3;
			(*ddst++) = d5;
		}
	}

	/* scalar tail for the remaining 0..7 elements */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] =
		    (c =
		    src[even_length + i]) < 0 ? 0 : (c >
		    MLIB_U8_MAX ? MLIB_U8_MAX : c);

	return (MLIB_SUCCESS);
}
} else { /* if (channel == 2) */ #pragma pipeloop(0) for (i = 0; i < ww; i++) { ss = *sp; a0 = vis_freg_pair(*(mlib_f32 *)(p_tbl + ap[0]), *(mlib_f32 *)(p_tbl + vis_ld_u8_nf(ap + 2))); a1 = vis_freg_pair(*(mlib_f32 *)(p_tbl + vis_ld_u8_nf(ap + 4)), *(mlib_f32 *)(p_tbl + vis_ld_u8_nf(ap + 6))); DIV_ALPHA(d0, vis_read_hi(ss), a0); DIV_ALPHA(d1, vis_read_lo(ss), a1); *dp = vis_fpack16_pair(d0, d1); ap += 8; sp++; dp++; } } if (dflag) { MEM_COPY(buffd, dl, width * sizeof (mlib_u8)); } sl += sstride; dl += dstride; } __mlib_free(buffs);
/*
 * Convert a u8 vector to s8 with saturation to MLIB_S8_MAX.
 * Values are biased by 0x80, packed with saturation, then un-biased
 * with an XOR of 0x80 per byte.
 *
 * Fix over the original: `fzero` was declared mlib_d64 while being
 * passed to vis_fpmerge, whose operands are mlib_f32 -- this relied on
 * an implicit double-to-float conversion and disagreed with the other
 * functions in this file, which declare vis_fzeros() results as
 * mlib_f32.  It is now declared mlib_f32.
 * NOTE(review): PACK_U_S is defined elsewhere and presumably handles
 * the short-vector case and returns -- confirm against its definition.
 */
mlib_status
__mlib_VectorConvert_S8_U8_Sat(
	mlib_s8 *z,
	const mlib_u8 *x,
	mlib_s32 n)
{
	mlib_u8 *src = (void *)x;
	mlib_s8 *dst = z;
	/* fzero feeds vis_fpmerge, so it must be an mlib_f32 */
	mlib_f32 fzero = vis_fzeros();
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, d5, d6;
	mlib_s32 len_64, even_length, rest_64, length = n, i;
	mlib_u8 c;
	/* +0x80 bias per 16-bit lane before packing */
	mlib_d64 dsp = vis_to_double_dup(0x800080);
	/* XOR pattern removing the bias from each packed byte */
	mlib_d64 rst = vis_to_double_dup(0x80808080);
	mlib_f32 fm = vis_to_float(0x100);

	if (length < 16) {
		PACK_U_S(mlib_u8, mlib_s8, MLIB_S8_MAX);
	}

/*
 * First, try to align destination address for 8 bytes .
 */

	while ((mlib_addr)dst & 7) {
		(*dst++) = (c = (*src++)) > MLIB_S8_MAX ? MLIB_S8_MAX : c;
		length--;
	}

	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	ddst = (mlib_d64 *)dst;
	vis_write_gsr(7 << 3);

/*
 * Now analyze source address alignment.
 */

	if (((mlib_addr)src & 7) == 0) {

/*
 * Source address is also 8-byte aligned.
 */

		dsrc = (mlib_d64 *)src;

/*
 * Peeling the 1st iteration.
 */

		/* intentional assignment-in-condition: peel if len_64 is odd */
		if (i = (len_64 & 1)) {
			d1 = (*dsrc++);
			/* widen u8 to 16-bit lanes, bias, pack, un-bias */
			d2 = vis_fpmerge(fzero, vis_read_hi(d1));
			d3 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d1 = vis_fpack16_pair(d2, d3);
			(*ddst++) = vis_fxor(d1, rst);
		}

/*
 * Then loop with step==2. Unroll for 4 iterations.
 */

#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d4 = (*dsrc++);
			d2 = vis_fpmerge(fzero, vis_read_hi(d1));
			d3 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d1 = vis_fpack16_pair(d2, d3);
			d2 = vis_fpmerge(fzero, vis_read_hi(d4));
			d3 = vis_fmul8x16al(vis_read_lo(d4), fm);
			d2 = vis_fpadd16(dsp, d2);
			d3 = vis_fpadd16(dsp, d3);
			d4 = vis_fpack16_pair(d2, d3);
			(*ddst++) = vis_fxor(d1, rst);
			(*ddst++) = vis_fxor(d4, rst);
		}
	} else {

/*
 * Source address has arbitrary alignment. Use vis_alignaddr() and
 * vis_faligndata() functions.
 */

		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		d2 = (*dsrc++);

/*
 * Peeling of 1 iteration.
 */

		if (i = (len_64 & 1)) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d1 = vis_faligndata(d1, d2);
			d3 = vis_fmul8x16al(vis_read_hi(d1), fm);
			d4 = vis_fmul8x16al(vis_read_lo(d1), fm);
			d3 = vis_fpadd16(dsp, d3);
			d4 = vis_fpadd16(dsp, d4);
			d1 = vis_fpack16_pair(d3, d4);
			(*ddst++) = vis_fxor(d1, rst);
		}

/*
 * Then loop with step==2.
 */

#pragma pipeloop(0)
#pragma unroll(2)
		for (; i < len_64; i += 2) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d3 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d6 = vis_faligndata(d1, d2);
			d4 = vis_fmul8x16al(vis_read_hi(d3), fm);
			d5 = vis_fmul8x16al(vis_read_lo(d3), fm);
			d4 = vis_fpadd16(dsp, d4);
			d5 = vis_fpadd16(dsp, d5);
			d3 = vis_fpack16_pair(d4, d5);
			d4 = vis_fmul8x16al(vis_read_hi(d6), fm);
			d5 = vis_fmul8x16al(vis_read_lo(d6), fm);
			d4 = vis_fpadd16(dsp, d4);
			d5 = vis_fpadd16(dsp, d5);
			d6 = vis_fpack16_pair(d4, d5);
			(*ddst++) = vis_fxor(d3, rst);
			(*ddst++) = vis_fxor(d6, rst);
		}
	}

	/* scalar tail for the remaining 0..7 elements */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] =
		    (c =
		    src[even_length + i]) > MLIB_S8_MAX ? MLIB_S8_MAX : c;

	return (MLIB_SUCCESS);
}
/*
 * mlib_convMxN_8nw_mask - general m x n convolution over an 8-bit image
 * with a "no-write" edge policy (the dst pointer is advanced past dn rows
 * and dm pixels, and edges are never touched), restricted to the channels
 * selected by cmask.
 *
 *   dst, src : destination / source images (src supplies hgt x wid pixels;
 *              the routine produces (hgt - n + 1) x (wid - m + 1) results)
 *   m, n     : kernel width / height; dm, dn: kernel hot-spot offsets
 *   kern     : kernel coefficients, reinterpreted as mlib_f32 so they can
 *              feed vis_fmul8x16au directly
 *   scale    : fixed-point scale; GSR scalefactor field = (31 - scale),
 *              with matching rounding constant drnd from mlib_round_8[]
 *   cmask    : one bit per channel; zero bits are skipped
 *
 * Per selected channel: the channel is extracted row-by-row (p_proc_load,
 * chosen by nchan) into a rotating ring of (n + 1) row buffers.  Kernel
 * rows are accumulated into buffd in groups of up to 4 (jk_size), with
 * specialised inner loops for buffer offsets 0 and 4 and a generic
 * vis_faligndata loop otherwise.  Column ik_last (= m - 1) of the first
 * jk group is deliberately skipped and folded into the final iteration,
 * which also packs the 16-bit accumulators (vis_fpack16_pair), resets
 * buffd to the rounding constant, and stores the row via p_proc_store.
 * Returns MLIB_FAILURE if either temporary allocation fails, otherwise
 * MLIB_SUCCESS.
 */
mlib_status mlib_convMxN_8nw_mask(mlib_image *dst, const mlib_image *src, mlib_s32 m, mlib_s32 n, mlib_s32 dm, mlib_s32 dn, const mlib_s32 *kern, mlib_s32 scale, mlib_s32 cmask) { mlib_d64 *buffs_local[3 * (MAX_N + 1)], **buffs = buffs_local, **buff; mlib_d64 *buff0, *buff1, *buff2, *buff3, *buffn, *buffd, *buffe; mlib_d64 s00, s01, s10, s11, s20, s21, s30, s31, s0, s1, s2, s3; mlib_d64 d00, d01, d10, d11, d20, d21, d30, d31; mlib_d64 dd, d0, d1; mlib_s32 ik, jk, ik_last, jk_size, coff, off, doff; mlib_u8 *sl, *sp, *dl; mlib_s32 hgt = mlib_ImageGetHeight(src); mlib_s32 wid = mlib_ImageGetWidth(src); mlib_s32 sll = mlib_ImageGetStride(src); mlib_s32 dll = mlib_ImageGetStride(dst); mlib_u8 *adr_src = (mlib_u8 *) mlib_ImageGetData(src); mlib_u8 *adr_dst = (mlib_u8 *) mlib_ImageGetData(dst); mlib_s32 ssize, xsize, dsize, esize, buff_ind; mlib_d64 *pbuff, *dp; mlib_f32 *karr = (mlib_f32 *) kern; mlib_s32 gsr_scale = (31 - scale) << 3; mlib_d64 drnd = vis_to_double_dup(mlib_round_8[31 - scale]); mlib_s32 i, j, l, chan, testchan; mlib_s32 nchan = mlib_ImageGetChannels(dst); void (*p_proc_load) (const mlib_u8 *, mlib_u8 *, mlib_s32, mlib_s32); void (*p_proc_store) (const mlib_u8 *, mlib_u8 *, mlib_s32, mlib_s32); if (n > MAX_N) { buffs = mlib_malloc(3 * (n + 1) * sizeof(mlib_d64 *)); if (buffs == NULL) return MLIB_FAILURE; } buff = buffs + 2 * (n + 1); adr_dst += dn * dll + dm * nchan; ssize = wid; dsize = (ssize + 7) / 8; esize = dsize + 4; pbuff = mlib_malloc((n + 4) * esize * sizeof(mlib_d64)); if (pbuff == NULL) { if (buffs != buffs_local) mlib_free(buffs); return MLIB_FAILURE; } for (i = 0; i < (n + 1); i++) buffs[i] = pbuff + i * esize; for (i = 0; i < (n + 1); i++) buffs[(n + 1) + i] = buffs[i]; buffd = buffs[n] + esize; buffe = buffd + 2 * esize; hgt -= (n - 1); xsize = ssize - (m - 1); vis_write_gsr(gsr_scale + 7); if (nchan == 2) { p_proc_load = &mlib_v_ImageChannelExtract_U8_21_D1; p_proc_store = &mlib_v_ImageChannelInsert_U8_12_D1; } else if (nchan == 3) { 
p_proc_load = &mlib_v_ImageChannelExtract_U8_31_D1; p_proc_store = &mlib_v_ImageChannelInsert_U8_13_D1; } else { p_proc_load = &mlib_v_ImageChannelExtract_U8_41_D1; p_proc_store = &mlib_v_ImageChannelInsert_U8_14_D1; } testchan = 1; for (chan = 0; chan < nchan; chan++) { buff_ind = 0; sl = adr_src; dl = adr_dst; if ((cmask & testchan) == 0) { testchan <<= 1; continue; } for (l = 0; l < n; l++) { mlib_d64 *buffn = buffs[l]; sp = sl + l * sll; (*p_proc_load) ((mlib_u8 *) sp, (mlib_u8 *) buffn, ssize, testchan); } /* init buffer */ #pragma pipeloop(0) for (i = 0; i < (xsize + 7) / 8; i++) { buffd[2 * i] = drnd; buffd[2 * i + 1] = drnd; } for (j = 0; j < hgt; j++) { mlib_d64 **buffc = buffs + buff_ind; mlib_f32 *pk = karr, k0, k1, k2, k3; sp = sl + n * sll; for (l = 0; l < n; l++) { buff[l] = buffc[l]; } buffn = buffc[n]; (*p_proc_load) ((mlib_u8 *) sp, (mlib_u8 *) buffn, ssize, testchan); ik_last = (m - 1); for (jk = 0; jk < n; jk += jk_size) { jk_size = n - jk; if (jk_size >= 6) jk_size = 4; if (jk_size == 5) jk_size = 3; coff = 0; if (jk_size == 1) { for (ik = 0; ik < m; ik++, coff++) { if (!jk && ik == ik_last) continue; k0 = pk[ik]; doff = coff / 8; buff0 = buff[jk] + doff; off = coff & 7; vis_write_gsr(gsr_scale + off); s01 = buff0[0]; #pragma pipeloop(0) for (i = 0; i < (xsize + 7) / 8; i++) { s00 = s01; s01 = buff0[i + 1]; s0 = vis_faligndata(s00, s01); d00 = vis_fmul8x16au(vis_read_hi(s0), k0); d01 = vis_fmul8x16au(vis_read_lo(s0), k0); d0 = buffd[2 * i]; d1 = buffd[2 * i + 1]; d0 = vis_fpadd16(d00, d0); d1 = vis_fpadd16(d01, d1); buffd[2 * i] = d0; buffd[2 * i + 1] = d1; } } pk += m; } else if (jk_size == 2) { for (ik = 0; ik < m; ik++, coff++) { if (!jk && ik == ik_last) continue; k0 = pk[ik]; k1 = pk[ik + m]; doff = coff / 8; buff0 = buff[jk] + doff; buff1 = buff[jk + 1] + doff; off = coff & 7; vis_write_gsr(gsr_scale + off); s01 = buff0[0]; s11 = buff1[0]; #pragma pipeloop(0) for (i = 0; i < (xsize + 7) / 8; i++) { s00 = s01; s10 = s11; s01 = buff0[i + 1]; 
s11 = buff1[i + 1]; s0 = vis_faligndata(s00, s01); s1 = vis_faligndata(s10, s11); d00 = vis_fmul8x16au(vis_read_hi(s0), k0); d01 = vis_fmul8x16au(vis_read_lo(s0), k0); d10 = vis_fmul8x16au(vis_read_hi(s1), k1); d11 = vis_fmul8x16au(vis_read_lo(s1), k1); d0 = buffd[2 * i]; d1 = buffd[2 * i + 1]; d0 = vis_fpadd16(d00, d0); d0 = vis_fpadd16(d10, d0); d1 = vis_fpadd16(d01, d1); d1 = vis_fpadd16(d11, d1); buffd[2 * i] = d0; buffd[2 * i + 1] = d1; } } pk += 2 * m; } else if (jk_size == 3) { for (ik = 0; ik < m; ik++, coff++) { if (!jk && ik == ik_last) continue; k0 = pk[ik]; k1 = pk[ik + m]; k2 = pk[ik + 2 * m]; doff = coff / 8; buff0 = buff[jk] + doff; buff1 = buff[jk + 1] + doff; buff2 = buff[jk + 2] + doff; off = coff & 7; vis_write_gsr(gsr_scale + off); if (off == 0) { #pragma pipeloop(0) for (i = 0; i < (xsize + 7) / 8; i++) { d0 = buffd[2 * i]; d1 = buffd[2 * i + 1]; s0 = buff0[i]; s1 = buff1[i]; s2 = buff2[i]; d00 = vis_fmul8x16au(vis_read_hi(s0), k0); d01 = vis_fmul8x16au(vis_read_lo(s0), k0); d10 = vis_fmul8x16au(vis_read_hi(s1), k1); d11 = vis_fmul8x16au(vis_read_lo(s1), k1); d20 = vis_fmul8x16au(vis_read_hi(s2), k2); d21 = vis_fmul8x16au(vis_read_lo(s2), k2); d00 = vis_fpadd16(d00, d10); d0 = vis_fpadd16(d20, d0); d0 = vis_fpadd16(d00, d0); d01 = vis_fpadd16(d01, d11); d1 = vis_fpadd16(d21, d1); d1 = vis_fpadd16(d01, d1); buffd[2 * i] = d0; buffd[2 * i + 1] = d1; } } else if (off == 4) { s01 = buff0[0]; s11 = buff1[0]; s21 = buff2[0]; #pragma pipeloop(0) for (i = 0; i < (xsize + 7) / 8; i++) { d0 = buffd[2 * i]; d1 = buffd[2 * i + 1]; s00 = s01; s10 = s11; s20 = s21; s01 = buff0[i + 1]; s11 = buff1[i + 1]; s21 = buff2[i + 1]; d00 = vis_fmul8x16au(vis_read_lo(s00), k0); d01 = vis_fmul8x16au(vis_read_hi(s01), k0); d10 = vis_fmul8x16au(vis_read_lo(s10), k1); d11 = vis_fmul8x16au(vis_read_hi(s11), k1); d20 = vis_fmul8x16au(vis_read_lo(s20), k2); d21 = vis_fmul8x16au(vis_read_hi(s21), k2); d00 = vis_fpadd16(d00, d10); d0 = vis_fpadd16(d20, d0); d0 = 
vis_fpadd16(d00, d0); d01 = vis_fpadd16(d01, d11); d1 = vis_fpadd16(d21, d1); d1 = vis_fpadd16(d01, d1); buffd[2 * i] = d0; buffd[2 * i + 1] = d1; } } else { s01 = buff0[0]; s11 = buff1[0]; s21 = buff2[0]; #pragma pipeloop(0) for (i = 0; i < (xsize + 7) / 8; i++) { d0 = buffd[2 * i]; d1 = buffd[2 * i + 1]; s00 = s01; s10 = s11; s20 = s21; s01 = buff0[i + 1]; s11 = buff1[i + 1]; s21 = buff2[i + 1]; s0 = vis_faligndata(s00, s01); s1 = vis_faligndata(s10, s11); s2 = vis_faligndata(s20, s21); d00 = vis_fmul8x16au(vis_read_hi(s0), k0); d01 = vis_fmul8x16au(vis_read_lo(s0), k0); d10 = vis_fmul8x16au(vis_read_hi(s1), k1); d11 = vis_fmul8x16au(vis_read_lo(s1), k1); d20 = vis_fmul8x16au(vis_read_hi(s2), k2); d21 = vis_fmul8x16au(vis_read_lo(s2), k2); d00 = vis_fpadd16(d00, d10); d0 = vis_fpadd16(d20, d0); d0 = vis_fpadd16(d00, d0); d01 = vis_fpadd16(d01, d11); d1 = vis_fpadd16(d21, d1); d1 = vis_fpadd16(d01, d1); buffd[2 * i] = d0; buffd[2 * i + 1] = d1; } } } pk += 3 * m; } else { /* jk_size == 4 */ for (ik = 0; ik < m; ik++, coff++) { if (!jk && ik == ik_last) continue; k0 = pk[ik]; k1 = pk[ik + m]; k2 = pk[ik + 2 * m]; k3 = pk[ik + 3 * m]; doff = coff / 8; buff0 = buff[jk] + doff; buff1 = buff[jk + 1] + doff; buff2 = buff[jk + 2] + doff; buff3 = buff[jk + 3] + doff; off = coff & 7; vis_write_gsr(gsr_scale + off); if (off == 0) { #pragma pipeloop(0) for (i = 0; i < (xsize + 7) / 8; i++) { d0 = buffd[2 * i]; d1 = buffd[2 * i + 1]; s0 = buff0[i]; s1 = buff1[i]; s2 = buff2[i]; s3 = buff3[i]; d00 = vis_fmul8x16au(vis_read_hi(s0), k0); d01 = vis_fmul8x16au(vis_read_lo(s0), k0); d10 = vis_fmul8x16au(vis_read_hi(s1), k1); d11 = vis_fmul8x16au(vis_read_lo(s1), k1); d20 = vis_fmul8x16au(vis_read_hi(s2), k2); d21 = vis_fmul8x16au(vis_read_lo(s2), k2); d30 = vis_fmul8x16au(vis_read_hi(s3), k3); d31 = vis_fmul8x16au(vis_read_lo(s3), k3); d00 = vis_fpadd16(d00, d10); d20 = vis_fpadd16(d20, d30); d0 = vis_fpadd16(d0, d00); d0 = vis_fpadd16(d0, d20); d01 = vis_fpadd16(d01, d11); d21 = 
vis_fpadd16(d21, d31); d1 = vis_fpadd16(d1, d01); d1 = vis_fpadd16(d1, d21); buffd[2 * i] = d0; buffd[2 * i + 1] = d1; } } else if (off == 4) { s01 = buff0[0]; s11 = buff1[0]; s21 = buff2[0]; s31 = buff3[0]; #pragma pipeloop(0) for (i = 0; i < (xsize + 7) / 8; i++) { d0 = buffd[2 * i]; d1 = buffd[2 * i + 1]; s00 = s01; s10 = s11; s20 = s21; s30 = s31; s01 = buff0[i + 1]; s11 = buff1[i + 1]; s21 = buff2[i + 1]; s31 = buff3[i + 1]; d00 = vis_fmul8x16au(vis_read_lo(s00), k0); d01 = vis_fmul8x16au(vis_read_hi(s01), k0); d10 = vis_fmul8x16au(vis_read_lo(s10), k1); d11 = vis_fmul8x16au(vis_read_hi(s11), k1); d20 = vis_fmul8x16au(vis_read_lo(s20), k2); d21 = vis_fmul8x16au(vis_read_hi(s21), k2); d30 = vis_fmul8x16au(vis_read_lo(s30), k3); d31 = vis_fmul8x16au(vis_read_hi(s31), k3); d00 = vis_fpadd16(d00, d10); d20 = vis_fpadd16(d20, d30); d0 = vis_fpadd16(d0, d00); d0 = vis_fpadd16(d0, d20); d01 = vis_fpadd16(d01, d11); d21 = vis_fpadd16(d21, d31); d1 = vis_fpadd16(d1, d01); d1 = vis_fpadd16(d1, d21); buffd[2 * i] = d0; buffd[2 * i + 1] = d1; } } else { s01 = buff0[0]; s11 = buff1[0]; s21 = buff2[0]; s31 = buff3[0]; #pragma pipeloop(0) for (i = 0; i < (xsize + 7) / 8; i++) { d0 = buffd[2 * i]; d1 = buffd[2 * i + 1]; s00 = s01; s10 = s11; s20 = s21; s30 = s31; s01 = buff0[i + 1]; s11 = buff1[i + 1]; s21 = buff2[i + 1]; s31 = buff3[i + 1]; s0 = vis_faligndata(s00, s01); s1 = vis_faligndata(s10, s11); s2 = vis_faligndata(s20, s21); s3 = vis_faligndata(s30, s31); d00 = vis_fmul8x16au(vis_read_hi(s0), k0); d01 = vis_fmul8x16au(vis_read_lo(s0), k0); d10 = vis_fmul8x16au(vis_read_hi(s1), k1); d11 = vis_fmul8x16au(vis_read_lo(s1), k1); d20 = vis_fmul8x16au(vis_read_hi(s2), k2); d21 = vis_fmul8x16au(vis_read_lo(s2), k2); d30 = vis_fmul8x16au(vis_read_hi(s3), k3); d31 = vis_fmul8x16au(vis_read_lo(s3), k3); d00 = vis_fpadd16(d00, d10); d20 = vis_fpadd16(d20, d30); d0 = vis_fpadd16(d0, d00); d0 = vis_fpadd16(d0, d20); d01 = vis_fpadd16(d01, d11); d21 = vis_fpadd16(d21, d31); d1 = 
vis_fpadd16(d1, d01); d1 = vis_fpadd16(d1, d21); buffd[2 * i] = d0; buffd[2 * i + 1] = d1; } } } pk += 4 * m; } } /***************************************** ***************************************** ** Final iteration ** ***************************************** *****************************************/ jk_size = n; if (jk_size >= 6) jk_size = 4; if (jk_size == 5) jk_size = 3; k0 = karr[ik_last]; k1 = karr[ik_last + m]; k2 = karr[ik_last + 2 * m]; k3 = karr[ik_last + 3 * m]; off = ik_last; doff = off / 8; off &= 7; buff0 = buff[0] + doff; buff1 = buff[1] + doff; buff2 = buff[2] + doff; buff3 = buff[3] + doff; vis_write_gsr(gsr_scale + off); if (jk_size == 1) { dp = buffe; s01 = buff0[0]; #pragma pipeloop(0) for (i = 0; i < (xsize + 7) / 8; i++) { s00 = s01; s01 = buff0[i + 1]; s0 = vis_faligndata(s00, s01); d00 = vis_fmul8x16au(vis_read_hi(s0), k0); d01 = vis_fmul8x16au(vis_read_lo(s0), k0); d0 = buffd[2 * i]; d1 = buffd[2 * i + 1]; d0 = vis_fpadd16(d0, d00); d1 = vis_fpadd16(d1, d01); dd = vis_fpack16_pair(d0, d1); dp[i] = dd; buffd[2 * i] = drnd; buffd[2 * i + 1] = drnd; } } else if (jk_size == 2) { dp = buffe; s01 = buff0[0]; s11 = buff1[0]; #pragma pipeloop(0) for (i = 0; i < (xsize + 7) / 8; i++) { s00 = s01; s10 = s11; s01 = buff0[i + 1]; s11 = buff1[i + 1]; s0 = vis_faligndata(s00, s01); s1 = vis_faligndata(s10, s11); d00 = vis_fmul8x16au(vis_read_hi(s0), k0); d01 = vis_fmul8x16au(vis_read_lo(s0), k0); d10 = vis_fmul8x16au(vis_read_hi(s1), k1); d11 = vis_fmul8x16au(vis_read_lo(s1), k1); d0 = buffd[2 * i]; d1 = buffd[2 * i + 1]; d0 = vis_fpadd16(d0, d00); d0 = vis_fpadd16(d0, d10); d1 = vis_fpadd16(d1, d01); d1 = vis_fpadd16(d1, d11); dd = vis_fpack16_pair(d0, d1); dp[i] = dd; buffd[2 * i] = drnd; buffd[2 * i + 1] = drnd; } } else if (jk_size == 3) { dp = buffe; s01 = buff0[0]; s11 = buff1[0]; s21 = buff2[0]; #pragma pipeloop(0) for (i = 0; i < (xsize + 7) / 8; i++) { s00 = s01; s10 = s11; s20 = s21; s01 = buff0[i + 1]; s11 = buff1[i + 1]; s21 = buff2[i + 
1]; s0 = vis_faligndata(s00, s01); s1 = vis_faligndata(s10, s11); s2 = vis_faligndata(s20, s21); d00 = vis_fmul8x16au(vis_read_hi(s0), k0); d01 = vis_fmul8x16au(vis_read_lo(s0), k0); d10 = vis_fmul8x16au(vis_read_hi(s1), k1); d11 = vis_fmul8x16au(vis_read_lo(s1), k1); d20 = vis_fmul8x16au(vis_read_hi(s2), k2); d21 = vis_fmul8x16au(vis_read_lo(s2), k2); d0 = buffd[2 * i]; d1 = buffd[2 * i + 1]; d0 = vis_fpadd16(d0, d00); d0 = vis_fpadd16(d0, d10); d0 = vis_fpadd16(d0, d20); d1 = vis_fpadd16(d1, d01); d1 = vis_fpadd16(d1, d11); d1 = vis_fpadd16(d1, d21); dd = vis_fpack16_pair(d0, d1); dp[i] = dd; buffd[2 * i] = drnd; buffd[2 * i + 1] = drnd; } } else { /* if (jk_size == 4) */ dp = buffe; s01 = buff0[0]; s11 = buff1[0]; s21 = buff2[0]; s31 = buff3[0]; #pragma pipeloop(0) for (i = 0; i < (xsize + 7) / 8; i++) { s00 = s01; s10 = s11; s20 = s21; s30 = s31; s01 = buff0[i + 1]; s11 = buff1[i + 1]; s21 = buff2[i + 1]; s31 = buff3[i + 1]; s0 = vis_faligndata(s00, s01); s1 = vis_faligndata(s10, s11); s2 = vis_faligndata(s20, s21); s3 = vis_faligndata(s30, s31); d00 = vis_fmul8x16au(vis_read_hi(s0), k0); d01 = vis_fmul8x16au(vis_read_lo(s0), k0); d10 = vis_fmul8x16au(vis_read_hi(s1), k1); d11 = vis_fmul8x16au(vis_read_lo(s1), k1); d20 = vis_fmul8x16au(vis_read_hi(s2), k2); d21 = vis_fmul8x16au(vis_read_lo(s2), k2); d30 = vis_fmul8x16au(vis_read_hi(s3), k3); d31 = vis_fmul8x16au(vis_read_lo(s3), k3); d0 = buffd[2 * i]; d1 = buffd[2 * i + 1]; d0 = vis_fpadd16(d0, d00); d0 = vis_fpadd16(d0, d10); d0 = vis_fpadd16(d0, d20); d0 = vis_fpadd16(d0, d30); d1 = vis_fpadd16(d1, d01); d1 = vis_fpadd16(d1, d11); d1 = vis_fpadd16(d1, d21); d1 = vis_fpadd16(d1, d31); dd = vis_fpack16_pair(d0, d1); dp[i] = dd; buffd[2 * i] = drnd; buffd[2 * i + 1] = drnd; } } (*p_proc_store) ((mlib_u8 *) buffe, (mlib_u8 *) dl, xsize, testchan); sl += sll; dl += dll; buff_ind++; if (buff_ind >= (n + 1)) buff_ind = 0; } testchan <<= 1; } mlib_free(pbuff); if (buffs != buffs_local) mlib_free(buffs); return 
MLIB_SUCCESS; }
/*
 * __mlib_VideoUpSample420 - 2x horizontal and vertical chroma upsampling
 * for 4:2:0 video.  src1 is the centre source row; src0 and src2 are its
 * upper and lower neighbour rows.  Each of the n input samples yields two
 * output samples in dst0 (vertically weighted toward src0) and dst1
 * (weighted toward src2), so each destination row holds 2*n bytes.
 *
 * Vertical filter (per column): 3*src1 + neighbour, built with the
 * fone/fthree multipliers through vis_fmul8x16au.  Horizontal filter:
 * 3*near + far on the column sums (fone1/fthree1 via vis_fmul8x16(),
 * with vis_faligndata at GSR offset 2 providing the one-column shift),
 * plus the asymmetric rounding constants deight (8) and dseven (7)
 * before the >>4 performed by vis_fpack16_pair; vis_fpmerge interleaves
 * the even/odd phases into final pixel order.
 *
 * After the main pass, a second loop re-reads both destination rows and
 * shifts them one byte to the right via vis_faligndata at GSR offset 7,
 * and the first and last pixel of each row are recomputed in scalar code
 * to apply the boundary (edge-replication) rule.
 * Returns MLIB_FAILURE for n <= 0, otherwise MLIB_SUCCESS.
 * NOTE(review): dst0/dst1 appear to be assumed 8-byte aligned (they are
 * cast straight to mlib_d64 *) - confirm against the caller's contract.
 */
mlib_status __mlib_VideoUpSample420( mlib_u8 *dst0, mlib_u8 *dst1, const mlib_u8 *src0, const mlib_u8 *src1, const mlib_u8 *src2, mlib_s32 n) { mlib_u8 *dend0 = dst0 + 2 * n - 1; mlib_d64 *dp0 = (mlib_d64 *)dst0; mlib_d64 *dp1 = (mlib_d64 *)dst1; mlib_d64 *sp0 = (mlib_d64 *)src0; mlib_d64 *sp1 = (mlib_d64 *)src1; mlib_d64 *sp2 = (mlib_d64 *)src2; mlib_d64 d00, d01, d10, d11, d20, d21; mlib_d64 thiscolsum0_hi, thiscolsum0_lo, lastcolsum0_hi, lastcolsum0_lo; mlib_d64 shiftcolsum0_hi, shiftcolsum0_lo; mlib_d64 thiscolsum1_hi, thiscolsum1_lo, lastcolsum1_hi, lastcolsum1_lo; mlib_d64 shiftcolsum1_hi, shiftcolsum1_lo; mlib_d64 acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; mlib_d64 ac0, ac1, ac2, ac3, ac4, ac5, ac6, ac7; mlib_d64 data0, data1, data2, data3, tmp0, tmp1; mlib_f32 fone = vis_to_float(0x4000000); mlib_f32 fthree = vis_to_float(0xC000000); mlib_f32 fone1 = vis_to_float(0x40404040); mlib_f32 fthree1 = vis_to_float(0xC0C0C0C0); mlib_d64 dseven = vis_to_double_dup(0x70007); mlib_d64 deight = vis_to_double_dup(0x80008); mlib_s32 i, emask; if (n <= 0) return (MLIB_FAILURE); vis_write_gsr((3 << 3) + 2); d00 = vis_ld_d64_nf(sp0); d10 = vis_ld_d64_nf(sp1); d20 = vis_ld_d64_nf(sp2); sp0++; sp1++; sp2++; lastcolsum0_hi = vis_fmul8x16au(vis_read_hi(d00), fone); lastcolsum0_lo = vis_fmul8x16au(vis_read_lo(d00), fone); lastcolsum1_hi = vis_fmul8x16au(vis_read_hi(d20), fone); lastcolsum1_lo = vis_fmul8x16au(vis_read_lo(d20), fone); tmp0 = vis_fmul8x16au(vis_read_hi(d10), fthree); tmp1 = vis_fmul8x16au(vis_read_lo(d10), fthree); lastcolsum0_hi = vis_fpadd16(lastcolsum0_hi, tmp0); lastcolsum0_lo = vis_fpadd16(lastcolsum0_lo, tmp1); lastcolsum1_hi = vis_fpadd16(lastcolsum1_hi, tmp0); lastcolsum1_lo = vis_fpadd16(lastcolsum1_lo, tmp1); #pragma pipeloop(0) for (i = 0; i < n - 8; i += 8) { d01 = *sp0; d11 = *sp1; d21 = *sp2; sp0++; sp1++; sp2++; thiscolsum0_hi = vis_fmul8x16au(vis_read_hi(d01), fone); thiscolsum0_lo = vis_fmul8x16au(vis_read_lo(d01), fone); thiscolsum1_hi = 
vis_fmul8x16au(vis_read_hi(d21), fone); thiscolsum1_lo = vis_fmul8x16au(vis_read_lo(d21), fone); tmp0 = vis_fmul8x16au(vis_read_hi(d11), fthree); tmp1 = vis_fmul8x16au(vis_read_lo(d11), fthree); thiscolsum0_hi = vis_fpadd16(thiscolsum0_hi, tmp0); thiscolsum0_lo = vis_fpadd16(thiscolsum0_lo, tmp1); thiscolsum1_hi = vis_fpadd16(thiscolsum1_hi, tmp0); thiscolsum1_lo = vis_fpadd16(thiscolsum1_lo, tmp1); acc0 = vis_fmul8x16(fone1, lastcolsum0_hi); acc1 = vis_fmul8x16(fone1, lastcolsum0_lo); acc2 = vis_fmul8x16(fthree1, lastcolsum0_hi); acc3 = vis_fmul8x16(fthree1, lastcolsum0_lo); acc4 = vis_fmul8x16(fone1, lastcolsum1_hi); acc5 = vis_fmul8x16(fone1, lastcolsum1_lo); acc6 = vis_fmul8x16(fthree1, lastcolsum1_hi); acc7 = vis_fmul8x16(fthree1, lastcolsum1_lo); shiftcolsum0_hi = vis_faligndata(lastcolsum0_hi, lastcolsum0_lo); shiftcolsum0_lo = vis_faligndata(lastcolsum0_lo, thiscolsum0_hi); shiftcolsum1_hi = vis_faligndata(lastcolsum1_hi, lastcolsum1_lo); shiftcolsum1_lo = vis_faligndata(lastcolsum1_lo, thiscolsum1_hi); acc0 = vis_fpadd16(acc0, deight); acc1 = vis_fpadd16(acc1, deight); acc2 = vis_fpadd16(acc2, dseven); acc3 = vis_fpadd16(acc3, dseven); acc4 = vis_fpadd16(acc4, deight); acc5 = vis_fpadd16(acc5, deight); acc6 = vis_fpadd16(acc6, dseven); acc7 = vis_fpadd16(acc7, dseven); ac0 = vis_fmul8x16(fthree1, shiftcolsum0_hi); ac1 = vis_fmul8x16(fthree1, shiftcolsum0_lo); ac2 = vis_fmul8x16(fone1, shiftcolsum0_hi); ac3 = vis_fmul8x16(fone1, shiftcolsum0_lo); ac4 = vis_fmul8x16(fthree1, shiftcolsum1_hi); ac5 = vis_fmul8x16(fthree1, shiftcolsum1_lo); ac6 = vis_fmul8x16(fone1, shiftcolsum1_hi); ac7 = vis_fmul8x16(fone1, shiftcolsum1_lo); acc0 = vis_fpadd16(acc0, ac0); acc1 = vis_fpadd16(acc1, ac1); acc2 = vis_fpadd16(acc2, ac2); acc3 = vis_fpadd16(acc3, ac3); acc4 = vis_fpadd16(acc4, ac4); acc5 = vis_fpadd16(acc5, ac5); acc6 = vis_fpadd16(acc6, ac6); acc7 = vis_fpadd16(acc7, ac7); data0 = vis_fpack16_pair(acc0, acc1); data1 = vis_fpack16_pair(acc2, acc3); data2 = 
vis_fpack16_pair(acc4, acc5); data3 = vis_fpack16_pair(acc6, acc7); dp0[0] = vis_fpmerge(vis_read_hi(data1), vis_read_hi(data0)); dp0[1] = vis_fpmerge(vis_read_lo(data1), vis_read_lo(data0)); dp1[0] = vis_fpmerge(vis_read_hi(data3), vis_read_hi(data2)); dp1[1] = vis_fpmerge(vis_read_lo(data3), vis_read_lo(data2)); dp0 += 2; dp1 += 2; lastcolsum0_hi = thiscolsum0_hi; lastcolsum0_lo = thiscolsum0_lo; lastcolsum1_hi = thiscolsum1_hi; lastcolsum1_lo = thiscolsum1_lo; } if (i < n) { acc0 = vis_fmul8x16(fone1, lastcolsum0_hi); acc1 = vis_fmul8x16(fone1, lastcolsum0_lo); acc2 = vis_fmul8x16(fthree1, lastcolsum0_hi); acc3 = vis_fmul8x16(fthree1, lastcolsum0_lo); acc4 = vis_fmul8x16(fone1, lastcolsum1_hi); acc5 = vis_fmul8x16(fone1, lastcolsum1_lo); acc6 = vis_fmul8x16(fthree1, lastcolsum1_hi); acc7 = vis_fmul8x16(fthree1, lastcolsum1_lo); shiftcolsum0_hi = vis_faligndata(lastcolsum0_hi, lastcolsum0_lo); shiftcolsum0_lo = vis_faligndata(lastcolsum0_lo, lastcolsum0_lo); shiftcolsum1_hi = vis_faligndata(lastcolsum1_hi, lastcolsum1_lo); shiftcolsum1_lo = vis_faligndata(lastcolsum1_lo, lastcolsum1_lo); acc0 = vis_fpadd16(acc0, deight); acc1 = vis_fpadd16(acc1, deight); acc2 = vis_fpadd16(acc2, dseven); acc3 = vis_fpadd16(acc3, dseven); acc4 = vis_fpadd16(acc4, deight); acc5 = vis_fpadd16(acc5, deight); acc6 = vis_fpadd16(acc6, dseven); acc7 = vis_fpadd16(acc7, dseven); ac0 = vis_fmul8x16(fthree1, shiftcolsum0_hi); ac1 = vis_fmul8x16(fthree1, shiftcolsum0_lo); ac2 = vis_fmul8x16(fone1, shiftcolsum0_hi); ac3 = vis_fmul8x16(fone1, shiftcolsum0_lo); ac4 = vis_fmul8x16(fthree1, shiftcolsum1_hi); ac5 = vis_fmul8x16(fthree1, shiftcolsum1_lo); ac6 = vis_fmul8x16(fone1, shiftcolsum1_hi); ac7 = vis_fmul8x16(fone1, shiftcolsum1_lo); acc0 = vis_fpadd16(acc0, ac0); acc1 = vis_fpadd16(acc1, ac1); acc2 = vis_fpadd16(acc2, ac2); acc3 = vis_fpadd16(acc3, ac3); acc4 = vis_fpadd16(acc4, ac4); acc5 = vis_fpadd16(acc5, ac5); acc6 = vis_fpadd16(acc6, ac6); acc7 = vis_fpadd16(acc7, ac7); data0 = 
vis_fpack16_pair(acc0, acc1); data1 = vis_fpack16_pair(acc2, acc3); data2 = vis_fpack16_pair(acc4, acc5); data3 = vis_fpack16_pair(acc6, acc7); acc0 = vis_fpmerge(vis_read_hi(data1), vis_read_hi(data0)); acc1 = vis_fpmerge(vis_read_hi(data3), vis_read_hi(data2)); emask = vis_edge8(dp0, dend0); vis_pst_8(acc0, dp0, emask); vis_pst_8(acc1, dp1, emask); i += 4; dp0++; dp1++; if (i < n) { acc0 = vis_fpmerge(vis_read_lo(data1), vis_read_lo(data0)); acc1 = vis_fpmerge(vis_read_lo(data3), vis_read_lo(data2)); emask = vis_edge8(dp0, dend0); vis_pst_8(acc0, dp0, emask); vis_pst_8(acc1, dp1, emask); } } vis_write_gsr(7); dp0 = (mlib_d64 *)dst0; dp1 = (mlib_d64 *)dst1; ac0 = *dp0; ac2 = *dp1; #pragma pipeloop(0) for (i = 0; i < 2 * n - 8; i += 8) { ac1 = *dp0; ac3 = *dp1; *dp0 = vis_faligndata(ac0, ac1); *dp1 = vis_faligndata(ac2, ac3); dp0++; dp1++; ac0 = ac1; ac2 = ac3; } if (i < 2 * n) { ac1 = vis_ld_d64_nf(dp0); ac3 = vis_ld_d64_nf(dp1); emask = vis_edge8(dp0, dend0); acc0 = vis_faligndata(ac0, ac1); acc1 = vis_faligndata(ac2, ac3); vis_pst_8(acc0, dp0, emask); vis_pst_8(acc1, dp1, emask); } dst0[0] = (4 * (3 * src1[0] + src0[0]) + 8) >> 4; dst1[0] = (4 * (3 * src1[0] + src2[0]) + 8) >> 4; dst0[2 * n - 1] = (4 * (3 * src1[n - 1] + src0[n - 1]) + 7) >> 4; dst1[2 * n - 1] = (4 * (3 * src1[n - 1] + src2[n - 1]) + 7) >> 4; return (MLIB_SUCCESS); }
/*
 * mlib_v_conv5x5_8nw_mask - 5x5 convolution over an 8-bit image with a
 * "no-write" 2-pixel border (dw/dh are shrunk by 4; output starts at
 * row 2 / column 2 of dst), restricted to the channels selected by cmask.
 * scalef_expon is the fixed-point scale; GSR scalefactor = 31 - expon,
 * with the matching rounding constant taken from mlib_round_8[].
 *
 * Each selected channel is first copied element-wise into five
 * 8-byte-aligned line buffers (sbuf1..sbuf5, rotated every output row)
 * plus one packed-result buffer (dbuf).  The main j-loop is software
 * pipelined in two halves: while kernel rows 1-3 are convolved
 * (CONV_AU/CONV_AL and SHIFT_U8_1 macros, defined elsewhere in this
 * file), the next source row is prefetched byte-wise through
 * t1..t8/s_buf1; while rows 4-5 are convolved and packed
 * (vis_fpack16_pair), the previous row's result bytes (tt1..tt8, read
 * from d_buf, which starts one mlib_d64 before dbuf) are scattered back
 * to dst with stride nchannel - nchannel1 is 0 on the first iteration so
 * the first, garbage, store block is written harmlessly on top of
 * itself.  The last output row repeats the loop body without the
 * next-row prefetch.  The trailing if-chains flush the final <= 8 bytes
 * still queued in d_buf.
 *
 * NOTE(review): the comments above assume the conventional mediaLib
 * meaning of CONV_AU/CONV_AL (multiply-accumulate into out0/out1) and
 * SHIFT_U8_1/SHIFT_U8_2 - confirm against the macro definitions.
 * Returns MLIB_FAILURE if the buffer allocation fails, else MLIB_SUCCESS.
 */
mlib_status mlib_v_conv5x5_8nw_mask( mlib_image *dst, const mlib_image *src, const mlib_s32 *kernel, mlib_s32 scalef_expon, mlib_s32 cmask) { /* pointers to dst row */ mlib_u8 *da, *d_a; /* pointers to src, dst data */ mlib_u8 *adr_dst, *dend, *adr_src; /* pointers to src rows */ mlib_u8 *sa, *sa2, *sa3, *sa4, *sa5, *sa6, *sa_6, *prow; /* pointers to rows in interm. src buf */ mlib_u8 *buff_src, *sbuf1, *sbuf2, *sbuf3, *sbuf4, *sbuf5, *s_buf1; /* pointers to row in interm. dst buf */ mlib_u8 *dbuf, *d_buf; /* mlib_d64 pointers to rows in interm. src buf */ mlib_d64 *s1, *s2, *s3, *s4, *s5; /* mlib_d64 pointer to row in interm. dst buf */ mlib_d64 *ddst, *ddst1; /* src, dst and interm. buf. strides */ mlib_s32 dlb, slb, buf_slb; mlib_s32 dh, dw; mlib_d64 out0, out1, tmp0, tmp1, rnd; /* data */ mlib_d64 d1, d2, d3, d4, d5, d_1, d_2, d_3, d_4, d_5; /* temp. data, used in faligndata */ mlib_d64 dt_1, dt_2, dt_3, dt_4, dt_5; /* shifted data */ mlib_d64 d21, d22, d23, d24, d25; mlib_f32 k1k2, k17k18, k19k20, k21k22, k23k24, k25; mlib_f32 k3k4, k5k6, k7k8, k9k10, k11k12, k13k14, k15k16; mlib_s32 rval, gsr_scale, i, j, nchannel, nchannel1, chan, testchan; /* temp, used in load-store */ mlib_s32 t1, t2, t3, t4, t5, t6, t7, t8, tt1, tt2, tt3, tt4, tt5, tt6, tt7, tt8; adr_src = mlib_ImageGetData(src); adr_dst = mlib_ImageGetData(dst); nchannel = mlib_ImageGetChannels(src); slb = mlib_ImageGetStride(src); dlb = mlib_ImageGetStride(dst); dh = mlib_ImageGetHeight(dst); dw = mlib_ImageGetWidth(dst); /* buf_slb - 8-byte aligned */ buf_slb = (dw + 16) & (~7); /* alloc. interm. src and dst buffer */ buff_src = (mlib_u8 *)__mlib_malloc(7 * buf_slb * sizeof (mlib_u8) + 8); if (buff_src == NULL) return (MLIB_FAILURE); /* edge - no write */ dw -= 4; dh -= 4; /* * The 8x16 mult has built-in 8-bit R shift, and fpack16 has 7-bit * fixed R shift (preceded by variable-bit L shift controlled by GSR * scalefactor field). 
Thus net R shift = (8+7)-(GSR.scalefactor_field), * so GSR.scalefactor_field = 15-(net R shift): */ gsr_scale = 31 - scalef_expon; vis_write_gsr((gsr_scale << 3) + 1); rval = mlib_round_8[gsr_scale]; rnd = vis_freg_pair(vis_to_float(rval), vis_to_float(rval)); sbuf1 = (mlib_u8 *)((mlib_addr)(buff_src + 8) & (~7)); sbuf2 = sbuf1 + buf_slb; sbuf3 = sbuf2 + buf_slb; sbuf4 = sbuf3 + buf_slb; sbuf5 = sbuf4 + buf_slb; dbuf = sbuf5 + buf_slb; LOAD_KERNEL_INTO_FLOAT(); testchan = 1; for (chan = nchannel - 1; chan >= 0; chan--) { if ((cmask & testchan) == 0) { testchan <<= 1; continue; } testchan <<= 1; sa = adr_src + chan; sa2 = sa + slb; sa3 = sa2 + slb; sa4 = sa3 + slb; sa5 = sa4 + slb; sa_6 = sa6 = sa5 + slb; d_a = adr_dst + (dlb << 1) + (nchannel << 1) + chan; /* load interm. src buff */ for (i = 0, j = 0; j < (dw + 4); i += nchannel, j++) { sbuf1[j] = sa5[i]; sbuf2[j] = sa[i]; sbuf3[j] = sa2[i]; sbuf4[j] = sa3[i]; sbuf5[j] = sa4[i]; } for (j = 0; j < dh - 1; j++) { ddst1 = ddst = (mlib_d64 *)(dbuf); d_buf = (dbuf - 8); da = d_a; dend = da + (dw - 1) * nchannel; prow = sbuf1; sbuf1 = sbuf2; sbuf2 = sbuf3; sbuf3 = sbuf4; sbuf4 = sbuf5; sbuf5 = prow; s1 = (mlib_d64 *)sbuf1; s2 = (mlib_d64 *)sbuf2; s3 = (mlib_d64 *)sbuf3; s4 = (mlib_d64 *)sbuf4; s5 = (mlib_d64 *)sbuf5; s_buf1 = sbuf1; d1 = *s1; d2 = *s2; d3 = *s3; nchannel1 = 0; #pragma pipeloop(0) for (i = 0; i < dw; i += 8) { d_1 = *(s1 + 1); d_2 = *(s2 + 1); d_3 = *(s3 + 1); out0 = out1 = rnd; t1 = vis_ld_u8_nf(sa_6); sa_6 += nchannel; CONV_AU(d1, k1k2); t2 = vis_ld_u8_nf(sa_6); sa_6 += nchannel; CONV_AL(d2, k5k6); t3 = vis_ld_u8_nf(sa_6); sa_6 += nchannel; CONV_AU(d3, k11k12); t4 = vis_ld_u8_nf(sa_6); sa_6 += nchannel; d21 = vis_faligndata(d1, d_1); dt_1 = vis_faligndata(d_1, d1); t5 = vis_ld_u8_nf(sa_6); sa_6 += nchannel; d22 = vis_faligndata(d2, d_2); dt_2 = vis_faligndata(d_2, d2); t6 = vis_ld_u8_nf(sa_6); sa_6 += nchannel; d23 = vis_faligndata(d3, d_3); t7 = vis_ld_u8_nf(sa_6); sa_6 += nchannel; dt_3 = 
vis_faligndata(d_3, d3); t8 = vis_ld_u8_nf(sa_6); sa_6 += nchannel; CONV_AL(d21, k1k2); (*s_buf1++) = t1; CONV_AU(d22, k7k8); (*s_buf1++) = t2; CONV_AL(d23, k11k12); (*s_buf1++) = t3; SHIFT_U8_1; CONV_AU(d21, k3k4); (*s_buf1++) = t4; CONV_AL(d22, k7k8); CONV_AU(d23, k13k14); d21 = vis_faligndata(d21, dt_1); d22 = vis_faligndata(d22, dt_2); (*s_buf1++) = t5; d23 = vis_faligndata(d23, dt_3); CONV_AL(d21, k3k4); (*s_buf1++) = t6; CONV_AU(d22, k9k10); (*s_buf1++) = t7; CONV_AL(d23, k13k14); d21 = vis_freg_pair(vis_read_lo(d1), vis_read_hi(d_1)); CONV_AU(d21, k5k6); d22 = vis_freg_pair(vis_read_lo(d2), vis_read_hi(d_2)); CONV_AL(d22, k9k10); d23 = vis_freg_pair(vis_read_lo(d3), vis_read_hi(d_3)); CONV_AU(d23, k15k16); (*s_buf1++) = t8; ddst[0] = out0; ddst[1] = out1; ddst += 2; d1 = d_1; d2 = d_2; d3 = d_3; s1++; s2++; s3++; } ddst = (mlib_d64 *)(dbuf); d4 = *s4; d5 = *s5; /* * in each iteration store result from prev. iterat. * and load data for processing next row */ #pragma pipeloop(0) for (i = 0; i < dw; i += 8) { d_4 = *(s4 + 1); d_5 = *(s5 + 1); out0 = ddst[0]; out1 = ddst[1]; ddst += 2; tt1 = (*d_buf++); CONV_AL(d4, k15k16); tt2 = (*d_buf++); CONV_AU(d5, k21k22); d24 = vis_faligndata(d4, d_4); tt3 = (*d_buf++); dt_4 = vis_faligndata(d_4, d4); d25 = vis_faligndata(d5, d_5); tt4 = (*d_buf++); dt_5 = vis_faligndata(d_5, d5); tt5 = (*d_buf++); CONV_AU(d24, k17k18); tt6 = (*d_buf++); CONV_AL(d25, k21k22); tt7 = (*d_buf++); SHIFT_U8_2; tt8 = (*d_buf++); CONV_AL(d24, k17k18); *da = tt1; da += nchannel1; CONV_AU(d25, k23k24); *da = tt2; da += nchannel1; d24 = vis_faligndata(d24, dt_4); *da = tt3; da += nchannel1; d25 = vis_faligndata(d25, dt_5); *da = tt4; da += nchannel1; CONV_AU(d24, k19k20); *da = tt5; da += nchannel1; CONV_AL(d25, k23k24); *da = tt6; da += nchannel1; d24 = vis_freg_pair(vis_read_lo(d4), vis_read_hi(d_4)); CONV_AL(d24, k19k20); *da = tt7; da += nchannel1; d25 = vis_freg_pair(vis_read_lo(d5), vis_read_hi(d_5)); CONV_AU(d25, k25); *da = tt8; da += 
nchannel1; (*ddst1++) = vis_fpack16_pair(out0, out1); d4 = d_4; d5 = d_5; s4++; s5++; nchannel1 = nchannel; } (*s_buf1++) = vis_ld_u8_nf(sa_6); sa_6 += nchannel; (*s_buf1++) = vis_ld_u8_nf(sa_6); sa_6 += nchannel; (*s_buf1++) = vis_ld_u8_nf(sa_6); sa_6 += nchannel; (*s_buf1++) = vis_ld_u8_nf(sa_6); sa_6 += nchannel; (*s_buf1++) = vis_ld_u8_nf(sa_6); sa_6 += nchannel; (*s_buf1++) = vis_ld_u8_nf(sa_6); sa_6 += nchannel; (*s_buf1++) = vis_ld_u8_nf(sa_6); sa_6 += nchannel; (*s_buf1++) = vis_ld_u8_nf(sa_6); if ((mlib_addr)da <= (mlib_addr)dend) { *da = (*d_buf++); da += nchannel; } if ((mlib_addr)da <= (mlib_addr)dend) { *da = (*d_buf++); da += nchannel; } if ((mlib_addr)da <= (mlib_addr)dend) { *da = (*d_buf++); da += nchannel; } if ((mlib_addr)da <= (mlib_addr)dend) { *da = (*d_buf++); da += nchannel; } if ((mlib_addr)da <= (mlib_addr)dend) { *da = (*d_buf++); da += nchannel; } if ((mlib_addr)da <= (mlib_addr)dend) { *da = (*d_buf++); da += nchannel; } if ((mlib_addr)da <= (mlib_addr)dend) { *da = (*d_buf++); da += nchannel; } if ((mlib_addr)da <= (mlib_addr)dend) { *da = (*d_buf++); } sa_6 = sa6 = sa6 + slb; d_a += dlb; } /* process last row - no need to load data */ ddst1 = ddst = (mlib_d64 *)(dbuf); d_buf = (dbuf - 8); da = d_a; dend = da + (dw - 1) * nchannel; prow = sbuf1; sbuf1 = sbuf2; sbuf2 = sbuf3; sbuf3 = sbuf4; sbuf4 = sbuf5; sbuf5 = prow; s1 = (mlib_d64 *)sbuf1; s2 = (mlib_d64 *)sbuf2; s3 = (mlib_d64 *)sbuf3; s4 = (mlib_d64 *)sbuf4; s5 = (mlib_d64 *)sbuf5; d1 = *s1; d2 = *s2; d3 = *s3; nchannel1 = 0; #pragma pipeloop(0) for (i = 0; i < dw; i += 8) { d_1 = *(s1 + 1); d_2 = *(s2 + 1); d_3 = *(s3 + 1); out0 = out1 = rnd; CONV_AU(d1, k1k2); CONV_AL(d2, k5k6); CONV_AU(d3, k11k12); d21 = vis_faligndata(d1, d_1); dt_1 = vis_faligndata(d_1, d1); d22 = vis_faligndata(d2, d_2); dt_2 = vis_faligndata(d_2, d2); d23 = vis_faligndata(d3, d_3); dt_3 = vis_faligndata(d_3, d3); CONV_AL(d21, k1k2); CONV_AU(d22, k7k8); CONV_AL(d23, k11k12); SHIFT_U8_1; CONV_AU(d21, k3k4); 
CONV_AL(d22, k7k8); CONV_AU(d23, k13k14); d21 = vis_faligndata(d21, dt_1); d22 = vis_faligndata(d22, dt_2); d23 = vis_faligndata(d23, dt_3); CONV_AL(d21, k3k4); CONV_AU(d22, k9k10); CONV_AL(d23, k13k14); d21 = vis_freg_pair(vis_read_lo(d1), vis_read_hi(d_1)); CONV_AU(d21, k5k6); d22 = vis_freg_pair(vis_read_lo(d2), vis_read_hi(d_2)); CONV_AL(d22, k9k10); d23 = vis_freg_pair(vis_read_lo(d3), vis_read_hi(d_3)); CONV_AU(d23, k15k16); ddst[0] = out0; ddst[1] = out1; ddst += 2; d1 = d_1; d2 = d_2; d3 = d_3; s1++; s2++; s3++; } ddst = (mlib_d64 *)(dbuf); d4 = *s4; d5 = *s5; /* * in each iteration store result from prev. iterat. * and load data for processing next row */ #pragma pipeloop(0) for (i = 0; i < dw; i += 8) { d_4 = *(s4 + 1); d_5 = *(s5 + 1); out0 = ddst[0]; out1 = ddst[1]; ddst += 2; tt1 = (*d_buf++); CONV_AL(d4, k15k16); tt2 = (*d_buf++); CONV_AU(d5, k21k22); d24 = vis_faligndata(d4, d_4); tt3 = (*d_buf++); dt_4 = vis_faligndata(d_4, d4); d25 = vis_faligndata(d5, d_5); tt4 = (*d_buf++); dt_5 = vis_faligndata(d_5, d5); tt5 = (*d_buf++); CONV_AU(d24, k17k18); tt6 = (*d_buf++); CONV_AL(d25, k21k22); tt7 = (*d_buf++); SHIFT_U8_2; tt8 = (*d_buf++); CONV_AL(d24, k17k18); *da = tt1; da += nchannel1; CONV_AU(d25, k23k24); *da = tt2; da += nchannel1; d24 = vis_faligndata(d24, dt_4); *da = tt3; da += nchannel1; d25 = vis_faligndata(d25, dt_5); *da = tt4; da += nchannel1; CONV_AU(d24, k19k20); *da = tt5; da += nchannel1; CONV_AL(d25, k23k24); *da = tt6; da += nchannel1; d24 = vis_freg_pair(vis_read_lo(d4), vis_read_hi(d_4)); CONV_AL(d24, k19k20); *da = tt7; da += nchannel1; d25 = vis_freg_pair(vis_read_lo(d5), vis_read_hi(d_5)); CONV_AU(d25, k25); *da = tt8; da += nchannel1; (*ddst1++) = vis_fpack16_pair(out0, out1); d4 = d_4; d5 = d_5; s4++; s5++; nchannel1 = nchannel; } if ((mlib_addr)da <= (mlib_addr)dend) { *da = (*d_buf++); da += nchannel; } if ((mlib_addr)da <= (mlib_addr)dend) { *da = (*d_buf++); da += nchannel; } if ((mlib_addr)da <= (mlib_addr)dend) { *da = 
(*d_buf++); da += nchannel; } if ((mlib_addr)da <= (mlib_addr)dend) { *da = (*d_buf++); da += nchannel; } if ((mlib_addr)da <= (mlib_addr)dend) { *da = (*d_buf++); da += nchannel; } if ((mlib_addr)da <= (mlib_addr)dend) { *da = (*d_buf++); da += nchannel; } if ((mlib_addr)da <= (mlib_addr)dend) { *da = (*d_buf++); da += nchannel; } if ((mlib_addr)da <= (mlib_addr)dend) { *da = (*d_buf++); } } __mlib_free(buff_src); return (MLIB_SUCCESS); }
/*
 * mlib_v_conv5x5_8nw_4 - 5x5 convolution specialised for 4-channel 8-bit
 * images with a "no-write" 2-pixel border (dw/dh shrunk by 4; output
 * starts at dst + 2*dlb + 8).  scalef_expon gives the fixed-point scale
 * (GSR scalefactor = 31 - expon; rounding constant from mlib_round_8[]).
 *
 * Because each pixel is 4 bytes, the one-pixel horizontal shift inside a
 * 64-bit word is a constant 4 bytes, so the inner loops use a single
 * vis_alignaddr(s1, 4) instead of per-channel extraction.  Source rows
 * are staged through five ring buffers (sbuf1..sbuf5, rotated by
 * LOOP_INI) with the next row prefetched via LOAD_LINE_INTO_BUFFER_NF
 * while computing.  Each output row is accumulated into dbuf in two
 * passes - kernel rows 1-3, then rows 4-5 - and finally packed
 * (vis_fpack16_pair) and copied to dst by the COPY_* macros.
 *
 * NOTE(review): GET_SRC_DST_PARAMETERS, PREPARE_INTERM_BUFFERS,
 * PREPARE_TO_LOAD_LINE, LOAD_LINE_*, LOOP_INI, CONV_AU/CONV_AL and the
 * COPY_* macros are defined elsewhere in this file; the description
 * above assumes their usual mediaLib buffered-convolution semantics -
 * confirm against the macro definitions.  PREPARE_INTERM_BUFFERS
 * presumably handles allocation failure internally (this function
 * itself always returns MLIB_SUCCESS) - TODO confirm.
 */
mlib_status mlib_v_conv5x5_8nw_4( mlib_image *dst, const mlib_image *src, const mlib_s32 *kernel, mlib_s32 scalef_expon) { /* pointers to dst row */ mlib_u8 *da, *d_a; /* pointers to src, dst data */ mlib_u8 *adr_dst, *adr_src, *dend; /* pointers to src rows */ mlib_u8 *sa, *sa1, *sa2, *sa3, *sa4; /* pointers to rows in interm. src buf */ mlib_d64 *buff_src, *sbuf1, *sbuf2, *prow; /* pointers to rows in interm. src buf */ mlib_d64 *sbuf3, *sbuf4, *sbuf5; /* pointer to row in interm. dst buf */ mlib_d64 *dbuf, *dbuf1; /* mlib_d64 pointers to rows in interm. src buf */ mlib_d64 *s1, *s2, *s3, *s4, *s5; /* mlib_d64 pointer to row in interm. dst buf */ mlib_d64 *ddst; /* data */ mlib_d64 d1, d2, d3, d4, d5; /* data */ mlib_d64 d11, d12, d13, d14, d15; /* data */ mlib_d64 d21, d22, d23, d24, d25; /* data */ mlib_d64 dt_1, dt_2, dt_3, dt_4, dt_5; mlib_f32 k1k2, k3k4, k5k6, k7k8; mlib_f32 k9k10, k11k12, k13k14, k15k16; mlib_f32 k17k18, k19k20, k21k22, k23k24, k25; /* src, dst and interm. buf. strides */ mlib_s32 dlb, slb, buf_slb; mlib_s32 dh, dw; mlib_d64 out0, out1; mlib_d64 tmp0, tmp1, rnd; mlib_d64 *dsa, *dp; mlib_d64 sd0, sd1; mlib_s32 emask; mlib_s32 rval, gsr_scale, i, j; gsr_scale = 31 - scalef_expon; vis_write_gsr((gsr_scale << 3)); rval = mlib_round_8[gsr_scale]; rnd = vis_freg_pair(vis_to_float(rval), vis_to_float(rval)); GET_SRC_DST_PARAMETERS(); LOAD_KERNEL_INTO_FLOAT(); buf_slb = (4 * dw + 24) >> 3; PREPARE_INTERM_BUFFERS(); dw -= 4; dw *= 4; dh -= 4; sa = adr_src; sa1 = sa + slb; sa2 = sa1 + slb; sa3 = sa2 + slb; sa4 = sa3 + slb; d_a = adr_dst + 2 * dlb + 8; /* load interm. src buff */ PREPARE_TO_LOAD_LINE(sbuf2, sa); #pragma pipeloop(0) LOAD_LINE_INTO_BUFFER(16); /* load interm. src buff */ PREPARE_TO_LOAD_LINE(sbuf3, sa1); #pragma pipeloop(0) LOAD_LINE_INTO_BUFFER(16); /* load interm. src buff */ PREPARE_TO_LOAD_LINE(sbuf4, sa2); #pragma pipeloop(0) LOAD_LINE_INTO_BUFFER(16); /* load interm. 
src buff */ PREPARE_TO_LOAD_LINE(sbuf5, sa3); #pragma pipeloop(0) LOAD_LINE_INTO_BUFFER(16); #pragma pipeloop(0) for (j = 0; j < dh; j++) { LOOP_INI(); PREPARE_TO_LOAD_LINE(sbuf5, sa4); #pragma pipeloop(0) LOAD_LINE_INTO_BUFFER_NF(16); vis_alignaddr(s1, 4); dbuf1 = dbuf; d1 = *s1; d2 = *s2; d3 = *s3; d11 = *(s1 + 1); d12 = *(s2 + 1); d13 = *(s3 + 1); #pragma pipeloop(0) for (i = 0; i < dw; i += 8) { d21 = *(s1 + 2); d22 = *(s2 + 2); d23 = *(s3 + 2); out0 = out1 = rnd; CONV_AU(d1, k1k2); CONV_AL(d2, k5k6); CONV_AU(d3, k11k12); dt_1 = vis_faligndata(d1, d11); dt_2 = vis_faligndata(d2, d12); dt_3 = vis_faligndata(d3, d13); CONV_AL(dt_1, k1k2); CONV_AU(dt_2, k7k8); CONV_AL(dt_3, k11k12); CONV_AU(d11, k3k4); CONV_AL(d12, k7k8); CONV_AU(d13, k13k14); dt_1 = vis_faligndata(d11, d21); dt_2 = vis_faligndata(d12, d22); dt_3 = vis_faligndata(d13, d23); CONV_AL(dt_1, k3k4); CONV_AU(dt_2, k9k10); CONV_AL(dt_3, k13k14); CONV_AU(d21, k5k6); CONV_AL(d22, k9k10); CONV_AU(d23, k15k16); dbuf1[0] = out0; dbuf1[1] = out1; dbuf1 += 2; d1 = d11; d2 = d12; d3 = d13; d11 = d21; d12 = d22; d13 = d23; s1++; s2++; s3++; } dbuf1 = dbuf; d4 = *s4; d5 = *s5; d14 = *(s4 + 1); d15 = *(s5 + 1); #pragma pipeloop(0) for (i = 0; i < dw; i += 8) { d24 = *(s4 + 2); d25 = *(s5 + 2); out0 = dbuf1[0]; out1 = dbuf1[1]; CONV_AL(d4, k15k16); CONV_AU(d5, k21k22); dt_4 = vis_faligndata(d4, d14); dt_5 = vis_faligndata(d5, d15); CONV_AU(dt_4, k17k18); CONV_AL(dt_5, k21k22); CONV_AL(d14, k17k18); CONV_AU(d15, k23k24); dt_4 = vis_faligndata(d14, d24); dt_5 = vis_faligndata(d15, d25); CONV_AU(dt_4, k19k20); CONV_AL(dt_5, k23k24); CONV_AL(d24, k19k20); CONV_AU(d25, k25); dbuf1 += 2; (*ddst++) = vis_fpack16_pair(out0, out1); d4 = d14; d5 = d15; d14 = d24; d15 = d25; s4++; s5++; } PREPARE_TO_COPY_INTERM_BUF_TO_DST(); #pragma pipeloop(0) COPY_INTERM_BUF_TO_DST(); COPY_TAIL(); sa4 = sa4 + slb; d_a += dlb; } __mlib_free(buff_src); return (MLIB_SUCCESS); }
/*
 * YUV 4:1:1 -> ABGR conversion for the case where the destination is NOT
 * 8-byte aligned (name says "dst_nonalign"; the aligned case is presumably
 * handled by a sibling routine — confirm against the caller).
 *
 * Each u/v sample covers 4 y samples (4:1:1).  Per row: y is first staged
 * 8-byte aligned into `buf`, then the main loop converts 16 pixels per
 * iteration.  The alpha channel is never written: emask 0x7777 masks byte 0
 * of every 4-byte ABGR pixel out of each vis_pst_8 partial store.  Because
 * dst is unaligned, every output double-word is assembled from two packed
 * results with vis_faligndata (dd0/dd1 carry across stores and loop
 * iterations), and emask is pre-shifted by the dst misalignment.
 *
 * Fixed-point ITU-601-style coefficients (scale noted in each comment) are
 * applied with fmul8x16 variants; fscale 0x80808080 splits the 16-bit
 * chroma terms for the hi/lo pixel groups.  Returns MLIB_FAILURE only when
 * the row-staging buffer cannot be allocated, else MLIB_SUCCESS.
 */
static mlib_status
mlib_v_VideoColorYUV2ABGR411_dst_nonalign(
    mlib_u8 *abgr,
    const mlib_u8 *y,
    const mlib_u8 *u,
    const mlib_u8 *v,
    mlib_s32 width,
    mlib_s32 height,
    mlib_s32 abgr_stride,
    mlib_s32 y_stride,
    mlib_s32 uv_stride)
{
	/* pointers to src address */
	mlib_u8 *sp1, *sp2, *sp3, *sl1, *sl2, *sl3;
	/* pointers to dst address */
	mlib_u8 *dp, *dl, *dend;
	/* all. pointer to y */
	mlib_d64 *spy;
	/* all. pointer to dst */
	mlib_d64 *dpp;
	/* u, v data */
	mlib_f32 fu, fv;
	/* y data */
	mlib_d64 dy0, dy1, dy2;
	mlib_d64 ddy1, ddy2, ddy3, ddy4;
	mlib_d64 du0, du1;
	mlib_d64 dv1, dv2;
	mlib_d64 dr, dr1, dr2, dr3, dr4;
	mlib_d64 dg, dg1, dg2, dg3, dg4;
	mlib_d64 db, db1, db2, db3, db4;
	mlib_d64 dd, dd0, dd1, dtmp;
	/* used to load u, v into mlib_f32 */
	mlib_f32 ffu[1], ffv[1];
	/* used to load u, v into mlib_f32 */
	mlib_u8 *ufu, *vfu;
	/* 1.1644 * 4096 */
	mlib_f32 f0 = vis_to_float(0x12a1);
	/* 2.0184 * 8192 */
	mlib_f32 f1 = vis_to_float(0x4097);
	/* -0.3920 * 8192 */
	mlib_f32 f4 = vis_to_float(0xf375);
	/* -0.8132 * 8192 */
	mlib_f32 f5 = vis_to_float(0xe5fa);
	/* 1.5966 * 8192 */
	mlib_f32 f8 = vis_to_float(0x3317);
	/* -276.9856 * 32 */
	mlib_d64 doff0 = vis_to_double_dup(0xdd60dd60);
	/* 135.6352 * 32 */
	mlib_d64 doff1 = vis_to_double_dup(0x10f410f4);
	/* -222.9952 * 32 */
	mlib_d64 doff2 = vis_to_double_dup(0xe420e420);
	mlib_f32 fscale = vis_to_float(0x80808080);
	/* loop variables */
	mlib_s32 i, j;
	/* alpha_ch. is not written */
	mlib_s32 emask = 0x7777;
	mlib_s32 emask1;
	mlib_d64 *buf;
	mlib_s32 inc;

	/* byte views onto ffu/ffv: four u8 chroma samples per float */
	ufu = (mlib_u8 *)ffu;
	vfu = (mlib_u8 *)ffv;

/*
 * initialize GSR scale factor
 */
	vis_write_gsr(3 << 3);

	/* staging buffer for one aligned y row */
	buf = (mlib_d64 *)__mlib_malloc((width / 8 + 1) * sizeof (mlib_d64));

	if (buf == NULL)
		return (MLIB_FAILURE);

	sp1 = sl1 = (mlib_u8 *)y;
	sp2 = sl2 = (mlib_u8 *)u;
	sp3 = sl3 = (mlib_u8 *)v;
	dl = dp = (mlib_u8 *)abgr;

/*
 * row loop
 */
	for (j = 0; j < height; j++) {
		/* re-align the y row into buf */
		spy = (mlib_d64 *)vis_alignaddr(sp1, 0);
		dpp = buf;
		dy0 = vis_ld_d64_nf(spy);
		spy++;
#pragma pipeloop(0)
		for (i = 0; i < width; i += 8) {
			dy1 = vis_ld_d64_nf(spy);
			spy++;
			(*dpp++) = vis_faligndata(dy0, dy1);
			dy0 = dy1;
		}

		spy = buf;
		dend = dp + width * 4 - 1;
		/* first-store edge mask for the unaligned dst start */
		emask1 = vis_edge8(dp, dend);
		dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
		i = dp - (mlib_u8 *)dpp;
		/* rotate the no-alpha mask by the dst misalignment */
		emask >>= i;
		/* set faligndata shift to re-phase output double-words */
		vis_alignaddr((void *)(8 - i), 0);
		/* skip the first partial store slot unless dst was aligned */
		inc = (emask1 != 0xff);
		emask1 &= emask;

		/* prefetch the first 4 u and 4 v samples of the row */
		ufu[0] = vis_ld_u8_nf(sp2);
		ufu[1] = vis_ld_u8_nf(sp2 + 1);
		ufu[2] = vis_ld_u8_nf(sp2 + 2);
		ufu[3] = vis_ld_u8_nf(sp2 + 3);
		vfu[0] = vis_ld_u8_nf(sp3);
		vfu[1] = vis_ld_u8_nf(sp3 + 1);
		vfu[2] = vis_ld_u8_nf(sp3 + 2);
		vfu[3] = vis_ld_u8_nf(sp3 + 3);
		sp2 += 4;
		sp3 += 4;

		fu = ffu[0];
		fv = ffv[0];

/*
 * 16-pixel column loop
 */
#pragma pipeloop(0)
		for (i = 0; i <= width - 16; i += 16) {
			dy1 = (*spy++);
			dy2 = (*spy++);

			/* chroma contributions: B from U, G from U+V, R from V */
			du0 = vis_fmul8x16al(fu, f1);
			db = vis_fpadd16(du0, doff0);

			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);

			dv2 = vis_fmul8x16al(fv, f8);
			dr = vis_fpadd16(dv2, doff2);

			/* luma terms; chroma loads for the NEXT iteration are
			 * interleaved here to fill pipeline slots */
			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
			ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);
			ufu[0] = vis_ld_u8_nf(sp2);
			ddy3 = vis_fmul8x16al(vis_read_hi(dy2), f0);
			ddy4 = vis_fmul8x16al(vis_read_lo(dy2), f0);
			ufu[1] = vis_ld_u8_nf(sp2 + 1);

			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);
			ufu[2] = vis_ld_u8_nf(sp2 + 2);
			db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
			db2 = vis_fpadd16(ddy2, db2);
			ufu[3] = vis_ld_u8_nf(sp2 + 3);
			db3 = vis_fmul8x16au(fscale, vis_read_lo(db));
			db3 = vis_fpadd16(ddy3, db3);
			vfu[0] = vis_ld_u8_nf(sp3);
			db4 = vis_fmul8x16al(fscale, vis_read_lo(db));
			db4 = vis_fpadd16(ddy4, db4);
			vfu[1] = vis_ld_u8_nf(sp3 + 1);

			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);
			vfu[2] = vis_ld_u8_nf(sp3 + 2);
			dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
			dg2 = vis_fpadd16(ddy2, dg2);
			vfu[3] = vis_ld_u8_nf(sp3 + 3);
			dg3 = vis_fmul8x16au(fscale, vis_read_lo(dg));
			dg3 = vis_fpadd16(ddy3, dg3);
			dg4 = vis_fmul8x16al(fscale, vis_read_lo(dg));
			dg4 = vis_fpadd16(ddy4, dg4);

			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);
			dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
			dr2 = vis_fpadd16(ddy2, dr2);
			dr3 = vis_fmul8x16au(fscale, vis_read_lo(dr));
			dr3 = vis_fpadd16(ddy3, dr3);
			dr4 = vis_fmul8x16al(fscale, vis_read_lo(dr));
			dr4 = vis_fpadd16(ddy4, dr4);

			/* clamp to u8 */
			dr = vis_fpack16_pair(dr1, dr2);
			dr1 = vis_fpack16_pair(dr3, dr4);
			dg = vis_fpack16_pair(dg1, dg2);
			dg1 = vis_fpack16_pair(dg3, dg4);
			db = vis_fpack16_pair(db1, db2);
			db1 = vis_fpack16_pair(db3, db4);

			/* interleave into xBGR pixels; each pair of stores is
			 * re-phased through dd0/dd1 with faligndata because
			 * dst is unaligned; alpha byte masked off by emask */
			dg2 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dg));
			dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));
			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dpp += inc;
			inc = 1;
			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			dg2 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dg));
			dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));
			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);
			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			dg2 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dg1));
			dg3 = vis_fpmerge(vis_read_hi(db1), vis_read_hi(dr1));
			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);
			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			dg2 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dg1));
			dg3 = vis_fpmerge(vis_read_lo(db1), vis_read_lo(dr1));
			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);
			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			fu = ffu[0];
			fv = ffv[0];
			sp2 += 4;
			sp3 += 4;
			emask1 = emask;
		}

		/* tail: one group of 8 pixels (2 chroma samples) */
		if (i <= width - 8) {
			dy1 = (*spy++);

			du0 = vis_fmul8x16al(fu, f1);
			db = vis_fpadd16(du0, doff0);

			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);

			dv2 = vis_fmul8x16al(fv, f8);
			dr = vis_fpadd16(dv2, doff2);

			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
			ddy2 = vis_fmul8x16al(vis_read_lo(dy1), f0);
			/* shift the second chroma pair into position */
			ufu[0] = ufu[2];
			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);
			vfu[0] = vfu[2];
			db2 = vis_fmul8x16al(fscale, vis_read_hi(db));
			db2 = vis_fpadd16(ddy2, db2);
			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);
			dg2 = vis_fmul8x16al(fscale, vis_read_hi(dg));
			dg2 = vis_fpadd16(ddy2, dg2);
			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);
			dr2 = vis_fmul8x16al(fscale, vis_read_hi(dr));
			dr2 = vis_fpadd16(ddy2, dr2);

			dr = vis_fpack16_pair(dr1, dr2);
			dg = vis_fpack16_pair(dg1, dg2);
			db = vis_fpack16_pair(db1, db2);

			dg2 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dg));
			dg3 = vis_fpmerge(vis_read_hi(db), vis_read_hi(dr));
			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dpp += inc;
			inc = 1;
			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			dg2 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dg));
			dg3 = vis_fpmerge(vis_read_lo(db), vis_read_lo(dr));
			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);
			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			fu = ffu[0];
			fv = ffv[0];
			i += 8;
			emask1 = emask;
		}

		/* tail: last 1..7 pixels (single chroma sample) */
		if (i < width) {
			dy1 = vis_ld_d64_nf(spy);

			du0 = vis_fmul8x16al(fu, f1);
			db = vis_fpadd16(du0, doff0);

			du1 = vis_fmul8x16al(fu, f4);
			dv1 = vis_fmul8x16al(fv, f5);
			dtmp = vis_fpadd16(du1, dv1);
			dg = vis_fpadd16(dtmp, doff1);

			dv2 = vis_fmul8x16al(fv, f8);
			dr = vis_fpadd16(dv2, doff2);

			ddy1 = vis_fmul8x16al(vis_read_hi(dy1), f0);
			db1 = vis_fmul8x16au(fscale, vis_read_hi(db));
			db1 = vis_fpadd16(ddy1, db1);
			dg1 = vis_fmul8x16au(fscale, vis_read_hi(dg));
			dg1 = vis_fpadd16(ddy1, dg1);
			dr1 = vis_fmul8x16au(fscale, vis_read_hi(dr));
			dr1 = vis_fpadd16(ddy1, dr1);

			/* fu is dead here; reuse it for the packed blue */
			fu = vis_fpack16(db1);
			dg2 = vis_fpmerge(fu, vis_fpack16(dg1));
			dg3 = vis_fpmerge(fu, vis_fpack16(dr1));
			dd1 = vis_fpmerge(vis_read_hi(dg2), vis_read_hi(dg3));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dpp += inc;
			dd0 = vis_fpmerge(vis_read_lo(dg2), vis_read_lo(dg3));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);
		}

		/* flush the residual bytes still carried in dd0/dd1 */
		emask1 = vis_edge8(dpp, dend);
		emask1 &= emask;
		dd = vis_faligndata(dd0, dd1);
		vis_pst_8(dd, dpp, emask1);

		sp1 = sl1 = sl1 + y_stride;
		sp2 = sl2 = sl2 + uv_stride;
		sp3 = sl3 = sl3 + uv_stride;
		dl = dp = dl + abgr_stride;
		emask = 0x7777;
	}

	__mlib_free(buf);
	return (MLIB_SUCCESS);
}
/*
 * Un-premultiply: divide the color channels of a U8 image by its alpha
 * channel, via the reciprocal tables mlib_DivAlpha_tbl / mlib_DivAlpha_tbl4.
 * Signature (dst/src pointers, width, height, channel, alpha, strides sl/dl)
 * comes from the DEF_FUNC macro defined elsewhere in this file.
 *
 * NOTE(review): this definition is truncated at the end of this chunk — the
 * 2-channel loop body and the function epilogue continue past what is
 * visible here.
 */
DEF_FUNC(mlib_ImageDivAlpha_U8, mlib_u8)
{
	mlib_d64 mask7FFF = vis_to_double_dup(0x7FFF7FFF);
	mlib_d64 *p_tbl;
	mlib_d64 *buffs, *buffd;
	mlib_d64 *sp, *dp;
	mlib_d64 ss, d0, d1, dd, a0, a1;
	/* bit of the alpha channel inside one pixel group */
	mlib_s32 cmask = (1 << (channel - alpha - 1));
	mlib_s32 ww, dflag, i, j;

	vis_write_gsr(7 << 3);

	/* replicate the per-pixel mask across an 8-byte word */
	cmask |= (cmask << channel);
	cmask |= (cmask << 2 * channel);

	if (channel == 3) {
		p_tbl = (mlib_d64 *)mlib_DivAlpha_tbl;
	} else {
		/* 4-channel table is indexed per alpha position */
		p_tbl = (mlib_d64 *)mlib_DivAlpha_tbl4 + alpha * 256;
	}

	width *= channel;
	/* row width in 8-byte words */
	ww = (width + 7) / 8;

	if (channel == 3) {
		/* 3-channel loops consume 3 words (24 bytes) per step */
		ww = 3 * ((ww + 2) / 3);
	}

	buffs = __mlib_malloc(2 * sizeof (mlib_d64) * ww);

	if (buffs == NULL) {
		return (MLIB_FAILURE);
	}

	buffd = buffs + ww;

	for (j = 0; j < height; j++) {
		mlib_u8 *ap = sl + alpha;

		/* bounce unaligned source rows through buffs */
		if (((int)sl & 7)) {
			MEM_COPY(sl, buffs, width * sizeof (mlib_u8));
			sp = buffs;
		} else {
			sp = (mlib_d64 *)sl;
		}

		dflag = 0;

		/* unaligned or ragged dst rows go through buffd first */
		if (((int)dl | width) & 7) {
			dp = buffd;
			dflag = 1;
		} else {
			dp = (mlib_d64 *)dl;
		}

		if (channel == 4) {
#pragma pipeloop(0)
			for (i = 0; i < ww; i++) {
				ss = *sp;
				GET_ALPHA(a0, sp, alpha);
				GET_ALPHA(a1, sp, alpha + 4);
				DIV_ALPHA(d0, vis_read_hi(ss), a0);
				DIV_ALPHA(d1, vis_read_lo(ss), a1);
				*dp = vis_fpack16_pair(d0, d1);
				sp++;
				dp++;
			}
		} else if (channel == 3) {
			mlib_d64 a0, a1, a2, aa;
			mlib_d64 b0, b1, b2, bb;
			mlib_d64 s0, s1, s2;
			mlib_d64 d0, d1;
			mlib_s32 cmask0, cmask1, cmask2;

			/* 0x492 = 0b10010010010: every-3rd-byte pattern */
			cmask0 = 0x492 >> alpha;
			cmask1 = 0x492 >> (alpha + 1);
			cmask2 = 0x492 >> (alpha + 2);

			vis_alignaddr((void *)0, 4);

			/* one specialized loop per alpha position */
			if (alpha == 0) {
#pragma pipeloop(0)
				for (i = 0; i < ww - 3; i += 3) {
					GET_ALPHA_3CH_0();
					DIV_ALPHA_3CH();
				}

				if (i < ww) {
					GET_ALPHA_3CH_0_NF();
					DIV_ALPHA_3CH_NF();
				}
			} else if (alpha == 1) {
#pragma pipeloop(0)
				for (i = 0; i < ww - 3; i += 3) {
					GET_ALPHA_3CH_1();
					DIV_ALPHA_3CH();
				}

				if (i < ww) {
					GET_ALPHA_3CH_1_NF();
					DIV_ALPHA_3CH_NF();
				}
			} else {
/* if (alpha == 2) */
#pragma pipeloop(0)
				for (i = 0; i < ww - 3; i += 3) {
					GET_ALPHA_3CH_2();
					DIV_ALPHA_3CH();
				}

				if (i < ww) {
					GET_ALPHA_3CH_2_NF();
					DIV_ALPHA_3CH_NF();
				}
			}
		} else {
/* if (channel == 2) */
#pragma pipeloop(0)
			for (i = 0; i < ww; i++) {
/*
 * YUV 4:4:4 -> packed 24-bit RGB for arbitrarily aligned inputs ("nonalign").
 *
 * Each row is converted into an aligned scratch buffer (`BUFF` on the stack,
 * or a heap buffer when width*3 exceeds it), then copied to `rgb` with
 * __mlib_VectorCopy_U8.  The main loop is software-pipelined: the R/G/B
 * planes for iteration i are computed during iteration i-1, and the
 * bmask/bshuffle sequence interleaves them into the 3-byte-per-pixel layout
 * (three output double-words per 8 pixels).  The y/u/v streams are realigned
 * on the fly with vis_alignaddr + vis_faligndata, which is why alignaddr is
 * re-issued before every faligndata group — GSR alignment is global state.
 *
 * Fixed-point coefficients (BT.601-style, scale in each comment) are applied
 * with fmul8x16 variants and the constant offsets k_*.  The 1..7-pixel tail
 * is stored back-to-front via STORE_PIXEL after rotating red/green/blue into
 * place.  Returns MLIB_FAILURE only on allocation failure.
 */
static mlib_status
mlib_v_VideoColorYUV2RGB444_nonalign(
    mlib_u8 *rgb,
    const mlib_u8 *y,
    const mlib_u8 *u,
    const mlib_u8 *v,
    mlib_s32 width,
    mlib_s32 height,
    mlib_s32 rgb_stride,
    mlib_s32 yuv_stride)
{
	/* all. pointer to y, u, v */
	mlib_d64 *spy, *dfu, *dfv;
	/* y data */
	mlib_d64 dy0, dy1, dy3;
	mlib_d64 du, dv, du0, du1, dv0, dv1;
	/* (1.1644, 1.5966)*8192 */
	mlib_f32 k12 = vis_to_float(0x25433317);
	/* (-.3920, -.8132)*8192 */
	mlib_f32 k34 = vis_to_float(0xf375e5fa);
	/* 2.0184*8192 */
	mlib_f32 k5 = vis_to_float(0x1004097);
	mlib_d64 k_222_9952 = vis_to_double_dup(0x1be01be0);
	mlib_d64 k_135_6352 = vis_to_double_dup(0x10f410f4);
	mlib_d64 k_276_9856 = vis_to_double_dup(0x22a022a0);
	mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
	mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
	mlib_d64 y_11644_hi, y_11644_lo;
	mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
	mlib_d64 red, green, blue, *ddp, dd0, dd1, dd2;
	/* loop variable */
	mlib_s32 i, j;
	mlib_d64 *buf, BUFF[16 * 1024];
	mlib_u8 *tmp, *dp;

	/* pick stack scratch or heap scratch for one RGB row */
	if (width * 3 > 16 * 1024) {
		tmp = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7);

		if (tmp == NULL)
			return (MLIB_FAILURE);
		/* round up to an 8-byte boundary */
		buf = (mlib_d64 *)((mlib_addr)(tmp + 7) & ~7);
	} else {
		buf = (mlib_d64 *)BUFF;
	}

	dp = (mlib_u8 *)buf;
	ddp = (mlib_d64 *)dp;

	for (j = 0; j < height; j++) {
		/* prime the aligned u and v streams */
		dfu = (mlib_d64 *)vis_alignaddr((void *)u, 0);
		du0 = (*dfu++);
		du1 = vis_ld_d64_nf(dfu);
		dfu++;
		du = vis_faligndata(du0, du1);
		du0 = du1;
		dfv = (mlib_d64 *)vis_alignaddr((void *)v, 0);
		dv0 = (*dfv++);
		dv1 = vis_ld_d64_nf(dfv);
		dfv++;
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

		/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
		/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
		/* U*(-0.3920); */
		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
		/* V*(-0.8132); */
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

		/* prime the aligned y stream */
		spy = (mlib_d64 *)vis_alignaddr((void *)y, 0);
		dy0 = (*spy++);
		dy3 = vis_ld_d64_nf(spy);
		spy++;
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

		/* prologue: compute red/green/blue for the first 8 pixels
		 * so the pipelined loop below can store then compute-ahead */
		/* U*2.0184 */
		u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
		g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);
		u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
		g_hi = vis_fpadd16(g_hi, k_135_6352);
		/* V*1.5966 */
		v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
		g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);
		v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
		g_lo = vis_fpadd16(g_lo, k_135_6352);
		vis_alignaddr((void *)u, 0);
		du1 = vis_ld_d64_nf(dfu);
		dfu++;
		du = vis_faligndata(du0, du1);
		du0 = du1;
		/* Y*1.1644 */
		y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
		b_hi = vis_fpsub16(u_20184_hi, k_276_9856);
		vis_alignaddr((void *)v, 0);
		dv1 = vis_ld_d64_nf(dfv);
		dfv++;
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;
		/* Y*1.1644 */
		y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
		b_lo = vis_fpsub16(u_20184_lo, k_276_9856);
		/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
		r_hi = vis_fpsub16(v_15966_hi, k_222_9952);
		/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
		r_lo = vis_fpsub16(v_15966_lo, k_222_9952);
		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
		g_hi = vis_fpadd16(g_hi, y_11644_hi);
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
		g_lo = vis_fpadd16(g_lo, y_11644_lo);
		green = vis_fpack16_pair(g_hi, g_lo);
		b_hi = vis_fpadd16(b_hi, y_11644_hi);
		b_lo = vis_fpadd16(b_lo, y_11644_lo);
		blue = vis_fpack16_pair(b_hi, b_lo);
		r_hi = vis_fpadd16(r_hi, y_11644_hi);
		r_lo = vis_fpadd16(r_lo, y_11644_lo);
		red = vis_fpack16_pair(r_hi, r_lo);
		vis_alignaddr((void *)y, 0);
		dy3 = vis_ld_d64_nf(spy);
		spy++;
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

#pragma pipeloop(0)
		for (i = 0; i <= width - 8; i += 8) {
			/* interleave R/G/B into 24 packed bytes (3 words) */
			vis_write_bmask(0x0801902A, 0);
			dd0 = vis_bshuffle(red, green);
			vis_write_bmask(0x03B04C05, 0);
			dd1 = vis_bshuffle(red, green);
			vis_write_bmask(0xD06E07F0, 0);
			dd2 = vis_bshuffle(red, green);
			vis_write_bmask(0x01834967, 0);
			ddp[0] = vis_bshuffle(dd0, blue);
			vis_write_bmask(0xA12B45C7, 0);
			ddp[1] = vis_bshuffle(dd1, blue);
			vis_write_bmask(0x0D23E56F, 0);
			ddp[2] = vis_bshuffle(dd2, blue);

			/* compute the NEXT 8 pixels (same schedule as the
			 * prologue above) */
			/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);
			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);
			/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);
			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);
			vis_alignaddr((void *)u, 0);
			du1 = vis_ld_d64_nf(dfu);
			dfu++;
			du = vis_faligndata(du0, du1);
			du0 = du1;
			/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);
			vis_alignaddr((void *)v, 0);
			dv1 = vis_ld_d64_nf(dfv);
			dfv++;
			dv = vis_faligndata(dv0, dv1);
			dv0 = dv1;
			/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);
			/* U*(-0.3920); */
			u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);
			/* V*(-0.8132); */
			v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);
			u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
			g_hi = vis_fpadd16(g_hi, y_11644_hi);
			v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
			g_lo = vis_fpadd16(g_lo, y_11644_lo);
			green = vis_fpack16_pair(g_hi, g_lo);
			b_hi = vis_fpadd16(b_hi, y_11644_hi);
			b_lo = vis_fpadd16(b_lo, y_11644_lo);
			blue = vis_fpack16_pair(b_hi, b_lo);
			r_hi = vis_fpadd16(r_hi, y_11644_hi);
			r_lo = vis_fpadd16(r_lo, y_11644_lo);
			red = vis_fpack16_pair(r_hi, r_lo);
			vis_alignaddr((void *)y, 0);
			dy3 = vis_ld_d64_nf(spy);
			spy++;
			dy1 = vis_faligndata(dy0, dy3);
			dy0 = dy3;
			ddp += 3;
		}

		/* tail: rotate the precomputed planes so the last valid
		 * pixel sits in the byte STORE_PIXEL reads, then store the
		 * remaining pixels back-to-front */
		dp = (mlib_u8 *)ddp;
		vis_alignaddr((void *)(width - i), 0);
		blue = vis_faligndata(blue, blue);
		green = vis_faligndata(green, green);
		red = vis_faligndata(red, red);
		dp += ((width - i - 1) * 3);
		vis_alignaddr((void *)spy, 7);

		for (; i < width; i++) {
			STORE_PIXEL(0, 1, 2);
			dp -= 3;
		}

		/* flush the staged row to the real destination */
		__mlib_VectorCopy_U8(rgb, (mlib_u8 *)buf, width * 3);
		rgb += rgb_stride;
		dp = (mlib_u8 *)buf;
		ddp = (mlib_d64 *)dp;
		y += yuv_stride;
		u += yuv_stride;
		v += yuv_stride;
	}

	if (width * 3 > 16 * 1024)
		__mlib_free(tmp);
	return (MLIB_SUCCESS);
}
/*
 * Saturating conversion S8 -> U8: z[i] = (x[i] < 0) ? 0 : x[i], i = 0..n-1.
 *
 * Short vectors (n < 16) are handled scalar-wise by the PACK_S_U macro
 * (defined elsewhere in this file; presumably it also returns from the
 * function — confirm against the macro definition).  Longer vectors:
 * dst is first aligned to 8 bytes with a scalar loop, the aligned middle
 * is processed 8 bytes per store with VIS (fpmerge widens to 16-bit,
 * fmul8sux16 by 0x0100 keeps the sign, fpack16_pair clamps negatives to 0),
 * and the remaining 0..7 bytes are finished scalar-wise.  When src is
 * misaligned relative to dst, bytes are re-phased with vis_bshuffle using a
 * bmask derived from the misalignment `off`.
 */
mlib_status
__mlib_VectorConvert_U8_S8_Sat(
    mlib_u8 *z,
    const mlib_s8 *x,
    mlib_s32 n)
{
	mlib_s8 *src = (void *)x;
	mlib_u8 *dst = z;
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, d5, d6;
	mlib_s32 len_64, even_length, rest_64, length = n, i, off;
	mlib_s8 c;
	mlib_d64 four_16_ones = vis_to_double_dup(0x01000100);
	mlib_f32 zero = vis_fzeros();

	if (length < 16) {
		PACK_S_U(mlib_s8, mlib_u8);
	}

/*
 * First, try to align destination address for 8 bytes .
 */
	while ((mlib_addr)dst & 7) {
		(*dst++) = (c = (*src++)) < 0 ? 0 : c;
		length--;
	}

	rest_64 = length & 7;
	len_64 = length >> 3;
	even_length = len_64 << 3;
	ddst = (mlib_d64 *)dst;
	/* GSR scale 7: fpack16 clamps our 16-bit values to [0,255] */
	vis_write_gsr(7 << 3);

/*
 * Now analyze source address alignment.
 */
	if (((mlib_addr)src & 7) == 0) {

/*
 * Source address is also 8-byte aligned.
 */
		dsrc = (mlib_d64 *)src;

/*
 * Peeling the 1st iteration.
 */
		if (i = (len_64 & 1)) {
			d1 = (*dsrc++);
			d2 = vis_fmul8sux16(vis_fpmerge(vis_read_hi(d1),
			    zero), four_16_ones);
			d3 = vis_fmul8sux16(vis_fpmerge(vis_read_lo(d1),
			    zero), four_16_ones);
			(*ddst++) = vis_fpack16_pair(d2, d3);
		}

/*
 * Then loop with step==2. Unroll for 2 iterations.
 */
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d2 = vis_fmul8sux16(vis_fpmerge(vis_read_hi(d1),
			    zero), four_16_ones);
			d3 = vis_fmul8sux16(vis_fpmerge(vis_read_lo(d1),
			    zero), four_16_ones);
			(*ddst++) = vis_fpack16_pair(d2, d3);
			d1 = (*dsrc++);
			d2 = vis_fmul8sux16(vis_fpmerge(vis_read_hi(d1),
			    zero), four_16_ones);
			d3 = vis_fmul8sux16(vis_fpmerge(vis_read_lo(d1),
			    zero), four_16_ones);
			(*ddst++) = vis_fpack16_pair(d2, d3);
		}
	} else {

/*
 * Source address has arbitrary alignment. Use vis_alignaddr() and
 * vis_faligndata() functions.
 */
		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		off = (mlib_addr)src & 7;
		/* faligndata shift of 1 byte; bshuffle re-phases src bytes
		 * and interleaves them with zero bytes in one step */
		vis_alignaddr((void *)0, 1);
		vis_write_bmask(0x11111111 * off, 0x04152637);
		d2 = (*dsrc++);

/*
 * Peeling of 1 iteration.
 */
		if (i = (len_64 & 1)) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d3 = vis_bshuffle(d1, d2);
			d4 = vis_fmul8sux16(d3, four_16_ones);
			d3 = vis_faligndata(d3, d3);
			d5 = vis_fmul8sux16(d3, four_16_ones);
			(*ddst++) = vis_fpack16_pair(d4, d5);
		}

/*
 * Then loop with step==2.
 */
#pragma pipeloop(0)
#pragma unroll(4)
		for (i; i < len_64; i += 2) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d3 = vis_bshuffle(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d6 = vis_bshuffle(d1, d2);
			d4 = vis_fmul8sux16(d3, four_16_ones);
			d3 = vis_faligndata(d3, d3);
			d5 = vis_fmul8sux16(d3, four_16_ones);
			(*ddst++) = vis_fpack16_pair(d4, d5);
			d4 = vis_fmul8sux16(d6, four_16_ones);
			d6 = vis_faligndata(d6, d6);
			d5 = vis_fmul8sux16(d6, four_16_ones);
			(*ddst++) = vis_fpack16_pair(d4, d5);
		}
	}

	/* scalar tail: last 0..7 elements */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] = (c = src[even_length + i]) < 0 ? 0 : c;

	return (MLIB_SUCCESS);
}
/*
 * MxN ordered dithering of a NCHAN-channel U8 image down to a 1-bit indexed
 * image (NCHAN and the actual function name come from the FUNC macro defined
 * elsewhere in this file).
 *
 * dmask      : m*n dither matrix per channel (only dmask[0] is consulted
 *              here; dmask1/dmask2 are read but unused in this variant).
 * scale      : fixed-point scale of the dither matrix entries.
 * colormap   : mediaLib colormap; its inverse LUT supplies the quantization
 *              step (step0 = |lut[1] - lut[0]|).
 *
 * The matrix is pre-expanded into `kern`: n rows of line_size s16 thresholds,
 * the m-wide pattern replicated across num_blk blocks so the inner loop can
 * add thresholds 8 pixels at a time without wraparound logic.  Per source
 * row, pixels are widened to 16-bit (fpmerge with zeros), the row's
 * threshold line is added, the sum clamped back to u8 (fpack16_pair, GSR
 * scale 7), and the result quantized to bits by
 * mlib_ImageColorTrue2IndexLine_U8_BIT_1.
 * Returns MLIB_FAILURE only on allocation failure.
 */
mlib_status FUNC(
    MxN) (
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 **dmask,
    mlib_s32 m,
    mlib_s32 n,
    mlib_s32 scale,
    const void *colormap)
{
	mlib_type stype, dtype;
	const mlib_s32 *dmask0 = dmask[0], *dmask1 = dmask[1], *dmask2 =
	    dmask[2];
	mlib_s32 method = mlib_ImageGetMethod(colormap);
	mlib_u8 *sl, *dl;
	mlib_s32 schan, dchan, sll, dll, sw, sh, dw, dh, num_blk;
	mlib_s32 off, off1, kw, mstep, line_size, kern_size, xsize8, i, j, k;
	mlib_d64 *pbuff;
	mlib_u8 *p_dim;
	mlib_s16 *kern, *pkern;
	mlib_d64 *dkern;
	mlib_d64 dscale, dscale0, dscale1, dscale2;
	mlib_d64 ss, d0, d1;
	mlib_f32 fzeros = vis_fzeros();
	mlib_s32 step0, half_step0, v0;
	mlib_s32 bit_offset = mlib_ImageGetBitOffset(dst);
	mlib_u8 *p_lut;

	MLIB_IMAGE_GET_ALL_PARAMS(dst, dtype, dchan, dw, dh, dll, dl);
	MLIB_IMAGE_GET_ALL_PARAMS(src, stype, schan, sw, sh, sll, sl);
	p_lut = (mlib_u8 *)mlib_ImageGetLutInversTable(colormap);
	/* quantization step between the two LUT levels */
	step0 = abs(p_lut[1] - p_lut[0]);

	num_blk = (sw + (m - 1)) / m;
	mstep = m * NCHAN;
	/* one expanded threshold row, rounded up to 8 bytes */
	line_size = (mstep * num_blk + 7) & ~7;
	xsize8 = (NCHAN * sw + 7) / 8;

	/* 2^-scale as a double, computed in overflow-safe 30-bit chunks */
	dscale = 1.0;
	while (scale > 30) {
		dscale *= 1.0 / (1 << 30);
		scale -= 30;
	}

	dscale /= (1 << scale);

	dscale0 = dscale * step0;
	half_step0 = (step0 - 1) >> 1;

	kern_size = n * line_size;
	kern = __mlib_malloc(kern_size * sizeof (mlib_s16));

	if (kern == NULL)
		return (MLIB_FAILURE);

	/* expand the m x n matrix into per-row threshold lines,
	 * replicated across all num_blk horizontal blocks */
	for (j = 0; j < n; j++) {
		for (i = 0; i < m; i++) {
			pkern = kern + j * line_size + i;
			v0 = half_step0 - (mlib_s32)(dmask0[j * m +
			    i] * dscale0);
			for (k = 0; k < num_blk; k++) {
				pkern[k * mstep] = v0;
			}
		}
	}

	pbuff = __mlib_malloc(xsize8 * sizeof (mlib_d64) + 16);

	if (pbuff == NULL) {
		__mlib_free(kern);
		return (MLIB_FAILURE);
	}

	pkern = kern;

	/* GSR scale 7 so fpack16_pair clamps 16-bit sums to [0,255] */
	vis_write_gsr(7 << 3);

	for (j = 0; j < sh; j++) {
		dkern = (mlib_d64 *)pkern;

		if ((mlib_s32)sl & 7) {
			/* unaligned source row: gather 8 bytes at a time */
			mlib_u8 *sp = sl;

#pragma pipeloop(0)
			for (i = 0; i < xsize8; i++) {
				LOAD_NA_NF(ss, sp);
				/* widen to u16, add dither thresholds */
				d0 = vis_fpadd16(vis_fpmerge(vis_fzeros(),
				    vis_read_hi(ss)), dkern[2 * i]);
				d1 = vis_fpadd16(vis_fpmerge(vis_fzeros(),
				    vis_read_lo(ss)), dkern[2 * i + 1]);
				pbuff[i] = vis_fpack16_pair(d0, d1);
				sp += 8;
			}
		} else {
			mlib_d64 *sp = (mlib_d64 *)sl;

#pragma pipeloop(0)
			for (i = 0; i < xsize8; i++) {
				ss = sp[i];
				d0 = vis_fpadd16(vis_fpmerge(vis_fzeros(),
				    vis_read_hi(ss)), dkern[2 * i]);
				d1 = vis_fpadd16(vis_fpmerge(vis_fzeros(),
				    vis_read_lo(ss)), dkern[2 * i + 1]);
				pbuff[i] = vis_fpack16_pair(d0, d1);
			}
		}

		/* advance to the next matrix row, wrapping every n rows */
		pkern += line_size;

		if (pkern >= kern + kern_size)
			pkern = kern;

		/* quantize the dithered row to 1-bit indexed output */
		mlib_ImageColorTrue2IndexLine_U8_BIT_1((mlib_u8 *)pbuff, dl,
		    bit_offset, sw, colormap);

		sl += sll;
		dl += dll;
	}

	__mlib_free(pbuff);
	__mlib_free(kern);

	return (MLIB_SUCCESS);
}