/*
 * __mlib_VideoUpSample420_Nearest_S16
 *
 * Nearest-neighbour 4:2:0 up-sampling of one row of signed 16-bit samples.
 * Every source sample is emitted twice in a row, and the identical
 * up-sampled row is written to both destination rows.
 *
 *   dst0, dst1  destination rows, 2 * n samples each
 *   src         source row, n samples
 *   n           number of source samples
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * All three buffers are accessed through mlib_d64 pointers without any
 * realignment, so they are presumably required to be 8-byte aligned
 * (TODO confirm against the public API contract).
 */
mlib_status
__mlib_VideoUpSample420_Nearest_S16(
    mlib_s16 *dst0,
    mlib_s16 *dst1,
    const mlib_s16 *src,
    mlib_s32 n)
{
    mlib_s16 *last_out = dst0 + 2 * n - 1;    /* last valid sample of dst0 */
    mlib_d64 *in_words = (mlib_d64 *)src;
    mlib_d64 *row0 = (mlib_d64 *)dst0;
    mlib_d64 *row1 = (mlib_d64 *)dst1;
    mlib_d64 quad, mixed, dup_hi, dup_lo, pixels;
    mlib_s32 mask, pos;

    if (n <= 0)
        return (MLIB_FAILURE);

    /* main loop: 4 source samples in, 8 samples out to each row */
#pragma pipeloop(0)
    for (pos = 0; pos <= (n - 4); pos += 4) {
        quad = in_words[0];
        in_words++;

        /* two byte-merges followed by self-merges duplicate each sample */
        mixed = vis_fpmerge(vis_read_hi(quad), vis_read_lo(quad));
        mixed = vis_fpmerge(vis_read_hi(mixed), vis_read_lo(mixed));
        dup_lo = vis_fpmerge(vis_read_lo(mixed), vis_read_lo(mixed));
        dup_hi = vis_fpmerge(vis_read_hi(mixed), vis_read_hi(mixed));

        pixels = vis_fpmerge(vis_read_hi(dup_hi), vis_read_hi(dup_lo));
        row1[0] = pixels;
        row0[0] = pixels;

        pixels = vis_fpmerge(vis_read_lo(dup_hi), vis_read_lo(dup_lo));
        row1[1] = pixels;
        row0[1] = pixels;

        row0 += 2;
        row1 += 2;
    }

    /* nothing left over: n was a multiple of 4 */
    if ((mlib_s16 *)row0 > last_out)
        return (MLIB_SUCCESS);

    /* tail: same shuffle, but stored through edge masks */
    quad = in_words[0];
    mixed = vis_fpmerge(vis_read_hi(quad), vis_read_lo(quad));
    mixed = vis_fpmerge(vis_read_hi(mixed), vis_read_lo(mixed));
    dup_lo = vis_fpmerge(vis_read_lo(mixed), vis_read_lo(mixed));
    dup_hi = vis_fpmerge(vis_read_hi(mixed), vis_read_hi(mixed));

    pixels = vis_fpmerge(vis_read_hi(dup_hi), vis_read_hi(dup_lo));
    mask = vis_edge16(row0, last_out);
    vis_pst_16(pixels, row0, mask);
    vis_pst_16(pixels, row1, mask);
    row0++;
    row1++;

    if ((mlib_s16 *)row0 <= last_out) {
        pixels = vis_fpmerge(vis_read_lo(dup_hi), vis_read_lo(dup_lo));
        mask = vis_edge16(row0, last_out);
        vis_pst_16(pixels, row0, mask);
        vis_pst_16(pixels, row1, mask);
    }

    return (MLIB_SUCCESS);
}
/*
 * __mlib_VectorNorm_S16_Sat
 *
 * Writes mlib_sqrt(ds + ds1) to z[0], where ds and ds1 are two partial
 * accumulators updated by the NORM16 macro for every 8-byte word that
 * overlaps the n-element vector x.
 *
 *   z   output, a single mlib_d64
 *   x   input vector of signed 16-bit values (any alignment)
 *   n   number of elements
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * NOTE(review): NORM16 is defined elsewhere in the file; it takes no
 * arguments and therefore works on the locals by name (presumably dx,
 * ds, ds1, dr, dr1, accumulating squares of the four lanes of dx).
 * Do not rename those locals without checking the macro.
 */
mlib_status
__mlib_VectorNorm_S16_Sat(
    mlib_d64 *z,
    const mlib_s16 *x,
    mlib_s32 n)
{
    mlib_s16 *px = (mlib_s16 *)x;
    mlib_s16 *pxend;
    mlib_d64 *dpx, *dpxend;
    mlib_d64 dx, ds, ds1;
    /* zeroed scratch words; partial stores into them blank unwanted lanes */
    mlib_d64 edge[2];
    type_union_mlib_d64 dr, dr1;
    mlib_s32 d_left;
    mlib_u8 emask;

    edge[0] = edge[1] = 0;

    if (n <= 0)
        return (MLIB_FAILURE);

    ds = ds1 = 0;

    /* round the start address down to an 8-byte boundary */
    dpx = (mlib_d64 *)((mlib_addr)px & (~7));
    pxend = px + n - 1;

    /*
     * First word: copy only the lanes selected by the edge mask into the
     * zeroed edge[0], so lanes before x[0] contribute zero.
     */
    emask = vis_edge16(px, pxend);
    vis_pst_16(dpx[0], edge, emask);
    dx = edge[0];

    /* number of whole words between the first and the last word */
    dpxend = (mlib_d64 *)((mlib_addr)pxend & (~7));
    d_left = dpxend - dpx;

    for (; d_left > 0; d_left--) {
        NORM16;
        dpx++;
        dx = dpx[0];
    }

    /* last word (or the only word): blank the lanes past x[n - 1] */
    if ((mlib_addr)dpx <= (mlib_addr)pxend) {
        emask = vis_edge16(dpx, pxend);
        vis_pst_16(dx, edge + 1, emask);
        dx = edge[1];
        NORM16;
    }

    z[0] = mlib_sqrt(ds + ds1);
    return (MLIB_SUCCESS);
}
/*
 * mlib_v_ImageLookUp_S32_S16_3_D1
 *
 * Table look-up for one row: each 32-bit source value indexes a 16-bit
 * table and the result is stored to dst.  Element k uses table (k mod 3):
 * the three table pointers are rotated after every group of four outputs
 * so that the cycle continues across groups.
 *
 *   src     source values (used as element indices into the tables)
 *   dst     destination; written with full 8-byte stores, so it is
 *           presumably 8-byte aligned (TODO confirm with callers)
 *   xsize   number of elements to produce
 *   table0, table1, table2
 *           look-up tables for elements 0, 1, 2 of each triple
 *
 * Each looked-up value is loaded with vis_ld_u16_i and pushed into the
 * front of the accumulator by vis_faligndata using the alignment offset
 * of 6 set via vis_alignaddr; four pushes build one output word, which is
 * why values are pushed in reverse order (last element first).
 */
void
mlib_v_ImageLookUp_S32_S16_3_D1(mlib_s32 *src, mlib_s16 *dst, mlib_s32 xsize,
    mlib_s16 *table0, mlib_s16 *table1, mlib_s16 *table2)
{
    mlib_s32 *sp;               /* pointer to source data */
    mlib_s32 s0, s1, s2, s3;    /* source data */
    mlib_s16 *dl;               /* pointer to start of destination */
    mlib_s16 *dend;             /* pointer to end of destination */
    mlib_d64 *dp;               /* 8-byte pointer to destination */
    mlib_d64 t0, t1, t2, t3;    /* looked-up values */
    /*
     * Accumulator.  It is shifted through vis_faligndata before it is
     * ever fully written, so give it a defined value instead of reading
     * an indeterminate one; the initial lanes are always shifted out or
     * masked off before a store.
     */
    mlib_d64 acc0 = 0;
    mlib_s32 emask;             /* edge mask */
    mlib_s32 i, num;            /* loop variable, tail length */
    mlib_s16 *table;            /* scratch for table rotation */

    dl = dst;
    sp = src;
    dp = (mlib_d64 *) dl;
    dend = dl + xsize - 1;

    vis_alignaddr((void *) 0, 6);

    if (xsize >= 4) {
        /* preload the first group; the loop is software pipelined */
        s0 = sp[0];
        s1 = sp[1];
        s2 = sp[2];
        s3 = sp[3];
        sp += 4;

#pragma pipeloop(0)
        for (i = 0; i <= xsize - 8; i += 4, sp += 4) {
            t3 = vis_ld_u16_i(table0, ((mlib_addr)2*s3));
            t2 = vis_ld_u16_i(table2, ((mlib_addr)2*s2));
            t1 = vis_ld_u16_i(table1, ((mlib_addr)2*s1));
            t0 = vis_ld_u16_i(table0, ((mlib_addr)2*s0));
            acc0 = vis_faligndata(t3, acc0);
            acc0 = vis_faligndata(t2, acc0);
            acc0 = vis_faligndata(t1, acc0);
            acc0 = vis_faligndata(t0, acc0);
            s0 = sp[0];
            s1 = sp[1];
            s2 = sp[2];
            s3 = sp[3];
            *dp++ = acc0;
            /* 4 mod 3 == 1: advance the table cycle by one */
            table = table0;
            table0 = table1;
            table1 = table2;
            table2 = table;
        }

        /* drain the group that was loaded but not yet looked up */
        t3 = vis_ld_u16_i(table0, ((mlib_addr)2*s3));
        t2 = vis_ld_u16_i(table2, ((mlib_addr)2*s2));
        t1 = vis_ld_u16_i(table1, ((mlib_addr)2*s1));
        t0 = vis_ld_u16_i(table0, ((mlib_addr)2*s0));
        acc0 = vis_faligndata(t3, acc0);
        acc0 = vis_faligndata(t2, acc0);
        acc0 = vis_faligndata(t1, acc0);
        acc0 = vis_faligndata(t0, acc0);
        *dp++ = acc0;
        table = table0;
        table0 = table1;
        table1 = table2;
        table2 = table;
    }

    /* tail: 1..3 remaining elements, processed from the last to the first */
    if ((mlib_addr) dp <= (mlib_addr) dend) {
        num = (mlib_s32)((mlib_s16*) dend - (mlib_s16*) dp);
        sp += num;
        num ++;

        switch (num) {
        case 3:
            s0 = *sp;
            sp --;
            t0 = vis_ld_u16_i(table2, ((mlib_addr)2*s0));
            acc0 = vis_faligndata(t0, acc0);
            /* fallthrough */
        case 2:
            s0 = *sp;
            sp --;
            t0 = vis_ld_u16_i(table1, ((mlib_addr)2*s0));
            acc0 = vis_faligndata(t0, acc0);
            /* fallthrough */
        case 1:
            s0 = *sp;
            t0 = vis_ld_u16_i(table0, ((mlib_addr)2*s0));
            acc0 = vis_faligndata(t0, acc0);
            break;
        default:
            break;
        }

        /* store only the lanes that lie inside the destination */
        emask = vis_edge16(dp, dend);
        vis_pst_16(acc0, dp, emask);
    }
}
/*
 * __mlib_VectorAdd_S16_S16_Mod
 *
 * Element-wise combination of two signed 16-bit vectors, x and y, into z,
 * four elements (one 8-byte word) at a time.
 *
 *   z   destination vector, n elements, any 2-byte alignment
 *   x   first source vector, n elements
 *   y   second source vector, n elements
 *   n   number of elements
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * Structure:
 *   1. an unaligned destination head is written with a partial store;
 *   2. whole destination words are processed by one of five loops chosen
 *      by the alignment of the (advanced) x and y pointers;
 *   3. up to three trailing elements are written with a partial store.
 *
 * NOTE(review): ADD16_MOD is an argument-less macro defined elsewhere; it
 * presumably adds dx and dy into dz and stores dz at dpz under emask.
 * The locals it references must keep their names.
 */
mlib_status
__mlib_VectorAdd_S16_S16_Mod(
    mlib_s16 *z,
    const mlib_s16 *x,
    const mlib_s16 *y,
    mlib_s32 n)
{
    mlib_d64 *dpz, *dpx, *dpy;
    mlib_d64 dx, dy, dz, dx0, dx1, dy0, dy1;
    mlib_s16 *pz, *px, *py, *pzend;

    /* offset of address alignment in destination (0 or negative bytes) */
    mlib_s32 off;

    /* edge mask for partial stores */
    mlib_s32 emask;
    mlib_s32 len = n, i;

    /* remainder and length of the body, in terms of 8-byte words */
    mlib_s32 rest_8, even_8;

    if (n <= 0)
        return (MLIB_FAILURE);

    px = (mlib_s16 *)x;
    py = (mlib_s16 *)y;
    pz = (mlib_s16 *)z;

    dpz = (mlib_d64 *)((mlib_addr)z & (~7));
    off = (mlib_addr)dpz - (mlib_addr)z;
    pzend = pz + n - 1;

    /*
     * generate edge mask for the start point
     */
    emask = vis_edge16(pz, pzend);

    /*
     * Unaligned destination head: fetch the source data shifted by the
     * same offset as the destination and do one masked store, then
     * advance the sources by the number of elements actually covered.
     */
    if (off) {
        dpy = (mlib_d64 *)vis_alignaddr(py, off);
        dy0 = vis_ld_d64_nf(dpy);
        dy1 = vis_ld_d64_nf(dpy + 1);
        dy = vis_faligndata(dy0, dy1);
        dpx = (mlib_d64 *)vis_alignaddr(px, off);
        dx0 = vis_ld_d64_nf(dpx);
        dx1 = vis_ld_d64_nf(dpx + 1);
        dx = vis_faligndata(dx0, dx1);
        ADD16_MOD;
        /* (8 + off) bytes were consumed, i.e. (8 + off) / 2 elements */
        px += (8 + off) >> 1;
        py += (8 + off) >> 1;
        len -= (8 + off) >> 1;
        dpz++;
    }

    if (len <= 0)
        return (MLIB_SUCCESS);

    even_8 = len >> 2;
    rest_8 = len & 0x3;
    /* body loops store whole words */
    emask = 0xf;

    /*
     * Now analyze the source "x" and "y" addresses.  Every branch leaves
     * dx1/dy1 and dpx/dpy set up so that the common tail code below can
     * fetch the final partial word.
     */
    if ((!((mlib_addr)px & 7)) && (!((mlib_addr)py & 7))) {

        /*
         * Both addresses are 8-byte aligned.  No vis_alignaddr
         * and vis_faligndata at all.
         */
        dpx = (mlib_d64 *)px;
        dpy = (mlib_d64 *)py;
#pragma pipeloop(0)
        for (i = 0; i < even_8; i++) {
            dx = (*dpx++);
            dy = (*dpy++);
            ADD16_MOD;
            dpz++;
        }

        dx1 = vis_ld_d64_nf(dpx);
        dpx++;
        dy1 = vis_ld_d64_nf(dpy);
        dpy++;
    } else if (!((mlib_addr)px & 7)) {

        /*
         * First ("x") address is 8-byte aligned.  vis_alignaddr
         * and vis_faligndata only for "y".
         */
        dpx = (mlib_d64 *)px;
        dpy = vis_alignaddr(py, 0);
        dy0 = vis_ld_d64_nf(dpy);
        dpy++;
#pragma pipeloop(0)
        for (i = 0; i < even_8; i++) {
            dx = (*dpx++);
            dy1 = vis_ld_d64_nf(dpy);
            dpy++;
            dy = vis_faligndata(dy0, dy1);
            ADD16_MOD;
            dy0 = dy1;
            dpz++;
        }

        dx1 = vis_ld_d64_nf(dpx);
        dpx++;
        dy1 = dy0;
    } else if (!((mlib_addr)py & 7)) {

        /*
         * Second ("y") address is 8-byte aligned.  vis_alignaddr
         * and vis_faligndata only for "x".
         */
        dpy = (mlib_d64 *)py;
        dpx = vis_alignaddr(px, 0);
        dx0 = vis_ld_d64_nf(dpx);
        dpx++;
#pragma pipeloop(0)
        for (i = 0; i < even_8; i++) {
            dx1 = vis_ld_d64_nf(dpx);
            dpx++;
            dx = vis_faligndata(dx0, dx1);
            dy = (*dpy++);
            ADD16_MOD;
            dx0 = dx1;
            dpz++;
        }

        dx1 = dx0;
        dy1 = vis_ld_d64_nf(dpy);
        dpy++;
    } else if (((mlib_addr)px & 7) == ((mlib_addr)py & 7)) {

        /*
         * Both ("x" and "y") addresses are identically misaligned, so
         * one alignment offset serves both: the loop needs no
         * vis_alignaddr and uses two vis_faligndata calls.
         */
        dpx = vis_alignaddr(px, 0);
        dx0 = vis_ld_d64_nf(dpx);
        dpx++;
        dpy = vis_alignaddr(py, 0);
        dy0 = vis_ld_d64_nf(dpy);
        dpy++;
#pragma pipeloop(0)
        for (i = 0; i < even_8; i++) {
            dx1 = vis_ld_d64_nf(dpx);
            dpx++;
            dx = vis_faligndata(dx0, dx1);
            dy1 = vis_ld_d64_nf(dpy);
            dpy++;
            dy = vis_faligndata(dy0, dy1);
            ADD16_MOD;
            dpz++;
            dx0 = dx1;
            dy0 = dy1;
        }

        dx1 = dx0;
        dy1 = dy0;
    } else {

        /*
         * Both ("x" and "y") addresses are arbitrarily aligned.
         * "x" is realigned with vis_bshuffle (byte mask built from its
         * offset) and "y" with vis_faligndata, so the two different
         * offsets can be live at the same time.
         */
        off = (mlib_addr)px & 7;
        dpx = (mlib_d64 *)((mlib_u8 *)px - off);
        vis_write_bmask(off * 0x11111111, 0x01234567);
        dx0 = vis_ld_d64_nf(dpx);
        dpx++;
        dpy = vis_alignaddr(py, 0);
        dy0 = vis_ld_d64_nf(dpy);
        dpy++;
#pragma pipeloop(0)
        for (i = 0; i < even_8; i++) {
            dx1 = vis_ld_d64_nf(dpx);
            dpx++;
            dx = vis_bshuffle(dx0, dx1);
            dy1 = vis_ld_d64_nf(dpy);
            dpy++;
            dy = vis_faligndata(dy0, dy1);
            ADD16_MOD;
            dx0 = dx1;
            dy0 = dy1;
            dpz++;
        }

        dx1 = dx0;
        dy1 = dy0;
    }

    if (!rest_8)
        return (MLIB_SUCCESS);

    /*
     * Prepare the edge mask for the last rest_8 elements: build the mask
     * for a fake address whose low bits equal the byte count and invert
     * it, which selects the first rest_8 lanes.
     */
    emask = ~(vis_edge16((void *)(rest_8 << 1), pzend));

    /* common tail: works for every alignment case selected above */
    off = (mlib_addr)px & 7;
    vis_write_bmask(off * 0x11111111, 0x01234567);
    dx0 = dx1;
    dx1 = vis_ld_d64_nf(dpx);
    dx = vis_bshuffle(dx0, dx1);

    vis_alignaddr(py, 0);
    dy0 = dy1;
    dy1 = vis_ld_d64_nf(dpy);
    dy = vis_faligndata(dy0, dy1);

    ADD16_MOD;

    return (MLIB_SUCCESS);
}
/*
 * mlib_v_ImageLookUp_U16_U16_124_D1
 *
 * Table look-up for one row of unsigned 16-bit data.  Within every group
 * of four consecutive elements, element 0 is looked up in table0,
 * element 1 in table1, element 2 in table2 and element 3 in table3.
 *
 *   src     source values (used as element indices into the tables)
 *   dst     destination; written with full 8-byte stores, so it is
 *           presumably 8-byte aligned (TODO confirm with callers)
 *   xsize   number of elements to produce
 *   table0 .. table3
 *           look-up tables for the four positions of a group
 *
 * Each looked-up value is loaded with VIS_LD_U16_I and pushed into the
 * front of the accumulator by vis_faligndata using the alignment offset
 * of 6 set via vis_alignaddr; values are therefore pushed last-first.
 */
void
mlib_v_ImageLookUp_U16_U16_124_D1(const mlib_u16 *src, mlib_u16 *dst,
    mlib_s32 xsize, const mlib_u16 *table0, const mlib_u16 *table1,
    const mlib_u16 *table2, const mlib_u16 *table3)
{
    const mlib_u16 *sp;         /* pointer to source data (read only) */
    mlib_s32 s0, s1, s2, s3;    /* source data */
    mlib_u16 *dl;               /* pointer to start of destination */
    mlib_u16 *dend;             /* pointer to end of destination */
    mlib_d64 *dp;               /* 8-byte pointer to destination */
    mlib_d64 t0, t1, t2, t3;    /* looked-up values */
    /*
     * Accumulator.  It is shifted through vis_faligndata before it is
     * ever fully written, so give it a defined value instead of reading
     * an indeterminate one; the initial lanes are always shifted out or
     * masked off before a store.
     */
    mlib_d64 acc0 = 0;
    mlib_s32 emask;             /* edge mask */
    mlib_s32 i, num;            /* loop variable, tail length */

    dl = dst;
    sp = src;
    dp = (mlib_d64 *) dl;
    dend = dl + xsize - 1;

    vis_alignaddr((void *) 0, 6);

    if (xsize >= 4) {
        /* preload the first group; the loop is software pipelined */
        s0 = sp[0];
        s1 = sp[1];
        s2 = sp[2];
        s3 = sp[3];
        sp += 4;

#pragma pipeloop(0)
        for (i = 0; i <= xsize - 8; i += 4, sp += 4) {
            t3 = VIS_LD_U16_I(table3, 2*s3);
            t2 = VIS_LD_U16_I(table2, 2*s2);
            t1 = VIS_LD_U16_I(table1, 2*s1);
            t0 = VIS_LD_U16_I(table0, 2*s0);
            acc0 = vis_faligndata(t3, acc0);
            acc0 = vis_faligndata(t2, acc0);
            acc0 = vis_faligndata(t1, acc0);
            acc0 = vis_faligndata(t0, acc0);
            s0 = sp[0];
            s1 = sp[1];
            s2 = sp[2];
            s3 = sp[3];
            *dp++ = acc0;
        }

        /* drain the group that was loaded but not yet looked up */
        t3 = VIS_LD_U16_I(table3, 2*s3);
        t2 = VIS_LD_U16_I(table2, 2*s2);
        t1 = VIS_LD_U16_I(table1, 2*s1);
        t0 = VIS_LD_U16_I(table0, 2*s0);
        acc0 = vis_faligndata(t3, acc0);
        acc0 = vis_faligndata(t2, acc0);
        acc0 = vis_faligndata(t1, acc0);
        acc0 = vis_faligndata(t0, acc0);
        *dp++ = acc0;
    }

    /* tail: 1..3 remaining elements, processed from the last to the first */
    if ((mlib_addr) dp <= (mlib_addr) dend) {
        num = (mlib_s32)((mlib_u16*) dend - (mlib_u16*) dp);
        sp += num;
        num ++;

        switch (num) {
        case 3:
            s0 = (mlib_s32) *sp;
            sp --;
            t0 = VIS_LD_U16_I(table2, 2*s0);
            acc0 = vis_faligndata(t0, acc0);
            /* fallthrough */
        case 2:
            s0 = (mlib_s32) *sp;
            sp --;
            t0 = VIS_LD_U16_I(table1, 2*s0);
            acc0 = vis_faligndata(t0, acc0);
            /* fallthrough */
        case 1:
            s0 = (mlib_s32) *sp;
            t0 = VIS_LD_U16_I(table0, 2*s0);
            acc0 = vis_faligndata(t0, acc0);
            break;
        default:
            break;
        }

        /* store only the lanes that lie inside the destination */
        emask = vis_edge16(dp, dend);
        vis_pst_16(acc0, dp, emask);
    }
}
/*
 * mlib_v_ImageSquare_S16
 *
 * Row-by-row driver that applies MLIB_V_IMAGESQUARE_S16 to every group of
 * four signed 16-bit pixels of a 2-D image.  Source and destination may
 * have any 2-byte alignment: the destination is written with aligned
 * 8-byte stores (masked at both row edges) and the source is realigned
 * to match with vis_alignaddr / vis_faligndata.
 *
 *   src, slb   first source row and source row stride in bytes
 *   dst, dlb   first destination row and destination row stride in bytes
 *   xsize      number of 16-bit elements per row
 *   ysize      number of rows
 *
 * NOTE(review): MLIB_V_IMAGESQUARE_S16(in, out) is defined elsewhere
 * (presumably a per-lane square); it uses the locals rdh and rdl as
 * scratch, so those two names must stay as they are.
 */
void
mlib_v_ImageSquare_S16(
    mlib_s16 *src,
    mlib_s32 slb,
    mlib_s16 *dst,
    mlib_s32 dlb,
    mlib_s32 xsize,
    mlib_s32 ysize)
{
    mlib_s16 *src_row = src;     /* start of the current source row */
    mlib_s16 *dst_row = dst;     /* start of the current destination row */
    mlib_s16 *row_last;          /* last destination pixel of the row */
    mlib_d64 *in_w;              /* aligned source word pointer */
    mlib_d64 *out_w;             /* aligned destination word pointer */
    mlib_d64 w_prev, w_next;     /* two consecutive raw source words */
    mlib_d64 in4, out4;          /* realigned input, computed output */
    mlib_d64 rdh, rdl;           /* temporaries used in macro */
    mlib_s32 shift_bytes;        /* dst misalignment, 0 or negative */
    mlib_s32 mask;               /* edge mask for partial stores */
    mlib_s32 row, blk, nblk;

    for (row = 0; row < ysize; row++) {
        /* align the destination down and shift the source identically */
        out_w = (mlib_d64 *)((mlib_addr)dst_row & (~7));
        shift_bytes = (mlib_addr)out_w - (mlib_addr)dst_row;
        row_last = dst_row + xsize - 1;
        in_w = (mlib_d64 *)vis_alignaddr(src_row, shift_bytes);

        /* leading word: masked store for the row start */
        mask = vis_edge16(dst_row, row_last);
        w_prev = vis_ld_d64_nf(in_w);
        w_next = vis_ld_d64_nf(in_w + 1);
        in_w += 2;
        in4 = vis_faligndata(w_prev, w_next);
        MLIB_V_IMAGESQUARE_S16(in4, out4);
        vis_pst_16(out4, out_w++, mask);

        /* whole words that fit completely inside the row */
        nblk = ((mlib_u8 *)(row_last + 1) - (mlib_u8 *)out_w) / 8;

#pragma pipeloop(0)
        for (blk = 0; blk < nblk; blk++) {
            w_prev = w_next;
            w_next = vis_ld_d64_nf(in_w);
            in_w++;
            in4 = vis_faligndata(w_prev, w_next);
            MLIB_V_IMAGESQUARE_S16(in4, out4);
            (*out_w++) = out4;
        }

        /* trailing word: masked store for the row end */
        if ((mlib_addr)out_w <= (mlib_addr)row_last) {
            mask = vis_edge16(out_w, row_last);
            w_prev = w_next;
            w_next = vis_ld_d64_nf(in_w);
            in_w++;
            in4 = vis_faligndata(w_prev, w_next);
            MLIB_V_IMAGESQUARE_S16(in4, out4);
            vis_pst_16(out4, out_w++, mask);
        }

        /* strides are in bytes */
        src_row = (mlib_s16 *)((mlib_u8 *)src_row + slb);
        dst_row = (mlib_s16 *)((mlib_u8 *)dst_row + dlb);
    }
}
/*
 * mlib_v_ImageSqrShift_S16_D1
 *
 * Applies MLIB_V_IMAGESQRSHIFT_S16 to a single run of dsize signed 16-bit
 * elements.  Source and destination may have any 2-byte alignment: the
 * destination is written with aligned 8-byte stores (masked at both
 * ends) and the source is realigned to match.
 *
 *   src     source elements
 *   dst     destination elements
 *   dsize   number of elements
 *   shift   not referenced directly in this body
 *
 * NOTE(review): MLIB_V_IMAGESQRSHIFT_S16(in, out) is defined elsewhere
 * (presumably square followed by a scaled pack); it uses the locals
 * rdhh, rdhl, rdlh, rdll, rdh and rdl as scratch and may also read
 * `shift` - confirm against the macro before renaming any of them.
 */
void
mlib_v_ImageSqrShift_S16_D1(
    mlib_s16 *src,
    mlib_s16 *dst,
    mlib_s32 dsize,
    mlib_s32 shift)
{
    mlib_s16 *last_px;           /* last destination element */
    mlib_d64 *in_w;              /* aligned source word pointer */
    mlib_d64 *out_w;             /* aligned destination word pointer */
    mlib_d64 w_prev, w_next;     /* two consecutive raw source words */
    mlib_d64 in4, out4;          /* realigned input, computed output */
    mlib_d64 rdhh, rdhl;         /* temporaries used in macro */
    mlib_d64 rdlh, rdll;         /* temporaries used in macro */
    mlib_d64 rdh, rdl;           /* temporaries used in macro */
    mlib_s32 shift_bytes;        /* dst misalignment, 0 or negative */
    mlib_s32 mask;               /* edge mask for partial stores */
    mlib_s32 blk, nblk;

    /* align the destination down and shift the source identically */
    out_w = (mlib_d64 *)((mlib_addr)dst & (~7));
    shift_bytes = (mlib_addr)out_w - (mlib_addr)dst;
    last_px = dst + dsize - 1;
    in_w = (mlib_d64 *)vis_alignaddr(src, shift_bytes);

    /* leading word: masked store for the start of the run */
    mask = vis_edge16(dst, last_px);
    w_prev = vis_ld_d64_nf(in_w);
    w_next = vis_ld_d64_nf(in_w + 1);
    in_w += 2;
    in4 = vis_faligndata(w_prev, w_next);
    MLIB_V_IMAGESQRSHIFT_S16(in4, out4);
    vis_pst_16(out4, out_w++, mask);

    /* whole words that fit completely inside the run */
    nblk = ((mlib_u8 *)(last_px + 1) - (mlib_u8 *)out_w) / 8;

#pragma pipeloop(0)
    for (blk = 0; blk < nblk; blk++) {
        w_prev = w_next;
        w_next = vis_ld_d64_nf(in_w);
        in_w++;
        in4 = vis_faligndata(w_prev, w_next);
        MLIB_V_IMAGESQRSHIFT_S16(in4, out4);
        (*out_w++) = out4;
    }

    /* trailing word: masked store for the end of the run */
    if ((mlib_addr)out_w <= (mlib_addr)last_px) {
        mask = vis_edge16(out_w, last_px);
        w_prev = w_next;
        w_next = vis_ld_d64_nf(in_w);
        in_w++;
        in4 = vis_faligndata(w_prev, w_next);
        MLIB_V_IMAGESQRSHIFT_S16(in4, out4);
        vis_pst_16(out4, out_w++, mask);
    }
}
/*
 * __mlib_VideoUpSample420_S16
 *
 * Filtered 4:2:0 up-sampling of signed 16-bit samples.  From three source
 * rows (src0 above, src1 middle, src2 below) it produces two destination
 * rows of 2 * n samples each: dst0 is built from src1 and src0, dst1
 * from src1 and src2.
 *
 * The multiplier constants combine a sample with its horizontal
 * neighbour using weights 9 and 3 on the middle row (f39 / f93) and 3
 * and 1 on the outer row (f13 / f31); d87 adds the rounding terms 8 and
 * 7 to alternating outputs, and the pack with GSR scale 12 performs the
 * final division by 16.  The scalar statements at the end show the same
 * formula with the missing neighbour replaced by the edge sample.
 *
 *   dst0, dst1         destination rows, 2 * n samples each
 *   src0, src1, src2   source rows, n samples each
 *   n                  number of source samples per row
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * All buffers are accessed through mlib_d64 pointers without realignment,
 * so they are presumably required to be 8-byte aligned (TODO confirm).
 *
 * NOTE(review): d00/d10/d20 (and, when n < 4, d03/d13/d23) are read
 * before they are assigned.  The lanes they influence appear to feed
 * only outputs that are either masked off by the edge store or
 * overwritten by the scalar edge fix-ups at the end - confirm before
 * touching the statement order.
 */
mlib_status
__mlib_VideoUpSample420_S16(
    mlib_s16 *dst0,
    mlib_s16 *dst1,
    const mlib_s16 *src0,
    const mlib_s16 *src1,
    const mlib_s16 *src2,
    mlib_s32 n)
{
    /* last valid sample of dst0; used for loop/tail limits and edge masks */
    mlib_s16 *dend = dst0 + 2 * n - 1;
    mlib_d64 *dp0 = (mlib_d64 *)dst0;
    mlib_d64 *dp1 = (mlib_d64 *)dst1;
    mlib_d64 *sp0 = (mlib_d64 *)src0;
    mlib_d64 *sp1 = (mlib_d64 *)src1;
    mlib_d64 *sp2 = (mlib_d64 *)src2;
    /*
     * Per source row r (0, 1, 2): dr1 = current word, dr3 = next word,
     * dr2 = current word shifted left by one sample, dr0 = previous dr2
     * (its low half holds {previous sample, first current sample}).
     */
    mlib_d64 d00, d01, d02, d03;
    mlib_d64 d10, d11, d12, d13;
    mlib_d64 d20, d21, d22, d23;
    /* 32-bit partial sums: ac0x from src0, ac1x from src1, ac2x from src2 */
    mlib_d64 ac00, ac01, ac02, ac03, ac04, ac05, ac06, ac07;
    mlib_d64 ac10, ac11, ac12, ac13, ac14, ac15, ac16, ac17;
    mlib_d64 ac20, ac21, ac22, ac23, ac24, ac25, ac26, ac27;
    /* pairs of 16-bit weights: (1,3), (3,1), (3,9), (9,3) */
    mlib_f32 f13 = vis_to_float(0x10003);
    mlib_f32 f31 = vis_to_float(0x30001);
    mlib_f32 f39 = vis_to_float(0x30009);
    mlib_f32 f93 = vis_to_float(0x90003);
    /* rounding terms for the even / odd output of each pair */
    mlib_d64 d87 = vis_to_double(8, 7);
    mlib_s32 i, emask;

    if (n <= 0)
        return (MLIB_FAILURE);

    /* GSR: scale factor 12 for the pack, alignment offset 2 (one sample) */
    vis_write_gsr((12 << 3) + 2);

    /*
     * Pass 1: for every group of 4 source samples, compute the first
     * output word (outputs for source samples 0 and 1 of the group).
     */
    d01 = vis_ld_d64_nf(sp0);
    d11 = vis_ld_d64_nf(sp1);
    d21 = vis_ld_d64_nf(sp2);
    sp0++;
    sp1++;
    sp2++;
    d00 = vis_faligndata(d00, d01);
    d10 = vis_faligndata(d10, d11);
    d20 = vis_faligndata(d20, d21);

#pragma pipeloop(0)
    for (i = 0; i <= n - 4; i += 4) {
        d03 = vis_ld_d64_nf(sp0);
        d13 = vis_ld_d64_nf(sp1);
        d23 = vis_ld_d64_nf(sp2);
        sp0++;
        sp1++;
        sp2++;
        d02 = vis_faligndata(d01, d03);
        d12 = vis_faligndata(d11, d13);
        d22 = vis_faligndata(d21, d23);

        /* middle row contribution, shared by both destination rows */
        ac10 = vis_fmuld8ulx16(f39, vis_read_lo(d10));
        ac12 = vis_fmuld8ulx16(f39, vis_read_hi(d11));
        ac10 = vis_fpadd32(ac10, d87);
        ac12 = vis_fpadd32(ac12, d87);
        ac11 = vis_fmuld8ulx16(f93, vis_read_hi(d11));
        ac13 = vis_fmuld8ulx16(f93, vis_read_hi(d12));
        ac10 = vis_fpadd32(ac10, ac11);
        ac12 = vis_fpadd32(ac12, ac13);

        /* upper row contribution -> dst0 */
        ac00 = vis_fmuld8ulx16(f13, vis_read_lo(d00));
        ac01 = vis_fmuld8ulx16(f31, vis_read_hi(d01));
        ac02 = vis_fmuld8ulx16(f13, vis_read_hi(d01));
        ac03 = vis_fmuld8ulx16(f31, vis_read_hi(d02));
        ac00 = vis_fpadd32(ac00, ac01);
        ac02 = vis_fpadd32(ac02, ac03);
        ac00 = vis_fpadd32(ac10, ac00);
        ac02 = vis_fpadd32(ac12, ac02);

        /* lower row contribution -> dst1 */
        ac20 = vis_fmuld8ulx16(f13, vis_read_lo(d20));
        ac21 = vis_fmuld8ulx16(f31, vis_read_hi(d21));
        ac22 = vis_fmuld8ulx16(f13, vis_read_hi(d21));
        ac23 = vis_fmuld8ulx16(f31, vis_read_hi(d22));
        ac20 = vis_fpadd32(ac20, ac21);
        ac22 = vis_fpadd32(ac22, ac23);
        ac20 = vis_fpadd32(ac10, ac20);
        ac22 = vis_fpadd32(ac12, ac22);

        dp0[0] = vis_fpackfix_pair(ac00, ac02);
        dp1[0] = vis_fpackfix_pair(ac20, ac22);
        dp0 += 2;
        dp1 += 2;

        d00 = d02;
        d01 = d03;
        d10 = d12;
        d11 = d13;
        d20 = d22;
        d21 = d23;
    }

    /*
     * Pass 2: rewind all pointers and compute the second output word of
     * every group (outputs for source samples 2 and 3 of the group).
     */
    dp0 = (mlib_d64 *)dst0;
    dp1 = (mlib_d64 *)dst1;
    sp0 = (mlib_d64 *)src0;
    sp1 = (mlib_d64 *)src1;
    sp2 = (mlib_d64 *)src2;

    d01 = vis_ld_d64_nf(sp0);
    d11 = vis_ld_d64_nf(sp1);
    d21 = vis_ld_d64_nf(sp2);
    sp0++;
    sp1++;
    sp2++;
    d00 = vis_faligndata(d00, d01);
    d10 = vis_faligndata(d10, d11);
    d20 = vis_faligndata(d20, d21);

#pragma pipeloop(0)
    for (i = 0; i <= n - 4; i += 4) {
        d03 = vis_ld_d64_nf(sp0);
        d13 = vis_ld_d64_nf(sp1);
        d23 = vis_ld_d64_nf(sp2);
        sp0++;
        sp1++;
        sp2++;
        d02 = vis_faligndata(d01, d03);
        d12 = vis_faligndata(d11, d13);
        d22 = vis_faligndata(d21, d23);

        /* middle row contribution, shared by both destination rows */
        ac14 = vis_fmuld8ulx16(f39, vis_read_hi(d12));
        ac16 = vis_fmuld8ulx16(f39, vis_read_lo(d11));
        ac14 = vis_fpadd32(ac14, d87);
        ac16 = vis_fpadd32(ac16, d87);
        ac15 = vis_fmuld8ulx16(f93, vis_read_lo(d11));
        ac17 = vis_fmuld8ulx16(f93, vis_read_lo(d12));
        ac14 = vis_fpadd32(ac14, ac15);
        ac16 = vis_fpadd32(ac16, ac17);

        /* upper row contribution -> dst0 */
        ac04 = vis_fmuld8ulx16(f13, vis_read_hi(d02));
        ac05 = vis_fmuld8ulx16(f31, vis_read_lo(d01));
        ac06 = vis_fmuld8ulx16(f13, vis_read_lo(d01));
        ac07 = vis_fmuld8ulx16(f31, vis_read_lo(d02));
        ac04 = vis_fpadd32(ac04, ac05);
        ac06 = vis_fpadd32(ac06, ac07);
        ac04 = vis_fpadd32(ac14, ac04);
        ac06 = vis_fpadd32(ac16, ac06);

        /* lower row contribution -> dst1 */
        ac24 = vis_fmuld8ulx16(f13, vis_read_hi(d22));
        ac25 = vis_fmuld8ulx16(f31, vis_read_lo(d21));
        ac26 = vis_fmuld8ulx16(f13, vis_read_lo(d21));
        ac27 = vis_fmuld8ulx16(f31, vis_read_lo(d22));
        ac24 = vis_fpadd32(ac24, ac25);
        ac26 = vis_fpadd32(ac26, ac27);
        ac24 = vis_fpadd32(ac14, ac24);
        ac26 = vis_fpadd32(ac16, ac26);

        dp0[1] = vis_fpackfix_pair(ac04, ac06);
        dp1[1] = vis_fpackfix_pair(ac24, ac26);
        dp0 += 2;
        dp1 += 2;

        d00 = d02;
        d01 = d03;
        d10 = d12;
        d11 = d13;
        d20 = d22;
        d21 = d23;
    }

    /*
     * Tail (n not a multiple of 4): repeat both computations for the
     * last, partial group and store through edge masks.  No new source
     * word is loaded here; d03/d13/d23 keep whatever they held.
     */
    if ((mlib_s16 *)dp0 <= dend) {
        d02 = vis_faligndata(d01, d03);
        d12 = vis_faligndata(d11, d13);
        d22 = vis_faligndata(d21, d23);

        ac10 = vis_fmuld8ulx16(f39, vis_read_lo(d10));
        ac12 = vis_fmuld8ulx16(f39, vis_read_hi(d11));
        ac10 = vis_fpadd32(ac10, d87);
        ac12 = vis_fpadd32(ac12, d87);
        ac11 = vis_fmuld8ulx16(f93, vis_read_hi(d11));
        ac13 = vis_fmuld8ulx16(f93, vis_read_hi(d12));
        ac10 = vis_fpadd32(ac10, ac11);
        ac12 = vis_fpadd32(ac12, ac13);

        ac00 = vis_fmuld8ulx16(f13, vis_read_lo(d00));
        ac01 = vis_fmuld8ulx16(f31, vis_read_hi(d01));
        ac02 = vis_fmuld8ulx16(f13, vis_read_hi(d01));
        ac03 = vis_fmuld8ulx16(f31, vis_read_hi(d02));
        ac00 = vis_fpadd32(ac00, ac01);
        ac02 = vis_fpadd32(ac02, ac03);
        ac00 = vis_fpadd32(ac10, ac00);
        ac02 = vis_fpadd32(ac12, ac02);

        ac20 = vis_fmuld8ulx16(f13, vis_read_lo(d20));
        ac21 = vis_fmuld8ulx16(f31, vis_read_hi(d21));
        ac22 = vis_fmuld8ulx16(f13, vis_read_hi(d21));
        ac23 = vis_fmuld8ulx16(f31, vis_read_hi(d22));
        ac20 = vis_fpadd32(ac20, ac21);
        ac22 = vis_fpadd32(ac22, ac23);
        ac20 = vis_fpadd32(ac10, ac20);
        ac22 = vis_fpadd32(ac12, ac22);

        ac00 = vis_fpackfix_pair(ac00, ac02);
        ac20 = vis_fpackfix_pair(ac20, ac22);

        emask = vis_edge16(dp0, dend);
        vis_pst_16(ac00, dp0, emask);
        vis_pst_16(ac20, dp1, emask);
        dp0++;
        dp1++;

        /* second output word of the partial group, if it is needed */
        if ((mlib_s16 *)dp0 <= dend) {
            ac14 = vis_fmuld8ulx16(f39, vis_read_hi(d12));
            ac16 = vis_fmuld8ulx16(f39, vis_read_lo(d11));
            ac14 = vis_fpadd32(ac14, d87);
            ac16 = vis_fpadd32(ac16, d87);
            ac15 = vis_fmuld8ulx16(f93, vis_read_lo(d11));
            ac17 = vis_fmuld8ulx16(f93, vis_read_lo(d12));
            ac14 = vis_fpadd32(ac14, ac15);
            ac16 = vis_fpadd32(ac16, ac17);

            ac04 = vis_fmuld8ulx16(f13, vis_read_hi(d02));
            ac05 = vis_fmuld8ulx16(f31, vis_read_lo(d01));
            ac06 = vis_fmuld8ulx16(f13, vis_read_lo(d01));
            ac07 = vis_fmuld8ulx16(f31, vis_read_lo(d02));
            ac04 = vis_fpadd32(ac04, ac05);
            ac06 = vis_fpadd32(ac06, ac07);
            ac04 = vis_fpadd32(ac14, ac04);
            ac06 = vis_fpadd32(ac16, ac06);

            ac24 = vis_fmuld8ulx16(f13, vis_read_hi(d22));
            ac25 = vis_fmuld8ulx16(f31, vis_read_lo(d21));
            ac26 = vis_fmuld8ulx16(f13, vis_read_lo(d21));
            ac27 = vis_fmuld8ulx16(f31, vis_read_lo(d22));
            ac24 = vis_fpadd32(ac24, ac25);
            ac26 = vis_fpadd32(ac26, ac27);
            ac24 = vis_fpadd32(ac14, ac24);
            ac26 = vis_fpadd32(ac16, ac26);

            ac04 = vis_fpackfix_pair(ac04, ac06);
            ac24 = vis_fpackfix_pair(ac24, ac26);

            emask = vis_edge16(dp0, dend);
            vis_pst_16(ac04, dp0, emask);
            vis_pst_16(ac24, dp1, emask);
        }
    }

    /*
     * Scalar edge fix-ups: the first and last outputs have no horizontal
     * neighbour on one side, so they are recomputed from the edge sample
     * alone and overwrite whatever the vector code stored there.
     */
    dst0[0] = (4 * (3 * src1[0] + src0[0]) + 8) >> 4;
    dst1[0] = (4 * (3 * src1[0] + src2[0]) + 8) >> 4;
    dst0[2 * n - 1] = (4 * (3 * src1[n - 1] + src0[n - 1]) + 7) >> 4;
    dst1[2 * n - 1] = (4 * (3 * src1[n - 1] + src2[n - 1]) + 7) >> 4;

    return (MLIB_SUCCESS);
}
mlib_status __mlib_VectorSubS_S16C_S16C_Mod( mlib_s16 *z, const mlib_s16 *x, const mlib_s16 *c, mlib_s32 n) { mlib_d64 *dpz, *dpx; mlib_d64 dx, dz, dx0, dx1; mlib_s16 *pz, *px, *pzend; /* offset of address alignment in destination */ mlib_s32 off; /* edge masks */ mlib_s32 emask; mlib_s32 len = n + n, i; /* rest and leng in terms of 8 bytes. */ mlib_s32 rest_8, even_8; mlib_u16 uc0 = *((mlib_s16 *)c); mlib_u16 uc1 = *((mlib_s16 *)c + 1); mlib_d64 dc = ((mlib_addr)z & 2) ? vis_to_double_dup((uc1 << 16) | uc0) : vis_to_double_dup((uc0 << 16) | uc1); if (n <= 0) return (MLIB_FAILURE); px = (mlib_s16 *)x; pz = (mlib_s16 *)z; dpz = (mlib_d64 *)((mlib_addr)z & (~7)); off = (mlib_addr)dpz - (mlib_addr)z; pzend = pz + n + n - 1; /* * generate edge mask for the start point */ emask = vis_edge16(pz, pzend); /* * prepare the destination address */ if (off) { dpx = (mlib_d64 *)vis_alignaddr(px, off); dx0 = vis_ld_d64_nf(dpx); dx1 = vis_ld_d64_nf(dpx + 1); dx = vis_faligndata(dx0, dx1); SUBS16_MOD; px += (8 + off) >> 1; len -= (8 + off) >> 1; dpz++; } if (len <= 0) return (MLIB_SUCCESS); even_8 = len >> 2; rest_8 = len & 0x3; emask = 0xf; /* * Now try to analyze source "x" and "y" addresses. */ if (!((mlib_addr)px & 7)) { dpx = (mlib_d64 *)px; #pragma pipeloop(0) for (i = 0; i < even_8; i++) { dx = (*dpx++); SUBS16_MOD; dpz++; } dx1 = vis_ld_d64_nf(dpx); dpx++; } else { dpx = vis_alignaddr(px, 0); dx0 = vis_ld_d64_nf(dpx); dpx++; #pragma pipeloop(0) for (i = 0; i < even_8; i++) { dx1 = vis_ld_d64_nf(dpx); dpx++; dx = vis_faligndata(dx0, dx1); SUBS16_MOD; dx0 = dx1; dpz++; } dx1 = dx0; } if (!rest_8) return (MLIB_SUCCESS); /* * prepare edge mask for the last bytes */ emask = ~(vis_edge16((void *)(rest_8 << 1), pzend)); vis_alignaddr(px, 0); dx0 = dx1; dx1 = vis_ld_d64_nf(dpx); dx = vis_faligndata(dx0, dx1); SUBS16_MOD; return (MLIB_SUCCESS); }
mlib_status mlib_v_ImageAdd_U16( mlib_image *dst, const mlib_image *src1, const mlib_image *src2) { mlib_s32 i, j, k; mlib_s32 offdst, offsrc1, offsrc2, emask, mask; mlib_s32 amount; mlib_d64 *dpp, *spp2, *spp1, *tmp_ptr, tmp; mlib_d64 sd10, sd11, sd20, sd21; mlib_d64 ones = vis_to_double_dup(0x7fff7fff); mlib_d64 max_u16 = vis_to_double_dup(0xffffffff); mlib_u16 *dend; VALIDATE(mlib_u16); /* initialize GSR scale factor */ vis_write_gsr(15 << 3); sl1 = sp1; sl2 = sp2; dl = dp; amount = width * channels; offdst = ((mlib_addr)dp) & 7; offsrc1 = ((mlib_addr)sp1) & 7; offsrc2 = ((mlib_addr)sp2) & 7; if ((offdst == offsrc1) && (offdst == offsrc2) && (((strided ^ stride1) & 3) == 0) && (((strided ^ stride2) & 3) == 0)) { for (j = 0; j < height; j++) { /* prepare the destination addresses */ dpp = (mlib_d64 *)vis_alignaddr(dp, 0); i = (mlib_u16 *)dpp - dp; /* prepare the source addresses */ spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0); spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0); dend = dp + amount - 1; /* generate edge mask for the start point */ emask = vis_edge16(dp, dend); if (emask != 0xf) { sd10 = (*spp1++); sd20 = (*spp2++); MLIB_V_ADDIMAGE_U16_emask(sd10, sd20, dpp, emask); i += 4; } #pragma pipeloop(0) for (; i <= amount - 4; i += 4) { sd10 = (*spp1++); sd20 = (*spp2++); MLIB_V_ADDIMAGE_U16(sd10, sd20, dpp) } if (i < amount) { emask = vis_edge16(dpp, dend); sd10 = (*spp1++); sd20 = (*spp2++); MLIB_V_ADDIMAGE_U16_emask(sd10, sd20, dpp, emask); } sp1 = sl1 += stride1; sp2 = sl2 += stride2; dp = dl += strided; } } else if ((offdst == offsrc1) && (((strided ^ stride1) & 3) == 0)) {
/*
 * mlib_v_ImageAdd_S16
 *
 * Row-by-row combination of two signed 16-bit images into dst, four
 * elements (one 8-byte word) at a time.  The destination is always
 * written through an aligned word pointer with masked stores at the row
 * edges; the five branches differ only in how the two sources are
 * brought into the destination's alignment:
 *
 *   1. dst, src1 and src2 share alignment   - plain loads for both
 *   2. dst and src1 share alignment         - src2 via vis_faligndata
 *   3. dst and src2 share alignment         - src1 via vis_faligndata
 *   4. src1 and src2 share alignment        - add first, realign the sum
 *   5. all different ("common case")        - realigned src1 is staged
 *      in dst, then read back and added to realigned src2
 *
 * Each "share alignment" test also requires the strides to agree modulo
 * 4 so that the relationship holds on every row.
 *
 * Always returns MLIB_SUCCESS from the visible code paths.
 *
 * NOTE(review): VALIDATE(mlib_s16) is defined elsewhere; it presumably
 * declares and fills dp, sp1, sp2, dl, sl1, sl2, width, height,
 * channels, stride1, stride2 and strided (and may return early on
 * invalid images).  MLIB_V_ADDIMAGE_S16(a, b, out) is also defined
 * elsewhere (presumably a saturating 16-bit add) - confirm both.
 */
mlib_status
mlib_v_ImageAdd_S16(
    mlib_image *dst,
    const mlib_image *src1,
    const mlib_image *src2)
{
    mlib_s32 i, j, k;
    mlib_s32 offdst, offsrc1, offsrc2, emask;
    mlib_s32 amount;
    mlib_d64 *dpp, *spp2, *spp1, *tmp_ptr;
    mlib_d64 dd, dd0, dd1, sd10, sd11, sd20, sd21;
    mlib_s16 *dend;

    VALIDATE(mlib_s16);

    sl1 = sp1;
    sl2 = sp2;
    dl = dp;

    /* elements per row */
    amount = width * channels;

    /* byte offsets of the three buffers inside an 8-byte word */
    offdst = ((mlib_addr)dp) & 7;
    offsrc1 = ((mlib_addr)sp1) & 7;
    offsrc2 = ((mlib_addr)sp2) & 7;

    if ((offdst == offsrc1) && (offdst == offsrc2) &&
        (((strided ^ stride1) & 3) == 0) &&
        (((strided ^ stride2) & 3) == 0)) {

        /* case 1: everything shares the destination alignment */
        for (j = 0; j < height; j++) {

            /* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            /* i starts at minus the number of elements before dp in its word */
            i = (mlib_s16 *)dpp - dp;

            /* prepare the source addresses */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);

            dend = dp + amount - 1;
            /* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

            /* partial first word */
            if (emask != 0xf) {
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp++, emask);
                i += 4;
            }

            /* whole words */
#pragma pipeloop(0)
            for (; i <= amount - 4; i += 4) {
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
                (*dpp++) = dd;
            }

            /* partial last word */
            if (i < amount) {
                emask = vis_edge16(dpp, dend);
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    } else if ((offdst == offsrc1) && (((strided ^ stride1) & 3) == 0)) {

        /* case 2: src1 matches dst, src2 is realigned */
        for (j = 0; j < height; j++) {

            /* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_s16 *)dpp - dp;

            /* prepare the source addresses; 2 * i is the shift in bytes */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);

            dend = dp + amount - 1;
            /* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

            sd20 = vis_ld_d64_nf(spp2);

            if (emask != 0xf) {
                sd10 = (*spp1++);
                sd21 = vis_ld_d64_nf(spp2 + 1);
                sd20 = vis_faligndata(sd20, sd21);
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp++, emask);
                sd20 = sd21;
                spp2++;
                i += 4;
            }

#pragma pipeloop(0)
            for (; i <= amount - 4; i += 4) {
                sd10 = (*spp1++);
                sd21 = vis_ld_d64_nf(spp2 + 1);
                sd20 = vis_faligndata(sd20, sd21);
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
                (*dpp++) = dd;
                sd20 = sd21;
                spp2++;
            }

            if (i < amount) {
                emask = vis_edge16(dpp, dend);
                sd10 = (*spp1++);
                sd20 = vis_faligndata(sd20, vis_ld_d64_nf(spp2 + 1));
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    } else if ((offdst == offsrc2) && (((strided ^ stride2) & 3) == 0)) {

        /* case 3: src2 matches dst, src1 is realigned (mirror of case 2) */
        for (j = 0; j < height; j++) {

            /* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_s16 *)dpp - dp;

            /* prepare the source addresses */
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);

            dend = dp + amount - 1;
            /* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

            sd10 = vis_ld_d64_nf(spp1);

            if (emask != 0xf) {
                sd20 = (*spp2++);
                sd11 = vis_ld_d64_nf(spp1 + 1);
                sd10 = vis_faligndata(sd10, sd11);
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp++, emask);
                sd10 = sd11;
                spp1++;
                i += 4;
            }

#pragma pipeloop(0)
            for (; i <= amount - 4; i += 4) {
                sd20 = (*spp2++);
                sd11 = vis_ld_d64_nf(spp1 + 1);
                sd10 = vis_faligndata(sd10, sd11);
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
                (*dpp++) = dd;
                sd10 = sd11;
                spp1++;
            }

            if (i < amount) {
                emask = vis_edge16(dpp, dend);
                sd20 = (*spp2++);
                sd10 = vis_faligndata(sd10, vis_ld_d64_nf(spp1 + 1));
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    } else if ((offsrc1 == offsrc2) && (((stride1 ^ stride2) & 3) == 0)) {

        /*
         * case 4: the sources match each other but not dst, so raw
         * source words are added first and the sums are realigned.
         */
        for (j = 0; j < height; j++) {

            /* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_s16 *)dpp - dp;

            /* prepare the source addresses */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);

            dend = dp + amount - 1;
            /* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

            sd10 = vis_ld_d64_nf(spp1);
            spp1++;
            sd20 = vis_ld_d64_nf(spp2);
            spp2++;
            MLIB_V_ADDIMAGE_S16(sd10, sd20, dd0);

            if (emask != 0xf) {
                sd10 = vis_ld_d64_nf(spp1);
                spp1++;
                sd20 = vis_ld_d64_nf(spp2);
                spp2++;
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd1);
                dd = vis_faligndata(dd0, dd1);
                vis_pst_16(dd, dpp++, emask);
                dd0 = dd1;
                i += 4;
            }

#pragma pipeloop(0)
            for (; i <= amount - 4; i += 4) {
                sd10 = vis_ld_d64_nf(spp1);
                spp1++;
                sd20 = vis_ld_d64_nf(spp2);
                spp2++;
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd1);
                (*dpp++) = vis_faligndata(dd0, dd1);
                dd0 = dd1;
            }

            if (i < amount) {
                emask = vis_edge16(dpp, dend);
                sd10 = vis_ld_d64_nf(spp1);
                spp1++;
                sd20 = vis_ld_d64_nf(spp2);
                spp2++;
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd1);
                dd = vis_faligndata(dd0, dd1);
                vis_pst_16(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    } else {

        /* case 5 (common case): three different alignments */
        for (j = 0; j < height; j++) {

            /* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_s16 *)dpp - dp;

            dend = dp + amount - 1;
            /* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

            /* partial first word: realign both sources individually */
            if (emask != 0xf) {
                spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
                sd10 = vis_faligndata(vis_ld_d64_nf(spp1),
                    vis_ld_d64_nf(spp1 + 1));
                spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);
                sd20 = vis_faligndata(vis_ld_d64_nf(spp2),
                    vis_ld_d64_nf(spp2 + 1));
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp++, emask);
                i += 4;
            }

            /*
             * Pass A: copy realigned src1 into dst.  Only one alignment
             * offset can be active at a time, so src1 is staged in the
             * destination while its offset is set.
             */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
            sd11 = vis_ld_d64_nf(spp1);
            tmp_ptr = dpp;

#pragma pipeloop(0)
            for (k = i; k <= (amount - 4); k += 4) {
                sd10 = sd11;
                sd11 = vis_ld_d64_nf(spp1 + 1);
                (*tmp_ptr++) = vis_faligndata(sd10, sd11);
                spp1++;
            }

            /* realigned src1 word for the partial last word, kept in sd11 */
            sd11 = vis_faligndata(sd11, vis_ld_d64_nf(spp1 + 1));

            /* Pass B: read the staged src1 back and add realigned src2 */
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);
            sd20 = vis_ld_d64_nf(spp2);
            tmp_ptr = dpp;

#pragma pipeloop(0)
            for (; i <= amount - 4; i += 4) {
                sd10 = (*tmp_ptr++);
                sd21 = vis_ld_d64_nf(spp2 + 1);
                sd20 = vis_faligndata(sd20, sd21);
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
                (*dpp++) = dd;
                sd20 = sd21;
                spp2++;
            }

            if (i < amount) {
                emask = vis_edge16(dpp, dend);
                sd20 = vis_faligndata(sd20, vis_ld_d64_nf(spp2 + 1));
                MLIB_V_ADDIMAGE_S16(sd11, sd20, dd);
                vis_pst_16(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    }

    return (MLIB_SUCCESS);
}
} } else if ((offdst == offsrc1) && (((strided ^ stride1) & 3) == 0)) { for (j = 0; j < height; j++) { /* prepare the destination addresses */ dpp = (mlib_d64 *)vis_alignaddr(dp, 0); i = (mlib_u16 *)dpp - dp; /* prepare the source addresses */ spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0); spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i); dend = dp + amount - 1; /* generate edge mask for the start point */ emask = vis_edge16(dp, dend); sd20 = vis_ld_d64_nf(spp2); if (emask != 0xf) { sd10 = (*spp1++); sd21 = vis_ld_d64_nf(spp2 + 1); sd20 = vis_faligndata(sd20, sd21); MLIB_V_ADDIMAGE_U16_emask(sd10, sd20, dpp, emask); sd20 = sd21; spp2++; i += 4; } #pragma pipeloop(0) for (; i <= amount - 4; i += 4) {
/*
 * __mlib_SignalEmphasize_S16S_S16S_Sat
 *
 * Runs the emphasize filter described by `filter` over `n` stereo
 * (interleaved, 2 * n mlib_s16 values) samples from `src` and writes
 * 2 * n mlib_s16 values to `dst`.
 *
 * Parameters:
 *   dst    - destination buffer, 2 * n mlib_s16 values are written
 *   src    - source buffer, 2 * n mlib_s16 values are read
 *   filter - opaque filter state; must point to a mlib_emphasize_struct
 *            whose `type` field equals MLIB_EMPH
 *   n      - number of stereo sample pairs, must be > 0
 *
 * Returns MLIB_FAILURE when `filter`, `src` or `dst` is NULL, when n <= 0
 * or when the filter has the wrong type; MLIB_SUCCESS otherwise.
 *
 * Side effect: on success the last two source values (src[2n-2] and
 * src[2n-1]) are stored into filter->v16_last0 / filter->v16_last1 so
 * that the next call can continue from them.
 *
 * The arithmetic is performed by the MLIB_MUL8 and MLIB_MIX macros, which
 * are defined outside this block.  NOTE(review): they presumably read
 * v_mask, v_alpha, w_src and w_lsrc and write dr0..dr7 - confirm against
 * the macro definitions before renaming any of these locals.
 *
 * Fix relative to the previous revision: the fields of `filter` used to
 * be read in the declaration initializers, i.e. BEFORE the NULL check, so
 * a NULL filter crashed instead of returning MLIB_FAILURE.  Everything
 * derived from the arguments is now computed after validation.
 */
mlib_status
__mlib_SignalEmphasize_S16S_S16S_Sat(
    mlib_s16 *dst,
    const mlib_s16 *src,
    void *filter,
    mlib_s32 n)
{
    mlib_emphasize_struct *fist = filter;
    mlib_d64 w_maskand0 = vis_to_double(0xFFFFFFFF, 0xFFFF);
    mlib_d64 w_maskor0;
    mlib_d64 w_maskand1 = vis_to_double(0xFFFFFFFF, 0xFFFF0000);
    mlib_d64 w_maskor1;
    mlib_f32 v_mask = vis_to_float(0x80008000);
    mlib_f32 v_alpha;
    /* address of the last destination sample */
    mlib_s16 *fdst;
    mlib_d64 *dpd, *dps, *dsrct1;
    mlib_d64 w_dst, w_src, w_src0, w_src1, w_src2, w_lsrc;
    mlib_d64 dr0, dr1, dr2, dr3, dr4, dr5, dr6, dr7;
    mlib_s32 i, times, t1, t2;

    /* check for obvious errors before touching any argument */
    if ((fist == NULL) || (n <= 0) || (src == 0) || (dst == 0) ||
        (fist->type != MLIB_EMPH)) {
        return (MLIB_FAILURE);
    }

    /* arguments are valid: now it is safe to read the filter state */
    w_maskor0 = vis_freg_pair(0.f, fist->v16_last0);
    w_maskor1 = vis_freg_pair(0.f, fist->v16_last1);
    v_alpha = fist->v_alpha;
    fdst = dst + n + n - 1;

    vis_write_gsr(1 << 3);

    /* keep only the saved sample of each channel in the "or" masks */
    w_maskor0 = vis_fand(w_maskor0, w_maskand1);
    w_maskor1 = vis_fand(w_maskor1, w_maskand0);

    /*
     * Rotate the and/or masks by the source misalignment so that they
     * line up with the position just in front of src[0].
     */
    vis_alignaddr((void *)(-(mlib_addr)src), 0);
    w_maskand0 = vis_faligndata(w_maskand0, w_maskand0);
    w_maskor0 = vis_faligndata(w_maskor0, w_maskor0);
    w_maskand1 = vis_faligndata(w_maskand1, w_maskand1);
    w_maskor1 = vis_faligndata(w_maskor1, w_maskor1);

    /* aligned destination pointer and count of 8-byte words up to fdst */
    dpd = vis_alignaddr(dst, 0);
    times = (mlib_d64 *)vis_alignaddr(fdst, 0) - dpd;

    /* t1: source offset matching dst alignment; t2: same, 4 bytes back */
    t1 = -((mlib_addr)(dst) & 7);
    t2 = t1 - 4;
    dps = vis_alignaddr((void *)src, t2);
    w_src0 = vis_ld_d64_nf(dps);
    dps++;
    w_src1 = vis_ld_d64_nf(dps);
    dps++;

    if ((((mlib_addr)dst ^ (mlib_addr)src) & 7)) {
        /* src and dst have different 8-byte alignment */

        /* splice the saved previous samples into the leading words */
        if (((mlib_addr)dps - (mlib_addr)src) >= 6) {
            w_src0 = vis_fand(w_maskand0, w_src0);
            w_src0 = vis_for(w_maskor0, w_src0);
        } else {
            w_src1 = vis_fand(w_maskand0, w_src1);
            w_src1 = vis_for(w_maskor0, w_src1);
        }

        if (((mlib_addr)dps - (mlib_addr)src) >= 8) {
            w_src0 = vis_fand(w_maskand1, w_src0);
            w_src0 = vis_for(w_maskor1, w_src0);
        } else {
            w_src1 = vis_fand(w_maskand1, w_src1);
            w_src1 = vis_for(w_maskor1, w_src1);
        }

        w_lsrc = vis_faligndata(w_src0, w_src1);
        dsrct1 = vis_alignaddr((void *)src, t1);

        if (dps - 2 != dsrct1) {
            /*
             * The t1 and t2 views start in different 8-byte words:
             * three source words are kept in flight.
             */
            w_src2 = *dps;
            dps++;
            w_src = vis_faligndata(w_src1, w_src2);
            MLIB_MUL8;

            if ((mlib_addr)dst & 7) {
                /* unaligned dst: first word is an edge-masked store */
                times--;
                w_src0 = w_src1;
                w_src1 = w_src2;
                w_src2 = *dps;
                vis_alignaddr((void *)src, t2);
                w_lsrc = vis_faligndata(w_src0, w_src1);
                vis_alignaddr((void *)src, t1);
                w_src = vis_faligndata(w_src1, w_src2);
                dps++;
                MLIB_MIX;
                w_dst = vis_fpackfix_pair(dr2, dr3);
                vis_pst_16(w_dst, dpd, vis_edge16(dst, fdst));
                dpd++;
            }

            w_src0 = w_src1;
            w_src1 = w_src2;
            w_src2 = vis_ld_d64_nf(dps);
            vis_alignaddr((void *)src, t2);
            w_lsrc = vis_faligndata(w_src0, w_src1);
            vis_alignaddr((void *)src, t1);
            w_src = vis_faligndata(w_src1, w_src2);
            MLIB_MIX;
            w_dst = vis_fpackfix_pair(dr2, dr3);
            dps++;
            w_src0 = w_src1;
            w_src1 = w_src2;
            w_src2 = vis_ld_d64_nf(dps);
            vis_alignaddr((void *)src, t2);
            w_lsrc = vis_faligndata(w_src0, w_src1);
            vis_alignaddr((void *)src, t1);
            w_src = vis_faligndata(w_src1, w_src2);
            dps++;

            /* store the previous result while computing the next one */
            for (i = 0; i < times; i++) {
                *dpd = w_dst;
                MLIB_MIX;
                w_dst = vis_fpackfix_pair(dr2, dr3);
                w_src0 = w_src1;
                w_src1 = w_src2;
                w_src2 = vis_ld_d64_nf(dps);
                vis_alignaddr((void *)src, t2);
                w_lsrc = vis_faligndata(w_src0, w_src1);
                vis_alignaddr((void *)src, t1);
                w_src = vis_faligndata(w_src1, w_src2);
                dpd++;
                dps++;
            }
        } else {
            /*
             * The t1 and t2 views start in the same 8-byte word:
             * two source words are enough.
             */
            w_src = vis_faligndata(w_src0, w_src1);
            MLIB_MUL8;

            if ((mlib_addr)dst & 7) {
                /* unaligned dst: first word is an edge-masked store */
                times--;
                w_src0 = w_src1;
                w_src1 = vis_ld_d64_nf(dps);
                vis_alignaddr((void *)src, t2);
                w_lsrc = vis_faligndata(w_src0, w_src1);
                vis_alignaddr((void *)src, t1);
                w_src = vis_faligndata(w_src0, w_src1);
                dps++;
                MLIB_MIX;
                w_dst = vis_fpackfix_pair(dr2, dr3);
                vis_pst_16(w_dst, dpd, vis_edge16(dst, fdst));
                dpd++;
            }

            w_src0 = w_src1;
            w_src1 = vis_ld_d64_nf(dps);
            vis_alignaddr((void *)src, t2);
            w_lsrc = vis_faligndata(w_src0, w_src1);
            vis_alignaddr((void *)src, t1);
            w_src = vis_faligndata(w_src0, w_src1);
            MLIB_MIX;
            w_dst = vis_fpackfix_pair(dr2, dr3);
            dps++;
            w_src0 = w_src1;
            w_src1 = vis_ld_d64_nf(dps);
            vis_alignaddr((void *)src, t2);
            w_lsrc = vis_faligndata(w_src0, w_src1);
            vis_alignaddr((void *)src, t1);
            w_src = vis_faligndata(w_src0, w_src1);
            dps++;

            /* store the previous result while computing the next one */
            for (i = 0; i < times; i++) {
                *dpd = w_dst;
                MLIB_MIX;
                w_dst = vis_fpackfix_pair(dr2, dr3);
                w_src0 = w_src1;
                w_src1 = vis_ld_d64_nf(dps);
                vis_alignaddr((void *)src, t2);
                w_lsrc = vis_faligndata(w_src0, w_src1);
                vis_alignaddr((void *)src, t1);
                w_src = vis_faligndata(w_src0, w_src1);
                dps++;
                dpd++;
            }
        }
    } else {
        /* src and dst share the same 8-byte alignment */
        w_src = w_src1;

        if ((mlib_addr)src & 7) {
            /* unaligned start: first word is an edge-masked store */
            times--;

            if (((mlib_addr)src & 7) == 2) {
                w_src0 = vis_fand(w_maskand0, w_src0);
                w_src0 = vis_for(w_maskor0, w_src0);
            } else {
                w_src1 = vis_fand(w_maskand0, w_src1);
                w_src1 = vis_for(w_maskor0, w_src1);
            }

            w_src1 = vis_fand(w_maskand1, w_src1);
            w_src1 = vis_for(w_maskor1, w_src1);
            w_lsrc = vis_faligndata(w_src0, w_src1);
            MLIB_MUL8;
            w_src0 = w_src1;
            w_src1 = *dps;
            w_src = w_src1;
            w_lsrc = vis_faligndata(w_src0, w_src1);
            dps++;
            MLIB_MIX;
            w_dst = vis_fpackfix_pair(dr2, dr3);
            vis_pst_16(w_dst, dpd, vis_edge16(dst, fdst));
            dpd++;
        } else {
            /* aligned start: both saved samples go into w_src0 */
            w_src0 = vis_fand(w_maskand0, w_src0);
            w_src0 = vis_for(w_maskor0, w_src0);
            w_src0 = vis_fand(w_maskand1, w_src0);
            w_src0 = vis_for(w_maskor1, w_src0);
            w_lsrc = vis_faligndata(w_src0, w_src1);
            MLIB_MUL8;
        }

        w_src = vis_ld_d64_nf(dps);
        w_lsrc = vis_faligndata(w_src1, w_src);
        MLIB_MIX;
        w_src1 = w_src;
        w_dst = vis_fpackfix_pair(dr2, dr3);
        dps++;
        w_src = vis_ld_d64_nf(dps);
        w_lsrc = vis_faligndata(w_src1, w_src);
        dps++;

        /* store the previous result while computing the next one */
        for (i = 0; i < times; i++) {
            *dpd = w_dst;
            MLIB_MIX;
            w_src1 = w_src;
            w_src = vis_ld_d64_nf(dps);
            w_lsrc = vis_faligndata(w_src1, w_src);
            w_dst = vis_fpackfix_pair(dr2, dr3);
            dps++;
            dpd++;
        }
    }

    /* final (possibly partial) destination word */
    if (times >= 0) {
        vis_pst_16(w_dst, dpd, vis_edge16(dpd, fdst));
    }

    /* remember the last sample of each channel for the next call */
    ((mlib_s16 *)&fist->v16_last0)[0] = src[2 * n - 2];
    ((mlib_s16 *)&fist->v16_last1)[1] = src[2 * n - 1];

    return (MLIB_SUCCESS);
}
/*
 * mlib_v_ImageLookUp_S16_S16_3_D1
 *
 * Table lookup for one run of `xsize` mlib_s16 samples whose channels
 * cycle through table0, table1, table2.  Four results are assembled per
 * 8-byte store; because 4 is not a multiple of 3 the three table pointers
 * are rotated by one position after every group of four.
 *
 * `dst` is used directly as an mlib_d64 pointer.  NOTE(review): this
 * presumably requires dst to be 8-byte aligned - confirm with the caller.
 *
 * Each source value is doubled before it is handed to VIS_LD_U16_I.
 * NOTE(review): presumably the macro takes a byte offset into a table of
 * 16-bit entries - confirm against its definition.
 *
 * The two accumulators are only ever filled by shifting values in with
 * vis_faligndata(), so they are deliberately not initialized.
 */
void
mlib_v_ImageLookUp_S16_S16_3_D1(
    const mlib_s16 *src,
    mlib_s16 *dst,
    mlib_s32 xsize,
    const mlib_s16 *table0,
    const mlib_s16 *table1,
    const mlib_s16 *table2)
{
    /* read cursor over the source samples */
    mlib_s16 *in = (void *)src;
    /* write cursor, one 8-byte word (four results) at a time */
    mlib_d64 *out = (mlib_d64 *)dst;
    /* address of the last destination sample */
    mlib_s16 *last = dst + xsize - 1;
    /* scratch pointer used while rotating the tables */
    const mlib_s16 *spare;
    /* doubled source values of the group being looked up */
    mlib_s32 off0, off1, off2, off3;
    /* values fetched from the tables */
    mlib_d64 v0, v1, v2, v3;
    /* accumulators: `lo` gets results 0-1, `hi` gets results 2-3 */
    mlib_d64 lo, hi;
    mlib_s32 k, left;

    /* alignment offset 6: each vis_faligndata() shifts in one 16-bit item */
    vis_alignaddr((void *)0, 6);

    if (xsize >= 4) {
        /* pre-load the offsets of the first group */
        off0 = in[0] << 1;
        off1 = in[1] << 1;
        off2 = in[2] << 1;
        off3 = in[3] << 1;
        in += 4;

        /* byte selection used by vis_bshuffle() to merge lo and hi */
        vis_write_bmask(0x012389ab, 0);

#pragma pipeloop(0)
        for (k = 0; k <= xsize - 8; k += 4, in += 4) {
            /* the fourth sample of a group belongs to the first table again */
            v3 = VIS_LD_U16_I(table0, off3);
            v2 = VIS_LD_U16_I(table2, off2);
            v1 = VIS_LD_U16_I(table1, off1);
            v0 = VIS_LD_U16_I(table0, off0);

            hi = vis_faligndata(v3, hi);
            hi = vis_faligndata(v2, hi);
            lo = vis_faligndata(v1, lo);
            lo = vis_faligndata(v0, lo);

            /* fetch the offsets of the following group */
            off0 = in[0] << 1;
            off1 = in[1] << 1;
            off2 = in[2] << 1;
            off3 = in[3] << 1;

            (*out++) = vis_bshuffle(lo, hi);

            /* rotate: next group starts one channel later */
            spare = table0;
            table0 = table1;
            table1 = table2;
            table2 = spare;
        }

        /* last complete group, whose offsets are already loaded */
        v3 = VIS_LD_U16_I(table0, off3);
        v2 = VIS_LD_U16_I(table2, off2);
        v1 = VIS_LD_U16_I(table1, off1);
        v0 = VIS_LD_U16_I(table0, off0);

        hi = vis_faligndata(v3, hi);
        hi = vis_faligndata(v2, hi);
        lo = vis_faligndata(v1, lo);
        lo = vis_faligndata(v0, lo);

        (*out++) = vis_bshuffle(lo, hi);

        spare = table0;
        table0 = table1;
        table1 = table2;
        table2 = spare;
    }

    if ((mlib_addr)out <= (mlib_addr)last) {
        /* 1..3 samples remain: walk them backwards so they shift into place */
        left = (mlib_s16 *)last - (mlib_s16 *)out;
        in += left;
        left++;

        switch (left) {
        case 3:
            off0 = (mlib_s32)*in;
            in--;
            v0 = VIS_LD_U16_I(table2, off0 << 1);
            lo = vis_faligndata(v0, lo);
            /* FALLTHROUGH */
        case 2:
            off0 = (mlib_s32)*in;
            in--;
            v0 = VIS_LD_U16_I(table1, off0 << 1);
            lo = vis_faligndata(v0, lo);
            /* FALLTHROUGH */
        case 1:
            off0 = (mlib_s32)*in;
            in--;
            v0 = VIS_LD_U16_I(table0, off0 << 1);
            lo = vis_faligndata(v0, lo);
            break;
        default:
            break;
        }

        /* partial store limited to the samples that really exist */
        vis_pst_16(lo, out, vis_edge16(out, last));
    }
}
/*
 * mlib_ImageAbs_S16
 *
 * Processes `hgt` rows of `wid` mlib_s16 pixels from `src` into `dst`,
 * eight bytes (four pixels) at a time.
 *
 * The per-pixel arithmetic is done by the CALC_ABS_S16 and
 * CALC_ABS_S16_UNROLL macros and the loads/stores of the remainder loops
 * by READ_PXLS_ALIGN, READ_PXLS_UNALIGN and STORE_ABS_VALUES; all of them
 * are defined outside this block.  NOTE(review): judging by the names and
 * the original comments they compute absolute values and use the locals
 * sp, dp, prev, curr0, adat0, dabs, dtwo and mask - confirm against the
 * macro definitions before renaming anything.
 *
 * Parameters:
 *   dst, src - first pixel of the destination / source image
 *   dlb, slb - destination / source line strides in bytes (halved below
 *              to get strides in mlib_s16 units)
 *   wid, hgt - row width in pixels and number of rows
 *
 * Always returns MLIB_SUCCESS.
 */
mlib_status
mlib_ImageAbs_S16(
    mlib_s16 *dst,
    mlib_s16 *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
{
/* 8-byte aligned src, dst ptrs */
    mlib_d64 *sp, *dp;

/* source words as loaded, before realignment */
    mlib_d64 prev;
    mlib_d64 curr0;
    mlib_d64 curr1, curr2;

/* source data realigned to the destination */
    mlib_d64 adat0;

/* result word of the single-word macros */
    mlib_d64 dabs;

/* realigned source data for the 4x unrolled loops */
    mlib_d64 adat1, adat2, adat3;

/* result words of the 4x unrolled loops */
    mlib_d64 dabs0, dabs1, dabs2, dabs3;
/* constants; not referenced below, presumably consumed by the macros */
    mlib_d64 dtwo = vis_to_double_dup(0x20002);
    mlib_d64 mask = vis_to_double_dup(0x80008000);

/* last pixel of the destination line */
    mlib_s16 *dlast;

/* edge mask for the partial stores at the row start and end */
    mlib_s32 mask0;

/* source line stride in pixels */
    mlib_s32 slpxl = slb >> 1;

/* destination line stride in pixels */
    mlib_s32 dlpxl = dlb >> 1;

/* byte offset of dst inside its 8-byte word */
    mlib_s32 doffs;
    mlib_s32 row, block;

/* number of complete 8-byte words left after the first one */
    mlib_s32 numblocks;

    for (row = 0; row < hgt; row++) {

/* ROW SETUP */

/* last dst pixel in row */
        dlast = dst + wid - 1;
        doffs = (mlib_addr)dst & 7;

/* aligned dest ptr */
        dp = (mlib_d64 *)((mlib_addr)dst & ~7);

/* aligned src ptr, shifted back by the destination offset */
        sp = (mlib_d64 *)vis_alignaddr(src, -(mlib_s32)doffs);
        prev = *sp;

/* FIRST d64 NEEDS EDGE MASK FOR DESTINATION START POINT */

/* edge mask for start point */
        mask0 = vis_edge16(dst, dlast);
        READ_PXLS_UNALIGN;
        CALC_ABS_S16;
        vis_pst_16(dabs, dp++, mask0);
/* complete 8-byte words between dp and the end of the row */
        numblocks = ((mlib_u8 *)dlast + 1 - (mlib_u8 *)dp) >> 3;

/* DO MOST OF ROW IN BLOCKS OF N d64s */

        if ((((mlib_addr)src ^ (mlib_addr)dst) & 7) == 0) {
/* src and dst share the same alignment: plain loads, no faligndata */
#pragma pipeloop(0)
            for (block = 0; block < numblocks - 3; block += 4) {
                adat0 = sp[0];
                adat1 = sp[1];
                adat2 = sp[2];
                CALC_ABS_S16_UNROLL(dabs0, adat0);
                adat3 = sp[3];
                dp[0] = dabs0;
                CALC_ABS_S16_UNROLL(dabs1, adat1);
                dp[1] = dabs1;
                CALC_ABS_S16_UNROLL(dabs2, adat2);
                dp[2] = dabs2;
                CALC_ABS_S16_UNROLL(dabs3, adat3);
                dp[3] = dabs3;
                sp += 4;
                dp += 4;
            }

/* remaining 0..3 complete words, one at a time */
#pragma pipeloop(0)
            for (; block < numblocks; block++) {
                READ_PXLS_ALIGN;
                CALC_ABS_S16;
                STORE_ABS_VALUES;
            }

/* reload prev so the tail code below can treat both cases alike */
            prev = *sp;
        } else {
/* different alignment: every word is rebuilt from two neighbours */
#pragma pipeloop(0)
            for (block = 0; block < numblocks - 3; block += 4) {
                curr0 = *(sp + 1);
                curr1 = *(sp + 2);
                curr2 = *(sp + 3);
                adat0 = vis_faligndata(prev, curr0);
/* prev already holds the word that starts the next iteration */
                prev = *(sp + 4);
                CALC_ABS_S16_UNROLL(dabs0, adat0);
                dp[0] = dabs0;
                adat1 = vis_faligndata(curr0, curr1);
                CALC_ABS_S16_UNROLL(dabs1, adat1);
                dp[1] = dabs1;
                adat2 = vis_faligndata(curr1, curr2);
                CALC_ABS_S16_UNROLL(dabs2, adat2);
                dp[2] = dabs2;
                adat3 = vis_faligndata(curr2, prev);
                CALC_ABS_S16_UNROLL(dabs3, adat3);
                dp[3] = dabs3;
                sp += 4;
                dp += 4;
            }

/* remaining 0..3 complete words, one at a time */
#pragma pipeloop(0)
            for (; block < numblocks; block++) {
                READ_PXLS_UNALIGN;
                CALC_ABS_S16;
                STORE_ABS_VALUES;
            }
        }

/* LAST d64 NEEDS EDGE MASK FOR DESTINATION END POINT */

        if ((mlib_addr)dp <= (mlib_addr)dlast) {
            curr0 = *(++sp);

/* edge mask for end point */
            mask0 = vis_edge16(dp, dlast);
            adat0 = vis_faligndata(prev, curr0);
            CALC_ABS_S16;
            vis_pst_16(dabs, dp, mask0);
        }

/* advance to the next src row */
        src += slpxl;

/* advance to the next dst row */
        dst += dlpxl;
    }

    return (MLIB_SUCCESS);
}
/*
 * mlib_v_ImageMulShift_S16
 *
 * Combines two mlib_s16 images pixel by pixel through the
 * MLIB_V_IMAGEMULSHIFT_S16(a, b, result) macro and stores the result in
 * the destination image, four pixels (8 bytes) per step.
 *
 * The macro is defined outside this block.  NOTE(review): by its name it
 * multiplies the operands and applies the shift programmed into the GSR
 * scale factor below, and it presumably uses the otherwise unreferenced
 * locals rdhh, rdhl, rdlh, rdll, rdh and rdl as scratch - confirm against
 * the macro definition.
 *
 * Parameters:
 *   sp1, sp2  - first pixel of source image 1 / 2
 *   dp        - first pixel of the destination image
 *   stride1, stride2, strided
 *             - line strides of the three images; they are added to
 *               mlib_s16 pointers, i.e. they are counted in pixels
 *   width, height - image size in pixels / rows
 *   shift     - shift amount; ((16 - shift) & 0x1f) becomes the GSR
 *               scale factor
 *
 * Five code paths are selected by the relative 8-byte alignment of the
 * three rows so that vis_faligndata() is used only where it is needed.
 * Every path has the same shape: an edge-masked first store when the row
 * does not start on an 8-byte boundary, a loop of full 8-byte stores and
 * an edge-masked last store for the remaining pixels.
 */
void
mlib_v_ImageMulShift_S16(
    mlib_s16 *sp1,
    mlib_s32 stride1,
    mlib_s16 *sp2,
    mlib_s32 stride2,
    mlib_s16 *dp,
    mlib_s32 strided,
    mlib_s32 width,
    mlib_s32 height,
    mlib_s32 shift)
{
/* pointers for line of source1 */
    mlib_s16 *sl1;

/* pointers for line of source2 */
    mlib_s16 *sl2;

/* pointers for line of dst */
    mlib_s16 *dl;
    mlib_s32 offdst, offsrc1, offsrc2, emask;
    mlib_d64 *dpp, *spp2, *spp1, *tmp_ptr;
    mlib_d64 dd, dd0, dd1, sd10, sd11, sd20, sd21;
    mlib_s16 *dend;
    mlib_d64 rdhh, rdhl;
    mlib_d64 rdlh, rdll;
    mlib_d64 rdh, rdl;
    mlib_s32 i, j, k;

/* rows are contiguous in all three images: treat them as one long row */
    if (width == stride1 && width == stride2 && width == strided) {
        width *= height;
        height = 1;
    }

/* initialize GSR scale factor */
    vis_write_gsr(((16 - shift) & 0x1f) << 3);

    sl1 = sp1;
    sl2 = sp2;
    dl = dp;

/* byte offsets of the three start addresses inside an 8-byte word */
    offdst = ((mlib_addr)dp) & 7;
    offsrc1 = ((mlib_addr)sp1) & 7;
    offsrc2 = ((mlib_addr)sp2) & 7;

    if ((offdst == offsrc1) && (offdst == offsrc2) &&
        (((strided ^ stride1) & 3) == 0) &&
        (((strided ^ stride2) & 3) == 0)) {

/* case 1: dst, src1 and src2 are aligned alike on every row */
        for (j = 0; j < height; j++) {

/* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
/* i <= 0: pixel index of the aligned word relative to dp */
            i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);

            dend = dp + width - 1;
/* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

/* partial first word */
            if (emask != 0xf) {
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp++, emask);
                i += 4;
            }

/* full words */
#pragma pipeloop(0)
            for (; i <= width - 4; i += 4) {
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
                (*dpp++) = dd;
            }

/* partial last word */
            if (i < width) {
                emask = vis_edge16(dpp, dend);
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    } else if ((offdst == offsrc1) && (((strided ^ stride1) & 3) == 0)) {

/* case 2: dst and src1 are aligned alike, src2 needs faligndata */
        for (j = 0; j < height; j++) {

/* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
/* last alignaddr call: GSR offset now describes src2 */
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);

            dend = dp + width - 1;
/* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

            sd20 = spp2[0];

/* partial first word */
            if (emask != 0xf) {
                sd10 = (*spp1++);
                sd21 = spp2[1];
                sd20 = vis_faligndata(sd20, sd21);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp++, emask);
                sd20 = sd21;
                spp2++;
                i += 4;
            }

/* full words */
#pragma pipeloop(0)
            for (; i <= width - 4; i += 4) {
                sd10 = (*spp1++);
                sd21 = spp2[1];
                sd20 = vis_faligndata(sd20, sd21);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
                (*dpp++) = dd;
                sd20 = sd21;
                spp2++;
            }

/* partial last word */
            if (i < width) {
                emask = vis_edge16(dpp, dend);
                sd10 = (*spp1++);
                sd20 = vis_faligndata(sd20, spp2[1]);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    } else if ((offdst == offsrc2) && (((strided ^ stride2) & 3) == 0)) {

/* case 3: dst and src2 are aligned alike, src1 needs faligndata */
        for (j = 0; j < height; j++) {

/* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);
/* last alignaddr call: GSR offset now describes src1 */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);

            dend = dp + width - 1;
/* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

            sd10 = spp1[0];

/* partial first word */
            if (emask != 0xf) {
                sd20 = (*spp2++);
                sd11 = spp1[1];
                sd10 = vis_faligndata(sd10, sd11);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp++, emask);
                sd10 = sd11;
                spp1++;
                i += 4;
            }

/* full words */
#pragma pipeloop(0)
            for (; i <= width - 4; i += 4) {
                sd20 = (*spp2++);
                sd11 = spp1[1];
                sd10 = vis_faligndata(sd10, sd11);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
                (*dpp++) = dd;
                sd10 = sd11;
                spp1++;
            }

/* partial last word */
            if (i < width) {
                emask = vis_edge16(dpp, dend);
                sd20 = (*spp2++);
                sd10 = vis_faligndata(sd10, spp1[1]);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    } else if ((offsrc1 == offsrc2) && (((stride1 ^ stride2) & 3) == 0)) {

/*
 * case 4: src1 and src2 are aligned alike but differently from dst;
 * the sources are combined on their own alignment and the RESULT is
 * realigned to the destination with faligndata
 */
        for (j = 0; j < height; j++) {

/* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);

            dend = dp + width - 1;
/* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

/* dd0 always holds the result of the previous source word */
            sd10 = (*spp1++);
            sd20 = (*spp2++);
            MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd0);

/* partial first word */
            if (emask != 0xf) {
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd1);
                dd = vis_faligndata(dd0, dd1);
                vis_pst_16(dd, dpp++, emask);
                dd0 = dd1;
                i += 4;
            }

/* full words */
#pragma pipeloop(0)
            for (; i <= width - 4; i += 4) {
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd1);
                (*dpp++) = vis_faligndata(dd0, dd1);
                dd0 = dd1;
            }

/* partial last word */
            if (i < width) {
                emask = vis_edge16(dpp, dend);
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd1);
                dd = vis_faligndata(dd0, dd1);
                vis_pst_16(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    } else {

/*
 * case 5: all three alignments differ.  Only one alignment offset can be
 * active in the GSR at a time, so each row is done in two passes: first
 * src1 is realigned and parked in dst, then src2 is realigned and
 * combined with the parked data.
 */
        for (j = 0; j < height; j++) {

/* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_s16 *)dpp - dp;

            dend = dp + width - 1;
/* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

/* partial first word: both sources realigned individually */
            if (emask != 0xf) {
                spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
                sd10 = vis_faligndata(spp1[0], spp1[1]);
                spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);
                sd20 = vis_faligndata(spp2[0], spp2[1]);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp++, emask);
                i += 4;
            }

/* pass 1: copy realigned src1 to dst */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
            sd11 = spp1[0];
            tmp_ptr = dpp;

#pragma pipeloop(0)
            for (k = i; k <= (width - 4); k += 4) {
                sd10 = sd11;
                sd11 = spp1[1];
                (*tmp_ptr++) = vis_faligndata(sd10, sd11);
                spp1++;
            }

/*
 * realigned src1 data for the partial last word; it is kept in a
 * register because the tail of dst must not be overwritten
 */
            sd11 = vis_faligndata(sd11, spp1[1]);

/* pass 2: realign src2 and combine it with the data parked in dst */
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);
            sd20 = spp2[0];
            tmp_ptr = dpp;

#pragma pipeloop(0)
            for (; i <= width - 4; i += 4) {
                sd10 = (*tmp_ptr++);
                sd21 = spp2[1];
                sd20 = vis_faligndata(sd20, sd21);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
                (*dpp++) = dd;
                sd20 = sd21;
                spp2++;
            }

/* partial last word, using the src1 data saved in sd11 */
            if (i < width) {
                emask = vis_edge16(dpp, dend);
                sd20 = vis_faligndata(sd20, spp2[1]);
                MLIB_V_IMAGEMULSHIFT_S16(sd11, sd20, dd);
                vis_pst_16(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    }
}
/*
 * mlib_v_ImageLookUpSI_U16_U8_2_DstA8D1
 *
 * Single-input lookup: every mlib_u16 source value indexes both
 * table[0] and table[1], producing two mlib_u8 results per source
 * pixel.  Four source pixels therefore fill one 8-byte destination word.
 *
 * `dst` is used directly as an mlib_d64 pointer.  NOTE(review): the
 * "DstA8" suffix suggests the caller guarantees 8-byte alignment -
 * confirm.
 *
 * The accumulator is only ever filled by shifting bytes in with
 * vis_faligndata(), so it is deliberately not initialized.
 */
void
mlib_v_ImageLookUpSI_U16_U8_2_DstA8D1(const mlib_u16 *src,
    mlib_u8 *dst, mlib_s32 xsize, const mlib_u8 **table)
{
    /* lookup tables of the two output channels */
    const mlib_u8 *chan0 = table[0];
    const mlib_u8 *chan1 = table[1];
    /* read cursor over the source pixels */
    mlib_u16 *in = (void *)src;
    /* destination seen as pairs of bytes (one pair per source pixel) */
    mlib_u16 *pair0 = (mlib_u16 *)dst;
    /* address of the last destination pair */
    mlib_u16 *last = pair0 + xsize - 1;
    /* write cursor, one 8-byte word (four pairs) at a time */
    mlib_d64 *out = (mlib_d64 *)pair0;
    /* source values of the group being looked up */
    mlib_s32 p0, p1, p2, p3;
    /* the two bytes fetched for one pixel */
    mlib_d64 b0, b1;
    /* byte accumulator for one destination word */
    mlib_d64 word;
    mlib_s32 k, left;

    /* alignment offset 7: each vis_faligndata() shifts in one byte */
    vis_alignaddr((void *)0, 7);

    if (xsize >= 4) {
        /* pre-load the first group */
        p0 = in[0];
        p1 = in[1];
        p2 = in[2];
        p3 = in[3];
        in += 4;

#pragma pipeloop(0)
        for (k = 0; k <= xsize - 8; k += 4, in += 4) {
            /* bytes are shifted in last-to-first: pixel 3 ... pixel 0 */
            b1 = VIS_LD_U8_I(chan1, p3);
            b0 = VIS_LD_U8_I(chan0, p3);
            word = vis_faligndata(b1, word);
            word = vis_faligndata(b0, word);

            b1 = VIS_LD_U8_I(chan1, p2);
            b0 = VIS_LD_U8_I(chan0, p2);
            word = vis_faligndata(b1, word);
            word = vis_faligndata(b0, word);

            b1 = VIS_LD_U8_I(chan1, p1);
            b0 = VIS_LD_U8_I(chan0, p1);
            word = vis_faligndata(b1, word);
            word = vis_faligndata(b0, word);

            b1 = VIS_LD_U8_I(chan1, p0);
            b0 = VIS_LD_U8_I(chan0, p0);
            word = vis_faligndata(b1, word);
            word = vis_faligndata(b0, word);

            /* fetch the following group */
            p0 = in[0];
            p1 = in[1];
            p2 = in[2];
            p3 = in[3];

            *out++ = word;
        }

        /* last complete group, whose source values are already loaded */
        b1 = VIS_LD_U8_I(chan1, p3);
        b0 = VIS_LD_U8_I(chan0, p3);
        word = vis_faligndata(b1, word);
        word = vis_faligndata(b0, word);

        b1 = VIS_LD_U8_I(chan1, p2);
        b0 = VIS_LD_U8_I(chan0, p2);
        word = vis_faligndata(b1, word);
        word = vis_faligndata(b0, word);

        b1 = VIS_LD_U8_I(chan1, p1);
        b0 = VIS_LD_U8_I(chan0, p1);
        word = vis_faligndata(b1, word);
        word = vis_faligndata(b0, word);

        b1 = VIS_LD_U8_I(chan1, p0);
        b0 = VIS_LD_U8_I(chan0, p0);
        word = vis_faligndata(b1, word);
        word = vis_faligndata(b0, word);

        *out++ = word;
    }

    if ((mlib_addr)out <= (mlib_addr)last) {
        /* remaining pixels: walk them backwards so they shift into place */
        left = (mlib_u16 *)last - (mlib_u16 *)out;
        in += left;
        left++;

#pragma pipeloop(0)
        for (k = left; k > 0; k--) {
            p0 = (mlib_s32)*in;
            in--;
            b1 = VIS_LD_U8_I(chan1, p0);
            word = vis_faligndata(b1, word);
            b0 = VIS_LD_U8_I(chan0, p0);
            word = vis_faligndata(b0, word);
        }

        /* partial store limited to the pairs that really exist */
        vis_pst_16(word, out, vis_edge16(out, last));
    }
}
/*
 * __mlib_VectorAdd_S16_S16_Sat
 *
 * Adds the n-element mlib_s16 vectors x and y into z, four elements
 * (8 bytes) per step.
 *
 * The addition and the store are done by the ADD16_SAT macro, which is
 * defined outside this block.  NOTE(review): by its name it performs a
 * saturating 16-bit add; it presumably reads dx, dy and emask, stores
 * through dpz and uses dz, dr0..dr2, mask1, mask2, ovl, und, fzero,
 * const_ovl and const_und (none of which are referenced directly below)
 * - confirm against the macro definition before renaming anything.
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * Structure:
 *   1. if z is not 8-byte aligned, one edge-masked step covers the
 *      elements up to the next 8-byte boundary;
 *   2. the main loop runs over even_8 full words, with one variant per
 *      alignment combination of the remaining x and y;
 *   3. one final edge-masked step covers the rest_8 (1..3) trailing
 *      elements.
 * Every variant of step 2 leaves dx1/dy1 and dpx/dpy in the state that
 * the shared tail code of step 3 expects.
 */
mlib_status
__mlib_VectorAdd_S16_S16_Sat(
    mlib_s16 *z,
    const mlib_s16 *x,
    const mlib_s16 *y,
    mlib_s32 n)
{
    mlib_d64 *dpz, *dpx, *dpy;
    mlib_d64 dx, dy, dz, dx0, dx1, dy0, dy1, dr0, dr1, dr2;
    mlib_s16 *pz, *px, *py, *pzend;

/* byte offset from z back to its aligned address (zero or negative) */
    mlib_s32 off;

/* edge masks */
    mlib_s32 emask;
    mlib_s32 mask1, mask2;
    mlib_s32 ovl, und;
    mlib_d64 fzero = vis_fzero();
    mlib_d64 const_ovl = vis_to_double_dup(0x7fff7fff);
    mlib_d64 const_und = vis_fnot(const_ovl);
    mlib_s32 len = n, i;

/* remainder and length in terms of 8-byte words */
    mlib_s32 rest_8, even_8;

    if (n <= 0)
        return (MLIB_FAILURE);

    px = (mlib_s16 *)x;
    py = (mlib_s16 *)y;
    pz = (mlib_s16 *)z;

    dpz = (mlib_d64 *)((mlib_addr)z & (~7));
    off = (long)dpz - (long)z;
    pzend = pz + n - 1;

/*
 * generate edge mask for the start point
 */
    emask = vis_edge16(pz, pzend);

/*
 * prepare the destination address: when z is unaligned, handle the
 * first partial word with both sources shifted back by the same offset
 */
    if (off) {
        dpy = (mlib_d64 *)vis_alignaddr(py, off);
        dy0 = vis_ld_d64_nf(dpy);
        dy1 = vis_ld_d64_nf(dpy + 1);
        dy = vis_faligndata(dy0, dy1);
        dpx = (mlib_d64 *)vis_alignaddr(px, off);
        dx0 = vis_ld_d64_nf(dpx);
        dx1 = vis_ld_d64_nf(dpx + 1);
        dx = vis_faligndata(dx0, dx1);
        ADD16_SAT;

/* (8 + off) >> 1 is the number of elements in that first word */
        px += (8 + off) >> 1;
        py += (8 + off) >> 1;
        len -= (8 + off) >> 1;
        dpz++;
    }

    if (len <= 0)
        return (MLIB_SUCCESS);

    even_8 = len >> 2;
    rest_8 = len & 0x3;
/* full-word mask for the main loops */
    emask = 0xf;

/*
 * Now try to analyze source "x" and "y" addresses.
 */
    if ((!((mlib_addr)px & 7)) && (!((mlib_addr)py & 7))) {

/*
 * Both addresses are 8-byte aligned. No vis_alignaddr
 * and vis_faligndata at all.
 */
        dpx = (mlib_d64 *)px;
        dpy = (mlib_d64 *)py;

        dx = vis_ld_d64_nf(dpx);
        dpx++;
        dy = vis_ld_d64_nf(dpy);
        dpy++;
        dx1 = vis_ld_d64_nf(dpx);
        dy1 = vis_ld_d64_nf(dpy);

/* dx/dy: current word, dx1/dy1: word loaded one step ahead */
        for (i = 0; i < even_8; i++) {
            ADD16_SAT;
            dx = dx1;
            dy = dy1;
            dpx++;
            dpy++;
            dpz++;
            dx1 = vis_ld_d64_nf(dpx);
            dy1 = vis_ld_d64_nf(dpy);
        }

/* hand the unprocessed word over to the tail code */
        dx1 = dx;
        dy1 = dy;
    } else if ((!((mlib_addr)px & 7))) {

/*
 * First ("x") address is 8-byte aligned. vis_alignaddr
 * and vis_faligndata only for "y".
 */
#pragma unroll(1)
/* NOTE(review): this pragma is not directly in front of a loop - confirm it has the intended effect */
        dpx = (mlib_d64 *)px;
        dx = vis_ld_d64_nf(dpx);
        dpx++;
        dpy = vis_alignaddr(py, 0);
        dy0 = vis_ld_d64_nf(dpy);
        dpy++;
        dy1 = vis_ld_d64_nf(dpy);
        dy = vis_faligndata(dy0, dy1);

        for (i = 0; i < even_8; i++) {
            ADD16_SAT;
            dx = vis_ld_d64_nf(dpx);
            dy0 = dy1;
            dy1 = vis_ld_d64_nf(dpy + 1);
            dy = vis_faligndata(dy0, dy1);
            dpz++;
            dpx++;
            dpy++;
        }

/* hand over: the tail reloads the second y word from dpy */
        dx1 = dx;
        dy1 = dy0;
    } else if ((!((mlib_addr)py & 7))) {

/*
 * Second ("y") address is 8-byte aligned. vis_alignaddr
 * and vis_faligndata only for "x".
 */
        dpy = (mlib_d64 *)py;
        dpx = vis_alignaddr(px, 0);
        dx1 = vis_ld_d64_nf(dpx);
        dpx++;

        for (i = 0; i < even_8; i++) {
            dy = *dpy;
            dx0 = dx1;
            dx1 = vis_ld_d64_nf(dpx);
            dx = vis_faligndata(dx0, dx1);
            ADD16_SAT;
            dpx++;
            dpy++;
            dpz++;
        }

/* pre-load the y word used by the tail code */
        dy1 = vis_ld_d64_nf(dpy);
        dpy++;
#pragma unroll(8)
/* NOTE(review): this pragma is not directly in front of a loop - confirm it has the intended effect */
    } else if (((mlib_addr)px & 7) == ((mlib_addr)py & 7)) {

/*
 * Both ("x" and "y") address are identically aligned.
 * There are 1 vis_alignaddr and 2 vis_faligndata(s) in the loop.
 */
        dpx = vis_alignaddr(px, 0);
        dx1 = vis_ld_d64_nf(dpx);
        dpx++;
        dpy = vis_alignaddr(py, 0);
        dy1 = vis_ld_d64_nf(dpy);
        dpy++;

        for (i = 0; i < even_8; i++) {
            dy0 = dy1;
            dy1 = vis_ld_d64_nf(dpy);
            dpy++;
            dy = vis_faligndata(dy0, dy1);
            dx0 = dx1;
            dx1 = vis_ld_d64_nf(dpx);
            dpx++;
            dx = vis_faligndata(dx0, dx1);
            ADD16_SAT;
            dpz++;
        }
    } else {

/*
 * Both ("x" and "y") address are arbitrary aligned.
 * 2 vis_alignaddr(s) and 2 vis_faligndata(s) in the loop.
 */
        dpy = vis_alignaddr(py, 0);
        dy0 = vis_ld_d64_nf(dpy);
        dpy++;
        dy1 = vis_ld_d64_nf(dpy);
        dy = vis_faligndata(dy0, dy1);
        dy0 = dy1;
        dy1 = vis_ld_d64_nf(dpy + 1);
        dpx = vis_alignaddr(px, 0);
        dx0 = vis_ld_d64_nf(dpx);
        dpx++;
        dx1 = vis_ld_d64_nf(dpx);
        dx = vis_faligndata(dx0, dx1);
        dx0 = dx1;
        dx1 = vis_ld_d64_nf(dpx + 1);

/* the GSR offset is switched between y and x inside every iteration */
        for (i = 0; i < even_8; i++) {
            ADD16_SAT;
            vis_alignaddr(py, (mlib_addr)dpy);
            dy = vis_faligndata(dy0, dy1);
            vis_alignaddr(px, (mlib_addr)dpx);
            dx = vis_faligndata(dx0, dx1);
            dpz++;
            dpy++;
            dpx++;
            dy0 = dy1;
            dy1 = vis_ld_d64_nf(dpy + 1);
            dx0 = dx1;
            dx1 = vis_ld_d64_nf(dpx + 1);
        }

/* hand over: reload the words just before dpx/dpy for the tail code */
        dx1 = dpx[-1];
        dy1 = dpy[-1];
    }

    if (!rest_8)
        return (MLIB_SUCCESS);

/*
 * prepare edge mask for the last bytes.
 * NOTE(review): the first vis_edge16 argument is the byte count
 * rest_8 * 2 cast to a pointer, not a real address; this presumably
 * relies on vis_edge16 looking only at the low address bits - confirm.
 */
    emask = ~(vis_edge16((void *)(rest_8 << 1), pzend));

/* tail: rebuild the last x and y words from the handed-over state */
    vis_alignaddr(px, 0);
    dx0 = dx1;
    dx1 = vis_ld_d64_nf(dpx);
    dx = vis_faligndata(dx0, dx1);

    vis_alignaddr(py, 0);
    dy0 = dy1;
    dy1 = vis_ld_d64_nf(dpy);
    dy = vis_faligndata(dy0, dy1);

    ADD16_SAT;

    return (MLIB_SUCCESS);
}