/*
 * Split an interleaved two-channel S16 stream into two planes.
 *
 *   color1, color2  destination planes, n elements each
 *   colors          interleaved source, 2 * n elements
 *   n               number of pixels (channel pairs)
 *
 * All three pointers are accessed through mlib_d64 pointers, so they are
 * presumably expected to be 8-byte aligned - TODO confirm against the API
 * documentation.  The first source doubleword is read unconditionally,
 * even when n is 0.
 *
 * Returns MLIB_SUCCESS.
 */
mlib_status
__mlib_VideoColorSplit2_S16(
    mlib_s16 *color1,
    mlib_s16 *color2,
    const mlib_s16 *colors,
    mlib_s32 n)
{
    mlib_d64 *sp = (mlib_d64 *)colors;
    mlib_d64 *dp0 = (mlib_d64 *)color1;
    mlib_d64 *dp1 = (mlib_d64 *)color2;
    mlib_d64 sd0, sd1, dd0, dd1, dd2, dd3;
    mlib_s32 i;

    /*
     * 64-bit GSR write: the upper word is the byte mask consumed by
     * vis_bshuffle (bytes 0,1,4,5,8,9,c,d = every other 16-bit element),
     * the low bits select a 2-byte offset for vis_faligndata.
     */
    vis_write_gsr64(((mlib_u64)0x014589cd << 32) | 2);

    /*
     * Prime the pipeline: shuffle pixels 0..3 and pre-load/rotate
     * pixels 4..7.  Loads past the first doubleword use the
     * non-faulting form.
     */
    sd0 = sp[0];
    sd1 = vis_ld_d64_nf(sp + 1);
    dd0 = vis_faligndata(sd0, sd1);
    dd1 = vis_faligndata(sd1, sd0);
    dd2 = vis_bshuffle(sd0, sd1);
    dd3 = vis_bshuffle(dd0, dd1);
    sd0 = vis_ld_d64_nf(sp + 2);
    sd1 = vis_ld_d64_nf(sp + 3);
    dd0 = vis_faligndata(sd0, sd1);
    dd1 = vis_faligndata(sd1, sd0);

    /*
     * 4-pixel loop: store the result prepared by the previous step while
     * preparing the next one.
     */
#pragma pipeloop(0)
    for (i = 0; i < (n / 4); i++) {
        (*dp0++) = dd2;
        (*dp1++) = dd3;
        dd2 = vis_bshuffle(sd0, sd1);
        dd3 = vis_bshuffle(dd0, dd1);
        sd0 = vis_ld_d64_nf(sp + 4);
        sd1 = vis_ld_d64_nf(sp + 5);
        dd0 = vis_faligndata(sd0, sd1);
        dd1 = vis_faligndata(sd1, sd0);
        sp += 2;
    }

    /*
     * last 1..3 pixels: recompute from sp and store only the leading
     * (n & 3) elements of each plane through a partial store
     */
    if (n & 3) {
        mlib_s32 emask = 0xF0 >> (n & 3);

        sd0 = sp[0];
        sd1 = vis_ld_d64_nf(sp + 1);
        dd0 = vis_faligndata(sd0, sd1);
        dd1 = vis_faligndata(sd1, sd0);
        dd2 = vis_bshuffle(sd0, sd1);
        dd3 = vis_bshuffle(dd0, dd1);
        vis_pst_16(dd2, (mlib_f32 *)dp0, emask);
        vis_pst_16(dd3, (mlib_f32 *)dp1, emask);
    }

    /*
     * The function is declared to return mlib_status; report success the
     * same way the sibling kernels do.
     */
    return (MLIB_SUCCESS);
}
/*
 * Nearest-neighbour upsampling of one S16 row into two output rows.
 *
 * Every source sample is written twice in a row, and the same doubled
 * row is stored to both dst0 and dst1 (2 * n elements each).
 *
 *   dst0, dst1  destination rows, accessed as mlib_d64
 *   src         source row, n elements, accessed as mlib_d64
 *   n           number of source samples
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_VideoUpSample420_Nearest_S16(
    mlib_s16 *dst0,
    mlib_s16 *dst1,
    const mlib_s16 *src,
    mlib_s32 n)
{
    mlib_d64 *in = (mlib_d64 *)src;
    mlib_d64 *out0 = (mlib_d64 *)dst0;
    mlib_d64 *out1 = (mlib_d64 *)dst1;
    /* last valid element of the first destination row */
    mlib_s16 *last = dst0 + 2 * n - 1;
    mlib_d64 quad, mix, planes, msb2, lsb2, dup;
    mlib_s32 mask, left;

    if (n <= 0)
        return (MLIB_FAILURE);

    /* whole groups of four source samples -> eight output samples */
#pragma pipeloop(0)
    for (left = n; left >= 4; left -= 4) {
        quad = *in++;

        /* two byte merges regroup the bytes of the four samples ... */
        mix = vis_fpmerge(vis_read_hi(quad), vis_read_lo(quad));
        planes = vis_fpmerge(vis_read_hi(mix), vis_read_lo(mix));
        /* ... then each half is merged with itself to double every byte */
        lsb2 = vis_fpmerge(vis_read_lo(planes), vis_read_lo(planes));
        msb2 = vis_fpmerge(vis_read_hi(planes), vis_read_hi(planes));

        dup = vis_fpmerge(vis_read_hi(msb2), vis_read_hi(lsb2));
        out1[0] = dup;
        out0[0] = dup;

        dup = vis_fpmerge(vis_read_lo(msb2), vis_read_lo(lsb2));
        out1[1] = dup;
        out0[1] = dup;

        out0 += 2;
        out1 += 2;
    }

    /* nothing left once the write pointer has passed the last element */
    if ((mlib_s16 *)out0 > last)
        return (MLIB_SUCCESS);

    /* remaining 1..3 source samples: same shuffle, edge-masked stores */
    quad = in[0];
    mix = vis_fpmerge(vis_read_hi(quad), vis_read_lo(quad));
    planes = vis_fpmerge(vis_read_hi(mix), vis_read_lo(mix));
    lsb2 = vis_fpmerge(vis_read_lo(planes), vis_read_lo(planes));
    msb2 = vis_fpmerge(vis_read_hi(planes), vis_read_hi(planes));

    dup = vis_fpmerge(vis_read_hi(msb2), vis_read_hi(lsb2));
    mask = vis_edge16(out0, last);
    vis_pst_16(dup, out0, mask);
    vis_pst_16(dup, out1, mask);
    out0++;
    out1++;

    if ((mlib_s16 *)out0 <= last) {
        dup = vis_fpmerge(vis_read_lo(msb2), vis_read_lo(lsb2));
        mask = vis_edge16(out0, last);
        vis_pst_16(dup, out0, mask);
        vis_pst_16(dup, out1, mask);
    }

    return (MLIB_SUCCESS);
}
/*
 * Split an interleaved three-channel S16 stream into three planes.
 *
 *   color1, color2, color3  destination planes, n elements each
 *   colors                  interleaved source, 3 * n elements
 *   n                       number of pixels (channel triples)
 *
 * All pointers are accessed through mlib_d64 pointers, so they are
 * presumably expected to be 8-byte aligned - TODO confirm against the API
 * documentation.
 *
 * Returns MLIB_SUCCESS.
 */
mlib_status
__mlib_VideoColorSplit3_S16(
    mlib_s16 *color1,
    mlib_s16 *color2,
    mlib_s16 *color3,
    const mlib_s16 *colors,
    mlib_s32 n)
{
    mlib_d64 *sp = (mlib_d64 *)colors;
    mlib_d64 *dp0 = (mlib_d64 *)color1;
    mlib_d64 *dp1 = (mlib_d64 *)color2;
    mlib_d64 *dp2 = (mlib_d64 *)color3;
    mlib_d64 sd0, sd1, sd2, dd0, dd1, dd2, dd3;
    mlib_s32 i;

    /* state consumed by vis_faligndata and vis_bshuffle below */
    vis_write_gsr(4);
    vis_write_bmask(0x02CE13DF, 0);

    /* 4-pixel loop: three source doublewords in, one per plane out */
#pragma pipeloop(0)
#pragma unroll(4)
    for (i = 0; i <= (n - 4); i += 4) {
        sd0 = sp[0];
        sd1 = sp[1];
        sd2 = sp[2];
        dd1 = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2));
        dd0 = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1));
        (*dp0++) = vis_bshuffle(dd0, dd1);
        dd2 = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2));
        dd3 = vis_faligndata(dd0, dd2);
        (*dp1++) = vis_bshuffle(dd3, dd3);
        (*dp2++) = vis_bshuffle(dd1, dd2);
        sp += 3;
    }

    /*
     * last 1..3 pixels: same shuffle, but the second and third source
     * doublewords are fetched with non-faulting loads and only the
     * leading (n & 3) elements of each plane are stored
     */
    if (i < n) {
        mlib_s32 emask = 0xF0 >> (n & 3);
        mlib_d64 st0, st1, st2;

        sd0 = sp[0];
        sd1 = vis_ld_d64_nf(sp + 1);
        sd2 = vis_ld_d64_nf(sp + 2);
        dd1 = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2));
        dd0 = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1));
        st0 = vis_bshuffle(dd0, dd1);
        dd2 = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2));
        dd3 = vis_faligndata(dd0, dd2);
        st1 = vis_bshuffle(dd3, dd3);
        st2 = vis_bshuffle(dd1, dd2);
        vis_pst_16(st0, dp0, emask);
        vis_pst_16(st1, dp1, emask);
        vis_pst_16(st2, dp2, emask);
    }

    /*
     * The function is declared to return mlib_status; report success the
     * same way the sibling kernels do.
     */
    return (MLIB_SUCCESS);
}
/*
 * Compute a norm of an S16 vector and store it in z[0].
 *
 *   z  output, z[0] receives mlib_sqrt(ds + ds1)
 *   x  input vector of n mlib_s16 elements (any alignment)
 *   n  number of elements
 *
 * The accumulation itself is done by the NORM16 macro, which is defined
 * outside this block; it presumably adds the squares of the four 16-bit
 * lanes of dx into ds / ds1 using dr / dr1 as scratch - TODO confirm.
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_VectorNorm_S16_Sat(
    mlib_d64 *z,
    const mlib_s16 *x,
    mlib_s32 n)
{
    mlib_s16 *px = (mlib_s16 *)x;
    /* last element of the vector */
    mlib_s16 *pxend;
    /* 8-byte aligned doublewords containing the first / last element */
    mlib_d64 *dpx, *dpxend;
    mlib_d64 dx, ds, ds1;
    /* zeroed scratch used to blank out lanes outside the vector */
    mlib_d64 edge[2];
    /* not referenced directly here; presumably used by NORM16 */
    type_union_mlib_d64 dr, dr1;
    /* number of doublewords between the first and the last one */
    mlib_s32 d_left;
    mlib_u8 emask;

    edge[0] = edge[1] = 0;

    if (n <= 0)
        return (MLIB_FAILURE);

    ds = ds1 = 0;

    /* round the start address down to a doubleword boundary */
    dpx = (mlib_d64 *)((mlib_addr)px & (~7));
    pxend = px + n - 1;

    /*
     * First doubleword: copy only the lanes selected by the edge mask
     * into the zeroed scratch, so lanes before px contribute zero.
     */
    emask = vis_edge16(px, pxend);
    vis_pst_16(dpx[0], edge, emask);
    dx = edge[0];

    dpxend = (mlib_d64 *)((mlib_addr)pxend & (~7));
    d_left = dpxend - dpx;

    /* accumulate the current doubleword, then fetch the next one */
    for (; d_left > 0; d_left--) {
        NORM16;
        dpx++;
        dx = dpx[0];
    }

    /* last doubleword: mask off lanes beyond pxend before accumulating */
    if ((mlib_addr)dpx <= (mlib_addr)pxend) {
        emask = vis_edge16(dpx, pxend);
        vis_pst_16(dx, edge + 1, emask);
        dx = edge[1];
        NORM16;
    }

    z[0] = mlib_sqrt(ds + ds1);
    return (MLIB_SUCCESS);
}
/*
 * Element-wise combination of two S16 images through the
 * MLIB_V_IMAGEMULSHIFT_S16 macro, written to a third image.
 *
 *   sp1, stride1   first source and its line stride
 *   sp2, stride2   second source and its line stride
 *   dp, strided    destination and its line stride
 *   width, height  size of the processed area, in mlib_s16 elements / lines
 *   shift          used to program the GSR scale factor as (16 - shift)
 *
 * Strides are counted in mlib_s16 elements (they are added directly to
 * mlib_s16 pointers).  MLIB_V_IMAGEMULSHIFT_S16(a, b, d) is defined
 * outside this block; judging by its name and the GSR setup it presumably
 * multiplies the lanes of a and b and scales the product into d, using
 * rdhh/rdhl/rdlh/rdll/rdh/rdl as temporaries - TODO confirm.
 *
 * Five code paths are selected by the relative 8-byte alignment of the
 * three buffers, so that as many of them as possible can be accessed with
 * plain aligned doubleword loads/stores.
 */
void
mlib_v_ImageMulShift_S16(
    mlib_s16 *sp1,
    mlib_s32 stride1,
    mlib_s16 *sp2,
    mlib_s32 stride2,
    mlib_s16 *dp,
    mlib_s32 strided,
    mlib_s32 width,
    mlib_s32 height,
    mlib_s32 shift)
{
/* pointers for line of source1 */
    mlib_s16 *sl1;

/* pointers for line of source2 */
    mlib_s16 *sl2;

/* pointers for line of dst */
    mlib_s16 *dl;
    mlib_s32 offdst, offsrc1, offsrc2, emask;
    mlib_d64 *dpp, *spp2, *spp1, *tmp_ptr;
    mlib_d64 dd, dd0, dd1, sd10, sd11, sd20, sd21;
    mlib_s16 *dend;
    /* not referenced directly below; presumably temporaries of the macro */
    mlib_d64 rdhh, rdhl;
    mlib_d64 rdlh, rdll;
    mlib_d64 rdh, rdl;
    mlib_s32 i, j, k;

    /* lines are contiguous in all three images: process as one long line */
    if (width == stride1 && width == stride2 && width == strided) {
        width *= height;
        height = 1;
    }

/* initialize GSR scale factor */
    vis_write_gsr(((16 - shift) & 0x1f) << 3);

    sl1 = sp1;
    sl2 = sp2;
    dl = dp;

    /* byte offsets of each buffer inside its 8-byte doubleword */
    offdst = ((mlib_addr)dp) & 7;
    offsrc1 = ((mlib_addr)sp1) & 7;
    offsrc2 = ((mlib_addr)sp2) & 7;

    /*
     * Case 1: dst, src1 and src2 share the same alignment on every line
     * (strides are equal modulo 4 elements = 8 bytes).
     */
    if ((offdst == offsrc1) && (offdst == offsrc2) &&
        (((strided ^ stride1) & 3) == 0) &&
        (((strided ^ stride2) & 3) == 0)) {

        for (j = 0; j < height; j++) {

/* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            /* i <= 0: element index of the aligned start relative to dp */
            i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);

            dend = dp + width - 1;
/* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

            /* partial first doubleword */
            if (emask != 0xf) {
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp++, emask);
                i += 4;
            }

            /* full doublewords */
#pragma pipeloop(0)
            for (; i <= width - 4; i += 4) {
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
                (*dpp++) = dd;
            }

            /* partial last doubleword */
            if (i < width) {
                emask = vis_edge16(dpp, dend);
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp, emask);
            }

            /* advance all three line pointers */
            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    }

    /*
     * Case 2: dst and src1 share alignment; src2 is re-aligned with
     * vis_faligndata.
     */
    else if ((offdst == offsrc1) && (((strided ^ stride1) & 3) == 0)) {

        for (j = 0; j < height; j++) {

/* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
            /* last vis_alignaddr call: GSR offset now describes src2 */
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);

            dend = dp + width - 1;
/* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

            sd20 = spp2[0];

            if (emask != 0xf) {
                sd10 = (*spp1++);
                sd21 = spp2[1];
                sd20 = vis_faligndata(sd20, sd21);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp++, emask);
                sd20 = sd21;
                spp2++;
                i += 4;
            }

#pragma pipeloop(0)
            for (; i <= width - 4; i += 4) {
                sd10 = (*spp1++);
                sd21 = spp2[1];
                sd20 = vis_faligndata(sd20, sd21);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
                (*dpp++) = dd;
                sd20 = sd21;
                spp2++;
            }

            if (i < width) {
                emask = vis_edge16(dpp, dend);
                sd10 = (*spp1++);
                sd20 = vis_faligndata(sd20, spp2[1]);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    }

    /*
     * Case 3: dst and src2 share alignment; src1 is re-aligned with
     * vis_faligndata (mirror image of case 2).
     */
    else if ((offdst == offsrc2) && (((strided ^ stride2) & 3) == 0)) {

        for (j = 0; j < height; j++) {

/* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);
            /* last vis_alignaddr call: GSR offset now describes src1 */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);

            dend = dp + width - 1;
/* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

            sd10 = spp1[0];

            if (emask != 0xf) {
                sd20 = (*spp2++);
                sd11 = spp1[1];
                sd10 = vis_faligndata(sd10, sd11);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp++, emask);
                sd10 = sd11;
                spp1++;
                i += 4;
            }

#pragma pipeloop(0)
            for (; i <= width - 4; i += 4) {
                sd20 = (*spp2++);
                sd11 = spp1[1];
                sd10 = vis_faligndata(sd10, sd11);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
                (*dpp++) = dd;
                sd10 = sd11;
                spp1++;
            }

            if (i < width) {
                emask = vis_edge16(dpp, dend);
                sd20 = (*spp2++);
                sd10 = vis_faligndata(sd10, spp1[1]);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    }

    /*
     * Case 4: the two sources share alignment but dst differs.  The
     * sources are combined doubleword by doubleword and the *results* are
     * re-aligned to dst with vis_faligndata.
     */
    else if ((offsrc1 == offsrc2) && (((stride1 ^ stride2) & 3) == 0)) {

        for (j = 0; j < height; j++) {

/* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);

            dend = dp + width - 1;
/* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

            /* dd0 always holds the result for the previous doubleword */
            sd10 = (*spp1++);
            sd20 = (*spp2++);
            MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd0);

            if (emask != 0xf) {
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd1);
                dd = vis_faligndata(dd0, dd1);
                vis_pst_16(dd, dpp++, emask);
                dd0 = dd1;
                i += 4;
            }

#pragma pipeloop(0)
            for (; i <= width - 4; i += 4) {
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd1);
                (*dpp++) = vis_faligndata(dd0, dd1);
                dd0 = dd1;
            }

            if (i < width) {
                emask = vis_edge16(dpp, dend);
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd1);
                dd = vis_faligndata(dd0, dd1);
                vis_pst_16(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    }

    /*
     * Case 5 (general): all three alignments differ.  Only one alignment
     * offset can be active in the GSR at a time, so src1 is first copied,
     * re-aligned, into the destination line, and a second pass combines
     * that copy with the re-aligned src2.
     */
    else {

        for (j = 0; j < height; j++) {

/* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_s16 *)dpp - dp;

            dend = dp + width - 1;
/* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

            if (emask != 0xf) {
                spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
                sd10 = vis_faligndata(spp1[0], spp1[1]);
                spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);
                sd20 = vis_faligndata(spp2[0], spp2[1]);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp++, emask);
                i += 4;
            }

/* copy src1 to dst */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
            sd11 = spp1[0];
            tmp_ptr = dpp;

#pragma pipeloop(0)
            for (k = i; k <= (width - 4); k += 4) {
                sd10 = sd11;
                sd11 = spp1[1];
                (*tmp_ptr++) = vis_faligndata(sd10, sd11);
                spp1++;
            }

            /* src1 data for the partial last doubleword, kept in sd11 */
            sd11 = vis_faligndata(sd11, spp1[1]);

            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);
            sd20 = spp2[0];
            tmp_ptr = dpp;

            /* second pass: read back the src1 copy, combine with src2 */
#pragma pipeloop(0)
            for (; i <= width - 4; i += 4) {
                sd10 = (*tmp_ptr++);
                sd21 = spp2[1];
                sd20 = vis_faligndata(sd20, sd21);
                MLIB_V_IMAGEMULSHIFT_S16(sd10, sd20, dd);
                (*dpp++) = dd;
                sd20 = sd21;
                spp2++;
            }

            if (i < width) {
                emask = vis_edge16(dpp, dend);
                sd20 = vis_faligndata(sd20, spp2[1]);
                MLIB_V_IMAGEMULSHIFT_S16(sd11, sd20, dd);
                vis_pst_16(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    }
}
/*
 * Table look-up of one line of S32 indices into S16 values, cycling
 * through three tables (element k uses table (k % 3)).
 *
 *   src                     source indices, xsize elements
 *   dst                     destination, xsize elements; accessed through
 *                           an mlib_d64 pointer, so presumably 8-byte
 *                           aligned - TODO confirm with the callers
 *   xsize                   number of elements
 *   table0, table1, table2  look-up tables for the three channels
 *
 * Four results are assembled per doubleword: each looked-up value is
 * shifted into the front of acc0 with vis_faligndata (GSR offset 6), so
 * values are inserted in reverse order, last element first.
 */
void
mlib_v_ImageLookUp_S32_S16_3_D1(
    mlib_s32 *src,
    mlib_s16 *dst,
    mlib_s32 xsize,
    mlib_s16 *table0,
    mlib_s16 *table1,
    mlib_s16 *table2)
{
    mlib_s32 *sp;               /* pointer to source data */
    mlib_s32 s0, s1, s2, s3;    /* source data */
    mlib_s16 *dl;               /* pointer to start of destination */
    mlib_s16 *dend;             /* pointer to end of destination */
    mlib_d64 *dp;               /* aligned pointer to destination */
    mlib_d64 t0, t1, t2, t3;    /* destination data */
    /*
     * Accumulator for destination data.  It is fed back into
     * vis_faligndata before being fully written, so it is given a
     * defined starting value; the initial content is always shifted out
     * or masked away before anything is stored.
     */
    mlib_d64 acc0 = 0;
    mlib_s32 emask;             /* edge mask */
    mlib_s32 i, num;            /* loop variable */
    mlib_s16 *table;            /* scratch for rotating the tables */

    dl = dst;
    sp = src;
    dp = (mlib_d64 *) dl;
    dend = dl + xsize - 1;

    /* faligndata(t, acc) = last 2 bytes of t followed by 6 bytes of acc */
    vis_alignaddr((void *) 0, 6);

    if (xsize >= 4) {

        s0 = sp[0];
        s1 = sp[1];
        s2 = sp[2];
        s3 = sp[3];
        sp += 4;

        /*
         * Main loop: look up the four indices fetched by the previous
         * step while fetching the next four.
         */
#pragma pipeloop(0)
        for(i = 0; i <= xsize - 8; i+=4, sp += 4) {
            /* 4th element of a group wraps around to the first table */
            t3 = vis_ld_u16_i(table0, ((mlib_addr)2*s3));
            t2 = vis_ld_u16_i(table2, ((mlib_addr)2*s2));
            t1 = vis_ld_u16_i(table1, ((mlib_addr)2*s1));
            t0 = vis_ld_u16_i(table0, ((mlib_addr)2*s0));
            acc0 = vis_faligndata(t3, acc0);
            acc0 = vis_faligndata(t2, acc0);
            acc0 = vis_faligndata(t1, acc0);
            acc0 = vis_faligndata(t0, acc0);
            s0 = sp[0];
            s1 = sp[1];
            s2 = sp[2];
            s3 = sp[3];
            *dp++ = acc0;
            /* 4 elements consumed = one channel step: rotate the tables */
            table = table0;
            table0 = table1;
            table1 = table2;
            table2 = table;
        }

        /* drain the pipeline: last complete group of four */
        t3 = vis_ld_u16_i(table0, ((mlib_addr)2*s3));
        t2 = vis_ld_u16_i(table2, ((mlib_addr)2*s2));
        t1 = vis_ld_u16_i(table1, ((mlib_addr)2*s1));
        t0 = vis_ld_u16_i(table0, ((mlib_addr)2*s0));
        acc0 = vis_faligndata(t3, acc0);
        acc0 = vis_faligndata(t2, acc0);
        acc0 = vis_faligndata(t1, acc0);
        acc0 = vis_faligndata(t0, acc0);
        *dp++ = acc0;
        table = table0;
        table0 = table1;
        table1 = table2;
        table2 = table;
    }

    /* 1..3 trailing elements: walk the source backwards, masked store */
    if ((mlib_addr) dp <= (mlib_addr) dend) {

        num = (mlib_s32)((mlib_s16*) dend - (mlib_s16*) dp);
        sp += num;
        num ++;

        if (num == 1) {
            s0 = *sp;

            t0 = vis_ld_u16_i(table0, ((mlib_addr)2*s0));
            acc0 = vis_faligndata(t0, acc0);
        } else if (num == 2) {
            s0 = *sp;
            sp --;

            t0 = vis_ld_u16_i(table1, ((mlib_addr)2*s0));
            acc0 = vis_faligndata(t0, acc0);

            s0 = *sp;

            t0 = vis_ld_u16_i(table0, ((mlib_addr)2*s0));
            acc0 = vis_faligndata(t0, acc0);
        } else if (num == 3) {
            s0 = *sp;
            sp --;

            t0 = vis_ld_u16_i(table2, ((mlib_addr)2*s0));
            acc0 = vis_faligndata(t0, acc0);

            s0 = *sp;
            sp --;

            t0 = vis_ld_u16_i(table1, ((mlib_addr)2*s0));
            acc0 = vis_faligndata(t0, acc0);

            s0 = *sp;

            t0 = vis_ld_u16_i(table0, ((mlib_addr)2*s0));
            acc0 = vis_faligndata(t0, acc0);
        }

        emask = vis_edge16(dp, dend);
        vis_pst_16(acc0, dp, emask);
    }
}
/*
 * Table look-up of one line of U16 indices into U16 values, where
 * element k uses table (k % 4).
 *
 *   src             source indices, xsize elements
 *   dst             destination, xsize elements; accessed through an
 *                   mlib_d64 pointer, so presumably 8-byte aligned -
 *                   TODO confirm with the callers
 *   xsize           number of elements
 *   table0..table3  look-up tables for positions 0..3 of each group
 *
 * Four results are assembled per doubleword: each looked-up value is
 * shifted into the front of acc0 with vis_faligndata (GSR offset 6), so
 * values are inserted in reverse order, last element first.
 * VIS_LD_U16_I is defined outside this block.
 */
void
mlib_v_ImageLookUp_U16_U16_124_D1(
    const mlib_u16 *src,
    mlib_u16 *dst,
    mlib_s32 xsize,
    const mlib_u16 *table0,
    const mlib_u16 *table1,
    const mlib_u16 *table2,
    const mlib_u16 *table3)
{
    mlib_u16 *sp;               /* pointer to source data */
    mlib_s32 s0, s1, s2, s3;    /* source data */
    mlib_u16 *dl;               /* pointer to start of destination */
    mlib_u16 *dend;             /* pointer to end of destination */
    mlib_d64 *dp;               /* aligned pointer to destination */
    mlib_d64 t0, t1, t2;        /* destination data */
    mlib_d64 t3;                /* destination data */
    /*
     * Accumulator for destination data.  It is fed back into
     * vis_faligndata before being fully written, so it is given a
     * defined starting value; the initial content is always shifted out
     * or masked away before anything is stored.
     */
    mlib_d64 acc0 = 0;
    mlib_s32 emask;             /* edge mask */
    mlib_s32 i, num;            /* loop variable */

    dl = dst;
    sp = (void *)src;
    dp = (mlib_d64 *) dl;
    dend = dl + xsize - 1;

    /* faligndata(t, acc) = last 2 bytes of t followed by 6 bytes of acc */
    vis_alignaddr((void *) 0, 6);

    i = 0;

    if (xsize >= 4) {

        s0 = sp[0];
        s1 = sp[1];
        s2 = sp[2];
        s3 = sp[3];
        sp += 4;

        /*
         * Main loop: look up the four indices fetched by the previous
         * step while fetching the next four.
         */
#pragma pipeloop(0)
        for(i = 0; i <= xsize - 8; i+=4, sp += 4) {
            t3 = VIS_LD_U16_I(table3, 2*s3);
            t2 = VIS_LD_U16_I(table2, 2*s2);
            t1 = VIS_LD_U16_I(table1, 2*s1);
            t0 = VIS_LD_U16_I(table0, 2*s0);
            acc0 = vis_faligndata(t3, acc0);
            acc0 = vis_faligndata(t2, acc0);
            acc0 = vis_faligndata(t1, acc0);
            acc0 = vis_faligndata(t0, acc0);
            s0 = sp[0];
            s1 = sp[1];
            s2 = sp[2];
            s3 = sp[3];
            *dp++ = acc0;
        }

        /* drain the pipeline: last complete group of four */
        t3 = VIS_LD_U16_I(table3, 2*s3);
        t2 = VIS_LD_U16_I(table2, 2*s2);
        t1 = VIS_LD_U16_I(table1, 2*s1);
        t0 = VIS_LD_U16_I(table0, 2*s0);
        acc0 = vis_faligndata(t3, acc0);
        acc0 = vis_faligndata(t2, acc0);
        acc0 = vis_faligndata(t1, acc0);
        acc0 = vis_faligndata(t0, acc0);
        *dp++ = acc0;
    }

    /* 1..3 trailing elements: walk the source backwards, masked store */
    if ((mlib_addr) dp <= (mlib_addr) dend) {

        num = (mlib_u16*) dend - (mlib_u16*) dp;
        sp += num;
        num ++;

        if (num == 1) {
            s0 = (mlib_s32) *sp;
            sp --;

            t0 = VIS_LD_U16_I(table0, 2*s0);
            acc0 = vis_faligndata(t0, acc0);
        } else if (num == 2) {
            s0 = (mlib_s32) *sp;
            sp --;

            t0 = VIS_LD_U16_I(table1, 2*s0);
            acc0 = vis_faligndata(t0, acc0);

            s0 = (mlib_s32) *sp;
            sp --;

            t0 = VIS_LD_U16_I(table0, 2*s0);
            acc0 = vis_faligndata(t0, acc0);
        } else if (num == 3) {
            s0 = (mlib_s32) *sp;
            sp --;

            t0 = VIS_LD_U16_I(table2, 2*s0);
            acc0 = vis_faligndata(t0, acc0);

            s0 = (mlib_s32) *sp;
            sp --;

            t0 = VIS_LD_U16_I(table1, 2*s0);
            acc0 = vis_faligndata(t0, acc0);

            s0 = (mlib_s32) *sp;
            sp --;

            t0 = VIS_LD_U16_I(table0, 2*s0);
            acc0 = vis_faligndata(t0, acc0);
        }

        emask = vis_edge16(dp, dend);
        vis_pst_16(acc0, dp, emask);
    }
}
/*
 * Apply MLIB_V_IMAGESQUARE_S16 to every element of an S16 image.
 *
 *   src, slb      source image and its line stride in BYTES
 *   dst, dlb      destination image and its line stride in BYTES
 *   xsize, ysize  line length in mlib_s16 elements / number of lines
 *
 * MLIB_V_IMAGESQUARE_S16(s, d) is defined outside this block; judging by
 * its name it presumably squares each 16-bit lane of s into d, using
 * rdh / rdl as temporaries - TODO confirm.  The destination is written in
 * aligned doublewords with edge-masked partial stores at both ends; the
 * source is re-aligned to the destination with vis_faligndata and read
 * with non-faulting loads.
 */
void
mlib_v_ImageSquare_S16(
    mlib_s16 *src,
    mlib_s32 slb,
    mlib_s16 *dst,
    mlib_s32 dlb,
    mlib_s32 xsize,
    mlib_s32 ysize)
{
/* aligned pointer to source */
    mlib_d64 *sp;

/* pointer to a line in source */
    mlib_s16 *sl;

/* aligned pointer to destination */
    mlib_d64 *dp;

/* pointer to a line in destination */
    mlib_s16 *dl;

/* pointer to end of a line in dst */
    mlib_s16 *dend;

/* offset of address alignment in dst */
    mlib_s32 off;

/* edge masks */
    mlib_s32 emask;

/* source data */
    mlib_d64 s0, s1;

/* source data */
    mlib_d64 sd;

/* destination data */
    mlib_d64 dd;

/* temporaries used in macro */
    mlib_d64 rdh, rdl;

/* loop variable */
    mlib_s32 i, j, n;

    sl = src;
    dl = dst;

/* row loop */
    for (j = 0; j < ysize; j++) {
/* prepare the destination address */
        dp = (mlib_d64 *)((mlib_addr)dl & (~7));
        /* off <= 0: bytes from the line start back to the aligned start */
        off = (mlib_addr)dp - (mlib_addr)dl;
        dend = dl + xsize - 1;

/* prepare the source address */
        /* shift the source by the same amount so lanes line up with dst */
        sp = (mlib_d64 *)vis_alignaddr(sl, off);

/* generate edge mask for the start point */
        emask = vis_edge16(dl, dend);

/* first 4 pixels */
        s0 = vis_ld_d64_nf(sp);
        sp++;
        s1 = vis_ld_d64_nf(sp);
        sp++;
        sd = vis_faligndata(s0, s1);
        MLIB_V_IMAGESQUARE_S16(sd, dd);
        vis_pst_16(dd, dp++, emask);

        /* number of complete doublewords left in this line */
        n = ((mlib_u8 *)(dend + 1) - (mlib_u8 *)dp) / 8;

/* 4-pixel column loop */
#pragma pipeloop(0)
        for (i = 0; i < n; i++) {
            s0 = s1;
            s1 = vis_ld_d64_nf(sp);
            sp++;
            sd = vis_faligndata(s0, s1);
            MLIB_V_IMAGESQUARE_S16(sd, dd);
            (*dp++) = dd;
        }

/* end point handling */
        if ((mlib_addr)dp <= (mlib_addr)dend) {
            emask = vis_edge16(dp, dend);
            s0 = s1;
            s1 = vis_ld_d64_nf(sp);
            sp++;
            sd = vis_faligndata(s0, s1);
            MLIB_V_IMAGESQUARE_S16(sd, dd);
            vis_pst_16(dd, dp++, emask);
        }

        /* strides are in bytes, hence the mlib_u8 pointer arithmetic */
        sl = (mlib_s16 *)((mlib_u8 *)sl + slb);
        dl = (mlib_s16 *)((mlib_u8 *)dl + dlb);
    }
}
mlib_status __mlib_VideoUpSample420_S16( mlib_s16 *dst0, mlib_s16 *dst1, const mlib_s16 *src0, const mlib_s16 *src1, const mlib_s16 *src2, mlib_s32 n) { mlib_s16 *dend = dst0 + 2 * n - 1; mlib_d64 *dp0 = (mlib_d64 *)dst0; mlib_d64 *dp1 = (mlib_d64 *)dst1; mlib_d64 *sp0 = (mlib_d64 *)src0; mlib_d64 *sp1 = (mlib_d64 *)src1; mlib_d64 *sp2 = (mlib_d64 *)src2; mlib_d64 d00, d01, d02, d03; mlib_d64 d10, d11, d12, d13; mlib_d64 d20, d21, d22, d23; mlib_d64 ac00, ac01, ac02, ac03, ac04, ac05, ac06, ac07; mlib_d64 ac10, ac11, ac12, ac13, ac14, ac15, ac16, ac17; mlib_d64 ac20, ac21, ac22, ac23, ac24, ac25, ac26, ac27; mlib_f32 f13 = vis_to_float(0x10003); mlib_f32 f31 = vis_to_float(0x30001); mlib_f32 f39 = vis_to_float(0x30009); mlib_f32 f93 = vis_to_float(0x90003); mlib_d64 d87 = vis_to_double(8, 7); mlib_s32 i, emask; if (n <= 0) return (MLIB_FAILURE); vis_write_gsr((12 << 3) + 2); d01 = vis_ld_d64_nf(sp0); d11 = vis_ld_d64_nf(sp1); d21 = vis_ld_d64_nf(sp2); sp0++; sp1++; sp2++; d00 = vis_faligndata(d00, d01); d10 = vis_faligndata(d10, d11); d20 = vis_faligndata(d20, d21); #pragma pipeloop(0) for (i = 0; i <= n - 4; i += 4) { d03 = vis_ld_d64_nf(sp0); d13 = vis_ld_d64_nf(sp1); d23 = vis_ld_d64_nf(sp2); sp0++; sp1++; sp2++; d02 = vis_faligndata(d01, d03); d12 = vis_faligndata(d11, d13); d22 = vis_faligndata(d21, d23); ac10 = vis_fmuld8ulx16(f39, vis_read_lo(d10)); ac12 = vis_fmuld8ulx16(f39, vis_read_hi(d11)); ac10 = vis_fpadd32(ac10, d87); ac12 = vis_fpadd32(ac12, d87); ac11 = vis_fmuld8ulx16(f93, vis_read_hi(d11)); ac13 = vis_fmuld8ulx16(f93, vis_read_hi(d12)); ac10 = vis_fpadd32(ac10, ac11); ac12 = vis_fpadd32(ac12, ac13); ac00 = vis_fmuld8ulx16(f13, vis_read_lo(d00)); ac01 = vis_fmuld8ulx16(f31, vis_read_hi(d01)); ac02 = vis_fmuld8ulx16(f13, vis_read_hi(d01)); ac03 = vis_fmuld8ulx16(f31, vis_read_hi(d02)); ac00 = vis_fpadd32(ac00, ac01); ac02 = vis_fpadd32(ac02, ac03); ac00 = vis_fpadd32(ac10, ac00); ac02 = vis_fpadd32(ac12, ac02); ac20 = vis_fmuld8ulx16(f13, 
vis_read_lo(d20)); ac21 = vis_fmuld8ulx16(f31, vis_read_hi(d21)); ac22 = vis_fmuld8ulx16(f13, vis_read_hi(d21)); ac23 = vis_fmuld8ulx16(f31, vis_read_hi(d22)); ac20 = vis_fpadd32(ac20, ac21); ac22 = vis_fpadd32(ac22, ac23); ac20 = vis_fpadd32(ac10, ac20); ac22 = vis_fpadd32(ac12, ac22); dp0[0] = vis_fpackfix_pair(ac00, ac02); dp1[0] = vis_fpackfix_pair(ac20, ac22); dp0 += 2; dp1 += 2; d00 = d02; d01 = d03; d10 = d12; d11 = d13; d20 = d22; d21 = d23; } dp0 = (mlib_d64 *)dst0; dp1 = (mlib_d64 *)dst1; sp0 = (mlib_d64 *)src0; sp1 = (mlib_d64 *)src1; sp2 = (mlib_d64 *)src2; d01 = vis_ld_d64_nf(sp0); d11 = vis_ld_d64_nf(sp1); d21 = vis_ld_d64_nf(sp2); sp0++; sp1++; sp2++; d00 = vis_faligndata(d00, d01); d10 = vis_faligndata(d10, d11); d20 = vis_faligndata(d20, d21); #pragma pipeloop(0) for (i = 0; i <= n - 4; i += 4) { d03 = vis_ld_d64_nf(sp0); d13 = vis_ld_d64_nf(sp1); d23 = vis_ld_d64_nf(sp2); sp0++; sp1++; sp2++; d02 = vis_faligndata(d01, d03); d12 = vis_faligndata(d11, d13); d22 = vis_faligndata(d21, d23); ac14 = vis_fmuld8ulx16(f39, vis_read_hi(d12)); ac16 = vis_fmuld8ulx16(f39, vis_read_lo(d11)); ac14 = vis_fpadd32(ac14, d87); ac16 = vis_fpadd32(ac16, d87); ac15 = vis_fmuld8ulx16(f93, vis_read_lo(d11)); ac17 = vis_fmuld8ulx16(f93, vis_read_lo(d12)); ac14 = vis_fpadd32(ac14, ac15); ac16 = vis_fpadd32(ac16, ac17); ac04 = vis_fmuld8ulx16(f13, vis_read_hi(d02)); ac05 = vis_fmuld8ulx16(f31, vis_read_lo(d01)); ac06 = vis_fmuld8ulx16(f13, vis_read_lo(d01)); ac07 = vis_fmuld8ulx16(f31, vis_read_lo(d02)); ac04 = vis_fpadd32(ac04, ac05); ac06 = vis_fpadd32(ac06, ac07); ac04 = vis_fpadd32(ac14, ac04); ac06 = vis_fpadd32(ac16, ac06); ac24 = vis_fmuld8ulx16(f13, vis_read_hi(d22)); ac25 = vis_fmuld8ulx16(f31, vis_read_lo(d21)); ac26 = vis_fmuld8ulx16(f13, vis_read_lo(d21)); ac27 = vis_fmuld8ulx16(f31, vis_read_lo(d22)); ac24 = vis_fpadd32(ac24, ac25); ac26 = vis_fpadd32(ac26, ac27); ac24 = vis_fpadd32(ac14, ac24); ac26 = vis_fpadd32(ac16, ac26); dp0[1] = vis_fpackfix_pair(ac04, 
ac06); dp1[1] = vis_fpackfix_pair(ac24, ac26); dp0 += 2; dp1 += 2; d00 = d02; d01 = d03; d10 = d12; d11 = d13; d20 = d22; d21 = d23; } if ((mlib_s16 *)dp0 <= dend) { d02 = vis_faligndata(d01, d03); d12 = vis_faligndata(d11, d13); d22 = vis_faligndata(d21, d23); ac10 = vis_fmuld8ulx16(f39, vis_read_lo(d10)); ac12 = vis_fmuld8ulx16(f39, vis_read_hi(d11)); ac10 = vis_fpadd32(ac10, d87); ac12 = vis_fpadd32(ac12, d87); ac11 = vis_fmuld8ulx16(f93, vis_read_hi(d11)); ac13 = vis_fmuld8ulx16(f93, vis_read_hi(d12)); ac10 = vis_fpadd32(ac10, ac11); ac12 = vis_fpadd32(ac12, ac13); ac00 = vis_fmuld8ulx16(f13, vis_read_lo(d00)); ac01 = vis_fmuld8ulx16(f31, vis_read_hi(d01)); ac02 = vis_fmuld8ulx16(f13, vis_read_hi(d01)); ac03 = vis_fmuld8ulx16(f31, vis_read_hi(d02)); ac00 = vis_fpadd32(ac00, ac01); ac02 = vis_fpadd32(ac02, ac03); ac00 = vis_fpadd32(ac10, ac00); ac02 = vis_fpadd32(ac12, ac02); ac20 = vis_fmuld8ulx16(f13, vis_read_lo(d20)); ac21 = vis_fmuld8ulx16(f31, vis_read_hi(d21)); ac22 = vis_fmuld8ulx16(f13, vis_read_hi(d21)); ac23 = vis_fmuld8ulx16(f31, vis_read_hi(d22)); ac20 = vis_fpadd32(ac20, ac21); ac22 = vis_fpadd32(ac22, ac23); ac20 = vis_fpadd32(ac10, ac20); ac22 = vis_fpadd32(ac12, ac22); ac00 = vis_fpackfix_pair(ac00, ac02); ac20 = vis_fpackfix_pair(ac20, ac22); emask = vis_edge16(dp0, dend); vis_pst_16(ac00, dp0, emask); vis_pst_16(ac20, dp1, emask); dp0++; dp1++; if ((mlib_s16 *)dp0 <= dend) { ac14 = vis_fmuld8ulx16(f39, vis_read_hi(d12)); ac16 = vis_fmuld8ulx16(f39, vis_read_lo(d11)); ac14 = vis_fpadd32(ac14, d87); ac16 = vis_fpadd32(ac16, d87); ac15 = vis_fmuld8ulx16(f93, vis_read_lo(d11)); ac17 = vis_fmuld8ulx16(f93, vis_read_lo(d12)); ac14 = vis_fpadd32(ac14, ac15); ac16 = vis_fpadd32(ac16, ac17); ac04 = vis_fmuld8ulx16(f13, vis_read_hi(d02)); ac05 = vis_fmuld8ulx16(f31, vis_read_lo(d01)); ac06 = vis_fmuld8ulx16(f13, vis_read_lo(d01)); ac07 = vis_fmuld8ulx16(f31, vis_read_lo(d02)); ac04 = vis_fpadd32(ac04, ac05); ac06 = vis_fpadd32(ac06, ac07); ac04 = 
vis_fpadd32(ac14, ac04); ac06 = vis_fpadd32(ac16, ac06); ac24 = vis_fmuld8ulx16(f13, vis_read_hi(d22)); ac25 = vis_fmuld8ulx16(f31, vis_read_lo(d21)); ac26 = vis_fmuld8ulx16(f13, vis_read_lo(d21)); ac27 = vis_fmuld8ulx16(f31, vis_read_lo(d22)); ac24 = vis_fpadd32(ac24, ac25); ac26 = vis_fpadd32(ac26, ac27); ac24 = vis_fpadd32(ac14, ac24); ac26 = vis_fpadd32(ac16, ac26); ac04 = vis_fpackfix_pair(ac04, ac06); ac24 = vis_fpackfix_pair(ac24, ac26); emask = vis_edge16(dp0, dend); vis_pst_16(ac04, dp0, emask); vis_pst_16(ac24, dp1, emask); } } dst0[0] = (4 * (3 * src1[0] + src0[0]) + 8) >> 4; dst1[0] = (4 * (3 * src1[0] + src2[0]) + 8) >> 4; dst0[2 * n - 1] = (4 * (3 * src1[n - 1] + src0[n - 1]) + 7) >> 4; dst1[2 * n - 1] = (4 * (3 * src1[n - 1] + src2[n - 1]) + 7) >> 4; return (MLIB_SUCCESS); }
/*
 * Element-wise combination of two S16 images through the
 * MLIB_V_ADDIMAGE_S16 macro, written to dst.
 *
 *   dst         destination image
 *   src1, src2  source images
 *
 * VALIDATE(mlib_s16) and MLIB_V_ADDIMAGE_S16(a, b, d) are defined outside
 * this block.  The names sp1, sp2, dp, sl1, sl2, dl, width, height,
 * channels, stride1, stride2 and strided are used below without a visible
 * declaration, so VALIDATE presumably declares them and fills them in
 * from the three images (and may return early on a mismatch) - TODO
 * confirm.  The strides are added to mlib_s16 line pointers, i.e. they
 * are element counts at that point.
 *
 * Five code paths are selected by the relative 8-byte alignment of the
 * three buffers.  Loads that may touch memory outside a line use the
 * non-faulting vis_ld_d64_nf.
 *
 * Returns MLIB_SUCCESS when the end of the function is reached.
 */
mlib_status
mlib_v_ImageAdd_S16(
    mlib_image *dst,
    const mlib_image *src1,
    const mlib_image *src2)
{
    mlib_s32 i, j, k;
    mlib_s32 offdst, offsrc1, offsrc2, emask;
    /* number of mlib_s16 elements per line */
    mlib_s32 amount;
    mlib_d64 *dpp, *spp2, *spp1, *tmp_ptr;
    mlib_d64 dd, dd0, dd1, sd10, sd11, sd20, sd21;
    mlib_s16 *dend;

    VALIDATE(mlib_s16);

    sl1 = sp1;
    sl2 = sp2;
    dl = dp;

    amount = width * channels;

    /* byte offsets of each buffer inside its 8-byte doubleword */
    offdst = ((mlib_addr)dp) & 7;
    offsrc1 = ((mlib_addr)sp1) & 7;
    offsrc2 = ((mlib_addr)sp2) & 7;

    /*
     * Case 1: dst, src1 and src2 share the same alignment on every line
     * (strides are equal modulo 4 elements = 8 bytes).
     */
    if ((offdst == offsrc1) && (offdst == offsrc2) &&
        (((strided ^ stride1) & 3) == 0) &&
        (((strided ^ stride2) & 3) == 0)) {

        for (j = 0; j < height; j++) {

/* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            /* i <= 0: element index of the aligned start relative to dp */
            i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);

            dend = dp + amount - 1;
/* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

            /* partial first doubleword */
            if (emask != 0xf) {
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp++, emask);
                i += 4;
            }

            /* full doublewords */
#pragma pipeloop(0)
            for (; i <= amount - 4; i += 4) {
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
                (*dpp++) = dd;
            }

            /* partial last doubleword */
            if (i < amount) {
                emask = vis_edge16(dpp, dend);
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp, emask);
            }

            /* advance all three line pointers */
            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    }

    /*
     * Case 2: dst and src1 share alignment; src2 is re-aligned with
     * vis_faligndata.
     */
    else if ((offdst == offsrc1) && (((strided ^ stride1) & 3) == 0)) {

        for (j = 0; j < height; j++) {

/* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
            /* last vis_alignaddr call: GSR offset now describes src2 */
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);

            dend = dp + amount - 1;
/* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

            sd20 = vis_ld_d64_nf(spp2);

            if (emask != 0xf) {
                sd10 = (*spp1++);
                sd21 = vis_ld_d64_nf(spp2 + 1);
                sd20 = vis_faligndata(sd20, sd21);
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp++, emask);
                sd20 = sd21;
                spp2++;
                i += 4;
            }

#pragma pipeloop(0)
            for (; i <= amount - 4; i += 4) {
                sd10 = (*spp1++);
                sd21 = vis_ld_d64_nf(spp2 + 1);
                sd20 = vis_faligndata(sd20, sd21);
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
                (*dpp++) = dd;
                sd20 = sd21;
                spp2++;
            }

            if (i < amount) {
                emask = vis_edge16(dpp, dend);
                sd10 = (*spp1++);
                sd20 = vis_faligndata(sd20, vis_ld_d64_nf(spp2 + 1));
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    }

    /*
     * Case 3: dst and src2 share alignment; src1 is re-aligned with
     * vis_faligndata (mirror image of case 2).
     */
    else if ((offdst == offsrc2) && (((strided ^ stride2) & 3) == 0)) {

        for (j = 0; j < height; j++) {

/* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);
            /* last vis_alignaddr call: GSR offset now describes src1 */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);

            dend = dp + amount - 1;
/* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

            sd10 = vis_ld_d64_nf(spp1);

            if (emask != 0xf) {
                sd20 = (*spp2++);
                sd11 = vis_ld_d64_nf(spp1 + 1);
                sd10 = vis_faligndata(sd10, sd11);
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp++, emask);
                sd10 = sd11;
                spp1++;
                i += 4;
            }

#pragma pipeloop(0)
            for (; i <= amount - 4; i += 4) {
                sd20 = (*spp2++);
                sd11 = vis_ld_d64_nf(spp1 + 1);
                sd10 = vis_faligndata(sd10, sd11);
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
                (*dpp++) = dd;
                sd10 = sd11;
                spp1++;
            }

            if (i < amount) {
                emask = vis_edge16(dpp, dend);
                sd20 = (*spp2++);
                sd10 = vis_faligndata(sd10, vis_ld_d64_nf(spp1 + 1));
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    }

    /*
     * Case 4: the two sources share alignment but dst differs.  The
     * sources are combined doubleword by doubleword and the *results* are
     * re-aligned to dst with vis_faligndata.
     */
    else if ((offsrc1 == offsrc2) && (((stride1 ^ stride2) & 3) == 0)) {

        for (j = 0; j < height; j++) {

/* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_s16 *)dpp - dp;

/* prepare the source addresses */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);

            dend = dp + amount - 1;
/* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

            /* dd0 always holds the result for the previous doubleword */
            sd10 = vis_ld_d64_nf(spp1);
            spp1++;
            sd20 = vis_ld_d64_nf(spp2);
            spp2++;
            MLIB_V_ADDIMAGE_S16(sd10, sd20, dd0);

            if (emask != 0xf) {
                sd10 = vis_ld_d64_nf(spp1);
                spp1++;
                sd20 = vis_ld_d64_nf(spp2);
                spp2++;
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd1);
                dd = vis_faligndata(dd0, dd1);
                vis_pst_16(dd, dpp++, emask);
                dd0 = dd1;
                i += 4;
            }

#pragma pipeloop(0)
            for (; i <= amount - 4; i += 4) {
                sd10 = vis_ld_d64_nf(spp1);
                spp1++;
                sd20 = vis_ld_d64_nf(spp2);
                spp2++;
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd1);
                (*dpp++) = vis_faligndata(dd0, dd1);
                dd0 = dd1;
            }

            if (i < amount) {
                emask = vis_edge16(dpp, dend);
                sd10 = vis_ld_d64_nf(spp1);
                spp1++;
                sd20 = vis_ld_d64_nf(spp2);
                spp2++;
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd1);
                dd = vis_faligndata(dd0, dd1);
                vis_pst_16(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    } else {
/* common case */
        /*
         * All three alignments differ.  Only one alignment offset can be
         * active in the GSR at a time, so src1 is first copied,
         * re-aligned, into the destination line, and a second pass
         * combines that copy with the re-aligned src2.
         */

        for (j = 0; j < height; j++) {

/* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_s16 *)dpp - dp;

            dend = dp + amount - 1;
/* generate edge mask for the start point */
            emask = vis_edge16(dp, dend);

            if (emask != 0xf) {
                spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
                sd10 = vis_faligndata(vis_ld_d64_nf(spp1),
                    vis_ld_d64_nf(spp1 + 1));
                spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);
                sd20 = vis_faligndata(vis_ld_d64_nf(spp2),
                    vis_ld_d64_nf(spp2 + 1));
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
                vis_pst_16(dd, dpp++, emask);
                i += 4;
            }

/* copy src1 to dst */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i);
            sd11 = vis_ld_d64_nf(spp1);
            tmp_ptr = dpp;

#pragma pipeloop(0)
            for (k = i; k <= (amount - 4); k += 4) {
                sd10 = sd11;
                sd11 = vis_ld_d64_nf(spp1 + 1);
                (*tmp_ptr++) = vis_faligndata(sd10, sd11);
                spp1++;
            }

            /* src1 data for the partial last doubleword, kept in sd11 */
            sd11 = vis_faligndata(sd11, vis_ld_d64_nf(spp1 + 1));

            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i);
            sd20 = vis_ld_d64_nf(spp2);
            tmp_ptr = dpp;

            /* second pass: read back the src1 copy, combine with src2 */
#pragma pipeloop(0)
            for (; i <= amount - 4; i += 4) {
                sd10 = (*tmp_ptr++);
                sd21 = vis_ld_d64_nf(spp2 + 1);
                sd20 = vis_faligndata(sd20, sd21);
                MLIB_V_ADDIMAGE_S16(sd10, sd20, dd);
                (*dpp++) = dd;
                sd20 = sd21;
                spp2++;
            }

            if (i < amount) {
                emask = vis_edge16(dpp, dend);
                sd20 = vis_faligndata(sd20, vis_ld_d64_nf(spp2 + 1));
                MLIB_V_ADDIMAGE_S16(sd11, sd20, dd);
                vis_pst_16(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    }

    return (MLIB_SUCCESS);
}
/*
 * __mlib_SignalEmphasize_S16S_S16S_Sat
 *
 * Runs the emphasize filter described by `filter` over an interleaved
 * stereo S16 signal and writes the result to `dst`.
 *
 *   dst     output buffer, 2 * n mlib_s16 values
 *   src     input buffer, 2 * n mlib_s16 values
 *   filter  filter state (mlib_emphasize_struct); its v16_last0 /
 *           v16_last1 fields are updated with the last input pair so
 *           that a following call can continue the stream
 *   n       number of stereo sample pairs
 *
 * Returns MLIB_FAILURE when filter, src or dst is NULL, when n <= 0 or
 * when the filter is not of type MLIB_EMPH; MLIB_SUCCESS otherwise.
 *
 * The arithmetic itself is done by the MLIB_MUL8 and MLIB_MIX macros
 * (defined elsewhere in this file).  They use the locals v_mask,
 * v_alpha, w_src, w_lsrc and dr0..dr7 by name, so those names must be
 * kept.  NOTE(review): presumably dst[i] = src[i] - alpha * src[i - 2]
 * per channel with saturation - confirm against the macro definitions.
 *
 * The filter state is read only AFTER the argument check: reading
 * fist->... in the declaration initializers dereferenced a NULL filter
 * before the check could reject it.
 */
mlib_status
__mlib_SignalEmphasize_S16S_S16S_Sat(
    mlib_s16 *dst,
    const mlib_s16 *src,
    void *filter,
    mlib_s32 n)
{
    mlib_emphasize_struct *fist = filter;
    /* AND/OR mask pairs used to splice the saved samples into the input */
    mlib_d64 w_maskand0 = vis_to_double(0xFFFFFFFF, 0xFFFF);
    mlib_d64 w_maskor0;
    mlib_d64 w_maskand1 = vis_to_double(0xFFFFFFFF, 0xFFFF0000);
    mlib_d64 w_maskor1;
    mlib_f32 v_mask = vis_to_float(0x80008000);
    mlib_f32 v_alpha;
    /* last mlib_s16 element of the destination */
    mlib_s16 *fdst;
    mlib_d64 *dpd, *dps, *dsrct1;
    mlib_d64 w_dst, w_src, w_src0, w_src1, w_src2, w_lsrc;
    mlib_d64 dr0, dr1, dr2, dr3, dr4, dr5, dr6, dr7;
    mlib_s32 i, times, t1, t2;

/* check for obvious errors */
    if ((fist == NULL) || (n <= 0) || (src == 0) || (dst == 0) ||
        (fist->type != MLIB_EMPH)) {
        return (MLIB_FAILURE);
    }

    /* arguments are valid: now it is safe to touch the filter state */
    w_maskor0 = vis_freg_pair(0.f, fist->v16_last0);
    w_maskor1 = vis_freg_pair(0.f, fist->v16_last1);
    v_alpha = fist->v_alpha;
    fdst = dst + n + n - 1;

    vis_write_gsr(1 << 3);

    /* keep only the halfword each OR mask is meant to inject */
    w_maskor0 = vis_fand(w_maskor0, w_maskand1);
    w_maskor1 = vis_fand(w_maskor1, w_maskand0);

    /*
     * Rotate all four masks by an amount derived from the source
     * address; presumably this places the saved samples directly in
     * front of src[0] inside the first aligned words - TODO confirm.
     */
    vis_alignaddr((void *)(-(mlib_addr)src), 0);
    w_maskand0 = vis_faligndata(w_maskand0, w_maskand0);
    w_maskor0 = vis_faligndata(w_maskor0, w_maskor0);
    w_maskand1 = vis_faligndata(w_maskand1, w_maskand1);
    w_maskor1 = vis_faligndata(w_maskor1, w_maskor1);

    dpd = vis_alignaddr(dst, 0);
    /* number of 8-byte destination words between first and last */
    times = (mlib_d64 *)vis_alignaddr(fdst, 0) - dpd;
    /* t1 lines the source up with dst; t2 is the same view 4 bytes back */
    t1 = -((mlib_addr)(dst) & 7);
    t2 = t1 - 4;
    dps = vis_alignaddr((void *)src, t2);
    w_src0 = vis_ld_d64_nf(dps);
    dps++;
    w_src1 = vis_ld_d64_nf(dps);
    dps++;

    if ((((mlib_addr)dst ^ (mlib_addr)src) & 7)) {
        /* src and dst sit at different offsets within an 8-byte word */
        if (((mlib_addr)dps - (mlib_addr)src) >= 6) {
            w_src0 = vis_fand(w_maskand0, w_src0);
            w_src0 = vis_for(w_maskor0, w_src0);
        } else {
            w_src1 = vis_fand(w_maskand0, w_src1);
            w_src1 = vis_for(w_maskor0, w_src1);
        }

        if (((mlib_addr)dps - (mlib_addr)src) >= 8) {
            w_src0 = vis_fand(w_maskand1, w_src0);
            w_src0 = vis_for(w_maskor1, w_src0);
        } else {
            w_src1 = vis_fand(w_maskand1, w_src1);
            w_src1 = vis_for(w_maskor1, w_src1);
        }

        w_lsrc = vis_faligndata(w_src0, w_src1);
        dsrct1 = vis_alignaddr((void *)src, t1);

        if (dps - 2 != dsrct1) {
            /* the t1 and t2 views start in different words: 3-word window */
            w_src2 = *dps;
            dps++;
            w_src = vis_faligndata(w_src1, w_src2);
            MLIB_MUL8;

            if ((mlib_addr)dst & 7) {
                /* unaligned dst: first word goes out through an edge mask */
                times--;
                w_src0 = w_src1;
                w_src1 = w_src2;
                w_src2 = *dps;
                vis_alignaddr((void *)src, t2);
                w_lsrc = vis_faligndata(w_src0, w_src1);
                vis_alignaddr((void *)src, t1);
                w_src = vis_faligndata(w_src1, w_src2);
                dps++;
                MLIB_MIX;
                w_dst = vis_fpackfix_pair(dr2, dr3);
                vis_pst_16(w_dst, dpd, vis_edge16(dst, fdst));
                dpd++;
            }

            /* prime the software pipeline: w_dst is stored one step late */
            w_src0 = w_src1;
            w_src1 = w_src2;
            w_src2 = vis_ld_d64_nf(dps);
            vis_alignaddr((void *)src, t2);
            w_lsrc = vis_faligndata(w_src0, w_src1);
            vis_alignaddr((void *)src, t1);
            w_src = vis_faligndata(w_src1, w_src2);
            MLIB_MIX;
            w_dst = vis_fpackfix_pair(dr2, dr3);
            dps++;
            w_src0 = w_src1;
            w_src1 = w_src2;
            w_src2 = vis_ld_d64_nf(dps);
            vis_alignaddr((void *)src, t2);
            w_lsrc = vis_faligndata(w_src0, w_src1);
            vis_alignaddr((void *)src, t1);
            w_src = vis_faligndata(w_src1, w_src2);
            dps++;
            for (i = 0; i < times; i++) {
                *dpd = w_dst;
                MLIB_MIX;
                w_dst = vis_fpackfix_pair(dr2, dr3);
                w_src0 = w_src1;
                w_src1 = w_src2;
                w_src2 = vis_ld_d64_nf(dps);
                vis_alignaddr((void *)src, t2);
                w_lsrc = vis_faligndata(w_src0, w_src1);
                vis_alignaddr((void *)src, t1);
                w_src = vis_faligndata(w_src1, w_src2);
                dpd++;
                dps++;
            }
        } else {
            /* both views start in the same word: 2-word window */
            w_src = vis_faligndata(w_src0, w_src1);
            MLIB_MUL8;

            if ((mlib_addr)dst & 7) {
                /* unaligned dst: first word goes out through an edge mask */
                times--;
                w_src0 = w_src1;
                w_src1 = vis_ld_d64_nf(dps);
                vis_alignaddr((void *)src, t2);
                w_lsrc = vis_faligndata(w_src0, w_src1);
                vis_alignaddr((void *)src, t1);
                w_src = vis_faligndata(w_src0, w_src1);
                dps++;
                MLIB_MIX;
                w_dst = vis_fpackfix_pair(dr2, dr3);
                vis_pst_16(w_dst, dpd, vis_edge16(dst, fdst));
                dpd++;
            }

            /* prime the software pipeline: w_dst is stored one step late */
            w_src0 = w_src1;
            w_src1 = vis_ld_d64_nf(dps);
            vis_alignaddr((void *)src, t2);
            w_lsrc = vis_faligndata(w_src0, w_src1);
            vis_alignaddr((void *)src, t1);
            w_src = vis_faligndata(w_src0, w_src1);
            MLIB_MIX;
            w_dst = vis_fpackfix_pair(dr2, dr3);
            dps++;
            w_src0 = w_src1;
            w_src1 = vis_ld_d64_nf(dps);
            vis_alignaddr((void *)src, t2);
            w_lsrc = vis_faligndata(w_src0, w_src1);
            vis_alignaddr((void *)src, t1);
            w_src = vis_faligndata(w_src0, w_src1);
            dps++;
            for (i = 0; i < times; i++) {
                *dpd = w_dst;
                MLIB_MIX;
                w_dst = vis_fpackfix_pair(dr2, dr3);
                w_src0 = w_src1;
                w_src1 = vis_ld_d64_nf(dps);
                vis_alignaddr((void *)src, t2);
                w_lsrc = vis_faligndata(w_src0, w_src1);
                vis_alignaddr((void *)src, t1);
                w_src = vis_faligndata(w_src0, w_src1);
                dps++;
                dpd++;
            }
        }
    } else {
        /* src and dst share the same offset within an 8-byte word */
        w_src = w_src1;

        if ((mlib_addr)src & 7) {
            /* common misalignment: first word goes out through an edge mask */
            times--;

            if (((mlib_addr)src & 7) == 2) {
                w_src0 = vis_fand(w_maskand0, w_src0);
                w_src0 = vis_for(w_maskor0, w_src0);
            } else {
                w_src1 = vis_fand(w_maskand0, w_src1);
                w_src1 = vis_for(w_maskor0, w_src1);
            }

            w_src1 = vis_fand(w_maskand1, w_src1);
            w_src1 = vis_for(w_maskor1, w_src1);
            w_lsrc = vis_faligndata(w_src0, w_src1);
            MLIB_MUL8;
            w_src0 = w_src1;
            w_src1 = *dps;
            w_src = w_src1;
            w_lsrc = vis_faligndata(w_src0, w_src1);
            dps++;
            MLIB_MIX;
            w_dst = vis_fpackfix_pair(dr2, dr3);
            vis_pst_16(w_dst, dpd, vis_edge16(dst, fdst));
            dpd++;
        } else {
            /* both 8-byte aligned: saved samples go into the leading word */
            w_src0 = vis_fand(w_maskand0, w_src0);
            w_src0 = vis_for(w_maskor0, w_src0);
            w_src0 = vis_fand(w_maskand1, w_src0);
            w_src0 = vis_for(w_maskor1, w_src0);
            w_lsrc = vis_faligndata(w_src0, w_src1);
            MLIB_MUL8;
        }

        /* prime the software pipeline: w_dst is stored one step late */
        w_src = vis_ld_d64_nf(dps);
        w_lsrc = vis_faligndata(w_src1, w_src);
        MLIB_MIX;
        w_src1 = w_src;
        w_dst = vis_fpackfix_pair(dr2, dr3);
        dps++;
        w_src = vis_ld_d64_nf(dps);
        w_lsrc = vis_faligndata(w_src1, w_src);
        dps++;
        for (i = 0; i < times; i++) {
            *dpd = w_dst;
            MLIB_MIX;
            w_src1 = w_src;
            w_src = vis_ld_d64_nf(dps);
            w_lsrc = vis_faligndata(w_src1, w_src);
            w_dst = vis_fpackfix_pair(dr2, dr3);
            dps++;
            dpd++;
        }
    }

    /* the pending word is the last one; clip it to the end of dst */
    if (times >= 0) {
        vis_pst_16(w_dst, dpd, vis_edge16(dpd, fdst));
    }

    /* remember the final input pair for the next call */
    ((mlib_s16 *)&fist->v16_last0)[0] = src[2 * n - 2];
    ((mlib_s16 *)&fist->v16_last1)[1] = src[2 * n - 1];
    return (MLIB_SUCCESS);
}
/*
 * mlib_v_ImageLookUp_S16_S16_3_D1
 *
 * Table lookup for one row of interleaved 3-channel S16 data.
 *
 *   src     source values; xsize of them are read
 *   dst     destination; written with 8-byte stores starting at dst
 *           (NOTE(review): presumably the caller guarantees that dst is
 *           8-byte aligned - confirm against the dispatcher)
 *   xsize   number of S16 values to convert (all channels counted)
 *   table0..table2
 *           lookup tables for the channel of src[0], src[1], src[2]
 *
 * Four values are converted per 8-byte store.  Because 4 is not a
 * multiple of 3, the three table pointers are rotated by one position
 * after every group so that table0 always belongs to the first value
 * of the next group.
 */
void mlib_v_ImageLookUp_S16_S16_3_D1( const mlib_s16 *src, mlib_s16 *dst, mlib_s32 xsize, const mlib_s16 *table0, const mlib_s16 *table1, const mlib_s16 *table2)
{
/* running pointer into the source row */
    mlib_s16 *sp;
/* byte offsets into the tables (source value * 2) */
    mlib_s32 s0, s1, s2, s3;
/* pointer to start of destination */
    mlib_s16 *dl;
/* pointer to the last destination element */
    mlib_s16 *dend;
/* destination viewed as 8-byte words */
    mlib_d64 *dp;
/* values fetched from the tables */
    mlib_d64 t0, t1, t2, t3;
/*
 * shift registers: each vis_faligndata() below pushes one 16-bit value
 * into the front; they are deliberately not initialised because only
 * the pushed-in bytes are ever stored
 */
    mlib_d64 acc0, acc1;
/* edge mask for the final partial store */
    mlib_s32 emask;
/* loop counter and number of leftover values */
    mlib_s32 i, num;
/* scratch pointer for rotating table0..table2 */
    const mlib_s16 *table;

    dl = dst;
    sp = (void *)src;
    dp = (mlib_d64 *)dl;
    dend = dl + xsize - 1;

    /*
     * GSR.align = 6: vis_faligndata(t, acc) now yields the last two
     * bytes of t followed by the first six bytes of acc.
     */
    vis_alignaddr((void *)0, 6);

    i = 0;

    if (xsize >= 4) {

        /* preload the first group; "<< 1" turns an index into a byte offset */
        s0 = sp[0] << 1;
        s1 = sp[1] << 1;
        s2 = sp[2] << 1;
        s3 = sp[3] << 1;
        sp += 4;

        /* bshuffle mask: bytes 0-3 of acc0 followed by bytes 0-3 of acc1 */
        vis_write_bmask(0x012389ab, 0);

        /* software-pipelined: look up group k while loading group k + 1 */
#pragma pipeloop(0)
        for (i = 0; i <= xsize - 8; i += 4, sp += 4) {
            /* the 4th value of a group is in the same channel as the 1st */
            t3 = VIS_LD_U16_I(table0, s3);
            t2 = VIS_LD_U16_I(table2, s2);
            t1 = VIS_LD_U16_I(table1, s1);
            t0 = VIS_LD_U16_I(table0, s0);
            /* push in reverse so the first value ends up in front */
            acc1 = vis_faligndata(t3, acc1);
            acc1 = vis_faligndata(t2, acc1);
            acc0 = vis_faligndata(t1, acc0);
            acc0 = vis_faligndata(t0, acc0);
            s0 = sp[0] << 1;
            s1 = sp[1] << 1;
            s2 = sp[2] << 1;
            s3 = sp[3] << 1;
            (*dp++) = vis_bshuffle(acc0, acc1);
            /* rotate tables: the next group starts one channel further */
            table = table0;
            table0 = table1;
            table1 = table2;
            table2 = table;
        }

        /* drain the pipeline: convert the group that is already loaded */
        t3 = VIS_LD_U16_I(table0, s3);
        t2 = VIS_LD_U16_I(table2, s2);
        t1 = VIS_LD_U16_I(table1, s1);
        t0 = VIS_LD_U16_I(table0, s0);
        acc1 = vis_faligndata(t3, acc1);
        acc1 = vis_faligndata(t2, acc1);
        acc0 = vis_faligndata(t1, acc0);
        acc0 = vis_faligndata(t0, acc0);
        (*dp++) = vis_bshuffle(acc0, acc1);
        table = table0;
        table0 = table1;
        table1 = table2;
        table2 = table;
        i += 4;
    }

    /* 1..3 leftover values: build one word and store it through a mask */
    if ((mlib_addr)dp <= (mlib_addr)dend) {

        num = (mlib_s16 *)dend - (mlib_s16 *)dp;
        /* start at the last leftover value and walk backwards */
        sp += num;
        num++;

        if (num == 1) {
            s0 = (mlib_s32)*sp;
            sp--;

            t0 = VIS_LD_U16_I(table0, s0 << 1);
            acc0 = vis_faligndata(t0, acc0);
        } else if (num == 2) {
            s0 = (mlib_s32)*sp;
            sp--;

            t0 = VIS_LD_U16_I(table1, s0 << 1);
            acc0 = vis_faligndata(t0, acc0);

            s0 = (mlib_s32)*sp;
            sp--;

            t0 = VIS_LD_U16_I(table0, s0 << 1);
            acc0 = vis_faligndata(t0, acc0);
        } else if (num == 3) {
            s0 = (mlib_s32)*sp;
            sp--;

            t0 = VIS_LD_U16_I(table2, s0 << 1);
            acc0 = vis_faligndata(t0, acc0);

            s0 = (mlib_s32)*sp;
            sp--;

            t0 = VIS_LD_U16_I(table1, s0 << 1);
            acc0 = vis_faligndata(t0, acc0);

            s0 = (mlib_s32)*sp;
            sp--;

            t0 = VIS_LD_U16_I(table0, s0 << 1);
            acc0 = vis_faligndata(t0, acc0);
        }

        /* write only the elements up to dend */
        emask = vis_edge16(dp, dend);
        vis_pst_16(acc0, dp, emask);
    }
}
/*
 * __mlib_SignalMulBartlett_F32S_F32S
 *
 * Multiplies an interleaved stereo F32 signal by a triangular window
 * and writes the product to dst.
 *
 *   dst  output, 2 * n floats
 *   src  input, 2 * n floats
 *   n    number of stereo sample pairs; must be greater than 1
 *
 * Returns MLIB_FAILURE if dst or src is NULL or n <= 1, MLIB_SUCCESS
 * otherwise.
 *
 * The weight starts at 0 and grows by step = 2 / (n - 1) per sample
 * pair.  Each iteration applies the same weight to one pair taken from
 * the front and one pair taken from the back, so the two cursors meet
 * in the middle.  Results are written with 8-byte stores; when dst is
 * only 4-byte aligned the pairs are re-packed with vis_faligndata() so
 * that the stores stay aligned.
 */
mlib_status __mlib_SignalMulBartlett_F32S_F32S( mlib_f32 *dst, const mlib_f32 *src, mlib_s32 n)
{
    /* unions used to assemble two floats into one 8-byte value */
    mlib_union64 tmp_1, tmp_2;
    /* step: weight increment per pair; base: current weight */
    mlib_d64 step, base, tmp1, tmp2;
    mlib_d64 tmp1_new, tmp2_new, tmp1_last, tmp2_last;
    mlib_s32 n1, i;
    /* cursors on the last float of dst and of src */
    mlib_f32 *dst2 = dst + 2 * n - 1;
    const mlib_f32 *src2 = src + 2 * n - 1;
    mlib_d64 *pdst, *pdst2;
    mlib_u8 emask1, emask2;

    if ((dst == NULL) || (n <= 1) || (src == NULL))
        return (MLIB_FAILURE);

    n1 = n - 1;
    step = 2. / n1;
    base = 0.;

    if ((mlib_addr)dst & 7) {
        /*
         * dst is not 8-byte aligned.  vis_alignaddr() returns the
         * aligned word containing dst[0] and records the byte offset
         * used by the vis_faligndata() calls below.
         */
        pdst = vis_alignaddr((void *)dst, 0);
        pdst2 = (mlib_d64 *)dst2;

        /* masks select the half of an 8-byte word that lies inside dst */
        emask1 = 3;
        emask2 = 12;

        /* the first and the last pair have weight 0: start from zeros */
        tmp1_last = vis_to_double_dup(0);
        src += 2;
        tmp2_last = vis_to_double_dup(0);
        src2 -= 2;

        /* store zero to the first and to the last float of dst */
        vis_pst_16(tmp1_last, pdst, emask1);
        vis_pst_16(tmp2_last, pdst2, emask2);

        pdst++;
        pdst2--;
        base += step;
        /* one pair from each end is already accounted for */
        n = n - 1;

#pragma pipeloop(0)
        for (i = 0; i < n / 2; i++) {
            /* weighted pair from the front */
            tmp_1.f32x2.i0 = base * src[0];
            tmp_1.f32x2.i1 = base * src[1];
            tmp1 = tmp_1.d64;
            src += 2;

            /* weighted pair from the back, read in reverse */
            tmp_2.f32x2.i1 = base * src2[0];
            src2--;
            tmp_2.f32x2.i0 = base * src2[0];
            tmp2 = tmp_2.d64;
            src2--;

            /* combine with the previous pair so the store is aligned */
            tmp1_new = vis_faligndata(tmp1_last, tmp1);
            tmp1_last = tmp1;
            tmp2_new = vis_faligndata(tmp2, tmp2_last);
            tmp2_last = tmp2;

            pdst[0] = tmp1_new;
            pdst++;
            pdst2[0] = tmp2_new;
            pdst2--;

            base += step;
        }

        if (n & 1) {
            /*
             * Two floats between the front and back stores are still
             * unwritten; they take the weight of the last iteration
             * (base has already been advanced once too often).
             */
            dst += i * 2 + 1;
            src--;
            dst[0] = src[0] * (base - step);
            dst[1] = src[1] * (base - step);
        }
    } else {
        /* dst is 8-byte aligned: every pair is stored directly */
        pdst = (mlib_d64 *)dst;
        pdst2 = (mlib_d64 *)(dst2 - 1);

#pragma pipeloop(0)
        for (i = 0; i < n / 2; i++) {
            /* weighted pair from the front */
            tmp_1.f32x2.i0 = base * src[0];
            tmp_1.f32x2.i1 = base * src[1];
            tmp1 = tmp_1.d64;

            /* weighted pair from the back, read in reverse */
            tmp_2.f32x2.i1 = base * src2[0];
            src2--;
            tmp_2.f32x2.i0 = base * src2[0];
            tmp2 = tmp_2.d64;

            pdst[0] = tmp1;
            pdst++;
            pdst2[0] = tmp2;
            pdst2--;

            src2--;
            src += 2;
            base += step;
        }

        if (n & 1) {
            /* odd n: the middle pair is not covered by the loop */
            dst += i * 2;
            dst[0] = src[0] * base;
            dst[1] = src[1] * base;
        }
    }

    return (MLIB_SUCCESS);
}
/*
 * __mlib_SignalMulBartlett_F32S
 *
 * Multiplies an interleaved stereo F32 signal by a triangular window,
 * in place.
 *
 *   data  buffer of 2 * n floats, overwritten with the result
 *   n     number of stereo sample pairs; must be greater than 1
 *
 * Returns MLIB_FAILURE if data is NULL or n <= 1, MLIB_SUCCESS
 * otherwise.
 *
 * The weight starts at 0 and grows by step = 2 / (n - 1) per sample
 * pair.  Each iteration applies the same weight to one pair taken from
 * the front and one pair taken from the back, so the two cursors meet
 * in the middle.  Results are written with 8-byte stores; when data is
 * only 4-byte aligned the pairs are re-packed with vis_faligndata() so
 * that the stores stay aligned.
 *
 * In the aligned branch the odd-n tail used to step back one float
 * before scaling.  That rescaled float 2i - 1 (already windowed by the
 * loop) and left float 2i + 1 untouched.  The tail now scales floats
 * 2i and 2i + 1, the middle pair, matching the aligned branch of
 * __mlib_SignalMulBartlett_F32S_F32S.
 */
mlib_status
__mlib_SignalMulBartlett_F32S(
    mlib_f32 *data,
    mlib_s32 n)
{
    /* unions used to assemble two floats into one 8-byte value */
    mlib_union64 tmp_1, tmp_2;
    /* step: weight increment per pair; base: current weight */
    mlib_d64 step, base, tmp1, tmp2;
    mlib_d64 tmp1_new, tmp2_new, tmp1_last, tmp2_last;
    mlib_s32 n1, i;
    /* cursor on the last float of the buffer */
    mlib_f32 *data2 = data + 2 * n - 1;
    mlib_d64 *pdata, *pdata2;
    mlib_u8 emask1, emask2;

    if ((data == NULL) || (n <= 1))
        return (MLIB_FAILURE);

    n1 = n - 1;
    step = 2. / n1;
    base = 0.;

    if ((mlib_addr)data & 7) {
        /*
         * data is not 8-byte aligned.  vis_alignaddr() returns the
         * aligned word containing data[0] and records the byte offset
         * used by the vis_faligndata() calls below.
         */
        pdata = vis_alignaddr((void *)data, 0);
        pdata2 = (mlib_d64 *)data2;

        /* masks select the half of an 8-byte word that lies inside data */
        emask1 = 3;
        emask2 = 12;

        /* the first and the last pair have weight 0: start from zeros */
        tmp1_last = vis_to_double_dup(0);
        data += 2;
        tmp2_last = vis_to_double_dup(0);
        data2 -= 2;

        /* store zero to the first and to the last float of the buffer */
        vis_pst_16(tmp1_last, pdata, emask1);
        vis_pst_16(tmp2_last, pdata2, emask2);

        pdata++;
        pdata2--;
        base += step;
        /* one pair from each end is already accounted for */
        n = n - 1;

#pragma pipeloop(0)
        for (i = 0; i < n / 2; i++) {
            /* weighted pair from the front */
            tmp_1.f32x2.i0 = base * data[0];
            tmp_1.f32x2.i1 = base * data[1];
            tmp1 = tmp_1.d64;
            data += 2;

            /* weighted pair from the back, read in reverse */
            tmp_2.f32x2.i1 = base * data2[0];
            data2--;
            tmp_2.f32x2.i0 = base * data2[0];
            tmp2 = tmp_2.d64;
            data2--;

            /* combine with the previous pair so the store is aligned */
            tmp1_new = vis_faligndata(tmp1_last, tmp1);
            tmp1_last = tmp1;
            tmp2_new = vis_faligndata(tmp2, tmp2_last);
            tmp2_last = tmp2;

            pdata[0] = tmp1_new;
            pdata++;
            pdata2[0] = tmp2_new;
            pdata2--;

            base += step;
        }

        if (n & 1) {
            /*
             * Two floats between the front and back stores are still
             * unscaled.  The shifted stores leave exactly the floats
             * at data - 1 and data open, hence the step back; they take
             * the weight of the last iteration.
             */
            data--;
            data[0] = data[0] * (base - step);
            data[1] = data[1] * (base - step);
        }
    } else {
        /* data is 8-byte aligned: every pair is stored directly */
        pdata = (mlib_d64 *)data;
        pdata2 = (mlib_d64 *)(data2 - 1);

#pragma pipeloop(0)
        for (i = 0; i < n / 2; i++) {
            /* weighted pair from the front */
            tmp_1.f32x2.i0 = base * data[0];
            tmp_1.f32x2.i1 = base * data[1];
            tmp1 = tmp_1.d64;

            /* weighted pair from the back, read in reverse */
            tmp_2.f32x2.i1 = base * data2[0];
            data2--;
            tmp_2.f32x2.i0 = base * data2[0];
            tmp2 = tmp_2.d64;

            pdata[0] = tmp1;
            pdata++;
            pdata2[0] = tmp2;
            pdata2--;

            data2--;
            data += 2;
            base += step;
        }

        if (n & 1) {
            /*
             * Odd n: data now points at the middle pair, which the
             * loop did not touch.  No step back here - the aligned
             * stores end exactly in front of it.
             */
            data[0] = data[0] * base;
            data[1] = data[1] * base;
        }
    }

    return (MLIB_SUCCESS);
}
/*
 * mlib_ImageAbs_S16
 *
 * Row-by-row absolute value of an S16 image region.
 *
 *   dst   first destination pixel of the region
 *   src   first source pixel of the region
 *   dlb   destination line stride in bytes
 *   slb   source line stride in bytes
 *   wid   pixels per row
 *   hgt   number of rows
 *
 * Always returns MLIB_SUCCESS.
 *
 * Each row is handled in three parts: a leading 8-byte word written
 * through an edge mask, a run of full words, and a trailing word
 * written through an edge mask.  The per-word work is done by the
 * macros READ_PXLS_ALIGN, READ_PXLS_UNALIGN, CALC_ABS_S16,
 * CALC_ABS_S16_UNROLL and STORE_ABS_VALUES, which are defined elsewhere
 * in this file.  NOTE(review): they appear to use the locals sp, dp,
 * prev, curr0, adat0, dabs, dtwo and mask by name - confirm before
 * renaming or removing any of them.
 */
mlib_status mlib_ImageAbs_S16( mlib_s16 *dst, mlib_s16 *src, mlib_s32 dlb, mlib_s32 slb, mlib_s32 wid, mlib_s32 hgt)
{
/* 8-byte aligned src, dst ptrs */
    mlib_d64 *sp, *dp;

/* raw 8-byte source words (not yet shifted to the dst alignment) */
    mlib_d64 prev;
    mlib_d64 curr0;
    mlib_d64 curr1, curr2;

/* source data shifted to the dst alignment */
    mlib_d64 adat0;

/* absolute values of result */
    mlib_d64 dabs;

/* shifted source data for the 4x unrolled loops */
    mlib_d64 adat1, adat2, adat3;

/* absolute values for the 4x unrolled loops */
    mlib_d64 dabs0, dabs1, dabs2, dabs3;
/* constants; not referenced directly here, presumably used by the macros */
    mlib_d64 dtwo = vis_to_double_dup(0x20002);
    mlib_d64 mask = vis_to_double_dup(0x80008000);

/* last pixel of line */
    mlib_s16 *dlast;

/* edge mask for the partial stores at both ends of a row */
    mlib_s32 mask0;

/* pxl count of source line */
    mlib_s32 slpxl = slb >> 1;

/* pxl count of destination line */
    mlib_s32 dlpxl = dlb >> 1;

/* dst offset for address alignment */
    mlib_s32 doffs;
    mlib_s32 row, block;

/* number of full 8-byte destination words in the row */
    mlib_s32 numblocks;

    for (row = 0; row < hgt; row++) {

/* ROW SETUP */

/* last dst pixel in row */
        dlast = dst + wid - 1;
        doffs = (mlib_addr)dst & 7;
/* aligned dest ptr */
        dp = (mlib_d64 *)((mlib_addr)dst & ~7);
/* aligned src ptr; also sets the shift used by vis_faligndata() */
        sp = (mlib_d64 *)vis_alignaddr(src, -(mlib_s32)doffs);
        prev = *sp;

/* FIRST d64 NEEDS EDGE MASK FOR DESTINATION START POINT */

/* edge mask for start point */
        mask0 = vis_edge16(dst, dlast);
        READ_PXLS_UNALIGN;
        CALC_ABS_S16;
        vis_pst_16(dabs, dp++, mask0);

        /* whole 8-byte words left between dp and the end of the row */
        numblocks = ((mlib_u8 *)dlast + 1 - (mlib_u8 *)dp) >> 3;

/* DO MOST OF ROW IN BLOCKS OF N d64s */

        if ((((mlib_addr)src ^ (mlib_addr)dst) & 7) == 0) {
            /* src and dst share alignment: words can be used as loaded */

#pragma pipeloop(0)
            for (block = 0; block < numblocks - 3; block += 4) {
                adat0 = sp[0];
                adat1 = sp[1];
                adat2 = sp[2];
                CALC_ABS_S16_UNROLL(dabs0, adat0);
                adat3 = sp[3];
                dp[0] = dabs0;
                CALC_ABS_S16_UNROLL(dabs1, adat1);
                dp[1] = dabs1;
                CALC_ABS_S16_UNROLL(dabs2, adat2);
                dp[2] = dabs2;
                CALC_ABS_S16_UNROLL(dabs3, adat3);
                dp[3] = dabs3;
                sp += 4;
                dp += 4;
            }

            /* remaining 0..3 full words, one at a time */
#pragma pipeloop(0)
            for (; block < numblocks; block++) {
                READ_PXLS_ALIGN;
                CALC_ABS_S16;
                STORE_ABS_VALUES;
            }

            /* reload prev so the tail code below can be shared */
            prev = *sp;
        } else {
            /* different alignment: build each word from two neighbours */

#pragma pipeloop(0)
            for (block = 0; block < numblocks - 3; block += 4) {
                curr0 = *(sp + 1);
                curr1 = *(sp + 2);
                curr2 = *(sp + 3);
                adat0 = vis_faligndata(prev, curr0);
                /* prev is reused as the 5th word of this group */
                prev = *(sp + 4);
                CALC_ABS_S16_UNROLL(dabs0, adat0);
                dp[0] = dabs0;
                adat1 = vis_faligndata(curr0, curr1);
                CALC_ABS_S16_UNROLL(dabs1, adat1);
                dp[1] = dabs1;
                adat2 = vis_faligndata(curr1, curr2);
                CALC_ABS_S16_UNROLL(dabs2, adat2);
                dp[2] = dabs2;
                adat3 = vis_faligndata(curr2, prev);
                CALC_ABS_S16_UNROLL(dabs3, adat3);
                dp[3] = dabs3;
                sp += 4;
                dp += 4;
            }

            /* remaining 0..3 full words, one at a time */
#pragma pipeloop(0)
            for (; block < numblocks; block++) {
                READ_PXLS_UNALIGN;
                CALC_ABS_S16;
                STORE_ABS_VALUES;
            }
        }

/* LAST d64 NEEDS EDGE MASK FOR DESTINATION END POINT */

        if ((mlib_addr)dp <= (mlib_addr)dlast) {
            curr0 = *(++sp);
/* edge mask for end point */
            mask0 = vis_edge16(dp, dlast);
            adat0 = vis_faligndata(prev, curr0);
            CALC_ABS_S16;
            vis_pst_16(dabs, dp, mask0);
        }

/* advance src to the next row */
        src += slpxl;
/* advance dst to the next row */
        dst += dlpxl;
    }

    return (MLIB_SUCCESS);
}
/*
 * mlib_v_ImageLookUpSI_U16_U8_2_DstA8D1
 *
 * Single-input lookup for one row: every U16 source value is expanded
 * into two U8 results, table[0][value] followed by table[1][value].
 *
 *   src    source values; xsize of them are read
 *   dst    destination, written with 8-byte stores starting at dst
 *   xsize  number of source values (= number of 2-byte output pairs)
 *   table  table[0] and table[1] are the two U8 lookup tables
 *
 * Four source values fill one 8-byte destination word.  A final group
 * of 1..3 values is written through an edge mask.
 */
void
mlib_v_ImageLookUpSI_U16_U8_2_DstA8D1(
    const mlib_u16 *src,
    mlib_u8 *dst,
    mlib_s32 xsize,
    const mlib_u8 **table)
{
    /* the two per-channel tables */
    const mlib_u8 *lut_a = &table[0][0];
    const mlib_u8 *lut_b = &table[1][0];
    /* read cursor */
    mlib_u16 *in = (void *)src;
    /* destination seen as 2-byte pairs, and its last pair */
    mlib_u16 *row = (mlib_u16 *)dst;
    mlib_u16 *last = row + xsize - 1;
    /* destination seen as 8-byte words */
    mlib_d64 *out = (mlib_d64 *)row;
    /* one looked-up byte */
    mlib_d64 px;
    /*
     * byte shift register; left uninitialised on purpose, only bytes
     * that were pushed in are ever stored
     */
    mlib_d64 pack;
    /* the four source values of the current group */
    mlib_s32 v0, v1, v2, v3;
    mlib_s32 done, left;

    /* GSR.align = 7: vis_faligndata(px, pack) pushes one byte in front */
    vis_alignaddr((void *)0, 7);

    if (xsize >= 4) {

        v0 = in[0];
        v1 = in[1];
        v2 = in[2];
        v3 = in[3];
        in += 4;

        /* convert group k while the values of group k + 1 are loaded */
#pragma pipeloop(0)
        for (done = 0; done <= xsize - 8; done += 4, in += 4) {
            /* push back to front so that lut_a[v0] ends up first */
            px = VIS_LD_U8_I(lut_b, v3);
            pack = vis_faligndata(px, pack);
            px = VIS_LD_U8_I(lut_a, v3);
            pack = vis_faligndata(px, pack);
            px = VIS_LD_U8_I(lut_b, v2);
            pack = vis_faligndata(px, pack);
            px = VIS_LD_U8_I(lut_a, v2);
            pack = vis_faligndata(px, pack);
            px = VIS_LD_U8_I(lut_b, v1);
            pack = vis_faligndata(px, pack);
            px = VIS_LD_U8_I(lut_a, v1);
            pack = vis_faligndata(px, pack);
            px = VIS_LD_U8_I(lut_b, v0);
            pack = vis_faligndata(px, pack);
            px = VIS_LD_U8_I(lut_a, v0);
            pack = vis_faligndata(px, pack);

            v0 = in[0];
            v1 = in[1];
            v2 = in[2];
            v3 = in[3];

            *out++ = pack;
        }

        /* the group loaded last still has to be converted */
        px = VIS_LD_U8_I(lut_b, v3);
        pack = vis_faligndata(px, pack);
        px = VIS_LD_U8_I(lut_a, v3);
        pack = vis_faligndata(px, pack);
        px = VIS_LD_U8_I(lut_b, v2);
        pack = vis_faligndata(px, pack);
        px = VIS_LD_U8_I(lut_a, v2);
        pack = vis_faligndata(px, pack);
        px = VIS_LD_U8_I(lut_b, v1);
        pack = vis_faligndata(px, pack);
        px = VIS_LD_U8_I(lut_a, v1);
        pack = vis_faligndata(px, pack);
        px = VIS_LD_U8_I(lut_b, v0);
        pack = vis_faligndata(px, pack);
        px = VIS_LD_U8_I(lut_a, v0);
        pack = vis_faligndata(px, pack);

        *out++ = pack;
    }

    if ((mlib_addr)out <= (mlib_addr)last) {

        /* pairs still missing; walk the source backwards from the end */
        left = (mlib_s32)(last - (mlib_u16 *)out) + 1;
        in += left - 1;

#pragma pipeloop(0)
        for (done = 0; done < left; done++) {
            v0 = (mlib_s32)*in;
            in--;

            px = VIS_LD_U8_I(lut_b, v0);
            pack = vis_faligndata(px, pack);
            px = VIS_LD_U8_I(lut_a, v0);
            pack = vis_faligndata(px, pack);
        }

        /* store only the pairs up to and including `last` */
        vis_pst_16(pack, out, vis_edge16(out, last));
    }
}
/*
 * __mlib_VectorConjRev_S16C_S16C_Sat
 *
 * Writes to zz the complex conjugate of xx in reversed element order,
 * with saturation: negating MLIB_S16_MIN yields MLIB_S16_MAX.
 *
 *   zz  output, n complex values (2 * n mlib_s16, real part first)
 *   xx  input,  n complex values
 *   n   number of complex values
 *
 * The source is read forwards while the destination is filled from its
 * end towards its start.  Argument checking and the short-vector path
 * are done by the CHECK and CONJREVC macros, and the 4-element kernel
 * by CONJ16; all three are defined elsewhere in this file.
 * NOTE(review): CONJ16 appears to read d3 and write d4, using dl, dh,
 * dlog0, dtwo and f_two as scratch/constants - confirm before renaming
 * any of them.
 */
mlib_status __mlib_VectorConjRev_S16C_S16C_Sat( mlib_s16 *zz, const mlib_s16 *xx, mlib_s32 n)
{
    mlib_s16 *x = (mlib_s16 *)xx, *z = (mlib_s16 *)zz;
    /* src walks forwards; dst starts one past the end of the output */
    mlib_s16 *src = (mlib_s16 *)x, *dst = (mlib_s16 *)&z[2 * n];
    mlib_d64 *dsrc, *ddst;
    mlib_d64 d1, d2, d3, d4, dl, dh, d_rest;
    mlib_d64 dlog0 = vis_to_double_dup(0x0000ffff), dtwo =
        vis_to_double(0, 2);
    mlib_f32 f_two = vis_to_float(0x20002);
    mlib_s16 c;
    /* odd is set when the alignment loop stops between imag and real */
    mlib_s32 i, rest_64, len_64, even_length, odd = 0, length =
        (mlib_s32)n * 2;
    /* real part held back when odd is set */
    mlib_s32 re_part;

    CHECK(x, z);

    /* short vectors are handled by the CONJREVC macro */
    if ((n < 16)) {
        CONJREVC(mlib_s16, MLIB_S16_MAX, MLIB_S16_MIN);
    }

    /*
     * Scalar loop until dst is 8-byte aligned.  Working downwards, the
     * negated imaginary part is written first and then the real part.
     */
    while (((mlib_addr)dst) & 7) {
        if ((c = src[1]) == MLIB_S16_MIN)
            *--dst = MLIB_S16_MAX;
        else
            *--dst = -c;
        length -= 2;
        src += 2;

        if (((mlib_addr)dst) & 7) {
            *--dst = src[-2];
        } else {
            /* aligned in the middle of an element: keep the real part */
            re_part = src[-2];
            odd = 1;
            break;
        }
    }

    vis_write_gsr(15 << 3);
    ddst = (mlib_d64 *)dst;
    /* remaining mlib_s16 count split into 4-element words and a rest */
    rest_64 = length & 3;
    len_64 = length >> 2;
    even_length = len_64 << 2;

    if (!odd) {

/*
 * Aligning loop finished with imaginary part. The following processing
 * starts with real part.
 */

        if (!((mlib_addr)src & 7)) {

/*
 * Src address is 8-byte aligned.
 */

            dsrc = (mlib_d64 *)src;
#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d3 = (*dsrc++);
                CONJ16;
                *--ddst = d4;
            }
        } else {
            /* unaligned src: build each word from two aligned loads */
            dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
            d2 = (*dsrc++);
#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d1 = d2;
                d2 = (*dsrc++);
                d3 = vis_faligndata(d1, d2);
                CONJ16;
                *--ddst = d4;
            }
        }
    } else {

/*
 * Aligning loop finished with real part. The following processing
 * starts with imaginary part.
 */

        if (!((mlib_addr)src & 7)) {

/*
 * Src address is 8-byte aligned.
 */

            dsrc = (mlib_d64 *)vis_alignaddr(src, 2);
            /* carry word seeded with the held-back real part */
            d_rest = vis_to_double((re_part << 16), 0);
#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d3 = (*dsrc++);
                CONJ16;
                /* merge this result with the carry from the previous word */
                *--ddst = vis_faligndata(d4, d_rest);
                d_rest = d4;
            }

            /* flush the final carry into the next lower word */
            ddst--;
            d_rest = vis_faligndata(d_rest, d_rest);
            vis_pst_16(d_rest, ddst, 0x1);
        } else {
            /* first pass: convert as if the output were not shifted */
            dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
            d2 = (*dsrc++);
#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d1 = d2;
                d2 = (*dsrc++);
                d3 = vis_faligndata(d1, d2);
                CONJ16;
                *--ddst = d4;
            }

            /* second pass: shift what was just written, in place */
            vis_write_gsr(2);
            d2 = *ddst;
            d3 = vis_faligndata(d1, d2);
            vis_pst_16(d3, (ddst - 1), 0x1);
#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d1 = d2;
                d2 = *(ddst + 1);
                (*ddst++) = vis_faligndata(d1, d2);
            }

            /* finally drop the held-back real part into place */
            dst[-1] = re_part;
        }

        /* account for the real part that has now been written */
        dst--;
    }

    if (!rest_64)
        return (MLIB_SUCCESS);

    /* scalar tail for what did not fill a whole 8-byte word */
    for (i = 0; i < rest_64; i += 2) {
        dst[-even_length - 2 - i] = src[even_length + i];

        if ((c = src[even_length + i + 1]) == MLIB_S16_MIN)
            dst[-even_length - 2 - i + 1] = MLIB_S16_MAX;
        else
            dst[-even_length - 2 - i + 1] = -c;
    }

    return (MLIB_SUCCESS);
}
/*
 * mlib_v_ImageSqrShift_S16_D1
 *
 * Applies MLIB_V_IMAGESQRSHIFT_S16 to one row of S16 data.
 *
 *   src    source row (any alignment)
 *   dst    destination row (any alignment)
 *   dsize  number of S16 elements in the row
 *   shift  not referenced in this function
 *
 * The destination is written in aligned 8-byte words: a first word
 * through an edge mask, then full words, then a last word through an
 * edge mask.  The source is read in aligned words and shifted to the
 * destination alignment with vis_faligndata().
 */
void
mlib_v_ImageSqrShift_S16_D1(
    mlib_s16 *src,
    mlib_s16 *dst,
    mlib_s32 dsize,
    mlib_s32 shift)
{
    /*
     * NOTE(review): these six are presumably used by name inside
     * MLIB_V_IMAGESQRSHIFT_S16 - keep the names.
     */
    mlib_d64 rdhh, rdhl;
    mlib_d64 rdlh, rdll;
    mlib_d64 rdh, rdl;
    /* two neighbouring aligned source words */
    mlib_d64 lo, hi;
    /* source word shifted to dst alignment, and the computed result */
    mlib_d64 pix, res;
    /* aligned read and write cursors */
    mlib_d64 *in, *out;
    /* last destination element */
    mlib_s16 *last = dst + dsize - 1;
    /* byte distance (<= 0) from dst back to its aligned word */
    mlib_s32 skew;
    mlib_s32 edge;
    mlib_s32 k, full;

    /* round dst down to an 8-byte boundary and move src by as much */
    out = (mlib_d64 *)((mlib_addr)dst & (~7));
    skew = (mlib_addr)out - (mlib_addr)dst;
    in = (mlib_d64 *)vis_alignaddr(src, skew);

    /* leading word: write only the elements from dst onwards */
    edge = vis_edge16(dst, last);
    lo = vis_ld_d64_nf(in);
    in++;
    hi = vis_ld_d64_nf(in);
    in++;
    pix = vis_faligndata(lo, hi);
    MLIB_V_IMAGESQRSHIFT_S16(pix, res);
    vis_pst_16(res, out, edge);
    out++;

    /* number of complete 8-byte words that still fit in the row */
    full = ((mlib_u8 *)(last + 1) - (mlib_u8 *)out) / 8;

#pragma pipeloop(0)
    for (k = 0; k < full; k++) {
        lo = hi;
        hi = vis_ld_d64_nf(in);
        in++;
        pix = vis_faligndata(lo, hi);
        MLIB_V_IMAGESQRSHIFT_S16(pix, res);
        *out = res;
        out++;
    }

    /* trailing word: write only the elements up to `last` */
    if ((mlib_addr)out <= (mlib_addr)last) {
        edge = vis_edge16(out, last);
        lo = hi;
        hi = vis_ld_d64_nf(in);
        pix = vis_faligndata(lo, hi);
        MLIB_V_IMAGESQRSHIFT_S16(pix, res);
        vis_pst_16(res, out, edge);
    }
}