/*
 * 3-channel S16 -> S16 lookup-table translation for a destination row
 * known to be 8-byte aligned (the "_D1" variant).
 *
 *   src            - source pixel row (xsize * 3 mlib_s16 samples)
 *   dst            - destination row, 8-byte aligned
 *   xsize          - number of samples to translate in this row
 *   table0..table2 - per-channel lookup tables for channels 0, 1, 2
 *
 * Four samples are translated per iteration.  Each 16-bit result is
 * loaded with VIS_LD_U16_I and shifted into the top of a 64-bit
 * accumulator with vis_faligndata (the GSR align offset of 6 set below
 * shifts in 2 bytes per call); vis_bshuffle then merges the two
 * accumulators into one 8-byte store.  Because 4 samples do not divide
 * evenly into 3 channels, the channel of a given sample slot rotates
 * every group of 4, which is why the three table pointers are rotated
 * at the bottom of each iteration.
 */
void
mlib_v_ImageLookUp_S16_S16_3_D1(
    const mlib_s16 *src,
    mlib_s16 *dst,
    mlib_s32 xsize,
    const mlib_s16 *table0,
    const mlib_s16 *table1,
    const mlib_s16 *table2)
{
    /* pointer to source data */
    mlib_s16 *sp;

    /* source data (indices pre-scaled to byte offsets, see "<< 1") */
    mlib_s32 s0, s1, s2, s3;

    /* pointer to start of destination */
    mlib_s16 *dl;

    /* pointer to end of destination */
    mlib_s16 *dend;

    /* aligned pointer to destination */
    mlib_d64 *dp;

    /* looked-up table entries */
    mlib_d64 t0, t1, t2, t3;

    /* destination accumulators */
    mlib_d64 acc0, acc1;

    /* edge mask for the partial final store */
    mlib_s32 emask;

    /* loop variable / count of trailing samples */
    mlib_s32 i, num;
    const mlib_s16 *table;

    dl = dst;
    sp = (void *)src;
    dp = (mlib_d64 *)dl;
    dend = dl + xsize - 1;

    /*
     * GSR align offset 6: each vis_faligndata(t, acc) shifts acc left
     * by 2 bytes and inserts the new 16-bit value at the top.
     */
    vis_alignaddr((void *)0, 6);

    i = 0;

    if (xsize >= 4) {
        /* indices are doubled because the tables are addressed in bytes */
        s0 = sp[0] << 1;
        s1 = sp[1] << 1;
        s2 = sp[2] << 1;
        s3 = sp[3] << 1;
        sp += 4;

        vis_write_bmask(0x012389ab, 0);

#pragma pipeloop(0)
        for (i = 0; i <= xsize - 8; i += 4, sp += 4) {
            /*
             * Channels of samples 0..3 in this group are 0,1,2,0 —
             * hence table0 for both s0 and s3.
             */
            t3 = VIS_LD_U16_I(table0, s3);
            t2 = VIS_LD_U16_I(table2, s2);
            t1 = VIS_LD_U16_I(table1, s1);
            t0 = VIS_LD_U16_I(table0, s0);
            acc1 = vis_faligndata(t3, acc1);
            acc1 = vis_faligndata(t2, acc1);
            acc0 = vis_faligndata(t1, acc0);
            acc0 = vis_faligndata(t0, acc0);
            /* prefetch the indices of the next group */
            s0 = sp[0] << 1;
            s1 = sp[1] << 1;
            s2 = sp[2] << 1;
            s3 = sp[3] << 1;
            (*dp++) = vis_bshuffle(acc0, acc1);
            /* rotate the channel tables for the next 4-sample group */
            table = table0;
            table0 = table1;
            table1 = table2;
            table2 = table;
        }

        /* drain the last pipelined group started above */
        t3 = VIS_LD_U16_I(table0, s3);
        t2 = VIS_LD_U16_I(table2, s2);
        t1 = VIS_LD_U16_I(table1, s1);
        t0 = VIS_LD_U16_I(table0, s0);
        acc1 = vis_faligndata(t3, acc1);
        acc1 = vis_faligndata(t2, acc1);
        acc0 = vis_faligndata(t1, acc0);
        acc0 = vis_faligndata(t0, acc0);
        (*dp++) = vis_bshuffle(acc0, acc1);
        table = table0;
        table0 = table1;
        table1 = table2;
        table2 = table;
        i += 4;
    }

    if ((mlib_addr)dp <= (mlib_addr)dend) {
        /* 0..3 samples remain; process them back-to-front */
        num = (mlib_s16 *)dend - (mlib_s16 *)dp;
        sp += num;
        num++;

        if (num == 1) {
            s0 = (mlib_s32)*sp;
            sp--;
            t0 = VIS_LD_U16_I(table0, s0 << 1);
            acc0 = vis_faligndata(t0, acc0);
        } else if (num == 2) {
            s0 = (mlib_s32)*sp;
            sp--;
            t0 = VIS_LD_U16_I(table1, s0 << 1);
            acc0 = vis_faligndata(t0, acc0);
            s0 = (mlib_s32)*sp;
            sp--;
            t0 = VIS_LD_U16_I(table0, s0 << 1);
            acc0 = vis_faligndata(t0, acc0);
        } else if (num == 3) {
            s0 = (mlib_s32)*sp;
            sp--;
            t0 = VIS_LD_U16_I(table2, s0 << 1);
            acc0 = vis_faligndata(t0, acc0);
            s0 = (mlib_s32)*sp;
            sp--;
            t0 = VIS_LD_U16_I(table1, s0 << 1);
            acc0 = vis_faligndata(t0, acc0);
            s0 = (mlib_s32)*sp;
            sp--;
            t0 = VIS_LD_U16_I(table0, s0 << 1);
            acc0 = vis_faligndata(t0, acc0);
        }

        /* partial store of the trailing samples */
        emask = vis_edge16(dp, dend);
        vis_pst_16(acc0, dp, emask);
    }
}
/*
 * Convert an S32 vector to a U8 vector with saturation:
 * z[i] = clamp(x[i], 0, MLIB_U8_MAX) for i in [0, n).
 *
 * Short vectors (n < 8) go through the scalar PACK_S_S macro (which
 * returns from the function).  Otherwise the destination is first
 * aligned to 8 bytes with a scalar prologue, the bulk is converted
 * 8 elements (four 8-byte reads -> one 8-byte write) per iteration
 * with vis_fpack32/vis_fpmerge, and the tail is finished scalar.
 */
mlib_status
__mlib_VectorConvert_U8_S32_Sat(
    mlib_u8 *z,
    const mlib_s32 *x,
    mlib_s32 n)
{
    mlib_s32 *src = (void *)x;
    mlib_u8 *dst = z;
    mlib_d64 *dsrc, *ddst;
    mlib_d64 d0, d_tmp, d1, d2, d3, d4;
    mlib_s32 len_64, even_length, rest_64, length = n, i;
    mlib_s32 c;

    if (n < 8) {
        PACK_S_S(mlib_s32, mlib_u8, MLIB_U8_MAX, 0);
    }

    /*
     * First try to align destination address for 8 bytes.
     */
    while ((mlib_addr)dst & 7) {
        (*dst++) = (c = (*src++)) < 0 ? 0 :
            (c > MLIB_U8_MAX ? MLIB_U8_MAX : c);
        length--;
    }

    rest_64 = length & 7;
    len_64 = length >> 3;
    even_length = len_64 << 3;
    ddst = (mlib_d64 *)dst;

    /*
     * GSR scale factor 23 positions the byte extracted by fpack32
     * so that the S32 value saturates correctly into a U8 lane.
     */
    vis_write_gsr(23 << 3);

    /*
     * Now analyze source address alignment.
     */
    if (((mlib_addr)src & 7) == 0) {
        /*
         * Source address is also 8-byte aligned.
         */
        dsrc = (mlib_d64 *)src;

#pragma pipeloop(0)
#pragma unroll(4)
        for (i = 0; i < len_64; i++) {
            /* 4 doubles = 8 S32 in; fpack32 chain packs one U8 each */
            d1 = (*dsrc++);
            d2 = (*dsrc++);
            d3 = (*dsrc++);
            d4 = (*dsrc++);
            d1 = vis_fpack32(d1, d1);
            d2 = vis_fpack32(d1, d2);
            d3 = vis_fpack32(d2, d3);
            d4 = vis_fpack32(d3, d4);
            (*ddst++) = vis_fpmerge(vis_read_hi(d4), vis_read_lo(d4));
        }
    } else {
        /*
         * Source address is arbitrary aligned. Use vis_alignaddr() and
         * vis_faligndata() functions.
         */
        dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
        d0 = (*dsrc++);

#pragma pipeloop(0)
#pragma unroll(4)
        for (i = 0; i < len_64; i++) {
            d_tmp = (*dsrc++);
            d1 = vis_faligndata(d0, d_tmp);
            d0 = (*dsrc++);
            d2 = vis_faligndata(d_tmp, d0);
            d_tmp = (*dsrc++);
            d3 = vis_faligndata(d0, d_tmp);
            /* non-faulting load: last read may touch past the array */
            d0 = vis_ld_d64_nf(dsrc);
            dsrc++;
            d4 = vis_faligndata(d_tmp, d0);
            d1 = vis_fpack32(d1, d1);
            d2 = vis_fpack32(d1, d2);
            d3 = vis_fpack32(d2, d3);
            d4 = vis_fpack32(d3, d4);
            (*ddst++) = vis_fpmerge(vis_read_hi(d4), vis_read_lo(d4));
        }
    }

    /* scalar tail: remaining 0..7 elements (MLIB_U8_MIN == 0) */
    for (i = 0; i < rest_64; i++) {
        c = src[even_length + i];
        dst[even_length + i] =
            c < MLIB_U8_MIN ? MLIB_U8_MIN :
            (c > MLIB_U8_MAX ? MLIB_U8_MAX : c);
    }

    return (MLIB_SUCCESS);
}
/*
 * Subtract an S16 vector from an S16 scalar, producing S32 results
 * (mod semantics): z[i] = c[0] - x[i] for i in [0, n).
 *
 * Returns MLIB_FAILURE when n <= 0.  Very short vectors (n <= 4) are
 * handled by the scalar SUBS_S32_S16_IN_C macro (which returns).  The
 * destination is aligned to 8 bytes with a scalar prologue; the bulk
 * processes 4 source S16 (one 8-byte read) into 4 S32 (two 8-byte
 * writes) per iteration via the SUBS_S32_S16_MOD macro, which consumes
 * dx/dc and produces dzh/dzl (fone and dc feed that macro).
 */
mlib_status
__mlib_VectorSubS_S32_S16_Mod(
    mlib_s32 *z,
    const mlib_s16 *x,
    const mlib_s16 *c,
    mlib_s32 n)
{
    mlib_d64 *dpz, *dpx;
    mlib_d64 dx, dx0, dx1;
    mlib_d64 dr1, dr2, dzh, dzl;
    mlib_f32 fone = vis_to_float(0x10001);
    mlib_s32 uc = *((mlib_s16 *)c);
    mlib_s16 *px;
    mlib_s32 *pz;
    mlib_s32 len = n, i;

    /* rest and leng in terms of 8 bytes. */
    mlib_s32 rest_8, even_8;
    mlib_d64 dc = vis_to_double_dup(uc);

    if (n <= 0)
        return (MLIB_FAILURE);

    px = (mlib_s16 *)x;
    pz = (mlib_s32 *)z;

    if (n <= 4)
        SUBS_S32_S16_IN_C;

    /*
     * prepare the destination address
     */
    while ((mlib_addr)pz & 7) {
        (*pz++) = uc - ((mlib_s32)(*px));
        px++;
        len--;
    }

    dpz = (mlib_d64 *)pz;
    even_8 = len >> 2;
    rest_8 = len & 0x3;

    if (!((mlib_addr)px & 7)) {
        /*
         * 'x' address is 8-byte aligned.
         * No vis_alignaddr and vis_faligndata at all.
         */
        dpx = (mlib_d64 *)px;

#pragma pipeloop(0)
        for (i = 0; i < even_8; i++) {
            dx = (*dpx++);
            SUBS_S32_S16_MOD;
            /*
             * store 16 bytes of result
             */
            dpz[0] = dzh;
            dpz[1] = dzl;
            dpz += 2;
        }
    } else {
        /*
         * "x" address is arbitrary aligned.
         * 1 vis_alignaddr and 1 vis_faligndata in the loop.
         */
        dpx = vis_alignaddr(px, 0);
        dx0 = vis_ld_d64_nf(dpx);
        dpx++;

#pragma pipeloop(0)
        for (i = 0; i < even_8; i++) {
            dx1 = vis_ld_d64_nf(dpx);
            dpx++;
            dx = vis_faligndata(dx0, dx1);
            SUBS_S32_S16_MOD;
            dx0 = dx1;
            /*
             * store 16 bytes of result
             */
            dpz[0] = dzh;
            dpz[1] = dzl;
            dpz += 2;
        }
    }

    if (!rest_8)
        return (MLIB_SUCCESS);

    /* scalar tail: remaining 0..3 elements */
    px += (even_8 << 2);
    pz += (even_8 << 2);

    while (rest_8--) {
        (*pz++) = uc - ((mlib_s32)(*px));
        px++;
    }

    return (MLIB_SUCCESS);
}
/*
 * Convert a U8 vector to an S8 vector with saturation:
 * z[i] = min(x[i], MLIB_S8_MAX) for i in [0, n).
 *
 * Short vectors (length < 16) use the scalar PACK_U_S macro (which
 * returns).  The vector path widens bytes to 16 bits, biases them by
 * 0x80 (dsp), packs with fpack16 (GSR scale 7), and undoes the bias
 * with an XOR of 0x80 per byte (rst) — the standard unsigned-clamp
 * trick for fpack16, which saturates signed.
 */
mlib_status
__mlib_VectorConvert_S8_U8_Sat(
    mlib_s8 *z,
    const mlib_u8 *x,
    mlib_s32 n)
{
    mlib_u8 *src = (void *)x;
    mlib_s8 *dst = z;
    mlib_d64 fzero = vis_fzeros();
    mlib_d64 *dsrc, *ddst;
    mlib_d64 d1, d2, d3, d4, d5, d6;
    mlib_s32 len_64, even_length, rest_64, length = n, i;
    mlib_u8 c;

    /* bias (0x80 per 16-bit lane) and restore (0x80 per byte) constants */
    mlib_d64 dsp = vis_to_double_dup(0x800080);
    mlib_d64 rst = vis_to_double_dup(0x80808080);
    mlib_f32 fm = vis_to_float(0x100);

    if (length < 16) {
        PACK_U_S(mlib_u8, mlib_s8, MLIB_S8_MAX);
    }

    /*
     * First, try to align destination address for 8 bytes.
     */
    while ((mlib_addr)dst & 7) {
        (*dst++) = (c = (*src++)) > MLIB_S8_MAX ? MLIB_S8_MAX : c;
        length--;
    }

    rest_64 = length & 7;
    len_64 = length >> 3;
    even_length = len_64 << 3;
    ddst = (mlib_d64 *)dst;
    vis_write_gsr(7 << 3);

    /*
     * Now analyze source address alignment.
     */
    if (((mlib_addr)src & 7) == 0) {
        /*
         * Source address is also 8-byte aligned.
         */
        dsrc = (mlib_d64 *)src;

        /*
         * Peeling the 1st iteration when len_64 is odd.
         */
        if (i = (len_64 & 1)) {
            d1 = (*dsrc++);
            d2 = vis_fpmerge(fzero, vis_read_hi(d1));
            d3 = vis_fmul8x16al(vis_read_lo(d1), fm);
            d2 = vis_fpadd16(dsp, d2);
            d3 = vis_fpadd16(dsp, d3);
            d1 = vis_fpack16_pair(d2, d3);
            (*ddst++) = vis_fxor(d1, rst);
        }

        /*
         * Then loop with step==2. Unroll for 2 iterations.
         */
#pragma pipeloop(0)
#pragma unroll(4)
        for (; i < len_64; i += 2) {
            d1 = (*dsrc++);
            d4 = (*dsrc++);
            d2 = vis_fpmerge(fzero, vis_read_hi(d1));
            d3 = vis_fmul8x16al(vis_read_lo(d1), fm);
            d2 = vis_fpadd16(dsp, d2);
            d3 = vis_fpadd16(dsp, d3);
            d1 = vis_fpack16_pair(d2, d3);
            d2 = vis_fpmerge(fzero, vis_read_hi(d4));
            d3 = vis_fmul8x16al(vis_read_lo(d4), fm);
            d2 = vis_fpadd16(dsp, d2);
            d3 = vis_fpadd16(dsp, d3);
            d4 = vis_fpack16_pair(d2, d3);
            (*ddst++) = vis_fxor(d1, rst);
            (*ddst++) = vis_fxor(d4, rst);
        }
    } else {
        /*
         * Source address has arbitrary alignment. Use vis_alignaddr()
         * and vis_faligndata() functions.
         */
        dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
        d2 = (*dsrc++);

        /*
         * Peeling of 1 iteration.
         */
        if (i = (len_64 & 1)) {
            d1 = d2;
            d2 = vis_ld_d64_nf(dsrc);
            dsrc++;
            d1 = vis_faligndata(d1, d2);
            d3 = vis_fmul8x16al(vis_read_hi(d1), fm);
            d4 = vis_fmul8x16al(vis_read_lo(d1), fm);
            d3 = vis_fpadd16(dsp, d3);
            d4 = vis_fpadd16(dsp, d4);
            d1 = vis_fpack16_pair(d3, d4);
            (*ddst++) = vis_fxor(d1, rst);
        }

        /*
         * Then loop with step==2.
         */
#pragma pipeloop(0)
#pragma unroll(2)
        for (; i < len_64; i += 2) {
            d1 = d2;
            d2 = vis_ld_d64_nf(dsrc);
            dsrc++;
            d3 = vis_faligndata(d1, d2);
            d1 = d2;
            d2 = vis_ld_d64_nf(dsrc);
            dsrc++;
            d6 = vis_faligndata(d1, d2);
            d4 = vis_fmul8x16al(vis_read_hi(d3), fm);
            d5 = vis_fmul8x16al(vis_read_lo(d3), fm);
            d4 = vis_fpadd16(dsp, d4);
            d5 = vis_fpadd16(dsp, d5);
            d3 = vis_fpack16_pair(d4, d5);
            d4 = vis_fmul8x16al(vis_read_hi(d6), fm);
            d5 = vis_fmul8x16al(vis_read_lo(d6), fm);
            d4 = vis_fpadd16(dsp, d4);
            d5 = vis_fpadd16(dsp, d5);
            d6 = vis_fpack16_pair(d4, d5);
            (*ddst++) = vis_fxor(d3, rst);
            (*ddst++) = vis_fxor(d6, rst);
        }
    }

    /* scalar tail: remaining 0..7 elements */
    for (i = 0; i < rest_64; i++)
        dst[even_length + i] =
            (c = src[even_length + i]) > MLIB_S8_MAX ? MLIB_S8_MAX : c;

    return (MLIB_SUCCESS);
}
/*
 * Convert an S16 vector to a U8 vector with saturation:
 * z[i] = clamp(x[i], 0, MLIB_U8_MAX) for i in [0, n).
 *
 * Short vectors (n < 16) go through the scalar PACK_S_U_DF macro
 * (which returns).  Otherwise the destination is 8-byte aligned with
 * a scalar prologue; the bulk packs 8 S16 (two 8-byte reads) into
 * 8 U8 (one 8-byte write) per step using vis_fpack16_pair with GSR
 * scale 7, which performs the saturating clamp in hardware.
 */
mlib_status
__mlib_VectorConvert_U8_S16_Sat(
    mlib_u8 *z,
    const mlib_s16 *x,
    mlib_s32 n)
{
    mlib_s16 *src = (void *)x;
    mlib_u8 *dst = z;
    mlib_d64 *dsrc, *ddst;
    mlib_d64 d1, d2, d3, d4, d5, d6, d7;
    mlib_s32 len_64, even_length, rest_64, length = n, i;
    mlib_s16 c;

    if (n < 16) {
        PACK_S_U_DF(mlib_s16, mlib_u8, MLIB_U8_MAX, 0);
    }

    /*
     * First try to align destination address for 8 bytes.
     */
    while ((mlib_addr)dst & 7) {
        (*dst++) = (c = (*src++)) < 0 ? 0 :
            (c > MLIB_U8_MAX ? MLIB_U8_MAX : c);
        length--;
    }

    rest_64 = length & 7;
    len_64 = length >> 3;
    even_length = len_64 << 3;
    ddst = (mlib_d64 *)dst;
    vis_write_gsr(7 << 3);

    /*
     * Now analyze source address alignment.
     */
    if (((mlib_addr)src & 7) == 0) {
        /*
         * Source address is also 8-byte aligned.
         */
        dsrc = (mlib_d64 *)src;

        /*
         * Peeling the 1st iteration when len_64 is odd.
         */
        if (i = (len_64 & 1)) {
            d4 = (*dsrc++);
            d5 = (*dsrc++);
            d3 = vis_fpack16_pair(d4, d5);
            (*ddst++) = d3;
        }

        /*
         * Then loop with step==2. Unroll for 2 iterations.
         */
#pragma pipeloop(0)
#pragma unroll(2)
        for (; i < len_64; i += 2) {
            d1 = (*dsrc++);
            d2 = (*dsrc++);
            d5 = (*dsrc++);
            d6 = (*dsrc++);
            d3 = vis_fpack16_pair(d1, d2);
            d7 = vis_fpack16_pair(d5, d6);
            (*ddst++) = d3;
            (*ddst++) = d7;
        }
    } else {
        /*
         * Source address is only 2-byte aligned. Use vis_alignaddr()
         * and vis_faligndata() functions.
         */
        dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
        d2 = (*dsrc++);

        /*
         * Peeling of 1 iteration.
         */
        if (i = (len_64 & 1)) {
            d1 = d2;
            d2 = vis_ld_d64_nf(dsrc);
            dsrc++;
            d3 = vis_faligndata(d1, d2);
            d1 = d2;
            d2 = vis_ld_d64_nf(dsrc);
            dsrc++;
            d4 = vis_faligndata(d1, d2);
            d3 = vis_fpack16_pair(d3, d4);
            (*ddst++) = d3;
        }

        /*
         * Then loop with step==2.
         */
#pragma pipeloop(0)
#pragma unroll(4)
        for (; i < len_64; i += 2) {
            d1 = d2;
            d2 = (*dsrc++);
            d3 = vis_faligndata(d1, d2);
            d1 = d2;
            d2 = (*dsrc++);
            d4 = vis_faligndata(d1, d2);
            d1 = d2;
            d2 = (*dsrc++);
            d5 = vis_faligndata(d1, d2);
            d1 = d2;
            d2 = (*dsrc++);
            d6 = vis_faligndata(d1, d2);
            d3 = vis_fpack16_pair(d3, d4);
            d5 = vis_fpack16_pair(d5, d6);
            (*ddst++) = d3;
            (*ddst++) = d5;
        }
    }

    /* scalar tail: remaining 0..7 elements */
    for (i = 0; i < rest_64; i++)
        dst[even_length + i] =
            (c = src[even_length + i]) < 0 ? 0 :
            (c > MLIB_U8_MAX ? MLIB_U8_MAX : c);

    return (MLIB_SUCCESS);
}
/*
 * Per-pixel add of two U8 images: dst = src1 (+) src2, where the
 * per-byte arithmetic is performed by the MLIB_V_ADDIMAGE_U8 macro
 * (presumably saturating, given the fpack16 GSR setup — confirm
 * against the macro's definition).
 *
 * The VALIDATE macro declares and checks sp1/sp2/dp, sl1/sl2/dl,
 * width/height/channels and stride1/stride2/strided.  Rows are
 * processed with an 8-byte-aligned destination store stream plus
 * edge-masked partial stores at both row ends; five alignment cases
 * are dispatched on the relative byte offsets of dst/src1/src2:
 *   1. all three identically aligned (no faligndata),
 *   2. dst aligned with src1 (re-align src2 only),
 *   3. dst aligned with src2 (re-align src1 only),
 *   4. src1 aligned with src2 (re-align the result before storing),
 *   5. common case (stage src1 into dst, then add src2 in place).
 */
mlib_status
mlib_v_ImageAdd_U8(
    mlib_image *dst,
    const mlib_image *src1,
    const mlib_image *src2)
{
    mlib_s32 i, j, k;
    mlib_s32 offdst, offsrc1, offsrc2, emask;
    mlib_s32 amount;
    mlib_d64 *dpp, *spp2, *spp1, *tmp_ptr;
    mlib_d64 dd, dd0, dd1, sd10, sd11, sd20, sd21;
    mlib_d64 sd1h, sd2h, sd1l, sd2l, rdh, rdl;
    mlib_u8 *dend;
    mlib_f32 nul = vis_to_float(0), fone = vis_to_float(0x100);

    VALIDATE(mlib_u8);

    /* initialize GSR scale factor */
    vis_write_gsr(7 << 3);

    sl1 = sp1;
    sl2 = sp2;
    dl = dp;

    amount = width * channels;

    offdst = ((mlib_addr)dp) & 7;
    offsrc1 = ((mlib_addr)sp1) & 7;
    offsrc2 = ((mlib_addr)sp2) & 7;

    if ((offdst == offsrc1) && (offdst == offsrc2) &&
        (((strided ^ stride1) & 7) == 0) &&
        (((strided ^ stride2) & 7) == 0)) {
        /* case 1: all three buffers identically aligned on every row */
        for (j = 0; j < height; j++) {
            /* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_u8 *)dpp - dp;

            /* prepare the source addresses */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);

            dend = dp + amount - 1;
            /* generate edge mask for the start point */
            emask = vis_edge8(dp, dend);

            if (emask != 0xff) {
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
                vis_pst_8(dd, dpp++, emask);
                i += 8;
            }
#pragma pipeloop(0)
            for (; i <= amount - 8; i += 8) {
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
                (*dpp++) = dd;
            }

            if (i < amount) {
                emask = vis_edge8(dpp, dend);
                sd10 = (*spp1++);
                sd20 = (*spp2++);
                MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
                vis_pst_8(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    } else if ((offdst == offsrc1) && (((strided ^ stride1) & 7) == 0)) {
        /* case 2: dst aligned with src1; re-align src2 via faligndata */
        for (j = 0; j < height; j++) {
            /* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_u8 *)dpp - dp;

            /* prepare the source addresses */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0);
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, i);

            dend = dp + amount - 1;
            /* generate edge mask for the start point */
            emask = vis_edge8(dp, dend);

            sd20 = vis_ld_d64_nf(spp2);

            if (emask != 0xff) {
                sd10 = (*spp1++);
                sd21 = vis_ld_d64_nf(spp2 + 1);
                sd20 = vis_faligndata(sd20, sd21);
                MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
                vis_pst_8(dd, dpp++, emask);
                sd20 = sd21;
                spp2++;
                i += 8;
            }
#pragma pipeloop(0)
            for (; i <= amount - 8; i += 8) {
                sd10 = (*spp1++);
                sd21 = vis_ld_d64_nf(spp2 + 1);
                sd20 = vis_faligndata(sd20, sd21);
                MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
                (*dpp++) = dd;
                sd20 = sd21;
                spp2++;
            }

            if (i < amount) {
                emask = vis_edge8(dpp, dend);
                sd10 = (*spp1++);
                sd20 = vis_faligndata(sd20, vis_ld_d64_nf(spp2 + 1));
                MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
                vis_pst_8(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    } else if ((offdst == offsrc2) && (((strided ^ stride2) & 7) == 0)) {
        /* case 3: dst aligned with src2; re-align src1 via faligndata */
        for (j = 0; j < height; j++) {
            /* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_u8 *)dpp - dp;

            /* prepare the source addresses */
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0);
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, i);

            dend = dp + amount - 1;
            /* generate edge mask for the start point */
            emask = vis_edge8(dp, dend);

            sd10 = vis_ld_d64_nf(spp1);

            if (emask != 0xff) {
                sd20 = (*spp2++);
                sd11 = vis_ld_d64_nf(spp1 + 1);
                sd10 = vis_faligndata(sd10, sd11);
                MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
                vis_pst_8(dd, dpp++, emask);
                sd10 = sd11;
                spp1++;
                i += 8;
            }
#pragma pipeloop(0)
            for (; i <= amount - 8; i += 8) {
                sd20 = (*spp2++);
                sd11 = vis_ld_d64_nf(spp1 + 1);
                sd10 = vis_faligndata(sd10, sd11);
                MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
                (*dpp++) = dd;
                sd10 = sd11;
                spp1++;
            }

            if (i < amount) {
                emask = vis_edge8(dpp, dend);
                sd20 = (*spp2++);
                sd10 = vis_faligndata(sd10, vis_ld_d64_nf(spp1 + 1));
                MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
                vis_pst_8(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    } else if ((offsrc1 == offsrc2) && (((stride1 ^ stride2) & 7) == 0)) {
        /*
         * case 4: the two sources are mutually aligned; add aligned
         * source words and re-align the RESULT before storing.
         */
        for (j = 0; j < height; j++) {
            /* prepare the source addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_u8 *)dpp - dp;

            /* prepare the destination addresses */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, i);
            spp2 = (mlib_d64 *)vis_alignaddr(sp2, i);

            dend = dp + amount - 1;
            /* generate edge mask for the start point */
            emask = vis_edge8(dp, dend);

            sd10 = vis_ld_d64_nf(spp1);
            spp1++;
            sd20 = vis_ld_d64_nf(spp2);
            spp2++;
            MLIB_V_ADDIMAGE_U8(sd10, sd20, dd0);

            if (emask != 0xff) {
                sd10 = vis_ld_d64_nf(spp1);
                spp1++;
                sd20 = vis_ld_d64_nf(spp2);
                spp2++;
                MLIB_V_ADDIMAGE_U8(sd10, sd20, dd1);
                dd = vis_faligndata(dd0, dd1);
                vis_pst_8(dd, dpp++, emask);
                dd0 = dd1;
                i += 8;
            }
#pragma pipeloop(0)
            for (; i <= amount - 8; i += 8) {
                sd10 = vis_ld_d64_nf(spp1);
                spp1++;
                sd20 = vis_ld_d64_nf(spp2);
                spp2++;
                MLIB_V_ADDIMAGE_U8(sd10, sd20, dd1);
                (*dpp++) = vis_faligndata(dd0, dd1);
                dd0 = dd1;
            }

            if (i < amount) {
                emask = vis_edge8(dpp, dend);
                sd10 = vis_ld_d64_nf(spp1);
                spp1++;
                sd20 = vis_ld_d64_nf(spp2);
                spp2++;
                MLIB_V_ADDIMAGE_U8(sd10, sd20, dd1);
                dd = vis_faligndata(dd0, dd1);
                vis_pst_8(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    } else {
        /*
         * common case: no helpful alignment relation.  First copy a
         * re-aligned src1 row into dst, then add the re-aligned src2
         * row to dst in place.
         */
        for (j = 0; j < height; j++) {
            /* prepare the destination addresses */
            dpp = (mlib_d64 *)vis_alignaddr(dp, 0);
            i = (mlib_u8 *)dpp - dp;

            dend = dp + amount - 1;
            /* generate edge mask for the start point */
            emask = vis_edge8(dp, dend);

            if (emask != 0xff) {
                spp1 = (mlib_d64 *)vis_alignaddr(sp1, i);
                sd10 = vis_faligndata(vis_ld_d64_nf(spp1),
                    vis_ld_d64_nf(spp1 + 1));
                spp2 = (mlib_d64 *)vis_alignaddr(sp2, i);
                sd20 = vis_faligndata(vis_ld_d64_nf(spp2),
                    vis_ld_d64_nf(spp2 + 1));
                MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
                vis_pst_8(dd, dpp++, emask);
                i += 8;
            }

            /* copy src1 to dst */
            spp1 = (mlib_d64 *)vis_alignaddr(sp1, i);
            sd11 = vis_ld_d64_nf(spp1);
            tmp_ptr = dpp;
#pragma pipeloop(0)
            for (k = i; k <= (amount - 8); k += 8) {
                sd10 = sd11;
                sd11 = vis_ld_d64_nf(spp1 + 1);
                (*tmp_ptr++) = vis_faligndata(sd10, sd11);
                spp1++;
            }

            /* keep the last src1 word for the edge-masked tail add */
            sd11 = vis_faligndata(sd11, vis_ld_d64_nf(spp1 + 1));

            spp2 = (mlib_d64 *)vis_alignaddr(sp2, i);
            sd20 = vis_ld_d64_nf(spp2);
            tmp_ptr = dpp;
#pragma pipeloop(0)
            for (; i <= amount - 8; i += 8) {
                sd10 = (*tmp_ptr++);
                sd21 = vis_ld_d64_nf(spp2 + 1);
                sd20 = vis_faligndata(sd20, sd21);
                MLIB_V_ADDIMAGE_U8(sd10, sd20, dd);
                (*dpp++) = dd;
                sd20 = sd21;
                spp2++;
            }

            if (i < amount) {
                emask = vis_edge8(dpp, dend);
                sd20 = vis_faligndata(sd20, vis_ld_d64_nf(spp2 + 1));
                MLIB_V_ADDIMAGE_U8(sd11, sd20, dd);
                vis_pst_8(dd, dpp, emask);
            }

            sp1 = sl1 += stride1;
            sp2 = sl2 += stride2;
            dp = dl += strided;
        }
    }

    return (MLIB_SUCCESS);
}
mlib_status mlib_v_ImageAdd_U16( mlib_image *dst, const mlib_image *src1, const mlib_image *src2) { mlib_s32 i, j, k; mlib_s32 offdst, offsrc1, offsrc2, emask, mask; mlib_s32 amount; mlib_d64 *dpp, *spp2, *spp1, *tmp_ptr, tmp; mlib_d64 sd10, sd11, sd20, sd21; mlib_d64 ones = vis_to_double_dup(0x7fff7fff); mlib_d64 max_u16 = vis_to_double_dup(0xffffffff); mlib_u16 *dend; VALIDATE(mlib_u16); /* initialize GSR scale factor */ vis_write_gsr(15 << 3); sl1 = sp1; sl2 = sp2; dl = dp; amount = width * channels; offdst = ((mlib_addr)dp) & 7; offsrc1 = ((mlib_addr)sp1) & 7; offsrc2 = ((mlib_addr)sp2) & 7; if ((offdst == offsrc1) && (offdst == offsrc2) && (((strided ^ stride1) & 3) == 0) && (((strided ^ stride2) & 3) == 0)) { for (j = 0; j < height; j++) { /* prepare the destination addresses */ dpp = (mlib_d64 *)vis_alignaddr(dp, 0); i = (mlib_u16 *)dpp - dp; /* prepare the source addresses */ spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0); spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0); dend = dp + amount - 1; /* generate edge mask for the start point */ emask = vis_edge16(dp, dend); if (emask != 0xf) { sd10 = (*spp1++); sd20 = (*spp2++); MLIB_V_ADDIMAGE_U16_emask(sd10, sd20, dpp, emask); i += 4; } #pragma pipeloop(0) for (; i <= amount - 4; i += 4) { sd10 = (*spp1++); sd20 = (*spp2++); MLIB_V_ADDIMAGE_U16(sd10, sd20, dpp) } if (i < amount) { emask = vis_edge16(dpp, dend); sd10 = (*spp1++); sd20 = (*spp2++); MLIB_V_ADDIMAGE_U16_emask(sd10, sd20, dpp, emask); } sp1 = sl1 += stride1; sp2 = sl2 += stride2; dp = dl += strided; } } else if ((offdst == offsrc1) && (((strided ^ stride1) & 3) == 0)) {
/*
 * Byte-wise modular subtraction of two U8 vectors:
 * z[i] = (x[i] - y[i]) mod 256 for i in [0, n).
 *
 * Returns MLIB_FAILURE when n <= 0.  When x, y and z are all 8-byte
 * aligned, 16 bytes are processed per iteration: the low 8 bytes with
 * the SUB_S8_MOD VIS macro (consumes dx/dy, produces dz) and the high
 * 8 bytes with a SWAR trick on 32-bit integers — sr2 subtracts the
 * low 7 bits of each byte with borrow suppressed by mask 0x7f7f7f7f,
 * and sr1/sr3 patch the top bit of each byte back in via XOR.
 * Otherwise the destination is 8-byte aligned with one edge-masked
 * head store and the loop is specialized on the alignment of x and y.
 */
mlib_status
__mlib_VectorSub_U8_U8_Mod(
    mlib_u8 *z,
    const mlib_u8 *x,
    const mlib_u8 *y,
    mlib_s32 n)
{
    /* 8-byte aligned start point in destination */
    mlib_d64 *dpz;

    /* 8-byte aligned start point in source */
    mlib_d64 *dpx, *dpy;

    /* source data */
    mlib_d64 dx, dy, dx0;
    mlib_d64 dx1, dy0, dy1;

    /* destination data */
    mlib_d64 dz;

    /* intermediate result */
    mlib_d64 dh, dl;
    mlib_d64 dxl, dyl;

    /* end point of a line in destination */
    mlib_u8 *pzend;

    /* start point of a line in source */
    mlib_u8 *px, *py;

    /* offset of address alignment in destination */
    mlib_s32 off;

    /* edge mask */
    mlib_s32 emask;
    mlib_u8 *pzend16;
    mlib_s32 sr1, sr2, sr3;
    mlib_s32 x8, x12, y8, y12;

    /* per-byte borrow-suppression mask for the SWAR subtraction */
    mlib_s32 mask = 0x7f7f7f7f;
    mlib_u8 *pz;
    mlib_s32 n16;
    mlib_s32 nrest;
    mlib_s32 len = n, i;

    /* rest and leng in terms of 8 bytes. */
    mlib_s32 rest_8, even_8;
    mlib_d64 mask_control = vis_to_double_dup(0xff00ff00);

    if (n <= 0)
        return (MLIB_FAILURE);

    px = (mlib_u8 *)x;
    py = (mlib_u8 *)y;
    pz = (mlib_u8 *)z;

    /*
     * prepare the destination address
     */
    pzend = pz + n - 1;

    /*
     * check for 64-bit aligned special case
     */
    if ((((mlib_addr)x | (mlib_addr)y | (mlib_addr)z) & 7) == 0) {
        /*
         * We can process source and destination vectors by 16 bytes.
         */
        dpx = (mlib_d64 *)x;
        dx = vis_ld_d64_nf(dpx);
        dpy = (mlib_d64 *)y;
        dy = vis_ld_d64_nf(dpy);
        dpz = (mlib_d64 *)z;
        n16 = n & (~0xf);
        pzend16 = pz + n16;

#pragma pipeloop(0)
        while ((mlib_addr)pz < (mlib_addr)pzend16) {
            /* bytes 8..11 via integer SWAR */
            x8 = *((mlib_s32 *)(px + 8));
            y8 = *((mlib_s32 *)(py + 8));
            sr1 = x8 ^ ~y8;
            sr2 = (x8 | ~mask) - (y8 & mask);
            sr3 = (sr1 & ~mask) ^ sr2;
            *((mlib_s32 *)(pz + 8)) = sr3;
            /* bytes 12..15 via integer SWAR */
            x12 = *((mlib_s32 *)(px + 12));
            y12 = *((mlib_s32 *)(py + 12));
            sr1 = x12 ^ ~y12;
            sr2 = (x12 | ~mask) - (y12 & mask);
            sr3 = (sr1 & ~mask) ^ sr2;
            *((mlib_s32 *)(pz + 12)) = sr3;
            SUB_S8_MOD;
            /* store 8 bytes of result */
            *((mlib_d64 *)pz) = dz;
            dx = vis_ld_d64_nf(px + 16);
            dy = vis_ld_d64_nf(py + 16);
            px += 16;
            py += 16;
            pz += 16;
        }

        dpz = (mlib_d64 *)pzend16;
        nrest = n - n16;

        if (nrest >= 8) {
            SUB_S8_MOD;
            dpz[0] = dz;
            px += 8;
            py += 8;
            dpz++;
            nrest -= 8;
        }

        if (nrest > 0) {
            dx = *((mlib_d64 *)px);
            dy = *((mlib_d64 *)py);
            SUB_S8_MOD;
            emask = vis_edge8(dpz, pzend);
            vis_pst_8(dz, dpz, emask);
        }
    } else {
        /*
         * General case.
         */
        dpz = (mlib_d64 *)((mlib_addr)z & (~7));
        off = (mlib_addr)dpz - (mlib_addr)z;

        /*
         * generate edge mask for the start point
         */
        emask = vis_edge8(pz, pzend);

        /*
         * prepare the source address
         */
        if (off) {
            dpy = (mlib_d64 *)vis_alignaddr(py, off);
            dy0 = vis_ld_d64_nf(dpy);
            dy1 = vis_ld_d64_nf(dpy + 1);
            dy = vis_faligndata(dy0, dy1);
            dpx = (mlib_d64 *)vis_alignaddr(px, off);
            dx0 = vis_ld_d64_nf(dpx);
            dx1 = vis_ld_d64_nf(dpx + 1);
            dx = vis_faligndata(dx0, dx1);
            SUB_S8_MOD;
            /*
             * store first bytes of result
             */
            vis_pst_8(dz, dpz, emask);
            px += (8 + off);
            py += (8 + off);
            len -= (8 + off);
            dpz++;

            if (len <= 0)
                return (MLIB_SUCCESS);
        }

        even_8 = len >> 3;
        rest_8 = len & 0x7;

        /*
         * Now try to analyze source "x" and "y" addresses.
         */
        if ((!((mlib_addr)px & 7)) && (!((mlib_addr)py & 7))) {
            /*
             * Both addresses are 8-byte aligned. No vis_alignaddr
             * and vis_faligndata at all.
             */
            dpx = (mlib_d64 *)px;
            dpy = (mlib_d64 *)py;
            dx = vis_ld_d64_nf(dpx);
            dpx++;
            dy = vis_ld_d64_nf(dpy);
            dpy++;

#pragma pipeloop(0)
            for (i = 0; i < even_8; i++) {
                dx1 = vis_ld_d64_nf(dpx);
                dy1 = vis_ld_d64_nf(dpy);
                SUB_S8_MOD;
                dx = dx1;
                dy = dy1;
                /*
                 * store 8 bytes of result
                 */
                dpz[0] = dz;
                dpx++;
                dpy++;
                dpz++;
            }

            /* leave dx1/dy1 primed for the rest_8 epilogue below */
            dx1 = dx;
            dy1 = dy;
        } else if ((!((mlib_addr)px & 7))) {
            /*
             * First ("x") address is 8-byte aligned. vis_alignaddr
             * and vis_faligndata only for "y".
             */
            dpx = (mlib_d64 *)px;
            dpy = vis_alignaddr(py, 0);
            dy0 = vis_ld_d64_nf(dpy);
            dpy++;
            dy1 = vis_ld_d64_nf(dpy);
            dy = vis_faligndata(dy0, dy1);
            dx = vis_ld_d64_nf(dpx);
            dpx++;

#pragma pipeloop(0)
            for (i = 0; i < even_8; i++) {
                SUB_S8_MOD;
                dx = vis_ld_d64_nf(dpx);
                dy0 = dy1;
                dy1 = vis_ld_d64_nf(dpy + 1);
                dy = vis_faligndata(dy0, dy1);
                /*
                 * store 8 bytes of result
                 */
                (*dpz++) = dz;
                dpx++;
                dpy++;
            }

            dx1 = dx;
            dy1 = dy0;
        } else if ((!((mlib_addr)py & 7))) {
            /*
             * Second ("y") address is 8-byte aligned. vis_alignaddr
             * and vis_faligndata only for "x".
             */
            dpy = (mlib_d64 *)py;
            dpx = vis_alignaddr(px, 0);
            dx1 = vis_ld_d64_nf(dpx);
            dpx++;

#pragma pipeloop(0)
            for (i = 0; i < even_8; i++) {
                dy = (*dpy++);
                dx0 = dx1;
                dx1 = vis_ld_d64_nf(dpx);
                dpx++;
                dx = vis_faligndata(dx0, dx1);
                SUB_S8_MOD;
                /*
                 * store 8 bytes of result
                 */
                (*dpz++) = dz;
            }

            dy1 = vis_ld_d64_nf(dpy);
            dpy++;
        } else if (((mlib_addr)px & 7) == ((mlib_addr)py & 7)) {
            /*
             * Both ("x" and "y") address are identically aligned.
             * There are 1 vis_alignaddr and 2 vis_faligndata(s) in
             * the loop.
             */
            dpx = vis_alignaddr(px, 0);
            dx1 = vis_ld_d64_nf(dpx);
            dpx++;
            dpy = vis_alignaddr(py, 0);
            dy1 = vis_ld_d64_nf(dpy);
            dpy++;

#pragma pipeloop(0)
            for (i = 0; i < even_8; i++) {
                dy0 = dy1;
                dy1 = vis_ld_d64_nf(dpy);
                dpy++;
                dy = vis_faligndata(dy0, dy1);
                dx0 = dx1;
                dx1 = vis_ld_d64_nf(dpx);
                dpx++;
                dx = vis_faligndata(dx0, dx1);
                SUB_S8_MOD;
                /*
                 * store 8 bytes of result
                 */
                (*dpz++) = dz;
            }
        } else {
            /*
             * Both ("x" and "y") address are arbitrary aligned.
             * 2 vis_alignaddr(s) and 2 vis_faligndata(s) in the loop.
             */
            dpx = vis_alignaddr(px, 0);
            dx0 = vis_ld_d64_nf(dpx);
            dpx++;
            dx1 = vis_ld_d64_nf(dpx);
            dx = vis_faligndata(dx0, dx1);
            dpy = vis_alignaddr(py, 0);
            dy0 = vis_ld_d64_nf(dpy);
            dpy++;
            dy1 = vis_ld_d64_nf(dpy);
            dy = vis_faligndata(dy0, dy1);

#pragma pipeloop(0)
            for (i = 0; i < even_8; i++) {
                SUB_S8_MOD;
                /* GSR align offset must be re-set per source */
                vis_alignaddr(py, 0);
                dy0 = dy1;
                dy1 = vis_ld_d64_nf(dpy + 1);
                dy = vis_faligndata(dy0, dy1);
                vis_alignaddr(px, 0);
                dx0 = dx1;
                dx1 = vis_ld_d64_nf(dpx + 1);
                dx = vis_faligndata(dx0, dx1);
                /*
                 * store 8 bytes of result
                 */
                (*dpz++) = dz;
                dpy++;
                dpx++;
            }

            dx1 = dx0;
            dy1 = dy0;
        }

        if (!rest_8)
            return (MLIB_SUCCESS);

        /* last partial word: rebuild dx/dy from the carried words */
        vis_alignaddr(px, 0);
        dx0 = dx1;
        dx1 = vis_ld_d64_nf(dpx);
        dx = vis_faligndata(dx0, dx1);
        vis_alignaddr(py, 0);
        dy0 = dy1;
        dy1 = vis_ld_d64_nf(dpy);
        dy = vis_faligndata(dy0, dy1);
        SUB_S8_MOD;

        /*
         * prepare edge mask for the last bytes
         */
        emask = vis_edge8((void *)(rest_8), pzend);

        /* store last bytes of result */
        vis_pst_8(dz, dpz, ~emask);
    }

    return (MLIB_SUCCESS);
}
/*
 * Apply a pre-emphasis filter to a stereo S16 signal with saturation.
 * The filter state (alpha coefficient and the previous sample of each
 * channel, v16_last0/v16_last1) lives in the mlib_emphasize_struct
 * passed as 'filter'; the last input samples are written back to the
 * state at the end so consecutive calls process a continuous stream.
 *
 *   dst/src - interleaved stereo buffers, n stereo sample pairs
 *
 * Returns MLIB_FAILURE on NULL pointers, n <= 0, or a filter struct
 * whose type is not MLIB_EMPH.
 *
 * The arithmetic itself is in the MLIB_MUL8 / MLIB_MIX macros (they
 * consume w_src/w_lsrc/v_alpha/v_mask and produce dr0..dr7); results
 * are packed with vis_fpackfix_pair under GSR scale 1.  The bulk of
 * this function is alignment plumbing: w_lsrc is the source stream
 * delayed by one stereo pair (offset t2 = t1 - 4 bytes), w_src is the
 * current pair (offset t1), and the head of the delayed stream is
 * patched with w_maskand*/w_maskor* so that the first "previous
 * sample" comes from the saved filter state instead of memory.
 * Three cases: dst/src differently aligned mod 8 with two sub-cases
 * on the prologue load distance, and the same-alignment fast path.
 */
mlib_status
__mlib_SignalEmphasize_S16S_S16S_Sat(
    mlib_s16 *dst,
    const mlib_s16 *src,
    void *filter,
    mlib_s32 n)
{
    mlib_emphasize_struct *fist = filter;

    /* masks/values that splice the saved last samples into the stream */
    mlib_d64 w_maskand0 = vis_to_double(0xFFFFFFFF, 0xFFFF);
    mlib_d64 w_maskor0 = vis_freg_pair(0.f, fist->v16_last0);
    mlib_d64 w_maskand1 = vis_to_double(0xFFFFFFFF, 0xFFFF0000);
    mlib_d64 w_maskor1 = vis_freg_pair(0.f, fist->v16_last1);
    mlib_f32 v_mask = vis_to_float(0x80008000);
    mlib_f32 v_alpha = fist->v_alpha;

    /* last S16 slot of the destination (n stereo pairs = 2n samples) */
    mlib_s16 *fdst = dst + n + n - 1;
    mlib_d64 *dpd, *dps, *dsrct1;
    mlib_d64 w_dst, w_src, w_src0, w_src1, w_src2, w_lsrc;
    mlib_d64 dr0, dr1, dr2, dr3, dr4, dr5, dr6, dr7;
    mlib_s32 i, times, t1, t2;

    /* check for obvious errors */
    if ((fist == NULL) || (n <= 0) || (src == 0) || (dst == 0) ||
        (fist->type != MLIB_EMPH)) {
        return (MLIB_FAILURE);
    }

    vis_write_gsr(1 << 3);
    w_maskor0 = vis_fand(w_maskor0, w_maskand1);
    w_maskor1 = vis_fand(w_maskor1, w_maskand0);

    /* rotate the splice masks to the source's byte alignment */
    vis_alignaddr((void *)(-(mlib_addr)src), 0);
    w_maskand0 = vis_faligndata(w_maskand0, w_maskand0);
    w_maskor0 = vis_faligndata(w_maskor0, w_maskor0);
    w_maskand1 = vis_faligndata(w_maskand1, w_maskand1);
    w_maskor1 = vis_faligndata(w_maskor1, w_maskor1);

    dpd = vis_alignaddr(dst, 0);
    times = (mlib_d64 *)vis_alignaddr(fdst, 0) - dpd;

    /* t1: source offset matching dst; t2: same delayed by one pair */
    t1 = -((mlib_addr)(dst) & 7);
    t2 = t1 - 4;
    dps = vis_alignaddr((void *)src, t2);
    w_src0 = vis_ld_d64_nf(dps);
    dps++;
    w_src1 = vis_ld_d64_nf(dps);
    dps++;

    if ((((mlib_addr)dst ^ (mlib_addr)src) & 7)) {
        /* dst and src have different alignment mod 8 */

        /* splice the saved last0 sample into whichever word holds it */
        if (((mlib_addr)dps - (mlib_addr)src) >= 6) {
            w_src0 = vis_fand(w_maskand0, w_src0);
            w_src0 = vis_for(w_maskor0, w_src0);
        } else {
            w_src1 = vis_fand(w_maskand0, w_src1);
            w_src1 = vis_for(w_maskor0, w_src1);
        }

        /* likewise for the saved last1 sample */
        if (((mlib_addr)dps - (mlib_addr)src) >= 8) {
            w_src0 = vis_fand(w_maskand1, w_src0);
            w_src0 = vis_for(w_maskor1, w_src0);
        } else {
            w_src1 = vis_fand(w_maskand1, w_src1);
            w_src1 = vis_for(w_maskor1, w_src1);
        }

        w_lsrc = vis_faligndata(w_src0, w_src1);
        dsrct1 = vis_alignaddr((void *)src, t1);

        if (dps - 2 != dsrct1) {
            /* current-pair stream needs one extra word of lead-in */
            w_src2 = *dps;
            dps++;
            w_src = vis_faligndata(w_src1, w_src2);
            MLIB_MUL8;

            if ((mlib_addr)dst & 7) {
                /* edge-masked first partial destination word */
                times--;
                w_src0 = w_src1;
                w_src1 = w_src2;
                w_src2 = *dps;
                vis_alignaddr((void *)src, t2);
                w_lsrc = vis_faligndata(w_src0, w_src1);
                vis_alignaddr((void *)src, t1);
                w_src = vis_faligndata(w_src1, w_src2);
                dps++;
                MLIB_MIX;
                w_dst = vis_fpackfix_pair(dr2, dr3);
                vis_pst_16(w_dst, dpd, vis_edge16(dst, fdst));
                dpd++;
            }

            /* software pipeline: compute one result ahead of the store */
            w_src0 = w_src1;
            w_src1 = w_src2;
            w_src2 = vis_ld_d64_nf(dps);
            vis_alignaddr((void *)src, t2);
            w_lsrc = vis_faligndata(w_src0, w_src1);
            vis_alignaddr((void *)src, t1);
            w_src = vis_faligndata(w_src1, w_src2);
            MLIB_MIX;
            w_dst = vis_fpackfix_pair(dr2, dr3);
            dps++;
            w_src0 = w_src1;
            w_src1 = w_src2;
            w_src2 = vis_ld_d64_nf(dps);
            vis_alignaddr((void *)src, t2);
            w_lsrc = vis_faligndata(w_src0, w_src1);
            vis_alignaddr((void *)src, t1);
            w_src = vis_faligndata(w_src1, w_src2);
            dps++;

            for (i = 0; i < times; i++) {
                *dpd = w_dst;
                MLIB_MIX;
                w_dst = vis_fpackfix_pair(dr2, dr3);
                w_src0 = w_src1;
                w_src1 = w_src2;
                w_src2 = vis_ld_d64_nf(dps);
                vis_alignaddr((void *)src, t2);
                w_lsrc = vis_faligndata(w_src0, w_src1);
                vis_alignaddr((void *)src, t1);
                w_src = vis_faligndata(w_src1, w_src2);
                dpd++;
                dps++;
            }
        } else {
            /* current-pair stream starts inside the two prologue words */
            w_src = vis_faligndata(w_src0, w_src1);
            MLIB_MUL8;

            if ((mlib_addr)dst & 7) {
                /* edge-masked first partial destination word */
                times--;
                w_src0 = w_src1;
                w_src1 = vis_ld_d64_nf(dps);
                vis_alignaddr((void *)src, t2);
                w_lsrc = vis_faligndata(w_src0, w_src1);
                vis_alignaddr((void *)src, t1);
                w_src = vis_faligndata(w_src0, w_src1);
                dps++;
                MLIB_MIX;
                w_dst = vis_fpackfix_pair(dr2, dr3);
                vis_pst_16(w_dst, dpd, vis_edge16(dst, fdst));
                dpd++;
            }

            /* software pipeline: compute one result ahead of the store */
            w_src0 = w_src1;
            w_src1 = vis_ld_d64_nf(dps);
            vis_alignaddr((void *)src, t2);
            w_lsrc = vis_faligndata(w_src0, w_src1);
            vis_alignaddr((void *)src, t1);
            w_src = vis_faligndata(w_src0, w_src1);
            MLIB_MIX;
            w_dst = vis_fpackfix_pair(dr2, dr3);
            dps++;
            w_src0 = w_src1;
            w_src1 = vis_ld_d64_nf(dps);
            vis_alignaddr((void *)src, t2);
            w_lsrc = vis_faligndata(w_src0, w_src1);
            vis_alignaddr((void *)src, t1);
            w_src = vis_faligndata(w_src0, w_src1);
            dps++;

            for (i = 0; i < times; i++) {
                *dpd = w_dst;
                MLIB_MIX;
                w_dst = vis_fpackfix_pair(dr2, dr3);
                w_src0 = w_src1;
                w_src1 = vis_ld_d64_nf(dps);
                vis_alignaddr((void *)src, t2);
                w_lsrc = vis_faligndata(w_src0, w_src1);
                vis_alignaddr((void *)src, t1);
                w_src = vis_faligndata(w_src0, w_src1);
                dps++;
                dpd++;
            }
        }
    } else {
        /* dst and src identically aligned: w_src needs no faligndata */
        w_src = w_src1;

        if ((mlib_addr)src & 7) {
            /* unaligned start: splice state, emit one partial word */
            times--;

            if (((mlib_addr)src & 7) == 2) {
                w_src0 = vis_fand(w_maskand0, w_src0);
                w_src0 = vis_for(w_maskor0, w_src0);
            } else {
                w_src1 = vis_fand(w_maskand0, w_src1);
                w_src1 = vis_for(w_maskor0, w_src1);
            }

            w_src1 = vis_fand(w_maskand1, w_src1);
            w_src1 = vis_for(w_maskor1, w_src1);
            w_lsrc = vis_faligndata(w_src0, w_src1);
            MLIB_MUL8;
            w_src0 = w_src1;
            w_src1 = *dps;
            w_src = w_src1;
            w_lsrc = vis_faligndata(w_src0, w_src1);
            dps++;
            MLIB_MIX;
            w_dst = vis_fpackfix_pair(dr2, dr3);
            vis_pst_16(w_dst, dpd, vis_edge16(dst, fdst));
            dpd++;
        } else {
            /* fully aligned start: splice both saved samples into w_src0 */
            w_src0 = vis_fand(w_maskand0, w_src0);
            w_src0 = vis_for(w_maskor0, w_src0);
            w_src0 = vis_fand(w_maskand1, w_src0);
            w_src0 = vis_for(w_maskor1, w_src0);
            w_lsrc = vis_faligndata(w_src0, w_src1);
            MLIB_MUL8;
        }

        /* software pipeline: compute one result ahead of the store */
        w_src = vis_ld_d64_nf(dps);
        w_lsrc = vis_faligndata(w_src1, w_src);
        MLIB_MIX;
        w_src1 = w_src;
        w_dst = vis_fpackfix_pair(dr2, dr3);
        dps++;
        w_src = vis_ld_d64_nf(dps);
        w_lsrc = vis_faligndata(w_src1, w_src);
        dps++;

        for (i = 0; i < times; i++) {
            *dpd = w_dst;
            MLIB_MIX;
            w_src1 = w_src;
            w_src = vis_ld_d64_nf(dps);
            w_lsrc = vis_faligndata(w_src1, w_src);
            w_dst = vis_fpackfix_pair(dr2, dr3);
            dps++;
            dpd++;
        }
    }

    /* edge-masked store of the final (possibly partial) word */
    if (times >= 0) {
        vis_pst_16(w_dst, dpd, vis_edge16(dpd, fdst));
    }

    /* save the last input pair as filter state for the next call */
    ((mlib_s16 *)&fist->v16_last0)[0] = src[2 * n - 2];
    ((mlib_s16 *)&fist->v16_last1)[1] = src[2 * n - 1];
    return (MLIB_SUCCESS);
}
/*
 * Subtract two S8 vectors producing S16 results (no overflow
 * possible, hence "Mod" is exact): z[i] = x[i] - y[i] for i in [0, n).
 *
 * Returns MLIB_FAILURE when n <= 0.  Short vectors (n <= 8) use the
 * scalar SUB_S16_S8_IN_C macro (which returns).  The destination is
 * 8-byte aligned with a scalar prologue; the bulk consumes 8 S8 per
 * source per iteration and produces 16 result bytes via the
 * SUB_S8_S16 macro (consumes dx/dy and the fone/restore constants,
 * produces dzh/dzl).  The loop is specialized on the alignment of
 * "x" and "y", mirroring the other VectorSub kernels in this file.
 */
mlib_status
__mlib_VectorSub_S16_S8_Mod(
    mlib_s16 *z,
    const mlib_s8 *x,
    const mlib_s8 *y,
    mlib_s32 n)
{
    mlib_d64 *dpz, *dpx, *dpy;
    mlib_d64 dx, dy, dx0, dx1, dy0, dy1;
    mlib_d64 dxh, dxl, dyh, dyl, dzh, dzl;
    mlib_f32 fone = vis_to_float(0x100);
    mlib_s8 *px, *py;
    mlib_s16 *pz;
    mlib_s32 len = n, i;

    /* rest and leng in terms of 8 bytes. */
    mlib_s32 rest_8, even_8;
    mlib_d64 restore = vis_to_double_dup(0x80808080);

    if (n <= 0)
        return (MLIB_FAILURE);

    px = (mlib_s8 *)x;
    py = (mlib_s8 *)y;
    pz = (mlib_s16 *)z;

    if (n <= 8) {
        SUB_S16_S8_IN_C;
    }

    /*
     * prepare the destination address
     */
    while ((mlib_addr)pz & 7) {
        (*pz++) = ((mlib_s16)(*px)) - (*py);
        px++;
        py++;
        len--;
    }

    dpz = (mlib_d64 *)pz;
    even_8 = len >> 3;
    rest_8 = len & 0x7;

    if ((!((mlib_addr)px & 7)) && (!((mlib_addr)py & 7))) {
        /*
         * Both addresses are 8-byte aligned.
         * No vis_alignaddr and vis_faligndata at all.
         */
        dpx = (mlib_d64 *)px;
        dpy = (mlib_d64 *)py;
        dx = vis_ld_d64_nf(dpx);
        dy = vis_ld_d64_nf(dpy);
        dpx++;
        dpy++;

#pragma pipeloop(0)
        for (i = 0; i < even_8; i++) {
            dx1 = vis_ld_d64_nf(dpx);
            dy1 = vis_ld_d64_nf(dpy);
            SUB_S8_S16;
            dx = dx1;
            dy = dy1;
            /*
             * store 16 bytes of result
             */
            (*dpz++) = dzh;
            (*dpz++) = dzl;
            dpx++;
            dpy++;
        }
    } else if ((!((mlib_addr)px & 7))) {
        /*
         * First ("x") address is 8-byte aligned.
         * vis_alignaddr and vis_faligndata only for "y".
         */
        dpx = (mlib_d64 *)px;
        dpy = vis_alignaddr(py, 0);
        dy1 = vis_ld_d64_nf(dpy);
        dpy++;

#pragma pipeloop(0)
        for (i = 0; i < even_8; i++) {
            dx = (*dpx++);
            dy0 = dy1;
            dy1 = vis_ld_d64_nf(dpy);
            dpy++;
            dy = vis_faligndata(dy0, dy1);
            SUB_S8_S16;
            /*
             * store 16 bytes of result
             */
            dpz[0] = dzh;
            dpz[1] = dzl;
            dpz += 2;
        }
    } else if ((!((mlib_addr)py & 7))) {
        /*
         * Second ("y") address is 8-byte aligned.
         * vis_alignaddr and vis_faligndata only for "x".
         */
        dpy = (mlib_d64 *)py;
        dpx = vis_alignaddr(px, 0);
        dx1 = vis_ld_d64_nf(dpx);
        dpx++;

#pragma pipeloop(0)
        for (i = 0; i < even_8; i++) {
            dy = (*dpy++);
            dx0 = dx1;
            dx1 = vis_ld_d64_nf(dpx);
            dpx++;
            dx = vis_faligndata(dx0, dx1);
            SUB_S8_S16;
            /*
             * store 16 bytes of result
             */
            dpz[0] = dzh;
            dpz[1] = dzl;
            dpz += 2;
        }
    } else if (((mlib_addr)px & 7) == ((mlib_addr)py & 7)) {
        /*
         * Both ("x" and "y") address are identically aligned.
         * There are 1 vis_alignaddr and 2 vis_faligndata(s) in the
         * loop.
         */
        dpx = vis_alignaddr(px, 0);
        dx1 = vis_ld_d64_nf(dpx);
        dpx++;
        dpy = vis_alignaddr(py, 0);
        dy1 = vis_ld_d64_nf(dpy);
        dpy++;

#pragma pipeloop(0)
        for (i = 0; i < even_8; i++) {
            dy0 = dy1;
            dy1 = vis_ld_d64_nf(dpy);
            dpy++;
            dy = vis_faligndata(dy0, dy1);
            dx0 = dx1;
            dx1 = vis_ld_d64_nf(dpx);
            dpx++;
            dx = vis_faligndata(dx0, dx1);
            SUB_S8_S16;
            /*
             * store 16 bytes of result
             */
            dpz[0] = dzh;
            dpz[1] = dzl;
            dpz += 2;
        }
    } else {
        /*
         * Both ("x" and "y") address are arbitrary aligned.
         * 2 vis_alignaddr(s) and 2 vis_faligndata(s) in the loop.
         */
        dpx = vis_alignaddr(px, 0);
        dx0 = vis_ld_d64_nf(dpx);
        dpx++;
        dx1 = vis_ld_d64_nf(dpx);
        dx = vis_faligndata(dx0, dx1);
        dpy = vis_alignaddr(py, 0);
        dy0 = vis_ld_d64_nf(dpy);
        dpy++;
        dy1 = vis_ld_d64_nf(dpy);
        dy = vis_faligndata(dy0, dy1);

#pragma pipeloop(0)
        for (i = 0; i < even_8; i++) {
            SUB_S8_S16;
            /* GSR align offset must be re-set per source */
            vis_alignaddr(py, 0);
            dy0 = dy1;
            dy1 = vis_ld_d64_nf(dpy + 1);
            dy = vis_faligndata(dy0, dy1);
            vis_alignaddr(px, 0);
            dx0 = dx1;
            dx1 = vis_ld_d64_nf(dpx + 1);
            dx = vis_faligndata(dx0, dx1);
            /*
             * store 16 bytes of result
             */
            dpz[0] = dzh;
            dpz[1] = dzl;
            dpz += 2;
            dpy++;
            dpx++;
        }
    }

    if (!rest_8)
        return (MLIB_SUCCESS);

    /* scalar tail: remaining 0..7 elements */
    px += (even_8 << 3);
    py += (even_8 << 3);
    pz += (even_8 << 3);

    while (rest_8--) {
        (*pz++) = ((mlib_s16)(*px)) - (*py);
        px++;
        py++;
    }

    return (MLIB_SUCCESS);
}
/*
 * __mlib_VectorSub_S8_S8_Sat - element-wise z[i] = x[i] - y[i] on mlib_s8
 * with saturation to [-128, 127] (performed by the SUB_S8_SAT macro, which
 * produces one 8-byte result dz from dx and dy).
 *
 * The destination is processed through an aligned mlib_d64 pointer: a
 * partially-masked first store (vis_pst_8 + edge mask) handles the leading
 * misalignment, then one of five loops specialized on the alignment of x
 * and y runs over the 8-byte-aligned middle, and a final masked store
 * handles the last rest_8 bytes.  Each branch leaves dx1/dy1 primed for
 * that tail.
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_VectorSub_S8_S8_Sat(
    mlib_s8 *z,
    const mlib_s8 *x,
    const mlib_s8 *y,
    mlib_s32 n)
{
	mlib_d64 *dpz, *dpx, *dpy;
	mlib_d64 dx, dy, dz, dx0, dx1, dy0, dy1;
	mlib_d64 dxh, dxl, dyh, dyl, dzh, dzl;
	mlib_d64 dh, dl;
	mlib_s8 *pz = z, *px, *py, *pzend;

	/* offset of address alignment in destination */
	mlib_s32 off;
	mlib_s32 len = n, i;

	/* rest and length in terms of 8 bytes. */
	mlib_s32 rest_8, even_8;

	/* edge masks */
	mlib_s32 emask;
	mlib_d64 displacement = vis_to_double_dup(0x8000800);
	mlib_d64 restore = vis_to_double_dup(0x80808080);
	mlib_f32 fmul = vis_to_float(0x1000);

	if (n <= 0)
		return (MLIB_FAILURE);

	px = (mlib_s8 *)x;
	py = (mlib_s8 *)y;

/* initialize GSR scale factor */
	vis_write_gsr(3 << 3);

	dpz = (mlib_d64 *)((mlib_addr)z & (~7));
	off = (mlib_addr)dpz - (mlib_addr)z;
	pzend = pz + n - 1;

/*
 * generate edge mask for the start point
 */
	emask = vis_edge8(pz, pzend);

/*
 * prepare the source address: if dst was misaligned, compute and
 * store the first (partial) 8-byte group under the edge mask
 */
	if (off) {
		dpy = (mlib_d64 *)vis_alignaddr(py, off);
		dy0 = vis_ld_d64_nf(dpy);
		dy1 = vis_ld_d64_nf(dpy + 1);
		dy = vis_faligndata(dy0, dy1);
		dpx = (mlib_d64 *)vis_alignaddr(px, off);
		dx0 = vis_ld_d64_nf(dpx);
		dx1 = vis_ld_d64_nf(dpx + 1);
		dx = vis_faligndata(dx0, dx1);
		SUB_S8_SAT;

/*
 * store first bytes of result
 */
		vis_pst_8(dz, dpz, emask);

		px += (8 + off);
		py += (8 + off);
		len -= (8 + off);
		dpz++;

		if (len <= 0)
			return (MLIB_SUCCESS);
	}

	even_8 = len >> 3;
	rest_8 = len & 0x7;

/*
 * Now try to analyze source "x" and "y" addresses.
 */
	if ((!((mlib_addr)px & 7)) && (!((mlib_addr)py & 7))) {

/*
 * Both addresses are 8-byte aligned. No vis_alignaddr
 * and vis_faligndata at all.
 */
		dpx = (mlib_d64 *)px;
		dpy = (mlib_d64 *)py;
		dx = vis_ld_d64_nf(dpx);
		dpx++;
		dy = vis_ld_d64_nf(dpy);
		dpy++;
#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dx1 = vis_ld_d64_nf(dpx);
			dy1 = vis_ld_d64_nf(dpy);
			SUB_S8_SAT;
			dx = dx1;
			dy = dy1;

/*
 * store 8 bytes of result
 */
			dpz[0] = dz;
			dpx++;
			dpy++;
			dpz++;
		}

		/* prime dx1/dy1 for the tail computation below */
		dx1 = dx;
		dy1 = dy;
	} else if ((!((mlib_addr)px & 7))) {

/*
 * First ("x") address is 8-byte aligned. vis_alignaddr
 * and vis_faligndata only for "y".
 */
		dpx = (mlib_d64 *)px;
		dpy = vis_alignaddr(py, 0);
		dy0 = vis_ld_d64_nf(dpy);
		dpy++;
		dy1 = vis_ld_d64_nf(dpy);
		dy = vis_faligndata(dy0, dy1);
		dx = vis_ld_d64_nf(dpx);
		dpx++;
#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			SUB_S8_SAT;
			dx = vis_ld_d64_nf(dpx);
			dy0 = dy1;
			dy1 = vis_ld_d64_nf(dpy + 1);
			dy = vis_faligndata(dy0, dy1);

/*
 * store 8 bytes of result
 */
			(*dpz++) = dz;
			dpx++;
			dpy++;
		}

		dx1 = dx;
		dy1 = dy0;
	} else if ((!((mlib_addr)py & 7))) {

/*
 * Second ("y") address is 8-byte aligned. vis_alignaddr
 * and vis_faligndata only for "x".
 */
		dpy = (mlib_d64 *)py;
		dpx = vis_alignaddr(px, 0);
		dx1 = vis_ld_d64_nf(dpx);
		dpx++;
#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dy = (*dpy++);
			dx0 = dx1;
			dx1 = vis_ld_d64_nf(dpx);
			dpx++;
			dx = vis_faligndata(dx0, dx1);
			SUB_S8_SAT;

/*
 * store 8 bytes of result
 */
			(*dpz++) = dz;
		}

		dy1 = vis_ld_d64_nf(dpy);
		dpy++;
	} else if (((mlib_addr)px & 7) == ((mlib_addr)py & 7)) {

/*
 * Both ("x" and "y") addresses are identically aligned.
 * There are 1 vis_alignaddr and 2 vis_faligndata(s) in the loop.
 */
		dpx = vis_alignaddr(px, 0);
		dx1 = vis_ld_d64_nf(dpx);
		dpx++;
		dpy = vis_alignaddr(py, 0);
		dy1 = vis_ld_d64_nf(dpy);
		dpy++;
#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dy0 = dy1;
			dy1 = vis_ld_d64_nf(dpy);
			dpy++;
			dy = vis_faligndata(dy0, dy1);
			dx0 = dx1;
			dx1 = vis_ld_d64_nf(dpx);
			dpx++;
			dx = vis_faligndata(dx0, dx1);
			SUB_S8_SAT;

/*
 * store 8 bytes of result
 */
			(*dpz++) = dz;
		}
	} else {

/*
 * Both ("x" and "y") addresses are arbitrarily aligned.
 * 2 vis_alignaddr(s) and 2 vis_faligndata(s) in the loop.
 */
		dpx = vis_alignaddr(px, 0);
		dx0 = vis_ld_d64_nf(dpx);
		dpx++;
		dx1 = vis_ld_d64_nf(dpx);
		dx = vis_faligndata(dx0, dx1);
		dpy = vis_alignaddr(py, 0);
		dy0 = vis_ld_d64_nf(dpy);
		dpy++;
		dy1 = vis_ld_d64_nf(dpy);
		dy = vis_faligndata(dy0, dy1);

/* #pragma pipeloop(0) */
		for (i = 0; i < even_8; i++) {
			SUB_S8_SAT;
			vis_alignaddr(py, 0);
			dy0 = dy1;
			dy1 = vis_ld_d64_nf(dpy + 1);
			dy = vis_faligndata(dy0, dy1);
			vis_alignaddr(px, 0);
			dx0 = dx1;
			dx1 = vis_ld_d64_nf(dpx + 1);
			dx = vis_faligndata(dx0, dx1);

/*
 * store 8 bytes of result
 */
			(*dpz++) = dz;
			dpy++;
			dpx++;
		}

		dx1 = dx0;
		dy1 = dy0;
	}

	if (!rest_8)
		return (MLIB_SUCCESS);

	/* compute one more 8-byte group for the trailing rest_8 bytes */
	vis_alignaddr(px, 0);
	dx0 = dx1;
	dx1 = vis_ld_d64_nf(dpx);
	dx = vis_faligndata(dx0, dx1);
	vis_alignaddr(py, 0);
	dy0 = dy1;
	dy1 = vis_ld_d64_nf(dpy);
	dy = vis_faligndata(dy0, dy1);
	SUB_S8_SAT;

/*
 * prepare edge mask for the last bytes: edge8 with byte offset
 * rest_8 selects positions rest_8..7, so ~emask selects the first
 * rest_8 bytes of the aligned group
 */
	emask = vis_edge8((void *)(rest_8), pzend);

/* store last bytes of result */
	vis_pst_8(dz, dpz, ~emask);
	return (MLIB_SUCCESS);
}
/*
 * mlib_v_ImageLookUpSI_S16_U8_4_DstOff3_D1 - 4-channel table lookup of
 * S16 source pixels into U8 destination, for a destination whose channel
 * phase is offset by 3 bytes relative to 8-byte alignment: the first
 * output byte of the row is table[3] of the previous pixel, so each
 * 8-byte store covers ch3 of pixel k, then ch0..ch3 of pixel k+1, then
 * ch0..ch2 of pixel k+2.
 *
 * Tables are biased by 32768 so the signed S16 value can index directly.
 * Bytes are assembled MSB-first by repeated vis_faligndata with a GSR
 * offset of 7 (each call shifts the accumulator right one byte and
 * inserts the new byte at the top).
 */
void
mlib_v_ImageLookUpSI_S16_U8_4_DstOff3_D1(
    const mlib_s16 *src,
    mlib_u8 *dst,
    mlib_s32 xsize,
    const mlib_u8 **table)
{

/* pointer to source data */
	mlib_s16 *sp;

/* source data */
	mlib_s32 s0, s1, s2;

/* pointer to start of destination */
	mlib_u8 *dl;

/* aligned pointer to destination */
	mlib_d64 *dp;

/* destination data */
	mlib_d64 t0, t1, t2;

/* destination data */
	mlib_d64 t3, t4, t5;

/* destination data */
	mlib_d64 t6, t7, acc;

/* loop variable */
	mlib_s32 i;
	const mlib_u8 *tab0 = &table[0][32768];
	const mlib_u8 *tab1 = &table[1][32768];
	const mlib_u8 *tab2 = &table[2][32768];
	const mlib_u8 *tab3 = &table[3][32768];

	sp = (void *)src;
	dl = dst;
	dp = (mlib_d64 *)dl;

	/* GSR byte offset 7: faligndata acts as a 1-byte-insert shifter */
	vis_alignaddr((void *)0, 7);

	/* s0 is the pixel whose ch3 starts the current 8-byte group */
	s0 = (*sp++);

	if (xsize >= 2) {

		s1 = sp[0];
		s2 = sp[1];
		sp += 2;

#pragma pipeloop(0)
		for (i = 0; i <= xsize - 4; i += 2, sp += 2) {
			/* bytes inserted in reverse order: last in = first out */
			t7 = VIS_LD_U8_I(tab2, s2);
			t6 = VIS_LD_U8_I(tab1, s2);
			t5 = VIS_LD_U8_I(tab0, s2);
			t4 = VIS_LD_U8_I(tab3, s1);
			t3 = VIS_LD_U8_I(tab2, s1);
			t2 = VIS_LD_U8_I(tab1, s1);
			t1 = VIS_LD_U8_I(tab0, s1);
			t0 = VIS_LD_U8_I(tab3, s0);
			acc = vis_faligndata(t7, acc);
			acc = vis_faligndata(t6, acc);
			acc = vis_faligndata(t5, acc);
			acc = vis_faligndata(t4, acc);
			acc = vis_faligndata(t3, acc);
			acc = vis_faligndata(t2, acc);
			acc = vis_faligndata(t1, acc);
			acc = vis_faligndata(t0, acc);
			s0 = s2;
			s1 = sp[0];
			s2 = sp[1];
			(*dp++) = acc;
		}

		/* epilogue: flush the last fully-loaded pixel pair */
		t7 = VIS_LD_U8_I(tab2, s2);
		t6 = VIS_LD_U8_I(tab1, s2);
		t5 = VIS_LD_U8_I(tab0, s2);
		t4 = VIS_LD_U8_I(tab3, s1);
		t3 = VIS_LD_U8_I(tab2, s1);
		t2 = VIS_LD_U8_I(tab1, s1);
		t1 = VIS_LD_U8_I(tab0, s1);
		t0 = VIS_LD_U8_I(tab3, s0);
		acc = vis_faligndata(t7, acc);
		acc = vis_faligndata(t6, acc);
		acc = vis_faligndata(t5, acc);
		acc = vis_faligndata(t4, acc);
		acc = vis_faligndata(t3, acc);
		acc = vis_faligndata(t2, acc);
		acc = vis_faligndata(t1, acc);
		acc = vis_faligndata(t0, acc);
		s0 = s2;
		(*dp++) = acc;
	}

	dl = (mlib_u8 *)dp;

	/* odd trailing pixel: emit 4 bytes (ch3 of s0, ch0..ch2 of s1) */
	if ((xsize & 1) != 0) {
		s1 = sp[0];
		t7 = VIS_LD_U8_I(tab2, s1);
		t6 = VIS_LD_U8_I(tab1, s1);
		t5 = VIS_LD_U8_I(tab0, s1);
		t4 = VIS_LD_U8_I(tab3, s0);
		acc = vis_faligndata(t7, acc);
		acc = vis_faligndata(t6, acc);
		acc = vis_faligndata(t5, acc);
		acc = vis_faligndata(t4, acc);
		*(mlib_f32 *)dl = vis_read_hi(acc);
		dl += 4;
		s0 = s1;
	}

	/* final dangling ch3 byte of the last pixel */
	dl[0] = tab3[s0];
}
/*
 * mlib_v_ImageLookUpSI_S16_U8_2_DstA8D1 - 2-channel table lookup of S16
 * source into U8 destination for an 8-byte-aligned destination row.
 * Each source pixel produces 2 bytes (table[0][s], table[1][s]); four
 * pixels fill one 8-byte store.
 *
 * Tables are biased by 32768 so the signed S16 value can index directly.
 * Bytes are assembled MSB-first via vis_faligndata with GSR offset 7.
 * The tail (< 4 pixels) is built backwards (sp walks down) so the
 * pixels land in the high bytes of acc, then stored under a 16-bit edge
 * mask.
 */
void
mlib_v_ImageLookUpSI_S16_U8_2_DstA8D1(
    const mlib_s16 *src,
    mlib_u8 *dst,
    mlib_s32 xsize,
    const mlib_u8 **table)
{

/* pointer to source data */
	mlib_s16 *sp;

/* source data */
	mlib_s32 s0, s1, s2, s3;

/* pointer to start of destination */
	mlib_u16 *dl;

/* pointer to end of destination */
	mlib_u16 *dend;

/* aligned pointer to destination */
	mlib_d64 *dp;

/* destination data */
	mlib_d64 t0, t1, t2;

/* destination data */
	mlib_d64 t3, t4, t5;

/* destination data */
	mlib_d64 t6, t7, acc;

/* edge mask */
	mlib_s32 emask;

/* loop variable */
	mlib_s32 i, num;
	const mlib_u8 *tab0 = &table[0][32768];
	const mlib_u8 *tab1 = &table[1][32768];

	sp = (void *)src;
	dl = (mlib_u16 *)dst;
	dp = (mlib_d64 *)dl;
	dend = dl + xsize - 1;

	/* GSR byte offset 7: faligndata acts as a 1-byte-insert shifter */
	vis_alignaddr((void *)0, 7);

	if (xsize >= 4) {

		s0 = sp[0];
		s1 = sp[1];
		s2 = sp[2];
		s3 = sp[3];
		sp += 4;

#pragma pipeloop(0)
		for (i = 0; i <= xsize - 8; i += 4, sp += 4) {
			/* bytes inserted in reverse order: last in = first out */
			t7 = VIS_LD_U8_I(tab1, s3);
			t6 = VIS_LD_U8_I(tab0, s3);
			t5 = VIS_LD_U8_I(tab1, s2);
			t4 = VIS_LD_U8_I(tab0, s2);
			t3 = VIS_LD_U8_I(tab1, s1);
			t2 = VIS_LD_U8_I(tab0, s1);
			t1 = VIS_LD_U8_I(tab1, s0);
			t0 = VIS_LD_U8_I(tab0, s0);
			acc = vis_faligndata(t7, acc);
			acc = vis_faligndata(t6, acc);
			acc = vis_faligndata(t5, acc);
			acc = vis_faligndata(t4, acc);
			acc = vis_faligndata(t3, acc);
			acc = vis_faligndata(t2, acc);
			acc = vis_faligndata(t1, acc);
			acc = vis_faligndata(t0, acc);
			s0 = sp[0];
			s1 = sp[1];
			s2 = sp[2];
			s3 = sp[3];
			(*dp++) = acc;
		}

		/* epilogue: flush the last fully-loaded group of 4 pixels */
		t7 = VIS_LD_U8_I(tab1, s3);
		t6 = VIS_LD_U8_I(tab0, s3);
		t5 = VIS_LD_U8_I(tab1, s2);
		t4 = VIS_LD_U8_I(tab0, s2);
		t3 = VIS_LD_U8_I(tab1, s1);
		t2 = VIS_LD_U8_I(tab0, s1);
		t1 = VIS_LD_U8_I(tab1, s0);
		t0 = VIS_LD_U8_I(tab0, s0);
		acc = vis_faligndata(t7, acc);
		acc = vis_faligndata(t6, acc);
		acc = vis_faligndata(t5, acc);
		acc = vis_faligndata(t4, acc);
		acc = vis_faligndata(t3, acc);
		acc = vis_faligndata(t2, acc);
		acc = vis_faligndata(t1, acc);
		acc = vis_faligndata(t0, acc);
		(*dp++) = acc;
	}

	/* tail: num+1 remaining pixels, processed from last to first */
	if ((mlib_addr)dp <= (mlib_addr)dend) {

		num = (mlib_u16 *)dend - (mlib_u16 *)dp;
		sp += num;

#pragma pipeloop(0)
		for (i = 0; i <= num; i++) {
			s0 = (mlib_s32)*sp;
			sp--;

			t0 = VIS_LD_U8_I(tab1, s0);
			acc = vis_faligndata(t0, acc);

			t0 = VIS_LD_U8_I(tab0, s0);
			acc = vis_faligndata(t0, acc);
		}

		emask = vis_edge16(dp, dend);
		vis_pst_16(acc, dp, emask);
	}
}
/*
 * mlib_v_ImageLookUpSI_S16_U8_3_D1 - 3-channel table lookup of S16 source
 * into U8 destination.  Each pixel yields 3 output bytes, so 8 pixels
 * produce 24 bytes = exactly three 8-byte stores (acc0, acc1, acc2); the
 * channel/pixel assignments below spell out how the 24 bytes straddle the
 * three accumulators.
 *
 * Tables are biased by 32768 so the signed S16 value can index directly.
 * Bytes are assembled MSB-first via vis_faligndata with GSR offset 7
 * (each call shifts the accumulator right one byte and inserts the new
 * byte at the top, so loads are issued in reverse output order).
 * Leftover pixels (< 8) are written with a plain scalar loop.
 */
void
mlib_v_ImageLookUpSI_S16_U8_3_D1(
    const mlib_s16 *src,
    mlib_u8 *dst,
    mlib_s32 xsize,
    const mlib_u8 **table)
{

/* pointer to source data */
	mlib_s16 *sp;

/* pointer to start of destination */
	mlib_u8 *dl;

/* aligned pointer to destination */
	mlib_d64 *dp;

/* destination data */
	mlib_d64 t0, t1, t2;

/* destination data */
	mlib_d64 t3, t4, t5;

/* destination data */
	mlib_d64 t6, t7;

/* destination data */
	mlib_d64 acc0, acc1, acc2;

/* loop variable */
	mlib_s32 i;
	const mlib_u8 *tab0 = &table[0][32768];
	const mlib_u8 *tab1 = &table[1][32768];
	const mlib_u8 *tab2 = &table[2][32768];
	mlib_s32 s00, s01, s02, s03;
	mlib_s32 s10, s11, s12, s13;

	sp = (void *)src;
	dl = dst;
	dp = (mlib_d64 *)dl;

	/* GSR byte offset 7: faligndata acts as a 1-byte-insert shifter */
	vis_alignaddr((void *)0, 7);

	i = 0;

	if (xsize >= 8) {

		s00 = sp[0];
		s01 = sp[1];
		s02 = sp[2];
		s03 = sp[3];
		s10 = sp[4];
		s11 = sp[5];
		s12 = sp[6];
		s13 = sp[7];
		sp += 8;

#pragma pipeloop(0)
		for (i = 0; i <= xsize - 16; i += 8, sp += 8) {
			/* acc0: ch0-2 of p0, ch0-2 of p1, ch0-1 of p2 */
			t7 = VIS_LD_U8_I(tab1, s02);
			t6 = VIS_LD_U8_I(tab0, s02);
			t5 = VIS_LD_U8_I(tab2, s01);
			t4 = VIS_LD_U8_I(tab1, s01);
			t3 = VIS_LD_U8_I(tab0, s01);
			t2 = VIS_LD_U8_I(tab2, s00);
			t1 = VIS_LD_U8_I(tab1, s00);
			t0 = VIS_LD_U8_I(tab0, s00);
			acc0 = vis_faligndata(t7, acc0);
			acc0 = vis_faligndata(t6, acc0);
			acc0 = vis_faligndata(t5, acc0);
			acc0 = vis_faligndata(t4, acc0);
			acc0 = vis_faligndata(t3, acc0);
			acc0 = vis_faligndata(t2, acc0);
			acc0 = vis_faligndata(t1, acc0);
			acc0 = vis_faligndata(t0, acc0);
			/* acc1: ch2 of p2, ch0-2 of p3, ch0-2 of p4, ch0 of p5 */
			t7 = VIS_LD_U8_I(tab0, s11);
			t6 = VIS_LD_U8_I(tab2, s10);
			t5 = VIS_LD_U8_I(tab1, s10);
			t4 = VIS_LD_U8_I(tab0, s10);
			t3 = VIS_LD_U8_I(tab2, s03);
			t2 = VIS_LD_U8_I(tab1, s03);
			t1 = VIS_LD_U8_I(tab0, s03);
			t0 = VIS_LD_U8_I(tab2, s02);
			acc1 = vis_faligndata(t7, acc1);
			acc1 = vis_faligndata(t6, acc1);
			acc1 = vis_faligndata(t5, acc1);
			acc1 = vis_faligndata(t4, acc1);
			acc1 = vis_faligndata(t3, acc1);
			acc1 = vis_faligndata(t2, acc1);
			acc1 = vis_faligndata(t1, acc1);
			acc1 = vis_faligndata(t0, acc1);
			/* acc2: ch1-2 of p5, ch0-2 of p6, ch0-2 of p7 */
			t7 = VIS_LD_U8_I(tab2, s13);
			t6 = VIS_LD_U8_I(tab1, s13);
			t5 = VIS_LD_U8_I(tab0, s13);
			t4 = VIS_LD_U8_I(tab2, s12);
			t3 = VIS_LD_U8_I(tab1, s12);
			t2 = VIS_LD_U8_I(tab0, s12);
			t1 = VIS_LD_U8_I(tab2, s11);
			t0 = VIS_LD_U8_I(tab1, s11);
			acc2 = vis_faligndata(t7, acc2);
			acc2 = vis_faligndata(t6, acc2);
			acc2 = vis_faligndata(t5, acc2);
			acc2 = vis_faligndata(t4, acc2);
			acc2 = vis_faligndata(t3, acc2);
			acc2 = vis_faligndata(t2, acc2);
			acc2 = vis_faligndata(t1, acc2);
			acc2 = vis_faligndata(t0, acc2);
			s00 = sp[0];
			s01 = sp[1];
			s02 = sp[2];
			s03 = sp[3];
			s10 = sp[4];
			s11 = sp[5];
			s12 = sp[6];
			s13 = sp[7];
			(*dp++) = acc0;
			(*dp++) = acc1;
			(*dp++) = acc2;
		}

		/* epilogue: flush the last fully-loaded group of 8 pixels */
		t7 = VIS_LD_U8_I(tab1, s02);
		t6 = VIS_LD_U8_I(tab0, s02);
		t5 = VIS_LD_U8_I(tab2, s01);
		t4 = VIS_LD_U8_I(tab1, s01);
		t3 = VIS_LD_U8_I(tab0, s01);
		t2 = VIS_LD_U8_I(tab2, s00);
		t1 = VIS_LD_U8_I(tab1, s00);
		t0 = VIS_LD_U8_I(tab0, s00);
		acc0 = vis_faligndata(t7, acc0);
		acc0 = vis_faligndata(t6, acc0);
		acc0 = vis_faligndata(t5, acc0);
		acc0 = vis_faligndata(t4, acc0);
		acc0 = vis_faligndata(t3, acc0);
		acc0 = vis_faligndata(t2, acc0);
		acc0 = vis_faligndata(t1, acc0);
		acc0 = vis_faligndata(t0, acc0);
		t7 = VIS_LD_U8_I(tab0, s11);
		t6 = VIS_LD_U8_I(tab2, s10);
		t5 = VIS_LD_U8_I(tab1, s10);
		t4 = VIS_LD_U8_I(tab0, s10);
		t3 = VIS_LD_U8_I(tab2, s03);
		t2 = VIS_LD_U8_I(tab1, s03);
		t1 = VIS_LD_U8_I(tab0, s03);
		t0 = VIS_LD_U8_I(tab2, s02);
		acc1 = vis_faligndata(t7, acc1);
		acc1 = vis_faligndata(t6, acc1);
		acc1 = vis_faligndata(t5, acc1);
		acc1 = vis_faligndata(t4, acc1);
		acc1 = vis_faligndata(t3, acc1);
		acc1 = vis_faligndata(t2, acc1);
		acc1 = vis_faligndata(t1, acc1);
		acc1 = vis_faligndata(t0, acc1);
		t7 = VIS_LD_U8_I(tab2, s13);
		t6 = VIS_LD_U8_I(tab1, s13);
		t5 = VIS_LD_U8_I(tab0, s13);
		t4 = VIS_LD_U8_I(tab2, s12);
		t3 = VIS_LD_U8_I(tab1, s12);
		t2 = VIS_LD_U8_I(tab0, s12);
		t1 = VIS_LD_U8_I(tab2, s11);
		t0 = VIS_LD_U8_I(tab1, s11);
		acc2 = vis_faligndata(t7, acc2);
		acc2 = vis_faligndata(t6, acc2);
		acc2 = vis_faligndata(t5, acc2);
		acc2 = vis_faligndata(t4, acc2);
		acc2 = vis_faligndata(t3, acc2);
		acc2 = vis_faligndata(t2, acc2);
		acc2 = vis_faligndata(t1, acc2);
		acc2 = vis_faligndata(t0, acc2);
		(*dp++) = acc0;
		(*dp++) = acc1;
		(*dp++) = acc2;
		i += 8;
	}

	dl = (mlib_u8 *)dp;

	/* scalar tail for the remaining (xsize & 7) pixels */
#pragma pipeloop(0)
	for (; i < xsize; i++) {
		s00 = sp[0];
		dl[0] = tab0[s00];
		dl[1] = tab1[s00];
		dl[2] = tab2[s00];
		dl += 3;
		sp++;
	}
}
/*
 * mlib_v_ImageLookUp_S16_U8_124_D1 - table lookup of S16 source into U8
 * destination for 1-, 2- or 4-channel images: the caller passes the four
 * per-position tables (table0..table3) already cycled for the channel
 * count, and this routine simply applies them round-robin to consecutive
 * source values, 8 at a time.
 *
 * NOTE(review): unlike the SI variants the tables here are indexed with
 * the raw s16 value — presumably the caller pre-biases the table
 * pointers; confirm against the call sites.
 *
 * Bytes are assembled MSB-first via vis_faligndata with GSR offset 7.
 * The tail is built backwards (sp walks down), peeling num % 4 entries
 * first so the table round-robin stays in phase, then one final group of
 * four; the result is stored under an 8-bit edge mask.
 */
void
mlib_v_ImageLookUp_S16_U8_124_D1(const mlib_s16 *src, mlib_u8 *dst,
    mlib_s32 xsize, const mlib_u8 *table0, const mlib_u8 *table1,
    const mlib_u8 *table2, const mlib_u8 *table3)
{
	mlib_s16 *sp;		/* pointer to source data */
	mlib_s32 s0, s1, s2, s3;	/* source data */
	mlib_s32 s4, s5, s6, s7;	/* source data */
	mlib_u8 *dl;		/* pointer to start of destination */
	mlib_u8 *dend;		/* pointer to end of destination */
	mlib_d64 *dp;		/* aligned pointer to destination */
	mlib_d64 t0, t1, t2;	/* destination data */
	mlib_d64 t3, t4, t5;	/* destination data */
	mlib_d64 t6, t7, acc;	/* destination data */
	mlib_s32 emask;		/* edge mask */
	mlib_s32 i, num;	/* loop variable */

	dl = dst;
	dp = (mlib_d64 *) dl;
	dend = dl + xsize - 1;
	sp = (void *)src;

	/* GSR byte offset 7: faligndata acts as a 1-byte-insert shifter */
	vis_alignaddr((void *)0, 7);

	if (xsize >= 8) {

		s0 = sp[0];
		s1 = sp[1];
		s2 = sp[2];
		s3 = sp[3];
		s4 = sp[4];
		s5 = sp[5];
		s6 = sp[6];
		s7 = sp[7];
		sp += 8;

#pragma pipeloop(0)
		for (i = 0; i <= xsize - 16; i += 8, sp += 8) {
			/* bytes inserted in reverse order: last in = first out */
			t7 = VIS_LD_U8_I(table3, s7);
			t6 = VIS_LD_U8_I(table2, s6);
			t5 = VIS_LD_U8_I(table1, s5);
			t4 = VIS_LD_U8_I(table0, s4);
			t3 = VIS_LD_U8_I(table3, s3);
			t2 = VIS_LD_U8_I(table2, s2);
			t1 = VIS_LD_U8_I(table1, s1);
			t0 = VIS_LD_U8_I(table0, s0);
			acc = vis_faligndata(t7, acc);
			acc = vis_faligndata(t6, acc);
			acc = vis_faligndata(t5, acc);
			acc = vis_faligndata(t4, acc);
			acc = vis_faligndata(t3, acc);
			acc = vis_faligndata(t2, acc);
			acc = vis_faligndata(t1, acc);
			acc = vis_faligndata(t0, acc);
			s0 = sp[0];
			s1 = sp[1];
			s2 = sp[2];
			s3 = sp[3];
			s4 = sp[4];
			s5 = sp[5];
			s6 = sp[6];
			s7 = sp[7];
			*dp++ = acc;
		}

		/* epilogue: flush the last fully-loaded group of 8 */
		t7 = VIS_LD_U8_I(table3, s7);
		t6 = VIS_LD_U8_I(table2, s6);
		t5 = VIS_LD_U8_I(table1, s5);
		t4 = VIS_LD_U8_I(table0, s4);
		t3 = VIS_LD_U8_I(table3, s3);
		t2 = VIS_LD_U8_I(table2, s2);
		t1 = VIS_LD_U8_I(table1, s1);
		t0 = VIS_LD_U8_I(table0, s0);
		acc = vis_faligndata(t7, acc);
		acc = vis_faligndata(t6, acc);
		acc = vis_faligndata(t5, acc);
		acc = vis_faligndata(t4, acc);
		acc = vis_faligndata(t3, acc);
		acc = vis_faligndata(t2, acc);
		acc = vis_faligndata(t1, acc);
		acc = vis_faligndata(t0, acc);
		*dp++ = acc;
	}

	/* tail: num remaining values, processed from last to first */
	if ((mlib_addr) dp <= (mlib_addr) dend) {
		num = (mlib_addr) dend - (mlib_addr) dp;
		sp += num;
		num++;

		/* peel num % 4 so the table round-robin stays in phase */
		if ((num & 3) == 1) {
			s0 = (mlib_s32) * sp;
			sp--;
			t0 = VIS_LD_U8_I(table0, s0);
			acc = vis_faligndata(t0, acc);
			num--;
		} else if ((num & 3) == 2) {
			s0 = (mlib_s32) * sp;
			sp--;
			t0 = VIS_LD_U8_I(table1, s0);
			acc = vis_faligndata(t0, acc);
			s0 = (mlib_s32) * sp;
			sp--;
			t0 = VIS_LD_U8_I(table0, s0);
			acc = vis_faligndata(t0, acc);
			num -= 2;
		} else if ((num & 3) == 3) {
			s0 = (mlib_s32) * sp;
			sp--;
			t0 = VIS_LD_U8_I(table2, s0);
			acc = vis_faligndata(t0, acc);
			s0 = (mlib_s32) * sp;
			sp--;
			t0 = VIS_LD_U8_I(table1, s0);
			acc = vis_faligndata(t0, acc);
			s0 = (mlib_s32) * sp;
			sp--;
			t0 = VIS_LD_U8_I(table0, s0);
			acc = vis_faligndata(t0, acc);
			num -= 3;
		}

		/* at most one full group of four remains */
		if (num != 0) {
			s0 = (mlib_s32) * sp;
			sp--;
			t0 = VIS_LD_U8_I(table3, s0);
			acc = vis_faligndata(t0, acc);
			s0 = (mlib_s32) * sp;
			sp--;
			t0 = VIS_LD_U8_I(table2, s0);
			acc = vis_faligndata(t0, acc);
			s0 = (mlib_s32) * sp;
			sp--;
			t0 = VIS_LD_U8_I(table1, s0);
			acc = vis_faligndata(t0, acc);
			s0 = (mlib_s32) * sp;
			sp--;
			t0 = VIS_LD_U8_I(table0, s0);
			acc = vis_faligndata(t0, acc);
		}

		emask = vis_edge8(dp, dend);
		vis_pst_8(acc, dp, emask);
	}
}
/*
 * mlib_v_VideoColorYUV2ABGR422_nonalign - convert a 4:2:2 YUV image
 * (separate y/u/v planes, u and v at half horizontal resolution) to
 * packed ABGR, for the case where none of the pointers is guaranteed
 * 8-byte aligned.
 *
 * Conversion follows the ITU-R BT.601-style matrix the constants encode:
 *   R = 1.1644*Y + 1.5966*V - 222.9952/128
 *   G = 1.1644*Y - 0.3920*U - 0.8132*V + 135.6352/128
 *   B = 1.1644*Y + 2.0184*U - 276.9856/128
 * with the products done in VIS fixed point (coefficients scaled by 8192,
 * GSR scale 2 for fpack16).
 *
 * The inner loop is software-pipelined and juggles four vis_alignaddr
 * contexts: off2/off3 (u/v realignment after fpmerge doubling), sp1
 * (y realignment) and off (destination realignment).  The alpha byte of
 * each 4-byte ABGR pixel is skipped by storing through vis_pst_8 with
 * mask 0x7777 (shifted per-row by the destination phase).  emask1/inc
 * implement the partially-written first 8-byte group of each row.
 */
static mlib_status
mlib_v_VideoColorYUV2ABGR422_nonalign(
    mlib_u8 *abgr,
    const mlib_u8 *y,
    const mlib_u8 *u,
    const mlib_u8 *v,
    mlib_s32 width,
    mlib_s32 height,
    mlib_s32 abgr_stride,
    mlib_s32 y_stride,
    mlib_s32 uv_stride)
{
/* pointers to src address */
	mlib_u8 *sp2, *sp3, *sl2, *sl3;

/* pointers to src address */
	mlib_u8 *sp1, *sl1;

/* pointers to dst address */
	mlib_u8 *dp, *dl, *dend;

/* all. pointer to y */
	mlib_d64 *spy;

/* all. pointer to dst */
	mlib_d64 *dpp;

/* u, v data */
	mlib_f32 fu0, fu1, fv0, fv1;

/* y data */
	mlib_d64 dy0, dy1, dy3;
	mlib_d64 du, dv;

/* (1.1644, 1.5966)*8192 */
	mlib_f32 k12 = vis_to_float(0x25433317);

/* (-.3920, -.8132)*8192 */
	mlib_f32 k34 = vis_to_float(0xf375e5fa);

/* 2.0184*8192 */
	mlib_f32 k5 = vis_to_float(0x1004097);
	mlib_d64 k_222_9952 = vis_to_double(0x1be01be0, 0x1be01be0);
	mlib_d64 k_135_6352 = vis_to_double(0x10f410f4, 0x10f410f4);
	mlib_d64 k_276_9856 = vis_to_double(0x22a022a0, 0x22a022a0);
	mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
	mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
	mlib_d64 y_11644_hi, y_11644_lo;
	mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
	mlib_d64 temp_r_hi, temp_r_lo, temp_g_hi, temp_g_lo, temp_b_hi,
	    temp_b_lo;
	mlib_f32 red_hi, red_lo, green_hi, green_lo, blue_hi, blue_lo;
	mlib_d64 blue_red_hi, x_green_hi, blue_red_lo, x_green_lo;
	mlib_d64 dd, dd0, dd1;

/* loop variable */
	mlib_s32 i, j;

/* alpha_ch. is not written */
	mlib_s32 emask = 0x7777;
	mlib_s32 emask1;
	mlib_s32 off;
	mlib_f32 *dfu, *dfv;
	mlib_d64 du0, du1, dv0, dv1;
	mlib_s32 off2, off3;
	mlib_s32 inc;

/*
 * initialize GSR scale factor
 */
	vis_write_gsr(2 << 3);

	sp1 = sl1 = (mlib_u8 *)y;
	sp2 = sl2 = (mlib_u8 *)u;
	sp3 = sl3 = (mlib_u8 *)v;
	dl = dp = (mlib_u8 *)abgr;

/*
 * row loop
 */
	for (j = 0; j < height; j++) {
		spy = (mlib_d64 *)vis_alignaddr(sp1, 0);
		dpp = (mlib_d64 *)vis_alignaddr(dp, 0);

		/*
		 * u/v are read 4 bytes at a time and doubled by fpmerge;
		 * off2/off3 are the resulting 8-byte faligndata offsets
		 */
		dfu = (mlib_f32 *)((mlib_addr)sp2 & ~3);
		off2 = (sp2 - (mlib_u8 *)dfu) * 2;
		dfv = (mlib_f32 *)((mlib_addr)sp3 & ~3);
		off3 = (sp3 - (mlib_u8 *)dfv) * 2;
		dend = dp + width * 4 - 1;
		emask1 = vis_edge8(dp, dend);
		i = dp - (mlib_u8 *)dpp;

		/* rotate the no-alpha mask to the destination phase */
		emask >>= i;
		inc = (emask1 != 0xff);
		emask1 &= emask;
		off = 8 - i;

		/* prime u: load, double each byte, realign */
		vis_alignaddr((void *)off2, 0);
		fu0 = vis_ld_f32_nf(dfu);
		dfu++;
		fu1 = vis_ld_f32_nf(dfu);
		dfu++;
		du0 = vis_fpmerge(fu0, fu0);
		du1 = vis_fpmerge(fu1, fu1);
		du = vis_faligndata(du0, du1);
		du0 = du1;

		/* prime v likewise */
		vis_alignaddr((void *)off3, 0);
		fv0 = vis_ld_f32_nf(dfv);
		dfv++;
		fv1 = vis_ld_f32_nf(dfv);
		dfv++;
		dv0 = vis_fpmerge(fv0, fv0);
		dv1 = vis_fpmerge(fv1, fv1);
		dv = vis_faligndata(dv0, dv1);
		dv0 = dv1;

/* U*(-0.3920); */
		u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);

/* V*(-0.8132); */
		v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);

/* U*(-0.3920); */
		u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);

/* V*(-0.8132); */
		v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

		/* prime y */
		vis_alignaddr(sp1, 0);
		dy0 = vis_ld_d64_nf(spy);
		spy++;
		dy3 = vis_ld_d64_nf(spy);
		spy++;
		dy1 = vis_faligndata(dy0, dy3);
		dy0 = dy3;

/*
 * 16-pixel column loop (8 luma pixels per iteration)
 */
#pragma pipeloop(0)
		for (i = 0; i <= width - 8; i += 8) {
/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);
			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);
			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);

/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);
			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);
			temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
			temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);
			green_hi = vis_fpack16(temp_g_hi);
			temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);
			blue_hi = vis_fpack16(temp_b_hi);
			temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);
			red_hi = vis_fpack16(temp_r_hi);
			temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);

			/* prefetch next u */
			vis_alignaddr((void *)off2, 0);
			fu1 = vis_ld_f32_nf(dfu);
			dfu++;
			du1 = vis_fpmerge(fu1, fu1);
			du = vis_faligndata(du0, du1);
			du0 = du1;
			green_lo = vis_fpack16(temp_g_lo);
			temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);
			blue_lo = vis_fpack16(temp_b_lo);
			x_green_hi = vis_fmul8x16au(green_hi, k5);
			red_lo = vis_fpack16(temp_r_lo);
			blue_red_hi = vis_fpmerge(blue_hi, red_hi);
			x_green_lo = vis_fmul8x16au(green_lo, k5);
			blue_red_lo = vis_fpmerge(blue_lo, red_lo);

			/* prefetch next v */
			vis_alignaddr((void *)off3, 0);
			fv1 = vis_ld_f32_nf(dfv);
			dfv++;
			dv1 = vis_fpmerge(fv1, fv1);
			dv = vis_faligndata(dv0, dv1);
			dv0 = dv1;

			/* switch to destination alignment for the 4 stores */
			vis_alignaddr((void *)off, 0);

/* U*(-0.3920); */
			u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
			dd1 = vis_fpmerge(vis_read_hi(x_green_hi),
			    vis_read_hi(blue_red_hi));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dpp += inc;
			inc = 1;

/* V*(-0.8132); */
			v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
			dd0 = vis_fpmerge(vis_read_lo(x_green_hi),
			    vis_read_lo(blue_red_hi));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);
			u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
			dd1 = vis_fpmerge(vis_read_hi(x_green_lo),
			    vis_read_hi(blue_red_lo));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp++, emask);
			v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
			dd0 = vis_fpmerge(vis_read_lo(x_green_lo),
			    vis_read_lo(blue_red_lo));
			dd = vis_faligndata(dd1, dd0);
			vis_pst_8(dd, dpp++, emask);

			/* prefetch next y */
			vis_alignaddr(sp1, 0);
			dy3 = vis_ld_d64_nf(spy);
			spy++;
			dy1 = vis_faligndata(dy0, dy3);
			dy0 = dy3;
			emask1 = emask;
		}

		/* residual 2/4/6 pixels of the row */
		if (i < width) {
			vis_alignaddr((void *)off, 0);

/* U*2.0184 */
			u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
			g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);
			u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
			g_hi = vis_fpadd16(g_hi, k_135_6352);

/* V*1.5966 */
			v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
			g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);
			v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
			g_lo = vis_fpadd16(g_lo, k_135_6352);

/* Y*1.1644 */
			y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
			b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

/* Y*1.1644 */
			y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
			b_lo = vis_fpsub16(u_20184_lo, k_276_9856);
			r_hi = vis_fpsub16(v_15966_hi, k_222_9952);
			r_lo = vis_fpsub16(v_15966_lo, k_222_9952);
			temp_g_hi = vis_fpadd16(g_hi, y_11644_hi);
			temp_b_hi = vis_fpadd16(b_hi, y_11644_hi);
			green_hi = vis_fpack16(temp_g_hi);
			temp_r_hi = vis_fpadd16(r_hi, y_11644_hi);
			blue_hi = vis_fpack16(temp_b_hi);
			temp_g_lo = vis_fpadd16(g_lo, y_11644_lo);
			red_hi = vis_fpack16(temp_r_hi);
			temp_b_lo = vis_fpadd16(b_lo, y_11644_lo);
			green_lo = vis_fpack16(temp_g_lo);
			temp_r_lo = vis_fpadd16(r_lo, y_11644_lo);
			blue_lo = vis_fpack16(temp_b_lo);
			x_green_hi = vis_fmul8x16au(green_hi, k5);
			red_lo = vis_fpack16(temp_r_lo);
			blue_red_hi = vis_fpmerge(blue_hi, red_hi);
			x_green_lo = vis_fmul8x16au(green_lo, k5);
			blue_red_lo = vis_fpmerge(blue_lo, red_lo);
			dd1 = vis_fpmerge(vis_read_hi(x_green_hi),
			    vis_read_hi(blue_red_hi));
			dd = vis_faligndata(dd0, dd1);
			vis_pst_8(dd, dpp, emask1);
			dd0 = dd1;
			dpp += inc;
			i += 2;

			if (i < width) {
				dd1 = vis_fpmerge(vis_read_lo(x_green_hi),
				    vis_read_lo(blue_red_hi));
				dd = vis_faligndata(dd0, dd1);
				vis_pst_8(dd, dpp++, emask);
				dd0 = dd1;
				i += 2;

				if (i < width) {
					dd1 = vis_fpmerge(vis_read_hi
					    (x_green_lo),
					    vis_read_hi(blue_red_lo));
					dd = vis_faligndata(dd0, dd1);
					vis_pst_8(dd, dpp++, emask);
					dd0 = dd1;
				}
			}
		}

		/* flush the straggling bytes of the final aligned group */
		vis_alignaddr((void *)off, 0);
		emask1 = vis_edge8(dpp, dend);
		emask1 &= emask;
		dd = vis_faligndata(dd0, dd1);
		vis_pst_8(dd, dpp, emask1);

		sp1 = sl1 = sl1 + y_stride;
		sp2 = sl2 = sl2 + uv_stride;
		sp3 = sl3 = sl3 + uv_stride;
		dl = dp = dl + abgr_stride;
		emask = 0x7777;
	}

	return (MLIB_SUCCESS);
}
sd10 = (*spp1++); sd20 = (*spp2++); MLIB_V_ADDIMAGE_U16_emask(sd10, sd20, dpp, emask); } sp1 = sl1 += stride1; sp2 = sl2 += stride2; dp = dl += strided; } } else if ((offdst == offsrc1) && (((strided ^ stride1) & 3) == 0)) { for (j = 0; j < height; j++) { /* prepare the destination addresses */ dpp = (mlib_d64 *)vis_alignaddr(dp, 0); i = (mlib_u16 *)dpp - dp; /* prepare the source addresses */ spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0); spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i); dend = dp + amount - 1; /* generate edge mask for the start point */ emask = vis_edge16(dp, dend); sd20 = vis_ld_d64_nf(spp2); if (emask != 0xf) { sd10 = (*spp1++); sd21 = vis_ld_d64_nf(spp2 + 1);
/*
 * __mlib_VideoInterpAveX_U8_U8_16x16 - half-pel horizontal interpolation
 * with averaging for a 16x16 block: for each destination byte,
 * curr_block is averaged (via the MLIB_V_VIDEOINTERPAVG* macros) with
 * the mean of ref_block[x] and ref_block[x+1].
 *
 * The GSR align offset is set once to (ref_block & 7); sd0..sd3 are the
 * realigned ref rows.  The shifted-by-one operand is produced either by
 * re-running faligndata under the (ref_block + 1) offset (general case)
 * or — when ref_block + 1 is already 8-byte aligned — by using the raw
 * loads s1/s2/s5/s6 directly (the MLIB_V_VIDEOINTERPAVG0 variant handles
 * the first group in that case).  Two rows (4 d64 stores) are processed
 * per iteration; y = 8 iterations covers 16 rows.
 *
 * NOTE(review): frame_stride is unused here — row stepping uses
 * field_stride only; confirm this matches the sibling AveX variants.
 *
 * Always returns MLIB_SUCCESS.
 */
mlib_status
__mlib_VideoInterpAveX_U8_U8_16x16(
    mlib_u8 *curr_block,
    const mlib_u8 *ref_block,
    mlib_s32 frame_stride,
    mlib_s32 field_stride)
{
	mlib_d64 s0, s1, s2, s3, s4, s5, s6;
	mlib_d64 sd0, sd1, sd2, sd3, d0, d1, d2, d3;
	mlib_d64 *sd, *dd;
	mlib_d64 dzero = vis_fzero();
	const mlib_f32 fm2 = vis_to_float(0x1000200);
	mlib_f32 fzero = vis_read_hi(dzero);
	mlib_d64 rounder = vis_fpsub16(dzero, vis_fone());
	mlib_s32 y;

	/* rounder = -3 in each 16-bit lane (used by the AVG macros) */
	rounder = vis_fpadd16(vis_fpadd16(rounder, rounder), rounder);

	/* GSR: scale 5 for fpack16, align offset = ref_block & 7 */
	vis_write_gsr((5 << 3) + ((mlib_u32)ref_block & 7));
	dd = (mlib_d64 *)curr_block;
	sd = (mlib_d64 *)((mlib_addr)ref_block & ~7);
	y = 8;

	if (((mlib_s32)(ref_block + 1) & 7)) {
		/* general case: both ref and ref+1 need faligndata */
		do {
			s0 = sd[0];
			s1 = sd[1];
			s2 = sd[2];
			sd0 = vis_faligndata(s0, s1);
			sd1 = vis_faligndata(s1, s2);
			sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
			s4 = sd[0];
			s5 = sd[1];
			s6 = sd[2];
			sd2 = vis_faligndata(s4, s5);
			sd3 = vis_faligndata(s5, s6);

			/* switch GSR offset to ref_block + 1 */
			vis_alignaddr((void *)(ref_block + 1), 0);
			sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
			d0 = dd[0];
			d1 = dd[1];
			d2 = ((mlib_d64 *)((mlib_u8 *)dd + field_stride))[0];
			d3 = ((mlib_d64 *)((mlib_u8 *)dd + field_stride))[1];

			/* same raw loads realigned one byte further */
			s0 = vis_faligndata(s0, s1);
			s1 = vis_faligndata(s1, s2);
			s2 = vis_faligndata(s4, s5);
			s3 = vis_faligndata(s5, s6);
			MLIB_V_VIDEOINTERPAVG(d0, sd0, s0);
			MLIB_V_VIDEOINTERPAVG(d1, sd1, s1);
			MLIB_V_VIDEOINTERPAVG(d2, sd2, s2);
			MLIB_V_VIDEOINTERPAVG(d3, sd3, s3);
			dd[0] = d0;
			dd[1] = d1;
			dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
			dd[0] = d2;
			dd[1] = d3;
			dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);

			/* restore GSR offset to ref_block for next rows */
			vis_alignaddr((void *)ref_block, 0);
		} while (--y);
	} else {
		/* ref_block + 1 is aligned: raw loads serve as the
		 * shifted operand (s1, s2, s5, s6) */
		do {
			s0 = sd[0];
			s1 = sd[1];
			s2 = sd[2];
			sd0 = vis_faligndata(s0, s1);
			sd1 = vis_faligndata(s1, s2);
			sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
			s4 = sd[0];
			s5 = sd[1];
			s6 = sd[2];
			sd2 = vis_faligndata(s4, s5);
			sd3 = vis_faligndata(s5, s6);
			sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
			d0 = dd[0];
			d1 = dd[1];
			d2 = ((mlib_d64 *)((mlib_u8 *)dd + field_stride))[0];
			d3 = ((mlib_d64 *)((mlib_u8 *)dd + field_stride))[1];
			MLIB_V_VIDEOINTERPAVG0(d0, sd0, s1);
			MLIB_V_VIDEOINTERPAVG(d1, sd1, s2);
			MLIB_V_VIDEOINTERPAVG(d2, sd2, s5);
			MLIB_V_VIDEOINTERPAVG(d3, sd3, s6);
			dd[0] = d0;
			dd[1] = d1;
			dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
			dd[0] = d2;
			dd[1] = d3;
			dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
		} while (--y);
	}

	return (MLIB_SUCCESS);
}
mlib_status mlib_v_ImageAdd_S16( mlib_image *dst, const mlib_image *src1, const mlib_image *src2) { mlib_s32 i, j, k; mlib_s32 offdst, offsrc1, offsrc2, emask; mlib_s32 amount; mlib_d64 *dpp, *spp2, *spp1, *tmp_ptr; mlib_d64 dd, dd0, dd1, sd10, sd11, sd20, sd21; mlib_s16 *dend; VALIDATE(mlib_s16); sl1 = sp1; sl2 = sp2; dl = dp; amount = width * channels; offdst = ((mlib_addr)dp) & 7; offsrc1 = ((mlib_addr)sp1) & 7; offsrc2 = ((mlib_addr)sp2) & 7; if ((offdst == offsrc1) && (offdst == offsrc2) && (((strided ^ stride1) & 3) == 0) && (((strided ^ stride2) & 3) == 0)) { for (j = 0; j < height; j++) { /* prepare the destination addresses */ dpp = (mlib_d64 *)vis_alignaddr(dp, 0); i = (mlib_s16 *)dpp - dp; /* prepare the source addresses */ spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0); spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0); dend = dp + amount - 1; /* generate edge mask for the start point */ emask = vis_edge16(dp, dend); if (emask != 0xf) { sd10 = (*spp1++); sd20 = (*spp2++); MLIB_V_ADDIMAGE_S16(sd10, sd20, dd); vis_pst_16(dd, dpp++, emask); i += 4; } #pragma pipeloop(0) for (; i <= amount - 4; i += 4) { sd10 = (*spp1++); sd20 = (*spp2++); MLIB_V_ADDIMAGE_S16(sd10, sd20, dd); (*dpp++) = dd; } if (i < amount) { emask = vis_edge16(dpp, dend); sd10 = (*spp1++); sd20 = (*spp2++); MLIB_V_ADDIMAGE_S16(sd10, sd20, dd); vis_pst_16(dd, dpp, emask); } sp1 = sl1 += stride1; sp2 = sl2 += stride2; dp = dl += strided; } } else if ((offdst == offsrc1) && (((strided ^ stride1) & 3) == 0)) { for (j = 0; j < height; j++) { /* prepare the destination addresses */ dpp = (mlib_d64 *)vis_alignaddr(dp, 0); i = (mlib_s16 *)dpp - dp; /* prepare the source addresses */ spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0); spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i); dend = dp + amount - 1; /* generate edge mask for the start point */ emask = vis_edge16(dp, dend); sd20 = vis_ld_d64_nf(spp2); if (emask != 0xf) { sd10 = (*spp1++); sd21 = vis_ld_d64_nf(spp2 + 1); sd20 = vis_faligndata(sd20, sd21); 
MLIB_V_ADDIMAGE_S16(sd10, sd20, dd); vis_pst_16(dd, dpp++, emask); sd20 = sd21; spp2++; i += 4; } #pragma pipeloop(0) for (; i <= amount - 4; i += 4) { sd10 = (*spp1++); sd21 = vis_ld_d64_nf(spp2 + 1); sd20 = vis_faligndata(sd20, sd21); MLIB_V_ADDIMAGE_S16(sd10, sd20, dd); (*dpp++) = dd; sd20 = sd21; spp2++; } if (i < amount) { emask = vis_edge16(dpp, dend); sd10 = (*spp1++); sd20 = vis_faligndata(sd20, vis_ld_d64_nf(spp2 + 1)); MLIB_V_ADDIMAGE_S16(sd10, sd20, dd); vis_pst_16(dd, dpp, emask); } sp1 = sl1 += stride1; sp2 = sl2 += stride2; dp = dl += strided; } } else if ((offdst == offsrc2) && (((strided ^ stride2) & 3) == 0)) { for (j = 0; j < height; j++) { /* prepare the destination addresses */ dpp = (mlib_d64 *)vis_alignaddr(dp, 0); i = (mlib_s16 *)dpp - dp; /* prepare the source addresses */ spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0); spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i); dend = dp + amount - 1; /* generate edge mask for the start point */ emask = vis_edge16(dp, dend); sd10 = vis_ld_d64_nf(spp1); if (emask != 0xf) { sd20 = (*spp2++); sd11 = vis_ld_d64_nf(spp1 + 1); sd10 = vis_faligndata(sd10, sd11); MLIB_V_ADDIMAGE_S16(sd10, sd20, dd); vis_pst_16(dd, dpp++, emask); sd10 = sd11; spp1++; i += 4; } #pragma pipeloop(0) for (; i <= amount - 4; i += 4) { sd20 = (*spp2++); sd11 = vis_ld_d64_nf(spp1 + 1); sd10 = vis_faligndata(sd10, sd11); MLIB_V_ADDIMAGE_S16(sd10, sd20, dd); (*dpp++) = dd; sd10 = sd11; spp1++; } if (i < amount) { emask = vis_edge16(dpp, dend); sd20 = (*spp2++); sd10 = vis_faligndata(sd10, vis_ld_d64_nf(spp1 + 1)); MLIB_V_ADDIMAGE_S16(sd10, sd20, dd); vis_pst_16(dd, dpp, emask); } sp1 = sl1 += stride1; sp2 = sl2 += stride2; dp = dl += strided; } } else if ((offsrc1 == offsrc2) && (((stride1 ^ stride2) & 3) == 0)) { for (j = 0; j < height; j++) { /* prepare the source addresses */ dpp = (mlib_d64 *)vis_alignaddr(dp, 0); i = (mlib_s16 *)dpp - dp; /* prepare the destination addresses */ spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i); spp2 = (mlib_d64 
*)vis_alignaddr(sp2, 2 * i); dend = dp + amount - 1; /* generate edge mask for the start point */ emask = vis_edge16(dp, dend); sd10 = vis_ld_d64_nf(spp1); spp1++; sd20 = vis_ld_d64_nf(spp2); spp2++; MLIB_V_ADDIMAGE_S16(sd10, sd20, dd0); if (emask != 0xf) { sd10 = vis_ld_d64_nf(spp1); spp1++; sd20 = vis_ld_d64_nf(spp2); spp2++; MLIB_V_ADDIMAGE_S16(sd10, sd20, dd1); dd = vis_faligndata(dd0, dd1); vis_pst_16(dd, dpp++, emask); dd0 = dd1; i += 4; } #pragma pipeloop(0) for (; i <= amount - 4; i += 4) { sd10 = vis_ld_d64_nf(spp1); spp1++; sd20 = vis_ld_d64_nf(spp2); spp2++; MLIB_V_ADDIMAGE_S16(sd10, sd20, dd1); (*dpp++) = vis_faligndata(dd0, dd1); dd0 = dd1; } if (i < amount) { emask = vis_edge16(dpp, dend); sd10 = vis_ld_d64_nf(spp1); spp1++; sd20 = vis_ld_d64_nf(spp2); spp2++; MLIB_V_ADDIMAGE_S16(sd10, sd20, dd1); dd = vis_faligndata(dd0, dd1); vis_pst_16(dd, dpp, emask); } sp1 = sl1 += stride1; sp2 = sl2 += stride2; dp = dl += strided; } } else { /* common case */ for (j = 0; j < height; j++) { /* prepare the destination addresses */ dpp = (mlib_d64 *)vis_alignaddr(dp, 0); i = (mlib_s16 *)dpp - dp; dend = dp + amount - 1; /* generate edge mask for the start point */ emask = vis_edge16(dp, dend); if (emask != 0xf) { spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i); sd10 = vis_faligndata(vis_ld_d64_nf(spp1), vis_ld_d64_nf(spp1 + 1)); spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i); sd20 = vis_faligndata(vis_ld_d64_nf(spp2), vis_ld_d64_nf(spp2 + 1)); MLIB_V_ADDIMAGE_S16(sd10, sd20, dd); vis_pst_16(dd, dpp++, emask); i += 4; } /* copy src1 to dst */ spp1 = (mlib_d64 *)vis_alignaddr(sp1, 2 * i); sd11 = vis_ld_d64_nf(spp1); tmp_ptr = dpp; #pragma pipeloop(0) for (k = i; k <= (amount - 4); k += 4) { sd10 = sd11; sd11 = vis_ld_d64_nf(spp1 + 1); (*tmp_ptr++) = vis_faligndata(sd10, sd11); spp1++; } sd11 = vis_faligndata(sd11, vis_ld_d64_nf(spp1 + 1)); spp2 = (mlib_d64 *)vis_alignaddr(sp2, 2 * i); sd20 = vis_ld_d64_nf(spp2); tmp_ptr = dpp; #pragma pipeloop(0) for (; i <= amount - 4; i 
+= 4) { sd10 = (*tmp_ptr++); sd21 = vis_ld_d64_nf(spp2 + 1); sd20 = vis_faligndata(sd20, sd21); MLIB_V_ADDIMAGE_S16(sd10, sd20, dd); (*dpp++) = dd; sd20 = sd21; spp2++; } if (i < amount) { emask = vis_edge16(dpp, dend); sd20 = vis_faligndata(sd20, vis_ld_d64_nf(spp2 + 1)); MLIB_V_ADDIMAGE_S16(sd11, sd20, dd); vis_pst_16(dd, dpp, emask); } sp1 = sl1 += stride1; sp2 = sl2 += stride2; dp = dl += strided; } } return (MLIB_SUCCESS); }
mlib_status __mlib_VideoInterpAveX_U8_U8( mlib_u8 *curr_block, const mlib_u8 *ref_block, mlib_s32 width, mlib_s32 height, mlib_s32 frame_stride, mlib_s32 field_stride) { mlib_d64 s0, s1, s2, s3, s4, s5, s6, s7; mlib_d64 sd0, sd1, sd2, sd3, d0, d1, d2, d3; mlib_d64 *sd, *dd; mlib_d64 dzero = vis_fzero(); const mlib_f32 fm2 = vis_to_float(0x1000200); mlib_f32 fzero = vis_read_hi(dzero); mlib_d64 rounder = vis_fpsub16(dzero, vis_fone()); mlib_s32 y; rounder = vis_fpadd16(vis_fpadd16(rounder, rounder), rounder); vis_write_gsr((5 << 3) + ((mlib_u32)ref_block & 7)); dd = (mlib_d64 *)curr_block; sd = (mlib_d64 *)((mlib_addr)ref_block & ~7); if (width == 8) { y = height >> 2; if (((mlib_s32)(ref_block + 1) & 7)) { do { s0 = sd[0]; s1 = sd[1]; sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride); sd0 = vis_faligndata(s0, s1); s2 = sd[0]; s3 = sd[1]; sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride); sd1 = vis_faligndata(s2, s3); s4 = sd[0]; s5 = sd[1]; sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride); sd2 = vis_faligndata(s4, s5); s6 = sd[0]; s7 = sd[1]; sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride); sd3 = vis_faligndata(s6, s7); vis_alignaddr((void *)(ref_block + 1), 0); d0 = *dd; d1 = *(mlib_d64 *)((mlib_u8 *)dd + field_stride); d2 = *(mlib_d64 *)((mlib_u8 *)dd + 2 * field_stride); d3 = *(mlib_d64 *)((mlib_u8 *)dd + 3 * field_stride); s0 = vis_faligndata(s0, s1); s1 = vis_faligndata(s2, s3); s2 = vis_faligndata(s4, s5); s3 = vis_faligndata(s6, s7); MLIB_V_VIDEOINTERPAVG(d0, sd0, s0); MLIB_V_VIDEOINTERPAVG(d1, sd1, s1); MLIB_V_VIDEOINTERPAVG(d2, sd2, s2); MLIB_V_VIDEOINTERPAVG(d3, sd3, s3); *dd = d0; dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride); *dd = d1; dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride); *dd = d2; dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride); *dd = d3; dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride); vis_alignaddr((void *)ref_block, 0); } while (--y); } else { do { s0 = sd[0]; s1 = sd[1]; sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride); sd0 = 
vis_faligndata(s0, s1); s2 = sd[0]; s3 = sd[1]; sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride); sd1 = vis_faligndata(s2, s3); s4 = sd[0]; s5 = sd[1]; sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride); sd2 = vis_faligndata(s4, s5); s6 = sd[0]; s7 = sd[1]; sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride); sd3 = vis_faligndata(s6, s7); d0 = *dd; d1 = *(mlib_d64 *)((mlib_u8 *)dd + field_stride); d2 = *(mlib_d64 *)((mlib_u8 *)dd + 2 * field_stride); d3 = *(mlib_d64 *)((mlib_u8 *)dd + 3 * field_stride); MLIB_V_VIDEOINTERPAVG0(d0, sd0, s1); MLIB_V_VIDEOINTERPAVG(d1, sd1, s3); MLIB_V_VIDEOINTERPAVG(d2, sd2, s5); MLIB_V_VIDEOINTERPAVG(d3, sd3, s7); *dd = d0; dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride); *dd = d1; dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride); *dd = d2; dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride); *dd = d3; dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride); } while (--y); } } else {
/*
 * __mlib_VectorConvert_S16_S32_Sat - copy an S32 vector into an S16 vector
 * with saturation: values above MLIB_S16_MAX clamp to MLIB_S16_MAX, values
 * below MLIB_S16_MIN clamp to MLIB_S16_MIN.
 *
 *   z - destination vector (mlib_s16)
 *   x - source vector (mlib_s32)
 *   n - number of elements
 *
 * Short vectors (n < 16) are handled entirely by the scalar PACK_S_S macro
 * (defined elsewhere; it appears to loop, clamp and return - confirm against
 * its definition).  Longer vectors are processed 4 elements at a time with
 * VIS vis_fpackfix(), which narrows two 32-bit lanes to 16 bits with
 * saturation under the GSR scale factor programmed below.
 */
mlib_status
__mlib_VectorConvert_S16_S32_Sat(
	mlib_s16 *z,
	const mlib_s32 *x,
	mlib_s32 n)
{
	mlib_s32 *src = (void *)x;
	mlib_s16 *dst = z;
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d0, d1, d2, d3, d4, d5, d6, d7, d8;
	mlib_s32 c;
	mlib_s32 len_64, even_length, rest_64, length = n, i;

	if (n < 16) {
		PACK_S_S(mlib_s32, mlib_s16, MLIB_S16_MAX, MLIB_S16_MIN);
	}

	/*
	 * First try to align destination address for 8 bytes.
	 * Leading elements are converted with scalar clamping.
	 */
	while ((mlib_addr)dst & 7) {
		(*dst++) =
		    (c = *src) > MLIB_S16_MAX ? MLIB_S16_MAX :
		    (c < MLIB_S16_MIN ? MLIB_S16_MIN : c);
		src++;
		length--;
	}

	/*
	 * GSR scale factor = 16 for vis_fpackfix(); presumably chosen so the
	 * pack is a pure saturating 32->16 narrow - confirm against the VIS
	 * fpackfix specification.
	 */
	vis_write_gsr(16 << 3);
	rest_64 = length & 3;		/* scalar tail (0..3 elements) */
	len_64 = length >> 2;		/* number of 4-element groups */
	even_length = len_64 << 2;
	ddst = (mlib_d64 *)dst;

	if (((mlib_addr)src & 7) == 0) {

		/*
		 * Source address is also 8-byte aligned.
		 */
		dsrc = (mlib_d64 *)src;

		/*
		 * Peeling the 1st iteration when the group count is odd.
		 */
		if (i = (len_64 & 1)) {
			d1 = (*dsrc++);
			d2 = (*dsrc++);
			(*ddst++) =
			    vis_freg_pair(vis_fpackfix(d1), vis_fpackfix(d2));
		}

		/*
		 * Then loop with step==2.
		 */
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d2 = (*dsrc++);
			d3 = (*dsrc++);
			d4 = (*dsrc++);
			(*ddst++) =
			    vis_freg_pair(vis_fpackfix(d1), vis_fpackfix(d2));
			(*ddst++) =
			    vis_freg_pair(vis_fpackfix(d3), vis_fpackfix(d4));
		}
	} else {

		/*
		 * Source address is arbitrary aligned. Use vis_alignaddr() and
		 * vis_faligndata() functions to realign on the fly.
		 */
		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		d4 = (*dsrc++);

		/*
		 * Peeling of 1 iteration.
		 */
		if (i = (len_64 & 1)) {
			d1 = d4;
			d2 = (*dsrc++);
			/* non-faulting load: may touch past the vector end */
			d4 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d5 = vis_faligndata(d1, d2);
			d6 = vis_faligndata(d2, d4);
			(*ddst++) =
			    vis_freg_pair(vis_fpackfix(d5), vis_fpackfix(d6));
		}

		/*
		 * Then loop with step==2.
		 */
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d0 = d4;
			d1 = (*dsrc++);
			d2 = (*dsrc++);
			d3 = (*dsrc++);
			d4 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d5 = vis_faligndata(d0, d1);
			d6 = vis_faligndata(d1, d2);
			d7 = vis_faligndata(d2, d3);
			d8 = vis_faligndata(d3, d4);
			(*ddst++) =
			    vis_freg_pair(vis_fpackfix(d5), vis_fpackfix(d6));
			(*ddst++) =
			    vis_freg_pair(vis_fpackfix(d7), vis_fpackfix(d8));
		}
	}

	/* Scalar tail: clamp the remaining 0..3 elements. */
	for (i = 0; i < rest_64; i++) {
		c = src[even_length + i];
		dst[even_length + i] =
		    c > MLIB_S16_MAX ? MLIB_S16_MAX :
		    (c < MLIB_S16_MIN ? MLIB_S16_MIN : c);
	}

	return (MLIB_SUCCESS);
}
/* The case of even address of vector x */

/*
 * mlib_VectorDotProd_U8C_al_x - dot product of two complex vectors whose
 * elements are unsigned 8-bit (real,imag) byte pairs, accumulated into
 * double precision.
 *
 *   z - result: z[0] receives the real part, z[1] the imaginary part
 *   x - first vector (2*n bytes)
 *   y - second vector (2*n bytes)
 *   n - number of complex elements
 *
 * The per-element complex multiply and the partial-sum accumulation are
 * performed by the DPROD_U8C / DPROD_U8C0, SUM_U8C and SUM_U8C_TAIL macros
 * (defined elsewhere in this file); they read dx/dy and update the ds_*
 * accumulators, presumably via 16x16 partitioned multiplies - confirm
 * against the macro definitions.  Partial 32-bit sums are drained into
 * sum_r/sum_i every MAX_LOOP iterations to avoid overflow.
 */
static void
mlib_VectorDotProd_U8C_al_x(
	mlib_d64 *z,
	const void *x,
	const void *y,
	mlib_s32 n)
{
	mlib_u8 *pxend, *px = (mlib_u8 *)x, *py = (mlib_u8 *)y;
	mlib_d64 sum_r = 0.0, sum_i = 0.0;
	mlib_d64 *dpx, *dpy, *dpxend;
	mlib_d64 dx, dy, dy0, dy1;
	mlib_d64 dx_r, dy_r, dy_i;
	mlib_d64 d_iih, d_iil, d_irh, d_irl, d_rih, d_ril, d_rrh, d_rrl;
	mlib_d64 d_ih, d_il, d_rh, d_rl;
	mlib_d64 ds_r, ds_i, ds1_r, ds1_i;
	mlib_d64 lb_mask = vis_to_double_dup(0x00FF00FF);
	mlib_d64 edge[2], fzero = vis_fzero();
	mlib_f32 fsum;
	mlib_s32 d_left;
	mlib_s32 emask, off;

	edge[0] = edge[1] = 0;
	/* round px down to an 8-byte boundary; off is the (negative) delta */
	dpx = (mlib_d64 *)((mlib_addr)px & (~7));
	off = (mlib_addr)dpx - (mlib_addr)px;
	dpy = vis_alignaddr((void *)py, off);
	pxend = px + n + n - 1;
	dpxend = (mlib_d64 *)((mlib_addr)pxend & (~7));
	/* mask off the leading bytes that precede px in the first word */
	emask = vis_edge8(px, pxend);
	vis_pst_8(dpx[0], edge, emask);
	dx = edge[0];
	dy = vis_ld_d64_nf(dpy);

	if (((((mlib_addr)px) ^ ((mlib_addr)py)) & 7) == 0) {
		/* x and y share the same 8-byte phase: no y realignment */
		vis_write_bmask(0x781A3C5E, 0);

		while ((mlib_addr)dpx < (mlib_addr)dpxend) {
			d_left = dpxend - dpx;

			/* bound the inner loop so 32-bit partials can't overflow */
			if (d_left > MAX_LOOP)
				d_left = MAX_LOOP;
			ds_i = ds_r = ds1_i = ds1_r = 0.0;
#pragma pipeloop(0)
			for (; d_left > 0; d_left--) {
				DPROD_U8C0;
				SUM_U8C;
				dx = dpx[1];
				dy = dpy[1];
				dpx++;
				dpy++;
			}

			/* drain the four 32-bit partial lanes into the doubles */
			ds_i = vis_fpadd32(ds_i, ds1_i);
			ds_r = vis_fpadd32(ds_r, ds1_r);
			fsum = vis_read_hi(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_hi(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
		}
	} else {
		/* y is at a different phase: realign it with bshuffle */
		mlib_s32 mask = ((mlib_addr)(py + off)) & 7;

		vis_write_bmask(0x11111111 * mask, 0x01234567);
		dy1 = vis_ld_d64_nf(dpy + 1);
		dy = vis_bshuffle(dy, dy1);
		SET_ALIGN_U8C;

		while ((mlib_addr)dpx < (mlib_addr)dpxend) {
			d_left = dpxend - dpx;

			if (d_left > MAX_LOOP)
				d_left = MAX_LOOP;
			ds_i = ds_r = ds1_i = ds1_r = 0.0;
#pragma pipeloop(0)
			for (; d_left > 0; d_left--) {
				DPROD_U8C;
				SUM_U8C;
				dy0 = dy1;
				dy1 = vis_ld_d64_nf(dpy + 2);
				dx = vis_ld_d64_nf(dpx + 1);
				dy = vis_bshuffle(dy0, dy1);
				dpx++;
				dpy++;
			}

			ds_i = vis_fpadd32(ds_i, ds1_i);
			ds_r = vis_fpadd32(ds_r, ds1_r);
			fsum = vis_read_hi(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_r);
			sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_hi(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
			fsum = vis_read_lo(ds_i);
			sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
		}
	}

	/* tail word: mask in only the bytes that belong to the vector */
	if ((mlib_addr)dpx <= (mlib_addr)pxend) {
		emask = vis_edge8(dpx, pxend);
		vis_pst_8(dx, edge + 1, emask);
		dx = edge[1];
		SET_ALIGN_U8C;
		DPROD_U8C;
		SUM_U8C_TAIL;
		fsum = vis_read_hi(ds_r);
		sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_lo(ds_r);
		sum_r += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_hi(ds_i);
		sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
		fsum = vis_read_lo(ds_i);
		sum_i += (mlib_d64)*((mlib_s32 *)&fsum);
	}

	z[0] = sum_r;
	z[1] = sum_i;

#undef MAX_LOOP
}
/*
 * __mlib_VectorConvert_U8_S8_Sat - copy an S8 vector into a U8 vector with
 * saturation: negative values clamp to 0, non-negative values pass through.
 *
 *   z - destination vector (mlib_u8)
 *   x - source vector (mlib_s8)
 *   n - number of elements
 *
 * Short vectors (n < 16) are handled by the scalar PACK_S_U macro (defined
 * elsewhere).  The VIS path widens each signed byte to 16 bits (fpmerge with
 * zero + fmul8sux16 by 0x0100 reproduces the sign), then vis_fpack16_pair
 * repacks to unsigned bytes with saturation under the GSR scale set below.
 */
mlib_status
__mlib_VectorConvert_U8_S8_Sat(
	mlib_u8 *z,
	const mlib_s8 *x,
	mlib_s32 n)
{
	mlib_s8 *src = (void *)x;
	mlib_u8 *dst = z;
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, d5, d6;
	mlib_s32 len_64, even_length, rest_64, length = n, i, off;
	mlib_s8 c;
	mlib_d64 four_16_ones = vis_to_double_dup(0x01000100);
	mlib_f32 zero = vis_fzeros();

	if (length < 16) {
		PACK_S_U(mlib_s8, mlib_u8);
	}

	/*
	 * First, try to align destination address for 8 bytes .
	 */
	while ((mlib_addr)dst & 7) {
		(*dst++) = (c = (*src++)) < 0 ? 0 : c;
		length--;
	}

	rest_64 = length & 7;		/* scalar tail (0..7 elements) */
	len_64 = length >> 3;		/* number of 8-byte groups */
	even_length = len_64 << 3;
	ddst = (mlib_d64 *)dst;
	/* GSR scale = 7 for vis_fpack16 (see VIS fpack16 semantics) */
	vis_write_gsr(7 << 3);

	/*
	 * Now analyze source address alignment.
	 */
	if (((mlib_addr)src & 7) == 0) {

		/*
		 * Source address is also 8-byte aligned.
		 */
		dsrc = (mlib_d64 *)src;

		/*
		 * Peeling the 1st iteration.
		 */
		if (i = (len_64 & 1)) {
			d1 = (*dsrc++);
			d2 = vis_fmul8sux16(vis_fpmerge(vis_read_hi(d1),
			    zero), four_16_ones);
			d3 = vis_fmul8sux16(vis_fpmerge(vis_read_lo(d1),
			    zero), four_16_ones);
			(*ddst++) = vis_fpack16_pair(d2, d3);
		}

		/*
		 * Then loop with step==2. Unroll for 2 iterations.
		 */
#pragma pipeloop(0)
#pragma unroll(4)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d2 = vis_fmul8sux16(vis_fpmerge(vis_read_hi(d1),
			    zero), four_16_ones);
			d3 = vis_fmul8sux16(vis_fpmerge(vis_read_lo(d1),
			    zero), four_16_ones);
			(*ddst++) = vis_fpack16_pair(d2, d3);
			d1 = (*dsrc++);
			d2 = vis_fmul8sux16(vis_fpmerge(vis_read_hi(d1),
			    zero), four_16_ones);
			d3 = vis_fmul8sux16(vis_fpmerge(vis_read_lo(d1),
			    zero), four_16_ones);
			(*ddst++) = vis_fpack16_pair(d2, d3);
		}
	} else {

		/*
		 * Source address has arbitrary alignment. Use vis_alignaddr()
		 * and vis_faligndata() functions.  bshuffle (programmed from
		 * the byte offset) widens realigned bytes to 16-bit lanes;
		 * faligndata with the 1-byte GSR offset set here produces the
		 * odd-lane view of the same data.
		 */
		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		off = (mlib_addr)src & 7;
		vis_alignaddr((void *)0, 1);
		vis_write_bmask(0x11111111 * off, 0x04152637);
		d2 = (*dsrc++);

		/*
		 * Peeling of 1 iteration.
		 */
		if (i = (len_64 & 1)) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d3 = vis_bshuffle(d1, d2);
			d4 = vis_fmul8sux16(d3, four_16_ones);
			d3 = vis_faligndata(d3, d3);
			d5 = vis_fmul8sux16(d3, four_16_ones);
			(*ddst++) = vis_fpack16_pair(d4, d5);
		}

		/*
		 * Then loop with step==2.
		 */
#pragma pipeloop(0)
#pragma unroll(4)
		for (i; i < len_64; i += 2) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d3 = vis_bshuffle(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d6 = vis_bshuffle(d1, d2);
			d4 = vis_fmul8sux16(d3, four_16_ones);
			d3 = vis_faligndata(d3, d3);
			d5 = vis_fmul8sux16(d3, four_16_ones);
			(*ddst++) = vis_fpack16_pair(d4, d5);
			d4 = vis_fmul8sux16(d6, four_16_ones);
			d6 = vis_faligndata(d6, d6);
			d5 = vis_fmul8sux16(d6, four_16_ones);
			(*ddst++) = vis_fpack16_pair(d4, d5);
		}
	}

	/* Scalar tail: clamp the remaining 0..7 elements. */
	for (i = 0; i < rest_64; i++)
		dst[even_length + i] =
		    (c = src[even_length + i]) < 0 ? 0 : c;

	return (MLIB_SUCCESS);
}
/*
 * mlib_v_ImageLookUp_S16_U16_124_D1 - table lookup from S16 source pixels
 * to U16 destination pixels for 1-, 2- or 4-channel images (one table per
 * channel position in a group of 4; for fewer channels the caller
 * presumably passes repeated tables - confirm against the callers).
 *
 *   src           - source row (S16 indices)
 *   dst           - destination row (U16 values)
 *   xsize         - number of elements in the row
 *   table0..3     - lookup tables for positions 0..3 of each 4-group
 *
 * Four results at a time are shifted into the 64-bit accumulator acc0 with
 * vis_faligndata under a GSR offset of 6 (each step injects one 16-bit
 * value at the top), then stored as one aligned 8-byte word.  acc0 starts
 * uninitialized; its initial garbage is fully shifted out after four
 * faligndata steps, and the tail store is masked by vis_pst_16.
 * The table offsets are 2 * index because the tables hold 16-bit entries.
 */
void
mlib_v_ImageLookUp_S16_U16_124_D1(
	const mlib_s16 *src,
	mlib_u16 *dst,
	mlib_s32 xsize,
	const mlib_u16 *table0,
	const mlib_u16 *table1,
	const mlib_u16 *table2,
	const mlib_u16 *table3)
{
	mlib_s16 *sp;		/* pointer to source data */
	mlib_s32 s0, s1, s2, s3;	/* source data */
	mlib_u16 *dl;		/* pointer to start of destination */
	mlib_u16 *dend;		/* pointer to end of destination */
	mlib_d64 *dp;		/* aligned pointer to destination */
	mlib_d64 t0, t1, t2;	/* destination data */
	mlib_d64 t3, acc0;	/* destination data */
	mlib_s32 emask;		/* edge mask */
	mlib_s32 i, num;	/* loop variable */

	dl = dst;
	sp = (void *)src;
	dp = (mlib_d64 *) dl;
	dend = dl + xsize - 1;

	/* GSR offset 6: faligndata shifts one u16 into acc0 per call */
	vis_alignaddr((void *)0, 6);

	i = 0;

	if (xsize >= 4) {

		s0 = sp[0];
		s1 = sp[1];
		s2 = sp[2];
		s3 = sp[3];
		sp += 4;

#pragma pipeloop(0)
		for (i = 0; i <= xsize - 8; i += 4, sp += 4) {
			/* gather in reverse so t0 ends up in the top lane */
			t3 = VIS_LD_U16_I(table3, 2 * s3);
			t2 = VIS_LD_U16_I(table2, 2 * s2);
			t1 = VIS_LD_U16_I(table1, 2 * s1);
			t0 = VIS_LD_U16_I(table0, 2 * s0);
			acc0 = vis_faligndata(t3, acc0);
			acc0 = vis_faligndata(t2, acc0);
			acc0 = vis_faligndata(t1, acc0);
			acc0 = vis_faligndata(t0, acc0);
			s0 = sp[0];
			s1 = sp[1];
			s2 = sp[2];
			s3 = sp[3];
			*dp++ = acc0;
		}

		/* flush the last preloaded group of 4 */
		t3 = VIS_LD_U16_I(table3, 2 * s3);
		t2 = VIS_LD_U16_I(table2, 2 * s2);
		t1 = VIS_LD_U16_I(table1, 2 * s1);
		t0 = VIS_LD_U16_I(table0, 2 * s0);
		acc0 = vis_faligndata(t3, acc0);
		acc0 = vis_faligndata(t2, acc0);
		acc0 = vis_faligndata(t1, acc0);
		acc0 = vis_faligndata(t0, acc0);
		*dp++ = acc0;
	}

	/* tail: 1..3 remaining pixels, processed back-to-front */
	if ((mlib_addr) dp <= (mlib_addr) dend) {

		num = (mlib_u16 *) dend - (mlib_u16 *) dp;
		sp += num;
		num++;

		if (num == 1) {
			s0 = (mlib_s32) * sp;
			sp--;
			t0 = VIS_LD_U16_I(table0, 2 * s0);
			acc0 = vis_faligndata(t0, acc0);
		} else if (num == 2) {
			s0 = (mlib_s32) * sp;
			sp--;
			t0 = VIS_LD_U16_I(table1, 2 * s0);
			acc0 = vis_faligndata(t0, acc0);
			s0 = (mlib_s32) * sp;
			sp--;
			t0 = VIS_LD_U16_I(table0, 2 * s0);
			acc0 = vis_faligndata(t0, acc0);
		} else if (num == 3) {
			s0 = (mlib_s32) * sp;
			sp--;
			t0 = VIS_LD_U16_I(table2, 2 * s0);
			acc0 = vis_faligndata(t0, acc0);
			s0 = (mlib_s32) * sp;
			sp--;
			t0 = VIS_LD_U16_I(table1, 2 * s0);
			acc0 = vis_faligndata(t0, acc0);
			s0 = (mlib_s32) * sp;
			sp--;
			t0 = VIS_LD_U16_I(table0, 2 * s0);
			acc0 = vis_faligndata(t0, acc0);
		}

		/* partial store of only the valid tail lanes */
		emask = vis_edge16(dp, dend);
		vis_pst_16(acc0, dp, emask);
	}
}
/*
 * __mlib_VectorConvert_S8_S16_Sat - copy an S16 vector into an S8 vector
 * with saturation: values above MLIB_S8_MAX clamp to MLIB_S8_MAX, values
 * below MLIB_S8_MIN clamp to MLIB_S8_MIN.
 *
 *   z - destination vector (mlib_s8)
 *   x - source vector (mlib_s16)
 *   n - number of elements
 *
 * Short vectors (n < 16) use the scalar PACK_S_S macro (defined elsewhere).
 * The VIS path narrows via vis_fpackfix_pair plus lane shuffling; the
 * single vis_write_gsr64 below programs the bmask (0x082A4C6E), the
 * alignaddr offset (2) and the scale factor (8) in one shot.
 */
mlib_status
__mlib_VectorConvert_S8_S16_Sat(
	mlib_s8 *z,
	const mlib_s16 *x,
	mlib_s32 n)
{
	mlib_s16 *src = (void *)x;
	mlib_s8 *dst = z;
	mlib_d64 *dsrc, *ddst;
	mlib_d64 d1, d2, d3, d4, d5, d6, d7;
	mlib_s32 len_64, even_length, rest_64, length = n, i;
	mlib_s16 c;

	if (n < 16) {
		PACK_S_S(mlib_s16, mlib_s8, MLIB_S8_MAX, MLIB_S8_MIN);
	}

	/*
	 * First try to align destination address for 8 bytes .
	 * NOTE(review): the pointer is truncated to mlib_s32 here; the low 3
	 * bits survive so the alignment test works, but (mlib_addr) is used
	 * for the same test elsewhere in this file - confirm intentional.
	 */
	while ((mlib_s32)dst & 7) {
		(*dst++) =
		    (c = (*src++)) < MLIB_S8_MIN ? MLIB_S8_MIN :
		    (c > MLIB_S8_MAX ? MLIB_S8_MAX : c);
		length--;
	}

	rest_64 = length & 7;		/* scalar tail (0..7 elements) */
	len_64 = length >> 3;		/* number of 8-element groups */
	even_length = len_64 << 3;
	ddst = (mlib_d64 *)dst;
	/* bmask 0x082A4C6E | alignaddr offset 2 | fpackfix scale 8 */
	vis_write_gsr64(((mlib_u64)0x082A4C6E << 32) | (8 << 3) | 2);

	/*
	 * Now analyze source address alignment.
	 */
	if (((mlib_addr)src & 7) == 0) {
		dsrc = (mlib_d64 *)src;

		/* Peeling the 1st iteration when the group count is odd. */
		if (i = (len_64 & 1)) {
			d1 = (*dsrc++);
			d2 = (*dsrc++);
			d3 = vis_fpackfix_pair(d1, d2);
			d1 = vis_faligndata(d1, d1);
			d2 = vis_faligndata(d2, d2);
			d4 = vis_fpackfix_pair(d1, d2);
			(*ddst++) = vis_bshuffle(d3, d4);
		}

#pragma pipeloop(0)
#pragma unroll(2)
		for (; i < len_64; i += 2) {
			d1 = (*dsrc++);
			d2 = (*dsrc++);
			d3 = vis_fpackfix_pair(d1, d2);
			d1 = vis_faligndata(d1, d1);
			d2 = vis_faligndata(d2, d2);
			d4 = vis_fpackfix_pair(d1, d2);
			(*ddst++) = vis_bshuffle(d3, d4);
			d1 = (*dsrc++);
			d2 = (*dsrc++);
			d3 = vis_fpackfix_pair(d1, d2);
			d1 = vis_faligndata(d1, d1);
			d2 = vis_faligndata(d2, d2);
			d4 = vis_fpackfix_pair(d1, d2);
			(*ddst++) = vis_bshuffle(d3, d4);
		}
	} else {

		/*
		 * Source address is arbitrary aligned. Use vis_alignaddr() and
		 * vis_faligndata() functions.
		 */
		dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
		d2 = (*dsrc++);

		/*
		 * Peeling of 1 iteration.
		 */
		if (i = (len_64 & 1)) {
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d4 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d5 = vis_faligndata(d1, d2);
			d3 = vis_fpackfix_pair(d4, d5);
			/* rotate the odd lanes into pack position */
			d4 = vis_fpack32(d4, d4);
			d4 = vis_fpack32(d4, d4);
			d5 = vis_fpmerge(vis_read_hi(d5), vis_read_lo(d5));
			d5 = vis_fpmerge(vis_read_lo(d5), vis_read_hi(d5));
			d5 = vis_fpmerge(vis_read_hi(d5), vis_read_lo(d5));
			d4 = vis_fpackfix_pair(d4, d5);
			(*ddst++) = vis_bshuffle(d3, d4);
		}

		/*
		 * Then loop with step==2.
		 */
#pragma pipeloop(0)
#pragma unroll(2)
		for (i; i < len_64; i += 2) {
			d1 = d2;
			d2 = (*dsrc++);
			d4 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = (*dsrc++);
			d5 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = (*dsrc++);
			d6 = vis_faligndata(d1, d2);
			d1 = d2;
			d2 = vis_ld_d64_nf(dsrc);
			dsrc++;
			d7 = vis_faligndata(d1, d2);
			d3 = vis_fpackfix_pair(d4, d5);
			d4 = vis_fpack32(d4, d4);
			d4 = vis_fpack32(d4, d4);
			d5 = vis_fpmerge(vis_read_hi(d5), vis_read_lo(d5));
			d5 = vis_fpmerge(vis_read_lo(d5), vis_read_hi(d5));
			d5 = vis_fpmerge(vis_read_hi(d5), vis_read_lo(d5));
			d4 = vis_fpackfix_pair(d4, d5);
			d5 = vis_fpackfix_pair(d6, d7);
			d6 = vis_fpack32(d6, d6);
			d6 = vis_fpack32(d6, d6);
			d7 = vis_fpmerge(vis_read_hi(d7), vis_read_lo(d7));
			d7 = vis_fpmerge(vis_read_lo(d7), vis_read_hi(d7));
			d7 = vis_fpmerge(vis_read_hi(d7), vis_read_lo(d7));
			d6 = vis_fpackfix_pair(d6, d7);
			(*ddst++) = vis_bshuffle(d3, d4);
			(*ddst++) = vis_bshuffle(d5, d6);
		}
	}

	/* Scalar tail: clamp the remaining 0..7 elements. */
	for (i = 0; i < rest_64; i++) {
		c = src[even_length + i];
		dst[even_length + i] =
		    c < MLIB_S8_MIN ? MLIB_S8_MIN :
		    (c > MLIB_S8_MAX ? MLIB_S8_MAX : c);
	}

	return (MLIB_SUCCESS);
}
void ADD_SUFF(ThreeByteBgrToIntArgbScaleConvert)(SCALE_PARAMS) { mlib_s32 dstScan = pDstInfo->scanStride; mlib_s32 srcScan = pSrcInfo->scanStride; mlib_d64 dd, maskFF; mlib_s32 i, i0, i1, j; if (width < 16) { for (j = 0; j < height; j++) { mlib_u8 *src = srcBase; mlib_s32 *dst = dstBase; mlib_s32 *dst_end = dst + width; mlib_s32 tmpsxloc = sxloc; PTR_ADD(src, (syloc >> shift) * srcScan); for (; dst < dst_end; dst++) { i = tmpsxloc >> shift; tmpsxloc += sxinc; *(mlib_s32*)dst = GBR_PIXEL(i); } PTR_ADD(dstBase, dstScan); syloc += syinc; } return; } maskFF = vis_fone(); vis_alignaddr(NULL, 7); for (j = 0; j < height; j++) { mlib_u8 *src = srcBase; mlib_f32 *dst = dstBase; mlib_f32 *dst_end = dst + width; mlib_s32 tmpsxloc = sxloc; PTR_ADD(src, (syloc >> shift) * srcScan); if ((mlib_s32)dst & 7) { i = tmpsxloc >> shift; tmpsxloc += sxinc; *(mlib_s32*)dst = GBR_PIXEL(i); dst++; } #pragma pipeloop(0) for (; dst <= dst_end - 2; dst += 2) { i0 = tmpsxloc >> shift; i1 = (tmpsxloc + sxinc) >> shift; tmpsxloc += 2*sxinc; dd = vis_faligndata(vis_ld_u8(src + 3*i1 ), dd); dd = vis_faligndata(vis_ld_u8(src + 3*i1 + 1), dd); dd = vis_faligndata(vis_ld_u8(src + 3*i1 + 2), dd); dd = vis_faligndata(maskFF, dd); dd = vis_faligndata(vis_ld_u8(src + 3*i0 ), dd); dd = vis_faligndata(vis_ld_u8(src + 3*i0 + 1), dd); dd = vis_faligndata(vis_ld_u8(src + 3*i0 + 2), dd); dd = vis_faligndata(maskFF, dd); *(mlib_d64*)dst = dd; } for (; dst < dst_end; dst++) { i = tmpsxloc >> shift; tmpsxloc += sxinc; *(mlib_s32*)dst = GBR_PIXEL(i); } PTR_ADD(dstBase, dstScan); syloc += syinc; }
/*
 * __mlib_VideoDCT8x8Quantize_S16_S16_B12_NA - forward 8x8 DCT plus
 * quantization for a 12-bit S16 block, for Not-Aligned (NA) input/output
 * pointers.  If both pointers happen to be 8-byte aligned it delegates to
 * the aligned variant.
 *
 *   coeffs - output: 64 quantized DCT coefficients
 *   block  - input: 64 S16 samples
 *   qtable - quantization table, packed as 16 mlib_d64 words
 *
 * Returns MLIB_FAILURE on NULL pointers, MLIB_SUCCESS otherwise.
 *
 * The DCT itself lives entirely in the macros LOAD_DATA_GE_INTER*,
 * TRANSPOSE, LOADCONSTS4_12, PREPARE_DATA_INTER, COMPUTING_DATA(_12),
 * ENDSCALE_12 and Quant_ST_NA (defined elsewhere in this file).  They
 * read/write the many d??/t??/r?? locals and the k-constants declared
 * below, which is why those variables look unused here.  The final stores
 * use faligndata plus edge-masked vis_pst_8 to write the unaligned output.
 */
mlib_status
__mlib_VideoDCT8x8Quantize_S16_S16_B12_NA(
	mlib_s16 coeffs[64],
	const mlib_s16 *block,
	const mlib_d64 qtable[64])
{
	/* sp/dp and the d/t/r registers are consumed by the macros above */
	mlib_d64 *sp = (mlib_d64 *)block;
	mlib_d64 *dp = (mlib_d64 *)coeffs;
	mlib_d64 d00, d10, d20, d30, d40, d50, d60, d70;
	mlib_d64 d01, d11, d21, d31, d41, d51, d61, d71;
	mlib_d64 t00, t10, t20, t30, t40, t50, t60, t70, t80, t90;
	mlib_d64 t01, t11, t21, t31, t41, t51, t61, t71, t81, t91;
	mlib_d64 r00, r10, r20, r30, r40, r50, r60, r70;
	mlib_d64 r01, r11, r21, r31, r41, r51, r61, r71;
	mlib_f32 FCOS, c17, c26, c35, c_4;
	mlib_s32 mask;
	mlib_d64 w_const = vis_to_double_dup(0x4000);

	if (block == NULL || coeffs == NULL)
		return (MLIB_FAILURE);

	/* fully aligned case: use the faster aligned implementation */
	if (!(((mlib_addr)block | (mlib_addr)coeffs) & 7)) {
		return (__mlib_VideoDCT8x8Quantize_S16_S16_B12(coeffs,
		    block, qtable));
	}

	vis_write_gsr(1 << 3);

	/*
	 * first stage
	 */
	LOAD_DATA_GE_INTER1;
	TRANSPOSE(d00, d20, d40, d60, r00, r10, r20, r30);
	TRANSPOSE(d10, d30, d50, d70, r40, r50, r60, r70);
	LOADCONSTS4_12;
	PREPARE_DATA_INTER(0);
	LOAD_DATA_GE_INTER2;
	TRANSPOSE(d01, d21, d41, d61, r01, r11, r21, r31);
	COMPUTING_DATA(0);
	TRANSPOSE(d11, d31, d51, d71, r41, r51, r61, r71);
	PREPARE_DATA_INTER(1);
	COMPUTING_DATA(1);

	/*
	 * second stage
	 */
	TRANSPOSE(d01, d11, d21, d31, r40, r50, r60, r70);
	TRANSPOSE(d00, d10, d20, d30, r00, r10, r20, r30);
	PREPARE_DATA_INTER(0);
	TRANSPOSE(d40, d50, d60, d70, r01, r11, r21, r31);
	COMPUTING_DATA_12(0);
	TRANSPOSE(d41, d51, d61, d71, r41, r51, r61, r71);
	ENDSCALE_12(0);

	/* set up the unaligned store: dp is rounded down, mask selects the
	 * leading bytes of coeffs inside dp[0] */
	dp = (mlib_d64 *)vis_alignaddr(coeffs, -1);
	mask = 0xFF >> ((mlib_addr)coeffs - (mlib_addr)dp);
	vis_alignaddrl((void *)coeffs, 0);

	PREPARE_DATA_INTER(1);
	COMPUTING_DATA_12(1);
	ENDSCALE_12(1);

	/* quantize all 16 register pairs against the packed qtable */
	Quant_ST_NA(d00, d00, qtable[0]);
	Quant_ST_NA(d01, d01, qtable[1]);
	Quant_ST_NA(d10, d10, qtable[2]);
	Quant_ST_NA(d11, d11, qtable[3]);
	Quant_ST_NA(d20, d20, qtable[4]);
	Quant_ST_NA(d21, d21, qtable[5]);
	Quant_ST_NA(d30, d30, qtable[6]);
	Quant_ST_NA(d31, d31, qtable[7]);
	Quant_ST_NA(d40, d40, qtable[8]);
	Quant_ST_NA(d41, d41, qtable[9]);
	Quant_ST_NA(d50, d50, qtable[10]);
	Quant_ST_NA(d51, d51, qtable[11]);
	Quant_ST_NA(d60, d60, qtable[12]);
	Quant_ST_NA(d61, d61, qtable[13]);
	Quant_ST_NA(d70, d70, qtable[14]);
	Quant_ST_NA(d71, d71, qtable[15]);

	/* misaligned store: each word pair is merged with faligndata */
	dp[1] = vis_faligndata(d00, d01);
	dp[2] = vis_faligndata(d01, d10);
	dp[3] = vis_faligndata(d10, d11);
	dp[4] = vis_faligndata(d11, d20);
	dp[5] = vis_faligndata(d20, d21);
	dp[6] = vis_faligndata(d21, d30);
	dp[7] = vis_faligndata(d30, d31);
	dp[8] = vis_faligndata(d31, d40);
	dp[9] = vis_faligndata(d40, d41);
	dp[10] = vis_faligndata(d41, d50);
	dp[11] = vis_faligndata(d50, d51);
	dp[12] = vis_faligndata(d51, d60);
	dp[13] = vis_faligndata(d60, d61);
	dp[14] = vis_faligndata(d61, d70);
	dp[15] = vis_faligndata(d70, d71);
	/* trailing partial word */
	vis_pst_8(vis_faligndata(d71, d71), dp + 16, ~mask);

	/* leading partial word (only when coeffs is actually misaligned) */
	if ((mlib_addr)coeffs & 7)
		vis_pst_8(vis_faligndata(d00, d00), dp, mask);

	return (MLIB_SUCCESS);
}
/*
 * ThreeByteBgrToIntArgbConvert - blit a 3-byte-per-pixel BGR surface to a
 * 32-bit ARGB surface.  Parameters come from the BLIT_PARAMS macro
 * (srcBase/dstBase, width/height, pSrcInfo/pDstInfo - defined elsewhere).
 *
 * Narrow images (width < 16) use the scalar GBR_PIXEL macro per pixel.
 * Wide images process 8 pixels per iteration: three aligned 8-byte loads
 * are realigned with faligndata and the BGR_TO_ARGB macro expands them
 * into four mlib_d64 outputs (dd0..dd3) with the alpha byte taken from
 * s_0 = vis_fone() (0xFF).
 */
void ADD_SUFF(ThreeByteBgrToIntArgbConvert)(BLIT_PARAMS)
{
	mlib_s32 dstScan = pDstInfo->scanStride;
	mlib_s32 srcScan = pSrcInfo->scanStride;
	mlib_d64 *sp;
	mlib_d64 s_0;
	mlib_d64 s0, s1, s2, s3, sd0, sd1, sd2, dd0, dd1, dd2, dd3;
	mlib_s32 i, i0, j;

	if (width < 16) {
		/* scalar fallback for narrow blits */
		for (j = 0; j < height; j++) {
			mlib_u8 *src = srcBase;
			mlib_s32 *dst = dstBase;

			for (i = 0; i < width; i++) {
				dst[i] = GBR_PIXEL(i);
			}

			PTR_ADD(dstBase, dstScan);
			PTR_ADD(srcBase, srcScan);
		}
		return;
	}

	/* contiguous rows: collapse to a single long row */
	if (srcScan == 3*width && dstScan == 4*width) {
		width *= height;
		height = 1;
	}

	s_0 = vis_fone();	/* 0xFF alpha lanes, consumed by BGR_TO_ARGB */

	for (j = 0; j < height; j++) {
		mlib_u8 *src = srcBase;
		mlib_f32 *dst = dstBase;

		i = i0 = 0;

		/* one scalar pixel to reach 8-byte dst alignment.
		 * NOTE(review): pointer truncated to mlib_s32 for the test;
		 * low bits survive, but mlib_addr is used elsewhere. */
		if ((mlib_s32)dst & 7) {
			((mlib_s32*)dst)[i] = GBR_PIXEL(i);
			i0 = 1;
		}

		sp = vis_alignaddr(src, 3*i0);
		s3 = *sp++;

#pragma pipeloop(0)
		for (i = i0; i <= (mlib_s32)width - 8; i += 8) {
			/* 24 source bytes -> 8 ARGB pixels */
			s0 = s3;
			s1 = *sp++;
			s2 = *sp++;
			s3 = *sp++;
			sd0 = vis_faligndata(s0, s1);
			sd1 = vis_faligndata(s1, s2);
			sd2 = vis_faligndata(s2, s3);
			BGR_TO_ARGB
			*(mlib_d64*)(dst + i    ) = dd0;
			*(mlib_d64*)(dst + i + 2) = dd1;
			*(mlib_d64*)(dst + i + 4) = dd2;
			*(mlib_d64*)(dst + i + 6) = dd3;
		}

		/* scalar tail pixels */
		for (; i < width; i++) {
			((mlib_s32*)dst)[i] = GBR_PIXEL(i);
		}

		PTR_ADD(dstBase, dstScan);
		PTR_ADD(srcBase, srcScan);
	}
}
/*
 * __mlib_VectorSubS_S16C_S16C_Sat - complex S16 vector/scalar operation
 * with saturation: for each complex element of x, combine it with the
 * complex scalar c and store the clamped result in z.  The actual
 * arithmetic is inside the SUBS16_SAT macro (defined elsewhere), which
 * reads dx/dc and the mask/const locals below and stores through dpz -
 * confirm the exact formula against the macro definition.
 *
 *   z - destination complex vector
 *   x - source complex vector
 *   c - complex scalar (c[0] = real, c[1] = imag)
 *   n - number of complex elements
 *
 * Returns MLIB_FAILURE for n <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_VectorSubS_S16C_S16C_Sat(
	mlib_s16 *z,
	const mlib_s16 *x,
	const mlib_s16 *c,
	mlib_s32 n)
{
	mlib_d64 *dpz, *dpx;
	mlib_d64 dx, dz, dx0, dx1, dr0, dr1, dr2;
	mlib_s16 *pz, *px, *pzend;

/* offset of address alignment in destination */
	mlib_s32 off;

/* edge masks */
	mlib_s32 emask;
	mlib_s32 mask1, mask2;
	mlib_s32 ovl, und;
	mlib_u16 uc0 = *((mlib_s16 *)c);
	mlib_u16 uc1 = *((mlib_s16 *)c + 1);

	/*
	 * Replicate the complex scalar into a 64-bit word; the lane order is
	 * swapped when z starts on an odd complex boundary ((mlib_addr)z & 2)
	 * so real/imag lanes still line up after alignment.
	 */
	mlib_d64 dc = ((mlib_addr)z & 2) ?
	    vis_to_double_dup((uc1 << 16) | uc0) :
	    vis_to_double_dup((uc0 << 16) | uc1);
	mlib_d64 fzero = vis_fzero();
	mlib_d64 const_ovl = vis_to_double_dup(0x7fff7fff);
	mlib_d64 const_und = vis_fnot(const_ovl);

/* length in 16-bit units (2 per complex element) */
	mlib_s32 len = n + n, i;

/* rest and leng in terms of 8 bytes. */
	mlib_s32 rest_8, even_8;

	if (n <= 0)
		return (MLIB_FAILURE);

	px = (mlib_s16 *)x;
	pz = (mlib_s16 *)z;

/*
 * prepare the destination address
 */
	dpz = (mlib_d64 *)((mlib_addr)z & (~7));
	off = (mlib_addr)dpz - (mlib_addr)z;
	pzend = pz + n + n - 1;

/*
 * generate edge mask for the start point
 */
	emask = vis_edge16(pz, pzend);

/*
 * prepare the destination address
 */
	if (off) {
		/* unaligned head: one masked store via SUBS16_SAT */
		dpx = (mlib_d64 *)vis_alignaddr(px, off);
		dx0 = vis_ld_d64_nf(dpx);
		dx1 = vis_ld_d64_nf(dpx + 1);
		dx = vis_faligndata(dx0, dx1);
		SUBS16_SAT;
		px += (8 + off) >> 1;
		len -= (8 + off) >> 1;
		dpz++;
	}

	if (len <= 0)
		return (MLIB_SUCCESS);

	even_8 = len >> 2;
	rest_8 = len & 0x3;
	emask = 0xf;

/*
 * Now try to analyze source "x" and "y" addresses.
 */
	if (!((mlib_addr)px & 7)) {
		/* aligned source: direct 8-byte loads */
		dpx = (mlib_d64 *)px;
#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dx = (*dpx++);
			SUBS16_SAT;
			dpz++;
		}

		dx1 = vis_ld_d64_nf(dpx);
		dpx++;
	} else {
		/* misaligned source: realign with faligndata */
		dpx = vis_alignaddr(px, 0);
		dx0 = vis_ld_d64_nf(dpx);
		dpx++;
#pragma pipeloop(0)
		for (i = 0; i < even_8; i++) {
			dx1 = vis_ld_d64_nf(dpx);
			dpx++;
			dx = vis_faligndata(dx0, dx1);
			SUBS16_SAT;
			dx0 = dx1;
			dpz++;
		}

		dx1 = dx0;
	}

	if (!rest_8)
		return (MLIB_SUCCESS);

/*
 * prepare edge mask for the last bytes
 */
	emask = ~(vis_edge16((void *)(rest_8 << 1), pzend));

	vis_alignaddr(px, 0);
	dx0 = dx1;
	dx1 = vis_ld_d64_nf(dpx);
	dx = vis_faligndata(dx0, dx1);
	SUBS16_SAT;
	return (MLIB_SUCCESS);
}
/*
 * mlib_v_conv5x5_8nw_4 - 5x5 convolution on a 4-channel 8-bit image,
 * "nw" (no wrap / interior-only) variant: the 2-pixel border is not
 * written (d_a starts at adr_dst + 2 * dlb + 8).
 *
 *   dst          - destination image
 *   src          - source image
 *   kernel       - 25 S32 kernel coefficients
 *   scalef_expon - scale exponent; GSR scale = 31 - scalef_expon
 *
 * Five source rows are staged in intermediate buffers (sbuf1..sbuf5,
 * rotated per output row by LOOP_INI).  Each output row is produced in two
 * passes over an intermediate mlib_d64 accumulator row (dbuf): pass one
 * accumulates kernel rows 1-3 (coefficients k1k2..k15k16), pass two adds
 * kernel rows 4-5 (k15k16..k25) and packs with vis_fpack16_pair.  All of
 * the buffer setup, coefficient loading and copy-out lives in the macros
 * (GET_SRC_DST_PARAMETERS, LOAD_KERNEL_INTO_FLOAT, PREPARE_INTERM_BUFFERS,
 * LOOP_INI, CONV_AU/CONV_AL, COPY_INTERM_BUF_TO_DST, ...) defined
 * elsewhere; they bind most of the locals declared below (e.g. LOOP_INI
 * presumably rotates s1..s5/sbuf* and resets ddst - confirm against the
 * macro definitions).
 */
mlib_status
mlib_v_conv5x5_8nw_4(
	mlib_image *dst,
	const mlib_image *src,
	const mlib_s32 *kernel,
	mlib_s32 scalef_expon)
{
/* pointers to dst row */
	mlib_u8 *da, *d_a;

/* pointers to src, dst data */
	mlib_u8 *adr_dst, *adr_src, *dend;

/* pointers to src rows */
	mlib_u8 *sa, *sa1, *sa2, *sa3, *sa4;

/* pointers to rows in interm. src buf */
	mlib_d64 *buff_src, *sbuf1, *sbuf2, *prow;

/* pointers to rows in interm. src buf */
	mlib_d64 *sbuf3, *sbuf4, *sbuf5;

/* pointer to row in interm. dst buf */
	mlib_d64 *dbuf, *dbuf1;

/* mlib_d64 pointers to rows in interm. src buf */
	mlib_d64 *s1, *s2, *s3, *s4, *s5;

/* mlib_d64 pointer to row in interm. dst buf */
	mlib_d64 *ddst;

/* data */
	mlib_d64 d1, d2, d3, d4, d5;

/* data */
	mlib_d64 d11, d12, d13, d14, d15;

/* data */
	mlib_d64 d21, d22, d23, d24, d25;

/* data */
	mlib_d64 dt_1, dt_2, dt_3, dt_4, dt_5;

/* packed kernel coefficient pairs, loaded by LOAD_KERNEL_INTO_FLOAT */
	mlib_f32 k1k2, k3k4, k5k6, k7k8;
	mlib_f32 k9k10, k11k12, k13k14, k15k16;
	mlib_f32 k17k18, k19k20, k21k22, k23k24, k25;

/* src, dst and interm. buf. strides */
	mlib_s32 dlb, slb, buf_slb;
	mlib_s32 dh, dw;
	mlib_d64 out0, out1;
	mlib_d64 tmp0, tmp1, rnd;
	mlib_d64 *dsa, *dp;
	mlib_d64 sd0, sd1;
	mlib_s32 emask;
	mlib_s32 rval, gsr_scale, i, j;

	gsr_scale = 31 - scalef_expon;
	vis_write_gsr((gsr_scale << 3));
	/* rounding constant matched to the pack scale */
	rval = mlib_round_8[gsr_scale];
	rnd = vis_freg_pair(vis_to_float(rval), vis_to_float(rval));

	GET_SRC_DST_PARAMETERS();
	LOAD_KERNEL_INTO_FLOAT();

	buf_slb = (4 * dw + 24) >> 3;
	PREPARE_INTERM_BUFFERS();

	/* interior size: drop the 2-pixel (x4 channels) border */
	dw -= 4;
	dw *= 4;
	dh -= 4;

	sa = adr_src;
	sa1 = sa + slb;
	sa2 = sa1 + slb;
	sa3 = sa2 + slb;
	sa4 = sa3 + slb;
	d_a = adr_dst + 2 * dlb + 8;

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf2, sa);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(16);

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf3, sa1);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(16);

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf4, sa2);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(16);

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf5, sa3);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(16);

#pragma pipeloop(0)
	for (j = 0; j < dh; j++) {
		/* rotate row buffers and fetch the new bottom row */
		LOOP_INI();

		PREPARE_TO_LOAD_LINE(sbuf5, sa4);
#pragma pipeloop(0)
		LOAD_LINE_INTO_BUFFER_NF(16);

		/* GSR offset 4: faligndata yields 4-byte-shifted views */
		vis_alignaddr(s1, 4);
		dbuf1 = dbuf;
		d1 = *s1;
		d2 = *s2;
		d3 = *s3;
		d11 = *(s1 + 1);
		d12 = *(s2 + 1);
		d13 = *(s3 + 1);

		/* pass 1: kernel rows 1-3 accumulated into dbuf */
#pragma pipeloop(0)
		for (i = 0; i < dw; i += 8) {
			d21 = *(s1 + 2);
			d22 = *(s2 + 2);
			d23 = *(s3 + 2);
			out0 = out1 = rnd;
			CONV_AU(d1, k1k2);
			CONV_AL(d2, k5k6);
			CONV_AU(d3, k11k12);
			dt_1 = vis_faligndata(d1, d11);
			dt_2 = vis_faligndata(d2, d12);
			dt_3 = vis_faligndata(d3, d13);
			CONV_AL(dt_1, k1k2);
			CONV_AU(dt_2, k7k8);
			CONV_AL(dt_3, k11k12);
			CONV_AU(d11, k3k4);
			CONV_AL(d12, k7k8);
			CONV_AU(d13, k13k14);
			dt_1 = vis_faligndata(d11, d21);
			dt_2 = vis_faligndata(d12, d22);
			dt_3 = vis_faligndata(d13, d23);
			CONV_AL(dt_1, k3k4);
			CONV_AU(dt_2, k9k10);
			CONV_AL(dt_3, k13k14);
			CONV_AU(d21, k5k6);
			CONV_AL(d22, k9k10);
			CONV_AU(d23, k15k16);
			dbuf1[0] = out0;
			dbuf1[1] = out1;
			dbuf1 += 2;
			d1 = d11;
			d2 = d12;
			d3 = d13;
			d11 = d21;
			d12 = d22;
			d13 = d23;
			s1++;
			s2++;
			s3++;
		}

		dbuf1 = dbuf;
		d4 = *s4;
		d5 = *s5;
		d14 = *(s4 + 1);
		d15 = *(s5 + 1);

		/* pass 2: kernel rows 4-5, then pack to 8-bit via ddst */
#pragma pipeloop(0)
		for (i = 0; i < dw; i += 8) {
			d24 = *(s4 + 2);
			d25 = *(s5 + 2);
			out0 = dbuf1[0];
			out1 = dbuf1[1];
			CONV_AL(d4, k15k16);
			CONV_AU(d5, k21k22);
			dt_4 = vis_faligndata(d4, d14);
			dt_5 = vis_faligndata(d5, d15);
			CONV_AU(dt_4, k17k18);
			CONV_AL(dt_5, k21k22);
			CONV_AL(d14, k17k18);
			CONV_AU(d15, k23k24);
			dt_4 = vis_faligndata(d14, d24);
			dt_5 = vis_faligndata(d15, d25);
			CONV_AU(dt_4, k19k20);
			CONV_AL(dt_5, k23k24);
			CONV_AL(d24, k19k20);
			CONV_AU(d25, k25);
			dbuf1 += 2;
			(*ddst++) = vis_fpack16_pair(out0, out1);
			d4 = d14;
			d5 = d15;
			d14 = d24;
			d15 = d25;
			s4++;
			s5++;
		}

		/* copy packed row out to the (possibly unaligned) dst */
		PREPARE_TO_COPY_INTERM_BUF_TO_DST();
#pragma pipeloop(0)
		COPY_INTERM_BUF_TO_DST();
		COPY_TAIL();

		sa4 = sa4 + slb;
		d_a += dlb;
	}

	__mlib_free(buff_src);
	return (MLIB_SUCCESS);
}