/*
 * mlib_v_ImageNot_na - bitwise NOT of `size` bytes, sa -> da, for
 * pointers of arbitrary alignment.
 *
 *   sa    source bytes
 *   da    destination bytes
 *   size  number of bytes to process
 *
 * The destination is written one 8-byte aligned word at a time.  Words
 * that are only partly inside [da, da + size) are written with
 * vis_pst_8 under a mask produced by vis_edge8, so only the masked
 * bytes are stored.  The source is re-aligned to the destination with
 * vis_alignaddr / vis_faligndata and is always fetched through
 * vis_ld_d64_nf.
 */
void
mlib_v_ImageNot_na(
    mlib_u8 *sa,
    mlib_u8 *da,
    mlib_s32 size)
{
    /* destination rounded down to an 8-byte boundary */
    mlib_d64 *dst_al = (mlib_d64 *)((mlib_addr)da & (~7));
    /* byte position of the current destination word relative to da */
    mlib_s32 pos = (mlib_addr)dst_al - (mlib_addr)da;
    /* address of the last destination byte */
    mlib_u8 *dst_last = da + size - 1;
    /* aligned source word that lines up with dst_al; also sets GSR.align */
    mlib_d64 *src_al = (mlib_d64 *)vis_alignaddr(sa, pos);
    /* store mask for the first destination word */
    mlib_s32 mask = vis_edge8(da, dst_last);
    /* two consecutive aligned source words */
    mlib_d64 lo, hi;

    hi = vis_ld_d64_nf(src_al);

    /* leading word is partial: masked store, then step to the next word */
    if (mask != 0xff) {
        lo = hi;
        hi = vis_ld_d64_nf(src_al + 1);
        lo = vis_faligndata(lo, hi);
        vis_pst_8(vis_fnot(lo), dst_al, mask);
        dst_al++;
        src_al++;
        pos += 8;
    }

    /* full 8-byte destination words */
#pragma pipeloop(0)
    for (; pos <= (size - 8); pos += 8) {
        lo = hi;
        hi = vis_ld_d64_nf(src_al + 1);
        *dst_al = vis_fnot(vis_faligndata(lo, hi));
        dst_al++;
        src_al++;
    }

    /* trailing partial word, if any */
    if (pos < size) {
        lo = vis_faligndata(hi, vis_ld_d64_nf(src_al + 1));
        mask = vis_edge8(dst_al, dst_last);
        vis_pst_8(vis_fnot(lo), dst_al, mask);
    }
}
/*
 * mlib_ImageLineXor8000 - copy `size` bytes from src to dst while XOR-ing
 * the data with the repeating 16-bit pattern 0x8000.
 *
 *   src   source line (any alignment)
 *   dst   destination line (any alignment)
 *   size  line length in bytes
 *
 * The destination is written in 8-byte aligned words; partial words at
 * either end use vis_pst_8 with a vis_edge8 mask.  The source is
 * re-aligned with VIS_ALIGNADDR / vis_faligndata.  The bulk loop stops
 * one word early so that it can use a plain load (src_al[1]); the final
 * full word and the tail fetch through vis_ld_d64_nf instead.
 */
void
mlib_ImageLineXor8000(
    const mlib_u8 *src,
    mlib_u8 *dst,
    mlib_s32 size)
{
    /* XOR pattern: 0x8000 in every 16-bit lane */
    mlib_d64 flip = vis_to_double_dup(0x80008000);
    mlib_u8 *dst_last;
    mlib_d64 *dst_al;
    mlib_d64 *src_al;
    mlib_d64 lo, hi;
    mlib_s32 pos;
    mlib_s32 mask;

    /* destination rounded down to 8 bytes, and its offset from dst */
    dst_al = (mlib_d64 *)((mlib_addr)dst & (~7));
    pos = (mlib_addr)dst_al - (mlib_addr)dst;
    dst_last = (mlib_u8 *)dst + size - 1;

    /* matching aligned source word; programs GSR.align as a side effect */
    src_al = (mlib_d64 *)VIS_ALIGNADDR(src, pos);

    /* store mask for the first destination word */
    mask = vis_edge8(dst, dst_last);

    hi = vis_ld_d64_nf(src_al);

    /* leading partial word */
    if (mask != 0xff) {
        lo = hi;
        hi = vis_ld_d64_nf(src_al + 1);
        lo = vis_fxor(vis_faligndata(lo, hi), flip);
        vis_pst_8(lo, dst_al, mask);
        dst_al++;
        src_al++;
        pos += 8;
    }

    /* full words, except the last one */
#pragma pipeloop(0)
    for (; pos <= (size - 16); pos += 8) {
        lo = hi;
        hi = src_al[1];
        *dst_al = vis_fxor(vis_faligndata(lo, hi), flip);
        dst_al++;
        src_al++;
    }

    /* last full word */
    if (pos <= (size - 8)) {
        lo = hi;
        hi = vis_ld_d64_nf(src_al + 1);
        *dst_al = vis_fxor(vis_faligndata(lo, hi), flip);
        dst_al++;
        src_al++;
        pos += 8;
    }

    /* trailing partial word */
    if (pos < size) {
        lo = vis_fxor(vis_faligndata(hi, vis_ld_d64_nf(src_al + 1)), flip);
        mask = vis_edge8(dst_al, dst_last);
        vis_pst_8(lo, dst_al, mask);
    }
}
/*
 * __mlib_VideoColorSplit3_S16 - split interleaved 3-channel S16 data
 * into three separate output arrays.
 *
 *   color1, color2, color3  output arrays, one per channel
 *   colors                  interleaved input
 *   n                       element count per output array
 *
 * All four pointers are cast to mlib_d64 * and accessed as 8-byte words.
 * NOTE(review): this presumably requires 8-byte alignment of every
 * pointer - confirm against the callers.
 *
 * The GSR is programmed once with vis_write_gsr(4) and
 * vis_write_bmask(0x02CE13DF, 0); the vis_faligndata / vis_bshuffle calls
 * below depend on that state.
 *
 * Main loop (i advances by 4 while i <= n - 4): reads three source words
 * (12 S16 values) and stores one word (4 S16 values) to each output.
 *
 * Tail (i < n): the same permutation is computed once more - the second
 * and third source words are fetched with vis_ld_d64_nf - and stored with
 * vis_pst_16 under the mask 0xF0 >> (n & 3).
 * NOTE(review): with the usual VIS mask ordering that writes only the
 * first n & 3 elements of each output word - confirm.
 *
 * NOTE(review): this excerpt stops before the function's return
 * statement and closing brace.
 */
mlib_status __mlib_VideoColorSplit3_S16( mlib_s16 *color1, mlib_s16 *color2, mlib_s16 *color3, const mlib_s16 *colors, mlib_s32 n) { mlib_d64 *sp = (mlib_d64 *)colors; mlib_d64 *dp0 = (mlib_d64 *)color1; mlib_d64 *dp1 = (mlib_d64 *)color2; mlib_d64 *dp2 = (mlib_d64 *)color3; mlib_d64 sd0, sd1, sd2, dd0, dd1, dd2, dd3; mlib_s32 i; vis_write_gsr(4); vis_write_bmask(0x02CE13DF, 0); #pragma pipeloop(0) #pragma unroll(4) for (i = 0; i <= (n - 4); i += 4) { sd0 = sp[0]; sd1 = sp[1]; sd2 = sp[2]; dd1 = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2)); dd0 = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1)); (*dp0++) = vis_bshuffle(dd0, dd1); dd2 = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2)); dd3 = vis_faligndata(dd0, dd2); (*dp1++) = vis_bshuffle(dd3, dd3); (*dp2++) = vis_bshuffle(dd1, dd2); sp += 3; } /* * last 4 pixels */ if (i < n) { mlib_s32 emask = 0xF0 >> (n & 3); mlib_d64 st0, st1, st2; sd0 = sp[0]; sd1 = vis_ld_d64_nf(sp + 1); sd2 = vis_ld_d64_nf(sp + 2); dd1 = vis_fpmerge(vis_read_lo(sd0), vis_read_hi(sd2)); dd0 = vis_fpmerge(vis_read_hi(sd0), vis_read_lo(sd1)); st0 = vis_bshuffle(dd0, dd1); dd2 = vis_fpmerge(vis_read_hi(sd1), vis_read_lo(sd2)); dd3 = vis_faligndata(dd0, dd2); st1 = vis_bshuffle(dd3, dd3); st2 = vis_bshuffle(dd1, dd2); vis_pst_16(st0, dp0, emask); vis_pst_16(st1, dp1, emask); vis_pst_16(st2, dp2, emask); }
/*
 * mlib_v_ImageLookUpSI_S32_S16_4_DstOff2_D1 - single-input table lookup,
 * S32 source to 4-channel S16 destination, for a destination that starts
 * half-way through a pixel.
 *
 *   src    source values
 *   dst    destination; written as 8-byte words
 *   xsize  controls the number of 8-byte words written (xsize words when
 *          xsize >= 1)
 *   table  four lookup tables, one per channel
 *
 * Every stored word holds, in order, channels 2 and 3 of one source
 * value followed by channels 0 and 1 of the next source value.  After
 * the last full word, channels 2 and 3 of the final source value are
 * written with two scalar stores.
 *
 * Each table base pointer is moved forward by 2^31 entries so that a
 * signed S32 source value can be used directly as the index.
 *
 * NOTE(review): dst is dereferenced as mlib_d64 *, so it is presumably
 * 8-byte aligned on entry - confirm with the caller.
 * NOTE: `acc` starts without a defined value; with GSR.align = 6 each
 * vis_faligndata brings in one 16-bit value, and a word is stored only
 * after four such steps.
 */
void
mlib_v_ImageLookUpSI_S32_S16_4_DstOff2_D1(
    mlib_s32 *src,
    mlib_s16 *dst,
    mlib_s32 xsize,
    mlib_s16 **table)
{
    mlib_s16 *lut0 = &table[0][(mlib_u32)2147483648];
    mlib_s16 *lut1 = &table[1][(mlib_u32)2147483648];
    mlib_s16 *lut2 = &table[2][(mlib_u32)2147483648];
    mlib_s16 *lut3 = &table[3][(mlib_u32)2147483648];
    mlib_s32 *in = src;                 /* next source value to read */
    mlib_d64 *out = (mlib_d64 *)dst;    /* next 8-byte word to write */
    mlib_s16 *rest;                     /* scalar tail destination */
    mlib_s32 cur, nxt;                  /* current and following source value */
    mlib_d64 elem;                      /* one looked-up 16-bit value */
    mlib_d64 acc;                       /* word being assembled */
    mlib_s32 k;

    /* GSR.align = 6: each vis_faligndata below brings in two bytes */
    vis_alignaddr((void *)0, 6);

    cur = *in++;

    if (xsize >= 1) {
        nxt = *in++;

#pragma pipeloop(0)
        for (k = 0; k <= xsize - 2; k++) {
            elem = vis_ld_u16_i(lut1, ((mlib_addr)2*nxt));
            acc = vis_faligndata(elem, acc);
            elem = vis_ld_u16_i(lut0, ((mlib_addr)2*nxt));
            acc = vis_faligndata(elem, acc);
            elem = vis_ld_u16_i(lut3, ((mlib_addr)2*cur));
            acc = vis_faligndata(elem, acc);
            elem = vis_ld_u16_i(lut2, ((mlib_addr)2*cur));
            acc = vis_faligndata(elem, acc);

            cur = nxt;
            nxt = *in++;
            *out = acc;
            out++;
        }

        /* last full word: same assembly, no further source read */
        elem = vis_ld_u16_i(lut1, ((mlib_addr)2*nxt));
        acc = vis_faligndata(elem, acc);
        elem = vis_ld_u16_i(lut0, ((mlib_addr)2*nxt));
        acc = vis_faligndata(elem, acc);
        elem = vis_ld_u16_i(lut3, ((mlib_addr)2*cur));
        acc = vis_faligndata(elem, acc);
        elem = vis_ld_u16_i(lut2, ((mlib_addr)2*cur));
        acc = vis_faligndata(elem, acc);

        cur = nxt;
        *out = acc;
        out++;
    }

    /* channels 2 and 3 of the final source value */
    rest = (mlib_s16 *)out;
    rest[0] = lut2[cur];
    rest[1] = lut3[cur];
}
/*
 * __mlib_VideoColorSplit2_S16 - split interleaved 2-channel S16 data
 * into two separate output arrays.
 *
 *   color1, color2  output arrays, one per channel
 *   colors          interleaved input
 *   n               element count per output array
 *
 * All pointers are cast to mlib_d64 * and accessed as 8-byte words.
 * NOTE(review): this presumably requires 8-byte alignment - confirm
 * against the callers.
 *
 * vis_write_gsr64 loads 0x014589cd into the upper half of the GSR and 2
 * into the low bits.
 * NOTE(review): presumably these are the vis_bshuffle byte mask and the
 * vis_faligndata offset - confirm against the VIS manual.
 *
 * The loop is software-pipelined: the two result words for the first
 * group are computed before the loop, and each of the n / 4 iterations
 * stores one word (4 S16 values) to each output, then prepares the next
 * group.  Apart from the initial sp[0], source words are fetched with
 * vis_ld_d64_nf, including the read-ahead at sp + 4 and sp + 5.
 *
 * Tail (n & 3 != 0): one more group is recomputed from sp and stored with
 * vis_pst_16 under the mask 0xF0 >> (n & 3).
 *
 * NOTE(review): sp[0] is read before n is examined, so the input is read
 * even when n is 0.
 * NOTE(review): this excerpt stops before the function's return
 * statement and closing brace.
 */
mlib_status __mlib_VideoColorSplit2_S16( mlib_s16 *color1, mlib_s16 *color2, const mlib_s16 *colors, mlib_s32 n) { mlib_d64 *sp = (mlib_d64 *)colors; mlib_d64 *dp0 = (mlib_d64 *)color1; mlib_d64 *dp1 = (mlib_d64 *)color2; mlib_d64 sd0, sd1, dd0, dd1, dd2, dd3; mlib_s32 i; vis_write_gsr64(((mlib_u64)0x014589cd << 32) | 2); /* * 8-pixels loop */ sd0 = sp[0]; sd1 = vis_ld_d64_nf(sp + 1); dd0 = vis_faligndata(sd0, sd1); dd1 = vis_faligndata(sd1, sd0); dd2 = vis_bshuffle(sd0, sd1); dd3 = vis_bshuffle(dd0, dd1); sd0 = vis_ld_d64_nf(sp + 2); sd1 = vis_ld_d64_nf(sp + 3); dd0 = vis_faligndata(sd0, sd1); dd1 = vis_faligndata(sd1, sd0); #pragma pipeloop(0) for (i = 0; i < (n / 4); i++) { (*dp0++) = dd2; (*dp1++) = dd3; dd2 = vis_bshuffle(sd0, sd1); dd3 = vis_bshuffle(dd0, dd1); sd0 = vis_ld_d64_nf(sp + 4); sd1 = vis_ld_d64_nf(sp + 5); dd0 = vis_faligndata(sd0, sd1); dd1 = vis_faligndata(sd1, sd0); sp += 2; } /* * last 8 pixels */ if (n & 3) { mlib_s32 emask = 0xF0 >> (n & 3); sd0 = sp[0]; sd1 = vis_ld_d64_nf(sp + 1); dd0 = vis_faligndata(sd0, sd1); dd1 = vis_faligndata(sd1, sd0); dd2 = vis_bshuffle(sd0, sd1); dd3 = vis_bshuffle(dd0, dd1); vis_pst_16(dd2, (mlib_f32 *)dp0, emask); vis_pst_16(dd3, (mlib_f32 *)dp1, emask); }
/*
 * mlib_v_ImageColorRGB2Mono_U8_A8D2X8 - weighted RGB -> single channel
 * conversion of a 2-D U8 image, 8 output bytes per inner iteration.
 *
 *   src, slb      source image and its line stride in bytes
 *   dst, dlb      destination image and its line stride in bytes
 *   xsize, ysize  the inner loop runs xsize / 8 times per row, for
 *                 ysize rows
 *   weight        three channel weights; each is scaled by 8192 before
 *                 being handed to vis_to_float
 *
 * Each inner iteration reads three 8-byte source words and writes one
 * 8-byte destination word.  The per-channel separation and weighting is
 * done by the CHANNELSEPARATE_U8_AL and CHANNELWEIGHT_U8 macros, which
 * are defined outside this block.
 *
 * NOTE(review): the local names (alpha, beta, gamma, d32, e32, ddt,
 * mask0, mask1) are kept unchanged because those macros may refer to
 * them by name - confirm before renaming or removing any of them.
 * NOTE(review): src and dst are accessed as mlib_d64 *, so rows are
 * presumably 8-byte aligned - confirm with the caller.
 */
void
mlib_v_ImageColorRGB2Mono_U8_A8D2X8(
    const mlib_u8 *src,
    mlib_s32 slb,
    mlib_u8 *dst,
    mlib_s32 dlb,
    mlib_s32 xsize,
    mlib_s32 ysize,
    const mlib_d64 *weight)
{
    mlib_d64 *sp, *dp;          /* running pointers inside the row */
    mlib_d64 *sl, *dl;          /* start of the current row */
    mlib_d64 sd0, sd1, sd2, sd3;
    mlib_d64 dd;
    mlib_d64 rgdd0, bdd0, rgdd1, bdd1, ddt;
    mlib_f32 d32, e32, alpha, gamma, beta;
    mlib_s32 i, j;
    mlib_s32 mask0 = 0x0369147a;
    mlib_s32 mask1 = 0x258b258b;

    sl = (mlib_d64 *)src;
    dl = (mlib_d64 *)dst;
    sp = sl;
    dp = dl;

    /* fixed-point channel weights */
    alpha = vis_to_float(weight[0] * 8192);
    beta = vis_to_float(weight[1] * 8192);
    gamma = vis_to_float(weight[2] * 8192);

    vis_write_gsr((2 << 3) + 4);

    for (j = 0; j < ysize; j++) {

#pragma pipeloop(0)
        for (i = 0; i < xsize / 8; i++) {
            sd0 = sp[0];
            sd1 = sp[1];
            sd2 = sp[2];
            sp += 3;

            /* sd3 must be derived from the original sd2, so it goes first */
            sd3 = vis_faligndata(sd2, sd2);
            sd2 = vis_faligndata(sd1, sd2);

            CHANNELSEPARATE_U8_AL(sd0, sd1, sd2, sd3, rgdd0, bdd0,
                rgdd1, bdd1);
            CHANNELWEIGHT_U8(rgdd0, bdd0, rgdd1, bdd1, dd);

            *dp = dd;
            dp++;
        }

        /* advance both images by one line stride */
        sl = (mlib_d64 *)((mlib_u8 *)sl + slb);
        dl = (mlib_d64 *)((mlib_u8 *)dl + dlb);
        sp = sl;
        dp = dl;
    }
}
/*
 * mlib_v_ImageLookUp_U8_U16_124_SrcOff0_D1 - U8 -> U16 table lookup over
 * one line, four source bytes per step.
 *
 *   src              source bytes; read as 32-bit words
 *   dst              destination; written as 8-byte words
 *   xsize            number of elements in the line
 *   table0..table3   lookup tables; the most significant byte of each
 *                    source word indexes table0, the least significant
 *                    byte indexes table3
 *
 * The shift-and-mask expressions ((s0 >> 23) & 0x1FE and so on) turn a
 * source byte directly into a byte offset into a table of 16-bit
 * entries.  With GSR.align set to 6 by vis_alignaddr, four
 * vis_faligndata steps assemble the four looked-up values into one
 * 8-byte word, which is then stored.
 *
 * When xsize >= 4 the first word is loaded up front; the loop handles
 * groups while i <= xsize - 8, loading the next word before storing, and
 * one final group is produced after the loop.
 *
 * NOTE(review): src is dereferenced as mlib_u32 * and dst as mlib_d64 *,
 * so they are presumably 4- and 8-byte aligned - confirm with the caller.
 * NOTE(review): sp, dend, emask and num are declared (dend is also
 * assigned) but not used in the visible part; this excerpt stops before
 * the end of the function, which presumably contains the remainder
 * handling.
 */
void mlib_v_ImageLookUp_U8_U16_124_SrcOff0_D1(const mlib_u8 *src, mlib_u16 *dst, mlib_s32 xsize, const mlib_u16 *table0, const mlib_u16 *table1, const mlib_u16 *table2, const mlib_u16 *table3) { mlib_u32 *sa; /* aligned pointer to source data */ mlib_u8 *sp; /* pointer to source data */ mlib_u32 s0; /* source data */ mlib_u16 *dl; /* pointer to start of destination */ mlib_u16 *dend; /* pointer to end of destination */ mlib_d64 *dp; /* aligned pointer to destination */ mlib_d64 t0, t1, t2; /* destination data */ mlib_d64 t3, acc0; /* destination data */ mlib_s32 emask; /* edge mask */ mlib_s32 i, num; /* loop variable */ sa = (mlib_u32*)src; dl = dst; dp = (mlib_d64 *) dl; dend = dl + xsize - 1; vis_alignaddr((void *) 0, 6); i = 0; if (xsize >= 4) { s0 = *sa++; #pragma pipeloop(0) for(i = 0; i <= xsize - 8; i+=4) { t3 = VIS_LD_U16_I(table3, (s0 << 1) & 0x1FE); t2 = VIS_LD_U16_I(table2, (s0 >> 7) & 0x1FE); t1 = VIS_LD_U16_I(table1, (s0 >> 15) & 0x1FE); t0 = VIS_LD_U16_I(table0, (s0 >> 23) & 0x1FE); acc0 = vis_faligndata(t3, acc0); acc0 = vis_faligndata(t2, acc0); acc0 = vis_faligndata(t1, acc0); acc0 = vis_faligndata(t0, acc0); s0 = *sa++; *dp++ = acc0; } t3 = VIS_LD_U16_I(table3, (s0 << 1) & 0x1FE); t2 = VIS_LD_U16_I(table2, (s0 >> 7) & 0x1FE); t1 = VIS_LD_U16_I(table1, (s0 >> 15) & 0x1FE); t0 = VIS_LD_U16_I(table0, (s0 >> 23) & 0x1FE); acc0 = vis_faligndata(t3, acc0); acc0 = vis_faligndata(t2, acc0); acc0 = vis_faligndata(t1, acc0); acc0 = vis_faligndata(t0, acc0); *dp++ = acc0; }
/*
 * mlib_v_ImageLookUpSI_S16_U16_4_DstOff0_D1 - single-input table lookup,
 * S16 source to 4-channel U16 destination.
 *
 *   src    source values, one per output pixel
 *   dst    destination; one 8-byte word (4 channels) per source value
 *   xsize  number of source values
 *   table  four lookup tables, one per channel
 *
 * Each table base is advanced by 32768 entries so that a signed S16
 * source value indexes it directly.  The source value is doubled once
 * (<< 1) to form the byte offset shared by all four table loads.
 *
 * NOTE(review): dst is written through a mlib_d64 *, so it is presumably
 * 8-byte aligned - confirm with the caller.
 * NOTE: `acc` starts without a defined value; with GSR.align = 6 each
 * vis_faligndata brings in one 16-bit value, and a word is stored only
 * after four such steps.
 */
void
mlib_v_ImageLookUpSI_S16_U16_4_DstOff0_D1(
    const mlib_s16 *src,
    mlib_u16 *dst,
    mlib_s32 xsize,
    const mlib_u16 **table)
{
    const mlib_u16 *lut0 = &table[0][32768];
    const mlib_u16 *lut1 = &table[1][32768];
    const mlib_u16 *lut2 = &table[2][32768];
    const mlib_u16 *lut3 = &table[3][32768];
    mlib_s16 *in = (void *)src;         /* next source value */
    mlib_d64 *out = (mlib_d64 *)dst;    /* next destination word */
    mlib_s32 boff;                      /* byte offset into the tables */
    mlib_d64 elem;                      /* one looked-up value */
    mlib_d64 acc;                       /* word being assembled */
    mlib_s32 k;

    vis_alignaddr((void *)0, 6);

    if (xsize >= 1) {
        boff = (*in++) << 1;

#pragma pipeloop(0)
        for (k = 0; k <= xsize - 2; k++) {
            elem = VIS_LD_U16_I(lut3, boff);
            acc = vis_faligndata(elem, acc);
            elem = VIS_LD_U16_I(lut2, boff);
            acc = vis_faligndata(elem, acc);
            elem = VIS_LD_U16_I(lut1, boff);
            acc = vis_faligndata(elem, acc);
            elem = VIS_LD_U16_I(lut0, boff);
            acc = vis_faligndata(elem, acc);

            /* fetch the next source value before storing this pixel */
            boff = (*in++) << 1;
            *out = acc;
            out++;
        }

        /* last pixel: no further source read */
        elem = VIS_LD_U16_I(lut3, boff);
        acc = vis_faligndata(elem, acc);
        elem = VIS_LD_U16_I(lut2, boff);
        acc = vis_faligndata(elem, acc);
        elem = VIS_LD_U16_I(lut1, boff);
        acc = vis_faligndata(elem, acc);
        elem = VIS_LD_U16_I(lut0, boff);
        acc = vis_faligndata(elem, acc);

        *out = acc;
        out++;
    }
}
static void FUNC( m3) ( FUNC_M_ARG) { mlib_s32 i; mlib_d64 k0 = pkern[0]; mlib_d64 k1 = pkern[1]; mlib_d64 k2 = pkern[2]; mlib_d64 a0, a1, a2, aa, sum; mlib_d64 *perror = vis_alignaddr(perror1, 0); a0 = (*perror++); for (i = 0; i < sw; i++) { aa = (*perror++); a1 = vis_faligndata(a0, aa); a2 = vis_faligndata(a1, vis_faligndata(aa, aa)); sum = vis_fpadd16(buffd[i], FMUL_16x16(k0, a0)); sum = vis_fpadd16(sum, FMUL_16x16(k1, a1)); buffd[i] = vis_fpadd16(sum, FMUL_16x16(k2, a2)); a0 = aa; } }
static void FUNC( m1) ( FUNC_M_ARG) { mlib_s32 i; mlib_d64 k0 = pkern[0]; mlib_d64 a0, e0, e1; mlib_d64 *perror = vis_alignaddr(perror1, 0); e0 = (*perror++); for (i = 0; i < (sw + 3) / 4; i++) { e1 = (*perror++); a0 = vis_faligndata(e0, e1); buffd[i] = vis_fpadd16(buffd[i], FMUL_16x16(k0, a0)); e0 = e1; } }
/*
 * ADD_SUFF(IntRgbxToIntArgbConvert) - convert 32-bit pixels by moving the
 * three colour bytes down one byte and forcing the top byte to 0xFF
 * (scalar form: dst = 0xff000000 | (src >> 8)).
 *
 * Parameters come from BLIT_PARAMS (defined elsewhere); the body uses
 * srcBase, dstBase, width, height, pSrcInfo and pDstInfo.
 *
 * If both scan strides equal 4 * width, the rows are contiguous and the
 * image is processed as a single row of width * height pixels.
 *
 * Per row:
 *   - if dst is not 8-byte aligned, the first pixel is converted with
 *     the scalar form so the vector loop can store 8-byte words;
 *   - pixel pairs are loaded as two mlib_f32 values, combined with
 *     vis_freg_pair, passed through vis_faligndata(dd, dd) with the align
 *     offset 7 set by vis_alignaddr(NULL, 7), and OR-ed with the
 *     0xFF000000 mask before being stored as one mlib_d64;
 *   - a remaining odd pixel is converted with the scalar form.
 *
 * NOTE(review): `(mlib_s32)dst & 7` narrows a pointer to 32 bits; only
 * the low three bits are used, but the cast may warn on 64-bit builds.
 * NOTE(review): this excerpt stops before the end of the row loop (the
 * row-pointer advance is not visible) and before the closing braces.
 */
void ADD_SUFF(IntRgbxToIntArgbConvert)(BLIT_PARAMS) { mlib_s32 dstScan = pDstInfo->scanStride; mlib_s32 srcScan = pSrcInfo->scanStride; mlib_d64 dd, mask; mlib_s32 i, i0, j; if (dstScan == 4*width && srcScan == 4*width) { width *= height; height = 1; } mask = vis_to_double_dup(0xFF000000); vis_alignaddr(NULL, 7); for (j = 0; j < height; j++) { mlib_u32 *src = srcBase; mlib_u32 *dst = dstBase; i = i0 = 0; if ((mlib_s32)dst & 7) { dst[i] = 0xff000000 | (src[i] >> 8); i0 = 1; } #pragma pipeloop(0) for (i = i0; i <= (mlib_s32)width - 2; i += 2) { dd = vis_freg_pair(((mlib_f32*)src)[i], ((mlib_f32*)src)[i + 1]); dd = vis_faligndata(dd, dd); *(mlib_d64*)(dst + i) = vis_for(dd, mask); } if (i < width) { dst[i] = 0xff000000 | (src[i] >> 8); }
/*
 * mlib_v_ImageLookUp_U8_U8_124_SrcOff0_D1 - U8 -> U8 table lookup over
 * one line, eight source bytes per step.
 *
 *   src              source bytes; read as 32-bit words
 *   dst              destination; written as 8-byte words
 *   xsize            number of elements in the line
 *   table0..table3   lookup tables; within each 32-bit source word the
 *                    most significant byte indexes table0 and the least
 *                    significant byte indexes table3
 *
 * Two source words (s0, s1) are converted per step.  With GSR.align set
 * to 7 by vis_alignaddr, each vis_faligndata brings one looked-up byte
 * into acc0 (from s0) or acc1 (from s1), four bytes each.
 * vis_bshuffle, with the byte mask 0x012389ab written before the loop,
 * then combines bytes 0-3 of acc0 with bytes 0-3 of acc1 into the
 * 8-byte word that is stored.
 *
 * When xsize >= 8 the first two words are loaded up front; the loop
 * handles groups while i <= xsize - 16, loading the next two words
 * before storing, and one final group is produced after the loop.
 *
 * NOTE(review): src is dereferenced as mlib_u32 * and dst as mlib_d64 *,
 * so they are presumably 4- and 8-byte aligned - confirm with the caller.
 * NOTE(review): sp, dend, emask and num are declared (dend is also
 * assigned) but not used in the visible part; this excerpt stops before
 * the end of the function, which presumably contains the remainder
 * handling.
 */
void mlib_v_ImageLookUp_U8_U8_124_SrcOff0_D1( const mlib_u8 *src, mlib_u8 *dst, mlib_s32 xsize, const mlib_u8 *table0, const mlib_u8 *table1, const mlib_u8 *table2, const mlib_u8 *table3) { /* aligned pointer to source data */ mlib_u32 *sa; /* pointer to source data */ mlib_u8 *sp; /* source data */ mlib_u32 s0, s1; /* pointer to start of destination */ mlib_u8 *dl; /* pointer to end of destination */ mlib_u8 *dend; /* aligned pointer to destination */ mlib_d64 *dp; /* destination data */ mlib_d64 t0, t1, t2; /* destination data */ mlib_d64 t3, t4, t5; /* destination data */ mlib_d64 t6, t7, acc0, acc1; /* edge mask */ mlib_s32 emask; /* loop variable */ mlib_s32 i, num; sa = (mlib_u32 *)src; dl = dst; dp = (mlib_d64 *)dl; dend = dl + xsize - 1; vis_alignaddr((void *)0, 7); if (xsize >= 8) { s0 = sa[0]; s1 = sa[1]; sa += 2; vis_write_bmask(0x012389ab, 0); #pragma pipeloop(0) for (i = 0; i <= xsize - 16; i += 8, sa += 2) { t7 = VIS_LD_U8_I(table3, s1 & 0xFF); t6 = VIS_LD_U8_I(table2, (s1 >> 8) & 0xFF); t5 = VIS_LD_U8_I(table1, (s1 >> 16) & 0xFF); t4 = VIS_LD_U8_I(table0, s1 >> 24); t3 = VIS_LD_U8_I(table3, s0 & 0xFF); t2 = VIS_LD_U8_I(table2, (s0 >> 8) & 0xFF); t1 = VIS_LD_U8_I(table1, (s0 >> 16) & 0xFF); t0 = VIS_LD_U8_I(table0, s0 >> 24); acc1 = vis_faligndata(t7, acc1); acc1 = vis_faligndata(t6, acc1); acc1 = vis_faligndata(t5, acc1); acc1 = vis_faligndata(t4, acc1); acc0 = vis_faligndata(t3, acc0); acc0 = vis_faligndata(t2, acc0); acc0 = vis_faligndata(t1, acc0); acc0 = vis_faligndata(t0, acc0); s0 = sa[0]; s1 = sa[1]; (*dp++) = vis_bshuffle(acc0, acc1); } t7 = VIS_LD_U8_I(table3, s1 & 0xFF); t6 = VIS_LD_U8_I(table2, (s1 >> 8) & 0xFF); t5 = VIS_LD_U8_I(table1, (s1 >> 16) & 0xFF); t4 = VIS_LD_U8_I(table0, s1 >> 24); t3 = VIS_LD_U8_I(table3, s0 & 0xFF); t2 = VIS_LD_U8_I(table2, (s0 >> 8) & 0xFF); t1 = VIS_LD_U8_I(table1, (s0 >> 16) & 0xFF); t0 = VIS_LD_U8_I(table0, s0 >> 24); acc1 = vis_faligndata(t7, acc1); acc1 = vis_faligndata(t6, acc1); acc1 = 
vis_faligndata(t5, acc1); acc1 = vis_faligndata(t4, acc1); acc0 = vis_faligndata(t3, acc0); acc0 = vis_faligndata(t2, acc0); acc0 = vis_faligndata(t1, acc0); acc0 = vis_faligndata(t0, acc0); (*dp++) = vis_bshuffle(acc0, acc1); }
/*
 * __mlib_VectorConvert_S32_S16_Mod - widen an S16 vector to S32.
 *
 *   z  destination vector (S32)
 *   x  source vector (S16)
 *   n  number of elements
 *
 * Returns MLIB_SUCCESS on the vector path.  Vectors shorter than 16
 * elements are handed to the EXPAND macro (defined elsewhere).
 *
 * Vector path: elements are copied one by one until the destination is
 * 8-byte aligned; after that each 8-byte source word (4 x S16) becomes
 * two 8-byte destination words via vis_fmuld8ulx16 against the constant
 * 0x10001.  Source words are handled in pairs, with one word peeled
 * first when the word count is odd.  A misaligned source is re-aligned
 * with vis_faligndata.  The final length & 3 elements are copied with a
 * scalar loop.
 *
 * NOTE(review): `i`, `src`, `dst` and `length` keep their names because
 * EXPAND may refer to them - confirm before renaming.
 */
mlib_status
__mlib_VectorConvert_S32_S16_Mod(
    mlib_s32 *z,
    const mlib_s16 *x,
    mlib_s32 n)
{
    mlib_s32 i;
    const mlib_s16 *src = x;
    mlib_s32 *dst = z;
    mlib_d64 *src64, *dst64;            /* 8-byte views of src / dst */
    mlib_s32 quads, done, tail, length = n;
    mlib_d64 prev, next;                /* consecutive aligned source words */
    mlib_d64 w0, w1;                    /* source words ready for widening */
    mlib_f32 ones16 = vis_to_float(0x10001);

    if (length < 16) {
        EXPAND(mlib_s16, mlib_s32);
    }

    /* scalar copies until the destination is 8-byte aligned */
    while ((mlib_addr)dst & 7) {
        *dst = *src;
        dst++;
        src++;
        length--;
    }

    src64 = (mlib_d64 *)vis_alignaddr((void *)src, 0);
    dst64 = (mlib_d64 *)dst;
    tail = length & 3;          /* elements left for the scalar tail */
    quads = length >> 2;        /* whole 4-element source words */
    done = quads << 2;          /* elements covered by the vector part */
    next = src64[0];

    if (!((mlib_addr)(src) & 7)) {
        /* source is 8-byte aligned: no vis_faligndata needed */
        i = quads & 1;

        if (i) {
            /* odd word count: peel one word */
            w0 = *src64;
            src64++;
            dst64[0] = vis_fmuld8ulx16(ones16, vis_read_hi(w0));
            dst64[1] = vis_fmuld8ulx16(ones16, vis_read_lo(w0));
            dst64 += 2;
        }

#pragma pipeloop(1)
#pragma unroll(1)
        for (; i < quads; i += 2) {
            w0 = src64[0];
            w1 = src64[1];
            src64 += 2;

            dst64[0] = vis_fmuld8ulx16(ones16, vis_read_hi(w0));
            dst64[1] = vis_fmuld8ulx16(ones16, vis_read_lo(w0));
            dst64[2] = vis_fmuld8ulx16(ones16, vis_read_hi(w1));
            dst64[3] = vis_fmuld8ulx16(ones16, vis_read_lo(w1));
            dst64 += 4;
        }
    } else {
        /* source is misaligned: merge neighbouring words */
        i = 1;

        if ((quads & 1)) {
            /* odd word count: peel one word */
            i++;
            prev = next;
            next = vis_ld_d64_nf(src64 + 1);
            w0 = vis_faligndata(prev, next);
            dst64[0] = vis_fmuld8ulx16(ones16, vis_read_hi(w0));
            dst64[1] = vis_fmuld8ulx16(ones16, vis_read_lo(w0));
            dst64 += 2;
        }

#pragma pipeloop(1)
#pragma unroll(1)
        for (; i <= quads; i += 2) {
            prev = next;
            next = vis_ld_d64_nf(src64 + i);
            w0 = vis_faligndata(prev, next);

            prev = next;
            next = vis_ld_d64_nf(src64 + i + 1);
            w1 = vis_faligndata(prev, next);

            dst64[0] = vis_fmuld8ulx16(ones16, vis_read_hi(w0));
            dst64[1] = vis_fmuld8ulx16(ones16, vis_read_lo(w0));
            dst64[2] = vis_fmuld8ulx16(ones16, vis_read_hi(w1));
            dst64[3] = vis_fmuld8ulx16(ones16, vis_read_lo(w1));
            dst64 += 4;
        }
    }

    /* scalar tail */
    for (i = 0; i < tail; i++) {
        dst[done + i] = src[done + i];
    }

    return (MLIB_SUCCESS);
}
/*
 * __mlib_VectorConvert_S32_S8_Mod - widen an S8 vector to S32.
 *
 *   z  destination vector (S32)
 *   x  source vector (S8)
 *   n  number of elements
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * If the destination is not 8-byte aligned, one element is copied first.
 * After that, 8 source bytes are turned into four 8-byte destination
 * words per iteration:
 *   - vis_fpmerge(half, half) followed by vis_fmul8sux16 against
 *     0x0100 per lane yields four 16-bit values;
 *   - those are widened to 32 bits either with vis_bshuffle (byte mask
 *     0x00012223) or with vis_fmuld8ulx16 against 0x10001.
 * The misaligned-source branch additionally merges neighbouring source
 * words with vis_faligndata.  Whatever is left (fewer than 8 elements)
 * is copied by the final scalar loop.
 */
mlib_status
__mlib_VectorConvert_S32_S8_Mod(
    mlib_s32 *z,
    const mlib_s8 *x,
    mlib_s32 n)
{
    mlib_s8 *in = (mlib_s8 *)x;
    mlib_s32 *out = (mlib_s32 *)z;
    mlib_f32 unit32 = vis_to_float(0x10001);
    mlib_d64 unit64 = vis_to_double_dup(0x1000100);
    mlib_d64 *in64;                 /* aligned 8-byte view of the source */
    mlib_d64 prev, next, data;      /* raw words and the word being converted */
    mlib_d64 h16, l16;              /* 16-bit forms of the upper / lower 4 bytes */
    mlib_d64 q0, q1, q2, q3;        /* the four output words */
    mlib_s32 k = 0;

    if (n <= 0)
        return (MLIB_FAILURE);

    /* one scalar element makes the destination 8-byte aligned */
    if ((mlib_addr)out & 7) {
        *out = *in;
        out++;
        in++;
        k = 1;
    }

    in64 = (mlib_d64 *)vis_alignaddr(in, 0);
    data = vis_ld_d64_nf(in64);
    vis_write_bmask(0x00012223, 0);

    if ((mlib_addr)in & 7) {
        /* misaligned source */
        next = vis_ld_d64_nf(in64 + 1);
        data = vis_faligndata(data, next);

#pragma pipeloop(1)
#pragma unroll(1)
        for (; k <= (n - 8); k += 8) {
            h16 = vis_fpmerge(vis_read_hi(data), vis_read_hi(data));
            h16 = vis_fmul8sux16(h16, unit64);
            q0 = vis_bshuffle(h16, h16);
            q1 = vis_fmuld8ulx16(unit32, vis_read_lo(h16));

            l16 = vis_fpmerge(vis_read_lo(data), vis_read_lo(data));
            l16 = vis_fmul8sux16(l16, unit64);
            q2 = vis_fmuld8ulx16(unit32, vis_read_hi(l16));
            q3 = vis_fmuld8ulx16(unit32, vis_read_lo(l16));

            /* prepare the next source word before storing */
            prev = next;
            next = vis_ld_d64_nf(in64 + 2);
            data = vis_faligndata(prev, next);

            ((mlib_d64 *)out)[0] = q0;
            ((mlib_d64 *)out)[1] = q1;
            ((mlib_d64 *)out)[2] = q2;
            ((mlib_d64 *)out)[3] = q3;

            out += 8;
            in += 8;
            in64++;
        }
    } else {
        /* aligned source */
#pragma pipeloop(1)
#pragma unroll(1)
        for (; k <= (n - 8); k += 8) {
            h16 = vis_fpmerge(vis_read_hi(data), vis_read_hi(data));
            h16 = vis_fmul8sux16(h16, unit64);
            q0 = vis_bshuffle(h16, h16);
            q1 = vis_fmuld8ulx16(unit32, vis_read_lo(h16));

            l16 = vis_fpmerge(vis_read_lo(data), vis_read_lo(data));
            l16 = vis_fmul8sux16(l16, unit64);
            q2 = vis_bshuffle(l16, l16);
            q3 = vis_fmuld8ulx16(unit32, vis_read_lo(l16));

            /* prepare the next source word before storing */
            data = vis_ld_d64_nf(in64 + 1);

            ((mlib_d64 *)out)[0] = q0;
            ((mlib_d64 *)out)[1] = q1;
            ((mlib_d64 *)out)[2] = q2;
            ((mlib_d64 *)out)[3] = q3;

            out += 8;
            in += 8;
            in64++;
        }
    }

    /* scalar tail */
    for (; k < n; k++) {
        *out = *in;
        out++;
        in++;
    }

    return (MLIB_SUCCESS);
}
/*
 * __mlib_VectorConvert_S16_S8_Mod - widen an S8 vector to S16.
 *
 *   z  destination vector (S16)
 *   x  source vector (S8)
 *   n  number of elements
 *
 * Returns MLIB_SUCCESS on the vector path.  Vectors shorter than 16
 * elements are handed to the EXPAND macro (defined elsewhere).
 *
 * Vector path: elements are copied one by one until the destination is
 * 8-byte aligned.  Each 8-byte source word then produces two 8-byte
 * destination words through vis_fmul8sux16 against 0x0100 per lane:
 *   - aligned source: the bytes are first spread with
 *     vis_fpmerge(half, zero);
 *   - misaligned source: vis_bshuffle (byte mask built from the source
 *     offset plus 0x04152637) picks the bytes out of two neighbouring
 *     words, and vis_faligndata with align offset 1 derives the second
 *     operand from the first.
 * Source words are handled in pairs, with one word peeled first when the
 * word count is odd.  The final length & 7 elements are copied with a
 * scalar loop.
 *
 * NOTE(review): `i`, `src`, `dst` and `length` keep their names because
 * EXPAND may refer to them - confirm before renaming.
 */
mlib_status
__mlib_VectorConvert_S16_S8_Mod(
    mlib_s16 *z,
    const mlib_s8 *x,
    mlib_s32 n)
{
    mlib_s32 i;
    const mlib_s8 *src = x;
    mlib_s16 *dst = z;
    mlib_d64 *src64, *dst64;            /* 8-byte views of src / dst */
    mlib_d64 scale = vis_to_double_dup(0x01000100);
    mlib_f32 fzero = vis_fzeros();
    mlib_s32 octs, done, tail, length = n, off;
    mlib_d64 r0, r1, r2;                /* raw aligned source words */
    mlib_d64 a_even, a_odd;             /* shuffled forms of the first word */
    mlib_d64 b_even, b_odd;             /* shuffled forms of the second word */

    if (length < 16) {
        EXPAND(mlib_s8, mlib_s16);
    }

    /* scalar copies until the destination is 8-byte aligned */
    while ((mlib_addr)dst & 7) {
        *dst = *src;
        dst++;
        src++;
        length--;
    }

    src64 = (mlib_d64 *)vis_alignaddr((void *)src, 0);
    dst64 = (mlib_d64 *)dst;
    tail = length & 7;          /* elements left for the scalar tail */
    octs = length >> 3;         /* whole 8-element source words */
    done = octs << 3;           /* elements covered by the vector part */
    r2 = src64[0];
    off = (mlib_addr)src & 7;

    if (!off) {
        /* source and destination are both 8-byte aligned */
        i = octs & 1;

        if (i) {
            /* odd word count: peel one word */
            r1 = *src64;
            src64++;
            dst64[0] = vis_fmul8sux16(vis_fpmerge(vis_read_hi(r1),
                fzero), scale);
            dst64[1] = vis_fmul8sux16(vis_fpmerge(vis_read_lo(r1),
                fzero), scale);
            dst64 += 2;
        }

#pragma pipeloop(0)
#pragma unroll(4)
        for (; i < octs; i += 2) {
            r1 = src64[0];
            r2 = src64[1];
            src64 += 2;

            dst64[0] = vis_fmul8sux16(vis_fpmerge(vis_read_hi(r1),
                fzero), scale);
            dst64[1] = vis_fmul8sux16(vis_fpmerge(vis_read_lo(r1),
                fzero), scale);
            dst64[2] = vis_fmul8sux16(vis_fpmerge(vis_read_hi(r2),
                fzero), scale);
            dst64[3] = vis_fmul8sux16(vis_fpmerge(vis_read_lo(r2),
                fzero), scale);
            dst64 += 4;
        }
    } else {
        /* misaligned source: re-program GSR for the shuffle path */
        vis_alignaddr((void *)0, 1);
        vis_write_bmask(0x11111111 * off, 0x04152637);
        i = 1;

        if (octs & 1) {
            /* odd word count: peel one word */
            r1 = r2;
            r2 = vis_ld_d64_nf(src64 + 1);
            i++;
            a_even = vis_bshuffle(r1, r2);
            a_odd = vis_faligndata(a_even, a_even);
            dst64[0] = vis_fmul8sux16(a_even, scale);
            dst64[1] = vis_fmul8sux16(a_odd, scale);
            dst64 += 2;
        }

#pragma pipeloop(0)
#pragma unroll(4)
        for (; i <= octs; i += 2) {
            r0 = r2;
            r1 = vis_ld_d64_nf(src64 + i);
            r2 = vis_ld_d64_nf(src64 + i + 1);

            a_even = vis_bshuffle(r0, r1);
            b_even = vis_bshuffle(r1, r2);
            a_odd = vis_faligndata(a_even, a_even);
            b_odd = vis_faligndata(b_even, b_even);

            dst64[0] = vis_fmul8sux16(a_even, scale);
            dst64[1] = vis_fmul8sux16(a_odd, scale);
            dst64[2] = vis_fmul8sux16(b_even, scale);
            dst64[3] = vis_fmul8sux16(b_odd, scale);
            dst64 += 4;
        }
    }

    /* scalar tail */
    for (i = 0; i < tail; i++) {
        dst[done + i] = src[done + i];
    }

    return (MLIB_SUCCESS);
}
/*
 * __mlib_VectorConvert_S32_U8_Mod - widen a U8 vector to S32.
 *
 *   z  destination vector (S32)
 *   x  source vector (U8)
 *   n  number of elements
 *
 * Returns MLIB_FAILURE when n <= 0, MLIB_SUCCESS otherwise.
 *
 * If the destination is not 8-byte aligned, one element is copied first.
 * After that, 8 source bytes are turned into four 8-byte destination
 * words per iteration:
 *   - misaligned source: vis_bshuffle (byte mask built from the source
 *     offset plus 0x40516273) gathers bytes from two neighbouring words,
 *     vis_fmuld8ulx16 against 0x10001 widens them, and vis_faligndata
 *     with align offset 7 exposes the other four bytes;
 *   - aligned source: vis_fmul8x16al / vis_fpmerge with zero widen the
 *     upper four bytes, vis_fpmerge / vis_fmuld8ulx16 the lower four.
 * Whatever is left (fewer than 8 elements) is copied by the final scalar
 * loop.
 */
mlib_status
__mlib_VectorConvert_S32_U8_Mod(
    mlib_s32 *z,
    const mlib_u8 *x,
    mlib_s32 n)
{
    mlib_u8 *psrc = (mlib_u8 *)x;
    mlib_s32 *pdst = (mlib_s32 *)z;
    mlib_f32 fzero = vis_fzero(), fone1 = vis_to_float(0x100), fone2 =
        vis_to_float(0x10001);
    mlib_d64 *dpsrc, dsrc0, dsrc1, dsrc, dst0, dst1, dst2, dst3;
    mlib_s32 i = 0, off;

    if (n <= 0)
        return (MLIB_FAILURE);

    /* one scalar element makes the destination 8-byte aligned */
    if ((mlib_addr)pdst & 7) {
        (*pdst++) = (*psrc++);
        i = 1;
    }

    dpsrc = (mlib_d64 *)vis_alignaddr(psrc, 0);

    /*
     * Use vis_ld_d64_nf for the first word, as every other read-ahead
     * load in this function does.  After the scalar step above psrc may
     * already be past the last element (n == 1), so a plain dpsrc[0]
     * could read a whole 8-byte word beyond the input.
     */
    dsrc = vis_ld_d64_nf(dpsrc);
    off = (mlib_addr)psrc & 7;

    if (off) {
        /* misaligned source */
        dsrc1 = dsrc;
        vis_alignaddr((void *)0, 7);
        vis_write_bmask(0x11111111 * off, 0x40516273);

#pragma pipeloop(0)
#pragma unroll(2)
        for (; i <= (n - 8); i += 8) {
            dsrc0 = dsrc1;
            dsrc1 = vis_ld_d64_nf(dpsrc + 1);
            dsrc = vis_bshuffle(dsrc0, dsrc1);
            dst0 = vis_fmuld8ulx16(vis_read_hi(dsrc), fone2);
            dst1 = vis_fmuld8ulx16(vis_read_lo(dsrc), fone2);
            /* bring the remaining four bytes into the low lane positions */
            dsrc = vis_faligndata(dsrc, dsrc);
            dst2 = vis_fmuld8ulx16(vis_read_hi(dsrc), fone2);
            dst3 = vis_fmuld8ulx16(vis_read_lo(dsrc), fone2);
            ((mlib_d64 *)pdst)[0] = dst0;
            ((mlib_d64 *)pdst)[1] = dst1;
            ((mlib_d64 *)pdst)[2] = dst2;
            ((mlib_d64 *)pdst)[3] = dst3;
            pdst += 8;
            psrc += 8;
            dpsrc++;
        }
    } else {
        /* aligned source */
#pragma pipeloop(1)
#pragma unroll(1)
        for (; i <= (n - 8); i += 8) {
            dst1 = vis_fmul8x16al(vis_read_hi(dsrc), fone1);
            dst0 = vis_fpmerge(fzero, vis_read_hi(dst1));
            dst1 = vis_fpmerge(fzero, vis_read_lo(dst1));
            dst3 = vis_fpmerge(vis_read_lo(dsrc), vis_read_lo(dsrc));
            dst2 = vis_fmuld8ulx16(vis_read_hi(dst3), fone2);
            dst3 = vis_fmuld8ulx16(vis_read_lo(dst3), fone2);
            /* fetch the next source word before storing */
            dsrc = vis_ld_d64_nf(dpsrc + 1);
            ((mlib_d64 *)pdst)[0] = dst0;
            ((mlib_d64 *)pdst)[1] = dst1;
            ((mlib_d64 *)pdst)[2] = dst2;
            ((mlib_d64 *)pdst)[3] = dst3;
            pdst += 8;
            psrc += 8;
            dpsrc++;
        }
    }

    /* scalar tail */
    for (; i < n; i++)
        (*pdst++) = (*psrc++);

    return (MLIB_SUCCESS);
}
/*
 * mlib_v_ImageLookUpSI_S16_U16_3_D1 - single-input table lookup, S16
 * source to 3-channel U16 destination.
 *
 *   src    source values, one per output pixel
 *   dst    destination; 3 U16 values per source value
 *   xsize  number of source values
 *   table  three lookup tables, one per channel
 *
 * Four source values (12 output values) are handled per step and stored
 * as three 8-byte words:
 *   word 0: p0c0 p0c1 p0c2 p1c0
 *   word 1: p1c1 p1c2 p2c0 p2c1
 *   word 2: p2c2 p3c0 p3c1 p3c2
 * Remaining source values (fewer than 4) are handled by a scalar loop.
 *
 * Each table base is advanced by 32768 entries so that a signed S16
 * source value indexes it directly; the vector path uses the doubled
 * value (<< 1) as a byte offset.
 *
 * NOTE(review): dst is written through a mlib_d64 *, so it is presumably
 * 8-byte aligned - confirm with the caller.
 * NOTE: the three accumulators start without a defined value; with
 * GSR.align = 6 each vis_faligndata brings in one 16-bit value, and a
 * word is stored only after four such steps.
 */
void
mlib_v_ImageLookUpSI_S16_U16_3_D1(
    const mlib_s16 *src,
    mlib_u16 *dst,
    mlib_s32 xsize,
    const mlib_u16 **table)
{
    const mlib_u16 *lut0 = &table[0][32768];
    const mlib_u16 *lut1 = &table[1][32768];
    const mlib_u16 *lut2 = &table[2][32768];
    mlib_s16 *in = (void *)src;         /* next source value */
    mlib_d64 *out = (mlib_d64 *)dst;    /* next destination word */
    mlib_u16 *rest;                     /* scalar tail destination */
    mlib_d64 elem;                      /* one looked-up value */
    mlib_d64 w0, w1, w2;                /* the three words of a group */
    mlib_s32 p0, p1, p2, p3;            /* byte offsets of four pixels */
    mlib_s32 k = 0;

    vis_alignaddr((void *)0, 6);

    if (xsize >= 4) {
        p0 = (in[0] << 1);
        p1 = (in[1] << 1);
        p2 = (in[2] << 1);
        p3 = (in[3] << 1);
        in += 4;

#pragma pipeloop(0)
        for (k = 0; k <= xsize - 8; k += 4, in += 4) {
            elem = VIS_LD_U16_I(lut0, p1);
            w0 = vis_faligndata(elem, w0);
            elem = VIS_LD_U16_I(lut2, p0);
            w0 = vis_faligndata(elem, w0);
            elem = VIS_LD_U16_I(lut1, p0);
            w0 = vis_faligndata(elem, w0);
            elem = VIS_LD_U16_I(lut0, p0);
            w0 = vis_faligndata(elem, w0);

            elem = VIS_LD_U16_I(lut1, p2);
            w1 = vis_faligndata(elem, w1);
            elem = VIS_LD_U16_I(lut0, p2);
            w1 = vis_faligndata(elem, w1);
            elem = VIS_LD_U16_I(lut2, p1);
            w1 = vis_faligndata(elem, w1);
            elem = VIS_LD_U16_I(lut1, p1);
            w1 = vis_faligndata(elem, w1);

            elem = VIS_LD_U16_I(lut2, p3);
            w2 = vis_faligndata(elem, w2);
            elem = VIS_LD_U16_I(lut1, p3);
            w2 = vis_faligndata(elem, w2);
            elem = VIS_LD_U16_I(lut0, p3);
            w2 = vis_faligndata(elem, w2);
            elem = VIS_LD_U16_I(lut2, p2);
            w2 = vis_faligndata(elem, w2);

            /* fetch the next four source values before storing */
            p0 = (in[0] << 1);
            p1 = (in[1] << 1);
            p2 = (in[2] << 1);
            p3 = (in[3] << 1);

            out[0] = w0;
            out[1] = w1;
            out[2] = w2;
            out += 3;
        }

        /* last full group: no further source read */
        elem = VIS_LD_U16_I(lut0, p1);
        w0 = vis_faligndata(elem, w0);
        elem = VIS_LD_U16_I(lut2, p0);
        w0 = vis_faligndata(elem, w0);
        elem = VIS_LD_U16_I(lut1, p0);
        w0 = vis_faligndata(elem, w0);
        elem = VIS_LD_U16_I(lut0, p0);
        w0 = vis_faligndata(elem, w0);

        elem = VIS_LD_U16_I(lut1, p2);
        w1 = vis_faligndata(elem, w1);
        elem = VIS_LD_U16_I(lut0, p2);
        w1 = vis_faligndata(elem, w1);
        elem = VIS_LD_U16_I(lut2, p1);
        w1 = vis_faligndata(elem, w1);
        elem = VIS_LD_U16_I(lut1, p1);
        w1 = vis_faligndata(elem, w1);

        elem = VIS_LD_U16_I(lut2, p3);
        w2 = vis_faligndata(elem, w2);
        elem = VIS_LD_U16_I(lut1, p3);
        w2 = vis_faligndata(elem, w2);
        elem = VIS_LD_U16_I(lut0, p3);
        w2 = vis_faligndata(elem, w2);
        elem = VIS_LD_U16_I(lut2, p2);
        w2 = vis_faligndata(elem, w2);

        out[0] = w0;
        out[1] = w1;
        out[2] = w2;
        out += 3;
        k += 4;
    }

    /* scalar tail: plain indexed lookups */
    rest = (mlib_u16 *)out;

#pragma pipeloop(0)
    for (; k < xsize; k++) {
        p0 = in[0];
        rest[0] = lut0[p0];
        rest[1] = lut1[p0];
        rest[2] = lut2[p0];
        rest += 3;
        in++;
    }
}
void mlib_v_VideoColorYUV2RGB444_all_align( mlib_u8 *rgb, const mlib_u8 *y, const mlib_u8 *u, const mlib_u8 *v, mlib_s32 size) { mlib_u8 *dend; mlib_f32 *sf0, *sf1, *sf2, *pfd, fzero = vis_fzeros(); mlib_s32 i, n, m, emask; mlib_d64 *buff2, pbuff_arr2[BUFF_SIZE + 4]; mlib_d64 tmp_arr64[2]; mlib_d64 k01 = vis_to_double_dup(0x0000f375); mlib_d64 k02 = vis_to_double_dup(0x3317e5fa); mlib_d64 k11 = vis_to_double_dup(0xf3754097); mlib_d64 k12 = vis_to_double_dup(0xe5fa0000); mlib_d64 k21 = vis_to_double_dup(0x40970000); mlib_d64 k22 = vis_to_double_dup(0x00003317); mlib_d64 c_0 = vis_to_double_dup(0xe42010f4); mlib_d64 c_1 = vis_to_double_dup(0x10f4dd60); mlib_d64 c_2 = vis_to_double_dup(0xdd60e420); mlib_d64 k_0 = vis_to_double_dup(0x25432543); do { /* loop on buffer size */ if (size > 2 * BUFF_SIZE) { n = 2 * BUFF_SIZE; } else { n = size; } m = n >> 2; buff2 = pbuff_arr2; sf0 = (mlib_f32 *)y; sf1 = (mlib_f32 *)u; sf2 = (mlib_f32 *)v; dend = rgb + 3 * n - 1; pfd = (mlib_f32 *)rgb; #pragma pipeloop(0) for (i = 0; i < m; i++) { mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0; mlib_f32 x0, x1, x2; mlib_d64 d_0235, d_xx14, d_23xx, d_0145; x0 = (*sf0++); x1 = (*sf1++); x2 = (*sf2++); s_0 = vis_fmul8x16(x0, k_0); s01 = vis_fmul8x16(x1, k01); s11 = vis_fmul8x16(x1, k11); s21 = vis_fmul8x16(x1, k21); s02 = vis_fmul8x16(x2, k02); s12 = vis_fmul8x16(x2, k12); s22 = vis_fmul8x16(x2, k22); s00 = vis_fpadd16(s_0, s01); s10 = vis_fpadd16(s_0, s11); s20 = vis_fpadd16(s_0, s21); s02 = vis_fpadd16(s02, c_0); s12 = vis_fpadd16(s12, c_1); s22 = vis_fpadd16(s22, c_2); s00 = vis_fpadd16(s00, s02); s10 = vis_fpadd16(s10, s12); s20 = vis_fpadd16(s20, s22); d_0235 = vis_fpmerge(vis_fpack16(s00), vis_fpack16(s10)); d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20)); /* * merge buff values to 3-channel array */ d_23xx = vis_faligndata(d_0235, d_0235); d_0145 = vis_bshuffle(d_0235, d_xx14); pfd[0] = vis_read_hi(d_0145); pfd[1] = vis_read_hi(d_23xx); pfd[2] = vis_read_lo(d_0145); buff2 += 
2; pfd += 3; } if ((mlib_u8 *)pfd <= dend) { mlib_d64 d_0235, d_xx14, d_23xx, d_0145; mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64; mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0; mlib_f32 x0, x1, x2; x0 = (*sf0++); x1 = (*sf1++); x2 = (*sf2++); s_0 = vis_fmul8x16(x0, k_0); s01 = vis_fmul8x16(x1, k01); s11 = vis_fmul8x16(x1, k11); s21 = vis_fmul8x16(x1, k21); s02 = vis_fmul8x16(x2, k02); s12 = vis_fmul8x16(x2, k12); s22 = vis_fmul8x16(x2, k22); s00 = vis_fpadd16(s_0, s01); s10 = vis_fpadd16(s_0, s11); s20 = vis_fpadd16(s_0, s21); s02 = vis_fpadd16(s02, c_0); s12 = vis_fpadd16(s12, c_1); s22 = vis_fpadd16(s22, c_2); s00 = vis_fpadd16(s00, s02); s10 = vis_fpadd16(s10, s12); s20 = vis_fpadd16(s20, s22); d_0235 = vis_fpmerge(vis_fpack16(s00), vis_fpack16(s10)); d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20)); d_23xx = vis_faligndata(d_0235, d_0235); d_0145 = vis_bshuffle(d_0235, d_xx14); emask = vis_edge8(pfd, dend); if ((mlib_addr)pfd & 7) { pfd--; tmp_arr32++; } tmp_arr32[0] = vis_read_hi(d_0145); tmp_arr32[1] = vis_read_hi(d_23xx); tmp_arr32[2] = vis_read_lo(d_0145); vis_pst_8(tmp_arr64[0], pfd, emask); pfd += 2; emask = vis_edge8(pfd, dend); if ((mlib_u8 *)pfd <= dend) vis_pst_8(tmp_arr64[1], pfd, emask); } y += n; u += n; v += n; rgb += 3 * n; size -= n; } while (size); }
/*
 * mlib_v_ImageLookUpSI_U16_U8_2_DstA8D1 - single-input table lookup, U16
 * source to 2-channel U8 destination.
 *
 *   src    source values, one per output pixel
 *   dst    destination; two bytes per source value
 *   xsize  number of source values
 *   table  two lookup tables, one per channel
 *
 * Four source values are converted per step into one 8-byte word.  With
 * GSR.align set to 7, each vis_faligndata brings in one looked-up byte,
 * so they are fed in reverse order (last pixel first).  The 1-3
 * leftover pixels are assembled the same way, walking the source
 * backwards, and written with vis_pst_16 under a vis_edge16 mask.
 *
 * NOTE(review): dst is written through a mlib_d64 *, so it is presumably
 * 8-byte aligned - confirm with the caller.
 * NOTE: `acc` starts without a defined value; in the main path a word is
 * stored only after eight bytes have been brought in, and the tail store
 * is masked.
 */
void
mlib_v_ImageLookUpSI_U16_U8_2_DstA8D1(
    const mlib_u16 *src,
    mlib_u8 *dst,
    mlib_s32 xsize,
    const mlib_u8 **table)
{
    const mlib_u8 *lut0 = &table[0][0];
    const mlib_u8 *lut1 = &table[1][0];
    mlib_u16 *in = (void *)src;             /* next source value */
    mlib_u16 *out16 = (mlib_u16 *)dst;      /* dst as 2-byte pixels */
    mlib_d64 *out = (mlib_d64 *)out16;      /* next destination word */
    mlib_u16 *last = out16 + xsize - 1;     /* last destination pixel */
    mlib_s32 p0, p1, p2, p3;                /* four source values */
    mlib_d64 elem;                          /* one looked-up byte */
    mlib_d64 acc;                           /* word being assembled */
    mlib_s32 mask;
    mlib_s32 k, left;

    vis_alignaddr((void *)0, 7);

    if (xsize >= 4) {
        p0 = in[0];
        p1 = in[1];
        p2 = in[2];
        p3 = in[3];
        in += 4;

#pragma pipeloop(0)
        for (k = 0; k <= xsize - 8; k += 4, in += 4) {
            elem = VIS_LD_U8_I(lut1, p3);
            acc = vis_faligndata(elem, acc);
            elem = VIS_LD_U8_I(lut0, p3);
            acc = vis_faligndata(elem, acc);
            elem = VIS_LD_U8_I(lut1, p2);
            acc = vis_faligndata(elem, acc);
            elem = VIS_LD_U8_I(lut0, p2);
            acc = vis_faligndata(elem, acc);
            elem = VIS_LD_U8_I(lut1, p1);
            acc = vis_faligndata(elem, acc);
            elem = VIS_LD_U8_I(lut0, p1);
            acc = vis_faligndata(elem, acc);
            elem = VIS_LD_U8_I(lut1, p0);
            acc = vis_faligndata(elem, acc);
            elem = VIS_LD_U8_I(lut0, p0);
            acc = vis_faligndata(elem, acc);

            /* fetch the next four source values before storing */
            p0 = in[0];
            p1 = in[1];
            p2 = in[2];
            p3 = in[3];

            *out = acc;
            out++;
        }

        /* last full group: no further source read */
        elem = VIS_LD_U8_I(lut1, p3);
        acc = vis_faligndata(elem, acc);
        elem = VIS_LD_U8_I(lut0, p3);
        acc = vis_faligndata(elem, acc);
        elem = VIS_LD_U8_I(lut1, p2);
        acc = vis_faligndata(elem, acc);
        elem = VIS_LD_U8_I(lut0, p2);
        acc = vis_faligndata(elem, acc);
        elem = VIS_LD_U8_I(lut1, p1);
        acc = vis_faligndata(elem, acc);
        elem = VIS_LD_U8_I(lut0, p1);
        acc = vis_faligndata(elem, acc);
        elem = VIS_LD_U8_I(lut1, p0);
        acc = vis_faligndata(elem, acc);
        elem = VIS_LD_U8_I(lut0, p0);
        acc = vis_faligndata(elem, acc);

        *out = acc;
        out++;
    }

    /* leftover pixels: build the word back to front, then masked store */
    if ((mlib_addr)out <= (mlib_addr)last) {
        left = (mlib_u16 *)last - (mlib_u16 *)out;
        in += left;
        left++;

#pragma pipeloop(0)
        for (k = 0; k < left; k++) {
            p0 = (mlib_s32)*in;
            in--;

            elem = VIS_LD_U8_I(lut1, p0);
            acc = vis_faligndata(elem, acc);

            elem = VIS_LD_U8_I(lut0, p0);
            acc = vis_faligndata(elem, acc);
        }

        mask = vis_edge16(out, last);
        vis_pst_16(acc, out, mask);
    }
}
/*
 * mlib_v_ImageLookUpSI_S16_U16_2_DstA8D1 - single-input table lookup, S16
 * source to 2-channel U16 destination.
 *
 *   src    source values, one per output pixel
 *   dst    destination; two U16 values per source value
 *   xsize  number of source values
 *   table  two lookup tables, one per channel
 *
 * Two source values are converted per step into one 8-byte word.  When
 * xsize is odd, the last pixel is assembled in the upper half of the
 * accumulator and written as a single mlib_f32.
 *
 * Each table base is advanced by 32768 entries so that a signed S16
 * source value indexes it directly; the doubled value (<< 1) is the byte
 * offset used by the table loads.
 *
 * NOTE(review): dst is written through a mlib_d64 *, so it is presumably
 * 8-byte aligned - confirm with the caller.
 * NOTE: `acc` starts without a defined value; with GSR.align = 6 each
 * vis_faligndata brings in one 16-bit value, and only the values brought
 * in are stored.
 */
void
mlib_v_ImageLookUpSI_S16_U16_2_DstA8D1(
    const mlib_s16 *src,
    mlib_u16 *dst,
    mlib_s32 xsize,
    const mlib_u16 **table)
{
    const mlib_u16 *lut0 = &table[0][32768];
    const mlib_u16 *lut1 = &table[1][32768];
    mlib_s16 *in = (void *)src;         /* next source value */
    mlib_d64 *out = (mlib_d64 *)dst;    /* next destination word */
    mlib_s32 p0, p1;                    /* byte offsets of two pixels */
    mlib_d64 elem;                      /* one looked-up value */
    mlib_d64 acc;                       /* word being assembled */
    mlib_s32 k;

    vis_alignaddr((void *)0, 6);

    if (xsize >= 2) {
        p0 = (in[0] << 1);
        p1 = (in[1] << 1);
        in += 2;

#pragma pipeloop(0)
        for (k = 0; k <= xsize - 4; k += 2, in += 2) {
            elem = VIS_LD_U16_I(lut1, p1);
            acc = vis_faligndata(elem, acc);
            elem = VIS_LD_U16_I(lut0, p1);
            acc = vis_faligndata(elem, acc);
            elem = VIS_LD_U16_I(lut1, p0);
            acc = vis_faligndata(elem, acc);
            elem = VIS_LD_U16_I(lut0, p0);
            acc = vis_faligndata(elem, acc);

            /* fetch the next two source values before storing */
            p0 = (in[0] << 1);
            p1 = (in[1] << 1);

            *out = acc;
            out++;
        }

        /* last full pair: no further source read */
        elem = VIS_LD_U16_I(lut1, p1);
        acc = vis_faligndata(elem, acc);
        elem = VIS_LD_U16_I(lut0, p1);
        acc = vis_faligndata(elem, acc);
        elem = VIS_LD_U16_I(lut1, p0);
        acc = vis_faligndata(elem, acc);
        elem = VIS_LD_U16_I(lut0, p0);
        acc = vis_faligndata(elem, acc);

        *out = acc;
        out++;
    }

    /* odd pixel: two values, stored as one 4-byte float */
    if ((xsize & 1) != 0) {
        p0 = (in[0] << 1);

        elem = VIS_LD_U16_I(lut1, p0);
        acc = vis_faligndata(elem, acc);
        elem = VIS_LD_U16_I(lut0, p0);
        acc = vis_faligndata(elem, acc);

        *(mlib_f32 *)out = vis_read_hi(acc);
    }
}
/*
 * Two-channel look-up: single-channel S32 source -> interleaved S16 output.
 *
 * The first result (channel 0 of the first sample) and the last result
 * (channel 1 of the last sample) are written with scalar stores; everything
 * in between is written four S16 values at a time through an mlib_d64
 * pointer that starts at dst + 1, so dst + 1 has to be 8-byte aligned.
 *
 *   src    source samples (xsize of them)
 *   dst    destination, two S16 values per source sample
 *   xsize  number of source samples
 *   table  table[0] / table[1] are the per-channel look-up tables
 */
void
mlib_v_ImageLookUpSI_S32_S16_2_D1(
    const mlib_s32 *src,
    mlib_s16 *dst,
    mlib_s32 xsize,
    const mlib_s16 **table)
{
    /* tables biased by 2^31 so that signed samples index them directly */
    const mlib_s16 *lut0 = table[0] + (mlib_u32)2147483648u;
    const mlib_s16 *lut1 = table[1] + (mlib_u32)2147483648u;
    mlib_s32 *in = (void *)src;
    mlib_s16 *tail;
    mlib_d64 *out;
    /* shift register for results; only shifted-in halfwords are stored */
    mlib_d64 quad;
    mlib_s32 prev, cur, next;
    mlib_s32 k;

    /* align offset 6: each faligndata(x, quad) pushes the low 16 bits of x */
    vis_alignaddr((void *)0, 6);

    /* leading scalar element */
    prev = *in++;
    *dst = lut0[prev];
    out = (mlib_d64 *)(dst + 1);
    xsize--;

    if (xsize >= 2) {
        cur = in[0];
        next = in[1];
        in += 2;

        /* each group is: ch1[prev], ch0[cur], ch1[cur], ch0[next] */
#pragma pipeloop(0)
        for (k = 0; k <= xsize - 4; k += 2) {
            quad = vis_faligndata(
                VIS_LD_U16_I(lut0, ((mlib_addr)2 * next)), quad);
            quad = vis_faligndata(
                VIS_LD_U16_I(lut1, ((mlib_addr)2 * cur)), quad);
            quad = vis_faligndata(
                VIS_LD_U16_I(lut0, ((mlib_addr)2 * cur)), quad);
            quad = vis_faligndata(
                VIS_LD_U16_I(lut1, ((mlib_addr)2 * prev)), quad);

            prev = next;
            cur = in[0];
            next = in[1];
            in += 2;

            *out++ = quad;
        }

        /* the group still held in prev / cur / next */
        quad = vis_faligndata(
            VIS_LD_U16_I(lut0, ((mlib_addr)2 * next)), quad);
        quad = vis_faligndata(
            VIS_LD_U16_I(lut1, ((mlib_addr)2 * cur)), quad);
        quad = vis_faligndata(
            VIS_LD_U16_I(lut0, ((mlib_addr)2 * cur)), quad);
        quad = vis_faligndata(
            VIS_LD_U16_I(lut1, ((mlib_addr)2 * prev)), quad);
        prev = next;
        *out++ = quad;
    }

    tail = (mlib_s16 *)out;

    /* one more sample left: emit ch1[prev], ch0[cur] as a 4-byte store */
    if ((xsize & 1) != 0) {
        cur = in[0];
        quad = vis_faligndata(
            VIS_LD_U16_I(lut0, ((mlib_addr)2 * cur)), quad);
        quad = vis_faligndata(
            VIS_LD_U16_I(lut1, ((mlib_addr)2 * prev)), quad);
        *(mlib_f32 *)out = vis_read_hi(quad);
        prev = cur;
        tail += 2;
    }

    /* trailing scalar element: channel 1 of the last sample */
    *tail = lut1[prev];
}
/*
 * Four-channel look-up: single-channel U16 source -> interleaved U8 output,
 * entered three bytes into an output pixel.
 *
 * The byte stream written at dst is
 *   table[3][s0],
 *   table[0][s1], table[1][s1], table[2][s1], table[3][s1],
 *   table[0][s2], ...
 * and ends with table[3][last sample].  Full groups of eight bytes are
 * stored through an mlib_d64 pointer starting at dst, so dst must be
 * 8-byte aligned.
 *
 *   src    source samples; xsize + 1 of them are read
 *   dst    destination bytes
 *   xsize  number of source samples after the first one
 *   table  table[0..3] are the per-channel look-up tables
 */
void
mlib_v_ImageLookUpSI_U16_U8_4_DstOff3_D1(
    const mlib_u16 *src,
    mlib_u8 *dst,
    mlib_s32 xsize,
    const mlib_u8 **table)
{
    const mlib_u8 *lut0 = table[0];
    const mlib_u8 *lut1 = table[1];
    const mlib_u8 *lut2 = table[2];
    const mlib_u8 *lut3 = table[3];
    mlib_u16 *in = (void *)src;
    mlib_d64 *out = (mlib_d64 *)dst;
    mlib_u8 *tail;
    /* shift register for results; only shifted-in bytes are stored */
    mlib_d64 packed;
    mlib_s32 prev, cur, next;
    mlib_s32 k;

    /* align offset 7: each faligndata(x, packed) pushes the low byte of x */
    vis_alignaddr((void *)0, 7);

    prev = *in++;

    if (xsize >= 2) {
        cur = in[0];
        next = in[1];
        in += 2;

        /* group: ch3[prev], ch0..ch3[cur], ch0..ch2[next] */
#pragma pipeloop(0)
        for (k = 0; k <= xsize - 4; k += 2) {
            packed = vis_faligndata(VIS_LD_U8_I(lut2, next), packed);
            packed = vis_faligndata(VIS_LD_U8_I(lut1, next), packed);
            packed = vis_faligndata(VIS_LD_U8_I(lut0, next), packed);
            packed = vis_faligndata(VIS_LD_U8_I(lut3, cur), packed);
            packed = vis_faligndata(VIS_LD_U8_I(lut2, cur), packed);
            packed = vis_faligndata(VIS_LD_U8_I(lut1, cur), packed);
            packed = vis_faligndata(VIS_LD_U8_I(lut0, cur), packed);
            packed = vis_faligndata(VIS_LD_U8_I(lut3, prev), packed);

            prev = next;
            cur = in[0];
            next = in[1];
            in += 2;

            *out++ = packed;
        }

        /* the group still held in prev / cur / next */
        packed = vis_faligndata(VIS_LD_U8_I(lut2, next), packed);
        packed = vis_faligndata(VIS_LD_U8_I(lut1, next), packed);
        packed = vis_faligndata(VIS_LD_U8_I(lut0, next), packed);
        packed = vis_faligndata(VIS_LD_U8_I(lut3, cur), packed);
        packed = vis_faligndata(VIS_LD_U8_I(lut2, cur), packed);
        packed = vis_faligndata(VIS_LD_U8_I(lut1, cur), packed);
        packed = vis_faligndata(VIS_LD_U8_I(lut0, cur), packed);
        packed = vis_faligndata(VIS_LD_U8_I(lut3, prev), packed);
        prev = next;
        *out++ = packed;
    }

    tail = (mlib_u8 *)out;

    /* one more sample: ch3[prev], ch0..ch2[cur] as a 4-byte store */
    if ((xsize & 1) != 0) {
        cur = in[0];
        packed = vis_faligndata(VIS_LD_U8_I(lut2, cur), packed);
        packed = vis_faligndata(VIS_LD_U8_I(lut1, cur), packed);
        packed = vis_faligndata(VIS_LD_U8_I(lut0, cur), packed);
        packed = vis_faligndata(VIS_LD_U8_I(lut3, prev), packed);
        *(mlib_f32 *)tail = vis_read_hi(packed);
        tail += 4;
        prev = cur;
    }

    /* closing byte: channel 3 of the last sample */
    tail[0] = lut3[prev];
}
/*
 * Conjugate and reverse a vector of complex mlib_s16 values with saturation.
 *
 * The scalar parts of this function walk the source forwards and the
 * destination backwards from z + 2 * n: for every (re, im) pair the real
 * part is copied and the imaginary part is negated, with MLIB_S16_MIN
 * mapped to MLIB_S16_MAX.
 *
 *   zz  destination, 2 * n mlib_s16 values
 *   xx  source, 2 * n mlib_s16 values
 *   n   number of complex elements
 *
 * Returns MLIB_SUCCESS on the paths visible here.
 *
 * CHECK, CONJREVC and CONJ16 are macros defined elsewhere.  Presumably
 * CHECK validates the pointers, CONJREVC is a scalar implementation that
 * returns for short vectors, and CONJ16 turns the four halfwords in d3 into
 * the conjugated, pair-reversed result d4 using dl, dh, dlog0, dtwo and
 * f_two -- confirm against the macro definitions.  Because these macros
 * refer to locals by name, the local names must not be changed.
 */
mlib_status
__mlib_VectorConjRev_S16C_S16C_Sat(
    mlib_s16 *zz,
    const mlib_s16 *xx,
    mlib_s32 n)
{
    mlib_s16 *x = (mlib_s16 *)xx, *z = (mlib_s16 *)zz;
    /* src walks forwards; dst starts one past the end and walks backwards */
    mlib_s16 *src = (mlib_s16 *)x, *dst = (mlib_s16 *)&z[2 * n];
    mlib_d64 *dsrc, *ddst;
    mlib_d64 d1, d2, d3, d4, dl, dh, d_rest;
    mlib_d64 dlog0 = vis_to_double_dup(0x0000ffff), dtwo = vis_to_double(0, 2);
    mlib_f32 f_two = vis_to_float(0x20002);
    mlib_s16 c;
    /* length counts mlib_s16 values still to be processed */
    mlib_s32 i, rest_64, len_64, even_length, odd = 0, length = (mlib_s32)n * 2;
    mlib_s32 re_part;

    CHECK(x, z);

    if ((n < 16)) {
        CONJREVC(mlib_s16, MLIB_S16_MAX, MLIB_S16_MIN);
    }

    /*
     * Scalar prologue: emit values until dst is 8-byte aligned.  The
     * imaginary part is written first (higher address), then the real part.
     * If dst becomes aligned between the two, the real part is parked in
     * re_part and odd is set.
     */
    while (((mlib_addr)dst) & 7) {

        if ((c = src[1]) == MLIB_S16_MIN)
            *--dst = MLIB_S16_MAX;
        else
            *--dst = -c;
        length -= 2;
        src += 2;

        if (((mlib_addr)dst) & 7) {
            *--dst = src[-2];
        } else {
            re_part = src[-2];
            odd = 1;
            break;
        }
    }

    vis_write_gsr(15 << 3);
    ddst = (mlib_d64 *)dst;
    /* four mlib_s16 values per 8-byte word */
    rest_64 = length & 3;
    len_64 = length >> 2;
    even_length = len_64 << 2;

    if (!odd) {

        /*
         * Aligning loop finished with imaginary part. The following
         * processing starts with real part.
         */

        if (!((mlib_addr)src & 7)) {

            /*
             * Src address is 8-byte aligned.
             */

            dsrc = (mlib_d64 *)src;
#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d3 = (*dsrc++);
                CONJ16;
                *--ddst = d4;
            }
        } else {
            /* unaligned source: rebuild each word from two aligned loads */
            dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
            d2 = (*dsrc++);
#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d1 = d2;
                d2 = (*dsrc++);
                d3 = vis_faligndata(d1, d2);
                CONJ16;
                *--ddst = d4;
            }
        }
    } else {

        /*
         * Aligning loop finished with real part. The following processing
         * starts with imaginary part.
         */

        if (!((mlib_addr)src & 7)) {

            /*
             * Src address is 8-byte aligned.
             */

            dsrc = (mlib_d64 *)vis_alignaddr(src, 2);
            /* parked real part goes into the top halfword of d_rest */
            d_rest = vis_to_double((re_part << 16), 0);
#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d3 = (*dsrc++);
                CONJ16;
                /* output is shifted by one halfword against the aligned case */
                *--ddst = vis_faligndata(d4, d_rest);
                d_rest = d4;
            }

            ddst--;
            d_rest = vis_faligndata(d_rest, d_rest);
            /* mask 0x1: only the last halfword of the word is written */
            vis_pst_16(d_rest, ddst, 0x1);
        } else {
            dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
            d2 = (*dsrc++);
#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d1 = d2;
                d2 = (*dsrc++);
                d3 = vis_faligndata(d1, d2);
                CONJ16;
                *--ddst = d4;
            }

            /*
             * Second pass: with a 2-byte align offset, move everything just
             * written one halfword towards lower addresses, then drop the
             * parked real part into the freed slot at dst[-1].
             */
            vis_write_gsr(2);
            d2 = *ddst;
            d3 = vis_faligndata(d1, d2);
            vis_pst_16(d3, (ddst - 1), 0x1);
#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d1 = d2;
                d2 = *(ddst + 1);
                (*ddst++) = vis_faligndata(d1, d2);
            }

            dst[-1] = re_part;
        }

        /* account for the real part that was written below dst */
        dst--;
    }

    if (!rest_64)
        return (MLIB_SUCCESS);

    /* scalar epilogue for the values that did not fill an 8-byte word */
    for (i = 0; i < rest_64; i += 2) {
        dst[-even_length - 2 - i] = src[even_length + i];

        if ((c = src[even_length + i + 1]) == MLIB_S16_MIN)
            dst[-even_length - 2 - i + 1] = MLIB_S16_MAX;
        else
            dst[-even_length - 2 - i + 1] = -c;
    }

    return (MLIB_SUCCESS);
}
/*
 * Conjugate and reverse a vector of complex mlib_s8 values with saturation.
 *
 * The scalar parts of this function walk the source forwards and the
 * destination backwards from z + 2 * n: for every (re, im) pair the real
 * part is copied and the imaginary part is negated, with MLIB_S8_MIN
 * mapped to MLIB_S8_MAX.
 *
 *   zz  destination, 2 * n mlib_s8 values
 *   xx  source, 2 * n mlib_s8 values
 *   n   number of complex elements
 *
 * Returns MLIB_SUCCESS on the paths visible here.
 *
 * CHECK, CONJREVC and CONJ8 are macros defined elsewhere.  Presumably CHECK
 * validates the pointers, CONJREVC is a scalar implementation that returns
 * for short vectors, and CONJ8 turns the eight bytes in d3 into the
 * conjugated, pair-reversed result d4 using dl, dh, dcntr0, dxor0, done and
 * f_null -- confirm against the macro definitions.  Because these macros
 * refer to locals by name, the local names must not be changed.
 */
mlib_status
__mlib_VectorConjRev_S8C_S8C_Sat(
    mlib_s8 *zz,
    const mlib_s8 *xx,
    mlib_s32 n)
{
    const mlib_s8 *x = xx;
    mlib_s8 *z = zz;
    /* src walks forwards; dst starts one past the end and walks backwards */
    mlib_s8 *src = (mlib_s8 *)x, *dst = z + 2 * (n);
    mlib_d64 *dsrc, *ddst;
    mlib_d64 d1, d2, d3, d4, dl, dh, d_rest;
    mlib_d64 dcntr0 = vis_to_double_dup(0x00800080);
    mlib_d64 dxor0 = vis_to_double_dup(0x007f007f);
    mlib_d64 done = vis_to_double_dup(1);
    mlib_s8 c;
    /* length counts mlib_s8 values still to be processed */
    mlib_s32 i, rest_64, len_64, even_length, odd = 0, length = (mlib_s32)n * 2;
    mlib_s32 re_part;
    mlib_f32 f_null = vis_to_float(0);

    CHECK(x, z);

    if (n < 8) {
        CONJREVC(mlib_s8, MLIB_S8_MAX, MLIB_S8_MIN);
    }

    /*
     * Scalar prologue: emit values until dst is 8-byte aligned.  The
     * imaginary part is written first (higher address), then the real part.
     * If dst becomes aligned between the two, the real part is parked in
     * re_part and odd is set.
     */
    while (((mlib_addr)dst) & 7) {

        if ((c = src[1]) == MLIB_S8_MIN)
            *--dst = MLIB_S8_MAX;
        else
            *--dst = -c;
        length -= 2;
        src += 2;

        if (((mlib_addr)dst) & 7) {
            *--dst = src[-2];
        } else {
            re_part = src[-2];
            odd = 1;
            break;
        }
    }

    vis_write_gsr(7 << 3);
    ddst = (mlib_d64 *)dst;
    /* eight mlib_s8 values per 8-byte word */
    rest_64 = length & 7;
    len_64 = length >> 3;
    even_length = len_64 << 3;

    if (!odd) {

        /*
         * Aligning loop finished with imaginary part. The following
         * processing starts with real part.
         */

        if (!((mlib_addr)src & 7)) {

            /*
             * Src address is 8-byte aligned.
             */

            dsrc = (mlib_d64 *)src;
#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d3 = (*dsrc++);
                CONJ8;
                *--ddst = d4;
            }
        } else {
            /* unaligned source: rebuild each word from two aligned loads */
            dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
            d2 = (*dsrc++);
#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d1 = d2;
                d2 = (*dsrc++);
                d3 = vis_faligndata(d1, d2);
                CONJ8;
                *--ddst = d4;
            }
        }
    } else {

        /*
         * Aligning loop finished with real part. The following processing
         * starts with imaginary part.
         */

        if (!((mlib_addr)src & 7)) {

            /*
             * Src address is 8-byte aligned.
             */

            dsrc = (mlib_d64 *)vis_alignaddr(src, 1);
            /* parked real part goes into the top byte of d_rest */
            d_rest = vis_to_double((re_part << 24), 0);
#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d3 = (*dsrc++);
                CONJ8;
                /* output is shifted by one byte against the aligned case */
                *--ddst = vis_faligndata(d4, d_rest);
                d_rest = d4;
            }

            ddst--;
            d_rest = vis_faligndata(d_rest, d_rest);
            /* mask 0x1: only the last byte of the word is written */
            vis_pst_8(d_rest, ddst, 0x1);
        } else {
            dsrc = (mlib_d64 *)vis_alignaddr(src, 0);
            d2 = (*dsrc++);
#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d1 = d2;
                d2 = (*dsrc++);
                d3 = vis_faligndata(d1, d2);
                CONJ8;
                *--ddst = d4;
            }

            /*
             * Second pass: with a 1-byte align offset, move everything just
             * written one byte towards lower addresses, then drop the
             * parked real part into the freed slot at dst[-1].
             */
            vis_write_gsr(1);
            d2 = *ddst;
            d3 = vis_faligndata(d1, d2);
            vis_pst_8(d3, (ddst - 1), 0x1);
#pragma pipeloop(0)
            for (i = 0; i < len_64; i++) {
                d1 = d2;
                d2 = *(ddst + 1);
                (*ddst++) = vis_faligndata(d1, d2);
            }

            dst[-1] = re_part;
        }

        /* account for the real part that was written below dst */
        dst--;
    }

    if (!rest_64)
        return (MLIB_SUCCESS);

    /* scalar epilogue for the values that did not fill an 8-byte word */
    for (i = 0; i < rest_64; i += 2) {
        dst[-even_length - 2 - i] = src[even_length + i];

        if ((c = src[even_length + i + 1]) == MLIB_S8_MIN)
            dst[-even_length - 2 - i + 1] = MLIB_S8_MAX;
        else
            dst[-even_length - 2 - i + 1] = -c;
    }

    return (MLIB_SUCCESS);
}
/*
 * Convert one row of interleaved 3-channel U8 pixels to a single U8 channel.
 *
 *   src     source bytes, three per output byte
 *   dst     destination, dsize bytes; no alignment is required because the
 *           first and last 8-byte words are written with masked stores
 *   dsize   number of destination bytes
 *   weight  three channel weights; each is multiplied by 8192 before use
 *
 * CHANNELSEPARATE_U8 and CHANNELWEIGHT_U8 are macros defined elsewhere.
 * Presumably the first de-interleaves 24 source bytes into per-channel
 * vectors (using mask0..mask3) and the second forms the weighted sum of
 * eight pixels (using alpha, beta, gamma, d32, e32, ddt) -- confirm against
 * the macro definitions.  Because these macros refer to locals by name, the
 * local names must not be changed.
 *
 * NOTE(review): the source is read with plain 8-byte loads, 32 bytes up
 * front and 24 more per step, so reads may extend past the last source
 * pixel -- confirm that callers guarantee readable padding.
 */
void
mlib_v_ImageColorRGB2Mono_U8_D1(
    const mlib_u8 *src,
    mlib_u8 *dst,
    mlib_s32 dsize,
    const mlib_d64 *weight)
{
    mlib_u8 *dst_end;
    mlib_d64 dd, d0, d1, d2, d3;
    mlib_d64 rgdd0, bdd0, rgdd1, bdd1, ddt;
    mlib_d64 *src_all, *dp;
    mlib_f32 d32, e32, alpha, gamma, beta;
    mlib_d64 sd0, sd1, sd2;
    mlib_s32 i, emask;
    mlib_s32 off;
    mlib_s32 mask0 = 0x0369147a;
    mlib_s32 mask1 = 0x258b258b;
    mlib_s32 mask2 = 0x47ad58be;
    mlib_s32 mask3 = 0x69cf69cf;

    /* prepare the weight */
    alpha = vis_to_float(weight[0] * 8192);
    beta = vis_to_float(weight[1] * 8192);
    gamma = vis_to_float(weight[2] * 8192);

    vis_write_gsr(2 << 3);

    /* round dst down to an 8-byte boundary; off is zero or negative */
    dp = (mlib_d64 *)((mlib_addr)dst & (~7));
    off = (mlib_addr)dp - (mlib_addr)dst;
    dst_end = dst + (dsize - 1);
    /* mask that protects the bytes in front of dst in the first word */
    emask = vis_edge8(dst, dst_end);

    /* back the source up by three bytes per skipped destination byte */
    src_all = vis_alignaddr((void *)src, (3 * off));
    d0 = (*src_all++);
    d1 = (*src_all++);
    d2 = (*src_all++);
    d3 = (*src_all++);
    sd0 = vis_faligndata(d0, d1);
    sd1 = vis_faligndata(d1, d2);
    sd2 = vis_faligndata(d2, d3);

    CHANNELSEPARATE_U8(sd0, sd1, sd2, rgdd0, bdd0, rgdd1, bdd1);
    CHANNELWEIGHT_U8(rgdd0, bdd0, rgdd1, bdd1, dd);

    /* first, possibly partial, destination word */
    vis_pst_8(dd, dp, emask);
    dp++;

    /* full destination words; d3 carries over as the next d0 */
#pragma pipeloop(0)
    for (i = 8 + off; i <= (dsize - 8); i += 8) {
        d0 = d3;
        d1 = (*src_all++);
        d2 = (*src_all++);
        d3 = (*src_all++);
        sd0 = vis_faligndata(d0, d1);
        sd1 = vis_faligndata(d1, d2);
        sd2 = vis_faligndata(d2, d3);

        CHANNELSEPARATE_U8(sd0, sd1, sd2, rgdd0, bdd0, rgdd1, bdd1);
        CHANNELWEIGHT_U8(rgdd0, bdd0, rgdd1, bdd1, dd);

        (*dp++) = dd;
    }

    /* last, partial, destination word */
    if ((mlib_addr)dp <= (mlib_addr)dst_end) {
        emask = vis_edge8(dp, dst_end);
        d0 = d3;
        d1 = (*src_all++);
        d2 = (*src_all++);
        d3 = (*src_all++);
        sd0 = vis_faligndata(d0, d1);
        sd1 = vis_faligndata(d1, d2);
        sd2 = vis_faligndata(d2, d3);

        CHANNELSEPARATE_U8(sd0, sd1, sd2, rgdd0, bdd0, rgdd1, bdd1);
        CHANNELWEIGHT_U8(rgdd0, bdd0, rgdd1, bdd1, dd);

        vis_pst_8(dd, dp, emask);
    }
}
/*
 * YUV 4:4:4 -> interleaved RGB conversion for arbitrarily aligned planes.
 *
 * Each row is converted into an 8-byte aligned scratch buffer and then
 * copied to rgb with __mlib_VectorCopy_U8, so rgb itself needs no alignment.
 * The row loop is software-pipelined: the arithmetic for the next eight
 * pixels is issued before the current eight are shuffled and stored.
 *
 *   rgb         destination, width * 3 bytes per row
 *   y, u, v     source planes, width bytes per row each
 *   width       pixels per row
 *   height      number of rows
 *   rgb_stride  byte distance between destination rows
 *   yuv_stride  byte distance between source rows (shared by y, u and v)
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE when the scratch buffer for a wide
 * row cannot be allocated.
 *
 * STORE_PIXEL is a macro defined elsewhere; presumably it writes one RGB
 * triple at dp from red/green/blue and advances those vectors -- confirm
 * against its definition.  It refers to locals by name, so the local names
 * must not be changed.  vis_fpack16_pair depends on the GSR scale factor,
 * which is not set in this function -- presumably the caller sets it.
 *
 * NOTE(review): BUFF holds 16 * 1024 mlib_d64 values (128 KB of stack)
 * although it is only selected when width * 3 is at most 16 * 1024 bytes;
 * it looks oversized -- confirm before shrinking.
 */
static mlib_status
mlib_v_VideoColorYUV2RGB444_nonalign(
    mlib_u8 *rgb,
    const mlib_u8 *y,
    const mlib_u8 *u,
    const mlib_u8 *v,
    mlib_s32 width,
    mlib_s32 height,
    mlib_s32 rgb_stride,
    mlib_s32 yuv_stride)
{
    /* 8-byte aligned read pointers into y, u, v */
    mlib_d64 *spy, *dfu, *dfv;

    /* y data */
    mlib_d64 dy0, dy1, dy3;
    mlib_d64 du, dv, du0, du1, dv0, dv1;

    /* (1.1644, 1.5966)*8192 */
    mlib_f32 k12 = vis_to_float(0x25433317);

    /* (-.3920, -.8132)*8192 */
    mlib_f32 k34 = vis_to_float(0xf375e5fa);

    /* 2.0184*8192 */
    mlib_f32 k5 = vis_to_float(0x1004097);
    /* constant offsets for the red, green and blue sums */
    mlib_d64 k_222_9952 = vis_to_double_dup(0x1be01be0);
    mlib_d64 k_135_6352 = vis_to_double_dup(0x10f410f4);
    mlib_d64 k_276_9856 = vis_to_double_dup(0x22a022a0);
    mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi;
    mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo;
    mlib_d64 y_11644_hi, y_11644_lo;
    mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo;
    mlib_d64 red, green, blue, *ddp, dd0, dd1, dd2;

    /* loop variable */
    mlib_s32 i, j;
    mlib_d64 *buf, BUFF[16 * 1024];
    mlib_u8 *tmp, *dp;

    /* wide rows get a heap buffer, rounded up to an 8-byte boundary */
    if (width * 3 > 16 * 1024) {
        tmp = __mlib_malloc(width * 3 * sizeof (mlib_u8) + 7);

        if (tmp == NULL)
            return (MLIB_FAILURE);
        buf = (mlib_d64 *)((mlib_addr)(tmp + 7) & ~7);
    } else {
        buf = (mlib_d64 *)BUFF;
    }

    dp = (mlib_u8 *)buf;
    ddp = (mlib_d64 *)dp;

    for (j = 0; j < height; j++) {
        /* prime the pipeline: first eight u, v and y samples of the row */
        dfu = (mlib_d64 *)vis_alignaddr((void *)u, 0);
        du0 = (*dfu++);
        du1 = vis_ld_d64_nf(dfu);
        dfu++;
        du = vis_faligndata(du0, du1);
        du0 = du1;
        dfv = (mlib_d64 *)vis_alignaddr((void *)v, 0);
        dv0 = (*dfv++);
        dv1 = vis_ld_d64_nf(dfv);
        dfv++;
        dv = vis_faligndata(dv0, dv1);
        dv0 = dv1;

        /* U*(-0.3920); */
        u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);

        /* V*(-0.8132); */
        v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);

        /* U*(-0.3920); */
        u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);

        /* V*(-0.8132); */
        v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);

        spy = (mlib_d64 *)vis_alignaddr((void *)y, 0);
        dy0 = (*spy++);
        dy3 = vis_ld_d64_nf(spy);
        spy++;
        dy1 = vis_faligndata(dy0, dy3);
        dy0 = dy3;

        /* U*2.0184 */
        u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
        g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

        u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
        g_hi = vis_fpadd16(g_hi, k_135_6352);

        /* V*1.5966 */
        v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
        g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

        v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
        g_lo = vis_fpadd16(g_lo, k_135_6352);

        /* the align offset is re-set before each plane is read */
        vis_alignaddr((void *)u, 0);
        du1 = vis_ld_d64_nf(dfu);
        dfu++;
        du = vis_faligndata(du0, du1);
        du0 = du1;

        /* Y*1.1644 */
        y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
        b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

        vis_alignaddr((void *)v, 0);
        dv1 = vis_ld_d64_nf(dfv);
        dfv++;
        dv = vis_faligndata(dv0, dv1);
        dv0 = dv1;

        /* Y*1.1644 */
        y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
        b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

        /* U*(-0.3920); */
        u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
        r_hi = vis_fpsub16(v_15966_hi, k_222_9952);

        /* V*(-0.8132); */
        v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
        r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

        u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
        g_hi = vis_fpadd16(g_hi, y_11644_hi);

        v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
        g_lo = vis_fpadd16(g_lo, y_11644_lo);

        green = vis_fpack16_pair(g_hi, g_lo);
        b_hi = vis_fpadd16(b_hi, y_11644_hi);
        b_lo = vis_fpadd16(b_lo, y_11644_lo);

        blue = vis_fpack16_pair(b_hi, b_lo);
        r_hi = vis_fpadd16(r_hi, y_11644_hi);
        r_lo = vis_fpadd16(r_lo, y_11644_lo);

        red = vis_fpack16_pair(r_hi, r_lo);

        vis_alignaddr((void *)y, 0);
        dy3 = vis_ld_d64_nf(spy);
        spy++;
        dy1 = vis_faligndata(dy0, dy3);
        dy0 = dy3;

        /*
         * Steady state: interleave the red/green/blue vectors computed in
         * the previous step into 24 output bytes, then compute the next
         * eight pixels.
         */
#pragma pipeloop(0)
        for (i = 0; i <= width - 8; i += 8) {
            /* merge red and green first, then blend blue in */
            vis_write_bmask(0x0801902A, 0);
            dd0 = vis_bshuffle(red, green);
            vis_write_bmask(0x03B04C05, 0);
            dd1 = vis_bshuffle(red, green);
            vis_write_bmask(0xD06E07F0, 0);
            dd2 = vis_bshuffle(red, green);
            vis_write_bmask(0x01834967, 0);
            ddp[0] = vis_bshuffle(dd0, blue);
            vis_write_bmask(0xA12B45C7, 0);
            ddp[1] = vis_bshuffle(dd1, blue);
            vis_write_bmask(0x0D23E56F, 0);
            ddp[2] = vis_bshuffle(dd2, blue);

            /* U*2.0184 */
            u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5);
            g_hi = vis_fpadd16(u_3920_hi, v_8132_hi);

            u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5);
            g_hi = vis_fpadd16(g_hi, k_135_6352);

            /* V*1.5966 */
            v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12);
            g_lo = vis_fpadd16(u_3920_lo, v_8132_lo);

            v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12);
            g_lo = vis_fpadd16(g_lo, k_135_6352);

            vis_alignaddr((void *)u, 0);
            du1 = vis_ld_d64_nf(dfu);
            dfu++;
            du = vis_faligndata(du0, du1);
            du0 = du1;

            /* Y*1.1644 */
            y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12);
            b_hi = vis_fpsub16(u_20184_hi, k_276_9856);

            vis_alignaddr((void *)v, 0);
            dv1 = vis_ld_d64_nf(dfv);
            dfv++;
            dv = vis_faligndata(dv0, dv1);
            dv0 = dv1;

            /* Y*1.1644 */
            y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12);
            b_lo = vis_fpsub16(u_20184_lo, k_276_9856);

            /* U*(-0.3920); */
            u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34);
            r_hi = vis_fpsub16(v_15966_hi, k_222_9952);

            /* V*(-0.8132); */
            v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34);
            r_lo = vis_fpsub16(v_15966_lo, k_222_9952);

            u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34);
            g_hi = vis_fpadd16(g_hi, y_11644_hi);

            v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34);
            g_lo = vis_fpadd16(g_lo, y_11644_lo);

            green = vis_fpack16_pair(g_hi, g_lo);
            b_hi = vis_fpadd16(b_hi, y_11644_hi);
            b_lo = vis_fpadd16(b_lo, y_11644_lo);

            blue = vis_fpack16_pair(b_hi, b_lo);
            r_hi = vis_fpadd16(r_hi, y_11644_hi);
            r_lo = vis_fpadd16(r_lo, y_11644_lo);

            red = vis_fpack16_pair(r_hi, r_lo);

            vis_alignaddr((void *)y, 0);
            dy3 = vis_ld_d64_nf(spy);
            spy++;
            dy1 = vis_faligndata(dy0, dy3);
            dy0 = dy3;

            ddp += 3;
        }

        /*
         * Row tail (fewer than eight pixels left): rotate each colour
         * vector by the number of remaining pixels, then store the pixels
         * one at a time, last pixel first.
         */
        dp = (mlib_u8 *)ddp;

        vis_alignaddr((void *)(width - i), 0);
        blue = vis_faligndata(blue, blue);
        green = vis_faligndata(green, green);
        red = vis_faligndata(red, red);
        dp += ((width - i - 1) * 3);

        vis_alignaddr((void *)spy, 7);
        for (; i < width; i++) {
            STORE_PIXEL(0, 1, 2);
            dp -= 3;
        }

        /* move the finished row from the scratch buffer to its destination */
        __mlib_VectorCopy_U8(rgb, (mlib_u8 *)buf, width * 3);

        rgb += rgb_stride;
        dp = (mlib_u8 *)buf;
        ddp = (mlib_d64 *)dp;

        y += yuv_stride;
        u += yuv_stride;
        v += yuv_stride;
    }

    /* same condition as the allocation above, so tmp is valid here */
    if (width * 3 > 16 * 1024)
        __mlib_free(tmp);

    return (MLIB_SUCCESS);
}
/*
 * Two-channel look-up: single-channel U16 source -> interleaved S16 output.
 *
 * The first result (channel 0 of the first sample) and the last result
 * (channel 1 of the last sample) are written with scalar stores; everything
 * in between is written four S16 values at a time through an mlib_d64
 * pointer that starts at dst + 1, so dst + 1 has to be 8-byte aligned.
 *
 *   src    source samples (xsize of them)
 *   dst    destination, two S16 values per source sample
 *   xsize  number of source samples
 *   table  table[0] / table[1] are the per-channel look-up tables
 */
void
mlib_v_ImageLookUpSI_U16_S16_2_D1(
    const mlib_u16 *src,
    mlib_s16 *dst,
    mlib_s32 xsize,
    const mlib_s16 **table)
{
    const mlib_s16 *lut0 = table[0];
    const mlib_s16 *lut1 = table[1];
    mlib_u16 *in = (void *)src;
    mlib_s16 *tail;
    mlib_d64 *out;
    /*
     * Two shift registers; each result word is built from the upper halves
     * of both.  Only shifted-in halfwords are ever stored.
     */
    mlib_d64 front, back;
    /* prev / cur / next hold byte offsets (sample index times two) */
    mlib_s32 prev, cur, next;
    mlib_s32 k;

    /* align offset 6: each faligndata(x, r) pushes the low 16 bits of x */
    vis_alignaddr((void *)0, 6);

    /* leading scalar element, looked up with the plain index */
    prev = *in++;
    *dst = lut0[prev];
    out = (mlib_d64 *)(dst + 1);
    xsize--;
    prev = prev << 1;

    if (xsize >= 2) {
        cur = in[0] << 1;
        next = in[1] << 1;
        in += 2;

        /* bshuffle picks bytes 0-3 of its first and 0-3 of its second arg */
        vis_write_bmask(0x012389ab, 0);

        /* each group is: ch1[prev], ch0[cur], ch1[cur], ch0[next] */
#pragma pipeloop(0)
        for (k = 0; k <= xsize - 4; k += 2) {
            back = vis_faligndata(VIS_LD_U16_I(lut0, next), back);
            back = vis_faligndata(VIS_LD_U16_I(lut1, cur), back);
            front = vis_faligndata(VIS_LD_U16_I(lut0, cur), front);
            front = vis_faligndata(VIS_LD_U16_I(lut1, prev), front);

            prev = next;
            cur = in[0] << 1;
            next = in[1] << 1;
            in += 2;

            *out++ = vis_bshuffle(front, back);
        }

        /* the group still held in prev / cur / next */
        back = vis_faligndata(VIS_LD_U16_I(lut0, next), back);
        back = vis_faligndata(VIS_LD_U16_I(lut1, cur), back);
        front = vis_faligndata(VIS_LD_U16_I(lut0, cur), front);
        front = vis_faligndata(VIS_LD_U16_I(lut1, prev), front);
        prev = next;
        *out++ = vis_bshuffle(front, back);
    }

    tail = (mlib_s16 *)out;

    /* one more sample left: emit ch1[prev], ch0[cur] as a 4-byte store */
    if ((xsize & 1) != 0) {
        cur = in[0] << 1;
        front = vis_faligndata(VIS_LD_U16_I(lut0, cur), front);
        front = vis_faligndata(VIS_LD_U16_I(lut1, prev), front);
        *(mlib_f32 *)out = vis_read_hi(front);
        prev = cur;
        tail += 2;
    }

    /* trailing scalar element: back from byte offset to plain index */
    prev = prev >> 1;
    *tail = lut1[prev];
}
/*
 * Convert a vector of mlib_u8 values to mlib_s16 (zero extension).
 *
 *   z  destination, n mlib_s16 values
 *   x  source, n mlib_u8 values
 *   n  number of elements
 *
 * Returns MLIB_SUCCESS on the paths visible here.
 *
 * Layout: a scalar prologue aligns the destination to 8 bytes, the main
 * part expands eight source bytes into two 8-byte destination words per
 * step, and a scalar epilogue handles the last (length & 7) elements.
 *
 * Two equivalent ways of widening four bytes are mixed in the main loops:
 * vis_fpmerge with a zero operand, and vis_fmul8x16al by 0x100.
 *
 * EXPAND is a macro defined elsewhere; presumably it is a scalar
 * implementation that returns for short vectors -- confirm against its
 * definition.  It takes only the two types as arguments, so it must refer
 * to locals by name; the local names must not be changed.
 */
mlib_status
__mlib_VectorConvert_S16_U8_Mod(
    mlib_s16 *z,
    const mlib_u8 *x,
    mlib_s32 n)
{
    mlib_s32 i;
    const mlib_u8 *src = x;
    mlib_s16 *dst = z;
    mlib_d64 *ddsrc, *ddst;
    mlib_s32 len_64, even_length, rest_64, length = n;
    mlib_f32 fzero = vis_fzeros();
    mlib_d64 dd1, dd2, dd3, dd4;
    mlib_f32 fm = vis_to_float(0x100);

    if (length < 16) {
        EXPAND(mlib_u8, mlib_s16);
    }

    /* scalar prologue: bring dst to an 8-byte boundary */
    while ((mlib_addr)dst & 7) {
        (*dst++) = (*src++);
        length--;
    }

    ddsrc = (mlib_d64 *)vis_alignaddr((void *)src, 0);
    ddst = (mlib_d64 *)dst;
    /* eight source bytes per main-loop unit */
    rest_64 = length & 7;
    len_64 = length >> 3;
    even_length = len_64 << 3;
    dd2 = ddsrc[0];

    if (!((mlib_addr)src & 7)) {

        /*
         * Both vectors are 64-bit aligned. We can process without
         * vis_faligndata.
         * Peeling the 1 iteration. Then loop with step==2.
         */

        /* i becomes 1 when a single unit has to be peeled, 0 otherwise */
        if (i = (len_64 & 1)) {
            dd1 = (*ddsrc++);
            (*ddst++) = vis_fpmerge(fzero, vis_read_hi(dd1));
            (*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd1));
        }
#pragma pipeloop(1)
#pragma unroll(1)
        for (; i < len_64; i += 2) {
            dd1 = (*ddsrc++);
            dd2 = (*ddsrc++);
            (*ddst++) = vis_fmul8x16al(vis_read_hi(dd1), fm);
            (*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd1));
            (*ddst++) = vis_fmul8x16al(vis_read_hi(dd2), fm);
            (*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd2));
        }
    } else {

        /*
         * Source vector is not 64-bit aligned. Use vis_faligndata.
         * Peeling the 1 iteration. Then loop with step==2.
         */

        /* i indexes the next aligned source word; word 0 is already in dd2 */
        i = 1;

        if (len_64 & 1) {
            dd1 = dd2;
            dd2 = vis_ld_d64_nf(ddsrc + 1);
            i++;
            dd3 = vis_faligndata(dd1, dd2);
            (*ddst++) = vis_fpmerge(fzero, vis_read_hi(dd3));
            (*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd3));
        }
#pragma pipeloop(0)
#pragma unroll(2)
        for (; i <= len_64; i += 2) {
            dd1 = dd2;
            dd2 = vis_ld_d64_nf(ddsrc + i);
            dd3 = vis_faligndata(dd1, dd2);
            dd1 = dd2;
            dd2 = vis_ld_d64_nf(ddsrc + i + 1);
            dd4 = vis_faligndata(dd1, dd2);
            (*ddst++) = vis_fmul8x16al(vis_read_hi(dd3), fm);
            (*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd3));
            (*ddst++) = vis_fmul8x16al(vis_read_hi(dd4), fm);
            (*ddst++) = vis_fpmerge(fzero, vis_read_lo(dd4));
        }
    }

    /* scalar epilogue: elements that did not fill a whole unit */
    for (i = 0; i < rest_64; i++)
        dst[even_length + i] = src[even_length + i];

    return (MLIB_SUCCESS);
}
/*
 * Build a block from a reference block by combining each reference row
 * with the same row shifted by one byte.
 *
 * Only the width == 8 path is present in this part of the file; the
 * definition is cut off after the opening of the following else branch.
 *
 *   curr_block    destination block; written with 8-byte stores
 *   ref_block     reference block, any alignment
 *   width         block width; 8 selects the path shown here
 *   height        block height; the visible path handles four rows per step
 *   frame_stride  not referenced in the visible part
 *   field_stride  byte distance between rows, applied here to both the
 *                 reference and the destination pointer
 *
 * MLIB_V_VIDEOINTERP and MLIB_V_VIDEOINTERP4 are macros defined elsewhere;
 * presumably they combine their last two arguments into the first using
 * fm1, fzero, dzero and rounder -- confirm against their definitions.
 *
 * NOTE(review): destination rows are advanced by field_stride rather than
 * frame_stride -- confirm this is intended.
 * NOTE(review): the unaligned branch uses MLIB_V_VIDEOINTERP for rows 0-2
 * and MLIB_V_VIDEOINTERP4 for row 3 -- confirm this is intended.
 */
mlib_status
__mlib_VideoInterpX_U8_U8(
    mlib_u8 *curr_block,
    const mlib_u8 *ref_block,
    mlib_s32 width,
    mlib_s32 height,
    mlib_s32 frame_stride,
    mlib_s32 field_stride)
{
    mlib_d64 s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, *sd, *dd;
    mlib_d64 dzero = vis_fzero();
    const mlib_f32 fm1 = vis_to_float(0x100);
    mlib_f32 fzero = vis_read_hi(dzero);
    mlib_d64 rounder = vis_fone();
    mlib_s32 y;

    /* GSR: scale factor 6, align offset = low three bits of ref_block */
    vis_write_gsr((6 << 3) + ((mlib_u32)ref_block & 7));

    dd = (mlib_d64 *)curr_block;
    /* reference pointer rounded down to an 8-byte boundary */
    sd = (mlib_d64 *)((mlib_addr)ref_block & ~7);

    if (width == 8) {
        /* four rows per iteration */
        y = height >> 2;

        if (((mlib_s32)(ref_block + 1) & 7)) {
            /* ref_block + 1 is not 8-byte aligned */
            do {
                /* rows at ref_block's own alignment */
                s0 = sd[0];
                s1 = sd[1];
                sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
                d0 = vis_faligndata(s0, s1);
                s2 = sd[0];
                s3 = sd[1];
                sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
                d1 = vis_faligndata(s2, s3);
                s4 = sd[0];
                s5 = sd[1];
                sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
                d2 = vis_faligndata(s4, s5);
                s6 = sd[0];
                s7 = sd[1];
                sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
                d3 = vis_faligndata(s6, s7);

                /* same rows again, one byte further along */
                vis_alignaddr((void *)(ref_block + 1), 0);

                s0 = vis_faligndata(s0, s1);
                s1 = vis_faligndata(s2, s3);
                s2 = vis_faligndata(s4, s5);
                s3 = vis_faligndata(s6, s7);

                MLIB_V_VIDEOINTERP(d0, d0, s0);
                MLIB_V_VIDEOINTERP(d1, d1, s1);
                MLIB_V_VIDEOINTERP(d2, d2, s2);
                MLIB_V_VIDEOINTERP4(d3, d3, s3);

                *dd = d0;
                dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
                *dd = d1;
                dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
                *dd = d2;
                dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
                *dd = d3;
                dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);

                /* restore the align offset for the next four rows */
                vis_alignaddr((void *)ref_block, 0);
            } while (--y);
        } else {
            /*
             * ref_block + 1 is 8-byte aligned, so the shifted row is
             * exactly the second aligned word (s1, s3, s5, s7).
             */
            do {
                s0 = sd[0];
                s1 = sd[1];
                sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
                d0 = vis_faligndata(s0, s1);
                s2 = sd[0];
                s3 = sd[1];
                sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
                d1 = vis_faligndata(s2, s3);
                s4 = sd[0];
                s5 = sd[1];
                sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
                d2 = vis_faligndata(s4, s5);
                s6 = sd[0];
                s7 = sd[1];
                sd = (mlib_d64 *)((mlib_u8 *)sd + field_stride);
                d3 = vis_faligndata(s6, s7);

                MLIB_V_VIDEOINTERP4(d0, d0, s1);
                MLIB_V_VIDEOINTERP4(d1, d1, s3);
                MLIB_V_VIDEOINTERP4(d2, d2, s5);
                MLIB_V_VIDEOINTERP4(d3, d3, s7);

                *dd = d0;
                dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
                *dd = d1;
                dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
                *dd = d2;
                dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
                *dd = d3;
                dd = (mlib_d64 *)((mlib_u8 *)dd + field_stride);
            } while (--y);
        }
    } else {
        /* the remainder of this function is not present in the file here */
/*
 * Four-channel look-up: single-channel U16 source -> interleaved S16
 * output, entered three values into an output pixel.
 *
 * The S16 stream written at dst is
 *   table[3][s0],
 *   table[0][s1], table[1][s1], table[2][s1], table[3][s1],
 *   table[0][s2], ...
 * and ends with table[3][last sample].  Groups of four values are stored
 * through an mlib_d64 pointer starting at dst, so dst must be 8-byte
 * aligned.
 *
 *   src    source samples; xsize + 1 of them are consumed when xsize >= 1
 *   dst    destination S16 values
 *   xsize  number of source samples after the first one
 *   table  table[0..3] are the per-channel look-up tables
 */
void
mlib_v_ImageLookUpSI_U16_S16_4_DstOff3_D1(
    const mlib_u16 *src,
    mlib_s16 *dst,
    mlib_s32 xsize,
    const mlib_s16 **table)
{
    const mlib_s16 *lut0 = table[0];
    const mlib_s16 *lut1 = table[1];
    const mlib_s16 *lut2 = table[2];
    const mlib_s16 *lut3 = table[3];
    mlib_u16 *in = (void *)src;
    mlib_d64 *out = (mlib_d64 *)dst;
    mlib_s16 *tail;
    /*
     * Two shift registers; each result word is built from the upper halves
     * of both.  Only shifted-in halfwords are ever stored.
     */
    mlib_d64 front, back;
    /* byte offsets into the tables (sample index times two) */
    mlib_s32 prev, cur;
    mlib_s32 k;

    /* align offset 6: each faligndata(x, r) pushes the low 16 bits of x */
    vis_alignaddr((void *)0, 6);

    prev = (*in++) << 1;

    if (xsize >= 1) {
        cur = (*in++) << 1;

        /* bshuffle picks bytes 0-3 of its first and 0-3 of its second arg */
        vis_write_bmask(0x012389ab, 0);

        /* each group is: ch3[prev], ch0[cur], ch1[cur], ch2[cur] */
#pragma pipeloop(0)
        for (k = 0; k <= xsize - 2; k++) {
            back = vis_faligndata(VIS_LD_U16_I(lut2, cur), back);
            back = vis_faligndata(VIS_LD_U16_I(lut1, cur), back);
            front = vis_faligndata(VIS_LD_U16_I(lut0, cur), front);
            front = vis_faligndata(VIS_LD_U16_I(lut3, prev), front);

            prev = cur;
            cur = (*in++) << 1;

            *out++ = vis_bshuffle(front, back);
        }

        /* the group still held in prev / cur */
        back = vis_faligndata(VIS_LD_U16_I(lut2, cur), back);
        back = vis_faligndata(VIS_LD_U16_I(lut1, cur), back);
        front = vis_faligndata(VIS_LD_U16_I(lut0, cur), front);
        front = vis_faligndata(VIS_LD_U16_I(lut3, prev), front);
        prev = cur;
        *out++ = vis_bshuffle(front, back);
    }

    /* closing value: channel 3 of the last sample, by plain index */
    tail = (mlib_s16 *)out;
    prev = prev >> 1;
    tail[0] = lut3[prev];
}