void
mlib_ImageLineXor8000(
    const mlib_u8 *src,
    mlib_u8 *dst,
    mlib_s32 size)
{
	mlib_u8 *dend;
	mlib_d64 *dptr;
	mlib_d64 *sptr;
	mlib_d64 s0, s1;
	mlib_d64 mask8000 = vis_to_double_dup(0x80008000);
	mlib_s32 j;
	mlib_s32 emask;

/* prepare the destination addresses */
	dptr = (mlib_d64 *)((mlib_addr)dst & (~7));
	j = (mlib_addr)dptr - (mlib_addr)dst;
	dend = (mlib_u8 *)dst + size - 1;

/* prepare the source address */
	sptr = (mlib_d64 *)VIS_ALIGNADDR(src, j);

/* generate edge mask for the start point */
	emask = vis_edge8(dst, dend);

	s1 = vis_ld_d64_nf(sptr);

	if (emask != 0xff) {
		s0 = s1;
		s1 = vis_ld_d64_nf(sptr + 1);
		s0 = vis_fxor(vis_faligndata(s0, s1), mask8000);
		vis_pst_8(s0, dptr++, emask);
		sptr++;
		j += 8;
	}

#pragma pipeloop(0)
	for (; j <= (size - 16); j += 8) {
		s0 = s1;
		s1 = sptr[1];
		(*dptr++) = vis_fxor(vis_faligndata(s0, s1), mask8000);
		sptr++;
	}

	if (j <= (size - 8)) {
		s0 = s1;
		s1 = vis_ld_d64_nf(sptr + 1);
		(*dptr++) = vis_fxor(vis_faligndata(s0, s1), mask8000);
		sptr++;
		j += 8;
	}

	if (j < size) {
		s0 = vis_fxor(vis_faligndata(s1, vis_ld_d64_nf(sptr + 1)),
		    mask8000);
		emask = vis_edge8(dptr, dend);
		vis_pst_8(s0, dptr, emask);
	}
}
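/*
 * Illustrative scalar sketch (not part of mediaLib): assuming the line holds
 * big-endian 16-bit samples starting at an even byte offset, the VIS routine
 * above is equivalent to XOR-ing every sample with 0x8000, i.e. toggling the
 * sign bit of each 16-bit value. Hypothetical helper for documentation only.
 */
static void
mlib_ImageLineXor8000_ref(
    const mlib_u8 *src,
    mlib_u8 *dst,
    mlib_s32 size)
{
	mlib_s32 i;

	for (i = 0; i < size; i++) {
/* only the high byte of each big-endian 16-bit sample is flipped */
		dst[i] = (i & 1) ? src[i] : (mlib_u8)(src[i] ^ 0x80);
	}
}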
void
mlib_v_ImageNot_na(
    mlib_u8 *sa,
    mlib_u8 *da,
    mlib_s32 size)
{
/* end points in dst */
	mlib_u8 *dend;

/* 8-byte aligned start points in dst */
	mlib_d64 *dp;

/* 8-byte aligned start point in src */
	mlib_d64 *sp;

/* 8-byte source data */
	mlib_d64 s0, s1;

/* offset of address in dst */
	mlib_s32 j;

/* edge mask */
	mlib_s32 emask;

/* prepare the destination addresses */
	dp = (mlib_d64 *)((mlib_addr)da & (~7));
	j = (mlib_addr)dp - (mlib_addr)da;
	dend = da + size - 1;

/* prepare the source address */
	sp = (mlib_d64 *)vis_alignaddr(sa, j);

/* generate edge mask for the start point */
	emask = vis_edge8(da, dend);

	s1 = vis_ld_d64_nf(sp);

	if (emask != 0xff) {
		s0 = s1;
		s1 = vis_ld_d64_nf(sp + 1);
		s0 = vis_faligndata(s0, s1);
		vis_pst_8(vis_fnot(s0), dp++, emask);
		sp++;
		j += 8;
	}

#pragma pipeloop(0)
	for (; j <= (size - 8); j += 8) {
		s0 = s1;
		s1 = vis_ld_d64_nf(sp + 1);
		(*dp++) = vis_fnot(vis_faligndata(s0, s1));
		sp++;
	}

	if (j < size) {
		s0 = vis_faligndata(s1, vis_ld_d64_nf(sp + 1));
		emask = vis_edge8(dp, dend);
		vis_pst_8(vis_fnot(s0), dp, emask);
	}
}
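/*
 * Scalar reference for clarity: the routine above computes the bitwise
 * complement of every byte; the VIS version only adds the unaligned
 * start/end handling via edge masks. Hypothetical helper, not mediaLib API.
 */
static void
mlib_v_ImageNot_ref(
    const mlib_u8 *sa,
    mlib_u8 *da,
    mlib_s32 size)
{
	mlib_s32 i;

	for (i = 0; i < size; i++)
		da[i] = (mlib_u8)~sa[i];
}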
mlib_status
__mlib_VideoUpSample420_Nearest(
    mlib_u8 *dst0,
    mlib_u8 *dst1,
    const mlib_u8 *src,
    mlib_s32 n)
{
	mlib_d64 *sp = (mlib_d64 *)src;
	mlib_d64 *dp0 = (mlib_d64 *)dst0;
	mlib_d64 *dp1 = (mlib_d64 *)dst1;
	mlib_u8 *dend0 = dst0 + 2 * n - 1;
	mlib_d64 sa, da;
	mlib_s32 emask, i;

	if (n <= 0)
		return (MLIB_FAILURE);

#pragma pipeloop(0)
	for (i = 0; i <= (n - 8); i += 8) {
		sa = *sp;
		*dp0 = *dp1 = vis_fpmerge(vis_read_hi(sa), vis_read_hi(sa));
		*(dp0 + 1) = *(dp1 + 1) =
		    vis_fpmerge(vis_read_lo(sa), vis_read_lo(sa));
		sp++;
		dp0 += 2;
		dp1 += 2;
	}

	if (i < n) {
		sa = vis_ld_d64_nf(sp);
		da = vis_fpmerge(vis_read_hi(sa), vis_read_hi(sa));
		emask = vis_edge8(dp0, dend0);
		vis_pst_8(da, dp0, emask);
		vis_pst_8(da, dp1, emask);
		i += 4;
		dp0++;
		dp1++;

		if (i < n) {
			da = vis_fpmerge(vis_read_lo(sa), vis_read_lo(sa));
			emask = vis_edge8(dp0, dend0);
			vis_pst_8(da, dp0, emask);
			vis_pst_8(da, dp1, emask);
		}
	}

	return (MLIB_SUCCESS);
}
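/*
 * Scalar sketch of the nearest-neighbour 4:2:0 upsampling done above: each
 * source byte is duplicated horizontally and written to both output rows.
 * Hypothetical helper, shown only to document the data movement.
 */
static void
mlib_VideoUpSample420_Nearest_ref(
    mlib_u8 *dst0,
    mlib_u8 *dst1,
    const mlib_u8 *src,
    mlib_s32 n)
{
	mlib_s32 i;

	for (i = 0; i < n; i++) {
		dst0[2 * i] = dst0[2 * i + 1] = src[i];
		dst1[2 * i] = dst1[2 * i + 1] = src[i];
	}
}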
mlib_status
__mlib_VideoColorSplit3(
    mlib_u8 *color1,
    mlib_u8 *color2,
    mlib_u8 *color3,
    const mlib_u8 *colors,
    mlib_s32 n)
{
	mlib_d64 *sp = (mlib_d64 *)colors;
	mlib_d64 *dp0 = (mlib_d64 *)color1;
	mlib_d64 *dp1 = (mlib_d64 *)color2;
	mlib_d64 *dp2 = (mlib_d64 *)color3;
	mlib_d64 sd0, sd1, sd2, dd0, dd1, dd2;
	mlib_d64 sda, sdb, sdc, sdd, sde;
	mlib_s32 i;

/*
 * 8-pixels loop
 */
#pragma pipeloop(0)
	for (i = 0; i < (n / 8); i++) {
		sd0 = (*sp++);
		sd1 = (*sp++);
		sd2 = (*sp++);
		MLIB_SPLIT3_U8(sd0, sd1, sd2, dd0, dd1, dd2);
		(*dp0++) = dd0;
		(*dp1++) = dd1;
		(*dp2++) = dd2;
	}

/*
 * last 8 pixels
 */
	if (n & 7) {
		mlib_s32 emask = (0xFF00 >> (n & 7)) & 0xFF;

		sd0 = (*sp++);
		sd1 = vis_ld_d64_nf(sp);
		sp++;
		sd2 = vis_ld_d64_nf(sp);
		MLIB_SPLIT3_U8(sd0, sd1, sd2, dd0, dd1, dd2);
		vis_pst_8(dd0, dp0, emask);
		vis_pst_8(dd1, dp1, emask);
		vis_pst_8(dd2, dp2, emask);
	}

	return (MLIB_SUCCESS);
}
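/*
 * Scalar sketch (hypothetical helper): __mlib_VideoColorSplit3 de-interleaves
 * a packed 3-channel buffer into three planes; the MLIB_SPLIT3_U8 macro above
 * performs the same byte shuffle eight pixels at a time.
 */
static void
mlib_VideoColorSplit3_ref(
    mlib_u8 *color1,
    mlib_u8 *color2,
    mlib_u8 *color3,
    const mlib_u8 *colors,
    mlib_s32 n)
{
	mlib_s32 i;

	for (i = 0; i < n; i++) {
		color1[i] = colors[3 * i];
		color2[i] = colors[3 * i + 1];
		color3[i] = colors[3 * i + 2];
	}
}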
mlib_status __mlib_VideoColorSplit4( mlib_u8 *color1, mlib_u8 *color2, mlib_u8 *color3, mlib_u8 *color4, const mlib_u8 *colors, mlib_s32 n) { mlib_d64 *sp = (mlib_d64 *)colors; mlib_d64 *dp0 = (mlib_d64 *)color1; mlib_d64 *dp1 = (mlib_d64 *)color2; mlib_d64 *dp2 = (mlib_d64 *)color3; mlib_d64 *dp3 = (mlib_d64 *)color4; mlib_d64 sd01, sd23, sd45, sd67, dd0, dd1, dd2, dd3; mlib_d64 sd04, sd26, sd15, sd37, dh0, dh1, dl0, dl1; mlib_s32 i; MLIB_LOAD_PREP_U8(sp); /* * 8-pixels loop */ #pragma pipeloop(0) for (i = 0; i < (n / 8); i++) { MLIB_LOAD_SPLIT4_U8(sp, dd0, dd1, dd2, dd3); (*dp0++) = dd0; (*dp1++) = dd1; (*dp2++) = dd2; (*dp3++) = dd3; } /* * last 8 pixels */ if (n & 7) { mlib_s32 emask = (0xFF00 >> (n & 7)) & 0xFF; MLIB_LOAD_SPLIT4_U8(sp, dd0, dd1, dd2, dd3); vis_pst_8(dd0, dp0, emask); vis_pst_8(dd1, dp1, emask); vis_pst_8(dd2, dp2, emask); vis_pst_8(dd3, dp3, emask); }
mlib_status __mlib_SignalConvertShift_U8_F32_Sat( mlib_u8 *dst, const mlib_f32 *src, mlib_s32 shift, mlib_s32 xsize) { mlib_s32 i, off; mlib_d64 *sp, *dp; mlib_d64 dd, dd_old; type_union_mlib_d64 sd0, sd1, sd2, sd3; mlib_f32 fl_c; if (xsize <= 0) return (MLIB_FAILURE); if (!src || !dst) return (MLIB_NULLPOINTER); PREPARE_CONST(fl_c, shift + 8); if ((mlib_addr)src & 7) { mlib_f32 x = (*src++) * fl_c; if (x >= MLIB_U8_MAX) x = MLIB_U8_MAX; if (x <= MLIB_U8_MIN) x = MLIB_U8_MIN; (*dst++) = x; xsize--; } vis_write_gsr(23 << 3); off = ((mlib_addr)dst & 7); sp = (mlib_d64 *)src; dp = (mlib_d64 *)(dst - off); if (off == 0) { #pragma pipeloop(0) for (i = 0; i <= (xsize - 8); i += 8) { CONVERT_U8_F32(); (*dp++) = dd; } if (i < xsize) { mlib_s32 emask = 0xFF00 >> (xsize - i); CONVERT_U8_F32(); vis_pst_8(dd, dp, emask); } } else {
void ADD_SUFF(IntArgbBmToIntArgbConvert)(BLIT_PARAMS) { mlib_s32 dstScan = pDstInfo->scanStride; mlib_s32 srcScan = pSrcInfo->scanStride; mlib_d64 dd, dmask, dFF; mlib_s32 i, i0, j, x, mask; if (dstScan == 4*width && srcScan == 4*width) { width *= height; height = 1; } dmask = vis_to_double_dup(0xFFFFFF); dFF = vis_to_double_dup(0xFFFFFFFF); for (j = 0; j < height; j++) { mlib_s32 *src = srcBase; mlib_s32 *dst = dstBase; i = i0 = 0; if ((mlib_s32)dst & 7) { x = src[i]; dst[i] = (x << 7) >> 7; i0 = 1; } #pragma pipeloop(0) for (i = i0; i <= (mlib_s32)width - 2; i += 2) { mlib_u8 *pp0 = (mlib_u8*)(src + i); mlib_u8 *pp1 = (mlib_u8*)(src + i + 1); dd = vis_freg_pair(*(mlib_f32*)pp0, *(mlib_f32*)pp1); dd = vis_fand(dd, dmask); #if 1 mask = ((*pp0 & 1) << 7) | ((*pp1 & 1) << 3); *(mlib_d64*)(dst + i) = dd; vis_pst_8(dFF, dst + i, mask); #else mask = ((*pp0 & 1) << 1) | (*pp1 & 1); dd = vis_for(dd, ((mlib_d64*)vis_amask_arr)[mask]); *(mlib_d64*)(dst + i) = dd; #endif } if (i < width) { x = src[i]; dst[i] = (x << 7) >> 7; }
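/*
 * Per-pixel sketch, assuming the IntArgbBm layout keeps a 1-bit alpha in
 * bit 24: the arithmetic shift pair used above ((x << 7) >> 7) replicates
 * that bit across bits 24..31, yielding 0x00 or 0xFF alpha while leaving
 * the RGB bytes untouched. Hypothetical helper for illustration only.
 */
static mlib_s32
IntArgbBmPixelToIntArgb_ref(mlib_s32 x)
{
	return ((x << 7) >> 7);
}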
mlib_status mlib_v_conv3x3_8nw_4( mlib_image *dst, const mlib_image *src, const mlib_s32 *kernel, mlib_s32 scalef_expon, mlib_s32 cmask) { /* pointers to dst row */ mlib_u8 *da, *d_a; /* pointers to src, dst data */ mlib_u8 *adr_dst, *adr_src, *dend; /* pointers to src rows */ mlib_u8 *sa, *sa1, *sa2; /* pointers to rows in interm. src buf */ mlib_d64 *buff_src, *sbuf1, *sbuf2, *prow; /* pointers to rows in interm. src buf */ mlib_d64 *sbuf3; /* pointer to row in interm. dst buf */ mlib_d64 *dbuf; /* mlib_d64 pointers to rows in interm. src buf */ mlib_d64 *s1, *s2, *s3; /* mlib_d64 pointer to row in interm. dst buf */ mlib_d64 *ddst; /* data */ mlib_d64 d1, d2, d_1, d_2, d21, d22; /* data */ mlib_d64 d3, d_3, d23; mlib_f32 k1k2, k3k4, k5k6, k7k8, k9k9; /* src, dst and interm. buf. strides */ mlib_s32 dlb, slb, buf_slb; mlib_s32 dh, dw; mlib_d64 out0, out1; mlib_d64 tmp0, tmp1, rnd; mlib_d64 *dsa, *dp; mlib_d64 sd0, sd1, sd00; mlib_s32 emask, cmask1; mlib_s32 rval, gsr_scale, i, j; gsr_scale = 31 - scalef_expon; vis_write_gsr((gsr_scale << 3)); rval = mlib_round_8[gsr_scale]; rnd = vis_freg_pair(vis_to_float(rval), vis_to_float(rval)); cmask = ((cmask & 0xf) << 4) + (cmask & 0xf); cmask = (cmask << 8) + (cmask); GET_SRC_DST_PARAMETERS(); LOAD_KERNEL_INTO_FLOAT(); buf_slb = (4 * dw + 24) >> 3; PREPARE_INTERM_BUFFERS(); dw -= 2; dw *= 4; dh -= 2; sa = adr_src; sa1 = sa + slb; sa2 = sa1 + slb; d_a = adr_dst + dlb + 4; /* load interm. src buff */ PREPARE_TO_LOAD_LINE(sbuf2, sa); #pragma pipeloop(0) LOAD_LINE_INTO_BUFFER(8); /* load interm. src buff */ PREPARE_TO_LOAD_LINE(sbuf3, sa1); #pragma pipeloop(0) LOAD_LINE_INTO_BUFFER(8); #pragma pipeloop(0) for (j = 0; j < dh; j++) { LOOP_INI(); PREPARE_TO_LOAD_LINE(sbuf3, sa2); #pragma pipeloop(0) LOAD_LINE_INTO_BUFFER(8); vis_alignaddr(s1, 4); d1 = *s1; d2 = *s2; d3 = *s3; #pragma pipeloop(0) for (i = 0; i < dw; i += 8) { d_1 = *(s1 + 1); d_2 = *(s2 + 1); d_3 = *(s3 + 1); out0 = out1 = rnd; CONV_AU(d1, k1k2); CONV_AL(d2, k3k4); CONV_AU(d3, k7k8); d21 = vis_faligndata(d1, d_1); d22 = vis_faligndata(d2, d_2); d23 = vis_faligndata(d3, d_3); CONV_AL(d21, k1k2); CONV_AU(d22, k5k6); CONV_AL(d23, k7k8); CONV_AU(d_1, k3k4); CONV_AL(d_2, k5k6); CONV_AU(d_3, k9k9); (*ddst++) = vis_fpack16_pair(out0, out1); d1 = d_1; d2 = d_2; d3 = d_3; s1++; s2++; s3++; } ddst = dbuf; /* prepare the destination addresses */ dp = (mlib_d64 *)((mlib_addr)da & (~7)); i = (mlib_addr)dp - (mlib_addr)da; cmask1 = cmask >> (-i); ddst = vis_alignaddr(ddst, i); /* generate edge mask for the start point */ emask = vis_edge8(da, dend); sd1 = ddst[0]; if (emask != 0xff) { sd0 = sd1; sd1 = ddst[1]; sd0 = vis_faligndata(sd0, sd1); vis_pst_8(sd0, dp++, emask & cmask1); ddst++; i += 8; } #pragma pipeloop(0) for (; i <= (dw - 8); i += 8) { sd0 = sd1; sd1 = ddst[1]; sd00 = vis_faligndata(sd0, sd1); vis_pst_8(sd00, dp++, cmask1); ddst++; } if (i < dw) { sd0 = vis_faligndata(sd1, ddst[1]); emask = vis_edge8(dp, dend); vis_pst_8(sd0, dp, emask & cmask1); } sa2 = sa2 + slb; d_a += dlb; } __mlib_free(buff_src); return (MLIB_SUCCESS); }
void mlib_v_ImageLookUp_S16_U8_124_D1( const mlib_s16 *src, mlib_u8 *dst, mlib_s32 xsize, const mlib_u8 *table0, const mlib_u8 *table1, const mlib_u8 *table2, const mlib_u8 *table3) { /* pointer to source data */ mlib_s16 *sp; /* source data */ mlib_s32 s0, s1, s2, s3; /* source data */ mlib_s32 s4, s5, s6, s7; /* pointer to start of destination */ mlib_u8 *dl; /* pointer to end of destination */ mlib_u8 *dend; /* aligned pointer to destination */ mlib_d64 *dp; /* destination data */ mlib_d64 t0, t1, t2; /* destination data */ mlib_d64 t3, t4, t5; /* destination data */ mlib_d64 t6, t7, acc0; /* edge mask */ mlib_s32 emask; /* loop variable */ mlib_s32 i, num; /* destination data */ mlib_d64 acc1; dl = dst; dp = (mlib_d64 *)dl; dend = dl + xsize - 1; sp = (void *)src; vis_alignaddr((void *)0, 7); if (xsize >= 8) { s0 = sp[0]; s1 = sp[1]; s2 = sp[2]; s3 = sp[3]; s4 = sp[4]; s5 = sp[5]; s6 = sp[6]; s7 = sp[7]; sp += 8; vis_write_bmask(0x012389ab, 0); #pragma pipeloop(0) for (i = 0; i <= xsize - 16; i += 8, sp += 8) { t7 = VIS_LD_U8_I(table3, s7); t6 = VIS_LD_U8_I(table2, s6); t5 = VIS_LD_U8_I(table1, s5); t4 = VIS_LD_U8_I(table0, s4); t3 = VIS_LD_U8_I(table3, s3); t2 = VIS_LD_U8_I(table2, s2); t1 = VIS_LD_U8_I(table1, s1); t0 = VIS_LD_U8_I(table0, s0); acc1 = vis_faligndata(t7, acc1); acc1 = vis_faligndata(t6, acc1); acc1 = vis_faligndata(t5, acc1); acc1 = vis_faligndata(t4, acc1); acc0 = vis_faligndata(t3, acc0); acc0 = vis_faligndata(t2, acc0); acc0 = vis_faligndata(t1, acc0); acc0 = vis_faligndata(t0, acc0); s0 = sp[0]; s1 = sp[1]; s2 = sp[2]; s3 = sp[3]; s4 = sp[4]; s5 = sp[5]; s6 = sp[6]; s7 = sp[7]; (*dp++) = vis_bshuffle(acc0, acc1); } t7 = VIS_LD_U8_I(table3, s7); t6 = VIS_LD_U8_I(table2, s6); t5 = VIS_LD_U8_I(table1, s5); t4 = VIS_LD_U8_I(table0, s4); t3 = VIS_LD_U8_I(table3, s3); t2 = VIS_LD_U8_I(table2, s2); t1 = VIS_LD_U8_I(table1, s1); t0 = VIS_LD_U8_I(table0, s0); acc1 = vis_faligndata(t7, acc1); acc1 = vis_faligndata(t6, acc1); acc1 = vis_faligndata(t5, acc1); acc1 = vis_faligndata(t4, acc1); acc0 = vis_faligndata(t3, acc0); acc0 = vis_faligndata(t2, acc0); acc0 = vis_faligndata(t1, acc0); acc0 = vis_faligndata(t0, acc0); (*dp++) = vis_bshuffle(acc0, acc1); } if ((mlib_addr)dp <= (mlib_addr)dend) { num = (mlib_addr)dend - (mlib_addr)dp; sp += num; num++; if ((num & 3) == 1) { s0 = (mlib_s32)*sp; sp--; t0 = VIS_LD_U8_I(table0, s0); acc0 = vis_faligndata(t0, acc0); num--; } else if ((num & 3) == 2) { s0 = (mlib_s32)*sp; sp--; t0 = VIS_LD_U8_I(table1, s0); acc0 = vis_faligndata(t0, acc0); s0 = (mlib_s32)*sp; sp--; t0 = VIS_LD_U8_I(table0, s0); acc0 = vis_faligndata(t0, acc0); num -= 2; } else if ((num & 3) == 3) { s0 = (mlib_s32)*sp; sp--; t0 = VIS_LD_U8_I(table2, s0); acc0 = vis_faligndata(t0, acc0); s0 = (mlib_s32)*sp; sp--; t0 = VIS_LD_U8_I(table1, s0); acc0 = vis_faligndata(t0, acc0); s0 = (mlib_s32)*sp; sp--; t0 = VIS_LD_U8_I(table0, s0); acc0 = vis_faligndata(t0, acc0); num -= 3; } if (num != 0) { s0 = (mlib_s32)*sp; sp--; t0 = VIS_LD_U8_I(table3, s0); acc0 = vis_faligndata(t0, acc0); s0 = (mlib_s32)*sp; sp--; t0 = VIS_LD_U8_I(table2, s0); acc0 = vis_faligndata(t0, acc0); s0 = (mlib_s32)*sp; sp--; t0 = VIS_LD_U8_I(table1, s0); acc0 = vis_faligndata(t0, acc0); s0 = (mlib_s32)*sp; sp--; t0 = VIS_LD_U8_I(table0, s0); acc0 = vis_faligndata(t0, acc0); } emask = vis_edge8(dp, dend); vis_pst_8(acc0, dp, emask); } }
mlib_status __mlib_VideoUpSample420( mlib_u8 *dst0, mlib_u8 *dst1, const mlib_u8 *src0, const mlib_u8 *src1, const mlib_u8 *src2, mlib_s32 n) { mlib_u8 *dend0 = dst0 + 2 * n - 1; mlib_d64 *dp0 = (mlib_d64 *)dst0; mlib_d64 *dp1 = (mlib_d64 *)dst1; mlib_d64 *sp0 = (mlib_d64 *)src0; mlib_d64 *sp1 = (mlib_d64 *)src1; mlib_d64 *sp2 = (mlib_d64 *)src2; mlib_d64 d00, d01, d10, d11, d20, d21; mlib_d64 thiscolsum0_hi, thiscolsum0_lo, lastcolsum0_hi, lastcolsum0_lo; mlib_d64 shiftcolsum0_hi, shiftcolsum0_lo; mlib_d64 thiscolsum1_hi, thiscolsum1_lo, lastcolsum1_hi, lastcolsum1_lo; mlib_d64 shiftcolsum1_hi, shiftcolsum1_lo; mlib_d64 acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; mlib_d64 ac0, ac1, ac2, ac3, ac4, ac5, ac6, ac7; mlib_d64 data0, data1, data2, data3, tmp0, tmp1; mlib_f32 fone = vis_to_float(0x4000000); mlib_f32 fthree = vis_to_float(0xC000000); mlib_f32 fone1 = vis_to_float(0x40404040); mlib_f32 fthree1 = vis_to_float(0xC0C0C0C0); mlib_d64 dseven = vis_to_double_dup(0x70007); mlib_d64 deight = vis_to_double_dup(0x80008); mlib_s32 i, emask; if (n <= 0) return (MLIB_FAILURE); vis_write_gsr((3 << 3) + 2); d00 = vis_ld_d64_nf(sp0); d10 = vis_ld_d64_nf(sp1); d20 = vis_ld_d64_nf(sp2); sp0++; sp1++; sp2++; lastcolsum0_hi = vis_fmul8x16au(vis_read_hi(d00), fone); lastcolsum0_lo = vis_fmul8x16au(vis_read_lo(d00), fone); lastcolsum1_hi = vis_fmul8x16au(vis_read_hi(d20), fone); lastcolsum1_lo = vis_fmul8x16au(vis_read_lo(d20), fone); tmp0 = vis_fmul8x16au(vis_read_hi(d10), fthree); tmp1 = vis_fmul8x16au(vis_read_lo(d10), fthree); lastcolsum0_hi = vis_fpadd16(lastcolsum0_hi, tmp0); lastcolsum0_lo = vis_fpadd16(lastcolsum0_lo, tmp1); lastcolsum1_hi = vis_fpadd16(lastcolsum1_hi, tmp0); lastcolsum1_lo = vis_fpadd16(lastcolsum1_lo, tmp1); #pragma pipeloop(0) for (i = 0; i < n - 8; i += 8) { d01 = *sp0; d11 = *sp1; d21 = *sp2; sp0++; sp1++; sp2++; thiscolsum0_hi = vis_fmul8x16au(vis_read_hi(d01), fone); thiscolsum0_lo = vis_fmul8x16au(vis_read_lo(d01), fone); thiscolsum1_hi = vis_fmul8x16au(vis_read_hi(d21), fone); thiscolsum1_lo = vis_fmul8x16au(vis_read_lo(d21), fone); tmp0 = vis_fmul8x16au(vis_read_hi(d11), fthree); tmp1 = vis_fmul8x16au(vis_read_lo(d11), fthree); thiscolsum0_hi = vis_fpadd16(thiscolsum0_hi, tmp0); thiscolsum0_lo = vis_fpadd16(thiscolsum0_lo, tmp1); thiscolsum1_hi = vis_fpadd16(thiscolsum1_hi, tmp0); thiscolsum1_lo = vis_fpadd16(thiscolsum1_lo, tmp1); acc0 = vis_fmul8x16(fone1, lastcolsum0_hi); acc1 = vis_fmul8x16(fone1, lastcolsum0_lo); acc2 = vis_fmul8x16(fthree1, lastcolsum0_hi); acc3 = vis_fmul8x16(fthree1, lastcolsum0_lo); acc4 = vis_fmul8x16(fone1, lastcolsum1_hi); acc5 = vis_fmul8x16(fone1, lastcolsum1_lo); acc6 = vis_fmul8x16(fthree1, lastcolsum1_hi); acc7 = vis_fmul8x16(fthree1, lastcolsum1_lo); shiftcolsum0_hi = vis_faligndata(lastcolsum0_hi, lastcolsum0_lo); shiftcolsum0_lo = vis_faligndata(lastcolsum0_lo, thiscolsum0_hi); shiftcolsum1_hi = vis_faligndata(lastcolsum1_hi, lastcolsum1_lo); shiftcolsum1_lo = vis_faligndata(lastcolsum1_lo, thiscolsum1_hi); acc0 = vis_fpadd16(acc0, deight); acc1 = vis_fpadd16(acc1, deight); acc2 = vis_fpadd16(acc2, dseven); acc3 = vis_fpadd16(acc3, dseven); acc4 = vis_fpadd16(acc4, deight); acc5 = vis_fpadd16(acc5, deight); acc6 = vis_fpadd16(acc6, dseven); acc7 = vis_fpadd16(acc7, dseven); ac0 = vis_fmul8x16(fthree1, shiftcolsum0_hi); ac1 = vis_fmul8x16(fthree1, shiftcolsum0_lo); ac2 = vis_fmul8x16(fone1, shiftcolsum0_hi); ac3 = vis_fmul8x16(fone1, shiftcolsum0_lo); ac4 = vis_fmul8x16(fthree1, shiftcolsum1_hi); ac5 = vis_fmul8x16(fthree1, 
shiftcolsum1_lo); ac6 = vis_fmul8x16(fone1, shiftcolsum1_hi); ac7 = vis_fmul8x16(fone1, shiftcolsum1_lo); acc0 = vis_fpadd16(acc0, ac0); acc1 = vis_fpadd16(acc1, ac1); acc2 = vis_fpadd16(acc2, ac2); acc3 = vis_fpadd16(acc3, ac3); acc4 = vis_fpadd16(acc4, ac4); acc5 = vis_fpadd16(acc5, ac5); acc6 = vis_fpadd16(acc6, ac6); acc7 = vis_fpadd16(acc7, ac7); data0 = vis_fpack16_pair(acc0, acc1); data1 = vis_fpack16_pair(acc2, acc3); data2 = vis_fpack16_pair(acc4, acc5); data3 = vis_fpack16_pair(acc6, acc7); dp0[0] = vis_fpmerge(vis_read_hi(data1), vis_read_hi(data0)); dp0[1] = vis_fpmerge(vis_read_lo(data1), vis_read_lo(data0)); dp1[0] = vis_fpmerge(vis_read_hi(data3), vis_read_hi(data2)); dp1[1] = vis_fpmerge(vis_read_lo(data3), vis_read_lo(data2)); dp0 += 2; dp1 += 2; lastcolsum0_hi = thiscolsum0_hi; lastcolsum0_lo = thiscolsum0_lo; lastcolsum1_hi = thiscolsum1_hi; lastcolsum1_lo = thiscolsum1_lo; } if (i < n) { acc0 = vis_fmul8x16(fone1, lastcolsum0_hi); acc1 = vis_fmul8x16(fone1, lastcolsum0_lo); acc2 = vis_fmul8x16(fthree1, lastcolsum0_hi); acc3 = vis_fmul8x16(fthree1, lastcolsum0_lo); acc4 = vis_fmul8x16(fone1, lastcolsum1_hi); acc5 = vis_fmul8x16(fone1, lastcolsum1_lo); acc6 = vis_fmul8x16(fthree1, lastcolsum1_hi); acc7 = vis_fmul8x16(fthree1, lastcolsum1_lo); shiftcolsum0_hi = vis_faligndata(lastcolsum0_hi, lastcolsum0_lo); shiftcolsum0_lo = vis_faligndata(lastcolsum0_lo, lastcolsum0_lo); shiftcolsum1_hi = vis_faligndata(lastcolsum1_hi, lastcolsum1_lo); shiftcolsum1_lo = vis_faligndata(lastcolsum1_lo, lastcolsum1_lo); acc0 = vis_fpadd16(acc0, deight); acc1 = vis_fpadd16(acc1, deight); acc2 = vis_fpadd16(acc2, dseven); acc3 = vis_fpadd16(acc3, dseven); acc4 = vis_fpadd16(acc4, deight); acc5 = vis_fpadd16(acc5, deight); acc6 = vis_fpadd16(acc6, dseven); acc7 = vis_fpadd16(acc7, dseven); ac0 = vis_fmul8x16(fthree1, shiftcolsum0_hi); ac1 = vis_fmul8x16(fthree1, shiftcolsum0_lo); ac2 = vis_fmul8x16(fone1, shiftcolsum0_hi); ac3 = vis_fmul8x16(fone1, shiftcolsum0_lo); ac4 = vis_fmul8x16(fthree1, shiftcolsum1_hi); ac5 = vis_fmul8x16(fthree1, shiftcolsum1_lo); ac6 = vis_fmul8x16(fone1, shiftcolsum1_hi); ac7 = vis_fmul8x16(fone1, shiftcolsum1_lo); acc0 = vis_fpadd16(acc0, ac0); acc1 = vis_fpadd16(acc1, ac1); acc2 = vis_fpadd16(acc2, ac2); acc3 = vis_fpadd16(acc3, ac3); acc4 = vis_fpadd16(acc4, ac4); acc5 = vis_fpadd16(acc5, ac5); acc6 = vis_fpadd16(acc6, ac6); acc7 = vis_fpadd16(acc7, ac7); data0 = vis_fpack16_pair(acc0, acc1); data1 = vis_fpack16_pair(acc2, acc3); data2 = vis_fpack16_pair(acc4, acc5); data3 = vis_fpack16_pair(acc6, acc7); acc0 = vis_fpmerge(vis_read_hi(data1), vis_read_hi(data0)); acc1 = vis_fpmerge(vis_read_hi(data3), vis_read_hi(data2)); emask = vis_edge8(dp0, dend0); vis_pst_8(acc0, dp0, emask); vis_pst_8(acc1, dp1, emask); i += 4; dp0++; dp1++; if (i < n) { acc0 = vis_fpmerge(vis_read_lo(data1), vis_read_lo(data0)); acc1 = vis_fpmerge(vis_read_lo(data3), vis_read_lo(data2)); emask = vis_edge8(dp0, dend0); vis_pst_8(acc0, dp0, emask); vis_pst_8(acc1, dp1, emask); } } vis_write_gsr(7); dp0 = (mlib_d64 *)dst0; dp1 = (mlib_d64 *)dst1; ac0 = *dp0; ac2 = *dp1; #pragma pipeloop(0) for (i = 0; i < 2 * n - 8; i += 8) { ac1 = *dp0; ac3 = *dp1; *dp0 = vis_faligndata(ac0, ac1); *dp1 = vis_faligndata(ac2, ac3); dp0++; dp1++; ac0 = ac1; ac2 = ac3; } if (i < 2 * n) { ac1 = vis_ld_d64_nf(dp0); ac3 = vis_ld_d64_nf(dp1); emask = vis_edge8(dp0, dend0); acc0 = vis_faligndata(ac0, ac1); acc1 = vis_faligndata(ac2, ac3); vis_pst_8(acc0, dp0, emask); vis_pst_8(acc1, dp1, emask); } dst0[0] = (4 * (3 * 
src1[0] + src0[0]) + 8) >> 4; dst1[0] = (4 * (3 * src1[0] + src2[0]) + 8) >> 4; dst0[2 * n - 1] = (4 * (3 * src1[n - 1] + src0[n - 1]) + 7) >> 4; dst1[2 * n - 1] = (4 * (3 * src1[n - 1] + src2[n - 1]) + 7) >> 4; return (MLIB_SUCCESS); }
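/*
 * Scalar sketch of the filtering implemented above (assumption: this is the
 * usual JPEG triangle-filter 4:2:0 upsampling with a 3:1 vertical mix and a
 * 3:1 horizontal mix; the +8/+7 biases and the boundary formulas in the code
 * match that reading). Hypothetical reference, not mediaLib API.
 */
static void
mlib_VideoUpSample420_ref(
    mlib_u8 *dst0,
    mlib_u8 *dst1,
    const mlib_u8 *src0,
    const mlib_u8 *src1,
    const mlib_u8 *src2,
    mlib_s32 n)
{
	mlib_s32 row, i;

	for (row = 0; row < 2; row++) {
		const mlib_u8 *adj = (row == 0) ? src0 : src2;
		mlib_u8 *dst = (row == 0) ? dst0 : dst1;

		for (i = 0; i < n; i++) {
/* vertical 3:1 mix of the centre row with the adjacent row */
			mlib_s32 c = 3 * src1[i] + adj[i];
			mlib_s32 cl = (i > 0) ?
			    3 * src1[i - 1] + adj[i - 1] : c;
			mlib_s32 cr = (i < n - 1) ?
			    3 * src1[i + 1] + adj[i + 1] : c;

/* horizontal 3:1 mix with alternating rounding bias */
			dst[2 * i] = (mlib_u8)((3 * c + cl + 8) >> 4);
			dst[2 * i + 1] = (mlib_u8)((3 * c + cr + 7) >> 4);
		}
	}
}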
static void mlib_VectorDotProd_U8C_al_x( mlib_d64 *z, const void *x, const void *y, mlib_s32 n) /* The case of even address of vector x */ { mlib_u8 *pxend, *px = (mlib_u8 *)x, *py = (mlib_u8 *)y; mlib_d64 sum_r = 0.0, sum_i = 0.0; mlib_d64 *dpx, *dpy, *dpxend; mlib_d64 dx, dy, dy0, dy1; mlib_d64 dx_r, dy_r, dy_i; mlib_d64 d_iih, d_iil, d_irh, d_irl, d_rih, d_ril, d_rrh, d_rrl; mlib_d64 d_ih, d_il, d_rh, d_rl; mlib_d64 ds_r, ds_i, ds1_r, ds1_i; mlib_d64 lb_mask = vis_to_double_dup(0x00FF00FF); mlib_d64 edge[2]; mlib_f32 fsum; mlib_s32 d_left; mlib_s32 emask, off; mlib_d64 done = vis_to_double_dup(0x1000100); edge[0] = edge[1] = 0; dpx = (mlib_d64 *)((mlib_addr)px & (~7)); off = (mlib_addr)dpx - (mlib_addr)px; dpy = vis_alignaddr((void *)py, off); pxend = px + n + n - 1; dpxend = (mlib_d64 *)((mlib_addr)pxend & (~7)); emask = vis_edge8(px, pxend); vis_pst_8(dpx[0], edge, emask); dx = edge[0]; dy = vis_ld_d64_nf(dpy); if (((((mlib_addr)px) ^ ((mlib_addr)py)) & 7) == 0) { while ((mlib_addr)dpx < (mlib_addr)dpxend) { d_left = dpxend - dpx; if (d_left > MAX_LOOP) d_left = MAX_LOOP; ds_i = ds_r = ds1_i = ds1_r = 0.0; for (; d_left > 0; d_left--) { DPROD_U8C; SUM_U8C; dx = dpx[1]; dy = dpy[1]; dpx++; dpy++; } ds_i = vis_fpadd32(ds_i, ds1_i); ds_r = vis_fpadd32(ds_r, ds1_r); fsum = vis_read_hi(ds_r); sum_r += (mlib_d64)*((mlib_s32 *)&fsum); fsum = vis_read_lo(ds_r); sum_r += (mlib_d64)*((mlib_s32 *)&fsum); fsum = vis_read_hi(ds_i); sum_i += (mlib_d64)*((mlib_s32 *)&fsum); fsum = vis_read_lo(ds_i); sum_i += (mlib_d64)*((mlib_s32 *)&fsum); } } else { dy1 = vis_ld_d64_nf(dpy+1); dy = vis_faligndata(dy, dy1); while ((mlib_addr)dpx < (mlib_addr)dpxend) { d_left = dpxend - dpx; if (d_left > MAX_LOOP) d_left = MAX_LOOP; ds_i = ds_r = ds1_i = ds1_r = 0.0; for (; d_left > 0; d_left--) { DPROD_U8C; SUM_U8C; dy0 = dy1; dy1 = vis_ld_d64_nf(dpy+2); dx = vis_ld_d64_nf(dpx+1); dy = vis_faligndata(dy0, dy1); dpx++; dpy++; } ds_i = vis_fpadd32(ds_i, ds1_i); ds_r = vis_fpadd32(ds_r, ds1_r); fsum = vis_read_hi(ds_r); sum_r += (mlib_d64)*((mlib_s32 *)&fsum); fsum = vis_read_lo(ds_r); sum_r += (mlib_d64)*((mlib_s32 *)&fsum); fsum = vis_read_hi(ds_i); sum_i += (mlib_d64)*((mlib_s32 *)&fsum); fsum = vis_read_lo(ds_i); sum_i += (mlib_d64)*((mlib_s32 *)&fsum); } } if ((mlib_addr)dpx <= (mlib_addr)pxend) { emask = vis_edge8(dpx, pxend); vis_pst_8(dx, edge + 1, emask); dx = edge[1]; DPROD_U8C; SUM_U8C_TAIL; fsum = vis_read_hi(ds_r); sum_r += (mlib_d64)*((mlib_s32 *)&fsum); fsum = vis_read_lo(ds_r); sum_r += (mlib_d64)*((mlib_s32 *)&fsum); fsum = vis_read_hi(ds_i); sum_i += (mlib_d64)*((mlib_s32 *)&fsum); fsum = vis_read_lo(ds_i); sum_i += (mlib_d64)*((mlib_s32 *)&fsum); } z[0] = sum_r; z[1] = sum_i; #undef MAX_LOOP }
void mlib_v_ImageColorRGB2Mono_U8_D1( const mlib_u8 *src, mlib_u8 *dst, mlib_s32 dsize, const mlib_d64 *weight) { mlib_u8 *dst_end; mlib_d64 dd, d0, d1, d2, d3; mlib_d64 rgdd0, bdd0, rgdd1, bdd1, ddt; mlib_d64 *src_all, *dp; mlib_f32 d32, e32, alpha, gamma, beta; mlib_d64 sd0, sd1, sd2; mlib_s32 i, emask; mlib_s32 off; mlib_s32 mask0 = 0x0369147a; mlib_s32 mask1 = 0x258b258b; mlib_s32 mask2 = 0x47ad58be; mlib_s32 mask3 = 0x69cf69cf; /* prepare the weight */ alpha = vis_to_float(weight[0] * 8192); beta = vis_to_float(weight[1] * 8192); gamma = vis_to_float(weight[2] * 8192); vis_write_gsr(2 << 3); dp = (mlib_d64 *)((mlib_addr)dst & (~7)); off = (mlib_addr)dp - (mlib_addr)dst; dst_end = dst + (dsize - 1); emask = vis_edge8(dst, dst_end); src_all = vis_alignaddr((void *)src, (3 * off)); d0 = (*src_all++); d1 = (*src_all++); d2 = (*src_all++); d3 = (*src_all++); sd0 = vis_faligndata(d0, d1); sd1 = vis_faligndata(d1, d2); sd2 = vis_faligndata(d2, d3); CHANNELSEPARATE_U8(sd0, sd1, sd2, rgdd0, bdd0, rgdd1, bdd1); CHANNELWEIGHT_U8(rgdd0, bdd0, rgdd1, bdd1, dd); vis_pst_8(dd, dp, emask); dp++; #pragma pipeloop(0) for (i = 8 + off; i <= (dsize - 8); i += 8) { d0 = d3; d1 = (*src_all++); d2 = (*src_all++); d3 = (*src_all++); sd0 = vis_faligndata(d0, d1); sd1 = vis_faligndata(d1, d2); sd2 = vis_faligndata(d2, d3); CHANNELSEPARATE_U8(sd0, sd1, sd2, rgdd0, bdd0, rgdd1, bdd1); CHANNELWEIGHT_U8(rgdd0, bdd0, rgdd1, bdd1, dd); (*dp++) = dd; } if ((mlib_addr)dp <= (mlib_addr)dst_end) { emask = vis_edge8(dp, dst_end); d0 = d3; d1 = (*src_all++); d2 = (*src_all++); d3 = (*src_all++); sd0 = vis_faligndata(d0, d1); sd1 = vis_faligndata(d1, d2); sd2 = vis_faligndata(d2, d3); CHANNELSEPARATE_U8(sd0, sd1, sd2, rgdd0, bdd0, rgdd1, bdd1); CHANNELWEIGHT_U8(rgdd0, bdd0, rgdd1, bdd1, dd); vis_pst_8(dd, dp, emask); } }
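/*
 * Scalar sketch of the weighting performed above (hypothetical helper).
 * The operation is dst = w0*R + w1*G + w2*B; the VIS path carries the
 * weights as 8192-scaled fixed point and packs with saturating
 * vis_fpack16, so the exact rounding below is illustrative only.
 */
static void
mlib_ImageColorRGB2Mono_U8_ref(
    const mlib_u8 *src,
    mlib_u8 *dst,
    mlib_s32 dsize,
    const mlib_d64 *weight)
{
	mlib_s32 i;

	for (i = 0; i < dsize; i++) {
		mlib_d64 v = weight[0] * src[3 * i] +
		    weight[1] * src[3 * i + 1] + weight[2] * src[3 * i + 2];

		dst[i] = (v <= 0.0) ? 0 : (v >= 255.0) ? 255 : (mlib_u8)v;
	}
}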
mlib_status __mlib_VectorSubS_U8_U8_Mod( mlib_u8 *z, const mlib_u8 *x, const mlib_u8 *c, mlib_s32 n) { /* edge masks */ mlib_s32 emask; /* offset of address alignment in destination */ mlib_s32 off; mlib_s8 *pzend; mlib_d64 *dpx, *dpz, *dpzend; mlib_d64 dx, dx0, dx1, dr0, dr1, dr; mlib_u8 uc = *((mlib_s8 *)c); mlib_d64 uncontrol_mask = vis_to_double_dup(0xff00ff00); /* prepare the scaling factors */ mlib_d64 dcl = vis_to_double_dup(uc | (uc << 16)); mlib_d64 dch = vis_to_double_dup((uc << 8) | (uc << 24)); mlib_s32 scal = uc << 24 | uc << 16 | uc << 8 | uc; mlib_s32 sr1, sr2, sr3, sr3_; mlib_s32 mask = 0x7f7f7f7f; mlib_s32 x8, x12; mlib_s32 nrest, i; if (n <= 0) return (MLIB_FAILURE); pzend = (mlib_s8 *)z + n - 1; dpzend = (mlib_d64 *)((mlib_addr)pzend & (~7)); /* * check for 64-bit aligned special case */ if ((((mlib_addr)x | (mlib_addr)z) & 7) == 0) { /* * We can process source and destination vectors by 16 bytes. */ dpx = (mlib_d64 *)x; dpz = (mlib_d64 *)z; #pragma pipeloop(0) for (i = 0; i < n >> 4; i++) { mlib_u64 ld0; dx = dpx[0]; SUBS_S8_MOD; (*dpz++) = dr; ld0 = *((mlib_u64 *)dpx + 1); x8 = ld0 >> 32; sr1 = x8 ^ ~scal; sr2 = (scal | ~mask) - (x8 & mask); sr3 = (sr1 & ~mask) ^ sr2; x12 = ld0 & 0xFFFFFFFF; sr1 = x12 ^ ~scal; sr2 = (scal | ~mask) - (x12 & mask); sr3_ = (sr1 & ~mask) ^ sr2; (*dpz++) = vis_to_double(sr3, sr3_); dpx += 2; } nrest = n & 0xf; if (nrest >= 8) { dx = (*dpx++); SUBS_S8_MOD; (*dpz++) = dr; nrest -= 8; } if (nrest > 0) { dx = (*dpx++); SUBS_S8_MOD; emask = vis_edge8(dpz, pzend); vis_pst_8(dr, dpz, emask); } } else {
mlib_status __mlib_VectorSub_S8_S8_Sat( mlib_s8 *z, const mlib_s8 *x, const mlib_s8 *y, mlib_s32 n) { mlib_d64 *dpz, *dpx, *dpy; mlib_d64 dx, dy, dz, dx0, dx1, dy0, dy1; mlib_d64 dxh, dxl, dyh, dyl, dzh, dzl; mlib_d64 dh, dl; mlib_s8 *pz = z, *px, *py, *pzend; /* offset of address alignment in destination */ mlib_s32 off; mlib_s32 len = n, i; /* rest and leng in terms of 8 bytes. */ mlib_s32 rest_8, even_8; /* edge masks */ mlib_s32 emask; mlib_d64 displacement = vis_to_double_dup(0x8000800); mlib_d64 restore = vis_to_double_dup(0x80808080); mlib_f32 fmul = vis_to_float(0x1000); if (n <= 0) return (MLIB_FAILURE); px = (mlib_s8 *)x; py = (mlib_s8 *)y; /* initialize GSR scale factor */ vis_write_gsr(3 << 3); dpz = (mlib_d64 *)((mlib_addr)z & (~7)); off = (mlib_addr)dpz - (mlib_addr)z; pzend = pz + n - 1; /* * generate edge mask for the start point */ emask = vis_edge8(pz, pzend); /* * prepare the source address */ if (off) { dpy = (mlib_d64 *)vis_alignaddr(py, off); dy0 = vis_ld_d64_nf(dpy); dy1 = vis_ld_d64_nf(dpy + 1); dy = vis_faligndata(dy0, dy1); dpx = (mlib_d64 *)vis_alignaddr(px, off); dx0 = vis_ld_d64_nf(dpx); dx1 = vis_ld_d64_nf(dpx + 1); dx = vis_faligndata(dx0, dx1); SUB_S8_SAT; /* * store first bytes of result */ vis_pst_8(dz, dpz, emask); px += (8 + off); py += (8 + off); len -= (8 + off); dpz++; if (len <= 0) return (MLIB_SUCCESS); } even_8 = len >> 3; rest_8 = len & 0x7; /* * Now try to analyze source "x" and "y" addresses. */ if ((!((mlib_addr)px & 7)) && (!((mlib_addr)py & 7))) { /* * Both addresses are 8-byte aligned. No vis_alignaddr * and vis_faligndata at all. */ dpx = (mlib_d64 *)px; dpy = (mlib_d64 *)py; dx = vis_ld_d64_nf(dpx); dpx++; dy = vis_ld_d64_nf(dpy); dpy++; #pragma pipeloop(0) for (i = 0; i < even_8; i++) { dx1 = vis_ld_d64_nf(dpx); dy1 = vis_ld_d64_nf(dpy); SUB_S8_SAT; dx = dx1; dy = dy1; /* * store 8 bytes of result */ dpz[0] = dz; dpx++; dpy++; dpz++; } dx1 = dx; dy1 = dy; } else if ((!((mlib_addr)px & 7))) { /* * First ("x") address is 8-byte aligned. vis_alignaddr * and vis_faligndata only for "y". */ dpx = (mlib_d64 *)px; dpy = vis_alignaddr(py, 0); dy0 = vis_ld_d64_nf(dpy); dpy++; dy1 = vis_ld_d64_nf(dpy); dy = vis_faligndata(dy0, dy1); dx = vis_ld_d64_nf(dpx); dpx++; #pragma pipeloop(0) for (i = 0; i < even_8; i++) { SUB_S8_SAT; dx = vis_ld_d64_nf(dpx); dy0 = dy1; dy1 = vis_ld_d64_nf(dpy + 1); dy = vis_faligndata(dy0, dy1); /* * store 8 bytes of result */ (*dpz++) = dz; dpx++; dpy++; } dx1 = dx; dy1 = dy0; } else if ((!((mlib_addr)py & 7))) { /* * Second ("y") address is 8-byte aligned. vis_alignaddr * and vis_faligndata only for "x". */ dpy = (mlib_d64 *)py; dpx = vis_alignaddr(px, 0); dx1 = vis_ld_d64_nf(dpx); dpx++; #pragma pipeloop(0) for (i = 0; i < even_8; i++) { dy = (*dpy++); dx0 = dx1; dx1 = vis_ld_d64_nf(dpx); dpx++; dx = vis_faligndata(dx0, dx1); SUB_S8_SAT; /* * store 8 bytes of result */ (*dpz++) = dz; } dy1 = vis_ld_d64_nf(dpy); dpy++; } else if (((mlib_addr)px & 7) == ((mlib_addr)py & 7)) { /* * Both ("x" and "y") address are identically aligned. * There are 1 vis_alignaddr and 2 vis_faligndata(s) in the loop. 
*/ dpx = vis_alignaddr(px, 0); dx1 = vis_ld_d64_nf(dpx); dpx++; dpy = vis_alignaddr(py, 0); dy1 = vis_ld_d64_nf(dpy); dpy++; #pragma pipeloop(0) for (i = 0; i < even_8; i++) { dy0 = dy1; dy1 = vis_ld_d64_nf(dpy); dpy++; dy = vis_faligndata(dy0, dy1); dx0 = dx1; dx1 = vis_ld_d64_nf(dpx); dpx++; dx = vis_faligndata(dx0, dx1); SUB_S8_SAT; /* * store 8 bytes of result */ (*dpz++) = dz; } } else { /* * Both ("x" and "y") address are arbitrary aligned. * 2 vis_alignaddr(s) and 2 vis_faligndata(s) in the loop. */ dpx = vis_alignaddr(px, 0); dx0 = vis_ld_d64_nf(dpx); dpx++; dx1 = vis_ld_d64_nf(dpx); dx = vis_faligndata(dx0, dx1); dpy = vis_alignaddr(py, 0); dy0 = vis_ld_d64_nf(dpy); dpy++; dy1 = vis_ld_d64_nf(dpy); dy = vis_faligndata(dy0, dy1); /* #pragma pipeloop(0) */ for (i = 0; i < even_8; i++) { SUB_S8_SAT; vis_alignaddr(py, 0); dy0 = dy1; dy1 = vis_ld_d64_nf(dpy + 1); dy = vis_faligndata(dy0, dy1); vis_alignaddr(px, 0); dx0 = dx1; dx1 = vis_ld_d64_nf(dpx + 1); dx = vis_faligndata(dx0, dx1); /* * store 8 bytes of result */ (*dpz++) = dz; dpy++; dpx++; } dx1 = dx0; dy1 = dy0; } if (!rest_8) return (MLIB_SUCCESS); vis_alignaddr(px, 0); dx0 = dx1; dx1 = vis_ld_d64_nf(dpx); dx = vis_faligndata(dx0, dx1); vis_alignaddr(py, 0); dy0 = dy1; dy1 = vis_ld_d64_nf(dpy); dy = vis_faligndata(dy0, dy1); SUB_S8_SAT; /* * prepare edge mask for the last bytes */ emask = vis_edge8((void *)(rest_8), pzend); /* store last bytes of result */ vis_pst_8(dz, dpz, ~emask); return (MLIB_SUCCESS); }
mlib_status
__mlib_VideoDownSample422(
    mlib_u8 *dst,
    const mlib_u8 *src,
    mlib_s32 n)
{
	mlib_d64 *sp0 = (mlib_d64 *)src;
	mlib_d64 *pd = (mlib_d64 *)dst;
	mlib_d64 d0;
	mlib_d64 tmp, data0, data1;
	mlib_d64 acc0_hi, acc0_lo;
	mlib_d64 round = vis_to_double_dup(0x1);
	mlib_f32 fone = vis_to_float(0x1000000);
	mlib_s32 i, edge;

	if (n <= 0)
		return (MLIB_FAILURE);

	vis_write_gsr(6 << 3);
	vis_write_bmask(0x02461357, 0);

#pragma pipeloop(0)
	for (i = 0; i <= n - 16; i += 16) {
		d0 = (*sp0++);
		tmp = vis_bshuffle(d0, d0);
		acc0_hi = vis_fmul8x16au(vis_read_hi(tmp), fone);
		acc0_lo = vis_fmul8x16au(vis_read_lo(tmp), fone);
		acc0_hi = vis_fpadd16(acc0_hi, acc0_lo);
		data0 = vis_fpadd16(acc0_hi, round);
		d0 = (*sp0++);
		tmp = vis_bshuffle(d0, d0);
		acc0_hi = vis_fmul8x16au(vis_read_hi(tmp), fone);
		acc0_lo = vis_fmul8x16au(vis_read_lo(tmp), fone);
		acc0_hi = vis_fpadd16(acc0_hi, acc0_lo);
		data1 = vis_fpadd16(acc0_hi, round);
		(*pd++) = vis_fpack16_pair(data0, data1);
	}

	if (i < n) {
		d0 = (*sp0++);
		tmp = vis_bshuffle(d0, d0);
		acc0_hi = vis_fmul8x16au(vis_read_hi(tmp), fone);
		acc0_lo = vis_fmul8x16au(vis_read_lo(tmp), fone);
		acc0_hi = vis_fpadd16(acc0_hi, acc0_lo);
		data0 = vis_fpadd16(acc0_hi, round);
		d0 = vis_ld_d64_nf(sp0);
		tmp = vis_bshuffle(d0, d0);
		acc0_hi = vis_fmul8x16au(vis_read_hi(tmp), fone);
		acc0_lo = vis_fmul8x16au(vis_read_lo(tmp), fone);
		acc0_hi = vis_fpadd16(acc0_hi, acc0_lo);
		data1 = vis_fpadd16(acc0_hi, round);
		edge = vis_edge8(pd, (dst + (n / 2) - 1));
		vis_pst_8(vis_fpack16_pair(data0, data1), pd, edge);
	}

	return (MLIB_SUCCESS);
}
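/*
 * Scalar sketch (assumption: the 0,1,0,1 lane pattern in "round" gives the
 * same alternating rounding bias as libjpeg's h2v1 downsample). n is the
 * number of source bytes; the destination receives n/2 averaged samples.
 * Hypothetical reference only.
 */
static void
mlib_VideoDownSample422_ref(
    mlib_u8 *dst,
    const mlib_u8 *src,
    mlib_s32 n)
{
	mlib_s32 i;

	for (i = 0; i < n / 2; i++)
		dst[i] = (mlib_u8)((src[2 * i] + src[2 * i + 1] +
		    (i & 1)) >> 1);
}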
mlib_status __mlib_VideoColorARGB2JFIFYCC422( mlib_u8 *y, mlib_u8 *cb, mlib_u8 *cr, const mlib_u8 *argb, mlib_s32 n) { mlib_d64 *sp = (mlib_d64 *)argb, *py = (mlib_d64 *)y; mlib_f32 *pcb = (mlib_f32 *)cb, *pcr = (mlib_f32 *)cr; mlib_u8 *yend = y + n, *cbend = cb + (n >> 1); mlib_d64 sd01, sd23, sd45, sd67, sd04, sd26, sd15, sd37; mlib_d64 dh0, dh1, dl0, dl1, z0, z1; mlib_s32 i; mlib_f32 k11 = vis_to_float((mlib_s32)(K11 * 8192)); mlib_f32 k12 = vis_to_float((mlib_s32)(K12 * 8192)); mlib_f32 k13 = vis_to_float((mlib_s32)(K13 * 8192)); mlib_f32 k21 = vis_to_float((mlib_s32)(K21 * 4096)); mlib_f32 k22 = vis_to_float((mlib_s32)(K22 * 4096)); mlib_f32 k23 = vis_to_float((mlib_s32)(K23 * 4096)); mlib_f32 k31 = vis_to_float((mlib_s32)(K31 * 4096)); mlib_f32 k32 = vis_to_float((mlib_s32)(K32 * 4096)); mlib_f32 k33 = vis_to_float((mlib_s32)(K33 * 4096)); mlib_d64 off128 = vis_to_double_dup(0x10101010); mlib_d64 off0 = vis_to_double_dup(0x00100010); if (n <= 0) return (MLIB_FAILURE); vis_write_gsr(2 << 3); n = n >> 3; #pragma pipeloop(0) for (i = 0; i < n; i++) { sd01 = (*sp++); sd23 = (*sp++); sd45 = (*sp++); sd67 = (*sp++); CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1, dl0, dl1); CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1), vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1), vis_read_lo(dl1), k11, k12, k13, off0, z0, z1); z1 = vis_fpadd16(z1, off0); py[0] = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1)); CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1), vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1), vis_read_lo(dl1), k21, k22, k23, off128, z0, z1); pcb[0] = vis_fpack16(vis_fpadd16(z0, z1)); CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1), vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1), vis_read_lo(dl1), k31, k32, k33, off128, z0, z1); pcr[0] = vis_fpack16(vis_fpadd16(z0, z1)); py++; pcb++; pcr++; } if ((mlib_u8 *)pcb < cbend) { mlib_d64 yd; mlib_f32 cbf, crf; mlib_s32 ymask, cmask; sd01 = (*sp++); sd23 = vis_ld_d64_nf(sp); sp++; sd45 = vis_ld_d64_nf(sp); sp++; sd67 = vis_ld_d64_nf(sp); CHANNELSEPARATE_U8_422(sd01, sd23, sd45, sd67, dh0, dh1, dl0, dl1); CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1), vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1), vis_read_lo(dl1), k11, k12, k13, off0, z0, z1); z1 = vis_fpadd16(z1, off0); yd = vis_fpmerge(vis_fpack16(z0), vis_fpack16(z1)); CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1), vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1), vis_read_lo(dl1), k21, k22, k23, off128, z0, z1); cbf = vis_fpack16(vis_fpadd16(z0, z1)); CHANNELWEIGHT_U8_2p(vis_read_lo(dh0), vis_read_hi(dh1), vis_read_lo(dh1), vis_read_lo(dl0), vis_read_hi(dl1), vis_read_lo(dl1), k31, k32, k33, off128, z0, z1); crf = vis_fpack16(vis_fpadd16(z0, z1)); ymask = vis_edge8(py, yend - 1); vis_pst_8(yd, py, ymask); cmask = vis_edge8(pcb, cbend - 1); if (cmask & 0xf0) { vis_pst_8(vis_freg_pair(cbf, vis_fzeros()), pcb, cmask); vis_pst_8(vis_freg_pair(crf, vis_fzeros()), pcr, cmask); } else { vis_pst_8(vis_freg_pair(vis_fzeros(), cbf), pcb - 1, cmask); vis_pst_8(vis_freg_pair(vis_fzeros(), crf), pcr - 1, cmask); } } return (MLIB_SUCCESS); }
mlib_status __mlib_VideoColorCMYK2JFIFYCCK444( mlib_u8 *y, mlib_u8 *cb, mlib_u8 *cr, mlib_u8 *k, const mlib_u8 *cmyk, mlib_s32 n) { mlib_d64 buff_arr[(SIZE / 2) + 2]; mlib_f32 *py, *pcb, *pcr, *pk; mlib_d64 *buff; mlib_d64 sdh, sdl, dr, dg, db, dd; mlib_s32 i, m, size, num; mlib_f32 k11 = vis_to_float((mlib_s32)(K11 * 8192)); mlib_f32 k12 = vis_to_float((mlib_s32)(K12 * 8192)); mlib_f32 k13 = vis_to_float((mlib_s32)(K13 * 8192)); mlib_f32 k21 = vis_to_float((mlib_s32)(K21 * 8192)); mlib_f32 k22 = vis_to_float((mlib_s32)(K22 * 8192)); mlib_f32 k23 = vis_to_float((mlib_s32)(K23 * 8192)); mlib_f32 k31 = vis_to_float((mlib_s32)(K31 * 8192)); mlib_f32 k32 = vis_to_float((mlib_s32)(K32 * 8192)); mlib_f32 k33 = vis_to_float((mlib_s32)(K33 * 8192)); mlib_d64 off128 = vis_to_double_dup(0x10101010); mlib_d64 off255 = vis_to_double_dup(0x1ff01ff0); vis_write_gsr(2 << 3); /* * 4-pixel loop */ for (size = 0; size < n; size += num) { num = n - size; if (num > SIZE) num = SIZE; m = (num + 3) / 4; mlib_channel_separate((mlib_d64 *)cmyk + size / 2, buff_arr, m); m = (num / 4) & ~1; py = (mlib_f32 *)y + size / 4; pcb = (mlib_f32 *)cb + size / 4; pcr = (mlib_f32 *)cr + size / 4; pk = (mlib_f32 *)k + size / 4; buff = buff_arr; #pragma pipeloop(0) for (i = 0; i < m; i++) { sdh = buff[0]; sdl = buff[1]; CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh), vis_read_hi(sdl), k11, k12, k13, off255, py[0]); CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh), vis_read_hi(sdl), k21, k22, k23, off128, pcb[0]); CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh), vis_read_hi(sdl), k31, k32, k33, off128, pcr[0]); py++; pcb++; pcr++; (*pk++) = vis_read_lo(sdl); buff += 2; } } if (n & 7) { mlib_s32 emask = (0xFF00 >> (n & 7)) & 0xFF; mlib_d64 rbuff[4]; mlib_f32 *prbuff = (mlib_f32 *)rbuff; sdh = (*buff++); sdl = (*buff++); CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh), vis_read_hi(sdl), k11, k12, k13, off255, prbuff[0]); CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh), vis_read_hi(sdl), k21, k22, k23, off128, prbuff[2]); CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh), vis_read_hi(sdl), k31, k32, k33, off128, prbuff[4]); prbuff[6] = vis_read_lo(sdl); sdh = (*buff++); sdl = (*buff++); CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh), vis_read_hi(sdl), k11, k12, k13, off255, prbuff[1]); CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh), vis_read_hi(sdl), k21, k22, k23, off128, prbuff[3]); CHANNELWEIGHT_U8(vis_read_hi(sdh), vis_read_lo(sdh), vis_read_hi(sdl), k31, k32, k33, off128, prbuff[5]); prbuff[7] = vis_read_lo(sdl); vis_pst_8(rbuff[0], py, emask); vis_pst_8(rbuff[1], pcb, emask); vis_pst_8(rbuff[2], pcr, emask); vis_pst_8(rbuff[3], pk, emask); }
/* The case of even address of vector x */ static void mlib_VectorDotProd_U8C_al_x( mlib_d64 *z, const void *x, const void *y, mlib_s32 n) { mlib_u8 *pxend, *px = (mlib_u8 *)x, *py = (mlib_u8 *)y; mlib_d64 sum_r = 0.0, sum_i = 0.0; mlib_d64 *dpx, *dpy, *dpxend; mlib_d64 dx, dy, dy0, dy1; mlib_d64 dx_r, dy_r, dy_i; mlib_d64 d_iih, d_iil, d_irh, d_irl, d_rih, d_ril, d_rrh, d_rrl; mlib_d64 d_ih, d_il, d_rh, d_rl; mlib_d64 ds_r, ds_i, ds1_r, ds1_i; mlib_d64 lb_mask = vis_to_double_dup(0x00FF00FF); mlib_d64 edge[2], fzero = vis_fzero(); mlib_f32 fsum; mlib_s32 d_left; mlib_s32 emask, off; edge[0] = edge[1] = 0; dpx = (mlib_d64 *)((mlib_addr)px & (~7)); off = (mlib_addr)dpx - (mlib_addr)px; dpy = vis_alignaddr((void *)py, off); pxend = px + n + n - 1; dpxend = (mlib_d64 *)((mlib_addr)pxend & (~7)); emask = vis_edge8(px, pxend); vis_pst_8(dpx[0], edge, emask); dx = edge[0]; dy = vis_ld_d64_nf(dpy); if (((((mlib_addr)px) ^ ((mlib_addr)py)) & 7) == 0) { vis_write_bmask(0x781A3C5E, 0); while ((mlib_addr)dpx < (mlib_addr)dpxend) { d_left = dpxend - dpx; if (d_left > MAX_LOOP) d_left = MAX_LOOP; ds_i = ds_r = ds1_i = ds1_r = 0.0; #pragma pipeloop(0) for (; d_left > 0; d_left--) { DPROD_U8C0; SUM_U8C; dx = dpx[1]; dy = dpy[1]; dpx++; dpy++; } ds_i = vis_fpadd32(ds_i, ds1_i); ds_r = vis_fpadd32(ds_r, ds1_r); fsum = vis_read_hi(ds_r); sum_r += (mlib_d64)*((mlib_s32 *)&fsum); fsum = vis_read_lo(ds_r); sum_r += (mlib_d64)*((mlib_s32 *)&fsum); fsum = vis_read_hi(ds_i); sum_i += (mlib_d64)*((mlib_s32 *)&fsum); fsum = vis_read_lo(ds_i); sum_i += (mlib_d64)*((mlib_s32 *)&fsum); } } else { mlib_s32 mask = ((mlib_addr)(py + off)) & 7; vis_write_bmask(0x11111111 * mask, 0x01234567); dy1 = vis_ld_d64_nf(dpy+1); dy = vis_bshuffle(dy, dy1); SET_ALIGN_U8C; while ((mlib_addr)dpx < (mlib_addr)dpxend) { d_left = dpxend - dpx; if (d_left > MAX_LOOP) d_left = MAX_LOOP; ds_i = ds_r = ds1_i = ds1_r = 0.0; #pragma pipeloop(0) for (; d_left > 0; d_left--) { DPROD_U8C; SUM_U8C; dy0 = dy1; dy1 = vis_ld_d64_nf(dpy+2); dx = vis_ld_d64_nf(dpx+1); dy = vis_bshuffle(dy0, dy1); dpx++; dpy++; } ds_i = vis_fpadd32(ds_i, ds1_i); ds_r = vis_fpadd32(ds_r, ds1_r); fsum = vis_read_hi(ds_r); sum_r += (mlib_d64)*((mlib_s32 *)&fsum); fsum = vis_read_lo(ds_r); sum_r += (mlib_d64)*((mlib_s32 *)&fsum); fsum = vis_read_hi(ds_i); sum_i += (mlib_d64)*((mlib_s32 *)&fsum); fsum = vis_read_lo(ds_i); sum_i += (mlib_d64)*((mlib_s32 *)&fsum); } } if ((mlib_addr)dpx <= (mlib_addr)pxend) { emask = vis_edge8(dpx, pxend); vis_pst_8(dx, edge + 1, emask); dx = edge[1]; SET_ALIGN_U8C; DPROD_U8C; SUM_U8C_TAIL; fsum = vis_read_hi(ds_r); sum_r += (mlib_d64)*((mlib_s32 *)&fsum); fsum = vis_read_lo(ds_r); sum_r += (mlib_d64)*((mlib_s32 *)&fsum); fsum = vis_read_hi(ds_i); sum_i += (mlib_d64)*((mlib_s32 *)&fsum); fsum = vis_read_lo(ds_i); sum_i += (mlib_d64)*((mlib_s32 *)&fsum); } z[0] = sum_r; z[1] = sum_i; #undef MAX_LOOP }
mlib_status __mlib_VectorSub_U8_U8_Mod( mlib_u8 *z, const mlib_u8 *x, const mlib_u8 *y, mlib_s32 n) { /* 8-byte aligned start point in destination */ mlib_d64 *dpz; /* 8-byte aligned start point in source */ mlib_d64 *dpx, *dpy; /* source data */ mlib_d64 dx, dy, dx0; mlib_d64 dx1, dy0, dy1; /* destination data */ mlib_d64 dz; /* intermediate result */ mlib_d64 dh, dl; mlib_d64 dxl, dyl; /* end point of a line in destination */ mlib_u8 *pzend; /* start point of a line in source */ mlib_u8 *px, *py; /* offset of address alignment in destination */ mlib_s32 off; /* edge mask */ mlib_s32 emask; mlib_u8 *pzend16; mlib_s32 sr1, sr2, sr3; mlib_s32 x8, x12, y8, y12; mlib_s32 mask = 0x7f7f7f7f; mlib_u8 *pz; mlib_s32 n16; mlib_s32 nrest; mlib_s32 len = n, i; /* rest and leng in terms of 8 bytes. */ mlib_s32 rest_8, even_8; mlib_d64 mask_control = vis_to_double_dup(0xff00ff00); if (n <= 0) return (MLIB_FAILURE); px = (mlib_u8 *)x; py = (mlib_u8 *)y; pz = (mlib_u8 *)z; /* * prepare the destination address */ pzend = pz + n - 1; /* * check for 64-bit aligned special case */ if ((((mlib_addr)x | (mlib_addr)y | (mlib_addr)z) & 7) == 0) { /* * We can process source and destination vectors by 16 bytes. */ dpx = (mlib_d64 *)x; dx = vis_ld_d64_nf(dpx); dpy = (mlib_d64 *)y; dy = vis_ld_d64_nf(dpy); dpz = (mlib_d64 *)z; n16 = n & (~0xf); pzend16 = pz + n16; #pragma pipeloop(0) while ((mlib_addr)pz < (mlib_addr)pzend16) { x8 = *((mlib_s32 *)(px + 8)); y8 = *((mlib_s32 *)(py + 8)); sr1 = x8 ^ ~y8; sr2 = (x8 | ~mask) - (y8 & mask); sr3 = (sr1 & ~mask) ^ sr2; *((mlib_s32 *)(pz + 8)) = sr3; x12 = *((mlib_s32 *)(px + 12)); y12 = *((mlib_s32 *)(py + 12)); sr1 = x12 ^ ~y12; sr2 = (x12 | ~mask) - (y12 & mask); sr3 = (sr1 & ~mask) ^ sr2; *((mlib_s32 *)(pz + 12)) = sr3; SUB_S8_MOD; /* store 8 bytes of result */ *((mlib_d64 *)pz) = dz; dx = vis_ld_d64_nf(px + 16); dy = vis_ld_d64_nf(py + 16); px += 16; py += 16; pz += 16; } dpz = (mlib_d64 *)pzend16; nrest = n - n16; if (nrest >= 8) { SUB_S8_MOD; dpz[0] = dz; px += 8; py += 8; dpz++; nrest -= 8; } if (nrest > 0) { dx = *((mlib_d64 *)px); dy = *((mlib_d64 *)py); SUB_S8_MOD; emask = vis_edge8(dpz, pzend); vis_pst_8(dz, dpz, emask); } } else { /* * General case. */ dpz = (mlib_d64 *)((mlib_addr)z & (~7)); off = (mlib_addr)dpz - (mlib_addr)z; /* * generate edge mask for the start point */ emask = vis_edge8(pz, pzend); /* * prepare the source address */ if (off) { dpy = (mlib_d64 *)vis_alignaddr(py, off); dy0 = vis_ld_d64_nf(dpy); dy1 = vis_ld_d64_nf(dpy + 1); dy = vis_faligndata(dy0, dy1); dpx = (mlib_d64 *)vis_alignaddr(px, off); dx0 = vis_ld_d64_nf(dpx); dx1 = vis_ld_d64_nf(dpx + 1); dx = vis_faligndata(dx0, dx1); SUB_S8_MOD; /* * store first bytes of result */ vis_pst_8(dz, dpz, emask); px += (8 + off); py += (8 + off); len -= (8 + off); dpz++; if (len <= 0) return (MLIB_SUCCESS); } even_8 = len >> 3; rest_8 = len & 0x7; /* * Now try to analyze source "x" and "y" addresses. */ if ((!((mlib_addr)px & 7)) && (!((mlib_addr)py & 7))) { /* * Both addresses are 8-byte aligned. No vis_alignaddr * and vis_faligndata at all. */ dpx = (mlib_d64 *)px; dpy = (mlib_d64 *)py; dx = vis_ld_d64_nf(dpx); dpx++; dy = vis_ld_d64_nf(dpy); dpy++; #pragma pipeloop(0) for (i = 0; i < even_8; i++) { dx1 = vis_ld_d64_nf(dpx); dy1 = vis_ld_d64_nf(dpy); SUB_S8_MOD; dx = dx1; dy = dy1; /* * store 8 bytes of result */ dpz[0] = dz; dpx++; dpy++; dpz++; } dx1 = dx; dy1 = dy; } else if ((!((mlib_addr)px & 7))) { /* * First ("x") address is 8-byte aligned. vis_alignaddr * and vis_faligndata only for "y". 
*/ dpx = (mlib_d64 *)px; dpy = vis_alignaddr(py, 0); dy0 = vis_ld_d64_nf(dpy); dpy++; dy1 = vis_ld_d64_nf(dpy); dy = vis_faligndata(dy0, dy1); dx = vis_ld_d64_nf(dpx); dpx++; #pragma pipeloop(0) for (i = 0; i < even_8; i++) { SUB_S8_MOD; dx = vis_ld_d64_nf(dpx); dy0 = dy1; dy1 = vis_ld_d64_nf(dpy + 1); dy = vis_faligndata(dy0, dy1); /* * store 8 bytes of result */ (*dpz++) = dz; dpx++; dpy++; } dx1 = dx; dy1 = dy0; } else if ((!((mlib_addr)py & 7))) { /* * Second ("y") address is 8-byte aligned. vis_alignaddr * and vis_faligndata only for "x". */ dpy = (mlib_d64 *)py; dpx = vis_alignaddr(px, 0); dx1 = vis_ld_d64_nf(dpx); dpx++; #pragma pipeloop(0) for (i = 0; i < even_8; i++) { dy = (*dpy++); dx0 = dx1; dx1 = vis_ld_d64_nf(dpx); dpx++; dx = vis_faligndata(dx0, dx1); SUB_S8_MOD; /* * store 8 bytes of result */ (*dpz++) = dz; } dy1 = vis_ld_d64_nf(dpy); dpy++; } else if (((mlib_addr)px & 7) == ((mlib_addr)py & 7)) { /* * Both ("x" and "y") address are identically aligned. * There are 1 vis_alignaddr and 2 vis_faligndata(s) in the loop. */ dpx = vis_alignaddr(px, 0); dx1 = vis_ld_d64_nf(dpx); dpx++; dpy = vis_alignaddr(py, 0); dy1 = vis_ld_d64_nf(dpy); dpy++; #pragma pipeloop(0) for (i = 0; i < even_8; i++) { dy0 = dy1; dy1 = vis_ld_d64_nf(dpy); dpy++; dy = vis_faligndata(dy0, dy1); dx0 = dx1; dx1 = vis_ld_d64_nf(dpx); dpx++; dx = vis_faligndata(dx0, dx1); SUB_S8_MOD; /* * store 8 bytes of result */ (*dpz++) = dz; } } else { /* * Both ("x" and "y") address are arbitrary aligned. * 2 vis_alignaddr(s) and 2 vis_faligndata(s) in the loop. */ dpx = vis_alignaddr(px, 0); dx0 = vis_ld_d64_nf(dpx); dpx++; dx1 = vis_ld_d64_nf(dpx); dx = vis_faligndata(dx0, dx1); dpy = vis_alignaddr(py, 0); dy0 = vis_ld_d64_nf(dpy); dpy++; dy1 = vis_ld_d64_nf(dpy); dy = vis_faligndata(dy0, dy1); #pragma pipeloop(0) for (i = 0; i < even_8; i++) { SUB_S8_MOD; vis_alignaddr(py, 0); dy0 = dy1; dy1 = vis_ld_d64_nf(dpy + 1); dy = vis_faligndata(dy0, dy1); vis_alignaddr(px, 0); dx0 = dx1; dx1 = vis_ld_d64_nf(dpx + 1); dx = vis_faligndata(dx0, dx1); /* * store 8 bytes of result */ (*dpz++) = dz; dpy++; dpx++; } dx1 = dx0; dy1 = dy0; } if (!rest_8) return (MLIB_SUCCESS); vis_alignaddr(px, 0); dx0 = dx1; dx1 = vis_ld_d64_nf(dpx); dx = vis_faligndata(dx0, dx1); vis_alignaddr(py, 0); dy0 = dy1; dy1 = vis_ld_d64_nf(dpy); dy = vis_faligndata(dy0, dy1); SUB_S8_MOD; /* * prepare edge mask for the last bytes */ emask = vis_edge8((void *)(rest_8), pzend); /* store last bytes of result */ vis_pst_8(dz, dpz, ~emask); } return (MLIB_SUCCESS); }
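/*
 * Worked example of the SWAR trick used in the aligned fast path above:
 * with mask = 0x7f7f7f7f, four byte-wise subtractions (mod 256) are done in
 * one 32-bit word without letting borrows cross byte boundaries.
 * Hypothetical helper; the constants mirror the code, the sample values in
 * the comment are made up.
 */
static mlib_s32
sub_bytes_mod_ref(mlib_s32 x, mlib_s32 y)
{
	mlib_s32 mask = 0x7f7f7f7f;
	mlib_s32 sr1 = x ^ ~y;
	mlib_s32 sr2 = (x | ~mask) - (y & mask);

/* e.g. x = 0x01020304, y = 0x05040302 gives 0xfcfe0002 byte-wise */
	return ((sr1 & ~mask) ^ sr2);
}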
static mlib_status mlib_v_VideoColorYUV2ABGR422_nonalign( mlib_u8 *abgr, const mlib_u8 *y, const mlib_u8 *u, const mlib_u8 *v, mlib_s32 width, mlib_s32 height, mlib_s32 abgr_stride, mlib_s32 y_stride, mlib_s32 uv_stride) { /* pointers to src address */ mlib_u8 *sp2, *sp3, *sl2, *sl3; /* pointers to src address */ mlib_u8 *sp1, *sl1; /* pointers to dst address */ mlib_u8 *dp, *dl, *dend; /* all. pointer to y */ mlib_d64 *spy; /* all. pointer to dst */ mlib_d64 *dpp; /* u, v data */ mlib_f32 fu0, fu1, fv0, fv1; /* y data */ mlib_d64 dy0, dy1, dy3; mlib_d64 du, dv; /* (1.1644, 1.5966)*8192 */ mlib_f32 k12 = vis_to_float(0x25433317); /* (-.3920, -.8132)*8192 */ mlib_f32 k34 = vis_to_float(0xf375e5fa); /* 2.0184*8192 */ mlib_f32 k5 = vis_to_float(0x1004097); mlib_d64 k_222_9952 = vis_to_double(0x1be01be0, 0x1be01be0); mlib_d64 k_135_6352 = vis_to_double(0x10f410f4, 0x10f410f4); mlib_d64 k_276_9856 = vis_to_double(0x22a022a0, 0x22a022a0); mlib_d64 u_3920_hi, u_20184_hi, v_15966_hi, v_8132_hi; mlib_d64 u_3920_lo, u_20184_lo, v_15966_lo, v_8132_lo; mlib_d64 y_11644_hi, y_11644_lo; mlib_d64 r_hi, r_lo, g_hi, g_lo, b_hi, b_lo; mlib_d64 temp_r_hi, temp_r_lo, temp_g_hi, temp_g_lo, temp_b_hi, temp_b_lo; mlib_f32 red_hi, red_lo, green_hi, green_lo, blue_hi, blue_lo; mlib_d64 blue_red_hi, x_green_hi, blue_red_lo, x_green_lo; mlib_d64 dd, dd0, dd1; /* loop variable */ mlib_s32 i, j; /* alpha_ch. is not written */ mlib_s32 emask = 0x7777; mlib_s32 emask1; mlib_s32 off; mlib_f32 *dfu, *dfv; mlib_d64 du0, du1, dv0, dv1; mlib_s32 off2, off3; mlib_s32 inc; /* * initialize GSR scale factor */ vis_write_gsr(2 << 3); sp1 = sl1 = (mlib_u8 *)y; sp2 = sl2 = (mlib_u8 *)u; sp3 = sl3 = (mlib_u8 *)v; dl = dp = (mlib_u8 *)abgr; /* * row loop */ for (j = 0; j < height; j++) { spy = (mlib_d64 *)vis_alignaddr(sp1, 0); dpp = (mlib_d64 *)vis_alignaddr(dp, 0); dfu = (mlib_f32 *)((mlib_addr)sp2 & ~3); off2 = (sp2 - (mlib_u8 *)dfu) * 2; dfv = (mlib_f32 *)((mlib_addr)sp3 & ~3); off3 = (sp3 - (mlib_u8 *)dfv) * 2; dend = dp + width * 4 - 1; emask1 = vis_edge8(dp, dend); i = dp - (mlib_u8 *)dpp; emask >>= i; inc = (emask1 != 0xff); emask1 &= emask; off = 8 - i; vis_alignaddr((void *)off2, 0); fu0 = vis_ld_f32_nf(dfu); dfu++; fu1 = vis_ld_f32_nf(dfu); dfu++; du0 = vis_fpmerge(fu0, fu0); du1 = vis_fpmerge(fu1, fu1); du = vis_faligndata(du0, du1); du0 = du1; vis_alignaddr((void *)off3, 0); fv0 = vis_ld_f32_nf(dfv); dfv++; fv1 = vis_ld_f32_nf(dfv); dfv++; dv0 = vis_fpmerge(fv0, fv0); dv1 = vis_fpmerge(fv1, fv1); dv = vis_faligndata(dv0, dv1); dv0 = dv1; /* U*(-0.3920); */ u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34); /* V*(-0.8132); */ v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34); /* U*(-0.3920); */ u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34); /* V*(-0.8132); */ v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34); vis_alignaddr(sp1, 0); dy0 = vis_ld_d64_nf(spy); spy++; dy3 = vis_ld_d64_nf(spy); spy++; dy1 = vis_faligndata(dy0, dy3); dy0 = dy3; /* * 16-pixel column loop */ #pragma pipeloop(0) for (i = 0; i <= width - 8; i += 8) { /* U*2.0184 */ u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5); g_hi = vis_fpadd16(u_3920_hi, v_8132_hi); u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5); g_hi = vis_fpadd16(g_hi, k_135_6352); /* V*1.5966 */ v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12); g_lo = vis_fpadd16(u_3920_lo, v_8132_lo); v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12); g_lo = vis_fpadd16(g_lo, k_135_6352); /* Y*1.1644 */ y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12); b_hi = vis_fpsub16(u_20184_hi, k_276_9856); /* 
Y*1.1644 */ y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12); b_lo = vis_fpsub16(u_20184_lo, k_276_9856); r_hi = vis_fpsub16(v_15966_hi, k_222_9952); r_lo = vis_fpsub16(v_15966_lo, k_222_9952); temp_g_hi = vis_fpadd16(g_hi, y_11644_hi); temp_b_hi = vis_fpadd16(b_hi, y_11644_hi); green_hi = vis_fpack16(temp_g_hi); temp_r_hi = vis_fpadd16(r_hi, y_11644_hi); blue_hi = vis_fpack16(temp_b_hi); temp_g_lo = vis_fpadd16(g_lo, y_11644_lo); red_hi = vis_fpack16(temp_r_hi); temp_b_lo = vis_fpadd16(b_lo, y_11644_lo); vis_alignaddr((void *)off2, 0); fu1 = vis_ld_f32_nf(dfu); dfu++; du1 = vis_fpmerge(fu1, fu1); du = vis_faligndata(du0, du1); du0 = du1; green_lo = vis_fpack16(temp_g_lo); temp_r_lo = vis_fpadd16(r_lo, y_11644_lo); blue_lo = vis_fpack16(temp_b_lo); x_green_hi = vis_fmul8x16au(green_hi, k5); red_lo = vis_fpack16(temp_r_lo); blue_red_hi = vis_fpmerge(blue_hi, red_hi); x_green_lo = vis_fmul8x16au(green_lo, k5); blue_red_lo = vis_fpmerge(blue_lo, red_lo); vis_alignaddr((void *)off3, 0); fv1 = vis_ld_f32_nf(dfv); dfv++; dv1 = vis_fpmerge(fv1, fv1); dv = vis_faligndata(dv0, dv1); dv0 = dv1; vis_alignaddr((void *)off, 0); /* U*(-0.3920); */ u_3920_hi = vis_fmul8x16au(vis_read_hi(du), k34); dd1 = vis_fpmerge(vis_read_hi(x_green_hi), vis_read_hi(blue_red_hi)); dd = vis_faligndata(dd0, dd1); vis_pst_8(dd, dpp, emask1); dpp += inc; inc = 1; /* V*(-0.8132); */ v_8132_hi = vis_fmul8x16al(vis_read_hi(dv), k34); dd0 = vis_fpmerge(vis_read_lo(x_green_hi), vis_read_lo(blue_red_hi)); dd = vis_faligndata(dd1, dd0); vis_pst_8(dd, dpp++, emask); u_3920_lo = vis_fmul8x16au(vis_read_lo(du), k34); dd1 = vis_fpmerge(vis_read_hi(x_green_lo), vis_read_hi(blue_red_lo)); dd = vis_faligndata(dd0, dd1); vis_pst_8(dd, dpp++, emask); v_8132_lo = vis_fmul8x16al(vis_read_lo(dv), k34); dd0 = vis_fpmerge(vis_read_lo(x_green_lo), vis_read_lo(blue_red_lo)); dd = vis_faligndata(dd1, dd0); vis_pst_8(dd, dpp++, emask); vis_alignaddr(sp1, 0); dy3 = vis_ld_d64_nf(spy); spy++; dy1 = vis_faligndata(dy0, dy3); dy0 = dy3; emask1 = emask; } if (i < width) { vis_alignaddr((void *)off, 0); /* U*2.0184 */ u_20184_hi = vis_fmul8x16al(vis_read_hi(du), k5); g_hi = vis_fpadd16(u_3920_hi, v_8132_hi); u_20184_lo = vis_fmul8x16al(vis_read_lo(du), k5); g_hi = vis_fpadd16(g_hi, k_135_6352); /* V*1.5966 */ v_15966_hi = vis_fmul8x16al(vis_read_hi(dv), k12); g_lo = vis_fpadd16(u_3920_lo, v_8132_lo); v_15966_lo = vis_fmul8x16al(vis_read_lo(dv), k12); g_lo = vis_fpadd16(g_lo, k_135_6352); /* Y*1.1644 */ y_11644_hi = vis_fmul8x16au(vis_read_hi(dy1), k12); b_hi = vis_fpsub16(u_20184_hi, k_276_9856); /* Y*1.1644 */ y_11644_lo = vis_fmul8x16au(vis_read_lo(dy1), k12); b_lo = vis_fpsub16(u_20184_lo, k_276_9856); r_hi = vis_fpsub16(v_15966_hi, k_222_9952); r_lo = vis_fpsub16(v_15966_lo, k_222_9952); temp_g_hi = vis_fpadd16(g_hi, y_11644_hi); temp_b_hi = vis_fpadd16(b_hi, y_11644_hi); green_hi = vis_fpack16(temp_g_hi); temp_r_hi = vis_fpadd16(r_hi, y_11644_hi); blue_hi = vis_fpack16(temp_b_hi); temp_g_lo = vis_fpadd16(g_lo, y_11644_lo); red_hi = vis_fpack16(temp_r_hi); temp_b_lo = vis_fpadd16(b_lo, y_11644_lo); green_lo = vis_fpack16(temp_g_lo); temp_r_lo = vis_fpadd16(r_lo, y_11644_lo); blue_lo = vis_fpack16(temp_b_lo); x_green_hi = vis_fmul8x16au(green_hi, k5); red_lo = vis_fpack16(temp_r_lo); blue_red_hi = vis_fpmerge(blue_hi, red_hi); x_green_lo = vis_fmul8x16au(green_lo, k5); blue_red_lo = vis_fpmerge(blue_lo, red_lo); dd1 = vis_fpmerge(vis_read_hi(x_green_hi), vis_read_hi(blue_red_hi)); dd = vis_faligndata(dd0, dd1); vis_pst_8(dd, dpp, emask1); dd0 = 
dd1; dpp += inc; i += 2; if (i < width) { dd1 = vis_fpmerge(vis_read_lo(x_green_hi), vis_read_lo(blue_red_hi)); dd = vis_faligndata(dd0, dd1); vis_pst_8(dd, dpp++, emask); dd0 = dd1; i += 2; if (i < width) { dd1 = vis_fpmerge(vis_read_hi (x_green_lo), vis_read_hi(blue_red_lo)); dd = vis_faligndata(dd0, dd1); vis_pst_8(dd, dpp++, emask); dd0 = dd1; } } } vis_alignaddr((void *)off, 0); emask1 = vis_edge8(dpp, dend); emask1 &= emask; dd = vis_faligndata(dd0, dd1); vis_pst_8(dd, dpp, emask1); sp1 = sl1 = sl1 + y_stride; sp2 = sl2 = sl2 + uv_stride; sp3 = sl3 = sl3 + uv_stride; dl = dp = dl + abgr_stride; emask = 0x7777; } return (MLIB_SUCCESS); }
void mlib_v_ImageLookUpSI_U16_U8_2_D1(const mlib_u16 *src, mlib_u8 *dst, mlib_s32 xsize, const mlib_u8 **table) { mlib_u16 *sp; /* pointer to source data */ mlib_s32 s0, s1, s2, s3, s4; /* source data */ mlib_u8 *dl; /* pointer to start of destination */ mlib_u8 *dend; /* pointer to end of destination */ mlib_d64 *dp; /* aligned pointer to destination */ mlib_d64 t0, t1, t2; /* destination data */ mlib_d64 t3, t4, t5; /* destination data */ mlib_d64 t6, t7, acc; /* destination data */ mlib_s32 emask; /* edge mask */ mlib_s32 i, num; /* loop variable */ const mlib_u8 *tab0 = &table[0][0]; const mlib_u8 *tab1 = &table[1][0]; sp = (void *)src; dl = dst; dend = dl + 2 * xsize - 1; vis_alignaddr((void *)0, 7); s0 = *sp++; *dl++ = tab0[s0]; dp = (mlib_d64 *) dl; xsize--; if (xsize >= 4) { s1 = sp[0]; s2 = sp[1]; s3 = sp[2]; s4 = sp[3]; sp += 4; #pragma pipeloop(0) for (i = 0; i <= xsize - 8; i += 4, sp += 4) { t7 = VIS_LD_U8_I(tab0, s4); t6 = VIS_LD_U8_I(tab1, s3); t5 = VIS_LD_U8_I(tab0, s3); t4 = VIS_LD_U8_I(tab1, s2); t3 = VIS_LD_U8_I(tab0, s2); t2 = VIS_LD_U8_I(tab1, s1); t1 = VIS_LD_U8_I(tab0, s1); t0 = VIS_LD_U8_I(tab1, s0); acc = vis_faligndata(t7, acc); acc = vis_faligndata(t6, acc); acc = vis_faligndata(t5, acc); acc = vis_faligndata(t4, acc); acc = vis_faligndata(t3, acc); acc = vis_faligndata(t2, acc); acc = vis_faligndata(t1, acc); acc = vis_faligndata(t0, acc); s0 = s4; s1 = sp[0]; s2 = sp[1]; s3 = sp[2]; s4 = sp[3]; *dp++ = acc; } t7 = VIS_LD_U8_I(tab0, s4); t6 = VIS_LD_U8_I(tab1, s3); t5 = VIS_LD_U8_I(tab0, s3); t4 = VIS_LD_U8_I(tab1, s2); t3 = VIS_LD_U8_I(tab0, s2); t2 = VIS_LD_U8_I(tab1, s1); t1 = VIS_LD_U8_I(tab0, s1); t0 = VIS_LD_U8_I(tab1, s0); acc = vis_faligndata(t7, acc); acc = vis_faligndata(t6, acc); acc = vis_faligndata(t5, acc); acc = vis_faligndata(t4, acc); acc = vis_faligndata(t3, acc); acc = vis_faligndata(t2, acc); acc = vis_faligndata(t1, acc); acc = vis_faligndata(t0, acc); s0 = s4; *dp++ = acc; } num = ((mlib_u8 *) dend - (mlib_u8 *) dp) >> 1; sp += num; num++; #pragma pipeloop(0) for (i = 0; i < num; i++) { s1 = (mlib_s32) * sp; sp--; t0 = VIS_LD_U8_I(tab1, s1); acc = vis_faligndata(t0, acc); t0 = VIS_LD_U8_I(tab0, s1); acc = vis_faligndata(t0, acc); } t0 = VIS_LD_U8_I(tab1, s0); acc = vis_faligndata(t0, acc); emask = vis_edge8(dp, dend); vis_pst_8(acc, dp, emask); }
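/*
 * Scalar sketch (hypothetical helper) of the 2-channel lookup above: each
 * U16 source value indexes two per-channel tables and the results are
 * interleaved into the 2-channel destination.
 */
static void
mlib_ImageLookUpSI_U16_U8_2_ref(
    const mlib_u16 *src,
    mlib_u8 *dst,
    mlib_s32 xsize,
    const mlib_u8 **table)
{
	mlib_s32 i;

	for (i = 0; i < xsize; i++) {
		dst[2 * i] = table[0][src[i]];
		dst[2 * i + 1] = table[1][src[i]];
	}
}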
void mlib_v_ImageLookUp_S16_U8_3_D1(const mlib_s16 *src, mlib_u8 *dst, mlib_s32 xsize, const mlib_u8 *table0, const mlib_u8 *table1, const mlib_u8 *table2) { mlib_s16 *sp; /* pointer to source data */ mlib_s32 s0, s1, s2, s3; /* source data */ mlib_s32 s4, s5, s6, s7; /* source data */ mlib_u8 *dl; /* pointer to start of destination */ mlib_u8 *dend; /* pointer to end of destination */ mlib_d64 *dp; /* aligned pointer to destination */ mlib_d64 t0, t1, t2; /* destination data */ mlib_d64 t3, t4, t5; /* destination data */ mlib_d64 t6, t7, acc; /* destination data */ mlib_s32 emask; /* edge mask */ mlib_s32 i, num; /* loop variable */ const mlib_u8 *table; dl = dst; sp = (void *)src; dp = (mlib_d64 *) dl; dend = dl + xsize - 1; vis_alignaddr((void *)0, 7); if (xsize >= 8) { s0 = sp[0]; s1 = sp[1]; s2 = sp[2]; s3 = sp[3]; s4 = sp[4]; s5 = sp[5]; s6 = sp[6]; s7 = sp[7]; sp += 8; #pragma pipeloop(0) for (i = 0; i <= xsize - 16; i += 8, sp += 8) { t7 = VIS_LD_U8_I(table1, s7); t6 = VIS_LD_U8_I(table0, s6); t5 = VIS_LD_U8_I(table2, s5); t4 = VIS_LD_U8_I(table1, s4); t3 = VIS_LD_U8_I(table0, s3); t2 = VIS_LD_U8_I(table2, s2); t1 = VIS_LD_U8_I(table1, s1); t0 = VIS_LD_U8_I(table0, s0); acc = vis_faligndata(t7, acc); acc = vis_faligndata(t6, acc); acc = vis_faligndata(t5, acc); acc = vis_faligndata(t4, acc); acc = vis_faligndata(t3, acc); acc = vis_faligndata(t2, acc); acc = vis_faligndata(t1, acc); acc = vis_faligndata(t0, acc); table = table0; table0 = table2; table2 = table1; table1 = table; s0 = sp[0]; s1 = sp[1]; s2 = sp[2]; s3 = sp[3]; s4 = sp[4]; s5 = sp[5]; s6 = sp[6]; s7 = sp[7]; *dp++ = acc; } t7 = VIS_LD_U8_I(table1, s7); t6 = VIS_LD_U8_I(table0, s6); t5 = VIS_LD_U8_I(table2, s5); t4 = VIS_LD_U8_I(table1, s4); t3 = VIS_LD_U8_I(table0, s3); t2 = VIS_LD_U8_I(table2, s2); t1 = VIS_LD_U8_I(table1, s1); t0 = VIS_LD_U8_I(table0, s0); acc = vis_faligndata(t7, acc); acc = vis_faligndata(t6, acc); acc = vis_faligndata(t5, acc); acc = vis_faligndata(t4, acc); acc = vis_faligndata(t3, acc); acc = vis_faligndata(t2, acc); acc = vis_faligndata(t1, acc); acc = vis_faligndata(t0, acc); table = table0; table0 = table2; table2 = table1; table1 = table; *dp++ = acc; } if ((mlib_addr) dp <= (mlib_addr) dend) { num = (mlib_addr) dend - (mlib_addr) dp; sp += num; num++; i = num - 3 * (num / 3); if (i == 2) { s0 = (mlib_s32) * sp; sp--; t0 = VIS_LD_U8_I(table1, s0); acc = vis_faligndata(t0, acc); s0 = (mlib_s32) * sp; sp--; t0 = VIS_LD_U8_I(table0, s0); acc = vis_faligndata(t0, acc); num -= 2; } else if (i == 1) { s0 = (mlib_s32) * sp; sp--; t0 = VIS_LD_U8_I(table0, s0); acc = vis_faligndata(t0, acc); num--; } #pragma pipeloop(0) for (i = 0; i < num; i += 3) { s0 = (mlib_s32) * sp; sp--; t0 = VIS_LD_U8_I(table2, s0); acc = vis_faligndata(t0, acc); s0 = (mlib_s32) * sp; sp--; t0 = VIS_LD_U8_I(table1, s0); acc = vis_faligndata(t0, acc); s0 = (mlib_s32) * sp; sp--; t0 = VIS_LD_U8_I(table0, s0); acc = vis_faligndata(t0, acc); } emask = vis_edge8(dp, dend); vis_pst_8(acc, dp, emask); } }
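/*
 * Reference sketch (not part of the original source): scalar equivalent of
 * the 3-channel lookup above.  xsize counts individual channel values
 * (three per pixel) and the three tables are applied cyclically to the
 * interleaved data.  As in the VIS code, the table pointers are assumed to
 * be biased by the caller so the full mlib_s16 index range is addressable.
 */
static void
ref_lookup_s16_u8_3(const mlib_s16 *src, mlib_u8 *dst, mlib_s32 xsize,
    const mlib_u8 *table0, const mlib_u8 *table1, const mlib_u8 *table2)
{
	const mlib_u8 *tab[3];
	mlib_s32 i;

	tab[0] = table0;
	tab[1] = table1;
	tab[2] = table2;

	for (i = 0; i < xsize; i++)
		dst[i] = tab[i % 3][src[i]];
}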
mlib_status __mlib_VectorConjRev_S8C_S8C_Sat( mlib_s8 *zz, const mlib_s8 *xx, mlib_s32 n) { const mlib_s8 *x = xx; mlib_s8 *z = zz; mlib_s8 *src = (mlib_s8 *)x, *dst = z + 2 * (n); mlib_d64 *dsrc, *ddst; mlib_d64 d1, d2, d3, d4, dl, dh, d_rest; mlib_d64 dcntr0 = vis_to_double_dup(0x00800080); mlib_d64 dxor0 = vis_to_double_dup(0x007f007f); mlib_d64 done = vis_to_double_dup(1); mlib_s8 c; mlib_s32 i, rest_64, len_64, even_length, odd = 0, length = (mlib_s32)n * 2; mlib_s32 re_part; mlib_f32 f_null = vis_to_float(0); CHECK(x, z); if (n < 8) { CONJREVC(mlib_s8, MLIB_S8_MAX, MLIB_S8_MIN); } while (((mlib_addr)dst) & 7) { if ((c = src[1]) == MLIB_S8_MIN) *--dst = MLIB_S8_MAX; else *--dst = -c; length -= 2; src += 2; if (((mlib_addr)dst) & 7) { *--dst = src[-2]; } else { re_part = src[-2]; odd = 1; break; } } vis_write_gsr(7 << 3); ddst = (mlib_d64 *)dst; rest_64 = length & 7; len_64 = length >> 3; even_length = len_64 << 3; if (!odd) { /* * Aligning loop finished with imaginary part. The following processing * starts with real part. */ if (!((mlib_addr)src & 7)) { /* * Src address is 8-byte aligned. */ dsrc = (mlib_d64 *)src; #pragma pipeloop(0) for (i = 0; i < len_64; i++) { d3 = (*dsrc++); CONJ8; *--ddst = d4; } } else { dsrc = (mlib_d64 *)vis_alignaddr(src, 0); d2 = (*dsrc++); #pragma pipeloop(0) for (i = 0; i < len_64; i++) { d1 = d2; d2 = (*dsrc++); d3 = vis_faligndata(d1, d2); CONJ8; *--ddst = d4; } } } else { /* * Aligning loop finished with real part. The following processing * starts with imaginary part. */ if (!((mlib_addr)src & 7)) { /* * Src address is 8-byte aligned. */ dsrc = (mlib_d64 *)vis_alignaddr(src, 1); d_rest = vis_to_double((re_part << 24), 0); #pragma pipeloop(0) for (i = 0; i < len_64; i++) { d3 = (*dsrc++); CONJ8; *--ddst = vis_faligndata(d4, d_rest); d_rest = d4; } ddst--; d_rest = vis_faligndata(d_rest, d_rest); vis_pst_8(d_rest, ddst, 0x1); } else { dsrc = (mlib_d64 *)vis_alignaddr(src, 0); d2 = (*dsrc++); #pragma pipeloop(0) for (i = 0; i < len_64; i++) { d1 = d2; d2 = (*dsrc++); d3 = vis_faligndata(d1, d2); CONJ8; *--ddst = d4; } vis_write_gsr(1); d2 = *ddst; d3 = vis_faligndata(d1, d2); vis_pst_8(d3, (ddst - 1), 0x1); #pragma pipeloop(0) for (i = 0; i < len_64; i++) { d1 = d2; d2 = *(ddst + 1); (*ddst++) = vis_faligndata(d1, d2); } dst[-1] = re_part; } dst--; } if (!rest_64) return (MLIB_SUCCESS); for (i = 0; i < rest_64; i += 2) { dst[-even_length - 2 - i] = src[even_length + i]; if ((c = src[even_length + i + 1]) == MLIB_S8_MIN) dst[-even_length - 2 - i + 1] = MLIB_S8_MAX; else dst[-even_length - 2 - i + 1] = -c; } return (MLIB_SUCCESS); }
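/*
 * Reference sketch (not part of the original source): scalar form of the
 * conjugate-and-reverse operation above.  The imaginary part is negated
 * with S8 saturation (the MLIB_S8_MIN case is taken from the function's own
 * scalar tail loop) and the complex elements are written in reverse order.
 */
static void
ref_conjrev_s8c(mlib_s8 *z, const mlib_s8 *x, mlib_s32 n)
{
	mlib_s32 i;

	for (i = 0; i < n; i++) {
		mlib_s8 re = x[2 * i];
		mlib_s8 im = x[2 * i + 1];

		z[2 * (n - 1 - i)] = re;
		z[2 * (n - 1 - i) + 1] =
		    (im == MLIB_S8_MIN) ? MLIB_S8_MAX : -im;
	}
}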
mlib_status mlib_v_ImageAdd_U8( mlib_image *dst, const mlib_image *src1, const mlib_image *src2) { mlib_s32 i, j, k; mlib_s32 offdst, offsrc1, offsrc2, emask; mlib_s32 amount; mlib_d64 *dpp, *spp2, *spp1, *tmp_ptr; mlib_d64 dd, dd0, dd1, sd10, sd11, sd20, sd21; mlib_d64 sd1h, sd2h, sd1l, sd2l, rdh, rdl; mlib_u8 *dend; mlib_f32 nul = vis_to_float(0), fone = vis_to_float(0x100); VALIDATE(mlib_u8); /* initialize GSR scale factor */ vis_write_gsr(7 << 3); sl1 = sp1; sl2 = sp2; dl = dp; amount = width * channels; offdst = ((mlib_addr)dp) & 7; offsrc1 = ((mlib_addr)sp1) & 7; offsrc2 = ((mlib_addr)sp2) & 7; if ((offdst == offsrc1) && (offdst == offsrc2) && (((strided ^ stride1) & 7) == 0) && (((strided ^ stride2) & 7) == 0)) { for (j = 0; j < height; j++) { /* prepare the destination addresses */ dpp = (mlib_d64 *)vis_alignaddr(dp, 0); i = (mlib_u8 *)dpp - dp; /* prepare the source addresses */ spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0); spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0); dend = dp + amount - 1; /* generate edge mask for the start point */ emask = vis_edge8(dp, dend); if (emask != 0xff) { sd10 = (*spp1++); sd20 = (*spp2++); MLIB_V_ADDIMAGE_U8(sd10, sd20, dd); vis_pst_8(dd, dpp++, emask); i += 8; } #pragma pipeloop(0) for (; i <= amount - 8; i += 8) { sd10 = (*spp1++); sd20 = (*spp2++); MLIB_V_ADDIMAGE_U8(sd10, sd20, dd); (*dpp++) = dd; } if (i < amount) { emask = vis_edge8(dpp, dend); sd10 = (*spp1++); sd20 = (*spp2++); MLIB_V_ADDIMAGE_U8(sd10, sd20, dd); vis_pst_8(dd, dpp, emask); } sp1 = sl1 += stride1; sp2 = sl2 += stride2; dp = dl += strided; } } else if ((offdst == offsrc1) && (((strided ^ stride1) & 7) == 0)) { for (j = 0; j < height; j++) { /* prepare the destination addresses */ dpp = (mlib_d64 *)vis_alignaddr(dp, 0); i = (mlib_u8 *)dpp - dp; /* prepare the source addresses */ spp1 = (mlib_d64 *)vis_alignaddr(sp1, 0); spp2 = (mlib_d64 *)vis_alignaddr(sp2, i); dend = dp + amount - 1; /* generate edge mask for the start point */ emask = vis_edge8(dp, dend); sd20 = vis_ld_d64_nf(spp2); if (emask != 0xff) { sd10 = (*spp1++); sd21 = vis_ld_d64_nf(spp2 + 1); sd20 = vis_faligndata(sd20, sd21); MLIB_V_ADDIMAGE_U8(sd10, sd20, dd); vis_pst_8(dd, dpp++, emask); sd20 = sd21; spp2++; i += 8; } #pragma pipeloop(0) for (; i <= amount - 8; i += 8) { sd10 = (*spp1++); sd21 = vis_ld_d64_nf(spp2 + 1); sd20 = vis_faligndata(sd20, sd21); MLIB_V_ADDIMAGE_U8(sd10, sd20, dd); (*dpp++) = dd; sd20 = sd21; spp2++; } if (i < amount) { emask = vis_edge8(dpp, dend); sd10 = (*spp1++); sd20 = vis_faligndata(sd20, vis_ld_d64_nf(spp2 + 1)); MLIB_V_ADDIMAGE_U8(sd10, sd20, dd); vis_pst_8(dd, dpp, emask); } sp1 = sl1 += stride1; sp2 = sl2 += stride2; dp = dl += strided; } } else if ((offdst == offsrc2) && (((strided ^ stride2) & 7) == 0)) { for (j = 0; j < height; j++) { /* prepare the destination addresses */ dpp = (mlib_d64 *)vis_alignaddr(dp, 0); i = (mlib_u8 *)dpp - dp; /* prepare the source addresses */ spp2 = (mlib_d64 *)vis_alignaddr(sp2, 0); spp1 = (mlib_d64 *)vis_alignaddr(sp1, i); dend = dp + amount - 1; /* generate edge mask for the start point */ emask = vis_edge8(dp, dend); sd10 = vis_ld_d64_nf(spp1); if (emask != 0xff) { sd20 = (*spp2++); sd11 = vis_ld_d64_nf(spp1 + 1); sd10 = vis_faligndata(sd10, sd11); MLIB_V_ADDIMAGE_U8(sd10, sd20, dd); vis_pst_8(dd, dpp++, emask); sd10 = sd11; spp1++; i += 8; } #pragma pipeloop(0) for (; i <= amount - 8; i += 8) { sd20 = (*spp2++); sd11 = vis_ld_d64_nf(spp1 + 1); sd10 = vis_faligndata(sd10, sd11); MLIB_V_ADDIMAGE_U8(sd10, sd20, dd); (*dpp++) = dd; sd10 = sd11; spp1++; 
} if (i < amount) { emask = vis_edge8(dpp, dend); sd20 = (*spp2++); sd10 = vis_faligndata(sd10, vis_ld_d64_nf(spp1 + 1)); MLIB_V_ADDIMAGE_U8(sd10, sd20, dd); vis_pst_8(dd, dpp, emask); } sp1 = sl1 += stride1; sp2 = sl2 += stride2; dp = dl += strided; } } else if ((offsrc1 == offsrc2) && (((stride1 ^ stride2) & 7) == 0)) { for (j = 0; j < height; j++) { /* prepare the destination addresses */ dpp = (mlib_d64 *)vis_alignaddr(dp, 0); i = (mlib_u8 *)dpp - dp; /* prepare the source addresses */ spp1 = (mlib_d64 *)vis_alignaddr(sp1, i); spp2 = (mlib_d64 *)vis_alignaddr(sp2, i); dend = dp + amount - 1; /* generate edge mask for the start point */ emask = vis_edge8(dp, dend); sd10 = vis_ld_d64_nf(spp1); spp1++; sd20 = vis_ld_d64_nf(spp2); spp2++; MLIB_V_ADDIMAGE_U8(sd10, sd20, dd0); if (emask != 0xff) { sd10 = vis_ld_d64_nf(spp1); spp1++; sd20 = vis_ld_d64_nf(spp2); spp2++; MLIB_V_ADDIMAGE_U8(sd10, sd20, dd1); dd = vis_faligndata(dd0, dd1); vis_pst_8(dd, dpp++, emask); dd0 = dd1; i += 8; } #pragma pipeloop(0) for (; i <= amount - 8; i += 8) { sd10 = vis_ld_d64_nf(spp1); spp1++; sd20 = vis_ld_d64_nf(spp2); spp2++; MLIB_V_ADDIMAGE_U8(sd10, sd20, dd1); (*dpp++) = vis_faligndata(dd0, dd1); dd0 = dd1; } if (i < amount) { emask = vis_edge8(dpp, dend); sd10 = vis_ld_d64_nf(spp1); spp1++; sd20 = vis_ld_d64_nf(spp2); spp2++; MLIB_V_ADDIMAGE_U8(sd10, sd20, dd1); dd = vis_faligndata(dd0, dd1); vis_pst_8(dd, dpp, emask); } sp1 = sl1 += stride1; sp2 = sl2 += stride2; dp = dl += strided; } } else { /* common case */ for (j = 0; j < height; j++) { /* prepare the destination addresses */ dpp = (mlib_d64 *)vis_alignaddr(dp, 0); i = (mlib_u8 *)dpp - dp; dend = dp + amount - 1; /* generate edge mask for the start point */ emask = vis_edge8(dp, dend); if (emask != 0xff) { spp1 = (mlib_d64 *)vis_alignaddr(sp1, i); sd10 = vis_faligndata(vis_ld_d64_nf(spp1), vis_ld_d64_nf(spp1 + 1)); spp2 = (mlib_d64 *)vis_alignaddr(sp2, i); sd20 = vis_faligndata(vis_ld_d64_nf(spp2), vis_ld_d64_nf(spp2 + 1)); MLIB_V_ADDIMAGE_U8(sd10, sd20, dd); vis_pst_8(dd, dpp++, emask); i += 8; } /* copy src1 to dst */ spp1 = (mlib_d64 *)vis_alignaddr(sp1, i); sd11 = vis_ld_d64_nf(spp1); tmp_ptr = dpp; #pragma pipeloop(0) for (k = i; k <= (amount - 8); k += 8) { sd10 = sd11; sd11 = vis_ld_d64_nf(spp1 + 1); (*tmp_ptr++) = vis_faligndata(sd10, sd11); spp1++; } sd11 = vis_faligndata(sd11, vis_ld_d64_nf(spp1 + 1)); spp2 = (mlib_d64 *)vis_alignaddr(sp2, i); sd20 = vis_ld_d64_nf(spp2); tmp_ptr = dpp; #pragma pipeloop(0) for (; i <= amount - 8; i += 8) { sd10 = (*tmp_ptr++); sd21 = vis_ld_d64_nf(spp2 + 1); sd20 = vis_faligndata(sd20, sd21); MLIB_V_ADDIMAGE_U8(sd10, sd20, dd); (*dpp++) = dd; sd20 = sd21; spp2++; } if (i < amount) { emask = vis_edge8(dpp, dend); sd20 = vis_faligndata(sd20, vis_ld_d64_nf(spp2 + 1)); MLIB_V_ADDIMAGE_U8(sd11, sd20, dd); vis_pst_8(dd, dpp, emask); } sp1 = sl1 += stride1; sp2 = sl2 += stride2; dp = dl += strided; } } return (MLIB_SUCCESS); }
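/*
 * Reference sketch (not part of the original source): per-pixel scalar form
 * of the operation above, assuming MLIB_V_ADDIMAGE_U8 performs the usual
 * saturating byte addition (results clamped at 255).
 */
static mlib_u8
ref_add_u8_sat(mlib_u8 a, mlib_u8 b)
{
	mlib_s32 s = (mlib_s32)a + (mlib_s32)b;

	return ((s > 255) ? 255 : (mlib_u8)s);
}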
mlib_status __mlib_VideoColorJFIFYCC2RGB444( mlib_u8 *rgb, const mlib_u8 *y, const mlib_u8 *cb, const mlib_u8 *cr, mlib_s32 size) { mlib_u8 *dend; mlib_f32 *sf0, *sf1, *sf2, *pfd; mlib_f32 fzero = vis_fzeros(); mlib_s32 i, n, m, emask; mlib_d64 tmp_arr64[2]; mlib_d64 k01 = vis_to_double_dup(0x0000f4fd); mlib_d64 k02 = vis_to_double_dup(0x2cdde926); mlib_d64 k11 = vis_to_double_dup(0xf4fd38b4); mlib_d64 k12 = vis_to_double_dup(0xe9260000); mlib_d64 k21 = vis_to_double_dup(0x38b40000); mlib_d64 k22 = vis_to_double_dup(0x00002cdd); mlib_d64 c_0 = vis_to_double_dup(0xe9a110ff); mlib_d64 c_1 = vis_to_double_dup(0x10ffe3b6); mlib_d64 c_2 = vis_to_double_dup(0xe3b6e9a1); mlib_d64 k_0 = vis_to_double_dup(0x20002000); if (size <= 0) return (MLIB_FAILURE); vis_write_gsr((2 << 3) + 2); vis_write_bmask(0x0489AB37, 0); do { /* loop on buffer size */ if (size > 2 * BUFF_SIZE) { n = 2 * BUFF_SIZE; } else { n = size; } m = (n - 1) >> 2; sf0 = (mlib_f32 *)y; sf1 = (mlib_f32 *)cb; sf2 = (mlib_f32 *)cr; dend = rgb + 3 * n - 1; pfd = (mlib_f32 *)rgb; #pragma pipeloop(0) #pragma unroll(4) for (i = 0; i < m; i++) { mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0; mlib_d64 d_0235, d_0145; mlib_f32 x0, x1, x2; x0 = (*sf0++); x1 = (*sf1++); x2 = (*sf2++); s_0 = vis_fmul8x16(x0, k_0); s01 = vis_fmul8x16(x1, k01); s11 = vis_fmul8x16(x1, k11); s21 = vis_fmul8x16(x1, k21); s02 = vis_fmul8x16(x2, k02); s12 = vis_fmul8x16(x2, k12); s22 = vis_fmul8x16(x2, k22); s00 = vis_fpadd16(s_0, s01); s10 = vis_fpadd16(s_0, s11); s20 = vis_fpadd16(s_0, s21); s02 = vis_fpadd16(s02, c_0); s12 = vis_fpadd16(s12, c_1); s22 = vis_fpadd16(s22, c_2); s00 = vis_fpadd16(s00, s02); s10 = vis_fpadd16(s10, s12); s20 = vis_fpadd16(s20, s22); d_0235 = vis_fpack16_pair(s00, s10); s20 = vis_freg_pair(vis_fpack16(s20), fzero); d_0145 = vis_bshuffle(d_0235, s20); d_0235 = vis_fpack32(d_0235, d_0235); d_0235 = vis_fpmerge(vis_read_hi(d_0235), vis_read_lo(d_0235)); pfd[0] = vis_read_hi(d_0145); pfd[1] = vis_read_hi(d_0235); pfd[2] = vis_read_lo(d_0145); pfd += 3; } /* * last pixels */ if ((mlib_u8 *)pfd <= dend) { mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0; mlib_d64 d_0235, d_xx14, d_0145; mlib_f32 x0, x1, x2; mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64; x0 = *sf0; x1 = *sf1; x2 = *sf2; s_0 = vis_fmul8x16(x0, k_0); s01 = vis_fmul8x16(x1, k01); s11 = vis_fmul8x16(x1, k11); s21 = vis_fmul8x16(x1, k21); s02 = vis_fmul8x16(x2, k02); s12 = vis_fmul8x16(x2, k12); s22 = vis_fmul8x16(x2, k22); s00 = vis_fpadd16(s_0, s01); s10 = vis_fpadd16(s_0, s11); s20 = vis_fpadd16(s_0, s21); s02 = vis_fpadd16(s02, c_0); s12 = vis_fpadd16(s12, c_1); s22 = vis_fpadd16(s22, c_2); s00 = vis_fpadd16(s00, s02); s10 = vis_fpadd16(s10, s12); s20 = vis_fpadd16(s20, s22); d_0235 = vis_fpack16_pair(s00, s10); d_xx14 = vis_freg_pair(vis_fpack16(s20), fzero); d_0145 = vis_bshuffle(d_0235, d_xx14); d_0235 = vis_fpack32(d_0235, d_0235); d_0235 = vis_fpmerge(vis_read_hi(d_0235), vis_read_lo(d_0235)); emask = vis_edge8(pfd, dend); if ((mlib_addr)pfd & 7) { pfd--; tmp_arr32++; } tmp_arr32[0] = vis_read_hi(d_0145); tmp_arr32[1] = vis_read_hi(d_0235); tmp_arr32[2] = vis_read_lo(d_0145); vis_pst_8(tmp_arr64[0], pfd, emask); pfd += 2; emask = vis_edge8(pfd, dend); if ((mlib_u8 *)pfd <= dend) vis_pst_8(tmp_arr64[1], pfd, emask); } y += n; cb += n; cr += n; rgb += 3 * n; size -= n; } while (size); return (MLIB_SUCCESS); }
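/*
 * Reference sketch (not part of the original source): the standard JFIF
 * (full-range) YCbCr-to-RGB equations that the fixed-point constants above
 * approximate (e.g. 0x2cdd / 0x2000 = 1.4020).  The clamping is illustrative.
 */
static void
ref_jfif_ycc2rgb_pixel(mlib_u8 y, mlib_u8 cb, mlib_u8 cr, mlib_u8 *rgb)
{
	mlib_d64 r = y + 1.40200 * (cr - 128);
	mlib_d64 g = y - 0.34414 * (cb - 128) - 0.71414 * (cr - 128);
	mlib_d64 b = y + 1.77200 * (cb - 128);

	rgb[0] = (r < 0.0) ? 0 : (r > 255.0) ? 255 : (mlib_u8)r;
	rgb[1] = (g < 0.0) ? 0 : (g > 255.0) ? 255 : (mlib_u8)g;
	rgb[2] = (b < 0.0) ? 0 : (b > 255.0) ? 255 : (mlib_u8)b;
}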
mlib_status __mlib_VideoDCT8x8Quantize_S16_S16_B12_NA( mlib_s16 coeffs[64], const mlib_s16 *block, const mlib_d64 qtable[64]) { mlib_d64 *sp = (mlib_d64 *)block; mlib_d64 *dp = (mlib_d64 *)coeffs; mlib_d64 d00, d10, d20, d30, d40, d50, d60, d70; mlib_d64 d01, d11, d21, d31, d41, d51, d61, d71; mlib_d64 t00, t10, t20, t30, t40, t50, t60, t70, t80, t90; mlib_d64 t01, t11, t21, t31, t41, t51, t61, t71, t81, t91; mlib_d64 r00, r10, r20, r30, r40, r50, r60, r70; mlib_d64 r01, r11, r21, r31, r41, r51, r61, r71; mlib_f32 FCOS, c17, c26, c35, c_4; mlib_s32 mask; mlib_d64 w_const = vis_to_double_dup(0x4000); if (block == NULL || coeffs == NULL) return (MLIB_FAILURE); if (!(((mlib_addr)block | (mlib_addr)coeffs) & 7)) { return (__mlib_VideoDCT8x8Quantize_S16_S16_B12(coeffs, block, qtable)); } vis_write_gsr(1 << 3); /* * first stage */ LOAD_DATA_GE_INTER1; TRANSPOSE(d00, d20, d40, d60, r00, r10, r20, r30); TRANSPOSE(d10, d30, d50, d70, r40, r50, r60, r70); LOADCONSTS4_12; PREPARE_DATA_INTER(0); LOAD_DATA_GE_INTER2; TRANSPOSE(d01, d21, d41, d61, r01, r11, r21, r31); COMPUTING_DATA(0); TRANSPOSE(d11, d31, d51, d71, r41, r51, r61, r71); PREPARE_DATA_INTER(1); COMPUTING_DATA(1); /* * second stage */ TRANSPOSE(d01, d11, d21, d31, r40, r50, r60, r70); TRANSPOSE(d00, d10, d20, d30, r00, r10, r20, r30); PREPARE_DATA_INTER(0); TRANSPOSE(d40, d50, d60, d70, r01, r11, r21, r31); COMPUTING_DATA_12(0); TRANSPOSE(d41, d51, d61, d71, r41, r51, r61, r71); ENDSCALE_12(0); dp = (mlib_d64 *)vis_alignaddr(coeffs, -1); mask = 0xFF >> ((mlib_addr)coeffs - (mlib_addr)dp); vis_alignaddrl((void *)coeffs, 0); PREPARE_DATA_INTER(1); COMPUTING_DATA_12(1); ENDSCALE_12(1); Quant_ST_NA(d00, d00, qtable[0]); Quant_ST_NA(d01, d01, qtable[1]); Quant_ST_NA(d10, d10, qtable[2]); Quant_ST_NA(d11, d11, qtable[3]); Quant_ST_NA(d20, d20, qtable[4]); Quant_ST_NA(d21, d21, qtable[5]); Quant_ST_NA(d30, d30, qtable[6]); Quant_ST_NA(d31, d31, qtable[7]); Quant_ST_NA(d40, d40, qtable[8]); Quant_ST_NA(d41, d41, qtable[9]); Quant_ST_NA(d50, d50, qtable[10]); Quant_ST_NA(d51, d51, qtable[11]); Quant_ST_NA(d60, d60, qtable[12]); Quant_ST_NA(d61, d61, qtable[13]); Quant_ST_NA(d70, d70, qtable[14]); Quant_ST_NA(d71, d71, qtable[15]); dp[1] = vis_faligndata(d00, d01); dp[2] = vis_faligndata(d01, d10); dp[3] = vis_faligndata(d10, d11); dp[4] = vis_faligndata(d11, d20); dp[5] = vis_faligndata(d20, d21); dp[6] = vis_faligndata(d21, d30); dp[7] = vis_faligndata(d30, d31); dp[8] = vis_faligndata(d31, d40); dp[9] = vis_faligndata(d40, d41); dp[10] = vis_faligndata(d41, d50); dp[11] = vis_faligndata(d50, d51); dp[12] = vis_faligndata(d51, d60); dp[13] = vis_faligndata(d60, d61); dp[14] = vis_faligndata(d61, d70); dp[15] = vis_faligndata(d70, d71); vis_pst_8(vis_faligndata(d71, d71), dp + 16, ~mask); if ((mlib_addr)coeffs & 7) vis_pst_8(vis_faligndata(d00, d00), dp, mask); return (MLIB_SUCCESS); }
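/*
 * Reference note (not part of the original source): the alignment dispatch
 * used above - OR-ing the two addresses lets a single mask test confirm
 * that both pointers are 8-byte aligned before taking the aligned fast path.
 */
static int
ref_both_8byte_aligned(const void *a, const void *b)
{
	return ((((mlib_addr)a | (mlib_addr)b) & 7) == 0);
}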
void mlib_v_VideoColorYUV2RGB444_all_align( mlib_u8 *rgb, const mlib_u8 *y, const mlib_u8 *u, const mlib_u8 *v, mlib_s32 size) { mlib_u8 *dend; mlib_f32 *sf0, *sf1, *sf2, *pfd, fzero = vis_fzeros(); mlib_s32 i, n, m, emask; mlib_d64 *buff2, pbuff_arr2[BUFF_SIZE + 4]; mlib_d64 tmp_arr64[2]; mlib_d64 k01 = vis_to_double_dup(0x0000f375); mlib_d64 k02 = vis_to_double_dup(0x3317e5fa); mlib_d64 k11 = vis_to_double_dup(0xf3754097); mlib_d64 k12 = vis_to_double_dup(0xe5fa0000); mlib_d64 k21 = vis_to_double_dup(0x40970000); mlib_d64 k22 = vis_to_double_dup(0x00003317); mlib_d64 c_0 = vis_to_double_dup(0xe42010f4); mlib_d64 c_1 = vis_to_double_dup(0x10f4dd60); mlib_d64 c_2 = vis_to_double_dup(0xdd60e420); mlib_d64 k_0 = vis_to_double_dup(0x25432543); do { /* loop on buffer size */ if (size > 2 * BUFF_SIZE) { n = 2 * BUFF_SIZE; } else { n = size; } m = n >> 2; buff2 = pbuff_arr2; sf0 = (mlib_f32 *)y; sf1 = (mlib_f32 *)u; sf2 = (mlib_f32 *)v; dend = rgb + 3 * n - 1; pfd = (mlib_f32 *)rgb; #pragma pipeloop(0) for (i = 0; i < m; i++) { mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0; mlib_f32 x0, x1, x2; mlib_d64 d_0235, d_xx14, d_23xx, d_0145; x0 = (*sf0++); x1 = (*sf1++); x2 = (*sf2++); s_0 = vis_fmul8x16(x0, k_0); s01 = vis_fmul8x16(x1, k01); s11 = vis_fmul8x16(x1, k11); s21 = vis_fmul8x16(x1, k21); s02 = vis_fmul8x16(x2, k02); s12 = vis_fmul8x16(x2, k12); s22 = vis_fmul8x16(x2, k22); s00 = vis_fpadd16(s_0, s01); s10 = vis_fpadd16(s_0, s11); s20 = vis_fpadd16(s_0, s21); s02 = vis_fpadd16(s02, c_0); s12 = vis_fpadd16(s12, c_1); s22 = vis_fpadd16(s22, c_2); s00 = vis_fpadd16(s00, s02); s10 = vis_fpadd16(s10, s12); s20 = vis_fpadd16(s20, s22); d_0235 = vis_fpmerge(vis_fpack16(s00), vis_fpack16(s10)); d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20)); /* * merge buff values to 3-channel array */ d_23xx = vis_faligndata(d_0235, d_0235); d_0145 = vis_bshuffle(d_0235, d_xx14); pfd[0] = vis_read_hi(d_0145); pfd[1] = vis_read_hi(d_23xx); pfd[2] = vis_read_lo(d_0145); buff2 += 2; pfd += 3; } if ((mlib_u8 *)pfd <= dend) { mlib_d64 d_0235, d_xx14, d_23xx, d_0145; mlib_f32 *tmp_arr32 = (mlib_f32 *)tmp_arr64; mlib_d64 s00, s01, s02, s10, s11, s12, s20, s21, s22, s_0; mlib_f32 x0, x1, x2; x0 = (*sf0++); x1 = (*sf1++); x2 = (*sf2++); s_0 = vis_fmul8x16(x0, k_0); s01 = vis_fmul8x16(x1, k01); s11 = vis_fmul8x16(x1, k11); s21 = vis_fmul8x16(x1, k21); s02 = vis_fmul8x16(x2, k02); s12 = vis_fmul8x16(x2, k12); s22 = vis_fmul8x16(x2, k22); s00 = vis_fpadd16(s_0, s01); s10 = vis_fpadd16(s_0, s11); s20 = vis_fpadd16(s_0, s21); s02 = vis_fpadd16(s02, c_0); s12 = vis_fpadd16(s12, c_1); s22 = vis_fpadd16(s22, c_2); s00 = vis_fpadd16(s00, s02); s10 = vis_fpadd16(s10, s12); s20 = vis_fpadd16(s20, s22); d_0235 = vis_fpmerge(vis_fpack16(s00), vis_fpack16(s10)); d_xx14 = vis_freg_pair(fzero, vis_fpack16(s20)); d_23xx = vis_faligndata(d_0235, d_0235); d_0145 = vis_bshuffle(d_0235, d_xx14); emask = vis_edge8(pfd, dend); if ((mlib_addr)pfd & 7) { pfd--; tmp_arr32++; } tmp_arr32[0] = vis_read_hi(d_0145); tmp_arr32[1] = vis_read_hi(d_23xx); tmp_arr32[2] = vis_read_lo(d_0145); vis_pst_8(tmp_arr64[0], pfd, emask); pfd += 2; emask = vis_edge8(pfd, dend); if ((mlib_u8 *)pfd <= dend) vis_pst_8(tmp_arr64[1], pfd, emask); } y += n; u += n; v += n; rgb += 3 * n; size -= n; } while (size); }
void mlib_v_ImageSqrShift_U8( mlib_u8 *src, mlib_s32 slb, mlib_u8 *dst, mlib_s32 dlb, mlib_s32 xsize, mlib_s32 ysize) { /* pointer to a line in source image */ mlib_u8 *sl; /* 8-byte aligned pointer to source image */ mlib_d64 *sp; /* pointer to a line in destination image */ mlib_u8 *dl; /* pointer to end of a line in destination image */ mlib_u8 *dend; /* 8-byte aligned pointer to destination image */ mlib_d64 *dp; /* offset of address alignment in destination */ mlib_s32 off; /* edge masks */ mlib_s32 emask; /* source data */ mlib_d64 s0, s1; /* source data */ mlib_d64 sd; /* destination data */ mlib_d64 dd; /* temporaries used in macro */ mlib_d64 sdh, sdl; /* temporaries used in macro */ mlib_d64 rdh, rdl; /* loop variable */ mlib_s32 i, j, n; sl = src; dl = dst; /* row loop */ for (j = 0; j < ysize; j++) { /* prepare the destination address */ dp = (mlib_d64 *)((mlib_addr)dl & (~7)); off = (mlib_addr)dp - (mlib_addr)dl; dend = dl + xsize - 1; /* prepare the source address */ sp = (mlib_d64 *)vis_alignaddr(sl, off); /* generate edge mask for the start point */ emask = vis_edge8(dl, dend); /* first 8 pixels */ s0 = vis_ld_d64_nf(sp); sp++; s1 = vis_ld_d64_nf(sp); sp++; sd = vis_faligndata(s0, s1); MLIB_V_IMAGESQRSHIFT_U8(sd, dd); vis_pst_8(dd, dp++, emask); n = ((mlib_u8 *)(dend + 1) - (mlib_u8 *)dp) / 8; /* 8-pixel column loop */ #pragma pipeloop(0) for (i = 0; i < n; i++) { s0 = s1; s1 = vis_ld_d64_nf(sp); sp++; sd = vis_faligndata(s0, s1); MLIB_V_IMAGESQRSHIFT_U8(sd, dd); (*dp++) = dd; } /* end point handling */ if ((mlib_addr)dp <= (mlib_addr)dend) { emask = vis_edge8(dp, dend); s0 = s1; s1 = vis_ld_d64_nf(sp); sp++; sd = vis_faligndata(s0, s1); MLIB_V_IMAGESQRSHIFT_U8(sd, dd); vis_pst_8(dd, dp++, emask); } sl += slb; dl += dlb; } }
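/*
 * Reference sketch (not part of the original source): assumed per-pixel form
 * of the operation above, taking MLIB_V_IMAGESQRSHIFT_U8 to implement the
 * usual SqrShift semantics - square the source value, shift right by the
 * caller-selected amount, clamp to the U8 range.  "shift" stands in for the
 * scale the caller loads into the GSR before invoking this row routine.
 */
static mlib_u8
ref_sqrshift_u8(mlib_u8 s, mlib_s32 shift)
{
	mlib_s32 r = ((mlib_s32)s * (mlib_s32)s) >> shift;

	return ((r > 255) ? 255 : (mlib_u8)r);
}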
mlib_status __mlib_VectorNorm_S8_Sat( mlib_d64 *z, const mlib_s8 *x, mlib_s32 n) { mlib_s8 *pxend, *px = (mlib_s8 *)x; mlib_d64 *dpx, *dpxend; mlib_d64 sum = 0.0; mlib_d64 dx, dr1, dr2, dr3, dr4, dr5, dr6, dr7, dr8; mlib_d64 ds1, ds2; mlib_d64 edge[2]; mlib_d64 fzero = vis_fzero(); mlib_f32 f4ones = vis_to_float(0x01010101); mlib_f32 fsum; mlib_s32 d_left; mlib_s32 emask; if (n <= 0) return (MLIB_FAILURE); edge[0] = edge[1] = 0; dpx = (mlib_d64 *)((mlib_addr)px & (~7)); pxend = px + n - 1; dpxend = (mlib_d64 *)((mlib_addr)pxend & (~7)); emask = vis_edge8(px, pxend); vis_pst_8(dpx[0], edge, emask); dx = edge[0]; while ((mlib_addr)dpx < (mlib_addr)dpxend) { d_left = dpxend - dpx; if (d_left > MAX_LOOP) d_left = MAX_LOOP; ds1 = ds2 = 0.0; for (; d_left > 0; d_left--) { NORM_S8; SUM_S8; dpx++; dx = dpx[0]; } fsum = vis_read_hi(ds1); sum += (mlib_d64)*((mlib_s32 *)&fsum); fsum = vis_read_lo(ds1); sum += (mlib_d64)*((mlib_s32 *)&fsum); fsum = vis_read_hi(ds2); sum += (mlib_d64)*((mlib_s32 *)&fsum); fsum = vis_read_lo(ds2); sum += (mlib_d64)*((mlib_s32 *)&fsum); } if ((mlib_addr)dpx <= (mlib_addr)pxend) { emask = vis_edge8(dpx, pxend); vis_pst_8(dx, edge + 1, emask); dx = edge[1]; NORM_S8; ds1 = vis_fpadd32(dr5, dr6); ds2 = vis_fpadd32(dr7, dr8); fsum = vis_read_hi(ds1); sum += (mlib_d64)*((mlib_s32 *)&fsum); fsum = vis_read_lo(ds1); sum += (mlib_d64)*((mlib_s32 *)&fsum); fsum = vis_read_hi(ds2); sum += (mlib_d64)*((mlib_s32 *)&fsum); fsum = vis_read_lo(ds2); sum += (mlib_d64)*((mlib_s32 *)&fsum); } z[0] = mlib_sqrt(sum / 256.0); return (MLIB_SUCCESS); #undef MAX_LOOP }
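/*
 * Reference sketch (not part of the original source): scalar form of the
 * Euclidean norm accumulated above.  The VIS code divides its accumulator
 * by 256.0 to undo the fixed-point scaling of its partial products; the
 * plain scalar form below needs no such correction.
 */
static void
ref_norm_s8(mlib_d64 *z, const mlib_s8 *x, mlib_s32 n)
{
	mlib_d64 sum = 0.0;
	mlib_s32 i;

	for (i = 0; i < n; i++)
		sum += (mlib_d64)x[i] * (mlib_d64)x[i];

	z[0] = mlib_sqrt(sum);
}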
mlib_status __mlib_VectorSubS_U8_U8_Sat( mlib_u8 *z, const mlib_u8 *x, const mlib_u8 *c, mlib_s32 n) { /* edge masks */ mlib_s32 emask; /* offset of address alignment in destination */ mlib_s32 off; mlib_u8 *pzend; mlib_d64 *dpx, *dpz, *dpzend; mlib_d64 dx, dx0, dx1, dr0, dr1, dr; mlib_u16 cc = *((mlib_u8 *)c); /* prepare the scaling factors */ mlib_d64 dc = vis_to_double_dup((cc << 4) | (cc << 20)); if (n <= 0) return (MLIB_FAILURE); /* initialize GSR scale factor */ vis_write_gsr(3 << 3); pzend = (mlib_u8 *)z + n - 1; dpzend = (mlib_d64 *)((mlib_addr)pzend & (~7)); dpz = (mlib_d64 *)((mlib_addr)z & (~7)); off = (mlib_addr)dpz - (mlib_addr)z; /* * prepare the source address */ dpx = (mlib_d64 *)vis_alignaddr((void *)x, off); dx0 = vis_ld_d64_nf(dpx); dpx++; dx1 = vis_ld_d64_nf(dpx); dpx++; /* * generate edge mask for the start bytes */ emask = vis_edge8(z, pzend); dx = vis_faligndata(dx0, dx1); SUBS_U8_SAT; /* store first bytes of result */ vis_pst_8(dr, dpz, emask); dpz++; dx0 = dx1; #pragma pipeloop(0) for (; (mlib_addr)dpz < (mlib_addr)dpzend; ) { dx1 = vis_ld_d64_nf(dpx); dpx++; dx = vis_faligndata(dx0, dx1); SUBS_U8_SAT; (*dpz++) = dr; dx0 = dx1; } if ((mlib_addr)dpz <= (mlib_addr)pzend) { dx1 = vis_ld_d64_nf(dpx); dpx++; dx = vis_faligndata(dx0, dx1); SUBS_U8_SAT; /* prepare edge mask for the last bytes */ emask = vis_edge8(dpz, pzend); /* store last bytes of result */ vis_pst_8(dr, dpz, emask); } return (MLIB_SUCCESS); }
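/*
 * Reference sketch (not part of the original source): a scalar emulation of
 * the edge-mask store pattern used by this routine (and by most functions in
 * this file).  The destination is aligned down to an 8-byte boundary,
 * vis_edge8() yields a byte mask covering only the bytes inside [z, pzend],
 * and vis_pst_8() writes just those bytes, so partial head and tail words
 * are stored without touching neighbouring memory.  Bit 7 of the mask
 * selects the lowest-addressed byte of the 8-byte word (big-endian SPARC).
 */
static void
ref_partial_store_8(mlib_u8 *aligned_dst, const mlib_u8 *src8, mlib_s32 emask)
{
	mlib_s32 k;

	for (k = 0; k < 8; k++) {
		if (emask & (0x80 >> k))
			aligned_dst[k] = src8[k];
	}
}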