Exemplo n.º 1
0
/*
 * Copy one line of bytes from src to dst, XOR-ing the data with a 64-bit
 * constant built from 0x80008000 on the way.
 *
 *   src   source bytes; may have any alignment
 *   dst   destination bytes; may have any alignment
 *   size  number of bytes to process
 *
 * All stores are made through 8-byte-aligned destination words.  The first
 * and last words are written with vis_pst_8 under a mask from vis_edge8 so
 * that (per the VIS partial-store semantics) bytes outside
 * [dst, dst + size - 1] are left alone.  The source is re-aligned to the
 * destination word grid with VIS_ALIGNADDR / vis_faligndata.
 *
 * NOTE(review): the constant is applied to aligned destination words, so the
 * 0x8000 pattern lines up with 16-bit samples only when dst is 2-byte
 * aligned -- confirm against callers.
 */
void
mlib_ImageLineXor8000(
    const mlib_u8 *src,
    mlib_u8 *dst,
    mlib_s32 size)
{
	mlib_d64 flip = vis_to_double_dup(0x80008000);
	mlib_d64 *out;
	mlib_d64 *in;
	mlib_d64 cur, next;
	mlib_u8 *last;
	mlib_s32 off;
	mlib_s32 edge;

/* aligned destination word, and dst's (non-positive) offset from it */
	out = (mlib_d64 *)((mlib_addr)dst & (~7));
	off = -(mlib_s32)((mlib_addr)dst & 7);
	last = (mlib_u8 *)dst + size - 1;

/* aligned source word matching the destination word grid */
	in = (mlib_d64 *)VIS_ALIGNADDR(src, off);
/* byte mask for the first destination word */
	edge = vis_edge8(dst, last);

	next = vis_ld_d64_nf(in);

/* leading partial word: only taken when the first mask is not full */
	if (edge != 0xff) {
		cur = next;
		in++;
		next = vis_ld_d64_nf(in);
		cur = vis_fxor(vis_faligndata(cur, next), flip);
		vis_pst_8(cur, out, edge);
		out++;
		off += 8;
	}

/* full words whose look-ahead source word is read with a plain load */
#pragma pipeloop(0)
	for (; off <= (size - 16); off += 8) {
		cur = next;
		in++;
		next = in[0];
		out[0] = vis_fxor(vis_faligndata(cur, next), flip);
		out++;
	}

/* last full word: its look-ahead goes through vis_ld_d64_nf instead */
	if (off <= (size - 8)) {
		cur = next;
		in++;
		next = vis_ld_d64_nf(in);
		out[0] = vis_fxor(vis_faligndata(cur, next), flip);
		out++;
		off += 8;
	}

/* nothing left over */
	if (off >= size)
		return;

/* trailing partial word */
	cur = vis_fxor(vis_faligndata(next, vis_ld_d64_nf(in + 1)), flip);
	edge = vis_edge8(out, last);
	vis_pst_8(cur, out, edge);
}
/*
 * Interleave two 16-bit channels into a two-channel destination.
 *
 * For each of the `height` rows, dst receives
 *     src0[0], src1[0], src0[1], src1[1], ... (2 * width samples).
 *
 *   dst_s16_0                  first destination sample of the first row
 *   src_s16_0, src_s16_1       first sample of each source channel
 *   height, width              rows, and samples per source row
 *   dst_stride, src*_stride    row pitch; added directly to the mlib_s16
 *                              pointers, i.e. counted in 16-bit elements
 *
 * Always returns MLIB_SUCCESS.
 *
 * Each row is handled as: a scalar prologue that brings dst to an 8-byte
 * boundary, one of three vector loops chosen by source alignment, a 32-bit
 * scalar loop for leftover pairs and a final single sample if one remains.
 */
mlib_status
mlib_ImageChannelMerge2_S16(
    mlib_s16 *dst_s16_0,
    const mlib_s16 *src_s16_0,
    const mlib_s16 *src_s16_1,
    mlib_s32 height,
    mlib_s32 width,
    mlib_s32 dst_stride,
    mlib_s32 src0_stride,
    mlib_s32 src1_stride)
{
	mlib_s32 i, j, k, n = width * 2;
	mlib_u32 *dp, s0, s1;
	mlib_s16 *f_ptr, *s_ptr;
	mlib_d64 *sd0_ptr, *sd1_ptr;
	mlib_d64 sd0, sd1, sd2, sd3, s0h, s1h, s0l, s1l;
/*
 * Byte-shuffle masks.  Assuming the usual VIS2 bshuffle convention (each
 * hex digit picks one byte out of the 16 bytes of the two operands):
 *   bm0 -> a0 b0 a1 b1,   bm1 -> a2 b2 a3 b3
 * where a = first operand, b = second operand, in 16-bit samples.
 */
	mlib_u32 bm0 = 0x018923ab;
	mlib_u32 bm1 = 0x45cd67ef;
	mlib_d64 dd0, dd1, dd2, dd3;

/*
 * Empty rows: nothing to merge.  Without this guard the prologue below
 * would still read src_s16_0[0] and store dst_s16_0[0] whenever dst is not
 * 4-byte aligned.
 */
	if (n <= 0)
		return (MLIB_SUCCESS);

	for (j = 0; j < height; j++) {
		i = 0;

/*
 * dst not 4-byte aligned: emit one sample of channel 0 on its own.  From
 * here on the pair order is (src1, src0 + 1), so the roles of "first" and
 * "second" source are swapped for the rest of the row.
 */
		if ((mlib_addr)dst_s16_0 & 3) {
			dst_s16_0[0] = src_s16_0[0];
			f_ptr = (mlib_s16 *)(src_s16_1);
			s_ptr = (mlib_s16 *)(src_s16_0 + 1);
			i++;
		} else {
			f_ptr = (mlib_s16 *)src_s16_0;
			s_ptr = (mlib_s16 *)src_s16_1;
		}

/* one more pair if dst is 4- but not 8-byte aligned */
		if (((mlib_addr)(dst_s16_0 + i) & 7) && (i < (n - 1))) {
			dst_s16_0[i + 0] = (*f_ptr++);
			dst_s16_0[i + 1] = (*s_ptr++);
			i += 2;
		}

		dp = (mlib_u32 *)(dst_s16_0 + i);

		if ((((mlib_addr)f_ptr & 7) == 0) &&
		    (((mlib_addr)s_ptr & 7) == 0)) {

/* both sources 8-byte aligned: 16 output samples per iteration */
#pragma pipeloop(0)
			for (; i < (n - 15); i += 16) {

				sd0 = ((mlib_d64 *)f_ptr)[0];
				sd1 = ((mlib_d64 *)s_ptr)[0];
				sd2 = ((mlib_d64 *)f_ptr)[1];
				sd3 = ((mlib_d64 *)s_ptr)[1];

				vis_write_bmask(bm0, 0);
				dd0 = vis_bshuffle(sd0, sd1);
				dd2 = vis_bshuffle(sd2, sd3);

				vis_write_bmask(bm1, 0);
				dd1 = vis_bshuffle(sd0, sd1);
				dd3 = vis_bshuffle(sd2, sd3);

				((mlib_d64 *)dp)[0] = dd0;
				((mlib_d64 *)dp)[1] = dd1;
				((mlib_d64 *)dp)[2] = dd2;
				((mlib_d64 *)dp)[3] = dd3;

				f_ptr += 8;
				s_ptr += 8;
/* dp counts 32-bit words: 8 words == the 4 doublewords just stored */
				dp += 8;
			}
		} else if (((mlib_addr)f_ptr & 7) == ((mlib_addr)s_ptr & 7)) {

/*
 * Both sources share the same (non-zero) misalignment, so a single
 * alignment setting serves both vis_faligndata calls in the loop.
 */
			sd0_ptr = VIS_ALIGNADDR(f_ptr, 0);
			sd1_ptr = VIS_ALIGNADDR(s_ptr, 0);

			s0h = (*sd0_ptr++);
			s1h = (*sd1_ptr++);

#pragma pipeloop(0)
			for (; i < (n - 7); i += 8) {

				s0l = (*sd0_ptr++);
				s1l = (*sd1_ptr++);
				sd0 = vis_faligndata(s0h, s0l);
				sd1 = vis_faligndata(s1h, s1l);

				vis_write_bmask(bm0, 0);
				dd0 = vis_bshuffle(sd0, sd1);

				vis_write_bmask(bm1, 0);
				dd1 = vis_bshuffle(sd0, sd1);

				((mlib_d64 *)dp)[0] = dd0;
				((mlib_d64 *)dp)[1] = dd1;

				s0h = s0l;
				s1h = s1l;

				f_ptr += 4;
				s_ptr += 4;
				dp += 4;
			}
		} else {

/*
 * Sources have different misalignments: the alignment setting is
 * re-established before each vis_faligndata.  f_ptr / s_ptr are not moved
 * inside the loop (their low address bits are all VIS_ALIGNADDR needs);
 * they are advanced in one step afterwards using the iteration count k.
 */
			sd0_ptr = VIS_ALIGNADDR(f_ptr, 0);
			sd1_ptr = VIS_ALIGNADDR(s_ptr, 0);

			s0h = vis_ld_d64_nf(sd0_ptr); sd0_ptr++;
			s1h = vis_ld_d64_nf(sd1_ptr); sd1_ptr++;

#pragma pipeloop(0)
			for (k = 0; i < (n - 7); i += 8, k++) {

				VIS_ALIGNADDR(f_ptr, 0);
				s0l = vis_ld_d64_nf(sd0_ptr); sd0_ptr++;
				sd0 = vis_faligndata(s0h, s0l);

				VIS_ALIGNADDR(s_ptr, 0);
				s1l = vis_ld_d64_nf(sd1_ptr); sd1_ptr++;
				sd1 = vis_faligndata(s1h, s1l);

				vis_write_bmask(bm0, 0);
				dd0 = vis_bshuffle(sd0, sd1);

				vis_write_bmask(bm1, 0);
				dd1 = vis_bshuffle(sd0, sd1);

				((mlib_d64 *)dp)[0] = dd0;
				((mlib_d64 *)dp)[1] = dd1;

				s0h = s0l;
				s1h = s1l;

				dp += 4;
			}

/* 4 samples consumed from each source per iteration */
			f_ptr += (k << 2);
			s_ptr += (k << 2);
		}

/*
 * Leftover pairs, one 32-bit store each.  The first sample goes into the
 * upper 16 bits, which puts it first in memory only on a big-endian target.
 */
		for (; i < (n - 1); i += 2) {
			s0 = (mlib_u16)((*f_ptr++));
			s1 = (mlib_u16)((*s_ptr++));
			(*dp++) = (s0 << 16) + s1;
		}

/* odd sample left when the row started with the single-sample prologue */
		if (i < n)
			dst_s16_0[i] = *f_ptr;

		src_s16_0 += src0_stride;
		src_s16_1 += src1_stride;
		dst_s16_0 += dst_stride;
	}
	return (MLIB_SUCCESS);
}
/*
 * Interleave three 16-bit channels into a three-channel destination.
 *
 * For each of the `height` rows, dst receives
 *     src0[k], src1[k], src2[k]   for k = 0 .. width - 1.
 *
 *   dst_s16_0                  first destination sample of the first row
 *   src_s16_0 .. src_s16_2     first sample of each source channel
 *   height, width              rows, and samples per source row
 *   dst_stride, src*_stride    row pitch; added directly to the mlib_s16
 *                              pointers, i.e. counted in 16-bit elements
 *
 * Always returns MLIB_SUCCESS.
 */
mlib_status
mlib_ImageChannelMerge3_S16(
    mlib_s16 *dst_s16_0,
    const mlib_s16 *src_s16_0,
    const mlib_s16 *src_s16_1,
    const mlib_s16 *src_s16_2,
    mlib_s32 height,
    mlib_s32 width,
    mlib_s32 dst_stride,
    mlib_s32 src0_stride,
    mlib_s32 src1_stride,
    mlib_s32 src2_stride)
{
	mlib_s32 i, j, k, n = width * 3;
	mlib_f32 *dp;
/*
 * Byte-shuffle masks, used in pairs.  Assuming the usual VIS2 bshuffle
 * convention (each hex digit picks one byte out of the 16 bytes of the two
 * operands), with a/b/c = samples of src0/src1/src2 and "--" a placeholder
 * that the second mask of the pair overwrites or drops:
 *   bm0 (a,b)   -> a0 b0 -- a1     bm1 (xx0,c) -> a0 b0 c0 a1
 *   bm2 (a,b)   -> b1 -- a2 b2     bm3 (xx1,c) -> b1 c1 a2 b2
 *   bm4 (a,b)   -> -- a3 b3 --     bm5 (xx2,c) -> c2 a3 b3 c3
 */
	mlib_u32 bm0 = 0x0189ff23;
	mlib_u32 bm2 = 0xabff45cd;
	mlib_u32 bm4 = 0xff67efff;
	mlib_u32 bm1 = 0x01238967;
	mlib_u32 bm3 = 0x01ab4567;
	mlib_u32 bm5 = 0xcd2345ef;
	mlib_d64 sd0, sd1, sd2;
	mlib_d64 dd0, xx0, dd1, xx1;
	mlib_d64 dd2, xx2;

/* NOTE(review): why this GSR value is needed is not visible here -- confirm */
	vis_write_gsr(8 << 3);

	for (j = 0; j < height; j++) {

		i = 0;
		k = 0;

/* scalar triples until dst reaches an 8-byte boundary (or the row ends) */
		for (; (i < (n - 2)) && ((mlib_addr)(dst_s16_0 + i) & 7);
		    i += 3, k++) {
			dst_s16_0[i + 0] = src_s16_0[k];
			dst_s16_0[i + 1] = src_s16_1[k];
			dst_s16_0[i + 2] = src_s16_2[k];
		}

		dp = (mlib_f32 *)(dst_s16_0 + i);

/* at least one source is not 8-byte aligned at element k */
		if (((mlib_addr)(src_s16_0 + k) & 7) ||
		    ((mlib_addr)(src_s16_1 + k) & 7) ||
		    ((mlib_addr)(src_s16_2 + k) & 7)) {

			mlib_d64 s0h, s0l, s1h, s1l, s2h, s2l;
			mlib_d64 *sp0;
			mlib_d64 *sp1;
			mlib_d64 *sp2;

/* prime each stream with its first aligned doubleword */
			sp0 = VIS_ALIGNADDR((src_s16_0 + k), 0);
			s0h = vis_ld_d64_nf(sp0); sp0++;
			sp1 = VIS_ALIGNADDR((src_s16_1 + k), 0);
			s1h = vis_ld_d64_nf(sp1); sp1++;
			sp2 = VIS_ALIGNADDR((src_s16_2 + k), 0);
			s2h = vis_ld_d64_nf(sp2); sp2++;

/* 4 samples per channel -> 12 output samples (3 doublewords) per pass */
#pragma pipeloop(0)
			for (; i < (n - 11); i += 12, k += 4) {
				s0l = vis_ld_d64_nf(sp0); sp0++;
				s1l = vis_ld_d64_nf(sp1); sp1++;
				s2l = vis_ld_d64_nf(sp2); sp2++;

/*
 * The three sources may be misaligned differently, so the alignment
 * setting is re-established before each vis_faligndata.
 */
				VIS_ALIGNADDR((src_s16_0 + k), 0);
				sd0 = vis_faligndata(s0h, s0l);
				VIS_ALIGNADDR((src_s16_1 + k), 0);
				sd1 = vis_faligndata(s1h, s1l);
				VIS_ALIGNADDR((src_s16_2 + k), 0);
				sd2 = vis_faligndata(s2h, s2l);

/* two shuffles per output doubleword: merge a/b, then splice in c */
				vis_write_bmask(bm0, 0);
				xx0 = vis_bshuffle(sd0, sd1);
				vis_write_bmask(bm1, 0);
				dd0 = vis_bshuffle(xx0, sd2);
				vis_write_bmask(bm2, 0);
				xx1 = vis_bshuffle(sd0, sd1);
				vis_write_bmask(bm3, 0);
				dd1 = vis_bshuffle(xx1, sd2);
				vis_write_bmask(bm4, 0);
				xx2 = vis_bshuffle(sd0, sd1);
				vis_write_bmask(bm5, 0);
				dd2 = vis_bshuffle(xx2, sd2);

				((mlib_d64 *)dp)[0] = dd0;
				((mlib_d64 *)dp)[1] = dd1;
				((mlib_d64 *)dp)[2] = dd2;

/* dp counts 32-bit words: 6 words == the 3 doublewords just stored */
				dp += 6;
				s0h = s0l;
				s1h = s1l;
				s2h = s2l;
			}
		} else {

/* all three sources 8-byte aligned: direct doubleword loads */
#pragma pipeloop(0)
			for (; i < (n - 11); i += 12, k += 4) {
				sd0 = *((mlib_d64 *)(src_s16_0 + k));
				sd1 = *((mlib_d64 *)(src_s16_1 + k));
				sd2 = *((mlib_d64 *)(src_s16_2 + k));

				vis_write_bmask(bm0, 0);
				xx0 = vis_bshuffle(sd0, sd1);
				vis_write_bmask(bm1, 0);
				dd0 = vis_bshuffle(xx0, sd2);
				vis_write_bmask(bm2, 0);
				xx1 = vis_bshuffle(sd0, sd1);
				vis_write_bmask(bm3, 0);
				dd1 = vis_bshuffle(xx1, sd2);
				vis_write_bmask(bm4, 0);
				xx2 = vis_bshuffle(sd0, sd1);
				vis_write_bmask(bm5, 0);
				dd2 = vis_bshuffle(xx2, sd2);

				((mlib_d64 *)dp)[0] = dd0;
				((mlib_d64 *)dp)[1] = dd1;
				((mlib_d64 *)dp)[2] = dd2;

				dp += 6;
			}
		}

/* scalar triples for whatever the vector loop left over */
		for (; i < (n - 2); i += 3, k++) {
			dst_s16_0[i + 0] = src_s16_0[k];
			dst_s16_0[i + 1] = src_s16_1[k];
			dst_s16_0[i + 2] = src_s16_2[k];
		}

		dst_s16_0 += dst_stride;
		src_s16_0 += src0_stride;
		src_s16_1 += src1_stride;
		src_s16_2 += src2_stride;
	}
	return (MLIB_SUCCESS);
}
/*
 * Interleave four 16-bit channels into a four-channel destination.
 *
 * For each of the `height` rows, dst receives
 *     src0[k], src1[k], src2[k], src3[k]   for k = 0 .. width - 1.
 *
 *   dst_s16_0                  first destination sample of the first row
 *   src_s16_0 .. src_s16_3     first sample of each source channel
 *   height, width              rows, and samples per source row
 *   dst_stride, src*_stride    row pitch; added directly to the mlib_s16
 *                              pointers, i.e. counted in 16-bit elements
 *
 * Always returns MLIB_SUCCESS.
 *
 * Each row is handled as: a scalar prologue of 0..3 samples that brings dst
 * to an 8-byte boundary, a vector loop (aligned or unaligned sources), a
 * scalar loop over groups of four and a 1..3 sample epilogue.  After the
 * prologue the channel order is rotated, which is what fi/se/th/fo_ptr
 * capture.
 */
mlib_status
mlib_ImageChannelMerge4_S16(
    mlib_s16 *dst_s16_0,
    const mlib_s16 *src_s16_0,
    const mlib_s16 *src_s16_1,
    const mlib_s16 *src_s16_2,
    const mlib_s16 *src_s16_3,
    mlib_s32 height,
    mlib_s32 width,
    mlib_s32 dst_stride,
    mlib_s32 src0_stride,
    mlib_s32 src1_stride,
    mlib_s32 src2_stride,
    mlib_s32 src3_stride)
{
	mlib_s32 i, j, n = width << 2;
	mlib_s16 *fi_ptr, *se_ptr, *th_ptr, *fo_ptr;
	mlib_d64 *dp;

/*
 * Empty rows: nothing to merge.  Without this guard the alignment prologue
 * below would still read and store up to three samples whenever dst is not
 * 8-byte aligned.
 */
	if (n <= 0)
		return (MLIB_SUCCESS);

	for (j = 0; j < height; j++) {

		i = 0;

/* emit 0..3 leading samples until dst + i is 8-byte aligned */
		if ((mlib_addr)(dst_s16_0 + i) & 7) {
			dst_s16_0[i++] = src_s16_0[0];

			if ((mlib_addr)(dst_s16_0 + i) & 7) {
				dst_s16_0[i++] = src_s16_1[0];

				if ((mlib_addr)(dst_s16_0 + i) & 7) {
					dst_s16_0[i++] = src_s16_2[0];
				}
			}
		}

/*
 * Rotate the channel cursors so that fi_ptr always feeds dst[i], se_ptr
 * dst[i + 1], and so on.  Channels already consumed by the prologue start
 * one sample further in.
 */
		if (i == 0) {
			fi_ptr = (mlib_s16 *)src_s16_0;
			se_ptr = (mlib_s16 *)src_s16_1;
			th_ptr = (mlib_s16 *)src_s16_2;
			fo_ptr = (mlib_s16 *)src_s16_3;
		} else if (i == 1) {
			fi_ptr = (mlib_s16 *)src_s16_1;
			se_ptr = (mlib_s16 *)src_s16_2;
			th_ptr = (mlib_s16 *)src_s16_3;
			fo_ptr = (mlib_s16 *)(src_s16_0 + 1);
		} else if (i == 2) {
			fi_ptr = (mlib_s16 *)src_s16_2;
			se_ptr = (mlib_s16 *)src_s16_3;
			th_ptr = (mlib_s16 *)(src_s16_0 + 1);
			fo_ptr = (mlib_s16 *)(src_s16_1 + 1);
		} else {
/* i == 3: the prologue increments i at most three times */
			fi_ptr = (mlib_s16 *)src_s16_3;
			se_ptr = (mlib_s16 *)(src_s16_0 + 1);
			th_ptr = (mlib_s16 *)(src_s16_1 + 1);
			fo_ptr = (mlib_s16 *)(src_s16_2 + 1);
		}

		dp = (mlib_d64 *)(dst_s16_0 + i);

/* vector path only when more than one full 16-sample group remains */
		if ((n - i) > 16) {
			if (((mlib_addr)fi_ptr & 7) ||
			    ((mlib_addr)se_ptr & 7) ||
			    ((mlib_addr)th_ptr & 7) ||
			    ((mlib_addr)fo_ptr & 7)) {

/* at least one cursor is not 8-byte aligned */
				mlib_d64 sd0, sd1, sd2, sd3;
				mlib_d64 dd0, dd1, dd2, dd3, dr02, dr13;
				mlib_d64 s0h, s0l, s1h, s1l, s2h, s2l, s3h, s3l;
				mlib_d64 *sp0;
				mlib_d64 *sp1;
				mlib_d64 *sp2;
				mlib_d64 *sp3;

/* prime each stream with its first aligned doubleword */
				sp0 = VIS_ALIGNADDR(fi_ptr, 0);
				s0h = (*sp0++);
				sp1 = VIS_ALIGNADDR(se_ptr, 0);
				s1h = (*sp1++);
				sp2 = VIS_ALIGNADDR(th_ptr, 0);
				s2h = (*sp2++);
				sp3 = VIS_ALIGNADDR(fo_ptr, 0);
				s3h = (*sp3++);

#pragma pipeloop(0)
				for (; i < (n - 15); i += 16) {
					s0l = vis_ld_d64_nf(sp0); sp0++;
					s1l = vis_ld_d64_nf(sp1); sp1++;
					s2l = vis_ld_d64_nf(sp2); sp2++;
					s3l = vis_ld_d64_nf(sp3); sp3++;

/*
 * The cursors may be misaligned differently, so the alignment setting is
 * re-established before each vis_faligndata.
 */
					VIS_ALIGNADDR(fi_ptr, 0);
					sd0 = vis_faligndata(s0h, s0l);

					VIS_ALIGNADDR(se_ptr, 0);
					sd1 = vis_faligndata(s1h, s1l);

					VIS_ALIGNADDR(th_ptr, 0);
					sd2 = vis_faligndata(s2h, s2l);

					VIS_ALIGNADDR(fo_ptr, 0);
					sd3 = vis_faligndata(s3h, s3l);

/*
 * 4x4 transpose of 16-bit samples built from three rounds of vis_fpmerge
 * (byte interleave of two 32-bit halves, per the VIS API): pair channels
 * 0/2 and 1/3, merge the pairs, then merge hi with lo to restore byte
 * order inside each sample.
 */
					dr02 =
					    vis_fpmerge(vis_read_hi(sd0),
					    vis_read_hi(sd2));
					dr13 =
					    vis_fpmerge(vis_read_hi(sd1),
					    vis_read_hi(sd3));
					dd0 =
					    vis_fpmerge(vis_read_hi(dr02),
					    vis_read_hi(dr13));
					dp[0] =
					    vis_fpmerge(vis_read_hi(dd0),
					    vis_read_lo(dd0));
					dd1 =
					    vis_fpmerge(vis_read_lo(dr02),
					    vis_read_lo(dr13));
					dp[1] =
					    vis_fpmerge(vis_read_hi(dd1),
					    vis_read_lo(dd1));
					dr02 =
					    vis_fpmerge(vis_read_lo(sd0),
					    vis_read_lo(sd2));
					dr13 =
					    vis_fpmerge(vis_read_lo(sd1),
					    vis_read_lo(sd3));
					dd2 =
					    vis_fpmerge(vis_read_hi(dr02),
					    vis_read_hi(dr13));
					dp[2] =
					    vis_fpmerge(vis_read_hi(dd2),
					    vis_read_lo(dd2));
					dd3 =
					    vis_fpmerge(vis_read_lo(dr02),
					    vis_read_lo(dr13));
					dp[3] =
					    vis_fpmerge(vis_read_hi(dd3),
					    vis_read_lo(dd3));
					dp += 4;

					s0h = s0l;
					s1h = s1l;
					s2h = s2l;
					s3h = s3l;

					fi_ptr += 4;
					se_ptr += 4;
					th_ptr += 4;
					fo_ptr += 4;
				}
			} else {

/* all four cursors 8-byte aligned: direct doubleword loads */
				mlib_d64 sd0, sd1, sd2, sd3;
				mlib_d64 dd0, dd1, dd2, dd3, dr02, dr13;

#pragma pipeloop(0)
				for (; i < (n - 15); i += 16) {

					sd0 = ((mlib_d64 *)fi_ptr)[0];
					sd1 = ((mlib_d64 *)se_ptr)[0];
					sd2 = ((mlib_d64 *)th_ptr)[0];
					sd3 = ((mlib_d64 *)fo_ptr)[0];

/* same three-round vis_fpmerge transpose as the unaligned loop */
					dr02 =
					    vis_fpmerge(vis_read_hi(sd0),
					    vis_read_hi(sd2));
					dr13 =
					    vis_fpmerge(vis_read_hi(sd1),
					    vis_read_hi(sd3));
					dd0 =
					    vis_fpmerge(vis_read_hi(dr02),
					    vis_read_hi(dr13));
					dp[0] =
					    vis_fpmerge(vis_read_hi(dd0),
					    vis_read_lo(dd0));
					dd1 =
					    vis_fpmerge(vis_read_lo(dr02),
					    vis_read_lo(dr13));
					dp[1] =
					    vis_fpmerge(vis_read_hi(dd1),
					    vis_read_lo(dd1));
					dr02 =
					    vis_fpmerge(vis_read_lo(sd0),
					    vis_read_lo(sd2));
					dr13 =
					    vis_fpmerge(vis_read_lo(sd1),
					    vis_read_lo(sd3));
					dd2 =
					    vis_fpmerge(vis_read_hi(dr02),
					    vis_read_hi(dr13));
					dp[2] =
					    vis_fpmerge(vis_read_hi(dd2),
					    vis_read_lo(dd2));
					dd3 =
					    vis_fpmerge(vis_read_lo(dr02),
					    vis_read_lo(dr13));
					dp[3] =
					    vis_fpmerge(vis_read_hi(dd3),
					    vis_read_lo(dd3));
					dp += 4;

					fi_ptr += 4;
					se_ptr += 4;
					th_ptr += 4;
					fo_ptr += 4;
				}
			}
		}
/* scalar groups of four for whatever the vector loop left over */
#pragma pipeloop(0)
		for (; i < (n - 3); i += 4) {
			dst_s16_0[i + 0] = (*fi_ptr++);
			dst_s16_0[i + 1] = (*se_ptr++);
			dst_s16_0[i + 2] = (*th_ptr++);
			dst_s16_0[i + 3] = (*fo_ptr++);
		}

/* 1..3 trailing samples; mirrors the number taken by the prologue */
		if (i < (n - 2)) {
			dst_s16_0[i + 0] = *fi_ptr;
			dst_s16_0[i + 1] = *se_ptr;
			dst_s16_0[i + 2] = *th_ptr;
		} else if (i < (n - 1)) {
			dst_s16_0[i + 0] = *fi_ptr;
			dst_s16_0[i + 1] = *se_ptr;
		} else if (i < n) {
			dst_s16_0[i + 0] = *fi_ptr;
		}

		dst_s16_0 += dst_stride;
		src_s16_0 += src0_stride;
		src_s16_1 += src1_stride;
		src_s16_2 += src2_stride;
		src_s16_3 += src3_stride;
	}
	return (MLIB_SUCCESS);
}
Exemplo n.º 5
0
  mask = mask0 >> offset;
  src = da[0];
  da[0] = (src & (~mask)) | (sa[0] & mask);
  da++;
  sa++;
  size = size - 8 + offset;
  b_size = size >> 3;                       /* size in bytes */

  /* prepare the destination addresses */
  dp = (mlib_d64 *) ((mlib_addr) da & (~7));
  j = (mlib_addr) dp - (mlib_addr) da;
  dend = da + b_size - 1;

  /* prepare the source address */
  sp = (mlib_d64 *) VIS_ALIGNADDR(sa, j);
  /* generate edge mask for the start point */
  emask = vis_edge8(da, dend);

  s1 = vis_ld_d64_nf(sp);
  if (emask != 0xff) {
    s0 = s1;
    s1 = vis_ld_d64_nf(sp+1);
    s0 = vis_faligndata(s0, s1);
    vis_pst_8(s0, dp++, emask);
    sp++;
    j += 8;
  }

#pragma pipeloop(0)
  for (; j <= (b_size - 8); j += 8) {