mlib_status
mlib_m_sconv3x3_16nw_1(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
	GET_SRC_DST_PARAMETERS(mlib_s16);
	__m64 hker0, hker1, hker2, vker0, vker1, vker2;
	__m64 s0, s1, s2, v0, v1, aa, bb, rr, rh, rl;
	__m64 *sp0, *sp1, *sp2, *dp;
	__m64 zero, _rnd;
	mlib_s32 shift, kerh_sum;
	mlib_s32 i, j;

	width -= 2;
	height -= 2;
	width *= NCHAN;
	dl += dll + NCHAN;

	GET_KERN();

	zero = _mm_setzero_si64();

	for (j = 0; j < height; j++) {
		sp0 = (__m64 *) sl;
		sp1 = (__m64 *) (sl + sll);
		sp2 = (__m64 *) (sl + 2 * sll);
		dp = (__m64 *) dl;

		PREP_V();

		for (i = 0; i < width / 4; i++) {
			CONV_3x3();

			dp[i] = rr;
		}

		if (width & 3) {
			__m64 mask =
			    ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

			CONV_3x3();

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, rr),
			    _mm_andnot_si64(mask, dp[i]));
		}

		sl += sll;
		dl += dll;
	}

	_mm_empty();

	return (MLIB_SUCCESS);
}
mlib_status
mlib_m_sconv5x5_u16nw_4(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
	GET_SRC_DST_PARAMETERS(mlib_s16);
	__m64 hker0, hker1, hker2, hker3, hker4;
	__m64 vker0, vker1, vker2, vker3, vker4;
	__m64 s0, s1, s2, s3, s4, v0, v1, v2, v3, v4, rr, rh, rl;
	__m64 zero, ker_off, mask8000;
	__m64 *sp0, *sp1, *sp2, *sp3, *sp4, *dp;
	mlib_s32 shift, ker_sum, kerh_sum = 0, kerv_sum = 0;
	mlib_s32 i, j;

	width -= 4;
	height -= 4;
	dl += 2 * (dll + NCHAN);

	GET_KERN();

	zero = _mm_setzero_si64();

	for (j = 0; j < height; j++) {
		sp0 = (__m64 *) sl;
		sp1 = (__m64 *) (sl + sll);
		sp2 = (__m64 *) (sl + 2 * sll);
		sp3 = (__m64 *) (sl + 3 * sll);
		sp4 = (__m64 *) (sl + 4 * sll);
		dp = (__m64 *) dl;

		PREP_V();

		for (i = 0; i < width; i++) {
			CONV_5x5();

			dp[i] = rr;
		}

		sl += sll;
		dl += dll;
	}

	_mm_empty();

	return (MLIB_SUCCESS);
}
mlib_status
mlib_m_sconv7x7_16nw_4(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
	GET_SRC_DST_PARAMETERS(mlib_s16);
	__m64 hker0, hker1, hker2, hker3, hker4, hker5, hker6;
	__m64 vker0, vker1, vker2, vker3, vker4, vker5, vker6;
	__m64 s0, s1, s2, s3, s4, s5, s6, v0, v1, v2, v3, v4, v5, v6, rr, rh,
	    rl;
	__m64 zero, _rnd;
	__m64 *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *sp6, *dp;
	mlib_s32 shift, kerh_sum;
	mlib_s32 i, j;

	width -= KSIZE1;
	height -= KSIZE1;
	width *= NCHAN;
	dl += (KSIZE / 2) * (dll + NCHAN);

	GET_KERN();

	zero = _mm_setzero_si64();

	for (j = 0; j < height; j++) {
		sp0 = (__m64 *) sl;
		sp1 = (__m64 *) (sl + sll);
		sp2 = (__m64 *) (sl + 2 * sll);
		sp3 = (__m64 *) (sl + 3 * sll);
		sp4 = (__m64 *) (sl + 4 * sll);
		sp5 = (__m64 *) (sl + 5 * sll);
		sp6 = (__m64 *) (sl + 6 * sll);
		dp = (__m64 *) dl;

		PREP_V(v1);
		PREP_V(v2);
		PREP_V(v3);
		PREP_V(v4);
		PREP_V(v5);
		PREP_V(v6);

		for (i = 0; i < width / 4; i++) {
			CONV_7x7();

			dp[i] = rr;
		}

		if (width & 3) {
			__m64 mask =
			    ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

			CONV_7x7();

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, rr),
			    _mm_andnot_si64(mask, dp[i]));
		}

		sl += sll;
		dl += dll;
	}

	_mm_empty();

	return (MLIB_SUCCESS);
}
/* *********************************************************** */
mlib_status
mlib_m_sconv3x3_8nw_1(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
	__m64 buff_loc[3 * BUFF_LINE], *pbuff = buff_loc;
	__m64 *buff0, *buff1, *buffT;
	GET_SRC_DST_PARAMETERS(mlib_u8);
	__m64 hker0, hker1, hker2, vker0, vker1, vker2;
	__m64 s0, d0, d1, sum0, sum1, sum2, aa, bb, res_hi, res_lo;
	__m64 zero = _m_zero;
	mlib_s32 shift;
	mlib_s32 *sp;
	mlib_s32 row, wid4, i, j;

	width -= 2;
	height -= 2;
	dl += dll + 1;

	wid4 = (width + 7) / 4;

	if (wid4 > BUFF_LINE) {
		pbuff = mlib_malloc(sizeof (__m64) * 3 * wid4);
	}

	GET_KERN();

	buff0 = pbuff;
	buff1 = buff0 + wid4;

	for (j = 0; j < 2; j++) {
		sp = (mlib_s32 *)sl;

		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d1, lo);

		for (i = 0; i < wid4; i++) {
			*(mlib_s32 *)&s0 = sp[i];

			PREP_3x3_1ch(lo, i);
		}

		sl += sll;

		buffT = buff1;
		buff1 = buff0;
		buff0 = buffT;
	}

	for (row = 0; row < height; row++) {
		__m64 *sp = (__m64 *) sl;
		__m64 *dp = (__m64 *) dl;

		s0 = (*sp++);
		UNPACK_SRC(d1, lo);

		for (i = 0; i < width / 8; i++) {
			CONV_3x3_1ch(hi, 2 * i);
			s0 = sp[i];
			CONV_3x3_1ch(lo, 2 * i + 1);

			dp[i] = _mm_packs_pu16(res_hi, res_lo);
		}

		if (width & 7) {
			__m64 mask;

			mask = ((__m64 *) mlib_mask64_arr)[width & 7];

			CONV_3x3_1ch(hi, 2 * i);
			s0 = sp[i];
			CONV_3x3_1ch(lo, 2 * i + 1);
			res_hi = _mm_packs_pu16(res_hi, res_lo);

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, res_hi),
			    _mm_andnot_si64(mask, dp[i]));
		}

		buffT = buff1;
		buff1 = buff0;
		buff0 = buffT;

		sl += sll;
		dl += dll;
	}

	_mm_empty();

	if (pbuff != buff_loc)
		mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
Esempio n. 5
0
mlib_status CONV_FUNC_I(MxN)(mlib_image       *dst,
                             const mlib_image *src,
                             const mlib_s32   *kernel,
                             mlib_s32         m,
                             mlib_s32         n,
                             mlib_s32         dm,
                             mlib_s32         dn,
                             mlib_s32         scale,
                             mlib_s32         cmask)
{
  mlib_s32 buff[BUFF_SIZE], *buffd = buff;
  mlib_s32 l, off, kw;
  mlib_s32 d0, d1, shift1, shift2;
  mlib_s32 k0, k1, k2, k3, k4, k5, k6;
  mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;
  DTYPE    *adr_src, *sl, *sp = NULL;
  DTYPE    *adr_dst, *dl, *dp = NULL;
  mlib_s32 wid, hgt, sll, dll;
  mlib_s32 nchannel, chan1;
  mlib_s32 i, j, c;
  mlib_s32 chan2;
  mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl;
  GET_SRC_DST_PARAMETERS(DTYPE);

#if IMG_TYPE != 1
  shift1 = 16;
#else
  shift1 = 8;
#endif /* IMG_TYPE != 1 */
  shift2 = scale - shift1;

  chan1 = nchannel;
  chan2 = chan1 + chan1;

  wid -= (m - 1);
  hgt -= (n - 1);
  adr_dst += dn*dll + dm*nchannel;

  if (wid > BUFF_SIZE) {
    buffd = mlib_malloc(sizeof(mlib_s32)*wid);

    if (buffd == NULL) return MLIB_FAILURE;
  }

  if (m*n > MAX_N*MAX_N) {
    k = mlib_malloc(sizeof(mlib_s32)*(m*n));

    if (k == NULL) {
      if (buffd != buff) mlib_free(buffd);
      return MLIB_FAILURE;
    }
  }

  for (i = 0; i < m*n; i++) {
    k[i] = kernel[i] >> shift1;
  }

  for (c = 0; c < nchannel; c++) {
    if (!(cmask & (1 << (nchannel - 1 - c)))) continue;

    sl = adr_src + c;
    dl = adr_dst + c;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (i = 0; i < wid; i++) buffd[i] = 0;

    for (j = 0; j < hgt; j++) {
      mlib_s32 *pk = k;

      for (l = 0; l < n; l++) {
        DTYPE *sp0 = sl + l*sll;

        for (off = 0; off < m;) {
          sp = sp0 + off*chan1;
          dp = dl;

          kw = m - off;

          if (kw > 2*MAX_KER) kw = MAX_KER; else
            if (kw > MAX_KER) kw = kw/2;
          off += kw;

          p2 = sp[0]; p3 = sp[chan1]; p4 = sp[chan2];
          p5 = sp[chan2 + chan1]; p6 = sp[chan2 + chan2]; p7 = sp[5*chan1];

          k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
          k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
          pk += kw;

          sp += (kw - 1)*chan1;

          if (kw == 7) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
                p6 = sp[0];
                p7 = sp[chan1];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
                p6 = sp[0];
                p7 = sp[chan1];

                d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
                d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 6) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
                p5 = sp[0];
                p6 = sp[chan1];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
                p5 = sp[0];
                p6 = sp[chan1];

                d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
                d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 5) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5;
                p4 = sp[0];
                p5 = sp[chan1];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5;
                p4 = sp[0];
                p5 = sp[chan1];

                d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
                d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 4) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4;
                p3 = sp[0];
                p4 = sp[chan1];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4;
                p3 = sp[0];
                p4 = sp[chan1];

                d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
                d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 3) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3;
                p2 = sp[0];
                p3 = sp[chan1];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3;
                p2 = sp[0];
                p3 = sp[chan1];

                d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
                d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 2) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2;
                p1 = sp[0];
                p2 = sp[chan1];

                buffd[i    ] += p0*k0 + p1*k1;
                buffd[i + 1] += p1*k0 + p2*k1;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2;
                p1 = sp[0];
                p2 = sp[chan1];

                d0 = (p0*k0 + p1*k1 + buffd[i    ]);
                d1 = (p1*k0 + p2*k1 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else /*if (kw == 1)*/ {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = sp[0];
                p1 = sp[chan1];

                buffd[i    ] += p0*k0;
                buffd[i + 1] += p1*k0;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = sp[0];
                p1 = sp[chan1];

                d0 = (p0*k0 + buffd[i    ]);
                d1 = (p1*k0 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }
          }
        }
      }

      /* last pixels */
      for (; i < wid; i++) {
        mlib_s32 *pk = k, s = 0;
        mlib_s32 x;

        for (l = 0; l < n; l++) {
          sp = sl + l*sll + i*chan1;

          for (x = 0; x < m; x++) {
            s += sp[0] * pk[0];
            sp += chan1;
            pk ++;
          }
        }

        STORE_RES(dp[0], s);

        sp += chan1;
        dp += chan1;
      }

      sl += sll;
      dl += dll;
    }
  }

  if (buffd != buff) mlib_free(buffd);
  if (k != k_locl) mlib_free(k);

  return MLIB_SUCCESS;
}
Esempio n. 6
0
mlib_status CONV_FUNC(MxN)(mlib_image       *dst,
                           const mlib_image *src,
                           const mlib_s32   *kernel,
                           mlib_s32         m,
                           mlib_s32         n,
                           mlib_s32         dm,
                           mlib_s32         dn,
                           mlib_s32         scale,
                           mlib_s32         cmask)
{
  FTYPE    buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
  FTYPE    **buffs = buffs_arr, *buffd;
  FTYPE    akernel[256], *k = akernel, fscale = DSCALE;
  mlib_s32 mn, l, off, kw, bsize, buff_ind;
  mlib_s32 d0, d1;
  FTYPE    k0, k1, k2, k3, k4, k5, k6;
  FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
  d64_2x32 dd;
  DEF_VARS(DTYPE);
  mlib_s32 chan2;
  mlib_s32 *buffo, *buffi;
  mlib_status status = MLIB_SUCCESS;

  GET_SRC_DST_PARAMETERS(DTYPE);

  if (scale > 30) {
    fscale *= 1.0/(1 << 30);
    scale -= 30;
  }

  fscale /= (1 << scale);

  mn = m*n;

  if (mn > 256) {
    k = mlib_malloc(mn*sizeof(mlib_d64));

    if (k == NULL) return MLIB_FAILURE;
  }

  for (i = 0; i < mn; i++) {
    k[i] = kernel[i]*fscale;
  }

  if (m == 1) {
    status = mlib_ImageConv1xN(dst, src, k, n, dn, cmask);
    FREE_AND_RETURN_STATUS;
  }

  bsize = (n + 3)*wid;

  if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
    pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));

    if (pbuff == NULL) {
      status = MLIB_FAILURE;
      FREE_AND_RETURN_STATUS;
    }
    buffs = (FTYPE   **)(pbuff + bsize);
  }

  for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*wid;
  for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
  buffd = buffs[n] + wid;
  buffo = (mlib_s32*)(buffd + wid);
  buffi = buffo + (wid &~ 1);

  chan1 = nchannel;
  chan2 = chan1 + chan1;

  wid -= (m - 1);
  hgt -= (n - 1);
  adr_dst += dn*dll + dm*nchannel;

  for (c = 0; c < nchannel; c++) {
    if (!(cmask & (1 << (chan1 - 1 - c)))) continue;

    sl = adr_src + c;
    dl = adr_dst + c;

    for (l = 0; l < n; l++) {
      FTYPE    *buff = buffs[l];

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = 0; i < wid + (m - 1); i++) {
        buff[i] = (FTYPE)sl[i*chan1];
      }

      sl += sll;
    }

    buff_ind = 0;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (i = 0; i < wid; i++) buffd[i] = 0.0;

    for (j = 0; j < hgt; j++) {
      FTYPE    **buffc = buffs + buff_ind;
      FTYPE    *buffn = buffc[n];
      FTYPE    *pk = k;

      for (l = 0; l < n; l++) {
        FTYPE    *buff_l = buffc[l];

        for (off = 0; off < m;) {
          FTYPE    *buff = buff_l + off;

          kw = m - off;

          if (kw > 2*MAX_KER) kw = MAX_KER; else
            if (kw > MAX_KER) kw = kw/2;
          off += kw;

          sp = sl;
          dp = dl;

          p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
          p5 = buff[3]; p6 = buff[4]; p7 = buff[5];

          k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
          k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
          pk += kw;

          if (kw == 7) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;

                p6 = buff[i + 6]; p7 = buff[i + 7];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;

                p6 = buff[i + 6]; p7 = buff[i + 7];

                LOAD_BUFF(buffi);

                dd.d64 = *(FTYPE   *)(buffi + i);
                buffn[i    ] = (FTYPE)dd.i32s.i0;
                buffn[i + 1] = (FTYPE)dd.i32s.i1;

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 6) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;

                p5 = buff[i + 5]; p6 = buff[i + 6];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;

                p5 = buff[i + 5]; p6 = buff[i + 6];

                buffn[i    ] = (FTYPE)sp[0];
                buffn[i + 1] = (FTYPE)sp[chan1];

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 5) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5;

                p4 = buff[i + 4]; p5 = buff[i + 5];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5;

                p4 = buff[i + 4]; p5 = buff[i + 5];

                buffn[i    ] = (FTYPE)sp[0];
                buffn[i + 1] = (FTYPE)sp[chan1];

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 4) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4;

                p3 = buff[i + 3]; p4 = buff[i + 4];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4;

                p3 = buff[i + 3]; p4 = buff[i + 4];

                buffn[i    ] = (FTYPE)sp[0];
                buffn[i + 1] = (FTYPE)sp[chan1];

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 3) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3;

                p2 = buff[i + 2]; p3 = buff[i + 3];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3;

                p2 = buff[i + 2]; p3 = buff[i + 3];

                buffn[i    ] = (FTYPE)sp[0];
                buffn[i + 1] = (FTYPE)sp[chan1];

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else /*if (kw == 2)*/ {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2;

                p1 = buff[i + 1]; p2 = buff[i + 2];

                buffd[i    ] += p0*k0 + p1*k1;
                buffd[i + 1] += p1*k0 + p2*k1;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2;

                p1 = buff[i + 1]; p2 = buff[i + 2];

                buffn[i    ] = (FTYPE)sp[0];
                buffn[i + 1] = (FTYPE)sp[chan1];

                d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }
          }
        }
      }

      /* last pixels */
      for (; i < wid; i++) {
        FTYPE    *pk = k, s = 0;
        mlib_s32 x, d0;

        for (l = 0; l < n; l++) {
          FTYPE    *buff = buffc[l] + i;

          for (x = 0; x < m; x++) s += buff[x] * (*pk++);
        }

        d0 = D2I(s);
        dp[0] = FROM_S32(d0);

        buffn[i] = (FTYPE)sp[0];

        sp += chan1;
        dp += chan1;
      }

      for (l = 0; l < (m - 1); l++) buffn[wid + l] = sp[l*chan1];

      /* next line */
      sl += sll;
      dl += dll;

      buff_ind++;

      if (buff_ind >= n + 1) buff_ind = 0;
    }
  }

  FREE_AND_RETURN_STATUS;
}
Esempio n. 7
0
static mlib_status mlib_ImageConv1xN(mlib_image       *dst,
                                     const mlib_image *src,
                                     const mlib_d64   *k,
                                     mlib_s32         n,
                                     mlib_s32         dn,
                                     mlib_s32         cmask)
{
  FTYPE    buff[BUFF_SIZE];
  mlib_s32 off, kh;
  mlib_s32 d0, d1;
  const FTYPE    *pk;
  FTYPE    k0, k1, k2, k3;
  FTYPE    p0, p1, p2, p3, p4;
  DEF_VARS(DTYPE);
  DTYPE    *sl_c, *dl_c, *sl0;
  mlib_s32 l, hsize, max_hsize;
  GET_SRC_DST_PARAMETERS(DTYPE);

  hgt -= (n - 1);
  adr_dst += dn*dll;

  max_hsize = (CACHE_SIZE/sizeof(DTYPE))/sll;

  if (!max_hsize) max_hsize = 1;

  if (max_hsize > BUFF_SIZE) {
    pbuff = mlib_malloc(sizeof(FTYPE)*max_hsize);
  }

  chan1 = nchannel;

  sl_c = adr_src;
  dl_c = adr_dst;

  for (l = 0; l < hgt; l += hsize) {
    hsize = hgt - l;

    if (hsize > max_hsize) hsize = max_hsize;

    for (c = 0; c < nchannel; c++) {
      if (!(cmask & (1 << (chan1 - 1 - c)))) continue;

      sl = sl_c + c;
      dl = dl_c + c;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (j = 0; j < hsize; j++) pbuff[j] = 0.0;

      for (i = 0; i < wid; i++) {
        sl0 = sl;

        for (off = 0; off < (n - 4); off += 4) {
          pk = k + off;
          sp = sl0;

          k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
          p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
          sp += 3*sll;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (j = 0; j < hsize; j += 2) {
            p0 = p2; p1 = p3; p2 = p4;
            p3 = sp[0];
            p4 = sp[sll];

            pbuff[j    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
            pbuff[j + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;

            sp += 2*sll;
          }

          sl0 += 4*sll;
        }

        pk = k + off;
        sp = sl0;

        k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
        p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];

        dp = dl;
        kh = n - off;

        if (kh == 4) {
          sp += 3*sll;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (j = 0; j <= (hsize - 2); j += 2) {
            p0 = p2; p1 = p3; p2 = p4;
            p3 = sp[0];
            p4 = sp[sll];

            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]);
            d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + pbuff[j + 1]);

            dp[0  ] = FROM_S32(d0);
            dp[dll] = FROM_S32(d1);

            pbuff[j] = 0;
            pbuff[j + 1] = 0;

            sp += 2*sll;
            dp += 2*dll;
          }

          if (j < hsize) {
            p0 = p2; p1 = p3; p2 = p4;
            p3 = sp[0];

            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]);

            pbuff[j] = 0;

            dp[0] = FROM_S32(d0);
          }

        } else if (kh == 3) {
          sp += 2*sll;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (j = 0; j <= (hsize - 2); j += 2) {
            p0 = p2; p1 = p3;
            p2 = sp[0];
            p3 = sp[sll];

            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]);
            d1 = D2I(p1*k0 + p2*k1 + p3*k2 + pbuff[j + 1]);

            dp[0  ] = FROM_S32(d0);
            dp[dll] = FROM_S32(d1);

            pbuff[j] = 0;
            pbuff[j + 1] = 0;

            sp += 2*sll;
            dp += 2*dll;
          }

          if (j < hsize) {
            p0 = p2; p1 = p3;
            p2 = sp[0];

            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]);

            pbuff[j] = 0;

            dp[0] = FROM_S32(d0);
          }

        } else if (kh == 2) {
          sp += sll;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (j = 0; j <= (hsize - 2); j += 2) {
            p0 = p2;
            p1 = sp[0];
            p2 = sp[sll];

            d0 = D2I(p0*k0 + p1*k1 + pbuff[j]);
            d1 = D2I(p1*k0 + p2*k1 + pbuff[j + 1]);

            dp[0  ] = FROM_S32(d0);
            dp[dll] = FROM_S32(d1);

            pbuff[j] = 0;
            pbuff[j + 1] = 0;

            sp += 2*sll;
            dp += 2*dll;
          }

          if (j < hsize) {
            p0 = p2;
            p1 = sp[0];

            d0 = D2I(p0*k0 + p1*k1 + pbuff[j]);

            pbuff[j] = 0;

            dp[0] = FROM_S32(d0);
          }

        } else /* if (kh == 1) */ {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (j = 0; j < hsize; j++) {
            p0 = sp[0];

            d0 = D2I(p0*k0 + pbuff[j]);

            dp[0] = FROM_S32(d0);

            pbuff[j] = 0;

            sp += sll;
            dp += dll;
          }
        }

        sl += chan1;
        dl += chan1;
      }
    }

    sl_c += max_hsize*sll;
    dl_c += max_hsize*dll;
  }

  if (pbuff != buff) mlib_free(pbuff);

  return MLIB_SUCCESS;
}
mlib_status
mlib_m_conv5x5_u16nw_2(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
	__m64 *pbuff, *buff_arr[20], **pbuff_arr = buff_arr;
	__m64 *buff0, *buff1, *buff2, *buff3;
	GET_SRC_DST_PARAMETERS(mlib_s16);
	__m64 ker[5][5];
	__m64 d0, d1, d2, aa, bb, rr, tmpa, tmpb, ker_off, mask8000;
	__m64 prev0h, prev1h, prev2h, prev3h, sum0h, sum1h, sum2h, sum3h, sum4h,
	    tmph;
	__m64 prev0l, prev1l, prev2l, prev3l, sum0l, sum1l, sum2l, sum3l, sum4l,
	    tmpl;
	__m64 *sp, *dp;
	mlib_s32 shift, ind, ker_sum = 0;
	mlib_s32 row, wid4, i, j;

	width -= 4;
	height -= 4;
	width *= NCHAN;
	dl += 2 * (dll + NCHAN);

	wid4 = (width + 7) / 4;
	pbuff = mlib_malloc(sizeof (__m64) * 20 * wid4);

	GET_KERN();

	for (i = 0; i < 10; i++) {
		buff_arr[i] = pbuff + i * 2 * wid4;
	}

	ind = 0;
	for (j = 1; j <= 4; j++) {
		buff0 = buff_arr[ind];
		buff1 = buff_arr[ind + 1];
		buff2 = buff_arr[ind + 2];
		buff3 = buff_arr[ind + 3];

		sp = (__m64 *) sl;
		d1 = (*sp++);
		d1 = _mm_xor_si64(d1, mask8000);
		d2 = (*sp++);
		d2 = _mm_xor_si64(d2, mask8000);

		for (i = 0; i < wid4; i++) {
			PREP_5x5();
		}

		sl += sll;
		ind += j;
	}

	for (row = 0; row < height; row++) {
		sp = (__m64 *) sl;
		dp = (__m64 *) dl;

		buff0 = pbuff_arr[0];
		buff1 = pbuff_arr[2];
		buff2 = pbuff_arr[5];
		buff3 = pbuff_arr[9];

		d1 = (*sp++);
		d1 = _mm_xor_si64(d1, mask8000);
		d2 = (*sp++);
		d2 = _mm_xor_si64(d2, mask8000);

		for (i = 0; i < width / 4; i++) {
			CONV_5x5(hi, i);

			dp[i] = rr;
		}

		if (width & 3) {
			__m64 mask =
			    ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

			CONV_5x5(hi, i);

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, rr),
			    _mm_andnot_si64(mask, dp[i]));
		}

		ind = (pbuff_arr == buff_arr) ? 10 : -10;
		pbuff_arr[ind + 0] = pbuff_arr[1];
		pbuff_arr[ind + 1] = pbuff_arr[3];
		pbuff_arr[ind + 2] = pbuff_arr[4];
		pbuff_arr[ind + 3] = pbuff_arr[6];
		pbuff_arr[ind + 4] = pbuff_arr[7];
		pbuff_arr[ind + 5] = pbuff_arr[8];
		pbuff_arr[ind + 6] = pbuff_arr[0];
		pbuff_arr[ind + 7] = pbuff_arr[2];
		pbuff_arr[ind + 8] = pbuff_arr[5];
		pbuff_arr[ind + 9] = pbuff_arr[9];
		pbuff_arr += ind;

		sl += sll;
		dl += dll;
	}

	_mm_empty();
	mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
mlib_status
mlib_v_conv2x2_u16nw_mask(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 scalef_expon,
    mlib_s32 cmask)
{
/* pointers to dst row */
	mlib_u16 *da, *d_a;

/* pointers to src, dst data */
	mlib_u16 *adr_dst, *adr_src, *dend;

/* pointers to src rows */
	mlib_u16 *sa, *sa1, *sa2, *sa_2;

/* pointers to rows in interm. src buf */
	mlib_u16 *buff_src, *sbuf1, *sbuf2, *prow;
	mlib_u16 *s_buf1;

/* mlib_d64 pointers to rows in interm. src buf */
	mlib_d64 *s1, *s2;

/* src, dst and interm. buf. strides */
	mlib_s32 dlb, slb, buf_slb;
	mlib_s32 dh, dw;
	mlib_d64 out0, out1, tmp0, tmp1, tmp2, tmp3;

/* data */
	mlib_d64 d1, d2, d_1, d_2;

/* shifted data */
	mlib_d64 d21, d22;

/* coefficients */
	mlib_f32 k1, k2, k3, k4;
	int gsr_scale, i, j, nchannel, chan, testchan;
	mlib_u16 t1, t2, t3, t4, t5, t6, t7, t8;
	type_mlib_d64 str;
	mlib_d64 ker_off, mask8000 = vis_to_double_dup(0x80008000);

	nchannel = mlib_ImageGetChannels(src);
	GET_SRC_DST_PARAMETERS();
	LOAD_KERNEL_INTO_FLOAT();

	gsr_scale = 32 - scalef_expon;
	vis_write_gsr((gsr_scale << 3) + 2);

/* buf_slb - 8-byte aligned */
	buf_slb = (2 * dw + 26) & (~7);
/* alloc. interm. src buffer */
	buff_src =
	    (mlib_u16 *)__mlib_malloc(2 * buf_slb * sizeof (mlib_u8) + 8);

	if (buff_src == NULL)
		return (MLIB_FAILURE);

	buf_slb >>= 1;

	sbuf1 = (mlib_u16 *)((mlib_addr)(buff_src + 8) & (~7));
	sbuf2 = sbuf1 + buf_slb;

	dw -= 1;
/* edge - no write */
	dh -= 1;

	testchan = 1;

	for (chan = nchannel - 1; chan >= 0; chan--) {
		if ((cmask & testchan) == 0) {
			testchan <<= 1;
			continue;
		}

		testchan <<= 1;
		sa = adr_src + chan;
		sa1 = sa + slb;
		sa_2 = sa2 = sa1 + slb;
		d_a = adr_dst + chan;

/* load interm. src buff */
		for (i = 0, j = 0; j < (dw + 1); i += nchannel, j++) {
			sbuf1[j] = sa1[i];
			sbuf2[j] = sa[i];
		}

		for (j = 0; j < dh - 1; j++) {
			da = d_a;
			prow = sbuf1;
			sbuf1 = sbuf2;
			sbuf2 = prow;
			s1 = (mlib_d64 *)sbuf1;
			s2 = (mlib_d64 *)sbuf2;
			dend = da + (dw - 1) * nchannel;
			s_buf1 = sbuf1;
			d1 = *s1;
			d2 = *s2;
			d1 = vis_fxor(d1, mask8000);
			d2 = vis_fxor(d2, mask8000);

			d_1 = *(s1 + 1);
			d_2 = *(s2 + 1);
			d_1 = vis_fxor(d_1, mask8000);
			d_2 = vis_fxor(d_2, mask8000);
			CONV_16_BEGIN(d1, k1);
			CONV_16(d2, k3);
			d21 = vis_faligndata(d1, d_1);
			d22 = vis_faligndata(d2, d_2);
			CONV_16(d21, k2);
			CONV_16(d22, k4);
			str.value =
			    vis_fxor(vis_fpackfix_pair(out0, out1), mask8000);
			d1 = d_1;
			d2 = d_2;
			s1++;
			s2++;
/*
 * in each iteration store result from prev. iterat.
 * and load data for processing next row
 */
#pragma pipeloop(0)
			for (i = 0; i < dw - 4; i += 4) {
				t1 = *sa_2;
				sa_2 += nchannel;
				t2 = *sa_2;
				sa_2 += nchannel;
				d_1 = *(s1 + 1);
				d_2 = *(s2 + 1);
				d_1 = vis_fxor(d_1, mask8000);
				d_2 = vis_fxor(d_2, mask8000);
				CONV_16_BEGIN(d1, k1);
				t3 = *sa_2;
				sa_2 += nchannel;
				t4 = *sa_2;
				sa_2 += nchannel;
				CONV_16(d2, k3);
				t5 = str.forshort.ushort0;
				t6 = str.forshort.ushort1;
				d21 = vis_faligndata(d1, d_1);
				t7 = str.forshort.ushort2;
				d22 = vis_faligndata(d2, d_2);
				t8 = str.forshort.ushort3;
				CONV_16(d21, k2);
				(*s_buf1++) = t1;
				(*s_buf1++) = t2;
				CONV_16(d22, k4);
				(*s_buf1++) = t3;
				(*s_buf1++) = t4;
				*da = t5;
				da += nchannel;
				str.value =
				    vis_fxor(vis_fpackfix_pair(out0, out1),
				    mask8000);
				*da = t6;
				da += nchannel;
				d1 = d_1;
				d2 = d_2;
				*da = t7;
				da += nchannel;
				s1++;
				s2++;
				*da = t8;
				da += nchannel;
			}

			for (; i < dw + 1; i++) {
				(*s_buf1++) = *sa_2;
				sa_2 += nchannel;
			}

			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = str.forshort.ushort0;
				da += nchannel;
			}

			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = str.forshort.ushort1;
				da += nchannel;
			}

			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = str.forshort.ushort2;
				da += nchannel;
			}

			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = str.forshort.ushort3;
			}

			sa_2 = sa2 = sa2 + slb;
			d_a += dlb;
		}

/* process last row - no need to load data */
		da = d_a;
		prow = sbuf1;
		sbuf1 = sbuf2;
		sbuf2 = prow;
		s1 = (mlib_d64 *)sbuf1;
		s2 = (mlib_d64 *)sbuf2;
		dend = da + (dw - 1) * nchannel;
		d1 = *s1;
		d2 = *s2;
		d1 = vis_fxor(d1, mask8000);
		d2 = vis_fxor(d2, mask8000);

		d_1 = *(s1 + 1);
		d_2 = *(s2 + 1);
		d_1 = vis_fxor(d_1, mask8000);
		d_2 = vis_fxor(d_2, mask8000);
		CONV_16_BEGIN(d1, k1);
		CONV_16(d2, k3);
		d21 = vis_faligndata(d1, d_1);
		d22 = vis_faligndata(d2, d_2);
		CONV_16(d21, k2);
		CONV_16(d22, k4);
		d1 = d_1;
		d2 = d_2;
		s1++;
		s2++;

#pragma pipeloop(0)
		for (i = 4; i < dw; i += 4) {
			str.value =
			    vis_fxor(vis_fpackfix_pair(out0, out1), mask8000);
			d_1 = *(s1 + 1);
			d_2 = *(s2 + 1);
			d_1 = vis_fxor(d_1, mask8000);
			d_2 = vis_fxor(d_2, mask8000);
			CONV_16_BEGIN(d1, k1);
			t5 = str.forshort.ushort0;
			CONV_16(d2, k3);
			d21 = vis_faligndata(d1, d_1);
			t6 = str.forshort.ushort1;
			d22 = vis_faligndata(d2, d_2);
			CONV_16(d21, k2);
			t7 = str.forshort.ushort2;
			CONV_16(d22, k4);
			t8 = str.forshort.ushort3;
			*da = t5;
			da += nchannel;
			*da = t6;
			da += nchannel;
			*da = t7;
			da += nchannel;
			d1 = d_1;
			d2 = d_2;
			*da = t8;
			da += nchannel;
			s1++;
			s2++;
		}

		str.value = vis_fxor(vis_fpackfix_pair(out0, out1), mask8000);

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = str.forshort.ushort0;
			da += nchannel;
		}

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = str.forshort.ushort1;
			da += nchannel;
		}

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = str.forshort.ushort2;
			da += nchannel;
		}

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = str.forshort.ushort3;
		}
	}

	__mlib_free(buff_src);
	return (MLIB_SUCCESS);
}
Esempio n. 10
0
mlib_status
mlib_v_conv2x2_u16nw_4(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 scalef_expon)
{
/* pointers to dst row */
	mlib_u16 *da, *d_a;

/* pointers to src, dst data */
	mlib_u16 *adr_dst, *adr_src, *dend;

/* pointers to src rows */
	mlib_u16 *sa, *sa1;

/* pointers to rows in interm. src buf */
	mlib_d64 *buff_src, *sbuf1, *sbuf2, *prow;

/* pointer to row in interm. dst buf */
	mlib_d64 *dbuf;

/* mlib_d64 pointers to rows in interm. src buf */
	mlib_d64 *s1, *s2;

/* mlib_d64 pointer to row in interm. dst buf */
	mlib_d64 *ddst;

/* data */
	mlib_d64 d1, d2, d_1, d_2;
	mlib_f32 k1, k2, k3, k4;

/* src, dst and interm. buf. strides */
	mlib_s32 dlb, slb, buf_slb;
	mlib_s32 dh, dw;
	mlib_d64 out0, out1, tmp0, tmp1, tmp2, tmp3;
	mlib_d64 *dsa, *dp;
	mlib_d64 sd0, sd1;
	mlib_s32 emask;
	int gsr_scale, i, j;
	mlib_d64 ker_off, mask8000 = vis_to_double_dup(0x80008000);

	GET_SRC_DST_PARAMETERS();
	LOAD_KERNEL_INTO_FLOAT();

	gsr_scale = 32 - scalef_expon;
	vis_write_gsr((gsr_scale << 3));

	buf_slb = (8 * dw + 16) >> 3;
	PREPARE_INTERM_BUFFERS();

	dw -= 1;
	dw *= 4;
	dh -= 1;

	sa = adr_src;
	sa1 = sa + slb;
	d_a = adr_dst;

/* load interm. src buff */
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(sbuf2, sa, 4);

#pragma pipeloop(0)
	for (j = 0; j < dh; j++) {
		LOOP_INI();

#pragma pipeloop(0)
		LOAD_LINE_INTO_BUFFER(sbuf2, sa1, 4);

		d1 = *s1;
		d2 = *s2;
		d1 = vis_fxor(d1, mask8000);
		d2 = vis_fxor(d2, mask8000);

#pragma pipeloop(0)
		for (i = 0; i < dw; i += 4) {
			d_1 = *(s1 + 1);
			d_2 = *(s2 + 1);
			d_1 = vis_fxor(d_1, mask8000);
			d_2 = vis_fxor(d_2, mask8000);
			CONV_16_BEGIN(d1, k1);
			CONV_16(d2, k3);
			CONV_16(d_1, k2);
			CONV_16(d_2, k4);
			(*ddst++) =
			    vis_fxor(vis_fpackfix_pair(out0, out1), mask8000);
			d1 = d_1;
			d2 = d_2;
			s1++;
			s2++;
		}

		PREPARE_TO_COPY_INTERM_BUF_TO_DST();

#pragma pipeloop(0)
		COPY_INTERM_BUF_TO_DST();
		COPY_TAIL();

		sa1 = sa1 + slb;
		d_a += dlb;
	}

	__mlib_free(buff_src);
	return (MLIB_SUCCESS);
}
mlib_status
mlib_m_sconv5x5_8nw_2(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
    __m64 *pbuff, *buff_arr[5];
    __m64 *buff0, *buff1, *buff2, *buff3, *buff4, *buffT;
    GET_SRC_DST_PARAMETERS(mlib_u8);
    __m64 hker0, hker1, hker2, hker3, hker4;
    __m64 vker0, vker1, vker2, vker3, vker4;
    __m64 s0, d0, d1, d2, prev0;
    __m64 sum0, sum1, sum2, sum3, sum4, aa, bb, res_hi, res_lo;
    __m64 zero = _m_zero;
    mlib_s32 shift, ind;
    mlib_s32 *sp;
    mlib_s32 row, wid4, i, j;

    width -= 4;
    height -= 4;
    width *= NCHAN;
    dl += 2 * (dll + NCHAN);

    wid4 = 2 * ((width + 7) / 8);
    pbuff = mlib_malloc(sizeof (__m64) * 5 * wid4);

    GET_KERN();

    for (i = 0; i < 5; i++) {
        buff_arr[i] = pbuff + i * wid4;
    }

    for (j = 0; j < 4; j++) {
        buff4 = buff_arr[j];

        sp = (mlib_s32 *)sl;

        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d1, lo);
        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d2, lo);

        for (i = 0; i < wid4; i++) {
            *(mlib_s32 *)&s0 = sp[i];

            PREP_5x5(lo, i);
        }

        sl += sll;
        ind++;
    }

    buff0 = buff_arr[0];
    buff1 = buff_arr[1];
    buff2 = buff_arr[2];
    buff3 = buff_arr[3];
    buff4 = buff_arr[4];

    for (row = 0; row < height; row++) {
        __m64 *sp = (__m64 *) sl;
        __m64 *dp = (__m64 *) dl;

        s0 = (*sp++);
        UNPACK_SRC(d1, lo);
        UNPACK_SRC(d2, hi);

        for (i = 0; i < width / 8; i++) {
            s0 = sp[i];
            CONV_5x5(lo, 2 * i);
            CONV_5x5(hi, 2 * i + 1);

            dp[i] = _mm_packs_pu16(res_lo, res_hi);
        }

        if (width & 7) {
            __m64 mask = ((__m64 *) mlib_mask64_arr)[width & 7];

            s0 = sp[i];
            CONV_5x5(lo, 2 * i);
            CONV_5x5(hi, 2 * i + 1);
            res_hi = _mm_packs_pu16(res_lo, res_hi);

            dp[i] =
                _mm_or_si64(_mm_and_si64(mask, res_hi),
                            _mm_andnot_si64(mask, dp[i]));
        }

        buffT = buff0;
        buff0 = buff1;
        buff1 = buff2;
        buff2 = buff3;
        buff3 = buff4;
        buff4 = buffT;

        sl += sll;
        dl += dll;
    }

    _mm_empty();
    mlib_free(pbuff);

    return (MLIB_SUCCESS);
}
Esempio n. 12
0
mlib_status
mlib_v_conv3x3_8nw_4(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 scalef_expon,
    mlib_s32 cmask)
{
/* pointers to dst row */
	mlib_u8 *da, *d_a;

/* pointers to src, dst data */
	mlib_u8 *adr_dst, *adr_src, *dend;

/* pointers to src rows */
	mlib_u8 *sa, *sa1, *sa2;

/* pointers to rows in interm. src buf */
	mlib_d64 *buff_src, *sbuf1, *sbuf2, *prow;

/* pointers to rows in interm. src buf */
	mlib_d64 *sbuf3;

/* pointer to row in interm. dst buf */
	mlib_d64 *dbuf;

/* mlib_d64 pointers to rows in interm. src buf */
	mlib_d64 *s1, *s2, *s3;

/* mlib_d64 pointer to row in interm. dst buf */
	mlib_d64 *ddst;

/* data */
	mlib_d64 d1, d2, d_1, d_2, d21, d22;

/* data */
	mlib_d64 d3, d_3, d23;
	mlib_f32 k1k2, k3k4, k5k6, k7k8, k9k9;

/* src, dst and interm. buf. strides */
	mlib_s32 dlb, slb, buf_slb;
	mlib_s32 dh, dw;
	mlib_d64 out0, out1;
	mlib_d64 tmp0, tmp1, rnd;
	mlib_d64 *dsa, *dp;
	mlib_d64 sd0, sd1, sd00;
	mlib_s32 emask, cmask1;
	mlib_s32 rval, gsr_scale, i, j;

	gsr_scale = 31 - scalef_expon;
	vis_write_gsr((gsr_scale << 3));
	rval = mlib_round_8[gsr_scale];
	rnd = vis_freg_pair(vis_to_float(rval), vis_to_float(rval));

	cmask = ((cmask & 0xf) << 4) + (cmask & 0xf);
	cmask = (cmask << 8) + (cmask);

	GET_SRC_DST_PARAMETERS();
	LOAD_KERNEL_INTO_FLOAT();

	buf_slb = (4 * dw + 24) >> 3;
	PREPARE_INTERM_BUFFERS();

	dw -= 2;
	dw *= 4;
	dh -= 2;

	sa = adr_src;
	sa1 = sa + slb;
	sa2 = sa1 + slb;
	d_a = adr_dst + dlb + 4;

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf2, sa);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(8);

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf3, sa1);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(8);

#pragma pipeloop(0)
	for (j = 0; j < dh; j++) {
		LOOP_INI();

		PREPARE_TO_LOAD_LINE(sbuf3, sa2);
#pragma pipeloop(0)
		LOAD_LINE_INTO_BUFFER(8);

		vis_alignaddr(s1, 4);
		d1 = *s1;
		d2 = *s2;
		d3 = *s3;

#pragma pipeloop(0)
		for (i = 0; i < dw; i += 8) {
			d_1 = *(s1 + 1);
			d_2 = *(s2 + 1);
			d_3 = *(s3 + 1);
			out0 = out1 = rnd;
			CONV_AU(d1, k1k2);
			CONV_AL(d2, k3k4);
			CONV_AU(d3, k7k8);
			d21 = vis_faligndata(d1, d_1);
			d22 = vis_faligndata(d2, d_2);
			d23 = vis_faligndata(d3, d_3);
			CONV_AL(d21, k1k2);
			CONV_AU(d22, k5k6);
			CONV_AL(d23, k7k8);
			CONV_AU(d_1, k3k4);
			CONV_AL(d_2, k5k6);
			CONV_AU(d_3, k9k9);
			(*ddst++) = vis_fpack16_pair(out0, out1);
			d1 = d_1;
			d2 = d_2;
			d3 = d_3;
			s1++;
			s2++;
			s3++;
		}

		ddst = dbuf;
/* prepare the destination addresses */
		dp = (mlib_d64 *)((mlib_addr)da & (~7));
		i = (mlib_addr)dp - (mlib_addr)da;
		cmask1 = cmask >> (-i);
		ddst = vis_alignaddr(ddst, i);
/* generate edge mask for the start point */
		emask = vis_edge8(da, dend);
		sd1 = ddst[0];

		if (emask != 0xff) {
			sd0 = sd1;
			sd1 = ddst[1];
			sd0 = vis_faligndata(sd0, sd1);
			vis_pst_8(sd0, dp++, emask & cmask1);
			ddst++;
			i += 8;
		}
#pragma pipeloop(0)
		for (; i <= (dw - 8); i += 8) {
			sd0 = sd1;
			sd1 = ddst[1];
			sd00 = vis_faligndata(sd0, sd1);
			vis_pst_8(sd00, dp++, cmask1);
			ddst++;
		}

		if (i < dw) {
			sd0 = vis_faligndata(sd1, ddst[1]);
			emask = vis_edge8(dp, dend);
			vis_pst_8(sd0, dp, emask & cmask1);
		}

		sa2 = sa2 + slb;
		d_a += dlb;
	}

	__mlib_free(buff_src);
	return (MLIB_SUCCESS);
}
Esempio n. 13
0
mlib_status
mlib_m_conv5x5_8nw_4(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
	__m64 *pbuff, *buff_arr[20], **pbuff_arr = buff_arr;
	__m64 *buff0, *buff1, *buff2, *buff3;
	GET_SRC_DST_PARAMETERS(mlib_u8);
	__m64 ker[5][5];
	__m64 s0, d0, d1, d2, d3, d4, prev0, prev1, prev2, prev3, aa, bb, cc;
	__m64 sum0, sum1, sum2, sum3, sum4, res_hi, res_lo;
	__m64 zero = _m_zero;
	mlib_s32 shift, ind;
	mlib_s32 *sp;
	mlib_s32 row, wid4, i, j;

	width -= (KSIZE - 1);
	height -= (KSIZE - 1);
	width *= NCHAN;
	dl += ((KSIZE - 1) / 2) * (dll + NCHAN);

	wid4 = (width + 7) / 4;
	pbuff = mlib_malloc(sizeof (__m64) * 10 * wid4);

	GET_KERN();

	for (i = 0; i < 10; i++) {
		buff_arr[i] = pbuff + i * wid4;
	}

	ind = 0;
	for (j = 1; j <= 4; j++) {
		buff0 = buff_arr[ind];
		buff1 = buff_arr[ind + 1];
		buff2 = buff_arr[ind + 2];
		buff3 = buff_arr[ind + 3];

		sp = (mlib_s32 *)sl;

		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d1, lo);
		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d2, lo);
		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d3, lo);
		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d4, lo);

		for (i = 0; i < wid4; i++) {
			*(mlib_s32 *)&s0 = sp[i];

			PREP_5x5();
		}

		sl += sll;
		ind += j;
	}

	for (row = 0; row < height; row++) {
		__m64 *sp = (__m64 *) sl;
		__m64 *dp = (__m64 *) dl;

		buff0 = pbuff_arr[0];
		buff1 = pbuff_arr[2];
		buff2 = pbuff_arr[5];
		buff3 = pbuff_arr[9];

		s0 = (*sp++);
		UNPACK_SRC(d1, lo);
		UNPACK_SRC(d2, hi);
		s0 = (*sp++);
		UNPACK_SRC(d3, lo);
		UNPACK_SRC(d4, hi);

		for (i = 0; i < width / 8; i++) {
			s0 = sp[i];
			CONV_5x5(lo, 2 * i);
			CONV_5x5(hi, 2 * i + 1);

			dp[i] = _mm_packs_pu16(res_lo, res_hi);
		}

		if (width & 7) {
			__m64 mask;

			mask = ((__m64 *) mlib_mask64_arr)[width & 7];

			s0 = sp[i];
			CONV_5x5(lo, 2 * i);
			CONV_5x5(hi, 2 * i + 1);
			res_hi = _mm_packs_pu16(res_lo, res_hi);

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, res_hi),
			    _mm_andnot_si64(mask, dp[i]));
		}

		ind = (pbuff_arr == buff_arr) ? 10 : -10;
		pbuff_arr[ind + 0] = pbuff_arr[1];
		pbuff_arr[ind + 1] = pbuff_arr[3];
		pbuff_arr[ind + 2] = pbuff_arr[4];
		pbuff_arr[ind + 3] = pbuff_arr[6];
		pbuff_arr[ind + 4] = pbuff_arr[7];
		pbuff_arr[ind + 5] = pbuff_arr[8];
		pbuff_arr[ind + 6] = pbuff_arr[0];
		pbuff_arr[ind + 7] = pbuff_arr[2];
		pbuff_arr[ind + 8] = pbuff_arr[5];
		pbuff_arr[ind + 9] = pbuff_arr[9];
		pbuff_arr += ind;

		sl += sll;
		dl += dll;
	}

	_mm_empty();
	mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
Esempio n. 14
0
mlib_status
mlib_c_conv2x2ext_u8(
    mlib_image *dst,
    const mlib_image *src,
    mlib_s32 dx_l,
    mlib_s32 dx_r,
    mlib_s32 dy_t,
    mlib_s32 dy_b,
    const mlib_s32 *kern,
    mlib_s32 scalef_expon,
    mlib_s32 cmask)
{
	mlib_d64 buff_arr[4 * BUFF_LINE];
	mlib_s32 *pbuff = (mlib_s32 *)buff_arr, *buff0, *buff1, *buff2, *buffT;
	DTYPE *adr_src, *sl, *sp, *sl1;
	DTYPE *adr_dst, *dl, *dp;
	mlib_d64 k0, k1, k2, k3, scalef = 1.0;
	mlib_d64 p00, p01, p02, p10, p11, p12;
	mlib_s32 wid, hgt, sll, dll, wid1;
	mlib_s32 nchannel, chan1, chan2;
	mlib_s32 i, j, c, swid;

	LOAD_KERNEL_INTO_DOUBLE();
	GET_SRC_DST_PARAMETERS(DTYPE);

	vis_write_gsr(23 << 3);

	swid = wid + D_KER;

	wid1 = (swid + 1) & ~1;

	if (wid1 > BUFF_LINE) {
		pbuff = __mlib_malloc(4 * sizeof (mlib_s32) * wid1);

		if (pbuff == NULL)
			return (MLIB_FAILURE);
	}

	buff0 = pbuff;
	buff1 = buff0 + wid1;
	buff2 = buff1 + wid1;

	chan1 = nchannel;
	chan2 = chan1 + chan1;

	swid -= dx_r;

	for (c = 0; c < nchannel; c++) {
		if (!(cmask & (1 << (nchannel - 1 - c))))
			continue;

		sl = adr_src + c;
		dl = adr_dst + c;

		if ((hgt - dy_b) > 0)
			sl1 = sl + sll;
		else
			sl1 = sl;

#pragma pipeloop(0)
		for (i = 0; i < swid; i++) {
			buff0[i - 1] = (mlib_s32)sl[i * chan1];
			buff1[i - 1] = (mlib_s32)sl1[i * chan1];
		}

		if (dx_r != 0) {
			buff0[swid - 1] = buff0[swid - 2];
			buff1[swid - 1] = buff1[swid - 2];
		}

		if ((hgt - dy_b) > 1)
			sl = sl1 + sll;
		else
			sl = sl1;

		for (j = 0; j < hgt; j++) {
			sp = sl;
			dp = dl;

			buff2[-1] = (mlib_s32)sp[0];
			sp += chan1;

			p02 = buff0[-1];
			p12 = buff1[-1];

#pragma pipeloop(0)
			for (i = 0; i <= (wid - 2); i += 2) {
				d64_2x32 sd0, sd1;
				d64_2x32 dd0, dd1;

				p00 = p02;
				p10 = p12;

				sd0.d64 = *(TYPE_64BIT *) (buff0 + i);
				sd1.d64 = *(TYPE_64BIT *) (buff1 + i);
				p01 = (mlib_d64)sd0.i32s.i0;
				p02 = (mlib_d64)sd0.i32s.i1;
				p11 = (mlib_d64)sd1.i32s.i0;
				p12 = (mlib_d64)sd1.i32s.i1;

				LOAD_BUFF(buff2);

				dd0.i32s.i0 =
				    CLAMP_S32(p00 * k0 + p01 * k1 + p10 * k2 +
				    p11 * k3);
				dd0.i32s.i1 =
				    CLAMP_S32(p01 * k0 + p02 * k1 + p11 * k2 +
				    p12 * k3);

				dd1.d64 = vis_fpack32(dd1.d64, dd0.d64);

				STORE2(dd1.i32s.i0, dd1.i32s.i1);

				sp += chan2;
				dp += chan2;
			}

			for (; i < wid; i++) {
				d64_2x32 dd0, dd1;

				p00 = buff0[i - 1];
				p10 = buff1[i - 1];
				p01 = buff0[i];
				p11 = buff1[i];

				buff2[i] = (mlib_s32)sp[0];

				dd0.i32s.i1 =
				    CLAMP_S32(p00 * k0 + p01 * k1 + p10 * k2 +
				    p11 * k3);
				dd1.d64 = vis_fpack32(dd1.d64, dd0.d64);
				dp[0] = dd1.i32s.i1;

				sp += chan1;
				dp += chan1;
			}

			if (dx_r != 0)
				buff2[swid - 1] = buff2[swid - 2];

			if (j < hgt - dy_b - 2)
				sl += sll;
			dl += dll;

			buffT = buff0;
			buff0 = buff1;
			buff1 = buff2;
			buff2 = buffT;
		}
	}

	if (pbuff != (mlib_s32 *)buff_arr)
		__mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
Esempio n. 15
0
mlib_status
mlib_v_conv5x5_8nw_4(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 scalef_expon)
{
/* pointers to dst row */
	mlib_u8 *da, *d_a;

/* pointers to src, dst data */
	mlib_u8 *adr_dst, *adr_src, *dend;

/* pointers to src rows */
	mlib_u8 *sa, *sa1, *sa2, *sa3, *sa4;

/* pointers to rows in interm. src buf */
	mlib_d64 *buff_src, *sbuf1, *sbuf2, *prow;

/* pointers to rows in interm. src buf */
	mlib_d64 *sbuf3, *sbuf4, *sbuf5;

/* pointer to row in interm. dst buf */
	mlib_d64 *dbuf, *dbuf1;

/* mlib_d64 pointers to rows in interm. src buf */
	mlib_d64 *s1, *s2, *s3, *s4, *s5;

/* mlib_d64 pointer to row in interm. dst buf */
	mlib_d64 *ddst;

/* data */
	mlib_d64 d1, d2, d3, d4, d5;

/* data */
	mlib_d64 d11, d12, d13, d14, d15;

/* data */
	mlib_d64 d21, d22, d23, d24, d25;

/* data */
	mlib_d64 dt_1, dt_2, dt_3, dt_4, dt_5;
	mlib_f32 k1k2, k3k4, k5k6, k7k8;
	mlib_f32 k9k10, k11k12, k13k14, k15k16;
	mlib_f32 k17k18, k19k20, k21k22, k23k24, k25;

/* src, dst and interm. buf. strides */
	mlib_s32 dlb, slb, buf_slb;
	mlib_s32 dh, dw;
	mlib_d64 out0, out1;
	mlib_d64 tmp0, tmp1, rnd;
	mlib_d64 *dsa, *dp;
	mlib_d64 sd0, sd1;
	mlib_s32 emask;
	mlib_s32 rval, gsr_scale, i, j;

	gsr_scale = 31 - scalef_expon;
	vis_write_gsr((gsr_scale << 3));
	rval = mlib_round_8[gsr_scale];
	rnd = vis_freg_pair(vis_to_float(rval), vis_to_float(rval));

	GET_SRC_DST_PARAMETERS();
	LOAD_KERNEL_INTO_FLOAT();

	buf_slb = (4 * dw + 24) >> 3;
	PREPARE_INTERM_BUFFERS();

	dw -= 4;
	dw *= 4;
	dh -= 4;

	sa = adr_src;
	sa1 = sa + slb;
	sa2 = sa1 + slb;
	sa3 = sa2 + slb;
	sa4 = sa3 + slb;
	d_a = adr_dst + 2 * dlb + 8;

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf2, sa);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(16);

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf3, sa1);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(16);

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf4, sa2);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(16);

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf5, sa3);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(16);

#pragma pipeloop(0)
	for (j = 0; j < dh; j++) {
		LOOP_INI();

		PREPARE_TO_LOAD_LINE(sbuf5, sa4);
#pragma pipeloop(0)
		LOAD_LINE_INTO_BUFFER_NF(16);

		vis_alignaddr(s1, 4);
		dbuf1 = dbuf;
		d1 = *s1;
		d2 = *s2;
		d3 = *s3;
		d11 = *(s1 + 1);
		d12 = *(s2 + 1);
		d13 = *(s3 + 1);

#pragma pipeloop(0)
		for (i = 0; i < dw; i += 8) {
			d21 = *(s1 + 2);
			d22 = *(s2 + 2);
			d23 = *(s3 + 2);
			out0 = out1 = rnd;
			CONV_AU(d1, k1k2);
			CONV_AL(d2, k5k6);
			CONV_AU(d3, k11k12);
			dt_1 = vis_faligndata(d1, d11);
			dt_2 = vis_faligndata(d2, d12);
			dt_3 = vis_faligndata(d3, d13);
			CONV_AL(dt_1, k1k2);
			CONV_AU(dt_2, k7k8);
			CONV_AL(dt_3, k11k12);
			CONV_AU(d11, k3k4);
			CONV_AL(d12, k7k8);
			CONV_AU(d13, k13k14);
			dt_1 = vis_faligndata(d11, d21);
			dt_2 = vis_faligndata(d12, d22);
			dt_3 = vis_faligndata(d13, d23);
			CONV_AL(dt_1, k3k4);
			CONV_AU(dt_2, k9k10);
			CONV_AL(dt_3, k13k14);
			CONV_AU(d21, k5k6);
			CONV_AL(d22, k9k10);
			CONV_AU(d23, k15k16);
			dbuf1[0] = out0;
			dbuf1[1] = out1;
			dbuf1 += 2;
			d1 = d11;
			d2 = d12;
			d3 = d13;
			d11 = d21;
			d12 = d22;
			d13 = d23;
			s1++;
			s2++;
			s3++;
		}

		dbuf1 = dbuf;
		d4 = *s4;
		d5 = *s5;
		d14 = *(s4 + 1);
		d15 = *(s5 + 1);

#pragma pipeloop(0)
		for (i = 0; i < dw; i += 8) {
			d24 = *(s4 + 2);
			d25 = *(s5 + 2);
			out0 = dbuf1[0];
			out1 = dbuf1[1];
			CONV_AL(d4, k15k16);
			CONV_AU(d5, k21k22);
			dt_4 = vis_faligndata(d4, d14);
			dt_5 = vis_faligndata(d5, d15);
			CONV_AU(dt_4, k17k18);
			CONV_AL(dt_5, k21k22);
			CONV_AL(d14, k17k18);
			CONV_AU(d15, k23k24);
			dt_4 = vis_faligndata(d14, d24);
			dt_5 = vis_faligndata(d15, d25);
			CONV_AU(dt_4, k19k20);
			CONV_AL(dt_5, k23k24);
			CONV_AL(d24, k19k20);
			CONV_AU(d25, k25);
			dbuf1 += 2;
			(*ddst++) = vis_fpack16_pair(out0, out1);
			d4 = d14;
			d5 = d15;
			d14 = d24;
			d15 = d25;
			s4++;
			s5++;
		}

		PREPARE_TO_COPY_INTERM_BUF_TO_DST();

#pragma pipeloop(0)
		COPY_INTERM_BUF_TO_DST();
		COPY_TAIL();

		sa4 = sa4 + slb;
		d_a += dlb;
	}

	__mlib_free(buff_src);
	return (MLIB_SUCCESS);
}
mlib_status
mlib_m_conv3x3_16nw_4(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
	__m64 buff_loc[6 * BUFF_LINE], *pbuff = buff_loc;
	__m64 *buff0, *buff1, *buff2, *buffT;
	GET_SRC_DST_PARAMETERS(mlib_s16);
	__m64 ker1, ker2, ker3, ker4, ker5, ker6, ker7, ker8, ker9;
	__m64 d0, d1, d2, rr, tmpa, tmpb;
	__m64 prev0h, prev1h, sum0h, sum1h, sum2h, tmph;
	__m64 prev0l, prev1l, sum0l, sum1l, sum2l, tmpl;
	__m64 *sp, *dp;
	mlib_s32 shift;
	mlib_s32 row, wid4, i, j;

	width -= 2;
	height -= 2;
	width *= NCHAN;
	dl += dll + NCHAN;

	wid4 = (width + 3) / 4;

	if (wid4 > BUFF_LINE) {
		pbuff = mlib_malloc(sizeof (__m64) * 6 * wid4);
	}

	GET_KERN();

	buff0 = pbuff;
	buff1 = buff0 + 2 * wid4;
	buff2 = buff1 + 2 * wid4;

	for (j = 0; j < 2; j++) {
		sp = (__m64 *) sl;

		d1 = (*sp++);
		d2 = (*sp++);
		for (i = 0; i < wid4; i++) {
			PREP_3x3(i);
		}

		sl += sll;

		if (j == 0) {
			buffT = buff1;
			buff1 = buff0;
			buff0 = buffT;
		}
	}

	for (row = 0; row < height; row++) {
		sp = (__m64 *) sl;
		dp = (__m64 *) dl;

		d1 = (*sp++);
		d2 = (*sp++);
		for (i = 0; i < width / 4; i++) {
			CONV_3x3(i);
			dp[i] = rr;
		}

		if (width & 3) {
			__m64 mask =
			    ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

			CONV_3x3(i);

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, rr),
			    _mm_andnot_si64(mask, dp[i]));
		}

		buffT = buff1;
		buff1 = buff0;
		buff0 = buffT;

		sl += sll;
		dl += dll;
	}

	_mm_empty();

	if (pbuff != buff_loc)
		mlib_free(pbuff);

	return (MLIB_SUCCESS);
}