mlib_status mlib_ImageConvKernelConvert(mlib_s32       *ikernel,
                                        mlib_s32       *iscale,
                                        const mlib_d64 *fkernel,
                                        mlib_s32       m,
                                        mlib_s32       n,
                                        mlib_type      type)
{
  mlib_d64 sum_pos, sum_neg, sum, norm, max, f;
  mlib_s32 isum_pos, isum_neg, isum, test;
  mlib_s32 i, scale, scale1, chk_flag;

  if (ikernel == NULL || iscale == NULL || fkernel == NULL || m < 1 || n < 1) {
    return MLIB_FAILURE;
  }

  if ((type == MLIB_BYTE) || (type == MLIB_SHORT) || (type == MLIB_USHORT)) {

    if (type != MLIB_SHORT) {               /* MLIB_BYTE, MLIB_USHORT */
      sum_pos = 0;
      sum_neg = 0;

      for (i = 0; i < m * n; i++) {
        if (fkernel[i] > 0)
          sum_pos += fkernel[i];
        else
          sum_neg -= fkernel[i];
      }

      sum = (sum_pos > sum_neg) ? sum_pos : sum_neg;
      scale = mlib_ilogb(sum);
      scale++;

      scale = 31 - scale;
    }
    else {                                  /* MLIB_SHORT */
      sum = 0;
      max = 0;

      for (i = 0; i < m * n; i++) {
        f = mlib_fabs(fkernel[i]);
        sum += f;
        max = (max > f) ? max : f;
      }

      scale1 = mlib_ilogb(max) + 1;
      scale = mlib_ilogb(sum);
      scale = (scale > scale1) ? scale : scale1;
      scale++;

      scale = 32 - scale;
    }

    if (scale <= 16)
      return MLIB_FAILURE;
    if (scale > 31)
      scale = 31;

    *iscale = scale;

    chk_flag = mlib_ImageConvVersion(m, n, scale, type);

    if (!chk_flag) {
      norm = (1u << scale);
      for (i = 0; i < m * n; i++) {
        CLAMP_S32(ikernel[i], fkernel[i] * norm);
      }

      return MLIB_SUCCESS;
    }

    /* try to round coefficients */
#ifdef __sparc
    scale1 = 16;                            /* shift of coefficients is 16 */
#else

    if (chk_flag == 3)
      scale1 = 16;                          /* MMX */
    else
      scale1 = (type == MLIB_BYTE) ? 8 : 16;
#endif /* __sparc */
    norm = (1u << (scale - scale1));

    for (i = 0; i < m * n; i++) {
      if (fkernel[i] > 0)
        ikernel[i] = (mlib_s32) (fkernel[i] * norm + 0.5);
      else
        ikernel[i] = (mlib_s32) (fkernel[i] * norm - 0.5);
    }

    isum_pos = 0;
    isum_neg = 0;
    test = 0;

    for (i = 0; i < m * n; i++) {
      if (ikernel[i] > 0)
        isum_pos += ikernel[i];
      else
        isum_neg -= ikernel[i];
    }

    if (type == MLIB_BYTE || type == MLIB_USHORT) {
      isum = (isum_pos > isum_neg) ? isum_pos : isum_neg;

      if (isum >= (1 << (31 - scale1)))
        test = 1;
    }
    else {
      isum = isum_pos + isum_neg;

      if (isum >= (1 << (32 - scale1)))
        test = 1;
      for (i = 0; i < m * n; i++) {
        if (abs(ikernel[i]) >= (1 << (31 - scale1)))
          test = 1;
      }
    }

    if (test == 1) {                        /* rounding according scale1 cause overflow, truncate instead of round */
      for (i = 0; i < m * n; i++)
        ikernel[i] = (mlib_s32) (fkernel[i] * norm) << scale1;
    }
    else {                                  /* rounding is Ok */
      for (i = 0; i < m * n; i++)
        ikernel[i] = ikernel[i] << scale1;
    }

    return MLIB_SUCCESS;
  }
  else if ((type == MLIB_INT) || (type == MLIB_BIT)) {
    max = 0;

    for (i = 0; i < m * n; i++) {
      f = mlib_fabs(fkernel[i]);
      max = (max > f) ? max : f;
    }

    scale = mlib_ilogb(max);

    if (scale > 29)
      return MLIB_FAILURE;

    if (scale < -100)
      scale = -100;

    *iscale = 29 - scale;
    scale = 29 - scale;

    norm = 1.0;
    while (scale > 30) {
      norm *= (1 << 30);
      scale -= 30;
    }

    norm *= (1 << scale);

    for (i = 0; i < m * n; i++) {
      if (fkernel[i] > 0) {
        CLAMP_S32(ikernel[i], fkernel[i] * norm + 0.5);
      }
      else {
        CLAMP_S32(ikernel[i], fkernel[i] * norm - 0.5);
      }
    }

    return MLIB_SUCCESS;
  }
  else {
    return MLIB_FAILURE;
  }
}
mlib_status
mlib_c_conv2x2ext_u8(
    mlib_image *dst,
    const mlib_image *src,
    mlib_s32 dx_l,
    mlib_s32 dx_r,
    mlib_s32 dy_t,
    mlib_s32 dy_b,
    const mlib_s32 *kern,
    mlib_s32 scalef_expon,
    mlib_s32 cmask)
{
	mlib_d64 buff_arr[4 * BUFF_LINE];
	mlib_s32 *pbuff = (mlib_s32 *)buff_arr, *buff0, *buff1, *buff2, *buffT;
	DTYPE *adr_src, *sl, *sp, *sl1;
	DTYPE *adr_dst, *dl, *dp;
	mlib_d64 k0, k1, k2, k3, scalef = 1.0;
	mlib_d64 p00, p01, p02, p10, p11, p12;
	mlib_s32 wid, hgt, sll, dll, wid1;
	mlib_s32 nchannel, chan1, chan2;
	mlib_s32 i, j, c, swid;

	LOAD_KERNEL_INTO_DOUBLE();
	GET_SRC_DST_PARAMETERS(DTYPE);

	vis_write_gsr(23 << 3);

	swid = wid + D_KER;

	wid1 = (swid + 1) & ~1;

	if (wid1 > BUFF_LINE) {
		pbuff = __mlib_malloc(4 * sizeof (mlib_s32) * wid1);

		if (pbuff == NULL)
			return (MLIB_FAILURE);
	}

	buff0 = pbuff;
	buff1 = buff0 + wid1;
	buff2 = buff1 + wid1;

	chan1 = nchannel;
	chan2 = chan1 + chan1;

	swid -= dx_r;

	for (c = 0; c < nchannel; c++) {
		if (!(cmask & (1 << (nchannel - 1 - c))))
			continue;

		sl = adr_src + c;
		dl = adr_dst + c;

		if ((hgt - dy_b) > 0)
			sl1 = sl + sll;
		else
			sl1 = sl;

#pragma pipeloop(0)
		for (i = 0; i < swid; i++) {
			buff0[i - 1] = (mlib_s32)sl[i * chan1];
			buff1[i - 1] = (mlib_s32)sl1[i * chan1];
		}

		if (dx_r != 0) {
			buff0[swid - 1] = buff0[swid - 2];
			buff1[swid - 1] = buff1[swid - 2];
		}

		if ((hgt - dy_b) > 1)
			sl = sl1 + sll;
		else
			sl = sl1;

		for (j = 0; j < hgt; j++) {
			sp = sl;
			dp = dl;

			buff2[-1] = (mlib_s32)sp[0];
			sp += chan1;

			p02 = buff0[-1];
			p12 = buff1[-1];

#pragma pipeloop(0)
			for (i = 0; i <= (wid - 2); i += 2) {
				d64_2x32 sd0, sd1;
				d64_2x32 dd0, dd1;

				p00 = p02;
				p10 = p12;

				sd0.d64 = *(TYPE_64BIT *) (buff0 + i);
				sd1.d64 = *(TYPE_64BIT *) (buff1 + i);
				p01 = (mlib_d64)sd0.i32s.i0;
				p02 = (mlib_d64)sd0.i32s.i1;
				p11 = (mlib_d64)sd1.i32s.i0;
				p12 = (mlib_d64)sd1.i32s.i1;

				LOAD_BUFF(buff2);

				dd0.i32s.i0 =
				    CLAMP_S32(p00 * k0 + p01 * k1 + p10 * k2 +
				    p11 * k3);
				dd0.i32s.i1 =
				    CLAMP_S32(p01 * k0 + p02 * k1 + p11 * k2 +
				    p12 * k3);

				dd1.d64 = vis_fpack32(dd1.d64, dd0.d64);

				STORE2(dd1.i32s.i0, dd1.i32s.i1);

				sp += chan2;
				dp += chan2;
			}

			for (; i < wid; i++) {
				d64_2x32 dd0, dd1;

				p00 = buff0[i - 1];
				p10 = buff1[i - 1];
				p01 = buff0[i];
				p11 = buff1[i];

				buff2[i] = (mlib_s32)sp[0];

				dd0.i32s.i1 =
				    CLAMP_S32(p00 * k0 + p01 * k1 + p10 * k2 +
				    p11 * k3);
				dd1.d64 = vis_fpack32(dd1.d64, dd0.d64);
				dp[0] = dd1.i32s.i1;

				sp += chan1;
				dp += chan1;
			}

			if (dx_r != 0)
				buff2[swid - 1] = buff2[swid - 2];

			if (j < hgt - dy_b - 2)
				sl += sll;
			dl += dll;

			buffT = buff0;
			buff0 = buff1;
			buff1 = buff2;
			buff2 = buffT;
		}
	}

	if (pbuff != (mlib_s32 *)buff_arr)
		__mlib_free(pbuff);

	return (MLIB_SUCCESS);
}