/*
 * __mlib_GraphicsDrawArc_AB_32
 *
 * Entry point for drawing an arc of radius r around (x, y), between the
 * angles t1 and t2, into a 32-bit-per-pixel image with color c and
 * alpha a.
 * NOTE(review): the "_AB" suffix presumably stands for antialiasing plus
 * alpha blending; the per-pixel work is hidden inside the PROC_OCT and
 * BLE32 macros, which are defined outside this function - confirm there.
 *
 * Returns
 *   MLIB_NULLPOINTER  the image has no data pointer
 *   MLIB_FAILURE      r < 0, or the scratch buffer could not be allocated
 *   MLIB_SUCCESS      otherwise; when |t1 - t2| >= PIx2 the result of
 *                     __mlib_GraphicsDrawCircle_AB_32 is returned instead
 *
 * Many locals below (cx, cy, del, mask, sin1, ..., c0, c1, c2, ...) are
 * never named in the visible statements; presumably they are used by the
 * macros, so they must keep these exact names.
 */
mlib_status
__mlib_GraphicsDrawArc_AB_32(
    mlib_image *buffer,
    mlib_s16 x,
    mlib_s16 y,
    mlib_s32 r,
    mlib_f32 t1,
    mlib_f32 t2,
    mlib_s32 c,
    mlib_s32 a)
{
/* row stride expressed in 32-bit pixels rather than bytes */
	mlib_s32 stride = mlib_ImageGetStride(buffer) / sizeof (mlib_s32);
/* width/height hold the largest valid coordinate (size - 1) */
	mlib_s32 width = mlib_ImageGetWidth(buffer) - 1;
	mlib_s32 height = mlib_ImageGetHeight(buffer) - 1;
	mlib_s32 *data = mlib_ImageGetData(buffer);
	mlib_s32 *line0, *line;
	mlib_s32 cx, cy, del, mask;

	mlib_s32 sin1, cos1, sin2, cos2, oct1, oct2, flagc, flagd;
/* buf0 is the on-stack scratch area used while r <= RADMAX */
	pinfo buf0[BUFSIZE], *buf = 0;
	mlib_s32 count, scount;
	mlib_s32 start, end;

	mlib_s32 ind0, ind1, ind2, mdel, k;
	mlib_d64 c0, c1, c2;
	mlib_f32 cf;
	mlib_d64 d_one = ((mlib_d64 *)mlib_v_TabAlias)[0];

	mlib_s32 a1;
	mlib_f32 fa;
	mlib_d64 da, da1, dc;
	mlib_f32 falpha = vis_to_float(0xFF000000);

/* keep only the low 8 bits of alpha; a1 is its 8-bit complement */
	a &= 0xff;
	a1 = ~a & 0xff;

/*
 * Pre-packed VIS operands built from alpha, its complement and the color.
 * NOTE(review): presumably consumed by the blending macros - confirm.
 */
	fa = vis_to_float((a << 16) | (a << 8) | a);
	da = vis_to_double((a << 6), (a << 22) | (a << 6));
	da1 = vis_to_double((a1 << 6), (a1 << 22) | (a1 << 6));
	dc = vis_fmul8x16al(vis_to_float(c), vis_to_float(0x4000));
	vis_write_gsr((1 << 3) + 0);

/* force the top byte of the color to 0xFF */
	c |= 0xFF000000;
	cf = vis_to_float(c);
	if (!data)
		return (MLIB_NULLPOINTER);

	if (r < 0)
		return (MLIB_FAILURE);

/* NOTE(review): macro defined elsewhere; it may return early - confirm */
	CHECK_INTERSECTION;

/* zero radius: at most the single center pixel is touched */
	if (r == 0) {
		if (INSIDE(x, y))
			BLE32((data + (stride * y + x)), 0);
		return (MLIB_SUCCESS);
	}

/* a span of a full turn or more is handed to the circle routine */
	if (mlib_fabs(t1 - t2) >= PIx2)
		return (__mlib_GraphicsDrawCircle_AB_32(buffer, x, y, r, c, a));
/* replace (t1, t2) by (-t2, -t1) */
	{
		mlib_f32 tt = t1;

		t1 = -t2;
		t2 = -tt;
	}

/* make sure t2 is not below t1 by adding one full turn */
	if (t1 > t2)
		t2 += PIx2;

/* pointer to the pixel at the arc center */
	line0 = data + stride * y + x;

/* radii above RADMAX need a heap buffer of r entries */
	if (r > RADMAX) {
		buf = (pinfo *) __mlib_malloc(sizeof (pinfo) * r);

		if (!buf)
			return (MLIB_FAILURE);
	} else
		buf = buf0;

/* r is non-zero here because r == 0 returned above */
	k = (0x100000 / r);

	FILL_BUF;

	GET_BORDERS;

	FILL_FLAGS;

	FILL_CL_FLAGS;

/*
 * Eight PROC_OCT passes; the fifth argument takes each value 0..7 exactly
 * once (presumably the octant number). The first four use count and
 * "cos", the last four use scount and "sin".
 */
	start = 0;
	end = count;
	PROC_OCT(y, y - height, x - width, x, 2, cos, 1, 2, stride, -1);
	start = 1;
	end = count;
	PROC_OCT(y, y - height, -x, width - x, 1, cos, 2, 1, stride, 1);
	start = 0;
	end = count;
	PROC_OCT(height - y, -y, x - width, x, 5, cos, 2, 1, -stride, -1);
	start = 1;
	end = count;
	PROC_OCT(height - y, -y, -x, width - x, 6, cos, 1, 2, -stride, 1);

	start = 1;
	end = scount;
	PROC_OCT(x, x - width, y - height, y, 3, sin, 2, 1, 1, -stride);
	start = 0;
	end = scount;
	PROC_OCT(x, x - width, -y, height - y, 4, sin, 1, 2, 1, stride);
	start = 1;
	end = scount;
	PROC_OCT(width - x, -x, y - height, y, 0, sin, 1, 2, -1, -stride);
	start = 0;
	end = scount;
	PROC_OCT(width - x, -x, -y, height - y, 7, sin, 2, 1, -1, stride);

/* release the scratch buffer only if it came from the heap */
	if (buf != buf0)
		__mlib_free(buf);

	return (MLIB_SUCCESS);
}
/*
 * __mlib_VectorSumAbsDiff_S32_Sat
 *
 * Writes to *z the sum of |x[i] - y[i]| over the first n elements of two
 * signed 32-bit vectors.
 *
 * Returns MLIB_FAILURE when n <= 0 (leaving *z untouched), otherwise
 * MLIB_SUCCESS.
 *
 * Layout of the work:
 *   head    elements before x reaches a 16-byte boundary (scalar)
 *   blocks  groups of four elements handled with SSE2
 *   tail    leftover elements after the last full group (scalar)
 * When not even one full group is available everything is done in the
 * scalar loop.
 *
 * The vector path computes each difference modulo 2^32, conditionally
 * negates it, and zero-extends the 32-bit result into 64-bit lanes, so a
 * difference is accumulated as an unsigned 32-bit magnitude.
 *
 * NOTE(review): the head count is derived from the byte offset of x
 * divided by sizeof (mlib_s32); this presumably assumes x is at least
 * 4-byte aligned, since the vector loop uses an aligned load on it.
 */
mlib_status
__mlib_VectorSumAbsDiff_S32_Sat(
    mlib_d64 *z,
    const mlib_s32 *x,
    const mlib_s32 *y,
    mlib_s32 n)
{
    const mlib_s32 *xp = x;
    const mlib_s32 *yp = y;
    mlib_s32 lanes, xoff, yoff, head, blocks, tail, k;
    __m128i zeros, acc, xv, yv, neg, diff;
    mlib_d64 total = 0.0;
    long long halves[2];

    if (n <= 0) {
        return (MLIB_FAILURE);
    }

    zeros = _mm_setzero_si128();
    acc = zeros;

    /* number of 32-bit elements in one 128-bit register */
    lanes = 16 / sizeof (mlib_s32);
    xoff = (mlib_addr)x & 15;
    yoff = (mlib_addr)y & 15;
    head = ((16 - xoff) & 15) / sizeof (mlib_s32);
    blocks = (n - head) / lanes;
    tail = n - head - blocks * lanes;

    /* too short for a single vector iteration: plain scalar sum */
    if (blocks < 1) {
        for (k = 0; k < n; k++) {
            total += mlib_fabs((mlib_d64)xp[k] - yp[k]);
        }
        *z = total;
        return (MLIB_SUCCESS);
    }

    /* scalar elements that precede the first 16-byte boundary of x */
    for (k = 0; k < head; k++) {
        total += mlib_fabs((mlib_d64)(*xp) - (*yp));
        xp++;
        yp++;
    }

    if (xoff == yoff) {
        /* y shares the alignment of x: aligned loads for both */
        for (k = blocks; k > 0; k--) {
            xv = _mm_load_si128((const __m128i *)xp);
            yv = _mm_load_si128((const __m128i *)yp);
            /* all-ones in lanes where y > x, i.e. x - y is negative */
            neg = _mm_cmpgt_epi32(yv, xv);
            diff = _mm_sub_epi32(xv, yv);
            /* (d ^ m) - m negates d exactly where m is all-ones */
            diff = _mm_sub_epi32(_mm_xor_si128(diff, neg), neg);
            /* zero-extend to 64 bits and accumulate */
            acc = _mm_add_epi64(acc, _mm_unpacklo_epi32(diff, zeros));
            acc = _mm_add_epi64(acc, _mm_unpackhi_epi32(diff, zeros));
            xp += lanes;
            yp += lanes;
        }
    } else {
        /* y is aligned differently from x: unaligned load for y */
        for (k = blocks; k > 0; k--) {
            xv = _mm_load_si128((const __m128i *)xp);
            yv = _mm_loadu_si128((const __m128i *)yp);
            neg = _mm_cmpgt_epi32(yv, xv);
            diff = _mm_sub_epi32(xv, yv);
            diff = _mm_sub_epi32(_mm_xor_si128(diff, neg), neg);
            acc = _mm_add_epi64(acc, _mm_unpacklo_epi32(diff, zeros));
            acc = _mm_add_epi64(acc, _mm_unpackhi_epi32(diff, zeros));
            xp += lanes;
            yp += lanes;
        }
    }

    /* scalar elements left after the last full group */
    for (k = 0; k < tail; k++) {
        total += mlib_fabs((mlib_d64)(*xp) - (*yp));
        xp++;
        yp++;
    }

    /* fold the two 64-bit lane sums into the floating-point total */
    _mm_storeu_si128((__m128i *)halves, acc);
    total += halves[0];
    total += halves[1];
    *z = total;

    return (MLIB_SUCCESS);
}
/*
 * FUNC(MxN)
 *
 * m x n kernel pass over a single-channel MLIB_BYTE source whose output
 * rows are produced by a colormap-aware routine selected from
 * func_dm_arr.
 * NOTE(review): the exact operation (the names suggest convolution or
 * dithering onto an indexed/bit image) depends on FUNC, func_m_arr and
 * func_dm_arr, all defined elsewhere - confirm there.
 *
 * Parameters
 *   dst, src   destination and source images
 *   kernel     m * n integer coefficients
 *   m, n       kernel width and height
 *   dm, dn     kernel position inside the m x n window; both are mirrored
 *              (m - 1 - dm, n - 1 - dn) before use
 *   scale      coefficients are multiplied by 2^(-scale)
 *   colormap   opaque colormap handle (LUT offset, inverse table and
 *              search method are read from it)
 *
 * Returns
 *   MLIB_FAILURE     source is not single-channel MLIB_BYTE, or an
 *                    allocation failed
 *   MLIB_OUTOFRANGE  the scaled coefficients are too large for the 16-bit
 *                    fixed-point representation
 *   MLIB_SUCCESS     otherwise
 *
 * Only the first m * dn + dm coefficients of the reversed kernel are
 * loaded (dn, dm being the mirrored values).
 */
mlib_status FUNC(
    MxN) (
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 m,
    mlib_s32 n,
    mlib_s32 dm,
    mlib_s32 dn,
    mlib_s32 scale,
    const void *colormap)
{
	mlib_type stype, dtype;
	mlib_u8 *sl, *dl;
	mlib_u8 *lut_table;
	mlib_s32 offset, off, kw, dn1;
	mlib_s32 schan, dchan, sll, dll, sw, sh, dw, dh;
	mlib_s32 row, i, j, bsize, buff_ind = 0, func_ind, method;
	mlib_u16 *pbuff, *buff_lcl[2 * MAX_N], **buff_arr = buff_lcl, **buff;
	mlib_d64 *buffd;
	mlib_d64 kern_lcl[MAX_N * MAX_M], *kern = kern_lcl, *pkern;
	mlib_d64 dscale;
	func_dm_type func_dm;

	mlib_s32 vis_scale, kern_i;
	mlib_s32 kern_size, isum;
	mlib_d64 sum, norm;
	mlib_f32 fscale;
	mlib_s32 bit_offset;
	mlib_u8 *buff_dst;

	MLIB_IMAGE_GET_ALL_PARAMS(dst, dtype, dchan, dw, dh, dll, dl);
	MLIB_IMAGE_GET_ALL_PARAMS(src, stype, schan, sw, sh, sll, sl);
	bit_offset = mlib_ImageGetBitOffset(dst);

	if (!(stype == MLIB_BYTE && schan == 1)) {
		return (MLIB_FAILURE);
	}
#if 0
	for (i = 0; i <= m * dn + dm; i++) {
		if (kernel[i])
			return (MLIB_FAILURE);
	}

#endif /* 0 */

/* mirror the kernel position */
	dn = n - 1 - dn;
	dm = m - 1 - dm;
	kern_size = m * dn + dm;

/*
 * Kernels larger than the on-stack arrays get one heap block that holds
 * both the coefficients and the row-pointer table (buff_arr follows the
 * n * m coefficients).
 */
	if (n > MAX_N || m > MAX_M) {
		kern =
		    __mlib_malloc(n * m * sizeof (mlib_d64) +
		    2 * n * sizeof (mlib_u16 *));

		if (kern == NULL)
			return (MLIB_FAILURE);
		buff_arr = (mlib_u16 **)(kern + n * m);
	}

/* dscale = 2^(-scale), built in steps so that no shift exceeds 30 bits */
	dscale = 1.0;
	while (scale > 30) {
		dscale *= 1.0 / (1 << 30);
		scale -= 30;
	}

	dscale /= (1 << scale);

/* load kernel */
/* coefficients are read back to front, starting from the last one */
	kernel += m * n - 1;
	sum = 0;
	for (i = 0; i < kern_size; i++) {
		kern[i] = dscale * kernel[-i];
		sum += mlib_fabs(kern[i]);
	}

	vis_scale = mlib_ilogb(sum);

	if (vis_scale > 13) {
/*
 * Coefficients do not fit: release the heap kernel block (if one was
 * taken above) before leaving, as every other exit path does.
 */
		if (kern != kern_lcl)
			__mlib_free(kern);
		return (MLIB_OUTOFRANGE);
	}
	vis_scale = 14 - vis_scale;

	if (vis_scale > 15)
		vis_scale = 15;
	norm = 32768 >> (15 - vis_scale);
	isum = 0;
/* round each coefficient to 16 bits and replicate it into both halves */
	for (i = 0; i < kern_size; i++) {
		if (kern[i] > 0.0) {
			kern_i = (mlib_s32)(kern[i] * norm + 0.5);
		} else {
			kern_i = (mlib_s32)(kern[i] * norm - 0.5);
		}

		isum += abs(kern_i);
		kern[i] = vis_to_double_dup((kern_i << 16) | (kern_i & 0xffff));
	}

/* recalc without rounding */
/* rounding pushed the absolute sum past 32767, so truncate instead */
	if (isum > 32767) {
		dscale *= norm;
		for (i = 0; i < kern_size; i++) {
			kern_i = (mlib_s32)(dscale * kernel[-i]);
			kern[i] =
			    vis_to_double_dup((kern_i << 16) | (kern_i &
			    0xffff));
		}
	}

	fscale = vis_to_float(1 << (vis_scale - 1));
	vis_write_gsr(((16 - vis_scale) << 3) + 2);

	offset = mlib_ImageGetLutOffset(colormap);
	lut_table = (mlib_u8 *)mlib_ImageGetLutInversTable(colormap);

/* row length in 16-bit elements, rounded up to a multiple of 8 */
	bsize = (sw + m) * NCHAN;
	bsize = (bsize + 7) & ~7;
	dn1 = (dn) ? dn : 1;
/*
 * One block holding dn1 accumulator rows followed by one conversion row
 * (buffd), plus EXTRA_BUFF bytes.
 */
	pbuff =
	    __mlib_malloc((dn1 + 1) * bsize * sizeof (mlib_u16) + EXTRA_BUFF);

	if (pbuff == NULL) {
		if (kern != kern_lcl)
			__mlib_free(kern);
		return (MLIB_FAILURE);
	}

/*
 * The row-pointer table lists the dn1 rows twice, so buff[0 .. dn] can be
 * indexed from any starting position buff_ind without wrapping.
 */
	for (j = 0; j < dn1; j++) {
		buff_arr[dn1 + j] = buff_arr[j] = pbuff + j * bsize;
	}

	buff_ind = 0;
	buffd = (mlib_d64 *)(pbuff + dn1 * bsize);
	buff_dst = (mlib_u8 *)((mlib_u16 *)buffd + bsize);

/* clear buffer */
/* dn rows of bsize 16-bit elements, written 8 bytes at a time */
	for (i = 0; i < dn * (bsize / 4); i++) {
		((mlib_d64 *)pbuff)[i] = 0;
	}

/*
 * Pick the final-row routine: index by dm (capped at KH_MAX), shifted into
 * a second or third group of the table depending on the colormap method.
 */
	func_ind = dm;

	if (func_ind > KH_MAX)
		func_ind = KH_MAX;
	method = mlib_ImageGetMethod(colormap);

	if (method == LUT_COLOR_CUBE_SEARCH)
		func_ind += KH_MAX + 1;
	else if (method == LUT_COLOR_DIMENSIONS)
		func_ind += 2 * (KH_MAX + 1);
	func_dm = func_dm_arr[func_ind];

	for (row = 0; row < sh; row++) {
		mlib_u8 *sp = sl;

		buff = buff_arr + buff_ind;

/* convert source line */
		for (i = 0; i < sw; i++) {
			mlib_d64 ss;

			ss = LD_U8(sp, i);
			ss = vis_fmul8x16al(vis_read_lo(ss), fscale);
			ST_U16(buffd, i, ss);
		}

/*
 * Apply the dn full kernel rows, each in chunks of at most KW_MAX
 * columns. A remainder between KW_MAX + 1 and 2 * KW_MAX is split in two.
 */
		pkern = kern;
		for (j = 0; j < dn; j++) {
			for (off = 0; off < m; off += kw) {
				kw = m - off;

				if (kw > KW_MAX) {
					if (kw > 2 * KW_MAX)
						kw = KW_MAX;
					else
						kw = kw / 2;
				}

				func_m_arr[kw] (buffd, buff[j] + off * NCHAN,
				    pkern + off, sw);
			}

			pkern += m;
		}

/* final (partial) kernel row together with the colormap step */
#ifdef USE_COLOR2INDEXLINE
		func_dm(buff_dst, (void *)buffd, buff[dn] + dm * NCHAN, pkern,
		    colormap, lut_table, sw, dm, 0);
/*
 * mlib_ImageColorTrue2IndexLine_U8_BIT_1
 * (buff_dst, dl, bit_offset, sw, colormap);
 */
#else /* USE_COLOR2INDEXLINE */
		func_dm(dl, (void *)buffd, buff[dn] + dm * NCHAN, pkern,
		    colormap, lut_table, sw, dm, bit_offset);
#endif /* USE_COLOR2INDEXLINE */

/* advance the starting row of the circular buffer */
		buff_ind++;

		if (buff_ind >= dn1)
			buff_ind -= dn1;

		sl += sll;
		dl += dll;
	}

	__mlib_free(pbuff);

	if (kern != kern_lcl)
		__mlib_free(kern);

	return (MLIB_SUCCESS);
}
/*
 * __mlib_GraphicsFillEllipse_32
 *
 * Fills an ellipse centered at (x, y) in a 32-bit-per-pixel image with
 * color c. For an unrotated ellipse a and b are the half-extents along x
 * and y (the bounding box is x -/+ a, y -/+ b); t is the rotation angle.
 *
 * Special cases handled before the general algorithm:
 *   a == 0 or b == 0  a line is drawn with __mlib_GraphicsDrawLine_32
 *   a == b            delegated to __mlib_GraphicsFillCircle_32
 *   rotation that moves the longer half-axis by less than half a pixel
 *                     treated as unrotated (axes swapped when the angle
 *                     is close to a quarter turn)
 *
 * Returns
 *   MLIB_NULLPOINTER  the image has no data pointer
 *   MLIB_FAILURE      a < 0, b < 0, or the row buffer allocation failed
 *   MLIB_SUCCESS      otherwise, including when the bounding box lies
 *                     completely outside the image
 *
 * NOTE(review): the *_ELLIPSE_PHASE* macros are defined elsewhere and use
 * the locals declared here by name; presumably they fill buf with the
 * per-row horizontal offsets consumed by the fill loops at the end.
 */
mlib_status
__mlib_GraphicsFillEllipse_32(
    mlib_image *buffer,
    mlib_s16 x,
    mlib_s16 y,
    mlib_s32 a,
    mlib_s32 b,
    mlib_f32 t,
    mlib_s32 c)
{
/* row stride in 32-bit pixels */
	mlib_s32 stride = mlib_ImageGetStride(buffer) / 4;
	mlib_s32 width = mlib_ImageGetWidth(buffer);
	mlib_s32 height = mlib_ImageGetHeight(buffer);
	mlib_s32 *data = mlib_ImageGetData(buffer);
	mlib_s32 *line;
	mlib_f32 cosfi, sinfi, cosfi2, sinfi2;
	mlib_s32 doclip = 0;
	mlib_s32 left, right, bottom, top, length;
	mlib_s32 cx, cy;
	mlib_f32 a2, b2;
	mlib_s32 zeroangle = 0;
	mlib_s32 *buf, bufsize, ibuf;

	if (!data)
		return (MLIB_NULLPOINTER);

	if (a < 0 || b < 0)
		return (MLIB_FAILURE);

/* sine and cosine of the negated rotation angle */
	mlib_sincosf(-t, &sinfi, &cosfi);

/* degenerate ellipse: a segment through the center, rounded to pixels */
	if (a == 0 || b == 0) {
		mlib_s32 x1, y1, x2, y2;

		if (b == 0) {
			x1 = a * cosfi + ((cosfi > 0.0f) ? 0.5f : -0.5f);
			y1 = -a * sinfi + ((sinfi < 0.0f) ? 0.5f : -0.5f);
			x2 = -x1;
			y2 = -y1;
		} else {
			x1 = -b * sinfi + ((sinfi < 0.0f) ? 0.5f : -0.5f);
			y1 = -b * cosfi + ((cosfi < 0.0f) ? 0.5f : -0.5f);
			x2 = -x1;
			y2 = -y1;
		}

		return __mlib_GraphicsDrawLine_32(buffer, x + x1, y + y1,
		    x + x2, y + y2, c);
	}

	if (a == b)
		return (__mlib_GraphicsFillCircle_32(buffer, x, y, a, c));

	length = max(a, b);

/*
 * If the rotation displaces the longer half-axis by less than half a
 * pixel, use the unrotated algorithm; near a quarter turn swap a and b
 * first.
 */
	if (mlib_fabs(sinfi * length) < 0.5f)
		zeroangle = 1;
	else if (mlib_fabs(cosfi * length) < 0.5f) {
		mlib_s32 tmp = b;

		b = a;
		a = tmp;
		zeroangle = 1;
	}

	a2 = a * a;
	b2 = b * b;

	if (!zeroangle) {
		mlib_f32 A, B, C, D, A2, B2, C2, _2A, _2B, _2C, _4B, s, z1, z2;
		mlib_f32 fcx, fcy, _2fcx, _2fcy, fcx1, fcy1, fcx21, fcy21,
		    _2fcx1, _2fcy1, _2fcx3, _2fcy3;
		mlib_f32 d, dinside, doutside;
		mlib_f32 gradi, gradj;
		mlib_f32 d1n_n, d1n_ne, d1ne_n, d1ne_ne;
		mlib_f32 d2e_e, d2e_ne, d2ne_e, d2ne_ne;
		mlib_f32 d3e_e, d3e_se, d3se_e, d3se_se;
		mlib_f32 d4s_s, d4s_se, d4se_s, d4se_se;
		mlib_f32 grad1n_i, grad1ne_i;
		mlib_f32 grad2e_j, grad2ne_j;
		mlib_f32 grad3e_j, grad3se_j;
		mlib_f32 grad4s_i, grad4se_i;
		mlib_s32 stop1y, stop3x;
		mlib_s32 minxx, minxy, maxyx, maxyy, maxxx, maxxy, minyx, minyy;

/*
 * A, B, C, D are built from a2, b2 and the rotation.
 * NOTE(review): presumably the coefficients of the implicit form
 * A*x^2 + 2*B*x*y + C*y^2 = D - confirm against the PHASE macros.
 */
		cosfi2 = cosfi * cosfi;
		sinfi2 = sinfi * sinfi;
		A = b2 * cosfi2 + a2 * sinfi2;
		B = sinfi * cosfi * (b2 - a2);
		C = b2 * sinfi2 + a2 * cosfi2;
		D = a2 * b2;
		_2A = A * 2.0f;
		_2B = B * 2.0f;
		_2C = C * 2.0f;
		_4B = B * 4.0f;
		A2 = A * A;
		B2 = B * B;
		C2 = C * C;

/* determine the rectangle that the ellipse fits in */
		s = mlib_sqrtf(D / (A * C2 - C * B2));
		minxx = -C * s - 0.5f;
		minxy = B * s + ((B > 0.0f) ? 0.5f : -0.5f);
		maxxx = -minxx;
		maxxy = -minxy;
		s = mlib_sqrtf(D / (A2 * C - B2 * A));
		minyy = -A * s - 0.5f;
		minyx = B * s + ((B > 0.0f) ? 0.5f : -0.5f);

		maxyy = -minyy;
		maxyx = -minyx;

/* NOTE(review): stop1y/stop3x presumably mark where phases switch */
		z1 = C + B;
		z2 = A + B;
		s = mlib_sqrtf(D / (A * z1 * z1 - 2.0f * B * z1 * z2 +
		    C * z2 * z2));
		stop1y = (z2 > 0.0f) ? (z2 * s + 0.5f) : (z2 * s - 0.5f);
		z1 = C - B;
		z2 = A - B;
		s = mlib_sqrtf(D / (A * z1 * z1 + 2.0f * B * z1 * z2 +
		    C * z2 * z2));
		stop3x = (z1 > 0.0f) ? (z1 * s + 0.5f) : (z1 * s - 0.5f);

/* bounding box in image coordinates, widened by one pixel per side */
		left = minxx + x - 1;
		right = maxxx + x + 1;
		bottom = minyy + y - 1;
		top = maxyy + y + 1;

/* nothing to draw when the box misses the image entirely */
		if (right < 0 || left >= width || top < 0 || bottom >= height)
			return (MLIB_SUCCESS);

/* clip only when the box crosses an image edge */
		if (left < 0 || right >= width || bottom < 0 || top >= height)
			doclip = 1;

/* one buffer entry per row: bufsize rows on each side of the center row */
		bufsize = ((minyy > 0) ? minyy : -minyy);

		if (!(buf =
		    (mlib_s32 *)__mlib_malloc(sizeof (mlib_s32) * (bufsize * 2 +
		    1))))
			return (MLIB_FAILURE);

		cx = minxx;
		cy = minxy;

/* increments and gradients read by the ROTATED_ELLIPSE_PHASE macros */
		d4s_s = d1n_n = _2C;
		d1ne_n = d1n_ne = _2B + _2C;
		d2ne_ne = d1ne_ne = _2A + _4B + _2C;
		d3e_e = d2e_e = _2A;
		d2ne_e = d2e_ne = _2A + _2B;
		d3se_e = d3e_se = _2A - _2B;
		d4se_se = d3se_se = _2A - _4B + _2C;
		d4se_s = d4s_se = _2C - _2B;

		grad3e_j = grad2e_j = grad1n_i = _2B;
		grad1ne_i = _2A + _2B;
		grad2ne_j = _2C + _2B;
		grad3se_j = _2B - _2C;
		grad4s_i = -_2B;
		grad4se_i = _2A - _2B;

		ROTATED_ELLIPSE_PHASE1;
		ROTATED_ELLIPSE_PHASE2;
		ROTATED_ELLIPSE_PHASE3;
		ROTATED_ELLIPSE_PHASE4;
	} else {
/* Simple algorithm that fills a simple not rotated ellipse */
		mlib_s32 _2a2, _2b2;
		mlib_f32 _4a2b2;
		mlib_s32 cx1, cy1, _2cx, _2cy, _2cx1, _2cy1, _2cx3, _2cy3, cx21,
		    cy21;
		mlib_s32 d, dinside, doutside;
		mlib_s32 gradi, gradj;
		mlib_s32 d1e_e, d1e_se, d1se_e, d1se_se;
		mlib_s32 d2s_s, d2s_se, d2se_s, d2se_se;
		mlib_s32 grad1_j, grad2_i;
		mlib_s32 gradstop1;

/* axis-aligned bounding box */
		left = x - a;
		right = x + a;
		top = y + b;
		bottom = y - b;

		if (right < 0 || left >= width || top < 0 || bottom >= height)
			return (MLIB_SUCCESS);

		if (left < 0 || right >= width || bottom < 0 || top >= height)
			doclip = 1;

/* b rows above and b rows below the center row */
		bufsize = b;

		if (!(buf =
		    (mlib_s32 *)__mlib_malloc(sizeof (mlib_s32) * (b * 2 + 1))))
			return (MLIB_FAILURE);

		_2a2 = a2 * 2;
		_2b2 = b2 * 2;
		_4a2b2 = a2 * b2 * 4;

/* NOTE(review): presumably the x at which phase 1 hands over to phase 2 */
		gradstop1 = a2 / mlib_sqrtf(a2 + b2) + 0.5f;

		cx = 0;
		cy = b;

/* increments and gradients read by the SIMPLE_ELLIPSE_PHASE macros */
		d1se_e = d1e_se = d1e_e = _2b2;
		d2se_se = d1se_se = _2b2 + _2a2;
		d2se_s = d2s_se = d2s_s = _2a2;

		grad1_j = -_2a2;
		grad2_i = _2b2;

		SIMPLE_ELLIPSE_PHASE1;
		SIMPLE_ELLIPSE_PHASE2;
	}

/* fill one horizontal span per buffered row, top row first */
	{
		mlib_d64 dcolor;

/* first row covered by the buffer */
		line = data + (y - bufsize) * stride;
		DOUBLE_FROM_INT(dcolor, c);

		if (!doclip) {
/* row ibuf spans from x + buf[2 * bufsize - ibuf] to x - buf[ibuf] */
			for (ibuf = 0; 2 * bufsize - ibuf >= 0; ++ibuf) {
				mlib_s32 beg = x + buf[2 * bufsize - ibuf];
				mlib_s32 end = x - buf[ibuf];

				MLIB_FILL_ROW_32(line, beg, end, c, dcolor);
				line += stride;
			}
		} else {
/* first and last buffer rows that fall inside the image vertically */
			mlib_s32 yb = (bufsize - y > 0) ? bufsize - y : 0;
			mlib_s32 ye =
			    (bufsize >
			    height - 1 - y) ? bufsize + (height - 1 -
			    y) : bufsize * 2;

			line += yb * stride;
			for (ibuf = yb; ibuf <= ye; ++ibuf) {
				mlib_s32 end = x - buf[ibuf];
				mlib_s32 beg = x + buf[2 * bufsize - ibuf];
/* mask is all ones when end lies beyond the last column */
				mlib_s32 mask = ((width - 1) - end) >> 31;

/* branch-free clamps: negative beg becomes 0, end is capped at width - 1 */
				beg &= ~(beg >> 31);
				end = (end & ~mask) | ((width - 1) & mask);
				MLIB_FILL_ROW_32(line, beg, end, c, dcolor);
				line += stride;
			}
		}
	}

	__mlib_free(buf);
	return (MLIB_SUCCESS);
}
/*
 * __mlib_GraphicsFillArc_AB_32
 *
 * Entry point for filling the arc (sector) of radius r around (x, y)
 * between the angles t1 and t2 in a 32-bit-per-pixel image, with color c
 * and alpha a.
 * NOTE(review): the "_AB" suffix presumably stands for antialiasing plus
 * alpha blending; the per-pixel work lives in macros defined elsewhere
 * (PROC_OCT, BLEND32, DRAW_LINE_AB, ...) - confirm there.
 *
 * Visible structure: the outline is processed octant by octant with
 * PROC_OCT, the interior is filled by __mlib_GraphicsFillArc_B_32 with
 * radius r - 1 and the caller's original angles, and DRAW_LINE_AB is then
 * invoked for the two line buffers.
 *
 * Returns
 *   MLIB_NULLPOINTER  the image has no data pointer
 *   MLIB_FAILURE      r < 0, or a scratch buffer could not be allocated
 *   MLIB_SUCCESS      otherwise; results of the delegated circle, line
 *                     and fill routines are passed through
 *
 * Many locals are only referenced from inside the macros, so they must
 * keep these exact names.
 */
mlib_status
__mlib_GraphicsFillArc_AB_32(
    mlib_image *buffer,
    mlib_s16 x,
    mlib_s16 y,
    mlib_s32 r,
    mlib_f32 t1,
    mlib_f32 t2,
    mlib_s32 c,
    mlib_s32 a)
{
/* row stride in 32-bit pixels */
	mlib_s32 stride = mlib_ImageGetStride(buffer) / sizeof (mlib_s32);
/* width/height hold the largest valid coordinate (size - 1) */
	mlib_s32 width = mlib_ImageGetWidth(buffer) - 1;
	mlib_s32 height = mlib_ImageGetHeight(buffer) - 1;
	mlib_s32 lwidth = width - 1;
	mlib_s32 lheight = height - 1;
	mlib_s32 *data = mlib_ImageGetData(buffer);
	mlib_s32 *line0, *line;
	mlib_s32 cx, cy, del, mask;
	mlib_status rez;

	mlib_s32 stepsignx1, stepsignx2, stepsigny1, stepsigny2;
	mlib_s32 lineindex1 = 0, lineindex2 = 0;
	mlib_s32 ifoverlap = 0;

/* the caller's angles, kept for the interior fill call further down */
	mlib_f32 tt1 = t1, tt2 = t2;
	mlib_s32 sn1, cs1, sn2, cs2;
	mlib_s32 sin1, cos1, sin2, cos2, oct1, oct2, flagc, flagd;
	mlib_s32 xx, yy, ux, uy, dx, dy;

/* on-stack scratch areas, used while r <= RADMAX */
	pinfo buf0[BUFSIZE], *buf = 0;
	pinfo_line buf0_line1[RADMAX+1], *buf_line1 = 0;
	pinfo_line buf0_line2[RADMAX+1], *buf_line2 = 0;
	pinfo_line *buf_line;

	mlib_s32 count, scount;
	mlib_s32 start, end;

	mlib_s32 ind0, ind1, ind2, c0, c1, c2, mdel, k;
	mlib_s32 alpha0, alpha1, alpha2;
/* the three low bytes of the color, split out */
	mlib_s32 cf0 = c & 0xff, cf1 = (c & 0xff00) >> 8, cf2 =
	    (c & 0xff0000) >> 16;
	mlib_u8 cfalpha = 0xFF;
	mlib_d64 A0, A1;

/* alpha reduced to 8 bits; A1 = a / 255 and A0 = 1 - A1 */
	a &= 0xff;
	A1 = a / 255.;
	A0 = 1. - A1;

	if (!data)
		return (MLIB_NULLPOINTER);

	if (r < 0)
		return (MLIB_FAILURE);

/* NOTE(review): macro defined elsewhere; it may return early - confirm */
	CHECK_INTERSECTION;

/* zero radius: at most the single center pixel is blended */
	if (r == 0) {
		if (INSIDE(x, y)) {
			BLEND32((data + (stride * y + x))[0], c, a);
		}
		return (MLIB_SUCCESS);
	}

/* a span of a full turn or more is handed to the circle routine */
	if (mlib_fabs(t1 - t2) >= PIx2) {
		return (__mlib_GraphicsFillCircle_AB_32(buffer, x, y, r, c, a));
	}
/* replace (t1, t2) by (-t2, -t1) */
	{
		mlib_f32 tt = t1;

		t1 = -t2;
		t2 = -tt;
	}

/* make sure t2 is not below t1 by adding one full turn */
	if (t1 > t2)
		t2 += PIx2;

/* pointer to the pixel at the arc center */
	line0 = data + stride * y + x;

/*
 * Radii above RADMAX need heap buffers; each failure path releases what
 * was already allocated.
 */
	if (r > RADMAX) {
		buf = (pinfo *) __mlib_malloc(sizeof (pinfo) * r);

		if (!buf)
			return (MLIB_FAILURE);

		buf_line1 = (pinfo_line *) __mlib_malloc(sizeof (pinfo_line)
			* (r + 1));

		if (!buf_line1) {
			__mlib_free(buf);
			return (MLIB_FAILURE);
		}
		buf_line2 = (pinfo_line *) __mlib_malloc(sizeof (pinfo_line)
			* (r + 1));
		if (!buf_line2) {
			__mlib_free(buf);
			__mlib_free(buf_line1);
			return (MLIB_FAILURE);
		}
	} else {
		buf = buf0;
		buf_line1 = buf0_line1;
		buf_line2 = buf0_line2;
	}


/* r is non-zero here because r == 0 returned above */
	k = (0x100000 / r);

	FILL_BUF;

	GET_BORDERS;

/*
 * Zero-width sector: a single line from the center is drawn instead.
 * NOTE(review): cs1/sn1 are presumably set by GET_BORDERS - confirm.
 */
	if (t1 == t2) {
		FREE_MEM;
		return (__mlib_GraphicsDrawLine_AB_32(buffer, x, y,
		    x + cs1, y - sn1, c, a));
	}

	FILL_FLAGS;

	FILL_CL_FLAGS;

	SET_STEPSIGN;

	ARC_FILL_LINEBUF(1);
	ARC_FILL_LINEBUF(2);

/*
 * Eight PROC_OCT passes; the fifth argument takes each value 0..7 exactly
 * once (presumably the octant number). The first four use count and
 * "cos", the last four use scount and "sin".
 */
	start = 0;
	end = count;
	PROC_OCT(y, y - height, x - width, x, 2, cos, 1, 2, stride, -1);
	start = 1;
	end = count;
	PROC_OCT(y, y - height, -x, width - x, 1, cos, 2, 1, stride, 1);
	start = 0;
	end = count;
	PROC_OCT(height - y, -y, x - width, x, 5, cos, 2, 1, -stride, -1);
	start = 1;
	end = count;
	PROC_OCT(height - y, -y, -x, width - x, 6, cos, 1, 2, -stride, 1);

	start = 1;
	end = scount;
	PROC_OCT(x, x - width, y - height, y, 3, sin, 2, 1, 1, -stride);
	start = 0;
	end = scount;
	PROC_OCT(x, x - width, -y, height - y, 4, sin, 1, 2, 1, stride);
	start = 1;
	end = scount;
	PROC_OCT(width - x, -x, y - height, y, 0, sin, 1, 2, -1, -stride);
	start = 0;
	end = scount;
	PROC_OCT(width - x, -x, -y, height - y, 7, sin, 2, 1, -1, stride);

/* interior: radius reduced by one, using the caller's original angles */
	rez = __mlib_GraphicsFillArc_B_32(buffer, x, y, r - 1, tt1, tt2, c, a);
	if (rez != MLIB_SUCCESS) {
		FREE_MEM;
		return (rez);
	}

	DRAW_LINE_AB(1, 2);
	DRAW_LINE_AB(2, 1);

/* NOTE(review): FREE_MEM presumably releases any heap scratch buffers */
	FREE_MEM;

	return (MLIB_SUCCESS);
}
/*
 * mlib_ImageConvKernelConvert
 *
 * Converts an m x n floating-point convolution kernel into integer
 * coefficients plus a scaling exponent for the given image type.
 *
 *   ikernel  receives m * n integer coefficients
 *   iscale   receives the exponent the coefficients were scaled by
 *   fkernel  m * n floating-point coefficients
 *   type     image data type the kernel will be applied to
 *
 * MLIB_INT / MLIB_BIT:
 *   the exponent is 29 - ilogb(max |f|), with ilogb clamped from below
 *   at -100; every coefficient is scaled, rounded to nearest and clamped
 *   to 32 bits.
 *
 * MLIB_BYTE / MLIB_SHORT / MLIB_USHORT:
 *   the exponent is derived from the coefficient sums (and, for
 *   MLIB_SHORT, the largest magnitude) and capped at 31. If
 *   mlib_ImageConvVersion() returns 0 the coefficients are simply scaled
 *   and clamped. Otherwise they are rounded at a reduced precision and
 *   shifted back up; if the rounded values fail the overflow checks they
 *   are truncated instead.
 *
 * Returns MLIB_FAILURE for a NULL pointer, m < 1, n < 1, an unsupported
 * type, or a kernel whose exponent falls outside the accepted range;
 * MLIB_SUCCESS otherwise.
 */
mlib_status mlib_ImageConvKernelConvert(mlib_s32       *ikernel,
                                        mlib_s32       *iscale,
                                        const mlib_d64 *fkernel,
                                        mlib_s32       m,
                                        mlib_s32       n,
                                        mlib_type      type)
{
  mlib_d64 factor, mag;
  mlib_s32 count, k, shift, coef_shift, version, overflow;
  mlib_s32 ipos, ineg, ibound;

  if (ikernel == NULL || iscale == NULL || fkernel == NULL || m < 1 || n < 1) {
    return MLIB_FAILURE;
  }

  if (type != MLIB_BYTE && type != MLIB_SHORT && type != MLIB_USHORT) {
    mlib_d64 peak = 0;

    if (type != MLIB_INT && type != MLIB_BIT) {
      return MLIB_FAILURE;
    }

    count = m * n;

    /* largest coefficient magnitude decides the exponent */
    for (k = 0; k < count; k++) {
      mag = mlib_fabs(fkernel[k]);
      if (!(peak > mag)) {
        peak = mag;
      }
    }

    shift = mlib_ilogb(peak);

    if (shift > 29) {
      return MLIB_FAILURE;
    }

    if (shift < -100) {
      shift = -100;
    }

    shift = 29 - shift;
    *iscale = shift;

    /* factor = 2^shift, built in steps of at most 30 bits */
    factor = 1.0;
    for (; shift > 30; shift -= 30) {
      factor *= (1 << 30);
    }
    factor *= (1 << shift);

    /* round half away from zero, then clamp to 32 bits */
    for (k = 0; k < count; k++) {
      if (fkernel[k] > 0) {
        CLAMP_S32(ikernel[k], fkernel[k] * factor + 0.5);
      }
      else {
        CLAMP_S32(ikernel[k], fkernel[k] * factor - 0.5);
      }
    }

    return MLIB_SUCCESS;
  }

  /* MLIB_BYTE, MLIB_SHORT or MLIB_USHORT from here on */
  count = m * n;

  if (type == MLIB_SHORT) {
    mlib_d64 abs_sum = 0;
    mlib_d64 abs_peak = 0;

    for (k = 0; k < count; k++) {
      mag = mlib_fabs(fkernel[k]);
      abs_sum += mag;
      if (!(abs_peak > mag)) {
        abs_peak = mag;
      }
    }

    /* take the larger of the sum exponent and the peak exponent + 1 */
    coef_shift = mlib_ilogb(abs_peak) + 1;
    shift = mlib_ilogb(abs_sum);
    if (!(shift > coef_shift)) {
      shift = coef_shift;
    }

    shift = 32 - (shift + 1);
  }
  else {
    /* positive and negative coefficients are summed separately */
    mlib_d64 pos = 0;
    mlib_d64 neg = 0;

    for (k = 0; k < count; k++) {
      if (fkernel[k] > 0) {
        pos += fkernel[k];
      }
      else {
        neg -= fkernel[k];
      }
    }

    shift = mlib_ilogb((pos > neg) ? pos : neg);
    shift = 31 - (shift + 1);
  }

  if (shift <= 16) {
    return MLIB_FAILURE;
  }

  if (shift > 31) {
    shift = 31;
  }

  *iscale = shift;

  version = mlib_ImageConvVersion(m, n, shift, type);

  if (version == 0) {
    /* plain scaling with clamping, no rounding */
    factor = (1u << shift);
    for (k = 0; k < count; k++) {
      CLAMP_S32(ikernel[k], fkernel[k] * factor);
    }

    return MLIB_SUCCESS;
  }

  /* try to round coefficients */
#ifdef __sparc
  coef_shift = 16;                          /* shift of coefficients is 16 */
#else
  if (version != 3 && type == MLIB_BYTE) {
    coef_shift = 8;
  }
  else {
    coef_shift = 16;                        /* MMX, or any non-byte type */
  }
#endif /* __sparc */

  factor = (1u << (shift - coef_shift));

  /* round at reduced precision and collect the signed sums in one pass */
  ipos = 0;
  ineg = 0;
  for (k = 0; k < count; k++) {
    if (fkernel[k] > 0) {
      ikernel[k] = (mlib_s32) (fkernel[k] * factor + 0.5);
    }
    else {
      ikernel[k] = (mlib_s32) (fkernel[k] * factor - 0.5);
    }

    if (ikernel[k] > 0) {
      ipos += ikernel[k];
    }
    else {
      ineg -= ikernel[k];
    }
  }

  /* would the rounded values overflow once shifted back up? */
  overflow = 0;

  if (type == MLIB_BYTE || type == MLIB_USHORT) {
    ibound = (ipos > ineg) ? ipos : ineg;

    if (ibound >= (1 << (31 - coef_shift))) {
      overflow = 1;
    }
  }
  else {
    ibound = ipos + ineg;

    if (ibound >= (1 << (32 - coef_shift))) {
      overflow = 1;
    }

    for (k = 0; k < count; k++) {
      if (abs(ikernel[k]) >= (1 << (31 - coef_shift))) {
        overflow = 1;
      }
    }
  }

  if (overflow == 1) {
    /* rounding would overflow: truncate instead of round */
    for (k = 0; k < count; k++) {
      ikernel[k] = (mlib_s32) (fkernel[k] * factor) << coef_shift;
    }
  }
  else {
    /* rounding is fine: move the rounded values to full precision */
    for (k = 0; k < count; k++) {
      ikernel[k] <<= coef_shift;
    }
  }

  return MLIB_SUCCESS;
}
/*
 * __mlib_GraphicsDrawEllipse_X_32
 *
 * Draws the outline of an ellipse centered at (x, y) into a
 * 32-bit-per-pixel image in XOR mode: touched pixels are XORed with
 * c0 ^ c1. For an unrotated ellipse a and b are the half-extents along x
 * and y (the bounding box is x -/+ a, y -/+ b); t is the rotation angle.
 *
 * Special cases handled before the general algorithm:
 *   a == 0 or b == 0  a line is drawn with __mlib_GraphicsDrawLine_X_32
 *   a == b            delegated to __mlib_GraphicsDrawCircle_X_32
 *   rotation that moves the longer half-axis by less than half a pixel
 *                     treated as unrotated (axes swapped when the angle
 *                     is close to a quarter turn)
 *
 * Returns
 *   MLIB_NULLPOINTER  the image has no data pointer
 *   MLIB_FAILURE      a < 0 or b < 0
 *   MLIB_SUCCESS      otherwise, including when the bounding box lies
 *                     completely outside the image
 *
 * NOTE(review): the *_PHASE*_START, *_PUT_* and PHASE_END macros are
 * defined elsewhere and use the locals declared here by name.
 */
mlib_status
__mlib_GraphicsDrawEllipse_X_32(
    mlib_image *buffer,
    mlib_s16 x,
    mlib_s16 y,
    mlib_s32 a,
    mlib_s32 b,
    mlib_f32 t,
    mlib_s32 c0,
    mlib_s32 c1)
{
/* value XORed into every touched pixel */
	mlib_s32 c = c0 ^ c1;
/* row stride in 32-bit pixels */
	mlib_s32 stride = mlib_ImageGetStride(buffer) / 4;
	mlib_s32 width = mlib_ImageGetWidth(buffer);
	mlib_s32 height = mlib_ImageGetHeight(buffer);
	mlib_u32 *data = mlib_ImageGetData(buffer);
	mlib_u32 *line0, *line1;
	mlib_f32 cosfi, sinfi, cosfi2, sinfi2;
	mlib_s32 doclip = 0;
	mlib_s32 left, right, bottom, top, length;
	mlib_s32 cx, cy;
	mlib_f32 a2, b2;
	mlib_s32 zeroangle = 0;

	if (!data)
		return (MLIB_NULLPOINTER);

	if (a < 0 || b < 0)
		return (MLIB_FAILURE);

/* sine and cosine of the negated rotation angle */
	mlib_sincosf(-t, &sinfi, &cosfi);

/* degenerate ellipse: a segment through the center, rounded to pixels */
	if (a == 0 || b == 0) {
		mlib_s32 x1, y1, x2, y2;

		if (b == 0) {
			x1 = a * cosfi + ((cosfi > 0.0f) ? 0.5f : -0.5f);
			y1 = -a * sinfi + ((sinfi < 0.0f) ? 0.5f : -0.5f);
			x2 = -x1;
			y2 = -y1;
		} else {
			x1 = -b * sinfi + ((sinfi < 0.0f) ? 0.5f : -0.5f);
			y1 = -b * cosfi + ((cosfi < 0.0f) ? 0.5f : -0.5f);
			x2 = -x1;
			y2 = -y1;
		}

		return __mlib_GraphicsDrawLine_X_32(buffer, x + x1, y + y1,
		    x + x2, y + y2, c0, c1);
	}

	if (a == b)
		return (__mlib_GraphicsDrawCircle_X_32
		    (buffer, x, y, a, c0, c1));

	length = MAXI(a, b);

/*
 * If the rotation displaces the longer half-axis by less than half a
 * pixel, use the unrotated algorithm; near a quarter turn swap a and b
 * first.
 */
	if (mlib_fabs(sinfi * length) < 0.5f)
		zeroangle = 1;
	else if (mlib_fabs(cosfi * length) < 0.5f) {
		mlib_s32 tmp = b;

		b = a;
		a = tmp;
		zeroangle = 1;
	}

	a2 = a * a;
	b2 = b * b;

	if (!zeroangle) {
		mlib_f32 A, B, C, D, A2, B2, C2, _2A, _2B, _2C, _4B, s, z1, z2;
		mlib_f32 fcx, fcy, _2fcx, _2fcy, fcx1, fcy1, fcx21, fcy21,
		    _2fcx1, _2fcy1, _2fcx3, _2fcy3;
		mlib_f32 d, dinside, doutside;
		mlib_f32 gradi, gradj;
		mlib_f32 d1n_n, d1n_ne, d1ne_n, d1ne_ne;
		mlib_f32 d2e_e, d2e_ne, d2ne_e, d2ne_ne;
		mlib_f32 d3e_e, d3e_se, d3se_e, d3se_se;
		mlib_f32 d4s_s, d4s_se, d4se_s, d4se_se;
		mlib_f32 grad1n_i, grad1ne_i;
		mlib_f32 grad2e_j, grad2ne_j;
		mlib_f32 grad3e_j, grad3se_j;
		mlib_f32 grad4s_i, grad4se_i;
		mlib_s32 stop1y, stop3x;
		mlib_s32 minxx, minxy, maxyx, maxyy, maxxx, maxxy, minyx, minyy;

/*
 * A, B, C, D are built from a2, b2 and the rotation.
 * NOTE(review): presumably the coefficients of the implicit form
 * A*x^2 + 2*B*x*y + C*y^2 = D - confirm against the PHASE macros.
 */
		cosfi2 = cosfi * cosfi;
		sinfi2 = sinfi * sinfi;
		A = b2 * cosfi2 + a2 * sinfi2;
		B = sinfi * cosfi * (b2 - a2);
		C = b2 * sinfi2 + a2 * cosfi2;
		D = a2 * b2;
		_2A = A * 2.0f;
		_2B = B * 2.0f;
		_2C = C * 2.0f;
		_4B = B * 4.0f;
		A2 = A * A;
		B2 = B * B;
		C2 = C * C;

/* determine the rectangle that the ellipse fits in */
		s = mlib_sqrtf(D / (A * C2 - C * B2));
		minxx = -C * s - 0.5f;
		minxy = B * s + ((B > 0.0f) ? 0.5f : -0.5f);
		maxxx = -minxx;
		maxxy = -minxy;
		s = mlib_sqrtf(D / (A2 * C - B2 * A));
		minyy = -A * s - 0.5f;
		minyx = B * s + ((B > 0.0f) ? 0.5f : -0.5f);
		maxyy = -minyy;
		maxyx = -minyx;

/* NOTE(review): stop1y/stop3x presumably mark where phases switch */
		z1 = C + B;
		z2 = A + B;
		s = mlib_sqrtf(D / (A * z1 * z1 - 2.0f * B * z1 * z2 +
		    C * z2 * z2));
		stop1y = (z2 > 0.0f) ? (z2 * s + 0.5f) : (z2 * s - 0.5f);
		z1 = C - B;
		z2 = A - B;
		s = mlib_sqrtf(D / (A * z1 * z1 + 2.0f * B * z1 * z2 +
		    C * z2 * z2));
		stop3x = (z1 > 0.0f) ? (z1 * s + 0.5f) : (z1 * s - 0.5f);

/* bounding box in image coordinates, widened by one pixel per side */
		left = minxx + x - 1;
		right = maxxx + x + 1;
		bottom = minyy + y - 1;
		top = maxyy + y + 1;

/* nothing to draw when the box misses the image entirely */
		if (right < 0 || left >= width || top < 0 || bottom >= height)
			return (MLIB_SUCCESS);

/* clip only when the box crosses an image edge */
		if (left < 0 || right >= width || bottom < 0 || top >= height)
			doclip = 1;

/*
 * Start at offset (minxx, minxy) from the center; line0 and line1 address
 * that pixel and the one mirrored through the center.
 */
		cx = minxx;
		cy = minxy;
		line0 = data + stride * (y - minxy) + (x + minxx);
		line1 = data + stride * (y + minxy) + (x - minxx);

/* increments and gradients read by the ROTATED_ELLIPSE_PHASE macros */
		d4s_s = d1n_n = _2C;
		d1ne_n = d1n_ne = _2B + _2C;
		d2ne_ne = d1ne_ne = _2A + _4B + _2C;
		d3e_e = d2e_e = _2A;
		d2ne_e = d2e_ne = _2A + _2B;
		d3se_e = d3e_se = _2A - _2B;
		d4se_se = d3se_se = _2A - _4B + _2C;
		d4se_s = d4s_se = _2C - _2B;

		grad3e_j = grad2e_j = grad1n_i = _2B;
		grad1ne_i = _2A + _2B;
		grad2ne_j = _2C + _2B;
		grad3se_j = _2B - _2C;
		grad4s_i = -_2B;
		grad4se_i = _2A - _2B;

/* four phases; the PUT macro chosen depends on whether clipping is needed */
		if (!doclip) {
			mlib_s32 tmp0, tmp1;

			ROTATED_ELLIPSE_PHASE1_START;
			ROTATED_PUT_NOCLIP;
			PHASE_END;

			ROTATED_ELLIPSE_PHASE2_START;
			ROTATED_PUT_NOCLIP;
			PHASE_END;

			ROTATED_ELLIPSE_PHASE3_START;
			ROTATED_PUT_NOCLIP;
			PHASE_END;

			ROTATED_ELLIPSE_PHASE4_START;
			ROTATED_PUT_NOCLIP;
			PHASE_END;
		} else {
			ROTATED_ELLIPSE_PHASE1_START;
			ROTATED_PUT_CLIP;
			PHASE_END;

			ROTATED_ELLIPSE_PHASE2_START;
			ROTATED_PUT_CLIP;
			PHASE_END;

			ROTATED_ELLIPSE_PHASE3_START;
			ROTATED_PUT_CLIP;
			PHASE_END;

			ROTATED_ELLIPSE_PHASE4_START;
			ROTATED_PUT_CLIP;
			PHASE_END;
		}
	} else {
/* Simple algorithm that draws a simple not rotated ellipse */

		mlib_s32 _2a2, _2b2;
		mlib_f32 _4a2b2;
		mlib_s32 cx1, cy1, _2cx, _2cy, _2cx1, _2cy1, _2cx3, _2cy3, cx21,
		    cy21;
		mlib_s32 d, dinside, doutside;
		mlib_s32 gradi, gradj;
		mlib_s32 d1e_e, d1e_se, d1se_e, d1se_se;
		mlib_s32 d2s_s, d2s_se, d2se_s, d2se_se;
		mlib_s32 grad1_j, grad2_i;
		mlib_s32 gradstop1;

/* axis-aligned bounding box */
		left = x - a;
		right = x + a;
		top = y + b;
		bottom = y - b;

		if (right < 0 || left >= width || top < 0 || bottom >= height)
			return (MLIB_SUCCESS);

		if (left < 0 || right >= width || bottom < 0 || top >= height)
			doclip = 1;

		_2a2 = a2 * 2;
		_2b2 = b2 * 2;
		_4a2b2 = a2 * b2 * 4;

/* NOTE(review): presumably the x at which phase 1 hands over to phase 2 */
		gradstop1 = a2 / mlib_sqrtf(a2 + b2) + 0.5f;

/* start at offset (0, b); line0/line1 address rows y - b and y + b at x */
		cx = 0;
		cy = b;
		line0 = data + stride * (y - b) + x;
		line1 = data + stride * (y + b) + x;

/* increments and gradients read by the SIMPLE_ELLIPSE_PHASE macros */
		d1se_e = d1e_se = d1e_e = _2b2;
		d2se_se = d1se_se = _2b2 + _2a2;
		d2se_s = d2s_se = d2s_s = _2a2;

		grad1_j = -_2a2;
		grad2_i = _2b2;

		if (!doclip) {
			mlib_s32 tmp0, tmp1, tmp2, tmp3;

/* XOR the two starting pixels on rows y - b and y + b */
			tmp0 = *line0 ^ c;
			tmp1 = *line1 ^ c;
			*line0 = tmp0;
			*line1 = tmp1;

			SIMPLE_ELLIPSE_PHASE1_START;
			SIMPLE_PUT_NOCLIP;
			PHASE_END;

			SIMPLE_ELLIPSE_PHASE2_START;
			SIMPLE_PUT_NOCLIP;
			PHASE_END;
		} else {
			mlib_s32 h0 = 0, h1 = 0, v0 = 0, v1 = 0;

/*
 * Sign-bit range tests: the OR of a coordinate and its distance from the
 * last valid index is negative when either operand is negative, so the
 * starting pixels are XORed only when the test value is positive.
 */
			if ((x | ((width - 1) - (x + cx))) > 0) {
				h0 = (y - cy) | ((height - 1) - (y - cy));
				h1 = (y + cy) | ((height - 1) - (y + cy));

				if (h0 > 0)
					*line0 ^= c;

				if (h1 > 0)
					*line1 ^= c;
			}

			SIMPLE_ELLIPSE_PHASE1_START;
			SIMPLE_SET_CLIP_FLAGS;
			SIMPLE_PUT_CLIP;
			PHASE_END;

			SIMPLE_ELLIPSE_PHASE2_START;
			SIMPLE_SET_CLIP_FLAGS;
			SIMPLE_PUT_CLIP;
			PHASE_END;

			SIMPLE_PUT_CLIP_LAST;
		}
	}

	return (MLIB_SUCCESS);
}
mlib_status
mlib_m_ImageInitInterpTableAffine_S16(
    mlib_interp_table * table,
    mlib_s32 nchan)
{
	mlib_s32 width, height, width_bits, height_bits, vis_width_bits,
	    vis_height_bits;
	mlib_s32 subsampleBitsH, subsampleBitsV;
	mlib_s32 i, j, c, scale, num_copy, num_copy_old;
	mlib_s32 isum;
	mlib_s32 max_scale, min_scale, scaleh, scalev;
	mlib_s32 norm_scale_v, norm_scale_h;
	mlib_d64 dscale, *dataH, *dataV;
	mlib_d64 **ptr_tablex, *tablex, *tablex_old, *tabley;
	mlib_d64 max, d;
	mlib_d64 sumh, sumv, normh, normv;

	if (!table)
		return (MLIB_FAILURE);
	if (table->shift_vis_affine < 0)
		return (MLIB_FAILURE);

	if (nchan == 1) {
		num_copy = 1;
		ptr_tablex = &(table->dataH_s16_1);
	} else if (nchan == 2) {
		num_copy = 2;
		ptr_tablex = &(table->dataH_s16_3);
	} else if (nchan == 3 || nchan == 4) {
		num_copy = 4;
		ptr_tablex = &(table->dataH_s16_4);
	} else
		return (MLIB_FAILURE);

	if (*ptr_tablex != NULL && table->dataV_s16_1 != NULL)
		return (MLIB_SUCCESS);

	dataH = mlib_ImageGetInterpDoubleDataH(table);
	dataV = mlib_ImageGetInterpDoubleDataV(table);
	if (!dataH || !dataV)
		return (MLIB_FAILURE);

	width = mlib_ImageGetInterpWidth(table);
	height = mlib_ImageGetInterpHeight(table);
	width_bits = mlib_ImageGetInterpWidthBits(table);
	height_bits = mlib_ImageGetInterpHeightBits(table);
	vis_width_bits = table->vis_width_bits;
	vis_height_bits = table->vis_height_bits;
	subsampleBitsH = mlib_ImageGetInterpSubsampleBitsH(table);
	subsampleBitsV = mlib_ImageGetInterpSubsampleBitsV(table);

	if (table->dataV_s16_1 != NULL) {
		if (table->dataH_s16_1 != NULL) {
			tablex_old = table->dataH_s16_1;
			num_copy_old = 1;
		} else if (table->dataH_s16_3 != NULL) {
			tablex_old = table->dataH_s16_3;
			num_copy_old = 3;
		} else {
			tablex_old = table->dataH_s16_4;
			num_copy_old = 4;
		}

		tablex =
		    mlib_malloc(num_copy * (1 << subsampleBitsH) *
		    (1 << vis_width_bits) * sizeof (mlib_s16));
		if (tablex == NULL)
			return (MLIB_FAILURE);

		for (j = 0; j < ((width + 1) & ~1); j++) {
			mlib_s16 *tbl = (mlib_s16 *)tablex + j * num_copy;
			mlib_s16 *tbl_old =
			    (mlib_s16 *)tablex_old + j * num_copy_old;
			for (i = 0; i < (1 << subsampleBitsH); i++) {
				mlib_s16 v =
				    tbl_old[num_copy_old *
				    (i << vis_width_bits)];
				for (c = 0; c < num_copy; c++) {
					tbl[num_copy * (i << vis_width_bits) +
					    c] = v;
				}
			}
		}
		*ptr_tablex = tablex;
		return (MLIB_SUCCESS);
	}

	sumv = 0;
	max = 0;

	for (i = 0; i < (1 << subsampleBitsV); i++) {
		mlib_d64 s = 0;
		mlib_s32 ind = (i << height_bits);

		for (j = 0; j < height; j++) {
			d = mlib_fabs(dataV[j + ind]);
			s += d;
			max = (max > d) ? max : d;
		}
		sumv = (sumv > s) ? sumv : s;
	}

/* all fhkernels = 0 */
	if (sumv == 0) {
		dscale = 0;

/* X table */

		tablex =
		    mlib_malloc(num_copy * (1 << subsampleBitsH) *
		    (1 << vis_width_bits) * sizeof (mlib_s16));
		if (tablex == NULL)
			return (MLIB_FAILURE);

		INIT_TABLE_16(tablex, (1 << subsampleBitsH), width, width_bits,
		    vis_width_bits, dataH);

		if ((dataH == dataV) && num_copy == 4)
			tabley = tablex;
		else {

			num_copy = 4;

			tabley =
			    mlib_malloc(num_copy * (1 << subsampleBitsV) *
			    (1 << vis_height_bits) * sizeof (mlib_s16));
			if (tabley == NULL) {
				mlib_free(tablex);
				return (MLIB_FAILURE);
			}

			INIT_TABLE_16(tabley, (1 << subsampleBitsV), height,
			    height_bits, vis_height_bits, dataV);

			*ptr_tablex = tablex;
			table->dataV_s16_1 = tabley;

/* Store shift */
			table->shift_vis_affine = 43;

			return (MLIB_SUCCESS);
		}
	}

	normv = 32767.0 / (32768.0 * sumv);
	scalev = mlib_ilogb(sumv * normv);
	isum = mlib_ilogb(max * normv);

/* all elements must be in the range -32768, 32767 */
	if (scalev == isum)
		norm_scale_v = 14;
/* but sumv may be in the range -65576, 65575 */
	else
		norm_scale_v = 15;

	min_scale = 25;
	max_scale = 40;

	normh = 32768.0 * sumv / 32767;

	if (dataH != dataV) {
		sumh = 0;
		max = 0;

		for (i = 0; i < (1 << subsampleBitsH); i++) {
			mlib_d64 s = 0;
			mlib_s32 ind = (i << width_bits);

			for (j = 0; j < width; j++) {
				d = mlib_fabs(dataH[j + ind]);
				s += d;
				max = (max > d) ? max : d;
			}
			sumh = (sumh > s) ? sumh : s;
		}
	} else
		sumh = sumv;

	isum = mlib_ilogb(max * normh);
	scaleh = mlib_ilogb(sumh * normh);

/* all elements must be in the range -32768, 32767 */
	if (scaleh == isum)
		norm_scale_h = 14;
/* but sumh may be in the range -65576, 65575 */
	else
		norm_scale_h = 15;

	scale = norm_scale_v + norm_scale_h - (scaleh + scalev);

	if (scale < min_scale) {
		table->shift_vis_affine = -1;
/* coefficients are too large */
		return (MLIB_FAILURE);
	}

	if (scale > max_scale) {
		scaleh += (scale - max_scale + 1) >> 1;
		scalev += (scale - max_scale) >> 1;
		scale = max_scale;
	}
mlib_status
__mlib_GraphicsFillArc_X_8(
    mlib_image *buffer,
    mlib_s16 xx,
    mlib_s16 yy,
    mlib_s32 r,
    mlib_f32 t1,
    mlib_f32 t2,
    mlib_s32 c0,
    mlib_s32 c2)
{
	mlib_s32 stride = mlib_ImageGetStride(buffer);
	mlib_s32 width = mlib_ImageGetWidth(buffer);
	mlib_s32 height = mlib_ImageGetHeight(buffer);
	mlib_u8 *data = mlib_ImageGetData(buffer);
	mlib_u8 *line0 = NULL, *line = NULL;
	mlib_s32 cx, cy, del, mask;

	mlib_s32 sin1, cos1, sin2, cos2, oct1, oct2, flagd, flagc;
	mlib_s32 sn1, cs1, sn2, cs2, xl, xr;
	mlib_s32 buf0[BUFSIZE], *buf = NULL, count_arc;
	mlib_s32 line_fill_1_0[RADMAX], line_fill_2_0[RADMAX];
	mlib_s32 *line_fill_1 = NULL, *line_fill_2 = NULL, *point_line = NULL;
	mlib_s32 start, start_1, start_2, end, help;
	mlib_d64 dc;

	mlib_s32 xb, xe, clip, side, count_repeat, repeat;
	mlib_f32 help_t2, start_sin1;
	mlib_s32 x = xx, y = yy, cxor = c0 ^ c2, xb_last, xe_last;

	if (!data)
		return (MLIB_NULLPOINTER);

	if (r < 0)
		return (MLIB_FAILURE);

	if (r == 0) {
		if (y < height && y >= 0 && x < width && x >= 0)
			*(data + (stride * y + x)) ^= cxor;
		return (MLIB_SUCCESS);
	}

	if (x - r >= width || x + r < 0 || y - r >= height || y + r < 0)
		return (MLIB_SUCCESS);

	if (mlib_fabs(t1 - t2) >= PIx2)
		return (__mlib_GraphicsFillCircle_X_8
		    (buffer, xx, yy, r, c0, c2));

	xb_last = width;
	xe_last = -1;
	count_repeat = 1;
	repeat = 0;

	{
		mlib_f32 tt = t1;

		t1 = -t2;
		t2 = -tt;
	}

	if (t1 > t2)
		t2 += PIx2;

	{
		mlib_s32 n2p = t1 / PIx2;
		mlib_f32 sh = PIx2 * (mlib_f32)n2p;

		if (t1 < 0.0f)
			sh -= PIx2;
		t1 -= sh;
		t2 -= sh;
	}

	line0 = data + stride * y + x;

	MLIB_GRAPHICS_COLOR_8(cxor);
	MLIB_GRAPHICS_TO_DOUBLE(dc, cxor);

	if (r >= RADMAX) {
		buf = __mlib_malloc(sizeof (mlib_s32) * (r + 1));

		if (!buf)
			return (MLIB_FAILURE);
		line_fill_1 = __mlib_malloc(sizeof (mlib_s32) * (r + 1));

		if (!line_fill_1) {
			__mlib_free(buf);
			return (MLIB_FAILURE);
		}

		line_fill_2 = __mlib_malloc(sizeof (mlib_s32) * (r + 1));

		if (!line_fill_2) {
			__mlib_free(buf);
			__mlib_free(line_fill_1);
			return (MLIB_FAILURE);
		}
	} else {
		buf = buf0;
		line_fill_1 = line_fill_1_0;
		line_fill_2 = line_fill_2_0;
	}

	FILL_BUF;

	if (t2 > PIx2) {
		help_t2 = t2 - PIx2;
		t2 = PIx2;
		count_repeat = 0;
	}

	for (; count_repeat < 2; count_repeat++) {
		GET_X_Y_COORDINATE;

		if (oct2 < oct1) {
			mask =
			    __mlib_GraphicsDrawLine_X_8(buffer, xx, yy,
			    xx + cs1, yy + sn1, c0, c2);

			if (mask == MLIB_SUCCESS) {
				FINISH_STEP;
			} else
				return (mask);
		}

		if (oct2 == 8)
			oct2 = 7;

		if ((sn1 == 0) && (sn2 == 0) && (cs1 >= 0) && (cs2 >= 0) &&
		    (oct2 - oct1 > 4))
			return __mlib_GraphicsFillCircle_X_8(buffer, xx, yy, r,
			    c0, c2);

		if (count_repeat == 0)
			start_sin1 = sn1;

		point_line = &line_fill_1[0];
		FILL_LINE_POINT(cs1, sn1, 0);

		point_line = &line_fill_2[0];
		FILL_LINE_POINT(cs2, sn2, -1);

		FILL_FLAGD;

		if ((y | (height - 1 - y)) >= 0) {
			if (sn1 | sn2) {
				if (sn1 >= 0) {
					if (sn2 > 0) {
						LEFT(x + line_fill_2[0], xb);
						RIGHT(x + line_fill_1[0], xe);
					} else {
						LEFT(x - buf[0], xb);

						if (line_fill_1[0] >=
						    line_fill_2[0])
							RIGHT(x +
							    line_fill_1[0], xe)
							    else
							RIGHT(x +
							    line_fill_2[0], xe);
					}
				} else {
					LEFT(x + line_fill_1[0], xb);
					RIGHT(x + line_fill_2[0], xe);
				}
			} else if ((cs1 | cs2) > 0) {
				LEFT(x, xb);
				RIGHT(x + r, xe);

				if (xb_last > xb)
					xb_last = xb;

				if (xe_last < xe)
					xe_last = xe;
				FINISH_STEP;
			} else if ((cs2 & cs1) < 0) {
				LEFT(x - r, xb);
				RIGHT(x, xe);

				if (xb_last > xb)
					xb_last = xb;

				if (xe_last < xe)
					xe_last = xe;
				FINISH_STEP;
			} else {
				LEFT(x - r, xb);
				RIGHT(x + r, xe);
			}

			if (xb < xb_last)
				xb_last = xb;

			if (xe > xe_last)
				xe_last = xe;
		}
/*
 * Nearest-neighbour ray casting through a blocked volume of U8 voxels,
 * producing U8 samples.
 *
 * rays   - ray set. Read: nrays, nsteps[], starts[][0..2] and incs[0][0..2].
 *          Written: results[step][ray] for every step below nsteps[ray].
 *          Only the increment of ray 0 is read and it is applied to all
 *          rays.
 * blk    - blocked volume. Read: xysize and voxels.
 * buffer - not referenced by this implementation.
 *
 * Returns MLIB_NULLPOINTER when rays or blk is NULL, MLIB_SUCCESS otherwise.
 * An empty ray set (nrays <= 0) is a successful no-op.
 *
 * Two traversal orders are implemented: step-major (all rays advanced in
 * lockstep, one result row at a time) and ray-major (each ray followed
 * from its first to its last step). The choice is made by the go_to_rays /
 * go_to_steps heuristic below.
 */
mlib_status
__mlib_VolumeRayCast_Blocked_Parallel_Nearest_U8_U8(
	mlib_rays *rays,
	const mlib_blkvolume *blk,
	void *buffer)
{
	mlib_s32 xysize;
	mlib_s32 nrays, i;
	mlib_d64 dOx, dOy, dOz;
	mlib_u8 *voxels, *dp;
	mlib_s32 nsteps, cray, minsteps, maxsteps, cstp;
	mlib_u32 n, m, index;
	mlib_d64 Ox, Oy, Oz;
	mlib_d64 cx[MAX_RAY_NUM], cy[MAX_RAY_NUM], cz[MAX_RAY_NUM];
	mlib_d64 X, Y, Z;
	mlib_u8 a;
	mlib_d64 go_to_rays = 0., go_to_steps = 0., testx, testy, testz;
	mlib_s32 xsrc, ysrc, zsrc;

	if (rays == NULL || blk == NULL)
		return (MLIB_NULLPOINTER);

	xysize = blk->xysize;
	nrays = rays->nrays;

/*
 * Nothing to cast. Without this guard the code below would read element 0
 * of an empty ray set and, with minsteps left at MLIB_S32_MAX, could walk
 * rays->results[] far out of bounds in the step-major loop.
 */
	if (nrays <= 0)
		return (MLIB_SUCCESS);

	voxels = (mlib_u8 *)blk->voxels;

/* common per-step increment, taken from ray 0 */
	dOx = rays->incs[0][0];
	dOy = rays->incs[0][1];
	dOz = rays->incs[0][2];

/*
 * n = number of times (xysize >> 6) can be halved before reaching zero,
 * minus one; the shifts applied to the high bits of y and z are derived
 * from it.
 * NOTE(review): presumably xysize is a power of two and 64 is the block
 * width in x/y - confirm against the blocked-volume layout.
 */
	i = xysize >> 6;
	n = 0;
	while (i >>= 1)
		n++;
	m = (n << 1) + 12;
	n += 11;

/*
 * The per-ray offset arrays hold at most MAX_RAY_NUM entries, so the
 * step-major preparation is done only for ray sets that fit.
 */
	if (nrays <= MAX_RAY_NUM) {
		minsteps = MLIB_S32_MAX;
		maxsteps = -1;

/* shortest and longest ray, in steps */
		for (cray = 0; cray < nrays; cray++) {
			if (rays->nsteps[cray] < minsteps)
				minsteps = rays->nsteps[cray];
			if (rays->nsteps[cray] > maxsteps)
				maxsteps = rays->nsteps[cray];
		}

		Ox = rays->starts[0][0];
		Oy = rays->starts[0][1];
		Oz = rays->starts[0][2];

/*
 * go_to_rays: largest weighted component of the per-step increment.
 * NOTE(review): the weights 1/12, 1/6 and 1 for x, y and z look like a
 * locality cost model for the blocked layout - confirm.
 */
		if (mlib_fabs(rays->incs[0][0]) / 12. > go_to_rays)
			go_to_rays = mlib_fabs(rays->incs[0][0] / 12.);

		if (mlib_fabs(rays->incs[0][1]) / 6. > go_to_rays)
			go_to_rays = mlib_fabs(rays->incs[0][1] / 6.);

		if (mlib_fabs(rays->incs[0][2]) > go_to_rays)
			go_to_rays = mlib_fabs(rays->incs[0][2]);

/* start of every ray expressed as an offset from the start of ray 0 */
		cx[0] = 0;
		cy[0] = 0;
		cz[0] = 0;

		testx = 0;
		testy = 0;
		testz = 0;

/* accumulate the distance between the starts of adjacent rays */
		for (cray = 1; cray < nrays; cray++) {
			cx[cray] = rays->starts[cray][0] - Ox;
			testx += mlib_fabs(cx[cray] - cx[cray - 1]);
			cy[cray] = rays->starts[cray][1] - Oy;
			testy += mlib_fabs(cy[cray] - cy[cray - 1]);
			cz[cray] = rays->starts[cray][2] - Oz;
			testz += mlib_fabs(cz[cray] - cz[cray - 1]);
		}

/* + 0.5 so that the truncating cast below rounds to nearest */
		Ox = Ox + 0.5;
		Oy = Oy + 0.5;
		Oz = Oz + 0.5;

/*
 * go_to_steps: largest weighted component of the mean distance between
 * adjacent ray starts (same weights as go_to_rays).
 */
		if (nrays != 1) {
			if (testx / (nrays - 1) / 12. > go_to_steps)
				go_to_steps = testx / (nrays - 1) / 12.;

			if (testy / (nrays - 1) / 6. > go_to_steps)
				go_to_steps = testy / (nrays - 1) / 6.;

			if (testz / (nrays - 1) > go_to_steps)
				go_to_steps = testz / (nrays - 1);
		}
	}

	if ((go_to_rays > go_to_steps) && (nrays <= MAX_RAY_NUM)) {

/*
 * Step-major traversal, part 1: steps that every ray has, so no per-ray
 * length test is needed in the inner loop.
 */
		for (cstp = 0; cstp < minsteps; cstp++) {
			dp = (mlib_u8 *)rays->results[cstp];

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
			for (cray = 0; cray < nrays; cray++) {
				X = Ox + cx[cray];
				xsrc = (mlib_s32)X;
				Y = Oy + cy[cray];
				ysrc = (mlib_s32)Y;
				Z = Oz + cz[cray];
				zsrc = (mlib_s32)Z;
				index = mlib_indx[xsrc] |
					mlib_indy[ysrc & 0x3F] |
					mlib_indz[zsrc & 0x1F] |
					((ysrc & (~0x3F)) << n) |
					((zsrc & (~0x1F)) << m);

				*dp = voxels[index];
				dp++;
			}

			Ox += dOx;
			Oy += dOy;
			Oz += dOz;
		}

/*
 * Step-major traversal, part 2: remaining steps, where some rays have
 * already ended. A finished ray leaves its result slot untouched, but dp
 * still advances so that slot index == ray index.
 */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
		for (cstp = minsteps; cstp < maxsteps; cstp++) {
			dp = (mlib_u8 *)rays->results[cstp];

			for (cray = 0; cray < nrays; cray++) {

				if (cstp < rays->nsteps[cray]) {
					X = Ox + cx[cray];
					xsrc = (mlib_s32)X;
					Y = Oy + cy[cray];
					ysrc = (mlib_s32)Y;
					Z = Oz + cz[cray];
					zsrc = (mlib_s32)Z;
					index = mlib_indx[xsrc] |
						mlib_indy[ysrc & 0x3F] |
						mlib_indz[zsrc & 0x1F] |
						((ysrc & (~0x3F)) << n) |
						((zsrc & (~0x1F)) << m);

					*dp = voxels[index];
					dp++;

				} else {
					dp++;
				}

			}
			Ox += dOx;
			Oy += dOy;
			Oz += dOz;
		}
	} else {

/*
 * Ray-major traversal: follow each ray from its first to its last step.
 * The loop is software-pipelined by hand: the integer coordinates of the
 * next sample are computed while the current voxel is being fetched, so
 * one extra (unused) coordinate conversion happens per ray.
 */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
		for (cray = 0; cray < nrays; cray++) {
			X = rays->starts[cray][0] + 0.5;
			Y = rays->starts[cray][1] + 0.5;
			Z = rays->starts[cray][2] + 0.5;

			xsrc = (mlib_s32)X;
			X += dOx;
			ysrc = (mlib_s32)Y;
			Y += dOy;
			zsrc = (mlib_s32)Z;
			Z += dOz;

			nsteps = rays->nsteps[cray];

			for (cstp = 0; cstp < nsteps; cstp++) {
				index = mlib_indx[xsrc] |
					mlib_indy[ysrc & 0x3F] |
					mlib_indz[zsrc & 0x1F] |
					((ysrc & (~0x3F)) << n) |
					((zsrc & (~0x1F)) << m);

				a = voxels[index];

				xsrc = (mlib_s32)X;
				X += dOx;
				ysrc = (mlib_s32)Y;
				Y += dOy;
				zsrc = (mlib_s32)Z;
				Z += dOz;

				((mlib_u8 *)rays->results[cstp])[cray] = a;
			}
		}
	}
	return (MLIB_SUCCESS);
}