/*
 * Build (or return the cached) table of row pointers for an image.
 *
 * The table is stored in img->state.  The allocation holds
 * (3 + height) pointers:
 *   rtable[0]              - set to 0
 *   rtable[1]              - points at &rtable[1]
 *   rtable[2 .. height+1]  - start address of every image row
 *   rtable[height + 2]     - points at &rtable[1]
 * The value returned (and cached) is &rtable[2], so index 0 of the
 * returned table is the first image row.
 *
 * Returns NULL when img is NULL, when the image has no data pointer,
 * or when the allocation fails.
 */
void *mlib_ImageCreateRowTable(mlib_image *img)
{
  mlib_u8  **rtable, *tline;
  mlib_s32 i, im_height, im_stride;

  if (img == NULL) return NULL;

  /* a table has already been built for this image: reuse it */
  if (img -> state) return img -> state;

  im_height = mlib_ImageGetHeight(img);
  im_stride = mlib_ImageGetStride(img);
  tline     = mlib_ImageGetData(img);

  /*
   * Check the data pointer BEFORE allocating: testing it together with
   * the malloc result leaked the freshly allocated table whenever the
   * image had no data.
   */
  if (tline == NULL) return NULL;

  rtable = mlib_malloc((3 + im_height)*sizeof(mlib_u8 *));
  if (rtable == NULL) return NULL;

  rtable[0] = 0;
  rtable[1] = (mlib_u8*)((void **)rtable + 1);
  rtable[2 + im_height] = (mlib_u8*)((void **)rtable + 1);

  for (i = 0; i < im_height; i++) {
    rtable[i+2] = tline;
    tline += im_stride;
  }

  img -> state = ((void **)rtable + 2);
  return img -> state;
}
/*
 * Allocate an mlib_image header and initialise it with mlib_ImageSet()
 * so that it describes caller-supplied pixel data.
 *
 * Returns the new header, or NULL when stride is not positive, when the
 * header cannot be allocated, or when mlib_ImageSet() rejects the
 * parameters (the header is released in that case).
 */
mlib_image *mlib_ImageCreateStruct(mlib_type  type,
                                   mlib_s32   channels,
                                   mlib_s32   width,
                                   mlib_s32   height,
                                   mlib_s32   stride,
                                   const void *data)
{
  mlib_image *hdr;

  if (stride <= 0)
    return NULL;

  hdr = (mlib_image *)mlib_malloc(sizeof(mlib_image));
  if (hdr == NULL)
    return NULL;

  /* success path: hand the initialised header to the caller */
  if (mlib_ImageSet(hdr, type, channels, width, height, stride, data) != NULL)
    return hdr;

  /* initialisation failed: do not leak the header */
  mlib_free(hdr);
  return NULL;
}
mlib_status mlib_ImageMinFilter7x7_S16( void *dst, void *src, mlib_s32 dlb, mlib_s32 slb, mlib_s32 wid, mlib_s32 hgt) #endif /* MAX_FILTER */ { mlib_u8 *pbuff, *buff0, *buff1, *buff2, *buff3, *buff4, *buff5, *buffT; mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *sp6, *sp7, *dl; __m64 *dp0, *dp1; __m64 aa, bb, cc, dd, ee, ff, r0, r1; __m64 g0, g1, g2, g3, g4, g5, g6, gg; __m64 h0, h1, h2, h3, h4, h5, h6, hh; __m64 e_mask; mlib_s32 i, j, wid8, tail; wid = (wid - KSIZE1) * SSIZE; wid8 = (wid + 7) & ~7; pbuff = mlib_malloc(KSIZE1 * wid8); buff0 = pbuff; buff1 = buff0 + wid8; buff2 = buff1 + wid8; buff3 = buff2 + wid8; buff4 = buff3 + wid8; buff5 = buff4 + wid8; sl = (mlib_u8 *)src; dl = (mlib_u8 *)dst + (KSIZE1 / 2) * (dlb + SSIZE); tail = wid & 7; e_mask = ((__m64 *) mlib_mask64_arr)[tail]; for (j = 0; j < 3; j++) { sp0 = buff4; sp1 = buff5; sp6 = sl; sp7 = sl + slb; sl += 2 * slb; for (i = 0; i < wid; i += 8) { g0 = *(__m64 *) sp6; g1 = *(__m64 *) (sp6 + SSIZE); g2 = *(__m64 *) (sp6 + 2 * SSIZE); g3 = *(__m64 *) (sp6 + 3 * SSIZE); g4 = *(__m64 *) (sp6 + 4 * SSIZE); g5 = *(__m64 *) (sp6 + 5 * SSIZE); g6 = *(__m64 *) (sp6 + 6 * SSIZE); h0 = *(__m64 *) sp7; h1 = *(__m64 *) (sp7 + SSIZE); h2 = *(__m64 *) (sp7 + 2 * SSIZE); h3 = *(__m64 *) (sp7 + 3 * SSIZE); h4 = *(__m64 *) (sp7 + 4 * SSIZE); h5 = *(__m64 *) (sp7 + 5 * SSIZE); h6 = *(__m64 *) (sp7 + 6 * SSIZE); gg = C_COMP(g0, g1); hh = C_COMP(h0, h1); g2 = C_COMP(g2, g3); h2 = C_COMP(h2, h3); g4 = C_COMP(g4, g5); h4 = C_COMP(h4, h5); gg = C_COMP(gg, g2); hh = C_COMP(hh, h2); gg = C_COMP(gg, g4); hh = C_COMP(hh, h4); gg = C_COMP(gg, g6); hh = C_COMP(hh, h6); *(__m64 *) sp0 = gg; *(__m64 *) sp1 = hh; sp0 += 8; sp1 += 8; sp6 += 8; sp7 += 8; } if (j < 2) { buffT = buff0; buff0 = buff2; buff2 = buff4; buff4 = buffT; buffT = buff1; buff1 = buff3; buff3 = buff5; buff5 = buffT; } } for (j = 0; j <= (hgt - KSIZE1 - 2); j += 2) { dp0 = (void *)dl; dp1 = (void *)(dl + dlb); sp0 = buff0; sp1 = buff1; sp2 = buff2; sp3 = buff3; sp4 
= buff4; sp5 = buff5; sp6 = sl; sp7 = sl + slb; /* * line0: aa * line1: bb * line2: cc * line3: dd * line4: ee * line5: ff * line4: g0 g1 g2 g3 g4 g5 g6 * line5: h0 h1 h2 h3 h4 h5 h6 */ for (i = 0; i <= wid - 8; i += 8) { g0 = *(__m64 *) sp6; g1 = *(__m64 *) (sp6 + SSIZE); g2 = *(__m64 *) (sp6 + 2 * SSIZE); g3 = *(__m64 *) (sp6 + 3 * SSIZE); g4 = *(__m64 *) (sp6 + 4 * SSIZE); g5 = *(__m64 *) (sp6 + 5 * SSIZE); g6 = *(__m64 *) (sp6 + 6 * SSIZE); h0 = *(__m64 *) sp7; h1 = *(__m64 *) (sp7 + SSIZE); h2 = *(__m64 *) (sp7 + 2 * SSIZE); h3 = *(__m64 *) (sp7 + 3 * SSIZE); h4 = *(__m64 *) (sp7 + 4 * SSIZE); h5 = *(__m64 *) (sp7 + 5 * SSIZE); h6 = *(__m64 *) (sp7 + 6 * SSIZE); gg = C_COMP(g0, g1); hh = C_COMP(h0, h1); g2 = C_COMP(g2, g3); h2 = C_COMP(h2, h3); g4 = C_COMP(g4, g5); h4 = C_COMP(h4, h5); gg = C_COMP(gg, g2); hh = C_COMP(hh, h2); gg = C_COMP(gg, g4); hh = C_COMP(hh, h4); gg = C_COMP(gg, g6); hh = C_COMP(hh, h6); aa = *(__m64 *) sp0; bb = *(__m64 *) sp1; cc = *(__m64 *) sp2; dd = *(__m64 *) sp3; ee = *(__m64 *) sp4; ff = *(__m64 *) sp5; bb = C_COMP(bb, cc); dd = C_COMP(dd, ee); ff = C_COMP(ff, gg); bb = C_COMP(bb, dd); bb = C_COMP(bb, ff); r0 = C_COMP(aa, bb); r1 = C_COMP(bb, hh); *(__m64 *) sp0 = gg; *(__m64 *) sp1 = hh; (*dp0++) = r0; (*dp1++) = r1; sp0 += 8; sp1 += 8; sp2 += 8; sp3 += 8; sp4 += 8; sp5 += 8; sp6 += 8; sp7 += 8; } if (tail) { g0 = *(__m64 *) sp6; g1 = *(__m64 *) (sp6 + SSIZE); g2 = *(__m64 *) (sp6 + 2 * SSIZE); g3 = *(__m64 *) (sp6 + 3 * SSIZE); g4 = *(__m64 *) (sp6 + 4 * SSIZE); g5 = *(__m64 *) (sp6 + 5 * SSIZE); g6 = *(__m64 *) (sp6 + 6 * SSIZE); h0 = *(__m64 *) sp7; h1 = *(__m64 *) (sp7 + SSIZE); h2 = *(__m64 *) (sp7 + 2 * SSIZE); h3 = *(__m64 *) (sp7 + 3 * SSIZE); h4 = *(__m64 *) (sp7 + 4 * SSIZE); h5 = *(__m64 *) (sp7 + 5 * SSIZE); h6 = *(__m64 *) (sp7 + 6 * SSIZE); gg = C_COMP(g0, g1); hh = C_COMP(h0, h1); g2 = C_COMP(g2, g3); h2 = C_COMP(h2, h3); g4 = C_COMP(g4, g5); h4 = C_COMP(h4, h5); gg = C_COMP(gg, g2); hh = C_COMP(hh, h2); gg = 
C_COMP(gg, g4); hh = C_COMP(hh, h4); gg = C_COMP(gg, g6); hh = C_COMP(hh, h6); aa = *(__m64 *) sp0; bb = *(__m64 *) sp1; cc = *(__m64 *) sp2; dd = *(__m64 *) sp3; ee = *(__m64 *) sp4; ff = *(__m64 *) sp5; bb = C_COMP(bb, cc); dd = C_COMP(dd, ee); ff = C_COMP(ff, gg); bb = C_COMP(bb, dd); bb = C_COMP(bb, ff); r0 = C_COMP(aa, bb); r1 = C_COMP(bb, hh); *(__m64 *) sp0 = gg; *(__m64 *) sp1 = hh; *dp0 = _mm_or_si64(_mm_and_si64(e_mask, r0), _mm_andnot_si64(e_mask, *dp0)); *dp1 = _mm_or_si64(_mm_and_si64(e_mask, r1), _mm_andnot_si64(e_mask, *dp1)); } buffT = buff0; buff0 = buff2; buff2 = buff4; buff4 = buffT; buffT = buff1; buff1 = buff3; buff3 = buff5; buff5 = buffT; sl += 2 * slb; dl += 2 * dlb; } /* last line */ if (j == (hgt - KSIZE1 - 1)) { dp0 = (void *)dl; dp1 = (void *)(dl + dlb); sp0 = buff0; sp1 = buff1; sp2 = buff2; sp3 = buff3; sp4 = buff4; sp5 = buff5; sp6 = sl; for (i = 0; i <= wid - 8; i += 8) { g0 = *(__m64 *) sp6; g1 = *(__m64 *) (sp6 + SSIZE); g2 = *(__m64 *) (sp6 + 2 * SSIZE); g3 = *(__m64 *) (sp6 + 3 * SSIZE); g4 = *(__m64 *) (sp6 + 4 * SSIZE); g5 = *(__m64 *) (sp6 + 5 * SSIZE); g6 = *(__m64 *) (sp6 + 6 * SSIZE); gg = C_COMP(g0, g1); g2 = C_COMP(g2, g3); g4 = C_COMP(g4, g5); gg = C_COMP(gg, g2); gg = C_COMP(gg, g4); gg = C_COMP(gg, g6); aa = *(__m64 *) sp0; bb = *(__m64 *) sp1; cc = *(__m64 *) sp2; dd = *(__m64 *) sp3; ee = *(__m64 *) sp4; ff = *(__m64 *) sp5; bb = C_COMP(bb, cc); dd = C_COMP(dd, ee); ff = C_COMP(ff, gg); bb = C_COMP(bb, dd); bb = C_COMP(bb, ff); r0 = C_COMP(aa, bb); (*dp0++) = r0; sp0 += 8; sp1 += 8; sp2 += 8; sp3 += 8; sp4 += 8; sp5 += 8; sp6 += 8; } if (tail) { g0 = *(__m64 *) sp6; g1 = *(__m64 *) (sp6 + SSIZE); g2 = *(__m64 *) (sp6 + 2 * SSIZE); g3 = *(__m64 *) (sp6 + 3 * SSIZE); g4 = *(__m64 *) (sp6 + 4 * SSIZE); g5 = *(__m64 *) (sp6 + 5 * SSIZE); g6 = *(__m64 *) (sp6 + 6 * SSIZE); gg = C_COMP(g0, g1); g2 = C_COMP(g2, g3); g4 = C_COMP(g4, g5); gg = C_COMP(gg, g2); gg = C_COMP(gg, g4); gg = C_COMP(gg, g6); aa = *(__m64 *) sp0; bb = 
*(__m64 *) sp1; cc = *(__m64 *) sp2; dd = *(__m64 *) sp3; ee = *(__m64 *) sp4; ff = *(__m64 *) sp5; bb = C_COMP(bb, cc); dd = C_COMP(dd, ee); ff = C_COMP(ff, gg); bb = C_COMP(bb, dd); bb = C_COMP(bb, ff); r0 = C_COMP(aa, bb); *dp0 = _mm_or_si64(_mm_and_si64(e_mask, r0), _mm_andnot_si64(e_mask, *dp0)); } } _mm_empty(); mlib_free(pbuff); return (MLIB_SUCCESS); }
/*
 * 5x5 convolution (MMX) for the mlib_s16-addressed, NCHAN-interleaved
 * variant named by this function.
 *
 * The work is done by the GET_SRC_DST_PARAMETERS, GET_KERN, PREP_5x5 and
 * CONV_5x5 macros defined elsewhere; they refer to the locals declared
 * here by name (presumably including width, height, sl, sll, dl, dll,
 * mask8000 and rr - confirm against the macro definitions), so none of
 * the declarations may be renamed or removed.
 *
 * Scratch memory: 20 * wid4 __m64 values split into ten row buffers of
 * 2 * wid4 elements.  buff_arr has 20 slots; pbuff_arr points at either
 * the lower or the upper ten, and after each output row the ten buffer
 * pointers are copied in rotated order into the other half.
 *
 * Returns MLIB_FAILURE when the scratch buffer cannot be allocated,
 * MLIB_SUCCESS otherwise.
 */
mlib_status
mlib_m_conv5x5_u16nw_2(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
    __m64 *pbuff, *buff_arr[20], **pbuff_arr = buff_arr;
    __m64 *buff0, *buff1, *buff2, *buff3;

    GET_SRC_DST_PARAMETERS(mlib_s16);
    __m64 ker[5][5];
    __m64 d0, d1, d2, aa, bb, rr, tmpa, tmpb, ker_off, mask8000;
    __m64 prev0h, prev1h, prev2h, prev3h, sum0h, sum1h, sum2h, sum3h, sum4h,
        tmph;
    __m64 prev0l, prev1l, prev2l, prev3l, sum0l, sum1l, sum2l, sum3l, sum4l,
        tmpl;
    __m64 *sp, *dp;
    mlib_s32 shift, ind, ker_sum = 0;
    mlib_s32 row, wid4, i, j;

    /* only the interior (no-border) region is written */
    width -= 4;
    height -= 4;
    width *= NCHAN;
    dl += 2 * (dll + NCHAN);

    wid4 = (width + 7) / 4;
    pbuff = mlib_malloc(sizeof (__m64) * 20 * wid4);

    /* the row buffers are filled right away: bail out on OOM */
    if (pbuff == NULL)
        return (MLIB_FAILURE);

    GET_KERN();

    for (i = 0; i < 10; i++) {
        buff_arr[i] = pbuff + i * 2 * wid4;
    }

    /* preload the first four source rows; ind walks 0, 1, 3, 6 */
    ind = 0;
    for (j = 1; j <= 4; j++) {
        buff0 = buff_arr[ind];
        buff1 = buff_arr[ind + 1];
        buff2 = buff_arr[ind + 2];
        buff3 = buff_arr[ind + 3];

        sp = (__m64 *) sl;
        d1 = (*sp++);
        d1 = _mm_xor_si64(d1, mask8000);
        d2 = (*sp++);
        d2 = _mm_xor_si64(d2, mask8000);

        for (i = 0; i < wid4; i++) {
            PREP_5x5();
        }

        sl += sll;
        ind += j;
    }

    for (row = 0; row < height; row++) {
        sp = (__m64 *) sl;
        dp = (__m64 *) dl;

        buff0 = pbuff_arr[0];
        buff1 = pbuff_arr[2];
        buff2 = pbuff_arr[5];
        buff3 = pbuff_arr[9];

        d1 = (*sp++);
        d1 = _mm_xor_si64(d1, mask8000);
        d2 = (*sp++);
        d2 = _mm_xor_si64(d2, mask8000);

        for (i = 0; i < width / 4; i++) {
            CONV_5x5(hi, i);

            dp[i] = rr;
        }

        /* partial last store: keep destination bytes outside the mask */
        if (width & 3) {
            __m64 mask =
                ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

            CONV_5x5(hi, i);

            dp[i] =
                _mm_or_si64(_mm_and_si64(mask, rr),
                _mm_andnot_si64(mask, dp[i]));
        }

        /* rotate the buffer pointers into the other half of buff_arr */
        ind = (pbuff_arr == buff_arr) ? 10 : -10;
        pbuff_arr[ind + 0] = pbuff_arr[1];
        pbuff_arr[ind + 1] = pbuff_arr[3];
        pbuff_arr[ind + 2] = pbuff_arr[4];
        pbuff_arr[ind + 3] = pbuff_arr[6];
        pbuff_arr[ind + 4] = pbuff_arr[7];
        pbuff_arr[ind + 5] = pbuff_arr[8];
        pbuff_arr[ind + 6] = pbuff_arr[0];
        pbuff_arr[ind + 7] = pbuff_arr[2];
        pbuff_arr[ind + 8] = pbuff_arr[5];
        pbuff_arr[ind + 9] = pbuff_arr[9];
        pbuff_arr += ind;

        sl += sll;
        dl += dll;
    }

    _mm_empty();
    mlib_free(pbuff);

    return (MLIB_SUCCESS);
}
/*
 * 5x5 convolution with separate horizontal (hkernel) and vertical
 * (vkernel) coefficient arrays, MMX implementation for the
 * mlib_u8-addressed, NCHAN-interleaved variant named by this function.
 *
 * The arithmetic lives in the GET_SRC_DST_PARAMETERS, GET_KERN,
 * UNPACK_SRC, PREP_5x5 and CONV_5x5 macros defined elsewhere; they refer
 * to the locals declared here by name (presumably including width,
 * height, sl, sll, dl, dll, zero, res_lo and res_hi - confirm against the
 * macro definitions), so the declarations must stay as they are.
 *
 * Scratch memory: five row buffers of wid4 __m64 values each, rotated by
 * one position after every output row.
 *
 * Returns MLIB_FAILURE when the scratch buffer cannot be allocated,
 * MLIB_SUCCESS otherwise.
 */
mlib_status
mlib_m_sconv5x5_8nw_2(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
    __m64 *pbuff, *buff_arr[5];
    __m64 *buff0, *buff1, *buff2, *buff3, *buff4, *buffT;

    GET_SRC_DST_PARAMETERS(mlib_u8);
    __m64 hker0, hker1, hker2, hker3, hker4;
    __m64 vker0, vker1, vker2, vker3, vker4;
    __m64 s0, d0, d1, d2, prev0;
    __m64 sum0, sum1, sum2, sum3, sum4, aa, bb, res_hi, res_lo;
    __m64 zero = _m_zero;
    /*
     * ind is incremented in the preload loop below, but nothing visible in
     * this function assigns it first; start it at 0 so that increment
     * never reads an indeterminate value.
     */
    mlib_s32 shift, ind = 0;
    mlib_s32 *sp;
    mlib_s32 row, wid4, i, j;

    /* only the interior (no-border) region is written */
    width -= 4;
    height -= 4;
    width *= NCHAN;
    dl += 2 * (dll + NCHAN);

    wid4 = 2 * ((width + 7) / 8);
    pbuff = mlib_malloc(sizeof (__m64) * 5 * wid4);

    /* the row buffers are filled right away: bail out on OOM */
    if (pbuff == NULL)
        return (MLIB_FAILURE);

    GET_KERN();

    for (i = 0; i < 5; i++) {
        buff_arr[i] = pbuff + i * wid4;
    }

    /* preload the first four source rows, 32 bits at a time */
    for (j = 0; j < 4; j++) {
        buff4 = buff_arr[j];

        sp = (mlib_s32 *)sl;

        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d1, lo);
        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d2, lo);

        for (i = 0; i < wid4; i++) {
            *(mlib_s32 *)&s0 = sp[i];

            PREP_5x5(lo, i);
        }

        sl += sll;
        ind++;
    }

    buff0 = buff_arr[0];
    buff1 = buff_arr[1];
    buff2 = buff_arr[2];
    buff3 = buff_arr[3];
    buff4 = buff_arr[4];

    for (row = 0; row < height; row++) {
        /* these 64-bit pointers intentionally shadow the outer sp */
        __m64 *sp = (__m64 *) sl;
        __m64 *dp = (__m64 *) dl;

        s0 = (*sp++);
        UNPACK_SRC(d1, lo);
        UNPACK_SRC(d2, hi);

        for (i = 0; i < width / 8; i++) {
            s0 = sp[i];
            CONV_5x5(lo, 2 * i);
            CONV_5x5(hi, 2 * i + 1);

            dp[i] = _mm_packs_pu16(res_lo, res_hi);
        }

        /* partial last store: keep destination bytes outside the mask */
        if (width & 7) {
            __m64 mask = ((__m64 *) mlib_mask64_arr)[width & 7];

            s0 = sp[i];
            CONV_5x5(lo, 2 * i);
            CONV_5x5(hi, 2 * i + 1);
            res_hi = _mm_packs_pu16(res_lo, res_hi);

            dp[i] =
                _mm_or_si64(_mm_and_si64(mask, res_hi),
                _mm_andnot_si64(mask, dp[i]));
        }

        /* rotate: the oldest row buffer becomes the newest */
        buffT = buff0;
        buff0 = buff1;
        buff1 = buff2;
        buff2 = buff3;
        buff3 = buff4;
        buff4 = buffT;

        sl += sll;
        dl += dll;
    }

    _mm_empty();
    mlib_free(pbuff);

    return (MLIB_SUCCESS);
}
mlib_status mlib_AffineEdges(mlib_affine_param *param, const mlib_image *dst, const mlib_image *src, void *buff_lcl, mlib_s32 buff_size, mlib_s32 kw, mlib_s32 kh, mlib_s32 kw1, mlib_s32 kh1, mlib_edge edge, const mlib_d64 *mtx, mlib_s32 shiftx, mlib_s32 shifty) { mlib_u8 *buff = buff_lcl; mlib_u8 **lineAddr = param->lineAddr; mlib_s32 srcWidth, dstWidth, srcHeight, dstHeight, srcYStride, dstYStride; mlib_s32 *leftEdges, *rightEdges, *xStarts, *yStarts, bsize0, bsize1 = 0; mlib_u8 *srcData, *dstData; mlib_u8 *paddings; void *warp_tbl = NULL; mlib_s32 yStart = 0, yFinish = -1, dX, dY; mlib_d64 xClip, yClip, wClip, hClip; mlib_d64 delta = 0.; mlib_d64 minX, minY, maxX, maxY; mlib_d64 coords[4][2]; mlib_d64 a = mtx[0], b = mtx[1], tx = mtx[2], c = mtx[3], d = mtx[4], ty = mtx[5]; mlib_d64 a2, b2, tx2, c2, d2, ty2; mlib_d64 dx, dy, div; mlib_s32 sdx, sdy; mlib_d64 dTop; mlib_d64 val0; mlib_s32 top, bot; mlib_s32 topIdx, max_xsize = 0; mlib_s32 i, j, t; srcData = mlib_ImageGetData(src); dstData = mlib_ImageGetData(dst); srcWidth = mlib_ImageGetWidth(src); srcHeight = mlib_ImageGetHeight(src); dstWidth = mlib_ImageGetWidth(dst); dstHeight = mlib_ImageGetHeight(dst); srcYStride = mlib_ImageGetStride(src); dstYStride = mlib_ImageGetStride(dst); paddings = mlib_ImageGetPaddings(src); if (srcWidth >= (1 << 15) || srcHeight >= (1 << 15)) { return MLIB_FAILURE; } div = a * d - b * c; if (div == 0.0) { return MLIB_FAILURE; } bsize0 = (dstHeight * sizeof(mlib_s32) + 7) & ~7; if (lineAddr == NULL) { bsize1 = ((srcHeight + 4 * kh) * sizeof(mlib_u8 *) + 7) & ~7; } param->buff_malloc = NULL; if ((4 * bsize0 + bsize1) > buff_size) { buff = param->buff_malloc = mlib_malloc(4 * bsize0 + bsize1); if (buff == NULL) return MLIB_FAILURE; } leftEdges = (mlib_s32 *) (buff); rightEdges = (mlib_s32 *) (buff += bsize0); xStarts = (mlib_s32 *) (buff += bsize0); yStarts = (mlib_s32 *) (buff += bsize0); if (lineAddr == NULL) { mlib_u8 *srcLinePtr = srcData; lineAddr = (mlib_u8 **) (buff += bsize0); 
for (i = 0; i < 2 * kh; i++) lineAddr[i] = srcLinePtr; lineAddr += 2 * kh; for (i = 0; i < srcHeight - 1; i++) { lineAddr[i] = srcLinePtr; srcLinePtr += srcYStride; } for (i = srcHeight - 1; i < srcHeight + 2 * kh; i++) lineAddr[i] = srcLinePtr; } if ((mlib_s32) edge < 0) { /* process edges */ minX = 0; minY = 0; maxX = srcWidth; maxY = srcHeight; } else { if (kw > 1) delta = -0.5; /* for MLIB_NEAREST filter delta = 0. */ minX = (kw1 - delta); minY = (kh1 - delta); maxX = srcWidth - ((kw - 1) - (kw1 - delta)); maxY = srcHeight - ((kh - 1) - (kh1 - delta)); if (edge == MLIB_EDGE_SRC_PADDED) { if (minX < paddings[0]) minX = paddings[0]; if (minY < paddings[1]) minY = paddings[1]; if (maxX > (srcWidth - paddings[2])) maxX = srcWidth - paddings[2]; if (maxY > (srcHeight - paddings[3])) maxY = srcHeight - paddings[3]; } } xClip = minX; yClip = minY; wClip = maxX; hClip = maxY; /* * STORE_PARAM(param, src); * STORE_PARAM(param, dst); */ param->src = (void *)src; param->dst = (void *)dst; STORE_PARAM(param, lineAddr); STORE_PARAM(param, dstData); STORE_PARAM(param, srcYStride); STORE_PARAM(param, dstYStride); STORE_PARAM(param, leftEdges); STORE_PARAM(param, rightEdges); STORE_PARAM(param, xStarts); STORE_PARAM(param, yStarts); STORE_PARAM(param, max_xsize); STORE_PARAM(param, yStart); STORE_PARAM(param, yFinish); STORE_PARAM(param, warp_tbl); if ((xClip >= wClip) || (yClip >= hClip)) { return MLIB_SUCCESS; } a2 = d; b2 = -b; tx2 = (-d * tx + b * ty); c2 = -c; d2 = a; ty2 = (c * tx - a * ty); dx = a2; dy = c2; tx -= 0.5; ty -= 0.5; coords[0][0] = xClip * a + yClip * b + tx; coords[0][1] = xClip * c + yClip * d + ty; coords[2][0] = wClip * a + hClip * b + tx; coords[2][1] = wClip * c + hClip * d + ty; if (div > 0) { coords[1][0] = wClip * a + yClip * b + tx; coords[1][1] = wClip * c + yClip * d + ty; coords[3][0] = xClip * a + hClip * b + tx; coords[3][1] = xClip * c + hClip * d + ty; } else { coords[3][0] = wClip * a + yClip * b + tx; coords[3][1] = wClip * c + yClip * d 
+ ty; coords[1][0] = xClip * a + hClip * b + tx; coords[1][1] = xClip * c + hClip * d + ty; } topIdx = 0; for (i = 1; i < 4; i++) { if (coords[i][1] < coords[topIdx][1]) topIdx = i; } dTop = coords[topIdx][1]; val0 = dTop; SAT32(top); bot = -1; if (top >= dstHeight) { return MLIB_SUCCESS; } if (dTop >= 0.0) { mlib_d64 xLeft, xRight, x; mlib_s32 nextIdx; if (dTop == top) { xLeft = coords[topIdx][0]; xRight = coords[topIdx][0]; nextIdx = (topIdx + 1) & 0x3; if (dTop == coords[nextIdx][1]) { x = coords[nextIdx][0]; xLeft = (xLeft <= x) ? xLeft : x; xRight = (xRight >= x) ? xRight : x; } nextIdx = (topIdx - 1) & 0x3; if (dTop == coords[nextIdx][1]) { x = coords[nextIdx][0]; xLeft = (xLeft <= x) ? xLeft : x; xRight = (xRight >= x) ? xRight : x; } val0 = xLeft; SAT32(t); leftEdges[top] = (t >= xLeft) ? t : ++t; if (xLeft >= MLIB_S32_MAX) leftEdges[top] = MLIB_S32_MAX; val0 = xRight; SAT32(rightEdges[top]); } else top++; } else top = 0; for (i = 0; i < 2; i++) { mlib_d64 dY1 = coords[(topIdx - i) & 0x3][1]; mlib_d64 dX1 = coords[(topIdx - i) & 0x3][0]; mlib_d64 dY2 = coords[(topIdx - i - 1) & 0x3][1]; mlib_d64 dX2 = coords[(topIdx - i - 1) & 0x3][0]; mlib_d64 x = dX1, slope = (dX2 - dX1) / (dY2 - dY1); mlib_s32 y1; mlib_s32 y2; if (dY1 == dY2) continue; if (dY1 < 0.0) y1 = 0; else { val0 = dY1 + 1; SAT32(y1); } val0 = dY2; SAT32(y2); if (y2 >= dstHeight) y2 = (mlib_s32) (dstHeight - 1); x += slope * (y1 - dY1); #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (j = y1; j <= y2; j++) { val0 = x; SAT32(t); leftEdges[j] = (t >= x) ? 
t : ++t; if (x >= MLIB_S32_MAX) leftEdges[j] = MLIB_S32_MAX; x += slope; } } for (i = 0; i < 2; i++) { mlib_d64 dY1 = coords[(topIdx + i) & 0x3][1]; mlib_d64 dX1 = coords[(topIdx + i) & 0x3][0]; mlib_d64 dY2 = coords[(topIdx + i + 1) & 0x3][1]; mlib_d64 dX2 = coords[(topIdx + i + 1) & 0x3][0]; mlib_d64 x = dX1, slope = (dX2 - dX1) / (dY2 - dY1); mlib_s32 y1; mlib_s32 y2; if (dY1 == dY2) continue; if (dY1 < 0.0) y1 = 0; else { val0 = dY1 + 1; SAT32(y1); } val0 = dY2; SAT32(y2); if (y2 >= dstHeight) y2 = (mlib_s32) (dstHeight - 1); x += slope * (y1 - dY1); #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (j = y1; j <= y2; j++) { val0 = x; SAT32(rightEdges[j]); x += slope; } bot = y2; } { mlib_d64 dxCl = xClip * div; mlib_d64 dyCl = yClip * div; mlib_d64 dwCl = wClip * div; mlib_d64 dhCl = hClip * div; mlib_s32 xCl = (mlib_s32) (xClip + delta); mlib_s32 yCl = (mlib_s32) (yClip + delta); mlib_s32 wCl = (mlib_s32) (wClip + delta); mlib_s32 hCl = (mlib_s32) (hClip + delta); /* * mlib_s32 xCl = (mlib_s32)(xClip + delta); * mlib_s32 yCl = (mlib_s32)(yClip + delta); * mlib_s32 wCl = (mlib_s32)(wClip); * mlib_s32 hCl = (mlib_s32)(hClip); */ if (edge == MLIB_EDGE_SRC_PADDED) { xCl = kw1; yCl = kh1; wCl = (mlib_s32) (srcWidth - ((kw - 1) - kw1)); hCl = (mlib_s32) (srcHeight - ((kh - 1) - kh1)); } div = 1.0 / div; sdx = (mlib_s32) (a2 * div * (1 << shiftx)); sdy = (mlib_s32) (c2 * div * (1 << shifty)); if (div > 0) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = top; i <= bot; i++) { mlib_s32 xLeft = leftEdges[i]; mlib_s32 xRight = rightEdges[i]; mlib_s32 xs, ys, x_e, y_e, x_s, y_s; mlib_d64 dxs, dys, dxe, dye; mlib_d64 xl, ii, xr; xLeft = (xLeft < 0) ? 0 : xLeft; xRight = (xRight >= dstWidth) ? 
(mlib_s32) (dstWidth - 1) : xRight; xl = xLeft + 0.5; ii = i + 0.5; xr = xRight + 0.5; dxs = xl * a2 + ii * b2 + tx2; dys = xl * c2 + ii * d2 + ty2; if ((dxs < dxCl) || (dxs >= dwCl) || (dys < dyCl) || (dys >= dhCl)) { dxs += dx; dys += dy; xLeft++; if ((dxs < dxCl) || (dxs >= dwCl) || (dys < dyCl) || (dys >= dhCl)) xRight = -1; } dxe = xr * a2 + ii * b2 + tx2; dye = xr * c2 + ii * d2 + ty2; if ((dxe < dxCl) || (dxe >= dwCl) || (dye < dyCl) || (dye >= dhCl)) { dxe -= dx; dye -= dy; xRight--; if ((dxe < dxCl) || (dxe >= dwCl) || (dye < dyCl) || (dye >= dhCl)) xRight = -1; } xs = (mlib_s32) ((dxs * div + delta) * (1 << shiftx)); x_s = xs >> shiftx; ys = (mlib_s32) ((dys * div + delta) * (1 << shifty)); y_s = ys >> shifty; if (x_s < xCl) xs = (xCl << shiftx); else if (x_s >= wCl) xs = ((wCl << shiftx) - 1); if (y_s < yCl) ys = (yCl << shifty); else if (y_s >= hCl) ys = ((hCl << shifty) - 1); if (xRight >= xLeft) { x_e = ((xRight - xLeft) * sdx + xs) >> shiftx; y_e = ((xRight - xLeft) * sdy + ys) >> shifty; if ((x_e < xCl) || (x_e >= wCl)) { if (sdx > 0) sdx -= 1; else sdx += 1; } if ((y_e < yCl) || (y_e >= hCl)) { if (sdy > 0) sdy -= 1; else sdy += 1; } } leftEdges[i] = xLeft; rightEdges[i] = xRight; xStarts[i] = xs; yStarts[i] = ys; if ((xRight - xLeft + 1) > max_xsize) max_xsize = (xRight - xLeft + 1); } } else { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = top; i <= bot; i++) {
/*
 * 5x5 convolution (MMX) for the mlib_u8-addressed, NCHAN-interleaved
 * variant named by this function.
 *
 * The arithmetic lives in the GET_SRC_DST_PARAMETERS, GET_KERN,
 * UNPACK_SRC, PREP_5x5 and CONV_5x5 macros defined elsewhere; they refer
 * to the locals declared here by name (presumably including width,
 * height, sl, sll, dl, dll, zero, res_lo and res_hi - confirm against the
 * macro definitions), so the declarations must stay as they are.
 *
 * Scratch memory: ten row buffers of wid4 __m64 values.  buff_arr has 20
 * slots; pbuff_arr points at either the lower or the upper ten, and after
 * each output row the ten buffer pointers are copied in rotated order
 * into the other half.
 *
 * Returns MLIB_FAILURE when the scratch buffer cannot be allocated,
 * MLIB_SUCCESS otherwise.
 */
mlib_status
mlib_m_conv5x5_8nw_4(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
    __m64 *pbuff, *buff_arr[20], **pbuff_arr = buff_arr;
    __m64 *buff0, *buff1, *buff2, *buff3;

    GET_SRC_DST_PARAMETERS(mlib_u8);
    __m64 ker[5][5];
    __m64 s0, d0, d1, d2, d3, d4, prev0, prev1, prev2, prev3, aa, bb, cc;
    __m64 sum0, sum1, sum2, sum3, sum4, res_hi, res_lo;
    __m64 zero = _m_zero;
    mlib_s32 shift, ind;
    mlib_s32 *sp;
    mlib_s32 row, wid4, i, j;

    /* only the interior (no-border) region is written */
    width -= (KSIZE - 1);
    height -= (KSIZE - 1);
    width *= NCHAN;
    dl += ((KSIZE - 1) / 2) * (dll + NCHAN);

    wid4 = (width + 7) / 4;
    pbuff = mlib_malloc(sizeof (__m64) * 10 * wid4);

    /* the row buffers are filled right away: bail out on OOM */
    if (pbuff == NULL)
        return (MLIB_FAILURE);

    GET_KERN();

    for (i = 0; i < 10; i++) {
        buff_arr[i] = pbuff + i * wid4;
    }

    /* preload the first four source rows; ind walks 0, 1, 3, 6 */
    ind = 0;
    for (j = 1; j <= 4; j++) {
        buff0 = buff_arr[ind];
        buff1 = buff_arr[ind + 1];
        buff2 = buff_arr[ind + 2];
        buff3 = buff_arr[ind + 3];

        sp = (mlib_s32 *)sl;

        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d1, lo);
        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d2, lo);
        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d3, lo);
        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d4, lo);

        for (i = 0; i < wid4; i++) {
            *(mlib_s32 *)&s0 = sp[i];

            PREP_5x5();
        }

        sl += sll;
        ind += j;
    }

    for (row = 0; row < height; row++) {
        /* these 64-bit pointers intentionally shadow the outer sp */
        __m64 *sp = (__m64 *) sl;
        __m64 *dp = (__m64 *) dl;

        buff0 = pbuff_arr[0];
        buff1 = pbuff_arr[2];
        buff2 = pbuff_arr[5];
        buff3 = pbuff_arr[9];

        s0 = (*sp++);
        UNPACK_SRC(d1, lo);
        UNPACK_SRC(d2, hi);
        s0 = (*sp++);
        UNPACK_SRC(d3, lo);
        UNPACK_SRC(d4, hi);

        for (i = 0; i < width / 8; i++) {
            s0 = sp[i];
            CONV_5x5(lo, 2 * i);
            CONV_5x5(hi, 2 * i + 1);

            dp[i] = _mm_packs_pu16(res_lo, res_hi);
        }

        /* partial last store: keep destination bytes outside the mask */
        if (width & 7) {
            __m64 mask;

            mask = ((__m64 *) mlib_mask64_arr)[width & 7];

            s0 = sp[i];
            CONV_5x5(lo, 2 * i);
            CONV_5x5(hi, 2 * i + 1);
            res_hi = _mm_packs_pu16(res_lo, res_hi);

            dp[i] =
                _mm_or_si64(_mm_and_si64(mask, res_hi),
                _mm_andnot_si64(mask, dp[i]));
        }

        /* rotate the buffer pointers into the other half of buff_arr */
        ind = (pbuff_arr == buff_arr) ? 10 : -10;
        pbuff_arr[ind + 0] = pbuff_arr[1];
        pbuff_arr[ind + 1] = pbuff_arr[3];
        pbuff_arr[ind + 2] = pbuff_arr[4];
        pbuff_arr[ind + 3] = pbuff_arr[6];
        pbuff_arr[ind + 4] = pbuff_arr[7];
        pbuff_arr[ind + 5] = pbuff_arr[8];
        pbuff_arr[ind + 6] = pbuff_arr[0];
        pbuff_arr[ind + 7] = pbuff_arr[2];
        pbuff_arr[ind + 8] = pbuff_arr[5];
        pbuff_arr[ind + 9] = pbuff_arr[9];
        pbuff_arr += ind;

        sl += sll;
        dl += dll;
    }

    _mm_empty();
    mlib_free(pbuff);

    return (MLIB_SUCCESS);
}
mlib_status CONV_FUNC(MxN)(mlib_image *dst, const mlib_image *src, const mlib_d64 *ker, mlib_s32 m, mlib_s32 n, mlib_s32 dm, mlib_s32 dn, mlib_s32 cmask) { DTYPE k0, k1, k2, k3, k4, k5, k6, *sp; DTYPE p0, p1, p2, p3, p4, p5, p6, p7; mlib_s32 l, off, kw; DEF_VARS(DTYPE); mlib_s32 chan2 = chan1 + chan1; mlib_s32 chan3 = chan1 + chan2; #ifdef TYPE_DOUBLE const mlib_d64 *k = ker; #else mlib_f32 k_arr[MAX_NM], *k = k_arr; if (n*m > MAX_NM) { k = mlib_malloc(n*m*sizeof(mlib_f32)); if (k == NULL) return MLIB_FAILURE; } for (i = 0; i < n*m; i++) k[i] = (mlib_f32)ker[i]; #endif /* TYPE_DOUBLE */ if (m == 1) return mlib_ImageConv1xN(dst, src, k, n, dn, cmask); wid -= (m - 1); hgt -= (n - 1); adr_dst += dn*dll + dm*chan1; for (c = 0; c < chan1; c++) { if (!(cmask & (1 << (chan1 - 1 - c)))) continue; sl = adr_src + c; dl = adr_dst + c; for (j = 0; j < hgt; j++) { const DTYPE *pk = k; for (l = 0; l < n; l++) { DTYPE *sp0 = sl + l*sll; for (off = 0; off < m; off += kw, pk += kw, sp0 += chan1) { kw = m - off; if (kw > 2*MAX_KER) kw = MAX_KER; else if (kw > MAX_KER) kw = kw/2; p2 = sp0[0]; p3 = sp0[chan1]; p4 = sp0[chan2]; sp0 += chan3; p5 = sp0[0]; p6 = sp0[chan1]; p7 = sp0[chan2]; k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; k4 = pk[4]; k5 = pk[5]; k6 = pk[6]; dp = dl; if (kw == 7) { sp = sp0 += chan3; if (pk == k) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = sp[- chan1]; p6 = sp[0]; p7 = sp[chan1]; dp[0 ] = p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6; dp[chan1] = p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6; sp += chan2; dp += chan2; } } else { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = sp[- chan1]; p6 = sp[0]; p7 = sp[chan1]; dp[0 ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6; dp[chan1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + 
p6*k5 + p7*k6; sp += chan2; dp += chan2; } } } else if (kw == 6) { sp = sp0 += chan2; if (pk == k) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = sp[0]; p6 = sp[chan1]; dp[0 ] = p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5; dp[chan1] = p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5; sp += chan2; dp += chan2; } } else { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = sp[0]; p6 = sp[chan1]; dp[0 ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5; dp[chan1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5; sp += chan2; dp += chan2; } } } else if (kw == 5) { sp = sp0 += chan1; if (pk == k) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = sp[0]; p5 = sp[chan1]; dp[0 ] = p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4; dp[chan1] = p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4; sp += chan2; dp += chan2; } } else { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = sp[0]; p5 = sp[chan1]; dp[0 ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4; dp[chan1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4; sp += chan2; dp += chan2; } } } else if (kw == 4) { sp = sp0; if (pk == k) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = p2; p1 = p3; p2 = p4; p3 = sp[0]; p4 = sp[chan1]; dp[0 ] = p0*k0 + p1*k1 + p2*k2 + p3*k3; dp[chan1] = p1*k0 + p2*k1 + p3*k2 + p4*k3; sp += chan2; dp += chan2; } } else { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = p2; p1 = p3; p2 = p4; p3 = sp[0]; p4 = sp[chan1]; dp[0 ] += p0*k0 + p1*k1 + p2*k2 + p3*k3; dp[chan1] += p1*k0 + p2*k1 + p3*k2 + p4*k3; sp += chan2; dp += chan2; } } } else if 
(kw == 3) { sp = sp0 -= chan1; if (pk == k) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = p2; p1 = p3; p2 = sp[0]; p3 = sp[chan1]; dp[0 ] = p0*k0 + p1*k1 + p2*k2; dp[chan1] = p1*k0 + p2*k1 + p3*k2; sp += chan2; dp += chan2; } } else { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = p2; p1 = p3; p2 = sp[0]; p3 = sp[chan1]; dp[0 ] += p0*k0 + p1*k1 + p2*k2; dp[chan1] += p1*k0 + p2*k1 + p3*k2; sp += chan2; dp += chan2; } } } else { /* kw == 2 */ sp = sp0 -= chan2; if (pk == k) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = p2; p1 = sp[0]; p2 = sp[chan1]; dp[0 ] = p0*k0 + p1*k1; dp[chan1] = p1*k0 + p2*k1; sp += chan2; dp += chan2; } } else { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = p2; p1 = sp[0]; p2 = sp[chan1]; dp[0 ] += p0*k0 + p1*k1; dp[chan1] += p1*k0 + p2*k1; sp += chan2; dp += chan2; } } } } } /* last pixels */ if (wid & 1) { DTYPE *sp0 = sl + i*chan1, s = 0; const DTYPE *pk = k; mlib_s32 x; for (l = 0; l < n; l++) { DTYPE *sp = sp0 + l*sll; for (x = 0; x < m; x++) s += sp[x*chan1] * (*pk++); } dp[0] = s; } /* next line */ sl += sll; dl += dll; } } #ifndef TYPE_DOUBLE if (k != k_arr) mlib_free(k); #endif /* TYPE_DOUBLE */ return MLIB_SUCCESS; }
/*
 * Integer MxN convolution, interior pixels only.
 *
 *   dst, src - destination / source images
 *   kernel   - m*n integer kernel coefficients
 *   m, n     - kernel width and height
 *   dm, dn   - column / row offset of the first written pixel in dst
 *   scale    - kernel scale; shift2 = scale - shift1 is computed here and
 *              is presumably consumed by the STORE_RES macro (confirm)
 *   cmask    - channel mask; channel c is processed when bit
 *              (nchannel - 1 - c) is set
 *
 * Setup:
 *   - shift1 is 8 when IMG_TYPE == 1 and 16 otherwise; every kernel
 *     coefficient is stored in k[] as kernel[i] >> shift1.
 *   - buffd is a per-row accumulator of wid mlib_s32 values; the on-stack
 *     array buff is used unless wid > BUFF_SIZE, in which case it is
 *     allocated.
 *   - k uses the on-stack array k_locl unless m*n > MAX_N*MAX_N, in which
 *     case it is allocated (buffd is released if that allocation fails).
 *
 * Per destination row, each kernel row l is processed in column groups of
 * kw taps (kw = m - off, capped to MAX_KER when it exceeds 2*MAX_KER and
 * halved when it only exceeds MAX_KER).  off is advanced before the group
 * is processed, so "l < (n - 1) || off < m" means more taps follow: such
 * groups add into buffd, while the final group adds its taps to buffd,
 * writes two pixels per iteration through STORE_RES and resets the
 * accumulator entries to 0.  The loop labelled "last pixels" computes any
 * remaining pixel (i < wid) directly from the full kernel.
 *
 * Returns MLIB_FAILURE when an allocation fails, MLIB_SUCCESS otherwise.
 */
mlib_status CONV_FUNC_I(MxN)(mlib_image *dst, const mlib_image *src, const mlib_s32 *kernel, mlib_s32 m, mlib_s32 n, mlib_s32 dm, mlib_s32 dn, mlib_s32 scale, mlib_s32 cmask) { mlib_s32 buff[BUFF_SIZE], *buffd = buff; mlib_s32 l, off, kw; mlib_s32 d0, d1, shift1, shift2; mlib_s32 k0, k1, k2, k3, k4, k5, k6; mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7; DTYPE *adr_src, *sl, *sp = NULL; DTYPE *adr_dst, *dl, *dp = NULL; mlib_s32 wid, hgt, sll, dll; mlib_s32 nchannel, chan1; mlib_s32 i, j, c; mlib_s32 chan2; mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl; GET_SRC_DST_PARAMETERS(DTYPE); #if IMG_TYPE != 1 shift1 = 16; #else shift1 = 8; #endif /* IMG_TYPE != 1 */ shift2 = scale - shift1; chan1 = nchannel; chan2 = chan1 + chan1; wid -= (m - 1); hgt -= (n - 1); adr_dst += dn*dll + dm*nchannel; if (wid > BUFF_SIZE) { buffd = mlib_malloc(sizeof(mlib_s32)*wid); if (buffd == NULL) return MLIB_FAILURE; } if (m*n > MAX_N*MAX_N) { k = mlib_malloc(sizeof(mlib_s32)*(m*n)); if (k == NULL) { if (buffd != buff) mlib_free(buffd); return MLIB_FAILURE; } } for (i = 0; i < m*n; i++) { k[i] = kernel[i] >> shift1; } for (c = 0; c < nchannel; c++) { if (!(cmask & (1 << (nchannel - 1 - c)))) continue; sl = adr_src + c; dl = adr_dst + c; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i < wid; i++) buffd[i] = 0; for (j = 0; j < hgt; j++) { mlib_s32 *pk = k; for (l = 0; l < n; l++) { DTYPE *sp0 = sl + l*sll; for (off = 0; off < m;) { sp = sp0 + off*chan1; dp = dl; kw = m - off; if (kw > 2*MAX_KER) kw = MAX_KER; else if (kw > MAX_KER) kw = kw/2; off += kw; p2 = sp[0]; p3 = sp[chan1]; p4 = sp[chan2]; p5 = sp[chan2 + chan1]; p6 = sp[chan2 + chan2]; p7 = sp[5*chan1]; k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; k4 = pk[4]; k5 = pk[5]; k6 = pk[6]; pk += kw; sp += (kw - 1)*chan1; if (kw == 7) { if (l < (n - 1) || off < m) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7; p6 = 
sp[0]; p7 = sp[chan1]; buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6; buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6; sp += chan2; } } else { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7; p6 = sp[0]; p7 = sp[chan1]; d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i ]); d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]); STORE_RES(dp[0 ], d0); STORE_RES(dp[chan1], d1); buffd[i ] = 0; buffd[i + 1] = 0; sp += chan2; dp += chan2; } } } else if (kw == 6) { if (l < (n - 1) || off < m) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = sp[0]; p6 = sp[chan1]; buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5; buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5; sp += chan2; } } else { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = sp[0]; p6 = sp[chan1]; d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i ]); d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]); STORE_RES(dp[0 ], d0); STORE_RES(dp[chan1], d1); buffd[i ] = 0; buffd[i + 1] = 0; sp += chan2; dp += chan2; } } } else if (kw == 5) { if (l < (n - 1) || off < m) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = sp[0]; p5 = sp[chan1]; buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4; buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4; sp += chan2; } } else { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = sp[0]; p5 = sp[chan1]; d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i ]); d1 = 
(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]); STORE_RES(dp[0 ], d0); STORE_RES(dp[chan1], d1); buffd[i ] = 0; buffd[i + 1] = 0; sp += chan2; dp += chan2; } } } else if (kw == 4) { if (l < (n - 1) || off < m) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = p2; p1 = p3; p2 = p4; p3 = sp[0]; p4 = sp[chan1]; buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3; buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3; sp += chan2; } } else { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = p2; p1 = p3; p2 = p4; p3 = sp[0]; p4 = sp[chan1]; d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i ]); d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]); STORE_RES(dp[0 ], d0); STORE_RES(dp[chan1], d1); buffd[i ] = 0; buffd[i + 1] = 0; sp += chan2; dp += chan2; } } } else if (kw == 3) { if (l < (n - 1) || off < m) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = p2; p1 = p3; p2 = sp[0]; p3 = sp[chan1]; buffd[i ] += p0*k0 + p1*k1 + p2*k2; buffd[i + 1] += p1*k0 + p2*k1 + p3*k2; sp += chan2; } } else { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = p2; p1 = p3; p2 = sp[0]; p3 = sp[chan1]; d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i ]); d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]); STORE_RES(dp[0 ], d0); STORE_RES(dp[chan1], d1); buffd[i ] = 0; buffd[i + 1] = 0; sp += chan2; dp += chan2; } } } else if (kw == 2) { if (l < (n - 1) || off < m) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = p2; p1 = sp[0]; p2 = sp[chan1]; buffd[i ] += p0*k0 + p1*k1; buffd[i + 1] += p1*k0 + p2*k1; sp += chan2; } } else { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = p2; p1 = sp[0]; p2 = sp[chan1]; d0 = (p0*k0 + p1*k1 + buffd[i ]); d1 = (p1*k0 + p2*k1 + buffd[i + 1]); STORE_RES(dp[0 ], 
d0); STORE_RES(dp[chan1], d1); buffd[i ] = 0; buffd[i + 1] = 0; sp += chan2; dp += chan2; } } } else /*if (kw == 1)*/ { if (l < (n - 1) || off < m) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = sp[0]; p1 = sp[chan1]; buffd[i ] += p0*k0; buffd[i + 1] += p1*k0; sp += chan2; } } else { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (wid - 2); i += 2) { p0 = sp[0]; p1 = sp[chan1]; d0 = (p0*k0 + buffd[i ]); d1 = (p1*k0 + buffd[i + 1]); STORE_RES(dp[0 ], d0); STORE_RES(dp[chan1], d1); buffd[i ] = 0; buffd[i + 1] = 0; sp += chan2; dp += chan2; } } } } } /* last pixels */ for (; i < wid; i++) { mlib_s32 *pk = k, s = 0; mlib_s32 x; for (l = 0; l < n; l++) { sp = sl + l*sll + i*chan1; for (x = 0; x < m; x++) { s += sp[0] * pk[0]; sp += chan1; pk ++; } } STORE_RES(dp[0], s); sp += chan1; dp += chan1; } sl += sll; dl += dll; } } if (buffd != buff) mlib_free(buffd); if (k != k_locl) mlib_free(k); return MLIB_SUCCESS; }
/*
 * mlib_ImageAffineEdgeExtend_BL - edge-extension pass for the affine
 * transform ("BL" presumably means bilinear - TODO confirm).
 *
 *   param, param_e  affine parameter blocks.  Only param_e->max_xsize is
 *                   read directly in the visible code; everything else is
 *                   presumably unpacked by GET_EDGE_PARAMS() - confirm
 *                   against the macro definition.
 *   colormap        optional lookup table.  When non-NULL the indexed-image
 *                   path below is taken.
 *
 * The indexed path returns MLIB_SUCCESS, or MLIB_FAILURE when its scratch
 * buffer cannot be allocated.
 *
 * NOTE(review): `channels`, `type` and `srcStride` are not declared in the
 * visible text; they are presumably introduced by GET_EDGE_PARAMS().  The
 * locals declared here (scale, xDelta, ..., a11, plut, buff) are not used
 * directly either and are presumably referenced by name inside
 * MLIB_PROCESS_EDGES - do not rename them without checking the macros.
 */
mlib_status mlib_ImageAffineEdgeExtend_BL(mlib_affine_param *param,
                                          mlib_affine_param *param_e,
                                          const void *colormap)
{
  GET_EDGE_PARAMS();
  mlib_d64 scale = 1.0 / (mlib_d64) MLIB_PREC;
  mlib_s32 xDelta, yDelta, xFlag, yFlag;
  mlib_d64 t, u, pix0;
  mlib_d64 a00, a01, a10, a11;

  /* Indexed (colormapped) images. */
  if (colormap != NULL) {
    mlib_s32 max_xsize = param_e->max_xsize;
    mlib_type ltype = mlib_ImageGetLutType(colormap);
    mlib_d64 *plut = (mlib_d64 *) mlib_ImageGetLutDoubleData(colormap);
    void *buff;

    /* The channel count comes from the LUT on this path. */
    channels = mlib_ImageGetLutChannels(colormap);
    /* Shift the table pointer back by the LUT offset (in entries of
     * `channels` doubles); presumably so it can be indexed with raw
     * pixel values - confirm. */
    plut -= channels * mlib_ImageGetLutOffset(colormap);

    /* Nothing to allocate or process. */
    if (max_xsize == 0) {
      return MLIB_SUCCESS;
    }

    /* Scratch buffer: one element per channel per pixel of the longest
     * edge span, 1 byte wide for a MLIB_BYTE LUT and sizeof(mlib_s16)
     * wide for every other LUT type. */
    if (ltype == MLIB_BYTE) {
      buff = mlib_malloc(channels * max_xsize);
    } else {
      buff = mlib_malloc(channels * max_xsize * sizeof(mlib_s16));
    }

    if (buff == NULL) return MLIB_FAILURE;

    /* Dispatch on LUT type (outer) and on `type` (inner).  Combinations
     * not listed fall through without processing anything. */
    switch (ltype) {
      case MLIB_BYTE:
        switch (type) {
          case MLIB_BYTE:
            MLIB_PROCESS_EDGES(MLIB_EDGE_INDEX_u8i, mlib_u8);
            break;
          case MLIB_SHORT:
            /* presumably converts the stride from bytes to mlib_s16
             * elements - confirm */
            srcStride >>= 1;
            MLIB_PROCESS_EDGES(MLIB_EDGE_INDEX_u8i, mlib_s16);
            break;
        }
        break;
      case MLIB_SHORT:
        switch (type) {
          case MLIB_BYTE:
            MLIB_PROCESS_EDGES(MLIB_EDGE_INDEX_s16i, mlib_u8);
            break;
          case MLIB_SHORT:
            /* same halving as in the MLIB_BYTE LUT case above */
            srcStride >>= 1;
            MLIB_PROCESS_EDGES(MLIB_EDGE_INDEX_s16i, mlib_s16);
            break;
        }
        break;
    }

    mlib_free(buff);
    return MLIB_SUCCESS;
  }
  /* NOTE(review): the definition is cut off here in this file; the
   * non-colormap path and the closing brace are not present. */
mlib_status mlib_ImageMinFilter5x5_U8( void *dst, void *src, mlib_s32 dlb, mlib_s32 slb, mlib_s32 wid, mlib_s32 hgt) #endif /* MAX_FILTER */ { mlib_u8 *pbuff, *buff0, *buff1, *buff2, *buff3, *buffT; mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *dl; __m64 *dp0, *dp1; __m64 aa, bb, cc, dd, e0, e1, e2, e3, e4, ee, f0, f1, f2, f3, f4, ff, r0, r1; __m64 e_mask, mask8080; mlib_s32 i, j, wid8, tail; wid = (wid - KSIZE1) * SSIZE; wid8 = (wid + 7) & ~7; pbuff = mlib_malloc(4 * wid8); buff0 = pbuff; buff1 = buff0 + wid8; buff2 = buff1 + wid8; buff3 = buff2 + wid8; sl = (mlib_u8 *)src; dl = (mlib_u8 *)dst + 2 * (dlb + SSIZE); tail = wid & 7; e_mask = ((__m64 *) mlib_mask64_arr)[tail]; mask8080 = mmx_from_int_dup(0x80808080); for (j = 0; j < 2; j++) { sp0 = buff0; sp1 = buff1; sp4 = sl; sp5 = sl + slb; sl += 2 * slb; for (i = 0; i < wid; i += 8) { e0 = *(__m64 *) sp4; e1 = *(__m64 *) (sp4 + SSIZE); e2 = *(__m64 *) (sp4 + 2 * SSIZE); e3 = *(__m64 *) (sp4 + 3 * SSIZE); e4 = *(__m64 *) (sp4 + 4 * SSIZE); f0 = *(__m64 *) sp5; f1 = *(__m64 *) (sp5 + SSIZE); f2 = *(__m64 *) (sp5 + 2 * SSIZE); f3 = *(__m64 *) (sp5 + 3 * SSIZE); f4 = *(__m64 *) (sp5 + 4 * SSIZE); ee = C_COMP(e0, e1); ff = C_COMP(f0, f1); e2 = C_COMP(e2, e3); f2 = C_COMP(f2, f3); ee = C_COMP(ee, e4); ff = C_COMP(ff, f4); ee = C_COMP(ee, e2); ff = C_COMP(ff, f2); *(__m64 *) sp0 = ee; *(__m64 *) sp1 = ff; sp0 += 8; sp1 += 8; sp4 += 8; sp5 += 8; } buffT = buff0; buff0 = buff2; buff2 = buffT; buffT = buff1; buff1 = buff3; buff3 = buffT; } for (j = 0; j <= (hgt - KSIZE1 - 2); j += 2) { dp0 = (void *)dl; dp1 = (void *)(dl + dlb); sp0 = buff0; sp1 = buff1; sp2 = buff2; sp3 = buff3; sp4 = sl; sp5 = sl + slb; /* * line0: aa * line1: bb * line2: cc * line3: dd * line4: e0 e1 e2 e3 e4 * line5: f0 f1 f2 f3 f4 */ for (i = 0; i <= wid - 8; i += 8) { aa = *(__m64 *) sp0; bb = *(__m64 *) sp1; cc = *(__m64 *) sp2; dd = *(__m64 *) sp3; e0 = *(__m64 *) sp4; e1 = *(__m64 *) (sp4 + SSIZE); e2 = *(__m64 *) (sp4 + 2 * SSIZE); e3 = 
*(__m64 *) (sp4 + 3 * SSIZE); e4 = *(__m64 *) (sp4 + 4 * SSIZE); f0 = *(__m64 *) sp5; f1 = *(__m64 *) (sp5 + SSIZE); f2 = *(__m64 *) (sp5 + 2 * SSIZE); f3 = *(__m64 *) (sp5 + 3 * SSIZE); f4 = *(__m64 *) (sp5 + 4 * SSIZE); ee = C_COMP(e0, e1); ff = C_COMP(f0, f1); e2 = C_COMP(e2, e3); f2 = C_COMP(f2, f3); ee = C_COMP(ee, e4); ff = C_COMP(ff, f4); ee = C_COMP(ee, e2); ff = C_COMP(ff, f2); bb = C_COMP(bb, cc); dd = C_COMP(dd, ee); bb = C_COMP(bb, dd); r0 = C_COMP(aa, bb); r1 = C_COMP(bb, ff); *(__m64 *) sp0 = ee; *(__m64 *) sp1 = ff; (*dp0++) = r0; (*dp1++) = r1; sp0 += 8; sp1 += 8; sp2 += 8; sp3 += 8; sp4 += 8; sp5 += 8; } if (tail) { aa = *(__m64 *) sp0; bb = *(__m64 *) sp1; cc = *(__m64 *) sp2; dd = *(__m64 *) sp3; e0 = *(__m64 *) sp4; e1 = *(__m64 *) (sp4 + SSIZE); e2 = *(__m64 *) (sp4 + 2 * SSIZE); e3 = *(__m64 *) (sp4 + 3 * SSIZE); e4 = *(__m64 *) (sp4 + 4 * SSIZE); f0 = *(__m64 *) sp5; f1 = *(__m64 *) (sp5 + SSIZE); f2 = *(__m64 *) (sp5 + 2 * SSIZE); f3 = *(__m64 *) (sp5 + 3 * SSIZE); f4 = *(__m64 *) (sp5 + 4 * SSIZE); ee = C_COMP(e0, e1); ff = C_COMP(f0, f1); e2 = C_COMP(e2, e3); f2 = C_COMP(f2, f3); ee = C_COMP(ee, e4); ff = C_COMP(ff, f4); ee = C_COMP(ee, e2); ff = C_COMP(ff, f2); bb = C_COMP(bb, cc); dd = C_COMP(dd, ee); bb = C_COMP(bb, dd); r0 = C_COMP(aa, bb); r1 = C_COMP(bb, ff); *(__m64 *) sp0 = ee; *(__m64 *) sp1 = ff; *dp0 = _mm_or_si64(_mm_and_si64(e_mask, r0), _mm_andnot_si64(e_mask, *dp0)); *dp1 = _mm_or_si64(_mm_and_si64(e_mask, r1), _mm_andnot_si64(e_mask, *dp1)); } buffT = buff0; buff0 = buff2; buff2 = buffT; buffT = buff1; buff1 = buff3; buff3 = buffT; sl += 2 * slb; dl += 2 * dlb; } /* last line */ if (j == (hgt - KSIZE1 - 1)) { dp0 = (void *)dl; dp1 = (void *)(dl + dlb); sp0 = buff0; sp1 = buff1; sp2 = buff2; sp3 = buff3; sp4 = sl; for (i = 0; i <= wid - 8; i += 8) { aa = *(__m64 *) sp0; bb = *(__m64 *) sp1; cc = *(__m64 *) sp2; dd = *(__m64 *) sp3; e0 = *(__m64 *) sp4; e1 = *(__m64 *) (sp4 + SSIZE); e2 = *(__m64 *) (sp4 + 2 * SSIZE); e3 = 
*(__m64 *) (sp4 + 3 * SSIZE); e4 = *(__m64 *) (sp4 + 4 * SSIZE); ee = C_COMP(e0, e1); e2 = C_COMP(e2, e3); ee = C_COMP(ee, e4); ee = C_COMP(ee, e2); bb = C_COMP(bb, cc); dd = C_COMP(dd, ee); bb = C_COMP(bb, dd); r0 = C_COMP(aa, bb); (*dp0++) = r0; sp0 += 8; sp1 += 8; sp2 += 8; sp3 += 8; sp4 += 8; } if (tail) { aa = *(__m64 *) sp0; bb = *(__m64 *) sp1; cc = *(__m64 *) sp2; dd = *(__m64 *) sp3; e0 = *(__m64 *) sp4; e1 = *(__m64 *) (sp4 + SSIZE); e2 = *(__m64 *) (sp4 + 2 * SSIZE); e3 = *(__m64 *) (sp4 + 3 * SSIZE); e4 = *(__m64 *) (sp4 + 4 * SSIZE); ee = C_COMP(e0, e1); e2 = C_COMP(e2, e3); ee = C_COMP(ee, e4); ee = C_COMP(ee, e2); bb = C_COMP(bb, cc); dd = C_COMP(dd, ee); bb = C_COMP(bb, dd); r0 = C_COMP(aa, bb); *dp0 = _mm_or_si64(_mm_and_si64(e_mask, r0), _mm_andnot_si64(e_mask, *dp0)); } } _mm_empty(); mlib_free(pbuff); return (MLIB_SUCCESS); }
/*
 * mlib_convMxN_8nw_mask - M x N convolution of a multi-channel byte image
 * using VIS intrinsics, applied only to the channels selected by `cmask`.
 *
 *   dst, src   destination / source images
 *   m, n       kernel width and height
 *   dm, dn     column / row offset of the first written destination pixel
 *   kern       kernel coefficients; read here as mlib_f32 (see karr),
 *              presumably pre-packed by the caller - confirm
 *   scale      scaling exponent; used to derive the GSR scale field and to
 *              pick the rounding constant from mlib_round_8[]
 *   cmask      channel mask, tested against 1, 2, 4, ... for channels
 *              0, 1, 2, ... in loop order
 *
 * For every selected channel the source rows are de-interleaved into a ring
 * of n + 1 single-channel row buffers.  For every output row the kernel is
 * applied column by column, in groups of up to four kernel rows, with
 * partial sums accumulated in `buffd`.  The last kernel column of the first
 * row group is deliberately left for a separate "final iteration", which
 * adds its contribution, packs the sums into `buffe` and resets the
 * accumulators to the rounding constant.  The packed row is then inserted
 * back into the interleaved destination.
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE when a work buffer cannot be
 * allocated.
 *
 * Changes from the previous version: the final iteration loaded kernel
 * entries and row pointers for four rows unconditionally, which reads past
 * an m * n kernel and reads unset row pointers whenever n < 4; those loads
 * are now limited to the rows that are actually used.  A local `buffn`
 * that shadowed the function-level one was removed.
 */
mlib_status mlib_convMxN_8nw_mask(mlib_image *dst, const mlib_image *src,
                                  mlib_s32 m, mlib_s32 n,
                                  mlib_s32 dm, mlib_s32 dn,
                                  const mlib_s32 *kern, mlib_s32 scale,
                                  mlib_s32 cmask)
{
  mlib_d64 *buffs_local[3 * (MAX_N + 1)], **buffs = buffs_local, **buff;
  mlib_d64 *buff0, *buff1, *buff2, *buff3, *buffn, *buffd, *buffe;
  mlib_d64 s00, s01, s10, s11, s20, s21, s30, s31, s0, s1, s2, s3;
  mlib_d64 d00, d01, d10, d11, d20, d21, d30, d31;
  mlib_d64 dd, d0, d1;
  mlib_s32 ik, jk, ik_last, jk_size, coff, off, doff;
  mlib_u8 *sl, *sp, *dl;
  mlib_s32 hgt = mlib_ImageGetHeight(src);
  mlib_s32 wid = mlib_ImageGetWidth(src);
  mlib_s32 sll = mlib_ImageGetStride(src);
  mlib_s32 dll = mlib_ImageGetStride(dst);
  mlib_u8 *adr_src = (mlib_u8 *) mlib_ImageGetData(src);
  mlib_u8 *adr_dst = (mlib_u8 *) mlib_ImageGetData(dst);
  mlib_s32 ssize, xsize, dsize, esize, buff_ind;
  mlib_d64 *pbuff, *dp;
  mlib_f32 *karr = (mlib_f32 *) kern;
  mlib_s32 gsr_scale = (31 - scale) << 3;
  mlib_d64 drnd = vis_to_double_dup(mlib_round_8[31 - scale]);
  mlib_s32 i, j, l, chan, testchan;
  mlib_s32 nchan = mlib_ImageGetChannels(dst);
  void (*p_proc_load) (const mlib_u8 *, mlib_u8 *, mlib_s32, mlib_s32);
  void (*p_proc_store) (const mlib_u8 *, mlib_u8 *, mlib_s32, mlib_s32);

  /* Row-pointer table: 3 * (n + 1) entries, on the stack when it fits. */
  if (n > MAX_N) {
    buffs = mlib_malloc(3 * (n + 1) * sizeof(mlib_d64 *));

    if (buffs == NULL)
      return MLIB_FAILURE;
  }

  /* Third section of the table: the row window of the current output row. */
  buff = buffs + 2 * (n + 1);

  adr_dst += dn * dll + dm * nchan;

  ssize = wid;
  dsize = (ssize + 7) / 8;
  esize = dsize + 4;

  /* n + 1 source rows, 2 * esize of accumulators, esize of packed output. */
  pbuff = mlib_malloc((n + 4) * esize * sizeof(mlib_d64));

  if (pbuff == NULL) {
    if (buffs != buffs_local)
      mlib_free(buffs);
    return MLIB_FAILURE;
  }

  for (i = 0; i < (n + 1); i++)
    buffs[i] = pbuff + i * esize;
  /* Second copy of the ring so that a window of n + 1 consecutive entries
   * starting at any buff_ind never has to wrap. */
  for (i = 0; i < (n + 1); i++)
    buffs[(n + 1) + i] = buffs[i];
  buffd = buffs[n] + esize;
  buffe = buffd + 2 * esize;

  hgt -= (n - 1);
  xsize = ssize - (m - 1);

  vis_write_gsr(gsr_scale + 7);

  /* Channel extract / insert routines for the image's channel count. */
  if (nchan == 2) {
    p_proc_load = &mlib_v_ImageChannelExtract_U8_21_D1;
    p_proc_store = &mlib_v_ImageChannelInsert_U8_12_D1;
  } else if (nchan == 3) {
    p_proc_load = &mlib_v_ImageChannelExtract_U8_31_D1;
    p_proc_store = &mlib_v_ImageChannelInsert_U8_13_D1;
  } else {
    p_proc_load = &mlib_v_ImageChannelExtract_U8_41_D1;
    p_proc_store = &mlib_v_ImageChannelInsert_U8_14_D1;
  }

  testchan = 1;
  for (chan = 0; chan < nchan; chan++) {
    buff_ind = 0;
    sl = adr_src;
    dl = adr_dst;

    /* Channel not selected by the mask. */
    if ((cmask & testchan) == 0) {
      testchan <<= 1;
      continue;
    }

    /* Preload the first n source rows of this channel. */
    for (l = 0; l < n; l++) {
      sp = sl + l * sll;
      (*p_proc_load) ((mlib_u8 *) sp, (mlib_u8 *) buffs[l], ssize, testchan);
    }

    /* init buffer */
#pragma pipeloop(0)
    for (i = 0; i < (xsize + 7) / 8; i++) {
      buffd[2 * i] = drnd;
      buffd[2 * i + 1] = drnd;
    }

    for (j = 0; j < hgt; j++) {
      mlib_d64 **buffc = buffs + buff_ind;
      mlib_f32 *pk = karr, k0, k1, k2, k3;

      sp = sl + n * sll;

      for (l = 0; l < n; l++) {
        buff[l] = buffc[l];
      }

      /* Load the row that the NEXT output row will need into the free
       * ring slot. */
      buffn = buffc[n];

      (*p_proc_load) ((mlib_u8 *) sp, (mlib_u8 *) buffn, ssize, testchan);

      ik_last = (m - 1);

      /* Kernel rows are consumed in groups of jk_size = 1..4; the
       * adjustments below avoid leaving an awkward remainder. */
      for (jk = 0; jk < n; jk += jk_size) {
        jk_size = n - jk;

        if (jk_size >= 6)
          jk_size = 4;
        if (jk_size == 5)
          jk_size = 3;

        coff = 0;

        if (jk_size == 1) {

          for (ik = 0; ik < m; ik++, coff++) {
            /* Left for the final iteration below. */
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];

            doff = coff / 8;
            buff0 = buff[jk] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            s01 = buff0[0];
#pragma pipeloop(0)
            for (i = 0; i < (xsize + 7) / 8; i++) {
              s00 = s01;
              s01 = buff0[i + 1];
              s0 = vis_faligndata(s00, s01);

              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);

              d0 = buffd[2 * i];
              d1 = buffd[2 * i + 1];
              d0 = vis_fpadd16(d00, d0);
              d1 = vis_fpadd16(d01, d1);
              buffd[2 * i] = d0;
              buffd[2 * i + 1] = d1;
            }
          }

          pk += m;

        } else if (jk_size == 2) {

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];
            k1 = pk[ik + m];

            doff = coff / 8;
            buff0 = buff[jk] + doff;
            buff1 = buff[jk + 1] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            s01 = buff0[0];
            s11 = buff1[0];
#pragma pipeloop(0)
            for (i = 0; i < (xsize + 7) / 8; i++) {
              s00 = s01;
              s10 = s11;
              s01 = buff0[i + 1];
              s11 = buff1[i + 1];
              s0 = vis_faligndata(s00, s01);
              s1 = vis_faligndata(s10, s11);

              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
              d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
              d11 = vis_fmul8x16au(vis_read_lo(s1), k1);

              d0 = buffd[2 * i];
              d1 = buffd[2 * i + 1];
              d0 = vis_fpadd16(d00, d0);
              d0 = vis_fpadd16(d10, d0);
              d1 = vis_fpadd16(d01, d1);
              d1 = vis_fpadd16(d11, d1);
              buffd[2 * i] = d0;
              buffd[2 * i + 1] = d1;
            }
          }

          pk += 2 * m;

        } else if (jk_size == 3) {

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];
            k1 = pk[ik + m];
            k2 = pk[ik + 2 * m];

            doff = coff / 8;
            buff0 = buff[jk] + doff;
            buff1 = buff[jk + 1] + doff;
            buff2 = buff[jk + 2] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            if (off == 0) {
              /* Column offset is a multiple of 8: no realignment. */
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s0 = buff0[i];
                s1 = buff1[i];
                s2 = buff2[i];

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);

                d00 = vis_fpadd16(d00, d10);
                d0 = vis_fpadd16(d20, d0);
                d0 = vis_fpadd16(d00, d0);
                d01 = vis_fpadd16(d01, d11);
                d1 = vis_fpadd16(d21, d1);
                d1 = vis_fpadd16(d01, d1);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }

            } else if (off == 4) {
              /* Half-word offset: take the low half of the previous
               * doubleword and the high half of the next one. */
              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];

                d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
                d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
                d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
                d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
                d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
                d21 = vis_fmul8x16au(vis_read_hi(s21), k2);

                d00 = vis_fpadd16(d00, d10);
                d0 = vis_fpadd16(d20, d0);
                d0 = vis_fpadd16(d00, d0);
                d01 = vis_fpadd16(d01, d11);
                d1 = vis_fpadd16(d21, d1);
                d1 = vis_fpadd16(d01, d1);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }

            } else {
              /* General offset: realign with vis_faligndata. */
              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];
                s0 = vis_faligndata(s00, s01);
                s1 = vis_faligndata(s10, s11);
                s2 = vis_faligndata(s20, s21);

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);

                d00 = vis_fpadd16(d00, d10);
                d0 = vis_fpadd16(d20, d0);
                d0 = vis_fpadd16(d00, d0);
                d01 = vis_fpadd16(d01, d11);
                d1 = vis_fpadd16(d21, d1);
                d1 = vis_fpadd16(d01, d1);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
          }

          pk += 3 * m;

        } else {                            /* jk_size == 4 */

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];
            k1 = pk[ik + m];
            k2 = pk[ik + 2 * m];
            k3 = pk[ik + 3 * m];

            doff = coff / 8;
            buff0 = buff[jk] + doff;
            buff1 = buff[jk + 1] + doff;
            buff2 = buff[jk + 2] + doff;
            buff3 = buff[jk + 3] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            if (off == 0) {

#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s0 = buff0[i];
                s1 = buff1[i];
                s2 = buff2[i];
                s3 = buff3[i];

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
                d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
                d31 = vis_fmul8x16au(vis_read_lo(s3), k3);

                d00 = vis_fpadd16(d00, d10);
                d20 = vis_fpadd16(d20, d30);
                d0 = vis_fpadd16(d0, d00);
                d0 = vis_fpadd16(d0, d20);
                d01 = vis_fpadd16(d01, d11);
                d21 = vis_fpadd16(d21, d31);
                d1 = vis_fpadd16(d1, d01);
                d1 = vis_fpadd16(d1, d21);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }

            } else if (off == 4) {

              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
              s31 = buff3[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s30 = s31;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];
                s31 = buff3[i + 1];

                d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
                d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
                d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
                d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
                d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
                d21 = vis_fmul8x16au(vis_read_hi(s21), k2);
                d30 = vis_fmul8x16au(vis_read_lo(s30), k3);
                d31 = vis_fmul8x16au(vis_read_hi(s31), k3);

                d00 = vis_fpadd16(d00, d10);
                d20 = vis_fpadd16(d20, d30);
                d0 = vis_fpadd16(d0, d00);
                d0 = vis_fpadd16(d0, d20);
                d01 = vis_fpadd16(d01, d11);
                d21 = vis_fpadd16(d21, d31);
                d1 = vis_fpadd16(d1, d01);
                d1 = vis_fpadd16(d1, d21);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }

            } else {

              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
              s31 = buff3[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s30 = s31;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];
                s31 = buff3[i + 1];
                s0 = vis_faligndata(s00, s01);
                s1 = vis_faligndata(s10, s11);
                s2 = vis_faligndata(s20, s21);
                s3 = vis_faligndata(s30, s31);

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
                d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
                d31 = vis_fmul8x16au(vis_read_lo(s3), k3);

                d00 = vis_fpadd16(d00, d10);
                d20 = vis_fpadd16(d20, d30);
                d0 = vis_fpadd16(d0, d00);
                d0 = vis_fpadd16(d0, d20);
                d01 = vis_fpadd16(d01, d11);
                d21 = vis_fpadd16(d21, d31);
                d1 = vis_fpadd16(d1, d01);
                d1 = vis_fpadd16(d1, d21);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
          }

          pk += 4 * m;
        }
      }

      /*
       * Final iteration: the kernel column ik_last of the first row group,
       * skipped above.  Adds its contribution, packs the result into buffe
       * and resets the accumulators to the rounding constant.
       */
      jk_size = n;

      if (jk_size >= 6)
        jk_size = 4;
      if (jk_size == 5)
        jk_size = 3;

      off = ik_last;
      doff = off / 8;
      off &= 7;

      /* Load only the coefficients and row pointers of the rows this
       * group really has; rows beyond n do not exist in the kernel or in
       * the row window. */
      k0 = karr[ik_last];
      buff0 = buff[0] + doff;
      if (jk_size > 1) {
        k1 = karr[ik_last + m];
        buff1 = buff[1] + doff;
      }
      if (jk_size > 2) {
        k2 = karr[ik_last + 2 * m];
        buff2 = buff[2] + doff;
      }
      if (jk_size > 3) {
        k3 = karr[ik_last + 3 * m];
        buff3 = buff[3] + doff;
      }

      vis_write_gsr(gsr_scale + off);

      if (jk_size == 1) {
        dp = buffe;

        s01 = buff0[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s01 = buff0[i + 1];
          s0 = vis_faligndata(s00, s01);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d1 = vis_fpadd16(d1, d01);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }

      } else if (jk_size == 2) {
        dp = buffe;

        s01 = buff0[0];
        s11 = buff1[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s10 = s11;
          s01 = buff0[i + 1];
          s11 = buff1[i + 1];
          s0 = vis_faligndata(s00, s01);
          s1 = vis_faligndata(s10, s11);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d0 = vis_fpadd16(d0, d10);
          d1 = vis_fpadd16(d1, d01);
          d1 = vis_fpadd16(d1, d11);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }

      } else if (jk_size == 3) {
        dp = buffe;

        s01 = buff0[0];
        s11 = buff1[0];
        s21 = buff2[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s10 = s11;
          s20 = s21;
          s01 = buff0[i + 1];
          s11 = buff1[i + 1];
          s21 = buff2[i + 1];
          s0 = vis_faligndata(s00, s01);
          s1 = vis_faligndata(s10, s11);
          s2 = vis_faligndata(s20, s21);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
          d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
          d21 = vis_fmul8x16au(vis_read_lo(s2), k2);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d0 = vis_fpadd16(d0, d10);
          d0 = vis_fpadd16(d0, d20);
          d1 = vis_fpadd16(d1, d01);
          d1 = vis_fpadd16(d1, d11);
          d1 = vis_fpadd16(d1, d21);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }

      } else {                              /* if (jk_size == 4) */
        dp = buffe;

        s01 = buff0[0];
        s11 = buff1[0];
        s21 = buff2[0];
        s31 = buff3[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s10 = s11;
          s20 = s21;
          s30 = s31;
          s01 = buff0[i + 1];
          s11 = buff1[i + 1];
          s21 = buff2[i + 1];
          s31 = buff3[i + 1];
          s0 = vis_faligndata(s00, s01);
          s1 = vis_faligndata(s10, s11);
          s2 = vis_faligndata(s20, s21);
          s3 = vis_faligndata(s30, s31);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
          d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
          d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
          d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
          d31 = vis_fmul8x16au(vis_read_lo(s3), k3);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d0 = vis_fpadd16(d0, d10);
          d0 = vis_fpadd16(d0, d20);
          d0 = vis_fpadd16(d0, d30);
          d1 = vis_fpadd16(d1, d01);
          d1 = vis_fpadd16(d1, d11);
          d1 = vis_fpadd16(d1, d21);
          d1 = vis_fpadd16(d1, d31);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }

      /* Write the packed row back into this channel of the destination. */
      (*p_proc_store) ((mlib_u8 *) buffe, (mlib_u8 *) dl, xsize, testchan);

      sl += sll;
      dl += dll;

      /* Advance the ring of row buffers. */
      buff_ind++;

      if (buff_ind >= (n + 1))
        buff_ind = 0;
    }

    testchan <<= 1;
  }

  mlib_free(pbuff);

  if (buffs != buffs_local)
    mlib_free(buffs);

  return MLIB_SUCCESS;
}
/*
 * mlib_m_ImageInitInterpTableAffine_S16 - build (and cache inside `table`)
 * the 16-bit horizontal / vertical coefficient tables used by the affine
 * transform for the given channel count.
 *
 *   table   interpolation table; NULL is rejected
 *   nchan   channel count, 1..4; anything else is rejected
 *
 * Returns MLIB_SUCCESS when the tables already exist or were created, and
 * MLIB_FAILURE on bad arguments, allocation failure, a table previously
 * marked unusable (shift_vis_affine < 0) or coefficients too large to
 * scale.
 *
 * Change from the previous version: in the all-zero-kernel case the tables
 * were published and the function returned only from inside the `else`
 * branch.  When the horizontal and vertical data coincide and four copies
 * are requested, control fell through with the new table neither stored nor
 * freed, and then divided by sumv == 0.  Publishing and returning now
 * happen for both branches.
 *
 * NOTE(review): the definition is cut off in this file after the scale
 * clamping at the end of this block; the remainder is not visible.
 */
mlib_status
mlib_m_ImageInitInterpTableAffine_S16(
    mlib_interp_table * table, mlib_s32 nchan)
{
  mlib_s32 width, height, width_bits, height_bits, vis_width_bits,
      vis_height_bits;
  mlib_s32 subsampleBitsH, subsampleBitsV;
  mlib_s32 i, j, c, scale, num_copy, num_copy_old;
  mlib_s32 isum;
  mlib_s32 max_scale, min_scale, scaleh, scalev;
  mlib_s32 norm_scale_v, norm_scale_h;
  mlib_d64 dscale, *dataH, *dataV;
  mlib_d64 **ptr_tablex, *tablex, *tablex_old, *tabley;
  mlib_d64 max, d;
  mlib_d64 sumh, sumv, normh, normv;

  if (!table)
    return (MLIB_FAILURE);

  /* A negative shift marks a table that already failed to initialise. */
  if (table->shift_vis_affine < 0)
    return (MLIB_FAILURE);

  /* Pick the cache slot and the number of coefficient copies per entry. */
  if (nchan == 1) {
    num_copy = 1;
    ptr_tablex = &(table->dataH_s16_1);
  } else if (nchan == 2) {
    /* NOTE(review): this slot is filled with 2 copies per entry here but
     * read back with num_copy_old = 3 below - confirm which is intended. */
    num_copy = 2;
    ptr_tablex = &(table->dataH_s16_3);
  } else if (nchan == 3 || nchan == 4) {
    num_copy = 4;
    ptr_tablex = &(table->dataH_s16_4);
  } else
    return (MLIB_FAILURE);

  /* Both tables already cached for this channel count. */
  if (*ptr_tablex != NULL && table->dataV_s16_1 != NULL)
    return (MLIB_SUCCESS);

  dataH = mlib_ImageGetInterpDoubleDataH(table);
  dataV = mlib_ImageGetInterpDoubleDataV(table);

  if (!dataH || !dataV)
    return (MLIB_FAILURE);

  width = mlib_ImageGetInterpWidth(table);
  height = mlib_ImageGetInterpHeight(table);
  width_bits = mlib_ImageGetInterpWidthBits(table);
  height_bits = mlib_ImageGetInterpHeightBits(table);
  vis_width_bits = table->vis_width_bits;
  vis_height_bits = table->vis_height_bits;
  subsampleBitsH = mlib_ImageGetInterpSubsampleBitsH(table);
  subsampleBitsV = mlib_ImageGetInterpSubsampleBitsV(table);

  /*
   * The vertical table exists, so a horizontal table was already built for
   * another channel count: re-replicate its coefficients with the copy
   * count needed now instead of recomputing them.
   */
  if (table->dataV_s16_1 != NULL) {
    if (table->dataH_s16_1 != NULL) {
      tablex_old = table->dataH_s16_1;
      num_copy_old = 1;
    } else if (table->dataH_s16_3 != NULL) {
      tablex_old = table->dataH_s16_3;
      num_copy_old = 3;
    } else {
      tablex_old = table->dataH_s16_4;
      num_copy_old = 4;
    }

    tablex = mlib_malloc(num_copy * (1 << subsampleBitsH) *
        (1 << vis_width_bits) * sizeof (mlib_s16));

    if (tablex == NULL)
      return (MLIB_FAILURE);

    for (j = 0; j < ((width + 1) & ~1); j++) {
      mlib_s16 *tbl = (mlib_s16 *)tablex + j * num_copy;
      mlib_s16 *tbl_old = (mlib_s16 *)tablex_old + j * num_copy_old;

      for (i = 0; i < (1 << subsampleBitsH); i++) {
        /* First copy of the old entry, replicated num_copy times. */
        mlib_s16 v = tbl_old[num_copy_old * (i << vis_width_bits)];

        for (c = 0; c < num_copy; c++) {
          tbl[num_copy * (i << vis_width_bits) + c] = v;
        }
      }
    }

    *ptr_tablex = tablex;
    return (MLIB_SUCCESS);
  }

  /* Largest absolute row sum and largest absolute coefficient (vertical). */
  sumv = 0;
  max = 0;

  for (i = 0; i < (1 << subsampleBitsV); i++) {
    mlib_d64 s = 0;
    mlib_s32 ind = (i << height_bits);

    for (j = 0; j < height; j++) {
      d = mlib_fabs(dataV[j + ind]);
      s += d;
      max = (max > d) ? max : d;
    }

    sumv = (sumv > s) ? sumv : s;
  }

  /* all fhkernels = 0 */
  if (sumv == 0) {
    /* Not read directly below; presumably consumed by INIT_TABLE_16. */
    dscale = 0;

    /* X table */
    tablex = mlib_malloc(num_copy * (1 << subsampleBitsH) *
        (1 << vis_width_bits) * sizeof (mlib_s16));

    if (tablex == NULL)
      return (MLIB_FAILURE);

    INIT_TABLE_16(tablex, (1 << subsampleBitsH), width, width_bits,
        vis_width_bits, dataH);

    if ((dataH == dataV) && num_copy == 4) {
      /* Same data and same copy count: share one table. */
      tabley = tablex;
    } else {
      num_copy = 4;

      tabley = mlib_malloc(num_copy * (1 << subsampleBitsV) *
          (1 << vis_height_bits) * sizeof (mlib_s16));

      if (tabley == NULL) {
        mlib_free(tablex);
        return (MLIB_FAILURE);
      }

      INIT_TABLE_16(tabley, (1 << subsampleBitsV), height, height_bits,
          vis_height_bits, dataV);
    }

    *ptr_tablex = tablex;
    table->dataV_s16_1 = tabley;

    /* Store shift */
    table->shift_vis_affine = 43;

    return (MLIB_SUCCESS);
  }

  normv = 32767.0 / (32768.0 * sumv);

  scalev = mlib_ilogb(sumv * normv);
  isum = mlib_ilogb(max * normv);

  /* all elements must be in the range -32768, 32767 */
  if (scalev == isum)
    norm_scale_v = 14;
  /* but sumv may be in the range -65576, 65575 */
  else
    norm_scale_v = 15;

  min_scale = 25;
  max_scale = 40;

  normh = 32768.0 * sumv / 32767;

  /* Same statistics for the horizontal data, unless it is the same array. */
  if (dataH != dataV) {
    sumh = 0;
    max = 0;

    for (i = 0; i < (1 << subsampleBitsH); i++) {
      mlib_d64 s = 0;
      mlib_s32 ind = (i << width_bits);

      for (j = 0; j < width; j++) {
        d = mlib_fabs(dataH[j + ind]);
        s += d;
        max = (max > d) ? max : d;
      }

      sumh = (sumh > s) ? sumh : s;
    }
  } else
    sumh = sumv;

  isum = mlib_ilogb(max * normh);
  scaleh = mlib_ilogb(sumh * normh);

  /* all elements must be in the range -32768, 32767 */
  if (scaleh == isum)
    norm_scale_h = 14;
  /* but sumh may be in the range -65576, 65575 */
  else
    norm_scale_h = 15;

  scale = norm_scale_v + norm_scale_h - (scaleh + scalev);

  if (scale < min_scale) {
    table->shift_vis_affine = -1;
    /* coefficients are too large */
    return (MLIB_FAILURE);
  }

  /* Clamp the total shift; the excess is split between the two axes. */
  if (scale > max_scale) {
    scaleh += (scale - max_scale + 1) >> 1;
    scalev += (scale - max_scale) >> 1;
    scale = max_scale;
  }
/*
 * mlib_m_conv3x3_16nw_4 - 3x3 convolution of a 16-bit image using MMX
 * (the name suggests the 4-channel, "no write to edges" variant; the
 * channel count actually used is the NCHAN macro).
 *
 *   dst, src       destination / source images
 *   kern           kernel coefficients (consumed by GET_KERN)
 *   scalef_expon   scaling exponent (presumably consumed by GET_KERN to
 *                  derive `shift` - confirm against the macro)
 *
 * Two source rows are pre-processed into line buffers by PREP_3x3; every
 * following row is combined with the buffered ones by CONV_3x3 to give one
 * output row.  Output starts one row down and one pixel in, and is
 * width - 2 by height - 2 pixels.  A trailing partial group of 16-bit
 * values is merged into the destination under a mask so that samples past
 * the row end are preserved.
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE when the heap line buffer (needed
 * only for rows wider than BUFF_LINE groups) cannot be allocated; the
 * original code used the NULL pointer in that case.
 *
 * NOTE(review): the locals declared below are referenced by name from the
 * GET_SRC_DST_PARAMETERS / GET_KERN / PREP_3x3 / CONV_3x3 macros; do not
 * rename or remove them without checking those definitions.
 */
mlib_status
mlib_m_conv3x3_16nw_4(
    mlib_image *dst, const mlib_image *src, const mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
  __m64 buff_loc[6 * BUFF_LINE], *pbuff = buff_loc;
  __m64 *buff0, *buff1, *buff2, *buffT;

  GET_SRC_DST_PARAMETERS(mlib_s16);
  __m64 ker1, ker2, ker3, ker4, ker5, ker6, ker7, ker8, ker9;
  __m64 d0, d1, d2, rr, tmpa, tmpb;
  __m64 prev0h, prev1h, sum0h, sum1h, sum2h, tmph;
  __m64 prev0l, prev1l, sum0l, sum1l, sum2l, tmpl;
  __m64 *sp, *dp;
  mlib_s32 shift;
  mlib_s32 row, wid4, i, j;

  /* Output size, with the width expressed in 16-bit samples. */
  width -= 2;
  height -= 2;
  width *= NCHAN;
  dl += dll + NCHAN;

  /* Number of 4-sample (64-bit) groups per output row. */
  wid4 = (width + 3) / 4;

  /* Fall back to the heap when the stack buffer is too small. */
  if (wid4 > BUFF_LINE) {
    pbuff = mlib_malloc(sizeof (__m64) * 6 * wid4);

    if (pbuff == NULL)
      return (MLIB_FAILURE);
  }

  GET_KERN();

  buff0 = pbuff;
  buff1 = buff0 + 2 * wid4;
  buff2 = buff1 + 2 * wid4;

  /* Prologue: feed the first two source rows into the line buffers. */
  for (j = 0; j < 2; j++) {
    sp = (__m64 *) sl;

    d1 = (*sp++);
    d2 = (*sp++);

    for (i = 0; i < wid4; i++) {
      PREP_3x3(i);
    }

    sl += sll;

    if (j == 0) {
      buffT = buff1;
      buff1 = buff0;
      buff0 = buffT;
    }
  }

  for (row = 0; row < height; row++) {
    sp = (__m64 *) sl;
    dp = (__m64 *) dl;

    d1 = (*sp++);
    d2 = (*sp++);

    /* Full 4-sample groups. */
    for (i = 0; i < width / 4; i++) {
      CONV_3x3(i);
      dp[i] = rr;
    }

    /* Trailing 1..3 samples: merge under a mask. */
    if (width & 3) {
      __m64 mask = ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

      CONV_3x3(i);
      dp[i] = _mm_or_si64(_mm_and_si64(mask, rr),
                          _mm_andnot_si64(mask, dp[i]));
    }

    buffT = buff1;
    buff1 = buff0;
    buff0 = buffT;

    sl += sll;
    dl += dll;
  }

  /* Leave MMX state before returning to code that may use the FPU. */
  _mm_empty();

  if (pbuff != buff_loc)
    mlib_free(pbuff);

  return (MLIB_SUCCESS);
}
static mlib_status mlib_ImageConv1xN(mlib_image *dst, const mlib_image *src, const mlib_d64 *k, mlib_s32 n, mlib_s32 dn, mlib_s32 cmask) { FTYPE buff[BUFF_SIZE]; mlib_s32 off, kh; mlib_s32 d0, d1; const FTYPE *pk; FTYPE k0, k1, k2, k3; FTYPE p0, p1, p2, p3, p4; DEF_VARS(DTYPE); DTYPE *sl_c, *dl_c, *sl0; mlib_s32 l, hsize, max_hsize; GET_SRC_DST_PARAMETERS(DTYPE); hgt -= (n - 1); adr_dst += dn*dll; max_hsize = (CACHE_SIZE/sizeof(DTYPE))/sll; if (!max_hsize) max_hsize = 1; if (max_hsize > BUFF_SIZE) { pbuff = mlib_malloc(sizeof(FTYPE)*max_hsize); } chan1 = nchannel; sl_c = adr_src; dl_c = adr_dst; for (l = 0; l < hgt; l += hsize) { hsize = hgt - l; if (hsize > max_hsize) hsize = max_hsize; for (c = 0; c < nchannel; c++) { if (!(cmask & (1 << (chan1 - 1 - c)))) continue; sl = sl_c + c; dl = dl_c + c; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (j = 0; j < hsize; j++) pbuff[j] = 0.0; for (i = 0; i < wid; i++) { sl0 = sl; for (off = 0; off < (n - 4); off += 4) { pk = k + off; sp = sl0; k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll]; sp += 3*sll; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (j = 0; j < hsize; j += 2) { p0 = p2; p1 = p3; p2 = p4; p3 = sp[0]; p4 = sp[sll]; pbuff[j ] += p0*k0 + p1*k1 + p2*k2 + p3*k3; pbuff[j + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3; sp += 2*sll; } sl0 += 4*sll; } pk = k + off; sp = sl0; k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll]; dp = dl; kh = n - off; if (kh == 4) { sp += 3*sll; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (j = 0; j <= (hsize - 2); j += 2) { p0 = p2; p1 = p3; p2 = p4; p3 = sp[0]; p4 = sp[sll]; d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]); d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + pbuff[j + 1]); dp[0 ] = FROM_S32(d0); dp[dll] = FROM_S32(d1); pbuff[j] = 0; pbuff[j + 1] = 0; sp += 2*sll; dp += 2*dll; } if (j < hsize) { p0 = p2; p1 = p3; p2 = p4; p3 = sp[0]; d0 = D2I(p0*k0 + 
p1*k1 + p2*k2 + p3*k3 + pbuff[j]); pbuff[j] = 0; dp[0] = FROM_S32(d0); } } else if (kh == 3) { sp += 2*sll; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (j = 0; j <= (hsize - 2); j += 2) { p0 = p2; p1 = p3; p2 = sp[0]; p3 = sp[sll]; d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]); d1 = D2I(p1*k0 + p2*k1 + p3*k2 + pbuff[j + 1]); dp[0 ] = FROM_S32(d0); dp[dll] = FROM_S32(d1); pbuff[j] = 0; pbuff[j + 1] = 0; sp += 2*sll; dp += 2*dll; } if (j < hsize) { p0 = p2; p1 = p3; p2 = sp[0]; d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]); pbuff[j] = 0; dp[0] = FROM_S32(d0); } } else if (kh == 2) { sp += sll; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (j = 0; j <= (hsize - 2); j += 2) { p0 = p2; p1 = sp[0]; p2 = sp[sll]; d0 = D2I(p0*k0 + p1*k1 + pbuff[j]); d1 = D2I(p1*k0 + p2*k1 + pbuff[j + 1]); dp[0 ] = FROM_S32(d0); dp[dll] = FROM_S32(d1); pbuff[j] = 0; pbuff[j + 1] = 0; sp += 2*sll; dp += 2*dll; } if (j < hsize) { p0 = p2; p1 = sp[0]; d0 = D2I(p0*k0 + p1*k1 + pbuff[j]); pbuff[j] = 0; dp[0] = FROM_S32(d0); } } else /* if (kh == 1) */ { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (j = 0; j < hsize; j++) { p0 = sp[0]; d0 = D2I(p0*k0 + pbuff[j]); dp[0] = FROM_S32(d0); pbuff[j] = 0; sp += sll; dp += dll; } } sl += chan1; dl += chan1; } } sl_c += max_hsize*sll; dl_c += max_hsize*dll; } if (pbuff != buff) mlib_free(pbuff); return MLIB_SUCCESS; }
/*
 * General m x n convolution with an integer kernel.
 *
 * dst     destination image; writing starts dn rows down and dm pixels in.
 * src     source image.
 * kernel  m*n integer coefficients, read row by row.
 * m, n    kernel width and height.
 * dm, dn  column/row offset applied to the destination start address.
 * scale   the coefficients are divided by 2^scale (and multiplied by DSCALE)
 *         when converted to FTYPE.
 * cmask   channel mask; channel c is processed when bit
 *         (nchannel - 1 - c) is set.
 *
 * Returns MLIB_FAILURE when an allocation fails; otherwise leaves through
 * FREE_AND_RETURN_STATUS (macro defined elsewhere; presumably releases the
 * heap buffers and returns status - TODO confirm).
 *
 * pbuff, adr_src, adr_dst, sl, dl, sp, dp, wid, hgt, sll, dll, nchannel,
 * chan1, i, j and c come from DEF_VARS()/GET_SRC_DST_PARAMETERS() (defined
 * elsewhere).
 */
mlib_status CONV_FUNC(MxN)(mlib_image       *dst,
                           const mlib_image *src,
                           const mlib_s32   *kernel,
                           mlib_s32         m,
                           mlib_s32         n,
                           mlib_s32         dm,
                           mlib_s32         dn,
                           mlib_s32         scale,
                           mlib_s32         cmask)
{
  FTYPE    buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
  FTYPE    **buffs = buffs_arr, *buffd;
  FTYPE    akernel[256], *k = akernel, fscale = DSCALE;
  mlib_s32 mn, l, off, kw, bsize, buff_ind;
  mlib_s32 d0, d1;
  FTYPE    k0, k1, k2, k3, k4, k5, k6;
  FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
  d64_2x32 dd;
  DEF_VARS(DTYPE);
  mlib_s32 chan2;
  mlib_s32 *buffo, *buffi;
  mlib_status status = MLIB_SUCCESS;

  GET_SRC_DST_PARAMETERS(DTYPE);

  /* build fscale = DSCALE / 2^scale; the shift is split so that */
  /* (1 << scale) is never evaluated with scale above 30 */
  if (scale > 30) {
    fscale *= 1.0/(1 << 30);
    scale -= 30;
  }

  fscale /= (1 << scale);

  mn = m*n;

  /* kernels with more than 256 taps do not fit in akernel[] */
  if (mn > 256) {
    k = mlib_malloc(mn*sizeof(mlib_d64));

    if (k == NULL) return MLIB_FAILURE;
  }

  /* convert the integer kernel to scaled floating point */
  for (i = 0; i < mn; i++) {
    k[i] = kernel[i]*fscale;
  }

  /* one-column kernels are handled by the dedicated 1xN routine */
  if (m == 1) {
    status = mlib_ImageConv1xN(dst, src, k, n, dn, cmask);
    FREE_AND_RETURN_STATUS;
  }

  /* scratch size in FTYPE elements: n + 1 row buffers, the accumulator */
  /* row buffd, and the area that buffo/buffi point into, wid each */
  bsize = (n + 3)*wid;

  if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
    /* heap block holds the bsize elements followed by the 2*(n + 1) */
    /* entries of the row-pointer table */
    pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));

    if (pbuff == NULL) {
      status = MLIB_FAILURE;
      FREE_AND_RETURN_STATUS;
    }

    buffs = (FTYPE **)(pbuff + bsize);
  }

  /* row-pointer table: n + 1 rows, listed twice so that any window of */
  /* n + 1 consecutive entries starting at buffs + buff_ind is valid */
  for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*wid;
  for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
  buffd = buffs[n] + wid;
  buffo = (mlib_s32*)(buffd + wid);
  buffi = buffo + (wid &~ 1);

  chan1 = nchannel;
  chan2 = chan1 + chan1;

  /* from here on wid/hgt are the dimensions of the produced area */
  wid -= (m - 1);
  hgt -= (n - 1);
  adr_dst += dn*dll + dm*nchannel;

  for (c = 0; c < nchannel; c++) {
    if (!(cmask & (1 << (chan1 - 1 - c)))) continue;

    sl = adr_src + c;
    dl = adr_dst + c;

    /* load the first n source rows of this channel into the row buffers; */
    /* afterwards sl points at the row that will be needed next */
    for (l = 0; l < n; l++) {
      FTYPE    *buff = buffs[l];

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = 0; i < wid + (m - 1); i++) {
        buff[i] = (FTYPE)sl[i*chan1];
      }

      sl += sll;
    }

    buff_ind = 0;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (i = 0; i < wid; i++) buffd[i] = 0.0;

    for (j = 0; j < hgt; j++) {
      FTYPE    **buffc = buffs + buff_ind;   /* the n rows under the kernel */
      FTYPE    *buffn = buffc[n];            /* receives the next source row */
      FTYPE    *pk = k;

      for (l = 0; l < n; l++) {
        FTYPE    *buff_l = buffc[l];

        /* each kernel row is consumed in chunks of kw taps */
        for (off = 0; off < m;) {
          FTYPE    *buff = buff_l + off;

          kw = m - off;

          if (kw > 2*MAX_KER) kw = MAX_KER; else
            if (kw > MAX_KER) kw = kw/2;
          off += kw;

          sp = sl;
          dp = dl;

          p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
          p5 = buff[3]; p6 = buff[4]; p7 = buff[5];

          k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
          k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
          pk += kw;

          /*
           * In every branch below the condition (l < (n - 1) || off < m)
           * is true for all chunks except the last chunk of the last
           * kernel row.  Those chunks only add into buffd[].  The last
           * chunk adds the accumulated sum, converts with D2I/FROM_S32,
           * stores two output pixels per iteration, clears buffd[] and
           * fills buffn[] with the next source row.
           */
          if (kw == 7) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;

                p6 = buff[i + 6]; p7 = buff[i + 7];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;

                p6 = buff[i + 6]; p7 = buff[i + 7];

                /* NOTE(review): unlike the other branches, the next-row */
                /* values go through LOAD_BUFF/buffi and the dd union; */
                /* LOAD_BUFF is defined elsewhere - confirm its effect */
                LOAD_BUFF(buffi);

                dd.d64 = *(FTYPE *)(buffi + i);
                buffn[i    ] = (FTYPE)dd.i32s.i0;
                buffn[i + 1] = (FTYPE)dd.i32s.i1;

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 6) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;

                p5 = buff[i + 5]; p6 = buff[i + 6];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;

                p5 = buff[i + 5]; p6 = buff[i + 6];

                buffn[i    ] = (FTYPE)sp[0];
                buffn[i + 1] = (FTYPE)sp[chan1];

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 5) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5;

                p4 = buff[i + 4]; p5 = buff[i + 5];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5;

                p4 = buff[i + 4]; p5 = buff[i + 5];

                buffn[i    ] = (FTYPE)sp[0];
                buffn[i + 1] = (FTYPE)sp[chan1];

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 4) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4;

                p3 = buff[i + 3]; p4 = buff[i + 4];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4;

                p3 = buff[i + 3]; p4 = buff[i + 4];

                buffn[i    ] = (FTYPE)sp[0];
                buffn[i + 1] = (FTYPE)sp[chan1];

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 3) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3;

                p2 = buff[i + 2]; p3 = buff[i + 3];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3;

                p2 = buff[i + 2]; p3 = buff[i + 3];

                buffn[i    ] = (FTYPE)sp[0];
                buffn[i + 1] = (FTYPE)sp[chan1];

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else /*if (kw == 2)*/ {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2;

                p1 = buff[i + 1]; p2 = buff[i + 2];

                buffd[i    ] += p0*k0 + p1*k1;
                buffd[i + 1] += p1*k0 + p2*k1;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2;

                p1 = buff[i + 1]; p2 = buff[i + 2];

                buffn[i    ] = (FTYPE)sp[0];
                buffn[i + 1] = (FTYPE)sp[chan1];

                d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }
          }
        }
      }

      /* last pixels */
      /* i, sp and dp continue from where the final chunk loop stopped; */
      /* each remaining pixel is computed directly from the full kernel */
      for (; i < wid; i++) {
        FTYPE    *pk = k, s = 0;
        mlib_s32 x, d0;

        for (l = 0; l < n; l++) {
          FTYPE    *buff = buffc[l] + i;

          for (x = 0; x < m; x++) s += buff[x] * (*pk++);
        }

        d0 = D2I(s);
        dp[0] = FROM_S32(d0);

        buffn[i] = (FTYPE)sp[0];

        sp += chan1;
        dp += chan1;
      }

      /* copy the trailing m - 1 source samples of the next row */
      for (l = 0; l < (m - 1); l++) buffn[wid + l] = sp[l*chan1];

      /* next line */
      sl += sll;
      dl += dll;

      /* advance the window start within the n + 1 row buffers */
      buff_ind++;

      if (buff_ind >= n + 1) buff_ind = 0;
    }
  }

  FREE_AND_RETURN_STATUS;
}
/*
 * Allocate an image header together with a pixel buffer of
 * (bytes per row) * height bytes.
 *
 * type      sample type; MLIB_DOUBLE, MLIB_FLOAT, MLIB_INT, MLIB_USHORT,
 *           MLIB_SHORT, MLIB_BYTE and MLIB_BIT are accepted.
 * channels  number of channels, 1..4.
 * width     image width in pixels, > 0.
 * height    image height in rows, > 0.
 *
 * Returns the new image, or NULL when an argument is out of range, the type
 * is not supported, the buffer size does not fit in mlib_s32, or an
 * allocation fails.  The stride is set to the unpadded row size in bytes.
 */
mlib_image *mlib_ImageCreate(mlib_type type,
                             mlib_s32  channels,
                             mlib_s32  width,
                             mlib_s32  height)
{
  /* largest value of a 32-bit signed integer; used for overflow guards */
  const mlib_s32 s32_max = 0x7fffffff;
  mlib_image *image;
  mlib_s32   wc;                 /* samples per row */
  mlib_s32   esize;              /* bytes per sample, 0 for bit images */
  mlib_s32   wb;                 /* width in bytes */
  void       *data;

  /* sanity check */
  if (width <= 0 || height <= 0 || channels < 1 || channels > 4) {
    return NULL;
  }

  switch (type) {
    case MLIB_DOUBLE:
      esize = 8;
      break;
    case MLIB_FLOAT:
    case MLIB_INT:
      esize = 4;
      break;
    case MLIB_USHORT:
    case MLIB_SHORT:
      esize = 2;
      break;
    case MLIB_BYTE:
      esize = 1;
      break;
    case MLIB_BIT:
      esize = 0;
      break;
    default:
      return NULL;
  }

  /* width * channels must not overflow */
  if (width > s32_max / channels) {
    return NULL;
  }

  wc = width * channels;

  if (esize == 0) {
    /* bits rounded up to whole bytes, written so that wc + 7 is never */
    /* formed (it could overflow) */
    wb = wc / 8 + ((wc & 7) != 0);
  } else {
    if (wc > s32_max / esize) {
      return NULL;
    }

    wb = wc * esize;
  }

  /* total buffer size must not overflow either; wb >= 1 here */
  if (height > s32_max / wb) {
    return NULL;
  }

  data = mlib_malloc(wb * height);

  if (data == NULL) {
    return NULL;
  }

  image = (mlib_image *)mlib_malloc(sizeof(mlib_image));

  if (image == NULL) {
    mlib_free(data);
    return NULL;
  }

  image -> type = type;
  image -> channels = channels;
  image -> width = width;
  image -> height = height;
  image -> stride = wb;
  image -> data = data;

  /* pack the low bits of the geometry and of the data address into flags */
  image -> flags = ((width & 0xf) << 8);          /* set width field */
  image -> flags |= ((height & 0xf) << 12);       /* set height field */
  image -> flags |= ((wb & 0xf) << 16);           /* set stride field */
  image -> flags |= (mlib_addr)data & 0xff;

  image -> format = MLIB_FORMAT_UNKNOWN;

  image -> paddings[0] = 0;
  image -> paddings[1] = 0;
  image -> paddings[2] = 0;
  image -> paddings[3] = 0;

  image -> bitoffset = 0;

  /* a bit image whose rows do not end on a byte boundary; the test is */
  /* equivalent to wb * 8 != width * channels without the multiplication */
  if ((type == MLIB_BIT) && ((wc & 7) != 0)) {
    image -> flags |= MLIB_IMAGE_ONEDVECTOR;      /* not 1-d vector */
  }

  image -> flags &= MLIB_IMAGE_ATTRIBUTESET;
  image -> state = NULL;

  return image;
}
/* *********************************************************** */

/*
 * 3x3 convolution with separate horizontal and vertical kernels over
 * mlib_u8 samples, using MMX (__m64) arithmetic.
 *
 * dst          destination image; only the interior is written (the
 *              destination pointer is advanced by one row plus one sample
 *              and the processed area is (width - 2) x (height - 2)).
 * src          source image.
 * hkernel      horizontal kernel; consumed by GET_KERN() (defined elsewhere).
 * vkernel      vertical kernel; consumed by GET_KERN().
 * scalef_expon scaling exponent; not referenced directly here, presumably
 *              consumed by GET_KERN() - TODO confirm.
 *
 * Returns MLIB_SUCCESS, or MLIB_FAILURE when the scratch buffer cannot be
 * allocated.
 *
 * width, height, sl, dl, sll and dll are introduced by
 * GET_SRC_DST_PARAMETERS(); UNPACK_SRC(), PREP_3x3_1ch() and CONV_3x3_1ch()
 * are macros defined elsewhere that operate on the locals declared below
 * by name.
 */
mlib_status
mlib_m_sconv3x3_8nw_1(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
	__m64 buff_loc[3 * BUFF_LINE], *pbuff = buff_loc;
	__m64 *buff0, *buff1, *buffT;

	GET_SRC_DST_PARAMETERS(mlib_u8);
	__m64 hker0, hker1, hker2, vker0, vker1, vker2;
	__m64 s0, d0, d1, sum0, sum1, sum2, aa, bb, res_hi, res_lo;
	__m64 zero = _m_zero;
	mlib_s32 shift;
	mlib_s32 *sp;
	mlib_s32 row, wid4, i, j;

	/* the kernel footprint removes one sample on every side */
	width -= 2;
	height -= 2;
	dl += dll + 1;

	wid4 = (width + 7) / 4;

	/* rows wider than the on-stack scratch area use the heap instead */
	if (wid4 > BUFF_LINE) {
		pbuff = mlib_malloc(sizeof (__m64) * 3 * wid4);

		/* the buffer is written unconditionally below */
		if (pbuff == NULL) {
			/* an __m64 value has already been initialised above, */
			/* so leave MMX state the same way the normal exit does */
			_mm_empty();
			return (MLIB_FAILURE);
		}
	}

	GET_KERN();

	buff0 = pbuff;
	buff1 = buff0 + wid4;

	/* prime the line buffers with the first two source rows, */
	/* reading the source four bytes at a time */
	for (j = 0; j < 2; j++) {
		sp = (mlib_s32 *)sl;

		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d1, lo);

		for (i = 0; i < wid4; i++) {
			*(mlib_s32 *)&s0 = sp[i];

			PREP_3x3_1ch(lo, i);
		}

		sl += sll;

		buffT = buff1;
		buff1 = buff0;
		buff0 = buffT;
	}

	for (row = 0; row < height; row++) {
		/* these deliberately shadow the outer sp: the main loop */
		/* reads the source eight bytes at a time */
		__m64 *sp = (__m64 *) sl;
		__m64 *dp = (__m64 *) dl;

		s0 = (*sp++);
		UNPACK_SRC(d1, lo);

		/* full groups of eight output samples */
		for (i = 0; i < width / 8; i++) {
			CONV_3x3_1ch(hi, 2 * i);
			s0 = sp[i];
			CONV_3x3_1ch(lo, 2 * i + 1);
			dp[i] = _mm_packs_pu16(res_hi, res_lo);
		}

		/* partial trailing group: merge under a mask so the */
		/* destination bytes outside the row are preserved */
		if (width & 7) {
			__m64 mask;

			mask = ((__m64 *) mlib_mask64_arr)[width & 7];
			CONV_3x3_1ch(hi, 2 * i);
			s0 = sp[i];
			CONV_3x3_1ch(lo, 2 * i + 1);
			res_hi = _mm_packs_pu16(res_hi, res_lo);
			dp[i] = _mm_or_si64(_mm_and_si64(mask, res_hi),
			    _mm_andnot_si64(mask, dp[i]));
		}

		buffT = buff1;
		buff1 = buff0;
		buff0 = buffT;

		sl += sll;
		dl += dll;
	}

	_mm_empty();

	if (pbuff != buff_loc)
		mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
mlib_status mlib_ImageMinFilter3x3_S16( void *dst, void *src, mlib_s32 dlb, mlib_s32 slb, mlib_s32 wid, mlib_s32 hgt) #endif /* MAX_FILTER */ { mlib_u8 *buff, *buff1; mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *dl; __m64 *dp0, *dp1; __m64 aa, bb, c0, c1, c2, cc, d0, d1, d2, dd, r0, r1; __m64 e_mask; mlib_s32 i, j, wid8, tail; wid = (wid - 2) * SSIZE; wid8 = (wid + 7) & ~7; buff = mlib_malloc(2 * wid8); buff1 = buff + wid8; sl = (mlib_u8 *)src; /* dst ptrs skip top j and left col */ dl = (mlib_u8 *)dst + dlb + SSIZE; tail = wid & 7; e_mask = ((__m64 *) mlib_mask64_arr)[tail]; sp0 = buff; sp1 = buff1; sp2 = sl; sp3 = sp2 + slb; sl += 2 * slb; for (i = 0; i < wid; i += 8) { c0 = *(__m64 *) sp2; c1 = *(__m64 *) (sp2 + SSIZE); c2 = *(__m64 *) (sp2 + 2 * SSIZE); d0 = *(__m64 *) sp3; d1 = *(__m64 *) (sp3 + SSIZE); d2 = *(__m64 *) (sp3 + 2 * SSIZE); cc = C_COMP(c0, c1); dd = C_COMP(d0, d1); cc = C_COMP(cc, c2); dd = C_COMP(dd, d2); *(__m64 *) sp0 = cc; *(__m64 *) sp1 = dd; sp0 += 8; sp1 += 8; sp2 += 8; sp3 += 8; } for (j = 0; j <= (hgt - 2 - 2); j += 2) { dp0 = (void *)dl; dp1 = (void *)(dl + dlb); sp0 = buff; sp1 = buff1; sp2 = sl; sp3 = sp2 + slb; /* * line0: aa * line1: bb * line2: c0 c1 c2 * line3: d0 d1 d2 */ for (i = 0; i <= wid - 8; i += 8) { aa = *(__m64 *) sp0; bb = *(__m64 *) sp1; c0 = *(__m64 *) sp2; c1 = *(__m64 *) (sp2 + SSIZE); c2 = *(__m64 *) (sp2 + 2 * SSIZE); d0 = *(__m64 *) sp3; d1 = *(__m64 *) (sp3 + SSIZE); d2 = *(__m64 *) (sp3 + 2 * SSIZE); cc = C_COMP(c0, c1); dd = C_COMP(d0, d1); cc = C_COMP(cc, c2); dd = C_COMP(dd, d2); bb = C_COMP(bb, cc); r0 = C_COMP(aa, bb); r1 = C_COMP(bb, dd); *(__m64 *) sp0 = cc; *(__m64 *) sp1 = dd; (*dp0++) = r0; (*dp1++) = r1; sp0 += 8; sp1 += 8; sp2 += 8; sp3 += 8; } if (tail) { aa = *(__m64 *) sp0; bb = *(__m64 *) sp1; c0 = *(__m64 *) sp2; c1 = *(__m64 *) (sp2 + SSIZE); c2 = *(__m64 *) (sp2 + 2 * SSIZE); d0 = *(__m64 *) sp3; d1 = *(__m64 *) (sp3 + SSIZE); d2 = *(__m64 *) (sp3 + 2 * SSIZE); cc = C_COMP(c0, c1); dd = C_COMP(d0, 
d1); cc = C_COMP(cc, c2); dd = C_COMP(dd, d2); bb = C_COMP(bb, cc); r0 = C_COMP(aa, bb); r1 = C_COMP(bb, dd); *(__m64 *) sp0 = cc; *(__m64 *) sp1 = dd; *dp0 = _mm_or_si64(_mm_and_si64(e_mask, r0), _mm_andnot_si64(e_mask, *dp0)); *dp1 = _mm_or_si64(_mm_and_si64(e_mask, r1), _mm_andnot_si64(e_mask, *dp1)); } sl += 2 * slb; dl += 2 * dlb; } /* last line */ if (j == (hgt - 3)) { dp0 = (void *)dl; dp1 = (void *)(dl + dlb); sp0 = buff; sp1 = buff1; sp2 = sl; for (i = 0; i <= wid - 8; i += 8) { aa = *(__m64 *) sp0; bb = *(__m64 *) sp1; c0 = *(__m64 *) sp2; c1 = *(__m64 *) (sp2 + SSIZE); c2 = *(__m64 *) (sp2 + 2 * SSIZE); cc = C_COMP(c0, c1); cc = C_COMP(cc, c2); r0 = C_COMP(aa, bb); r0 = C_COMP(r0, cc); (*dp0++) = r0; sp0 += 8; sp1 += 8; sp2 += 8; } if (tail) { aa = *(__m64 *) sp0; bb = *(__m64 *) sp1; c0 = *(__m64 *) sp2; c1 = *(__m64 *) (sp2 + SSIZE); c2 = *(__m64 *) (sp2 + 2 * SSIZE); c1 = C_COMP(c0, c1); cc = C_COMP(c1, c2); r0 = C_COMP(aa, bb); r0 = C_COMP(r0, cc); *dp0 = _mm_or_si64(_mm_and_si64(e_mask, r0), _mm_andnot_si64(e_mask, *dp0)); } } _mm_empty(); mlib_free(buff); return (MLIB_SUCCESS); }