Example #1
0
void mlib_ImageDelete(mlib_image *img)
{
  if (img == NULL) return;
  if ((img -> flags & MLIB_IMAGE_USERALLOCATED) == 0) {
    mlib_free(img -> data);
  }

  mlib_ImageDeleteRowTable(img);
  mlib_free(img);
}
Example #2
0
/*
 *  Go poke the rgb values of a region, on the fly.
 *  This does not update the inmemory database,
 *  so any changes will vanish on next re-prep unless other measures
 *  are taken.
 */
int
sh_directchange_shader(ClientData UNUSED(clientData), Tcl_Interp *interp, int argc, const char *argv[])
{
    long int rtip_val;
    struct rt_i	*rtip;
    struct region *regp;
    struct directory *dp;
    struct bu_vls shader = BU_VLS_INIT_ZERO;

    if ( argc < 4 )  {
	Tcl_AppendResult(interp, "Usage: sh_directchange_shader $rtip comb shader_arg(s)\n", NULL);
	return TCL_ERROR;
    }

    rtip_val = atol(argv[1]);
    rtip = (struct rt_i *)rtip_val;
    RT_CK_RTI(rtip);

    if ( rtip->needprep )  {
	Tcl_AppendResult(interp, "rt_prep() hasn't been called yet, error.\n", NULL);
	return TCL_ERROR;
    }

    if ( (dp = db_lookup( rtip->rti_dbip, argv[2], LOOKUP_NOISY)) == RT_DIR_NULL )  {
	Tcl_AppendResult(interp, argv[2], ": not found\n", NULL);
	return TCL_ERROR;
    }

    bu_vls_from_argv(&shader, argc-3, argv+3);
    bu_vls_trimspace(&shader);

    /* Find all region names which match /comb/ pattern */
    for ( BU_LIST_FOR( regp, region, &rtip->HeadRegion ) )  {
/*	if ( dp->d_flags & RT_DIR_REGION )  {	*/
	    /* name will occur at end of region string w/leading slash */
/*	} else {	*/
	    /* name will occur anywhere, bracketed by slashes */
/*	}	*/

	/* XXX quick hack */
	if ( strstr( regp->reg_name, argv[2] ) == NULL )  continue;

	/* Modify the region's shader string */
	bu_log("sh_directchange_shader() changing %s\n", regp->reg_name);
	if ( regp->reg_mater.ma_shader )
	    bu_free( (void *)regp->reg_mater.ma_shader, "reg_mater.ma_shader");
	regp->reg_mater.ma_shader = bu_vls_strdup(&shader);

	/* Update the shader */
	mlib_free(regp);
	if ( mlib_setup( &mfHead, regp, rtip ) != 1 )  {
	    Tcl_AppendResult(interp, regp->reg_name, ": mlib_setup() failure\n", NULL);
	}
    }

    bu_vls_free(&shader);
    return TCL_OK;
}
Example #3
0
/*
 *  Go poke the rgb values of a region, on the fly.
 *  This does not update the inmemory database,
 *  so any changes will vanish on next re-prep unless other measures
 *  are taken.
 */
int
sh_directchange_rgb(ClientData UNUSED(clientData), Tcl_Interp *interp, int argc, const char *argv[])
{
    long int rtip_val;
    struct rt_i	*rtip;
    struct region	*regp;
    struct directory *dp;
    float		r, g, b;

    if ( argc != 6 )  {
	Tcl_AppendResult(interp, "Usage: sh_directchange_rgb $rtip comb r g b\n", NULL);
	return TCL_ERROR;
    }

    r = atoi(argv[3+0]) / 255.0;
    g = atoi(argv[3+1]) / 255.0;
    b = atoi(argv[3+2]) / 255.0;

    rtip_val = atol(argv[1]);
    rtip = (struct rt_i *)rtip_val;
    RT_CK_RTI(rtip);

    if ( rtip->needprep )  {
	Tcl_AppendResult(interp, "rt_prep() hasn't been called yet, error.\n", NULL);
	return TCL_ERROR;
    }

    if ( (dp = db_lookup( rtip->rti_dbip, argv[2], LOOKUP_NOISY)) == RT_DIR_NULL )  {
	Tcl_AppendResult(interp, argv[2], ": not found\n", NULL);
	return TCL_ERROR;
    }

    /* Find all region names which match /comb/ pattern */
    for ( BU_LIST_FOR( regp, region, &rtip->HeadRegion ) )  {
/*	if ( dp->d_flags & RT_DIR_REGION )  {	*/
	    /* name will occur at end of region string w/leading slash */
/*	} else {	*/
	    /* name will occur anywhere, bracketed by slashes */
/*	}	*/

	/* XXX quick hack */
	if ( strstr( regp->reg_name, argv[2] ) == NULL )  continue;

	/* Modify the region's color */
	bu_log("sh_directchange_rgb() changing %s\n", regp->reg_name);
	VSET( regp->reg_mater.ma_color, r, g, b );

	/* Update the shader */
	mlib_free(regp);
	if ( mlib_setup( &mfHead, regp, rtip ) != 1 )  {
	    Tcl_AppendResult(interp, regp->reg_name, ": mlib_setup() failure\n", NULL);
	}
    }

    return TCL_OK;
}
Example #4
0
void mlib_ImageDeleteRowTable(mlib_image *img)
{
  void **state;

  if (img == NULL) return;

  state = img -> state;
  if (!state) return;

  mlib_free(state - 2);
  img -> state = 0;
}
Example #5
0
mlib_image *mlib_ImageCreateStruct(mlib_type  type,
                                   mlib_s32   channels,
                                   mlib_s32   width,
                                   mlib_s32   height,
                                   mlib_s32   stride,
                                   const void *data)
{
  mlib_image *image;
  if (stride <= 0) {
    return NULL;
  }

  image = (mlib_image *)mlib_malloc(sizeof(mlib_image));
  if (image == NULL) {
    return NULL;
  }

  if (mlib_ImageSet(image, type, channels, width, height, stride, data) == NULL) {
    mlib_free(image);
    image = NULL;
  }

  return image;
}
mlib_status
mlib_m_conv5x5_8nw_4(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
	__m64 *pbuff, *buff_arr[20], **pbuff_arr = buff_arr;
	__m64 *buff0, *buff1, *buff2, *buff3;
	GET_SRC_DST_PARAMETERS(mlib_u8);
	__m64 ker[5][5];
	__m64 s0, d0, d1, d2, d3, d4, prev0, prev1, prev2, prev3, aa, bb, cc;
	__m64 sum0, sum1, sum2, sum3, sum4, res_hi, res_lo;
	__m64 zero = _m_zero;
	mlib_s32 shift, ind;
	mlib_s32 *sp;
	mlib_s32 row, wid4, i, j;

	width -= (KSIZE - 1);
	height -= (KSIZE - 1);
	width *= NCHAN;
	dl += ((KSIZE - 1) / 2) * (dll + NCHAN);

	wid4 = (width + 7) / 4;
	pbuff = mlib_malloc(sizeof (__m64) * 10 * wid4);

	GET_KERN();

	for (i = 0; i < 10; i++) {
		buff_arr[i] = pbuff + i * wid4;
	}

	ind = 0;
	for (j = 1; j <= 4; j++) {
		buff0 = buff_arr[ind];
		buff1 = buff_arr[ind + 1];
		buff2 = buff_arr[ind + 2];
		buff3 = buff_arr[ind + 3];

		sp = (mlib_s32 *)sl;

		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d1, lo);
		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d2, lo);
		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d3, lo);
		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d4, lo);

		for (i = 0; i < wid4; i++) {
			*(mlib_s32 *)&s0 = sp[i];

			PREP_5x5();
		}

		sl += sll;
		ind += j;
	}

	for (row = 0; row < height; row++) {
		__m64 *sp = (__m64 *) sl;
		__m64 *dp = (__m64 *) dl;

		buff0 = pbuff_arr[0];
		buff1 = pbuff_arr[2];
		buff2 = pbuff_arr[5];
		buff3 = pbuff_arr[9];

		s0 = (*sp++);
		UNPACK_SRC(d1, lo);
		UNPACK_SRC(d2, hi);
		s0 = (*sp++);
		UNPACK_SRC(d3, lo);
		UNPACK_SRC(d4, hi);

		for (i = 0; i < width / 8; i++) {
			s0 = sp[i];
			CONV_5x5(lo, 2 * i);
			CONV_5x5(hi, 2 * i + 1);

			dp[i] = _mm_packs_pu16(res_lo, res_hi);
		}

		if (width & 7) {
			__m64 mask;

			mask = ((__m64 *) mlib_mask64_arr)[width & 7];

			s0 = sp[i];
			CONV_5x5(lo, 2 * i);
			CONV_5x5(hi, 2 * i + 1);
			res_hi = _mm_packs_pu16(res_lo, res_hi);

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, res_hi),
			    _mm_andnot_si64(mask, dp[i]));
		}

		ind = (pbuff_arr == buff_arr) ? 10 : -10;
		pbuff_arr[ind + 0] = pbuff_arr[1];
		pbuff_arr[ind + 1] = pbuff_arr[3];
		pbuff_arr[ind + 2] = pbuff_arr[4];
		pbuff_arr[ind + 3] = pbuff_arr[6];
		pbuff_arr[ind + 4] = pbuff_arr[7];
		pbuff_arr[ind + 5] = pbuff_arr[8];
		pbuff_arr[ind + 6] = pbuff_arr[0];
		pbuff_arr[ind + 7] = pbuff_arr[2];
		pbuff_arr[ind + 8] = pbuff_arr[5];
		pbuff_arr[ind + 9] = pbuff_arr[9];
		pbuff_arr += ind;

		sl += sll;
		dl += dll;
	}

	_mm_empty();
	mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
Example #7
0
mlib_image *mlib_ImageCreate(mlib_type type,
                             mlib_s32  channels,
                             mlib_s32  width,
                             mlib_s32  height)
{
  mlib_image *image;
  mlib_s32        wb;                /* width in bytes */
  void       *data;

/* sanity check */
  if (width <= 0 || height <= 0 || channels < 1 || channels > 4) {
    return NULL;
  };

  switch (type) {
    case MLIB_DOUBLE:
      wb = width * channels * 8;
      break;
    case MLIB_FLOAT:
    case MLIB_INT:
      wb = width * channels * 4;
      break;
    case MLIB_USHORT:
    case MLIB_SHORT:
      wb = width * channels * 2;
      break;
    case MLIB_BYTE:
      wb = width * channels;
      break;
    case MLIB_BIT:
      wb = (width * channels + 7) / 8;
      break;
    default:
      return NULL;
  }

  data = mlib_malloc(wb * height);
  if (data == NULL) {
    return NULL;
  }

  image = (mlib_image *)mlib_malloc(sizeof(mlib_image));
  if (image == NULL) {
    mlib_free(data);
    return NULL;
  };

  image -> type     = type;
  image -> channels = channels;
  image -> width    = width;
  image -> height   = height;
  image -> stride   = wb;
  image -> data     = data;
  image -> flags    = ((width & 0xf) << 8);        /* set width field */
  image -> flags   |= ((height & 0xf) << 12);      /* set height field */
  image -> flags   |= ((wb & 0xf) << 16);          /* set stride field */
  image -> flags   |= (mlib_addr)data & 0xff;
  image -> format   = MLIB_FORMAT_UNKNOWN;

  image -> paddings[0] = 0;
  image -> paddings[1] = 0;
  image -> paddings[2] = 0;
  image -> paddings[3] = 0;

  image -> bitoffset = 0;

  if ((type == MLIB_BIT) && (wb * 8 != width * channels)) {
    image -> flags |= MLIB_IMAGE_ONEDVECTOR;       /* not 1-d vector */
  }

  image -> flags &= MLIB_IMAGE_ATTRIBUTESET;
  image -> state  = NULL;

  return image;
}
Example #8
0
mlib_status CONV_FUNC(MxN)(mlib_image       *dst,
                           const mlib_image *src,
                           const mlib_d64   *ker,
                           mlib_s32         m,
                           mlib_s32         n,
                           mlib_s32         dm,
                           mlib_s32         dn,
                           mlib_s32         cmask)
{
  DTYPE k0, k1, k2, k3, k4, k5, k6, *sp;
  DTYPE p0, p1, p2, p3, p4, p5, p6, p7;
  mlib_s32 l, off, kw;
  DEF_VARS(DTYPE);
  mlib_s32 chan2 = chan1 + chan1;
  mlib_s32 chan3 = chan1 + chan2;

#ifdef TYPE_DOUBLE
  const mlib_d64 *k = ker;
#else
  mlib_f32 k_arr[MAX_NM], *k = k_arr;

  if (n*m > MAX_NM) {
    k = mlib_malloc(n*m*sizeof(mlib_f32));

    if (k == NULL) return MLIB_FAILURE;
  }

  for (i = 0; i < n*m; i++) k[i] = (mlib_f32)ker[i];
#endif /* TYPE_DOUBLE */

  if (m == 1) return mlib_ImageConv1xN(dst, src, k, n, dn, cmask);

  wid -= (m - 1);
  hgt -= (n - 1);
  adr_dst += dn*dll + dm*chan1;

  for (c = 0; c < chan1; c++) {
    if (!(cmask & (1 << (chan1 - 1 - c)))) continue;

    sl = adr_src + c;
    dl = adr_dst + c;

    for (j = 0; j < hgt; j++) {
      const DTYPE *pk = k;

      for (l = 0; l < n; l++) {
        DTYPE *sp0 = sl + l*sll;

        for (off = 0; off < m; off += kw, pk += kw, sp0 += chan1) {
          kw = m - off;

          if (kw > 2*MAX_KER) kw = MAX_KER; else
            if (kw > MAX_KER) kw = kw/2;

          p2 = sp0[0]; p3 = sp0[chan1]; p4 = sp0[chan2];
          sp0 += chan3;
          p5 = sp0[0]; p6 = sp0[chan1]; p7 = sp0[chan2];

          k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
          k4 = pk[4]; k5 = pk[5]; k6 = pk[6];

          dp = dl;

          if (kw == 7) {
            sp = sp0 += chan3;

            if (pk == k) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;

                p5 = sp[- chan1]; p6 = sp[0]; p7 = sp[chan1];

                dp[0    ] = p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
                dp[chan1] = p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;

                sp += chan2;
                dp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;

                p5 = sp[- chan1]; p6 = sp[0]; p7 = sp[chan1];

                dp[0    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
                dp[chan1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 6) {
            sp = sp0 += chan2;

            if (pk == k) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;

                p5 = sp[0]; p6 = sp[chan1];

                dp[0    ] = p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
                dp[chan1] = p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;

                sp += chan2;
                dp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;

                p5 = sp[0]; p6 = sp[chan1];

                dp[0    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
                dp[chan1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 5) {
            sp = sp0 += chan1;

            if (pk == k) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5;

                p4 = sp[0]; p5 = sp[chan1];

                dp[0    ] = p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
                dp[chan1] = p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;

                sp += chan2;
                dp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5;

                p4 = sp[0]; p5 = sp[chan1];

                dp[0    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
                dp[chan1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 4) {

            sp = sp0;

            if (pk == k) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4;

                p3 = sp[0]; p4 = sp[chan1];

                dp[0    ] = p0*k0 + p1*k1 + p2*k2 + p3*k3;
                dp[chan1] = p1*k0 + p2*k1 + p3*k2 + p4*k3;

                sp += chan2;
                dp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4;

                p3 = sp[0]; p4 = sp[chan1];

                dp[0    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
                dp[chan1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 3) {
            sp = sp0 -= chan1;

            if (pk == k) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3;

                p2 = sp[0]; p3 = sp[chan1];

                dp[0    ] = p0*k0 + p1*k1 + p2*k2;
                dp[chan1] = p1*k0 + p2*k1 + p3*k2;

                sp += chan2;
                dp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3;

                p2 = sp[0]; p3 = sp[chan1];

                dp[0    ] += p0*k0 + p1*k1 + p2*k2;
                dp[chan1] += p1*k0 + p2*k1 + p3*k2;

                sp += chan2;
                dp += chan2;
              }
            }

          } else { /* kw == 2 */
            sp = sp0 -= chan2;

            if (pk == k) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2;

                p1 = sp[0]; p2 = sp[chan1];

                dp[0    ] = p0*k0 + p1*k1;
                dp[chan1] = p1*k0 + p2*k1;

                sp += chan2;
                dp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2;

                p1 = sp[0]; p2 = sp[chan1];

                dp[0    ] += p0*k0 + p1*k1;
                dp[chan1] += p1*k0 + p2*k1;

                sp += chan2;
                dp += chan2;
              }
            }
          }
        }
      }

      /* last pixels */

      if (wid & 1) {
        DTYPE *sp0 = sl + i*chan1, s = 0;
        const DTYPE *pk = k;
        mlib_s32 x;

        for (l = 0; l < n; l++) {
          DTYPE *sp = sp0 + l*sll;

          for (x = 0; x < m; x++) s += sp[x*chan1] * (*pk++);
        }

        dp[0] = s;
      }

      /* next line */
      sl += sll;
      dl += dll;
    }
  }

#ifndef TYPE_DOUBLE

  if (k != k_arr) mlib_free(k);
#endif /* TYPE_DOUBLE */

  return MLIB_SUCCESS;
}
Example #9
0
mlib_status CONV_FUNC_I(MxN)(mlib_image       *dst,
                             const mlib_image *src,
                             const mlib_s32   *kernel,
                             mlib_s32         m,
                             mlib_s32         n,
                             mlib_s32         dm,
                             mlib_s32         dn,
                             mlib_s32         scale,
                             mlib_s32         cmask)
{
  mlib_s32 buff[BUFF_SIZE], *buffd = buff;
  mlib_s32 l, off, kw;
  mlib_s32 d0, d1, shift1, shift2;
  mlib_s32 k0, k1, k2, k3, k4, k5, k6;
  mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;
  DTYPE    *adr_src, *sl, *sp = NULL;
  DTYPE    *adr_dst, *dl, *dp = NULL;
  mlib_s32 wid, hgt, sll, dll;
  mlib_s32 nchannel, chan1;
  mlib_s32 i, j, c;
  mlib_s32 chan2;
  mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl;
  GET_SRC_DST_PARAMETERS(DTYPE);

#if IMG_TYPE != 1
  shift1 = 16;
#else
  shift1 = 8;
#endif /* IMG_TYPE != 1 */
  shift2 = scale - shift1;

  chan1 = nchannel;
  chan2 = chan1 + chan1;

  wid -= (m - 1);
  hgt -= (n - 1);
  adr_dst += dn*dll + dm*nchannel;

  if (wid > BUFF_SIZE) {
    buffd = mlib_malloc(sizeof(mlib_s32)*wid);

    if (buffd == NULL) return MLIB_FAILURE;
  }

  if (m*n > MAX_N*MAX_N) {
    k = mlib_malloc(sizeof(mlib_s32)*(m*n));

    if (k == NULL) {
      if (buffd != buff) mlib_free(buffd);
      return MLIB_FAILURE;
    }
  }

  for (i = 0; i < m*n; i++) {
    k[i] = kernel[i] >> shift1;
  }

  for (c = 0; c < nchannel; c++) {
    if (!(cmask & (1 << (nchannel - 1 - c)))) continue;

    sl = adr_src + c;
    dl = adr_dst + c;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (i = 0; i < wid; i++) buffd[i] = 0;

    for (j = 0; j < hgt; j++) {
      mlib_s32 *pk = k;

      for (l = 0; l < n; l++) {
        DTYPE *sp0 = sl + l*sll;

        for (off = 0; off < m;) {
          sp = sp0 + off*chan1;
          dp = dl;

          kw = m - off;

          if (kw > 2*MAX_KER) kw = MAX_KER; else
            if (kw > MAX_KER) kw = kw/2;
          off += kw;

          p2 = sp[0]; p3 = sp[chan1]; p4 = sp[chan2];
          p5 = sp[chan2 + chan1]; p6 = sp[chan2 + chan2]; p7 = sp[5*chan1];

          k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
          k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
          pk += kw;

          sp += (kw - 1)*chan1;

          if (kw == 7) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
                p6 = sp[0];
                p7 = sp[chan1];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
                p6 = sp[0];
                p7 = sp[chan1];

                d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
                d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 6) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
                p5 = sp[0];
                p6 = sp[chan1];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
                p5 = sp[0];
                p6 = sp[chan1];

                d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
                d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 5) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5;
                p4 = sp[0];
                p5 = sp[chan1];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5;
                p4 = sp[0];
                p5 = sp[chan1];

                d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
                d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 4) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4;
                p3 = sp[0];
                p4 = sp[chan1];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4;
                p3 = sp[0];
                p4 = sp[chan1];

                d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
                d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 3) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3;
                p2 = sp[0];
                p3 = sp[chan1];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3;
                p2 = sp[0];
                p3 = sp[chan1];

                d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
                d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 2) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2;
                p1 = sp[0];
                p2 = sp[chan1];

                buffd[i    ] += p0*k0 + p1*k1;
                buffd[i + 1] += p1*k0 + p2*k1;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2;
                p1 = sp[0];
                p2 = sp[chan1];

                d0 = (p0*k0 + p1*k1 + buffd[i    ]);
                d1 = (p1*k0 + p2*k1 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else /*if (kw == 1)*/ {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = sp[0];
                p1 = sp[chan1];

                buffd[i    ] += p0*k0;
                buffd[i + 1] += p1*k0;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = sp[0];
                p1 = sp[chan1];

                d0 = (p0*k0 + buffd[i    ]);
                d1 = (p1*k0 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }
          }
        }
      }

      /* last pixels */
      for (; i < wid; i++) {
        mlib_s32 *pk = k, s = 0;
        mlib_s32 x;

        for (l = 0; l < n; l++) {
          sp = sl + l*sll + i*chan1;

          for (x = 0; x < m; x++) {
            s += sp[0] * pk[0];
            sp += chan1;
            pk ++;
          }
        }

        STORE_RES(dp[0], s);

        sp += chan1;
        dp += chan1;
      }

      sl += sll;
      dl += dll;
    }
  }

  if (buffd != buff) mlib_free(buffd);
  if (k != k_locl) mlib_free(k);

  return MLIB_SUCCESS;
}
mlib_status
mlib_ImageMinFilter7x7_S16(
    void *dst,
    void *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
#endif		   /* MAX_FILTER */
{
	mlib_u8 *pbuff, *buff0, *buff1, *buff2, *buff3, *buff4, *buff5, *buffT;
	mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *sp6, *sp7, *dl;
	__m64 *dp0, *dp1;
	__m64 aa, bb, cc, dd, ee, ff, r0, r1;
	__m64 g0, g1, g2, g3, g4, g5, g6, gg;
	__m64 h0, h1, h2, h3, h4, h5, h6, hh;
	__m64 e_mask;
	mlib_s32 i, j, wid8, tail;

	wid = (wid - KSIZE1) * SSIZE;
	wid8 = (wid + 7) & ~7;
	pbuff = mlib_malloc(KSIZE1 * wid8);
	buff0 = pbuff;
	buff1 = buff0 + wid8;
	buff2 = buff1 + wid8;
	buff3 = buff2 + wid8;
	buff4 = buff3 + wid8;
	buff5 = buff4 + wid8;

	sl = (mlib_u8 *)src;
	dl = (mlib_u8 *)dst + (KSIZE1 / 2) * (dlb + SSIZE);

	tail = wid & 7;
	e_mask = ((__m64 *) mlib_mask64_arr)[tail];

	for (j = 0; j < 3; j++) {
		sp0 = buff4;
		sp1 = buff5;
		sp6 = sl;
		sp7 = sl + slb;
		sl += 2 * slb;

		for (i = 0; i < wid; i += 8) {
			g0 = *(__m64 *) sp6;
			g1 = *(__m64 *) (sp6 + SSIZE);
			g2 = *(__m64 *) (sp6 + 2 * SSIZE);
			g3 = *(__m64 *) (sp6 + 3 * SSIZE);
			g4 = *(__m64 *) (sp6 + 4 * SSIZE);
			g5 = *(__m64 *) (sp6 + 5 * SSIZE);
			g6 = *(__m64 *) (sp6 + 6 * SSIZE);
			h0 = *(__m64 *) sp7;
			h1 = *(__m64 *) (sp7 + SSIZE);
			h2 = *(__m64 *) (sp7 + 2 * SSIZE);
			h3 = *(__m64 *) (sp7 + 3 * SSIZE);
			h4 = *(__m64 *) (sp7 + 4 * SSIZE);
			h5 = *(__m64 *) (sp7 + 5 * SSIZE);
			h6 = *(__m64 *) (sp7 + 6 * SSIZE);

			gg = C_COMP(g0, g1);
			hh = C_COMP(h0, h1);
			g2 = C_COMP(g2, g3);
			h2 = C_COMP(h2, h3);
			g4 = C_COMP(g4, g5);
			h4 = C_COMP(h4, h5);
			gg = C_COMP(gg, g2);
			hh = C_COMP(hh, h2);
			gg = C_COMP(gg, g4);
			hh = C_COMP(hh, h4);
			gg = C_COMP(gg, g6);
			hh = C_COMP(hh, h6);

			*(__m64 *) sp0 = gg;
			*(__m64 *) sp1 = hh;

			sp0 += 8;
			sp1 += 8;
			sp6 += 8;
			sp7 += 8;
		}

		if (j < 2) {
			buffT = buff0;
			buff0 = buff2;
			buff2 = buff4;
			buff4 = buffT;
			buffT = buff1;
			buff1 = buff3;
			buff3 = buff5;
			buff5 = buffT;
		}
	}

	for (j = 0; j <= (hgt - KSIZE1 - 2); j += 2) {
		dp0 = (void *)dl;
		dp1 = (void *)(dl + dlb);
		sp0 = buff0;
		sp1 = buff1;
		sp2 = buff2;
		sp3 = buff3;
		sp4 = buff4;
		sp5 = buff5;
		sp6 = sl;
		sp7 = sl + slb;

/*
 *    line0:           aa
 *    line1:           bb
 *    line2:           cc
 *    line3:           dd
 *    line4:           ee
 *    line5:           ff
 *    line4:  g0 g1 g2 g3 g4 g5 g6
 *    line5:  h0 h1 h2 h3 h4 h5 h6
 */

		for (i = 0; i <= wid - 8; i += 8) {
			g0 = *(__m64 *) sp6;
			g1 = *(__m64 *) (sp6 + SSIZE);
			g2 = *(__m64 *) (sp6 + 2 * SSIZE);
			g3 = *(__m64 *) (sp6 + 3 * SSIZE);
			g4 = *(__m64 *) (sp6 + 4 * SSIZE);
			g5 = *(__m64 *) (sp6 + 5 * SSIZE);
			g6 = *(__m64 *) (sp6 + 6 * SSIZE);
			h0 = *(__m64 *) sp7;
			h1 = *(__m64 *) (sp7 + SSIZE);
			h2 = *(__m64 *) (sp7 + 2 * SSIZE);
			h3 = *(__m64 *) (sp7 + 3 * SSIZE);
			h4 = *(__m64 *) (sp7 + 4 * SSIZE);
			h5 = *(__m64 *) (sp7 + 5 * SSIZE);
			h6 = *(__m64 *) (sp7 + 6 * SSIZE);

			gg = C_COMP(g0, g1);
			hh = C_COMP(h0, h1);
			g2 = C_COMP(g2, g3);
			h2 = C_COMP(h2, h3);
			g4 = C_COMP(g4, g5);
			h4 = C_COMP(h4, h5);
			gg = C_COMP(gg, g2);
			hh = C_COMP(hh, h2);
			gg = C_COMP(gg, g4);
			hh = C_COMP(hh, h4);
			gg = C_COMP(gg, g6);
			hh = C_COMP(hh, h6);

			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			cc = *(__m64 *) sp2;
			dd = *(__m64 *) sp3;
			ee = *(__m64 *) sp4;
			ff = *(__m64 *) sp5;

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			ff = C_COMP(ff, gg);
			bb = C_COMP(bb, dd);
			bb = C_COMP(bb, ff);

			r0 = C_COMP(aa, bb);
			r1 = C_COMP(bb, hh);

			*(__m64 *) sp0 = gg;
			*(__m64 *) sp1 = hh;
			(*dp0++) = r0;
			(*dp1++) = r1;

			sp0 += 8;
			sp1 += 8;
			sp2 += 8;
			sp3 += 8;
			sp4 += 8;
			sp5 += 8;
			sp6 += 8;
			sp7 += 8;
		}

		if (tail) {
			g0 = *(__m64 *) sp6;
			g1 = *(__m64 *) (sp6 + SSIZE);
			g2 = *(__m64 *) (sp6 + 2 * SSIZE);
			g3 = *(__m64 *) (sp6 + 3 * SSIZE);
			g4 = *(__m64 *) (sp6 + 4 * SSIZE);
			g5 = *(__m64 *) (sp6 + 5 * SSIZE);
			g6 = *(__m64 *) (sp6 + 6 * SSIZE);
			h0 = *(__m64 *) sp7;
			h1 = *(__m64 *) (sp7 + SSIZE);
			h2 = *(__m64 *) (sp7 + 2 * SSIZE);
			h3 = *(__m64 *) (sp7 + 3 * SSIZE);
			h4 = *(__m64 *) (sp7 + 4 * SSIZE);
			h5 = *(__m64 *) (sp7 + 5 * SSIZE);
			h6 = *(__m64 *) (sp7 + 6 * SSIZE);

			gg = C_COMP(g0, g1);
			hh = C_COMP(h0, h1);
			g2 = C_COMP(g2, g3);
			h2 = C_COMP(h2, h3);
			g4 = C_COMP(g4, g5);
			h4 = C_COMP(h4, h5);
			gg = C_COMP(gg, g2);
			hh = C_COMP(hh, h2);
			gg = C_COMP(gg, g4);
			hh = C_COMP(hh, h4);
			gg = C_COMP(gg, g6);
			hh = C_COMP(hh, h6);

			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			cc = *(__m64 *) sp2;
			dd = *(__m64 *) sp3;
			ee = *(__m64 *) sp4;
			ff = *(__m64 *) sp5;

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			ff = C_COMP(ff, gg);
			bb = C_COMP(bb, dd);
			bb = C_COMP(bb, ff);

			r0 = C_COMP(aa, bb);
			r1 = C_COMP(bb, hh);

			*(__m64 *) sp0 = gg;
			*(__m64 *) sp1 = hh;

			*dp0 =
			    _mm_or_si64(_mm_and_si64(e_mask, r0),
			    _mm_andnot_si64(e_mask, *dp0));
			*dp1 =
			    _mm_or_si64(_mm_and_si64(e_mask, r1),
			    _mm_andnot_si64(e_mask, *dp1));
		}

		buffT = buff0;
		buff0 = buff2;
		buff2 = buff4;
		buff4 = buffT;
		buffT = buff1;
		buff1 = buff3;
		buff3 = buff5;
		buff5 = buffT;

		sl += 2 * slb;
		dl += 2 * dlb;
	}

/* last line */

	if (j == (hgt - KSIZE1 - 1)) {
		dp0 = (void *)dl;
		dp1 = (void *)(dl + dlb);
		sp0 = buff0;
		sp1 = buff1;
		sp2 = buff2;
		sp3 = buff3;
		sp4 = buff4;
		sp5 = buff5;
		sp6 = sl;

		for (i = 0; i <= wid - 8; i += 8) {
			g0 = *(__m64 *) sp6;
			g1 = *(__m64 *) (sp6 + SSIZE);
			g2 = *(__m64 *) (sp6 + 2 * SSIZE);
			g3 = *(__m64 *) (sp6 + 3 * SSIZE);
			g4 = *(__m64 *) (sp6 + 4 * SSIZE);
			g5 = *(__m64 *) (sp6 + 5 * SSIZE);
			g6 = *(__m64 *) (sp6 + 6 * SSIZE);

			gg = C_COMP(g0, g1);
			g2 = C_COMP(g2, g3);
			g4 = C_COMP(g4, g5);
			gg = C_COMP(gg, g2);
			gg = C_COMP(gg, g4);
			gg = C_COMP(gg, g6);

			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			cc = *(__m64 *) sp2;
			dd = *(__m64 *) sp3;
			ee = *(__m64 *) sp4;
			ff = *(__m64 *) sp5;

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			ff = C_COMP(ff, gg);
			bb = C_COMP(bb, dd);
			bb = C_COMP(bb, ff);

			r0 = C_COMP(aa, bb);
			(*dp0++) = r0;

			sp0 += 8;
			sp1 += 8;
			sp2 += 8;
			sp3 += 8;
			sp4 += 8;
			sp5 += 8;
			sp6 += 8;
		}

		if (tail) {
			g0 = *(__m64 *) sp6;
			g1 = *(__m64 *) (sp6 + SSIZE);
			g2 = *(__m64 *) (sp6 + 2 * SSIZE);
			g3 = *(__m64 *) (sp6 + 3 * SSIZE);
			g4 = *(__m64 *) (sp6 + 4 * SSIZE);
			g5 = *(__m64 *) (sp6 + 5 * SSIZE);
			g6 = *(__m64 *) (sp6 + 6 * SSIZE);

			gg = C_COMP(g0, g1);
			g2 = C_COMP(g2, g3);
			g4 = C_COMP(g4, g5);
			gg = C_COMP(gg, g2);
			gg = C_COMP(gg, g4);
			gg = C_COMP(gg, g6);

			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			cc = *(__m64 *) sp2;
			dd = *(__m64 *) sp3;
			ee = *(__m64 *) sp4;
			ff = *(__m64 *) sp5;

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			ff = C_COMP(ff, gg);
			bb = C_COMP(bb, dd);
			bb = C_COMP(bb, ff);

			r0 = C_COMP(aa, bb);

			*dp0 =
			    _mm_or_si64(_mm_and_si64(e_mask, r0),
			    _mm_andnot_si64(e_mask, *dp0));
		}
	}

	_mm_empty();

	mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
Example #11
0
mlib_status mlib_ImageAffine_alltypes(mlib_image       *dst,
                                      const mlib_image *src,
                                      const mlib_d64   *mtx,
                                      mlib_filter      filter,
                                      mlib_edge        edge,
                                      const void       *colormap)
{
  mlib_affine_param param[1];
  mlib_status res;
  mlib_type type;
  mlib_s32 nchan, t_ind, kw, kw1;
  mlib_addr align;
  mlib_d64 buff_lcl[BUFF_SIZE / 8];
  mlib_u8 **lineAddr = NULL;

  /* check for obvious errors */
  MLIB_IMAGE_TYPE_EQUAL(src, dst);
  MLIB_IMAGE_CHAN_EQUAL(src, dst);

  type = mlib_ImageGetType(dst);
  nchan = mlib_ImageGetChannels(dst);

  switch (filter) {
    case MLIB_NEAREST:
      kw = 1;
      kw1 = 0;
      break;

    case MLIB_BILINEAR:
      kw = 2;
      kw1 = 0;
      break;

    case MLIB_BICUBIC:
    case MLIB_BICUBIC2:
      kw = 4;
      kw1 = 1;
      break;

    default:
      return MLIB_FAILURE;
  }

  STORE_PARAM(param, lineAddr);
  STORE_PARAM(param, filter);

  res = mlib_AffineEdges(param, dst, src, buff_lcl, BUFF_SIZE,
                         kw, kw, kw1, kw1, edge, mtx, MLIB_SHIFT, MLIB_SHIFT);

  if (res != MLIB_SUCCESS)
    return res;

  lineAddr = param->lineAddr;

  if (type == MLIB_BYTE)
    t_ind = 0;
  else if (type == MLIB_SHORT)
    t_ind = 1;
  else if (type == MLIB_INT)
    t_ind = 2;
  else if (type == MLIB_USHORT)
    t_ind = 3;
  else if (type == MLIB_FLOAT)
    t_ind = 4;
  else if (type == MLIB_DOUBLE)
    t_ind = 5;

  if (colormap != NULL && filter != MLIB_NEAREST) {
    if (t_ind != 0 && t_ind != 1)
      return MLIB_FAILURE;

    if (mlib_ImageGetLutType(colormap) == MLIB_SHORT)
      t_ind += 2;
    t_ind = 2 * t_ind;

    if (mlib_ImageGetLutChannels(colormap) == 4)
      t_ind++;
  }

  if (type == MLIB_BIT) {
    mlib_s32 s_bitoff = mlib_ImageGetBitOffset(src);
    mlib_s32 d_bitoff = mlib_ImageGetBitOffset(dst);

    if (nchan != 1 || filter != MLIB_NEAREST)
      return MLIB_FAILURE;
    mlib_ImageAffine_bit_1ch_nn(param, s_bitoff, d_bitoff);
  }
  else {
    switch (filter) {
      case MLIB_NEAREST:

        if (t_ind >= 3)
          t_ind -= 2;                                      /* correct types USHORT, FLOAT, DOUBLE; new values: 1, 2, 3 */

        /* two channels as one channel of next type */
        align = (mlib_addr) (param->dstData) | (mlib_addr) lineAddr[0];
        align |= param->dstYStride | param->srcYStride;
        while (((nchan | (align >> t_ind)) & 1) == 0 && t_ind < MAX_T_IND) {
          nchan >>= 1;
          t_ind++;
        }

        res = mlib_AffineFunArr_nn[4 * t_ind + (nchan - 1)] (param);
        break;

      case MLIB_BILINEAR:

        if (colormap != NULL) {
          res = mlib_AffineFunArr_bl_i[t_ind] (param, colormap);
        }
        else {
          res = mlib_AffineFunArr_bl[4 * t_ind + (nchan - 1)] (param);
        }

        break;

      case MLIB_BICUBIC:
      case MLIB_BICUBIC2:

        if (colormap != NULL) {
          res = mlib_AffineFunArr_bc_i[t_ind] (param, colormap);
        }
        else {
          res = mlib_AffineFunArr_bc[4 * t_ind + (nchan - 1)] (param);
        }

        break;
    }

    if (res != MLIB_SUCCESS) {
      if (param->buff_malloc != NULL)
        mlib_free(param->buff_malloc);
      return res;
    }
  }

  if (edge == MLIB_EDGE_SRC_PADDED)
    edge = MLIB_EDGE_DST_NO_WRITE;

  if (filter != MLIB_NEAREST && edge != MLIB_EDGE_DST_NO_WRITE) {
    mlib_affine_param param_e[1];
    mlib_d64 buff_lcl1[BUFF_SIZE / 8];

    STORE_PARAM(param_e, lineAddr);
    STORE_PARAM(param_e, filter);

    res = mlib_AffineEdges(param_e, dst, src, buff_lcl1, BUFF_SIZE,
                           kw, kw, kw1, kw1, -1, mtx, MLIB_SHIFT, MLIB_SHIFT);

    if (res != MLIB_SUCCESS) {
      if (param->buff_malloc != NULL)
        mlib_free(param->buff_malloc);
      return res;
    }

    switch (edge) {
      case MLIB_EDGE_DST_FILL_ZERO:
        mlib_ImageAffineEdgeZero(param, param_e, colormap);
        break;

      case MLIB_EDGE_OP_NEAREST:
        mlib_ImageAffineEdgeNearest(param, param_e);
        break;

      case MLIB_EDGE_SRC_EXTEND:

        if (filter == MLIB_BILINEAR) {
          res = mlib_ImageAffineEdgeExtend_BL(param, param_e, colormap);
        }
        else {
          res = mlib_ImageAffineEdgeExtend_BC(param, param_e, colormap);
        }

        break;
    }

    if (param_e->buff_malloc != NULL)
      mlib_free(param_e->buff_malloc);
  }

  if (param->buff_malloc != NULL)
    mlib_free(param->buff_malloc);

  return res;
}
Example #12
0
mlib_status mlib_convMxN_8nw_mask(mlib_image       *dst,
                                  const mlib_image *src,
                                  mlib_s32         m,
                                  mlib_s32         n,
                                  mlib_s32         dm,
                                  mlib_s32         dn,
                                  const mlib_s32   *kern,
                                  mlib_s32         scale,
                                  mlib_s32         cmask)
{
  mlib_d64 *buffs_local[3 * (MAX_N + 1)], **buffs = buffs_local, **buff;
  mlib_d64 *buff0, *buff1, *buff2, *buff3, *buffn, *buffd, *buffe;
  mlib_d64 s00, s01, s10, s11, s20, s21, s30, s31, s0, s1, s2, s3;
  mlib_d64 d00, d01, d10, d11, d20, d21, d30, d31;
  mlib_d64 dd, d0, d1;
  mlib_s32 ik, jk, ik_last, jk_size, coff, off, doff;
  mlib_u8 *sl, *sp, *dl;
  mlib_s32 hgt = mlib_ImageGetHeight(src);
  mlib_s32 wid = mlib_ImageGetWidth(src);
  mlib_s32 sll = mlib_ImageGetStride(src);
  mlib_s32 dll = mlib_ImageGetStride(dst);
  mlib_u8 *adr_src = (mlib_u8 *) mlib_ImageGetData(src);
  mlib_u8 *adr_dst = (mlib_u8 *) mlib_ImageGetData(dst);
  mlib_s32 ssize, xsize, dsize, esize, buff_ind;
  mlib_d64 *pbuff, *dp;
  mlib_f32 *karr = (mlib_f32 *) kern;
  mlib_s32 gsr_scale = (31 - scale) << 3;
  mlib_d64 drnd = vis_to_double_dup(mlib_round_8[31 - scale]);
  mlib_s32 i, j, l, chan, testchan;
  mlib_s32 nchan = mlib_ImageGetChannels(dst);
  void (*p_proc_load) (const mlib_u8 *, mlib_u8 *, mlib_s32, mlib_s32);
  void (*p_proc_store) (const mlib_u8 *, mlib_u8 *, mlib_s32, mlib_s32);

  if (n > MAX_N) {
    buffs = mlib_malloc(3 * (n + 1) * sizeof(mlib_d64 *));

    if (buffs == NULL)
      return MLIB_FAILURE;
  }

  buff = buffs + 2 * (n + 1);

  adr_dst += dn * dll + dm * nchan;

  ssize = wid;
  dsize = (ssize + 7) / 8;
  esize = dsize + 4;
  pbuff = mlib_malloc((n + 4) * esize * sizeof(mlib_d64));

  if (pbuff == NULL) {
    if (buffs != buffs_local)
      mlib_free(buffs);
    return MLIB_FAILURE;
  }

  for (i = 0; i < (n + 1); i++)
    buffs[i] = pbuff + i * esize;
  for (i = 0; i < (n + 1); i++)
    buffs[(n + 1) + i] = buffs[i];
  buffd = buffs[n] + esize;
  buffe = buffd + 2 * esize;

  hgt -= (n - 1);
  xsize = ssize - (m - 1);

  vis_write_gsr(gsr_scale + 7);

  if (nchan == 2) {
    p_proc_load = &mlib_v_ImageChannelExtract_U8_21_D1;
    p_proc_store = &mlib_v_ImageChannelInsert_U8_12_D1;
  }
  else if (nchan == 3) {
    p_proc_load = &mlib_v_ImageChannelExtract_U8_31_D1;
    p_proc_store = &mlib_v_ImageChannelInsert_U8_13_D1;
  }
  else {
    p_proc_load = &mlib_v_ImageChannelExtract_U8_41_D1;
    p_proc_store = &mlib_v_ImageChannelInsert_U8_14_D1;
  }

  testchan = 1;
  for (chan = 0; chan < nchan; chan++) {
    buff_ind = 0;
    sl = adr_src;
    dl = adr_dst;

    if ((cmask & testchan) == 0) {
      testchan <<= 1;
      continue;
    }

    for (l = 0; l < n; l++) {
      mlib_d64 *buffn = buffs[l];
      sp = sl + l * sll;

      (*p_proc_load) ((mlib_u8 *) sp, (mlib_u8 *) buffn, ssize, testchan);
    }

    /* init buffer */
#pragma pipeloop(0)
    for (i = 0; i < (xsize + 7) / 8; i++) {
      buffd[2 * i] = drnd;
      buffd[2 * i + 1] = drnd;
    }

    for (j = 0; j < hgt; j++) {
      mlib_d64 **buffc = buffs + buff_ind;
      mlib_f32 *pk = karr, k0, k1, k2, k3;
      sp = sl + n * sll;

      for (l = 0; l < n; l++) {
        buff[l] = buffc[l];
      }

      buffn = buffc[n];

      (*p_proc_load) ((mlib_u8 *) sp, (mlib_u8 *) buffn, ssize, testchan);

      ik_last = (m - 1);

      for (jk = 0; jk < n; jk += jk_size) {
        jk_size = n - jk;

        if (jk_size >= 6)
          jk_size = 4;

        if (jk_size == 5)
          jk_size = 3;

        coff = 0;

        if (jk_size == 1) {

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];

            doff = coff / 8;
            buff0 = buff[jk] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            s01 = buff0[0];
#pragma pipeloop(0)
            for (i = 0; i < (xsize + 7) / 8; i++) {
              s00 = s01;
              s01 = buff0[i + 1];
              s0 = vis_faligndata(s00, s01);

              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);

              d0 = buffd[2 * i];
              d1 = buffd[2 * i + 1];
              d0 = vis_fpadd16(d00, d0);
              d1 = vis_fpadd16(d01, d1);
              buffd[2 * i] = d0;
              buffd[2 * i + 1] = d1;
            }
          }

          pk += m;
        }
        else if (jk_size == 2) {

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];
            k1 = pk[ik + m];

            doff = coff / 8;
            buff0 = buff[jk] + doff;
            buff1 = buff[jk + 1] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            s01 = buff0[0];
            s11 = buff1[0];
#pragma pipeloop(0)
            for (i = 0; i < (xsize + 7) / 8; i++) {
              s00 = s01;
              s10 = s11;
              s01 = buff0[i + 1];
              s11 = buff1[i + 1];
              s0 = vis_faligndata(s00, s01);
              s1 = vis_faligndata(s10, s11);

              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
              d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
              d11 = vis_fmul8x16au(vis_read_lo(s1), k1);

              d0 = buffd[2 * i];
              d1 = buffd[2 * i + 1];
              d0 = vis_fpadd16(d00, d0);
              d0 = vis_fpadd16(d10, d0);
              d1 = vis_fpadd16(d01, d1);
              d1 = vis_fpadd16(d11, d1);
              buffd[2 * i] = d0;
              buffd[2 * i + 1] = d1;
            }
          }

          pk += 2 * m;
        }
        else if (jk_size == 3) {

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];
            k1 = pk[ik + m];
            k2 = pk[ik + 2 * m];

            doff = coff / 8;
            buff0 = buff[jk] + doff;
            buff1 = buff[jk + 1] + doff;
            buff2 = buff[jk + 2] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            if (off == 0) {
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s0 = buff0[i];
                s1 = buff1[i];
                s2 = buff2[i];

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);

                d00 = vis_fpadd16(d00, d10);
                d0 = vis_fpadd16(d20, d0);
                d0 = vis_fpadd16(d00, d0);
                d01 = vis_fpadd16(d01, d11);
                d1 = vis_fpadd16(d21, d1);
                d1 = vis_fpadd16(d01, d1);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
            else if (off == 4) {
              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];

                d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
                d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
                d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
                d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
                d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
                d21 = vis_fmul8x16au(vis_read_hi(s21), k2);

                d00 = vis_fpadd16(d00, d10);
                d0 = vis_fpadd16(d20, d0);
                d0 = vis_fpadd16(d00, d0);
                d01 = vis_fpadd16(d01, d11);
                d1 = vis_fpadd16(d21, d1);
                d1 = vis_fpadd16(d01, d1);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
            else {
              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];
                s0 = vis_faligndata(s00, s01);
                s1 = vis_faligndata(s10, s11);
                s2 = vis_faligndata(s20, s21);

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);

                d00 = vis_fpadd16(d00, d10);
                d0 = vis_fpadd16(d20, d0);
                d0 = vis_fpadd16(d00, d0);
                d01 = vis_fpadd16(d01, d11);
                d1 = vis_fpadd16(d21, d1);
                d1 = vis_fpadd16(d01, d1);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
          }

          pk += 3 * m;
        }
        else {                              /* jk_size == 4 */

          for (ik = 0; ik < m; ik++, coff++) {
            if (!jk && ik == ik_last)
              continue;

            k0 = pk[ik];
            k1 = pk[ik + m];
            k2 = pk[ik + 2 * m];
            k3 = pk[ik + 3 * m];

            doff = coff / 8;
            buff0 = buff[jk] + doff;
            buff1 = buff[jk + 1] + doff;
            buff2 = buff[jk + 2] + doff;
            buff3 = buff[jk + 3] + doff;

            off = coff & 7;
            vis_write_gsr(gsr_scale + off);

            if (off == 0) {

#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s0 = buff0[i];
                s1 = buff1[i];
                s2 = buff2[i];
                s3 = buff3[i];

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
                d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
                d31 = vis_fmul8x16au(vis_read_lo(s3), k3);

                d00 = vis_fpadd16(d00, d10);
                d20 = vis_fpadd16(d20, d30);
                d0 = vis_fpadd16(d0, d00);
                d0 = vis_fpadd16(d0, d20);
                d01 = vis_fpadd16(d01, d11);
                d21 = vis_fpadd16(d21, d31);
                d1 = vis_fpadd16(d1, d01);
                d1 = vis_fpadd16(d1, d21);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
            else if (off == 4) {

              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
              s31 = buff3[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s30 = s31;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];
                s31 = buff3[i + 1];

                d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
                d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
                d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
                d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
                d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
                d21 = vis_fmul8x16au(vis_read_hi(s21), k2);
                d30 = vis_fmul8x16au(vis_read_lo(s30), k3);
                d31 = vis_fmul8x16au(vis_read_hi(s31), k3);

                d00 = vis_fpadd16(d00, d10);
                d20 = vis_fpadd16(d20, d30);
                d0 = vis_fpadd16(d0, d00);
                d0 = vis_fpadd16(d0, d20);
                d01 = vis_fpadd16(d01, d11);
                d21 = vis_fpadd16(d21, d31);
                d1 = vis_fpadd16(d1, d01);
                d1 = vis_fpadd16(d1, d21);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
            else {

              s01 = buff0[0];
              s11 = buff1[0];
              s21 = buff2[0];
              s31 = buff3[0];
#pragma pipeloop(0)
              for (i = 0; i < (xsize + 7) / 8; i++) {
                d0 = buffd[2 * i];
                d1 = buffd[2 * i + 1];

                s00 = s01;
                s10 = s11;
                s20 = s21;
                s30 = s31;
                s01 = buff0[i + 1];
                s11 = buff1[i + 1];
                s21 = buff2[i + 1];
                s31 = buff3[i + 1];
                s0 = vis_faligndata(s00, s01);
                s1 = vis_faligndata(s10, s11);
                s2 = vis_faligndata(s20, s21);
                s3 = vis_faligndata(s30, s31);

                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
                d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
                d31 = vis_fmul8x16au(vis_read_lo(s3), k3);

                d00 = vis_fpadd16(d00, d10);
                d20 = vis_fpadd16(d20, d30);
                d0 = vis_fpadd16(d0, d00);
                d0 = vis_fpadd16(d0, d20);
                d01 = vis_fpadd16(d01, d11);
                d21 = vis_fpadd16(d21, d31);
                d1 = vis_fpadd16(d1, d01);
                d1 = vis_fpadd16(d1, d21);
                buffd[2 * i] = d0;
                buffd[2 * i + 1] = d1;
              }
            }
          }

          pk += 4 * m;
        }
      }

      /*****************************************
       *****************************************
       **          Final iteration            **
       *****************************************
       *****************************************/

      jk_size = n;

      if (jk_size >= 6)
        jk_size = 4;

      if (jk_size == 5)
        jk_size = 3;

      k0 = karr[ik_last];
      k1 = karr[ik_last + m];
      k2 = karr[ik_last + 2 * m];
      k3 = karr[ik_last + 3 * m];

      off = ik_last;
      doff = off / 8;
      off &= 7;
      buff0 = buff[0] + doff;
      buff1 = buff[1] + doff;
      buff2 = buff[2] + doff;
      buff3 = buff[3] + doff;
      vis_write_gsr(gsr_scale + off);

      if (jk_size == 1) {
        dp = buffe;

        s01 = buff0[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s01 = buff0[i + 1];
          s0 = vis_faligndata(s00, s01);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d1 = vis_fpadd16(d1, d01);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }
      else if (jk_size == 2) {
        dp = buffe;

        s01 = buff0[0];
        s11 = buff1[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s10 = s11;
          s01 = buff0[i + 1];
          s11 = buff1[i + 1];
          s0 = vis_faligndata(s00, s01);
          s1 = vis_faligndata(s10, s11);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d0 = vis_fpadd16(d0, d10);
          d1 = vis_fpadd16(d1, d01);
          d1 = vis_fpadd16(d1, d11);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }
      else if (jk_size == 3) {

        dp = buffe;

        s01 = buff0[0];
        s11 = buff1[0];
        s21 = buff2[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s10 = s11;
          s20 = s21;
          s01 = buff0[i + 1];
          s11 = buff1[i + 1];
          s21 = buff2[i + 1];
          s0 = vis_faligndata(s00, s01);
          s1 = vis_faligndata(s10, s11);
          s2 = vis_faligndata(s20, s21);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
          d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
          d21 = vis_fmul8x16au(vis_read_lo(s2), k2);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d0 = vis_fpadd16(d0, d10);
          d0 = vis_fpadd16(d0, d20);
          d1 = vis_fpadd16(d1, d01);
          d1 = vis_fpadd16(d1, d11);
          d1 = vis_fpadd16(d1, d21);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }
      else {                                /* if (jk_size == 4) */

        dp = buffe;

        s01 = buff0[0];
        s11 = buff1[0];
        s21 = buff2[0];
        s31 = buff3[0];
#pragma pipeloop(0)
        for (i = 0; i < (xsize + 7) / 8; i++) {
          s00 = s01;
          s10 = s11;
          s20 = s21;
          s30 = s31;
          s01 = buff0[i + 1];
          s11 = buff1[i + 1];
          s21 = buff2[i + 1];
          s31 = buff3[i + 1];
          s0 = vis_faligndata(s00, s01);
          s1 = vis_faligndata(s10, s11);
          s2 = vis_faligndata(s20, s21);
          s3 = vis_faligndata(s30, s31);

          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
          d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
          d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
          d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
          d31 = vis_fmul8x16au(vis_read_lo(s3), k3);

          d0 = buffd[2 * i];
          d1 = buffd[2 * i + 1];
          d0 = vis_fpadd16(d0, d00);
          d0 = vis_fpadd16(d0, d10);
          d0 = vis_fpadd16(d0, d20);
          d0 = vis_fpadd16(d0, d30);
          d1 = vis_fpadd16(d1, d01);
          d1 = vis_fpadd16(d1, d11);
          d1 = vis_fpadd16(d1, d21);
          d1 = vis_fpadd16(d1, d31);

          dd = vis_fpack16_pair(d0, d1);
          dp[i] = dd;

          buffd[2 * i] = drnd;
          buffd[2 * i + 1] = drnd;
        }
      }

      (*p_proc_store) ((mlib_u8 *) buffe, (mlib_u8 *) dl, xsize, testchan);

      sl += sll;
      dl += dll;

      buff_ind++;

      if (buff_ind >= (n + 1))
        buff_ind = 0;
    }

    testchan <<= 1;
  }

  mlib_free(pbuff);

  if (buffs != buffs_local)
    mlib_free(buffs);

  return MLIB_SUCCESS;
}
mlib_status
mlib_m_ImageInitInterpTableAffine_S16(
    mlib_interp_table * table,
    mlib_s32 nchan)
{
	mlib_s32 width, height, width_bits, height_bits, vis_width_bits,
	    vis_height_bits;
	mlib_s32 subsampleBitsH, subsampleBitsV;
	mlib_s32 i, j, c, scale, num_copy, num_copy_old;
	mlib_s32 isum;
	mlib_s32 max_scale, min_scale, scaleh, scalev;
	mlib_s32 norm_scale_v, norm_scale_h;
	mlib_d64 dscale, *dataH, *dataV;
	mlib_d64 **ptr_tablex, *tablex, *tablex_old, *tabley;
	mlib_d64 max, d;
	mlib_d64 sumh, sumv, normh, normv;

	if (!table)
		return (MLIB_FAILURE);
	if (table->shift_vis_affine < 0)
		return (MLIB_FAILURE);

	if (nchan == 1) {
		num_copy = 1;
		ptr_tablex = &(table->dataH_s16_1);
	} else if (nchan == 2) {
		num_copy = 2;
		ptr_tablex = &(table->dataH_s16_3);
	} else if (nchan == 3 || nchan == 4) {
		num_copy = 4;
		ptr_tablex = &(table->dataH_s16_4);
	} else
		return (MLIB_FAILURE);

	if (*ptr_tablex != NULL && table->dataV_s16_1 != NULL)
		return (MLIB_SUCCESS);

	dataH = mlib_ImageGetInterpDoubleDataH(table);
	dataV = mlib_ImageGetInterpDoubleDataV(table);
	if (!dataH || !dataV)
		return (MLIB_FAILURE);

	width = mlib_ImageGetInterpWidth(table);
	height = mlib_ImageGetInterpHeight(table);
	width_bits = mlib_ImageGetInterpWidthBits(table);
	height_bits = mlib_ImageGetInterpHeightBits(table);
	vis_width_bits = table->vis_width_bits;
	vis_height_bits = table->vis_height_bits;
	subsampleBitsH = mlib_ImageGetInterpSubsampleBitsH(table);
	subsampleBitsV = mlib_ImageGetInterpSubsampleBitsV(table);

	if (table->dataV_s16_1 != NULL) {
		if (table->dataH_s16_1 != NULL) {
			tablex_old = table->dataH_s16_1;
			num_copy_old = 1;
		} else if (table->dataH_s16_3 != NULL) {
			tablex_old = table->dataH_s16_3;
			num_copy_old = 3;
		} else {
			tablex_old = table->dataH_s16_4;
			num_copy_old = 4;
		}

		tablex =
		    mlib_malloc(num_copy * (1 << subsampleBitsH) *
		    (1 << vis_width_bits) * sizeof (mlib_s16));
		if (tablex == NULL)
			return (MLIB_FAILURE);

		for (j = 0; j < ((width + 1) & ~1); j++) {
			mlib_s16 *tbl = (mlib_s16 *)tablex + j * num_copy;
			mlib_s16 *tbl_old =
			    (mlib_s16 *)tablex_old + j * num_copy_old;
			for (i = 0; i < (1 << subsampleBitsH); i++) {
				mlib_s16 v =
				    tbl_old[num_copy_old *
				    (i << vis_width_bits)];
				for (c = 0; c < num_copy; c++) {
					tbl[num_copy * (i << vis_width_bits) +
					    c] = v;
				}
			}
		}
		*ptr_tablex = tablex;
		return (MLIB_SUCCESS);
	}

	sumv = 0;
	max = 0;

	for (i = 0; i < (1 << subsampleBitsV); i++) {
		mlib_d64 s = 0;
		mlib_s32 ind = (i << height_bits);

		for (j = 0; j < height; j++) {
			d = mlib_fabs(dataV[j + ind]);
			s += d;
			max = (max > d) ? max : d;
		}
		sumv = (sumv > s) ? sumv : s;
	}

/* all fhkernels = 0 */
	if (sumv == 0) {
		dscale = 0;

/* X table */

		tablex =
		    mlib_malloc(num_copy * (1 << subsampleBitsH) *
		    (1 << vis_width_bits) * sizeof (mlib_s16));
		if (tablex == NULL)
			return (MLIB_FAILURE);

		INIT_TABLE_16(tablex, (1 << subsampleBitsH), width, width_bits,
		    vis_width_bits, dataH);

		if ((dataH == dataV) && num_copy == 4)
			tabley = tablex;
		else {

			num_copy = 4;

			tabley =
			    mlib_malloc(num_copy * (1 << subsampleBitsV) *
			    (1 << vis_height_bits) * sizeof (mlib_s16));
			if (tabley == NULL) {
				mlib_free(tablex);
				return (MLIB_FAILURE);
			}

			INIT_TABLE_16(tabley, (1 << subsampleBitsV), height,
			    height_bits, vis_height_bits, dataV);

			*ptr_tablex = tablex;
			table->dataV_s16_1 = tabley;

/* Store shift */
			table->shift_vis_affine = 43;

			return (MLIB_SUCCESS);
		}
	}

	normv = 32767.0 / (32768.0 * sumv);
	scalev = mlib_ilogb(sumv * normv);
	isum = mlib_ilogb(max * normv);

/* all elements must be in the range -32768, 32767 */
	if (scalev == isum)
		norm_scale_v = 14;
/* but sumv may be in the range -65576, 65575 */
	else
		norm_scale_v = 15;

	min_scale = 25;
	max_scale = 40;

	normh = 32768.0 * sumv / 32767;

	if (dataH != dataV) {
		sumh = 0;
		max = 0;

		for (i = 0; i < (1 << subsampleBitsH); i++) {
			mlib_d64 s = 0;
			mlib_s32 ind = (i << width_bits);

			for (j = 0; j < width; j++) {
				d = mlib_fabs(dataH[j + ind]);
				s += d;
				max = (max > d) ? max : d;
			}
			sumh = (sumh > s) ? sumh : s;
		}
	} else
		sumh = sumv;

	isum = mlib_ilogb(max * normh);
	scaleh = mlib_ilogb(sumh * normh);

/* all elements must be in the range -32768, 32767 */
	if (scaleh == isum)
		norm_scale_h = 14;
/* but sumh may be in the range -65576, 65575 */
	else
		norm_scale_h = 15;

	scale = norm_scale_v + norm_scale_h - (scaleh + scalev);

	if (scale < min_scale) {
		table->shift_vis_affine = -1;
/* koeff. are so large */
		return (MLIB_FAILURE);
	}

	if (scale > max_scale) {
		scaleh += (scale - max_scale + 1) >> 1;
		scalev += (scale - max_scale) >> 1;
		scale = max_scale;
	}
mlib_status
mlib_m_conv3x3_16nw_4(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
	__m64 buff_loc[6 * BUFF_LINE], *pbuff = buff_loc;
	__m64 *buff0, *buff1, *buff2, *buffT;
	GET_SRC_DST_PARAMETERS(mlib_s16);
	__m64 ker1, ker2, ker3, ker4, ker5, ker6, ker7, ker8, ker9;
	__m64 d0, d1, d2, rr, tmpa, tmpb;
	__m64 prev0h, prev1h, sum0h, sum1h, sum2h, tmph;
	__m64 prev0l, prev1l, sum0l, sum1l, sum2l, tmpl;
	__m64 *sp, *dp;
	mlib_s32 shift;
	mlib_s32 row, wid4, i, j;

	width -= 2;
	height -= 2;
	width *= NCHAN;
	dl += dll + NCHAN;

	wid4 = (width + 3) / 4;

	if (wid4 > BUFF_LINE) {
		pbuff = mlib_malloc(sizeof (__m64) * 6 * wid4);
	}

	GET_KERN();

	buff0 = pbuff;
	buff1 = buff0 + 2 * wid4;
	buff2 = buff1 + 2 * wid4;

	for (j = 0; j < 2; j++) {
		sp = (__m64 *) sl;

		d1 = (*sp++);
		d2 = (*sp++);
		for (i = 0; i < wid4; i++) {
			PREP_3x3(i);
		}

		sl += sll;

		if (j == 0) {
			buffT = buff1;
			buff1 = buff0;
			buff0 = buffT;
		}
	}

	for (row = 0; row < height; row++) {
		sp = (__m64 *) sl;
		dp = (__m64 *) dl;

		d1 = (*sp++);
		d2 = (*sp++);
		for (i = 0; i < width / 4; i++) {
			CONV_3x3(i);
			dp[i] = rr;
		}

		if (width & 3) {
			__m64 mask =
			    ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

			CONV_3x3(i);

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, rr),
			    _mm_andnot_si64(mask, dp[i]));
		}

		buffT = buff1;
		buff1 = buff0;
		buff0 = buffT;

		sl += sll;
		dl += dll;
	}

	_mm_empty();

	if (pbuff != buff_loc)
		mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
mlib_status
mlib_m_sconv5x5_8nw_2(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
    __m64 *pbuff, *buff_arr[5];
    __m64 *buff0, *buff1, *buff2, *buff3, *buff4, *buffT;
    GET_SRC_DST_PARAMETERS(mlib_u8);
    __m64 hker0, hker1, hker2, hker3, hker4;
    __m64 vker0, vker1, vker2, vker3, vker4;
    __m64 s0, d0, d1, d2, prev0;
    __m64 sum0, sum1, sum2, sum3, sum4, aa, bb, res_hi, res_lo;
    __m64 zero = _m_zero;
    mlib_s32 shift, ind;
    mlib_s32 *sp;
    mlib_s32 row, wid4, i, j;

    width -= 4;
    height -= 4;
    width *= NCHAN;
    dl += 2 * (dll + NCHAN);

    wid4 = 2 * ((width + 7) / 8);
    pbuff = mlib_malloc(sizeof (__m64) * 5 * wid4);

    GET_KERN();

    for (i = 0; i < 5; i++) {
        buff_arr[i] = pbuff + i * wid4;
    }

    for (j = 0; j < 4; j++) {
        buff4 = buff_arr[j];

        sp = (mlib_s32 *)sl;

        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d1, lo);
        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d2, lo);

        for (i = 0; i < wid4; i++) {
            *(mlib_s32 *)&s0 = sp[i];

            PREP_5x5(lo, i);
        }

        sl += sll;
        ind++;
    }

    buff0 = buff_arr[0];
    buff1 = buff_arr[1];
    buff2 = buff_arr[2];
    buff3 = buff_arr[3];
    buff4 = buff_arr[4];

    for (row = 0; row < height; row++) {
        __m64 *sp = (__m64 *) sl;
        __m64 *dp = (__m64 *) dl;

        s0 = (*sp++);
        UNPACK_SRC(d1, lo);
        UNPACK_SRC(d2, hi);

        for (i = 0; i < width / 8; i++) {
            s0 = sp[i];
            CONV_5x5(lo, 2 * i);
            CONV_5x5(hi, 2 * i + 1);

            dp[i] = _mm_packs_pu16(res_lo, res_hi);
        }

        if (width & 7) {
            __m64 mask = ((__m64 *) mlib_mask64_arr)[width & 7];

            s0 = sp[i];
            CONV_5x5(lo, 2 * i);
            CONV_5x5(hi, 2 * i + 1);
            res_hi = _mm_packs_pu16(res_lo, res_hi);

            dp[i] =
                _mm_or_si64(_mm_and_si64(mask, res_hi),
                            _mm_andnot_si64(mask, dp[i]));
        }

        buffT = buff0;
        buff0 = buff1;
        buff1 = buff2;
        buff2 = buff3;
        buff3 = buff4;
        buff4 = buffT;

        sl += sll;
        dl += dll;
    }

    _mm_empty();
    mlib_free(pbuff);

    return (MLIB_SUCCESS);
}
mlib_status
mlib_m_conv5x5_u16nw_2(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
	__m64 *pbuff, *buff_arr[20], **pbuff_arr = buff_arr;
	__m64 *buff0, *buff1, *buff2, *buff3;
	GET_SRC_DST_PARAMETERS(mlib_s16);
	__m64 ker[5][5];
	__m64 d0, d1, d2, aa, bb, rr, tmpa, tmpb, ker_off, mask8000;
	__m64 prev0h, prev1h, prev2h, prev3h, sum0h, sum1h, sum2h, sum3h, sum4h,
	    tmph;
	__m64 prev0l, prev1l, prev2l, prev3l, sum0l, sum1l, sum2l, sum3l, sum4l,
	    tmpl;
	__m64 *sp, *dp;
	mlib_s32 shift, ind, ker_sum = 0;
	mlib_s32 row, wid4, i, j;

	width -= 4;
	height -= 4;
	width *= NCHAN;
	dl += 2 * (dll + NCHAN);

	wid4 = (width + 7) / 4;
	pbuff = mlib_malloc(sizeof (__m64) * 20 * wid4);

	GET_KERN();

	for (i = 0; i < 10; i++) {
		buff_arr[i] = pbuff + i * 2 * wid4;
	}

	ind = 0;
	for (j = 1; j <= 4; j++) {
		buff0 = buff_arr[ind];
		buff1 = buff_arr[ind + 1];
		buff2 = buff_arr[ind + 2];
		buff3 = buff_arr[ind + 3];

		sp = (__m64 *) sl;
		d1 = (*sp++);
		d1 = _mm_xor_si64(d1, mask8000);
		d2 = (*sp++);
		d2 = _mm_xor_si64(d2, mask8000);

		for (i = 0; i < wid4; i++) {
			PREP_5x5();
		}

		sl += sll;
		ind += j;
	}

	for (row = 0; row < height; row++) {
		sp = (__m64 *) sl;
		dp = (__m64 *) dl;

		buff0 = pbuff_arr[0];
		buff1 = pbuff_arr[2];
		buff2 = pbuff_arr[5];
		buff3 = pbuff_arr[9];

		d1 = (*sp++);
		d1 = _mm_xor_si64(d1, mask8000);
		d2 = (*sp++);
		d2 = _mm_xor_si64(d2, mask8000);

		for (i = 0; i < width / 4; i++) {
			CONV_5x5(hi, i);

			dp[i] = rr;
		}

		if (width & 3) {
			__m64 mask =
			    ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

			CONV_5x5(hi, i);

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, rr),
			    _mm_andnot_si64(mask, dp[i]));
		}

		ind = (pbuff_arr == buff_arr) ? 10 : -10;
		pbuff_arr[ind + 0] = pbuff_arr[1];
		pbuff_arr[ind + 1] = pbuff_arr[3];
		pbuff_arr[ind + 2] = pbuff_arr[4];
		pbuff_arr[ind + 3] = pbuff_arr[6];
		pbuff_arr[ind + 4] = pbuff_arr[7];
		pbuff_arr[ind + 5] = pbuff_arr[8];
		pbuff_arr[ind + 6] = pbuff_arr[0];
		pbuff_arr[ind + 7] = pbuff_arr[2];
		pbuff_arr[ind + 8] = pbuff_arr[5];
		pbuff_arr[ind + 9] = pbuff_arr[9];
		pbuff_arr += ind;

		sl += sll;
		dl += dll;
	}

	_mm_empty();
	mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
mlib_status
mlib_ImageMinFilter5x5_U8(
    void *dst,
    void *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
#endif		   /* MAX_FILTER */
{
	mlib_u8 *pbuff, *buff0, *buff1, *buff2, *buff3, *buffT;
	mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *dl;
	__m64 *dp0, *dp1;
	__m64 aa, bb, cc, dd, e0, e1, e2, e3, e4, ee, f0, f1, f2, f3, f4, ff,
	    r0, r1;
	__m64 e_mask, mask8080;
	mlib_s32 i, j, wid8, tail;

	wid = (wid - KSIZE1) * SSIZE;
	wid8 = (wid + 7) & ~7;
	pbuff = mlib_malloc(4 * wid8);
	buff0 = pbuff;
	buff1 = buff0 + wid8;
	buff2 = buff1 + wid8;
	buff3 = buff2 + wid8;

	sl = (mlib_u8 *)src;
	dl = (mlib_u8 *)dst + 2 * (dlb + SSIZE);

	tail = wid & 7;
	e_mask = ((__m64 *) mlib_mask64_arr)[tail];
	mask8080 = mmx_from_int_dup(0x80808080);

	for (j = 0; j < 2; j++) {
		sp0 = buff0;
		sp1 = buff1;
		sp4 = sl;
		sp5 = sl + slb;
		sl += 2 * slb;

		for (i = 0; i < wid; i += 8) {
			e0 = *(__m64 *) sp4;
			e1 = *(__m64 *) (sp4 + SSIZE);
			e2 = *(__m64 *) (sp4 + 2 * SSIZE);
			e3 = *(__m64 *) (sp4 + 3 * SSIZE);
			e4 = *(__m64 *) (sp4 + 4 * SSIZE);
			f0 = *(__m64 *) sp5;
			f1 = *(__m64 *) (sp5 + SSIZE);
			f2 = *(__m64 *) (sp5 + 2 * SSIZE);
			f3 = *(__m64 *) (sp5 + 3 * SSIZE);
			f4 = *(__m64 *) (sp5 + 4 * SSIZE);

			ee = C_COMP(e0, e1);
			ff = C_COMP(f0, f1);
			e2 = C_COMP(e2, e3);
			f2 = C_COMP(f2, f3);
			ee = C_COMP(ee, e4);
			ff = C_COMP(ff, f4);
			ee = C_COMP(ee, e2);
			ff = C_COMP(ff, f2);

			*(__m64 *) sp0 = ee;
			*(__m64 *) sp1 = ff;

			sp0 += 8;
			sp1 += 8;
			sp4 += 8;
			sp5 += 8;
		}

		buffT = buff0;
		buff0 = buff2;
		buff2 = buffT;
		buffT = buff1;
		buff1 = buff3;
		buff3 = buffT;
	}

	for (j = 0; j <= (hgt - KSIZE1 - 2); j += 2) {
		dp0 = (void *)dl;
		dp1 = (void *)(dl + dlb);
		sp0 = buff0;
		sp1 = buff1;
		sp2 = buff2;
		sp3 = buff3;
		sp4 = sl;
		sp5 = sl + slb;

/*
 *    line0:        aa
 *    line1:        bb
 *    line2:        cc
 *    line3:        dd
 *    line4:  e0 e1 e2 e3 e4
 *    line5:  f0 f1 f2 f3 f4
 */

		for (i = 0; i <= wid - 8; i += 8) {
			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			cc = *(__m64 *) sp2;
			dd = *(__m64 *) sp3;
			e0 = *(__m64 *) sp4;
			e1 = *(__m64 *) (sp4 + SSIZE);
			e2 = *(__m64 *) (sp4 + 2 * SSIZE);
			e3 = *(__m64 *) (sp4 + 3 * SSIZE);
			e4 = *(__m64 *) (sp4 + 4 * SSIZE);
			f0 = *(__m64 *) sp5;
			f1 = *(__m64 *) (sp5 + SSIZE);
			f2 = *(__m64 *) (sp5 + 2 * SSIZE);
			f3 = *(__m64 *) (sp5 + 3 * SSIZE);
			f4 = *(__m64 *) (sp5 + 4 * SSIZE);

			ee = C_COMP(e0, e1);
			ff = C_COMP(f0, f1);
			e2 = C_COMP(e2, e3);
			f2 = C_COMP(f2, f3);
			ee = C_COMP(ee, e4);
			ff = C_COMP(ff, f4);
			ee = C_COMP(ee, e2);
			ff = C_COMP(ff, f2);

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			bb = C_COMP(bb, dd);

			r0 = C_COMP(aa, bb);
			r1 = C_COMP(bb, ff);

			*(__m64 *) sp0 = ee;
			*(__m64 *) sp1 = ff;
			(*dp0++) = r0;
			(*dp1++) = r1;

			sp0 += 8;
			sp1 += 8;
			sp2 += 8;
			sp3 += 8;
			sp4 += 8;
			sp5 += 8;
		}

		if (tail) {
			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			cc = *(__m64 *) sp2;
			dd = *(__m64 *) sp3;
			e0 = *(__m64 *) sp4;
			e1 = *(__m64 *) (sp4 + SSIZE);
			e2 = *(__m64 *) (sp4 + 2 * SSIZE);
			e3 = *(__m64 *) (sp4 + 3 * SSIZE);
			e4 = *(__m64 *) (sp4 + 4 * SSIZE);
			f0 = *(__m64 *) sp5;
			f1 = *(__m64 *) (sp5 + SSIZE);
			f2 = *(__m64 *) (sp5 + 2 * SSIZE);
			f3 = *(__m64 *) (sp5 + 3 * SSIZE);
			f4 = *(__m64 *) (sp5 + 4 * SSIZE);

			ee = C_COMP(e0, e1);
			ff = C_COMP(f0, f1);
			e2 = C_COMP(e2, e3);
			f2 = C_COMP(f2, f3);
			ee = C_COMP(ee, e4);
			ff = C_COMP(ff, f4);
			ee = C_COMP(ee, e2);
			ff = C_COMP(ff, f2);

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			bb = C_COMP(bb, dd);

			r0 = C_COMP(aa, bb);
			r1 = C_COMP(bb, ff);

			*(__m64 *) sp0 = ee;
			*(__m64 *) sp1 = ff;

			*dp0 =
			    _mm_or_si64(_mm_and_si64(e_mask, r0),
			    _mm_andnot_si64(e_mask, *dp0));
			*dp1 =
			    _mm_or_si64(_mm_and_si64(e_mask, r1),
			    _mm_andnot_si64(e_mask, *dp1));
		}

		buffT = buff0;
		buff0 = buff2;
		buff2 = buffT;
		buffT = buff1;
		buff1 = buff3;
		buff3 = buffT;

		sl += 2 * slb;
		dl += 2 * dlb;
	}

/* last line */

	if (j == (hgt - KSIZE1 - 1)) {
		dp0 = (void *)dl;
		dp1 = (void *)(dl + dlb);
		sp0 = buff0;
		sp1 = buff1;
		sp2 = buff2;
		sp3 = buff3;
		sp4 = sl;

		for (i = 0; i <= wid - 8; i += 8) {
			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			cc = *(__m64 *) sp2;
			dd = *(__m64 *) sp3;
			e0 = *(__m64 *) sp4;
			e1 = *(__m64 *) (sp4 + SSIZE);
			e2 = *(__m64 *) (sp4 + 2 * SSIZE);
			e3 = *(__m64 *) (sp4 + 3 * SSIZE);
			e4 = *(__m64 *) (sp4 + 4 * SSIZE);

			ee = C_COMP(e0, e1);
			e2 = C_COMP(e2, e3);
			ee = C_COMP(ee, e4);
			ee = C_COMP(ee, e2);

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			bb = C_COMP(bb, dd);

			r0 = C_COMP(aa, bb);
			(*dp0++) = r0;

			sp0 += 8;
			sp1 += 8;
			sp2 += 8;
			sp3 += 8;
			sp4 += 8;
		}

		if (tail) {
			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			cc = *(__m64 *) sp2;
			dd = *(__m64 *) sp3;
			e0 = *(__m64 *) sp4;
			e1 = *(__m64 *) (sp4 + SSIZE);
			e2 = *(__m64 *) (sp4 + 2 * SSIZE);
			e3 = *(__m64 *) (sp4 + 3 * SSIZE);
			e4 = *(__m64 *) (sp4 + 4 * SSIZE);

			ee = C_COMP(e0, e1);
			e2 = C_COMP(e2, e3);
			ee = C_COMP(ee, e4);
			ee = C_COMP(ee, e2);

			bb = C_COMP(bb, cc);
			dd = C_COMP(dd, ee);
			bb = C_COMP(bb, dd);

			r0 = C_COMP(aa, bb);

			*dp0 =
			    _mm_or_si64(_mm_and_si64(e_mask, r0),
			    _mm_andnot_si64(e_mask, *dp0));
		}
	}

	_mm_empty();

	mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
Example #18
0
static mlib_status mlib_ImageConv1xN(mlib_image       *dst,
                                     const mlib_image *src,
                                     const mlib_d64   *k,
                                     mlib_s32         n,
                                     mlib_s32         dn,
                                     mlib_s32         cmask)
{
  FTYPE    buff[BUFF_SIZE];
  mlib_s32 off, kh;
  mlib_s32 d0, d1;
  const FTYPE    *pk;
  FTYPE    k0, k1, k2, k3;
  FTYPE    p0, p1, p2, p3, p4;
  DEF_VARS(DTYPE);
  DTYPE    *sl_c, *dl_c, *sl0;
  mlib_s32 l, hsize, max_hsize;
  GET_SRC_DST_PARAMETERS(DTYPE);

  hgt -= (n - 1);
  adr_dst += dn*dll;

  max_hsize = (CACHE_SIZE/sizeof(DTYPE))/sll;

  if (!max_hsize) max_hsize = 1;

  if (max_hsize > BUFF_SIZE) {
    pbuff = mlib_malloc(sizeof(FTYPE)*max_hsize);
  }

  chan1 = nchannel;

  sl_c = adr_src;
  dl_c = adr_dst;

  for (l = 0; l < hgt; l += hsize) {
    hsize = hgt - l;

    if (hsize > max_hsize) hsize = max_hsize;

    for (c = 0; c < nchannel; c++) {
      if (!(cmask & (1 << (chan1 - 1 - c)))) continue;

      sl = sl_c + c;
      dl = dl_c + c;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (j = 0; j < hsize; j++) pbuff[j] = 0.0;

      for (i = 0; i < wid; i++) {
        sl0 = sl;

        for (off = 0; off < (n - 4); off += 4) {
          pk = k + off;
          sp = sl0;

          k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
          p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
          sp += 3*sll;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (j = 0; j < hsize; j += 2) {
            p0 = p2; p1 = p3; p2 = p4;
            p3 = sp[0];
            p4 = sp[sll];

            pbuff[j    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
            pbuff[j + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;

            sp += 2*sll;
          }

          sl0 += 4*sll;
        }

        pk = k + off;
        sp = sl0;

        k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
        p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];

        dp = dl;
        kh = n - off;

        if (kh == 4) {
          sp += 3*sll;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (j = 0; j <= (hsize - 2); j += 2) {
            p0 = p2; p1 = p3; p2 = p4;
            p3 = sp[0];
            p4 = sp[sll];

            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]);
            d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + pbuff[j + 1]);

            dp[0  ] = FROM_S32(d0);
            dp[dll] = FROM_S32(d1);

            pbuff[j] = 0;
            pbuff[j + 1] = 0;

            sp += 2*sll;
            dp += 2*dll;
          }

          if (j < hsize) {
            p0 = p2; p1 = p3; p2 = p4;
            p3 = sp[0];

            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]);

            pbuff[j] = 0;

            dp[0] = FROM_S32(d0);
          }

        } else if (kh == 3) {
          sp += 2*sll;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (j = 0; j <= (hsize - 2); j += 2) {
            p0 = p2; p1 = p3;
            p2 = sp[0];
            p3 = sp[sll];

            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]);
            d1 = D2I(p1*k0 + p2*k1 + p3*k2 + pbuff[j + 1]);

            dp[0  ] = FROM_S32(d0);
            dp[dll] = FROM_S32(d1);

            pbuff[j] = 0;
            pbuff[j + 1] = 0;

            sp += 2*sll;
            dp += 2*dll;
          }

          if (j < hsize) {
            p0 = p2; p1 = p3;
            p2 = sp[0];

            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]);

            pbuff[j] = 0;

            dp[0] = FROM_S32(d0);
          }

        } else if (kh == 2) {
          sp += sll;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (j = 0; j <= (hsize - 2); j += 2) {
            p0 = p2;
            p1 = sp[0];
            p2 = sp[sll];

            d0 = D2I(p0*k0 + p1*k1 + pbuff[j]);
            d1 = D2I(p1*k0 + p2*k1 + pbuff[j + 1]);

            dp[0  ] = FROM_S32(d0);
            dp[dll] = FROM_S32(d1);

            pbuff[j] = 0;
            pbuff[j + 1] = 0;

            sp += 2*sll;
            dp += 2*dll;
          }

          if (j < hsize) {
            p0 = p2;
            p1 = sp[0];

            d0 = D2I(p0*k0 + p1*k1 + pbuff[j]);

            pbuff[j] = 0;

            dp[0] = FROM_S32(d0);
          }

        } else /* if (kh == 1) */ {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
          for (j = 0; j < hsize; j++) {
            p0 = sp[0];

            d0 = D2I(p0*k0 + pbuff[j]);

            dp[0] = FROM_S32(d0);

            pbuff[j] = 0;

            sp += sll;
            dp += dll;
          }
        }

        sl += chan1;
        dl += chan1;
      }
    }

    sl_c += max_hsize*sll;
    dl_c += max_hsize*dll;
  }

  if (pbuff != buff) mlib_free(pbuff);

  return MLIB_SUCCESS;
}
mlib_status mlib_ImageAffineEdgeExtend_BL(mlib_affine_param *param,
                                          mlib_affine_param *param_e,
                                          const void        *colormap)
{
  GET_EDGE_PARAMS();
  mlib_d64 scale = 1.0 / (mlib_d64) MLIB_PREC;
  mlib_s32 xDelta, yDelta, xFlag, yFlag;
  mlib_d64 t, u, pix0;
  mlib_d64 a00, a01, a10, a11;

  if (colormap != NULL) {
    mlib_s32 max_xsize = param_e->max_xsize;
    mlib_type ltype = mlib_ImageGetLutType(colormap);
    mlib_d64 *plut = (mlib_d64 *) mlib_ImageGetLutDoubleData(colormap);
    void *buff;

    channels = mlib_ImageGetLutChannels(colormap);
    plut -= channels * mlib_ImageGetLutOffset(colormap);

    if (max_xsize == 0) {
      return MLIB_SUCCESS;
    }

    if (ltype == MLIB_BYTE) {
      buff = mlib_malloc(channels * max_xsize);
    }
    else {
      buff = mlib_malloc(channels * max_xsize * sizeof(mlib_s16));
    }

    if (buff == NULL)
      return MLIB_FAILURE;

    switch (ltype) {
      case MLIB_BYTE:
        switch (type) {
          case MLIB_BYTE:
            MLIB_PROCESS_EDGES(MLIB_EDGE_INDEX_u8i, mlib_u8);
            break;

          case MLIB_SHORT:
            srcStride >>= 1;
            MLIB_PROCESS_EDGES(MLIB_EDGE_INDEX_u8i, mlib_s16);
            break;
        }

        break;

      case MLIB_SHORT:
        switch (type) {
          case MLIB_BYTE:
            MLIB_PROCESS_EDGES(MLIB_EDGE_INDEX_s16i, mlib_u8);
            break;

          case MLIB_SHORT:
            srcStride >>= 1;
            MLIB_PROCESS_EDGES(MLIB_EDGE_INDEX_s16i, mlib_s16);
            break;
        }

        break;
    }

    mlib_free(buff);

    return MLIB_SUCCESS;
  }
/* *********************************************************** */
mlib_status
mlib_m_sconv3x3_8nw_1(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
	__m64 buff_loc[3 * BUFF_LINE], *pbuff = buff_loc;
	__m64 *buff0, *buff1, *buffT;
	GET_SRC_DST_PARAMETERS(mlib_u8);
	__m64 hker0, hker1, hker2, vker0, vker1, vker2;
	__m64 s0, d0, d1, sum0, sum1, sum2, aa, bb, res_hi, res_lo;
	__m64 zero = _m_zero;
	mlib_s32 shift;
	mlib_s32 *sp;
	mlib_s32 row, wid4, i, j;

	width -= 2;
	height -= 2;
	dl += dll + 1;

	wid4 = (width + 7) / 4;

	if (wid4 > BUFF_LINE) {
		pbuff = mlib_malloc(sizeof (__m64) * 3 * wid4);
	}

	GET_KERN();

	buff0 = pbuff;
	buff1 = buff0 + wid4;

	for (j = 0; j < 2; j++) {
		sp = (mlib_s32 *)sl;

		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d1, lo);

		for (i = 0; i < wid4; i++) {
			*(mlib_s32 *)&s0 = sp[i];

			PREP_3x3_1ch(lo, i);
		}

		sl += sll;

		buffT = buff1;
		buff1 = buff0;
		buff0 = buffT;
	}

	for (row = 0; row < height; row++) {
		__m64 *sp = (__m64 *) sl;
		__m64 *dp = (__m64 *) dl;

		s0 = (*sp++);
		UNPACK_SRC(d1, lo);

		for (i = 0; i < width / 8; i++) {
			CONV_3x3_1ch(hi, 2 * i);
			s0 = sp[i];
			CONV_3x3_1ch(lo, 2 * i + 1);

			dp[i] = _mm_packs_pu16(res_hi, res_lo);
		}

		if (width & 7) {
			__m64 mask;

			mask = ((__m64 *) mlib_mask64_arr)[width & 7];

			CONV_3x3_1ch(hi, 2 * i);
			s0 = sp[i];
			CONV_3x3_1ch(lo, 2 * i + 1);
			res_hi = _mm_packs_pu16(res_hi, res_lo);

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, res_hi),
			    _mm_andnot_si64(mask, dp[i]));
		}

		buffT = buff1;
		buff1 = buff0;
		buff0 = buffT;

		sl += sll;
		dl += dll;
	}

	_mm_empty();

	if (pbuff != buff_loc)
		mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
mlib_status
mlib_ImageMinFilter3x3_S16(
    void *dst,
    void *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
#endif		   /* MAX_FILTER */
{
	mlib_u8 *buff, *buff1;
	mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *dl;
	__m64 *dp0, *dp1;
	__m64 aa, bb, c0, c1, c2, cc, d0, d1, d2, dd, r0, r1;
	__m64 e_mask;
	mlib_s32 i, j, wid8, tail;

	wid = (wid - 2) * SSIZE;
	wid8 = (wid + 7) & ~7;
	buff = mlib_malloc(2 * wid8);
	buff1 = buff + wid8;

	sl = (mlib_u8 *)src;
/* dst ptrs skip top j and left col */
	dl = (mlib_u8 *)dst + dlb + SSIZE;

	tail = wid & 7;
	e_mask = ((__m64 *) mlib_mask64_arr)[tail];

	sp0 = buff;
	sp1 = buff1;
	sp2 = sl;
	sp3 = sp2 + slb;
	sl += 2 * slb;

	for (i = 0; i < wid; i += 8) {
		c0 = *(__m64 *) sp2;
		c1 = *(__m64 *) (sp2 + SSIZE);
		c2 = *(__m64 *) (sp2 + 2 * SSIZE);
		d0 = *(__m64 *) sp3;
		d1 = *(__m64 *) (sp3 + SSIZE);
		d2 = *(__m64 *) (sp3 + 2 * SSIZE);

		cc = C_COMP(c0, c1);
		dd = C_COMP(d0, d1);
		cc = C_COMP(cc, c2);
		dd = C_COMP(dd, d2);

		*(__m64 *) sp0 = cc;
		*(__m64 *) sp1 = dd;

		sp0 += 8;
		sp1 += 8;
		sp2 += 8;
		sp3 += 8;
	}

	for (j = 0; j <= (hgt - 2 - 2); j += 2) {
		dp0 = (void *)dl;
		dp1 = (void *)(dl + dlb);
		sp0 = buff;
		sp1 = buff1;
		sp2 = sl;
		sp3 = sp2 + slb;

/*
 *    line0:     aa
 *    line1:     bb
 *    line2:  c0 c1 c2
 *    line3:  d0 d1 d2
 */

		for (i = 0; i <= wid - 8; i += 8) {
			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			c0 = *(__m64 *) sp2;
			c1 = *(__m64 *) (sp2 + SSIZE);
			c2 = *(__m64 *) (sp2 + 2 * SSIZE);
			d0 = *(__m64 *) sp3;
			d1 = *(__m64 *) (sp3 + SSIZE);
			d2 = *(__m64 *) (sp3 + 2 * SSIZE);

			cc = C_COMP(c0, c1);
			dd = C_COMP(d0, d1);
			cc = C_COMP(cc, c2);
			dd = C_COMP(dd, d2);

			bb = C_COMP(bb, cc);
			r0 = C_COMP(aa, bb);
			r1 = C_COMP(bb, dd);

			*(__m64 *) sp0 = cc;
			*(__m64 *) sp1 = dd;
			(*dp0++) = r0;
			(*dp1++) = r1;

			sp0 += 8;
			sp1 += 8;
			sp2 += 8;
			sp3 += 8;
		}

		if (tail) {
			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			c0 = *(__m64 *) sp2;
			c1 = *(__m64 *) (sp2 + SSIZE);
			c2 = *(__m64 *) (sp2 + 2 * SSIZE);
			d0 = *(__m64 *) sp3;
			d1 = *(__m64 *) (sp3 + SSIZE);
			d2 = *(__m64 *) (sp3 + 2 * SSIZE);

			cc = C_COMP(c0, c1);
			dd = C_COMP(d0, d1);
			cc = C_COMP(cc, c2);
			dd = C_COMP(dd, d2);

			bb = C_COMP(bb, cc);
			r0 = C_COMP(aa, bb);
			r1 = C_COMP(bb, dd);

			*(__m64 *) sp0 = cc;
			*(__m64 *) sp1 = dd;

			*dp0 =
			    _mm_or_si64(_mm_and_si64(e_mask, r0),
			    _mm_andnot_si64(e_mask, *dp0));
			*dp1 =
			    _mm_or_si64(_mm_and_si64(e_mask, r1),
			    _mm_andnot_si64(e_mask, *dp1));
		}

		sl += 2 * slb;
		dl += 2 * dlb;
	}

/* last line */

	if (j == (hgt - 3)) {
		dp0 = (void *)dl;
		dp1 = (void *)(dl + dlb);
		sp0 = buff;
		sp1 = buff1;
		sp2 = sl;

		for (i = 0; i <= wid - 8; i += 8) {
			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			c0 = *(__m64 *) sp2;
			c1 = *(__m64 *) (sp2 + SSIZE);
			c2 = *(__m64 *) (sp2 + 2 * SSIZE);

			cc = C_COMP(c0, c1);
			cc = C_COMP(cc, c2);

			r0 = C_COMP(aa, bb);
			r0 = C_COMP(r0, cc);

			(*dp0++) = r0;

			sp0 += 8;
			sp1 += 8;
			sp2 += 8;
		}

		if (tail) {
			aa = *(__m64 *) sp0;
			bb = *(__m64 *) sp1;
			c0 = *(__m64 *) sp2;
			c1 = *(__m64 *) (sp2 + SSIZE);
			c2 = *(__m64 *) (sp2 + 2 * SSIZE);

			c1 = C_COMP(c0, c1);
			cc = C_COMP(c1, c2);

			r0 = C_COMP(aa, bb);
			r0 = C_COMP(r0, cc);

			*dp0 =
			    _mm_or_si64(_mm_and_si64(e_mask, r0),
			    _mm_andnot_si64(e_mask, *dp0));
		}
	}

	_mm_empty();

	mlib_free(buff);

	return (MLIB_SUCCESS);
}