mlib_status
mlib_VectorConjSymExt_S32C_S32C_Sat_N(
	mlib_s32 *zz,
	const mlib_s32 *xx,
	mlib_s32 n,
	mlib_s32 n1)
{
	CHECK;

	const mlib_s32 *px = xx;
	mlib_s32 *pz = zz;
	mlib_s32 *pd = zz + n1 + n1;

	mlib_s32 i, ax, az, n2, n3, nstep, c, c0, c1;
	__m128i xbuf, zbuf, mask1, mask2, mask3;
	mask1 = _mm_setr_epi32(0, 0xffffffff, 0, 0xffffffff);
	mask2 = _mm_setr_epi32(0, 0x80000000, 0, 0x80000000);

	ax = (mlib_addr)px & 15;
	az = (mlib_addr)pz & 15;
	nstep = 16 / sizeof (mlib_s32);

	if (ax & 7) {
		for (i = 0; i < n / 2; i++) {
			CONJ_S32C(loadu, storeu);
		}
		for (i = 0; i < n % 2; i++) {
			CONJ(S32);
		}
	} else {
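		/*
		 * n1 (reusing the parameter) is the scalar head needed to
		 * reach 16-byte source alignment, n2 the count of full
		 * 4-element vectors, n3 the scalar tail; counts are in
		 * 32-bit elements (n is in complex samples, hence n + n).
		 */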
		n1 = ((16 - ax) & 15) / sizeof (mlib_s32);
		n2 = (n + n - n1) / nstep;
		n3 = n + n - n1 - n2 * nstep;

		if (n2 < 1) {
			for (i = 0; i < n; i++) {
				CONJ(S32);
			}
		} else {
			for (i = 0; i < n1 / 2; i++) {
				CONJ(S32);
			}

			/*
			 * Both branches of the original (ax == az) test issued
			 * the same unaligned CONJ_S32C(loadu, storeu) variant,
			 * so the test was redundant; a single loop is
			 * equivalent. (Aligned variants could be substituted
			 * when ax == az, provided the macro's stores through
			 * pd tolerate them.)
			 */
			for (i = 0; i < n2; i++) {
				CONJ_S32C(loadu, storeu);
			}
			for (i = 0; i < n3 / 2; i++) {
				CONJ(S32);
			}
		}
	}

	return (MLIB_SUCCESS);
}
Example #2
        template <bool align> void FillBgr(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red)
        {
            size_t size = width*3;
            size_t step = A*3;
            size_t alignedSize = AlignLo(width, A)*3;

            uint32_t bgrb = uint32_t(blue) | (uint32_t(green) << 8) | (uint32_t(red) << 16) | (uint32_t(blue) << 24);
            uint32_t grbg = uint32_t(green) | (uint32_t(red) << 8) | (uint32_t(blue) << 16) | (uint32_t(green) << 24);
            uint32_t rbgr = uint32_t(red) | (uint32_t(blue) << 8) | (uint32_t(green) << 16) | (uint32_t(red) << 24);

            __m128i bgrs[3];
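            // The three vectors together hold one 48-byte period of the
            // repeating B,G,R byte pattern (48 = lcm(3, 16)).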
            bgrs[0] = _mm_setr_epi32(bgrb, grbg, rbgr, bgrb);
            bgrs[1] = _mm_setr_epi32(grbg, rbgr, bgrb, grbg);
            bgrs[2] = _mm_setr_epi32(rbgr, bgrb, grbg, rbgr);
            for(size_t row = 0; row < height; ++row)
            {
                size_t offset = 0;
                for(; offset < alignedSize; offset += step)
                {
                    Store<align>((__m128i*)(dst + offset) + 0, bgrs[0]);
                    Store<align>((__m128i*)(dst + offset) + 1, bgrs[1]);
                    Store<align>((__m128i*)(dst + offset) + 2, bgrs[2]);
                }
                if(offset < size)
                {
                    offset = size - step;
                    Store<false>((__m128i*)(dst + offset) + 0, bgrs[0]);
                    Store<false>((__m128i*)(dst + offset) + 1, bgrs[1]);
                    Store<false>((__m128i*)(dst + offset) + 2, bgrs[2]);
                }
                dst += stride;
            }
        }
Example #3
mlib_status
__mlib_VectorSet_S32C(
	mlib_s32 *z,
	const mlib_s32 *c,
	mlib_s32 n)
{
	if (n < 1)
		return (MLIB_FAILURE);

	mlib_s32 i, nstep, n1, n2, n3;
	mlib_s32 c0, c1, *pdst = z;
	__m128i val;
	c0 = c[0];
	c1 = c[1];

	nstep = 16 / sizeof (mlib_s32);
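	/* n1 = scalar head to reach 16-byte alignment, n2 = full vectors,
	 * n3 = scalar tail; counts in 32-bit elements (n + n in total) */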
	n1 = ((16 - ((mlib_addr)z & 15)) & 15) / sizeof (mlib_s32);
	n2 = (n + n - n1) / nstep;
	n3 = n + n - n1 - n2 * nstep;

	if (n2 < 1) {
		for (i = 0; i < n; i++) {
			*pdst++ = c0;
			*pdst++ = c1;
		}
	} else {
		for (i = 0; i < n1 / 2; i++) {
			*pdst++ = c0;
			*pdst++ = c1;
		}
		if (n1 & 1) {
			*pdst++ = c0;
			val = _mm_setr_epi32(c1, c0, c1, c0);
		} else {
			val = _mm_setr_epi32(c0, c1, c0, c1);
		}

		for (i = 0; i < n2; i++) {
			_mm_store_si128((__m128i *)pdst, val);
			pdst += nstep;
		}

		if (n1 & 1) {
			*pdst++ = c1;
			n3--;
		}
		for (i = 0; i < n3 / 2; i++) {
			*pdst++ = c0;
			*pdst++ = c1;
		}
		if (n3 & 1) {
			*pdst++ = c0;
		}
	}

	return (MLIB_SUCCESS);
}
Example #4
static inline int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] )
{
  __m128i row1, row2, row3, row4;
  __m128i buf1, buf2, buf3, buf4;
#if defined(HAVE_SSE41)
  __m128i t0, t1;
#if !defined(HAVE_XOP)
  __m128i t2;
#endif
#endif
  __m128i ff0, ff1;
#if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
  const __m128i r8 = _mm_set_epi8( 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1 );
  const __m128i r16 = _mm_set_epi8( 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 );
#endif
#if defined(HAVE_SSE41)
  const __m128i m0 = LOADU( block +  00 );
  const __m128i m1 = LOADU( block +  16 );
  const __m128i m2 = LOADU( block +  32 );
  const __m128i m3 = LOADU( block +  48 );
#else
  const uint32_t  m0 = ( ( uint32_t * )block )[ 0];
  const uint32_t  m1 = ( ( uint32_t * )block )[ 1];
  const uint32_t  m2 = ( ( uint32_t * )block )[ 2];
  const uint32_t  m3 = ( ( uint32_t * )block )[ 3];
  const uint32_t  m4 = ( ( uint32_t * )block )[ 4];
  const uint32_t  m5 = ( ( uint32_t * )block )[ 5];
  const uint32_t  m6 = ( ( uint32_t * )block )[ 6];
  const uint32_t  m7 = ( ( uint32_t * )block )[ 7];
  const uint32_t  m8 = ( ( uint32_t * )block )[ 8];
  const uint32_t  m9 = ( ( uint32_t * )block )[ 9];
  const uint32_t m10 = ( ( uint32_t * )block )[10];
  const uint32_t m11 = ( ( uint32_t * )block )[11];
  const uint32_t m12 = ( ( uint32_t * )block )[12];
  const uint32_t m13 = ( ( uint32_t * )block )[13];
  const uint32_t m14 = ( ( uint32_t * )block )[14];
  const uint32_t m15 = ( ( uint32_t * )block )[15];
#endif
  row1 = ff0 = LOADU( &S->h[0] );
  row2 = ff1 = LOADU( &S->h[4] );
  row3 = _mm_setr_epi32( 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A );
  row4 = _mm_xor_si128( _mm_setr_epi32( 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 ), LOADU( &S->t[0] ) );
  ROUND( 0 );
  ROUND( 1 );
  ROUND( 2 );
  ROUND( 3 );
  ROUND( 4 );
  ROUND( 5 );
  ROUND( 6 );
  ROUND( 7 );
  ROUND( 8 );
  ROUND( 9 );
  STOREU( &S->h[0], _mm_xor_si128( ff0, _mm_xor_si128( row1, row3 ) ) );
  STOREU( &S->h[4], _mm_xor_si128( ff1, _mm_xor_si128( row2, row4 ) ) );
  return 0;
}
Example #5
static void
init_data (__m128i *s1, __m128i *d)
{
  int i;
  for (i = 0; i < NUM; i++)
    {
      s1[i] = _mm_setr_epi32 (0x16157e2b, 0xa6d2ae28,
			      0x8815f7ab, 0x3c4fcf09);
      d[i] = _mm_setr_epi32 (0x24b5e434, 0x3424b5e5,
			     0xeb848a01, 0x01eb848b);
    }
}
/**
 * Convert a chroma-keyed image to standard ARGB32.
 * SSE2-optimized version.
 *
 * This operates on the image in place; it does not return
 * an adjusted copy of the image.
 *
 * NOTE: The image *must* be ARGB32.
 *
 * @param key Chroma key color.
 * @return 0 on success; negative POSIX error code on error.
 */
int rp_image::apply_chroma_key_sse2(uint32_t key)
{
	RP_D(rp_image);
	rp_image_backend *const backend = d->backend;
	assert(backend->format == FORMAT_ARGB32);
	if (backend->format != FORMAT_ARGB32) {
		// ARGB32 only.
		return -EINVAL;
	}

	const unsigned int diff = (backend->stride - this->row_bytes()) / sizeof(uint32_t);
	uint32_t *img_buf = static_cast<uint32_t*>(backend->data());

	// SSE2 constants.
	const __m128i xmm_key = _mm_setr_epi32(key, key, key, key);
	const __m128i xmm_ones = _mm_setr_epi32(0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF);

	for (unsigned int y = static_cast<unsigned int>(backend->height); y > 0; y--) {
		// Process 4 pixels per iteration with SSE2.
		unsigned int x = static_cast<unsigned int>(backend->width);
		for (; x > 3; x -= 4, img_buf += 4) {
			__m128i *xmm_data = reinterpret_cast<__m128i*>(img_buf);

			// Compare the pixels to the chroma key.
			// Equal values will be 0xFFFFFFFF.
			// Non-equal values will be 0x00000000.
			__m128i res = _mm_cmpeq_epi32(*xmm_data, xmm_key);

			// Invert the results and mask the original data.
			// Original data will now have 00s for chroma-keyed pixels.
			*xmm_data = _mm_and_si128(_mm_xor_si128(res, xmm_ones), *xmm_data);
		}

		// Remaining pixels.
		for (; x > 0; x--, img_buf++) {
			if (*img_buf == key) {
				*img_buf = 0;
			}
		}

		// Next row.
		img_buf += diff;
	}

	// Adjust sBIT.
	// TODO: Only if transparent pixels were found.
	if (d->has_sBIT && d->sBIT.alpha == 0) {
		d->sBIT.alpha = 1;
	}

	// Chroma key applied.
	return 0;
}
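
/*
 * Usage sketch (hypothetical objects and key value; in ARGB32 the key
 * 0xFF00FF00 below is fully opaque green):
 *
 *   rp_image *img = ...;                          // FORMAT_ARGB32 image
 *   int ret = img->apply_chroma_key_sse2(0xFF00FF00);
 *   if (ret != 0) { ... }                         // -EINVAL if not ARGB32
 */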
Example #7
/* maxbit over |length| integers with provided initial value */
uint32_t simdmaxbitsd1_length(uint32_t initvalue, const uint32_t * in,
                uint32_t length) {
    __m128i newvec;
    __m128i oldvec;
    __m128i initoffset;
    __m128i accumulator;
    const __m128i *pin;
    uint32_t tmparray[4];
    uint32_t k = 1;
    uint32_t acc;

    assert(length > 0);

    pin = (const __m128i *)(in);
    initoffset = _mm_set1_epi32(initvalue);
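    /* short inputs: replicate the last element so the padded lanes
       contribute zero deltas to the accumulator */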
    switch (length) {
      case 1:
        newvec = _mm_set1_epi32(in[0]);
        break;
      case 2:
        newvec = _mm_setr_epi32(in[0], in[1], in[1], in[1]);
        break;
      case 3:
        newvec = _mm_setr_epi32(in[0], in[1], in[2], in[2]);
        break;
      default:
        newvec = _mm_loadu_si128(pin);
        break;
    }
    accumulator = Delta(newvec, initoffset);
    oldvec = newvec;

    /* process 4 integers and build an accumulator */
    while (k * 4 + 4 <= length) {
        newvec = _mm_loadu_si128(pin + k);
        accumulator = _mm_or_si128(accumulator, Delta(newvec, oldvec));
        oldvec = newvec;
        k++;
    }

    /* extract the accumulator as an integer */
    _mm_storeu_si128((__m128i *)(tmparray), accumulator);
    acc = tmparray[0] | tmparray[1] | tmparray[2] | tmparray[3];

    /* now process the remaining integers */
    for (k *= 4; k < length; k++)
        acc |= in[k] - (k == 0 ? initvalue : in[k - 1]);

    /* return the number of bits */
    return bits(acc);
}
void ColorModelView::paintEvent(QPaintEvent *)
{
    QPainter p(this);

    auto mainBounds = mainAreaBounds();
    auto sideBounds = sideAreaBounds();

    if (mainImage_.isNull()) {
        // FIXME: support other color model?
        QImage img(256, 256, QImage::Format_RGB32);
        auto *pixels = reinterpret_cast<quint32 *>(img.bits());
        auto basecolor = QColor::fromHsv(value_.hsvHue(), 255, 255);
        auto basecolorMM = _mm_setr_epi32(basecolor.blue(), basecolor.green(), basecolor.red(), 0);
        basecolorMM = _mm_add_epi32(basecolorMM, _mm_srli_epi32(basecolorMM, 7)); // map [0, 255] to [0, 256]
        auto white = _mm_set1_epi32(256 * 255);
        auto dX = _mm_sub_epi32(basecolorMM, _mm_set1_epi32(256));
        for (int y = 0; y < 256; ++y) {
            auto brightness = _mm_set1_epi32(256 - y - (y >> 7));
            auto col = white; // [0, 256 * 255]
            for (int x = 0; x < 256; ++x) {
                auto c = _mm_mullo_epi16(_mm_srli_epi32(col, 8), brightness);
                c = _mm_srli_epi16(c, 8); // [0, 255]
                c = _mm_packs_epi32(c, c);
                c = _mm_packus_epi16(c, c);

                _mm_store_ss(reinterpret_cast<float *>(&pixels[x + y * 256]),
                        _mm_castsi128_ps(c));

                col = _mm_add_epi32(col, dX);
            }
        }
        mainImage_ = QPixmap::fromImage(img);
    }
Example #9
int add_vector32_scalar(short *x, 
			int alpha, 
			short *y, 
			unsigned int N)
{
  unsigned int i;                 // loop counter

  __m128i *x_128;
  __m128i *y_128;
  __m128i alpha_128;

  x_128 = (__m128i *)&x[0];
  y_128 = (__m128i *)&y[0];

  alpha_128 = _mm_setr_epi32(alpha,0,alpha,0);

  // we add alpha to the real part of each 32-bit complex sample, 4 vectors per loop
  for(i=0;i<(N>>3);i++)
  {
    y_128[0] = _mm_add_epi32(alpha_128, x_128[0]);
    y_128[1] = _mm_add_epi32(alpha_128, x_128[1]);
    y_128[2] = _mm_add_epi32(alpha_128, x_128[2]);
    y_128[3] = _mm_add_epi32(alpha_128, x_128[3]);


    x_128+=4;
    y_128 +=4;

  }
  return (0);
}
static INLINE unsigned
build_mask_linear(int c, int dcdx, int dcdy)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   /* pack pairs of results into epi16
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);

   /* pack into epi8, preserving sign bits
    */
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* extract sign bits to create mask
    */
   return _mm_movemask_epi8(result);
}
Example #11
void INIT_Htable(uint8_t Htbl[16*8], uint8_t *H)
{
	int i;
	__m128i T, TMP0, TMP1, TMP2, TMP3, TMP4, POLY;
	POLY = _mm_setr_epi32(0x1,0,0,0xc2000000);
	T = _mm_loadu_si128(((__m128i*)H));
	TMP0 = T;
	_mm_storeu_si128(&((__m128i*)Htbl)[0], T);
	for (i=1; i<8; i++)
	{
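		/*
		 * Schoolbook 128x128-bit carry-less multiply: TMP1 = low
		 * halves (0x00), TMP4 = high halves (0x11), TMP2/TMP3 =
		 * the cross terms, giving a 256-bit product in TMP4:TMP1
		 * after folding the middle.
		 */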
		TMP1 = _mm_clmulepi64_si128(T, TMP0, 0x00);
		TMP4 = _mm_clmulepi64_si128(T, TMP0, 0x11);
		TMP2 = _mm_clmulepi64_si128(T, TMP0, 0x10);
		TMP3 = _mm_clmulepi64_si128(T, TMP0, 0x01);
		TMP2 = _mm_xor_si128(TMP2, TMP3);
		TMP3 = _mm_slli_si128(TMP2, 8);
		TMP2 = _mm_srli_si128(TMP2, 8);
		TMP1 = _mm_xor_si128(TMP3, TMP1);
		TMP4 = _mm_xor_si128(TMP4, TMP2);
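		/*
		 * Reduce the 256-bit product modulo the field polynomial
		 * (encoded in POLY): two folding steps, each a clmul by
		 * POLY plus a 64-bit halves swap (shuffle constant
		 * 78 = _MM_SHUFFLE(1,0,3,2)).
		 */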
		TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
		TMP3 = _mm_shuffle_epi32(TMP1, 78);
		TMP1 = _mm_xor_si128(TMP3, TMP2);
		TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
		TMP3 = _mm_shuffle_epi32(TMP1, 78);
		TMP1 = _mm_xor_si128(TMP3, TMP2);
		T = _mm_xor_si128(TMP4, TMP1);
		_mm_storeu_si128(&((__m128i*)Htbl)[i], T);
	}
}
Example #12
void Polyval_Horner(unsigned char* TAG,
					unsigned char* pH,
					unsigned char* inp,
					int length)
{
	__m128i TMP0, TMP1, TMP2, TMP3, TMP4, T, POLY, H;
	int i=0;
	if (length==0)
		return;
	int has_semi = length%16;
	uint8_t B[16]={0};
	length /=16;
	
	H = _mm_loadu_si128(((__m128i*)pH));
	T = _mm_loadu_si128(((__m128i*)TAG));
	POLY = _mm_setr_epi32(0x1,0,0,0xc2000000);
	for (i=0; i< length; i++)
	{
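		/* Horner step: T = (T ^ block) * H reduced mod the field
		 * polynomial, all with carry-less multiplies */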
		T = _mm_xor_si128(T, _mm_loadu_si128(&((__m128i*)inp)[i]));
		TMP1 = _mm_clmulepi64_si128(T, H, 0x00);
		TMP4 = _mm_clmulepi64_si128(T, H, 0x11);
		TMP2 = _mm_clmulepi64_si128(T, H, 0x10);
		TMP3 = _mm_clmulepi64_si128(T, H, 0x01);
		TMP2 = _mm_xor_si128(TMP2, TMP3);
		TMP3 = _mm_slli_si128(TMP2, 8);
		TMP2 = _mm_srli_si128(TMP2, 8);
		TMP1 = _mm_xor_si128(TMP3, TMP1);
		TMP4 = _mm_xor_si128(TMP4, TMP2);
		TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
		TMP3 = _mm_shuffle_epi32(TMP1, 78);
		TMP1 = _mm_xor_si128(TMP3, TMP2);
		TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
		TMP3 = _mm_shuffle_epi32(TMP1, 78);
		TMP1 = _mm_xor_si128(TMP3, TMP2);
		T = _mm_xor_si128(TMP4, TMP1);
	}
	if (has_semi!=0)
	{
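		/* final partial block: zero-padded into B, then one more
		 * Horner step identical to the loop body */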
		memcpy(B, inp+length*16, has_semi);
		T = _mm_xor_si128(T, _mm_loadu_si128((__m128i*)B));
		TMP1 = _mm_clmulepi64_si128(T, H, 0x00);
		TMP4 = _mm_clmulepi64_si128(T, H, 0x11);
		TMP2 = _mm_clmulepi64_si128(T, H, 0x10);
		TMP3 = _mm_clmulepi64_si128(T, H, 0x01);
		TMP2 = _mm_xor_si128(TMP2, TMP3);
		TMP3 = _mm_slli_si128(TMP2, 8);
		TMP2 = _mm_srli_si128(TMP2, 8);
		TMP1 = _mm_xor_si128(TMP3, TMP1);
		TMP4 = _mm_xor_si128(TMP4, TMP2);
		TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
		TMP3 = _mm_shuffle_epi32(TMP1, 78);
		TMP1 = _mm_xor_si128(TMP3, TMP2);
		TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
		TMP3 = _mm_shuffle_epi32(TMP1, 78);
		TMP1 = _mm_xor_si128(TMP3, TMP2);
		T = _mm_xor_si128(TMP4, TMP1);
	}
	_mm_storeu_si128(((__m128i*)TAG), T);
}
Example #13
static void blake2s_init_sse()
{
  // We cannot initialize these 128-bit variables in place at global scope,
  // because global initialization runs before our SSE check and would make
  // the code incompatible with older non-SSE2 CPUs. Nor can we initialize
  // them as function-local statics, because static initialization of SSE
  // data is not thread safe: the first thread sets the "init done" flag
  // before initialization has finished, so a second thread may access
  // half-initialized SSE data. So we moved the init code here.

  blake2s_IV_0_3 = _mm_setr_epi32( 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A );
  blake2s_IV_4_7 = _mm_setr_epi32( 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 );

#ifdef _WIN_64
  crotr8 = _mm_set_epi8( 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1 );
  crotr16 = _mm_set_epi8( 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 );
#endif
}
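
/*
 * Minimal calling sketch for the pattern described above (GCC/Clang path;
 * MSVC would use __cpuid() from <intrin.h>). Hypothetical names: has_sse2()
 * and crypto_startup() are illustrative, not part of this codebase; only
 * blake2s_init_sse() is real. The point is that the call happens once,
 * after CPU detection, before any worker threads exist.
 */
#include <cpuid.h>

static bool has_sse2()
{
  unsigned int a, b, c, d;
  if (!__get_cpuid(1, &a, &b, &c, &d))
    return false;
  return (d >> 26) & 1;          // CPUID.1:EDX bit 26 = SSE2
}

static void crypto_startup()     // call once, before starting threads
{
  if (has_sse2())
    blake2s_init_sse();          // after the check, not at static-init time
}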
Example #14
/*
 * mixed endian increment, low 64bits stored in hi word to be compatible
 * with _icm's BSWAP.
 */
static inline __m128i
nextc(__m128i x)
{
	const __m128i ONE = _mm_setr_epi32(0, 0, 1, 0);
	const __m128i ZERO = _mm_setzero_si128();

	x = _mm_add_epi64(x, ONE);
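	/* detect wrap: if the incremented lane became zero, its all-ones
	 * flag is moved to the other lane and subtracted, adding the carry */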
	__m128i t = _mm_cmpeq_epi64(x, ZERO);
	t = _mm_unpackhi_epi64(t, ZERO);
	x = _mm_sub_epi64(x, t);

	return x;
}
static INLINE void
build_masks(int c, 
	    int cdiff,
	    int dcdx,
	    int dcdy,
	    unsigned *outmask,
	    unsigned *partmask)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   {
      __m128i cstep01, cstep23, result;

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *outmask |= _mm_movemask_epi8(result);
   }


   {
      __m128i cio4 = _mm_set1_epi32(cdiff);
      __m128i cstep01, cstep23, result;

      cstep0 = _mm_add_epi32(cstep0, cio4);
      cstep1 = _mm_add_epi32(cstep1, cio4);
      cstep2 = _mm_add_epi32(cstep2, cio4);
      cstep3 = _mm_add_epi32(cstep3, cio4);

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *partmask |= _mm_movemask_epi8(result);
   }
}
Example #16
void calculate_fma_double (unsigned char * out, double X0, double Y0, double scale, unsigned YSTART, unsigned SX, unsigned SY)
{
    __m256d dd = _mm256_set1_pd (scale);
    __m256d XX0 = _mm256_set1_pd (X0);

    for (unsigned j = YSTART; j < SY; j++)	{
        __m256d y0 = _mm256_set1_pd (j*scale + Y0);
        for (unsigned i = 0; i < SX; i += 4)	{

            __m128i ind = _mm_setr_epi32 (i, i + 1, i + 2, i + 3);
            __m256d x0 = _mm256_fmadd_pd (dd, _mm256_cvtepi32_pd (ind), XX0);
            __m256d x = x0;
            __m256d y = y0;
            __m256i counts = _mm256_setzero_si256 ();
            __m256i cmp_mask = _mm256_set1_epi32 (0xFFFFFFFFu);

            for (unsigned n = 0; n < 255; n++)	{
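                // Escape-time step: a lane stays active while |z|^2 < 4
                // (predicate 1 = _CMP_LT_OS); subtracting the all-ones
                // mask increments the still-active lanes' counters.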
                __m256d x2 = _mm256_mul_pd (x, x);
                __m256d y2 = _mm256_mul_pd (y, y);
                __m256d abs = _mm256_add_pd (x2, y2);
                __m256i cmp = _mm256_castpd_si256 (_mm256_cmp_pd (abs, _mm256_set1_pd (4), 1));
                cmp_mask = _mm256_and_si256 (cmp_mask, cmp);
                if (_mm256_testz_si256 (cmp_mask, cmp_mask)) {
                    break;
                }
                counts = _mm256_sub_epi64 (counts, cmp_mask);
                __m256d t = _mm256_add_pd (x, x);
                y = _mm256_fmadd_pd (t, y, y0);
                x = _mm256_add_pd (_mm256_sub_pd (x2, y2), x0);
            }
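            // Gather the low byte of each 64-bit counter: within each
            // 128-bit lane, bytes 0 and 8 hold the two counts, giving the
            // 4 output bytes assembled below.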
            __m256i result = _mm256_shuffle_epi8 (counts, _mm256_setr_epi8 (0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8));
            *(uint32_t*) out = _mm_extract_epi16 (_mm256_extracti128_si256 (result, 0), 0) | (_mm_extract_epi16 (_mm256_extracti128_si256 (result, 1), 0) << 16);
            out += 4;
        }
    }
}
mlib_status
mlib_VideoColorJFIFYCC2RGB444_S16_naligned(
	mlib_s16 *rgb,
	const mlib_s16 *y,
	const mlib_s16 *cb,
	const mlib_s16 *cr,
	mlib_s32 n)
{
	/* 0 & 1.402*16384 */
	const __m128i x_c1 = _mm_setr_epi16(0, 22970, 0, 22970,
		0, 22970, 0, 22970);

	/* -0.34414*16384 & -0.71414*16384 */
	const __m128i x_c2 = _mm_setr_epi16(-5638, -11700, -5638, -11700,
		-5638, -11700, -5638, -11700);

	/* 1.772*16384 & 0 */
	const __m128i x_c3 = _mm_setr_epi16(29032, 0, 29032, 0,
		29032, 0, 29032, 0);

	const __m128i x_coff = _mm_set1_epi16(2048);
	const __m128i x_cps1 = _mm_set1_epi32(0x8000);
	const __m128i x_cps2 = _mm_set1_epi16(0x8000);
	const __m128i x_zero = _mm_setzero_si128();
	const __m128i x_mask1 = _mm_setr_epi32(0xffffffff, 0xffff, 0, 0);
	const __m128i x_mask2 = _mm_setr_epi32(0, 0xffff0000, 0xffffffff, 0);

	/* __m128i variables */
	__m128i x_y, x_cb, x_cr, x_r, x_g, x_b, x_y1, x_y2;
	__m128i x_r1, x_r2, x_g1, x_g2, x_b1, x_b2, x_t1, x_t2;
	__m128i x_rgbl, x_rgbh, x_rgl, x_rgh, x_bbl, x_bbh;
	__m128i x_cbcr1, x_cbcr2;

	/* pointers */
	__m128i *px_y, *px_cb, *px_cr;
	mlib_s16 *prgb;

	/* other var */
	mlib_d64 fr, fg, fb, fy, fcb, fcr;
	mlib_s32 i;

	px_y = (__m128i *)y;
	px_cb = (__m128i *)cb;
	px_cr = (__m128i *)cr;
	prgb = rgb;
	i = 0;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
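	/*
	 * The main loop stops 16 short of n: PACK_RGB1 appears to store a
	 * full 16-byte vector of which only 12 bytes (2 pixels x 3 channels
	 * x 16 bits) are packed RGB, so store headroom is required; the
	 * tail paths below end with PACK_RGB2, which stores exactly.
	 */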
	for (; i <= n - 16; i += 8)	{
		x_y = _mm_loadu_si128(px_y);
		x_y1 = _mm_unpacklo_epi16(x_y, x_zero);
		x_y1 = _mm_slli_epi32(x_y1, 4);
		x_y2 = _mm_unpackhi_epi16(x_y, x_zero);
		x_y2 = _mm_slli_epi32(x_y2, 4);
		px_y++;
		x_cb = _mm_loadu_si128(px_cb);
		x_cb = _mm_sub_epi16(x_cb, x_coff);
		px_cb++;
		x_cr = _mm_loadu_si128(px_cr);
		x_cr = _mm_sub_epi16(x_cr, x_coff);
		px_cr++;
		x_cbcr1 = _mm_unpacklo_epi16(x_cb, x_cr);
		x_cbcr2 = _mm_unpackhi_epi16(x_cb, x_cr);

		/* calc r/g/b */
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c1);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_r1 = _mm_add_epi32(x_t1, x_y1);
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c2);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_g1 = _mm_add_epi32(x_t1, x_y1);
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c3);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_b1 = _mm_add_epi32(x_t1, x_y1);

		x_t2 = _mm_madd_epi16(x_cbcr2, x_c1);
		x_t2 = _mm_srai_epi32(x_t2, 10);
		x_r2 = _mm_add_epi32(x_t2, x_y2);
		x_t2 = _mm_madd_epi16(x_cbcr2, x_c2);
		x_t2 = _mm_srai_epi32(x_t2, 10);
		x_g2 = _mm_add_epi32(x_t2, x_y2);
		x_t2 = _mm_madd_epi16(x_cbcr2, x_c3);
		x_t2 = _mm_srai_epi32(x_t2, 10);
		x_b2 = _mm_add_epi32(x_t2, x_y2);

		/* signed pack & shift */
		x_r1 = _mm_sub_epi32(x_r1, x_cps1);
		x_r2 = _mm_sub_epi32(x_r2, x_cps1);
		x_r = _mm_packs_epi32(x_r1, x_r2);
		x_r = _mm_add_epi16(x_r, x_cps2);
		x_r = _mm_srli_epi16(x_r, 4);

		x_g1 = _mm_sub_epi32(x_g1, x_cps1);
		x_g2 = _mm_sub_epi32(x_g2, x_cps1);
		x_g = _mm_packs_epi32(x_g1, x_g2);
		x_g = _mm_add_epi16(x_g, x_cps2);
		x_g = _mm_srli_epi16(x_g, 4);

		x_b1 = _mm_sub_epi32(x_b1, x_cps1);
		x_b2 = _mm_sub_epi32(x_b2, x_cps1);
		x_b = _mm_packs_epi32(x_b1, x_b2);
		x_b = _mm_add_epi16(x_b, x_cps2);
		x_b = _mm_srli_epi16(x_b, 4);

		/* create rgb sequences */
		x_rgl = _mm_unpacklo_epi16(x_r, x_g);
		x_rgh = _mm_unpackhi_epi16(x_r, x_g);
		x_bbl = _mm_unpacklo_epi16(x_b, x_b);
		x_bbh = _mm_unpackhi_epi16(x_b, x_b);

		/* save */
		x_rgbl = _mm_unpacklo_epi32(x_rgl, x_bbl);
		PACK_RGB1(x_rgbl);

		x_rgbh = _mm_unpackhi_epi32(x_rgl, x_bbl);
		PACK_RGB1(x_rgbh);

		x_rgbl = _mm_unpacklo_epi32(x_rgh, x_bbh);
		PACK_RGB1(x_rgbl);

		x_rgbh = _mm_unpackhi_epi32(x_rgh, x_bbh);
		PACK_RGB1(x_rgbh);
	}

	if (i <= (n - 8)) {
		x_y = _mm_loadu_si128(px_y);
		x_y1 = _mm_unpacklo_epi16(x_y, x_zero);
		x_y1 = _mm_slli_epi32(x_y1, 4);
		x_y2 = _mm_unpackhi_epi16(x_y, x_zero);
		x_y2 = _mm_slli_epi32(x_y2, 4);
		px_y++;
		x_cb = _mm_loadu_si128(px_cb);
		x_cb = _mm_sub_epi16(x_cb, x_coff);
		px_cb++;
		x_cr = _mm_loadu_si128(px_cr);
		x_cr = _mm_sub_epi16(x_cr, x_coff);
		px_cr++;
		x_cbcr1 = _mm_unpacklo_epi16(x_cb, x_cr);
		x_cbcr2 = _mm_unpackhi_epi16(x_cb, x_cr);

		/* calc r/g/b */
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c1);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_r1 = _mm_add_epi32(x_t1, x_y1);
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c2);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_g1 = _mm_add_epi32(x_t1, x_y1);
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c3);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_b1 = _mm_add_epi32(x_t1, x_y1);

		x_t2 = _mm_madd_epi16(x_cbcr2, x_c1);
		x_t2 = _mm_srai_epi32(x_t2, 10);
		x_r2 = _mm_add_epi32(x_t2, x_y2);
		x_t2 = _mm_madd_epi16(x_cbcr2, x_c2);
		x_t2 = _mm_srai_epi32(x_t2, 10);
		x_g2 = _mm_add_epi32(x_t2, x_y2);
		x_t2 = _mm_madd_epi16(x_cbcr2, x_c3);
		x_t2 = _mm_srai_epi32(x_t2, 10);
		x_b2 = _mm_add_epi32(x_t2, x_y2);

		/* signed pack & shift */
		x_r1 = _mm_sub_epi32(x_r1, x_cps1);
		x_r2 = _mm_sub_epi32(x_r2, x_cps1);
		x_r = _mm_packs_epi32(x_r1, x_r2);
		x_r = _mm_add_epi16(x_r, x_cps2);
		x_r = _mm_srli_epi16(x_r, 4);

		x_g1 = _mm_sub_epi32(x_g1, x_cps1);
		x_g2 = _mm_sub_epi32(x_g2, x_cps1);
		x_g = _mm_packs_epi32(x_g1, x_g2);
		x_g = _mm_add_epi16(x_g, x_cps2);
		x_g = _mm_srli_epi16(x_g, 4);

		x_b1 = _mm_sub_epi32(x_b1, x_cps1);
		x_b2 = _mm_sub_epi32(x_b2, x_cps1);
		x_b = _mm_packs_epi32(x_b1, x_b2);
		x_b = _mm_add_epi16(x_b, x_cps2);
		x_b = _mm_srli_epi16(x_b, 4);

		/* create rgb sequences */
		x_rgl = _mm_unpacklo_epi16(x_r, x_g);
		x_rgh = _mm_unpackhi_epi16(x_r, x_g);
		x_bbl = _mm_unpacklo_epi16(x_b, x_b);
		x_bbh = _mm_unpackhi_epi16(x_b, x_b);

		/* save */
		x_rgbl = _mm_unpacklo_epi32(x_rgl, x_bbl);
		PACK_RGB1(x_rgbl);

		x_rgbh = _mm_unpackhi_epi32(x_rgl, x_bbl);
		PACK_RGB1(x_rgbh);

		x_rgbl = _mm_unpacklo_epi32(x_rgh, x_bbh);
		PACK_RGB1(x_rgbl);

		x_rgbh = _mm_unpackhi_epi32(x_rgh, x_bbh);
		PACK_RGB2(x_rgbh);

		i += 8;
	}

	if (i <= (n - 4)) {
		x_y = _mm_loadl_epi64(px_y);
		x_y1 = _mm_unpacklo_epi16(x_y, x_zero);
		x_y1 = _mm_slli_epi32(x_y1, 4);
		px_y = (__m128i *)(((__m64 *)px_y) + 1);
		x_cb = _mm_loadl_epi64(px_cb);
		x_cb = _mm_sub_epi16(x_cb, x_coff);
		px_cb = (__m128i *)(((__m64 *)px_cb) + 1);
		x_cr = _mm_loadl_epi64(px_cr);
		x_cr = _mm_sub_epi16(x_cr, x_coff);
		px_cr = (__m128i *)(((__m64 *)px_cr) + 1);
		x_cbcr1 = _mm_unpacklo_epi16(x_cb, x_cr);

		/* calc r/g/b */
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c1);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_r1 = _mm_add_epi32(x_t1, x_y1);
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c2);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_g1 = _mm_add_epi32(x_t1, x_y1);
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c3);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_b1 = _mm_add_epi32(x_t1, x_y1);

		/* signed pack & shift */
		x_r1 = _mm_sub_epi32(x_r1, x_cps1);
		x_r = _mm_packs_epi32(x_r1, x_zero);
		x_r = _mm_add_epi16(x_r, x_cps2);
		x_r = _mm_srli_epi16(x_r, 4);

		x_g1 = _mm_sub_epi32(x_g1, x_cps1);
		x_g = _mm_packs_epi32(x_g1, x_zero);
		x_g = _mm_add_epi16(x_g, x_cps2);
		x_g = _mm_srli_epi16(x_g, 4);

		x_b1 = _mm_sub_epi32(x_b1, x_cps1);
		x_b = _mm_packs_epi32(x_b1, x_zero);
		x_b = _mm_add_epi16(x_b, x_cps2);
		x_b = _mm_srli_epi16(x_b, 4);

		/* create rgb sequences */
		x_rgl = _mm_unpacklo_epi16(x_r, x_g);
		x_bbl = _mm_unpacklo_epi16(x_b, x_b);

		/* save */
		x_rgbl = _mm_unpacklo_epi32(x_rgl, x_bbl);
		PACK_RGB1(x_rgbl);

		x_rgbh = _mm_unpackhi_epi32(x_rgl, x_bbl);
		PACK_RGB2(x_rgbh);

		i += 4;
	}

	/* pure C implementation */
	for (; i < n; i++) {
		fy = y[i] * SCALE - SAT;
		fcb = (mlib_d64)((cb[i] - 2048) << 20);
		fcr = (mlib_d64)((cr[i] - 2048) << 20);
		fr = fy + 1.40200f * fcr;
		fg = fy - 0.34414f * fcb - 0.71414f * fcr;
		fb = fy + 1.77200f * fcb;
		rgb[3 * i] = CLAMP_U12(fr);
		rgb[3 * i + 1] = CLAMP_U12(fg);
		rgb[3 * i + 2] = CLAMP_U12(fb);
	}

	return (MLIB_SUCCESS);
}
Example #18
void AVX2FMA3DNoise(Vector3d& result, const Vector3d& EPoint)
{

#if CHECK_FUNCTIONAL
    Vector3d param(EPoint);
#endif

    AVX2TABLETYPE *mp;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_DNoise]++;

    const __m256d ONE_PD = _mm256_set1_pd(1.0);
    const __m128i short_si128 = _mm_set1_epi32(0xffff);

    const __m256d xyzn = _mm256_setr_pd(EPoint[X], EPoint[Y], EPoint[Z], 0);
    const __m256d epsy = _mm256_set1_pd(1.0 - EPSILON);
    const __m256d xyzn_e = _mm256_sub_pd(xyzn, epsy);
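    // _mm256_blendv_pd selects on the sign bit: negative coordinates get
    // (1 - EPSILON) subtracted before truncation, so cvttpd effectively
    // rounds toward -infinity (a vectorized floor).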
    const __m128i tmp_xyzn = _mm256_cvttpd_epi32(_mm256_blendv_pd(xyzn, xyzn_e, xyzn));

    const __m128i noise_min_xyzn = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, NOISE_MINZ, 0);

    const __m256d xyz_ixyzn = _mm256_sub_pd(xyzn, _mm256_cvtepi32_pd(tmp_xyzn));
    const __m256d xyz_jxyzn = _mm256_sub_pd(xyz_ixyzn, ONE_PD);

    const __m128i i_xyzn = _mm_and_si128(_mm_sub_epi32(tmp_xyzn, noise_min_xyzn),
        _mm_set1_epi32(0xfff));

    const __m256d s_xyzn = _mm256_mul_pd(xyz_ixyzn,
        _mm256_mul_pd(xyz_ixyzn,
            _mm256_sub_pd(_mm256_set1_pd(3.0),
                _mm256_add_pd(xyz_ixyzn, xyz_ixyzn))));

    const __m256d t_xyzn = _mm256_sub_pd(ONE_PD, s_xyzn);

    const __m256d txtysxsy = _mm256_permute2f128_pd(t_xyzn, s_xyzn, 0x20);
    const __m256d txsxtxsx = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(2, 0, 2, 0));
    const __m256d tytysysy = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(3, 3, 1, 1));

    const __m256d txtysxtytxsysxsy = _mm256_mul_pd(txsxtxsx, tytysysy);

    const __m256d incrsump_s1 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(t_xyzn, _MM_SHUFFLE(2, 2, 2, 2)));
    const __m256d incrsump_s2 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(s_xyzn, _MM_SHUFFLE(2, 2, 2, 2)));

    int ints[4];
    _mm_storeu_si128((__m128i*)(ints), i_xyzn);

    const int ixiy_hash = Hash2d(ints[0], ints[1]);
    const int jxiy_hash = Hash2d(ints[0] + 1, ints[1]);
    const int ixjy_hash = Hash2d(ints[0], ints[1] + 1);
    const int jxjy_hash = Hash2d(ints[0] + 1, ints[1] + 1);

    const int iz = ints[2];

    const __m256d iii = _mm256_blend_pd(PERMUTE4x64(xyz_ixyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1);
    const __m256d jjj = _mm256_blend_pd(PERMUTE4x64(xyz_jxyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1);

    __m256d ss;
    __m256d blend;

    __m256d x = _mm256_setzero_pd(), y = _mm256_setzero_pd(), z = _mm256_setzero_pd();


    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(0, 0, 0, 0));
    //     blend = _mm256_blend_pd(iii, jjj, 0);

    INCSUMAVX_VECTOR(mp, ss, iii);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(1, 1, 1, 1));
    blend = _mm256_blend_pd(iii, jjj, 2);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(3, 3, 3, 3));
    blend = _mm256_blend_pd(iii, jjj, 6);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(2, 2, 2, 2));
    blend = _mm256_blend_pd(iii, jjj, 4);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(2, 2, 2, 2));
    blend = _mm256_blend_pd(iii, jjj, 12);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(3, 3, 3, 3));
    //     blend = _mm256_blend_pd(iii, jjj, 14);

    INCSUMAVX_VECTOR(mp, ss, jjj);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(1, 1, 1, 1));
    blend = _mm256_blend_pd(iii, jjj, 10);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(0, 0, 0, 0));
    blend = _mm256_blend_pd(iii, jjj, 8);

    INCSUMAVX_VECTOR(mp, ss, blend);


    __m256d xy = _mm256_hadd_pd(x,y);
    __m128d xy_up = _mm256_extractf128_pd(xy,1);
    xy_up = _mm_add_pd(_mm256_castpd256_pd128(xy),xy_up);
    _mm_storeu_pd(&result[X],xy_up);

    __m128d z_up = _mm256_extractf128_pd(z,1);
    z_up = _mm_add_pd(_mm256_castpd256_pd128(z),z_up);
    z_up = _mm_hadd_pd(z_up,z_up);
    result[Z] = _mm_cvtsd_f64(z_up);


#if CHECK_FUNCTIONAL
    {
        Vector3d portable_res;
        PortableDNoise(portable_res , param);
        if (fabs(portable_res[X] - result[X]) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("DNoise X error");
        }
        if (fabs(portable_res[Y] - result[Y]) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("DNoise Y error");
        }
        if (fabs(portable_res[Z] - result[Z]) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("DNoise Z error");
        }

    }

#endif



    _mm256_zeroupper();
    return;

}
Example #19
void AVXFMA4DNoise(Vector3d& result, const Vector3d& EPoint)
{
    DBL x, y, z;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_DNoise]++;

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /*ix = (int)x; iy = (int)y; iz = (int)z;
    x_ix = x - ix; y_iy = y - iy; z_iz = z - iz;*/
                /* JB fix for the range problem */

    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    DBL* mp1 = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL* mp2 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL* mp3 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL* mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL* mp5 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL* mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];
    DBL* mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL* mp8 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two = _mm_set1_pd(2.0);
    const __m128d one = _mm_set1_pd(1.0);

    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);

    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    // Hermite smoothstep weights: s = z*z*(3 - 2*z); _mm_nmacc_pd(a, b, c)
    // computes -(a*b) + c, here 3 - 2*z.
    __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));

    __m128d mm_tz = _mm_sub_pd(one, mm_sz);

    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));

    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);
    __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);

    __m128d mm_txty_txsy_tz = _mm_mul_pd(mm_txty_txsy, mm_tz);
    __m128d mm_txty_txsy_sz = _mm_mul_pd(mm_txty_txsy, mm_sz);
    __m128d mm_sxty_sxsy_tz = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    __m128d mm_sxty_sxsy_sz = _mm_mul_pd(mm_sxty_sxsy, mm_sz);

    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p;
    __m128d sum_X_Y = _mm_setzero_pd();
    __m128d sum__Z = _mm_setzero_pd();

    __m128d mm_s1 = _mm_unpacklo_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp1, mp1 + 8, mm_s1, ix_mm, iy_mm, iz_mm, sum_X_Y);

    __m128d mm_s2 = _mm_unpacklo_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp2, mp2 + 8, mm_s2, jx_mm, iy_mm, iz_mm, sum_X_Y);

    __m128d mm_s3 = _mm_unpackhi_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp3, mp3 + 8, mm_s3, jx_mm, jy_mm, iz_mm, sum_X_Y);

    __m128d mm_s4 = _mm_unpackhi_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp4, mp4 + 8, mm_s4, ix_mm, jy_mm, iz_mm, sum_X_Y);

    __m128d mm_s5 = _mm_unpackhi_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp5, mp5 + 8, mm_s5, ix_mm, jy_mm, jz_mm, sum_X_Y);

    __m128d mm_s6 = _mm_unpackhi_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp6, mp6 + 8, mm_s6, jx_mm, jy_mm, jz_mm, sum_X_Y);

    __m128d mm_s7 = _mm_unpacklo_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp7, mp7 + 8, mm_s7, jx_mm, iy_mm, jz_mm, sum_X_Y);

    __m128d mm_s8 = _mm_unpacklo_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp8, mp8 + 8, mm_s8, ix_mm, iy_mm, jz_mm, sum_X_Y);

    __m128d iy_jy = _mm_unpacklo_pd(iy_mm, jy_mm);
    INCRSUMP2(mp1 + 16, mp4 + 16, mm_txty_txsy_tz, ix_mm, iy_jy, iz_mm, sum__Z);
    INCRSUMP2(mp8 + 16, mp5 + 16, mm_txty_txsy_sz, ix_mm, iy_jy, jz_mm, sum__Z);
    INCRSUMP2(mp2 + 16, mp3 + 16, mm_sxty_sxsy_tz, jx_mm, iy_jy, iz_mm, sum__Z);
    INCRSUMP2(mp7 + 16, mp6 + 16, mm_sxty_sxsy_sz, jx_mm, iy_jy, jz_mm, sum__Z);

    sum__Z = _mm_hadd_pd(sum__Z, sum__Z);

    _mm_storeu_pd(&result[X], sum_X_Y);
    _mm_store_sd(&result[Z], sum__Z);
}
Example #20
DBL AVXFMA4Noise(const Vector3d& EPoint, int noise_generator)
{
    DBL x, y, z;
    DBL *mp;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;
    DBL sum;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_Noise]++;

    if (noise_generator==kNoiseGen_Perlin)
    {
        // The 1.59 and 0.985 are to correct for some biasing problems with
        // the random # generator used to create the noise tables.  Final
        // range of values is about 5.0e-4 below 0.0 and above 1.0.  Mean
        // value is 0.49 (ideally it would be 0.5).
        sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985);

        // Clamp final value to 0-1 range
        if (sum < 0.0) sum = 0.0;
        if (sum > 1.0) sum = 1.0;

        return sum;
    }

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /* ix = (int)x; iy = (int)y; iz = (long)z; */
    /* JB fix for the range problem */

    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    mp = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL *mp2 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL *mp3 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];
    DBL *mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL *mp5 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL *mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL *mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL *mp8 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two = _mm_set1_pd(2.0);
    const __m128d one = _mm_set1_pd(1.0);

    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);

    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));
    __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));

    __m128d mm_tz = _mm_sub_pd(one, mm_sz);
    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);
    __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);

    __m128d y_mm = _mm_unpacklo_pd(iy_mm, jy_mm);

    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p, s_mm;
    __m128d int_sum1 = _mm_setzero_pd();

    s_mm = _mm_mul_pd(mm_txty_txsy, mm_tz);
    INCRSUMP2(mp, mp2, s_mm, ix_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_txty_txsy, mm_sz);
    INCRSUMP2(mp3, mp4, s_mm, ix_mm, y_mm, jz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    INCRSUMP2(mp5, mp6, s_mm, jx_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_sz);
    INCRSUMP2(mp7, mp8, s_mm, jx_mm, y_mm, jz_mm, int_sum1);

    int_sum1 = _mm_hadd_pd(int_sum1, int_sum1);

    if(noise_generator==kNoiseGen_RangeCorrected)
    {
        /* details of range here:
        Min, max: -1.05242, 0.988997
        Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828

        We want to remap it as close to [0,1] as possible, i.e.
        sum' = (sum - min) / (max - min) = sum * r2 + r1r2, with
        r2 = 1 / (0.988997 + 1.05242) = 0.48985582 and r1r2 = 1.05242 * r2.
        */
        const __m128d r2 = _mm_set_sd(0.48985582);
        const __m128d r1r2 = _mm_set_sd(1.05242*0.48985582);
        int_sum1 = _mm_macc_sd(int_sum1, r2, r1r2);
    }
    else
    {
        int_sum1 = _mm_add_sd(int_sum1, _mm_set_sd(0.5));
    }

    int_sum1 = _mm_min_sd(one, int_sum1);
    int_sum1 = _mm_max_sd(_mm_setzero_pd(), int_sum1);
    _mm_store_sd(&sum, int_sum1);

    return (sum);
}
Example #21
void Float32ToNativeInt16_X86( const Float32 *src, SInt16 *dst, unsigned int numToConvert )
{
	const float *src0 = src;
	int16_t *dst0 = dst;
	unsigned int count = numToConvert;
	
	if (count >= 8) {
		// vector -- requires 8+ samples
		ROUNDMODE_NEG_INF
		const __m128 vround = (const __m128) { 0.5f, 0.5f, 0.5f, 0.5f };
		const __m128 vmin = (const __m128) { -32768.0f, -32768.0f, -32768.0f, -32768.0f };
		const __m128 vmax = (const __m128) { 32767.0f, 32767.0f, 32767.0f, 32767.0f  };
		const __m128 vscale = (const __m128) { 32768.0f, 32768.0f, 32768.0f, 32768.0f  };
		__m128 vf0, vf1;
		__m128i vi0, vi1, vpack0;
	
#define F32TOLE16 \
		vf0 = _mm_mul_ps(vf0, vscale);			\
		vf1 = _mm_mul_ps(vf1, vscale);			\
		vf0 = _mm_add_ps(vf0, vround);			\
		vf1 = _mm_add_ps(vf1, vround);			\
		vf0 = _mm_max_ps(vf0, vmin);			\
		vf1 = _mm_max_ps(vf1, vmin);			\
		vf0 = _mm_min_ps(vf0, vmax);			\
		vf1 = _mm_min_ps(vf1, vmax);			\
		vi0 = _mm_cvtps_epi32(vf0);			\
		vi1 = _mm_cvtps_epi32(vf1);			\
		vpack0 = _mm_packs_epi32(vi0, vi1);

		int falign = (uintptr_t)src & 0xF;
		int ialign = (uintptr_t)dst & 0xF;
	
		if (falign != 0 || ialign != 0) {
			// do one unaligned conversion
			vf0 = _mm_loadu_ps(src);
			vf1 = _mm_loadu_ps(src+4);
			F32TOLE16
			_mm_storeu_si128((__m128i *)dst, vpack0);
			
			// advance such that the destination ints are aligned
			unsigned int n = (16 - ialign) / 2;
			src += n;
			dst += n;
			count -= n;

			falign = (uintptr_t)src & 0xF;
			if (falign != 0) {
				// unaligned loads, aligned stores
				while (count >= 8) {
					vf0 = _mm_loadu_ps(src);
					vf1 = _mm_loadu_ps(src+4);
					F32TOLE16
					_mm_store_si128((__m128i *)dst, vpack0);
					src += 8;
					dst += 8;
					count -= 8;
				}
				goto VectorCleanup;
			}
		}
	
		// aligned loads, aligned stores
		while (count >= 8) {
			vf0 = _mm_load_ps(src);
			vf1 = _mm_load_ps(src+4);
			F32TOLE16
			_mm_store_si128((__m128i *)dst, vpack0);
			
			src += 8;
			dst += 8;
			count -= 8;
		}
VectorCleanup:
		if (count > 0) {
			// unaligned cleanup -- just do one unaligned vector at the end
			src = src0 + numToConvert - 8;
			dst = dst0 + numToConvert - 8;
			vf0 = _mm_loadu_ps(src);
			vf1 = _mm_loadu_ps(src+4);
			F32TOLE16
			_mm_storeu_si128((__m128i *)dst, vpack0);
		}
		RESTORE_ROUNDMODE
		return;
	}
	
	// scalar for small numbers of samples
	if (count > 0) {
		double scale = 2147483648.0, round = 32768.0, max32 = 2147483648.0 - 1.0 - 32768.0, min32 = 0.;
		ROUNDMODE_NEG_INF
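		// Convert at 32-bit scale, then shift down to 16 bits; the
		// rounding constant 32768 is half of one 16-bit LSB expressed
		// in 32-bit units.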
		
		while (count-- > 0) {
			double f0 = *src++;
			f0 = f0 * scale + round;
			SInt32 i0 = FloatToInt(f0, min32, max32);
			i0 >>= 16;
			*dst++ = i0;
		}
		RESTORE_ROUNDMODE
	}
}

// ===================================================================================================

void Float32ToSwapInt16_X86( const Float32 *src, SInt16 *dst, unsigned int numToConvert )
{
	const float *src0 = src;
	int16_t *dst0 = dst;
	unsigned int count = numToConvert;
	
	if (count >= 8) {
		// vector -- requires 8+ samples
		ROUNDMODE_NEG_INF
		const __m128 vround = (const __m128) { 0.5f, 0.5f, 0.5f, 0.5f };
		const __m128 vmin = (const __m128) { -32768.0f, -32768.0f, -32768.0f, -32768.0f };
		const __m128 vmax = (const __m128) { 32767.0f, 32767.0f, 32767.0f, 32767.0f  };
		const __m128 vscale = (const __m128) { 32768.0f, 32768.0f, 32768.0f, 32768.0f  };
		__m128 vf0, vf1;
		__m128i vi0, vi1, vpack0;
	
#define F32TOBE16 \
		vf0 = _mm_mul_ps(vf0, vscale);			\
		vf1 = _mm_mul_ps(vf1, vscale);			\
		vf0 = _mm_add_ps(vf0, vround);			\
		vf1 = _mm_add_ps(vf1, vround);			\
		vf0 = _mm_max_ps(vf0, vmin);			\
		vf1 = _mm_max_ps(vf1, vmin);			\
		vf0 = _mm_min_ps(vf0, vmax);			\
		vf1 = _mm_min_ps(vf1, vmax);			\
		vi0 = _mm_cvtps_epi32(vf0);			\
		vi1 = _mm_cvtps_epi32(vf1);			\
		vpack0 = _mm_packs_epi32(vi0, vi1);		\
		vpack0 = byteswap16(vpack0);

		int falign = (uintptr_t)src & 0xF;
		int ialign = (uintptr_t)dst & 0xF;
	
		if (falign != 0 || ialign != 0) {
			// do one unaligned conversion
			vf0 = _mm_loadu_ps(src);
			vf1 = _mm_loadu_ps(src+4);
			F32TOBE16
			_mm_storeu_si128((__m128i *)dst, vpack0);

			// and advance such that the destination ints are aligned
			unsigned int n = (16 - ialign) / 2;
			src += n;
			dst += n;
			count -= n;

			falign = (uintptr_t)src & 0xF;
			if (falign != 0) {
				// unaligned loads, aligned stores
				while (count >= 8) {
					vf0 = _mm_loadu_ps(src);
					vf1 = _mm_loadu_ps(src+4);
					F32TOBE16
					_mm_store_si128((__m128i *)dst, vpack0);
					src += 8;
					dst += 8;
					count -= 8;
				}
				goto VectorCleanup;
			}
		}
	
		// aligned loads, aligned stores
		while (count >= 8) {
			vf0 = _mm_load_ps(src);
			vf1 = _mm_load_ps(src+4);
			F32TOBE16
			_mm_store_si128((__m128i *)dst, vpack0);
			
			src += 8;
			dst += 8;
			count -= 8;
		}
VectorCleanup:
		if (count > 0) {
			// unaligned cleanup -- just do one unaligned vector at the end
			src = src0 + numToConvert - 8;
			dst = dst0 + numToConvert - 8;
			vf0 = _mm_loadu_ps(src);
			vf1 = _mm_loadu_ps(src+4);
			F32TOBE16
			_mm_storeu_si128((__m128i *)dst, vpack0);
		}
		RESTORE_ROUNDMODE
		return;
	}
	
	// scalar for small numbers of samples
	if (count > 0) {
		double scale = 2147483648.0, round = 32768.0, max32 = 2147483648.0 - 1.0 - 32768.0, min32 = 0.;
		ROUNDMODE_NEG_INF
		
		while (count-- > 0) {
			double f0 = *src++;
			f0 = f0 * scale + round;
			SInt32 i0 = FloatToInt(f0, min32, max32);
			i0 >>= 16;
			*dst++ = OSSwapInt16(i0);
		}
		RESTORE_ROUNDMODE
	}
}

// ===================================================================================================

void Float32ToNativeInt32_X86( const Float32 *src, SInt32 *dst, unsigned int numToConvert )
{
	const float *src0 = src;
	SInt32 *dst0 = dst;
	unsigned int count = numToConvert;
	
	if (count >= 4) {
		// vector -- requires 4+ samples
		ROUNDMODE_NEG_INF
		const __m128 vround = (const __m128) { 0.5f, 0.5f, 0.5f, 0.5f };
		const __m128 vmin = (const __m128) { -2147483648.0f, -2147483648.0f, -2147483648.0f, -2147483648.0f };
		const __m128 vmax = (const __m128) { kMaxFloat32, kMaxFloat32, kMaxFloat32, kMaxFloat32  };
		const __m128 vscale = (const __m128) { 2147483648.0f, 2147483648.0f, 2147483648.0f, 2147483648.0f  };
		__m128 vf0;
		__m128i vi0;
	
#define F32TOLE32(x) \
		vf##x = _mm_mul_ps(vf##x, vscale);			\
		vf##x = _mm_add_ps(vf##x, vround);			\
		vf##x = _mm_max_ps(vf##x, vmin);			\
		vf##x = _mm_min_ps(vf##x, vmax);			\
		vi##x = _mm_cvtps_epi32(vf##x);			\

		int falign = (uintptr_t)src & 0xF;
		int ialign = (uintptr_t)dst & 0xF;
	
		if (falign != 0 || ialign != 0) {
			// do one unaligned conversion
			vf0 = _mm_loadu_ps(src);
			F32TOLE32(0)
			_mm_storeu_si128((__m128i *)dst, vi0);
			
			// and advance such that the destination ints are aligned
			unsigned int n = (16 - ialign) / 4;
			src += n;
			dst += n;
			count -= n;

			falign = (uintptr_t)src & 0xF;
			if (falign != 0) {
				// unaligned loads, aligned stores
				while (count >= 4) {
					vf0 = _mm_loadu_ps(src);
					F32TOLE32(0)
					_mm_store_si128((__m128i *)dst, vi0);
					src += 4;
					dst += 4;
					count -= 4;
				}
				goto VectorCleanup;
			}
		}
	
		while (count >= 4) {
			vf0 = _mm_load_ps(src);
			F32TOLE32(0)
			_mm_store_si128((__m128i *)dst, vi0);
			
			src += 4;
			dst += 4;
			count -= 4;
		}
VectorCleanup:
		if (count > 0) {
			// unaligned cleanup -- just do one unaligned vector at the end
			src = src0 + numToConvert - 4;
			dst = dst0 + numToConvert - 4;
			vf0 = _mm_loadu_ps(src);
			F32TOLE32(0)
			_mm_storeu_si128((__m128i *)dst, vi0);
		}
		RESTORE_ROUNDMODE
		return;
	}
	
	// scalar for small numbers of samples
	if (count > 0) {
		double scale = 2147483648.0, round = 0.5, max32 = 2147483648.0 - 1.0 - 0.5, min32 = 0.;
		ROUNDMODE_NEG_INF
		
		while (count-- > 0) {
			double f0 = *src++;
			f0 = f0 * scale + round;
			SInt32 i0 = FloatToInt(f0, min32, max32);
			*dst++ = i0;
		}
		RESTORE_ROUNDMODE
	}
}

// ===================================================================================================

void Float32ToSwapInt32_X86( const Float32 *src, SInt32 *dst, unsigned int numToConvert )
{
	const float *src0 = src;
	SInt32 *dst0 = dst;
	unsigned int count = numToConvert;
	
	if (count >= 4) {
		// vector -- requires 4+ samples
		ROUNDMODE_NEG_INF
		const __m128 vround = (const __m128) { 0.5f, 0.5f, 0.5f, 0.5f };
		const __m128 vmin = (const __m128) { -2147483648.0f, -2147483648.0f, -2147483648.0f, -2147483648.0f };
		const __m128 vmax = (const __m128) { kMaxFloat32, kMaxFloat32, kMaxFloat32, kMaxFloat32  };
		const __m128 vscale = (const __m128) { 2147483648.0f, 2147483648.0f, 2147483648.0f, 2147483648.0f  };
		__m128 vf0;
		__m128i vi0;
	
#define F32TOBE32(x) \
		vf##x = _mm_mul_ps(vf##x, vscale);			\
		vf##x = _mm_add_ps(vf##x, vround);			\
		vf##x = _mm_max_ps(vf##x, vmin);			\
		vf##x = _mm_min_ps(vf##x, vmax);			\
		vi##x = _mm_cvtps_epi32(vf##x);			\
		vi##x = byteswap32(vi##x);

		int falign = (uintptr_t)src & 0xF;
		int ialign = (uintptr_t)dst & 0xF;
	
		if (falign != 0 || ialign != 0) {
			// do one unaligned conversion
			vf0 = _mm_loadu_ps(src);
			F32TOBE32(0)
			_mm_storeu_si128((__m128i *)dst, vi0);
			
			// and advance such that the destination ints are aligned
			unsigned int n = (16 - ialign) / 4;
			src += n;
			dst += n;
			count -= n;

			falign = (uintptr_t)src & 0xF;
			if (falign != 0) {
				// unaligned loads, aligned stores
				while (count >= 4) {
					vf0 = _mm_loadu_ps(src);
					F32TOBE32(0)
					_mm_store_si128((__m128i *)dst, vi0);
					src += 4;
					dst += 4;
					count -= 4;
				}
				goto VectorCleanup;
			}
		}
	
		while (count >= 4) {
			vf0 = _mm_load_ps(src);
			F32TOBE32(0)
			_mm_store_si128((__m128i *)dst, vi0);
			
			src += 4;
			dst += 4;
			count -= 4;
		}
VectorCleanup:
		if (count > 0) {
			// unaligned cleanup -- just do one unaligned vector at the end
			src = src0 + numToConvert - 4;
			dst = dst0 + numToConvert - 4;
			vf0 = _mm_loadu_ps(src);
			F32TOBE32(0)
			_mm_storeu_si128((__m128i *)dst, vi0);
		}
		RESTORE_ROUNDMODE
		return;
	}
	
	// scalar for small numbers of samples
	if (count > 0) {
		double scale = 2147483648.0, round = 0.5, max32 = 2147483648.0 - 1.0 - 0.5, min32 = 0.;
		ROUNDMODE_NEG_INF
		
		while (count-- > 0) {
			double f0 = *src++;
			f0 = f0 * scale + round;
			SInt32 i0 = FloatToInt(f0, min32, max32);
			*dst++ = OSSwapInt32(i0);
		}
		RESTORE_ROUNDMODE
	}
}

// ===================================================================================================

// ~14 instructions
static inline __m128i Pack32ToLE24(__m128i val, __m128i mask)
{
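	// Packs the high 3 bytes of each of four 32-bit samples into the low
	// 12 bytes of the result; the single 24-bit mask is shifted along by
	// 3 bytes per step rather than keeping four copies of it.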
	__m128i store;
	val = _mm_srli_si128(val, 1);
	store = _mm_and_si128(val, mask);

	val = _mm_srli_si128(val, 1);
	mask = _mm_slli_si128(mask, 3);
	store = _mm_or_si128(store, _mm_and_si128(val, mask));

	val = _mm_srli_si128(val, 1);
	mask = _mm_slli_si128(mask, 3);
	store = _mm_or_si128(store, _mm_and_si128(val, mask));

	val = _mm_srli_si128(val, 1);
	mask = _mm_slli_si128(mask, 3);
	store = _mm_or_si128(store, _mm_and_si128(val, mask));
	return store;
}

// marginally faster than scalar
void Float32ToNativeInt24_X86( const Float32 *src, UInt8 *dst, unsigned int numToConvert )
{
	const Float32 *src0 = src;
	UInt8 *dst0 = dst;
	unsigned int count = numToConvert;
	
	if (count >= 6) {
		// vector -- requires 6+ samples
		ROUNDMODE_NEG_INF
		const __m128 vround = (const __m128) { 0.5f, 0.5f, 0.5f, 0.5f };
		const __m128 vmin = (const __m128) { -2147483648.0f, -2147483648.0f, -2147483648.0f, -2147483648.0f };
		const __m128 vmax = (const __m128) { kMaxFloat32, kMaxFloat32, kMaxFloat32, kMaxFloat32  };
		const __m128 vscale = (const __m128) { 2147483648.0f, 2147483648.0f, 2147483648.0f, 2147483648.0f  };
		__m128i mask = _mm_setr_epi32(0x00FFFFFF, 0, 0, 0);
			// it is actually cheaper to copy and shift this mask on the fly than to have 4 of them

		__m128i store;
		union {
			UInt32 i[4];
			__m128i v;
		} u;

		__m128 vf0;
		__m128i vi0;

		int falign = (uintptr_t)src & 0xF;
	
		if (falign != 0) {
			// do one unaligned conversion
			vf0 = _mm_loadu_ps(src);
			F32TOLE32(0)
			store = Pack32ToLE24(vi0, mask);
			_mm_storeu_si128((__m128i *)dst, store);

			// and advance such that the source floats are aligned
			unsigned int n = (16 - falign) / 4;
			src += n;
			dst += 3*n;	// bytes
			count -= n;
		}
	
		while (count >= 6) {
			vf0 = _mm_load_ps(src);
			F32TOLE32(0)
			store = Pack32ToLE24(vi0, mask);
			_mm_storeu_si128((__m128i *)dst, store);	// destination always unaligned
			
			src += 4;
			dst += 12;	// bytes
			count -= 4;
		}
		
		
		if (count >= 4) {
			vf0 = _mm_load_ps(src);
			F32TOLE32(0)
			u.v = Pack32ToLE24(vi0, mask);
			((UInt32 *)dst)[0] = u.i[0];
			((UInt32 *)dst)[1] = u.i[1];
			((UInt32 *)dst)[2] = u.i[2];
			
			src += 4;
			dst += 12;	// bytes
			count -= 4;
		}

		if (count > 0) {
			// unaligned cleanup -- just do one unaligned vector at the end
			src = src0 + numToConvert - 4;
			dst = dst0 + 3*numToConvert - 12;
			vf0 = _mm_loadu_ps(src);
			F32TOLE32(0)
			u.v = Pack32ToLE24(vi0, mask);
			((UInt32 *)dst)[0] = u.i[0];
			((UInt32 *)dst)[1] = u.i[1];
			((UInt32 *)dst)[2] = u.i[2];
		}
		RESTORE_ROUNDMODE
		return;
	}
	
	// scalar for small numbers of samples
	if (count > 0) {
		double scale = 2147483648.0, round = 0.5, max32 = 2147483648.0 - 1.0 - 0.5, min32 = 0.;
		ROUNDMODE_NEG_INF
		
		while (count-- > 0) {
			double f0 = *src++;
			f0 = f0 * scale + round;
			UInt32 i0 = FloatToInt(f0, min32, max32);
			dst[0] = (UInt8)(i0 >> 8);
			dst[1] = (UInt8)(i0 >> 16);
			dst[2] = (UInt8)(i0 >> 24);
			dst += 3;
		}
		RESTORE_ROUNDMODE
	}
}


// ===================================================================================================
#pragma mark -
#pragma mark Int -> Float

void NativeInt16ToFloat32_X86( const SInt16 *src, Float32 *dst, unsigned int numToConvert )
{
	const SInt16 *src0 = src;
	Float32 *dst0 = dst;
	unsigned int count = numToConvert;

	if (count >= 8) {
		// vector -- requires 8+ samples
		// convert the 16-bit words to the high word of 32-bit values
#define LEI16TOF32(x, y) \
	vi##x = _mm_unpacklo_epi16(zero, vpack##x); \
	vi##y = _mm_unpackhi_epi16(zero, vpack##x); \
	vf##x = _mm_cvtepi32_ps(vi##x); \
	vf##y = _mm_cvtepi32_ps(vi##y); \
	vf##x = _mm_mul_ps(vf##x, vscale); \
	vf##y = _mm_mul_ps(vf##y, vscale);
		
		const __m128 vscale = (const __m128) { 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f  };
		const __m128i zero = _mm_setzero_si128();
		__m128 vf0, vf1;
		__m128i vi0, vi1, vpack0;

		int ialign = (uintptr_t)src & 0xF;
		int falign = (uintptr_t)dst & 0xF;
	
		if (falign != 0 || ialign != 0) {
			// do one unaligned conversion
			vpack0 = _mm_loadu_si128((__m128i const *)src);
			LEI16TOF32(0, 1)
			_mm_storeu_ps(dst, vf0);
			_mm_storeu_ps(dst+4, vf1);
			
			// and advance such that the destination floats are aligned
			unsigned int n = (16 - falign) / 4;
			src += n;
			dst += n;
			count -= n;

			ialign = (uintptr_t)src & 0xF;
			if (ialign != 0) {
				// unaligned loads, aligned stores
				while (count >= 8) {
					vpack0 = _mm_loadu_si128((__m128i const *)src);
					LEI16TOF32(0, 1)
					_mm_store_ps(dst, vf0);
					_mm_store_ps(dst+4, vf1);
					src += 8;
					dst += 8;
					count -= 8;
				}
				goto VectorCleanup;
			}
		}
	
		// aligned loads, aligned stores
		while (count >= 8) {
			vpack0 = _mm_load_si128((__m128i const *)src);
			LEI16TOF32(0, 1)
			_mm_store_ps(dst, vf0);
			_mm_store_ps(dst+4, vf1);
			src += 8;
			dst += 8;
			count -= 8;
		}
		
VectorCleanup:
		if (count > 0) {
			// unaligned cleanup -- just do one unaligned vector at the end
			src = src0 + numToConvert - 8;
			dst = dst0 + numToConvert - 8;
			vpack0 = _mm_loadu_si128((__m128i const *)src);
			LEI16TOF32(0, 1)
			_mm_storeu_ps(dst, vf0);
			_mm_storeu_ps(dst+4, vf1);
		}
		return;
	}
	// scalar for small numbers of samples
	if (count > 0) {
		double scale = 1./32768.f;
		while (count-- > 0) {
			SInt16 i = *src++;
			double f = (double)i * scale;
			*dst++ = f;
		}
	}
}
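
The LEI16TOF32 macro never shifts: _mm_unpacklo_epi16(zero, v) interleaves zeros below each 16-bit sample, which is the same as widening it into the high half of a 32-bit lane. A scalar model of one lane (the helper name is illustrative):

#include <stdint.h>

static float Int16ToFloat32_OneLane(int16_t s)
{
	int32_t widened = (int32_t)s * 65536;           /* what unpacklo(zero, v) builds: s << 16 */
	return (float)widened * (1.0f / 2147483648.0f); /* == s / 32768.0f */
}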

// ===================================================================================================

void SwapInt16ToFloat32_X86( const SInt16 *src, Float32 *dst, unsigned int numToConvert )
{
	const SInt16 *src0 = src;
	Float32 *dst0 = dst;
	unsigned int count = numToConvert;

	if (count >= 8) {
		// vector -- requires 8+ samples
		// convert the 16-bit words to the high word of 32-bit values
#define BEI16TOF32 \
	vpack0 = byteswap16(vpack0); \
	vi0 = _mm_unpacklo_epi16(zero, vpack0); \
	vi1 = _mm_unpackhi_epi16(zero, vpack0); \
	vf0 = _mm_cvtepi32_ps(vi0); \
	vf1 = _mm_cvtepi32_ps(vi1); \
	vf0 = _mm_mul_ps(vf0, vscale); \
	vf1 = _mm_mul_ps(vf1, vscale);
		
		const __m128 vscale = (const __m128) { 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f  };
		const __m128i zero = _mm_setzero_si128();
		__m128 vf0, vf1;
		__m128i vi0, vi1, vpack0;

		int ialign = (uintptr_t)src & 0xF;
		int falign = (uintptr_t)dst & 0xF;
	
		if (falign != 0 || ialign != 0) {
			// do one unaligned conversion
			vpack0 = _mm_loadu_si128((__m128i const *)src);
			BEI16TOF32
			_mm_storeu_ps(dst, vf0);
			_mm_storeu_ps(dst+4, vf1);

			// and advance such that the destination floats are aligned
			unsigned int n = (16 - falign) / 4;
			src += n;
			dst += n;
			count -= n;

			ialign = (uintptr_t)src & 0xF;
			if (ialign != 0) {
				// unaligned loads, aligned stores
				while (count >= 8) {
					vpack0 = _mm_loadu_si128((__m128i const *)src);
					BEI16TOF32
					_mm_store_ps(dst, vf0);
					_mm_store_ps(dst+4, vf1);
					src += 8;
					dst += 8;
					count -= 8;
				}
				goto VectorCleanup;
			}
		}
	
		// aligned loads, aligned stores
		while (count >= 8) {
			vpack0 = _mm_load_si128((__m128i const *)src);
			BEI16TOF32
			_mm_store_ps(dst, vf0);
			_mm_store_ps(dst+4, vf1);
			src += 8;
			dst += 8;
			count -= 8;
		}
		
VectorCleanup:
		if (count > 0) {
			// unaligned cleanup -- just do one unaligned vector at the end
			src = src0 + numToConvert - 8;
			dst = dst0 + numToConvert - 8;
			vpack0 = _mm_loadu_si128((__m128i const *)src);
			BEI16TOF32
			_mm_storeu_ps(dst, vf0);
			_mm_storeu_ps(dst+4, vf1);
		}
		return;
	}
	// scalar for small numbers of samples
	if (count > 0) {
		double scale = 1./32768.f;
		while (count-- > 0) {
			SInt16 i = *src++;
			i = OSSwapInt16(i);
			double f = (double)i * scale;
			*dst++ = f;
		}
	}
}
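
byteswap16 is defined elsewhere in this file; one common SSE2 formulation, assumed here as a sketch, swaps the two bytes of every 16-bit lane with a pair of shifts:

#include <emmintrin.h>

static inline __m128i byteswap16_sketch(__m128i v)
{
	/* (v << 8) | (v >> 8) within each 16-bit lane */
	return _mm_or_si128(_mm_slli_epi16(v, 8), _mm_srli_epi16(v, 8));
}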

// ===================================================================================================

void NativeInt32ToFloat32_X86( const SInt32 *src, Float32 *dst, unsigned int numToConvert )
{
	const SInt32 *src0 = src;
	Float32 *dst0 = dst;
	unsigned int count = numToConvert;

	if (count >= 4) {
		// vector -- requires 4+ samples
#define LEI32TOF32(x) \
	vf##x = _mm_cvtepi32_ps(vi##x); \
	vf##x = _mm_mul_ps(vf##x, vscale); \
		
		const __m128 vscale = (const __m128) { 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f  };
		__m128 vf0;
		__m128i vi0;

		int ialign = (uintptr_t)src & 0xF;
		int falign = (uintptr_t)dst & 0xF;
	
		if (falign != 0 || ialign != 0) {
			// do one unaligned conversion
			vi0 = _mm_loadu_si128((__m128i const *)src);
			LEI32TOF32(0)
			_mm_storeu_ps(dst, vf0);
			
			// and advance such that the destination floats are aligned
			unsigned int n = (16 - falign) / 4;
			src += n;
			dst += n;
			count -= n;

			ialign = (uintptr_t)src & 0xF;
			if (ialign != 0) {
				// unaligned loads, aligned stores
				while (count >= 4) {
					vi0 = _mm_loadu_si128((__m128i const *)src);
					LEI32TOF32(0)
					_mm_store_ps(dst, vf0);
					src += 4;
					dst += 4;
					count -= 4;
				}
				goto VectorCleanup;
			}
		}
	
		// aligned loads, aligned stores
		while (count >= 4) {
			vi0 = _mm_load_si128((__m128i const *)src);
			LEI32TOF32(0)
			_mm_store_ps(dst, vf0);
			src += 4;
			dst += 4;
			count -= 4;
		}
		
VectorCleanup:
		if (count > 0) {
			// unaligned cleanup -- just do one unaligned vector at the end
			src = src0 + numToConvert - 4;
			dst = dst0 + numToConvert - 4;
			vi0 = _mm_loadu_si128((__m128i const *)src);
			LEI32TOF32(0)
			_mm_storeu_ps(dst, vf0);
		}
		return;
	}
	// scalar for small numbers of samples
	if (count > 0) {
		double scale = 1./2147483648.0f;
		while (count-- > 0) {
			SInt32 i = *src++;
			double f = (double)i * scale;
			*dst++ = f;
		}
	}
}

// ===================================================================================================

void SwapInt32ToFloat32_X86( const SInt32 *src, Float32 *dst, unsigned int numToConvert )
{
	const SInt32 *src0 = src;
	Float32 *dst0 = dst;
	unsigned int count = numToConvert;

	if (count >= 4) {
		// vector -- requires 4+ samples
#define BEI32TOF32(x) \
	vi##x = byteswap32(vi##x); \
	vf##x = _mm_cvtepi32_ps(vi##x); \
	vf##x = _mm_mul_ps(vf##x, vscale); \
		
		const __m128 vscale = (const __m128) { 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f  };
		__m128 vf0;
		__m128i vi0;

		int ialign = (uintptr_t)src & 0xF;
		int falign = (uintptr_t)dst & 0xF;
	
		if (falign != 0 || ialign != 0) {
			// do one unaligned conversion
			vi0 = _mm_loadu_si128((__m128i const *)src);
			BEI32TOF32(0)
			_mm_storeu_ps(dst, vf0);
			
			// and advance such that the destination floats are aligned
			unsigned int n = (16 - falign) / 4;
			src += n;
			dst += n;
			count -= n;

			ialign = (uintptr_t)src & 0xF;
			if (ialign != 0) {
				// unaligned loads, aligned stores
				while (count >= 4) {
					vi0 = _mm_loadu_si128((__m128i const *)src);
					BEI32TOF32(0)
					_mm_store_ps(dst, vf0);
					src += 4;
					dst += 4;
					count -= 4;
				}
				goto VectorCleanup;
			}
		}
	
		// aligned loads, aligned stores
		while (count >= 4) {
			vi0 = _mm_load_si128((__m128i const *)src);
			BEI32TOF32(0)
			_mm_store_ps(dst, vf0);
			src += 4;
			dst += 4;
			count -= 4;
		}
		
VectorCleanup:
		if (count > 0) {
			// unaligned cleanup -- just do one unaligned vector at the end
			src = src0 + numToConvert - 4;
			dst = dst0 + numToConvert - 4;
			vi0 = _mm_loadu_si128((__m128i const *)src);
			BEI32TOF32(0)
			_mm_storeu_ps(dst, vf0);
		}
		return;
	}
	// scalar for small numbers of samples
	if (count > 0) {
		double scale = 1./2147483648.0f;
		while (count-- > 0) {
			SInt32 i = *src++;
			i = OSSwapInt32(i);
			double f = (double)i * scale;
			*dst++ = f;
		}
	}
}
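
Likewise, byteswap32 is not shown in this excerpt. A plausible SSE2 version (a sketch, not necessarily this file's macro) swaps the bytes within each 16-bit lane, then swaps the 16-bit halves of each 32-bit lane; with SSSE3, a single _mm_shuffle_epi8 would do:

#include <emmintrin.h>

static inline __m128i byteswap32_sketch(__m128i v)
{
	v = _mm_or_si128(_mm_slli_epi16(v, 8), _mm_srli_epi16(v, 8));      /* b0b1 b2b3 -> b1b0 b3b2 */
	return _mm_or_si128(_mm_slli_epi32(v, 16), _mm_srli_epi32(v, 16)); /* b1b0 b3b2 -> b3b2 b1b0 */
}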
Example #22
0
const ALfloat *Resample_lerp_SSE2(const InterpState* UNUSED(state),
  const ALfloat *restrict src, ALsizei frac, ALint increment,
  ALfloat *restrict dst, ALsizei numsamples)
{
    const __m128i increment4 = _mm_set1_epi32(increment*4);
    const __m128 fracOne4 = _mm_set1_ps(1.0f/FRACTIONONE);
    const __m128i fracMask4 = _mm_set1_epi32(FRACTIONMASK);
    alignas(16) ALsizei pos_[4], frac_[4];
    __m128i frac4, pos4;
    ALsizei todo, pos, i;

    ASSUME(numsamples > 0);

    InitiatePositionArrays(frac, increment, frac_, pos_, 4);
    frac4 = _mm_setr_epi32(frac_[0], frac_[1], frac_[2], frac_[3]);
    pos4 = _mm_setr_epi32(pos_[0], pos_[1], pos_[2], pos_[3]);

    todo = numsamples & ~3;
    for(i = 0;i < todo;i += 4)
    {
        const int pos0 = _mm_cvtsi128_si32(_mm_shuffle_epi32(pos4, _MM_SHUFFLE(0, 0, 0, 0)));
        const int pos1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(pos4, _MM_SHUFFLE(1, 1, 1, 1)));
        const int pos2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(pos4, _MM_SHUFFLE(2, 2, 2, 2)));
        const int pos3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(pos4, _MM_SHUFFLE(3, 3, 3, 3)));
        const __m128 val1 = _mm_setr_ps(src[pos0  ], src[pos1  ], src[pos2  ], src[pos3  ]);
        const __m128 val2 = _mm_setr_ps(src[pos0+1], src[pos1+1], src[pos2+1], src[pos3+1]);

        /* val1 + (val2-val1)*mu */
        const __m128 r0 = _mm_sub_ps(val2, val1);
        const __m128 mu = _mm_mul_ps(_mm_cvtepi32_ps(frac4), fracOne4);
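
A scalar model of the same fixed-point resampler may clarify the cursor arithmetic the SSE2 loop vectorizes. This is a sketch; the constants are assumed to follow OpenAL Soft's usual pattern (FRACTIONONE = 1 << FRACTIONBITS, FRACTIONMASK = FRACTIONONE - 1), and the 12-bit width chosen here is illustrative:

enum { FRACBITS_SK = 12, FRACONE_SK = 1 << FRACBITS_SK, FRACMASK_SK = FRACONE_SK - 1 };

static void resample_lerp_scalar_sketch(const float *src, unsigned frac,
                                        int increment, float *dst, int n)
{
    unsigned pos = 0;
    for (int i = 0; i < n; i++) {
        float mu = (float)frac * (1.0f / FRACONE_SK);
        dst[i] = src[pos] + (src[pos + 1] - src[pos]) * mu; /* val1 + (val2-val1)*mu */
        frac += increment;
        pos += frac >> FRACBITS_SK;  /* carry whole samples out of the fraction */
        frac &= FRACMASK_SK;
    }
}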
Example #23
0
		void SWModelRenderer::RenderInner(spades::draw::SWModel *model,
									 const client::ModelRenderParam &param) {
			auto& mat = param.matrix;
			auto origin = mat.GetOrigin();
			auto axis1 = mat.GetAxis(0);
			auto axis2 = mat.GetAxis(1);
			auto axis3 = mat.GetAxis(2);
			auto *rawModel = model->GetRawModel();
			auto rawModelOrigin = rawModel->GetOrigin();
			rawModelOrigin += 0.1f;
			origin += axis1 * rawModelOrigin.x;
			origin += axis2 * rawModelOrigin.y;
			origin += axis3 * rawModelOrigin.z;
			
			int w = rawModel->GetWidth();
			int h = rawModel->GetHeight();
			//int d = rawModel->GetDepth();
			
			// evaluate brightness for each normal
			uint8_t brights[3*3*3];
			{
				auto lightVec = MakeVector3(0.f, -0.707f, -0.707f);
				float dot1 = Vector3::Dot(axis1, lightVec) * fastRSqrt(axis1.GetPoweredLength());
				float dot2 = Vector3::Dot(axis2, lightVec) * fastRSqrt(axis2.GetPoweredLength());
				float dot3 = Vector3::Dot(axis3, lightVec) * fastRSqrt(axis3.GetPoweredLength());
				for(int x = 0; x < 3; x++){
					float d;
					int cnt;
					switch(x){
						case 0: d = -dot1; cnt = 1; break;
						case 1: d = 0.f; cnt = 0; break;
						case 2: d = dot1; cnt = 1; break;
					}
					for(int y = 0; y < 3; y++){
						auto d2 = d;
						auto cnt2 = cnt;
						switch(y){
							case 0: d2 -= dot2; cnt2++; break;
							case 1: break;
							case 2: d2 += dot2; cnt2++; break;
						}
						for(int z = 0; z < 3; z++) {
							auto d3 = d2;
							auto cnt3 = cnt2;
							switch(z){
								case 0: d3 -= dot3; cnt3++; break;
								case 1: break;
								case 2: d3 += dot3; cnt3++; break;
							}
							switch(cnt3){
								case 2:
									d3 *= 0.707f;
									break;
								case 3:
									d3 *= 0.57735f;
									break;
							}
							d3 = 192.f + d3 * 62.f;
							brights[x + y * 3 + z * 9]
							= static_cast<uint8_t>(d3);
						}
					}
				}
			}
				
			
			// compute center coord. for culling
			{
				auto center = origin;
				auto localCenter = model->GetCenter();
				center += axis1 * localCenter.x;
				center += axis2 * localCenter.y;
				center += axis3 * localCenter.z;
				
				float largestAxis = axis1.GetPoweredLength();
				largestAxis = std::max(largestAxis, axis2.GetPoweredLength());
				largestAxis = std::max(largestAxis, axis3.GetPoweredLength());
				
				if(!r->SphereFrustrumCull(center, model->GetRadius() * sqrtf(largestAxis)))
					return;
			}
			
			Bitmap *fbmp = r->fb;
			auto *fb = fbmp->GetPixels();
			int fw = fbmp->GetWidth();
			int fh = fbmp->GetHeight();
			auto *db = r->depthBuffer.data();
			
			Matrix4 viewproj = r->GetProjectionViewMatrix();
			Vector4 ndc2scrscale = {fw * 0.5f, -fh * 0.5f, 1.f, 1.f};
			//Vector4 ndc2scroff = {fw * 0.5f, fh * 0.5f, 0.f, 0.f};
			int ndc2scroffX = fw >> 1;
			int ndc2scroffY = fh >> 1;
			
			
			// render each point
			auto tOrigin = viewproj * MakeVector4(origin.x, origin.y, origin.z, 1.f);
			auto tAxis1 = viewproj * MakeVector4(axis1.x, axis1.y, axis1.z, 0.f);
			auto tAxis2 = viewproj * MakeVector4(axis2.x, axis2.y, axis2.z, 0.f);
			auto tAxis3 = viewproj * MakeVector4(axis3.x, axis3.y, axis3.z, 0.f);
			tOrigin *= ndc2scrscale;
			tAxis1 *= ndc2scrscale;
			tAxis2 *= ndc2scrscale;
			tAxis3 *= ndc2scrscale;
			
			float pointDiameter;// = largestAxis * 0.55f * fh * 0.5f;
			{
				float largestAxis = tAxis1.GetPoweredLength();
				largestAxis = std::max(largestAxis, tAxis2.GetPoweredLength());
				largestAxis = std::max(largestAxis, tAxis3.GetPoweredLength());
				pointDiameter = sqrtf(largestAxis);
			}
			
			uint32_t customColor;
			customColor =
			ToFixed8(param.customColor.z) |
			(ToFixed8(param.customColor.y) << 8) |
			(ToFixed8(param.customColor.x) << 16);
			
			auto v1 = tOrigin;
			float zNear = r->sceneDef.zNear;
			for(int x = 0; x < w; x++) {
				auto v2 = v1;
				for(int y = 0; y < h; y++) {
					auto *mp = &model->renderData
					[model->renderDataAddr[x + y * w]];
					while(*mp != -1) {
						uint32_t data = *(mp++);
						uint32_t normal = *(mp++);
						int z = static_cast<int>(data >> 24);
						//SPAssert(z < d);
						SPAssert(z >= 0);
						
						auto vv = v2 + tAxis3 * zvals[z];
						if(vv.z < zNear) continue;
						
						// save Z value (don't divide this by W!)
						float zval = vv.z;
						
						// use vv.z for point radius to be divided by W
						vv.z = pointDiameter;
						
						// perspective division
						float scl = fastRcp(vv.w);
						vv *= scl;
						
						int ix = static_cast<int>(vv.x) + ndc2scroffX;
						int iy = static_cast<int>(vv.y) + ndc2scroffY;
						int idm = static_cast<int>(vv.z + .99f);
						idm = std::max(1, idm);
						int minX = ix - (idm >> 1);
						int minY = iy - (idm >> 1);
						if(minX >= fw || minY >= fh) continue;
						int maxX = ix + idm;
						int maxY = iy + idm;
						if(maxX <= 0 || maxY <= 0) continue;
						
						minX = std::max(minX, 0);
						minY = std::max(minY, 0);
						maxX = std::min(maxX, fw);
						maxY = std::min(maxY, fh);
						
						auto *fb2 = fb + (minX + minY * fw);
						auto *db2 = db + (minX + minY * fw);
						int w = maxX - minX;
						
						uint32_t color = data & 0xffffff;
						if(color == 0)
							color = customColor;
						
						SPAssert(normal < 27);
						int bright = brights[normal];
#if ENABLE_SSE2
						if(lvl == SWFeatureLevel::SSE2) {
							auto m = _mm_setr_epi32(color, 0, 0, 0);
							auto f = _mm_set1_epi16(bright << 8);
							
							m = _mm_unpacklo_epi8(m, _mm_setzero_si128());
							m = _mm_mulhi_epu16(m, f);
							m = _mm_packus_epi16(m, m);
							
							_mm_store_ss(reinterpret_cast<float*>(&color),
										 _mm_castsi128_ps(m));
						}else
#endif
						{
							uint32_t c1 = color & 0xff00;
							uint32_t c2 = color & 0xff00ff;
							c1 *= bright;
							c2 *= bright;
							color = ((c1&0xff0000) | (c2&0xff00ff00)) >> 8;
						}
						
						for(int yy = minY; yy < maxY; yy++){
							auto *fb3 = fb2;
							auto *db3 = db2;
							
							for(int xx = w; xx > 0; xx--) {
								if(zval < *db3) {
									*db3 = zval;
									*fb3 = color;
								}
								fb3++; db3++;
							}
							
							fb2 += fw;
							db2 += fw;
						}
						
						
					}
					v2 += tAxis2;
				}
				v1 += tAxis1;
			}
		}
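
The SSE2 branch above modulates each color byte by bright without a per-channel shift: with f = bright << 8, _mm_mulhi_epu16 yields (c * (bright << 8)) >> 16, which is exactly (c * bright) >> 8; the scalar fallback below it computes the same thing two channels at a time. One lane, modeled in scalar code (illustrative only):

static inline unsigned ModulateByteSketch(unsigned c, unsigned bright) /* both 0..255 */
{
	return (c * (bright << 8)) >> 16;   /* == (c * bright) >> 8 */
}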
Example #24
0
intel_AES_GCMContext *intel_AES_GCM_CreateContext(void *context, 
               freeblCipherFunc cipher,
               const unsigned char *params, 
               unsigned int blocksize)
{
    intel_AES_GCMContext *gcm = NULL;
    AESContext *aes = (AESContext*)context;
    const CK_GCM_PARAMS *gcmParams = (const CK_GCM_PARAMS *)params;
    unsigned char buff[AES_BLOCK_SIZE]; /* aux buffer */
    
    int IV_whole_len = gcmParams->ulIvLen&(~0xf);
    int IV_remainder_len = gcmParams->ulIvLen&0xf;
    int AAD_whole_len = gcmParams->ulAADLen&(~0xf);
    int AAD_remainder_len = gcmParams->ulAADLen&0xf;
    
    __m128i BSWAP_MASK = _mm_setr_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
    __m128i ONE = _mm_set_epi32(0,0,0,1);
    unsigned int j;
    SECStatus rv;

    if (blocksize != AES_BLOCK_SIZE) {
      PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
      return NULL;
    }
    gcm = PORT_ZNew(intel_AES_GCMContext);
    
    if (gcm == NULL) {
        return NULL;
    }
    /* initialize context fields */
    gcm->aes_context = aes;
    gcm->tagBits = gcmParams->ulTagBits;
    gcm->Alen = 0;
    gcm->Mlen = 0;
    /* first prepare H and its derivatives for ghash */
    intel_aes_gcmINIT(gcm->Htbl, (unsigned char*)aes->expandedKey, aes->Nr);
    /* Initial TAG value is zero*/
    _mm_storeu_si128((__m128i*)gcm->T, _mm_setzero_si128());
    _mm_storeu_si128((__m128i*)gcm->X0, _mm_setzero_si128());
    /* Init the counter */
    if(gcmParams->ulIvLen == 12) {
        _mm_storeu_si128((__m128i*)gcm->CTR, _mm_setr_epi32(((unsigned int*)gcmParams->pIv)[0], ((unsigned int*)gcmParams->pIv)[1], ((unsigned int*)gcmParams->pIv)[2], 0x01000000));
    } else {
        /* If IV size is not 96 bits, then the initial counter value is GHASH of the IV */
        intel_aes_gcmAAD(gcm->Htbl, gcmParams->pIv, IV_whole_len, gcm->T);
        /* Partial block */
        if(IV_remainder_len) {
            PORT_Memset(buff, 0, AES_BLOCK_SIZE);
            PORT_Memcpy(buff, gcmParams->pIv + IV_whole_len, IV_remainder_len);
            intel_aes_gcmAAD(gcm->Htbl, buff, AES_BLOCK_SIZE, gcm->T);
         }
         
         intel_aes_gcmTAG
         (
            gcm->Htbl,
            gcm->T,
            gcmParams->ulIvLen,
            0,
            gcm->X0,
            gcm->CTR
         );
        /* TAG should be zero again */
        _mm_storeu_si128((__m128i*)gcm->T, _mm_setzero_si128());
    }
    /* Encrypt the initial counter, will be used to encrypt the GHASH value, in the end */
    rv = (*cipher)(context, gcm->X0, &j, AES_BLOCK_SIZE, gcm->CTR, AES_BLOCK_SIZE, AES_BLOCK_SIZE);
    if (rv != SECSuccess) {
        goto loser;
    }
    /* Promote the counter by 1 */
    _mm_storeu_si128((__m128i*)gcm->CTR, _mm_shuffle_epi8(_mm_add_epi32(ONE, _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)gcm->CTR), BSWAP_MASK)), BSWAP_MASK));

/*     Now hash AAD - it would actually make sense to separate the context creation from the AAD,
 *     because that would allow reusing the H, which only changes when the AES key changes,
 *     and not with every packet, like the IV and AAD do */
    intel_aes_gcmAAD(gcm->Htbl, gcmParams->pAAD, AAD_whole_len, gcm->T);
    if(AAD_remainder_len) {
        PORT_Memset(buff, 0, AES_BLOCK_SIZE);
        PORT_Memcpy(buff, gcmParams->pAAD + AAD_whole_len, AAD_remainder_len);
        intel_aes_gcmAAD(gcm->Htbl, buff, AES_BLOCK_SIZE, gcm->T);
    }
    gcm->Alen += gcmParams->ulAADLen;
    return gcm;
    
    loser:
    if (gcm) {
        PORT_Free(gcm);
    }
    return NULL;
}
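
The counter-increment one-liner reads the counter block, byteswaps it so the big-endian tail lands in the low little-endian lane, adds 1 with a 32-bit add, and swaps back; GCM increments only the last 32 bits of the counter. A byte-level sketch of the same operation (hypothetical helper, not NSS code):

#include <stdint.h>

static void gcm_increment_ctr32_sketch(uint8_t ctr[16])
{
    int i;
    for (i = 15; i >= 12; i--) {   /* big-endian increment of bytes 12..15, wrapping mod 2^32 */
        if (++ctr[i] != 0)
            break;                 /* stop once a byte does not carry */
    }
}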
Example #25
0
void BrushToolEdit::drawBlur(const QPoint &pt, float amount)
{
    Terrain *tip = tool->tip(pt).data();

    // compute affected rectangle
    QRect dirtyRect(pt, tip->size());
    dirtyRect = dirtyRect.intersected(QRect(QPoint(0, 0), terrain->size()));

    if (!dirtyRect.isValid()) {
        return;
    }

    edit->beginEdit(dirtyRect, terrain);

    QSize tSize = terrain->size();
    QSize tipSize = tip->size();
    QSize blurBufferSize(dirtyRect.width() + 6, dirtyRect.height() + 6);
    TemporaryBuffer<__m128> blurBuffer1(blurBufferSize.width() * blurBufferSize.height(), 16);
    TemporaryBuffer<__m128> blurBuffer2(blurBufferSize.width() * blurBufferSize.height(), 16);
    TemporaryBuffer<float> tipBuffer(blurBufferSize.width() * blurBufferSize.height(), 4);

    for (int y = 0; y < blurBufferSize.height(); ++y) {
        int cy = y + dirtyRect.top() - 3;
        cy = std::max(std::min(cy, tSize.height() - 1), 0);
        for (int x = 0; x < blurBufferSize.width(); ++x) {
            int cx = x + dirtyRect.left() - 3;
            cx = std::max(std::min(cx, tSize.width() - 1), 0);

            quint32 color = terrain->color(cx, cy);
            auto colorMM = _mm_setr_epi32(color, 0, 0, 0);
            colorMM = _mm_unpacklo_epi8(colorMM, _mm_setzero_si128());
            colorMM = _mm_unpacklo_epi16(colorMM, _mm_setzero_si128());
            auto colorF = _mm_cvtepi32_ps(colorMM);
            _mm_store_ps(reinterpret_cast<float *>(blurBuffer1 + x + y * blurBufferSize.width()), colorF);
        }
    }
    for (int y = 0; y < blurBufferSize.height(); ++y) {
        int cy = y + dirtyRect.top() - 3;
        int ty = cy - pt.y();
        if (ty >= 0 && ty < tipSize.height()) {
            for (int x = 0; x < blurBufferSize.width(); ++x) {
                int cx = x + dirtyRect.left() - 3;
                int tx = cx - pt.x();
                tipBuffer[x + y * blurBufferSize.width()] =
                        tx >= 0 && tx < tipSize.width() ?
                            tip->landform(tx, ty) * amount :
                            0.f;
            }
        } else {
            std::fill(&tipBuffer[y * blurBufferSize.width()],
                    &tipBuffer[(y + 1) * blurBufferSize.width()], 0.f);
        }
    }

    // apply horizontal blur
    for (int y = 0; y < blurBufferSize.height(); ++y) {
        __m128 *inBuf = blurBuffer1 + y * blurBufferSize.width();
        __m128 *outBuf = blurBuffer2 + y * blurBufferSize.width();
        float *varBuf = tipBuffer + y * blurBufferSize.width();
        for (int x = 3; x < blurBufferSize.width() - 3; ++x) {
            float variance = varBuf[x];
            __m128 kernel = globalGaussianKernelTable.fetch(variance);

            // sample input
            __m128 p1 = _mm_load_ps(reinterpret_cast<float *>(inBuf + x));
            p1 = _mm_add_ps(p1, p1);
            __m128 p2 = _mm_load_ps(reinterpret_cast<float *>(inBuf + x + 1));
            p2 = _mm_add_ps(p2, _mm_load_ps(reinterpret_cast<float *>(inBuf + x - 1)));
            __m128 p3 = _mm_load_ps(reinterpret_cast<float *>(inBuf + x + 2));
            p3 = _mm_add_ps(p3, _mm_load_ps(reinterpret_cast<float *>(inBuf + x - 2)));
            __m128 p4 = _mm_load_ps(reinterpret_cast<float *>(inBuf + x + 3));
            p4 = _mm_add_ps(p4, _mm_load_ps(reinterpret_cast<float *>(inBuf + x - 3)));

            // apply kernel
            p1 = _mm_mul_ps(p1, _mm_shuffle_ps(kernel, kernel, _MM_SHUFFLE(0, 0, 0, 0)));
            p2 = _mm_mul_ps(p2, _mm_shuffle_ps(kernel, kernel, _MM_SHUFFLE(1, 1, 1, 1)));
            p3 = _mm_mul_ps(p3, _mm_shuffle_ps(kernel, kernel, _MM_SHUFFLE(2, 2, 2, 2)));
            p4 = _mm_mul_ps(p4, _mm_shuffle_ps(kernel, kernel, _MM_SHUFFLE(3, 3, 3, 3)));

            p1 = _mm_add_ps(p1, p2);
            p3 = _mm_add_ps(p3, p4);
            auto p = _mm_add_ps(p1, p3);

            // store
            _mm_store_ps(reinterpret_cast<float *>(outBuf + x), p);
        }
    }

    // apply vertical blur
    for (int y = 3; y < blurBufferSize.height() - 3; ++y) {
        __m128 *inBuf = blurBuffer2 + y * blurBufferSize.width();
        __m128 *outBuf = blurBuffer1 + y * blurBufferSize.width();
        float *varBuf = tipBuffer + y * blurBufferSize.width();
        for (int x = 3; x < blurBufferSize.width() - 3; x += 1) {
            // fetch kernel
            __m128 kernel = globalGaussianKernelTable.fetch(varBuf[x]);

            // load input
            __m128 p1 = _mm_load_ps(reinterpret_cast<float *>(inBuf + x));
            p1 = _mm_add_ps(p1, p1);
            __m128 p2 = _mm_load_ps(reinterpret_cast<float *>(inBuf + x - blurBufferSize.width()));
            p2 = _mm_add_ps(p2, _mm_load_ps(reinterpret_cast<float *>(inBuf + x + blurBufferSize.width())));
            __m128 p3 = _mm_load_ps(reinterpret_cast<float *>(inBuf + x - blurBufferSize.width() * 2));
            p3 = _mm_add_ps(p3, _mm_load_ps(reinterpret_cast<float *>(inBuf + x + blurBufferSize.width() * 2)));
            __m128 p4 = _mm_load_ps(reinterpret_cast<float *>(inBuf + x - blurBufferSize.width() * 3));
            p4 = _mm_add_ps(p4, _mm_load_ps(reinterpret_cast<float *>(inBuf + x + blurBufferSize.width() * 3)));

            // apply kernel
            p1 = _mm_mul_ps(p1, _mm_shuffle_ps(kernel, kernel, _MM_SHUFFLE(0, 0, 0, 0)));
            p2 = _mm_mul_ps(p2, _mm_shuffle_ps(kernel, kernel, _MM_SHUFFLE(1, 1, 1, 1)));
            p3 = _mm_mul_ps(p3, _mm_shuffle_ps(kernel, kernel, _MM_SHUFFLE(2, 2, 2, 2)));
            p4 = _mm_mul_ps(p4, _mm_shuffle_ps(kernel, kernel, _MM_SHUFFLE(3, 3, 3, 3)));
            p1 = _mm_add_ps(p1, p2);
            p3 = _mm_add_ps(p3, p4);
            auto p = _mm_add_ps(p1, p3);

            // store
            _mm_store_ps(reinterpret_cast<float *>(outBuf + x), p);
        }
    }

    for (int y = 0; y < dirtyRect.height(); ++y) {
        __m128 *inBuf = blurBuffer1 + (y + 3) * blurBufferSize.width() + 3;
        for (int x = 0; x < dirtyRect.width(); ++x) {
            int cx = x + dirtyRect.left();
            int cy = y + dirtyRect.top();
            auto colorF = _mm_load_ps(reinterpret_cast<float *>(inBuf + x));
            colorF = _mm_add_ps(colorF, _mm_set1_ps(0.5f));
            colorF = _mm_add_ps(colorF, globalDitherSampler.getM128());
            auto colorMM = _mm_cvttps_epi32(colorF);
            colorMM = _mm_packs_epi32(colorMM, colorMM);
            colorMM = _mm_packus_epi16(colorMM, colorMM);
            _mm_store_ss(reinterpret_cast<float*>(&terrain->color(cx, cy)), _mm_castsi128_ps(colorMM));
        }
    }
    edit->endEdit(terrain);
}
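
Each blur pass above consumes a four-lane kernel as (2*center)*k0 + (left1+right1)*k1 + (left2+right2)*k2 + (left3+right3)*k3, so lane 0 holds half the center weight. A sketch of how such a table entry could be computed (the helper and its normalization are assumptions, not this project's code):

#include <cmath>

// Weights for taps 0, ±1, ±2, ±3 of a Gaussian with the given variance.
// Lane 0 stores half the center weight because the passes add the center
// sample to itself before multiplying.
static void makeGaussianKernel7(float variance, float k[4])
{
    float g[4];
    for (int i = 0; i < 4; ++i)
        g[i] = variance > 0.f ? std::exp(-(float)(i * i) / (2.f * variance))
                              : (i == 0 ? 1.f : 0.f);
    float S = g[0] + 2.f * (g[1] + g[2] + g[3]);  // total weight of all 7 taps
    k[0] = g[0] / (2.f * S);
    k[1] = g[1] / S;
    k[2] = g[2] / S;
    k[3] = g[3] / S;
}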
Example #26
0
void BrushToolEdit::drawInner(const QPoint &pt, float strength)
{
    float fixedStrength = params.strength;
    strength *= fixedStrength;

    auto color = params.color;
    std::array<int, 3> colorParts = Terrain::expandColor(color);
    __m128 colorMM = _mm_setr_ps(colorParts[0], colorParts[1], colorParts[2], 0);

    SseRoundingModeScope roundingModeScope(_MM_ROUND_NEAREST);
    (void) roundingModeScope;

    switch (tool->type()) {
    case BrushType::Blur:
        drawBlur(pt, std::min(strength / 5.f, 4.f));
        break;
    case BrushType::Smoothen:
        drawSmoothen(pt, std::min(strength / 5.f, 4.f));
        break;
    case BrushType::Raise:
    case BrushType::Lower:
        if (tool->type() == BrushType::Lower) {
            fixedStrength = -fixedStrength;
            strength = -strength;
        }
        switch (params.pressureMode) {
        case BrushPressureMode::AirBrush:
            strength *= 3.f;
            drawRaiseLower(pt, [=](float &current, float before, float tip) {
                (void) before;
                current -= tip * strength;
            });
            break;
        case BrushPressureMode::Constant:
            if (tool->type() == BrushType::Lower) {
                drawRaiseLower(pt, [=](float &current, float before, float tip) {
                    current = Terrain::quantizeOne(std::max(current, before - tip * fixedStrength));
                });
            } else {
                drawRaiseLower(pt, [=](float &current, float before, float tip) {
                    current = Terrain::quantizeOne(std::min(current, before - tip * fixedStrength));
                });
            }
            break;
        case BrushPressureMode::Adjustable:
            drawRaiseLower(pt, [=](float &current, float before, float tip) {
                current = Terrain::quantizeOne(before - tip * strength);
            });
            break;
        }
        break;
    case BrushType::Paint:
        switch (params.pressureMode) {
        case BrushPressureMode::AirBrush:
            strength = 1.f - std::exp2(-strength);

            drawColor(pt, [=](quint32 &current, quint32 before, float tip) {
                (void) before;

                // convert current color to FP32
                auto currentMM = _mm_castps_si128(_mm_load_ss(reinterpret_cast<float *>(&current)));
                currentMM = _mm_unpacklo_epi8(currentMM, _mm_setzero_si128());
                currentMM = _mm_unpacklo_epi16(currentMM, _mm_setzero_si128());
                auto currentMF = _mm_cvtepi32_ps(currentMM);

                auto factor = _mm_set1_ps(tip * strength);

                // blend
                auto diff = _mm_sub_ps(colorMM, currentMF);
                diff = _mm_mul_ps(diff, factor);
                currentMF = _mm_add_ps(currentMF, diff);

                // convert to RGB32
                currentMF = _mm_add_ps(currentMF, globalDitherSampler.getM128());
                currentMM = _mm_cvttps_epi32(currentMF);
                currentMM = _mm_packs_epi32(currentMM, currentMM);
                currentMM = _mm_packus_epi16(currentMM, currentMM);

                _mm_store_ss(reinterpret_cast<float *>(&current), _mm_castsi128_ps(currentMM));
            });
            break;
        case BrushPressureMode::Constant:
            fixedStrength *= 0.01f;
            drawColor(pt, [=](quint32 &current, quint32 before, float tip) {
                // convert current color to FP32
                auto currentMM = _mm_castps_si128(_mm_load_ss(reinterpret_cast<float *>(&current)));
                currentMM = _mm_unpacklo_epi8(currentMM, _mm_setzero_si128());
                currentMM = _mm_unpacklo_epi16(currentMM, _mm_setzero_si128());
                auto currentMF = _mm_cvtepi32_ps(currentMM);

                // convert before color to FP32
                auto beforeMM = _mm_setr_epi32(before, 0, 0, 0);
                beforeMM = _mm_unpacklo_epi8(beforeMM, _mm_setzero_si128());
                beforeMM = _mm_unpacklo_epi16(beforeMM, _mm_setzero_si128());
                auto beforeMF = _mm_cvtepi32_ps(beforeMM);
                // beforeMM = _mm_add_ps(beforeMM, globalDitherSampler.getM128());

                // use "before" image to which way of color change is possible, and
                // compute possible range of result color
                auto diff = _mm_sub_ps(colorMM, beforeMF);
                auto factor = _mm_set1_ps(tip * fixedStrength);
                auto adddiff = _mm_mul_ps(diff, factor);
                beforeMF = _mm_add_ps(beforeMF, adddiff);
                auto diffDir = _mm_cmpgt_ps(diff, _mm_setzero_ps());

                // compute output image
                auto out1 = _mm_max_ps(currentMF, beforeMF);
                auto out2 = _mm_min_ps(currentMF, beforeMF);
                currentMF = _mm_or_ps(_mm_and_ps(diffDir, out1), _mm_andnot_ps(diffDir, out2));

                // convert to RGB32
                currentMF = _mm_add_ps(currentMF, globalDitherSampler.getM128());
                currentMM = _mm_cvttps_epi32(currentMF);
                currentMM = _mm_packs_epi32(currentMM, currentMM);
                currentMM = _mm_packus_epi16(currentMM, currentMM);

                _mm_store_ss(reinterpret_cast<float *>(&current), _mm_castsi128_ps(currentMM));
            });
            break;
        case BrushPressureMode::Adjustable:
            strength *= 0.01f;
            drawColor(pt, [=](quint32 &current, quint32 before, float tip) {

                // convert before color to FP32
                auto beforeMM = _mm_setr_epi32(before, 0, 0, 0);
                beforeMM = _mm_unpacklo_epi8(beforeMM, _mm_setzero_si128());
                beforeMM = _mm_unpacklo_epi16(beforeMM, _mm_setzero_si128());
                auto beforeMF = _mm_cvtepi32_ps(beforeMM);

                // blend
                auto diff = _mm_sub_ps(colorMM, beforeMF);
                auto factor = _mm_set1_ps(tip * strength);
                diff = _mm_mul_ps(diff, factor);
                beforeMF = _mm_add_ps(beforeMF, diff);

                // convert to RGB32
                beforeMF = _mm_add_ps(beforeMF, globalDitherSampler.getM128());
                beforeMM = _mm_cvttps_epi32(beforeMF);
                beforeMM = _mm_packs_epi32(beforeMM, beforeMM);
                beforeMM = _mm_packus_epi16(beforeMM, beforeMM);

                _mm_store_ss(reinterpret_cast<float *>(&current), _mm_castsi128_ps(beforeMM));
            });
            break;
        }
        break;
    }

}
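
All three Paint pressure modes reduce to the same per-channel blend, current += (target - current) * a; they differ only in how the opacity a is computed and whether "current" or "before" is the base. A scalar model of the AirBrush branch (names illustrative):

#include <cmath>

static inline float blendChannelSketch(float current, float target, float tip, float strength)
{
    float a = (1.f - std::exp2(-strength)) * tip;  // saturating opacity, as in the AirBrush case
    return current + (target - current) * a;
}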
Example #27
0
QSharedPointer<Terrain> BrushTool::tip(QPoint origin)
{
    bool needToGenerate = false;
    if (!tip_) {
        tip_ = QSharedPointer<Terrain>::create(QSize(parameters_.size, parameters_.size));
        needToGenerate = true;
    }
    if (origin.x() < -500) {
        origin = lastTipOrigin_;
    }
    if (origin != lastTipOrigin_) {
        switch (parameters_.tipType) {
        case BrushTipType::Mountains:
            needToGenerate = true;
            break;
        default:
            // position invariant
            break;
        }
    }
    if (needToGenerate) {
        Terrain *t = tip_.data();
        auto size = parameters_.size;

        float scale = 1.f / size;

        switch (parameters_.tipType) {
        case BrushTipType::Mountains:
            {
                // Set rounding mode (required by CoherentNoiseGenerator)
                SseRoundingModeScope roundingModeScope(_MM_ROUND_DOWN);
                (void) roundingModeScope;

                if (noiseGenSeed != parameters_.seed) {
                    noiseGenSeed = parameters_.seed;
                    noiseGen.randomize(static_cast<std::uint_fast32_t>(noiseGenSeed));
                }

                auto noise = noiseGen.sampler();
                __m128i originMM = _mm_setr_epi32(origin.x(), origin.y(), 0, 0);
                float noiseScale = 10.f / parameters_.scale;

                for (int y = 0; y < size; ++y) {
                    for (int x = 0; x < size; ++x) {
                        int cx = (x << 1) - size + 1;
                        int cy = (y << 1) - size + 1;
                        float sq = 1.f - sqrtf(cx * cx + cy * cy) * scale;
                        float alt;
                        if (sq <= 0.f) {
                            alt = 0.f;
                        } else {
                            auto posI = _mm_add_epi32(_mm_setr_epi32(x, y, 0, 0), originMM);
                            auto pos = _mm_cvtepi32_ps(posI);
                            pos = _mm_mul_ps(pos, _mm_set1_ps(noiseScale));
                            auto pos1 = _mm_mul_ps(pos, _mm_set1_ps(0.1f));
                            pos = _mm_unpacklo_ps(_mm_hadd_ps(pos, pos), _mm_hsub_ps(pos, pos));
                            auto pos2 = _mm_mul_ps(pos, _mm_set1_ps(0.15f));
                            auto pos3 = _mm_mul_ps(pos, _mm_set1_ps(0.3f));
                            auto pos4 = _mm_mul_ps(pos, _mm_set1_ps(0.03f));
                            float noiseVal = noise.sample(pos1);
                            noiseVal += noise.sample(pos2) * .3f;
                            noiseVal += noise.sample(pos3) * .15f;
                            noiseVal += noise.sample(pos4) * 1.5f;
                            noiseVal = std::max(std::min(0.5f + noiseVal * 1.1f, 1.f), 0.f);

                            float sqBase = sq;
                            sq *= sq * (3.f - 2.f * sq) * 0.8f;
                            sq *= sq;
                            sq -= 0.1f;
                            sq += (sqBase - sq) * std::abs(noiseVal);
                            alt = std::max(0.f, sq);
                        }
                        t->landform(x, y) = alt;
                    }
                }
            }
            break;
        case BrushTipType::Bell:
            for (int y = 0; y < size; ++y) {
                for (int x = 0; x < size; ++x) {
                    int cx = (x << 1) - size + 1;
                    int cy = (y << 1) - size + 1;
                    float sq = 1.f - sqrtf(cx * cx + cy * cy) * scale;
                    float alt;
                    if (sq <= 0.f) {
                        alt = 0.f;
                    } else {
                        sq *= sq * (3.f - 2.f * sq);
                        alt = sq;
                    }
                    t->landform(x, y) = alt;
                }
            }
            break;
        case BrushTipType::Cone:
            for (int y = 0; y < size; ++y) {
                for (int x = 0; x < size; ++x) {
                    int cx = (x << 1) - size + 1;
                    int cy = (y << 1) - size + 1;
                    float sq = 1.f - sqrtf(cx * cx + cy * cy) * scale;
                    float alt;
                    if (sq <= 0.f) {
                        alt = 0.f;
                    } else {
                        alt = sq;
                    }
                    t->landform(x, y) = alt;
                }
            }
            break;
        case BrushTipType::Sphere:
            scale *= scale;
            for (int y = 0; y < size; ++y) {
                for (int x = 0; x < size; ++x) {
                    int cx = (x << 1) - size + 1;
                    int cy = (y << 1) - size + 1;
                    float sq = 1.f - (cx * cx + cy * cy) * scale;
                    float alt;
                    if (sq <= 0.f) {
                        alt = 0.f;
                    } else {
                        alt = std::sqrt(sq);
                    }
                    t->landform(x, y) = alt;
                }
            }
            break;
        case BrushTipType::Cylinder:
            for (int y = 0; y < size; ++y) {
                for (int x = 0; x < size; ++x) {
                    int cx = (x << 1) - size + 1;
                    int cy = (y << 1) - size + 1;
                    float sq = size * size - (cx * cx + cy * cy);
                    float alt;
                    if (sq <= 0.f) {
                        alt = 0.f;
                    } else {
                        alt = 1.f;
                    }
                    t->landform(x, y) = alt;
                }
            }
            break;
        case BrushTipType::Square:
            for (int y = 0; y < size; ++y) {
                for (int x = 0; x < size; ++x) {
                    t->landform(x, y) = 1.f;
                }
            }
            break;
        }

    }
    return tip_;
}
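
The Bell tip (and the falloff shaping in the Mountains tip) uses sq * sq * (3 - 2 * sq): the smoothstep Hermite polynomial 3t² - 2t³, which is 0 at t = 0, 1 at t = 1, with zero slope at both ends. As a standalone helper (sketch):

static inline float smoothstep01(float t)  // t already clamped to [0, 1]
{
    return t * t * (3.f - 2.f * t);
}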
Example #28
0
static void compute_correction_matrix(colgate_instance_t *o)
{
	int i;

	/*
	 * Find out what the given neutral color would be in LMS space,
	 * and use that value to build a correction factor for each component
	 * so that the neutral color really becomes gray (in LMS).
	 */
	float ref_r = o->neutral_color.r * 255.0f;
	float ref_g = o->neutral_color.g * 255.0f;
	float ref_b = o->neutral_color.b * 255.0f;

	float linear_r = convert_srgb_to_linear_rgb(ref_r);
	float linear_g = convert_srgb_to_linear_rgb(ref_g);
	float linear_b = convert_srgb_to_linear_rgb(ref_b);

	float x, y, z;
	convert_linear_rgb_to_linear_xyz(linear_r, linear_g, linear_b, &x, &y, &z);

	float l, m, s;
	convert_linear_xyz_to_linear_lms(x, y, z, &l, &m, &s);

	float l_scale, m_scale, s_scale;
	compute_lms_scaling_factors(x, y, z, &l_scale, &m_scale, &s_scale);

	/*
	 * Now apply the color balance. Simply put, we find the chromaticity point
	 * for the desired white temperature, see what LMS scaling factors they
	 * would have given us, and then reverse that transform. For T=6500K,
	 * the default, this gives us nearly an identity transform (but only nearly,
	 * since the D65 illuminant does not exactly match the results of T=6500K);
	 * we normalize so that T=6500K really is a no-op.
	 */
	float white_x, white_y, white_z, l_scale_white, m_scale_white, s_scale_white;
	convert_color_temperature_to_xyz(o->color_temperature, &white_x, &white_y, &white_z);
	compute_lms_scaling_factors(white_x, white_y, white_z, &l_scale_white, &m_scale_white, &s_scale_white);

	float ref_x, ref_y, ref_z, l_scale_ref, m_scale_ref, s_scale_ref;
	convert_color_temperature_to_xyz(6500.0f, &ref_x, &ref_y, &ref_z);
	compute_lms_scaling_factors(ref_x, ref_y, ref_z, &l_scale_ref, &m_scale_ref, &s_scale_ref);

	l_scale *= l_scale_ref / l_scale_white;
	m_scale *= m_scale_ref / m_scale_white;
	s_scale *= s_scale_ref / s_scale_white;

	/*
	 * Concatenate all the different linear operations into a single 3x3 matrix.
	 * Note that since we postmultiply our vectors, the order of the matrices
	 * has to be the opposite of the execution order.
	 */
	Matrix3x3 temp, temp2, corr_matrix;
	Matrix3x3 lms_scale_matrix = {
		l_scale,    0.0f,    0.0f,
		   0.0f, m_scale,    0.0f,
		   0.0f,    0.0f, s_scale,
	};
	multiply_3x3_matrices(xyz_to_rgb_matrix, lms_to_xyz_matrix, temp);
	multiply_3x3_matrices(temp, lms_scale_matrix, temp2);
	multiply_3x3_matrices(temp2, xyz_to_lms_matrix, temp);
	multiply_3x3_matrices(temp, rgb_to_xyz_matrix, corr_matrix);

	// Scale for fixed-point, and clamp. We clamp the matrix elements
	// instead of the actual fixed-point numbers below, to make sure
	// we get consistent results over the entire range.
	for (i = 0; i < 9; ++i) {
		corr_matrix[i] *= (float)(1 << MATRIX_ELEMENT_FRAC_BITS);
		if (corr_matrix[i] < -(1 << MATRIX_ELEMENT_BITS)) {
			corr_matrix[i] = -(1 << MATRIX_ELEMENT_BITS);
		}
		if (corr_matrix[i] > (1 << MATRIX_ELEMENT_BITS) - 1) {
			corr_matrix[i] = (1 << MATRIX_ELEMENT_BITS) - 1;
		}
	}

	// Precompute some of the multiplications (after conversion from sRGB)
	// to save some time per-pixel later. Each of these contain the given color
	// converted to linear space and then multiplied by three different factors,
	// given by the matrix.
	for (i = 0; i < 256; ++i) {
		int x = convert_srgb_to_linear_rgb(i) * (float)(1 << INPUT_PIXEL_BITS);

		int r0 = lrintf(x * corr_matrix[0]);
		int r1 = lrintf(x * corr_matrix[3]);
		int r2 = lrintf(x * corr_matrix[6]);

		int g0 = lrintf(x * corr_matrix[1]);
		int g1 = lrintf(x * corr_matrix[4]);
		int g2 = lrintf(x * corr_matrix[7]);

		int b0 = lrintf(x * corr_matrix[2]);
		int b1 = lrintf(x * corr_matrix[5]);
		int b2 = lrintf(x * corr_matrix[8]);

#if __SSE2__
		o->premult_r[i] = _mm_setr_epi32(r0, r1, r2, 0);
		o->premult_g[i] = _mm_setr_epi32(g0, g1, g2, 0);
		o->premult_b[i] = _mm_setr_epi32(b0, b1, b2, 0);
#else
		o->premult_r[i][0] = r0;
		o->premult_r[i][1] = r1;
		o->premult_r[i][2] = r2;

		o->premult_g[i][0] = g0;
		o->premult_g[i][1] = g1;
		o->premult_g[i][2] = g2;

		o->premult_b[i][0] = b0;
		o->premult_b[i][1] = b1;
		o->premult_b[i][2] = b2;
#endif
	}
}
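
With the tables built above, the per-pixel 3x3 multiply collapses to three lookups and two vector adds in fixed point. A sketch of the consuming side (this helper and its shift placement are assumptions; only premult_r/g/b and MATRIX_ELEMENT_FRAC_BITS come from the code above):

#include <emmintrin.h>

/* Corrected linear components in the low three 32-bit lanes, still scaled
 * by 1 << INPUT_PIXEL_BITS after removing the matrix's fixed-point scale. */
static inline __m128i apply_correction_sse2(const colgate_instance_t *o,
                                            unsigned char r, unsigned char g, unsigned char b)
{
	__m128i acc = _mm_add_epi32(o->premult_r[r], o->premult_g[g]);
	acc = _mm_add_epi32(acc, o->premult_b[b]);
	return _mm_srai_epi32(acc, MATRIX_ELEMENT_FRAC_BITS);
}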
Example #29
0
{
	int ret;
	int i;

	__m128i template;	/* 256-bit write was worse... */
	__m128i rxdesc_fields;

	struct rte_mbuf tmp;
	/* DPDK 2.1 specific
	 * packet_type 0 (32 bits)
	 * pkt_len len (32 bits)
	 * data_len len (16 bits)
	 * vlan_tci 0 (16 bits)
	 * rss 0 (32 bits)
	 */
	rxdesc_fields = _mm_setr_epi32(0, len, len, 0);

	ret = rte_mempool_get_bulk(current_pframe_pool(),
			(void**)array, cnt);
	if (ret != 0) {
		return ret;
	}

	template = *((__m128i*)&current_template()->buf_len);

	if (cnt & 1) {
		array[cnt] = &tmp;
	}

	/* 4 at a time didn't help */
	for (i = 0; i < cnt; i+=2) {
Example #30
0
void ahd_interpolate_tile(int top, char * buffer)
{
    int row, col, tr, tc, c, val;
    const int dir[4] = { -1, 1, -width, width };
    __m128i ldiff[2], abdiff[2];
    union hvrgbpix (*rgb)[width] = (union hvrgbpix (*)[width])buffer;
    union hvrgbpix *rix;
    union rgbpix * pix;
    union hvrgbpix (*lab)[width];
    short (*lix)[8];
    char (*homo)[width][2];
    lab  = (union hvrgbpix (*)[width])(buffer + 16*width*TS);
    homo = (char  (*)[width][2])(buffer + 32*width*TS);

    const int left=2;

    if ((uintptr_t)(image+top*width)&0xf || (uintptr_t)buffer&0xf) {
        fprintf(stderr, "unaligned buffers defeat speed!\n"); abort();
    }

    /*  Interpolate green horz & vert, red and blue, and convert to CIELab:  */
    //do the first two rows of green first,
    //then one green and RGB row at a time through the tile, because R/B needs the down-right green value
    for (row=top; row < top+2 && row < height-2; row++) {
        col = left + (FC(row,left) & 1);
        for (c = FC(row,col); col < width-2; col+=2) {
            pix = (union rgbpix*)image + row*width+col;
            val = ((pix[-1].g + pix[0].c[c] + pix[1].g) * 2 - pix[-2].c[c] - pix[2].c[c]) >> 2;
            rgb[row-top][col-left].h.g = ULIM(val,pix[-1].g,pix[1].g);
            val = ((pix[-width].g + pix[0].c[c] + pix[width].g) * 2 - pix[-2*width].c[c] - pix[2*width].c[c]) >> 2;
            rgb[row-top][col-left].v.g = ULIM(val,pix[-width].g,pix[width].g);
        }
    }

    for (; row < top+TS && row < height-2; row++) {
        int rowx = row-1;

        if (FC(rowx,left+1)==1) {
            int c1 = FC(rowx+1,left+1),
                c2 = FC(rowx,left+2);

            pix = (union rgbpix*)image + row*width+left+1;
            rix = &rgb[row-top][1];

            val = ((pix[-1].g + pix[0].c[c1] + pix[1].g) * 2 - pix[-2].c[c1] - pix[2].c[c1]) >> 2;
            rix[0].h.g = ULIM(val,pix[-1].g,pix[1].g);
            val = ((pix[-width].g + pix[0].c[c1] + pix[width].g) * 2 - pix[-2*width].c[c1] - pix[2*width].c[c1]) >> 2;
            rix[0].v.g = ULIM(val,pix[-width].g,pix[width].g);
            for (col=left+1; col < width-3; col+=2) {
                pix = (union rgbpix*)image + rowx*width+col+1;

                union hvrgbpix rixr, rix0;

                rix = &rgb[rowx-top][col-left]+1;

                signed pix_diag = pix[-width-1].c[c1] + pix[-width+1].c[c1];
                signed pix_ul = pix[-width-1].c[c1];
                rixr.vec = _mm_set1_epi16(pix[-1].g);
                signed pix_lr = pix[-2].c[c2] + pix[0].c[c2];
                rix0.h.c[c2] = rix0.v.c[c2]  = pix[0].c[c2];
                pix_diag += pix[width-1].c[c1] + pix[width+1].c[c1] + 1;
                signed pix_dl = pix[width-1].c[c1];

                //fully loaded
                __m128i rix_dr =               _mm_setr_epi32(pix[width].g,       pix[width-1].c[c1], pix[1].g, pix[-width+1].c[c1]);
                rix_dr = _mm_add_epi32(rix_dr,_mm_setr_epi32(pix[width+1].c[c1],  pix[width+3].c[c1], pix[width+1].c[c1], 0));
                rix_dr = _mm_add_epi32(rix_dr,_mm_setr_epi32(pix[width+2].g,      0,                  pix[2*width+1].g, pix[3*width+1].c[c1]));
                rix_dr = _mm_mullo_epi32(rix_dr,_mm_setr_epi32(2,1,2,1));
                //half loaded
                rix_dr = _mm_hsub_epi32(rix_dr,_mm_setzero_si128());
                rix_dr = _mm_srai_epi32(rix_dr,2);
                __m128i a = _mm_setr_epi32(pix[width].g,pix[1].g,0,0);
                __m128i b = _mm_setr_epi32(pix[width+2].g,pix[2*width+1].g,0,0);
                __m128i m = _mm_min_epi32(a,b);
                __m128i M = _mm_max_epi32(a,b);
                rix_dr = _mm_min_epi32(rix_dr,M);
                rix_dr = _mm_max_epi32(rix_dr,m);

                signed pix_udr = pix_ul + pix_dl;

                signed rix0_ul = rix[-width-1].h.g;
                signed rix1_ul = rix[-width-1].v.g;
                __m128i rix_ur = _mm_setr_epi32(rix[-width+1].h.g, rix[-width+1].v.g, 0, 0);
                signed rix0_rr = rix[-2].h.g;
                signed rix1_rr = rix[-2].v.g;

                rix0.h.g = rix[0].h.g;
                rix0.v.g = rix[0].v.g;
                signed rix0_dl = rix[width-1].h.g;
                signed rix1_dl = rix[width-1].v.g;

                // fully loaded
                __m128i rix_udr = _mm_setr_epi32(rix0_ul, rix1_ul, rix0_rr, rix1_rr);
                rix_udr = _mm_add_epi32(rix_udr, _mm_setr_epi32(rix0_dl, rix1_dl, rix0.h.g, rix0.v.g));
                __m128i v2 = _mm_set_epi32(pix_lr, pix_lr, pix_udr, pix_udr);
                v2 = _mm_sub_epi32(v2, rix_udr);
                v2 = _mm_srai_epi32(v2,1);
                v2 = _mm_add_epi32(v2,_mm_cvtepu16_epi32(rixr.vec));
                v2 = _mm_max_epi32(v2, _mm_setzero_si128());
                v2 = _mm_min_epi32(v2, _mm_set1_epi32(0xffff));
                rixr.h.c[c2] = _mm_extract_epi32(v2,2);
                rixr.v.c[c2] = _mm_extract_epi32(v2,3);
                rixr.h.c[c1] = _mm_extract_epi32(v2,0);
                rixr.v.c[c1] = _mm_extract_epi32(v2,1);

                // following only uses 64 bit
                __m128i v1 = _mm_set1_epi32(pix_diag);
                v1 = _mm_sub_epi32(v1, rix_ur);
                v1 = _mm_sub_epi32(v1, rix_dr);
                v1 = _mm_sub_epi32(v1, rix_udr);
                v1 = _mm_srai_epi32(v1,2);
                v1 = _mm_add_epi32(v1, _mm_setr_epi32(rix0.h.g, rix0.v.g, 0, 0));
                v1 = _mm_max_epi32(v1, _mm_setzero_si128());
                v1 = _mm_min_epi32(v1, _mm_set1_epi32(0xffff));
                rix0.h.c[c1] = _mm_extract_epi32(v1,0);
                rix0.v.c[c1] = _mm_extract_epi32(v1,1);


                lab[rowx-top][col-left].vec = cielabv(rixr);
                lab[rowx-top][col-left+1].vec = cielabv(rix0);

                _mm_store_si128(&rix[-1].vec,rixr.vec);
                _mm_store_si128(&rix[0].vec,rix0.vec);

                rix[width+1].h.g = _mm_extract_epi32(rix_dr,0);
                rix[width+1].v.g = _mm_extract_epi32(rix_dr,1);
            }
        } else {