/* Example #1 */
/* 0 */
foo_r (float x, int i)
{
  switch (i)
    {
    case 0:
      return _mm512_setr_ps (x, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    case 1:
      return _mm512_setr_ps (0, x, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    case 2:
      return _mm512_setr_ps (0, 0, x, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    case 3:
      return _mm512_setr_ps (0, 0, 0, x, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    case 4:
      return _mm512_setr_ps (0, 0, 0, 0, x, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    case 5:
      return _mm512_setr_ps (0, 0, 0, 0, 0, x, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    case 6:
      return _mm512_setr_ps (0, 0, 0, 0, 0, 0, x, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    case 7:
      return _mm512_setr_ps (0, 0, 0, 0, 0, 0, 0, x, 0, 0, 0, 0, 0, 0, 0, 0);
    case 8:
      return _mm512_setr_ps (0, 0, 0, 0, 0, 0, 0, 0, x, 0, 0, 0, 0, 0, 0, 0);
    case 9:
      return _mm512_setr_ps (0, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 0, 0, 0, 0, 0);
    case 10:
      return _mm512_setr_ps (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 0, 0, 0, 0);
    case 11:
      return _mm512_setr_ps (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 0, 0, 0);
    case 12:
      return _mm512_setr_ps (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 0, 0);
    case 13:
      return _mm512_setr_ps (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 0);
    case 14:
      return _mm512_setr_ps (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, x, 0);
    case 15:
      return _mm512_setr_ps (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, x);
    default:
      abort ();
    }
}
/* Example #2 */
/* 0 */
foo_r (float x)
{
  return _mm512_setr_ps (x, x, x, x, x, x, x, x,
			 x, x, x, x, x, x, x, x);
}
void AVX512BW_mandelbrot(
	float Re_min, float Re_max,
	float Im_min, float Im_max,
	float threshold,
	int maxiters,
	int width, int height,
	uint8_t *data)

{
	float dRe, dIm;
	int x, y;

	__m128i* ptr = (__m128i*)data;

	// step on Re and Im axis
	dRe = (Re_max - Re_min)/width;
	dIm = (Im_max - Im_min)/height;

	// prepare vectors
	// 1. threshold
    const __m512 vec_threshold = _mm512_set1_ps(threshold);

	// 2. Cim
    __m512 Cim = _mm512_set1_ps(Im_min);

	// 3. Re advance every x iteration
    const __m512 vec_dRe = _mm512_set1_ps(16*dRe);

	// 4. Im advance every y iteration
    const __m512 vec_dIm = _mm512_set1_ps(dIm);

	// calculations
	for (y=0; y < height; y++) {

        __m512 Cre = _mm512_setr_ps(
            Re_min +  0*dRe, Re_min +  1*dRe, Re_min +  2*dRe, Re_min +  3*dRe,
            Re_min +  4*dRe, Re_min +  5*dRe, Re_min +  6*dRe, Re_min +  7*dRe,
            Re_min +  8*dRe, Re_min +  9*dRe, Re_min + 10*dRe, Re_min + 11*dRe,
            Re_min + 12*dRe, Re_min + 13*dRe, Re_min + 14*dRe, Re_min + 15*dRe
        );

		for (x=0; x < width; x+=16) {

            __m512 Xre = _mm512_setzero_ps();
            __m512 Xim = _mm512_setzero_ps();

            __m128i itercount = _mm_setzero_si128();

            int i;
            for (i=0; i < maxiters; i++) {

			    // Tre = Xre^2 - Xim^2 + Cim
                const __m512 Xre2 = _mm512_mul_ps(Xre, Xre);
                const __m512 Xim2 = _mm512_mul_ps(Xim, Xim);
                const __m512 Tre  = _mm512_add_ps(Cre, _mm512_sub_ps(Xre2, Xim2));

			    // Tim = 2*Xre*Xim + Cre
                const __m512 t1  = _mm512_mul_ps(Xre, Xim);
                const __m512 Tim = _mm512_add_ps(Cim, _mm512_add_ps(t1, t1));

                // sqr_dist = Tre^2 + Tim^2
                __m512 Tre2 = _mm512_mul_ps(Tre, Tre);
                __m512 Tim2 = _mm512_mul_ps(Tim, Tim);
                __m512 sqr_dist = _mm512_add_ps(Tre2, Tim2);

                // sqr_dist < threshold => 16-bit mask
                __mmask16 mask = _mm512_cmp_ps_mask(sqr_dist, vec_threshold, _CMP_LE_OS);
                if (mask == 0) {
                    break;
                }

                // Note: unlike SSE/AVX2 versions itercount is a packed byte vector,
                //       thus conversion packed dword -> byte is not needed.
                itercount = _mm_sub_epi8(itercount, _mm_movm_epi8(mask));

                Xre = Tre;
                Xim = Tim;

            } // for

            *ptr++ = itercount;

			// advance Cre vector
            Cre = _mm512_add_ps(Cre, vec_dRe);
		}

		// advance Cim vector
        Cim = _mm512_add_ps(Cim, vec_dIm);
	}
}