/**
 * Element-wise quotient of two short_vec<float, 32> values.
 *
 * The division is carried out as a multiplication by
 * _mm512_rcp14_ps(other), so the result has the accuracy of that
 * reciprocal approximation rather than of an exact divide.
 */
inline
 short_vec<float, 32> operator/(const short_vec<float, 32>& other) const
 {
     const __m512 inv1 = _mm512_rcp14_ps(other.val1);
     const __m512 inv2 = _mm512_rcp14_ps(other.val2);
     const __m512 quot1 = _mm512_mul_ps(val1, inv1);
     const __m512 quot2 = _mm512_mul_ps(val2, inv2);
     return short_vec<float, 32>(quot1, quot2);
 }
 /**
  * Element-wise product of two short_vec<float, 32> values; each of the
  * two 512-bit halves is multiplied independently.
  */
 inline
 short_vec<float, 32> operator*(const short_vec<float, 32>& other) const
 {
     const __m512 prod1 = _mm512_mul_ps(val1, other.val1);
     const __m512 prod2 = _mm512_mul_ps(val2, other.val2);
     return short_vec<float, 32>(prod1, prod2);
 }
	/*
	 * Per-lane reciprocal of a packed complex value, treating xvec/yvec as
	 * the real/imaginary parts:
	 *     1 / (x + iy) = (x - iy) / (x^2 + y^2)
	 * The division is done by multiplying with _mm512_rcp23_ps(x^2 + y^2).
	 */
	static inline mic_m512c_t mic_rcp_cps(mic_m512c_t a) {
		mic_m512_t re_sq = _mm512_mul_ps(a.xvec, a.xvec);
		mic_m512_t im_sq = _mm512_mul_ps(a.yvec, a.yvec);
		mic_m512_t inv_norm = _mm512_rcp23_ps(_mm512_add_ps(re_sq, im_sq));

		mic_m512_t im_scaled = _mm512_mul_ps(a.yvec, inv_norm);

		mic_m512c_t inv;
		inv.xvec = _mm512_mul_ps(a.xvec, inv_norm);
		/* The imaginary part is negated by subtracting it from zero. */
		inv.yvec = _mm512_sub_ps(_mm512_setzero_ps(), im_scaled);
		return inv;
	} // mic_rcp_cps()
/*
 * Computes, for every lane of x, two results at once and stores them
 * through s and c (named for sine and cosine).
 *
 * Outline of what the code below does:
 *   1. split x into sign bits and the remaining magnitude bits,
 *   2. scale the magnitude by _ps_cephes_FOPI and turn it into an integer
 *      index, then reduce x by index * _ps_minus_cephes_DP123,
 *   3. evaluate two polynomials of the reduced argument,
 *   4. per lane, keep one polynomial for *s and the other for *c,
 *   5. xor the computed sign bits onto both results.
 *
 * NOTE(review): all _ps_* / _pi32_* constants are defined outside this
 * block; the meanings given in the comments below are inferred from their
 * names (Cephes-style sincos) and should be confirmed against their
 * definitions.
 */
inline void mic_sincos_ps(mic_m512_t x, mic_m512_t *s, mic_m512_t *c) {
	// Keep the bits selected by _pi32_sign_mask (presumably the IEEE sign
	// bit) for the final sign of *s, then clear them from x (presumably |x|).
	__m512i sign_bit = _mm512_and_epi32(_mm512_castps_si512(x), _pi32_sign_mask);
	x = _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(x), _pi32_inv_sign_mask));

	// Scale by _ps_cephes_FOPI (presumably 4/pi).
	mic_m512_t y = _mm512_mul_ps(x, _ps_cephes_FOPI);

	// Integer index: convert, add _pi32_1, mask with _pi32_inv1 (presumably
	// ~1, i.e. round up to an even value), and convert back to float in y.
	__m512i emm2 = _mm512_cvtfxpnt_round_adjustps_epi32(y, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE);
	emm2 = _mm512_add_epi32(emm2, _pi32_1);
	emm2 = _mm512_and_epi32(emm2, _pi32_inv1);
	y = _mm512_cvtfxpnt_round_adjustepu32_ps(emm2, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE);

	// The *c path uses the same index shifted down by _pi32_2.
	__m512i cos_emm2 = _mm512_sub_epi32(emm2, _pi32_2);

	// Sign-flip bits: the index bit selected by _pi32_4 (inverted for the
	// *c path via andnot) is shifted left by 29; if _pi32_4 is the value 4
	// this moves bit 2 into bit 31. For *s it is folded into sign_bit.
	__m512i emm0 = _mm512_and_epi32(emm2, _pi32_4);
	__m512i cos_emm0 = _mm512_andnot_epi32(cos_emm2, _pi32_4);
	emm0 = _mm512_slli_epi32(emm0, 29);
	cos_emm0 = _mm512_slli_epi32(cos_emm0, 29);
	sign_bit = _mm512_xor_epi32(sign_bit, emm0);

	// Polynomial selectors: lanes whose index has the _pi32_2 bit clear get
	// _pi32_ffff (presumably all-ones), every other lane gets _pi32_0.
	emm2 = _mm512_and_epi32(emm2, _pi32_2);
	cos_emm2 = _mm512_and_epi32(cos_emm2, _pi32_2);
	__mmask16 mask = _mm512_cmp_epi32_mask(emm2, _pi32_0, _MM_CMPINT_EQ);
	emm2 = _mm512_mask_add_epi32(_pi32_0, mask, _pi32_ffff, _pi32_0);
	__mmask16 cos_mask = _mm512_cmp_epi32_mask(cos_emm2, _pi32_0, _MM_CMPINT_EQ);
	cos_emm2 = _mm512_mask_add_epi32(_pi32_0, cos_mask, _pi32_ffff, _pi32_0);

	// Argument reduction: x = y * _ps_minus_cephes_DP123 + x.
	x = _mm512_fmadd_ps(y, _ps_minus_cephes_DP123, x);

	mic_m512_t x2 = _mm512_mul_ps(x, x);
	mic_m512_t x3 = _mm512_mul_ps(x2, x);
	mic_m512_t x4 = _mm512_mul_ps(x2, x2);

	// y  = ((coscof_p0*x2 + coscof_p1)*x2 + coscof_p2)*x4 - (0.5*x2 - 1)
	y = _mm512_fmadd_ps(_ps_coscof_p0, x2, _ps_coscof_p1);
	y = _mm512_fmadd_ps(y, x2, _ps_coscof_p2);
	mic_m512_t temp_2 = _mm512_fmsub_ps(x2, _ps_0point5, _ps_1);
	y = _mm512_fmsub_ps(y, x4, temp_2);
	// y2 = ((sincof_p0*x2 + sincof_p1)*x2 + sincof_p2)*x3 + x
	mic_m512_t y2 = _mm512_fmadd_ps(_ps_sincof_p0, x2, _ps_sincof_p1);
	y2 = _mm512_fmadd_ps(y2, x2, _ps_sincof_p2);
	y2 = _mm512_fmadd_ps(y2, x3, x);

	// Both outputs start from the same pair of polynomial values.
	mic_m512_t cos_y = y;
	mic_m512_t cos_y2 = y2;

	// Per lane, the selector zeroes one polynomial (andnot) and keeps the
	// other (and), so the additions below merely merge the survivors.
	y = _mm512_castsi512_ps(_mm512_andnot_epi32(emm2, _mm512_castps_si512(y)));
	cos_y = _mm512_castsi512_ps(_mm512_andnot_epi32(cos_emm2, _mm512_castps_si512(cos_y)));
	y2 = _mm512_castsi512_ps(_mm512_and_epi32(emm2, _mm512_castps_si512(y2)));
	cos_y2 = _mm512_castsi512_ps(_mm512_and_epi32(cos_emm2, _mm512_castps_si512(cos_y2)));

	y = _mm512_add_ps(y, y2);
	cos_y = _mm512_add_ps(cos_y, cos_y2);

	// Apply the sign bits computed above.
	*s = _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(y), sign_bit));
	*c = _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(cos_y), cos_emm0));
} // mic_sincos_ps()
// exp()
/*
 * Per-lane single-precision approximation of exp(x).
 *
 * Steps:
 *   1. clamp x to [_ps_exp_lo, _ps_exp_hi];
 *   2. n = floor(x * _ps_cephes_LOG2EF + 0.5);
 *   3. r = x - n * _ps_cephes_exp_C12;
 *   4. evaluate the degree-7 polynomial
 *        1 + r + p5*r^2 + p4*r^3 + p3*r^4 + p2*r^5 + p1*r^6 + p0*r^7;
 *   5. multiply by 2^n, built by placing n + _pi32_0x7f in the float
 *      exponent field (shift left by 23).
 *
 * NOTE(review): the _ps_* / _pi32_* constants are defined outside this
 * block; presumably LOG2EF = log2(e), exp_C12 = ln(2), _pi32_0x7f = 127
 * and p0..p5 are the Cephes expf coefficients -- confirm against their
 * definitions. _ps_cephes_exp_p3 is assumed to be defined next to p0..p5.
 */
inline mic_m512_t mic_exp_ps(mic_m512_t x) {
	x = _mm512_min_ps(x, _ps_exp_hi);
	x = _mm512_max_ps(x, _ps_exp_lo);

	// n = floor(x * LOG2EF + 0.5): round to nearest, then step down by one
	// in the lanes where the rounded value ended up above the input.
	mic_m512_t temp_2 = _mm512_fmadd_ps(x, _ps_cephes_LOG2EF, _ps_0point5);
	mic_m512_t temp_1 = _mm512_round_ps(temp_2, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE);
	mic_m512_t temp_3 = _mm512_sub_ps(temp_1, temp_2);
	__mmask16 mask = _mm512_cmp_ps_mask(temp_3, _ps_0, _MM_CMPINT_GT);
	temp_2 = _mm512_mask_sub_ps(temp_1, mask, temp_1, _ps_1);
	__m512i emm0 = _mm512_cvtfxpnt_round_adjustps_epi32(temp_2, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE);

	// r = x - n * C12
	x = _mm512_fnmadd_ps(temp_2, _ps_cephes_exp_C12, x);

	mic_m512_t x2 = _mm512_mul_ps(x, x);
	mic_m512_t x3 = _mm512_mul_ps(x2, x);
	mic_m512_t x4 = _mm512_mul_ps(x2, x2);

	// Low-order part: 1 + r + p5*r^2 + p4*r^3
	temp_1 = _mm512_add_ps(x, _ps_1);
	temp_1 = _mm512_fmadd_ps(x2, _ps_cephes_exp_p5, temp_1);
	temp_1 = _mm512_fmadd_ps(x3, _ps_cephes_exp_p4, temp_1);

	// High-order part: ((p0*r^3 + p1*r^2) + (p2*r + p3)) * r^4.
	// The second pair must be p2*r + p3; adding p1*r^2 again here would
	// double the r^6 term and drop the r^4 term from the series.
	temp_2 = _mm512_mul_ps(x3, _ps_cephes_exp_p0);
	temp_3 = _mm512_mul_ps(x2, _ps_cephes_exp_p1);
	mic_m512_t temp_4 = _mm512_mul_ps(x, _ps_cephes_exp_p2);
	temp_2 = _mm512_add_ps(temp_2, temp_3);
	temp_3 = _mm512_add_ps(temp_4, _ps_cephes_exp_p3);
	temp_2 = _mm512_add_ps(temp_2, temp_3);
	temp_2 = _mm512_mul_ps(temp_2, x4);

	// 2^n as a float: biased exponent shifted into bits 23..30.
	emm0 = _mm512_add_epi32(emm0, _pi32_0x7f);
	emm0 = _mm512_slli_epi32(emm0, 23);
	mic_m512_t pow2n = _mm512_castsi512_ps(emm0);

	mic_m512_t y = _mm512_add_ps(temp_1, temp_2);

	y = _mm512_mul_ps(y, pow2n);
	return y;
} // mic_exp_ps()
/*
 * Dot product of x1 and x2 over len floats using 512-bit unaligned loads.
 *
 * len must be a multiple of 16 (asserted). Products are accumulated in 16
 * independent lanes; the lanes are then summed in order, lane 0 first.
 */
__attribute__((noinline)) float dot512(float *x1, float *x2, size_t len) {
  assert(len % 16 == 0);

  const size_t blocks = len / 16;
  __m512 acc = _mm512_setzero_ps();
  for (size_t b = 0; b < blocks; ++b) {
    const size_t off = b * 16;
    const __m512 lhs = _mm512_loadu_ps(x1 + off);
    const __m512 rhs = _mm512_loadu_ps(x2 + off);
    acc = _mm512_add_ps(acc, _mm512_mul_ps(lhs, rhs));
  }

  /* Horizontal sum, kept strictly left to right. */
  float lanes[16];
  _mm512_storeu_ps(lanes, acc);
  float total = lanes[0];
  for (int k = 1; k < 16; ++k) {
    total += lanes[k];
  }
  return total;
}
Exemple #7
0
/**
 * Multiply matrix A by matrix B and accumulate the product into matrix C
 * (C += A * B). All matrices are dense, row-major float arrays:
 * -----------------
 * A		M * K
 * B		K * N
 * C		M * N
 * -----------------
 * Matrix C must be allocated by the caller.
 * Matrix C is not cleared in this function, so the product is added on top
 * of whatever C already contains; zero it first to get a plain product.
 *
 * When __MIC__ is defined, each row of C is updated 16 floats at a time
 * with unaligned 512-bit loads/stores; the columns left over after the
 * last full group of 16 (and every column on other targets) are handled by
 * a scalar loop.
 */
void highEfficentMatrixMultiply(float* C, float* A, float* B, int M, int K, int N)
{
	int i,j,k;
	for(i=0; i < M; i++)
	{
		float *c_row = &C[i*N];
		for(k = 0; k < K; k++)
		{
			const float a_ik = A[i*K + k];
			const float *b_row = &B[k*N];
			j = 0;

		#ifdef __MIC__
			/* Code Run on Xeon Phi */
			{
				/* Broadcast A[i][k] to all 16 lanes. */
				const __m512 a_bcast = _mm512_set_1to16_ps(a_ik);
				__m512 b_vec = _mm512_setzero_ps();
				__m512 c_vec = _mm512_setzero_ps();

				/*
				 * "<=" so that the final group of 16 columns is also
				 * processed when N is an exact multiple of 16.
				 */
				for(; j+16 <= N; j += 16)
				{
					/* Unaligned load: low part, then the part past the 64-byte boundary. */
					b_vec = _mm512_loadunpacklo_ps(b_vec,(const void*)(&b_row[j]));
					b_vec = _mm512_loadunpackhi_ps(b_vec,(const void*)(&b_row[j + 16]));
					c_vec = _mm512_loadunpacklo_ps(c_vec,(const void*)(&c_row[j]));
					c_vec = _mm512_loadunpackhi_ps(c_vec,(const void*)(&c_row[j + 16]));

					c_vec = _mm512_add_ps(c_vec,_mm512_mul_ps(a_bcast,b_vec));

					/* Matching unaligned store. */
					_mm512_packstorelo_ps((void*)(&c_row[j]),c_vec);
					_mm512_packstorehi_ps((void*)(&c_row[j + 16]),c_vec);
				}
			}
		#endif

			/* Scalar path: the row tail on Xeon Phi, the whole row on Xeon. */
			#pragma ivdep
			for(; j < N; j++)
				c_row[j] += a_ik * b_row[j];
		}
	}
}
/*
 * N-body benchmark driver.
 *
 * Builds N random bodies (positions x,y,z and masses m) and, for every body
 * i, accumulates over all bodies j
 *     p[i]  += m[j] / sqrt(|r_j - r_i|^2 + EPS2)
 *     a*[i] += m[j] * (r_j - r_i) / (|r_j - r_i|^2 + EPS2)^(3/2)
 * using four variants of the same double loop, timing each one:
 *   1. intrinsics, 16 targets (i) per vector,
 *   2. intrinsics, 16 sources (j) per vector,
 *   3. scalar code with "#pragma simd" on the target loop,
 *   4. scalar code with "#pragma simd" on the source loop.
 * The reported GFlops figure uses OPS = 20 * N * N * 1e-9 (presumably 20
 * floating-point operations counted per pair).
 *
 * Returns 0 on success, 1 if the work arrays cannot be allocated.
 */
int main(void) {
    // Initialize
    int N = 1 << 16;
    int NALIGN = 64;
    int i, j;
    float OPS = 20. * N * N * 1e-9;
    float EPS2 = 1e-6;
    double tic, toc;
    float * x = (float*) _mm_malloc(N * sizeof(float), NALIGN);
    float * y = (float*) _mm_malloc(N * sizeof(float), NALIGN);
    float * z = (float*) _mm_malloc(N * sizeof(float), NALIGN);
    float * m = (float*) _mm_malloc(N * sizeof(float), NALIGN);
    float * p = (float*) _mm_malloc(N * sizeof(float), NALIGN);
    float * ax = (float*) _mm_malloc(N * sizeof(float), NALIGN);
    float * ay = (float*) _mm_malloc(N * sizeof(float), NALIGN);
    float * az = (float*) _mm_malloc(N * sizeof(float), NALIGN);

    // Every kernel below dereferences all eight arrays, so bail out early
    // if any allocation failed.
    if (!x || !y || !z || !m || !p || !ax || !ay || !az) {
        float * bufs[8] = { x, y, z, m, p, ax, ay, az };
        int b;
        fprintf(stderr, "failed to allocate work arrays for N = %d\n", N);
        for (b = 0; b < 8; b++) {
            if (bufs[b]) _mm_free(bufs[b]);
        }
        return 1;
    }

    // Serial on purpose: drand48() keeps hidden global state and is not
    // required to be thread-safe, so calling it from an OpenMP loop is a
    // data race.
    for (i=0; i<N; i++) {
        x[i] = drand48();
        y[i] = drand48();
        z[i] = drand48();
        m[i] = drand48() / N;
        p[i] = ax[i] = ay[i] = az[i] = 0;
    }
    printf("N : %d\n",N);

    #pragma omp parallel private(j)
    {
        #pragma omp single
        tic = get_time();
        // Vectorize target with intrinsics
        #pragma omp for
        for (i=0; i<N; i+=16) {
            __m512 pi = _mm512_setzero_ps();
            __m512 axi = _mm512_setzero_ps();
            __m512 ayi = _mm512_setzero_ps();
            __m512 azi = _mm512_setzero_ps();
            // 16 consecutive targets per vector; each source is broadcast.
            __m512 xi = _mm512_load_ps(x+i);
            __m512 yi = _mm512_load_ps(y+i);
            __m512 zi = _mm512_load_ps(z+i);
            for (j=0; j<N; j++) {
                __m512 xj = _mm512_set1_ps(x[j]);
                xj = _mm512_sub_ps(xj, xi);
                __m512 yj = _mm512_set1_ps(y[j]);
                yj = _mm512_sub_ps(yj, yi);
                __m512 zj = _mm512_set1_ps(z[j]);
                zj = _mm512_sub_ps(zj, zi);
                // R2 = dx^2 + dy^2 + dz^2 + EPS2
                __m512 R2 = _mm512_set1_ps(EPS2);
                R2 = _mm512_fmadd_ps(xj, xj, R2);
                R2 = _mm512_fmadd_ps(yj, yj, R2);
                R2 = _mm512_fmadd_ps(zj, zj, R2);
                __m512 mj = _mm512_set1_ps(m[j]);
                __m512 invR = _mm512_rsqrt23_ps(R2);
                // mj becomes m/R; invR becomes m/R^3 after the two multiplies.
                mj = _mm512_mul_ps(mj, invR);
                pi = _mm512_add_ps(pi, mj);
                invR = _mm512_mul_ps(invR, invR);
                invR = _mm512_mul_ps(invR, mj);
                axi = _mm512_fmadd_ps(xj, invR, axi);
                ayi = _mm512_fmadd_ps(yj, invR, ayi);
                azi = _mm512_fmadd_ps(zj, invR, azi);
            }
            _mm512_store_ps(p+i, pi);
            _mm512_store_ps(ax+i, axi);
            _mm512_store_ps(ay+i, ayi);
            _mm512_store_ps(az+i, azi);
        }
        #pragma omp single
        {
            toc = get_time();
            printf("Vectorize target with intrinsics : %e s : %lf GFlops\n",toc-tic, OPS/(toc-tic));

            // Vectorize source with intrinsics
            tic = get_time();
        }
        #pragma omp for
        for (i=0; i<N; i++) {
            __m512 pi = _mm512_setzero_ps();
            __m512 axi = _mm512_setzero_ps();
            __m512 ayi = _mm512_setzero_ps();
            __m512 azi = _mm512_setzero_ps();
            // One target broadcast to all lanes; 16 sources per vector.
            __m512 xi = _mm512_set1_ps(x[i]);
            __m512 yi = _mm512_set1_ps(y[i]);
            __m512 zi = _mm512_set1_ps(z[i]);
            for (j=0; j<N; j+=16) {
                __m512 xj = _mm512_load_ps(x+j);
                xj = _mm512_sub_ps(xj, xi);
                __m512 yj = _mm512_load_ps(y+j);
                yj = _mm512_sub_ps(yj, yi);
                __m512 zj = _mm512_load_ps(z+j);
                zj = _mm512_sub_ps(zj, zi);
                __m512 R2 = _mm512_set1_ps(EPS2);
                R2 = _mm512_fmadd_ps(xj, xj, R2);
                R2 = _mm512_fmadd_ps(yj, yj, R2);
                R2 = _mm512_fmadd_ps(zj, zj, R2);
                __m512 mj = _mm512_load_ps(m+j);
                __m512 invR = _mm512_rsqrt23_ps(R2);
                mj = _mm512_mul_ps(mj, invR);
                pi = _mm512_add_ps(pi, mj);
                invR = _mm512_mul_ps(invR, invR);
                invR = _mm512_mul_ps(invR, mj);
                axi = _mm512_fmadd_ps(xj, invR, axi);
                ayi = _mm512_fmadd_ps(yj, invR, ayi);
                azi = _mm512_fmadd_ps(zj, invR, azi);
            }
            // Collapse the 16 partial sums into the scalar result for body i.
            p[i] = _mm512_reduce_add_ps(pi);
            ax[i] = _mm512_reduce_add_ps(axi);
            ay[i] = _mm512_reduce_add_ps(ayi);
            az[i] = _mm512_reduce_add_ps(azi);
        }
        #pragma omp single
        {
            toc = get_time();
            printf("Vectorize source with intrinsics : %e s : %lf GFlops\n",toc-tic, OPS/(toc-tic));

            // Vectorize target with pragma simd
            tic = get_time();
        }
#pragma simd
        #pragma omp for
        for (i=0; i<N; i++) {
            float pi = 0;
            float axi = 0;
            float ayi = 0;
            float azi = 0;
            float xi = x[i];
            float yi = y[i];
            float zi = z[i];
            for (j=0; j<N; j++) {
                float dx = x[j] - xi;
                float dy = y[j] - yi;
                float dz = z[j] - zi;
                float R2 = dx * dx + dy * dy + dz * dz + EPS2;
                float invR = 1.0f / sqrtf(R2);
                float invR3 = m[j] * invR * invR * invR;
                pi += m[j] * invR;
                axi += dx * invR3;
                ayi += dy * invR3;
                azi += dz * invR3;
            }
            p[i] = pi;
            ax[i] = axi;
            ay[i] = ayi;
            az[i] = azi;
        }
        #pragma omp single
        {
            toc = get_time();
            printf("Vectorize target with pragma simd: %e s : %lf GFlops\n",toc-tic, OPS/(toc-tic));

            // Vectorize source with pragma simd
            tic = get_time();
        }
        #pragma omp for
        for (i=0; i<N; i++) {
            float pi = 0;
            float axi = 0;
            float ayi = 0;
            float azi = 0;
            float xi = x[i];
            float yi = y[i];
            float zi = z[i];
#pragma simd
            for (j=0; j<N; j++) {
                float dx = x[j] - xi;
                float dy = y[j] - yi;
                float dz = z[j] - zi;
                float R2 = dx * dx + dy * dy + dz * dz + EPS2;
                float invR = 1.0f / sqrtf(R2);
                float invR3 = m[j] * invR * invR * invR;
                pi += m[j] * invR;
                axi += dx * invR3;
                ayi += dy * invR3;
                azi += dz * invR3;
            }
            p[i] = pi;
            ax[i] = axi;
            ay[i] = ayi;
            az[i] = azi;
        }
        #pragma omp single
        {
            toc = get_time();
            printf("Vectorize source with pragma simd: %e s : %lf GFlops\n",toc-tic, OPS/(toc-tic));
        }
    }

    _mm_free(x);
    _mm_free(y);
    _mm_free(z);
    _mm_free(m);
    _mm_free(p);
    _mm_free(ax);
    _mm_free(ay);
    _mm_free(az);
    return 0;
}
	/*
	 * Per-lane product of two packed complex values, treating xvec/yvec as
	 * the real/imaginary parts:
	 *     (ax + i*ay) * (bx + i*by) = (ax*bx - ay*by) + i*(ax*by + ay*bx)
	 */
	static inline mic_m512c_t mic_mul_ccps(mic_m512c_t a, mic_m512c_t b) {
		mic_m512_t re_re = _mm512_mul_ps(a.xvec, b.xvec);
		mic_m512_t im_im = _mm512_mul_ps(a.yvec, b.yvec);
		mic_m512_t re_im = _mm512_mul_ps(a.xvec, b.yvec);
		mic_m512_t im_re = _mm512_mul_ps(a.yvec, b.xvec);

		mic_m512c_t prod;
		prod.xvec = _mm512_sub_ps(re_re, im_im);
		prod.yvec = _mm512_add_ps(re_im, im_re);
		return prod;
	} // mic_mul_ccps()
	/*
	 * Per-lane product of a packed complex value and a packed real value:
	 * both components of a are scaled by b.
	 */
	static inline mic_m512c_t mic_mul_crps(mic_m512c_t a, mic_m512_t b) {
		mic_m512_t scaled_x = _mm512_mul_ps(a.xvec, b);
		mic_m512_t scaled_y = _mm512_mul_ps(a.yvec, b);

		mic_m512c_t prod;
		prod.xvec = scaled_x;
		prod.yvec = scaled_y;
		return prod;
	} // mic_mul_crps()
	/* Per-lane product of two packed real values. */
	static inline mic_m512_t mic_mul_rrps(mic_m512_t a, mic_m512_t b) {
		mic_m512_t prod = _mm512_mul_ps(a, b);
		return prod;
	} // mic_mul_rrps()
 // Element-wise single-precision product of two batches.
 static batch_type mul(const batch_type& lhs, const batch_type& rhs)
 {
     const batch_type product = _mm512_mul_ps(lhs, rhs);
     return product;
 }
/*
 * Per-lane cosine approximation.
 *
 * The sign bits of x are masked off, the magnitude is scaled by
 * _ps_cephes_FOPI and turned into an integer index, x is reduced by
 * index * _ps_minus_cephes_DP123, two polynomials of the reduced argument
 * are evaluated, and each lane keeps one of them (chosen from the index)
 * with a sign flip (also derived from the index) applied at the end.
 *
 * NOTE(review): the _ps_* / _pi32_* constants are defined outside this
 * block; the roles described in the comments are inferred from their names
 * and should be confirmed against their definitions.
 */
static inline mic_m512_t mic_cos_ps(mic_m512_t x) {
	/* Mask with _pi32_inv_sign_mask (presumably clears the sign bit). */
	const mic_m512_t ax = _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(x), _pi32_inv_sign_mask));

	/* Integer index: convert, add _pi32_1, mask with _pi32_inv1. */
	const mic_m512_t scaled = _mm512_mul_ps(ax, _ps_cephes_FOPI);
	__m512i idx = _mm512_cvtfxpnt_round_adjustps_epi32(scaled, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE);
	idx = _mm512_and_epi32(_mm512_add_epi32(idx, _pi32_1), _pi32_inv1);
	const mic_m512_t idx_f = _mm512_cvtfxpnt_round_adjustepu32_ps(idx, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE);

	/* From here on the index is offset by _pi32_2. */
	idx = _mm512_sub_epi32(idx, _pi32_2);

	/* Sign flip: inverted _pi32_4 bit of the index, shifted left by 29. */
	const __m512i sign_flip = _mm512_slli_epi32(_mm512_andnot_epi32(idx, _pi32_4), 29);

	/* Selector: _pi32_ffff in lanes whose _pi32_2 bit is clear, else _pi32_0. */
	const __mmask16 bit_clear = _mm512_cmp_epi32_mask(_mm512_and_epi32(idx, _pi32_2), _pi32_0, _MM_CMPINT_EQ);
	const __m512i sel = _mm512_mask_add_epi32(_pi32_0, bit_clear, _pi32_ffff, _pi32_0);

	/* Reduced argument r = |x| + index * _ps_minus_cephes_DP123. */
	const mic_m512_t r = _mm512_add_ps(ax, _mm512_mul_ps(idx_f, _ps_minus_cephes_DP123));
	const mic_m512_t r2 = _mm512_mul_ps(r, r);
	const mic_m512_t r3 = _mm512_mul_ps(r2, r);
	const mic_m512_t r4 = _mm512_mul_ps(r2, r2);

	/* ((coscof_p0*r2 + coscof_p1)*r2 + coscof_p2)*r4 - (0.5*r2 - 1) */
	mic_m512_t cpoly = _mm512_mul_ps(_ps_coscof_p0, r2);
	cpoly = _mm512_add_ps(cpoly, _ps_coscof_p1);
	cpoly = _mm512_mul_ps(cpoly, r2);
	cpoly = _mm512_add_ps(cpoly, _ps_coscof_p2);
	cpoly = _mm512_mul_ps(cpoly, r4);
	const mic_m512_t half_r2_m1 = _mm512_sub_ps(_mm512_mul_ps(r2, _ps_0point5), _ps_1);
	cpoly = _mm512_sub_ps(cpoly, half_r2_m1);

	/* ((sincof_p0*r2 + sincof_p1)*r2 + sincof_p2)*r3 + r */
	mic_m512_t spoly = _mm512_mul_ps(_ps_sincof_p0, r2);
	spoly = _mm512_add_ps(spoly, _ps_sincof_p1);
	spoly = _mm512_mul_ps(spoly, r2);
	spoly = _mm512_add_ps(spoly, _ps_sincof_p2);
	spoly = _mm512_mul_ps(spoly, r3);
	spoly = _mm512_add_ps(spoly, r);

	/* The selector clears one polynomial per lane; adding merges the rest. */
	const mic_m512_t kept_c = _mm512_castsi512_ps(_mm512_andnot_epi32(sel, _mm512_castps_si512(cpoly)));
	const mic_m512_t kept_s = _mm512_castsi512_ps(_mm512_and_epi32(sel, _mm512_castps_si512(spoly)));
	const mic_m512_t merged = _mm512_add_ps(kept_c, kept_s);

	return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(merged), sign_flip));
} // mic_cos_ps()
void AVX512BW_mandelbrot(
	float Re_min, float Re_max,
	float Im_min, float Im_max,
	float threshold,
	int maxiters,
	int width, int height,
	uint8_t *data)

{
	float dRe, dIm;
	int x, y;

	__m128i* ptr = (__m128i*)data;

	// step on Re and Im axis
	dRe = (Re_max - Re_min)/width;
	dIm = (Im_max - Im_min)/height;

	// prepare vectors
	// 1. threshold
    const __m512 vec_threshold = _mm512_set1_ps(threshold);

	// 2. Cim
    __m512 Cim = _mm512_set1_ps(Im_min);

	// 3. Re advance every x iteration
    const __m512 vec_dRe = _mm512_set1_ps(16*dRe);

	// 4. Im advance every y iteration
    const __m512 vec_dIm = _mm512_set1_ps(dIm);

	// calculations
	for (y=0; y < height; y++) {

        __m512 Cre = _mm512_setr_ps(
            Re_min +  0*dRe, Re_min +  1*dRe, Re_min +  2*dRe, Re_min +  3*dRe,
            Re_min +  4*dRe, Re_min +  5*dRe, Re_min +  6*dRe, Re_min +  7*dRe,
            Re_min +  8*dRe, Re_min +  9*dRe, Re_min + 10*dRe, Re_min + 11*dRe,
            Re_min + 12*dRe, Re_min + 13*dRe, Re_min + 14*dRe, Re_min + 15*dRe
        );

		for (x=0; x < width; x+=16) {

            __m512 Xre = _mm512_setzero_ps();
            __m512 Xim = _mm512_setzero_ps();

            __m128i itercount = _mm_setzero_si128();

            int i;
            for (i=0; i < maxiters; i++) {

			    // Tre = Xre^2 - Xim^2 + Cim
                const __m512 Xre2 = _mm512_mul_ps(Xre, Xre);
                const __m512 Xim2 = _mm512_mul_ps(Xim, Xim);
                const __m512 Tre  = _mm512_add_ps(Cre, _mm512_sub_ps(Xre2, Xim2));

			    // Tim = 2*Xre*Xim + Cre
                const __m512 t1  = _mm512_mul_ps(Xre, Xim);
                const __m512 Tim = _mm512_add_ps(Cim, _mm512_add_ps(t1, t1));

                // sqr_dist = Tre^2 + Tim^2
                __m512 Tre2 = _mm512_mul_ps(Tre, Tre);
                __m512 Tim2 = _mm512_mul_ps(Tim, Tim);
                __m512 sqr_dist = _mm512_add_ps(Tre2, Tim2);

                // sqr_dist < threshold => 16-bit mask
                __mmask16 mask = _mm512_cmp_ps_mask(sqr_dist, vec_threshold, _CMP_LE_OS);
                if (mask == 0) {
                    break;
                }

                // Note: unlike SSE/AVX2 versions itercount is a packed byte vector,
                //       thus conversion packed dword -> byte is not needed.
                itercount = _mm_sub_epi8(itercount, _mm_movm_epi8(mask));

                Xre = Tre;
                Xim = Tim;

            } // for

            *ptr++ = itercount;

			// advance Cre vector
            Cre = _mm512_add_ps(Cre, vec_dRe);
		}

		// advance Cim vector
        Cim = _mm512_add_ps(Cim, vec_dIm);
	}
}
 /**
  * In-place element-wise division by other, carried out as a
  * multiplication by _mm512_rcp14_ps(other); the result therefore has the
  * accuracy of that reciprocal approximation.
  */
 inline
 void operator/=(const short_vec<float, 32>& other)
 {
     const __m512 inv1 = _mm512_rcp14_ps(other.val1);
     const __m512 inv2 = _mm512_rcp14_ps(other.val2);
     val1 = _mm512_mul_ps(val1, inv1);
     val2 = _mm512_mul_ps(val2, inv2);
 }
 /**
  * In-place element-wise multiplication by other; both 512-bit halves are
  * updated.
  */
 inline
 void operator*=(const short_vec<float, 32>& other)
 {
     const __m512 prod1 = _mm512_mul_ps(val1, other.val1);
     const __m512 prod2 = _mm512_mul_ps(val2, other.val2);
     val1 = prod1;
     val2 = prod2;
 }