Example #1
double Atomtype::CalcPE(int frame_i, const Trajectory &trj, const coordinates &rand_xyz, const cubicbox_m256 &box, double vol) const
{
    float pe = 0.0;
    int atom_i = 0;

    /* BEGIN SIMD SECTION */
    // This performs the same calculation as the scalar loop after the SIMD
    // section, but on 8 atoms at a time using AVX instructions.

    coordinates8 rand_xyz8(rand_xyz), atom_xyz;
    __m256 r2_8, mask, r6, ri6, pe_tmp;
    __m256 pe_sum = _mm256_setzero_ps();
    float result[8] __attribute__((aligned (32)));

    for (; atom_i < this->n-8; atom_i+=8)
    {
        atom_xyz = trj.GetXYZ8(frame_i, this->name, atom_i);
        r2_8 = distance2(atom_xyz, rand_xyz8, box);
        mask = _mm256_cmp_ps(r2_8, rcut2_8, _CMP_LT_OS);
        r6 = _mm256_and_ps(mask, _mm256_mul_ps(_mm256_mul_ps(r2_8, r2_8), r2_8));
        ri6 = _mm256_and_ps(mask, _mm256_rcp_ps(r6));
        pe_tmp = _mm256_and_ps(mask, _mm256_mul_ps(ri6, _mm256_sub_ps(_mm256_mul_ps(c12_8, ri6), c6_8)));
        pe_sum = _mm256_add_ps(pe_tmp, pe_sum);
    }
    _mm256_store_ps(result, pe_sum);
    for (int i = 0; i < 8; i++)
    {
        pe += result[i];
    }

    /* END SIMD SECTION */

    for (; atom_i < this->n; atom_i++)
    {
        coordinates atom_xyz = trj.GetXYZ(frame_i, this->name, atom_i);
        float r2 = distance2(atom_xyz, rand_xyz, cubicbox(box));
        if (r2 < this->rcut2)
        {
            float ri6 = 1.0/(pow(r2,3));
            pe += ri6*(this->c12*ri6 - this->c6);
        }
    }

    pe += this->n/vol * this->tail_factor;

    return pe;
}
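The store-to-array reduction above is one common way to sum the lanes of a __m256. A shuffle-based alternative that avoids the stack buffer, as a minimal sketch of our own (not part of the original class):

#include <immintrin.h>

// Horizontal sum of the 8 float lanes of a __m256 (sketch).
static inline float hsum256_ps(__m256 v)
{
    __m128 lo = _mm256_castps256_ps128(v);          // lanes 0..3
    __m128 hi = _mm256_extractf128_ps(v, 1);        // lanes 4..7
    __m128 s  = _mm_add_ps(lo, hi);                 // 4 partial sums
    s = _mm_add_ps(s, _mm_movehl_ps(s, s));         // 2 partial sums
    s = _mm_add_ss(s, _mm_shuffle_ps(s, s, 0x55));  // total in lane 0
    return _mm_cvtss_f32(s);
}

With it, the eight-element loop over result[] collapses to pe += hsum256_ps(pe_sum).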
Example #2
/*---------------------------------------------------------------------------*/
__m256 TTriangle::THit::HitTest8(__m256 mask, const TPoint8& orig, const D3DXVECTOR3& d, HitResult8* result) const
{
    int u, v, w;
    w = ci;
    u = w == 0 ? 1 : 0;
    v = w == 2 ? 1 : 2;

    __m256 nu = _mm256_broadcast_ss(&this->nu);
    __m256 np = _mm256_broadcast_ss(&this->np);
    __m256 nv = _mm256_broadcast_ss(&this->nv);
    __m256 pu = _mm256_broadcast_ss(&this->pu);
    __m256 pv = _mm256_broadcast_ss(&this->pv);
    __m256 e0u = _mm256_broadcast_ss(&this->e0u);
    __m256 e0v = _mm256_broadcast_ss(&this->e0v);
    __m256 e1u = _mm256_broadcast_ss(&this->e1u);
    __m256 e1v = _mm256_broadcast_ss(&this->e1v);

    __m256 ou = orig[u];
    __m256 ov = orig[v];
    __m256 ow = orig[w];
    __m256 du = _mm256_broadcast_ss(&d[u]);
    __m256 dv = _mm256_broadcast_ss(&d[v]);
    __m256 dw = _mm256_broadcast_ss(&d[w]);

    __m256 dett = np -(ou*nu+ov*nv+ow);
    __m256 det = du*nu+dv*nv+dw;
    __m256 Du = du*dett - (pu-ou)*det;
    __m256 Dv = dv*dett - (pv-ov)*det;
    __m256 detu = (e1v*Du - e1u*Dv);
    __m256 detv = (e0u*Dv - e0v*Du);

    __m256 tmpdet0 = det - detu - detv;

    // Hit iff tmpdet0, detu and detv share a sign bit: XOR exposes sign
    // disagreement, OR merges it, and XOR with g_one8 (1.0f) forces a nonzero
    // magnitude so that "> 0" effectively tests the sign bit alone.
    __m256 detMask = _mm256_xor_ps(_mm256_xor_ps(tmpdet0, detv) | _mm256_xor_ps(detv, detu), g_one8) > _mm256_setzero_ps();

    mask = mask & detMask;

    __m256 rdet = _mm256_rcp_ps(det);

    result->t = dett * rdet;
    result->u = detu * rdet;
    result->v = detv * rdet;

    return mask & (result->t > _mm256_setzero_ps());
}
Example #3
__m256 ori_to_bin_256(
  const __m256& ori,
  const int nbins) {

  //! For convenience
  const __m256 x2PI  = _mm256_set1_ps(2 * M_PI);
  const __m256 xbins = _mm256_set1_ps(nbins);

  //! Wrap negative angles by 2*pi to make them positive
  const __m256 mask = _mm256_cmp_ps(ori, _mm256_setzero_ps(), _CMP_LT_OS);

  //! Scale the angle to a bin index and round toward zero
  const __m256 val = _mm256_round_ps(applyMask256_ps(mask, ori + x2PI, ori)
    / x2PI * xbins + _mm256_set1_ps(0.5f), _MM_FROUND_TO_ZERO);

  //! Return the index modulo nbins
  return val - xbins * _mm256_round_ps(val / xbins, _MM_FROUND_TO_ZERO);
}
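For reference, a scalar equivalent of the per-lane computation, as our own sketch; it assumes applyMask256_ps(mask, a, b) selects a where the mask is set:

#include <math.h>

// Scalar counterpart of ori_to_bin_256 for a single angle (sketch).
static float ori_to_bin(float ori, int nbins)
{
    if (ori < 0.0f)                                   // wrap into [0, 2*pi)
        ori += 2.0f * (float)M_PI;
    // scale to bin units; +0.5 then truncation rounds to nearest
    float val = truncf(ori / (2.0f * (float)M_PI) * (float)nbins + 0.5f);
    return val - (float)nbins * truncf(val / (float)nbins);  // val mod nbins
}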
Example #4
float dot_product(const int N, const float *X, const int incX, const float *Y,
                  const int incY) {
  // Note: _mm256_load_ps below reads 8 contiguous 32-byte-aligned floats, so
  // this kernel is only correct for incX == incY == 1, N a multiple of 8, and
  // aligned X and Y.
  __m256 accum = _mm256_setzero_ps();
  for (int i = 0; i < N; i += 8, X += 8 * incX, Y += 8 * incY) {
    __m256 xval = _mm256_load_ps(X);
    __m256 yval = _mm256_load_ps(Y);
    __m256 val = _mm256_mul_ps(xval, yval);
    accum = _mm256_add_ps(val, accum);
  }
  // Reduce the values in accum into the smallest 32-bit subsection
  // a0 a1 a2 a3 a4 a5 a6 a7 -> b0 b1 b2 b3
  __m128 accum2 = _mm_add_ps(_mm256_castps256_ps128(accum),
      _mm256_extractf128_ps(accum, 1));
  // b0 b1 b2 b3 -> c0 c1 b2 b3
  accum2 = _mm_add_ps(accum2,
      _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(accum2), 8)));
  __m128 final_val = _mm_add_ss(
      _mm_insert_ps(accum2, accum2, 0x4e), accum2);
  // Lane 1 was added into lane 0; extract the total (GCC/Clang vector subscript)
  return final_val[0];
}
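A minimal driver for the kernel above, as our own sketch; it respects the unit-stride, multiple-of-8, 32-byte-alignment assumptions noted in the comments:

#include <stdio.h>

int main(void)
{
    __attribute__((aligned(32))) float x[16], y[16];
    for (int i = 0; i < 16; ++i) { x[i] = 1.0f; y[i] = (float)i; }
    printf("%f\n", dot_product(16, x, 1, y, 1));  // 0+1+...+15 = 120.0
    return 0;
}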
Example #5
// Rounding half away from zero (equivalent to round() from math.h)
// __m256 contains 8 floats, but to simplify the examples, only 4 will be shown
// Initial values to be used in the examples:
// [-12.49  -0.5   1.5   3.7]
static __m256 c63_mm256_roundhalfawayfromzero_ps(const __m256 initial)
{
	const __m256 sign_mask = _mm256_set1_ps(-0.f);
	const __m256 one_half = _mm256_set1_ps(0.5f);
	const __m256 all_zeros = _mm256_setzero_ps();
	const __m256 pos_one = _mm256_set1_ps(1.f);
	const __m256 neg_one = _mm256_set1_ps(-1.f);

	// Creates a mask based on the sign of the floats, true for negative floats
	// Example: [true   true   false   false]
	__m256 less_than_zero = _mm256_cmp_ps(initial, all_zeros, _CMP_LT_OQ);

	// Returns the integer part of the floats
	// Example: [-12.0   -0.0   1.0   3.0]
	__m256 without_fraction = _mm256_round_ps(initial, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));

	// Returns the fraction part of the floats
	// Example: [-0.49   -0.5   0.5   0.7]
	__m256 fraction = _mm256_sub_ps(initial, without_fraction);

	// Absolute values of the fractions
	// Example: [0.49   0.5   0.5   0.7]
	__m256 fraction_abs = _mm256_andnot_ps(sign_mask, fraction);

	// Compares abs(fractions) to 0.5, true if lower
	// Example: [true   false   false   false]
	__m256 less_than_one_half = _mm256_cmp_ps(fraction_abs, one_half, _CMP_LT_OQ);

	// Blends 1.0 and -1.0 depending on the initial sign of the floats
	// Example: [-1.0   -1.0   1.0   1.0]
	__m256 signed_ones = _mm256_blendv_ps(pos_one, neg_one, less_than_zero);

	// Blends the previous result with zeros depending on the fractions that are lower than 0.5
	// Example: [0.0   -1.0   1.0   1.0]
	__m256 to_add = _mm256_blendv_ps(signed_ones, all_zeros, less_than_one_half);

	// Adds the previous result to the floats without fractions
	// Example: [-12.0   -1.0   2.0   4.0]
	return _mm256_add_ps(without_fraction, to_add);
}
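A quick check of the documented example values, as our own driver (not from the source):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
	__m256 v = _mm256_setr_ps(-12.49f, -0.5f, 1.5f, 3.7f, 0.f, 0.f, 0.f, 0.f);
	float out[8];
	_mm256_storeu_ps(out, c63_mm256_roundhalfawayfromzero_ps(v));
	printf("%.1f %.1f %.1f %.1f\n", out[0], out[1], out[2], out[3]);  // -12.0 -1.0 2.0 4.0
	return 0;
}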
Example #6
        template <bool align> SIMD_INLINE float SquaredDifferenceSum32f(const float * a, const float * b, size_t size)
        {
            if(align)
                assert(Aligned(a) && Aligned(b));

            float sum = 0;
            size_t i = 0;
            size_t alignedSize = AlignLo(size, 8);
            if(alignedSize)
            {
                __m256 _sum = _mm256_setzero_ps();
                for(; i < alignedSize; i += 8)
                {
                    __m256 _a = Avx::Load<align>(a + i);
                    __m256 _b = Avx::Load<align>(b + i);
                    __m256 _d = _mm256_sub_ps(_a, _b);
                    _sum = _mm256_add_ps(_sum, _mm256_mul_ps(_d, _d));
                }
                sum += Avx::ExtractSum(_sum);
            }
            for(; i < size; ++i)
                sum += Simd::Square(a[i] - b[i]);
            return sum;
        }
Example #7
Triangle* OctreeLeaf::Query(const Ray& ray, float& t) const
{
	float tBox = std::numeric_limits<float>::min();
	if (!Intersects(ray, bb, tBox) || tBox > t)
		return nullptr;

	const __m256 rayDirX = _mm256_set1_ps(ray.Direction.X);
	const __m256 rayDirY = _mm256_set1_ps(ray.Direction.Y);
	const __m256 rayDirZ = _mm256_set1_ps(ray.Direction.Z);

	const __m256 rayPosX = _mm256_set1_ps(ray.Origin.X);
	const __m256 rayPosY = _mm256_set1_ps(ray.Origin.Y);
	const __m256 rayPosZ = _mm256_set1_ps(ray.Origin.Z);

	// Type-punned storage: written as __m256 lanes below, read back as floats
	union { float dists[MAXSIZE]; __m256 distances[MAXSIZE / NROFLANES]; };

	for (int i = 0; i < count; i++)
	{
		// Vector3F e1 = triangle.Vertices[1].Position - triangle.Vertices[0].Position;
		const __m256 e1X = edge1X8[i];
		const __m256 e1Y = edge1Y8[i];
		const __m256 e1Z = edge1Z8[i];

		// Vector3F e2 = triangle.Vertices[2].Position - triangle.Vertices[0].Position;
		const __m256 e2X = edge2X8[i];
		const __m256 e2Y = edge2Y8[i];
		const __m256 e2Z = edge2Z8[i];

		// Vector3F p = ray.Direction.Cross(e2);
		const __m256 pX = _mm256_sub_ps(_mm256_mul_ps(rayDirY, e2Z), _mm256_mul_ps(rayDirZ, e2Y));
		const __m256 pY = _mm256_sub_ps(_mm256_mul_ps(rayDirZ, e2X), _mm256_mul_ps(rayDirX, e2Z));
		const __m256 pZ = _mm256_sub_ps(_mm256_mul_ps(rayDirX, e2Y), _mm256_mul_ps(rayDirY, e2X));

		// float det = e1.Dot(p);
		const __m256 det = _mm256_add_ps(_mm256_mul_ps(e1X, pX), _mm256_add_ps(_mm256_mul_ps(e1Y, pY), _mm256_mul_ps(e1Z, pZ)));

		// if (det > -EPSILON && det < EPSILON)
		//     return false;
		__m256 mask = _mm256_or_ps(_mm256_cmp_ps(det, _mm256_set1_ps(-EPSILON), _CMP_LE_OS), _mm256_cmp_ps(det, _mm256_set1_ps(EPSILON), _CMP_GE_OS));

		// float invDet = 1 / det;
		const __m256 invDet = _mm256_div_ps(_mm256_set1_ps(1.0f), det);

		// Vector3F r = ray.Origin - triangle.Vertices[0].Position;
		const __m256 rX = _mm256_sub_ps(rayPosX, vert0X8[i]);
		const __m256 rY = _mm256_sub_ps(rayPosY, vert0Y8[i]);
		const __m256 rZ = _mm256_sub_ps(rayPosZ, vert0Z8[i]);

		// float u = r.Dot(p) * invDet;
		const __m256 u = _mm256_mul_ps(invDet, _mm256_add_ps(_mm256_mul_ps(rX, pX), _mm256_add_ps(_mm256_mul_ps(rY, pY), _mm256_mul_ps(rZ, pZ))));

		// if (u < 0 || u > 1)
		//	   return false;
		// (only u >= 0 is tested; u <= 1 follows from v >= 0 and u + v <= 1 below)
		mask = _mm256_and_ps(mask, _mm256_cmp_ps(u, _mm256_setzero_ps(), _CMP_GE_OS));

		// Vector3F q = r.Cross(e1);
		const __m256 qX = _mm256_sub_ps(_mm256_mul_ps(rY, e1Z), _mm256_mul_ps(rZ, e1Y));
		const __m256 qY = _mm256_sub_ps(_mm256_mul_ps(rZ, e1X), _mm256_mul_ps(rX, e1Z));
		const __m256 qZ = _mm256_sub_ps(_mm256_mul_ps(rX, e1Y), _mm256_mul_ps(rY, e1X));

		// float v = ray.Direction.Dot(q) * invDet;
		const __m256 v = _mm256_mul_ps(invDet, _mm256_add_ps(_mm256_mul_ps(rayDirX, qX), _mm256_add_ps(_mm256_mul_ps(rayDirY, qY), _mm256_mul_ps(rayDirZ, qZ))));

		// if (v < 0 || u + v > 1)
		//     return false;
		mask = _mm256_and_ps(mask, _mm256_and_ps(_mm256_cmp_ps(v, _mm256_setzero_ps(), _CMP_GE_OS), _mm256_cmp_ps(_mm256_add_ps(u, v), _mm256_set1_ps(1.0f), _CMP_LE_OS)));

		// float tt = e2.Dot(q) * invDet;
		const __m256 tt = _mm256_mul_ps(invDet, _mm256_add_ps(_mm256_mul_ps(e2X, qX), _mm256_add_ps(_mm256_mul_ps(e2Y, qY), _mm256_mul_ps(e2Z, qZ))));

		// if (tt > EPSILON) { t = tt; return true; }
		// Lanes that failed any test are zeroed here; the scalar scan below
		// rejects them with the "dists[i] > EPSILON" check.
		distances[i] = _mm256_and_ps(tt, mask);
	}

	Triangle* triangle = nullptr;
	for (int i = 0; i < count * NROFLANES; i++)
		if (dists[i] < t && dists[i] > EPSILON)
		{
			t = dists[i];
			triangle = triangles[i];
		}

	return triangle;
}
Example #8
float 
nv_vector_norm(const nv_matrix_t *vec, int vec_m)
{
#if NV_ENABLE_AVX
	{
		NV_ALIGNED(float, mm[8], 32);
		__m256 x, u;
		int n;
		int pk_lp = (vec->n & 0xfffffff8); /* round down to a multiple of 8 */
		float dp = 0.0f;
		
		u = _mm256_setzero_ps();
		for (n = 0; n < pk_lp; n += 8) {
			x = _mm256_load_ps(&NV_MAT_V(vec, vec_m, n));
			u = _mm256_add_ps(u, _mm256_mul_ps(x, x));
		}
		_mm256_store_ps(mm, u);
		dp = mm[0] + mm[1] + mm[2] + mm[3] + mm[4] + mm[5] + mm[6] + mm[7];
		for (n = pk_lp; n < vec->n; ++n) {
			dp += NV_MAT_V(vec, vec_m, n) * NV_MAT_V(vec, vec_m, n);
		}
		if (dp > 0.0f) {
			return sqrtf(dp);
		}
		return 0.0f;
	}
#elif NV_ENABLE_SSE2
	{
		NV_ALIGNED(float, mm[4], 16);
		__m128 x, u;
		int n;
		int pk_lp = (vec->n & 0xfffffffc); /* round down to a multiple of 4 */
		float dp = 0.0f;
		
		u = _mm_setzero_ps();
		for (n = 0; n < pk_lp; n += 4) {
			x = _mm_load_ps(&NV_MAT_V(vec, vec_m, n));
			u = _mm_add_ps(u, _mm_mul_ps(x, x));
		}
		_mm_store_ps(mm, u);
		dp = mm[0] + mm[1] + mm[2] + mm[3];
		for (n = pk_lp; n < vec->n; ++n) {
			dp += NV_MAT_V(vec, vec_m, n) * NV_MAT_V(vec, vec_m, n);
		}
		if (dp > 0.0f) {
			return sqrtf(dp);
		}
		return 0.0f;
	}
#else
	{
		int n;
		float dp = 0.0f;
		for (n = 0; n < vec->n; ++n) {
			dp += NV_MAT_V(vec, vec_m, n) * NV_MAT_V(vec, vec_m, n);
		}
		if (dp > 0.0f) {
			return sqrtf(dp);
		}
		return 0.0f;
	}
#endif
}
Example #9
    { 1.1f, 1.1f, 1.1f, 1.1f, 1.1f, 1.1f, 1.1f, 1.1f },   // RCX + 352
    { 1.2f, 1.2f, 1.2f, 1.2f, 1.2f, 1.2f, 1.2f, 1.2f },   // RCX + 384
    { 1.3f, 1.3f, 1.3f, 1.3f, 1.3f, 1.3f, 1.3f, 1.3f },   // RCX + 416
    { 1.4f, 1.4f, 1.4f, 1.4f, 1.4f, 1.4f, 1.4f, 1.4f },   // RCX + 480
    { 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f },   // RCX + 512
  }
  VPU_ALIGN_SUFFIX(32);

  // our assembler
  vpu::IAssembler* a = g_lib->createAssembler();

  // This example demonstrates a simple loop. The equivalent C++ code would look like:

#if 0

  __m256 YMM0 = _mm256_setzero_ps();
  __m256* R9 = (__m256*)(argument_data + 1);
  for (int i = 0; i < 10; ++i)
  {
    YMM0 = _mm256_add_ps(YMM0, R9[i]);
  }
  _mm256_store_ps(argument_data[0], YMM0);

#endif

  // start assembling
  a->begin();

    // we'll accumulate the sum of the array in YMM0
    a->setzero(vpu::YMM0);
Example #10
    void run_softmax_int32_float_work_item_batch8x(nn_workload_item *const work_item, uint16_t NoBatch8)
    {
        nn_workload_data_t *input_view = work_item->input[0]->output;
        const auto &arguments = work_item->arguments.forward_softmax_fixedpoint;

        const auto input_width = input_view->parent->lengths.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p];
        const auto output_width = work_item->output->view_end.t[NN_DATA_COORD_x] - work_item->output->view_begin.t[NN_DATA_COORD_x] + 1;
        const auto batch_size_global = work_item->output->parent->lengths.t[NN_DATA_COORD_n];
        const auto batch_size = 8;

        const auto num_full_blocks = output_width / C_max_acc;
        const auto partial_block_size = output_width % C_max_acc;

        const auto output_view_start = work_item->output->view_begin.t[NN_DATA_COORD_x] * batch_size * NoBatch8;
        const auto input_view_start = input_view->view_begin.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p] * batch_size * NoBatch8;

        const auto out_fraction = arguments.input_fraction;

        float * input_f = (float*)_mm_malloc(input_width * batch_size * sizeof(float), 64);
        float * output_f = (float*)_mm_malloc(output_width * batch_size * sizeof(float), 64);

        auto input_buffer = &static_cast<int32_t*>(input_view->parent->data_buffer)[input_view_start + NoBatch8 * 8 * input_width];
        //auto input_buffer = &static_cast<int32_t*>(input_view->parent->data_buffer)[input_view_start];

        auto shift = out_fraction;
        if (shift > 0)
        {
            for (uint32_t i = 0; i < input_width * batch_size; i++)
                input_f[i] = (float)(input_buffer[i]) / (1 << shift);
        }
        else if (shift < 0)
        {
            for (uint32_t i = 0; i < input_width* batch_size; i++)
                input_f[i] = (float)(input_buffer[i]) * (1 << -shift);
        }
        else
        {
            for (uint32_t i = 0; i < input_width* batch_size; i++)
                input_f[i] = (float)(input_buffer[i]);
        }

        __m256 acc_sum = _mm256_setzero_ps();

        {
            auto input_buffer = input_f;
            //auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start + NoBatch8 * 8 * output_width];
            auto output_buffer = output_f;
            for (auto block = 0u; block < num_full_blocks; ++block)
            {
                // Run computation.
                softmax_compute_block<C_max_acc>(input_buffer, output_buffer, acc_sum);
            }

            switch (partial_block_size)
            {
            case  0: break;
            case  1: softmax_compute_block< 1>(input_buffer, output_buffer, acc_sum); break;
            case  2: softmax_compute_block< 2>(input_buffer, output_buffer, acc_sum); break;
            case  3: softmax_compute_block< 3>(input_buffer, output_buffer, acc_sum); break;
            case  4: softmax_compute_block< 4>(input_buffer, output_buffer, acc_sum); break;
            case  5: softmax_compute_block< 5>(input_buffer, output_buffer, acc_sum); break;
            case  6: softmax_compute_block< 6>(input_buffer, output_buffer, acc_sum); break;
            case  7: softmax_compute_block< 7>(input_buffer, output_buffer, acc_sum); break;
            case  8: softmax_compute_block< 8>(input_buffer, output_buffer, acc_sum); break;
            case  9: softmax_compute_block< 9>(input_buffer, output_buffer, acc_sum); break;
            case 10: softmax_compute_block<10>(input_buffer, output_buffer, acc_sum); break;
            case 11: softmax_compute_block<11>(input_buffer, output_buffer, acc_sum); break;
            case 12: softmax_compute_block<12>(input_buffer, output_buffer, acc_sum); break;
            case 13: softmax_compute_block<13>(input_buffer, output_buffer, acc_sum); break;
            case 14: softmax_compute_block<14>(input_buffer, output_buffer, acc_sum); break;
            default: NN_UNREACHABLE_CODE;
            }
        }

        acc_sum = _mm256_div_ps(_mm256_set1_ps(1.0f), acc_sum);

        {
            //auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start + NoBatch8 * 8 * output_width];
            auto output_buffer = output_f;
            for (auto block = 0u; block < num_full_blocks; ++block)
            {
                // Run computation.
                softmax_finalize_block<C_max_acc>(output_buffer, acc_sum);
            }

            switch (partial_block_size)
            {
            case  0: break;
            case  1: softmax_finalize_block< 1>(output_buffer, acc_sum); break;
            case  2: softmax_finalize_block< 2>(output_buffer, acc_sum); break;
            case  3: softmax_finalize_block< 3>(output_buffer, acc_sum); break;
            case  4: softmax_finalize_block< 4>(output_buffer, acc_sum); break;
            case  5: softmax_finalize_block< 5>(output_buffer, acc_sum); break;
            case  6: softmax_finalize_block< 6>(output_buffer, acc_sum); break;
            case  7: softmax_finalize_block< 7>(output_buffer, acc_sum); break;
            case  8: softmax_finalize_block< 8>(output_buffer, acc_sum); break;
            case  9: softmax_finalize_block< 9>(output_buffer, acc_sum); break;
            case 10: softmax_finalize_block<10>(output_buffer, acc_sum); break;
            case 11: softmax_finalize_block<11>(output_buffer, acc_sum); break;
            case 12: softmax_finalize_block<12>(output_buffer, acc_sum); break;
            case 13: softmax_finalize_block<13>(output_buffer, acc_sum); break;
            case 14: softmax_finalize_block<14>(output_buffer, acc_sum); break;
            default: NN_UNREACHABLE_CODE;
            }
        }

        auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start];

        for (auto itrW = 0; itrW < output_width; itrW++)
        for (auto itr8 = 0; itr8 < C_batch_size; itr8++)
            output_buffer[itr8 + itrW * batch_size_global + NoBatch8 * C_batch_size] = output_f[itr8 + itrW * C_batch_size];

        _mm_free(input_f);
        _mm_free(output_f);
    }
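The three branches above convert Q-format fixed-point integers to float: with shift fraction bits the value is i / 2^shift, and a negative shift scales up instead. A condensed scalar equivalent, as our own sketch (q_to_float is not in the source):

#include <math.h>
#include <stdint.h>

// Convert a fixed-point integer with 'shift' fraction bits to float.
static inline float q_to_float(int32_t i, int shift)
{
    return ldexpf((float)i, -shift);  // i * 2^(-shift)
}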
Example #11
void fastGEMM( const float* aptr, size_t astep, const float* bptr,
               size_t bstep, float* cptr, size_t cstep,
               int ma, int na, int nb )
{
    int n = 0;
    for( ; n <= nb - 16; n += 16 )
    {
        for( int m = 0; m < ma; m += 4 )
        {
            const float* aptr0 = aptr + astep*m;
            const float* aptr1 = aptr + astep*std::min(m+1, ma-1);
            const float* aptr2 = aptr + astep*std::min(m+2, ma-1);
            const float* aptr3 = aptr + astep*std::min(m+3, ma-1);

            float* cptr0 = cptr + cstep*m;
            float* cptr1 = cptr + cstep*std::min(m+1, ma-1);
            float* cptr2 = cptr + cstep*std::min(m+2, ma-1);
            float* cptr3 = cptr + cstep*std::min(m+3, ma-1);

            __m256 d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps();
            __m256 d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps();
            __m256 d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps();
            __m256 d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps();

            for( int k = 0; k < na; k++ )
            {
                __m256 a0 = _mm256_set1_ps(aptr0[k]);
                __m256 a1 = _mm256_set1_ps(aptr1[k]);
                __m256 a2 = _mm256_set1_ps(aptr2[k]);
                __m256 a3 = _mm256_set1_ps(aptr3[k]);
                __m256 b0 = _mm256_loadu_ps(bptr + k*bstep + n);
                __m256 b1 = _mm256_loadu_ps(bptr + k*bstep + n + 8);
                d00 = _mm256_fmadd_ps(a0, b0, d00);
                d01 = _mm256_fmadd_ps(a0, b1, d01);
                d10 = _mm256_fmadd_ps(a1, b0, d10);
                d11 = _mm256_fmadd_ps(a1, b1, d11);
                d20 = _mm256_fmadd_ps(a2, b0, d20);
                d21 = _mm256_fmadd_ps(a2, b1, d21);
                d30 = _mm256_fmadd_ps(a3, b0, d30);
                d31 = _mm256_fmadd_ps(a3, b1, d31);
            }

            _mm256_storeu_ps(cptr0 + n, d00);
            _mm256_storeu_ps(cptr0 + n + 8, d01);
            _mm256_storeu_ps(cptr1 + n, d10);
            _mm256_storeu_ps(cptr1 + n + 8, d11);
            _mm256_storeu_ps(cptr2 + n, d20);
            _mm256_storeu_ps(cptr2 + n + 8, d21);
            _mm256_storeu_ps(cptr3 + n, d30);
            _mm256_storeu_ps(cptr3 + n + 8, d31);
        }
    }

    for( ; n < nb; n++ )
    {
        for( int m = 0; m < ma; m++ )
        {
            const float* aptr0 = aptr + astep*m;
            float* cptr0 = cptr + cstep*m;
            float d0 = 0.f;

            for( int k = 0; k < na; k++ )
                d0 += aptr0[k]*bptr[k*bstep + n];

            cptr0[n] = d0;
        }
    }
    _mm256_zeroupper();
}
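A minimal driver for the kernel above, as our own sketch; it assumes row-major storage with astep/bstep/cstep as row strides in floats:

#include <vector>

int main()
{
    const int ma = 4, na = 3, nb = 16;
    std::vector<float> a(ma * na, 1.0f), b(na * nb, 2.0f), c(ma * nb, 0.0f);
    // C (ma x nb) = A (ma x na) * B (na x nb); every entry should be 3 * 1 * 2 = 6
    fastGEMM(a.data(), na, b.data(), nb, c.data(), nb, ma, na, nb);
    return c[0] == 6.0f ? 0 : 1;
}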
Example #12
void kernel_strmv_u_n_8_lib8(int kmax, float *A, float *x, float *y, int alg)
	{

	if(kmax<=0) 
		return;

	// NOTE: the two unrolled blocks below unconditionally process the first
	// 8 columns of the triangular head, so the kernel assumes kmax >= 8.
	
	const int lda = 8;
	
	__builtin_prefetch( A + 0*lda );
	__builtin_prefetch( A + 2*lda );

	int k;

	__m256
		zeros,
		ax_temp,
		a_00, a_01, a_02, a_03,
		x_0, x_1, x_2, x_3,
		y_0, y_0_b, y_0_c, y_0_d, z_0;
	
	zeros = _mm256_setzero_ps();	

	y_0   = _mm256_setzero_ps();	
	y_0_b = _mm256_setzero_ps();	
	y_0_c = _mm256_setzero_ps();	
	y_0_d = _mm256_setzero_ps();	
	

	__builtin_prefetch( A + 4*lda );
	__builtin_prefetch( A + 6*lda );

	a_00 = _mm256_load_ps( &A[0+lda*0] );
	x_0  = _mm256_broadcast_ss( &x[0] );
	x_0  = _mm256_blend_ps( zeros, x_0, 0x01 );
	ax_temp = _mm256_mul_ps( a_00, x_0 );
	y_0 = _mm256_add_ps( y_0, ax_temp );

	a_01 = _mm256_load_ps( &A[0+lda*1] );
	x_1  = _mm256_broadcast_ss( &x[1] );
	x_1  = _mm256_blend_ps( zeros, x_1, 0x03 );
	ax_temp = _mm256_mul_ps( a_01, x_1 );
	y_0_b = _mm256_add_ps( y_0_b, ax_temp );

	a_02 = _mm256_load_ps( &A[0+lda*2] );
	x_2  = _mm256_broadcast_ss( &x[2] );
	x_2  = _mm256_blend_ps( zeros, x_2, 0x07 );
	ax_temp = _mm256_mul_ps( a_02, x_2 );
	y_0_c = _mm256_add_ps( y_0_c, ax_temp );

	a_03 = _mm256_load_ps( &A[0+lda*3] );
	x_3  = _mm256_broadcast_ss( &x[3] );
	x_3  = _mm256_blend_ps( zeros, x_3, 0x0f );
	ax_temp = _mm256_mul_ps( a_03, x_3 );
	y_0_d = _mm256_add_ps( y_0_d, ax_temp );

	A += 4*lda;
	x += 4;

	__builtin_prefetch( A + 4*lda );
	__builtin_prefetch( A + 6*lda );

	a_00 = _mm256_load_ps( &A[0+lda*0] );
	x_0  = _mm256_broadcast_ss( &x[0] );
	x_0  = _mm256_blend_ps( zeros, x_0, 0x1f );
	ax_temp = _mm256_mul_ps( a_00, x_0 );
	y_0 = _mm256_add_ps( y_0, ax_temp );

	a_01 = _mm256_load_ps( &A[0+lda*1] );
	x_1  = _mm256_broadcast_ss( &x[1] );
	x_1  = _mm256_blend_ps( zeros, x_1, 0x3f );
	ax_temp = _mm256_mul_ps( a_01, x_1 );
	y_0_b = _mm256_add_ps( y_0_b, ax_temp );

	a_02 = _mm256_load_ps( &A[0+lda*2] );
	x_2  = _mm256_broadcast_ss( &x[2] );
	x_2  = _mm256_blend_ps( zeros, x_2, 0x7f );
	ax_temp = _mm256_mul_ps( a_02, x_2 );
	y_0_c = _mm256_add_ps( y_0_c, ax_temp );

	a_03 = _mm256_load_ps( &A[0+lda*3] );
	x_3  = _mm256_broadcast_ss( &x[3] );
	ax_temp = _mm256_mul_ps( a_03, x_3 );
	y_0_d = _mm256_add_ps( y_0_d, ax_temp );

	A += 4*lda;
	x += 4;

	k=8;
	for(; k<kmax-7; k+=8)
		{

		__builtin_prefetch( A + 4*lda );
		__builtin_prefetch( A + 6*lda );

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		x_0  = _mm256_broadcast_ss( &x[0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );

		a_01 = _mm256_load_ps( &A[0+lda*1] );
		x_1  = _mm256_broadcast_ss( &x[1] );
		ax_temp = _mm256_mul_ps( a_01, x_1 );
		y_0_b = _mm256_add_ps( y_0_b, ax_temp );

		a_02 = _mm256_load_ps( &A[0+lda*2] );
		x_2  = _mm256_broadcast_ss( &x[2] );
		ax_temp = _mm256_mul_ps( a_02, x_2 );
		y_0_c = _mm256_add_ps( y_0_c, ax_temp );

		a_03 = _mm256_load_ps( &A[0+lda*3] );
		x_3  = _mm256_broadcast_ss( &x[3] );
		ax_temp = _mm256_mul_ps( a_03, x_3 );
		y_0_d = _mm256_add_ps( y_0_d, ax_temp );

		A += 4*lda;
		x += 4;

		__builtin_prefetch( A + 4*lda );
		__builtin_prefetch( A + 6*lda );

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		x_0  = _mm256_broadcast_ss( &x[0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );

		a_01 = _mm256_load_ps( &A[0+lda*1] );
		x_1  = _mm256_broadcast_ss( &x[1] );
		ax_temp = _mm256_mul_ps( a_01, x_1 );
		y_0_b = _mm256_add_ps( y_0_b, ax_temp );

		a_02 = _mm256_load_ps( &A[0+lda*2] );
		x_2  = _mm256_broadcast_ss( &x[2] );
		ax_temp = _mm256_mul_ps( a_02, x_2 );
		y_0_c = _mm256_add_ps( y_0_c, ax_temp );

		a_03 = _mm256_load_ps( &A[0+lda*3] );
		x_3  = _mm256_broadcast_ss( &x[3] );
		ax_temp = _mm256_mul_ps( a_03, x_3 );
		y_0_d = _mm256_add_ps( y_0_d, ax_temp );

		A += 4*lda;
		x += 4;

		}
	for(; k<kmax-3; k+=4)
		{

		__builtin_prefetch( A + 4*lda );
		__builtin_prefetch( A + 6*lda );

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		x_0  = _mm256_broadcast_ss( &x[0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );

		a_01 = _mm256_load_ps( &A[0+lda*1] );
		x_1  = _mm256_broadcast_ss( &x[1] );
		ax_temp = _mm256_mul_ps( a_01, x_1 );
		y_0_b = _mm256_add_ps( y_0_b, ax_temp );

		a_02 = _mm256_load_ps( &A[0+lda*2] );
		x_2  = _mm256_broadcast_ss( &x[2] );
		ax_temp = _mm256_mul_ps( a_02, x_2 );
		y_0_c = _mm256_add_ps( y_0_c, ax_temp );

		a_03 = _mm256_load_ps( &A[0+lda*3] );
		x_3  = _mm256_broadcast_ss( &x[3] );
		ax_temp = _mm256_mul_ps( a_03, x_3 );
		y_0_d = _mm256_add_ps( y_0_d, ax_temp );

		A += 4*lda;
		x += 4;

		}

	y_0   = _mm256_add_ps( y_0  , y_0_c );
	y_0_b = _mm256_add_ps( y_0_b, y_0_d );

	if(kmax%4>=2)
		{

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		x_0  = _mm256_broadcast_ss( &x[0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );

		a_01 = _mm256_load_ps( &A[0+lda*1] );
		x_1  = _mm256_broadcast_ss( &x[1] );
		ax_temp = _mm256_mul_ps( a_01, x_1 );
		y_0_b = _mm256_add_ps( y_0_b, ax_temp );

		A += 2*lda;
		x += 2;

		}
	
	y_0   = _mm256_add_ps( y_0  , y_0_b );

	if(kmax%2==1)
		{

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		x_0  = _mm256_broadcast_ss( &x[0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );
		

		}

	if(alg==0)
		{
		_mm256_storeu_ps(&y[0], y_0);
		}
	else if(alg==1)
		{
		z_0 = _mm256_loadu_ps( &y[0] );

		z_0 = _mm256_add_ps( z_0, y_0 );

		_mm256_storeu_ps(&y[0], z_0);
		}
	else // alg==-1
		{
		z_0 = _mm256_loadu_ps( &y[0] );

		z_0 = _mm256_sub_ps( z_0, y_0 );

		_mm256_storeu_ps(&y[0], z_0);
		}

	}
Example #13
void	TransLut_FindIndexAvx2 <TransLut::MapperLog>::find_index (const TransLut::FloatIntMix val_arr [8], __m256i &index, __m256 &frac)
{
	assert (val_arr != 0);

	// Constants
	static const int      mant_size = 23;
	static const int      exp_bias  = 127;
	static const uint32_t base      = (exp_bias + LOGLUT_MIN_L2) << mant_size;
	static const float    val_min   = 1.0f / (int64_t (1) << -LOGLUT_MIN_L2);
//	static const float    val_max   = float (int64_t (1) << LOGLUT_MAX_L2);
	static const int      frac_size = mant_size - LOGLUT_RES_L2;
	static const uint32_t frac_mask = (1 << frac_size) - 1;

	const __m256   zero_f     = _mm256_setzero_ps ();
	const __m256   one_f      = _mm256_set1_ps (1);
	const __m256   frac_mul   = _mm256_set1_ps (1.0f / (1 << frac_size));
	const __m256   mul_eps    = _mm256_set1_ps (1.0f / val_min);
	const __m256   mask_abs_f = _mm256_load_ps (
		reinterpret_cast <const float *> (fstb::ToolsAvx2::_mask_abs)
	);

	const __m256i  zero_i          = _mm256_setzero_si256 ();
	const __m256i  mask_abs_epi32  = _mm256_set1_epi32 (0x7FFFFFFF);
	const __m256i  one_epi32       = _mm256_set1_epi32 (1);
	const __m256i  base_epi32      = _mm256_set1_epi32 (int (base));
	const __m256i  frac_mask_epi32 = _mm256_set1_epi32 (frac_mask);
	const __m256i  val_min_epi32   =
		_mm256_set1_epi32 ((LOGLUT_MIN_L2 + exp_bias) << mant_size);
	const __m256i  val_max_epi32   =
		_mm256_set1_epi32 ((LOGLUT_MAX_L2 + exp_bias) << mant_size);
	const __m256i  index_max_epi32 =
		_mm256_set1_epi32 ((LOGLUT_MAX_L2 - LOGLUT_MIN_L2) << LOGLUT_RES_L2);
	const __m256i  hsize_epi32     = _mm256_set1_epi32 (LOGLUT_HSIZE);
	const __m256i  mirror_epi32    = _mm256_set1_epi32 (LOGLUT_HSIZE - 1);

	// It really starts here
	const __m256   val_f = _mm256_load_ps (reinterpret_cast <const float *> (val_arr));
	const __m256   val_a = _mm256_and_ps (val_f, mask_abs_f);
	const __m256i  val_i = _mm256_load_si256 (reinterpret_cast <const __m256i *> (val_arr));
	const __m256i  val_u = _mm256_and_si256 (val_i, mask_abs_epi32);

	// Standard path
	__m256i        index_std = _mm256_sub_epi32 (val_u, base_epi32);
	index_std = _mm256_srli_epi32 (index_std, frac_size);
	index_std = _mm256_add_epi32 (index_std, one_epi32);
	__m256i        frac_stdi = _mm256_and_si256 (val_u, frac_mask_epi32);
	__m256         frac_std  = _mm256_cvtepi32_ps (frac_stdi);
	frac_std  = _mm256_mul_ps (frac_std, frac_mul);

	// Epsilon path
	__m256         frac_eps  = _mm256_max_ps (val_a, zero_f);
	frac_eps = _mm256_mul_ps (frac_eps, mul_eps);

	// Range cases
	const __m256i  eps_flag_i = _mm256_cmpgt_epi32 (val_min_epi32, val_u);
	const __m256i  std_flag_i = _mm256_cmpgt_epi32 (val_max_epi32, val_u);
	const __m256   eps_flag_f = _mm256_castsi256_ps (eps_flag_i);
	const __m256   std_flag_f = _mm256_castsi256_ps (std_flag_i);
	__m256i        index_tmp  =
		fstb::ToolsAvx2::select (std_flag_i, index_std, index_max_epi32);
	__m256         frac_tmp   =
		fstb::ToolsAvx2::select (std_flag_f, frac_std, one_f);
	index_tmp = fstb::ToolsAvx2::select (eps_flag_i, zero_i, index_tmp);
	frac_tmp  = fstb::ToolsAvx2::select (eps_flag_f, frac_eps, frac_tmp);

	// Sign cases
	const __m256i  neg_flag_i = _mm256_srai_epi32 (val_i, 31);
	const __m256   neg_flag_f = _mm256_castsi256_ps (neg_flag_i);
	const __m256i  index_neg  = _mm256_sub_epi32 (mirror_epi32, index_tmp);
	const __m256i  index_pos  = _mm256_add_epi32 (hsize_epi32, index_tmp);
	const __m256   frac_neg   = _mm256_sub_ps (one_f, frac_tmp);
	index = fstb::ToolsAvx2::select (neg_flag_i, index_neg, index_pos);
	frac  = fstb::ToolsAvx2::select (neg_flag_f, frac_neg, frac_tmp);
}
Example #14
void sEnv_process(HvBase *_c, SignalEnvelope *o, hv_bInf_t bIn,
		void (*sendMessage)(HvBase *, int, const HvMessage *)) {
#if HV_SIMD_AVX
  _mm256_stream_ps(o->buffer+o->numSamplesInBuffer, _mm256_mul_ps(bIn,bIn)); // store bIn^2, no need to cache block
  o->numSamplesInBuffer += HV_N_SIMD;

  if (o->numSamplesInBuffer >= o->windowSize) {
    int n4 = o->windowSize & ~HV_N_SIMD_MASK;
    __m256 sum = _mm256_setzero_ps();
    while (n4) {
      __m256 x = _mm256_load_ps(o->buffer + n4 - HV_N_SIMD);
      __m256 h = _mm256_load_ps(o->hanningWeights + n4 - HV_N_SIMD);
      x = _mm256_mul_ps(x, h);
      sum = _mm256_add_ps(sum, x);
      n4 -= HV_N_SIMD;
    }
    sum = _mm256_hadd_ps(sum,sum); // horizontal sum
    sum = _mm256_hadd_ps(sum,sum);
    sEnv_sendMessage(_c, o, sum[0]+sum[4], sendMessage); // updates numSamplesInBuffer
  }
#elif HV_SIMD_SSE
  _mm_stream_ps(o->buffer+o->numSamplesInBuffer, _mm_mul_ps(bIn,bIn)); // store bIn^2, no need to cache block
  o->numSamplesInBuffer += HV_N_SIMD;

  if (o->numSamplesInBuffer >= o->windowSize) {
    int n4 = o->windowSize & ~HV_N_SIMD_MASK;
    __m128 sum = _mm_setzero_ps();
    while (n4) {
      __m128 x = _mm_load_ps(o->buffer + n4 - HV_N_SIMD);
      __m128 h = _mm_load_ps(o->hanningWeights + n4 - HV_N_SIMD);
      x = _mm_mul_ps(x, h);
      sum = _mm_add_ps(sum, x);
      n4 -= HV_N_SIMD;
    }
    sum = _mm_hadd_ps(sum,sum); // horizontal sum
    sum = _mm_hadd_ps(sum,sum);
    sEnv_sendMessage(_c, o, sum[0], sendMessage);
  }
#elif HV_SIMD_NEON
  vst1q_f32(o->buffer+o->numSamplesInBuffer, vmulq_f32(bIn,bIn)); // store bIn^2, no need to cache block
  o->numSamplesInBuffer += HV_N_SIMD;

  if (o->numSamplesInBuffer >= o->windowSize) {
    int n4 = o->windowSize & ~HV_N_SIMD_MASK;
    float32x4_t sum = vdupq_n_f32(0.0f);
    while (n4) {
      float32x4_t x = vld1q_f32(o->buffer + n4 - HV_N_SIMD);
      float32x4_t h = vld1q_f32(o->hanningWeights + n4 - HV_N_SIMD);
      x = vmulq_f32(x, h);
      sum = vaddq_f32(sum, x);
      n4 -= HV_N_SIMD;
    }
    sEnv_sendMessage(_c, o, sum[0]+sum[1]+sum[2]+sum[3], sendMessage);
  }
#else // HV_SIMD_NONE
  o->buffer[o->numSamplesInBuffer] = (bIn*bIn);
  o->numSamplesInBuffer += HV_N_SIMD;

  if (o->numSamplesInBuffer >= o->windowSize) {
    float sum = 0.0f;
    for (int i = 0; i < o->windowSize; ++i) {
      sum += (o->hanningWeights[i] * o->buffer[i]);
    }
    sEnv_sendMessage(_c, o, sum, sendMessage);
  }
#endif
}
Example #15
DLL_LOCAL void fname(const float *a00, const float *a01, const float *a02, const float *a11, const float *a12,
                     const float *a22, float *ev0, float *ev1, float *ev2, const size_t len)
{
    const size_t avx_end = len & ~7;
    __m256 v_inv3 = _mm256_set1_ps(1.0 / 3.0);
    __m256 v_root3 = _mm256_sqrt_ps(_mm256_set1_ps(3.0));
    __m256 two = _mm256_set1_ps(2.0);
    __m256 half = _mm256_set1_ps(0.5);
    __m256 zero = _mm256_setzero_ps();

    for (size_t i = 0; i < avx_end; i += 8) {
        __m256 v_a00 = _mm256_loadu_ps(a00 + i);
        __m256 v_a01 = _mm256_loadu_ps(a01 + i);
        __m256 v_a02 = _mm256_loadu_ps(a02 + i);
        __m256 v_a11 = _mm256_loadu_ps(a11 + i);
        __m256 v_a12 = _mm256_loadu_ps(a12 + i);
        __m256 v_a22 = _mm256_loadu_ps(a22 + i);

        __m256 c0 = _avx_sub(_avx_sub(_avx_sub(_avx_add(_avx_mul(_avx_mul(v_a00, v_a11), v_a22),
            _avx_mul(_avx_mul(_avx_mul(two, v_a01), v_a02), v_a12)),
            _avx_mul(_avx_mul(v_a00, v_a12), v_a12)),
            _avx_mul(_avx_mul(v_a11, v_a02), v_a02)),
            _avx_mul(_avx_mul(v_a22, v_a01), v_a01));
        __m256 c1 = _avx_sub(_avx_add(_avx_sub(_avx_add(_avx_sub(_avx_mul(v_a00, v_a11),
            _avx_mul(v_a01, v_a01)),
            _avx_mul(v_a00, v_a22)),
            _avx_mul(v_a02, v_a02)),
            _avx_mul(v_a11, v_a22)),
            _avx_mul(v_a12, v_a12));
        __m256 c2 = _avx_add(_avx_add(v_a00, v_a11), v_a22);
        __m256 c2Div3 = _avx_mul(c2, v_inv3);
        __m256 aDiv3 = _avx_mul(_avx_sub(c1, _avx_mul(c2, c2Div3)), v_inv3);

        aDiv3 = _mm256_min_ps(aDiv3, zero);

        __m256 mbDiv2 = _avx_mul(half, _avx_add(c0, _avx_mul(c2Div3, _avx_sub(_avx_mul(_avx_mul(two, c2Div3), c2Div3), c1))));
        __m256 q = _avx_add(_avx_mul(mbDiv2, mbDiv2), _avx_mul(_avx_mul(aDiv3, aDiv3), aDiv3));

        q = _mm256_min_ps(q, zero);

        __m256 magnitude = _mm256_sqrt_ps(_avx_neg(aDiv3));
        __m256 angle = _avx_mul(atan2_256_ps(_mm256_sqrt_ps(_avx_neg(q)), mbDiv2), v_inv3);
        __m256 cs, sn;

        sincos256_ps(angle, &sn, &cs);

        __m256 r0 = _avx_add(c2Div3, _avx_mul(_avx_mul(two, magnitude), cs));
        __m256 r1 = _avx_sub(c2Div3, _avx_mul(magnitude, _avx_add(cs, _avx_mul(v_root3, sn))));
        __m256 r2 = _avx_sub(c2Div3, _avx_mul(magnitude, _avx_sub(cs, _avx_mul(v_root3, sn))));

        __m256 v_r0_tmp = _mm256_min_ps(r0, r1);
        __m256 v_r1_tmp = _mm256_max_ps(r0, r1);

        __m256 v_r0 = _mm256_min_ps(v_r0_tmp, r2);
        __m256 v_r2_tmp = _mm256_max_ps(v_r0_tmp, r2);

        __m256 v_r1 = _mm256_min_ps(v_r1_tmp, v_r2_tmp);
        __m256 v_r2 = _mm256_max_ps(v_r1_tmp, v_r2_tmp);

        _mm256_storeu_ps(ev2 + i, v_r0);
        _mm256_storeu_ps(ev1 + i, v_r1);
        _mm256_storeu_ps(ev0 + i, v_r2);
    }

    for (size_t i = avx_end; i < len; ++i) {
        float inv3 = 1.0 / 3.0;
        float root3 = sqrt(3.0);

        float c0 = a00[i] * a11[i] * a22[i] + 2.0 * a01[i] * a02[i] * a12[i] - a00[i] * a12[i] * a12[i] -
                   a11[i] * a02[i] * a02[i] - a22[i] * a01[i] * a01[i];
        float c1 =
            a00[i] * a11[i] - a01[i] * a01[i] + a00[i] * a22[i] - a02[i] * a02[i] + a11[i] * a22[i] - a12[i] * a12[i];
        float c2 = a00[i] + a11[i] + a22[i];
        float c2Div3 = c2 * inv3;
        float aDiv3 = (c1 - c2 * c2Div3) * inv3;

        if (aDiv3 > 0.0)
            aDiv3 = 0.0;

        float mbDiv2 = 0.5 * (c0 + c2Div3 * (2.0 * c2Div3 * c2Div3 - c1));
        float q = mbDiv2 * mbDiv2 + aDiv3 * aDiv3 * aDiv3;

        if (q > 0.0)
            q = 0.0;

        float magnitude = sqrt(-aDiv3);
        float angle = atan2(sqrt(-q), mbDiv2) * inv3;
        float cs = cos(angle);
        float sn = sin(angle);
        float r0 = (c2Div3 + 2.0 * magnitude * cs);
        float r1 = (c2Div3 - magnitude * (cs + root3 * sn));
        float r2 = (c2Div3 - magnitude * (cs - root3 * sn));

        if (r0 < r1)
            swap(&r0, &r1);
        if (r0 < r2)
            swap(&r0, &r2);
        if (r1 < r2)
            swap(&r1, &r2);

        ev0[i] = r0;
        ev1[i] = r1;
        ev2[i] = r2;
    }
}
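For reference, both paths above compute the eigenvalues of the symmetric 3x3 matrix A as the roots of its characteristic polynomial, solved in closed form by the trigonometric method; in the code's coefficients (our summary, not from the source):

    \lambda^3 - c_2 \lambda^2 + c_1 \lambda - c_0 = 0, \qquad
    c_2 = \operatorname{tr} A, \quad
    c_1 = \textstyle\sum \text{principal } 2 \times 2 \text{ minors}, \quad
    c_0 = \det A

With m = \sqrt{-a/3} and \theta = \tfrac{1}{3}\operatorname{atan2}(\sqrt{-q}, -b/2) (aDiv3 = a/3, mbDiv2 = -b/2, q as in the code), the roots are

    r_0 = \tfrac{c_2}{3} + 2m\cos\theta, \qquad
    r_{1,2} = \tfrac{c_2}{3} - m(\cos\theta \pm \sqrt{3}\sin\theta),

which the code then sorts before storing into ev0 >= ev1 >= ev2.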
Example #16
		while( i-- ) {
			sad += Math::abs( *src1++ - *src2++ );
		}

		// Zero upper half of AVX registers to avoid AVX-SSE transition penalties
		_mm256_zeroupper( );

		return sad;
	}

	float SIMDAVX::SSD( const float* src1, const float* src2, const size_t n ) const
	{
		size_t i = n >> 3;

		__m256 a, b, diff, sqr, sum;
		sum = _mm256_setzero_ps( );
		if( ( ( size_t ) src1 | ( size_t ) src2 ) & 0x1f ) {
			while( i-- ) {
				a = _mm256_loadu_ps( src1 );
				b = _mm256_loadu_ps( src2 );
				diff = _mm256_sub_ps( a, b );
				sqr = _mm256_mul_ps( diff, diff );
				sum = _mm256_add_ps( sum, sqr );
				src1 += 8; src2 += 8;
			}
		} else {
			while( i-- ) {
				a = _mm256_load_ps( src1 );
				b = _mm256_load_ps( src2 );
				diff = _mm256_sub_ps( a, b );
				sqr = _mm256_mul_ps( diff, diff );
Example #17
void
calc_dnn_fma(float *dst, float *src, float *w, float *b, int out, int in, float *fstore)
{
#ifdef HAS_SIMD_FMA

  float *s;
  int i, j;
  int n = in / 8;   /* number of 8-float blocks; 'in' is assumed to be a multiple of 8 */

  for (i = 0; i + 3 < out; i += 4) {
    float *w2, *w3, *w4;
    __m256 x1 = _mm256_setzero_ps();
    __m256 x2 = _mm256_setzero_ps();
    __m256 x3 = _mm256_setzero_ps();
    __m256 x4 = _mm256_setzero_ps();
    w2 = w + in;
    w3 = w2 + in;
    w4 = w3 + in;
    s = src;
    for (j = 0; j < n; j++) {
      __m256 vs = _mm256_load_ps(s);
      __m256 vw1 = _mm256_load_ps(w);
      __m256 vw2 = _mm256_load_ps(w2);
      __m256 vw3 = _mm256_load_ps(w3);
      __m256 vw4 = _mm256_load_ps(w4);
      x1 = _mm256_fmadd_ps(vs, vw1, x1);
      x2 = _mm256_fmadd_ps(vs, vw2, x2);
      x3 = _mm256_fmadd_ps(vs, vw3, x3);
      x4 = _mm256_fmadd_ps(vs, vw4, x4);
      s  += 8;
      w  += 8;
      w2 += 8;
      w3 += 8;
      w4 += 8;
    }
    _mm256_store_ps(fstore, x1);
    *(dst++) = fstore[0] + fstore[1] + fstore[2] + fstore[3] + fstore[4] + fstore[5] + fstore[6] + fstore[7] + *(b++);
    _mm256_store_ps(fstore, x2);
    *(dst++) = fstore[0] + fstore[1] + fstore[2] + fstore[3] + fstore[4] + fstore[5] + fstore[6] + fstore[7] + *(b++);
    _mm256_store_ps(fstore, x3);
    *(dst++) = fstore[0] + fstore[1] + fstore[2] + fstore[3] + fstore[4] + fstore[5] + fstore[6] + fstore[7] + *(b++);
    _mm256_store_ps(fstore, x4);
    *(dst++) = fstore[0] + fstore[1] + fstore[2] + fstore[3] + fstore[4] + fstore[5] + fstore[6] + fstore[7] + *(b++);
    w = w4;
  }

  /* process last <4 nodes */
  for (; i < out; i++) {
    __m256 x = _mm256_setzero_ps();
    s = src;
    for (j = 0; j < n; j++) {
      __m256 vs = _mm256_load_ps(s);
      __m256 v = _mm256_load_ps(w);
      x = _mm256_fmadd_ps(vs, v, x);
      s  += 8;
      w  += 8;
    }
    _mm256_store_ps(fstore, x);
    *(dst++) = fstore[0] + fstore[1] + fstore[2] + fstore[3] + fstore[4] + fstore[5] + fstore[6] + fstore[7] + *(b++);
  }
  
#endif	/* HAS_SIMD_FMA */
}
Example #18
__m256 mm256_cos_ps(__m256 x) {
  __m256 xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
  __m256i emm0, emm2;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(__m256*)m256_ps_inv_sign_mask);
  
  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(__m256*)m256_ps_cephes_FOPI);

  /* store the integer part of y in mm0 */
  emm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm256_add_epi32(emm2, *(__m256i*)m256_pi32_1);
  emm2 = _mm256_and_si256(emm2, *(__m256i*)m256_pi32_inv1);
  y = _mm256_cvtepi32_ps(emm2);

  emm2 = _mm256_sub_epi32(emm2, *(__m256i*)m256_pi32_2);
  
  /* get the swap sign flag */
  emm0 = _mm256_andnot_si256(emm2, *(__m256i*)m256_pi32_4);
  emm0 = _mm256_slli_epi32(emm0, 29);
  /* get the polynom selection mask */
  emm2 = _mm256_and_si256(emm2, *(__m256i*)m256_pi32_2);
  emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());
  
  __m256 sign_bit = _mm256_castsi256_ps(emm0);
  __m256 poly_mask = _mm256_castsi256_ps(emm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(__m256*)m256_ps_minus_cephes_DP1;
  xmm2 = *(__m256*)m256_ps_minus_cephes_DP2;
  xmm3 = *(__m256*)m256_ps_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);
  
  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
  y = *(__m256*)m256_ps_coscof_p0;
  __m256 z = _mm256_mul_ps(x,x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  __m256 tmp = _mm256_mul_ps(z, *(__m256*)m256_ps_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_1);
  
  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */

  __m256 y2 = *(__m256*)m256_ps_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(__m256*)m256_ps_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(__m256*)m256_ps_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynoms */  
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y,y2);
  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);

  _mm256_zeroupper();
  return y;
}
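A usage sketch for the kernel above, of our own making; the m256_ps_* / m256_pi32_* constant tables are assumed to be defined as in the original sse_mathfun-style source:

#include <stdio.h>
#include <math.h>

int main(void)
{
  __m256 x = _mm256_setr_ps(0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 2.5f, 3.0f, 3.5f);
  float out[8];
  _mm256_storeu_ps(out, mm256_cos_ps(x));
  for (int i = 0; i < 8; ++i)
    printf("%f (ref %f)\n", out[i], cosf(0.5f * (float)i));  // agrees to ~1e-7
  return 0;
}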
Example #19
static inline void rectifier_kernel_avx(float *a, const size_t blocks) {
    for (size_t i = 0; i < blocks; ++i) {
        _mm256_store_ps( &a[i*8], _mm256_max_ps( _mm256_load_ps( &a[i*8] ) , _mm256_setzero_ps() ) );
    }
}
Example #20
static inline void rectifier_kernel_avx10(float *a, const size_t blocks) {
    for (size_t i = 0; i < blocks; ++i) {
        _mm256_store_ps( &a[i*8*10 + 0*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*10 + 0*8] ) , _mm256_setzero_ps() ) );
        _mm256_store_ps( &a[i*8*10 + 1*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*10 + 1*8] ) , _mm256_setzero_ps() ) );
        _mm256_store_ps( &a[i*8*10 + 2*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*10 + 2*8] ) , _mm256_setzero_ps() ) );
        _mm256_store_ps( &a[i*8*10 + 3*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*10 + 3*8] ) , _mm256_setzero_ps() ) );
        _mm256_store_ps( &a[i*8*10 + 4*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*10 + 4*8] ) , _mm256_setzero_ps() ) );
        _mm256_store_ps( &a[i*8*10 + 5*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*10 + 5*8] ) , _mm256_setzero_ps() ) );
        _mm256_store_ps( &a[i*8*10 + 6*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*10 + 6*8] ) , _mm256_setzero_ps() ) );
        _mm256_store_ps( &a[i*8*10 + 7*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*10 + 7*8] ) , _mm256_setzero_ps() ) );
        _mm256_store_ps( &a[i*8*10 + 8*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*10 + 8*8] ) , _mm256_setzero_ps() ) );
        _mm256_store_ps( &a[i*8*10 + 9*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*10 + 9*8] ) , _mm256_setzero_ps() ) );
    }
}
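Both kernels above implement an in-place vectorized ReLU, a[i] = max(a[i], 0); a scalar reference of our own for comparison:

static inline void rectifier_kernel_scalar(float *a, const size_t n)
{
    for (size_t i = 0; i < n; ++i)
        a[i] = a[i] > 0.0f ? a[i] : 0.0f;  /* max(a[i], 0) */
}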
Example #21
void kernel_ssymv_4_lib8(int kmax, int kna, float *A, int sda, float *x_n, float *y_n, float *x_t, float *y_t, int tri, int alg)
	{
	
	if(kmax<=0) 
		return;
	
	const int lda = 8;
	
	__builtin_prefetch( A + 0*lda );
	__builtin_prefetch( A + 2*lda );

	int 
		k, k_left;
	
	float 
		k_left_d;

	const float mask_f[] = {7.5, 6.5, 5.5, 4.5, 3.5, 2.5, 1.5, 0.5};

	__m256
		mask,
		zeros, temp,
		a_00, a_01, a_02, a_03,
		x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
		x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
	
	mask = _mm256_loadu_ps( mask_f ); // lane i holds 7.5 - i; compared against 8 - k_left, blendv then zeroes lanes i >= k_left

	zeros = _mm256_setzero_ps();

	x_n_0 = _mm256_broadcast_ss( &x_n[0] );
	x_n_1 = _mm256_broadcast_ss( &x_n[1] );
	x_n_2 = _mm256_broadcast_ss( &x_n[2] );
	x_n_3 = _mm256_broadcast_ss( &x_n[3] );

	if(alg==-1) // TODO xor
		{
		x_n_0 = _mm256_sub_ps( zeros, x_n_0 );
		x_n_1 = _mm256_sub_ps( zeros, x_n_1 );
		x_n_2 = _mm256_sub_ps( zeros, x_n_2 );
		x_n_3 = _mm256_sub_ps( zeros, x_n_3 );
		}

	y_t_0 = _mm256_setzero_ps();
	y_t_1 = _mm256_setzero_ps();
	y_t_2 = _mm256_setzero_ps();
	y_t_3 = _mm256_setzero_ps();
	
	k=0;

	// corner
	if(tri==1)
		{
		
		k_left = kna-k;

		k_left_d = 8.0 - k_left;
		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );

		a_00  = _mm256_loadu_ps( &A[0+lda*0] );
		a_00  = _mm256_blend_ps( a_00, zeros, 0x00 ); // no-op blend, kept for symmetry with the columns below
		y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
		a_00  = _mm256_blend_ps( a_00, zeros, 0x01 );
		temp  = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );

		a_01  = _mm256_loadu_ps( &A[0+lda*1] );
		a_01  = _mm256_blend_ps( a_01, zeros, 0x01 );
		temp  = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		a_01  = _mm256_blend_ps( a_01, zeros, 0x03 );
		temp  = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );

		a_02  = _mm256_loadu_ps( &A[0+lda*2] );
		a_02  = _mm256_blend_ps( a_02, zeros, 0x03 );
		temp  = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		a_02  = _mm256_blend_ps( a_02, zeros, 0x07 );
		temp  = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );

		a_03  = _mm256_loadu_ps( &A[0+lda*3] );
		a_03  = _mm256_blend_ps( a_03, zeros, 0x07 );
		temp  = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		a_03  = _mm256_blend_ps( a_03, zeros, 0x0f );
		temp  = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );
		
		y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		x_t_0 = _mm256_loadu_ps( &y_n[0] );
		y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
		_mm256_storeu_ps( &y_n[0], y_n_0 );
		

		A   += k_left;
		y_n += k_left;
		x_t += k_left;

		k   += k_left;

		}

	if(k<kna) // it can be only k_left = {1, 2, 3, 4, 5, 6, 7}
/*	for(; k<kna; k++)*/
		{
		
		k_left = kna-k;

		k_left_d = 8.0 - k_left;
		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		
		a_00  = _mm256_loadu_ps( &A[0+lda*0] );
		a_01  = _mm256_loadu_ps( &A[0+lda*1] );
		a_02  = _mm256_loadu_ps( &A[0+lda*2] );
		a_03  = _mm256_loadu_ps( &A[0+lda*3] );
		
		y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
		temp  = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );
		temp  = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );
		temp  = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );
		temp  = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );
		
		y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		x_t_0 = _mm256_loadu_ps( &y_n[0] );
		y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
		_mm256_storeu_ps( &y_n[0], y_n_0 );
	
		A   += k_left;
		y_n += k_left;
		x_t += k_left;

		k   += k_left;
		
		}
	if(kna>0 || tri==1)
		{
		A += (sda-1)*lda;
		}
	for(; k<kmax-7; k+=8)
		{
		
		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		y_n_0 = _mm256_loadu_ps( &y_n[0] );
		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		
		a_00  = _mm256_load_ps( &A[0+lda*0] );
		temp  = _mm256_mul_ps( a_00, x_n_0 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );

		a_01  = _mm256_load_ps( &A[0+lda*1] );
		temp  = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );

		a_02  = _mm256_load_ps( &A[0+lda*2] );
		temp  = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );

		a_03  = _mm256_load_ps( &A[0+lda*3] );
		temp  = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );
		
		_mm256_storeu_ps( &y_n[0], y_n_0 );
		

		A   += sda*lda;
		y_n += 8;
		x_t += 8;

		}
	
	if(k<kmax) // it can be only k_left = {1, 2, 3, 4, 5, 6, 7}
		{
		
		k_left = kmax-k;

		k_left_d = 8.0 - k_left;

		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		
		a_00  = _mm256_load_ps( &A[0+lda*0] );
		a_01  = _mm256_load_ps( &A[0+lda*1] );
		a_02  = _mm256_load_ps( &A[0+lda*2] );
		a_03  = _mm256_load_ps( &A[0+lda*3] );
		
		y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
		temp  = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );
		temp  = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );
		temp  = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );
		temp  = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );
		
		y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		x_t_0 = _mm256_loadu_ps( &y_n[0] );
		y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
		_mm256_storeu_ps( &y_n[0], y_n_0 );
		
		}

	// reduction
	__m128
		z_0, z_1;

	y_t_0 = _mm256_hadd_ps(y_t_0, y_t_1);
	y_t_2 = _mm256_hadd_ps(y_t_2, y_t_3);

	y_t_0 = _mm256_hadd_ps(y_t_0, y_t_2);

	y_t_1 = _mm256_permute2f128_ps(y_t_0, y_t_0, 0x01);
	
	z_0 = _mm256_castps256_ps128(y_t_0);
	z_1 = _mm256_castps256_ps128(y_t_1);
	
	z_1 = _mm_add_ps(z_0, z_1);

	if(alg==1)
		{
		z_0 = _mm_loadu_ps( &y_t[0] );

		z_0 = _mm_add_ps(z_0, z_1);

		_mm_storeu_ps( &y_t[0], z_0 );
		}
	else // alg==-1
		{
		z_0 = _mm_loadu_ps( &y_t[0] );

		z_0 = _mm_sub_ps(z_0, z_1);

		_mm_storeu_ps( &y_t[0], z_0 );
		}
	
	}
Example #22
 static inline __m256 gen_zero(void)
 {
     return _mm256_setzero_ps();
 }
Example #23
void kernel_strmv_u_t_8_lib8(int kmax, float *A, int sda, float *x, float *y, int alg)
	{

/*	if(kmax<=0) */
/*		return;*/
	
	const int lda = 8;
	
	__builtin_prefetch( A + 0*lda );
	__builtin_prefetch( A + 2*lda );
	__builtin_prefetch( A + 4*lda );
	__builtin_prefetch( A + 6*lda );

	int
		k;
	
	__m256
		zeros,
		ax_temp,
		a_00, a_01, a_02, a_03,
		x_0,
		y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7;
	
	zeros = _mm256_setzero_ps();

	y_0 = _mm256_setzero_ps();
	y_1 = _mm256_setzero_ps();
	y_2 = _mm256_setzero_ps();
	y_3 = _mm256_setzero_ps();
	y_4 = _mm256_setzero_ps();
	y_5 = _mm256_setzero_ps();
	y_6 = _mm256_setzero_ps();
	y_7 = _mm256_setzero_ps();

	k=0;
	for(; k<kmax-7; k+=8)
		{
		
		x_0 = _mm256_loadu_ps( &x[0] );

		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );
		a_01 = _mm256_load_ps( &A[0+lda*1] );
		ax_temp = _mm256_mul_ps( a_01, x_0 );
		y_1 = _mm256_add_ps( y_1, ax_temp );
		a_02 = _mm256_load_ps( &A[0+lda*2] );
		ax_temp = _mm256_mul_ps( a_02, x_0 );
		y_2 = _mm256_add_ps( y_2, ax_temp );
		a_03 = _mm256_load_ps( &A[0+lda*3] );
		ax_temp = _mm256_mul_ps( a_03, x_0 );
		y_3 = _mm256_add_ps( y_3, ax_temp );
	
		__builtin_prefetch( A + sda*lda + 4*lda );
		__builtin_prefetch( A + sda*lda + 6*lda );

		a_00 = _mm256_load_ps( &A[0+lda*4] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_4 = _mm256_add_ps( y_4, ax_temp );
		a_01 = _mm256_load_ps( &A[0+lda*5] );
		ax_temp = _mm256_mul_ps( a_01, x_0 );
		y_5 = _mm256_add_ps( y_5, ax_temp );
		a_02 = _mm256_load_ps( &A[0+lda*6] );
		ax_temp = _mm256_mul_ps( a_02, x_0 );
		y_6 = _mm256_add_ps( y_6, ax_temp );
		a_03 = _mm256_load_ps( &A[0+lda*7] );
		ax_temp = _mm256_mul_ps( a_03, x_0 );
		y_7 = _mm256_add_ps( y_7, ax_temp );

		A += sda*lda;
		x += lda;

		}

	x_0 = _mm256_loadu_ps( &x[0] );

	a_00 = _mm256_load_ps( &A[0+lda*0] );
	a_00 = _mm256_blend_ps( zeros, a_00, 0x01 );
	ax_temp = _mm256_mul_ps( a_00, x_0 );
	y_0 = _mm256_add_ps( y_0, ax_temp );
	a_01 = _mm256_load_ps( &A[0+lda*1] );
	a_01 = _mm256_blend_ps( zeros, a_01, 0x03 );
	ax_temp = _mm256_mul_ps( a_01, x_0 );
	y_1 = _mm256_add_ps( y_1, ax_temp );
	a_02 = _mm256_load_ps( &A[0+lda*2] );
	a_02 = _mm256_blend_ps( zeros, a_02, 0x07 );
	ax_temp = _mm256_mul_ps( a_02, x_0 );
	y_2 = _mm256_add_ps( y_2, ax_temp );
	a_03 = _mm256_load_ps( &A[0+lda*3] );
	a_03 = _mm256_blend_ps( zeros, a_03, 0x0f );
	ax_temp = _mm256_mul_ps( a_03, x_0 );
	y_3 = _mm256_add_ps( y_3, ax_temp );

	a_00 = _mm256_load_ps( &A[0+lda*4] );
	a_00 = _mm256_blend_ps( zeros, a_00, 0x1f );
	ax_temp = _mm256_mul_ps( a_00, x_0 );
	y_4 = _mm256_add_ps( y_4, ax_temp );
	a_01 = _mm256_load_ps( &A[0+lda*5] );
	a_01 = _mm256_blend_ps( zeros, a_01, 0x3f );
	ax_temp = _mm256_mul_ps( a_01, x_0 );
	y_5 = _mm256_add_ps( y_5, ax_temp );
	a_02 = _mm256_load_ps( &A[0+lda*6] );
	a_02 = _mm256_blend_ps( zeros, a_02, 0x7f );
	ax_temp = _mm256_mul_ps( a_02, x_0 );
	y_6 = _mm256_add_ps( y_6, ax_temp );
	a_03 = _mm256_load_ps( &A[0+lda*7] );
/*	a_03 = _mm256_blend_ps( zeros, a_03, 0xff );*/
	ax_temp = _mm256_mul_ps( a_03, x_0 );
	y_7 = _mm256_add_ps( y_7, ax_temp );

	// reduction: hadd-tree the eight accumulators, then add the two 128-bit
	// lanes so y_0 ends up holding the eight sums y[0..7]
	__m256
		z_0;

	y_0 = _mm256_hadd_ps(y_0, y_1);
	y_2 = _mm256_hadd_ps(y_2, y_3);
	y_4 = _mm256_hadd_ps(y_4, y_5);
	y_6 = _mm256_hadd_ps(y_6, y_7);

	y_0 = _mm256_hadd_ps(y_0, y_2);
	y_4 = _mm256_hadd_ps(y_4, y_6);

	y_1 = _mm256_permute2f128_ps(y_0, y_4, 0x20);
	y_2 = _mm256_permute2f128_ps(y_0, y_4, 0x31);
	
	y_0 = _mm256_add_ps(y_1, y_2);

	// store
	if(alg==0)
		{
		_mm256_storeu_ps(&y[0], y_0);
		}
	else if(alg==1)
		{
		z_0 = _mm256_loadu_ps( &y[0] );

		z_0 = _mm256_add_ps(z_0, y_0);

		_mm256_storeu_ps(&y[0], z_0);
		}
	else // alg==-1
		{
		z_0 = _mm256_loadu_ps( &y[0] );

		z_0 = _mm256_sub_ps(z_0, y_0);

		_mm256_storeu_ps(&y[0], z_0);
		}

	}
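
A minimal scalar sketch of what this kernel computes, assuming the lib8 panel layout (8-row panels, panel stride sda*8) and that the trailing 8x8 block is upper triangular; it mirrors the kernel's loop structure, and the blend masks 0x01..0x7f above correspond to column j of that block using only rows 0..j:

static void strmv_u_t_8_ref(int kmax, const float *A, int sda, const float *x, float *y, int alg)
	{
	const int lda = 8;
	float acc[8] = {0.0f};
	int k, i, j;
	for(k=0; k<kmax-7; k+=8)              // full 8-row panels
		{
		for(j=0; j<8; j++)
			for(i=0; i<8; i++)
				acc[j] += A[i+lda*j] * x[i];
		A += sda*lda;
		x += lda;
		}
	for(j=0; j<8; j++)                    // triangular tail: column j uses rows 0..j
		for(i=0; i<=j; i++)
			acc[j] += A[i+lda*j] * x[i];
	for(j=0; j<8; j++)
		{
		if(alg==0)      y[j]  = acc[j];
		else if(alg==1) y[j] += acc[j];
		else            y[j] -= acc[j];   // alg==-1
		}
	}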
Beispiel #24
0
inline void newsincos_ps(avx_m256_t x, avx_m256_t *s, avx_m256_t *c) {
	avx_m256_t sign_bit = _mm256_and_ps(x, _ps_sign_mask);
	x = _mm256_and_ps(x, _ps_inv_sign_mask);

	avx_m256_t y = _mm256_mul_ps(x, _ps_cephes_FOPI);

	//avx_m256i_t emm2 = _mm256_cvttps_epi32(y);
	//emm2 = _mm256_add_epi32(emm2, _pi32_1);
	avx_m256i_t emm2 = _mm256_cvttps_epi32(_mm256_add_ps(y, _ps_1));

	//emm2 = _mm256_and_si256(emm2, _pi32_inv1);
	emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2), _mm256_castsi256_ps(_pi32_inv1)));

	y = _mm256_cvtepi32_ps(emm2);

	//avx_m256i_t cos_emm2 = _mm256_sub_epi32(emm2, _pi32_2);
	avx_m256i_t cos_emm2 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm2), _ps_2));

	//avx_m256i_t emm0 = _mm256_and_si256(emm2, _pi32_4);
	avx_m256i_t emm0 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2),
											_mm256_castsi256_ps(_pi32_4)));

	//avx_m256i_t cos_emm0 = _mm256_andnot_si256(cos_emm2, _pi32_4);
	avx_m256i_t cos_emm0 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm2),
											_mm256_castsi256_ps(_pi32_4)));

	//emm0 = _mm256_slli_epi32(emm0, 29);
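	// AVX1 has no 256-bit integer shift, so _mm256_slli_epi32 is emulated by
	// splitting the vector into 128-bit halves, shifting each with SSE2, and
	// reinserting. (Note: lane 0 of _mm256_extractf128_si256 is the *low*
	// half, so the "hi"/"lo" names below are swapped; the result is correct.)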
	__m128i emm0hi = _mm256_extractf128_si256(emm0, 0);
	__m128i emm0lo = _mm256_extractf128_si256(emm0, 1);
	emm0hi = _mm_slli_epi32(emm0hi, 29);
	emm0lo = _mm_slli_epi32(emm0lo, 29);
	emm0 = _mm256_insertf128_si256(emm0, emm0hi, 0);
	emm0 = _mm256_insertf128_si256(emm0, emm0lo, 1);

	//cos_emm0 = _mm256_slli_epi32(cos_emm0, 29);
	__m128i cos_emm0hi = _mm256_extractf128_si256(cos_emm0, 0);
	__m128i cos_emm0lo = _mm256_extractf128_si256(cos_emm0, 1);
	cos_emm0hi = _mm_slli_epi32(cos_emm0hi, 29);
	cos_emm0lo = _mm_slli_epi32(cos_emm0lo, 29);
	cos_emm0 = _mm256_insertf128_si256(cos_emm0, cos_emm0hi, 0);
	cos_emm0 = _mm256_insertf128_si256(cos_emm0, cos_emm0lo, 1);

	//emm2 = _mm256_and_si256(emm2, _pi32_2);
	emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2),
											_mm256_castsi256_ps(_pi32_2)));

	//cos_emm2 = _mm256_and_si256(cos_emm2, _pi32_2);
	cos_emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm2),
											_mm256_castsi256_ps(_pi32_2)));

	//emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());
	emm2 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm2), _mm256_setzero_ps(), _CMP_EQ_UQ));

	//cos_emm2 = _mm256_cmpeq_epi32(cos_emm2, _mm256_setzero_si256());
	cos_emm2 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm2), _mm256_setzero_ps(), _CMP_EQ_UQ));

	avx_m256_t emm0f = _mm256_castsi256_ps(emm0);
	avx_m256_t emm2f = _mm256_castsi256_ps(emm2);
	avx_m256_t cos_emm0f = _mm256_castsi256_ps(cos_emm0);
	avx_m256_t cos_emm2f = _mm256_castsi256_ps(cos_emm2);

	sign_bit = _mm256_xor_ps(sign_bit, emm0f);

	avx_m256_t temp_2 = _ps_minus_cephes_DP123;
	temp_2 = _mm256_mul_ps(y, temp_2);
	x = _mm256_add_ps(x, temp_2);

	avx_m256_t x2 = _mm256_mul_ps(x, x);
	avx_m256_t x3 = _mm256_mul_ps(x2, x);
	avx_m256_t x4 = _mm256_mul_ps(x2, x2);

	y = _ps_coscof_p0;
	avx_m256_t y2 = _ps_sincof_p0;
	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);
	y = _mm256_add_ps(y, _ps_coscof_p1);
	y2 = _mm256_add_ps(y2, _ps_sincof_p1);
	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);
	y = _mm256_add_ps(y, _ps_coscof_p2);
	y2 = _mm256_add_ps(y2, _ps_sincof_p2);
	y = _mm256_mul_ps(y, x4);
	y2 = _mm256_mul_ps(y2, x3);
	temp_2 = _mm256_mul_ps(x2, _ps_0p5);
	y2 = _mm256_add_ps(y2, x);
	temp_2 = _mm256_sub_ps(temp_2, _ps_1);
	y = _mm256_sub_ps(y, temp_2);

	avx_m256_t cos_y = y;
	avx_m256_t cos_y2 = y2;
	y = _mm256_andnot_ps(emm2f, y);
	cos_y = _mm256_andnot_ps(cos_emm2f, cos_y);
	y2 = _mm256_and_ps(emm2f, y2);
	cos_y2 = _mm256_and_ps(cos_emm2f, cos_y2);
	y = _mm256_add_ps(y, y2);
	cos_y = _mm256_add_ps(cos_y, cos_y2);

	*s = _mm256_xor_ps(y, sign_bit);
	*c = _mm256_xor_ps(cos_y, cos_emm0f);
} // newsincos_ps()
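
A hedged verification sketch for newsincos_ps, assuming avx_m256_t is __m256 and the _ps_* constants come from the same cephes-style header used above:

#include <immintrin.h>
#include <math.h>

static int check_newsincos(void) {
	float in[8], sv[8], cv[8];
	for (int i = 0; i < 8; i++) in[i] = 0.3f * (float)i - 1.0f;
	avx_m256_t s, c;
	newsincos_ps(_mm256_loadu_ps(in), &s, &c);
	_mm256_storeu_ps(sv, s);
	_mm256_storeu_ps(cv, c);
	int ok = 1;
	for (int i = 0; i < 8; i++)
		ok &= (fabsf(sv[i] - sinf(in[i])) < 1e-4f) &&
		      (fabsf(cv[i] - cosf(in[i])) < 1e-4f);
	return ok; // 1 if all lanes match libm within tolerance
}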
Beispiel #25
0
void sLine_onMessage(HvBase *_c, SignalLine *o, int letIn,
  const HvMessage * const m, void *sendMessage) {
  if (msg_isFloat(m,0)) {
    if (msg_isFloat(m,1)) {
      // new ramp
      int n = ctx_millisecondsToSamples(_c, msg_getFloat(m,1));
#if HV_SIMD_AVX
      float x = (o->n[1] > 0) ? (o->x[7] + (o->m[7]/8.0f)) : o->t[7]; // current output value
      float s = (msg_getFloat(m,0) - x) / ((float) n); // slope per sample
      o->n = _mm_set_epi32(n-3, n-2, n-1, n);
      o->x = _mm256_set_ps(x+7.0f*s, x+6.0f*s, x+5.0f*s, x+4.0f*s, x+3.0f*s, x+2.0f*s, x+s, x);
      o->m = _mm256_set1_ps(8.0f*s);
      o->t = _mm256_set1_ps(msg_getFloat(m,0));
#elif HV_SIMD_SSE
      float x = (o->n[1] > 0) ? (o->x[3] + (o->m[3]/4.0f)) : o->t[3];
      float s = (msg_getFloat(m,0) - x) / ((float) n); // slope per sample
      o->n = _mm_set_epi32(n-3, n-2, n-1, n);
      o->x = _mm_set_ps(x+3.0f*s, x+2.0f*s, x+s, x);
      o->m = _mm_set1_ps(4.0f*s);
      o->t = _mm_set1_ps(msg_getFloat(m,0));
#elif HV_SIMD_NEON
      float x = (o->n[3] > 0) ? (o->x[3] + (o->m[3]/4.0f)) : o->t[3];
      float s = (msg_getFloat(m,0) - x) / ((float) n);
      o->n = (int32x4_t) {n, n-1, n-2, n-3};
      o->x = (float32x4_t) {x, x+s, x+2.0f*s, x+3.0f*s};
      o->m = vdupq_n_f32(4.0f*s);
      o->t = vdupq_n_f32(msg_getFloat(m,0));
#else // HV_SIMD_NONE
      o->x = (o->n > 0) ? (o->x + o->m) : o->t; // new current value
      o->n = n; // new distance to target
      o->m = (msg_getFloat(m,0) - o->x) / ((float) n); // slope per sample
      o->t = msg_getFloat(m,0);
#endif
    } else {
      // Jump to value
#if HV_SIMD_AVX
      o->n = _mm_setzero_si128();
      o->x = _mm256_set1_ps(msg_getFloat(m,0));
      o->m = _mm256_setzero_ps();
      o->t = _mm256_set1_ps(msg_getFloat(m,0));
#elif HV_SIMD_SSE
      o->n = _mm_setzero_si128();
      o->x = _mm_set1_ps(msg_getFloat(m,0));
      o->m = _mm_setzero_ps();
      o->t = _mm_set1_ps(msg_getFloat(m,0));
#elif HV_SIMD_NEON
      o->n = vdupq_n_s32(0);
      o->x = vdupq_n_f32(msg_getFloat(m,0));
      o->m = vdupq_n_f32(0.0f);
      o->t = vdupq_n_f32(msg_getFloat(m,0));
#else // HV_SIMD_NONE
      o->n = 0;
      o->x = msg_getFloat(m,0);
      o->m = 0.0f;
      o->t = msg_getFloat(m,0);
#endif
    }
  } else if (msg_compareSymbol(m,0,"stop")) {
    // Stop line at current position
#if HV_SIMD_AVX
    // note: o->n[1] reads a 64-bit lane holding two packed 32-bit ints. We only
    // need to know whether the upper int is positive, and testing the whole
    // 64-bit value for positivity answers that.
    float x = (o->n[1] > 0) ? (o->x[7] + (o->m[7]/8.0f)) : o->t[7];
    o->n = _mm_setzero_si128();
    o->x = _mm256_set1_ps(x);
    o->m = _mm256_setzero_ps();
    o->t = _mm256_set1_ps(x);
#elif HV_SIMD_SSE
    float x = (o->n[1] > 0) ? (o->x[3] + (o->m[3]/4.0f)) : o->t[3];
    o->n = _mm_setzero_si128();
    o->x = _mm_set1_ps(x);
    o->m = _mm_setzero_ps();
    o->t = _mm_set1_ps(x);
#elif HV_SIMD_NEON
    float x = (o->n[3] > 0) ? (o->x[3] + (o->m[3]/4.0f)) : o->t[3];
    o->n = vdupq_n_s32(0);
    o->x = vdupq_n_f32(x);
    o->m = vdupq_n_f32(0.0f);
    o->t = vdupq_n_f32(x);
#else // HV_SIMD_NONE
    o->n = 0;
    o->x += o->m;
    o->m = 0.0f;
    o->t = o->x;
#endif
  }
}
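
For reference, a hedged scalar sketch of how a process step would consume this state (mirroring the HV_SIMD_NONE fields; sLine_tick_sketch is illustrative, not part of the Heavy API):

static inline float sLine_tick_sketch(int *n, float *x, float m, float t) {
  float out;
  if (*n > 0) { out = *x; *x += m; (*n)--; } // still ramping: emit and advance
  else { out = t; }                          // ramp done: hold the target
  return out;
}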
Beispiel #26
0
inline void newsincos_ps_dual(avx_m256_t x1, avx_m256_t x2, avx_m256_t *s1, avx_m256_t *s2,
						avx_m256_t *c1, avx_m256_t *c2) {
	avx_m256_t tempa = _ps_sign_mask;
	avx_m256_t tempb = _ps_inv_sign_mask;
	avx_m256_t sign_bit1 = _mm256_and_ps(x1, tempa);
	avx_m256_t sign_bit2 = _mm256_and_ps(x2, tempa);
	x1 = _mm256_and_ps(x1, tempb);
	x2 = _mm256_and_ps(x2, tempb);

	tempa = _ps_cephes_FOPI;
	avx_m256_t y1 = _mm256_mul_ps(x1, tempa);
	avx_m256_t y2 = _mm256_mul_ps(x2, tempa);

	//avx_m256i_t emm21 = _mm256_cvttps_epi32(y1);
	//avx_m256i_t emm22 = _mm256_cvttps_epi32(y2);
	//emm21 = _mm256_add_epi32(emm21, _pi32_1);
	//emm22 = _mm256_add_epi32(emm22, _pi32_1);
	avx_m256i_t emm21 = _mm256_cvttps_epi32(_mm256_add_ps(y1, _ps_1));
	avx_m256i_t emm22 = _mm256_cvttps_epi32(_mm256_add_ps(y2, _ps_1));

	//emm21 = _mm256_and_si256(emm21, _pi32_inv1);
	//emm22 = _mm256_and_si256(emm22, _pi32_inv1);
	emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21), _mm256_castsi256_ps(_pi32_inv1)));
	emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22), _mm256_castsi256_ps(_pi32_inv1)));

	y1 = _mm256_cvtepi32_ps(emm21);
	y2 = _mm256_cvtepi32_ps(emm22);

	//avx_m256i_t tempia = _pi32_2;
	//avx_m256i_t cos_emm21 = _mm256_sub_epi32(emm21, tempia);
	//avx_m256i_t cos_emm22 = _mm256_sub_epi32(emm22, tempia);
	avx_m256i_t cos_emm21 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm21), _ps_2));
	avx_m256i_t cos_emm22 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm22), _ps_2));

	//avx_m256i_t tempib = _pi32_4;
	//avx_m256i_t emm01 = _mm256_and_si256(emm21, tempib);
	//avx_m256i_t emm02 = _mm256_and_si256(emm22, tempib);
	avx_m256i_t emm01 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21),
											_mm256_castsi256_ps(_pi32_4)));
	avx_m256i_t emm02 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22),
											_mm256_castsi256_ps(_pi32_4)));

	//avx_m256i_t cos_emm01 = _mm256_andnot_si256(cos_emm21, tempib);
	//avx_m256i_t cos_emm02 = _mm256_andnot_si256(cos_emm22, tempib);
	avx_m256i_t cos_emm01 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm21),
											_mm256_castsi256_ps(_pi32_4)));
	avx_m256i_t cos_emm02 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm22),
											_mm256_castsi256_ps(_pi32_4)));

	//emm01 = _mm256_slli_epi32(emm01, 29);
	__m128i emm0hi1 = _mm256_extractf128_si256(emm01, 0);
	__m128i emm0lo1 = _mm256_extractf128_si256(emm01, 1);
	emm0hi1 = _mm_slli_epi32(emm0hi1, 29);
	emm0lo1 = _mm_slli_epi32(emm0lo1, 29);
	emm01 = _mm256_insertf128_si256(emm01, emm0hi1, 0);
	emm01 = _mm256_insertf128_si256(emm01, emm0lo1, 1);

	//emm02 = _mm256_slli_epi32(emm02, 29);
	__m128i emm0hi2 = _mm256_extractf128_si256(emm02, 0);
	__m128i emm0lo2 = _mm256_extractf128_si256(emm02, 1);
	emm0hi2 = _mm_slli_epi32(emm0hi2, 29);
	emm0lo2 = _mm_slli_epi32(emm0lo2, 29);
	emm02 = _mm256_insertf128_si256(emm02, emm0hi2, 0);
	emm02 = _mm256_insertf128_si256(emm02, emm0lo2, 1);

	//cos_emm01 = _mm256_slli_epi32(cos_emm01, 29);
	__m128i cos_emm0hi1 = _mm256_extractf128_si256(cos_emm01, 0);
	__m128i cos_emm0lo1 = _mm256_extractf128_si256(cos_emm01, 1);
	cos_emm0hi1 = _mm_slli_epi32(cos_emm0hi1, 29);
	cos_emm0lo1 = _mm_slli_epi32(cos_emm0lo1, 29);
	cos_emm01 = _mm256_insertf128_si256(cos_emm01, cos_emm0hi1, 0);
	cos_emm01 = _mm256_insertf128_si256(cos_emm01, cos_emm0lo1, 1);

	//cos_emm02 = _mm256_slli_epi32(cos_emm02, 29);
	__m128i cos_emm0hi2 = _mm256_extractf128_si256(cos_emm02, 0);
	__m128i cos_emm0lo2 = _mm256_extractf128_si256(cos_emm02, 1);
	cos_emm0hi2 = _mm_slli_epi32(cos_emm0hi2, 29);
	cos_emm0lo2 = _mm_slli_epi32(cos_emm0lo2, 29);
	cos_emm02 = _mm256_insertf128_si256(cos_emm02, cos_emm0hi2, 0);
	cos_emm02 = _mm256_insertf128_si256(cos_emm02, cos_emm0lo2, 1);

	//tempia = _pi32_2;
	//tempib = _mm256_setzero_si256();
	//emm21 = _mm256_and_si256(emm21, tempia);
	//emm22 = _mm256_and_si256(emm22, tempia);
	emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21),
											_mm256_castsi256_ps(_pi32_2)));
	emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22),
											_mm256_castsi256_ps(_pi32_2)));

	//cos_emm21 = _mm256_and_si256(cos_emm21, tempia);
	//cos_emm22 = _mm256_and_si256(cos_emm22, tempia);
	cos_emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm21),
											_mm256_castsi256_ps(_pi32_2)));
	cos_emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm22),
											_mm256_castsi256_ps(_pi32_2)));

	//emm21 = _mm256_cmpeq_epi32(emm21, tempib);
	//emm22 = _mm256_cmpeq_epi32(emm22, tempib);
	emm21 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm21), _mm256_setzero_ps(), _CMP_EQ_UQ));
	emm22 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm22), _mm256_setzero_ps(), _CMP_EQ_UQ));

	//cos_emm21 = _mm256_cmpeq_epi32(cos_emm21, tempib);
	//cos_emm22 = _mm256_cmpeq_epi32(cos_emm22, tempib);
	cos_emm21 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm21), _mm256_setzero_ps(), _CMP_EQ_UQ));
	cos_emm22 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm22), _mm256_setzero_ps(), _CMP_EQ_UQ));
	
	avx_m256_t emm0f1 = _mm256_castsi256_ps(emm01);
	avx_m256_t emm0f2 = _mm256_castsi256_ps(emm02);
	avx_m256_t emm2f1 = _mm256_castsi256_ps(emm21);
	avx_m256_t emm2f2 = _mm256_castsi256_ps(emm22);
	avx_m256_t cos_emm0f1 = _mm256_castsi256_ps(cos_emm01);
	avx_m256_t cos_emm0f2 = _mm256_castsi256_ps(cos_emm02);
	avx_m256_t cos_emm2f1 = _mm256_castsi256_ps(cos_emm21);
	avx_m256_t cos_emm2f2 = _mm256_castsi256_ps(cos_emm22);

	sign_bit1 = _mm256_xor_ps(sign_bit1, emm0f1);
	sign_bit2 = _mm256_xor_ps(sign_bit2, emm0f2);

	tempa = _ps_minus_cephes_DP123;
	tempb = _mm256_mul_ps(y2, tempa);
	tempa = _mm256_mul_ps(y1, tempa);
	x2 = _mm256_add_ps(x2, tempb);
	x1 = _mm256_add_ps(x1, tempa);

	avx_m256_t x21 = _mm256_mul_ps(x1, x1);
	avx_m256_t x22 = _mm256_mul_ps(x2, x2);
	avx_m256_t x31 = _mm256_mul_ps(x21, x1);
	avx_m256_t x32 = _mm256_mul_ps(x22, x2);
	avx_m256_t x41 = _mm256_mul_ps(x21, x21);
	avx_m256_t x42 = _mm256_mul_ps(x22, x22);

	tempa = _ps_coscof_p0;
	tempb = _ps_sincof_p0;

	y1 = _mm256_mul_ps(x21, tempa);
	y2 = _mm256_mul_ps(x22, tempa);
	avx_m256_t y21 = _mm256_mul_ps(x21, tempb);
	avx_m256_t y22 = _mm256_mul_ps(x22, tempb);
	tempa = _ps_coscof_p1;
	tempb = _ps_sincof_p1;
	y1 = _mm256_add_ps(y1, tempa);
	y2 = _mm256_add_ps(y2, tempa);
	y21 = _mm256_add_ps(y21, tempb);
	y22 = _mm256_add_ps(y22, tempb);
	y1 = _mm256_mul_ps(y1, x21);
	y2 = _mm256_mul_ps(y2, x22);
	y21 = _mm256_mul_ps(y21, x21);
	y22 = _mm256_mul_ps(y22, x22);
	tempa = _ps_coscof_p2;
	tempb = _ps_sincof_p2;
	y1 = _mm256_add_ps(y1, tempa);
	y2 = _mm256_add_ps(y2, tempa);
	y21 = _mm256_add_ps(y21, tempb);
	y22 = _mm256_add_ps(y22, tempb);
	y1 = _mm256_mul_ps(y1, x41);
	y2 = _mm256_mul_ps(y2, x42);
	y21 = _mm256_mul_ps(y21, x31);
	y22 = _mm256_mul_ps(y22, x32);
	tempa = _ps_0p5;
	tempb = _ps_1;
	avx_m256_t temp_21 = _mm256_mul_ps(x21, tempa);
	avx_m256_t temp_22 = _mm256_mul_ps(x22, tempa);
	y21 = _mm256_add_ps(y21, x1);
	y22 = _mm256_add_ps(y22, x2);
	temp_21 = _mm256_sub_ps(temp_21, tempb);
	temp_22 = _mm256_sub_ps(temp_22, tempb);
	y1 = _mm256_sub_ps(y1, temp_21);
	y2 = _mm256_sub_ps(y2, temp_22);

	avx_m256_t cos_y1 = y1;
	avx_m256_t cos_y2 = y2;
	avx_m256_t cos_y21 = y21;
	avx_m256_t cos_y22 = y22;
	y1 = _mm256_andnot_ps(emm2f1, y1);
	y2 = _mm256_andnot_ps(emm2f2, y2);
	cos_y1 = _mm256_andnot_ps(cos_emm2f1, cos_y1);
	cos_y2 = _mm256_andnot_ps(cos_emm2f2, cos_y2);
	y21 = _mm256_and_ps(emm2f1, y21);
	y22 = _mm256_and_ps(emm2f2, y22);
	cos_y21 = _mm256_and_ps(cos_emm2f1, cos_y21);
	cos_y22 = _mm256_and_ps(cos_emm2f2, cos_y22);
	y1 = _mm256_add_ps(y1, y21);
	y2 = _mm256_add_ps(y2, y22);
	cos_y1 = _mm256_add_ps(cos_y1, cos_y21);
	cos_y2 = _mm256_add_ps(cos_y2, cos_y22);

	*s1 = _mm256_xor_ps(y1, sign_bit1);
	*s2 = _mm256_xor_ps(y2, sign_bit2);
	*c1 = _mm256_xor_ps(cos_y1, cos_emm0f1);
	*c2 = _mm256_xor_ps(cos_y2, cos_emm0f2);
} // newsincos_ps_dual()
Beispiel #27
0
CPLErr
GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX(
    const void *poOptions,
    GUInt32 nPoints,
    CPL_UNUSED const double *unused_padfX,
    CPL_UNUSED const double *unused_padfY,
    CPL_UNUSED const double *unused_padfZ,
    double dfXPoint, double dfYPoint,
    double *pdfValue,
    void* hExtraParamsIn )
{
    size_t i = 0;
    GDALGridExtraParameters* psExtraParams = (GDALGridExtraParameters*) hExtraParamsIn;
    const float* pafX = psExtraParams->pafX;
    const float* pafY = psExtraParams->pafY;
    const float* pafZ = psExtraParams->pafZ;

    const float fEpsilon = 0.0000000000001f;
    const float fXPoint = (float)dfXPoint;
    const float fYPoint = (float)dfYPoint;
    const __m256 ymm_small = GDAL_mm256_load1_ps(fEpsilon);
    const __m256 ymm_x = GDAL_mm256_load1_ps(fXPoint);
    const __m256 ymm_y = GDAL_mm256_load1_ps(fYPoint);
    __m256 ymm_nominator = _mm256_setzero_ps();
    __m256 ymm_denominator = _mm256_setzero_ps();
    int mask = 0;

#undef LOOP_SIZE
#if defined(__x86_64) || defined(_M_X64)
    /* This would also work in 32bit mode, but there are only 8 XMM registers */
    /* whereas we have 16 for 64bit */
#define LOOP_SIZE   16
    size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
    for ( i = 0; i < nPointsRound; i += LOOP_SIZE )
    {
        __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps(pafX + i), ymm_x);            /* rx = pafX[i] - fXPoint */
        __m256 ymm_rx_8 = _mm256_sub_ps(_mm256_load_ps(pafX + i + 8), ymm_x);
        __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps(pafY + i), ymm_y);            /* ry = pafY[i] - fYPoint */
        __m256 ymm_ry_8 = _mm256_sub_ps(_mm256_load_ps(pafY + i + 8), ymm_y);
        __m256 ymm_r2 = _mm256_add_ps(_mm256_mul_ps(ymm_rx, ymm_rx),               /* r2 = rx * rx + ry * ry */
                                   _mm256_mul_ps(ymm_ry, ymm_ry));
        __m256 ymm_r2_8 = _mm256_add_ps(_mm256_mul_ps(ymm_rx_8, ymm_rx_8),
                                     _mm256_mul_ps(ymm_ry_8, ymm_ry_8));
        __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2);                               /* invr2 = 1.0f / r2 */
        __m256 ymm_invr2_8 = _mm256_rcp_ps(ymm_r2_8);
        ymm_nominator = _mm256_add_ps(ymm_nominator,                            /* nominator += invr2 * pafZ[i] */
                            _mm256_mul_ps(ymm_invr2, _mm256_load_ps(pafZ + i)));
        ymm_nominator = _mm256_add_ps(ymm_nominator,
                            _mm256_mul_ps(ymm_invr2_8, _mm256_load_ps(pafZ + i + 8)));
        ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2);           /* denominator += invr2 */
        ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2_8);
        mask = _mm256_movemask_ps(_mm256_cmp_ps(ymm_r2, ymm_small, _CMP_LT_OS)) |           /* if( r2 < fEpsilon) */
              (_mm256_movemask_ps(_mm256_cmp_ps(ymm_r2_8, ymm_small, _CMP_LT_OS)) << 8);
        if( mask )
            break;
    }
#else
#define LOOP_SIZE   8
    size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
    for ( i = 0; i < nPointsRound; i += LOOP_SIZE )
    {
        __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps((float*)pafX + i), ymm_x);           /* rx = pafX[i] - fXPoint */
        __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps((float*)pafY + i), ymm_y);           /* ry = pafY[i] - fYPoint */
        __m256 ymm_r2 = _mm256_add_ps(_mm256_mul_ps(ymm_rx, ymm_rx),              /* r2 = rx * rx + ry * ry */
                                   _mm256_mul_ps(ymm_ry, ymm_ry));
        __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2);                              /* invr2 = 1.0f / r2 */
        ymm_nominator = _mm256_add_ps(ymm_nominator,                           /* nominator += invr2 * pafZ[i] */
                            _mm256_mul_ps(ymm_invr2, _mm256_load_ps((float*)pafZ + i)));
        ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2);           /* denominator += invr2 */
        mask = _mm256_movemask_ps(_mm256_cmp_ps(ymm_r2, ymm_small, _CMP_LT_OS));            /* if( r2 < fEpsilon) */
        if( mask )
            break;
    }
#endif

    /* Find which i triggered r2 < fEpsilon */
    if( mask )
    {
        for(int j = 0; j < LOOP_SIZE; j++ )
        {
            if( mask & (1 << j) )
            {
                (*pdfValue) = (pafZ)[i + j];

                // GCC and MSVC need explicit zeroing
#if !defined(__clang__)
                _mm256_zeroupper();
#endif
                return CE_None;
            }
        }
    }
#undef LOOP_SIZE

    /* Get back nominator and denominator values for YMM registers */
    float afNominator[8], afDenominator[8];
    _mm256_storeu_ps(afNominator, ymm_nominator);
    _mm256_storeu_ps(afDenominator, ymm_denominator);

    // MSVC doesn't emit AVX afterwards but may use SSE, so clear upper bits
    // Other compilers will continue using AVX for the below floating points operations
#if defined(_MSC_FULL_VER)
    _mm256_zeroupper();
#endif

    float fNominator = afNominator[0] + afNominator[1] +
                       afNominator[2] + afNominator[3] +
                       afNominator[4] + afNominator[5] +
                       afNominator[6] + afNominator[7];
    float fDenominator = afDenominator[0] + afDenominator[1] +
                         afDenominator[2] + afDenominator[3] +
                         afDenominator[4] + afDenominator[5] +
                         afDenominator[6] + afDenominator[7];

    /* Do the few remaining loop iterations */
    for ( ; i < nPoints; i++ )
    {
        const float fRX = pafX[i] - fXPoint;
        const float fRY = pafY[i] - fYPoint;
        const float fR2 =
            fRX * fRX + fRY * fRY;

        // If the test point is close to the grid node, use the point
        // value directly as a node value to avoid singularity.
        if ( fR2 < 0.0000000000001 )
        {
            break;
        }
        else
        {
            const float fInvR2 = 1.0f / fR2;
            fNominator += fInvR2 * pafZ[i];
            fDenominator += fInvR2;
        }
    }

    if( i != nPoints )
    {
        (*pdfValue) = pafZ[i];
    }
    else
    if ( fDenominator == 0.0 )
    {
        (*pdfValue) =
            ((GDALGridInverseDistanceToAPowerOptions*)poOptions)->dfNoDataValue;
    }
    else
        (*pdfValue) = fNominator / fDenominator;

    // GCC needs explicit zeroing
#if defined(__GNUC__) && !defined(__clang__)
    _mm256_zeroupper();
#endif

    return CE_None;
}
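
For reference (hedged): with power p = 2 and no smoothing, the interpolated value is Z = (sum_i z_i / r_i^2) / (sum_i 1 / r_i^2), which is exactly what the ymm_nominator / ymm_denominator accumulators build eight points at a time, and what the scalar remainder loop repeats; _mm256_rcp_ps supplies an approximate reciprocal (about 12 bits of precision), trading accuracy for speed versus a true division.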
Beispiel #28
0
void run_dct(int width, int height, float *quant, float *input, int32_t *output)
{
  float acosvals[8][8];

  /* Calculating cosines is expensive, and there
   * are only 64 cosines that need to be calculated
   * so precompute them and cache. */
  for (int i = 0; i < 8; i++)
  {
    for (int j = 0; j < 8; j++)
    {
      if (j == 0)
      {
        acosvals[i][j] = sqrt(1.0 / 8.0) * cos(PI / 8.0 * (i + 0.5) * j);
      }
      else
      {
        acosvals[i][j] = 0.5 * cos(PI / 8.0 * (i + 0.5) * j);
      }
    }
  }
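
  /* Reference (hedged): acosvals[i][j] = a(j) * cos((2*i+1)*j*PI/16), with
   * a(0) = sqrt(1/8) and a(j>0) = 1/2, so the block loops below compute the
   * level-shifted 2-D DCT-II
   *   F(u,v) = a(u)*a(v) * sum_{x,y} (p(x,y) - 128)
   *                        * cos((2x+1)*u*PI/16) * cos((2y+1)*v*PI/16)
   * followed by per-coefficient quantization. */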

/* Separate the parallel from the for, so each thread gets its
   * own copy of the buffers and variables. */
#pragma omp parallel
  {
    float avload[8] = {0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5};
    avload[0] = sqrt(1.0 / 8.0);
    __m256 row0, row1, row2, row3, row4, row5, row6, row7;
    __m256 loaderlow, loaderhigh;
    __m256 temp;
    __m256 minus128 = _mm256_set1_ps(-128.0);
    __m256 avxcosloader, avxcos;
    float avxcosmover;
    __m256i integer;

/* The DCT breaks the image into 8 by 8 blocks and then
   * transforms them into color frequencies. */
#pragma omp for
    for (int brow = 0; brow < height / 8; brow++)
    {
      for (int bcol = 0; bcol < width / 8; bcol++)
      {
        int head_pointer = bcol * 8 + brow * 8 * width;
        row0 = _mm256_setzero_ps();
        row1 = _mm256_setzero_ps();
        row2 = _mm256_setzero_ps();
        row3 = _mm256_setzero_ps();
        row4 = _mm256_setzero_ps();
        row5 = _mm256_setzero_ps();
        row6 = _mm256_setzero_ps();
        row7 = _mm256_setzero_ps();

        /* This pair of loops uses AVX instructions to add the frequency
       * component from each pixel to all of the buckets at once.  This lets
       * us do the DCT on a block in 64 iterations of a loop rather
       * than 64 iterations of 64 iterations of a loop (all 64 pixels affect
       * all 64 frequencies). */
        for (int x = 0; x < 8; x++)
        {
          for (int y = 0; y < 4; y++)
          {
            loaderlow = _mm256_broadcast_ss(&input[head_pointer + x + (y * width)]);
            loaderlow = _mm256_add_ps(loaderlow, minus128);
            loaderhigh = _mm256_broadcast_ss(&input[head_pointer + x + ((7 - y) * width)]);
            loaderhigh = _mm256_add_ps(loaderhigh, minus128);

            avxcos = _mm256_loadu_ps(&acosvals[x][0]);
            loaderlow = _mm256_mul_ps(loaderlow, avxcos);
            loaderhigh = _mm256_mul_ps(loaderhigh, avxcos);

            avxcosloader = _mm256_loadu_ps(&acosvals[y][0]);

            avxcosmover = avxcosloader[0];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row0 = _mm256_add_ps(row0, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row0 = _mm256_add_ps(row0, temp);

            avxcosmover = avxcosloader[1];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row1 = _mm256_add_ps(row1, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row1 = _mm256_sub_ps(row1, temp);

            avxcosmover = avxcosloader[2];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row2 = _mm256_add_ps(row2, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row2 = _mm256_add_ps(row2, temp);

            avxcosmover = avxcosloader[3];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row3 = _mm256_add_ps(row3, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row3 = _mm256_sub_ps(row3, temp);

            avxcosmover = avxcosloader[4];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row4 = _mm256_add_ps(row4, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row4 = _mm256_add_ps(row4, temp);

            avxcosmover = avxcosloader[5];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row5 = _mm256_add_ps(row5, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row5 = _mm256_sub_ps(row5, temp);

            avxcosmover = avxcosloader[6];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row6 = _mm256_add_ps(row6, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row6 = _mm256_add_ps(row6, temp);

            avxcosmover = avxcosloader[7];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row7 = _mm256_add_ps(row7, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row7 = _mm256_sub_ps(row7, temp);
          }
        }

        /* Each frequency stored as a float needs to be divided by
       * the quantization value, then rounded to the nearest integer.
       * Also changes the order of the values from pixel order to
       * each 8 by 8 block stored one after another. */
        temp = _mm256_loadu_ps(&quant[0]);
        row0 = _mm256_div_ps(row0, temp);
        row0 = _mm256_round_ps(row0, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row0);
        _mm256_storeu_si256((__m256i *)(output + (bcol + brow * (width / 8)) * 64), integer);

        temp = _mm256_loadu_ps(&quant[8]);
        row1 = _mm256_div_ps(row1, temp);
        row1 = _mm256_round_ps(row1, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row1);
        _mm256_storeu_si256((__m256i *)(output + 8 + (bcol + brow * (width / 8)) * 64), integer);

        temp = _mm256_loadu_ps(&quant[16]);
        row2 = _mm256_div_ps(row2, temp);
        row2 = _mm256_round_ps(row2, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row2);
        _mm256_storeu_si256((__m256i *)(output + 16 + (bcol + brow * (width / 8)) * 64), integer);

        temp = _mm256_loadu_ps(&quant[24]);
        row3 = _mm256_div_ps(row3, temp);
        row3 = _mm256_round_ps(row3, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row3);
        _mm256_storeu_si256((__m256i *)(output + 24 + (bcol + brow * (width / 8)) * 64), integer);

        temp = _mm256_loadu_ps(&quant[32]);
        row4 = _mm256_div_ps(row4, temp);
        row4 = _mm256_round_ps(row4, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row4);
        _mm256_storeu_si256((__m256i *)(output + 32 + (bcol + brow * (width / 8)) * 64), integer);

        temp = _mm256_loadu_ps(&quant[40]);
        row5 = _mm256_div_ps(row5, temp);
        row5 = _mm256_round_ps(row5, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row5);
        _mm256_storeu_si256((__m256i *)(output + 40 + (bcol + brow * (width / 8)) * 64), integer);

        temp = _mm256_loadu_ps(&quant[48]);
        row6 = _mm256_div_ps(row6, temp);
        row6 = _mm256_round_ps(row6, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row6);
        _mm256_storeu_si256((__m256i *)(output + 48 + (bcol + brow * (width / 8)) * 64), integer);

        temp = _mm256_loadu_ps(&quant[56]);
        row7 = _mm256_div_ps(row7, temp);
        row7 = _mm256_round_ps(row7, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row7);
        _mm256_storeu_si256((__m256i *)(output + 56 + (bcol + brow * (width / 8)) * 64), integer);
      }
    }
  }
}
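A minimal, hedged driver fragment (assuming <stdlib.h>): run_dct expects width and height to be multiples of 8, quant to hold the 64 quantizer values in row-major order, and output to have room for width*height coefficients (one 64-entry block per 8x8 tile, blocks stored consecutively):

  int32_t *out = (int32_t *)malloc((size_t)width * height * sizeof(int32_t));
  run_dct(width, height, quant, input, out);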
Beispiel #29
0
    void run_softmax_int32_float_work_item_latency(nn_workload_item *const work_item)
    {
        nn_workload_data_t *input_view = work_item->input[0]->output;
        const auto &arguments = work_item->arguments.forward_softmax_fixedpoint;

        const auto input_width = input_view->parent->lengths.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p];
        const auto output_width = work_item->output->view_end.t[NN_DATA_COORD_x] - work_item->output->view_begin.t[NN_DATA_COORD_x] + 1;

        const auto num_full_blocks = output_width / C_data_stride;
        const auto partial_block_size = (output_width / C_simd_width) % C_max_acc;
        const auto subsimd_block_size = output_width % C_simd_width;

        const auto output_view_start = work_item->output->view_begin.t[NN_DATA_COORD_x];

        const auto input_view_start = input_view->view_begin.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p];

        const auto out_fraction = arguments.input_fraction;

        float * input_f = (float*)_mm_malloc(input_width * sizeof(float), 64);

        auto input_buffer = &static_cast<int32_t*>(input_view->parent->data_buffer)[input_view_start];

        auto shift = out_fraction;
        if (shift > 0)
        {
            for (uint32_t i = 0; i < input_width; i++)
                input_f[i] = (float)(input_buffer[i]) / (1 << shift);
        }
        else if (shift < 0)
        {
            for (uint32_t i = 0; i < input_width; i++)
                input_f[i] = (float)(input_buffer[i]) * (1 << -shift);
        }
        else
        {
            for (uint32_t i = 0; i < input_width; i++)
                input_f[i] = (float)(input_buffer[i]);
        }

        __m256 acc_sum = _mm256_setzero_ps();
        float subsimd_sum = 0.0f;
        {
            auto input_buffer = input_f;
            auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start];

            for (auto block = 0u; block < num_full_blocks; ++block)
            {
                // Run computation.
                softmax_compute_block<C_max_acc>(input_buffer, output_buffer, acc_sum);
            }

            switch (partial_block_size)
            {
            case  0: break;
            case  1: softmax_compute_block< 1>(input_buffer, output_buffer, acc_sum); break;
            case  2: softmax_compute_block< 2>(input_buffer, output_buffer, acc_sum); break;
            case  3: softmax_compute_block< 3>(input_buffer, output_buffer, acc_sum); break;
            case  4: softmax_compute_block< 4>(input_buffer, output_buffer, acc_sum); break;
            case  5: softmax_compute_block< 5>(input_buffer, output_buffer, acc_sum); break;
            case  6: softmax_compute_block< 6>(input_buffer, output_buffer, acc_sum); break;
            case  7: softmax_compute_block< 7>(input_buffer, output_buffer, acc_sum); break;
            case  8: softmax_compute_block< 8>(input_buffer, output_buffer, acc_sum); break;
            case  9: softmax_compute_block< 9>(input_buffer, output_buffer, acc_sum); break;
            case 10: softmax_compute_block<10>(input_buffer, output_buffer, acc_sum); break;
            case 11: softmax_compute_block<11>(input_buffer, output_buffer, acc_sum); break;
            case 12: softmax_compute_block<12>(input_buffer, output_buffer, acc_sum); break;
            case 13: softmax_compute_block<13>(input_buffer, output_buffer, acc_sum); break;
            case 14: softmax_compute_block<14>(input_buffer, output_buffer, acc_sum); break;
            default: NN_UNREACHABLE_CODE;
            }

            switch (subsimd_block_size)
            {
            case 0: break;
            case 1: softmax_compute_subsimd<1>(input_buffer, output_buffer, subsimd_sum); break;
            case 2: softmax_compute_subsimd<2>(input_buffer, output_buffer, subsimd_sum); break;
            case 3: softmax_compute_subsimd<3>(input_buffer, output_buffer, subsimd_sum); break;
            case 4: softmax_compute_subsimd<4>(input_buffer, output_buffer, subsimd_sum); break;
            case 5: softmax_compute_subsimd<5>(input_buffer, output_buffer, subsimd_sum); break;
            case 6: softmax_compute_subsimd<6>(input_buffer, output_buffer, subsimd_sum); break;
            case 7: softmax_compute_subsimd<7>(input_buffer, output_buffer, subsimd_sum); break;
            default: NN_UNREACHABLE_CODE;
            }
        }

        {
            __m256 intermediate_sum = _mm256_hadd_ps(acc_sum, acc_sum);
            intermediate_sum = _mm256_permutevar8x32_ps(intermediate_sum, _mm256_set_epi32(0, 1, 4, 5, 2, 3, 6, 7));
            intermediate_sum = _mm256_hadd_ps(intermediate_sum, intermediate_sum);
            intermediate_sum = _mm256_hadd_ps(intermediate_sum, intermediate_sum);

            acc_sum = _mm256_add_ps(intermediate_sum, _mm256_set1_ps(subsimd_sum));
            subsimd_sum = _mm_cvtss_f32(_mm256_extractf128_ps(acc_sum, 0));

            acc_sum = _mm256_div_ps(_mm256_set1_ps(1.0f), acc_sum);
            subsimd_sum = 1.0f / subsimd_sum;
        }

        {
            auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start];

            for (auto block = 0u; block < num_full_blocks; ++block)
            {
                // Run computation.
                softmax_finalize_block<C_max_acc>(output_buffer, acc_sum);
            }

            switch (partial_block_size)
            {
            case  0: break;
            case  1: softmax_finalize_block< 1>(output_buffer, acc_sum); break;
            case  2: softmax_finalize_block< 2>(output_buffer, acc_sum); break;
            case  3: softmax_finalize_block< 3>(output_buffer, acc_sum); break;
            case  4: softmax_finalize_block< 4>(output_buffer, acc_sum); break;
            case  5: softmax_finalize_block< 5>(output_buffer, acc_sum); break;
            case  6: softmax_finalize_block< 6>(output_buffer, acc_sum); break;
            case  7: softmax_finalize_block< 7>(output_buffer, acc_sum); break;
            case  8: softmax_finalize_block< 8>(output_buffer, acc_sum); break;
            case  9: softmax_finalize_block< 9>(output_buffer, acc_sum); break;
            case 10: softmax_finalize_block<10>(output_buffer, acc_sum); break;
            case 11: softmax_finalize_block<11>(output_buffer, acc_sum); break;
            case 12: softmax_finalize_block<12>(output_buffer, acc_sum); break;
            case 13: softmax_finalize_block<13>(output_buffer, acc_sum); break;
            case 14: softmax_finalize_block<14>(output_buffer, acc_sum); break;
            default: NN_UNREACHABLE_CODE;
            }

            switch (subsimd_block_size)
            {
            case 0: break;
            case 1: softmax_finalize_subsimd<1>(output_buffer, subsimd_sum); break;
            case 2: softmax_finalize_subsimd<2>(output_buffer, subsimd_sum); break;
            case 3: softmax_finalize_subsimd<3>(output_buffer, subsimd_sum); break;
            case 4: softmax_finalize_subsimd<4>(output_buffer, subsimd_sum); break;
            case 5: softmax_finalize_subsimd<5>(output_buffer, subsimd_sum); break;
            case 6: softmax_finalize_subsimd<6>(output_buffer, subsimd_sum); break;
            case 7: softmax_finalize_subsimd<7>(output_buffer, subsimd_sum); break;
            default: NN_UNREACHABLE_CODE;
            }
        }
        _mm_free(input_f);
    }
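
A hedged scalar sketch of what the blocked helpers above implement (softmax over the dequantized buffer), ignoring views and strides and assuming <cmath>; softmax_ref is illustrative, not part of this code base:

    static void softmax_ref(const float *in, float *out, size_t n)
    {
        float sum = 0.0f;
        for (size_t i = 0; i < n; ++i) { out[i] = std::exp(in[i]); sum += out[i]; }
        const float inv = 1.0f / sum;
        for (size_t i = 0; i < n; ++i) out[i] *= inv;
    }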
Beispiel #30
0
	// PRE: all vectors aligned,
	//		imag_c = [i1,i1,...,i4,i4]
	//		vec = [v1r,v1i,...,v4r,v4i]
	//		component-wise multiplication
	// POST: returns [-i1*v1i,i1*v1r,...,-i4*v4i,i4*v4r]
	inline __m256 avx_multiply_float_imag_(const __m256& imag_c, const __m256& vec) {
		static const __m256 zero = _mm256_setzero_ps();
		__m256 vec1 = _mm256_mul_ps(imag_c,vec);
		vec1 = _mm256_permute_ps(vec1,0xB1);
		return _mm256_addsub_ps(zero,vec1);
	}
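
A hedged usage sketch: combined with a component-wise multiply by the broadcast real parts, this helper completes an interleaved complex multiply (avx_complex_mul_sketch is illustrative, not part of this code base):

	// real_c = [r1,r1,...,r4,r4], imag_c = [i1,i1,...,i4,i4], vec = [v1r,v1i,...,v4r,v4i]
	// returns [(r*vr - i*vi), (r*vi + i*vr)] per complex lane
	inline __m256 avx_complex_mul_sketch(const __m256& real_c, const __m256& imag_c, const __m256& vec) {
		return _mm256_add_ps(_mm256_mul_ps(real_c, vec),
		                     avx_multiply_float_imag_(imag_c, vec));
	}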