Example no. 1
inline float DatabaseBuilder::Distance(PackedSample* x, PackedSample* y)
{ 
#ifdef AVX
	//Black magic
	//But it does produce the same results as the not AVX code
	__m256 accumulator;
	__m256 x_s = _mm256_load_ps(x->Features);
	__m256 y_s = _mm256_load_ps(y->Features);
	__m256 result = _mm256_sub_ps(x_s, y_s);
	accumulator = _mm256_mul_ps(result, result);

	x_s = _mm256_load_ps(&x->Features[8]);
	y_s = _mm256_load_ps(&y->Features[8]);
	result = _mm256_sub_ps(x_s, y_s);
	result = _mm256_mul_ps(result, result);
	accumulator = _mm256_add_ps(accumulator, result);

	x_s = _mm256_load_ps(&x->Features[16]);
	y_s = _mm256_load_ps(&y->Features[16]);
	result = _mm256_sub_ps(x_s, y_s);
	result = _mm256_mul_ps(result, result);
	accumulator = _mm256_add_ps(accumulator, result);

	x_s = _mm256_load_ps(&x->Features[24]);
	y_s = _mm256_load_ps(&y->Features[24]);
	result = _mm256_sub_ps(x_s, y_s);
	result = _mm256_mul_ps(result, result);
	accumulator = _mm256_add_ps(accumulator, result);
	//We now have a vector of 8 floats

	__m256 t1 = _mm256_hadd_ps(accumulator, accumulator);
	__m256 t2 = _mm256_hadd_ps(t1, t1);
	__m128 t3 = _mm256_extractf128_ps(t2, 1);
	__m128 t4 = _mm_add_ss(_mm256_castps256_ps128(t2), t3);
	//And now we don't
	return std::sqrtf(_mm_cvtss_f32(t4));
#endif
#ifndef AVX
	//Can be autovectorized
	float accumulator[32];
	float distance = 0;
	for (int i = 0; i < 30; i++)
	{
		accumulator[i] = x->Features[i] - y->Features[i];
	}

	//If done properly this should be 4(8) instructions
	for (int i = 0; i < 30; i++)
	{
		distance += accumulator[i] * accumulator[i];
	}

	return std::sqrtf(distance);
#endif

	
}
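The four-step hadd/extract/add sequence at the end of the AVX branch is the usual AVX horizontal-sum idiom. A minimal standalone sketch of the same reduction (hypothetical helper name, not part of the original sample):

#include <immintrin.h>

// Horizontal sum of all 8 floats in an __m256, same idiom as above.
static inline float hsum256_ps(__m256 v)
{
	__m256 t1 = _mm256_hadd_ps(v, v);          // pairwise sums within each 128-bit lane
	__m256 t2 = _mm256_hadd_ps(t1, t1);        // each lane now holds its own 4-element sum
	__m128 hi = _mm256_extractf128_ps(t2, 1);  // upper-lane sum in element 0
	__m128 lo = _mm256_castps256_ps128(t2);    // lower-lane sum in element 0
	return _mm_cvtss_f32(_mm_add_ss(lo, hi));  // combine the two lanes
}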
Example no. 2
irreg_poly_area_func_sign(float, _avx) {
    if (__builtin_expect(is_null(cords) || cords_len == 0, 0))
        return 0;

    __m256
        values_0_3,
        values_4_7,
        values_8_11,
        values_12_15,
        values_16_19 = _mm256_load_ps((const float *)&cords[0][0]),
        accum_sum = _mm256_setzero_ps();
    float accum_sum_aux;

    #define _float_cords_dot_prod(curr, next, index)                    \
        _mm256_dp_ps(                                                   \
            curr,                                                       \
            _mm256_xor_ps(                                              \
                _mm256_shuffle_ps(curr, _mm256_permute2f128_ps(curr, next, 0b00100001), 0b00011011),\
                _mm256_setr_ps(0, -0.0f, 0, -0.0f, 0, -0.0f, 0, -0.0f)  \
            ),                                                          \
            0b11110000 | (1 << (index))                                 \
        )


    unsigned long index;
    for (index = 0; index < (cords_len - 16); index += 16) {
        values_0_3   = values_16_19;
        values_4_7   = _mm256_load_ps((const float *)&cords[index + 4]);
        values_8_11  = _mm256_load_ps((const float *)&cords[index + 8]);
        values_12_15 = _mm256_load_ps((const float *)&cords[index + 12]);
        values_16_19 = _mm256_load_ps((const float *)&cords[index + 16]);

        accum_sum = _mm256_add_ps(
            accum_sum,
            _mm256_add_ps(
                _mm256_add_ps(
                    _float_cords_dot_prod(values_0_3, values_4_7, 0),
                    _float_cords_dot_prod(values_4_7, values_8_11, 1)
                ),
                _mm256_add_ps(
                    _float_cords_dot_prod(values_8_11, values_12_15, 2),
                    _float_cords_dot_prod(values_12_15, values_16_19, 3)
                )
            )
        );
    }

    accum_sum = _mm256_hadd_ps(accum_sum, _mm256_permute2f128_ps(accum_sum, accum_sum, 1)); // a0+a1, a2+a3, a4+a5, a6+a7, a4+a5, a6+a7, a0+a1, a2+a3
    accum_sum = _mm256_hadd_ps(accum_sum, accum_sum); // a0+a1+a2+a3, a4+a5+a6+a7, ...
    accum_sum = _mm256_hadd_ps(accum_sum, accum_sum); // a0+a1+a2+a3+a4+a5+a6+a7, ...
    for (accum_sum_aux = _mm_cvtss_f32(_mm256_castps256_ps128(accum_sum)); index < (cords_len - 1); index++)
        accum_sum_aux += _calc_diff_of_adj_prods(cords, index);

    return accum_sum_aux;
//    return scalar_half(scalar_abs(accum_sum_aux));
}
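Each _float_cords_dot_prod(curr, next, index) term above packs two cross products of the shoelace formula into one dot-product lane. A scalar reference of what is being accumulated (a sketch, assuming cords is an array of {x, y} float pairs as the 8-float loads suggest, and that _calc_diff_of_adj_prods computes one such cross product for the tail):

// Scalar reference: accumulate x[i]*y[i+1] - x[i+1]*y[i] over consecutive points.
static float irreg_poly_area_scalar_ref(const float (*cords)[2], unsigned long cords_len)
{
    float sum = 0.0f;
    for (unsigned long i = 0; i + 1 < cords_len; i++)
        sum += cords[i][0] * cords[i + 1][1] - cords[i + 1][0] * cords[i][1];
    return sum; // the area would be half the absolute value, as in the commented-out return above
}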
Example no. 3
static void process_sinc(rarch_sinc_resampler_t *resamp, float *out_buffer)
{
    unsigned i;
    __m256 sum_l             = _mm256_setzero_ps();
    __m256 sum_r             = _mm256_setzero_ps();

    const float *buffer_l    = resamp->buffer_l + resamp->ptr;
    const float *buffer_r    = resamp->buffer_r + resamp->ptr;

    unsigned taps            = resamp->taps;
    unsigned phase           = resamp->time >> SUBPHASE_BITS;
#if SINC_COEFF_LERP
    const float *phase_table = resamp->phase_table + phase * taps * 2;
    const float *delta_table = phase_table + taps;
    __m256 delta             = _mm256_set1_ps((float)
                               (resamp->time & SUBPHASE_MASK) * SUBPHASE_MOD);
#else
    const float *phase_table = resamp->phase_table + phase * taps;
#endif

    for (i = 0; i < taps; i += 8)
    {
        __m256 buf_l  = _mm256_loadu_ps(buffer_l + i);
        __m256 buf_r  = _mm256_loadu_ps(buffer_r + i);

#if SINC_COEFF_LERP
        __m256 deltas = _mm256_load_ps(delta_table + i);
        __m256 sinc   = _mm256_add_ps(_mm256_load_ps(phase_table + i),
                                      _mm256_mul_ps(deltas, delta));
#else
        __m256 sinc   = _mm256_load_ps(phase_table + i);
#endif
        sum_l         = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc));
        sum_r         = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc));
    }

    /* hadd on AVX is weird, and acts on low-lanes
     * and high-lanes separately. */
    __m256 res_l = _mm256_hadd_ps(sum_l, sum_l);
    __m256 res_r = _mm256_hadd_ps(sum_r, sum_r);
    res_l        = _mm256_hadd_ps(res_l, res_l);
    res_r        = _mm256_hadd_ps(res_r, res_r);
    res_l        = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l);
    res_r        = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r);

    /* This is optimized to mov %xmmN, [mem].
     * There doesn't seem to be any _mm256_store_ss intrinsic. */
    _mm_store_ss(out_buffer + 0, _mm256_extractf128_ps(res_l, 0));
    _mm_store_ss(out_buffer + 1, _mm256_extractf128_ps(res_r, 0));
}
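An aside on the final stores (a sketch, not from the original source): _mm256_extractf128_ps(x, 0) just yields the low 128-bit lane, so the same tail can be written with the no-op cast _mm256_castps256_ps128 and should still compile to a single scalar store:

    _mm_store_ss(out_buffer + 0, _mm256_castps256_ps128(res_l));
    _mm_store_ss(out_buffer + 1, _mm256_castps256_ps128(res_r));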
Example no. 4
void warmup(float *x, float *y, int size, float alpha)
{
    int i;

    __m256 m = _mm256_set_ps(1.0/alpha, 2.0, 1.0/alpha, 2.0, 1.0/alpha, 2.0, 1.0/alpha, 2.0);
    #pragma ivdep
    #pragma vector aligned
    for (i=0; i<size; i+=4)
    {
        __m256 t = _mm256_load_ps(x+2*i);
        __m256 l = _mm256_mul_ps(t, m); // premultiply
        __m256 r = _mm256_permute2f128_ps( l , l , 1); // swap lower and higher 128 bits
        __m256 res = _mm256_hadd_ps(l, r);
        __m128 s = _mm256_extractf128_ps (res, 0);
        _mm_store_ps(y+i,s); // store it
    }
}
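For reference, the premultiply plus hadd/permute/extract sequence in warmup reduces each adjacent pair of inputs, so the vector loop is equivalent to this scalar form (a sketch, assuming size counts output elements and x holds at least 2*size floats):

void warmup_scalar_ref(float *x, float *y, int size, float alpha)
{
    for (int j = 0; j < size; j++)
        y[j] = 2.0f * x[2*j] + x[2*j + 1] / alpha;
}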
Example no. 5
    void run_softmax_int32_float_work_item_latency(nn_workload_item *const work_item)
    {
        nn_workload_data_t *input_view = work_item->input[0]->output;
        const auto &arguments = work_item->arguments.forward_softmax_fixedpoint;

        const auto input_width = input_view->parent->lengths.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p];
        const auto output_width = work_item->output->view_end.t[NN_DATA_COORD_x] - work_item->output->view_begin.t[NN_DATA_COORD_x] + 1;

        const auto num_full_blocks = output_width / C_data_stride;
        const auto partial_block_size = (output_width / C_simd_width) % C_max_acc;
        const auto subsimd_block_size = output_width % C_simd_width;

        const auto output_view_start = work_item->output->view_begin.t[NN_DATA_COORD_x];

        const auto input_view_start = input_view->view_begin.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p];

        const auto out_fraction = arguments.input_fraction;

        float * input_f = (float*)_mm_malloc(input_width * sizeof(float), 64);

        auto input_buffer = &static_cast<int32_t*>(input_view->parent->data_buffer)[input_view_start];

        auto shift = out_fraction;
        if (shift > 0)
        {
            for (uint32_t i = 0; i < input_width; i++)
                input_f[i] = (float)(input_buffer[i]) / (1 << shift);
        }
        else if (shift < 0)
        {
            for (uint32_t i = 0; i < input_width; i++)
                input_f[i] = (float)(input_buffer[i]) * (1 << -shift);
        }
        else
        {
            for (uint32_t i = 0; i < input_width; i++)
                input_f[i] = (float)(input_buffer[i]);
        }

        __m256 acc_sum = _mm256_setzero_ps();
        float subsimd_sum = 0.0f;
        {
            auto input_buffer = input_f;
            auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start];

            for (auto block = 0u; block < num_full_blocks; ++block)
            {
                // Run computation.
                softmax_compute_block<C_max_acc>(input_buffer, output_buffer, acc_sum);
            }

            switch (partial_block_size)
            {
            case  0: break;
            case  1: softmax_compute_block< 1>(input_buffer, output_buffer, acc_sum); break;
            case  2: softmax_compute_block< 2>(input_buffer, output_buffer, acc_sum); break;
            case  3: softmax_compute_block< 3>(input_buffer, output_buffer, acc_sum); break;
            case  4: softmax_compute_block< 4>(input_buffer, output_buffer, acc_sum); break;
            case  5: softmax_compute_block< 5>(input_buffer, output_buffer, acc_sum); break;
            case  6: softmax_compute_block< 6>(input_buffer, output_buffer, acc_sum); break;
            case  7: softmax_compute_block< 7>(input_buffer, output_buffer, acc_sum); break;
            case  8: softmax_compute_block< 8>(input_buffer, output_buffer, acc_sum); break;
            case  9: softmax_compute_block< 9>(input_buffer, output_buffer, acc_sum); break;
            case 10: softmax_compute_block<10>(input_buffer, output_buffer, acc_sum); break;
            case 11: softmax_compute_block<11>(input_buffer, output_buffer, acc_sum); break;
            case 12: softmax_compute_block<12>(input_buffer, output_buffer, acc_sum); break;
            case 13: softmax_compute_block<13>(input_buffer, output_buffer, acc_sum); break;
            case 14: softmax_compute_block<14>(input_buffer, output_buffer, acc_sum); break;
            default: NN_UNREACHABLE_CODE;
            }

            switch (subsimd_block_size)
            {
            case 0: break;
            case 1: softmax_compute_subsimd<1>(input_buffer, output_buffer, subsimd_sum); break;
            case 2: softmax_compute_subsimd<2>(input_buffer, output_buffer, subsimd_sum); break;
            case 3: softmax_compute_subsimd<3>(input_buffer, output_buffer, subsimd_sum); break;
            case 4: softmax_compute_subsimd<4>(input_buffer, output_buffer, subsimd_sum); break;
            case 5: softmax_compute_subsimd<5>(input_buffer, output_buffer, subsimd_sum); break;
            case 6: softmax_compute_subsimd<6>(input_buffer, output_buffer, subsimd_sum); break;
            case 7: softmax_compute_subsimd<7>(input_buffer, output_buffer, subsimd_sum); break;
            default: NN_UNREACHABLE_CODE;
            }
        }

        {
            __m256 intermediate_sum = _mm256_hadd_ps(acc_sum, acc_sum);
            intermediate_sum = _mm256_permutevar8x32_ps(intermediate_sum, _mm256_set_epi32(0, 1, 4, 5, 2, 3, 6, 7));
            intermediate_sum = _mm256_hadd_ps(intermediate_sum, intermediate_sum);
            intermediate_sum = _mm256_hadd_ps(intermediate_sum, intermediate_sum);

            acc_sum = _mm256_add_ps(intermediate_sum, _mm256_set1_ps(subsimd_sum));
            subsimd_sum = _mm_cvtss_f32(_mm256_extractf128_ps(acc_sum, 0));

            acc_sum = _mm256_div_ps(_mm256_set1_ps(1.0f), acc_sum);
            subsimd_sum = 1.0f / subsimd_sum;
        }

        {
            auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start];

            for (auto block = 0u; block < num_full_blocks; ++block)
            {
                // Run computation.
                softmax_finalize_block<C_max_acc>(output_buffer, acc_sum);
            }

            switch (partial_block_size)
            {
            case  0: break;
            case  1: softmax_finalize_block< 1>(output_buffer, acc_sum); break;
            case  2: softmax_finalize_block< 2>(output_buffer, acc_sum); break;
            case  3: softmax_finalize_block< 3>(output_buffer, acc_sum); break;
            case  4: softmax_finalize_block< 4>(output_buffer, acc_sum); break;
            case  5: softmax_finalize_block< 5>(output_buffer, acc_sum); break;
            case  6: softmax_finalize_block< 6>(output_buffer, acc_sum); break;
            case  7: softmax_finalize_block< 7>(output_buffer, acc_sum); break;
            case  8: softmax_finalize_block< 8>(output_buffer, acc_sum); break;
            case  9: softmax_finalize_block< 9>(output_buffer, acc_sum); break;
            case 10: softmax_finalize_block<10>(output_buffer, acc_sum); break;
            case 11: softmax_finalize_block<11>(output_buffer, acc_sum); break;
            case 12: softmax_finalize_block<12>(output_buffer, acc_sum); break;
            case 13: softmax_finalize_block<13>(output_buffer, acc_sum); break;
            case 14: softmax_finalize_block<14>(output_buffer, acc_sum); break;
            default: NN_UNREACHABLE_CODE;
            }

            switch (subsimd_block_size)
            {
            case 0: break;
            case 1: softmax_finalize_subsimd<1>(output_buffer, subsimd_sum); break;
            case 2: softmax_finalize_subsimd<2>(output_buffer, subsimd_sum); break;
            case 3: softmax_finalize_subsimd<3>(output_buffer, subsimd_sum); break;
            case 4: softmax_finalize_subsimd<4>(output_buffer, subsimd_sum); break;
            case 5: softmax_finalize_subsimd<5>(output_buffer, subsimd_sum); break;
            case 6: softmax_finalize_subsimd<6>(output_buffer, subsimd_sum); break;
            case 7: softmax_finalize_subsimd<7>(output_buffer, subsimd_sum); break;
            default: NN_UNREACHABLE_CODE;
            }
        }
        _mm_free(input_f);
    }
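Note that _mm256_permutevar8x32_ps in the reduction block above is an AVX2 cross-lane permute. A sketch of an equivalent final reduction using only AVX instructions, matching the pattern the other examples in this collection use:

    __m256 t = _mm256_hadd_ps(acc_sum, acc_sum);            // pairwise sums within each lane
    t = _mm256_add_ps(t, _mm256_permute2f128_ps(t, t, 1));  // add the swapped 128-bit lanes
    t = _mm256_hadd_ps(t, t);                                // every element now holds the total sum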
Example no. 6
void kernel_strmv_u_t_8_lib8(int kmax, float *A, int sda, float *x, float *y, int alg)
	{

/*	if(kmax<=0) */
/*		return;*/
	
	const int lda = 8;
/*	const int bs  = 8;*/
	
	__builtin_prefetch( A + 0*lda );
	__builtin_prefetch( A + 2*lda );
	__builtin_prefetch( A + 4*lda );
	__builtin_prefetch( A + 6*lda );

	int
		k;
	
	__m256
		zeros,
		ax_temp,
		a_00, a_01, a_02, a_03,
		x_0,
		y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7;
	
	zeros = _mm256_setzero_ps();

	y_0 = _mm256_setzero_ps();
	y_1 = _mm256_setzero_ps();
	y_2 = _mm256_setzero_ps();
	y_3 = _mm256_setzero_ps();
	y_4 = _mm256_setzero_ps();
	y_5 = _mm256_setzero_ps();
	y_6 = _mm256_setzero_ps();
	y_7 = _mm256_setzero_ps();

	k=0;
	for(; k<kmax-7; k+=8)
		{
		
		x_0 = _mm256_loadu_ps( &x[0] );

		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );
		a_01 = _mm256_load_ps( &A[0+lda*1] );
		ax_temp = _mm256_mul_ps( a_01, x_0 );
		y_1 = _mm256_add_ps( y_1, ax_temp );
		a_02 = _mm256_load_ps( &A[0+lda*2] );
		ax_temp = _mm256_mul_ps( a_02, x_0 );
		y_2 = _mm256_add_ps( y_2, ax_temp );
		a_03 = _mm256_load_ps( &A[0+lda*3] );
		ax_temp = _mm256_mul_ps( a_03, x_0 );
		y_3 = _mm256_add_ps( y_3, ax_temp );
	
		__builtin_prefetch( A + sda*lda + 4*lda );
		__builtin_prefetch( A + sda*lda + 6*lda );

		a_00 = _mm256_load_ps( &A[0+lda*4] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_4 = _mm256_add_ps( y_4, ax_temp );
		a_01 = _mm256_load_ps( &A[0+lda*5] );
		ax_temp = _mm256_mul_ps( a_01, x_0 );
		y_5 = _mm256_add_ps( y_5, ax_temp );
		a_02 = _mm256_load_ps( &A[0+lda*6] );
		ax_temp = _mm256_mul_ps( a_02, x_0 );
		y_6 = _mm256_add_ps( y_6, ax_temp );
		a_03 = _mm256_load_ps( &A[0+lda*7] );
		ax_temp = _mm256_mul_ps( a_03, x_0 );
		y_7 = _mm256_add_ps( y_7, ax_temp );

		A += sda*lda;
		x += lda;

		}

	x_0 = _mm256_loadu_ps( &x[0] );

	a_00 = _mm256_load_ps( &A[0+lda*0] );
	a_00 = _mm256_blend_ps( zeros, a_00, 0x01 );
	ax_temp = _mm256_mul_ps( a_00, x_0 );
	y_0 = _mm256_add_ps( y_0, ax_temp );
	a_01 = _mm256_load_ps( &A[0+lda*1] );
	a_01 = _mm256_blend_ps( zeros, a_01, 0x03 );
	ax_temp = _mm256_mul_ps( a_01, x_0 );
	y_1 = _mm256_add_ps( y_1, ax_temp );
	a_02 = _mm256_load_ps( &A[0+lda*2] );
	a_02 = _mm256_blend_ps( zeros, a_02, 0x07 );
	ax_temp = _mm256_mul_ps( a_02, x_0 );
	y_2 = _mm256_add_ps( y_2, ax_temp );
	a_03 = _mm256_load_ps( &A[0+lda*3] );
	a_03 = _mm256_blend_ps( zeros, a_03, 0x0f );
	ax_temp = _mm256_mul_ps( a_03, x_0 );
	y_3 = _mm256_add_ps( y_3, ax_temp );

	a_00 = _mm256_load_ps( &A[0+lda*4] );
	a_00 = _mm256_blend_ps( zeros, a_00, 0x1f );
	ax_temp = _mm256_mul_ps( a_00, x_0 );
	y_4 = _mm256_add_ps( y_4, ax_temp );
	a_01 = _mm256_load_ps( &A[0+lda*5] );
	a_01 = _mm256_blend_ps( zeros, a_01, 0x3f );
	ax_temp = _mm256_mul_ps( a_01, x_0 );
	y_5 = _mm256_add_ps( y_5, ax_temp );
	a_02 = _mm256_load_ps( &A[0+lda*6] );
	a_02 = _mm256_blend_ps( zeros, a_02, 0x7f );
	ax_temp = _mm256_mul_ps( a_02, x_0 );
	y_6 = _mm256_add_ps( y_6, ax_temp );
	a_03 = _mm256_load_ps( &A[0+lda*7] );
/*	a_03 = _mm256_blend_ps( zeros, a_03, 0xff );*/
	ax_temp = _mm256_mul_ps( a_03, x_0 );
	y_7 = _mm256_add_ps( y_7, ax_temp );

	// reduction
	__m256
		z_0;

	y_0 = _mm256_hadd_ps(y_0, y_1);
	y_2 = _mm256_hadd_ps(y_2, y_3);
	y_4 = _mm256_hadd_ps(y_4, y_5);
	y_6 = _mm256_hadd_ps(y_6, y_7);

	y_0 = _mm256_hadd_ps(y_0, y_2);
	y_4 = _mm256_hadd_ps(y_4, y_6);

	y_1 = _mm256_permute2f128_ps(y_0, y_4, 0x20);
	y_2 = _mm256_permute2f128_ps(y_0, y_4, 0x31);
	
	y_0 = _mm256_add_ps(y_1, y_2);

	// store
	if(alg==0)
		{
		_mm256_storeu_ps(&y[0], y_0);
		}
	else if(alg==1)
		{
		z_0 = _mm256_loadu_ps( &y[0] );

		z_0 = _mm256_add_ps(z_0, y_0);

		_mm256_storeu_ps(&y[0], z_0);
		}
	else // alg==-1
		{
		z_0 = _mm256_loadu_ps( &y[0] );

		z_0 = _mm256_sub_ps(z_0, y_0);

		_mm256_storeu_ps(&y[0], z_0);
		}

	}
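The reduction block above turns eight 8-wide accumulators into a single vector of eight dot products. The same transpose-style reduction as a standalone sketch (hypothetical helper name, not part of the original kernel):

#include <immintrin.h>

// Given eight __m256 accumulators, return one __m256 whose i-th element
// is the full horizontal sum of accumulator i.
static inline __m256 hsum8x8_ps(__m256 v0, __m256 v1, __m256 v2, __m256 v3,
                                __m256 v4, __m256 v5, __m256 v6, __m256 v7)
{
	v0 = _mm256_hadd_ps(v0, v1);
	v2 = _mm256_hadd_ps(v2, v3);
	v4 = _mm256_hadd_ps(v4, v5);
	v6 = _mm256_hadd_ps(v6, v7);
	v0 = _mm256_hadd_ps(v0, v2);                        // per-lane sums of v0..v3
	v4 = _mm256_hadd_ps(v4, v6);                        // per-lane sums of v4..v7
	__m256 lo = _mm256_permute2f128_ps(v0, v4, 0x20);   // low lanes of both
	__m256 hi = _mm256_permute2f128_ps(v0, v4, 0x31);   // high lanes of both
	return _mm256_add_ps(lo, hi);                       // element i = full sum of vi
}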
Example no. 7
void sEnv_process(HvBase *_c, SignalEnvelope *o, hv_bInf_t bIn,
		void (*sendMessage)(HvBase *, int, const HvMessage *)) {
#if HV_SIMD_AVX
  _mm256_stream_ps(o->buffer+o->numSamplesInBuffer, _mm256_mul_ps(bIn,bIn)); // store bIn^2, no need to cache block
  o->numSamplesInBuffer += HV_N_SIMD;

  if (o->numSamplesInBuffer >= o->windowSize) {
    int n4 = o->windowSize & ~HV_N_SIMD_MASK;
    __m256 sum = _mm256_setzero_ps();
    while (n4) {
      __m256 x = _mm256_load_ps(o->buffer + n4 - HV_N_SIMD);
      __m256 h = _mm256_load_ps(o->hanningWeights + n4 - HV_N_SIMD);
      x = _mm256_mul_ps(x, h);
      sum = _mm256_add_ps(sum, x);
      n4 -= HV_N_SIMD;
    }
    sum = _mm256_hadd_ps(sum,sum); // horizontal sum
    sum = _mm256_hadd_ps(sum,sum);
    sEnv_sendMessage(_c, o, sum[0]+sum[4], sendMessage); // updates numSamplesInBuffer
  }
#elif HV_SIMD_SSE
  _mm_stream_ps(o->buffer+o->numSamplesInBuffer, _mm_mul_ps(bIn,bIn)); // store bIn^2, no need to cache block
  o->numSamplesInBuffer += HV_N_SIMD;

  if (o->numSamplesInBuffer >= o->windowSize) {
    int n4 = o->windowSize & ~HV_N_SIMD_MASK;
    __m128 sum = _mm_setzero_ps();
    while (n4) {
      __m128 x = _mm_load_ps(o->buffer + n4 - HV_N_SIMD);
      __m128 h = _mm_load_ps(o->hanningWeights + n4 - HV_N_SIMD);
      x = _mm_mul_ps(x, h);
      sum = _mm_add_ps(sum, x);
      n4 -= HV_N_SIMD;
    }
    sum = _mm_hadd_ps(sum,sum); // horizontal sum
    sum = _mm_hadd_ps(sum,sum);
    sEnv_sendMessage(_c, o, sum[0], sendMessage);
  }
#elif HV_SIMD_NEON
  vst1q_f32(o->buffer+o->numSamplesInBuffer, vmulq_f32(bIn,bIn)); // store bIn^2, no need to cache block
  o->numSamplesInBuffer += HV_N_SIMD;

  if (o->numSamplesInBuffer >= o->windowSize) {
    int n4 = o->windowSize & ~HV_N_SIMD_MASK;
    float32x4_t sum = vdupq_n_f32(0.0f);
    while (n4) {
      float32x4_t x = vld1q_f32(o->buffer + n4 - HV_N_SIMD);
      float32x4_t h = vld1q_f32(o->hanningWeights + n4 - HV_N_SIMD);
      x = vmulq_f32(x, h);
      sum = vaddq_f32(sum, x);
      n4 -= HV_N_SIMD;
    }
    sEnv_sendMessage(_c, o, sum[0]+sum[1]+sum[2]+sum[3], sendMessage);
  }
#else // HV_SIMD_NONE
  o->buffer[o->numSamplesInBuffer] = (bIn*bIn);
  o->numSamplesInBuffer += HV_N_SIMD;

  if (o->numSamplesInBuffer >= o->windowSize) {
    float sum = 0.0f;
    for (int i = 0; i < o->windowSize; ++i) {
      sum += (o->hanningWeights[i] * o->buffer[i]);
    }
    sEnv_sendMessage(_c, o, sum, sendMessage);
  }
#endif
}
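One caveat on the AVX branch (an aside, not from the original source): sum[0] + sum[4] subscripts a __m256 directly, which is a GCC/Clang vector extension. After the two hadds, element 0 holds the lower-lane total and element 4 the upper-lane total, so an intrinsic-only equivalent would be:

    __m128 lo = _mm256_castps256_ps128(sum);      // lower-lane sum in element 0
    __m128 hi = _mm256_extractf128_ps(sum, 1);    // upper-lane sum in element 0
    float total = _mm_cvtss_f32(_mm_add_ss(lo, hi));  // == sum[0] + sum[4]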
Example no. 8
void animate()
{
	float mx;
	float my;
	if(ManualControl)
	{
		POINT pos;
		GetCursorPos(&pos);
		RECT rc;
		GetClientRect(hMainWnd, &rc);
		ScreenToClient(hMainWnd, &pos);

		mx = pos.x;
		my = pos.y;
	}
	else
	{
		UpdatePosition(mx, my);
	}


	const auto size = partCount;

	VertexData *pVertexBuffer;
	pVertexObject->Lock(0, 0, (void**)&pVertexBuffer, D3DLOCK_DISCARD);

	_mm256_zeroall();

#pragma omp parallel \
	shared(pVertexBuffer, particlesCoord, particlesVel, mx, my, size)
	{
#pragma omp for nowait
		for(int i = 0; i < size; i += 4)
		{
			float mouseCoordVec[8] = { mx, my, mx, my, mx, my, mx, my };

			float *particleCoordsVec = (float*)particlesCoord + i;
			float *velocityVec = (float*)particlesVel + i;

			auto xyCoord = _mm256_loadu_ps(particleCoordsVec);
			auto hwTempData = _mm256_sub_ps(xyCoord, _mm256_loadu_ps(mouseCoordVec));

			auto squares = _mm256_mul_ps(hwTempData, hwTempData);
			auto distSquare = _mm256_hadd_ps(squares, squares);
			distSquare = _mm256_shuffle_ps(distSquare, distSquare, 0x50);
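			// After the hadd and shuffle(0x50), distSquare holds each particle's squared
			// distance duplicated per (x, y) pair: [d0, d0, d1, d1 | d2, d2, d3, d3],
			// which is why the range checks below read elements 0, 2, 4 and 6.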

			auto theForce = _mm256_div_ps(_mm256_set1_ps(G), distSquare);

			if(distSquare.m256_f32[0] < 400)
			{
				theForce.m256_f32[0] = 0;
				theForce.m256_f32[1] = 0;
			}

			if(distSquare.m256_f32[2] < 400)
			{
				theForce.m256_f32[2] = 0;
				theForce.m256_f32[3] = 0;
			}
			if(distSquare.m256_f32[4] < 400)
			{
				theForce.m256_f32[4] = 0;
				theForce.m256_f32[5] = 0;
			}

			if(distSquare.m256_f32[6] < 400)
			{
				theForce.m256_f32[6] = 0;
				theForce.m256_f32[7] = 0;
			}

			auto xyForces = _mm256_mul_ps(_mm256_xor_ps(hwTempData, _mm256_set1_ps(-0.f)), theForce);

			auto xyVelocities = _mm256_loadu_ps(velocityVec);
			xyVelocities = _mm256_mul_ps(xyVelocities, _mm256_set1_ps(Resistance));
			xyVelocities = _mm256_add_ps(xyVelocities, xyForces);

			xyCoord = _mm256_add_ps(xyCoord, xyVelocities);

			_mm256_storeu_ps(velocityVec, xyVelocities);
			_mm256_storeu_ps(particleCoordsVec, xyCoord);


			processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[0], ((ParticleVel*)velocityVec)[0]);
			processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[1], ((ParticleVel*)velocityVec)[1]);
			processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[2], ((ParticleVel*)velocityVec)[2]);
			processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[3], ((ParticleVel*)velocityVec)[3]);

			pVertexBuffer[i].x = ((ParticleCoord*)particleCoordsVec)[0].x;
			pVertexBuffer[i].y = ((ParticleCoord*)particleCoordsVec)[0].y;
			pVertexBuffer[i + 1].x = ((ParticleCoord*)particleCoordsVec)[1].x;
			pVertexBuffer[i + 1].y = ((ParticleCoord*)particleCoordsVec)[1].y;
			pVertexBuffer[i + 2].x = ((ParticleCoord*)particleCoordsVec)[2].x;
			pVertexBuffer[i + 2].y = ((ParticleCoord*)particleCoordsVec)[2].y;
			pVertexBuffer[i + 3].x = ((ParticleCoord*)particleCoordsVec)[3].x;
			pVertexBuffer[i + 3].y = ((ParticleCoord*)particleCoordsVec)[3].y;
		}
	}
	pVertexObject->Unlock();

	_mm256_zeroall();
}
Example no. 9
void electric_field(struct Structure This_Structure, float grid_span, int grid_size, fftw_real * grid, int *shared_x, struct atom_values *atoms, int natoms_in)
{

/************/

	/* Counters */

	int residue, atom, i;

	/* Co-ordinates */

	int x, y, z;
	float x_centre, y_centre, z_centre;

	/* Variables */

	float distance;
	float phi, epsilon;

	while (1) {
		
		pthread_mutex_lock(&shared_x_mutex);
		x = *shared_x;
		*shared_x = *shared_x + 1;
		pthread_mutex_unlock(&shared_x_mutex);
		
		if (x >= grid_size)
			break;
		
		printf(".");

		x_centre = gcentre(x, grid_span, grid_size);
		__mtype mx_centre = _set1_ps(x_centre);

		for (y = 0; y < grid_size; y++) {

			y_centre = gcentre(y, grid_span, grid_size);
			__mtype my_centre = _set1_ps(y_centre);

			for (z = 0; z < grid_size; z++) {

				z_centre = gcentre(z, grid_span, grid_size);
				__mtype mz_centre = _set1_ps(z_centre);

				phi = 0;
				__mtype phis = _set1_ps(0.0);
				
				for (i = 0; i < natoms_in; i++) {
				
					__mtype xs = _load_ps(atoms[i].xs);
					__mtype ys = _load_ps(atoms[i].ys);
					__mtype zs = _load_ps(atoms[i].zs);
					__mtype charges = _load_ps(atoms[i].charges);
					__mtype distances;
					
					// Compute the distances (the original pythagoras)
					__mtype diffxs = _sub_ps(xs, mx_centre);
					__mtype diffys = _sub_ps(ys, my_centre);
					__mtype diffzs = _sub_ps(zs, mz_centre);
					
					diffxs = _mul_ps(diffxs, diffxs);
					diffys = _mul_ps(diffys, diffys);
					diffzs = _mul_ps(diffzs, diffzs);
					
					distances = _add_ps(diffxs, diffys);
					distances = _add_ps(distances, diffzs);
					
					distances = _sqrt_ps(distances);
					
					// From here on, the original if's are implemented using only bit masks
					
					// Clamp to a minimum of 2
					distances = _max_ps(distances, _set1_ps(2.0));
					
					__mtype epsilons = _set1_ps(0.0);
					__mtype tmp;
					__mtype tmp2;
					
					// if >= 8
					tmp = _cmpge_ps(distances, _set1_ps(8.0));
					epsilons = _and_ps(tmp, _set1_ps(80.0));
					
					// else if <= 6
					tmp = _cmple_ps(distances, _set1_ps(6.0));
					tmp = _and_ps(tmp, _set1_ps(4.0));
					epsilons = _or_ps(epsilons, tmp);
					
					// else
					tmp = _cmpgt_ps(distances, _set1_ps(6.0));
					tmp2 = _cmpeq_ps(epsilons, _set1_ps(0.0));
					tmp = _and_ps(tmp, tmp2);
					tmp2 = _mul_ps(distances, _set1_ps(38.0));
					tmp2 = _sub_ps(tmp2, _set1_ps(224.0));
					tmp = _and_ps(tmp, tmp2);
			
					// Final value
					epsilons = _or_ps(epsilons, tmp);
					
					// Compute the phis
					tmp = _mul_ps(epsilons, distances);
					tmp = _div_ps(charges, tmp);
					
					// Accumulate the phis
					phis = _add_ps(phis, tmp);
				}
				#ifdef USE_AVX
				
				phis = _mm256_hadd_ps(phis, phis);
				phis = _mm256_hadd_ps(phis, phis);
				
				phi = phis[0] + phis[4];
				
				#else
				
				float tmp, tmp2;
				
				tmp = phis[0] + phis[1];
				tmp2 = phis[2] + phis[3];
				
				phi = tmp + tmp2;
				
				#endif
				grid[gaddress(x, y, z, grid_size)] = (fftw_real) phi;

			}
		}
	}

/************/

	return;

}
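For readability, a scalar sketch of the branchless epsilon/phi selection implemented with masks inside the atom loop (hypothetical helper, reconstructed from the masked comparisons and constants above):

static float phi_contribution(float distance, float charge)
{
	float epsilon;
	if (distance < 2.0f) distance = 2.0f;        /* clamp to a minimum of 2 */
	if (distance >= 8.0f)      epsilon = 80.0f;
	else if (distance <= 6.0f) epsilon = 4.0f;
	else                       epsilon = 38.0f * distance - 224.0f;
	return charge / (epsilon * distance);        /* accumulated into phi per atom */
}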
Example no. 10
void kernel_ssymv_4_lib8(int kmax, int kna, float *A, int sda, float *x_n, float *y_n, float *x_t, float *y_t, int tri, int alg)
	{
	
	if(kmax<=0) 
		return;
	
	const int lda = 8;
	
	__builtin_prefetch( A + 0*lda );
	__builtin_prefetch( A + 2*lda );

	int 
		k, k_left, ii;
	
	float 
		k_left_d;

	const float mask_f[] = {7.5, 6.5, 5.5, 4.5, 3.5, 2.5, 1.5, 0.5};
	float temp_space[8] = {};

	__m256
		mask,
		zeros, temp,
		a_00, a_01, a_02, a_03,
		x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
		x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
	
	mask = _mm256_loadu_ps( mask_f ); 

	zeros = _mm256_setzero_ps();

	x_n_0 = _mm256_broadcast_ss( &x_n[0] );
	x_n_1 = _mm256_broadcast_ss( &x_n[1] );
	x_n_2 = _mm256_broadcast_ss( &x_n[2] );
	x_n_3 = _mm256_broadcast_ss( &x_n[3] );

	if(alg==-1) // TODO xor
		{
		x_n_0 = _mm256_sub_ps( zeros, x_n_0 );
		x_n_1 = _mm256_sub_ps( zeros, x_n_1 );
		x_n_2 = _mm256_sub_ps( zeros, x_n_2 );
		x_n_3 = _mm256_sub_ps( zeros, x_n_3 );
		}

	y_t_0 = _mm256_setzero_ps();
	y_t_1 = _mm256_setzero_ps();
	y_t_2 = _mm256_setzero_ps();
	y_t_3 = _mm256_setzero_ps();
	
	k=0;

	// corner
	if(tri==1)
		{
		
		k_left = kna-k;

		k_left_d = 8.0 - k_left;
/*printf("\nk_left = %d\n", k_left);*/

/*		y_n_0 = _mm_load_ps( &y_n[0] );*/
/*		y_n_0 = _mm_setzero_ps();*/
		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );

/*_mm256_storeu_ps( temp_space, x_t_0 );		*/
/*printf("\nx = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/
/*exit(1);*/

		a_00  = _mm256_loadu_ps( &A[0+lda*0] );
		a_00  = _mm256_blend_ps( a_00, zeros, 0x00 );
/*		temp  = _mm256_mul_ps( a_00, x_n_0 );*/
/*		y_n_0 = _mm256_add_ps( y_n_0, temp );*/
		y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
		a_00  = _mm256_blend_ps( a_00, zeros, 0x01 );
		temp  = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );

		a_01  = _mm256_loadu_ps( &A[0+lda*1] );
		a_01  = _mm256_blend_ps( a_01, zeros, 0x01 );
		temp  = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		a_01  = _mm256_blend_ps( a_01, zeros, 0x03 );
		temp  = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );

		a_02  = _mm256_loadu_ps( &A[0+lda*2] );
		a_02  = _mm256_blend_ps( a_02, zeros, 0x03 );
		temp  = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		a_02  = _mm256_blend_ps( a_02, zeros, 0x07 );
		temp  = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );

		a_03  = _mm256_loadu_ps( &A[0+lda*3] );
		a_03  = _mm256_blend_ps( a_03, zeros, 0x07 );
		temp  = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		a_03  = _mm256_blend_ps( a_03, zeros, 0x0f );
		temp  = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );
		
/*_mm256_storeu_ps( temp_space, y_n_0 );		*/
/*printf("\ny = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/
		y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
/*_mm256_storeu_ps( temp_space, y_n_0 );		*/
/*printf("\ny = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/
		x_t_0 = _mm256_loadu_ps( &y_n[0] );
		y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
		_mm256_storeu_ps( &y_n[0], y_n_0 );
		

		A   += k_left;
		y_n += k_left;
		x_t += k_left;

		k   += k_left;

		}

	if(k<kna) // it can be only k_left = {1, 2, 3, 4, 5, 6, 7}
/*	for(; k<kna; k++)*/
		{
		
		k_left = kna-k;

		k_left_d = 8.0 - k_left;
/*printf("\nk_left = %d\n", k_left);*/

/*		y_n_0 = _mm_load_ps( &y_n[0] );*/
/*		y_n_0 = _mm_setzero_ps();*/
		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
/*_mm256_storeu_ps( temp_space, x_t_0 );		*/
/*printf("\nx = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/
		
		a_00  = _mm256_loadu_ps( &A[0+lda*0] );
		a_01  = _mm256_loadu_ps( &A[0+lda*1] );
		a_02  = _mm256_loadu_ps( &A[0+lda*2] );
		a_03  = _mm256_loadu_ps( &A[0+lda*3] );
		
/*		temp  = _mm256_mul_ps( a_00, x_n_0 );*/
/*		y_n_0 = _mm256_add_ps( y_n_0, temp );*/
		y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
		temp  = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );
		temp  = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );
		temp  = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );
		temp  = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );
		
		y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		x_t_0 = _mm256_loadu_ps( &y_n[0] );
		y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
		_mm256_storeu_ps( &y_n[0], y_n_0 );
/*		_mm256_maskstore_ps( &y_n[0], (__m256i) _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ), y_n_0 );*/

/*		_mm256_storeu_ps( temp_space, y_n_0 );*/
/*		for(ii=0; ii<k_left; ii++)*/
/*			y_n[ii] = temp_space[ii];*/

/*printf("\nk_left = %d\n", k_left);*/
/*exit(1);*/
	
		A   += k_left;
		y_n += k_left;
		x_t += k_left;

		k   += k_left;
		
		}
	if(kna>0 || tri==1)
		{
		A += (sda-1)*lda;
		}
	for(; k<kmax-7; k+=8)
		{
		
		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		y_n_0 = _mm256_loadu_ps( &y_n[0] );
		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		
		a_00  = _mm256_load_ps( &A[0+lda*0] );
		temp  = _mm256_mul_ps( a_00, x_n_0 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );

		a_01  = _mm256_load_ps( &A[0+lda*1] );
		temp  = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );

		a_02  = _mm256_load_ps( &A[0+lda*2] );
		temp  = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );

		a_03  = _mm256_load_ps( &A[0+lda*3] );
		temp  = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );
		
		_mm256_storeu_ps( &y_n[0], y_n_0 );
		

		A   += sda*lda;
		y_n += 8;
		x_t += 8;

		}
	
	if(k<kmax) // it can be only k_left = {1, 2, 3, 4, 5, 6, 7}
		{
		
		k_left = kmax-k;

		k_left_d = 8.0 - k_left;

/*		y_n_0 = _mm_load_ps( &y_n[0] );*/
/*		y_n_0 = _mm_setzero_ps();*/
		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		
/*printf("\nk_left2 = %d\n", k_left, kmax, k);*/
		a_00  = _mm256_load_ps( &A[0+lda*0] );
/*printf("\nk_left2 = %d\n", k_left);*/
		a_01  = _mm256_load_ps( &A[0+lda*1] );
		a_02  = _mm256_load_ps( &A[0+lda*2] );
		a_03  = _mm256_load_ps( &A[0+lda*3] );
		
/*		temp  = _mm256_mul_ps( a_00, x_n_0 );*/
/*		y_n_0 = _mm256_add_ps( y_n_0, temp );*/
		y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
		temp  = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );
		temp  = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );
		temp  = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );
		temp  = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );
		
		y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		x_t_0 = _mm256_loadu_ps( &y_n[0] );
		y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
		_mm256_storeu_ps( &y_n[0], y_n_0 );
/*		_mm256_maskstore_ps( &y_n[0], (__m256i) _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ), y_n_0 );*/
	
/*		_mm256_storeu_ps( temp_space, y_n_0 );*/
/*		for(ii=0; ii<k_left; ii++)*/
/*			y_n[ii] = temp_space[ii];*/

/*		A   += 1;*/
/*		y_n += 1;*/
/*		x_t += 1;*/
		
		}

	// reduction
	__m128
		z_0, z_1;

	y_t_0 = _mm256_hadd_ps(y_t_0, y_t_1);
	y_t_2 = _mm256_hadd_ps(y_t_2, y_t_3);

	y_t_0 = _mm256_hadd_ps(y_t_0, y_t_2);

	y_t_1 = _mm256_permute2f128_ps(y_t_0, y_t_0, 0x01);
	
	z_0 = _mm256_castps256_ps128(y_t_0);
	z_1 = _mm256_castps256_ps128(y_t_1);
	
	z_1 = _mm_add_ps(z_0, z_1);
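	// element i of z_1 now holds the full horizontal sum of accumulator y_t_i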

	if(alg==1)
		{
		z_0 = _mm_loadu_ps( &y_t[0] );

		z_0 = _mm_add_ps(z_0, z_1);

		_mm_storeu_ps( &y_t[0], z_0 );
		}
	else // alg==-1
		{
		z_0 = _mm_loadu_ps( &y_t[0] );

		z_0 = _mm_sub_ps(z_0, z_1);

		_mm_storeu_ps( &y_t[0], z_0 );
		}
	
	}
Example no. 11
void neuralNet::feedForward_layer(layerIterator_t nLayer)
{
	constFloatIterator_t pActivations, cWeight, endWeight;
	__m256 vTotal, vSub0, vSub1;
	__m256 *vWeight, *vAct, *vEndWeight;

	// summate each neuron's contribution
	for (neuronIterator_t cNeuron = nLayer->begin(), end = nLayer->end(); 
		cNeuron != end; 
		++cNeuron)
	{
		// foreach [previous neuron, current weight], up to endWeight
		pActivations = activations.begin() + (nLayer - 1)->front().iNeuronIndex;
		cWeight = cNeuron->weightsBegin(*this);
		endWeight = cNeuron->weightsEnd(*this);

		// (first 15 neurons) (TODO: redesign preamble and remove assertions for multiple of 16 size widths in neuralNet.h!)

		// summate all neurons of previous layer: (remaining batches of 8 neurons)
		vWeight = (__m256*)&cWeight[0];
		vAct = (__m256*)&pActivations[0];

		vEndWeight = (__m256*)&endWeight[0];

		// initialize the activation of this neuron to its bias weight. The bias weight's neuron is always on:
		vTotal = _mm256_set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, *endWeight); // can this be made with an aligned load?

		do // Take advantage of SIMD instructions by doing 16 multiplies per iteration
		{
			/* 
			 * each neuron's contribution is:
			 * input[j] += weight[i,j] * activation[i]
			 */
			// multiply:
			vSub0 = _mm256_mul_ps(vWeight[0], vAct[0]);
			vSub1 = _mm256_mul_ps(vWeight[1], vAct[1]);

			// prefetch next values: (these don't appear to help, are the networks too small for this to matter?)
			//_mm_prefetch((char*)(vWeight0+4), _MM_HINT_T0);
			//_mm_prefetch((char*)(vAct0+4), _MM_HINT_T0);

			// add to accumulator:
			vTotal = _mm256_add_ps(vTotal, vSub0);
			vTotal = _mm256_add_ps(vTotal, vSub1);

			// increment pointers:
			vWeight += 2;
			vAct += 2;
		}
		while (vWeight != vEndWeight);

		//finalize: (combine all 4 accumulators)
		{
			vTotal = _mm256_hadd_ps(vTotal, vTotal);
			vTotal = _mm256_hadd_ps(vTotal, vTotal);
			__m128 vUpperTotal = _mm256_extractf128_ps(vTotal, 1);
			vUpperTotal = _mm_add_ps(vUpperTotal, _mm256_castps256_ps128(vTotal));

			// store the lowest float into cInput:
			_mm_store_ss(&activations[cNeuron->iNeuronIndex], vUpperTotal);
		}
	}

	// activate all neurons in this layer:
	float* cActivation = (&activations.front() + nLayer->front().iNeuronIndex);
	float* lActivation = (&activations.front() + nLayer->back().iNeuronIndex + 1);
	float* lVectorActivation = lActivation - ((lActivation - cActivation)&(ALIGN_SIZE-1)); // equivalent to mod ALIGN_SIZE

	// aligned activations:
	while (cActivation != lVectorActivation)
	{
		activation_approx_avx(cActivation, cActivation);
		cActivation += ALIGN_SIZE;
	};

	// postscript: (unaligned activations):
	{
		size_t dActivation = (lActivation - cActivation);
		switch(dActivation)
		{
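		// intentional fall-through: cases 7..5 peel one trailing activation each
		// (indices 6, 5, 4), then case 4 finishes indices 0..3 with one SSE call;
		// cases 3..1 likewise fall through one element at a time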
		case 7:
			activation_approx(cActivation+6,cActivation+6);
		case 6:
			activation_approx(cActivation+5,cActivation+5);
		case 5:
			activation_approx(cActivation+4,cActivation+4);
		case 4:
			activation_approx_sse(cActivation+0,cActivation+0);
			break;
		case 3:
			activation_approx(cActivation+2, cActivation+2);
		case 2:
			activation_approx(cActivation+1, cActivation+1);
		case 1:
			activation_approx(cActivation+0, cActivation+0);
		case 0:
			break;
		}
	}
}; // endOf feedForward_layer