Exemple #1
0
void bli_pool_alloc_block( siz_t   block_size,
                           siz_t   align_size,
                           pblk_t* block )
{
	void* buf_sys;
	void* buf_align;

	// Allocate the block. We add the alignment size to ensure we will
	// have enough usable space after alignment.
	buf_sys   = bli_malloc( block_size + align_size );
	buf_align = buf_sys;

	// Advance the pointer to achieve the necessary alignment, if it is not
	// already aligned.
	if ( bli_is_unaligned_to( buf_sys, align_size ) )
	{
		// C99's stdint.h guarantees that a void* can be safely cast to a
		// uintptr_t and then back to a void*, hence the casting of buf_sys
		// and align_size to uintptr_t. buf_align is initially cast to char*
		// to allow pointer arithmetic in units of bytes, and then advanced
		// to the next nearest alignment boundary, and finally cast back to
		// void* before being stored. Notice that the arithmetic works even
		// if the alignment value is not a power of two.
		buf_align = ( void* )(   ( char*     )buf_align +
		                       ( ( uintptr_t )align_size -
		                         ( uintptr_t )buf_sys %
		                         ( uintptr_t )align_size )
		                     );
	}
	
	// Save the results in the pblk_t structure.
	bli_pblk_set_buf_sys( buf_sys, block );
	bli_pblk_set_buf_align( buf_align, block );
}
Exemple #2
0
void bli_pool_alloc_block( siz_t   block_size,
                           siz_t   align_size,
                           pblk_t* block )
{
	void* buf_sys;
	void* buf_align;

	// Allocate the block. We add the alignment size to ensure we will
	// have enough usable space after alignment.
	buf_sys   = bli_malloc( block_size + align_size );
	buf_align = buf_sys;

	// Advance the pointer to achieve the necessary alignment, if it
	// is not already aligned.
	if ( bli_is_unaligned_to( ( uintptr_t )buf_sys, ( uintptr_t )align_size ) )
	{
		// Notice that this works even if the alignment is not a power of two.
		buf_align += (   ( uintptr_t )align_size - 
		               ( ( uintptr_t )buf_sys % align_size ) );
	}
	
	// Save the results in the pblk_t structure.
	bli_pblk_set_buf_sys( buf_sys, block );
	bli_pblk_set_buf_align( buf_align, block );
}
Exemple #3
0
void* bli_malloc_align
     (
       malloc_ft f,
       size_t    size,
       size_t    align_size
     )
{
	const size_t ptr_size     = sizeof( void* );
	size_t       align_offset = 0;
	void*        p_orig;
	int8_t*      p_byte;
	void**       p_addr;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_malloc_align_check( f, size, align_size );

	// Return early if zero bytes were requested.
	if ( size == 0 ) return NULL;

	// Add the alignment size and the size of a pointer to the number
	// of bytes to allocate.
	size += align_size + ptr_size;

	// Call the allocation function.
	p_orig = f( size );

	// If NULL was returned, something is probably very wrong.
	if ( p_orig == NULL ) bli_abort();

	// Advance the pointer by one pointer element.
	p_byte = p_orig;
	p_byte += ptr_size;

	// Compute the offset to the desired alignment.
	if ( bli_is_unaligned_to( ( siz_t )p_byte, ( siz_t )align_size ) )
	{
		align_offset = align_size -
		               bli_offset_past_alignment( ( siz_t )p_byte,
		                                          ( siz_t )align_size );
	}

	// Advance the pointer using the difference between the alignment
	// size and the alignment offset.
	p_byte += align_offset;

	// Compute the address of the pointer element just before the start
	// of the aligned address, and store the original address there.
	p_addr = ( void** )(p_byte - ptr_size);
	*p_addr = p_orig;

	// Return the aligned pointer.
	return p_byte;
}
Exemple #4
0
void bli_daxpyf_int_var1
     (
       conj_t  conja,
       conj_t  conjx,
       dim_t   m,
       dim_t   b_n,
       double* alpha,
       double* a, inc_t inca, inc_t lda,
       double* x, inc_t incx,
       double* y, inc_t incy,
       cntx_t* cntx
     )
{
	double*  restrict alpha_cast = alpha;
	double*  restrict a_cast = a;
	double*  restrict x_cast = x;
	double*  restrict y_cast = y;
	dim_t             i;

	const dim_t       n_elem_per_reg = 2;
	const dim_t       n_iter_unroll  = 2;

	dim_t             m_pre;
	dim_t             m_run;
	dim_t             m_left;

    double*  restrict a0;
    double*  restrict a1;
    double*  restrict a2;
    double*  restrict a3;
    double*  restrict y0;
    double            a0c, a1c, a2c, a3c;
    double            chi0, chi1, chi2, chi3;

	v2df_t            a00v, a01v, a02v, a03v, y0v;
	v2df_t            a10v, a11v, a12v, a13v, y1v;
	v2df_t            chi0v, chi1v, chi2v, chi3v;

	bool_t            use_ref = FALSE;


	if ( bli_zero_dim2( m, b_n ) ) return;

	m_pre = 0;

	// If there is anything that would interfere with our use of aligned
	// vector loads/stores, call the reference implementation.
	if ( b_n < bli_cntx_get_blksz_def_dt( BLIS_DOUBLE, BLIS_AF, cntx ) )
	{
		use_ref = TRUE;
	}
	else if ( inca != 1 || incx != 1 || incy != 1 ||
	          bli_is_unaligned_to( lda*sizeof(double), 16 ) )
	{
		use_ref = TRUE;
	}
	else if ( bli_is_unaligned_to( a, 16 ) ||
	          bli_is_unaligned_to( y, 16 ) )
	{
		use_ref = TRUE;

		if ( bli_is_unaligned_to( a, 16 ) &&
		     bli_is_unaligned_to( y, 16 ) )
		{
			use_ref = FALSE;
			m_pre   = 1;
		}
	}

	// Call the reference implementation if needed.
	if ( use_ref == TRUE )
	{
		BLIS_DAXPYF_KERNEL_REF( conja,
		                        conjx,
		                        m,
		                        b_n,
		                        alpha_cast,
		                        a_cast, inca, lda,
		                        x_cast, incx,
		                        y_cast, incy,
		                        cntx );
		return;
	}


	m_run       = ( m - m_pre ) / ( n_elem_per_reg * n_iter_unroll );
	m_left      = ( m - m_pre ) % ( n_elem_per_reg * n_iter_unroll );

	a0   = a_cast + 0*lda;
	a1   = a_cast + 1*lda;
	a2   = a_cast + 2*lda;
	a3   = a_cast + 3*lda;
	y0   = y_cast;

	chi0 = *(x_cast + 0*incx);
	chi1 = *(x_cast + 1*incx);
	chi2 = *(x_cast + 2*incx);
	chi3 = *(x_cast + 3*incx);

	PASTEMAC2(d,d,scals)( *alpha_cast, chi0 );
	PASTEMAC2(d,d,scals)( *alpha_cast, chi1 );
	PASTEMAC2(d,d,scals)( *alpha_cast, chi2 );
	PASTEMAC2(d,d,scals)( *alpha_cast, chi3 );

	if ( m_pre == 1 )
	{
		a0c = *a0;
		a1c = *a1;
		a2c = *a2;
		a3c = *a3;

		*y0 += chi0 * a0c + 
		       chi1 * a1c + 
		       chi2 * a2c + 
		       chi3 * a3c;

		a0 += inca;
		a1 += inca;
		a2 += inca;
		a3 += inca;
		y0 += incy;
	}

	chi0v.v = _mm_loaddup_pd( ( double* )&chi0 );
	chi1v.v = _mm_loaddup_pd( ( double* )&chi1 );
	chi2v.v = _mm_loaddup_pd( ( double* )&chi2 );
	chi3v.v = _mm_loaddup_pd( ( double* )&chi3 );

	for ( i = 0; i < m_run; ++i )
	{
		y0v.v = _mm_load_pd( ( double* )(y0 + 0*n_elem_per_reg) );

		a00v.v = _mm_load_pd( ( double* )(a0 + 0*n_elem_per_reg) );
		a01v.v = _mm_load_pd( ( double* )(a1 + 0*n_elem_per_reg) );

		y0v.v += chi0v.v * a00v.v;
		y0v.v += chi1v.v * a01v.v;

		a02v.v = _mm_load_pd( ( double* )(a2 + 0*n_elem_per_reg) );
		a03v.v = _mm_load_pd( ( double* )(a3 + 0*n_elem_per_reg) );

		y0v.v += chi2v.v * a02v.v;
		y0v.v += chi3v.v * a03v.v;

		_mm_store_pd( ( double* )(y0 + 0*n_elem_per_reg), y0v.v );


		y1v.v = _mm_load_pd( ( double* )(y0 + 1*n_elem_per_reg) );

		a10v.v = _mm_load_pd( ( double* )(a0 + 1*n_elem_per_reg) );
		a11v.v = _mm_load_pd( ( double* )(a1 + 1*n_elem_per_reg) );

		y1v.v += chi0v.v * a10v.v;
		y1v.v += chi1v.v * a11v.v;

		a12v.v = _mm_load_pd( ( double* )(a2 + 1*n_elem_per_reg) );
		a13v.v = _mm_load_pd( ( double* )(a3 + 1*n_elem_per_reg) );

		y1v.v += chi2v.v * a12v.v;
		y1v.v += chi3v.v * a13v.v;

		_mm_store_pd( ( double* )(y0 + 1*n_elem_per_reg), y1v.v );


		a0 += n_elem_per_reg * n_iter_unroll;
		a1 += n_elem_per_reg * n_iter_unroll;
		a2 += n_elem_per_reg * n_iter_unroll;
		a3 += n_elem_per_reg * n_iter_unroll;
		y0 += n_elem_per_reg * n_iter_unroll;
	}

	if ( m_left > 0 )
	{
		for ( i = 0; i < m_left; ++i )
		{
			a0c = *a0;
			a1c = *a1;
			a2c = *a2;
			a3c = *a3;

			*y0 += chi0 * a0c + 
			       chi1 * a1c + 
			       chi2 * a2c + 
			       chi3 * a3c;

			a0 += inca;
			a1 += inca;
			a2 += inca;
			a3 += inca;
			y0 += incy;
		}
	}
}