void bli_pool_alloc_block( siz_t   block_size,
                           siz_t   align_size,
                           pblk_t* block )
{
    void* buf_sys;
    void* buf_align;

    // Allocate the block. We add the alignment size to ensure we will
    // have enough usable space after alignment.
    buf_sys   = bli_malloc( block_size + align_size );
    buf_align = buf_sys;

    // Advance the pointer to achieve the necessary alignment, if it is
    // not already aligned.
    if ( bli_is_unaligned_to( buf_sys, align_size ) )
    {
        // C99's stdint.h guarantees that a void* can be safely cast to a
        // uintptr_t and then back to a void*, hence the casting of buf_sys
        // and align_size to uintptr_t. buf_align is initially cast to char*
        // to allow pointer arithmetic in units of bytes, and then advanced
        // to the next nearest alignment boundary, and finally cast back to
        // void* before being stored. Notice that the arithmetic works even
        // if the alignment value is not a power of two.
        buf_align = ( void* )( ( char* )buf_align +
                               ( ( uintptr_t )align_size -
                                 ( uintptr_t )buf_sys % ( uintptr_t )align_size )
                             );
    }

    // Save the results in the pblk_t structure.
    bli_pblk_set_buf_sys( buf_sys, block );
    bli_pblk_set_buf_align( buf_align, block );
}
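To make the pointer arithmetic above concrete, here is a minimal, self-contained sketch of the same computation on plain integer addresses. It is illustrative only; the addresses, alignment value, and program itself are hypothetical and not part of BLIS. It also demonstrates the comment's claim that the arithmetic works for a non-power-of-two alignment.

#include <stdint.h>
#include <stdio.h>

int main( void )
{
    // Hypothetical address returned by the underlying allocator.
    uintptr_t addr  = 1000;
    uintptr_t align = 48; // note: not a power of two

    // Advance to the next multiple of align: 1000 % 48 == 40, so we
    // move forward by 48 - 40 == 8 bytes, landing on 1008 == 21 * 48.
    if ( addr % align != 0 )
        addr += align - ( addr % align );

    printf( "%lu\n", ( unsigned long )addr ); // prints 1008
    return 0;
}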
void bli_pool_alloc_block( siz_t   block_size,
                           siz_t   align_size,
                           pblk_t* block )
{
    void* buf_sys;
    void* buf_align;

    // Allocate the block. We add the alignment size to ensure we will
    // have enough usable space after alignment.
    buf_sys   = bli_malloc( block_size + align_size );
    buf_align = buf_sys;

    // Advance the pointer to achieve the necessary alignment, if it
    // is not already aligned.
    if ( bli_is_unaligned_to( ( uintptr_t )buf_sys,
                              ( uintptr_t )align_size ) )
    {
        // Notice that this works even if the alignment is not a power
        // of two. (The intermediate char* cast is needed because
        // pointer arithmetic on void* is not standard C.)
        buf_align = ( void* )( ( char* )buf_align +
                               ( ( uintptr_t )align_size -
                                 ( ( uintptr_t )buf_sys % align_size ) ) );
    }

    // Save the results in the pblk_t structure.
    bli_pblk_set_buf_sys( buf_sys, block );
    bli_pblk_set_buf_align( buf_align, block );
}
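Both pointers are saved in the pblk_t because only the original (system) pointer may legally be handed back to the underlying free routine; the aligned pointer generally points into the middle of the allocation. A hedged sketch of the corresponding release step, assuming a bli_free() that pairs with bli_malloc(); the accessor bli_pblk_buf_sys() is hypothetical, assumed to mirror the bli_pblk_set_buf_sys() setter used above:

// Sketch only: bli_pblk_buf_sys() is a hypothetical accessor, not
// necessarily the real BLIS API.
void bli_pool_free_block_sketch( pblk_t* block )
{
    // Release the block using the original, unaligned address;
    // freeing the aligned address would corrupt the heap.
    bli_free( bli_pblk_buf_sys( block ) );
}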
void* bli_malloc_align
     (
       malloc_ft f,
       size_t    size,
       size_t    align_size
     )
{
    const size_t ptr_size     = sizeof( void* );
    size_t       align_offset = 0;
    void*        p_orig;
    int8_t*      p_byte;
    void**       p_addr;

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
        bli_malloc_align_check( f, size, align_size );

    // Return early if zero bytes were requested.
    if ( size == 0 ) return NULL;

    // Add the alignment size and the size of a pointer to the number
    // of bytes to allocate.
    size += align_size + ptr_size;

    // Call the allocation function.
    p_orig = f( size );

    // If NULL was returned, something is probably very wrong.
    if ( p_orig == NULL ) bli_abort();

    // Advance the pointer by one pointer element.
    p_byte  = p_orig;
    p_byte += ptr_size;

    // Compute the offset to the desired alignment.
    if ( bli_is_unaligned_to( ( siz_t )p_byte, ( siz_t )align_size ) )
    {
        align_offset = align_size -
                       bli_offset_past_alignment( ( siz_t )p_byte,
                                                  ( siz_t )align_size );
    }

    // Advance the pointer using the difference between the alignment
    // size and the alignment offset.
    p_byte += align_offset;

    // Compute the address of the pointer element just before the start
    // of the aligned address, and store the original address there.
    p_addr  = ( void** )( p_byte - ptr_size );
    *p_addr = p_orig;

    // Return the aligned pointer.
    return p_byte;
}
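Because bli_malloc_align() stashes the original pointer in the pointer-sized slot immediately before the aligned address, deallocation can recover that pointer with no bookkeeping anywhere else. A minimal sketch of the inverse operation, assuming a free_ft function-pointer type analogous to malloc_ft above (the function name is illustrative, not necessarily the real BLIS routine):

void bli_free_align_sketch( free_ft f, void* p )
{
    // A NULL here corresponds to the zero-size early return above.
    if ( p == NULL ) return;

    // Read back the original pointer stored just before the aligned
    // address, then release the allocation through the supplied
    // deallocation function.
    void* p_orig = *( ( void** )( ( int8_t* )p - sizeof( void* ) ) );

    f( p_orig );
}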
void bli_daxpyf_int_var1
     (
       conj_t  conja,
       conj_t  conjx,
       dim_t   m,
       dim_t   b_n,
       double* alpha,
       double* a, inc_t inca, inc_t lda,
       double* x, inc_t incx,
       double* y, inc_t incy,
       cntx_t* cntx
     )
{
    double* restrict alpha_cast = alpha;
    double* restrict a_cast     = a;
    double* restrict x_cast     = x;
    double* restrict y_cast     = y;
    dim_t            i;

    const dim_t      n_elem_per_reg = 2;
    const dim_t      n_iter_unroll  = 2;

    dim_t            m_pre;
    dim_t            m_run;
    dim_t            m_left;

    double* restrict a0;
    double* restrict a1;
    double* restrict a2;
    double* restrict a3;
    double* restrict y0;
    double           a0c, a1c, a2c, a3c;
    double           chi0, chi1, chi2, chi3;

    v2df_t           a00v, a01v, a02v, a03v, y0v;
    v2df_t           a10v, a11v, a12v, a13v, y1v;
    v2df_t           chi0v, chi1v, chi2v, chi3v;

    bool_t           use_ref = FALSE;

    if ( bli_zero_dim2( m, b_n ) ) return;

    m_pre = 0;

    // If there is anything that would interfere with our use of aligned
    // vector loads/stores, call the reference implementation.
    if ( b_n < bli_cntx_get_blksz_def_dt( BLIS_DOUBLE, BLIS_AF, cntx ) )
    {
        use_ref = TRUE;
    }
    else if ( inca != 1 || incx != 1 || incy != 1 ||
              bli_is_unaligned_to( lda*sizeof(double), 16 ) )
    {
        use_ref = TRUE;
    }
    else if ( bli_is_unaligned_to( a, 16 ) ||
              bli_is_unaligned_to( y, 16 ) )
    {
        use_ref = TRUE;

        // If a and y are unaligned by the same offset, we can still use
        // aligned loads/stores after processing one element up front.
        if ( bli_is_unaligned_to( a, 16 ) &&
             bli_is_unaligned_to( y, 16 ) )
        {
            use_ref = FALSE;
            m_pre   = 1;
        }
    }

    // Call the reference implementation if needed.
    if ( use_ref == TRUE )
    {
        BLIS_DAXPYF_KERNEL_REF( conja, conjx, m, b_n,
                                alpha_cast,
                                a_cast, inca, lda,
                                x_cast, incx,
                                y_cast, incy,
                                cntx );
        return;
    }

    m_run  = ( m - m_pre ) / ( n_elem_per_reg * n_iter_unroll );
    m_left = ( m - m_pre ) % ( n_elem_per_reg * n_iter_unroll );

    a0 = a_cast + 0*lda;
    a1 = a_cast + 1*lda;
    a2 = a_cast + 2*lda;
    a3 = a_cast + 3*lda;
    y0 = y_cast;

    chi0 = *(x_cast + 0*incx);
    chi1 = *(x_cast + 1*incx);
    chi2 = *(x_cast + 2*incx);
    chi3 = *(x_cast + 3*incx);

    PASTEMAC2(d,d,scals)( *alpha_cast, chi0 );
    PASTEMAC2(d,d,scals)( *alpha_cast, chi1 );
    PASTEMAC2(d,d,scals)( *alpha_cast, chi2 );
    PASTEMAC2(d,d,scals)( *alpha_cast, chi3 );

    // If a pre-loop iteration is needed to achieve alignment, process
    // one scalar element of each column first.
    if ( m_pre == 1 )
    {
        a0c = *a0;
        a1c = *a1;
        a2c = *a2;
        a3c = *a3;

        *y0 += chi0 * a0c +
               chi1 * a1c +
               chi2 * a2c +
               chi3 * a3c;

        a0 += inca;
        a1 += inca;
        a2 += inca;
        a3 += inca;
        y0 += incy;
    }

    chi0v.v = _mm_loaddup_pd( ( double* )&chi0 );
    chi1v.v = _mm_loaddup_pd( ( double* )&chi1 );
    chi2v.v = _mm_loaddup_pd( ( double* )&chi2 );
    chi3v.v = _mm_loaddup_pd( ( double* )&chi3 );

    for ( i = 0; i < m_run; ++i )
    {
        y0v.v  = _mm_load_pd( ( double* )(y0 + 0*n_elem_per_reg) );

        a00v.v = _mm_load_pd( ( double* )(a0 + 0*n_elem_per_reg) );
        a01v.v = _mm_load_pd( ( double* )(a1 + 0*n_elem_per_reg) );

        y0v.v += chi0v.v * a00v.v;
        y0v.v += chi1v.v * a01v.v;

        a02v.v = _mm_load_pd( ( double* )(a2 + 0*n_elem_per_reg) );
        a03v.v = _mm_load_pd( ( double* )(a3 + 0*n_elem_per_reg) );

        y0v.v += chi2v.v * a02v.v;
        y0v.v += chi3v.v * a03v.v;

        _mm_store_pd( ( double* )(y0 + 0*n_elem_per_reg), y0v.v );

        y1v.v  = _mm_load_pd( ( double* )(y0 + 1*n_elem_per_reg) );

        a10v.v = _mm_load_pd( ( double* )(a0 + 1*n_elem_per_reg) );
        a11v.v = _mm_load_pd( ( double* )(a1 + 1*n_elem_per_reg) );

        y1v.v += chi0v.v * a10v.v;
        y1v.v += chi1v.v * a11v.v;

        a12v.v = _mm_load_pd( ( double* )(a2 + 1*n_elem_per_reg) );
        a13v.v = _mm_load_pd( ( double* )(a3 + 1*n_elem_per_reg) );

        y1v.v += chi2v.v * a12v.v;
        y1v.v += chi3v.v * a13v.v;

        _mm_store_pd( ( double* )(y0 + 1*n_elem_per_reg), y1v.v );

        a0 += n_elem_per_reg * n_iter_unroll;
        a1 += n_elem_per_reg * n_iter_unroll;
        a2 += n_elem_per_reg * n_iter_unroll;
        a3 += n_elem_per_reg * n_iter_unroll;
        y0 += n_elem_per_reg * n_iter_unroll;
    }

    // Process any remaining elements with scalar arithmetic.
    if ( m_left > 0 )
    {
        for ( i = 0; i < m_left; ++i )
        {
            a0c = *a0;
            a1c = *a1;
            a2c = *a2;
            a3c = *a3;

            *y0 += chi0 * a0c +
                   chi1 * a1c +
                   chi2 * a2c +
                   chi3 * a3c;

            a0 += inca;
            a1 += inca;
            a2 += inca;
            a3 += inca;
            y0 += incy;
        }
    }
}
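The m_pre == 1 fixup above works because doubles are 8-byte aligned: if a and y are both 16-byte unaligned, each must be offset by exactly 8 bytes, so advancing both pointers by one element restores 16-byte alignment for the vectorized loop. A standalone illustration using hypothetical addresses (not BLIS code):

#include <assert.h>
#include <stdint.h>

int main( void )
{
    // Hypothetical 16-byte-unaligned addresses of double arrays; for
    // 8-byte-aligned doubles the misalignment can only be 8 bytes.
    uintptr_t a = 0x1008;
    uintptr_t y = 0x2008;

    // One pre-loop ("m_pre") element advances each by sizeof(double).
    a += sizeof( double );
    y += sizeof( double );

    // Both pointers now sit on a 16-byte boundary, so the main loop
    // can use aligned loads (_mm_load_pd) and stores (_mm_store_pd).
    assert( a % 16 == 0 );
    assert( y % 16 == 0 );
    return 0;
}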