void bli_cconjmr( uplo_t uplo, int m, int n, scomplex* a, int a_rs, int a_cs )
{
	// Conjugate the uplo-specified triangle of the m-by-n complex matrix A,
	// i.e. negate the imaginary part of every element in that triangle.
	float  neg_one = bli_sm1();
	float* a_imag;
	int    lda, inca;
	int    n_iter;
	int    n_elem_max;
	int    n_elem;
	int    j;

	// Nothing to do for an empty matrix.
	if ( bli_zero_dim2( m, n ) ) return;

	// Start from a column-major view of A...
	n_iter     = n;
	n_elem_max = m;
	lda        = a_cs;
	inca       = a_rs;

	// ...but flip to a row-wise traversal when A is stored by rows, which
	// keeps memory accesses contiguous. Toggling uplo keeps the same set
	// of elements selected under the transposed view.
	if ( bli_is_row_storage( a_rs, a_cs ) )
	{
		bli_swap_ints( n_iter, n_elem_max );
		bli_swap_ints( lda, inca );
		bli_toggle_uplo( uplo );
	}

	if ( bli_is_upper( uplo ) )
	{
		for ( j = 0; j < n_iter; j++ )
		{
			// Column j of the upper triangle holds min(j+1, m) elements.
			n_elem = bli_min( j + 1, n_elem_max );

			// Point at the imaginary part of the column's first element;
			// a stride of 2*inca then visits imaginary parts only.
			a_imag = ( float* )( a + j*lda ) + 1;

			bli_sscal( n_elem, &neg_one, a_imag, 2*inca );
		}
	}
	else // lower triangle
	{
		for ( j = 0; j < n_iter; j++ )
		{
			n_elem = bli_max( 0, n_elem_max - j );
			a_imag = ( float* )( a + j*lda + j*inca ) + 1;

			// Once a column contributes no elements, no later column will.
			if ( n_elem <= 0 ) break;

			bli_sscal( n_elem, &neg_one, a_imag, 2*inca );
		}
	}
}
void bli_dsetmr( uplo_t uplo, int m, int n, double* sigma, double* a, int a_rs, int a_cs )
{
	// Overwrite every element in the strictly upper (or strictly lower)
	// triangle of the m-by-n matrix A with the scalar *sigma. The diagonal
	// and the opposite triangle are left untouched.
	double* col_ptr;
	int     lda, inca;
	int     n_iter;
	int     n_elem_max;
	int     n_elem;
	int     j;

	// Nothing to do for an empty matrix.
	if ( bli_zero_dim2( m, n ) ) return;

	// Assume a column-major traversal to start.
	n_iter     = n;
	n_elem_max = m;
	lda        = a_cs;
	inca       = a_rs;

	// Row-major storage: walk by rows instead for spatial locality.
	// Toggling uplo compensates for the implicit transpose.
	if ( bli_is_row_storage( a_rs, a_cs ) )
	{
		bli_swap_ints( n_iter, n_elem_max );
		bli_swap_ints( lda, inca );
		bli_toggle_uplo( uplo );
	}

	if ( bli_is_upper( uplo ) )
	{
		for ( j = 0; j < n_iter; ++j )
		{
			// Column j of the strictly upper triangle holds min(j, m)
			// elements; the diagonal element itself is excluded.
			n_elem  = bli_min( j, n_elem_max );
			col_ptr = a + j*lda;

			bli_dsetv( n_elem, sigma, col_ptr, inca );
		}
	}
	else // strictly lower triangle
	{
		for ( j = 0; j < n_iter; ++j )
		{
			// Skip past the diagonal: start at row j+1 of column j.
			n_elem  = bli_max( 0, n_elem_max - j - 1 );
			col_ptr = a + j*lda + (j + 1)*inca;

			bli_dsetv( n_elem, sigma, col_ptr, inca );
		}
	}
}
void bli_dmaxabsm( int m, int n, double* a, int a_rs, int a_cs, double* maxabs )
{
	// Find the largest absolute value among all elements of the m-by-n
	// matrix A and store it in *maxabs. An empty matrix reports zero.
	double  zero = bli_d0();
	double* col_ptr;
	double  best;
	double  col_max;
	int     inca, lda;
	int     n_iter;
	int     n_elem;
	int     j;

	// An empty matrix has no elements over which to maximize.
	if ( bli_zero_dim2( m, n ) )
	{
		*maxabs = zero;
		return;
	}

	// Assume a column-major traversal to start.
	inca   = a_rs;
	lda    = a_cs;
	n_iter = n;
	n_elem = m;

	// Row-major storage: walk by rows instead for spatial locality.
	if ( bli_is_row_storage( a_rs, a_cs ) )
	{
		bli_swap_ints( n_iter, n_elem );
		bli_swap_ints( lda, inca );
	}

	// Seed the running maximum with |a[0]| so every comparison below is
	// against a value actually present in the matrix.
	bli_dabsval2( a, &best );

	for ( j = 0; j < n_iter; ++j )
	{
		col_ptr = a + j*lda;

		// Largest absolute value within this column (or row).
		bli_dmaxabsv( n_elem, col_ptr, inca, &col_max );

		if ( col_max > best ) best = col_max;
	}

	*maxabs = best;
}
// Fused AXPY ("axpyf") kernel, double-precision real: y := y + alpha * A * x,
// where A is m-by-b_n, vectorized with 128-bit SSE intrinsics (two doubles
// per register, unrolled by two, so four rows of y per main-loop iteration).
//
// conja/conjx are carried for interface uniformity; in the real domain they
// are not consulted by this body. NOTE(review): only columns 0..3 of A and
// elements 0..3 of x are ever read, so once the b_n check below passes this
// kernel presumably relies on b_n equaling the fuse factor 4 (the BLIS_AF
// blocksize in cntx) -- confirm against the context configuration.
void bli_daxpyf_int_var1 ( conj_t conja, conj_t conjx, dim_t m, dim_t b_n, double* alpha, double* a, inc_t inca, inc_t lda, double* x, inc_t incx, double* y, inc_t incy, cntx_t* cntx )
{
	double* restrict alpha_cast = alpha;
	double* restrict a_cast = a;
	double* restrict x_cast = x;
	double* restrict y_cast = y;
	dim_t i;

	// Vector geometry: 2 doubles per XMM register, 2x unroll per iteration.
	const dim_t n_elem_per_reg = 2;
	const dim_t n_iter_unroll = 2;

	dim_t m_pre;   // 0 or 1: scalar prologue elements used to reach alignment
	dim_t m_run;   // number of full vector-loop iterations
	dim_t m_left;  // scalar fringe elements after the vector loop

	// Per-column pointers into A, and the running pointer into y.
	double* restrict a0;
	double* restrict a1;
	double* restrict a2;
	double* restrict a3;
	double* restrict y0;

	double a0c, a1c, a2c, a3c;      // scalar A values (prologue/fringe paths)
	double chi0, chi1, chi2, chi3;  // elements of x, pre-scaled by alpha

	// v2df_t: presumably a project union wrapping __m128d -- see BLIS headers.
	v2df_t a00v, a01v, a02v, a03v, y0v;
	v2df_t a10v, a11v, a12v, a13v, y1v;
	v2df_t chi0v, chi1v, chi2v, chi3v;

	bool_t use_ref = FALSE;

	// Return early if possible (empty problem).
	if ( bli_zero_dim2( m, b_n ) ) return;

	m_pre = 0;

	// If there is anything that would interfere with our use of aligned
	// vector loads/stores, call the reference implementation.
	if ( b_n < bli_cntx_get_blksz_def_dt( BLIS_DOUBLE, BLIS_AF, cntx ) )
	{
		// Fewer columns than the fuse factor: punt to the reference kernel.
		use_ref = TRUE;
	}
	else if ( inca != 1 || incx != 1 || incy != 1 ||
	          bli_is_unaligned_to( lda*sizeof(double), 16 ) )
	{
		// Non-unit strides, or a column stride that breaks 16-byte
		// alignment from one column of A to the next.
		use_ref = TRUE;
	}
	else if ( bli_is_unaligned_to( a, 16 ) ||
	          bli_is_unaligned_to( y, 16 ) )
	{
		use_ref = TRUE;

		// Special case: if BOTH a and y are misaligned (each by 8 bytes,
		// the only possibility for double arrays), one scalar prologue
		// element brings both to 16-byte alignment simultaneously.
		if ( bli_is_unaligned_to( a, 16 ) &&
		     bli_is_unaligned_to( y, 16 ) )
		{
			use_ref = FALSE;
			m_pre = 1;
		}
	}

	// Call the reference implementation if needed.
	if ( use_ref == TRUE )
	{
		BLIS_DAXPYF_KERNEL_REF( conja, conjx, m, b_n, alpha_cast, a_cast, inca, lda, x_cast, incx, y_cast, incy, cntx );
		return;
	}

	// Split the (now aligned) remainder of m into full vector passes and a
	// scalar fringe.
	m_run = ( m - m_pre ) / ( n_elem_per_reg * n_iter_unroll );
	m_left = ( m - m_pre ) % ( n_elem_per_reg * n_iter_unroll );

	a0 = a_cast + 0*lda;
	a1 = a_cast + 1*lda;
	a2 = a_cast + 2*lda;
	a3 = a_cast + 3*lda;
	y0 = y_cast;

	chi0 = *(x_cast + 0*incx);
	chi1 = *(x_cast + 1*incx);
	chi2 = *(x_cast + 2*incx);
	chi3 = *(x_cast + 3*incx);

	// Fold alpha into each chi so the inner loop is a pure multiply-add.
	PASTEMAC2(d,d,scals)( *alpha_cast, chi0 );
	PASTEMAC2(d,d,scals)( *alpha_cast, chi1 );
	PASTEMAC2(d,d,scals)( *alpha_cast, chi2 );
	PASTEMAC2(d,d,scals)( *alpha_cast, chi3 );

	// Scalar prologue: process one element to align a and y (see above).
	if ( m_pre == 1 )
	{
		a0c = *a0;
		a1c = *a1;
		a2c = *a2;
		a3c = *a3;

		*y0 += chi0 * a0c + chi1 * a1c + chi2 * a2c + chi3 * a3c;

		a0 += inca;
		a1 += inca;
		a2 += inca;
		a3 += inca;
		y0 += incy;
	}

	// Broadcast each scaled chi across both lanes of a vector register.
	chi0v.v = _mm_loaddup_pd( ( double* )&chi0 );
	chi1v.v = _mm_loaddup_pd( ( double* )&chi1 );
	chi2v.v = _mm_loaddup_pd( ( double* )&chi2 );
	chi3v.v = _mm_loaddup_pd( ( double* )&chi3 );

	// Main loop: four rows of y (2 vectors x 2 doubles) per iteration.
	for ( i = 0; i < m_run; ++i )
	{
		// First pair of rows.
		y0v.v = _mm_load_pd( ( double* )(y0 + 0*n_elem_per_reg) );

		a00v.v = _mm_load_pd( ( double* )(a0 + 0*n_elem_per_reg) );
		a01v.v = _mm_load_pd( ( double* )(a1 + 0*n_elem_per_reg) );

		y0v.v += chi0v.v * a00v.v;
		y0v.v += chi1v.v * a01v.v;

		a02v.v = _mm_load_pd( ( double* )(a2 + 0*n_elem_per_reg) );
		a03v.v = _mm_load_pd( ( double* )(a3 + 0*n_elem_per_reg) );

		y0v.v += chi2v.v * a02v.v;
		y0v.v += chi3v.v * a03v.v;

		_mm_store_pd( ( double* )(y0 + 0*n_elem_per_reg), y0v.v );

		// Second pair of rows (the 2x unroll).
		y1v.v = _mm_load_pd( ( double* )(y0 + 1*n_elem_per_reg) );

		a10v.v = _mm_load_pd( ( double* )(a0 + 1*n_elem_per_reg) );
		a11v.v = _mm_load_pd( ( double* )(a1 + 1*n_elem_per_reg) );

		y1v.v += chi0v.v * a10v.v;
		y1v.v += chi1v.v * a11v.v;

		a12v.v = _mm_load_pd( ( double* )(a2 + 1*n_elem_per_reg) );
		a13v.v = _mm_load_pd( ( double* )(a3 + 1*n_elem_per_reg) );

		y1v.v += chi2v.v * a12v.v;
		y1v.v += chi3v.v * a13v.v;

		_mm_store_pd( ( double* )(y0 + 1*n_elem_per_reg), y1v.v );

		a0 += n_elem_per_reg * n_iter_unroll;
		a1 += n_elem_per_reg * n_iter_unroll;
		a2 += n_elem_per_reg * n_iter_unroll;
		a3 += n_elem_per_reg * n_iter_unroll;
		y0 += n_elem_per_reg * n_iter_unroll;
	}

	// Scalar fringe: remaining rows that do not fill a full vector pass.
	if ( m_left > 0 )
	{
		for ( i = 0; i < m_left; ++i )
		{
			a0c = *a0;
			a1c = *a1;
			a2c = *a2;
			a3c = *a3;

			*y0 += chi0 * a0c + chi1 * a1c + chi2 * a2c + chi3 * a3c;

			a0 += inca;
			a1 += inca;
			a2 += inca;
			a3 += inca;
			y0 += incy;
		}
	}
}