//
// Run one dotxaxpyf experiment for a single problem size.
//
// params   - test suite parameters; n_repeats is read from it and it is
//            forwarded to the operand creation/randomization helpers.
// op       - operation descriptor; dim_spec[0] is read and dim_aux[0] is
//            overwritten with the fusing factor so the driver can print it.
// iface    - interface selector, forwarded to the implementation wrapper.
// datatype - datatype used for all scalars and operands.
// pc_str   - four parameter chars, in order: conjat, conja, conjw, conjx.
// sc_str   - five storage chars, in order: a, w, x, y (and y_save),
//            z (and z_save).
// p_cur    - current problem size, mapped to the dimension m.
// perf     - output: performance of the fastest repeat.
// resid    - output: residual computed by the checker.
//
void libblis_test_dotxaxpyf_experiment
     (
       test_params_t* params,
       test_op_t*     op,
       iface_t        iface,
       num_t          datatype,
       char*          pc_str,
       char*          sc_str,
       unsigned int   p_cur,
       double*        perf,
       double*        resid
     )
{
    const unsigned int n_trials  = params->n_repeats;
    unsigned int       trial;

    double             best_time = 1e9;
    double             t_start;
    double             flops;

    dim_t              m, b_n;
    int                y_is_real;

    conj_t             conjat, conja, conjw, conjx;

    obj_t              alpha, beta;
    obj_t              at, a, w, x, y, z;
    obj_t              y_save, z_save;

    cntx_t             cntx;

    // A context is needed to look up the fusing factor below.
    bli_dotxaxpyf_cntx_init( &cntx );

    // m comes from the problem size; b_n is the fusing factor (BLIS_XF)
    // registered in the context for this datatype.
    m   = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
    b_n = bli_cntx_get_blksz_def_dt( datatype, BLIS_XF, &cntx );

    // Publish the fusing factor so the driver can report it with the
    // results.
    op->dim_aux[0] = b_n;

    // Translate the conjugation parameter characters.
    bli_param_map_char_to_blis_conj( pc_str[0], &conjat );
    bli_param_map_char_to_blis_conj( pc_str[1], &conja );
    bli_param_map_char_to_blis_conj( pc_str[2], &conjw );
    bli_param_map_char_to_blis_conj( pc_str[3], &conjx );

    // Scalars.
    bli_obj_scalar_init_detached( datatype, &alpha );
    bli_obj_scalar_init_detached( datatype, &beta );

    // Operands: A is m x b_n; w and z have length m; x and y have
    // length b_n. y_save/z_save mirror y/z so that the outputs can be
    // restored before every repeat.
    libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
                              sc_str[0], m, b_n, &a );
    libblis_test_vobj_create( params, datatype, sc_str[1], m,   &w );
    libblis_test_vobj_create( params, datatype, sc_str[2], b_n, &x );
    libblis_test_vobj_create( params, datatype, sc_str[3], b_n, &y );
    libblis_test_vobj_create( params, datatype, sc_str[3], b_n, &y_save );
    libblis_test_vobj_create( params, datatype, sc_str[4], m,   &z );
    libblis_test_vobj_create( params, datatype, sc_str[4], m,   &z_save );

    // Set alpha and beta. The imaginary parts are nonzero only when the
    // operands are not real.
    y_is_real = bli_obj_is_real( y );
    bli_setsc(  1.2, ( y_is_real ? 0.0 :  0.1 ), &alpha );
    bli_setsc( -1.0, ( y_is_real ? 0.0 : -0.1 ), &beta );

    // Fill A, w, x, y and z with random values (in this order), then
    // snapshot the two output vectors.
    libblis_test_mobj_randomize( params, FALSE, &a );
    libblis_test_vobj_randomize( params, FALSE, &w );
    libblis_test_vobj_randomize( params, FALSE, &x );
    libblis_test_vobj_randomize( params, FALSE, &y );
    libblis_test_vobj_randomize( params, FALSE, &z );
    bli_copyv( &y, &y_save );
    bli_copyv( &z, &z_save );

    // 'at' is merely an alias of 'a'. It must NOT be marked for
    // transposition: the transposition is implied by the dotxf
    // subproblem itself.
    bli_obj_alias_to( a, at );

    // Attach the conjugation parameters to the operands.
    bli_obj_set_conj( conjat, at );
    bli_obj_set_conj( conja,  a );
    bli_obj_set_conj( conjw,  w );
    bli_obj_set_conj( conjx,  x );

    // Time the operation n_trials times, restoring y and z before each
    // run, and remember the fastest run.
    for ( trial = 0; trial < n_trials; ++trial )
    {
        bli_copyv( &y_save, &y );
        bli_copyv( &z_save, &z );

        t_start = bli_clock();

        libblis_test_dotxaxpyf_impl( iface,
                                     &alpha, &at, &a, &w, &x, &beta, &y, &z,
                                     &cntx );

        best_time = bli_clock_min_diff( best_time, t_start );
    }

    // Performance estimate for the fastest run: 2*m*b_n flops for each of
    // the two fused suboperations, scaled by 4 for complex datatypes.
    flops = 2.0 * m * b_n + 2.0 * m * b_n;
    *perf = flops / best_time / FLOPS_PER_UNIT_PERF;
    if ( bli_obj_is_complex( y ) ) *perf *= 4.0;

    // Numerical check of y and z against the saved inputs.
    libblis_test_dotxaxpyf_check( params,
                                  &alpha, &at, &a, &w, &x, &beta, &y, &z,
                                  &y_save, &z_save, resid );

    // An empty y or z means there was no real problem to solve; zero out
    // the reported performance and residual in that case.
    libblis_test_check_empty_problem( &y, perf, resid );
    libblis_test_check_empty_problem( &z, perf, resid );

    // Release the operands ('at' is an alias and owns nothing).
    bli_obj_free( &a );
    bli_obj_free( &w );
    bli_obj_free( &x );
    bli_obj_free( &y );
    bli_obj_free( &z );
    bli_obj_free( &y_save );
    bli_obj_free( &z_save );

    // Tear down the context.
    bli_dotxaxpyf_cntx_finalize( &cntx );
}
int main( int argc, char** argv ) { obj_t a, c; obj_t c_save; obj_t alpha; dim_t m, n; dim_t p; dim_t p_begin, p_max, p_inc; int m_input, n_input; ind_t ind; num_t dt; char dt_ch; int r, n_repeats; side_t side; uplo_t uploa; trans_t transa; diag_t diaga; f77_char f77_side; f77_char f77_uploa; f77_char f77_transa; f77_char f77_diaga; double dtime; double dtime_save; double gflops; //bli_init(); //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); n_repeats = 3; dt = DT; ind = IND; p_begin = P_BEGIN; p_max = P_MAX; p_inc = P_INC; m_input = -1; n_input = -1; // Supress compiler warnings about unused variable 'ind'. ( void )ind; #if 0 cntx_t* cntx; ind_t ind_mod = ind; // A hack to use 3m1 as 1mpb (with 1m as 1mbp). if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; // Initialize a context for the current induced method and datatype. cntx = bli_gks_query_ind_cntx( ind_mod, dt ); // Set k to the kc blocksize for the current datatype. k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); #elif 1 //k_input = 256; #endif // Choose the char corresponding to the requested datatype. if ( bli_is_float( dt ) ) dt_ch = 's'; else if ( bli_is_double( dt ) ) dt_ch = 'd'; else if ( bli_is_scomplex( dt ) ) dt_ch = 'c'; else dt_ch = 'z'; #if 0 side = BLIS_LEFT; #else side = BLIS_RIGHT; #endif #if 0 uploa = BLIS_LOWER; #else uploa = BLIS_UPPER; #endif transa = BLIS_NO_TRANSPOSE; diaga = BLIS_NONUNIT_DIAG; bli_param_map_blis_to_netlib_side( side, &f77_side ); bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa ); bli_param_map_blis_to_netlib_trans( transa, &f77_transa ); bli_param_map_blis_to_netlib_diag( diaga, &f77_diaga ); // Begin with initializing the last entry to zero so that // matlab allocates space for the entire array once up-front. 
for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ; printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR ); printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", ( unsigned long )(p - p_begin + 1)/p_inc + 1, ( unsigned long )0, ( unsigned long )0, 0.0 ); for ( p = p_begin; p <= p_max; p += p_inc ) { if ( m_input < 0 ) m = p / ( dim_t )abs(m_input); else m = ( dim_t ) m_input; if ( n_input < 0 ) n = p / ( dim_t )abs(n_input); else n = ( dim_t ) n_input; bli_obj_create( dt, 1, 1, 0, 0, &alpha ); if ( bli_is_left( side ) ) bli_obj_create( dt, m, m, 0, 0, &a ); else bli_obj_create( dt, n, n, 0, 0, &a ); bli_obj_create( dt, m, n, 0, 0, &c ); //bli_obj_create( dt, m, n, n, 1, &c ); bli_obj_create( dt, m, n, 0, 0, &c_save ); bli_randm( &a ); bli_randm( &c ); bli_obj_set_struc( BLIS_TRIANGULAR, &a ); bli_obj_set_uplo( uploa, &a ); bli_obj_set_conjtrans( transa, &a ); bli_obj_set_diag( diaga, &a ); bli_randm( &a ); bli_mktrim( &a ); // Load the diagonal of A to make it more likely to be invertible. bli_shiftd( &BLIS_TWO, &a ); bli_setsc( (2.0/1.0), 0.0, &alpha ); bli_copym( &c, &c_save ); #if 0 //def BLIS bli_ind_disable_all_dt( dt ); bli_ind_enable_dt( ind, dt ); #endif dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { bli_copym( &c_save, &c ); dtime = bli_clock(); #ifdef PRINT bli_printm( "a", &a, "%4.1f", "" ); bli_printm( "c", &c, "%4.1f", "" ); #endif #ifdef BLIS bli_trsm( side, &alpha, &a, &c ); #else if ( bli_is_float( dt ) ) { f77_int mm = bli_obj_length( &c ); f77_int kk = bli_obj_width( &c ); f77_int lda = bli_obj_col_stride( &a ); f77_int ldc = bli_obj_col_stride( &c ); float* alphap = ( float* )bli_obj_buffer( &alpha ); float* ap = ( float* )bli_obj_buffer( &a ); float* cp = ( float* )bli_obj_buffer( &c ); strsm_( &f77_side, &f77_uploa, &f77_transa, &f77_diaga, &mm, &kk, alphap, ap, &lda, cp, &ldc ); } else if ( bli_is_double( dt ) ) { f77_int mm = bli_obj_length( &c ); f77_int kk = bli_obj_width( &c ); f77_int lda = bli_obj_col_stride( &a ); f77_int ldc = 
bli_obj_col_stride( &c ); double* alphap = ( double* )bli_obj_buffer( &alpha ); double* ap = ( double* )bli_obj_buffer( &a ); double* cp = ( double* )bli_obj_buffer( &c ); dtrsm_( &f77_side, &f77_uploa, &f77_transa, &f77_diaga, &mm, &kk, alphap, ap, &lda, cp, &ldc ); } else if ( bli_is_scomplex( dt ) ) { f77_int mm = bli_obj_length( &c ); f77_int kk = bli_obj_width( &c ); f77_int lda = bli_obj_col_stride( &a ); f77_int ldc = bli_obj_col_stride( &c ); #ifdef EIGEN float* alphap = ( float* )bli_obj_buffer( &alpha ); float* ap = ( float* )bli_obj_buffer( &a ); float* cp = ( float* )bli_obj_buffer( &c ); #else scomplex* alphap = ( scomplex* )bli_obj_buffer( &alpha ); scomplex* ap = ( scomplex* )bli_obj_buffer( &a ); scomplex* cp = ( scomplex* )bli_obj_buffer( &c ); #endif ctrsm_( &f77_side, &f77_uploa, &f77_transa, &f77_diaga, &mm, &kk, alphap, ap, &lda, cp, &ldc ); } else if ( bli_is_dcomplex( dt ) ) { f77_int mm = bli_obj_length( &c ); f77_int kk = bli_obj_width( &c ); f77_int lda = bli_obj_col_stride( &a ); f77_int ldc = bli_obj_col_stride( &c ); #ifdef EIGEN double* alphap = ( double* )bli_obj_buffer( &alpha ); double* ap = ( double* )bli_obj_buffer( &a ); double* cp = ( double* )bli_obj_buffer( &c ); #else dcomplex* alphap = ( dcomplex* )bli_obj_buffer( &alpha ); dcomplex* ap = ( dcomplex* )bli_obj_buffer( &a ); dcomplex* cp = ( dcomplex* )bli_obj_buffer( &c ); #endif ztrsm_( &f77_side, &f77_uploa, &f77_transa, &f77_diaga, &mm, &kk, alphap, ap, &lda, cp, &ldc ); } #endif #ifdef PRINT bli_printm( "c after", &c, "%4.1f", "" ); exit(1); #endif dtime_save = bli_clock_min_diff( dtime_save, dtime ); } if ( bli_is_left( side ) ) gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 ); else gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 ); if ( bli_is_complex( dt ) ) gflops *= 4.0; printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR ); printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", ( unsigned long )(p - p_begin + 1)/p_inc + 1, ( unsigned long )m, ( unsigned long )n, 
gflops ); bli_obj_free( &alpha ); bli_obj_free( &a ); bli_obj_free( &c ); bli_obj_free( &c_save ); } //bli_finalize(); return 0; }
err_t bli_gemmsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ) { // Check parameters. if ( bli_error_checking_is_enabled() ) bli_gemm_check( alpha, a, b, beta, c, cntx ); #if 0 // FGVZ: The datatype-specific variant is now responsible for checking for // alpha == 0.0. // If alpha is zero, scale by beta and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); return BLIS_SUCCESS; } #endif #if 0 // FGVZ: Will this be needed for constructing thrinfo_t's (recall: the // sba needs to be attached to the rntm; see below)? Or will those nodes // just be created "locally," in an exposed manner? // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. bli_rntm_set_ways_for_op ( BLIS_GEMM, BLIS_LEFT, // ignored for gemm/hemm/symm bli_obj_length( &c_local ), bli_obj_width( &c_local ), bli_obj_width( &a_local ), rntm ); // FGVZ: the sba needs to be attached to the rntm. But it needs // to be done in the thread region, since it needs a thread id. //bli_sba_rntm_set_pool( tid, array, rntm_p ); #endif #if 0 // FGVZ: The datatype-specific variant is now responsible for inducing a // transposition, if needed. // Induce transpositions on A and/or B if either object is marked for // transposition. We can induce "fast" transpositions since they objects // are guaranteed to not have structure or be packed. 
if ( bli_obj_has_trans( a ) ) { bli_obj_induce_fast_trans( a ); bli_obj_toggle_trans( a ); } if ( bli_obj_has_trans( b ) ) { bli_obj_induce_fast_trans( b ); bli_obj_toggle_trans( b ); } #endif #if 0 //bli_gemmsup_ref_var2 //bli_gemmsup_ref_var1 #if 0 bli_gemmsup_ref_var1n #else #endif const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b ); const bool_t is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR || stor_id == BLIS_RRC || stor_id == BLIS_RCR || stor_id == BLIS_CRR ); if ( is_rrr_rrc_rcr_crr ) { bli_gemmsup_ref_var2m ( BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm ); } else { bli_gemmsup_ref_var2m ( BLIS_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm ); } #else const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b ); // Don't use the small/unpacked implementation if one of the matrices // uses general stride. if ( stor_id == BLIS_XXX ) return BLIS_FAILURE; const bool_t is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR || stor_id == BLIS_RRC || stor_id == BLIS_RCR || stor_id == BLIS_CRR ); const bool_t is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr; const num_t dt = bli_obj_dt( c ); const bool_t row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); const bool_t is_primary = ( row_pref ? 
is_rrr_rrc_rcr_crr : is_rcc_crc_ccr_ccc ); if ( is_primary ) { // This branch handles: // - rrr rrc rcr crr for row-preferential kernels // - rcc crc ccr ccc for column-preferential kernels const dim_t m = bli_obj_length( c ); const dim_t n = bli_obj_width( c ); const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t mu = m / MR; const dim_t nu = n / NR; if ( mu >= nu ) { // block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2() bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm ); } else // if ( mu < nu ) { // panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1() bli_gemmsup_ref_var1n( BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm ); } } else { // This branch handles: // - rrr rrc rcr crr for column-preferential kernels // - rcc crc ccr ccc for row-preferential kernels const dim_t mt = bli_obj_width( c ); const dim_t nt = bli_obj_length( c ); const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t mu = mt / MR; const dim_t nu = nt / NR; if ( mu >= nu ) { // panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans bli_gemmsup_ref_var2m( BLIS_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm ); } else // if ( mu < nu ) { // block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans bli_gemmsup_ref_var1n( BLIS_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm ); } // *requires nudging of mc,nc up to be a multiple of nr,mr. } #endif // Return success so that the caller knows that we computed the solution. return BLIS_SUCCESS; }
siz_t bli_packv_init_pack ( pack_t schema, bszid_t bmult_id, obj_t* a, obj_t* p, cntx_t* cntx ) { num_t dt = bli_obj_dt( a ); dim_t dim_a = bli_obj_vector_dim( a ); dim_t bmult = bli_cntx_get_blksz_def_dt( dt, bmult_id, cntx ); membrk_t* membrk = bli_cntx_membrk( cntx ); #if 0 mem_t* mem_p; #endif dim_t m_p_pad; siz_t size_p; inc_t rs_p, cs_p; void* buf; // We begin by copying the basic fields of c. bli_obj_alias_to( a, p ); // Update the dimensions. bli_obj_set_dims( dim_a, 1, p ); // Reset the view offsets to (0,0). bli_obj_set_offs( 0, 0, p ); // Set the pack schema in the p object to the value in the control tree // node. bli_obj_set_pack_schema( schema, p ); // Compute the dimensions padded by the dimension multiples. m_p_pad = bli_align_dim_to_mult( bli_obj_vector_dim( p ), bmult ); // Compute the size of the packed buffer. size_p = m_p_pad * 1 * bli_obj_elem_size( p ); #if 0 // Extract the address of the mem_t object within p that will track // properties of the packed buffer. mem_p = bli_obj_pack_mem( *p ); if ( bli_mem_is_unalloc( mem_p ) ) { // If the mem_t object of p has not yet been allocated, then acquire // a memory block suitable for a vector. bli_membrk_acquire_v( membrk, size_p, mem_p ); } else { // If the mem_t object has already been allocated, then release and // re-acquire the memory so there is sufficient space. if ( bli_mem_size( mem_p ) < size_p ) { bli_membrk_release( mem_p ); bli_membrk_acquire_v( membrk, size_p, mem_p ); } } // Grab the buffer address from the mem_t object and copy it to the // main object buffer field. (Sometimes this buffer address will be // copied when the value is already up-to-date, because it persists // in the main object buffer field across loop iterations.) buf = bli_mem_buffer( mem_p ); bli_obj_set_buffer( buf, p ); #endif // Save the padded (packed) dimensions into the packed object. bli_obj_set_padded_dims( m_p_pad, 1, p ); // Set the row and column strides of p based on the pack schema. 
if ( schema == BLIS_PACKED_VECTOR ) { // Set the strides to reflect a column-stored vector. Note that the // column stride may never be used, and is only useful to determine // how much space beyond the vector would need to be zero-padded, if // zero-padding was needed. rs_p = 1; cs_p = bli_obj_padded_length( p ); bli_obj_set_strides( rs_p, cs_p, p ); } return size_p; }
void bli_daxpyf_int_var1 ( conj_t conja, conj_t conjx, dim_t m, dim_t b_n, double* alpha, double* a, inc_t inca, inc_t lda, double* x, inc_t incx, double* y, inc_t incy, cntx_t* cntx ) { double* restrict alpha_cast = alpha; double* restrict a_cast = a; double* restrict x_cast = x; double* restrict y_cast = y; dim_t i; const dim_t n_elem_per_reg = 2; const dim_t n_iter_unroll = 2; dim_t m_pre; dim_t m_run; dim_t m_left; double* restrict a0; double* restrict a1; double* restrict a2; double* restrict a3; double* restrict y0; double a0c, a1c, a2c, a3c; double chi0, chi1, chi2, chi3; v2df_t a00v, a01v, a02v, a03v, y0v; v2df_t a10v, a11v, a12v, a13v, y1v; v2df_t chi0v, chi1v, chi2v, chi3v; bool_t use_ref = FALSE; if ( bli_zero_dim2( m, b_n ) ) return; m_pre = 0; // If there is anything that would interfere with our use of aligned // vector loads/stores, call the reference implementation. if ( b_n < bli_cntx_get_blksz_def_dt( BLIS_DOUBLE, BLIS_AF, cntx ) ) { use_ref = TRUE; } else if ( inca != 1 || incx != 1 || incy != 1 || bli_is_unaligned_to( lda*sizeof(double), 16 ) ) { use_ref = TRUE; } else if ( bli_is_unaligned_to( a, 16 ) || bli_is_unaligned_to( y, 16 ) ) { use_ref = TRUE; if ( bli_is_unaligned_to( a, 16 ) && bli_is_unaligned_to( y, 16 ) ) { use_ref = FALSE; m_pre = 1; } } // Call the reference implementation if needed. 
if ( use_ref == TRUE ) { BLIS_DAXPYF_KERNEL_REF( conja, conjx, m, b_n, alpha_cast, a_cast, inca, lda, x_cast, incx, y_cast, incy, cntx ); return; } m_run = ( m - m_pre ) / ( n_elem_per_reg * n_iter_unroll ); m_left = ( m - m_pre ) % ( n_elem_per_reg * n_iter_unroll ); a0 = a_cast + 0*lda; a1 = a_cast + 1*lda; a2 = a_cast + 2*lda; a3 = a_cast + 3*lda; y0 = y_cast; chi0 = *(x_cast + 0*incx); chi1 = *(x_cast + 1*incx); chi2 = *(x_cast + 2*incx); chi3 = *(x_cast + 3*incx); PASTEMAC2(d,d,scals)( *alpha_cast, chi0 ); PASTEMAC2(d,d,scals)( *alpha_cast, chi1 ); PASTEMAC2(d,d,scals)( *alpha_cast, chi2 ); PASTEMAC2(d,d,scals)( *alpha_cast, chi3 ); if ( m_pre == 1 ) { a0c = *a0; a1c = *a1; a2c = *a2; a3c = *a3; *y0 += chi0 * a0c + chi1 * a1c + chi2 * a2c + chi3 * a3c; a0 += inca; a1 += inca; a2 += inca; a3 += inca; y0 += incy; } chi0v.v = _mm_loaddup_pd( ( double* )&chi0 ); chi1v.v = _mm_loaddup_pd( ( double* )&chi1 ); chi2v.v = _mm_loaddup_pd( ( double* )&chi2 ); chi3v.v = _mm_loaddup_pd( ( double* )&chi3 ); for ( i = 0; i < m_run; ++i ) { y0v.v = _mm_load_pd( ( double* )(y0 + 0*n_elem_per_reg) ); a00v.v = _mm_load_pd( ( double* )(a0 + 0*n_elem_per_reg) ); a01v.v = _mm_load_pd( ( double* )(a1 + 0*n_elem_per_reg) ); y0v.v += chi0v.v * a00v.v; y0v.v += chi1v.v * a01v.v; a02v.v = _mm_load_pd( ( double* )(a2 + 0*n_elem_per_reg) ); a03v.v = _mm_load_pd( ( double* )(a3 + 0*n_elem_per_reg) ); y0v.v += chi2v.v * a02v.v; y0v.v += chi3v.v * a03v.v; _mm_store_pd( ( double* )(y0 + 0*n_elem_per_reg), y0v.v ); y1v.v = _mm_load_pd( ( double* )(y0 + 1*n_elem_per_reg) ); a10v.v = _mm_load_pd( ( double* )(a0 + 1*n_elem_per_reg) ); a11v.v = _mm_load_pd( ( double* )(a1 + 1*n_elem_per_reg) ); y1v.v += chi0v.v * a10v.v; y1v.v += chi1v.v * a11v.v; a12v.v = _mm_load_pd( ( double* )(a2 + 1*n_elem_per_reg) ); a13v.v = _mm_load_pd( ( double* )(a3 + 1*n_elem_per_reg) ); y1v.v += chi2v.v * a12v.v; y1v.v += chi3v.v * a13v.v; _mm_store_pd( ( double* )(y0 + 1*n_elem_per_reg), y1v.v ); a0 += n_elem_per_reg 
* n_iter_unroll; a1 += n_elem_per_reg * n_iter_unroll; a2 += n_elem_per_reg * n_iter_unroll; a3 += n_elem_per_reg * n_iter_unroll; y0 += n_elem_per_reg * n_iter_unroll; } if ( m_left > 0 ) { for ( i = 0; i < m_left; ++i ) { a0c = *a0; a1c = *a1; a2c = *a2; a3c = *a3; *y0 += chi0 * a0c + chi1 * a1c + chi2 * a2c + chi3 * a3c; a0 += inca; a1 += inca; a2 += inca; a3 += inca; y0 += incy; } } }
C11 := beta * C11 + alpha * A1 * B1 where A1 is MR x k, B1 is k x NR, C11 is MR x NR, and alpha and beta are scalars. For more info, please refer to the BLIS website's wiki on kernels: https://github.com/flame/blis/wiki/KernelsHowTo and/or contact the blis-devel mailing list. -FGVZ */ const num_t dt = BLIS_DCOMPLEX; const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); const inc_t cs_a = packmr; const inc_t rs_b = packnr; const inc_t rs_ab = 1; const inc_t cs_ab = mr; dim_t l, j, i; dcomplex ab[ bli_zmr * bli_znr ];