void bli_cconjmr( uplo_t uplo, int m, int n, scomplex* a, int a_rs, int a_cs ) { float m1 = bli_sm1(); float* a_conj; int lda, inca; int n_iter; int n_elem_max; int n_elem; int j; // Return early if possible. if ( bli_zero_dim2( m, n ) ) return; // We initialize for column-major. n_iter = n; n_elem_max = m; lda = a_cs; inca = a_rs; // An optimization: if A is row-major, then let's access the matrix // by rows instead of by columns to increase spatial locality. if ( bli_is_row_storage( a_rs, a_cs ) ) { bli_swap_ints( n_iter, n_elem_max ); bli_swap_ints( lda, inca ); bli_toggle_uplo( uplo ); } if ( bli_is_upper( uplo ) ) { for ( j = 0; j < n_iter; ++j ) { n_elem = bli_min( j + 1, n_elem_max ); a_conj = ( float* )( a + j*lda ) + 1; bli_sscal( n_elem, &m1, a_conj, 2*inca ); } } else // if ( bli_is_lower( uplo ) ) { for ( j = 0; j < n_iter; ++j ) { n_elem = bli_max( 0, n_elem_max - j ); a_conj = ( float* )( a + j*lda + j*inca ) + 1; if ( n_elem <= 0 ) break; bli_sscal( n_elem, &m1, a_conj, 2*inca ); } } }
void bli_dsetmr( uplo_t uplo, int m, int n, double* sigma, double* a, int a_rs, int a_cs ) { double* a_begin; int lda, inca; int n_iter; int n_elem_max; int n_elem; int j; // Return early if possible. if ( bli_zero_dim2( m, n ) ) return; // Initialize with optimal values for column-major storage. n_iter = n; n_elem_max = m; lda = a_cs; inca = a_rs; // An optimization: if A is row-major, then let's access the matrix by // rows instead of by columns to increase spatial locality. if ( bli_is_row_storage( a_rs, a_cs ) ) { bli_swap_ints( n_iter, n_elem_max ); bli_swap_ints( lda, inca ); bli_toggle_uplo( uplo ); } if ( bli_is_upper( uplo ) ) { for ( j = 0; j < n_iter; j++ ) { n_elem = bli_min( j, n_elem_max ); a_begin = a + j*lda; bli_dsetv( n_elem, sigma, a_begin, inca ); } } else // if ( bli_is_lower( uplo ) ) { for ( j = 0; j < n_iter; j++ ) { n_elem = bli_max( 0, n_elem_max - j - 1 ); a_begin = a + j*lda + (j + 1)*inca; bli_dsetv( n_elem, sigma, a_begin, inca ); } } }
// Code for work assignments void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end ) { thrinfo_t* thread = (thrinfo_t*) thr; dim_t n_way = thread->n_way; dim_t work_id = thread->work_id; dim_t size = all_end - all_start; dim_t n_pt = size / n_way; n_pt = (n_pt * n_way < size) ? n_pt + 1 : n_pt; n_pt = (n_pt % block_factor == 0) ? n_pt : n_pt + block_factor - (n_pt % block_factor); *start = work_id * n_pt + all_start; *end = bli_min( *start + n_pt, size + all_start ); }
void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end) { thrinfo_t* thread = (thrinfo_t*) thr; dim_t n_way = thread->n_way; dim_t work_id = thread->work_id; dim_t size = all_end - all_start; *start = 0; *end = all_end - all_start; double num = size*size / (double) n_way; if( forward ) { dim_t curr_caucus = n_way - 1; dim_t len = 0; while(1){ dim_t width = ceil(sqrt( len*len + num )) - len; // The width of the current caucus width = (width % block_factor == 0) ? width : width + block_factor - (width % block_factor); if( curr_caucus == work_id ) { *start = bli_max( 0 , *end - width ) + all_start; *end = *end + all_start; return; } else{ *end -= width; len += width; curr_caucus--; } } } else{ while(1){ dim_t width = ceil(sqrt(*start * *start + num)) - *start; width = (width % block_factor == 0) ? width : width + block_factor - (width % block_factor); if( work_id == 0 ) { *start = *start + all_start; *end = bli_min( *start + width, all_end ); return; } else{ *start = *start + width; } work_id--; } } }
void libblis_test_gemv_check( obj_t* kappa, obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y, obj_t* y_orig, double* resid ) { num_t dt = bli_obj_datatype( *y ); num_t dt_real = bli_obj_datatype_proj_to_real( *y ); conj_t conja = bli_obj_conj_status( *a ); dim_t n_x = bli_obj_vector_dim( *x ); dim_t m_y = bli_obj_vector_dim( *y ); dim_t min_m_n = bli_min( m_y, n_x ); obj_t x_temp, y_temp; obj_t kappac, norm; obj_t xT_temp, yT_temp, yT; double junk; // // Pre-conditions: // - a is initialized to kappa along the diagonal. // - x is randomized. // - y_orig is randomized. // Note: // - alpha, beta, and kappa should have non-zero imaginary components in // the complex cases in order to more fully exercise the implementation. // // Under these conditions, we assume that the implementation for // // y := beta * y_orig + alpha * transa(A) * conjx(x) // // is functioning correctly if // // normf( y - z ) // // is negligible, where // // z = beta * y_orig + alpha * conja(kappa) * x // bli_obj_scalar_init_detached_copy_of( dt, conja, kappa, &kappac ); bli_obj_scalar_init_detached( dt_real, &norm ); bli_obj_create( dt, n_x, 1, 0, 0, &x_temp ); bli_obj_create( dt, m_y, 1, 0, 0, &y_temp ); bli_copyv( x, &x_temp ); bli_copyv( y_orig, &y_temp ); bli_acquire_vpart_f2b( BLIS_SUBPART1, 0, min_m_n, &x_temp, &xT_temp ); bli_acquire_vpart_f2b( BLIS_SUBPART1, 0, min_m_n, &y_temp, &yT_temp ); bli_acquire_vpart_f2b( BLIS_SUBPART1, 0, min_m_n, y, &yT ); bli_scalv( &kappac, &xT_temp ); bli_scalv( beta, &yT_temp ); bli_axpyv( alpha, &xT_temp, &yT_temp ); bli_subv( &yT_temp, &yT ); bli_normfv( &yT, &norm ); bli_getsc( &norm, resid, &junk ); bli_obj_free( &x_temp ); bli_obj_free( &y_temp ); }
void bli_gemm_blk_var2( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, gemm_t* cntl ) { obj_t a_pack_s; obj_t b1_pack_s; obj_t c1_pack_s; obj_t b1, c1; obj_t* a_pack = NULL; obj_t* b1_pack = NULL; obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; dim_t n_trans; dim_t num_groups = bli_gemm_num_thread_groups( cntl->thread_info ); dim_t group_id = bli_gemm_group_id( cntl->thread_info ); if( bli_gemm_am_a_master( cntl->thread_info ) ) { // Initialize object for packing A. bli_obj_init_pack( &a_pack_s ); bli_packm_init( a, &a_pack_s, cntl_sub_packm_a( cntl ) ); } a_pack = bli_gemm_broadcast_a( cntl->thread_info, &a_pack_s ); // Pack A and scale by alpha (if instructed). bli_packm_int( alpha, a, a_pack, cntl_sub_packm_a( cntl ) ); bli_gemm_a_barrier( cntl->thread_info ); if( bli_gemm_am_b_master( cntl->thread_info )) { bli_obj_init_pack( &b1_pack_s ); } b1_pack = bli_gemm_broadcast_b( cntl->thread_info, &b1_pack_s ); if( bli_gemm_am_c_master( cntl->thread_info )) { bli_obj_init_pack( &c1_pack_s ); // Scale C by beta (if instructed). bli_scalm_int( beta, c, cntl_sub_scalm( cntl ) ); } c1_pack = bli_gemm_broadcast_c( cntl->thread_info, &c1_pack_s ); // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); dim_t n_pt = n_trans / num_groups; n_pt = (n_pt * num_groups < n_trans) ? n_pt + 1 : n_pt; n_pt = (n_pt % 8 == 0) ? n_pt : n_pt + 8 - (n_pt % 8); dim_t start = group_id * n_pt; dim_t end = bli_min( start + n_pt, n_trans ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. // NOTE: Use of b (for execution datatype) is intentional! // This causes the right blocksize to be used if c and a are // complex and b is real. b_alg = bli_determine_blocksize_f( i, end, b, cntl_blocksize( cntl ) ); // Acquire partitions for C1 bli_acquire_mpart_l2r( BLIS_SUBPART1, i, b_alg, c, &c1 ); // Acquire partitions for B1 bli_acquire_mpart_l2r( BLIS_SUBPART1, i, b_alg, b, &b1 ); if( bli_gemm_am_b_master( cntl->thread_info )) { // Initialize objects for packing B1 bli_packm_init( &b1, &b1_pack_s, cntl_sub_packm_b( cntl ) ); } if( bli_gemm_am_c_master( cntl->thread_info )) { // Initialize objects for packing C1 bli_packm_init( &c1, &c1_pack_s, cntl_sub_packm_c( cntl ) ); } bli_gemm_b_barrier( cntl->thread_info ); bli_gemm_c_barrier( cntl->thread_info ); // Pack B1 and scale by alpha (if instructed). bli_packm_int( alpha, &b1, b1_pack, cntl_sub_packm_b( cntl ) ); // Pack C1 and scale by beta (if instructed). bli_packm_int( beta, &c1, c1_pack, cntl_sub_packm_c( cntl ) ); // Packing must be done before computation bli_gemm_b_barrier( cntl->thread_info ); bli_gemm_c_barrier( cntl->thread_info ); // Perform gemm subproblem. bli_gemm_int( alpha, a_pack, b1_pack, beta, c1_pack, cntl_sub_gemm( cntl ) ); // Unpack C1 (if C1 was packed). bli_unpackm_int( c1_pack, &c1, cntl_sub_unpackm_c( cntl ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. bli_gemm_a_barrier( cntl->thread_info ); if( bli_gemm_am_a_master( cntl->thread_info )) bli_obj_release_pack( &a_pack_s ); bli_gemm_b_barrier( cntl->thread_info ); if( bli_gemm_am_b_master( cntl->thread_info )) { bli_obj_release_pack( &b1_pack_s ); } bli_gemm_c_barrier( cntl->thread_info ); if( bli_gemm_am_c_master( cntl->thread_info )) { bli_obj_release_pack( &c1_pack_s ); } }