void bli_trsm_blk_var3b( obj_t* a, obj_t* b, obj_t* c, trsm_t* cntl, trsm_thrinfo_t* thread ) { obj_t c_pack_s; obj_t a1_pack_s, b1_pack_s; obj_t a1, b1; obj_t* a1_pack = NULL; obj_t* b1_pack = NULL; obj_t* c_pack = NULL; dim_t i; dim_t b_alg; dim_t k_trans; // Prune any zero region that exists along the partitioning dimension. bli_trsm_prune_unref_mparts_k( a, b, c ); // Initialize pack objects for C that are passed into packm_init(). if( thread_am_ochief( thread ) ) { bli_obj_init_pack( &c_pack_s ); // Initialize object for packing C. bli_packm_init( c, &c_pack_s, cntl_sub_packm_c( cntl ) ); // Scale C by beta (if instructed). bli_scalm_int( &BLIS_ONE, c, cntl_sub_scalm( cntl ) ); } c_pack = thread_obroadcast( thread, &c_pack_s ); if( thread_am_ichief( thread ) ) { bli_obj_init_pack( &a1_pack_s ); bli_obj_init_pack( &b1_pack_s ); } a1_pack = thread_ibroadcast( thread, &a1_pack_s ); b1_pack = thread_ibroadcast( thread, &b1_pack_s ); // Pack C (if instructed). bli_packm_int( c, c_pack, cntl_sub_packm_c( cntl ), trsm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) { // Determine the current algorithmic blocksize. // NOTE: We call a trsm-specific function to determine the kc // blocksize so that we can implement the "nudging" of kc to be // a multiple of mr, as needed. b_alg = bli_trsm_determine_kc_b( i, k_trans, b, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and B1. bli_acquire_mpart_r2l( BLIS_SUBPART1, i, b_alg, a, &a1 ); bli_acquire_mpart_b2t( BLIS_SUBPART1, i, b_alg, b, &b1 ); // Initialize objects for packing A1 and B1. if( thread_am_ichief( thread ) ) { bli_packm_init( &a1, a1_pack, cntl_sub_packm_a( cntl ) ); bli_packm_init( &b1, b1_pack, cntl_sub_packm_b( cntl ) ); } thread_ibarrier( thread ); // Pack A1 (if instructed). bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), trsm_thread_sub_ipackm( thread ) ); // Pack B1 (if instructed). bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), trsm_thread_sub_ipackm( thread ) ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, a1_pack, b1_pack, &BLIS_ONE, c_pack, cntl_sub_trsm( cntl ), trsm_thread_sub_trsm( thread ) ); // This variant executes multiple rank-k updates. Therefore, if the // internal alpha scalars on A/B and C are non-zero, we must ensure // that they are only used in the first iteration. thread_ibarrier( thread ); if ( i == 0 && thread_am_ichief( thread ) ) { bli_obj_scalar_reset( a ); bli_obj_scalar_reset( b ); bli_obj_scalar_reset( c_pack ); } } thread_obarrier( thread ); // Unpack C (if C was packed). bli_unpackm_int( c_pack, c, cntl_sub_unpackm_c( cntl ), trsm_thread_sub_opackm( thread ) ); // If any packing buffers were acquired within packm, release them back // to the memory manager. if( thread_am_ochief( thread ) ) { bli_packm_release( c_pack, cntl_sub_packm_c( cntl ) ); } if( thread_am_ichief( thread ) ) { bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) ); bli_packm_release( b1_pack, cntl_sub_packm_b( cntl ) ); } }
void bli_gemm_blk_var1f( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, gemm_t* cntl, gemm_thrinfo_t* thread ) { //The s is for "lives on the stack" obj_t b_pack_s; obj_t a1_pack_s, c1_pack_s; obj_t a1, c1; obj_t* a1_pack = NULL; obj_t* b_pack = NULL; obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; if( thread_am_ochief( thread ) ) { // Initialize object for packing B. bli_obj_init_pack( &b_pack_s ); bli_packm_init( b, &b_pack_s, cntx, cntl_sub_packm_b( cntl ) ); // Scale C by beta (if instructed). // Since scalm doesn't support multithreading yet, must be done by chief thread (ew) bli_scalm_int( &BLIS_ONE, c, cntx, cntl_sub_scalm( cntl ) ); } b_pack = thread_obroadcast( thread, &b_pack_s ); // Initialize objects passed into bli_packm_init for A and C if( thread_am_ichief( thread ) ) { bli_obj_init_pack( &a1_pack_s ); bli_obj_init_pack( &c1_pack_s ); } a1_pack = thread_ibroadcast( thread, &a1_pack_s ); c1_pack = thread_ibroadcast( thread, &c1_pack_s ); // Pack B (if instructed). bli_packm_int( b, b_pack, cntx, cntl_sub_packm_b( cntl ), gemm_thread_sub_opackm( thread ) ); dim_t my_start, my_end; bli_get_range_t2b( thread, a, bli_cntx_get_bmult( cntl_bszid( cntl ), cntx ), &my_start, &my_end ); // Partition along the m dimension. for ( i = my_start; i < my_end; i += b_alg ) { // Determine the current algorithmic blocksize. // NOTE: Use of a (for execution datatype) is intentional! // This causes the right blocksize to be used if c and a are // complex and b is real. b_alg = bli_determine_blocksize_f( i, my_end, a, cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and C1. bli_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, a, &a1 ); bli_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and C1. if( thread_am_ichief( thread ) ) { bli_packm_init( &a1, a1_pack, cntx, cntl_sub_packm_a( cntl ) ); bli_packm_init( &c1, c1_pack, cntx, cntl_sub_packm_c( cntl ) ); } thread_ibarrier( thread ); // Pack A1 (if instructed). bli_packm_int( &a1, a1_pack, cntx, cntl_sub_packm_a( cntl ), gemm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). bli_packm_int( &c1, c1_pack, cntx, cntl_sub_packm_c( cntl ), gemm_thread_sub_ipackm( thread ) ); // Perform gemm subproblem. bli_gemm_int( &BLIS_ONE, a1_pack, b_pack, &BLIS_ONE, c1_pack, cntx, cntl_sub_gemm( cntl ), gemm_thread_sub_gemm( thread ) ); thread_ibarrier( thread ); // Unpack C1 (if C1 was packed). // Currently must be done by 1 thread bli_unpackm_int( c1_pack, &c1, cntx, cntl_sub_unpackm_c( cntl ), gemm_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. thread_obarrier( thread ); if( thread_am_ochief( thread ) ) bli_packm_release( b_pack, cntl_sub_packm_b( cntl ) ); if( thread_am_ichief( thread ) ){ bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) ); bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) ); } }
void bli_trsm_blk_var1f( obj_t* a, obj_t* b, obj_t* c, trsm_t* cntl, trsm_thrinfo_t* thread ) { obj_t b_pack_s; obj_t a1_pack_s; obj_t a1, c1; obj_t* b_pack = NULL; obj_t* a1_pack = NULL; dim_t i; dim_t b_alg; dim_t m_trans; dim_t offA; // Initialize object for packing B. if( thread_am_ochief( thread ) ) { bli_obj_init_pack( &b_pack_s ); bli_packm_init( b, &b_pack_s, cntl_sub_packm_b( cntl ) ); } b_pack = thread_obroadcast( thread, &b_pack_s ); // Initialize object for packing B. if( thread_am_ichief( thread ) ) { bli_obj_init_pack( &a1_pack_s ); } a1_pack = thread_ibroadcast( thread, &a1_pack_s ); // Pack B1 (if instructed). bli_packm_int( b, b_pack, cntl_sub_packm_b( cntl ), trsm_thread_sub_opackm( thread ) ); // Set the default length of and offset to the non-zero part of A. m_trans = bli_obj_length_after_trans( *a ); offA = 0; // If A is lower triangular, we have to adjust where the non-zero part of // A begins. if ( bli_obj_is_lower( *a ) ) offA = bli_abs( bli_obj_diag_offset_after_trans( *a ) ); dim_t start, end; num_t dt = bli_obj_execution_datatype( *a ); bli_get_range_t2b( thread, offA, m_trans, //bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ), bli_info_get_default_mr( BLIS_TRSM, dt ) ), bli_info_get_default_mc( BLIS_TRSM, dt ), &start, &end ); // Partition along the remaining portion of the m dimension. for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( i, end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and C1. bli_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, a, &a1 ); bli_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, c, &c1 ); // Initialize object for packing A1. if( thread_am_ichief( thread ) ) { bli_packm_init( &a1, a1_pack, cntl_sub_packm_a( cntl ) ); } thread_ibarrier( thread ); // Pack A1 (if instructed). bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), trsm_thread_sub_ipackm( thread ) ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, a1_pack, b_pack, &BLIS_ONE, &c1, cntl_sub_trsm( cntl ), trsm_thread_sub_trsm( thread ) ); thread_ibarrier( thread ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. thread_obarrier( thread ); if( thread_am_ochief( thread ) ) bli_packm_release( b_pack, cntl_sub_packm_b( cntl ) ); if( thread_am_ichief( thread ) ) bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) ); }
void bli_trsm_blk_var2b( obj_t* a, obj_t* b, obj_t* c, trsm_t* cntl, trsm_thrinfo_t* thread ) { obj_t a_pack_s; obj_t b1_pack_s, c1_pack_s; obj_t b1, c1; obj_t* a_pack = NULL; obj_t* b1_pack = NULL; obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; dim_t n_trans; // Initialize pack objects for A that are passed into packm_init(). if( thread_am_ochief( thread ) ) { bli_obj_init_pack( &a_pack_s ); // Initialize object for packing A. bli_packm_init( a, &a_pack_s, cntl_sub_packm_a( cntl ) ); // Scale C by beta (if instructed). bli_scalm_int( &BLIS_ONE, c, cntl_sub_scalm( cntl ) ); } a_pack = thread_obroadcast( thread, &a_pack_s ); // Initialize pack objects for B and C that are passed into packm_init(). if( thread_am_ichief( thread ) ) { bli_obj_init_pack( &b1_pack_s ); bli_obj_init_pack( &c1_pack_s ); } b1_pack = thread_ibroadcast( thread, &b1_pack_s ); c1_pack = thread_ibroadcast( thread, &c1_pack_s ); // Pack A (if instructed). bli_packm_int( a, a_pack, cntl_sub_packm_a( cntl ), trmm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; num_t dt = bli_obj_execution_datatype( *a ); bli_get_range_r2l( thread, 0, n_trans, //bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ), // bli_info_get_default_mr( BLIS_TRSM, dt ) ), bli_lcm( bli_blksz_get_nr( dt, cntl_blocksize( cntl ) ), bli_blksz_get_mr( dt, cntl_blocksize( cntl ) ) ), &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_b( i, end, b, cntl_blocksize( cntl ) ); // Acquire partitions for B1 and C1. bli_acquire_mpart_r2l( BLIS_SUBPART1, i, b_alg, b, &b1 ); bli_acquire_mpart_r2l( BLIS_SUBPART1, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and B1. if( thread_am_ichief( thread ) ) { bli_packm_init( &b1, b1_pack, cntl_sub_packm_b( cntl ) ); bli_packm_init( &c1, c1_pack, cntl_sub_packm_c( cntl ) ); } thread_ibarrier( thread ); // Pack B1 (if instructed). bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), trsm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). bli_packm_int( &c1, c1_pack, cntl_sub_packm_c( cntl ), trsm_thread_sub_ipackm( thread ) ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, a_pack, b1_pack, &BLIS_ONE, c1_pack, cntl_sub_trsm( cntl ), trsm_thread_sub_trsm( thread ) ); thread_ibarrier( thread ); // Unpack C1 (if C1 was packed). bli_unpackm_int( c1_pack, &c1, cntl_sub_unpackm_c( cntl ), trsm_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. thread_obarrier( thread ); if( thread_am_ochief( thread ) ) bli_packm_release( a_pack, cntl_sub_packm_a( cntl ) ); if( thread_am_ichief( thread ) ) { bli_packm_release( b1_pack, cntl_sub_packm_b( cntl ) ); bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) ); } }
void bli_trsm_blk_var1b( obj_t* a, obj_t* b, obj_t* c, trsm_t* cntl, trsm_thrinfo_t* thread ) { obj_t b_pack_s; obj_t a1_pack_s; obj_t a1, c1; obj_t* b_pack = NULL; obj_t* a1_pack = NULL; dim_t i; dim_t b_alg; // Prune any zero region that exists along the partitioning dimension. bli_trsm_prune_unref_mparts_m( a, b, c ); // Initialize object for packing B. if( thread_am_ochief( thread ) ) { bli_obj_init_pack( &b_pack_s ); bli_packm_init( b, &b_pack_s, cntl_sub_packm_b( cntl ) ); } b_pack = thread_obroadcast( thread, &b_pack_s ); // Initialize object for packing B. if( thread_am_ichief( thread ) ) { bli_obj_init_pack( &a1_pack_s ); } a1_pack = thread_ibroadcast( thread, &a1_pack_s ); // Pack B1 (if instructed). bli_packm_int( b, b_pack, cntl_sub_packm_b( cntl ), trsm_thread_sub_opackm( thread ) ); dim_t my_start, my_end; num_t dt = bli_obj_execution_datatype( *a ); dim_t bf = ( bli_obj_root_is_triangular( *a ) ? bli_info_get_default_mr( BLIS_TRSM, dt ) : bli_info_get_default_nr( BLIS_TRSM, dt ) ); bli_get_range_b2t( thread, a, bf, &my_start, &my_end ); // Partition along the remaining portion of the m dimension. for ( i = my_start; i < my_end; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_b( i, my_end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and C1. bli_acquire_mpart_b2t( BLIS_SUBPART1, i, b_alg, a, &a1 ); bli_acquire_mpart_b2t( BLIS_SUBPART1, i, b_alg, c, &c1 ); // Initialize object for packing A1. if( thread_am_ichief( thread ) ) { bli_packm_init( &a1, a1_pack, cntl_sub_packm_a( cntl ) ); } thread_ibarrier( thread ); // Pack A1 (if instructed). bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), trsm_thread_sub_ipackm( thread ) ); // Perform trsm subproblem. bli_trsm_int( &BLIS_ONE, a1_pack, b_pack, &BLIS_ONE, &c1, cntl_sub_trsm( cntl ), trsm_thread_sub_trsm( thread ) ); thread_ibarrier( thread ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. thread_obarrier( thread ); if( thread_am_ochief( thread ) ) bli_packm_release( b_pack, cntl_sub_packm_b( cntl ) ); if( thread_am_ichief( thread ) ) bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) ); }
void bli_herk_blk_var3f( obj_t* a, obj_t* ah, obj_t* c, herk_t* cntl, herk_thrinfo_t* thread ) { obj_t c_pack_s; obj_t a1_pack_s, ah1_pack_s; obj_t a1, ah1; obj_t* a1_pack = NULL; obj_t* ah1_pack = NULL; obj_t* c_pack = NULL; dim_t i; dim_t b_alg; dim_t k_trans; if( thread_am_ochief( thread ) ) { // Initialize object for packing C. bli_obj_init_pack( &c_pack_s ); bli_packm_init( c, &c_pack_s, cntl_sub_packm_c( cntl ) ); // Scale C by beta (if instructed). bli_scalm_int( &BLIS_ONE, c, cntl_sub_scalm( cntl ) ); } c_pack = thread_obroadcast( thread, &c_pack_s ); // Initialize all pack objects that are passed into packm_init(). if( thread_am_ichief( thread ) ) { bli_obj_init_pack( &a1_pack_s ); bli_obj_init_pack( &ah1_pack_s ); } a1_pack = thread_ibroadcast( thread, &a1_pack_s ); ah1_pack = thread_ibroadcast( thread, &ah1_pack_s ); // Pack C (if instructed). bli_packm_int( c, c_pack, cntl_sub_packm_c( cntl ), herk_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( i, k_trans, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and A1'. bli_acquire_mpart_l2r( BLIS_SUBPART1, i, b_alg, a, &a1 ); bli_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, ah, &ah1 ); // Initialize objects for packing A1 and A1'. if( thread_am_ichief( thread ) ) { bli_packm_init( &a1, a1_pack, cntl_sub_packm_a( cntl ) ); bli_packm_init( &ah1, ah1_pack, cntl_sub_packm_b( cntl ) ); } thread_ibarrier( thread ); // Pack A1 (if instructed). bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), herk_thread_sub_ipackm( thread ) ); // Pack B1 (if instructed). bli_packm_int( &ah1, ah1_pack, cntl_sub_packm_b( cntl ), herk_thread_sub_ipackm( thread ) ); // Perform herk subproblem. bli_herk_int( &BLIS_ONE, a1_pack, ah1_pack, &BLIS_ONE, c_pack, cntl_sub_herk( cntl ), herk_thread_sub_herk( thread ) ); // This variant executes multiple rank-k updates. Therefore, if the // internal beta scalar on matrix C is non-zero, we must use it // only for the first iteration (and then BLIS_ONE for all others). // And since c_pack is a local obj_t, we can simply overwrite the // internal beta scalar with BLIS_ONE once it has been used in the // first iteration. if ( i == 0 ) thread_ibarrier( thread ); if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack ); } thread_obarrier( thread ); // Unpack C (if C was packed). bli_unpackm_int( c_pack, c, cntl_sub_unpackm_c( cntl ), herk_thread_sub_opackm( thread ) ); // If any packing buffers were acquired within packm, release them back // to the memory manager. if( thread_am_ochief( thread ) ) { bli_obj_release_pack( c_pack ); } if( thread_am_ichief( thread ) ) { bli_obj_release_pack( a1_pack ); bli_obj_release_pack( ah1_pack ); } }
void bli_herk_blk_var1f( obj_t* a, obj_t* ah, obj_t* c, gemm_t* cntl, herk_thrinfo_t* thread ) { obj_t ah_pack_s; obj_t a1_pack_s, c1_pack_s; obj_t a1, c1; obj_t* a1_pack; obj_t* c1_pack; obj_t* ah_pack; dim_t i; dim_t b_alg; // Prune any zero region that exists along the partitioning dimension. bli_herk_prune_unref_mparts_m( a, ah, c ); if( thread_am_ochief( thread ) ) { // Initialize object for packing A'. bli_obj_init_pack( &ah_pack_s ); bli_packm_init( ah, &ah_pack_s, cntl_sub_packm_b( cntl ) ); // Scale C by beta (if instructed). // Since scalm doesn't support multithreading yet, must be done by chief thread (ew) bli_scalm_int( &BLIS_ONE, c, cntl_sub_scalm( cntl ) ); } ah_pack = thread_obroadcast( thread, &ah_pack_s ); // Initialize pack objects that are passed into packm_init() for A and C. if( thread_am_ichief( thread ) ) { bli_obj_init_pack( &a1_pack_s ); bli_obj_init_pack( &c1_pack_s ); } a1_pack = thread_ibroadcast( thread, &a1_pack_s ); c1_pack = thread_ibroadcast( thread, &c1_pack_s ); // Pack A' (if instructed). bli_packm_int( ah, ah_pack, cntl_sub_packm_b( cntl ), herk_thread_sub_opackm( thread ) ); dim_t my_start, my_end; bli_get_range_weighted_t2b( thread, c, bli_blksz_get_mult_for_obj( a, cntl_blocksize( cntl ) ), &my_start, &my_end ); // Partition along the m dimension. for ( i = my_start; i < my_end; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( i, my_end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and C1. bli_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, a, &a1 ); bli_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and C1. if( thread_am_ichief( thread ) ) { bli_packm_init( &a1, a1_pack, cntl_sub_packm_a( cntl ) ); bli_packm_init( &c1, c1_pack, cntl_sub_packm_c( cntl ) ); } thread_ibarrier( thread ); // Pack A1 (if instructed). bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), herk_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). bli_packm_int( &c1, c1_pack, cntl_sub_packm_c( cntl ), herk_thread_sub_ipackm( thread ) ); // Perform herk subproblem. bli_herk_int( &BLIS_ONE, a1_pack, ah_pack, &BLIS_ONE, c1_pack, cntl_sub_gemm( cntl ), herk_thread_sub_herk( thread ) ); thread_ibarrier( thread ); // Unpack C1 (if C1 was packed). bli_unpackm_int( c1_pack, &c1, cntl_sub_unpackm_c( cntl ), herk_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. thread_obarrier( thread ); if( thread_am_ochief( thread ) ) bli_packm_release( ah_pack, cntl_sub_packm_b( cntl ) ); if( thread_am_ichief( thread ) ) { bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) ); bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) ); } }
void bli_trmm_blk_var1f( obj_t* a, obj_t* b, obj_t* c, gemm_t* cntl, trmm_thrinfo_t* thread ) { obj_t b_pack_s; obj_t a1_pack_s, c1_pack_s; obj_t a1, c1; obj_t* a1_pack = NULL; obj_t* b_pack = NULL; obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; // Prune any zero region that exists along the partitioning dimension. bli_trmm_prune_unref_mparts_m( a, b, c ); if( thread_am_ochief( thread ) ) { // Initialize object for packing B. bli_obj_init_pack( &b_pack_s ); bli_packm_init( b, &b_pack_s, cntl_sub_packm_b( cntl ) ); // Scale C by beta (if instructed). // Since scalm doesn't support multithreading yet, must be done by chief thread (ew) bli_scalm_int( &BLIS_ONE, c, cntl_sub_scalm( cntl ) ); } b_pack = thread_obroadcast( thread, &b_pack_s ); // Initialize all pack objects that are passed into packm_init(). if( thread_am_ichief( thread ) ) { bli_obj_init_pack( &a1_pack_s ); bli_obj_init_pack( &c1_pack_s ); } a1_pack = thread_ibroadcast( thread, &a1_pack_s ); c1_pack = thread_ibroadcast( thread, &c1_pack_s ); // Pack B (if instructed). bli_packm_int( b, b_pack, cntl_sub_packm_b( cntl ), trmm_thread_sub_opackm( thread ) ); // Set the default length of and offset to the non-zero part of A. //m_trans = bli_obj_length_after_trans( *a ); //offA = 0; // If A is lower triangular, we have to adjust where the non-zero part of // A begins. If A is upper triangular, we have to adjust the length of // the non-zero part. If A is general/dense, then we keep the defaults. //if ( bli_obj_is_lower( *a ) ) // offA = bli_abs( bli_obj_diag_offset_after_trans( *a ) ); //else if ( bli_obj_is_upper( *a ) ) // m_trans = bli_abs( bli_obj_diag_offset_after_trans( *a ) ) + // bli_obj_width_after_trans( *a ); dim_t my_start, my_end; bli_get_range_weighted_t2b( thread, a, bli_blksz_get_mult_for_obj( a, cntl_blocksize( cntl ) ), &my_start, &my_end ); // Partition along the m dimension. for ( i = my_start; i < my_end; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( i, my_end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and C1. bli_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, a, &a1 ); bli_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and C1. if( thread_am_ichief( thread ) ) { bli_packm_init( &a1, a1_pack, cntl_sub_packm_a( cntl ) ); bli_packm_init( &c1, c1_pack, cntl_sub_packm_c( cntl ) ); } thread_ibarrier( thread ); // Pack A1 (if instructed). bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), trmm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). bli_packm_int( &c1, c1_pack, cntl_sub_packm_c( cntl ), trmm_thread_sub_ipackm( thread ) ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, a1_pack, b_pack, &BLIS_ONE, c1_pack, cntl_sub_gemm( cntl ), trmm_thread_sub_trmm( thread ) ); thread_ibarrier( thread ); // Unpack C1 (if C1 was packed). bli_unpackm_int( c1_pack, &c1, cntl_sub_unpackm_c( cntl ), trmm_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. thread_obarrier( thread ); if( thread_am_ochief( thread ) ) bli_packm_release( b_pack, cntl_sub_packm_b( cntl ) ); if( thread_am_ichief( thread ) ){ bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) ); bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) ); } }
void bli_packm_blk_var1( obj_t* c, obj_t* p, packm_thrinfo_t* t ) { num_t dt_cp = bli_obj_datatype( *c ); struc_t strucc = bli_obj_struc( *c ); doff_t diagoffc = bli_obj_diag_offset( *c ); diag_t diagc = bli_obj_diag( *c ); uplo_t uploc = bli_obj_uplo( *c ); trans_t transc = bli_obj_conjtrans_status( *c ); pack_t schema = bli_obj_pack_schema( *p ); bool_t invdiag = bli_obj_has_inverted_diag( *p ); bool_t revifup = bli_obj_is_pack_rev_if_upper( *p ); bool_t reviflo = bli_obj_is_pack_rev_if_lower( *p ); dim_t m_p = bli_obj_length( *p ); dim_t n_p = bli_obj_width( *p ); dim_t m_max_p = bli_obj_padded_length( *p ); dim_t n_max_p = bli_obj_padded_width( *p ); void* buf_c = bli_obj_buffer_at_off( *c ); inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); void* buf_p = bli_obj_buffer_at_off( *p ); inc_t rs_p = bli_obj_row_stride( *p ); inc_t cs_p = bli_obj_col_stride( *p ); inc_t is_p = bli_obj_imag_stride( *p ); dim_t pd_p = bli_obj_panel_dim( *p ); inc_t ps_p = bli_obj_panel_stride( *p ); obj_t kappa; /*---initialize pointer to stop gcc complaining 2-9-16 GH --- */ obj_t* kappa_p = {0}; void* buf_kappa; func_t* packm_kers; void* packm_ker; FUNCPTR_T f; // Treatment of kappa (ie: packing during scaling) depends on // whether we are executing an induced method. if ( bli_is_ind_packed( schema ) ) { // The value for kappa we use will depend on whether the scalar // attached to A has a nonzero imaginary component. If it does, // then we will apply the scalar during packing to facilitate // implementing induced complex domain algorithms in terms of // real domain micro-kernels. (In the aforementioned situation, // applying a real scalar is easy, but applying a complex one is // harder, so we avoid the need altogether with the code below.) if( thread_am_ochief( t ) ) { if ( bli_obj_scalar_has_nonzero_imag( p ) ) { // Detach the scalar. bli_obj_scalar_detach( p, &kappa ); // Reset the attached scalar (to 1.0). bli_obj_scalar_reset( p ); kappa_p = κ } else { // If the internal scalar of A has only a real component, then // we will apply it later (in the micro-kernel), and so we will // use BLIS_ONE to indicate no scaling during packing. kappa_p = &BLIS_ONE; } } kappa_p = thread_obroadcast( t, kappa_p ); // Acquire the buffer to the kappa chosen above. buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p ); } else // if ( bli_is_nat_packed( schema ) ) { // This branch if for native execution, where we assume that // the micro-kernel will always apply the alpha scalar of the // higher-level operation. Thus, we use BLIS_ONE for kappa so // that the underlying packm implementation does not perform // any scaling during packing. buf_kappa = bli_obj_buffer_for_const( dt_cp, BLIS_ONE ); } // Choose the correct func_t object based on the pack_t schema. if ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers; else if ( bli_is_3mi_packed( schema ) || bli_is_3ms_packed( schema ) ) packm_kers = packm_struc_cxk_3mis_kers; else if ( bli_is_ro_packed( schema ) || bli_is_io_packed( schema ) || bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers; else packm_kers = packm_struc_cxk_kers; // Query the datatype-specific function pointer from the func_t object. packm_ker = bli_func_obj_query( dt_cp, packm_kers ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_cp]; // Invoke the function. f( strucc, diagoffc, diagc, uploc, transc, schema, invdiag, revifup, reviflo, m_p, n_p, m_max_p, n_max_p, buf_kappa, buf_c, rs_c, cs_c, buf_p, rs_p, cs_p, is_p, pd_p, ps_p, packm_ker, t ); }
void bli_trmm_blk_var2b( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, gemm_t* cntl, trmm_thrinfo_t* thread ) { obj_t a_pack_s; obj_t b1_pack_s, c1_pack_s; obj_t b1, c1; obj_t* a_pack = NULL; obj_t* b1_pack = NULL; obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; // Prune any zero region that exists along the partitioning dimension. bli_trmm_prune_unref_mparts_n( a, b, c ); if( thread_am_ochief( thread ) ) { // Initialize object for packing A bli_obj_init_pack( &a_pack_s ); bli_packm_init( a, &a_pack_s, cntx, cntl_sub_packm_a( cntl ) ); // Scale C by beta (if instructed). bli_scalm_int( &BLIS_ONE, c, cntx, cntl_sub_scalm( cntl ) ); } a_pack = thread_obroadcast( thread, &a_pack_s ); // Initialize pack objects for B and C that are passed into packm_init(). if( thread_am_ichief( thread ) ) { bli_obj_init_pack( &b1_pack_s ); bli_obj_init_pack( &c1_pack_s ); } b1_pack = thread_ibroadcast( thread, &b1_pack_s ); c1_pack = thread_ibroadcast( thread, &c1_pack_s ); // Pack A (if instructed). bli_packm_int( a, a_pack, cntx, cntl_sub_packm_a( cntl ), trmm_thread_sub_opackm( thread ) ); dim_t my_start, my_end; bli_get_range_weighted_r2l( thread, b, bli_cntx_get_bmult( cntl_bszid( cntl ), cntx ), &my_start, &my_end ); // Partition along the n dimension. for ( i = my_start; i < my_end; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_b( i, my_end, b, cntl_bszid( cntl ), cntx ); // Acquire partitions for B1 and C1. bli_acquire_mpart_r2l( BLIS_SUBPART1, i, b_alg, b, &b1 ); bli_acquire_mpart_r2l( BLIS_SUBPART1, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and B1. if( thread_am_ichief( thread ) ) { bli_packm_init( &b1, b1_pack, cntx, cntl_sub_packm_b( cntl ) ); bli_packm_init( &c1, c1_pack, cntx, cntl_sub_packm_c( cntl ) ); } thread_ibarrier( thread ); // Pack B1 (if instructed). bli_packm_int( &b1, b1_pack, cntx, cntl_sub_packm_b( cntl ), trmm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). bli_packm_int( &c1, c1_pack, cntx, cntl_sub_packm_c( cntl ), trmm_thread_sub_ipackm( thread ) ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, a_pack, b1_pack, &BLIS_ONE, c1_pack, cntx, cntl_sub_gemm( cntl ), trmm_thread_sub_trmm( thread ) ); thread_ibarrier( thread ); // Unpack C1 (if C1 was packed). bli_unpackm_int( c1_pack, &c1, cntx, cntl_sub_unpackm_c( cntl ), trmm_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. thread_obarrier( thread ); if( thread_am_ochief( thread ) ) bli_packm_release( a_pack, cntl_sub_packm_a( cntl ) ); if( thread_am_ichief( thread ) ) { bli_packm_release( b1_pack, cntl_sub_packm_b( cntl ) ); bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) ); } }
void bli_gemm_blk_var4f( obj_t* a, obj_t* b, obj_t* c, gemm_t* cntl, gemm_thrinfo_t* thread ) { extern packm_t* gemm3mh_packa_cntl_ro; extern packm_t* gemm3mh_packa_cntl_io; extern packm_t* gemm3mh_packa_cntl_rpi; packm_t* packa_cntl_ro = gemm3mh_packa_cntl_ro; packm_t* packa_cntl_io = gemm3mh_packa_cntl_io; packm_t* packa_cntl_rpi = gemm3mh_packa_cntl_rpi; //The s is for "lives on the stack" obj_t b_pack_s; obj_t a1_pack_s, c1_pack_s; obj_t a1, c1; obj_t* a1_pack = NULL; obj_t* b_pack = NULL; obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; dim_t m_trans; if( thread_am_ochief( thread ) ) { // Initialize object for packing B. bli_obj_init_pack( &b_pack_s ); bli_packm_init( b, &b_pack_s, cntl_sub_packm_b( cntl ) ); // Scale C by beta (if instructed). // Since scalm doesn't support multithreading yet, must be done by chief thread (ew) bli_scalm_int( &BLIS_ONE, c, cntl_sub_scalm( cntl ) ); } b_pack = thread_obroadcast( thread, &b_pack_s ); // Initialize objects passed into bli_packm_init for A and C if( thread_am_ichief( thread ) ) { bli_obj_init_pack( &a1_pack_s ); bli_obj_init_pack( &c1_pack_s ); } a1_pack = thread_ibroadcast( thread, &a1_pack_s ); c1_pack = thread_ibroadcast( thread, &c1_pack_s ); // Pack B (if instructed). bli_packm_int( b, b_pack, cntl_sub_packm_b( cntl ), gemm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. m_trans = bli_obj_length_after_trans( *a ); dim_t start, end; bli_get_range_t2b( thread, 0, m_trans, bli_blksz_get_mult_for_obj( a, cntl_blocksize( cntl ) ), &start, &end ); // Partition along the m dimension. for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. // NOTE: Use of a (for execution datatype) is intentional! // This causes the right blocksize to be used if c and a are // complex and b is real. b_alg = bli_determine_blocksize_f( i, end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and C1. bli_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, a, &a1 ); bli_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and C1. if( thread_am_ichief( thread ) ) { bli_packm_init( &a1, a1_pack, packa_cntl_ro ); bli_packm_init( &c1, c1_pack, cntl_sub_packm_c( cntl ) ); } thread_ibarrier( thread ); // Pack A1 (if instructed). bli_packm_int( &a1, a1_pack, packa_cntl_ro, gemm_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). bli_packm_int( &c1, c1_pack, cntl_sub_packm_c( cntl ), gemm_thread_sub_ipackm( thread ) ); // Perform gemm subproblem. bli_gemm_int( &BLIS_ONE, a1_pack, b_pack, &BLIS_ONE, c1_pack, cntl_sub_gemm( cntl ), gemm_thread_sub_gemm( thread ) ); thread_ibarrier( thread ); // Only apply beta within the first of three subproblems. if ( thread_am_ichief( thread ) ) bli_obj_scalar_reset( c1_pack ); // Initialize objects for packing A1 and C1. if( thread_am_ichief( thread ) ) { bli_packm_init( &a1, a1_pack, packa_cntl_io ); } thread_ibarrier( thread ); // Pack A1 (if instructed). bli_packm_int( &a1, a1_pack, packa_cntl_io, gemm_thread_sub_ipackm( thread ) ); // Perform gemm subproblem. bli_gemm_int( &BLIS_ONE, a1_pack, b_pack, &BLIS_ONE, c1_pack, cntl_sub_gemm( cntl ), gemm_thread_sub_gemm( thread ) ); thread_ibarrier( thread ); // Initialize objects for packing A1 and C1. if( thread_am_ichief( thread ) ) { bli_packm_init( &a1, a1_pack, packa_cntl_rpi ); } thread_ibarrier( thread ); // Pack A1 (if instructed). bli_packm_int( &a1, a1_pack, packa_cntl_rpi, gemm_thread_sub_ipackm( thread ) ); // Perform gemm subproblem. bli_gemm_int( &BLIS_ONE, a1_pack, b_pack, &BLIS_ONE, c1_pack, cntl_sub_gemm( cntl ), gemm_thread_sub_gemm( thread ) ); thread_ibarrier( thread ); // Unpack C1 (if C1 was packed). // Currently must be done by 1 thread bli_unpackm_int( c1_pack, &c1, cntl_sub_unpackm_c( cntl ), gemm_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. thread_obarrier( thread ); if( thread_am_ochief( thread ) ) bli_packm_release( b_pack, cntl_sub_packm_b( cntl ) ); if( thread_am_ichief( thread ) ){ // It doesn't matter which packm cntl node we pass in, as long // as it is valid, packm_release() will release the mem_t entry. bli_packm_release( a1_pack, packa_cntl_ro ); bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) ); } }
void bli_trmm_blk_var3b( obj_t* a, obj_t* b, obj_t* c, gemm_t* cntl, trmm_thrinfo_t* thread ) { obj_t c_pack_s; obj_t a1_pack_s, b1_pack_s; obj_t a1, b1; obj_t* a1_pack = NULL; obj_t* b1_pack = NULL; obj_t* c_pack = NULL; dim_t i; dim_t b_alg; dim_t k_trans; if( thread_am_ochief( thread ) ){ // Initialize object for packing C bli_obj_init_pack( &c_pack_s ); bli_packm_init( c, &c_pack_s, cntl_sub_packm_c( cntl ) ); // Scale C by beta (if instructed). bli_scalm_int( &BLIS_ONE, c, cntl_sub_scalm( cntl ) ); } c_pack = thread_obroadcast( thread, &c_pack_s ); // Initialize pack objects for A and B that are passed into packm_init(). if( thread_am_ichief( thread ) ){ bli_obj_init_pack( &a1_pack_s ); bli_obj_init_pack( &b1_pack_s ); } a1_pack = thread_ibroadcast( thread, &a1_pack_s ); b1_pack = thread_ibroadcast( thread, &b1_pack_s ); // Pack C (if instructed). bli_packm_int( c, c_pack, cntl_sub_packm_c( cntl ), trmm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) { // Determine the current algorithmic blocksize. // NOTE: We call a trmm-specific function to determine the kc // blocksize so that we can implement the "nudging" of kc to be // a multiple of mr or nr, as needed. b_alg = bli_trmm_determine_kc_b( i, k_trans, a, b, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and B1. bli_acquire_mpart_r2l( BLIS_SUBPART1, i, b_alg, a, &a1 ); bli_acquire_mpart_b2t( BLIS_SUBPART1, i, b_alg, b, &b1 ); // Initialize objects for packing A1 and B1. if( thread_am_ichief( thread ) ) { bli_packm_init( &a1, a1_pack, cntl_sub_packm_a( cntl ) ); bli_packm_init( &b1, b1_pack, cntl_sub_packm_b( cntl ) ); } thread_ibarrier( thread ); // Pack A1 (if instructed). bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), trmm_thread_sub_ipackm( thread ) ); // Pack B1 (if instructed). bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), trmm_thread_sub_ipackm( thread ) ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, a1_pack, b1_pack, &BLIS_ONE, c_pack, cntl_sub_gemm( cntl ), trmm_thread_sub_trmm( thread ) ); thread_ibarrier( thread ); } thread_obarrier( thread ); // Unpack C (if C was packed). bli_unpackm_int( c_pack, c, cntl_sub_unpackm_c( cntl ), trmm_thread_sub_opackm( thread ) ); // If any packing buffers were acquired within packm, release them back // to the memory manager. if( thread_am_ochief( thread ) ){ bli_packm_release( c_pack, cntl_sub_packm_c( cntl ) ); } if( thread_am_ichief( thread ) ){ bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) ); bli_packm_release( b1_pack, cntl_sub_packm_b( cntl ) ); } }
void bli_herk_blk_var2f( obj_t* a, obj_t* ah, obj_t* c, gemm_t* cntl, herk_thrinfo_t* thread ) { obj_t a_pack_s; obj_t ah1_pack_s, c1S_pack_s; obj_t ah1, c1, c1S; obj_t aS_pack; obj_t* a_pack; obj_t* ah1_pack; obj_t* c1S_pack; dim_t i; dim_t b_alg; dim_t n_trans; subpart_t stored_part; // The upper and lower variants are identical, except for which // merged subpartition is acquired in the loop body. if ( bli_obj_is_lower( *c ) ) stored_part = BLIS_SUBPART1B; else stored_part = BLIS_SUBPART1T; if( thread_am_ochief( thread ) ) { // Initialize object for packing A bli_obj_init_pack( &a_pack_s ); bli_packm_init( a, &a_pack_s, cntl_sub_packm_a( cntl ) ); // Scale C by beta (if instructed). bli_scalm_int( &BLIS_ONE, c, cntl_sub_scalm( cntl ) ); } a_pack = thread_obroadcast( thread, &a_pack_s ); // Initialize pack objects for C and A' that are passed into packm_init(). if( thread_am_ichief( thread ) ) { bli_obj_init_pack( &ah1_pack_s ); bli_obj_init_pack( &c1S_pack_s ); } ah1_pack = thread_ibroadcast( thread, &ah1_pack_s ); c1S_pack = thread_ibroadcast( thread, &c1S_pack_s ); // Pack A (if instructed). bli_packm_int( a, a_pack, cntl_sub_packm_a( cntl ), herk_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *c ); dim_t start, end; // Needs to be replaced with a weighted range because triangle bli_get_range_weighted( thread, 0, n_trans, bli_blksz_get_mult_for_obj( a, cntl_blocksize( cntl ) ), bli_obj_is_lower( *c ), &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( i, end, a, cntl_blocksize( cntl ) ); // Acquire partitions for A1' and C1. bli_acquire_mpart_l2r( BLIS_SUBPART1, i, b_alg, ah, &ah1 ); bli_acquire_mpart_l2r( BLIS_SUBPART1, i, b_alg, c, &c1 ); // Partition off the stored region of C1 and the corresponding region // of A_pack. bli_acquire_mpart_t2b( stored_part, i, b_alg, &c1, &c1S ); bli_acquire_mpart_t2b( stored_part, i, b_alg, a_pack, &aS_pack ); // Initialize objects for packing A1' and C1. if( thread_am_ichief( thread ) ) { bli_packm_init( &ah1, ah1_pack, cntl_sub_packm_b( cntl ) ); bli_packm_init( &c1S, c1S_pack, cntl_sub_packm_c( cntl ) ); } thread_ibarrier( thread ) ; // Pack A1' (if instructed). bli_packm_int( &ah1, ah1_pack, cntl_sub_packm_b( cntl ), herk_thread_sub_ipackm( thread ) ); // Pack C1 (if instructed). bli_packm_int( &c1S, c1S_pack, cntl_sub_packm_c( cntl ), herk_thread_sub_ipackm( thread ) ) ; // Perform herk subproblem. bli_herk_int( &BLIS_ONE, &aS_pack, ah1_pack, &BLIS_ONE, c1S_pack, cntl_sub_gemm( cntl ), herk_thread_sub_herk( thread ) ); thread_ibarrier( thread ); // Unpack C1 (if C1 was packed). bli_unpackm_int( c1S_pack, &c1S, cntl_sub_unpackm_c( cntl ), herk_thread_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. thread_obarrier( thread ); if( thread_am_ochief( thread ) ) bli_packm_release( a_pack, cntl_sub_packm_a( cntl ) ); if( thread_am_ichief( thread ) ) { bli_packm_release( ah1_pack, cntl_sub_packm_b( cntl ) ); bli_packm_release( c1S_pack, cntl_sub_packm_c( cntl ) ); } }
void bli_gemm_blk_var3f( obj_t* a, obj_t* b, obj_t* c, gemm_t* cntl, gemm_thrinfo_t* thread ) { obj_t c_pack_s; obj_t a1_pack_s, b1_pack_s; obj_t a1, b1; obj_t* a1_pack = NULL; obj_t* b1_pack = NULL; obj_t* c_pack = NULL; dim_t i; dim_t b_alg; dim_t k_trans; if( thread_am_ochief( thread ) ){ // Initialize object for packing C bli_obj_init_pack( &c_pack_s ); bli_packm_init( c, &c_pack_s, cntl_sub_packm_c( cntl ) ); // Scale C by beta (if instructed). bli_scalm_int( &BLIS_ONE, c, cntl_sub_scalm( cntl ) ); } c_pack = thread_obroadcast( thread, &c_pack_s ); // Initialize pack objects for A and B that are passed into packm_init(). if( thread_am_ichief( thread ) ){ bli_obj_init_pack( &a1_pack_s ); bli_obj_init_pack( &b1_pack_s ); } a1_pack = thread_ibroadcast( thread, &a1_pack_s ); b1_pack = thread_ibroadcast( thread, &b1_pack_s ); // Pack C (if instructed). bli_packm_int( c, c_pack, cntl_sub_packm_c( cntl ), gemm_thread_sub_opackm( thread ) ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) { // Determine the current algorithmic blocksize. // NOTE: We call a gemm/hemm/symm-specific function to determine // the kc blocksize so that we can implement the "nudging" of kc // to be a multiple of mr or nr, as needed. b_alg = bli_gemm_determine_kc_f( i, k_trans, a, b, cntl_blocksize( cntl ) ); // Acquire partitions for A1 and B1. bli_acquire_mpart_l2r( BLIS_SUBPART1, i, b_alg, a, &a1 ); bli_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, b, &b1 ); // Initialize objects for packing A1 and B1. if( thread_am_ichief( thread ) ) { bli_packm_init( &a1, a1_pack, cntl_sub_packm_a( cntl ) ); bli_packm_init( &b1, b1_pack, cntl_sub_packm_b( cntl ) ); } thread_ibarrier( thread ); // Pack A1 (if instructed). bli_packm_int( &a1, a1_pack, cntl_sub_packm_a( cntl ), gemm_thread_sub_ipackm( thread ) ); // Pack B1 (if instructed). bli_packm_int( &b1, b1_pack, cntl_sub_packm_b( cntl ), gemm_thread_sub_ipackm( thread ) ); // Perform gemm subproblem. bli_gemm_int( &BLIS_ONE, a1_pack, b1_pack, &BLIS_ONE, c_pack, cntl_sub_gemm( cntl ), gemm_thread_sub_gemm( thread) ); // This variant executes multiple rank-k updates. Therefore, if the // internal beta scalar on matrix C is non-zero, we must use it // only for the first iteration (and then BLIS_ONE for all others). // And since c_pack is a local obj_t, we can simply overwrite the // internal beta scalar with BLIS_ONE once it has been used in the // first iteration. thread_ibarrier( thread ); if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack ); } thread_obarrier( thread ); // Unpack C (if C was packed). bli_unpackm_int( c_pack, c, cntl_sub_unpackm_c( cntl ), gemm_thread_sub_opackm( thread ) ); // If any packing buffers were acquired within packm, release them back // to the memory manager. if( thread_am_ochief( thread ) ) bli_packm_release( c_pack, cntl_sub_packm_c( cntl ) ); if( thread_am_ichief( thread ) ){ bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) ); bli_packm_release( b1_pack, cntl_sub_packm_b( cntl ) ); } }