void bli_gemm_blk_var1
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    obj_t a1, c1;

    dir_t direct;

    dim_t i;
    dim_t b_alg;
    dim_t my_start, my_end;

    // Determine the direction in which to partition (forwards or backwards).
    direct = bli_l3_direct( a, b, c, cntl );

    // Prune any zero region that exists along the partitioning dimension.
    bli_l3_prune_unref_mparts_m( a, b, c, cntl );

    // Determine the current thread's subpartition range.
    bli_thread_get_range_mdim
    (
      direct, thread, a, b, c, cntl, cntx,
      &my_start, &my_end
    );

    // Partition along the m dimension.
    for ( i = my_start; i < my_end; i += b_alg )
    {
        // Determine the current algorithmic blocksize.
        b_alg = bli_determine_blocksize( direct, i, my_end, a,
                                         bli_cntl_bszid( cntl ), cntx );

        // Acquire partitions for A1 and C1.
        bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
                                i, b_alg, a, &a1 );
        bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
                                i, b_alg, c, &c1 );

        // Perform gemm subproblem.
        bli_gemm_int
        (
          &BLIS_ONE,
          &a1,
          b,
          &BLIS_ONE,
          &c1,
          cntx,
          bli_cntl_sub_node( cntl ),
          bli_thrinfo_sub_node( thread )
        );
    }
}
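// ---------------------------------------------------------------------------
// A minimal standalone sketch (not part of BLIS) of the partitioning that
// variant 1 performs above: C and A are walked in row-panels of height b_alg,
// and each panel pair forms an independent subproblem C1 += A1 * B. Because
// the subproblems are independent, threads can claim disjoint ranges of the
// m dimension, which is exactly what bli_thread_get_range_mdim() computes.
// The function name, the plain row-major double arrays, and the fixed
// blocksize are hypothetical simplifications of the obj_t partitioning and
// of bli_determine_blocksize().

#include <stddef.h>

static void sketch_gemm_blk_var1( size_t m, size_t n, size_t k,
                                  const double* a,  // m x k, row-major
                                  const double* b,  // k x n, row-major
                                  double*       c ) // m x n, row-major
{
    const size_t b_alg = 64; // hypothetical fixed blocksize

    for ( size_t i = 0; i < m; i += b_alg )
    {
        // The final panel may be shorter than b_alg.
        const size_t mb = ( m - i < b_alg ? m - i : b_alg );

        // Subproblem: C[i:i+mb, :] += A[i:i+mb, :] * B.
        for ( size_t ii = 0; ii < mb; ++ii )
            for ( size_t p = 0; p < k; ++p )
                for ( size_t j = 0; j < n; ++j )
                    c[ (i+ii)*n + j ] += a[ (i+ii)*k + p ] * b[ p*n + j ];
    }
}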
void blx_gemm_blk_var3
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    obj_t a1, b1;

    dim_t i;
    dim_t b_alg;
    dim_t k_trans;

    // Query dimension in partitioning direction.
    k_trans = bli_obj_width_after_trans( a );

    // Partition along the k dimension.
    for ( i = 0; i < k_trans; i += b_alg )
    {
        // Determine the current algorithmic blocksize.
        b_alg = blx_determine_blocksize_f( i, k_trans, c,
                                           bli_cntl_bszid( cntl ), cntx );

        // Acquire partitions for A1 and B1.
        bli_acquire_mpart_ndim( BLIS_FWD, BLIS_SUBPART1,
                                i, b_alg, a, &a1 );
        bli_acquire_mpart_mdim( BLIS_FWD, BLIS_SUBPART1,
                                i, b_alg, b, &b1 );

        // Perform gemm subproblem.
        blx_gemm_int
        (
          &a1,
          &b1,
          c,
          cntx,
          rntm,
          bli_cntl_sub_node( cntl ),
          bli_thrinfo_sub_node( thread )
        );

        bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );

        // This variant executes multiple rank-k updates. Therefore, if the
        // internal beta scalar on matrix C is non-zero, we must use it
        // only for the first iteration (and then BLIS_ONE for all others).
        // And since c is a locally aliased obj_t, we can simply overwrite
        // the internal beta scalar with BLIS_ONE once it has been used in
        // the first iteration.
        if ( i == 0 ) bli_obj_scalar_reset( c );
    }
}
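// ---------------------------------------------------------------------------
// A minimal standalone sketch (not part of BLIS) of why variant 3 resets the
// internal beta scalar on C after the first iteration. Partitioning along k
// rewrites C := beta*C + A*B as a sequence of rank-b_alg updates; beta may
// scale C only once, so every iteration after the first must accumulate with
// beta = 1. It also shows why the barrier above is needed: unlike the
// m-panels of variant 1, every rank-k update touches all of C, so iterations
// depend on one another. Uses size_t from <stddef.h> as in the sketch above;
// the arrays and fixed blocksize are again hypothetical simplifications.

static void sketch_gemm_blk_var3( size_t m, size_t n, size_t k, double beta,
                                  const double* a,  // m x k, row-major
                                  const double* b,  // k x n, row-major
                                  double*       c ) // m x n, row-major
{
    const size_t b_alg = 64; // hypothetical fixed blocksize

    for ( size_t i = 0; i < k; i += b_alg )
    {
        const size_t kb = ( k - i < b_alg ? k - i : b_alg );

        // Rank-kb update: C := beta_i*C + A[:, i:i+kb] * B[i:i+kb, :].
        for ( size_t ii = 0; ii < m; ++ii )
            for ( size_t j = 0; j < n; ++j )
            {
                double dot = 0.0;
                for ( size_t p = 0; p < kb; ++p )
                    dot += a[ ii*k + (i+p) ] * b[ (i+p)*n + j ];
                c[ ii*n + j ] = beta * c[ ii*n + j ] + dot;
            }

        // Mirror of bli_obj_scalar_reset( c ): beta has been applied once,
        // so all remaining iterations accumulate with beta = 1.
        if ( i == 0 ) beta = 1.0;
    }
}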
void bli_trsm_blk_var3
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    obj_t a1, b1;

    dir_t direct;

    dim_t i;
    dim_t b_alg;
    dim_t k_trans;

    // Determine the direction in which to partition (forwards or backwards).
    direct = bli_l3_direct( a, b, c, cntl );

    // Prune any zero region that exists along the partitioning dimension.
    bli_l3_prune_unref_mparts_k( a, b, c, cntl );

    // Query dimension in partitioning direction.
    k_trans = bli_obj_width_after_trans( a );

    // Partition along the k dimension.
    for ( i = 0; i < k_trans; i += b_alg )
    {
        // Determine the current algorithmic blocksize.
        b_alg = bli_trsm_determine_kc( direct, i, k_trans, a, b,
                                       bli_cntl_bszid( cntl ), cntx );

        // Acquire partitions for A1 and B1.
        bli_acquire_mpart_ndim( direct, BLIS_SUBPART1,
                                i, b_alg, a, &a1 );
        bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
                                i, b_alg, b, &b1 );

        // Perform trsm subproblem.
        bli_trsm_int
        (
          &BLIS_ONE,
          &a1,
          &b1,
          &BLIS_ONE,
          c,
          cntx,
          bli_cntl_sub_node( cntl ),
          bli_thrinfo_sub_node( thread )
        );

        bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );

        // This variant executes multiple rank-k updates. Therefore, if the
        // internal alpha scalars on A/B and C are non-zero, we must ensure
        // that they are only used in the first iteration.
        if ( i == 0 )
        {
            bli_obj_scalar_reset( a );
            bli_obj_scalar_reset( b );
            bli_obj_scalar_reset( c );
        }
    }
}
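// ---------------------------------------------------------------------------
// A loose standalone illustration (not part of BLIS) of the k-partitioned
// structure behind trsm variant 3, assuming the left-side, lower-triangular,
// non-transposed case: solve L*X = alpha*B, overwriting B with X. Each
// k-panel contributes a small triangular solve on the diagonal block plus a
// rank-kb gemm update of the trailing rows, and alpha is applied exactly
// once, which is the role the bli_obj_scalar_reset() calls above play for
// the locally aliased objects. The real variant instead dispatches each
// subproblem through bli_trsm_int(); this sketch only mirrors the blocking.

static void sketch_trsm_blk_var3( size_t m, size_t n, double alpha,
                                  const double* l,  // m x m lower triangular, row-major
                                  double*       b ) // m x n, row-major; overwritten with X
{
    const size_t b_alg = 64; // hypothetical fixed blocksize

    // Apply alpha exactly once, up front; the loop below must then use unit
    // scalars throughout, just as the scalar resets above enforce.
    for ( size_t ii = 0; ii < m; ++ii )
        for ( size_t j = 0; j < n; ++j )
            b[ ii*n + j ] *= alpha;

    for ( size_t i = 0; i < m; i += b_alg )
    {
        const size_t kb = ( m - i < b_alg ? m - i : b_alg );

        // Triangular subproblem on the diagonal block (unblocked forward
        // substitution): X[i:i+kb, :] = inv(L[i:i+kb, i:i+kb]) * B[i:i+kb, :].
        for ( size_t ii = 0; ii < kb; ++ii )
            for ( size_t j = 0; j < n; ++j )
            {
                double x = b[ (i+ii)*n + j ];
                for ( size_t p = 0; p < ii; ++p )
                    x -= l[ (i+ii)*m + (i+p) ] * b[ (i+p)*n + j ];
                b[ (i+ii)*n + j ] = x / l[ (i+ii)*m + (i+ii) ];
            }

        // Rank-kb gemm update of the trailing rows:
        // B[i+kb:, :] -= L[i+kb:, i:i+kb] * X[i:i+kb, :].
        for ( size_t ii = i + kb; ii < m; ++ii )
            for ( size_t p = 0; p < kb; ++p )
                for ( size_t j = 0; j < n; ++j )
                    b[ ii*n + j ] -= l[ ii*m + (i+p) ] * b[ (i+p)*n + j ];
    }
}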