void blx_gemm_blk_var3
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
    obj_t a1, b1;

    dim_t i;
    dim_t b_alg;
    dim_t k_trans;

    // Query dimension in partitioning direction.
    k_trans = bli_obj_width_after_trans( a );

    // Partition along the k dimension.
    for ( i = 0; i < k_trans; i += b_alg )
    {
        // Determine the current algorithmic blocksize.
        b_alg = blx_determine_blocksize_f( i, k_trans, c,
                                           bli_cntl_bszid( cntl ), cntx );

        // Acquire partitions for A1 and B1.
        bli_acquire_mpart_ndim( BLIS_FWD, BLIS_SUBPART1,
                                i, b_alg, a, &a1 );
        bli_acquire_mpart_mdim( BLIS_FWD, BLIS_SUBPART1,
                                i, b_alg, b, &b1 );

        // Perform gemm subproblem.
        blx_gemm_int
        (
          &a1,
          &b1,
          c,
          cntx,
          rntm,
          bli_cntl_sub_node( cntl ),
          bli_thrinfo_sub_node( thread )
        );

        bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );

        // This variant executes multiple rank-k updates. Therefore, if the
        // internal beta scalar on matrix C is non-zero, we must use it
        // only for the first iteration (and then BLIS_ONE for all others).
        // And since c is a locally aliased obj_t, we can simply overwrite
        // the internal beta scalar with BLIS_ONE once it has been used in
        // the first iteration.
        if ( i == 0 ) bli_obj_scalar_reset( c );
    }
}
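The scalar handling in the loop above is the subtle part of this variant: C is updated once per rank-k iteration, so applying beta more than once would rescale earlier partial results. The following is a minimal standalone sketch (plain C, not part of the BLIS API; the names `panels` and `beta_use` are purely illustrative) of why the user-provided scalar may be applied only on the first iteration, which is what resetting the internal scalar on c accomplishes.

/* Standalone illustration: accumulate C := beta*C + sum_p A_p*B_p,
   applying beta only on the first partial update. */
#include <stdio.h>

int main( void )
{
    double c    = 5.0;                     /* a 1x1 stand-in for matrix C */
    double beta = 2.0;                     /* user-provided beta */
    double panels[3] = { 1.0, 3.0, 4.0 };  /* stand-ins for the partial products A_p * B_p */

    for ( int p = 0; p < 3; ++p )
    {
        /* beta is used in the first iteration only; afterwards the
           effective scalar is one, mirroring bli_obj_scalar_reset(). */
        double beta_use = ( p == 0 ? beta : 1.0 );
        c = beta_use * c + panels[p];
    }

    /* Expect 2.0*5.0 + (1.0 + 3.0 + 4.0) = 18.0 */
    printf( "c = %f\n", c );
    return 0;
}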
void bli_gemm_packb
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
    obj_t b_pack;

    // Pack matrix B according to the control tree node.
    bli_l3_packm
    (
      b,
      &b_pack,
      cntx,
      cntl,
      thread
    );

    // Proceed with execution using packed matrix B.
    bli_gemm_int
    (
      &BLIS_ONE,
      a,
      &b_pack,
      &BLIS_ONE,
      c,
      cntx,
      bli_cntl_sub_node( cntl ),
      bli_thrinfo_sub_node( thread )
    );
}
void bli_l3_thrinfo_free
     (
       thrinfo_t* thread
     )
{
    if ( thread == NULL ||
         thread == &BLIS_PACKM_SINGLE_THREADED ||
         thread == &BLIS_GEMM_SINGLE_THREADED
       ) return;

    thrinfo_t* thrinfo_sub_node = bli_thrinfo_sub_node( thread );

    // Free the communicators, but only if the current thrinfo_t struct
    // is marked as needing them to be freed. The most common example of
    // thrinfo_t nodes NOT marked as needing their comms freed are those
    // associated with packm thrinfo_t nodes.
    if ( bli_thrinfo_needs_free_comm( thread ) )
    {
        // The ochief always frees its communicator, and the ichief frees
        // its communicator only if we are at the leaf node.
        if ( bli_thread_am_ochief( thread ) )
            bli_thrcomm_free( bli_thrinfo_ocomm( thread ) );
    }

    // Free all children of the current thrinfo_t.
    bli_l3_thrinfo_free( thrinfo_sub_node );

    // Free the thrinfo_t struct.
    bli_free_intl( thread );
}
void bli_gemm_blk_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
    obj_t b1, c1;

    dir_t direct;

    dim_t i;
    dim_t b_alg;
    dim_t my_start, my_end;

    // Determine the direction in which to partition (forwards or backwards).
    direct = bli_l3_direct( a, b, c, cntl );

    // Prune any zero region that exists along the partitioning dimension.
    bli_l3_prune_unref_mparts_n( a, b, c, cntl );

    // Determine the current thread's subpartition range.
    bli_thread_range_ndim
    (
      direct, thread, a, b, c, cntl, cntx,
      &my_start, &my_end
    );

    // Partition along the n dimension.
    for ( i = my_start; i < my_end; i += b_alg )
    {
        // Determine the current algorithmic blocksize.
        b_alg = bli_determine_blocksize( direct, i, my_end, b,
                                         bli_cntl_bszid( cntl ), cntx );

        // Acquire partitions for B1 and C1.
        bli_acquire_mpart_ndim( direct, BLIS_SUBPART1,
                                i, b_alg, b, &b1 );
        bli_acquire_mpart_ndim( direct, BLIS_SUBPART1,
                                i, b_alg, c, &c1 );

        // Perform gemm subproblem.
        bli_gemm_int
        (
          &BLIS_ONE,
          a,
          &b1,
          &BLIS_ONE,
          &c1,
          cntx,
          rntm,
          bli_cntl_sub_node( cntl ),
          bli_thrinfo_sub_node( thread )
        );
    }
}
void bli_l3_thrinfo_print_paths
     (
       thrinfo_t** threads
     )
{
    dim_t n_threads = bli_thread_num_threads( threads[0] );
    dim_t gl_comm_id;

    thrinfo_t* jc_info = threads[0];
    thrinfo_t* pc_info = bli_thrinfo_sub_node( jc_info );
    thrinfo_t* pb_info = bli_thrinfo_sub_node( pc_info );
    thrinfo_t* ic_info = bli_thrinfo_sub_node( pb_info );
    thrinfo_t* pa_info = bli_thrinfo_sub_node( ic_info );
    thrinfo_t* jr_info = bli_thrinfo_sub_node( pa_info );
    thrinfo_t* ir_info = bli_thrinfo_sub_node( jr_info );

    dim_t jc_way = bli_thread_n_way( jc_info );
    dim_t pc_way = bli_thread_n_way( pc_info );
    dim_t pb_way = bli_thread_n_way( pb_info );
    dim_t ic_way = bli_thread_n_way( ic_info );
    dim_t pa_way = bli_thread_n_way( pa_info );
    dim_t jr_way = bli_thread_n_way( jr_info );
    dim_t ir_way = bli_thread_n_way( ir_info );

    dim_t gl_nt = bli_thread_num_threads( jc_info );
    dim_t jc_nt = bli_thread_num_threads( pc_info );
    dim_t pc_nt = bli_thread_num_threads( pb_info );
    dim_t pb_nt = bli_thread_num_threads( ic_info );
    dim_t ic_nt = bli_thread_num_threads( pa_info );
    dim_t pa_nt = bli_thread_num_threads( jr_info );
    dim_t jr_nt = bli_thread_num_threads( ir_info );

    printf( "         gl   jc   kc   pb   ic   pa   jr   ir\n" );
    printf( "xx_nt: %4lu %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
            ( unsigned long )gl_nt,
            ( unsigned long )jc_nt,
            ( unsigned long )pc_nt,
            ( unsigned long )pb_nt,
            ( unsigned long )ic_nt,
            ( unsigned long )pa_nt,
            ( unsigned long )jr_nt,
            ( unsigned long )1 );
    printf( "\n" );

    printf( "          jc   kc   pb   ic   pa   jr   ir\n" );
    printf( "xx_way: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
            ( unsigned long )jc_way,
            ( unsigned long )pc_way,
            ( unsigned long )pb_way,
            ( unsigned long )ic_way,
            ( unsigned long )pa_way,
            ( unsigned long )jr_way,
            ( unsigned long )ir_way );
    printf( "=================================================\n" );

    for ( gl_comm_id = 0; gl_comm_id < n_threads; ++gl_comm_id )
    {
        jc_info = threads[gl_comm_id];
        pc_info = bli_thrinfo_sub_node( jc_info );
        pb_info = bli_thrinfo_sub_node( pc_info );
        ic_info = bli_thrinfo_sub_node( pb_info );
        pa_info = bli_thrinfo_sub_node( ic_info );
        jr_info = bli_thrinfo_sub_node( pa_info );
        ir_info = bli_thrinfo_sub_node( jr_info );

        dim_t gl_id      = bli_thread_ocomm_id( jc_info );
        dim_t jc_comm_id = bli_thread_ocomm_id( pc_info );
        dim_t pc_comm_id = bli_thread_ocomm_id( pb_info );
        dim_t pb_comm_id = bli_thread_ocomm_id( ic_info );
        dim_t ic_comm_id = bli_thread_ocomm_id( pa_info );
        dim_t pa_comm_id = bli_thread_ocomm_id( jr_info );
        dim_t jr_comm_id = bli_thread_ocomm_id( ir_info );

        dim_t jc_work_id = bli_thread_work_id( jc_info );
        dim_t pc_work_id = bli_thread_work_id( pc_info );
        dim_t pb_work_id = bli_thread_work_id( pb_info );
        dim_t ic_work_id = bli_thread_work_id( ic_info );
        dim_t pa_work_id = bli_thread_work_id( pa_info );
        dim_t jr_work_id = bli_thread_work_id( jr_info );
        dim_t ir_work_id = bli_thread_work_id( ir_info );

        printf( "            gl   jc   kc   pb   ic   pa   jr\n" );
        printf( "comm ids: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
                ( unsigned long )gl_id,
                ( unsigned long )jc_comm_id,
                ( unsigned long )pc_comm_id,
                ( unsigned long )pb_comm_id,
                ( unsigned long )ic_comm_id,
                ( unsigned long )pa_comm_id,
                ( unsigned long )jr_comm_id );
        printf( "work ids: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
                ( unsigned long )jc_work_id,
                ( unsigned long )pc_work_id,
                ( unsigned long )pb_work_id,
                ( unsigned long )ic_work_id,
                ( unsigned long )pa_work_id,
                ( unsigned long )jr_work_id,
                ( unsigned long )ir_work_id );
        printf( "---------------------------------------\n" );
    }
}
void bli_trsm_blk_var3
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
    obj_t a1, b1;

    dir_t direct;

    dim_t i;
    dim_t b_alg;
    dim_t k_trans;

    // Determine the direction in which to partition (forwards or backwards).
    direct = bli_l3_direct( a, b, c, cntl );

    // Prune any zero region that exists along the partitioning dimension.
    bli_l3_prune_unref_mparts_k( a, b, c, cntl );

    // Query dimension in partitioning direction.
    k_trans = bli_obj_width_after_trans( *a );

    // Partition along the k dimension.
    for ( i = 0; i < k_trans; i += b_alg )
    {
        // Determine the current algorithmic blocksize.
        b_alg = bli_trsm_determine_kc( direct, i, k_trans, a, b,
                                       bli_cntl_bszid( cntl ), cntx );

        // Acquire partitions for A1 and B1.
        bli_acquire_mpart_ndim( direct, BLIS_SUBPART1,
                                i, b_alg, a, &a1 );
        bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
                                i, b_alg, b, &b1 );

        // Perform trsm subproblem.
        bli_trsm_int
        (
          &BLIS_ONE,
          &a1,
          &b1,
          &BLIS_ONE,
          c,
          cntx,
          bli_cntl_sub_node( cntl ),
          bli_thrinfo_sub_node( thread )
        );

        //bli_thread_ibarrier( thread );
        bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );

        // This variant executes multiple rank-k updates. Therefore, if the
        // internal alpha scalars on A/B and C are non-zero, we must ensure
        // that they are only used in the first iteration.
        if ( i == 0 )
        {
            bli_obj_scalar_reset( a );
            bli_obj_scalar_reset( b );
            bli_obj_scalar_reset( c );
        }
    }
}