siz_t bli_thread_get_range_ndim
     (
       dir_t      direct,
       thrinfo_t* thr,
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntl_t*    cntl,
       cntx_t*    cntx,
       dim_t*     start,
       dim_t*     end
     )
{
    bszid_t bszid  = bli_cntl_bszid( cntl );
    opid_t  family = bli_cntx_get_family( cntx );

    // This is part of trsm's current implementation, whereby right-side
    // cases are implemented in left-side micro-kernels, which requires
    // that we swap the usage of the register blocksizes for the purposes
    // of packing A and B.
    if ( family == BLIS_TRSM )
    {
        if ( bli_obj_root_is_triangular( *b ) ) bszid = BLIS_MR;
        else                                    bszid = BLIS_NR;
    }

    blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx );
    obj_t*   x;
    bool_t   use_weighted;

    // Use the operation family to choose the one of the two matrices
    // being partitioned that potentially has structure, and also to
    // decide whether or not we need to use weighted range partitioning.
    // NOTE: It's important that we use non-weighted range partitioning
    // for hemm and symm (i.e., the gemm family) because the weighted
    // function will mistakenly skip over unstored regions of the
    // structured matrix, even though they represent part of that matrix
    // that will be dense and full (after packing).
    if      ( family == BLIS_GEMM ) { x = b; use_weighted = FALSE; }
    else if ( family == BLIS_HERK ) { x = c; use_weighted = TRUE;  }
    else if ( family == BLIS_TRMM ) { x = b; use_weighted = TRUE;  }
    else /* family == BLIS_TRSM */  { x = b; use_weighted = FALSE; }

    if ( use_weighted )
    {
        if ( direct == BLIS_FWD )
            return bli_thread_get_range_weighted_l2r( thr, x, bmult, start, end );
        else
            return bli_thread_get_range_weighted_r2l( thr, x, bmult, start, end );
    }
    else
    {
        if ( direct == BLIS_FWD )
            return bli_thread_get_range_l2r( thr, x, bmult, start, end );
        else
            return bli_thread_get_range_r2l( thr, x, bmult, start, end );
    }
}
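To make the non-weighted case concrete, here is a minimal, self-contained sketch of the idea behind the unweighted l2r partitioning: split the range [0, n) among nt threads so that every internal boundary falls on a multiple of the blocksize multiple bf, with the edge remainder absorbed by the last thread. This is an illustration only, not the actual body of bli_thread_get_range_l2r(); the helper name and its flat int interface are hypothetical.

    #include <stdio.h>

    // Hypothetical sketch of block-aligned, unweighted range partitioning.
    static void get_range_l2r_sketch( int tid, int nt, int n, int bf,
                                      int* start, int* end )
    {
        int n_bf      = n / bf;     // number of whole bf-sized blocks
        int n_bf_left = n % bf;     // leftover edge iterations
        int n_per     = n_bf / nt;  // whole blocks per thread
        int n_extra   = n_bf % nt;  // threads that receive one extra block

        // Threads with tid < n_extra each take one additional block.
        int lo = tid * n_per + ( tid < n_extra ? tid : n_extra );
        int hi = lo + n_per + ( tid < n_extra ? 1 : 0 );

        *start = lo * bf;
        *end   = hi * bf;

        // The last thread also absorbs the edge case (n not a multiple of bf).
        if ( tid == nt - 1 ) *end += n_bf_left;
    }

    int main( void )
    {
        // Example: n = 1000 split among 4 threads with bf = 8 yields
        // [0,256), [256,504), [504,752), [752,1000); every boundary is
        // a multiple of 8.
        int s, e;
        for ( int t = 0; t < 4; t++ )
        {
            get_range_l2r_sketch( t, 4, 1000, 8, &s, &e );
            printf( "thread %d: [%d, %d)\n", t, s, e );
        }
        return 0;
    }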
void bli_gemm_blk_var1f
     (
       obj_t*          a,
       obj_t*          b,
       obj_t*          c,
       cntx_t*         cntx,
       gemm_t*         cntl,
       gemm_thrinfo_t* thread
     )
{
    // The _s suffix means "lives on the stack".
    obj_t  b_pack_s;
    obj_t  a1_pack_s, c1_pack_s;

    obj_t  a1, c1;
    obj_t* a1_pack = NULL;
    obj_t* b_pack  = NULL;
    obj_t* c1_pack = NULL;

    dim_t i;
    dim_t b_alg;

    if ( thread_am_ochief( thread ) )
    {
        // Initialize object for packing B.
        bli_obj_init_pack( &b_pack_s );
        bli_packm_init( b, &b_pack_s, cntx, cntl_sub_packm_b( cntl ) );

        // Scale C by beta (if instructed). Since scalm doesn't support
        // multithreading yet, this must be done by the chief thread.
        bli_scalm_int( &BLIS_ONE, c, cntx, cntl_sub_scalm( cntl ) );
    }
    b_pack = thread_obroadcast( thread, &b_pack_s );

    // Initialize objects passed into bli_packm_init() for A and C.
    if ( thread_am_ichief( thread ) )
    {
        bli_obj_init_pack( &a1_pack_s );
        bli_obj_init_pack( &c1_pack_s );
    }
    a1_pack = thread_ibroadcast( thread, &a1_pack_s );
    c1_pack = thread_ibroadcast( thread, &c1_pack_s );

    // Pack B (if instructed).
    bli_packm_int( b, b_pack, cntx, cntl_sub_packm_b( cntl ),
                   gemm_thread_sub_opackm( thread ) );

    dim_t my_start, my_end;
    bli_get_range_t2b( thread, a,
                       bli_cntx_get_bmult( cntl_bszid( cntl ), cntx ),
                       &my_start, &my_end );

    // Partition along the m dimension.
    for ( i = my_start; i < my_end; i += b_alg )
    {
        // Determine the current algorithmic blocksize.
        // NOTE: Use of a (for execution datatype) is intentional!
        // This causes the right blocksize to be used if c and a are
        // complex and b is real.
        b_alg = bli_determine_blocksize_f( i, my_end, a,
                                           cntl_bszid( cntl ), cntx );

        // Acquire partitions for A1 and C1.
        bli_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, a, &a1 );
        bli_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, c, &c1 );

        // Initialize objects for packing A1 and C1.
        if ( thread_am_ichief( thread ) )
        {
            bli_packm_init( &a1, a1_pack, cntx, cntl_sub_packm_a( cntl ) );
            bli_packm_init( &c1, c1_pack, cntx, cntl_sub_packm_c( cntl ) );
        }
        thread_ibarrier( thread );

        // Pack A1 (if instructed).
        bli_packm_int( &a1, a1_pack, cntx, cntl_sub_packm_a( cntl ),
                       gemm_thread_sub_ipackm( thread ) );

        // Pack C1 (if instructed).
        bli_packm_int( &c1, c1_pack, cntx, cntl_sub_packm_c( cntl ),
                       gemm_thread_sub_ipackm( thread ) );

        // Perform gemm subproblem.
        bli_gemm_int( &BLIS_ONE, a1_pack, b_pack, &BLIS_ONE, c1_pack,
                      cntx, cntl_sub_gemm( cntl ),
                      gemm_thread_sub_gemm( thread ) );
        thread_ibarrier( thread );

        // Unpack C1 (if C1 was packed). Currently this must be done by
        // a single thread.
        bli_unpackm_int( c1_pack, &c1, cntx, cntl_sub_unpackm_c( cntl ),
                         gemm_thread_sub_ipackm( thread ) );
    }

    // If any packing buffers were acquired within packm, release them back
    // to the memory manager.
    thread_obarrier( thread );
    if ( thread_am_ochief( thread ) )
        bli_packm_release( b_pack, cntl_sub_packm_b( cntl ) );
    if ( thread_am_ichief( thread ) )
    {
        bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) );
        bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) );
    }
}
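The loop above recomputes b_alg each iteration via bli_determine_blocksize_f(). The useful detail is its edge-case policy when partitioning forward: rather than emitting a tiny trailing partition, the final partition may be extended beyond the nominal blocksize b_alg, up to a maximum b_max. The following standalone sketch (hypothetical helper name and flat int interface; not the BLIS routine itself) illustrates that policy.

    #include <stdio.h>

    // Hypothetical sketch of forward blocksize determination: if what
    // remains of [i, dim) fits within b_max, take it all in one final
    // partition; otherwise use the nominal algorithmic blocksize b_alg.
    static int determine_blocksize_f_sketch( int i, int dim, int b_alg, int b_max )
    {
        int dim_left = dim - i;  // iterations remaining in [i, dim)

        return ( dim_left <= b_max ) ? dim_left : b_alg;
    }

    int main( void )
    {
        // Example: dim = 980, b_alg = 96, b_max = 128 yields nine
        // partitions of 96 followed by one extended partition of 116,
        // instead of a trailing pair of 96 and 20.
        int dim = 980, b_alg = 96, b_max = 128;
        for ( int i = 0; i < dim; )
        {
            int b = determine_blocksize_f_sketch( i, dim, b_alg, b_max );
            printf( "[%d, %d)\n", i, i + b );
            i += b;
        }
        return 0;
    }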
void bli_trmm_blk_var2b
     (
       obj_t*          a,
       obj_t*          b,
       obj_t*          c,
       cntx_t*         cntx,
       gemm_t*         cntl,
       trmm_thrinfo_t* thread
     )
{
    obj_t  a_pack_s;
    obj_t  b1_pack_s, c1_pack_s;

    obj_t  b1, c1;
    obj_t* a_pack  = NULL;
    obj_t* b1_pack = NULL;
    obj_t* c1_pack = NULL;

    dim_t i;
    dim_t b_alg;

    // Prune any zero region that exists along the partitioning dimension.
    bli_trmm_prune_unref_mparts_n( a, b, c );

    if ( thread_am_ochief( thread ) )
    {
        // Initialize object for packing A.
        bli_obj_init_pack( &a_pack_s );
        bli_packm_init( a, &a_pack_s, cntx, cntl_sub_packm_a( cntl ) );

        // Scale C by beta (if instructed).
        bli_scalm_int( &BLIS_ONE, c, cntx, cntl_sub_scalm( cntl ) );
    }
    a_pack = thread_obroadcast( thread, &a_pack_s );

    // Initialize pack objects for B and C that are passed into packm_init().
    if ( thread_am_ichief( thread ) )
    {
        bli_obj_init_pack( &b1_pack_s );
        bli_obj_init_pack( &c1_pack_s );
    }
    b1_pack = thread_ibroadcast( thread, &b1_pack_s );
    c1_pack = thread_ibroadcast( thread, &c1_pack_s );

    // Pack A (if instructed).
    bli_packm_int( a, a_pack, cntx, cntl_sub_packm_a( cntl ),
                   trmm_thread_sub_opackm( thread ) );

    dim_t my_start, my_end;
    bli_get_range_weighted_r2l( thread, b,
                                bli_cntx_get_bmult( cntl_bszid( cntl ), cntx ),
                                &my_start, &my_end );

    // Partition along the n dimension.
    for ( i = my_start; i < my_end; i += b_alg )
    {
        // Determine the current algorithmic blocksize.
        b_alg = bli_determine_blocksize_b( i, my_end, b,
                                           cntl_bszid( cntl ), cntx );

        // Acquire partitions for B1 and C1.
        bli_acquire_mpart_r2l( BLIS_SUBPART1, i, b_alg, b, &b1 );
        bli_acquire_mpart_r2l( BLIS_SUBPART1, i, b_alg, c, &c1 );

        // Initialize objects for packing B1 and C1.
        if ( thread_am_ichief( thread ) )
        {
            bli_packm_init( &b1, b1_pack, cntx, cntl_sub_packm_b( cntl ) );
            bli_packm_init( &c1, c1_pack, cntx, cntl_sub_packm_c( cntl ) );
        }
        thread_ibarrier( thread );

        // Pack B1 (if instructed).
        bli_packm_int( &b1, b1_pack, cntx, cntl_sub_packm_b( cntl ),
                       trmm_thread_sub_ipackm( thread ) );

        // Pack C1 (if instructed).
        bli_packm_int( &c1, c1_pack, cntx, cntl_sub_packm_c( cntl ),
                       trmm_thread_sub_ipackm( thread ) );

        // Perform trmm subproblem.
        bli_trmm_int( &BLIS_ONE, a_pack, b1_pack, &BLIS_ONE, c1_pack,
                      cntx, cntl_sub_gemm( cntl ),
                      trmm_thread_sub_trmm( thread ) );
        thread_ibarrier( thread );

        // Unpack C1 (if C1 was packed).
        bli_unpackm_int( c1_pack, &c1, cntx, cntl_sub_unpackm_c( cntl ),
                         trmm_thread_sub_ipackm( thread ) );
    }

    // If any packing buffers were acquired within packm, release them back
    // to the memory manager.
    thread_obarrier( thread );
    if ( thread_am_ochief( thread ) )
        bli_packm_release( a_pack, cntl_sub_packm_a( cntl ) );
    if ( thread_am_ichief( thread ) )
    {
        bli_packm_release( b1_pack, cntl_sub_packm_b( cntl ) );
        bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) );
    }
}
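This variant calls bli_get_range_weighted_r2l() because, for trmm, the matrix being partitioned is triangular: equal-width column ranges would give threads on the thin end far less work. The sketch below illustrates the underlying math for the simplest case, a lower-triangular n x n region partitioned along columns, where column j holds roughly (n - j) stored elements, so the cumulative area up to column x is about x*n - x^2/2; setting that equal to a fraction t/nt of the total area n^2/2 and solving the quadratic gives the boundary. The helper name is hypothetical, and this sketch deliberately ignores two things the real BLIS routines handle: rounding boundaries to multiples of the blocksize multiple, and the r2l vs. l2r direction.

    #include <math.h>
    #include <stdio.h>

    // Hypothetical sketch: column boundary where the cumulative triangular
    // area reaches fraction t/nt of the total, i.e. x = n * (1 - sqrt(1 - t/nt)).
    static int weighted_boundary_sketch( int n, int nt, int t )
    {
        double frac = (double)t / (double)nt;
        double x    = n * ( 1.0 - sqrt( 1.0 - frac ) );
        return (int)( x + 0.5 );
    }

    int main( void )
    {
        // Example: n = 1000, nt = 4 gives boundaries 0, 134, 293, 500, 1000:
        // threads covering the thin end of the triangle receive many more
        // columns, so each thread performs roughly equal flops.
        int n = 1000, nt = 4;
        for ( int t = 0; t <= nt; t++ )
            printf( "boundary %d: column %d\n", t,
                    weighted_boundary_sketch( n, nt, t ) );
        return 0;
    }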