void bli_l3_thrinfo_free
     (
       thrinfo_t* thread
     )
{
    if ( thread == NULL ||
         thread == &BLIS_PACKM_SINGLE_THREADED ||
         thread == &BLIS_GEMM_SINGLE_THREADED ) return;

    thrinfo_t* thrinfo_sub_node = bli_thrinfo_sub_node( thread );

    // Free the communicators, but only if the current thrinfo_t struct
    // is marked as needing them to be freed. The most common example of
    // thrinfo_t nodes NOT marked as needing their comms freed are those
    // associated with packm thrinfo_t nodes.
    if ( bli_thrinfo_needs_free_comm( thread ) )
    {
        // The ochief always frees its communicator, and the ichief frees
        // its communicator if we are at the leaf node.
        if ( bli_thread_am_ochief( thread ) )
            bli_thrcomm_free( bli_thrinfo_ocomm( thread ) );
    }

    // Free all children of the current thrinfo_t.
    bli_l3_thrinfo_free( thrinfo_sub_node );

    // Free the thrinfo_t struct.
    bli_free_intl( thread );
}
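// A minimal, self-contained sketch of the recursive teardown pattern used
// above. node_t, am_chief, and needs_free_comm are hypothetical stand-ins,
// not the BLIS API: only the designated "chief" of a thread group frees the
// shared communicator, and every node frees its subtree before itself.

#include <stdlib.h>
#include <stdbool.h>

typedef struct node_s
{
    struct node_s* sub_node;        // child node (NULL at the leaf)
    void*          comm;            // communicator shared with siblings
    bool           needs_free_comm; // does this node own its comm?
    bool           am_chief;        // is this thread the group's chief?
} node_t;

void node_free( node_t* n )
{
    if ( n == NULL ) return;

    // Only the chief frees the shared communicator, and only if this
    // node is marked as owning it.
    if ( n->needs_free_comm && n->am_chief ) free( n->comm );

    // Recurse into the subtree, then free the node itself.
    node_free( n->sub_node );
    free( n );
}

int main( void )
{
    node_t* leaf = malloc( sizeof( node_t ) );
    *leaf = ( node_t ){ NULL, malloc( 16 ), true, true };

    node_t* root = malloc( sizeof( node_t ) );
    *root = ( node_t ){ leaf, malloc( 16 ), true, true };

    node_free( root ); // frees both comms, then both nodes
    return 0;
}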
void bli_trmm_blk_var2b( obj_t*     a,
                         obj_t*     b,
                         obj_t*     c,
                         cntx_t*    cntx,
                         gemm_t*    cntl,
                         thrinfo_t* thread )
{
    obj_t a_pack_s;
    obj_t b1_pack_s, c1_pack_s;

    obj_t b1, c1;
    obj_t* a_pack  = NULL;
    obj_t* b1_pack = NULL;
    obj_t* c1_pack = NULL;

    dim_t i;
    dim_t b_alg;

    // Prune any zero region that exists along the partitioning dimension.
    bli_trmm_prune_unref_mparts_n( a, b, c );

    if ( bli_thread_am_ochief( thread ) )
    {
        // Initialize an object for packing A.
        bli_obj_init_pack( &a_pack_s );
        bli_packm_init( a, &a_pack_s, cntx, bli_cntl_sub_packm_a( cntl ) );

        // Scale C by beta (if instructed).
        bli_scalm_int( &BLIS_ONE, c, cntx, bli_cntl_sub_scalm( cntl ) );
    }
    a_pack = bli_thread_obroadcast( thread, &a_pack_s );

    // Initialize pack objects for B and C that are passed into packm_init().
    if ( bli_thread_am_ichief( thread ) )
    {
        bli_obj_init_pack( &b1_pack_s );
        bli_obj_init_pack( &c1_pack_s );
    }
    b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s );
    c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s );

    // Pack A (if instructed).
    bli_packm_int( a, a_pack, cntx, bli_cntl_sub_packm_a( cntl ),
                   bli_thrinfo_sub_opackm( thread ) );

    dim_t my_start, my_end;
    bli_thread_get_range_weighted_r2l( thread, b,
                                       bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ),
                                       &my_start, &my_end );

    // Partition along the n dimension.
    for ( i = my_start; i < my_end; i += b_alg )
    {
        // Determine the current algorithmic blocksize.
        b_alg = bli_determine_blocksize_b( i, my_end, b,
                                           bli_cntl_bszid( cntl ), cntx );

        // Acquire partitions for B1 and C1.
        bli_acquire_mpart_r2l( BLIS_SUBPART1, i, b_alg, b, &b1 );
        bli_acquire_mpart_r2l( BLIS_SUBPART1, i, b_alg, c, &c1 );

        // Initialize objects for packing B1 and C1.
        if ( bli_thread_am_ichief( thread ) )
        {
            bli_packm_init( &b1, b1_pack, cntx, bli_cntl_sub_packm_b( cntl ) );
            bli_packm_init( &c1, c1_pack, cntx, bli_cntl_sub_packm_c( cntl ) );
        }
        bli_thread_ibarrier( thread );

        // Pack B1 (if instructed).
        bli_packm_int( &b1, b1_pack, cntx, bli_cntl_sub_packm_b( cntl ),
                       bli_thrinfo_sub_ipackm( thread ) );

        // Pack C1 (if instructed).
        bli_packm_int( &c1, c1_pack, cntx, bli_cntl_sub_packm_c( cntl ),
                       bli_thrinfo_sub_ipackm( thread ) );

        // Perform the trmm subproblem.
        bli_trmm_int( &BLIS_ONE, a_pack, b1_pack, &BLIS_ONE, c1_pack,
                      cntx, bli_cntl_sub_gemm( cntl ),
                      bli_thrinfo_sub_self( thread ) );
        bli_thread_ibarrier( thread );

        // Unpack C1 (if C1 was packed).
        bli_unpackm_int( c1_pack, &c1, cntx, bli_cntl_sub_unpackm_c( cntl ),
                         bli_thrinfo_sub_ipackm( thread ) );
    }

    // If any packing buffers were acquired within packm, release them back
    // to the memory manager.
    bli_thread_obarrier( thread );
    if ( bli_thread_am_ochief( thread ) )
        bli_packm_release( a_pack, bli_cntl_sub_packm_a( cntl ) );
    if ( bli_thread_am_ichief( thread ) )
    {
        bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) );
        bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) );
    }
}
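// A hedged sketch of the loop structure above: each thread receives a
// contiguous [my_start, my_end) slice of the n dimension and walks it in
// algorithmic blocks. The even range split below is a simplification; the
// real bli_thread_get_range_weighted_r2l() also aligns ranges to blocksize
// multiples and can weight them for triangular matrices, and the real
// bli_determine_blocksize_b() handles the edge block differently.

#include <stdio.h>

int main( void )
{
    const int n = 1000, b_max = 96;
    const int n_threads = 4;

    for ( int tid = 0; tid < n_threads; ++tid )
    {
        // Simplified even split of [0, n) among n_threads.
        int my_start = ( n *   tid       ) / n_threads;
        int my_end   = ( n * ( tid + 1 ) ) / n_threads;

        for ( int i = my_start; i < my_end; )
        {
            // Clamp the last block to whatever remains of the range.
            int b_alg = ( my_end - i < b_max ? my_end - i : b_max );

            printf( "thread %d: block [%d, %d)\n", tid, i, i + b_alg );
            i += b_alg;
        }
    }
    return 0;
}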
void bli_gemm_int
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    obj_t     a_local;
    obj_t     b_local;
    obj_t     c_local;
    gemm_voft f;

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
        bli_gemm_basic_check( alpha, a, b, beta, c, cntx );

    // If C has a zero dimension, return early.
    if ( bli_obj_has_zero_dim( *c ) ) return;

    // If A or B has a zero dimension, scale C by beta and return early.
    if ( bli_obj_has_zero_dim( *a ) ||
         bli_obj_has_zero_dim( *b ) )
    {
        if ( bli_thread_am_ochief( thread ) )
            bli_scalm( beta, c );
        bli_thread_obarrier( thread );
        return;
    }

    // If A or B is marked as being filled with zeros, scale C by beta and
    // return early.
    if ( bli_obj_is_zeros( *a ) ||
         bli_obj_is_zeros( *b ) )
    {
        // This should never execute.
        bli_abort();

        if ( bli_thread_am_ochief( thread ) )
            bli_scalm( beta, c );
        bli_thread_obarrier( thread );
        return;
    }

    // Alias A, B, and C in case we need to update attached scalars.
    bli_obj_alias_to( *a, a_local );
    bli_obj_alias_to( *b, b_local );
    bli_obj_alias_to( *c, c_local );

    // If alpha is non-unit, typecast and apply it to the scalar attached
    // to B.
    if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
    {
        bli_obj_scalar_apply_scalar( alpha, &b_local );
    }

    // If beta is non-unit, typecast and apply it to the scalar attached
    // to C.
    if ( !bli_obj_equals( beta, &BLIS_ONE ) )
    {
        bli_obj_scalar_apply_scalar( beta, &c_local );
    }

    // Create the next node in the thrinfo_t structure.
    bli_thrinfo_grow( cntx, cntl, thread );

    // Extract the function pointer from the current control tree node.
    f = bli_cntl_var_func( cntl );

    // Somewhat hackish support for 3m3, 3m2, and 4m1b method implementations.
    {
        ind_t im = bli_cntx_get_ind_method( cntx );

        if ( im != BLIS_NAT )
        {
            if      ( im == BLIS_3M3  && f == bli_gemm_packa    ) f = bli_gemm3m3_packa;
            else if ( im == BLIS_3M2  && f == bli_gemm_ker_var2 ) f = bli_gemm3m2_ker_var2;
            else if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2;
        }
    }

    // Invoke the variant.
    f
    (
      &a_local,
      &b_local,
      &c_local,
      cntx,
      cntl,
      thread
    );
}
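// A small sketch of the function-pointer override idiom above, with
// hypothetical types and functions (method_t, step_native, step_3m3) in
// place of the BLIS ones. The control tree hands back a default variant
// pointer; when an induced method is active, known pointer values are
// compared against and swapped for the method-specific implementation.

#include <stdio.h>

typedef enum { NATIVE, METHOD_3M3 } method_t;
typedef void (*var_fp)( void );

void step_native( void ) { puts( "native step" ); }
void step_3m3   ( void ) { puts( "3m3-specific step" ); }

int main( void )
{
    method_t im = METHOD_3M3;
    var_fp   f  = step_native; // pointer as extracted from a control tree

    // Compare against the known default and substitute the override.
    if ( im != NATIVE && f == step_native ) f = step_3m3;

    f(); // prints "3m3-specific step"
    return 0;
}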
void bli_packm_unb_var1
     (
       obj_t*     c,
       obj_t*     p,
       cntx_t*    cntx,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    num_t   dt_cp    = bli_obj_datatype( *c );

    struc_t strucc   = bli_obj_struc( *c );
    doff_t  diagoffc = bli_obj_diag_offset( *c );
    diag_t  diagc    = bli_obj_diag( *c );
    uplo_t  uploc    = bli_obj_uplo( *c );
    trans_t transc   = bli_obj_conjtrans_status( *c );

    dim_t   m_p      = bli_obj_length( *p );
    dim_t   n_p      = bli_obj_width( *p );
    dim_t   m_max_p  = bli_obj_padded_length( *p );
    dim_t   n_max_p  = bli_obj_padded_width( *p );

    void*   buf_c    = bli_obj_buffer_at_off( *c );
    inc_t   rs_c     = bli_obj_row_stride( *c );
    inc_t   cs_c     = bli_obj_col_stride( *c );

    void*   buf_p    = bli_obj_buffer_at_off( *p );
    inc_t   rs_p     = bli_obj_row_stride( *p );
    inc_t   cs_p     = bli_obj_col_stride( *p );

    void*   buf_kappa;

    FUNCPTR_T f;

    // This variant assumes that the computational kernel will always apply
    // the alpha scalar of the higher-level operation. Thus, we use BLIS_ONE
    // for kappa so that the underlying packm implementation does not scale
    // during packing.
    buf_kappa = bli_obj_buffer_for_const( dt_cp, BLIS_ONE );

    // Index into the type combination array to extract the correct
    // function pointer.
    f = ftypes[dt_cp];

    if ( bli_thread_am_ochief( thread ) )
    {
        // Invoke the function.
        f
        (
          strucc,
          diagoffc,
          diagc,
          uploc,
          transc,
          m_p,
          n_p,
          m_max_p,
          n_max_p,
          buf_kappa,
          buf_c, rs_c, cs_c,
          buf_p, rs_p, cs_p,
          cntx
        );
    }
}
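// A hedged sketch of the ftypes[dt] dispatch used above, with a hypothetical
// two-type table instead of BLIS's generated type-combination array. The
// datatype enum doubles as the array index, so dispatch is a single load.

#include <stdio.h>

typedef enum { DT_FLOAT = 0, DT_DOUBLE = 1, DT_NUM = 2 } dt_t;
typedef void (*pack_fp)( const void* src, void* dst );

void pack_s( const void* src, void* dst ) { *(float* )dst = *(const float* )src; }
void pack_d( const void* src, void* dst ) { *(double*)dst = *(const double*)src; }

// The table is indexed directly by the datatype id.
static const pack_fp ftypes[DT_NUM] = { pack_s, pack_d };

int main( void )
{
    double x = 3.0, y = 0.0;
    ftypes[DT_DOUBLE]( &x, &y ); // dispatches to pack_d
    printf( "%g\n", y );
    return 0;
}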
void bli_trsm_int
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    obj_t        a_local;
    obj_t        b_local;
    obj_t        c_local;
    trsm_var_oft f;

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
        bli_gemm_basic_check( alpha, a, b, beta, c, cntx );

    // If C has a zero dimension, return early.
    if ( bli_obj_has_zero_dim( c ) ) return;

    // If A or B has a zero dimension, scale C by beta and return early.
    if ( bli_obj_has_zero_dim( a ) ||
         bli_obj_has_zero_dim( b ) )
    {
        if ( bli_thread_am_ochief( thread ) )
            bli_scalm( beta, c );
        bli_thread_obarrier( thread );
        return;
    }

    // Alias A and B in case we need to update attached scalars.
    bli_obj_alias_to( a, &a_local );
    bli_obj_alias_to( b, &b_local );

    // Alias C in case we need to induce a transposition.
    bli_obj_alias_to( c, &c_local );

    // If we are about to call a leaf-level implementation, and matrix C
    // still needs a transposition, then we must induce one by swapping the
    // strides and dimensions. Note that this transposition would normally
    // be handled explicitly in the packing of C, but if C is not being
    // packed, this is our last chance to handle the transposition.
    if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( c ) )
    {
        bli_obj_induce_trans( &c_local );
        bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &c_local );
    }

    // If beta is non-unit, apply it to the scalar attached to C.
    if ( !bli_obj_equals( beta, &BLIS_ONE ) )
    {
        bli_obj_scalar_apply_scalar( beta, &c_local );
    }

    // Determine which operand is the triangular matrix (the implied side
    // parameter, encoded in the structure of the root object), and apply
    // alpha to the scalar attached to the other, non-triangular operand.
    if ( bli_obj_root_is_triangular( a ) )
    {
        // If alpha is non-unit, typecast and apply it to the scalar
        // attached to B (the non-triangular matrix).
        if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
        {
            bli_obj_scalar_apply_scalar( alpha, &b_local );
        }
    }
    else // if ( bli_obj_root_is_triangular( b ) )
    {
        // If alpha is non-unit, typecast and apply it to the scalar
        // attached to A (the non-triangular matrix).
        if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
        {
            bli_obj_scalar_apply_scalar( alpha, &a_local );
        }
    }

    // FGVZ->TMS: Is this barrier still needed?
    bli_thread_obarrier( thread );

    // Create the next node in the thrinfo_t structure.
    bli_thrinfo_grow( rntm, cntl, thread );

    // Extract the function pointer from the current control tree node.
    f = bli_cntl_var_func( cntl );

    // Invoke the variant.
    f
    (
      &a_local,
      &b_local,
      &c_local,
      cntx,
      rntm,
      cntl,
      thread
    );
}
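// A minimal sketch of what "inducing a transposition by swapping strides
// and dimensions" means, using a hypothetical mat_t descriptor rather than
// obj_t. After the swap, element (i,j) of the logical transpose lives at
// the same address as element (j,i) of the original, so no data moves.

#include <stdio.h>

typedef struct
{
    int     m, n;   // dimensions
    int     rs, cs; // row and column strides
    double* buf;
} mat_t;

void induce_trans( mat_t* a )
{
    int tmp;
    tmp = a->m;  a->m  = a->n;  a->n  = tmp;
    tmp = a->rs; a->rs = a->cs; a->cs = tmp;
}

int main( void )
{
    double buf[6] = { 1, 2, 3, 4, 5, 6 };
    mat_t a = { 2, 3, 3, 1, buf }; // 2x3, row-major

    induce_trans( &a );            // now a 3x2 column-major view

    // Element (2,1) of the transpose == element (1,2) of the original.
    printf( "%g\n", a.buf[ 2*a.rs + 1*a.cs ] ); // prints 6
    return 0;
}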
void bli_unpackm_int( obj_t*     p,
                      obj_t*     a,
                      cntx_t*    cntx,
                      unpackm_t* cntl,
                      thrinfo_t* thread )
{
    // The unpackm operation consists of an optional post-process: castm.
    // (This post-process is analogous to the castm pre-process in packm.)
    // Here are the possible ways unpackm can execute:
    //  1. unpack and cast: Unpack to a temporary matrix c and then cast
    //     c to a.
    //  2. unpack only: Unpack directly to matrix a since typecasting is
    //     not needed.
    //  3. cast only: Not yet supported / not used.
    //  4. no-op: The control tree directs us to skip the unpack operation
    //     entirely. No action is taken.

    obj_t     c;

    varnum_t  n;
    impl_t    i;
    FUNCPTR_T f;

    // Sanity check: A should never have a zero dimension. If we must support
    // it, then we should fold it into the next alias-and-early-exit block.
    //if ( bli_obj_has_zero_dim( *a ) ) bli_abort();

    // First check if we are to skip this operation because the control tree
    // is NULL, and if so, simply return.
    if ( bli_cntl_is_noop( cntl ) )
    {
        return;
    }

    // If p was aliased to a during the pack stage (because it was already
    // in an acceptable packed/contiguous format), then no unpack is actually
    // necessary, so we return.
    if ( bli_obj_is_alias_of( *p, *a ) )
    {
        return;
    }

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
        bli_unpackm_check( p, a, cntx, cntl );

    // Now, if we are not skipping the unpack operation, then the only
    // question left is whether we are to typecast matrix a after unpacking.
    if ( bli_obj_datatype( *p ) != bli_obj_datatype( *a ) )
        bli_abort();
/*
    if ( bli_obj_datatype( *p ) != bli_obj_datatype( *a ) )
    {
        // Initialize an object c for the intermediate typecast matrix.
        bli_unpackm_init_cast( p, a, &c );
    }
    else
*/
    {
        // If no cast is needed, then aliasing object c to the original
        // matrix serves as a minor optimization. This causes the unpackm
        // implementation to unpack directly into matrix a.
        bli_obj_alias_to( *a, c );
    }

    // Now we are ready to proceed with the unpacking.

    // Extract the variant number and implementation type.
    n = bli_cntl_var_num( cntl );
    i = bli_cntl_impl_type( cntl );

    // Index into the variant array to extract the correct function pointer.
    f = vars[n][i];

    // Invoke the variant.
    if ( bli_thread_am_ochief( thread ) )
    {
        f( p, &c, cntx, cntl );
    }
    bli_thread_obarrier( thread );

    // Now, if necessary, we cast the contents of c to matrix a. If casting
    // was not necessary, then we are done because the call to the unpackm
    // implementation would have unpacked directly to matrix a.
/*
    if ( bli_obj_datatype( *p ) != bli_obj_datatype( *a ) )
    {
        // Copy/typecast matrix c to matrix a.
        // NOTE: Here, we use copynzm instead of copym because, in the cases
        // where we are unpacking/typecasting a real matrix c to a complex
        // matrix a, we want to touch only the real components of a, rather
        // than also set the imaginary components to zero. This comes about
        // because of the fact that, if we are unpacking real-to-complex,
        // then it is because all of the computation occurred in the real
        // domain, and so we would want to leave whatever imaginary values
        // there are in matrix a untouched. Notice that for unpackings that
        // entail complex-to-complex data movements, the copynzm operation
        // behaves exactly as copym, so no use cases are lost (at least none
        // that I can think of).
        bli_copynzm( &c, a );

        // NOTE: The above code/comment is outdated. What should happen is
        // as follows:
        // - If dt(a) is complex and dt(p) is real, then create an alias of
        //   a and then tweak it so that it looks like a real domain object.
        //   This will involve:
        //   - projecting the datatype to the real domain
        //   - scaling both the row and column strides by 2
        //   ALL OF THIS should be done in the front-end, NOT here, as
        //   unpackm() won't even be needed in that case.
    }
*/
}
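// A hedged sketch of the aliasing optimization above, with plain buffers in
// place of obj_t: when the source and destination datatypes match, the
// staging pointer simply aliases the destination so the unpack writes
// directly into it; only a mismatch would require a temporary plus a
// separate cast pass.

#include <stdio.h>
#include <string.h>

int main( void )
{
    double packed[4] = { 1, 2, 3, 4 };
    double a[4]      = { 0 };
    double tmp[4];

    int same_dt = 1;                   // dt(p) == dt(a): no typecast needed
    double* c = same_dt ? a : tmp;     // alias c to a when no cast is needed

    memcpy( c, packed, sizeof( packed ) ); // the "unpack" writes through c

    if ( !same_dt )
    {
        /* cast tmp into a here (skipped when c aliases a) */
    }

    printf( "%g %g %g %g\n", a[0], a[1], a[2], a[3] );
    return 0;
}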
void bli_gemm_int( obj_t*     alpha,
                   obj_t*     a,
                   obj_t*     b,
                   obj_t*     beta,
                   obj_t*     c,
                   cntx_t*    cntx,
                   gemm_t*    cntl,
                   thrinfo_t* thread )
{
    obj_t     a_local;
    obj_t     b_local;
    obj_t     c_local;
    varnum_t  n;
    impl_t    i;
    FUNCPTR_T f;
    ind_t     im;

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
        bli_gemm_basic_check( alpha, a, b, beta, c, cntx );

    // If C has a zero dimension, return early.
    if ( bli_obj_has_zero_dim( *c ) ) return;

    // If A or B has a zero dimension, scale C by beta and return early.
    if ( bli_obj_has_zero_dim( *a ) ||
         bli_obj_has_zero_dim( *b ) )
    {
        if ( bli_thread_am_ochief( thread ) )
            bli_scalm( beta, c );
        bli_thread_obarrier( thread );
        return;
    }

    // If A or B is marked as being filled with zeros, scale C by beta and
    // return early.
    if ( bli_obj_is_zeros( *a ) ||
         bli_obj_is_zeros( *b ) )
    {
        if ( bli_thread_am_ochief( thread ) )
            bli_scalm( beta, c );
        bli_thread_obarrier( thread );
        return;
    }

    // Alias A and B in case we need to update attached scalars.
    bli_obj_alias_to( *a, a_local );
    bli_obj_alias_to( *b, b_local );

    // Alias C in case we need to induce a transposition.
    bli_obj_alias_to( *c, c_local );

    // If we are about to call a leaf-level implementation, and matrix C
    // still needs a transposition, then we must induce one by swapping the
    // strides and dimensions. Note that this transposition would normally
    // be handled explicitly in the packing of C, but if C is not being
    // packed, this is our last chance to handle the transposition.
    if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
    {
        //if ( bli_thread_am_ochief( thread ) ) {
        bli_obj_induce_trans( c_local );
        bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
        //}
    }

    // If alpha is non-unit, typecast and apply it to the scalar attached
    // to B.
    if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
    {
        bli_obj_scalar_apply_scalar( alpha, &b_local );
    }

    // If beta is non-unit, typecast and apply it to the scalar attached
    // to C.
    if ( !bli_obj_equals( beta, &BLIS_ONE ) )
    {
        bli_obj_scalar_apply_scalar( beta, &c_local );
    }

    // Extract the variant number and implementation type.
    n = bli_cntl_var_num( cntl );
    i = bli_cntl_impl_type( cntl );

    // Index into the variant array to extract the correct function pointer.
    f = vars[n][i];

    // Somewhat hackish support for 3m3, 3m2, and 4m1b method implementations.
    im = bli_cntx_get_ind_method( cntx );

    if ( im != BLIS_NAT )
    {
        if      ( im == BLIS_3M3  && f == bli_gemm_blk_var1f ) f = bli_gemm_blk_var4f;
        else if ( im == BLIS_3M2  && f == bli_gemm_ker_var2  ) f = bli_gemm_ker_var4;
        else if ( im == BLIS_4M1B && f == bli_gemm_ker_var2  ) f = bli_gemm_ker_var3;
    }

    // Invoke the variant.
    f( &a_local, &b_local, &c_local, cntx, cntl, thread );
}
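// The chief-scales-then-everyone-barriers idiom used in the early-exit
// paths above, sketched with OpenMP in place of BLIS's thrinfo_t
// communicators (an assumption for illustration; BLIS uses its own
// barriers). Thread 0 plays the role of the ochief: it alone scales C,
// and the barrier keeps the other threads from reading C before the
// scaling is complete.

#include <stdio.h>
#include <omp.h>

int main( void )
{
    double c[4] = { 1, 2, 3, 4 };
    double beta = 2.0;

    #pragma omp parallel num_threads(4)
    {
        if ( omp_get_thread_num() == 0 )     // like bli_thread_am_ochief()
            for ( int i = 0; i < 4; ++i ) c[i] *= beta;

        #pragma omp barrier                  // like bli_thread_obarrier()

        // All threads may now safely read the scaled C.
    }

    printf( "%g %g %g %g\n", c[0], c[1], c[2], c[3] );
    return 0;
}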