void blx_gemm_blk_var3 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { obj_t a1, b1; dim_t i; dim_t b_alg; dim_t k_trans; // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( a ); // Partition along the k dimension. for ( i = 0; i < k_trans; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = blx_determine_blocksize_f( i, k_trans, c, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and B1. bli_acquire_mpart_ndim( BLIS_FWD, BLIS_SUBPART1, i, b_alg, a, &a1 ); bli_acquire_mpart_mdim( BLIS_FWD, BLIS_SUBPART1, i, b_alg, b, &b1 ); // Perform gemm subproblem. blx_gemm_int ( &a1, &b1, c, cntx, rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); bli_thread_obarrier( bli_thrinfo_sub_node( thread ) ); // This variant executes multiple rank-k updates. Therefore, if the // internal beta scalar on matrix C is non-zero, we must use it // only for the first iteration (and then BLIS_ONE for all others). // And since c is a locally aliased obj_t, we can simply overwrite // the internal beta scalar with BLIS_ONE once it has been used in // the first iteration. if ( i == 0 ) bli_obj_scalar_reset( c ); } }
//
// Blocked trsm, variant 3: step through the k dimension (in the direction
// reported by bli_l3_direct()) and issue one trsm subproblem per step via
// bli_trsm_int(), using the child control tree node and the child
// thrinfo_t node.
//
void bli_trsm_blk_var3
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    obj_t a_part, b_part;
    dir_t part_dir;
    dim_t k_len;
    dim_t k_off;
    dim_t k_step;

    // Forwards or backwards traversal of k.
    part_dir = bli_l3_direct( a, b, c, cntl );

    // Drop any zero region lying along the partitioning dimension.
    bli_l3_prune_unref_mparts_k( a, b, c, cntl );

    // Extent of the dimension being partitioned.
    k_len = bli_obj_width_after_trans( *a );

    k_off = 0;
    while ( k_off < k_len )
    {
        // How much of k to consume on this step.
        k_step = bli_trsm_determine_kc( part_dir, k_off, k_len, a, b,
                                        bli_cntl_bszid( cntl ), cntx );

        // Carve out the current panels of A and B.
        bli_acquire_mpart_ndim( part_dir, BLIS_SUBPART1,
                                k_off, k_step, a, &a_part );
        bli_acquire_mpart_mdim( part_dir, BLIS_SUBPART1,
                                k_off, k_step, b, &b_part );

        // Solve the subproblem for this step.
        bli_trsm_int
        (
          &BLIS_ONE, &a_part, &b_part, &BLIS_ONE, c, cntx,
          bli_cntl_sub_node( cntl ),
          bli_thrinfo_sub_node( thread )
        );

        //bli_thread_ibarrier( thread );

        // Synchronize on the child thrinfo_t node before the next step.
        bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );

        // Every step is a rank-k update, so the scalars attached to
        // A, B and C may only take effect on the first step; reset
        // them once they have been used.
        if ( k_off == 0 )
        {
            bli_obj_scalar_reset( a );
            bli_obj_scalar_reset( b );
            bli_obj_scalar_reset( c );
        }

        k_off += k_step;
    }
}
void bli_trmm_blk_var2b( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, gemm_t* cntl, thrinfo_t* thread ) { obj_t a_pack_s; obj_t b1_pack_s, c1_pack_s; obj_t b1, c1; obj_t* a_pack = NULL; obj_t* b1_pack = NULL; obj_t* c1_pack = NULL; dim_t i; dim_t b_alg; // Prune any zero region that exists along the partitioning dimension. bli_trmm_prune_unref_mparts_n( a, b, c ); if( bli_thread_am_ochief( thread ) ) { // Initialize object for packing A bli_obj_init_pack( &a_pack_s ); bli_packm_init( a, &a_pack_s, cntx, bli_cntl_sub_packm_a( cntl ) ); // Scale C by beta (if instructed). bli_scalm_int( &BLIS_ONE, c, cntx, bli_cntl_sub_scalm( cntl ) ); } a_pack = bli_thread_obroadcast( thread, &a_pack_s ); // Initialize pack objects for B and C that are passed into packm_init(). if( bli_thread_am_ichief( thread ) ) { bli_obj_init_pack( &b1_pack_s ); bli_obj_init_pack( &c1_pack_s ); } b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s ); c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s ); // Pack A (if instructed). bli_packm_int( a, a_pack, cntx, bli_cntl_sub_packm_a( cntl ), bli_thrinfo_sub_opackm( thread ) ); dim_t my_start, my_end; bli_thread_get_range_weighted_r2l( thread, b, bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), &my_start, &my_end ); // Partition along the n dimension. for ( i = my_start; i < my_end; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_b( i, my_end, b, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for B1 and C1. bli_acquire_mpart_r2l( BLIS_SUBPART1, i, b_alg, b, &b1 ); bli_acquire_mpart_r2l( BLIS_SUBPART1, i, b_alg, c, &c1 ); // Initialize objects for packing A1 and B1. if( bli_thread_am_ichief( thread ) ) { bli_packm_init( &b1, b1_pack, cntx, bli_cntl_sub_packm_b( cntl ) ); bli_packm_init( &c1, c1_pack, cntx, bli_cntl_sub_packm_c( cntl ) ); } bli_thread_ibarrier( thread ); // Pack B1 (if instructed). 
bli_packm_int( &b1, b1_pack, cntx, bli_cntl_sub_packm_b( cntl ), bli_thrinfo_sub_ipackm( thread ) ); // Pack C1 (if instructed). bli_packm_int( &c1, c1_pack, cntx, bli_cntl_sub_packm_c( cntl ), bli_thrinfo_sub_ipackm( thread ) ); // Perform trmm subproblem. bli_trmm_int( &BLIS_ONE, a_pack, b1_pack, &BLIS_ONE, c1_pack, cntx, bli_cntl_sub_gemm( cntl ), bli_thrinfo_sub_self( thread ) ); bli_thread_ibarrier( thread ); // Unpack C1 (if C1 was packed). bli_unpackm_int( c1_pack, &c1, cntx, bli_cntl_sub_unpackm_c( cntl ), bli_thrinfo_sub_ipackm( thread ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. bli_thread_obarrier( thread ); if( bli_thread_am_ochief( thread ) ) bli_packm_release( a_pack, bli_cntl_sub_packm_a( cntl ) ); if( bli_thread_am_ichief( thread ) ) { bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) ); bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) ); } }
void bli_gemm_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl, thrinfo_t* thread ) { obj_t a_local; obj_t b_local; obj_t c_local; gemm_voft f; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_gemm_basic_check( alpha, a, b, beta, c, cntx ); // If C has a zero dimension, return early. if ( bli_obj_has_zero_dim( *c ) ) return; // If A or B has a zero dimension, scale C by beta and return early. if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *b ) ) { if ( bli_thread_am_ochief( thread ) ) bli_scalm( beta, c ); bli_thread_obarrier( thread ); return; } // If A or B is marked as being filled with zeros, scale C by beta and // return early. if ( bli_obj_is_zeros( *a ) || bli_obj_is_zeros( *b ) ) { // This should never execute. bli_abort(); if ( bli_thread_am_ochief( thread ) ) bli_scalm( beta, c ); bli_thread_obarrier( thread ); return; } // Alias A, B, and C in case we need to update attached scalars. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *c, c_local ); // If alpha is non-unit, typecast and apply it to the scalar attached // to B. if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { bli_obj_scalar_apply_scalar( alpha, &b_local ); } // If beta is non-unit, typecast and apply it to the scalar attached // to C. if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { bli_obj_scalar_apply_scalar( beta, &c_local ); } // Create the next node in the thrinfo_t structure. bli_thrinfo_grow( cntx, cntl, thread ); // Extract the function pointer from the current control tree node. f = bli_cntl_var_func( cntl ); // Somewhat hackish support for 3m3, 3m2, and 4m1b method implementations. 
{ ind_t im = bli_cntx_get_ind_method( cntx ); if ( im != BLIS_NAT ) { if ( im == BLIS_3M3 && f == bli_gemm_packa ) f = bli_gemm3m3_packa; else if ( im == BLIS_3M2 && f == bli_gemm_ker_var2 ) f = bli_gemm3m2_ker_var2; else if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2; } } // Invoke the variant. f ( &a_local, &b_local, &c_local, cntx, cntl, thread ); }
void bli_trsm_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { obj_t a_local; obj_t b_local; obj_t c_local; trsm_var_oft f; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_gemm_basic_check( alpha, a, b, beta, c, cntx ); // If C has a zero dimension, return early. if ( bli_obj_has_zero_dim( c ) ) return; // If A or B has a zero dimension, scale C by beta and return early. if ( bli_obj_has_zero_dim( a ) || bli_obj_has_zero_dim( b ) ) { if ( bli_thread_am_ochief( thread ) ) bli_scalm( beta, c ); bli_thread_obarrier( thread ); return; } // Alias A and B in case we need to update attached scalars. bli_obj_alias_to( a, &a_local ); bli_obj_alias_to( b, &b_local ); // Alias C in case we need to induce a transposition. bli_obj_alias_to( c, &c_local ); // If we are about to call a leaf-level implementation, and matrix C // still needs a transposition, then we must induce one by swapping the // strides and dimensions. Note that this transposition would normally // be handled explicitly in the packing of C, but if C is not being // packed, this is our last chance to handle the transposition. if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( c ) ) { bli_obj_induce_trans( &c_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &c_local ); } // If beta is non-unit, apply it to the scalar attached to C. if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { bli_obj_scalar_apply_scalar( beta, &c_local ); } // Set two bools: one based on the implied side parameter (the structure // of the root object) and one based on the uplo field of the triangular // matrix's root object (whether that is matrix A or matrix B). if ( bli_obj_root_is_triangular( a ) ) { // If alpha is non-unit, typecast and apply it to the scalar // attached to B (the non-triangular matrix). 
if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { bli_obj_scalar_apply_scalar( alpha, &b_local ); } } else // if ( bli_obj_root_is_triangular( b ) ) { // If alpha is non-unit, typecast and apply it to the scalar // attached to A (the non-triangular matrix). if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { bli_obj_scalar_apply_scalar( alpha, &a_local ); } } // FGVZ->TMS: Is this barrier still needed? bli_thread_obarrier( thread ); // Create the next node in the thrinfo_t structure. bli_thrinfo_grow( rntm, cntl, thread ); // Extract the function pointer from the current control tree node. f = bli_cntl_var_func( cntl ); // Invoke the variant. f ( &a_local, &b_local, &c_local, cntx, rntm, cntl, thread ); }
void bli_unpackm_int( obj_t* p, obj_t* a, cntx_t* cntx, unpackm_t* cntl, thrinfo_t* thread ) { // The unpackm operation consists of an optional post-process: castm. // (This post-process is analogous to the castm pre-process in packm.) // Here are the following possible ways unpackm can execute: // 1. unpack and cast: Unpack to a temporary matrix c and then cast // c to a. // 2. unpack only: Unpack directly to matrix a since typecasting is // not needed. // 3. cast only: Not yet supported / not used. // 4. no-op: The control tree directs us to skip the unpack operation // entirely. No action is taken. obj_t c; varnum_t n; impl_t i; FUNCPTR_T f; // Sanity check; A should never have a zero dimension. If we must support // it, then we should fold it into the next alias-and-early-exit block. //if ( bli_obj_has_zero_dim( *a ) ) bli_abort(); // First check if we are to skip this operation because the control tree // is NULL, and if so, simply return. if ( bli_cntl_is_noop( cntl ) ) { return; } // If p was aliased to a during the pack stage (because it was already // in an acceptable packed/contiguous format), then no unpack is actually // necessary, so we return. if ( bli_obj_is_alias_of( *p, *a ) ) { return; } // Check parameters. if ( bli_error_checking_is_enabled() ) bli_unpackm_check( p, a, cntx, cntl ); // Now, if we are not skipping the unpack operation, then the only // question left is whether we are to typecast matrix a after unpacking. if ( bli_obj_datatype( *p ) != bli_obj_datatype( *a ) ) bli_abort(); /* if ( bli_obj_datatype( *p ) != bli_obj_datatype( *a ) ) { // Initialize an object c for the intermediate typecast matrix. bli_unpackm_init_cast( p, a, &c ); } else */ { // If no cast is needed, then aliasing object c to the original // matrix serves as a minor optimization. This causes the unpackm // implementation to unpack directly into matrix a. bli_obj_alias_to( *a, c ); } // Now we are ready to proceed with the unpacking. 
// Extract the variant number and implementation type. n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. f = vars[n][i]; // Invoke the variant. if( bli_thread_am_ochief( thread ) ) { f( p, &c, cntx, cntl ); } bli_thread_obarrier( thread ); // Now, if necessary, we cast the contents of c to matrix a. If casting // was not necessary, then we are done because the call to the unpackm // implementation would have unpacked directly to matrix a. /* if ( bli_obj_datatype( *p ) != bli_obj_datatype( *a ) ) { // Copy/typecast matrix c to matrix a. // NOTE: Here, we use copynzm instead of copym because, in the cases // where we are unpacking/typecasting a real matrix c to a complex // matrix a, we want to touch only the real components of a, rather // than also set the imaginary components to zero. This comes about // because of the fact that, if we are unpacking real-to-complex, // then it is because all of the computation occurred in the real // domain, and so we would want to leave whatever imaginary values // there are in matrix a untouched. Notice that for unpackings that // entail complex-to-complex data movements, the copynzm operation // behaves exactly as copym, so no use cases are lost (at least none // that I can think of). bli_copynzm( &c, a ); // NOTE: The above code/comment is outdated. What should happen is // as follows: // - If dt(a) is complex and dt(p) is real, then create an alias of // a and then tweak it so that it looks like a real domain object. // This will involve: // - projecting the datatype to real domain // - scaling both the row and column strides by 2 // ALL OF THIS should be done in the front-end, NOT here, as // unpackm() won't even be needed in that case. } */ }
void bli_gemm_int( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, gemm_t* cntl, thrinfo_t* thread ) { obj_t a_local; obj_t b_local; obj_t c_local; varnum_t n; impl_t i; FUNCPTR_T f; ind_t im; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_gemm_basic_check( alpha, a, b, beta, c, cntx ); // If C has a zero dimension, return early. if ( bli_obj_has_zero_dim( *c ) ) return; // If A or B has a zero dimension, scale C by beta and return early. if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *b ) ) { if( bli_thread_am_ochief( thread ) ) bli_scalm( beta, c ); bli_thread_obarrier( thread ); return; } // If A or B is marked as being filled with zeros, scale C by beta and // return early. if ( bli_obj_is_zeros( *a ) || bli_obj_is_zeros( *b ) ) { if( bli_thread_am_ochief( thread ) ) bli_scalm( beta, c ); bli_thread_obarrier( thread ); return; } // Alias A and B in case we need to update attached scalars. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); // Alias C in case we need to induce a transposition. bli_obj_alias_to( *c, c_local ); // If we are about to call a leaf-level implementation, and matrix C // still needs a transposition, then we must induce one by swapping the // strides and dimensions. Note that this transposition would normally // be handled explicitly in the packing of C, but if C is not being // packed, this is our last chance to handle the transposition. if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { //if( bli_thread_am_ochief( thread ) ) { bli_obj_induce_trans( c_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); // } } // If alpha is non-unit, typecast and apply it to the scalar attached // to B. if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { bli_obj_scalar_apply_scalar( alpha, &b_local ); } // If beta is non-unit, typecast and apply it to the scalar attached // to C. 
if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { bli_obj_scalar_apply_scalar( beta, &c_local ); } // Extract the variant number and implementation type. n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. f = vars[n][i]; // Somewhat hackish support for 3m3, 3m2, and 4m1b method implementations. im = bli_cntx_get_ind_method( cntx ); if ( im != BLIS_NAT ) { if ( im == BLIS_3M3 && f == bli_gemm_blk_var1f ) f = bli_gemm_blk_var4f; else if ( im == BLIS_3M2 && f == bli_gemm_ker_var2 ) f = bli_gemm_ker_var4; else if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm_ker_var3; } // Invoke the variant. f( &a_local, &b_local, &c_local, cntx, cntl, thread ); }
//
// Internal packm dispatcher: packs object a into p using the variant chosen
// by the control tree node, unless one of the skip conditions below applies.
// Whenever the routine returns early it does so without touching p; the
// aliasing of a to p for those cases has already been done in packm_init().
//
void bli_packm_int
     (
       obj_t*     a,
       obj_t*     p,
       cntx_t*    cntx,
       packm_t*   cntl,
       thrinfo_t* thread
     )
{
    varnum_t  var_num;
    impl_t    impl;
    FUNCPTR_T var_fp;

    // Validate the operands when error checking is switched on.
    if ( bli_error_checking_is_enabled() )
    {
        bli_packm_int_check( a, p, cntx );
    }

    // Sanity check; A should never have a zero dimension. If that ever
    // needs to be supported, fold it into the early exits below.
    //if ( bli_obj_has_zero_dim( *a ) ) bli_abort();

    // Skip: the control tree node is NULL, i.e. packing was not
    // requested.
    if ( bli_cntl_is_noop( cntl ) )
    {
        return;
    }

    // Skip: the object is already packed. Two situations qualify.
    //  - Its schema is BLIS_PACKED_UNSPEC. That status is only used when
    //    the exact format does not matter as long as rows or columns are
    //    contiguous (e.g. matrix operands of level-2 operations), so the
    //    control tree need not even be consulted.
    //  - Its schema already equals the one requested by the control tree
    //    (cntl is known to be non-NULL at this point).
    // Usually the status is BLIS_NOT_PACKED and neither test fires. Note
    // that not every combination of current status and requested schema
    // is valid.
    if ( ( bli_obj_pack_schema( *a ) == BLIS_PACKED_UNSPEC ) ||
         ( bli_obj_pack_schema( *a ) == cntl_pack_schema( cntl ) ) )
    {
        return;
    }

    // Skip: an object flagged as all zeros does not need to be packed.
    if ( bli_obj_is_zeros( *a ) )
    {
        return;
    }

    // Look up the variant function from the variant number and the
    // implementation type stored in the control tree node.
    var_num = bli_cntl_var_num( cntl );
    impl    = bli_cntl_impl_type( cntl );
    var_fp  = vars[var_num][impl];

    // Perform the packing.
    var_fp( a, p, cntx, thread );

    // Make sure packing has finished before any computation starts.
    bli_thread_obarrier( thread );
}