void bli_packm_int
     (
       obj_t*     a,
       obj_t*     p,
       cntx_t*    cntx,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    packm_voft f;

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
        bli_packm_int_check( a, p, cntx );

    // Sanity check; A should never have a zero dimension. If we must support
    // it, then we should fold it into the next alias-and-early-exit block.
    //if ( bli_obj_has_zero_dim( *a ) ) bli_abort();

    // Let us now check to see if the object has already been packed. First
    // we check if it has been packed to an unspecified (row or column)
    // format, in which case we can return, since by now aliasing has already
    // taken place in packm_init().
    // NOTE: The reason we don't need to even look at the control tree in
    // this case is as follows: an object's pack status is only set to
    // BLIS_PACKED_UNSPEC for situations when the actual format used is
    // not important, as long as it is packed into contiguous rows or
    // contiguous columns. A good example of this is packing for matrix
    // operands in the level-2 operations.
    if ( bli_obj_pack_schema( *a ) == BLIS_PACKED_UNSPEC )
    {
        return;
    }

    // At this point, we can be assured that cntl is not NULL. Now we check
    // if the object has already been packed to the desired schema (as
    // encoded in the control tree). If so, we can return, as above.
    // NOTE: In most cases, an object's pack status will be BLIS_NOT_PACKED
    // and thus packing will be called for (but in some cases packing has
    // already taken place, or does not need to take place, and so that will
    // be indicated by the pack status). Also, not all combinations of
    // current pack status and desired pack schema are valid.
    if ( bli_obj_pack_schema( *a ) == bli_cntl_packm_params_pack_schema( cntl ) )
    {
        return;
    }

    // If the object is marked as being filled with zeros, then we can skip
    // the packm operation entirely.
    if ( bli_obj_is_zeros( *a ) )
    {
        return;
    }

    // Extract the function pointer from the current control tree node.
    f = bli_cntl_packm_params_var_func( cntl );

    // Invoke the variant.
    f
    (
      a,
      p,
      cntx,
      cntl,
      thread
    );
}
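As a point of reference, the following is a minimal sketch of the variant-oriented function pointer type named above. The name packm_voft comes from the listing itself; the parameter list is inferred from the call f( a, p, cntx, cntl, thread ) rather than copied from the library's headers, so treat it as an assumption.

#include "blis.h" // for obj_t, cntx_t, cntl_t, thrinfo_t

// Sketch (assumed, not taken from the BLIS headers): a packm variant
// receives the source object, the pack object, a context, the current
// control tree node, and the thread info node -- the same five
// arguments forwarded by bli_packm_int() above.
typedef void (*packm_voft)
     (
       obj_t*     a,
       obj_t*     p,
       cntx_t*    cntx,
       cntl_t*    cntl,
       thrinfo_t* thread
     );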
void bli_gemm_int
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    obj_t     a_local;
    obj_t     b_local;
    obj_t     c_local;
    gemm_voft f;

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
        bli_gemm_basic_check( alpha, a, b, beta, c, cntx );

    // If C has a zero dimension, return early.
    if ( bli_obj_has_zero_dim( *c ) ) return;

    // If A or B has a zero dimension, scale C by beta and return early.
    if ( bli_obj_has_zero_dim( *a ) ||
         bli_obj_has_zero_dim( *b ) )
    {
        if ( bli_thread_am_ochief( thread ) )
            bli_scalm( beta, c );
        bli_thread_obarrier( thread );
        return;
    }

    // If A or B is marked as being filled with zeros, scale C by beta and
    // return early.
    if ( bli_obj_is_zeros( *a ) ||
         bli_obj_is_zeros( *b ) )
    {
        // This should never execute.
        bli_abort();

        if ( bli_thread_am_ochief( thread ) )
            bli_scalm( beta, c );
        bli_thread_obarrier( thread );
        return;
    }

    // Alias A, B, and C in case we need to update attached scalars.
    bli_obj_alias_to( *a, a_local );
    bli_obj_alias_to( *b, b_local );
    bli_obj_alias_to( *c, c_local );

    // If alpha is non-unit, typecast and apply it to the scalar attached
    // to B.
    if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
    {
        bli_obj_scalar_apply_scalar( alpha, &b_local );
    }

    // If beta is non-unit, typecast and apply it to the scalar attached
    // to C.
    if ( !bli_obj_equals( beta, &BLIS_ONE ) )
    {
        bli_obj_scalar_apply_scalar( beta, &c_local );
    }

    // Create the next node in the thrinfo_t structure.
    bli_thrinfo_grow( cntx, cntl, thread );

    // Extract the function pointer from the current control tree node.
    f = bli_cntl_var_func( cntl );

    // Somewhat hackish support for 3m3, 3m2, and 4m1b method implementations.
    {
        ind_t im = bli_cntx_get_ind_method( cntx );

        if ( im != BLIS_NAT )
        {
            if      ( im == BLIS_3M3  && f == bli_gemm_packa    ) f = bli_gemm3m3_packa;
            else if ( im == BLIS_3M2  && f == bli_gemm_ker_var2 ) f = bli_gemm3m2_ker_var2;
            else if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2;
        }
    }

    // Invoke the variant.
    f
    (
      &a_local,
      &b_local,
      &c_local,
      cntx,
      cntl,
      thread
    );
}
void bli_trsm_u_blk_var4( obj_t*  alpha,
                          obj_t*  a,
                          obj_t*  b,
                          obj_t*  beta,
                          obj_t*  c,
                          trsm_t* cntl )
{
    obj_t a1, a1_pack;
    obj_t b_pack;
    obj_t c1;

    dim_t i;
    dim_t bm_alg;
    dim_t m_trans;

    // Initialize all pack objects that are passed into packm_init().
    bli_obj_init_pack( &a1_pack );
    bli_obj_init_pack( &b_pack );

    // Query dimension in partitioning direction.
    m_trans = bli_obj_length_after_trans( *a );

    // Initialize object for packing B.
    bli_packm_init( b, &b_pack,
                    cntl_sub_packm_b( cntl ) );

    // Find the offset to the first non-zero block of A.
    for ( i = 0; i < m_trans; i += bm_alg )
    {
        // Determine the current algorithmic blocksize.
        bm_alg = bli_determine_blocksize_b( i, m_trans, a,
                                            cntl_blocksize( cntl ) );

        // Acquire a partition for A1.
        bli_acquire_mpart_b2t( BLIS_SUBPART1,
                               i, bm_alg, a, &a1 );

        if ( !bli_obj_is_zeros( a1 ) ) break;
    }

    // Fuse the first iteration with incremental packing and computation.
    {
        obj_t b_inc, b_pack_inc;
        obj_t c1_inc;

        dim_t j;
        dim_t bn_inc;
        dim_t n_trans;

        // Query dimension in partitioning direction.
        n_trans = bli_obj_width( b_pack );

        // Determine the current algorithmic blocksize.
        bm_alg = bli_determine_blocksize_b( i, m_trans, a,
                                            cntl_blocksize( cntl ) );

        // Acquire partitions for A1 and C1.
        bli_acquire_mpart_b2t( BLIS_SUBPART1,
                               i, bm_alg, a, &a1 );
        bli_acquire_mpart_b2t( BLIS_SUBPART1,
                               i, bm_alg, c, &c1 );

        // Initialize object for packing A1.
        bli_packm_init( &a1, &a1_pack,
                        cntl_sub_packm_a( cntl ) );

        // Pack A1 and scale by alpha (if instructed).
        bli_packm_int( alpha,
                       &a1, &a1_pack,
                       cntl_sub_packm_a( cntl ) );

        // Partition along the n dimension.
        for ( j = 0; j < n_trans; j += bn_inc )
        {
            // Determine the current incremental packing blocksize.
            bn_inc = bli_determine_blocksize_f( j, n_trans, b,
                                                cntl_blocksize_aux( cntl ) );

            // Acquire partitions.
            bli_acquire_mpart_l2r( BLIS_SUBPART1,
                                   j, bn_inc, b, &b_inc );
            bli_acquire_mpart_l2r( BLIS_SUBPART1,
                                   j, bn_inc, &b_pack, &b_pack_inc );
            bli_acquire_mpart_l2r( BLIS_SUBPART1,
                                   j, bn_inc, &c1, &c1_inc );

            // Pack B1 and scale by alpha (if instructed).
            bli_packm_int( alpha,
                           &b_inc, &b_pack_inc,
                           cntl_sub_packm_b( cntl ) );

            // Perform trsm subproblem.
            bli_trsm_int( BLIS_LEFT,
                          alpha,
                          &a1_pack,
                          &b_pack_inc,
                          beta,
                          &c1_inc,
                          cntl_sub_trsm( cntl ) );
        }
    }

    // Partition along the remaining portion of the m dimension.
    for ( i = i + bm_alg; i < m_trans; i += bm_alg )
    {
        // Determine the current algorithmic blocksize.
        bm_alg = bli_determine_blocksize_b( i, m_trans, a,
                                            cntl_blocksize( cntl ) );

        // Acquire partitions for A1 and C1.
        bli_acquire_mpart_b2t( BLIS_SUBPART1,
                               i, bm_alg, a, &a1 );
        bli_acquire_mpart_b2t( BLIS_SUBPART1,
                               i, bm_alg, c, &c1 );

        // Initialize object for packing A1.
        bli_packm_init( &a1, &a1_pack,
                        cntl_sub_packm_a( cntl ) );

        // Pack A1 and scale by alpha (if instructed).
        bli_packm_int( alpha,
                       &a1, &a1_pack,
                       cntl_sub_packm_a( cntl ) );

        if ( bli_obj_intersects_diag( a1_pack ) )
            bli_trsm_int( BLIS_LEFT,
                          alpha,
                          &a1_pack,
                          &b_pack,
                          beta,
                          &c1,
                          cntl_sub_trsm( cntl ) );
        else
            bli_gemm_int( &BLIS_MINUS_ONE,
                          &a1_pack,
                          &b_pack,
                          &BLIS_ONE,
                          &c1,
                          cntl_sub_gemm( cntl ) );
    }

    // If any packing buffers were acquired within packm, release them back
    // to the memory manager.
    bli_obj_release_pack( &a1_pack );
    bli_obj_release_pack( &b_pack );
}
void bli_gemm_int( obj_t*          alpha,
                   obj_t*          a,
                   obj_t*          b,
                   obj_t*          beta,
                   obj_t*          c,
                   gemm_t*         cntl,
                   gemm_thrinfo_t* thread )
{
    obj_t     a_local;
    obj_t     b_local;
    obj_t     c_local;
    varnum_t  n;
    impl_t    i;
    FUNCPTR_T f;

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
        bli_gemm_int_check( alpha, a, b, beta, c, cntl );

    // If C has a zero dimension, return early.
    if ( bli_obj_has_zero_dim( *c ) ) return;

    // If A or B has a zero dimension, scale C by beta and return early.
    if ( bli_obj_has_zero_dim( *a ) ||
         bli_obj_has_zero_dim( *b ) )
    {
        if ( thread_am_ochief( thread ) )
            bli_scalm( beta, c );
        thread_obarrier( thread );
        return;
    }

    // If A or B is marked as being filled with zeros, scale C by beta and
    // return early.
    if ( bli_obj_is_zeros( *a ) ||
         bli_obj_is_zeros( *b ) )
    {
        if ( thread_am_ochief( thread ) )
            bli_scalm( beta, c );
        thread_obarrier( thread );
        return;
    }

    // Alias A and B in case we need to update attached scalars.
    bli_obj_alias_to( *a, a_local );
    bli_obj_alias_to( *b, b_local );

    // Alias C in case we need to induce a transposition.
    bli_obj_alias_to( *c, c_local );

    // If we are about to call a leaf-level implementation, and matrix C
    // still needs a transposition, then we must induce one by swapping the
    // strides and dimensions. Note that this transposition would normally
    // be handled explicitly in the packing of C, but if C is not being
    // packed, this is our last chance to handle the transposition.
    if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
    {
        //if ( thread_am_ochief( thread ) ) {
        bli_obj_induce_trans( c_local );
        bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
        //}
    }

    // If alpha is non-unit, typecast and apply it to the scalar attached
    // to B.
    if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
    {
        bli_obj_scalar_apply_scalar( alpha, &b_local );
    }

    // If beta is non-unit, typecast and apply it to the scalar attached
    // to C.
    if ( !bli_obj_equals( beta, &BLIS_ONE ) )
    {
        bli_obj_scalar_apply_scalar( beta, &c_local );
    }

    // Extract the variant number and implementation type.
    n = cntl_var_num( cntl );
    i = cntl_impl_type( cntl );

    // Index into the variant array to extract the correct function pointer.
    f = vars[n][i];

    // Invoke the variant.
    f( &a_local,
       &b_local,
       &c_local,
       cntl,
       thread );
}
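For context, here is a minimal sketch of how a variant dispatch table such as vars[n][i] above might be organized in this older interface. Only the FUNCPTR_T parameter list is inferred from the invocation f( &a_local, &b_local, &c_local, cntl, thread ); the table shape and naming shown below are assumptions for illustration, not the library's actual definitions.

#include "blis.h" // for obj_t, gemm_t, gemm_thrinfo_t, varnum_t, impl_t

// Sketch (assumed): every variant shares the signature implied by the
// call site in bli_gemm_int() above.
typedef void (*FUNCPTR_T)
     (
       obj_t*          a,
       obj_t*          b,
       obj_t*          c,
       gemm_t*         cntl,
       gemm_thrinfo_t* thread
     );

// Hypothetical layout (entries omitted): rows are indexed by the variant
// number (varnum_t) and columns by the implementation type (impl_t), so
// that vars[n][i] selects which blocked or unblocked variant to invoke.
// static FUNCPTR_T vars[ NUM_VARNUMS ][ NUM_IMPL_TYPES ] = { ... };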