siz_t bli_thread_get_range_weighted_b2t
     (
       thrinfo_t* thr,
       obj_t*     a,
       blksz_t*   bmult,
       dim_t*     start,
       dim_t*     end
     )
{
    siz_t area;

    // This function assigns area-weighted ranges in the m dimension
    // where the total range spans 0 to m-1 with 0 at the bottom end and
    // m-1 at the top end.

    if ( bli_obj_intersects_diag( *a ) &&
         bli_obj_is_upper_or_lower( *a ) )
    {
        doff_t diagoff = bli_obj_diag_offset( *a );
        uplo_t uplo    = bli_obj_uplo( *a );
        dim_t  m       = bli_obj_length( *a );
        dim_t  n       = bli_obj_width( *a );
        dim_t  bf      = bli_blksz_get_def_for_obj( a, bmult );

        // Support implicit transposition.
        if ( bli_obj_has_trans( *a ) )
        {
            bli_reflect_about_diag( diagoff, uplo, m, n );
        }

        bli_reflect_about_diag( diagoff, uplo, m, n );

        bli_rotate180_trapezoid( diagoff, uplo );

        area = bli_thread_get_range_weighted_sub
        (
          thr, diagoff, uplo, m, n, bf,
          TRUE, start, end
        );
    }
    else // if dense or zeros
    {
        area = bli_thread_get_range_b2t
        (
          thr, a, bmult,
          start, end
        );
    }

    return area;
}
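// A worked note on the area weighting above (an illustrative sketch, not
// library code; it assumes only the BLIS dim_t type). For an m x m
// lower-triangular matrix split between two threads, splitting by *rows*
// at m/2 gives the thread holding the wide half roughly three times the
// elements of the other; splitting by *area* places the boundary at
// m/sqrt(2) instead.
static dim_t equal_area_split_of_triangle( dim_t m )
{
    // Rows 0 through r-1 of a lower-triangular matrix hold ~r^2/2 elements,
    // so equal halves of the total m^2/2 satisfy r^2/2 = m^2/4, i.e.
    // r = m/sqrt(2).
    return ( dim_t )( ( double )m * 0.7071067811865475 ); // 1/sqrt(2)
}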
void bli_obj_print( char* label, obj_t* obj )
{
    FILE* file = stdout;

    if ( bli_error_checking_is_enabled() )
        bli_obj_print_check( label, obj );

    fprintf( file, "\n" );
    fprintf( file, "%s\n", label );
    fprintf( file, "\n" );

    fprintf( file, " m x n           %lu x %lu\n", ( unsigned long int )bli_obj_length( *obj ),
                                                   ( unsigned long int )bli_obj_width( *obj ) );
    fprintf( file, "\n" );

    fprintf( file, " offm, offn      %lu, %lu\n", ( unsigned long int )bli_obj_row_off( *obj ),
                                                  ( unsigned long int )bli_obj_col_off( *obj ) );
    fprintf( file, " diagoff         %ld\n", ( signed long int )bli_obj_diag_offset( *obj ) );
    fprintf( file, "\n" );

    fprintf( file, " buf             %p\n", ( void* )bli_obj_buffer( *obj ) );
    fprintf( file, " elem size       %lu\n", ( unsigned long int )bli_obj_elem_size( *obj ) );
    fprintf( file, " rs, cs          %ld, %ld\n", ( signed long int )bli_obj_row_stride( *obj ),
                                                  ( signed long int )bli_obj_col_stride( *obj ) );
    fprintf( file, " is              %ld\n", ( signed long int )bli_obj_imag_stride( *obj ) );
    fprintf( file, " m_padded        %lu\n", ( unsigned long int )bli_obj_padded_length( *obj ) );
    fprintf( file, " n_padded        %lu\n", ( unsigned long int )bli_obj_padded_width( *obj ) );
    fprintf( file, " ps              %lu\n", ( unsigned long int )bli_obj_panel_stride( *obj ) );
    fprintf( file, "\n" );

    fprintf( file, " info            %lX\n", ( unsigned long int )(*obj).info );
    fprintf( file, " - is complex    %lu\n", ( unsigned long int )bli_obj_is_complex( *obj ) );
    fprintf( file, " - is d. prec    %lu\n", ( unsigned long int )bli_obj_is_double_precision( *obj ) );
    fprintf( file, " - datatype      %lu\n", ( unsigned long int )bli_obj_datatype( *obj ) );
    fprintf( file, " - target dt     %lu\n", ( unsigned long int )bli_obj_target_datatype( *obj ) );
    fprintf( file, " - exec dt       %lu\n", ( unsigned long int )bli_obj_execution_datatype( *obj ) );
    fprintf( file, " - has trans     %lu\n", ( unsigned long int )bli_obj_has_trans( *obj ) );
    fprintf( file, " - has conj      %lu\n", ( unsigned long int )bli_obj_has_conj( *obj ) );
    fprintf( file, " - unit diag?    %lu\n", ( unsigned long int )bli_obj_has_unit_diag( *obj ) );
    fprintf( file, " - struc type    %lu\n", ( unsigned long int )bli_obj_struc( *obj ) >> BLIS_STRUC_SHIFT );
    fprintf( file, " - uplo type     %lu\n", ( unsigned long int )bli_obj_uplo( *obj ) >> BLIS_UPLO_SHIFT );
    fprintf( file, " - is upper      %lu\n", ( unsigned long int )bli_obj_is_upper( *obj ) );
    fprintf( file, " - is lower      %lu\n", ( unsigned long int )bli_obj_is_lower( *obj ) );
    fprintf( file, " - is dense      %lu\n", ( unsigned long int )bli_obj_is_dense( *obj ) );
    fprintf( file, " - pack schema   %lu\n", ( unsigned long int )bli_obj_pack_schema( *obj ) >> BLIS_PACK_SCHEMA_SHIFT );
    fprintf( file, " - packinv diag? %lu\n", ( unsigned long int )bli_obj_has_inverted_diag( *obj ) );
    fprintf( file, " - pack ordifup  %lu\n", ( unsigned long int )bli_obj_is_pack_rev_if_upper( *obj ) );
    fprintf( file, " - pack ordiflo  %lu\n", ( unsigned long int )bli_obj_is_pack_rev_if_lower( *obj ) );
    fprintf( file, " - packbuf type  %lu\n", ( unsigned long int )bli_obj_pack_buffer_type( *obj ) >> BLIS_PACK_BUFFER_SHIFT );
    fprintf( file, "\n" );
}
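// A minimal usage sketch for bli_obj_print() above (assumes only the
// standard BLIS object API used elsewhere in this codebase):
static void demo_obj_print( void )
{
    obj_t a;

    // Create a 4x4 double-precision matrix with default (column-major)
    // strides, print its metadata, then free it.
    bli_obj_create( BLIS_DOUBLE, 4, 4, 0, 0, &a );
    bli_obj_print( "matrix 'a'", &a );
    bli_obj_free( &a );
}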
err_t bli_gemmsup_ref
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
    // Check parameters.
    if ( bli_error_checking_is_enabled() )
        bli_gemm_check( alpha, a, b, beta, c, cntx );

#if 0
    // FGVZ: The datatype-specific variant is now responsible for checking for
    // alpha == 0.0.

    // If alpha is zero, scale by beta and return.
    if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
    {
        bli_scalm( beta, c );
        return BLIS_SUCCESS;
    }
#endif

#if 0
    // FGVZ: Will this be needed for constructing thrinfo_t's (recall: the
    // sba needs to be attached to the rntm; see below)? Or will those nodes
    // just be created "locally," in an exposed manner?

    // Parse and interpret the contents of the rntm_t object to properly
    // set the ways of parallelism for each loop, and then make any
    // additional modifications necessary for the current operation.
    bli_rntm_set_ways_for_op
    (
      BLIS_GEMM,
      BLIS_LEFT, // ignored for gemm/hemm/symm
      bli_obj_length( &c_local ),
      bli_obj_width( &c_local ),
      bli_obj_width( &a_local ),
      rntm
    );

    // FGVZ: the sba needs to be attached to the rntm. But it needs
    // to be done in the thread region, since it needs a thread id.
    //bli_sba_rntm_set_pool( tid, array, rntm_p );
#endif

#if 0
    // FGVZ: The datatype-specific variant is now responsible for inducing a
    // transposition, if needed.

    // Induce transpositions on A and/or B if either object is marked for
    // transposition. We can induce "fast" transpositions since the objects
    // are guaranteed to not have structure or be packed.
    if ( bli_obj_has_trans( a ) )
    {
        bli_obj_induce_fast_trans( a );
        bli_obj_toggle_trans( a );
    }
    if ( bli_obj_has_trans( b ) )
    {
        bli_obj_induce_fast_trans( b );
        bli_obj_toggle_trans( b );
    }
#endif

#if 0
    //bli_gemmsup_ref_var2
    //bli_gemmsup_ref_var1
    #if 0
    bli_gemmsup_ref_var1n
    #else
    #endif

    const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
    const bool_t  is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
                                         stor_id == BLIS_RRC ||
                                         stor_id == BLIS_RCR ||
                                         stor_id == BLIS_CRR );
    if ( is_rrr_rrc_rcr_crr )
    {
        bli_gemmsup_ref_var2m
        (
          BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm
        );
    }
    else
    {
        bli_gemmsup_ref_var2m
        (
          BLIS_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm
        );
    }
#else
    const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );

    // Don't use the small/unpacked implementation if one of the matrices
    // uses general stride.
    if ( stor_id == BLIS_XXX ) return BLIS_FAILURE;

    const bool_t  is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
                                         stor_id == BLIS_RRC ||
                                         stor_id == BLIS_RCR ||
                                         stor_id == BLIS_CRR );
    const bool_t  is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;

    const num_t   dt       = bli_obj_dt( c );
    const bool_t  row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );

    const bool_t  is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
                                          : is_rcc_crc_ccr_ccc );

    if ( is_primary )
    {
        // This branch handles:
        //  - rrr rrc rcr crr for row-preferential kernels
        //  - rcc crc ccr ccc for column-preferential kernels

        const dim_t m  = bli_obj_length( c );
        const dim_t n  = bli_obj_width( c );
        const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
        const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
        const dim_t mu = m / MR;
        const dim_t nu = n / NR;

        if ( mu >= nu )
        {
            // block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2()
            bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE,
                                   alpha, a, b, beta, c, stor_id, cntx, rntm );
        }
        else // if ( mu < nu )
        {
            // panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1()
            bli_gemmsup_ref_var1n( BLIS_NO_TRANSPOSE,
                                   alpha, a, b, beta, c, stor_id, cntx, rntm );
        }
    }
    else
    {
        // This branch handles:
        //  - rrr rrc rcr crr for column-preferential kernels
        //  - rcc crc ccr ccc for row-preferential kernels

        const dim_t mt = bli_obj_width( c );
        const dim_t nt = bli_obj_length( c );
        const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
        const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
        const dim_t mu = mt / MR;
        const dim_t nu = nt / NR;

        if ( mu >= nu )
        {
            // panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans
            bli_gemmsup_ref_var2m( BLIS_TRANSPOSE,
                                   alpha, a, b, beta, c, stor_id, cntx, rntm );
        }
        else // if ( mu < nu )
        {
            // block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans
            bli_gemmsup_ref_var1n( BLIS_TRANSPOSE,
                                   alpha, a, b, beta, c, stor_id, cntx, rntm );
        }

        // *requires nudging of mc,nc up to be a multiple of nr,mr.
    }
#endif

    // Return success so that the caller knows that we computed the solution.
    return BLIS_SUCCESS;
}
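// An illustrative trace of the mu/nu dispatch above (values assumed for
// illustration only; MR and NR vary by architecture). For m = 1000, n = 8
// with MR = 6 and NR = 8:
//
//   mu = m / MR = 166,   nu = n / NR = 1,   mu >= nu
//
// so the block-panel macrokernel (var2m) is chosen; a larger micropanel
// count along n than along m would flip the inequality and select the
// panel-block variant (var1n). The heuristic reduces to:
static bool_t prefers_block_panel( dim_t m, dim_t n, dim_t mr, dim_t nr )
{
    // Compare the number of micropanels along each dimension.
    return ( bool_t )( m / mr >= n / nr );
}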
void bli_trmm3_front( side_t  side,
                      obj_t*  alpha,
                      obj_t*  a,
                      obj_t*  b,
                      obj_t*  beta,
                      obj_t*  c,
                      trmm_t* l_cntl,
                      trmm_t* r_cntl )
{
    trmm_t* cntl;
    obj_t   a_local;
    obj_t   b_local;
    obj_t   c_local;

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
        bli_trmm3_check( side, alpha, a, b, beta, c );

    // If alpha is zero, scale by beta and return.
    if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
    {
        bli_scalm( beta, c );
        return;
    }

    // Alias A, B, and C so we can tweak the objects if necessary.
    bli_obj_alias_to( *a, a_local );
    bli_obj_alias_to( *b, b_local );
    bli_obj_alias_to( *c, c_local );

    // We do not explicitly implement the cases where A is transposed.
    // However, we can still handle them. Specifically, if A is marked as
    // needing a transposition, we simply induce a transposition. This
    // allows us to only explicitly implement the no-transpose cases. Once
    // the transposition is induced, the correct algorithm will be called,
    // since, for example, an algorithm over a transposed lower triangular
    // matrix A moves in the same direction (forwards) as a non-transposed
    // upper triangular matrix. And with the transposition induced, the
    // matrix now appears to be upper triangular, so the upper triangular
    // algorithm will grab the correct partitions, as if it were upper
    // triangular (with no transpose) all along.
    if ( bli_obj_has_trans( a_local ) )
    {
        bli_obj_induce_trans( a_local );
        bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, a_local );
    }

#if 0
    if ( bli_is_right( side ) )
    {
        bli_obj_induce_trans( a_local );
        bli_obj_induce_trans( b_local );
        bli_obj_induce_trans( c_local );
        bli_toggle_side( side );
    }
#endif

#if 1
    // If A is being multiplied from the right, swap A and B so that
    // the matrix will actually be on the right.
    if ( bli_is_right( side ) )
    {
        bli_obj_swap( a_local, b_local );
    }

    // An optimization: If C is row-stored, transpose the entire operation
    // so as to allow the macro-kernel more favorable access patterns
    // through C. (The effect of the transposition of A and B is negligible
    // because those operands are always packed to contiguous memory.)
    if ( bli_obj_is_row_stored( c_local ) )
    {
        bli_obj_swap( a_local, b_local );
        bli_obj_induce_trans( a_local );
        bli_obj_induce_trans( b_local );
        bli_obj_induce_trans( c_local );
        bli_toggle_side( side );
    }
#endif

    // Set each alias as the root object.
    // NOTE: We MUST wait until we are done potentially swapping the objects
    // before setting the root fields!
    bli_obj_set_as_root( a_local );
    bli_obj_set_as_root( b_local );
    bli_obj_set_as_root( c_local );

    // Choose the control tree.
    if ( bli_is_left( side ) ) cntl = l_cntl;
    else                       cntl = r_cntl;

    trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( FALSE );
    dim_t n_threads = thread_num_threads( infos[0] );

    // Invoke the internal back-end.
    bli_level3_thread_decorator( n_threads,
                                 (level3_int_t) bli_trmm_int,
                                 alpha,
                                 &a_local,
                                 &b_local,
                                 beta,
                                 &c_local,
                                 (void*) cntl,
                                 (void**) infos );

    bli_trmm_thrinfo_free_paths( infos, n_threads );
}
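// The whole-operation transposition above is "free" in the sense that it
// only adjusts object metadata (dimensions, strides, side). It relies on
// the identity
//
//   C := alpha * A * B + beta * C   <==>   C^T := alpha * B^T * A^T + beta * C^T
//
// where inducing a transpose on an object merely swaps its row and column
// strides (and dimensions). Since A and B are repacked to contiguous
// buffers anyway, only C's storage orientation, as seen by the
// macro-kernel, is affected.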
void bli_trmm_int( obj_t*  alpha,
                   obj_t*  a,
                   obj_t*  b,
                   obj_t*  beta,
                   obj_t*  c,
                   trmm_t* cntl )
{
    obj_t     a_local;
    obj_t     b_local;
    obj_t     c_local;
    bool_t    side, uplo;
    varnum_t  n;
    impl_t    i;
    FUNCPTR_T f;

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
        bli_trmm_int_check( alpha, a, b, beta, c, cntl );

    // If C has a zero dimension, return early.
    if ( bli_obj_has_zero_dim( *c ) ) return;

    // If A or B has a zero dimension, scale C by beta and return early.
    if ( bli_obj_has_zero_dim( *a ) ||
         bli_obj_has_zero_dim( *b ) )
    {
        bli_scalm( beta, c );
        return;
    }

    // Alias A and B in case we need to update attached scalars.
    bli_obj_alias_to( *a, a_local );
    bli_obj_alias_to( *b, b_local );

    // Alias C in case we need to induce a transposition.
    bli_obj_alias_to( *c, c_local );

    // If we are about to call a leaf-level implementation, and matrix C
    // still needs a transposition, then we must induce one by swapping the
    // strides and dimensions. Note that this transposition would normally
    // be handled explicitly in the packing of C, but if C is not being
    // packed, this is our last chance to handle the transposition.
    if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
    {
        bli_obj_induce_trans( c_local );
        bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
    }

    // If alpha is non-unit, typecast and apply it to the scalar attached
    // to B.
    if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
    {
        bli_obj_scalar_apply_scalar( alpha, &b_local );
    }

    // If beta is non-unit, typecast and apply it to the scalar attached
    // to C.
    if ( !bli_obj_equals( beta, &BLIS_ONE ) )
    {
        bli_obj_scalar_apply_scalar( beta, &c_local );
    }

    // Set two bools: one based on the implied side parameter (the structure
    // of the root object) and one based on the uplo field of the triangular
    // matrix's root object (whether that is matrix A or matrix B).
    if ( bli_obj_root_is_triangular( *a ) )
    {
        side = 0;
        if ( bli_obj_root_is_lower( *a ) ) uplo = 0;
        else                               uplo = 1;
    }
    else // if ( bli_obj_root_is_triangular( *b ) )
    {
        side = 1;
        // Set a bool based on the uplo field of B's root object.
        if ( bli_obj_root_is_lower( *b ) ) uplo = 0;
        else                               uplo = 1;
    }

    // Extract the variant number and implementation type.
    n = cntl_var_num( cntl );
    i = cntl_impl_type( cntl );

    // Index into the variant array to extract the correct function pointer.
    f = vars[side][uplo][n][i];

    // Invoke the variant.
    f( &a_local, &b_local, &c_local, cntl );
}
void bli_trmm3_front( side_t  side,
                      obj_t*  alpha,
                      obj_t*  a,
                      obj_t*  b,
                      obj_t*  beta,
                      obj_t*  c,
                      gemm_t* cntl )
{
    obj_t a_local;
    obj_t b_local;
    obj_t c_local;

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
        bli_trmm3_check( side, alpha, a, b, beta, c );

    // If alpha is zero, scale by beta and return.
    if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
    {
        bli_scalm( beta, c );
        return;
    }

    // Alias A, B, and C so we can tweak the objects if necessary.
    bli_obj_alias_to( *a, a_local );
    bli_obj_alias_to( *b, b_local );
    bli_obj_alias_to( *c, c_local );

    // We do not explicitly implement the cases where A is transposed.
    // However, we can still handle them. Specifically, if A is marked as
    // needing a transposition, we simply induce a transposition. This
    // allows us to only explicitly implement the no-transpose cases. Once
    // the transposition is induced, the correct algorithm will be called,
    // since, for example, an algorithm over a transposed lower triangular
    // matrix A moves in the same direction (forwards) as a non-transposed
    // upper triangular matrix. And with the transposition induced, the
    // matrix now appears to be upper triangular, so the upper triangular
    // algorithm will grab the correct partitions, as if it were upper
    // triangular (with no transpose) all along.
    if ( bli_obj_has_trans( a_local ) )
    {
        bli_obj_induce_trans( a_local );
        bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, a_local );
    }

#if 0
    // If A is being multiplied from the right, transpose all operands
    // so that we can perform the computation as if A were being multiplied
    // from the left.
    if ( bli_is_right( side ) )
    {
        bli_toggle_side( side );
        bli_obj_induce_trans( a_local );
        bli_obj_induce_trans( b_local );
        bli_obj_induce_trans( c_local );
    }
#else
    // An optimization: If C is stored by rows and the micro-kernel prefers
    // contiguous columns, or if C is stored by columns and the micro-kernel
    // prefers contiguous rows, transpose the entire operation to allow the
    // micro-kernel to access elements of C in its preferred manner.
    if ( ( bli_obj_is_row_stored( c_local ) &&
           bli_func_prefers_contig_cols( bli_obj_datatype( c_local ),
                                         bli_gemm_cntl_ukrs( cntl ) ) ) ||
         ( bli_obj_is_col_stored( c_local ) &&
           bli_func_prefers_contig_rows( bli_obj_datatype( c_local ),
                                         bli_gemm_cntl_ukrs( cntl ) ) ) )
    {
        bli_toggle_side( side );
        bli_obj_induce_trans( a_local );
        bli_obj_induce_trans( b_local );
        bli_obj_induce_trans( c_local );
    }

    // If A is being multiplied from the right, swap A and B so that
    // the matrix will actually be on the right.
    if ( bli_is_right( side ) )
    {
        bli_obj_swap( a_local, b_local );
    }
#endif

    // Set each alias as the root object.
    // NOTE: We MUST wait until we are done potentially swapping the objects
    // before setting the root fields!
    bli_obj_set_as_root( a_local );
    bli_obj_set_as_root( b_local );
    bli_obj_set_as_root( c_local );

    // Notice that, unlike trmm_r, there is no dependency in the jc loop
    // for trmm3_r, so we can pass in FALSE for jc_dependency.
    trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( FALSE );
    dim_t n_threads = thread_num_threads( infos[0] );

    // Invoke the internal back-end.
    bli_level3_thread_decorator( n_threads,
                                 (level3_int_t) bli_trmm_int,
                                 alpha,
                                 &a_local,
                                 &b_local,
                                 beta,
                                 &c_local,
                                 (void*) cntl,
                                 (void**) infos );

    bli_trmm_thrinfo_free_paths( infos, n_threads );
}
void bli_trsv_int( obj_t*  alpha,
                   obj_t*  a,
                   obj_t*  x,
                   cntx_t* cntx,
                   trsv_t* cntl )
{
    varnum_t  n;
    impl_t    i;
    bool_t    uplo;
    FUNCPTR_T f;
    obj_t     a_local;

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
        bli_trsv_check( alpha, a, x );

    // If A or x has a zero dimension, return early.
    if ( bli_obj_has_zero_dim( a ) ) return;
    if ( bli_obj_has_zero_dim( x ) ) return;

    // Alias A in case we need to induce a transformation (ie: transposition).
    bli_obj_alias_to( a, &a_local );

    // NOTE: to support cases where B is complex and A is real, we will
    // need to have the default side case be BLIS_RIGHT and then express
    // the left case in terms of it, rather than the other way around.

    // Determine uplo (for indexing to the correct function pointer).
    if ( bli_obj_is_lower( &a_local ) ) uplo = 0;
    else                                uplo = 1;

    // We do not explicitly implement the cases where A is transposed.
    // However, we can still handle them. Specifically, if A is marked as
    // needing a transposition, we simply toggle the uplo value to cause the
    // correct algorithm to be induced. When that algorithm partitions into
    // A, it will grab the correct subpartitions, which will inherit A's
    // transposition bit and thus downstream subproblems will do the right
    // thing. Alternatively, we could accomplish the same end goal by
    // inducing a transposition, via bli_obj_induce_trans(), in the code
    // block below. That macro function swaps dimensions, strides, and
    // offsets. As an example, given a lower triangular, column-major matrix
    // that needs a transpose, we would induce that transposition by recasting
    // the object as an upper triangular, row-major matrix (with no transpose
    // needed). Note that how we choose to handle transposition here does NOT
    // affect the optimal choice of kernel (ie: a column-major column panel
    // matrix with transpose times a vector would use the same kernel as a
    // row-major row panel matrix with no transpose times a vector).
    if ( bli_obj_has_trans( &a_local ) )
    {
        //bli_obj_induce_trans( &a_local );
        //bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local );
        if ( uplo == 1 ) uplo = 0;
        else             uplo = 1;
    }

    // Extract the variant number and implementation type.
    n = bli_cntl_var_num( cntl );
    i = bli_cntl_impl_type( cntl );

    // Index into the variant array to extract the correct function pointer.
    f = vars[uplo][n][i];

    // Invoke the variant.
    f( alpha, &a_local, x, cntx, cntl );
}
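// A sketch of the metadata-only transposition described in the comment
// above (hypothetical helper, not library code): element (i,j) of a
// matrix with strides (rs,cs) lives at buf[i*rs + j*cs], so a transposed
// view is obtained simply by exchanging rs and cs, with no data movement.
static double elem_at( const double* buf, dim_t i, dim_t j, inc_t rs, inc_t cs )
{
    return buf[ i * rs + j * cs ]; // call with (cs,rs) for the transposed view
}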
void bli_gemmsup_ref_var1n
     (
       trans_t trans,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       stor3_t eff_id,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
#if 0
    obj_t at, bt;

    bli_obj_alias_to( a, &at );
    bli_obj_alias_to( b, &bt );

    // Induce transpositions on A and/or B if either object is marked for
    // transposition. We can induce "fast" transpositions since the objects
    // are guaranteed to not have structure or be packed.
    if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); }
    if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); }

    const num_t  dt_exec = bli_obj_dt( c );

    const conj_t conja   = bli_obj_conj_status( a );
    const conj_t conjb   = bli_obj_conj_status( b );

    const dim_t  m       = bli_obj_length( c );
    const dim_t  n       = bli_obj_width( c );
    const dim_t  k       = bli_obj_width( &at );

    void* restrict buf_a = bli_obj_buffer_at_off( &at );
    const inc_t    rs_a  = bli_obj_row_stride( &at );
    const inc_t    cs_a  = bli_obj_col_stride( &at );

    void* restrict buf_b = bli_obj_buffer_at_off( &bt );
    const inc_t    rs_b  = bli_obj_row_stride( &bt );
    const inc_t    cs_b  = bli_obj_col_stride( &bt );

    void* restrict buf_c = bli_obj_buffer_at_off( c );
    const inc_t    rs_c  = bli_obj_row_stride( c );
    const inc_t    cs_c  = bli_obj_col_stride( c );

    void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
    void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );
#else
    const num_t  dt_exec = bli_obj_dt( c );

    const conj_t conja   = bli_obj_conj_status( a );
    const conj_t conjb   = bli_obj_conj_status( b );

    const dim_t  m       = bli_obj_length( c );
    const dim_t  n       = bli_obj_width( c );
          dim_t  k;

    void* restrict buf_a = bli_obj_buffer_at_off( a );
          inc_t    rs_a;
          inc_t    cs_a;

    void* restrict buf_b = bli_obj_buffer_at_off( b );
          inc_t    rs_b;
          inc_t    cs_b;

    if ( bli_obj_has_notrans( a ) )
    {
        k    = bli_obj_width( a );
        rs_a = bli_obj_row_stride( a );
        cs_a = bli_obj_col_stride( a );
    }
    else // if ( bli_obj_has_trans( a ) )
    {
        // Assign the variables with an implicit transposition.
        k    = bli_obj_length( a );
        rs_a = bli_obj_col_stride( a );
        cs_a = bli_obj_row_stride( a );
    }

    if ( bli_obj_has_notrans( b ) )
    {
        rs_b = bli_obj_row_stride( b );
        cs_b = bli_obj_col_stride( b );
    }
    else // if ( bli_obj_has_trans( b ) )
    {
        // Assign the variables with an implicit transposition.
        rs_b = bli_obj_col_stride( b );
        cs_b = bli_obj_row_stride( b );
    }

    void* restrict buf_c = bli_obj_buffer_at_off( c );
    const inc_t    rs_c  = bli_obj_row_stride( c );
    const inc_t    cs_c  = bli_obj_col_stride( c );

    void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
    void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );
#endif

    // Index into the type combination array to extract the correct
    // function pointer.
    FUNCPTR_T f = ftypes_var1n[dt_exec];

    if ( bli_is_notrans( trans ) )
    {
        // Invoke the function.
        f
        (
          conja,
          conjb,
          m,
          n,
          k,
          buf_alpha,
          buf_a, rs_a, cs_a,
          buf_b, rs_b, cs_b,
          buf_beta,
          buf_c, rs_c, cs_c,
          eff_id,
          cntx,
          rntm
        );
    }
    else
    {
        // Invoke the function (transposing the operation).
        f
        (
          conjb,                     // swap the conj values.
          conja,
          n,                         // swap the m and n dimensions.
          m,
          k,
          buf_alpha,
          buf_b, cs_b, rs_b,         // swap the positions of A and B.
          buf_a, cs_a, rs_a,         // swap the strides of A and B.
          buf_beta,
          buf_c, cs_c, rs_c,         // swap the strides of C.
          bli_stor3_trans( eff_id ), // transpose the stor3_t id.
          cntx,
          rntm
        );
    }
}
void bli_gemm_int( obj_t*  alpha,
                   obj_t*  a,
                   obj_t*  b,
                   obj_t*  beta,
                   obj_t*  c,
                   gemm_t* cntl,
                   gemm_thrinfo_t* thread )
{
    obj_t     a_local;
    obj_t     b_local;
    obj_t     c_local;
    varnum_t  n;
    impl_t    i;
    FUNCPTR_T f;

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
        bli_gemm_int_check( alpha, a, b, beta, c, cntl );

    // If C has a zero dimension, return early.
    if ( bli_obj_has_zero_dim( *c ) ) return;

    // If A or B has a zero dimension, scale C by beta and return early.
    if ( bli_obj_has_zero_dim( *a ) ||
         bli_obj_has_zero_dim( *b ) )
    {
        if ( thread_am_ochief( thread ) )
            bli_scalm( beta, c );
        thread_obarrier( thread );
        return;
    }

    // If A or B is marked as being filled with zeros, scale C by beta and
    // return early.
    if ( bli_obj_is_zeros( *a ) ||
         bli_obj_is_zeros( *b ) )
    {
        if ( thread_am_ochief( thread ) )
            bli_scalm( beta, c );
        thread_obarrier( thread );
        return;
    }

    // Alias A and B in case we need to update attached scalars.
    bli_obj_alias_to( *a, a_local );
    bli_obj_alias_to( *b, b_local );

    // Alias C in case we need to induce a transposition.
    bli_obj_alias_to( *c, c_local );

    // If we are about to call a leaf-level implementation, and matrix C
    // still needs a transposition, then we must induce one by swapping the
    // strides and dimensions. Note that this transposition would normally
    // be handled explicitly in the packing of C, but if C is not being
    // packed, this is our last chance to handle the transposition.
    if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
    {
        //if( thread_am_ochief( thread ) ) {
        bli_obj_induce_trans( c_local );
        bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
        //}
    }

    // If alpha is non-unit, typecast and apply it to the scalar attached
    // to B.
    if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
    {
        bli_obj_scalar_apply_scalar( alpha, &b_local );
    }

    // If beta is non-unit, typecast and apply it to the scalar attached
    // to C.
    if ( !bli_obj_equals( beta, &BLIS_ONE ) )
    {
        bli_obj_scalar_apply_scalar( beta, &c_local );
    }

    // Extract the variant number and implementation type.
    n = cntl_var_num( cntl );
    i = cntl_impl_type( cntl );

    // Index into the variant array to extract the correct function pointer.
    f = vars[n][i];

    // Invoke the variant.
    f( &a_local, &b_local, &c_local, cntl, thread );
}
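// Note on the early-return paths above: when the update term vanishes
// (a zero dimension, or an operand known to be all zeros), gemm
// degenerates to C := beta * C. Only the output-communicator chief
// performs the scaling, so the other threads neither duplicate nor race
// on the writes to C, and the barrier keeps the team synchronized before
// returning.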
void bli_trsm_front( side_t  side,
                     obj_t*  alpha,
                     obj_t*  a,
                     obj_t*  b,
                     cntx_t* cntx,
                     trsm_t* l_cntl,
                     trsm_t* r_cntl )
{
    trsm_t* cntl;
    obj_t   a_local;
    obj_t   b_local;
    obj_t   c_local;

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
        bli_trsm_check( side, alpha, a, b, &BLIS_ZERO, b, cntx );

    // If alpha is zero, scale B by alpha (ie: set B to zero) and return.
    if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
    {
        bli_scalm( alpha, b );
        return;
    }

    // Reinitialize the memory allocator to accommodate the blocksizes
    // in the current context.
    bli_mem_reinit( cntx );

    // Alias A and B so we can tweak the objects if necessary.
    bli_obj_alias_to( *a, a_local );
    bli_obj_alias_to( *b, b_local );
    bli_obj_alias_to( *b, c_local );

    // We do not explicitly implement the cases where A is transposed.
    // However, we can still handle them. Specifically, if A is marked as
    // needing a transposition, we simply induce a transposition. This
    // allows us to only explicitly implement the no-transpose cases. Once
    // the transposition is induced, the correct algorithm will be called,
    // since, for example, an algorithm over a transposed lower triangular
    // matrix A moves in the same direction (forwards) as a non-transposed
    // upper triangular matrix. And with the transposition induced, the
    // matrix now appears to be upper triangular, so the upper triangular
    // algorithm will grab the correct partitions, as if it were upper
    // triangular (with no transpose) all along.
    if ( bli_obj_has_trans( a_local ) )
    {
        bli_obj_induce_trans( a_local );
        bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, a_local );
    }

#if 0
    // If A is being solved against from the right, transpose all operands
    // so that we can perform the computation as if A were being solved
    // from the left.
    if ( bli_is_right( side ) )
    {
        bli_toggle_side( side );
        bli_obj_induce_trans( a_local );
        bli_obj_induce_trans( b_local );
        bli_obj_induce_trans( c_local );
    }
#else
    // If A is being solved against from the right, swap A and B so that
    // the triangular matrix will actually be on the right.
    if ( bli_is_right( side ) )
    {
        bli_obj_swap( a_local, b_local );
    }
#endif

    // Set each alias as the root object.
    // NOTE: We MUST wait until we are done potentially swapping the objects
    // before setting the root fields!
    bli_obj_set_as_root( a_local );
    bli_obj_set_as_root( b_local );
    bli_obj_set_as_root( c_local );

    // Choose the control tree.
    if ( bli_is_left( side ) ) cntl = l_cntl;
    else                       cntl = r_cntl;

    trsm_thrinfo_t** infos = bli_create_trsm_thrinfo_paths( bli_is_right( side ) );
    dim_t n_threads = thread_num_threads( infos[0] );

    // Invoke the internal back-end.
    bli_level3_thread_decorator( n_threads,
                                 (l3_int_t) bli_trsm_int,
                                 alpha,
                                 &a_local,
                                 &b_local,
                                 alpha,
                                 &c_local,
                                 (void*) cntx,
                                 (void*) cntl,
                                 (void**) infos );

    bli_trsm_thrinfo_free_paths( infos, n_threads );
}
void bli_trsm_int
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    obj_t a_local;
    obj_t b_local;
    obj_t c_local;

    trsm_var_oft f;

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
        bli_gemm_basic_check( alpha, a, b, beta, c, cntx );

    // If C has a zero dimension, return early.
    if ( bli_obj_has_zero_dim( c ) ) return;

    // If A or B has a zero dimension, scale C by beta and return early.
    if ( bli_obj_has_zero_dim( a ) ||
         bli_obj_has_zero_dim( b ) )
    {
        if ( bli_thread_am_ochief( thread ) )
            bli_scalm( beta, c );
        bli_thread_obarrier( thread );
        return;
    }

    // Alias A and B in case we need to update attached scalars.
    bli_obj_alias_to( a, &a_local );
    bli_obj_alias_to( b, &b_local );

    // Alias C in case we need to induce a transposition.
    bli_obj_alias_to( c, &c_local );

    // If we are about to call a leaf-level implementation, and matrix C
    // still needs a transposition, then we must induce one by swapping the
    // strides and dimensions. Note that this transposition would normally
    // be handled explicitly in the packing of C, but if C is not being
    // packed, this is our last chance to handle the transposition.
    if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( c ) )
    {
        bli_obj_induce_trans( &c_local );
        bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &c_local );
    }

    // If beta is non-unit, apply it to the scalar attached to C.
    if ( !bli_obj_equals( beta, &BLIS_ONE ) )
    {
        bli_obj_scalar_apply_scalar( beta, &c_local );
    }

    // Apply alpha to the scalar attached to the non-triangular operand,
    // which depends on whether the triangular matrix's root object is
    // matrix A or matrix B.
    if ( bli_obj_root_is_triangular( a ) )
    {
        // If alpha is non-unit, typecast and apply it to the scalar
        // attached to B (the non-triangular matrix).
        if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
        {
            bli_obj_scalar_apply_scalar( alpha, &b_local );
        }
    }
    else // if ( bli_obj_root_is_triangular( b ) )
    {
        // If alpha is non-unit, typecast and apply it to the scalar
        // attached to A (the non-triangular matrix).
        if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
        {
            bli_obj_scalar_apply_scalar( alpha, &a_local );
        }
    }

    // FGVZ->TMS: Is this barrier still needed?
    bli_thread_obarrier( thread );

    // Create the next node in the thrinfo_t structure.
    bli_thrinfo_grow( rntm, cntl, thread );

    // Extract the function pointer from the current control tree node.
    f = bli_cntl_var_func( cntl );

    // Invoke the variant.
    f
    (
      &a_local,
      &b_local,
      &c_local,
      cntx,
      rntm,
      cntl,
      thread
    );
}
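// Note on the alpha handling above: for trsm, e.g. B := alpha * inv(A) * B,
// the scalar alpha commutes with the triangular solve, so it can be
// attached to the non-triangular operand and folded in when that operand
// is packed, rather than applied in a separate scaling pass over the output.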
//
// Define object-based interface.
//
void bli_dotxf( obj_t* alpha,
                obj_t* a,
                obj_t* x,
                obj_t* beta,
                obj_t* y )
{
    num_t     dt    = bli_obj_datatype( *x );

    conj_t    conja = bli_obj_conj_status( *a );
    conj_t    conjx = bli_obj_conj_status( *x );

    dim_t     m     = bli_obj_vector_dim( *y );
    dim_t     b_n   = bli_obj_vector_dim( *x );

    void*     buf_a = bli_obj_buffer_at_off( *a );
    inc_t     rs_a  = bli_obj_row_stride( *a );
    inc_t     cs_a  = bli_obj_col_stride( *a );

    void*     buf_x = bli_obj_buffer_at_off( *x );
    inc_t     inc_x = bli_obj_vector_inc( *x );

    void*     buf_y = bli_obj_buffer_at_off( *y );
    inc_t     inc_y = bli_obj_vector_inc( *y );

    obj_t     alpha_local;
    void*     buf_alpha;

    obj_t     beta_local;
    void*     buf_beta;

    FUNCPTR_T f = ftypes[dt];

    if ( bli_error_checking_is_enabled() )
        bli_dotxf_check( alpha, a, x, beta, y );

    // Create local copy-casts of the scalars (and apply internal conjugation
    // if needed).
    bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE,
                                          alpha, &alpha_local );
    bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE,
                                          beta, &beta_local );

    // Extract the scalar buffers.
    buf_alpha = bli_obj_buffer_for_1x1( dt, alpha_local );
    buf_beta  = bli_obj_buffer_for_1x1( dt, beta_local );

    // Support cases where matrix A requires a transposition.
    if ( bli_obj_has_trans( *a ) )
    {
        bli_swap_incs( rs_a, cs_a );
    }

    // Invoke the void pointer-based function.
    f( conja,
       conjx,
       m,
       b_n,
       buf_alpha,
       buf_a, rs_a, cs_a,
       buf_x, inc_x,
       buf_beta,
       buf_y, inc_y );
}
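// The wrapper above follows the object-to-typed bridging pattern used
// throughout this interface layer: extract raw buffers, strides, and
// conjugation flags from the obj_t operands, copy-cast the scalars into
// the computation datatype, then dispatch through a datatype-indexed
// function-pointer table. A minimal sketch of such a table (names
// hypothetical; the actual typed argument list is elided):
//
//   typedef void (*FUNCPTR_T)( /* typed arguments */ );
//   static FUNCPTR_T ftypes[ BLIS_NUM_FP_TYPES ]; // one entry per datatype
//   FUNCPTR_T f = ftypes[ dt ];                   // dt selects s, c, d, or z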
void bli_ger_int( conj_t  conjx,
                  conj_t  conjy,
                  obj_t*  alpha,
                  obj_t*  x,
                  obj_t*  y,
                  obj_t*  a,
                  cntx_t* cntx,
                  ger_t*  cntl )
{
    varnum_t  n;
    impl_t    i;
    FUNCPTR_T f;
    obj_t     alpha_local;
    obj_t     x_local;
    obj_t     y_local;
    obj_t     a_local;

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
        bli_ger_check( alpha, x, y, a );

    // If A has a zero dimension, return early.
    if ( bli_obj_has_zero_dim( a ) ) return;

    // If x or y has a zero dimension, return early.
    if ( bli_obj_has_zero_dim( x ) ||
         bli_obj_has_zero_dim( y ) ) return;

    // Alias the objects, applying conjx and conjy to x and y, respectively.
    bli_obj_alias_with_conj( conjx, x, &x_local );
    bli_obj_alias_with_conj( conjy, y, &y_local );
    bli_obj_alias_to( a, &a_local );

    // If matrix A is marked for conjugation, we interpret this as a request
    // to apply a conjugation to the other operands.
    if ( bli_obj_has_conj( &a_local ) )
    {
        bli_obj_toggle_conj( &a_local );
        bli_obj_toggle_conj( &x_local );
        bli_obj_toggle_conj( &y_local );

        bli_obj_scalar_init_detached_copy_of( bli_obj_dt( alpha ),
                                              BLIS_CONJUGATE,
                                              alpha,
                                              &alpha_local );
    }
    else
    {
        bli_obj_alias_to( alpha, &alpha_local );
    }

    // If we are about to call a leaf-level implementation, and matrix A
    // still needs a transposition, then we must induce one by swapping the
    // strides and dimensions.
    if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( &a_local ) )
    {
        bli_obj_induce_trans( &a_local );
        bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local );
    }

    // Extract the variant number and implementation type.
    n = bli_cntl_var_num( cntl );
    i = bli_cntl_impl_type( cntl );

    // Index into the variant array to extract the correct function pointer.
    f = vars[n][i];

    // Invoke the variant.
    f( &alpha_local,
       &x_local,
       &y_local,
       &a_local,
       cntx,
       cntl );
}
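// The conjugation toggle above relies on the identity behind the rank-1
// update performed by ger, A := A + alpha * x * y^T. If the update is
// requested on conj(A), conjugating both sides gives
//
//   A := A + conj(alpha) * conj(x) * conj(y)^T
//
// so the request is honored by conjugating alpha, x, and y instead and
// clearing A's conjugation flag.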