//
// Residual check for the xpbym test.
//
// Pre-conditions:
// - x is randomized.
// - y_orig is randomized.
// Note:
// - beta should have a non-zero imaginary component in the complex
//   cases in order to more fully exercise the implementation.
//
// Under these conditions, the implementation of
//
//   y := beta * y_orig + conjx(x)
//
// is assumed to be functioning correctly if
//
//   normf( y - ( beta * y_orig + conjx(x) ) )
//
// is negligible. The real part of that norm is stored to *resid.
//
// NOTE: y is overwritten with the difference as a side effect, and
// params is not referenced by this function.
//
void libblis_test_xpbym_check
     (
       test_params_t* params,
       obj_t*         x,
       obj_t*         beta,
       obj_t*         y,
       obj_t*         y_orig,
       double*        resid
     )
{
	const num_t dt_y      = bli_obj_dt( y );
	const num_t dt_y_real = bli_obj_dt_proj_to_real( y );

	const dim_t n_rows    = bli_obj_length( y );
	const dim_t n_cols    = bli_obj_width( y );

	obj_t  x_copy;
	obj_t  y_expected;
	obj_t  diff_norm;

	double imag_unused;

	// Take a working copy of x.
	bli_obj_create( dt_y, n_rows, n_cols, 0, 0, &x_copy );
	bli_copym( x, &x_copy );

	// Build the reference result: y_expected := beta * y_orig + x_copy.
	bli_obj_create( dt_y, n_rows, n_cols, 0, 0, &y_expected );
	bli_copym( y_orig, &y_expected );
	bli_scalm( beta, &y_expected );
	bli_addm( &x_copy, &y_expected );

	// Subtract the reference from the computed y (in place) and report
	// the Frobenius norm of what is left.
	bli_subm( &y_expected, y );

	bli_obj_scalar_init_detached( dt_y_real, &diff_norm );
	bli_normfm( y, &diff_norm );
	bli_getsc( &diff_norm, resid, &imag_unused );

	// Release the temporaries.
	bli_obj_free( &x_copy );
	bli_obj_free( &y_expected );
}
//
// Residual check for the scalm test.
//
// Pre-conditions:
// - y_orig is randomized.
// Note:
// - beta should have a non-zero imaginary component in the complex
//   cases in order to more fully exercise the implementation.
//
// Under these conditions, the implementation of
//
//   y := conjbeta(beta) * y_orig
//
// is assumed to be functioning correctly if
//
//   normf( y + -conjbeta(beta) * y_orig )
//
// is negligible. The real part of that norm is stored to *resid.
//
// NOTE: y is overwritten with the sum as a side effect.
//
void libblis_test_scalm_check
     (
       obj_t*  beta,
       obj_t*  y,
       obj_t*  y_orig,
       double* resid
     )
{
	const num_t dt_y      = bli_obj_datatype( *y );
	const num_t dt_y_real = bli_obj_datatype_proj_to_real( *y );

	const dim_t n_rows    = bli_obj_length( *y );
	const dim_t n_cols    = bli_obj_width( *y );

	obj_t  neg_beta;
	obj_t  y_ref;
	obj_t  resid_norm;

	double imag_unused;

	// neg_beta := -1 * beta.
	bli_obj_scalar_init_detached( dt_y, &neg_beta );
	bli_copysc( beta, &neg_beta );
	bli_mulsc( &BLIS_MINUS_ONE, &neg_beta );

	// y_ref := neg_beta * y_orig.
	bli_obj_create( dt_y, n_rows, n_cols, 0, 0, &y_ref );
	bli_copym( y_orig, &y_ref );
	bli_scalm( &neg_beta, &y_ref );

	// y := y + y_ref, then measure what is left.
	bli_addm( &y_ref, y );

	bli_obj_scalar_init_detached( dt_y_real, &resid_norm );
	bli_normfm( y, &resid_norm );
	bli_getsc( &resid_norm, resid, &imag_unused );

	bli_obj_free( &y_ref );
}
//
// Dispatch the scalm test to the requested interface. Only the
// sequential front-end is handled here; any other interface value is
// reported as an error.
//
void libblis_test_scalm_impl
     (
       iface_t iface,
       obj_t*  beta,
       obj_t*  y
     )
{
	if ( iface == BLIS_TEST_SEQ_FRONT_END )
	{
		bli_scalm( beta, y );
		return;
	}

	libblis_test_printf_error( "Invalid interface type.\n" );
}
//
// Front-end for syrk: validates the operands, handles the alpha == 0
// shortcut, prepares local aliases of A, A^T and C, and hands the
// operation to the herk back-end through the level-3 thread decorator.
//
void bli_syrk_front
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  beta,
       obj_t*  c,
       gemm_t* cntl
     )
{
	obj_t a_alias;
	obj_t at_alias;
	obj_t c_alias;

	// Validate the operands when error checking is turned on.
	if ( bli_error_checking_is_enabled() )
		bli_syrk_check( alpha, a, beta, c );

	// Nothing to accumulate when alpha is zero: scale C by beta and
	// return.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
	{
		bli_scalm( beta, c );
		return;
	}

	// Work on aliases so the caller's objects are left untouched by
	// the transformations below.
	bli_obj_alias_to( *a, a_alias );
	bli_obj_alias_to( *c, c_alias );
	bli_obj_set_as_root( c_alias );

	// The right-hand "B" operand of syrk is simply A^T.
	bli_obj_alias_to( *a, at_alias );
	bli_obj_induce_trans( at_alias );

	// An optimization: when the storage of C (by rows / by columns) is
	// the opposite of what the micro-kernel prefers, transpose the
	// whole operation so the micro-kernel accesses C in its preferred
	// manner. Only C is transposed here.
	{
		const num_t dt_c = bli_obj_datatype( c_alias );

		if ( ( bli_obj_is_row_stored( c_alias ) &&
		       bli_func_prefers_contig_cols( dt_c, bli_gemm_cntl_ukrs( cntl ) ) ) ||
		     ( bli_obj_is_col_stored( c_alias ) &&
		       bli_func_prefers_contig_rows( dt_c, bli_gemm_cntl_ukrs( cntl ) ) ) )
		{
			bli_obj_induce_trans( c_alias );
		}
	}

	// Build the per-thread info paths and query the thread count.
	herk_thrinfo_t** thr_paths = bli_create_herk_thrinfo_paths();
	dim_t            n_thr     = thread_num_threads( thr_paths[0] );

	// Invoke the internal back-end.
	bli_level3_thread_decorator( n_thr,
	                             (level3_int_t) bli_herk_int,
	                             alpha,
	                             &a_alias,
	                             &at_alias,
	                             beta,
	                             &c_alias,
	                             (void*) cntl,
	                             (void**) thr_paths );

	bli_herk_thrinfo_free_paths( thr_paths, n_thr );
}
//
// Run one syr2k experiment for the given datatype, parameter string
// (pc_str: uplo of C, trans of A, trans of B) and storage string
// (sc_str: storage of A, B, C) at problem size p_cur. The best
// performance over params->n_repeats runs is stored to *perf and the
// residual of the final run to *resid.
//
void libblis_test_syr2k_experiment
     (
       test_params_t* params,
       test_op_t*     op,
       iface_t        iface,
       num_t          datatype,
       char*          pc_str,
       char*          sc_str,
       unsigned int   p_cur,
       double*        perf,
       double*        resid
     )
{
	const unsigned int repeat_count = params->n_repeats;
	unsigned int       rep;

	double  best_time = 1e9;
	double  t_start;
	double  scalar_imag;

	dim_t   m, k;

	uplo_t  uplo_c;
	trans_t trans_a, trans_b;

	obj_t   inv_k;
	obj_t   alpha, a, b, beta, c;
	obj_t   c_orig;

	// Translate the dimension specifiers into actual dimensions.
	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
	k = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );

	// Translate the parameter characters into BLIS constants.
	bli_param_map_char_to_blis_uplo( pc_str[0], &uplo_c );
	bli_param_map_char_to_blis_trans( pc_str[1], &trans_a );
	bli_param_map_char_to_blis_trans( pc_str[2], &trans_b );

	// Create the test scalars.
	bli_obj_scalar_init_detached( datatype, &inv_k );
	bli_obj_scalar_init_detached( datatype, &alpha );
	bli_obj_scalar_init_detached( datatype, &beta );

	// Create the test matrices. C and its saved copy share a storage
	// specifier.
	libblis_test_mobj_create( params, datatype, trans_a,
	                          sc_str[0], m, k, &a );
	libblis_test_mobj_create( params, datatype, trans_b,
	                          sc_str[1], m, k, &b );
	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
	                          sc_str[2], m, m, &c );
	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
	                          sc_str[2], m, m, &c_orig );

	// Set alpha and beta. In the complex case both scalars get a
	// non-zero imaginary part: for syr2k, unlike her2k, C is symmetric
	// in both the real and complex cases, so both may be complex.
	scalar_imag = ( bli_obj_is_real( c ) ? 0.0 : 0.5 );
	bli_setsc(  0.8, scalar_imag, &alpha );
	bli_setsc( -1.0, scalar_imag, &beta );

	// Randomize A and B.
	bli_randm( &a );
	bli_randm( &b );

	// Mark C as symmetric with the requested stored triangle.
	bli_obj_set_struc( BLIS_SYMMETRIC, c );
	bli_obj_set_uplo( uplo_c, c );

	// Randomize C, make it densely symmetric, and zero the unstored
	// triangle to ensure the implementation reads only from the stored
	// region.
	bli_randm( &c );
	bli_mksymm( &c );
	bli_mktrim( &c );

	// Give the saved copy the same structure and uplo, then save C.
	bli_obj_set_struc( BLIS_SYMMETRIC, c_orig );
	bli_obj_set_uplo( uplo_c, c_orig );
	bli_copym( &c, &c_orig );

	// Normalize A and B by k.
	bli_setsc( 1.0/( double )k, 0.0, &inv_k );
	bli_scalm( &inv_k, &a );
	bli_scalm( &inv_k, &b );

	// Apply the remaining (conjugation/transposition) parameters.
	bli_obj_set_conjtrans( trans_a, a );
	bli_obj_set_conjtrans( trans_b, b );

	// Repeat the experiment, restoring C each time, and keep the
	// fastest timing.
	for ( rep = 0; rep < repeat_count; ++rep )
	{
		bli_copym( &c_orig, &c );

		t_start = bli_clock();

		libblis_test_syr2k_impl( iface, &alpha, &a, &b, &beta, &c );

		best_time = bli_clock_min_diff( best_time, t_start );
	}

	// Estimate the performance of the best repeat; the complex case is
	// credited four times the flops.
	*perf = ( 2.0 * m * m * k ) / best_time / FLOPS_PER_UNIT_PERF;
	if ( bli_obj_is_complex( c ) ) *perf *= 4.0;

	// Compute the residual.
	libblis_test_syr2k_check( &alpha, &a, &b, &beta, &c, &c_orig, resid );

	// Zero out performance and residual if the output matrix is empty.
	libblis_test_check_empty_problem( &c, perf, resid );

	// Free the test objects.
	bli_obj_free( &a );
	bli_obj_free( &b );
	bli_obj_free( &c );
	bli_obj_free( &c_orig );
}
void bli_hemv_int( conj_t conjh, obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y, cntx_t* cntx, hemv_t* cntl ) { varnum_t n; impl_t i; FUNCPTR_T f; obj_t a_local; // Check parameters. if ( bli_error_checking_is_enabled() ) { if ( bli_is_conj( conjh ) ) bli_hemv_check( alpha, a, x, beta, y ); else bli_symv_check( alpha, a, x, beta, y ); } // If y has a zero dimension, return early. if ( bli_obj_has_zero_dim( *y ) ) return; // If x has a zero dimension, scale y by beta and return early. if ( bli_obj_has_zero_dim( *x ) ) { bli_scalm( beta, y ); return; } // Alias A in case we need to induce the upper triangular case. bli_obj_alias_to( *a, a_local ); /* // Our blocked algorithms only [explicitly] implement the lower triangular // case, so if matrix A is stored as upper triangular, we must toggle the // transposition (and conjugation) bits so that the diagonal partitioning // routines grab the correct partitions corresponding to the upper // triangular case. But we only need to do this for blocked algorithms, // since unblocked algorithms are responsible for handling the upper case // explicitly (and they should not be inspecting the transposition bit anyway). if ( bli_cntl_is_blocked( cntl ) && bli_obj_is_upper( *a ) ) { bli_obj_toggle_conj( a_local ); bli_obj_toggle_trans( a_local ); } */ // Extract the variant number and implementation type. n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. f = vars[n][i]; // Invoke the variant. f( conjh, alpha, &a_local, x, beta, y, cntx, cntl ); }
void bli_her2k_front( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, herk_t* cntl ) { obj_t alpha_conj; obj_t c_local; obj_t a_local; obj_t bh_local; obj_t b_local; obj_t ah_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_her2k_check( alpha, a, b, beta, c ); // If alpha is zero, scale by beta and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); return; } // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *c, c_local ); bli_obj_set_as_root( c_local ); // For her2k, the first and second right-hand "B" operands are simply B' // and A'. bli_obj_alias_to( *b, bh_local ); bli_obj_induce_trans( bh_local ); bli_obj_toggle_conj( bh_local ); bli_obj_alias_to( *a, ah_local ); bli_obj_induce_trans( ah_local ); bli_obj_toggle_conj( ah_local ); // Initialize a conjugated copy of alpha. bli_obj_scalar_init_detached_copy_of( bli_obj_datatype( *a ), BLIS_CONJUGATE, alpha, &alpha_conj ); // An optimization: If C is row-stored, transpose the entire operation // so as to allow the macro-kernel more favorable access patterns // through C. (The effect of the transposition of A and A' is negligible // because those operands are always packed to contiguous memory.) if ( bli_obj_is_row_stored( c_local ) ) { bli_obj_swap( a_local, bh_local ); bli_obj_swap( b_local, ah_local ); bli_obj_induce_trans( a_local ); bli_obj_induce_trans( bh_local ); bli_obj_induce_trans( b_local ); bli_obj_induce_trans( ah_local ); bli_obj_induce_trans( c_local ); } #if 0 // Invoke the internal back-end. bli_her2k_int( alpha, &a_local, &bh_local, &alpha_conj, &b_local, &ah_local, beta, &c_local, cntl ); #else // Invoke herk twice, using beta only the first time. bli_herk_int( alpha, &a_local, &bh_local, beta, &c_local, cntl ); bli_herk_int( &alpha_conj, &b_local, &ah_local, &BLIS_ONE, &c_local, cntl ); #endif }
err_t bli_gemmsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ) { // Check parameters. if ( bli_error_checking_is_enabled() ) bli_gemm_check( alpha, a, b, beta, c, cntx ); #if 0 // FGVZ: The datatype-specific variant is now responsible for checking for // alpha == 0.0. // If alpha is zero, scale by beta and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); return BLIS_SUCCESS; } #endif #if 0 // FGVZ: Will this be needed for constructing thrinfo_t's (recall: the // sba needs to be attached to the rntm; see below)? Or will those nodes // just be created "locally," in an exposed manner? // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. bli_rntm_set_ways_for_op ( BLIS_GEMM, BLIS_LEFT, // ignored for gemm/hemm/symm bli_obj_length( &c_local ), bli_obj_width( &c_local ), bli_obj_width( &a_local ), rntm ); // FGVZ: the sba needs to be attached to the rntm. But it needs // to be done in the thread region, since it needs a thread id. //bli_sba_rntm_set_pool( tid, array, rntm_p ); #endif #if 0 // FGVZ: The datatype-specific variant is now responsible for inducing a // transposition, if needed. // Induce transpositions on A and/or B if either object is marked for // transposition. We can induce "fast" transpositions since they objects // are guaranteed to not have structure or be packed. 
if ( bli_obj_has_trans( a ) ) { bli_obj_induce_fast_trans( a ); bli_obj_toggle_trans( a ); } if ( bli_obj_has_trans( b ) ) { bli_obj_induce_fast_trans( b ); bli_obj_toggle_trans( b ); } #endif #if 0 //bli_gemmsup_ref_var2 //bli_gemmsup_ref_var1 #if 0 bli_gemmsup_ref_var1n #else #endif const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b ); const bool_t is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR || stor_id == BLIS_RRC || stor_id == BLIS_RCR || stor_id == BLIS_CRR ); if ( is_rrr_rrc_rcr_crr ) { bli_gemmsup_ref_var2m ( BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm ); } else { bli_gemmsup_ref_var2m ( BLIS_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm ); } #else const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b ); // Don't use the small/unpacked implementation if one of the matrices // uses general stride. if ( stor_id == BLIS_XXX ) return BLIS_FAILURE; const bool_t is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR || stor_id == BLIS_RRC || stor_id == BLIS_RCR || stor_id == BLIS_CRR ); const bool_t is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr; const num_t dt = bli_obj_dt( c ); const bool_t row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); const bool_t is_primary = ( row_pref ? 
is_rrr_rrc_rcr_crr : is_rcc_crc_ccr_ccc ); if ( is_primary ) { // This branch handles: // - rrr rrc rcr crr for row-preferential kernels // - rcc crc ccr ccc for column-preferential kernels const dim_t m = bli_obj_length( c ); const dim_t n = bli_obj_width( c ); const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t mu = m / MR; const dim_t nu = n / NR; if ( mu >= nu ) { // block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2() bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm ); } else // if ( mu < nu ) { // panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1() bli_gemmsup_ref_var1n( BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm ); } } else { // This branch handles: // - rrr rrc rcr crr for column-preferential kernels // - rcc crc ccr ccc for row-preferential kernels const dim_t mt = bli_obj_width( c ); const dim_t nt = bli_obj_length( c ); const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t mu = mt / MR; const dim_t nu = nt / NR; if ( mu >= nu ) { // panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans bli_gemmsup_ref_var2m( BLIS_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm ); } else // if ( mu < nu ) { // block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans bli_gemmsup_ref_var1n( BLIS_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm ); } // *requires nudging of mc,nc up to be a multiple of nr,mr. } #endif // Return success so that the caller knows that we computed the solution. return BLIS_SUCCESS; }
void bli_trmm3_front( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, trmm_t* l_cntl, trmm_t* r_cntl ) { trmm_t* cntl; obj_t a_local; obj_t b_local; obj_t c_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_trmm3_check( side, alpha, a, b, beta, c ); // If alpha is zero, scale by beta and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); return; } // Alias A, B, and C so we can tweak the objects if necessary. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *c, c_local ); // We do not explicitly implement the cases where A is transposed. // However, we can still handle them. Specifically, if A is marked as // needing a transposition, we simply induce a transposition. This // allows us to only explicitly implement the no-transpose cases. Once // the transposition is induced, the correct algorithm will be called, // since, for example, an algorithm over a transposed lower triangular // matrix A moves in the same direction (forwards) as a non-transposed // upper triangular matrix. And with the transposition induced, the // matrix now appears to be upper triangular, so the upper triangular // algorithm will grab the correct partitions, as if it were upper // triangular (with no transpose) all along. if ( bli_obj_has_trans( a_local ) ) { bli_obj_induce_trans( a_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, a_local ); } #if 0 if ( bli_is_right( side ) ) { bli_obj_induce_trans( a_local ); bli_obj_induce_trans( b_local ); bli_obj_induce_trans( c_local ); bli_toggle_side( side ); } #endif #if 1 // If A is being multiplied from the right, swap A and B so that // the matrix will actually be on the right. if ( bli_is_right( side ) ) { bli_obj_swap( a_local, b_local ); } // An optimization: If C is row-stored, transpose the entire operation // so as to allow the macro-kernel more favorable access patterns // through C. 
(The effect of the transposition of A and B is negligible // because those operands are always packed to contiguous memory.) if ( bli_obj_is_row_stored( c_local ) ) { bli_obj_swap( a_local, b_local ); bli_obj_induce_trans( a_local ); bli_obj_induce_trans( b_local ); bli_obj_induce_trans( c_local ); bli_toggle_side( side ); } #endif // Set each alias as the root object. // NOTE: We MUST wait until we are done potentially swapping the objects // before setting the root fields! bli_obj_set_as_root( a_local ); bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); // Choose the control tree. if ( bli_is_left( side ) ) cntl = l_cntl; else cntl = r_cntl; trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( FALSE ); dim_t n_threads = thread_num_threads( infos[0] ); // Invoke the internal back-end. bli_level3_thread_decorator( n_threads, (level3_int_t) bli_trmm_int, alpha, &a_local, &b_local, beta, &c_local, (void*) cntl, (void**) infos ); bli_trmm_thrinfo_free_paths( infos, n_threads ); }
void bli_gemm_front( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, gemm_t* cntl ) { obj_t a_local; obj_t b_local; obj_t c_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_gemm_check( alpha, a, b, beta, c ); // If alpha is zero, scale by beta and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); return; } // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *c, c_local ); // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. if ( ( bli_obj_is_row_stored( c_local ) && bli_func_prefers_contig_cols( bli_obj_datatype( c_local ), bli_gemm_cntl_ukrs( cntl ) ) ) || ( bli_obj_is_col_stored( c_local ) && bli_func_prefers_contig_rows( bli_obj_datatype( c_local ), bli_gemm_cntl_ukrs( cntl ) ) ) ) { bli_obj_swap( a_local, b_local ); bli_obj_induce_trans( a_local ); bli_obj_induce_trans( b_local ); bli_obj_induce_trans( c_local ); } gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths(); dim_t n_threads = thread_num_threads( infos[0] ); // Invoke the internal back-end. bli_level3_thread_decorator( n_threads, (level3_int_t) bli_gemm_int, alpha, &a_local, &b_local, beta, &c_local, (void*) cntl, (void**) infos ); bli_gemm_thrinfo_free_paths( infos, n_threads ); #ifdef BLIS_ENABLE_FLOP_COUNT // Increment the global flop counter. bli_flop_count_inc( 2.0 * bli_obj_length( *c ) * bli_obj_width( *c ) * bli_obj_width_after_trans( a_local ) * ( bli_obj_is_complex( *c ) ? 4.0 : 1.0 ) ); #endif }
void bli_gemm_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl, thrinfo_t* thread ) { obj_t a_local; obj_t b_local; obj_t c_local; gemm_voft f; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_gemm_basic_check( alpha, a, b, beta, c, cntx ); // If C has a zero dimension, return early. if ( bli_obj_has_zero_dim( *c ) ) return; // If A or B has a zero dimension, scale C by beta and return early. if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *b ) ) { if ( bli_thread_am_ochief( thread ) ) bli_scalm( beta, c ); bli_thread_obarrier( thread ); return; } // If A or B is marked as being filled with zeros, scale C by beta and // return early. if ( bli_obj_is_zeros( *a ) || bli_obj_is_zeros( *b ) ) { // This should never execute. bli_abort(); if ( bli_thread_am_ochief( thread ) ) bli_scalm( beta, c ); bli_thread_obarrier( thread ); return; } // Alias A, B, and C in case we need to update attached scalars. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *c, c_local ); // If alpha is non-unit, typecast and apply it to the scalar attached // to B. if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { bli_obj_scalar_apply_scalar( alpha, &b_local ); } // If beta is non-unit, typecast and apply it to the scalar attached // to C. if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { bli_obj_scalar_apply_scalar( beta, &c_local ); } // Create the next node in the thrinfo_t structure. bli_thrinfo_grow( cntx, cntl, thread ); // Extract the function pointer from the current control tree node. f = bli_cntl_var_func( cntl ); // Somewhat hackish support for 3m3, 3m2, and 4m1b method implementations. 
{ ind_t im = bli_cntx_get_ind_method( cntx ); if ( im != BLIS_NAT ) { if ( im == BLIS_3M3 && f == bli_gemm_packa ) f = bli_gemm3m3_packa; else if ( im == BLIS_3M2 && f == bli_gemm_ker_var2 ) f = bli_gemm3m2_ker_var2; else if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2; } } // Invoke the variant. f ( &a_local, &b_local, &c_local, cntx, cntl, thread ); }
void bli_gemm_int( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, gemm_t* cntl, gemm_thrinfo_t* thread ) { obj_t a_local; obj_t b_local; obj_t c_local; varnum_t n; impl_t i; FUNCPTR_T f; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_gemm_int_check( alpha, a, b, beta, c, cntl ); // If C has a zero dimension, return early. if ( bli_obj_has_zero_dim( *c ) ) return; // If A or B has a zero dimension, scale C by beta and return early. if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *b ) ) { if( thread_am_ochief( thread ) ) bli_scalm( beta, c ); thread_obarrier( thread ); return; } // If A or B is marked as being filled with zeros, scale C by beta and // return early. if ( bli_obj_is_zeros( *a ) || bli_obj_is_zeros( *b ) ) { if( thread_am_ochief( thread ) ) bli_scalm( beta, c ); thread_obarrier( thread ); return; } // Alias A and B in case we need to update attached scalars. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); // Alias C in case we need to induce a transposition. bli_obj_alias_to( *c, c_local ); // If we are about to call a leaf-level implementation, and matrix C // still needs a transposition, then we must induce one by swapping the // strides and dimensions. Note that this transposition would normally // be handled explicitly in the packing of C, but if C is not being // packed, this is our last chance to handle the transposition. if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { //if( thread_am_ochief( thread ) ) { bli_obj_induce_trans( c_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); // } } // If alpha is non-unit, typecast and apply it to the scalar attached // to B. if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { bli_obj_scalar_apply_scalar( alpha, &b_local ); } // If beta is non-unit, typecast and apply it to the scalar attached // to C. 
if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { bli_obj_scalar_apply_scalar( beta, &c_local ); } // Extract the variant number and implementation type. n = cntl_var_num( cntl ); i = cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. f = vars[n][i]; // Invoke the variant. f( &a_local, &b_local, &c_local, cntl, thread ); }
void bli_trsm_front( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, trsm_t* l_cntl, trsm_t* r_cntl ) { trsm_t* cntl; obj_t a_local; obj_t b_local; obj_t c_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_trsm_check( side, alpha, a, b, &BLIS_ZERO, b, cntx ); // If alpha is zero, scale by beta and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( alpha, b ); return; } // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. bli_mem_reinit( cntx ); // Alias A and B so we can tweak the objects if necessary. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *b, c_local ); // We do not explicitly implement the cases where A is transposed. // However, we can still handle them. Specifically, if A is marked as // needing a transposition, we simply induce a transposition. This // allows us to only explicitly implement the no-transpose cases. Once // the transposition is induced, the correct algorithm will be called, // since, for example, an algorithm over a transposed lower triangular // matrix A moves in the same direction (forwards) as a non-transposed // upper triangular matrix. And with the transposition induced, the // matrix now appears to be upper triangular, so the upper triangular // algorithm will grab the correct partitions, as if it were upper // triangular (with no transpose) all along. if ( bli_obj_has_trans( a_local ) ) { bli_obj_induce_trans( a_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, a_local ); } #if 0 // If A is being solved against from the right, transpose all operands // so that we can perform the computation as if A were being solved // from the left. 
if ( bli_is_right( side ) ) { bli_toggle_side( side ); bli_obj_induce_trans( a_local ); bli_obj_induce_trans( b_local ); bli_obj_induce_trans( c_local ); } #else // If A is being solved against from the right, swap A and B so that // the triangular matrix will actually be on the right. if ( bli_is_right( side ) ) { bli_obj_swap( a_local, b_local ); } #endif // Set each alias as the root object. // NOTE: We MUST wait until we are done potentially swapping the objects // before setting the root fields! bli_obj_set_as_root( a_local ); bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); // Choose the control tree. if ( bli_is_left( side ) ) cntl = l_cntl; else cntl = r_cntl; trsm_thrinfo_t** infos = bli_create_trsm_thrinfo_paths( bli_is_right( side ) ); dim_t n_threads = thread_num_threads( infos[0] ); // Invoke the internal back-end. bli_level3_thread_decorator( n_threads, (l3_int_t) bli_trsm_int, alpha, &a_local, &b_local, alpha, &c_local, (void*) cntx, (void*) cntl, (void**) infos ); bli_trsm_thrinfo_free_paths( infos, n_threads ); }
void bli_trsm_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { obj_t a_local; obj_t b_local; obj_t c_local; trsm_var_oft f; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_gemm_basic_check( alpha, a, b, beta, c, cntx ); // If C has a zero dimension, return early. if ( bli_obj_has_zero_dim( c ) ) return; // If A or B has a zero dimension, scale C by beta and return early. if ( bli_obj_has_zero_dim( a ) || bli_obj_has_zero_dim( b ) ) { if ( bli_thread_am_ochief( thread ) ) bli_scalm( beta, c ); bli_thread_obarrier( thread ); return; } // Alias A and B in case we need to update attached scalars. bli_obj_alias_to( a, &a_local ); bli_obj_alias_to( b, &b_local ); // Alias C in case we need to induce a transposition. bli_obj_alias_to( c, &c_local ); // If we are about to call a leaf-level implementation, and matrix C // still needs a transposition, then we must induce one by swapping the // strides and dimensions. Note that this transposition would normally // be handled explicitly in the packing of C, but if C is not being // packed, this is our last chance to handle the transposition. if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( c ) ) { bli_obj_induce_trans( &c_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &c_local ); } // If beta is non-unit, apply it to the scalar attached to C. if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { bli_obj_scalar_apply_scalar( beta, &c_local ); } // Set two bools: one based on the implied side parameter (the structure // of the root object) and one based on the uplo field of the triangular // matrix's root object (whether that is matrix A or matrix B). if ( bli_obj_root_is_triangular( a ) ) { // If alpha is non-unit, typecast and apply it to the scalar // attached to B (the non-triangular matrix). 
if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { bli_obj_scalar_apply_scalar( alpha, &b_local ); } } else // if ( bli_obj_root_is_triangular( b ) ) { // If alpha is non-unit, typecast and apply it to the scalar // attached to A (the non-triangular matrix). if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { bli_obj_scalar_apply_scalar( alpha, &a_local ); } } // FGVZ->TMS: Is this barrier still needed? bli_thread_obarrier( thread ); // Create the next node in the thrinfo_t structure. bli_thrinfo_grow( rntm, cntl, thread ); // Extract the function pointer from the current control tree node. f = bli_cntl_var_func( cntl ); // Invoke the variant. f ( &a_local, &b_local, &c_local, cntx, rntm, cntl, thread ); }
void bli_her2k_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ) { bli_init_once(); obj_t alpha_conj; obj_t c_local; obj_t a_local; obj_t bh_local; obj_t b_local; obj_t ah_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_her2k_check( alpha, a, b, beta, c, cntx ); // If alpha is zero, scale by beta, zero the imaginary components of // the diagonal elements, and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); bli_setid( &BLIS_ZERO, c ); return; } // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( a, &a_local ); bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); bli_obj_set_as_root( &c_local ); // For her2k, the first and second right-hand "B" operands are simply B' // and A'. bli_obj_alias_to( b, &bh_local ); bli_obj_induce_trans( &bh_local ); bli_obj_toggle_conj( &bh_local ); bli_obj_alias_to( a, &ah_local ); bli_obj_induce_trans( &ah_local ); bli_obj_toggle_conj( &ah_local ); // Initialize a conjugated copy of alpha. bli_obj_scalar_init_detached_copy_of( bli_obj_dt( a ), BLIS_CONJUGATE, alpha, &alpha_conj ); // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_swap( &a_local, &bh_local ); bli_obj_swap( &b_local, &ah_local ); bli_obj_induce_trans( &a_local ); bli_obj_induce_trans( &bh_local ); bli_obj_induce_trans( &b_local ); bli_obj_induce_trans( &ah_local ); bli_obj_induce_trans( &c_local ); } // Record the threading for each level within the context. 
bli_cntx_set_thrloop_from_env( BLIS_HER2K, BLIS_LEFT, cntx, bli_obj_length( &c_local ), bli_obj_width( &c_local ), bli_obj_width( &a_local ) ); // Invoke herk twice, using beta only the first time. // Invoke the internal back-end. bli_l3_thread_decorator ( bli_gemm_int, BLIS_HERK, // operation family id alpha, &a_local, &bh_local, beta, &c_local, cntx, cntl ); bli_l3_thread_decorator ( bli_gemm_int, BLIS_HERK, // operation family id &alpha_conj, &b_local, &ah_local, &BLIS_ONE, &c_local, cntx, cntl ); // The Hermitian rank-2k product was computed as A*B'+B*A', even for // the diagonal elements. Mathematically, the imaginary components of // diagonal elements of a Hermitian rank-2k product should always be // zero. However, in practice, they sometimes accumulate meaningless // non-zero values. To prevent this, we explicitly set those values // to zero before returning. bli_setid( &BLIS_ZERO, &c_local ); }
void bli_trmm_int( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, trmm_t* cntl ) { obj_t a_local; obj_t b_local; obj_t c_local; bool_t side, uplo; varnum_t n; impl_t i; FUNCPTR_T f; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_trmm_int_check( alpha, a, b, beta, c, cntl ); // If C has a zero dimension, return early. if ( bli_obj_has_zero_dim( *c ) ) return; // If A or B has a zero dimension, scale C by beta and return early. if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *b ) ) { bli_scalm( beta, c ); return; } // Alias A and B in case we need to update attached scalars. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); // Alias C in case we need to induce a transposition. bli_obj_alias_to( *c, c_local ); // If we are about to call a leaf-level implementation, and matrix C // still needs a transposition, then we must induce one by swapping the // strides and dimensions. Note that this transposition would normally // be handled explicitly in the packing of C, but if C is not being // packed, this is our last chance to handle the transposition. if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { bli_obj_induce_trans( c_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); } // If alpha is non-unit, typecast and apply it to the scalar attached // to B. if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { bli_obj_scalar_apply_scalar( alpha, &b_local ); } // If beta is non-unit, typecast and apply it to the scalar attached // to C. if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { bli_obj_scalar_apply_scalar( beta, &c_local ); } // Set two bools: one based on the implied side parameter (the structure // of the root object) and one based on the uplo field of the triangular // matrix's root object (whether that is matrix A or matrix B). 
if ( bli_obj_root_is_triangular( *a ) ) { side = 0; if ( bli_obj_root_is_lower( *a ) ) uplo = 0; else uplo = 1; } else // if ( bli_obj_root_is_triangular( *b ) ) { side = 1; // Set a bool based on the uplo field of A's root object. if ( bli_obj_root_is_lower( *b ) ) uplo = 0; else uplo = 1; } // Extract the variant number and implementation type. n = cntl_var_num( cntl ); i = cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. f = vars[side][uplo][n][i]; // Invoke the variant. f( &a_local, &b_local, &c_local, cntl ); }
//
// libblis_test_gemm_experiment
//
// Runs one gemm test point: builds the operands for problem size p_cur,
// times the implementation params->n_repeats times (keeping the minimum),
// then reports the estimated performance in *perf and the residual from
// libblis_test_gemm_check() in *resid.
//
void libblis_test_gemm_experiment( test_params_t* params,
                                   test_op_t*     op,
                                   iface_t        iface,
                                   num_t          datatype,
                                   char*          pc_str,
                                   char*          sc_str,
                                   unsigned int   p_cur,
                                   double*        perf,
                                   double*        resid )
{
	unsigned int n_repeats = params->n_repeats;
	unsigned int rep;

	double       best_time = 1e9;
	double       t_start;

	double       alpha_imag;
	double       beta_imag;

	dim_t        m, n, k;

	trans_t      transa, transb;

	obj_t        kappa, alpha, beta;
	obj_t        a, b, c, c_save;

	// Translate the three dimension specifiers into m, n, and k.
	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );
	k = libblis_test_get_dim_from_prob_size( op->dim_spec[2], p_cur );

	// Translate the parameter characters into trans_t values.
	bli_param_map_char_to_blis_trans( pc_str[0], &transa );
	bli_param_map_char_to_blis_trans( pc_str[1], &transb );

	// Scalars used by the test.
	bli_obj_scalar_init_detached( datatype, &kappa );
	bli_obj_scalar_init_detached( datatype, &alpha );
	bli_obj_scalar_init_detached( datatype, &beta );

	// Matrix operands; c_save shares C's storage spec and dimensions.
	libblis_test_mobj_create( params, datatype, transa,
	                          sc_str[0], m, k, &a );
	libblis_test_mobj_create( params, datatype, transb,
	                          sc_str[1], k, n, &b );
	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
	                          sc_str[2], m, n, &c );
	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
	                          sc_str[2], m, n, &c_save );

	// alpha and beta carry an imaginary part only when C is not real.
	if ( bli_obj_is_real( c ) )
	{
		alpha_imag = 0.0;
		beta_imag  = 0.0;
	}
	else
	{
		alpha_imag = 0.8;
		beta_imag  = 1.0;
	}
	bli_setsc(  1.2, alpha_imag, &alpha );
	bli_setsc( -1.0, beta_imag,  &beta );

	// Fill A, B, and C with random values and keep a copy of C.
	bli_randm( &a );
	bli_randm( &b );
	bli_randm( &c );
	bli_copym( &c, &c_save );

	// Scale A and B by 1/k.
	bli_setsc( 1.0/( double )k, 0.0, &kappa );
	bli_scalm( &kappa, &a );
	bli_scalm( &kappa, &b );

	// Attach the requested conjugation/transposition to A and B.
	bli_obj_set_conjtrans( transa, a );
	bli_obj_set_conjtrans( transb, b );

	// Timed repeats: restore C, run once, and keep the smallest time.
	for ( rep = 0; rep < n_repeats; ++rep )
	{
		bli_copym( &c_save, &c );

		t_start = bli_clock();

		libblis_test_gemm_impl( iface, &alpha, &a, &b, &beta, &c );

		best_time = bli_clock_min_diff( best_time, t_start );
	}

	// Performance estimate from the best repeat: 2*m*n*k flops, with a
	// factor of four for complex datatypes.
	*perf = ( 2.0 * m * n * k ) / best_time / FLOPS_PER_UNIT_PERF;
	if ( bli_obj_is_complex( c ) )
	{
		*perf *= 4.0;
	}

	// Compute the residual.
	libblis_test_gemm_check( &alpha, &a, &b, &beta, &c, &c_save, resid );

	// Zero out performance and residual if the output matrix is empty.
	libblis_test_check_empty_problem( &c, perf, resid );

	// Release the matrix operands.
	bli_obj_free( &a );
	bli_obj_free( &b );
	bli_obj_free( &c );
	bli_obj_free( &c_save );
}
void bli_herk_front( obj_t* alpha, obj_t* a, obj_t* beta, obj_t* c, cntx_t* cntx, gemm_t* cntl ) { obj_t a_local; obj_t ah_local; obj_t c_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_herk_check( alpha, a, beta, c, cntx ); // If alpha is zero, scale by beta, zero the imaginary components of // the diagonal elements, and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); bli_setid( &BLIS_ZERO, c ); return; } // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. bli_mem_reinit( cntx ); // Alias A and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *c, c_local ); bli_obj_set_as_root( c_local ); // For herk, the right-hand "B" operand is simply A'. bli_obj_alias_to( *a, ah_local ); bli_obj_induce_trans( ah_local ); bli_obj_toggle_conj( ah_local ); // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_toggle_conj( a_local ); bli_obj_toggle_conj( ah_local ); bli_obj_induce_trans( c_local ); } thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HERK, BLIS_LEFT ); dim_t n_threads = bli_thread_num_threads( infos[0] ); // Invoke the internal back-end. bli_l3_thread_decorator( n_threads, (l3_int_t) bli_herk_int, alpha, &a_local, &ah_local, beta, &c_local, (void*) cntx, (void*) cntl, (void**) infos ); bli_l3_thrinfo_free_paths( infos, n_threads ); // The Hermitian rank-k product was computed as A*A', even for the // diagonal elements. Mathematically, the imaginary components of // diagonal elements of a Hermitian rank-k product should always be // zero. 
However, in practice, they sometimes accumulate meaningless // non-zero values. To prevent this, we explicitly set those values // to zero before returning. bli_setid( &BLIS_ZERO, &c_local ); }
//
// libblis_test_trmm3_experiment
//
// Runs one trmm3 test point: builds a triangular A plus general B and C
// for problem size p_cur, times the implementation params->n_repeats times
// (keeping the minimum), then reports the estimated performance in *perf
// and the residual from libblis_test_trmm3_check() in *resid.
//
void libblis_test_trmm3_experiment( test_params_t* params,
                                    test_op_t*     op,
                                    mt_impl_t      impl,
                                    num_t          datatype,
                                    char*          pc_str,
                                    char*          sc_str,
                                    unsigned int   p_cur,
                                    double*        perf,
                                    double*        resid )
{
	unsigned int n_repeats = params->n_repeats;
	unsigned int rep;

	double       best_time = 1e9;
	double       t_start;

	double       alpha_imag;
	double       beta_imag;

	dim_t        m, n;
	dim_t        mn_side;

	side_t       side;
	uplo_t       uploa;
	trans_t      transa;
	diag_t       diaga;
	trans_t      transb;

	obj_t        kappa, alpha, beta;
	obj_t        a, b, c, c_save;

	// Translate the dimension specifiers into m and n.
	m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
	n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur );

	// Translate the five parameter characters into BLIS constants.
	bli_param_map_char_to_blis_side( pc_str[0], &side );
	bli_param_map_char_to_blis_uplo( pc_str[1], &uploa );
	bli_param_map_char_to_blis_trans( pc_str[2], &transa );
	bli_param_map_char_to_blis_diag( pc_str[3], &diaga );
	bli_param_map_char_to_blis_trans( pc_str[4], &transb );

	// Scalars used by the test.
	bli_obj_scalar_init_detached( datatype, &kappa );
	bli_obj_scalar_init_detached( datatype, &alpha );
	bli_obj_scalar_init_detached( datatype, &beta );

	// A is mn_side x mn_side, where mn_side is set from side, m, and n;
	// B, C, and the saved copy of C are m x n.
	bli_set_dim_with_side( side, m, n, mn_side );
	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
	                          sc_str[0], mn_side, mn_side, &a );
	libblis_test_mobj_create( params, datatype, transb,
	                          sc_str[1], m, n, &b );
	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
	                          sc_str[2], m, n, &c );
	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
	                          sc_str[2], m, n, &c_save );

	// alpha and beta carry an imaginary part only when C is not real.
	if ( bli_obj_is_real( c ) )
	{
		alpha_imag = 0.0;
		beta_imag  = 0.0;
	}
	else
	{
		alpha_imag = 0.6;
		beta_imag  = 0.5;
	}
	bli_setsc(  0.8, alpha_imag, &alpha );
	bli_setsc( -1.0, beta_imag,  &beta );

	// Mark A as triangular with the requested uplo.
	bli_obj_set_struc( BLIS_TRIANGULAR, a );
	bli_obj_set_uplo( uploa, a );

	// Randomize A and make it densely triangular.
	bli_randm( &a );
	bli_mktrim( &a );

	// Randomize B and C, and keep a copy of C.
	bli_randm( &b );
	bli_randm( &c );
	bli_copym( &c, &c_save );

	// Scale B by 1/m.
	bli_setsc( 1.0/( double )m, 0.0, &kappa );
	bli_scalm( &kappa, &b );

	// Attach the remaining parameters to A and B.
	bli_obj_set_conjtrans( transa, a );
	bli_obj_set_diag( diaga, a );
	bli_obj_set_conjtrans( transb, b );

	// Timed repeats: restore C, run once, and keep the smallest time.
	for ( rep = 0; rep < n_repeats; ++rep )
	{
		bli_copym( &c_save, &c );

		t_start = bli_clock();

		libblis_test_trmm3_impl( impl, side, &alpha, &a, &b, &beta, &c );

		best_time = bli_clock_min_diff( best_time, t_start );
	}

	// Performance estimate from the best repeat: mn_side*m*n flops, with
	// a factor of four for complex datatypes.
	*perf = ( 1.0 * mn_side * m * n ) / best_time / FLOPS_PER_UNIT_PERF;
	if ( bli_obj_is_complex( c ) )
	{
		*perf *= 4.0;
	}

	// Compute the residual.
	libblis_test_trmm3_check( side, &alpha, &a, &b, &beta, &c, &c_save, resid );

	// Zero out performance and residual if the output matrix is empty.
	libblis_test_check_empty_problem( &c, perf, resid );

	// Release the matrix operands.
	bli_obj_free( &a );
	bli_obj_free( &b );
	bli_obj_free( &c );
	bli_obj_free( &c_save );
}
void bli_symm_front( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, gemm_t* cntl ) { obj_t a_local; obj_t b_local; obj_t c_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_symm_check( side, alpha, a, b, beta, c ); // If alpha is zero, scale by beta and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); return; } // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *c, c_local ); // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. if ( ( bli_obj_is_row_stored( c_local ) && bli_func_prefers_contig_cols( bli_obj_datatype( c_local ), cntl_gemm_ukrs( cntl ) ) ) || ( bli_obj_is_col_stored( c_local ) && bli_func_prefers_contig_rows( bli_obj_datatype( c_local ), cntl_gemm_ukrs( cntl ) ) ) ) { bli_toggle_side( side ); bli_obj_induce_trans( b_local ); bli_obj_induce_trans( c_local ); } // Swap A and B if multiplying A from the right so that "B" contains // the symmetric matrix. if ( bli_is_right( side ) ) { bli_obj_swap( a_local, b_local ); } gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths(); dim_t n_threads = thread_num_threads( infos[0] ); // Invoke the internal back-end. bli_level3_thread_decorator( n_threads, (level3_int_t) bli_gemm_int, alpha, &a_local, &b_local, beta, &c_local, (void*) cntl, (void**) infos ); bli_gemm_thrinfo_free_paths( infos, n_threads ); }
void bli_symm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ) { obj_t a_local; obj_t b_local; obj_t c_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_symm_check( side, alpha, a, b, beta, c, cntx ); // If alpha is zero, scale by beta and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); return; } // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. bli_memsys_reinit( cntx ); // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *c, c_local ); // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_toggle_side( side ); bli_obj_induce_trans( b_local ); bli_obj_induce_trans( c_local ); } // Swap A and B if multiplying A from the right so that "B" contains // the symmetric matrix. if ( bli_is_right( side ) ) { bli_obj_swap( a_local, b_local ); } // Set the operation family id in the context. bli_cntx_set_family( BLIS_GEMM, cntx ); // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_SYMM, BLIS_LEFT, cntx ); // Invoke the internal back-end. bli_l3_thread_decorator ( bli_gemm_int, alpha, &a_local, &b_local, beta, &c_local, cntx, cntl ); }
void bli_gemm_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ) { #ifdef BLIS_SMALL_MATRIX_ENABLE #ifndef BLIS_ENABLE_MULTITHREADING gint_t status = bli_gemm_small_matrix(alpha, a, b, beta, c, cntx, cntl); if(BLIS_SUCCESS != status) #endif #endif { obj_t a_local; obj_t b_local; obj_t c_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_gemm_check( alpha, a, b, beta, c, cntx ); // If alpha is zero, scale by beta and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); return; } // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. bli_memsys_reinit( cntx ); // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *c, c_local ); // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_swap( a_local, b_local ); bli_obj_induce_trans( a_local ); bli_obj_induce_trans( b_local ); bli_obj_induce_trans( c_local ); } // Set the operation family id in the context. bli_cntx_set_family( BLIS_GEMM, cntx ); // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx, bli_obj_length( c_local ), bli_obj_width( c_local ), bli_obj_width( a_local ) ); // Invoke the internal back-end via the thread handler. bli_l3_thread_decorator ( bli_gemm_int, alpha, &a_local, &b_local, beta, &c_local, cntx, cntl ); } }
void bli_her2k_front( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, gemm_t* cntl ) { obj_t alpha_conj; obj_t c_local; obj_t a_local; obj_t bh_local; obj_t b_local; obj_t ah_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_her2k_check( alpha, a, b, beta, c ); // If alpha is zero, scale by beta, zero the imaginary components of // the diagonal elements, and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); bli_setid( &BLIS_ZERO, c ); return; } // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *c, c_local ); bli_obj_set_as_root( c_local ); // For her2k, the first and second right-hand "B" operands are simply B' // and A'. bli_obj_alias_to( *b, bh_local ); bli_obj_induce_trans( bh_local ); bli_obj_toggle_conj( bh_local ); bli_obj_alias_to( *a, ah_local ); bli_obj_induce_trans( ah_local ); bli_obj_toggle_conj( ah_local ); // Initialize a conjugated copy of alpha. bli_obj_scalar_init_detached_copy_of( bli_obj_datatype( *a ), BLIS_CONJUGATE, alpha, &alpha_conj ); // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. if ( ( bli_obj_is_row_stored( c_local ) && bli_func_prefers_contig_cols( bli_obj_datatype( c_local ), bli_gemm_cntl_ukrs( cntl ) ) ) || ( bli_obj_is_col_stored( c_local ) && bli_func_prefers_contig_rows( bli_obj_datatype( c_local ), bli_gemm_cntl_ukrs( cntl ) ) ) ) { bli_obj_swap( a_local, bh_local ); bli_obj_swap( b_local, ah_local ); bli_obj_induce_trans( a_local ); bli_obj_induce_trans( bh_local ); bli_obj_induce_trans( b_local ); bli_obj_induce_trans( ah_local ); bli_obj_induce_trans( c_local ); } #if 0 // Invoke the internal back-end. 
bli_her2k_int( alpha, &a_local, &bh_local, &alpha_conj, &b_local, &ah_local, beta, &c_local, cntl ); #else // Invoke herk twice, using beta only the first time. herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths(); dim_t n_threads = thread_num_threads( infos[0] ); // Invoke the internal back-end. bli_level3_thread_decorator( n_threads, (level3_int_t) bli_herk_int, alpha, &a_local, &bh_local, beta, &c_local, (void*) cntl, (void**) infos ); bli_level3_thread_decorator( n_threads, (level3_int_t) bli_herk_int, &alpha_conj, &b_local, &ah_local, &BLIS_ONE, &c_local, (void*) cntl, (void**) infos ); bli_herk_thrinfo_free_paths( infos, n_threads ); #endif // The Hermitian rank-2k product was computed as A*B'+B*A', even for // the diagonal elements. Mathematically, the imaginary components of // diagonal elements of a Hermitian rank-2k product should always be // zero. However, in practice, they sometimes accumulate meaningless // non-zero values. To prevent this, we explicitly set those values // to zero before returning. bli_setid( &BLIS_ZERO, &c_local ); }
void bli_trmm3_front( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, gemm_t* cntl ) { obj_t a_local; obj_t b_local; obj_t c_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_trmm3_check( side, alpha, a, b, beta, c ); // If alpha is zero, scale by beta and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); return; } // Alias A, B, and C so we can tweak the objects if necessary. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *c, c_local ); // We do not explicitly implement the cases where A is transposed. // However, we can still handle them. Specifically, if A is marked as // needing a transposition, we simply induce a transposition. This // allows us to only explicitly implement the no-transpose cases. Once // the transposition is induced, the correct algorithm will be called, // since, for example, an algorithm over a transposed lower triangular // matrix A moves in the same direction (forwards) as a non-transposed // upper triangular matrix. And with the transposition induced, the // matrix now appears to be upper triangular, so the upper triangular // algorithm will grab the correct partitions, as if it were upper // triangular (with no transpose) all along. if ( bli_obj_has_trans( a_local ) ) { bli_obj_induce_trans( a_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, a_local ); } #if 0 // If A is being multiplied from the right, transpose all operands // so that we can perform the computation as if A were being multiplied // from the left. 
if ( bli_is_right( side ) ) { bli_toggle_side( side ); bli_obj_induce_trans( a_local ); bli_obj_induce_trans( b_local ); bli_obj_induce_trans( c_local ); } #else // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. if ( ( bli_obj_is_row_stored( c_local ) && bli_func_prefers_contig_cols( bli_obj_datatype( c_local ), bli_gemm_cntl_ukrs( cntl ) ) ) || ( bli_obj_is_col_stored( c_local ) && bli_func_prefers_contig_rows( bli_obj_datatype( c_local ), bli_gemm_cntl_ukrs( cntl ) ) ) ) { bli_toggle_side( side ); bli_obj_induce_trans( a_local ); bli_obj_induce_trans( b_local ); bli_obj_induce_trans( c_local ); } // If A is being multiplied from the right, swap A and B so that // the matrix will actually be on the right. if ( bli_is_right( side ) ) { bli_obj_swap( a_local, b_local ); } #endif // Set each alias as the root object. // NOTE: We MUST wait until we are done potentially swapping the objects // before setting the root fields! bli_obj_set_as_root( a_local ); bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); // Notice that, unlike trmm_r, there is no dependency in the jc loop // for trmm3_r, so we can pass in FALSE for jc_dependency. trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( FALSE ); dim_t n_threads = thread_num_threads( infos[0] ); // Invoke the internal back-end. bli_level3_thread_decorator( n_threads, (level3_int_t) bli_trmm_int, alpha, &a_local, &b_local, beta, &c_local, (void*) cntl, (void**) infos ); bli_trmm_thrinfo_free_paths( infos, n_threads ); }
//
// libblis_test_gemmtrsm_ukr_experiment
//
// Runs one gemmtrsm micro-kernel test point. m and n are fixed to the
// blocksizes returned for gemm_mr and gemm_nr, and k comes from the problem
// size. A (k+m)x(k+m) triangular matrix and a (k+m)xn matrix B are created,
// packed, and partitioned; the micro-kernel is then timed
// params->n_repeats times (keeping the minimum). The estimated performance
// is written to *perf and the residual from
// libblis_test_gemmtrsm_ukr_check() to *resid.
//
void libblis_test_gemmtrsm_ukr_experiment( test_params_t* params,
                                           test_op_t*     op,
                                           mt_impl_t      impl,
                                           num_t          datatype,
                                           char*          pc_str,
                                           char*          sc_str,
                                           unsigned int   p_cur,
                                           double*        perf,
                                           double*        resid )
{
	unsigned int n_repeats = params->n_repeats;
	unsigned int rep;

	double       best_time = 1e9;
	double       t_start;

	dim_t        m, n, k;
	dim_t        b11_offset;

	// Fixed storage specs for A_big ('c') and B ('r'), and a fixed side.
	char         sc_a = 'c';
	char         sc_b = 'r';
	side_t       side = BLIS_LEFT;

	uplo_t       uploa;

	obj_t        kappa, alpha;
	obj_t        a_big, a, b;
	obj_t        b11, c11, c11_save;
	obj_t        ap, bp;
	obj_t        a1xp, a11p, bx1p, b11p;

	// k comes from the problem size; m and n are the MR and NR
	// blocksizes for this datatype.
	k = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
	m = bli_blksz_for_type( datatype, gemm_mr );
	n = bli_blksz_for_type( datatype, gemm_nr );

	// Store the register blocksizes so that the driver can retrieve the
	// values later when printing results.
	op->dim_aux[0] = m;
	op->dim_aux[1] = n;

	// Translate the parameter character into a uplo_t value.
	bli_param_map_char_to_blis_uplo( pc_str[0], &uploa );

	// Scalars used by the test.
	bli_obj_scalar_init_detached( datatype, &kappa );
	bli_obj_scalar_init_detached( datatype, &alpha );

	// Matrix operands.
	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
	                          sc_a,      k+m, k+m, &a_big );
	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
	                          sc_b,      k+m, n,   &b );
	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
	                          sc_str[0], m,   n,   &c11 );
	libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
	                          sc_str[0], m,   n,   &c11_save );

	// alpha is 2 + 0i for both the real and the complex case.
	bli_setsc( 2.0, 0.0, &alpha );

	// Mark A_big as triangular with the requested uplo.
	bli_obj_set_struc( BLIS_TRIANGULAR, a_big );
	bli_obj_set_uplo( uploa, a_big );

	// Randomize A_big.
	bli_randm( &a_big );

	// Randomize B and scale it by 1/m.
	bli_randm( &b );
	bli_setsc( 1.0/( double )m, 0.0, &kappa );
	bli_scalm( &kappa, &b );

	// Use the last m rows of A_big as A.
	bli_acquire_mpart_t2b( BLIS_SUBPART1, k, m, &a_big, &a );

	// Locate the B11 block of B: it starts at row k when A is lower
	// and at row 0 otherwise.
	b11_offset = bli_obj_is_lower( a ) ? k : 0;
	bli_acquire_mpart_t2b( BLIS_SUBPART1, b11_offset, m, &b, &b11 );

	// Copy B11 to C11, and keep a copy of C11.
	bli_copym( &b11, &c11 );
	bli_copym( &c11, &c11_save );

	// Initialize the pack objects.
	bli_obj_init_pack( &ap );
	bli_obj_init_pack( &bp );

	// Create the pack objects for A and B.
	libblis_test_pobj_create( gemm_mr,
	                          gemm_mr,
	                          BLIS_INVERT_DIAG,
	                          BLIS_PACKED_ROW_PANELS,
	                          BLIS_BUFFER_FOR_A_BLOCK,
	                          &a, &ap );
	libblis_test_pobj_create( gemm_mr,
	                          gemm_nr,
	                          BLIS_NO_INVERT_DIAG,
	                          BLIS_PACKED_COL_PANELS,
	                          BLIS_BUFFER_FOR_B_PANEL,
	                          &b, &bp );

	// Pack A into ap and B into bp.
	bli_packm_blk_var3( &a, &ap );
	bli_packm_blk_var2( &b, &bp );

	// Create subpartitions from the packed A and B panels.
	bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp,
	                                &a1xp, &a11p,
	                                &bx1p, &b11p );

	// Timed repeats: restore C11, re-pack B into bp, run once, and keep
	// the smallest time. The restore and re-pack are outside the timed
	// region.
	for ( rep = 0; rep < n_repeats; ++rep )
	{
		bli_copym( &c11_save, &c11 );

		bli_packm_blk_var2( &b, &bp );

		t_start = bli_clock();

		libblis_test_gemmtrsm_ukr_impl( impl, side, &alpha,
		                                &a1xp, &a11p, &bx1p, &b11p, &c11 );

		best_time = bli_clock_min_diff( best_time, t_start );
	}

	// Performance estimate from the best repeat: 2*m*n*k + m*m*n flops,
	// with a factor of four for complex datatypes.
	*perf = ( 2.0 * m * n * k + 1.0 * m * m * n ) / best_time / FLOPS_PER_UNIT_PERF;
	if ( bli_obj_is_complex( b ) )
	{
		*perf *= 4.0;
	}

	// Compute the residual.
	libblis_test_gemmtrsm_ukr_check( side, &alpha,
	                                 &a1xp, &a11p, &bx1p, &b11p,
	                                 &c11, &c11_save, resid );

	// Zero out performance and residual if output matrix is empty.
	//libblis_test_check_empty_problem( &c11, perf, resid );

	// Release the packing buffers held by the pack objects.
	bli_obj_release_pack( &ap );
	bli_obj_release_pack( &bp );

	// Release the matrix operands.
	bli_obj_free( &a_big );
	bli_obj_free( &b );
	bli_obj_free( &c11 );
	bli_obj_free( &c11_save );
}
void bli_syr2k_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ) { bli_init_once(); obj_t c_local; obj_t a_local; obj_t bt_local; obj_t b_local; obj_t at_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_syr2k_check( alpha, a, b, beta, c, cntx ); // If alpha is zero, scale by beta and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); return; } // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( a, &a_local ); bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); bli_obj_set_as_root( &c_local ); // For syr2k, the first and second right-hand "B" operands are simply B' // and A'. bli_obj_alias_to( b, &bt_local ); bli_obj_induce_trans( &bt_local ); bli_obj_alias_to( a, &at_local ); bli_obj_induce_trans( &at_local ); // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_induce_trans( &c_local ); } // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_SYR2K, BLIS_LEFT, cntx, bli_obj_length( &c_local ), bli_obj_width( &c_local ), bli_obj_width( &a_local ) ); // Invoke herk twice, using beta only the first time. // Invoke the internal back-end. bli_l3_thread_decorator ( bli_gemm_int, BLIS_HERK, // operation family id alpha, &a_local, &bt_local, beta, &c_local, cntx, cntl ); bli_l3_thread_decorator ( bli_gemm_int, BLIS_HERK, // operation family id alpha, &b_local, &at_local, &BLIS_ONE, &c_local, cntx, cntl ); }