// Initialize the object pointed to by obj as an m x n object of datatype
// dt with no buffer attached (the buffer field is set to NULL).
//
//   dt   - datatype recorded as the object's datatype, target datatype,
//          and execution datatype.
//   m, n - dimensions stored in the object.
//   obj  - the object to initialize; its fields are overwritten.
void bli_obj_create_without_buffer( num_t dt, dim_t m, dim_t n, obj_t* obj )
{
	siz_t dt_size;
	void* scalar_buf;

	if ( bli_error_checking_is_enabled() )
	{
		bli_obj_create_without_buffer_check( dt, m, n, obj );
	}

	// Size, in bytes, of a single element of datatype dt.
	dt_size = bli_datatype_size( dt );

	// Start from the default property values.
	bli_obj_set_defaults( *obj );

	// A freshly created object is not assumed to be a view into a larger
	// matrix, so it is its own root. Subpartitions and aliases made from
	// this object later receive a copy of the root field, which gives them
	// access to the original ("greatest-grand") parent. A few places in
	// the framework reset the root explicitly via bli_obj_set_as_root();
	// grep for that name under the top-level 'frame' directory to find
	// them.
	bli_obj_set_as_root( *obj );

	// No buffer yet.
	bli_obj_set_buffer( NULL, *obj );

	// Datatype-related fields.
	bli_obj_set_datatype( dt, *obj );
	bli_obj_set_target_datatype( dt, *obj );
	bli_obj_set_execution_datatype( dt, *obj );
	bli_obj_set_elem_size( dt_size, *obj );

	// Dimensions, offsets, and diagonal offset.
	bli_obj_set_dims( m, n, *obj );
	bli_obj_set_offs( 0, 0, *obj );
	bli_obj_set_diag_offset( 0, *obj );

	// Initialize the object's internal scalar to one, using the setter
	// that matches dt. Datatypes other than the four floating-point types
	// leave the internal scalar untouched.
	scalar_buf = bli_obj_internal_scalar_buffer( *obj );

	if ( bli_is_float( dt ) )
	{
		bli_sset1s( *(( float* )scalar_buf) );
	}
	else if ( bli_is_double( dt ) )
	{
		bli_dset1s( *(( double* )scalar_buf) );
	}
	else if ( bli_is_scomplex( dt ) )
	{
		bli_cset1s( *(( scomplex* )scalar_buf) );
	}
	else if ( bli_is_dcomplex( dt ) )
	{
		bli_zset1s( *(( dcomplex* )scalar_buf) );
	}
}
void bli_her2k_front( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, herk_t* cntl ) { obj_t alpha_conj; obj_t c_local; obj_t a_local; obj_t bh_local; obj_t b_local; obj_t ah_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_her2k_check( alpha, a, b, beta, c ); // If alpha is zero, scale by beta and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); return; } // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *c, c_local ); bli_obj_set_as_root( c_local ); // For her2k, the first and second right-hand "B" operands are simply B' // and A'. bli_obj_alias_to( *b, bh_local ); bli_obj_induce_trans( bh_local ); bli_obj_toggle_conj( bh_local ); bli_obj_alias_to( *a, ah_local ); bli_obj_induce_trans( ah_local ); bli_obj_toggle_conj( ah_local ); // Initialize a conjugated copy of alpha. bli_obj_scalar_init_detached_copy_of( bli_obj_datatype( *a ), BLIS_CONJUGATE, alpha, &alpha_conj ); // An optimization: If C is row-stored, transpose the entire operation // so as to allow the macro-kernel more favorable access patterns // through C. (The effect of the transposition of A and A' is negligible // because those operands are always packed to contiguous memory.) if ( bli_obj_is_row_stored( c_local ) ) { bli_obj_swap( a_local, bh_local ); bli_obj_swap( b_local, ah_local ); bli_obj_induce_trans( a_local ); bli_obj_induce_trans( bh_local ); bli_obj_induce_trans( b_local ); bli_obj_induce_trans( ah_local ); bli_obj_induce_trans( c_local ); } #if 0 // Invoke the internal back-end. bli_her2k_int( alpha, &a_local, &bh_local, &alpha_conj, &b_local, &ah_local, beta, &c_local, cntl ); #else // Invoke herk twice, using beta only the first time. bli_herk_int( alpha, &a_local, &bh_local, beta, &c_local, cntl ); bli_herk_int( &alpha_conj, &b_local, &ah_local, &BLIS_ONE, &c_local, cntl ); #endif }
void bli_her2k_front( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, gemm_t* cntl ) { obj_t alpha_conj; obj_t c_local; obj_t a_local; obj_t bh_local; obj_t b_local; obj_t ah_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_her2k_check( alpha, a, b, beta, c ); // If alpha is zero, scale by beta, zero the imaginary components of // the diagonal elements, and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); bli_setid( &BLIS_ZERO, c ); return; } // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *c, c_local ); bli_obj_set_as_root( c_local ); // For her2k, the first and second right-hand "B" operands are simply B' // and A'. bli_obj_alias_to( *b, bh_local ); bli_obj_induce_trans( bh_local ); bli_obj_toggle_conj( bh_local ); bli_obj_alias_to( *a, ah_local ); bli_obj_induce_trans( ah_local ); bli_obj_toggle_conj( ah_local ); // Initialize a conjugated copy of alpha. bli_obj_scalar_init_detached_copy_of( bli_obj_datatype( *a ), BLIS_CONJUGATE, alpha, &alpha_conj ); // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. if ( ( bli_obj_is_row_stored( c_local ) && bli_func_prefers_contig_cols( bli_obj_datatype( c_local ), bli_gemm_cntl_ukrs( cntl ) ) ) || ( bli_obj_is_col_stored( c_local ) && bli_func_prefers_contig_rows( bli_obj_datatype( c_local ), bli_gemm_cntl_ukrs( cntl ) ) ) ) { bli_obj_swap( a_local, bh_local ); bli_obj_swap( b_local, ah_local ); bli_obj_induce_trans( a_local ); bli_obj_induce_trans( bh_local ); bli_obj_induce_trans( b_local ); bli_obj_induce_trans( ah_local ); bli_obj_induce_trans( c_local ); } #if 0 // Invoke the internal back-end. 
bli_her2k_int( alpha, &a_local, &bh_local, &alpha_conj, &b_local, &ah_local, beta, &c_local, cntl ); #else // Invoke herk twice, using beta only the first time. herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths(); dim_t n_threads = thread_num_threads( infos[0] ); // Invoke the internal back-end. bli_level3_thread_decorator( n_threads, (level3_int_t) bli_herk_int, alpha, &a_local, &bh_local, beta, &c_local, (void*) cntl, (void**) infos ); bli_level3_thread_decorator( n_threads, (level3_int_t) bli_herk_int, &alpha_conj, &b_local, &ah_local, &BLIS_ONE, &c_local, (void*) cntl, (void**) infos ); bli_herk_thrinfo_free_paths( infos, n_threads ); #endif // The Hermitian rank-2k product was computed as A*B'+B*A', even for // the diagonal elements. Mathematically, the imaginary components of // diagonal elements of a Hermitian rank-2k product should always be // zero. However, in practice, they sometimes accumulate meaningless // non-zero values. To prevent this, we explicitly set those values // to zero before returning. bli_setid( &BLIS_ZERO, &c_local ); }
void bli_trmm3_front( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, trmm_t* l_cntl, trmm_t* r_cntl ) { trmm_t* cntl; obj_t a_local; obj_t b_local; obj_t c_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_trmm3_check( side, alpha, a, b, beta, c ); // If alpha is zero, scale by beta and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); return; } // Alias A, B, and C so we can tweak the objects if necessary. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *c, c_local ); // We do not explicitly implement the cases where A is transposed. // However, we can still handle them. Specifically, if A is marked as // needing a transposition, we simply induce a transposition. This // allows us to only explicitly implement the no-transpose cases. Once // the transposition is induced, the correct algorithm will be called, // since, for example, an algorithm over a transposed lower triangular // matrix A moves in the same direction (forwards) as a non-transposed // upper triangular matrix. And with the transposition induced, the // matrix now appears to be upper triangular, so the upper triangular // algorithm will grab the correct partitions, as if it were upper // triangular (with no transpose) all along. if ( bli_obj_has_trans( a_local ) ) { bli_obj_induce_trans( a_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, a_local ); } #if 0 if ( bli_is_right( side ) ) { bli_obj_induce_trans( a_local ); bli_obj_induce_trans( b_local ); bli_obj_induce_trans( c_local ); bli_toggle_side( side ); } #endif #if 1 // If A is being multiplied from the right, swap A and B so that // the matrix will actually be on the right. if ( bli_is_right( side ) ) { bli_obj_swap( a_local, b_local ); } // An optimization: If C is row-stored, transpose the entire operation // so as to allow the macro-kernel more favorable access patterns // through C. 
(The effect of the transposition of A and B is negligible // because those operands are always packed to contiguous memory.) if ( bli_obj_is_row_stored( c_local ) ) { bli_obj_swap( a_local, b_local ); bli_obj_induce_trans( a_local ); bli_obj_induce_trans( b_local ); bli_obj_induce_trans( c_local ); bli_toggle_side( side ); } #endif // Set each alias as the root object. // NOTE: We MUST wait until we are done potentially swapping the objects // before setting the root fields! bli_obj_set_as_root( a_local ); bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); // Choose the control tree. if ( bli_is_left( side ) ) cntl = l_cntl; else cntl = r_cntl; trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( FALSE ); dim_t n_threads = thread_num_threads( infos[0] ); // Invoke the internal back-end. bli_level3_thread_decorator( n_threads, (level3_int_t) bli_trmm_int, alpha, &a_local, &b_local, beta, &c_local, (void*) cntl, (void**) infos ); bli_trmm_thrinfo_free_paths( infos, n_threads ); }
// Front-end for the symmetric rank-k update. The product is formed from A
// and its induced transpose and handed to bli_herk_int through the
// level-3 thread decorator.
//
//   alpha, beta - scalar objects.
//   a           - input matrix object.
//   c           - matrix object that is updated.
//   cntl        - control tree handed to the internal back-end; it is also
//                 queried for the micro-kernels' storage preference.
void bli_syrk_front( obj_t*  alpha,
                     obj_t*  a,
                     obj_t*  beta,
                     obj_t*  c,
                     gemm_t* cntl )
{
	obj_t a_local;
	obj_t at_local;
	obj_t c_local;

	// Validate the operands when error checking is on.
	if ( bli_error_checking_is_enabled() )
	{
		bli_syrk_check( alpha, a, beta, c );
	}

	// Quick return: when alpha is zero, just scale C by beta.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
	{
		bli_scalm( beta, c );
		return;
	}

	// Local alias of C (marked as a root object) that we are free to
	// transform.
	bli_obj_alias_to( *c, c_local );
	bli_obj_set_as_root( c_local );

	// Left operand: A. Right-hand "B" operand: A^T.
	bli_obj_alias_to( *a, a_local );
	bli_obj_alias_to( *a, at_local );
	bli_obj_induce_trans( at_local );

	// Optimization: C's storage clashes with the micro-kernel preference
	// when C is stored by rows but the micro-kernel prefers contiguous
	// columns, or C is stored by columns but the micro-kernel prefers
	// contiguous rows. In that case the operation is transposed, which
	// here only requires transposing C.
	const int c_storage_is_disliked =
	    ( bli_obj_is_row_stored( c_local ) &&
	      bli_func_prefers_contig_cols( bli_obj_datatype( c_local ),
	                                    bli_gemm_cntl_ukrs( cntl ) ) ) ||
	    ( bli_obj_is_col_stored( c_local ) &&
	      bli_func_prefers_contig_rows( bli_obj_datatype( c_local ),
	                                    bli_gemm_cntl_ukrs( cntl ) ) );

	if ( c_storage_is_disliked )
	{
		bli_obj_induce_trans( c_local );
	}

	herk_thrinfo_t** infos     = bli_create_herk_thrinfo_paths();
	dim_t            n_threads = thread_num_threads( infos[0] );

	// Run the internal back-end on all threads.
	bli_level3_thread_decorator( n_threads,
	                             (level3_int_t) bli_herk_int,
	                             alpha,
	                             &a_local,
	                             &at_local,
	                             beta,
	                             &c_local,
	                             (void*) cntl,
	                             (void**) infos );

	bli_herk_thrinfo_free_paths( infos, n_threads );
}
void bli_trmm3_front( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, gemm_t* cntl ) { obj_t a_local; obj_t b_local; obj_t c_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_trmm3_check( side, alpha, a, b, beta, c ); // If alpha is zero, scale by beta and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); return; } // Alias A, B, and C so we can tweak the objects if necessary. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *c, c_local ); // We do not explicitly implement the cases where A is transposed. // However, we can still handle them. Specifically, if A is marked as // needing a transposition, we simply induce a transposition. This // allows us to only explicitly implement the no-transpose cases. Once // the transposition is induced, the correct algorithm will be called, // since, for example, an algorithm over a transposed lower triangular // matrix A moves in the same direction (forwards) as a non-transposed // upper triangular matrix. And with the transposition induced, the // matrix now appears to be upper triangular, so the upper triangular // algorithm will grab the correct partitions, as if it were upper // triangular (with no transpose) all along. if ( bli_obj_has_trans( a_local ) ) { bli_obj_induce_trans( a_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, a_local ); } #if 0 // If A is being multiplied from the right, transpose all operands // so that we can perform the computation as if A were being multiplied // from the left. 
if ( bli_is_right( side ) ) { bli_toggle_side( side ); bli_obj_induce_trans( a_local ); bli_obj_induce_trans( b_local ); bli_obj_induce_trans( c_local ); } #else // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. if ( ( bli_obj_is_row_stored( c_local ) && bli_func_prefers_contig_cols( bli_obj_datatype( c_local ), bli_gemm_cntl_ukrs( cntl ) ) ) || ( bli_obj_is_col_stored( c_local ) && bli_func_prefers_contig_rows( bli_obj_datatype( c_local ), bli_gemm_cntl_ukrs( cntl ) ) ) ) { bli_toggle_side( side ); bli_obj_induce_trans( a_local ); bli_obj_induce_trans( b_local ); bli_obj_induce_trans( c_local ); } // If A is being multiplied from the right, swap A and B so that // the matrix will actually be on the right. if ( bli_is_right( side ) ) { bli_obj_swap( a_local, b_local ); } #endif // Set each alias as the root object. // NOTE: We MUST wait until we are done potentially swapping the objects // before setting the root fields! bli_obj_set_as_root( a_local ); bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); // Notice that, unlike trmm_r, there is no dependency in the jc loop // for trmm3_r, so we can pass in FALSE for jc_dependency. trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( FALSE ); dim_t n_threads = thread_num_threads( infos[0] ); // Invoke the internal back-end. bli_level3_thread_decorator( n_threads, (level3_int_t) bli_trmm_int, alpha, &a_local, &b_local, beta, &c_local, (void*) cntl, (void**) infos ); bli_trmm_thrinfo_free_paths( infos, n_threads ); }
void bli_herk_front( obj_t* alpha, obj_t* a, obj_t* beta, obj_t* c, cntx_t* cntx, gemm_t* cntl ) { obj_t a_local; obj_t ah_local; obj_t c_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_herk_check( alpha, a, beta, c, cntx ); // If alpha is zero, scale by beta, zero the imaginary components of // the diagonal elements, and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); bli_setid( &BLIS_ZERO, c ); return; } // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. bli_mem_reinit( cntx ); // Alias A and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *c, c_local ); bli_obj_set_as_root( c_local ); // For herk, the right-hand "B" operand is simply A'. bli_obj_alias_to( *a, ah_local ); bli_obj_induce_trans( ah_local ); bli_obj_toggle_conj( ah_local ); // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_toggle_conj( a_local ); bli_obj_toggle_conj( ah_local ); bli_obj_induce_trans( c_local ); } thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HERK, BLIS_LEFT ); dim_t n_threads = bli_thread_num_threads( infos[0] ); // Invoke the internal back-end. bli_l3_thread_decorator( n_threads, (l3_int_t) bli_herk_int, alpha, &a_local, &ah_local, beta, &c_local, (void*) cntx, (void*) cntl, (void**) infos ); bli_l3_thrinfo_free_paths( infos, n_threads ); // The Hermitian rank-k product was computed as A*A', even for the // diagonal elements. Mathematically, the imaginary components of // diagonal elements of a Hermitian rank-k product should always be // zero. 
However, in practice, they sometimes accumulate meaningless // non-zero values. To prevent this, we explicitly set those values // to zero before returning. bli_setid( &BLIS_ZERO, &c_local ); }
void bli_trsm_front( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, trsm_t* l_cntl, trsm_t* r_cntl ) { trsm_t* cntl; obj_t a_local; obj_t b_local; obj_t c_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_trsm_check( side, alpha, a, b, &BLIS_ZERO, b, cntx ); // If alpha is zero, scale by beta and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( alpha, b ); return; } // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. bli_mem_reinit( cntx ); // Alias A and B so we can tweak the objects if necessary. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *b, c_local ); // We do not explicitly implement the cases where A is transposed. // However, we can still handle them. Specifically, if A is marked as // needing a transposition, we simply induce a transposition. This // allows us to only explicitly implement the no-transpose cases. Once // the transposition is induced, the correct algorithm will be called, // since, for example, an algorithm over a transposed lower triangular // matrix A moves in the same direction (forwards) as a non-transposed // upper triangular matrix. And with the transposition induced, the // matrix now appears to be upper triangular, so the upper triangular // algorithm will grab the correct partitions, as if it were upper // triangular (with no transpose) all along. if ( bli_obj_has_trans( a_local ) ) { bli_obj_induce_trans( a_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, a_local ); } #if 0 // If A is being solved against from the right, transpose all operands // so that we can perform the computation as if A were being solved // from the left. 
if ( bli_is_right( side ) ) { bli_toggle_side( side ); bli_obj_induce_trans( a_local ); bli_obj_induce_trans( b_local ); bli_obj_induce_trans( c_local ); } #else // If A is being solved against from the right, swap A and B so that // the triangular matrix will actually be on the right. if ( bli_is_right( side ) ) { bli_obj_swap( a_local, b_local ); } #endif // Set each alias as the root object. // NOTE: We MUST wait until we are done potentially swapping the objects // before setting the root fields! bli_obj_set_as_root( a_local ); bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); // Choose the control tree. if ( bli_is_left( side ) ) cntl = l_cntl; else cntl = r_cntl; trsm_thrinfo_t** infos = bli_create_trsm_thrinfo_paths( bli_is_right( side ) ); dim_t n_threads = thread_num_threads( infos[0] ); // Invoke the internal back-end. bli_level3_thread_decorator( n_threads, (l3_int_t) bli_trsm_int, alpha, &a_local, &b_local, alpha, &c_local, (void*) cntx, (void*) cntl, (void**) infos ); bli_trsm_thrinfo_free_paths( infos, n_threads ); }
void bli_her2k_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ) { bli_init_once(); obj_t alpha_conj; obj_t c_local; obj_t a_local; obj_t bh_local; obj_t b_local; obj_t ah_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_her2k_check( alpha, a, b, beta, c, cntx ); // If alpha is zero, scale by beta, zero the imaginary components of // the diagonal elements, and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); bli_setid( &BLIS_ZERO, c ); return; } // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( a, &a_local ); bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); bli_obj_set_as_root( &c_local ); // For her2k, the first and second right-hand "B" operands are simply B' // and A'. bli_obj_alias_to( b, &bh_local ); bli_obj_induce_trans( &bh_local ); bli_obj_toggle_conj( &bh_local ); bli_obj_alias_to( a, &ah_local ); bli_obj_induce_trans( &ah_local ); bli_obj_toggle_conj( &ah_local ); // Initialize a conjugated copy of alpha. bli_obj_scalar_init_detached_copy_of( bli_obj_dt( a ), BLIS_CONJUGATE, alpha, &alpha_conj ); // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_swap( &a_local, &bh_local ); bli_obj_swap( &b_local, &ah_local ); bli_obj_induce_trans( &a_local ); bli_obj_induce_trans( &bh_local ); bli_obj_induce_trans( &b_local ); bli_obj_induce_trans( &ah_local ); bli_obj_induce_trans( &c_local ); } // Record the threading for each level within the context. 
bli_cntx_set_thrloop_from_env( BLIS_HER2K, BLIS_LEFT, cntx, bli_obj_length( &c_local ), bli_obj_width( &c_local ), bli_obj_width( &a_local ) ); // Invoke herk twice, using beta only the first time. // Invoke the internal back-end. bli_l3_thread_decorator ( bli_gemm_int, BLIS_HERK, // operation family id alpha, &a_local, &bh_local, beta, &c_local, cntx, cntl ); bli_l3_thread_decorator ( bli_gemm_int, BLIS_HERK, // operation family id &alpha_conj, &b_local, &ah_local, &BLIS_ONE, &c_local, cntx, cntl ); // The Hermitian rank-2k product was computed as A*B'+B*A', even for // the diagonal elements. Mathematically, the imaginary components of // diagonal elements of a Hermitian rank-2k product should always be // zero. However, in practice, they sometimes accumulate meaningless // non-zero values. To prevent this, we explicitly set those values // to zero before returning. bli_setid( &BLIS_ZERO, &c_local ); }
void bli_syr2k_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ) { bli_init_once(); obj_t c_local; obj_t a_local; obj_t bt_local; obj_t b_local; obj_t at_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_syr2k_check( alpha, a, b, beta, c, cntx ); // If alpha is zero, scale by beta and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); return; } // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( a, &a_local ); bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); bli_obj_set_as_root( &c_local ); // For syr2k, the first and second right-hand "B" operands are simply B' // and A'. bli_obj_alias_to( b, &bt_local ); bli_obj_induce_trans( &bt_local ); bli_obj_alias_to( a, &at_local ); bli_obj_induce_trans( &at_local ); // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_induce_trans( &c_local ); } // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_SYR2K, BLIS_LEFT, cntx, bli_obj_length( &c_local ), bli_obj_width( &c_local ), bli_obj_width( &a_local ) ); // Invoke herk twice, using beta only the first time. // Invoke the internal back-end. bli_l3_thread_decorator ( bli_gemm_int, BLIS_HERK, // operation family id alpha, &a_local, &bt_local, beta, &c_local, cntx, cntl ); bli_l3_thread_decorator ( bli_gemm_int, BLIS_HERK, // operation family id alpha, &b_local, &at_local, &BLIS_ONE, &c_local, cntx, cntl ); }