// Free every per-thread path stored in `threads`, then the array itself.
//
// threads: array of thrinfo_t pointers. The number of entries is obtained
//          by calling bli_thread_num_threads() on threads[0].
void bli_l3_thrinfo_free_paths
     (
       thrinfo_t** threads
     )
{
	// Query the count before the loop: it is read through threads[0], and
	// threads[0] is handed to bli_l3_thrinfo_free() by the first iteration.
	const dim_t path_count = bli_thread_num_threads( threads[0] );
	dim_t       idx        = 0;

	while ( idx < path_count )
	{
		bli_l3_thrinfo_free( threads[idx] );
		idx += 1;
	}

	// Lastly, hand the pointer array itself to bli_free_intl().
	bli_free_intl( threads );
}
// Print a debugging summary of the thrinfo_t paths held in `threads`.
//
// threads: array of thrinfo_t pointers, one per thread. The number of
//          entries is obtained from bli_thread_num_threads( threads[0] ).
//
// Each entry is treated as the head (jc) of a chain of seven nodes reached
// by repeated bli_thrinfo_sub_node() calls: jc -> pc -> pb -> ic -> pa ->
// jr -> ir. (The pc node is labeled "kc" in the printed headers.)
//
// Output, written to stdout:
//   1. One "xx_nt" row and one "xx_way" row taken from threads[0] only.
//   2. For every thread, a "comm ids" row and a "work ids" row.
void bli_l3_thrinfo_print_paths
     (
       thrinfo_t** threads
     )
{
	dim_t n_threads = bli_thread_num_threads( threads[0] );
	dim_t gl_id;

	// Walk the chain of the first thread to report the shared shape.
	thrinfo_t* jc_info = threads[0];
	thrinfo_t* pc_info = bli_thrinfo_sub_node( jc_info );
	thrinfo_t* pb_info = bli_thrinfo_sub_node( pc_info );
	thrinfo_t* ic_info = bli_thrinfo_sub_node( pb_info );
	thrinfo_t* pa_info = bli_thrinfo_sub_node( ic_info );
	thrinfo_t* jr_info = bli_thrinfo_sub_node( pa_info );
	thrinfo_t* ir_info = bli_thrinfo_sub_node( jr_info );

	dim_t jc_way = bli_thread_n_way( jc_info );
	dim_t pc_way = bli_thread_n_way( pc_info );
	dim_t pb_way = bli_thread_n_way( pb_info );
	dim_t ic_way = bli_thread_n_way( ic_info );
	dim_t pa_way = bli_thread_n_way( pa_info );
	dim_t jr_way = bli_thread_n_way( jr_info );
	dim_t ir_way = bli_thread_n_way( ir_info );

	// Each *_nt value is read from the node one level below its label:
	// the "gl" column comes from the jc node, "jc" from the pc node, and
	// so on. The final "ir" column is printed as a literal 1.
	dim_t gl_nt = bli_thread_num_threads( jc_info );
	dim_t jc_nt = bli_thread_num_threads( pc_info );
	dim_t pc_nt = bli_thread_num_threads( pb_info );
	dim_t pb_nt = bli_thread_num_threads( ic_info );
	dim_t ic_nt = bli_thread_num_threads( pa_info );
	dim_t pa_nt = bli_thread_num_threads( jr_info );
	dim_t jr_nt = bli_thread_num_threads( ir_info );

	printf( " gl jc kc pb ic pa jr ir\n" );
	printf( "xx_nt: %4lu %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
	        ( unsigned long )gl_nt,
	        ( unsigned long )jc_nt,
	        ( unsigned long )pc_nt,
	        ( unsigned long )pb_nt,
	        ( unsigned long )ic_nt,
	        ( unsigned long )pa_nt,
	        ( unsigned long )jr_nt,
	        ( unsigned long )1 );
	printf( "\n" );
	printf( " jc kc pb ic pa jr ir\n" );
	printf( "xx_way: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
	        ( unsigned long )jc_way,
	        ( unsigned long )pc_way,
	        ( unsigned long )pb_way,
	        ( unsigned long )ic_way,
	        ( unsigned long )pa_way,
	        ( unsigned long )jr_way,
	        ( unsigned long )ir_way );
	printf( "=================================================\n" );

	// The loop counter has its own name (gl_id) so that it is not shadowed
	// by the per-thread gl_comm_id value declared inside the loop body.
	for ( gl_id = 0; gl_id < n_threads; ++gl_id )
	{
		jc_info = threads[gl_id];
		pc_info = bli_thrinfo_sub_node( jc_info );
		pb_info = bli_thrinfo_sub_node( pc_info );
		ic_info = bli_thrinfo_sub_node( pb_info );
		pa_info = bli_thrinfo_sub_node( ic_info );
		jr_info = bli_thrinfo_sub_node( pa_info );
		ir_info = bli_thrinfo_sub_node( jr_info );

		// As with the *_nt values above, each comm id is read from the
		// node one level below its label.
		dim_t gl_comm_id = bli_thread_ocomm_id( jc_info );
		dim_t jc_comm_id = bli_thread_ocomm_id( pc_info );
		dim_t pc_comm_id = bli_thread_ocomm_id( pb_info );
		dim_t pb_comm_id = bli_thread_ocomm_id( ic_info );
		dim_t ic_comm_id = bli_thread_ocomm_id( pa_info );
		dim_t pa_comm_id = bli_thread_ocomm_id( jr_info );
		dim_t jr_comm_id = bli_thread_ocomm_id( ir_info );

		dim_t jc_work_id = bli_thread_work_id( jc_info );
		dim_t pc_work_id = bli_thread_work_id( pc_info );
		dim_t pb_work_id = bli_thread_work_id( pb_info );
		dim_t ic_work_id = bli_thread_work_id( ic_info );
		dim_t pa_work_id = bli_thread_work_id( pa_info );
		dim_t jr_work_id = bli_thread_work_id( jr_info );
		dim_t ir_work_id = bli_thread_work_id( ir_info );

		// Header labels follow the order of the comm ids printed below
		// (gl, jc, kc, pb, ic, pa, jr). Note that the work ids row holds
		// the jc..ir values, i.e. it starts one level further down.
		printf( " gl jc kc pb ic pa jr \n" );
		printf( "comm ids: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
		        ( unsigned long )gl_comm_id,
		        ( unsigned long )jc_comm_id,
		        ( unsigned long )pc_comm_id,
		        ( unsigned long )pb_comm_id,
		        ( unsigned long )ic_comm_id,
		        ( unsigned long )pa_comm_id,
		        ( unsigned long )jr_comm_id );
		// Every argument is cast to unsigned long, so every conversion
		// specifier must be %lu.
		printf( "work ids: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
		        ( unsigned long )jc_work_id,
		        ( unsigned long )pc_work_id,
		        ( unsigned long )pb_work_id,
		        ( unsigned long )ic_work_id,
		        ( unsigned long )pa_work_id,
		        ( unsigned long )jr_work_id,
		        ( unsigned long )ir_work_id );
		printf( "---------------------------------------\n" );
	}
}
void bli_herk_front( obj_t* alpha, obj_t* a, obj_t* beta, obj_t* c, cntx_t* cntx, gemm_t* cntl ) { obj_t a_local; obj_t ah_local; obj_t c_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_herk_check( alpha, a, beta, c, cntx ); // If alpha is zero, scale by beta, zero the imaginary components of // the diagonal elements, and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); bli_setid( &BLIS_ZERO, c ); return; } // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. bli_mem_reinit( cntx ); // Alias A and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *c, c_local ); bli_obj_set_as_root( c_local ); // For herk, the right-hand "B" operand is simply A'. bli_obj_alias_to( *a, ah_local ); bli_obj_induce_trans( ah_local ); bli_obj_toggle_conj( ah_local ); // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_toggle_conj( a_local ); bli_obj_toggle_conj( ah_local ); bli_obj_induce_trans( c_local ); } thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HERK, BLIS_LEFT ); dim_t n_threads = bli_thread_num_threads( infos[0] ); // Invoke the internal back-end. bli_l3_thread_decorator( n_threads, (l3_int_t) bli_herk_int, alpha, &a_local, &ah_local, beta, &c_local, (void*) cntx, (void*) cntl, (void**) infos ); bli_l3_thrinfo_free_paths( infos, n_threads ); // The Hermitian rank-k product was computed as A*A', even for the // diagonal elements. Mathematically, the imaginary components of // diagonal elements of a Hermitian rank-k product should always be // zero. 
However, in practice, they sometimes accumulate meaningless // non-zero values. To prevent this, we explicitly set those values // to zero before returning. bli_setid( &BLIS_ZERO, &c_local ); }