Example #1
0
void blx_gemm_blk_var3
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	obj_t a1, b1;
	dim_t i;
	dim_t b_alg;
	dim_t k_trans;

	// Query dimension in partitioning direction.
	k_trans = bli_obj_width_after_trans( a );

	// Partition along the k dimension.
	for ( i = 0; i < k_trans; i += b_alg )
	{
		// Determine the current algorithmic blocksize.
		b_alg = blx_determine_blocksize_f( i, k_trans, c,
		                                   bli_cntl_bszid( cntl ), cntx );

		// Acquire partitions for A1 and B1.
		bli_acquire_mpart_ndim( BLIS_FWD, BLIS_SUBPART1, i, b_alg, a, &a1 );
		bli_acquire_mpart_mdim( BLIS_FWD, BLIS_SUBPART1, i, b_alg, b, &b1 );

		// Perform gemm subproblem.
		blx_gemm_int
		(
		  &a1, &b1, c, cntx, rntm,
		  bli_cntl_sub_node( cntl ),
		  bli_thrinfo_sub_node( thread )
		);

		bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );

		// This variant executes multiple rank-k updates. Therefore, if the
		// internal beta scalar on matrix C is non-zero, we must use it
		// only for the first iteration (and then BLIS_ONE for all others).
		// And since c is a locally aliased obj_t, we can simply overwrite
		// the internal beta scalar with BLIS_ONE once it has been used in
		// the first iteration. 
		if ( i == 0 ) bli_obj_scalar_reset( c );
	}
}
Example #2
0
void bli_gemm_packb
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	obj_t b_pack;

	// Pack matrix B according to the control tree node.
	bli_l3_packm
	(
	  b,
	  &b_pack,
	  cntx,
	  cntl,
	  thread
	);

	// Proceed with execution using packed matrix B.
	bli_gemm_int
	(
	  &BLIS_ONE,
	  a,
	  &b_pack,
	  &BLIS_ONE,
	  c,
	  cntx,
	  bli_cntl_sub_node( cntl ),
	  bli_thrinfo_sub_node( thread )
	);
}
Example #3
0
void bli_l3_thrinfo_free
     (
       thrinfo_t* thread
     )
{
	if ( thread == NULL ||
	     thread == &BLIS_PACKM_SINGLE_THREADED ||
	     thread == &BLIS_GEMM_SINGLE_THREADED
	   ) return;

	thrinfo_t* thrinfo_sub_node = bli_thrinfo_sub_node( thread );

	// Free the communicators, but only if the current thrinfo_t struct
	// is marked as needing them to be freed. The most common example of
	// thrinfo_t nodes NOT marked as needing their comms freed are those
	// associated with packm thrinfo_t nodes.
	if ( bli_thrinfo_needs_free_comm( thread ) )
	{
		// The ochief always frees his communicator, and the ichief free its
		// communicator if we are at the leaf node.
		if ( bli_thread_am_ochief( thread ) )
			bli_thrcomm_free( bli_thrinfo_ocomm( thread ) );
	}

	// Free all children of the current thrinfo_t.
	bli_l3_thrinfo_free( thrinfo_sub_node );

	// Free the thrinfo_t struct.
	bli_free_intl( thread );
}
Example #4
0
void bli_gemm_blk_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	obj_t b1, c1;

	dir_t direct;

	dim_t i;
	dim_t b_alg;
	dim_t my_start, my_end;

	// Determine the direction in which to partition (forwards or backwards).
	direct = bli_l3_direct( a, b, c, cntl );

	// Prune any zero region that exists along the partitioning dimension.
	bli_l3_prune_unref_mparts_n( a, b, c, cntl );

	// Determine the current thread's subpartition range.
	bli_thread_range_ndim
	(
	  direct, thread, a, b, c, cntl, cntx,
	  &my_start, &my_end
	);

	// Partition along the n dimension.
	for ( i = my_start; i < my_end; i += b_alg )
	{
		// Determine the current algorithmic blocksize.
		b_alg = bli_determine_blocksize( direct, i, my_end, b,
		                                 bli_cntl_bszid( cntl ), cntx );

		// Acquire partitions for B1 and C1.
		bli_acquire_mpart_ndim( direct, BLIS_SUBPART1,
		                        i, b_alg, b, &b1 );
		bli_acquire_mpart_ndim( direct, BLIS_SUBPART1,
		                        i, b_alg, c, &c1 );

		// Perform gemm subproblem.
		bli_gemm_int
		(
		  &BLIS_ONE,
		  a,
		  &b1,
		  &BLIS_ONE,
		  &c1,
		  cntx,
		  rntm,
		  bli_cntl_sub_node( cntl ),
		  bli_thrinfo_sub_node( thread )
		);
	}
}
Example #5
0
void bli_l3_thrinfo_print_paths
     (
       thrinfo_t** threads
     )
{
	dim_t n_threads = bli_thread_num_threads( threads[0] );
	dim_t gl_comm_id;

	thrinfo_t* jc_info  = threads[0];
	thrinfo_t* pc_info  = bli_thrinfo_sub_node( jc_info );
	thrinfo_t* pb_info  = bli_thrinfo_sub_node( pc_info );
	thrinfo_t* ic_info  = bli_thrinfo_sub_node( pb_info );
	thrinfo_t* pa_info  = bli_thrinfo_sub_node( ic_info );
	thrinfo_t* jr_info  = bli_thrinfo_sub_node( pa_info );
	thrinfo_t* ir_info  = bli_thrinfo_sub_node( jr_info );

	dim_t jc_way = bli_thread_n_way( jc_info );
	dim_t pc_way = bli_thread_n_way( pc_info );
	dim_t pb_way = bli_thread_n_way( pb_info );
	dim_t ic_way = bli_thread_n_way( ic_info );
	dim_t pa_way = bli_thread_n_way( pa_info );
	dim_t jr_way = bli_thread_n_way( jr_info );
	dim_t ir_way = bli_thread_n_way( ir_info );

	dim_t gl_nt = bli_thread_num_threads( jc_info );
	dim_t jc_nt = bli_thread_num_threads( pc_info );
	dim_t pc_nt = bli_thread_num_threads( pb_info );
	dim_t pb_nt = bli_thread_num_threads( ic_info );
	dim_t ic_nt = bli_thread_num_threads( pa_info );
	dim_t pa_nt = bli_thread_num_threads( jr_info );
	dim_t jr_nt = bli_thread_num_threads( ir_info );

	printf( "            gl   jc   kc   pb   ic   pa   jr   ir\n" );
	printf( "xx_nt:    %4lu %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
	( unsigned long )gl_nt,
	( unsigned long )jc_nt,
	( unsigned long )pc_nt,
	( unsigned long )pb_nt,
	( unsigned long )ic_nt,
	( unsigned long )pa_nt,
	( unsigned long )jr_nt,
	( unsigned long )1 );
	printf( "\n" );
	printf( "            jc   kc   pb   ic   pa   jr   ir\n" );
	printf( "xx_way:   %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
    ( unsigned long )jc_way,
	( unsigned long )pc_way,
	( unsigned long )pb_way,
	( unsigned long )ic_way,
	( unsigned long )pa_way,
	( unsigned long )jr_way,
	( unsigned long )ir_way );
	printf( "=================================================\n" );

	for ( gl_comm_id = 0; gl_comm_id < n_threads; ++gl_comm_id )
	{
		jc_info = threads[gl_comm_id];
		pc_info = bli_thrinfo_sub_node( jc_info );
		pb_info = bli_thrinfo_sub_node( pc_info );
		ic_info = bli_thrinfo_sub_node( pb_info );
		pa_info = bli_thrinfo_sub_node( ic_info );
		jr_info = bli_thrinfo_sub_node( pa_info );
		ir_info = bli_thrinfo_sub_node( jr_info );

		dim_t gl_comm_id = bli_thread_ocomm_id( jc_info );
		dim_t jc_comm_id = bli_thread_ocomm_id( pc_info );
		dim_t pc_comm_id = bli_thread_ocomm_id( pb_info );
		dim_t pb_comm_id = bli_thread_ocomm_id( ic_info );
		dim_t ic_comm_id = bli_thread_ocomm_id( pa_info );
		dim_t pa_comm_id = bli_thread_ocomm_id( jr_info );
		dim_t jr_comm_id = bli_thread_ocomm_id( ir_info );

		dim_t jc_work_id = bli_thread_work_id( jc_info );
		dim_t pc_work_id = bli_thread_work_id( pc_info );
		dim_t pb_work_id = bli_thread_work_id( pb_info );
		dim_t ic_work_id = bli_thread_work_id( ic_info );
		dim_t pa_work_id = bli_thread_work_id( pa_info );
		dim_t jr_work_id = bli_thread_work_id( jr_info );
		dim_t ir_work_id = bli_thread_work_id( ir_info );

		printf( "            gl   jc   pb   kc   pa   ic   jr  \n" );
		printf( "comm ids: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n",
		( unsigned long )gl_comm_id,
		( unsigned long )jc_comm_id,
		( unsigned long )pc_comm_id,
		( unsigned long )pb_comm_id,
		( unsigned long )ic_comm_id,
		( unsigned long )pa_comm_id,
		( unsigned long )jr_comm_id );
		printf( "work ids: %4ld %4ld %4lu %4lu %4ld %4ld %4ld\n",
		( unsigned long )jc_work_id,
		( unsigned long )pc_work_id,
		( unsigned long )pb_work_id,
		( unsigned long )ic_work_id,
		( unsigned long )pa_work_id,
		( unsigned long )jr_work_id,
		( unsigned long )ir_work_id );
		printf( "---------------------------------------\n" );
	}

}
Example #6
0
void bli_trsm_blk_var3
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	obj_t a1, b1;

	dir_t direct;

	dim_t i;
	dim_t b_alg;
	dim_t k_trans;

	// Determine the direction in which to partition (forwards or backwards).
	direct = bli_l3_direct( a, b, c, cntl );

	// Prune any zero region that exists along the partitioning dimension.
	bli_l3_prune_unref_mparts_k( a, b, c, cntl );

	// Query dimension in partitioning direction.
	k_trans = bli_obj_width_after_trans( *a );

	// Partition along the k dimension.
	for ( i = 0; i < k_trans; i += b_alg )
	{
		// Determine the current algorithmic blocksize.
		b_alg = bli_trsm_determine_kc( direct, i, k_trans, a, b,
		                               bli_cntl_bszid( cntl ), cntx );

		// Acquire partitions for A1 and B1.
		bli_acquire_mpart_ndim( direct, BLIS_SUBPART1,
		                        i, b_alg, a, &a1 );
		bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
		                        i, b_alg, b, &b1 );

		// Perform trsm subproblem.
		bli_trsm_int
		(
		  &BLIS_ONE,
		  &a1,
		  &b1,
		  &BLIS_ONE,
		  c,
		  cntx,
		  bli_cntl_sub_node( cntl ),
		  bli_thrinfo_sub_node( thread )
		);

		//bli_thread_ibarrier( thread );
		bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );

		// This variant executes multiple rank-k updates. Therefore, if the
		// internal alpha scalars on A/B and C are non-zero, we must ensure
		// that they are only used in the first iteration.
		if ( i == 0 )
		{
			bli_obj_scalar_reset( a ); bli_obj_scalar_reset( b );
			bli_obj_scalar_reset( c );
		}
	}
}