Пример #1
0
dim_t bli_gemm_determine_kc_f( dim_t    i,
                               dim_t    dim,
                               obj_t*   a,
                               obj_t*   b,
                               blksz_t* bsize )
{
    num_t dt;
    dim_t mnr;
    dim_t b_alg, b_max;
    dim_t b_use;

    // We assume that this function is being called from an algorithm that
    // is moving "forward" (ie: top to bottom, left to right, top-left
    // to bottom-right).

    // Extract the execution datatype and use it to query the corresponding
    // blocksize and blocksize maximum values from the blksz_t object.
    dt    = bli_obj_execution_datatype( *a );
    b_alg = bli_blksz_get_def( dt, bsize );
    b_max = bli_blksz_get_max( dt, bsize );

    // Nudge the default and maximum kc blocksizes up to the nearest
    // multiple of MR if A is Hermitian or symmetric, or NR if B is
    // Hermitian or symmetric. If neither case applies, then we leave
    // the blocksizes unchanged.
    if      ( bli_obj_root_is_herm_or_symm( *a ) )
    {
        mnr   = bli_blksz_get_mr( dt, bsize );
        b_alg = bli_align_dim_to_mult( b_alg, mnr );
        b_max = bli_align_dim_to_mult( b_max, mnr );
    }
    else if ( bli_obj_root_is_herm_or_symm( *b ) )
    {
        mnr   = bli_blksz_get_nr( dt, bsize );
        b_alg = bli_align_dim_to_mult( b_alg, mnr );
        b_max = bli_align_dim_to_mult( b_max, mnr );
    }

    b_use = bli_determine_blocksize_f_sub( i, dim, b_alg, b_max );

    return b_use;
}
Пример #2
0
dim_t bli_determine_blocksize_b( dim_t    i,
                                 dim_t    dim,
                                 obj_t*   obj,
                                 bszid_t  bszid,
                                 cntx_t*  cntx )
{
	num_t    dt;
	blksz_t* bsize;
	dim_t    b_alg, b_max;
	dim_t    b_use;

	// Extract the execution datatype and use it to query the corresponding
	// blocksize and blocksize maximum values from the blksz_t object.
	dt    = bli_obj_execution_datatype( *obj );
	bsize = bli_cntx_get_blksz( bszid, cntx );
	b_alg = bli_blksz_get_def( dt, bsize );
	b_max = bli_blksz_get_max( dt, bsize );

	b_use = bli_determine_blocksize_b_sub( i, dim, b_alg, b_max );

	return b_use;
}
Пример #3
0
void bli_trsm_rl_ker_var2( obj_t*  a,
                           obj_t*  b,
                           obj_t*  c,
                           trsm_t* cntl,
                           trsm_thrinfo_t* thread )
{
	num_t     dt_exec   = bli_obj_execution_datatype( *c );

	doff_t    diagoffb  = bli_obj_diag_offset( *b );

	pack_t    schema_a  = bli_obj_pack_schema( *a );
	pack_t    schema_b  = bli_obj_pack_schema( *b );

	dim_t     m         = bli_obj_length( *c );
	dim_t     n         = bli_obj_width( *c );
	dim_t     k         = bli_obj_width( *a );

	void*     buf_a     = bli_obj_buffer_at_off( *a );
	inc_t     cs_a      = bli_obj_col_stride( *a );
	dim_t     pd_a      = bli_obj_panel_dim( *a );
	inc_t     ps_a      = bli_obj_panel_stride( *a );

	void*     buf_b     = bli_obj_buffer_at_off( *b );
	inc_t     rs_b      = bli_obj_row_stride( *b );
	dim_t     pd_b      = bli_obj_panel_dim( *b );
	inc_t     ps_b      = bli_obj_panel_stride( *b );

	void*     buf_c     = bli_obj_buffer_at_off( *c );
	inc_t     rs_c      = bli_obj_row_stride( *c );
	inc_t     cs_c      = bli_obj_col_stride( *c );

	void*     buf_alpha1;
	void*     buf_alpha2;

	FUNCPTR_T f;

	func_t*   gemmtrsm_ukrs;
	func_t*   gemm_ukrs;
	void*     gemmtrsm_ukr;
	void*     gemm_ukr;


	// Grab the address of the internal scalar buffer for the scalar
	// attached to A. This will be the alpha scalar used in the gemmtrsm
	// subproblems (ie: the scalar that would be applied to the packed
	// copy of A prior to it being updated by the trsm subproblem). This
	// scalar may be unit, if for example it was applied during packing.
	buf_alpha1 = bli_obj_internal_scalar_buffer( *a );

	// Grab the address of the internal scalar buffer for the scalar
	// attached to C. This will be the "beta" scalar used in the gemm-only
	// subproblems that correspond to micro-panels that do not intersect
	// the diagonal. We need this separate scalar because it's possible
	// that the alpha attached to B was reset, if it was applied during
	// packing.
	buf_alpha2 = bli_obj_internal_scalar_buffer( *c );

	// Index into the type combination array to extract the correct
	// function pointer.
	f = ftypes[dt_exec];

	// Extract from the control tree node the func_t objects containing
	// the gemmtrsm and gemm micro-kernel function addresses, and then
	// query the function addresses corresponding to the current datatype.
	gemmtrsm_ukrs = cntl_gemmtrsm_u_ukrs( cntl );
	gemm_ukrs     = cntl_gemm_ukrs( cntl );
	gemmtrsm_ukr  = bli_func_obj_query( dt_exec, gemmtrsm_ukrs );
	gemm_ukr      = bli_func_obj_query( dt_exec, gemm_ukrs );

	// Invoke the function.
	f( diagoffb,
	   schema_a,
	   schema_b,
	   m,
	   n,
	   k,
	   buf_alpha1,
	   buf_a, cs_a, pd_a, ps_a,
	   buf_b, rs_b, pd_b, ps_b,
	   buf_alpha2,
	   buf_c, rs_c, cs_c,
	   gemmtrsm_ukr,
	   gemm_ukr,
	   thread );
}
Пример #4
0
void bli_trsm_ru_ker_var2( obj_t*  a,
                           obj_t*  b,
                           obj_t*  c,
                           trsm_t* cntl )
{
	num_t     dt_exec   = bli_obj_execution_datatype( *c );

	doff_t    diagoffb  = bli_obj_diag_offset( *b );

	dim_t     m         = bli_obj_length( *c );
	dim_t     n         = bli_obj_width( *c );
	dim_t     k         = bli_obj_width( *a );

	void*     buf_a     = bli_obj_buffer_at_off( *a );
	inc_t     cs_a      = bli_obj_col_stride( *a );
	inc_t     pd_a      = bli_obj_panel_dim( *a );
	inc_t     ps_a      = bli_obj_panel_stride( *a );

	void*     buf_b     = bli_obj_buffer_at_off( *b );
	inc_t     rs_b      = bli_obj_row_stride( *b );
	inc_t     pd_b      = bli_obj_panel_dim( *b );
	inc_t     ps_b      = bli_obj_panel_stride( *b );

	void*     buf_c     = bli_obj_buffer_at_off( *c );
	inc_t     rs_c      = bli_obj_row_stride( *c );
	inc_t     cs_c      = bli_obj_col_stride( *c );

	void*     buf_alpha1;
	void*     buf_alpha2;

	FUNCPTR_T f;

	func_t*   gemmtrsm_ukrs;
	func_t*   gemm_ukrs;
	void*     gemmtrsm_ukr;
	void*     gemm_ukr;


	// Grab the address of the internal scalar buffer for the scalar
	// attached to A. This will be the alpha scalar used in the gemmtrsm
	// subproblems (ie: the scalar that would be applied to the packed
	// copy of A prior to it being updated by the trsm subproblem). This
	// scalar may be unit, if for example it was applied during packing.
	buf_alpha1 = bli_obj_internal_scalar_buffer( *a );

	// Grab the address of the internal scalar buffer for the scalar
	// attached to C. This will be the "beta" scalar used in the gemm-only
	// subproblems that correspond to micro-panels that do not intersect
	// the diagonal. We need this separate scalar because it's possible
	// that the alpha attached to B was reset, if it was applied during
	// packing.
	buf_alpha2 = bli_obj_internal_scalar_buffer( *c );

	// Index into the type combination array to extract the correct
	// function pointer.
	f = ftypes[dt_exec];

	// Adjust cs_a and rs_b if A and B were packed for 4m or 3m. This
	// is needed because cs_a and rs_b are used to index into the
	// micro-panels of A and B, respectively, and since the pointer
	// types in the macro-kernel (scomplex or dcomplex) will result
	// in pointer arithmetic that moves twice as far as it should,
	// given the datatypes actually stored (float or double), we must
	// halve the strides to compensate.
	if ( bli_obj_is_panel_packed_4m( *a ) ||
	     bli_obj_is_panel_packed_3m( *a ) ) { cs_a /= 2; rs_b /= 2; }

	// Extract from the control tree node the func_t objects containing
	// the gemmtrsm and gemm micro-kernel function addresses, and then
	// query the function addresses corresponding to the current datatype.
	gemmtrsm_ukrs = cntl_gemmtrsm_l_ukrs( cntl );
	gemm_ukrs     = cntl_gemm_ukrs( cntl );
	gemmtrsm_ukr  = bli_func_obj_query( dt_exec, gemmtrsm_ukrs );
	gemm_ukr      = bli_func_obj_query( dt_exec, gemm_ukrs );

	// Invoke the function.
	f( diagoffb,
	   m,
	   n,
	   k,
	   buf_alpha1,
	   buf_a, cs_a, pd_a, ps_a,
	   buf_b, rs_b, pd_b, ps_b,
	   buf_alpha2,
	   buf_c, rs_c, cs_c,
	   gemmtrsm_ukr,
	   gemm_ukr );
}
Пример #5
0
void bli_herk_l_ker_var2( obj_t*  a,
                          obj_t*  b,
                          obj_t*  c,
                          gemm_t* cntl,
                          herk_thrinfo_t* thread )
{
	num_t     dt_exec   = bli_obj_execution_datatype( *c );

	doff_t    diagoffc  = bli_obj_diag_offset( *c );

	pack_t    schema_a  = bli_obj_pack_schema( *a );
	pack_t    schema_b  = bli_obj_pack_schema( *b );

	dim_t     m         = bli_obj_length( *c );
	dim_t     n         = bli_obj_width( *c );
	dim_t     k         = bli_obj_width( *a );

	void*     buf_a     = bli_obj_buffer_at_off( *a );
	inc_t     cs_a      = bli_obj_col_stride( *a );
	inc_t     pd_a      = bli_obj_panel_dim( *a );
	inc_t     ps_a      = bli_obj_panel_stride( *a );

	void*     buf_b     = bli_obj_buffer_at_off( *b );
	inc_t     rs_b      = bli_obj_row_stride( *b );
	inc_t     pd_b      = bli_obj_panel_dim( *b );
	inc_t     ps_b      = bli_obj_panel_stride( *b );

	void*     buf_c     = bli_obj_buffer_at_off( *c );
	inc_t     rs_c      = bli_obj_row_stride( *c );
	inc_t     cs_c      = bli_obj_col_stride( *c );

	obj_t     scalar_a;
	obj_t     scalar_b;

	void*     buf_alpha;
	void*     buf_beta;

	FUNCPTR_T f;

	func_t*   gemm_ukrs;
	void*     gemm_ukr;


	// Detach and multiply the scalars attached to A and B.
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	// Grab the addresses of the internal scalar buffers for the scalar
	// merged above and the scalar attached to C.
	buf_alpha = bli_obj_internal_scalar_buffer( scalar_b );
	buf_beta  = bli_obj_internal_scalar_buffer( *c );

	// Index into the type combination array to extract the correct
	// function pointer.
	f = ftypes[dt_exec];

	// Extract from the control tree node the func_t object containing
	// the gemm micro-kernel function addresses, and then query the
	// function address corresponding to the current datatype.
	gemm_ukrs = cntl_gemm_ukrs( cntl );
	gemm_ukr  = bli_func_obj_query( dt_exec, gemm_ukrs );

	// Invoke the function.
	f( diagoffc,
	   schema_a,
	   schema_b,
	   m,
	   n,
	   k,
	   buf_alpha,
	   buf_a, cs_a, pd_a, ps_a,
	   buf_b, rs_b, pd_b, ps_b,
	   buf_beta,
	   buf_c, rs_c, cs_c,
	   gemm_ukr,
	   thread );
}
Пример #6
0
void bli_trsm_rl_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	num_t     dt_exec   = bli_obj_execution_datatype( *c );

	doff_t    diagoffb  = bli_obj_diag_offset( *b );

	pack_t    schema_a  = bli_obj_pack_schema( *a );
	pack_t    schema_b  = bli_obj_pack_schema( *b );

	dim_t     m         = bli_obj_length( *c );
	dim_t     n         = bli_obj_width( *c );
	dim_t     k         = bli_obj_width( *a );

	void*     buf_a     = bli_obj_buffer_at_off( *a );
	inc_t     cs_a      = bli_obj_col_stride( *a );
	dim_t     pd_a      = bli_obj_panel_dim( *a );
	inc_t     ps_a      = bli_obj_panel_stride( *a );

	void*     buf_b     = bli_obj_buffer_at_off( *b );
	inc_t     rs_b      = bli_obj_row_stride( *b );
	dim_t     pd_b      = bli_obj_panel_dim( *b );
	inc_t     ps_b      = bli_obj_panel_stride( *b );

	void*     buf_c     = bli_obj_buffer_at_off( *c );
	inc_t     rs_c      = bli_obj_row_stride( *c );
	inc_t     cs_c      = bli_obj_col_stride( *c );

	void*     buf_alpha1;
	void*     buf_alpha2;

	FUNCPTR_T f;

	// Grab the address of the internal scalar buffer for the scalar
	// attached to A (the non-triangular matrix). This will be the alpha
	// scalar used in the gemmtrsm subproblems (ie: the scalar that would
	// be applied to the packed copy of A prior to it being updated by
	// the trsm subproblem). This scalar may be unit, if for example it
	// was applied during packing.
	buf_alpha1 = bli_obj_internal_scalar_buffer( *a );

	// Grab the address of the internal scalar buffer for the scalar
	// attached to C. This will be the "beta" scalar used in the gemm-only
	// subproblems that correspond to micro-panels that do not intersect
	// the diagonal. We need this separate scalar because it's possible
	// that the alpha attached to B was reset, if it was applied during
	// packing.
	buf_alpha2 = bli_obj_internal_scalar_buffer( *c );

	// Index into the type combination array to extract the correct
	// function pointer.
	f = ftypes[dt_exec];

	// Invoke the function.
	f( diagoffb,
	   schema_a,
	   schema_b,
	   m,
	   n,
	   k,
	   buf_alpha1,
	   buf_a, cs_a, pd_a, ps_a,
	   buf_b, rs_b, pd_b, ps_b,
	   buf_alpha2,
	   buf_c, rs_c, cs_c,
	   cntx,
	   thread );
}
Пример #7
0
void bli_trsm_blk_var1f( obj_t*  a,
                         obj_t*  b,
                         obj_t*  c,
                         trsm_t* cntl,
                         trsm_thrinfo_t* thread )
{
    obj_t b_pack_s;
    obj_t a1_pack_s;

	obj_t a1, c1;
	obj_t* b_pack = NULL;
	obj_t* a1_pack = NULL;

	dim_t i;
	dim_t b_alg;
	dim_t m_trans;
	dim_t offA;

    // Initialize object for packing B.
    if( thread_am_ochief( thread ) ) {
	    bli_obj_init_pack( &b_pack_s );
        bli_packm_init( b, &b_pack_s,
                        cntl_sub_packm_b( cntl ) );
    }
    b_pack = thread_obroadcast( thread, &b_pack_s );

    // Initialize object for packing B.
    if( thread_am_ichief( thread ) ) {
        bli_obj_init_pack( &a1_pack_s );
    }
    a1_pack = thread_ibroadcast( thread, &a1_pack_s );

	// Pack B1 (if instructed).
	bli_packm_int( b, b_pack,
	               cntl_sub_packm_b( cntl ),
                   trsm_thread_sub_opackm( thread ) );

	// Set the default length of and offset to the non-zero part of A.
	m_trans  = bli_obj_length_after_trans( *a );
	offA     = 0;

	// If A is lower triangular, we have to adjust where the non-zero part of
	// A begins.
	if ( bli_obj_is_lower( *a ) )
		offA = bli_abs( bli_obj_diag_offset_after_trans( *a ) );

    dim_t start, end;
    num_t dt = bli_obj_execution_datatype( *a );
    bli_get_range_t2b( thread, offA, m_trans,
                       //bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ), bli_info_get_default_mr( BLIS_TRSM, dt ) ),
                       bli_info_get_default_mc( BLIS_TRSM, dt ),
                       &start, &end );

	// Partition along the remaining portion of the m dimension.
	for ( i = start; i < end; i += b_alg )
	{
		// Determine the current algorithmic blocksize.
		b_alg = bli_determine_blocksize_f( i, end, a,
		                                   cntl_blocksize( cntl ) );

		// Acquire partitions for A1 and C1.
		bli_acquire_mpart_t2b( BLIS_SUBPART1,
		                       i, b_alg, a, &a1 );
		bli_acquire_mpart_t2b( BLIS_SUBPART1,
		                       i, b_alg, c, &c1 );

		// Initialize object for packing A1.
        if( thread_am_ichief( thread ) ) {
            bli_packm_init( &a1, a1_pack,
                            cntl_sub_packm_a( cntl ) );
        }
        thread_ibarrier( thread );

		// Pack A1 (if instructed).
		bli_packm_int( &a1, a1_pack,
		               cntl_sub_packm_a( cntl ),
                       trsm_thread_sub_ipackm( thread ) );

		// Perform trsm subproblem.
		bli_trsm_int( &BLIS_ONE,
		              a1_pack,
		              b_pack,
		              &BLIS_ONE,
		              &c1,
		              cntl_sub_trsm( cntl ),
                      trsm_thread_sub_trsm( thread ) );
        thread_ibarrier( thread );
	}

	// If any packing buffers were acquired within packm, release them back
	// to the memory manager.
    thread_obarrier( thread );
    if( thread_am_ochief( thread ) )
    	bli_packm_release( b_pack, cntl_sub_packm_b( cntl ) );
    if( thread_am_ichief( thread ) )
    	bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) );
}
Пример #8
0
void bli_trsm_blk_var1b( obj_t*  a,
                         obj_t*  b,
                         obj_t*  c,
                         trsm_t* cntl,
                         trsm_thrinfo_t* thread )
{
    obj_t b_pack_s;
    obj_t a1_pack_s;

	obj_t a1, c1;
	obj_t* b_pack = NULL;
	obj_t* a1_pack = NULL;

	dim_t i;
	dim_t b_alg;

	// Prune any zero region that exists along the partitioning dimension.
	bli_trsm_prune_unref_mparts_m( a, b, c );

    // Initialize object for packing B.
    if( thread_am_ochief( thread ) ) {
	    bli_obj_init_pack( &b_pack_s );
        bli_packm_init( b, &b_pack_s,
                        cntl_sub_packm_b( cntl ) );
    }
    b_pack = thread_obroadcast( thread, &b_pack_s );

    // Initialize object for packing B.
    if( thread_am_ichief( thread ) ) {
        bli_obj_init_pack( &a1_pack_s );
    }
    a1_pack = thread_ibroadcast( thread, &a1_pack_s );

	// Pack B1 (if instructed).
	bli_packm_int( b, b_pack,
	               cntl_sub_packm_b( cntl ),
                   trsm_thread_sub_opackm( thread ) );

    dim_t my_start, my_end;
    num_t dt = bli_obj_execution_datatype( *a );
	dim_t bf = ( bli_obj_root_is_triangular( *a ) ?
	             bli_info_get_default_mr( BLIS_TRSM, dt ) :
	             bli_info_get_default_nr( BLIS_TRSM, dt ) );
    bli_get_range_b2t( thread, a, bf,
                       &my_start, &my_end );

	// Partition along the remaining portion of the m dimension.
	for ( i = my_start; i < my_end; i += b_alg )
	{
		// Determine the current algorithmic blocksize.
		b_alg = bli_determine_blocksize_b( i, my_end, a,
		                                   cntl_blocksize( cntl ) );

		// Acquire partitions for A1 and C1.
		bli_acquire_mpart_b2t( BLIS_SUBPART1,
		                       i, b_alg, a, &a1 );
		bli_acquire_mpart_b2t( BLIS_SUBPART1,
		                       i, b_alg, c, &c1 );

		// Initialize object for packing A1.
        if( thread_am_ichief( thread ) ) {
            bli_packm_init( &a1, a1_pack,
                            cntl_sub_packm_a( cntl ) );
        }
        thread_ibarrier( thread );

		// Pack A1 (if instructed).
		bli_packm_int( &a1, a1_pack,
		               cntl_sub_packm_a( cntl ),
                       trsm_thread_sub_ipackm( thread ) );

		// Perform trsm subproblem.
		bli_trsm_int( &BLIS_ONE,
		              a1_pack,
		              b_pack,
		              &BLIS_ONE,
		              &c1,
		              cntl_sub_trsm( cntl ),
                      trsm_thread_sub_trsm( thread ) );
        thread_ibarrier( thread );
	}

	// If any packing buffers were acquired within packm, release them back
	// to the memory manager.
    thread_obarrier( thread );
    if( thread_am_ochief( thread ) )
    	bli_packm_release( b_pack, cntl_sub_packm_b( cntl ) );
    if( thread_am_ichief( thread ) )
    	bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) );
}
Пример #9
0
void bli_trsm_blk_var2b( obj_t*  a,
                         obj_t*  b,
                         obj_t*  c,
                         trsm_t* cntl,
                         trsm_thrinfo_t* thread )
{
    obj_t a_pack_s;
    obj_t b1_pack_s, c1_pack_s;

    obj_t b1, c1;
	obj_t* a_pack = NULL;
	obj_t* b1_pack = NULL;
	obj_t* c1_pack = NULL;

	dim_t i;
	dim_t b_alg;
	dim_t n_trans;

	// Initialize pack objects for A that are passed into packm_init().
    if( thread_am_ochief( thread ) ) {
	    bli_obj_init_pack( &a_pack_s );

        // Initialize object for packing A.
        bli_packm_init( a, &a_pack_s,
                        cntl_sub_packm_a( cntl ) );

        // Scale C by beta (if instructed).
        bli_scalm_int( &BLIS_ONE,
                       c,
                       cntl_sub_scalm( cntl ) );
    }
    a_pack = thread_obroadcast( thread, &a_pack_s );

	// Initialize pack objects for B and C that are passed into packm_init().
    if( thread_am_ichief( thread ) ) {
        bli_obj_init_pack( &b1_pack_s );
        bli_obj_init_pack( &c1_pack_s );
    }
    b1_pack = thread_ibroadcast( thread, &b1_pack_s );
    c1_pack = thread_ibroadcast( thread, &c1_pack_s );

	// Pack A (if instructed).
	bli_packm_int( a, a_pack,
	               cntl_sub_packm_a( cntl ),
                   trmm_thread_sub_opackm( thread ) );

	// Query dimension in partitioning direction.
	n_trans = bli_obj_width_after_trans( *b );
    dim_t start, end;
    num_t dt = bli_obj_execution_datatype( *a );
    bli_get_range_r2l( thread, 0, n_trans,
                       //bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ),
	                   //         bli_info_get_default_mr( BLIS_TRSM, dt ) ),
	                   bli_lcm( bli_blksz_get_nr( dt, cntl_blocksize( cntl ) ),
	                            bli_blksz_get_mr( dt, cntl_blocksize( cntl ) ) ),
                       &start, &end );

	// Partition along the n dimension.
	for ( i = start; i < end; i += b_alg )
	{
		// Determine the current algorithmic blocksize.
		b_alg = bli_determine_blocksize_b( i, end, b,
		                                   cntl_blocksize( cntl ) );

		// Acquire partitions for B1 and C1.
		bli_acquire_mpart_r2l( BLIS_SUBPART1,
		                       i, b_alg, b, &b1 );
		bli_acquire_mpart_r2l( BLIS_SUBPART1,
		                       i, b_alg, c, &c1 );

		// Initialize objects for packing A1 and B1.
        if( thread_am_ichief( thread ) ) {
            bli_packm_init( &b1, b1_pack,
                            cntl_sub_packm_b( cntl ) );
            bli_packm_init( &c1, c1_pack,
                            cntl_sub_packm_c( cntl ) );
        }
        thread_ibarrier( thread );

		// Pack B1 (if instructed).
		bli_packm_int( &b1, b1_pack,
		               cntl_sub_packm_b( cntl ),
                       trsm_thread_sub_ipackm( thread ) );

		// Pack C1 (if instructed).
		bli_packm_int( &c1, c1_pack,
		               cntl_sub_packm_c( cntl ),
                       trsm_thread_sub_ipackm( thread ) );

		// Perform trsm subproblem.
		bli_trsm_int( &BLIS_ONE,
		              a_pack,
		              b1_pack,
		              &BLIS_ONE,
		              c1_pack,
		              cntl_sub_trsm( cntl ),
                      trsm_thread_sub_trsm( thread ) );
        thread_ibarrier( thread );

		// Unpack C1 (if C1 was packed).
        bli_unpackm_int( c1_pack, &c1,
                         cntl_sub_unpackm_c( cntl ),
                         trsm_thread_sub_ipackm( thread ) );
	}

	// If any packing buffers were acquired within packm, release them back
	// to the memory manager.
    thread_obarrier( thread );
    if( thread_am_ochief( thread ) )
    	bli_packm_release( a_pack, cntl_sub_packm_a( cntl ) );
    if( thread_am_ichief( thread ) ) {
        bli_packm_release( b1_pack, cntl_sub_packm_b( cntl ) );
        bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) );
    }
}
Пример #10
0
void bli_trmm_ru_ker_var2( obj_t*  a,
                           obj_t*  b,
                           obj_t*  c,
                           cntx_t* cntx,
                           gemm_t* cntl,
                           thrinfo_t* thread )
{
	num_t     dt_exec   = bli_obj_execution_datatype( *c );

	doff_t    diagoffb  = bli_obj_diag_offset( *b );

	pack_t    schema_a  = bli_obj_pack_schema( *a );
	pack_t    schema_b  = bli_obj_pack_schema( *b );

	dim_t     m         = bli_obj_length( *c );
	dim_t     n         = bli_obj_width( *c );
	dim_t     k         = bli_obj_width( *a );

	void*     buf_a     = bli_obj_buffer_at_off( *a );
	inc_t     cs_a      = bli_obj_col_stride( *a );
	dim_t     pd_a      = bli_obj_panel_dim( *a );
	inc_t     ps_a      = bli_obj_panel_stride( *a );

	void*     buf_b     = bli_obj_buffer_at_off( *b );
	inc_t     rs_b      = bli_obj_row_stride( *b );
	dim_t     pd_b      = bli_obj_panel_dim( *b );
	inc_t     ps_b      = bli_obj_panel_stride( *b );

	void*     buf_c     = bli_obj_buffer_at_off( *c );
	inc_t     rs_c      = bli_obj_row_stride( *c );
	inc_t     cs_c      = bli_obj_col_stride( *c );

	obj_t     scalar_a;
	obj_t     scalar_b;

	void*     buf_alpha;
	void*     buf_beta;

	FUNCPTR_T f;

	// Detach and multiply the scalars attached to A and B.
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	// Grab the addresses of the internal scalar buffers for the scalar
	// merged above and the scalar attached to C.
	buf_alpha = bli_obj_internal_scalar_buffer( scalar_b );
	buf_beta  = bli_obj_internal_scalar_buffer( *c );

	// Index into the type combination array to extract the correct
	// function pointer.
	f = ftypes[dt_exec];

	// Invoke the function.
	f( diagoffb,
	   schema_a,
	   schema_b,
	   m,
	   n,
	   k,
	   buf_alpha,
	   buf_a, cs_a, pd_a, ps_a,
	   buf_b, rs_b, pd_b, ps_b,
	   buf_beta,
	   buf_c, rs_c, cs_c,
	   cntx,
	   thread );
}
Пример #11
0
void bli_trmm_ru_ker_var2( obj_t*  a,
                           obj_t*  b,
                           obj_t*  c,
                           trmm_t* cntl,
                           trmm_thrinfo_t* thread )
{
	num_t     dt_exec   = bli_obj_execution_datatype( *c );

	doff_t    diagoffb  = bli_obj_diag_offset( *b );

	dim_t     m         = bli_obj_length( *c );
	dim_t     n         = bli_obj_width( *c );
	dim_t     k         = bli_obj_width( *a );

	void*     buf_a     = bli_obj_buffer_at_off( *a );
	inc_t     cs_a      = bli_obj_col_stride( *a );
	inc_t     pd_a      = bli_obj_panel_dim( *a );
	inc_t     ps_a      = bli_obj_panel_stride( *a );

	void*     buf_b     = bli_obj_buffer_at_off( *b );
	inc_t     rs_b      = bli_obj_row_stride( *b );
	inc_t     pd_b      = bli_obj_panel_dim( *b );
	inc_t     ps_b      = bli_obj_panel_stride( *b );

	void*     buf_c     = bli_obj_buffer_at_off( *c );
	inc_t     rs_c      = bli_obj_row_stride( *c );
	inc_t     cs_c      = bli_obj_col_stride( *c );

	obj_t     scalar_a;
	obj_t     scalar_b;

	void*     buf_alpha;
	void*     buf_beta;

	FUNCPTR_T f;

	func_t*   gemm_ukrs;
	void*     gemm_ukr;


	// Detach and multiply the scalars attached to A and B.
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	// Grab the addresses of the internal scalar buffers for the scalar
	// merged above and the scalar attached to C.
	buf_alpha = bli_obj_internal_scalar_buffer( scalar_b );
	buf_beta  = bli_obj_internal_scalar_buffer( *c );

	// Index into the type combination array to extract the correct
	// function pointer.
	f = ftypes[dt_exec];

	// Adjust cs_a and rs_b if A and B were packed for 4m or 3m. This
	// is needed because cs_a and rs_b are used to index into the
	// micro-panels of A and B, respectively, and since the pointer
	// types in the macro-kernel (scomplex or dcomplex) will result
	// in pointer arithmetic that moves twice as far as it should,
	// given the datatypes actually stored (float or double), we must
	// halve the strides to compensate.
	if ( bli_obj_is_panel_packed_4m( *a ) ||
	     bli_obj_is_panel_packed_3m( *a ) ) { cs_a /= 2; rs_b /= 2; }

	// Extract from the control tree node the func_t object containing
	// the gemm micro-kernel function addresses, and then query the
	// function address corresponding to the current datatype.
	gemm_ukrs = cntl_gemm_ukrs( cntl );
	gemm_ukr  = bli_func_obj_query( dt_exec, gemm_ukrs );

	// Invoke the function.
	f( diagoffb,
	   m,
	   n,
	   k,
	   buf_alpha,
	   buf_a, cs_a, pd_a, ps_a,
	   buf_b, rs_b, pd_b, ps_b,
	   buf_beta,
	   buf_c, rs_c, cs_c,
	   gemm_ukr,
       thread );
}
Пример #12
0
void bli_gemm_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	num_t     dt_exec   = bli_obj_execution_datatype( *c );

	pack_t    schema_a  = bli_obj_pack_schema( *a );
	pack_t    schema_b  = bli_obj_pack_schema( *b );

	dim_t     m         = bli_obj_length( *c );
	dim_t     n         = bli_obj_width( *c );
	dim_t     k         = bli_obj_width( *a );

	void*     buf_a     = bli_obj_buffer_at_off( *a );
	inc_t     cs_a      = bli_obj_col_stride( *a );
	inc_t     is_a      = bli_obj_imag_stride( *a );
	dim_t     pd_a      = bli_obj_panel_dim( *a );
	inc_t     ps_a      = bli_obj_panel_stride( *a );

	void*     buf_b     = bli_obj_buffer_at_off( *b );
	inc_t     rs_b      = bli_obj_row_stride( *b );
	inc_t     is_b      = bli_obj_imag_stride( *b );
	dim_t     pd_b      = bli_obj_panel_dim( *b );
	inc_t     ps_b      = bli_obj_panel_stride( *b );

	void*     buf_c     = bli_obj_buffer_at_off( *c );
	inc_t     rs_c      = bli_obj_row_stride( *c );
	inc_t     cs_c      = bli_obj_col_stride( *c );

	obj_t     scalar_a;
	obj_t     scalar_b;

	void*     buf_alpha;
	void*     buf_beta;

	FUNCPTR_T f;

	// Detach and multiply the scalars attached to A and B.
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	// Grab the addresses of the internal scalar buffers for the scalar
	// merged above and the scalar attached to C.
	buf_alpha = bli_obj_internal_scalar_buffer( scalar_b );
	buf_beta  = bli_obj_internal_scalar_buffer( *c );

    // If 1m is being employed on a column- or row-stored matrix with a
    // real-valued beta, we can use the real domain macro-kernel, which
	// eliminates a little overhead associated with the 1m virtual
	// micro-kernel.
#if 1
	if ( bli_is_1m_packed( schema_a ) )
	{
		bli_l3_ind_recast_1m_params
		(
		  dt_exec,
		  schema_a,
		  c,
		  m, n, k,
		  pd_a, ps_a,
		  pd_b, ps_b,
		  rs_c, cs_c
		);
	}
#endif

	// Index into the type combination array to extract the correct
	// function pointer.
	f = ftypes[dt_exec];

	// Invoke the function.
	f( schema_a,
	   schema_b,
	   m,
	   n,
	   k,
	   buf_alpha,
	   buf_a, cs_a, is_a,
	          pd_a, ps_a,
	   buf_b, rs_b, is_b,
	          pd_b, ps_b,
	   buf_beta,
	   buf_c, rs_c, cs_c,
	   cntx,
	   thread );
}