Example #1
void bli_l3_thrinfo_free
     (
       thrinfo_t* thread
     )
{
	if ( thread == NULL ||
	     thread == &BLIS_PACKM_SINGLE_THREADED ||
	     thread == &BLIS_GEMM_SINGLE_THREADED
	   ) return;

	thrinfo_t* thrinfo_sub_node = bli_thrinfo_sub_node( thread );

	// Free the communicators, but only if the current thrinfo_t struct
	// is marked as needing them to be freed. The most common examples of
	// thrinfo_t nodes NOT marked as needing their comms freed are those
	// associated with packm thrinfo_t nodes.
	if ( bli_thrinfo_needs_free_comm( thread ) )
	{
		// The ochief always frees its communicator, and the ichief frees its
		// communicator if we are at the leaf node.
		if ( bli_thread_am_ochief( thread ) )
			bli_thrcomm_free( bli_thrinfo_ocomm( thread ) );
	}

	// Free all children of the current thrinfo_t.
	bli_l3_thrinfo_free( thrinfo_sub_node );

	// Free the thrinfo_t struct.
	bli_free_intl( thread );
}
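
The recursion above has a simple shape: guard against NULL and the shared single-threaded sentinels, free the communicator only when the node is marked as owning it (and only from the chief thread, so it is freed exactly once), recurse into the child, then free the node itself. The standalone sketch below mirrors that shape with a hypothetical tnode_t type and plain malloc/free; it is an illustration of the pattern, not BLIS code.

#include <stdlib.h>
#include <stdbool.h>

typedef struct tnode_s
{
	void*           comm;            // Per-node resource (analogue of the thrcomm_t).
	bool            needs_free_comm; // Does this node own its comm?
	bool            am_chief;        // Only the chief thread frees shared resources.
	struct tnode_s* sub_node;        // Child node in the thread-info tree.
} tnode_t;

// Sentinel shared by all single-threaded code paths; it must never be freed.
static tnode_t tnode_single_threaded;

void tnode_free( tnode_t* node )
{
	if ( node == NULL || node == &tnode_single_threaded ) return;

	tnode_t* sub_node = node->sub_node;

	// Free the communicator only if this node is marked as owning it, and
	// only from the chief thread so that it is freed exactly once.
	if ( node->needs_free_comm && node->am_chief )
		free( node->comm );

	// Free the child subtree, then the node itself.
	tnode_free( sub_node );
	free( node );
}

int main( void )
{
	// Build a two-level chain: a chief-owned root with one child, then free it.
	tnode_t* child = calloc( 1, sizeof( tnode_t ) );
	tnode_t* root  = calloc( 1, sizeof( tnode_t ) );

	root->comm            = malloc( 64 );
	root->needs_free_comm = true;
	root->am_chief        = true;
	root->sub_node        = child;

	tnode_free( root );
	return 0;
}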
Example #2
void bli_trmm_blk_var2b( obj_t*  a,
                         obj_t*  b,
                         obj_t*  c,
                         cntx_t* cntx,
                         gemm_t* cntl,
                         thrinfo_t* thread )
{
    obj_t a_pack_s;
    obj_t b1_pack_s, c1_pack_s;
    
    obj_t b1, c1; 
    obj_t*  a_pack = NULL;
    obj_t*  b1_pack = NULL;
    obj_t*  c1_pack = NULL;

	dim_t i;
	dim_t b_alg;

	// Prune any zero region that exists along the partitioning dimension.
	bli_trmm_prune_unref_mparts_n( a, b, c );

    if( bli_thread_am_ochief( thread ) ) { 
        // Initialize object for packing A
        bli_obj_init_pack( &a_pack_s );
        bli_packm_init( a, &a_pack_s,
                        cntx, bli_cntl_sub_packm_a( cntl ) );

        // Scale C by beta (if instructed).
        bli_scalm_int( &BLIS_ONE,
                       c,  
                       cntx, bli_cntl_sub_scalm( cntl ) );
    }   
    a_pack = bli_thread_obroadcast( thread, &a_pack_s );

    // Initialize pack objects for B and C that are passed into packm_init().
    if( bli_thread_am_ichief( thread ) ) { 
        bli_obj_init_pack( &b1_pack_s );
        bli_obj_init_pack( &c1_pack_s );
    }   
    b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s );
    c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s );

	// Pack A (if instructed).
	bli_packm_int( a, a_pack,
	               cntx, bli_cntl_sub_packm_a( cntl ),
                   bli_thrinfo_sub_opackm( thread ) );

    dim_t my_start, my_end;
    bli_thread_get_range_weighted_r2l( thread, b,
                                bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ),
                                &my_start, &my_end );

	// Partition along the n dimension.
	for ( i = my_start; i < my_end; i += b_alg )
	{
		// Determine the current algorithmic blocksize.
		b_alg = bli_determine_blocksize_b( i, my_end, b,
		                                   bli_cntl_bszid( cntl ), cntx );

		// Acquire partitions for B1 and C1.
		bli_acquire_mpart_r2l( BLIS_SUBPART1,
		                       i, b_alg, b, &b1 );
		bli_acquire_mpart_r2l( BLIS_SUBPART1,
		                       i, b_alg, c, &c1 );

		// Initialize objects for packing B1 and C1.
        if( bli_thread_am_ichief( thread ) ) {
            bli_packm_init( &b1, b1_pack,
                            cntx, bli_cntl_sub_packm_b( cntl ) );
            bli_packm_init( &c1, c1_pack,
                            cntx, bli_cntl_sub_packm_c( cntl ) );
        }
        bli_thread_ibarrier( thread );

		// Pack B1 (if instructed).
		bli_packm_int( &b1, b1_pack,
		               cntx, bli_cntl_sub_packm_b( cntl ),
                       bli_thrinfo_sub_ipackm( thread ) );

		// Pack C1 (if instructed).
		bli_packm_int( &c1, c1_pack,
		               cntx, bli_cntl_sub_packm_c( cntl ),
                       bli_thrinfo_sub_ipackm( thread ) );

		// Perform trmm subproblem.
		bli_trmm_int( &BLIS_ONE,
		              a_pack,
		              b1_pack,
		              &BLIS_ONE,
		              c1_pack,
		              cntx,
		              bli_cntl_sub_gemm( cntl ),
                      bli_thrinfo_sub_self( thread ) );
        bli_thread_ibarrier( thread );

        // Unpack C1 (if C1 was packed).
        bli_unpackm_int( c1_pack, &c1,
                         cntx, bli_cntl_sub_unpackm_c( cntl ),
                         bli_thrinfo_sub_ipackm( thread ) );
	}

	// If any packing buffers were acquired within packm, release them back
	// to the memory manager.
    bli_thread_obarrier( thread );
    if( bli_thread_am_ochief( thread ) )
        bli_packm_release( a_pack, bli_cntl_sub_packm_a( cntl ) );
    if( bli_thread_am_ichief( thread ) ) {
        bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) );
        bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) );
    }
}
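
The loop structure above, compute a per-thread [my_start, my_end) range along the partitioning dimension and then step through it in algorithmic blocks, can be shown in isolation. Below is a minimal standalone sketch in which the hypothetical helpers get_range() and determine_blocksize() stand in for bli_thread_get_range_weighted_r2l() and bli_determine_blocksize_b(); it splits the range evenly and walks it left to right, ignoring the weighting and right-to-left traversal of the real code.

#include <stdio.h>

// Hypothetical range splitter: divide [0, n) evenly among nt threads.
static void get_range( int tid, int nt, int n, int* start, int* end )
{
	int chunk = ( n + nt - 1 ) / nt;
	*start = tid * chunk;
	*end   = ( *start + chunk < n ? *start + chunk : n );
}

// Hypothetical blocksize rule: use b_max except for the final partial block.
static int determine_blocksize( int i, int end, int b_max )
{
	int left = end - i;
	return ( left < b_max ? left : b_max );
}

int main( void )
{
	const int n = 23, nt = 4, b_max = 4;

	for ( int tid = 0; tid < nt; ++tid )
	{
		int my_start, my_end, b_alg;
		get_range( tid, nt, n, &my_start, &my_end );

		// Partition this thread's range into blocks of at most b_max
		// columns, analogous to the n-dimension loop in the variant above.
		for ( int i = my_start; i < my_end; i += b_alg )
		{
			b_alg = determine_blocksize( i, my_end, b_max );
			printf( "thread %d: columns [%d, %d)\n", tid, i, i + b_alg );
		}
	}
	return 0;
}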
Example #3
void bli_gemm_int
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	obj_t     a_local;
	obj_t     b_local;
	obj_t     c_local;
	gemm_voft f;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_gemm_basic_check( alpha, a, b, beta, c, cntx );

	// If C has a zero dimension, return early.
	if ( bli_obj_has_zero_dim( *c ) ) return;

	// If A or B has a zero dimension, scale C by beta and return early.
	if ( bli_obj_has_zero_dim( *a ) ||
	     bli_obj_has_zero_dim( *b ) )
	{
        if ( bli_thread_am_ochief( thread ) )
		    bli_scalm( beta, c );
        bli_thread_obarrier( thread );
		return;
	}

	// If A or B is marked as being filled with zeros, scale C by beta and
	// return early.
	if ( bli_obj_is_zeros( *a ) ||
	     bli_obj_is_zeros( *b ) )
	{
		// This should never execute.
		bli_abort();

        if ( bli_thread_am_ochief( thread ) )
		    bli_scalm( beta, c );
        bli_thread_obarrier( thread );
		return;
	}

	// Alias A, B, and C in case we need to update attached scalars.
	bli_obj_alias_to( *a, a_local );
	bli_obj_alias_to( *b, b_local );
	bli_obj_alias_to( *c, c_local );

	// If alpha is non-unit, typecast and apply it to the scalar attached
	// to B.
	if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
	{
        bli_obj_scalar_apply_scalar( alpha, &b_local );
	}

	// If beta is non-unit, typecast and apply it to the scalar attached
	// to C.
	if ( !bli_obj_equals( beta, &BLIS_ONE ) )
	{
        bli_obj_scalar_apply_scalar( beta, &c_local );
	}

	// Create the next node in the thrinfo_t structure.
	bli_thrinfo_grow( cntx, cntl, thread );

	// Extract the function pointer from the current control tree node.
	f = bli_cntl_var_func( cntl );

	// Somewhat hackish support for 3m3, 3m2, and 4m1b method implementations.
	{
		ind_t im = bli_cntx_get_ind_method( cntx );

		if ( im != BLIS_NAT )
		{
			if      ( im == BLIS_3M3  && f == bli_gemm_packa    ) f = bli_gemm3m3_packa;
			else if ( im == BLIS_3M2  && f == bli_gemm_ker_var2 ) f = bli_gemm3m2_ker_var2;
			else if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2;
		}
	}

	// Invoke the variant.
	f
	(
	  &a_local,
	  &b_local,
	  &c_local,
	  cntx,
	  cntl,
      thread
	);
}
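
Rather than threading alpha and beta through every lower layer, the code aliases A, B, and C locally and folds the scalars into the aliases, so downstream variants see unit alpha and beta and pick the scalars up from the objects themselves. A simplified standalone analogue, using a hypothetical mat_t with a single attached scalar field (the real obj_t carries far more state):

#include <stdio.h>

typedef struct
{
	double  scalar;  // Scalar attached to the object (analogue of the obj_t scalar).
	double* buffer;  // Matrix data (unused in this sketch).
} mat_t;

// Shallow alias: copies the descriptor, shares the buffer.
static void alias_to( const mat_t* a, mat_t* a_local ) { *a_local = *a; }

// Fold an external scalar into the object's attached scalar.
static void apply_scalar( double alpha, mat_t* a ) { a->scalar *= alpha; }

int main( void )
{
	mat_t  b = { 1.0, NULL }, c = { 1.0, NULL };
	mat_t  b_local, c_local;
	double alpha = 2.0, beta = 0.5;

	// Alias so the originals are untouched, then attach the scalars.
	alias_to( &b, &b_local );
	alias_to( &c, &c_local );
	if ( alpha != 1.0 ) apply_scalar( alpha, &b_local );
	if ( beta  != 1.0 ) apply_scalar( beta,  &c_local );

	// Downstream variants now see unit alpha/beta and read the attached scalars.
	printf( "b_local.scalar = %.1f, c_local.scalar = %.1f\n",
	        b_local.scalar, c_local.scalar );
	return 0;
}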
Example #4
void bli_packm_unb_var1
     (
       obj_t*  c,
       obj_t*  p,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	num_t     dt_cp     = bli_obj_datatype( *c );

	struc_t   strucc    = bli_obj_struc( *c );
	doff_t    diagoffc  = bli_obj_diag_offset( *c );
	diag_t    diagc     = bli_obj_diag( *c );
	uplo_t    uploc     = bli_obj_uplo( *c );
	trans_t   transc    = bli_obj_conjtrans_status( *c );

	dim_t     m_p       = bli_obj_length( *p );
	dim_t     n_p       = bli_obj_width( *p );
	dim_t     m_max_p   = bli_obj_padded_length( *p );
	dim_t     n_max_p   = bli_obj_padded_width( *p );

	void*     buf_c     = bli_obj_buffer_at_off( *c );
	inc_t     rs_c      = bli_obj_row_stride( *c );
	inc_t     cs_c      = bli_obj_col_stride( *c );

	void*     buf_p     = bli_obj_buffer_at_off( *p );
	inc_t     rs_p      = bli_obj_row_stride( *p );
	inc_t     cs_p      = bli_obj_col_stride( *p );

	void*     buf_kappa;

	FUNCPTR_T f;


	// This variant assumes that the computational kernel will always apply
	// the alpha scalar of the higher-level operation. Thus, we use BLIS_ONE
	// for kappa so that the underlying packm implementation does not scale
	// during packing.
	buf_kappa = bli_obj_buffer_for_const( dt_cp, BLIS_ONE );

	// Index into the type combination array to extract the correct
	// function pointer.
	f = ftypes[dt_cp];

    if( bli_thread_am_ochief( thread ) ) {
        // Invoke the function.
        f
		(
		  strucc,
          diagoffc,
          diagc,
          uploc,
          transc,
          m_p,
          n_p,
          m_max_p,
          n_max_p,
          buf_kappa,
          buf_c, rs_c, cs_c,
          buf_p, rs_p, cs_p,
		  cntx
		);
    }
}
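
The line f = ftypes[dt_cp] is the object layer's usual datatype dispatch: a table of typed function pointers indexed by a datatype value. A hypothetical standalone sketch of the same idea (the names below are illustrative, not BLIS identifiers):

#include <stdio.h>

typedef enum { DT_FLOAT = 0, DT_DOUBLE = 1, DT_NUM } dt_t;

typedef void (*pack_fp)( int m, int n );

static void pack_float ( int m, int n ) { printf( "packing %dx%d floats\n",  m, n ); }
static void pack_double( int m, int n ) { printf( "packing %dx%d doubles\n", m, n ); }

// Datatype-indexed table of typed implementations (analogue of ftypes[]).
static pack_fp ftypes[DT_NUM] = { pack_float, pack_double };

int main( void )
{
	dt_t dt = DT_DOUBLE;

	// Index into the table to select the correct typed kernel, then invoke it.
	pack_fp f = ftypes[dt];
	f( 8, 6 );
	return 0;
}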
Example #5
File: bli_trsm_int.c  Project: figual/blis
void bli_trsm_int
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	obj_t        a_local;
	obj_t        b_local;
	obj_t        c_local;
	trsm_var_oft f;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_gemm_basic_check( alpha, a, b, beta, c, cntx );

	// If C has a zero dimension, return early.
	if ( bli_obj_has_zero_dim( c ) ) return;

	// If A or B has a zero dimension, scale C by beta and return early.
	if ( bli_obj_has_zero_dim( a ) ||
	     bli_obj_has_zero_dim( b ) )
	{
		if ( bli_thread_am_ochief( thread ) )
		    bli_scalm( beta, c );
		bli_thread_obarrier( thread );
		return;
	}

	// Alias A and B in case we need to update attached scalars.
	bli_obj_alias_to( a, &a_local );
	bli_obj_alias_to( b, &b_local );

	// Alias C in case we need to induce a transposition.
	bli_obj_alias_to( c, &c_local );

	// If we are about to call a leaf-level implementation, and matrix C
	// still needs a transposition, then we must induce one by swapping the
	// strides and dimensions. Note that this transposition would normally
	// be handled explicitly in the packing of C, but if C is not being
	// packed, this is our last chance to handle the transposition.
	if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( c ) )
	{
		bli_obj_induce_trans( &c_local );
		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &c_local );
	}

	// If beta is non-unit, apply it to the scalar attached to C.
	if ( !bli_obj_equals( beta, &BLIS_ONE ) )
	{
		bli_obj_scalar_apply_scalar( beta, &c_local );
	}

	// Determine which operand is the triangular one (implied by the structure
	// of the root object) and apply alpha to the other, non-triangular matrix.
	if ( bli_obj_root_is_triangular( a ) )
	{
		// If alpha is non-unit, typecast and apply it to the scalar
		// attached to B (the non-triangular matrix).
		if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
		{
			bli_obj_scalar_apply_scalar( alpha, &b_local );
		}
	}
	else // if ( bli_obj_root_is_triangular( b ) )
	{
		// If alpha is non-unit, typecast and apply it to the scalar
		// attached to A (the non-triangular matrix).
		if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
		{
            bli_obj_scalar_apply_scalar( alpha, &a_local );
		}
	}

	// FGVZ->TMS: Is this barrier still needed?
	bli_thread_obarrier( thread );

	// Create the next node in the thrinfo_t structure.
	bli_thrinfo_grow( rntm, cntl, thread );

	// Extract the function pointer from the current control tree node.
	f = bli_cntl_var_func( cntl );

	// Invoke the variant.
	f
	(
	  &a_local,
	  &b_local,
	  &c_local,
	  cntx,
	  rntm,
	  cntl,
	  thread
	);
}
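
Inducing a transposition "by swapping the strides and dimensions", as the comment above describes, can be demonstrated on a plain strided descriptor. A minimal sketch with a hypothetical mdesc_t type; it reinterprets the same buffer as its transpose without moving any data:

#include <stdio.h>

typedef struct
{
	int     m, n;    // Dimensions.
	int     rs, cs;  // Row and column strides.
	double* buf;
} mdesc_t;

// Reinterpret the same buffer as its transpose: swap dims and swap strides.
static void induce_trans( mdesc_t* a )
{
	int t;
	t = a->m;  a->m  = a->n;  a->n  = t;
	t = a->rs; a->rs = a->cs; a->cs = t;
}

int main( void )
{
	double  buf[6] = { 1, 2, 3, 4, 5, 6 };  // 2x3, row-major.
	mdesc_t a      = { 2, 3, 3, 1, buf };

	induce_trans( &a );                     // Now a 3x2 view of the same data.

	// Element (i,j) of the transposed view is buf[ i*rs + j*cs ].
	printf( "a^T(0,1) = %.0f\n", a.buf[ 0 * a.rs + 1 * a.cs ] );  // Prints 4.
	return 0;
}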
Example #6
void bli_unpackm_int( obj_t*     p,
                      obj_t*     a,
                      cntx_t*    cntx,
                      unpackm_t* cntl,
                      thrinfo_t* thread )
{
	// The unpackm operation consists of an optional post-process: castm.
	// (This post-process is analogous to the castm pre-process in packm.)
	// Here are the possible ways unpackm can execute:
	//  1. unpack and cast: Unpack to a temporary matrix c and then cast
	//     c to a.
	//  2. unpack only: Unpack directly to matrix a since typecasting is
	//     not needed.
	//  3. cast only: Not yet supported / not used.
	//  4. no-op: The control tree directs us to skip the unpack operation
	//     entirely. No action is taken.

	obj_t     c;

	varnum_t  n;
	impl_t    i;
	FUNCPTR_T f;

	// Sanity check; A should never have a zero dimension. If we must support
	// it, then we should fold it into the next alias-and-early-exit block.
	//if ( bli_obj_has_zero_dim( *a ) ) bli_abort();

	// First check if we are to skip this operation because the control tree
	// is NULL, and if so, simply return.
	if ( bli_cntl_is_noop( cntl ) )
	{
		return;
	}

	// If p was aliased to a during the pack stage (because it was already
	// in an acceptable packed/contiguous format), then no unpack is actually
	// necessary, so we return.
	if ( bli_obj_is_alias_of( *p, *a ) )
	{
		return;
	}

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_unpackm_check( p, a, cntx, cntl );

	// Now, if we are not skipping the unpack operation, then the only
	// question left is whether we are to typecast matrix a after unpacking.
	if ( bli_obj_datatype( *p ) != bli_obj_datatype( *a ) )
		bli_abort();
/*
	if ( bli_obj_datatype( *p ) != bli_obj_datatype( *a ) )
	{
		// Initialize an object c for the intermediate typecast matrix.
		bli_unpackm_init_cast( p,
		                       a,
		                       &c );
	}
	else
*/
	{
		// If no cast is needed, then aliasing object c to the original
		// matrix serves as a minor optimization. This causes the unpackm
		// implementation to unpack directly into matrix a.
		bli_obj_alias_to( *a, c );
	}

	// Now we are ready to proceed with the unpacking.

	// Extract the variant number and implementation type.
	n = bli_cntl_var_num( cntl );
	i = bli_cntl_impl_type( cntl );

	// Index into the variant array to extract the correct function pointer.
	f = vars[n][i];

	// Invoke the variant.
    if( bli_thread_am_ochief( thread ) ) {
        f( p,
           &c,
		   cntx,
           cntl );
    }
    bli_thread_obarrier( thread );

	// Now, if necessary, we cast the contents of c to matrix a. If casting
	// was not necessary, then we are done because the call to the unpackm
	// implementation would have unpacked directly to matrix a.
/*
	if ( bli_obj_datatype( *p ) != bli_obj_datatype( *a ) )
	{
		// Copy/typecast matrix c to matrix a.
		// NOTE: Here, we use copynzm instead of copym because, in the cases
		// where we are unpacking/typecasting a real matrix c to a complex
		// matrix a, we want to touch only the real components of a, rather
		// than also set the imaginary components to zero. This comes about
		// because of the fact that, if we are unpacking real-to-complex,
		// then it is because all of the computation occurred in the real
		// domain, and so we would want to leave whatever imaginary values
		// there are in matrix a untouched. Notice that for unpackings that
		// entail complex-to-complex data movements, the copynzm operation
		// behaves exactly as copym, so no use cases are lost (at least none
		// that I can think of).
		bli_copynzm( &c,
		             a );

		// NOTE: The above code/comment is outdated. What should happen is
		// as follows:
		// - If dt(a) is complex and dt(p) is real, then create an alias of
		//   a and then tweak it so that it looks like a real domain object.
		//   This will involve:
		//   - projecting the datatype to real domain
		//   - scaling both the row and column strides by 2
		//   ALL OF THIS should be done in the front-end, NOT here, as
		//   unpackm() won't even be needed in that case.
	}
*/
}
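
Two of the early returns above hinge on cheap descriptor checks: a no-op control-tree node, and p being an alias of a (in which case the data already lives where it needs to be). The following hypothetical sketch shows the aliasing check reduced to a buffer comparison; the real bli_obj_is_alias_of() works on obj_t descriptors:

#include <stdbool.h>
#include <stdio.h>

typedef struct
{
	double* buf;  // Underlying storage.
	int     m, n;
} mat_t;

// Two objects alias each other if they refer to the same buffer.
static bool is_alias_of( const mat_t* p, const mat_t* a )
{
	return p->buf == a->buf;
}

int main( void )
{
	double storage[4];
	mat_t  a = { storage, 2, 2 };
	mat_t  p = a;                 // The "pack" stage left p aliased to a.

	if ( is_alias_of( &p, &a ) )
	{
		// Nothing to unpack: the data already lives in a's buffer.
		printf( "p aliases a; skipping unpack\n" );
		return 0;
	}

	// ... otherwise an unpack variant would copy p's buffer back into a ...
	return 0;
}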
Example #7
void bli_gemm_int( obj_t*  alpha,
                   obj_t*  a,
                   obj_t*  b,
                   obj_t*  beta,
                   obj_t*  c,
                   cntx_t* cntx,
                   gemm_t* cntl,
                   thrinfo_t* thread )
{
	obj_t     a_local;
	obj_t     b_local;
	obj_t     c_local;
	varnum_t  n;
	impl_t    i;
	FUNCPTR_T f;
	ind_t     im;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_gemm_basic_check( alpha, a, b, beta, c, cntx );

	// If C has a zero dimension, return early.
	if ( bli_obj_has_zero_dim( *c ) ) return;

	// If A or B has a zero dimension, scale C by beta and return early.
	if ( bli_obj_has_zero_dim( *a ) ||
	     bli_obj_has_zero_dim( *b ) )
	{
        if( bli_thread_am_ochief( thread ) )
		    bli_scalm( beta, c );
        bli_thread_obarrier( thread );
		return;
	}

	// If A or B is marked as being filled with zeros, scale C by beta and
	// return early.
	if ( bli_obj_is_zeros( *a ) ||
	     bli_obj_is_zeros( *b ) )
	{
        if( bli_thread_am_ochief( thread ) )
		    bli_scalm( beta, c );
        bli_thread_obarrier( thread );
		return;
	}

	// Alias A and B in case we need to update attached scalars.
	bli_obj_alias_to( *a, a_local );
	bli_obj_alias_to( *b, b_local );

	// Alias C in case we need to induce a transposition.
	bli_obj_alias_to( *c, c_local );

	// If we are about to call a leaf-level implementation, and matrix C
	// still needs a transposition, then we must induce one by swapping the
	// strides and dimensions. Note that this transposition would normally
	// be handled explicitly in the packing of C, but if C is not being
	// packed, this is our last chance to handle the transposition.
	if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
	{
        //if( bli_thread_am_ochief( thread ) ) {
            bli_obj_induce_trans( c_local );
            bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
       // }
	}

	// If alpha is non-unit, typecast and apply it to the scalar attached
	// to B.
	if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
	{
        bli_obj_scalar_apply_scalar( alpha, &b_local );
	}

	// If beta is non-unit, typecast and apply it to the scalar attached
	// to C.
	if ( !bli_obj_equals( beta, &BLIS_ONE ) )
	{
        bli_obj_scalar_apply_scalar( beta, &c_local );
	}

	// Extract the variant number and implementation type.
	n = bli_cntl_var_num( cntl );
	i = bli_cntl_impl_type( cntl );

	// Index into the variant array to extract the correct function pointer.
	f = vars[n][i];

	// Somewhat hackish support for 3m3, 3m2, and 4m1b method implementations.
	im = bli_cntx_get_ind_method( cntx );

	if ( im != BLIS_NAT )
	{
		if      ( im == BLIS_3M3  && f == bli_gemm_blk_var1f ) f = bli_gemm_blk_var4f;
		else if ( im == BLIS_3M2  && f == bli_gemm_ker_var2  ) f = bli_gemm_ker_var4;
		else if ( im == BLIS_4M1B && f == bli_gemm_ker_var2  ) f = bli_gemm_ker_var3;
	}

	// Invoke the variant.
	f( &a_local,
	   &b_local,
	   &c_local,
	   cntx,
	   cntl,
       thread );
}
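
The lookup f = vars[n][i] is the same table-driven dispatch shown after Example #4, except that the table here is two-dimensional: one axis for the variant number, one for the implementation type. A hypothetical standalone sketch (the variant names below are illustrative only):

#include <stdio.h>

enum { NUM_VARS = 2, NUM_IMPLS = 2 };
enum { IMPL_REF = 0, IMPL_OPT = 1 };

typedef void (*gemm_fp)( void );

static void blk_var1_ref( void ) { printf( "blocked variant 1 (reference)\n" ); }
static void blk_var1_opt( void ) { printf( "blocked variant 1 (optimized)\n" ); }
static void ker_var2_ref( void ) { printf( "kernel variant 2 (reference)\n" ); }
static void ker_var2_opt( void ) { printf( "kernel variant 2 (optimized)\n" ); }

// Variant-by-implementation table (analogue of vars[n][i]).
static gemm_fp vars[NUM_VARS][NUM_IMPLS] =
{
	{ blk_var1_ref, blk_var1_opt },
	{ ker_var2_ref, ker_var2_opt },
};

int main( void )
{
	int n = 1;         // Variant number, as the control tree would supply it.
	int i = IMPL_OPT;  // Implementation type.

	// Index into the table to select the variant, then invoke it.
	gemm_fp f = vars[n][i];
	f();
	return 0;
}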