C++ (Cpp) bli_thread_obarrier Examples

Example #1

0

Show file

File: blx_gemm_blk_var3.c Project: figual/blis

//
// Block variant 3 for gemm: steps through the k dimension (the width of A
// after transposition) one blocksize at a time and hands each pair of
// partitions of A and B to blx_gemm_int(), all of them targeting the same C.
//
//   a, b       - operands from which the current partitions are acquired.
//   c          - object updated by every subproblem; its attached scalar is
//                reset once the first subproblem has completed.
//   cntx, rntm - forwarded unchanged to the subproblem.
//   cntl       - control tree node; provides the blocksize id and the
//                sub-node given to the subproblem.
//   thread     - thrinfo_t node; its sub-node is given to the subproblem and
//                is also the node on which the per-step barrier is taken.
//
void blx_gemm_blk_var3
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	obj_t a_panel, b_panel;

	// Full extent of the dimension being partitioned.
	const dim_t k_total = bli_obj_width_after_trans( a );

	dim_t k_off = 0;

	while ( k_off < k_total )
	{
		// Algorithmic blocksize for the step that begins at k_off.
		const dim_t k_step = blx_determine_blocksize_f
		(
		  k_off, k_total, c,
		  bli_cntl_bszid( cntl ), cntx
		);

		// Carve the current partitions out of A and B.
		bli_acquire_mpart_ndim( BLIS_FWD, BLIS_SUBPART1,
		                        k_off, k_step, a, &a_panel );
		bli_acquire_mpart_mdim( BLIS_FWD, BLIS_SUBPART1,
		                        k_off, k_step, b, &b_panel );

		// Run the gemm subproblem on the two partitions.
		blx_gemm_int
		(
		  &a_panel, &b_panel, c, cntx, rntm,
		  bli_cntl_sub_node( cntl ),
		  bli_thrinfo_sub_node( thread )
		);

		bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );

		// Several rank-k updates are accumulated into C, so the scalar
		// attached to C (beta) may only take effect during the first one;
		// later steps must behave as if it were BLIS_ONE. Because c is a
		// locally aliased obj_t, its attached scalar can simply be reset
		// after the first step.
		if ( k_off == 0 ) bli_obj_scalar_reset( c );

		k_off += k_step;
	}
}

Example #2

0

Show file

File: bli_trsm_blk_var3.c Project: honnibal/cython-blis

//
// Block variant 3 for trsm: steps through the k dimension (the width of A
// after transposition) and issues one bli_trsm_int() subproblem per pair of
// partitions of A and B, all of them targeting the same C.
//
//   a, b   - operands from which the current partitions are acquired; their
//            attached scalars are reset after the first step.
//   c      - object updated by every subproblem; its attached scalar is
//            also reset after the first step.
//   cntx   - forwarded unchanged to the subproblem.
//   cntl   - control tree node; provides the blocksize id and the sub-node
//            given to the subproblem.
//   thread - thrinfo_t node; its sub-node is given to the subproblem and is
//            also the node on which the per-step barrier is taken.
//
void bli_trsm_blk_var3
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	obj_t a_panel, b_panel;

	// Direction (forwards or backwards) in which to move through k.
	const dir_t direct = bli_l3_direct( a, b, c, cntl );

	// Drop any zero region lying along the partitioned dimension. This has
	// to precede the width query below.
	bli_l3_prune_unref_mparts_k( a, b, c, cntl );

	// Full extent of the dimension being partitioned.
	const dim_t k_total = bli_obj_width_after_trans( *a );

	dim_t k_off = 0;

	while ( k_off < k_total )
	{
		// Algorithmic blocksize for the step that begins at k_off.
		const dim_t k_step = bli_trsm_determine_kc
		(
		  direct, k_off, k_total, a, b,
		  bli_cntl_bszid( cntl ), cntx
		);

		// Carve the current partitions out of A and B.
		bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, k_off, k_step, a, &a_panel );
		bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, k_off, k_step, b, &b_panel );

		// Run the trsm subproblem on the two partitions.
		bli_trsm_int
		(
		  &BLIS_ONE, &a_panel, &b_panel,
		  &BLIS_ONE, c,
		  cntx,
		  bli_cntl_sub_node( cntl ),
		  bli_thrinfo_sub_node( thread )
		);

		// Barrier on the sub-node. (A bli_thread_ibarrier( thread ) call was
		// used at this point previously and is disabled.)
		bli_thread_obarrier( bli_thrinfo_sub_node( thread ) );

		// Several rank-k updates are performed, so scalars attached to A/B
		// and C may only take effect during the first one; reset all three
		// once the first step has finished.
		if ( k_off == 0 )
		{
			bli_obj_scalar_reset( a );
			bli_obj_scalar_reset( b );
			bli_obj_scalar_reset( c );
		}

		k_off += k_step;
	}
}

Example #3

0

Show file

File: bli_trmm_blk_var2b.c Project: honnibal/cython-blis

//
// Block variant 2b for trmm: packs A once, then walks this thread's range of
// the n dimension (acquired right-to-left), packing the current partitions
// B1 and C1, running the trmm subproblem on the packed objects, and
// unpacking C1 afterwards.
//
//   a, b, c - operands; b and c are partitioned inside the loop, a is packed
//             as a whole before the loop.
//   cntx    - context forwarded to every init/pack/compute/unpack call.
//   cntl    - control tree node supplying the packm/scalm/gemm/unpackm
//             sub-nodes and the blocksize id.
//   thread  - thrinfo_t node used for the chief tests, broadcasts, barriers,
//             range computation and the sub-nodes passed to the callees.
//
// NOTE(review): the placement of the chief-only sections, broadcasts and
// barriers below appears to be order-sensitive; do not reorder without
// checking the threading layer.
//
void bli_trmm_blk_var2b( obj_t*  a,
                         obj_t*  b,
                         obj_t*  c,
                         cntx_t* cntx,
                         gemm_t* cntl,
                         thrinfo_t* thread )
{
    // Per-thread storage for the pack objects. The pointers below are what
    // the rest of the function actually uses; they are assigned from the
    // broadcast calls further down.
    obj_t a_pack_s;
    obj_t b1_pack_s, c1_pack_s;
    
    obj_t b1, c1; 
    obj_t*  a_pack = NULL;
    obj_t*  b1_pack = NULL;
    obj_t*  c1_pack = NULL;

	dim_t i;
	dim_t b_alg;

	// Prune any zero region that exists along the partitioning dimension.
	bli_trmm_prune_unref_mparts_n( a, b, c );

    // Only the thread for which bli_thread_am_ochief() holds initializes the
    // pack object for A and performs the scalm call on C.
    if( bli_thread_am_ochief( thread ) ) { 
        // Initialize object for packing A
        bli_obj_init_pack( &a_pack_s );
        bli_packm_init( a, &a_pack_s,
                        cntx, bli_cntl_sub_packm_a( cntl ) );

        // Scale C by beta (if instructed).
        bli_scalm_int( &BLIS_ONE,
                       c,  
                       cntx, bli_cntl_sub_scalm( cntl ) );
    }   
    // NOTE(review): presumably this hands every thread the chief's
    // &a_pack_s so that all of them share one pack object -- confirm against
    // bli_thread_obroadcast().
    a_pack = bli_thread_obroadcast( thread, &a_pack_s );

    // Initialize pack objects for B and C that are passed into packm_init().
    if( bli_thread_am_ichief( thread ) ) { 
        bli_obj_init_pack( &b1_pack_s );
        bli_obj_init_pack( &c1_pack_s );
    }   
    // NOTE(review): same pattern as above, using the "i" variants of the
    // chief test and broadcast -- confirm their scope in the threading layer.
    b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s );
    c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s );

	// Pack A (if instructed).
	bli_packm_int( a, a_pack,
	               cntx, bli_cntl_sub_packm_a( cntl ),
                   bli_thrinfo_sub_opackm( thread ) );

    // Obtain the [my_start, my_end) range that the loop below iterates over
    // for this thread.
    dim_t my_start, my_end;
    bli_thread_get_range_weighted_r2l( thread, b,
                                bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ),
                                &my_start, &my_end );

	// Partition along the n dimension.
	for ( i = my_start; i < my_end; i += b_alg )
	{
		// Determine the current algorithmic blocksize.
		b_alg = bli_determine_blocksize_b( i, my_end, b,
		                                   bli_cntl_bszid( cntl ), cntx );

		// Acquire partitions for B1 and C1.
		bli_acquire_mpart_r2l( BLIS_SUBPART1,
		                       i, b_alg, b, &b1 );
		bli_acquire_mpart_r2l( BLIS_SUBPART1,
		                       i, b_alg, c, &c1 );

		// Initialize objects for packing B1 and C1.
        if( bli_thread_am_ichief( thread ) ) {
            bli_packm_init( &b1, b1_pack,
                            cntx, bli_cntl_sub_packm_b( cntl ) );
            bli_packm_init( &c1, c1_pack,
                            cntx, bli_cntl_sub_packm_c( cntl ) );
        }
        // Barrier after the chief-only initialization and before the pack
        // objects are used by the packm calls below.
        bli_thread_ibarrier( thread );

		// Pack B1 (if instructed).
		bli_packm_int( &b1, b1_pack,
		               cntx, bli_cntl_sub_packm_b( cntl ),
                       bli_thrinfo_sub_ipackm( thread ) );

		// Pack C1 (if instructed).
		bli_packm_int( &c1, c1_pack,
		               cntx, bli_cntl_sub_packm_c( cntl ),
                       bli_thrinfo_sub_ipackm( thread ) );

		// Perform trmm subproblem.
		bli_trmm_int( &BLIS_ONE,
		              a_pack,
		              b1_pack,
		              &BLIS_ONE,
		              c1_pack,
		              cntx,
		              bli_cntl_sub_gemm( cntl ),
                      bli_thrinfo_sub_self( thread ) );
        // Barrier between the subproblem and the unpack of C1.
        bli_thread_ibarrier( thread );

        // Unpack C1 (if C1 was packed).
        bli_unpackm_int( c1_pack, &c1,
                         cntx, bli_cntl_sub_unpackm_c( cntl ),
                         bli_thrinfo_sub_ipackm( thread ) );
	}

	// If any packing buffers were acquired within packm, release them back
	// to the memory manager.
    // The barrier comes first; the releases are then done only by the same
    // chief threads that initialized the corresponding pack objects.
    bli_thread_obarrier( thread );
    if( bli_thread_am_ochief( thread ) )
        bli_packm_release( a_pack, bli_cntl_sub_packm_a( cntl ) );
    if( bli_thread_am_ichief( thread ) ) {
        bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) );
        bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) );
    }
}

Example #4

0

Show file

File: bli_gemm_int.c Project: honnibal/cython-blis

//
// Internal gemm entry point: performs parameter checking and early exits,
// folds alpha and beta into the scalars attached to local aliases of B and
// C, grows the thrinfo_t structure, and invokes the variant function stored
// in the current control tree node.
//
//   alpha, beta - scalars; each is applied to an attached scalar only when
//                 it does not equal BLIS_ONE.
//   a, b, c     - operands; the variant receives local aliases of them.
//   cntx        - context; also queried for the induced method.
//   cntl        - current control tree node.
//   thread      - thrinfo_t node for the chief test, the barrier, and the
//                 variant call.
//
void bli_gemm_int
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	obj_t     a_local;
	obj_t     b_local;
	obj_t     c_local;
	gemm_voft variant;
	ind_t     method;

	// Validate the arguments when error checking is switched on.
	if ( bli_error_checking_is_enabled() )
	{
		bli_gemm_basic_check( alpha, a, b, beta, c, cntx );
	}

	// Nothing to compute if C has a zero dimension.
	if ( bli_obj_has_zero_dim( *c ) )
	{
		return;
	}

	// If A or B has a zero dimension, the only work is scaling C by beta.
	// The chief thread does the scaling; every thread takes the barrier.
	if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *b ) )
	{
		if ( bli_thread_am_ochief( thread ) ) bli_scalm( beta, c );

		bli_thread_obarrier( thread );
		return;
	}

	// A or B being marked as filled with zeros is not expected at this
	// point: bli_abort() is called first. The scale-and-return statements
	// that follow it are retained as they were.
	if ( bli_obj_is_zeros( *a ) || bli_obj_is_zeros( *b ) )
	{
		// This should never execute.
		bli_abort();

		if ( bli_thread_am_ochief( thread ) ) bli_scalm( beta, c );

		bli_thread_obarrier( thread );
		return;
	}

	// Work on aliases so that attached scalars can be updated without
	// touching the caller's objects.
	bli_obj_alias_to( *a, a_local );
	bli_obj_alias_to( *b, b_local );
	bli_obj_alias_to( *c, c_local );

	// A non-unit alpha is typecast and applied to the scalar attached to B.
	if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
	{
		bli_obj_scalar_apply_scalar( alpha, &b_local );
	}

	// A non-unit beta is typecast and applied to the scalar attached to C.
	if ( !bli_obj_equals( beta, &BLIS_ONE ) )
	{
		bli_obj_scalar_apply_scalar( beta, &c_local );
	}

	// Create the next node in the thrinfo_t structure.
	bli_thrinfo_grow( cntx, cntl, thread );

	// Function pointer stored in the current control tree node.
	variant = bli_cntl_var_func( cntl );

	// Somewhat hackish support for the 3m3, 3m2, and 4m1b methods: for
	// these induced methods, particular variants are swapped for their
	// method-specific counterparts. Any other method (including BLIS_NAT)
	// leaves the function pointer alone.
	method = bli_cntx_get_ind_method( cntx );

	switch ( method )
	{
		case BLIS_3M3:
			if ( variant == bli_gemm_packa ) variant = bli_gemm3m3_packa;
			break;

		case BLIS_3M2:
			if ( variant == bli_gemm_ker_var2 ) variant = bli_gemm3m2_ker_var2;
			break;

		case BLIS_4M1B:
			if ( variant == bli_gemm_ker_var2 ) variant = bli_gemm4mb_ker_var2;
			break;

		default:
			break;
	}

	// Invoke the variant on the local aliases.
	variant( &a_local, &b_local, &c_local, cntx, cntl, thread );
}

Example #5

0

Show file

File: bli_trsm_int.c Project: figual/blis

//
// Internal trsm entry point: performs parameter checking and early exits,
// aliases the operands, induces a transposition on C when a leaf-level
// implementation is about to be called, folds alpha and beta into attached
// scalars, then grows the thrinfo_t structure and invokes the variant
// function stored in the current control tree node.
//
//   alpha, beta - scalars; each is applied to an attached scalar only when
//                 it does not equal BLIS_ONE.
//   a, b, c     - operands; the variant receives local aliases of them.
//   cntx, rntm  - forwarded to the variant; rntm is also given to
//                 bli_thrinfo_grow().
//   cntl        - current control tree node.
//   thread      - thrinfo_t node for the chief test, the barriers, and the
//                 variant call.
//
void bli_trsm_int
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	obj_t        a_local;
	obj_t        b_local;
	obj_t        c_local;
	trsm_var_oft variant;

	// Validate the arguments when error checking is switched on.
	if ( bli_error_checking_is_enabled() )
	{
		bli_gemm_basic_check( alpha, a, b, beta, c, cntx );
	}

	// Nothing to compute if C has a zero dimension.
	if ( bli_obj_has_zero_dim( c ) )
	{
		return;
	}

	// If A or B has a zero dimension, the only work is scaling C by beta.
	// The chief thread does the scaling; every thread takes the barrier.
	if ( bli_obj_has_zero_dim( a ) || bli_obj_has_zero_dim( b ) )
	{
		if ( bli_thread_am_ochief( thread ) )
		{
			bli_scalm( beta, c );
		}
		bli_thread_obarrier( thread );
		return;
	}

	// Alias all three operands: A and B because their attached scalars may
	// be updated, C because a transposition may be induced on it.
	bli_obj_alias_to( a, &a_local );
	bli_obj_alias_to( b, &b_local );
	bli_obj_alias_to( c, &c_local );

	// When a leaf-level implementation is next and C still carries a
	// transposition, induce it here by swapping strides and dimensions.
	// Packing C would normally take care of this, but if C is not packed
	// this is the last opportunity to do so.
	if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( c ) )
	{
		bli_obj_induce_trans( &c_local );
		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &c_local );
	}

	// A non-unit beta is applied to the scalar attached to C.
	if ( !bli_obj_equals( beta, &BLIS_ONE ) )
	{
		bli_obj_scalar_apply_scalar( beta, &c_local );
	}

	// A non-unit alpha is typecast and applied to the scalar attached to
	// the non-triangular operand: B when the root of A is triangular, and
	// A otherwise (i.e. when the triangular root belongs to B).
	if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
	{
		if ( bli_obj_root_is_triangular( a ) )
		{
			bli_obj_scalar_apply_scalar( alpha, &b_local );
		}
		else
		{
			bli_obj_scalar_apply_scalar( alpha, &a_local );
		}
	}

	// FGVZ->TMS: Is this barrier still needed?
	bli_thread_obarrier( thread );

	// Create the next node in the thrinfo_t structure.
	bli_thrinfo_grow( rntm, cntl, thread );

	// Function pointer stored in the current control tree node.
	variant = bli_cntl_var_func( cntl );

	// Invoke the variant on the local aliases.
	variant( &a_local, &b_local, &c_local, cntx, rntm, cntl, thread );
}

Example #6

0

Show file

File: bli_unpackm_int.c Project: santanu-thangaraj/blis

//
// Internal unpackm entry point: unpacks p into a using the variant selected
// by the control tree node, unless the node marks the operation as a no-op
// or p is merely an alias of a.
//
// unpackm was designed with an optional castm post-process (analogous to
// the castm pre-process in packm), giving these modes:
//   1. unpack and cast: unpack to a temporary matrix, then cast it to a.
//   2. unpack only:     unpack directly into a; no typecast is needed.
//   3. cast only:       not yet supported / not used.
//   4. no-op:           the control tree says to skip unpacking entirely.
// Only modes 2 and 4 are live here: a datatype mismatch between p and a
// leads to bli_abort(). Earlier notes on the disabled cast path say that
// when a is complex and p is real, a real-domain alias of a (datatype
// projected to real, both strides scaled by 2) should be created in the
// front-end rather than here, in which case unpackm would not be needed.
//
//   p      - packed object to unpack from.
//   a      - object to unpack into.
//   cntx   - context forwarded to the variant.
//   cntl   - unpackm control tree node (variant number, implementation
//            type, no-op test).
//   thread - thrinfo_t node for the chief test and the barrier.
//
void bli_unpackm_int( obj_t*     p,
                      obj_t*     a,
                      cntx_t*    cntx,
                      unpackm_t* cntl,
                      thrinfo_t* thread )
{
	obj_t     a_view;

	varnum_t  var_idx;
	impl_t    impl_idx;
	FUNCPTR_T unpack_fn;

	// A is not expected to have a zero dimension here; that case is
	// deliberately left unchecked.

	// Leave immediately if the control tree marks this as a no-op, or if p
	// was aliased to a during the pack stage (a was already in an
	// acceptable packed/contiguous format), so there is nothing to unpack.
	if ( bli_cntl_is_noop( cntl ) || bli_obj_is_alias_of( *p, *a ) )
	{
		return;
	}

	// Validate the arguments when error checking is switched on.
	if ( bli_error_checking_is_enabled() )
	{
		bli_unpackm_check( p, a, cntx, cntl );
	}

	// Typecasting while unpacking is unsupported: differing datatypes
	// abort.
	if ( bli_obj_datatype( *p ) != bli_obj_datatype( *a ) )
	{
		bli_abort();
	}

	// With no cast needed, alias the destination to the original matrix so
	// that the variant unpacks directly into a.
	bli_obj_alias_to( *a, a_view );

	// Look up the variant function by variant number and implementation
	// type.
	var_idx   = bli_cntl_var_num( cntl );
	impl_idx  = bli_cntl_impl_type( cntl );
	unpack_fn = vars[var_idx][impl_idx];

	// The chief thread alone runs the variant; every thread then takes the
	// barrier.
	if ( bli_thread_am_ochief( thread ) )
	{
		unpack_fn( p, &a_view, cntx, cntl );
	}
	bli_thread_obarrier( thread );

	// No cast step follows: the variant has already unpacked directly into
	// matrix a.
}

Example #7

0

Show file

File: bli_gemm_int.c Project: santanu-thangaraj/blis

//
// Internal gemm entry point: performs parameter checking and early exits,
// aliases the operands, induces a transposition on C when a leaf-level
// implementation is about to be called, folds alpha and beta into attached
// scalars, and invokes the variant selected by the control tree node.
//
//   alpha, beta - scalars; each is applied to an attached scalar only when
//                 it does not equal BLIS_ONE.
//   a, b, c     - operands; the variant receives local aliases of them.
//   cntx        - context; also queried for the induced method.
//   cntl        - gemm control tree node (variant number, implementation
//                 type, leaf test).
//   thread      - thrinfo_t node for the chief test, the barrier, and the
//                 variant call.
//
void bli_gemm_int( obj_t*  alpha,
                   obj_t*  a,
                   obj_t*  b,
                   obj_t*  beta,
                   obj_t*  c,
                   cntx_t* cntx,
                   gemm_t* cntl,
                   thrinfo_t* thread )
{
	obj_t     a_local;
	obj_t     b_local;
	obj_t     c_local;
	varnum_t  var_idx;
	impl_t    impl_idx;
	FUNCPTR_T variant;
	ind_t     method;

	// Validate the arguments when error checking is switched on.
	if ( bli_error_checking_is_enabled() )
	{
		bli_gemm_basic_check( alpha, a, b, beta, c, cntx );
	}

	// Nothing to compute if C has a zero dimension.
	if ( bli_obj_has_zero_dim( *c ) )
	{
		return;
	}

	// If A or B has a zero dimension, or is marked as being filled with
	// zeros, the only work is scaling C by beta. The chief thread does the
	// scaling; every thread takes the barrier.
	if ( bli_obj_has_zero_dim( *a ) ||
	     bli_obj_has_zero_dim( *b ) ||
	     bli_obj_is_zeros( *a )     ||
	     bli_obj_is_zeros( *b ) )
	{
		if ( bli_thread_am_ochief( thread ) )
		{
			bli_scalm( beta, c );
		}
		bli_thread_obarrier( thread );
		return;
	}

	// Alias all three operands: A and B because their attached scalars may
	// be updated, C because a transposition may be induced on it.
	bli_obj_alias_to( *a, a_local );
	bli_obj_alias_to( *b, b_local );
	bli_obj_alias_to( *c, c_local );

	// When a leaf-level implementation is next and C still carries a
	// transposition, induce it here by swapping strides and dimensions.
	// Packing C would normally take care of this, but if C is not packed
	// this is the last opportunity to do so. Every thread does this on its
	// own alias (a chief-only guard around these two calls is disabled).
	if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
	{
		bli_obj_induce_trans( c_local );
		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
	}

	// A non-unit alpha is typecast and applied to the scalar attached to B.
	if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
	{
		bli_obj_scalar_apply_scalar( alpha, &b_local );
	}

	// A non-unit beta is typecast and applied to the scalar attached to C.
	if ( !bli_obj_equals( beta, &BLIS_ONE ) )
	{
		bli_obj_scalar_apply_scalar( beta, &c_local );
	}

	// Look up the variant function by variant number and implementation
	// type.
	var_idx  = bli_cntl_var_num( cntl );
	impl_idx = bli_cntl_impl_type( cntl );
	variant  = vars[var_idx][impl_idx];

	// Somewhat hackish support for the 3m3, 3m2, and 4m1b methods: for
	// these induced methods, particular variants are swapped for their
	// method-specific counterparts. Any other method (including BLIS_NAT)
	// leaves the function pointer alone.
	method = bli_cntx_get_ind_method( cntx );

	if ( method == BLIS_3M3 )
	{
		if ( variant == bli_gemm_blk_var1f ) variant = bli_gemm_blk_var4f;
	}
	else if ( variant == bli_gemm_ker_var2 )
	{
		if      ( method == BLIS_3M2  ) variant = bli_gemm_ker_var4;
		else if ( method == BLIS_4M1B ) variant = bli_gemm_ker_var3;
	}

	// Invoke the variant on the local aliases.
	variant( &a_local, &b_local, &c_local, cntx, cntl, thread );
}

Example #8

0

Show file

File: bli_packm_int.c Project: santanu-thangaraj/blis

//
// Internal packm entry point: decides whether packing a into p is needed
// at all and, if it is, invokes the variant selected by the control tree
// node, followed by a barrier.
//
//   a      - object to pack.
//   p      - pack object (already set up by packm_init()).
//   cntx   - context forwarded to the variant.
//   cntl   - packm control tree node (variant number, implementation type,
//            desired pack schema, no-op test).
//   thread - thrinfo_t node forwarded to the variant and used for the
//            barrier.
//
void bli_packm_int( obj_t*   a,
                    obj_t*   p,
                    cntx_t*  cntx,
                    packm_t* cntl,
                    thrinfo_t* thread )
{
	varnum_t  var_idx;
	impl_t    impl_idx;
	FUNCPTR_T pack_fn;

	// Validate the arguments when error checking is switched on.
	if ( bli_error_checking_is_enabled() )
	{
		bli_packm_int_check( a, p, cntx );
	}

	// A is not expected to have a zero dimension here; that case is
	// deliberately left unchecked.

	// A no-op control tree (NULL) means the operation is skipped. No action
	// is needed because a was already aliased to p in packm_init(). This
	// test comes first so that cntl is known to be non-NULL below.
	if ( bli_cntl_is_noop( cntl ) )
	{
		return;
	}

	// Three further cases in which there is nothing to pack:
	//  - a is packed to an unspecified (row or column) format. That status
	//    is only used when the actual format does not matter as long as
	//    rows or columns are contiguous (e.g. matrix operands of level-2
	//    operations), so the control tree need not be consulted; aliasing
	//    already took place in packm_init().
	//  - a is already packed to the schema encoded in the control tree.
	//    (Usually the status is BLIS_NOT_PACKED and packing is required;
	//    not every combination of current status and desired schema is
	//    valid.)
	//  - a is marked as being filled with zeros.
	if ( bli_obj_pack_schema( *a ) == BLIS_PACKED_UNSPEC      ||
	     bli_obj_pack_schema( *a ) == cntl_pack_schema( cntl ) ||
	     bli_obj_is_zeros( *a ) )
	{
		return;
	}

	// Look up the variant function by variant number and implementation
	// type.
	var_idx  = bli_cntl_var_num( cntl );
	impl_idx = bli_cntl_impl_type( cntl );
	pack_fn  = vars[var_idx][impl_idx];

	// Invoke the variant.
	pack_fn( a, p, cntx, thread );

	// Barrier so that packing is done before computation.
	bli_thread_obarrier( thread );
}