예제 #1
0
void bli_trmv_l_blk_var2( obj_t*  alpha,
                          obj_t*  a,
                          obj_t*  x,
                          trmv_t* cntl )
{
	obj_t   a11, a11_pack;
	obj_t   a21;
	obj_t   x1, x1_pack;
	obj_t   x2;

	dim_t   mn;
	dim_t   ij;
	dim_t   b_alg;

	// Initialize objects for packing.
	bli_obj_init_pack( &a11_pack );
	bli_obj_init_pack( &x1_pack );

	// Query dimension.
	mn = bli_obj_length( *a );

	// Partition diagonally.
	for ( ij = 0; ij < mn; ij += b_alg )
	{
		// Determine the current algorithmic blocksize.
		b_alg = bli_determine_blocksize_b( ij, mn, a,
		                                   cntl_blocksize( cntl ) );

		// Acquire partitions for A11, A21, x1, and x2.
		bli_acquire_mpart_br2tl( BLIS_SUBPART11,
		                         ij, b_alg, a, &a11 );
		bli_acquire_mpart_br2tl( BLIS_SUBPART21,
		                         ij, b_alg, a, &a21 );
		bli_acquire_vpart_b2f( BLIS_SUBPART1,
		                       ij, b_alg, x, &x1 );
		bli_acquire_vpart_b2f( BLIS_SUBPART2,
		                       ij, b_alg, x, &x2 );

		// Initialize objects for packing A11 and x1 (if needed).
		bli_packm_init( &a11, &a11_pack,
		                cntl_sub_packm_a11( cntl ) );
		bli_packv_init( &x1, &x1_pack,
		                cntl_sub_packv_x1( cntl ) );

		// Copy/pack A11, x1 (if needed).
		bli_packm_int( &a11, &a11_pack,
		               cntl_sub_packm_a11( cntl ),
                       &BLIS_PACKM_SINGLE_THREADED );
		bli_packv_int( &x1, &x1_pack,
		               cntl_sub_packv_x1( cntl ) );

		// x2 = x2 + alpha * A21 * x1;
		bli_gemv_int( BLIS_NO_TRANSPOSE,
		              BLIS_NO_CONJUGATE,
	                  alpha,
		              &a21,
		              &x1_pack,
		              &BLIS_ONE,
		              &x2,
		              cntl_sub_gemv_cp( cntl ) );

		// x1 = alpha * tril( A11 ) * x1;
		bli_trmv_int( alpha,
		              &a11_pack,
		              &x1_pack,
		              cntl_sub_trmv( cntl ) );

		// Copy/unpack x1 (if x1 was packed).
		bli_unpackv_int( &x1_pack, &x1,
		                 cntl_sub_unpackv_x1( cntl ) );
	}

	// If any packing buffers were acquired within packm, release them back
	// to the memory manager.
	bli_obj_release_pack( &a11_pack );
	bli_obj_release_pack( &x1_pack );
}
예제 #2
0
void bli_trmv( obj_t*  alpha,
               obj_t*  a,
               obj_t*  x )
{
	trmv_t* trmv_cntl;
	num_t   dt_targ_a;
	num_t   dt_targ_x;
	bool_t  a_is_contig;
	bool_t  x_is_contig;
	obj_t   alpha_local;
	num_t   dt_alpha;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_trmv_check( alpha, a, x );


	// Query the target datatypes of each object.
	dt_targ_a = bli_obj_target_datatype( *a );
	dt_targ_x = bli_obj_target_datatype( *x );

	// Determine whether each operand is stored contiguously.
	a_is_contig = ( bli_obj_is_row_stored( *a ) ||
	                bli_obj_is_col_stored( *a ) );
	x_is_contig = ( bli_obj_vector_inc( *x ) == 1 );


	// Create an object to hold a copy-cast of alpha. Notice that we use
	// the type union of the target datatypes of a and x to prevent any
	// unnecessary loss of information during the computation.
	dt_alpha = bli_datatype_union( dt_targ_a, dt_targ_x );
	bli_obj_init_scalar_copy_of( dt_alpha,
	                             BLIS_NO_CONJUGATE,
	                             alpha,
	                             &alpha_local );

	// If all operands are contiguous, we choose a control tree for calling
	// the unblocked implementation directly without any blocking.
	if ( a_is_contig &&
	     x_is_contig )
	{
		// We use two control trees to handle the four cases corresponding to
		// combinations of transposition and row/column-storage.
		// The row-stored without transpose and column-stored with transpose
		// trees are identical. Same for the remaining two trees.
		if ( bli_obj_has_notrans( *a ) )
		{
			if ( bli_obj_is_row_stored( *a ) ) trmv_cntl = trmv_cntl_bs_ke_nrow_tcol;
			else                               trmv_cntl = trmv_cntl_bs_ke_ncol_trow;
		}
		else // if ( bli_obj_has_trans( *a ) )
		{
			if ( bli_obj_is_row_stored( *a ) ) trmv_cntl = trmv_cntl_bs_ke_ncol_trow;
			else                               trmv_cntl = trmv_cntl_bs_ke_nrow_tcol;
		}
	}
	else
	{
		// Mark objects with unit stride as already being packed. This prevents
		// unnecessary packing from happening within the blocked algorithm.
		if ( a_is_contig ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, *a );
		if ( x_is_contig ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, *x );

		// Here, we make a similar choice as above, except that (1) we look
		// at storage tilt, and (2) we choose a tree that performs blocking.
		if ( bli_obj_has_notrans( *a ) )
		{
			if ( bli_obj_is_row_tilted( *a ) ) trmv_cntl = trmv_cntl_ge_nrow_tcol;
			else                               trmv_cntl = trmv_cntl_ge_ncol_trow;
		}
		else // if ( bli_obj_has_trans( *a ) )
		{
			if ( bli_obj_is_row_tilted( *a ) ) trmv_cntl = trmv_cntl_ge_ncol_trow;
			else                               trmv_cntl = trmv_cntl_ge_nrow_tcol;
		}
	}


	// Invoke the internal back-end with the copy-cast of alpha and the
	// chosen control tree.
	bli_trmv_int( &alpha_local,
	              a,
	              x,
	              trmv_cntl );
}