예제 #1
0
파일: bli_conjmr.c 프로젝트: pgawron/tlash
void bli_cconjmr( uplo_t uplo, int m, int n, scomplex* a, int a_rs, int a_cs )
{
	float   m1 = bli_sm1();
	float*  a_conj;
	int     lda, inca;
	int     n_iter;
	int     n_elem_max;
	int     n_elem;
	int     j;

	// Return early if possible.
	if ( bli_zero_dim2( m, n ) ) return;

	// We initialize for column-major.
	n_iter     = n;
	n_elem_max = m;
	lda        = a_cs;
	inca       = a_rs;

	// An optimization: if A is row-major, then let's access the matrix
	// by rows instead of by columns to increase spatial locality.
	if ( bli_is_row_storage( a_rs, a_cs ) )
	{
		bli_swap_ints( n_iter, n_elem_max );
		bli_swap_ints( lda, inca );
		bli_toggle_uplo( uplo );
	}

	if ( bli_is_upper( uplo ) )
	{
		for ( j = 0; j < n_iter; ++j )
		{
			n_elem = bli_min( j + 1, n_elem_max );
			a_conj = ( float* )( a + j*lda ) + 1;
	
			bli_sscal( n_elem,
			           &m1,
			           a_conj, 2*inca );
		}
	}
	else // if ( bli_is_lower( uplo ) )
	{
		for ( j = 0; j < n_iter; ++j )
		{
			n_elem = bli_max( 0, n_elem_max - j );
			a_conj = ( float* )( a + j*lda + j*inca ) + 1;
	
			if ( n_elem <= 0 ) break;

			bli_sscal( n_elem,
			           &m1,
			           a_conj, 2*inca );
		}
	}
}
예제 #2
0
파일: bli_setmr.c 프로젝트: pgawron/tlash
void bli_dsetmr( uplo_t uplo, int m, int n, double* sigma, double* a, int a_rs, int a_cs )
{
	double*   a_begin;
	int       lda, inca;
	int       n_iter;
	int       n_elem_max;
	int       n_elem;
	int       j;

	// Return early if possible.
	if ( bli_zero_dim2( m, n ) ) return;

	// Initialize with optimal values for column-major storage.
	n_iter     = n;
	n_elem_max = m;
	lda        = a_cs;
	inca       = a_rs;

	// An optimization: if A is row-major, then let's access the matrix by
	// rows instead of by columns to increase spatial locality.
	if ( bli_is_row_storage( a_rs, a_cs ) )
	{
		bli_swap_ints( n_iter, n_elem_max );
		bli_swap_ints( lda, inca );
		bli_toggle_uplo( uplo );
	}
	
	if ( bli_is_upper( uplo ) )
	{
		for ( j = 0; j < n_iter; j++ )
		{
			n_elem  = bli_min( j, n_elem_max );
			a_begin = a + j*lda;

			bli_dsetv( n_elem,
			           sigma,
			           a_begin, inca );
		}
	}
	else // if ( bli_is_lower( uplo ) )
	{
		for ( j = 0; j < n_iter; j++ )
		{
			n_elem  = bli_max( 0, n_elem_max - j - 1 );
			a_begin = a + j*lda + (j + 1)*inca;

			bli_dsetv( n_elem,
			           sigma,
			           a_begin, inca );
		}
	}
}
예제 #3
0
// Code for work assignments
void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
{
    thrinfo_t* thread = (thrinfo_t*) thr;
    dim_t n_way = thread->n_way;
    dim_t work_id = thread->work_id;

    dim_t size = all_end - all_start;
    dim_t n_pt = size / n_way;
    n_pt = (n_pt * n_way < size) ? n_pt + 1 : n_pt;
    n_pt = (n_pt % block_factor == 0) ? n_pt : n_pt + block_factor - (n_pt % block_factor); 
    *start = work_id * n_pt + all_start;
    *end   = bli_min( *start + n_pt, size + all_start );
}
예제 #4
0
void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end)
{
    thrinfo_t* thread = (thrinfo_t*) thr;
    dim_t n_way = thread->n_way;
    dim_t work_id = thread->work_id;
    dim_t size = all_end - all_start;

    *start = 0;
    *end   = all_end - all_start;
    double num = size*size / (double) n_way;

    if( forward ) {
        dim_t curr_caucus = n_way - 1;
        dim_t len = 0;
        while(1){
            dim_t width = ceil(sqrt( len*len + num )) - len; // The width of the current caucus
            width = (width % block_factor == 0) ? width : width + block_factor - (width % block_factor);
            if( curr_caucus == work_id ) {
                *start = bli_max( 0 , *end - width ) + all_start;
                *end = *end + all_start;
                return;
            }
            else{
                *end -= width;
                len += width;
                curr_caucus--;
            }
        }
    }
    else{
        while(1){
            dim_t width = ceil(sqrt(*start * *start + num)) - *start;
            width = (width % block_factor == 0) ? width : width + block_factor - (width % block_factor);

            if( work_id == 0 ) {
                *start = *start + all_start;
                *end = bli_min( *start + width, all_end );
                return;
            }
            else{
                *start = *start + width;
            }
            work_id--;
        }
    }
}
예제 #5
0
파일: test_gemv.c 프로젝트: jmhautbois/blis
void libblis_test_gemv_check( obj_t*  kappa,
                              obj_t*  alpha,
                              obj_t*  a,
                              obj_t*  x,
                              obj_t*  beta,
                              obj_t*  y,
                              obj_t*  y_orig,
                              double* resid )
{
	num_t  dt      = bli_obj_datatype( *y );
	num_t  dt_real = bli_obj_datatype_proj_to_real( *y );

	conj_t conja   = bli_obj_conj_status( *a );

	dim_t  n_x     = bli_obj_vector_dim( *x );
	dim_t  m_y     = bli_obj_vector_dim( *y );

	dim_t  min_m_n = bli_min( m_y, n_x );

	obj_t  x_temp, y_temp;
	obj_t  kappac, norm;
	obj_t  xT_temp, yT_temp, yT;

	double junk;

	//
	// Pre-conditions:
	// - a is initialized to kappa along the diagonal.
	// - x is randomized.
	// - y_orig is randomized.
	// Note:
	// - alpha, beta, and kappa should have non-zero imaginary components in
	//   the complex cases in order to more fully exercise the implementation.
	//
	// Under these conditions, we assume that the implementation for
	//
	//   y := beta * y_orig + alpha * transa(A) * conjx(x)
	//
	// is functioning correctly if
	//
	//   normf( y - z )
	//
	// is negligible, where
	//
	//   z = beta * y_orig + alpha * conja(kappa) * x
	//

	bli_obj_scalar_init_detached_copy_of( dt, conja, kappa, &kappac );
	bli_obj_scalar_init_detached( dt_real, &norm );

	bli_obj_create( dt, n_x, 1, 0, 0, &x_temp );
	bli_obj_create( dt, m_y, 1, 0, 0, &y_temp );

	bli_copyv( x,      &x_temp );
	bli_copyv( y_orig, &y_temp );

	bli_acquire_vpart_f2b( BLIS_SUBPART1, 0, min_m_n, 
	                       &x_temp, &xT_temp );
	bli_acquire_vpart_f2b( BLIS_SUBPART1, 0, min_m_n, 
	                       &y_temp, &yT_temp );
	bli_acquire_vpart_f2b( BLIS_SUBPART1, 0, min_m_n, 
	                       y, &yT );

	bli_scalv( &kappac, &xT_temp );
	bli_scalv( beta, &yT_temp );
	bli_axpyv( alpha, &xT_temp, &yT_temp );

	bli_subv( &yT_temp, &yT );
	bli_normfv( &yT, &norm );
	bli_getsc( &norm, resid, &junk );

	bli_obj_free( &x_temp );
	bli_obj_free( &y_temp );
}
예제 #6
0
void bli_gemm_blk_var2( obj_t*  alpha,
                        obj_t*  a,
                        obj_t*  b,
                        obj_t*  beta,
                        obj_t*  c,
                        gemm_t* cntl )
{
	obj_t a_pack_s;
	obj_t b1_pack_s;
	obj_t c1_pack_s;

    obj_t  b1, c1;
    obj_t* a_pack   = NULL;
    obj_t* b1_pack  = NULL;
    obj_t* c1_pack  = NULL;

	dim_t i;
	dim_t b_alg;
	dim_t n_trans;
    
    dim_t num_groups = bli_gemm_num_thread_groups( cntl->thread_info );
    dim_t group_id   = bli_gemm_group_id( cntl->thread_info );

    if( bli_gemm_am_a_master( cntl->thread_info ) ) {
        // Initialize object for packing A.
        bli_obj_init_pack( &a_pack_s );
        bli_packm_init( a, &a_pack_s,
                        cntl_sub_packm_a( cntl ) );

    }
    a_pack = bli_gemm_broadcast_a( cntl->thread_info, &a_pack_s );

	// Pack A and scale by alpha (if instructed).
	bli_packm_int( alpha,
	               a, a_pack,
	               cntl_sub_packm_a( cntl ) );

    bli_gemm_a_barrier( cntl->thread_info );

	if( bli_gemm_am_b_master( cntl->thread_info )) {
        bli_obj_init_pack( &b1_pack_s );
    }
    b1_pack  = bli_gemm_broadcast_b( cntl->thread_info, &b1_pack_s );

    if( bli_gemm_am_c_master( cntl->thread_info )) {
        bli_obj_init_pack( &c1_pack_s );

        // Scale C by beta (if instructed).
        bli_scalm_int( beta,
                       c,
                       cntl_sub_scalm( cntl ) );
    }
    c1_pack  = bli_gemm_broadcast_c( cntl->thread_info, &c1_pack_s );


	// Query dimension in partitioning direction.
	n_trans = bli_obj_width_after_trans( *b );
    dim_t n_pt = n_trans / num_groups;
    n_pt = (n_pt * num_groups < n_trans) ? n_pt + 1 : n_pt;
    n_pt = (n_pt % 8 == 0) ? n_pt : n_pt + 8 - (n_pt % 8);
    dim_t start = group_id * n_pt;
    dim_t end = bli_min( start + n_pt, n_trans );

	// Partition along the n dimension.
	for ( i = start; i < end; i += b_alg )
	{
		// Determine the current algorithmic blocksize.
		// NOTE: Use of b (for execution datatype) is intentional!
		// This causes the right blocksize to be used if c and a are
		// complex and b is real.
		b_alg = bli_determine_blocksize_f( i, end, b,
		                                   cntl_blocksize( cntl ) );

        // Acquire partitions for C1 
        bli_acquire_mpart_l2r( BLIS_SUBPART1,
                               i, b_alg, c, &c1 );
        // Acquire partitions for B1 
        bli_acquire_mpart_l2r( BLIS_SUBPART1,
                               i, b_alg, b, &b1 );

        if( bli_gemm_am_b_master( cntl->thread_info )) {
            // Initialize objects for packing B1 
            bli_packm_init( &b1, &b1_pack_s,
                            cntl_sub_packm_b( cntl ) );
        }

        if( bli_gemm_am_c_master( cntl->thread_info )) {
            // Initialize objects for packing C1 
            bli_packm_init( &c1, &c1_pack_s,
                            cntl_sub_packm_c( cntl ) );
        }

        bli_gemm_b_barrier( cntl->thread_info );
        bli_gemm_c_barrier( cntl->thread_info );
        
		// Pack B1 and scale by alpha (if instructed).
		bli_packm_int( alpha,
		               &b1, b1_pack,
		               cntl_sub_packm_b( cntl ) );

		// Pack C1 and scale by beta (if instructed).
		bli_packm_int( beta,
		               &c1, c1_pack,
		               cntl_sub_packm_c( cntl ) );

        // Packing must be done before computation
        bli_gemm_b_barrier( cntl->thread_info );
        bli_gemm_c_barrier( cntl->thread_info );

		// Perform gemm subproblem.
		bli_gemm_int( alpha,
		              a_pack,
		              b1_pack,
		              beta,
		              c1_pack,
		              cntl_sub_gemm( cntl ) );

		// Unpack C1 (if C1 was packed).
		bli_unpackm_int( c1_pack, &c1,
		                 cntl_sub_unpackm_c( cntl ) );
	}

	// If any packing buffers were acquired within packm, release them back
	// to the memory manager.
    bli_gemm_a_barrier( cntl->thread_info );
    if( bli_gemm_am_a_master( cntl->thread_info ))
	    bli_obj_release_pack( &a_pack_s );
    bli_gemm_b_barrier( cntl->thread_info );
    if( bli_gemm_am_b_master( cntl->thread_info )) {
        bli_obj_release_pack( &b1_pack_s );
    }
    bli_gemm_c_barrier( cntl->thread_info );
    if( bli_gemm_am_c_master( cntl->thread_info )) {
        bli_obj_release_pack( &c1_pack_s );
    }
}