Пример #1
0
// Allocates the element buffer for an object whose base structure already
// exists, and records the allocation details in that base structure.
//
//   rs, cs - requested row and column strides; both may be modified by
//            FLA_adjust_strides() and FLA_compute_num_elem() before they
//            are stored in the base object.
//   obj    - object whose length, width and element size determine the
//            allocation; obj->base must already be valid.
//
// Always returns FLA_SUCCESS.
FLA_Error FLA_Obj_create_buffer( dim_t rs, dim_t cs, FLA_Obj *obj )
{
  const dim_t n_rows = FLA_Obj_length( *obj );
  const dim_t n_cols = FLA_Obj_width( *obj );
  size_t      elem_count;
  size_t      n_bytes;
  void*       storage;

  // Let the library fix up the requested strides first, so that the
  // parameter check below sees the values that will actually be used.
  FLA_adjust_strides( n_rows, n_cols, &rs, &cs );

  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
    FLA_Obj_create_buffer_check( rs, cs, obj );

  // Element count for the buffer; the strides may be adjusted again here.
  elem_count = FLA_compute_num_elem( FLA_Obj_elem_size( *obj ),
                                     n_rows, n_cols, &rs, &cs );

  // Size of the buffer in bytes.
  n_bytes = ( size_t ) elem_count *
            ( size_t ) FLA_Obj_elem_size( *obj );

  // Obtain the storage for the elements.
#ifdef FLA_ENABLE_SCC
  if ( FLA_Obj_elemtype( *obj ) == FLA_MATRIX )
    storage = FLA_malloc( n_bytes );
  else
    storage = FLA_shmalloc( n_bytes );
#else
  storage = FLA_malloc( n_bytes );
#endif

  // Record the buffer, the element count (for use with FLASH), and the
  // strides that were used for the allocation.
  obj->base->buffer       = storage;
  obj->base->buffer_info  = 0;
  obj->base->n_elem_alloc = elem_count;
  obj->base->rs           = rs;
  obj->base->cs           = cs;

  return FLA_SUCCESS;
}
Пример #2
0
/*
 * Allocates a buffer of doubles with one entry per element of a tensor whose
 * dimensions are size[0..order-1], and fills it with values derived from
 * rand() by the formula rand() / (RAND_MAX / 2) - 1.
 *
 * The buffer comes from FLA_malloc() and is returned as void*; the caller
 * takes ownership of it.
 */
void* genRandomData(dim_t order, dim_t size[]){
	const double half_range = (double)RAND_MAX / 2.0F;
	dim_t        total = 1;
	dim_t        mode;
	dim_t        k;
	double*      values;

	/* Total element count is the product of all dimensions. */
	for(mode = 0; mode < order; mode++){
		total *= size[mode];
	}

	values = (double*)FLA_malloc(total * sizeof(double));

	for(k = 0; k < total; k++){
		values[k] = ((double) rand() / half_range) - 1.0F;
	}

	return (void*)values;
}
Пример #3
0
/*
 * Allocates a buffer of doubles with one entry per element of a tensor whose
 * dimensions are size[0..order-1], and fills it with 0.0, 1.0, 2.0, ... in
 * storage order.
 *
 * The buffer comes from FLA_malloc() and is returned as void*; the caller
 * takes ownership of it.
 */
void* genSequentialData(dim_t order, dim_t size[]){
	dim_t   total = 1;
	dim_t   mode;
	dim_t   k;
	double* values;

	/* Total element count is the product of all dimensions. */
	for(mode = 0; mode < order; mode++){
		total *= size[mode];
	}

	values = (double*)FLA_malloc(total * sizeof(double));

	/* Each slot simply holds its own linear index. */
	for(k = 0; k < total; k++){
		values[k] = (double)k;
	}

	return (void*)values;
}
Пример #4
0
// Resizes a previously allocated buffer.
//
//   size == 0        : old_ptr (if non-NULL) is released with FLA_free() and
//                      NULL is returned.
//   old_ptr == NULL  : behaves like a first-time FLA_malloc( size ).
//   otherwise        : the request is forwarded to the system realloc(), and
//                      the result is validated when error checking is at
//                      least FLA_MIN_ERROR_CHECKING.
//
// Returns NULL, or the pointer produced by FLA_malloc() / realloc().
void* FLA_realloc( void* old_ptr, size_t size )
{
  FLA_Error status;
  void*     resized;

  // Nothing can be allocated for a zero-byte request; just release any
  // existing buffer and hand back NULL.
  if ( size == 0 )
  {
    if ( old_ptr != NULL )
      FLA_free( old_ptr );

    return NULL;
  }

  // No existing buffer: this is an ordinary allocation.
  if ( old_ptr == NULL )
    return FLA_malloc( size );

  // Here size is non-zero and old_ptr is valid. The system realloc() does
  // not guarantee arbitrarily aligned addresses, but the size of the
  // original buffer is unknown, so the contents cannot be copied into a
  // fresh aligned allocation by hand. The system implementation is
  // therefore the only option.
  resized = realloc( old_ptr, size );

  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
  {
    status = FLA_Check_malloc_pointer( resized );
    FLA_Check_error_code( status );
  }

  return resized;
}
Пример #5
0
// Creates an m-by-n object of the given datatype that has a freshly
// allocated base structure but no element buffer yet (buffer == NULL,
// strides == 0, n_elem_alloc == 0).
//
// Always returns FLA_SUCCESS.
FLA_Error FLA_Obj_create_without_buffer( FLA_Datatype datatype, dim_t m, dim_t n, FLA_Obj *obj )
{
  FLA_Base_obj* base;

  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
    FLA_Obj_create_without_buffer_check( datatype, m, n, obj );

  // View fields: the view covers the whole object, with no offset.
  obj->offm    = 0;
  obj->offn    = 0;
  obj->m       = m;
  obj->n       = n;
  obj->m_inner = m;
  obj->n_inner = n;

  // Allocate the base structure and attach it to the view.
  base      = ( FLA_Base_obj * ) FLA_malloc( sizeof( *base ) );
  obj->base = base;

  // Type information and dimensions.
  base->datatype = datatype;
  base->elemtype = FLA_SCALAR;
  base->m        = m;
  base->n        = n;
  base->m_inner  = m;
  base->n_inner  = n;

  // The base object's own address doubles as its identifier.
  base->id       = ( unsigned long ) base;
  base->m_index  = 0;
  base->n_index  = 0;

  // Strides are not meaningful until a buffer exists; store zeros.
  base->rs = 0;
  base->cs = 0;

  // No element buffer has been attached yet.
  base->buffer       = NULL;
  base->buffer_info  = 0;
  base->n_elem_alloc = 0;

#ifdef FLA_ENABLE_SUPERMATRIX
  // SuperMatrix bookkeeping starts out empty.
  base->n_read_tasks   = 0;
  base->read_task_head = NULL;
  base->read_task_tail = NULL;
  base->write_task     = NULL;
#endif

  return FLA_SUCCESS;
}
Пример #6
0
/*
 * Allocates a gemm control node with FLA_malloc() and stores the supplied
 * matrix type, variant, blocksize and sub-node pointers in it. The pointer
 * arguments are stored as-is (not copied). Returns the new node.
 */
fla_gemm_t* FLA_Cntl_gemm_obj_create( FLA_Matrix_type  matrix_type,
                                      int              variant,
                                      fla_blocksize_t* blocksize,
                                      fla_scal_t*      sub_scal,
                                      fla_gemm_t*      sub_gemm )
{
	fla_gemm_t* node = ( fla_gemm_t* ) FLA_malloc( sizeof( *node ) );

	node->matrix_type = matrix_type;
	node->variant     = variant;
	node->blocksize   = blocksize;

	/* Sub-nodes. */
	node->sub_scal    = sub_scal;
	node->sub_gemm    = sub_gemm;

	return node;
}
Пример #7
0
/*
 * Allocates a trsv control node with FLA_malloc() and stores the supplied
 * matrix type, variant, blocksize and sub-node pointers in it. The pointer
 * arguments are stored as-is (not copied). Returns the new node.
 */
fla_trsv_t* FLA_Cntl_trsv_obj_create( FLA_Matrix_type  matrix_type,
                                      int              variant,
                                      fla_blocksize_t* blocksize, 
                                      fla_trsv_t*      sub_trsv,
                                      fla_gemv_t*      sub_gemv )
{
	fla_trsv_t* node = ( fla_trsv_t* ) FLA_malloc( sizeof( *node ) );

	node->matrix_type = matrix_type;
	node->variant     = variant;
	node->blocksize   = blocksize;

	/* Sub-nodes. */
	node->sub_trsv    = sub_trsv;
	node->sub_gemv    = sub_gemv;

	return node;
}
Пример #8
0
/*
 * Builds a zero-filled FLA_DOUBLE tensor object.
 *
 *   order - number of modes; stride[] has room for FLA_MAX_ORDER entries.
 *   size  - extent of each mode, size[0..order-1].
 *   obj   - object to initialise; on return it has the zeroed buffer attached.
 *
 * Strides are stride[0] = 1 and stride[i] = stride[i-1] * size[i-1].
 */
void initObjZero_ttm(dim_t order, dim_t size[], FLA_Obj* obj){
  dim_t   stride[FLA_MAX_ORDER];
  dim_t   total = 1;
  dim_t   mode;
  size_t  n_bytes;
  double* zeros;

  /* One pass computes both the strides and the total element count. */
  stride[0] = 1;
  for(mode = 0; mode < order; mode++){
    if(mode > 0)
      stride[mode] = stride[mode-1] * size[mode-1];
    total *= size[mode];
  }

  /* Allocate the storage and clear it. */
  n_bytes = total * sizeof(double);
  zeros   = (double*)FLA_malloc(n_bytes);
  memset(zeros, 0, n_bytes);

  /* Create the tensor shell, then hand it the zeroed buffer. */
  FLA_Obj_create_tensor_without_buffer(FLA_DOUBLE, order, size, obj);
  FLA_Obj_attach_buffer_to_tensor(zeros, order, stride, obj);
}
Пример #9
0
/*
 * Allocates a syr2k control node with FLA_malloc() and stores the supplied
 * matrix type, variant, blocksize and sub-node pointers in it. The pointer
 * arguments are stored as-is (not copied). Returns the new node.
 */
fla_syr2k_t* FLA_Cntl_syr2k_obj_create( FLA_Matrix_type  matrix_type,
                                        int              variant,
                                        fla_blocksize_t* blocksize,
                                        fla_scalr_t*     sub_scalr,
                                        fla_syr2k_t*     sub_syr2k,
                                        fla_gemm_t*      sub_gemm1,
                                        fla_gemm_t*      sub_gemm2 )
{
	fla_syr2k_t* node = ( fla_syr2k_t* ) FLA_malloc( sizeof( *node ) );

	node->matrix_type = matrix_type;
	node->variant     = variant;
	node->blocksize   = blocksize;

	/* Sub-nodes. */
	node->sub_scalr   = sub_scalr;
	node->sub_syr2k   = sub_syr2k;
	node->sub_gemm1   = sub_gemm1;
	node->sub_gemm2   = sub_gemm2;

	return node;
}
Пример #10
0
/*
 * Benchmark driver for FLASH_LU_piv().
 *
 * Reads the run parameters from stdin (repeat count, block size, problem
 * size range, thread count, SuperMatrix settings). For every problem size it
 * factors a random matrix n_repeats times, keeps the fastest time, and then
 * reports the GFlops figure and the norm of A*x - b for the solution x that
 * was obtained from the last repeat.
 *
 * Results go to stdout and, on non-Windows builds, are also appended to
 * OUTPUT_PATH/OUTPUT_FILE_output.m.
 *
 * Returns 0 on success, 1 if the input is missing/invalid or the output
 * file cannot be opened.
 */
int main( int argc, char *argv[] ) 
{ 
   int
      i, j,
      n_threads,
      n_repeats,
      n_trials,
      increment,
      begin,
      sorting,
      caching,
      work_stealing,
      data_affinity;

   // The block size is scanned into a plain unsigned int so that "%u"
   // always matches the pointed-to type, whatever dim_t is defined as.
   unsigned int
      nb_alg_in;

   dim_t
      size,
      nb_alg;

   FLA_Datatype
      datatype = FLA_DOUBLE;

   FLA_Obj 
      A, x, b, b_norm,
      AH, pH, bH;
   
   double 
      b_norm_value,
      dtime, 
      *dtimes,
      *flops;

#ifndef FLA_ENABLE_WINDOWS_BUILD
   char
      output_file_m[100];

   int
      path_len;
   
   FILE
      *fpp;
#endif

   ( void ) argc;
   ( void ) argv;

   fprintf( stdout, "%c Enter number of repeats: ", '%' );
   if ( scanf( "%d", &n_repeats ) != 1 || n_repeats < 1 )
   {
      // dtimes[0] is read unconditionally below, so at least one repeat
      // is required.
      fprintf( stderr, "Error: number of repeats must be a positive integer.\n" );
      return 1;
   }
   fprintf( stdout, "%c %d\n", '%', n_repeats );

   fprintf( stdout, "%c Enter blocksize: ", '%' );
   if ( scanf( "%u", &nb_alg_in ) != 1 )
   {
      fprintf( stderr, "Error: could not read blocksize.\n" );
      return 1;
   }
   nb_alg = nb_alg_in;
   fprintf( stdout, "%c %lu\n", '%', ( unsigned long ) nb_alg );

   fprintf( stdout, "%c Enter problem size parameters: first, inc, num: ", '%' );
   if ( scanf( "%d%d%d", &begin, &increment, &n_trials ) != 3 || n_trials < 1 )
   {
      fprintf( stderr, "Error: could not read problem size parameters (num must be positive).\n" );
      return 1;
   }
   fprintf( stdout, "%c %d %d %d\n", '%', begin, increment, n_trials );

   fprintf( stdout, "%c Enter number of threads: ", '%' );
   if ( scanf( "%d", &n_threads ) != 1 )
   {
      fprintf( stderr, "Error: could not read number of threads.\n" );
      return 1;
   }
   fprintf( stdout, "%c %d\n", '%', n_threads );

   fprintf( stdout, "%c Enter SuperMatrix parameters: sorting, caching, work stealing, data affinity: ", '%' );
   if ( scanf( "%d%d%d%d", &sorting, &caching, &work_stealing, &data_affinity ) != 4 )
   {
      fprintf( stderr, "Error: could not read SuperMatrix parameters.\n" );
      return 1;
   }
   fprintf( stdout, "%c %s %s %s %s\n\n", '%', ( sorting ? "TRUE" : "FALSE" ), ( caching ? "TRUE" : "FALSE" ), ( work_stealing ? "TRUE" : "FALSE" ), ( data_affinity ? ( data_affinity == 1 ? "FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC" : "FLASH_QUEUE_AFFINITY_OTHER" ) : "FLASH_QUEUE_AFFINITY_NONE" ) );

#ifdef FLA_ENABLE_WINDOWS_BUILD
   fprintf( stdout, "%s_%lu = [\n", OUTPUT_FILE, ( unsigned long ) nb_alg );
#else
   // Build the output path with a bounded write and refuse to continue if
   // it would not fit in the buffer.
   path_len = snprintf( output_file_m, sizeof( output_file_m ),
                        "%s/%s_output.m", OUTPUT_PATH, OUTPUT_FILE );
   if ( path_len < 0 || ( size_t ) path_len >= sizeof( output_file_m ) )
   {
      fprintf( stderr, "Error: output file path is too long.\n" );
      return 1;
   }

   fpp = fopen( output_file_m, "a" );
   if ( fpp == NULL )
   {
      perror( output_file_m );
      return 1;
   }

   fprintf( fpp, "%%\n" );
   fprintf( fpp, "%% | Matrix Size |    FLASH    |\n" );
   fprintf( fpp, "%% |    n x n    |    GFlops   |\n" );
   fprintf( fpp, "%% -----------------------------\n" );
   fprintf( fpp, "%s_%lu = [\n", OUTPUT_FILE, ( unsigned long ) nb_alg );
#endif

   FLA_Init();

   dtimes = ( double * ) FLA_malloc( n_repeats * sizeof( double ) );
   flops  = ( double * ) FLA_malloc( n_trials  * sizeof( double ) );
   
   FLASH_Queue_set_num_threads( n_threads );
   FLASH_Queue_set_sorting( sorting );
   FLASH_Queue_set_caching( caching );
   FLASH_Queue_set_work_stealing( work_stealing );
   FLASH_Queue_set_data_affinity( data_affinity );

   for ( i = 0; i < n_trials; i++ )
   {
      size = begin + i * increment;
      
      FLA_Obj_create( datatype, size, size, 0, 0, &A );
      FLA_Obj_create( datatype, size, 1,    0, 0, &x );
      FLA_Obj_create( datatype, size, 1,    0, 0, &b );
      FLA_Obj_create( datatype, 1,    1,    0, 0, &b_norm );

      for ( j = 0; j < n_repeats; j++ )
      {
         FLA_Random_matrix( A );
         FLA_Random_matrix( b );

         FLASH_Obj_create_hier_copy_of_flat( A, 1, &nb_alg, &AH );
         FLASH_Obj_create( FLA_INT,    size, 1, 1, &nb_alg, &pH );
         FLASH_Obj_create_hier_copy_of_flat( b, 1, &nb_alg, &bH );

         // Only the factorization itself is timed.
         dtime = FLA_Clock();

         FLASH_LU_piv( AH, pH );

         dtime = FLA_Clock() - dtime;
         dtimes[j] = dtime;

         // Solve with the factors so the result can be verified below.
         FLASH_Apply_pivots( FLA_LEFT, FLA_NO_TRANSPOSE, pH, bH );
         FLASH_Trsv( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_UNIT_DIAG, 
                     AH, bH );
         FLASH_Trsv( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
                     AH, bH );

         FLASH_Obj_free( &AH );
         FLASH_Obj_free( &pH );

         FLASH_Obj_flatten( bH, x );
         FLASH_Obj_free( &bH );
      }
      
      // Keep the fastest of the repeats.
      dtime = dtimes[0];
      for ( j = 1; j < n_repeats; j++ )
         dtime = min( dtime, dtimes[j] );
      flops[i] = 2.0 / 3.0 * size * size * size / dtime / 1e9;

      // Residual check: b := A*x - b, then take its 2-norm.
      FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, 
                         A, x, FLA_MINUS_ONE, b );
      FLA_Nrm2_external( b, b_norm );
      FLA_Obj_extract_real_scalar( b_norm, &b_norm_value );

#ifdef FLA_ENABLE_WINDOWS_BUILD
      fprintf( stdout, "   %lu   %6.3f   %le\n", ( unsigned long ) size, flops[i], b_norm_value );
#else      
      fprintf( fpp, "   %lu   %6.3f\n", ( unsigned long ) size, flops[i] );

      fprintf( stdout, "Time: %e  |  GFlops: %6.3f\n", dtime, flops[i] );
      fprintf( stdout, "Matrix size: %lu x %lu  |  nb_alg: %lu\n", 
               ( unsigned long ) size, ( unsigned long ) size,
               ( unsigned long ) nb_alg );
      fprintf( stdout, "Norm of difference: %le\n\n", b_norm_value );
#endif

      FLA_Obj_free( &A );
      FLA_Obj_free( &x );
      FLA_Obj_free( &b );
      FLA_Obj_free( &b_norm );
   }

#ifdef FLA_ENABLE_WINDOWS_BUILD
   fprintf( stdout, "];\n\n" );
#else
   fprintf( fpp, "];\n" );
   
   fflush( fpp );
   if ( fclose( fpp ) != 0 )
      perror( output_file_m );
#endif

   FLA_free( dtimes );
   FLA_free( flops );

   FLA_Finalize(); 
   
   return 0; 
}
// FLA_Tridiag_UT_l_step_ops_var2
//
// Single-precision (float) kernel that works directly on raw buffers and
// strides instead of FLA_Obj objects; the commented-out FLA_* calls next to
// each operation show the object-based call it stands in for.
// NOTE(review): judging by the name, this is presumably variant 2 of one
// step of a lower-triangular tridiagonal reduction using UT transforms --
// confirm against the object-based implementation.
//
// Parameters:
//   m_A                - dimension of A; sizes the temporary vectors and
//                        defines m_ahead = m_A - i - 1 in iteration i.
//   m_T                - number of iterations of the main loop (b_alg).
//   buff_A, rs_A, cs_A - base address and row/column strides of A. A is
//                        addressed as buff_A + col*cs_A + row*rs_A and is
//                        written to in place.
//   buff_T, rs_T, cs_T - base address and row/column strides of T, addressed
//                        the same way.
//
// Three temporary vectors of m_A floats (u, z, w), each with unit stride,
// are allocated with FLA_malloc() on entry and released with FLA_free()
// before returning.
//
// Returns FLA_SUCCESS unconditionally.
FLA_Error FLA_Tridiag_UT_l_step_ops_var2( int m_A,
                                          int m_T,
                                          float* buff_A, int rs_A, int cs_A, 
                                          float* buff_T, int rs_T, int cs_T )
{
  // Pointers to the library's constant scalars 2, 1, 0 and -1.
  float*    buff_2  = FLA_FLOAT_PTR( FLA_TWO );
  float*    buff_1  = FLA_FLOAT_PTR( FLA_ONE );
  float*    buff_0  = FLA_FLOAT_PTR( FLA_ZERO );
  float*    buff_m1 = FLA_FLOAT_PTR( FLA_MINUS_ONE );

  float     first_elem;
  float     beta;
  float     inv_tau11;
  float     minus_inv_tau11;
  float     minus_upsilon11, minus_conj_upsilon11;
  float     minus_zeta11, minus_conj_zeta11;
  int       i;

  // b_alg = FLA_Obj_length( T );
  int       b_alg = m_T;

  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &u );
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z );
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &w );
  float*    buff_u = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  float*    buff_z = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  float*    buff_w = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  int       inc_u  = 1;
  int       inc_z  = 1;
  int       inc_w  = 1;

  // Initialize some variables (only to prevent compiler warnings).
  first_elem      = *buff_0;
  minus_inv_tau11 = *buff_0;

  for ( i = 0; i < b_alg; ++i )
  {
    // Sub-partitions of A for iteration i: alpha11 is the diagonal element
    // (i,i), a21 starts just below it in column i, A20 starts at row i+1 of
    // column 0, and A22 starts at (i+1,i+1).
    float*    A20      = buff_A + (0  )*cs_A + (i+1)*rs_A;
    float*    alpha11  = buff_A + (i  )*cs_A + (i  )*rs_A;
    float*    a21      = buff_A + (i  )*cs_A + (i+1)*rs_A;
    float*    A22      = buff_A + (i+1)*cs_A + (i+1)*rs_A;

    // Column i of T: t01 starts at row 0, tau11 is the diagonal element.
    float*    t01      = buff_T + (i  )*cs_T + (0  )*rs_T;
    float*    tau11    = buff_T + (i  )*cs_T + (i  )*rs_T;

    // Element i and the part starting at element i+1 of the work vectors.
    float*    upsilon11= buff_u + (i  )*inc_u;
    float*    u21      = buff_u + (i+1)*inc_u;

    float*    zeta11   = buff_z + (i  )*inc_z;
    float*    z21      = buff_z + (i+1)*inc_z;

    float*    w21      = buff_w + (i+1)*inc_w;

    // a21_t is the first element of a21; a21_b starts one row further down.
    float*    a21_t    = a21    + (0  )*cs_A + (0  )*rs_A;
    float*    a21_b    = a21    + (0  )*cs_A + (1  )*rs_A;

    int       m_ahead  = m_A - i - 1;
    int       m_behind = i;
    int       n_behind = i;

    /*------------------------------------------------------------*/

    // Skipped in the first iteration (i == 0): upsilon11/zeta11 and u21/z21
    // hold the values stored by the previous iteration.
    if ( m_behind > 0 )
    {
      // FLA_Copy( upsilon11, minus_upsilon11 );
      // FLA_Scal( FLA_MINUS_ONE, minus_upsilon11 );
      // FLA_Copy( minus_upsilon11, minus_conj_upsilon11 );
      bl1_smult3( buff_m1, upsilon11, &minus_upsilon11 );
      bl1_scopyconj( &minus_upsilon11, &minus_conj_upsilon11 );

      // FLA_Copy( zeta11, minus_zeta11 );
      // FLA_Scal( FLA_MINUS_ONE, minus_zeta11 );
      // FLA_Copy( minus_zeta11, minus_conj_zeta11 );
      bl1_smult3( buff_m1, zeta11, &minus_zeta11 );
      bl1_scopyconj( &minus_zeta11, &minus_conj_zeta11 );

      // FLA_Axpyt( FLA_CONJ_NO_TRANSPOSE, minus_upsilon11, zeta11,    alpha11 );
      // FLA_Axpyt( FLA_CONJ_NO_TRANSPOSE, minus_zeta11,    upsilon11, alpha11 );
      bl1_saxpyv( BLIS1_CONJUGATE,
                  1,
                  &minus_upsilon11,
                  zeta11,  1,
                  alpha11, 1 );
      bl1_saxpyv( BLIS1_CONJUGATE,
                  1,
                  &minus_zeta11,
                  upsilon11, 1,
                  alpha11,  1 );

      // FLA_Axpyt( FLA_NO_TRANSPOSE, minus_conj_zeta11,    u21, a21 );
      // FLA_Axpyt( FLA_NO_TRANSPOSE, minus_conj_upsilon11, z21, a21 );
      bl1_saxpyv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &minus_conj_zeta11,
                  u21, inc_u,
                  a21, rs_A );
      bl1_saxpyv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &minus_conj_upsilon11,
                  z21, inc_z,
                  a21, rs_A );
    }

    if ( m_ahead > 0 )
    {
      // FLA_Househ2_UT( FLA_LEFT,
      //                 a21_t,
      //                 a21_b, tau11 );
      FLA_Househ2_UT_l_ops( m_ahead - 1,
                            a21_t,
                            a21_b, rs_A,
                            tau11 );

      // FLA_Set( FLA_ONE, inv_tau11 );
      // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 );
      // FLA_Copy( inv_tau11, minus_inv_tau11 );
      // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 );
      bl1_sdiv3( buff_1, tau11, &inv_tau11 );
      bl1_sneg2( &inv_tau11, &minus_inv_tau11 );

      // The first element of a21 is saved and temporarily replaced by 1;
      // it is restored at the end of the second m_ahead > 0 block below.
      // FLA_Copy( a21_t, first_elem );
      // FLA_Set( FLA_ONE, a21_t );
      first_elem = *a21_t;
      *a21_t = *buff_1;
    }

    if ( m_behind > 0 )
    {
      // FLA_Her2( FLA_LOWER_TRIANGULAR, FLA_MINUS_ONE, u21, z21, A22 );
      bl1_ssyr2( BLIS1_LOWER_TRIANGULAR,
                 m_ahead,
                 buff_m1,
                 u21, inc_u,
                 z21, inc_z,
                 A22, rs_A, cs_A );
    }

    if ( m_ahead > 0 )
    {
      // FLA_Hemv( FLA_LOWER_TRIANGULAR, FLA_ONE, A22, a21, FLA_ZERO, w21 );
      bl1_ssymv( BLIS1_LOWER_TRIANGULAR,
                 m_ahead,
                 buff_1,
                 A22, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 w21, inc_w );

      // FLA_Copy( a21, u21 );
      // FLA_Copy( w21, z21 );
      bl1_scopyv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  a21, rs_A,
                  u21, inc_u );
      bl1_scopyv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  w21, inc_w,
                  z21, inc_z );

      // FLA_Dotc( FLA_CONJUGATE, a21, z21, beta );
      // FLA_Inv_scal( FLA_TWO, beta );
      bl1_sdot( BLIS1_CONJUGATE,
                m_ahead,
                a21, rs_A,
                z21, inc_z,
                &beta );
      bl1_sinvscals( buff_2, &beta );

      // FLA_Scal( minus_inv_tau11, beta );
      // FLA_Axpy( beta, a21, z21 );
      // FLA_Scal( inv_tau11, z21 );
      bl1_sscals( &minus_inv_tau11, &beta );
      bl1_saxpyv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &beta,
                  a21, rs_A,
                  z21, inc_z );
      bl1_sscalv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &inv_tau11,
                  z21, inc_z );

      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 );
      bl1_sgemv( BLIS1_CONJ_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_ahead,
                 n_behind,
                 buff_1,
                 A20, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 t01, rs_T );

      // FLA_Copy( first_elem, a21_t );
      *a21_t = first_elem;
    }

    // Final iteration only (i == b_alg - 1): there is no following iteration
    // to apply the rank-2 update with the u21/z21 just computed, so it is
    // applied here.
    if ( m_behind + 1 == b_alg && m_ahead > 0 )
    {
      // FLA_Her2( FLA_LOWER_TRIANGULAR, FLA_MINUS_ONE, u21, z21, A22 );
      bl1_ssyr2( BLIS1_LOWER_TRIANGULAR,
                 m_ahead,
                 buff_m1,
                 u21, inc_u,
                 z21, inc_z,
                 A22, rs_A, cs_A );
    }

    /*------------------------------------------------------------*/

  }

  // FLA_Obj_free( &u );
  // FLA_Obj_free( &z );
  // FLA_Obj_free( &w );
  FLA_free( buff_u );
  FLA_free( buff_z );
  FLA_free( buff_w );

  return FLA_SUCCESS;
}
// FLA_Tridiag_UT_l_step_ops_var1
//
// Single-precision (float) kernel that works directly on raw buffers and
// strides instead of FLA_Obj objects; the commented-out FLA_* calls next to
// each operation show the object-based call it stands in for.
// NOTE(review): judging by the name, this is presumably variant 1 of one
// step of a lower-triangular tridiagonal reduction using UT transforms --
// confirm against the object-based implementation.
//
// Parameters:
//   m_A                - dimension of A; sizes the temporary vector and
//                        defines m_ahead = m_A - i - 1 in iteration i.
//   m_T                - number of iterations of the main loop (b_alg).
//   buff_A, rs_A, cs_A - base address and row/column strides of A. A is
//                        addressed as buff_A + col*cs_A + row*rs_A and is
//                        written to in place.
//   buff_T, rs_T, cs_T - base address and row/column strides of T, addressed
//                        the same way.
//
// One temporary vector of m_A floats (z, unit stride) is allocated with
// FLA_malloc() on entry and released with FLA_free() before returning.
//
// Returns FLA_SUCCESS unconditionally.
FLA_Error FLA_Tridiag_UT_l_step_ops_var1( int m_A,
                                          int m_T,
                                          float* buff_A, int rs_A, int cs_A, 
                                          float* buff_T, int rs_T, int cs_T )
{
  // Pointers to the library's constant scalars 2, 1, 0 and -1.
  float*    buff_2  = FLA_FLOAT_PTR( FLA_TWO );
  float*    buff_1  = FLA_FLOAT_PTR( FLA_ONE );
  float*    buff_0  = FLA_FLOAT_PTR( FLA_ZERO );
  float*    buff_m1 = FLA_FLOAT_PTR( FLA_MINUS_ONE );

  float     first_elem;
  float     beta;
  float     inv_tau11;
  float     minus_inv_tau11;
  int       i;

  // b_alg = FLA_Obj_length( T );
  int       b_alg = m_T;

  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z );
  float*    buff_z = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  int       inc_z  = 1;

  for ( i = 0; i < b_alg; ++i )
  {
    // Sub-partitions of A for iteration i: a21 starts just below the
    // diagonal element in column i, A20 starts at row i+1 of column 0, and
    // A22 starts at (i+1,i+1).
    float*    A20      = buff_A + (0  )*cs_A + (i+1)*rs_A;
    float*    a21      = buff_A + (i  )*cs_A + (i+1)*rs_A;
    float*    A22      = buff_A + (i+1)*cs_A + (i+1)*rs_A;

    // Column i of T: t01 starts at row 0, tau11 is the diagonal element.
    float*    t01      = buff_T + (i  )*cs_T + (0  )*rs_T;
    float*    tau11    = buff_T + (i  )*cs_T + (i  )*rs_T;

    float*    z21      = buff_z + (i+1)*inc_z;

    // a21_t is the first element of a21; a21_b starts one row further down.
    float*    a21_t    = a21    + (0  )*cs_A + (0  )*rs_A;
    float*    a21_b    = a21    + (0  )*cs_A + (1  )*rs_A;

    int       m_ahead  = m_A - i - 1;
    int       n_behind = i;

    /*------------------------------------------------------------*/

    // Nothing is done for an iteration that has no rows below the diagonal.
    if ( m_ahead > 0 )
    {
      // FLA_Househ2_UT( FLA_LEFT,
      //                 a21_t,
      //                 a21_b, tau11 );
      FLA_Househ2_UT_l_ops( m_ahead - 1,
                            a21_t,
                            a21_b, rs_A,
                            tau11 );

      // FLA_Set( FLA_ONE, inv_tau11 );
      // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 );
      // FLA_Copy( inv_tau11, minus_inv_tau11 );
      // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 );
      bl1_sdiv3( buff_1, tau11, &inv_tau11 );
      bl1_sneg2( &inv_tau11, &minus_inv_tau11 );

      // The first element of a21 is saved and temporarily replaced by 1;
      // it is restored at the end of this block.
      // FLA_Copy( a21_t, first_elem );
      // FLA_Set( FLA_ONE, a21_t );
      first_elem = *a21_t;
      *a21_t = *buff_1;

      // FLA_Hemv( FLA_LOWER_TRIANGULAR, FLA_ONE, A22, a21, FLA_ZERO, z21 );
      bl1_ssymv( BLIS1_LOWER_TRIANGULAR,
                 m_ahead,
                 buff_1,
                 A22, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 z21, inc_z );

      // FLA_Dotc( FLA_CONJUGATE, a21, z21, beta );
      // FLA_Inv_scal( FLA_TWO, beta );
      bl1_sdot( BLIS1_CONJUGATE,
                m_ahead,
                a21, rs_A,
                z21, inc_z,
                &beta );
      bl1_sinvscals( buff_2, &beta );

      // FLA_Scal( minus_inv_tau11, beta );
      // FLA_Axpy( beta, a21, z21 );
      // FLA_Scal( inv_tau11, z21 );
      bl1_sscals( &minus_inv_tau11, &beta );
      bl1_saxpyv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &beta,
                  a21, rs_A,
                  z21, inc_z );
      bl1_sscalv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &inv_tau11,
                  z21, inc_z );

      // FLA_Her2( FLA_LOWER_TRIANGULAR, FLA_MINUS_ONE, a21, z21, A22 );
      bl1_ssyr2( BLIS1_LOWER_TRIANGULAR,
                 m_ahead,
                 buff_m1,
                 a21, rs_A,
                 z21, inc_z,
                 A22, rs_A, cs_A );

      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 );
      bl1_sgemv( BLIS1_CONJ_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_ahead,
                 n_behind,
                 buff_1,
                 A20, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 t01, rs_T );

      // FLA_Copy( first_elem, a21_t );
      *a21_t = first_elem;
    }

    /*------------------------------------------------------------*/

  }

  // FLA_Obj_free( &z );
  FLA_free( buff_z );

  return FLA_SUCCESS;
}
Пример #13
0
/*
 * Benchmark driver for FLASH_Eig_gest().
 *
 * Reads the run parameters from stdin (repeat count, block size, problem
 * size range, thread count, SuperMatrix settings). For every problem size it
 * builds random symmetric matrices A and B (B gets its dimension added to
 * the diagonal), factors B with FLASH_Chol(), and times FLASH_Eig_gest()
 * n_repeats times, keeping the fastest time.
 *
 * Results go to stdout and, on non-Windows builds, are also appended to
 * OUTPUT_PATH/OUTPUT_FILE_output.m. No residual is computed by this driver,
 * so the "Norm of difference" it prints is always the initial value 0.0.
 *
 * Returns 0 on success, 1 if the input is missing/invalid or the output
 * file cannot be opened.
 */
int main( int argc, char *argv[] ) 
{ 
   int
      i, j,
      n_threads,
      n_repeats,
      n_trials,
      increment,
      begin,
      sorting,
      caching,
      work_stealing,
      data_affinity;

   // The block size is scanned into a plain unsigned int so that "%u"
   // always matches the pointed-to type, whatever dim_t is defined as.
   unsigned int
      nb_alg_in;

   dim_t
      size,
      nb_alg;

   FLA_Datatype
      datatype = FLA_DOUBLE;

   FLA_Inv    
      inv = FLA_NO_INVERSE;

   FLA_Uplo
      uplo = FLA_LOWER_TRIANGULAR;

   FLA_Obj 
      A, B, x, b, b_norm,
      AH, BH;
   
   double 
      length,
      b_norm_value = 0.0,
      dtime, 
      *dtimes,
      *flops;

#ifndef FLA_ENABLE_WINDOWS_BUILD
   char
      output_file_m[100];

   int
      path_len;
   
   FILE
      *fpp;
#endif

   ( void ) argc;
   ( void ) argv;

   fprintf( stdout, "%c Enter number of repeats: ", '%' );
   if ( scanf( "%d", &n_repeats ) != 1 || n_repeats < 1 )
   {
      // dtimes[0] is read unconditionally below, so at least one repeat
      // is required.
      fprintf( stderr, "Error: number of repeats must be a positive integer.\n" );
      return 1;
   }
   fprintf( stdout, "%c %d\n", '%', n_repeats );

   fprintf( stdout, "%c Enter blocksize: ", '%' );
   if ( scanf( "%u", &nb_alg_in ) != 1 )
   {
      fprintf( stderr, "Error: could not read blocksize.\n" );
      return 1;
   }
   nb_alg = nb_alg_in;
   fprintf( stdout, "%c %lu\n", '%', ( unsigned long ) nb_alg );

   fprintf( stdout, "%c Enter problem size parameters: first, inc, num: ", '%' );
   if ( scanf( "%d%d%d", &begin, &increment, &n_trials ) != 3 || n_trials < 1 )
   {
      fprintf( stderr, "Error: could not read problem size parameters (num must be positive).\n" );
      return 1;
   }
   fprintf( stdout, "%c %d %d %d\n", '%', begin, increment, n_trials );

   fprintf( stdout, "%c Enter number of threads: ", '%' );
   if ( scanf( "%d", &n_threads ) != 1 )
   {
      fprintf( stderr, "Error: could not read number of threads.\n" );
      return 1;
   }
   fprintf( stdout, "%c %d\n", '%', n_threads );

   fprintf( stdout, "%c Enter SuperMatrix parameters: sorting, caching, work stealing, data affinity: ", '%' );
   if ( scanf( "%d%d%d%d", &sorting, &caching, &work_stealing, &data_affinity ) != 4 )
   {
      fprintf( stderr, "Error: could not read SuperMatrix parameters.\n" );
      return 1;
   }
   fprintf( stdout, "%c %s %s %s %s\n\n", '%', ( sorting ? "TRUE" : "FALSE" ), ( caching ? "TRUE" : "FALSE" ), ( work_stealing ? "TRUE" : "FALSE" ), ( data_affinity ? ( data_affinity == 1 ? "FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC" : "FLASH_QUEUE_AFFINITY_OTHER" ) : "FLASH_QUEUE_AFFINITY_NONE" ) );

#ifdef FLA_ENABLE_WINDOWS_BUILD
   fprintf( stdout, "%s_%lu = [\n", OUTPUT_FILE, ( unsigned long ) nb_alg );
#else
   // Build the output path with a bounded write and refuse to continue if
   // it would not fit in the buffer.
   path_len = snprintf( output_file_m, sizeof( output_file_m ),
                        "%s/%s_output.m", OUTPUT_PATH, OUTPUT_FILE );
   if ( path_len < 0 || ( size_t ) path_len >= sizeof( output_file_m ) )
   {
      fprintf( stderr, "Error: output file path is too long.\n" );
      return 1;
   }

   fpp = fopen( output_file_m, "a" );
   if ( fpp == NULL )
   {
      perror( output_file_m );
      return 1;
   }

   fprintf( fpp, "%%\n" );
   fprintf( fpp, "%% | Matrix Size |    FLASH    |\n" );
   fprintf( fpp, "%% |    n x n    |    GFlops   |\n" );
   fprintf( fpp, "%% -----------------------------\n" );
   fprintf( fpp, "%s_%lu = [\n", OUTPUT_FILE, ( unsigned long ) nb_alg );
#endif

   FLA_Init();

   dtimes = ( double * ) FLA_malloc( n_repeats * sizeof( double ) );
   flops  = ( double * ) FLA_malloc( n_trials  * sizeof( double ) );
   
   FLASH_Queue_set_num_threads( n_threads );
   FLASH_Queue_set_sorting( sorting );
   FLASH_Queue_set_caching( caching );
   FLASH_Queue_set_work_stealing( work_stealing );
   FLASH_Queue_set_data_affinity( data_affinity );

   for ( i = 0; i < n_trials; i++ )
   {
      size = begin + i * increment;
      
      FLA_Obj_create( datatype, size, size, 0, 0, &A ); 
      FLA_Obj_create( datatype, size, size, 0, 0, &B ); 
      FLA_Obj_create( datatype, size, 1,    0, 0, &x ); 
      FLA_Obj_create( datatype, size, 1,    0, 0, &b ); 
      FLA_Obj_create( datatype, 1,    1,    0, 0, &b_norm ); 
      
      for ( j = 0; j < n_repeats; j++ )
      {
         FLA_Random_matrix( A );
         FLA_Random_matrix( B );
         FLA_Random_matrix( x );
         FLA_Random_matrix( b );

         FLA_Symmetrize( uplo, A );
         FLA_Symmetrize( uplo, B );

         // Add the matrix dimension to B's diagonal before it is factored.
         length = ( double ) FLA_Obj_length( B );
         FLA_Add_to_diag( &length, B );
         FLA_Symv_external( uplo, FLA_ONE, B, x, FLA_ZERO, b );

         FLASH_Obj_create_hier_copy_of_flat( A, 1, &nb_alg, &AH );  
         FLASH_Obj_create_hier_copy_of_flat( B, 1, &nb_alg, &BH );  

         FLASH_Chol( uplo, BH );
         
         // Only FLASH_Eig_gest() itself is timed.
         dtime = FLA_Clock();
         
         FLASH_Eig_gest( inv, uplo, AH, BH );
         
         dtime = FLA_Clock() - dtime;
         dtimes[j] = dtime;
         
         FLASH_Obj_free( &AH );
         FLASH_Obj_free( &BH );
      }
      
      // Keep the fastest of the repeats.
      dtime = dtimes[0];
      for ( j = 1; j < n_repeats; j++ )
         dtime = min( dtime, dtimes[j] );
      flops[i] = 1.0 * size * size * size / dtime / 1e9;

#ifdef FLA_ENABLE_WINDOWS_BUILD      
      fprintf( stdout, "   %lu   %6.3f   %le\n", ( unsigned long ) size, flops[i], b_norm_value );
#else
      fprintf( fpp, "   %lu   %6.3f\n", ( unsigned long ) size, flops[i] );
      
      fprintf( stdout, "Time: %e  |  GFlops: %6.3f\n", dtime, flops[i] );
      fprintf( stdout, "Matrix size: %lu x %lu  |  nb_alg: %lu\n", 
               ( unsigned long ) size, ( unsigned long ) size,
               ( unsigned long ) nb_alg ); 
      fprintf( stdout, "Norm of difference: %le\n\n", b_norm_value ); 
#endif
 
      FLA_Obj_free( &A ); 
      FLA_Obj_free( &B ); 
      FLA_Obj_free( &x ); 
      FLA_Obj_free( &b ); 
      FLA_Obj_free( &b_norm ); 
   }

#ifdef FLA_ENABLE_WINDOWS_BUILD
   fprintf( stdout, "];\n\n" );
#else
   fprintf( fpp, "];\n" );
   
   fflush( fpp );
   if ( fclose( fpp ) != 0 )
      perror( output_file_m );
#endif

   FLA_free( dtimes );
   FLA_free( flops );

   FLA_Finalize(); 
   
   return 0; 
}
Пример #14
0
// FLA_Hess_UT_step_ofd_var4
//
// Double-precision real kernel operating directly on raw buffers. It runs
// m_T iterations; each iteration updates the current column/row of A,
// generates a Householder transform from a21 and fills column i of Y, Z
// and T. This variant uses the fused kernels FLA_Fused_Ahx_Ax_opd_var1()
// and FLA_Fused_Uhu_Yhu_Zhu_opd_var1() for the matrix-vector products.
// NOTE(review): judging by the name this is presumably one blocked step of
// a reduction to upper Hessenberg form with UT-accumulated transforms --
// confirm against the library documentation.
//
// Parameters:
//   m_A                 dimension of A; sizes the trailing partitions
//                       (m_ahead = n_ahead = m_A - i - 1) and the scratch
//                       vector.
//   m_T                 number of iterations to perform (b_alg).
//   buff_A, rs_A, cs_A  buffer, row stride and column stride of A;
//                       A is updated in place.
//   buff_Y, rs_Y, cs_Y  m_A x m_T workspace; zeroed on entry.
//   buff_Z, rs_Z, cs_Z  m_A x m_T workspace; zeroed on entry.
//   buff_T, rs_T, cs_T  column i is handed to the Householder and fused
//                       kernels (tau11 on the diagonal, t01 above it).
//
// Returns FLA_SUCCESS unconditionally.
FLA_Error FLA_Hess_UT_step_ofd_var4( int m_A,
                                     int m_T,
                                     double* buff_A, int rs_A, int cs_A,
                                     double* buff_Y, int rs_Y, int cs_Y,
                                     double* buff_Z, int rs_Z, int cs_Z,
                                     double* buff_T, int rs_T, int cs_T )
{
  double*   buff_2  = FLA_DOUBLE_PTR( FLA_TWO );
  double*   buff_1  = FLA_DOUBLE_PTR( FLA_ONE );
  double*   buff_0  = FLA_DOUBLE_PTR( FLA_ZERO );
  double*   buff_m1 = FLA_DOUBLE_PTR( FLA_MINUS_ONE );

  double    first_elem, last_elem;
  double    dot_product;
  double    beta, conj_beta;
  double    inv_tau11;
  double    minus_inv_tau11;
  int       i;

  // b_alg = FLA_Obj_length( T );
  int       b_alg = m_T;

  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &e );
  // Only the e scratch vector is allocated in this variant; no d or f
  // vectors are needed here because the fused kernel is given t01 directly.
  double*   buff_e = ( double* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  int       inc_e  = 1;

  // FLA_Set( FLA_ZERO, Y );
  // FLA_Set( FLA_ZERO, Z );
  bl1_dsetm( m_A,
             b_alg,
             buff_0,
             buff_Y, rs_Y, cs_Y );
  bl1_dsetm( m_A,
             b_alg,
             buff_0,
             buff_Z, rs_Z, cs_Z );

  for ( i = 0; i < b_alg; ++i )
  {
    // Sub-partitions of A, Y, Z and T for iteration i.
    double*   a10t     = buff_A + (0  )*cs_A + (i  )*rs_A;
    double*   A20      = buff_A + (0  )*cs_A + (i+1)*rs_A;
    double*   alpha11  = buff_A + (i  )*cs_A + (i  )*rs_A;
    double*   a21      = buff_A + (i  )*cs_A + (i+1)*rs_A;
    double*   A02      = buff_A + (i+1)*cs_A + (0  )*rs_A;
    double*   a12t     = buff_A + (i+1)*cs_A + (i  )*rs_A;
    double*   A22      = buff_A + (i+1)*cs_A + (i+1)*rs_A;

    double*   y10t     = buff_Y + (0  )*cs_Y + (i  )*rs_Y;
    double*   Y20      = buff_Y + (0  )*cs_Y + (i+1)*rs_Y;
    double*   y21      = buff_Y + (i  )*cs_Y + (i+1)*rs_Y;

    double*   z10t     = buff_Z + (0  )*cs_Z + (i  )*rs_Z;
    double*   Z20      = buff_Z + (0  )*cs_Z + (i+1)*rs_Z;
    double*   z21      = buff_Z + (i  )*cs_Z + (i+1)*rs_Z;

    double*   t01      = buff_T + (i  )*cs_T + (0  )*rs_T;
    double*   tau11    = buff_T + (i  )*cs_T + (i  )*rs_T;

    double*   e0       = buff_e + (0  )*inc_e;

    // Last element of a10t. a10t is empty when i == 0, so the pointer is
    // only offset when i > 0; this avoids forming an address that lies
    // before the start of the row. It is dereferenced only under
    // m_behind > 0 below.
    double*   a10t_r   = ( i > 0 ? a10t + (i-1)*cs_A + (0  )*rs_A : a10t );

    double*   a21_t    = a21    + (0  )*cs_A + (0  )*rs_A;
    double*   a21_b    = a21    + (0  )*cs_A + (1  )*rs_A;

    double*   ABL      = a10t;
    double*   ZBL      = z10t;

    double*   a2       = alpha11;

    int       m_ahead  = m_A - i - 1;
    int       n_ahead  = m_A - i - 1;
    int       m_behind = i;
    int       n_behind = i;

    /*------------------------------------------------------------*/

    if ( m_behind > 0 )
    {
      // Temporarily replace the last element of a10t by one while the row
      // is used as a vector operand; the saved value is restored below.
      // FLA_Copy( a10t_r, last_elem );
      // FLA_Set( FLA_ONE, a10t_r );
      last_elem = *a10t_r;
      *a10t_r = *buff_1;
    }

    // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ABL, y10t, FLA_ONE, a2 );
    // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ZBL, a10t, FLA_ONE, a2 );
    bl1_dgemv( BLIS1_NO_TRANSPOSE,
               BLIS1_CONJUGATE,
               m_ahead + 1,
               n_behind,
               buff_m1,
               ABL,  rs_A, cs_A,
               y10t, cs_Y,
               buff_1,
               a2,   rs_A );
    bl1_dgemv( BLIS1_NO_TRANSPOSE,
               BLIS1_CONJUGATE,
               m_ahead + 1,
               n_behind,
               buff_m1,
               ZBL,  rs_Z, cs_Z,
               a10t, cs_A,
               buff_1,
               a2,   rs_A );

    // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, a10t, FLA_ONE, a12t );
    // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, A20, z10t, FLA_ONE, a12t );
    bl1_dgemv( BLIS1_CONJ_NO_TRANSPOSE,
               BLIS1_NO_CONJUGATE,
               m_ahead,
               n_behind,
               buff_m1,
               Y20,  rs_Y, cs_Y,
               a10t, cs_A,
               buff_1,
               a12t, cs_A );
    bl1_dgemv( BLIS1_CONJ_NO_TRANSPOSE,
               BLIS1_NO_CONJUGATE,
               m_ahead,
               n_behind,
               buff_m1,
               A20,  rs_A, cs_A,
               z10t, cs_Z,
               buff_1,
               a12t, cs_A );

    if ( m_behind > 0 )
    {
      // Put back the element of a10t that was overwritten above.
      // FLA_Copy( last_elem, a10t_r );
      *a10t_r = last_elem;
    }

    if ( m_ahead > 0 )
    {
      // FLA_Househ2_UT( FLA_LEFT,
      //                 a21_t,
      //                 a21_b, tau11 );
      FLA_Househ2_UT_l_opd( m_ahead - 1,
                            a21_t,
                            a21_b, rs_A,
                            tau11 );

      // FLA_Set( FLA_ONE, inv_tau11 );
      // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 );
      // FLA_Copy( inv_tau11, minus_inv_tau11 );
      // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 );
      bl1_ddiv3( buff_1, tau11, &inv_tau11 );
      bl1_dneg2( &inv_tau11, &minus_inv_tau11 );

      // Temporarily replace the first element of a21 by one so that a21
      // can be used as a vector operand; restored at the end of this block.
      // FLA_Copy( a21_t, first_elem );
      // FLA_Set( FLA_ONE, a21_t );
      first_elem = *a21_t;
      *a21_t = *buff_1;

      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y21 );
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, z21 );
      FLA_Fused_Ahx_Ax_opd_var1( m_ahead,
                                 n_ahead,
                                 A22, rs_A, cs_A,
                                 a21, rs_A,
                                 y21, rs_Y,
                                 z21, rs_Z );

      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, d0 );
      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Y20, a21, FLA_ZERO, e0 );
      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Z20, a21, FLA_ZERO, f0 );
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, d0, FLA_ONE, y21 );
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, f0, FLA_ONE, y21 );
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, e0, FLA_ONE, z21 );
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Z20, d0, FLA_ONE, z21 );
      // FLA_Copy( d0, t01 );
      FLA_Fused_Uhu_Yhu_Zhu_opd_var1( m_ahead,
                                      n_behind,
                                      buff_m1,
                                      A20, rs_A, cs_A,
                                      Y20, rs_Y, cs_Y,
                                      Z20, rs_Z, cs_Z,
                                      t01, rs_T,
                                      a21, rs_A,
                                      y21, rs_Y,
                                      z21, rs_Z );

      // FLA_Dotc( FLA_CONJUGATE, a21, z21, beta );
      // FLA_Inv_scal( FLA_TWO, beta );
      // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta );
      bl1_ddot( BLIS1_CONJUGATE,
                m_ahead,
                a21, rs_A,
                z21, rs_Z,
                &beta );
      bl1_dinvscals( buff_2, &beta );
      bl1_dcopyconj( &beta, &conj_beta );

      // FLA_Scal( minus_inv_tau11, conj_beta );
      // FLA_Axpy( conj_beta, a21, y21 );
      // FLA_Scal( inv_tau11, y21 );
      bl1_dscals( &minus_inv_tau11, &conj_beta );
      bl1_daxpyv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &conj_beta,
                  a21, rs_A,
                  y21, rs_Y );
      bl1_dscalv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &inv_tau11,
                  y21, rs_Y );

      // FLA_Scal( minus_inv_tau11, beta );
      // FLA_Axpy( beta, a21, z21 );
      // FLA_Scal( inv_tau11, z21 );
      bl1_dscals( &minus_inv_tau11, &beta );
      bl1_daxpyv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &beta,
                  a21, rs_A,
                  z21, rs_Z );
      bl1_dscalv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &inv_tau11,
                  z21, rs_Z );

      // FLA_Dot( a12t, a21, dot_product );
      // FLA_Scal( minus_inv_tau11, dot_product );
      // FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t );
      bl1_ddot( BLIS1_NO_CONJUGATE,
                m_ahead,
                a12t, cs_A,
                a21,  rs_A,
                &dot_product );
      bl1_dscals( &minus_inv_tau11, &dot_product );
      bl1_daxpyv( BLIS1_CONJUGATE,
                  m_ahead,
                  &dot_product,
                  a21,  rs_A,
                  a12t, cs_A );

      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, e0 );
      // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, e0, a21, A02 );
      bl1_dgemv( BLIS1_NO_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_behind,
                 n_ahead,
                 buff_1,
                 A02, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 e0,  inc_e );
      bl1_dger( BLIS1_NO_CONJUGATE,
                BLIS1_CONJUGATE,
                m_behind,
                n_ahead,
                &minus_inv_tau11,
                e0,  inc_e,
                a21, rs_A,
                A02, rs_A, cs_A );

      // Put back the element of a21 that was overwritten above.
      // FLA_Copy( first_elem, a21_t );
      *a21_t = first_elem;
    }

    /*------------------------------------------------------------*/

  }

  // FLA_Obj_free( &e );
  FLA_free( buff_e );

  return FLA_SUCCESS;
}
Пример #15
0
// FLA_Psttv
//
// Finds the first symmetric group of C that contains more than one mode,
// partitions A and C identically along that group's modes, and calls
// itself recursively on selected sub-blocks. When no such group exists,
// or when every mode of the group has size 1 in C, the work is delegated
// to FLA_Ttm_single_mode( alpha, A, mode, beta, B, C ).
//
// Notes carried over from the original author:
//  - Only symmetry that already exists is retained.
//  - Mode multiplies MUST be performed in order, so that the stored pieces
//    are traversed correctly. (This could perhaps be relaxed: whatever the
//    loop order, the unique part is visited only once. The permutations
//    would then have to be handled.)
//  - Step 1: partition the unaltered symmetric groups of A and C first.
//  - Step 2: deal with the symmetric group that will be broken.
//
// Returns FLA_SUCCESS on every path, including the early exit taken when
// one of the group's modes has size 0 in C.
FLA_Error FLA_Psttv( FLA_Obj alpha, FLA_Obj A, dim_t mode, FLA_Obj beta, FLA_Obj B, FLA_Obj C )
{
    // Sentinel meaning "no symmetric group with more than one mode found".
    // Written as an explicit cast so the comparison below does not depend
    // on how a plain -1 is converted against dim_t.
    const dim_t noGroupToSplit = ( dim_t ) -1;

    dim_t i;

    //Determine which (if any) symmetric group can safely be repartitioned similarly
    //between A and C.
    TLA_sym symC = C.sym;
    dim_t symGroupToSplit = noGroupToSplit;
    dim_t nModes_part = 0;
    for(i = 0; i < symC.nSymGroups; i++)
        if(symC.symGroupLens[i] > 1) {
            symGroupToSplit = i;
            nModes_part = symC.symGroupLens[symGroupToSplit];
            break;
        }

    //No group can be split, meaning mode multiplied in is on own in both tensors.
    //Multiply
    if(symGroupToSplit == noGroupToSplit) {
        FLA_Ttm_single_mode(alpha, A, mode, beta, B, C);
        return FLA_SUCCESS;
    } else {
        //This is the symmetric group to split
        dim_t symGroupToSplitOffset = TLA_sym_group_mode_offset(symC, symGroupToSplit);

        dim_t* part_modes;
        dim_t* sizes;
        dim_t* repart_sizes;
        FLA_Side* sides;
        FLA_Side* repart_sides;

        dim_t isSingleBlock;

        FLA_Obj** Apart;
        FLA_Obj** Cpart;
        FLA_Obj** Arepart;
        FLA_Obj** Crepart;

        dim_t first_update_stride;
        dim_t update_region_stride;
        dim_t update_region;

        FLA_Obj Apass;
        FLA_Obj Cpass;

        //Number of views: 2^nModes_part partitions, 3^nModes_part repartitions.
        //The shift is done in dim_t rather than int.
        dim_t nPart = ( dim_t ) 1 << nModes_part;
        dim_t nRepart = 1;
        for(i = 0; i < nModes_part; i++)
            nRepart *= 3;

        //Index of the first region to update: 3^(nModes_part - 1).
        //It does not change between iterations of the main loop.
        first_update_stride = 1;
        for(i = 1; i < nModes_part; i++)
            first_update_stride *= 3;

        //Check if we are dealing with a single block
        //If so, we get to just multiply in a mode
        isSingleBlock = TRUE;
        for(i = 0; i < nModes_part; i++) {
            if(FLA_Obj_dimsize(C, symC.symModes[symGroupToSplitOffset + i]) == 0) {
                //Nothing to compute for an empty mode.
                return FLA_SUCCESS;
            }
            if(FLA_Obj_dimsize(C, symC.symModes[symGroupToSplitOffset + i]) > 1) {
                isSingleBlock = FALSE;
            }
        }

        if(isSingleBlock) {
            FLA_Ttm_single_mode(alpha, A, mode, beta, B, C);
            return FLA_SUCCESS;
        }

        //Size each array from its own element type; sides and repart_sides
        //hold FLA_Side values, not dim_t.
        part_modes = (dim_t*)FLA_malloc(nModes_part * sizeof(*part_modes));
        sizes = (dim_t*)FLA_malloc(nModes_part * sizeof(*sizes));
        repart_sizes = (dim_t*)FLA_malloc(nModes_part * sizeof(*repart_sizes));
        sides = (FLA_Side*)FLA_malloc(nModes_part * sizeof(*sides));
        repart_sides = (FLA_Side*)FLA_malloc(nModes_part * sizeof(*repart_sides));

        for(i = 0; i < nModes_part; i++) {
            part_modes[i] = symC.symModes[symGroupToSplitOffset + i];
            sizes[i] = 0;
            repart_sizes[i] = 1;
            sides[i] = FLA_TOP;
            repart_sides[i] = FLA_BOTTOM;
        }

        //Begin loop for general tensor case
        Apart = (FLA_Obj**)FLA_malloc(nPart * sizeof(FLA_Obj*));
        Cpart = (FLA_Obj**)FLA_malloc(nPart * sizeof(FLA_Obj*));

        Arepart = (FLA_Obj**)FLA_malloc(nRepart * sizeof(FLA_Obj*));
        Crepart = (FLA_Obj**)FLA_malloc(nRepart * sizeof(FLA_Obj*));

        TLA_create_part_obj(nPart, Apart);
        TLA_create_part_obj(nPart, Cpart);

        TLA_create_part_obj(nRepart, Arepart);
        TLA_create_part_obj(nRepart, Crepart);

        FLA_Part_2powm(A, Apart,
                       nModes_part, part_modes,
                       sizes, sides);

        FLA_Part_2powm(C, Cpart,
                       nModes_part, part_modes,
                       sizes, sides);

        //Iterate until the first partition of C spans all of C along the first mode.
        while(FLA_Obj_dimsize(*(Cpart[0]), part_modes[0]) < FLA_Obj_dimsize(C, part_modes[0])) {
            FLA_Repart_2powm_to_3powm(Apart, Arepart,
                                      nModes_part, part_modes,
                                      repart_sizes, repart_sides);
            FLA_Repart_2powm_to_3powm(Cpart, Crepart,
                                      nModes_part, part_modes,
                                      repart_sizes, repart_sides);

            /******************************/
            //Symmetric region being partitioned includes
            //symmetric tensors of order 0->order-1
            //Must update ALL of them
            //The visited indices are 3^(n-1), 3^(n-1)+3^(n-2), ..., ending
            //after the stride has been divided down to 1.
            update_region_stride = first_update_stride;
            update_region = update_region_stride;
            for(i = 0; i < nModes_part; i++) {
                Apass = *(Arepart[update_region]);
                Cpass = *(Crepart[update_region]);
                FLA_Psttv(alpha, Apass, mode, beta, B, Cpass);
                update_region_stride /= 3;
                update_region += update_region_stride;
            }
            /******************************/
            FLA_Cont_with_3powm_to_2powm(Apart, Arepart,
                                         nModes_part, part_modes,
                                         repart_sides);
            FLA_Cont_with_3powm_to_2powm(Cpart, Crepart,
                                         nModes_part, part_modes,
                                         repart_sides);
        }

        //Tidy up alloc'd data
        TLA_destroy_part_obj(nPart, Apart);
        TLA_destroy_part_obj(nPart, Cpart);

        TLA_destroy_part_obj(nRepart, Arepart);
        TLA_destroy_part_obj(nRepart, Crepart);


        FLA_free(part_modes);
        FLA_free(sizes);
        FLA_free(repart_sizes);
        FLA_free(sides);
        FLA_free(repart_sides);

        FLA_free(Apart);
        FLA_free(Cpart);
        FLA_free(Arepart);
        FLA_free(Crepart);
    }

    return FLA_SUCCESS;
}
Пример #16
0
// Benchmark driver for PLASMA_dgeqrf.
//
// Reads the number of repeats, the block size, the problem-size sweep
// (first, increment, count) and the thread count from stdin. For each
// problem size it times PLASMA_dgeqrf on a random size x size matrix
// n_repeats times and keeps the fastest run. It reports
// 4/3 * size^3 / time / 1e9 as GFlops, both on stdout and appended to
// "<OUTPUT_PATH>/<OUTPUT_FILE>_output.m".
//
// Returns 0 on success and 1 when the input is malformed or the output
// file cannot be opened.
int main( int argc, char *argv[] )
{
    int
    i, j,
    size,
    n_threads,
    n_repeats,
    n_trials,
    nb_alg,
    increment,
    begin,
    path_len;

    FLA_Datatype
    datatype = FLA_DOUBLE;

    FLA_Obj
    A;

    double
    b_norm_value = 0.0,   // never updated: this driver performs no residual check
    dtime,
    *dtimes,
    *flops,
    *T;

    char
    output_file_m[100];

    FILE
    *fpp;

    fprintf( stdout, "%c Enter number of repeats: ", '%' );
    // At least one repeat is required: dtimes[0] is read unconditionally below.
    if ( scanf( "%d", &n_repeats ) != 1 || n_repeats < 1 )
    {
        fprintf( stderr, "error: number of repeats must be a positive integer\n" );
        return 1;
    }
    fprintf( stdout, "%c %d\n", '%', n_repeats );

    fprintf( stdout, "%c Enter blocksize: ", '%' );
    if ( scanf( "%d", &nb_alg ) != 1 )
    {
        fprintf( stderr, "error: could not read blocksize\n" );
        return 1;
    }
    fprintf( stdout, "%c %d\n", '%', nb_alg );

    fprintf( stdout, "%c Enter problem size parameters: first, inc, num: ", '%' );
    // A negative trial count would be turned into a huge allocation size.
    if ( scanf( "%d%d%d", &begin, &increment, &n_trials ) != 3 || n_trials < 0 )
    {
        fprintf( stderr, "error: could not read problem size parameters\n" );
        return 1;
    }
    fprintf( stdout, "%c %d %d %d\n", '%', begin, increment, n_trials );

    fprintf( stdout, "%c Enter number of threads: ", '%' );
    if ( scanf( "%d", &n_threads ) != 1 )
    {
        fprintf( stderr, "error: could not read number of threads\n" );
        return 1;
    }
    fprintf( stdout, "%c %d\n\n", '%', n_threads );

    // Bounded formatting: refuse a path that does not fit the buffer
    // instead of writing past its end.
    path_len = snprintf( output_file_m, sizeof( output_file_m ),
                         "%s/%s_output.m", OUTPUT_PATH, OUTPUT_FILE );
    if ( path_len < 0 || ( size_t ) path_len >= sizeof( output_file_m ) )
    {
        fprintf( stderr, "error: output file path is too long\n" );
        return 1;
    }

    fpp = fopen( output_file_m, "a" );
    if ( fpp == NULL )
    {
        fprintf( stderr, "error: cannot open %s for appending\n", output_file_m );
        return 1;
    }

    fprintf( fpp, "%%\n" );
    fprintf( fpp, "%% | Matrix Size |    PLASMA   |\n" );
    fprintf( fpp, "%% |    n x n    |    GFlops   |\n" );
    fprintf( fpp, "%% -----------------------------\n" );

    FLA_Init();
    PLASMA_Init( n_threads );

    PLASMA_Disable( PLASMA_AUTOTUNING );
    PLASMA_Set( PLASMA_TILE_SIZE, nb_alg );
    PLASMA_Set( PLASMA_INNER_BLOCK_SIZE, nb_alg / 4 );

    dtimes = ( double * ) FLA_malloc( n_repeats * sizeof( double ) );
    flops  = ( double * ) FLA_malloc( n_trials  * sizeof( double ) );

    fprintf( fpp, "%s = [\n", OUTPUT_FILE );

    for ( i = 0; i < n_trials; i++ )
    {
        size = begin + i * increment;

        FLA_Obj_create( datatype, size, size, 0, 0, &A );

        for ( j = 0; j < n_repeats; j++ )
        {
            // Fresh input for every repeat: the factorization overwrites A.
            FLA_Random_matrix( A );

            PLASMA_Alloc_Workspace_dgeqrf( size, size, &T );

            dtime = FLA_Clock();

            PLASMA_dgeqrf( size, size, FLA_Obj_buffer_at_view( A ), size, T );

            dtime = FLA_Clock() - dtime;
            dtimes[j] = dtime;

            free( T );
        }

        // Keep the fastest of the repeats.
        dtime = dtimes[0];
        for ( j = 1; j < n_repeats; j++ )
            dtime = min( dtime, dtimes[j] );
        flops[i] = 4.0 / 3.0 * size * size * size / dtime / 1e9;

        fprintf( fpp, "   %d   %6.3f\n", size, flops[i] );

        printf( "Time: %e  |  GFlops: %6.3f\n",
                dtime, flops[i] );
        printf( "Matrix size: %d x %d  |  nb_alg: %d\n",
                size, size, nb_alg );
        printf( "Norm of difference: %le\n\n", b_norm_value );

        FLA_Obj_free( &A );
    }

    fprintf( fpp, "];\n" );

    fflush( fpp );
    fclose( fpp );

    FLA_free( dtimes );
    FLA_free( flops );

    PLASMA_Finalize();
    FLA_Finalize();

    return 0;
}
Пример #17
0
// FLA_Hess_UT_step_opc_var4
//
// Single-precision complex kernel operating directly on raw buffers. It
// runs m_T iterations; each iteration updates the current column/row of A,
// generates a Householder transform from a21 and fills column i of Y, Z
// and T. All matrix-vector products are issued as individual bl1_cgemv()
// calls, using three scratch vectors d, e and f of length m_A.
// NOTE(review): judging by the name this is presumably one blocked step of
// a reduction to upper Hessenberg form with UT-accumulated transforms --
// confirm against the library documentation.
//
// Parameters:
//   m_A                 dimension of A; sizes the trailing partitions
//                       (m_ahead = n_ahead = m_A - i - 1) and the scratch
//                       vectors.
//   m_T                 number of iterations to perform (b_alg).
//   buff_A, rs_A, cs_A  buffer, row stride and column stride of A;
//                       A is updated in place.
//   buff_Y, rs_Y, cs_Y  m_A x m_T workspace; zeroed on entry.
//   buff_Z, rs_Z, cs_Z  m_A x m_T workspace; zeroed on entry.
//   buff_T, rs_T, cs_T  column i receives d0 in t01 (above the diagonal);
//                       tau11 (the diagonal entry) is handed to the
//                       Householder kernel.
//
// Returns FLA_SUCCESS unconditionally.
FLA_Error FLA_Hess_UT_step_opc_var4( int m_A,
                                     int m_T,
                                     scomplex* buff_A, int rs_A, int cs_A,
                                     scomplex* buff_Y, int rs_Y, int cs_Y,
                                     scomplex* buff_Z, int rs_Z, int cs_Z,
                                     scomplex* buff_T, int rs_T, int cs_T )
{
  scomplex* buff_2  = FLA_COMPLEX_PTR( FLA_TWO );
  scomplex* buff_1  = FLA_COMPLEX_PTR( FLA_ONE );
  scomplex* buff_0  = FLA_COMPLEX_PTR( FLA_ZERO );
  scomplex* buff_m1 = FLA_COMPLEX_PTR( FLA_MINUS_ONE );

  scomplex  first_elem, last_elem;
  scomplex  dot_product;
  scomplex  beta, conj_beta;
  scomplex  inv_tau11;
  scomplex  minus_inv_tau11;
  int       i;

  // b_alg = FLA_Obj_length( T );
  int       b_alg = m_T;

  // Scratch vectors of length m_A.
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &d );
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &e );
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &f );
  scomplex* buff_d = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  scomplex* buff_e = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  scomplex* buff_f = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  int       inc_d  = 1;
  int       inc_e  = 1;
  int       inc_f  = 1;

  // FLA_Set( FLA_ZERO, Y );
  // FLA_Set( FLA_ZERO, Z );
  bl1_csetm( m_A,
             b_alg,
             buff_0,
             buff_Y, rs_Y, cs_Y );
  bl1_csetm( m_A,
             b_alg,
             buff_0,
             buff_Z, rs_Z, cs_Z );

  for ( i = 0; i < b_alg; ++i )
  {
    // Sub-partitions of A, Y, Z and T for iteration i.
    scomplex* a10t     = buff_A + (0  )*cs_A + (i  )*rs_A;
    scomplex* A20      = buff_A + (0  )*cs_A + (i+1)*rs_A;
    scomplex* alpha11  = buff_A + (i  )*cs_A + (i  )*rs_A;
    scomplex* a21      = buff_A + (i  )*cs_A + (i+1)*rs_A;
    scomplex* A02      = buff_A + (i+1)*cs_A + (0  )*rs_A;
    scomplex* a12t     = buff_A + (i+1)*cs_A + (i  )*rs_A;
    scomplex* A22      = buff_A + (i+1)*cs_A + (i+1)*rs_A;

    scomplex* y10t     = buff_Y + (0  )*cs_Y + (i  )*rs_Y;
    scomplex* Y20      = buff_Y + (0  )*cs_Y + (i+1)*rs_Y;
    scomplex* y21      = buff_Y + (i  )*cs_Y + (i+1)*rs_Y;

    scomplex* z10t     = buff_Z + (0  )*cs_Z + (i  )*rs_Z;
    scomplex* Z20      = buff_Z + (0  )*cs_Z + (i+1)*rs_Z;
    scomplex* z21      = buff_Z + (i  )*cs_Z + (i+1)*rs_Z;

    scomplex* t01      = buff_T + (i  )*cs_T + (0  )*rs_T;
    scomplex* tau11    = buff_T + (i  )*cs_T + (i  )*rs_T;

    scomplex* d0       = buff_d + (0  )*inc_d;

    scomplex* e0       = buff_e + (0  )*inc_e;

    scomplex* f0       = buff_f + (0  )*inc_f;

    // Last element of a10t. a10t is empty when i == 0, so the pointer is
    // only offset when i > 0; this avoids forming an address that lies
    // before the start of the row. It is dereferenced only under
    // m_behind > 0 below.
    scomplex* a10t_r   = ( i > 0 ? a10t + (i-1)*cs_A + (0  )*rs_A : a10t );

    scomplex* a21_t    = a21    + (0  )*cs_A + (0  )*rs_A;
    scomplex* a21_b    = a21    + (0  )*cs_A + (1  )*rs_A;

    scomplex* ABL      = a10t;
    scomplex* ZBL      = z10t;

    scomplex* a2       = alpha11;

    int       m_ahead  = m_A - i - 1;
    int       n_ahead  = m_A - i - 1;
    int       m_behind = i;
    int       n_behind = i;

    /*------------------------------------------------------------*/

    if ( m_behind > 0 )
    {
      // Temporarily replace the last element of a10t by one while the row
      // is used as a vector operand; the saved value is restored below.
      // FLA_Copy( a10t_r, last_elem );
      // FLA_Set( FLA_ONE, a10t_r );
      last_elem = *a10t_r;
      *a10t_r = *buff_1;
    }

    // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ABL, y10t, FLA_ONE, a2 );
    // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ZBL, a10t, FLA_ONE, a2 );
    bl1_cgemv( BLIS1_NO_TRANSPOSE,
               BLIS1_CONJUGATE,
               m_ahead + 1,
               n_behind,
               buff_m1,
               ABL,  rs_A, cs_A,
               y10t, cs_Y,
               buff_1,
               a2,   rs_A );
    bl1_cgemv( BLIS1_NO_TRANSPOSE,
               BLIS1_CONJUGATE,
               m_ahead + 1,
               n_behind,
               buff_m1,
               ZBL,  rs_Z, cs_Z,
               a10t, cs_A,
               buff_1,
               a2,   rs_A );

    // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, a10t, FLA_ONE, a12t );
    // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, A20, z10t, FLA_ONE, a12t );
    bl1_cgemv( BLIS1_CONJ_NO_TRANSPOSE,
               BLIS1_NO_CONJUGATE,
               m_ahead,
               n_behind,
               buff_m1,
               Y20,  rs_Y, cs_Y,
               a10t, cs_A,
               buff_1,
               a12t, cs_A );
    bl1_cgemv( BLIS1_CONJ_NO_TRANSPOSE,
               BLIS1_NO_CONJUGATE,
               m_ahead,
               n_behind,
               buff_m1,
               A20,  rs_A, cs_A,
               z10t, cs_Z,
               buff_1,
               a12t, cs_A );

    if ( m_behind > 0 )
    {
      // Put back the element of a10t that was overwritten above.
      // FLA_Copy( last_elem, a10t_r );
      *a10t_r = last_elem;
    }

    if ( m_ahead > 0 )
    {
      // FLA_Househ2_UT( FLA_LEFT,
      //                 a21_t,
      //                 a21_b, tau11 );
      FLA_Househ2_UT_l_opc( m_ahead - 1,
                            a21_t,
                            a21_b, rs_A,
                            tau11 );

      // FLA_Set( FLA_ONE, inv_tau11 );
      // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 );
      // FLA_Copy( inv_tau11, minus_inv_tau11 );
      // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 );
      bl1_cdiv3( buff_1, tau11, &inv_tau11 );
      bl1_cneg2( &inv_tau11, &minus_inv_tau11 );

      // Temporarily replace the first element of a21 by one so that a21
      // can be used as a vector operand; restored at the end of this block.
      // FLA_Copy( a21_t, first_elem );
      // FLA_Set( FLA_ONE, a21_t );
      first_elem = *a21_t;
      *a21_t = *buff_1;

      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y21 );
      bl1_cgemv( BLIS1_CONJ_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_ahead,
                 n_ahead,
                 buff_1,
                 A22, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 y21, rs_Y );

      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, z21 );
      bl1_cgemv( BLIS1_NO_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_ahead,
                 n_ahead,
                 buff_1,
                 A22, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 z21, rs_Z );

      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, d0 );
      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Y20, a21, FLA_ZERO, e0 );
      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Z20, a21, FLA_ZERO, f0 );
      bl1_cgemv( BLIS1_CONJ_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_ahead,
                 n_behind,
                 buff_1,
                 A20, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 d0,  inc_d );
      bl1_cgemv( BLIS1_CONJ_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_ahead,
                 n_behind,
                 buff_1,
                 Y20, rs_Y, cs_Y,
                 a21, rs_A,
                 buff_0,
                 e0,  inc_e );
      bl1_cgemv( BLIS1_CONJ_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_ahead,
                 n_behind,
                 buff_1,
                 Z20, rs_Z, cs_Z,
                 a21, rs_A,
                 buff_0,
                 f0,  inc_f );

      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, d0, FLA_ONE, y21 );
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, f0, FLA_ONE, y21 );
      bl1_cgemv( BLIS1_NO_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_ahead,
                 n_behind,
                 buff_m1,
                 Y20, rs_Y, cs_Y,
                 d0,  inc_d,
                 buff_1,
                 y21, rs_Y );
      bl1_cgemv( BLIS1_NO_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_ahead,
                 n_behind,
                 buff_m1,
                 A20, rs_A, cs_A,
                 f0,  inc_f,
                 buff_1,
                 y21, rs_Y );

      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, e0, FLA_ONE, z21 );
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Z20, d0, FLA_ONE, z21 );
      bl1_cgemv( BLIS1_NO_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_ahead,
                 n_behind,
                 buff_m1,
                 A20, rs_A, cs_A,
                 e0,  inc_e,
                 buff_1,
                 z21, rs_Z );
      bl1_cgemv( BLIS1_NO_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_ahead,
                 n_behind,
                 buff_m1,
                 Z20, rs_Z, cs_Z,
                 d0,  inc_d,
                 buff_1,
                 z21, rs_Z );

      // FLA_Copy( d0, t01 );
      bl1_ccopyv( BLIS1_NO_CONJUGATE,
                  n_behind,
                  d0,  inc_d,
                  t01, rs_T );

      // FLA_Dotc( FLA_CONJUGATE, a21, z21, beta );
      // FLA_Inv_scal( FLA_TWO, beta );
      // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta );
      bl1_cdot( BLIS1_CONJUGATE,
                m_ahead,
                a21, rs_A,
                z21, rs_Z,
                &beta );
      bl1_cinvscals( buff_2, &beta );
      bl1_ccopyconj( &beta, &conj_beta );

      // FLA_Scal( minus_inv_tau11, conj_beta );
      // FLA_Axpy( conj_beta, a21, y21 );
      // FLA_Scal( inv_tau11, y21 );
      bl1_cscals( &minus_inv_tau11, &conj_beta );
      bl1_caxpyv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &conj_beta,
                  a21, rs_A,
                  y21, rs_Y );
      bl1_cscalv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &inv_tau11,
                  y21, rs_Y );

      // FLA_Scal( minus_inv_tau11, beta );
      // FLA_Axpy( beta, a21, z21 );
      // FLA_Scal( inv_tau11, z21 );
      bl1_cscals( &minus_inv_tau11, &beta );
      bl1_caxpyv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &beta,
                  a21, rs_A,
                  z21, rs_Z );
      bl1_cscalv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &inv_tau11,
                  z21, rs_Z );

      // FLA_Dot( a12t, a21, dot_product );
      // FLA_Scal( minus_inv_tau11, dot_product );
      // FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t );
      bl1_cdot( BLIS1_NO_CONJUGATE,
                m_ahead,
                a12t, cs_A,
                a21,  rs_A,
                &dot_product );
      bl1_cscals( &minus_inv_tau11, &dot_product );
      bl1_caxpyv( BLIS1_CONJUGATE,
                  m_ahead,
                  &dot_product,
                  a21,  rs_A,
                  a12t, cs_A );

      // The e scratch vector is reused here; its earlier contents are no
      // longer needed at this point.
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, e0 );
      // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, e0, a21, A02 );
      bl1_cgemv( BLIS1_NO_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_behind,
                 n_ahead,
                 buff_1,
                 A02, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 e0,  inc_e );
      bl1_cger( BLIS1_NO_CONJUGATE,
                BLIS1_CONJUGATE,
                m_behind,
                n_ahead,
                &minus_inv_tau11,
                e0,  inc_e,
                a21, rs_A,
                A02, rs_A, cs_A );

      // Put back the element of a21 that was overwritten above.
      // FLA_Copy( first_elem, a21_t );
      *a21_t = first_elem;
    }

    /*------------------------------------------------------------*/

  }

  // FLA_Obj_free( &d );
  // FLA_Obj_free( &e );
  // FLA_Obj_free( &f );
  FLA_free( buff_d );
  FLA_free( buff_e );
  FLA_free( buff_f );

  return FLA_SUCCESS;
}
Пример #18
0
// FLA_Obj_create_ext
//
// Builds a new object in *obj: fills in the view fields, allocates a
// FLA_Base_obj together with its element buffer, and records the strides
// and element count that were used for the allocation.
//
//   datatype, elemtype  stored in the base object.
//   m, n                dimensions stored in both the view and the base.
//   m_inner, n_inner    inner dimensions stored in both the view and the base.
//   rs, cs              requested row/column strides; they may be changed by
//                       FLA_adjust_strides() and FLA_compute_num_elem()
//                       before being stored.
//   obj                 object to initialize.
//
// Returns FLA_SUCCESS.
FLA_Error FLA_Obj_create_ext( FLA_Datatype datatype, FLA_Elemtype elemtype, dim_t m, dim_t n, dim_t m_inner, dim_t n_inner, dim_t rs, dim_t cs, FLA_Obj *obj )
{
  FLA_Base_obj* base;
  size_t        elem_count;
  size_t        n_bytes;

  // Strides are adjusted first so that the parameter check below sees
  // the values that will actually be used.
  FLA_adjust_strides( m, n, &rs, &cs );

  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
    FLA_Obj_create_ext_check( datatype, elemtype, m, n, m_inner, n_inner, rs, cs, obj );

  // View: the full m x n object, starting at offset (0,0).
  obj->m       = m;
  obj->n       = n;
  obj->offm    = 0;
  obj->offn    = 0;
  obj->m_inner = m_inner;
  obj->n_inner = n_inner;

  // Base object, attached to the view before anything queries *obj.
  base      = ( FLA_Base_obj * ) FLA_malloc( sizeof( *base ) );
  obj->base = base;

  base->datatype = datatype;
  base->elemtype = elemtype;
  base->m        = m;
  base->n        = n;
  base->m_inner  = m_inner;
  base->n_inner  = n_inner;
  // The base object's own address serves as its id.
  base->id       = ( unsigned long ) base;
  base->m_index  = 0;
  base->n_index  = 0;

  // Element count for the buffer; this call may modify rs and cs.
  elem_count = FLA_compute_num_elem( FLA_Obj_elem_size( *obj ),
                                     m, n, &rs, &cs );

  // Buffer size in bytes.
  n_bytes = ( size_t ) elem_count * ( size_t ) FLA_Obj_elem_size( *obj );

  // Element buffer. With FLA_ENABLE_SCC, FLA_MATRIX element types use
  // FLA_malloc() and every other element type uses FLA_shmalloc().
#ifdef FLA_ENABLE_SCC
  if ( FLA_Obj_elemtype( *obj ) == FLA_MATRIX )
    base->buffer = FLA_malloc( n_bytes );
  else
    base->buffer = FLA_shmalloc( n_bytes );
#else
  base->buffer = FLA_malloc( n_bytes );
#endif
  base->buffer_info = 0;

  // Remember how many elements were allocated; if this turns out to be a
  // FLASH object, that makes freeing the elements easier later on.
  base->n_elem_alloc = elem_count;

  // Strides actually used for the allocation.
  base->rs = rs;
  base->cs = cs;

#ifdef FLA_ENABLE_SUPERMATRIX
  // SuperMatrix bookkeeping starts out empty.
  base->n_read_tasks   = 0;
  base->read_task_head = NULL;
  base->read_task_tail = NULL;
  base->write_task     = NULL;
#endif

  return FLA_SUCCESS;
}