FLA_Error REF_Gemm_nn( FLA_Obj A, FLA_Obj B, FLA_Obj C ) { FLA_Datatype datatype; int m, k, n, ldim_A, ldim_B, ldim_C; datatype = FLA_Obj_datatype( A ); ldim_A = FLA_Obj_ldim( A ); ldim_B = FLA_Obj_ldim( B ); ldim_C = FLA_Obj_ldim( C ); m = FLA_Obj_length( A ); k = FLA_Obj_width( A ); n = FLA_Obj_width( B ); switch( datatype ){ case FLA_DOUBLE: { double *buff_A, *buff_B, *buff_C, d_one=1.0; buff_A = ( double * ) FLA_Obj_buffer_at_view( A ); buff_B = ( double * ) FLA_Obj_buffer_at_view( B ); buff_C = ( double * ) FLA_Obj_buffer_at_view( C ); FLA_C2F( dgemm )( "N", "N", &m, &n, &k, &d_one, buff_A, &ldim_A, buff_B, &ldim_B, &d_one, buff_C, &ldim_C ); } break; } return 0; }
FLA_Error REF_Syrk_ln( FLA_Obj A, FLA_Obj C ) { FLA_Datatype datatype; int k, m, ldim_A, ldim_C; datatype = FLA_Obj_datatype( A ); ldim_A = FLA_Obj_ldim( A ); ldim_C = FLA_Obj_ldim( C ); k = FLA_Obj_width( A ); m = FLA_Obj_length( A ); switch( datatype ){ case FLA_DOUBLE: { double *buff_A, *buff_C, d_one=1.0; buff_A = ( double * ) FLA_Obj_buffer_at_view( A ); buff_C = ( double * ) FLA_Obj_buffer_at_view( C ); dsyrk_( "L", "N", &m, &k, &d_one, buff_A, &ldim_A, &d_one, buff_C, &ldim_C ); } break; } return 0; }
// ============================================================================ void compute_case4a( int size_a, int size_b, int size_c, int size_d, int size_i, int size_j, FLA_Obj cb_A, FLA_Obj cb_B, FLA_Obj cb_C, int print_data ) { int size_ab, size_abc, size_ia, size_iaj, size_jc, size_jci, iter_a, iter_b, iter_c, iter_d, iter_i, iter_j; double * buff_cb_A, * buff_cb_B, * buff_cb_C, ci, * ptr_ai, * ptr_bi; // Some initializations. buff_cb_A = ( double * ) FLA_Obj_buffer_at_view( cb_A ); buff_cb_B = ( double * ) FLA_Obj_buffer_at_view( cb_B ); buff_cb_C = ( double * ) FLA_Obj_buffer_at_view( cb_C ); size_ab = size_a * size_b; size_abc = size_a * size_b * size_c; size_ia = size_i * size_a; size_iaj = size_i * size_a * size_j; size_jc = size_j * size_c; size_jci = size_j * size_c * size_i; // Show data. if( print_data == 1 ) { FLA_Obj_show( " cb_A_i = [ ", cb_A, "%le", " ];" ); FLA_Obj_show( " cb_B_i = [ ", cb_B, "%le", " ];" ); FLA_Obj_show( " cb_C_i = [ ", cb_C, "%le", " ];" ); } // Perform computation. for( iter_a = 0; iter_a < size_a; iter_a++ ) { for( iter_b = 0; iter_b < size_b; iter_b++ ) { for( iter_c = 0; iter_c < size_c; iter_c++ ) { for( iter_d = 0; iter_d < size_d; iter_d++ ) { ci = 0.0; for( iter_j = 0; iter_j < size_j; iter_j++ ) { ptr_ai = & buff_cb_A[ 0 + iter_a * size_i + iter_j * size_ia + iter_b * size_iaj ]; ptr_bi = & buff_cb_B[ iter_j + iter_c * size_j + 0 * size_jc + iter_d * size_jci ]; for( iter_i = 0; iter_i < size_i; iter_i++ ) { ci += ( * ptr_ai ) * ( * ptr_bi ); ptr_ai++; ptr_bi += size_jc; } } buff_cb_C[ iter_a + iter_b * size_a + iter_c * size_ab + iter_d * size_abc ] = ci; } } } } // Show data. if( print_data == 1 ) { FLA_Obj_show( " cb_A_f = [ ", cb_A, "%le", " ];" ); FLA_Obj_show( " cb_B_f = [ ", cb_B, "%le", " ];" ); FLA_Obj_show( " cb_C_f = [ ", cb_C, "%le", " ];" ); } }
int FLAME_invert_ztau( FLA_Obj t ) { dim_t m = FLA_Obj_vector_dim( t ); dim_t inc = FLA_Obj_vector_inc( t ); dcomplex* buff = FLA_Obj_buffer_at_view( t ); double one = 1.0; double conjsign = one; // if conjugate -one; double zero = 0.0; double temp, s, xr_s, xi_s; dcomplex* chi; int i; for ( i = 0; i < m; ++i ) { chi = buff + i*inc; s = bl1_fmaxabs( chi->real, chi->imag ); if ( s != zero ) { xr_s = chi->real / s; xi_s = chi->imag / s; temp = xr_s * chi->real + xi_s * chi->imag; chi->real = xr_s / temp; chi->imag = conjsign * xi_s / temp; } } return 0; }
FLA_Bool FLA_Obj_is_identical( FLA_Obj A, FLA_Obj B ) { FLA_Bool r_val = FALSE; // For LU_piv, if A and B are identical, we do not need copy. // Elemtype should be checked as they can have the same buffer pointer // but elemtype can be either scalar or matrix. if ( A.base != NULL && A.base != NULL ) if ( ( A.base == B.base ) || ( A.base->elemtype == B.base->elemtype && A.base->datatype == B.base->datatype ) ) if ( FLA_Obj_buffer_at_view( A ) == FLA_Obj_buffer_at_view( B ) ) if ( A.m == B.m && A.n == B.n ) r_val = TRUE; return r_val; }
FLA_Bool FLA_Obj_is_overlapped( FLA_Obj A, FLA_Obj B ) { FLA_Bool r_val = FALSE; // For form_Q, if A and B are not overlapped, we do not use in-place forming Q. if ( A.base != NULL && A.base != NULL ) if ( ( A.base == B.base ) || ( A.base->elemtype == B.base->elemtype && A.base->datatype == B.base->datatype ) ) if ( FLA_Obj_buffer_at_view( A ) == FLA_Obj_buffer_at_view( B ) ) if ( ( ( A.offm <= B.offm && B.offm < ( A.offm + A.m ) ) && ( A.offn <= B.offn && B.offn < ( A.offn + A.n ) ) ) || ( ( B.offm <= A.offm && A.offm < ( B.offm + B.m ) ) && ( B.offn <= A.offn && A.offn < ( B.offn + B.n ) ) ) ) r_val = TRUE; return r_val; }
// ============================================================================ void compute_case1( int m, int n, int k, int l, FLA_Obj cb_A, FLA_Obj cb_B, FLA_Obj C, int print_data ) { FLA_Obj slice_A, slice_B; int datatype, h; double * buff_cb_A, * buff_cb_B; // Some initializations. datatype = FLA_Obj_datatype( cb_A ); buff_cb_A = ( double * ) FLA_Obj_buffer_at_view( cb_A ); buff_cb_B = ( double * ) FLA_Obj_buffer_at_view( cb_B ); // Prepare temporal slices. FLA_Obj_create_without_buffer( datatype, m, k, & slice_A ); FLA_Obj_create_without_buffer( datatype, n, k, & slice_B ); // Initialize matrix C for the result. MyFLA_Obj_set_to_zero( C ); // Show data. if( print_data == 1 ) { FLA_Obj_show( " Ci = [ ", C, "%le", " ];" ); FLA_Obj_show( " cb_A = [ ", cb_A, "%le", " ];" ); FLA_Obj_show( " cb_B = [ ", cb_B, "%le", " ];" ); } // Perform computation. for( h = 0; h < l; h++ ) { FLA_Obj_attach_buffer( buff_cb_A + m * k * h, 1, m, & slice_A ); FLA_Obj_attach_buffer( buff_cb_B + n * k * h, 1, n, & slice_B ); FLA_Gemm( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, slice_A, slice_B, FLA_ONE, C ); } // Remove temporal slices. FLA_Obj_free_without_buffer( & slice_A ); FLA_Obj_free_without_buffer( & slice_B ); // Show data. if( print_data == 1 ) { FLA_Obj_show( " Cf = [ ", C, "%le", " ];" ); } }
// ============================================================================ void compute_case5( int m, int n, int k, int l, FLA_Obj cb_A, FLA_Obj cb_B, FLA_Obj C, int print_data ) { int datatype, i, j, kl; double * buff_cb_A, * buff_cb_B, d_one = 1.0; // Some initializations. datatype = FLA_Obj_datatype( cb_A ); buff_cb_A = ( double * ) FLA_Obj_buffer_at_view( cb_A ); buff_cb_B = ( double * ) FLA_Obj_buffer_at_view( cb_B ); // Initialize matrix C for the result. MyFLA_Obj_set_to_zero( C ); // Show data. if( print_data == 1 ) { FLA_Obj_show( " Ci = [ ", C, "%le", " ];" ); FLA_Obj_show( " cb_A = [ ", cb_A, "%le", " ];" ); FLA_Obj_show( " cb_B = [ ", cb_B, "%le", " ];" ); } // Perform computation. kl = k * l; for( i = 0; i < k; i++ ) { for( j = 0; j < l; j++ ) { dger_( & m, & n, & d_one, ( double * ) buff_cb_A + l * m * i + j, & l, ( double * ) buff_cb_B + i + k * j, & kl, ( double * ) FLA_Obj_buffer_at_view( C ), & m ); } } // Show data. if( print_data == 1 ) { FLA_Obj_show( " Cf = [ ", C, "%le", " ];" ); } }
void FLA_CAQR_UT_inc_init_structure( dim_t p, dim_t nb_part, FLA_Obj R ) { dim_t m, n; dim_t rs, cs; dim_t i, j, ip; FLA_Obj* buff_R; m = FLA_Obj_length( R ); n = FLA_Obj_width( R ); rs = FLA_Obj_row_stride( R ); cs = FLA_Obj_col_stride( R ); buff_R = FLA_Obj_buffer_at_view( R ); // Fill in R by row panels. for ( ip = 0; ip < p; ++ip ) { FLA_Obj* buff_R1 = buff_R + (ip*nb_part)*rs; int m_behind = ip*nb_part; int m_ahead = m - m_behind; int m_cur = min( nb_part, m_ahead ); int n_cur = n; // Iterate across columns for the current panel. for ( j = 0; j < n_cur; ++j ) { FLA_Obj* rho = buff_R1 + j*cs; // Mark the above-diagonal blocks as full. for ( i = 0; i < j; ++i ) { rho->base->uplo = FLA_FULL_MATRIX; rho += rs; } // Mark the diagonal block as triangular. rho->base->uplo = FLA_UPPER_TRIANGULAR; rho += rs; // Mark the below-diagonal blocks as zero. for ( i = j + 1; i < m_cur; ++i ) { rho->base->uplo = FLA_ZERO_MATRIX; rho += rs; } } } }
void FLA_Gemm_pack_andor_scale_A( FLA_Trans transA, FLA_Obj alpha, FLA_Obj A, FLA_Obj* packed_A ) { int m, n, ldim_A; double *buff_packed_A, *buff_packed_aligned_A, *buff_A; m = FLA_Obj_length( A ); n = FLA_Obj_width ( A ); ldim_A = FLA_Obj_ldim ( A ); buff_A = ( double* ) FLA_Obj_buffer_at_view( A ); dgemm_itcopy( m, n, buff_A, ldim_A, FLA_Work_buffer_aligned_A ); FLA_Obj_create_without_buffer( FLA_DOUBLE, m, n, packed_A ); // FLA_Obj_attach_buffer( buff_packed_A, n, packed_A ); }
void FLA_Gemm_pack_andor_scale_B( FLA_Trans transB, FLA_Obj alpha, FLA_Obj B, FLA_Obj* packed_B ) { int m, n, ldim_B; double *buff_packed_B, *buff_packed_aligned_B, *buff_B; m = FLA_Obj_length( B ); n = FLA_Obj_width ( B ); ldim_B = FLA_Obj_ldim ( B ); buff_B = ( double* ) FLA_Obj_buffer_at_view( B ); dgemm_oncopy( m, n, buff_B, ldim_B, FLA_Work_buffer_aligned_B ); FLA_Obj_create_without_buffer( FLA_DOUBLE, m, n, packed_B ); // FLA_Obj_attach_buffer( buff_packed_B, m, packed_B ); }
// Transform tau. int FLAME_invert_stau( FLA_Obj t ) { dim_t m = FLA_Obj_vector_dim( t ); dim_t inc = FLA_Obj_vector_inc( t ); float* buff = FLA_Obj_buffer_at_view( t ); float one = 1.0F; float zero = 0.0F; float* chi; int i; for ( i = 0; i < m; ++i ) { chi = buff + i*inc; if ( *chi != zero ) *chi = ( one / *chi ); } return 0; }
FLA_Error REF_Chol( int time_lapack, FLA_Obj A, int nb_alg ) { int n, ldim_A, info; double *buff_A, sqrt(); n = FLA_Obj_length( A ); ldim_A = FLA_Obj_col_stride( A ); buff_A = (double *) FLA_Obj_buffer_at_view( A ); if ( time_lapack ){ // dpotrfx_( "lower", &n, buff_A, &ldim_A, &info, &nb_alg ); } else{ int i, j, k; for ( j=0; j<n; j++ ){ /* alpha11 = sqrt( alpha11 ) */ AA( j, j ) = sqrt( AA( j, j ) ); /* a21 = a21 / alpha11 */ for ( i=j+1; i<n; i++ ) AA( i,j ) = AA( i,j ) / AA( j,j ); /* A22 = A22 - tril( a21 * a21') */ for ( k=j+1; k<n; k++ ) for ( i=k; i<n; i++ ) AA( i,k ) = AA( i,k ) - AA( i,j ) * AA( k,j ); } } return FLA_SUCCESS; }
void FLA_Gemm_kernel( FLA_Obj alpha, FLA_Obj packed_A, FLA_Obj packed_B, FLA_Obj packed_C ) { int m, n, k, ldim_C; double alpha_value, *buff_A, *buff_B, *buff_C, *buff_aligned_A, *buff_aligned_B; m = FLA_Obj_length( packed_C ); n = FLA_Obj_width ( packed_C ); k = FLA_Obj_length( packed_A ); ldim_C = FLA_Obj_ldim ( packed_C ); // buff_A = ( double * ) FLA_Obj_buffer_at_view( packed_A ); // buff_B = ( double * ) FLA_Obj_buffer_at_view( packed_B ); buff_C = ( double* ) FLA_Obj_buffer_at_view( packed_C ); alpha_value = FLA_DOUBLE_VALUE( alpha ); dgemm_kernel( m, n, k, alpha_value, FLA_Work_buffer_aligned_A, FLA_Work_buffer_aligned_B, buff_C, ldim_C ); }
int main( int argc, char *argv[] ) { int i, j, size, n_threads, n_repeats, n_trials, nb_alg, increment, begin; FLA_Datatype datatype = FLA_DOUBLE; FLA_Obj A; double b_norm_value = 0.0, dtime, *dtimes, *flops, *T; char output_file_m[100]; FILE *fpp; fprintf( stdout, "%c Enter number of repeats: ", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c Enter blocksize: ", '%' ); scanf( "%d", &nb_alg ); fprintf( stdout, "%c %d\n", '%', nb_alg ); fprintf( stdout, "%c Enter problem size parameters: first, inc, num: ", '%' ); scanf( "%d%d%d", &begin, &increment, &n_trials ); fprintf( stdout, "%c %d %d %d\n", '%', begin, increment, n_trials ); fprintf( stdout, "%c Enter number of threads: ", '%' ); scanf( "%d", &n_threads ); fprintf( stdout, "%c %d\n\n", '%', n_threads ); sprintf( output_file_m, "%s/%s_output.m", OUTPUT_PATH, OUTPUT_FILE ); fpp = fopen( output_file_m, "a" ); fprintf( fpp, "%%\n" ); fprintf( fpp, "%% | Matrix Size | PLASMA |\n" ); fprintf( fpp, "%% | n x n | GFlops |\n" ); fprintf( fpp, "%% -----------------------------\n" ); FLA_Init(); PLASMA_Init( n_threads ); PLASMA_Disable( PLASMA_AUTOTUNING ); PLASMA_Set( PLASMA_TILE_SIZE, nb_alg ); PLASMA_Set( PLASMA_INNER_BLOCK_SIZE, nb_alg / 4 ); dtimes = ( double * ) FLA_malloc( n_repeats * sizeof( double ) ); flops = ( double * ) FLA_malloc( n_trials * sizeof( double ) ); fprintf( fpp, "%s = [\n", OUTPUT_FILE ); for ( i = 0; i < n_trials; i++ ) { size = begin + i * increment; FLA_Obj_create( datatype, size, size, 0, 0, &A ); for ( j = 0; j < n_repeats; j++ ) { FLA_Random_matrix( A ); PLASMA_Alloc_Workspace_dgeqrf( size, size, &T ); dtime = FLA_Clock(); PLASMA_dgeqrf( size, size, FLA_Obj_buffer_at_view( A ), size, T ); dtime = FLA_Clock() - dtime; dtimes[j] = dtime; free( T ); } dtime = dtimes[0]; for ( j = 1; j < n_repeats; j++ ) dtime = min( dtime, dtimes[j] ); flops[i] = 4.0 / 3.0 * size * size * size / dtime / 1e9; fprintf( fpp, " %d %6.3f\n", size, flops[i] ); printf( "Time: %e | GFlops: %6.3f\n", dtime, flops[i] ); printf( "Matrix size: %d x %d | nb_alg: %d\n", size, size, nb_alg ); printf( "Norm of difference: %le\n\n", b_norm_value ); FLA_Obj_free( &A ); } fprintf( fpp, "];\n" ); fflush( fpp ); fclose( fpp ); FLA_free( dtimes ); FLA_free( flops ); PLASMA_Finalize(); FLA_Finalize(); return 0; }
// ============================================================================ void compute_case2b( int size_a, int size_b, int size_c, int size_d, int size_i, int size_j, FLA_Obj cb_A, FLA_Obj cb_B, FLA_Obj cb_C, int print_data ) { FLA_Obj slice_A, slice_B, slice_C; int datatype, size_ab, size_abc, size_ia, size_iaj, size_jc, size_jci, iter_a, iter_b, iter_c, iter_d, iter_i, iter_j, ii, jj, ldim_slice_B; size_t idx_A, idx_B, idx_C; double * buff_cb_A, * buff_cb_B, * buff_cb_C, * buff_slice_B; // Some initializations. datatype = FLA_Obj_datatype( cb_A ); buff_cb_A = ( double * ) FLA_Obj_buffer_at_view( cb_A ); buff_cb_B = ( double * ) FLA_Obj_buffer_at_view( cb_B ); buff_cb_C = ( double * ) FLA_Obj_buffer_at_view( cb_C ); size_ab = size_a * size_b; size_abc = size_a * size_b * size_c; size_ia = size_i * size_a; size_iaj = size_i * size_a * size_j; size_jc = size_j * size_c; size_jci = size_j * size_c * size_i; // Show data. if( print_data == 1 ) { FLA_Obj_show( " cb_A_i = [ ", cb_A, "%le", " ];" ); FLA_Obj_show( " cb_B_i = [ ", cb_B, "%le", " ];" ); FLA_Obj_show( " cb_C_i = [ ", cb_C, "%le", " ];" ); } // Prepare temporal slices without buffer. FLA_Obj_create_without_buffer( datatype, size_i, size_a, & slice_A ); FLA_Obj_create( datatype, size_i, size_d, 0, 0, & slice_B ); FLA_Obj_create_without_buffer( datatype, size_a, size_d, & slice_C ); // Perform computation. for( iter_b = 0; iter_b < size_b; iter_b++ ) { for( iter_c = 0; iter_c < size_c; iter_c++ ) { iter_a = 0; iter_d = 0; iter_i = 0; idx_C = ( ( size_t ) iter_a ) + ( ( size_t ) iter_b * size_a ) + ( ( size_t ) iter_c * size_ab ) + ( ( size_t ) iter_d * size_abc ), FLA_Obj_attach_buffer( & buff_cb_C[ idx_C ], 1, size_abc, & slice_C ); MyFLA_Obj_set_to_zero( slice_C ); for( iter_j = 0; iter_j < size_j; iter_j++ ) { // Define Ai. idx_A = ( ( size_t ) iter_i ) + ( ( size_t ) iter_a * size_i ) + ( ( size_t ) iter_j * size_ia ) + ( ( size_t ) iter_b * size_iaj ); FLA_Obj_attach_buffer( & buff_cb_A[ idx_A ], 1, size_i, & slice_A ); // Define Bi. buff_slice_B = ( double * ) FLA_Obj_buffer_at_view( slice_B ); ldim_slice_B = FLA_Obj_col_stride( slice_B ); for( jj = 0; jj < size_d; jj++ ) { for( ii = 0; ii < size_i; ii++ ) { idx_B = ( ( size_t ) iter_j ) + ( ( size_t ) iter_c * size_j ) + ( ( size_t ) ii * size_jc ) + ( ( size_t ) jj * size_jci ); buff_slice_B[ ii + jj * ldim_slice_B ] = buff_cb_B[ idx_B ]; } } // Compute Ai' * Bi. FLA_Gemm( FLA_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, slice_A, slice_B, FLA_ONE, slice_C ); } } } // Show data. if( print_data == 1 ) { FLA_Obj_show( " cb_A_f = [ ", cb_A, "%le", " ];" ); FLA_Obj_show( " cb_B_f = [ ", cb_B, "%le", " ];" ); FLA_Obj_show( " cb_C_f = [ ", cb_C, "%le", " ];" ); } // Remove temporal slices. FLA_Obj_free_without_buffer( & slice_A ); FLA_Obj_free( & slice_B ); FLA_Obj_free_without_buffer( & slice_C ); }
int main(int argc, char *argv[]) { int n, nfirst, nlast, ninc, nlast_unb, i, irep, nrepeats, nb_alg; double dtime, dtime_best, gflops, max_gflops, diff, d_n; FLA_Obj A, Aref, Aold, delta; /* Initialize FLAME */ FLA_Init( ); /* Every time trial is repeated "repeat" times and the fastest run in recorded */ printf( "%% number of repeats:" ); scanf( "%d", &nrepeats ); printf( "%% %d\n", nrepeats ); /* Enter the max GFLOPS attainable This is used to set the y-axis range for the graphs. Here is how you figure out what to enter (on Linux machines): 1) more /proc/cpuinfo (this lists the contents of this file). 2) read through this and figure out the clock rate of the machine (in GHz). 3) Find out (from an expert of from the web) the number of floating point instructions that can be performed per core per clock cycle. 4) Figure out if you are using "multithreaded BLAS" which automatically parallelize calls to the Basic Linear Algebra Subprograms. If so, check how many cores are available. 5) Multiply 2) x 3) x 4) and enter this in response to the below. If you enter a value for max GFLOPS that is lower that the maximum that is observed in the experiments, then the top of the graph is set to the observed maximum. Thus, one possibility is to simply set this to 0.0. */ printf( "%% enter max GFLOPS:" ); scanf( "%lf", &max_gflops ); printf( "%% %lf\n", max_gflops ); /* Enter the algorithmic block size */ printf( "%% enter nb_alg:" ); scanf( "%d", &nb_alg ); printf( "%% %d\n", nb_alg ); /* Timing trials for matrix sizes n=nfirst to nlast in increments of ninc will be performed. Unblocked versions are only tested to nlast_unb */ printf( "%% enter nfirst, nlast, ninc, nlast_unb:" ); scanf( "%d%d%d%d", &nfirst, &nlast, &ninc, &nlast_unb ); printf( "%% %d %d %d %d\n", nfirst, nlast, ninc, nlast_unb ); i = 1; for ( n=nfirst; n<= nlast; n+=ninc ){ /* Allocate space for the matrices */ FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &A ); FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &Aref ); FLA_Obj_create( FLA_DOUBLE, n, n, 1, n, &Aold ); FLA_Obj_create( FLA_DOUBLE, 1, 1, 1, 1, &delta ); /* Generate random matrix A and save in Aold */ FLA_Random_matrix( Aold ); /* Add something large to the diagonal to make sure it isn't ill-conditionsed */ d_n = ( double ) n; *( ( double * ) FLA_Obj_buffer_at_view( delta ) ) = d_n; FLA_Shift_diag( FLA_NO_CONJUGATE, delta, Aold ); /* Set gflops = billions of floating point operations that will be performed */ gflops = 1.0/3.0 * n * n * n * 1.0e-09; /* Time the reference implementation */ #if TIME_LAPACK == TRUE #else // if ( n <= nlast_unb ) #endif { for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, Aref ); dtime = FLA_Clock(); REF_Chol( TIME_LAPACK, Aref, nb_alg ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } printf( "data_REF( %d, 1:2 ) = [ %d %le ];\n", i, n, gflops / dtime_best ); fflush( stdout ); } /* Time FLA_Chol */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); FLA_Chol( FLA_LOWER_TRIANGULAR, A ); dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } printf( "data_FLAME( %d, 1:2 ) = [ %d %le ];\n", i, n, gflops / dtime_best ); if ( gflops / dtime_best > max_gflops ) max_gflops = gflops / dtime_best; fflush( stdout ); /* Time the your implementations */ /* Variant 1 unblocked */ if ( n <= nlast_unb ){ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); #if TIME_UNB_VAR1 == TRUE Chol_unb_var1( A ); #else REF_Chol( TIME_LAPACK, A, nb_alg ); #endif dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_unb_var1( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); } /* Variant 1 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); #if TIME_BLK_VAR1 == TRUE Chol_blk_var1( A, nb_alg ); #else REF_Chol( TIME_LAPACK, A, nb_alg ); #endif dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_blk_var1( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); /* Variant 2 unblocked */ if ( n <= nlast_unb ){ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); #if TIME_UNB_VAR2 == TRUE Chol_unb_var2( A ); #else REF_Chol( TIME_LAPACK, A, nb_alg ); #endif dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_unb_var2( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); } /* Variant 2 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); #if TIME_BLK_VAR2 == TRUE Chol_blk_var2( A, nb_alg ); #else REF_Chol( TIME_LAPACK, A, nb_alg ); #endif dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_blk_var2( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); /* Variant 3 unblocked */ if ( n <= nlast_unb ){ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); #if TIME_UNB_VAR3 == TRUE Chol_unb_var3( A ); #else REF_Chol( TIME_LAPACK, A, nb_alg ); #endif dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_unb_var3( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); } /* Variant 3 blocked */ for ( irep=0; irep<nrepeats; irep++ ){ FLA_Copy( Aold, A ); dtime = FLA_Clock(); #if TIME_BLK_VAR3 == TRUE Chol_blk_var3( A, nb_alg ); #else REF_Chol( TIME_LAPACK, A, nb_alg ); #endif dtime = FLA_Clock() - dtime; if ( irep == 0 ) dtime_best = dtime; else dtime_best = ( dtime < dtime_best ? dtime : dtime_best ); } diff = FLA_Max_elemwise_diff( A, Aref ); printf( "data_blk_var3( %d, 1:3 ) = [ %d %le %le];\n", i, n, gflops / dtime_best, diff ); fflush( stdout ); FLA_Obj_free( &A ); FLA_Obj_free( &Aold ); FLA_Obj_free( &Aref ); FLA_Obj_free( &delta ); printf( "\n" ); i++; } /* Print the MATLAB commands to plot the data */ /* Delete all existing figures */ printf( "close all\n" ); #if OCTAVE == TRUE /* Plot the performance of FLAME */ printf( "plot( data_FLAME( :,1 ), data_FLAME( :, 2 ), '-k;libflame;' ); \n" ); /* Indicate that you want to add to the existing plot */ printf( "hold on\n" ); /* Plot the performance of the reference implementation */ printf( "plot( data_REF( :,1 ), data_REF( :, 2 ), '-m;reference;' ); \n" ); /* Plot the performance of your implementations */ printf( "plot( data_unb_var1( :,1 ), data_unb_var1( :, 2 ), \"-rx;UnbVar1;\" ); \n" ); printf( "plot( data_unb_var2( :,1 ), data_unb_var2( :, 2 ), \"-go;UnbVar2;\" ); \n" ); printf( "plot( data_unb_var3( :,1 ), data_unb_var3( :, 2 ), \"-b*;UnbVar3;\" ); \n" ); printf( "plot( data_blk_var1( :,1 ), data_blk_var1( :, 2 ), \"-rx;BlkVar1;\", \"markersize\", 3 ); \n" ); printf( "plot( data_blk_var2( :,1 ), data_blk_var2( :, 2 ), \"-go;BlkVar2;\", \"markersize\", 3 ); \n" ); printf( "plot( data_blk_var3( :,1 ), data_blk_var3( :, 2 ), \"-b*;BlkVar3;\", \"markersize\", 3 ); \n" ); #else /* Plot the performance of FLAME */ printf( "plot( data_FLAME( :,1 ), data_FLAME( :, 2 ), 'k--' ); \n" ); /* Indicate that you want to add to the existing plot */ printf( "hold on\n" ); /* Plot the performance of the reference implementation */ printf( "plot( data_REF( :,1 ), data_REF( :, 2 ), 'k-' ); \n" ); /* Plot the performance of your implementations */ printf( "plot( data_unb_var1( :,1 ), data_unb_var1( :, 2 ), 'r-.x' ); \n" ); printf( "plot( data_unb_var2( :,1 ), data_unb_var2( :, 2 ), 'g-.o' ); \n" ); printf( "plot( data_unb_var3( :,1 ), data_unb_var3( :, 2 ), 'b-.*' ); \n" ); printf( "plot( data_blk_var1( :,1 ), data_blk_var1( :, 2 ), 'r-x'); \n" ); printf( "plot( data_blk_var2( :,1 ), data_blk_var2( :, 2 ), 'g-o'); \n" ); printf( "plot( data_blk_var3( :,1 ), data_blk_var3( :, 2 ), 'b-*'); \n" ); #endif printf( "hold off \n"); printf( "xlabel( 'matrix dimension m=n' );\n"); printf( "ylabel( 'GFLOPS/sec.' );\n"); printf( "axis( [ 0 %d 0 %3.1f ] ); \n", nlast, max_gflops ); #if OCTAVE == TRUE printf( "legend( 2 ); \n" ); printf(" print -landscape -solid -color -deps -F:24 Chol.eps\n" ); #else printf( "legend( 'FLA Chol', ...\n"); printf( " 'Simple loops', ...\n"); printf( " 'unb var1', ...\n"); printf( " 'unb var2', ...\n"); printf( " 'unb var3', ...\n"); printf( " 'blk var1', ...\n"); printf( " 'blk var2', ...\n"); printf( " 'blk var3', 2);\n"); printf( "print -r100 -dpdf Chol.pdf\n"); #endif FLA_Finalize( ); exit( 0 ); }
int main( int argc, char** argv ) { FLA_Datatype datatype = TESTTYPE; FLA_Datatype realtype = REALTYPE; FLA_Obj A, TU, TV, A_copy, A_recovered, U, V, Vb, B, Be, d, e, DU, DV; FLA_Obj ATL, ATR, ABL, ABR, Ae; FLA_Uplo uplo; dim_t m, n, min_m_n; FLA_Error init_result; double residual_A = 0.0; if ( argc == 3 ) { m = atoi(argv[1]); n = atoi(argv[2]); min_m_n = min(m,n); } else { fprintf(stderr, " \n"); fprintf(stderr, "Usage: %s m n\n", argv[0]); fprintf(stderr, " m : matrix length\n"); fprintf(stderr, " n : matrix width\n"); fprintf(stderr, " \n"); return -1; } if ( m == 0 || n == 0 ) return 0; FLA_Init_safe( &init_result ); // FLAME Bidiag setup FLA_Obj_create( datatype, m, n, 0, 0, &A ); FLA_Bidiag_UT_create_T( A, &TU, &TV ); // Rand A and create A_copy. FLA_Random_matrix( A ); { scomplex *buff_A = FLA_Obj_buffer_at_view( A ); buff_A[0].real = 4.4011e-01; buff_A[0].imag = -4.0150e-09; buff_A[2].real = -2.2385e-01; buff_A[2].imag = -1.5546e-01; buff_A[4].real = -6.3461e-02; buff_A[4].imag = 2.7892e-01; buff_A[6].real = -1.3197e-01; buff_A[6].imag = 5.0888e-01; buff_A[1].real = 3.3352e-01; buff_A[1].imag = -6.6346e-02; buff_A[3].real = -1.9307e-01; buff_A[3].imag = -8.4066e-02; buff_A[5].real = -6.0446e-03; buff_A[5].imag = 2.2094e-01; buff_A[7].real = -2.3299e-02; buff_A[7].imag = 4.0553e-01; } //FLA_Set_to_identity( A ); //FLA_Scal( FLA_MINUS_ONE, A ); if ( m >= n ) { uplo = FLA_UPPER_TRIANGULAR; FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, min_m_n - 1, 1, FLA_TL ); Ae = ATR; } else { uplo = FLA_LOWER_TRIANGULAR; FLA_Part_2x2( A, &ATL, &ATR, &ABL, &ABR, 1, min_m_n - 1, FLA_TL ); Ae = ABL; } FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_copy ); FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &A_recovered ); // Bidiag test { FLA_Obj norm; FLA_Bool apply_scale; FLA_Obj_create( realtype, 1,1, 0,0, &norm ); FLA_Max_abs_value( A, norm ); apply_scale = FLA_Obj_gt( norm, FLA_OVERFLOW_SQUARE_THRES ); if ( apply_scale ) FLA_Scal( FLA_SAFE_MIN, A ); FLA_Bidiag_UT( A, TU, TV ); if ( apply_scale ) FLA_Bidiag_UT_scale_diagonals( FLA_SAFE_INV_MIN, A ); FLA_Obj_free( &norm ); } // Orthonomal basis U, V. FLA_Obj_create( datatype, m, min_m_n, 0, 0, &U ); FLA_Set( FLA_ZERO, U ); FLA_Obj_create( datatype, min_m_n, n, 0, 0, &V ); FLA_Set( FLA_ZERO, V ); FLA_Bidiag_UT_form_U_ext( uplo, A, TU, FLA_NO_TRANSPOSE, U ); FLA_Bidiag_UT_form_V_ext( uplo, A, TV, FLA_CONJ_TRANSPOSE, V ); if ( FLA_Obj_is_complex( A ) ){ FLA_Obj rL, rR; FLA_Obj_create( datatype, min_m_n, 1, 0, 0, &rL ); FLA_Obj_create( datatype, min_m_n, 1, 0, 0, &rR ); FLA_Obj_fshow( stdout, " - Factor no realified - ", A, "% 6.4e", "------"); FLA_Bidiag_UT_realify( A, rL, rR ); FLA_Obj_fshow( stdout, " - Factor realified - ", A, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - rL - ", rL, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - rR - ", rR, "% 6.4e", "------"); FLA_Apply_diag_matrix( FLA_RIGHT, FLA_CONJUGATE, rL, U ); FLA_Apply_diag_matrix( FLA_LEFT, FLA_CONJUGATE, rR, V ); FLA_Obj_free( &rL ); FLA_Obj_free( &rR ); } // U^H U FLA_Obj_create( datatype, min_m_n, min_m_n, 0, 0, &DU ); FLA_Gemm_external( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, U, U, FLA_ZERO, DU ); // V^H V FLA_Obj_create( datatype, min_m_n, min_m_n, 0, 0, &DV ); FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE, FLA_ONE, V, V, FLA_ZERO, DV ); // Recover the matrix FLA_Obj_create( datatype, min_m_n, min_m_n, 0, 0, &B ); FLA_Set( FLA_ZERO, B ); // Set B FLA_Obj_create( datatype, min_m_n, 1, 0, 0, &d ); FLA_Set_diagonal_vector( A, d ); FLA_Set_diagonal_matrix( d, B ); FLA_Obj_free( &d ); if ( min_m_n > 1 ) { FLA_Obj_create( datatype, min_m_n - 1 , 1, 0, 0, &e ); FLA_Set_diagonal_vector( Ae, e ); if ( uplo == FLA_UPPER_TRIANGULAR ) { FLA_Part_2x2( B, &ATL, &ATR, &ABL, &ABR, min_m_n - 1, 1, FLA_TL ); Be = ATR; } else { FLA_Part_2x2( B, &ATL, &ATR, &ABL, &ABR, 1, min_m_n - 1, FLA_TL ); Be = ABL; } FLA_Set_diagonal_matrix( e, Be ); FLA_Obj_free( &e ); } // Vb := B (V^H) FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, V, &Vb ); FLA_Trmm_external( FLA_LEFT, uplo, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, FLA_ONE, B, Vb ); // A := U Vb FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, U, Vb, FLA_ZERO, A_recovered ); residual_A = FLA_Max_elemwise_diff( A_copy, A_recovered ); if (1) { FLA_Obj_fshow( stdout, " - Given - ", A_copy, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Factor - ", A, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - TU - ", TU, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - TV - ", TV, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - B - ", B, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - U - ", U, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - V - ", V, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Vb - ", Vb, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - U'U - ", DU, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - VV' - ", DV, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - Recovered A - ", A_recovered, "% 6.4e", "------"); fprintf( stdout, "lapack2flame: %lu x %lu: ", m, n); fprintf( stdout, "recovery A = %12.10e\n\n", residual_A ) ; } FLA_Obj_free( &A ); FLA_Obj_free( &TU ); FLA_Obj_free( &TV ); FLA_Obj_free( &B ); FLA_Obj_free( &U ); FLA_Obj_free( &V ); FLA_Obj_free( &Vb ); FLA_Obj_free( &DU ); FLA_Obj_free( &DV ); FLA_Obj_free( &A_copy ); FLA_Obj_free( &A_recovered ); FLA_Finalize_safe( init_result ); }
int main( int argc, char** argv ) { FLA_Datatype datatype = TESTTYPE; FLA_Obj A, A_flame, A_lapack, C; int m; FLA_Error init_result; FLA_Obj TU, TV, U_flame, V_flame, d_flame, e_flame, B_flame; FLA_Obj tauq, taup, d_lapack, e_lapack, U_lapack, V_lapack, W, B_lapack; testtype *buff_tauq, *buff_taup, *buff_d_lapack, *buff_e_lapack, *buff_W, *buff_A_lapack, *buff_U_lapack, *buff_V_lapack; int lwork, info, is_flame; if ( argc == 3 ) { m = atoi(argv[1]); is_flame = atoi(argv[2]); } else { fprintf(stderr, " \n"); fprintf(stderr, "Usage: %s m is_flame\n", argv[0]); fprintf(stderr, " m : matrix length\n"); fprintf(stderr, " is_flame : 1 yes, 0 no\n"); fprintf(stderr, " \n"); return -1; } if ( m == 0 ) return 0; FLA_Init_safe( &init_result ); fprintf( stdout, "lapack2flame: %d x %d: \n", m, m); FLA_Obj_create( datatype, m, m, 0, 0, &A ); FLA_Random_matrix( A ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_flame ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A, &A_lapack ); FLA_Obj_create( datatype, m, m, 0, 0, &C ); FLA_Random_matrix( C ); if ( is_flame ) { fprintf( stdout, " flame executed\n"); FLA_Bidiag_UT_create_T( A_flame, &TU, &TV ); FLA_Bidiag_UT( A_flame, TU, TV ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A_flame, &U_flame ); FLA_Obj_create_copy_of( FLA_NO_TRANSPOSE, A_flame, &V_flame ); FLA_Bidiag_UT_form_U( U_flame, TU, U_flame ); FLA_Bidiag_UT_form_V( V_flame, TV, V_flame ); FLA_Obj_create( datatype, m, 1, 0, 0, &d_flame ); FLA_Obj_create( datatype, m - 1, 1, 0, 0, &e_flame ); FLA_Bidiag_UT_extract_diagonals( A_flame, d_flame, e_flame ); FLA_Obj_create( datatype, m, m, 0, 0, &B_flame ); FLA_Set( FLA_ZERO, B_flame ); { FLA_Obj BTL, BTR, BBL, BBR; FLA_Part_2x2( B_flame, &BTL, &BTR, &BBL, &BBR, 1,1, FLA_BL ); FLA_Set_diagonal_matrix( d_flame, B_flame ); FLA_Set_diagonal_matrix( e_flame, BTR ); } if (1) { fprintf( stdout, " - FLAME ----------\n"); FLA_Obj_fshow( stdout, " - Given A - ", A, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - A - ", A_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - U - ", U_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - V - ", V_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - d - ", d_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - e - ", e_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - B - ", B_flame, "% 6.4e", "------"); } } else { fprintf( stdout, " lapack executed\n"); FLA_Obj_create( datatype, m, 1, 0, 0, &tauq ); FLA_Obj_create( datatype, m, 1, 0, 0, &taup ); FLA_Obj_create( datatype, m, 1, 0, 0, &d_lapack ); FLA_Obj_create( datatype, m - 1, 1, 0, 0, &e_lapack ); buff_A_lapack = (testtype*)FLA_Obj_buffer_at_view( A_lapack ); buff_tauq = (testtype*)FLA_Obj_buffer_at_view( tauq ); buff_taup = (testtype*)FLA_Obj_buffer_at_view( taup ); buff_d_lapack = (testtype*)FLA_Obj_buffer_at_view( d_lapack ); buff_e_lapack = (testtype*)FLA_Obj_buffer_at_view( e_lapack ); lwork = 32*m; FLA_Obj_create( datatype, lwork, 1, 0, 0, &W ); buff_W = (testtype*)FLA_Obj_buffer_at_view( W ); sgebrd_( &m, &m, buff_A_lapack, &m, buff_d_lapack, buff_e_lapack, buff_tauq, buff_taup, buff_W, &lwork, &info ); FLA_Obj_create( datatype, m, m, 0, 0, &U_lapack ); FLA_Obj_create( datatype, m, m, 0, 0, &V_lapack ); FLA_Copy( A_lapack, U_lapack ); FLA_Copy( A_lapack, V_lapack ); buff_U_lapack = (testtype*)FLA_Obj_buffer_at_view( U_lapack ); buff_V_lapack = (testtype*)FLA_Obj_buffer_at_view( V_lapack ); sorgbr_( "Q", &m, &m, &m, buff_U_lapack, &m, buff_tauq, buff_W, &lwork, &info ); sorgbr_( "P", &m, &m, &m, buff_V_lapack, &m, buff_taup, buff_W, &lwork, &info ); FLA_Obj_create( datatype, m, m, 0, 0, &B_lapack ); FLA_Set( FLA_ZERO, B_lapack ); { FLA_Obj BTL, BTR, BBL, BBR; FLA_Part_2x2( B_lapack, &BTL, &BTR, &BBL, &BBR, 1,1, FLA_BL ); FLA_Set_diagonal_matrix( d_lapack, B_lapack ); FLA_Set_diagonal_matrix( e_lapack, BTR ); } FLA_Obj_free( &W ); if (1) { fprintf( stdout, " - LAPACK ----------\n"); FLA_Obj_fshow( stdout, " - Given A - ", A, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - A - ", A_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - U - ", U_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - V - ", V_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - d - ", d_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - e - ", e_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - B - ", B_lapack, "% 6.4e", "------"); } } { testtype dummy; int zero = 0, one = 1; FLA_Obj D_lapack; FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, A, &D_lapack ); FLA_Set( FLA_ZERO, D_lapack ); if ( is_flame ) { buff_d_lapack = (testtype*)FLA_Obj_buffer_at_view( d_flame ); buff_e_lapack = (testtype*)FLA_Obj_buffer_at_view( e_flame ); buff_U_lapack = (testtype*)FLA_Obj_buffer_at_view( U_flame ); buff_V_lapack = (testtype*)FLA_Obj_buffer_at_view( V_flame ); } FLA_Obj_create( datatype, 4*m, 1, 0, 0, &W ); buff_W = (testtype*)FLA_Obj_buffer_at_view( W ); sbdsqr_( "U", &m, &m, &m, &zero, buff_d_lapack, buff_e_lapack, buff_V_lapack, &m, buff_U_lapack, &m, &dummy, &one, buff_W, &info ); FLA_Obj_free( &W ); if (info != 0) printf( " Error info = %d\n", info ); if ( is_flame ) FLA_Set_diagonal_matrix( d_flame, D_lapack ); else FLA_Set_diagonal_matrix( d_lapack, D_lapack ); if ( is_flame ) { fprintf( stdout, " - FLAME ----------\n"); FLA_Obj_fshow( stdout, " - U - ", U_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - V - ", V_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - d - ", d_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - e - ", e_flame, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - D - ", D_lapack, "% 6.4e", "------"); } else { fprintf( stdout, " - LAPACK ----------\n"); FLA_Obj_fshow( stdout, " - U - ", U_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - V - ", V_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - d - ", d_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - e - ", e_lapack, "% 6.4e", "------"); FLA_Obj_fshow( stdout, " - D - ", D_lapack, "% 6.4e", "------"); } FLA_Obj_free( &D_lapack ); } if ( is_flame ) { FLA_Obj_free( &TU ); FLA_Obj_free( &TV ); FLA_Obj_free( &U_flame ); FLA_Obj_free( &V_flame ); FLA_Obj_free( &d_flame ); FLA_Obj_free( &e_flame ); FLA_Obj_free( &B_flame ); } else { FLA_Obj_free( &tauq ); FLA_Obj_free( &taup ); FLA_Obj_free( &d_lapack ); FLA_Obj_free( &e_lapack ); FLA_Obj_free( &U_lapack ); FLA_Obj_free( &V_lapack ); FLA_Obj_free( &B_lapack ); } FLA_Obj_free( &A ); FLA_Obj_free( &A_flame ); FLA_Obj_free( &A_lapack ); FLA_Obj_free( &C ); FLA_Finalize_safe( init_result ); }
// ============================================================================ void compute_case4b( int size_a, int size_b, int size_c, int size_d, int size_i, int size_j, FLA_Obj cb_A, FLA_Obj cb_B, FLA_Obj cb_C, int print_data ) { FLA_Obj slice_C; int datatype, size_ab, size_abc, size_ia, size_iaj, size_jc, size_jci, iter_a, iter_b, iter_c, iter_d, iter_i, iter_j, ldim_slice_C; size_t idx_A, idx_B, idx_C; double * buff_cb_A, * buff_cb_B, * buff_cb_C, * buff_slice_C, d_one = 1.0; // Some initializations. datatype = FLA_Obj_datatype( cb_A ); buff_cb_A = ( double * ) FLA_Obj_buffer_at_view( cb_A ); buff_cb_B = ( double * ) FLA_Obj_buffer_at_view( cb_B ); buff_cb_C = ( double * ) FLA_Obj_buffer_at_view( cb_C ); size_ab = size_a * size_b; size_abc = size_a * size_b * size_c; size_ia = size_i * size_a; size_iaj = size_i * size_a * size_j; size_jc = size_j * size_c; size_jci = size_j * size_c * size_i; // Show data. if( print_data == 1 ) { FLA_Obj_show( " cb_A_i = [ ", cb_A, "%le", " ];" ); FLA_Obj_show( " cb_B_i = [ ", cb_B, "%le", " ];" ); FLA_Obj_show( " cb_C_i = [ ", cb_C, "%le", " ];" ); } // Prepare temporal slices without buffer. FLA_Obj_create_without_buffer( datatype, size_a, size_c, & slice_C ); #if 0 FLA_Obj_create_without_buffer( datatype, size_a, 1, & slice_A ); FLA_Obj_create_without_buffer( datatype, size_c, 1, & slice_B ); #endif // Perform computation. for( iter_b = 0; iter_b < size_b; iter_b++ ) { for( iter_d = 0; iter_d < size_d; iter_d++ ) { // Define slice_C. iter_a = 0; iter_c = 0; idx_C = ( ( size_t ) iter_a ) + ( ( size_t ) iter_b * size_a ) + ( ( size_t ) iter_c * size_ab ) + ( ( size_t ) iter_d * size_abc ); FLA_Obj_attach_buffer( & buff_cb_C[ idx_C ], 1, size_ab, & slice_C ); buff_slice_C = ( double * ) FLA_Obj_buffer_at_view( slice_C ); ldim_slice_C = FLA_Obj_col_stride( slice_C ); // Initialize slice_C. MyFLA_Obj_set_to_zero( slice_C ); for( iter_i = 0; iter_i < size_i; iter_i++ ) { for( iter_j = 0; iter_j < size_j; iter_j++ ) { #if 0 // Define slice_A. FLA_Obj_attach_buffer( & buff_cb_A[ iter_i + 0 * size_i + iter_j * size_ia + iter_b * size_iaj ], size_i, 1, & slice_A ); // Define slice_B. FLA_Obj_attach_buffer( & buff_cb_B[ iter_j + 0 * size_j + iter_i * size_jc + iter_d * size_jci ], size_j, 1, & slice_B ); // Compute DGER operation. FLA_Ger( FLA_ONE, slice_A, slice_B, slice_C ); #endif idx_A = ( ( size_t ) iter_i ) + ( ( size_t ) 0 * size_i ) + ( ( size_t ) iter_j * size_ia ) + ( ( size_t ) iter_b * size_iaj ); idx_B = ( ( size_t ) iter_j ) + ( ( size_t ) 0 * size_j ) + ( ( size_t ) iter_i * size_jc ) + ( ( size_t ) iter_d * size_jci ); dger_( & size_a, & size_c, & d_one, & buff_cb_A[ idx_A ], & size_i, & buff_cb_B[ idx_B ], & size_j, buff_slice_C, & ldim_slice_C ); } } } } // Show data. if( print_data == 1 ) { FLA_Obj_show( " cb_A_f = [ ", cb_A, "%le", " ];" ); FLA_Obj_show( " cb_B_f = [ ", cb_B, "%le", " ];" ); FLA_Obj_show( " cb_C_f = [ ", cb_C, "%le", " ];" ); } // Remove temporal slices. FLA_Obj_free_without_buffer( & slice_C ); #if 0 FLA_Obj_free_without_buffer( & slice_A ); FLA_Obj_free_without_buffer( & slice_B ); #endif }