/*
 * Release the data buffer owned by obj while leaving the object's base
 * structure allocated.
 *
 * When the library error level is at least FLA_MIN_ERROR_CHECKING the
 * argument is first passed to FLA_Obj_free_buffer_check().
 *
 * With FLA_ENABLE_SCC defined, the buffer of an object whose element type is
 * FLA_MATRIX is released with FLA_free() and every other buffer with
 * FLA_shfree(); without it, FLA_free() is always used.
 *
 * The buffer pointer is reset to NULL afterwards so it cannot be reused.
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Obj_free_buffer( FLA_Obj *obj )
{
  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
  {
    FLA_Obj_free_buffer_check( obj );
  }

#ifdef FLA_ENABLE_SCC
  if ( FLA_Obj_elemtype( *obj ) == FLA_MATRIX )
  {
    FLA_free( obj->base->buffer );
  }
  else
  {
    FLA_shfree( obj->base->buffer );
  }
#else
  FLA_free( obj->base->buffer );
#endif

  obj->base->buffer = NULL;

  return FLA_SUCCESS;
}
/*
 * Debug/test helper: for every element object stored in A's base buffer,
 * print its stored permutation, stored size, contents and raw data, then
 * permute it with `permutation` into a freshly created tensor P and print
 * P's stored permutation, size and contents.
 *
 * permutation  permutation passed to FLA_Permute(); printed using A.order
 *              (and each block's order) as its length.
 * A            object whose base buffer is read as an array of FLA_Obj.
 */
void test_permute_tensor(dim_t permutation[], FLA_Obj A){
  dim_t blk_idx;
  dim_t n_blocks = FLA_Obj_num_elem_alloc(A);
  FLA_Obj* blocks = (FLA_Obj*)FLA_Obj_base_buffer(A);

  printf("Testing permutation routine\n");
  print_array("permutation", A.order, permutation);
  printf("-----------------------------\n");

  for(blk_idx = 0; blk_idx < n_blocks; blk_idx++){
    FLA_Obj blk = blocks[blk_idx];
    dim_t n_raw = FLA_Obj_num_elem_alloc(blk);
    double* raw = (double*)FLA_Obj_base_buffer(blk);
    /* Both arrays are released with FLA_free() at the end of the iteration. */
    dim_t* blk_size = FLA_Obj_size(blk);
    dim_t* blk_stride = FLA_Obj_stride(blk);
    double* cursor = raw;
    double* raw_end = raw + n_raw;
    FLA_Obj P;

    /* NOTE(review): "%d" is used with a dim_t index here and below; confirm
       the specifier matches dim_t's underlying type. */
    printf("A[%d] ", blk_idx);
    print_array("stored permutation", blk.order, blk.permutation);
    print_array("stored size", blk.order, blk.size);
    FLA_Obj_print_matlab("data", blk);

    /* Dump the block's buffer in storage order. */
    printf("raw data: ");
    while(cursor != raw_end){
      printf("%.3f ", *cursor);
      cursor++;
    }
    printf("\n");

    printf("A[%d] ", blk_idx);
    print_array("under permutation", blk.order, permutation);

    /* NOTE(review): P is created on every iteration and never released in
       this function; confirm whether it should be freed here. */
    FLA_Obj_create_tensor(FLA_DOUBLE, blk.order, blk_size, blk_stride, &P);
    FLA_Permute(blk, permutation, &P);

    print_array("P stored permutation", P.order, P.permutation);
    print_array("P stored size", P.order, P.size);
    FLA_Obj_print_matlab("P", P);

    FLA_free(blk_size);
    FLA_free(blk_stride);
  }
}
/*
 * Release the base structure of obj without touching the data buffer it
 * refers to, and reset the view fields (offm, offn, m, n) to zero.
 *
 * When the library error level is at least FLA_MIN_ERROR_CHECKING the
 * argument is first passed to FLA_Obj_free_without_buffer_check().
 *
 * obj->base is set to NULL after it is released so the object no longer
 * carries a dangling pointer; FLA_Obj_free() tests obj->base against NULL
 * before freeing, and that guard only works if the pointer is cleared here.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Obj_free_without_buffer( FLA_Obj *obj )
{
  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
    FLA_Obj_free_without_buffer_check( obj );

  FLA_free( ( void * ) obj->base );
  obj->base = NULL;

  obj->offm = 0;
  obj->offn = 0;
  obj->m    = 0;
  obj->n    = 0;

  return FLA_SUCCESS;
}
/*
 * Release both the data buffer and the base structure of obj, then reset
 * the view fields (offm, offn, m, n) to zero.
 *
 * When the library error level is at least FLA_MIN_ERROR_CHECKING the
 * argument is first passed to FLA_Obj_free_check().
 *
 * Nothing is freed when obj->base is already NULL. After a successful
 * release obj->base is set to NULL, so calling this function again on the
 * same object skips the frees instead of releasing the same memory twice.
 *
 * With FLA_ENABLE_SCC defined, the buffer of an object whose element type is
 * FLA_MATRIX is released with FLA_free() and every other buffer with
 * FLA_shfree(); without it, FLA_free() is always used.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Obj_free( FLA_Obj *obj )
{
  if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING )
    FLA_Obj_free_check( obj );

  if ( obj->base != NULL )
  {
#ifdef FLA_ENABLE_SCC
    if ( FLA_Obj_elemtype( *obj ) == FLA_MATRIX )
      FLA_free( obj->base->buffer );
    else
      FLA_shfree( obj->base->buffer );
#else
    FLA_free( obj->base->buffer );
#endif
    FLA_free( ( void * ) obj->base );

    /* Clear the pointer so the NULL guard above protects later calls. */
    obj->base = NULL;
  }

  obj->offm = 0;
  obj->offn = 0;
  obj->m    = 0;
  obj->n    = 0;

  return FLA_SUCCESS;
}
void* FLA_realloc( void* old_ptr, size_t size ) { FLA_Error e_val; void* new_ptr; // We can't do much if size is zero. To emulate realloc(), we must // return a NULL pointer, regardless of the value of old_ptr. if ( size == 0 ) { // If the pointer is valid, free() it. if ( old_ptr != NULL ) FLA_free( old_ptr ); // If size is zero, we should return a NULL pointer. new_ptr = NULL; } else { // If old_ptr is NULL, allocate size bytes as if it were a first-time // FLA_malloc() request. Otherwise, proceed to realloc() the memory. if ( old_ptr == NULL ) { new_ptr = FLA_malloc( size ); } else { // At this point, we know that size is non-zero and old_ptr is valid. // Since we may need aligned addresses, we don't really want to call // realloc(), since it does not guarantee arbitrary aligned pointers. // But we can't implement it ourselves either, because we don't know // how large the original buffer is, therefor we don't know how much // to copy over after the new buffer is allocated. So we're stuck with // the system implementation. new_ptr = realloc( old_ptr, size ); if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) { e_val = FLA_Check_malloc_pointer( new_ptr ); FLA_Check_error_code( e_val ); } } } // Return the pointer (either NULL, or the return value from FLA_malloc() // or realloc()). return new_ptr; }
/*
 * Single-precision complex kernel that works directly on raw buffers with
 * explicit row/column strides (rs_X, cs_X) and updates A, Y, Z and T in
 * place. The "// FLA_..." comments give the object-level operation that each
 * group of bl1_* calls implements.
 *
 * NOTE(review): going by the name and those reference comments this looks
 * like one panel step of a Hessenberg reduction using UT transforms
 * (variant 4) -- confirm against the object-based implementation.
 *
 *   m_A      dimension used for A; also the number of rows of Y and Z that
 *            are cleared and the length of the three work vectors
 *   m_T      number of iterations performed (b_alg)
 *   buff_A   matrix updated in place
 *   buff_Y   workspace; its leading m_A x m_T part is zeroed on entry
 *   buff_Z   workspace; its leading m_A x m_T part is zeroed on entry
 *   buff_T   receives tau11 on the diagonal and t01 above it, column by column
 *
 * Three temporary vectors of m_A elements are allocated with FLA_malloc()
 * and released before returning. Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Hess_UT_step_opc_var4( int m_A,
                                     int m_T,
                                     scomplex* buff_A, int rs_A, int cs_A,
                                     scomplex* buff_Y, int rs_Y, int cs_Y,
                                     scomplex* buff_Z, int rs_Z, int cs_Z,
                                     scomplex* buff_T, int rs_T, int cs_T )
{
  scomplex* buff_2  = FLA_COMPLEX_PTR( FLA_TWO );
  scomplex* buff_1  = FLA_COMPLEX_PTR( FLA_ONE );
  scomplex* buff_0  = FLA_COMPLEX_PTR( FLA_ZERO );
  scomplex* buff_m1 = FLA_COMPLEX_PTR( FLA_MINUS_ONE );

  scomplex  first_elem, last_elem;
  scomplex  dot_product;
  scomplex  beta, conj_beta;
  scomplex  inv_tau11;
  scomplex  minus_inv_tau11;
  int       i;

  // b_alg = FLA_Obj_length( T );
  int       b_alg = m_T;

  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &d );
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &e );
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &f );
  scomplex* buff_d = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  scomplex* buff_e = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  scomplex* buff_f = ( scomplex* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  int       inc_d  = 1;
  int       inc_e  = 1;
  int       inc_f  = 1;

  // FLA_Set( FLA_ZERO, Y );
  // FLA_Set( FLA_ZERO, Z );
  bl1_csetm( m_A,
             b_alg,
             buff_0,
             buff_Y, rs_Y, cs_Y );
  bl1_csetm( m_A,
             b_alg,
             buff_0,
             buff_Z, rs_Z, cs_Z );

  for ( i = 0; i < b_alg; ++i )
  {
    // Sub-partitions of A, Y, Z and T around the current diagonal index i.
    scomplex* a10t     = buff_A + (0  )*cs_A + (i  )*rs_A;
    scomplex* A20      = buff_A + (0  )*cs_A + (i+1)*rs_A;
    scomplex* alpha11  = buff_A + (i  )*cs_A + (i  )*rs_A;
    scomplex* a21      = buff_A + (i  )*cs_A + (i+1)*rs_A;
    scomplex* A02      = buff_A + (i+1)*cs_A + (0  )*rs_A;
    scomplex* a12t     = buff_A + (i+1)*cs_A + (i  )*rs_A;
    scomplex* A22      = buff_A + (i+1)*cs_A + (i+1)*rs_A;

    scomplex* y10t     = buff_Y + (0  )*cs_Y + (i  )*rs_Y;
    scomplex* Y20      = buff_Y + (0  )*cs_Y + (i+1)*rs_Y;
    scomplex* y21      = buff_Y + (i  )*cs_Y + (i+1)*rs_Y;

    scomplex* z10t     = buff_Z + (0  )*cs_Z + (i  )*rs_Z;
    scomplex* Z20      = buff_Z + (0  )*cs_Z + (i+1)*rs_Z;
    scomplex* z21      = buff_Z + (i  )*cs_Z + (i+1)*rs_Z;

    scomplex* t01      = buff_T + (i  )*cs_T + (0  )*rs_T;
    scomplex* tau11    = buff_T + (i  )*cs_T + (i  )*rs_T;

    scomplex* d0       = buff_d + (0  )*inc_d;
    scomplex* e0       = buff_e + (0  )*inc_e;
    scomplex* f0       = buff_f + (0  )*inc_f;

    // Last element of a10t. NOTE(review): for i == 0 this address lies one
    // column before a10t; it is only dereferenced under m_behind > 0.
    scomplex* a10t_r   = a10t   + (i-1)*cs_A + (0  )*rs_A;

    // Top element of a21 and the remainder below it.
    scomplex* a21_t    = a21    + (0  )*cs_A + (0  )*rs_A;
    scomplex* a21_b    = a21    + (0  )*cs_A + (1  )*rs_A;

    scomplex* ABL      = a10t;
    scomplex* ZBL      = z10t;

    scomplex* a2       = alpha11;

    int       m_ahead  = m_A - i - 1;
    int       n_ahead  = m_A - i - 1;
    int       m_behind = i;
    int       n_behind = i;

    /*------------------------------------------------------------*/

    if ( m_behind > 0 )
    {
      // Temporarily replace the last element of a10t with one; the saved
      // value is put back after the four updates below.
      // FLA_Copy( a10t_r, last_elem );
      // FLA_Set( FLA_ONE, a10t_r );
      last_elem = *a10t_r;
      *a10t_r = *buff_1;
    }

    // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ABL, y10t, FLA_ONE, a2 );
    // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ZBL, a10t, FLA_ONE, a2 );
    bl1_cgemv( BLIS1_NO_TRANSPOSE,
               BLIS1_CONJUGATE,
               m_ahead + 1,
               n_behind,
               buff_m1,
               ABL,  rs_A, cs_A,
               y10t, cs_Y,
               buff_1,
               a2,   rs_A );
    bl1_cgemv( BLIS1_NO_TRANSPOSE,
               BLIS1_CONJUGATE,
               m_ahead + 1,
               n_behind,
               buff_m1,
               ZBL,  rs_Z, cs_Z,
               a10t, cs_A,
               buff_1,
               a2,   rs_A );

    // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, a10t, FLA_ONE, a12t );
    // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, A20, z10t, FLA_ONE, a12t );
    bl1_cgemv( BLIS1_CONJ_NO_TRANSPOSE,
               BLIS1_NO_CONJUGATE,
               m_ahead,
               n_behind,
               buff_m1,
               Y20,  rs_Y, cs_Y,
               a10t, cs_A,
               buff_1,
               a12t, cs_A );
    bl1_cgemv( BLIS1_CONJ_NO_TRANSPOSE,
               BLIS1_NO_CONJUGATE,
               m_ahead,
               n_behind,
               buff_m1,
               A20,  rs_A, cs_A,
               z10t, cs_Z,
               buff_1,
               a12t, cs_A );

    if ( m_behind > 0 )
    {
      // FLA_Copy( last_elem, a10t_r );
      *a10t_r = last_elem;
    }

    if ( m_ahead > 0 )
    {
      // FLA_Househ2_UT( FLA_LEFT,
      //                 a21_t,
      //                 a21_b, tau11 );
      FLA_Househ2_UT_l_opc( m_ahead - 1,
                            a21_t,
                            a21_b, rs_A,
                            tau11 );

      // FLA_Set( FLA_ONE, inv_tau11 );
      // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 );
      // FLA_Copy( inv_tau11, minus_inv_tau11 );
      // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 );
      bl1_cdiv3( buff_1, tau11, &inv_tau11 );
      bl1_cneg2( &inv_tau11, &minus_inv_tau11 );

      // Temporarily replace the top element of a21 with one; the saved
      // value is restored at the end of this branch.
      // FLA_Copy( a21_t, first_elem );
      // FLA_Set( FLA_ONE, a21_t );
      first_elem = *a21_t;
      *a21_t = *buff_1;

      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y21 );
      bl1_cgemv( BLIS1_CONJ_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_ahead,
                 n_ahead,
                 buff_1,
                 A22, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 y21, rs_Y );

      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, z21 );
      bl1_cgemv( BLIS1_NO_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_ahead,
                 n_ahead,
                 buff_1,
                 A22, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 z21, rs_Z );

      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, d0 );
      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Y20, a21, FLA_ZERO, e0 );
      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Z20, a21, FLA_ZERO, f0 );
      bl1_cgemv( BLIS1_CONJ_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_ahead,
                 n_behind,
                 buff_1,
                 A20, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 d0,  inc_d );
      bl1_cgemv( BLIS1_CONJ_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_ahead,
                 n_behind,
                 buff_1,
                 Y20, rs_Y, cs_Y,
                 a21, rs_A,
                 buff_0,
                 e0,  inc_e );
      bl1_cgemv( BLIS1_CONJ_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_ahead,
                 n_behind,
                 buff_1,
                 Z20, rs_Z, cs_Z,
                 a21, rs_A,
                 buff_0,
                 f0,  inc_f );

      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, d0, FLA_ONE, y21 );
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, f0, FLA_ONE, y21 );
      bl1_cgemv( BLIS1_NO_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_ahead,
                 n_behind,
                 buff_m1,
                 Y20, rs_Y, cs_Y,
                 d0,  inc_d,
                 buff_1,
                 y21, rs_Y );
      bl1_cgemv( BLIS1_NO_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_ahead,
                 n_behind,
                 buff_m1,
                 A20, rs_A, cs_A,
                 f0,  inc_f,
                 buff_1,
                 y21, rs_Y );

      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, e0, FLA_ONE, z21 );
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Z20, d0, FLA_ONE, z21 );
      bl1_cgemv( BLIS1_NO_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_ahead,
                 n_behind,
                 buff_m1,
                 A20, rs_A, cs_A,
                 e0,  inc_e,
                 buff_1,
                 z21, rs_Z );
      bl1_cgemv( BLIS1_NO_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_ahead,
                 n_behind,
                 buff_m1,
                 Z20, rs_Z, cs_Z,
                 d0,  inc_d,
                 buff_1,
                 z21, rs_Z );

      // FLA_Copy( d0, t01 );
      bl1_ccopyv( BLIS1_NO_CONJUGATE,
                  n_behind,
                  d0,  inc_d,
                  t01, rs_T );

      // FLA_Dotc( FLA_CONJUGATE, a21, z21, beta );
      // FLA_Inv_scal( FLA_TWO, beta );
      // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta );
      bl1_cdot( BLIS1_CONJUGATE,
                m_ahead,
                a21, rs_A,
                z21, rs_Z,
                &beta );
      bl1_cinvscals( buff_2, &beta );
      bl1_ccopyconj( &beta, &conj_beta );

      // FLA_Scal( minus_inv_tau11, conj_beta );
      // FLA_Axpy( conj_beta, a21, y21 );
      // FLA_Scal( inv_tau11, y21 );
      bl1_cscals( &minus_inv_tau11, &conj_beta );
      bl1_caxpyv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &conj_beta,
                  a21, rs_A,
                  y21, rs_Y );
      bl1_cscalv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &inv_tau11,
                  y21, rs_Y );

      // FLA_Scal( minus_inv_tau11, beta );
      // FLA_Axpy( beta, a21, z21 );
      // FLA_Scal( inv_tau11, z21 );
      bl1_cscals( &minus_inv_tau11, &beta );
      bl1_caxpyv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &beta,
                  a21, rs_A,
                  z21, rs_Z );
      bl1_cscalv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &inv_tau11,
                  z21, rs_Z );

      // FLA_Dot( a12t, a21, dot_product );
      // FLA_Scal( minus_inv_tau11, dot_product );
      // FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t );
      bl1_cdot( BLIS1_NO_CONJUGATE,
                m_ahead,
                a12t, cs_A,
                a21,  rs_A,
                &dot_product );
      bl1_cscals( &minus_inv_tau11, &dot_product );
      bl1_caxpyv( BLIS1_CONJUGATE,
                  m_ahead,
                  &dot_product,
                  a21,  rs_A,
                  a12t, cs_A );

      // e0 is reused here as scratch for A02 * a21.
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, e0 );
      // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, e0, a21, A02 );
      bl1_cgemv( BLIS1_NO_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_behind,
                 n_ahead,
                 buff_1,
                 A02, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 e0,  inc_e );
      bl1_cger( BLIS1_NO_CONJUGATE,
                BLIS1_CONJUGATE,
                m_behind,
                n_ahead,
                &minus_inv_tau11,
                e0,  inc_e,
                a21, rs_A,
                A02, rs_A, cs_A );

      // FLA_Copy( first_elem, a21_t );
      *a21_t = first_elem;
    }

    /*------------------------------------------------------------*/

  }

  // FLA_Obj_free( &d );
  // FLA_Obj_free( &e );
  // FLA_Obj_free( &f );
  FLA_free( buff_d );
  FLA_free( buff_e );
  FLA_free( buff_f );

  return FLA_SUCCESS;
}
int main( int argc, char *argv[] ) { int i, j, n_threads, n_repeats, n_trials, increment, begin, sorting, caching, work_stealing, data_affinity; dim_t size, nb_alg; FLA_Datatype datatype = FLA_DOUBLE; FLA_Obj A, x, b, b_norm, AH, pH, bH; double b_norm_value, dtime, *dtimes, *flops; #ifndef FLA_ENABLE_WINDOWS_BUILD char output_file_m[100]; FILE *fpp; #endif fprintf( stdout, "%c Enter number of repeats: ", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c Enter blocksize: ", '%' ); scanf( "%u", &nb_alg ); fprintf( stdout, "%c %u\n", '%', nb_alg ); fprintf( stdout, "%c Enter problem size parameters: first, inc, num: ", '%' ); scanf( "%d%d%d", &begin, &increment, &n_trials ); fprintf( stdout, "%c %d %d %d\n", '%', begin, increment, n_trials ); fprintf( stdout, "%c Enter number of threads: ", '%' ); scanf( "%d", &n_threads ); fprintf( stdout, "%c %d\n", '%', n_threads ); fprintf( stdout, "%c Enter SuperMatrix parameters: sorting, caching, work stealing, data affinity: ", '%' ); scanf( "%d%d%d%d", &sorting, &caching, &work_stealing, &data_affinity ); fprintf( stdout, "%c %s %s %s %s\n\n", '%', ( sorting ? "TRUE" : "FALSE" ), ( caching ? "TRUE" : "FALSE" ), ( work_stealing ? "TRUE" : "FALSE" ), ( data_affinity ? ( data_affinity == 1 ? 
"FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC" : "FLASH_QUEUE_AFFINITY_OTHER" ) : "FLASH_QUEUE_AFFINITY_NONE" ) ); #ifdef FLA_ENABLE_WINDOWS_BUILD fprintf( stdout, "%s_%u = [\n", OUTPUT_FILE, nb_alg ); #else sprintf( output_file_m, "%s/%s_output.m", OUTPUT_PATH, OUTPUT_FILE ); fpp = fopen( output_file_m, "a" ); fprintf( fpp, "%%\n" ); fprintf( fpp, "%% | Matrix Size | FLASH |\n" ); fprintf( fpp, "%% | n x n | GFlops |\n" ); fprintf( fpp, "%% -----------------------------\n" ); fprintf( fpp, "%s_%u = [\n", OUTPUT_FILE, nb_alg ); #endif FLA_Init(); dtimes = ( double * ) FLA_malloc( n_repeats * sizeof( double ) ); flops = ( double * ) FLA_malloc( n_trials * sizeof( double ) ); FLASH_Queue_set_num_threads( n_threads ); FLASH_Queue_set_sorting( sorting ); FLASH_Queue_set_caching( caching ); FLASH_Queue_set_work_stealing( work_stealing ); FLASH_Queue_set_data_affinity( data_affinity ); for ( i = 0; i < n_trials; i++ ) { size = begin + i * increment; FLA_Obj_create( datatype, size, size, 0, 0, &A ); FLA_Obj_create( datatype, size, 1, 0, 0, &x ); FLA_Obj_create( datatype, size, 1, 0, 0, &b ); FLA_Obj_create( datatype, 1, 1, 0, 0, &b_norm ); for ( j = 0; j < n_repeats; j++ ) { FLA_Random_matrix( A ); FLA_Random_matrix( b ); FLASH_Obj_create_hier_copy_of_flat( A, 1, &nb_alg, &AH ); FLASH_Obj_create( FLA_INT, size, 1, 1, &nb_alg, &pH ); FLASH_Obj_create_hier_copy_of_flat( b, 1, &nb_alg, &bH ); dtime = FLA_Clock(); FLASH_LU_piv( AH, pH ); dtime = FLA_Clock() - dtime; dtimes[j] = dtime; FLASH_Apply_pivots( FLA_LEFT, FLA_NO_TRANSPOSE, pH, bH ); FLASH_Trsv( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_UNIT_DIAG, AH, bH ); FLASH_Trsv( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, AH, bH ); FLASH_Obj_free( &AH ); FLASH_Obj_free( &pH ); FLASH_Obj_flatten( bH, x ); FLASH_Obj_free( &bH ); } dtime = dtimes[0]; for ( j = 1; j < n_repeats; j++ ) dtime = min( dtime, dtimes[j] ); flops[i] = 2.0 / 3.0 * size * size * size / dtime / 1e9; FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_ONE, A, x, 
FLA_MINUS_ONE, b ); FLA_Nrm2_external( b, b_norm ); FLA_Obj_extract_real_scalar( b_norm, &b_norm_value ); #ifdef FLA_ENABLE_WINDOWS_BUILD fprintf( stdout, " %d %6.3f %le\n", size, flops[i], b_norm_value ); #else fprintf( fpp, " %d %6.3f\n", size, flops[i] ); fprintf( stdout, "Time: %e | GFlops: %6.3f\n", dtime, flops[i] ); fprintf( stdout, "Matrix size: %u x %u | nb_alg: %u\n", size, size, nb_alg ); fprintf( stdout, "Norm of difference: %le\n\n", b_norm_value ); #endif FLA_Obj_free( &A ); FLA_Obj_free( &x ); FLA_Obj_free( &b ); FLA_Obj_free( &b_norm ); } #ifdef FLA_ENABLE_WINDOWS_BUILD fprintf( stdout, "];\n\n" ); #else fprintf( fpp, "];\n" ); fflush( fpp ); fclose( fpp ); #endif FLA_free( dtimes ); FLA_free( flops ); FLA_Finalize(); return 0; }
/*
 * Single-precision real kernel that works directly on raw buffers with
 * explicit row/column strides and updates A and T in place. Only the lower
 * triangle of A22 is passed to the symmetric routines (bl1_ssyr2/bl1_ssymv
 * are called with BLIS1_LOWER_TRIANGULAR). The "// FLA_..." comments give
 * the object-level operation that each group of bl1_* calls implements.
 *
 * NOTE(review): going by the name and those reference comments this looks
 * like one panel step of a lower-triangular tridiagonal reduction using UT
 * transforms (variant 2) -- confirm against the object-based implementation.
 *
 *   m_A      dimension used for A and length of the three work vectors
 *   m_T      number of iterations performed (b_alg)
 *   buff_A   matrix updated in place
 *   buff_T   receives tau11 on the diagonal and t01 above it, column by column
 *
 * The vectors u and z written in one iteration are consumed at the start of
 * the next one (upsilon11/zeta11 point at the first elements of the previous
 * iteration's u21/z21), so their rank-2 update of A22 is applied one
 * iteration late; the last iteration applies it itself before leaving.
 *
 * Three temporary vectors of m_A elements are allocated with FLA_malloc()
 * and released before returning. Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Tridiag_UT_l_step_ops_var2( int m_A,
                                          int m_T,
                                          float* buff_A, int rs_A, int cs_A,
                                          float* buff_T, int rs_T, int cs_T )
{
  float*    buff_2  = FLA_FLOAT_PTR( FLA_TWO );
  float*    buff_1  = FLA_FLOAT_PTR( FLA_ONE );
  float*    buff_0  = FLA_FLOAT_PTR( FLA_ZERO );
  float*    buff_m1 = FLA_FLOAT_PTR( FLA_MINUS_ONE );

  float     first_elem;
  float     beta;
  float     inv_tau11;
  float     minus_inv_tau11;
  float     minus_upsilon11, minus_conj_upsilon11;
  float     minus_zeta11,    minus_conj_zeta11;
  int       i;

  // b_alg = FLA_Obj_length( T );
  int       b_alg = m_T;

  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &u );
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z );
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &w );
  float*    buff_u = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  float*    buff_z = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  float*    buff_w = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  int       inc_u  = 1;
  int       inc_z  = 1;
  int       inc_w  = 1;

  // Initialize some variables (only to prevent compiler warnings).
  first_elem      = *buff_0;
  minus_inv_tau11 = *buff_0;

  for ( i = 0; i < b_alg; ++i )
  {
    // Sub-partitions of A, T and the work vectors around diagonal index i.
    float*    A20      = buff_A + (0  )*cs_A + (i+1)*rs_A;
    float*    alpha11  = buff_A + (i  )*cs_A + (i  )*rs_A;
    float*    a21      = buff_A + (i  )*cs_A + (i+1)*rs_A;
    float*    A22      = buff_A + (i+1)*cs_A + (i+1)*rs_A;

    float*    t01      = buff_T + (i  )*cs_T + (0  )*rs_T;
    float*    tau11    = buff_T + (i  )*cs_T + (i  )*rs_T;

    float*    upsilon11= buff_u + (i  )*inc_u;
    float*    u21      = buff_u + (i+1)*inc_u;

    float*    zeta11   = buff_z + (i  )*inc_z;
    float*    z21      = buff_z + (i+1)*inc_z;

    float*    w21      = buff_w + (i+1)*inc_w;

    // Top element of a21 and the remainder below it.
    float*    a21_t    = a21    + (0  )*cs_A + (0  )*rs_A;
    float*    a21_b    = a21    + (0  )*cs_A + (1  )*rs_A;

    int       m_ahead  = m_A - i - 1;
    int       m_behind = i;
    int       n_behind = i;

    /*------------------------------------------------------------*/

    if ( m_behind > 0 )
    {
      // Apply the previous iteration's u/z to alpha11 and a21; the matching
      // update of A22 follows further down.

      // FLA_Copy( upsilon11, minus_upsilon11 );
      // FLA_Scal( FLA_MINUS_ONE, minus_upsilon11 );
      // FLA_Copy( minus_upsilon11, minus_conj_upsilon11 );
      bl1_smult3( buff_m1, upsilon11, &minus_upsilon11 );
      bl1_scopyconj( &minus_upsilon11, &minus_conj_upsilon11 );

      // FLA_Copy( zeta11, minus_zeta11 );
      // FLA_Scal( FLA_MINUS_ONE, minus_zeta11 );
      // FLA_Copy( minus_zeta11, minus_conj_zeta11 );
      bl1_smult3( buff_m1, zeta11, &minus_zeta11 );
      bl1_scopyconj( &minus_zeta11, &minus_conj_zeta11 );

      // FLA_Axpyt( FLA_CONJ_NO_TRANSPOSE, minus_upsilon11, zeta11,    alpha11 );
      // FLA_Axpyt( FLA_CONJ_NO_TRANSPOSE, minus_zeta11,    upsilon11, alpha11 );
      bl1_saxpyv( BLIS1_CONJUGATE,
                  1,
                  &minus_upsilon11,
                  zeta11,  1,
                  alpha11, 1 );
      bl1_saxpyv( BLIS1_CONJUGATE,
                  1,
                  &minus_zeta11,
                  upsilon11, 1,
                  alpha11,   1 );

      // FLA_Axpyt( FLA_NO_TRANSPOSE, minus_conj_zeta11,    u21, a21 );
      // FLA_Axpyt( FLA_NO_TRANSPOSE, minus_conj_upsilon11, z21, a21 );
      bl1_saxpyv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &minus_conj_zeta11,
                  u21, inc_u,
                  a21, rs_A );
      bl1_saxpyv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &minus_conj_upsilon11,
                  z21, inc_z,
                  a21, rs_A );
    }

    if ( m_ahead > 0 )
    {
      // FLA_Househ2_UT( FLA_LEFT,
      //                 a21_t,
      //                 a21_b, tau11 );
      FLA_Househ2_UT_l_ops( m_ahead - 1,
                            a21_t,
                            a21_b, rs_A,
                            tau11 );

      // FLA_Set( FLA_ONE, inv_tau11 );
      // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 );
      // FLA_Copy( inv_tau11, minus_inv_tau11 );
      // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 );
      bl1_sdiv3( buff_1, tau11, &inv_tau11 );
      bl1_sneg2( &inv_tau11, &minus_inv_tau11 );

      // Temporarily replace the top element of a21 with one; the saved
      // value is restored at the end of the second m_ahead branch.
      // FLA_Copy( a21_t, first_elem );
      // FLA_Set( FLA_ONE, a21_t );
      first_elem = *a21_t;
      *a21_t = *buff_1;
    }

    if ( m_behind > 0 )
    {
      // Deferred rank-2 update of A22, still using the previous iteration's
      // u21/z21 (they are overwritten in the next branch).
      // FLA_Her2( FLA_LOWER_TRIANGULAR, FLA_MINUS_ONE, u21, z21, A22 );
      bl1_ssyr2( BLIS1_LOWER_TRIANGULAR,
                 m_ahead,
                 buff_m1,
                 u21, inc_u,
                 z21, inc_z,
                 A22, rs_A, cs_A );
    }

    if ( m_ahead > 0 )
    {
      // FLA_Hemv( FLA_LOWER_TRIANGULAR, FLA_ONE, A22, a21, FLA_ZERO, w21 );
      bl1_ssymv( BLIS1_LOWER_TRIANGULAR,
                 m_ahead,
                 buff_1,
                 A22, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 w21, inc_w );

      // FLA_Copy( a21, u21 );
      // FLA_Copy( w21, z21 );
      bl1_scopyv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  a21, rs_A,
                  u21, inc_u );
      bl1_scopyv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  w21, inc_w,
                  z21, inc_z );

      // FLA_Dotc( FLA_CONJUGATE, a21, z21, beta );
      // FLA_Inv_scal( FLA_TWO, beta );
      bl1_sdot( BLIS1_CONJUGATE,
                m_ahead,
                a21, rs_A,
                z21, inc_z,
                &beta );
      bl1_sinvscals( buff_2, &beta );

      // FLA_Scal( minus_inv_tau11, beta );
      // FLA_Axpy( beta, a21, z21 );
      // FLA_Scal( inv_tau11, z21 );
      bl1_sscals( &minus_inv_tau11, &beta );
      bl1_saxpyv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &beta,
                  a21, rs_A,
                  z21, inc_z );
      bl1_sscalv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &inv_tau11,
                  z21, inc_z );

      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 );
      bl1_sgemv( BLIS1_CONJ_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_ahead,
                 n_behind,
                 buff_1,
                 A20, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 t01, rs_T );

      // FLA_Copy( first_elem, a21_t );
      *a21_t = first_elem;
    }

    if ( m_behind + 1 == b_alg && m_ahead > 0 )
    {
      // Last iteration: no later pass will apply this iteration's u21/z21,
      // so update A22 with them now.
      // FLA_Her2( FLA_LOWER_TRIANGULAR, FLA_MINUS_ONE, u21, z21, A22 );
      bl1_ssyr2( BLIS1_LOWER_TRIANGULAR,
                 m_ahead,
                 buff_m1,
                 u21, inc_u,
                 z21, inc_z,
                 A22, rs_A, cs_A );
    }

    /*------------------------------------------------------------*/

  }

  // FLA_Obj_free( &u );
  // FLA_Obj_free( &z );
  // FLA_Obj_free( &w );
  FLA_free( buff_u );
  FLA_free( buff_z );
  FLA_free( buff_w );

  return FLA_SUCCESS;
}
/*
 * Single-precision real kernel that works directly on raw buffers with
 * explicit row/column strides and updates A and T in place. Only the lower
 * triangle of A22 is passed to the symmetric routines (bl1_ssymv/bl1_ssyr2
 * are called with BLIS1_LOWER_TRIANGULAR). The "// FLA_..." comments give
 * the object-level operation that each group of bl1_* calls implements.
 *
 * NOTE(review): going by the name and those reference comments this looks
 * like one panel step of a lower-triangular tridiagonal reduction using UT
 * transforms (variant 1) -- confirm against the object-based implementation.
 *
 *   m_A      dimension used for A and length of the work vector
 *   m_T      number of iterations performed (b_alg)
 *   buff_A   matrix updated in place
 *   buff_T   receives tau11 on the diagonal and t01 above it, column by column
 *
 * One temporary vector of m_A elements is allocated with FLA_malloc() and
 * released before returning. Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Tridiag_UT_l_step_ops_var1( int m_A,
                                          int m_T,
                                          float* buff_A, int rs_A, int cs_A,
                                          float* buff_T, int rs_T, int cs_T )
{
  float*    buff_2  = FLA_FLOAT_PTR( FLA_TWO );
  float*    buff_1  = FLA_FLOAT_PTR( FLA_ONE );
  float*    buff_0  = FLA_FLOAT_PTR( FLA_ZERO );
  float*    buff_m1 = FLA_FLOAT_PTR( FLA_MINUS_ONE );

  float     first_elem;
  float     beta;
  float     inv_tau11;
  float     minus_inv_tau11;
  int       i;

  // b_alg = FLA_Obj_length( T );
  int       b_alg = m_T;

  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &z );
  float*    buff_z = ( float* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  int       inc_z  = 1;

  for ( i = 0; i < b_alg; ++i )
  {
    // Sub-partitions of A, T and the work vector around diagonal index i.
    float*    A20      = buff_A + (0  )*cs_A + (i+1)*rs_A;
    float*    a21      = buff_A + (i  )*cs_A + (i+1)*rs_A;
    float*    A22      = buff_A + (i+1)*cs_A + (i+1)*rs_A;

    float*    t01      = buff_T + (i  )*cs_T + (0  )*rs_T;
    float*    tau11    = buff_T + (i  )*cs_T + (i  )*rs_T;

    float*    z21      = buff_z + (i+1)*inc_z;

    // Top element of a21 and the remainder below it.
    float*    a21_t    = a21    + (0  )*cs_A + (0  )*rs_A;
    float*    a21_b    = a21    + (0  )*cs_A + (1  )*rs_A;

    int       m_ahead  = m_A - i - 1;
    int       n_behind = i;

    /*------------------------------------------------------------*/

    if ( m_ahead > 0 )
    {
      // FLA_Househ2_UT( FLA_LEFT,
      //                 a21_t,
      //                 a21_b, tau11 );
      FLA_Househ2_UT_l_ops( m_ahead - 1,
                            a21_t,
                            a21_b, rs_A,
                            tau11 );

      // FLA_Set( FLA_ONE, inv_tau11 );
      // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 );
      // FLA_Copy( inv_tau11, minus_inv_tau11 );
      // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 );
      bl1_sdiv3( buff_1, tau11, &inv_tau11 );
      bl1_sneg2( &inv_tau11, &minus_inv_tau11 );

      // Temporarily replace the top element of a21 with one; the saved
      // value is restored at the end of this branch.
      // FLA_Copy( a21_t, first_elem );
      // FLA_Set( FLA_ONE, a21_t );
      first_elem = *a21_t;
      *a21_t = *buff_1;

      // FLA_Hemv( FLA_LOWER_TRIANGULAR, FLA_ONE, A22, a21, FLA_ZERO, z21 );
      bl1_ssymv( BLIS1_LOWER_TRIANGULAR,
                 m_ahead,
                 buff_1,
                 A22, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 z21, inc_z );

      // FLA_Dotc( FLA_CONJUGATE, a21, z21, beta );
      // FLA_Inv_scal( FLA_TWO, beta );
      bl1_sdot( BLIS1_CONJUGATE,
                m_ahead,
                a21, rs_A,
                z21, inc_z,
                &beta );
      bl1_sinvscals( buff_2, &beta );

      // FLA_Scal( minus_inv_tau11, beta );
      // FLA_Axpy( beta, a21, z21 );
      // FLA_Scal( inv_tau11, z21 );
      bl1_sscals( &minus_inv_tau11, &beta );
      bl1_saxpyv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &beta,
                  a21, rs_A,
                  z21, inc_z );
      bl1_sscalv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &inv_tau11,
                  z21, inc_z );

      // FLA_Her2( FLA_LOWER_TRIANGULAR, FLA_MINUS_ONE, a21, z21, A22 );
      bl1_ssyr2( BLIS1_LOWER_TRIANGULAR,
                 m_ahead,
                 buff_m1,
                 a21, rs_A,
                 z21, inc_z,
                 A22, rs_A, cs_A );

      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, t01 );
      bl1_sgemv( BLIS1_CONJ_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_ahead,
                 n_behind,
                 buff_1,
                 A20, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 t01, rs_T );

      // FLA_Copy( first_elem, a21_t );
      *a21_t = first_elem;
    }

    /*------------------------------------------------------------*/

  }

  // FLA_Obj_free( &z );
  FLA_free( buff_z );

  return FLA_SUCCESS;
}
int main( int argc, char *argv[] ) { int i, j, n_threads, n_repeats, n_trials, increment, begin, sorting, caching, work_stealing, data_affinity; dim_t size, nb_alg; FLA_Datatype datatype = FLA_DOUBLE; FLA_Inv inv = FLA_NO_INVERSE; FLA_Uplo uplo = FLA_LOWER_TRIANGULAR; FLA_Obj A, B, x, b, b_norm, AH, BH; double length, b_norm_value = 0.0, dtime, *dtimes, *flops; #ifndef FLA_ENABLE_WINDOWS_BUILD char output_file_m[100]; FILE *fpp; #endif fprintf( stdout, "%c Enter number of repeats: ", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c Enter blocksize: ", '%' ); scanf( "%u", &nb_alg ); fprintf( stdout, "%c %u\n", '%', nb_alg ); fprintf( stdout, "%c Enter problem size parameters: first, inc, num: ", '%' ); scanf( "%d%d%d", &begin, &increment, &n_trials ); fprintf( stdout, "%c %d %d %d\n", '%', begin, increment, n_trials ); fprintf( stdout, "%c Enter number of threads: ", '%' ); scanf( "%d", &n_threads ); fprintf( stdout, "%c %d\n", '%', n_threads ); fprintf( stdout, "%c Enter SuperMatrix parameters: sorting, caching, work stealing, data affinity: ", '%' ); scanf( "%d%d%d%d", &sorting, &caching, &work_stealing, &data_affinity ); fprintf( stdout, "%c %s %s %s %s\n\n", '%', ( sorting ? "TRUE" : "FALSE" ), ( caching ? "TRUE" : "FALSE" ), ( work_stealing ? "TRUE" : "FALSE" ), ( data_affinity ? ( data_affinity == 1 ? 
"FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC" : "FLASH_QUEUE_AFFINITY_OTHER" ) : "FLASH_QUEUE_AFFINITY_NONE" ) ); #ifdef FLA_ENABLE_WINDOWS_BUILD fprintf( stdout, "%s_%u = [\n", OUTPUT_FILE, nb_alg ); #else sprintf( output_file_m, "%s/%s_output.m", OUTPUT_PATH, OUTPUT_FILE ); fpp = fopen( output_file_m, "a" ); fprintf( fpp, "%%\n" ); fprintf( fpp, "%% | Matrix Size | FLASH |\n" ); fprintf( fpp, "%% | n x n | GFlops |\n" ); fprintf( fpp, "%% -----------------------------\n" ); fprintf( fpp, "%s_%u = [\n", OUTPUT_FILE, nb_alg ); #endif FLA_Init(); dtimes = ( double * ) FLA_malloc( n_repeats * sizeof( double ) ); flops = ( double * ) FLA_malloc( n_trials * sizeof( double ) ); FLASH_Queue_set_num_threads( n_threads ); FLASH_Queue_set_sorting( sorting ); FLASH_Queue_set_caching( caching ); FLASH_Queue_set_work_stealing( work_stealing ); FLASH_Queue_set_data_affinity( data_affinity ); for ( i = 0; i < n_trials; i++ ) { size = begin + i * increment; FLA_Obj_create( datatype, size, size, 0, 0, &A ); FLA_Obj_create( datatype, size, size, 0, 0, &B ); FLA_Obj_create( datatype, size, 1, 0, 0, &x ); FLA_Obj_create( datatype, size, 1, 0, 0, &b ); FLA_Obj_create( datatype, 1, 1, 0, 0, &b_norm ); for ( j = 0; j < n_repeats; j++ ) { FLA_Random_matrix( A ); FLA_Random_matrix( B ); FLA_Random_matrix( x ); FLA_Random_matrix( b ); FLA_Symmetrize( uplo, A ); FLA_Symmetrize( uplo, B ); length = ( double ) FLA_Obj_length( B ); FLA_Add_to_diag( &length, B ); FLA_Symv_external( uplo, FLA_ONE, B, x, FLA_ZERO, b ); FLASH_Obj_create_hier_copy_of_flat( A, 1, &nb_alg, &AH ); FLASH_Obj_create_hier_copy_of_flat( B, 1, &nb_alg, &BH ); FLASH_Chol( uplo, BH ); dtime = FLA_Clock(); FLASH_Eig_gest( inv, uplo, AH, BH ); dtime = FLA_Clock() - dtime; dtimes[j] = dtime; FLASH_Obj_free( &AH ); FLASH_Obj_free( &BH ); } dtime = dtimes[0]; for ( j = 1; j < n_repeats; j++ ) dtime = min( dtime, dtimes[j] ); flops[i] = 1.0 * size * size * size / dtime / 1e9; #ifdef FLA_ENABLE_WINDOWS_BUILD fprintf( stdout, " %d %6.3f 
%le\n", size, flops[i], b_norm_value ); #else fprintf( fpp, " %d %6.3f\n", size, flops[i] ); fprintf( stdout, "Time: %e | GFlops: %6.3f\n", dtime, flops[i] ); fprintf( stdout, "Matrix size: %u x %u | nb_alg: %u\n", size, size, nb_alg ); fprintf( stdout, "Norm of difference: %le\n\n", b_norm_value ); #endif FLA_Obj_free( &A ); FLA_Obj_free( &B ); FLA_Obj_free( &x ); FLA_Obj_free( &b ); FLA_Obj_free( &b_norm ); } #ifdef FLA_ENABLE_WINDOWS_BUILD fprintf( stdout, "];\n\n" ); #else fprintf( fpp, "];\n" ); fflush( fpp ); fclose( fpp ); #endif FLA_free( dtimes ); FLA_free( flops ); FLA_Finalize(); return 0; }
/*
 * Double-precision real kernel that works directly on raw buffers with
 * explicit row/column strides (rs_X, cs_X) and updates A, Y, Z and T in
 * place. Two groups of matrix-vector products are delegated to the fused
 * helpers FLA_Fused_Ahx_Ax_opd_var1() and FLA_Fused_Uhu_Yhu_Zhu_opd_var1();
 * the "// FLA_..." comments above each call list the object-level
 * operations the call stands for.
 *
 * NOTE(review): going by the name and those reference comments this looks
 * like one panel step of a Hessenberg reduction using UT transforms
 * (variant 4, fused kernels) -- confirm against the object-based
 * implementation.
 *
 *   m_A      dimension used for A; also the number of rows of Y and Z that
 *            are cleared and the length of the work vector
 *   m_T      number of iterations performed (b_alg)
 *   buff_A   matrix updated in place
 *   buff_Y   workspace; its leading m_A x m_T part is zeroed on entry
 *   buff_Z   workspace; its leading m_A x m_T part is zeroed on entry
 *   buff_T   receives tau11 on the diagonal; t01 is passed to the fused helper
 *
 * One temporary vector of m_A elements is allocated with FLA_malloc() and
 * released before returning. Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Hess_UT_step_ofd_var4( int m_A,
                                     int m_T,
                                     double* buff_A, int rs_A, int cs_A,
                                     double* buff_Y, int rs_Y, int cs_Y,
                                     double* buff_Z, int rs_Z, int cs_Z,
                                     double* buff_T, int rs_T, int cs_T )
{
  double*   buff_2  = FLA_DOUBLE_PTR( FLA_TWO );
  double*   buff_1  = FLA_DOUBLE_PTR( FLA_ONE );
  double*   buff_0  = FLA_DOUBLE_PTR( FLA_ZERO );
  double*   buff_m1 = FLA_DOUBLE_PTR( FLA_MINUS_ONE );

  double    first_elem, last_elem;
  double    dot_product;
  double    beta, conj_beta;
  double    inv_tau11;
  double    minus_inv_tau11;
  int       i;

  // b_alg = FLA_Obj_length( T );
  int       b_alg = m_T;

  // Only e is allocated in this variant; the fused helper below takes t01
  // where the unfused code used d0, and is given no f0.
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &d );
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &e );
  // FLA_Obj_create( datatype_A, m_A, 1, 0, 0, &f );
  double*   buff_e = ( double* ) FLA_malloc( m_A * sizeof( *buff_A ) );
  int       inc_e  = 1;

  // FLA_Set( FLA_ZERO, Y );
  // FLA_Set( FLA_ZERO, Z );
  bl1_dsetm( m_A,
             b_alg,
             buff_0,
             buff_Y, rs_Y, cs_Y );
  bl1_dsetm( m_A,
             b_alg,
             buff_0,
             buff_Z, rs_Z, cs_Z );

  for ( i = 0; i < b_alg; ++i )
  {
    // Sub-partitions of A, Y, Z and T around the current diagonal index i.
    double*   a10t     = buff_A + (0  )*cs_A + (i  )*rs_A;
    double*   A20      = buff_A + (0  )*cs_A + (i+1)*rs_A;
    double*   alpha11  = buff_A + (i  )*cs_A + (i  )*rs_A;
    double*   a21      = buff_A + (i  )*cs_A + (i+1)*rs_A;
    double*   A02      = buff_A + (i+1)*cs_A + (0  )*rs_A;
    double*   a12t     = buff_A + (i+1)*cs_A + (i  )*rs_A;
    double*   A22      = buff_A + (i+1)*cs_A + (i+1)*rs_A;

    double*   y10t     = buff_Y + (0  )*cs_Y + (i  )*rs_Y;
    double*   Y20      = buff_Y + (0  )*cs_Y + (i+1)*rs_Y;
    double*   y21      = buff_Y + (i  )*cs_Y + (i+1)*rs_Y;

    double*   z10t     = buff_Z + (0  )*cs_Z + (i  )*rs_Z;
    double*   Z20      = buff_Z + (0  )*cs_Z + (i+1)*rs_Z;
    double*   z21      = buff_Z + (i  )*cs_Z + (i+1)*rs_Z;

    double*   t01      = buff_T + (i  )*cs_T + (0  )*rs_T;
    double*   tau11    = buff_T + (i  )*cs_T + (i  )*rs_T;

    double*   e0       = buff_e + (0  )*inc_e;

    // Last element of a10t. NOTE(review): for i == 0 this address lies one
    // column before a10t; it is only dereferenced under m_behind > 0.
    double*   a10t_r   = a10t   + (i-1)*cs_A + (0  )*rs_A;

    // Top element of a21 and the remainder below it.
    double*   a21_t    = a21    + (0  )*cs_A + (0  )*rs_A;
    double*   a21_b    = a21    + (0  )*cs_A + (1  )*rs_A;

    double*   ABL      = a10t;
    double*   ZBL      = z10t;

    double*   a2       = alpha11;

    int       m_ahead  = m_A - i - 1;
    int       n_ahead  = m_A - i - 1;
    int       m_behind = i;
    int       n_behind = i;

    /*------------------------------------------------------------*/

    if ( m_behind > 0 )
    {
      // Temporarily replace the last element of a10t with one; the saved
      // value is put back after the four updates below.
      // FLA_Copy( a10t_r, last_elem );
      // FLA_Set( FLA_ONE, a10t_r );
      last_elem = *a10t_r;
      *a10t_r = *buff_1;
    }

    // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ABL, y10t, FLA_ONE, a2 );
    // FLA_Gemvc( FLA_NO_TRANSPOSE, FLA_CONJUGATE, FLA_MINUS_ONE, ZBL, a10t, FLA_ONE, a2 );
    bl1_dgemv( BLIS1_NO_TRANSPOSE,
               BLIS1_CONJUGATE,
               m_ahead + 1,
               n_behind,
               buff_m1,
               ABL,  rs_A, cs_A,
               y10t, cs_Y,
               buff_1,
               a2,   rs_A );
    bl1_dgemv( BLIS1_NO_TRANSPOSE,
               BLIS1_CONJUGATE,
               m_ahead + 1,
               n_behind,
               buff_m1,
               ZBL,  rs_Z, cs_Z,
               a10t, cs_A,
               buff_1,
               a2,   rs_A );

    // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, a10t, FLA_ONE, a12t );
    // FLA_Gemv( FLA_CONJ_NO_TRANSPOSE, FLA_MINUS_ONE, A20, z10t, FLA_ONE, a12t );
    bl1_dgemv( BLIS1_CONJ_NO_TRANSPOSE,
               BLIS1_NO_CONJUGATE,
               m_ahead,
               n_behind,
               buff_m1,
               Y20,  rs_Y, cs_Y,
               a10t, cs_A,
               buff_1,
               a12t, cs_A );
    bl1_dgemv( BLIS1_CONJ_NO_TRANSPOSE,
               BLIS1_NO_CONJUGATE,
               m_ahead,
               n_behind,
               buff_m1,
               A20,  rs_A, cs_A,
               z10t, cs_Z,
               buff_1,
               a12t, cs_A );

    if ( m_behind > 0 )
    {
      // FLA_Copy( last_elem, a10t_r );
      *a10t_r = last_elem;
    }

    if ( m_ahead > 0 )
    {
      // FLA_Househ2_UT( FLA_LEFT,
      //                 a21_t,
      //                 a21_b, tau11 );
      FLA_Househ2_UT_l_opd( m_ahead - 1,
                            a21_t,
                            a21_b, rs_A,
                            tau11 );

      // FLA_Set( FLA_ONE, inv_tau11 );
      // FLA_Inv_scalc( FLA_NO_CONJUGATE, tau11, inv_tau11 );
      // FLA_Copy( inv_tau11, minus_inv_tau11 );
      // FLA_Scal( FLA_MINUS_ONE, minus_inv_tau11 );
      bl1_ddiv3( buff_1, tau11, &inv_tau11 );
      bl1_dneg2( &inv_tau11, &minus_inv_tau11 );

      // Temporarily replace the top element of a21 with one; the saved
      // value is restored at the end of this branch.
      // FLA_Copy( a21_t, first_elem );
      // FLA_Set( FLA_ONE, a21_t );
      first_elem = *a21_t;
      *a21_t = *buff_1;

      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A22, a21, FLA_ZERO, y21 );
      // FLA_Gemv( FLA_NO_TRANSPOSE,   FLA_ONE, A22, a21, FLA_ZERO, z21 );
      FLA_Fused_Ahx_Ax_opd_var1( m_ahead,
                                 n_ahead,
                                 A22, rs_A, cs_A,
                                 a21, rs_A,
                                 y21, rs_Y,
                                 z21, rs_Z );

      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, FLA_ZERO, d0 );
      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Y20, a21, FLA_ZERO, e0 );
      // FLA_Gemv( FLA_CONJ_TRANSPOSE, FLA_ONE, Z20, a21, FLA_ZERO, f0 );
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Y20, d0, FLA_ONE, y21 );
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, f0, FLA_ONE, y21 );
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, e0, FLA_ONE, z21 );
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, Z20, d0, FLA_ONE, z21 );
      // FLA_Copy( d0, t01 );
      FLA_Fused_Uhu_Yhu_Zhu_opd_var1( m_ahead,
                                      n_behind,
                                      buff_m1,
                                      A20, rs_A, cs_A,
                                      Y20, rs_Y, cs_Y,
                                      Z20, rs_Z, cs_Z,
                                      t01, rs_T,
                                      a21, rs_A,
                                      y21, rs_Y,
                                      z21, rs_Z );

      // FLA_Dotc( FLA_CONJUGATE, a21, z21, beta );
      // FLA_Inv_scal( FLA_TWO, beta );
      // FLA_Copyt( FLA_CONJ_NO_TRANSPOSE, beta, conj_beta );
      bl1_ddot( BLIS1_CONJUGATE,
                m_ahead,
                a21, rs_A,
                z21, rs_Z,
                &beta );
      bl1_dinvscals( buff_2, &beta );
      bl1_dcopyconj( &beta, &conj_beta );

      // FLA_Scal( minus_inv_tau11, conj_beta );
      // FLA_Axpy( conj_beta, a21, y21 );
      // FLA_Scal( inv_tau11, y21 );
      bl1_dscals( &minus_inv_tau11, &conj_beta );
      bl1_daxpyv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &conj_beta,
                  a21, rs_A,
                  y21, rs_Y );
      bl1_dscalv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &inv_tau11,
                  y21, rs_Y );

      // FLA_Scal( minus_inv_tau11, beta );
      // FLA_Axpy( beta, a21, z21 );
      // FLA_Scal( inv_tau11, z21 );
      bl1_dscals( &minus_inv_tau11, &beta );
      bl1_daxpyv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &beta,
                  a21, rs_A,
                  z21, rs_Z );
      bl1_dscalv( BLIS1_NO_CONJUGATE,
                  m_ahead,
                  &inv_tau11,
                  z21, rs_Z );

      // FLA_Dot( a12t, a21, dot_product );
      // FLA_Scal( minus_inv_tau11, dot_product );
      // FLA_Axpyt( FLA_CONJ_TRANSPOSE, dot_product, a21, a12t );
      bl1_ddot( BLIS1_NO_CONJUGATE,
                m_ahead,
                a12t, cs_A,
                a21,  rs_A,
                &dot_product );
      bl1_dscals( &minus_inv_tau11, &dot_product );
      bl1_daxpyv( BLIS1_CONJUGATE,
                  m_ahead,
                  &dot_product,
                  a21,  rs_A,
                  a12t, cs_A );

      // e0 is used as scratch for A02 * a21.
      // FLA_Gemv( FLA_NO_TRANSPOSE, FLA_ONE, A02, a21, FLA_ZERO, e0 );
      // FLA_Gerc( FLA_NO_CONJUGATE, FLA_CONJUGATE, minus_inv_tau11, e0, a21, A02 );
      bl1_dgemv( BLIS1_NO_TRANSPOSE,
                 BLIS1_NO_CONJUGATE,
                 m_behind,
                 n_ahead,
                 buff_1,
                 A02, rs_A, cs_A,
                 a21, rs_A,
                 buff_0,
                 e0,  inc_e );
      bl1_dger( BLIS1_NO_CONJUGATE,
                BLIS1_CONJUGATE,
                m_behind,
                n_ahead,
                &minus_inv_tau11,
                e0,  inc_e,
                a21, rs_A,
                A02, rs_A, cs_A );

      // FLA_Copy( first_elem, a21_t );
      *a21_t = first_elem;
    }

    /*------------------------------------------------------------*/

  }

  // FLA_Obj_free( &e );
  FLA_free( buff_e );

  return FLA_SUCCESS;
}
//Note: Only retains symmetry that exists... //Note: Mode multiplies MUST be INORDER (so that traverse stored pieces correctly). //(This might could be relaxed since no matter the loop order, we will hit the unique part only once...Not sure...I think we would just have to handle the permutations) //Step 1: Partition unaltered symmetric groups of A and C first!! //Step 2: Deal with the symGroup that will be broken FLA_Error FLA_Psttv( FLA_Obj alpha, FLA_Obj A, dim_t mode, FLA_Obj beta, FLA_Obj B, FLA_Obj C ) { dim_t i; //Determine which (if any) symmetric group can safely be repartitioned similarly //between A and C. TLA_sym symC = C.sym; dim_t symGroupToSplit = -1; dim_t nModes_part; for(i = 0; i < symC.nSymGroups; i++) if(symC.symGroupLens[i] > 1) { symGroupToSplit = i; nModes_part = symC.symGroupLens[symGroupToSplit]; break; } //No group can be split, meaning mode multiplied in is on own in both tensors. //Multiply if(symGroupToSplit == -1) { FLA_Ttm_single_mode(alpha, A, mode, beta, B, C); return FLA_SUCCESS; } else { //This is the symmetric group to split dim_t symGroupToSplitOffset = TLA_sym_group_mode_offset(symC, symGroupToSplit); dim_t* part_modes; dim_t* sizes; dim_t* repart_sizes; FLA_Side* sides; FLA_Side* repart_sides; dim_t isSingleBlock; FLA_Obj** Apart; FLA_Obj** Cpart; FLA_Obj** Arepart; FLA_Obj** Crepart; dim_t update_region_stride; dim_t update_region; FLA_Obj Apass; FLA_Obj Cpass; //Initialize Views & data for loop dim_t nPart = 1 << nModes_part; dim_t nRepart = 1; for(i = 0; i < nModes_part; i++) nRepart *= 3; //Check if we are dealing with a single block //If so, we get to just multiply in a mode isSingleBlock = TRUE; for(i = 0; i < nModes_part; i++) { if(FLA_Obj_dimsize(C, symC.symModes[symGroupToSplitOffset + i]) == 0) { return FLA_SUCCESS; } if(FLA_Obj_dimsize(C, symC.symModes[symGroupToSplitOffset + i]) > 1) { isSingleBlock = FALSE; } } if(isSingleBlock) { FLA_Ttm_single_mode(alpha, A, mode, beta, B, C); return FLA_SUCCESS; } part_modes = 
(dim_t*)FLA_malloc(nModes_part * sizeof(dim_t)); sizes = (dim_t*)FLA_malloc(nModes_part * sizeof(dim_t)); repart_sizes = (dim_t*)FLA_malloc(nModes_part * sizeof(dim_t)); sides = (FLA_Side*)FLA_malloc(nModes_part * sizeof(dim_t)); repart_sides = (FLA_Side*)FLA_malloc(nModes_part * sizeof(dim_t)); for(i = 0; i < nModes_part; i++) { part_modes[i] = symC.symModes[symGroupToSplitOffset + i]; sizes[i] = 0; repart_sizes[i] = 1; sides[i] = FLA_TOP; repart_sides[i] = FLA_BOTTOM; } //Begin loop for general tensor case Apart = (FLA_Obj**)FLA_malloc(nPart * sizeof(FLA_Obj*)); Cpart = (FLA_Obj**)FLA_malloc(nPart * sizeof(FLA_Obj*)); Arepart = (FLA_Obj**)FLA_malloc(nRepart * sizeof(FLA_Obj*)); Crepart = (FLA_Obj**)FLA_malloc(nRepart * sizeof(FLA_Obj*)); TLA_create_part_obj(nPart, Apart); TLA_create_part_obj(nPart, Cpart); TLA_create_part_obj(nRepart, Arepart); TLA_create_part_obj(nRepart, Crepart); FLA_Part_2powm(A, Apart, nModes_part, part_modes, sizes, sides); FLA_Part_2powm(C, Cpart, nModes_part, part_modes, sizes, sides); while(FLA_Obj_dimsize(*(Cpart[0]), part_modes[0]) < FLA_Obj_dimsize(C, part_modes[0])) { FLA_Repart_2powm_to_3powm(Apart, Arepart, nModes_part, part_modes, repart_sizes, repart_sides); FLA_Repart_2powm_to_3powm(Cpart, Crepart, nModes_part, part_modes, repart_sizes, repart_sides); /******************************/ update_region_stride = 1; for(i = 1; i < nModes_part; i++) { update_region_stride *= 3; } //Symmetric region being partitioned includes //symmetric tensors of order 0->order-1 //Must update ALL of them update_region = update_region_stride; for(i = 0; i < nModes_part; i++) { Apass = *(Arepart[update_region]); Cpass = *(Crepart[update_region]); FLA_Psttv(alpha, Apass, mode, beta, B, Cpass); update_region_stride /= 3; update_region += update_region_stride; } /******************************/ FLA_Cont_with_3powm_to_2powm(Apart, Arepart, nModes_part, part_modes, repart_sides); FLA_Cont_with_3powm_to_2powm(Cpart, Crepart, nModes_part, part_modes, 
repart_sides); } //Tidy up alloc'd data TLA_destroy_part_obj(nPart, Apart); TLA_destroy_part_obj(nPart, Cpart); TLA_destroy_part_obj(nRepart, Arepart); TLA_destroy_part_obj(nRepart, Crepart); FLA_free(part_modes); FLA_free(sizes); FLA_free(repart_sizes); FLA_free(sides); FLA_free(repart_sides); FLA_free(Apart); FLA_free(Cpart); FLA_free(Arepart); FLA_free(Crepart); } return FLA_SUCCESS; }
int main( int argc, char *argv[] ) { int i, j, size, n_threads, n_repeats, n_trials, nb_alg, increment, begin; FLA_Datatype datatype = FLA_DOUBLE; FLA_Obj A; double b_norm_value = 0.0, dtime, *dtimes, *flops, *T; char output_file_m[100]; FILE *fpp; fprintf( stdout, "%c Enter number of repeats: ", '%' ); scanf( "%d", &n_repeats ); fprintf( stdout, "%c %d\n", '%', n_repeats ); fprintf( stdout, "%c Enter blocksize: ", '%' ); scanf( "%d", &nb_alg ); fprintf( stdout, "%c %d\n", '%', nb_alg ); fprintf( stdout, "%c Enter problem size parameters: first, inc, num: ", '%' ); scanf( "%d%d%d", &begin, &increment, &n_trials ); fprintf( stdout, "%c %d %d %d\n", '%', begin, increment, n_trials ); fprintf( stdout, "%c Enter number of threads: ", '%' ); scanf( "%d", &n_threads ); fprintf( stdout, "%c %d\n\n", '%', n_threads ); sprintf( output_file_m, "%s/%s_output.m", OUTPUT_PATH, OUTPUT_FILE ); fpp = fopen( output_file_m, "a" ); fprintf( fpp, "%%\n" ); fprintf( fpp, "%% | Matrix Size | PLASMA |\n" ); fprintf( fpp, "%% | n x n | GFlops |\n" ); fprintf( fpp, "%% -----------------------------\n" ); FLA_Init(); PLASMA_Init( n_threads ); PLASMA_Disable( PLASMA_AUTOTUNING ); PLASMA_Set( PLASMA_TILE_SIZE, nb_alg ); PLASMA_Set( PLASMA_INNER_BLOCK_SIZE, nb_alg / 4 ); dtimes = ( double * ) FLA_malloc( n_repeats * sizeof( double ) ); flops = ( double * ) FLA_malloc( n_trials * sizeof( double ) ); fprintf( fpp, "%s = [\n", OUTPUT_FILE ); for ( i = 0; i < n_trials; i++ ) { size = begin + i * increment; FLA_Obj_create( datatype, size, size, 0, 0, &A ); for ( j = 0; j < n_repeats; j++ ) { FLA_Random_matrix( A ); PLASMA_Alloc_Workspace_dgeqrf( size, size, &T ); dtime = FLA_Clock(); PLASMA_dgeqrf( size, size, FLA_Obj_buffer_at_view( A ), size, T ); dtime = FLA_Clock() - dtime; dtimes[j] = dtime; free( T ); } dtime = dtimes[0]; for ( j = 1; j < n_repeats; j++ ) dtime = min( dtime, dtimes[j] ); flops[i] = 4.0 / 3.0 * size * size * size / dtime / 1e9; fprintf( fpp, " %d %6.3f\n", size, flops[i] ); 
printf( "Time: %e | GFlops: %6.3f\n", dtime, flops[i] ); printf( "Matrix size: %d x %d | nb_alg: %d\n", size, size, nb_alg ); printf( "Norm of difference: %le\n\n", b_norm_value ); FLA_Obj_free( &A ); } fprintf( fpp, "];\n" ); fflush( fpp ); fclose( fpp ); FLA_free( dtimes ); FLA_free( flops ); PLASMA_Finalize(); FLA_Finalize(); return 0; }