FLA_Error FLASH_LU_piv( FLA_Obj A, FLA_Obj p ) { FLA_Error r_val = FLA_SUCCESS; // Check parameters. if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) FLA_LU_piv_check( A, p ); // *** The current LU_piv algorithm implemented assumes that // the matrix has a hierarchical depth of 1. We check for that here, because // we anticipate that we'll use a more general algorithm in the future, and // we don't want to forget to remove the constraint. *** if ( FLASH_Obj_depth( A ) != 1 ) { FLA_Print_message( "FLASH_LU_piv() currently only supports matrices of depth 1", __FILE__, __LINE__ ); FLA_Abort(); } // Begin a parallel region. FLASH_Queue_begin(); // Invoke FLA_LU_piv_internal() with large control tree. FLA_LU_piv_internal( A, p, flash_lu_piv_cntl ); // End the parallel region. FLASH_Queue_end(); // Check for singularity. if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) r_val = FLASH_LU_find_zero_on_diagonal( A ); return r_val; }
FLA_Error FLA_Check_error_code_helper( int code, char* file, int line ) { if ( code == FLA_SUCCESS ) return code; //if ( /* fatal error checking enabled */ ) if ( TRUE ) { if ( FLA_ERROR_CODE_MAX <= code && code <= FLA_ERROR_CODE_MIN ) { FLA_Print_message( FLA_Error_string_for_code( code ), file, line ); FLA_Abort(); } else { FLA_Print_message( FLA_Error_string_for_code( FLA_UNDEFINED_ERROR_CODE ), file, line ); FLA_Abort(); } } return code; }
void F77_fla_obj_show( char* prefix, int* m, int* n, void* buffer, int* ldim ) { FLA_Error init_result; FLA_Datatype datatype; FLA_Obj A; switch( *prefix ) { case 'i': case 'I': datatype = FLA_INT; break; case 's': case 'S': datatype = FLA_FLOAT; break; case 'd': case 'D': datatype = FLA_DOUBLE; break; case 'c': case 'C': datatype = FLA_COMPLEX; break; case 'z': case 'Z': datatype = FLA_DOUBLE_COMPLEX; break; default: fprintf(stderr, "Invalid prefix %c, where i,s,d,c,z are allowed.\n", *prefix); FLA_Abort(); } FLA_Init_safe( &init_result ); FLA_Obj_create_without_buffer( datatype, *m, *n, &A ); FLA_Obj_attach_buffer( buffer, 1, *ldim, &A ); FLA_Obj_fshow( stdout, "= F77_FLA_OBJ_SHOW =", A, "% 6.4e", "=-=-=-=-=-=-=-=-=-=-\n"); FLA_Obj_free_without_buffer( &A ); FLA_Finalize_safe( init_result ); }
FLA_Error FLASH_Apply_Q_UT( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B ) { FLA_Error r_val; dim_t b_alg; // Check parameters. if ( FLA_Check_error_level() >= FLA_MIN_ERROR_CHECKING ) FLA_Apply_Q_UT_check( side, trans, direct, storev, A, T, W, B ); // Inspect the length of TTL to get the blocksize used by the QR/LQ // factorization, which will be our inner blocksize for Apply_Q_UT. b_alg = FLASH_Obj_scalar_length_tl( T ); // The traditional (non-incremental) Apply_Q_UT algorithm-by-blocks // requires that the algorithmic blocksize be equal to the storage // blocksize. if ( b_alg != FLASH_Obj_scalar_width_tl( T ) ) { FLA_Print_message( "FLASH_Apply_Q_UT() requires that b_alg == b_store", __FILE__, __LINE__ ); FLA_Abort(); } // Adjust the blocksize of the control tree node for the flat subproblem. if ( FLA_Cntl_blocksize( fla_apqut_cntl_leaf ) != NULL ) FLA_Blocksize_set( FLA_Cntl_blocksize( fla_apqut_cntl_leaf ), b_alg, b_alg, b_alg, b_alg ); // Begin a parallel region. FLASH_Queue_begin(); // Invoke FLA_Apply_Q_UT_internal() with the standard control tree. r_val = FLA_Apply_Q_UT_internal( side, trans, direct, storev, A, T, W, B, flash_apqut_cntl_blas ); // End the parallel region. FLASH_Queue_end(); return r_val; }
int FLA_task_determine_matrix_size( FLA_Obj A, FLA_Quadrant from ) { int r_val = 0; // Determine the size of the matrix dimension along which we are moving. switch( from ) { case FLA_TOP: case FLA_BOTTOM: { r_val = FLA_Obj_length( A ); break; } case FLA_LEFT: case FLA_RIGHT: { r_val = FLA_Obj_width( A ); break; } case FLA_TL: case FLA_TR: case FLA_BL: case FLA_BR: { // If A happens to be the full object, we need to use min_dim() here // because the matrix might be rectangular. If A is the processed // partition, it is very probably square, and min_dim() doesn't hurt. r_val = FLA_Obj_min_dim( A ); break; } default: FLA_Print_message( "Unexpected default in switch statement!", __FILE__, __LINE__ ); FLA_Abort(); } return r_val; }
int FLA_Task_compute_blocksize( int tag, FLA_Obj A, FLA_Obj A_proc, FLA_Quadrant from ) { int n_threads = FLA_Queue_get_num_threads(); int A_size, A_proc_size; int n_part; int b; // Determine the sizes of the matrix partitions. A_size = FLA_task_determine_matrix_size( A, from ); A_proc_size = FLA_task_determine_matrix_size( A_proc, from ); // Determine the raw blocksize value. n_part = FLA_task_get_num_partitions( n_threads, tag ); // Determine the blocksize based on the sign of the value from // _get_num_partitions(). if( n_part > 0 ) { b = FLA_task_determine_absolute_blocksize( A_size, A_proc_size, n_part ); } else if( n_part < 0 ) { b = FLA_task_determine_relative_blocksize( A_size, A_proc_size, abs(n_part) ); } else { FLA_Print_message( "Detected blocksize of 0!", __FILE__, __LINE__ ); FLA_Abort(); } return b; }
FLA_Error FLA_Tevd_v_opz_var2( int m_A, int m_U, int n_G, int n_iter_max, double* buff_d, int inc_d, double* buff_e, int inc_e, dcomplex* buff_G, int rs_G, int cs_G, double* buff_R, int rs_R, int cs_R, dcomplex* buff_W, int rs_W, int cs_W, dcomplex* buff_U, int rs_U, int cs_U, int b_alg ) { dcomplex one = bl1_z1(); double rone = bl1_d1(); double rzero = bl1_d0(); dcomplex* G; double* d1; double* e1; int r_val; int done; int m_G_sweep_max; int ij_begin; int ijTL, ijBR; int m_A11; int n_iter_perf; int n_U_apply; int total_deflations; int n_deflations; int n_iter_prev; int n_iter_perf_sweep_max; // Initialize our completion flag. done = FALSE; // Initialize a counter that holds the maximum number of rows of G // that we would need to initialize for the next sweep. m_G_sweep_max = m_A - 1; // Initialize a counter for the total number of iterations performed. n_iter_prev = 0; // Initialize R to identity. bl1_dident( m_A, buff_R, rs_R, cs_R ); // Iterate until the matrix has completely deflated. for ( total_deflations = 0; done != TRUE; ) { // Initialize G to contain only identity rotations. bl1_zsetm( m_G_sweep_max, n_G, &one, buff_G, rs_G, cs_G ); // Keep track of the maximum number of iterations performed in the // current sweep. This is used when applying the sweep's Givens // rotations. n_iter_perf_sweep_max = 0; // Perform a sweep: Move through the matrix and perform a tridiagonal // EVD on each non-zero submatrix that is encountered. During the // first time through, ijTL will be 0 and ijBR will be m_A - 1. for ( ij_begin = 0; ij_begin < m_A; ) { #ifdef PRINTF if ( ij_begin == 0 ) printf( "FLA_Tevd_v_opz_var2: beginning new sweep (ij_begin = %d)\n", ij_begin ); #endif // Search for the first submatrix along the diagonal that is // bounded by zeroes (or endpoints of the matrix). If no // submatrix is found (ie: if the entire subdiagonal is zero // then FLA_FAILURE is returned. This function also inspects // subdiagonal elements for proximity to zero. If a given // element is close enough to zero, then it is deemed // converged and manually set to zero. r_val = FLA_Tevd_find_submatrix_opd( m_A, ij_begin, buff_d, inc_d, buff_e, inc_e, &ijTL, &ijBR ); // Verify that a submatrix was found. If one was not found, // then we are done with the current sweep. Furthermore, if // a submatrix was not found AND we began our search at the // beginning of the matrix (ie: ij_begin == 0), then the // matrix has completely deflated and so we are done with // Francis step iteration. if ( r_val == FLA_FAILURE ) { if ( ij_begin == 0 ) { #ifdef PRINTF printf( "FLA_Tevd_v_opz_var2: subdiagonal is completely zero.\n" ); printf( "FLA_Tevd_v_opz_var2: Francis iteration is done!\n" ); #endif done = TRUE; } // Break out of the current sweep so we can apply the last // remaining Givens rotations. break; } // If we got this far, then: // (a) ijTL refers to the index of the first non-zero // subdiagonal along the diagonal, and // (b) ijBR refers to either: // - the first zero element that occurs after ijTL, or // - the the last diagonal element. // Note that ijTL and ijBR also correspond to the first and // last diagonal elements of the submatrix of interest. Thus, // we may compute the dimension of this submatrix as: m_A11 = ijBR - ijTL + 1; #ifdef PRINTF printf( "FLA_Tevd_v_opz_var2: ij_begin = %d\n", ij_begin ); printf( "FLA_Tevd_v_opz_var2: ijTL = %d\n", ijTL ); printf( "FLA_Tevd_v_opz_var2: ijBR = %d\n", ijBR ); printf( "FLA_Tevd_v_opz_var2: m_A11 = %d\n", m_A11 ); #endif // Adjust ij_begin, which gets us ready for the next subproblem, if // there is one. ij_begin = ijBR + 1; // Index to the submatrices upon which we will operate. d1 = buff_d + ijTL * inc_d; e1 = buff_e + ijTL * inc_e; G = buff_G + ijTL * rs_G; // Search for a batch of eigenvalues, recursing on deflated // subproblems whenever a split occurs. Iteration continues // as long as // (a) there is still matrix left to operate on, and // (b) the number of iterations performed in this batch is // less than n_G. // If/when either of the two above conditions fails to hold, // the function returns. n_deflations = FLA_Tevd_iteracc_v_opd_var1( m_A11, n_G, ijTL, d1, inc_d, e1, inc_e, G, rs_G, cs_G, &n_iter_perf ); // Record the number of deflations that we observed. total_deflations += n_deflations; // Update the maximum number of iterations performed in the // current sweep. n_iter_perf_sweep_max = max( n_iter_perf_sweep_max, n_iter_perf ); #ifdef PRINTF printf( "FLA_Tevd_v_opz_var2: deflations observed = %d\n", n_deflations ); printf( "FLA_Tevd_v_opz_var2: total deflations observed = %d\n", total_deflations ); printf( "FLA_Tevd_v_opz_var2: num iterations = %d\n", n_iter_perf ); #endif // Store the most recent value of ijBR in m_G_sweep_max. // When the sweep is done, this value will contain the minimum // number of rows of G we can apply and safely include all // non-identity rotations that were computed during the // eigenvalue searches. m_G_sweep_max = ijBR; // Make sure we haven't exceeded our maximum iteration count. if ( n_iter_prev >= m_A * n_iter_max ) { #ifdef PRINTF printf( "FLA_Tevd_v_opz_var2: reached maximum total number of iterations: %d\n", n_iter_prev ); #endif FLA_Abort(); //return FLA_FAILURE; } } // The sweep is complete. Now we must apply the Givens rotations // that were accumulated during the sweep. // Recall that the number of columns of U to which we apply // rotations is one more than the number of rotations. n_U_apply = m_G_sweep_max + 1; // Apply the Givens rotations that were computed as part of // the previous batch of iterations. //FLA_Apply_G_rf_bld_var8b( n_iter_perf_sweep_max, //FLA_Apply_G_rf_bld_var5b( n_iter_perf_sweep_max, FLA_Apply_G_rf_bld_var3b( n_iter_perf_sweep_max, //FLA_Apply_G_rf_bld_var9b( n_iter_perf_sweep_max, //FLA_Apply_G_rf_bld_var6b( n_iter_perf_sweep_max, m_U, n_U_apply, n_iter_prev, buff_G, rs_G, cs_G, buff_R, rs_R, cs_R, b_alg ); #ifdef PRINTF printf( "FLA_Tevd_v_opz_var2: applying %d sets of Givens rotations\n", n_iter_perf_sweep_max ); #endif // Increment the total number of iterations previously performed. n_iter_prev += n_iter_perf_sweep_max; } // Copy the contents of Q to temporary storage. bl1_zcopymt( BLIS1_NO_TRANSPOSE, m_A, m_A, buff_U, rs_U, cs_U, buff_W, rs_W, cs_W ); // Multiply Q by R, overwriting U. bl1_dgemm( BLIS1_NO_TRANSPOSE, BLIS1_NO_TRANSPOSE, 2*m_A, m_A, m_A, &rone, ( double* )buff_W, rs_W, 2*cs_W, buff_R, rs_R, cs_R, &rzero, ( double* )buff_U, rs_U, 2*cs_U ); return n_iter_prev; }