static boost::tuple<
    boost::shared_ptr<Matrix>,
    boost::shared_ptr<Matrix>
    >
transfer_operators(const Matrix &A, params &prm) {
    typedef typename backend::value_type<Matrix>::type V;

    const size_t n = rows(A);

    TIC("aggregates");
    Aggregates aggr(A, prm.aggr, prm.nullspace.cols);
    TOC("aggregates");

    TIC("interpolation");
    boost::shared_ptr<Matrix> P = tentative_prolongation<Matrix>(
            n, aggr.count, aggr.id, prm.nullspace, prm.aggr.block_size
            );
    TOC("interpolation");

    boost::shared_ptr<Matrix> R = boost::make_shared<Matrix>();
    *R = transpose(*P);

    if (prm.nullspace.cols > 0)
        prm.aggr.block_size = prm.nullspace.cols;

    return boost::make_tuple(P, R);
}
amg(const Matrix &M, const params &p = params()) : prm(p) {
    precondition(
            backend::rows(M) == backend::cols(M),
            "Matrix should be square!"
            );

    boost::shared_ptr<build_matrix> P, R;
    boost::shared_ptr<build_matrix> A = boost::make_shared<build_matrix>( M );
    sort_rows(*A);

    while( backend::rows(*A) > prm.coarse_enough) {
        TIC("transfer operators");
        boost::tie(P, R) = Coarsening::transfer_operators(*A, prm.coarsening);

        precondition(
                backend::cols(*P) > 0,
                "Zero-sized coarse level in amgcl (diagonal matrix?)"
                );
        TOC("transfer operators");

        TIC("move to backend")
        levels.push_back( level(A, P, R, prm) );
        TOC("move to backend")

        TIC("coarse operator");
        A = Coarsening::coarse_operator(*A, *P, *R, prm.coarsening);
        sort_rows(*A);
        TOC("coarse operator");
    }

    TIC("coarsest level");
    levels.push_back( level(A, prm, levels.empty()) );
    TOC("coarsest level");
}
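/* The hierarchy loop above alternates transfer_operators() with the Galerkin
 * coarse operator Ac = R * A * P. The following is a minimal dense-matrix
 * sketch of that triple product, for reference only: the Dense typedef and
 * galerkin() are illustrative names, and amgcl's Coarsening::coarse_operator
 * performs the same product on sparse CRS matrices instead. */
#include <vector>

typedef std::vector<std::vector<double> > Dense;

// Naive Galerkin product Ac = R * A * P; R is nc-by-n, A is n-by-n, P is n-by-nc.
Dense galerkin(const Dense &R, const Dense &A, const Dense &P) {
    std::size_t nc = R.size(), n = A.size();

    // T = A * P  (n-by-nc)
    Dense T(n, std::vector<double>(nc, 0.0));
    for (std::size_t i = 0; i < n; ++i)
        for (std::size_t k = 0; k < n; ++k)
            for (std::size_t j = 0; j < nc; ++j)
                T[i][j] += A[i][k] * P[k][j];

    // Ac = R * T  (nc-by-nc)
    Dense Ac(nc, std::vector<double>(nc, 0.0));
    for (std::size_t i = 0; i < nc; ++i)
        for (std::size_t k = 0; k < n; ++k)
            for (std::size_t j = 0; j < nc; ++j)
                Ac[i][j] += R[i][k] * T[k][j];
    return Ac;
}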
static std::pair<
    sparse::matrix<value_t, index_t>,
    sparse::matrix<value_t, index_t>
    >
interp(const sparse::matrix<value_t, index_t> &A, const params &prm) {
    const index_t n = sparse::matrix_rows(A);

    std::vector<index_t> aggr;

    assert(prm.dof_per_node > 0);

    if (prm.dof_per_node == 1) {
        // Scalar system. Nothing fancy.
        TIC("aggregates");
        aggr_type::aggregates(A, aggr::connect(A, prm.eps_strong)).swap(aggr);
        TOC("aggregates");
    } else {
        // Non-scalar system.
        // Build reduced matrix, find connections and aggregates with it,
        // restore the vectors to full size.
        std::pair<std::vector<char>, std::vector<index_t> > S_aggr =
            aggr::pointwise_coarsening<aggr_type>(
                    A, prm.eps_strong, prm.dof_per_node);
        aggr.swap(S_aggr.second);
    }

    index_t nc = std::max(
            static_cast<index_t>(0),
            *std::max_element(aggr.begin(), aggr.end()) + static_cast<index_t>(1)
            );

    TIC("interpolation");
    static std::pair<
        sparse::matrix<value_t, index_t>,
        sparse::matrix<value_t, index_t>
        > PR;

    sparse::matrix<value_t, index_t> &P = PR.first;
    sparse::matrix<value_t, index_t> &R = PR.second;

    P.resize(n, nc);
    P.col.reserve(n);
    P.val.reserve(n);

    P.row[0] = 0;
    for(index_t i = 0; i < n; ++i) {
        if (aggr[i] >= 0) {
            P.row[i + 1] = P.row[i] + 1;
            P.col.push_back(aggr[i]);
            P.val.push_back(static_cast<value_t>(1));
        } else {
            P.row[i + 1] = P.row[i];
        }
    }
    TOC("interpolation");

    sparse::transpose(P).swap(R);
    return PR;
}
double exchange(
    int dim_x, int dim_y, int dim_z,
    double delta_x, double delta_y, double delta_z,
    bool periodic_x, bool periodic_y, bool periodic_z,
    const Matrix &Ms, const Matrix &A,
    const VectorMatrix &M, VectorMatrix &H)
{
    const bool use_cuda = isCudaEnabled();

    double res = 0;
    if (use_cuda) {
#ifdef HAVE_CUDA
        CUTIC("exchange");
        res = exchange_cuda(dim_x, dim_y, dim_z, delta_x, delta_y, delta_z,
                            periodic_x, periodic_y, periodic_z,
                            Ms, A, M, H, isCuda64Enabled());
        CUTOC("exchange");
#else
        assert(0);
#endif
    } else {
        TIC("exchange");
        res = exchange_cpu(dim_x, dim_y, dim_z, delta_x, delta_y, delta_z,
                           periodic_x, periodic_y, periodic_z, Ms, A, M, H);
        TOC("exchange");
    }
    return res;
}
double cubic_anisotropy(
    const VectorMatrix &axis1, const VectorMatrix &axis2,
    const Matrix &k, const Matrix &Ms,
    const VectorMatrix &M, VectorMatrix &H)
{
    const bool use_cuda = isCudaEnabled();

    double energy_sum = 0.0;
    if (use_cuda) {
#ifdef HAVE_CUDA
        CUTIC("cubic_anisotropy");
        energy_sum = cubic_anisotropy_cuda(axis1, axis2, k, Ms, M, H,
                                           isCuda64Enabled());
        CUTOC("cubic_anisotropy");
#else
        assert(0);
#endif
    } else {
        TIC("cubic_anisotropy");
        energy_sum = cubic_anisotropy_cpu(axis1, axis2, k, Ms, M, H);
        TOC("cubic_anisotropy");
    }
    return energy_sum;
}
void minimize(
    const Matrix &f, const double h,
    const VectorMatrix &M, const VectorMatrix &H,
    VectorMatrix &M2)
{
    const bool use_cuda = isCudaEnabled();

    if (use_cuda) {
#ifdef HAVE_CUDA
        CUTIC("minimize");
#ifdef HAVE_CUDA_64
        if (isCuda64Enabled())
            minimize_cu64(f, h, M, H, M2);
        else
#endif
            minimize_cu32(f, h, M, H, M2);
        CUTOC("minimize");
#else
        assert(0);
#endif
    } else {
        TIC("minimize");
        minimize_cpu(f, h, M, H, M2);
        TOC("minimize");
    }
}
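/* The wrappers above bracket each kernel with TIC/TOC (CUTIC/CUTOC on the
 * GPU path). The macro definitions are project-specific and not part of this
 * collection; what follows is a minimal sketch of a string-keyed pair that
 * accumulates elapsed seconds per label -- an assumption for illustration
 * only, not the actual definitions (other snippets below index their timers
 * by integer or by bare identifier instead). */
#include <chrono>
#include <map>
#include <string>

static std::map<std::string, double> prof_acc; // label -> accumulated seconds
static std::map<std::string, std::chrono::steady_clock::time_point> prof_beg;

#define TIC(name) (prof_beg[name] = std::chrono::steady_clock::now())
#define TOC(name) (prof_acc[name] += std::chrono::duration<double>( \
        std::chrono::steady_clock::now() - prof_beg[name]).count())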
void nfst_adjoint( nfst_plan *ths)
{
    /**
     * use ths->my_fftw_plan
     **/
    ths->g_hat = ths->g2;
    ths->g     = ths->g1;

    /**
     * set \f$ g_l = \sum_{j=0}^{M-1} f_j \psi\left(x_j-\frac{l}{n}\right)
     * \text{ for } l \in I_n,m(x_j) \f$
     */
    TIC(2)
    nfst_B_T( ths);
    TOC(2)

    /**
     * compute by d-variate discrete cosine transform
     * \f$ \hat g_k = \sum_{l \in I_n} g_l {\rm e}^{-2\pi {\rm i} \frac{kl}{n}}
     * \text{ for } k \in I_N \f$
     */
    TIC(1)
    fftw_execute( ths->my_fftw_r2r_plan);
    TOC(1)

    /**
     * form \f$ \hat f_k = \frac{\hat g_k}{c_k\left(\phi\right)} \text{ for }
     * k \in I_N \f$
     */
    TIC(0)
    nfst_D_T( ths);
    TOC(0)
} /* nfst_adjoint */
/** user routines */
void nfst_trafo( nfst_plan *ths)
{
    /**
     * use ths->my_fftw_r2r_plan
     */
    ths->g_hat = ths->g1;
    ths->g     = ths->g2;

    /**
     * form \f$ \hat g_k = \frac{\hat f_k}{c_k\left(\phi\right)} \text{ for }
     * k \in I_N \f$
     */
    TIC(0)
    nfst_D_A( ths);
    TOC(0)

    /**
     * compute by d-variate discrete Fourier transform
     * \f$ g_l = \sum_{k \in I_N} \hat g_k {\rm e}^{-2\pi {\rm i} \frac{kl}{n}}
     * \text{ for } l \in I_n \f$
     */
    TIC(1)
    fftw_execute( ths->my_fftw_r2r_plan);
    TOC(1)

    /**
     * set \f$ f_j = \sum_{l \in I_n,m(x_j)} g_l \psi\left(x_j-\frac{l}{n}\right)
     * \text{ for } j=0,\hdots,M-1 \f$
     */
    TIC(2)
    nfst_B_A( ths);
    TOC(2)
} /* nfst_trafo */
void FeatureExtractor::calcObservedActions(Observation prevObs, Observation obs,
                                           std::vector<Action::Type> &actions) {
    actions.resize(prevObs.positions.size());
    TIC(historyuncenter);
    prevObs.uncenterPrey(dims);
    obs.uncenterPrey(dims);
    TOC(historyuncenter);
    //std::cout << prevObs << " " << obs << std::endl << std::flush;
    bool prevCapture = obs.didPreyMoveIllegally(dims, prevObs.absPrey);
    for (unsigned int i = 0; i < prevObs.positions.size(); i++) {
        // skip if the prey was captured last step
        if (prevCapture && ((int)i == obs.preyInd)) {
            actions[i] = Action::NUM_ACTIONS;
            continue;
        }
        TIC(historydiff);
        Point2D diff = getDifferenceToPoint(dims, prevObs.positions[i],
                                            obs.positions[i]);
        TOC(historydiff);
        TIC(historyaction);
        //actions.push_back(getAction(diff));
        actions[i] = getAction(diff);
        TOC(historyaction);
    }
}
void FeatureExtractor::updateHistory(const Observation &obs,
                                     FeatureExtractorHistory &history) {
    std::vector<Action::Type> observedActions;
    if (history.initialized) {
        TIC(historycalc);
        calcObservedActions(history.obs, obs, observedActions);
        TOC(historycalc);
    } else {
        //std::cout << "no hist " << obs << std::endl;
        for (unsigned int i = 0; i < obs.positions.size(); i++) {
            history.actionHistory.push_back(boost::circular_buffer<Action::Type>(HISTORY_SIZE));
            observedActions.push_back(Action::NUM_ACTIONS);
        }
    }
    for (unsigned int agentInd = 0; agentInd < obs.positions.size(); agentInd++) {
        history.actionHistory[agentInd].push_front(observedActions[agentInd]);
    }
    history.initialized = true;
    history.obs = obs;
}
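/* updateHistory() above relies on boost::circular_buffer for a bounded
 * per-agent action history: push_front() puts the newest entry at index 0
 * and silently evicts the oldest once the capacity is reached. A small
 * standalone usage sketch (the capacity 3 stands in for HISTORY_SIZE): */
#include <boost/circular_buffer.hpp>
#include <iostream>

int main() {
    boost::circular_buffer<int> hist(3); // capacity: HISTORY_SIZE = 3
    for (int a = 1; a <= 4; ++a)
        hist.push_front(a); // 4 pushes into capacity 3: the 1 is evicted
    for (boost::circular_buffer<int>::const_iterator it = hist.begin();
         it != hist.end(); ++it)
        std::cout << *it << ' '; // prints: 4 3 2
    std::cout << '\n';
    return 0;
}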
/* Limit, stabilize, convert and quantize NLSFs */
void silk_process_NLSFs(
    silk_encoder_state  *psEncC,                            /* I/O  Encoder state                               */
    opus_int16          PredCoef_Q12[ 2 ][ MAX_LPC_ORDER ], /* O    Prediction coefficients                     */
    opus_int16          pNLSF_Q15[ MAX_LPC_ORDER ],         /* I/O  Normalized LSFs (quant out) (0 - (2^15-1))  */
    const opus_int16    prev_NLSFq_Q15[ MAX_LPC_ORDER ]     /* I    Previous Normalized LSFs (0 - (2^15-1))     */
)
{
    opus_int     i, doInterpolate;
    opus_int     NLSF_mu_Q20;
    opus_int32   i_sqr_Q15;
    opus_int16   pNLSF0_temp_Q15[ MAX_LPC_ORDER ];
    opus_int16   pNLSFW_QW[ MAX_LPC_ORDER ];
    opus_int16   pNLSFW0_temp_QW[ MAX_LPC_ORDER ];

    SKP_assert( psEncC->speech_activity_Q8 >= 0 );
    SKP_assert( psEncC->speech_activity_Q8 <= SILK_FIX_CONST( 1.0, 8 ) );

    /***********************/
    /* Calculate mu values */
    /***********************/
    /* NLSF_mu = 0.0025 - 0.001 * psEnc->speech_activity; */
    NLSF_mu_Q20 = SKP_SMLAWB( SILK_FIX_CONST( 0.0025, 20 ), SILK_FIX_CONST( -0.001, 28 ), psEncC->speech_activity_Q8 );
    if( psEncC->nb_subfr == 2 ) {
        /* Multiply by 1.5 for 10 ms packets */
        NLSF_mu_Q20 = SKP_ADD_RSHIFT( NLSF_mu_Q20, NLSF_mu_Q20, 1 );
    }

    SKP_assert( NLSF_mu_Q20 >  0 );
    SKP_assert( NLSF_mu_Q20 <= SILK_FIX_CONST( 0.0045, 20 ) );

    /* Calculate NLSF weights */
    silk_NLSF_VQ_weights_laroia( pNLSFW_QW, pNLSF_Q15, psEncC->predictLPCOrder );

    /* Update NLSF weights for interpolated NLSFs */
    doInterpolate = ( psEncC->useInterpolatedNLSFs == 1 ) && ( psEncC->indices.NLSFInterpCoef_Q2 < 4 );
    if( doInterpolate ) {
        /* Calculate the interpolated NLSF vector for the first half */
        silk_interpolate( pNLSF0_temp_Q15, prev_NLSFq_Q15, pNLSF_Q15,
            psEncC->indices.NLSFInterpCoef_Q2, psEncC->predictLPCOrder );

        /* Calculate first half NLSF weights for the interpolated NLSFs */
        silk_NLSF_VQ_weights_laroia( pNLSFW0_temp_QW, pNLSF0_temp_Q15, psEncC->predictLPCOrder );

        /* Update NLSF weights with contribution from first half */
        i_sqr_Q15 = SKP_LSHIFT( SKP_SMULBB( psEncC->indices.NLSFInterpCoef_Q2, psEncC->indices.NLSFInterpCoef_Q2 ), 11 );
        for( i = 0; i < psEncC->predictLPCOrder; i++ ) {
            pNLSFW_QW[ i ] = SKP_SMLAWB( SKP_RSHIFT( pNLSFW_QW[ i ], 1 ), pNLSFW0_temp_QW[ i ], i_sqr_Q15 );
            SKP_assert( pNLSFW_QW[ i ] <= SKP_int16_MAX );
            SKP_assert( pNLSFW_QW[ i ] >= 1 );
        }
    }

    TIC(NLSF_encode)
    silk_NLSF_encode( psEncC->indices.NLSFIndices, pNLSF_Q15, psEncC->psNLSF_CB, pNLSFW_QW,
        NLSF_mu_Q20, psEncC->NLSF_MSVQ_Survivors, psEncC->indices.signalType );
    TOC(NLSF_encode)

    /* Convert quantized NLSFs back to LPC coefficients */
    silk_NLSF2A( PredCoef_Q12[ 1 ], pNLSF_Q15, psEncC->predictLPCOrder );

    if( doInterpolate ) {
        /* Calculate the interpolated, quantized LSF vector for the first half */
        silk_interpolate( pNLSF0_temp_Q15, prev_NLSFq_Q15, pNLSF_Q15,
            psEncC->indices.NLSFInterpCoef_Q2, psEncC->predictLPCOrder );

        /* Convert back to LPC coefficients */
        silk_NLSF2A( PredCoef_Q12[ 0 ], pNLSF0_temp_Q15, psEncC->predictLPCOrder );
    } else {
        /* Copy LPC coefficients for first half from second half */
        SKP_memcpy( PredCoef_Q12[ 0 ], PredCoef_Q12[ 1 ], psEncC->predictLPCOrder * sizeof( opus_int16 ) );
    }
}
static boost::tuple<
    boost::shared_ptr<Matrix>,
    boost::shared_ptr<Matrix>
    >
transfer_operators(const Matrix &A, params &prm) {
    typedef typename backend::value_type<Matrix>::type value_type;
    typedef typename math::scalar_of<value_type>::type scalar_type;

    const size_t n = rows(A);

    BOOST_AUTO(Aptr, A.ptr_data());
    BOOST_AUTO(Acol, A.col_data());
    BOOST_AUTO(Aval, A.val_data());

    TIC("aggregates");
    Aggregates aggr(A, prm.aggr, prm.nullspace.cols);
    prm.aggr.eps_strong *= 0.5;
    TOC("aggregates");

    TIC("interpolation");
    boost::shared_ptr<Matrix> P_tent = tentative_prolongation<Matrix>(
            n, aggr.count, aggr.id, prm.nullspace, prm.aggr.block_size
            );

    boost::shared_ptr<Matrix> P = boost::make_shared<Matrix>();
    P->nrows = rows(*P_tent);
    P->ncols = cols(*P_tent);
    P->ptr.resize(n + 1, 0);

#pragma omp parallel
    {
        std::vector<ptrdiff_t> marker(P->ncols, -1);

#ifdef _OPENMP
        int nt  = omp_get_num_threads();
        int tid = omp_get_thread_num();

        ptrdiff_t chunk_size  = (n + nt - 1) / nt;
        ptrdiff_t chunk_start = tid * chunk_size;
        ptrdiff_t chunk_end   = std::min<ptrdiff_t>(n, chunk_start + chunk_size);
#else
        ptrdiff_t chunk_start = 0;
        ptrdiff_t chunk_end   = n;
#endif

        // Count number of entries in P.
        for(ptrdiff_t i = chunk_start; i < chunk_end; ++i) {
            for(ptrdiff_t ja = Aptr[i], ea = Aptr[i+1]; ja < ea; ++ja) {
                ptrdiff_t ca = Acol[ja];

                // Skip weak off-diagonal connections.
                if (ca != i && !aggr.strong_connection[ja])
                    continue;

                for(ptrdiff_t jp = P_tent->ptr[ca], ep = P_tent->ptr[ca+1]; jp < ep; ++jp) {
                    ptrdiff_t cp = P_tent->col[jp];

                    if (marker[cp] != i) {
                        marker[cp] = i;
                        ++( P->ptr[i + 1] );
                    }
                }
            }
        }

        boost::fill(marker, -1);

#pragma omp barrier
#pragma omp single
        {
            boost::partial_sum(P->ptr, P->ptr.begin());
            P->col.resize(P->ptr.back());
            P->val.resize(P->ptr.back());
        }

        // Fill the interpolation matrix.
        for(ptrdiff_t i = chunk_start; i < chunk_end; ++i) {
            // Diagonal of the filtered matrix is the original matrix
            // diagonal minus its weak connections.
            value_type dia = math::zero<value_type>();
            for(ptrdiff_t j = Aptr[i], e = Aptr[i+1]; j < e; ++j) {
                if (Acol[j] == i)
                    dia += Aval[j];
                else if (!aggr.strong_connection[j])
                    dia -= Aval[j];
            }
            dia = math::inverse(dia);

            ptrdiff_t row_beg = P->ptr[i];
            ptrdiff_t row_end = row_beg;
            for(ptrdiff_t ja = Aptr[i], ea = Aptr[i + 1]; ja < ea; ++ja) {
                ptrdiff_t ca = Acol[ja];

                // Skip weak off-diagonal connections.
                if (ca != i && !aggr.strong_connection[ja]) continue;

                value_type va = (ca == i)
                    ? static_cast<value_type>(static_cast<scalar_type>(1 - prm.relax) * math::identity<value_type>())
                    : static_cast<value_type>(static_cast<scalar_type>(-prm.relax) * dia * Aval[ja]);

                for(ptrdiff_t jp = P_tent->ptr[ca], ep = P_tent->ptr[ca+1]; jp < ep; ++jp) {
                    ptrdiff_t cp = P_tent->col[jp];
                    value_type vp = P_tent->val[jp];

                    if (marker[cp] < row_beg) {
                        marker[cp] = row_end;
                        P->col[row_end] = cp;
                        P->val[row_end] = va * vp;
                        ++row_end;
                    } else {
                        P->val[ marker[cp] ] += va * vp;
                    }
                }
            }
        }
    }
    TOC("interpolation");

    boost::shared_ptr<Matrix> R = boost::make_shared<Matrix>();
    *R = transpose(*P);

    if (prm.nullspace.cols > 0)
        prm.aggr.block_size = prm.nullspace.cols;

    return boost::make_tuple(P, R);
}
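/*
 * In matrix form, the interpolation loop above applies one damped Jacobi
 * smoothing step to the tentative prolongation: with \f$ A_F \f$ the
 * filtered matrix (weak off-diagonal connections dropped and subtracted
 * from the diagonal, exactly as the code computes dia), \f$ D_F \f$ its
 * diagonal, and \f$ \omega \f$ = prm.relax,
 *
 *     \f$ P = ( I - \omega D_F^{-1} A_F ) \, P_{tent} \f$
 *
 * This is a reading of the code above, stated here for reference.
 */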
void SKP_Silk_find_pred_coefs_FIX(
    SKP_Silk_encoder_state_FIX    *psEnc,        /* I/O  encoder state                   */
    SKP_Silk_encoder_control_FIX  *psEncCtrl,    /* I/O  encoder control                 */
    const int16_t                 res_pitch[]    /* I    Residual from pitch analysis    */
)
{
    int     i;
    int32_t WLTP[NB_SUBFR * LTP_ORDER * LTP_ORDER];
    int32_t invGains_Q16[NB_SUBFR], local_gains_Qx[NB_SUBFR], Wght_Q15[NB_SUBFR];
    int     NLSF_Q15[MAX_LPC_ORDER];
    const int16_t *x_ptr;
    int16_t *x_pre_ptr, LPC_in_pre[NB_SUBFR * MAX_LPC_ORDER + MAX_FRAME_LENGTH];
    int32_t tmp, min_gain_Q16;
#if !VARQ
    int     LZ;
#endif
    int     LTP_corrs_rshift[NB_SUBFR];

    /* weighting for weighted least squares */
    min_gain_Q16 = int32_t_MAX >> 6;
    for (i = 0; i < NB_SUBFR; i++) {
        min_gain_Q16 = SKP_min(min_gain_Q16, psEncCtrl->Gains_Q16[i]);
    }
#if !VARQ
    LZ = SKP_Silk_CLZ32(min_gain_Q16) - 1;
    LZ = SKP_LIMIT(LZ, 0, 16);
    min_gain_Q16 = SKP_RSHIFT(min_gain_Q16, 2); /* Ensure that maximum invGains_Q16 is within range of a 16 bit int */
#endif
    for (i = 0; i < NB_SUBFR; i++) {
        /* Divide to Q16 */
        assert(psEncCtrl->Gains_Q16[i] > 0);
#if VARQ
        /* Invert and normalize gains, and ensure that maximum invGains_Q16 is within range of a 16 bit int */
        invGains_Q16[i] = SKP_DIV32_varQ(min_gain_Q16, psEncCtrl->Gains_Q16[i], 16 - 2);
#else
        invGains_Q16[i] = SKP_DIV32(SKP_LSHIFT(min_gain_Q16, LZ),
                                    SKP_RSHIFT(psEncCtrl->Gains_Q16[i], 16 - LZ));
#endif
        /* Ensure Wght_Q15 a minimum value 1 */
        invGains_Q16[i] = SKP_max(invGains_Q16[i], 363);

        /* Square the inverted gains */
        assert(invGains_Q16[i] == SKP_SAT16(invGains_Q16[i]));
        tmp = SKP_SMULWB(invGains_Q16[i], invGains_Q16[i]);
        Wght_Q15[i] = SKP_RSHIFT(tmp, 1);

        /* Invert the inverted and normalized gains */
        local_gains_Qx[i] = SKP_DIV32((1 << (16 + Qx)), invGains_Q16[i]);
    }

    if (psEncCtrl->sCmn.sigtype == SIG_TYPE_VOICED) {
        /**********/
        /* VOICED */
        /**********/
        assert(psEnc->sCmn.frame_length - psEnc->sCmn.predictLPCOrder >=
               psEncCtrl->sCmn.pitchL[0] + LTP_ORDER / 2);

        /* LTP analysis */
        SKP_Silk_find_LTP_FIX(psEncCtrl->LTPCoef_Q14, WLTP,
                              &psEncCtrl->LTPredCodGain_Q7, res_pitch,
                              res_pitch + SKP_RSHIFT(psEnc->sCmn.frame_length, 1),
                              psEncCtrl->sCmn.pitchL, Wght_Q15,
                              psEnc->sCmn.subfr_length, psEnc->sCmn.frame_length,
                              LTP_corrs_rshift);

        /* Quantize LTP gain parameters */
        SKP_Silk_quant_LTP_gains_FIX(psEncCtrl->LTPCoef_Q14,
                                     psEncCtrl->sCmn.LTPIndex,
                                     &psEncCtrl->sCmn.PERIndex, WLTP,
                                     psEnc->mu_LTP_Q8,
                                     psEnc->sCmn.LTPQuantLowComplexity);

        /* Control LTP scaling */
        SKP_Silk_LTP_scale_ctrl_FIX(psEnc, psEncCtrl);

        /* Create LTP residual */
        SKP_Silk_LTP_analysis_filter_FIX(LPC_in_pre,
                                         psEnc->x_buf + psEnc->sCmn.frame_length - psEnc->sCmn.predictLPCOrder,
                                         psEncCtrl->LTPCoef_Q14,
                                         psEncCtrl->sCmn.pitchL, invGains_Q16, 16,
                                         psEnc->sCmn.subfr_length,
                                         psEnc->sCmn.predictLPCOrder);
    } else {
        /************/
        /* UNVOICED */
        /************/
        /* Create signal with prepended subframes, scaled by inverse gains */
        x_ptr = psEnc->x_buf + psEnc->sCmn.frame_length - psEnc->sCmn.predictLPCOrder;
        x_pre_ptr = LPC_in_pre;
        for (i = 0; i < NB_SUBFR; i++) {
            SKP_Silk_scale_copy_vector16(x_pre_ptr, x_ptr, invGains_Q16[i],
                                         psEnc->sCmn.subfr_length + psEnc->sCmn.predictLPCOrder);
            x_pre_ptr += psEnc->sCmn.subfr_length + psEnc->sCmn.predictLPCOrder;
            x_ptr += psEnc->sCmn.subfr_length;
        }
        SKP_memset(psEncCtrl->LTPCoef_Q14, 0, NB_SUBFR * LTP_ORDER * sizeof(int16_t));
        psEncCtrl->LTPredCodGain_Q7 = 0;
    }

    /* LPC_in_pre contains the LTP-filtered input for voiced, and the unfiltered input for unvoiced */
    TIC(FIND_LPC)
    SKP_Silk_find_LPC_FIX(NLSF_Q15, &psEncCtrl->sCmn.NLSFInterpCoef_Q2,
                          psEnc->sPred.prev_NLSFq_Q15,
                          psEnc->sCmn.useInterpolatedNLSFs * (1 - psEnc->sCmn.first_frame_after_reset),
                          psEnc->sCmn.predictLPCOrder, LPC_in_pre,
                          psEnc->sCmn.subfr_length + psEnc->sCmn.predictLPCOrder);
    TOC(FIND_LPC)

    /* Quantize LSFs */
    TIC(PROCESS_LSFS)
    SKP_Silk_process_NLSFs_FIX(psEnc, psEncCtrl, NLSF_Q15);
    TOC(PROCESS_LSFS)

    /* Calculate residual energy using quantized LPC coefficients */
    SKP_Silk_residual_energy_FIX(psEncCtrl->ResNrg, psEncCtrl->ResNrgQ,
                                 LPC_in_pre,
                                 (const int16_t(*)[])psEncCtrl->PredCoef_Q12,
                                 local_gains_Qx, Qx, psEnc->sCmn.subfr_length,
                                 psEnc->sCmn.predictLPCOrder);

    /* Copy to prediction struct for use in next frame for fluctuation reduction */
    SKP_memcpy(psEnc->sPred.prev_NLSFq_Q15, NLSF_Q15,
               psEnc->sCmn.predictLPCOrder * sizeof(int));
}
int_t
pdgstrf
/************************************************************************/
(
 superlu_options_t *options, int m, int n, double anorm,
 LUstruct_t *LUstruct, gridinfo_t *grid, SuperLUStat_t *stat, int *info
 )
/*
 * Purpose
 * =======
 *
 * PDGSTRF performs the LU factorization in parallel.
 *
 * Arguments
 * =========
 *
 * options (input) superlu_options_t*
 *         The structure defines the input parameters to control
 *         how the LU decomposition will be performed.
 *         The following field should be defined:
 *         o ReplaceTinyPivot (yes_no_t)
 *           Specifies whether to replace the tiny diagonals by
 *           sqrt(epsilon)*norm(A) during LU factorization.
 *
 * m      (input) int
 *        Number of rows in the matrix.
 *
 * n      (input) int
 *        Number of columns in the matrix.
 *
 * anorm  (input) double
 *        The norm of the original matrix A, or the scaled A if
 *        equilibration was done.
 *
 * LUstruct (input/output) LUstruct_t*
 *        The data structures to store the distributed L and U factors.
 *        The following fields should be defined:
 *
 *        o Glu_persist (input) Glu_persist_t*
 *          Global data structure (xsup, supno) replicated on all processes,
 *          describing the supernode partition in the factored matrices
 *          L and U:
 *            xsup[s] is the leading column of the s-th supernode,
 *            supno[i] is the supernode number to which column i belongs.
 *
 *        o Llu (input/output) LocalLU_t*
 *          The distributed data structures to store L and U factors.
 *          See superlu_ddefs.h for the definition of 'LocalLU_t'.
 *
 * grid   (input) gridinfo_t*
 *        The 2D process mesh. It contains the MPI communicator, the number
 *        of process rows (NPROW), the number of process columns (NPCOL),
 *        and my process rank. It is an input argument to all the
 *        parallel routines.
 *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
 *        See superlu_ddefs.h for the definition of 'gridinfo_t'.
 *
 * stat   (output) SuperLUStat_t*
 *        Record the statistics on runtime and floating-point operation count.
 *        See util.h for the definition of 'SuperLUStat_t'.
 *
 * info   (output) int*
 *        = 0: successful exit
 *        < 0: if info = -i, the i-th argument had an illegal value
 *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
 *             been completed, but the factor U is exactly singular,
 *             and division by zero will occur if it is used to solve a
 *             system of equations.
 *
 */
{
#ifdef _CRAY
    _fcd ftcs = _cptofcd("N", strlen("N"));
    _fcd ftcs1 = _cptofcd("L", strlen("L"));
    _fcd ftcs2 = _cptofcd("N", strlen("N"));
    _fcd ftcs3 = _cptofcd("U", strlen("U"));
#endif
    double alpha = 1.0, beta = 0.0;
    int_t  *xsup;
    int_t  *lsub, *lsub1, *usub, *Usub_buf,
           *Lsub_buf_2[2];  /* Need 2 buffers to implement Irecv. */
    double *lusup, *lusup1, *uval, *Uval_buf,
           *Lval_buf_2[2];  /* Need 2 buffers to implement Irecv. */
    int_t  fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc,
           lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj,
           nlb, nub, nsupc, rel, rukp;
    int_t  Pc, Pr;
    int    iam, kcol, krow, mycol, myrow, pi, pj;
    int    j, k, lk, nsupers;
    int    nsupr, nbrow, segsize;
    int    msgcnt[4]; /* Count the size of the message xfer'd in each buffer:
                       *     0 : transferred in Lsub_buf[]
                       *     1 : transferred in Lval_buf[]
                       *     2 : transferred in Usub_buf[]
                       *     3 : transferred in Uval_buf[]
                       */
    int_t  msg0, msg2;
    int_t  **Ufstnz_br_ptr, **Lrowind_bc_ptr;
    double **Unzval_br_ptr, **Lnzval_bc_ptr;
    int_t  *index;
    double *nzval;
    int_t  *iuip, *ruip; /* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */
    double *ucol;
    int_t  *indirect;
    double *tempv, *tempv2d;
    int_t  iinfo;
    int_t  *ToRecv, *ToSendD, **ToSendR;
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    LocalLU_t *Llu = LUstruct->Llu;
    superlu_scope_t *scp;
    float  s_eps;
    double thresh;
    double *tempU2d, *tempu;
    int    full, ldt, ldu, lead_zero, ncols;
    MPI_Request recv_req[4], *send_req, *U_diag_blk_send_req = NULL;
    MPI_Status status;
#if ( DEBUGlevel>=2 )
    int_t num_copy=0, num_update=0;
#endif
#if ( PRNTlevel==3 )
    int_t zero_msg = 0, total_msg = 0;
#endif
#if ( PROFlevel>=1 )
    double t1, t2;
    float msg_vol = 0, msg_cnt = 0;
    int_t iword = sizeof(int_t), dword = sizeof(double);
#endif

    /* Test the input parameters. */
    *info = 0;
    if ( m < 0 ) *info = -2;
    else if ( n < 0 ) *info = -3;
    if ( *info ) {
        pxerbla("pdgstrf", grid, -*info);
        return (-1);
    }

    /* Quick return if possible. */
    if ( m == 0 || n == 0 ) return 0;

    /*
     * Initialization.
     */
    iam = grid->iam;
    Pc = grid->npcol;
    Pr = grid->nprow;
    myrow = MYROW( iam, grid );
    mycol = MYCOL( iam, grid );
    nsupers = Glu_persist->supno[n-1] + 1;
    xsup = Glu_persist->xsup;
    s_eps = slamch_("Epsilon");
    thresh = s_eps * anorm;

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter pdgstrf()");
#endif

    stat->ops[FACT] = 0.0;

    if ( Pr*Pc > 1 ) {
        i = Llu->bufmax[0];
        if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist(2 * ((size_t)i))) )
            ABORT("Malloc fails for Lsub_buf.");
        Llu->Lsub_buf_2[1] = Llu->Lsub_buf_2[0] + i;
        i = Llu->bufmax[1];
        if ( !(Llu->Lval_buf_2[0] = doubleMalloc_dist(2 * ((size_t)i))) )
            ABORT("Malloc fails for Lval_buf[].");
        Llu->Lval_buf_2[1] = Llu->Lval_buf_2[0] + i;
        if ( Llu->bufmax[2] != 0 )
            if ( !(Llu->Usub_buf = intMalloc_dist(Llu->bufmax[2])) )
                ABORT("Malloc fails for Usub_buf[].");
        if ( Llu->bufmax[3] != 0 )
            if ( !(Llu->Uval_buf = doubleMalloc_dist(Llu->bufmax[3])) )
                ABORT("Malloc fails for Uval_buf[].");
        if ( !(U_diag_blk_send_req =
               (MPI_Request *) SUPERLU_MALLOC(Pr*sizeof(MPI_Request))))
            ABORT("Malloc fails for U_diag_blk_send_req[].");
        U_diag_blk_send_req[myrow] = 0; /* flag no outstanding Isend */
        if ( !(send_req =
               (MPI_Request *) SUPERLU_MALLOC(2*Pc*sizeof(MPI_Request))))
            ABORT("Malloc fails for send_req[].");
    }
    k = sp_ienv_dist(3); /* max supernode size */
    if ( !(Llu->ujrow = doubleMalloc_dist(k*(k+1)/2)) )
        ABORT("Malloc fails for ujrow[].");

#if ( PRNTlevel>=1 )
    if ( !iam ) {
        printf(".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, thresh);
        printf(".. Buffer size: Lsub %d\tLval %d\tUsub %d\tUval %d\tLDA %d\n",
               Llu->bufmax[0], Llu->bufmax[1],
               Llu->bufmax[2], Llu->bufmax[3], Llu->bufmax[4]);
    }
#endif

    Lsub_buf_2[0] = Llu->Lsub_buf_2[0];
    Lsub_buf_2[1] = Llu->Lsub_buf_2[1];
    Lval_buf_2[0] = Llu->Lval_buf_2[0];
    Lval_buf_2[1] = Llu->Lval_buf_2[1];
    Usub_buf = Llu->Usub_buf;
    Uval_buf = Llu->Uval_buf;
    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
    Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
    Unzval_br_ptr = Llu->Unzval_br_ptr;
    ToRecv = Llu->ToRecv;
    ToSendD = Llu->ToSendD;
    ToSendR = Llu->ToSendR;

    ldt = sp_ienv_dist(3); /* Size of maximum supernode */
    if ( !(tempv2d = doubleCalloc_dist(2*((size_t)ldt)*ldt)) )
        ABORT("Calloc fails for tempv2d[].");
    tempU2d = tempv2d + ldt*ldt;
    if ( !(indirect = intMalloc_dist(ldt)) )
        ABORT("Malloc fails for indirect[].");
    k = CEILING( nsupers, Pr ); /* Number of local block rows */
    if ( !(iuip = intMalloc_dist(k)) )
        ABORT("Malloc fails for iuip[].");
    if ( !(ruip = intMalloc_dist(k)) )
        ABORT("Malloc fails for ruip[].");

#if ( VAMPIR>=1 )
    VT_symdef(1, "Send-L", "Comm");
    VT_symdef(2, "Recv-L", "Comm");
    VT_symdef(3, "Send-U", "Comm");
    VT_symdef(4, "Recv-U", "Comm");
    VT_symdef(5, "TRF2", "Factor");
    VT_symdef(100, "Factor", "Factor");
    VT_begin(100);
    VT_traceon();
#endif

    /* ---------------------------------------------------------------
       Handle the first block column separately to start the pipeline.
       --------------------------------------------------------------- */
    if ( mycol == 0 ) {
#if ( VAMPIR>=1 )
        VT_begin(5);
#endif
        pdgstrf2(options, 0, thresh, Glu_persist, grid, Llu,
                 U_diag_blk_send_req, stat, info);
#if ( VAMPIR>=1 )
        VT_end(5);
#endif

        scp = &grid->rscp; /* The scope of process row. */

        /* Process column *kcol* multicasts numeric values of L(:,k)
           to process rows. */
        lsub = Lrowind_bc_ptr[0];
        lusup = Lnzval_bc_ptr[0];
        if ( lsub ) {
            msgcnt[0] = lsub[1] + BC_HEADER + lsub[0]*LB_DESCRIPTOR;
            msgcnt[1] = lsub[1] * SuperSize( 0 );
        } else {
            msgcnt[0] = msgcnt[1] = 0;
        }

        for (pj = 0; pj < Pc; ++pj) {
            if ( ToSendR[0][pj] != EMPTY ) {
#if ( PROFlevel>=1 )
                TIC(t1);
#endif
#if ( VAMPIR>=1 )
                VT_begin(1);
#endif
                MPI_Isend( lsub, msgcnt[0], mpi_int_t, pj, 0,
                           scp->comm, &send_req[pj] );
                MPI_Isend( lusup, msgcnt[1], MPI_DOUBLE, pj, 1,
                           scp->comm, &send_req[pj+Pc] );
#if ( DEBUGlevel>=2 )
                printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
                       iam, 0, msgcnt[0], msgcnt[1], pj);
#endif
#if ( VAMPIR>=1 )
                VT_end(1);
#endif
#if ( PROFlevel>=1 )
                TOC(t2, t1);
                stat->utime[COMM] += t2;
                msg_cnt += 2;
                msg_vol += msgcnt[0]*iword + msgcnt[1]*dword;
#endif
            }
        } /* for pj ... */
    } else { /* Post immediate receives. */
        if ( ToRecv[0] >= 1 ) { /* Recv block column L(:,0). */
            scp = &grid->rscp; /* The scope of process row. */
            MPI_Irecv( Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, 0, 0,
                       scp->comm, &recv_req[0] );
            MPI_Irecv( Lval_buf_2[0], Llu->bufmax[1], MPI_DOUBLE, 0, 1,
                       scp->comm, &recv_req[1] );
#if ( DEBUGlevel>=2 )
            printf("(%d) Post Irecv L(:,%4d)\n", iam, 0);
#endif
        }
    } /* if mycol == 0 */

    /* ------------------------------------------
       MAIN LOOP: Loop through all block columns.
       ------------------------------------------ */
    for (k = 0; k < nsupers; ++k) {
        knsupc = SuperSize( k );
        krow = PROW( k, grid );
        kcol = PCOL( k, grid );

        if ( mycol == kcol ) {
            lk = LBj( k, grid ); /* Local block number. */

            for (pj = 0; pj < Pc; ++pj) {
                /* Wait for Isend to complete before using lsub/lusup. */
                if ( ToSendR[lk][pj] != EMPTY ) {
                    MPI_Wait( &send_req[pj], &status );
                    MPI_Wait( &send_req[pj+Pc], &status );
                }
            }
            lsub = Lrowind_bc_ptr[lk];
            lusup = Lnzval_bc_ptr[lk];
        } else {
            if ( ToRecv[k] >= 1 ) { /* Recv block column L(:,k). */
                scp = &grid->rscp; /* The scope of process row. */
#if ( PROFlevel>=1 )
                TIC(t1);
#endif
#if ( VAMPIR>=1 )
                VT_begin(2);
#endif
                /*probe_recv(iam, kcol, (4*k)%NTAGS, mpi_int_t, scp->comm,
                  Llu->bufmax[0]);*/
                /*MPI_Recv( Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol,
                  (4*k)%NTAGS, scp->comm, &status );*/
                MPI_Wait( &recv_req[0], &status );
                MPI_Get_count( &status, mpi_int_t, &msgcnt[0] );
                /*probe_recv(iam, kcol, (4*k+1)%NTAGS, MPI_DOUBLE, scp->comm,
                  Llu->bufmax[1]);*/
                /*MPI_Recv( Lval_buf, Llu->bufmax[1], MPI_DOUBLE, kcol,
                  (4*k+1)%NTAGS, scp->comm, &status );*/
                MPI_Wait( &recv_req[1], &status );
                MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[1] );
#if ( VAMPIR>=1 )
                VT_end(2);
#endif
#if ( PROFlevel>=1 )
                TOC(t2, t1);
                stat->utime[COMM] += t2;
#endif
#if ( DEBUGlevel>=2 )
                printf("(%d) Recv L(:,%4d): lsub %4d, lusup %4d from Pc %2d\n",
                       iam, k, msgcnt[0], msgcnt[1], kcol);
                fflush(stdout);
#endif
                lsub = Lsub_buf_2[k%2];
                lusup = Lval_buf_2[k%2];
#if ( PRNTlevel==3 )
                ++total_msg;
                if ( !msgcnt[0] ) ++zero_msg;
#endif
            } else msgcnt[0] = 0;
        } /* if mycol = Pc(k) */

        scp = &grid->cscp; /* The scope of process column. */

        if ( myrow == krow ) {
            /* Parallel triangular solve across process row *krow* --
               U(k,j) = L(k,k) \ A(k,j).  */
#ifdef _CRAY
            pdgstrs2(n, k, Glu_persist, grid, Llu, stat, ftcs1, ftcs2, ftcs3);
#else
            pdgstrs2(n, k, Glu_persist, grid, Llu, stat);
#endif

            /* Multicasts U(k,:) to process columns. */
            lk = LBi( k, grid );
            usub = Ufstnz_br_ptr[lk];
            uval = Unzval_br_ptr[lk];
            if ( usub ) {
                msgcnt[2] = usub[2];
                msgcnt[3] = usub[1];
            } else {
                msgcnt[2] = msgcnt[3] = 0;
            }

            if ( ToSendD[lk] == YES ) {
                for (pi = 0; pi < Pr; ++pi) {
                    if ( pi != myrow ) {
#if ( PROFlevel>=1 )
                        TIC(t1);
#endif
#if ( VAMPIR>=1 )
                        VT_begin(3);
#endif
                        MPI_Send( usub, msgcnt[2], mpi_int_t, pi,
                                  (4*k+2)%NTAGS, scp->comm);
                        MPI_Send( uval, msgcnt[3], MPI_DOUBLE, pi,
                                  (4*k+3)%NTAGS, scp->comm);
#if ( VAMPIR>=1 )
                        VT_end(3);
#endif
#if ( PROFlevel>=1 )
                        TOC(t2, t1);
                        stat->utime[COMM] += t2;
                        msg_cnt += 2;
                        msg_vol += msgcnt[2]*iword + msgcnt[3]*dword;
#endif
#if ( DEBUGlevel>=2 )
                        printf("(%d) Send U(%4d,:) to Pr %2d\n", iam, k, pi);
#endif
                    } /* if pi ... */
                } /* for pi ... */
            } /* if ToSendD ... */
        } else { /* myrow != krow */
            if ( ToRecv[k] == 2 ) { /* Recv block row U(k,:). */
#if ( PROFlevel>=1 )
                TIC(t1);
#endif
#if ( VAMPIR>=1 )
                VT_begin(4);
#endif
                /*probe_recv(iam, krow, (4*k+2)%NTAGS, mpi_int_t, scp->comm,
                  Llu->bufmax[2]);*/
                MPI_Recv( Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
                          (4*k+2)%NTAGS, scp->comm, &status );
                MPI_Get_count( &status, mpi_int_t, &msgcnt[2] );
                /*probe_recv(iam, krow, (4*k+3)%NTAGS, MPI_DOUBLE, scp->comm,
                  Llu->bufmax[3]);*/
                MPI_Recv( Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow,
                          (4*k+3)%NTAGS, scp->comm, &status );
                MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[3] );
#if ( VAMPIR>=1 )
                VT_end(4);
#endif
#if ( PROFlevel>=1 )
                TOC(t2, t1);
                stat->utime[COMM] += t2;
#endif
                usub = Usub_buf;
                uval = Uval_buf;
#if ( DEBUGlevel>=2 )
                printf("(%d) Recv U(%4d,:) from Pr %2d\n", iam, k, krow);
#endif
#if ( PRNTlevel==3 )
                ++total_msg;
                if ( !msgcnt[2] ) ++zero_msg;
#endif
            } else msgcnt[2] = 0;
        } /* if myrow == Pr(k) */

        /*
         * Parallel rank-k update; pair up blocks L(i,k) and U(k,j).
         *  for (j = k+1; j < N; ++j)
         *      for (i = k+1; i < N; ++i)
         *          if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
         *               && L(i,k) != 0 && U(k,j) != 0 )
         *              A(i,j) = A(i,j) - L(i,k) * U(k,j);
         */
        msg0 = msgcnt[0];
        msg2 = msgcnt[2];
        if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
            nsupr = lsub[1]; /* LDA of lusup. */
            if ( myrow == krow ) { /* Skip diagonal block L(k,k). */
                lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER+1];
                luptr0 = knsupc;
                nlb = lsub[0] - 1;
            } else {
                lptr0 = BC_HEADER;
                luptr0 = 0;
                nlb = lsub[0];
            }
            lptr = lptr0;
            for (lb = 0; lb < nlb; ++lb) { /* Initialize block row pointers. */
                ib = lsub[lptr];
                lib = LBi( ib, grid );
                iuip[lib] = BR_HEADER;
                ruip[lib] = 0;
                lptr += LB_DESCRIPTOR + lsub[lptr+1];
            }
            nub = usub[0];    /* Number of blocks in the block row U(k,:) */
            iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */
            rukp = 0;         /* Pointer to nzval[] of U(k,:) */
            klst = FstBlockC( k+1 );

            /* ---------------------------------------------------
               Update the first block column A(:,k+1).
               --------------------------------------------------- */
            jb = usub[iukp];   /* Global block number of block U(k,j). */
            if ( jb == k+1 ) { /* First update (k+1)-th block. */
                --nub;
                lptr = lptr0;
                luptr = luptr0;
                ljb = LBj( jb, grid ); /* Local block number of U(k,j). */
                nsupc = SuperSize( jb );
                iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */

                /* Prepare to call DGEMM. */
                jj = iukp;
                while ( usub[jj] == klst ) ++jj;
                ldu = klst - usub[jj++];
                ncols = 1;
                full = 1;
                for (; jj < iukp+nsupc; ++jj) {
                    segsize = klst - usub[jj];
                    if ( segsize ) {
                        ++ncols;
                        if ( segsize != ldu ) full = 0;
                        if ( segsize > ldu ) ldu = segsize;
                    }
                }
#if ( DEBUGlevel>=3 )
                ++num_update;
#endif
                if ( full ) {
                    tempu = &uval[rukp];
                } else { /* Copy block U(k,j) into tempU2d. */
#if ( DEBUGlevel>=3 )
                    printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
                           iam, full, k, jb, ldu, ncols, nsupc);
                    ++num_copy;
#endif
                    tempu = tempU2d;
                    for (jj = iukp; jj < iukp+nsupc; ++jj) {
                        segsize = klst - usub[jj];
                        if ( segsize ) {
                            lead_zero = ldu - segsize;
                            for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0;
                            tempu += lead_zero;
                            for (i = 0; i < segsize; ++i) tempu[i] = uval[rukp+i];
                            rukp += segsize;
                            tempu += segsize;
                        }
                    }
                    tempu = tempU2d;
                    rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
                } /* if full ... */

                for (lb = 0; lb < nlb; ++lb) {
                    ib = lsub[lptr];       /* Row block L(i,k). */
                    nbrow = lsub[lptr+1];  /* Number of full rows. */
                    lptr += LB_DESCRIPTOR; /* Skip descriptor. */
                    tempv = tempv2d;
#ifdef _CRAY
                    SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha,
                          &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
                          tempu, &ldu, &beta, tempv, &ldt);
#elif defined (USE_VENDOR_BLAS)
                    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha,
                           &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
                           tempu, &ldu, &beta, tempv, &ldt, 1, 1);
#else
                    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha,
                           &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
                           tempu, &ldu, &beta, tempv, &ldt);
#endif
                    stat->ops[FACT] += 2 * nbrow * ldu * ncols;

                    /* Now gather the result into the destination block. */
                    if ( ib < jb ) { /* A(i,j) is in U. */
                        ilst = FstBlockC( ib+1 );
                        lib = LBi( ib, grid );
                        index = Ufstnz_br_ptr[lib];
                        ijb = index[iuip[lib]];
                        while ( ijb < jb ) { /* Search for dest block. */
                            ruip[lib] += index[iuip[lib]+1];
                            iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb );
                            ijb = index[iuip[lib]];
                        }
                        iuip[lib] += UB_DESCRIPTOR; /* Skip descriptor. */

                        tempv = tempv2d;
                        for (jj = 0; jj < nsupc; ++jj) {
                            segsize = klst - usub[iukp + jj];
                            fnz = index[iuip[lib]++];
                            if ( segsize ) { /* Nonzero segment in U(k,j). */
                                ucol = &Unzval_br_ptr[lib][ruip[lib]];
                                for (i = 0, it = 0; i < nbrow; ++i) {
                                    rel = lsub[lptr + i] - fnz;
                                    ucol[rel] -= tempv[it++];
                                }
                                tempv += ldt;
                            }
                            ruip[lib] += ilst - fnz;
                        }
                    } else { /* A(i,j) is in L. */
                        index = Lrowind_bc_ptr[ljb];
                        ldv = index[1]; /* LDA of the dest lusup. */
                        lptrj = BC_HEADER;
                        luptrj = 0;
                        ijb = index[lptrj];
                        while ( ijb != ib ) { /* Search for dest block --
                                                 blocks are not ordered! */
                            luptrj += index[lptrj+1];
                            lptrj += LB_DESCRIPTOR + index[lptrj+1];
                            ijb = index[lptrj];
                        }
                        /*
                         * Build indirect table. This is needed because the
                         * indices are not sorted.
                         */
                        fnz = FstBlockC( ib );
                        lptrj += LB_DESCRIPTOR;
                        for (i = 0; i < index[lptrj-1]; ++i) {
                            rel = index[lptrj + i] - fnz;
                            indirect[rel] = i;
                        }
                        nzval = Lnzval_bc_ptr[ljb] + luptrj;
                        tempv = tempv2d;
                        for (jj = 0; jj < nsupc; ++jj) {
                            segsize = klst - usub[iukp + jj];
                            if ( segsize ) {
                                /*#pragma _CRI cache_bypass nzval,tempv*/
                                for (it = 0, i = 0; i < nbrow; ++i) {
                                    rel = lsub[lptr + i] - fnz;
                                    nzval[indirect[rel]] -= tempv[it++];
                                }
                                tempv += ldt;
                            }
                            nzval += ldv;
                        }
                    } /* if ib < jb ... */
                    lptr += nbrow;
                    luptr += nbrow;
                } /* for lb ... */
                rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */
                iukp += nsupc;
            } /* if jb == k+1 */
        } /* if L(:,k) and U(k,:) not empty */

        if ( k+1 < nsupers ) {
            kcol = PCOL( k+1, grid );
            if ( mycol == kcol ) {
#if ( VAMPIR>=1 )
                VT_begin(5);
#endif
                /* Factor diagonal and subdiagonal blocks and test for exact
                   singularity.  */
                pdgstrf2(options, k+1, thresh, Glu_persist, grid, Llu,
                         U_diag_blk_send_req, stat, info);
#if ( VAMPIR>=1 )
                VT_end(5);
#endif

                /* Process column *kcol+1* multicasts numeric values of
                   L(:,k+1) to process rows. */
                lk = LBj( k+1, grid ); /* Local block number. */
                lsub1 = Lrowind_bc_ptr[lk];
                if ( lsub1 ) {
                    msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0]*LB_DESCRIPTOR;
                    msgcnt[1] = lsub1[1] * SuperSize( k+1 );
                } else {
                    msgcnt[0] = 0;
                    msgcnt[1] = 0;
                }

                scp = &grid->rscp; /* The scope of process row. */
                for (pj = 0; pj < Pc; ++pj) {
                    if ( ToSendR[lk][pj] != EMPTY ) {
                        lusup1 = Lnzval_bc_ptr[lk];
#if ( PROFlevel>=1 )
                        TIC(t1);
#endif
#if ( VAMPIR>=1 )
                        VT_begin(1);
#endif
                        MPI_Isend( lsub1, msgcnt[0], mpi_int_t, pj,
                                   (4*(k+1))%NTAGS, scp->comm, &send_req[pj] );
                        MPI_Isend( lusup1, msgcnt[1], MPI_DOUBLE, pj,
                                   (4*(k+1)+1)%NTAGS, scp->comm,
                                   &send_req[pj+Pc] );
#if ( VAMPIR>=1 )
                        VT_end(1);
#endif
#if ( PROFlevel>=1 )
                        TOC(t2, t1);
                        stat->utime[COMM] += t2;
                        msg_cnt += 2;
                        msg_vol += msgcnt[0]*iword + msgcnt[1]*dword;
#endif
#if ( DEBUGlevel>=2 )
                        printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
                               iam, k+1, msgcnt[0], msgcnt[1], pj);
#endif
                    }
                } /* for pj ... */
            } else { /* Post Recv of block column L(:,k+1). */
                if ( ToRecv[k+1] >= 1 ) {
                    scp = &grid->rscp; /* The scope of process row. */
                    MPI_Irecv(Lsub_buf_2[(k+1)%2], Llu->bufmax[0], mpi_int_t,
                              kcol, (4*(k+1))%NTAGS, scp->comm, &recv_req[0]);
                    MPI_Irecv(Lval_buf_2[(k+1)%2], Llu->bufmax[1], MPI_DOUBLE,
                              kcol, (4*(k+1)+1)%NTAGS, scp->comm, &recv_req[1]);
#if ( DEBUGlevel>=2 )
                    printf("(%d) Post Irecv L(:,%4d)\n", iam, k+1);
#endif
                }
            } /* if mycol == Pc(k+1) */
        } /* if k+1 < nsupers */

        if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
            /* ---------------------------------------------------
               Update all other blocks using block row U(k,:)
               --------------------------------------------------- */
            for (j = 0; j < nub; ++j) {
                lptr = lptr0;
                luptr = luptr0;
                jb = usub[iukp];       /* Global block number of block U(k,j). */
                ljb = LBj( jb, grid ); /* Local block number of U(k,j). */
                nsupc = SuperSize( jb );
                iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */

                /* Prepare to call DGEMM. */
                jj = iukp;
                while ( usub[jj] == klst ) ++jj;
                ldu = klst - usub[jj++];
                ncols = 1;
                full = 1;
                for (; jj < iukp+nsupc; ++jj) {
                    segsize = klst - usub[jj];
                    if ( segsize ) {
                        ++ncols;
                        if ( segsize != ldu ) full = 0;
                        if ( segsize > ldu ) ldu = segsize;
                    }
                }
#if ( DEBUGlevel>=3 )
                printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
                       iam, full, k, jb, ldu, ncols, nsupc);
                ++num_update;
#endif
                if ( full ) {
                    tempu = &uval[rukp];
                } else { /* Copy block U(k,j) into tempU2d. */
#if ( DEBUGlevel>=3 )
                    ++num_copy;
#endif
                    tempu = tempU2d;
                    for (jj = iukp; jj < iukp+nsupc; ++jj) {
                        segsize = klst - usub[jj];
                        if ( segsize ) {
                            lead_zero = ldu - segsize;
                            for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0;
                            tempu += lead_zero;
                            for (i = 0; i < segsize; ++i) tempu[i] = uval[rukp+i];
                            rukp += segsize;
                            tempu += segsize;
                        }
                    }
                    tempu = tempU2d;
                    rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
                } /* if full ... */

                for (lb = 0; lb < nlb; ++lb) {
                    ib = lsub[lptr];       /* Row block L(i,k). */
                    nbrow = lsub[lptr+1];  /* Number of full rows. */
                    lptr += LB_DESCRIPTOR; /* Skip descriptor. */
                    tempv = tempv2d;
#ifdef _CRAY
                    SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha,
                          &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
                          tempu, &ldu, &beta, tempv, &ldt);
#elif defined (USE_VENDOR_BLAS)
                    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha,
                           &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
                           tempu, &ldu, &beta, tempv, &ldt, 1, 1);
#else
                    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha,
                           &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
                           tempu, &ldu, &beta, tempv, &ldt);
#endif
                    stat->ops[FACT] += 2 * nbrow * ldu * ncols;

                    /* Now gather the result into the destination block. */
                    if ( ib < jb ) { /* A(i,j) is in U. */
                        ilst = FstBlockC( ib+1 );
                        lib = LBi( ib, grid );
                        index = Ufstnz_br_ptr[lib];
                        ijb = index[iuip[lib]];
                        while ( ijb < jb ) { /* Search for dest block. */
                            ruip[lib] += index[iuip[lib]+1];
                            iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb );
                            ijb = index[iuip[lib]];
                        }
                        /* Skip descriptor. Now point to fstnz index of
                           block U(i,j). */
                        iuip[lib] += UB_DESCRIPTOR;

                        tempv = tempv2d;
                        for (jj = 0; jj < nsupc; ++jj) {
                            segsize = klst - usub[iukp + jj];
                            fnz = index[iuip[lib]++];
                            if ( segsize ) { /* Nonzero segment in U(k,j). */
                                ucol = &Unzval_br_ptr[lib][ruip[lib]];
                                for (i = 0; i < nbrow; ++i) {
                                    rel = lsub[lptr + i] - fnz;
                                    ucol[rel] -= tempv[i];
                                }
                                tempv += ldt;
                            }
                            ruip[lib] += ilst - fnz;
                        }
                    } else { /* A(i,j) is in L. */
                        index = Lrowind_bc_ptr[ljb];
                        ldv = index[1]; /* LDA of the dest lusup. */
                        lptrj = BC_HEADER;
                        luptrj = 0;
                        ijb = index[lptrj];
                        while ( ijb != ib ) { /* Search for dest block --
                                                 blocks are not ordered! */
                            luptrj += index[lptrj+1];
                            lptrj += LB_DESCRIPTOR + index[lptrj+1];
                            ijb = index[lptrj];
                        }
                        /*
                         * Build indirect table. This is needed because the
                         * indices are not sorted for the L blocks.
                         */
                        fnz = FstBlockC( ib );
                        lptrj += LB_DESCRIPTOR;
                        for (i = 0; i < index[lptrj-1]; ++i) {
                            rel = index[lptrj + i] - fnz;
                            indirect[rel] = i;
                        }
                        nzval = Lnzval_bc_ptr[ljb] + luptrj;
                        tempv = tempv2d;
                        for (jj = 0; jj < nsupc; ++jj) {
                            segsize = klst - usub[iukp + jj];
                            if ( segsize ) {
                                /*#pragma _CRI cache_bypass nzval,tempv*/
                                for (i = 0; i < nbrow; ++i) {
                                    rel = lsub[lptr + i] - fnz;
                                    nzval[indirect[rel]] -= tempv[i];
                                }
                                tempv += ldt;
                            }
                            nzval += ldv;
                        }
                    } /* if ib < jb ... */
                    lptr += nbrow;
                    luptr += nbrow;
                } /* for lb ... */
                rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */
                iukp += nsupc;
            } /* for j ... */
        } /* if k L(:,k) and U(k,:) are not empty */
    }
    /* ------------------------------------------
       END MAIN LOOP: for k = ...
       ------------------------------------------ */

#if ( VAMPIR>=1 )
    VT_end(100);
    VT_traceoff();
#endif

    if ( Pr*Pc > 1 ) {
        SUPERLU_FREE(Lsub_buf_2[0]); /* also free Lsub_buf_2[1] */
        SUPERLU_FREE(Lval_buf_2[0]); /* also free Lval_buf_2[1] */
        if ( Llu->bufmax[2] != 0 ) SUPERLU_FREE(Usub_buf);
        if ( Llu->bufmax[3] != 0 ) SUPERLU_FREE(Uval_buf);
        SUPERLU_FREE(send_req);
        if ( U_diag_blk_send_req[myrow] ) {
            /* wait for last Isend requests to complete, deallocate objects */
            for (krow = 0; krow < Pr; ++krow)
                if ( krow != myrow )
                    MPI_Wait(U_diag_blk_send_req + krow, &status);
        }
        SUPERLU_FREE(U_diag_blk_send_req);
    }
    SUPERLU_FREE(Llu->ujrow);
    SUPERLU_FREE(tempv2d);
    SUPERLU_FREE(indirect);
    SUPERLU_FREE(iuip);
    SUPERLU_FREE(ruip);

    /* Prepare error message. */
    if ( *info == 0 ) *info = n + 1;
#if ( PROFlevel>=1 )
    TIC(t1);
#endif
    MPI_Allreduce( info, &iinfo, 1, mpi_int_t, MPI_MIN, grid->comm );
#if ( PROFlevel>=1 )
    TOC(t2, t1);
    stat->utime[COMM] += t2;
    {
        float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum;
        MPI_Reduce( &msg_cnt, &msg_cnt_sum,
                    1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
        MPI_Reduce( &msg_cnt, &msg_cnt_max,
                    1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
        MPI_Reduce( &msg_vol, &msg_vol_sum,
                    1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
        MPI_Reduce( &msg_vol, &msg_vol_max,
                    1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
        if ( !iam ) {
            printf("\tPDGSTRF comm stat:"
                   "\tAvg\tMax\t\tAvg\tMax\n"
                   "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n",
                   msg_cnt_sum/Pr/Pc, msg_cnt_max,
                   msg_vol_sum/Pr/Pc*1e-6, msg_vol_max*1e-6);
        }
    }
#endif

    if ( iinfo == n + 1 ) *info = 0;
    else *info = iinfo;

#if ( PRNTlevel==3 )
    MPI_Allreduce( &zero_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm );
    if ( !iam ) printf(".. # msg of zero size\t%d\n", iinfo);
    MPI_Allreduce( &total_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm );
    if ( !iam ) printf(".. # total msg\t%d\n", iinfo);
#endif

#if ( DEBUGlevel>=2 )
    for (i = 0; i < Pr * Pc; ++i) {
        if ( iam == i ) {
            dPrintLblocks(iam, nsupers, grid, Glu_persist, Llu);
            dPrintUblocks(iam, nsupers, grid, Glu_persist, Llu);
            printf("(%d)\n", iam);
            PrintInt10("Recv", nsupers, Llu->ToRecv);
        }
        MPI_Barrier( grid->comm );
    }
#endif

#if ( DEBUGlevel>=3 )
    printf("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update);
#endif
#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit pdgstrf()");
#endif

    return 0;
} /* PDGSTRF */
void *pzgstrf_thread(void *arg)
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 * Purpose
 * =======
 *
 * This is the slave process, representing the main scheduling loop to
 * perform the factorization. Each process executes a copy of the
 * following code ... (SPMD paradigm)
 *
 * Working arrays local to each process
 * ======================================
 *   marker[0:3*m-1]: marker[i] == j means node i has been reached when
 *                    working on column j.
 *      Storage: relative to original row subscripts
 *
 *      THERE ARE 3 OF THEM:
 *      marker[0 : m-1]:   used by pzgstrf_factor_snode() and
 *                         pzgstrf_panel_dfs();
 *      marker[m : 2m-1]:  used by pzgstrf_panel_dfs() and
 *                         pxgstrf_super_bnd_dfs();
 *          values in [0 : n-1]  when used by pzgstrf_panel_dfs()
 *          values in [n : 2n-1] when used by pxgstrf_super_bnd_dfs()
 *      marker[2m : 3m-1]: used by pzgstrf_column_dfs() in inner-factor
 *
 *   parent[0:n-1]: parent vector used during dfs
 *      Storage: relative to new row subscripts
 *
 *   xplore[0:2m-1]: xplore[i] gives the location of the next (dfs)
 *      unexplored neighbor of i in lsub[*]; xplore[n+i] gives the
 *      location of the last unexplored neighbor of i in lsub[*].
 *
 *   segrep[0:nseg-1]: contains the list of supernodal representatives
 *      in topological order of the dfs. A supernode representative is the
 *      last column of a supernode.
 *
 *   repfnz[0:m-1]: for a nonzero segment U[*,j] that ends at a
 *      supernodal representative r, repfnz[r] is the location of the first
 *      nonzero in this segment. It is also used during the dfs:
 *      repfnz[r]>0 indicates that supernode r has been explored.
 *      NOTE: There are w of them, each used for one column of a panel.
 *
 *   panel_lsub[0:w*m-1]: temporary for the nonzero row indices below
 *      the panel diagonal. These are filled in during pzgstrf_panel_dfs(),
 *      and are used later in the inner LU factorization.
 *      panel_lsub[]/dense[] pair forms the SPA data structure.
 *      NOTE: There are w of them.
 *
 *   dense[0:w*m-1]: sparse accumulator (SPA) for intermediate values;
 *      NOTE: there are w of them.
 *
 *   tempv[0:m-1]: real temporary used for dense numeric kernels;
 *
 * Scheduling algorithm (For each process ...)
 * ====================
 *     Shared task Q <-- { relaxed s-nodes (CANGO) };
 *
 *     WHILE (not finished)
 *
 *         panel = Scheduler(Q); (see pxgstrf_scheduler.c for policy)
 *
 *         IF (panel == RELAXED_SNODE)
 *             factor_relax_snode(panel);
 *         ELSE
 *             * pzgstrf_panel_dfs()
 *                 - skip all BUSY s-nodes (or panels)
 *
 *             * dpanel_bmod()
 *                 - updates from DONE s-nodes
 *                 - wait for BUSY s-nodes to become DONE
 *
 *             * inner-factor()
 *                 - identical as it is in the sequential algorithm,
 *                   except that pruning() will interact with the
 *                   pzgstrf_panel_dfs() of other panels.
 *         ENDIF
 *
 *     END WHILE;
 *
 */
#if ( MACH==SGI || MACH==ORIGIN )
#if ( MACH==SGI )
    int  pnum = mpc_my_threadnum();
#elif ( MACH==ORIGIN )
    int  pnum = mp_my_threadnum();
#endif
    pzgstrf_threadarg_t *thr_arg = &((pzgstrf_threadarg_t *)arg)[pnum];
#else
    pzgstrf_threadarg_t *thr_arg = arg;
    int  pnum = thr_arg->pnum;
#endif

    /* Unpack the options argument */
    superlumt_options_t *superlumt_options = thr_arg->superlumt_options;
    pxgstrf_shared_t *pxgstrf_shared = thr_arg->pxgstrf_shared;
    int         panel_size = superlumt_options->panel_size;
    double      diag_pivot_thresh = superlumt_options->diag_pivot_thresh;
    yes_no_t    *usepr = &superlumt_options->usepr; /* may be modified */
    int         *etree = superlumt_options->etree;
    int         *super_bnd = superlumt_options->part_super_h;
    int         *perm_r = superlumt_options->perm_r;
    int         *inv_perm_c = pxgstrf_shared->inv_perm_c;
    int         *inv_perm_r = pxgstrf_shared->inv_perm_r;
    int         *xprune = pxgstrf_shared->xprune;
    int         *ispruned = pxgstrf_shared->ispruned;
    SuperMatrix *A = pxgstrf_shared->A;
    GlobalLU_t  *Glu = pxgstrf_shared->Glu;
    Gstat_t     *Gstat = pxgstrf_shared->Gstat;
    int         *info = &thr_arg->info;

    /* Local working arrays */
    int           *iwork;
    doublecomplex *dwork;
    int           *segrep, *repfnz, *parent, *xplore;
    int           *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */
    int           *marker, *marker1, *marker2;
    int           *lbusy; /* "Local busy" array, indicates which descendants
                             were busy when this panel's computation began.
                             Those columns (s-nodes) are treated specially
                             during pzgstrf_panel_dfs() and dpanel_bmod(). */
    int           *spa_marker; /* size n-by-w */
    int           *w_lsub_end; /* record the end of each column in panel_lsub */
    doublecomplex *dense, *tempv;
    int           *lsub, *xlsub, *xlsub_end;

    /* Local scalars */
    register int m, n, k, jj, jcolm1, itemp, singular;
    int       pivrow;   /* pivotal row number in the original matrix A */
    int       nseg1;    /* no of segments in U-column above panel row jcol */
    int       nseg;     /* no of segments in each U-column */
    int       w, bcol, jcol;

#ifdef PROFILE
    double    *utime = Gstat->utime;
    double    t1, t2, t, stime;
    register float flopcnt;
#endif

#ifdef PREDICT_OPT
    flops_t   *ops = Gstat->ops;
    register float pdiv;
#endif

#if ( DEBUGlevel>=1 )
    printf("(%d) thr_arg-> pnum %d, info %d\n", pnum, thr_arg->pnum, thr_arg->info);
#endif

    singular = 0;
    m = A->nrow;
    n = A->ncol;
    lsub = Glu->lsub;
    xlsub = Glu->xlsub;
    xlsub_end = Glu->xlsub_end;

    /* Allocate and initialize the per-process working storage. */
    if ( (*info = pzgstrf_WorkInit(m, panel_size, &iwork, &dwork)) ) {
        *info += pzgstrf_memory_use(Glu->nzlmax, Glu->nzumax, Glu->nzlumax);
        return 0;
    }
    pxgstrf_SetIWork(m, panel_size, iwork, &segrep, &parent, &xplore,
                     &repfnz, &panel_lsub, &marker, &lbusy);
    pzgstrf_SetRWork(m, panel_size, dwork, &dense, &tempv);

    /* New data structures to facilitate parallel algorithm */
    spa_marker = intMalloc(m * panel_size);
    w_lsub_end = intMalloc(panel_size);
    ifill(spa_marker, m * panel_size, EMPTY);
    ifill(marker, m * NO_MARKER, EMPTY);
    ifill(lbusy, m, EMPTY);
    jcol = EMPTY;
    marker1 = marker + m;
    marker2 = marker + 2*m;

#ifdef PROFILE
    stime = SuperLU_timer_();
#endif

    /* -------------------------
       Main loop: repeatedly ...
       ------------------------- */
    while ( pxgstrf_shared->tasks_remain > 0 ) {
#ifdef PROFILE
        TIC(t);
#endif
        /* Get a panel from the scheduler. */
        pxgstrf_scheduler(pnum, n, etree, &jcol, &bcol, pxgstrf_shared);

#if ( DEBUGlevel>=1 )
        if ( jcol>=LOCOL && jcol<=HICOL ) {
            printf("(%d) Scheduler(): jcol %d, bcol %d, tasks_remain %d\n",
                   pnum, jcol, bcol, pxgstrf_shared->tasks_remain);
            fflush(stdout);
        }
#endif

#ifdef PROFILE
        TOC(t2, t);
        Gstat->procstat[pnum].skedtime += t2;
#endif

        if ( jcol != EMPTY ) {
            w = pxgstrf_shared->pan_status[jcol].size;

#if ( DEBUGlevel>=3 )
            printf("P%2d got panel %5d-%5d\ttime %.4f\tpanels_left %d\n",
                   pnum, jcol, jcol+w-1, SuperLU_timer_(),
                   pxgstrf_shared->tasks_remain);
            fflush(stdout);
#endif
            /* Nondomain panels */
#ifdef PROFILE
            flopcnt = Gstat->procstat[pnum].fcops;
            Gstat->panstat[jcol].pnum = pnum;
            TIC(t1);
            Gstat->panstat[jcol].starttime = t1;
#endif
            if ( pxgstrf_shared->pan_status[jcol].type == RELAXED_SNODE ) {
#ifdef PREDICT_OPT
                pdiv = Gstat->procstat[pnum].fcops;
#endif
                /* A relaxed supernode at the bottom of the etree */
                pzgstrf_factor_snode(pnum, jcol, A, diag_pivot_thresh, usepr,
                                     perm_r, inv_perm_r, inv_perm_c, xprune,
                                     marker, panel_lsub, dense, tempv,
                                     pxgstrf_shared, info);

                if ( *info ) {
                    if ( *info > n ) return 0;
                    else if ( singular == 0 || *info < singular )
                        singular = *info;
#if ( DEBUGlevel>=1 )
                    printf("(%d) After pzgstrf_factor_snode(): singular=%d\n",
                           pnum, singular);
#endif
                }

                /* Release the whole relaxed supernode */
                for (jj = jcol; jj < jcol + w; ++jj)
                    pxgstrf_shared->spin_locks[jj] = 0;
#ifdef PREDICT_OPT
                pdiv = Gstat->procstat[pnum].fcops - pdiv;
                cp_panel[jcol].pdiv = pdiv;
#endif
            } else { /* Regular panel */
#ifdef PROFILE
                TIC(t);
#endif
                pxgstrf_mark_busy_descends(pnum, jcol, etree, pxgstrf_shared,
                                           &bcol, lbusy);

                /* Symbolic factor on a panel of columns */
                pzgstrf_panel_dfs(pnum, m, w, jcol, A, perm_r, xprune,
                                  ispruned, lbusy, &nseg1, panel_lsub,
                                  w_lsub_end, segrep, repfnz, marker,
                                  spa_marker, parent, xplore, dense, Glu);
#if ( DEBUGlevel>=2 )
                if ( jcol==BADPAN )
                    printf("(%d) After pzgstrf_panel_dfs(): nseg1 %d, w_lsub_end %d\n",
                           pnum, nseg1, w_lsub_end[0]);
#endif
#ifdef PROFILE
                TOC(t2, t);
                utime[DFS] += t2;
#endif
                /* Numeric sup-panel updates in topological order.
                 * On return, the update values are temporarily stored in
                 * the n-by-w SPA dense[m,w]. */
                pzgstrf_panel_bmod(pnum, m, w, jcol, bcol, inv_perm_r, etree,
                                   &nseg1, segrep, repfnz, panel_lsub,
                                   w_lsub_end, spa_marker, dense, tempv,
                                   pxgstrf_shared);

                /*
                 * All "busy" descendants are "done" now --
                 * Find the set of row subscripts in the preceding column
                 * "jcol-1" of the current panel. Column "jcol-1" is
                 * usually taken by a process other than myself.
                 * This row-subscripts information will be used by myself
                 * during column dfs to detect whether "jcol" belongs
                 * to the same supernode as "jcol-1".
                 *
                 * ACCORDING TO PROFILE, THE AMOUNT OF TIME SPENT HERE
                 * IS NEGLIGIBLE.
                 */
                jcolm1 = jcol - 1;
                itemp = xlsub_end[jcolm1];
                for (k = xlsub[jcolm1]; k < itemp; ++k)
                    marker2[lsub[k]] = jcolm1;

#ifdef PREDICT_OPT
                pdiv = Gstat->procstat[pnum].fcops;
#endif
                /* Inner-factorization, using sup-col algorithm */
                for ( jj = jcol; jj < jcol + w; jj++) {
                    k = (jj - jcol) * m; /* index into w-wide arrays */
                    nseg = nseg1; /* begin after all the panel segments */
#ifdef PROFILE
                    TIC(t);
#endif
                    /* Allocate storage for the current H-supernode. */
                    if ( Glu->dynamic_snode_bound && super_bnd[jj] ) {
                        /* jj starts a supernode in H */
                        pxgstrf_super_bnd_dfs(pnum, m, n, jj, super_bnd[jj],
                                              A, perm_r, inv_perm_r, xprune,
                                              ispruned, marker1, parent,
                                              xplore, pxgstrf_shared);
                    }

                    if ( (*info = pzgstrf_column_dfs(pnum, m, jj, jcol, perm_r,
                                                     ispruned, &panel_lsub[k],
                                                     w_lsub_end[jj-jcol],
                                                     super_bnd, &nseg, segrep,
                                                     &repfnz[k], xprune,
                                                     marker2, parent, xplore,
                                                     pxgstrf_shared)) )
                        return 0;
#ifdef PROFILE
                    TOC(t2, t);
                    utime[DFS] += t2;
#endif
                    /* On return, the L supernode is gathered into the
                       global storage. */
                    if ( (*info = pzgstrf_column_bmod(pnum, jj, jcol,
                                                      (nseg - nseg1),
                                                      &segrep[nseg1],
                                                      &repfnz[k], &dense[k],
                                                      tempv, pxgstrf_shared,
                                                      Gstat)) )
                        return 0;

                    if ( (*info = pzgstrf_pivotL(pnum, jj, diag_pivot_thresh,
                                                 usepr, perm_r, inv_perm_r,
                                                 inv_perm_c, &pivrow,
                                                 Glu, Gstat)) )
                        if ( singular == 0 || *info < singular ) {
                            singular = *info;
#if ( DEBUGlevel>=1 )
                            printf("(%d) After pzgstrf_pivotL(): singular=%d\n",
                                   pnum, singular);
#endif
                        }

                    /* release column "jj", so that the other processes
                       waiting for this column can proceed */
                    pxgstrf_shared->spin_locks[jj] = 0;

                    /* copy the U-segments to ucol[*] */
                    if ( (*info = pzgstrf_copy_to_ucol(pnum, jj, nseg, segrep,
                                                       &repfnz[k], perm_r,
                                                       &dense[k],
                                                       pxgstrf_shared)) )
                        return 0;

                    /* Prune columns [0:jj-1] using column jj */
                    pxgstrf_pruneL(jj, perm_r, pivrow, nseg, segrep,
                                   &repfnz[k], xprune, ispruned, Glu);

                    /* Reset repfnz[] for this column */
                    pxgstrf_resetrep_col(nseg, segrep, &repfnz[k]);

#if ( DEBUGlevel>=2 )
                    /* if (jj >= LOCOL && jj <= HICOL) {*/
                    if ( jj==BADCOL ) {
                        dprint_lu_col(pnum, "panel:", jcol, jj, w, pivrow,
                                      xprune, Glu);
                        dcheck_zero_vec(pnum, "after pzgstrf_copy_to_ucol() dense_col[]",
                                        n, &dense[k]);
                    }
#endif
                } /* for jj ... */

#ifdef PREDICT_OPT
                pdiv = Gstat->procstat[pnum].fcops - pdiv;
                cp_panel[jcol].pdiv = pdiv;
#endif
            } /* else regular panel ... */

            STATE( jcol ) = DONE; /* Release panel jcol. */

#ifdef PROFILE
            TOC(Gstat->panstat[jcol].fctime, t1);
            Gstat->panstat[jcol].flopcnt += Gstat->procstat[pnum].fcops - flopcnt;
            /*if ( Glu->tasks_remain < P ) {
                flops_last_P_panels += Gstat->panstat[jcol].flopcnt;
                printf("Panel %d, flops %e\n", jcol, Gstat->panstat[jcol].flopcnt);
                fflush(stdout);
            } */
#endif
        }
#ifdef PROFILE
        else { /* No panel from the task queue - wait and try again */
            Gstat->procstat[pnum].skedwaits++;
        }
#endif
    } /* while there are more panels */

    *info = singular;

    /* Free work space and compress storage */
    pzgstrf_WorkFree(iwork, dwork, Glu);
    SUPERLU_FREE(spa_marker);
    SUPERLU_FREE(w_lsub_end);

#ifdef PROFILE
    Gstat->procstat[pnum].fctime = SuperLU_timer_() - stime;
#endif

    return 0;
}
void silk_find_LTP_FIX(
    opus_int32        XXLTP_Q17[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ], /* O    Correlation matrix              */
    opus_int32        xXLTP_Q17[ MAX_NB_SUBFR * LTP_ORDER ],             /* O    Correlation vector              */
    const opus_int16  r_ptr[],                                           /* I    Residual signal after LPC       */
    const opus_int    lag[ MAX_NB_SUBFR ],                               /* I    LTP lags                        */
    const opus_int    subfr_length,                                      /* I    Subframe length                 */
    const opus_int    nb_subfr,                                          /* I    Number of subframes             */
    int               arch                                               /* I    Run-time architecture           */
)
{
    opus_int   i, k, extra_shifts;
    opus_int   xx_shifts, xX_shifts, XX_shifts;
    const opus_int16 *lag_ptr;
    opus_int32 *XXLTP_Q17_ptr, *xXLTP_Q17_ptr;
    opus_int32 xx, nrg, temp;

    xXLTP_Q17_ptr = xXLTP_Q17;
    XXLTP_Q17_ptr = XXLTP_Q17;
    for( k = 0; k < nb_subfr; k++ ) {
        lag_ptr = r_ptr - ( lag[ k ] + LTP_ORDER / 2 );

        silk_sum_sqr_shift( &xx, &xx_shifts, r_ptr, subfr_length + LTP_ORDER );                            /* xx in Q( -xx_shifts ) */
        silk_corrMatrix_FIX( lag_ptr, subfr_length, LTP_ORDER, XXLTP_Q17_ptr, &nrg, &XX_shifts, arch );    /* XXLTP_Q17_ptr and nrg in Q( -XX_shifts ) */

        extra_shifts = xx_shifts - XX_shifts;
        if( extra_shifts > 0 ) {
            /* Shift XX */
            xX_shifts = xx_shifts;
            for( i = 0; i < LTP_ORDER * LTP_ORDER; i++ ) {
                XXLTP_Q17_ptr[ i ] = silk_RSHIFT32( XXLTP_Q17_ptr[ i ], extra_shifts ); /* Q( -xX_shifts ) */
            }
            nrg = silk_RSHIFT32( nrg, extra_shifts ); /* Q( -xX_shifts ) */
        } else if( extra_shifts < 0 ) {
            /* Shift xx */
            xX_shifts = XX_shifts;
            xx = silk_RSHIFT32( xx, -extra_shifts ); /* Q( -xX_shifts ) */
        } else {
            xX_shifts = xx_shifts;
        }
        silk_corrVector_FIX( lag_ptr, r_ptr, subfr_length, LTP_ORDER, xXLTP_Q17_ptr, xX_shifts, arch ); /* xXLTP_Q17_ptr in Q( -xX_shifts ) */

        /* At this point all correlations are in Q(-xX_shifts) */
        temp = silk_SMLAWB( 1, nrg, SILK_FIX_CONST( LTP_CORR_INV_MAX, 16 ) );
        temp = silk_max( temp, xx );
        TIC(div)
#if 0
        for( i = 0; i < LTP_ORDER * LTP_ORDER; i++ ) {
            XXLTP_Q17_ptr[ i ] = silk_DIV32_varQ( XXLTP_Q17_ptr[ i ], temp, 17 );
        }
        for( i = 0; i < LTP_ORDER; i++ ) {
            xXLTP_Q17_ptr[ i ] = silk_DIV32_varQ( xXLTP_Q17_ptr[ i ], temp, 17 );
        }
#else
        for( i = 0; i < LTP_ORDER * LTP_ORDER; i++ ) {
            XXLTP_Q17_ptr[ i ] = (opus_int32)( silk_LSHIFT64( (opus_int64)XXLTP_Q17_ptr[ i ], 17 ) / temp );
        }
        for( i = 0; i < LTP_ORDER; i++ ) {
            xXLTP_Q17_ptr[ i ] = (opus_int32)( silk_LSHIFT64( (opus_int64)xXLTP_Q17_ptr[ i ], 17 ) / temp );
        }
#endif
        TOC(div)

        r_ptr         += subfr_length;
        XXLTP_Q17_ptr += LTP_ORDER * LTP_ORDER;
        xXLTP_Q17_ptr += LTP_ORDER;
    }
}
InstancePtr FeatureExtractor::extract(const Observation &obs, FeatureExtractorHistory &history) { TIC(total); assert(obs.preyInd == 0); InstancePtr instance(new Instance); TIC(pos); setFeature(instance,FeatureType::PredInd,obs.myInd - 1); // positions of agents for (unsigned int i = 0; i < obs.positions.size(); i++) { Point2D diff = getDifferenceToPoint(dims,obs.myPos(),obs.positions[i]); unsigned int key = FeatureType::Prey_dx + 2 * i; setFeature(instance,key,diff.x); setFeature(instance,key+1,diff.y); } TOC(pos); // derived features TIC(derived); bool next2prey = false; for (unsigned int a = 0; a < Action::NUM_NEIGHBORS; a++) { Point2D pos = movePosition(dims,obs.myPos(),(Action::Type)a); bool occupied = false; for (unsigned int i = 0; i < obs.positions.size(); i++) { if (i == obs.myInd) continue; if (obs.positions[i] == pos) { occupied = true; if (i == 0) next2prey = true; break; } } setFeature(instance,FeatureType::Occupied_0 + a, occupied); } setFeature(instance,FeatureType::NextToPrey,next2prey); TOC(derived); // actions predicted by models TIC(actions); // not currently supported //ActionProbs actionProbs; for (std::vector<FeatureAgent>::iterator it = featureAgents.begin(); it != featureAgents.end(); it++) { std::cerr << "FeatureExtractor can't handle featureAgents" << std::endl; exit(58); //actionProbs = it->agent->step(obs); //ADD_KEY(it->name + ".des"); //setFeature(instance,actionProbs.maxAction()); } TOC(actions); // update the history TIC(history); updateHistory(obs,history); // add the history features TIC(historyupdate); Action::Type action; for (unsigned int j = 0; j < HISTORY_SIZE; j++) { if (j < history.actionHistory[obs.myInd].size()) action = history.actionHistory[obs.myInd][j]; else action = Action::NUM_ACTIONS; setFeature(instance,FeatureType::MyHistoricalAction_0 + j,action); } TOC(historyupdate); /* for (unsigned int agentInd = 0; agentInd < obs.positions.size(); agentInd++) { for (unsigned int j = 0; j < HISTORY_SIZE; j++) { if (j < history.actionHistory[agentInd].size()) action = history.actionHistory[agentInd][j]; else action = Action::NUM_ACTIONS; if (USE_ALL_AGENTS_HISTORY) { std::cerr << "FeatureExtractor can't handle all agents history" << std::endl; exit(58); //ADD_KEY("HistoricalAction" + boost::lexical_cast<std::string>(agentInd) + "." + boost::lexical_cast<std::string>(j)); //setFeature(instance,action); } if (agentInd == obs.myInd) { setFeature(instance,FeatureType::MyHistoricalAction_0 + j,action); } } } */ TOC(history); instance->weight = 1.0; TOC(total); //std::cout << "instance: " << *instance << std::endl; return instance; }
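getDifferenceToPoint() above presumably returns the signed displacement from the agent to each other agent; on a wrap-around (toroidal) grid the shortest displacement per axis is what matters. The following is a hedged sketch under that assumption; the struct and helper names are illustrative, not the project's actual definitions.

struct Pt { int x, y; };

/* shortest signed delta from 'from' to 'to' on a ring of length size */
static int shortestDelta(int from, int to, int size)
{
    int d = to - from;
    if (d >  size / 2) d -= size;   /* wrapping the other way is shorter */
    if (d < -size / 2) d += size;
    return d;
}

static Pt differenceToPointSketch(Pt dims, Pt from, Pt to)
{
    Pt diff;
    diff.x = shortestDelta(from.x, to.x, dims.x);
    diff.y = shortestDelta(from.y, to.y, dims.y);
    return diff;
}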
static boost::tuple< boost::shared_ptr<Matrix>, boost::shared_ptr<Matrix> > transfer_operators(const Matrix &A, params &prm) { typedef typename backend::value_type<Matrix>::type Val; const size_t n = rows(A); TIC("aggregates"); Aggregates aggr(A, prm.aggr); prm.aggr.eps_strong *= 0.5; TOC("aggregates"); TIC("interpolation"); boost::shared_ptr<Matrix> P = boost::make_shared<Matrix>(); P->nrows = n; P->ncols = aggr.count; P->ptr.resize(n + 1, 0); #pragma omp parallel { std::vector<ptrdiff_t> marker(aggr.count, -1); #ifdef _OPENMP int nt = omp_get_num_threads(); int tid = omp_get_thread_num(); size_t chunk_size = (n + nt - 1) / nt; size_t chunk_start = tid * chunk_size; size_t chunk_end = std::min(n, chunk_start + chunk_size); #else size_t chunk_start = 0; size_t chunk_end = n; #endif // Count number of entries in P. for(size_t i = chunk_start; i < chunk_end; ++i) { for(ptrdiff_t j = A.ptr[i], e = A.ptr[i+1]; j < e; ++j) { size_t c = static_cast<size_t>(A.col[j]); // Skip weak off-diagonal connections. if (c != i && !aggr.strong_connection[j]) continue; ptrdiff_t g = aggr.id[c]; if (g >= 0 && static_cast<size_t>(marker[g]) != i) { marker[g] = static_cast<ptrdiff_t>(i); ++( P->ptr[i + 1] ); } } } boost::fill(marker, -1); #pragma omp barrier #pragma omp single { boost::partial_sum(P->ptr, P->ptr.begin()); P->col.resize(P->ptr.back()); P->val.resize(P->ptr.back()); } // Fill the interpolation matrix. for(size_t i = chunk_start; i < chunk_end; ++i) { // Diagonal of the filtered matrix is the original matrix // diagonal minus its weak connections. Val dia = 0; for(ptrdiff_t j = A.ptr[i], e = A.ptr[i+1]; j < e; ++j) { if (static_cast<size_t>(A.col[j]) == i) dia += A.val[j]; else if (!aggr.strong_connection[j]) dia -= A.val[j]; } dia = 1 / dia; ptrdiff_t row_beg = P->ptr[i]; ptrdiff_t row_end = row_beg; for(ptrdiff_t j = A.ptr[i], e = A.ptr[i + 1]; j < e; ++j) { size_t c = static_cast<size_t>(A.col[j]); // Skip weak couplings, ... if (c != i && !aggr.strong_connection[j]) continue; // ... and the ones not in any aggregate. ptrdiff_t g = aggr.id[c]; if (g < 0) continue; Val v = (c == i) ? 1 - prm.relax : -prm.relax * dia * A.val[j]; if (marker[g] < row_beg) { marker[g] = row_end; P->col[row_end] = g; P->val[row_end] = v; ++row_end; } else { P->val[ marker[g] ] += v; } } } } TOC("interpolation"); boost::shared_ptr<Matrix> R = boost::make_shared<Matrix>(); *R = transpose(*P); return boost::make_tuple(P, R); }
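The interpolation build above is a classic two-pass CSR assembly with a marker array: pass one counts the distinct columns per row (marker remembers the last row in which each column was seen), a prefix sum turns the counts into row pointers, and pass two writes the entries, reusing marker to accumulate duplicate contributions in place. A single-threaded sketch of the pattern; build_csr and its arguments are illustrative.

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

/* entries[i] lists the (possibly repeated) column indices of row i */
static void build_csr(const std::vector<std::vector<int> > &entries, int ncols,
                      std::vector<int> &ptr, std::vector<int> &col)
{
    std::vector<std::ptrdiff_t> marker(ncols, -1);
    ptr.assign(entries.size() + 1, 0);

    for (std::size_t i = 0; i < entries.size(); ++i)   // pass 1: count
        for (std::size_t j = 0; j < entries[i].size(); ++j)
            if (marker[entries[i][j]] != (std::ptrdiff_t)i) {
                marker[entries[i][j]] = (std::ptrdiff_t)i;
                ++ptr[i + 1];
            }

    std::partial_sum(ptr.begin(), ptr.end(), ptr.begin());
    col.resize(ptr.back());
    std::fill(marker.begin(), marker.end(), -1);

    for (std::size_t i = 0; i < entries.size(); ++i) { // pass 2: fill
        int row_beg = ptr[i], row_end = row_beg;
        for (std::size_t j = 0; j < entries[i].size(); ++j) {
            int c = entries[i][j];
            if (marker[c] < row_beg) {   // first occurrence in this row
                marker[c] = row_end;
                col[row_end++] = c;
            }
            /* else: duplicate; the value would be accumulated at
               position marker[c], as the OpenMP code above does */
        }
    }
}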
void SKP_Silk_find_pred_coefs_FIX( SKP_Silk_encoder_state_FIX *psEnc, /* I/O encoder state */ SKP_Silk_encoder_control_FIX *psEncCtrl, /* I/O encoder control */ const SKP_int16 res_pitch[], /* I Residual from pitch analysis */ const SKP_int16 x[] /* I Speech signal */ ) { SKP_int i; SKP_int32 WLTP[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ]; SKP_int32 invGains_Q16[ MAX_NB_SUBFR ], local_gains[ MAX_NB_SUBFR ], Wght_Q15[ MAX_NB_SUBFR ]; SKP_int16 NLSF_Q15[ MAX_LPC_ORDER ]; const SKP_int16 *x_ptr; SKP_int16 *x_pre_ptr, LPC_in_pre[ MAX_NB_SUBFR * MAX_LPC_ORDER + MAX_FRAME_LENGTH ]; SKP_int32 tmp, min_gain_Q16; SKP_int LTP_corrs_rshift[ MAX_NB_SUBFR ]; /* weighting for weighted least squares */ min_gain_Q16 = SKP_int32_MAX >> 6; for( i = 0; i < psEnc->sCmn.nb_subfr; i++ ) { min_gain_Q16 = SKP_min( min_gain_Q16, psEncCtrl->Gains_Q16[ i ] ); } for( i = 0; i < psEnc->sCmn.nb_subfr; i++ ) { /* Divide to Q16 */ SKP_assert( psEncCtrl->Gains_Q16[ i ] > 0 ); /* Invert and normalize gains, and ensure that maximum invGains_Q16 is within range of a 16 bit int */ invGains_Q16[ i ] = SKP_DIV32_varQ( min_gain_Q16, psEncCtrl->Gains_Q16[ i ], 16 - 2 ); /* Ensure Wght_Q15 a minimum value 1 */ invGains_Q16[ i ] = SKP_max( invGains_Q16[ i ], 363 ); /* Square the inverted gains */ SKP_assert( invGains_Q16[ i ] == SKP_SAT16( invGains_Q16[ i ] ) ); tmp = SKP_SMULWB( invGains_Q16[ i ], invGains_Q16[ i ] ); Wght_Q15[ i ] = SKP_RSHIFT( tmp, 1 ); /* Invert the inverted and normalized gains */ local_gains[ i ] = SKP_DIV32( ( 1 << 16 ), invGains_Q16[ i ] ); } if( psEnc->sCmn.indices.signalType == TYPE_VOICED ) { /**********/ /* VOICED */ /**********/ SKP_assert( psEnc->sCmn.ltp_mem_length - psEnc->sCmn.predictLPCOrder >= psEncCtrl->pitchL[ 0 ] + LTP_ORDER / 2 ); /* LTP analysis */ SKP_Silk_find_LTP_FIX( psEncCtrl->LTPCoef_Q14, WLTP, &psEncCtrl->LTPredCodGain_Q7, res_pitch, psEncCtrl->pitchL, Wght_Q15, psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr, psEnc->sCmn.ltp_mem_length, LTP_corrs_rshift ); /* Quantize LTP gain parameters */ SKP_Silk_quant_LTP_gains( psEncCtrl->LTPCoef_Q14, psEnc->sCmn.indices.LTPIndex, &psEnc->sCmn.indices.PERIndex, WLTP, psEnc->sCmn.mu_LTP_Q9, psEnc->sCmn.LTPQuantLowComplexity, psEnc->sCmn.nb_subfr); /* Control LTP scaling */ SKP_Silk_LTP_scale_ctrl_FIX( psEnc, psEncCtrl ); /* Create LTP residual */ SKP_Silk_LTP_analysis_filter_FIX( LPC_in_pre, psEnc->x_buf + psEnc->sCmn.ltp_mem_length - psEnc->sCmn.predictLPCOrder, psEncCtrl->LTPCoef_Q14, psEncCtrl->pitchL, invGains_Q16, psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr, psEnc->sCmn.predictLPCOrder ); } else { /************/ /* UNVOICED */ /************/ /* Create signal with prepended subframes, scaled by inverse gains */ x_ptr = x - psEnc->sCmn.predictLPCOrder; x_pre_ptr = LPC_in_pre; for( i = 0; i < psEnc->sCmn.nb_subfr; i++ ) { SKP_Silk_scale_copy_vector16( x_pre_ptr, x_ptr, invGains_Q16[ i ], psEnc->sCmn.subfr_length + psEnc->sCmn.predictLPCOrder ); x_pre_ptr += psEnc->sCmn.subfr_length + psEnc->sCmn.predictLPCOrder; x_ptr += psEnc->sCmn.subfr_length; } SKP_memset( psEncCtrl->LTPCoef_Q14, 0, psEnc->sCmn.nb_subfr * LTP_ORDER * sizeof( SKP_int16 ) ); psEncCtrl->LTPredCodGain_Q7 = 0; } /* LPC_in_pre contains the LTP-filtered input for voiced, and the unfiltered input for unvoiced */ TIC(FIND_LPC) SKP_Silk_find_LPC_FIX( NLSF_Q15, &psEnc->sCmn.indices.NLSFInterpCoef_Q2, psEnc->sCmn.prev_NLSFq_Q15, psEnc->sCmn.useInterpolatedNLSFs, psEnc->sCmn.first_frame_after_reset, psEnc->sCmn.predictLPCOrder, LPC_in_pre, psEnc->sCmn.subfr_length + 
psEnc->sCmn.predictLPCOrder, psEnc->sCmn.nb_subfr ); TOC(FIND_LPC) /* Quantize LSFs */ TIC(PROCESS_LSFS) SKP_Silk_process_NLSFs( &psEnc->sCmn, psEncCtrl->PredCoef_Q12, NLSF_Q15, psEnc->sCmn.prev_NLSFq_Q15 ); TOC(PROCESS_LSFS) /* Calculate residual energy using quantized LPC coefficients */ SKP_Silk_residual_energy_FIX( psEncCtrl->ResNrg, psEncCtrl->ResNrgQ, LPC_in_pre, psEncCtrl->PredCoef_Q12, local_gains, psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr, psEnc->sCmn.predictLPCOrder ); /* Copy to prediction struct for use in next frame for fluctuation reduction */ SKP_memcpy( psEnc->sCmn.prev_NLSFq_Q15, NLSF_Q15, sizeof( psEnc->sCmn.prev_NLSFq_Q15 ) ); }
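The gain loop above leans on two fixed-point primitives: SKP_SMULWB(a, b), a 32x16 multiply keeping the top 32 bits of the Q16-style product, and SKP_DIV32_varQ(n, d, Q), a division whose result is returned in the requested Q format. Idealized emulations of the arithmetic follow; the real versions add normalization and platform-specific optimizations.

#include <stdint.h>

/* (a * (int16)b) >> 16 -- e.g. Q16 * Q16 -> Q16 */
static int32_t smulwb(int32_t a, int32_t b)
{
    return (int32_t)(((int64_t)a * (int16_t)b) >> 16);
}

/* (num / den) scaled into Q(q), assuming num and den share a Q format */
static int32_t div32_varQ(int32_t num, int32_t den, int q)
{
    return (int32_t)(((int64_t)num << q) / den);
}

/* e.g. squaring a Q16 inverse gain and halving into Q15, as above:
       tmp      = smulwb(invGain_Q16, invGain_Q16);
       wght_Q15 = tmp >> 1;                                           */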
SKP_int SKP_Silk_decode_frame( SKP_Silk_decoder_state *psDec, /* I/O Pointer to Silk decoder state */ ec_dec *psRangeDec, /* I/O Compressor data structure */ SKP_int16 pOut[], /* O Pointer to output speech frame */ SKP_int32 *pN, /* O Pointer to size of output frame */ const SKP_int nBytes, /* I Payload length */ SKP_int lostFlag /* I 0: no loss, 1 loss, 2 decode fec */ ) { SKP_Silk_decoder_control sDecCtrl; SKP_int i, L, mv_len, ret = 0; SKP_int8 flags; SKP_int32 LBRR_symbol; SKP_int pulses[ MAX_FRAME_LENGTH ]; TIC(DECODE_FRAME) L = psDec->frame_length; sDecCtrl.LTP_scale_Q14 = 0; /* Safety checks */ SKP_assert( L > 0 && L <= MAX_FRAME_LENGTH ); /********************************************/ /* Decode Frame if packet is not lost */ /********************************************/ if( lostFlag != PACKET_LOST && psDec->nFramesDecoded == 0 ) { /* First decoder call for this payload */ /* Decode VAD flags and LBRR flag */ flags = SKP_RSHIFT( psRangeDec->buf[ 0 ], 7 - psDec->nFramesPerPacket ) & ( SKP_LSHIFT( 1, psDec->nFramesPerPacket + 1 ) - 1 ); psDec->LBRR_flag = flags & 1; for( i = psDec->nFramesPerPacket - 1; i >= 0 ; i-- ) { flags = SKP_RSHIFT( flags, 1 ); psDec->VAD_flags[ i ] = flags & 1; } for( i = 0; i < psDec->nFramesPerPacket + 1; i++ ) { ec_dec_icdf( psRangeDec, SKP_Silk_uniform2_iCDF, 8 ); } /* Decode LBRR flags */ SKP_memset( psDec->LBRR_flags, 0, sizeof( psDec->LBRR_flags ) ); if( psDec->LBRR_flag ) { if( psDec->nFramesPerPacket == 1 ) { psDec->LBRR_flags[ 0 ] = 1; } else { LBRR_symbol = ec_dec_icdf( psRangeDec, SKP_Silk_LBRR_flags_iCDF_ptr[ psDec->nFramesPerPacket - 2 ], 8 ) + 1; for( i = 0; i < psDec->nFramesPerPacket; i++ ) { psDec->LBRR_flags[ i ] = SKP_RSHIFT( LBRR_symbol, i ) & 1; } } } if( lostFlag == DECODE_NORMAL ) { /* Regular decoding: skip all LBRR data */ for( i = 0; i < psDec->nFramesPerPacket; i++ ) { if( psDec->LBRR_flags[ i ] ) { SKP_Silk_decode_indices( psDec, psRangeDec, i, 1 ); SKP_Silk_decode_pulses( psRangeDec, pulses, psDec->indices.signalType, psDec->indices.quantOffsetType, psDec->frame_length ); } } } } if( lostFlag == DECODE_LBRR && psDec->LBRR_flags[ psDec->nFramesDecoded ] == 0 ) { /* Treat absent LBRR data as lost frame */ lostFlag = PACKET_LOST; psDec->nFramesDecoded++; } if( lostFlag != PACKET_LOST ) { /*********************************************/ /* Decode quantization indices of side info */ /*********************************************/ TIC(decode_indices) SKP_Silk_decode_indices( psDec, psRangeDec, psDec->nFramesDecoded, lostFlag ); TOC(decode_indices) /*********************************************/ /* Decode quantization indices of excitation */ /*********************************************/ TIC(decode_pulses) SKP_Silk_decode_pulses( psRangeDec, pulses, psDec->indices.signalType, psDec->indices.quantOffsetType, psDec->frame_length ); TOC(decode_pulses) /********************************************/ /* Decode parameters and pulse signal */ /********************************************/ TIC(decode_params) SKP_Silk_decode_parameters( psDec, &sDecCtrl ); TOC(decode_params) /* Update length. 
Sampling frequency may have changed */ L = psDec->frame_length; /********************************************************/ /* Run inverse NSQ */ /********************************************************/ TIC(decode_core) SKP_Silk_decode_core( psDec, &sDecCtrl, pOut, pulses ); TOC(decode_core) /********************************************************/ /* Update PLC state */ /********************************************************/ SKP_Silk_PLC( psDec, &sDecCtrl, pOut, L, 0 ); psDec->lossCnt = 0; psDec->prevSignalType = psDec->indices.signalType; SKP_assert( psDec->prevSignalType >= 0 && psDec->prevSignalType <= 2 ); /* A frame has been decoded without errors */ psDec->first_frame_after_reset = 0; psDec->nFramesDecoded++; } else { /* Handle packet loss by extrapolation */ SKP_Silk_PLC( psDec, &sDecCtrl, pOut, L, 1 ); } /*************************/ /* Update output buffer. */ /*************************/ SKP_assert( psDec->ltp_mem_length >= psDec->frame_length ); mv_len = psDec->ltp_mem_length - psDec->frame_length; SKP_memmove( psDec->outBuf, &psDec->outBuf[ psDec->frame_length ], mv_len * sizeof(SKP_int16) ); SKP_memcpy( &psDec->outBuf[ mv_len ], pOut, psDec->frame_length * sizeof( SKP_int16 ) ); /****************************************************************/ /* Ensure smooth connection of extrapolated and good frames */ /****************************************************************/ SKP_Silk_PLC_glue_frames( psDec, &sDecCtrl, pOut, L ); /************************************************/ /* Comfort noise generation / estimation */ /************************************************/ SKP_Silk_CNG( psDec, &sDecCtrl, pOut, L ); /********************************************/ /* HP filter output */ /********************************************/ TIC(HP_out) SKP_Silk_biquad_alt( pOut, psDec->HP_B, psDec->HP_A, psDec->HPState, pOut, L ); TOC(HP_out) /* Update some decoder state variables */ psDec->lagPrev = sDecCtrl.pitchL[ psDec->nb_subfr - 1 ]; /********************************************/ /* set output frame length */ /********************************************/ *pN = ( SKP_int16 )L; TOC(DECODE_FRAME) return ret; }
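The output-buffer update near the end of the decoder above is a sliding window over ltp_mem_length samples of history: drop the oldest frame_length samples, then append the frame just decoded. A minimal sketch of that pattern:

#include <stdint.h>
#include <string.h>

static void slide_out_buf(int16_t *outBuf, const int16_t *frame,
                          int ltp_mem_length, int frame_length)
{
    int mv_len = ltp_mem_length - frame_length;
    /* shift the retained history to the front of the buffer ... */
    memmove(outBuf, outBuf + frame_length, mv_len * sizeof(int16_t));
    /* ... and append the newly decoded frame */
    memcpy(outBuf + mv_len, frame, frame_length * sizeof(int16_t));
}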
void psgstrf_panel_bmod( const int pnum, /* process number */ const int m, /* number of rows in the matrix */ const int w, /* current panel width */ const int jcol, /* leading column of the current panel */ const int bcol, /* first column of the farthest busy snode*/ int *inv_perm_r,/* in; inverse of the row pivoting */ int *etree, /* in */ int *nseg, /* modified */ int *segrep, /* modified */ int *repfnz, /* modified, size n-by-w */ int *panel_lsub,/* modified */ int *w_lsub_end,/* modified */ int *spa_marker,/* modified; size n-by-w */ float *dense, /* modified, size n-by-w */ float *tempv, /* working array - zeros on input/output */ pxgstrf_shared_t *pxgstrf_shared /* modified */ ) { /* * -- SuperLU MT routine (version 2.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley, * and Xerox Palo Alto Research Center. * September 10, 2007 * * Purpose * ======= * * Performs numeric block updates (sup-panel) in topological order. * It features combined 1D and 2D blocking of the source updating s-node. * It consists of two steps: * (1) accumulates updates from "done" s-nodes. * (2) accumulates updates from "busy" s-nodes. * * Before entering this routine, the nonzeros of the original A in * this panel were already copied into the SPA dense[n,w]. * * Updated/Output arguments * ======================== * L[*,j:j+w-1] and U[*,j:j+w-1] are returned collectively in the * m-by-w vector dense[*,w]. The locations of nonzeros in L[*,j:j+w-1] * are given by lsub[*] and U[*,j:j+w-1] by (nseg,segrep,repfnz). * */ GlobalLU_t *Glu = pxgstrf_shared->Glu; /* modified */ Gstat_t *Gstat = pxgstrf_shared->Gstat; /* modified */ register int j, k, ksub; register int fsupc, nsupc, nsupr, nrow; register int kcol, krep, ksupno, dadsupno; register int jj; /* index through each column in the panel */ int *xsup, *xsup_end, *supno; int *lsub, *xlsub, *xlsub_end; int *repfnz_col; /* repfnz[] for a column in the panel */ float *dense_col; /* dense[] for a column in the panel */ int *col_marker; /* each column of the spa_marker[*,w] */ int *col_lsub; /* each column of the panel_lsub[*,w] */ static int first = 1, rowblk, colblk; #ifdef PROFILE double t1, t2; /* temporary time */ #endif #ifdef PREDICT_OPT register float pmod, max_child_eft = 0, sum_pmod = 0, min_desc_eft = 0; register float pmod_eft; register int kid, ndesc = 0; #endif #if ( DEBUGlevel>=2 ) int dbg_addr = 0*m; #endif if ( first ) { rowblk = sp_ienv(4); colblk = sp_ienv(5); first = 0; } xsup = Glu->xsup; xsup_end = Glu->xsup_end; supno = Glu->supno; lsub = Glu->lsub; xlsub = Glu->xlsub; xlsub_end = Glu->xlsub_end; #if ( DEBUGlevel>=2 ) /*if (jcol >= LOCOL && jcol <= HICOL) check_panel_dfs_list(pnum, "begin", jcol, *nseg, segrep);*/ if (jcol == BADPAN) printf("(%d) Enter psgstrf_panel_bmod() jcol %d,BADCOL %d,dense_col[%d] %.10f\n", pnum, jcol, BADCOL, BADROW, dense[dbg_addr+BADROW]); #endif /* -------------------------------------------------------------------- For each non-busy supernode segment of U[*,jcol] in topological order, perform sup-panel update. 
-------------------------------------------------------------------- */ k = *nseg - 1; for (ksub = 0; ksub < *nseg; ++ksub) { /* * krep = representative of current k-th supernode * fsupc = first supernodal column * nsupc = no of columns in a supernode * nsupr = no of rows in a supernode */ krep = segrep[k--]; fsupc = xsup[supno[krep]]; nsupc = krep - fsupc + 1; nsupr = xlsub_end[fsupc] - xlsub[fsupc]; nrow = nsupr - nsupc; #ifdef PREDICT_OPT pmod = Gstat->procstat[pnum].fcops; #endif if ( nsupc >= colblk && nrow >= rowblk ) { /* 2-D block update */ #ifdef GEMV2 psgstrf_bmod2D_mv2(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, Glu, Gstat); #else psgstrf_bmod2D(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, Glu, Gstat); #endif } else { /* 1-D block update */ #ifdef GEMV2 psgstrf_bmod1D_mv2(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, Glu, Gstat); #else psgstrf_bmod1D(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, Glu, Gstat); #endif } #ifdef PREDICT_OPT pmod = Gstat->procstat[pnum].fcops - pmod; kid = (Glu->pan_status[krep].size > 0) ? krep : (krep + Glu->pan_status[krep].size); desc_eft[ndesc].eft = cp_panel[kid].est + cp_panel[kid].pdiv; desc_eft[ndesc++].pmod = pmod; #endif #if ( DEBUGlevel>=2 ) if (jcol == BADPAN) printf("(%d) non-busy update: krep %d, repfnz %d, dense_col[%d] %.10e\n", pnum, krep, repfnz[dbg_addr+krep], BADROW, dense[dbg_addr+BADROW]); #endif } /* for each updating supernode ... */ #if ( DEBUGlevel>=2 ) if (jcol == BADPAN) printf("(%d) After non-busy update: dense_col[%d] %.10e\n", pnum, BADROW, dense[dbg_addr+BADROW]); #endif /* --------------------------------------------------------------------- * Now wait for the "busy" s-nodes to become "done" -- this amounts to * climbing up the e-tree along the path starting from "bcol". * Several points are worth noting: * * (1) There are two possible relations between supernodes and panels * along the path of the e-tree: * o |s-node| < |panel| * want to climb up the e-tree one column at a time in order * to achieve more concurrency * o |s-node| > |panel| * want to climb up the e-tree one panel at a time; this * processor is stalled anyway while waiting for the panel. * * (2) Need to accommodate new fills, append them in panel_lsub[*,w]. * o use an n-by-w marker array, as part of the SPA (not scalable!) * * (3) Symbolically, need to find out repfnz[S, w], for each (busy) * supernode S. * o use dense[inv_perm_r[kcol]], filter all zeros * o detect the first nonzero in each segment * (at this moment, the boundary of the busy supernode/segment * S has already been identified) * * --------------------------------------------------------------------- */ kcol = bcol; while ( kcol < jcol ) { /* Pointers to each column of the w-wide arrays. */ repfnz_col = repfnz; dense_col = dense; col_marker = spa_marker; col_lsub = panel_lsub; /* Wait for the supernode, and collect wait-time statistics. 
*/ if ( pxgstrf_shared->spin_locks[kcol] ) { #ifdef PROFILE TIC(t1); #endif await( &pxgstrf_shared->spin_locks[kcol] ); #ifdef PROFILE TOC(t2, t1); Gstat->panstat[jcol].pipewaits++; Gstat->panstat[jcol].spintime += t2; Gstat->procstat[pnum].spintime += t2; #ifdef DOPRINT PRINT_SPIN_TIME(1); #endif #endif } /* Find leading column "fsupc" in the supernode that contains column "kcol" */ ksupno = supno[kcol]; fsupc = kcol; #if ( DEBUGlevel>=2 ) /*if (jcol >= LOCOL && jcol <= HICOL) */ if ( jcol==BADCOL ) printf("(%d) psgstrf_panel_bmod[1] kcol %d, ksupno %d, fsupc %d\n", pnum, kcol, ksupno, fsupc); #endif /* Wait for the whole supernode to become "done" -- climb up e-tree one column at a time */ do { krep = SUPER_REP( ksupno ); kcol = etree[kcol]; if ( kcol >= jcol ) break; if ( pxgstrf_shared->spin_locks[kcol] ) { #ifdef PROFILE TIC(t1); #endif await ( &pxgstrf_shared->spin_locks[kcol] ); #ifdef PROFILE TOC(t2, t1); Gstat->panstat[jcol].pipewaits++; Gstat->panstat[jcol].spintime += t2; Gstat->procstat[pnum].spintime += t2; #ifdef DOPRINT PRINT_SPIN_TIME(2); #endif #endif } dadsupno = supno[kcol]; #if ( DEBUGlevel>=2 ) /*if (jcol >= LOCOL && jcol <= HICOL)*/ if ( jcol==BADCOL ) printf("(%d) psgstrf_panel_bmod[2] krep %d, dad=kcol %d, dadsupno %d\n", pnum, krep, kcol, dadsupno); #endif } while ( dadsupno == ksupno ); /* Append the new segment into segrep[*]. After column_bmod(), copy_to_ucol() will use them. */ segrep[*nseg] = krep; ++(*nseg); /* Determine repfnz[krep, w] for each column in the panel */ for (jj = jcol; jj < jcol + w; ++jj, dense_col += m, repfnz_col += m, col_marker += m, col_lsub += m) { /* * Note: relaxed supernode may not form a path on the e-tree, * but its column numbers are contiguous. */ #ifdef SCATTER_FOUND for (kcol = fsupc; kcol <= krep; ++kcol) { if ( col_marker[inv_perm_r[kcol]] == jj ) { repfnz_col[krep] = kcol; /* Append new fills in panel_lsub[*,jj]. */ j = w_lsub_end[jj - jcol]; /*#pragma ivdep*/ for (k = xlsub[krep]; k < xlsub_end[krep]; ++k) { ksub = lsub[k]; if ( col_marker[ksub] != jj ) { col_marker[ksub] = jj; col_lsub[j++] = ksub; } } w_lsub_end[jj - jcol] = j; break; /* found the leading nonzero in the segment */ } } #else for (kcol = fsupc; kcol <= krep; ++kcol) { if ( dense_col[inv_perm_r[kcol]] != 0.0 ) { repfnz_col[krep] = kcol; break; /* Found the leading nonzero in the U-segment */ } } /* In this case, we always treat the L-subscripts of the busy s-node [kcol : krep] as the new fills, even if the corresponding U-segment may be all zero. */ /* Append new fills in panel_lsub[*,jj]. */ j = w_lsub_end[jj - jcol]; /*#pragma ivdep*/ for (k = xlsub[krep]; k < xlsub_end[krep]; ++k) { ksub = lsub[k]; if ( col_marker[ksub] != jj ) { col_marker[ksub] = jj; col_lsub[j++] = ksub; } } w_lsub_end[jj - jcol] = j; #endif #if ( DEBUGlevel>=2 ) if (jj == BADCOL) { printf("(%d) psgstrf_panel_bmod[fills]: jj %d, repfnz_col[%d] %d, inv_pr[%d] %d\n", pnum, jj, krep, repfnz_col[krep], fsupc, inv_perm_r[fsupc]); printf("(%d) psgstrf_panel_bmod[fills] xlsub %d, xlsub_end %d, #lsub[%d] %d\n", pnum,xlsub[krep],xlsub_end[krep],krep, xlsub_end[krep]-xlsub[krep]); } #endif } /* for jj ... */ #ifdef PREDICT_OPT pmod = Gstat->procstat[pnum].fcops; #endif /* Perform sup-panel updates - use combined 1D + 2D updates. 
*/ nsupc = krep - fsupc + 1; nsupr = xlsub_end[fsupc] - xlsub[fsupc]; nrow = nsupr - nsupc; if ( nsupc >= colblk && nrow >= rowblk ) { /* 2-D block update */ #ifdef GEMV2 psgstrf_bmod2D_mv2(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, Glu, Gstat); #else psgstrf_bmod2D(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, Glu, Gstat); #endif } else { /* 1-D block update */ #ifdef GEMV2 psgstrf_bmod1D_mv2(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, Glu, Gstat); #else psgstrf_bmod1D(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, Glu, Gstat); #endif } #ifdef PREDICT_OPT pmod = Gstat->procstat[pnum].fcops - pmod; kid = (pxgstrf_shared->pan_status[krep].size > 0) ? krep : (krep + pxgstrf_shared->pan_status[krep].size); desc_eft[ndesc].eft = cp_panel[kid].est + cp_panel[kid].pdiv; desc_eft[ndesc++].pmod = pmod; #endif #if ( DEBUGlevel>=2 ) if (jcol == BADPAN) printf("(%d) After busy update: dense_col[%d] %.10f\n", pnum, BADROW, dense[dbg_addr+BADROW]); #endif /* Go to the parent of "krep" */ kcol = etree[krep]; } /* while kcol < jcol ... */ #if ( DEBUGlevel>=2 ) /*if (jcol >= LOCOL && jcol <= HICOL)*/ if ( jcol==BADCOL ) check_panel_dfs_list(pnum, "after-busy", jcol, *nseg, segrep); #endif #ifdef PREDICT_OPT qsort(desc_eft, ndesc, sizeof(desc_eft_t), (int(*)())numcomp); pmod_eft = 0; for (j = 0; j < ndesc; ++j) { pmod_eft = SUPERLU_MAX( pmod_eft, desc_eft[j].eft ) + desc_eft[j].pmod; } if ( ndesc == 0 ) { /* No modifications from descendants */ pmod_eft = 0; for (j = cp_firstkid[jcol]; j != EMPTY; j = cp_nextkid[j]) { kid = (pxgstrf_shared->pan_status[j].size > 0) ? j : (j + pxgstrf_shared->pan_status[j].size); pmod_eft = SUPERLU_MAX( pmod_eft, cp_panel[kid].est + cp_panel[kid].pdiv ); } } cp_panel[jcol].est = pmod_eft; #endif }
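Pipelining in the routine above hinges on await(): a producer releases column jj by writing 0 to pxgstrf_shared->spin_locks[jj] (see the panel loop earlier), and consumers busy-wait on that word before reading the column. The sketch below shows what such a wait amounts to, expressed with a C++11 atomic for portability; the original relies on plain loads on its target platforms.

#include <atomic>

/* spin until the producer clears the per-column flag */
static void await_sketch(const std::atomic<int> *lock)
{
    while (lock->load(std::memory_order_acquire) != 0) {
        /* optionally pause/yield here to be polite to the core */
    }
}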
/* Limit, stabilize, convert and quantize NLSFs. */ void SKP_Silk_process_NLSFs_FIX( SKP_Silk_encoder_state_FIX *psEnc, /* I/O Encoder state FIX */ SKP_Silk_encoder_control_FIX *psEncCtrl, /* I/O Encoder control FIX */ SKP_int *pNLSF_Q15 /* I/O Normalized LSFs (quant out) (0 - (2^15-1)) */ ) { SKP_int doInterpolate; SKP_int pNLSFW_Q6[ MAX_LPC_ORDER ]; SKP_int NLSF_mu_Q15, NLSF_mu_fluc_red_Q16; SKP_int32 i_sqr_Q15; const SKP_Silk_NLSF_CB_struct *psNLSF_CB; /* Used only for NLSF interpolation */ SKP_int pNLSF0_temp_Q15[ MAX_LPC_ORDER ]; SKP_int pNLSFW0_temp_Q6[ MAX_LPC_ORDER ]; SKP_int i; SKP_assert( psEnc->speech_activity_Q8 >= 0 ); SKP_assert( psEnc->speech_activity_Q8 <= 256 ); SKP_assert( psEncCtrl->sparseness_Q8 >= 0 ); SKP_assert( psEncCtrl->sparseness_Q8 <= 256 ); SKP_assert( psEncCtrl->sCmn.sigtype == SIG_TYPE_VOICED || psEncCtrl->sCmn.sigtype == SIG_TYPE_UNVOICED ); /***********************/ /* Calculate mu values */ /***********************/ if( psEncCtrl->sCmn.sigtype == SIG_TYPE_VOICED ) { /* NLSF_mu = 0.002f - 0.001f * psEnc->speech_activity; */ /* NLSF_mu_fluc_red = 0.1f - 0.05f * psEnc->speech_activity; */ NLSF_mu_Q15 = SKP_SMLAWB( 66, -8388, psEnc->speech_activity_Q8 ); NLSF_mu_fluc_red_Q16 = SKP_SMLAWB( 6554, -838848, psEnc->speech_activity_Q8 ); } else { /* NLSF_mu = 0.005f - 0.004f * psEnc->speech_activity; */ /* NLSF_mu_fluc_red = 0.2f - 0.1f * psEnc->speech_activity - 0.1f * psEncCtrl->sparseness; */ NLSF_mu_Q15 = SKP_SMLAWB( 164, -33554, psEnc->speech_activity_Q8 ); NLSF_mu_fluc_red_Q16 = SKP_SMLAWB( 13107, -1677696, psEnc->speech_activity_Q8 + psEncCtrl->sparseness_Q8 ); } SKP_assert( NLSF_mu_Q15 >= 0 ); SKP_assert( NLSF_mu_Q15 <= 164 ); SKP_assert( NLSF_mu_fluc_red_Q16 >= 0 ); SKP_assert( NLSF_mu_fluc_red_Q16 <= 13107 ); NLSF_mu_Q15 = SKP_max( NLSF_mu_Q15, 1 ); /* Calculate NLSF weights */ TIC(NLSF_weights_FIX) SKP_Silk_NLSF_VQ_weights_laroia( pNLSFW_Q6, pNLSF_Q15, psEnc->sCmn.predictLPCOrder ); TOC(NLSF_weights_FIX) /* Update NLSF weights for interpolated NLSFs */ doInterpolate = ( psEnc->sCmn.useInterpolatedNLSFs == 1 ) && ( psEncCtrl->sCmn.NLSFInterpCoef_Q2 < ( 1 << 2 ) ); if( doInterpolate ) { /* Calculate the interpolated NLSF vector for the first half */ SKP_Silk_interpolate( pNLSF0_temp_Q15, psEnc->sPred.prev_NLSFq_Q15, pNLSF_Q15, psEncCtrl->sCmn.NLSFInterpCoef_Q2, psEnc->sCmn.predictLPCOrder ); /* Calculate first half NLSF weights for the interpolated NLSFs */ TIC(NLSF_weights_FIX) SKP_Silk_NLSF_VQ_weights_laroia( pNLSFW0_temp_Q6, pNLSF0_temp_Q15, psEnc->sCmn.predictLPCOrder ); TOC(NLSF_weights_FIX) /* Update NLSF weights with contribution from first half */ i_sqr_Q15 = SKP_LSHIFT( SKP_SMULBB( psEncCtrl->sCmn.NLSFInterpCoef_Q2, psEncCtrl->sCmn.NLSFInterpCoef_Q2 ), 11 ); for( i = 0; i < psEnc->sCmn.predictLPCOrder; i++ ) { pNLSFW_Q6[ i ] = SKP_SMLAWB( SKP_RSHIFT( pNLSFW_Q6[ i ], 1 ), pNLSFW0_temp_Q6[ i ], i_sqr_Q15 ); SKP_assert( pNLSFW_Q6[ i ] <= SKP_int16_MAX ); SKP_assert( pNLSFW_Q6[ i ] >= 1 ); } } /* Set pointer to the NLSF codebook for the current signal type and LPC order */ psNLSF_CB = psEnc->sCmn.psNLSF_CB[ psEncCtrl->sCmn.sigtype ]; /* Quantize NLSF parameters given the trained NLSF codebooks */ TIC(MSVQ_encode_FIX) SKP_Silk_NLSF_MSVQ_encode_FIX( psEncCtrl->sCmn.NLSFIndices, pNLSF_Q15, psNLSF_CB, psEnc->sPred.prev_NLSFq_Q15, pNLSFW_Q6, NLSF_mu_Q15, NLSF_mu_fluc_red_Q16, psEnc->sCmn.NLSF_MSVQ_Survivors, psEnc->sCmn.predictLPCOrder, psEnc->sCmn.first_frame_after_reset ); TOC(MSVQ_encode_FIX) /* Convert quantized NLSFs back to LPC coefficients */ 
SKP_Silk_NLSF2A_stable( psEncCtrl->PredCoef_Q12[ 1 ], pNLSF_Q15, psEnc->sCmn.predictLPCOrder ); if( doInterpolate ) { /* Calculate the interpolated, quantized LSF vector for the first half */ SKP_Silk_interpolate( pNLSF0_temp_Q15, psEnc->sPred.prev_NLSFq_Q15, pNLSF_Q15, psEncCtrl->sCmn.NLSFInterpCoef_Q2, psEnc->sCmn.predictLPCOrder ); /* Convert back to LPC coefficients */ SKP_Silk_NLSF2A_stable( psEncCtrl->PredCoef_Q12[ 0 ], pNLSF0_temp_Q15, psEnc->sCmn.predictLPCOrder ); } else { /* Copy LPC coefficients for first half from second half */ SKP_memcpy( psEncCtrl->PredCoef_Q12[ 0 ], psEncCtrl->PredCoef_Q12[ 1 ], psEnc->sCmn.predictLPCOrder * sizeof( SKP_int16 ) ); } }
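SKP_Silk_interpolate() above blends the previous and current NLSF vectors with a factor in Q2 (0..4), i.e. out = prev + (ifact_Q2 * (cur - prev)) >> 2. A minimal sketch of that linear interpolation; the helper name and argument order are illustrative.

#include <stdint.h>

static void interpolate_q2(int32_t out[], const int32_t prev[],
                           const int32_t cur[], int32_t ifact_Q2, int order)
{
    int i;
    for (i = 0; i < order; i++)
        out[i] = prev[i] + ((ifact_Q2 * (cur[i] - prev[i])) >> 2);
}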