Esempio n. 1
0
    /// Builds the prolongation (P) and restriction (R) operators from the
    /// aggregates of A.
    ///
    /// \param A   Matrix whose aggregates define the tentative prolongation.
    /// \param prm Coarsening parameters. On return prm.aggr.block_size equals
    ///            prm.nullspace.cols whenever prm.nullspace.cols is positive.
    /// \returns   Tuple (P, R) where R holds transpose(*P).
    static boost::tuple<
        boost::shared_ptr<Matrix>,
        boost::shared_ptr<Matrix>
        >
    transfer_operators(const Matrix &A, params &prm)
    {
        typedef typename backend::value_type<Matrix>::type value_type;

        const size_t nrows = rows(A);

        TIC("aggregates");
        Aggregates aggregates(A, prm.aggr, prm.nullspace.cols);
        TOC("aggregates");

        TIC("interpolation");
        boost::shared_ptr<Matrix> prolongation = tentative_prolongation<Matrix>(
                nrows,
                aggregates.count,
                aggregates.id,
                prm.nullspace,
                prm.aggr.block_size
                );
        TOC("interpolation");

        // The block size is updated only after the tentative prolongation has
        // consumed the previous value.
        if (prm.nullspace.cols > 0) {
            prm.aggr.block_size = prm.nullspace.cols;
        }

        // Restriction is the transpose of the prolongation.
        boost::shared_ptr<Matrix> restriction = boost::make_shared<Matrix>();
        *restriction = transpose(*prolongation);

        return boost::make_tuple(prolongation, restriction);
    }
Esempio n. 2
0
        /// Builds the multigrid hierarchy for the matrix M.
        ///
        /// Starting from a row-sorted copy of M, the constructor repeatedly
        /// asks Coarsening for transfer operators and a coarse operator, and
        /// stores one level per iteration, until the current matrix has no
        /// more than prm.coarse_enough rows. That last matrix is stored as
        /// the coarsest level.
        ///
        /// \param M System matrix; must be square (checked by precondition()).
        /// \param p Parameters, copied into the prm member.
        amg(const Matrix &M, const params &p = params()) : prm(p)
        {
            precondition(
                    backend::rows(M) == backend::cols(M),
                    "Matrix should be square!"
                    );

            boost::shared_ptr<build_matrix> P, R;
            boost::shared_ptr<build_matrix> A = boost::make_shared<build_matrix>( M );
            sort_rows(*A);

            while( backend::rows(*A) > prm.coarse_enough) {
                TIC("transfer operators");
                boost::tie(P, R) = Coarsening::transfer_operators(
                        *A, prm.coarsening);
                // A prolongation without columns means the coarse level would
                // be empty, so coarsening cannot continue.
                precondition(
                        backend::cols(*P) > 0,
                        "Zero-sized coarse level in amgcl (diagonal matrix?)"
                        );
                TOC("transfer operators");

                // Terminated with ';' like every other TIC/TOC use, so the
                // code does not rely on the macro supplying its own terminator.
                TIC("move to backend");
                levels.push_back( level(A, P, R, prm) );
                TOC("move to backend");

                TIC("coarse operator");
                A = Coarsening::coarse_operator(*A, *P, *R, prm.coarsening);
                sort_rows(*A);
                TOC("coarse operator");
            }

            TIC("coarsest level");
            // The third argument tells level() whether this is the only level
            // (no coarsening step was performed).
            levels.push_back( level(A, prm, levels.empty()) );
            TOC("coarsest level");
        }
Esempio n. 3
0
/// Builds the aggregation-based prolongation P and restriction R for A.
///
/// Each row i of P that belongs to an aggregate (aggr[i] >= 0) gets a single
/// unit entry in column aggr[i]; rows with a negative aggregate id stay empty.
/// R is the transpose of P.
///
/// \param A   System matrix.
/// \param prm Parameters; prm.dof_per_node must be positive, prm.eps_strong
///            is forwarded to the connection/aggregation routines.
/// \returns   The pair (P, R).
static std::pair<
    sparse::matrix<value_t, index_t>,
    sparse::matrix<value_t, index_t>
    >
interp(const sparse::matrix<value_t, index_t> &A, const params &prm) {
    const index_t n = sparse::matrix_rows(A);

    std::vector<index_t> aggr;

    assert(prm.dof_per_node > 0);

    if (prm.dof_per_node == 1) {
        // Scalar system. Nothing fancy.
        TIC("aggregates");
        aggr_type::aggregates(A, aggr::connect(A, prm.eps_strong)).swap(aggr);
        TOC("aggregates");
    } else {
        // Non-scalar system.
        // Build reduced matrix, find connections and aggregates with it,
        // restore the vectors to full size.

        std::pair<std::vector<char>, std::vector<index_t> > S_aggr = aggr::pointwise_coarsening<aggr_type>(
                    A, prm.eps_strong, prm.dof_per_node);
        aggr.swap(S_aggr.second);
    }

    // Number of aggregates is the largest aggregate id plus one (never
    // negative). An empty aggregate vector yields zero instead of
    // dereferencing the end iterator returned by max_element().
    index_t nc = 0;
    if (!aggr.empty()) {
        nc = std::max(
                static_cast<index_t>(0),
                *std::max_element(aggr.begin(), aggr.end()) + static_cast<index_t>(1)
                );
    }

    TIC("interpolation");
    // A fresh pair per call: a function-local static would carry P/R contents
    // over between calls and would be shared between concurrent callers.
    std::pair<
        sparse::matrix<value_t, index_t>,
        sparse::matrix<value_t, index_t>
    > PR;

    sparse::matrix<value_t, index_t> &P = PR.first;
    sparse::matrix<value_t, index_t> &R = PR.second;

    P.resize(n, nc);
    P.col.reserve(n);
    P.val.reserve(n);

    P.row[0] = 0;
    for(index_t i = 0; i < n; ++i) {
        if (aggr[i] >= 0) {
            // Row i interpolates from its own aggregate with weight one.
            P.row[i + 1] = P.row[i] + 1;
            P.col.push_back(aggr[i]);
            P.val.push_back(static_cast<value_t>(1));
        } else {
            // Row without an aggregate: leave it empty.
            P.row[i + 1] = P.row[i];
        }
    }
    TOC("interpolation");

    sparse::transpose(P).swap(R);
    return PR;
}
Esempio n. 4
0
/**
 * Dispatches the exchange computation to the CUDA or the CPU implementation
 * and returns the value produced by that implementation.
 *
 * All arguments are forwarded unchanged; the CUDA path additionally receives
 * isCuda64Enabled(). When CUDA is enabled at run time but the build lacks
 * HAVE_CUDA, the function asserts and (with assertions disabled) returns 0.
 */
double exchange(
	int dim_x, int dim_y, int dim_z,
	double delta_x, double delta_y, double delta_z,
	bool periodic_x, bool periodic_y, bool periodic_z,
	const Matrix &Ms,
	const Matrix &A,
	const VectorMatrix &M,
	VectorMatrix &H)
{
	double result = 0;

	const bool run_on_cpu = !isCudaEnabled();
	if (run_on_cpu) {
		TIC("exchange");
		result = exchange_cpu(
			dim_x, dim_y, dim_z,
			delta_x, delta_y, delta_z,
			periodic_x, periodic_y, periodic_z,
			Ms, A, M, H);
		TOC("exchange");
	} else {
#ifdef HAVE_CUDA
		CUTIC("exchange");
		result = exchange_cuda(
			dim_x, dim_y, dim_z,
			delta_x, delta_y, delta_z,
			periodic_x, periodic_y, periodic_z,
			Ms, A, M, H,
			isCuda64Enabled());
		CUTOC("exchange");
#else
		// CUDA requested at run time but not compiled in.
		assert(0);
#endif
	}

	return result;
}
Esempio n. 5
0
/**
 * Dispatches the cubic anisotropy computation to the CUDA or the CPU
 * implementation and returns the value produced by that implementation.
 *
 * All arguments are forwarded unchanged; the CUDA path additionally receives
 * isCuda64Enabled(). When CUDA is enabled at run time but the build lacks
 * HAVE_CUDA, the function asserts and (with assertions disabled) returns 0.0.
 */
double cubic_anisotropy(
	const VectorMatrix &axis1,
	const VectorMatrix &axis2,
	const       Matrix &k,
	const       Matrix &Ms,
	const VectorMatrix &M,
	VectorMatrix &H)
{
	double total = 0.0;

	const bool run_on_cpu = !isCudaEnabled();
	if (run_on_cpu) {
		TIC("cubic_anisotropy");
		total = cubic_anisotropy_cpu(axis1, axis2, k, Ms, M, H);
		TOC("cubic_anisotropy");
	} else {
#ifdef HAVE_CUDA
		CUTIC("cubic_anisotropy");
		total = cubic_anisotropy_cuda(axis1, axis2, k, Ms, M, H, isCuda64Enabled());
		CUTOC("cubic_anisotropy");
#else
		// CUDA requested at run time but not compiled in.
		assert(0);
#endif
	}

	return total;
}
Esempio n. 6
0
/**
 * Dispatches the minimize step to the CPU, 32-bit CUDA or 64-bit CUDA
 * implementation; all arguments are forwarded unchanged.
 *
 * The 64-bit CUDA variant is used only when the build defines HAVE_CUDA_64
 * and isCuda64Enabled() returns true. When CUDA is enabled at run time but
 * the build lacks HAVE_CUDA, the function asserts.
 */
void minimize(
	const Matrix &f, const double h,
	const VectorMatrix &M,
	const VectorMatrix &H,
	VectorMatrix &M2)
{
	const bool run_on_cpu = !isCudaEnabled();

	if (run_on_cpu) {
		TIC("minimize");
		minimize_cpu(f, h, M, H, M2);
		TOC("minimize");
		return;
	}

#ifdef HAVE_CUDA
	CUTIC("minimize");
#ifdef HAVE_CUDA_64
	if (isCuda64Enabled())
		minimize_cu64(f, h, M, H, M2);
	else
#endif
		minimize_cu32(f, h, M, H, M2);
	CUTOC("minimize");
#else
	// CUDA requested at run time but not compiled in.
	assert(0);
#endif
}
Esempio n. 7
0
void nfst_adjoint( nfst_plan *ths)
{
  /**
   * Select the work buffers for the adjoint: g aliases g1 and g_hat aliases
   * g2. The transform itself runs through ths->my_fftw_r2r_plan.
   */
  ths->g     = ths->g1;
  ths->g_hat = ths->g2;

  /**
   * Step 1 (timer slot 2), nfst_B_T: set
   * \f$ g_l = \sum_{j=0}^{M-1} f_j \psi\left(x_j-\frac{l}{n}\right)
   * \text{ for } l \in I_n,m(x_j) \f$
   */
  TIC(2)
  nfst_B_T( ths);
  TOC(2)

  /**
   * Step 2 (timer slot 1): execute the precomputed FFTW real-to-real plan.
   * The original documentation gives the transform as
   * \f$ \hat g_k = \sum_{l \in I_n} g_l {\rm e}^{-2\pi {\rm i} \frac{kl}{n}}
   * \text{ for }  k \in I_N\f$
   * NOTE(review): the kind of r2r transform is fixed where the plan is
   * created, which is not visible here.
   */
  TIC(1)
  fftw_execute( ths->my_fftw_r2r_plan);
  TOC(1)

  /**
   * Step 3 (timer slot 0), nfst_D_T: form
   * \f$ \hat f_k = \frac{\hat g_k}{c_k\left(\phi\right)} \text{ for }
   * k \in I_N \f$
   */
  TIC(0)
  nfst_D_T( ths);
  TOC(0)

} /* nfst_adjoint */
Esempio n. 8
0
/**
 * user routines
 *
 */
void nfst_trafo( nfst_plan *ths)
{
  /**
   * Select the work buffers for the forward transform: g aliases g2 and
   * g_hat aliases g1. The transform itself runs through
   * ths->my_fftw_r2r_plan.
   */
  ths->g     = ths->g2;
  ths->g_hat = ths->g1;

  /**
   * Step 1 (timer slot 0), nfst_D_A: form
   * \f$ \hat g_k = \frac{\hat f_k}{c_k\left(\phi\right)} \text{ for }
   * k \in I_N \f$
   */
  TIC(0)
  nfst_D_A( ths);
  TOC(0)

  /**
   * Step 2 (timer slot 1): execute the precomputed FFTW real-to-real plan.
   * The original documentation gives the transform as
   * \f$ g_l = \sum_{k \in I_N} \hat g_k {\rm e}^{-2\pi {\rm i} \frac{kl}{n}}
   * \text{ for } l \in I_n \f$
   * NOTE(review): the kind of r2r transform is fixed where the plan is
   * created, which is not visible here.
   */
  TIC(1)
  fftw_execute( ths->my_fftw_r2r_plan);
  TOC(1)

  /**
   * Step 3 (timer slot 2), nfst_B_A: set
   * \f$ f_j = \sum_{l \in I_n,m(x_j)} g_l \psi\left(x_j-\frac{l}{n}\right)
   * \text{ for } j=0,\hdots,M-1 \f$
   */
  TIC(2)
  nfst_B_A( ths);
  TOC(2)

} /* nfst_trafo */
Esempio n. 9
0
/// Derives, for every agent in prevObs, the action that moved it from its
/// position in prevObs to its position in obs.
///
/// Both observations are taken by value because they are un-centred in place
/// before the comparison. `actions` is resized to the number of positions in
/// prevObs; the entry of the prey is set to Action::NUM_ACTIONS when the prey
/// is detected to have moved illegally (i.e. it was captured last step).
void FeatureExtractor::calcObservedActions(Observation prevObs, Observation obs, std::vector<Action::Type> &actions) {
  actions.resize(prevObs.positions.size());

  TIC(historyuncenter);
  prevObs.uncenterPrey(dims);
  obs.uncenterPrey(dims);
  TOC(historyuncenter);

  const bool preyWasCaptured = obs.didPreyMoveIllegally(dims,prevObs.absPrey);

  for (unsigned int agent = 0; agent < prevObs.positions.size(); ++agent) {
    const bool isCapturedPrey = preyWasCaptured && ((int)agent == obs.preyInd);
    if (isCapturedPrey) {
      // No meaningful action can be inferred for a prey captured last step.
      actions[agent] = Action::NUM_ACTIONS;
    } else {
      TIC(historydiff);
      Point2D delta = getDifferenceToPoint(dims,prevObs.positions[agent],obs.positions[agent]);
      TOC(historydiff);
      TIC(historyaction);
      actions[agent] = getAction(delta);
      TOC(historyaction);
    }
  }
}
Esempio n. 10
0
void FeatureExtractor::updateHistory(const Observation &obs, FeatureExtractorHistory &history) {
  std::vector<Action::Type> observedActions;
  if (history.initialized) {
    TIC(historycalc);
    calcObservedActions(history.obs,obs,observedActions);
    TOC(historycalc);
  } else {
    //std::cout << "no hist " << obs << std::endl;
    for (unsigned int i = 0; i < obs.positions.size(); i++) {
      history.actionHistory.push_back(boost::circular_buffer<Action::Type>(HISTORY_SIZE));
      observedActions.push_back(Action::NUM_ACTIONS);
    }
  }
  for (unsigned int agentInd = 0; agentInd < obs.positions.size(); agentInd++) {
    history.actionHistory[agentInd].push_front(observedActions[agentInd]);
  }
  history.initialized = true;
  history.obs = obs;
}
Esempio n. 11
0
/* Limit, stabilize, convert and quantize NLSFs.                                                   */
/*                                                                                                 */
/* Steps performed, in order:                                                                      */
/*   1. derive the quantizer weighting factor NLSF_mu_Q20 from the speech activity,                */
/*   2. compute the NLSF weights (optionally blended with weights of the interpolated NLSFs),      */
/*   3. quantize pNLSF_Q15 in place with silk_NLSF_encode(),                                       */
/*   4. convert the quantized NLSFs to LPC coefficients for the second half (PredCoef_Q12[ 1 ])    */
/*      and either the interpolated NLSFs or a copy of the second half for the first half          */
/*      (PredCoef_Q12[ 0 ]).                                                                       */
void silk_process_NLSFs(
    silk_encoder_state              *psEncC,                                /* I/O  Encoder state                               */
    opus_int16                       PredCoef_Q12[ 2 ][ MAX_LPC_ORDER ],     /* O    Prediction coefficients                     */
    opus_int16                       pNLSF_Q15[         MAX_LPC_ORDER ],     /* I/O  Normalized LSFs (quant out) (0 - (2^15-1))  */
    const opus_int16                 prev_NLSFq_Q15[    MAX_LPC_ORDER ]      /* I    Previous Normalized LSFs (0 - (2^15-1))     */
)
{
    opus_int     i, doInterpolate;
    opus_int     NLSF_mu_Q20;
    opus_int32   i_sqr_Q15;
    opus_int16   pNLSF0_temp_Q15[ MAX_LPC_ORDER ];
    opus_int16   pNLSFW_QW[ MAX_LPC_ORDER ];
    opus_int16   pNLSFW0_temp_QW[ MAX_LPC_ORDER ];

    /* Speech activity is expected to lie in [0, 1.0] in Q8 */
    SKP_assert( psEncC->speech_activity_Q8 >=   0 );
    SKP_assert( psEncC->speech_activity_Q8 <= SILK_FIX_CONST( 1.0, 8 ) );

    /***********************/
    /* Calculate mu values */
    /***********************/
    /* With the constants below: NLSF_mu = 0.0025 - 0.001 * psEncC->speech_activity              */
    /* (assuming SKP_SMLAWB( a, b, c ) computes a + ( ( b * c ) >> 16 ), so Q28 * Q8 >> 16 = Q20) */
    NLSF_mu_Q20 = SKP_SMLAWB( SILK_FIX_CONST( 0.0025, 20 ), SILK_FIX_CONST( -0.001, 28 ), psEncC->speech_activity_Q8 );
    if( psEncC->nb_subfr == 2 ) {
        /* Multiply by 1.5 for 10 ms packets */
        NLSF_mu_Q20 = SKP_ADD_RSHIFT( NLSF_mu_Q20, NLSF_mu_Q20, 1 );
    }

    SKP_assert( NLSF_mu_Q20 >  0 );
    SKP_assert( NLSF_mu_Q20 <= SILK_FIX_CONST( 0.0045, 20 ) );

    /* Calculate NLSF weights */
    silk_NLSF_VQ_weights_laroia( pNLSFW_QW, pNLSF_Q15, psEncC->predictLPCOrder );

    /* Update NLSF weights for interpolated NLSFs */
    /* Interpolation is active only when enabled and the interpolation coefficient is below 4 (Q2) */
    doInterpolate = ( psEncC->useInterpolatedNLSFs == 1 ) && ( psEncC->indices.NLSFInterpCoef_Q2 < 4 );
    if( doInterpolate ) {
        /* Calculate the interpolated NLSF vector for the first half */
        silk_interpolate( pNLSF0_temp_Q15, prev_NLSFq_Q15, pNLSF_Q15, 
            psEncC->indices.NLSFInterpCoef_Q2, psEncC->predictLPCOrder );

        /* Calculate first half NLSF weights for the interpolated NLSFs */
        silk_NLSF_VQ_weights_laroia( pNLSFW0_temp_QW, pNLSF0_temp_Q15, psEncC->predictLPCOrder );

        /* Update NLSF weights with contribution from first half */
        /* Squared interpolation coefficient: Q2 * Q2 = Q4, shifted left by 11 to Q15 */
        i_sqr_Q15 = SKP_LSHIFT( SKP_SMULBB( psEncC->indices.NLSFInterpCoef_Q2, psEncC->indices.NLSFInterpCoef_Q2 ), 11 );
        for( i = 0; i < psEncC->predictLPCOrder; i++ ) {
            /* Half of the second-half weight plus the first-half weight scaled by i_sqr_Q15 */
            pNLSFW_QW[ i ] = SKP_SMLAWB( SKP_RSHIFT( pNLSFW_QW[ i ], 1 ), pNLSFW0_temp_QW[ i ], i_sqr_Q15 );
            SKP_assert( pNLSFW_QW[ i ] <= SKP_int16_MAX );
            SKP_assert( pNLSFW_QW[ i ] >= 1 );
        }
    }

    /* Quantize the NLSFs; pNLSF_Q15 is passed non-const and is documented above as "quant out" */
    TIC(NLSF_encode)
    silk_NLSF_encode( psEncC->indices.NLSFIndices, pNLSF_Q15, psEncC->psNLSF_CB, pNLSFW_QW, 
        NLSF_mu_Q20, psEncC->NLSF_MSVQ_Survivors, psEncC->indices.signalType );
    TOC(NLSF_encode)

    /* Convert quantized NLSFs back to LPC coefficients */
    silk_NLSF2A( PredCoef_Q12[ 1 ], pNLSF_Q15, psEncC->predictLPCOrder );

    if( doInterpolate ) {
        /* Calculate the interpolated, quantized LSF vector for the first half */
        silk_interpolate( pNLSF0_temp_Q15, prev_NLSFq_Q15, pNLSF_Q15, 
            psEncC->indices.NLSFInterpCoef_Q2, psEncC->predictLPCOrder );

        /* Convert back to LPC coefficients */
        silk_NLSF2A( PredCoef_Q12[ 0 ], pNLSF0_temp_Q15, psEncC->predictLPCOrder );

    } else {
        /* Copy LPC coefficients for first half from second half */
        SKP_memcpy( PredCoef_Q12[ 0 ], PredCoef_Q12[ 1 ], psEncC->predictLPCOrder * sizeof( opus_int16 ) );
    }
}
Esempio n. 12
0
    /// Builds the smoothed prolongation P and the restriction R = transpose(P).
    ///
    /// The tentative prolongation P_tent is multiplied row by row with a
    /// smoother assembled from A: the diagonal entry contributes with weight
    /// (1 - prm.relax), every strong off-diagonal entry a_ij with weight
    /// -prm.relax * dia_i^{-1} * a_ij, where dia_i is the diagonal of row i
    /// minus the weak off-diagonal entries of that row. Weak off-diagonal
    /// entries are otherwise skipped.
    ///
    /// \param A   System matrix in CSR form (ptr_data/col_data/val_data).
    /// \param prm Coarsening parameters. Modified by the call:
    ///            prm.aggr.eps_strong is halved after the aggregates are built,
    ///            and prm.aggr.block_size is set to prm.nullspace.cols when
    ///            that value is positive.
    /// \returns   Tuple (P, R).
    static boost::tuple< boost::shared_ptr<Matrix>, boost::shared_ptr<Matrix> >
    transfer_operators(const Matrix &A, params &prm)
    {
        typedef typename backend::value_type<Matrix>::type value_type;
        typedef typename math::scalar_of<value_type>::type scalar_type;

        const size_t n = rows(A);

        // Raw CSR arrays of A.
        BOOST_AUTO(Aptr, A.ptr_data());
        BOOST_AUTO(Acol, A.col_data());
        BOOST_AUTO(Aval, A.val_data());

        TIC("aggregates");
        Aggregates aggr(A, prm.aggr, prm.nullspace.cols);
        // prm is a non-const reference, so the caller sees the halved value.
        prm.aggr.eps_strong *= 0.5;
        TOC("aggregates");

        TIC("interpolation");
        boost::shared_ptr<Matrix> P_tent = tentative_prolongation<Matrix>(
                n, aggr.count, aggr.id, prm.nullspace, prm.aggr.block_size
                );

        boost::shared_ptr<Matrix> P = boost::make_shared<Matrix>();
        P->nrows = rows(*P_tent);
        P->ncols = cols(*P_tent);

        // ptr[i + 1] first receives the number of entries in row i and is
        // turned into row offsets by the partial sum below.
        P->ptr.resize(n + 1, 0);

#pragma omp parallel
        {
            // Per-thread scratch array indexed by column of P.
            std::vector<ptrdiff_t> marker(P->ncols, -1);

            // Each thread handles one contiguous chunk of rows; the same
            // chunk is used in both passes.
#ifdef _OPENMP
            int nt  = omp_get_num_threads();
            int tid = omp_get_thread_num();

            ptrdiff_t chunk_size  = (n + nt - 1) / nt;
            ptrdiff_t chunk_start = tid * chunk_size;
            ptrdiff_t chunk_end   = std::min<ptrdiff_t>(n, chunk_start + chunk_size);
#else
            ptrdiff_t chunk_start = 0;
            ptrdiff_t chunk_end   = n;
#endif

            // Count number of entries in P.
            // In this pass marker[c] holds the last row that touched column c,
            // so every distinct column is counted once per row.
            for(ptrdiff_t i = chunk_start; i < chunk_end; ++i) {
                for(ptrdiff_t ja = Aptr[i], ea = Aptr[i+1]; ja < ea; ++ja) {
                    ptrdiff_t ca = Acol[ja];

                    // Skip weak off-diagonal connections.
                    if (ca != i && !aggr.strong_connection[ja])
                        continue;

                    for(ptrdiff_t jp = P_tent->ptr[ca], ep = P_tent->ptr[ca+1]; jp < ep; ++jp) {
                        ptrdiff_t cp = P_tent->col[jp];

                        if (marker[cp] != i) {
                            marker[cp] = i;
                            ++( P->ptr[i + 1] );
                        }
                    }
                }
            }

            // Reset the scratch array for the second pass.
            boost::fill(marker, -1);

            // All per-row counts must be complete before one thread converts
            // them into row offsets and allocates col/val.
#pragma omp barrier
#pragma omp single
            {
                boost::partial_sum(P->ptr, P->ptr.begin());
                P->col.resize(P->ptr.back());
                P->val.resize(P->ptr.back());
            }

            // Fill the interpolation matrix.
            // In this pass marker[c] holds the position of column c inside
            // P->col; a position below row_beg belongs to an earlier row.
            for(ptrdiff_t i = chunk_start; i < chunk_end; ++i) {

                // Diagonal of the filtered matrix is the original matrix
                // diagonal minus its weak connections.
                value_type dia = math::zero<value_type>();
                for(ptrdiff_t j = Aptr[i], e = Aptr[i+1]; j < e; ++j) {
                    if (Acol[j] == i)
                        dia += Aval[j];
                    else if (!aggr.strong_connection[j])
                        dia -= Aval[j];
                }
                // From here on dia holds the inverse of the filtered diagonal.
                dia = math::inverse(dia);

                ptrdiff_t row_beg = P->ptr[i];
                ptrdiff_t row_end = row_beg;
                for(ptrdiff_t ja = Aptr[i], ea = Aptr[i + 1]; ja < ea; ++ja) {
                    ptrdiff_t ca = Acol[ja];

                    // Skip weak off-diagonal connections.
                    if (ca != i && !aggr.strong_connection[ja]) continue;

                    // Smoother weight: (1 - relax) on the diagonal,
                    // -relax * dia^{-1} * a_ij for strong off-diagonals.
                    value_type va = (ca == i)
                        ? static_cast<value_type>(static_cast<scalar_type>(1 - prm.relax) * math::identity<value_type>())
                        : static_cast<value_type>(static_cast<scalar_type>(-prm.relax) * dia * Aval[ja]);

                    for(ptrdiff_t jp = P_tent->ptr[ca], ep = P_tent->ptr[ca+1]; jp < ep; ++jp) {
                        ptrdiff_t cp = P_tent->col[jp];
                        value_type vp = P_tent->val[jp];

                        if (marker[cp] < row_beg) {
                            // First contribution to column cp in this row:
                            // append a new entry.
                            marker[cp] = row_end;
                            P->col[row_end] = cp;
                            P->val[row_end] = va * vp;
                            ++row_end;
                        } else {
                            // Column already present in this row: accumulate.
                            P->val[ marker[cp] ] += va * vp;
                        }
                    }
                }
            }
        }
        TOC("interpolation");

        boost::shared_ptr<Matrix> R = boost::make_shared<Matrix>();
        *R = transpose(*P);

        // Updated only after tentative_prolongation() used the old value.
        if (prm.nullspace.cols > 0)
            prm.aggr.block_size = prm.nullspace.cols;

        return boost::make_tuple(P, R);
    }
/*
 * Finds the prediction coefficients for the current frame.
 *
 * Steps performed, in order:
 *   1. invert and normalize the subframe gains and derive per-subframe
 *      weights (Wght_Q15) and local gains (local_gains_Qx);
 *   2. voiced frames: LTP analysis, LTP gain quantization, LTP scale control
 *      and creation of the LTP residual in LPC_in_pre;
 *      other frames: copy the gain-scaled input into LPC_in_pre and zero the
 *      LTP coefficients and LTP coding gain;
 *   3. find the LPC (as NLSFs) from LPC_in_pre and quantize the NLSFs;
 *   4. compute the residual energy with the quantized LPC coefficients;
 *   5. store the NLSFs in psEnc->sPred.prev_NLSFq_Q15 for the next frame.
 */
void SKP_Silk_find_pred_coefs_FIX(SKP_Silk_encoder_state_FIX * psEnc,	/* I/O  encoder state                               */
				  SKP_Silk_encoder_control_FIX * psEncCtrl,	/* I/O  encoder control                             */
				  const int16_t res_pitch[]	/* I    Residual from pitch analysis                */
    )
{
	int i;
	int32_t WLTP[NB_SUBFR * LTP_ORDER * LTP_ORDER];
	int32_t invGains_Q16[NB_SUBFR], local_gains_Qx[NB_SUBFR],
	    Wght_Q15[NB_SUBFR];
	int NLSF_Q15[MAX_LPC_ORDER];
	const int16_t *x_ptr;
	/* LPC_in_pre holds, per subframe, predictLPCOrder prepended samples plus the subframe itself */
	int16_t *x_pre_ptr,
	    LPC_in_pre[NB_SUBFR * MAX_LPC_ORDER + MAX_FRAME_LENGTH];

	int32_t tmp, min_gain_Q16;
#if !VARQ
	int LZ;
#endif
	int LTP_corrs_rshift[NB_SUBFR];

	/* weighting for weighted least squares */
	/* min_gain_Q16 becomes the smallest subframe gain, capped at int32_t_MAX >> 6 */
	min_gain_Q16 = int32_t_MAX >> 6;
	for (i = 0; i < NB_SUBFR; i++) {
		min_gain_Q16 = SKP_min(min_gain_Q16, psEncCtrl->Gains_Q16[i]);
	}
#if !VARQ
	/* LZ (limited to 0..16) is the shift applied to numerator and denominator of the division below */
	LZ = SKP_Silk_CLZ32(min_gain_Q16) - 1;
	LZ = SKP_LIMIT(LZ, 0, 16);
	min_gain_Q16 = SKP_RSHIFT(min_gain_Q16, 2);	/* Ensure that maximum invGains_Q16 is within range of a 16 bit int */
#endif
	for (i = 0; i < NB_SUBFR; i++) {
		/* Divide to Q16 */
		assert(psEncCtrl->Gains_Q16[i] > 0);
#if VARQ
		/* Invert and normalize gains, and ensure that maximum invGains_Q16 is within range of a 16 bit int */
		invGains_Q16[i] =
		    SKP_DIV32_varQ(min_gain_Q16, psEncCtrl->Gains_Q16[i],
				   16 - 2);
#else
		invGains_Q16[i] =
		    SKP_DIV32(SKP_LSHIFT(min_gain_Q16, LZ),
			      SKP_RSHIFT(psEncCtrl->Gains_Q16[i], 16 - LZ));
#endif

		/* Ensure Wght_Q15 a minimum value 1 */
		invGains_Q16[i] = SKP_max(invGains_Q16[i], 363);

		/* Square the inverted gains */
		assert(invGains_Q16[i] == SKP_SAT16(invGains_Q16[i]));
		tmp = SKP_SMULWB(invGains_Q16[i], invGains_Q16[i]);
		Wght_Q15[i] = SKP_RSHIFT(tmp, 1);

		/* Invert the inverted and normalized gains */
		local_gains_Qx[i] =
		    SKP_DIV32((1 << (16 + Qx)), invGains_Q16[i]);
	}

	if (psEncCtrl->sCmn.sigtype == SIG_TYPE_VOICED) {
		/**********/
		/* VOICED */
		/**********/
		assert(psEnc->sCmn.frame_length -
			   psEnc->sCmn.predictLPCOrder >=
			   psEncCtrl->sCmn.pitchL[0] + LTP_ORDER / 2);

		/* LTP analysis */
		SKP_Silk_find_LTP_FIX(psEncCtrl->LTPCoef_Q14, WLTP,
				      &psEncCtrl->LTPredCodGain_Q7, res_pitch,
				      res_pitch +
				      SKP_RSHIFT(psEnc->sCmn.frame_length, 1),
				      psEncCtrl->sCmn.pitchL, Wght_Q15,
				      psEnc->sCmn.subfr_length,
				      psEnc->sCmn.frame_length,
				      LTP_corrs_rshift);

		/* Quantize LTP gain parameters */
		SKP_Silk_quant_LTP_gains_FIX(psEncCtrl->LTPCoef_Q14,
					     psEncCtrl->sCmn.LTPIndex,
					     &psEncCtrl->sCmn.PERIndex, WLTP,
					     psEnc->mu_LTP_Q8,
					     psEnc->sCmn.LTPQuantLowComplexity);

		/* Control LTP scaling */
		SKP_Silk_LTP_scale_ctrl_FIX(psEnc, psEncCtrl);

		/* Create LTP residual */
		SKP_Silk_LTP_analysis_filter_FIX(LPC_in_pre,
						 psEnc->x_buf +
						 psEnc->sCmn.frame_length -
						 psEnc->sCmn.predictLPCOrder,
						 psEncCtrl->LTPCoef_Q14,
						 psEncCtrl->sCmn.pitchL,
						 invGains_Q16, 16,
						 psEnc->sCmn.subfr_length,
						 psEnc->sCmn.predictLPCOrder);

	} else {
		/************/
		/* UNVOICED */
		/************/
		/* Create signal with prepended subframes, scaled by inverse gains */
		x_ptr =
		    psEnc->x_buf + psEnc->sCmn.frame_length -
		    psEnc->sCmn.predictLPCOrder;
		x_pre_ptr = LPC_in_pre;
		for (i = 0; i < NB_SUBFR; i++) {
			SKP_Silk_scale_copy_vector16(x_pre_ptr, x_ptr,
						     invGains_Q16[i],
						     psEnc->sCmn.subfr_length +
						     psEnc->sCmn.
						     predictLPCOrder);
			/* Output advances by subframe + LPC order, input only by one subframe */
			x_pre_ptr +=
			    psEnc->sCmn.subfr_length +
			    psEnc->sCmn.predictLPCOrder;
			x_ptr += psEnc->sCmn.subfr_length;
		}

		/* No LTP for this signal type: zero the coefficients and the coding gain */
		SKP_memset(psEncCtrl->LTPCoef_Q14, 0,
			   NB_SUBFR * LTP_ORDER * sizeof(int16_t));
		psEncCtrl->LTPredCodGain_Q7 = 0;
	}

	/* LPC_in_pre contains the LTP-filtered input for voiced, and the unfiltered input for unvoiced */
	/* NLSF interpolation is requested only when enabled and this is not the first frame after a reset */
	TIC(FIND_LPC)
	    SKP_Silk_find_LPC_FIX(NLSF_Q15, &psEncCtrl->sCmn.NLSFInterpCoef_Q2,
				  psEnc->sPred.prev_NLSFq_Q15,
				  psEnc->sCmn.useInterpolatedNLSFs * (1 -
								      psEnc->
								      sCmn.
								      first_frame_after_reset),
				  psEnc->sCmn.predictLPCOrder, LPC_in_pre,
				  psEnc->sCmn.subfr_length +
				  psEnc->sCmn.predictLPCOrder);
	TOC(FIND_LPC)

	    /* Quantize LSFs */
	    TIC(PROCESS_LSFS)
	    SKP_Silk_process_NLSFs_FIX(psEnc, psEncCtrl, NLSF_Q15);
	TOC(PROCESS_LSFS)

	    /* Calculate residual energy using quantized LPC coefficients */
	    SKP_Silk_residual_energy_FIX(psEncCtrl->ResNrg, psEncCtrl->ResNrgQ,
					 LPC_in_pre, (const int16_t(*)[])psEncCtrl->PredCoef_Q12,
					 local_gains_Qx, Qx,
					 psEnc->sCmn.subfr_length,
					 psEnc->sCmn.predictLPCOrder);

	/* Copy to prediction struct for use in next frame for fluctuation reduction */
	/* NLSF_Q15 is an int array, hence sizeof(int) per element */
	SKP_memcpy(psEnc->sPred.prev_NLSFq_Q15, NLSF_Q15,
		   psEnc->sCmn.predictLPCOrder * sizeof(int));

}
Esempio n. 14
0
int_t pdgstrf
/************************************************************************/
(
 superlu_options_t *options, int m, int n, double anorm,
 LUstruct_t *LUstruct, gridinfo_t *grid, SuperLUStat_t *stat, int *info
 )
/* 
 * Purpose
 * =======
 *
 *  PDGSTRF performs the LU factorization in parallel.
 *
 * Arguments
 * =========
 * 
 * options (input) superlu_options_t*
 *         The structure defines the input parameters to control
 *         how the LU decomposition will be performed.
 *         The following field should be defined:
 *         o ReplaceTinyPivot (yes_no_t)
 *           Specifies whether to replace the tiny diagonals by
 *           sqrt(epsilon)*norm(A) during LU factorization.
 *
 * m      (input) int
 *        Number of rows in the matrix.
 *
 * n      (input) int
 *        Number of columns in the matrix.
 *
 * anorm  (input) double
 *        The norm of the original matrix A, or the scaled A if
 *        equilibration was done.
 *
 * LUstruct (input/output) LUstruct_t*
 *         The data structures to store the distributed L and U factors.
 *         The following fields should be defined:
 *
 *         o Glu_persist (input) Glu_persist_t*
 *           Global data structure (xsup, supno) replicated on all processes,
 *           describing the supernode partition in the factored matrices
 *           L and U:
 *	       xsup[s] is the leading column of the s-th supernode,
 *             supno[i] is the supernode number to which column i belongs.
 *
 *         o Llu (input/output) LocalLU_t*
 *           The distributed data structures to store L and U factors.
 *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
 *
 * grid   (input) gridinfo_t*
 *        The 2D process mesh. It contains the MPI communicator, the number
 *        of process rows (NPROW), the number of process columns (NPCOL),
 *        and my process rank. It is an input argument to all the
 *        parallel routines.
 *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
 *        See superlu_ddefs.h for the definition of 'gridinfo_t'.
 *
 * stat   (output) SuperLUStat_t*
 *        Record the statistics on runtime and floating-point operation count.
 *        See util.h for the definition of 'SuperLUStat_t'.
 *
 * info   (output) int*
 *        = 0: successful exit
 *        < 0: if info = -i, the i-th argument had an illegal value
 *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
 *             been completed, but the factor U is exactly singular,
 *             and division by zero will occur if it is used to solve a
 *             system of equations.
 *
 */
{
#ifdef _CRAY
    _fcd ftcs = _cptofcd("N", strlen("N"));
    _fcd ftcs1 = _cptofcd("L", strlen("L"));
    _fcd ftcs2 = _cptofcd("N", strlen("N"));
    _fcd ftcs3 = _cptofcd("U", strlen("U"));
#endif
    double alpha = 1.0, beta = 0.0;
    int_t *xsup;
    int_t *lsub, *lsub1, *usub, *Usub_buf,
          *Lsub_buf_2[2];  /* Need 2 buffers to implement Irecv. */
    double *lusup, *lusup1, *uval, *Uval_buf,
           *Lval_buf_2[2]; /* Need 2 buffers to implement Irecv. */
    int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc,
          lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj,
          nlb, nub, nsupc, rel, rukp;
    int_t Pc, Pr;
    int   iam, kcol, krow, mycol, myrow, pi, pj;
    int   j, k, lk, nsupers;
    int   nsupr, nbrow, segsize;
    int   msgcnt[4]; /* Count the size of the message xfer'd in each buffer:
		      *     0 : transferred in Lsub_buf[]
		      *     1 : transferred in Lval_buf[]
		      *     2 : transferred in Usub_buf[] 
		      *     3 : transferred in Uval_buf[]
		      */
    int_t  msg0, msg2;
    int_t  **Ufstnz_br_ptr, **Lrowind_bc_ptr;
    double **Unzval_br_ptr, **Lnzval_bc_ptr;
    int_t  *index;
    double *nzval;
    int_t  *iuip, *ruip;/* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */
    double *ucol;
    int_t  *indirect;
    double *tempv, *tempv2d;
    int_t iinfo;
    int_t *ToRecv, *ToSendD, **ToSendR;
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    LocalLU_t *Llu = LUstruct->Llu;
    superlu_scope_t *scp;
    float s_eps;
    double thresh;
    double *tempU2d, *tempu;
    int    full, ldt, ldu, lead_zero, ncols;
    MPI_Request recv_req[4], *send_req, *U_diag_blk_send_req = NULL;
    MPI_Status status;
#if ( DEBUGlevel>=2 ) 
    int_t num_copy=0, num_update=0;
#endif
#if ( PRNTlevel==3 )
    int_t  zero_msg = 0, total_msg = 0;
#endif
#if ( PROFlevel>=1 )
    double t1, t2;
    float msg_vol = 0, msg_cnt = 0;
    int_t iword = sizeof(int_t), dword = sizeof(double);
#endif

    /* Test the input parameters. */
    *info = 0;
    if ( m < 0 ) *info = -2;
    else if ( n < 0 ) *info = -3;
    if ( *info ) {
	pxerbla("pdgstrf", grid, -*info);
	return (-1);
    }

    /* Quick return if possible. */
    if ( m == 0 || n == 0 ) return 0;

    /*
     * Initialization.
     */
    iam = grid->iam;
    Pc = grid->npcol;
    Pr = grid->nprow;
    myrow = MYROW( iam, grid );
    mycol = MYCOL( iam, grid );
    nsupers = Glu_persist->supno[n-1] + 1;
    xsup = Glu_persist->xsup;
    s_eps = slamch_("Epsilon");
    thresh = s_eps * anorm;

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter pdgstrf()");
#endif

    stat->ops[FACT] = 0.0;

    if ( Pr*Pc > 1 ) {
	i = Llu->bufmax[0];
	if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist(2 * ((size_t)i))) )
	    ABORT("Malloc fails for Lsub_buf.");
	Llu->Lsub_buf_2[1] = Llu->Lsub_buf_2[0] + i;
	i = Llu->bufmax[1];
	if ( !(Llu->Lval_buf_2[0] = doubleMalloc_dist(2 * ((size_t)i))) )
	    ABORT("Malloc fails for Lval_buf[].");
	Llu->Lval_buf_2[1] = Llu->Lval_buf_2[0] + i;
	if ( Llu->bufmax[2] != 0 ) 
	    if ( !(Llu->Usub_buf = intMalloc_dist(Llu->bufmax[2])) )
		ABORT("Malloc fails for Usub_buf[].");
	if ( Llu->bufmax[3] != 0 ) 
	    if ( !(Llu->Uval_buf = doubleMalloc_dist(Llu->bufmax[3])) )
		ABORT("Malloc fails for Uval_buf[].");
	if ( !(U_diag_blk_send_req =
	       (MPI_Request *) SUPERLU_MALLOC(Pr*sizeof(MPI_Request))))
	    ABORT("Malloc fails for U_diag_blk_send_req[].");
        U_diag_blk_send_req[myrow] = 0; /* flag no outstanding Isend */
	if ( !(send_req =
	       (MPI_Request *) SUPERLU_MALLOC(2*Pc*sizeof(MPI_Request))))
	    ABORT("Malloc fails for send_req[].");
    }
    k = sp_ienv_dist(3); /* max supernode size */
    if ( !(Llu->ujrow = doubleMalloc_dist(k*(k+1)/2)) )
	ABORT("Malloc fails for ujrow[].");

#if ( PRNTlevel>=1 )
    if ( !iam ) {
	printf(".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, thresh);
	printf(".. Buffer size: Lsub %d\tLval %d\tUsub %d\tUval %d\tLDA %d\n",
	       Llu->bufmax[0], Llu->bufmax[1], 
	       Llu->bufmax[2], Llu->bufmax[3], Llu->bufmax[4]);
    }
#endif

    Lsub_buf_2[0] = Llu->Lsub_buf_2[0];
    Lsub_buf_2[1] = Llu->Lsub_buf_2[1];
    Lval_buf_2[0] = Llu->Lval_buf_2[0];
    Lval_buf_2[1] = Llu->Lval_buf_2[1];
    Usub_buf = Llu->Usub_buf;
    Uval_buf = Llu->Uval_buf;
    Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
    Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
    Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
    Unzval_br_ptr = Llu->Unzval_br_ptr;
    ToRecv = Llu->ToRecv;
    ToSendD = Llu->ToSendD;
    ToSendR = Llu->ToSendR;

    ldt = sp_ienv_dist(3); /* Size of maximum supernode */
    if ( !(tempv2d = doubleCalloc_dist(2*((size_t)ldt)*ldt)) )
	ABORT("Calloc fails for tempv2d[].");
    tempU2d = tempv2d + ldt*ldt;
    if ( !(indirect = intMalloc_dist(ldt)) )
	ABORT("Malloc fails for indirect[].");
    k = CEILING( nsupers, Pr ); /* Number of local block rows */
    if ( !(iuip = intMalloc_dist(k)) )
	ABORT("Malloc fails for iuip[].");
    if ( !(ruip = intMalloc_dist(k)) )
	ABORT("Malloc fails for ruip[].");

#if ( VAMPIR>=1 )
    VT_symdef(1, "Send-L", "Comm");
    VT_symdef(2, "Recv-L", "Comm");
    VT_symdef(3, "Send-U", "Comm");
    VT_symdef(4, "Recv-U", "Comm");
    VT_symdef(5, "TRF2", "Factor");
    VT_symdef(100, "Factor", "Factor");
    VT_begin(100);
    VT_traceon();
#endif

    /* ---------------------------------------------------------------
       Handle the first block column separately to start the pipeline.
       --------------------------------------------------------------- */
    if ( mycol == 0 ) {

#if ( VAMPIR>=1 )
	VT_begin(5);
#endif
	pdgstrf2(options, 0, thresh, Glu_persist, grid, Llu, 
		 U_diag_blk_send_req, stat, info);

#if ( VAMPIR>=1 )
	VT_end(5);
#endif

	scp = &grid->rscp; /* The scope of process row. */

	/* Process column *kcol* multicasts numeric values of L(:,k) 
	   to process rows. */
	lsub = Lrowind_bc_ptr[0];
	lusup = Lnzval_bc_ptr[0];
	if ( lsub ) {
	    msgcnt[0] = lsub[1] + BC_HEADER + lsub[0]*LB_DESCRIPTOR;
	    msgcnt[1] = lsub[1] * SuperSize( 0 );
	} else {
	    msgcnt[0] = msgcnt[1] = 0;
	}
	
	for (pj = 0; pj < Pc; ++pj) {
	    if ( ToSendR[0][pj] != EMPTY ) {
#if ( PROFlevel>=1 )
		TIC(t1);
#endif
#if ( VAMPIR>=1 )
		VT_begin(1);
#endif
		MPI_Isend( lsub, msgcnt[0], mpi_int_t, pj, 0, scp->comm,
			  &send_req[pj] );
		MPI_Isend( lusup, msgcnt[1], MPI_DOUBLE, pj, 1, scp->comm,
			  &send_req[pj+Pc] );
#if ( DEBUGlevel>=2 )
		printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
		       iam, 0, msgcnt[0], msgcnt[1], pj);
#endif
#if ( VAMPIR>=1 )
		VT_end(1);
#endif
#if ( PROFlevel>=1 )
		TOC(t2, t1);
		stat->utime[COMM] += t2;
		msg_cnt += 2;
		msg_vol += msgcnt[0]*iword + msgcnt[1]*dword;
#endif
	    }
	} /* for pj ... */
    } else { /* Post immediate receives. */
	if ( ToRecv[0] >= 1 ) { /* Recv block column L(:,0). */
	    scp = &grid->rscp; /* The scope of process row. */
	    MPI_Irecv( Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, 0,
		      0, scp->comm, &recv_req[0] );
	    MPI_Irecv( Lval_buf_2[0], Llu->bufmax[1], MPI_DOUBLE, 0,
		      1, scp->comm, &recv_req[1] );
#if ( DEBUGlevel>=2 )
	    printf("(%d) Post Irecv L(:,%4d)\n", iam, 0);
#endif
	}
    } /* if mycol == 0 */

    /* ------------------------------------------
       MAIN LOOP: Loop through all block columns.
       ------------------------------------------ */
    for (k = 0; k < nsupers; ++k) {

	knsupc = SuperSize( k );
	krow = PROW( k, grid );
	kcol = PCOL( k, grid );

	if ( mycol == kcol ) {
	    lk = LBj( k, grid ); /* Local block number. */

	    for (pj = 0; pj < Pc; ++pj) {
                /* Wait for Isend to complete before using lsub/lusup. */
		if ( ToSendR[lk][pj] != EMPTY ) {
		    MPI_Wait( &send_req[pj], &status );
		    MPI_Wait( &send_req[pj+Pc], &status );
		}
	    }
	    lsub = Lrowind_bc_ptr[lk];
	    lusup = Lnzval_bc_ptr[lk];
	} else {
	    if ( ToRecv[k] >= 1 ) { /* Recv block column L(:,k). */
		scp = &grid->rscp; /* The scope of process row. */
#if ( PROFlevel>=1 )
		TIC(t1);
#endif
#if ( VAMPIR>=1 )
		VT_begin(2);
#endif
		/*probe_recv(iam, kcol, (4*k)%NTAGS, mpi_int_t, scp->comm, 
		  Llu->bufmax[0]);*/
		/*MPI_Recv( Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol, 
			 (4*k)%NTAGS, scp->comm, &status );*/
		MPI_Wait( &recv_req[0], &status );
		MPI_Get_count( &status, mpi_int_t, &msgcnt[0] );
		/*probe_recv(iam, kcol, (4*k+1)%NTAGS, MPI_DOUBLE, scp->comm, 
		  Llu->bufmax[1]);*/
		/*MPI_Recv( Lval_buf, Llu->bufmax[1], MPI_DOUBLE, kcol, 
			 (4*k+1)%NTAGS, scp->comm, &status );*/
		MPI_Wait( &recv_req[1], &status );
		MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[1] );
#if ( VAMPIR>=1 )
		VT_end(2);
#endif
#if ( PROFlevel>=1 )
		TOC(t2, t1);
		stat->utime[COMM] += t2;
#endif
#if ( DEBUGlevel>=2 )
		printf("(%d) Recv L(:,%4d): lsub %4d, lusup %4d from Pc %2d\n",
		       iam, k, msgcnt[0], msgcnt[1], kcol);
		fflush(stdout);
#endif
		lsub = Lsub_buf_2[k%2];
		lusup = Lval_buf_2[k%2];
#if ( PRNTlevel==3 )
		++total_msg;
		if ( !msgcnt[0] ) ++zero_msg;
#endif
	    } else msgcnt[0] = 0;
	} /* if mycol = Pc(k) */

	scp = &grid->cscp; /* The scope of process column. */

	if ( myrow == krow ) {
	    /* Parallel triangular solve across process row *krow* --
	       U(k,j) = L(k,k) \ A(k,j).  */
#ifdef _CRAY
	    pdgstrs2(n, k, Glu_persist, grid, Llu, stat, ftcs1, ftcs2, ftcs3);
#else
	    pdgstrs2(n, k, Glu_persist, grid, Llu, stat);
#endif

	    /* Multicasts U(k,:) to process columns. */
	    lk = LBi( k, grid );
	    usub = Ufstnz_br_ptr[lk];
	    uval = Unzval_br_ptr[lk];
	    if ( usub )	{
		msgcnt[2] = usub[2];
		msgcnt[3] = usub[1];
	    } else {
		msgcnt[2] = msgcnt[3] = 0;
	    }

	    if ( ToSendD[lk] == YES ) {
		for (pi = 0; pi < Pr; ++pi) {
		    if ( pi != myrow ) {
#if ( PROFlevel>=1 )
			TIC(t1);
#endif
#if ( VAMPIR>=1 )
			VT_begin(3);
#endif
			MPI_Send( usub, msgcnt[2], mpi_int_t, pi,
				 (4*k+2)%NTAGS, scp->comm);
			MPI_Send( uval, msgcnt[3], MPI_DOUBLE, pi,
				 (4*k+3)%NTAGS, scp->comm);
#if ( VAMPIR>=1 )
			VT_end(3);
#endif
#if ( PROFlevel>=1 )
			TOC(t2, t1);
			stat->utime[COMM] += t2;
			msg_cnt += 2;
			msg_vol += msgcnt[2]*iword + msgcnt[3]*dword;
#endif
#if ( DEBUGlevel>=2 )
			printf("(%d) Send U(%4d,:) to Pr %2d\n", iam, k, pi);
#endif
		    } /* if pi ... */
		} /* for pi ... */
	    } /* if ToSendD ... */
	} else { /* myrow != krow */
	    if ( ToRecv[k] == 2 ) { /* Recv block row U(k,:). */
#if ( PROFlevel>=1 )
		TIC(t1);
#endif
#if ( VAMPIR>=1 )
		VT_begin(4);
#endif
		/*probe_recv(iam, krow, (4*k+2)%NTAGS, mpi_int_t, scp->comm, 
		  Llu->bufmax[2]);*/
		MPI_Recv( Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
			 (4*k+2)%NTAGS, scp->comm, &status );
		MPI_Get_count( &status, mpi_int_t, &msgcnt[2] );
		/*probe_recv(iam, krow, (4*k+3)%NTAGS, MPI_DOUBLE, scp->comm, 
		  Llu->bufmax[3]);*/
		MPI_Recv( Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow, 
			 (4*k+3)%NTAGS, scp->comm, &status );
		MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[3] );
#if ( VAMPIR>=1 )
		VT_end(4);
#endif
#if ( PROFlevel>=1 )
		TOC(t2, t1);
		stat->utime[COMM] += t2;
#endif
		usub = Usub_buf;
		uval = Uval_buf;
#if ( DEBUGlevel>=2 )
		printf("(%d) Recv U(%4d,:) from Pr %2d\n", iam, k, krow);
#endif
#if ( PRNTlevel==3 )
		++total_msg;
		if ( !msgcnt[2] ) ++zero_msg;
#endif
	    } else msgcnt[2] = 0;
	} /* if myrow == Pr(k) */
	  
	/* 
	 * Parallel rank-k update; pair up blocks L(i,k) and U(k,j).
	 *  for (j = k+1; k < N; ++k) {
	 *     for (i = k+1; i < N; ++i) 
	 *         if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
	 *              && L(i,k) != 0 && U(k,j) != 0 )
	 *             A(i,j) = A(i,j) - L(i,k) * U(k,j);
	 */
	msg0 = msgcnt[0];
	msg2 = msgcnt[2];
	if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
	    nsupr = lsub[1]; /* LDA of lusup. */
	    if ( myrow == krow ) { /* Skip diagonal block L(k,k). */
		lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER+1];
		luptr0 = knsupc;
		nlb = lsub[0] - 1;
	    } else {
		lptr0 = BC_HEADER;
		luptr0 = 0;
		nlb = lsub[0];
	    }
	    lptr = lptr0;
	    for (lb = 0; lb < nlb; ++lb) { /* Initialize block row pointers. */
		ib = lsub[lptr];
		lib = LBi( ib, grid );
		iuip[lib] = BR_HEADER;
		ruip[lib] = 0;
		lptr += LB_DESCRIPTOR + lsub[lptr+1];
	    }
	    nub = usub[0];    /* Number of blocks in the block row U(k,:) */
	    iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */
	    rukp = 0;         /* Pointer to nzval[] of U(k,:) */
	    klst = FstBlockC( k+1 );
	    
	    /* ---------------------------------------------------
	       Update the first block column A(:,k+1).
	       --------------------------------------------------- */
	    jb = usub[iukp];   /* Global block number of block U(k,j). */
	    if ( jb == k+1 ) { /* First update (k+1)-th block. */
		--nub;
		lptr = lptr0;
		luptr = luptr0;
		ljb = LBj( jb, grid ); /* Local block number of U(k,j). */
		nsupc = SuperSize( jb );
		iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */

		/* Prepare to call DGEMM. */
		jj = iukp;
		while ( usub[jj] == klst ) ++jj;
		ldu = klst - usub[jj++];
		ncols = 1;
		full = 1;
		for (; jj < iukp+nsupc; ++jj) {
		    segsize = klst - usub[jj];
		    if ( segsize ) {
		        ++ncols;
			if ( segsize != ldu ) full = 0;
		        if ( segsize > ldu ) ldu = segsize;
		    }
		}
#if ( DEBUGlevel>=3 )
		++num_update;
#endif
		if ( full ) {
		    tempu = &uval[rukp];
		} else { /* Copy block U(k,j) into tempU2d. */
#if ( DEBUGlevel>=3 )
		  printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
			 iam, full, k, jb, ldu, ncols, nsupc);
		  ++num_copy;
#endif
		    tempu = tempU2d;
		    for (jj = iukp; jj < iukp+nsupc; ++jj) {
		        segsize = klst - usub[jj];
			if ( segsize ) {
			    lead_zero = ldu - segsize;
			    for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0;
			    tempu += lead_zero;
			    for (i = 0; i < segsize; ++i)
				tempu[i] = uval[rukp+i];
			    rukp += segsize;
			    tempu += segsize;
			}
		    }
		    tempu = tempU2d;
		    rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
		} /* if full ... */

		for (lb = 0; lb < nlb; ++lb) { 
		    ib = lsub[lptr]; /* Row block L(i,k). */
		    nbrow = lsub[lptr+1];  /* Number of full rows. */
		    lptr += LB_DESCRIPTOR; /* Skip descriptor. */
		    tempv = tempv2d;
#ifdef _CRAY
		    SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, 
			  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
			  tempu, &ldu, &beta, tempv, &ldt);
#elif defined (USE_VENDOR_BLAS)
		    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, 
			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
			   tempu, &ldu, &beta, tempv, &ldt, 1, 1);
#else
		    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, 
			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
			   tempu, &ldu, &beta, tempv, &ldt);
#endif
		    stat->ops[FACT] += 2 * nbrow * ldu * ncols;

		    /* Now gather the result into the destination block. */
		    if ( ib < jb ) { /* A(i,j) is in U. */
			ilst = FstBlockC( ib+1 );
			lib = LBi( ib, grid );
			index = Ufstnz_br_ptr[lib];
			ijb = index[iuip[lib]];
			while ( ijb < jb ) { /* Search for dest block. */
			    ruip[lib] += index[iuip[lib]+1];
			    iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb );
			    ijb = index[iuip[lib]];
			}
			iuip[lib] += UB_DESCRIPTOR; /* Skip descriptor. */

			tempv = tempv2d;
			for (jj = 0; jj < nsupc; ++jj) {
			    segsize = klst - usub[iukp + jj];
			    fnz = index[iuip[lib]++];
			    if ( segsize ) { /* Nonzero segment in U(k.j). */
				ucol = &Unzval_br_ptr[lib][ruip[lib]];
				for (i = 0, it = 0; i < nbrow; ++i) {
				    rel = lsub[lptr + i] - fnz;
				    ucol[rel] -= tempv[it++];
				}
				tempv += ldt;
			    }
			    ruip[lib] += ilst - fnz;
			}
		    } else { /* A(i,j) is in L. */
			index = Lrowind_bc_ptr[ljb];
			ldv = index[1];   /* LDA of the dest lusup. */
			lptrj = BC_HEADER;
			luptrj = 0;
			ijb = index[lptrj];
			while ( ijb != ib ) { /* Search for dest block -- 
						 blocks are not ordered! */
			    luptrj += index[lptrj+1];
			    lptrj += LB_DESCRIPTOR + index[lptrj+1];
			    ijb = index[lptrj];
			}
			/*
			 * Build indirect table. This is needed because the
			 * indices are not sorted.
			 */
			fnz = FstBlockC( ib );
			lptrj += LB_DESCRIPTOR;
			for (i = 0; i < index[lptrj-1]; ++i) {
			    rel = index[lptrj + i] - fnz;
			    indirect[rel] = i;
			}
			nzval = Lnzval_bc_ptr[ljb] + luptrj;
			tempv = tempv2d;
			for (jj = 0; jj < nsupc; ++jj) {
			    segsize = klst - usub[iukp + jj];
			    if ( segsize ) {
/*#pragma _CRI cache_bypass nzval,tempv*/
				for (it = 0, i = 0; i < nbrow; ++i) {
				    rel = lsub[lptr + i] - fnz;
				    nzval[indirect[rel]] -= tempv[it++];
				}
				tempv += ldt;
			    }
			    nzval += ldv;
			}
		    } /* if ib < jb ... */
		    lptr += nbrow;
		    luptr += nbrow;
		} /* for lb ... */
		rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */
		iukp += nsupc;
	    }  /* if jb == k+1 */
	} /* if L(:,k) and U(k,:) not empty */


	if ( k+1 < nsupers ) {
	  kcol = PCOL( k+1, grid );
	  if ( mycol == kcol ) {
#if ( VAMPIR>=1 )
	    VT_begin(5);
#endif
	    /* Factor diagonal and subdiagonal blocks and test for exact
	       singularity.  */
	    pdgstrf2(options, k+1, thresh, Glu_persist, grid, Llu,
		     U_diag_blk_send_req, stat, info);

#if ( VAMPIR>=1 )
	    VT_end(5);
#endif

	    /* Process column *kcol+1* multicasts numeric values of L(:,k+1) 
	       to process rows. */
	    lk = LBj( k+1, grid ); /* Local block number. */
	    lsub1 = Lrowind_bc_ptr[lk];
 	    if ( lsub1 ) {
		msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0]*LB_DESCRIPTOR;
		msgcnt[1] = lsub1[1] * SuperSize( k+1 );
	    } else {
		msgcnt[0] = 0;
		msgcnt[1] = 0;
	    }
	    scp = &grid->rscp; /* The scope of process row. */
	    for (pj = 0; pj < Pc; ++pj) {
		if ( ToSendR[lk][pj] != EMPTY ) {
		    lusup1 = Lnzval_bc_ptr[lk];
#if ( PROFlevel>=1 )
		    TIC(t1);
#endif
#if ( VAMPIR>=1 )
		    VT_begin(1);
#endif
		    MPI_Isend( lsub1, msgcnt[0], mpi_int_t, pj,
			      (4*(k+1))%NTAGS, scp->comm, &send_req[pj] );
		    MPI_Isend( lusup1, msgcnt[1], MPI_DOUBLE, pj,
			     (4*(k+1)+1)%NTAGS, scp->comm, &send_req[pj+Pc] );
#if ( VAMPIR>=1 )
		    VT_end(1);
#endif
#if ( PROFlevel>=1 )
		    TOC(t2, t1);
		    stat->utime[COMM] += t2;
		    msg_cnt += 2;
		    msg_vol += msgcnt[0]*iword + msgcnt[1]*dword;
#endif
#if ( DEBUGlevel>=2 )
		    printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
			   iam, k+1, msgcnt[0], msgcnt[1], pj);
#endif
		}
	    } /* for pj ... */
	  } else { /* Post Recv of block column L(:,k+1). */
	    if ( ToRecv[k+1] >= 1 ) {
		scp = &grid->rscp; /* The scope of process row. */
		MPI_Irecv(Lsub_buf_2[(k+1)%2], Llu->bufmax[0], mpi_int_t, kcol,
			  (4*(k+1))%NTAGS, scp->comm, &recv_req[0]);
		MPI_Irecv(Lval_buf_2[(k+1)%2], Llu->bufmax[1], MPI_DOUBLE, kcol, 
			  (4*(k+1)+1)%NTAGS, scp->comm, &recv_req[1]);
#if ( DEBUGlevel>=2 )
		printf("(%d) Post Irecv L(:,%4d)\n", iam, k+1);
#endif
	    }
	  } /* if mycol == Pc(k+1) */
        } /* if k+1 < nsupers */

	if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
	    /* ---------------------------------------------------
	       Update all other blocks using block row U(k,:)
	       --------------------------------------------------- */
	    for (j = 0; j < nub; ++j) { 
		lptr = lptr0;
		luptr = luptr0;
		jb = usub[iukp];  /* Global block number of block U(k,j). */
		ljb = LBj( jb, grid ); /* Local block number of U(k,j). */
		nsupc = SuperSize( jb );
		iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */

		/* Prepare to call DGEMM. */
		jj = iukp;
		while ( usub[jj] == klst ) ++jj;
		ldu = klst - usub[jj++];
		ncols = 1;
		full = 1;
		for (; jj < iukp+nsupc; ++jj) {
		    segsize = klst - usub[jj];
		    if ( segsize ) {
		        ++ncols;
			if ( segsize != ldu ) full = 0;
		        if ( segsize > ldu ) ldu = segsize;
		    }
		}
#if ( DEBUGlevel>=3 )
		printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
		       iam, full, k, jb, ldu, ncols, nsupc);
		++num_update;
#endif
		if ( full ) {
		    tempu = &uval[rukp];
		} else { /* Copy block U(k,j) into tempU2d. */
#if ( DEBUGlevel>=3 )
		    ++num_copy;
#endif
		    tempu = tempU2d;
		    for (jj = iukp; jj < iukp+nsupc; ++jj) {
		        segsize = klst - usub[jj];
			if ( segsize ) {
			    lead_zero = ldu - segsize;
			    for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0;
			    tempu += lead_zero;
			    for (i = 0; i < segsize; ++i)
			        tempu[i] = uval[rukp+i];
			    rukp += segsize;
			    tempu += segsize;
			}
		    }
		    tempu = tempU2d;
		    rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
		} /* if full ... */

		for (lb = 0; lb < nlb; ++lb) { 
		    ib = lsub[lptr];       /* Row block L(i,k). */
		    nbrow = lsub[lptr+1];  /* Number of full rows. */
		    lptr += LB_DESCRIPTOR; /* Skip descriptor. */
		    tempv = tempv2d;
#ifdef _CRAY
		    SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, 
			  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
			  tempu, &ldu, &beta, tempv, &ldt);
#elif defined (USE_VENDOR_BLAS)
		    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, 
			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
			   tempu, &ldu, &beta, tempv, &ldt, 1, 1);
#else
		    dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, 
			   &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, 
			   tempu, &ldu, &beta, tempv, &ldt);
#endif
		    stat->ops[FACT] += 2 * nbrow * ldu * ncols;

		    /* Now gather the result into the destination block. */
		    if ( ib < jb ) { /* A(i,j) is in U. */
			ilst = FstBlockC( ib+1 );
			lib = LBi( ib, grid );
			index = Ufstnz_br_ptr[lib];
			ijb = index[iuip[lib]];
			while ( ijb < jb ) { /* Search for dest block. */
			    ruip[lib] += index[iuip[lib]+1];
			    iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb );
			    ijb = index[iuip[lib]];
			}
			/* Skip descriptor.  Now point to fstnz index of 
			   block U(i,j). */
			iuip[lib] += UB_DESCRIPTOR;

			tempv = tempv2d;
			for (jj = 0; jj < nsupc; ++jj) {
			    segsize = klst - usub[iukp + jj];
			    fnz = index[iuip[lib]++];
			    if ( segsize ) { /* Nonzero segment in U(k.j). */
				ucol = &Unzval_br_ptr[lib][ruip[lib]];
				for (i = 0 ; i < nbrow; ++i) {
				    rel = lsub[lptr + i] - fnz;
				    ucol[rel] -= tempv[i];
				}
				tempv += ldt;
			    }
			    ruip[lib] += ilst - fnz;
			}
		    } else { /* A(i,j) is in L. */
			index = Lrowind_bc_ptr[ljb];
			ldv = index[1];   /* LDA of the dest lusup. */
			lptrj = BC_HEADER;
			luptrj = 0;
			ijb = index[lptrj];
			while ( ijb != ib ) { /* Search for dest block -- 
						 blocks are not ordered! */
			    luptrj += index[lptrj+1];
			    lptrj += LB_DESCRIPTOR + index[lptrj+1];
			    ijb = index[lptrj];
			}
			/*
			 * Build indirect table. This is needed because the
			 * indices are not sorted for the L blocks.
			 */
			fnz = FstBlockC( ib );
			lptrj += LB_DESCRIPTOR;
			for (i = 0; i < index[lptrj-1]; ++i) {
			    rel = index[lptrj + i] - fnz;
			    indirect[rel] = i;
			}
			nzval = Lnzval_bc_ptr[ljb] + luptrj;
			tempv = tempv2d;
			for (jj = 0; jj < nsupc; ++jj) {
			    segsize = klst - usub[iukp + jj];
			    if ( segsize ) {
/*#pragma _CRI cache_bypass nzval,tempv*/
				for (i = 0; i < nbrow; ++i) {
				    rel = lsub[lptr + i] - fnz;
				    nzval[indirect[rel]] -= tempv[i];
				}
				tempv += ldt;
			    }
			    nzval += ldv;
			}
		    } /* if ib < jb ... */
		    lptr += nbrow;
		    luptr += nbrow;
		} /* for lb ... */
		rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */
		iukp += nsupc;
	    } /* for j ... */
	} /* if  k L(:,k) and U(k,:) are not empty */

    } 
    /* ------------------------------------------
       END MAIN LOOP: for k = ...
       ------------------------------------------ */

#if ( VAMPIR>=1 )
    VT_end(100);
    VT_traceoff();
#endif

    if ( Pr*Pc > 1 ) {
	SUPERLU_FREE(Lsub_buf_2[0]); /* also free Lsub_buf_2[1] */
	SUPERLU_FREE(Lval_buf_2[0]); /* also free Lval_buf_2[1] */
	if ( Llu->bufmax[2] != 0 ) SUPERLU_FREE(Usub_buf);
	if ( Llu->bufmax[3] != 0 ) SUPERLU_FREE(Uval_buf);
	SUPERLU_FREE(send_req);
	if ( U_diag_blk_send_req[myrow] ) {
	    /* wait for last Isend requests to complete, deallocate objects */ 
	    for (krow = 0; krow < Pr; ++krow)
		if ( krow != myrow )
                    MPI_Wait(U_diag_blk_send_req + krow, &status);
	}
	SUPERLU_FREE(U_diag_blk_send_req);
    }

    SUPERLU_FREE(Llu->ujrow);
    SUPERLU_FREE(tempv2d);
    SUPERLU_FREE(indirect);
    SUPERLU_FREE(iuip);
    SUPERLU_FREE(ruip);

    /* Prepare error message. */
    if ( *info == 0 ) *info = n + 1;
#if ( PROFlevel>=1 )
    TIC(t1);
#endif
    MPI_Allreduce( info, &iinfo, 1, mpi_int_t, MPI_MIN, grid->comm );
#if ( PROFlevel>=1 )
    TOC(t2, t1);
    stat->utime[COMM] += t2;
    {
	float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum;
	
	MPI_Reduce( &msg_cnt, &msg_cnt_sum,
		   1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
	MPI_Reduce( &msg_cnt, &msg_cnt_max,
		   1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
	MPI_Reduce( &msg_vol, &msg_vol_sum,
		   1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
	MPI_Reduce( &msg_vol, &msg_vol_max,
		   1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
	if ( !iam ) {
	    printf("\tPDGSTRF comm stat:"
		   "\tAvg\tMax\t\tAvg\tMax\n"
		   "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n",
		   msg_cnt_sum/Pr/Pc, msg_cnt_max,
		   msg_vol_sum/Pr/Pc*1e-6, msg_vol_max*1e-6);
	}
    }
#endif
    if ( iinfo == n + 1 ) *info = 0;
    else *info = iinfo;


#if ( PRNTlevel==3 )
    MPI_Allreduce( &zero_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm );
    if ( !iam ) printf(".. # msg of zero size\t%d\n", iinfo);
    MPI_Allreduce( &total_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm );
    if ( !iam ) printf(".. # total msg\t%d\n", iinfo);
#endif

#if ( DEBUGlevel>=2 )
    for (i = 0; i < Pr * Pc; ++i) {
	if ( iam == i ) {
	    dPrintLblocks(iam, nsupers, grid, Glu_persist, Llu);
	    dPrintUblocks(iam, nsupers, grid, Glu_persist, Llu);
	    printf("(%d)\n", iam);
	    PrintInt10("Recv", nsupers, Llu->ToRecv);
	}
	MPI_Barrier( grid->comm );
    }
#endif

#if ( DEBUGlevel>=3 )
    printf("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update);
#endif
#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit pdgstrf()");
#endif
} /* PDGSTRF */
Esempio n. 15
0
void
*pzgstrf_thread(void *arg)
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 *
 * Purpose
 * =======
 *
 * This is the slave process, representing the main scheduling loop to
 * perform the factorization. Each process executes a copy of the
 * following code ... (SPMD paradigm)
 *
 * Arguments
 * =========
 * arg     (input/output) void*
 *         On SGI/ORIGIN builds: base of an array of pzgstrf_threadarg_t,
 *         indexed by the thread number reported by the runtime.
 *         Otherwise: pointer to this thread's own pzgstrf_threadarg_t.
 *         The fields read here are pnum, superlumt_options and
 *         pxgstrf_shared; the field written is info (see below).
 *
 * Return value
 * ============
 * Always 0. The outcome is reported through thr_arg->info:
 *   - if the workspace initialisation fails: the code returned by
 *     pzgstrf_WorkInit() plus pzgstrf_memory_use(...);
 *   - if pzgstrf_factor_snode() reports info > n, or
 *     pzgstrf_column_dfs() / pzgstrf_column_bmod() /
 *     pzgstrf_copy_to_ucol() return nonzero: that code, unchanged;
 *   - otherwise: the smallest nonzero info reported by
 *     pzgstrf_factor_snode() / pzgstrf_pivotL() on this thread,
 *     or 0 if there was none.
 *
 * Error handling
 * ==============
 * Every fatal exit taken after the per-thread workspace has been set up
 * jumps to the common release code at the bottom of the routine, so the
 * thread-local buffers are released on the error paths exactly as they
 * are on the normal path. Spin locks / panel state are NOT touched on a
 * fatal exit (this matches the previous behaviour).
 *
 * Working arrays local to each process
 * ======================================
 *   marker[0:3*m-1]: marker[i] == j means node i has been reached when
 *                                 working on column j.
 *	Storage: relative to original row subscripts
 *
 *	THERE ARE 3 OF THEM:
 *          marker[0 : m-1]:   used by pzgstrf_factor_snode() and
 *                                     pzgstrf_panel_dfs();
 *          marker[m : 2m-1]:  used by pzgstrf_panel_dfs() and
 *                                     pxgstrf_super_bnd_dfs();
 *                values in [0 : n-1]  when used by pzgstrf_panel_dfs()
 *                values in [n : 2n-1] when used by pxgstrf_super_bnd_dfs()
 *	    marker[2m : 3m-1]: used by pzgstrf_column_dfs() in inner-factor
 *
 *   parent[0:n-1]: parent vector used during dfs
 *      Storage: relative to new row subscripts
 *
 *   xplore[0:2m-1]: xplore[i] gives the location of the next (dfs)
 *	unexplored neighbor of i in lsub[*]; xplore[n+i] gives the
 *      location of the last unexplored neighbor of i in lsub[*].
 *
 *   segrep[0:nseg-1]: contains the list of supernodal representatives
 *	in topological order of the dfs. A supernode representative is the
 *	last column of a supernode.
 *
 *   repfnz[0:m-1]: for a nonzero segment U[*,j] that ends at a
 *	supernodal representative r, repfnz[r] is the location of the first
 *	nonzero in this segment.  It is also used during the dfs:
 *      repfnz[r]>0 indicates that supernode r has been explored.
 *	NOTE: There are w of them, each used for one column of a panel.
 *
 *   panel_lsub[0:w*m-1]: temporary for the nonzero row indices below
 *      the panel diagonal. These are filled in during pzgstrf_panel_dfs(),
 *      and are used later in the inner LU factorization.
 *	panel_lsub[]/dense[] pair forms the SPA data structure.
 *	NOTE: There are w of them.
 *
 *   dense[0:w*m-1]: sparse accumulator (SPA) for intermediate values;
 *	NOTE: there are w of them.
 *
 *   tempv[0:m-1]: temporary used for dense numeric kernels;
 *
 *
 * Scheduling algorithm (For each process ...)
 * ====================
 *     Shared task Q <-- { relaxed s-nodes (CANGO) };
 *
 *     WHILE (not finished)
 *
 *         panel = Scheduler(Q); (see pxgstrf_scheduler.c for policy)
 *
 *         IF (panel == RELAXED_SNODE)
 *             pzgstrf_factor_snode(panel);
 *         ELSE
 *             * pzgstrf_panel_dfs()
 *                 - skip all BUSY s-nodes (or panels)
 *
 *             * pzgstrf_panel_bmod()
 *                 - updates from DONE s-nodes
 *                 - wait for BUSY s-nodes to become DONE
 *
 *             * inner-factor()
 *                 - identical as it is in the sequential algorithm,
 *                   except that pruning() will interact with the
 *                   pzgstrf_panel_dfs() of other panels.
 *         ENDIF
 *
 *     END WHILE;
 *
 */

#if ( MACH==SGI || MACH==ORIGIN )
#if ( MACH==SGI )
    int         pnum = mpc_my_threadnum();
#elif ( MACH==ORIGIN )
    int         pnum = mp_my_threadnum();
#endif
    pzgstrf_threadarg_t *thr_arg = &((pzgstrf_threadarg_t *)arg)[pnum];
#else
    pzgstrf_threadarg_t *thr_arg  = arg;
    int         pnum = thr_arg->pnum;
#endif

    /* Unpack the options argument */
    superlumt_options_t *superlumt_options = thr_arg->superlumt_options;
    pxgstrf_shared_t  *pxgstrf_shared= thr_arg->pxgstrf_shared;
    int         panel_size = superlumt_options->panel_size;
    double     diag_pivot_thresh = superlumt_options->diag_pivot_thresh;
    yes_no_t    *usepr     = &superlumt_options->usepr; /* may be modified */
    int         *etree     = superlumt_options->etree;
    int         *super_bnd = superlumt_options->part_super_h;
    int         *perm_r    = superlumt_options->perm_r;
    int         *inv_perm_c= pxgstrf_shared->inv_perm_c;
    int         *inv_perm_r= pxgstrf_shared->inv_perm_r;
    int         *xprune    = pxgstrf_shared->xprune;
    int         *ispruned  = pxgstrf_shared->ispruned;
    SuperMatrix *A         = pxgstrf_shared->A;
    GlobalLU_t  *Glu       = pxgstrf_shared->Glu;
    Gstat_t     *Gstat     = pxgstrf_shared->Gstat;
    int         *info      = &thr_arg->info;

    /* Local working arrays */
    int       *iwork;
    doublecomplex    *dwork;
    int       *segrep, *repfnz, *parent, *xplore;
    int       *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */
    int       *marker, *marker1, *marker2;
    int       *lbusy; /* "Local busy" array, indicates which descendants
                         were busy when this panel's computation began.
                         Those columns (s-nodes) are treated specially
                         during pzgstrf_panel_dfs() and
                         pzgstrf_panel_bmod(). */

    int       *spa_marker; /* size n-by-w */
    int       *w_lsub_end; /* record the end of each column in panel_lsub */
    doublecomplex    *dense, *tempv;
    int       *lsub, *xlsub, *xlsub_end;

    /* Local scalars */
    register int m, n, k, jj, jcolm1, itemp, singular;
    int       pivrow;   /* pivotal row number in the original matrix A */
    int       nseg1;    /* no of segments in U-column above panel row jcol */
    int       nseg;     /* no of segments in each U-column */
    int       w, bcol, jcol;

#ifdef PROFILE
    double *utime = Gstat->utime;
    double t1, t2, t, stime;
    register float flopcnt;
#endif

#ifdef PREDICT_OPT
    flops_t  *ops = Gstat->ops;
    register float pdiv;
#endif

#if ( DEBUGlevel>=1 )
    printf("(%d) thr_arg-> pnum %d, info %d\n", pnum, thr_arg->pnum, thr_arg->info);
#endif

    singular   = 0;
    m          = A->nrow;
    n          = A->ncol;
    lsub       = Glu->lsub;
    xlsub      = Glu->xlsub;
    xlsub_end  = Glu->xlsub_end;

    /* Allocate and initialize the per-process working storage.
       Nothing else has been allocated by this routine yet, so a plain
       return is sufficient if this step fails. */
    if ( (*info = pzgstrf_WorkInit(m, panel_size, &iwork, &dwork)) ) {
        *info += pzgstrf_memory_use(Glu->nzlmax, Glu->nzumax, Glu->nzlumax);
        return 0;
    }
    pxgstrf_SetIWork(m, panel_size, iwork, &segrep, &parent, &xplore,
                     &repfnz, &panel_lsub, &marker, &lbusy);
    pzgstrf_SetRWork(m, panel_size, dwork, &dense, &tempv);

    /* New data structures to facilitate parallel algorithm */
    spa_marker = intMalloc(m * panel_size);
    w_lsub_end = intMalloc(panel_size);
    ifill (spa_marker, m * panel_size, EMPTY);
    ifill (marker, m * NO_MARKER, EMPTY);
    ifill (lbusy, m, EMPTY);
    jcol = EMPTY;
    marker1 = marker + m;
    marker2 = marker + 2*m;

#ifdef PROFILE
    stime = SuperLU_timer_();
#endif

    /* -------------------------
       Main loop: repeatedly ...
       ------------------------- */
    while ( pxgstrf_shared->tasks_remain > 0 ) {

#ifdef PROFILE
        TIC(t);
#endif
        /* Get a panel from the scheduler. */
        pxgstrf_scheduler(pnum, n, etree, &jcol, &bcol, pxgstrf_shared);

#if ( DEBUGlevel>=1 )
    if ( jcol>=LOCOL && jcol<=HICOL ) {
        printf("(%d) Scheduler(): jcol %d, bcol %d, tasks_remain %d\n",
               pnum, jcol, bcol, pxgstrf_shared->tasks_remain);
        fflush(stdout);
    }
#endif

#ifdef PROFILE
        TOC(t2, t);
        Gstat->procstat[pnum].skedtime += t2;
#endif

        if ( jcol != EMPTY ) {
            w = pxgstrf_shared->pan_status[jcol].size;

#if ( DEBUGlevel>=3 )
            printf("P%2d got panel %5d-%5d\ttime %.4f\tpanels_left %d\n",
                   pnum, jcol, jcol+w-1, SuperLU_timer_(),
                   pxgstrf_shared->tasks_remain);
            fflush(stdout);
#endif
            /* Nondomain panels */
#ifdef PROFILE
            flopcnt = Gstat->procstat[pnum].fcops;
            Gstat->panstat[jcol].pnum = pnum;
            TIC(t1);
            Gstat->panstat[jcol].starttime = t1;
#endif
            if ( pxgstrf_shared->pan_status[jcol].type == RELAXED_SNODE ) {

#ifdef PREDICT_OPT
                pdiv = Gstat->procstat[pnum].fcops;
#endif
                /* A relaxed supernode at the bottom of the etree */
                pzgstrf_factor_snode
                    (pnum, jcol, A, diag_pivot_thresh, usepr,
                     perm_r, inv_perm_r, inv_perm_c, xprune, marker,
                     panel_lsub, dense, tempv, pxgstrf_shared, info);
                if ( *info ) {
                    /* info > n is fatal: leave with *info intact, but
                       release the thread-local workspace first. */
                    if ( *info > n ) goto release_workspace;
                    /* Otherwise remember the smallest reported index. */
                    if ( singular == 0 || *info < singular )
                        singular = *info;
#if ( DEBUGlevel>=1 )
    printf("(%d) After pzgstrf_factor_snode(): singular=%d\n", pnum, singular);
#endif
                }

                /* Release the whole relaxed supernode */
                for (jj = jcol; jj < jcol + w; ++jj)
                    pxgstrf_shared->spin_locks[jj] = 0;
#ifdef PREDICT_OPT
                pdiv = Gstat->procstat[pnum].fcops - pdiv;
                cp_panel[jcol].pdiv = pdiv;
#endif
            } else { /* Regular panel */
#ifdef PROFILE
                TIC(t);
#endif
                pxgstrf_mark_busy_descends(pnum, jcol, etree, pxgstrf_shared,
                                           &bcol, lbusy);

                /* Symbolic factor on a panel of columns */
                pzgstrf_panel_dfs
                    (pnum, m, w, jcol, A, perm_r, xprune,ispruned,lbusy,
                     &nseg1, panel_lsub, w_lsub_end, segrep, repfnz,
                     marker, spa_marker, parent, xplore, dense, Glu);
#if ( DEBUGlevel>=2 )
  if ( jcol==BADPAN )
    printf("(%d) After pzgstrf_panel_dfs(): nseg1 %d, w_lsub_end %d\n",
           pnum, nseg1, w_lsub_end[0]);
#endif
#ifdef PROFILE
                TOC(t2, t);
                utime[DFS] += t2;
#endif
                /* Numeric sup-panel updates in topological order.
                 * On return, the update values are temporarily stored in
                 * the n-by-w SPA dense[m,w].
                 */
                pzgstrf_panel_bmod
                    (pnum, m, w, jcol, bcol, inv_perm_r, etree,
                     &nseg1, segrep, repfnz, panel_lsub, w_lsub_end,
                     spa_marker, dense, tempv, pxgstrf_shared);

                /*
                 * All "busy" descendants are "done" now --
                 * Find the set of row subscripts in the preceding column
                 * "jcol-1" of the current panel. Column "jcol-1" is
                 * usually taken by a process other than myself.
                 * This row-subscripts information will be used by myself
                 * during column dfs to detect whether "jcol" belongs
                 * to the same supernode as "jcol-1".
                 *
                 * ACCORDING TO PROFILE, THE AMOUNT OF TIME SPENT HERE
                 * IS NEGLIGIBLE.
                 *
                 * Defensive guard: a panel starting at column 0 has no
                 * preceding column, so xlsub[-1] / xlsub_end[-1] must not
                 * be read. NOTE(review): column 0 is presumably always
                 * scheduled as a relaxed supernode, in which case this
                 * guard never fires -- confirm against the partitioner.
                 */
                jcolm1 = jcol - 1;
                if ( jcolm1 >= 0 ) {
                    itemp = xlsub_end[jcolm1];
                    for (k = xlsub[jcolm1]; k < itemp; ++k)
                        marker2[lsub[k]] = jcolm1;
                }
#ifdef PREDICT_OPT
                pdiv = Gstat->procstat[pnum].fcops;
#endif
                /* Inner-factorization, using sup-col algorithm */
                for ( jj = jcol; jj < jcol + w; jj++) {
                    k = (jj - jcol) * m; /* index into w-wide arrays */
                    nseg = nseg1; /* begin after all the panel segments */
#ifdef PROFILE
                    TIC(t);
#endif
                    /* Allocate storage for the current H-supernode. */
                    if ( Glu->dynamic_snode_bound && super_bnd[jj] ) {
                        /* jj starts a supernode in H */
                        pxgstrf_super_bnd_dfs
                            (pnum, m, n, jj, super_bnd[jj], A, perm_r,
                             inv_perm_r, xprune, ispruned, marker1, parent,
                             xplore, pxgstrf_shared);
                    }

                    /* Fatal on nonzero return: *info keeps the code. */
                    if ( (*info = pzgstrf_column_dfs
                                    (pnum, m, jj, jcol, perm_r, ispruned,
                                     &panel_lsub[k],w_lsub_end[jj-jcol],
                                     super_bnd, &nseg, segrep,
                                     &repfnz[k], xprune, marker2,
                                     parent, xplore, pxgstrf_shared)) )
                        goto release_workspace;
#ifdef PROFILE
                    TOC(t2, t);
                    utime[DFS] += t2;
#endif
                    /* On return, the L supernode is gathered into the
                       global storage. Fatal on nonzero return. */
                    if ( (*info = pzgstrf_column_bmod
                                  (pnum, jj, jcol, (nseg - nseg1),
                                   &segrep[nseg1], &repfnz[k],
                                   &dense[k], tempv, pxgstrf_shared, Gstat)) )
                        goto release_workspace;

                    /* A nonzero return from the pivoting step is not
                       fatal; only the smallest such value is kept. */
                    if ( (*info = pzgstrf_pivotL
                                    (pnum, jj, diag_pivot_thresh, usepr,
                                     perm_r, inv_perm_r, inv_perm_c,
                                     &pivrow, Glu, Gstat)) ) {
                        if ( singular == 0 || *info < singular ) {
                            singular = *info;
#if ( DEBUGlevel>=1 )
    printf("(%d) After pzgstrf_pivotL(): singular=%d\n", pnum, singular);
#endif
                        }
                    }

                    /* release column "jj", so that the other processes
                       waiting for this column can proceed */
                    pxgstrf_shared->spin_locks[jj] = 0;

                    /* copy the U-segments to ucol[*]; fatal on nonzero
                       return. */
                    if ( (*info = pzgstrf_copy_to_ucol
                                    (pnum,jj,nseg,segrep,&repfnz[k],
                                     perm_r, &dense[k], pxgstrf_shared)) )
                        goto release_workspace;

                    /* Prune columns [0:jj-1] using column jj */
                    pxgstrf_pruneL(jj, perm_r, pivrow, nseg, segrep,
                                   &repfnz[k], xprune, ispruned, Glu);

                    /* Reset repfnz[] for this column */
                    pxgstrf_resetrep_col (nseg, segrep, &repfnz[k]);

#if ( DEBUGlevel>=2 )
/*  if (jj >= LOCOL && jj <= HICOL) {*/
  if ( jj==BADCOL ) {
    dprint_lu_col(pnum, "panel:", jcol, jj, w, pivrow, xprune, Glu);
    dcheck_zero_vec(pnum, "after pzgstrf_copy_to_ucol() dense_col[]", n, &dense[k]);
  }
#endif
                } /* for jj ... */

#ifdef PREDICT_OPT
                pdiv = Gstat->procstat[pnum].fcops - pdiv;
                cp_panel[jcol].pdiv = pdiv;
#endif

            } /* else regular panel ... */

            STATE( jcol ) = DONE; /* Release panel jcol. */

#ifdef PROFILE
            TOC(Gstat->panstat[jcol].fctime, t1);
            Gstat->panstat[jcol].flopcnt += Gstat->procstat[pnum].fcops - flopcnt;
            /*if ( Glu->tasks_remain < P ) {
                flops_last_P_panels += Gstat->panstat[jcol].flopcnt;
                printf("Panel %d, flops %e\n", jcol, Gstat->panstat[jcol].flopcnt);
                fflush(stdout);
            } */
#endif

        }
#ifdef PROFILE
        else { /* No panel from the task queue - wait and try again */
            Gstat->procstat[pnum].skedwaits++;
        }
#endif

    } /* while there are more panels */

    /* Normal completion: report the smallest nonzero info seen, or 0. */
    *info = singular;

release_workspace:
    /* Common exit. Reached by falling through on normal completion, or by
       goto on a fatal error, in which case *info already holds the error
       code and must not be overwritten.
       NOTE(review): pzgstrf_WorkFree() is the release call the normal
       path has always used for iwork/dwork; it is assumed to be equally
       valid after a failed factorization step -- confirm. */
    /* Free work space and compress storage */
    pzgstrf_WorkFree(iwork, dwork, Glu);
    SUPERLU_FREE (spa_marker);
    SUPERLU_FREE (w_lsub_end);

#ifdef PROFILE
    Gstat->procstat[pnum].fctime = SuperLU_timer_() - stime;
#endif

    return 0;
}
void
pxgstrf_scheduler(const int pnum, const int n, const int *etree, 
		  int *cur_pan, int *bcol, pxgstrf_shared_t *pxgstrf_shared)
{
/*
 * -- SuperLU MT routine (version 1.0) --
 * Univ. of California Berkeley, Xerox Palo Alto Research Center,
 * and Lawrence Berkeley National Lab.
 * August 15, 1997
 *
 * Purpose
 * =======
 *
 * pxgstrf_scheduler() gets a panel for the processor to work on. 
 * It schedules a panel in decreasing order of priority:
 *   (1) the current panel's parent, if it can be done without pipelining
 *   (2) any other panel in the queue that can be done without pipelining
 *       ("CANGO" status)
 *   (3) any other panel in the queue that can be done with pipelining
 *       ("CANPIPE" status)
 *
 * The whole selection/update step runs inside the SCHED_LOCK critical
 * section, so only one processor at a time manipulates the task queue
 * and the panel status.
 *
 * Arguments
 * =========
 * pnum    (input) int
 *         Processor number.
 *
 * n       (input) int
 *         Column dimension of the matrix.
 *
 * etree   (input) int*
 *         Elimination tree of A'*A, size n.
 *         Note: etree is a vector of parent pointers for a forest whose
 *         vertices are the integers 0 to n-1; etree[root] = n.
 *
 * cur_pan (input/output) int*
 *         On entry, the current panel just finished by this processor;
 *         On exit, [0, n-1]: the new panel to work on;
 *                  EMPTY:    failed to get any work, will try later;
 *                  n:        all panels are taken; ready to terminate.
 *
 * bcol    (output) int*
 *         The most distant busy descendant of cur_pan in the *linear*
 *         pipeline of busy descendants. If all proper descendants of
 *         cur_pan are done, bcol is returned equal to cur_pan.
 *         Left untouched when no panel could be obtained.
 *
 * pxgstrf_shared (input/output) pxgstrf_shared_t*
 *         Shared factorization state. The fields used here are:
 *           taskq        global work queue (head/tail/count/queue[]);
 *           fb_cols      the farthest busy descendant of each (leading
 *                        column of the) panel;
 *           pan_status   per-panel size and count of undone kids (ukids);
 *           spin_locks   per-column flags, set to 1 for every column of
 *                        the panel that is handed out;
 *           tasks_remain decremented for every panel handed out;
 *           lu_locks     holds the SCHED_LOCK mutex;
 *           Gstat        statistics, updated only under PROFILE.
 *
 * Defining terms
 * ==============
 *   o parent(panel) = etree(last column in the panel)
 *   o the kids of a panel = collective kids of all columns in the panel
 *     kids[REP] = SUM_{j in panel} ( kids[j] ) 
 *   o linear pipeline - what does it mean in the panel context?
 *       if ukids[REP] = 0, then the panel becomes a leaf (CANGO)
 *       if ukids[REP] = 1 && ukids[firstcol] = 1, then the panel can
 *                       be taken with pipelining (CANPIPE)
 *
 * NOTES
 * =====
 *   o When a "busy" panel finishes, if its parent has only one remaining
 *     undone child there is no check to see if the parent should change
 *     from "unready" to "canpipe". Thus a few potential pipelinings will
 *     be lost, but checking out this pipeline opportunity may be costly.
 *
 */

    register int dad, dad_ukids, jcol, w, j;
    int *fb_cols = pxgstrf_shared->fb_cols;
    queue_t *taskq = &pxgstrf_shared->taskq;
    Gstat_t *Gstat = pxgstrf_shared->Gstat;
#ifdef PROFILE
    double t;
#endif

    /* "dad" is only meaningful (and only read below) when a panel was
       actually finished, i.e. when jcol != EMPTY. */
    jcol = *cur_pan;
    if ( jcol != EMPTY ) {
#ifdef DOMAINS
	if ( in_domain[jcol] == TREE_DOMAIN )
	    dad = etree[jcol];
	else
#endif
	    dad = DADPANEL (jcol);
    }

    /* w_top = sp_ienv(1)/2;
       if ( w_top == 0 ) w_top = 1;*/

#ifdef PROFILE
    TIC(t);
#endif
    /* Enter the scheduler critical section. The pragma-based variants
       apply to the brace-delimited block that follows. */
#if ( MACH==SUN )
    mutex_lock( &pxgstrf_shared->lu_locks[SCHED_LOCK] );
#elif ( MACH==DEC || MACH==PTHREAD )
    pthread_mutex_lock( &pxgstrf_shared->lu_locks[SCHED_LOCK] );
#elif ( MACH==SGI || MACH==ORIGIN )
#pragma critical lock(pxgstrf_shared->lu_locks[SCHED_LOCK])
#elif ( MACH==CRAY_PVP )
#pragma _CRI guard SCHED_LOCK
#elif ( MACH==OPENMP )
#pragma omp critical ( SCHED_LOCK )
#endif    

{   /* ---- START CRITICAL SECTION ---- */
    
    /* Update the status of the current panel and its parent, so that
     * the other processors waiting on it can proceed.
     * If all siblings are done, and dad is not busy, then take dad.
     */
    if ( jcol != EMPTY ) { /* jcol was just finished by this processor */    
	dad_ukids = --pxgstrf_shared->pan_status[dad].ukids;
	
#ifdef DEBUG
	printf("(%d) DONE %d in Scheduler(), dad %d, STATE %d, dad_ukids %d\n",
	       pnum, jcol, dad, STATE(dad), dad_ukids);
#endif	

	if ( dad_ukids == 0 && STATE( dad ) > BUSY ) { /* dad not started */
	    jcol = dad;
#ifdef DEBUG
	    printf("(%d) Scheduler[1] Got dad %d, STATE %d\n",
		   pnum, jcol, STATE(dad));
#endif
#ifdef PROFILE
	    ++(Gstat->panhows[DADPAN]);
#endif	    
	} else {
	    /* Try to get a panel from the task Q. Entries whose state is
	       below CANGO are dropped from the queue and skipped. */
	    while ( 1 ) {
		/*>>if ( (j = Dequeue(taskq, &item)) == EMPTY ) {*/
		if ( taskq->count <= 0 ) {
		    jcol = EMPTY;
		    break;
		} else {
		    jcol = taskq->queue[taskq->head++];
		    --taskq->count;
		    if ( STATE( jcol ) >= CANGO ) { /* CANGO or CANPIPE */
#ifdef DEBUG
			/* Report the real queue length; the inlined
			   dequeue no longer produces a count in "j". */
			printf("(%d) Dequeue[1] Got %d, STATE %d, Qcount %d\n",
			       pnum, jcol, STATE(jcol), taskq->count);
#endif
#ifdef PROFILE
			if (STATE( jcol ) == CANGO) ++(Gstat->panhows[NOPIPE]);
			else ++(Gstat->panhows[PIPE]);
#endif			
		        break;
		    }
		}
	    } /* while */
	}
    } else {
	/*
	 * jcol was EMPTY; Try to get a panel from the task Q.
	 */
    	while ( 1 ) {
    	    /*>>if ( (j = Dequeue(taskq, &item)) == EMPTY ) {*/
	    if ( taskq->count <= 0 ) {
		jcol = EMPTY;
		break;
	    } else {
		jcol = taskq->queue[taskq->head++];
		--taskq->count;
		if ( STATE( jcol ) >= CANGO ) { /* CANGO or CANPIPE */
#ifdef DEBUG
		    printf("(%d) Dequeue[2] Got %d, STATE %d, Qcount %d\n",
			   pnum, jcol, STATE(jcol), taskq->count);
#endif
#ifdef PROFILE
		    if (STATE( jcol ) == CANGO) ++(Gstat->panhows[NOPIPE]);
		    else ++(Gstat->panhows[PIPE]);
#endif			
		    break;
		}
	    }
	} /* while */
    }
    
    /*
     * Update the status of the new panel "jcol" and its parent "dad".
     */
    if ( jcol != EMPTY ) {
	    --pxgstrf_shared->tasks_remain;
#ifdef DOMAINS
	if ( in_domain[jcol] == TREE_DOMAIN ) {
	    /* Dequeue the first descendant of this domain */
	    *bcol = taskq->queue[taskq->head++];
	    --taskq->count;
	} else
#endif
	{
	    STATE( jcol ) = BUSY;
	    w = pxgstrf_shared->pan_status[jcol].size;

	    /* Flag every column of the panel as not yet available. */
	    for (j = jcol; j < jcol+w; ++j) pxgstrf_shared->spin_locks[j] = 1;
	    dad = DADPANEL (jcol);
	    /* If jcol is dad's only undone kid, dad may now be started
	       with pipelining: publish it on the task queue. */
	    if ( dad < n && pxgstrf_shared->pan_status[dad].ukids == 1 ) {
		STATE( dad ) = CANPIPE;
		/*>> j = Enqueue(taskq, dad);*/
		taskq->queue[taskq->tail++] = dad;
		++taskq->count;
#ifdef DEBUG
		printf("(%d) Enqueue() %d's dad %d ->CANPIPE, Qcount %d\n",
		       pnum, jcol, dad, taskq->count);
#endif
	    }

#ifdef PROFILE
	    Gstat->procstat[pnum].panels++;
#endif
	
	    /* Find the farthest busy descendant of the new panel
	       and its parent.*/
	    *bcol = fb_cols[jcol];
#ifdef DEBUG
	    printf("(%d) Scheduler[2] fb_cols[%d]=%d, STATE %d\n",
		   pnum, jcol, *bcol, STATE( *bcol ));
#endif
	    /* Climb past descendants that have already finished. */
	    while ( STATE( *bcol ) == DONE ) *bcol = DADPANEL (*bcol);
	    fb_cols[dad] = *bcol;
	
	} /* else regular_panel */

    } /* if jcol != empty */

    *cur_pan = jcol;

#ifdef DEBUG
    printf("(%d) Exit C.S. tasks_remain %d, cur_pan %d\n", 
	   pnum, pxgstrf_shared->tasks_remain, jcol);
#endif

} /* ---- END CRITICAL SECTION ---- */
    
#if ( MACH==SUN )
    /* Exit C.S. */
    mutex_unlock( &pxgstrf_shared->lu_locks[SCHED_LOCK] );
#elif ( MACH==DEC || MACH==PTHREAD )
    pthread_mutex_unlock( &pxgstrf_shared->lu_locks[SCHED_LOCK] );
#elif ( MACH==CRAY_PVP )
#pragma _CRI endguard SCHED_LOCK
#endif    

#ifdef PROFILE
    Gstat->procstat[pnum].cs_time += SuperLU_timer_() - t;
#endif

    return;
}
Esempio n. 17
0
InstancePtr FeatureExtractor::extract(const Observation &obs, FeatureExtractorHistory &history) {
  // Builds one Instance from the observation: predator index, offsets to
  // every agent, occupancy of the neighbouring cells, and this agent's
  // recent actions. The history is updated as a side effect.
  TIC(total);
  assert(obs.preyInd == 0);
  InstancePtr instance(new Instance);

  // --- offsets from this agent to every agent in the observation ---
  TIC(pos);
  setFeature(instance,FeatureType::PredInd,obs.myInd - 1);
  for (unsigned int agent = 0; agent < obs.positions.size(); agent++) {
    Point2D offset = getDifferenceToPoint(dims,obs.myPos(),obs.positions[agent]);
    // dx/dy keys are laid out in consecutive pairs starting at Prey_dx
    unsigned int dxKey = FeatureType::Prey_dx + 2 * agent;
    setFeature(instance,dxKey,offset.x);
    setFeature(instance,dxKey+1,offset.y);
  }
  TOC(pos);

  // --- derived features: which neighbouring cells are taken ---
  TIC(derived);
  bool adjacentToPrey = false;
  for (unsigned int dir = 0; dir < Action::NUM_NEIGHBORS; dir++) {
    Point2D neighbor = movePosition(dims,obs.myPos(),(Action::Type)dir);
    bool taken = false;
    // stop scanning as soon as some other agent is found on the cell
    for (unsigned int other = 0; other < obs.positions.size() && !taken; other++) {
      if (other == obs.myInd || !(obs.positions[other] == neighbor))
        continue;
      taken = true;
      // index 0 is the prey (see the assert on preyInd above)
      if (other == 0)
        adjacentToPrey = true;
    }
    setFeature(instance,FeatureType::Occupied_0 + dir, taken);
  }
  setFeature(instance,FeatureType::NextToPrey,adjacentToPrey);
  TOC(derived);

  // --- actions predicted by models: not currently supported ---
  TIC(actions);
  //ActionProbs actionProbs;
  if (!featureAgents.empty()) {
    // any registered feature agent is a fatal configuration error
    std::cerr << "FeatureExtractor can't handle featureAgents" << std::endl;
    exit(58);
    //actionProbs = it->agent->step(obs);
    //ADD_KEY(it->name + ".des");
    //setFeature(instance,actionProbs.maxAction());
  }
  TOC(actions);

  // --- history: refresh it, then emit this agent's past actions ---
  TIC(history);
  updateHistory(obs,history);
  TIC(historyupdate);
  for (unsigned int step = 0; step < HISTORY_SIZE; step++) {
    // slots beyond the recorded history get the NUM_ACTIONS placeholder
    Action::Type pastAction = Action::NUM_ACTIONS;
    if (step < history.actionHistory[obs.myInd].size())
      pastAction = history.actionHistory[obs.myInd][step];
    setFeature(instance,FeatureType::MyHistoricalAction_0 + step,pastAction);
  }
  TOC(historyupdate);
/*
  Disabled variant that walked the history of every agent:

  for (unsigned int agentInd = 0; agentInd < obs.positions.size(); agentInd++) {
    for (unsigned int j = 0; j < HISTORY_SIZE; j++) {
      if (j < history.actionHistory[agentInd].size())
        action = history.actionHistory[agentInd][j];
      else
        action = Action::NUM_ACTIONS;
      if (USE_ALL_AGENTS_HISTORY) {
        std::cerr << "FeatureExtractor can't handle all agents history" << std::endl;
        exit(58);
        //ADD_KEY("HistoricalAction" + boost::lexical_cast<std::string>(agentInd) + "." + boost::lexical_cast<std::string>(j));
        //setFeature(instance,action);
      }

      if (agentInd == obs.myInd) {
        setFeature(instance,FeatureType::MyHistoricalAction_0 + j,action);
      }

    }
  }
*/
  TOC(history);

  instance->weight = 1.0;
  TOC(total);
  //std::cout << "instance: " << *instance << std::endl;
  return instance;
}
Esempio n. 18
0
    /// Builds the prolongation operator P from the aggregates of A and
    /// returns it together with the restriction R = transpose(P).
    /// Note that prm.aggr.eps_strong is halved on every call.
    static boost::tuple< boost::shared_ptr<Matrix>, boost::shared_ptr<Matrix> >
    transfer_operators(const Matrix &A, params &prm)
    {
        typedef typename backend::value_type<Matrix>::type value_t;

        const size_t n = rows(A);

        TIC("aggregates");
        Aggregates aggr(A, prm.aggr);
        prm.aggr.eps_strong *= 0.5;
        TOC("aggregates");

        TIC("interpolation");
        boost::shared_ptr<Matrix> P = boost::make_shared<Matrix>();
        P->nrows = n;
        P->ncols = aggr.count;
        P->ptr.resize(n + 1, 0);

#pragma omp parallel
        {
            // Per-thread scratch: for each aggregate, the last row (first
            // pass) or the position inside P (second pass) that touched it.
            std::vector<ptrdiff_t> slot(aggr.count, -1);

#ifdef _OPENMP
            // Split the rows into contiguous, equally sized chunks.
            int nt  = omp_get_num_threads();
            int tid = omp_get_thread_num();

            size_t rows_per_thread = (n + nt - 1) / nt;
            size_t row_first       = tid * rows_per_thread;
            size_t row_last        = std::min(n, row_first + rows_per_thread);
#else
            size_t row_first = 0;
            size_t row_last  = n;
#endif

            // Pass 1: count the distinct aggregates reached from each row.
            for(size_t row = row_first; row < row_last; ++row) {
                for(ptrdiff_t k = A.ptr[row], k_end = A.ptr[row+1]; k < k_end; ++k) {
                    size_t col = static_cast<size_t>(A.col[k]);

                    // Only the diagonal and strong off-diagonal entries
                    // take part.
                    if (col == row || aggr.strong_connection[k]) {
                        ptrdiff_t agg = aggr.id[col];

                        if (agg >= 0 && static_cast<size_t>(slot[agg]) != row) {
                            slot[agg] = static_cast<ptrdiff_t>(row);
                            ++( P->ptr[row + 1] );
                        }
                    }
                }
            }

            boost::fill(slot, -1);

            // All counts must be in place before a single thread turns
            // them into row offsets and sizes the column/value arrays.
#pragma omp barrier
#pragma omp single
            {
                boost::partial_sum(P->ptr, P->ptr.begin());
                P->col.resize(P->ptr.back());
                P->val.resize(P->ptr.back());
            }

            // Pass 2: write the entries of the interpolation matrix.
            for(size_t row = row_first; row < row_last; ++row) {

                // The filtered matrix keeps the original diagonal with the
                // weak off-diagonal entries subtracted from it.
                value_t dia = 0;
                for(ptrdiff_t k = A.ptr[row], k_end = A.ptr[row+1]; k < k_end; ++k) {
                    if (static_cast<size_t>(A.col[k]) == row)
                        dia += A.val[k];
                    else if (!aggr.strong_connection[k])
                        dia -= A.val[k];
                }
                dia = 1 / dia;

                ptrdiff_t head = P->ptr[row];
                ptrdiff_t tail = head;
                for(ptrdiff_t k = A.ptr[row], k_end = A.ptr[row + 1]; k < k_end; ++k) {
                    size_t col = static_cast<size_t>(A.col[k]);

                    // Weak couplings are ignored, ...
                    if (col == row || aggr.strong_connection[k]) {
                        // ... and so are columns outside of any aggregate.
                        ptrdiff_t agg = aggr.id[col];

                        if (agg >= 0) {
                            value_t v = (col == row) ? 1 - prm.relax : -prm.relax * dia * A.val[k];

                            if (slot[agg] >= head) {
                                // Aggregate already present in this row.
                                P->val[ slot[agg] ] += v;
                            } else {
                                // First hit: open a new entry for it.
                                slot[agg] = tail;
                                P->col[tail] = agg;
                                P->val[tail] = v;
                                ++tail;
                            }
                        }
                    }
                }
            }
        }
        TOC("interpolation");

        boost::shared_ptr<Matrix> R = boost::make_shared<Matrix>();
        *R = transpose(*P);

        return boost::make_tuple(P, R);
    }
/* Finds the prediction coefficients for one frame: derives per-subframe  */
/* weights from the gains, runs the LTP analysis for voiced frames (or    */
/* clears the LTP data otherwise), then finds and quantizes the NLSFs and */
/* computes the residual energies.                                        */
void SKP_Silk_find_pred_coefs_FIX(
    SKP_Silk_encoder_state_FIX      *psEnc,         /* I/O  encoder state                               */
    SKP_Silk_encoder_control_FIX    *psEncCtrl,     /* I/O  encoder control                             */
    const SKP_int16                 res_pitch[],    /* I    Residual from pitch analysis                */
    const SKP_int16                 x[]             /* I    Speech signal                               */
)
{
    SKP_int         sf;
    SKP_int32       WLTP[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ];
    SKP_int32       invGains_Q16[ MAX_NB_SUBFR ];
    SKP_int32       local_gains[ MAX_NB_SUBFR ];
    SKP_int32       Wght_Q15[ MAX_NB_SUBFR ];
    SKP_int16       NLSF_Q15[ MAX_LPC_ORDER ];
    SKP_int16       LPC_in_pre[ MAX_NB_SUBFR * MAX_LPC_ORDER + MAX_FRAME_LENGTH ];
    const SKP_int16 *src;
    SKP_int16       *dst;
    SKP_int32       gain_sq, min_gain_Q16;
    SKP_int         LTP_corrs_rshift[ MAX_NB_SUBFR ];

    /* Smallest subframe gain, capped from above; it normalizes the inverse gains */
    min_gain_Q16 = SKP_int32_MAX >> 6;
    for( sf = 0; sf < psEnc->sCmn.nb_subfr; sf++ ) {
        min_gain_Q16 = SKP_min( min_gain_Q16, psEncCtrl->Gains_Q16[ sf ] );
    }

    /* Per-subframe weights for the weighted least squares */
    for( sf = 0; sf < psEnc->sCmn.nb_subfr; sf++ ) {
        SKP_assert( psEncCtrl->Gains_Q16[ sf ] > 0 );

        /* Normalized inverse gain; the largest value must still fit in a 16 bit int */
        invGains_Q16[ sf ] = SKP_DIV32_varQ( min_gain_Q16, psEncCtrl->Gains_Q16[ sf ], 16 - 2 );

        /* Lower bound that keeps Wght_Q15 at a minimum value of 1 */
        invGains_Q16[ sf ] = SKP_max( invGains_Q16[ sf ], 363 );

        /* Weight is the squared inverse gain */
        SKP_assert( invGains_Q16[ sf ] == SKP_SAT16( invGains_Q16[ sf ] ) );
        gain_sq = SKP_SMULWB( invGains_Q16[ sf ], invGains_Q16[ sf ] );
        Wght_Q15[ sf ] = SKP_RSHIFT( gain_sq, 1 );

        /* Undo the inversion to obtain the gain used for the residual energy */
        local_gains[ sf ] = SKP_DIV32( ( 1 << 16 ), invGains_Q16[ sf ] );
    }

    if( psEnc->sCmn.indices.signalType != TYPE_VOICED ) {
        /************/
        /* UNVOICED */
        /************/
        /* Length of one scaled segment: a subframe plus its LPC history */
        const SKP_int seg_len = psEnc->sCmn.subfr_length + psEnc->sCmn.predictLPCOrder;

        /* Build the signal with prepended subframes, scaled by the inverse gains */
        src = x - psEnc->sCmn.predictLPCOrder;
        dst = LPC_in_pre;
        for( sf = 0; sf < psEnc->sCmn.nb_subfr; sf++ ) {
            SKP_Silk_scale_copy_vector16( dst, src, invGains_Q16[ sf ], seg_len );
            dst += seg_len;
            src += psEnc->sCmn.subfr_length;
        }

        /* No long-term prediction for this frame */
        SKP_memset( psEncCtrl->LTPCoef_Q14, 0, psEnc->sCmn.nb_subfr * LTP_ORDER * sizeof( SKP_int16 ) );
        psEncCtrl->LTPredCodGain_Q7 = 0;
    } else {
        /**********/
        /* VOICED */
        /**********/
        SKP_assert( psEnc->sCmn.ltp_mem_length - psEnc->sCmn.predictLPCOrder >= psEncCtrl->pitchL[ 0 ] + LTP_ORDER / 2 );

        /* Estimate the LTP coefficients */
        SKP_Silk_find_LTP_FIX( psEncCtrl->LTPCoef_Q14, WLTP, &psEncCtrl->LTPredCodGain_Q7,
            res_pitch, psEncCtrl->pitchL, Wght_Q15, psEnc->sCmn.subfr_length,
            psEnc->sCmn.nb_subfr, psEnc->sCmn.ltp_mem_length, LTP_corrs_rshift );

        /* Quantize them */
        SKP_Silk_quant_LTP_gains( psEncCtrl->LTPCoef_Q14, psEnc->sCmn.indices.LTPIndex, &psEnc->sCmn.indices.PERIndex,
            WLTP, psEnc->sCmn.mu_LTP_Q9, psEnc->sCmn.LTPQuantLowComplexity, psEnc->sCmn.nb_subfr);

        /* Decide on the LTP scaling */
        SKP_Silk_LTP_scale_ctrl_FIX( psEnc, psEncCtrl );

        /* Produce the LTP residual */
        SKP_Silk_LTP_analysis_filter_FIX( LPC_in_pre, psEnc->x_buf + psEnc->sCmn.ltp_mem_length - psEnc->sCmn.predictLPCOrder,
            psEncCtrl->LTPCoef_Q14, psEncCtrl->pitchL, invGains_Q16, psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr, psEnc->sCmn.predictLPCOrder );
    }

    /* At this point LPC_in_pre holds the LTP-filtered input (voiced) or the plain scaled input (unvoiced) */
    TIC(FIND_LPC)
    SKP_Silk_find_LPC_FIX( NLSF_Q15, &psEnc->sCmn.indices.NLSFInterpCoef_Q2, psEnc->sCmn.prev_NLSFq_Q15,
        psEnc->sCmn.useInterpolatedNLSFs, psEnc->sCmn.first_frame_after_reset, psEnc->sCmn.predictLPCOrder,
        LPC_in_pre, psEnc->sCmn.subfr_length + psEnc->sCmn.predictLPCOrder, psEnc->sCmn.nb_subfr );
    TOC(FIND_LPC)

    /* Quantize the LSFs */
    TIC(PROCESS_LSFS)
    SKP_Silk_process_NLSFs( &psEnc->sCmn, psEncCtrl->PredCoef_Q12, NLSF_Q15, psEnc->sCmn.prev_NLSFq_Q15 );
    TOC(PROCESS_LSFS)

    /* Residual energy, computed with the quantized LPC coefficients */
    SKP_Silk_residual_energy_FIX( psEncCtrl->ResNrg, psEncCtrl->ResNrgQ, LPC_in_pre, psEncCtrl->PredCoef_Q12, local_gains,
        psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr, psEnc->sCmn.predictLPCOrder );

    /* Keep the NLSFs in the prediction struct: the next frame uses them for fluctuation reduction */
    SKP_memcpy( psEnc->sCmn.prev_NLSFq_Q15, NLSF_Q15, sizeof( psEnc->sCmn.prev_NLSFq_Q15 ) );
}
/* Decodes one frame (or conceals it when lost), post-processes the output */
/* and reports the frame length through pN. Always returns 0.              */
SKP_int SKP_Silk_decode_frame(
    SKP_Silk_decoder_state      *psDec,             /* I/O  Pointer to Silk decoder state               */
    ec_dec                      *psRangeDec,        /* I/O  Compressor data structure                   */
    SKP_int16                   pOut[],             /* O    Pointer to output speech frame              */
    SKP_int32                   *pN,                /* O    Pointer to size of output frame             */
    const SKP_int               nBytes,             /* I    Payload length                              */
    SKP_int                     lostFlag            /* I    0: no loss, 1 loss, 2 decode fec            */
)
{
    SKP_Silk_decoder_control sDecCtrl;
    SKP_int         k, L, keep_len;
    SKP_int         status = 0;
    SKP_int8        hdr_bits;
    SKP_int32       lbrr_sym;
    SKP_int         pulses[ MAX_FRAME_LENGTH ];

TIC(DECODE_FRAME)

    L = psDec->frame_length;
    sDecCtrl.LTP_scale_Q14 = 0;

    /* Sanity check on the frame length */
    SKP_assert( L > 0 && L <= MAX_FRAME_LENGTH );

    /***********************************************************/
    /* Payload header: only on the first call for this payload */
    /***********************************************************/
    if( lostFlag != PACKET_LOST && psDec->nFramesDecoded == 0 ) {
        /* The VAD flags and the LBRR flag are taken from the first payload byte */
        hdr_bits = SKP_RSHIFT( psRangeDec->buf[ 0 ], 7 - psDec->nFramesPerPacket ) &
            ( SKP_LSHIFT( 1, psDec->nFramesPerPacket + 1 ) - 1 );
        psDec->LBRR_flag = hdr_bits & 1;
        k = psDec->nFramesPerPacket;
        while( k-- > 0 ) {
            hdr_bits = SKP_RSHIFT( hdr_bits, 1 );
            psDec->VAD_flags[ k ] = hdr_bits & 1;
        }
        /* Advance the range decoder by one symbol per flag read above */
        for( k = 0; k < psDec->nFramesPerPacket + 1; k++ ) {
            ec_dec_icdf( psRangeDec, SKP_Silk_uniform2_iCDF, 8 );
        }

        /* Per-frame LBRR flags */
        SKP_memset( psDec->LBRR_flags, 0, sizeof( psDec->LBRR_flags ) );
        if( psDec->LBRR_flag ) {
            if( psDec->nFramesPerPacket != 1 ) {
                lbrr_sym = ec_dec_icdf( psRangeDec, SKP_Silk_LBRR_flags_iCDF_ptr[ psDec->nFramesPerPacket - 2 ], 8 ) + 1;
                for( k = 0; k < psDec->nFramesPerPacket; k++ ) {
                    psDec->LBRR_flags[ k ] = SKP_RSHIFT( lbrr_sym, k ) & 1;
                }
            } else {
                psDec->LBRR_flags[ 0 ] = 1;
            }
        }

        if( lostFlag == DECODE_NORMAL ) {
            /* Regular decoding: read past all LBRR data */
            for( k = 0; k < psDec->nFramesPerPacket; k++ ) {
                if( !psDec->LBRR_flags[ k ] ) {
                    continue;
                }
                SKP_Silk_decode_indices( psDec, psRangeDec, k, 1 );
                SKP_Silk_decode_pulses( psRangeDec, pulses, psDec->indices.signalType,
                    psDec->indices.quantOffsetType, psDec->frame_length );
            }
        }

    }

    /* LBRR data requested but absent for this frame: handle it as a lost frame */
    if( lostFlag == DECODE_LBRR && psDec->LBRR_flags[ psDec->nFramesDecoded ] == 0 ) {
        lostFlag = PACKET_LOST;
        psDec->nFramesDecoded++;
    }

    if( lostFlag == PACKET_LOST ) {
        /* Packet loss: extrapolate the output */
        SKP_Silk_PLC( psDec, &sDecCtrl, pOut, L, 1 );
    } else {
        /* Quantization indices of the side info */
TIC(decode_indices)
        SKP_Silk_decode_indices( psDec, psRangeDec, psDec->nFramesDecoded, lostFlag );
TOC(decode_indices)

        /* Quantization indices of the excitation */
TIC(decode_pulses)
        SKP_Silk_decode_pulses( psRangeDec, pulses, psDec->indices.signalType,
                psDec->indices.quantOffsetType, psDec->frame_length );
TOC(decode_pulses)

        /* Parameters and pulse signal */
TIC(decode_params)
        SKP_Silk_decode_parameters( psDec, &sDecCtrl );
TOC(decode_params)

        /* Re-read the length: the sampling frequency may have changed */
        L = psDec->frame_length;

        /* Inverse NSQ */
TIC(decode_core)
        SKP_Silk_decode_core( psDec, &sDecCtrl, pOut, pulses );
TOC(decode_core)

        /* Let the PLC track the good frame */
        SKP_Silk_PLC( psDec, &sDecCtrl, pOut, L, 0 );

        psDec->lossCnt = 0;
        psDec->prevSignalType = psDec->indices.signalType;
        SKP_assert( psDec->prevSignalType >= 0 && psDec->prevSignalType <= 2 );

        /* One more frame decoded without errors */
        psDec->first_frame_after_reset = 0;
        psDec->nFramesDecoded++;
    }

    /*******************************************************************/
    /* Output buffer: drop the oldest frame and append the new one     */
    /*******************************************************************/
    SKP_assert( psDec->ltp_mem_length >= psDec->frame_length );
    keep_len = psDec->ltp_mem_length - psDec->frame_length;
    SKP_memmove( psDec->outBuf, &psDec->outBuf[ psDec->frame_length ], keep_len * sizeof(SKP_int16) );
    SKP_memcpy( &psDec->outBuf[ keep_len ], pOut, psDec->frame_length * sizeof( SKP_int16 ) );

    /* Smooth the transition between extrapolated and good frames */
    SKP_Silk_PLC_glue_frames( psDec, &sDecCtrl, pOut, L );

    /* Comfort noise generation / estimation */
    SKP_Silk_CNG( psDec, &sDecCtrl, pOut, L );

    /* High-pass filter the output in place */
TIC(HP_out)
    SKP_Silk_biquad_alt( pOut, psDec->HP_B, psDec->HP_A, psDec->HPState, pOut, L );
TOC(HP_out)

    /* Remember the last pitch lag */
    psDec->lagPrev = sDecCtrl.pitchL[ psDec->nb_subfr - 1 ];

    /* Report the output frame length */
    *pN = ( SKP_int16 )L;

TOC(DECODE_FRAME)

    return status;
}
void
psgstrf_panel_bmod(
		   const int  pnum, /* process number */
		   const int  m,    /* number of rows in the matrix */
		   const int  w,    /* current panel width */
		   const int  jcol, /* leading column of the current panel */
		   const int  bcol, /* first column of the farthest busy snode*/
		   int   *inv_perm_r,/* in; inverse of the row pivoting */
		   int   *etree,     /* in */
		   int   *nseg,      /* modified */
		   int   *segrep,    /* modified */
		   int   *repfnz,    /* modified, size n-by-w */
		   int   *panel_lsub,/* modified */
		   int   *w_lsub_end,/* modified */
		   int   *spa_marker,/* modified; size n-by-w */
		   float *dense, /* modified, size n-by-w */
		   float *tempv, /* working array - zeros on input/output */
		   pxgstrf_shared_t *pxgstrf_shared /* modified */
		   )
{
/*
 * -- SuperLU MT routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley,
 * and Xerox Palo Alto Research Center.
 * September 10, 2007
 *
 * Purpose
 * =======
 *
 *    Performs numeric block updates (sup-panel) in topological order.
 *    It features combined 1D and 2D blocking of the source updating s-node.
 *    It consists of two steps:
 *       (1) accumulates updates from "done" s-nodes.
 *       (2) accumulates updates from "busy" s-nodes.
 *
 *    Before entering this routine, the nonzeros of the original A in
 *    this panel were already copied into the SPA dense[n,w].
 *
 * Updated/Output arguments
 * ========================
 *    L[*,j:j+w-1] and U[*,j:j+w-1] are returned collectively in the
 *    m-by-w vector dense[*,w]. The locations of nonzeros in L[*,j:j+w-1]
 *    are given by lsub[*] and U[*,j:j+w-1] by (nseg,segrep,repfnz).
 *
 */
    GlobalLU_t *Glu = pxgstrf_shared->Glu;  /* modified */
    Gstat_t *Gstat = pxgstrf_shared->Gstat; /* modified */
    register int j, k, ksub;
    register int fsupc, nsupc, nsupr, nrow;
    register int kcol, krep, ksupno, dadsupno;
    register int jj;	      /* index through each column in the panel */
    int          *xsup, *xsup_end, *supno;
    int          *lsub, *xlsub, *xlsub_end;
    int          *repfnz_col; /* repfnz[] for a column in the panel */
    float       *dense_col;  /* dense[] for a column in the panel */
    int          *col_marker; /* each column of the spa_marker[*,w] */
    int          *col_lsub;   /* each column of the panel_lsub[*,w] */
    static   int first = 1, rowblk, colblk;

#ifdef PROFILE
    double   t1, t2; /* temporary time */
#endif
    
#ifdef PREDICT_OPT    
    register float pmod, max_child_eft = 0, sum_pmod = 0, min_desc_eft = 0;
    register float pmod_eft;
    register int   kid, ndesc = 0;
#endif
    
#if ( DEBUGlevel>=2 )
    int dbg_addr = 0*m;
#endif
    
    if ( first ) {
	rowblk   = sp_ienv(4);
	colblk   = sp_ienv(5);
	first = 0;
    }
    
    xsup      = Glu->xsup;
    xsup_end  = Glu->xsup_end;
    supno     = Glu->supno;
    lsub      = Glu->lsub;
    xlsub     = Glu->xlsub;
    xlsub_end = Glu->xlsub_end;

#if ( DEBUGlevel>=2 )
    /*if (jcol >= LOCOL && jcol <= HICOL)
    check_panel_dfs_list(pnum, "begin", jcol, *nseg, segrep);*/
if (jcol == BADPAN)
    printf("(%d) Enter psgstrf_panel_bmod() jcol %d,BADCOL %d,dense_col[%d] %.10f\n",
	   pnum, jcol, BADCOL, BADROW, dense[dbg_addr+BADROW]);
#endif    

    /* --------------------------------------------------------------------
       For each non-busy supernode segment of U[*,jcol] in topological order,
       perform sup-panel update.
       -------------------------------------------------------------------- */
    k = *nseg - 1;
    for (ksub = 0; ksub < *nseg; ++ksub) {
	/*
	 * krep = representative of current k-th supernode
	 * fsupc = first supernodal column
	 * nsupc = no of columns in a supernode
	 * nsupr = no of rows in a supernode
	 */
        krep = segrep[k--];
	fsupc = xsup[supno[krep]];
	nsupc = krep - fsupc + 1;
	nsupr = xlsub_end[fsupc] - xlsub[fsupc];
	nrow = nsupr - nsupc;

#ifdef PREDICT_OPT
	pmod = Gstat->procstat[pnum].fcops;
#endif
	    
	if ( nsupc >= colblk && nrow >= rowblk ) {
	    /* 2-D block update */
#ifdef GEMV2
	    psgstrf_bmod2D_mv2(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, 
			       nrow, repfnz, panel_lsub, w_lsub_end, 
			       spa_marker, dense, tempv, Glu, Gstat);
#else
	    psgstrf_bmod2D(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow,
			   repfnz, panel_lsub, w_lsub_end, spa_marker,
			   dense, tempv, Glu, Gstat);
#endif
	} else {
	    /* 1-D block update */
#ifdef GEMV2
	    psgstrf_bmod1D_mv2(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr,
			       nrow, repfnz, panel_lsub, w_lsub_end, 
			       spa_marker, dense, tempv, Glu, Gstat);
#else
	    psgstrf_bmod1D(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow,
			   repfnz, panel_lsub, w_lsub_end, spa_marker,
			   dense, tempv, Glu, Gstat);
#endif
	}
	
#ifdef PREDICT_OPT
	pmod = Gstat->procstat[pnum].fcops - pmod;
	kid = (Glu->pan_status[krep].size > 0) ?
	    krep : (krep + Glu->pan_status[krep].size);
	desc_eft[ndesc].eft = cp_panel[kid].est + cp_panel[kid].pdiv;
	desc_eft[ndesc++].pmod = pmod;
#endif
	
#if ( DEBUGlevel>=2 )
if (jcol == BADPAN)
    printf("(%d) non-busy update: krep %d, repfnz %d, dense_col[%d] %.10e\n",
	   pnum, krep, repfnz[dbg_addr+krep], BADROW, dense[dbg_addr+BADROW]);
#endif

    } /* for each updating supernode ... */
    
#if ( DEBUGlevel>=2 )
if (jcol == BADPAN)
    printf("(%d) After non-busy update: dense_col[%d] %.10e\n",
	   pnum, BADROW, dense[dbg_addr+BADROW]);
#endif
    
    /* ---------------------------------------------------------------------
     * Now wait for the "busy" s-nodes to become "done" -- this amounts to
     * climbing up the e-tree along the path starting from "bcol".
     * Several points are worth noting:
     *
     *  (1) There are two possible relations between supernodes and panels
     *      along the path of the e-tree:
     *      o |s-node| < |panel|
     *        want to climb up the e-tree one column at a time in order
     *        to achieve more concurrency
     *      o |s-node| > |panel|
     *        want to climb up the e-tree one panel at a time; this
     *        processor is stalled anyway while waiting for the panel.
     *
     *  (2) Need to accommodate new fills, append them in panel_lsub[*,w].
     *      o use an n-by-w marker array, as part of the SPA (not scalable!)
     *
     *  (3) Symbolically, need to find out repfnz[S, w], for each (busy)
     *      supernode S.
     *      o use dense[inv_perm_r[kcol]], filter all zeros
     *      o detect the first nonzero in each segment
     *        (at this moment, the boundary of the busy supernode/segment
     *         S has already been identified)
     *
     * --------------------------------------------------------------------- */

    kcol = bcol;
    while ( kcol < jcol ) {
        /* Pointers to each column of the w-wide arrays. */
	repfnz_col = repfnz;
	dense_col = dense;
	col_marker = spa_marker;
	col_lsub = panel_lsub;

	/* Wait for the supernode, and collect wait-time statistics. */
	if ( pxgstrf_shared->spin_locks[kcol] ) {
#ifdef PROFILE
	    TIC(t1);
#endif
	    await( &pxgstrf_shared->spin_locks[kcol] );

#ifdef PROFILE
	    TOC(t2, t1);
	    Gstat->panstat[jcol].pipewaits++;
	    Gstat->panstat[jcol].spintime += t2;
	    Gstat->procstat[pnum].spintime += t2;
#ifdef DOPRINT
	    PRINT_SPIN_TIME(1);
#endif
#endif		
	}
	
        /* Find leading column "fsupc" in the supernode that
           contains column "kcol" */
	ksupno = supno[kcol];
	fsupc = kcol;

#if ( DEBUGlevel>=2 )
	/*if (jcol >= LOCOL && jcol <= HICOL)    */
  if ( jcol==BADCOL )
    printf("(%d) psgstrf_panel_bmod[1] kcol %d, ksupno %d, fsupc %d\n",
	   pnum, kcol, ksupno, fsupc);
#endif
	
	/* Wait for the whole supernode to become "done" --
	   climb up e-tree one column at a time */
	do {
	    krep = SUPER_REP( ksupno );
	    kcol = etree[kcol];
	    if ( kcol >= jcol ) break;
	    if ( pxgstrf_shared->spin_locks[kcol] ) {
#ifdef PROFILE
		TIC(t1);
#endif
		await ( &pxgstrf_shared->spin_locks[kcol] );

#ifdef PROFILE
		TOC(t2, t1);
		Gstat->panstat[jcol].pipewaits++;
		Gstat->panstat[jcol].spintime += t2;
		Gstat->procstat[pnum].spintime += t2;
#ifdef DOPRINT
		PRINT_SPIN_TIME(2);
#endif
#endif		
	    }

	    dadsupno = supno[kcol];

#if ( DEBUGlevel>=2 )
	    /*if (jcol >= LOCOL && jcol <= HICOL)*/
if ( jcol==BADCOL )
    printf("(%d) psgstrf_panel_bmod[2] krep %d, dad=kcol %d, dadsupno %d\n",
	   pnum, krep, kcol, dadsupno);
#endif	    

	} while ( dadsupno == ksupno );

	/* Append the new segment into segrep[*]. After column_bmod(),
	   copy_to_ucol() will use them. */
	segrep[*nseg] = krep;
        ++(*nseg);
        
	/* Determine repfnz[krep, w] for each column in the panel */
	for (jj = jcol; jj < jcol + w; ++jj, dense_col += m, 
	       repfnz_col += m, col_marker += m, col_lsub += m) {
	    /*
	     * Note: relaxed supernode may not form a path on the e-tree,
	     *       but its column numbers are contiguous.
	     */
#ifdef SCATTER_FOUND
 	    for (kcol = fsupc; kcol <= krep; ++kcol) {
		if ( col_marker[inv_perm_r[kcol]] == jj ) {
		    repfnz_col[krep] = kcol;

 		    /* Append new fills in panel_lsub[*,jj]. */
		    j = w_lsub_end[jj - jcol];
/*#pragma ivdep*/
		    for (k = xlsub[krep]; k < xlsub_end[krep]; ++k) {
			ksub = lsub[k];
			if ( col_marker[ksub] != jj ) {
			    col_marker[ksub] = jj;
			    col_lsub[j++] = ksub;
			}
		    }
		    w_lsub_end[jj - jcol] = j;

		    break; /* found the leading nonzero in the segment */
		}
	    }

#else
	    for (kcol = fsupc; kcol <= krep; ++kcol) {
                if ( dense_col[inv_perm_r[kcol]] != 0.0 ) {
		    repfnz_col[krep] = kcol;
		    break; /* Found the leading nonzero in the U-segment */
		}
	    }

	    /* In this case, we always treat the L-subscripts of the 
	       busy s-node [kcol : krep] as the new fills, even if the
	       corresponding U-segment may be all zero. */

	    /* Append new fills in panel_lsub[*,jj]. */
	    j = w_lsub_end[jj - jcol];
/*#pragma ivdep*/
	    for (k = xlsub[krep]; k < xlsub_end[krep]; ++k) {
	        ksub = lsub[k];
		if ( col_marker[ksub] != jj ) {
		    col_marker[ksub] = jj;
		    col_lsub[j++] = ksub;
		}
	    }
	    w_lsub_end[jj - jcol] = j;
#endif

#if ( DEBUGlevel>=2 )
if (jj == BADCOL) {
printf("(%d) psgstrf_panel_bmod[fills]: jj %d, repfnz_col[%d] %d, inv_pr[%d] %d\n",
	   pnum, jj, krep, repfnz_col[krep], fsupc, inv_perm_r[fsupc]);
printf("(%d) psgstrf_panel_bmod[fills] xlsub %d, xlsub_end %d, #lsub[%d] %d\n",
       pnum,xlsub[krep],xlsub_end[krep],krep, xlsub_end[krep]-xlsub[krep]);
}
#endif	   
	} /* for jj ... */

#ifdef PREDICT_OPT
	pmod = Gstat->procstat[pnum].fcops;
#endif
	
	/* Perform sup-panel updates - use combined 1D + 2D updates. */
	nsupc = krep - fsupc + 1;
	nsupr = xlsub_end[fsupc] - xlsub[fsupc];
	nrow = nsupr - nsupc;
	if ( nsupc >= colblk && nrow >= rowblk ) {
	    /* 2-D block update */
#ifdef GEMV2
	    psgstrf_bmod2D_mv2(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr,
			       nrow, repfnz, panel_lsub, w_lsub_end, 
			       spa_marker, dense, tempv, Glu, Gstat);
#else
	    psgstrf_bmod2D(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow,
			   repfnz, panel_lsub, w_lsub_end, spa_marker,
			   dense, tempv, Glu, Gstat);
#endif
	} else {
	    /* 1-D block update */
#ifdef GEMV2
	    psgstrf_bmod1D_mv2(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr,
			       nrow, repfnz, panel_lsub, w_lsub_end, 
			       spa_marker, dense, tempv, Glu, Gstat);
#else
	    psgstrf_bmod1D(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow,
			   repfnz, panel_lsub, w_lsub_end, spa_marker,
			   dense, tempv, Glu, Gstat);
#endif
	}

#ifdef PREDICT_OPT
	pmod = Gstat->procstat[pnum].fcops - pmod;
	kid = (pxgstrf_shared->pan_status[krep].size > 0) ?
	       krep : (krep + pxgstrf_shared->pan_status[krep].size);
	desc_eft[ndesc].eft = cp_panel[kid].est + cp_panel[kid].pdiv;
	desc_eft[ndesc++].pmod = pmod;
#endif
	
#if ( DEBUGlevel>=2 )
if (jcol == BADPAN)
    printf("(%d) After busy update: dense_col[%d] %.10f\n",
	   pnum, BADROW, dense[dbg_addr+BADROW]);
#endif
	
	/* Go to the parent of "krep" */
	kcol = etree[krep];
	
    } /* while kcol < jcol ... */
    
#if ( DEBUGlevel>=2 )
    /*if (jcol >= LOCOL && jcol <= HICOL)*/
if ( jcol==BADCOL )
    check_panel_dfs_list(pnum, "after-busy", jcol, *nseg, segrep);
#endif

#ifdef PREDICT_OPT
    qsort(desc_eft, ndesc, sizeof(desc_eft_t), (int(*)())numcomp);
    pmod_eft = 0;
    for (j = 0; j < ndesc; ++j) {
	pmod_eft = SUPERLU_MAX( pmod_eft, desc_eft[j].eft ) + desc_eft[j].pmod;
    }

    if ( ndesc == 0 ) {
	/* No modifications from descendants */
	pmod_eft = 0;
	for (j = cp_firstkid[jcol]; j != EMPTY; j = cp_nextkid[j]) {
	    kid = (pxgstrf_shared->pan_status[j].size > 0) ? 
			j : (j + pxgstrf_shared->pan_status[j].size);
	    pmod_eft = SUPERLU_MAX( pmod_eft,
			   	cp_panel[kid].est + cp_panel[kid].pdiv );
	}
    }
    
    cp_panel[jcol].est = pmod_eft;
    
#endif

}
/*
 * Limit, stabilize, convert and quantize the NLSFs of the current frame.
 *
 * Processing order:
 *   1. Derive the two mu values handed to the MSVQ encoder from the speech
 *      activity (plus the sparseness in the non-voiced branch).
 *   2. Compute the NLSF weights; when interpolation is active, mix in the
 *      weights of the interpolated first-half vector.
 *   3. Quantize pNLSF_Q15 using the codebook selected by the signal type.
 *   4. Convert to LPC coefficients: PredCoef_Q12[ 1 ] from pNLSF_Q15, and
 *      PredCoef_Q12[ 0 ] either from the interpolated vector or as a copy
 *      of PredCoef_Q12[ 1 ].
 */
void SKP_Silk_process_NLSFs_FIX(
    SKP_Silk_encoder_state_FIX      *psEnc,             /* I/O  Encoder state FIX                           */
    SKP_Silk_encoder_control_FIX    *psEncCtrl,         /* I/O  Encoder control FIX                         */
    SKP_int                         *pNLSF_Q15          /* I/O  Normalized LSFs (quant out) (0 - (2^15-1))  */
)
{
    const SKP_Silk_NLSF_CB_struct   *codebook;
    const SKP_int   lpcOrder = psEnc->sCmn.predictLPCOrder;
    SKP_int         weights_Q6[ MAX_LPC_ORDER ];
    SKP_int         mu_Q15, muFlucRed_Q16;
    SKP_int         interpCoef_Q2, useInterp;
    SKP_int32       coefSqr_Q15;
    SKP_int         idx;

    /* Scratch vectors, touched only when interpolation is active */
    SKP_int         firstHalfNLSF_Q15[ MAX_LPC_ORDER ];
    SKP_int         firstHalfWeights_Q6[ MAX_LPC_ORDER ];

    SKP_assert( psEnc->speech_activity_Q8 >=   0 );
    SKP_assert( psEnc->speech_activity_Q8 <= 256 );
    SKP_assert( psEncCtrl->sparseness_Q8  >=   0 );
    SKP_assert( psEncCtrl->sparseness_Q8  <= 256 );
    SKP_assert( psEncCtrl->sCmn.sigtype == SIG_TYPE_VOICED || psEncCtrl->sCmn.sigtype == SIG_TYPE_UNVOICED );

    /* ------------------------------------------------------------ */
    /* mu values: one parameter set per signal type                 */
    /* ------------------------------------------------------------ */
    if( psEncCtrl->sCmn.sigtype != SIG_TYPE_VOICED ) {
        /* mu          = 0.005f - 0.004f * speech_activity                      */
        /* mu_fluc_red = 0.2f   - 0.1f   * speech_activity - 0.1f * sparseness  */
        mu_Q15        = SKP_SMLAWB(   164,   -33554, psEnc->speech_activity_Q8 );
        muFlucRed_Q16 = SKP_SMLAWB( 13107, -1677696, psEnc->speech_activity_Q8 + psEncCtrl->sparseness_Q8 );
    } else {
        /* mu          = 0.002f - 0.001f * speech_activity */
        /* mu_fluc_red = 0.1f   - 0.05f  * speech_activity */
        mu_Q15        = SKP_SMLAWB(   66,   -8388, psEnc->speech_activity_Q8 );
        muFlucRed_Q16 = SKP_SMLAWB( 6554, -838848, psEnc->speech_activity_Q8 );
    }
    SKP_assert( mu_Q15        >= 0     );
    SKP_assert( mu_Q15        <= 164   );
    SKP_assert( muFlucRed_Q16 >= 0     );
    SKP_assert( muFlucRed_Q16 <= 13107 );

    /* Clamp mu from below at 1 */
    if( mu_Q15 < 1 ) {
        mu_Q15 = 1;
    }

    /* ------------------------------------------------------------ */
    /* NLSF weights                                                 */
    /* ------------------------------------------------------------ */
    TIC(NLSF_weights_FIX)
    SKP_Silk_NLSF_VQ_weights_laroia( weights_Q6, pNLSF_Q15, lpcOrder );
    TOC(NLSF_weights_FIX)

    /* Interpolation applies only when it is enabled and the coefficient is below 1 << 2 */
    interpCoef_Q2 = psEncCtrl->sCmn.NLSFInterpCoef_Q2;
    useInterp     = 0;
    if( psEnc->sCmn.useInterpolatedNLSFs == 1 && interpCoef_Q2 < ( 1 << 2 ) ) {
        useInterp = 1;
    }

    if( useInterp ) {
        /* First-half NLSF vector, interpolated from the previous quantized NLSFs */
        SKP_Silk_interpolate( firstHalfNLSF_Q15, psEnc->sPred.prev_NLSFq_Q15, pNLSF_Q15,
            interpCoef_Q2, lpcOrder );

        /* Weights belonging to that first-half vector */
        TIC(NLSF_weights_FIX)
        SKP_Silk_NLSF_VQ_weights_laroia( firstHalfWeights_Q6, firstHalfNLSF_Q15, lpcOrder );
        TOC(NLSF_weights_FIX)

        /* Halve the current weights and add the first-half weights scaled by the squared coefficient */
        coefSqr_Q15 = SKP_LSHIFT( SKP_SMULBB( interpCoef_Q2, interpCoef_Q2 ), 11 );
        for( idx = 0; idx < lpcOrder; idx++ ) {
            const SKP_int halved_Q6 = SKP_RSHIFT( weights_Q6[ idx ], 1 );

            weights_Q6[ idx ] = SKP_SMLAWB( halved_Q6, firstHalfWeights_Q6[ idx ], coefSqr_Q15 );
            SKP_assert( weights_Q6[ idx ] <= SKP_int16_MAX );
            SKP_assert( weights_Q6[ idx ] >= 1 );
        }
    }

    /* ------------------------------------------------------------ */
    /* Quantization                                                 */
    /* ------------------------------------------------------------ */
    /* Codebook is indexed by the signal type of the current frame */
    codebook = psEnc->sCmn.psNLSF_CB[ psEncCtrl->sCmn.sigtype ];

    TIC(MSVQ_encode_FIX)
    SKP_Silk_NLSF_MSVQ_encode_FIX( psEncCtrl->sCmn.NLSFIndices, pNLSF_Q15, codebook,
        psEnc->sPred.prev_NLSFq_Q15, weights_Q6, mu_Q15, muFlucRed_Q16,
        psEnc->sCmn.NLSF_MSVQ_Survivors, lpcOrder, psEnc->sCmn.first_frame_after_reset );
    TOC(MSVQ_encode_FIX)

    /* ------------------------------------------------------------ */
    /* Back to LPC coefficients                                     */
    /* ------------------------------------------------------------ */
    /* Second half comes straight from the quantized NLSFs */
    SKP_Silk_NLSF2A_stable( psEncCtrl->PredCoef_Q12[ 1 ], pNLSF_Q15, lpcOrder );

    if( !useInterp ) {
        /* No interpolation: the first half reuses the second-half coefficients */
        SKP_memcpy( psEncCtrl->PredCoef_Q12[ 0 ], psEncCtrl->PredCoef_Q12[ 1 ], lpcOrder * sizeof( SKP_int16 ) );
    } else {
        /* Redo the interpolation, this time against the quantized NLSFs */
        SKP_Silk_interpolate( firstHalfNLSF_Q15, psEnc->sPred.prev_NLSFq_Q15, pNLSF_Q15,
            interpCoef_Q2, lpcOrder );

        /* First half comes from the interpolated, quantized vector */
        SKP_Silk_NLSF2A_stable( psEncCtrl->PredCoef_Q12[ 0 ], firstHalfNLSF_Q15, lpcOrder );
    }
}