void CalcSliceComplexRatio (void* pRatio, SDqLayer* pCurDq, uint32_t* pSliceConsume) {
  SSliceCtx* pSliceCtx          = &pCurDq->sSliceEncCtx;
  int32_t* pRatioList           = (int32_t*)pRatio;
  int32_t iAvI[MAX_SLICES_NUM];
  int32_t iSumAv                = 0;
  uint32_t* pSliceTime          = (uint32_t*)pSliceConsume;
  int32_t* pCountMbInSlice      = (int32_t*)pSliceCtx->pCountMbNumInSlice;
  const int32_t kiSliceCount    = pSliceCtx->iSliceNumInFrame;
  int32_t iSliceIdx             = 0;

  WelsEmms();

  while (iSliceIdx < kiSliceCount) {
    iAvI[iSliceIdx] = WELS_DIV_ROUND (INT_MULTIPLY * pCountMbInSlice[iSliceIdx], pSliceTime[iSliceIdx]);
    MT_TRACE_LOG (NULL, WELS_LOG_DEBUG, "[MT] CalcSliceComplexRatio(), pSliceConsumeTime[%d]= %d us, slice_run= %d",
                  iSliceIdx,
                  pSliceTime[iSliceIdx], pCountMbInSlice[iSliceIdx]);
    iSumAv += iAvI[iSliceIdx];

    ++ iSliceIdx;
  }
  while (-- iSliceIdx >= 0) {
    pRatioList[iSliceIdx] = WELS_DIV_ROUND (INT_MULTIPLY * iAvI[iSliceIdx], iSumAv);
  }
}
int32_t NeedDynamicAdjust (SSlice* pSliceInLayer, const int32_t iSliceNum) {
  if ( NULL == pSliceInLayer )
    return false;

  uint32_t uiTotalConsume       = 0;
  int32_t iSliceIdx             = 0;
  int32_t iNeedAdj              = false;

  WelsEmms();

  while (iSliceIdx < iSliceNum) {
    uiTotalConsume += pSliceInLayer[iSliceIdx].uiSliceConsumeTime;
    iSliceIdx ++;
  }
  if (uiTotalConsume == 0) {
    MT_TRACE_LOG (NULL, WELS_LOG_DEBUG,
                  "[MT] NeedDynamicAdjust(), herein do no adjust due first picture, iCountSliceNum= %d",
                  iSliceNum);
    return false;
  }

  iSliceIdx = 0;
  float fThr                    = EPSN; // threshold for various cores cases
  float fRmse                   = .0f;  // root mean square error of pSlice consume ratios
  const float kfMeanRatio       = 1.0f / iSliceNum;
  do {
    const float fRatio = 1.0f * pSliceInLayer[iSliceIdx].uiSliceConsumeTime / uiTotalConsume;
    const float fDiffRatio = fRatio - kfMeanRatio;
    fRmse += (fDiffRatio * fDiffRatio);
    ++ iSliceIdx;
  } while (iSliceIdx + 1 < iSliceNum);
  fRmse = sqrtf (fRmse / iSliceNum);
  if (iSliceNum >= 8) {
    fThr += THRESHOLD_RMSE_CORE8;
  } else if (iSliceNum >= 4) {
    fThr += THRESHOLD_RMSE_CORE4;
  } else if (iSliceNum >= 2) {
    fThr += THRESHOLD_RMSE_CORE2;
  } else
    fThr = 1.0f;
  if (fRmse > fThr)
    iNeedAdj = true;
  MT_TRACE_LOG (NULL, WELS_LOG_DEBUG,
                "[MT] NeedDynamicAdjust(), herein adjustment decision is made (iNeedAdj= %d) by: fRmse of pSlice complexity ratios %.6f, the corresponding threshold %.6f, iCountSliceNum %d",
                iNeedAdj, fRmse, fThr, iSliceNum);

  return iNeedAdj;
}
void CalcSliceComplexRatio (SDqLayer* pCurDq) {
  SSliceCtx* pSliceCtx          = &pCurDq->sSliceEncCtx;
  SSlice*    pSliceInLayer      = pCurDq->sLayerInfo.pSliceInLayer;
  int32_t iSumAv                = 0;
  const int32_t kiSliceCount    = pSliceCtx->iSliceNumInFrame;
  int32_t iSliceIdx             = 0;
  int32_t iAvI[MAX_SLICES_NUM];

  WelsEmms();

  while (iSliceIdx < kiSliceCount) {
    iAvI[iSliceIdx] = WELS_DIV_ROUND (INT_MULTIPLY * pSliceInLayer[iSliceIdx].iCountMbNumInSlice,
                                      pSliceInLayer[iSliceIdx].uiSliceConsumeTime);
    MT_TRACE_LOG (NULL, WELS_LOG_DEBUG, "[MT] CalcSliceComplexRatio(), uiSliceConsumeTime[%d]= %d us, slice_run= %d",
                  iSliceIdx,
                  pSliceInLayer[iSliceIdx].uiSliceConsumeTime, pSliceInLayer[iSliceIdx].iCountMbNumInSlice);
    iSumAv += iAvI[iSliceIdx];

    ++ iSliceIdx;
  }
  while (-- iSliceIdx >= 0) {
    pSliceInLayer[iSliceIdx].iSliceComplexRatio = WELS_DIV_ROUND (INT_MULTIPLY * iAvI[iSliceIdx], iSumAv);
  }
}
void DynamicAdjustSlicing (sWelsEncCtx* pCtx,
                           SDqLayer* pCurDqLayer,
                           void* pComplexRatio,
                           int32_t iCurDid) {
  SSliceCtx* pSliceCtx	= pCurDqLayer->pSliceEncCtx;
  const int32_t kiCountSliceNum	= pSliceCtx->iSliceNumInFrame;
  const int32_t kiCountNumMb		= pSliceCtx->iMbNumInFrame;
  int32_t iMinimalMbNum			= pSliceCtx->iMbWidth;	// in theory we need only 1 SMB, here let it as one SMB row required
  int32_t iMaximalMbNum			= 0;	// dynamically assign later
  int32_t* pSliceComplexRatio	= (int32_t*)pComplexRatio;
  int32_t iMbNumLeft					= kiCountNumMb;
  int32_t iRunLen[MAX_THREADS_NUM]	= {0};
  int32_t iSliceIdx					= 0;

  int32_t iNumMbInEachGom = 0;
  SWelsSvcRc* pWelsSvcRc = &pCtx->pWelsSvcRc[iCurDid];
  if (pCtx->pSvcParam->iRCMode != RC_OFF_MODE) {
    iNumMbInEachGom = pWelsSvcRc->iNumberMbGom;

    if (iNumMbInEachGom <= 0) {
      WelsLog (& (pCtx->sLogCtx), WELS_LOG_ERROR,
               "[MT] DynamicAdjustSlicing(), invalid iNumMbInEachGom= %d from RC, iDid= %d, iCountNumMb= %d", iNumMbInEachGom,
               iCurDid, kiCountNumMb);
      return;
    }

    // do not adjust in case no extra iNumMbInEachGom based left for slicing adjustment,
    // extra MB of non integrated GOM assigned at the last pSlice in default, keep up on early initial result.
    if (iNumMbInEachGom * kiCountSliceNum >= kiCountNumMb) {
      return;
    }
    iMinimalMbNum	= iNumMbInEachGom;
  }

  if (kiCountSliceNum < 2 || (kiCountSliceNum & 0x01))	// we need suppose uiSliceNum is even for multiple threading
    return;

  iMaximalMbNum	= kiCountNumMb - (kiCountSliceNum - 1) * iMinimalMbNum;

  WelsEmms();

  MT_TRACE_LOG (pCtx, WELS_LOG_DEBUG, "[MT] DynamicAdjustSlicing(), iDid= %d, iCountNumMb= %d", iCurDid, kiCountNumMb);

  iSliceIdx	= 0;
  while (iSliceIdx + 1 < kiCountSliceNum) {
    int32_t iNumMbAssigning = WELS_DIV_ROUND (kiCountNumMb * pSliceComplexRatio[iSliceIdx], INT_MULTIPLY);

    // GOM boundary aligned
    if (pCtx->pSvcParam->iRCMode != RC_OFF_MODE) {
      iNumMbAssigning = iNumMbAssigning / iNumMbInEachGom * iNumMbInEachGom;
    }

    // make sure one GOM at least in each pSlice for safe
    if (iNumMbAssigning < iMinimalMbNum)
      iNumMbAssigning	= iMinimalMbNum;
    else if (iNumMbAssigning > iMaximalMbNum)
      iNumMbAssigning	= iMaximalMbNum;

    assert (iNumMbAssigning > 0);

    iMbNumLeft -= iNumMbAssigning;
    if (iMbNumLeft <= 0) {	// error due to we can not support slice_skip now yet, do not adjust this time
      assert (0);
      return;
    }
    iRunLen[iSliceIdx]	= iNumMbAssigning;
    MT_TRACE_LOG (pCtx, WELS_LOG_DEBUG,
                  "[MT] DynamicAdjustSlicing(), uiSliceIdx= %d, pSliceComplexRatio= %.2f, slice_run_org= %d, slice_run_adj= %d",
                  iSliceIdx, pSliceComplexRatio[iSliceIdx] * 1.0f / INT_MULTIPLY, pSliceCtx->pCountMbNumInSlice[iSliceIdx],
                  iNumMbAssigning);
    ++ iSliceIdx;
    iMaximalMbNum	= iMbNumLeft - (kiCountSliceNum - iSliceIdx - 1) * iMinimalMbNum;	// get maximal num_mb in left parts
  }
  iRunLen[iSliceIdx] = iMbNumLeft;
  MT_TRACE_LOG (pCtx, WELS_LOG_DEBUG,
                "[MT] DynamicAdjustSlicing(), iSliceIdx= %d, pSliceComplexRatio= %.2f, slice_run_org= %d, slice_run_adj= %d",
                iSliceIdx, pSliceComplexRatio[iSliceIdx] * 1.0f / INT_MULTIPLY, pSliceCtx->pCountMbNumInSlice[iSliceIdx], iMbNumLeft);


  if (DynamicAdjustSlicePEncCtxAll (pSliceCtx, iRunLen) == 0) {
    const int32_t kiThreadNum	= pCtx->pSvcParam->iCountThreadsNum;
    int32_t iThreadIdx			= 0;
    do {
      WelsEventSignal (&pCtx->pSliceThreading->pUpdateMbListEvent[iThreadIdx]);
      WelsEventSignal (&pCtx->pSliceThreading->pThreadMasterEvent[iThreadIdx]);
      ++ iThreadIdx;
    } while (iThreadIdx < kiThreadNum);

    WelsMultipleEventsWaitAllBlocking (kiThreadNum, &pCtx->pSliceThreading->pFinUpdateMbListEvent[0]);
  }
}