void CPersOver::PersOver() { if (PR_htValid) { switch (PR_htInstr) { case OVER_RD: { BUSY_RETRY(ReadMemBusy()); ReadMem_data(P_addr); ReadMemPause(OVER_WR); } break; case OVER_WR: { BUSY_RETRY(WriteMemBusy()); WriteMem(P_addr, ~PR_data); WriteMemPause(OVER_RSM); } break; case OVER_RSM: { S_bResume = true; HtPause(OVER_RTN); } break; case OVER_RTN: { BUSY_RETRY(SendReturnBusy_htmain()); SendReturn_htmain(); } break; default: assert(0); } } if (SR_bResume) { S_bResume = false; HtResume(0); } }
void CPersVwrk::PersVwrk() { VertImages_t p1_vImgIdx = PR1_imageIdx & VERT_IMAGES_MASK; // staging for memAddr to allow Vivado to infer a DSP for the multiply T1_blkCol = (McuCols_t)((P1_outMcuColStart << (S_jobInfo[p1_vImgIdx].m_vcp[P1_compIdx].m_blkColsPerMcu == 2 ? 1 : 0)) | P1_preMcuBlkCol); T1_blkRow = (McuRows_t)((P1_preMcuRow << (S_jobInfo[p1_vImgIdx].m_vcp[P1_compIdx].m_blkRowsPerMcu == 2 ? 1 : 0)) | P1_preMcuBlkRow); T1_inCompBlkCols = S_jobInfo[p1_vImgIdx].m_vcp[P1_compIdx].m_inCompBlkCols; VertImages_t p2_vImgIdx = PR2_imageIdx & VERT_IMAGES_MASK; T2_jobInfo = S_jobInfo[p2_vImgIdx]; // use signed math since the DSP adder is signed T2_memAddrSum1 = (ht_int48)(T2_jobInfo.m_vcp[P2_compIdx].m_pInCompBuf + (T2_blkCol << MEM_LINE_SIZE_W)); T2_memAddrSum2 = (ht_int48)(T2_blkRow * (T2_inCompBlkCols << MEM_LINE_SIZE_W)); T3_memAddr = T3_memAddrSum1 + T3_memAddrSum2; // will not use DSP adder because no output reg before use? T2_loopVcp = T2_jobInfo.m_vcp[P2_compIdx]; T1_bReadMem = false; // fix timing for memory read instructions T2_preMcuRow_lt_inMcuRowEnd = PR2_preMcuRow < PR2_inMcuRowEnd; T2_pendMcuRow_lt_inMcuRowEnd = PR2_pendMcuRow < PR2_inMcuRowEnd; T2_mcuBufInUseCnt_lt_VERT_PREFETCH_MCUS = S_mcuBufInUseCnt[PR2_htId] < VERT_PREFETCH_MCUS_FULL; T2_preMcuBlkColP1_eq_blkColsPerMcu = PR2_preMcuBlkCol+1 == S_jobInfo[p2_vImgIdx].m_vcp[PR2_compIdx].m_blkColsPerMcu; if (PR3_htValid) { switch (PR3_htInst) { case VWRK_ENTRY: { if (!PR3_bHtIdPushed) { S_readOrderQue.push(PR3_htId); S_pendOrderQue.push(PR3_htId); P3_bHtIdPushed = true; } VertState vrs; vrs.m_bUpScale = T3_jobInfo.m_maxBlkRowsPerMcu == 2 && T3_loopVcp.m_blkRowsPerMcu == 1; ImageRows_t outRow = (ImageRows_t)(P3_outMcuRowStart * DCTSIZE << (T3_loopVcp.m_blkRowsPerMcu == 2 ? 1 : 0)); ImageRows_t outRowEnd = (ImageRows_t)(outRow + ((4 * DCTSIZE) << (T3_loopVcp.m_blkRowsPerMcu == 2 ? 1 : 0))); if (outRowEnd > T3_loopVcp.m_outCompRows) outRowEnd = T3_loopVcp.m_outCompRows; ht_uint1 mcuBlkRowFirst; if (vrs.m_bUpScale) { PntWghtCpInt_t filterWidth = (PntWghtCpInt_t)((T3_jobInfo.m_filterWidth >> 1) + 1); PntWghtCpInt_t filterOffset = (PntWghtCpInt_t)(18 - (filterWidth << 1)); PntWghtCpInt_t negFilterOffset = -filterOffset; bool bInRowSel = P3_pntWghtStart < negFilterOffset; vrs.m_inRow = bInRowSel ? 0 : ((P3_pntWghtStart + filterOffset) >> 1); vrs.m_rowDataPos = bInRowSel ? (PntWghtCpInt_t)-18 : (PntWghtCpInt_t)((P3_pntWghtStart & ~1) - (filterWidth << 1)); vrs.m_inRowOutDiff = 0; vrs.m_inRowIgnore = 0; mcuBlkRowFirst = 0; } else { PntWghtCpInt_t filterWidth = (PntWghtCpInt_t)T3_jobInfo.m_filterWidth; PntWghtCpInt_t filterOffset = (PntWghtCpInt_t)(17 - filterWidth); PntWghtCpInt_t negFilterOffset = -filterOffset; bool bInRowSel = P3_pntWghtStart < negFilterOffset; vrs.m_inRow = bInRowSel ? (ImageRows_t)0 : (ImageRows_t)(P3_pntWghtStart + filterOffset); vrs.m_rowDataPos = bInRowSel ? -17 : (P3_pntWghtStart - filterWidth); vrs.m_inRowOutDiff = 0; vrs.m_inRowIgnore = 0; mcuBlkRowFirst = (ht_uint1)((vrs.m_inRow >> 3) & (T3_loopVcp.m_blkRowsPerMcu-1)); } P3_preMcuBlkRowFirst[P3_compIdx] = mcuBlkRowFirst; P3_wrkMcuBlkRowFirst[P3_compIdx] = mcuBlkRowFirst; // setup for VWRK_LOOP P3_preMcuRow = (McuRows_t)(P3_inImageRowStart >> ((T3_jobInfo.m_maxBlkRowsPerMcu == 2 ? 1 : 0) + DCTSIZE_W)); P3_pendMcuRow = P3_preMcuRow; P3_inMcuRowEnd = (McuRows_t)((P3_inImageRowEnd + (T3_jobInfo.m_maxBlkRowsPerMcu == 2 ? 2 : 1) * DCTSIZE - 1) >> ((T3_jobInfo.m_maxBlkRowsPerMcu == 2 ? 1 : 0) + DCTSIZE_W)); P3_readBufIdx = 0; P3_pendBufIdx = 0; S_mcuBufInUseCnt[PR3_htId] = 0; P3_preMcuBlkRow = P3_preMcuBlkRowFirst[0]; P3_preMcuBlkCol = 0; P3_rdReqGrpId = PR3_htId << VERT_PREFETCH_MCUS_W; P3_rdPollGrpId = PR3_htId << VERT_PREFETCH_MCUS_W; P3_mcuReadPendCnt = 0; P3_bFirstWorkMcu = true; P3_mcuBlkCol += 1; if (P3_mcuBlkCol == T3_loopVcp.m_blkColsPerMcu) { P3_mcuBlkCol = 0; P3_compIdx += 1; if (P3_compIdx == T3_jobInfo.m_compCnt) { P3_compIdx = 0; HtContinue(VWRK_PREREAD_WAIT); break; } } HtContinue(VWRK_ENTRY); } break; case VWRK_PREREAD_WAIT: { // wait for other threads to complete reads if (S_readBusy != 0 || S_readOrderQue.front() != PR3_htId) { S_readPaused[PR3_htId] = true; HtPause(VWRK_PREREAD_WAIT); } else { S_readPaused[PR3_htId] = false; S_readBusy = true; S_readHtId = PR3_htId; S_readOrderQue.pop(); HtContinue(VWRK_PREREAD); } } break; case VWRK_PREREAD: { T1_bMcuBufFull = S_mcuBufInUseCnt[PR3_htId] == VERT_PREFETCH_MCUS_FULL; T1_bMcuRowEnd = P3_preMcuRow == P3_inMcuRowEnd; T1_bReadMemBusy = ReadMemBusy(); // issue reads until all needed reads are issued or buffer space is exceeded BUSY_RETRY(ReadMemBusy()); #ifndef _HTV // these next statements were moved to the P1 stage to give the multiple additional registers stages McuRows_t blkRow = (McuRows_t)(P3_preMcuRow * (T3_loopVcp.m_blkRowsPerMcu == 2 ? 2 : 1) | P3_preMcuBlkRow); McuCols_t blkCol = (McuCols_t)(P3_outMcuColStart * (T3_loopVcp.m_blkColsPerMcu == 2 ? 2 : 1) | P3_preMcuBlkCol); ht_uint26 pos = (ht_uint26)((blkRow * T3_loopVcp.m_inCompBlkCols + blkCol) * MEM_LINE_SIZE); ht_uint48 memAddr = T3_loopVcp.m_pInCompBuf + pos; assert(memAddr == T3_memAddr); #endif if (SR_mcuBufInUseCnt[PR3_htId] < VERT_PREFETCH_MCUS_FULL && PR3_preMcuRow < PR3_inMcuRowEnd) { sc_uint<4+VERT_PREFETCH_MCUS_W> bufIdx = (P3_compIdx << (VERT_PREFETCH_MCUS_W+2)) | (P3_preMcuBlkRow << (VERT_PREFETCH_MCUS_W+1)) | (P3_preMcuBlkCol << VERT_PREFETCH_MCUS_W) | P3_readBufIdx; ReadMem_rowPref(T3_memAddr, bufIdx, PR3_htId, 0, 8); T1_bReadMem = true; if (TR3_preMcuBlkColP1_eq_blkColsPerMcu) { P3_preMcuBlkCol = 0; if (P3_preMcuBlkRow+1 == T3_loopVcp.m_blkRowsPerMcu) { if (P3_compIdx+1 == T3_jobInfo.m_compCnt) { P3_compIdx = 0; P3_preMcuRow += 1; P3_rdReqGrpId = (PR3_htId << VERT_PREFETCH_MCUS_W) | ((P3_rdReqGrpId+1) & (VERT_PREFETCH_MCUS-1)); P3_preMcuBlkRowFirst[P3_compIdx] = 0; P3_readBufIdx += 1; P3_mcuReadPendCnt += 1; S_mcuBufInUseCnt[PR3_htId] += 1; } else P3_compIdx += 1; P3_preMcuBlkRow = P3_preMcuBlkRowFirst[P3_compIdx]; } else P3_preMcuBlkRow += 1; } else P3_preMcuBlkCol += 1; HtContinue(VWRK_PREREAD); } else { if (PR3_preMcuRow == PR3_inMcuRowEnd) { // free read interface for next thread S_readBusy = false; } HtContinue(VWRK_VRS_WAIT); } } break; case VWRK_VRS_WAIT: { // wait until a vrs structure is available, we double buffer BUSY_RETRY(S_pendOrderQue.front() != PR3_htId || S_vrsAvl == 0); P3_vrsIdx = (S_vrsAvl & 1) ? 0 : 1; S_vrsAvl &= (S_vrsAvl & 1) ? 2u : 1u; HtContinue(VWRK_VRS_INIT); } break; case VWRK_VRS_INIT: { // init vrs structure VertState vrs; vrs.m_bUpScale = T3_jobInfo.m_maxBlkRowsPerMcu == 2 && T3_loopVcp.m_blkRowsPerMcu == 1; ImageRows_t outRow = (ImageRows_t)(P3_outMcuRowStart * DCTSIZE << (T3_loopVcp.m_blkRowsPerMcu == 2 ? 1 : 0)); ImageRows_t outRowEnd = (ImageRows_t)(outRow + ((4 * DCTSIZE) << (T3_loopVcp.m_blkRowsPerMcu == 2 ? 1 : 0))); if (outRowEnd > T3_loopVcp.m_outCompRows) outRowEnd = T3_loopVcp.m_outCompRows; if (vrs.m_bUpScale) { PntWghtCpInt_t filterWidth = (PntWghtCpInt_t)((T3_jobInfo.m_filterWidth >> 1) + 1); PntWghtCpInt_t filterOffset = (PntWghtCpInt_t)(18 - (filterWidth << 1)); PntWghtCpInt_t negFilterOffset = -filterOffset; bool bInRowSel = P3_pntWghtStart < negFilterOffset; vrs.m_inRow = bInRowSel ? 0 : ((P3_pntWghtStart + filterOffset) >> 1); vrs.m_rowDataPos = bInRowSel ? (PntWghtCpInt_t)-18 : (PntWghtCpInt_t)((P3_pntWghtStart & ~1) - (filterWidth << 1)); vrs.m_inRowOutDiff = 0; vrs.m_inRowIgnore = 0; } else {
void CPersAdd::PersAdd() { // Set read address of op1Mem/op2Mem/resMem variables // These will always be the same in every instruction for each thread S_op1Mem.read_addr(PR_htId); S_op2Mem.read_addr(PR_htId); S_resMem.read_addr(PR_htId); // Force "Inputs Valid" to default to false unless true in the ADD_PAUSE instruction P_i_vld = false; if (PR_htValid) { switch (PR_htInst) { case ADD_LD1: { if (ReadMemBusy()) { HtRetry(); break; } // Memory read request - Operand 1 MemAddr_t memRdAddr = SR_op1Addr + (P_vecIdx << 3); ReadMem_op1Mem(memRdAddr, PR_htId); HtContinue(ADD_LD2); } break; case ADD_LD2: { if (ReadMemBusy()) { HtRetry(); break; } // Memory read request - Operand 2 MemAddr_t memRdAddr = SR_op2Addr + (P_vecIdx << 3); ReadMem_op2Mem(memRdAddr, PR_htId); ReadMemPause(ADD_PAUSE); } break; case ADD_PAUSE: { // Store op1 and op2 into private variables 'a' and 'b'. P_a = S_op1Mem.read_mem(); P_b = S_op2Mem.read_mem(); // Mark inputs as valid, set htId P_i_htId = PR_htId; P_i_vld = true; // Pause thread and wait for primitive to calculate the result... // (will return to ADD_ST) HtPause(ADD_ST); } break; case ADD_ST: { if (WriteMemBusy()) { HtRetry(); break; } // Memory write request - Addition Result MemAddr_t memWrAddr = SR_resAddr + (P_vecIdx << 3); WriteMem(memWrAddr, S_resMem.read_mem()); WriteMemPause(ADD_RTN); } break; case ADD_RTN: { if (SendReturnBusy_add()) { HtRetry(); break; } // Return Result from shared ram 'resMem' SendReturn_add(S_resMem.read_mem()); } break; default: assert(0); } } // Temporary variables to use as outputs to the primitive // (these are not saved between cycles) uint64_t o_res; ht_uint7 o_htId; bool o_vld; // use clocked primitive add_5stage(P_a, P_b, P_i_htId, P_i_vld, o_res, o_htId, o_vld, add_prm_state1); // Check for valid outputs from the primitive if (o_vld) { // Store Result into shared ram to be written to memory later S_resMem.write_addr(o_htId); S_resMem.write_mem(o_res); // Wake up the thread (with corresponding htId) HtResume(o_htId); } }