예제 #1
0
// Per-call instruction handler for the "over" module.
//
// When the current thread is valid, one instruction is dispatched:
//   OVER_RD  -> issue a memory read at P_addr, then ReadMemPause(OVER_WR)
//   OVER_WR  -> write the bitwise complement of PR_data to P_addr, then
//               WriteMemPause(OVER_RSM)
//   OVER_RSM -> raise the shared resume flag and HtPause(OVER_RTN)
//   OVER_RTN -> send the return to htmain
// After dispatch, if the registered resume flag (SR_bResume) is set, the flag
// is cleared and HtResume(0) is called.
void
CPersOver::PersOver()
{
	if (PR_htValid) {
		switch (PR_htInstr) {
		case OVER_RD: {
			// Hold off while the read interface reports busy.
			BUSY_RETRY(ReadMemBusy());

			ReadMem_data(P_addr);
			ReadMemPause(OVER_WR);
			break;
		}
		case OVER_WR: {
			// Hold off while the write interface reports busy.
			BUSY_RETRY(WriteMemBusy());

			// Store the inverted data word back to the same address.
			WriteMem(P_addr, ~PR_data);
			WriteMemPause(OVER_RSM);
			break;
		}
		case OVER_RSM: {
			// Request a resume through the shared flag, then pause;
			// execution continues at OVER_RTN.
			S_bResume = true;
			HtPause(OVER_RTN);
			break;
		}
		case OVER_RTN: {
			// Hold off while the return interface reports busy.
			BUSY_RETRY(SendReturnBusy_htmain());

			SendReturn_htmain();
			break;
		}
		default:
			// Any other instruction value is a programming error.
			assert(0);
		}
	}

	// This check must stay after the switch: its write of false takes
	// precedence over a same-call write of true made in OVER_RSM.
	if (SR_bResume) {
		S_bResume = false;
		HtResume(0);
	}
}
예제 #2
0
void
CPersVwrk::PersVwrk()
{
	VertImages_t p1_vImgIdx = PR1_imageIdx & VERT_IMAGES_MASK;

	// staging for memAddr to allow Vivado to infer a DSP for the multiply
	T1_blkCol = (McuCols_t)((P1_outMcuColStart <<
			(S_jobInfo[p1_vImgIdx].m_vcp[P1_compIdx].m_blkColsPerMcu == 2 ? 1 : 0)) | P1_preMcuBlkCol);
	T1_blkRow = (McuRows_t)((P1_preMcuRow <<
			(S_jobInfo[p1_vImgIdx].m_vcp[P1_compIdx].m_blkRowsPerMcu == 2 ? 1 : 0)) | P1_preMcuBlkRow);
	T1_inCompBlkCols = S_jobInfo[p1_vImgIdx].m_vcp[P1_compIdx].m_inCompBlkCols;

	VertImages_t p2_vImgIdx = PR2_imageIdx & VERT_IMAGES_MASK;
	T2_jobInfo = S_jobInfo[p2_vImgIdx];
	// use signed math since the DSP adder is signed
	T2_memAddrSum1 = (ht_int48)(T2_jobInfo.m_vcp[P2_compIdx].m_pInCompBuf + (T2_blkCol << MEM_LINE_SIZE_W));
	T2_memAddrSum2 = (ht_int48)(T2_blkRow * (T2_inCompBlkCols << MEM_LINE_SIZE_W));

	T3_memAddr = T3_memAddrSum1 + T3_memAddrSum2; // will not use DSP adder because no output reg before use?

    T2_loopVcp = T2_jobInfo.m_vcp[P2_compIdx];

	T1_bReadMem = false;

	// fix timing for memory read instructions
	T2_preMcuRow_lt_inMcuRowEnd = PR2_preMcuRow < PR2_inMcuRowEnd;
	T2_pendMcuRow_lt_inMcuRowEnd = PR2_pendMcuRow < PR2_inMcuRowEnd;
	T2_mcuBufInUseCnt_lt_VERT_PREFETCH_MCUS = S_mcuBufInUseCnt[PR2_htId] < VERT_PREFETCH_MCUS_FULL;
	T2_preMcuBlkColP1_eq_blkColsPerMcu = PR2_preMcuBlkCol+1 == S_jobInfo[p2_vImgIdx].m_vcp[PR2_compIdx].m_blkColsPerMcu;

	if (PR3_htValid) {

		switch (PR3_htInst) {
		case VWRK_ENTRY: {

			if (!PR3_bHtIdPushed) {
				S_readOrderQue.push(PR3_htId);
				S_pendOrderQue.push(PR3_htId);
				P3_bHtIdPushed = true;
			}

			VertState vrs;
			vrs.m_bUpScale = T3_jobInfo.m_maxBlkRowsPerMcu == 2 && T3_loopVcp.m_blkRowsPerMcu == 1;
			ImageRows_t outRow = (ImageRows_t)(P3_outMcuRowStart * DCTSIZE << (T3_loopVcp.m_blkRowsPerMcu == 2 ? 1 : 0));
			ImageRows_t outRowEnd = (ImageRows_t)(outRow + ((4 * DCTSIZE) << (T3_loopVcp.m_blkRowsPerMcu == 2 ? 1 : 0)));
			if (outRowEnd > T3_loopVcp.m_outCompRows) outRowEnd = T3_loopVcp.m_outCompRows;

			ht_uint1 mcuBlkRowFirst;

			if (vrs.m_bUpScale) {
				PntWghtCpInt_t filterWidth = (PntWghtCpInt_t)((T3_jobInfo.m_filterWidth >> 1) + 1);
				PntWghtCpInt_t filterOffset = (PntWghtCpInt_t)(18 - (filterWidth << 1));
				PntWghtCpInt_t negFilterOffset = -filterOffset;
				bool bInRowSel = P3_pntWghtStart < negFilterOffset;
				vrs.m_inRow = bInRowSel ? 0 : ((P3_pntWghtStart + filterOffset) >> 1);
				vrs.m_rowDataPos = bInRowSel ? (PntWghtCpInt_t)-18 : (PntWghtCpInt_t)((P3_pntWghtStart & ~1) - (filterWidth << 1));
				vrs.m_inRowOutDiff = 0;
				vrs.m_inRowIgnore = 0;
				mcuBlkRowFirst = 0;
			} else {
				PntWghtCpInt_t filterWidth = (PntWghtCpInt_t)T3_jobInfo.m_filterWidth;
				PntWghtCpInt_t filterOffset = (PntWghtCpInt_t)(17 - filterWidth);
				PntWghtCpInt_t negFilterOffset = -filterOffset;
				bool bInRowSel = P3_pntWghtStart < negFilterOffset;
				vrs.m_inRow = bInRowSel ? (ImageRows_t)0 : (ImageRows_t)(P3_pntWghtStart + filterOffset);
				vrs.m_rowDataPos = bInRowSel ? -17 : (P3_pntWghtStart - filterWidth);
				vrs.m_inRowOutDiff = 0;
				vrs.m_inRowIgnore = 0;
				mcuBlkRowFirst = (ht_uint1)((vrs.m_inRow >> 3) & (T3_loopVcp.m_blkRowsPerMcu-1));
			}

			P3_preMcuBlkRowFirst[P3_compIdx] = mcuBlkRowFirst;
			P3_wrkMcuBlkRowFirst[P3_compIdx] = mcuBlkRowFirst;

			// setup for VWRK_LOOP
			P3_preMcuRow = (McuRows_t)(P3_inImageRowStart >> 
				((T3_jobInfo.m_maxBlkRowsPerMcu == 2 ? 1 : 0) + DCTSIZE_W));
			P3_pendMcuRow = P3_preMcuRow;
			P3_inMcuRowEnd = (McuRows_t)((P3_inImageRowEnd + 
				(T3_jobInfo.m_maxBlkRowsPerMcu == 2 ? 2 : 1) * DCTSIZE - 1) >>
				((T3_jobInfo.m_maxBlkRowsPerMcu == 2 ? 1 : 0) + DCTSIZE_W));
			P3_readBufIdx = 0;
			P3_pendBufIdx = 0;
			S_mcuBufInUseCnt[PR3_htId] = 0;
			P3_preMcuBlkRow = P3_preMcuBlkRowFirst[0];
			P3_preMcuBlkCol = 0;
			P3_rdReqGrpId = PR3_htId << VERT_PREFETCH_MCUS_W;
			P3_rdPollGrpId = PR3_htId << VERT_PREFETCH_MCUS_W;
			P3_mcuReadPendCnt = 0;
			P3_bFirstWorkMcu = true;
			
			P3_mcuBlkCol += 1;
			if (P3_mcuBlkCol == T3_loopVcp.m_blkColsPerMcu) {
				P3_mcuBlkCol = 0;
				P3_compIdx += 1;
				if (P3_compIdx == T3_jobInfo.m_compCnt) {
					P3_compIdx = 0;

					HtContinue(VWRK_PREREAD_WAIT);
					break;
				}
			}

			HtContinue(VWRK_ENTRY);
		}
		break;
		case VWRK_PREREAD_WAIT: {
			// wait for other threads to complete reads
			if (S_readBusy != 0 || S_readOrderQue.front() != PR3_htId) {
				S_readPaused[PR3_htId] = true;
				HtPause(VWRK_PREREAD_WAIT);
			} else {
				S_readPaused[PR3_htId] = false;
				S_readBusy = true;
				S_readHtId = PR3_htId;
				S_readOrderQue.pop();
				HtContinue(VWRK_PREREAD);
			}
		}
		break;
		case VWRK_PREREAD: {
			T1_bMcuBufFull = S_mcuBufInUseCnt[PR3_htId] == VERT_PREFETCH_MCUS_FULL;
			T1_bMcuRowEnd = P3_preMcuRow == P3_inMcuRowEnd;
			T1_bReadMemBusy = ReadMemBusy();

			// issue reads until all needed reads are issued or buffer space is exceeded
			BUSY_RETRY(ReadMemBusy());

#ifndef _HTV
			// these next statements were moved to the P1 stage to give the multiple additional registers stages
			McuRows_t blkRow = (McuRows_t)(P3_preMcuRow * (T3_loopVcp.m_blkRowsPerMcu == 2 ? 2 : 1) | P3_preMcuBlkRow);
			McuCols_t blkCol = (McuCols_t)(P3_outMcuColStart * (T3_loopVcp.m_blkColsPerMcu == 2 ? 2 : 1) | P3_preMcuBlkCol);
			ht_uint26 pos = (ht_uint26)((blkRow * T3_loopVcp.m_inCompBlkCols + blkCol) * MEM_LINE_SIZE);
			ht_uint48 memAddr = T3_loopVcp.m_pInCompBuf + pos;
			assert(memAddr == T3_memAddr);
#endif
			if (SR_mcuBufInUseCnt[PR3_htId] < VERT_PREFETCH_MCUS_FULL && PR3_preMcuRow < PR3_inMcuRowEnd) {
				sc_uint<4+VERT_PREFETCH_MCUS_W> bufIdx = (P3_compIdx << (VERT_PREFETCH_MCUS_W+2)) | 
					(P3_preMcuBlkRow << (VERT_PREFETCH_MCUS_W+1)) | (P3_preMcuBlkCol << VERT_PREFETCH_MCUS_W) | P3_readBufIdx;

				ReadMem_rowPref(T3_memAddr, bufIdx, PR3_htId, 0, 8);

				T1_bReadMem = true;

				if (TR3_preMcuBlkColP1_eq_blkColsPerMcu) {
					P3_preMcuBlkCol = 0;
					if (P3_preMcuBlkRow+1 == T3_loopVcp.m_blkRowsPerMcu) {
						if (P3_compIdx+1 == T3_jobInfo.m_compCnt) {
							P3_compIdx = 0;
							P3_preMcuRow += 1;
							P3_rdReqGrpId = (PR3_htId << VERT_PREFETCH_MCUS_W) | ((P3_rdReqGrpId+1) & (VERT_PREFETCH_MCUS-1));
							P3_preMcuBlkRowFirst[P3_compIdx] = 0;

							P3_readBufIdx += 1;
							P3_mcuReadPendCnt += 1;
							S_mcuBufInUseCnt[PR3_htId] += 1;

						} else
							P3_compIdx += 1;
						P3_preMcuBlkRow = P3_preMcuBlkRowFirst[P3_compIdx];
					} else
						P3_preMcuBlkRow += 1;
				} else
					P3_preMcuBlkCol += 1;

				HtContinue(VWRK_PREREAD);

			} else {
				if (PR3_preMcuRow == PR3_inMcuRowEnd) {
					// free read interface for next thread
					S_readBusy = false;
				}

				HtContinue(VWRK_VRS_WAIT);
			}
		}
		break;
		case VWRK_VRS_WAIT: {
			// wait until a vrs structure is available, we double buffer
			BUSY_RETRY(S_pendOrderQue.front() != PR3_htId || S_vrsAvl == 0);

			P3_vrsIdx = (S_vrsAvl & 1) ? 0 : 1;
			S_vrsAvl &= (S_vrsAvl & 1) ? 2u : 1u;
				
			HtContinue(VWRK_VRS_INIT);
		}
		break;
		case VWRK_VRS_INIT: {
			// init vrs structure

			VertState vrs;
			vrs.m_bUpScale = T3_jobInfo.m_maxBlkRowsPerMcu == 2 && T3_loopVcp.m_blkRowsPerMcu == 1;
			ImageRows_t outRow = (ImageRows_t)(P3_outMcuRowStart * DCTSIZE << (T3_loopVcp.m_blkRowsPerMcu == 2 ? 1 : 0));
			ImageRows_t outRowEnd = (ImageRows_t)(outRow + ((4 * DCTSIZE) << (T3_loopVcp.m_blkRowsPerMcu == 2 ? 1 : 0)));
			if (outRowEnd > T3_loopVcp.m_outCompRows) outRowEnd = T3_loopVcp.m_outCompRows;

			if (vrs.m_bUpScale) {
				PntWghtCpInt_t filterWidth = (PntWghtCpInt_t)((T3_jobInfo.m_filterWidth >> 1) + 1);
				PntWghtCpInt_t filterOffset = (PntWghtCpInt_t)(18 - (filterWidth << 1));
				PntWghtCpInt_t negFilterOffset = -filterOffset;
				bool bInRowSel = P3_pntWghtStart < negFilterOffset;
				vrs.m_inRow = bInRowSel ? 0 : ((P3_pntWghtStart + filterOffset) >> 1);
				vrs.m_rowDataPos = bInRowSel ? (PntWghtCpInt_t)-18 : (PntWghtCpInt_t)((P3_pntWghtStart & ~1) - (filterWidth << 1));
				vrs.m_inRowOutDiff = 0;
				vrs.m_inRowIgnore = 0;
			} else {
예제 #3
0
void
CPersAdd::PersAdd()
{
	// Set read address of op1Mem/op2Mem/resMem variables
	// These will always be the same in every instruction for each thread
	S_op1Mem.read_addr(PR_htId);
	S_op2Mem.read_addr(PR_htId);
	S_resMem.read_addr(PR_htId);

	// Force "Inputs Valid" to default to false unless true in the ADD_PAUSE instruction
	P_i_vld = false;

	if (PR_htValid) {
		switch (PR_htInst) {
		case ADD_LD1: {
			if (ReadMemBusy()) {
				HtRetry();
				break;
			}

			// Memory read request - Operand 1
			MemAddr_t memRdAddr = SR_op1Addr + (P_vecIdx << 3);
			ReadMem_op1Mem(memRdAddr, PR_htId);
			HtContinue(ADD_LD2);
		}
		break;
		case ADD_LD2: {
			if (ReadMemBusy()) {
				HtRetry();
				break;
			}

			// Memory read request - Operand 2
			MemAddr_t memRdAddr = SR_op2Addr + (P_vecIdx << 3);
			ReadMem_op2Mem(memRdAddr, PR_htId);
			ReadMemPause(ADD_PAUSE);
		}
		break;
		case ADD_PAUSE: {
			// Store op1 and op2 into private variables 'a' and 'b'.
			P_a = S_op1Mem.read_mem();
			P_b = S_op2Mem.read_mem();

			// Mark inputs as valid, set htId
			P_i_htId = PR_htId;
			P_i_vld = true;

			// Pause thread and wait for primitive to calculate the result...
			// (will return to ADD_ST)
			HtPause(ADD_ST);
		}
		break;
		case ADD_ST: {
			if (WriteMemBusy()) {
				HtRetry();
				break;
			}

			// Memory write request - Addition Result
			MemAddr_t memWrAddr = SR_resAddr + (P_vecIdx << 3);
			WriteMem(memWrAddr, S_resMem.read_mem());
			WriteMemPause(ADD_RTN);
		}
		break;
		case ADD_RTN: {
			if (SendReturnBusy_add()) {
				HtRetry();
				break;
			}

			// Return Result from shared ram 'resMem'
			SendReturn_add(S_resMem.read_mem());
		}
		break;
		default:
			assert(0);
		}
	}

	// Temporary variables to use as outputs to the primitive
	// (these are not saved between cycles)
	uint64_t o_res;
	ht_uint7 o_htId;
	bool o_vld;

	// use clocked primitive
	add_5stage(P_a, P_b, P_i_htId, P_i_vld, o_res, o_htId, o_vld, add_prm_state1);

	// Check for valid outputs from the primitive
	if (o_vld) {
		// Store Result into shared ram to be written to memory later
		S_resMem.write_addr(o_htId);
		S_resMem.write_mem(o_res);

		// Wake up the thread (with corresponding htId)
		HtResume(o_htId);
	}
}