////////////////////////////////////////////////////////////////////////// // @brief processes a single decl from the streamout stream. Reads 4 components from the input // stream and writes N components to the output buffer given the componentMask or if // a hole, just increments the buffer pointer // @param pStream - pointer to current attribute // @param pOutBuffers - pointers to the current location of each output buffer // @param decl - input decl void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl) { // @todo add this to x86 macros Function* maskStore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskstore_ps); uint32_t numComponents = _mm_popcnt_u32(decl.componentMask); uint32_t packedMask = (1 << numComponents) - 1; if (!decl.hole) { // increment stream pointer to correct slot Value* pAttrib = GEP(pStream, C(4 * decl.attribSlot)); // load 4 components from stream Type* simd4Ty = VectorType::get(IRB()->getFloatTy(), 4); Type* simd4PtrTy = PointerType::get(simd4Ty, 0); pAttrib = BITCAST(pAttrib, simd4PtrTy); Value *vattrib = LOAD(pAttrib); // shuffle/pack enabled components Value* vpackedAttrib = VSHUFFLE(vattrib, vattrib, PackMask(decl.componentMask)); // store to output buffer // cast SO buffer to i8*, needed by maskstore Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(mInt8Ty, 0)); // cast input to <4xfloat> Value* src = BITCAST(vpackedAttrib, simd4Ty); CALL(maskStore, {pOut, ToMask(packedMask), src}); } // increment SO buffer pOutBuffers[decl.bufferIndex] = GEP(pOutBuffers[decl.bufferIndex], C(numComponents)); }
/*
 * Emit IR yielding a pointer to the operand of the 6502 instruction at 'pc',
 * according to the addressing mode of the opcode byte in cpu->RAM[pc].
 *
 * Returns:
 *   - ptr_A for accumulator addressing;
 *   - NULL for branch and implied addressing (no operand lvalue);
 *   - a pointer to a new 8-bit stack slot holding the operand byte for
 *     immediate addressing;
 *   - GEP(ea) for every other mode, where 'ea' is the effective address
 *     computed below (presumably an offset into emulated RAM -- the GEP
 *     macro is defined elsewhere).
 *
 * Effective address computation:
 *   1. start from the 8-bit or 16-bit operand as a constant;
 *   2. add X or Y for the pre-indexed modes (ABSX, INDX, ZPX, ABSY, ZPY);
 *   3. wrap: zero-page bases wrap within 0x00..0xFF, absolute bases within
 *      0x0000..0xFFFF;
 *   4. for indirect modes, replace ea by the 16-bit value stored at ea;
 *   5. for INDY, add Y afterwards and wrap to 16 bits.
 *
 * 'cpu', 'pc' and 'bb' are also consumed implicitly by the helper macros
 * used here (OPERAND_8, CONST32, LOAD, ...), which are defined elsewhere.
 */
static Value *
arch_6502_get_operand_lvalue(cpu_t *cpu, addr_t pc, BasicBlock* bb) {
	int am = get_addmode(cpu->RAM[pc]);

	/* modes that need no effective address */
	switch (am) {
		case ADDMODE_ACC:
			return ptr_A;
		case ADDMODE_BRA:
		case ADDMODE_IMPL:
			return NULL;
		case ADDMODE_IMM:
			{
			/* materialize the immediate byte in a temporary so callers get an lvalue */
			Value *ptr_temp = new AllocaInst(getIntegerType(8), "temp", bb);
			new StoreInst(CONST8(OPERAND_8), ptr_temp, bb);
			return ptr_temp;
			}
	}

	const bool is_indirect = ((am == ADDMODE_IND) || (am == ADDMODE_INDX) || (am == ADDMODE_INDY));
	/* every mode except the three absolute ones carries a one-byte operand */
	const bool is_8bit_base = !((am == ADDMODE_ABS) || (am == ADDMODE_ABSX) || (am == ADDMODE_ABSY));

	/* register added to the base before any indirection */
	Value *index_register_before = NULL;
	if ((am == ADDMODE_ABSX) || (am == ADDMODE_INDX) || (am == ADDMODE_ZPX))
		index_register_before = ptr_X;
	if ((am == ADDMODE_ABSY) || (am == ADDMODE_ZPY))
		index_register_before = ptr_Y;

	/* register added after the indirection: only (zp),Y */
	Value *index_register_after = (am == ADDMODE_INDY)? ptr_Y : NULL;

#if 0
	LOG("pc = %x\n", pc);
	LOG("index_register_before = %x\n", index_register_before);
	LOG("index_register_after = %x\n", index_register_after);
	LOG("is_indirect = %x\n", is_indirect);
	LOG("is_8bit_base = %x\n", is_8bit_base);
#endif

	/* create base constant */
	uint16_t base = is_8bit_base? (OPERAND_8):(OPERAND_16);
	Value *ea = CONST32(base);

	if (index_register_before)
		ea = ADD(ZEXT32(LOAD(index_register_before)), ea);

	/* wrap around in zero page */
	if (is_8bit_base)
		ea = AND(ea, CONST32(0x00FF));
	else if (base >= 0xFF00) /* wrap around in memory */
		/* an 8-bit index can only carry past 0xFFFF when the base is in the top page */
		ea = AND(ea, CONST32(0xFFFF));

	if (is_indirect)
		ea = ZEXT32(LOAD_RAM16(ea));

	if (index_register_after) {
		ea = ADD(ZEXT32(LOAD(index_register_after)), ea);
		/*
		 * The pointer fetched from memory is only known at run time and may
		 * lie in 0xFF01..0xFFFF, so pointer + Y can exceed 0xFFFF. The 6502
		 * address bus is 16 bits wide: wrap around in memory, as the
		 * pre-indexed absolute modes above already do.
		 */
		ea = AND(ea, CONST32(0xFFFF));
	}

	return GEP(ea);
}
void buildStream(const STREAMOUT_COMPILE_STATE& state, const STREAMOUT_STREAM& streamState, Value* pSoCtx, BasicBlock* returnBB, Function* soFunc) { // get list of active SO buffers std::unordered_set<uint32_t> activeSOBuffers; for (uint32_t d = 0; d < streamState.numDecls; ++d) { const STREAMOUT_DECL& decl = streamState.decl[d]; activeSOBuffers.insert(decl.bufferIndex); } // always increment numPrimStorageNeeded Value *numPrimStorageNeeded = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded }); numPrimStorageNeeded = ADD(numPrimStorageNeeded, C(1)); STORE(numPrimStorageNeeded, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded }); // check OOB on active SO buffers. If any buffer is out of bound, don't write // the primitive to any buffer Value* oobMask = C(false); for (uint32_t buffer : activeSOBuffers) { oobMask = OR(oobMask, oob(state, pSoCtx, buffer)); } BasicBlock* validBB = BasicBlock::Create(JM()->mContext, "valid", soFunc); // early out if OOB COND_BR(oobMask, returnBB, validBB); IRB()->SetInsertPoint(validBB); Value* numPrimsWritten = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten }); numPrimsWritten = ADD(numPrimsWritten, C(1)); STORE(numPrimsWritten, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten }); // compute start pointer for each output buffer Value* pOutBuffer[4]; Value* pOutBufferStartVertex[4]; Value* outBufferPitch[4]; for (uint32_t b: activeSOBuffers) { Value* pBuf = getSOBuffer(pSoCtx, b); Value* pData = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pBuffer }); Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); pOutBuffer[b] = GEP(pData, streamOffset); pOutBufferStartVertex[b] = pOutBuffer[b]; outBufferPitch[b] = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch }); } // loop over the vertices of the prim Value* pStreamData = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pPrimData }); for (uint32_t v = 0; v < state.numVertsPerPrim; ++v) { buildVertex(streamState, pStreamData, pOutBuffer); // increment stream and 
output buffer pointers // stream verts are always 32*4 dwords apart pStreamData = GEP(pStreamData, C(KNOB_NUM_ATTRIBUTES * 4)); // output buffers offset using pitch in buffer state for (uint32_t b : activeSOBuffers) { pOutBufferStartVertex[b] = GEP(pOutBufferStartVertex[b], outBufferPitch[b]); pOutBuffer[b] = pOutBufferStartVertex[b]; } } // update each active buffer's streamOffset for (uint32_t b : activeSOBuffers) { Value* pBuf = getSOBuffer(pSoCtx, b); Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); streamOffset = ADD(streamOffset, MUL(C(state.numVertsPerPrim), outBufferPitch[b])); STORE(streamOffset, pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); } }