Example #1
1
void
vec4_tes_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
{
   const struct brw_tes_prog_data *tes_prog_data =
      (const struct brw_tes_prog_data *) prog_data;

   switch (instr->intrinsic) {
   case nir_intrinsic_load_tess_coord:
      /* gl_TessCoord is part of the payload in g1 channels 0-2 and 4-6. */
      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
               src_reg(brw_vec8_grf(1, 0))));
      break;
   case nir_intrinsic_load_tess_level_outer:
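      /* The tess levels come from the patch URB header (the first two vec4
       * slots), where the factors are stored in reverse order, hence the
       * swizzles below; isolines only have two outer levels.
       */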
      if (tes_prog_data->domain == BRW_TESS_DOMAIN_ISOLINE) {
         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
                  swizzle(src_reg(ATTR, 1, glsl_type::vec4_type),
                          BRW_SWIZZLE_ZWZW)));
      } else {
         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
                  swizzle(src_reg(ATTR, 1, glsl_type::vec4_type),
                          BRW_SWIZZLE_WZYX)));
      }
      break;
   case nir_intrinsic_load_tess_level_inner:
      if (tes_prog_data->domain == BRW_TESS_DOMAIN_QUAD) {
         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
                  swizzle(src_reg(ATTR, 0, glsl_type::vec4_type),
                          BRW_SWIZZLE_WZYX)));
      } else {
         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
                  src_reg(ATTR, 1, glsl_type::float_type)));
      }
      break;
   case nir_intrinsic_load_primitive_id:
      emit(TES_OPCODE_GET_PRIMITIVE_ID,
           get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD));
      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input: {
      src_reg indirect_offset = get_indirect_offset(instr);
      unsigned imm_offset = instr->const_index[0];
      src_reg header = input_read_header;
      bool is_64bit = nir_dest_bit_size(instr->dest) == 64;
      unsigned first_component = nir_intrinsic_component(instr);
      if (is_64bit)
         first_component /= 2;

      if (indirect_offset.file != BAD_FILE) {
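         /* Indirectly addressed inputs can't use the pushed attribute
          * registers, so fold the dynamic offset into the URB read header
          * and fall through to the URB read below.
          */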
         header = src_reg(this, glsl_type::uvec4_type);
         emit(TES_OPCODE_ADD_INDIRECT_URB_OFFSET, dst_reg(header),
              input_read_header, indirect_offset);
      } else {
         /* Arbitrarily only push up to 24 vec4 slots worth of data,
          * which is 12 registers (since each holds 2 vec4 slots).
          */
         const unsigned max_push_slots = 24;
         if (imm_offset < max_push_slots) {
            const glsl_type *src_glsl_type =
               is_64bit ? glsl_type::dvec4_type : glsl_type::ivec4_type;
            src_reg src = src_reg(ATTR, imm_offset, src_glsl_type);
            src.swizzle = BRW_SWZ_COMP_INPUT(first_component);

            const brw_reg_type dst_reg_type =
               is_64bit ? BRW_REGISTER_TYPE_DF : BRW_REGISTER_TYPE_D;
            emit(MOV(get_nir_dest(instr->dest, dst_reg_type), src));

            prog_data->urb_read_length =
               MAX2(prog_data->urb_read_length,
                    DIV_ROUND_UP(imm_offset + (is_64bit ? 2 : 1), 2));
            break;
         }
      }

      if (!is_64bit) {
         dst_reg temp(this, glsl_type::ivec4_type);
         vec4_instruction *read =
            emit(VEC4_OPCODE_URB_READ, temp, src_reg(header));
         read->offset = imm_offset;
         read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;

         src_reg src = src_reg(temp);
         src.swizzle = BRW_SWZ_COMP_INPUT(first_component);

         /* Copy to target.  We might end up with some funky writemasks landing
          * in here, but we really don't want them in the above pseudo-ops.
          */
         dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
         dst.writemask = brw_writemask_for_size(instr->num_components);
         emit(MOV(dst, src));
      } else {
         /* For 64-bit we need to load twice as many 32-bit components, and for
          * dvec3/4 we need to emit 2 URB Read messages
          */
         dst_reg temp(this, glsl_type::dvec4_type);
         dst_reg temp_d = retype(temp, BRW_REGISTER_TYPE_D);

         vec4_instruction *read =
            emit(VEC4_OPCODE_URB_READ, temp_d, src_reg(header));
         read->offset = imm_offset;
         read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;

         if (instr->num_components > 2) {
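            /* Second URB read for the upper components of a dvec3/dvec4. */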
            read = emit(VEC4_OPCODE_URB_READ, byte_offset(temp_d, REG_SIZE),
                        src_reg(header));
            read->offset = imm_offset + 1;
            read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
         }

         src_reg temp_as_src = src_reg(temp);
         temp_as_src.swizzle = BRW_SWZ_COMP_INPUT(first_component);

         dst_reg shuffled(this, glsl_type::dvec4_type);
         shuffle_64bit_data(shuffled, temp_as_src, false);

         dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_DF);
         dst.writemask = brw_writemask_for_size(instr->num_components);
         emit(MOV(dst, src_reg(shuffled)));
      }
      break;
   }
   default:
      vec4_visitor::nir_emit_intrinsic(instr);
   }
}
Example #2
0
void
gen6_gs_visitor::emit_thread_end()
{
   /* Make sure the current primitive is ended: we know it is not ended when
    * first_vertex is not zero. This is only relevant for outputs other than
    * points because in the point case we set PrimEnd on all vertices.
    */
   if (c->gp->program.OutputType != GL_POINTS) {
      emit(CMP(dst_null_d(), this->first_vertex, 0u, BRW_CONDITIONAL_Z));
      emit(IF(BRW_PREDICATE_NORMAL));
      {
         visit((ir_end_primitive *) NULL);
      }
      emit(BRW_OPCODE_ENDIF);
   }

   /* Here we have to:
    * 1) Emit an FF_SYNC message to obtain an initial VUE handle.
    * 2) Loop over all buffered vertex data and write it to corresponding
    *    URB entries.
    * 3) Allocate new VUE handles for all vertices other than the first.
    * 4) Send a final EOT message.
    */

   /* MRF 0 is reserved for the debugger, so start with message header
    * in MRF 1.
    */
   int base_mrf = 1;

   /* In the process of generating our URB write message contents, we
    * may need to unspill a register or load from an array.  Those
    * reads would use MRFs 14-15.
    */
   int max_usable_mrf = 13;

   /* Issue the FF_SYNC message and obtain the initial VUE handle. */
   emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_G));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      this->current_annotation = "gen6 thread end: ff_sync";

      vec4_instruction *inst;
      if (c->prog_data.gen6_xfb_enabled) {
         src_reg sol_temp(this, glsl_type::uvec4_type);
         emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
              dst_reg(this->svbi),
              this->vertex_count,
              this->prim_count,
              sol_temp);
         inst = emit(GS_OPCODE_FF_SYNC,
                     dst_reg(this->temp), this->prim_count, this->svbi);
      } else {
         inst = emit(GS_OPCODE_FF_SYNC,
                     dst_reg(this->temp), this->prim_count, src_reg(0u));
      }
      inst->base_mrf = base_mrf;

      /* Loop over all buffered vertices and emit URB write messages */
      this->current_annotation = "gen6 thread end: urb writes init";
      src_reg vertex(this, glsl_type::uint_type);
      emit(MOV(dst_reg(vertex), 0u));
      emit(MOV(dst_reg(this->vertex_output_offset), 0u));

      this->current_annotation = "gen6 thread end: urb writes";
      emit(BRW_OPCODE_DO);
      {
         emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
         inst = emit(BRW_OPCODE_BREAK);
         inst->predicate = BRW_PREDICATE_NORMAL;

         /* First we prepare the message header */
         emit_urb_write_header(base_mrf);

         /* Then add vertex data to the message in interleaved fashion */
         int slot = 0;
         bool complete = false;
         do {
            int mrf = base_mrf + 1;

            /* URB offset is in URB row increments, and each of our MRFs is half
             * of one of those, since we're doing interleaved writes.
             */
            int urb_offset = slot / 2;

            for (; slot < prog_data->vue_map.num_slots; ++slot) {
               int varying = prog_data->vue_map.slot_to_varying[slot];
               current_annotation = output_reg_annotation[varying];

               /* Compute offset of this slot for the current vertex
                * in vertex_output
                */
               src_reg data(this->vertex_output);
               data.reladdr = ralloc(mem_ctx, src_reg);
               memcpy(data.reladdr, &this->vertex_output_offset,
                      sizeof(src_reg));

               /* Copy this slot to the appropriate message register */
               dst_reg reg = dst_reg(MRF, mrf);
               reg.type = output_reg[varying].type;
               data.type = reg.type;
               vec4_instruction *inst = emit(MOV(reg, data));
               inst->force_writemask_all = true;

               mrf++;
               emit(ADD(dst_reg(this->vertex_output_offset),
                        this->vertex_output_offset, 1u));

               /* If this was max_usable_mrf, we can't fit anything more into
                * this URB WRITE.
                */
               if (mrf > max_usable_mrf) {
                  slot++;
                  break;
               }
            }

            complete = slot >= prog_data->vue_map.num_slots;
            emit_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
         } while (!complete);

         /* Skip over the flags data item so that vertex_output_offset points
          * to the first data item of the next vertex, so that we can start
          * writing the next vertex.
          */
         emit(ADD(dst_reg(this->vertex_output_offset),
                  this->vertex_output_offset, 1u));

         emit(ADD(dst_reg(vertex), vertex, 1u));
      }
      emit(BRW_OPCODE_WHILE);

      if (c->prog_data.gen6_xfb_enabled)
         xfb_write();
   }
   emit(BRW_OPCODE_ENDIF);

   /* Finally, emit EOT message.
    *
    * In gen6 we need to end the thread differently depending on whether we
    * have emitted at least one vertex or not. If we did, the EOT message must
    * always include the COMPLETE flag or else the GPU hangs. If we have not
    * produced any output we can't use the COMPLETE flag.
    *
    * However, this would lead us to end the program with an ENDIF opcode,
    * which we want to avoid, so instead we always request a new VUE handle
    * every time we do a URB WRITE, even for the last vertex we emit. This
    * way, whether we have emitted at least one vertex or none at all, we
    * finish the thread without writing to the URB, which works for both
    * cases by setting the COMPLETE and UNUSED flags in the EOT message.
    */
   this->current_annotation = "gen6 thread end: EOT";

   if (c->prog_data.gen6_xfb_enabled) {
      /* When emitting EOT, set SONumPrimsWritten Increment Value. */
      src_reg data(this, glsl_type::uint_type);
      emit(AND(dst_reg(data), this->sol_prim_written, src_reg(0xffffu)));
      emit(SHL(dst_reg(data), data, src_reg(16u)));
      emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
   }

   vec4_instruction *inst = emit(GS_OPCODE_THREAD_END);
   inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
   inst->base_mrf = base_mrf;
   inst->mlen = 1;
}
Example #3
0
void Jit::Comp_FPU2op(MIPSOpcode op) {
	CONDITIONAL_DISABLE;
	
	int fs = _FS;
	int fd = _FD;

	auto execRounding = [&](void (XEmitter::*conv)(X64Reg, OpArg), int setMXCSR) {
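		// Shared helper for the float->int conversions: optionally forces an
		// MXCSR rounding mode, converts, then maps the x86 "indefinite
		// integer" result (0x80000000) to INT_MIN for negative inputs or
		// INT_MAX for positive/NaN inputs.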
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fs == fd, true);

		// Small optimization: 0 is our default mode anyway.
		if (setMXCSR == 0 && !js.hasSetRounding) {
			setMXCSR = -1;
		}
		if (setMXCSR != -1) {
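			// Bits 13-14 of MXCSR are the SSE rounding-control field; swap in the requested mode.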
			STMXCSR(M(&mxcsrTemp));
			MOV(32, R(TEMPREG), M(&mxcsrTemp));
			AND(32, R(TEMPREG), Imm32(~(3 << 13)));
			OR(32, R(TEMPREG), Imm32(setMXCSR << 13));
			MOV(32, M(&mips_->temp), R(TEMPREG));
			LDMXCSR(M(&mips_->temp));
		}

		(this->*conv)(TEMPREG, fpr.R(fs));

		// Did we get an indefinite integer value?
		CMP(32, R(TEMPREG), Imm32(0x80000000));
		FixupBranch skip = J_CC(CC_NE);
		if (fd != fs) {
			CopyFPReg(fpr.RX(fd), fpr.R(fs));
		}
		XORPS(XMM1, R(XMM1));
		CMPSS(fpr.RX(fd), R(XMM1), CMP_LT);

		// At this point, -inf = 0xffffffff, inf/nan = 0x00000000.
		// We want -inf to be 0x80000000 and inf/nan to be 0x7fffffff, so we flip those bits.
		MOVD_xmm(R(TEMPREG), fpr.RX(fd));
		XOR(32, R(TEMPREG), Imm32(0x7fffffff));

		SetJumpTarget(skip);
		MOVD_xmm(fpr.RX(fd), R(TEMPREG));

		if (setMXCSR != -1) {
			LDMXCSR(M(&mxcsrTemp));
		}
	};

	switch (op & 0x3f) {
	case 5:	//F(fd)	= fabsf(F(fs)); break; //abs
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		if (fd != fs && fpr.IsMapped(fs)) {
			MOVAPS(fpr.RX(fd), M(ssNoSignMask));
			ANDPS(fpr.RX(fd), fpr.R(fs));
		} else {
			if (fd != fs) {
				MOVSS(fpr.RX(fd), fpr.R(fs));
			}
			ANDPS(fpr.RX(fd), M(ssNoSignMask));
		}
		break;

	case 6:	//F(fd)	= F(fs);				break; //mov
		if (fd != fs) {
			fpr.SpillLock(fd, fs);
			fpr.MapReg(fd, fd == fs, true);
			CopyFPReg(fpr.RX(fd), fpr.R(fs));
		}
		break;

	case 7:	//F(fd)	= -F(fs);			 break; //neg
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		if (fd != fs && fpr.IsMapped(fs)) {
			MOVAPS(fpr.RX(fd), M(ssSignBits2));
			XORPS(fpr.RX(fd), fpr.R(fs));
		} else {
			if (fd != fs) {
				MOVSS(fpr.RX(fd), fpr.R(fs));
			}
			XORPS(fpr.RX(fd), M(ssSignBits2));
		}
		break;


	case 4:	//F(fd)	= sqrtf(F(fs)); break; //sqrt
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		SQRTSS(fpr.RX(fd), fpr.R(fs));
		break;

	case 13: //FsI(fd) = F(fs)>=0 ? (int)floorf(F(fs)) : (int)ceilf(F(fs)); break;//trunc.w.s
		execRounding(&XEmitter::CVTTSS2SI, -1);
		break;

	case 32: //F(fd)	= (float)FsI(fs);			break; //cvt.s.w
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fs == fd, true);
		if (fpr.IsMapped(fs)) {
			CVTDQ2PS(fpr.RX(fd), fpr.R(fs));
		} else {
			// If fs was fd, we'd be in the case above since we mapped fd.
			MOVSS(fpr.RX(fd), fpr.R(fs));
			CVTDQ2PS(fpr.RX(fd), fpr.R(fd));
		}
		break;

	case 36: //FsI(fd) = (int)	F(fs);			 break; //cvt.w.s
		// Uses the current rounding mode.
		execRounding(&XEmitter::CVTSS2SI, -1);
		break;

	case 12: //FsI(fd) = (int)floorf(F(fs)+0.5f); break; //round.w.s
		execRounding(&XEmitter::CVTSS2SI, 0);
		break;
	case 14: //FsI(fd) = (int)ceilf (F(fs)); break; //ceil.w.s
		execRounding(&XEmitter::CVTSS2SI, 2);
		break;
	case 15: //FsI(fd) = (int)floorf(F(fs)); break; //floor.w.s
		execRounding(&XEmitter::CVTSS2SI, 1);
		break;
	default:
		DISABLE;
		return;
	}
	fpr.ReleaseSpillLocks();
}
Example #4
0
void
gen8_vec4_generator::generate_vec4_instruction(vec4_instruction *instruction,
                                               struct brw_reg dst,
                                               struct brw_reg *src)
{
   vec4_instruction *ir = (vec4_instruction *) instruction;

   if (dst.width == BRW_WIDTH_4) {
      /* This happens in attribute fixups for "dual instanced" geometry
       * shaders, since they use attributes that are vec4's.  Since the exec
       * width is only 4, it's essential that the caller set
       * force_writemask_all in order to make sure the instruction is executed
       * regardless of which channels are enabled.
       */
      assert(ir->force_writemask_all);

      /* Fix up any <8;8,1> or <0;4,1> source registers to <4;4,1> to satisfy
       * the following register region restrictions (from Graphics BSpec:
       * 3D-Media-GPGPU Engine > EU Overview > Registers and Register Regions
       * > Register Region Restrictions)
       *
       *     "1. ExecSize must be greater than or equal to Width.
       *
       *     2. If ExecSize = Width and HorzStride != 0, VertStride must be set
       *        to Width * HorzStride."
       */
      for (int i = 0; i < 3; i++) {
         if (src[i].file == BRW_GENERAL_REGISTER_FILE)
            src[i] = stride(src[i], 4, 4, 1);
      }
   }

   switch (ir->opcode) {
   case BRW_OPCODE_MOV:
      MOV(dst, src[0]);
      break;

   case BRW_OPCODE_ADD:
      ADD(dst, src[0], src[1]);
      break;

   case BRW_OPCODE_MUL:
      MUL(dst, src[0], src[1]);
      break;

   case BRW_OPCODE_MACH:
      MACH(dst, src[0], src[1]);
      break;

   case BRW_OPCODE_MAD:
      MAD(dst, src[0], src[1], src[2]);
      break;

   case BRW_OPCODE_FRC:
      FRC(dst, src[0]);
      break;

   case BRW_OPCODE_RNDD:
      RNDD(dst, src[0]);
      break;

   case BRW_OPCODE_RNDE:
      RNDE(dst, src[0]);
      break;

   case BRW_OPCODE_RNDZ:
      RNDZ(dst, src[0]);
      break;

   case BRW_OPCODE_AND:
      AND(dst, src[0], src[1]);
      break;

   case BRW_OPCODE_OR:
      OR(dst, src[0], src[1]);
      break;

   case BRW_OPCODE_XOR:
      XOR(dst, src[0], src[1]);
      break;

   case BRW_OPCODE_NOT:
      NOT(dst, src[0]);
      break;

   case BRW_OPCODE_ASR:
      ASR(dst, src[0], src[1]);
      break;

   case BRW_OPCODE_SHR:
      SHR(dst, src[0], src[1]);
      break;

   case BRW_OPCODE_SHL:
      SHL(dst, src[0], src[1]);
      break;

   case BRW_OPCODE_CMP:
      CMP(dst, ir->conditional_mod, src[0], src[1]);
      break;

   case BRW_OPCODE_SEL:
      SEL(dst, src[0], src[1]);
      break;

   case BRW_OPCODE_DPH:
      DPH(dst, src[0], src[1]);
      break;

   case BRW_OPCODE_DP4:
      DP4(dst, src[0], src[1]);
      break;

   case BRW_OPCODE_DP3:
      DP3(dst, src[0], src[1]);
      break;

   case BRW_OPCODE_DP2:
      DP2(dst, src[0], src[1]);
      break;

   case BRW_OPCODE_F32TO16:
      F32TO16(dst, src[0]);
      break;

   case BRW_OPCODE_F16TO32:
      F16TO32(dst, src[0]);
      break;

   case BRW_OPCODE_LRP:
      LRP(dst, src[0], src[1], src[2]);
      break;

   case BRW_OPCODE_BFREV:
      /* BFREV only supports UD type for src and dst. */
      BFREV(retype(dst, BRW_REGISTER_TYPE_UD),
            retype(src[0], BRW_REGISTER_TYPE_UD));
      break;

   case BRW_OPCODE_FBH:
      /* FBH only supports UD type for dst. */
      FBH(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
      break;

   case BRW_OPCODE_FBL:
      /* FBL only supports UD type for dst. */
      FBL(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
      break;

   case BRW_OPCODE_CBIT:
      /* CBIT only supports UD type for dst. */
      CBIT(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
      break;

   case BRW_OPCODE_ADDC:
      ADDC(dst, src[0], src[1]);
      break;

   case BRW_OPCODE_SUBB:
      SUBB(dst, src[0], src[1]);
      break;

   case BRW_OPCODE_BFE:
      BFE(dst, src[0], src[1], src[2]);
      break;

   case BRW_OPCODE_BFI1:
      BFI1(dst, src[0], src[1]);
      break;

   case BRW_OPCODE_BFI2:
      BFI2(dst, src[0], src[1], src[2]);
      break;

   case BRW_OPCODE_IF:
      IF(ir->predicate);
      break;

   case BRW_OPCODE_ELSE:
      ELSE();
      break;

   case BRW_OPCODE_ENDIF:
      ENDIF();
      break;

   case BRW_OPCODE_DO:
      DO();
      break;

   case BRW_OPCODE_BREAK:
      BREAK();
      break;

   case BRW_OPCODE_CONTINUE:
      CONTINUE();
      break;

   case BRW_OPCODE_WHILE:
      WHILE();
      break;

   case SHADER_OPCODE_RCP:
      MATH(BRW_MATH_FUNCTION_INV, dst, src[0]);
      break;

   case SHADER_OPCODE_RSQ:
      MATH(BRW_MATH_FUNCTION_RSQ, dst, src[0]);
      break;

   case SHADER_OPCODE_SQRT:
      MATH(BRW_MATH_FUNCTION_SQRT, dst, src[0]);
      break;

   case SHADER_OPCODE_EXP2:
      MATH(BRW_MATH_FUNCTION_EXP, dst, src[0]);
      break;

   case SHADER_OPCODE_LOG2:
      MATH(BRW_MATH_FUNCTION_LOG, dst, src[0]);
      break;

   case SHADER_OPCODE_SIN:
      MATH(BRW_MATH_FUNCTION_SIN, dst, src[0]);
      break;

   case SHADER_OPCODE_COS:
      MATH(BRW_MATH_FUNCTION_COS, dst, src[0]);
      break;

   case SHADER_OPCODE_POW:
      MATH(BRW_MATH_FUNCTION_POW, dst, src[0], src[1]);
      break;

   case SHADER_OPCODE_INT_QUOTIENT:
      MATH(BRW_MATH_FUNCTION_INT_DIV_QUOTIENT, dst, src[0], src[1]);
      break;

   case SHADER_OPCODE_INT_REMAINDER:
      MATH(BRW_MATH_FUNCTION_INT_DIV_REMAINDER, dst, src[0], src[1]);
      break;

   case SHADER_OPCODE_TEX:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
      generate_tex(ir, dst);
      break;

   case VS_OPCODE_URB_WRITE:
      generate_urb_write(ir, true);
      break;

   case SHADER_OPCODE_GEN4_SCRATCH_READ:
      generate_scratch_read(ir, dst, src[0]);
      break;

   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
      generate_scratch_write(ir, dst, src[0], src[1]);
      break;

   case VS_OPCODE_PULL_CONSTANT_LOAD:
   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
      generate_pull_constant_load(ir, dst, src[0], src[1]);
      break;

   case GS_OPCODE_URB_WRITE:
      generate_urb_write(ir, false);
      break;

   case GS_OPCODE_THREAD_END:
      generate_gs_thread_end(ir);
      break;

   case GS_OPCODE_SET_WRITE_OFFSET:
      generate_gs_set_write_offset(dst, src[0], src[1]);
      break;

   case GS_OPCODE_SET_VERTEX_COUNT:
      generate_gs_set_vertex_count(dst, src[0]);
      break;

   case GS_OPCODE_SET_DWORD_2_IMMED:
      generate_gs_set_dword_2_immed(dst, src[0]);
      break;

   case GS_OPCODE_PREPARE_CHANNEL_MASKS:
      generate_gs_prepare_channel_masks(dst);
      break;

   case GS_OPCODE_SET_CHANNEL_MASKS:
      generate_gs_set_channel_masks(dst, src[0]);
      break;

   case SHADER_OPCODE_SHADER_TIME_ADD:
      assert(!"XXX: Missing Gen8 vec4 support for INTEL_DEBUG=shader_time");
      break;

   case SHADER_OPCODE_UNTYPED_ATOMIC:
      assert(!"XXX: Missing Gen8 vec4 support for UNTYPED_ATOMIC");
      break;

   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
      assert(!"XXX: Missing Gen8 vec4 support for UNTYPED_SURFACE_READ");
      break;

   case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
      assert(!"VS_OPCODE_UNPACK_FLAGS_SIMD4X2 should not be used on Gen8+.");
      break;

   default:
      if (ir->opcode < (int) ARRAY_SIZE(opcode_descs)) {
         _mesa_problem(ctx, "Unsupported opcode in `%s' in VS\n",
                       opcode_descs[ir->opcode].name);
      } else {
         _mesa_problem(ctx, "Unsupported opcode %d in VS", ir->opcode);
      }
      abort();
   }
}
Example #5
0
bool
fs_visitor::opt_cse_local(bblock_t *block, exec_list *aeb)
{
   bool progress = false;

   void *mem_ctx = ralloc_context(this->mem_ctx);

   int ip = block->start_ip;
   for (fs_inst *inst = (fs_inst *)block->start;
	inst != block->end->next;
	inst = (fs_inst *) inst->next) {

      /* Only consider instructions we can safely CSE: pure expressions with
       * no predicate, no partial write, no conditional mod, and a
       * non-hardware-register destination.
       */
      if (is_expression(inst) &&
          !inst->predicate &&
          !inst->is_partial_write() &&
          !inst->conditional_mod &&
          inst->dst.file != HW_REG)
      {
	 bool found = false;

	 aeb_entry *entry;
	 foreach_list(entry_node, aeb) {
	    entry = (aeb_entry *) entry_node;

	    /* Match current instruction's expression against those in AEB. */
	    if (inst->opcode == entry->generator->opcode &&
		inst->saturate == entry->generator->saturate &&
                inst->dst.type == entry->generator->dst.type &&
                operands_match(inst->opcode, entry->generator->src, inst->src)) {

	       found = true;
	       progress = true;
	       break;
	    }
	 }

	 if (!found) {
	    /* Our first sighting of this expression.  Create an entry. */
	    aeb_entry *entry = ralloc(mem_ctx, aeb_entry);
	    entry->tmp = reg_undef;
	    entry->generator = inst;
	    aeb->push_tail(entry);
	 } else {
	    /* This is at least our second sighting of this expression.
	     * If we don't have a temporary already, make one.
	     */
	    bool no_existing_temp = entry->tmp.file == BAD_FILE;
	    if (no_existing_temp) {
               int written = entry->generator->regs_written;

               fs_reg orig_dst = entry->generator->dst;
               fs_reg tmp = fs_reg(GRF, virtual_grf_alloc(written),
                                   orig_dst.type);
               entry->tmp = tmp;
               entry->generator->dst = tmp;

               for (int i = 0; i < written; i++) {
                  fs_inst *copy = MOV(orig_dst, tmp);
                  copy->force_writemask_all =
                     entry->generator->force_writemask_all;
                  entry->generator->insert_after(copy);

                  orig_dst.reg_offset++;
                  tmp.reg_offset++;
               }
	    }

	    /* dest <- temp */
            int written = inst->regs_written;
            assert(written == entry->generator->regs_written);
            assert(inst->dst.type == entry->tmp.type);
            fs_reg dst = inst->dst;
            fs_reg tmp = entry->tmp;
            fs_inst *copy = NULL;
            for (int i = 0; i < written; i++) {
               copy = MOV(dst, tmp);
               copy->force_writemask_all = inst->force_writemask_all;
               inst->insert_before(copy);

               dst.reg_offset++;
               tmp.reg_offset++;
            }
            inst->remove();

	    /* Appending an instruction may have changed our bblock end. */
	    if (inst == block->end) {
	       block->end = copy;
	    }

	    /* Continue iteration with copy->next */
	    inst = copy;
	 }
      }
Example #6
0
void JitArm::SafeStoreFromReg(bool fastmem, s32 dest, u32 value, s32 regOffset, int accessSize, s32 offset)
{
	if (Core::g_CoreStartupParameter.bFastmem && fastmem)
	{
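		// Fastmem path: build the effective address in R10, put the value in R12, and emit a direct store.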
		ARMReg RA;
		ARMReg RB;
		ARMReg RS = gpr.R(value);

		if (dest != -1)
			RA = gpr.R(dest);

		if (regOffset != -1)
		{
			RB = gpr.R(regOffset);
			MOV(R10, RB);
			NOP(1);
		}
		else
			MOVI2R(R10, (u32)offset, false);

		if (dest != -1)
			ADD(R10, R10, RA);
		else
			NOP(1);

		MOV(R12, RS);
		UnsafeStoreFromReg(R10, R12, accessSize, 0);
		return;
	}
	ARMReg rA = gpr.GetReg();
	ARMReg rB = gpr.GetReg();
	ARMReg rC = gpr.GetReg();
	ARMReg RA;
	ARMReg RB;
	if (dest != -1)
		RA = gpr.R(dest);
	if (regOffset != -1)
		RB = gpr.R(regOffset);
	ARMReg RS = gpr.R(value);
	switch(accessSize)
	{
		case 32:
			MOVI2R(rA, (u32)&Memory::Write_U32);
		break;
		case 16:
			MOVI2R(rA, (u32)&Memory::Write_U16);
		break;
		case 8:
			MOVI2R(rA, (u32)&Memory::Write_U8);
		break;
	}
	MOV(rB, RS);
	if (regOffset == -1)
		MOVI2R(rC, offset);
	else
		MOV(rC, RB);
	if (dest != -1)
		ADD(rC, rC, RA);

	PUSH(4, R0, R1, R2, R3);
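	// Pass the value in R0 and the address in R1, then call the Memory::Write_U* handler selected above.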
	MOV(R0, rB);
	MOV(R1, rC);
	BL(rA);
	POP(4, R0, R1, R2, R3);
	gpr.Unlock(rA, rB, rC);
}
Example #7
0
void JitArm::stX(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITLoadStoreOff)

	u32 a = inst.RA, b = inst.RB, s = inst.RS;
	s32 offset = inst.SIMM_16;
	u32 accessSize = 0;
	s32 regOffset = -1;
	bool zeroA = true;
	bool update = false;
	bool fastmem = false;
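	// Decode the store variant; the update forms intentionally fall through to the matching non-update case to pick up the access size.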
	switch(inst.OPCD)
	{
		case 45: // sthu
			update = true;
		case 44: // sth
			accessSize = 16;
		break;
		case 31:
			switch (inst.SUBOP10)
			{
				case 183: // stwux
					zeroA = false;
					update = true;
				case 151: // stwx
					fastmem = true;
					accessSize = 32;
					regOffset = b;
				break;
				case 247: // stbux
					zeroA = false;
					update = true;
				case 215: // stbx
					accessSize = 8;
					regOffset = b;
				break;
				case 439: // sthux
					zeroA = false;
					update = true;
				case 407: // sthx
					accessSize = 16;
					regOffset = b;
				break;
			}
		break;
		case 37: // stwu
			update = true;
		case 36: // stw
			fastmem = true;
			accessSize = 32;
		break;
		case 39: // stbu
			update = true;
		case 38: // stb
			accessSize = 8;
		break;
	}
	SafeStoreFromReg(fastmem, zeroA ? a ? a : -1 : a, s, regOffset, accessSize, offset);
	if (update)
	{
		ARMReg rA = gpr.GetReg();
		ARMReg RB;
		ARMReg RA = gpr.R(a);
		if (regOffset != -1)
			RB = gpr.R(regOffset);
		// Check for DSI exception prior to writing back address
		LDR(rA, R9, PPCSTATE_OFF(Exceptions));
		CMP(rA, EXCEPTION_DSI);
		FixupBranch DoNotWrite = B_CC(CC_EQ);
		if (a)
		{
			if (regOffset == -1)
				MOVI2R(rA, offset);
			else
				MOV(rA, RB);
			ADD(RA, RA, rA);
		}
		else
			if (regOffset == -1)
				MOVI2R(RA, (u32)offset);
			else
				MOV(RA, RB);
		SetJumpTarget(DoNotWrite);
		gpr.Unlock(rA);
	}
}
Example #8
0
void Jit::Comp_mxc1(u32 op)
{
	CONDITIONAL_DISABLE;

	int fs = _FS;
	int rt = _RT;

	switch ((op >> 21) & 0x1f)
	{
	case 0: // R(rt) = FI(fs); break; //mfc1
		// Let's just go through RAM for now.
		fpr.FlushR(fs);
		gpr.MapReg(rt, MAP_DIRTY | MAP_NOINIT);
		LDR(gpr.R(rt), CTXREG, fpr.GetMipsRegOffset(fs));
		return;

	case 2: //cfc1
		if (fs == 31)
		{
			gpr.MapReg(rt, MAP_DIRTY | MAP_NOINIT);
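			// The condition bit is mirrored separately in fpcond, so rebuild fcr31 with fpcond inserted at bit 23.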
			LDR(R0, CTXREG, offsetof(MIPSState, fpcond));
			AND(R0, R0, Operand2(1)); // Just in case
			LDR(gpr.R(rt), CTXREG, offsetof(MIPSState, fcr31));
			BIC(gpr.R(rt), gpr.R(rt), Operand2(0x1 << 23));
			ORR(gpr.R(rt), gpr.R(rt), Operand2(R0, ST_LSL, 23));
		}
		else if (fs == 0)
		{
			gpr.MapReg(rt, MAP_DIRTY | MAP_NOINIT);
			LDR(gpr.R(rt), CTXREG, offsetof(MIPSState, fcr0));
		}
		return;

	case 4: //FI(fs) = R(rt);	break; //mtc1
		// Let's just go through RAM for now.
		gpr.FlushR(rt);
		fpr.MapReg(fs, MAP_DIRTY | MAP_NOINIT);
		VLDR(fpr.R(fs), CTXREG, gpr.GetMipsRegOffset(rt));
		return;

	case 6: //ctc1
		if (fs == 31)
		{
			gpr.MapReg(rt, 0);
			// Hardware rounding method.
			// Left here in case it is faster than conditional method.
			/*
			AND(R0, gpr.R(rt), Operand2(3));
			// MIPS Rounding Mode <-> ARM Rounding Mode
			//         0, 1, 2, 3 <->  0, 3, 1, 2
			CMP(R0, Operand2(1));
			SetCC(CC_EQ); ADD(R0, R0, Operand2(2));
			SetCC(CC_GT); SUB(R0, R0, Operand2(1));
			SetCC(CC_AL);

			// Load and Store RM to FPSCR
			VMRS(R1);
			BIC(R1, R1, Operand2(0x3 << 22));
			ORR(R1, R1, Operand2(R0, ST_LSL, 22));
			VMSR(R1);
			*/
			// Update MIPS state
			STR(gpr.R(rt), CTXREG, offsetof(MIPSState, fcr31));
			MOV(R0, Operand2(gpr.R(rt), ST_LSR, 23));
			AND(R0, R0, Operand2(1));
			STR(R0, CTXREG, offsetof(MIPSState, fpcond));
		}
		return;
	}
}
Example #9
0
void Jit64::reg_imm(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(Integer)
	u32 d = inst.RD, a = inst.RA, s = inst.RS;
	switch (inst.OPCD)
	{
	case 14:  // addi
		// occasionally used as MOV - emulate, with immediate propagation
		if (gpr.R(a).IsImm() && d != a && a != 0) {
			gpr.SetImmediate32(d, (u32)gpr.R(a).offset + (u32)(s32)(s16)inst.SIMM_16);
		} else if (inst.SIMM_16 == 0 && d != a && a != 0) {
			gpr.Lock(a, d);
			gpr.BindToRegister(d, false, true);
			MOV(32, gpr.R(d), gpr.R(a));
			gpr.UnlockAll();
		} else {
			regimmop(d, a, false, (u32)(s32)inst.SIMM_16,  Add, &XEmitter::ADD); //addi
		}
		break;
	case 15:
		if (a == 0) {	// lis
			// Merge with next instruction if loading a 32-bit immediate value (lis + addi, lis + ori)
			if (!js.isLastInstruction && !Core::g_CoreStartupParameter.bEnableDebugging) {
				if ((js.next_inst.OPCD == 14) && (js.next_inst.RD == d) && (js.next_inst.RA == d)) {      // addi
					gpr.SetImmediate32(d, ((u32)inst.SIMM_16 << 16) + (u32)(s32)js.next_inst.SIMM_16);
					js.downcountAmount++;
					js.skipnext = true;
					break;
				}
				else if ((js.next_inst.OPCD == 24) && (js.next_inst.RA == d) && (js.next_inst.RS == d))	{ // ori
					gpr.SetImmediate32(d, ((u32)inst.SIMM_16 << 16) | (u32)js.next_inst.UIMM);
					js.downcountAmount++;
					js.skipnext = true;
					break;
				}
			}

			// Not merged
			regimmop(d, a, false, (u32)inst.SIMM_16 << 16, Add, &XEmitter::ADD);
		}
		else {	// addis
			regimmop(d, a, false, (u32)inst.SIMM_16 << 16, Add, &XEmitter::ADD);
		}
		break;
	case 24: // ori
		if (a == 0 && s == 0 && inst.UIMM == 0 && !inst.Rc)
		{
			// Check for a NOP and make it visible in the generated code. Not much use, but interesting if we see one.
			NOP();
			return;
		}
		regimmop(a, s, true, inst.UIMM, Or, &XEmitter::OR);
		break;
	case 25: regimmop(a, s, true, inst.UIMM << 16, Or,  &XEmitter::OR, false); break;//oris
	case 28: regimmop(a, s, true, inst.UIMM,       And, &XEmitter::AND, true); break;
	case 29: regimmop(a, s, true, inst.UIMM << 16, And, &XEmitter::AND, true); break;
	case 26: regimmop(a, s, true, inst.UIMM,       Xor, &XEmitter::XOR, false); break; //xori
	case 27: regimmop(a, s, true, inst.UIMM << 16, Xor, &XEmitter::XOR, false); break; //xoris
	case 12: regimmop(d, a, false, (u32)(s32)inst.SIMM_16, Add, &XEmitter::ADD, false, true); break; //addic
	case 13: regimmop(d, a, true, (u32)(s32)inst.SIMM_16, Add, &XEmitter::ADD, true, true); break; //addic_rc
	default:
		Default(inst);
		break;
	}
}
Example #10
0
const u8 *Jit::DoJit(u32 em_address, JitBlock *b)
{
	js.cancel = false;
	js.blockStart = js.compilerPC = mips_->pc;
	js.nextExit = 0;
	js.downcountAmount = 0;
	js.curBlock = b;
	js.compiling = true;
	js.inDelaySlot = false;
	js.afterOp = JitState::AFTER_NONE;
	js.PrefixStart();

	// We add a check before the block, used when entering from a linked block.
	b->checkedEntry = GetCodePtr();
	// Downcount flag check. The last block decremented downcounter, and the flag should still be available.
	FixupBranch skip = J_CC(CC_NBE);
	MOV(32, M(&mips_->pc), Imm32(js.blockStart));
	JMP(asm_.outerLoop, true);  // downcount hit zero - go advance.
	SetJumpTarget(skip);

	b->normalEntry = GetCodePtr();

	MIPSAnalyst::AnalysisResults analysis = MIPSAnalyst::Analyze(em_address);

	gpr.Start(mips_, analysis);
	fpr.Start(mips_, analysis);

	js.numInstructions = 0;
	while (js.compiling) {
		// Jit breakpoints are quite fast, so let's do them in release too.
		CheckJitBreakpoint(js.compilerPC, 0);

		MIPSOpcode inst = Memory::Read_Instruction(js.compilerPC);
		js.downcountAmount += MIPSGetInstructionCycleEstimate(inst);

		MIPSCompileOp(inst);

		if (js.afterOp & JitState::AFTER_CORE_STATE) {
			// TODO: Save/restore?
			FlushAll();

			// If we're rewinding, CORE_NEXTFRAME should not cause a rewind.
			// It doesn't really matter either way if we're not rewinding.
			// CORE_RUNNING is <= CORE_NEXTFRAME.
			CMP(32, M((void*)&coreState), Imm32(CORE_NEXTFRAME));
			FixupBranch skipCheck = J_CC(CC_LE);
			if (js.afterOp & JitState::AFTER_REWIND_PC_BAD_STATE)
				MOV(32, M(&mips_->pc), Imm32(js.compilerPC));
			else
				MOV(32, M(&mips_->pc), Imm32(js.compilerPC + 4));
			WriteSyscallExit();
			SetJumpTarget(skipCheck);

			js.afterOp = JitState::AFTER_NONE;
		}

		js.compilerPC += 4;
		js.numInstructions++;

		// Safety check, in case we get a bunch of really large jit ops without a lot of branching.
		if (GetSpaceLeft() < 0x800)
		{
			FlushAll();
			WriteExit(js.compilerPC, js.nextExit++);
			js.compiling = false;
		}
	}

	b->codeSize = (u32)(GetCodePtr() - b->normalEntry);
	NOP();
	AlignCode4();
	b->originalSize = js.numInstructions;
	return b->normalEntry;
}
Example #11
0
void
gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
{
   struct brw_gs_prog_data *prog_data =
      (struct brw_gs_prog_data *) &c->prog_data;
   unsigned binding;
   unsigned num_bindings = prog_data->num_transform_feedback_bindings;
   src_reg sol_temp(this, glsl_type::uvec4_type);

   /* Check for buffer overflow: we need room to write the complete primitive
    * (all vertices). Otherwise, avoid writing any vertices for it
    */
   emit(ADD(dst_reg(sol_temp), this->sol_prim_written, 1u));
   emit(MUL(dst_reg(sol_temp), sol_temp, src_reg(num_verts)));
   emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* Avoid overwriting MRF 1 as it is used as URB write message header */
      dst_reg mrf_reg(MRF, 2);

      this->current_annotation = "gen6: emit SOL vertex data";
      /* For each vertex, generate code to output each varying using the
       * appropriate binding table entry.
       */
      for (binding = 0; binding < num_bindings; ++binding) {
         unsigned char varying =
            prog_data->transform_feedback_bindings[binding];

         /* Set up the correct destination index for this vertex */
         vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
                                       mrf_reg,
                                       this->destination_indices);
         inst->sol_vertex = vertex % num_verts;

         /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
          *
          *   "Prior to End of Thread with a URB_WRITE, the kernel must
          *   ensure that all writes are complete by sending the final
          *   write as a committed write."
          */
         bool final_write = binding == (unsigned) num_bindings - 1 &&
                            inst->sol_vertex == num_verts - 1;

         /* Compute offset of this varying for the current vertex
          * in vertex_output
          */
         this->current_annotation = output_reg_annotation[varying];
         src_reg data(this->vertex_output);
         data.reladdr = ralloc(mem_ctx, src_reg);
         int offset = get_vertex_output_offset_for_varying(vertex, varying);
         emit(MOV(dst_reg(this->vertex_output_offset), offset));
         memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
         data.type = output_reg[varying].type;

         /* PSIZ, LAYER and VIEWPORT are packed in different channels of the
          * same slot, so make sure we write the appropriate channel
          */
         if (varying == VARYING_SLOT_PSIZ)
            data.swizzle = BRW_SWIZZLE_WWWW;
         else if (varying == VARYING_SLOT_LAYER)
            data.swizzle = BRW_SWIZZLE_YYYY;
         else if (varying == VARYING_SLOT_VIEWPORT)
            data.swizzle = BRW_SWIZZLE_ZZZZ;
         else
            data.swizzle = prog_data->transform_feedback_swizzles[binding];

         /* Write data */
         inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
         inst->sol_binding = binding;
         inst->sol_final_write = final_write;

         if (final_write) {
            /* This is the last vertex of the primitive, so increment the SO
             * primitive counter and the destination indices.
             */
            emit(ADD(dst_reg(this->destination_indices),
                     this->destination_indices,
                     src_reg(num_verts)));
            emit(ADD(dst_reg(this->sol_prim_written),
                     this->sol_prim_written, 1u));
         }

      }
      this->current_annotation = NULL;
   }
   emit(BRW_OPCODE_ENDIF);
}
Example #12
0
void
gen6_gs_visitor::xfb_write()
{
   unsigned num_verts;
   struct brw_gs_prog_data *prog_data =
      (struct brw_gs_prog_data *) &c->prog_data;

   if (!prog_data->num_transform_feedback_bindings)
      return;

   switch (c->prog_data.output_topology) {
   case _3DPRIM_POINTLIST:
      num_verts = 1;
      break;
   case _3DPRIM_LINELIST:
   case _3DPRIM_LINESTRIP:
   case _3DPRIM_LINELOOP:
      num_verts = 2;
      break;
   case _3DPRIM_TRILIST:
   case _3DPRIM_TRIFAN:
   case _3DPRIM_TRISTRIP:
   case _3DPRIM_RECTLIST:
      num_verts = 3;
      break;
   case _3DPRIM_QUADLIST:
   case _3DPRIM_QUADSTRIP:
   case _3DPRIM_POLYGON:
      num_verts = 3;
      break;
   default:
      unreachable("Unexpected primitive type in Gen6 SOL program.");
   }

   this->current_annotation = "gen6 thread end: svb writes init";

   emit(MOV(dst_reg(this->vertex_output_offset), 0u));
   emit(MOV(dst_reg(this->sol_prim_written), 0u));

   /* Check that at least one primitive can be written
    *
    * Note: since we use the binding table to keep track of buffer offsets
    * and stride, the GS doesn't need to keep track of a separate pointer
    * into each buffer; it uses a single pointer which increments by 1 for
    * each vertex.  So we use SVBI0 for this pointer, regardless of whether
    * transform feedback is in interleaved or separate attribs mode.
    */
   src_reg sol_temp(this, glsl_type::uvec4_type);
   emit(ADD(dst_reg(sol_temp), this->svbi, src_reg(num_verts)));

   /* Compare SVBI calculated number with the maximum value, which is
    * in R1.4 (previously saved in this->max_svbi) for gen6.
    */
   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      src_reg destination_indices_uw =
         retype(destination_indices, BRW_REGISTER_TYPE_UW);

      vec4_instruction *inst = emit(MOV(dst_reg(destination_indices_uw),
                                        brw_imm_v(0x00020100))); /* (0, 1, 2) */
      inst->force_writemask_all = true;

      emit(ADD(dst_reg(this->destination_indices),
               this->destination_indices,
               this->svbi));
   }
   emit(BRW_OPCODE_ENDIF);

   /* Write transform feedback data for all processed vertices. */
   for (int i = 0; i < c->gp->program.VerticesOut; i++) {
      emit(MOV(dst_reg(sol_temp), i));
      emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
               BRW_CONDITIONAL_L));
      emit(IF(BRW_PREDICATE_NORMAL));
      {
         xfb_program(i, num_verts);
      }
      emit(BRW_OPCODE_ENDIF);
   }
}
Example #13
0
void
gen6_gs_visitor::emit_prolog()
{
   vec4_gs_visitor::emit_prolog();

   /* Gen6 geometry shaders must allocate an initial VUE handle via an FF_SYNC
    * message. However, the documentation remarks that only one thread can
    * write to the URB at a time, and the FF_SYNC message provides the
    * synchronization mechanism for this, so using it effectively stalls the
    * thread until it is its turn to write to the URB. Because of this, the
    * best way to implement geometry shader algorithms on gen6 is to execute
    * the algorithm before the FF_SYNC message, to maximize parallelism.
    *
    * To achieve this we buffer the geometry shader outputs for each emitted
    * vertex in vertex_output during operation. Then, when we have processed
    * the last vertex (that is, at thread end time), we send the FF_SYNC
    * message to allocate the initial VUE handle and write all buffered vertex
    * data to the URB in one go.
    *
    * For each emitted vertex, vertex_output will hold vue_map.num_slots
    * data items plus one additional item to hold required flags
    * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message)
    * which come right after the data items for that vertex. Vertex data and
    * flags for the next vertex come right after the data items and flags for
    * the previous vertex.
    */
   this->current_annotation = "gen6 prolog";
   this->vertex_output = src_reg(this,
                                 glsl_type::uint_type,
                                 (prog_data->vue_map.num_slots + 1) *
                                 c->gp->program.VerticesOut);
   this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
   emit(MOV(dst_reg(this->vertex_output_offset), src_reg(0u)));

   /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
    * so initialize it once to R0.
    */
   vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
                                     retype(brw_vec8_grf(0, 0),
                                            BRW_REGISTER_TYPE_UD)));
   inst->force_writemask_all = true;

   /* This will be used as a temporary to store writeback data of FF_SYNC
    * and URB_WRITE messages.
    */
   this->temp = src_reg(this, glsl_type::uint_type);

   /* This will be used to know when we are processing the first vertex of
    * a primitive. We will set this to URB_WRITE_PRIM_START only when we know
    * that we are processing the first vertex in the primitive and to zero
    * otherwise. This way we can use its value directly in the URB write
    * headers.
    */
   this->first_vertex = src_reg(this, glsl_type::uint_type);
   emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));

   /* The FF_SYNC message needs to know the number of primitives generated,
    * so keep a counter for this.
    */
   this->prim_count = src_reg(this, glsl_type::uint_type);
   emit(MOV(dst_reg(this->prim_count), 0u));

   if (c->prog_data.gen6_xfb_enabled) {
      /* Create a virtual register to hold destination indices in SOL */
      this->destination_indices = src_reg(this, glsl_type::uvec4_type);
      /* Create a virtual register to hold number of written primitives */
      this->sol_prim_written = src_reg(this, glsl_type::uint_type);
      /* Create a virtual register to hold Streamed Vertex Buffer Indices */
      this->svbi = src_reg(this, glsl_type::uvec4_type);
      /* Create a virtual register to hold max values of SVBI */
      this->max_svbi = src_reg(this, glsl_type::uvec4_type);
      emit(MOV(dst_reg(this->max_svbi),
               src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD))));

      xfb_setup();
   }

   /* PrimitiveID is delivered in r0.1 of the thread payload. If the program
    * needs it, we have to move it to a separate register where we can map
    * the attribute.
    *
    * Notice that we cannot use a virtual register for this, because we need to
    * map all input attributes to hardware registers in setup_payload(),
    * which happens before virtual registers are mapped to hardware registers.
    * We could work around that issue if we were able to compute the first
    * non-payload register here and move the PrimitiveID information to that
    * register, but we can't because at this point we don't know the final
    * number of uniforms that will be included in the payload.
    *
    * So we place the PrimitiveID information in r1, which is always
    * delivered as part of the payload, but it's only populated with data
    * relevant for transform feedback when we set GEN6_GS_SVBI_PAYLOAD_ENABLE
    * in the 3DSTATE_GS state packet. That information can be obtained by other
    * means though, so we can safely use r1 for this purpose.
    */
   if (c->prog_data.include_primitive_id) {
      this->primitive_id =
         src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
      emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
   }
}
Example #14
0
bool SamplerJitCache::Jit_GetTexDataSwizzled(const SamplerID &id, int bitsPerTexel) {
	if (bitsPerTexel == 4) {
		// Specialized implementation.
		return Jit_GetTexDataSwizzled4();
	}

	LEA(32, tempReg1, MScaled(vReg, SCALE_4, 0));
	AND(32, R(tempReg1), Imm8(31));
	AND(32, R(vReg), Imm8(~7));
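	// GE swizzled textures use 16-byte x 8-row blocks: tempReg1 now holds the
	// row offset within the block in 32-bit units ((v & 7) * 4), and vReg the
	// block row (v & ~7).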

	MOV(32, R(tempReg2), R(uReg));
	MOV(32, R(resultReg), R(uReg));
	switch (bitsPerTexel) {
	case 32:
		SHR(32, R(resultReg), Imm8(2));
		break;
	case 16:
		SHR(32, R(vReg), Imm8(1));
		SHR(32, R(tempReg2), Imm8(1));
		SHR(32, R(resultReg), Imm8(3));
		break;
	case 8:
		SHR(32, R(vReg), Imm8(2));
		SHR(32, R(tempReg2), Imm8(2));
		SHR(32, R(resultReg), Imm8(4));
		break;
	default:
		return false;
	}
	AND(32, R(tempReg2), Imm8(3));
	SHL(32, R(resultReg), Imm8(5));
	ADD(32, R(tempReg1), R(tempReg2));
	ADD(32, R(tempReg1), R(resultReg));

	// We may clobber srcReg in the MUL, so let's grab it now.
	LEA(64, tempReg1, MComplex(srcReg, tempReg1, SCALE_4, 0));

	LEA(32, EAX, MScaled(bufwReg, SCALE_4, 0));
	MUL(32, R(vReg));

	switch (bitsPerTexel) {
	case 32:
		MOV(bitsPerTexel, R(resultReg), MRegSum(tempReg1, EAX));
		break;
	case 16:
		AND(32, R(uReg), Imm8(1));
		// Multiply by two by just adding twice.
		ADD(32, R(EAX), R(uReg));
		ADD(32, R(EAX), R(uReg));
		MOVZX(32, bitsPerTexel, resultReg, MRegSum(tempReg1, EAX));
		break;
	case 8:
		AND(32, R(uReg), Imm8(3));
		ADD(32, R(EAX), R(uReg));
		MOVZX(32, bitsPerTexel, resultReg, MRegSum(tempReg1, EAX));
		break;
	default:
		return false;
	}

	return true;
}
Example #15
0
void Jit64::GetCarryEAXAndClear()
{
	MOV(32, R(EAX), M(&PowerPC::ppcState.spr[SPR_XER]));
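	// BTR copies XER[CA] (bit 29) into the host carry flag and clears it in EAX.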
	BTR(32, R(EAX), Imm8(29));
}
Example #16
0
LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
	_assert_msg_(G3D, id.linear, "Linear should be set on sampler id");
	BeginWrite();

	// We'll first write the nearest sampler, which we will CALL.
	// This may differ slightly based on the "linear" flag.
	const u8 *nearest = AlignCode16();

	if (!Jit_ReadTextureFormat(id)) {
		EndWrite();
		SetCodePtr(const_cast<u8 *>(nearest));
		return nullptr;
	}

	RET();

	// Now the actual linear func, which is exposed externally.
	const u8 *start = AlignCode16();

	// NOTE: This doesn't use the general register mapping.
	// POSIX: arg1=uptr, arg2=vptr, arg3=frac_u, arg4=frac_v, arg5=src, arg6=bufw, stack+8=level
	// Win64: arg1=uptr, arg2=vptr, arg3=frac_u, arg4=frac_v, stack+40=src, stack+48=bufw, stack+56=level
	//
	// We map these to nearest CALLs, with order: u, v, src, bufw, level

	// Let's start by saving a bunch of registers.
	PUSH(R15);
	PUSH(R14);
	PUSH(R13);
	PUSH(R12);
	// Won't need frac_u/frac_v for a while.
	PUSH(arg4Reg);
	PUSH(arg3Reg);
	// Extra space to restore alignment and save resultReg for lerp.
	// TODO: Maybe use XMMs instead?
	SUB(64, R(RSP), Imm8(24));

	MOV(64, R(R12), R(arg1Reg));
	MOV(64, R(R13), R(arg2Reg));
#ifdef _WIN32
	// First arg now starts at 24 (extra space) + 48 (pushed stack) + 8 (ret address) + 32 (shadow space)
	const int argOffset = 24 + 48 + 8 + 32;
	MOV(64, R(R14), MDisp(RSP, argOffset));
	MOV(32, R(R15), MDisp(RSP, argOffset + 8));
	// level is at argOffset + 16.
#else
	MOV(64, R(R14), R(arg5Reg));
	MOV(32, R(R15), R(arg6Reg));
	// level is at 24 + 48 + 8.
#endif

	// Early exit on !srcPtr.
	FixupBranch zeroSrc;
	if (id.hasInvalidPtr) {
		CMP(PTRBITS, R(R14), Imm8(0));
		FixupBranch nonZeroSrc = J_CC(CC_NZ);
		XOR(32, R(RAX), R(RAX));
		zeroSrc = J(true);
		SetJumpTarget(nonZeroSrc);
	}

	// At this point:
	// R12=uptr, R13=vptr, stack+24=frac_u, stack+32=frac_v, R14=src, R15=bufw, stack+X=level

	auto doNearestCall = [&](int off) {
		MOV(32, R(uReg), MDisp(R12, off));
		MOV(32, R(vReg), MDisp(R13, off));
		MOV(64, R(srcReg), R(R14));
		MOV(32, R(bufwReg), R(R15));
		// Leave level as-is; we just always load from RAM. Separate CLUTs are uncommon.

		CALL(nearest);
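		// Stash this texel's nearest-sample result on the stack; the four results are blended below.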
		MOV(32, MDisp(RSP, off), R(resultReg));
	};

	doNearestCall(0);
	doNearestCall(4);
	doNearestCall(8);
	doNearestCall(12);

	// Convert TL, TR, BL, BR to floats for easier blending.
	if (!cpu_info.bSSE4_1) {
		PXOR(XMM0, R(XMM0));
	}

	MOVD_xmm(fpScratchReg1, MDisp(RSP, 0));
	MOVD_xmm(fpScratchReg2, MDisp(RSP, 4));
	MOVD_xmm(fpScratchReg3, MDisp(RSP, 8));
	MOVD_xmm(fpScratchReg4, MDisp(RSP, 12));

	if (cpu_info.bSSE4_1) {
		PMOVZXBD(fpScratchReg1, R(fpScratchReg1));
		PMOVZXBD(fpScratchReg2, R(fpScratchReg2));
		PMOVZXBD(fpScratchReg3, R(fpScratchReg3));
		PMOVZXBD(fpScratchReg4, R(fpScratchReg4));
	} else {
		PUNPCKLBW(fpScratchReg1, R(XMM0));
		PUNPCKLBW(fpScratchReg2, R(XMM0));
		PUNPCKLBW(fpScratchReg3, R(XMM0));
		PUNPCKLBW(fpScratchReg4, R(XMM0));
		PUNPCKLWD(fpScratchReg1, R(XMM0));
		PUNPCKLWD(fpScratchReg2, R(XMM0));
		PUNPCKLWD(fpScratchReg3, R(XMM0));
		PUNPCKLWD(fpScratchReg4, R(XMM0));
	}
	CVTDQ2PS(fpScratchReg1, R(fpScratchReg1));
	CVTDQ2PS(fpScratchReg2, R(fpScratchReg2));
	CVTDQ2PS(fpScratchReg3, R(fpScratchReg3));
	CVTDQ2PS(fpScratchReg4, R(fpScratchReg4));

	// Okay, now multiply the R sides by frac_u, and L by (256 - frac_u)...
	MOVD_xmm(fpScratchReg5, MDisp(RSP, 24));
	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
	if (RipAccessible(by256)) {
		MULPS(fpScratchReg5, M(by256));  // rip accessible
	} else {
		Crash();  // TODO
	}
	MOVAPS(XMM0, M(ones));
	SUBPS(XMM0, R(fpScratchReg5));

	MULPS(fpScratchReg1, R(XMM0));
	MULPS(fpScratchReg2, R(fpScratchReg5));
	MULPS(fpScratchReg3, R(XMM0));
	MULPS(fpScratchReg4, R(fpScratchReg5));

	// Now set top=fpScratchReg1, bottom=fpScratchReg3.
	ADDPS(fpScratchReg1, R(fpScratchReg2));
	ADDPS(fpScratchReg3, R(fpScratchReg4));

	// Next, time for frac_v.
	MOVD_xmm(fpScratchReg5, MDisp(RSP, 32));
	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
	MULPS(fpScratchReg5, M(by256));
	MOVAPS(XMM0, M(ones));
	SUBPS(XMM0, R(fpScratchReg5));

	MULPS(fpScratchReg1, R(XMM0));
	MULPS(fpScratchReg3, R(fpScratchReg5));

	// Still at the 255 scale, now we're interpolated.
	ADDPS(fpScratchReg1, R(fpScratchReg3));

	// Time to convert back to a single 32 bit value.
	CVTPS2DQ(fpScratchReg1, R(fpScratchReg1));
	PACKSSDW(fpScratchReg1, R(fpScratchReg1));
	PACKUSWB(fpScratchReg1, R(fpScratchReg1));
	MOVD_xmm(R(resultReg), fpScratchReg1);

	if (id.hasInvalidPtr) {
		SetJumpTarget(zeroSrc);
	}

	ADD(64, R(RSP), Imm8(24));
	POP(arg3Reg);
	POP(arg4Reg);
	POP(R12);
	POP(R13);
	POP(R14);
	POP(R15);

	RET();

	EndWrite();
	return (LinearFunc)start;
}
Example #17
0
void decodeInstruction(instruction_t instruction, unsigned long *r[], unsigned long *bandera, unsigned long *PC, unsigned long *LR, uint8_t *memoria, unsigned long *codificacion)
{
    int auxban;
    unsigned long aux1, aux2, des;
	// Encoding of the ALU functions
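	// op1_type 'R' writes the result to r[op1_value]; 'N' does not store the
	// result (flags only). op2/op3 types: 'R' is a register operand, '#' an
	// immediate.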
	if(strcmp(instruction.mnemonic,"ADDS") == 0)
	{
		if(instruction.op1_type=='R')
        {
            if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' ))
            {
				r[instruction.op1_value]=ADD(r[instruction.op2_value],r[instruction.op3_value],&bandera);
			}
            if((instruction.op2_type== '#' )&&(instruction.op3_type =='R' ))
            {
				r[instruction.op1_value]=ADD(instruction.op2_value,r[instruction.op3_value],&bandera);
            }
            if((instruction.op2_type== 'R' )&&(instruction.op3_type =='#' ))
            {
				r[instruction.op1_value]=ADD(r[instruction.op2_value],instruction.op3_value,&bandera);
            }
            if((instruction.op2_type== '#' )&&(instruction.op3_type =='#' ))
			{
				r[instruction.op1_value]=ADD(instruction.op2_value,instruction.op3_value,&bandera);
            }
            mostrar(r[instruction.op1_value]);
        }
		if(instruction.op1_type=='N')
		{
			if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' ))
			{
				ADD(r[instruction.op2_value],r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' ))
            {
				ADD(instruction.op2_value,r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' ))
            {
				ADD(r[instruction.op2_value],instruction.op3_value,&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' ))
            {
				ADD(instruction.op2_value,instruction.op3_value,&bandera);
            }
			mostrar(r[instruction.op1_value]);
		}
	}
	if(strcmp(instruction.mnemonic,"CMN") == 0)
	{
        if((instruction.op1_type== 'R' )&&(instruction.op2_type =='R' ))
        {
            ADD(r[instruction.op1_value],r[instruction.op2_value],&bandera);
        }
        if((instruction.op1_type == '#' )&&(instruction.op2_type== 'R' ))
        {
            ADD(instruction.op1_value,r[instruction.op2_value],&bandera);
        }
        if((instruction.op1_type == 'R' )&&(instruction.op2_type == '#' ))
        {
            ADD(r[instruction.op1_value],instruction.op2_value,&bandera);
        }
        if((instruction.op1_type == '#' )&&(instruction.op2_type == '#' ))
        {
            ADD(instruction.op1_value,instruction.op2_value,&bandera);
        }
	}
	if( strcmp(instruction.mnemonic,"ADCS") == 0)
	{
		if(instruction.op1_type=='R')
		{
            if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' ))
			{
				r[instruction.op1_value]=ADC(r[instruction.op2_value],r[instruction.op3_value],&bandera);
            }
            if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' ))
			{
				r[instruction.op1_value]=ADC(instruction.op2_value,r[instruction.op3_value],&bandera);
            }
            if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' ))
			{
                r[instruction.op1_value]=ADC(r[instruction.op2_value],instruction.op3_value,&bandera);
            }
            if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' ))
            {
				r[instruction.op1_value]=ADC(instruction.op2_value,instruction.op3_value,&bandera);
            }
			mostrar(r[instruction.op1_value]);
		}
        if(instruction.op1_type=='N')
        {
            if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' ))
            {
				ADC(r[instruction.op2_value],r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' ))
            {
				ADC(instruction.op2_value,r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' ))
            {
				ADC(r[instruction.op2_value],instruction.op3_value,&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' ))
            {
				ADC(instruction.op2_value,instruction.op3_value,&bandera);
            }
            mostrar(r[instruction.op1_value]);
		}
    }
    if( strcmp(instruction.mnemonic,"ANDS") == 0)
    {
        if(instruction.op1_type=='R')
        {
            if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' ))
            {
                r[instruction.op1_value]=AND(r[instruction.op2_value],r[instruction.op3_value],&bandera);
            }
            if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' ))
            {
                r[instruction.op1_value]=AND(instruction.op2_value,r[instruction.op3_value],&bandera);
			}
            if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' ))
            {
                r[instruction.op1_value]=AND(r[instruction.op2_value],instruction.op3_value,&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' ))
            {
                r[instruction.op1_value]=AND(instruction.op2_value,instruction.op3_value,&bandera);
            }
           mostrar(r[instruction.op1_value]);
		}
        if(instruction.op1_type=='N')
        {
			if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' ))
			{
                AND(r[instruction.op2_value],r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' ))
            {
                AND(instruction.op2_value,r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' ))
            {
                AND(r[instruction.op2_value],instruction.op3_value,&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' ))
            {
                AND(instruction.op2_value,instruction.op3_value,&bandera);
            }
            mostrar(r[instruction.op1_value]);
		}
    }
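    // TEST ANDs the two operands only to update the flags (bandera); the result is not stored.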
    if(strcmp(instruction.mnemonic,"TEST") == 0)
    {
        if((instruction.op1_type== 'R' )&&(instruction.op2_type =='R' ))
        {
            AND(r[instruction.op1_value],r[instruction.op2_value],&bandera);
        }
        if((instruction.op1_type == '#' )&&(instruction.op2_type== 'R' ))
        {
            AND(instruction.op1_value,r[instruction.op2_value],&bandera);
        }
        if((instruction.op1_type == 'R' )&&(instruction.op2_type == '#' ))
        {
            AND(r[instruction.op1_value],instruction.op2_value,&bandera);
        }
        if((instruction.op1_type == '#' )&&(instruction.op2_type == '#' ))
        {
            AND(instruction.op1_value,instruction.op2_value,&bandera);
        }
    }
    if( strcmp(instruction.mnemonic,"EORS") == 0)
    {
        if(instruction.op1_type=='R')
       {
            if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' ))
			{
                r[instruction.op1_value]=EOR(r[instruction.op2_value],r[instruction.op3_value],&bandera);
            }
            if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' ))
			{
                r[instruction.op1_value]=EOR(instruction.op2_value,r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' ))
            {
                r[instruction.op1_value]=EOR(r[instruction.op2_value],instruction.op3_value,&bandera);
            }
            if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' ))
            {
                r[instruction.op1_value]=EOR(instruction.op2_value,instruction.op3_value,&bandera);
            }
           mostrar(r[instruction.op1_value]);
		}
        if(instruction.op1_type=='N')
        {
            if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' ))
            {
                EOR(r[instruction.op2_value],r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' ))
            {
				EOR(instruction.op2_value,r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' ))
            {
				EOR(r[instruction.op2_value],instruction.op3_value,&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' ))
            {
                EOR(instruction.op2_value,instruction.op3_value,&bandera);
            }
			mostrar(r[instruction.op1_value]);
		}
    }
    if( (strcmp(instruction.mnemonic,"MOVS") == 0)||(strcmp(instruction.mnemonic,"MOV") == 0))
    {

		if((instruction.op1_type == 'R')&&(instruction.op2_type=='R') )
		{
			r[instruction.op1_value]=MOV(r[instruction.op1_value],r[instruction.op2_value],&bandera);
			mostrar(r[instruction.op1_value]);
		}
        if((instruction.op1_type == 'R')&&(instruction.op2_type=='#') )
        {
            r[instruction.op1_value]=MOV(r[instruction.op1_value],instruction.op2_value,&bandera);
            mostrar(r[instruction.op1_value]);
        }
    }
    if( strcmp(instruction.mnemonic,"ORRS") == 0)
    {
        if(instruction.op1_type=='R')
        {
            if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' ))
            {
                r[instruction.op1_value]=ORR(r[instruction.op2_value],r[instruction.op3_value],&bandera);
			}
            if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' ))
            {
                r[instruction.op1_value]=ORR(instruction.op2_value,r[instruction.op3_value],&bandera);
            }
            if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' ))
            {
                r[instruction.op1_value]=ORR(r[instruction.op2_value],instruction.op3_value,&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' ))
            {
                r[instruction.op1_value]=ORR(instruction.op2_value,instruction.op3_value,&bandera);
            }
			mostrar(r[instruction.op1_value]);
		}
        if(instruction.op1_type=='N')
        {
            if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' ))
            {
                ORR(r[instruction.op2_value],r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' ))
            {
                ORR(instruction.op2_value,r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' ))
            {
				ORR(r[instruction.op2_value],instruction.op3_value,&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' ))
            {
                ORR(instruction.op2_value,instruction.op3_value,&bandera);
            }
			mostrar(r[instruction.op1_value]);
        }
    }
	if( strcmp(instruction.mnemonic,"SUBS") == 0)
    {
        if(instruction.op1_type== 'R' )
		{
            if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' ))
            {
                r[instruction.op1_value]=SUB(r[instruction.op2_value],r[instruction.op3_value],&bandera);
            }
            if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' ))
            {
                r[instruction.op1_value]=SUB(instruction.op2_value,r[instruction.op3_value],&bandera);
            }
           if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' ))
            {
                r[instruction.op1_value]=SUB(r[instruction.op2_value],instruction.op3_value,&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' ))
            {
                r[instruction.op1_value]=SUB(instruction.op2_value,instruction.op3_value,&bandera);
            }
			mostrar(r[instruction.op1_value]);
		}
        if(instruction.op1_type=='N')
        {
            if((instruction.op2_type== 'R' )&&(instruction.op3_type =='R' ))
			{
                SUB(r[instruction.op2_value],r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type== 'R' ))
            {
                SUB(instruction.op2_value,r[instruction.op3_value],&bandera);
            }
			if((instruction.op2_type == 'R' )&&(instruction.op3_type == '#' ))
            {
                SUB(r[instruction.op2_value],instruction.op3_value,&bandera);
            }
			if((instruction.op2_type == '#' )&&(instruction.op3_type == '#' ))
            {
                SUB(instruction.op2_value,instruction.op3_value,&bandera);
            }
			mostrar(r[instruction.op1_value]);
        }
    }
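    // CMP subtracts the second operand from the first only to update the flags (bandera); the result is not stored.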
    if(strcmp(instruction.mnemonic,"CMP") == 0)
    {
        if((instruction.op1_type== 'R' )&&(instruction.op2_type =='R' ))
        {
            SUB(r[instruction.op1_value],r[instruction.op2_value],&bandera);
        }
        if((instruction.op1_type == '#' )&&(instruction.op2_type== 'R' ))
        {
            SUB(instruction.op1_value,r[instruction.op2_value],&bandera);
        }
        if((instruction.op1_type == 'R' )&&(instruction.op2_type == '#' ))
		{
            SUB(r[instruction.op1_value],instruction.op2_value,&bandera);
        }
        if((instruction.op1_type == '#' )&&(instruction.op2_type == '#' ))
        {
            SUB(instruction.op1_value,instruction.op2_value,&bandera);
        }
    }
    //                 decoding of the branch instructions
	if(strcmp(instruction.mnemonic,"B")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(28<<11)+(instruction.op1_value);
			B(&PC,instruction.op1_value);
		}
	}
	if(strcmp(instruction.mnemonic,"BEQ")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(instruction.op1_value);
			BEQ(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BNE")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(1<<8)+(instruction.op1_value);
			BNE(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BCS")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(2<<8)+(instruction.op1_value);
			BCS(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BCC")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(3<<8)+(instruction.op1_value);
			BCC(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BMI")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(4<<8)+(instruction.op1_value);
			BMI(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BPL")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(5<<8)+(instruction.op1_value);
			BPL(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BVS")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(6<<8)+(instruction.op1_value);
			BVS(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BVC")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(7<<8)+(instruction.op1_value);
			BVC(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BHI")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(8<<8)+(instruction.op1_value);
			BHI(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BLS")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(9<<8)+(instruction.op1_value);
			BLS(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BGE")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(10<<8)+(instruction.op1_value);
			BGE(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BLT")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(11<<8)+(instruction.op1_value);
			BLT(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BGT")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(12<<8)+(instruction.op1_value);
			BGT(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BLE")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(13<<11)+(13<<8)+(instruction.op1_value);
			BLE(&PC,instruction.op1_value,&bandera);
		}
	}
	if(strcmp(instruction.mnemonic,"BL")==0)
	{
		if(instruction.op1_type=='#')
		{
			*codificacion=(31<<11)+(2047&instruction.op1_value+(((1<<31)&instruction.op1_value)>>20));
			BL(&PC,instruction.op1_value,&LR);
		}
	}
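	/* A hedged sketch, not part of the original program: the fourteen
	 * conditional-branch handlers above all produce the same encoding,
	 * (13<<11) + (cond<<8) + imm, where cond is the ARM condition number
	 * (EQ=0, NE=1, ... LE=13).  The table and loop below are illustrative
	 * only; they cover the *codificacion assignment, not the PC-update
	 * calls (BEQ, BNE, ...).
	 */
	{
		static const struct { const char *mn; unsigned cond; } cond_branch[] = {
			{"BEQ", 0}, {"BNE", 1}, {"BCS", 2}, {"BCC", 3},
			{"BMI", 4}, {"BPL", 5}, {"BVS", 6}, {"BVC", 7},
			{"BHI", 8}, {"BLS", 9}, {"BGE", 10}, {"BLT", 11},
			{"BGT", 12}, {"BLE", 13},
		};
		unsigned i;
		for (i = 0; i < sizeof(cond_branch) / sizeof(cond_branch[0]); i++) {
			if ((strcmp(instruction.mnemonic, cond_branch[i].mn) == 0) &&
			    (instruction.op1_type == '#')) {
				*codificacion = (13 << 11) + (cond_branch[i].cond << 8)
				              + instruction.op1_value;
			}
		}
	}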
Example #18
0
void
vec4_gs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
{
   dst_reg dest;
   src_reg src;

   switch (instr->intrinsic) {
   case nir_intrinsic_load_per_vertex_input: {
      /* The EmitNoIndirectInput flag guarantees our vertex index will
       * be constant.  We should handle indirects someday.
       */
      nir_const_value *vertex = nir_src_as_const_value(instr->src[0]);
      nir_const_value *offset = nir_src_as_const_value(instr->src[1]);

      /* Make up a type...we have no way of knowing... */
      const glsl_type *const type = glsl_type::ivec(instr->num_components);

      src = src_reg(ATTR, BRW_VARYING_SLOT_COUNT * vertex->u32[0] +
                          instr->const_index[0] + offset->u32[0],
                    type);
      src.swizzle = BRW_SWZ_COMP_INPUT(nir_intrinsic_component(instr));

      /* gl_PointSize is passed in the .w component of the VUE header */
      if (instr->const_index[0] == VARYING_SLOT_PSIZ)
         src.swizzle = BRW_SWIZZLE_WWWW;

      dest = get_nir_dest(instr->dest, src.type);
      dest.writemask = brw_writemask_for_size(instr->num_components);
      emit(MOV(dest, src));
      break;
   }

   case nir_intrinsic_load_input:
      unreachable("nir_lower_io should have produced per_vertex intrinsics");

   case nir_intrinsic_emit_vertex_with_counter: {
      this->vertex_count =
         retype(get_nir_src(instr->src[0], 1), BRW_REGISTER_TYPE_UD);
      int stream_id = instr->const_index[0];
      gs_emit_vertex(stream_id);
      break;
   }

   case nir_intrinsic_end_primitive_with_counter:
      this->vertex_count =
         retype(get_nir_src(instr->src[0], 1), BRW_REGISTER_TYPE_UD);
      gs_end_primitive();
      break;

   case nir_intrinsic_set_vertex_count:
      this->vertex_count =
         retype(get_nir_src(instr->src[0], 1), BRW_REGISTER_TYPE_UD);
      break;

   case nir_intrinsic_load_primitive_id:
      assert(gs_prog_data->include_primitive_id);
      dest = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
      emit(MOV(dest, retype(brw_vec4_grf(1, 0), BRW_REGISTER_TYPE_D)));
      break;

   case nir_intrinsic_load_invocation_id: {
      src_reg invocation_id =
         src_reg(nir_system_values[SYSTEM_VALUE_INVOCATION_ID]);
      assert(invocation_id.file != BAD_FILE);
      dest = get_nir_dest(instr->dest, invocation_id.type);
      emit(MOV(dest, invocation_id));
      break;
   }

   default:
      vec4_visitor::nir_emit_intrinsic(instr);
   }
}
Example #19
0
/**
 * Write out a batch of 32 control data bits from the control_data_bits
 * register to the URB.
 *
 * The current value of the vertex_count register determines which DWORD in
 * the URB receives the control data bits.  The control_data_bits register is
 * assumed to contain the correct data for the vertex that was most recently
 * output, and all previous vertices that share the same DWORD.
 *
 * This function takes care of ensuring that if no vertices have been output
 * yet, no control bits are emitted.
 */
void
vec4_gs_visitor::emit_control_data_bits()
{
    assert(c->control_data_bits_per_vertex != 0);

    /* Since the URB_WRITE_OWORD message operates with 128-bit (vec4 sized)
     * granularity, we need to use two tricks to ensure that the batch of 32
     * control data bits is written to the appropriate DWORD in the URB.  To
     * select which vec4 we are writing to, we use the "slot {0,1} offset"
     * fields of the message header.  To select which DWORD in the vec4 we are
     * writing to, we use the channel mask fields of the message header.  To
     * avoid penalizing geometry shaders that emit a small number of vertices
     * with extra bookkeeping, we only do each of these tricks when
     * c->prog_data.control_data_header_size_bits is large enough to make it
     * necessary.
     *
     * Note: this means that if we're outputting just a single DWORD of control
     * data bits, we'll actually replicate it four times since we won't do any
     * channel masking.  But that's not a problem since in this case the
     * hardware only pays attention to the first DWORD.
     */
    enum brw_urb_write_flags urb_write_flags = BRW_URB_WRITE_OWORD;
    if (c->control_data_header_size_bits > 32)
        urb_write_flags = urb_write_flags | BRW_URB_WRITE_USE_CHANNEL_MASKS;
    if (c->control_data_header_size_bits > 128)
        urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET;

    /* If we are using either channel masks or a per-slot offset, then we
     * need to figure out which DWORD we are trying to write to, using the
     * formula:
     *
     *     dword_index = (vertex_count - 1) * bits_per_vertex / 32
     *
     * Since bits_per_vertex is a power of two, and is known at compile
     * time, this can be optimized to:
     *
     *     dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
     */
    src_reg dword_index(this, glsl_type::uint_type);
    if (urb_write_flags) {
        src_reg prev_count(this, glsl_type::uint_type);
        emit(ADD(dst_reg(prev_count), this->vertex_count,
                 brw_imm_ud(0xffffffffu)));
        unsigned log2_bits_per_vertex =
            util_last_bit(c->control_data_bits_per_vertex);
        emit(SHR(dst_reg(dword_index), prev_count,
                 brw_imm_ud(6 - log2_bits_per_vertex)));
    }

    /* Start building the URB write message.  The first MRF gets a copy of
     * R0.
     */
    int base_mrf = 1;
    dst_reg mrf_reg(MRF, base_mrf);
    src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
    vec4_instruction *inst = emit(MOV(mrf_reg, r0));
    inst->force_writemask_all = true;

    if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) {
        /* Set the per-slot offset to dword_index / 4, so that we'll write to
         * the appropriate OWORD within the control data header.
         */
        src_reg per_slot_offset(this, glsl_type::uint_type);
        emit(SHR(dst_reg(per_slot_offset), dword_index, brw_imm_ud(2u)));
        emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset,
             brw_imm_ud(1u));
    }

    if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) {
        /* Set the channel masks to 1 << (dword_index % 4), so that we'll
         * write to the appropriate DWORD within the OWORD.  We need to do
         * this computation with force_writemask_all, otherwise garbage data
         * from invocation 0 might clobber the mask for invocation 1 when
         * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks
         * together.
         */
        src_reg channel(this, glsl_type::uint_type);
        inst = emit(AND(dst_reg(channel), dword_index, brw_imm_ud(3u)));
        inst->force_writemask_all = true;
        src_reg one(this, glsl_type::uint_type);
        inst = emit(MOV(dst_reg(one), brw_imm_ud(1u)));
        inst->force_writemask_all = true;
        src_reg channel_mask(this, glsl_type::uint_type);
        inst = emit(SHL(dst_reg(channel_mask), one, channel));
        inst->force_writemask_all = true;
        emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask),
             channel_mask);
        emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask);
    }

    /* Store the control data bits in the message payload and send it. */
    dst_reg mrf_reg2(MRF, base_mrf + 1);
    inst = emit(MOV(mrf_reg2, this->control_data_bits));
    inst->force_writemask_all = true;
    inst = emit(GS_OPCODE_URB_WRITE);
    inst->urb_write_flags = urb_write_flags;
    /* We need to increment Global Offset by 256-bits to make room for
     * Broadwell's extra "Vertex Count" payload at the beginning of the
     * URB entry.  Since this is an OWord message, Global Offset is counted
     * in 128-bit units, so we must set it to 2.
     */
    if (devinfo->gen >= 8 && gs_prog_data->static_vertex_count == -1)
        inst->offset = 2;
    inst->base_mrf = base_mrf;
    inst->mlen = 2;
}
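/* A minimal CPU-side sketch (not part of the driver) of the addressing math the
 * comments above describe: dword_index picks the DWORD that receives the control
 * data bits, the per-slot offset picks the OWORD (dword_index / 4), and the
 * channel mask picks the DWORD within that OWORD (1 << (dword_index % 4)).
 * The struct and function names are illustrative.
 */
struct cdb_address {
    unsigned dword_index;      /* which DWORD of the control data header      */
    unsigned per_slot_offset;  /* which OWORD (vec4): dword_index / 4         */
    unsigned channel_mask;     /* which DWORD in that OWORD: 1 << (index % 4) */
};

static struct cdb_address
cdb_addressing(unsigned vertex_count, unsigned bits_per_vertex)
{
    /* dword_index = (vertex_count - 1) * bits_per_vertex / 32 */
    struct cdb_address a;
    a.dword_index     = (vertex_count - 1) * bits_per_vertex / 32;
    a.per_slot_offset = a.dword_index / 4;
    a.channel_mask    = 1u << (a.dword_index % 4);
    return a;
}
/* Example: vertex_count = 40, bits_per_vertex = 2 gives dword_index = 2,
 * per_slot_offset = 0 and channel_mask = 0x4 (the third DWORD of the first OWORD).
 */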
Example #20
0
void JitArm::lXX(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITLoadStoreOff)

	u32 a = inst.RA, b = inst.RB, d = inst.RD;
	s32 offset = inst.SIMM_16;
	u32 accessSize = 0;
	s32 offsetReg = -1;
	bool zeroA = true;
	bool update = false;
	bool signExtend = false;
	bool reverse = false;
	bool fastmem = false;

	switch(inst.OPCD)
	{
		case 31:
			switch(inst.SUBOP10)
			{
				case 55: // lwzux
					zeroA = false;
					update = true;
				case 23: // lwzx
					accessSize = 32;
					offsetReg = b;
				break;
				case 119: //lbzux
					zeroA = false;
					update = true;
				case 87: // lbzx
					accessSize = 8;
					offsetReg = b;
				break;
				case 311: // lhzux
					zeroA = false;
					update = true;
				case 279: // lhzx
					accessSize = 16;
					offsetReg = b;
				break;
				case 375: // lhaux
					zeroA = false;
					update = true;
				case 343: // lhax
					accessSize = 16;
					signExtend = true;
					offsetReg = b;
				break;
				case 534: // lwbrx
					accessSize = 32;
					reverse = true;
				break;
				case 790: // lhbrx
					accessSize = 16;
					reverse = true;
				break;
			}
		break;
		case 33: // lwzu
			zeroA = false;
			update = true;
		case 32: // lwz
			fastmem = true;
			accessSize = 32;
		break;
		case 35: // lbzu
			zeroA = false;
			update = true;
		case 34: // lbz
			fastmem = true;
			accessSize = 8;
		break;
		case 41: // lhzu
			zeroA = false;
			update = true;
		case 40: // lhz
			fastmem = true;
			accessSize = 16;
		break;
		case 43: // lhau
			zeroA = false;
			update = true;
		case 42: // lha
			signExtend = true;
			accessSize = 16;
		break;
	}

	// Check for exception before loading
	ARMReg rA = gpr.GetReg(false);

	LDR(rA, R9, PPCSTATE_OFF(Exceptions));
	CMP(rA, EXCEPTION_DSI);
	FixupBranch DoNotLoad = B_CC(CC_EQ);

	SafeLoadToReg(fastmem, d, zeroA ? a ? a : -1 : a, offsetReg, accessSize, offset, signExtend, reverse);

	if (update)
	{
		rA = gpr.GetReg(false);
		ARMReg RA = gpr.R(a);
		if (offsetReg == -1)
			MOVI2R(rA, offset);
		else
			MOV(RA, gpr.R(offsetReg));
		ADD(RA, RA, rA);
	}

	SetJumpTarget(DoNotLoad);

	// LWZ idle skipping
	if (SConfig::GetInstance().m_LocalCoreStartupParameter.bSkipIdle &&
		inst.OPCD == 32 &&
		(inst.hex & 0xFFFF0000) == 0x800D0000 &&
		(Memory::ReadUnchecked_U32(js.compilerPC + 4) == 0x28000000 ||
		(SConfig::GetInstance().m_LocalCoreStartupParameter.bWii && Memory::ReadUnchecked_U32(js.compilerPC + 4) == 0x2C000000)) &&
		Memory::ReadUnchecked_U32(js.compilerPC + 8) == 0x4182fff8)
	{
		ARMReg RD = gpr.R(d);
		gpr.Flush();
		fpr.Flush();

		// if it's still 0, we can wait until the next event
		TST(RD, RD);
		FixupBranch noIdle = B_CC(CC_NEQ);
		rA = gpr.GetReg();

		MOVI2R(rA, (u32)&PowerPC::OnIdle);
		MOVI2R(R0, PowerPC::ppcState.gpr[a] + (s32)(s16)inst.SIMM_16);
		BL(rA);

		gpr.Unlock(rA);
		WriteExceptionExit();

		SetJumpTarget(noIdle);

		//js.compilerPC += 8;
		return;
	}

}
Example #21
0
void
vec4_gs_visitor::gs_emit_vertex(int stream_id)
{
    this->current_annotation = "emit vertex: safety check";

    /* Haswell and later hardware ignores the "Render Stream Select" bits
     * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
     * and instead sends all primitives down the pipeline for rasterization.
     * If the SOL stage is enabled, "Render Stream Select" is honored and
     * primitives bound to non-zero streams are discarded after stream output.
     *
     * Since the only purpose of primitives sent to non-zero streams is to
     * be recorded by transform feedback, we can simply discard all geometry
     * bound to these streams when transform feedback is disabled.
     */
    if (stream_id > 0 && !nir->info->has_transform_feedback_varyings)
        return;

    /* If we're outputting 32 control data bits or less, then we can wait
     * until the shader is over to output them all.  Otherwise we need to
     * output them as we go.  Now is the time to do it, since we're about to
     * output the vertex_count'th vertex, so it's guaranteed that the
     * control data bits associated with the (vertex_count - 1)th vertex are
     * correct.
     */
    if (c->control_data_header_size_bits > 32) {
        this->current_annotation = "emit vertex: emit control data bits";
        /* Only emit control data bits if we've finished accumulating a batch
         * of 32 bits.  This is the case when:
         *
         *     (vertex_count * bits_per_vertex) % 32 == 0
         *
         * (in other words, when the last 5 bits of vertex_count *
         * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
         * integer n (which is always the case, since bits_per_vertex is
         * always 1 or 2), this is equivalent to requiring that the last 5-n
         * bits of vertex_count are 0:
         *
         *     vertex_count & (2^(5-n) - 1) == 0
         *
         * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
         * equivalent to:
         *
         *     vertex_count & (32 / bits_per_vertex - 1) == 0
         */
        vec4_instruction *inst =
            emit(AND(dst_null_ud(), this->vertex_count,
                     brw_imm_ud(32 / c->control_data_bits_per_vertex - 1)));
        inst->conditional_mod = BRW_CONDITIONAL_Z;

        emit(IF(BRW_PREDICATE_NORMAL));
        {
            /* If vertex_count is 0, then no control data bits have been
             * accumulated yet, so we skip emitting them.
             */
            emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u),
                     BRW_CONDITIONAL_NEQ));
            emit(IF(BRW_PREDICATE_NORMAL));
            emit_control_data_bits();
            emit(BRW_OPCODE_ENDIF);

            /* Reset control_data_bits to 0 so we can start accumulating a new
             * batch.
             *
             * Note: in the case where vertex_count == 0, this neutralizes the
             * effect of any call to EndPrimitive() that the shader may have
             * made before outputting its first vertex.
             */
            inst = emit(MOV(dst_reg(this->control_data_bits), brw_imm_ud(0u)));
            inst->force_writemask_all = true;
        }
        emit(BRW_OPCODE_ENDIF);
    }

    this->current_annotation = "emit vertex: vertex data";
    emit_vertex();

    /* In stream mode we have to set control data bits for all vertices
     * unless we have disabled control data bits completely (which we do
     * do for GL_POINTS outputs that don't use streams).
     */
    if (c->control_data_header_size_bits > 0 &&
            gs_prog_data->control_data_format ==
            GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
        this->current_annotation = "emit vertex: Stream control data bits";
        set_stream_control_data_bits(stream_id);
    }

    this->current_annotation = NULL;
}
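/* A small standalone check (not from the driver) of the identity the comment
 * above relies on: for bits_per_vertex equal to 1 or 2,
 *     (vertex_count * bits_per_vertex) % 32 == 0
 * holds exactly when
 *     (vertex_count & (32 / bits_per_vertex - 1)) == 0.
 */
#include <assert.h>

static void
check_control_data_batch_condition(void)
{
    unsigned bpv, vc;
    for (bpv = 1; bpv <= 2; bpv++) {
        for (vc = 0; vc < 1024; vc++) {
            int full_batch = ((vc * bpv) % 32) == 0;
            int masked     = (vc & (32 / bpv - 1)) == 0;
            assert(full_batch == masked);
        }
    }
}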
Example #22
0
int main(int argc, const char * argv[]) {
    check_same("Mov literal", 3,
        Asm<int>(
            MOV(eax, 3_d),
            RET())()
    );
        
    check_same("64 bit register MOV", 6,
        Asm<int>(
            MOV(rax, 6_q),
            RET())()
    );
    
    check_same("Negative literal", -103,
        Asm<int>(
            MOV(eax, -3_d),
            ADD(eax, - - -100_d),
            RET())()
    );
    
    check_same("Move reg to reg", 4,
        Asm<int>(
            MOV(ecx, 4_d),
            MOV(eax, ecx),
            RET())()
    );
    
    check_same("Simple jmp", 3,
        Asm<int>(
            MOV(eax, 3_d),
            JMP("a"_rel8),
            ADD(eax, 2_d),
        "a"_label,
            RET())()
    );
    
    check_same("Simple loop", 30,
        Asm<int>(
            MOV(ecx, 5_d),
            MOV(eax, 0_d),
        "start"_label,
            CMP(ecx, 0_d),
            JE("done"_rel8),
            ADD(eax, 6_d),
            DEC(ecx),
            JMP("start"_rel8),
        "done"_label,
            RET())()
    );
    
    check_same("Macro simple loop", 30,
        Asm<int>(
            MOV(eax, 0_d),
            do_x_times(5_d,
                ADD(eax, 6_d)),
            RET())()
    );

    check_same("Access arg using esp", 1,
        Asm<int>(
            MOV(eax, _[esp + 28_d]),
            RET())(1, 2, 3)
    );
    
    check_same("Access arg using ebp", 1,
        Asm<int>(
            MOV(eax, _[ebp - 0xc_b]),
            RET())(1, 2, 3)
    );
    
    check_same("Index ebp", 1,
        Asm<int>(
            MOV(ecx, 2_d),
            MOV(eax, _[ebp + ecx * 2_b - 0x10_d]),
            RET())(1, 2, 3)
    );
    
    check_same("Access args using ebp", 5,
        Asm<int>(
            MOV(edx, 0_d),
            MOV(eax, _[ebp - 0xc_b]),
            MOV(ecx, _[ebp - 0x10_b]),
            DIV(ecx),
            MOV(ecx, _[ebp - 0x14_b]),
            DIV(ecx),
            RET())(100, 5, 4)
    );
    
    check_same("Access arg with 64 bit reg", 2,
        Asm<int>(
            MOV(rax, _[rsp + 24_d]),
            RET())(1, 2, 3)
    );
    
    check_same("Access second register zero", 1,
        Asm<int>(
            MOV(ecx, 0_d),
            MOV(eax, _[esp + 28_d + ecx]),
            RET())(1, 2, 3)
    );
    
    check_same("Access second register with offset", 1,
        Asm<int>(
            MOV(ecx, 8_d),
            MOV(eax, _[esp + 20_d + ecx]),
            RET())(1, 2, 3)
    );
    
    check_same("Access second register with offset and 1 scale", 1,
        Asm<int>(
            MOV(ecx, 8_d),
            MOV(eax, _[esp + 20_d + ecx * 1_b]),
            RET())(1, 2, 3)
    );

    check_same("Access second register with offset and 4 scale", 1,
        Asm<int>(
            MOV(ecx, 2_d),
            MOV(eax, _[esp + 20_d + ecx * 4_b]),
            RET())(1, 2, 3)
    );
    
    check_same("Call c function from assembly", 66,
        Asm<int>(
            MOV(rbx, _[rsp + 8_d]),
            CALL(rbx),
            RET())(&ret66)
    );

    check_same("Call c function from esp directly", 66,
        Asm<int>(
            CALL(_[rsp + 8_d]),
            RET())(&ret66)
    );
    
     check_same("Call c function from ebp directly", 66,
        Asm<int>(
            CALL(_[rbp - 0x10_d]),
            RET())(&ret66)
    );

  //  auto p = Asm<int>(CALL(_[rbp - 0xc_d]));
  // Print<decltype(p)::program> x{};

    std::cout << "done" << std::endl;
    return 0;
}
void JitArm::ps_cmpo1(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITFloatingPointOff);
	u32 a = inst.FA, b = inst.FB;
	int cr = inst.CRFD;

	ARMReg vA = fpr.R1(a);
	ARMReg vB = fpr.R1(b);
	ARMReg fpscrReg = gpr.GetReg();
	ARMReg crReg = gpr.GetReg();
	Operand2 FPRFMask(0x1F, 0xA); // 0x1F000
	Operand2 LessThan(0x8, 0xA); // 0x8000
	Operand2 GreaterThan(0x4, 0xA); // 0x4000
	Operand2 EqualTo(0x2, 0xA); // 0x2000
	Operand2 NANRes(0x1, 0xA); // 0x1000
	FixupBranch Done1, Done2, Done3;
	LDR(fpscrReg, R9, PPCSTATE_OFF(fpscr));
	BIC(fpscrReg, fpscrReg, FPRFMask);

	VCMPE(vA, vB);
	VMRS(_PC);
	SetCC(CC_LT);
		ORR(fpscrReg, fpscrReg, LessThan);
		MOV(crReg,  8);
		Done1 = B();
	SetCC(CC_GT);
		ORR(fpscrReg, fpscrReg, GreaterThan);
		MOV(crReg,  4);
		Done2 = B();
	SetCC(CC_EQ);
		ORR(fpscrReg, fpscrReg, EqualTo);
		MOV(crReg,  2);
		Done3 = B();
	SetCC();

	ORR(fpscrReg, fpscrReg, NANRes);
	MOV(crReg,  1);

	VCMPE(vA, vA);
	VMRS(_PC);
	FixupBranch NanA = B_CC(CC_NEQ);
	VCMPE(vB, vB);
	VMRS(_PC);
	FixupBranch NanB = B_CC(CC_NEQ);

	SetFPException(fpscrReg, FPSCR_VXVC);
	FixupBranch Done4 = B();

	SetJumpTarget(NanA);
	SetJumpTarget(NanB);

	SetFPException(fpscrReg, FPSCR_VXSNAN);

	TST(fpscrReg, VEMask);

	FixupBranch noVXVC = B_CC(CC_NEQ);
	SetFPException(fpscrReg, FPSCR_VXVC);

	SetJumpTarget(noVXVC);
	SetJumpTarget(Done1);
	SetJumpTarget(Done2);
	SetJumpTarget(Done3);
	SetJumpTarget(Done4);
	STRB(crReg, R9, PPCSTATE_OFF(cr_fast) + cr);
	STR(fpscrReg, R9, PPCSTATE_OFF(fpscr));
	gpr.Unlock(fpscrReg, crReg);
}
int GGLAssembler::scanline_core(const needs_t& needs, context_t const* c)
{
    int64_t duration = ggl_system_time();

    mBlendFactorCached = 0;
    mBlending = 0;
    mMasking = 0;
    mAA        = GGL_READ_NEEDS(P_AA, needs.p);
    mDithering = GGL_READ_NEEDS(P_DITHER, needs.p);
    mAlphaTest = GGL_READ_NEEDS(P_ALPHA_TEST, needs.p) + GGL_NEVER;
    mDepthTest = GGL_READ_NEEDS(P_DEPTH_TEST, needs.p) + GGL_NEVER;
    mFog       = GGL_READ_NEEDS(P_FOG, needs.p) != 0;
    mSmooth    = GGL_READ_NEEDS(SHADE, needs.n) != 0;
    mBuilderContext.needs = needs;
    mBuilderContext.c = c;
    mBuilderContext.Rctx = reserveReg(R0); // context always in R0
    mCbFormat = c->formats[ GGL_READ_NEEDS(CB_FORMAT, needs.n) ];

    // ------------------------------------------------------------------------

    decodeLogicOpNeeds(needs);

    decodeTMUNeeds(needs, c);

    mBlendSrc  = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_SRC, needs.n));
    mBlendDst  = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_DST, needs.n));
    mBlendSrcA = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_SRCA, needs.n));
    mBlendDstA = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_DSTA, needs.n));

    if (!mCbFormat.c[GGLFormat::ALPHA].h) {
        if ((mBlendSrc == GGL_ONE_MINUS_DST_ALPHA) ||
            (mBlendSrc == GGL_DST_ALPHA)) {
            mBlendSrc = GGL_ONE;
        }
        if ((mBlendSrcA == GGL_ONE_MINUS_DST_ALPHA) ||
            (mBlendSrcA == GGL_DST_ALPHA)) {
            mBlendSrcA = GGL_ONE;
        }
        if ((mBlendDst == GGL_ONE_MINUS_DST_ALPHA) ||
            (mBlendDst == GGL_DST_ALPHA)) {
            mBlendDst = GGL_ONE;
        }
        if ((mBlendDstA == GGL_ONE_MINUS_DST_ALPHA) ||
            (mBlendDstA == GGL_DST_ALPHA)) {
            mBlendDstA = GGL_ONE;
        }
    }

    // if we need the framebuffer, read it now
    const int blending =    blending_codes(mBlendSrc, mBlendDst) |
                            blending_codes(mBlendSrcA, mBlendDstA);

    // XXX: handle special cases, destination not modified...
    if ((mBlendSrc==GGL_ZERO) && (mBlendSrcA==GGL_ZERO) &&
        (mBlendDst==GGL_ONE) && (mBlendDstA==GGL_ONE)) {
        // Destination unmodified (beware of logic ops)
    } else if ((mBlendSrc==GGL_ZERO) && (mBlendSrcA==GGL_ZERO) &&
        (mBlendDst==GGL_ZERO) && (mBlendDstA==GGL_ZERO)) {
        // Destination is zero (beware of logic ops)
    }
    
    int fbComponents = 0;
    const int masking = GGL_READ_NEEDS(MASK_ARGB, needs.n);
    for (int i=0 ; i<4 ; i++) {
        const int mask = 1<<i;
        component_info_t& info = mInfo[i];
        int fs = i==GGLFormat::ALPHA ? mBlendSrcA : mBlendSrc;
        int fd = i==GGLFormat::ALPHA ? mBlendDstA : mBlendDst;
        if (fs==GGL_SRC_ALPHA_SATURATE && i==GGLFormat::ALPHA)
            fs = GGL_ONE;
        info.masked =   !!(masking & mask);
        info.inDest =   !info.masked && mCbFormat.c[i].h && 
                        ((mLogicOp & LOGIC_OP_SRC) || (!mLogicOp));
        if (mCbFormat.components >= GGL_LUMINANCE &&
                (i==GGLFormat::GREEN || i==GGLFormat::BLUE)) {
            info.inDest = false;
        }
        info.needed =   (i==GGLFormat::ALPHA) && 
                        (isAlphaSourceNeeded() || mAlphaTest != GGL_ALWAYS);
        info.replaced = !!(mTextureMachine.replaced & mask);
        info.iterated = (!info.replaced && (info.inDest || info.needed)); 
        info.smooth =   mSmooth && info.iterated;
        info.fog =      mFog && info.inDest && (i != GGLFormat::ALPHA);
        info.blend =    (fs != int(GGL_ONE)) || (fd > int(GGL_ZERO));

        mBlending |= (info.blend ? mask : 0);
        mMasking |= (mCbFormat.c[i].h && info.masked) ? mask : 0;
        fbComponents |= mCbFormat.c[i].h ? mask : 0;
    }

    mAllMasked = (mMasking == fbComponents);
    if (mAllMasked) {
        mDithering = 0;
    }
    
    fragment_parts_t parts;

    // ------------------------------------------------------------------------
    prolog();
    // ------------------------------------------------------------------------

    build_scanline_prolog(parts, needs);

    if (registerFile().status())
        return registerFile().status();

    // ------------------------------------------------------------------------
    label("fragment_loop");
    // ------------------------------------------------------------------------
    {
        Scratch regs(registerFile());

        if (mDithering) {
            // update the dither index.
            MOV(AL, 0, parts.count.reg,
                    reg_imm(parts.count.reg, ROR, GGL_DITHER_ORDER_SHIFT));
            ADD(AL, 0, parts.count.reg, parts.count.reg,
                    imm( 1 << (32 - GGL_DITHER_ORDER_SHIFT)));
            MOV(AL, 0, parts.count.reg,
                    reg_imm(parts.count.reg, ROR, 32 - GGL_DITHER_ORDER_SHIFT));
        }

        // XXX: could we do an early alpha-test here in some cases?
        // It would probably be used only with smooth-alpha and no texture
        // (or no alpha component in the texture).

        // Early z-test
        if (mAlphaTest==GGL_ALWAYS) {
            build_depth_test(parts, Z_TEST|Z_WRITE);
        } else {
            // we cannot do the z-write here, because
            // it might be killed by the alpha-test later
            build_depth_test(parts, Z_TEST);
        }

        { // texture coordinates
            Scratch scratches(registerFile());

            // texel generation
            build_textures(parts, regs);
        }        

        if ((blending & (FACTOR_DST|BLEND_DST)) || 
                (mMasking && !mAllMasked) ||
                (mLogicOp & LOGIC_OP_DST)) 
        {
            // blending / logic_op / masking need the framebuffer
            mDstPixel.setTo(regs.obtain(), &mCbFormat);

            // load the framebuffer pixel
            comment("fetch color-buffer");
            load(parts.cbPtr, mDstPixel);
        }

        if (registerFile().status())
            return registerFile().status();

        pixel_t pixel;
        int directTex = mTextureMachine.directTexture;
        if (directTex | parts.packed) {
            // note: we can't have both here
            // iterated color or direct texture
            pixel = directTex ? parts.texel[directTex-1] : parts.iterated;
            pixel.flags &= ~CORRUPTIBLE;
        } else {
            if (mDithering) {
                const int ctxtReg = mBuilderContext.Rctx;
                const int mask = GGL_DITHER_SIZE-1;
                parts.dither = reg_t(regs.obtain());
                AND(AL, 0, parts.dither.reg, parts.count.reg, imm(mask));
                ADD(AL, 0, parts.dither.reg, parts.dither.reg, ctxtReg);
                LDRB(AL, parts.dither.reg, parts.dither.reg,
                        immed12_pre(GGL_OFFSETOF(ditherMatrix)));
            }
        
            // allocate a register for the resulting pixel
            pixel.setTo(regs.obtain(), &mCbFormat, FIRST);

            build_component(pixel, parts, GGLFormat::ALPHA,    regs);

            if (mAlphaTest!=GGL_ALWAYS) {
                // only handle the z-write part here. We know z-test
                // was successful, as well as alpha-test.
                build_depth_test(parts, Z_WRITE);
            }

            build_component(pixel, parts, GGLFormat::RED,      regs);
            build_component(pixel, parts, GGLFormat::GREEN,    regs);
            build_component(pixel, parts, GGLFormat::BLUE,     regs);

            pixel.flags |= CORRUPTIBLE;
        }

        if (registerFile().status())
            return registerFile().status();
        
        if (pixel.reg == -1) {
            // be defensive here. if we're here it's probably
            // that this whole fragment is a no-op.
            pixel = mDstPixel;
        }
        
        if (!mAllMasked) {
            // logic operation
            build_logic_op(pixel, regs);
    
            // masking
            build_masking(pixel, regs); 
    
            comment("store");
            store(parts.cbPtr, pixel, WRITE_BACK);
        }
    }

    if (registerFile().status())
        return registerFile().status();

    // update the iterated color...
    if (parts.reload != 3) {
        build_smooth_shade(parts);
    }

    // update iterated z
    build_iterate_z(parts);

    // update iterated fog
    build_iterate_f(parts);

    SUB(AL, S, parts.count.reg, parts.count.reg, imm(1<<16));
    B(PL, "fragment_loop");
    label("epilog");
    epilog(registerFile().touched());

    if ((mAlphaTest!=GGL_ALWAYS) || (mDepthTest!=GGL_ALWAYS)) {
        if (mDepthTest!=GGL_ALWAYS) {
            label("discard_before_textures");
            build_iterate_texture_coordinates(parts);
        }
        label("discard_after_textures");
        build_smooth_shade(parts);
        build_iterate_z(parts);
        build_iterate_f(parts);
        if (!mAllMasked) {
            ADD(AL, 0, parts.cbPtr.reg, parts.cbPtr.reg, imm(parts.cbPtr.size>>3));
        }
        SUB(AL, S, parts.count.reg, parts.count.reg, imm(1<<16));
        B(PL, "fragment_loop");
        epilog(registerFile().touched());
    }
Example #25
0
   in order to stop gcc from complaining.  */
#define EMPTY 0,0,NULL

struct ia64_opcode ia64_opcodes_i[] =
  {
    /* I-type instruction encodings (sorted according to major opcode).  */

    {"break.i",	I0, OpX3X6 (0, 0, 0x00), {IMMU21}, X_IN_MLX, 0, NULL},
    {"nop.i",	I0, OpX3X6Yb (0, 0, 0x01, 0), {IMMU21}, X_IN_MLX, 0, NULL},
    {"hint.i",	I0, OpX3X6Yb (0, 0, 0x01, 1), {IMMU21}, X_IN_MLX, 0, NULL},
    {"chk.s.i",	I0, OpX3 (0, 1), {R2, TGT25b}, EMPTY},

    {"mov", I, OpX3XbIhWhTag13 (0, 7, 0, 0, 1, 0), {B1, R2}, PSEUDO, 0, NULL},
#define MOV(a,b,c,d) \
    I, OpX3XbIhWh (0, a, b, c, d), {B1, R2, TAG13b}, EMPTY
    {"mov.sptk",		MOV (7, 0, 0, 0)},
    {"mov.sptk.imp",		MOV (7, 0, 1, 0)},
    {"mov",			MOV (7, 0, 0, 1)},
    {"mov.imp",			MOV (7, 0, 1, 1)},
    {"mov.dptk",		MOV (7, 0, 0, 2)},
    {"mov.dptk.imp",		MOV (7, 0, 1, 2)},
    {"mov.ret.sptk",		MOV (7, 1, 0, 0)},
    {"mov.ret.sptk.imp",	MOV (7, 1, 1, 0)},
    {"mov.ret",			MOV (7, 1, 0, 1)},
    {"mov.ret.imp",		MOV (7, 1, 1, 1)},
    {"mov.ret.dptk",		MOV (7, 1, 0, 2)},
    {"mov.ret.dptk.imp",	MOV (7, 1, 1, 2)},
#undef MOV
    {"mov",	I, OpX3X6 (0, 0, 0x31), {R1, B2}, EMPTY},
    {"mov",	I, OpX3 (0, 3), {PR, R2, IMM17}, EMPTY},
    /* Don't remove one of the seemingly redundant FULL17-s.  */
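    /* Illustrative macro expansion (not an added table entry): with the MOV
     * macro above, the row {"mov.sptk", MOV (7, 0, 0, 0)} expands to
     *   {"mov.sptk", I, OpX3XbIhWh (0, 7, 0, 0, 0), {B1, R2, TAG13b}, EMPTY},
     * so the whole mov-to-branch-register family differs only in the
     * OpX3XbIhWh arguments.
     */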
Example #26
0
void
vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
{
   switch (instr->intrinsic) {
   case nir_intrinsic_load_invocation_id:
      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD),
               invocation_id));
      break;
   case nir_intrinsic_load_primitive_id:
      emit(TCS_OPCODE_GET_PRIMITIVE_ID,
           get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD));
      break;
   case nir_intrinsic_load_patch_vertices_in:
      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D),
               brw_imm_d(key->input_vertices)));
      break;
   case nir_intrinsic_load_per_vertex_input: {
      src_reg indirect_offset = get_indirect_offset(instr);
      unsigned imm_offset = instr->const_index[0];

      nir_const_value *vertex_const = nir_src_as_const_value(instr->src[0]);
      src_reg vertex_index =
         vertex_const ? src_reg(brw_imm_ud(vertex_const->u32[0]))
                      : get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1);

      dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
      dst.writemask = brw_writemask_for_size(instr->num_components);

      emit_input_urb_read(dst, vertex_index, imm_offset,
                          nir_intrinsic_component(instr), indirect_offset);
      break;
   }
   case nir_intrinsic_load_input:
      unreachable("nir_lower_io should use load_per_vertex_input intrinsics");
      break;
   case nir_intrinsic_load_output:
   case nir_intrinsic_load_per_vertex_output: {
      src_reg indirect_offset = get_indirect_offset(instr);
      unsigned imm_offset = instr->const_index[0];

      dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
      dst.writemask = brw_writemask_for_size(instr->num_components);

      if (imm_offset == 0 && indirect_offset.file == BAD_FILE) {
         dst.type = BRW_REGISTER_TYPE_F;

         /* This is a read of gl_TessLevelInner[], which lives in the
          * Patch URB header.  The layout depends on the domain.
          */
         switch (key->tes_primitive_mode) {
         case GL_QUADS: {
            /* DWords 3-2 (reversed); use offset 0 and WZYX swizzle. */
            dst_reg tmp(this, glsl_type::vec4_type);
            emit_output_urb_read(tmp, 0, 0, src_reg());
            emit(MOV(writemask(dst, WRITEMASK_XY),
                     swizzle(src_reg(tmp), BRW_SWIZZLE_WZYX)));
            break;
         }
         case GL_TRIANGLES:
            /* DWord 4; use offset 1 but normal swizzle/writemask. */
            emit_output_urb_read(writemask(dst, WRITEMASK_X), 1, 0,
                                 src_reg());
            break;
         case GL_ISOLINES:
            /* All channels are undefined. */
            return;
         default:
            unreachable("Bogus tessellation domain");
         }
      } else if (imm_offset == 1 && indirect_offset.file == BAD_FILE) {
         dst.type = BRW_REGISTER_TYPE_F;
         unsigned swiz = BRW_SWIZZLE_WZYX;

         /* This is a read of gl_TessLevelOuter[], which lives in the
          * high 4 DWords of the Patch URB header, in reverse order.
          */
         switch (key->tes_primitive_mode) {
         case GL_QUADS:
            dst.writemask = WRITEMASK_XYZW;
            break;
         case GL_TRIANGLES:
            dst.writemask = WRITEMASK_XYZ;
            break;
         case GL_ISOLINES:
            /* Isolines are not reversed; swizzle .zw -> .xy */
            swiz = BRW_SWIZZLE_ZWZW;
            dst.writemask = WRITEMASK_XY;
            return;
         default:
            unreachable("Bogus tessellation domain");
         }

         dst_reg tmp(this, glsl_type::vec4_type);
         emit_output_urb_read(tmp, 1, 0, src_reg());
         emit(MOV(dst, swizzle(src_reg(tmp), swiz)));
      } else {
         emit_output_urb_read(dst, imm_offset, nir_intrinsic_component(instr),
                              indirect_offset);
      }
      break;
   }
   case nir_intrinsic_store_output:
   case nir_intrinsic_store_per_vertex_output: {
      src_reg value = get_nir_src(instr->src[0]);
      unsigned mask = instr->const_index[1];
      unsigned swiz = BRW_SWIZZLE_XYZW;

      src_reg indirect_offset = get_indirect_offset(instr);
      unsigned imm_offset = instr->const_index[0];

      /* The passthrough shader writes the whole patch header as two vec4s;
       * skip all the gl_TessLevelInner/Outer swizzling.
       */
      if (indirect_offset.file == BAD_FILE && !is_passthrough_shader) {
         if (imm_offset == 0) {
            value.type = BRW_REGISTER_TYPE_F;

            mask &=
               (1 << tesslevel_inner_components(key->tes_primitive_mode)) - 1;

            /* This is a write to gl_TessLevelInner[], which lives in the
             * Patch URB header.  The layout depends on the domain.
             */
            switch (key->tes_primitive_mode) {
            case GL_QUADS:
               /* gl_TessLevelInner[].xy lives at DWords 3-2 (reversed).
                * We use an XXYX swizzle to put .xy, in reverse order, into the .wz
                * channels, and use a .zw writemask.
                */
               swiz = BRW_SWIZZLE4(0, 0, 1, 0);
               mask = writemask_for_backwards_vector(mask);
               break;
            case GL_TRIANGLES:
               /* gl_TessLevelInner[].x lives at DWord 4, so we set the
                * writemask to X and bump the URB offset by 1.
                */
               imm_offset = 1;
               break;
            case GL_ISOLINES:
               /* Skip; gl_TessLevelInner[] doesn't exist for isolines. */
               return;
            default:
               unreachable("Bogus tessellation domain");
            }
         } else if (imm_offset == 1) {
            value.type = BRW_REGISTER_TYPE_F;

            mask &=
               (1 << tesslevel_outer_components(key->tes_primitive_mode)) - 1;

            /* This is a write to gl_TessLevelOuter[] which lives in the
             * Patch URB Header at DWords 4-7.  However, it's reversed, so
             * instead of .xyzw we have .wzyx.
             */
            if (key->tes_primitive_mode == GL_ISOLINES) {
               /* Isolines .xy should be stored in .zw, in order. */
               swiz = BRW_SWIZZLE4(0, 0, 0, 1);
               mask <<= 2;
            } else {
               /* Other domains are reversed; store .wzyx instead of .xyzw. */
               swiz = BRW_SWIZZLE_WZYX;
               mask = writemask_for_backwards_vector(mask);
            }
         }
      }

      unsigned first_component = nir_intrinsic_component(instr);
      if (first_component) {
         assert(swiz == BRW_SWIZZLE_XYZW);
         swiz = BRW_SWZ_COMP_OUTPUT(first_component);
         mask = mask << first_component;
      }

      emit_urb_write(swizzle(value, swiz), mask,
                     imm_offset, indirect_offset);
      break;
   }

   case nir_intrinsic_barrier: {
      dst_reg header = dst_reg(this, glsl_type::uvec4_type);
      emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header);
      emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
      break;
   }

   default:
      vec4_visitor::nir_emit_intrinsic(instr);
   }
}
void Jit64::lfd(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITLoadStoreFloatingOff);
	FALLBACK_IF(js.memcheck || !inst.RA);

	int d = inst.RD;
	int a = inst.RA;

	s32 offset = (s32)(s16)inst.SIMM_16;
	gpr.FlushLockX(ABI_PARAM1);
	gpr.Lock(a);
	MOV(32, R(ABI_PARAM1), gpr.R(a));
	// TODO - optimize. This has to load the previous value - upper double should stay unmodified.
	fpr.Lock(d);
	fpr.BindToRegister(d, true);
	X64Reg xd = fpr.RX(d);

	if (cpu_info.bSSSE3)
	{
#if _M_X86_64
		MOVQ_xmm(XMM0, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
#else
		AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
		MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset));
#endif
		PSHUFB(XMM0, M((void *)bswapShuffle1x8Dupe));
		MOVSD(xd, R(XMM0));
	} else {
#if _M_X86_64
		LoadAndSwap(64, EAX, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
		MOV(64, M(&temp64), R(EAX));

		MEMCHECK_START

		MOVSD(XMM0, M(&temp64));
		MOVSD(xd, R(XMM0));

		MEMCHECK_END
#else
		AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
		MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset));
		BSWAP(32, EAX);
		MOV(32, M((void*)((u8 *)&temp64+4)), R(EAX));

		MEMCHECK_START

		MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset + 4));
		BSWAP(32, EAX);
		MOV(32, M(&temp64), R(EAX));
		MOVSD(XMM0, M(&temp64));
		MOVSD(xd, R(XMM0));

		MEMCHECK_END
#endif
	}

	gpr.UnlockAll();
	gpr.UnlockAllX();
	fpr.UnlockAll();
}
Example #28
0
void Jit::Comp_mxc1(MIPSOpcode op) {
	CONDITIONAL_DISABLE;

	int fs = _FS;
	MIPSGPReg rt = _RT;

	switch ((op >> 21) & 0x1f) {
	case 0: // R(rt) = FI(fs); break; //mfc1
		if (rt == MIPS_REG_ZERO)
			return;
		gpr.MapReg(rt, false, true);
		// If fs is not mapped, most likely it's being abandoned.
		// Just load from memory in that case.
		if (fpr.R(fs).IsSimpleReg()) {
			MOVD_xmm(gpr.R(rt), fpr.RX(fs));
		} else {
			MOV(32, gpr.R(rt), fpr.R(fs));
		}
		break;

	case 2: // R(rt) = currentMIPS->ReadFCR(fs); break; //cfc1
		if (rt == MIPS_REG_ZERO)
			return;
		if (fs == 31) {
			bool wasImm = gpr.IsImm(MIPS_REG_FPCOND);
			if (!wasImm) {
				gpr.Lock(rt, MIPS_REG_FPCOND);
				gpr.MapReg(MIPS_REG_FPCOND, true, false);
			}
			gpr.MapReg(rt, false, true);
			MOV(32, gpr.R(rt), M(&mips_->fcr31));
			if (wasImm) {
				if (gpr.GetImm(MIPS_REG_FPCOND) & 1) {
					OR(32, gpr.R(rt), Imm32(1 << 23));
				} else {
					AND(32, gpr.R(rt), Imm32(~(1 << 23)));
				}
			} else {
				AND(32, gpr.R(rt), Imm32(~(1 << 23)));
				MOV(32, R(TEMPREG), gpr.R(MIPS_REG_FPCOND));
				AND(32, R(TEMPREG), Imm32(1));
				SHL(32, R(TEMPREG), Imm8(23));
				OR(32, gpr.R(rt), R(TEMPREG));
			}
			gpr.UnlockAll();
		} else if (fs == 0) {
			gpr.SetImm(rt, MIPSState::FCR0_VALUE);
		} else {
			Comp_Generic(op);
		}
		return;

	case 4: //FI(fs) = R(rt);	break; //mtc1
		fpr.MapReg(fs, false, true);
		if (gpr.IsImm(rt) && gpr.GetImm(rt) == 0) {
			XORPS(fpr.RX(fs), fpr.R(fs));
		} else {
			gpr.KillImmediate(rt, true, false);
			MOVD_xmm(fpr.RX(fs), gpr.R(rt));
		}
		return;

	case 6: //currentMIPS->WriteFCR(fs, R(rt)); break; //ctc1
		if (fs == 31) {
			// Must clear before setting, since ApplyRoundingMode() assumes it was cleared.
			RestoreRoundingMode();
			if (gpr.IsImm(rt)) {
				gpr.SetImm(MIPS_REG_FPCOND, (gpr.GetImm(rt) >> 23) & 1);
				MOV(32, M(&mips_->fcr31), Imm32(gpr.GetImm(rt) & 0x0181FFFF));
				if ((gpr.GetImm(rt) & 0x1000003) == 0) {
					// Default nearest / no-flush mode, just leave it cleared.
				} else {
					UpdateRoundingMode();
					ApplyRoundingMode();
				}
			} else {
Example #29
0
void
gen6_gs_visitor::visit(ir_emit_vertex *)
{
   this->current_annotation = "gen6 emit vertex";
   /* Honor the max_vertices layout qualifier in the geometry shader by ignoring any
    * vertices coming after c->gp->program.VerticesOut.
    */
   unsigned num_output_vertices = c->gp->program.VerticesOut;
   emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices),
            BRW_CONDITIONAL_L));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* Buffer all output slots for this vertex in vertex_output */
      for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
         int varying = prog_data->vue_map.slot_to_varying[slot];
         if (varying != VARYING_SLOT_PSIZ) {
            dst_reg dst(this->vertex_output);
            dst.reladdr = ralloc(mem_ctx, src_reg);
            memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
            emit_urb_slot(dst, varying);
         } else {
            /* The PSIZ slot can pack multiple varyings in different channels
             * and emit_urb_slot() will produce a MOV instruction for each of
             * them. Since we are writing to an array, that will translate to
             * possibly multiple MOV instructions with an array destination and
             * each will generate a scratch write with the same offset into
             * scratch space (thus, each one overwriting the previous). This is
             * not what we want. What we will do instead is emit PSIZ to a
             * regular temporary register, then move that register into the
             * array. This way we only have one instruction with an array
             * destination and we only produce a single scratch write.
             */
            dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type));
            emit_urb_slot(tmp, varying);
            dst_reg dst(this->vertex_output);
            dst.reladdr = ralloc(mem_ctx, src_reg);
            memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
            vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
            inst->force_writemask_all = true;
         }

         emit(ADD(dst_reg(this->vertex_output_offset),
                  this->vertex_output_offset, 1u));
      }

      /* Now buffer flags for this vertex */
      dst_reg dst(this->vertex_output);
      dst.reladdr = ralloc(mem_ctx, src_reg);
      memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
      if (c->gp->program.OutputType == GL_POINTS) {
         /* If we are outputting points, then every vertex has PrimStart and
          * PrimEnd set.
          */
         emit(MOV(dst, (_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
                  URB_WRITE_PRIM_START | URB_WRITE_PRIM_END));
         emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
      } else {
         /* Otherwise, we can only set the PrimStart flag, which we have stored
          * in the first_vertex register. We will have to wait until we execute
          * EndPrimitive() or we end the thread to set the PrimEnd flag on a
          * vertex.
          */
         emit(OR(dst, this->first_vertex,
                 (c->prog_data.output_topology << URB_WRITE_PRIM_TYPE_SHIFT)));
         emit(MOV(dst_reg(this->first_vertex), 0u));
      }
      emit(ADD(dst_reg(this->vertex_output_offset),
               this->vertex_output_offset, 1u));

      /* Update vertex count */
      emit(ADD(dst_reg(this->vertex_count), this->vertex_count, 1u));
   }
   emit(BRW_OPCODE_ENDIF);
}