Exemple #1
0
/**
 * Emit code that kills pixels whose X and Y coordinates are outside the
 * boundary of the rectangle defined by the push constants (dst_x0, dst_y0,
 * dst_x1, dst_y1).
 */
void
brw_blorp_eu_emitter::emit_kill_if_outside_rect(const struct brw_reg &x,
                                                const struct brw_reg &y,
                                                const struct brw_reg &dst_x0,
                                                const struct brw_reg &dst_x1,
                                                const struct brw_reg &dst_y0,
                                                const struct brw_reg &dst_y1)
{
   struct brw_reg f0 = brw_flag_reg(0, 0);
   struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);

   emit_cmp(BRW_CONDITIONAL_GE, x, dst_x0);
   emit_cmp(BRW_CONDITIONAL_GE, y, dst_y0)->predicate = BRW_PREDICATE_NORMAL;
   emit_cmp(BRW_CONDITIONAL_L, x, dst_x1)->predicate = BRW_PREDICATE_NORMAL;
   emit_cmp(BRW_CONDITIONAL_L, y, dst_y1)->predicate = BRW_PREDICATE_NORMAL;

   fs_inst *inst = new (mem_ctx) fs_inst(BRW_OPCODE_AND, 16, g1, f0, g1);
   inst->force_writemask_all = true;
   insts.push_tail(inst);
}
void bpf_jit_compile(struct bpf_prog *fp)
{
	unsigned int cleanup_addr, proglen, oldproglen = 0;
	u32 temp[8], *prog, *func, seen = 0, pass;
	const struct sock_filter *filter = fp->insns;
	int i, flen = fp->len, pc_ret0 = -1;
	unsigned int *addrs;
	void *image;

	if (!bpf_jit_enable)
		return;

	addrs = kmalloc_array(flen, sizeof(*addrs), GFP_KERNEL);
	if (addrs == NULL)
		return;

	/* Before first pass, make a rough estimation of addrs[]
	 * each bpf instruction is translated to less than 64 bytes
	 */
	for (proglen = 0, i = 0; i < flen; i++) {
		proglen += 64;
		addrs[i] = proglen;
	}
	cleanup_addr = proglen; /* epilogue address */
	image = NULL;
	for (pass = 0; pass < 10; pass++) {
		u8 seen_or_pass0 = (pass == 0) ? (SEEN_XREG | SEEN_DATAREF | SEEN_MEM) : seen;

		/* no prologue/epilogue for trivial filters (RET something) */
		proglen = 0;
		prog = temp;

		/* Prologue */
		if (seen_or_pass0) {
			if (seen_or_pass0 & SEEN_MEM) {
				unsigned int sz = BASE_STACKFRAME;
				sz += BPF_MEMWORDS * sizeof(u32);
				emit_alloc_stack(sz);
			}

			/* Make sure we dont leek kernel memory. */
			if (seen_or_pass0 & SEEN_XREG)
				emit_clear(r_X);

			/* If this filter needs to access skb data,
			 * load %o4 and %o5 with:
			 *  %o4 = skb->len - skb->data_len
			 *  %o5 = skb->data
			 * And also back up %o7 into r_saved_O7 so we can
			 * invoke the stubs using 'call'.
			 */
			if (seen_or_pass0 & SEEN_DATAREF) {
				emit_load32(r_SKB, struct sk_buff, len, r_HEADLEN);
				emit_load32(r_SKB, struct sk_buff, data_len, r_TMP);
				emit_sub(r_HEADLEN, r_TMP, r_HEADLEN);
				emit_loadptr(r_SKB, struct sk_buff, data, r_SKB_DATA);
			}
		}
		emit_reg_move(O7, r_saved_O7);

		/* Make sure we dont leak kernel information to the user. */
		if (bpf_needs_clear_a(&filter[0]))
			emit_clear(r_A); /* A = 0 */

		for (i = 0; i < flen; i++) {
			unsigned int K = filter[i].k;
			unsigned int t_offset;
			unsigned int f_offset;
			u32 t_op, f_op;
			u16 code = bpf_anc_helper(&filter[i]);
			int ilen;

			switch (code) {
			case BPF_ALU | BPF_ADD | BPF_X:	/* A += X; */
				emit_alu_X(ADD);
				break;
			case BPF_ALU | BPF_ADD | BPF_K:	/* A += K; */
				emit_alu_K(ADD, K);
				break;
			case BPF_ALU | BPF_SUB | BPF_X:	/* A -= X; */
				emit_alu_X(SUB);
				break;
			case BPF_ALU | BPF_SUB | BPF_K:	/* A -= K */
				emit_alu_K(SUB, K);
				break;
			case BPF_ALU | BPF_AND | BPF_X:	/* A &= X */
				emit_alu_X(AND);
				break;
			case BPF_ALU | BPF_AND | BPF_K:	/* A &= K */
				emit_alu_K(AND, K);
				break;
			case BPF_ALU | BPF_OR | BPF_X:	/* A |= X */
				emit_alu_X(OR);
				break;
			case BPF_ALU | BPF_OR | BPF_K:	/* A |= K */
				emit_alu_K(OR, K);
				break;
			case BPF_ANC | SKF_AD_ALU_XOR_X: /* A ^= X; */
			case BPF_ALU | BPF_XOR | BPF_X:
				emit_alu_X(XOR);
				break;
			case BPF_ALU | BPF_XOR | BPF_K:	/* A ^= K */
				emit_alu_K(XOR, K);
				break;
			case BPF_ALU | BPF_LSH | BPF_X:	/* A <<= X */
				emit_alu_X(SLL);
				break;
			case BPF_ALU | BPF_LSH | BPF_K:	/* A <<= K */
				emit_alu_K(SLL, K);
				break;
			case BPF_ALU | BPF_RSH | BPF_X:	/* A >>= X */
				emit_alu_X(SRL);
				break;
			case BPF_ALU | BPF_RSH | BPF_K:	/* A >>= K */
				emit_alu_K(SRL, K);
				break;
			case BPF_ALU | BPF_MUL | BPF_X:	/* A *= X; */
				emit_alu_X(MUL);
				break;
			case BPF_ALU | BPF_MUL | BPF_K:	/* A *= K */
				emit_alu_K(MUL, K);
				break;
			case BPF_ALU | BPF_DIV | BPF_K:	/* A /= K with K != 0*/
				if (K == 1)
					break;
				emit_write_y(G0);
				/* The Sparc v8 architecture requires
				 * three instructions between a %y
				 * register write and the first use.
				 */
				emit_nop();
				emit_nop();
				emit_nop();
				emit_alu_K(DIV, K);
				break;
			case BPF_ALU | BPF_DIV | BPF_X:	/* A /= X; */
				emit_cmpi(r_X, 0);
				if (pc_ret0 > 0) {
					t_offset = addrs[pc_ret0 - 1];
					emit_branch(BE, t_offset + 20);
					emit_nop(); /* delay slot */
				} else {
					emit_branch_off(BNE, 16);
					emit_nop();
					emit_jump(cleanup_addr + 20);
					emit_clear(r_A);
				}
				emit_write_y(G0);
				/* The Sparc v8 architecture requires
				 * three instructions between a %y
				 * register write and the first use.
				 */
				emit_nop();
				emit_nop();
				emit_nop();
				emit_alu_X(DIV);
				break;
			case BPF_ALU | BPF_NEG:
				emit_neg();
				break;
			case BPF_RET | BPF_K:
				if (!K) {
					if (pc_ret0 == -1)
						pc_ret0 = i;
					emit_clear(r_A);
				} else {
					emit_loadimm(K, r_A);
				}
				/* Fallthrough */
			case BPF_RET | BPF_A:
				if (seen_or_pass0) {
					if (i != flen - 1) {
						emit_jump(cleanup_addr);
						emit_nop();
						break;
					}
					if (seen_or_pass0 & SEEN_MEM) {
						unsigned int sz = BASE_STACKFRAME;
						sz += BPF_MEMWORDS * sizeof(u32);
						emit_release_stack(sz);
					}
				}
				/* jmpl %r_saved_O7 + 8, %g0 */
				emit_jmpl(r_saved_O7, 8, G0);
				emit_reg_move(r_A, O0); /* delay slot */
				break;
			case BPF_MISC | BPF_TAX:
				seen |= SEEN_XREG;
				emit_reg_move(r_A, r_X);
				break;
			case BPF_MISC | BPF_TXA:
				seen |= SEEN_XREG;
				emit_reg_move(r_X, r_A);
				break;
			case BPF_ANC | SKF_AD_CPU:
				emit_load_cpu(r_A);
				break;
			case BPF_ANC | SKF_AD_PROTOCOL:
				emit_skb_load16(protocol, r_A);
				break;
			case BPF_ANC | SKF_AD_PKTTYPE:
				__emit_skb_load8(__pkt_type_offset, r_A);
				emit_andi(r_A, PKT_TYPE_MAX, r_A);
				emit_alu_K(SRL, 5);
				break;
			case BPF_ANC | SKF_AD_IFINDEX:
				emit_skb_loadptr(dev, r_A);
				emit_cmpi(r_A, 0);
				emit_branch(BE_PTR, cleanup_addr + 4);
				emit_nop();
				emit_load32(r_A, struct net_device, ifindex, r_A);
				break;
			case BPF_ANC | SKF_AD_MARK:
				emit_skb_load32(mark, r_A);
				break;
			case BPF_ANC | SKF_AD_QUEUE:
				emit_skb_load16(queue_mapping, r_A);
				break;
			case BPF_ANC | SKF_AD_HATYPE:
				emit_skb_loadptr(dev, r_A);
				emit_cmpi(r_A, 0);
				emit_branch(BE_PTR, cleanup_addr + 4);
				emit_nop();
				emit_load16(r_A, struct net_device, type, r_A);
				break;
			case BPF_ANC | SKF_AD_RXHASH:
				emit_skb_load32(hash, r_A);
				break;
			case BPF_ANC | SKF_AD_VLAN_TAG:
				emit_skb_load16(vlan_tci, r_A);
				break;
			case BPF_ANC | SKF_AD_VLAN_TAG_PRESENT:
				__emit_skb_load8(__pkt_vlan_present_offset, r_A);
				if (PKT_VLAN_PRESENT_BIT)
					emit_alu_K(SRL, PKT_VLAN_PRESENT_BIT);
				if (PKT_VLAN_PRESENT_BIT < 7)
					emit_andi(r_A, 1, r_A);
				break;
			case BPF_LD | BPF_W | BPF_LEN:
				emit_skb_load32(len, r_A);
				break;
			case BPF_LDX | BPF_W | BPF_LEN:
				emit_skb_load32(len, r_X);
				break;
			case BPF_LD | BPF_IMM:
				emit_loadimm(K, r_A);
				break;
			case BPF_LDX | BPF_IMM:
				emit_loadimm(K, r_X);
				break;
			case BPF_LD | BPF_MEM:
				seen |= SEEN_MEM;
				emit_ldmem(K * 4, r_A);
				break;
			case BPF_LDX | BPF_MEM:
				seen |= SEEN_MEM | SEEN_XREG;
				emit_ldmem(K * 4, r_X);
				break;
			case BPF_ST:
				seen |= SEEN_MEM;
				emit_stmem(K * 4, r_A);
				break;
			case BPF_STX:
				seen |= SEEN_MEM | SEEN_XREG;
				emit_stmem(K * 4, r_X);
				break;

#define CHOOSE_LOAD_FUNC(K, func) \
	((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)

			case BPF_LD | BPF_W | BPF_ABS:
				func = CHOOSE_LOAD_FUNC(K, bpf_jit_load_word);
common_load:			seen |= SEEN_DATAREF;
				emit_loadimm(K, r_OFF);
				emit_call(func);
				break;
			case BPF_LD | BPF_H | BPF_ABS:
				func = CHOOSE_LOAD_FUNC(K, bpf_jit_load_half);
				goto common_load;
			case BPF_LD | BPF_B | BPF_ABS:
				func = CHOOSE_LOAD_FUNC(K, bpf_jit_load_byte);
				goto common_load;
			case BPF_LDX | BPF_B | BPF_MSH:
				func = CHOOSE_LOAD_FUNC(K, bpf_jit_load_byte_msh);
				goto common_load;
			case BPF_LD | BPF_W | BPF_IND:
				func = bpf_jit_load_word;
common_load_ind:		seen |= SEEN_DATAREF | SEEN_XREG;
				if (K) {
					if (is_simm13(K)) {
						emit_addi(r_X, K, r_OFF);
					} else {
						emit_loadimm(K, r_TMP);
						emit_add(r_X, r_TMP, r_OFF);
					}
				} else {
					emit_reg_move(r_X, r_OFF);
				}
				emit_call(func);
				break;
			case BPF_LD | BPF_H | BPF_IND:
				func = bpf_jit_load_half;
				goto common_load_ind;
			case BPF_LD | BPF_B | BPF_IND:
				func = bpf_jit_load_byte;
				goto common_load_ind;
			case BPF_JMP | BPF_JA:
				emit_jump(addrs[i + K]);
				emit_nop();
				break;

#define COND_SEL(CODE, TOP, FOP)	\
	case CODE:			\
		t_op = TOP;		\
		f_op = FOP;		\
		goto cond_branch

			COND_SEL(BPF_JMP | BPF_JGT | BPF_K, BGU, BLEU);
			COND_SEL(BPF_JMP | BPF_JGE | BPF_K, BGEU, BLU);
			COND_SEL(BPF_JMP | BPF_JEQ | BPF_K, BE, BNE);
			COND_SEL(BPF_JMP | BPF_JSET | BPF_K, BNE, BE);
			COND_SEL(BPF_JMP | BPF_JGT | BPF_X, BGU, BLEU);
			COND_SEL(BPF_JMP | BPF_JGE | BPF_X, BGEU, BLU);
			COND_SEL(BPF_JMP | BPF_JEQ | BPF_X, BE, BNE);
			COND_SEL(BPF_JMP | BPF_JSET | BPF_X, BNE, BE);

cond_branch:			f_offset = addrs[i + filter[i].jf];
				t_offset = addrs[i + filter[i].jt];

				/* same targets, can avoid doing the test :) */
				if (filter[i].jt == filter[i].jf) {
					emit_jump(t_offset);
					emit_nop();
					break;
				}

				switch (code) {
				case BPF_JMP | BPF_JGT | BPF_X:
				case BPF_JMP | BPF_JGE | BPF_X:
				case BPF_JMP | BPF_JEQ | BPF_X:
					seen |= SEEN_XREG;
					emit_cmp(r_A, r_X);
					break;
				case BPF_JMP | BPF_JSET | BPF_X:
					seen |= SEEN_XREG;
					emit_btst(r_A, r_X);
					break;
				case BPF_JMP | BPF_JEQ | BPF_K:
				case BPF_JMP | BPF_JGT | BPF_K:
				case BPF_JMP | BPF_JGE | BPF_K:
					if (is_simm13(K)) {
						emit_cmpi(r_A, K);
					} else {
						emit_loadimm(K, r_TMP);
						emit_cmp(r_A, r_TMP);
					}
					break;
				case BPF_JMP | BPF_JSET | BPF_K:
					if (is_simm13(K)) {
						emit_btsti(r_A, K);
					} else {
						emit_loadimm(K, r_TMP);
						emit_btst(r_A, r_TMP);
					}
					break;
				}
				if (filter[i].jt != 0) {
					if (filter[i].jf)
						t_offset += 8;
					emit_branch(t_op, t_offset);
					emit_nop(); /* delay slot */
					if (filter[i].jf) {
						emit_jump(f_offset);
						emit_nop();
					}
					break;
				}
				emit_branch(f_op, f_offset);
				emit_nop(); /* delay slot */
				break;

			default:
				/* hmm, too complex filter, give up with jit compiler */
				goto out;
			}
			ilen = (void *) prog - (void *) temp;
			if (image) {
				if (unlikely(proglen + ilen > oldproglen)) {
					pr_err("bpb_jit_compile fatal error\n");
					kfree(addrs);
					module_memfree(image);
					return;
				}
				memcpy(image + proglen, temp, ilen);
			}
			proglen += ilen;
			addrs[i] = proglen;
			prog = temp;
		}
		/* last bpf instruction is always a RET :
		 * use it to give the cleanup instruction(s) addr
		 */
		cleanup_addr = proglen - 8; /* jmpl; mov r_A,%o0; */
		if (seen_or_pass0 & SEEN_MEM)
			cleanup_addr -= 4; /* add %sp, X, %sp; */

		if (image) {
			if (proglen != oldproglen)
				pr_err("bpb_jit_compile proglen=%u != oldproglen=%u\n",
				       proglen, oldproglen);
			break;
		}
		if (proglen == oldproglen) {
			image = module_alloc(proglen);
			if (!image)
				goto out;
		}
		oldproglen = proglen;
	}

	if (bpf_jit_enable > 1)
		bpf_jit_dump(flen, proglen, pass + 1, image);

	if (image) {
		fp->bpf_func = (void *)image;
		fp->jited = 1;
	}
out:
	kfree(addrs);
	return;
}
Exemple #3
0
/* Emit the fragment program instructions here.
 */
void brw_wm_emit( struct brw_wm_compile *c )
{
   struct brw_compile *p = &c->func;
   struct intel_context *intel = &p->brw->intel;
   GLuint insn;

   brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   if (intel->gen >= 6)
	brw_set_acc_write_control(p, 1);

   /* Check if any of the payload regs need to be spilled:
    */
   spill_values(c, c->payload.depth, 4);
   spill_values(c, c->creg, c->nr_creg);
   spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
   

   for (insn = 0; insn < c->nr_insns; insn++) {

      struct brw_wm_instruction *inst = &c->instruction[insn];
      struct brw_reg args[3][4], dst[4];
      GLuint i, dst_flags;
      
      /* Get argument regs:
       */
      for (i = 0; i < 3; i++) 
	 get_argument_regs(c, inst->src[i], args[i]);

      /* Get dest regs:
       */
      for (i = 0; i < 4; i++)
	 if (inst->dst[i])
	    dst[i] = inst->dst[i]->hw_reg;
	 else
	    dst[i] = brw_null_reg();
      
      /* Flags
       */
      dst_flags = inst->writemask;
      if (inst->saturate) 
	 dst_flags |= SATURATE;

      switch (inst->opcode) {
	 /* Generated instructions for calculating triangle interpolants:
	  */
      case WM_PIXELXY:
	 emit_pixel_xy(c, dst, dst_flags);
	 break;

      case WM_DELTAXY:
	 emit_delta_xy(p, dst, dst_flags, args[0]);
	 break;

      case WM_WPOSXY:
	 emit_wpos_xy(c, dst, dst_flags, args[0]);
	 break;

      case WM_PIXELW:
	 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
	 break;

      case WM_LINTERP:
	 emit_linterp(p, dst, dst_flags, args[0], args[1]);
	 break;

      case WM_PINTERP:
	 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
	 break;

      case WM_CINTERP:
	 emit_cinterp(p, dst, dst_flags, args[0]);
	 break;

      case WM_FB_WRITE:
	 emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
	 break;

      case WM_FRONTFACING:
	 emit_frontfacing(p, dst, dst_flags);
	 break;

	 /* Straightforward arithmetic:
	  */
      case OPCODE_ADD:
	 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_FRC:
	 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
	 break;

      case OPCODE_FLR:
	 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
	 break;

      case OPCODE_DDX:
	 emit_ddxy(p, dst, dst_flags, true, args[0]);
	 break;

      case OPCODE_DDY:
	 emit_ddxy(p, dst, dst_flags, false, args[0]);
	 break;

      case OPCODE_DP2:
	 emit_dp2(p, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_DP3:
	 emit_dp3(p, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_DP4:
	 emit_dp4(p, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_DPH:
	 emit_dph(p, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_TRUNC:
	 for (i = 0; i < 4; i++) {
	    if (dst_flags & (1<<i)) {
	       brw_RNDZ(p, dst[i], args[0][i]);
	    }
	 }
	 break;

      case OPCODE_LRP:
	 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
	 break;

      case OPCODE_MAD:	
	 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
	 break;

      case OPCODE_MOV:
      case OPCODE_SWZ:
	 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
	 break;

      case OPCODE_MUL:
	 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_XPD:
	 emit_xpd(p, dst, dst_flags, args[0], args[1]);
	 break;

	 /* Higher math functions:
	  */
      case OPCODE_RCP:
	 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
	 break;

      case OPCODE_RSQ:
	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
	 break;

      case OPCODE_SIN:
	 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
	 break;

      case OPCODE_COS:
	 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
	 break;

      case OPCODE_EX2:
	 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
	 break;

      case OPCODE_LG2:
	 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
	 break;

      case OPCODE_SCS:
	 /* There is an scs math function, but it would need some
	  * fixup for 16-element execution.
	  */
	 if (dst_flags & WRITEMASK_X)
	    emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
	 if (dst_flags & WRITEMASK_Y)
	    emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
	 break;

      case OPCODE_POW:
	 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
	 break;

	 /* Comparisons:
	  */
      case OPCODE_CMP:
	 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
	 break;

      case OPCODE_MAX:
	 emit_max(p, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_MIN:
	 emit_min(p, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_SLT:
	 emit_slt(p, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_SLE:
	 emit_sle(p, dst, dst_flags, args[0], args[1]);
	break;
      case OPCODE_SGT:
	 emit_sgt(p, dst, dst_flags, args[0], args[1]);
	break;
      case OPCODE_SGE:
	 emit_sge(p, dst, dst_flags, args[0], args[1]);
	 break;
      case OPCODE_SEQ:
	 emit_seq(p, dst, dst_flags, args[0], args[1]);
	break;
      case OPCODE_SNE:
	 emit_sne(p, dst, dst_flags, args[0], args[1]);
	break;

      case OPCODE_SSG:
	 emit_sign(p, dst, dst_flags, args[0]);
	 break;

      case OPCODE_LIT:
	 emit_lit(c, dst, dst_flags, args[0]);
	 break;

	 /* Texturing operations:
	  */
      case OPCODE_TEX:
	 emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
		  inst->tex_idx, inst->tex_unit,
		  inst->tex_shadow);
	 break;

      case OPCODE_TXB:
	 emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
		  inst->tex_idx, inst->tex_unit);
	 break;

      case OPCODE_KIL:
	 emit_kil(c, args[0]);
	 break;

      default:
	 printf("Unsupported opcode %i (%s) in fragment shader\n",
		inst->opcode, inst->opcode < MAX_OPCODE ?
		_mesa_opcode_string(inst->opcode) :
		"unknown");
      }
      
      for (i = 0; i < 4; i++)
	if (inst->dst[i] && inst->dst[i]->spill_slot) 
	   emit_spill(c, 
		      inst->dst[i]->hw_reg, 
		      inst->dst[i]->spill_slot);
   }

   /* Only properly tested on ILK */
   if (p->brw->intel.gen == 5) {
     brw_remove_duplicate_mrf_moves(p);
     if (c->dispatch_width == 16)
	brw_remove_grf_to_mrf_moves(p);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      int i;

     printf("wm-native:\n");
     for (i = 0; i < p->nr_insn; i++)
	 brw_disasm(stdout, &p->store[i], p->brw->intel.gen);
      printf("\n");
   }
}
brw_blorp_eu_emitter::brw_blorp_eu_emitter(struct brw_context *brw)
   : brw_ctx (brw), mem_ctx(ralloc_context(NULL)), c(rzalloc(mem_ctx, struct brw_wm_compile)),
     generator(brw, c, NULL, NULL, false)
{
}

brw_blorp_eu_emitter::~brw_blorp_eu_emitter()
{
   ralloc_free(mem_ctx);
}

const unsigned *
brw_blorp_eu_emitter::get_program(unsigned *program_size, FILE *dump_file)
{
   const unsigned *res;

   if (unlikely(INTEL_DEBUG & DEBUG_BLORP)) {
      fprintf(stderr, "Native code for BLORP blit:\n");
      res = generator.generate_assembly(NULL, &insts, program_size, dump_file);
      fprintf(stderr, "\n");
   } else {
      res = generator.generate_assembly(NULL, &insts, program_size);
   }

   return res;
}

/**
 * Emit code that kills pixels whose X and Y coordinates are outside the
 * boundary of the rectangle defined by the push constants (dst_x0, dst_y0,
 * dst_x1, dst_y1).
 */
void
brw_blorp_eu_emitter::emit_kill_if_outside_rect(const struct brw_reg &x,
                                                const struct brw_reg &y,
                                                const struct brw_reg &dst_x0,
                                                const struct brw_reg &dst_x1,
                                                const struct brw_reg &dst_y0,
                                                const struct brw_reg &dst_y1)
{
   struct brw_reg f0 = brw_flag_reg(0, 0);
   struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);

   emit_cmp(BRW_CONDITIONAL_GE, x, dst_x0);
   emit_cmp(BRW_CONDITIONAL_GE, y, dst_y0)->predicate = BRW_PREDICATE_NORMAL;
   emit_cmp(BRW_CONDITIONAL_L, x, dst_x1)->predicate = BRW_PREDICATE_NORMAL;
   emit_cmp(BRW_CONDITIONAL_L, y, dst_y1)->predicate = BRW_PREDICATE_NORMAL;

   fs_inst *inst = new (mem_ctx) fs_inst(BRW_OPCODE_AND, g1, f0, g1);
   inst->force_writemask_all = true;
   insts.push_tail(inst);
}

void
brw_blorp_eu_emitter::emit_texture_lookup(const struct brw_reg &dst,
                                          enum opcode op,
                                          unsigned base_mrf,
                                          unsigned msg_length)
{
   fs_inst *inst = new (mem_ctx) fs_inst(op, dst, brw_message_reg(base_mrf));

   inst->base_mrf = base_mrf;
   inst->mlen = msg_length;
   inst->sampler = 0;
   inst->header_present = false;

   insts.push_tail(inst);
}
/* Emit the fragment program instructions here.
 */
void brw_wm_emit( struct brw_wm_compile *c )
{
   struct brw_compile *p = &c->func;
   GLuint insn;

   brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);

   /* Check if any of the payload regs need to be spilled:
    */
   spill_values(c, c->payload.depth, 4);
   spill_values(c, c->creg, c->nr_creg);
   spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
   

   for (insn = 0; insn < c->nr_insns; insn++) {

      struct brw_wm_instruction *inst = &c->instruction[insn];
      struct brw_reg args[3][4], dst[4];
      GLuint i, dst_flags;
      
      /* Get argument regs:
       */
      for (i = 0; i < 3; i++) 
	 get_argument_regs(c, inst->src[i], args[i]);

      /* Get dest regs:
       */
      for (i = 0; i < 4; i++)
	 if (inst->dst[i])
	    dst[i] = inst->dst[i]->hw_reg;
	 else
	    dst[i] = brw_null_reg();
      
      /* Flags
       */
      dst_flags = inst->writemask;
      if (inst->saturate) 
	 dst_flags |= SATURATE;

      switch (inst->opcode) {
	 /* Generated instructions for calculating triangle interpolants:
	  */
      case WM_PIXELXY:
	 emit_pixel_xy(p, dst, dst_flags, args[0]);
	 break;

      case WM_DELTAXY:
	 emit_delta_xy(p, dst, dst_flags, args[0], args[1]);
	 break;

      case WM_WPOSXY:
	 emit_wpos_xy(c, dst, dst_flags, args[0]);
	 break;

      case WM_PIXELW:
	 emit_pixel_w(p, dst, dst_flags, args[0], args[1]);
	 break;

      case WM_LINTERP:
	 emit_linterp(p, dst, dst_flags, args[0], args[1]);
	 break;

      case WM_PINTERP:
	 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
	 break;

      case WM_CINTERP:
	 emit_cinterp(p, dst, dst_flags, args[0]);
	 break;

      case WM_FB_WRITE:
	 emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
	 break;

	 /* Straightforward arithmetic:
	  */
      case OPCODE_ADD:
	 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_FRC:
	 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
	 break;

      case OPCODE_FLR:
	 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
	 break;

      case OPCODE_DP3:	/*  */
	 emit_dp3(p, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_DP4:
	 emit_dp4(p, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_DPH:
	 emit_dph(p, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_LRP:	/*  */
	 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
	 break;

      case OPCODE_MAD:	
	 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
	 break;

      case OPCODE_MOV:
      case OPCODE_SWZ:
	 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
	 break;

      case OPCODE_MUL:
	 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_XPD:
	 emit_xpd(p, dst, dst_flags, args[0], args[1]);
	 break;

	 /* Higher math functions:
	  */
      case OPCODE_RCP:
	 emit_math1(p, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
	 break;

      case OPCODE_RSQ:
	 emit_math1(p, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
	 break;

      case OPCODE_SIN:
	 emit_math1(p, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
	 break;

      case OPCODE_COS:
	 emit_math1(p, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
	 break;

      case OPCODE_EX2:
	 emit_math1(p, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
	 break;

      case OPCODE_LG2:
	 emit_math1(p, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
	 break;

      case OPCODE_SCS:
	 /* There is an scs math function, but it would need some
	  * fixup for 16-element execution.
	  */
	 if (dst_flags & WRITEMASK_X)
	    emit_math1(p, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
	 if (dst_flags & WRITEMASK_Y)
	    emit_math1(p, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
	 break;

      case OPCODE_POW:
	 emit_math2(p, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
	 break;

	 /* Comparisons:
	  */
      case OPCODE_CMP:
	 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
	 break;

      case OPCODE_MAX:
	 emit_max(p, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_MIN:
	 emit_min(p, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_SLT:
	 emit_slt(p, dst, dst_flags, args[0], args[1]);
	 break;

      case OPCODE_SLE:
	 emit_sle(p, dst, dst_flags, args[0], args[1]);
	break;
      case OPCODE_SGT:
	 emit_sgt(p, dst, dst_flags, args[0], args[1]);
	break;
      case OPCODE_SGE:
	 emit_sge(p, dst, dst_flags, args[0], args[1]);
	 break;
      case OPCODE_SEQ:
	 emit_seq(p, dst, dst_flags, args[0], args[1]);
	break;
      case OPCODE_SNE:
	 emit_sne(p, dst, dst_flags, args[0], args[1]);
	break;

      case OPCODE_LIT:
	 emit_lit(p, dst, dst_flags, args[0]);
	 break;

	 /* Texturing operations:
	  */
      case OPCODE_TEX:
	 emit_tex(c, inst, dst, dst_flags, args[0]);
	 break;

      case OPCODE_TXB:
	 emit_txb(c, inst, dst, dst_flags, args[0]);
	 break;

      case OPCODE_KIL:
	 emit_kil(c, args[0]);
	 break;

      default:
	_mesa_printf("unsupport opcode %d in fragment program\n", 
		inst->opcode);
      }
      
      for (i = 0; i < 4; i++)
	if (inst->dst[i] && inst->dst[i]->spill_slot) 
	   emit_spill(c, 
		      inst->dst[i]->hw_reg, 
		      inst->dst[i]->spill_slot);
   }
}