struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen,
						  u8 **image_ptr)
{
	unsigned int sz, hole;
	struct bpf_binary_header *header;

	/* Most of BPF filters are really small,
	 * but if some of them fill a page, allow at least
	 * 128 extra bytes to insert a random section of int3
	 */
	sz = round_up(proglen + sizeof(*header) + 128, PAGE_SIZE);
	header = module_alloc(sz);
	if (!header)
		return NULL;

	memset(header, 0xcc, sz); /* fill whole space with int3 instructions */

	header->pages = sz / PAGE_SIZE;
	hole = sz - (proglen + sizeof(*header));

	/* insert a random number of int3 instructions before BPF code */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
	*image_ptr = &header->image[prandom_u32() % hole];
#else
	*image_ptr = &header->image[32 % hole];
#endif
	return header;
}
Beispiel #2
0
/* Make page to RO mode when allocate it */
void *alloc_insn_page(void)
{
	void *page;

	page = module_alloc(PAGE_SIZE);
	if (page)
		set_memory_ro((unsigned long)page & PAGE_MASK, 1);

	return page;
}
Beispiel #3
0
static void *module_alloc_update_bounds(unsigned long size)
{
	void *ret = module_alloc(size);;
	if (ret) {
		if ((unsigned long)ret < module_addr_min)
			module_addr_min = (unsigned long)ret;
		if ((unsigned long)ret + size > module_addr_max)
			module_addr_max = (unsigned long)ret + size;
	}
	return ret;
}
void bpf_jit_compile(struct bpf_prog *fp)
{
	unsigned int cleanup_addr, proglen, oldproglen = 0;
	u32 temp[8], *prog, *func, seen = 0, pass;
	const struct sock_filter *filter = fp->insns;
	int i, flen = fp->len, pc_ret0 = -1;
	unsigned int *addrs;
	void *image;

	if (!bpf_jit_enable)
		return;

	addrs = kmalloc_array(flen, sizeof(*addrs), GFP_KERNEL);
	if (addrs == NULL)
		return;

	/* Before first pass, make a rough estimation of addrs[]
	 * each bpf instruction is translated to less than 64 bytes
	 */
	for (proglen = 0, i = 0; i < flen; i++) {
		proglen += 64;
		addrs[i] = proglen;
	}
	cleanup_addr = proglen; /* epilogue address */
	image = NULL;
	for (pass = 0; pass < 10; pass++) {
		u8 seen_or_pass0 = (pass == 0) ? (SEEN_XREG | SEEN_DATAREF | SEEN_MEM) : seen;

		/* no prologue/epilogue for trivial filters (RET something) */
		proglen = 0;
		prog = temp;

		/* Prologue */
		if (seen_or_pass0) {
			if (seen_or_pass0 & SEEN_MEM) {
				unsigned int sz = BASE_STACKFRAME;
				sz += BPF_MEMWORDS * sizeof(u32);
				emit_alloc_stack(sz);
			}

			/* Make sure we dont leek kernel memory. */
			if (seen_or_pass0 & SEEN_XREG)
				emit_clear(r_X);

			/* If this filter needs to access skb data,
			 * load %o4 and %o5 with:
			 *  %o4 = skb->len - skb->data_len
			 *  %o5 = skb->data
			 * And also back up %o7 into r_saved_O7 so we can
			 * invoke the stubs using 'call'.
			 */
			if (seen_or_pass0 & SEEN_DATAREF) {
				emit_load32(r_SKB, struct sk_buff, len, r_HEADLEN);
				emit_load32(r_SKB, struct sk_buff, data_len, r_TMP);
				emit_sub(r_HEADLEN, r_TMP, r_HEADLEN);
				emit_loadptr(r_SKB, struct sk_buff, data, r_SKB_DATA);
			}
		}
		emit_reg_move(O7, r_saved_O7);

		/* Make sure we dont leak kernel information to the user. */
		if (bpf_needs_clear_a(&filter[0]))
			emit_clear(r_A); /* A = 0 */

		for (i = 0; i < flen; i++) {
			unsigned int K = filter[i].k;
			unsigned int t_offset;
			unsigned int f_offset;
			u32 t_op, f_op;
			u16 code = bpf_anc_helper(&filter[i]);
			int ilen;

			switch (code) {
			case BPF_ALU | BPF_ADD | BPF_X:	/* A += X; */
				emit_alu_X(ADD);
				break;
			case BPF_ALU | BPF_ADD | BPF_K:	/* A += K; */
				emit_alu_K(ADD, K);
				break;
			case BPF_ALU | BPF_SUB | BPF_X:	/* A -= X; */
				emit_alu_X(SUB);
				break;
			case BPF_ALU | BPF_SUB | BPF_K:	/* A -= K */
				emit_alu_K(SUB, K);
				break;
			case BPF_ALU | BPF_AND | BPF_X:	/* A &= X */
				emit_alu_X(AND);
				break;
			case BPF_ALU | BPF_AND | BPF_K:	/* A &= K */
				emit_alu_K(AND, K);
				break;
			case BPF_ALU | BPF_OR | BPF_X:	/* A |= X */
				emit_alu_X(OR);
				break;
			case BPF_ALU | BPF_OR | BPF_K:	/* A |= K */
				emit_alu_K(OR, K);
				break;
			case BPF_ANC | SKF_AD_ALU_XOR_X: /* A ^= X; */
			case BPF_ALU | BPF_XOR | BPF_X:
				emit_alu_X(XOR);
				break;
			case BPF_ALU | BPF_XOR | BPF_K:	/* A ^= K */
				emit_alu_K(XOR, K);
				break;
			case BPF_ALU | BPF_LSH | BPF_X:	/* A <<= X */
				emit_alu_X(SLL);
				break;
			case BPF_ALU | BPF_LSH | BPF_K:	/* A <<= K */
				emit_alu_K(SLL, K);
				break;
			case BPF_ALU | BPF_RSH | BPF_X:	/* A >>= X */
				emit_alu_X(SRL);
				break;
			case BPF_ALU | BPF_RSH | BPF_K:	/* A >>= K */
				emit_alu_K(SRL, K);
				break;
			case BPF_ALU | BPF_MUL | BPF_X:	/* A *= X; */
				emit_alu_X(MUL);
				break;
			case BPF_ALU | BPF_MUL | BPF_K:	/* A *= K */
				emit_alu_K(MUL, K);
				break;
			case BPF_ALU | BPF_DIV | BPF_K:	/* A /= K with K != 0*/
				if (K == 1)
					break;
				emit_write_y(G0);
				/* The Sparc v8 architecture requires
				 * three instructions between a %y
				 * register write and the first use.
				 */
				emit_nop();
				emit_nop();
				emit_nop();
				emit_alu_K(DIV, K);
				break;
			case BPF_ALU | BPF_DIV | BPF_X:	/* A /= X; */
				emit_cmpi(r_X, 0);
				if (pc_ret0 > 0) {
					t_offset = addrs[pc_ret0 - 1];
					emit_branch(BE, t_offset + 20);
					emit_nop(); /* delay slot */
				} else {
					emit_branch_off(BNE, 16);
					emit_nop();
					emit_jump(cleanup_addr + 20);
					emit_clear(r_A);
				}
				emit_write_y(G0);
				/* The Sparc v8 architecture requires
				 * three instructions between a %y
				 * register write and the first use.
				 */
				emit_nop();
				emit_nop();
				emit_nop();
				emit_alu_X(DIV);
				break;
			case BPF_ALU | BPF_NEG:
				emit_neg();
				break;
			case BPF_RET | BPF_K:
				if (!K) {
					if (pc_ret0 == -1)
						pc_ret0 = i;
					emit_clear(r_A);
				} else {
					emit_loadimm(K, r_A);
				}
				/* Fallthrough */
			case BPF_RET | BPF_A:
				if (seen_or_pass0) {
					if (i != flen - 1) {
						emit_jump(cleanup_addr);
						emit_nop();
						break;
					}
					if (seen_or_pass0 & SEEN_MEM) {
						unsigned int sz = BASE_STACKFRAME;
						sz += BPF_MEMWORDS * sizeof(u32);
						emit_release_stack(sz);
					}
				}
				/* jmpl %r_saved_O7 + 8, %g0 */
				emit_jmpl(r_saved_O7, 8, G0);
				emit_reg_move(r_A, O0); /* delay slot */
				break;
			case BPF_MISC | BPF_TAX:
				seen |= SEEN_XREG;
				emit_reg_move(r_A, r_X);
				break;
			case BPF_MISC | BPF_TXA:
				seen |= SEEN_XREG;
				emit_reg_move(r_X, r_A);
				break;
			case BPF_ANC | SKF_AD_CPU:
				emit_load_cpu(r_A);
				break;
			case BPF_ANC | SKF_AD_PROTOCOL:
				emit_skb_load16(protocol, r_A);
				break;
			case BPF_ANC | SKF_AD_PKTTYPE:
				__emit_skb_load8(__pkt_type_offset, r_A);
				emit_andi(r_A, PKT_TYPE_MAX, r_A);
				emit_alu_K(SRL, 5);
				break;
			case BPF_ANC | SKF_AD_IFINDEX:
				emit_skb_loadptr(dev, r_A);
				emit_cmpi(r_A, 0);
				emit_branch(BE_PTR, cleanup_addr + 4);
				emit_nop();
				emit_load32(r_A, struct net_device, ifindex, r_A);
				break;
			case BPF_ANC | SKF_AD_MARK:
				emit_skb_load32(mark, r_A);
				break;
			case BPF_ANC | SKF_AD_QUEUE:
				emit_skb_load16(queue_mapping, r_A);
				break;
			case BPF_ANC | SKF_AD_HATYPE:
				emit_skb_loadptr(dev, r_A);
				emit_cmpi(r_A, 0);
				emit_branch(BE_PTR, cleanup_addr + 4);
				emit_nop();
				emit_load16(r_A, struct net_device, type, r_A);
				break;
			case BPF_ANC | SKF_AD_RXHASH:
				emit_skb_load32(hash, r_A);
				break;
			case BPF_ANC | SKF_AD_VLAN_TAG:
				emit_skb_load16(vlan_tci, r_A);
				break;
			case BPF_ANC | SKF_AD_VLAN_TAG_PRESENT:
				__emit_skb_load8(__pkt_vlan_present_offset, r_A);
				if (PKT_VLAN_PRESENT_BIT)
					emit_alu_K(SRL, PKT_VLAN_PRESENT_BIT);
				if (PKT_VLAN_PRESENT_BIT < 7)
					emit_andi(r_A, 1, r_A);
				break;
			case BPF_LD | BPF_W | BPF_LEN:
				emit_skb_load32(len, r_A);
				break;
			case BPF_LDX | BPF_W | BPF_LEN:
				emit_skb_load32(len, r_X);
				break;
			case BPF_LD | BPF_IMM:
				emit_loadimm(K, r_A);
				break;
			case BPF_LDX | BPF_IMM:
				emit_loadimm(K, r_X);
				break;
			case BPF_LD | BPF_MEM:
				seen |= SEEN_MEM;
				emit_ldmem(K * 4, r_A);
				break;
			case BPF_LDX | BPF_MEM:
				seen |= SEEN_MEM | SEEN_XREG;
				emit_ldmem(K * 4, r_X);
				break;
			case BPF_ST:
				seen |= SEEN_MEM;
				emit_stmem(K * 4, r_A);
				break;
			case BPF_STX:
				seen |= SEEN_MEM | SEEN_XREG;
				emit_stmem(K * 4, r_X);
				break;

#define CHOOSE_LOAD_FUNC(K, func) \
	((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)

			case BPF_LD | BPF_W | BPF_ABS:
				func = CHOOSE_LOAD_FUNC(K, bpf_jit_load_word);
common_load:			seen |= SEEN_DATAREF;
				emit_loadimm(K, r_OFF);
				emit_call(func);
				break;
			case BPF_LD | BPF_H | BPF_ABS:
				func = CHOOSE_LOAD_FUNC(K, bpf_jit_load_half);
				goto common_load;
			case BPF_LD | BPF_B | BPF_ABS:
				func = CHOOSE_LOAD_FUNC(K, bpf_jit_load_byte);
				goto common_load;
			case BPF_LDX | BPF_B | BPF_MSH:
				func = CHOOSE_LOAD_FUNC(K, bpf_jit_load_byte_msh);
				goto common_load;
			case BPF_LD | BPF_W | BPF_IND:
				func = bpf_jit_load_word;
common_load_ind:		seen |= SEEN_DATAREF | SEEN_XREG;
				if (K) {
					if (is_simm13(K)) {
						emit_addi(r_X, K, r_OFF);
					} else {
						emit_loadimm(K, r_TMP);
						emit_add(r_X, r_TMP, r_OFF);
					}
				} else {
					emit_reg_move(r_X, r_OFF);
				}
				emit_call(func);
				break;
			case BPF_LD | BPF_H | BPF_IND:
				func = bpf_jit_load_half;
				goto common_load_ind;
			case BPF_LD | BPF_B | BPF_IND:
				func = bpf_jit_load_byte;
				goto common_load_ind;
			case BPF_JMP | BPF_JA:
				emit_jump(addrs[i + K]);
				emit_nop();
				break;

#define COND_SEL(CODE, TOP, FOP)	\
	case CODE:			\
		t_op = TOP;		\
		f_op = FOP;		\
		goto cond_branch

			COND_SEL(BPF_JMP | BPF_JGT | BPF_K, BGU, BLEU);
			COND_SEL(BPF_JMP | BPF_JGE | BPF_K, BGEU, BLU);
			COND_SEL(BPF_JMP | BPF_JEQ | BPF_K, BE, BNE);
			COND_SEL(BPF_JMP | BPF_JSET | BPF_K, BNE, BE);
			COND_SEL(BPF_JMP | BPF_JGT | BPF_X, BGU, BLEU);
			COND_SEL(BPF_JMP | BPF_JGE | BPF_X, BGEU, BLU);
			COND_SEL(BPF_JMP | BPF_JEQ | BPF_X, BE, BNE);
			COND_SEL(BPF_JMP | BPF_JSET | BPF_X, BNE, BE);

cond_branch:			f_offset = addrs[i + filter[i].jf];
				t_offset = addrs[i + filter[i].jt];

				/* same targets, can avoid doing the test :) */
				if (filter[i].jt == filter[i].jf) {
					emit_jump(t_offset);
					emit_nop();
					break;
				}

				switch (code) {
				case BPF_JMP | BPF_JGT | BPF_X:
				case BPF_JMP | BPF_JGE | BPF_X:
				case BPF_JMP | BPF_JEQ | BPF_X:
					seen |= SEEN_XREG;
					emit_cmp(r_A, r_X);
					break;
				case BPF_JMP | BPF_JSET | BPF_X:
					seen |= SEEN_XREG;
					emit_btst(r_A, r_X);
					break;
				case BPF_JMP | BPF_JEQ | BPF_K:
				case BPF_JMP | BPF_JGT | BPF_K:
				case BPF_JMP | BPF_JGE | BPF_K:
					if (is_simm13(K)) {
						emit_cmpi(r_A, K);
					} else {
						emit_loadimm(K, r_TMP);
						emit_cmp(r_A, r_TMP);
					}
					break;
				case BPF_JMP | BPF_JSET | BPF_K:
					if (is_simm13(K)) {
						emit_btsti(r_A, K);
					} else {
						emit_loadimm(K, r_TMP);
						emit_btst(r_A, r_TMP);
					}
					break;
				}
				if (filter[i].jt != 0) {
					if (filter[i].jf)
						t_offset += 8;
					emit_branch(t_op, t_offset);
					emit_nop(); /* delay slot */
					if (filter[i].jf) {
						emit_jump(f_offset);
						emit_nop();
					}
					break;
				}
				emit_branch(f_op, f_offset);
				emit_nop(); /* delay slot */
				break;

			default:
				/* hmm, too complex filter, give up with jit compiler */
				goto out;
			}
			ilen = (void *) prog - (void *) temp;
			if (image) {
				if (unlikely(proglen + ilen > oldproglen)) {
					pr_err("bpb_jit_compile fatal error\n");
					kfree(addrs);
					module_memfree(image);
					return;
				}
				memcpy(image + proglen, temp, ilen);
			}
			proglen += ilen;
			addrs[i] = proglen;
			prog = temp;
		}
		/* last bpf instruction is always a RET :
		 * use it to give the cleanup instruction(s) addr
		 */
		cleanup_addr = proglen - 8; /* jmpl; mov r_A,%o0; */
		if (seen_or_pass0 & SEEN_MEM)
			cleanup_addr -= 4; /* add %sp, X, %sp; */

		if (image) {
			if (proglen != oldproglen)
				pr_err("bpb_jit_compile proglen=%u != oldproglen=%u\n",
				       proglen, oldproglen);
			break;
		}
		if (proglen == oldproglen) {
			image = module_alloc(proglen);
			if (!image)
				goto out;
		}
		oldproglen = proglen;
	}

	if (bpf_jit_enable > 1)
		bpf_jit_dump(flen, proglen, pass + 1, image);

	if (image) {
		fp->bpf_func = (void *)image;
		fp->jited = 1;
	}
out:
	kfree(addrs);
	return;
}
Beispiel #5
0
void bpf_jit_compile(struct bpf_prog *fp)
{
	unsigned int proglen;
	unsigned int alloclen;
	u32 *image = NULL;
	u32 *code_base;
	unsigned int *addrs;
	struct codegen_context cgctx;
	int pass;
	int flen = fp->len;

	if (!bpf_jit_enable)
		return;

	addrs = kzalloc((flen+1) * sizeof(*addrs), GFP_KERNEL);
	if (addrs == NULL)
		return;

	/*
	 * There are multiple assembly passes as the generated code will change
	 * size as it settles down, figuring out the max branch offsets/exit
	 * paths required.
	 *
	 * The range of standard conditional branches is +/- 32Kbytes.	Since
	 * BPF_MAXINSNS = 4096, we can only jump from (worst case) start to
	 * finish with 8 bytes/instruction.  Not feasible, so long jumps are
	 * used, distinct from short branches.
	 *
	 * Current:
	 *
	 * For now, both branch types assemble to 2 words (short branches padded
	 * with a NOP); this is less efficient, but assembly will always complete
	 * after exactly 3 passes:
	 *
	 * First pass: No code buffer; Program is "faux-generated" -- no code
	 * emitted but maximum size of output determined (and addrs[] filled
	 * in).	 Also, we note whether we use M[], whether we use skb data, etc.
	 * All generation choices assumed to be 'worst-case', e.g. branches all
	 * far (2 instructions), return path code reduction not available, etc.
	 *
	 * Second pass: Code buffer allocated with size determined previously.
	 * Prologue generated to support features we have seen used.  Exit paths
	 * determined and addrs[] is filled in again, as code may be slightly
	 * smaller as a result.
	 *
	 * Third pass: Code generated 'for real', and branch destinations
	 * determined from now-accurate addrs[] map.
	 *
	 * Ideal:
	 *
	 * If we optimise this, near branches will be shorter.	On the
	 * first assembly pass, we should err on the side of caution and
	 * generate the biggest code.  On subsequent passes, branches will be
	 * generated short or long and code size will reduce.  With smaller
	 * code, more branches may fall into the short category, and code will
	 * reduce more.
	 *
	 * Finally, if we see one pass generate code the same size as the
	 * previous pass we have converged and should now generate code for
	 * real.  Allocating at the end will also save the memory that would
	 * otherwise be wasted by the (small) current code shrinkage.
	 * Preferably, we should do a small number of passes (e.g. 5) and if we
	 * haven't converged by then, get impatient and force code to generate
	 * as-is, even if the odd branch would be left long.  The chances of a
	 * long jump are tiny with all but the most enormous of BPF filter
	 * inputs, so we should usually converge on the third pass.
	 */

	cgctx.idx = 0;
	cgctx.seen = 0;
	cgctx.pc_ret0 = -1;
	/* Scouting faux-generate pass 0 */
	if (bpf_jit_build_body(fp, 0, &cgctx, addrs))
		/* We hit something illegal or unsupported. */
		goto out;

	/*
	 * Pretend to build prologue, given the features we've seen.  This will
	 * update ctgtx.idx as it pretends to output instructions, then we can
	 * calculate total size from idx.
	 */
	bpf_jit_build_prologue(fp, 0, &cgctx);
	bpf_jit_build_epilogue(0, &cgctx);

	proglen = cgctx.idx * 4;
	alloclen = proglen + FUNCTION_DESCR_SIZE;
	image = module_alloc(alloclen);
	if (!image)
		goto out;

	code_base = image + (FUNCTION_DESCR_SIZE/4);

	/* Code generation passes 1-2 */
	for (pass = 1; pass < 3; pass++) {
		/* Now build the prologue, body code & epilogue for real. */
		cgctx.idx = 0;
		bpf_jit_build_prologue(fp, code_base, &cgctx);
		bpf_jit_build_body(fp, code_base, &cgctx, addrs);
		bpf_jit_build_epilogue(code_base, &cgctx);

		if (bpf_jit_enable > 1)
			pr_info("Pass %d: shrink = %d, seen = 0x%x\n", pass,
				proglen - (cgctx.idx * 4), cgctx.seen);
	}

	if (bpf_jit_enable > 1)
		/* Note that we output the base address of the code_base
		 * rather than image, since opcodes are in code_base.
		 */
		bpf_jit_dump(flen, proglen, pass, code_base);

	if (image) {
		bpf_flush_icache(code_base, code_base + (proglen/4));
		/* Function descriptor nastiness: Address + TOC */
		((u64 *)image)[0] = (u64)code_base;
		((u64 *)image)[1] = local_paca->kernel_toc;
		fp->bpf_func = (void *)image;
		fp->jited = true;
	}
out:
	kfree(addrs);
	return;
}