void bpf_jit_compile(struct bpf_prog *fp)
{
        unsigned int proglen;
        unsigned int alloclen;
        u32 *image = NULL;
        u32 *code_base;
        unsigned int *addrs;
        struct codegen_context cgctx;
        int pass;
        int flen = fp->len;

        if (!bpf_jit_enable)
                return;

        addrs = kzalloc((flen+1) * sizeof(*addrs), GFP_KERNEL);
        if (addrs == NULL)
                return;

        /*
         * There are multiple assembly passes as the generated code will change
         * size as it settles down, figuring out the max branch offsets/exit
         * paths required.
         *
         * The range of standard conditional branches is +/- 32Kbytes. Since
         * BPF_MAXINSNS = 4096, we can only jump from (worst case) start to
         * finish with 8 bytes/instruction. Not feasible, so long jumps are
         * used, distinct from short branches.
         *
         * Current:
         *
         * For now, both branch types assemble to 2 words (short branches padded
         * with a NOP); this is less efficient, but assembly will always complete
         * after exactly 3 passes:
         *
         * First pass: No code buffer; Program is "faux-generated" -- no code
         * emitted but maximum size of output determined (and addrs[] filled
         * in). Also, we note whether we use M[], whether we use skb data, etc.
         * All generation choices assumed to be 'worst-case', e.g. branches all
         * far (2 instructions), return path code reduction not available, etc.
         *
         * Second pass: Code buffer allocated with size determined previously.
         * Prologue generated to support features we have seen used. Exit paths
         * determined and addrs[] is filled in again, as code may be slightly
         * smaller as a result.
         *
         * Third pass: Code generated 'for real', and branch destinations
         * determined from now-accurate addrs[] map.
         *
         * Ideal:
         *
         * If we optimise this, near branches will be shorter. On the
         * first assembly pass, we should err on the side of caution and
         * generate the biggest code. On subsequent passes, branches will be
         * generated short or long and code size will reduce. With smaller
         * code, more branches may fall into the short category, and code will
         * reduce more.
         *
         * Finally, if we see one pass generate code the same size as the
         * previous pass, we have converged and should now generate code for
         * real. Allocating at the end will also save the memory that would
         * otherwise be wasted by the (small) current code shrinkage.
         * Preferably, we should do a small number of passes (e.g. 5) and if we
         * haven't converged by then, get impatient and force code to generate
         * as-is, even if the odd branch would be left long. The chances of a
         * long jump are tiny with all but the most enormous of BPF filter
         * inputs, so we should usually converge on the third pass.
         */

        cgctx.idx = 0;
        cgctx.seen = 0;
        cgctx.pc_ret0 = -1;
        /* Scouting faux-generate pass 0 */
        if (bpf_jit_build_body(fp, 0, &cgctx, addrs))
                /* We hit something illegal or unsupported. */
                goto out;

        /*
         * Pretend to build the prologue, given the features we've seen. This
         * will update cgctx.idx as it pretends to output instructions, then
         * we can calculate the total size from idx.
         */
        bpf_jit_build_prologue(fp, 0, &cgctx);
        bpf_jit_build_epilogue(0, &cgctx);

        proglen = cgctx.idx * 4;
        alloclen = proglen + FUNCTION_DESCR_SIZE;
        image = module_alloc(alloclen);
        if (!image)
                goto out;

        code_base = image + (FUNCTION_DESCR_SIZE/4);

        /* Code generation passes 1-2 */
        for (pass = 1; pass < 3; pass++) {
                /* Now build the prologue, body code & epilogue for real. */
                cgctx.idx = 0;
                bpf_jit_build_prologue(fp, code_base, &cgctx);
                bpf_jit_build_body(fp, code_base, &cgctx, addrs);
                bpf_jit_build_epilogue(code_base, &cgctx);

                if (bpf_jit_enable > 1)
                        pr_info("Pass %d: shrink = %d, seen = 0x%x\n", pass,
                                proglen - (cgctx.idx * 4), cgctx.seen);
        }

        if (bpf_jit_enable > 1)
                /*
                 * Note that we output the base address of code_base rather
                 * than image, since opcodes are in code_base.
                 */
                bpf_jit_dump(flen, proglen, pass, code_base);

        if (image) {
                bpf_flush_icache(code_base, code_base + (proglen/4));
                /* Function descriptor nastiness: Address + TOC */
                ((u64 *)image)[0] = (u64)code_base;
                ((u64 *)image)[1] = local_paca->kernel_toc;
                fp->bpf_func = (void *)image;
                fp->jited = true;
        }
out:
        kfree(addrs);
        return;
}
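/*
 * For reference, the "Ideal:" strategy described in the big comment above
 * would replace the fixed three-pass scheme with a convergence loop: keep
 * faux-generating (NULL code buffer) until a pass emits the same size as the
 * previous one, capped at 5 passes as the comment suggests. The helper below
 * is a hypothetical sketch, not kernel code; it assumes the same
 * bpf_jit_build_*() helpers and codegen_context as bpf_jit_compile() above.
 */
static unsigned int bpf_jit_converge_size(struct bpf_prog *fp,
                                          struct codegen_context *ctx,
                                          unsigned int *addrs)
{
        unsigned int prev = 0;
        int pass;

        for (pass = 0; pass < 5; pass++) {
                ctx->idx = 0;
                /* Body first, so ctx->seen is settled before the prologue. */
                if (bpf_jit_build_body(fp, 0, ctx, addrs))
                        return 0;       /* illegal or unsupported insn */
                bpf_jit_build_prologue(fp, 0, ctx);
                bpf_jit_build_epilogue(0, ctx);

                if (ctx->idx * 4 == prev)
                        break;          /* no shrinkage since last pass */
                prev = ctx->idx * 4;
        }

        /*
         * Either converged, or we got impatient after 5 passes and accept
         * the odd long branch. The caller allocates this many bytes (plus
         * FUNCTION_DESCR_SIZE) and then generates code for real.
         */
        return ctx->idx * 4;
}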
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
{
        u32 proglen;
        u32 alloclen;
        u8 *image = NULL;
        u32 *code_base;
        u32 *addrs;
        struct powerpc64_jit_data *jit_data;
        struct codegen_context cgctx;
        int pass;
        int flen;
        struct bpf_binary_header *bpf_hdr;
        struct bpf_prog *org_fp = fp;
        struct bpf_prog *tmp_fp;
        bool bpf_blinded = false;
        bool extra_pass = false;

        if (!fp->jit_requested)
                return org_fp;

        tmp_fp = bpf_jit_blind_constants(org_fp);
        if (IS_ERR(tmp_fp))
                return org_fp;
        if (tmp_fp != org_fp) {
                bpf_blinded = true;
                fp = tmp_fp;
        }

        jit_data = fp->aux->jit_data;
        if (!jit_data) {
                jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
                if (!jit_data) {
                        fp = org_fp;
                        goto out;
                }
                fp->aux->jit_data = jit_data;
        }

        flen = fp->len;
        addrs = jit_data->addrs;
        if (addrs) {
                cgctx = jit_data->ctx;
                image = jit_data->image;
                bpf_hdr = jit_data->header;
                proglen = jit_data->proglen;
                alloclen = proglen + FUNCTION_DESCR_SIZE;
                extra_pass = true;
                goto skip_init_ctx;
        }

        addrs = kcalloc(flen + 1, sizeof(*addrs), GFP_KERNEL);
        if (addrs == NULL) {
                fp = org_fp;
                goto out_addrs;
        }

        memset(&cgctx, 0, sizeof(struct codegen_context));

        /* Make sure that the stack is quadword aligned. */
        cgctx.stack_size = round_up(fp->aux->stack_depth, 16);

        /* Scouting faux-generate pass 0 */
        if (bpf_jit_build_body(fp, 0, &cgctx, addrs, false)) {
                /* We hit something illegal or unsupported. */
                fp = org_fp;
                goto out_addrs;
        }

        /*
         * Pretend to build the prologue, given the features we've seen. This
         * will update cgctx.idx as it pretends to output instructions, then
         * we can calculate the total size from idx.
         */
        bpf_jit_build_prologue(0, &cgctx);
        bpf_jit_build_epilogue(0, &cgctx);

        proglen = cgctx.idx * 4;
        alloclen = proglen + FUNCTION_DESCR_SIZE;

        bpf_hdr = bpf_jit_binary_alloc(alloclen, &image, 4,
                                       bpf_jit_fill_ill_insns);
        if (!bpf_hdr) {
                fp = org_fp;
                goto out_addrs;
        }

skip_init_ctx:
        code_base = (u32 *)(image + FUNCTION_DESCR_SIZE);

        if (extra_pass) {
                /*
                 * Do not touch the prologue and epilogue as they will remain
                 * unchanged. Only fix the branch target address for subprog
                 * calls in the body.
                 *
                 * This does not change the offsets and lengths of the subprog
                 * call instruction sequences and hence, the size of the JITed
                 * image as well.
                 */
                bpf_jit_fixup_subprog_calls(fp, code_base, &cgctx, addrs);

                /* There is no need to perform the usual passes. */
                goto skip_codegen_passes;
        }

        /* Code generation passes 1-2 */
        for (pass = 1; pass < 3; pass++) {
                /* Now build the prologue, body code & epilogue for real. */
                cgctx.idx = 0;
                bpf_jit_build_prologue(code_base, &cgctx);
                bpf_jit_build_body(fp, code_base, &cgctx, addrs, extra_pass);
                bpf_jit_build_epilogue(code_base, &cgctx);

                if (bpf_jit_enable > 1)
                        pr_info("Pass %d: shrink = %d, seen = 0x%x\n", pass,
                                proglen - (cgctx.idx * 4), cgctx.seen);
        }

skip_codegen_passes:
        if (bpf_jit_enable > 1)
                /*
                 * Note that we output the base address of code_base rather
                 * than image, since opcodes are in code_base.
                 */
                bpf_jit_dump(flen, proglen, pass, code_base);

#ifdef PPC64_ELF_ABI_v1
        /* Function descriptor nastiness: Address + TOC */
        ((u64 *)image)[0] = (u64)code_base;
        ((u64 *)image)[1] = local_paca->kernel_toc;
#endif

        fp->bpf_func = (void *)image;
        fp->jited = 1;
        fp->jited_len = alloclen;

        bpf_flush_icache(bpf_hdr, (u8 *)bpf_hdr + (bpf_hdr->pages * PAGE_SIZE));

        if (!fp->is_func || extra_pass) {
out_addrs:
                kfree(addrs);
                kfree(jit_data);
                fp->aux->jit_data = NULL;
        } else {
                jit_data->addrs = addrs;
                jit_data->ctx = cgctx;
                jit_data->proglen = proglen;
                jit_data->image = image;
                jit_data->header = bpf_hdr;
        }

out:
        if (bpf_blinded)
                bpf_jit_prog_release_other(fp, fp == org_fp ? tmp_fp : org_fp);

        return fp;
}
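/*
 * For context: the extra_pass path above resumes from state cached in
 * fp->aux->jit_data, so subprog call targets can be fixed up without
 * re-running the sizing passes. The layout below is a sketch reconstructed
 * from the fields this function reads and writes; the real definition lives
 * in arch/powerpc/net/bpf_jit_comp64.c and may differ in ordering. The
 * comments are ours, not the kernel's.
 */
struct powerpc64_jit_data {
        struct bpf_binary_header *header; /* allocation backing the image */
        u32 *addrs;                       /* per-BPF-insn offsets into image */
        u8 *image;                        /* function descriptor + opcodes */
        u32 proglen;                      /* generated code length in bytes */
        struct codegen_context ctx;       /* idx, seen flags, stack_size */
};

/*
 * Likewise, the "function descriptor nastiness" in both functions exists
 * because the ELFv1 ABI makes a function pointer refer to a two-word
 * descriptor rather than to code. Schematically (this struct and its field
 * names are illustrative, not the kernel's):
 */
struct ppc64_elfv1_func_desc {
        u64 entry; /* ((u64 *)image)[0]: code_base, the first instruction */
        u64 toc;   /* ((u64 *)image)[1]: r2 value, local_paca->kernel_toc */
};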