/*
 * Basic-block event callback.
 *
 * The global 'instrument' flag is turned on when the block that starts at
 * start_pc is seen and turned off when the block that starts at stop_pc is
 * seen. While the flag is off the block is left untouched and
 * DR_EMIT_DEFAULT is returned.
 *
 * While the flag is on, each conditional branch in the block is looked up
 * in 'table' (and entered with CBR_NONE if absent). For every edge whose
 * bit (CBR_TAKEN / CBR_NOT_TAKEN) is not yet set in the stored state, a
 * clean call to at_taken / at_not_taken is added so that the edge is
 * reported only if it actually executes at runtime. This is done with two
 * separate callouts rather than dr_insert_cbr_instrumentation so that each
 * one can be dropped independently once its edge has been seen.
 *
 * for_trace and translating are not used by this function.
 *
 * Returns DR_EMIT_STORE_TRANSLATIONS whenever instrumentation is enabled,
 * because the inserted code is not the same every time the block is built.
 */
dr_emit_flags_t
bb_event(void *drcontext, void *tag, instrlist_t *bb, bool for_trace, bool translating)
{
    app_pc block_start = dr_fragment_app_pc(tag);
    instr_t *cur, *following;

    /* Update the on/off switch before deciding whether to do any work. */
    if (block_start == start_pc)
        instrument = true;
    else if (block_start == stop_pc)
        instrument = false;

    if (!instrument)
        return DR_EMIT_DEFAULT;

    for (cur = instrlist_first(bb); cur != NULL; cur = following) {
        instr_t *branch, *taken_label, *exit_jmp;
        app_pc branch_pc, fallthrough_pc, target_pc;
        elem_t *entry;
        cbr_state_t seen;
        bool need_taken, need_not_taken;

        /* Fetch the successor first: the list is modified below. */
        following = instr_get_next(cur);
        if (!instr_is_cbr(cur))
            continue;

        branch = cur;
        branch_pc = instr_get_app_pc(branch);

        /* Find out which edges of this branch still need a callout. */
        entry = lookup(table, branch_pc);
        if (entry != NULL) {
            seen = entry->state;
        } else {
            seen = CBR_NONE;
            insert(table, branch_pc, CBR_NONE);
        }
        need_taken = !(seen & CBR_TAKEN);
        need_not_taken = !(seen & CBR_NOT_TAKEN);
        if (!need_taken && !need_not_taken)
            continue;

        fallthrough_pc = (app_pc)decode_next_pc(drcontext, (byte *)branch_pc);
        target_pc = instr_get_branch_target_pc(branch);

        /*
         * Point the cbr at a label that precedes the 'taken' callout; the
         * 'not-taken' callout is placed on the fall-through path. The cbr
         * itself becomes a meta instruction without a translation.
         */
        taken_label = INSTR_CREATE_label(drcontext);
        instr_set_meta(branch);
        instr_set_translation(branch, NULL);

        /*
         * A short cti may not reach the new target. For jecxz/loop the
         * conversion returns the long-taken jump, and that is the
         * instruction whose target must be set.
         */
        if (instr_is_cti_short(branch))
            branch = instr_convert_short_meta_jmp_to_long(drcontext, bb, branch);
        instr_set_target(branch, opnd_create_instr(taken_label));

        /* Not-taken path: optional callout, then exit to the fall-through. */
        if (need_not_taken) {
            dr_insert_clean_call(drcontext, bb, NULL, (void *)at_not_taken,
                                 false /* don't save fp state */,
                                 2 /* 2 args for at_not_taken */,
                                 OPND_CREATE_INTPTR((ptr_uint_t)branch_pc),
                                 OPND_CREATE_INTPTR((ptr_uint_t)fallthrough_pc));
        }
        /* This jump is an application exit, so it must not be meta. */
        exit_jmp = INSTR_XL8(INSTR_CREATE_jmp(drcontext, opnd_create_pc(fallthrough_pc)),
                             fallthrough_pc);
        instrlist_preinsert(bb, NULL, exit_jmp);

        /* Taken path: label, optional callout, then exit to the target. */
        MINSERT(bb, NULL, taken_label);
        if (need_taken) {
            dr_insert_clean_call(drcontext, bb, NULL, (void *)at_taken,
                                 false /* don't save fp state */,
                                 2 /* 2 args for at_taken */,
                                 OPND_CREATE_INTPTR((ptr_uint_t)branch_pc),
                                 OPND_CREATE_INTPTR((ptr_uint_t)target_pc));
        }
        /* Likewise an application exit rather than a meta instruction. */
        exit_jmp = INSTR_XL8(INSTR_CREATE_jmp(drcontext, opnd_create_pc(target_pc)),
                             target_pc);
        instrlist_preinsert(bb, NULL, exit_jmp);
    }

    /* The added instrumentation is not constant, so ask for translations
     * to be stored now.
     */
    return DR_EMIT_STORE_TRANSLATIONS;
}
/*
 * Basic-block event callback.
 *
 * Each conditional branch in the block is looked up in 'table' (and
 * entered with CBR_NEITHER if absent). The target and fall-through
 * addresses are known here, but an edge should only be noted if and when
 * it executes at runtime. So, instead of dr_insert_cbr_instrumentation(),
 * a separate clean call is added for each edge whose bit (CBR_TAKEN /
 * CBR_NOT_TAKEN) is not yet set in the stored state; that lets the
 * instrumentation for one edge be removed after it executes without
 * affecting the other.
 *
 * for_trace and translating are not used by this function.
 *
 * Always returns DR_EMIT_STORE_TRANSLATIONS, because the inserted code is
 * not the same every time the block is built.
 */
static dr_emit_flags_t
bb_event(void *drcontext, void *tag, instrlist_t *bb, bool for_trace, bool translating)
{
    instr_t *cur, *following;

    for (cur = instrlist_first(bb); cur != NULL; cur = following) {
        instr_t *branch, *taken_label, *exit_jmp;
        app_pc branch_pc, fallthrough_pc, target_pc;
        elem_t *entry;
        cbr_state_t seen;
        bool need_taken, need_not_taken;

        /* Fetch the successor first: the list is modified below. */
        following = instr_get_next(cur);
        if (!instr_is_cbr(cur))
            continue;

        branch = cur;
        branch_pc = instr_get_app_pc(branch);

        /* The recorded state tells us which callouts, if any, to insert. */
        entry = lookup(table, branch_pc);
        if (entry != NULL) {
            seen = entry->state;
        } else {
            seen = CBR_NEITHER;
            insert(table, branch_pc, CBR_NEITHER);
        }
        need_taken = !(seen & CBR_TAKEN);
        need_not_taken = !(seen & CBR_NOT_TAKEN);
        if (!need_taken && !need_not_taken)
            continue;

        fallthrough_pc = (app_pc)decode_next_pc(drcontext, (byte *)branch_pc);
        target_pc = instr_get_branch_target_pc(branch);

        /* Retarget the existing cbr at a label placed in front of the
         * 'taken' callout; the 'not-taken' callout goes on the
         * fall-through path.
         */
        taken_label = INSTR_CREATE_label(drcontext);
        /* The cbr becomes meta, and meta instrs carry no translation. */
        instr_set_meta_no_translation(branch);
        /* With the added clean call a short cti may no longer reach (in
         * particular for x64). For jecxz/loop the conversion returns the
         * long-taken jump, and that is the instruction to retarget.
         */
        if (instr_is_cti_short(branch))
            branch = instr_convert_short_meta_jmp_to_long(drcontext, bb, branch);
        instr_set_target(branch, opnd_create_instr(taken_label));

        if (need_not_taken) {
            /* Not-taken callout. Passing NULL as the insertion point
             * places it after the cbr.
             */
            dr_insert_clean_call(drcontext, bb, NULL, (void *)at_not_taken,
                                 false /* don't save fp state */,
                                 2 /* 2 args for at_not_taken */,
                                 OPND_CREATE_INTPTR(branch_pc),
                                 OPND_CREATE_INTPTR(fallthrough_pc));
        }
        /* Exit to the original fall-through address. This is an exit cti
         * and not a meta instruction, hence preinsert rather than the
         * meta variant, and hence the explicit translation. On Windows
         * neither this jump nor the final one below ever executes, since
         * the at_taken and at_not_taken callouts redirect execution and
         * never return; the explicit exits are still emitted there, as on
         * Linux, because the API expects clients to produce well-formed
         * code.
         */
        exit_jmp = INSTR_XL8(INSTR_CREATE_jmp(drcontext, opnd_create_pc(fallthrough_pc)),
                             fallthrough_pc);
        instrlist_preinsert(bb, NULL, exit_jmp);

        /* The label sits immediately before the 'taken' callout. */
        MINSERT(bb, NULL, taken_label);

        if (need_taken) {
            /* Taken callout. */
            dr_insert_clean_call(drcontext, bb, NULL, (void *)at_taken,
                                 false /* don't save fp state */,
                                 2 /* 2 args for at_taken */,
                                 OPND_CREATE_INTPTR(branch_pc),
                                 OPND_CREATE_INTPTR(target_pc));
        }
        /* Exit to the original branch target (again not a meta instr). */
        exit_jmp = INSTR_XL8(INSTR_CREATE_jmp(drcontext, opnd_create_pc(target_pc)),
                             target_pc);
        instrlist_preinsert(bb, NULL, exit_jmp);
    }

    /* The added instrumentation is not constant, so ask for translations
     * to be stored now.
     */
    return DR_EMIT_STORE_TRANSLATIONS;
}
/* Here we attempt to combine a loop involving ldex (load exclusive) and
 * stex (store exclusive) into an OP_ldstex macro-instruction. The algorithm
 * is roughly this:
 *
 * Decode up to (2 * N) instructions while:
 * - none of them are indirect branches or system calls
 * - none of them is a direct branch out of these (2 * N) instructions
 * - none of them is OP_xx (to be safe)
 * - there is, or might yet be, both ldex and stex in the first N
 * - none of them is a non-branch PC-relative instruction: ADR, ADRP,
 *   PC-relative PRFM, literal load (this last condition could be removed
 *   if we mangled such instructions as we encountered them)
 *
 * To save time, give up if the first instruction is neither ldex nor stex
 * and there is no branch to it.
 * Take a sub-block containing both ldex and stex from the first N instructions.
 * Expand this sub-block to a minimal single-entry single-exit block.
 * Give up if the sub-block grows beyond N instructions.
 * Finally, give up if the sub-block does not contain the first instruction.
 * Also give up if the sub-block uses all of X0-X5 and the stolen register
 * because we would be unable to mangle such a block.
 *
 * NOTE(review): the description above speaks of (2 * N) instructions, and
 * both ibuf and the branch-target range check are sized for 2 * N, but the
 * decode loop in the body stops after N instructions. As written, at most
 * N instructions are ever decoded, which also keeps the sub-block within
 * N instructions without a separate size check. Confirm which bound is
 * intended before changing either the loop or this description.
 *
 * Parameters:
 *   dcontext      passed through to the instr_* and decode helpers.
 *   pc_           start of the instruction bytes to examine; treated as an
 *                 array of uint instruction words.
 *   orig_pc_      passed, word for word in step with pc_, as the third
 *                 argument of decode_from_copy (presumably the address the
 *                 bytes originally came from -- confirm against that
 *                 function's contract).
 *   instr_ldstex  handed to instr_create_ldstex on success; not touched on
 *                 failure.
 *
 * Returns NULL if no OP_ldstex was created; otherwise returns the address,
 * relative to pc_, of the first instruction word after the sub-block.
 *
 * XXX: This function uses a lot of CPU time. It could be made faster in
 * several ways, for example by caching decoded instructions or using a
 * custom decoder to recognise the particular instructions that we care
 * about here.
 */
byte *
decode_ldstex(dcontext_t *dcontext, byte *pc_, byte *orig_pc_, instr_t *instr_ldstex)
{
/* NOTE(review): N is not #undef'd anywhere in this function, so the macro
 * remains defined for whatever follows in the translation unit.
 */
# define N (MAX_INSTR_LENGTH / AARCH64_INSTR_SIZE)
    instr_t ibuf[2 * N];
    uint *pc = (uint *)pc_;
    uint *orig_pc = (uint *)orig_pc_;
    bool seen_ldex = false;
    bool seen_stex = false;
    bool seen_branch_to_start = false;
    bool failed = false;
    /* Half-open range [ldstex_beg, ldstex_end) of indices into ibuf that
     * forms the candidate sub-block; -1 means "not chosen yet".
     */
    int ldstex_beg = -1;
    int ldstex_end = -1;
    int i, len;

    /* Decode instructions into ibuf until one of the stop conditions is
     * met. Note that this loop is bounded by N, not 2 * N (see the
     * NOTE(review) in the header comment).
     */
    for (i = 0; i < N; i++) {
        instr_t *instr = &ibuf[i];
        instr_init(dcontext, instr);
        decode_from_copy(dcontext, (byte *)(pc + i), (byte *)(orig_pc + i), instr);
        /* Stop at the kinds of instruction the header comment rules out. */
        if (instr_is_mbr_arch(instr) || instr_is_syscall(instr) ||
            instr_get_opcode(instr) == OP_xx || instr_is_nonbranch_pcrel(instr))
            break;
        if (instr_is_ubr_arch(instr) || instr_is_cbr_arch(instr)) {
            ptr_uint_t target = (ptr_uint_t)instr_get_branch_target_pc(instr);
            /* Stop at a direct branch that leaves [pc, pc + 2 * N]. */
            if (target < (ptr_uint_t)pc || target > (ptr_uint_t)(pc + 2 * N))
                break;
            if (target == (ptr_uint_t)pc)
                seen_branch_to_start = true;
        }
        if (instr_is_exclusive_load(instr))
            seen_ldex = true;
        if (instr_is_exclusive_store(instr))
            seen_stex = true;
        /* Once N instructions have been looked at without finding both an
         * ldex and a stex there is no point in going on.
         */
        if (i + 1 >= N && !(seen_ldex && seen_stex))
            break;
        /* The sub-block starts at the first ldex or stex and ends just
         * after the instruction that completes the ldex/stex pair.
         */
        if (ldstex_beg == -1 && (seen_ldex || seen_stex))
            ldstex_beg = i;
        if (ldstex_end == -1 && (seen_ldex && seen_stex))
            ldstex_end = i + 1;
    }
    if (i < N) {
        /* The loop was left via break: ibuf[i] was initialized but is not
         * counted in len, so release it here.
         */
        instr_reset(dcontext, &ibuf[i]);
        len = i;
    } else
        len = N;

    /* Quick check for hopeless situations: nothing decoded, no ldex/stex
     * pair, or a first instruction that is neither exclusive nor the target
     * of a branch. All decoded instructions are released before returning.
     */
    if (len == 0 || !(seen_ldex && seen_stex) ||
        !(seen_branch_to_start ||
          (instr_is_exclusive_load(&ibuf[0]) || instr_is_exclusive_store(&ibuf[0])))) {
        for (i = 0; i < len; i++)
            instr_reset(dcontext, &ibuf[i]);
        return NULL;
    }

    /* There are several ways we could choose a sub-block containing both ldex
     * and stex from the first N instructions. Investigate further, perhaps.
     * We have already set ldstex_beg and ldstex_end: the decode loop assigns
     * each of them in the iteration where the corresponding flags first
     * become true, so both are set whenever both flags are set here.
     */
    ASSERT(ldstex_beg != -1 && ldstex_end != -1 && ldstex_beg < ldstex_end);

    /* Expand ldstex sub-block until it is a single-entry single-exit block.
     * Each pass widens the range to include the target of every direct
     * branch inside it; the loop ends when a pass changes nothing.
     */
    for (;;) {
        int new_beg = ldstex_beg;
        int new_end = ldstex_end;
        for (i = ldstex_beg; i < ldstex_end; i++) {
            instr_t *instr = &ibuf[i];
            if (instr_is_ubr_arch(instr) || instr_is_cbr_arch(instr)) {
                /* Branch target expressed as an instruction index from pc. */
                int target = (uint *)instr_get_branch_target_pc(instr) - pc;
                if (target > len) {
                    /* The target lies beyond what was decoded. This break
                     * leaves only the inner loop; 'failed' stays set and
                     * the outer loop still runs until the range is stable.
                     */
                    failed = true;
                    break;
                }
                if (target < new_beg)
                    new_beg = target;
                if (target > new_end)
                    new_end = target;
            }
        }
        if (new_beg == ldstex_beg && new_end == ldstex_end)
            break;
        ldstex_beg = new_beg;
        ldstex_end = new_end;
    }
    /* The sub-block must begin at the first decoded instruction. */
    if (ldstex_beg != 0)
        failed = true;

    if (!failed) {
        /* Check whether the sub-block uses the stolen register and all of X0-X5.
         * If it does, it would be impossible to mangle it so it is better not to
         * create an OP_ldstex.
         */
        reg_id_t regs[] = { dr_reg_stolen, DR_REG_X0, DR_REG_X1, DR_REG_X2,
                            DR_REG_X3,     DR_REG_X4, DR_REG_X5 };
        int r;
        /* Search for a register in regs that no instruction of the
         * sub-block uses; the outer loop stops at the first such register.
         */
        for (r = 0; r < sizeof(regs) / sizeof(*regs); r++) {
            for (i = ldstex_beg; i < ldstex_end; i++) {
                if (instr_uses_reg(&ibuf[i], regs[r]))
                    break;
            }
            /* The inner loop ran to completion: regs[r] is unused. */
            if (i >= ldstex_end)
                break;
        }
        /* Every register in regs is used somewhere in the sub-block. */
        if (r >= sizeof(regs) / sizeof(*regs))
            failed = true;
    }

    if (!failed) {
        /* Build the macro-instruction from the instruction count, the
         * address of the first word and the decoded instructions of the
         * sub-block.
         */
        instr_create_ldstex(dcontext, ldstex_end - ldstex_beg, pc + ldstex_beg,
                            &ibuf[ldstex_beg], instr_ldstex);
    }

    /* Release every decoded instruction that is still held in ibuf. */
    for (i = 0; i < len; i++)
        instr_reset(dcontext, &ibuf[i]);

    return failed ? NULL : (byte *)(pc + ldstex_end);
}