int main(int argc, char *argv[]) { ROSE_INITIALIZE; Diagnostics::initAndRegister(&::mlog, "tool"); Settings settings; P2::Engine engine; engine.doingPostAnalysis(false); // not needed by this tool std::vector<std::string> specimens = parseCommandLine(argc, argv, engine, settings); P2::Partitioner partitioner = engine.partition(specimens); if (settings.traceInsns || settings.traceSemantics) ::mlog[TRACE].enable(); // Find the string decoder. if (!partitioner.functionExists(settings.decoderVa)) { ::mlog[FATAL] <<"cannot find decoder function at " <<StringUtility::addrToString(settings.decoderVa) <<"\n"; exit(1); } if (settings.synthesized) { processSynthesizedCalls(partitioner, settings); } else { processExistingCalls(partitioner, settings); } }
// Decode strings by emulating each existing call to the decoder function.
//
// For every function-call edge that enters the decoder's CFG vertex from a basic block, the virtual machine is reset,
// pointed at the calling basic block, and run until it reaches the decoder's entry address. The decoder's arguments are
// captured at that point, then execution resumes until the machine stops somewhere else (normally one of the call's return
// addresses), at which time the decoded string is read from memory and printed to standard output.
//
// Emulation errors (std::runtime_error from the virtual machine) are reported as warnings and processing continues with the
// next call site.
static void
processExistingCalls(const P2::Partitioner &partitioner, const Settings &settings) {
    P2::ControlFlowGraph::ConstVertexIterator decoderVertex = partitioner.findPlaceholder(settings.decoderVa);
    ASSERT_require(partitioner.cfg().isValidVertex(decoderVertex));
    VirtualMachine vm(partitioner, settings);

    // Find all calls to the decoder function
    BOOST_FOREACH (const P2::ControlFlowGraph::Edge &edge, decoderVertex->inEdges()) {
        if (edge.value().type() != P2::E_FUNCTION_CALL)
            continue;
        const P2::ControlFlowGraph::ConstVertexIterator caller = edge.source();
        if (caller->value().type() != P2::V_BASIC_BLOCK || caller->value().bblock()==NULL)
            continue;
        ::mlog[TRACE] <<"decoder called at " <<partitioner.edgeName(edge) <<"\n";

        // Reset the virtual machine and start at the beginning of the calling basic block
        const rose_addr_t callerVa = caller->value().address();
        vm.reset(partitioner.memoryMap());
        vm.setIp(callerVa);

        // Decoder return addresses
        std::set<rose_addr_t> breakpoints;
        BOOST_FOREACH (const P2::ControlFlowGraph::ConstEdgeIterator &callret, P2::findCallReturnEdges(caller)) {
            const P2::ControlFlowGraph::ConstVertexIterator returnVertex = callret->target();
            if (returnVertex->value().type() == P2::V_BASIC_BLOCK)
                breakpoints.insert(returnVertex->value().address());
        }

        // Execute until the return address, stopping once on the way at the decoder's entry.
        rose_addr_t resultVa = 0;
        std::string stringId;
        breakpoints.insert(settings.decoderVa);
        while (1) {
            rose_addr_t ip = 0;
            try {
                ip = vm.run(partitioner, settings, breakpoints);
            } catch (const std::runtime_error &e) {
                // vm.run() threw instead of returning, so "ip" was never assigned. Identify the failure by the call site
                // being emulated rather than printing a meaningless zero address.
                ::mlog[WARN] <<StringUtility::addrToString(callerVa) <<": " <<e.what() <<"\n";
                break;
            }

            if (ip == settings.decoderVa) {
                // When entering the decoder, save the first argument (the string ID) and the third argument (the address
                // of the decoded string). The decoder breakpoint is removed so the next stop is after the call.
                breakpoints.erase(settings.decoderVa);
                if (settings.showCall)
                    std::cout <<"(" <<partitioner.edgeName(edge) <<")" <<arguments(vm, settings.showCall) <<"\n";
                stringId = "string-" + StringUtility::numberToString(vm.argument(0)->get_number());
                resultVa = vm.argument(2)->get_number();
            } else {
                // When leaving the decoder, print the decoded string
                std::string str = vm.readString(resultVa);
                std::cout <<stringId <<"\t\"" <<StringUtility::cEscape(str) <<"\"\n";
                break;
            }
        }
    }
}
//! [basicReadTest] static void basicReadTest(const P2::Partitioner &partitioner) { std::cout <<"\n" <<std::string(40, '=') <<"\nbasicReadTest\n" <<std::string(40, '=') <<"\n"; SymbolicSemantics::Formatter fmt; fmt.set_line_prefix(" "); // Create the RiscOperators and the initial state. const RegisterDictionary *regdict = partitioner.instructionProvider().registerDictionary(); const RegisterDescriptor REG = partitioner.instructionProvider().stackPointerRegister(); const std::string REG_NAME = RegisterNames(regdict)(REG); BaseSemantics::RiscOperatorsPtr ops = SymbolicSemantics::RiscOperators::instance(regdict); ops->currentState()->memoryState()->set_byteOrder(partitioner.instructionProvider().defaultByteOrder()); BaseSemantics::StatePtr initialState = ops->currentState()->clone(); ops->initialState(initialState); // lazily evaluated initial state std::cout <<"Initial state before reading:\n" <<(*initialState+fmt); // Read some memory and a register, which should cause them to spring into existence in both the current state and the // initial state. BaseSemantics::SValuePtr addr1 = ops->number_(32, 0); BaseSemantics::SValuePtr dflt1m = ops->number_(32, 0x11223344); BaseSemantics::SValuePtr read1m = ops->readMemory(RegisterDescriptor(), addr1, dflt1m, ops->boolean_(true)); BaseSemantics::SValuePtr dflt1r = ops->undefined_(REG.get_nbits()); BaseSemantics::SValuePtr read1r = ops->readRegister(REG, dflt1r); std::cout <<"Initial state after reading " <<*read1m <<" from address " <<*addr1 <<"\n" <<"and " <<*read1r <<" from " <<REG_NAME <<"\n" <<(*initialState+fmt); ASSERT_always_require(read1m->must_equal(dflt1m)); ASSERT_always_require(read1r->must_equal(dflt1r)); // Create a new current state and read again. We should get the same value even though the current state is empty. 
BaseSemantics::StatePtr curState = ops->currentState()->clone(); curState->clear(); ops->currentState(curState); BaseSemantics::SValuePtr dflt2m = ops->number_(32, 0x55667788); BaseSemantics::SValuePtr read2m = ops->readMemory(RegisterDescriptor(), addr1, dflt2m, ops->boolean_(true)); BaseSemantics::SValuePtr dflt2r = ops->undefined_(REG.get_nbits()); BaseSemantics::SValuePtr read2r = ops->readRegister(REG, dflt2r); std::cout <<"Initial state after reading " <<*read2m <<" from address " <<*addr1 <<"\n" <<"and " <<*read2r <<" from " <<REG_NAME <<"\n" <<(*initialState+fmt); ASSERT_always_require(read1m->must_equal(read2m)); ASSERT_always_require(read1r->must_equal(read2r)); // Disable the initial state. If we re-read the same address we'll still get the same result because it's now present in // the current state also. ops->initialState(BaseSemantics::StatePtr()); BaseSemantics::SValuePtr dflt3m = ops->number_(32, 0x99aabbcc); BaseSemantics::SValuePtr read3m = ops->readMemory(RegisterDescriptor(), addr1, dflt3m, ops->boolean_(true)); BaseSemantics::SValuePtr dflt3r = ops->undefined_(REG.get_nbits()); BaseSemantics::SValuePtr read3r = ops->readRegister(REG, dflt3r); ASSERT_always_require(read1m->must_equal(read3m)); ASSERT_always_require(read1r->must_equal(read3r)); }
// Decode strings by synthesizing calls to the decoder function.
//
// For each string ID from 1 through 0x7a, the virtual machine is reset, the three decoder arguments and a return marker are
// pushed onto its stack, and the decoder is emulated until the return marker is reached. The NUL-terminated result is then
// read from the output buffer and printed to standard output. A string ID whose emulation fails with std::runtime_error is
// reported as a warning and skipped.
static void
processSynthesizedCalls(const P2::Partitioner &partitioner, const Settings &settings) {
    // Scratch locations, placed at arbitrary offsets from the configured stack address.
    const rose_addr_t lengthVa = settings.stackVa + 0x1000;     // needs at least 4 bytes of space
    const rose_addr_t resultVa = settings.stackVa + 0x1010;     // needs at least 104 bytes of space
    const size_t firstStringId = 1;
    const size_t lastStringId = 0x7a;

    VirtualMachine vm(partitioner, settings);                   // virtual machine using concrete semantics
    for (size_t strId = firstStringId; strId <= lastStringId; ++strId) {
        // Start from initial conditions and build the call frame. Arguments are pushed last-to-first, followed by the
        // marker that tells the virtual machine when to stop.
        vm.reset(partitioner.memoryMap());
        vm.writeMemory(lengthVa, 0x68);                         // limited length
        vm.push(resultVa);                                      // arg #3: address of the decoded string's buffer
        vm.push(lengthVa);                                      // arg #2: returned length of the decoded string
        vm.push(strId);                                         // arg #1: string id number in [1 .. 0x7a]
        vm.push(vm.returnMarker());
        vm.setIp(settings.decoderVa);                           // execution begins at the decoder

        if (settings.showCall) {
            std::cout <<"(*" <<StringUtility::addrToString(settings.decoderVa) <<")"
                      <<arguments(vm, settings.showCall) <<"\n";
        }

        // Emulate until the return marker is executed.
        bool completed = true;
        try {
            vm.run(partitioner, settings);
        } catch (const std::runtime_error &e) {
            ::mlog[WARN] <<e.what() <<"\n";
            completed = false;
        }
        if (!completed)
            continue;

        const std::string decoded = vm.readString(resultVa);    // NUL-terminated string
        std::cout <<"string-" <<std::hex <<strId <<std::dec <<"\t\"" <<StringUtility::cEscape(decoded) <<"\"\n";
    }
}
// Constructs a virtual machine whose instruction semantics are concrete. When settings.traceSemantics is set, the
// dispatcher is built over a tracing wrapper around the concrete operators; otherwise it is built over the concrete
// operators directly. The instruction pointer, stack pointer, and stack segment registers are obtained from the
// partitioner's instruction provider, and the word size is taken to be the width of the instruction pointer register.
//
// Throws std::runtime_error if the partitioner cannot supply an instruction dispatcher.
VirtualMachine(const P2::Partitioner &partitioner, const Settings &settings)
    : wordSize_(0), stackVa_(settings.stackVa), returnMarker_(0xbeef0967) {
    ops_ = ConcreteSemantics::RiscOperators::instance(partitioner.instructionProvider().registerDictionary());

    if (!settings.traceSemantics) {
        cpu_ = partitioner.newDispatcher(ops_);
    } else {
        BaseSemantics::RiscOperatorsPtr tracer = TraceSemantics::RiscOperators::instance(ops_);
        cpu_ = partitioner.newDispatcher(tracer);
    }
    if (cpu_==NULL)
        throw std::runtime_error("no semantics for architecture");

    regIp_ = partitioner.instructionProvider().instructionPointerRegister();
    regSp_ = partitioner.instructionProvider().stackPointerRegister();
    regSs_ = partitioner.instructionProvider().stackSegmentRegister();
    wordSize_ = regIp_.get_nbits();
}
// Emulates instructions starting at the current instruction pointer and returns the address at which execution stopped.
//
// Execution stops when the instruction pointer equals the return marker, or when it equals one of the breakpoints after at
// least one instruction has been executed (so a breakpoint at the starting address does not stop the run immediately).
//
// Throws std::runtime_error if there is no instruction at the current address or if settings.insnLimit instructions have
// been processed without stopping.
rose_addr_t run(const P2::Partitioner &partitioner, const Settings &settings,
                const std::set<rose_addr_t> &breakpoints = std::set<rose_addr_t>()) {
    size_t nExecuted = 0;
    while (nExecuted < settings.insnLimit) {
        const rose_addr_t va = ops_->readRegister(regIp_)->get_number();
        if (va == returnMarker_)
            return va;
        if (nExecuted > 0 && breakpoints.count(va) != 0)
            return va;

        SgAsmInstruction *insn = partitioner.instructionProvider()[va];
        if (insn == NULL)
            throw std::runtime_error("no instruction at " + StringUtility::addrToString(va));
        if (settings.traceInsns && ::mlog[TRACE])
            ::mlog[TRACE] <<unparseInstructionWithAddress(insn) <<"\n";

        cpu_->processInstruction(insn);
        ++nExecuted;
    }
    throw std::runtime_error("execution limit exceeded ("+StringUtility::plural(settings.insnLimit, "instructions")+")");
}
// Run natively and return number of instructions executed and reason for termination. static std::pair<size_t, std::string> runNatively(const Settings &settings, const std::string &specimenName, Sawyer::Optional<rose_addr_t> initVa, const P2::Partitioner &partitioner, rose_addr_t randomAddress) { Stream debug(mlog[DEBUG]); BinaryDebugger debugger(specimenName); if (debugger.isTerminated()) { mlog[FATAL] <<"child " <<debugger.isAttached() <<" " <<debugger.howTerminated() <<" before we could gain control\n"; exit(1); } // Allow child to run until we hit the desired address. if (initVa) { debugger.setBreakpoint(*initVa); debugger.runToBreakpoint(); debugger.clearBreakpoint(*initVa); if (debugger.isTerminated()) { mlog[FATAL] <<"child " <<debugger.isAttached() <<" " <<debugger.howTerminated() <<" without reaching " <<addrToString(*initVa) <<"\n"; exit(1); } } // Show specimen address map so we can verify that the Linux loader used the same addresses we used. // We could have shown it earlier, but then we wouldn't have seen the results of dynamic linking. 
if (settings.showMaps) { std::cout <<"Linux loader specimen memory map:\n"; system(("cat /proc/" + numberToString(debugger.isAttached()) + "/maps").c_str()); } // Branch to the starting address debug <<"branching to " <<addrToString(randomAddress) <<"\n"; debugger.executionAddress(randomAddress); std::string terminationReason; size_t nExecuted = 0; // number of instructions executed while (1) { // Check for and avoid system calls if necessary if (!settings.allowSyscalls) { rose_addr_t eip = debugger.executionAddress(); SgAsmX86Instruction *insn = isSgAsmX86Instruction(partitioner.instructionProvider()[eip]); if (!insn || insn->isUnknown()) { if (settings.showInsnTrace) std::cout <<"at " <<addrToString(eip) <<": " <<(insn?"no":"unknown") <<" instruction\n"; terminationReason = "executed at " + addrToString(eip) +" which we don't know about"; break; } if (settings.showInsnTrace) std::cout <<"at " <<unparseInstructionWithAddress(insn) <<"\n"; if (insn->get_kind() == x86_int || insn->get_kind() == x86_sysenter) { terminationReason = "tried to execute a system call"; break; } } // Single-step if (debug) debug <<"single stepping at " <<addrToString(debugger.executionAddress()) <<"\n"; debugger.singleStep(); if (debugger.isTerminated()) { terminationReason = debugger.howTerminated(); break; } ++nExecuted; if (settings.maxInsns!=0 && nExecuted>=settings.maxInsns) { terminationReason = "reached instruction limit"; break; } } debugger.terminate(); return std::make_pair(nExecuted, terminationReason); }
int main(int argc, char *argv[]) { // This paragraph initializes the ROSE library, generates the man page for this tool, does command-line parsing for quite a // few switches including "--help", loads various specimen resources (ELF/PE, running process, raw memory dumps, etc), // disassembles, and partitions. We could have called Engine::frontend() and done it all in one function call, but then we // wouldn't have a Partitioner2::Partitioner object that we need below. std::string purpose = "demonstrate inter-function disassembly"; std::string description = "Disassembles and partitions the specimen(s), then tries to disassemble things between the functions."; P2::Engine engine; std::vector<std::string> specimens = engine.parseCommandLine(argc, argv, purpose, description).unreachedArgs(); P2::Partitioner partitioner = engine.partition(specimens); // The partitioner's address usage map (AUM) describes what part of memory has been disassembled as instructions or // data. We're interested in the unused parts between the lowest and highest disassembled addresses, so we loop over those // parts. The hull() is the entire used interval -- lowest to highest addresses used regardless of the unused areas in the // middle. An AddressInterval evaluated in boolean context returns false if it's empty. rose_addr_t va = partitioner.aum().hull().least(); while (AddressInterval unused = partitioner.aum().nextUnused(va)) { // Is the unused area beyond the last thing compiled? We're only interested in the stuff between functions. This // check also means that unused.greatest()+1 will not overflow, which simplifies later code. Overflows are easy to // trigger when the specimen's word size is the same as ROSE's word size. 
if (unused.least() > partitioner.aum().hull().greatest()) break; // The unused address might be in the middle of some very large unmapped area of memory, or perhaps in an area that // doesn't have execute permission (the partitioner will only disassemble at addresses that we've marked as // executable). A naive implementation would just increment to the next address and try again, but that could take a // very long time. This "if" statement will give us the next executable address that falls within the unused interval // if possible. The address is assigned to "va" if possible. if (!engine.memoryMap().within(unused).require(MemoryMap::EXECUTABLE).next().assignTo(va)) { va = unused.greatest() + 1; // won't overflow because of check above continue; } // "va" now points to an executable address that the partitioner doesn't know about yet. ASSERT_require(engine.memoryMap().at(va).require(MemoryMap::EXECUTABLE).exists()); ASSERT_forbid(partitioner.aum().instructionExists(va)); std::cout <<"unused address " <<StringUtility::addrToString(va) <<"\n"; // Cause the partitioner to discover (disassemble) one basic block. This doesn't add the basic block to the // partitioner or change the partitioner in any way. If the BB isn't something we want to keep then just forget about // it and garbage collection will reclaim the memory. P2::BasicBlock::Ptr bb = partitioner.discoverBasicBlock(va); if (!isGoodBasicBlock(bb)) { ++va; continue; } std::cout <<" disassembled " <<bb->printableName() <<"\n"; // Inform the partitioner that we wish to keep this BB. partitioner.attachBasicBlock(bb); // This BB was not reachable by any previous CFG edge, therefore it doesn't belong to any function. In order for it to // show up in the eventual AST we need to add it to some function (the ROSE AST has a requirement that every basic // block belongs to a function, although the partitioner can easily cope with the other case). 
The easiest way in this // situation is to just create a new function whose entry block is this BB. Creating a function doesn't modify the // partitioner in any way, so we need to also attach the function to the partitioner. P2::Function::Ptr function = P2::Function::instance(va, SgAsmFunction::FUNC_USERDEF); function->insertBasicBlock(va); // allowed only before attaching function to partitioner partitioner.attachOrMergeFunction(function); // This basic block might be the first block of a whole bunch that are connected by as yet undiscovered CFG edges. We // can recursively discover and attach all those blocks with one Engine method. There are also Partitioner methods to // do similar things, but they're lower level. engine.runPartitionerRecursive(partitioner); } // We've probably added a bunch more functions and basic blocks to the partitioner, but we haven't yet assigned the basic // blocks discovered by Engine::runPartitionerRecursive to any functions. We might also need to assign function labels // from ELF/PE information, re-run some analysis, etc., so do that now. engine.runPartitionerFinal(partitioner); // Most ROSE analysis is performed on an abstract syntax tree, so generate one. If the specime is an ELF or PE container // then the returned global block will also be attached somewhere below a SgProject node, otherwise the returned global // block is the root of the AST and there is no project (e.g., like when the specimen is a raw memory dump). SgAsmBlock *gblock = P2::Modules::buildAst(partitioner, engine.interpretation()); // Generate an assembly listing. These unparser properties are all optional, but they result in more informative assembly // listings. 
AsmUnparser unparser; unparser.set_registers(partitioner.instructionProvider().registerDictionary()); unparser.add_control_flow_graph(ControlFlow().build_block_cfg_from_ast<ControlFlow::BlockGraph>(gblock)); unparser.staticDataDisassembler.init(engine.disassembler()); unparser.unparse(std::cout, gblock); }
int main(int argc, char *argv[]) { ROSE_INITIALIZE; Diagnostics::initAndRegister(&mlog, "tool"); // Parse the command-line to configure the partitioner engine, obtain the executable and its arguments, and generate a man // page, adjust global settings, etc. This demo tool has no switches of its own, which makes this even easier. For a // production tool, it's probably better to obtain the parser and register only those switches we need (e.g., no need for // AST generation switches since we skip that step), to set it up to use our own diagnostic stream instead of exceptions, // and to adjust this tool's synopsis in the documentation. Examples of all of these can be found in other demos. P2::Engine engine; engine.doingPostAnalysis(false); // no need for any post-analysis phases (user can override on cmdline) std::vector<std::string> command; try { command = engine.parseCommandLine(argc, argv, purpose, description).unreachedArgs(); } catch (const std::runtime_error &e) { mlog[FATAL] <<"invalid command-line: " <<e.what() <<"\n"; exit(1); } if (command.empty()) { mlog[FATAL] <<"no executable specified\n"; exit(1); } // Since we'll be tracing this program's execution, we might as well disassemble the process's memory directly. That way we // don't have to worry about ROSE mapping the specimen to the same virtual address as the kernel (which might be using // address randomization). We can stop short of generating the AST because we won't need it. BinaryAnalysis::BinaryDebugger debugger(command); std::string specimenResourceName = "proc:noattach:" + StringUtility::numberToString(debugger.isAttached()); P2::Partitioner partitioner = engine.partition(specimenResourceName); partitioner.memoryMap()->dump(std::cerr); // show the memory map as a debugging aid // Create a global control flow graph whose vertices are instructions from a global CFG whose verts are mostly basic // blocks. 
InsnCfg insnCfg; const P2::ControlFlowGraph &bbCfg = partitioner.cfg(); BOOST_FOREACH (const P2::ControlFlowGraph::Vertex &bbVert, bbCfg.vertices()) { if (P2::BasicBlock::Ptr bb = isBasicBlock(bbVert)) { const std::vector<SgAsmInstruction*> &insns = bb->instructions(); // Each basic block has one or more instructions that need to be inserted into our instruction control flow graph // with edges from each instruction to the next. The insertEdgeWithVertices automatically inserts missing // vertices, and doesn't insert vertices that already exist, making it convenient for this type of construction. for (size_t i=1; i<insns.size(); ++i) insnCfg.insertEdgeWithVertices(insns[i-1], insns[i]); // The final instruction of this block needs to flow into each of the initial instructions of the successor basic // blocks. Be careful that the successors are actually existing basic blocks. Note that in ROSE's global CFG, a // function call has at least two successors: the function being called (normal edges), and the address to which // the function returns ("callret" edges). There are other types of edges too, but we want only the normal edges. BOOST_FOREACH (const P2::ControlFlowGraph::Edge &bbEdge, bbVert.outEdges()) { if (bbEdge.value().type() == P2::E_NORMAL) { if (P2::BasicBlock::Ptr target = isBasicBlock(*bbEdge.target())) insnCfg.insertEdgeWithVertices(insns.back(), target->instructions()[0]); } } } } mlog[INFO] <<"block CFG: " <<StringUtility::plural(bbCfg.nVertices(), "vertices", "vertex") <<", " <<StringUtility::plural(bbCfg.nEdges(), "edges") <<"\n"; mlog[INFO] <<"insn CFG: " <<StringUtility::plural(insnCfg.nVertices(), "vertices", "vertex") <<", " <<StringUtility::plural(insnCfg.nEdges(), "edges") <<"\n"; // Run the executable to obtain a trace. We use the instruction pointer to look up a SgAsmInstruction in the insnCfg and // thus map the trace onto the instruction CFG. 
mlog[INFO] <<"running subordinate to obtain trace: " <<boost::join(command, " ") <<"\n"; std::set<rose_addr_t> missingAddresses; Trace trace; while (!debugger.isTerminated()) { // Find the instruction CFG vertex corresponding to the current execution address. It could be that the execution // address doesn't exist in the CFG, and this can be caused by a number of things including failure of ROSE to // statically find the address, dynamic libraries that weren't loaded statically, etc. rose_addr_t va = debugger.executionAddress(); InsnCfg::ConstVertexIterator vertex = insnCfg.findVertexKey(va); if (!insnCfg.isValidVertex(vertex)) { missingAddresses.insert(va); } else { trace.append(vertex->id()); } debugger.singleStep(); } mlog[INFO] <<"subordinate " <<debugger.howTerminated() <<"\n"; mlog[INFO] <<"trace length: " <<StringUtility::plural(trace.size(), "instructions") <<"\n"; Diagnostics::mfprintf(mlog[INFO])("overall burstiness: %6.2f%%\n", 100.0 * trace.burstiness()); mlog[INFO] <<"distinct executed addresses missing from CFG: " <<missingAddresses.size() <<"\n"; // Print a list of CFG vertices that were never reached. We use std::cout rather than diagnostics because this is one of // the main outputs of this demo. The "if" condition is constant time. BOOST_FOREACH (const InsnCfg::Vertex &vertex, insnCfg.vertices()) { if (!trace.exists(vertex.id())) std::cout <<"not executed: " <<unparseInstructionWithAddress(vertex.value()) <<"\n"; } // Print list of addresses that were executed but did not appear in the CFG BOOST_FOREACH (rose_addr_t va, missingAddresses) std::cout <<"missing address: " <<StringUtility::addrToString(va) <<"\n"; // Print those branch instructions that were executed by the trace but always took the same branch. Just to mix things up, // I'll iterate over the trace labels this time instead of the CFG vertices. Remember, the labels are the integer IDs of // the CFG vertices. The "if" condition executes in constant time, as does the next line. 
for (size_t i = 0; i < trace.nLabels(); ++i) { if (insnCfg.findVertex(i)->nOutEdges() > 1 && trace.successors(i).size() == 1) { SgAsmInstruction *successor = insnCfg.findVertex(*trace.successorSet(i).begin())->value(); std::cout <<"single flow: " <<unparseInstructionWithAddress(insnCfg.findVertex(i)->value()) <<" --> " <<unparseInstructionWithAddress(successor) <<"\n"; } } // Get a list of executed instructions that are branch points and sort them by their burstiness. The "if" condition is // constant time. std::vector<InsnTraceInfo> info; BOOST_FOREACH (const InsnCfg::Vertex &vertex, insnCfg.vertices()) { if (vertex.nOutEdges() > 1 && trace.exists(vertex.id())) info.push_back(InsnTraceInfo(vertex.value(), trace.burstiness(vertex.id()), trace.size(vertex.id()))); } std::sort(info.begin(), info.end()); std::reverse(info.begin(), info.end()); BOOST_FOREACH (const InsnTraceInfo &record, info) { Diagnostics::mfprintf(std::cout)("burstiness %6.2f%% %5zu hits at %s\n", 100.0*record.burstiness, record.nHits, unparseInstructionWithAddress(record.insn).c_str()); }