int main(int argc, char *argv[]) { ROSE_INITIALIZE; Diagnostics::initAndRegister(&mlog, "tool"); // Parse command line Settings settings; std::vector<std::string> args = parseCommandLine(argc, argv, settings).unreachedArgs(); if (args.size()!=2) throw std::runtime_error("invalid usage; see --help"); // Load the CSV files FunctionByAddress code1, data1, code2, data2; readCsvFile(args[0], code1 /*out*/, data1 /*out*/); readCsvFile(args[1], code2 /*out*/, data2 /*out*/); showStats(FileSystem::Path(args[0]).filename().string(), code1, data1, FileSystem::Path(args[1]).filename().string(), code2, data2); std::cout <<"\n"; // Parse the specimen if (!settings.specimenName.empty()) { P2::Engine engine; MemoryMap::Ptr map = engine.loadSpecimens(settings.specimenName); InstructionProvider::Ptr insns = InstructionProvider::instance(engine.obtainDisassembler(), map); map->dump(std::cout); listInstructions(insns, map, code1, code2); } }
static std::vector<SgAsmFunction*> loadFunctions(const std::vector<std::string> &specimen, P2::Engine &engine) { engine.reset(); // clear all but config properties engine.doingPostAnalysis(false); // not needed for this tool SgAsmBlock *gblock = engine.buildAst(specimen); // parse, load, link, disassemble, partition, build AST return SageInterface::querySubTree<SgAsmFunction>(gblock); // return just the functions }
// Tool entry point. NOTE(review): only the first part of this function is visible here; the text
// stops after the loader configuration, before the function's closing brace.
//
// Visible steps: set up the "tool" diagnostic facility, parse the command line (exactly one
// positional specimen name is required, otherwise std::runtime_error is thrown), parse the
// specimen with the engine, and, when linking was requested, configure the binary loader.
int main(int argc, char *argv[]) {
    // Initialize diagnostics and register this tool's message facility.
    Diagnostics::initialize();
    mlog = Sawyer::Message::Facility("tool");
    Diagnostics::mfacilities.insertAndAdjust(mlog);

    // Parse command-line
    P2::Engine engine;
    Settings settings;
    std::vector<std::string> specimenNames = parseCommandLine(argc, argv, settings).unreachedArgs();
    if (specimenNames.size() != 1)
        throw std::runtime_error("exactly one binary specimen file should be specified; see --help");
    // The plain specimen name is the argument with any leading "run:" prefix removed.
    std::string specimenName = boost::starts_with(specimenNames[0], "run:") ? specimenNames[0].substr(4) : specimenNames[0];
    Stream info(mlog[INFO]);

    // Parse, map, link, and/or relocate
    info <<"performing parse, map, and optional link steps";
    engine.parse(specimenNames);
    Sawyer::Stopwatch loadTimer;
    if (settings.performLink) {
        // Dynamic linking is enabled on the engine's loader only when the settings ask for it.
        BinaryLoader *loader = engine.obtainLoader();
        ASSERT_not_null(loader);
        loader->set_perform_dynamic_linking(true);
#if 0 // [Robb P. Matzke 2014-10-09]: not always working, but maybe not needed for this analysis
        loader->set_perform_relocations(true);
#endif
        // Each libDirs entry is a colon-separated list of directories; add every directory to the loader.
        BOOST_FOREACH (const std::string &paths, settings.libDirs) {
            BOOST_FOREACH (const std::string &path, split(':', paths)) {
                loader->add_directory(path);
            }
        }
    }
int main(int argc, char *argv[]) { ROSE_INITIALIZE; Diagnostics::initAndRegister(&::mlog, "tool"); Settings settings; P2::Engine engine; engine.doingPostAnalysis(false); // not needed by this tool std::vector<std::string> specimens = parseCommandLine(argc, argv, engine, settings); P2::Partitioner partitioner = engine.partition(specimens); if (settings.traceInsns || settings.traceSemantics) ::mlog[TRACE].enable(); // Find the string decoder. if (!partitioner.functionExists(settings.decoderVa)) { ::mlog[FATAL] <<"cannot find decoder function at " <<StringUtility::addrToString(settings.decoderVa) <<"\n"; exit(1); } if (settings.synthesized) { processSynthesizedCalls(partitioner, settings); } else { processExistingCalls(partitioner, settings); } }
int main(int argc, char *argv[]) { std::string purpose = "tests StaticSemantics"; std::string description = "ROSE has two forms of instruction semantics. The most general is that which is part of the Semantics2 API " "and which provides semantic lookup tables, arbitrary domains, and virtual machine states. But users coming " "from other platforms might be more accustomed to having semantics represented statically as part of an " "abstract syntax tree. ROSE can build these static semantics ASTs from its StaticSemantics domain in the " "Semantics2 API, and that is what this program tests."; P2::Engine engine; SgAsmBlock *gblock = engine.frontend(argc, argv, purpose, description); InstructionSemantics2::StaticSemantics::attachInstructionSemantics(gblock, engine.disassembler()); #if 1 // DEBUGGING [Robb P. Matzke 2015-06-08] generateDOT(*SageInterface::getProject()); #endif }
int main(int argc, char *argv[]) { // Parse the command-line switches P2::Engine engine; Settings settings; std::vector<std::string> args = parseCommandLine(argc, argv, engine, settings); if (args.empty()) { mlog[FATAL] <<"no binary specimen specified; see --help\n"; exit(1); } // Parse the binary specimen. We're not actually adding it to the AST. P2::Partitioner binary = engine.partition(args); // Process the binary to add its instructions to the source template BinaryToSource(settings.generator).generateSource(binary, std::cout); }
// Parses the command line, records the requested LLVM version in "settings", builds the AST for
// the specimen, and returns the resulting project.
//
// The version string given to --llvm is read as up to three dot-separated integers A.B.C and
// stored in settings.llvmVersion as 1000000*A + 1000*B + C (missing components count as zero).
// Exits with status 1 if no specimen was named or if no project exists after building the AST.
SgProject*
buildAst(int argc, char *argv[], Settings &settings) {
    using namespace Sawyer::CommandLine;
    P2::Engine engine;

    // Parse the command-line
    Parser parser = engine.commandLineParser("transcode to LLVM", "Convert an ELF/PE specimen to LLVM assembly for testing.");
    SwitchGroup toolSwitches("Tool-specific switches");
    toolSwitches.insert(Switch("llvm")
                        .argument("version", anyParser(settings.llvmVersionString))
                        .doc("Version number for LLVM. The version number is a doublet or triplet of integers such as \"3.5\" or "
                             "\"3.5.0\" and indicates which dialect of assembly should be emitted. The LLVM assembly syntax, being "
                             "mostly an LLVM internal language, changes in incompatible ways between LLVM versions. This transcoder "
                             "supports only certain versions (e.g., 3.5.0 and 3.7.0 as of December 2015)."));
    parser.with(toolSwitches);
    std::vector<std::string> specimen = parser.parse(argc, argv).apply().unreachedArgs();
    if (specimen.empty()) {
        ::mlog[FATAL] <<"no binary specimen; see --help for usage\n";
        exit(1);
    }

    // Decode the LLVM version given on the command-line, if any. The first component is always
    // read; each later component is read only when the previous one was followed by a '.' and no
    // conversion error has been reported through errno.
    if (!settings.llvmVersionString.empty()) {
        int component[3] = {0, 0, 0};
        const char *text = settings.llvmVersionString.c_str();
        char *cursor = NULL;
        errno = 0;
        component[0] = strtol(text, &cursor, 10);
        for (size_t i = 1; i < 3 && '.' == *cursor && 0 == errno; ++i)
            component[i] = strtol(cursor+1, &cursor, 10);
        settings.llvmVersion = 1000000 * component[0] + 1000 * component[1] + component[2];
    }

    // Parse, load, disassemble, and partition the specimen. Only the side effect of creating a
    // project matters here, so the returned block is discarded.
    (void) engine.buildAst(specimen);
    SgProject *project = SageInterface::getProject();
    if (NULL == project) {
        ::mlog[FATAL] <<"This tool only supports ELF/PE specimens.\n";
        exit(1);
    }
    return project;
}
int main(int argc, char *argv[]) { ROSE_INITIALIZE; Diagnostics::initAndRegister(&::mlog, "tool"); // Parse command-line P2::Engine engine; Settings settings; std::vector<std::string> specimen = parseCommandLine(argc, argv, engine, settings); if (specimen.empty()) { ::mlog[FATAL] <<"no specimen supplied on command-line; see --help\n"; exit(1); } // Load specimen into ROSE's simulated memory if (!engine.parseContainers(specimen.front())) { ::mlog[FATAL] <<"cannot parse specimen binary container\n"; exit(1); } Disassembler *disassembler = engine.obtainDisassembler(); if (!disassembler) { ::mlog[FATAL] <<"no disassembler for this architecture\n"; exit(1); } const RegisterDescriptor REG_IP = disassembler->instructionPointerRegister(); ASSERT_require2(REG_IP.is_valid(), "simulation must know what register serves as the instruction pointer"); // Single-step the specimen natively in a debugger and show each instruction. BinaryDebugger debugger(specimen); while (!debugger.isTerminated()) { uint64_t ip = debugger.readRegister(REG_IP).toInteger(); uint8_t buf[16]; // 16 should be large enough for any instruction size_t nBytes = debugger.readMemory(ip, sizeof buf, buf); if (0 == nBytes) { ::mlog[ERROR] <<"cannot read memory at " <<StringUtility::addrToString(ip) <<"\n"; } else if (SgAsmInstruction *insn = disassembler->disassembleOne(buf, ip, nBytes, ip)) { std::cout <<unparseInstructionWithAddress(insn) <<"\n"; } else { ::mlog[ERROR] <<"cannot disassemble instruction at " <<StringUtility::addrToString(ip) <<"\n"; } debugger.singleStep(); } std::cout <<debugger.howTerminated(); }
static std::vector<std::string> parseCommandLine(int argc, char *argv[], P2::Engine &engine, Settings &settings) { using namespace Sawyer::CommandLine; std::string purpose = "generates low-level source code from a binary specimen"; std::string description = "This command generates a C source file from a binary specimen. The binary specimen is parsed, disassembled, " "and partitioned into functions, basic blocks, and instructions. These are then traversed to build C source " "code which is emitted to standard output."; Parser parser = engine.commandLineParser(purpose, description); SwitchGroup tool("Tool-specific switches"); tool.insert(Switch("trace-generation") .intrinsicValue(true, settings.generator.traceRiscOps) .doc("Cause the source generation phase to emit information about the basic RISC-like steps performed for " "each instruction. This can preserve a developer's sanity because the C expressions often become large, " "deeply nested, and not always intuitive about from whence each part came. The @s{no-trace-generation} " "switch turns this off. The default is to " + std::string(settings.generator.traceRiscOps?"":"not ") + "show this information.")); tool.insert(Switch("no-trace-generation") .key("trace-generation") .intrinsicValue(false, settings.generator.traceRiscOps) .hidden(true)); tool.insert(Switch("trace-instructions") .intrinsicValue(true, settings.generator.traceInsnExecution) .doc("Cause the generated source to contain extra \"printf\" calls to emit each instruction as it is " "processed. The @s{no-trace-instructions} switch turns this off. 
The default is to " + std::string(settings.generator.traceInsnExecution?"":"not ") + "add these printf calls.")); tool.insert(Switch("no-trace-instructions") .key("trace-instructions") .intrinsicValue(false, settings.generator.traceInsnExecution) .hidden(true)); tool.insert(Switch("ip") .longName("instruction-pointer") .argument("address", nonNegativeIntegerParser(settings.generator.initialInstructionPointer)) .doc("Initial value for the instruction pointer. The default is to not initialize the instruction pointer.")); tool.insert(Switch("sp") .longName("stack-pointer") .argument("address", nonNegativeIntegerParser(settings.generator.initialStackPointer)) .doc("Initial value for the stack pointer. The default is to not initialize the stack pointer.")); tool.insert(Switch("allocate-memory") .argument("size", nonNegativeIntegerParser(settings.generator.allocateMemoryArray)) .doc("Causes the global \"mem\" array to be allocated instead of being declared \"extern\". The switch " "argument is the amount of memory to allocate. If the argument is zero, then the memory array is " "allocated to be just large enough to hold the value at the maximum initialized address. The default " "is to not allocate the array.")); return parser.with(tool).parse(argc, argv).apply().unreachedArgs(); }
// Tool entry point. NOTE(review): only the first part of this function is visible here; the text
// stops after the address validation loop, before the function's closing brace.
//
// Visible steps: parse the command line (at least two positional arguments are required: an
// address file followed by the specimen command), read the set of known instruction addresses,
// start the specimen under a debugger, obtain a memory map either from ROSE or from the running
// process, and assert that every known address is mapped with execute permission.
int main(int argc, char *argv[]) {
    ROSE_INITIALIZE;
    Diagnostics::initAndRegister(mlog, "tool");
    Sawyer::ProgressBarSettings::minimumUpdateInterval(0.2); // more fluid spinner

    // Parse command-line
    P2::Engine engine;
    Settings settings;
    std::vector<std::string> args = parseCommandLine(argc, argv, engine, settings);
    ASSERT_always_require2(args.size() >= 2, "incorrect usage; see --help");

    // Parse file containing instruction addresses
    std::string addrFileName = args[0];
    std::set<rose_addr_t> knownVas = parseAddressFile(addrFileName);
    mlog[INFO] <<"parsed " <<plural(knownVas.size(), "unique addresses") <<"\n";

    // Load specimen natively and attach debugger. Everything after the address file name is the
    // specimen command; the breakpoint covers the whole address space.
    std::vector<std::string> specimen_cmd(args.begin()+1, args.end());
    BinaryDebugger debugger(specimen_cmd);
    debugger.setBreakpoint(AddressInterval::whole());
    ASSERT_always_require(debugger.isAttached());
    ASSERT_always_forbid(debugger.isTerminated());
    // The value returned by isAttached() is used as the child's process ID.
    pid_t pid = debugger.isAttached();
    mlog[INFO] <<"child PID " <<pid <<"\n";

    // Get memory map: either by loading the specimen with the engine or from the running process
    // identified by its PID, depending on settings.mapSource.
    MemoryMap map;
    if (MAP_ROSE==settings.mapSource) {
        map = engine.loadSpecimens(specimen_cmd[0]);
    } else {
        map.insertProcess(":noattach:" + numberToString(pid));
    }
    map.dump(mlog[INFO]);

    // The addresses specified in the instruction address file must all be in memory that is mapped.
    BOOST_FOREACH (rose_addr_t va, knownVas) {
        ASSERT_always_require2(map.at(va).require(MemoryMap::EXECUTABLE).exists(),
                               "given address " + addrToString(va) + " is not mapped or lacks execute permission");
    }
static std::vector<std::string> parseCommandLine(int argc, char *argv[], P2::Engine &engine, Settings &settings) { using namespace Sawyer::CommandLine; std::string purpose = "show instructions executed natively"; std::string description = "Runs the specimen in a debugger and prints each instruction that is executed."; Parser parser; parser .purpose(purpose) .version(std::string(ROSE_SCM_VERSION_ID).substr(0, 8), ROSE_CONFIGURE_DATE) .chapter(1, "ROSE Command-line Tools") .doc("Synopsis", "@prop{programName} [@v{switches}] @v{specimen} [@v{args}...]") .doc("Description", description) .with(engine.engineSwitches()); return parser.parse(argc, argv).apply().unreachedArgs(); }
// Loads the specimen resources for a simulated ColdFire system into the process's memory and
// determines the initial program counter.
//
// All of exeArgs() is treated as loader switches and resource names, since this is raw hardware.
// The resources are loaded into a memory map with the ISA forced to "coldfire", and that map is
// copied into the process's memory inside a new memory transaction. The initial program counter
// is the big-endian 32-bit word at address 4 (the second entry in the interrupt vector); it
// becomes both the original and the starting entry point of the process. The engine's
// disassembler is handed to the process.
//
// Returns the engine's interpretation, which is probably null because the arguments are not
// likely to be ELF or PE containers. Exits with status 1 if the complete 4-byte initial program
// counter cannot be read.
SgAsmInterpretation*
RSIM_ColdFire::parseMainExecutable(RSIM_Process *process) {
    namespace P2 = rose::BinaryAnalysis::Partitioner2;
    using namespace Sawyer::CommandLine;

    // This is raw hardware, so assume that all the arguments are for loading the specimen.
    P2::Engine engine;
    Parser parser;
    parser
        .purpose("initializes ColdFire memory")
        .version(std::string(ROSE_SCM_VERSION_ID).substr(0, 8), ROSE_CONFIGURE_DATE)
        .chapter(1, "ROSE Command-line Tools")
        .doc("Synopsis", "@prop{programName} ... -- [@v{loader_switches}] @v{resources}")
        .doc("Description",
             "This part of the simulator command-line is responsible for configuring how @v{resources} are loaded into "
             "simulated FreeScale ColdFire system memory. If switches are provided here they must be separated from "
             "simulator switches with a \"--\" to prevent the simulator itself from interpreting them.\n\n" +
             engine.specimenNameDocumentation())
        .with(Switch("help", 'h')
              .hidden(true)
              .action(showHelpAndExit(0)))
        .with(engine.loaderSwitches());
    std::vector<std::string> resources = parser.parse(exeArgs()).apply().unreachedArgs();

    // Load the resources and install them as the process's memory.
    engine.isaName("coldfire");
    MemoryMap::Ptr map = engine.loadSpecimens(resources);
    process->mem_transaction_start("specimen main memory");
    *process->get_memory() = *map;                      // shallow copy, new segments point to same old data

    // The initial program counter is stored at address 4, the second entry in the interrupt vector.
    // Require that all four bytes were read: a shorter read would otherwise leave part of the
    // word at its zero initializer and silently produce a bogus entry point.
    uint32_t initialIpBe = 0;
    const size_t nRead = map->at(4).limit(sizeof initialIpBe).read(reinterpret_cast<uint8_t*>(&initialIpBe)).size();
    if (nRead != sizeof initialIpBe) {
        mlog[FATAL] <<"failed to read initial program counter from address 4\n";
        exit(1);
    }
    uint32_t initialIp = ByteOrder::be_to_host(initialIpBe);
    process->entryPointOriginalVa(initialIp);
    process->entryPointStartVa(initialIp);

    process->disassembler(engine.obtainDisassembler());
    return engine.interpretation();                     // probably null since args not likely to be ELF or PE
}
int main(int argc, char *argv[]) { // This paragraph initializes the ROSE library, generates the man page for this tool, does command-line parsing for quite a // few switches including "--help", loads various specimen resources (ELF/PE, running process, raw memory dumps, etc), // disassembles, and partitions. We could have called Engine::frontend() and done it all in one function call, but then we // wouldn't have a Partitioner2::Partitioner object that we need below. std::string purpose = "demonstrate inter-function disassembly"; std::string description = "Disassembles and partitions the specimen(s), then tries to disassemble things between the functions."; P2::Engine engine; std::vector<std::string> specimens = engine.parseCommandLine(argc, argv, purpose, description).unreachedArgs(); P2::Partitioner partitioner = engine.partition(specimens); // The partitioner's address usage map (AUM) describes what part of memory has been disassembled as instructions or // data. We're interested in the unused parts between the lowest and highest disassembled addresses, so we loop over those // parts. The hull() is the entire used interval -- lowest to highest addresses used regardless of the unused areas in the // middle. An AddressInterval evaluated in boolean context returns false if it's empty. rose_addr_t va = partitioner.aum().hull().least(); while (AddressInterval unused = partitioner.aum().nextUnused(va)) { // Is the unused area beyond the last thing compiled? We're only interested in the stuff between functions. This // check also means that unused.greatest()+1 will not overflow, which simplifies later code. Overflows are easy to // trigger when the specimen's word size is the same as ROSE's word size. 
if (unused.least() > partitioner.aum().hull().greatest()) break; // The unused address might be in the middle of some very large unmapped area of memory, or perhaps in an area that // doesn't have execute permission (the partitioner will only disassemble at addresses that we've marked as // executable). A naive implementation would just increment to the next address and try again, but that could take a // very long time. This "if" statement will give us the next executable address that falls within the unused interval // if possible. The address is assigned to "va" if possible. if (!engine.memoryMap().within(unused).require(MemoryMap::EXECUTABLE).next().assignTo(va)) { va = unused.greatest() + 1; // won't overflow because of check above continue; } // "va" now points to an executable address that the partitioner doesn't know about yet. ASSERT_require(engine.memoryMap().at(va).require(MemoryMap::EXECUTABLE).exists()); ASSERT_forbid(partitioner.aum().instructionExists(va)); std::cout <<"unused address " <<StringUtility::addrToString(va) <<"\n"; // Cause the partitioner to discover (disassemble) one basic block. This doesn't add the basic block to the // partitioner or change the partitioner in any way. If the BB isn't something we want to keep then just forget about // it and garbage collection will reclaim the memory. P2::BasicBlock::Ptr bb = partitioner.discoverBasicBlock(va); if (!isGoodBasicBlock(bb)) { ++va; continue; } std::cout <<" disassembled " <<bb->printableName() <<"\n"; // Inform the partitioner that we wish to keep this BB. partitioner.attachBasicBlock(bb); // This BB was not reachable by any previous CFG edge, therefore it doesn't belong to any function. In order for it to // show up in the eventual AST we need to add it to some function (the ROSE AST has a requirement that every basic // block belongs to a function, although the partitioner can easily cope with the other case). 
The easiest way in this // situation is to just create a new function whose entry block is this BB. Creating a function doesn't modify the // partitioner in any way, so we need to also attach the function to the partitioner. P2::Function::Ptr function = P2::Function::instance(va, SgAsmFunction::FUNC_USERDEF); function->insertBasicBlock(va); // allowed only before attaching function to partitioner partitioner.attachOrMergeFunction(function); // This basic block might be the first block of a whole bunch that are connected by as yet undiscovered CFG edges. We // can recursively discover and attach all those blocks with one Engine method. There are also Partitioner methods to // do similar things, but they're lower level. engine.runPartitionerRecursive(partitioner); } // We've probably added a bunch more functions and basic blocks to the partitioner, but we haven't yet assigned the basic // blocks discovered by Engine::runPartitionerRecursive to any functions. We might also need to assign function labels // from ELF/PE information, re-run some analysis, etc., so do that now. engine.runPartitionerFinal(partitioner); // Most ROSE analysis is performed on an abstract syntax tree, so generate one. If the specime is an ELF or PE container // then the returned global block will also be attached somewhere below a SgProject node, otherwise the returned global // block is the root of the AST and there is no project (e.g., like when the specimen is a raw memory dump). SgAsmBlock *gblock = P2::Modules::buildAst(partitioner, engine.interpretation()); // Generate an assembly listing. These unparser properties are all optional, but they result in more informative assembly // listings. 
AsmUnparser unparser; unparser.set_registers(partitioner.instructionProvider().registerDictionary()); unparser.add_control_flow_graph(ControlFlow().build_block_cfg_from_ast<ControlFlow::BlockGraph>(gblock)); unparser.staticDataDisassembler.init(engine.disassembler()); unparser.unparse(std::cout, gblock); }
// Describe and parse the command-line static std::vector<std::string> parseCommandLine(int argc, char *argv[], P2::Engine &engine, Settings &settings) { using namespace Sawyer::CommandLine; std::string purpose = "compares actual execution with known instructions"; std::string description = "Reads instruction addresses from a file, the so-called \"expected\" addresses and and then executes the specimen " "and compares actual executed addresses with the expected addresses. An actual executed address falls into one of " "three categories: (1) the address is an expected address, or else (2) the address is not mapped, or else (3) the " "address not expected.\n\n" "One method of obtaining a list of expected addresses is to use the @man{recursiveDisassemble}{--help} tool's " "@s{list-instruction-addressses}{noerror} switch. Although this produces output that contains instruction sizes " "as well as addresses, @prop{programName} ignores the sizes. This can be used to test whether a process executes any " "instructions that were not also disassembled, thereby testing some aspect of the disassembly quality."; // The parser is the same as that created by Engine::commandLineParser except we don't need any disassemler or partitioning // switches since this tool doesn't disassemble or partition. 
Parser parser; parser .purpose(purpose) .version(std::string(ROSE_SCM_VERSION_ID).substr(0, 8), ROSE_CONFIGURE_DATE) .chapter(1, "ROSE Command-line Tools") .doc("Synopsis", "@prop{programName} [@v{switches}] @v{address_file} @v{specimen_name} @v{specimen_arguments}...") .doc("Description", description) .doc("Specimens", engine.specimenNameDocumentation()) .with(engine.engineSwitches()) .with(engine.loaderSwitches()); SwitchGroup tool("Tool specific switches"); tool.name("tool"); tool.insert(Switch("map") .argument("how", enumParser(settings.mapSource) ->with("native", MAP_NATIVE) ->with("rose", MAP_ROSE)) .doc("Specifies how the memory map should be obtained, where @v{how} is either \"native\" to obtain the map " "from a running process, or \"rose\" to obtain the map by parsing the specimen container with ROSE. When " "obtained natively the map may contain addresses that were not visible to the original disassembler. When " "obtained from ROSE the map might not be identical to map actually used by the process. The default is \"" + std::string(MAP_ROSE==settings.mapSource?"rose":"native") + "\".")); tool.insert(Switch("trace") .intrinsicValue(true, settings.trace) .doc("When @s{trace} is specified each execution address is printed to a file named \"@v{pid}.trace\" where " "@v{pid} is the process ID of the specimen. Each line of the file will contain the following " "space-separated fields:" "@bullet{The hexadecimal address of the instruction that was executed.}" "@bullet{The number of times this address has been executed so far.}" "@bullet{The letter '1' or '0' to indicate whether the address known (from the @v{address_file}) or not.}" "The @s{no-trace} switch disables tracing. 
The default is to " + std::string(settings.trace?"":"not ") + "produce a trace.")); tool.insert(Switch("no-trace") .key("trace") .intrinsicValue(false, settings.trace) .hidden(true)); tool.insert(Switch("show-expected") .intrinsicValue(true, settings.showExpected) .doc("List addresses that were expected and show how many times each was executed. The output will be one line " "per address, containing a hexadecimal address and a decimal count separated by white space. The " "@s{no-show-expected} switch turns this listing off. The default is to " + std::string(settings.showExpected?"":"not ") + "show this information.")); tool.insert(Switch("no-show-expected") .key("show-expected") .intrinsicValue(false, settings.showExpected) .hidden(true)); tool.insert(Switch("show-unexpected") .intrinsicValue(true, settings.showUnexpected) .doc("List the addresses that were executed where no instruction was expected. The output will be one line per " "address, containing a hexadecimal address and the number of times the address was executed separated by " "white space. The @s{no-show-unexpected} switch turns this listing off. The default is to " + std::string(settings.showUnexpected?"":"not ") + "show this information.")); tool.insert(Switch("no-show-unexpected") .key("show-unexpected") .intrinsicValue(false, settings.showUnexpected) .hidden(true)); tool.insert(Switch("show-unmapped") .intrinsicValue(true, settings.showUnmapped) .doc("List addresses that were executed but are not present in the memory map. These are probably instructions " "that belong to the dynamic linker, dynamically-linked libraries, or virtual dynamic shared objects. The " "@s{no-show-unmapped} switch turns this listing off. The default is to " + std::string(settings.showUnmapped?"":"not ") + "show this information.")); tool.insert(Switch("no-show-unmapped") .key("show-unmapped") .intrinsicValue(false, settings.showUnmapped) .hidden(true)); return parser.with(tool).parse(argc, argv).apply().unreachedArgs(); }
// Parse command-line and apply to settings. static std::vector<std::string> parseCommandLine(int argc, char *argv[], P2::Engine &engine, Settings &settings) { using namespace Sawyer::CommandLine; std::string purpose = "finds similar functions"; std::string description = "This tool attempts to correlate functions in one binary specimen with related functions in the other specimen. " "It does so by parsing, loading, disassembling, and partitioning each specimen to obtain a list of functions. " "Then it computes a syntactic distance between all pairs of functions using a specified distance metric " "(see @s{metric}) to create an edge-weighted, bipartite graph. Finally, a minimum weight perfect matching is " "found using the Kuhn-Munkres algorithm. The answer is output as a list of function correlations and their " "distance from each other. The specimens need not have the same number of functions, in which case one of " "the specimens will have null functions inserted to make them the same size. The distance between a null " "function and some other function is always zero regardless of metric.\n\n" "The specimens can be specified as two files or resources, or multiple files and/or resources per specimen. When " "more than two arguments are specified, a \"--\" must separate the files and resources of the first secimen from " "those of the second."; Parser parser = engine.commandLineParser(purpose, description); parser.doc("Synopsis", "@prop{programName} [@v{switches}] @v{specimen1} [--] @v{specimen2}"); SwitchGroup tool("Switches for this tool"); tool.name("tool"); tool.insert(Switch("metric") .argument("name", enumParser(settings.metric) ->with("tree", METRIC_TREE) ->with("linear", METRIC_LINEAR) ->with("insn", METRIC_INSN) ->with("size", METRIC_SIZE) ->with("sizeaddr", METRIC_SIZE_ADDR)) .doc("Metric to use when comparing two functions. 
The following metrics are implemented:" "@named{linear}{The \"linear\" method creates a list consisting of AST node types and, in the case " "of SgAsmInstruction nodes, the instruction kind (e.g., \"x86_pop\", \"x86_mov\", etc) for each function. " "It then computes an edit distance for any pair of lists by using the Levenshtein algorithm and normalizes " "the edit cost according to the size of the lists that were compared.}" "@named{insn}{This is the same as the \"linear\" method but it computes the edit distance for only " "the instruction types without considering their operands.}" "@named{tree}{The \"tree\" method is similar to the \"linear\" method but restricts edit operations " "according to the depth of the nodes in the functions' ASTs. This method is orders of magnitude slower " "than the \"linear\" method and doesn't seem to give better results.}" "@named{size}{Uses difference in AST size as the distance metric. The difference between two functions " "is the absolute value of the difference in the size of their ASTs. This is easily the fastest metric.}" "@named{sizeaddr}{Uses difference in AST size and difference in entry address as the distance metric. " "Functions are sorted into a vector according to their entry address and the difference in vector index " "contributes to the distance between two functions.}" "The default metric is \"" + metricName(settings.metric) + "\".")); tool.insert(Switch("list") .intrinsicValue(true, settings.listPairings) .doc("Produce a listing that indicates how functions in the first specimen map into functions into the " "second specimen. The default is to " + std::string(settings.listPairings?"":"not ") + " show " "this information. The @s{no-list} switch is the inverse. 
Regardless of whether the pairings are " "listed, the output will contain summary information.")); tool.insert(Switch("no-list") .key("list") .intrinsicValue(false, settings.listPairings) .hidden(true)); return parser.expandIncludedFiles(parser.with(tool).parse(argc, argv).apply().unreachedArgs()); }
int main(int argc, char *argv[]) { ROSE_INITIALIZE; Diagnostics::initAndRegister(&mlog, "tool"); // Parse the command-line to configure the partitioner engine, obtain the executable and its arguments, and generate a man // page, adjust global settings, etc. This demo tool has no switches of its own, which makes this even easier. For a // production tool, it's probably better to obtain the parser and register only those switches we need (e.g., no need for // AST generation switches since we skip that step), to set it up to use our own diagnostic stream instead of exceptions, // and to adjust this tool's synopsis in the documentation. Examples of all of these can be found in other demos. P2::Engine engine; engine.doingPostAnalysis(false); // no need for any post-analysis phases (user can override on cmdline) std::vector<std::string> command; try { command = engine.parseCommandLine(argc, argv, purpose, description).unreachedArgs(); } catch (const std::runtime_error &e) { mlog[FATAL] <<"invalid command-line: " <<e.what() <<"\n"; exit(1); } if (command.empty()) { mlog[FATAL] <<"no executable specified\n"; exit(1); } // Since we'll be tracing this program's execution, we might as well disassemble the process's memory directly. That way we // don't have to worry about ROSE mapping the specimen to the same virtual address as the kernel (which might be using // address randomization). We can stop short of generating the AST because we won't need it. BinaryAnalysis::BinaryDebugger debugger(command); std::string specimenResourceName = "proc:noattach:" + StringUtility::numberToString(debugger.isAttached()); P2::Partitioner partitioner = engine.partition(specimenResourceName); partitioner.memoryMap()->dump(std::cerr); // show the memory map as a debugging aid // Create a global control flow graph whose vertices are instructions from a global CFG whose verts are mostly basic // blocks. 
InsnCfg insnCfg; const P2::ControlFlowGraph &bbCfg = partitioner.cfg(); BOOST_FOREACH (const P2::ControlFlowGraph::Vertex &bbVert, bbCfg.vertices()) { if (P2::BasicBlock::Ptr bb = isBasicBlock(bbVert)) { const std::vector<SgAsmInstruction*> &insns = bb->instructions(); // Each basic block has one or more instructions that need to be inserted into our instruction control flow graph // with edges from each instruction to the next. The insertEdgeWithVertices automatically inserts missing // vertices, and doesn't insert vertices that already exist, making it convenient for this type of construction. for (size_t i=1; i<insns.size(); ++i) insnCfg.insertEdgeWithVertices(insns[i-1], insns[i]); // The final instruction of this block needs to flow into each of the initial instructions of the successor basic // blocks. Be careful that the successors are actually existing basic blocks. Note that in ROSE's global CFG, a // function call has at least two successors: the function being called (normal edges), and the address to which // the function returns ("callret" edges). There are other types of edges too, but we want only the normal edges. BOOST_FOREACH (const P2::ControlFlowGraph::Edge &bbEdge, bbVert.outEdges()) { if (bbEdge.value().type() == P2::E_NORMAL) { if (P2::BasicBlock::Ptr target = isBasicBlock(*bbEdge.target())) insnCfg.insertEdgeWithVertices(insns.back(), target->instructions()[0]); } } } } mlog[INFO] <<"block CFG: " <<StringUtility::plural(bbCfg.nVertices(), "vertices", "vertex") <<", " <<StringUtility::plural(bbCfg.nEdges(), "edges") <<"\n"; mlog[INFO] <<"insn CFG: " <<StringUtility::plural(insnCfg.nVertices(), "vertices", "vertex") <<", " <<StringUtility::plural(insnCfg.nEdges(), "edges") <<"\n"; // Run the executable to obtain a trace. We use the instruction pointer to look up a SgAsmInstruction in the insnCfg and // thus map the trace onto the instruction CFG. 
mlog[INFO] <<"running subordinate to obtain trace: " <<boost::join(command, " ") <<"\n"; std::set<rose_addr_t> missingAddresses; Trace trace; while (!debugger.isTerminated()) { // Find the instruction CFG vertex corresponding to the current execution address. It could be that the execution // address doesn't exist in the CFG, and this can be caused by a number of things including failure of ROSE to // statically find the address, dynamic libraries that weren't loaded statically, etc. rose_addr_t va = debugger.executionAddress(); InsnCfg::ConstVertexIterator vertex = insnCfg.findVertexKey(va); if (!insnCfg.isValidVertex(vertex)) { missingAddresses.insert(va); } else { trace.append(vertex->id()); } debugger.singleStep(); } mlog[INFO] <<"subordinate " <<debugger.howTerminated() <<"\n"; mlog[INFO] <<"trace length: " <<StringUtility::plural(trace.size(), "instructions") <<"\n"; Diagnostics::mfprintf(mlog[INFO])("overall burstiness: %6.2f%%\n", 100.0 * trace.burstiness()); mlog[INFO] <<"distinct executed addresses missing from CFG: " <<missingAddresses.size() <<"\n"; // Print a list of CFG vertices that were never reached. We use std::cout rather than diagnostics because this is one of // the main outputs of this demo. The "if" condition is constant time. BOOST_FOREACH (const InsnCfg::Vertex &vertex, insnCfg.vertices()) { if (!trace.exists(vertex.id())) std::cout <<"not executed: " <<unparseInstructionWithAddress(vertex.value()) <<"\n"; } // Print list of addresses that were executed but did not appear in the CFG BOOST_FOREACH (rose_addr_t va, missingAddresses) std::cout <<"missing address: " <<StringUtility::addrToString(va) <<"\n"; // Print those branch instructions that were executed by the trace but always took the same branch. Just to mix things up, // I'll iterate over the trace labels this time instead of the CFG vertices. Remember, the labels are the integer IDs of // the CFG vertices. The "if" condition executes in constant time, as does the next line. 
for (size_t i = 0; i < trace.nLabels(); ++i) { if (insnCfg.findVertex(i)->nOutEdges() > 1 && trace.successors(i).size() == 1) { SgAsmInstruction *successor = insnCfg.findVertex(*trace.successorSet(i).begin())->value(); std::cout <<"single flow: " <<unparseInstructionWithAddress(insnCfg.findVertex(i)->value()) <<" --> " <<unparseInstructionWithAddress(successor) <<"\n"; } } // Get a list of executed instructions that are branch points and sort them by their burstiness. The "if" condition is // constant time. std::vector<InsnTraceInfo> info; BOOST_FOREACH (const InsnCfg::Vertex &vertex, insnCfg.vertices()) { if (vertex.nOutEdges() > 1 && trace.exists(vertex.id())) info.push_back(InsnTraceInfo(vertex.value(), trace.burstiness(vertex.id()), trace.size(vertex.id()))); } std::sort(info.begin(), info.end()); std::reverse(info.begin(), info.end()); BOOST_FOREACH (const InsnTraceInfo &record, info) { Diagnostics::mfprintf(std::cout)("burstiness %6.2f%% %5zu hits at %s\n", 100.0*record.burstiness, record.nHits, unparseInstructionWithAddress(record.insn).c_str()); }
static std::vector<std::string> parseCommandLine(int argc, char *argv[], P2::Engine &engine, Settings &settings) { using namespace Sawyer::CommandLine; std::string purpose = "decode encoded strings"; std::string description = "Demonstrates the use of ROSE instruction semantics and ability to start execution an an arbitrary address and " "machine state. The @s{decoder} switch is required--it is the entry address of a string decoding function. This " "analysis finds all statically-detected calls to that function, obtains three arguments from the call's basic block, " "and calls the function. The third argument is used as the address of a buffer where the decoded string is stored, " "and the string will be printed as the result.\n\n" "This tool can also run in a mode where the calls are synthesized by varying the first of three arguments."; Parser parser = engine.commandLineParser(purpose, description); parser.doc("Bugs", "z", "Being a demo, this tool is not very flexible when it comes to how the decrypted string is located or " "what argument values are used in the synthesis mode."); SwitchGroup sg("Tool-specific switches"); sg.name("tool"); sg.insert(Switch("decoder") .argument("address", nonNegativeIntegerParser(settings.decoderVa)) .doc("Virtual address of the string decoding function.")); sg.insert(Switch("stack") .argument("address", nonNegativeIntegerParser(settings.stackVa)) .doc("Initial value for the stack pointer. The default is " + StringUtility::addrToString(settings.stackVa) + ".")); sg.insert(Switch("trace-insns") .intrinsicValue(true, settings.traceInsns) .doc("Cause instructions to be printed to standard error as they are executed. The @s{no-trace-insns} switch " "turns tracing off. 
The default is to " + std::string(settings.traceInsns?"":"not ") + "show tracing.")); sg.insert(Switch("no-trace-insns") .key("trace-insns") .intrinsicValue(false, settings.traceInsns) .hidden(true)); sg.insert(Switch("trace-semantics") .intrinsicValue(true, settings.traceSemantics) .doc("Cause instruction semantics (the RISC-like operations) to be printed to standard error as they are " "executed. The @s{no-trace-semantics} switch turns tracing off. The default is to " + std::string(settings.traceSemantics?"":"not ") + "show tracing.")); sg.insert(Switch("no-trace-semantics") .key("trace-semantics") .intrinsicValue(false, settings.traceSemantics) .hidden(true)); sg.insert(Switch("insn-limit") .argument("n", nonNegativeIntegerParser(settings.insnLimit)) .doc("Maximum number of instructions to execute per decoder call before giving up. The default is " + StringUtility::plural(settings.insnLimit, "instructions") + ".")); sg.insert(Switch("show-call") .argument("n", nonNegativeIntegerParser(settings.showCall)) .doc("Show calls to the decryption function along with their arguments. The @v{n} specifies how many arguments " "(each being the natural length of a word) to display. If @v{n} is zero then call information is not " "displayed. The default is " + StringUtility::plural(settings.showCall, "arguments") + ".")); sg.insert(Switch("synthesized") .intrinsicValue(true, settings.synthesized) .doc("Synthesize calls from scratch instead of looking for existing calls.")); return parser.with(sg).parse(argc, argv).apply().unreachedArgs(); }