// Tool entry point.  Reads the two CSV files named on the command line into code/data tables, prints statistics about
// them, and -- when settings.specimenName is non-empty -- loads that specimen, dumps its memory map, and lists its
// instructions.  Throws std::runtime_error unless exactly two positional arguments remain after switch parsing.
int main(int argc, char *argv[]) {
    ROSE_INITIALIZE;
    Diagnostics::initAndRegister(&mlog, "tool");

    // Command line: everything the parser did not consume must be the two CSV file names.
    Settings settings;
    std::vector<std::string> csvNames = parseCommandLine(argc, argv, settings).unreachedArgs();
    if (2 != csvNames.size())
        throw std::runtime_error("invalid usage; see --help");

    // Populate the code and data tables for each CSV file.
    FunctionByAddress code1, data1, code2, data2;
    readCsvFile(csvNames[0], code1 /*out*/, data1 /*out*/);
    readCsvFile(csvNames[1], code2 /*out*/, data2 /*out*/);

    // Statistics are labeled with just the final path component of each file name.
    const std::string label1 = FileSystem::Path(csvNames[0]).filename().string();
    const std::string label2 = FileSystem::Path(csvNames[1]).filename().string();
    showStats(label1, code1, data1, label2, code2, data2);
    std::cout <<"\n";

    // Without a specimen there is nothing more to do.
    if (settings.specimenName.empty())
        return 0;

    // Load the specimen and list instructions using the two code tables.
    P2::Engine engine;
    MemoryMap::Ptr map = engine.loadSpecimens(settings.specimenName);
    InstructionProvider::Ptr insns = InstructionProvider::instance(engine.obtainDisassembler(), map);
    map->dump(std::cout);
    listInstructions(insns, map, code1, code2);
    return 0;
}
static std::vector<SgAsmFunction*>
loadFunctions(const std::vector<std::string> &specimen, P2::Engine &engine) {
    engine.reset();                                            // clear all but config properties
    engine.doingPostAnalysis(false);                           // not needed for this tool
    SgAsmBlock *gblock = engine.buildAst(specimen);            // parse, load, link, disassemble, partition, build AST
    return SageInterface::querySubTree<SgAsmFunction>(gblock); // return just the functions
}
Exemple #3
0
// Tool entry point.  Initializes diagnostics, parses the command line (exactly one specimen name is required, otherwise
// std::runtime_error is thrown), parses the specimen with the engine, and optionally configures the loader for dynamic
// linking using the library directories from the settings.
// NOTE(review): only the first part of this definition is visible here; the remainder of the function is missing.
int
main(int argc, char *argv[]) {
    // Initialize diagnostics and register this tool's message facility under the name "tool".
    Diagnostics::initialize();
    mlog = Sawyer::Message::Facility("tool");
    Diagnostics::mfacilities.insertAndAdjust(mlog);

    // Parse command-line
    P2::Engine engine;
    Settings settings;
    std::vector<std::string> specimenNames = parseCommandLine(argc, argv, settings).unreachedArgs();
    if (specimenNames.size() != 1)
        throw std::runtime_error("exactly one binary specimen file should be specified; see --help");
    // Strip a leading "run:" prefix, if any, to get the bare specimen name.
    std::string specimenName = boost::starts_with(specimenNames[0], "run:") ? specimenNames[0].substr(4) : specimenNames[0];
    Stream info(mlog[INFO]);

    // Parse, map, link, and/or relocate
    info <<"performing parse, map, and optional link steps";
    engine.parse(specimenNames);
    Sawyer::Stopwatch loadTimer;
    if (settings.performLink) {
        // Dynamic linking was requested: enable it on the loader and add the library search directories.
        BinaryLoader *loader = engine.obtainLoader();
        ASSERT_not_null(loader);
        loader->set_perform_dynamic_linking(true);
#if 0 // [Robb P. Matzke 2014-10-09]: not always working, but maybe not needed for this analysis
        loader->set_perform_relocations(true);
#endif
        // Each libDirs entry may itself be a colon-separated list of directories.
        BOOST_FOREACH (const std::string &paths, settings.libDirs) {
            BOOST_FOREACH (const std::string &path, split(':', paths)) {
                loader->add_directory(path);
            }
        }
    }
// Tool entry point.  Partitions the specimens named on the command line, verifies that a function exists at the
// configured decoder address, and then processes either synthesized or existing calls depending on the settings.
// Exits with status 1 if no function exists at settings.decoderVa.
int
main(int argc, char *argv[]) {
    ROSE_INITIALIZE;
    Diagnostics::initAndRegister(&::mlog, "tool");

    Settings settings;
    P2::Engine engine;
    engine.doingPostAnalysis(false);                    // this tool has no use for post-analysis results
    std::vector<std::string> specimens = parseCommandLine(argc, argv, engine, settings);
    P2::Partitioner partitioner = engine.partition(specimens);

    // Either tracing switch turns on the TRACE diagnostic stream.
    const bool wantTrace = settings.traceInsns || settings.traceSemantics;
    if (wantTrace)
        ::mlog[TRACE].enable();

    // The string decoder must be a function known to the partitioner.
    if (!partitioner.functionExists(settings.decoderVa)) {
        ::mlog[FATAL] <<"cannot find decoder function at " <<StringUtility::addrToString(settings.decoderVa) <<"\n";
        exit(1);
    }

    // Choose the processing strategy.
    if (!settings.synthesized) {
        processExistingCalls(partitioner, settings);
    } else {
        processSynthesizedCalls(partitioner, settings);
    }
    return 0;
}
int
main(int argc, char *argv[]) {
    std::string purpose = "tests StaticSemantics";
    std::string description =
        "ROSE has two forms of instruction semantics. The most general is that which is part of the Semantics2 API "
        "and which provides semantic lookup tables, arbitrary domains, and virtual machine states. But users coming "
        "from other platforms might be more accustomed to having semantics represented statically as part of an "
        "abstract syntax tree. ROSE can build these static semantics ASTs from its StaticSemantics domain in the "
        "Semantics2 API, and that is what this program tests.";
    P2::Engine engine;
    SgAsmBlock *gblock = engine.frontend(argc, argv, purpose, description);
    InstructionSemantics2::StaticSemantics::attachInstructionSemantics(gblock, engine.disassembler());

#if 1 // DEBUGGING [Robb P. Matzke 2015-06-08]
    generateDOT(*SageInterface::getProject());
#endif
}
Exemple #6
0
int
main(int argc, char *argv[]) {
    // Parse the command-line switches
    P2::Engine engine;
    Settings settings;
    std::vector<std::string> args = parseCommandLine(argc, argv, engine, settings);
    if (args.empty()) {
        mlog[FATAL] <<"no binary specimen specified; see --help\n";
        exit(1);
    }

    // Parse the binary specimen. We're not actually adding it to the AST.
    P2::Partitioner binary = engine.partition(args);

    // Process the binary to add its instructions to the source template
    BinaryToSource(settings.generator).generateSource(binary, std::cout);
}
// Parses the command line, records the requested LLVM version in `settings`, builds the AST for the specimen, and
// returns the resulting project.  Exits with status 1 when no specimen is given or when no project exists after the
// AST has been built.
//
// The version string "A[.B[.C]]" is encoded into settings.llvmVersion as 1000000*A + 1000*B + C; components that are
// absent count as zero.
SgProject*
buildAst(int argc, char *argv[], Settings &settings) {
    using namespace Sawyer::CommandLine;
    P2::Engine engine;

    // Parse the command-line
    Parser parser = engine.commandLineParser("transcode to LLVM", "Convert an ELF/PE specimen to LLVM assembly for testing.");
    SwitchGroup toolSwitches("Tool-specific switches");
    toolSwitches.insert(Switch("llvm")
                        .argument("version", anyParser(settings.llvmVersionString))
                        .doc("Version number for LLVM.  The version number is a doublet or triplet of integers such as \"3.5\" or "
                             "\"3.5.0\" and indicates which dialect of assembly should be emitted. The LLVM assembly syntax, being "
                             "mostly an LLVM internal language, changes in incompatible ways between LLVM versions. This transcoder "
                             "supports only certain versions (e.g., 3.5.0 and 3.7.0 as of December 2015)."));

    std::vector<std::string> specimen = parser.with(toolSwitches).parse(argc, argv).apply().unreachedArgs();
    if (specimen.empty()) {
        ::mlog[FATAL] <<"no binary specimen; see --help for usage\n";
        exit(1);
    }

    // Decode up to three dot-separated integers from the version string.  Parsing stops quietly at the first character
    // that is not a '.' separator or as soon as strtol reports an error through errno.
    // NOTE(review): malformed version strings are not diagnosed here -- confirm that this leniency is intended.
    if (!settings.llvmVersionString.empty()) {
        int component[3] = {0, 0, 0};
        char *cursor = NULL;
        errno = 0;
        component[0] = strtol(settings.llvmVersionString.c_str(), &cursor, 10);
        for (size_t i = 1; i < 3 && '.' == *cursor && 0 == errno; ++i)
            component[i] = strtol(cursor+1, &cursor, 10);
        settings.llvmVersion = 1000000 * component[0] + 1000 * component[1] + component[2];
    }

    // Parse, load, disassemble, and partition the specimen.  The return value is not needed because the project is
    // obtained from SageInterface below.
    (void) engine.buildAst(specimen);
    if (SgProject *project = SageInterface::getProject())
        return project;

    ::mlog[FATAL] <<"This tool only supports ELF/PE specimens.\n";
    exit(1);
}
// Tool entry point.  Runs the specimen natively under a debugger, single-stepping it and printing each instruction
// found at the instruction pointer, then prints how the specimen terminated.  Exits with status 1 if no specimen is
// given, its container cannot be parsed, or no disassembler is available.
int
main(int argc, char *argv[]) {
    ROSE_INITIALIZE;
    Diagnostics::initAndRegister(&::mlog, "tool");

    // Command line: the unparsed arguments are the specimen and its arguments.
    P2::Engine engine;
    Settings settings;
    std::vector<std::string> specimen = parseCommandLine(argc, argv, engine, settings);
    if (specimen.empty()) {
        ::mlog[FATAL] <<"no specimen supplied on command-line; see --help\n";
        exit(1);
    }

    // The engine parses the specimen's container and supplies a disassembler.
    if (!engine.parseContainers(specimen.front())) {
        ::mlog[FATAL] <<"cannot parse specimen binary container\n";
        exit(1);
    }
    Disassembler *disassembler = engine.obtainDisassembler();
    if (!disassembler) {
        ::mlog[FATAL] <<"no disassembler for this architecture\n";
        exit(1);
    }
    const RegisterDescriptor REG_IP = disassembler->instructionPointerRegister();
    ASSERT_require2(REG_IP.is_valid(), "simulation must know what register serves as the instruction pointer");

    // Step through the specimen one instruction at a time until it terminates.
    BinaryDebugger debugger(specimen);
    for (/*void*/; !debugger.isTerminated(); debugger.singleStep()) {
        const uint64_t ip = debugger.readRegister(REG_IP).toInteger();

        // Fetch the bytes at the instruction pointer; 16 should be large enough for any instruction.
        uint8_t insnBytes[16];
        const size_t nRead = debugger.readMemory(ip, sizeof insnBytes, insnBytes);
        if (0 == nRead) {
            ::mlog[ERROR] <<"cannot read memory at " <<StringUtility::addrToString(ip) <<"\n";
            continue;
        }

        // Decode and print the instruction.
        SgAsmInstruction *insn = disassembler->disassembleOne(insnBytes, ip, nRead, ip);
        if (!insn) {
            ::mlog[ERROR] <<"cannot disassemble instruction at " <<StringUtility::addrToString(ip) <<"\n";
            continue;
        }
        std::cout <<unparseInstructionWithAddress(insn) <<"\n";
    }
    std::cout <<debugger.howTerminated();
    return 0;
}
Exemple #9
0
// Describes and parses the command line for the binary-to-source tool.  Switch values are stored in
// settings.generator; the returned vector holds the arguments that were not consumed as switches.
static std::vector<std::string>
parseCommandLine(int argc, char *argv[], P2::Engine &engine, Settings &settings) {
    using namespace Sawyer::CommandLine;

    Parser parser = engine.commandLineParser(
        "generates low-level source code from a binary specimen",
        "This command generates a C source file from a binary specimen. The binary specimen is parsed, disassembled, "
        "and partitioned into functions, basic blocks, and instructions. These are then traversed to build C source "
        "code which is emitted to standard output.");

    // The documentation mentions each Boolean switch's default, so capture the current settings as text up front.
    const std::string traceRiscOpsDefault = settings.generator.traceRiscOps ? "" : "not ";
    const std::string traceInsnsDefault = settings.generator.traceInsnExecution ? "" : "not ";

    SwitchGroup tool("Tool-specific switches");

    // --trace-generation / --no-trace-generation share one key; only the positive form is documented.
    tool.insert(Switch("trace-generation")
                .intrinsicValue(true, settings.generator.traceRiscOps)
                .doc("Cause the source generation phase to emit information about the basic RISC-like steps performed for "
                     "each instruction. This can preserve a developer's sanity because the C expressions often become large, "
                     "deeply nested, and not always intuitive about from whence each part came. The @s{no-trace-generation} "
                     "switch turns this off.  The default is to " + traceRiscOpsDefault +
                     "show this information."));
    tool.insert(Switch("no-trace-generation")
                .key("trace-generation")
                .intrinsicValue(false, settings.generator.traceRiscOps)
                .hidden(true));

    // --trace-instructions / --no-trace-instructions, same arrangement.
    tool.insert(Switch("trace-instructions")
                .intrinsicValue(true, settings.generator.traceInsnExecution)
                .doc("Cause the generated source to contain extra \"printf\" calls to emit each instruction as it is "
                     "processed. The @s{no-trace-instructions} switch turns this off. The default is to " +
                     traceInsnsDefault + "add these printf calls."));
    tool.insert(Switch("no-trace-instructions")
                .key("trace-instructions")
                .intrinsicValue(false, settings.generator.traceInsnExecution)
                .hidden(true));

    // Initial register values.
    tool.insert(Switch("ip")
                .longName("instruction-pointer")
                .argument("address", nonNegativeIntegerParser(settings.generator.initialInstructionPointer))
                .doc("Initial value for the instruction pointer.  The default is to not initialize the instruction pointer."));
    tool.insert(Switch("sp")
                .longName("stack-pointer")
                .argument("address", nonNegativeIntegerParser(settings.generator.initialStackPointer))
                .doc("Initial value for the stack pointer. The default is to not initialize the stack pointer."));

    // Memory array allocation.
    tool.insert(Switch("allocate-memory")
                .argument("size", nonNegativeIntegerParser(settings.generator.allocateMemoryArray))
                .doc("Causes the global \"mem\" array to be allocated instead of being declared \"extern\". The switch "
                     "argument is the amount of memory to allocate. If the argument is zero, then the memory array is "
                     "allocated to be just large enough to hold the value at the maximum initialized address. The default "
                     "is to not allocate the array."));

    return parser.with(tool).parse(argc, argv).apply().unreachedArgs();
}
// Tool entry point.  Reads a set of known instruction addresses from a file, starts the specimen under a debugger,
// obtains a memory map (either by having ROSE load the specimen or from the running process, per settings.mapSource),
// and asserts that every known address lies in mapped, executable memory.
// NOTE(review): only the first part of this definition is visible here; the remainder of the function is missing.
int
main(int argc, char *argv[]) {
    ROSE_INITIALIZE;
    Diagnostics::initAndRegister(mlog, "tool");
    Sawyer::ProgressBarSettings::minimumUpdateInterval(0.2); // more fluid spinner

    // Parse command-line
    P2::Engine engine;
    Settings settings;
    std::vector<std::string> args = parseCommandLine(argc, argv, engine, settings);
    // At least two positional arguments are needed: the address file followed by the specimen command.
    ASSERT_always_require2(args.size() >= 2, "incorrect usage; see --help");

    // Parse file containing instruction addresses
    std::string addrFileName = args[0];
    std::set<rose_addr_t> knownVas = parseAddressFile(addrFileName);
    mlog[INFO] <<"parsed " <<plural(knownVas.size(), "unique addresses") <<"\n";

    // Load specimen natively and attach debugger
    std::vector<std::string> specimen_cmd(args.begin()+1, args.end()); // everything after the address file
    BinaryDebugger debugger(specimen_cmd);
    debugger.setBreakpoint(AddressInterval::whole());
    ASSERT_always_require(debugger.isAttached());
    ASSERT_always_forbid(debugger.isTerminated());
    pid_t pid = debugger.isAttached();                  // the isAttached() result is used as the child's process ID
    mlog[INFO] <<"child PID " <<pid <<"\n";

    // Get memory map.
    MemoryMap map;
    if (MAP_ROSE==settings.mapSource) {
        // Have ROSE load the specimen named by the first word of the specimen command.
        map = engine.loadSpecimens(specimen_cmd[0]);
    } else {
        // Otherwise take the map from the process identified by pid.
        map.insertProcess(":noattach:" + numberToString(pid));
    }
    map.dump(mlog[INFO]);

    // The addresses specified in the instruction address file must all be in memory that is mapped.
    BOOST_FOREACH (rose_addr_t va, knownVas) {
        ASSERT_always_require2(map.at(va).require(MemoryMap::EXECUTABLE).exists(),
                               "given address " + addrToString(va) + " is not mapped or lacks execute permission");
    }
// Describes and parses the command line for the native instruction tracer.  The parser carries the engine's general
// switches plus this tool's man-page text; the returned vector holds the arguments not consumed as switches.
static std::vector<std::string>
parseCommandLine(int argc, char *argv[], P2::Engine &engine, Settings &settings) {
    using namespace Sawyer::CommandLine;

    Parser parser;
    parser
        .purpose("show instructions executed natively")
        .version(std::string(ROSE_SCM_VERSION_ID).substr(0, 8), ROSE_CONFIGURE_DATE)
        .chapter(1, "ROSE Command-line Tools")
        .doc("Synopsis", "@prop{programName} [@v{switches}] @v{specimen} [@v{args}...]")
        .doc("Description", "Runs the specimen in a debugger and prints each instruction that is executed.")
        .with(engine.engineSwitches());

    return parser.parse(argc, argv).apply().unreachedArgs();
}
// Loads the specimen resources named by the simulator's executable arguments into the process's memory and sets up
// the process's entry point and disassembler.
//
// The arguments from exeArgs() are parsed with the engine's loader switches; whatever remains is treated as the list
// of resources to load.  The initial program counter is read as a big-endian 32-bit value from address 4 of the
// loaded map and becomes both the original and the starting entry point.  Exits with status 1 if that value cannot be
// read.
//
// Returns the engine's interpretation, which is probably null since the arguments are unlikely to be ELF or PE.
SgAsmInterpretation*
RSIM_ColdFire::parseMainExecutable(RSIM_Process *process) {
    namespace P2 = rose::BinaryAnalysis::Partitioner2;
    using namespace Sawyer::CommandLine;

    // This is raw hardware, so assume that all the arguments are for loading the specimen.
    P2::Engine engine;
    Parser parser;
    parser
        .purpose("initializes ColdFire memory")
        .version(std::string(ROSE_SCM_VERSION_ID).substr(0, 8), ROSE_CONFIGURE_DATE)
        .chapter(1, "ROSE Command-line Tools")
        .doc("Synopsis", "@prop{programName} ... -- [@v{loader_switches}] @v{resources}")
        .doc("Description",
             "This part of the simulator command-line is responsible for configuring how @v{resources} are loaded into "
             "simulated FreeScale ColdFire system memory.  If switches are provided here they must be separated from "
             "simulator switches with a \"--\" to prevent the simulator itself from interpreting them.\n\n" +
             engine.specimenNameDocumentation())
        .with(Switch("help", 'h')
              .hidden(true)
              .action(showHelpAndExit(0)))
        .with(engine.loaderSwitches());
    std::vector<std::string> resources = parser.parse(exeArgs()).apply().unreachedArgs();
    engine.isaName("coldfire");
    MemoryMap::Ptr map = engine.loadSpecimens(resources);
    process->mem_transaction_start("specimen main memory");
    *process->get_memory() = *map;                      // shallow copy, new segments point to same old data

    // The initial program counter is stored at address 4, the second entry in the interrupt vector.  The diagnostic
    // names that same address so that a failure points at the location that was actually read.
    uint32_t initialIpBe = 0;
    if (!map->at(4).limit(sizeof initialIpBe).read((uint8_t*)&initialIpBe)) {
        mlog[FATAL] <<"failed to read initial program counter from address 4\n";
        exit(1);
    }
    uint32_t initialIp = ByteOrder::be_to_host(initialIpBe); // stored big-endian; convert to host order
    process->entryPointOriginalVa(initialIp);
    process->entryPointStartVa(initialIp);

    process->disassembler(engine.obtainDisassembler());
    return engine.interpretation();                     // probably null since args not likely to be ELF or PE
}
Exemple #13
0
// Demonstration tool entry point.  Disassembles and partitions the specimen(s) named on the command line, then walks
// the unused addresses that lie between the lowest and highest disassembled addresses, trying to discover additional
// basic blocks there.  Each acceptable block becomes the entry block of a new function.  Finally an AST is built and an
// assembly listing is written to standard output.
int
main(int argc, char *argv[]) {

    // This paragraph initializes the ROSE library, generates the man page for this tool, does command-line parsing for quite a
    // few switches including "--help", loads various specimen resources (ELF/PE, running process, raw memory dumps, etc),
    // disassembles, and partitions.  We could have called Engine::frontend() and done it all in one function call, but then we
    // wouldn't have a Partitioner2::Partitioner object that we need below.
    std::string purpose = "demonstrate inter-function disassembly";
    std::string description =
        "Disassembles and partitions the specimen(s), then tries to disassemble things between the functions.";
    P2::Engine engine;
    std::vector<std::string> specimens = engine.parseCommandLine(argc, argv, purpose, description).unreachedArgs();
    P2::Partitioner partitioner = engine.partition(specimens);

    // The partitioner's address usage map (AUM) describes what part of memory has been disassembled as instructions or
    // data. We're interested in the unused parts between the lowest and highest disassembled addresses, so we loop over those
    // parts.  The hull() is the entire used interval -- lowest to highest addresses used regardless of the unused areas in the
    // middle.  An AddressInterval evaluated in boolean context returns false if it's empty.
    rose_addr_t va = partitioner.aum().hull().least();
    while (AddressInterval unused = partitioner.aum().nextUnused(va)) {

        // Is the unused area beyond the last thing compiled?  We're only interested in the stuff between functions.  This
        // check also means that unused.greatest()+1 will not overflow, which simplifies later code. Overflows are easy to
        // trigger when the specimen's word size is the same as ROSE's word size.
        if (unused.least() > partitioner.aum().hull().greatest())
            break;

        // The unused address might be in the middle of some very large unmapped area of memory, or perhaps in an area that
        // doesn't have execute permission (the partitioner will only disassemble at addresses that we've marked as
        // executable). A naive implementation would just increment to the next address and try again, but that could take a
        // very long time.  This "if" statement will give us the next executable address that falls within the unused interval
        // if possible. The address is assigned to "va" if possible.
        if (!engine.memoryMap().within(unused).require(MemoryMap::EXECUTABLE).next().assignTo(va)) {
            va = unused.greatest() + 1;                 // won't overflow because of check above
            continue;
        }

        // "va" now points to an executable address that the partitioner doesn't know about yet.
        ASSERT_require(engine.memoryMap().at(va).require(MemoryMap::EXECUTABLE).exists());
        ASSERT_forbid(partitioner.aum().instructionExists(va));
        std::cout <<"unused address " <<StringUtility::addrToString(va) <<"\n";

        // Cause the partitioner to discover (disassemble) one basic block. This doesn't add the basic block to the
        // partitioner or change the partitioner in any way.  If the BB isn't something we want to keep then just forget about
        // it and garbage collection will reclaim the memory.
        P2::BasicBlock::Ptr bb = partitioner.discoverBasicBlock(va);
        if (!isGoodBasicBlock(bb)) {
            ++va;                                       // rejected: retry starting at the very next address
            continue;
        }
        std::cout <<"  disassembled " <<bb->printableName() <<"\n";

        // Inform the partitioner that we wish to keep this BB.
        partitioner.attachBasicBlock(bb);

        // This BB was not reachable by any previous CFG edge, therefore it doesn't belong to any function. In order for it to
        // show up in the eventual AST we need to add it to some function (the ROSE AST has a requirement that every basic
        // block belongs to a function, although the partitioner can easily cope with the other case). The easiest way in this
        // situation is to just create a new function whose entry block is this BB.  Creating a function doesn't modify the
        // partitioner in any way, so we need to also attach the function to the partitioner.
        P2::Function::Ptr function = P2::Function::instance(va, SgAsmFunction::FUNC_USERDEF);
        function->insertBasicBlock(va);                 // allowed only before attaching function to partitioner
        partitioner.attachOrMergeFunction(function);

        // This basic block might be the first block of a whole bunch that are connected by as yet undiscovered CFG edges. We
        // can recursively discover and attach all those blocks with one Engine method.  There are also Partitioner methods to
        // do similar things, but they're lower level.
        engine.runPartitionerRecursive(partitioner);
    }

    // We've probably added a bunch more functions and basic blocks to the partitioner, but we haven't yet assigned the basic
    // blocks discovered by Engine::runPartitionerRecursive to any functions.  We might also need to assign function labels
    // from ELF/PE information, re-run some analysis, etc., so do that now.
    engine.runPartitionerFinal(partitioner);

    // Most ROSE analysis is performed on an abstract syntax tree, so generate one.  If the specimen is an ELF or PE container
    // then the returned global block will also be attached somewhere below a SgProject node, otherwise the returned global
    // block is the root of the AST and there is no project (e.g., like when the specimen is a raw memory dump).
    SgAsmBlock *gblock = P2::Modules::buildAst(partitioner, engine.interpretation());

    // Generate an assembly listing. These unparser properties are all optional, but they result in more informative assembly
    // listings.
    AsmUnparser unparser;
    unparser.set_registers(partitioner.instructionProvider().registerDictionary());
    unparser.add_control_flow_graph(ControlFlow().build_block_cfg_from_ast<ControlFlow::BlockGraph>(gblock));
    unparser.staticDataDisassembler.init(engine.disassembler());
    unparser.unparse(std::cout, gblock);
}
// Describe and parse the command-line for the tool that compares actual execution with known instruction addresses.
//
// Switch values are stored in `settings` (mapSource, trace, showExpected, showUnexpected, showUnmapped).  The parser
// also carries the engine's general and loader switches.  Returns the arguments that were not consumed as switches;
// per the synopsis these are the address file followed by the specimen name and its arguments.
static std::vector<std::string>
parseCommandLine(int argc, char *argv[], P2::Engine &engine, Settings &settings)
{
    using namespace Sawyer::CommandLine;

    // NOTE(review): the switch name "list-instruction-addressses" below looks misspelled, but it refers to a switch of
    // another tool that is not visible here, so it is left unchanged -- confirm against recursiveDisassemble.
    std::string purpose = "compares actual execution with known instructions";
    std::string description = 
        "Reads instruction addresses from a file, the so-called \"expected\" addresses and then executes the specimen "
        "and compares actual executed addresses with the expected addresses.  An actual executed address falls into one of "
        "three categories:  (1) the address is an expected address, or else (2) the address is not mapped, or else (3) the "
        "address is not expected.\n\n"

        "One method of obtaining a list of expected addresses is to use the @man{recursiveDisassemble}{--help} tool's "
        "@s{list-instruction-addressses}{noerror} switch. Although this produces output that contains instruction sizes "
        "as well as addresses, @prop{programName} ignores the sizes.  This can be used to test whether a process executes any "
        "instructions that were not also disassembled, thereby testing some aspect of the disassembly quality.";

    // The parser is the same as that created by Engine::commandLineParser except we don't need any disassembler or partitioning
    // switches since this tool doesn't disassemble or partition.
    Parser parser;
    parser
        .purpose(purpose)
        .version(std::string(ROSE_SCM_VERSION_ID).substr(0, 8), ROSE_CONFIGURE_DATE)
        .chapter(1, "ROSE Command-line Tools")
        .doc("Synopsis",
             "@prop{programName} [@v{switches}] @v{address_file} @v{specimen_name} @v{specimen_arguments}...")
        .doc("Description", description)
        .doc("Specimens", engine.specimenNameDocumentation())
        .with(engine.engineSwitches())
        .with(engine.loaderSwitches());
    
    SwitchGroup tool("Tool specific switches");
    tool.name("tool");

    // Source of the memory map.
    tool.insert(Switch("map")
                .argument("how", enumParser(settings.mapSource)
                          ->with("native", MAP_NATIVE)
                          ->with("rose", MAP_ROSE))
                .doc("Specifies how the memory map should be obtained, where @v{how} is either \"native\" to obtain the map "
                     "from a running process, or \"rose\" to obtain the map by parsing the specimen container with ROSE.  When "
                     "obtained natively the map may contain addresses that were not visible to the original disassembler. When "
                     "obtained from ROSE the map might not be identical to the map actually used by the process. The default is \"" +
                     std::string(MAP_ROSE==settings.mapSource?"rose":"native") + "\"."));

    // Each Boolean switch below has a hidden "no-" counterpart that shares its key and stores false.
    tool.insert(Switch("trace")
                .intrinsicValue(true, settings.trace)
                .doc("When @s{trace} is specified each execution address is printed to a file named \"@v{pid}.trace\" where "
                     "@v{pid} is the process ID of the specimen.  Each line of the file will contain the following "
                     "space-separated fields:"
                     "@bullet{The hexadecimal address of the instruction that was executed.}"
                     "@bullet{The number of times this address has been executed so far.}"
                     "@bullet{The digit '1' or '0' to indicate whether the address is known (from the @v{address_file}) or not.}"
                     "The @s{no-trace} switch disables tracing. The default is to " + std::string(settings.trace?"":"not ") +
                     "produce a trace."));
    tool.insert(Switch("no-trace")
                .key("trace")
                .intrinsicValue(false, settings.trace)
                .hidden(true));

    tool.insert(Switch("show-expected")
                .intrinsicValue(true, settings.showExpected)
                .doc("List addresses that were expected and show how many times each was executed.  The output will be one line "
                     "per address, containing a hexadecimal address and a decimal count separated by white space.  The "
                     "@s{no-show-expected} switch turns this listing off.  The default is to " +
                     std::string(settings.showExpected?"":"not ") + "show this information."));
    tool.insert(Switch("no-show-expected")
                .key("show-expected")
                .intrinsicValue(false, settings.showExpected)
                .hidden(true));
    
    tool.insert(Switch("show-unexpected")
                .intrinsicValue(true, settings.showUnexpected)
                .doc("List the addresses that were executed where no instruction was expected.  The output will be one line per "
                     "address, containing a hexadecimal address and the number of times the address was executed separated by "
                     "white space.  The @s{no-show-unexpected} switch turns this listing off.  The default is to " +
                     std::string(settings.showUnexpected?"":"not ") + "show this information."));
    tool.insert(Switch("no-show-unexpected")
                .key("show-unexpected")
                .intrinsicValue(false, settings.showUnexpected)
                .hidden(true));

    tool.insert(Switch("show-unmapped")
                .intrinsicValue(true, settings.showUnmapped)
                .doc("List addresses that were executed but are not present in the memory map.  These are probably instructions "
                     "that belong to the dynamic linker, dynamically-linked libraries, or virtual dynamic shared objects.  The "
                     "@s{no-show-unmapped} switch turns this listing off.  The default is to " +
                     std::string(settings.showUnmapped?"":"not ") + "show this information."));
    tool.insert(Switch("no-show-unmapped")
                .key("show-unmapped")
                .intrinsicValue(false, settings.showUnmapped)
                .hidden(true));

    return parser.with(tool).parse(argc, argv).apply().unreachedArgs();
}
// Parse the command-line and apply it to the engine and settings.
//
// The parser is obtained from engine.commandLineParser() and is extended with this tool's own switch group:
//   "metric"          -- stores the selected distance metric in settings.metric
//   "list"/"no-list"  -- set or clear settings.listPairings (both switches share the key "list")
//
// The documentation strings embed the current values of settings.metric and settings.listPairings as the advertised
// defaults, so the settings should hold their default values when this function is called.
//
// Returns the arguments that were not consumed as switches, after they have been passed through
// parser.expandIncludedFiles().
static std::vector<std::string>
parseCommandLine(int argc, char *argv[], P2::Engine &engine, Settings &settings) {
    using namespace Sawyer::CommandLine;

    std::string purpose = "finds similar functions";
    std::string description =
        "This tool attempts to correlate functions in one binary specimen with related functions in the other specimen. "
        "It does so by parsing, loading, disassembling, and partitioning each specimen to obtain a list of functions. "
        "Then it computes a syntactic distance between all pairs of functions using a specified distance metric "
        "(see @s{metric}) to create an edge-weighted, bipartite graph.  Finally, a minimum weight perfect matching is "
        "found using the Kuhn-Munkres algorithm.  The answer is output as a list of function correlations and their "
        "distance from each other.  The specimens need not have the same number of functions, in which case one of "
        "the specimens will have null functions inserted to make them the same size.  The distance between a null "
        "function and some other function is always zero regardless of metric.\n\n"

        "The specimens can be specified as two files or resources, or multiple files and/or resources per specimen. When "
        "more than two arguments are specified, a \"--\" must separate the files and resources of the first specimen from "
        "those of the second.";

    Parser parser = engine.commandLineParser(purpose, description);
    parser.doc("Synopsis", "@prop{programName} [@v{switches}] @v{specimen1} [--] @v{specimen2}");

    SwitchGroup tool("Switches for this tool");
    tool.name("tool");

    // Distance metric selection; the enum parser maps each accepted name to its METRIC_* value.
    tool.insert(Switch("metric")
                .argument("name", enumParser(settings.metric)
                          ->with("tree", METRIC_TREE)
                          ->with("linear", METRIC_LINEAR)
                          ->with("insn", METRIC_INSN)
                          ->with("size", METRIC_SIZE)
                          ->with("sizeaddr", METRIC_SIZE_ADDR))
                .doc("Metric to use when comparing two functions.  The following metrics are implemented:"

                     "@named{linear}{The \"linear\" method creates a list consisting of AST node types and, in the case "
                     "of SgAsmInstruction nodes, the instruction kind (e.g., \"x86_pop\", \"x86_mov\", etc) for each function. "
                     "It then computes an edit distance for any pair of lists by using the Levenshtein algorithm and normalizes "
                     "the edit cost according to the size of the lists that were compared.}"

                     "@named{insn}{This is the same as the \"linear\" method but it computes the edit distance for only "
                     "the instruction types without considering their operands.}"

                     "@named{tree}{The \"tree\" method is similar to the \"linear\" method but restricts edit operations "
                     "according to the depth of the nodes in the functions' ASTs.  This method is orders of magnitude slower "
                     "than the \"linear\" method and doesn't seem to give better results.}"

                     "@named{size}{Uses difference in AST size as the distance metric.  The difference between two functions "
                     "is the absolute value of the difference in the size of their ASTs. This is easily the fastest metric.}"

                     "@named{sizeaddr}{Uses difference in AST size and difference in entry address as the distance metric. "
                     "Functions are sorted into a vector according to their entry address and the difference in vector index "
                     "contributes to the distance between two functions.}"

                     "The default metric is \"" + metricName(settings.metric) + "\"."));

    // Boolean switch pair: "list" turns the pairing listing on, the hidden "no-list" turns it off.  The optional "not "
    // already carries its own trailing space, so the text that follows must not begin with another one.
    tool.insert(Switch("list")
                .intrinsicValue(true, settings.listPairings)
                .doc("Produce a listing that indicates how functions in the first specimen map into functions in the "
                     "second specimen.  The default is to " + std::string(settings.listPairings?"":"not ") + "show "
                     "this information.  The @s{no-list} switch is the inverse.  Regardless of whether the pairings are "
                     "listed, the output will contain summary information."));
    tool.insert(Switch("no-list")
                .key("list")
                .intrinsicValue(false, settings.listPairings)
                .hidden(true));

    return parser.expandIncludedFiles(parser.with(tool).parse(argc, argv).apply().unreachedArgs());
}
Exemple #16
0
// Demo entry point: runs a subordinate program under a debugger, single-stepping it to collect an execution trace, and
// relates that trace to an instruction-level control flow graph derived from the partitioner's basic-block CFG.
//
// Standard output receives the main results, one per line: "not executed" (CFG instructions never reached), "missing
// address" (executed addresses absent from the CFG), "single flow" (multi-successor instructions that always went the same
// way), and "burstiness" (per-branch statistics).  Progress and summary statistics go to the mlog diagnostic facility, and
// the memory map is dumped to std::cerr.  The process exits with status 1 if the command line is invalid or names no
// executable.
int
main(int argc, char *argv[]) {
    ROSE_INITIALIZE;
    Diagnostics::initAndRegister(&mlog, "tool");

    // Parse the command-line to configure the partitioner engine, obtain the executable and its arguments, and generate a man
    // page, adjust global settings, etc. This demo tool has no switches of its own, which makes this even easier. For a
    // production tool, it's probably better to obtain the parser and register only those switches we need (e.g., no need for
    // AST generation switches since we skip that step), to set it up to use our own diagnostic stream instead of exceptions,
    // and to adjust this tool's synopsis in the documentation.  Examples of all of these can be found in other demos.
    P2::Engine engine;
    engine.doingPostAnalysis(false);                    // no need for any post-analysis phases (user can override on cmdline)
    std::vector<std::string> command;
    try {
        command = engine.parseCommandLine(argc, argv, purpose, description).unreachedArgs();
    } catch (const std::runtime_error &e) {
        mlog[FATAL] <<"invalid command-line: " <<e.what() <<"\n";
        exit(1);
    }
    // The unreached arguments are the subordinate's command line (executable followed by its arguments).
    if (command.empty()) {
        mlog[FATAL] <<"no executable specified\n";
        exit(1);
    }

    // Since we'll be tracing this program's execution, we might as well disassemble the process's memory directly. That way we
    // don't have to worry about ROSE mapping the specimen to the same virtual address as the kernel (which might be using
    // address randomization). We can stop short of generating the AST because we won't need it.
    BinaryAnalysis::BinaryDebugger debugger(command);
    // NOTE(review): isAttached() is formatted as a number here, presumably the subordinate's process ID -- confirm.
    std::string specimenResourceName = "proc:noattach:" + StringUtility::numberToString(debugger.isAttached());
    P2::Partitioner partitioner = engine.partition(specimenResourceName);
    partitioner.memoryMap()->dump(std::cerr);           // show the memory map as a debugging aid

    // Create a global control flow graph whose vertices are instructions from a global CFG whose verts are mostly basic
    // blocks.
    InsnCfg insnCfg;
    const P2::ControlFlowGraph &bbCfg = partitioner.cfg();
    BOOST_FOREACH (const P2::ControlFlowGraph::Vertex &bbVert, bbCfg.vertices()) {
        // Vertices for which isBasicBlock() yields a null pointer are skipped entirely.
        if (P2::BasicBlock::Ptr bb = isBasicBlock(bbVert)) {
            const std::vector<SgAsmInstruction*> &insns = bb->instructions();

            // Each basic block has one or more instructions that need to be inserted into our instruction control flow graph
            // with edges from each instruction to the next.  The insertEdgeWithVertices automatically inserts missing
            // vertices, and doesn't insert vertices that already exist, making it convenient for this type of construction.
            // The loop starts at 1 so each iteration links insns[i-1] to insns[i]; a one-instruction block adds no edge here.
            for (size_t i=1; i<insns.size(); ++i)
                insnCfg.insertEdgeWithVertices(insns[i-1], insns[i]);

            // The final instruction of this block needs to flow into each of the initial instructions of the successor basic
            // blocks. Be careful that the successors are actually existing basic blocks.  Note that in ROSE's global CFG, a
            // function call has at least two successors: the function being called (normal edges), and the address to which
            // the function returns ("callret" edges). There are other types of edges too, but we want only the normal edges.
            BOOST_FOREACH (const P2::ControlFlowGraph::Edge &bbEdge, bbVert.outEdges()) {
                if (bbEdge.value().type() == P2::E_NORMAL) {
                    // NOTE(review): insns.back() and instructions()[0] assume that neither block is empty -- confirm that
                    // basic blocks in the CFG always hold at least one instruction.
                    if (P2::BasicBlock::Ptr target = isBasicBlock(*bbEdge.target()))
                        insnCfg.insertEdgeWithVertices(insns.back(), target->instructions()[0]);
                }
            }
        }
    }
    mlog[INFO] <<"block CFG: "
               <<StringUtility::plural(bbCfg.nVertices(), "vertices", "vertex") <<", "
               <<StringUtility::plural(bbCfg.nEdges(), "edges") <<"\n";
    mlog[INFO] <<"insn CFG:  "
               <<StringUtility::plural(insnCfg.nVertices(), "vertices", "vertex") <<", "
               <<StringUtility::plural(insnCfg.nEdges(), "edges") <<"\n";
    
    // Run the executable to obtain a trace.  We use the instruction pointer to look up a SgAsmInstruction in the insnCfg and
    // thus map the trace onto the instruction CFG.
    mlog[INFO] <<"running subordinate to obtain trace: " <<boost::join(command, " ") <<"\n";
    std::set<rose_addr_t> missingAddresses;
    Trace trace;
    while (!debugger.isTerminated()) {
        // Find the instruction CFG vertex corresponding to the current execution address. It could be that the execution
        // address doesn't exist in the CFG, and this can be caused by a number of things including failure of ROSE to
        // statically find the address, dynamic libraries that weren't loaded statically, etc.
        rose_addr_t va = debugger.executionAddress();
        InsnCfg::ConstVertexIterator vertex = insnCfg.findVertexKey(va);
        if (!insnCfg.isValidVertex(vertex)) {
            // Addresses without a vertex are remembered (once each, since this is a set) but add nothing to the trace.
            missingAddresses.insert(va);
        } else {
            // The trace is a sequence of vertex IDs, not addresses.
            trace.append(vertex->id());
        }
        debugger.singleStep();
    }
    mlog[INFO] <<"subordinate " <<debugger.howTerminated() <<"\n";
    mlog[INFO] <<"trace length: " <<StringUtility::plural(trace.size(), "instructions") <<"\n";
    Diagnostics::mfprintf(mlog[INFO])("overall burstiness: %6.2f%%\n", 100.0 * trace.burstiness());
    mlog[INFO] <<"distinct executed addresses missing from CFG: " <<missingAddresses.size() <<"\n";

    // Print a list of CFG vertices that were never reached.  We use std::cout rather than diagnostics because this is one of
    // the main outputs of this demo. The "if" condition is constant time.
    BOOST_FOREACH (const InsnCfg::Vertex &vertex, insnCfg.vertices()) {
        if (!trace.exists(vertex.id()))
            std::cout <<"not executed: " <<unparseInstructionWithAddress(vertex.value()) <<"\n";
    }

    // Print list of addresses that were executed but did not appear in the CFG
    BOOST_FOREACH (rose_addr_t va, missingAddresses)
        std::cout <<"missing address: " <<StringUtility::addrToString(va) <<"\n";

    // Print those branch instructions that were executed by the trace but always took the same branch.  Just to mix things up,
    // I'll iterate over the trace labels this time instead of the CFG vertices.  Remember, the labels are the integer IDs of
    // the CFG vertices. The "if" condition executes in constant time, as does the next line.
    // NOTE(review): the condition calls trace.successors(i) while the following line calls trace.successorSet(i) -- confirm
    // that both members exist on Trace and describe the same set.
    for (size_t i = 0; i < trace.nLabels(); ++i) {
        if (insnCfg.findVertex(i)->nOutEdges() > 1 && trace.successors(i).size() == 1) {
            SgAsmInstruction *successor = insnCfg.findVertex(*trace.successorSet(i).begin())->value();
            std::cout <<"single flow: " <<unparseInstructionWithAddress(insnCfg.findVertex(i)->value())
                      <<" --> " <<unparseInstructionWithAddress(successor) <<"\n";
        }
    }

    // Get a list of executed instructions that are branch points and sort them by their burstiness.  The "if" condition is
    // constant time.
    std::vector<InsnTraceInfo> info;
    BOOST_FOREACH (const InsnCfg::Vertex &vertex, insnCfg.vertices()) {
        if (vertex.nOutEdges() > 1 && trace.exists(vertex.id()))
            info.push_back(InsnTraceInfo(vertex.value(), trace.burstiness(vertex.id()), trace.size(vertex.id())));
    }
    // Sort ascending and then reverse so the records print in descending order.  The ordering itself comes from
    // InsnTraceInfo's operator<, which is defined elsewhere.
    std::sort(info.begin(), info.end());
    std::reverse(info.begin(), info.end());
    BOOST_FOREACH (const InsnTraceInfo &record, info) {
        Diagnostics::mfprintf(std::cout)("burstiness %6.2f%% %5zu hits at %s\n",
                                         100.0*record.burstiness, record.nHits,
                                         unparseInstructionWithAddress(record.insn).c_str());
    }
    // NOTE(review): the closing brace of main() does not appear in this listing before the next definition begins.
// Parse the command-line for the string-decoding demo and apply it to the settings.
//
// The parser is obtained from engine.commandLineParser() and is extended with a "tool" switch group whose switches write
// directly into the corresponding members of settings:
//   "decoder"                                -- settings.decoderVa
//   "stack"                                  -- settings.stackVa
//   "trace-insns"/"no-trace-insns"           -- settings.traceInsns
//   "trace-semantics"/"no-trace-semantics"   -- settings.traceSemantics
//   "insn-limit"                             -- settings.insnLimit
//   "show-call"                              -- settings.showCall
//   "synthesized"                            -- settings.synthesized
//
// Several documentation strings embed the current values of the settings as the advertised defaults, so the settings
// should hold their default values when this function is called.
//
// Returns the arguments that were not consumed as switches.
static std::vector<std::string>
parseCommandLine(int argc, char *argv[], P2::Engine &engine, Settings &settings) {
    using namespace Sawyer::CommandLine;
    std::string purpose = "decode encoded strings";
    std::string description =
        "Demonstrates the use of ROSE instruction semantics and ability to start execution at an arbitrary address and "
        "machine state. The @s{decoder} switch is required--it is the entry address of a string decoding function. This "
        "analysis finds all statically-detected calls to that function, obtains three arguments from the call's basic block, "
        "and calls the function.  The third argument is used as the address of a buffer where the decoded string is stored, "
        "and the string will be printed as the result.\n\n"
        "This tool can also run in a mode where the calls are synthesized by varying the first of three arguments.";

    Parser parser = engine.commandLineParser(purpose, description);
    // NOTE(review): the second argument ("z") is presumably a key used to order documentation sections -- confirm against
    // the Sawyer Parser::doc documentation.
    parser.doc("Bugs", "z",
               "Being a demo, this tool is not very flexible when it comes to how the decrypted string is located or "
               "what argument values are used in the synthesis mode.");

    SwitchGroup sg("Tool-specific switches");
    sg.name("tool");

    sg.insert(Switch("decoder")
              .argument("address", nonNegativeIntegerParser(settings.decoderVa))
              .doc("Virtual address of the string decoding function."));

    sg.insert(Switch("stack")
              .argument("address", nonNegativeIntegerParser(settings.stackVa))
              .doc("Initial value for the stack pointer.  The default is " +
                   StringUtility::addrToString(settings.stackVa) + "."));

    // Boolean switch pairs: the visible switch sets the flag, and the hidden "no-" variant shares its key and clears it.
    sg.insert(Switch("trace-insns")
              .intrinsicValue(true, settings.traceInsns)
              .doc("Cause instructions to be printed to standard error as they are executed. The @s{no-trace-insns} switch "
                   "turns tracing off. The default is to " + std::string(settings.traceInsns?"":"not ") +
                   "show tracing."));
    sg.insert(Switch("no-trace-insns")
              .key("trace-insns")
              .intrinsicValue(false, settings.traceInsns)
              .hidden(true));

    sg.insert(Switch("trace-semantics")
              .intrinsicValue(true, settings.traceSemantics)
              .doc("Cause instruction semantics (the RISC-like operations) to be printed to standard error as they are "
                   "executed. The @s{no-trace-semantics} switch turns tracing off. The default is to " +
                   std::string(settings.traceSemantics?"":"not ") + "show tracing."));
    sg.insert(Switch("no-trace-semantics")
              .key("trace-semantics")
              .intrinsicValue(false, settings.traceSemantics)
              .hidden(true));

    sg.insert(Switch("insn-limit")
              .argument("n", nonNegativeIntegerParser(settings.insnLimit))
              .doc("Maximum number of instructions to execute per decoder call before giving up. The default is " +
                   StringUtility::plural(settings.insnLimit, "instructions") + "."));

    sg.insert(Switch("show-call")
              .argument("n", nonNegativeIntegerParser(settings.showCall))
              .doc("Show calls to the decryption function along with their arguments.  The @v{n} specifies how many arguments "
                   "(each being the natural length of a word) to display. If @v{n} is zero then call information is not "
                   "displayed.  The default is " + StringUtility::plural(settings.showCall, "arguments") + "."));

    // There is no inverse switch for "synthesized"; it can only turn the mode on.
    sg.insert(Switch("synthesized")
              .intrinsicValue(true, settings.synthesized)
              .doc("Synthesize calls from scratch instead of looking for existing calls."));

    return parser.with(sg).parse(argc, argv).apply().unreachedArgs();
}