Exemple #1
0
/**
 * Entry point: parses a binary specimen and records its system-call information in a database.
 *
 * Command line: [--link | --no-link] [--] DATABASE SPECIMEN
 *   --help, -h   invoke ::usage(0)
 *   --link       pass link=true to open_specimen()
 *   --no-link    pass link=false to open_specimen() (the default)
 *
 * Exactly two positional arguments must follow the switches; otherwise ::usage(1) is invoked.  The table
 * "syscalls_made" is dropped (if present) and recreated, then populated and analyzed inside one transaction
 * that is committed before returning.
 *
 * @return 0 on success; unrecognized switches terminate the process with exit status 1.
 */
int
main(int argc, char *argv[])
{
    std::ios::sync_with_stdio();

    // Reduce argv[0] to its final path component and strip a leading "lt-" prefix so that diagnostics show a
    // short program name.
    argv0 = argv[0];
    {
        size_t slash = argv0.rfind('/');
        argv0 = slash==std::string::npos ? argv0 : argv0.substr(slash+1);
        if (0==argv0.substr(0, 3).compare("lt-"))
            argv0 = argv0.substr(3);
    }

    int argno = 1;
    bool link = false;

    // Consume leading switches; "--" ends switch processing explicitly.
    for (/*void*/; argno<argc && '-'==argv[argno][0]; ++argno) {
        std::cout << argv[argno] << std::endl;
        if (!strcmp(argv[argno], "--")) {
            ++argno;
            break;
        } else if (!strcmp(argv[argno], "--help") || !strcmp(argv[argno], "-h")) {
            ::usage(0);
        } else if (!strcmp(argv[argno], "--link")) {
            link = true;
        } else if (!strcmp(argv[argno], "--no-link")) {
            link = false;
        } else {
            std::cerr <<argv0 <<": unrecognized switch: " <<argv[argno] <<"\n"
                      <<"see \"" <<argv0 <<" --help\" for usage info.\n";
            exit(1);
        }
    }

    // Exactly two positional arguments are required: the database and the specimen.
    if (argno+2!=argc)
        ::usage(1);

    std::string db_name(argv[argno++]);
    std::cout << "Connecting to db:" << db_name << std::endl;
    SqlDatabase::ConnectionPtr conn = SqlDatabase::Connection::create(db_name);
    transaction = conn->transaction();

    // Start from an empty result table on every run.
    transaction->execute("drop table if exists syscalls_made;");
    transaction->execute("create table syscalls_made (caller integer references semantic_functions(id),"
                         " syscall_id integer, syscall_name text)");

    // The second positional argument is the specimen, so label it as such (it was previously reported as the
    // database name).
    std::string specimen_name = argv[argno++];
    std::cout << "specimen name is : " << specimen_name << std::endl;

    // Parse the binary specimen
    SgAsmInterpretation *interp = open_specimen(specimen_name, argv0, link);
    assert(interp!=NULL);

    // Collect every function in the specimen, build the reachability graph over them, and store the results.
    std::vector<SgAsmFunction*> all_functions = SageInterface::querySubTree<SgAsmFunction>(interp);
    DirectedGraph* G = create_reachability_graph(all_functions, interp);
    add_calls_to_syscalls_to_db(transaction, G, all_functions);
    analyze_data(transaction);
    transaction->commit();
    return 0;
}
    // Runs all work items belonging to a single specimen.  Does nothing when the work list is empty.  The
    // specimen is loaded, its functions are indexed, and the work list is cut into fixed-size chunks that are
    // handed to runInParallel().  The process exits with status 1 if the specimen cannot be loaded or if any
    // chunk fails.
    void operator()() {
        if (work.empty())
            return;
        int specimen_id = work.front().specimen_id;

        // According to the SqLite and PostgreSQL documentation a database connection does not survive fork(), so a
        // fresh connection is opened here.
        SqlDatabase::TransactionPtr tx = SqlDatabase::Connection::create(databaseUrl)->transaction();

        OutputGroups ogroups; // deliberately not loaded from the database (that might take a very long time)

        const bool announce = opt.verbosity>=LACONIC;
        if (announce && opt.verbosity>=EFFUSIVE)
            std::cerr <<argv0 <<": " <<std::string(100, '#') <<"\n";
        if (announce)
            std::cerr <<argv0 <<": processing binary specimen \"" <<files.name(specimen_id) <<"\"\n";

        // Obtain the specimen's AST: first through files.load_ast(), then by parsing it with open_specimen()
        SgProject *project = files.load_ast(tx, specimen_id);
        if (!project) {
            project = open_specimen(tx, files, specimen_id, argv0);
            if (!project) {
                std::cerr <<argv0 <<": problems loading specimen\n";
                exit(1);
            }
        }

        // Enumerate the specimen's functions, build the reverse lookup tables, and set up the instruction cache
        std::vector<SgAsmFunction*> all_functions = SageInterface::querySubTree<SgAsmFunction>(project);
        IdFunctionMap functions = existing_functions(tx, files, all_functions);
        FunctionIdMap function_ids;                         // maps function to function ID
        AddressIdMap entry2id;                              // maps function entry address to function ID
        for (IdFunctionMap::iterator known=functions.begin(); known!=functions.end(); ++known) {
            function_ids[known->second] = known->first;
            entry2id[known->second->get_entry_va()] = known->first;
        }
        InstructionProvidor insns(all_functions);

        // Cut the work list into consecutive chunks of testsPerChunk items; only the final chunk may be shorter.
        static const size_t testsPerChunk = 25;
        std::vector<SomeTests> jobs;
        for (size_t chunkBegin=0; chunkBegin<work.size(); chunkBegin+=testsPerChunk) {
            size_t chunkEnd = std::min(chunkBegin+testsPerChunk, work.size());
            Work chunk(work.begin()+chunkBegin, work.begin()+chunkEnd);
            jobs.push_back(SomeTests(chunk, databaseUrl, functions, function_ids, &insns, cmd_id, &entry2id));
        }

        // The transaction has to be committed before forking so that the children can see the rows added to the
        // various tables.  The chunks then run in parallel, limited by the parallelism given on the command-line.
        tx->commit();
        tx.reset();
        size_t nfailed = runInParallel(jobs, opt.nprocs);
        if (0==nfailed)
            return;
        std::cerr <<"SpecimenProcessor: " <<StringUtility::plural(nfailed, "jobs") <<" failed\n";
        exit(1);
    }
Exemple #3
0
int
main(int argc, char* argv[])
{
    std::string database;
    int norm = 1;
    double similarity_threshold=1.;

    size_t k;
    size_t l;
    try {
        options_description desc("Allowed options");
        desc.add_options()
            ("help", "produce a help message")
            ("database,q", value< string >()->composing(), 
             "the sqlite database that we are to use")
            ("norm,p", value< int >(&norm), "Exponent in p-norm to use (1 or 2 or 3 (MIT implementation) )")
            ("hash-function-size,k", value< size_t >(&k), "The number of elements in a single hash function")
            ("hash-table-count,l", value< size_t >(&l), "The number of separate hash tables to create")
            ("similarity,t", value< double >(&similarity_threshold), "The similarity threshold that is allowed in a clone pair");

        variables_map vm;
        store(parse_command_line(argc, argv, desc), vm);
        notify(vm);

        if (vm.count("help")) {
            cout << desc;            
            exit(0);
        }
		
        if (vm.count("database")!=1) {
            std::cerr << "Missing options. Call as: findClones --database <database-name>" 
                      << std::endl;
            exit(1);

        }

        database = vm["database"].as<string >();
        similarity_threshold = vm["similarity"].as<double>();
        cout << "database: " << database << std::endl;
    } catch(exception& e) {
        cout << e.what() << "\n";
    }

    std::cout << "The similarity threshold is " << similarity_threshold << std::endl;
    SqlDatabase::TransactionPtr tx = SqlDatabase::Connection::create(database)->transaction();
    tx->statement("update run_parameters set similarity_threshold = ?")
        ->bind(0, similarity_threshold)
        ->execute();

    OperateOnClusters op(database, norm, similarity_threshold, k , l);
    op.analyzeClusters();
    //op.calculate_false_positives();

    tx->commit();
    return 0;
};
// Entry point of the test runner.  Expects exactly one positional argument, the database URL, after the
// switches handled by parse_commandline().  Each specimen's work is processed in its own forked child, one
// specimen at a time; the command is recorded as started before and as finished after all specimens ran.
int
main(int argc, char *argv[])
{
    // Command-line handling; opt.nprocs is seeded from nProcessors() before the switches are parsed.
    opt.nprocs = nProcessors();
    int urlIdx = parse_commandline(argc, argv);
    if (argc != urlIdx+1)
        usage(1);
    std::string databaseUrl = argv[urlIdx];

    SqlDatabase::TransactionPtr tx = SqlDatabase::Connection::create(databaseUrl)->transaction();
    int64_t cmd_id = start_command(tx, argc, argv, "running tests");

    // Fetch the work list; nothing to do if it is empty.
    MultiWork work;
    load_sorted_work(work/*out*/);
    if (work.empty())
        return 0;

    // Load information about files.  The transaction is not saved anywhere.
    FilesTable files(tx);

    // Commit before forking: child processes cannot otherwise see the rows inserted so far, in particular the
    // semantic_history row that identifies this command.  Resetting the smart pointer also closes the connection.
    tx->commit();
    tx.reset();

    // One forked child per specimen, run sequentially; stop at the first failure.
    BOOST_FOREACH (const Work &specimenWork, work) {
        if (forkAndWait(SpecimenProcessor(specimenWork, files, databaseUrl, cmd_id)))
            exit(1);
    }

    // Reconnect and mark this command as finished.
    tx = SqlDatabase::Connection::create(databaseUrl)->transaction();
    finish_command(tx, cmd_id, "ran tests");
    tx->commit();

    return 0;
}
Exemple #5
0
// Entry point: runs exactly one sub-command ("clear", "update", "missing", "count-missing" or "list") against
// the database named by settings.databaseUri.  Changes are committed unless settings.dryRun is set.
int
main(int argc, char *argv[]) {
    Sawyer::initializeLibrary();
    mlog = Sawyer::Message::Facility("tool");
    Sawyer::Message::mfacilities.insertAndAdjust(mlog);

    // Command-line parsing; what remains after the switches must be a single sub-command name.
    Settings settings;
    std::vector<std::string> args = parseCommandLine(argc, argv, settings);
    SqlDatabase::TransactionPtr tx = SqlDatabase::Connection::create(settings.databaseUri)->transaction();
    if (1 != args.size()) {
        mlog[FATAL] <<"incorrect usage; see --help\n";
        exit(1);
    }

    // Dispatch on the sub-command name.
    const std::string &command = args[0];
    if (command == "clear") {
        clearErrors(tx, settings);
    } else if (command == "update") {
        updateDatabase(tx, settings);
    } else if (command == "missing") {
        listMissingErrors(tx, settings);
    } else if (command == "count-missing") {
        countMissingErrors(tx, settings);
    } else if (command == "list") {
        listErrors(tx, settings);
    } else {
        mlog[FATAL] <<"unknown command \"" <<StringUtility::cEscape(command) <<"\"; see --help\n";
        exit(1);
    }

    // Only a non-dry run makes its changes permanent.
    if (!settings.dryRun) {
        tx->commit();
    } else {
        mlog[WARN] <<"database was not modified (running with --dry-run)\n";
    }
    return 0;
}
/**
 * Entry point: computes API-call similarity for a work list of function pairs and stores the results in the
 * api_call_similarity table.
 *
 * Command line: [SWITCHES] [--] DATABASE
 *   --help, -h                   invoke usage(0)
 *   --ignore-inline-candidates   set the ignore_inline_candidates flag passed to similarity()
 *   --ignore-no-compares         set the ignore_no_compares flag passed to similarity()
 *   --progress                   force progress output
 *   --no-expand-ncalls           clear the expand_ncalls flag (already the default)
 *   --file=NAME                  read the function-pair work list from NAME instead of stdin
 *   --verbose                    print each function pair as it is processed
 *   --call-depth=N               call depth passed to similarity() (default -1)
 *
 * For every pair, similarity() is evaluated once per input group shared by both functions; the maximum,
 * minimum and average of the non-negative results are stored together with whole_function_similarity().
 *
 * @return 0 on success; exits with status 1 on an unknown switch or an unreadable work-list file.
 */
int
main(int argc, char *argv[])
{
    std::ios::sync_with_stdio();

    // Reduce argv[0] to its final path component and strip a leading "lt-" prefix for use in diagnostics.
    argv0 = argv[0];
    {
        size_t slash = argv0.rfind('/');
        argv0 = slash==std::string::npos ? argv0 : argv0.substr(slash+1);
        if (0==argv0.substr(0, 3).compare("lt-"))
            argv0 = argv0.substr(3);
    }

    bool ignore_inline_candidates = false;
    bool ignore_no_compares = false;
    int  call_depth = -1;
    bool ignore_faults = true;
    double semantic_similarity_threshold = 0.70;
    bool expand_ncalls = false;
    bool reachability_graph = true;
    bool show_progress = false;
    bool verbose = false;
    std::string input_file_name;

    // Consume leading switches; "--" ends switch processing explicitly.
    int argno = 1;
    for (/*void*/; argno<argc && '-'==argv[argno][0]; ++argno) {
        if (!strcmp(argv[argno], "--")) {
            ++argno;
            break;
        } else if (!strcmp(argv[argno], "--help") || !strcmp(argv[argno], "-h")) {
            usage(0);
        } else if (!strcmp(argv[argno], "--ignore-inline-candidates")) {
            ignore_inline_candidates = true;
        } else if (!strcmp(argv[argno], "--ignore-no-compares")) {
            // The flag defaults to false, so the switch must turn it on (it previously assigned false, which
            // made the switch a no-op).
            ignore_no_compares = true;
        } else if (!strcmp(argv[argno], "--progress")) {
            show_progress = true;
        } else if (!strcmp(argv[argno], "--no-expand-ncalls")) {
            expand_ncalls = false;
        } else if (!strncmp(argv[argno], "--file=", 7)) {
            input_file_name = argv[argno]+7;
        } else if (!strcmp(argv[argno], "--verbose")) {
            verbose = true;
        } else if (!strncmp(argv[argno], "--call-depth=",13)) {
            call_depth = strtol(argv[argno]+13, NULL, 0);
        } else {
            std::cerr <<argv0 <<": unknown switch: " <<argv[argno] <<"\n"
                      <<argv0 <<": see --help for more info\n";
            exit(1);
        }
    }

    // Exactly one positional argument (the database) must remain.
    if (argno+1!=argc)
        usage(1);

    SqlDatabase::ConnectionPtr conn = SqlDatabase::Connection::create(argv[argno++]);
    transaction = conn->transaction();
    int64_t cmd_id = CloneDetection::start_command(transaction, argc, argv, "calculating api similarity");

    // Read function pairs from standard input or the file
    FunctionPairs worklist;
    if (input_file_name.empty()) {
        std::cerr <<argv0 <<": reading function pairs worklist from stdin...\n";
        worklist = load_worklist("stdin", stdin);
    } else {
        FILE *in = fopen(input_file_name.c_str(), "r");
        if (NULL==in) {
            std::cerr <<argv0 <<": " <<strerror(errno) <<": " << input_file_name <<"\n";
            exit(1);
        }
        worklist = load_worklist(input_file_name, in);
        fclose(in);
    }
    size_t npairs = worklist.size();
    std::cerr <<argv0 <<": work list has " <<npairs <<" function pair" <<(1==npairs?"":"s") <<"\n";

    // Progress reporting covers one step per function pair
    CloneDetection::Progress progress(npairs);
    progress.force_output(show_progress);

    // Load the computational equivalence classes
    std::map<int,int> norm_map;
    computational_equivalent_classes(norm_map);

    // Prepared once and re-bound for every function pair
    SqlDatabase::StatementPtr insert_stmt = transaction->statement("insert into api_call_similarity"
                                                                   "(func1_id, func2_id, max_similarity, min_similarity,"
                                                                   " ave_similarity, cg_similarity)"
                                                                   " values (?, ?, ?, ?, ?, ?)");

    // Process each function pair
    while (!worklist.empty()) {
        ++progress;
        int func1_id, func2_id;
        boost::tie(func1_id, func2_id) = worklist.shift();
        if (verbose)
            std::cerr <<argv0 <<": func1_id=" <<func1_id <<" func2_id=" <<func2_id <<"\n";

        // Input groups for which both functions have a semantic_fio row (restricted to status 0 rows when
        // ignore_faults is set).
        SqlDatabase::StatementPtr igroup_stmt = transaction->statement("select distinct sem1.igroup_id"
                                                                       " from semantic_fio as sem1 "
                                                                       " join semantic_fio as sem2"
                                                                       "   on sem2.igroup_id = sem1.igroup_id"
                                                                       "   and sem2.func_id = ?"
                                                                       " where sem1.func_id = ? " +
                                                                       std::string(ignore_faults ?
                                                                                   " and sem1.status = 0 and sem2.status = 0" :
                                                                                   "") +
                                                                       " order by sem1.igroup_id");
        igroup_stmt->bind(0, func2_id);
        igroup_stmt->bind(1, func1_id);

        int ncompares = 0;
        double max_api_similarity = 0;
        double min_api_similarity = INT_MAX;            // sentinel, replaced by the first valid comparison
        double ave_api_similarity = 0;                  // running sum until divided by ncompares below
        for (SqlDatabase::Statement::iterator row=igroup_stmt->begin(); row!=igroup_stmt->end(); ++row) {
            int igroup_id = row.get<int>(0);
            double api_similarity = similarity(func1_id, func2_id, igroup_id, semantic_similarity_threshold,
                                               ignore_inline_candidates, ignore_no_compares, call_depth, expand_ncalls, norm_map);

            // Negative results do not contribute to the statistics
            if (api_similarity < 0)
                continue;

            max_api_similarity = std::max(api_similarity, max_api_similarity);
            min_api_similarity = std::min(api_similarity, min_api_similarity);
            ave_api_similarity += api_similarity;
            ncompares++;
        }

        // Without any valid comparison all three statistics are reported as 1.0
        if (ncompares == 0) {
            ave_api_similarity = 1.0;
            max_api_similarity = 1.0;
            min_api_similarity = 1.0;
        } else {
            ave_api_similarity = ave_api_similarity/ncompares;
        }

        // Find call similarity between functions
        double cg_similarity = whole_function_similarity(func1_id, func2_id, norm_map, reachability_graph);

        insert_stmt->bind(0, func1_id);
        insert_stmt->bind(1, func2_id);
        insert_stmt->bind(2, max_api_similarity);
        insert_stmt->bind(3, min_api_similarity);
        insert_stmt->bind(4, ave_api_similarity);
        insert_stmt->bind(5, cg_similarity);

        insert_stmt->execute();
    }

    progress.message("committing changes");
    std::string mesg = "calculated api similarity for "+
                       StringUtility::numberToString(npairs)+" function pair"+(1==npairs?"":"s");
    CloneDetection::finish_command(transaction, cmd_id, mesg);
    transaction->commit();
    progress.clear();
    return 0;
}
// Entry point: generates opt.ngroups input groups and saves them through the database transaction.
//
// Switches (must precede the positional arguments):
//   --help, -h          invoke usage(0)
//   --ngroups=N         number of input groups to generate (required)
//   --collection        give all generated groups the same collection ID
//   --collection=ID     as --collection, using ID as the collection ID
//   --default=QUEUE     queue to which unmodified queues are redirected
//   --memhash[=A,B,C]   add a ValuesGenerator and a RandomGenerator for the IQ_MEMHASH queue
//
// Positional arguments are either QUEUE:NAME[=ARG,...] generator/filter specifications or the database
// connection string; the first argument that is not a specification is taken as the database.
//
// Returns 0 on success.  Exits with status 1 on usage errors, and with status 0 if no queue modifiers were
// given and zero groups were requested.
int
main(int argc, char *argv[])
{
    std::ios::sync_with_stdio();

    // Reduce argv[0] to its final path component and strip a leading "lt-" prefix for use in diagnostics.
    argv0 = argv[0];
    {
        size_t slash = argv0.rfind('/');
        argv0 = slash==std::string::npos ? argv0 : argv0.substr(slash+1);
        if (0==argv0.substr(0, 3).compare("lt-"))
            argv0 = argv0.substr(3);
    }

    // Parse switches
    Switches opt;
    // NOTE(review): ngenerators is incremented below but never read within this function.
    int argno = 1, ngenerators = 0;
    for (/*void*/; argno<argc && '-'==argv[argno][0]; ++argno) {
        if (!strcmp(argv[argno], "--")) {
            ++argno;
            break;
        } else if (!strcmp(argv[argno], "--help") || !strcmp(argv[argno], "-h")) {
            usage(0);
        } else if (!strncmp(argv[argno], "--ngroups=", 10)) {
            opt.ngroups = strtoul(argv[argno]+10, NULL, 0);
            opt.ngroups_set = true;
        } else if (!strcmp(argv[argno], "--collection")) {
            opt.single_collection = true;
        } else if (!strncmp(argv[argno], "--collection=", 13)) {
            opt.collection_id = strtoul(argv[argno]+13, NULL, 0);
            opt.collection_id_set = opt.single_collection = true;
        } else if (!strncmp(argv[argno], "--default=", 10)) {
            opt.default_queue = parse_queuename(argv[argno]+10);
        } else if (!strcmp(argv[argno], "--memhash") || !strncmp(argv[argno], "--memhash=", 10)) {
            // Up to three comma-separated fields may follow the '='.  The first two (defaulting to "0" and
            // "255" when empty) become the arguments of a ValuesGenerator; the third is passed, after a
            // leading "1", to a RandomGenerator.  Both modifiers apply to the IQ_MEMHASH queue.
            std::string s;
            if (char *equal = strchr(argv[argno], '='))
                s = equal+1;
            std::vector<std::string> valargs = StringUtility::split(',', s, 3);
            valargs.resize(3);                          // guarantee all three slots exist, possibly empty
            if (valargs[0].empty())
                valargs[0] = "0";
            if (valargs[1].empty())
                valargs[1] = "255";
            std::vector<std::string> randargs(1, "1");
            randargs.push_back(valargs[2]);
            valargs.pop_back();                         // the third field belongs to the random generator only
            opt.queue_modifiers.push_back(new ValuesGenerator(IQ_MEMHASH, valargs));
            opt.queue_modifiers.push_back(new RandomGenerator(IQ_MEMHASH, randargs));
            ++ngenerators;
        } else {
            std::cerr <<argv0 <<": unrecognized switch: " <<argv[argno] <<"\n"
                      <<"see \"" <<argv0 <<" --help\" for usage info.\n";
            exit(1);
        }
    }
    if (!opt.ngroups_set) {
        std::cerr <<argv0 <<": missing --ngroups switch\n"
                  <<argv0 <<": see --help for more info\n";
        exit(1);
    }

    // Parse non-switch arguments.  The database connection string is allowed to appear anywhere in this list
    // because that simplifies the run-analysis.sh script.
    for (/*void*/; argno<argc; argno++) {
        // A specification has the form QUEUE:NAME[=ARG,...]; it is recognized by a valid queue name before
        // the first colon.
        std::vector<std::string> colon_parts = StringUtility::split(':', argv[argno], 2);
        InputQueueName qn = parse_queuename(colon_parts[0]);
        if (IQ_NONE!=qn && 2==colon_parts.size()) {
            std::vector<std::string> equal_parts = StringUtility::split('=', colon_parts[1], 2);
            std::string gname = equal_parts[0];
            std::vector<std::string> args;
            try {
                if (2==equal_parts.size())
                    args = StringUtility::split(',', equal_parts[1], (size_t)-1, true);
                // Modifiers are appended in command-line order, which is also the order in which they are
                // applied to each input group further down.
                if (0==gname.compare("values")) {
                    opt.queue_modifiers.push_back(new ValuesGenerator(qn, args));
                } else if (0==gname.compare("set") || 0==gname.compare("reset")) {
                    opt.queue_modifiers.push_back(new ResetGenerator(qn, args));
                } else if (0==gname.compare("pad")) {
                    opt.queue_modifiers.push_back(new PaddingGenerator(qn, args));
                    ++ngenerators;
                } else if (0==gname.compare("random")) {
                    opt.queue_modifiers.push_back(new RandomGenerator(qn, args));
                    ++ngenerators;
                } else if (0==gname.compare("copy")) {
                    opt.queue_modifiers.push_back(new CopyGenerator(qn, args));
                    ++ngenerators;
                } else if (0==gname.compare("permute")) {
                    opt.queue_modifiers.push_back(new PermuteFilter(qn, args));
                } else if (0==gname.compare("shuffle")) {
                    opt.queue_modifiers.push_back(new ShuffleFilter(qn, args));
                } else if (0==gname.compare("redirect")) {
                    opt.queue_modifiers.push_back(new RedirectFilter(qn, args));
                } else {
                    std::cerr <<argv0 <<": unknown generator or filter name: " <<gname <<"\n";
                    exit(1);
                }
            } catch (const Exception &e) {
                std::cerr <<argv0 <<": " <<argv[argno] <<": " <<e <<"\n";
                exit(1);
            }
        } else if (transaction==NULL) {
            // First argument that is not a specification: treat it as the database connection string.
            transaction = SqlDatabase::Connection::create(argv[argno])->transaction();
        } else {
            std::cerr <<argv0 <<": unknown generator or filter: " <<argv[argno] <<"\n";
            exit(1);
        }
    }
    if (transaction==NULL) {
        std::cerr <<argv0 <<": missing database URL\n"
                  <<argv0 <<": see --help for more info\n";
        exit(1);
    }
    // Without any modifiers there is nothing to generate: an error if groups were requested, otherwise a
    // successful no-op.
    if (opt.queue_modifiers.empty()) {
        if (opt.ngroups>0) {
            std::cerr <<argv0 <<": no generators specified; all input groups would be empty\n";
            exit(1);
        } else {
            exit(0);
        }
    }

    // Get the list of queues that are affected.
    std::vector<int> queue_modified(IQ_NQUEUES, 0);
    for (size_t i=0; i<opt.queue_modifiers.size(); ++i)
        queue_modified[opt.queue_modifiers[i]->queuename] = 1;

    // Generate the inputs.  New group IDs start one past the largest igroup_id already present in
    // semantic_inputvalues (or at zero when the table is empty).
    int64_t cmd_id = start_command(transaction, argc, argv, "generating input groups");
    int first_id = transaction->statement("select coalesce(max(igroup_id),-1)+1 from semantic_inputvalues")->execute_int();
    if (!opt.collection_id_set)
        opt.collection_id = first_id;
    Progress progress(opt.ngroups);
    for (size_t gi=0; gi<opt.ngroups; ++gi) {
        ++progress;
        int igroup_id = first_id + gi;
        InputGroup igroup;
        // With a single collection every group shares opt.collection_id; otherwise each group is its own
        // collection.
        igroup.set_collection_id(opt.single_collection ? opt.collection_id : igroup_id);

        // All queues initialize redirect to the default queue.  If any generator or filter is applied, then
        // we don't do the default redirect.
        for (size_t qi=0; qi<IQ_NQUEUES; ++qi) {
            if (!queue_modified[qi])
                igroup.queue((InputQueueName)qi).redirect(opt.default_queue);
        }

        // Build the queues by applying every modifier, in order, after reseeding it with the group ID
        for (size_t qmi=0; qmi<opt.queue_modifiers.size(); ++qmi) {
            QueueModifier *qm = opt.queue_modifiers[qmi];
            qm->reseed(igroup_id);
            InputQueue &q = igroup.queue(qm->queuename);
            qm->operator()(q, igroup_id);
        }

        // Save all queues
        igroup.save(transaction, igroup_id, cmd_id);
    }
    progress.clear();

    std::string desc = "generated "+StringUtility::numberToString(opt.ngroups)+" input group"+(1==opt.ngroups?"":"s")+
                       " starting at "+StringUtility::numberToString(first_id);
    // The command is recorded as finished and the transaction committed only when at least one group was
    // generated.
    if (opt.ngroups>0) {
        finish_command(transaction, cmd_id, desc);
        transaction->commit();
    }

    std::cerr <<argv0 <<": " <<desc <<"\n";
    return 0;
}
Exemple #8
0
int
main(int argc, char* argv[])
{
    std::string database;
    size_t l = 4, k = 700;
    size_t hashTableNumBuckets = 13000000, hashTableElementsPerBucket = 20;
    double distBound = 1.;
    double similarity=1.;
    double r = 4.;
    int norm = 1;
    int groupLow=-1;
    int groupHigh=-1;

    //Timing
    struct timeval before, after;
    struct rusage ru_before, ru_after;
    gettimeofday(&before, NULL);
    getrusage(RUSAGE_SELF, &ru_before);

    bool nodelete = false;
    try {
        options_description desc("Allowed options");
        desc.add_options()
            ("help", "Produce a help message")
            ("nodelete", "Do not delete from vectors")
            ("groupLow,g", value< int >(&groupLow), "The lowest count of elements")
            ("groupHigh,G", value< int >(&groupHigh), "The highest count of elements")
            ("database", value< string >(&database), "The sqlite database that we are to use")
            ("hash-function-size,k", value< size_t >(&k), "The number of elements in a single hash function")
            ("hash-table-count,l", value< size_t >(&l), "The number of separate hash tables to create")
            ("buckets,b", value< size_t >(&hashTableNumBuckets),
             "The number of buckets in each hash table (buckets may store multiple elements)")
            ("bucket-size,s", value< size_t >(&hashTableElementsPerBucket),
             "The number of elements that can be stored in each hash table bucket")
            ("similarity,t", value< double >(&similarity), "The similarity threshold that is allowed in a clone pair")
            ("distance,d", value< double >(&distBound), "The maximum distance that is allowed in a clone pair")
            ("interval-size,r", value< double >(&r), "The divisor for the l_2 hash function family")
            ("norm,p", value< int >(&norm), "Exponent in p-norm to use (1 or 2)")
            ;
        variables_map vm;
        store(parse_command_line(argc, argv, desc), vm);
        notify(vm);

        distBound = similarity==1  ? 0.0 : sqrt(2*groupLow*(1.-similarity));

        std::cerr << "similarity " << similarity << " distBound " << distBound << std::endl;

        if (vm.count("help")) {
            cout << desc << endl;
            exit(0);
        }
        if (vm.count("nodelete")) {
            nodelete = true;
        }
        if (vm.count("groupLow") == 0) {
            groupLow = -1;
        }
        if (vm.count("groupHigh") == 0) {
            groupHigh = -1;
        }
        if (database == "") {
            std::cerr << "Missing options. Call as: " << argv[0] << " --database <database-name> [other parameters]" 
                      << std::endl;
            exit(1);
        }
        if (hashTableNumBuckets >= (1ULL << 32)) {
            cerr << "Number of buckets must be less than 2**32" << endl;
            exit (1);
        }
        if (norm != 1 && norm != 2) {
            cerr << "Norm must be either 1 or 2" << endl;
            exit (1);
        }
        if (nodelete == false) {
            cerr << "groupLow: " << groupLow << std::endl;
            cerr << "groupHigh: " << groupHigh << std::endl;
            cerr << "norm: l_" << norm << std::endl;
            cerr << "database: " << database << std::endl;
            cerr << "k: " << k << std::endl;
            cerr << "l: " << l << std::endl;
            cerr << "buckets: " << hashTableNumBuckets << std::endl;
            cerr << "bucket size: " << hashTableElementsPerBucket << std::endl;
            cerr << "distance: " << distBound << std::endl;
            cerr << "r: " << r << std::endl;
        }
    } catch(exception& e) {
        cout << e.what() << "\n";
        exit (1);
    }

    SqlDatabase::TransactionPtr tx = SqlDatabase::Connection::create(database)->transaction();

    scoped_array_with_size<VectorEntry> vectors;
    scoped_array_with_size<scoped_array_with_size<VectorEntry> > duplicateVectors;
  
    //Step to pass to LSH only the vectors that are not part of an exact clone pass
    {
        std::vector<int> functionsThatWeAreInterestedIn;
        scoped_array_with_size<VectorEntry> allVectors;
        map<string, std::vector<int> > internTable;
        read_vector_data(tx, allVectors, functionsThatWeAreInterestedIn, internTable, groupLow, groupHigh, false);

        //Assign to vectors the first element of each hash bucket
        int numberOfBuckets = 0;
        for (map<string, std::vector<int> >::iterator iItr = internTable.begin(); iItr != internTable.end() ; iItr++)
            numberOfBuckets++;

        vectors.allocate(numberOfBuckets);
        duplicateVectors.allocate(numberOfBuckets);

        int indexInVectors=0;
        std::cout << "All is size: " << allVectors.size() << " reduced size is " << vectors.size() << std::endl;
        for (map<string, std::vector<int> >::iterator iItr = internTable.begin(); iItr != internTable.end() ; iItr++) {
            if (iItr->second.size()>1)
                duplicateVectors[indexInVectors].allocate(iItr->second.size()-1);
            for (unsigned int i = 0 ; i < iItr->second.size() ; i++) {
                VectorEntry& allVe = allVectors[iItr->second[i]];
                VectorEntry& ve = i == 0 ? vectors[indexInVectors] : duplicateVectors[indexInVectors][i-1];
                ve.rowNumber  = allVe.rowNumber;
                ve.functionId = allVe.functionId;
                ve.indexWithinFunction = allVe.indexWithinFunction;
                ve.line = allVe.line;
                ve.offset = allVe.offset;
                ve.compressedCounts.allocate(allVe.compressedCounts.size());
                memcpy(ve.compressedCounts.get(), allVe.compressedCounts.get(), allVe.compressedCounts.size());
            }
            indexInVectors++;
        }
    }

    if (vectors[0].compressedCounts.size() == 0) {
        cerr << "Vector slot 0 is empty" << endl;
        abort();
    }

    size_t numVectorElements = getUncompressedSizeOfVector(vectors[0].compressedCounts.get(), vectors[0].compressedCounts.size());
    if (debug_messages) {
        cout << "Vectors have " << numVectorElements << " elements" << endl;
        cout << "Number of vectors fetched is " << vectors.size() << std::endl;
    }
    
    LSHTableBase* table = NULL;
    switch (norm) {
        case 1:
            table = new LSHTable<HammingHashFunctionSet, L1DistanceObject>(vectors, L1DistanceObject(), k, l, r,
                                                                           numVectorElements, hashTableNumBuckets,
                                                                           hashTableElementsPerBucket, distBound);
            break;
        case 2:
            table = new LSHTable<StableDistributionHashFunctionSet, L2DistanceObject>(vectors, L2DistanceObject(), k, l, r,
                                                                                      numVectorElements, hashTableNumBuckets,
                                                                                      hashTableElementsPerBucket, distBound);
            break;
        default:
            cerr << "Bad value for --norm" << endl;
            abort(); // Should have been caught earlier
    }
    assert(table);

    // Setup stuff for postprocessing
    int windowSize = 0;
    int stride = 0;
    get_run_parameters(tx, windowSize, stride);

    if (nodelete == false) {
        cerr << "About to delete from clusters" << endl;
        tx->execute("delete from clusters");
        cerr << "... done" << endl;
        cerr << "About to delete from postprocessed_clusters" << endl;
        tx->execute("delete from postprocessed_clusters");
        cerr << "... done" << endl;
    }
    const size_t numStridesThatMustBeDifferent = windowSize / (stride * 2);

    // Get clusters and postprocess them
    vector<bool> liveVectors(vectors.size(), true);
    size_t clusterNum = 0, postprocessedClusterNum = 0;
    for (size_t i = 0; i < vectors.size(); ++i) { //Loop over vectors
        //Creating potential clusters
        if (!liveVectors[i])
            continue;
        liveVectors[i] = false;
        vector<pair<size_t, double> > clusterElementsRaw = table->query(i); // Pairs are vector number, distance
        vector<pair<uint64_t, double> > clusterElements;
        vector<uint64_t > postprocessedClusterElements;
        clusterElements.push_back(make_pair(i, 0));

        //const VectorEntry& ve = vectors[i];
        for (size_t j = 0; j < clusterElementsRaw.size(); ++j) {
            size_t entry = clusterElementsRaw[j].first;
            //double dist = clusterElementsRaw[j].second;
            // All entries less than i were in previous clusters, so we save an array lookup
            if (entry <= i || !liveVectors[entry]) continue;
            clusterElements.push_back(clusterElementsRaw[j]);
            liveVectors[entry] = false;
        }
        if (clusterElements.size() < 2 && duplicateVectors[i].size() == 0 )
            continue;

        //Insert raw cluster data 
        for (vector<pair<uint64_t, double> >::const_iterator j = clusterElements.begin(); j != clusterElements.end(); ++j) {
            for(size_t k = 0; k < duplicateVectors[j->first].size(); k++) {
                const VectorEntry& ve = duplicateVectors[j->first][k];
                insert_into_clusters(tx, clusterNum, ve.functionId, ve.indexWithinFunction, ve.rowNumber, j->second);
            }

            const VectorEntry& ve = vectors[j->first];
            insert_into_clusters(tx, clusterNum, ve.functionId, ve.indexWithinFunction, ve.rowNumber, j->second);
        }
        if (clusterNum % 10000 == 0 && debug_messages)
            cerr << "cluster " << clusterNum << " has " << clusterElements.size() << " elements" << endl;
        ++clusterNum;

        //Postprocessing does not make sense for inexact clones
        if (similarity != 1.0 )
            continue;

        // This implicitly groups elements in the same function together and order by index_within_function in each function
        // Not needed because of the sort in LSHTable::query() which is on the cluster number:
        // std::sort(clusterElements.begin(), clusterElements.end());

        //The next two variables will we initialized in first run
        size_t lastFunctionId=0;
        size_t lastIndexWithinFunction=0;
        bool first = true;
        std::vector<VectorEntry*> clusterElemPtr;
        for (size_t j = 0; j < clusterElements.size(); ++j) {
            clusterElemPtr.push_back( &vectors[ clusterElements[j].first ]  );
            for (size_t k = 0; k < duplicateVectors[clusterElements[j].first].size(); k++)
                clusterElemPtr.push_back(&duplicateVectors[ clusterElements[j].first ][k]);
        }

        std::sort(clusterElemPtr.begin(), clusterElemPtr.end(), compare_rows );
        for (size_t j = 0; j < clusterElemPtr.size(); ++j) {
            const VectorEntry& ve = *clusterElemPtr[j];
            if (first || ve.functionId != lastFunctionId ||
                ve.indexWithinFunction >= lastIndexWithinFunction + numStridesThatMustBeDifferent) {
                lastFunctionId = ve.functionId;
                lastIndexWithinFunction = ve.indexWithinFunction;
                postprocessedClusterElements.push_back(j);
            }
            first = false;
        }
        if (postprocessedClusterElements.size() >= 2) { //insert post processed data 
            for (vector<uint64_t >::const_iterator j = postprocessedClusterElements.begin();
                 j != postprocessedClusterElements.end(); ++j) {
                const VectorEntry& ve = *clusterElemPtr[*j];
                insert_into_postprocessed_clusters(tx, postprocessedClusterNum, ve.functionId, ve.indexWithinFunction,
                                                   ve.rowNumber, 0);
            }
            if (postprocessedClusterNum % 1000 == 0) {
                cerr << "postprocessed cluster " << postprocessedClusterNum
                     << " has " << postprocessedClusterElements.size() << " elements" << endl;
            }
            ++postprocessedClusterNum;
        }
    }
    cerr << clusterNum << " total cluster(s), " << postprocessedClusterNum << " after postprocessing" << endl;

    gettimeofday(&after, NULL);
    getrusage(RUSAGE_SELF, &ru_after);
    insert_timing(tx, "lsh", groupLow, groupHigh, vectors.size(), k, l, before,after, ru_before, ru_after);

    tx->commit();
    return 0;
}
int
main(int argc, char *argv[])
{
    std::ios::sync_with_stdio();
    argv0 = argv[0];
    {
        size_t slash = argv0.rfind('/');
        argv0 = slash==std::string::npos ? argv0 : argv0.substr(slash+1);
        if (0==argv0.substr(0, 3).compare("lt-"))
            argv0 = argv0.substr(3);
    }

    int argno = 1;
    for (/*void*/; argno<argc && '-'==argv[argno][0]; ++argno) {
        if (!strcmp(argv[argno], "--")) {
            ++argno;
            break;
        } else if (!strcmp(argv[argno], "--help") || !strcmp(argv[argno], "-h")) {
            ::usage(0);
        } else if (!strcmp(argv[argno], "--delete")) {
            opt.delete_old_data = true;
        } else if (!strncmp(argv[argno], "--exclude-functions=", 20)) {
            opt.exclude_functions_table = argv[argno]+20;
        } else if (!strcmp(argv[argno], "--no-delete")) {
            opt.delete_old_data = false;
        } else if (!strncmp(argv[argno], "--relation=", 11)) {
            opt.relation_id = strtol(argv[argno]+11, NULL, 0);
        } else {
            std::cerr <<argv0 <<": unknown switch: " <<argv[argno] <<"\n"
                      <<argv0 <<": see --help for more info\n";
            exit(1);
        }
    };
    if (argno+1!=argc)
        ::usage(1);
    time_t start_time = time(NULL);
    SqlDatabase::ConnectionPtr conn = SqlDatabase::Connection::create(argv[argno++]);
    SqlDatabase::TransactionPtr tx = conn->transaction();

    // Save ourself in the history if we're modifying the database.
    int64_t cmd_id=-1;
    if (opt.delete_old_data)
        cmd_id = CloneDetection::start_command(tx, argc, argv, "clearing funcsim data for relation #"+
                                               StringUtility::numberToString(opt.relation_id), start_time);

    // The 32-func-similarity tool needs this index, so we might as well create it here when we're running serially.  The
    // semantic_outputvalues table can be HUGE depending on how the analysis is configured (i.e., whether it saves output
    // values as a vector or set, whether it saves function calls and system calls, etc.).  Since creating the index could take
    // a few minutes, we'd rather not create it if it alread exists, but PostgreSQL v8 doesn't have a "CREATE INDEX IF NOT
    // EXISTS" ability.  Therefore, try to create the index right away before we make any other changes, and if creation fails
    // then start a new transaction (because the current one is hosed).
    std::cerr <<argv0 <<": creating output group index (could take a while)\n";
    try {
        SqlDatabase::TransactionPtr tx = conn->transaction();
        tx->execute("create index idx_ogroups_hashkey on semantic_outputvalues(hashkey)");
        tx->commit();
    } catch (const SqlDatabase::Exception&) {
        std::cerr <<argv0 <<": idx_ogroups_hashkey index already exists; NOT dropping and recreating\n";
    }

    // Delete old data.
    if (opt.delete_old_data)
        tx->statement("delete from semantic_funcsim where relation_id = ?")->bind(0, opt.relation_id)->execute();

    // Get the list of functions that should appear in the worklist.
    std::cerr <<argv0 <<": obtaining function list\n";
    std::string stmt1 = "create temporary table tmp_tested_funcs as"
                        " select distinct fio.func_id as func_id"
                        " from semantic_fio as fio";
    if (!opt.exclude_functions_table.empty()) {
        std::vector<std::string> parts = StringUtility::split('.', opt.exclude_functions_table, 2, true);
        if (parts.size()<2)
            parts.push_back("func_id");
        stmt1 += " left join " + parts.front() + " as exclude"
                 " on fio.func_id = exclude." + parts.back() +
                 " where exclude." + parts.back() + " is null";
    }
    tx->execute(stmt1);

    // Create pairs of function IDs for those functions which have been tested and for which no similarity measurement has been
    // computed.  (FIXME: We should probably recompute similarity that might have changed due to rerunning tests or running the
    // same function but with more input groups. [Robb P. Matzke 2013-06-19])
    std::cerr <<argv0 <<": creating work list\n";
    SqlDatabase::StatementPtr stmt2 = tx->statement("select distinct f1.func_id as func1_id, f2.func_id as func2_id"
                                                    " from tmp_tested_funcs as f1"
                                                    " join tmp_tested_funcs as f2 on f1.func_id < f2.func_id"
                                                    " except"
                                                    " select func1_id, func2_id from semantic_funcsim as sim"
                                                    " where sim.relation_id = ?");
    stmt2->bind(0, opt.relation_id);
    for (SqlDatabase::Statement::iterator row=stmt2->begin(); row!=stmt2->end(); ++row)
        std::cout <<row.get<int>(0) <<"\t" <<row.get<int>(1) <<"\n";

    if (cmd_id>=0)
        CloneDetection::finish_command(tx, cmd_id, "cleared funcsim table for relation #"+
                                       StringUtility::numberToString(opt.relation_id));

    tx->commit();
    return 0;
}