Ejemplo n.º 1
0
void
compute_percent_similarity_statistics(double bucket_size, double increment,  SqlDatabase::TransactionPtr transaction)
{
    int num_pairs = transaction->statement("select count(*) from semantic_funcsim")->execute_int();
    transaction->execute("drop table IF EXISTS fr_percent_similar");
    transaction->execute("create table fr_percent_similar(similarity_low double precision, similarity_middle double precision,"
                         " similarity_high double precision, percent double precision, num_matches integer);");

    SqlDatabase::StatementPtr pecent_similar_stmt = transaction->statement("insert into fr_percent_similar"
            // 0              1                  2
            "(similarity_low, similarity_middle, similarity_high,"
            // 3       4
            " percent, num_matches) "
            " values (?, ?, ?, ?, ?)");
    for (double cur_bucket = 0.0; cur_bucket <= 1.0+bucket_size; cur_bucket+=increment) {
        int num_matches = transaction->statement("select count(*) from semantic_funcsim where "
                          " similarity >= " +
                          boost::lexical_cast<std::string>(cur_bucket - bucket_size) +
                          " and similarity < " +
                          boost::lexical_cast<std::string>(cur_bucket + bucket_size))->execute_int();
        pecent_similar_stmt->bind(0, cur_bucket - bucket_size < 0 ? 0 : cur_bucket - bucket_size);
        pecent_similar_stmt->bind(1, cur_bucket);
        pecent_similar_stmt->bind(2, cur_bucket + bucket_size >= 1.0 ? 1.0 : cur_bucket + bucket_size);
        pecent_similar_stmt->bind(3, num_pairs > 0 ? ((double) num_matches*100.0)/num_pairs : 0);
        pecent_similar_stmt->bind(4, num_matches);
        pecent_similar_stmt->execute();
    }
}
Ejemplo n.º 2
0
// Program entry point.  Command line, as enforced below: [SWITCHES] [--] DATABASE SPECIMEN
//
// Recognized switches are --help/-h, --link and --no-link; any other argument starting with '-' is reported on
// standard error and the program exits with status 1.  Exactly two positional arguments must remain after the
// switches, otherwise ::usage(1) is called.
//
// The syscalls_made table is dropped and recreated, the specimen is parsed, its functions are passed through
// create_reachability_graph() and add_calls_to_syscalls_to_db(), analyze_data() is run, and the transaction is
// committed.
int
main(int argc, char *argv[])
{
    std::ios::sync_with_stdio();

    // Reduce argv[0] to its final path component and strip a leading "lt-" (presumably a libtool wrapper prefix).
    argv0 = argv[0];
    {
        size_t slash = argv0.rfind('/');
        argv0 = slash==std::string::npos ? argv0 : argv0.substr(slash+1);
        if (0==argv0.substr(0, 3).compare("lt-"))
            argv0 = argv0.substr(3);
    }

    int argno = 1;
    bool link = false;

    for (/*void*/; argno<argc && '-'==argv[argno][0]; ++argno) {
        std::cout << argv[argno] << std::endl;          // echo each switch as it is processed
        if (!strcmp(argv[argno], "--")) {
            ++argno;
            break;
        } else if (!strcmp(argv[argno], "--help") || !strcmp(argv[argno], "-h")) {
            ::usage(0);
        } else if (!strcmp(argv[argno], "--link")) {
            link = true;
        } else if (!strcmp(argv[argno], "--no-link")) {
            link = false;
        } else {
            std::cerr <<argv0 <<": unrecognized switch: " <<argv[argno] <<"\n"
                      <<"see \"" <<argv0 <<" --help\" for usage info.\n";
            exit(1);
        }
    }
    if (argno+2!=argc)
        ::usage(1);

    // First positional argument: the database to connect to.
    std::string db_name(argv[argno++]);
    std::cout << "Connecting to db:" << db_name << std::endl;
    SqlDatabase::ConnectionPtr conn = SqlDatabase::Connection::create(db_name);
    transaction = conn->transaction();

    transaction->execute("drop table if exists syscalls_made;");
    transaction->execute("create table syscalls_made (caller integer references semantic_functions(id),"
                         " syscall_id integer, syscall_name text)");

    // Second positional argument: the specimen.  (This message used to be labelled "database name", which was
    // misleading because the value printed is the specimen.)
    std::string specimen_name = argv[argno++];
    std::cout << "specimen name is : " << specimen_name << std::endl;

    // Parse the binary specimen
    SgAsmInterpretation *interp = open_specimen(specimen_name, argv0, link);
    assert(interp!=NULL);

    // Figure out what functions need to be added to the database.
    std::vector<SgAsmFunction*> all_functions = SageInterface::querySubTree<SgAsmFunction>(interp);
    // NOTE(review): G is never released here; ownership of the returned graph is not visible from this function.
    DirectedGraph* G = create_reachability_graph(all_functions, interp);
    add_calls_to_syscalls_to_db(transaction, G, all_functions);
    analyze_data(transaction);
    transaction->commit();
    return 0;
}
Ejemplo n.º 3
0
// Create the temporary table tmp_src.  It receives the distinct rows of semantic_sources that belong to the same file
// as some row of tmp_insns and whose line number is within ten lines (inclusive) of that row's src_line.
static void
gather_source_code(const SqlDatabase::TransactionPtr &tx)
{
    const std::string sql = "create temporary table tmp_src as"
                            "  select distinct src.*"
                            "    from tmp_insns as insn"
                            "    join semantic_sources as src"
                            "      on insn.src_file_id=src.file_id"
                            "      and src.linenum >= insn.src_line-10"
                            "      and src.linenum <= insn.src_line+10";
    tx->execute(sql);
}
Ejemplo n.º 4
0
Archivo: callLSH.C Proyecto: 8l/rose
static void
postprocess(const SqlDatabase::TransactionPtr &tx)
{
    int windowSize = tx->statement("select window_size from run_parameters limit 1")->execute_int();
    int stride = tx->statement("select stride from run_parameters limit 1")->execute_int();
    assert(windowSize != 0);
    assert(stride != 0);

    cerr << "About to delete from postprocessed_clusters" << endl;
    tx->execute("delete from postprocessed_clusters");
    cerr << "... done" << endl;

    cerr << "About to postprocess" << endl;
    SqlDatabase::StatementPtr cmd = tx->statement("select cluster, function_id, index_within_function, vectors_row"
                                                  " from clusters order by cluster, function_id, index_within_function");
    SqlDatabase::StatementPtr insertCmd = tx->statement("insert into postprocessed_clusters"
                                                        " select * from clusters where row_number = ?");
    const size_t numStridesThatMustBeDifferent = windowSize / (stride * 2);
    string last_cluster = "";
    string last_func_id = "";
    size_t last_index_within_function = 0;
    vector<string> rows_in_this_cluster;
    bool first = true;
    for (SqlDatabase::Statement::iterator postproc_reader=cmd->begin(); postproc_reader!=cmd->end(); ++postproc_reader) {
        string cluster = postproc_reader.get<std::string>(0);
        string function_id = postproc_reader.get<std::string>(1);
        size_t index_within_function = postproc_reader.get<size_t>(2);
        string cluster_row_number = postproc_reader.get<std::string>(3);
        bool differentFunction = cluster != last_cluster || function_id != last_func_id;
        bool endingCluster = differentFunction;
        bool beginningNewCluster = first || differentFunction;
        first = false;
        if (endingCluster) {
            if (rows_in_this_cluster.size() > 1) { // Skip clusters that have only one element left
                for (size_t i = 0; i < rows_in_this_cluster.size(); ++i) {
                    insertCmd->bind(0, rows_in_this_cluster[i]);
                    insertCmd->execute();
                }
            }
        }
        if (beginningNewCluster) {
            last_cluster = cluster;
            last_func_id = function_id;
            last_index_within_function = index_within_function;
            rows_in_this_cluster.clear();
        }
        bool keep = beginningNewCluster || (index_within_function >= last_index_within_function + numStridesThatMustBeDifferent);
        if (keep) {
            last_index_within_function = index_within_function;
            rows_in_this_cluster.push_back(cluster_row_number);
        }
    }
    cerr << "... done" << endl;
}
Ejemplo n.º 5
0
// Build the temporary table tmp_insns.  It receives every row of semantic_instructions whose func_id is either the
// function being listed (func_id) or a function referenced by one of the supplied events.
static void
gather_instructions(const SqlDatabase::TransactionPtr tx, int func_id, const Events &events)
{
    // IDs are collected as strings; the set removes duplicates before they are joined into the SQL "in" list.
    std::set<std::string> wanted;
    wanted.insert(StringUtility::numberToString(func_id));
    Events::const_iterator ev = events.begin();
    while (ev != events.end()) {
        wanted.insert(StringUtility::numberToString(ev->second.func_id));
        ++ev;
    }

    const std::string id_list = StringUtility::join_range(", ", wanted.begin(), wanted.end());
    tx->execute(std::string("create temporary table tmp_insns as"
                            " select * from semantic_instructions"
                            " where func_id in (") + id_list + ")");
}
Ejemplo n.º 6
0
// Create the temporary table tmp_events with the same columns as semantic_fio_trace.  When opt.show_trace is set, the
// table is filled with the trace rows of function func_id, restricted to the input groups listed in opt.traces if that
// set is not empty.  Otherwise the table is left empty.
static void
gather_events(const SqlDatabase::TransactionPtr &tx, int func_id)
{
    // "limit 0" copies the column layout without copying any rows.
    tx->execute("create temporary table tmp_events as select * from semantic_fio_trace limit 0");
    if (!opt.show_trace)
        return;

    std::vector<std::string> group_ids;
    std::set<int>::const_iterator trace = opt.traces.begin();
    while (trace != opt.traces.end()) {
        group_ids.push_back(StringUtility::numberToString(*trace));
        ++trace;
    }

    std::string insert_sql = "insert into tmp_events select * from semantic_fio_trace where func_id = ?";
    if (!group_ids.empty()) {
        insert_sql += " and igroup_id in (";
        insert_sql += StringUtility::join(", ", group_ids);
        insert_sql += ")";
    }

    SqlDatabase::StatementPtr insert_stmt = tx->statement(insert_sql);
    insert_stmt->bind(0, func_id);
    insert_stmt->execute();
}
Ejemplo n.º 7
0
/****************************************************************************************
 *
 *
 * Compute how mean similar functions are to all other functions.
 *
 * The result is inserted into fr_mean_similarity on the test db, and fr_mean_similar on
 * the global db.
 *
 */
void
compute_mean_similarity_statistics(double bucket_size, double increment, SqlDatabase::TransactionPtr transaction)
{
    int num_pairs = transaction->statement("select count(*) from semantic_funcsim")->execute_int();


    transaction->execute("drop table IF EXISTS fr_mean_similarity;");
    transaction->execute("create table fr_mean_similarity as  select coalesce(sum(sf.similarity)/"+
                         boost::lexical_cast<std::string>(num_pairs)+
                         " ,0) as similarity,  ttf.id as func_id from semantic_funcsim as sf"+
                         " join semantic_functions as ttf on ttf.id = sf.func1_id  OR ttf.id = sf.func2_id GROUP BY ttf.id");

    transaction->execute("drop table IF EXISTS fr_mean_similar");
    transaction->execute("create table fr_mean_similar(similarity_low double precision, similarity_middle double precision,"
                         " similarity_high double precision, percent double precision);");

    SqlDatabase::StatementPtr mean_similar_stmt = transaction->statement("insert into fr_mean_similar"
            // 0              1                  2
            "(similarity_low, similarity_middle, similarity_high,"
            // 3
            " percent) "
            " values (?, ?, ?, ?)");

    for (double cur_bucket = 0.0; cur_bucket <= 1.0+bucket_size; cur_bucket+=increment) {
        int num_matches = transaction->statement("select count(*) from fr_mean_similarity where "
                          " similarity >= " +
                          boost::lexical_cast<std::string>(cur_bucket - bucket_size) +
                          " and similarity < " +
                          boost::lexical_cast<std::string>(cur_bucket + bucket_size))->execute_int();
        mean_similar_stmt->bind(0, cur_bucket - bucket_size < 0 ? 0 : cur_bucket - bucket_size);
        mean_similar_stmt->bind(1, cur_bucket);
        mean_similar_stmt->bind(2, cur_bucket + bucket_size >= 1.0 ? 1.0 : cur_bucket + bucket_size);
        mean_similar_stmt->bind(3, num_pairs > 0 ? ((double) num_matches*100.0)/num_pairs : 0);
        mean_similar_stmt->execute();
    }
}
Ejemplo n.º 8
0
Archivo: callLSH.C Proyecto: 8l/rose
// Run the exact clone detection program `Exec` as a child process and wait for it to finish.
//
// Before forking, detection_parameters is reset to a single row with similarity_threshold 1.0 and false_negative_rate 0.
// The child is exec'd as: Exec --database databaseName.  A fork failure terminates the program with status 1; a
// waitpid failure aborts.  The raw wait status of the child is printed to standard error.
static void
callExact(const SqlDatabase::TransactionPtr &tx, const std::string databaseName, const string& Exec)
{
    // FIXME: We can't pass parameters to the exec'd process this way because the parent's SQL statements are
    // being executed in a transaction -- they won't be visible in the child. [Robb P. Matzke 2013-08-12]
    tx->execute("delete from detection_parameters");
    tx->statement("insert into detection_parameters (similarity_threshold, false_negative_rate) values (?, ?)")
        ->bind(0, 1.0)
        ->bind(1, 0)
        ->execute();

    std::cout << "Start running exact clone detection" << std::endl;
    pid_t p = fork();
    if (p == -1) { // Error
        perror("fork: ");
        exit (1);
    }
    if (p == 0) { // Child
        vector<char*> args;
        args.push_back(strdup(Exec.c_str()));
        args.push_back(strdup("--database"));
        args.push_back(strdup(databaseName.c_str()));

        // Build the log message before the null terminator is appended: streaming a null char* is undefined behavior.
        ostringstream outStr;
        for (vector<char*>::iterator iItr = args.begin(); iItr != args.end(); ++iItr)
            outStr << *iItr << " ";
        std::cout << "Calling " << outStr.str() << std::endl;

        args.push_back(0);                              // argv terminator required by execv
        execv(Exec.c_str(), &args[0]);

        // Only reached when execv failed.  Use _exit so the child does not run the atexit handlers and static
        // destructors it inherited from the parent.
        perror("execv: ");
        _exit (1);
    } else { // Parent
        int status;
        if (waitpid(p, &status, 0) == -1) {
            perror("waitpid");
            abort();
        }
        cerr << "Status: " << status << endl;
        cerr << "Done waiting for Exact Clone Detection" << endl;
    }
}
Ejemplo n.º 9
0
// Execute the SQL text held in syntactic_schema_create within the given transaction.
void
createDatabases(const SqlDatabase::TransactionPtr &tx)
{
    extern const char *syntactic_schema_create; // defined in machine-generated SyntacticSchema.C
    tx->execute(syntactic_schema_create);
}
Ejemplo n.º 10
0
Archivo: callLSH.C Proyecto: 8l/rose
// Run LSH-based clone detection over the vectors table, one group of vectors at a time.
//
// detection_parameters is reset to a single row holding similarity_threshold and the false negative rate (0.01, or 0
// when the threshold is exactly 1.0).  The vectors are partitioned by sum_of_counts into the ranges returned by
// computeranges(); a range whose `high` is -1 has no upper bound.  The largest group is processed first and the
// remaining groups follow in index order; groups with fewer than two vectors are skipped.  Each processed group is
// handed to executeLSHCode() when norm is 3 and to executeLSHCodeLLNL() otherwise.
//
// A parameter file path inside a freshly created temporary directory is passed to those helpers; the file and the
// directory are removed before returning.  Failure to create the directory terminates the program with status 1.
static void
callLSH(const SqlDatabase::TransactionPtr &tx, const std::string databaseName, double similarity_threshold, const string& Exec,
        int norm, size_t hash_function_size, size_t hash_table_count)
{
    double distance = sqrt((1. - similarity_threshold) * 50.);
    double false_negative_rate = ( similarity_threshold != 1.0) ? 0.0100 : 0;
    vector<CloneRange> ranges = computeranges(distance, 50, 100000);
    int maxNumElementsInGroup = -1;
    int maxNumElementIdx = -1;

    // FIXME: We can't pass parameters to the exec'd process this way because the parent's SQL statements are
    // being executed in a transaction -- they won't be visible in the child. [Robb P. Matzke 2013-08-12]
    tx->execute("delete from detection_parameters");
    tx->statement("insert into detection_parameters (similarity_threshold, false_negative_rate) values (?, ?)")
        ->bind(0, similarity_threshold)
        ->bind(1, false_negative_rate)
        ->execute();

    // Count the vectors in every range and remember which range holds the most.
    vector<int> groupSizes(ranges.size(), 0);
    std::cout << "Looking for the biggest group" << std::endl;
    for (size_t i = 0; i < ranges.size(); ++i) {
        std::string sql = std::string("select count(*) from vectors where sum_of_counts >= ?") +
                          (ranges[i].high != -1 ? " and sum_of_counts <= ?" : "");
        SqlDatabase::StatementPtr cmd = tx->statement(sql);
        cmd->bind(0, ranges[i].low);
        if (ranges[i].high != -1)
            cmd->bind(1, ranges[i].high);
        int numElementsInGroup = cmd->execute_int();
        groupSizes[i] = numElementsInGroup;
        std::cerr << "The current group from " << ranges[i].low << " to " << ranges[i].high
                  << " is of size " << numElementsInGroup << std::endl;
        if (numElementsInGroup > maxNumElementsInGroup) {
            maxNumElementsInGroup = numElementsInGroup;
            maxNumElementIdx = i;
        }
    }

    // Without any range there is no biggest group; indexing ranges[-1] below would be out of bounds.
    if (maxNumElementIdx < 0) {
        std::cerr << "No groups to process" << std::endl;
        return;
    }

    std::cout << "Biggest group found " << ranges[maxNumElementIdx].low << " " << ranges[maxNumElementIdx].high << std::endl;
    char tempDirName[] = "/tmp/paramdirXXXXXX";
    if (!mkdtemp(tempDirName)) {
        perror("mkdtemp: ");
        exit (1);
    }
    // The parameter file lives inside the directory just created.  (A hard-coded path used to overwrite this value,
    // which pointed the helpers at a directory this function never created.)
    string paramFileName = string(tempDirName) + "/params";
    std::cout << "Number of groups :" << ranges.size() << std::endl;

    for (int i = 0; i < (int)ranges.size(); ++i) {
        // Visit the biggest group first, then the others in their original order.
        size_t group = (i == 0) ? maxNumElementIdx : (i <= maxNumElementIdx) ? i - 1 : i;
        if (groupSizes[group] > 1) {
            std::cout << "Executing LSH code low " << ranges[group].low  
                      << " high " << ranges[group].high << " group  " << group << " size " << groupSizes[group] << std::endl;
            if(norm == 3) {
                executeLSHCode(tx, databaseName, Exec, paramFileName, ranges[group]);
            } else {
                executeLSHCodeLLNL(tx, databaseName, Exec, paramFileName, ranges[group], norm, similarity_threshold,
                                   false_negative_rate, groupSizes[group]);
            }
        }
    }
    unlink(paramFileName.c_str());
    rmdir(tempDirName);
}
Ejemplo n.º 11
0
// Return `part` as a percentage of `whole`, or 0 when `whole` is zero.
static double
percent_of(int part, int whole)
{
    return whole != 0 ? 100 * (static_cast<double>(part) / whole) : 0.0;
}

// Build the call/syscall summary tables and print three reports to standard output.
//
// Tables dropped and recreated: functions_cg_accumulate, functions_rg_accumulate, syscalls_cg_accumulate,
// syscalls_rg_accumulate, syscalls_fio_accumulate and syscall_statistics.  The reports cover (1) all functions,
// (2) functions with calls, and (3) functions with calls that have at least 100 instructions; every report excludes
// functions whose name ends in "@plt".  All statements run in the transaction `tx`.
void
analyze_data(SqlDatabase::TransactionPtr tx)
{
    // NOTE(review): functions_cg_accumulate is built from semantic_rg and functions_rg_accumulate from semantic_cg,
    // whereas the syscalls_* tables below pair cg with semantic_cg and rg with semantic_rg.  This looks swapped but
    // is left unchanged because the reports depend on it -- confirm the intent.
    tx->execute("drop table IF EXISTS functions_cg_accumulate;");
    tx->execute("create table functions_cg_accumulate as select sm.caller, count(sm.callee) as num_calls,"
                " sf.ninsns, sf.name from semantic_rg as sm join semantic_functions as sf on sf.id=sm.caller "
                " group by sm.caller, sf.ninsns, sf.name;");

    tx->execute("drop table IF EXISTS functions_rg_accumulate;");
    tx->execute("create table functions_rg_accumulate as select sm.caller, count(sm.callee) as num_calls,"
                " sf.ninsns, sf.name from semantic_cg as sm join semantic_functions as sf on sf.id=sm.caller "
                " group by sm.caller, sf.ninsns, sf.name;");

    tx->execute("drop table IF EXISTS syscalls_cg_accumulate;");
    tx->execute("create table syscalls_cg_accumulate as select sm.caller, count(sm.callee) as num_calls,"
                " sf.ninsns, sf.name from semantic_cg as sm join semantic_functions as sf on sf.id=sm.caller "
                " join syscalls_made as sysm on sysm.caller=sm.callee "
                " group by sm.caller, sf.ninsns, sf.name;");

    tx->execute("drop table IF EXISTS syscalls_rg_accumulate;");
    tx->execute("create table syscalls_rg_accumulate as select sm.caller, count(sm.callee) as num_calls,"
                " sf.ninsns, sf.name from semantic_rg as sm join semantic_functions as sf on sf.id=sm.caller "
                " join syscalls_made as sysm on sysm.caller=sm.callee "
                " group by sm.caller, sf.ninsns, sf.name;");

    tx->execute("drop table IF EXISTS syscalls_fio_accumulate");
    tx->execute("create table syscalls_fio_accumulate as  "
                " select fio.caller_id, count(fio.callee_id) as num_calls, sf.ninsns, sf.name from syscalls_made as sm "
                " join semantic_fio_calls as fio on fio.callee_id = sm.caller "
                " join semantic_functions as sf on sf.id=fio.caller_id "
                " group by fio.caller_id, sf.ninsns, sf.name");
    {
        // All functions that are not stub functions for a dynamic library call
        std::cout << "\n\n\n################# COMPARING ALL FUNCTIONS \n\n";

        int num_functions    = tx->statement("select count(*) from functions_rg_accumulate where name NOT LIKE '%@plt'")
                               ->execute_int();
        int num_cg_syscalls  = tx->statement("select count(*) from syscalls_cg_accumulate where name NOT LIKE '%@plt'; ")
                               ->execute_int();
        int num_rg_syscalls  = tx->statement("select count(*) from syscalls_rg_accumulate where name NOT LIKE '%@plt'; ")
                               ->execute_int();
        int path_calls       = tx->statement("select count(distinct fio.caller_id) from syscalls_made as sm"
                                             " join semantic_fio_calls as fio on fio.callee_id = sm.caller"
                                             " join semantic_functions as sf on fio.caller_id=sf.id"
                                             " where sf.name not like '%@plt'")
                               ->execute_int();
        int total_num_functions = tx->statement("select count(*) from semantic_functions where name not like '%@plt'")
                                  ->execute_int();

        std::cout << std::fixed << std::setprecision(2);
        std::cout << "num functions with calls:        " << num_functions   << std::endl;
        std::cout << "num functions:                   " << total_num_functions   << std::endl;

        std::cout << "num callgraph syscalls:          " << num_cg_syscalls
                  << " fraction " << percent_of(num_cg_syscalls, num_functions) << std::endl;
        std::cout << "num reachability graph syscalls: " << num_rg_syscalls
                  << " fraction " << percent_of(num_rg_syscalls, num_functions) << std::endl;
        std::cout << "path calls:                      " << path_calls
                  << " fraction " << percent_of(path_calls, num_functions)      << std::endl;
    }

    {
        std::cout << "\n\n\n################# COMPARING FUNCTIONS WITH CALLS\n\n";
        int num_functions    = tx->statement("select count(distinct caller)"
                                             " from functions_rg_accumulate where name not like '%@plt'")->execute_int();
        int num_cg_syscalls  = tx->statement("select count(distinct caller) from syscalls_cg_accumulate"
                                             " where name not like '%@plt'; ")->execute_int();
        int num_rg_syscalls  = tx->statement("select count(distinct caller) from syscalls_rg_accumulate"
                                             " where name not like '%@plt'; ")->execute_int();
        int path_calls       = tx->statement("select count(distinct fio.caller_id) from semantic_fio_calls as fio"
                                             " join syscalls_made as sm on fio.callee_id = sm.caller"
                                             " join semantic_functions as sf on sf.id=fio.caller_id"
                                             " where sf.name not like '%@plt'")->execute_int();
        int total_num_functions = tx->statement("select count(*) from semantic_functions"
                                                " where name not like '%@plt'")->execute_int();

        std::cout << std::fixed << std::setprecision(2);
        std::cout << "num functions with calls:         " << num_functions   << std::endl;
        std::cout << "num functions:                    " << total_num_functions   << std::endl;
        std::cout << "fraction of functions with calls: " << percent_of(num_functions, total_num_functions) << std::endl;
        std::cout << "num callgraph syscalls:           " << num_cg_syscalls
                  << " fraction " << percent_of(num_cg_syscalls, num_functions) << std::endl;
        std::cout << "num reachability graph syscalls:  "  << num_rg_syscalls
                  << " fraction " << percent_of(num_rg_syscalls, num_functions) << std::endl;
        std::cout << "path calls:                       " << path_calls
                  << " fraction " << percent_of(path_calls, num_functions)      << std::endl;
    }

    {
        std::cout << "\n\n\n################# COMPARING FUNCTIONS WITH CALLS WITH MORE THAN 100 INSTRUCTIONS\n\n";
        int num_functions    = tx->statement("select count(distinct fr.caller) from functions_rg_accumulate as fr"
                                             " join semantic_functions as sf on sf.id=fr.caller"
                                             " where sf.name not like '%@plt' and sf.ninsns >=100")->execute_int();
        int num_cg_syscalls  = tx->statement("select count(distinct sc.caller) from syscalls_cg_accumulate as sc"
                                             " join semantic_functions as sf on sf.id=sc.caller"
                                             " where sf.name not like '%@plt' and sf.ninsns >=100")->execute_int();
        int num_rg_syscalls  = tx->statement("select count(distinct sr.caller) from syscalls_rg_accumulate as sr"
                                             " join semantic_functions as sf on sf.id=sr.caller"
                                             " where sf.name NOT LIKE '%@plt' and sf.ninsns >= 100")->execute_int();
        int path_calls       = tx->statement("select count(distinct fio.caller_id) from semantic_fio_calls as fio"
                                             " join syscalls_made as sm on fio.callee_id = sm.caller"
                                             " join semantic_functions as sf on sf.id=fio.caller_id"
                                             " where sf.name not like '%@plt' and sf.ninsns >= 100")->execute_int();
        int total_num_functions = tx->statement("select count(*) from semantic_functions"
                                                " where ninsns >= 100 and name not like '%@plt'")->execute_int();

        std::cout << std::fixed << std::setprecision(2);
        std::cout << "num functions with calls:         " << num_functions   << std::endl;
        std::cout << "num functions:                    " << total_num_functions   << std::endl;
        std::cout << "fraction of functions with calls: " << percent_of(num_functions, total_num_functions) << std::endl;
        std::cout << "num callgraph syscalls:           " << num_cg_syscalls
                  << " fraction " << percent_of(num_cg_syscalls, num_functions) << std::endl;
        std::cout << "num reachability graph syscalls:  " << num_rg_syscalls
                  << " fraction " << percent_of(num_rg_syscalls, num_functions) << std::endl;
        std::cout << "path calls:                       " << path_calls
                  << " fraction " << percent_of(path_calls, num_functions)      << std::endl;
    }

    // These two used to call statement(), which only prepares the SQL; execute() is needed for the table to be
    // dropped and recreated.
    tx->execute("drop table IF EXISTS syscall_statistics;");
    tx->execute("create table syscall_statistics as select distinct rg.caller, sm.syscall_id, sm.syscall_name"
                " from syscalls_made as sm"
                " join semantic_rg as rg on rg.callee = sm.caller");
}
Ejemplo n.º 12
0
int
main(int argc, char *argv[])
{
    std::ios::sync_with_stdio();
    argv0 = argv[0];
    {
        size_t slash = argv0.rfind('/');
        argv0 = slash==std::string::npos ? argv0 : argv0.substr(slash+1);
        if (0==argv0.substr(0, 3).compare("lt-"))
            argv0 = argv0.substr(3);
    }

    Switches opt;
    int argno = 1;
    for (/*void*/; argno<argc && '-'==argv[argno][0]; ++argno) {
        if (!strcmp(argv[argno], "--")) {
            ++argno;
            break;
        } else if (!strcmp(argv[argno], "--help") || !strcmp(argv[argno], "-h")) {
            ::usage(0);
        } else if (!strncmp(argv[argno], "--entry=", 8)) {
            opt.entry_vas.insert(strtoull(argv[argno]+8, NULL, 0));
        } else if (!strcmp(argv[argno], "--file=list") || !strcmp(argv[argno], "--files=list")) {
            opt.list_files = true;
        } else if (!strncmp(argv[argno], "--file=", 7) || !strncmp(argv[argno], "--files=", 8)) {
            std::vector<std::string> ids = StringUtility::split(",", strchr(argv[argno], '=')+1, (size_t)-1, true);
            for (size_t i=0; i<ids.size(); ++i) {
                const char *s = ids[i].c_str();
                char *rest;
                errno = 0;
                int id = strtoul(s, &rest, 0);
                if (errno || rest==s || *rest) {
                    std::cerr <<argv0 <<": invalid file ID: " <<ids[i] <<"\n";
                    exit(1);
                }
                opt.files.insert(id);
            }
        } else if (!strncmp(argv[argno], "--function=", 11) || !strncmp(argv[argno], "--functions=", 12)) {
            std::vector<std::string> ids = StringUtility::split(",", strchr(argv[argno], '=')+1, (size_t)-1, true);
            if (ids.size()==1 && isalpha(ids[0][0]) && ids[0].find_first_of('.')!=std::string::npos) {
                std::vector<std::string> words = StringUtility::split(".", ids[0]);
                if (words.size()!=2 ||
                    !SqlDatabase::is_valid_table_name(words[0]) || !SqlDatabase::is_valid_table_name(words[1])) {
                    std::cerr <<argv0 <<": --function switch needs either IDs or a database TABLE.COLUMN\n";
                    exit(1);
                }
                opt.function_table = words[0];
                opt.function_column = words[1];
            } else {
                for (size_t i=0; i<ids.size(); ++i) {
                    const char *s = ids[i].c_str();
                    char *rest;
                    errno = 0;
                    int id = strtoul(s, &rest, 0);
                    if (errno || rest==s || *rest) {
                        std::cerr <<argv0 <<": invalid function ID: " <<ids[i] <<"\n";
                        exit(1);
                    }
                    opt.functions.insert(id);
                }
            }
        } else if (!strncmp(argv[argno], "--first-fuzz=", 13)) {
            opt.first_fuzz = strtoul(argv[argno]+13, NULL, 0);
        } else if (!strncmp(argv[argno], "--name=", 7)) {
            opt.names.insert(argv[argno]+7);
        } else if (!strncmp(argv[argno], "--nfuzz=", 8)) {
            opt.nfuzz = strtoul(argv[argno]+8, NULL, 0);
            opt.nfuzz_set = true;
        } else if (!strncmp(argv[argno], "--size=", 7)) {
            opt.ninsns = strtoul(argv[argno]+7, NULL, 0);
        } else if (!strcmp(argv[argno], "--specimen=list") || !strcmp(argv[argno], "--specimens=list")) {
            opt.list_specimens = true;
        } else if (!strncmp(argv[argno], "--specimen=", 11) || !strncmp(argv[argno], "--specimens=", 12)) {
            std::vector<std::string> ids = StringUtility::split(",", strchr(argv[argno], '=')+1, (size_t)-1, true);
            for (size_t i=0; i<ids.size(); ++i) {
                const char *s = ids[i].c_str();
                char *rest;
                errno = 0;
                int id = strtoul(s, &rest, 0);
                if (errno || rest==s || *rest) {
                    std::cerr <<argv0 <<": invalid specimen ID: " <<ids[i] <<"\n";
                    exit(1);
                }
                opt.specimens.insert(id);
            }
        } else {
            std::cerr <<argv0 <<": unrecognized switch: " <<argv[argno] <<"\n"
                      <<"see \"" <<argv0 <<" --help\" for usage info.\n";
            exit(1);
        }
    }
    if (argno+1!=argc)
        ::usage(1);
    SqlDatabase::TransactionPtr tx = SqlDatabase::Connection::create(argv[argno++])->transaction();

    // List the ID numbers and names for all specimen files
    if (opt.list_specimens) {
        SqlDatabase::Table<int, std::string> specimens;
        specimens.insert(tx->statement("select file.id, file.name"
                                       " from (select distinct specimen_id as id from semantic_functions) as specimen"
                                       " join semantic_files as file on specimen.id = file.id"
                                       " order by file.name"));
        specimens.headers("File ID", "Specimen Name");
        specimens.print(std::cout);
        return 0;
    }

    // List the ID numbers and names for all files containing functions
    if (opt.list_files) {
        SqlDatabase::Table<int, std::string> files;
        files.insert(tx->statement("select file.id, file.name"
                                   " from (select distinct file_id as id from semantic_functions) as used"
                                   " join semantic_files as file on used.id = file.id"
                                   " order by file.name"));
        files.headers("File ID", "Binary File Name");
        files.print(std::cout);
        return 0;
    }

    // Sanity checks
    if (!opt.functions.empty() && !opt.function_table.empty()) {
        std::cerr <<argv0 <<": --function=ID and --function=TABLE are mutually exclusive\n";
        exit(1);
    }
    if (0==tx->statement("select count(*) from semantic_functions")->execute_int()) {
        std::cerr <<argv0 <<": database has no functions; nothing to test\n";
        return 0;
    }
    if (0==tx->statement("select count(*) from semantic_inputvalues")->execute_int()) {
        std::cerr <<argv0 <<": database has no input groups; nothing to test\n";
        return 0;
    }

    // Create table tmp_functions containing IDs for selected functions and their specimen IDs
    std::vector<std::string> constraints;
    if (!opt.entry_vas.empty())
        constraints.push_back("func.entry_va " + SqlDatabase::in(opt.entry_vas));
    if (!opt.names.empty())
        constraints.push_back("func.name " + SqlDatabase::in_strings(opt.names, tx->driver()));
    if (!opt.specimens.empty())
        constraints.push_back("func.specimen_id " + SqlDatabase::in(opt.specimens));
    if (!opt.files.empty())
        constraints.push_back("func.file_id " + SqlDatabase::in(opt.files));
    if (!opt.functions.empty())
        constraints.push_back("func.id " + SqlDatabase::in(opt.functions));
    if (opt.ninsns>0)
        constraints.push_back("func.ninsns >= " + StringUtility::numberToString(opt.ninsns));
    std::string sql1 = "select func.id, func.specimen_id from semantic_functions as func";
    if (!opt.function_table.empty())
        sql1 += " join "+opt.function_table+" as flist on func.id = flist."+opt.function_column;
    if (!constraints.empty())
        sql1 += " where " + StringUtility::join(" and ", constraints);
    tx->execute("create temporary table tmp_functions as " + sql1);

    // Create table tmp_inputgroups containing IDs for selected input groups
    std::string sql2 = "select distinct igroup_id from semantic_inputvalues where igroup_id >= " +
                       StringUtility::numberToString(opt.first_fuzz);
    if (opt.nfuzz_set)
        sql2 += " and igroup_id < " + StringUtility::numberToString(opt.first_fuzz+opt.nfuzz);
    tx->execute("create temporary table tmp_inputgroups as " + sql2);

    // Create tmp_pending as the cross product of functions and inputgroups except for those already tested
    tx->execute("create temporary table tmp_pending as"
                "    select func.specimen_id as specimen_id, func.id as func_id, igroup.igroup_id as igroup_id"
                "      from tmp_functions as func"
                "      join tmp_inputgroups as igroup"
                "      on igroup.igroup_id is not null" // "on" clause and "is not null" (rather than "true") for portability
                "  except"
                "    select func.specimen_id, func.id, fio.igroup_id"
                "      from semantic_fio as fio"
                "      join semantic_functions as func on fio.func_id=func.id");
    SqlDatabase::StatementPtr stmt = tx->statement("select distinct specimen_id, func_id, igroup_id"
                                                   " from tmp_pending"
                                                   " order by specimen_id, igroup_id, func_id");
    for (SqlDatabase::Statement::iterator row=stmt->begin(); row!=stmt->end(); ++row)
        std::cout <<row.get<int>(0) <<"\t" <<row.get<int>(1) <<"\t" <<row.get<int>(2) <<"\n";

    // no need to commit, but if we change this in the future, be sure to add begin_command()/finish_command()
    return 0;
}
Ejemplo n.º 13
0
int
main(int argc, char* argv[])
{
    std::string database;
    size_t l = 4, k = 700;
    size_t hashTableNumBuckets = 13000000, hashTableElementsPerBucket = 20;
    double distBound = 1.;
    double similarity=1.;
    double r = 4.;
    int norm = 1;
    int groupLow=-1;
    int groupHigh=-1;

    //Timing
    struct timeval before, after;
    struct rusage ru_before, ru_after;
    gettimeofday(&before, NULL);
    getrusage(RUSAGE_SELF, &ru_before);

    bool nodelete = false;
    try {
        options_description desc("Allowed options");
        desc.add_options()
            ("help", "Produce a help message")
            ("nodelete", "Do not delete from vectors")
            ("groupLow,g", value< int >(&groupLow), "The lowest count of elements")
            ("groupHigh,G", value< int >(&groupHigh), "The highest count of elements")
            ("database", value< string >(&database), "The sqlite database that we are to use")
            ("hash-function-size,k", value< size_t >(&k), "The number of elements in a single hash function")
            ("hash-table-count,l", value< size_t >(&l), "The number of separate hash tables to create")
            ("buckets,b", value< size_t >(&hashTableNumBuckets),
             "The number of buckets in each hash table (buckets may store multiple elements)")
            ("bucket-size,s", value< size_t >(&hashTableElementsPerBucket),
             "The number of elements that can be stored in each hash table bucket")
            ("similarity,t", value< double >(&similarity), "The similarity threshold that is allowed in a clone pair")
            ("distance,d", value< double >(&distBound), "The maximum distance that is allowed in a clone pair")
            ("interval-size,r", value< double >(&r), "The divisor for the l_2 hash function family")
            ("norm,p", value< int >(&norm), "Exponent in p-norm to use (1 or 2)")
            ;
        variables_map vm;
        store(parse_command_line(argc, argv, desc), vm);
        notify(vm);

        distBound = similarity==1  ? 0.0 : sqrt(2*groupLow*(1.-similarity));

        std::cerr << "similarity " << similarity << " distBound " << distBound << std::endl;

        if (vm.count("help")) {
            cout << desc << endl;
            exit(0);
        }
        if (vm.count("nodelete")) {
            nodelete = true;
        }
        if (vm.count("groupLow") == 0) {
            groupLow = -1;
        }
        if (vm.count("groupHigh") == 0) {
            groupHigh = -1;
        }
        if (database == "") {
            std::cerr << "Missing options. Call as: " << argv[0] << " --database <database-name> [other parameters]" 
                      << std::endl;
            exit(1);
        }
        if (hashTableNumBuckets >= (1ULL << 32)) {
            cerr << "Number of buckets must be less than 2**32" << endl;
            exit (1);
        }
        if (norm != 1 && norm != 2) {
            cerr << "Norm must be either 1 or 2" << endl;
            exit (1);
        }
        if (nodelete == false) {
            cerr << "groupLow: " << groupLow << std::endl;
            cerr << "groupHigh: " << groupHigh << std::endl;
            cerr << "norm: l_" << norm << std::endl;
            cerr << "database: " << database << std::endl;
            cerr << "k: " << k << std::endl;
            cerr << "l: " << l << std::endl;
            cerr << "buckets: " << hashTableNumBuckets << std::endl;
            cerr << "bucket size: " << hashTableElementsPerBucket << std::endl;
            cerr << "distance: " << distBound << std::endl;
            cerr << "r: " << r << std::endl;
        }
    } catch(exception& e) {
        cout << e.what() << "\n";
        exit (1);
    }

    SqlDatabase::TransactionPtr tx = SqlDatabase::Connection::create(database)->transaction();

    scoped_array_with_size<VectorEntry> vectors;
    scoped_array_with_size<scoped_array_with_size<VectorEntry> > duplicateVectors;
  
    //Step to pass to LSH only the vectors that are not part of an exact clone pass
    {
        std::vector<int> functionsThatWeAreInterestedIn;
        scoped_array_with_size<VectorEntry> allVectors;
        map<string, std::vector<int> > internTable;
        read_vector_data(tx, allVectors, functionsThatWeAreInterestedIn, internTable, groupLow, groupHigh, false);

        //Assign to vectors the first element of each hash bucket
        int numberOfBuckets = 0;
        for (map<string, std::vector<int> >::iterator iItr = internTable.begin(); iItr != internTable.end() ; iItr++)
            numberOfBuckets++;

        vectors.allocate(numberOfBuckets);
        duplicateVectors.allocate(numberOfBuckets);

        int indexInVectors=0;
        std::cout << "All is size: " << allVectors.size() << " reduced size is " << vectors.size() << std::endl;
        for (map<string, std::vector<int> >::iterator iItr = internTable.begin(); iItr != internTable.end() ; iItr++) {
            if (iItr->second.size()>1)
                duplicateVectors[indexInVectors].allocate(iItr->second.size()-1);
            for (unsigned int i = 0 ; i < iItr->second.size() ; i++) {
                VectorEntry& allVe = allVectors[iItr->second[i]];
                VectorEntry& ve = i == 0 ? vectors[indexInVectors] : duplicateVectors[indexInVectors][i-1];
                ve.rowNumber  = allVe.rowNumber;
                ve.functionId = allVe.functionId;
                ve.indexWithinFunction = allVe.indexWithinFunction;
                ve.line = allVe.line;
                ve.offset = allVe.offset;
                ve.compressedCounts.allocate(allVe.compressedCounts.size());
                memcpy(ve.compressedCounts.get(), allVe.compressedCounts.get(), allVe.compressedCounts.size());
            }
            indexInVectors++;
        }
    }

    if (vectors[0].compressedCounts.size() == 0) {
        cerr << "Vector slot 0 is empty" << endl;
        abort();
    }

    size_t numVectorElements = getUncompressedSizeOfVector(vectors[0].compressedCounts.get(), vectors[0].compressedCounts.size());
    if (debug_messages) {
        cout << "Vectors have " << numVectorElements << " elements" << endl;
        cout << "Number of vectors fetched is " << vectors.size() << std::endl;
    }
    
    LSHTableBase* table = NULL;
    switch (norm) {
        case 1:
            table = new LSHTable<HammingHashFunctionSet, L1DistanceObject>(vectors, L1DistanceObject(), k, l, r,
                                                                           numVectorElements, hashTableNumBuckets,
                                                                           hashTableElementsPerBucket, distBound);
            break;
        case 2:
            table = new LSHTable<StableDistributionHashFunctionSet, L2DistanceObject>(vectors, L2DistanceObject(), k, l, r,
                                                                                      numVectorElements, hashTableNumBuckets,
                                                                                      hashTableElementsPerBucket, distBound);
            break;
        default:
            cerr << "Bad value for --norm" << endl;
            abort(); // Should have been caught earlier
    }
    assert(table);

    // Setup stuff for postprocessing
    int windowSize = 0;
    int stride = 0;
    get_run_parameters(tx, windowSize, stride);

    if (nodelete == false) {
        cerr << "About to delete from clusters" << endl;
        tx->execute("delete from clusters");
        cerr << "... done" << endl;
        cerr << "About to delete from postprocessed_clusters" << endl;
        tx->execute("delete from postprocessed_clusters");
        cerr << "... done" << endl;
    }
    const size_t numStridesThatMustBeDifferent = windowSize / (stride * 2);

    // Get clusters and postprocess them
    vector<bool> liveVectors(vectors.size(), true);
    size_t clusterNum = 0, postprocessedClusterNum = 0;
    for (size_t i = 0; i < vectors.size(); ++i) { //Loop over vectors
        //Creating potential clusters
        if (!liveVectors[i])
            continue;
        liveVectors[i] = false;
        vector<pair<size_t, double> > clusterElementsRaw = table->query(i); // Pairs are vector number, distance
        vector<pair<uint64_t, double> > clusterElements;
        vector<uint64_t > postprocessedClusterElements;
        clusterElements.push_back(make_pair(i, 0));

        //const VectorEntry& ve = vectors[i];
        for (size_t j = 0; j < clusterElementsRaw.size(); ++j) {
            size_t entry = clusterElementsRaw[j].first;
            //double dist = clusterElementsRaw[j].second;
            // All entries less than i were in previous clusters, so we save an array lookup
            if (entry <= i || !liveVectors[entry]) continue;
            clusterElements.push_back(clusterElementsRaw[j]);
            liveVectors[entry] = false;
        }
        if (clusterElements.size() < 2 && duplicateVectors[i].size() == 0 )
            continue;

        //Insert raw cluster data 
        for (vector<pair<uint64_t, double> >::const_iterator j = clusterElements.begin(); j != clusterElements.end(); ++j) {
            for(size_t k = 0; k < duplicateVectors[j->first].size(); k++) {
                const VectorEntry& ve = duplicateVectors[j->first][k];
                insert_into_clusters(tx, clusterNum, ve.functionId, ve.indexWithinFunction, ve.rowNumber, j->second);
            }

            const VectorEntry& ve = vectors[j->first];
            insert_into_clusters(tx, clusterNum, ve.functionId, ve.indexWithinFunction, ve.rowNumber, j->second);
        }
        if (clusterNum % 10000 == 0 && debug_messages)
            cerr << "cluster " << clusterNum << " has " << clusterElements.size() << " elements" << endl;
        ++clusterNum;

        //Postprocessing does not make sense for inexact clones
        if (similarity != 1.0 )
            continue;

        // This implicitly groups elements in the same function together and order by index_within_function in each function
        // Not needed because of the sort in LSHTable::query() which is on the cluster number:
        // std::sort(clusterElements.begin(), clusterElements.end());

        //The next two variables will we initialized in first run
        size_t lastFunctionId=0;
        size_t lastIndexWithinFunction=0;
        bool first = true;
        std::vector<VectorEntry*> clusterElemPtr;
        for (size_t j = 0; j < clusterElements.size(); ++j) {
            clusterElemPtr.push_back( &vectors[ clusterElements[j].first ]  );
            for (size_t k = 0; k < duplicateVectors[clusterElements[j].first].size(); k++)
                clusterElemPtr.push_back(&duplicateVectors[ clusterElements[j].first ][k]);
        }

        std::sort(clusterElemPtr.begin(), clusterElemPtr.end(), compare_rows );
        for (size_t j = 0; j < clusterElemPtr.size(); ++j) {
            const VectorEntry& ve = *clusterElemPtr[j];
            if (first || ve.functionId != lastFunctionId ||
                ve.indexWithinFunction >= lastIndexWithinFunction + numStridesThatMustBeDifferent) {
                lastFunctionId = ve.functionId;
                lastIndexWithinFunction = ve.indexWithinFunction;
                postprocessedClusterElements.push_back(j);
            }
            first = false;
        }
        if (postprocessedClusterElements.size() >= 2) { //insert post processed data 
            for (vector<uint64_t >::const_iterator j = postprocessedClusterElements.begin();
                 j != postprocessedClusterElements.end(); ++j) {
                const VectorEntry& ve = *clusterElemPtr[*j];
                insert_into_postprocessed_clusters(tx, postprocessedClusterNum, ve.functionId, ve.indexWithinFunction,
                                                   ve.rowNumber, 0);
            }
            if (postprocessedClusterNum % 1000 == 0) {
                cerr << "postprocessed cluster " << postprocessedClusterNum
                     << " has " << postprocessedClusterElements.size() << " elements" << endl;
            }
            ++postprocessedClusterNum;
        }
    }
    cerr << clusterNum << " total cluster(s), " << postprocessedClusterNum << " after postprocessing" << endl;

    gettimeofday(&after, NULL);
    getrusage(RUSAGE_SELF, &ru_after);
    insert_timing(tx, "lsh", groupLow, groupHigh, vectors.size(), k, l, before,after, ru_before, ru_after);

    tx->commit();
    return 0;
}
int
main(int argc, char *argv[])
{
    std::ios::sync_with_stdio();
    argv0 = argv[0];
    {
        size_t slash = argv0.rfind('/');
        argv0 = slash==std::string::npos ? argv0 : argv0.substr(slash+1);
        if (0==argv0.substr(0, 3).compare("lt-"))
            argv0 = argv0.substr(3);
    }

    int argno = 1;
    for (/*void*/; argno<argc && '-'==argv[argno][0]; ++argno) {
        if (!strcmp(argv[argno], "--")) {
            ++argno;
            break;
        } else if (!strcmp(argv[argno], "--help") || !strcmp(argv[argno], "-h")) {
            ::usage(0);
        } else if (!strcmp(argv[argno], "--delete")) {
            opt.delete_old_data = true;
        } else if (!strncmp(argv[argno], "--exclude-functions=", 20)) {
            opt.exclude_functions_table = argv[argno]+20;
        } else if (!strcmp(argv[argno], "--no-delete")) {
            opt.delete_old_data = false;
        } else if (!strncmp(argv[argno], "--relation=", 11)) {
            opt.relation_id = strtol(argv[argno]+11, NULL, 0);
        } else {
            std::cerr <<argv0 <<": unknown switch: " <<argv[argno] <<"\n"
                      <<argv0 <<": see --help for more info\n";
            exit(1);
        }
    };
    if (argno+1!=argc)
        ::usage(1);
    time_t start_time = time(NULL);
    SqlDatabase::ConnectionPtr conn = SqlDatabase::Connection::create(argv[argno++]);
    SqlDatabase::TransactionPtr tx = conn->transaction();

    // Save ourself in the history if we're modifying the database.
    int64_t cmd_id=-1;
    if (opt.delete_old_data)
        cmd_id = CloneDetection::start_command(tx, argc, argv, "clearing funcsim data for relation #"+
                                               StringUtility::numberToString(opt.relation_id), start_time);

    // The 32-func-similarity tool needs this index, so we might as well create it here when we're running serially.  The
    // semantic_outputvalues table can be HUGE depending on how the analysis is configured (i.e., whether it saves output
    // values as a vector or set, whether it saves function calls and system calls, etc.).  Since creating the index could take
    // a few minutes, we'd rather not create it if it alread exists, but PostgreSQL v8 doesn't have a "CREATE INDEX IF NOT
    // EXISTS" ability.  Therefore, try to create the index right away before we make any other changes, and if creation fails
    // then start a new transaction (because the current one is hosed).
    std::cerr <<argv0 <<": creating output group index (could take a while)\n";
    try {
        SqlDatabase::TransactionPtr tx = conn->transaction();
        tx->execute("create index idx_ogroups_hashkey on semantic_outputvalues(hashkey)");
        tx->commit();
    } catch (const SqlDatabase::Exception&) {
        std::cerr <<argv0 <<": idx_ogroups_hashkey index already exists; NOT dropping and recreating\n";
    }

    // Delete old data.
    if (opt.delete_old_data)
        tx->statement("delete from semantic_funcsim where relation_id = ?")->bind(0, opt.relation_id)->execute();

    // Get the list of functions that should appear in the worklist.
    std::cerr <<argv0 <<": obtaining function list\n";
    std::string stmt1 = "create temporary table tmp_tested_funcs as"
                        " select distinct fio.func_id as func_id"
                        " from semantic_fio as fio";
    if (!opt.exclude_functions_table.empty()) {
        std::vector<std::string> parts = StringUtility::split('.', opt.exclude_functions_table, 2, true);
        if (parts.size()<2)
            parts.push_back("func_id");
        stmt1 += " left join " + parts.front() + " as exclude"
                 " on fio.func_id = exclude." + parts.back() +
                 " where exclude." + parts.back() + " is null";
    }
    tx->execute(stmt1);

    // Create pairs of function IDs for those functions which have been tested and for which no similarity measurement has been
    // computed.  (FIXME: We should probably recompute similarity that might have changed due to rerunning tests or running the
    // same function but with more input groups. [Robb P. Matzke 2013-06-19])
    std::cerr <<argv0 <<": creating work list\n";
    SqlDatabase::StatementPtr stmt2 = tx->statement("select distinct f1.func_id as func1_id, f2.func_id as func2_id"
                                                    " from tmp_tested_funcs as f1"
                                                    " join tmp_tested_funcs as f2 on f1.func_id < f2.func_id"
                                                    " except"
                                                    " select func1_id, func2_id from semantic_funcsim as sim"
                                                    " where sim.relation_id = ?");
    stmt2->bind(0, opt.relation_id);
    for (SqlDatabase::Statement::iterator row=stmt2->begin(); row!=stmt2->end(); ++row)
        std::cout <<row.get<int>(0) <<"\t" <<row.get<int>(1) <<"\n";

    if (cmd_id>=0)
        CloneDetection::finish_command(tx, cmd_id, "cleared funcsim table for relation #"+
                                       StringUtility::numberToString(opt.relation_id));

    tx->commit();
    return 0;
}